diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_1_epochs_1_GA_2_lora/README.md b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_1_epochs_1_GA_2_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_1_epochs_1_GA_2_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_1_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_1_epochs_1_GA_2_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..62e965b0a44a6869ae1ca6308bd62ba8aa3d6500 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_1_epochs_1_GA_2_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "up_proj", + "gate_proj", + "down_proj", + "q_proj", + "v_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..094c5f455bf35a3701037f8cf928766bda4e2fdf --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c6471582fd50057e84dfb08f495660314c24a16a94f8ac9426da1edcd17b1603 +size 671150064 diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_1_epochs_1_GA_2_lora/config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_1_epochs_1_GA_2_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_1_epochs_1_GA_2_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..4fdf7989abc46c9a30ad7289defc684eaadae19e --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:582698d24359f6be8e4df167566915dd311ba98e0ec3ef337f6141b3712b3e53 +size 918507402 diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_1_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_1_epochs_1_GA_2_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..69c3e79ecc4618eceb3a7a2c6df65a1eb4bea614 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_1_epochs_1_GA_2_lora/trainer_state.json @@ -0,0 +1,4417 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 625, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0016, + "grad_norm": 0.9719731560555253, + "learning_rate": 1.0526315789473684e-05, + "loss": 1.4225, + "step": 1 + }, + { + "epoch": 0.0032, + "grad_norm": 1.0156967480828982, + "learning_rate": 2.105263157894737e-05, + "loss": 1.3212, + "step": 2 + }, + { + "epoch": 0.0048, + "grad_norm": 0.9238233668481546, + "learning_rate": 3.157894736842105e-05, + "loss": 1.4215, + "step": 3 + }, + { + "epoch": 0.0064, + "grad_norm": 0.7973955034809719, + "learning_rate": 4.210526315789474e-05, + "loss": 1.3138, + "step": 4 + }, + { + "epoch": 0.008, + "grad_norm": 0.6758347176205124, + "learning_rate": 5.2631578947368424e-05, + "loss": 1.0945, + "step": 5 + }, + { + "epoch": 0.0096, + "grad_norm": 0.971727217409248, + "learning_rate": 6.31578947368421e-05, + "loss": 1.31, + "step": 6 + }, + { + "epoch": 0.0112, + "grad_norm": 0.7867392045118493, + "learning_rate": 7.368421052631579e-05, + "loss": 1.1325, + "step": 7 + }, + { + "epoch": 0.0128, + "grad_norm": 1.1691873896204046, + "learning_rate": 8.421052631578948e-05, + "loss": 1.1948, + "step": 8 + }, + { + "epoch": 0.0144, + "grad_norm": 1.1988387781152483, + "learning_rate": 9.473684210526316e-05, + "loss": 1.1633, + "step": 9 + }, + { + "epoch": 0.016, + "grad_norm": 0.9327563094331902, + "learning_rate": 0.00010526315789473685, + "loss": 1.1291, + "step": 10 + }, + { + "epoch": 0.0176, + "grad_norm": 0.8979794621884146, + "learning_rate": 0.00011578947368421053, + "loss": 1.0959, + "step": 11 + }, + { + "epoch": 0.0192, + "grad_norm": 0.6473368763743381, + "learning_rate": 0.0001263157894736842, + "loss": 1.0195, + "step": 12 + }, + { + "epoch": 0.0208, + "grad_norm": 0.5888234879398717, + "learning_rate": 0.0001368421052631579, + "loss": 1.0351, + "step": 13 + }, + { + "epoch": 0.0224, + "grad_norm": 0.6835992554214257, + "learning_rate": 0.00014736842105263158, + "loss": 0.916, + "step": 14 + }, + { + "epoch": 0.024, + "grad_norm": 0.6155398013768341, + "learning_rate": 0.00015789473684210527, + "loss": 0.9745, + "step": 15 + }, + { + "epoch": 0.0256, + "grad_norm": 0.6424060942035926, + "learning_rate": 0.00016842105263157895, + "loss": 0.9484, + "step": 16 + }, + { + "epoch": 0.0272, + "grad_norm": 0.532550668054331, + "learning_rate": 0.00017894736842105264, + "loss": 0.9576, + "step": 17 + }, + { + "epoch": 0.0288, + "grad_norm": 0.6359298776562393, + "learning_rate": 0.00018947368421052632, + "loss": 0.9781, + "step": 18 + }, + { + "epoch": 0.0304, + "grad_norm": 0.7033509923785414, + "learning_rate": 0.0002, + "loss": 1.0213, + "step": 19 + }, + { + "epoch": 0.032, + "grad_norm": 0.6136991168036158, + "learning_rate": 0.00019999865623437013, + "loss": 0.8679, + "step": 20 + }, + { + "epoch": 0.0336, + "grad_norm": 0.5683787919304325, + "learning_rate": 0.00019999462497359466, + "loss": 0.8792, + "step": 21 + }, + { + "epoch": 0.0352, + "grad_norm": 0.6975898261539225, + "learning_rate": 0.00019998790632601496, + "loss": 1.122, + "step": 22 + }, + { + "epoch": 0.0368, + "grad_norm": 0.625225000612276, + "learning_rate": 0.0001999785004721968, + "loss": 1.071, + "step": 23 + }, + { + "epoch": 0.0384, + "grad_norm": 0.5842836648495332, + "learning_rate": 0.00019996640766492543, + "loss": 1.0011, + "step": 24 + }, + { + "epoch": 0.04, + "grad_norm": 0.6425014712125224, + "learning_rate": 0.00019995162822919883, + "loss": 1.0429, + "step": 25 + }, + { + "epoch": 0.0416, + "grad_norm": 0.5766587264596573, + "learning_rate": 0.00019993416256221895, + "loss": 0.9554, + "step": 26 + }, + { + "epoch": 0.0432, + "grad_norm": 0.5155142424732911, + "learning_rate": 0.00019991401113338104, + "loss": 0.8836, + "step": 27 + }, + { + "epoch": 0.0448, + "grad_norm": 0.555621828495599, + "learning_rate": 0.00019989117448426108, + "loss": 0.8526, + "step": 28 + }, + { + "epoch": 0.0464, + "grad_norm": 0.692065965650676, + "learning_rate": 0.00019986565322860115, + "loss": 1.0003, + "step": 29 + }, + { + "epoch": 0.048, + "grad_norm": 0.6149606129626048, + "learning_rate": 0.00019983744805229296, + "loss": 1.0233, + "step": 30 + }, + { + "epoch": 0.0496, + "grad_norm": 0.580044488482369, + "learning_rate": 0.00019980655971335945, + "loss": 0.944, + "step": 31 + }, + { + "epoch": 0.0512, + "grad_norm": 0.5745893317976081, + "learning_rate": 0.00019977298904193437, + "loss": 0.8352, + "step": 32 + }, + { + "epoch": 0.0528, + "grad_norm": 0.4949387938042247, + "learning_rate": 0.00019973673694024, + "loss": 0.8841, + "step": 33 + }, + { + "epoch": 0.0544, + "grad_norm": 0.5776965982580251, + "learning_rate": 0.00019969780438256293, + "loss": 1.083, + "step": 34 + }, + { + "epoch": 0.056, + "grad_norm": 0.5236266882773968, + "learning_rate": 0.0001996561924152278, + "loss": 0.9272, + "step": 35 + }, + { + "epoch": 0.0576, + "grad_norm": 0.724621871905579, + "learning_rate": 0.0001996119021565693, + "loss": 0.9061, + "step": 36 + }, + { + "epoch": 0.0592, + "grad_norm": 0.57728187410348, + "learning_rate": 0.0001995649347969019, + "loss": 0.9413, + "step": 37 + }, + { + "epoch": 0.0608, + "grad_norm": 0.5937644360791755, + "learning_rate": 0.00019951529159848805, + "loss": 0.8739, + "step": 38 + }, + { + "epoch": 0.0624, + "grad_norm": 0.5885045750020044, + "learning_rate": 0.00019946297389550433, + "loss": 0.9292, + "step": 39 + }, + { + "epoch": 0.064, + "grad_norm": 0.5375604144078661, + "learning_rate": 0.00019940798309400526, + "loss": 0.9803, + "step": 40 + }, + { + "epoch": 0.0656, + "grad_norm": 0.5224367418365979, + "learning_rate": 0.0001993503206718859, + "loss": 0.812, + "step": 41 + }, + { + "epoch": 0.0672, + "grad_norm": 0.5626009709995728, + "learning_rate": 0.00019928998817884182, + "loss": 0.8799, + "step": 42 + }, + { + "epoch": 0.0688, + "grad_norm": 0.502499897060231, + "learning_rate": 0.00019922698723632767, + "loss": 0.9542, + "step": 43 + }, + { + "epoch": 0.0704, + "grad_norm": 0.5909498723276309, + "learning_rate": 0.00019916131953751342, + "loss": 0.9373, + "step": 44 + }, + { + "epoch": 0.072, + "grad_norm": 0.48624540596138904, + "learning_rate": 0.00019909298684723904, + "loss": 0.886, + "step": 45 + }, + { + "epoch": 0.0736, + "grad_norm": 0.7926471063751397, + "learning_rate": 0.00019902199100196697, + "loss": 1.0794, + "step": 46 + }, + { + "epoch": 0.0752, + "grad_norm": 0.5005786861152867, + "learning_rate": 0.00019894833390973266, + "loss": 0.9054, + "step": 47 + }, + { + "epoch": 0.0768, + "grad_norm": 0.5011911229465077, + "learning_rate": 0.00019887201755009357, + "loss": 0.9255, + "step": 48 + }, + { + "epoch": 0.0784, + "grad_norm": 0.49153908455646683, + "learning_rate": 0.0001987930439740757, + "loss": 0.9401, + "step": 49 + }, + { + "epoch": 0.08, + "grad_norm": 0.7739765898833159, + "learning_rate": 0.00019871141530411853, + "loss": 0.9633, + "step": 50 + }, + { + "epoch": 0.0816, + "grad_norm": 0.5981328278567677, + "learning_rate": 0.0001986271337340182, + "loss": 0.9323, + "step": 51 + }, + { + "epoch": 0.0832, + "grad_norm": 0.6930023972779622, + "learning_rate": 0.00019854020152886814, + "loss": 1.0079, + "step": 52 + }, + { + "epoch": 0.0848, + "grad_norm": 0.5086530254159303, + "learning_rate": 0.0001984506210249986, + "loss": 0.8875, + "step": 53 + }, + { + "epoch": 0.0864, + "grad_norm": 0.5833312107461119, + "learning_rate": 0.00019835839462991361, + "loss": 0.8608, + "step": 54 + }, + { + "epoch": 0.088, + "grad_norm": 0.60348201124767, + "learning_rate": 0.00019826352482222638, + "loss": 0.9313, + "step": 55 + }, + { + "epoch": 0.0896, + "grad_norm": 0.48903426313247045, + "learning_rate": 0.00019816601415159263, + "loss": 0.9042, + "step": 56 + }, + { + "epoch": 0.0912, + "grad_norm": 0.4897146593936715, + "learning_rate": 0.0001980658652386421, + "loss": 0.8525, + "step": 57 + }, + { + "epoch": 0.0928, + "grad_norm": 0.5551167051993521, + "learning_rate": 0.00019796308077490817, + "loss": 0.9554, + "step": 58 + }, + { + "epoch": 0.0944, + "grad_norm": 0.5561672558240812, + "learning_rate": 0.00019785766352275542, + "loss": 0.8759, + "step": 59 + }, + { + "epoch": 0.096, + "grad_norm": 0.5074154141993045, + "learning_rate": 0.00019774961631530545, + "loss": 0.882, + "step": 60 + }, + { + "epoch": 0.0976, + "grad_norm": 0.6574299113732572, + "learning_rate": 0.00019763894205636072, + "loss": 1.0118, + "step": 61 + }, + { + "epoch": 0.0992, + "grad_norm": 0.6004577635455566, + "learning_rate": 0.00019752564372032657, + "loss": 0.882, + "step": 62 + }, + { + "epoch": 0.1008, + "grad_norm": 0.646991913244477, + "learning_rate": 0.00019740972435213115, + "loss": 0.9729, + "step": 63 + }, + { + "epoch": 0.1024, + "grad_norm": 0.5377906520286383, + "learning_rate": 0.00019729118706714375, + "loss": 0.9647, + "step": 64 + }, + { + "epoch": 0.104, + "grad_norm": 0.5403821486665964, + "learning_rate": 0.00019717003505109095, + "loss": 0.8416, + "step": 65 + }, + { + "epoch": 0.1056, + "grad_norm": 0.5958932417106001, + "learning_rate": 0.00019704627155997108, + "loss": 0.9724, + "step": 66 + }, + { + "epoch": 0.1072, + "grad_norm": 0.5380838249700484, + "learning_rate": 0.00019691989991996663, + "loss": 0.9369, + "step": 67 + }, + { + "epoch": 0.1088, + "grad_norm": 0.5767496831021328, + "learning_rate": 0.0001967909235273549, + "loss": 0.9476, + "step": 68 + }, + { + "epoch": 0.1104, + "grad_norm": 0.5025417082598497, + "learning_rate": 0.00019665934584841682, + "loss": 0.8396, + "step": 69 + }, + { + "epoch": 0.112, + "grad_norm": 0.5741017467302606, + "learning_rate": 0.00019652517041934356, + "loss": 0.8811, + "step": 70 + }, + { + "epoch": 0.1136, + "grad_norm": 0.5606179041892586, + "learning_rate": 0.00019638840084614182, + "loss": 0.8185, + "step": 71 + }, + { + "epoch": 0.1152, + "grad_norm": 0.5649379648112125, + "learning_rate": 0.00019624904080453655, + "loss": 0.8958, + "step": 72 + }, + { + "epoch": 0.1168, + "grad_norm": 0.46302038221209, + "learning_rate": 0.00019610709403987246, + "loss": 0.785, + "step": 73 + }, + { + "epoch": 0.1184, + "grad_norm": 0.5873484897971788, + "learning_rate": 0.00019596256436701324, + "loss": 0.976, + "step": 74 + }, + { + "epoch": 0.12, + "grad_norm": 0.5372137888546485, + "learning_rate": 0.000195815455670239, + "loss": 0.85, + "step": 75 + }, + { + "epoch": 0.1216, + "grad_norm": 0.5283284433789608, + "learning_rate": 0.00019566577190314197, + "loss": 0.958, + "step": 76 + }, + { + "epoch": 0.1232, + "grad_norm": 0.527865127960602, + "learning_rate": 0.0001955135170885202, + "loss": 0.8401, + "step": 77 + }, + { + "epoch": 0.1248, + "grad_norm": 0.5918748384738277, + "learning_rate": 0.00019535869531826937, + "loss": 0.8535, + "step": 78 + }, + { + "epoch": 0.1264, + "grad_norm": 0.5504783750584027, + "learning_rate": 0.00019520131075327298, + "loss": 0.8036, + "step": 79 + }, + { + "epoch": 0.128, + "grad_norm": 0.6339201701822237, + "learning_rate": 0.00019504136762329047, + "loss": 0.9951, + "step": 80 + }, + { + "epoch": 0.1296, + "grad_norm": 0.6336655475736347, + "learning_rate": 0.00019487887022684336, + "loss": 1.0156, + "step": 81 + }, + { + "epoch": 0.1312, + "grad_norm": 0.54718693044338, + "learning_rate": 0.00019471382293110003, + "loss": 0.8811, + "step": 82 + }, + { + "epoch": 0.1328, + "grad_norm": 0.5974586650433192, + "learning_rate": 0.00019454623017175812, + "loss": 0.8694, + "step": 83 + }, + { + "epoch": 0.1344, + "grad_norm": 0.48432957860470743, + "learning_rate": 0.00019437609645292546, + "loss": 0.8565, + "step": 84 + }, + { + "epoch": 0.136, + "grad_norm": 0.5501452529707288, + "learning_rate": 0.0001942034263469989, + "loss": 0.9211, + "step": 85 + }, + { + "epoch": 0.1376, + "grad_norm": 0.581601886863996, + "learning_rate": 0.00019402822449454153, + "loss": 0.944, + "step": 86 + }, + { + "epoch": 0.1392, + "grad_norm": 0.5453290568380136, + "learning_rate": 0.00019385049560415794, + "loss": 0.9025, + "step": 87 + }, + { + "epoch": 0.1408, + "grad_norm": 0.5137363846987513, + "learning_rate": 0.00019367024445236754, + "loss": 0.8192, + "step": 88 + }, + { + "epoch": 0.1424, + "grad_norm": 0.43975358822950616, + "learning_rate": 0.00019348747588347637, + "loss": 0.7649, + "step": 89 + }, + { + "epoch": 0.144, + "grad_norm": 0.6477805482411492, + "learning_rate": 0.00019330219480944694, + "loss": 0.8912, + "step": 90 + }, + { + "epoch": 0.1456, + "grad_norm": 0.5549996594907376, + "learning_rate": 0.00019311440620976597, + "loss": 0.9085, + "step": 91 + }, + { + "epoch": 0.1472, + "grad_norm": 0.5729456916395933, + "learning_rate": 0.0001929241151313108, + "loss": 0.8908, + "step": 92 + }, + { + "epoch": 0.1488, + "grad_norm": 0.48159418626487066, + "learning_rate": 0.00019273132668821364, + "loss": 0.7758, + "step": 93 + }, + { + "epoch": 0.1504, + "grad_norm": 0.5822518035352405, + "learning_rate": 0.00019253604606172417, + "loss": 0.8969, + "step": 94 + }, + { + "epoch": 0.152, + "grad_norm": 0.5243033839009505, + "learning_rate": 0.00019233827850007027, + "loss": 0.8717, + "step": 95 + }, + { + "epoch": 0.1536, + "grad_norm": 0.5346613252480791, + "learning_rate": 0.00019213802931831696, + "loss": 0.8805, + "step": 96 + }, + { + "epoch": 0.1552, + "grad_norm": 0.6011773508431371, + "learning_rate": 0.00019193530389822363, + "loss": 0.8725, + "step": 97 + }, + { + "epoch": 0.1568, + "grad_norm": 0.4346343771047141, + "learning_rate": 0.00019173010768809933, + "loss": 0.8198, + "step": 98 + }, + { + "epoch": 0.1584, + "grad_norm": 0.5150804851014471, + "learning_rate": 0.0001915224462026563, + "loss": 0.9334, + "step": 99 + }, + { + "epoch": 0.16, + "grad_norm": 0.7443920020926502, + "learning_rate": 0.00019131232502286188, + "loss": 0.8586, + "step": 100 + }, + { + "epoch": 0.1616, + "grad_norm": 0.6326828101232339, + "learning_rate": 0.0001910997497957885, + "loss": 0.9827, + "step": 101 + }, + { + "epoch": 0.1632, + "grad_norm": 0.4786775007935448, + "learning_rate": 0.00019088472623446183, + "loss": 0.8316, + "step": 102 + }, + { + "epoch": 0.1648, + "grad_norm": 0.7659943463209778, + "learning_rate": 0.00019066726011770726, + "loss": 0.9851, + "step": 103 + }, + { + "epoch": 0.1664, + "grad_norm": 0.5138606340061297, + "learning_rate": 0.0001904473572899947, + "loss": 0.8502, + "step": 104 + }, + { + "epoch": 0.168, + "grad_norm": 0.6883167701504389, + "learning_rate": 0.00019022502366128135, + "loss": 0.9245, + "step": 105 + }, + { + "epoch": 0.1696, + "grad_norm": 0.5710717799313989, + "learning_rate": 0.00019000026520685302, + "loss": 0.8528, + "step": 106 + }, + { + "epoch": 0.1712, + "grad_norm": 0.500567838603159, + "learning_rate": 0.0001897730879671634, + "loss": 0.8482, + "step": 107 + }, + { + "epoch": 0.1728, + "grad_norm": 0.5275356495515479, + "learning_rate": 0.00018954349804767184, + "loss": 0.9279, + "step": 108 + }, + { + "epoch": 0.1744, + "grad_norm": 0.42710809202511696, + "learning_rate": 0.00018931150161867916, + "loss": 0.7936, + "step": 109 + }, + { + "epoch": 0.176, + "grad_norm": 0.6248144346372322, + "learning_rate": 0.00018907710491516199, + "loss": 0.7894, + "step": 110 + }, + { + "epoch": 0.1776, + "grad_norm": 0.5274604228677718, + "learning_rate": 0.0001888403142366049, + "loss": 0.9088, + "step": 111 + }, + { + "epoch": 0.1792, + "grad_norm": 0.5020540127916125, + "learning_rate": 0.00018860113594683148, + "loss": 0.8505, + "step": 112 + }, + { + "epoch": 0.1808, + "grad_norm": 0.45423127056696233, + "learning_rate": 0.00018835957647383303, + "loss": 0.7561, + "step": 113 + }, + { + "epoch": 0.1824, + "grad_norm": 0.646411592003773, + "learning_rate": 0.00018811564230959588, + "loss": 0.9523, + "step": 114 + }, + { + "epoch": 0.184, + "grad_norm": 0.6011236294592678, + "learning_rate": 0.00018786934000992688, + "loss": 0.8715, + "step": 115 + }, + { + "epoch": 0.1856, + "grad_norm": 0.6231809150699286, + "learning_rate": 0.00018762067619427746, + "loss": 0.889, + "step": 116 + }, + { + "epoch": 0.1872, + "grad_norm": 0.5530901333727157, + "learning_rate": 0.00018736965754556528, + "loss": 0.8234, + "step": 117 + }, + { + "epoch": 0.1888, + "grad_norm": 0.5922878804590594, + "learning_rate": 0.00018711629080999504, + "loss": 0.9207, + "step": 118 + }, + { + "epoch": 0.1904, + "grad_norm": 0.47832642676613, + "learning_rate": 0.00018686058279687698, + "loss": 0.8734, + "step": 119 + }, + { + "epoch": 0.192, + "grad_norm": 0.5546011631385598, + "learning_rate": 0.00018660254037844388, + "loss": 0.8495, + "step": 120 + }, + { + "epoch": 0.1936, + "grad_norm": 0.6641640855239751, + "learning_rate": 0.00018634217048966637, + "loss": 1.0706, + "step": 121 + }, + { + "epoch": 0.1952, + "grad_norm": 0.5164084220861261, + "learning_rate": 0.0001860794801280666, + "loss": 0.8893, + "step": 122 + }, + { + "epoch": 0.1968, + "grad_norm": 0.6047446965306861, + "learning_rate": 0.0001858144763535302, + "loss": 1.0178, + "step": 123 + }, + { + "epoch": 0.1984, + "grad_norm": 0.5902333542356276, + "learning_rate": 0.0001855471662881164, + "loss": 0.8912, + "step": 124 + }, + { + "epoch": 0.2, + "grad_norm": 0.5105567603891245, + "learning_rate": 0.00018527755711586678, + "loss": 0.827, + "step": 125 + }, + { + "epoch": 0.2016, + "grad_norm": 0.45849702835193873, + "learning_rate": 0.00018500565608261214, + "loss": 0.8058, + "step": 126 + }, + { + "epoch": 0.2032, + "grad_norm": 0.6350828160949975, + "learning_rate": 0.00018473147049577774, + "loss": 1.0222, + "step": 127 + }, + { + "epoch": 0.2048, + "grad_norm": 0.5953872318050398, + "learning_rate": 0.00018445500772418697, + "loss": 0.9346, + "step": 128 + }, + { + "epoch": 0.2064, + "grad_norm": 0.460052987155817, + "learning_rate": 0.00018417627519786315, + "loss": 0.7464, + "step": 129 + }, + { + "epoch": 0.208, + "grad_norm": 0.5563549169254246, + "learning_rate": 0.00018389528040783012, + "loss": 0.9149, + "step": 130 + }, + { + "epoch": 0.2096, + "grad_norm": 0.5503538093328367, + "learning_rate": 0.00018361203090591071, + "loss": 0.9234, + "step": 131 + }, + { + "epoch": 0.2112, + "grad_norm": 0.5528895401914546, + "learning_rate": 0.00018332653430452376, + "loss": 0.9652, + "step": 132 + }, + { + "epoch": 0.2128, + "grad_norm": 0.517006820603383, + "learning_rate": 0.00018303879827647975, + "loss": 0.8664, + "step": 133 + }, + { + "epoch": 0.2144, + "grad_norm": 0.6855622536343549, + "learning_rate": 0.00018274883055477436, + "loss": 1.0355, + "step": 134 + }, + { + "epoch": 0.216, + "grad_norm": 0.5230125244800105, + "learning_rate": 0.00018245663893238075, + "loss": 0.9216, + "step": 135 + }, + { + "epoch": 0.2176, + "grad_norm": 0.5450231518355951, + "learning_rate": 0.00018216223126204007, + "loss": 0.7938, + "step": 136 + }, + { + "epoch": 0.2192, + "grad_norm": 0.5292627538308421, + "learning_rate": 0.00018186561545605054, + "loss": 0.8114, + "step": 137 + }, + { + "epoch": 0.2208, + "grad_norm": 0.4524328604491942, + "learning_rate": 0.00018156679948605467, + "loss": 0.8356, + "step": 138 + }, + { + "epoch": 0.2224, + "grad_norm": 0.5826249323016475, + "learning_rate": 0.00018126579138282503, + "loss": 0.9046, + "step": 139 + }, + { + "epoch": 0.224, + "grad_norm": 0.5178297827336125, + "learning_rate": 0.0001809625992360485, + "loss": 0.8889, + "step": 140 + }, + { + "epoch": 0.2256, + "grad_norm": 0.5172877366154405, + "learning_rate": 0.00018065723119410884, + "loss": 0.866, + "step": 141 + }, + { + "epoch": 0.2272, + "grad_norm": 0.8026104835258362, + "learning_rate": 0.00018034969546386757, + "loss": 1.1493, + "step": 142 + }, + { + "epoch": 0.2288, + "grad_norm": 0.6496777951298773, + "learning_rate": 0.0001800400003104436, + "loss": 0.8841, + "step": 143 + }, + { + "epoch": 0.2304, + "grad_norm": 0.5320786252973128, + "learning_rate": 0.00017972815405699103, + "loss": 0.8307, + "step": 144 + }, + { + "epoch": 0.232, + "grad_norm": 0.4383389820784073, + "learning_rate": 0.00017941416508447536, + "loss": 0.78, + "step": 145 + }, + { + "epoch": 0.2336, + "grad_norm": 0.6606834795469824, + "learning_rate": 0.0001790980418314484, + "loss": 0.9924, + "step": 146 + }, + { + "epoch": 0.2352, + "grad_norm": 0.42540856147138034, + "learning_rate": 0.00017877979279382135, + "loss": 0.7706, + "step": 147 + }, + { + "epoch": 0.2368, + "grad_norm": 0.5845185060302059, + "learning_rate": 0.0001784594265246366, + "loss": 0.7887, + "step": 148 + }, + { + "epoch": 0.2384, + "grad_norm": 0.5721599134367784, + "learning_rate": 0.0001781369516338378, + "loss": 0.8799, + "step": 149 + }, + { + "epoch": 0.24, + "grad_norm": 0.5268683281377597, + "learning_rate": 0.00017781237678803847, + "loss": 0.8091, + "step": 150 + }, + { + "epoch": 0.2416, + "grad_norm": 0.43845575615203913, + "learning_rate": 0.000177485710710289, + "loss": 0.6761, + "step": 151 + }, + { + "epoch": 0.2432, + "grad_norm": 0.43555546938609113, + "learning_rate": 0.00017715696217984235, + "loss": 0.8649, + "step": 152 + }, + { + "epoch": 0.2448, + "grad_norm": 0.536111821484135, + "learning_rate": 0.00017682614003191807, + "loss": 0.8726, + "step": 153 + }, + { + "epoch": 0.2464, + "grad_norm": 0.5347321227840028, + "learning_rate": 0.00017649325315746478, + "loss": 0.8387, + "step": 154 + }, + { + "epoch": 0.248, + "grad_norm": 0.45982293352296677, + "learning_rate": 0.0001761583105029213, + "loss": 0.7505, + "step": 155 + }, + { + "epoch": 0.2496, + "grad_norm": 0.49801018069699443, + "learning_rate": 0.00017582132106997616, + "loss": 0.9193, + "step": 156 + }, + { + "epoch": 0.2512, + "grad_norm": 0.5387030949788503, + "learning_rate": 0.00017548229391532572, + "loss": 0.7959, + "step": 157 + }, + { + "epoch": 0.2528, + "grad_norm": 0.48118102523874073, + "learning_rate": 0.00017514123815043074, + "loss": 0.8368, + "step": 158 + }, + { + "epoch": 0.2544, + "grad_norm": 0.5337695716325006, + "learning_rate": 0.00017479816294127152, + "loss": 0.8524, + "step": 159 + }, + { + "epoch": 0.256, + "grad_norm": 0.49259858479074997, + "learning_rate": 0.0001744530775081015, + "loss": 0.8575, + "step": 160 + }, + { + "epoch": 0.2576, + "grad_norm": 0.5416874456814282, + "learning_rate": 0.0001741059911251997, + "loss": 0.9517, + "step": 161 + }, + { + "epoch": 0.2592, + "grad_norm": 0.6131632969712694, + "learning_rate": 0.000173756913120621, + "loss": 0.921, + "step": 162 + }, + { + "epoch": 0.2608, + "grad_norm": 0.5161966804309467, + "learning_rate": 0.00017340585287594604, + "loss": 0.8919, + "step": 163 + }, + { + "epoch": 0.2624, + "grad_norm": 0.5670918355550684, + "learning_rate": 0.0001730528198260285, + "loss": 0.8829, + "step": 164 + }, + { + "epoch": 0.264, + "grad_norm": 0.5174438543801007, + "learning_rate": 0.00017269782345874203, + "loss": 0.8312, + "step": 165 + }, + { + "epoch": 0.2656, + "grad_norm": 0.5906430371847827, + "learning_rate": 0.00017234087331472497, + "loss": 0.9143, + "step": 166 + }, + { + "epoch": 0.2672, + "grad_norm": 0.5551297460220246, + "learning_rate": 0.00017198197898712404, + "loss": 0.8797, + "step": 167 + }, + { + "epoch": 0.2688, + "grad_norm": 0.5511750615423108, + "learning_rate": 0.00017162115012133643, + "loss": 0.8409, + "step": 168 + }, + { + "epoch": 0.2704, + "grad_norm": 0.536473320667922, + "learning_rate": 0.00017125839641475072, + "loss": 0.802, + "step": 169 + }, + { + "epoch": 0.272, + "grad_norm": 0.49954805051121204, + "learning_rate": 0.00017089372761648616, + "loss": 0.8827, + "step": 170 + }, + { + "epoch": 0.2736, + "grad_norm": 0.6099699365320334, + "learning_rate": 0.00017052715352713075, + "loss": 0.9626, + "step": 171 + }, + { + "epoch": 0.2752, + "grad_norm": 0.5299390503146365, + "learning_rate": 0.00017015868399847768, + "loss": 0.9175, + "step": 172 + }, + { + "epoch": 0.2768, + "grad_norm": 0.5628722071736884, + "learning_rate": 0.00016978832893326074, + "loss": 0.7777, + "step": 173 + }, + { + "epoch": 0.2784, + "grad_norm": 0.4729580792570179, + "learning_rate": 0.00016941609828488807, + "loss": 0.8673, + "step": 174 + }, + { + "epoch": 0.28, + "grad_norm": 0.5930437779821152, + "learning_rate": 0.0001690420020571747, + "loss": 0.8206, + "step": 175 + }, + { + "epoch": 0.2816, + "grad_norm": 0.4292997621821196, + "learning_rate": 0.0001686660503040737, + "loss": 0.8056, + "step": 176 + }, + { + "epoch": 0.2832, + "grad_norm": 0.5944677864022473, + "learning_rate": 0.00016828825312940592, + "loss": 0.9485, + "step": 177 + }, + { + "epoch": 0.2848, + "grad_norm": 0.5442944872131958, + "learning_rate": 0.0001679086206865886, + "loss": 0.8904, + "step": 178 + }, + { + "epoch": 0.2864, + "grad_norm": 0.46835436930261953, + "learning_rate": 0.00016752716317836229, + "loss": 0.8104, + "step": 179 + }, + { + "epoch": 0.288, + "grad_norm": 0.5506574284037457, + "learning_rate": 0.0001671438908565167, + "loss": 0.8928, + "step": 180 + }, + { + "epoch": 0.2896, + "grad_norm": 0.6866455236359058, + "learning_rate": 0.00016675881402161536, + "loss": 1.0329, + "step": 181 + }, + { + "epoch": 0.2912, + "grad_norm": 0.5153754305561259, + "learning_rate": 0.0001663719430227186, + "loss": 0.8631, + "step": 182 + }, + { + "epoch": 0.2928, + "grad_norm": 0.5421279505788661, + "learning_rate": 0.00016598328825710533, + "loss": 0.9095, + "step": 183 + }, + { + "epoch": 0.2944, + "grad_norm": 0.492670751330977, + "learning_rate": 0.000165592860169994, + "loss": 0.8032, + "step": 184 + }, + { + "epoch": 0.296, + "grad_norm": 0.5320609790810602, + "learning_rate": 0.00016520066925426144, + "loss": 0.8853, + "step": 185 + }, + { + "epoch": 0.2976, + "grad_norm": 0.6153617937266217, + "learning_rate": 0.0001648067260501611, + "loss": 0.9551, + "step": 186 + }, + { + "epoch": 0.2992, + "grad_norm": 0.5870908953633535, + "learning_rate": 0.0001644110411450398, + "loss": 0.8036, + "step": 187 + }, + { + "epoch": 0.3008, + "grad_norm": 0.5717943137734841, + "learning_rate": 0.00016401362517305296, + "loss": 0.9023, + "step": 188 + }, + { + "epoch": 0.3024, + "grad_norm": 0.5250490799889618, + "learning_rate": 0.00016361448881487914, + "loss": 0.8634, + "step": 189 + }, + { + "epoch": 0.304, + "grad_norm": 0.4879114155947266, + "learning_rate": 0.00016321364279743266, + "loss": 0.8236, + "step": 190 + }, + { + "epoch": 0.3056, + "grad_norm": 0.5500009660496658, + "learning_rate": 0.0001628110978935756, + "loss": 0.9207, + "step": 191 + }, + { + "epoch": 0.3072, + "grad_norm": 0.5064159311989878, + "learning_rate": 0.00016240686492182804, + "loss": 0.8665, + "step": 192 + }, + { + "epoch": 0.3088, + "grad_norm": 0.5204543664192307, + "learning_rate": 0.00016200095474607753, + "loss": 0.8569, + "step": 193 + }, + { + "epoch": 0.3104, + "grad_norm": 0.47615090357183554, + "learning_rate": 0.00016159337827528685, + "loss": 0.8748, + "step": 194 + }, + { + "epoch": 0.312, + "grad_norm": 0.46986745059184, + "learning_rate": 0.0001611841464632011, + "loss": 0.8208, + "step": 195 + }, + { + "epoch": 0.3136, + "grad_norm": 0.5662189526408826, + "learning_rate": 0.0001607732703080532, + "loss": 0.9079, + "step": 196 + }, + { + "epoch": 0.3152, + "grad_norm": 0.4614771187559411, + "learning_rate": 0.00016036076085226814, + "loss": 0.8955, + "step": 197 + }, + { + "epoch": 0.3168, + "grad_norm": 0.4128655696266702, + "learning_rate": 0.0001599466291821666, + "loss": 0.73, + "step": 198 + }, + { + "epoch": 0.3184, + "grad_norm": 0.46980684931672995, + "learning_rate": 0.0001595308864276666, + "loss": 0.8642, + "step": 199 + }, + { + "epoch": 0.32, + "grad_norm": 0.503751885111121, + "learning_rate": 0.0001591135437619847, + "loss": 0.898, + "step": 200 + }, + { + "epoch": 0.3216, + "grad_norm": 0.5246864852992136, + "learning_rate": 0.0001586946124013354, + "loss": 0.7843, + "step": 201 + }, + { + "epoch": 0.3232, + "grad_norm": 0.6556184227171945, + "learning_rate": 0.0001582741036046301, + "loss": 0.9596, + "step": 202 + }, + { + "epoch": 0.3248, + "grad_norm": 0.5452081719478181, + "learning_rate": 0.00015785202867317407, + "loss": 0.8479, + "step": 203 + }, + { + "epoch": 0.3264, + "grad_norm": 0.4689338175719907, + "learning_rate": 0.00015742839895036305, + "loss": 0.8096, + "step": 204 + }, + { + "epoch": 0.328, + "grad_norm": 0.5340978809935831, + "learning_rate": 0.00015700322582137827, + "loss": 0.8197, + "step": 205 + }, + { + "epoch": 0.3296, + "grad_norm": 0.5104927446857357, + "learning_rate": 0.0001565765207128805, + "loss": 0.8107, + "step": 206 + }, + { + "epoch": 0.3312, + "grad_norm": 0.5383843138676714, + "learning_rate": 0.0001561482950927029, + "loss": 0.8387, + "step": 207 + }, + { + "epoch": 0.3328, + "grad_norm": 0.4971648262048712, + "learning_rate": 0.00015571856046954285, + "loss": 0.8195, + "step": 208 + }, + { + "epoch": 0.3344, + "grad_norm": 0.536028429366966, + "learning_rate": 0.00015528732839265272, + "loss": 0.7823, + "step": 209 + }, + { + "epoch": 0.336, + "grad_norm": 0.49573802462995736, + "learning_rate": 0.0001548546104515294, + "loss": 0.8979, + "step": 210 + }, + { + "epoch": 0.3376, + "grad_norm": 0.6504266678543889, + "learning_rate": 0.00015442041827560274, + "loss": 0.9873, + "step": 211 + }, + { + "epoch": 0.3392, + "grad_norm": 0.5493979306274729, + "learning_rate": 0.00015398476353392323, + "loss": 0.909, + "step": 212 + }, + { + "epoch": 0.3408, + "grad_norm": 0.607176966768405, + "learning_rate": 0.00015354765793484834, + "loss": 0.9769, + "step": 213 + }, + { + "epoch": 0.3424, + "grad_norm": 0.6026217081429099, + "learning_rate": 0.00015310911322572753, + "loss": 0.9078, + "step": 214 + }, + { + "epoch": 0.344, + "grad_norm": 0.8260546535143017, + "learning_rate": 0.000152669141192587, + "loss": 0.9109, + "step": 215 + }, + { + "epoch": 0.3456, + "grad_norm": 0.565042480096335, + "learning_rate": 0.00015222775365981273, + "loss": 0.9683, + "step": 216 + }, + { + "epoch": 0.3472, + "grad_norm": 0.6631641156663064, + "learning_rate": 0.00015178496248983254, + "loss": 0.8976, + "step": 217 + }, + { + "epoch": 0.3488, + "grad_norm": 0.451467688196003, + "learning_rate": 0.00015134077958279765, + "loss": 0.7868, + "step": 218 + }, + { + "epoch": 0.3504, + "grad_norm": 0.5062873359160209, + "learning_rate": 0.00015089521687626243, + "loss": 0.844, + "step": 219 + }, + { + "epoch": 0.352, + "grad_norm": 0.5505012757477366, + "learning_rate": 0.000150448286344864, + "loss": 0.9056, + "step": 220 + }, + { + "epoch": 0.3536, + "grad_norm": 0.48791654606213525, + "learning_rate": 0.00015000000000000001, + "loss": 0.772, + "step": 221 + }, + { + "epoch": 0.3552, + "grad_norm": 0.4867542459090678, + "learning_rate": 0.00014955036988950618, + "loss": 0.7957, + "step": 222 + }, + { + "epoch": 0.3568, + "grad_norm": 0.4666455643040047, + "learning_rate": 0.00014909940809733222, + "loss": 0.8445, + "step": 223 + }, + { + "epoch": 0.3584, + "grad_norm": 0.7241971297993525, + "learning_rate": 0.00014864712674321734, + "loss": 0.8769, + "step": 224 + }, + { + "epoch": 0.36, + "grad_norm": 0.5014548367408045, + "learning_rate": 0.00014819353798236427, + "loss": 0.6988, + "step": 225 + }, + { + "epoch": 0.3616, + "grad_norm": 0.5485974010726679, + "learning_rate": 0.00014773865400511272, + "loss": 0.7829, + "step": 226 + }, + { + "epoch": 0.3632, + "grad_norm": 0.6326692826101251, + "learning_rate": 0.00014728248703661182, + "loss": 0.7927, + "step": 227 + }, + { + "epoch": 0.3648, + "grad_norm": 0.6754768867285195, + "learning_rate": 0.00014682504933649144, + "loss": 0.9564, + "step": 228 + }, + { + "epoch": 0.3664, + "grad_norm": 0.5253054477228808, + "learning_rate": 0.00014636635319853275, + "loss": 0.8997, + "step": 229 + }, + { + "epoch": 0.368, + "grad_norm": 0.5032148716031363, + "learning_rate": 0.00014590641095033787, + "loss": 0.7991, + "step": 230 + }, + { + "epoch": 0.3696, + "grad_norm": 0.5873720857985434, + "learning_rate": 0.00014544523495299842, + "loss": 0.875, + "step": 231 + }, + { + "epoch": 0.3712, + "grad_norm": 0.536716757908386, + "learning_rate": 0.0001449828376007636, + "loss": 0.9323, + "step": 232 + }, + { + "epoch": 0.3728, + "grad_norm": 0.5329601916499632, + "learning_rate": 0.0001445192313207067, + "loss": 0.8414, + "step": 233 + }, + { + "epoch": 0.3744, + "grad_norm": 0.6133109203613764, + "learning_rate": 0.0001440544285723915, + "loss": 0.9213, + "step": 234 + }, + { + "epoch": 0.376, + "grad_norm": 0.5420958304531465, + "learning_rate": 0.00014358844184753712, + "loss": 0.7188, + "step": 235 + }, + { + "epoch": 0.3776, + "grad_norm": 0.472702891895565, + "learning_rate": 0.00014312128366968243, + "loss": 0.842, + "step": 236 + }, + { + "epoch": 0.3792, + "grad_norm": 0.5724255421667166, + "learning_rate": 0.00014265296659384956, + "loss": 0.7698, + "step": 237 + }, + { + "epoch": 0.3808, + "grad_norm": 0.5598104974516404, + "learning_rate": 0.00014218350320620624, + "loss": 0.7986, + "step": 238 + }, + { + "epoch": 0.3824, + "grad_norm": 0.4895677463312306, + "learning_rate": 0.0001417129061237278, + "loss": 0.8813, + "step": 239 + }, + { + "epoch": 0.384, + "grad_norm": 0.5162603592848155, + "learning_rate": 0.00014124118799385796, + "loss": 0.8845, + "step": 240 + }, + { + "epoch": 0.3856, + "grad_norm": 0.5869776595758642, + "learning_rate": 0.00014076836149416887, + "loss": 0.861, + "step": 241 + }, + { + "epoch": 0.3872, + "grad_norm": 0.5617615598123941, + "learning_rate": 0.0001402944393320206, + "loss": 0.9071, + "step": 242 + }, + { + "epoch": 0.3888, + "grad_norm": 0.47855681145062084, + "learning_rate": 0.00013981943424421932, + "loss": 0.7601, + "step": 243 + }, + { + "epoch": 0.3904, + "grad_norm": 0.604374905007272, + "learning_rate": 0.00013934335899667527, + "loss": 0.8423, + "step": 244 + }, + { + "epoch": 0.392, + "grad_norm": 0.597112711619441, + "learning_rate": 0.00013886622638405952, + "loss": 0.902, + "step": 245 + }, + { + "epoch": 0.3936, + "grad_norm": 0.49367528291776197, + "learning_rate": 0.00013838804922946027, + "loss": 0.7977, + "step": 246 + }, + { + "epoch": 0.3952, + "grad_norm": 0.46024520939954344, + "learning_rate": 0.00013790884038403795, + "loss": 0.823, + "step": 247 + }, + { + "epoch": 0.3968, + "grad_norm": 0.5064679136575085, + "learning_rate": 0.00013742861272668012, + "loss": 0.8547, + "step": 248 + }, + { + "epoch": 0.3984, + "grad_norm": 0.4681231694973576, + "learning_rate": 0.00013694737916365517, + "loss": 0.7197, + "step": 249 + }, + { + "epoch": 0.4, + "grad_norm": 0.5194132813288855, + "learning_rate": 0.00013646515262826552, + "loss": 0.7684, + "step": 250 + }, + { + "epoch": 0.4016, + "grad_norm": 0.44168937865030244, + "learning_rate": 0.0001359819460805001, + "loss": 0.7906, + "step": 251 + }, + { + "epoch": 0.4032, + "grad_norm": 0.5363492869729305, + "learning_rate": 0.0001354977725066859, + "loss": 0.8445, + "step": 252 + }, + { + "epoch": 0.4048, + "grad_norm": 0.5630884118990156, + "learning_rate": 0.00013501264491913906, + "loss": 0.8903, + "step": 253 + }, + { + "epoch": 0.4064, + "grad_norm": 0.5421091365259456, + "learning_rate": 0.0001345265763558152, + "loss": 0.7832, + "step": 254 + }, + { + "epoch": 0.408, + "grad_norm": 0.5976361727040347, + "learning_rate": 0.00013403957987995882, + "loss": 0.905, + "step": 255 + }, + { + "epoch": 0.4096, + "grad_norm": 0.7722641770049049, + "learning_rate": 0.0001335516685797525, + "loss": 0.951, + "step": 256 + }, + { + "epoch": 0.4112, + "grad_norm": 0.5342617296373594, + "learning_rate": 0.00013306285556796495, + "loss": 0.7965, + "step": 257 + }, + { + "epoch": 0.4128, + "grad_norm": 0.4493539228425194, + "learning_rate": 0.00013257315398159864, + "loss": 0.8467, + "step": 258 + }, + { + "epoch": 0.4144, + "grad_norm": 0.42445309368198597, + "learning_rate": 0.00013208257698153677, + "loss": 0.7371, + "step": 259 + }, + { + "epoch": 0.416, + "grad_norm": 0.6110142215922019, + "learning_rate": 0.00013159113775218964, + "loss": 0.8311, + "step": 260 + }, + { + "epoch": 0.4176, + "grad_norm": 0.4171728629188897, + "learning_rate": 0.00013109884950114007, + "loss": 0.8136, + "step": 261 + }, + { + "epoch": 0.4192, + "grad_norm": 0.4145065262066619, + "learning_rate": 0.00013060572545878875, + "loss": 0.7543, + "step": 262 + }, + { + "epoch": 0.4208, + "grad_norm": 0.39505478649502457, + "learning_rate": 0.00013011177887799845, + "loss": 0.7792, + "step": 263 + }, + { + "epoch": 0.4224, + "grad_norm": 0.4750706454904371, + "learning_rate": 0.00012961702303373795, + "loss": 0.844, + "step": 264 + }, + { + "epoch": 0.424, + "grad_norm": 0.5645172644167065, + "learning_rate": 0.00012912147122272523, + "loss": 0.8761, + "step": 265 + }, + { + "epoch": 0.4256, + "grad_norm": 0.5047096633256831, + "learning_rate": 0.00012862513676307008, + "loss": 0.8484, + "step": 266 + }, + { + "epoch": 0.4272, + "grad_norm": 0.5466855524580382, + "learning_rate": 0.00012812803299391628, + "loss": 0.7414, + "step": 267 + }, + { + "epoch": 0.4288, + "grad_norm": 0.4527502164076956, + "learning_rate": 0.00012763017327508305, + "loss": 0.7615, + "step": 268 + }, + { + "epoch": 0.4304, + "grad_norm": 0.621948024176012, + "learning_rate": 0.0001271315709867059, + "loss": 1.0494, + "step": 269 + }, + { + "epoch": 0.432, + "grad_norm": 0.5278015972675635, + "learning_rate": 0.00012663223952887723, + "loss": 0.7699, + "step": 270 + }, + { + "epoch": 0.4336, + "grad_norm": 0.4788403912458289, + "learning_rate": 0.00012613219232128608, + "loss": 0.8666, + "step": 271 + }, + { + "epoch": 0.4352, + "grad_norm": 0.6796695538593274, + "learning_rate": 0.00012563144280285741, + "loss": 1.0207, + "step": 272 + }, + { + "epoch": 0.4368, + "grad_norm": 0.5094336585671372, + "learning_rate": 0.00012513000443139112, + "loss": 0.793, + "step": 273 + }, + { + "epoch": 0.4384, + "grad_norm": 0.44382775649854656, + "learning_rate": 0.00012462789068320017, + "loss": 0.7909, + "step": 274 + }, + { + "epoch": 0.44, + "grad_norm": 0.5464019050437547, + "learning_rate": 0.00012412511505274844, + "loss": 0.7833, + "step": 275 + }, + { + "epoch": 0.4416, + "grad_norm": 0.440164784544212, + "learning_rate": 0.00012362169105228826, + "loss": 0.8175, + "step": 276 + }, + { + "epoch": 0.4432, + "grad_norm": 0.523340673372237, + "learning_rate": 0.000123117632211497, + "loss": 0.8794, + "step": 277 + }, + { + "epoch": 0.4448, + "grad_norm": 0.46938605542367634, + "learning_rate": 0.00012261295207711346, + "loss": 0.7615, + "step": 278 + }, + { + "epoch": 0.4464, + "grad_norm": 0.6612564034378013, + "learning_rate": 0.0001221076642125742, + "loss": 0.8776, + "step": 279 + }, + { + "epoch": 0.448, + "grad_norm": 0.5198656780429122, + "learning_rate": 0.00012160178219764837, + "loss": 0.867, + "step": 280 + }, + { + "epoch": 0.4496, + "grad_norm": 0.5849180701499528, + "learning_rate": 0.00012109531962807332, + "loss": 0.9267, + "step": 281 + }, + { + "epoch": 0.4512, + "grad_norm": 0.445922051992169, + "learning_rate": 0.00012058829011518896, + "loss": 0.809, + "step": 282 + }, + { + "epoch": 0.4528, + "grad_norm": 0.5233120278969384, + "learning_rate": 0.00012008070728557186, + "loss": 0.8476, + "step": 283 + }, + { + "epoch": 0.4544, + "grad_norm": 0.4467545901087636, + "learning_rate": 0.00011957258478066931, + "loss": 0.8565, + "step": 284 + }, + { + "epoch": 0.456, + "grad_norm": 0.5419164891412087, + "learning_rate": 0.00011906393625643244, + "loss": 0.8012, + "step": 285 + }, + { + "epoch": 0.4576, + "grad_norm": 0.49153110184299614, + "learning_rate": 0.00011855477538294935, + "loss": 0.7781, + "step": 286 + }, + { + "epoch": 0.4592, + "grad_norm": 0.441225854649443, + "learning_rate": 0.00011804511584407763, + "loss": 0.7527, + "step": 287 + }, + { + "epoch": 0.4608, + "grad_norm": 0.5420293577345939, + "learning_rate": 0.00011753497133707679, + "loss": 0.8052, + "step": 288 + }, + { + "epoch": 0.4624, + "grad_norm": 0.621160158670621, + "learning_rate": 0.00011702435557223987, + "loss": 0.8041, + "step": 289 + }, + { + "epoch": 0.464, + "grad_norm": 0.6226969416632548, + "learning_rate": 0.00011651328227252517, + "loss": 0.7079, + "step": 290 + }, + { + "epoch": 0.4656, + "grad_norm": 0.46305532658623033, + "learning_rate": 0.00011600176517318741, + "loss": 0.79, + "step": 291 + }, + { + "epoch": 0.4672, + "grad_norm": 0.512448960407912, + "learning_rate": 0.00011548981802140848, + "loss": 0.8365, + "step": 292 + }, + { + "epoch": 0.4688, + "grad_norm": 0.4574034506128639, + "learning_rate": 0.00011497745457592816, + "loss": 0.7266, + "step": 293 + }, + { + "epoch": 0.4704, + "grad_norm": 0.4887805022058899, + "learning_rate": 0.00011446468860667421, + "loss": 0.7922, + "step": 294 + }, + { + "epoch": 0.472, + "grad_norm": 0.5008127484650859, + "learning_rate": 0.00011395153389439233, + "loss": 0.8055, + "step": 295 + }, + { + "epoch": 0.4736, + "grad_norm": 0.6533437588644191, + "learning_rate": 0.00011343800423027582, + "loss": 0.9421, + "step": 296 + }, + { + "epoch": 0.4752, + "grad_norm": 0.43865513025574515, + "learning_rate": 0.0001129241134155949, + "loss": 0.8206, + "step": 297 + }, + { + "epoch": 0.4768, + "grad_norm": 0.500355747105389, + "learning_rate": 0.00011240987526132594, + "loss": 0.7647, + "step": 298 + }, + { + "epoch": 0.4784, + "grad_norm": 0.5046542253194481, + "learning_rate": 0.00011189530358778005, + "loss": 0.7697, + "step": 299 + }, + { + "epoch": 0.48, + "grad_norm": 0.47989589310254305, + "learning_rate": 0.00011138041222423177, + "loss": 0.7974, + "step": 300 + }, + { + "epoch": 0.4816, + "grad_norm": 0.45532907895699337, + "learning_rate": 0.00011086521500854745, + "loss": 0.7756, + "step": 301 + }, + { + "epoch": 0.4832, + "grad_norm": 0.43962213578637105, + "learning_rate": 0.00011034972578681338, + "loss": 0.8162, + "step": 302 + }, + { + "epoch": 0.4848, + "grad_norm": 0.5606074664098125, + "learning_rate": 0.00010983395841296348, + "loss": 0.8901, + "step": 303 + }, + { + "epoch": 0.4864, + "grad_norm": 0.5079830206160058, + "learning_rate": 0.00010931792674840718, + "loss": 0.8149, + "step": 304 + }, + { + "epoch": 0.488, + "grad_norm": 0.5085969014108155, + "learning_rate": 0.00010880164466165674, + "loss": 0.8234, + "step": 305 + }, + { + "epoch": 0.4896, + "grad_norm": 0.47466515207812976, + "learning_rate": 0.00010828512602795462, + "loss": 0.7461, + "step": 306 + }, + { + "epoch": 0.4912, + "grad_norm": 0.46289215202445777, + "learning_rate": 0.00010776838472890065, + "loss": 0.7507, + "step": 307 + }, + { + "epoch": 0.4928, + "grad_norm": 0.4921323175256616, + "learning_rate": 0.00010725143465207867, + "loss": 0.8246, + "step": 308 + }, + { + "epoch": 0.4944, + "grad_norm": 0.5238576407371962, + "learning_rate": 0.00010673428969068364, + "loss": 0.8353, + "step": 309 + }, + { + "epoch": 0.496, + "grad_norm": 0.5521943239755719, + "learning_rate": 0.00010621696374314807, + "loss": 0.8444, + "step": 310 + }, + { + "epoch": 0.4976, + "grad_norm": 0.49431425624314157, + "learning_rate": 0.00010569947071276847, + "loss": 0.8645, + "step": 311 + }, + { + "epoch": 0.4992, + "grad_norm": 0.7208355365716979, + "learning_rate": 0.00010518182450733186, + "loss": 0.9068, + "step": 312 + }, + { + "epoch": 0.5008, + "grad_norm": 0.41838050675402194, + "learning_rate": 0.00010466403903874176, + "loss": 0.7234, + "step": 313 + }, + { + "epoch": 0.5024, + "grad_norm": 0.5633323218709908, + "learning_rate": 0.00010414612822264455, + "loss": 0.9846, + "step": 314 + }, + { + "epoch": 0.504, + "grad_norm": 0.5331866506375034, + "learning_rate": 0.00010362810597805526, + "loss": 0.8208, + "step": 315 + }, + { + "epoch": 0.5056, + "grad_norm": 0.45965572502528623, + "learning_rate": 0.0001031099862269837, + "loss": 0.7831, + "step": 316 + }, + { + "epoch": 0.5072, + "grad_norm": 0.4637670762530522, + "learning_rate": 0.00010259178289406011, + "loss": 0.8461, + "step": 317 + }, + { + "epoch": 0.5088, + "grad_norm": 0.5027914914138983, + "learning_rate": 0.00010207350990616107, + "loss": 0.8697, + "step": 318 + }, + { + "epoch": 0.5104, + "grad_norm": 0.4593080370419945, + "learning_rate": 0.0001015551811920351, + "loss": 0.748, + "step": 319 + }, + { + "epoch": 0.512, + "grad_norm": 0.534380283754602, + "learning_rate": 0.00010103681068192845, + "loss": 0.8775, + "step": 320 + }, + { + "epoch": 0.5136, + "grad_norm": 0.5399591394604326, + "learning_rate": 0.00010051841230721065, + "loss": 0.8975, + "step": 321 + }, + { + "epoch": 0.5152, + "grad_norm": 0.5168919839273327, + "learning_rate": 0.0001, + "loss": 0.7658, + "step": 322 + }, + { + "epoch": 0.5168, + "grad_norm": 0.698638003373429, + "learning_rate": 9.948158769278939e-05, + "loss": 0.9027, + "step": 323 + }, + { + "epoch": 0.5184, + "grad_norm": 0.4983715615160177, + "learning_rate": 9.896318931807155e-05, + "loss": 0.7335, + "step": 324 + }, + { + "epoch": 0.52, + "grad_norm": 0.4808635157821013, + "learning_rate": 9.844481880796491e-05, + "loss": 0.7875, + "step": 325 + }, + { + "epoch": 0.5216, + "grad_norm": 0.6031685623744499, + "learning_rate": 9.792649009383899e-05, + "loss": 0.936, + "step": 326 + }, + { + "epoch": 0.5232, + "grad_norm": 0.5441544394396668, + "learning_rate": 9.740821710593989e-05, + "loss": 0.7774, + "step": 327 + }, + { + "epoch": 0.5248, + "grad_norm": 0.47565093435532546, + "learning_rate": 9.689001377301633e-05, + "loss": 0.8093, + "step": 328 + }, + { + "epoch": 0.5264, + "grad_norm": 0.4429665536441755, + "learning_rate": 9.637189402194476e-05, + "loss": 0.7513, + "step": 329 + }, + { + "epoch": 0.528, + "grad_norm": 0.5070355088304355, + "learning_rate": 9.585387177735547e-05, + "loss": 0.8311, + "step": 330 + }, + { + "epoch": 0.5296, + "grad_norm": 0.4754187913585242, + "learning_rate": 9.533596096125825e-05, + "loss": 0.7966, + "step": 331 + }, + { + "epoch": 0.5312, + "grad_norm": 0.422923847295543, + "learning_rate": 9.481817549266817e-05, + "loss": 0.736, + "step": 332 + }, + { + "epoch": 0.5328, + "grad_norm": 0.4359887732874065, + "learning_rate": 9.430052928723153e-05, + "loss": 0.7062, + "step": 333 + }, + { + "epoch": 0.5344, + "grad_norm": 0.4390223413713934, + "learning_rate": 9.378303625685195e-05, + "loss": 0.8652, + "step": 334 + }, + { + "epoch": 0.536, + "grad_norm": 0.4118609935094733, + "learning_rate": 9.326571030931637e-05, + "loss": 0.7834, + "step": 335 + }, + { + "epoch": 0.5376, + "grad_norm": 0.4211081945079039, + "learning_rate": 9.274856534792138e-05, + "loss": 0.7597, + "step": 336 + }, + { + "epoch": 0.5392, + "grad_norm": 0.45862514356503903, + "learning_rate": 9.223161527109937e-05, + "loss": 0.8133, + "step": 337 + }, + { + "epoch": 0.5408, + "grad_norm": 0.5068072677401498, + "learning_rate": 9.171487397204539e-05, + "loss": 0.8107, + "step": 338 + }, + { + "epoch": 0.5424, + "grad_norm": 0.5341515575737829, + "learning_rate": 9.119835533834331e-05, + "loss": 0.7058, + "step": 339 + }, + { + "epoch": 0.544, + "grad_norm": 0.4189755931038567, + "learning_rate": 9.068207325159284e-05, + "loss": 0.6702, + "step": 340 + }, + { + "epoch": 0.5456, + "grad_norm": 0.5439267809005683, + "learning_rate": 9.016604158703654e-05, + "loss": 0.7915, + "step": 341 + }, + { + "epoch": 0.5472, + "grad_norm": 0.5889477087996099, + "learning_rate": 8.965027421318665e-05, + "loss": 0.8669, + "step": 342 + }, + { + "epoch": 0.5488, + "grad_norm": 0.429007878832156, + "learning_rate": 8.913478499145254e-05, + "loss": 0.7741, + "step": 343 + }, + { + "epoch": 0.5504, + "grad_norm": 0.5468616463299769, + "learning_rate": 8.861958777576827e-05, + "loss": 0.8144, + "step": 344 + }, + { + "epoch": 0.552, + "grad_norm": 0.5383443070500786, + "learning_rate": 8.810469641222001e-05, + "loss": 0.8272, + "step": 345 + }, + { + "epoch": 0.5536, + "grad_norm": 0.40094297792258693, + "learning_rate": 8.759012473867407e-05, + "loss": 0.698, + "step": 346 + }, + { + "epoch": 0.5552, + "grad_norm": 0.5756884008992263, + "learning_rate": 8.707588658440511e-05, + "loss": 0.8481, + "step": 347 + }, + { + "epoch": 0.5568, + "grad_norm": 0.43047632372839495, + "learning_rate": 8.656199576972423e-05, + "loss": 0.6932, + "step": 348 + }, + { + "epoch": 0.5584, + "grad_norm": 0.5593530796265426, + "learning_rate": 8.604846610560771e-05, + "loss": 0.8549, + "step": 349 + }, + { + "epoch": 0.56, + "grad_norm": 0.4780192226188432, + "learning_rate": 8.553531139332582e-05, + "loss": 0.7478, + "step": 350 + }, + { + "epoch": 0.5616, + "grad_norm": 0.6674983319766304, + "learning_rate": 8.502254542407186e-05, + "loss": 0.7739, + "step": 351 + }, + { + "epoch": 0.5632, + "grad_norm": 0.44192395234058857, + "learning_rate": 8.451018197859153e-05, + "loss": 0.7825, + "step": 352 + }, + { + "epoch": 0.5648, + "grad_norm": 0.41801756105187, + "learning_rate": 8.399823482681262e-05, + "loss": 0.6727, + "step": 353 + }, + { + "epoch": 0.5664, + "grad_norm": 0.43331326662765, + "learning_rate": 8.348671772747487e-05, + "loss": 0.6991, + "step": 354 + }, + { + "epoch": 0.568, + "grad_norm": 0.5102144606411114, + "learning_rate": 8.297564442776014e-05, + "loss": 0.8652, + "step": 355 + }, + { + "epoch": 0.5696, + "grad_norm": 0.4470810685284007, + "learning_rate": 8.246502866292324e-05, + "loss": 0.7642, + "step": 356 + }, + { + "epoch": 0.5712, + "grad_norm": 0.46452889334302405, + "learning_rate": 8.195488415592238e-05, + "loss": 0.7703, + "step": 357 + }, + { + "epoch": 0.5728, + "grad_norm": 0.5029399250037823, + "learning_rate": 8.144522461705067e-05, + "loss": 0.778, + "step": 358 + }, + { + "epoch": 0.5744, + "grad_norm": 0.5430178947137475, + "learning_rate": 8.093606374356759e-05, + "loss": 0.899, + "step": 359 + }, + { + "epoch": 0.576, + "grad_norm": 0.6073099557016157, + "learning_rate": 8.042741521933071e-05, + "loss": 0.8754, + "step": 360 + }, + { + "epoch": 0.5776, + "grad_norm": 0.5139804486458024, + "learning_rate": 7.991929271442817e-05, + "loss": 0.8326, + "step": 361 + }, + { + "epoch": 0.5792, + "grad_norm": 0.5099495498357821, + "learning_rate": 7.941170988481108e-05, + "loss": 0.8307, + "step": 362 + }, + { + "epoch": 0.5808, + "grad_norm": 0.5524566640249836, + "learning_rate": 7.89046803719267e-05, + "loss": 0.8825, + "step": 363 + }, + { + "epoch": 0.5824, + "grad_norm": 0.48527919854222334, + "learning_rate": 7.839821780235168e-05, + "loss": 0.8023, + "step": 364 + }, + { + "epoch": 0.584, + "grad_norm": 0.5126055869339473, + "learning_rate": 7.789233578742582e-05, + "loss": 0.8201, + "step": 365 + }, + { + "epoch": 0.5856, + "grad_norm": 0.5120298082033016, + "learning_rate": 7.738704792288655e-05, + "loss": 0.7424, + "step": 366 + }, + { + "epoch": 0.5872, + "grad_norm": 0.5961778683910078, + "learning_rate": 7.688236778850306e-05, + "loss": 0.8604, + "step": 367 + }, + { + "epoch": 0.5888, + "grad_norm": 0.5154840624172337, + "learning_rate": 7.637830894771175e-05, + "loss": 0.8931, + "step": 368 + }, + { + "epoch": 0.5904, + "grad_norm": 0.4415433685673907, + "learning_rate": 7.587488494725157e-05, + "loss": 0.7059, + "step": 369 + }, + { + "epoch": 0.592, + "grad_norm": 0.43716725195073497, + "learning_rate": 7.537210931679987e-05, + "loss": 0.7618, + "step": 370 + }, + { + "epoch": 0.5936, + "grad_norm": 0.44773956762975003, + "learning_rate": 7.48699955686089e-05, + "loss": 0.7404, + "step": 371 + }, + { + "epoch": 0.5952, + "grad_norm": 0.5878875504156181, + "learning_rate": 7.43685571971426e-05, + "loss": 0.9092, + "step": 372 + }, + { + "epoch": 0.5968, + "grad_norm": 0.40954711385606696, + "learning_rate": 7.386780767871397e-05, + "loss": 0.7131, + "step": 373 + }, + { + "epoch": 0.5984, + "grad_norm": 0.43823375388049757, + "learning_rate": 7.336776047112276e-05, + "loss": 0.8534, + "step": 374 + }, + { + "epoch": 0.6, + "grad_norm": 0.5281994591442473, + "learning_rate": 7.286842901329412e-05, + "loss": 0.8525, + "step": 375 + }, + { + "epoch": 0.6016, + "grad_norm": 0.6216439994690285, + "learning_rate": 7.236982672491698e-05, + "loss": 0.912, + "step": 376 + }, + { + "epoch": 0.6032, + "grad_norm": 0.5369931377412475, + "learning_rate": 7.187196700608373e-05, + "loss": 0.8762, + "step": 377 + }, + { + "epoch": 0.6048, + "grad_norm": 0.42704156401937954, + "learning_rate": 7.137486323692995e-05, + "loss": 0.7276, + "step": 378 + }, + { + "epoch": 0.6064, + "grad_norm": 0.5406466868308399, + "learning_rate": 7.087852877727481e-05, + "loss": 0.7516, + "step": 379 + }, + { + "epoch": 0.608, + "grad_norm": 0.4561752776969113, + "learning_rate": 7.038297696626206e-05, + "loss": 0.7388, + "step": 380 + }, + { + "epoch": 0.6096, + "grad_norm": 0.47229546061123273, + "learning_rate": 6.988822112200156e-05, + "loss": 0.7843, + "step": 381 + }, + { + "epoch": 0.6112, + "grad_norm": 0.4325737623515653, + "learning_rate": 6.939427454121128e-05, + "loss": 0.7616, + "step": 382 + }, + { + "epoch": 0.6128, + "grad_norm": 0.49204149089209853, + "learning_rate": 6.890115049885994e-05, + "loss": 0.7128, + "step": 383 + }, + { + "epoch": 0.6144, + "grad_norm": 0.559562407478129, + "learning_rate": 6.84088622478104e-05, + "loss": 0.7505, + "step": 384 + }, + { + "epoch": 0.616, + "grad_norm": 0.6570003569302663, + "learning_rate": 6.791742301846326e-05, + "loss": 0.758, + "step": 385 + }, + { + "epoch": 0.6176, + "grad_norm": 0.621282987201334, + "learning_rate": 6.742684601840141e-05, + "loss": 0.8657, + "step": 386 + }, + { + "epoch": 0.6192, + "grad_norm": 0.4708716086815571, + "learning_rate": 6.693714443203507e-05, + "loss": 0.7436, + "step": 387 + }, + { + "epoch": 0.6208, + "grad_norm": 0.6155562873756527, + "learning_rate": 6.644833142024751e-05, + "loss": 0.8836, + "step": 388 + }, + { + "epoch": 0.6224, + "grad_norm": 0.5301712023276384, + "learning_rate": 6.59604201200412e-05, + "loss": 0.8209, + "step": 389 + }, + { + "epoch": 0.624, + "grad_norm": 0.4583907825180325, + "learning_rate": 6.547342364418481e-05, + "loss": 0.7899, + "step": 390 + }, + { + "epoch": 0.6256, + "grad_norm": 0.5170408420260697, + "learning_rate": 6.498735508086093e-05, + "loss": 0.7679, + "step": 391 + }, + { + "epoch": 0.6272, + "grad_norm": 0.43221213770041583, + "learning_rate": 6.450222749331414e-05, + "loss": 0.8044, + "step": 392 + }, + { + "epoch": 0.6288, + "grad_norm": 0.45574898169048583, + "learning_rate": 6.40180539194999e-05, + "loss": 0.7414, + "step": 393 + }, + { + "epoch": 0.6304, + "grad_norm": 0.4493726577026947, + "learning_rate": 6.35348473717345e-05, + "loss": 0.7452, + "step": 394 + }, + { + "epoch": 0.632, + "grad_norm": 0.49372481319737627, + "learning_rate": 6.305262083634488e-05, + "loss": 0.8363, + "step": 395 + }, + { + "epoch": 0.6336, + "grad_norm": 0.4999818684934643, + "learning_rate": 6.25713872733199e-05, + "loss": 0.778, + "step": 396 + }, + { + "epoch": 0.6352, + "grad_norm": 0.4717466234568454, + "learning_rate": 6.209115961596208e-05, + "loss": 0.7671, + "step": 397 + }, + { + "epoch": 0.6368, + "grad_norm": 0.6376163660666853, + "learning_rate": 6.161195077053976e-05, + "loss": 0.8994, + "step": 398 + }, + { + "epoch": 0.6384, + "grad_norm": 0.34455217736810373, + "learning_rate": 6.113377361594049e-05, + "loss": 0.6036, + "step": 399 + }, + { + "epoch": 0.64, + "grad_norm": 0.4902974603118997, + "learning_rate": 6.065664100332478e-05, + "loss": 0.7374, + "step": 400 + }, + { + "epoch": 0.6416, + "grad_norm": 0.49017582100179835, + "learning_rate": 6.018056575578075e-05, + "loss": 0.7625, + "step": 401 + }, + { + "epoch": 0.6432, + "grad_norm": 0.5787294205160753, + "learning_rate": 5.970556066797941e-05, + "loss": 0.7998, + "step": 402 + }, + { + "epoch": 0.6448, + "grad_norm": 0.5165118760391255, + "learning_rate": 5.923163850583113e-05, + "loss": 0.7682, + "step": 403 + }, + { + "epoch": 0.6464, + "grad_norm": 0.49921952578607487, + "learning_rate": 5.875881200614207e-05, + "loss": 0.8486, + "step": 404 + }, + { + "epoch": 0.648, + "grad_norm": 0.4209301751735459, + "learning_rate": 5.828709387627218e-05, + "loss": 0.7568, + "step": 405 + }, + { + "epoch": 0.6496, + "grad_norm": 0.5408122468777863, + "learning_rate": 5.781649679379378e-05, + "loss": 0.7727, + "step": 406 + }, + { + "epoch": 0.6512, + "grad_norm": 0.4796961423122955, + "learning_rate": 5.73470334061505e-05, + "loss": 0.8001, + "step": 407 + }, + { + "epoch": 0.6528, + "grad_norm": 0.5641615170401117, + "learning_rate": 5.687871633031754e-05, + "loss": 0.8703, + "step": 408 + }, + { + "epoch": 0.6544, + "grad_norm": 0.5165229219449532, + "learning_rate": 5.6411558152462894e-05, + "loss": 0.8715, + "step": 409 + }, + { + "epoch": 0.656, + "grad_norm": 0.5041378256889354, + "learning_rate": 5.5945571427608526e-05, + "loss": 0.7968, + "step": 410 + }, + { + "epoch": 0.6576, + "grad_norm": 0.7716443536792185, + "learning_rate": 5.54807686792933e-05, + "loss": 0.993, + "step": 411 + }, + { + "epoch": 0.6592, + "grad_norm": 0.4854334707630958, + "learning_rate": 5.501716239923642e-05, + "loss": 0.8109, + "step": 412 + }, + { + "epoch": 0.6608, + "grad_norm": 0.4720714705256159, + "learning_rate": 5.4554765047001613e-05, + "loss": 0.797, + "step": 413 + }, + { + "epoch": 0.6624, + "grad_norm": 0.5897099387394112, + "learning_rate": 5.4093589049662175e-05, + "loss": 0.9193, + "step": 414 + }, + { + "epoch": 0.664, + "grad_norm": 0.4629166003638903, + "learning_rate": 5.363364680146725e-05, + "loss": 0.7839, + "step": 415 + }, + { + "epoch": 0.6656, + "grad_norm": 0.5081906721526724, + "learning_rate": 5.31749506635086e-05, + "loss": 0.7701, + "step": 416 + }, + { + "epoch": 0.6672, + "grad_norm": 0.464097091171769, + "learning_rate": 5.271751296338823e-05, + "loss": 0.7314, + "step": 417 + }, + { + "epoch": 0.6688, + "grad_norm": 0.48248576515473834, + "learning_rate": 5.226134599488728e-05, + "loss": 0.8174, + "step": 418 + }, + { + "epoch": 0.6704, + "grad_norm": 0.5692971171059747, + "learning_rate": 5.180646201763577e-05, + "loss": 0.7447, + "step": 419 + }, + { + "epoch": 0.672, + "grad_norm": 0.5419683355113112, + "learning_rate": 5.135287325678271e-05, + "loss": 0.8103, + "step": 420 + }, + { + "epoch": 0.6736, + "grad_norm": 0.45507138238142675, + "learning_rate": 5.090059190266779e-05, + "loss": 0.7874, + "step": 421 + }, + { + "epoch": 0.6752, + "grad_norm": 0.481578233270933, + "learning_rate": 5.0449630110493836e-05, + "loss": 0.8005, + "step": 422 + }, + { + "epoch": 0.6768, + "grad_norm": 0.436227883746338, + "learning_rate": 5.000000000000002e-05, + "loss": 0.6995, + "step": 423 + }, + { + "epoch": 0.6784, + "grad_norm": 0.47298332089542805, + "learning_rate": 4.955171365513603e-05, + "loss": 0.7025, + "step": 424 + }, + { + "epoch": 0.68, + "grad_norm": 0.6106500045165093, + "learning_rate": 4.9104783123737566e-05, + "loss": 0.8114, + "step": 425 + }, + { + "epoch": 0.6816, + "grad_norm": 0.47740320285622895, + "learning_rate": 4.865922041720239e-05, + "loss": 0.765, + "step": 426 + }, + { + "epoch": 0.6832, + "grad_norm": 0.702214183673606, + "learning_rate": 4.821503751016746e-05, + "loss": 0.8412, + "step": 427 + }, + { + "epoch": 0.6848, + "grad_norm": 0.5562422215336019, + "learning_rate": 4.777224634018732e-05, + "loss": 0.7733, + "step": 428 + }, + { + "epoch": 0.6864, + "grad_norm": 0.4086239010126385, + "learning_rate": 4.733085880741301e-05, + "loss": 0.7158, + "step": 429 + }, + { + "epoch": 0.688, + "grad_norm": 0.4250546088154585, + "learning_rate": 4.689088677427249e-05, + "loss": 0.6981, + "step": 430 + }, + { + "epoch": 0.6896, + "grad_norm": 0.5252528262159712, + "learning_rate": 4.645234206515171e-05, + "loss": 0.7999, + "step": 431 + }, + { + "epoch": 0.6912, + "grad_norm": 0.5250093628028863, + "learning_rate": 4.6015236466076747e-05, + "loss": 0.7516, + "step": 432 + }, + { + "epoch": 0.6928, + "grad_norm": 0.44944119057037624, + "learning_rate": 4.5579581724397255e-05, + "loss": 0.6996, + "step": 433 + }, + { + "epoch": 0.6944, + "grad_norm": 0.373339469396088, + "learning_rate": 4.514538954847064e-05, + "loss": 0.702, + "step": 434 + }, + { + "epoch": 0.696, + "grad_norm": 0.5091404542609552, + "learning_rate": 4.471267160734731e-05, + "loss": 0.8324, + "step": 435 + }, + { + "epoch": 0.6976, + "grad_norm": 0.4559081840121177, + "learning_rate": 4.428143953045717e-05, + "loss": 0.7937, + "step": 436 + }, + { + "epoch": 0.6992, + "grad_norm": 0.5870073860406521, + "learning_rate": 4.385170490729712e-05, + "loss": 0.8393, + "step": 437 + }, + { + "epoch": 0.7008, + "grad_norm": 0.41024239467456186, + "learning_rate": 4.342347928711953e-05, + "loss": 0.7166, + "step": 438 + }, + { + "epoch": 0.7024, + "grad_norm": 0.47804588242983204, + "learning_rate": 4.2996774178621736e-05, + "loss": 0.8401, + "step": 439 + }, + { + "epoch": 0.704, + "grad_norm": 0.4462204893506717, + "learning_rate": 4.257160104963696e-05, + "loss": 0.7685, + "step": 440 + }, + { + "epoch": 0.7056, + "grad_norm": 0.4025251470225496, + "learning_rate": 4.2147971326825966e-05, + "loss": 0.7112, + "step": 441 + }, + { + "epoch": 0.7072, + "grad_norm": 0.6927721113610363, + "learning_rate": 4.172589639536991e-05, + "loss": 0.789, + "step": 442 + }, + { + "epoch": 0.7088, + "grad_norm": 0.4873568417670427, + "learning_rate": 4.130538759866457e-05, + "loss": 0.7658, + "step": 443 + }, + { + "epoch": 0.7104, + "grad_norm": 0.548380044705112, + "learning_rate": 4.088645623801534e-05, + "loss": 0.7898, + "step": 444 + }, + { + "epoch": 0.712, + "grad_norm": 0.45470480619928993, + "learning_rate": 4.046911357233343e-05, + "loss": 0.7298, + "step": 445 + }, + { + "epoch": 0.7136, + "grad_norm": 0.653472115293909, + "learning_rate": 4.00533708178334e-05, + "loss": 0.9265, + "step": 446 + }, + { + "epoch": 0.7152, + "grad_norm": 0.5147624023346475, + "learning_rate": 3.963923914773187e-05, + "loss": 0.8142, + "step": 447 + }, + { + "epoch": 0.7168, + "grad_norm": 0.5852532764292365, + "learning_rate": 3.922672969194686e-05, + "loss": 0.6888, + "step": 448 + }, + { + "epoch": 0.7184, + "grad_norm": 0.38289933675282234, + "learning_rate": 3.8815853536798904e-05, + "loss": 0.6713, + "step": 449 + }, + { + "epoch": 0.72, + "grad_norm": 0.5745378908728875, + "learning_rate": 3.840662172471315e-05, + "loss": 0.8259, + "step": 450 + }, + { + "epoch": 0.7216, + "grad_norm": 0.5227230237807072, + "learning_rate": 3.79990452539225e-05, + "loss": 0.7907, + "step": 451 + }, + { + "epoch": 0.7232, + "grad_norm": 0.48336776108580554, + "learning_rate": 3.759313507817196e-05, + "loss": 0.7115, + "step": 452 + }, + { + "epoch": 0.7248, + "grad_norm": 0.3610806178212394, + "learning_rate": 3.7188902106424416e-05, + "loss": 0.651, + "step": 453 + }, + { + "epoch": 0.7264, + "grad_norm": 0.66297172148611, + "learning_rate": 3.678635720256737e-05, + "loss": 0.9544, + "step": 454 + }, + { + "epoch": 0.728, + "grad_norm": 0.44161095809461837, + "learning_rate": 3.638551118512089e-05, + "loss": 0.7518, + "step": 455 + }, + { + "epoch": 0.7296, + "grad_norm": 0.4575633460607062, + "learning_rate": 3.5986374826947066e-05, + "loss": 0.7552, + "step": 456 + }, + { + "epoch": 0.7312, + "grad_norm": 0.722723677588711, + "learning_rate": 3.558895885496023e-05, + "loss": 0.866, + "step": 457 + }, + { + "epoch": 0.7328, + "grad_norm": 0.575276403486101, + "learning_rate": 3.519327394983888e-05, + "loss": 0.8554, + "step": 458 + }, + { + "epoch": 0.7344, + "grad_norm": 0.45820156110847365, + "learning_rate": 3.479933074573858e-05, + "loss": 0.7398, + "step": 459 + }, + { + "epoch": 0.736, + "grad_norm": 0.39389106563921494, + "learning_rate": 3.440713983000601e-05, + "loss": 0.7677, + "step": 460 + }, + { + "epoch": 0.7376, + "grad_norm": 0.4857233278601645, + "learning_rate": 3.401671174289469e-05, + "loss": 0.7849, + "step": 461 + }, + { + "epoch": 0.7392, + "grad_norm": 0.5844203559113743, + "learning_rate": 3.362805697728145e-05, + "loss": 0.8875, + "step": 462 + }, + { + "epoch": 0.7408, + "grad_norm": 0.47011962025880005, + "learning_rate": 3.324118597838464e-05, + "loss": 0.8164, + "step": 463 + }, + { + "epoch": 0.7424, + "grad_norm": 0.5636903977516472, + "learning_rate": 3.285610914348332e-05, + "loss": 0.956, + "step": 464 + }, + { + "epoch": 0.744, + "grad_norm": 0.5482145020390884, + "learning_rate": 3.2472836821637744e-05, + "loss": 0.8175, + "step": 465 + }, + { + "epoch": 0.7456, + "grad_norm": 0.4676641774885784, + "learning_rate": 3.209137931341143e-05, + "loss": 0.682, + "step": 466 + }, + { + "epoch": 0.7472, + "grad_norm": 0.444258649331897, + "learning_rate": 3.1711746870594086e-05, + "loss": 0.7999, + "step": 467 + }, + { + "epoch": 0.7488, + "grad_norm": 0.5017393200629316, + "learning_rate": 3.1333949695926324e-05, + "loss": 0.7675, + "step": 468 + }, + { + "epoch": 0.7504, + "grad_norm": 0.4951559521909207, + "learning_rate": 3.0957997942825336e-05, + "loss": 0.7777, + "step": 469 + }, + { + "epoch": 0.752, + "grad_norm": 0.5125666277336168, + "learning_rate": 3.058390171511196e-05, + "loss": 0.7507, + "step": 470 + }, + { + "epoch": 0.7536, + "grad_norm": 0.7750273597653224, + "learning_rate": 3.021167106673928e-05, + "loss": 0.8524, + "step": 471 + }, + { + "epoch": 0.7552, + "grad_norm": 0.4861304027210477, + "learning_rate": 2.9841316001522347e-05, + "loss": 0.7753, + "step": 472 + }, + { + "epoch": 0.7568, + "grad_norm": 0.4889696238512044, + "learning_rate": 2.9472846472869298e-05, + "loss": 0.7927, + "step": 473 + }, + { + "epoch": 0.7584, + "grad_norm": 0.40222117675260893, + "learning_rate": 2.9106272383513835e-05, + "loss": 0.7612, + "step": 474 + }, + { + "epoch": 0.76, + "grad_norm": 0.5650093895753036, + "learning_rate": 2.874160358524931e-05, + "loss": 0.8037, + "step": 475 + }, + { + "epoch": 0.7616, + "grad_norm": 0.43315491250760535, + "learning_rate": 2.8378849878663628e-05, + "loss": 0.6886, + "step": 476 + }, + { + "epoch": 0.7632, + "grad_norm": 0.5462208159205092, + "learning_rate": 2.8018021012875994e-05, + "loss": 0.8101, + "step": 477 + }, + { + "epoch": 0.7648, + "grad_norm": 0.6026763729016966, + "learning_rate": 2.7659126685275027e-05, + "loss": 0.8718, + "step": 478 + }, + { + "epoch": 0.7664, + "grad_norm": 0.4046841435686921, + "learning_rate": 2.7302176541257986e-05, + "loss": 0.7625, + "step": 479 + }, + { + "epoch": 0.768, + "grad_norm": 0.4671850132560188, + "learning_rate": 2.6947180173971508e-05, + "loss": 0.6781, + "step": 480 + }, + { + "epoch": 0.7696, + "grad_norm": 0.6932221888293731, + "learning_rate": 2.659414712405398e-05, + "loss": 0.7224, + "step": 481 + }, + { + "epoch": 0.7712, + "grad_norm": 0.4637541683704292, + "learning_rate": 2.6243086879379e-05, + "loss": 0.8209, + "step": 482 + }, + { + "epoch": 0.7728, + "grad_norm": 0.558805849823971, + "learning_rate": 2.5894008874800325e-05, + "loss": 0.741, + "step": 483 + }, + { + "epoch": 0.7744, + "grad_norm": 0.4547010934608763, + "learning_rate": 2.5546922491898495e-05, + "loss": 0.7943, + "step": 484 + }, + { + "epoch": 0.776, + "grad_norm": 0.5749560405477635, + "learning_rate": 2.5201837058728505e-05, + "loss": 0.9117, + "step": 485 + }, + { + "epoch": 0.7776, + "grad_norm": 0.43805563386820356, + "learning_rate": 2.485876184956928e-05, + "loss": 0.7424, + "step": 486 + }, + { + "epoch": 0.7792, + "grad_norm": 0.534952948286577, + "learning_rate": 2.451770608467432e-05, + "loss": 0.8127, + "step": 487 + }, + { + "epoch": 0.7808, + "grad_norm": 0.6362228916834066, + "learning_rate": 2.417867893002387e-05, + "loss": 0.9701, + "step": 488 + }, + { + "epoch": 0.7824, + "grad_norm": 0.45720848528485347, + "learning_rate": 2.3841689497078746e-05, + "loss": 0.737, + "step": 489 + }, + { + "epoch": 0.784, + "grad_norm": 0.5789284241390055, + "learning_rate": 2.3506746842535242e-05, + "loss": 0.7442, + "step": 490 + }, + { + "epoch": 0.7856, + "grad_norm": 0.569155336267483, + "learning_rate": 2.3173859968081944e-05, + "loss": 0.7617, + "step": 491 + }, + { + "epoch": 0.7872, + "grad_norm": 0.443122709439156, + "learning_rate": 2.2843037820157675e-05, + "loss": 0.7245, + "step": 492 + }, + { + "epoch": 0.7888, + "grad_norm": 0.4574876463725705, + "learning_rate": 2.251428928971102e-05, + "loss": 0.7237, + "step": 493 + }, + { + "epoch": 0.7904, + "grad_norm": 0.45389616141071737, + "learning_rate": 2.2187623211961562e-05, + "loss": 0.7499, + "step": 494 + }, + { + "epoch": 0.792, + "grad_norm": 0.5023965935902118, + "learning_rate": 2.1863048366162208e-05, + "loss": 0.7202, + "step": 495 + }, + { + "epoch": 0.7936, + "grad_norm": 0.4217911819254415, + "learning_rate": 2.1540573475363402e-05, + "loss": 0.7833, + "step": 496 + }, + { + "epoch": 0.7952, + "grad_norm": 0.5839434139720753, + "learning_rate": 2.1220207206178688e-05, + "loss": 0.8005, + "step": 497 + }, + { + "epoch": 0.7968, + "grad_norm": 0.5969424275607362, + "learning_rate": 2.0901958168551638e-05, + "loss": 0.8378, + "step": 498 + }, + { + "epoch": 0.7984, + "grad_norm": 0.45194175004812703, + "learning_rate": 2.058583491552465e-05, + "loss": 0.6436, + "step": 499 + }, + { + "epoch": 0.8, + "grad_norm": 0.47053496931751265, + "learning_rate": 2.027184594300898e-05, + "loss": 0.7197, + "step": 500 + }, + { + "epoch": 0.8016, + "grad_norm": 0.6773280472521463, + "learning_rate": 1.995999968955641e-05, + "loss": 0.9276, + "step": 501 + }, + { + "epoch": 0.8032, + "grad_norm": 0.5530705397733027, + "learning_rate": 1.9650304536132426e-05, + "loss": 0.9194, + "step": 502 + }, + { + "epoch": 0.8048, + "grad_norm": 0.4372528048780638, + "learning_rate": 1.9342768805891178e-05, + "loss": 0.7198, + "step": 503 + }, + { + "epoch": 0.8064, + "grad_norm": 0.4732575173494202, + "learning_rate": 1.903740076395151e-05, + "loss": 0.75, + "step": 504 + }, + { + "epoch": 0.808, + "grad_norm": 0.4720051082167063, + "learning_rate": 1.8734208617174988e-05, + "loss": 0.7499, + "step": 505 + }, + { + "epoch": 0.8096, + "grad_norm": 0.5569199783528601, + "learning_rate": 1.8433200513945337e-05, + "loss": 0.845, + "step": 506 + }, + { + "epoch": 0.8112, + "grad_norm": 0.49977114616282897, + "learning_rate": 1.8134384543949478e-05, + "loss": 0.8465, + "step": 507 + }, + { + "epoch": 0.8128, + "grad_norm": 0.4653277907089936, + "learning_rate": 1.783776873795994e-05, + "loss": 0.8465, + "step": 508 + }, + { + "epoch": 0.8144, + "grad_norm": 0.7261918299101087, + "learning_rate": 1.754336106761927e-05, + "loss": 0.8393, + "step": 509 + }, + { + "epoch": 0.816, + "grad_norm": 0.4956959003699411, + "learning_rate": 1.7251169445225657e-05, + "loss": 0.7907, + "step": 510 + }, + { + "epoch": 0.8176, + "grad_norm": 0.3960118701005354, + "learning_rate": 1.696120172352025e-05, + "loss": 0.7214, + "step": 511 + }, + { + "epoch": 0.8192, + "grad_norm": 0.5451865636264599, + "learning_rate": 1.6673465695476232e-05, + "loss": 0.7114, + "step": 512 + }, + { + "epoch": 0.8208, + "grad_norm": 0.48667029153582564, + "learning_rate": 1.6387969094089316e-05, + "loss": 0.7863, + "step": 513 + }, + { + "epoch": 0.8224, + "grad_norm": 0.41232333715100755, + "learning_rate": 1.6104719592169902e-05, + "loss": 0.6882, + "step": 514 + }, + { + "epoch": 0.824, + "grad_norm": 0.5627067828043002, + "learning_rate": 1.5823724802136865e-05, + "loss": 0.8143, + "step": 515 + }, + { + "epoch": 0.8256, + "grad_norm": 0.46843238997424697, + "learning_rate": 1.5544992275813053e-05, + "loss": 0.723, + "step": 516 + }, + { + "epoch": 0.8272, + "grad_norm": 0.4972667220593963, + "learning_rate": 1.526852950422226e-05, + "loss": 0.7925, + "step": 517 + }, + { + "epoch": 0.8288, + "grad_norm": 0.4073474634815315, + "learning_rate": 1.4994343917387854e-05, + "loss": 0.6979, + "step": 518 + }, + { + "epoch": 0.8304, + "grad_norm": 0.6293974773116691, + "learning_rate": 1.4722442884133214e-05, + "loss": 0.9463, + "step": 519 + }, + { + "epoch": 0.832, + "grad_norm": 0.509981295029035, + "learning_rate": 1.4452833711883628e-05, + "loss": 0.8322, + "step": 520 + }, + { + "epoch": 0.8336, + "grad_norm": 0.5875459366532086, + "learning_rate": 1.4185523646469822e-05, + "loss": 0.9178, + "step": 521 + }, + { + "epoch": 0.8352, + "grad_norm": 0.4584914305425312, + "learning_rate": 1.3920519871933424e-05, + "loss": 0.696, + "step": 522 + }, + { + "epoch": 0.8368, + "grad_norm": 0.8617998536964497, + "learning_rate": 1.3657829510333654e-05, + "loss": 0.9026, + "step": 523 + }, + { + "epoch": 0.8384, + "grad_norm": 0.49806760390577076, + "learning_rate": 1.339745962155613e-05, + "loss": 0.6906, + "step": 524 + }, + { + "epoch": 0.84, + "grad_norm": 0.44225597493758884, + "learning_rate": 1.3139417203123027e-05, + "loss": 0.7245, + "step": 525 + }, + { + "epoch": 0.8416, + "grad_norm": 0.4401043803864672, + "learning_rate": 1.2883709190004955e-05, + "loss": 0.7212, + "step": 526 + }, + { + "epoch": 0.8432, + "grad_norm": 0.4323572558722791, + "learning_rate": 1.263034245443473e-05, + "loss": 0.7397, + "step": 527 + }, + { + "epoch": 0.8448, + "grad_norm": 0.43876814969221634, + "learning_rate": 1.2379323805722576e-05, + "loss": 0.7483, + "step": 528 + }, + { + "epoch": 0.8464, + "grad_norm": 0.3798739027513692, + "learning_rate": 1.2130659990073146e-05, + "loss": 0.6692, + "step": 529 + }, + { + "epoch": 0.848, + "grad_norm": 0.5945235758940287, + "learning_rate": 1.1884357690404158e-05, + "loss": 1.0929, + "step": 530 + }, + { + "epoch": 0.8496, + "grad_norm": 0.4825115362383566, + "learning_rate": 1.1640423526166988e-05, + "loss": 0.7331, + "step": 531 + }, + { + "epoch": 0.8512, + "grad_norm": 0.6299236378392342, + "learning_rate": 1.1398864053168534e-05, + "loss": 0.7812, + "step": 532 + }, + { + "epoch": 0.8528, + "grad_norm": 0.5666011324808448, + "learning_rate": 1.1159685763395111e-05, + "loss": 0.8569, + "step": 533 + }, + { + "epoch": 0.8544, + "grad_norm": 0.5553744921369806, + "learning_rate": 1.0922895084838037e-05, + "loss": 0.8124, + "step": 534 + }, + { + "epoch": 0.856, + "grad_norm": 0.510333686902676, + "learning_rate": 1.0688498381320855e-05, + "loss": 0.816, + "step": 535 + }, + { + "epoch": 0.8576, + "grad_norm": 0.49559900235242754, + "learning_rate": 1.045650195232819e-05, + "loss": 0.7089, + "step": 536 + }, + { + "epoch": 0.8592, + "grad_norm": 0.46933367880068116, + "learning_rate": 1.0226912032836611e-05, + "loss": 0.7866, + "step": 537 + }, + { + "epoch": 0.8608, + "grad_norm": 0.3778769997968228, + "learning_rate": 9.999734793146998e-06, + "loss": 0.7081, + "step": 538 + }, + { + "epoch": 0.8624, + "grad_norm": 0.6113594137978235, + "learning_rate": 9.774976338718677e-06, + "loss": 0.8307, + "step": 539 + }, + { + "epoch": 0.864, + "grad_norm": 0.9287695702050881, + "learning_rate": 9.552642710005299e-06, + "loss": 0.7441, + "step": 540 + }, + { + "epoch": 0.8656, + "grad_norm": 0.5382515041385985, + "learning_rate": 9.332739882292752e-06, + "loss": 0.7369, + "step": 541 + }, + { + "epoch": 0.8672, + "grad_norm": 0.4159002164402231, + "learning_rate": 9.115273765538202e-06, + "loss": 0.7241, + "step": 542 + }, + { + "epoch": 0.8688, + "grad_norm": 0.6291581385668585, + "learning_rate": 8.900250204211514e-06, + "loss": 0.7429, + "step": 543 + }, + { + "epoch": 0.8704, + "grad_norm": 0.4753877904232731, + "learning_rate": 8.687674977138116e-06, + "loss": 0.7765, + "step": 544 + }, + { + "epoch": 0.872, + "grad_norm": 0.5872610973791114, + "learning_rate": 8.47755379734373e-06, + "loss": 0.7532, + "step": 545 + }, + { + "epoch": 0.8736, + "grad_norm": 0.4498892421917224, + "learning_rate": 8.269892311900696e-06, + "loss": 0.7024, + "step": 546 + }, + { + "epoch": 0.8752, + "grad_norm": 0.8123422803137543, + "learning_rate": 8.064696101776358e-06, + "loss": 0.779, + "step": 547 + }, + { + "epoch": 0.8768, + "grad_norm": 0.41929481717214745, + "learning_rate": 7.861970681683051e-06, + "loss": 0.7372, + "step": 548 + }, + { + "epoch": 0.8784, + "grad_norm": 0.5222859963183167, + "learning_rate": 7.661721499929753e-06, + "loss": 0.8549, + "step": 549 + }, + { + "epoch": 0.88, + "grad_norm": 0.6670441830142942, + "learning_rate": 7.463953938275858e-06, + "loss": 0.8975, + "step": 550 + }, + { + "epoch": 0.8816, + "grad_norm": 0.43915978436950875, + "learning_rate": 7.2686733117863784e-06, + "loss": 0.7522, + "step": 551 + }, + { + "epoch": 0.8832, + "grad_norm": 0.4456511443247101, + "learning_rate": 7.07588486868922e-06, + "loss": 0.6921, + "step": 552 + }, + { + "epoch": 0.8848, + "grad_norm": 0.4615977748085632, + "learning_rate": 6.8855937902340576e-06, + "loss": 0.7786, + "step": 553 + }, + { + "epoch": 0.8864, + "grad_norm": 0.38678529681334994, + "learning_rate": 6.6978051905530855e-06, + "loss": 0.7327, + "step": 554 + }, + { + "epoch": 0.888, + "grad_norm": 0.5884446912909952, + "learning_rate": 6.512524116523633e-06, + "loss": 0.8667, + "step": 555 + }, + { + "epoch": 0.8896, + "grad_norm": 0.42588880798128614, + "learning_rate": 6.329755547632499e-06, + "loss": 0.7183, + "step": 556 + }, + { + "epoch": 0.8912, + "grad_norm": 0.4973555744472622, + "learning_rate": 6.149504395842087e-06, + "loss": 0.759, + "step": 557 + }, + { + "epoch": 0.8928, + "grad_norm": 0.44126780350963773, + "learning_rate": 5.971775505458444e-06, + "loss": 0.839, + "step": 558 + }, + { + "epoch": 0.8944, + "grad_norm": 0.5496050304014765, + "learning_rate": 5.7965736530010916e-06, + "loss": 0.7896, + "step": 559 + }, + { + "epoch": 0.896, + "grad_norm": 0.4314543635541692, + "learning_rate": 5.623903547074549e-06, + "loss": 0.7548, + "step": 560 + }, + { + "epoch": 0.8976, + "grad_norm": 0.4939857171123341, + "learning_rate": 5.453769828241872e-06, + "loss": 0.6889, + "step": 561 + }, + { + "epoch": 0.8992, + "grad_norm": 0.6803546939689971, + "learning_rate": 5.286177068899989e-06, + "loss": 0.8241, + "step": 562 + }, + { + "epoch": 0.9008, + "grad_norm": 0.4302650402092962, + "learning_rate": 5.121129773156663e-06, + "loss": 0.7462, + "step": 563 + }, + { + "epoch": 0.9024, + "grad_norm": 0.43983569426056496, + "learning_rate": 4.95863237670956e-06, + "loss": 0.7119, + "step": 564 + }, + { + "epoch": 0.904, + "grad_norm": 0.5166886782701035, + "learning_rate": 4.798689246727006e-06, + "loss": 0.7505, + "step": 565 + }, + { + "epoch": 0.9056, + "grad_norm": 0.7389775548358781, + "learning_rate": 4.641304681730641e-06, + "loss": 0.9254, + "step": 566 + }, + { + "epoch": 0.9072, + "grad_norm": 0.6000213315619103, + "learning_rate": 4.486482911479839e-06, + "loss": 0.8382, + "step": 567 + }, + { + "epoch": 0.9088, + "grad_norm": 0.4246772837815841, + "learning_rate": 4.3342280968580285e-06, + "loss": 0.7941, + "step": 568 + }, + { + "epoch": 0.9104, + "grad_norm": 0.4927044853629139, + "learning_rate": 4.184544329761009e-06, + "loss": 0.7817, + "step": 569 + }, + { + "epoch": 0.912, + "grad_norm": 0.4437912304260506, + "learning_rate": 4.037435632986786e-06, + "loss": 0.7978, + "step": 570 + }, + { + "epoch": 0.9136, + "grad_norm": 0.41022730357927784, + "learning_rate": 3.892905960127546e-06, + "loss": 0.7691, + "step": 571 + }, + { + "epoch": 0.9152, + "grad_norm": 0.5113800945371163, + "learning_rate": 3.750959195463466e-06, + "loss": 0.8154, + "step": 572 + }, + { + "epoch": 0.9168, + "grad_norm": 0.37807366451747665, + "learning_rate": 3.611599153858214e-06, + "loss": 0.6815, + "step": 573 + }, + { + "epoch": 0.9184, + "grad_norm": 0.6703509003684044, + "learning_rate": 3.4748295806564356e-06, + "loss": 0.7944, + "step": 574 + }, + { + "epoch": 0.92, + "grad_norm": 0.49467576514381967, + "learning_rate": 3.3406541515832003e-06, + "loss": 0.7308, + "step": 575 + }, + { + "epoch": 0.9216, + "grad_norm": 0.545773796154925, + "learning_rate": 3.209076472645112e-06, + "loss": 0.7476, + "step": 576 + }, + { + "epoch": 0.9232, + "grad_norm": 0.46733782288285114, + "learning_rate": 3.0801000800333877e-06, + "loss": 0.8258, + "step": 577 + }, + { + "epoch": 0.9248, + "grad_norm": 0.43459441201655236, + "learning_rate": 2.9537284400289355e-06, + "loss": 0.7427, + "step": 578 + }, + { + "epoch": 0.9264, + "grad_norm": 0.42114086561672265, + "learning_rate": 2.8299649489090475e-06, + "loss": 0.7156, + "step": 579 + }, + { + "epoch": 0.928, + "grad_norm": 0.4923794610529232, + "learning_rate": 2.708812932856253e-06, + "loss": 0.7756, + "step": 580 + }, + { + "epoch": 0.9296, + "grad_norm": 0.4677723291090093, + "learning_rate": 2.590275647868867e-06, + "loss": 0.7597, + "step": 581 + }, + { + "epoch": 0.9312, + "grad_norm": 0.5635557595289095, + "learning_rate": 2.4743562796734622e-06, + "loss": 0.8143, + "step": 582 + }, + { + "epoch": 0.9328, + "grad_norm": 0.45298397318059835, + "learning_rate": 2.3610579436393e-06, + "loss": 0.7112, + "step": 583 + }, + { + "epoch": 0.9344, + "grad_norm": 0.5286896161816468, + "learning_rate": 2.250383684694579e-06, + "loss": 0.8504, + "step": 584 + }, + { + "epoch": 0.936, + "grad_norm": 0.4430491475058911, + "learning_rate": 2.1423364772445887e-06, + "loss": 0.7598, + "step": 585 + }, + { + "epoch": 0.9376, + "grad_norm": 0.4769554151753007, + "learning_rate": 2.036919225091827e-06, + "loss": 0.8451, + "step": 586 + }, + { + "epoch": 0.9392, + "grad_norm": 0.41206085619808797, + "learning_rate": 1.9341347613579087e-06, + "loss": 0.7857, + "step": 587 + }, + { + "epoch": 0.9408, + "grad_norm": 0.4534131127881824, + "learning_rate": 1.8339858484073935e-06, + "loss": 0.7216, + "step": 588 + }, + { + "epoch": 0.9424, + "grad_norm": 0.5007808999973069, + "learning_rate": 1.7364751777736332e-06, + "loss": 0.7748, + "step": 589 + }, + { + "epoch": 0.944, + "grad_norm": 0.7024634335260679, + "learning_rate": 1.6416053700863964e-06, + "loss": 0.7737, + "step": 590 + }, + { + "epoch": 0.9456, + "grad_norm": 0.5955987116221747, + "learning_rate": 1.5493789750014031e-06, + "loss": 0.8271, + "step": 591 + }, + { + "epoch": 0.9472, + "grad_norm": 0.46472634520405853, + "learning_rate": 1.459798471131868e-06, + "loss": 0.7521, + "step": 592 + }, + { + "epoch": 0.9488, + "grad_norm": 0.5521989073266615, + "learning_rate": 1.3728662659818204e-06, + "loss": 0.8539, + "step": 593 + }, + { + "epoch": 0.9504, + "grad_norm": 0.38375503604814787, + "learning_rate": 1.2885846958814673e-06, + "loss": 0.6662, + "step": 594 + }, + { + "epoch": 0.952, + "grad_norm": 0.5227163985214974, + "learning_rate": 1.2069560259243328e-06, + "loss": 0.8451, + "step": 595 + }, + { + "epoch": 0.9536, + "grad_norm": 0.4462907521764808, + "learning_rate": 1.1279824499064396e-06, + "loss": 0.7404, + "step": 596 + }, + { + "epoch": 0.9552, + "grad_norm": 0.5570230980140358, + "learning_rate": 1.0516660902673448e-06, + "loss": 0.6947, + "step": 597 + }, + { + "epoch": 0.9568, + "grad_norm": 0.595493579967642, + "learning_rate": 9.780089980330642e-07, + "loss": 0.9008, + "step": 598 + }, + { + "epoch": 0.9584, + "grad_norm": 0.46527357099134, + "learning_rate": 9.070131527609604e-07, + "loss": 0.803, + "step": 599 + }, + { + "epoch": 0.96, + "grad_norm": 0.5069357035703678, + "learning_rate": 8.386804624865851e-07, + "loss": 0.8218, + "step": 600 + }, + { + "epoch": 0.9616, + "grad_norm": 0.49345661890932047, + "learning_rate": 7.730127636723539e-07, + "loss": 0.7617, + "step": 601 + }, + { + "epoch": 0.9632, + "grad_norm": 0.5005185587015021, + "learning_rate": 7.100118211581852e-07, + "loss": 0.868, + "step": 602 + }, + { + "epoch": 0.9648, + "grad_norm": 0.5767711807104086, + "learning_rate": 6.496793281141056e-07, + "loss": 0.764, + "step": 603 + }, + { + "epoch": 0.9664, + "grad_norm": 0.4629486587330603, + "learning_rate": 5.920169059947411e-07, + "loss": 0.7673, + "step": 604 + }, + { + "epoch": 0.968, + "grad_norm": 0.4937838492946445, + "learning_rate": 5.370261044956971e-07, + "loss": 0.7787, + "step": 605 + }, + { + "epoch": 0.9696, + "grad_norm": 0.45757088915353605, + "learning_rate": 4.847084015119574e-07, + "loss": 0.7826, + "step": 606 + }, + { + "epoch": 0.9712, + "grad_norm": 0.5804890411250316, + "learning_rate": 4.3506520309813947e-07, + "loss": 0.8319, + "step": 607 + }, + { + "epoch": 0.9728, + "grad_norm": 0.505528222478651, + "learning_rate": 3.8809784343072366e-07, + "loss": 0.6136, + "step": 608 + }, + { + "epoch": 0.9744, + "grad_norm": 0.5384893329418087, + "learning_rate": 3.4380758477219333e-07, + "loss": 0.8173, + "step": 609 + }, + { + "epoch": 0.976, + "grad_norm": 0.5356010978715702, + "learning_rate": 3.0219561743707326e-07, + "loss": 0.8947, + "step": 610 + }, + { + "epoch": 0.9776, + "grad_norm": 0.5494221178715695, + "learning_rate": 2.6326305976001055e-07, + "loss": 0.8192, + "step": 611 + }, + { + "epoch": 0.9792, + "grad_norm": 0.39234423655748984, + "learning_rate": 2.2701095806565432e-07, + "loss": 0.6581, + "step": 612 + }, + { + "epoch": 0.9808, + "grad_norm": 0.41294804437037486, + "learning_rate": 1.9344028664056713e-07, + "loss": 0.6828, + "step": 613 + }, + { + "epoch": 0.9824, + "grad_norm": 0.43510811411794426, + "learning_rate": 1.6255194770704586e-07, + "loss": 0.7168, + "step": 614 + }, + { + "epoch": 0.984, + "grad_norm": 0.4369476079696727, + "learning_rate": 1.3434677139885222e-07, + "loss": 0.7076, + "step": 615 + }, + { + "epoch": 0.9856, + "grad_norm": 0.4669451243115884, + "learning_rate": 1.0882551573891953e-07, + "loss": 0.8218, + "step": 616 + }, + { + "epoch": 0.9872, + "grad_norm": 0.48882962319266837, + "learning_rate": 8.598886661895788e-08, + "loss": 0.811, + "step": 617 + }, + { + "epoch": 0.9888, + "grad_norm": 0.49329878789885573, + "learning_rate": 6.583743778106887e-08, + "loss": 0.7949, + "step": 618 + }, + { + "epoch": 0.9904, + "grad_norm": 0.7210438768172956, + "learning_rate": 4.837177080119215e-08, + "loss": 0.8171, + "step": 619 + }, + { + "epoch": 0.992, + "grad_norm": 0.4174504571366966, + "learning_rate": 3.359233507459481e-08, + "loss": 0.6792, + "step": 620 + }, + { + "epoch": 0.9936, + "grad_norm": 0.4766108218070693, + "learning_rate": 2.1499527803214846e-08, + "loss": 0.7452, + "step": 621 + }, + { + "epoch": 0.9952, + "grad_norm": 0.569957272402524, + "learning_rate": 1.209367398504746e-08, + "loss": 0.8384, + "step": 622 + }, + { + "epoch": 0.9968, + "grad_norm": 0.4438365781454997, + "learning_rate": 5.375026405352035e-09, + "loss": 0.7255, + "step": 623 + }, + { + "epoch": 0.9984, + "grad_norm": 0.46433683282643373, + "learning_rate": 1.3437656298687097e-09, + "loss": 0.7341, + "step": 624 + }, + { + "epoch": 1.0, + "grad_norm": 0.5478266890770399, + "learning_rate": 0.0, + "loss": 0.8425, + "step": 625 + }, + { + "epoch": 1.0, + "step": 625, + "total_flos": 503858269749248.0, + "train_loss": 0.8357635734558105, + "train_runtime": 9226.2551, + "train_samples_per_second": 1.084, + "train_steps_per_second": 0.068 + } + ], + "logging_steps": 1.0, + "max_steps": 625, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 503858269749248.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_1_epochs_1_GA_4_lora/README.md b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_1_epochs_1_GA_4_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_1_epochs_1_GA_4_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_1_epochs_1_GA_4_lora/adapter_config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_1_epochs_1_GA_4_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1cfe11e900767f08bfbb608c972a3e25ffe8016c --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_1_epochs_1_GA_4_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "up_proj", + "q_proj", + "k_proj", + "o_proj", + "v_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_1_epochs_1_GA_4_lora/adapter_model.safetensors b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_1_epochs_1_GA_4_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e1cd9633da34311c727be24d824c74e843e44e1b --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_1_epochs_1_GA_4_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:509f4fe725368c830f31a07e9dfa637e7409475215bca031f12ab92ad28fc1d5 +size 671150064 diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_1_epochs_1_GA_4_lora/config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_1_epochs_1_GA_4_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_1_epochs_1_GA_4_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_1_epochs_1_GA_4_lora/non_lora_trainables.bin b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_1_epochs_1_GA_4_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..0a847979c5e08ea1c20375bbacfd924f19b9a600 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_1_epochs_1_GA_4_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1cfa623b92512d18b2427e22625319dbb311b37891dcfc42a982b687758f3ac6 +size 918507402 diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_1_epochs_1_GA_4_lora/trainer_state.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_1_epochs_1_GA_4_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..737baaad191e95a5021aa1d7fee8d0288f129c00 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_1_epochs_1_GA_4_lora/trainer_state.json @@ -0,0 +1,2226 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9984, + "eval_steps": 500, + "global_step": 312, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0032, + "grad_norm": 0.8230906866528206, + "learning_rate": 2e-05, + "loss": 1.3718, + "step": 1 + }, + { + "epoch": 0.0064, + "grad_norm": 0.8011482349224612, + "learning_rate": 4e-05, + "loss": 1.3884, + "step": 2 + }, + { + "epoch": 0.0096, + "grad_norm": 0.6777465654408708, + "learning_rate": 6e-05, + "loss": 1.2942, + "step": 3 + }, + { + "epoch": 0.0128, + "grad_norm": 0.6985842469244755, + "learning_rate": 8e-05, + "loss": 1.3011, + "step": 4 + }, + { + "epoch": 0.016, + "grad_norm": 0.6358810285065617, + "learning_rate": 0.0001, + "loss": 1.2236, + "step": 5 + }, + { + "epoch": 0.0192, + "grad_norm": 0.8043615850550121, + "learning_rate": 0.00012, + "loss": 1.1222, + "step": 6 + }, + { + "epoch": 0.0224, + "grad_norm": 0.8766371190581345, + "learning_rate": 0.00014, + "loss": 1.0481, + "step": 7 + }, + { + "epoch": 0.0256, + "grad_norm": 0.5896469063055397, + "learning_rate": 0.00016, + "loss": 1.0115, + "step": 8 + }, + { + "epoch": 0.0288, + "grad_norm": 0.5140042859696677, + "learning_rate": 0.00018, + "loss": 1.0032, + "step": 9 + }, + { + "epoch": 0.032, + "grad_norm": 0.47047377131530216, + "learning_rate": 0.0002, + "loss": 0.9718, + "step": 10 + }, + { + "epoch": 0.0352, + "grad_norm": 0.4689357680369753, + "learning_rate": 0.00019999458931878073, + "loss": 1.0164, + "step": 11 + }, + { + "epoch": 0.0384, + "grad_norm": 0.49055471890156765, + "learning_rate": 0.0001999783578606323, + "loss": 1.0478, + "step": 12 + }, + { + "epoch": 0.0416, + "grad_norm": 0.48360570699467204, + "learning_rate": 0.00019995130738201966, + "loss": 1.0179, + "step": 13 + }, + { + "epoch": 0.0448, + "grad_norm": 0.4385882347361944, + "learning_rate": 0.0001999134408101731, + "loss": 0.8839, + "step": 14 + }, + { + "epoch": 0.048, + "grad_norm": 0.5842432990089418, + "learning_rate": 0.00019986476224277165, + "loss": 1.0378, + "step": 15 + }, + { + "epoch": 0.0512, + "grad_norm": 0.4798987625762095, + "learning_rate": 0.00019980527694749952, + "loss": 0.9014, + "step": 16 + }, + { + "epoch": 0.0544, + "grad_norm": 0.43385099220728984, + "learning_rate": 0.00019973499136147606, + "loss": 0.9908, + "step": 17 + }, + { + "epoch": 0.0576, + "grad_norm": 0.4363177117395925, + "learning_rate": 0.0001996539130905593, + "loss": 0.9302, + "step": 18 + }, + { + "epoch": 0.0608, + "grad_norm": 0.4184869797333112, + "learning_rate": 0.0001995620509085228, + "loss": 0.9156, + "step": 19 + }, + { + "epoch": 0.064, + "grad_norm": 0.4093043649081553, + "learning_rate": 0.00019945941475610623, + "loss": 0.9582, + "step": 20 + }, + { + "epoch": 0.0672, + "grad_norm": 0.40265123100906836, + "learning_rate": 0.0001993460157399396, + "loss": 0.8491, + "step": 21 + }, + { + "epoch": 0.0704, + "grad_norm": 0.40866258779980846, + "learning_rate": 0.0001992218661313415, + "loss": 0.9515, + "step": 22 + }, + { + "epoch": 0.0736, + "grad_norm": 0.47714968174592676, + "learning_rate": 0.00019908697936499103, + "loss": 0.983, + "step": 23 + }, + { + "epoch": 0.0768, + "grad_norm": 0.3725347901708648, + "learning_rate": 0.00019894137003747403, + "loss": 0.9136, + "step": 24 + }, + { + "epoch": 0.08, + "grad_norm": 0.48768461879424546, + "learning_rate": 0.00019878505390570362, + "loss": 0.9486, + "step": 25 + }, + { + "epoch": 0.0832, + "grad_norm": 0.46702127014841616, + "learning_rate": 0.00019861804788521493, + "loss": 0.9692, + "step": 26 + }, + { + "epoch": 0.0864, + "grad_norm": 0.39034215128065525, + "learning_rate": 0.00019844037004833473, + "loss": 0.8706, + "step": 27 + }, + { + "epoch": 0.0896, + "grad_norm": 0.4730253707882492, + "learning_rate": 0.00019825203962222572, + "loss": 0.9078, + "step": 28 + }, + { + "epoch": 0.0928, + "grad_norm": 0.3566887338551931, + "learning_rate": 0.0001980530769868059, + "loss": 0.8921, + "step": 29 + }, + { + "epoch": 0.096, + "grad_norm": 0.37413026403570376, + "learning_rate": 0.00019784350367254322, + "loss": 0.8723, + "step": 30 + }, + { + "epoch": 0.0992, + "grad_norm": 0.4494091415776092, + "learning_rate": 0.0001976233423581255, + "loss": 0.9454, + "step": 31 + }, + { + "epoch": 0.1024, + "grad_norm": 0.4380782092404729, + "learning_rate": 0.0001973926168680066, + "loss": 0.9703, + "step": 32 + }, + { + "epoch": 0.1056, + "grad_norm": 0.4024207233737277, + "learning_rate": 0.00019715135216982798, + "loss": 0.8925, + "step": 33 + }, + { + "epoch": 0.1088, + "grad_norm": 0.40406197533093235, + "learning_rate": 0.0001968995743717171, + "loss": 0.9361, + "step": 34 + }, + { + "epoch": 0.112, + "grad_norm": 0.3618341982376051, + "learning_rate": 0.00019663731071946206, + "loss": 0.8614, + "step": 35 + }, + { + "epoch": 0.1152, + "grad_norm": 0.3658614124884964, + "learning_rate": 0.00019636458959356316, + "loss": 0.8498, + "step": 36 + }, + { + "epoch": 0.1184, + "grad_norm": 0.5573838774300858, + "learning_rate": 0.0001960814405061619, + "loss": 0.8736, + "step": 37 + }, + { + "epoch": 0.1216, + "grad_norm": 0.38557296694847226, + "learning_rate": 0.00019578789409784727, + "loss": 0.8929, + "step": 38 + }, + { + "epoch": 0.1248, + "grad_norm": 0.40079642763917983, + "learning_rate": 0.00019548398213434007, + "loss": 0.8423, + "step": 39 + }, + { + "epoch": 0.128, + "grad_norm": 0.38961763082023404, + "learning_rate": 0.00019516973750305532, + "loss": 0.8835, + "step": 40 + }, + { + "epoch": 0.1312, + "grad_norm": 0.4099214309122936, + "learning_rate": 0.00019484519420954354, + "loss": 0.9405, + "step": 41 + }, + { + "epoch": 0.1344, + "grad_norm": 0.39275472621957447, + "learning_rate": 0.00019451038737381077, + "loss": 0.8549, + "step": 42 + }, + { + "epoch": 0.1376, + "grad_norm": 0.37890491998796727, + "learning_rate": 0.00019416535322651818, + "loss": 0.9153, + "step": 43 + }, + { + "epoch": 0.1408, + "grad_norm": 0.3677543896244243, + "learning_rate": 0.00019381012910506146, + "loss": 0.8453, + "step": 44 + }, + { + "epoch": 0.144, + "grad_norm": 0.3798879355149729, + "learning_rate": 0.00019344475344953012, + "loss": 0.8136, + "step": 45 + }, + { + "epoch": 0.1472, + "grad_norm": 0.39541256509109524, + "learning_rate": 0.00019306926579854821, + "loss": 0.8917, + "step": 46 + }, + { + "epoch": 0.1504, + "grad_norm": 0.3933918528877009, + "learning_rate": 0.00019268370678499533, + "loss": 0.8155, + "step": 47 + }, + { + "epoch": 0.1536, + "grad_norm": 0.40698599023324944, + "learning_rate": 0.0001922881181316097, + "loss": 0.8666, + "step": 48 + }, + { + "epoch": 0.1568, + "grad_norm": 0.40042005946240466, + "learning_rate": 0.00019188254264647337, + "loss": 0.8412, + "step": 49 + }, + { + "epoch": 0.16, + "grad_norm": 0.4607168760067279, + "learning_rate": 0.0001914670242183795, + "loss": 0.8868, + "step": 50 + }, + { + "epoch": 0.1632, + "grad_norm": 0.39972125720735685, + "learning_rate": 0.0001910416078120832, + "loss": 0.8918, + "step": 51 + }, + { + "epoch": 0.1664, + "grad_norm": 0.4188280311405362, + "learning_rate": 0.0001906063394634356, + "loss": 0.902, + "step": 52 + }, + { + "epoch": 0.1696, + "grad_norm": 0.42940697765165986, + "learning_rate": 0.00019016126627440237, + "loss": 0.8641, + "step": 53 + }, + { + "epoch": 0.1728, + "grad_norm": 0.3545329739920464, + "learning_rate": 0.00018970643640796642, + "loss": 0.8735, + "step": 54 + }, + { + "epoch": 0.176, + "grad_norm": 0.39694907744821073, + "learning_rate": 0.000189241899082916, + "loss": 0.7712, + "step": 55 + }, + { + "epoch": 0.1792, + "grad_norm": 0.3873462097880066, + "learning_rate": 0.00018876770456851877, + "loss": 0.8613, + "step": 56 + }, + { + "epoch": 0.1824, + "grad_norm": 0.39944251069469355, + "learning_rate": 0.0001882839041790818, + "loss": 0.8342, + "step": 57 + }, + { + "epoch": 0.1856, + "grad_norm": 0.38873644333253415, + "learning_rate": 0.00018779055026839868, + "loss": 0.8602, + "step": 58 + }, + { + "epoch": 0.1888, + "grad_norm": 0.41970967893409644, + "learning_rate": 0.00018728769622408423, + "loss": 0.8577, + "step": 59 + }, + { + "epoch": 0.192, + "grad_norm": 0.3660391396843375, + "learning_rate": 0.00018677539646179707, + "loss": 0.8452, + "step": 60 + }, + { + "epoch": 0.1952, + "grad_norm": 0.4567680983316884, + "learning_rate": 0.00018625370641935129, + "loss": 0.9641, + "step": 61 + }, + { + "epoch": 0.1984, + "grad_norm": 0.4347858356915908, + "learning_rate": 0.00018572268255071718, + "loss": 0.9423, + "step": 62 + }, + { + "epoch": 0.2016, + "grad_norm": 0.3337070071383876, + "learning_rate": 0.00018518238231991218, + "loss": 0.8076, + "step": 63 + }, + { + "epoch": 0.2048, + "grad_norm": 0.4286830947909843, + "learning_rate": 0.00018463286419478255, + "loss": 0.9527, + "step": 64 + }, + { + "epoch": 0.208, + "grad_norm": 0.35295999710631726, + "learning_rate": 0.00018407418764067627, + "loss": 0.8146, + "step": 65 + }, + { + "epoch": 0.2112, + "grad_norm": 0.40069838158976223, + "learning_rate": 0.00018350641311400812, + "loss": 0.9182, + "step": 66 + }, + { + "epoch": 0.2144, + "grad_norm": 0.428778831044889, + "learning_rate": 0.0001829296020557174, + "loss": 0.9332, + "step": 67 + }, + { + "epoch": 0.2176, + "grad_norm": 0.35155177053946185, + "learning_rate": 0.00018234381688461942, + "loss": 0.8472, + "step": 68 + }, + { + "epoch": 0.2208, + "grad_norm": 0.3393523853315711, + "learning_rate": 0.0001817491209906506, + "loss": 0.8116, + "step": 69 + }, + { + "epoch": 0.224, + "grad_norm": 0.39467587805954535, + "learning_rate": 0.00018114557872800905, + "loss": 0.8882, + "step": 70 + }, + { + "epoch": 0.2272, + "grad_norm": 0.45738467437649594, + "learning_rate": 0.00018053325540819045, + "loss": 0.9899, + "step": 71 + }, + { + "epoch": 0.2304, + "grad_norm": 0.40860575562909096, + "learning_rate": 0.0001799122172929206, + "loss": 0.8518, + "step": 72 + }, + { + "epoch": 0.2336, + "grad_norm": 0.37358375585659853, + "learning_rate": 0.00017928253158698473, + "loss": 0.8671, + "step": 73 + }, + { + "epoch": 0.2368, + "grad_norm": 0.3614438681293151, + "learning_rate": 0.0001786442664309554, + "loss": 0.7581, + "step": 74 + }, + { + "epoch": 0.24, + "grad_norm": 0.39185167616797406, + "learning_rate": 0.0001779974908938184, + "loss": 0.8273, + "step": 75 + }, + { + "epoch": 0.2432, + "grad_norm": 0.3725033489438709, + "learning_rate": 0.0001773422749654988, + "loss": 0.7593, + "step": 76 + }, + { + "epoch": 0.2464, + "grad_norm": 0.3879109255323267, + "learning_rate": 0.00017667868954928694, + "loss": 0.8456, + "step": 77 + }, + { + "epoch": 0.2496, + "grad_norm": 0.3449533879291914, + "learning_rate": 0.00017600680645416583, + "loss": 0.8246, + "step": 78 + }, + { + "epoch": 0.2528, + "grad_norm": 0.3557304717641571, + "learning_rate": 0.00017532669838704035, + "loss": 0.8002, + "step": 79 + }, + { + "epoch": 0.256, + "grad_norm": 0.39624711741320795, + "learning_rate": 0.00017463843894486937, + "loss": 0.845, + "step": 80 + }, + { + "epoch": 0.2592, + "grad_norm": 0.4040277144139735, + "learning_rate": 0.0001739421026067017, + "loss": 0.9255, + "step": 81 + }, + { + "epoch": 0.2624, + "grad_norm": 0.3573834496881195, + "learning_rate": 0.00017323776472561627, + "loss": 0.8664, + "step": 82 + }, + { + "epoch": 0.2656, + "grad_norm": 0.39861349237455607, + "learning_rate": 0.00017252550152056795, + "loss": 0.869, + "step": 83 + }, + { + "epoch": 0.2688, + "grad_norm": 0.4122338593577007, + "learning_rate": 0.0001718053900681397, + "loss": 0.8542, + "step": 84 + }, + { + "epoch": 0.272, + "grad_norm": 0.38711562232847824, + "learning_rate": 0.00017107750829420176, + "loss": 0.8265, + "step": 85 + }, + { + "epoch": 0.2752, + "grad_norm": 0.39947214892388133, + "learning_rate": 0.00017034193496547902, + "loss": 0.9227, + "step": 86 + }, + { + "epoch": 0.2784, + "grad_norm": 0.36818193645695974, + "learning_rate": 0.00016959874968102735, + "loss": 0.8025, + "step": 87 + }, + { + "epoch": 0.2816, + "grad_norm": 0.3563867965150087, + "learning_rate": 0.00016884803286362, + "loss": 0.799, + "step": 88 + }, + { + "epoch": 0.2848, + "grad_norm": 0.3863926930438656, + "learning_rate": 0.00016808986575104465, + "loss": 0.9039, + "step": 89 + }, + { + "epoch": 0.288, + "grad_norm": 0.35592980299837496, + "learning_rate": 0.00016732433038731242, + "loss": 0.8311, + "step": 90 + }, + { + "epoch": 0.2912, + "grad_norm": 0.4204313739806602, + "learning_rate": 0.0001665515096137797, + "loss": 0.9226, + "step": 91 + }, + { + "epoch": 0.2944, + "grad_norm": 0.3723535694546385, + "learning_rate": 0.00016577148706018328, + "loss": 0.8311, + "step": 92 + }, + { + "epoch": 0.2976, + "grad_norm": 0.41234782362203953, + "learning_rate": 0.00016498434713559088, + "loss": 0.8979, + "step": 93 + }, + { + "epoch": 0.3008, + "grad_norm": 0.4053425895286585, + "learning_rate": 0.00016419017501926656, + "loss": 0.8344, + "step": 94 + }, + { + "epoch": 0.304, + "grad_norm": 0.36505412209597476, + "learning_rate": 0.0001633890566514535, + "loss": 0.8345, + "step": 95 + }, + { + "epoch": 0.3072, + "grad_norm": 0.382611792068928, + "learning_rate": 0.00016258107872407375, + "loss": 0.8749, + "step": 96 + }, + { + "epoch": 0.3104, + "grad_norm": 0.37741695374713863, + "learning_rate": 0.0001617663286713474, + "loss": 0.8493, + "step": 97 + }, + { + "epoch": 0.3136, + "grad_norm": 0.4676602590760305, + "learning_rate": 0.00016094489466033043, + "loss": 0.8432, + "step": 98 + }, + { + "epoch": 0.3168, + "grad_norm": 0.34327944895308454, + "learning_rate": 0.00016011686558137448, + "loss": 0.7974, + "step": 99 + }, + { + "epoch": 0.32, + "grad_norm": 0.3556557830313786, + "learning_rate": 0.0001592823310385073, + "loss": 0.8677, + "step": 100 + }, + { + "epoch": 0.3232, + "grad_norm": 0.4186166790777886, + "learning_rate": 0.0001584413813397364, + "loss": 0.8586, + "step": 101 + }, + { + "epoch": 0.3264, + "grad_norm": 0.37112136732584244, + "learning_rate": 0.00015759410748727662, + "loss": 0.8209, + "step": 102 + }, + { + "epoch": 0.3296, + "grad_norm": 0.38959686649650355, + "learning_rate": 0.00015674060116770236, + "loss": 0.8077, + "step": 103 + }, + { + "epoch": 0.3328, + "grad_norm": 0.36221559570247797, + "learning_rate": 0.00015588095474202595, + "loss": 0.8142, + "step": 104 + }, + { + "epoch": 0.336, + "grad_norm": 0.3408395021986205, + "learning_rate": 0.00015501526123570277, + "loss": 0.825, + "step": 105 + }, + { + "epoch": 0.3392, + "grad_norm": 0.46256890237184806, + "learning_rate": 0.00015414361432856475, + "loss": 0.9299, + "step": 106 + }, + { + "epoch": 0.3424, + "grad_norm": 0.4279687120096611, + "learning_rate": 0.0001532661083446829, + "loss": 0.9313, + "step": 107 + }, + { + "epoch": 0.3456, + "grad_norm": 0.499080817408835, + "learning_rate": 0.00015238283824216015, + "loss": 0.9257, + "step": 108 + }, + { + "epoch": 0.3488, + "grad_norm": 0.3677888976110564, + "learning_rate": 0.00015149389960285558, + "loss": 0.8234, + "step": 109 + }, + { + "epoch": 0.352, + "grad_norm": 0.40300933145036566, + "learning_rate": 0.00015059938862204127, + "loss": 0.8661, + "step": 110 + }, + { + "epoch": 0.3552, + "grad_norm": 0.35272019193019816, + "learning_rate": 0.00014969940209799248, + "loss": 0.7685, + "step": 111 + }, + { + "epoch": 0.3584, + "grad_norm": 0.42072850665932426, + "learning_rate": 0.00014879403742151283, + "loss": 0.838, + "step": 112 + }, + { + "epoch": 0.3616, + "grad_norm": 0.3843947444096128, + "learning_rate": 0.00014788339256539544, + "loss": 0.7295, + "step": 113 + }, + { + "epoch": 0.3648, + "grad_norm": 0.43191105712976785, + "learning_rate": 0.0001469675660738206, + "loss": 0.8535, + "step": 114 + }, + { + "epoch": 0.368, + "grad_norm": 0.3584146254614676, + "learning_rate": 0.00014604665705169237, + "loss": 0.8303, + "step": 115 + }, + { + "epoch": 0.3712, + "grad_norm": 0.4020587727121328, + "learning_rate": 0.00014512076515391375, + "loss": 0.8827, + "step": 116 + }, + { + "epoch": 0.3744, + "grad_norm": 0.42244806005338637, + "learning_rate": 0.00014418999057460276, + "loss": 0.8637, + "step": 117 + }, + { + "epoch": 0.3776, + "grad_norm": 0.3490560388254176, + "learning_rate": 0.0001432544340362501, + "loss": 0.7685, + "step": 118 + }, + { + "epoch": 0.3808, + "grad_norm": 0.3610932978019899, + "learning_rate": 0.00014231419677881966, + "loss": 0.7689, + "step": 119 + }, + { + "epoch": 0.384, + "grad_norm": 0.35696651622274633, + "learning_rate": 0.00014136938054879283, + "loss": 0.8692, + "step": 120 + }, + { + "epoch": 0.3872, + "grad_norm": 0.37982273980822234, + "learning_rate": 0.00014042008758815818, + "loss": 0.8633, + "step": 121 + }, + { + "epoch": 0.3904, + "grad_norm": 0.49338348781038344, + "learning_rate": 0.00013946642062334766, + "loss": 0.7882, + "step": 122 + }, + { + "epoch": 0.3936, + "grad_norm": 0.41407314429068576, + "learning_rate": 0.00013850848285411994, + "loss": 0.8352, + "step": 123 + }, + { + "epoch": 0.3968, + "grad_norm": 0.35863668034543733, + "learning_rate": 0.000137546377942393, + "loss": 0.824, + "step": 124 + }, + { + "epoch": 0.4, + "grad_norm": 0.34638170296412163, + "learning_rate": 0.00013658021000102636, + "loss": 0.7332, + "step": 125 + }, + { + "epoch": 0.4032, + "grad_norm": 0.33192770455483456, + "learning_rate": 0.00013561008358255468, + "loss": 0.8012, + "step": 126 + }, + { + "epoch": 0.4064, + "grad_norm": 0.4018840350246986, + "learning_rate": 0.00013463610366787392, + "loss": 0.8274, + "step": 127 + }, + { + "epoch": 0.4096, + "grad_norm": 0.46935625670519754, + "learning_rate": 0.00013365837565488064, + "loss": 0.9137, + "step": 128 + }, + { + "epoch": 0.4128, + "grad_norm": 0.3467457829185042, + "learning_rate": 0.0001326770053470668, + "loss": 0.8057, + "step": 129 + }, + { + "epoch": 0.416, + "grad_norm": 0.3713608562339026, + "learning_rate": 0.0001316920989420703, + "loss": 0.778, + "step": 130 + }, + { + "epoch": 0.4192, + "grad_norm": 0.2953063312947437, + "learning_rate": 0.00013070376302018287, + "loss": 0.7741, + "step": 131 + }, + { + "epoch": 0.4224, + "grad_norm": 0.315778283756295, + "learning_rate": 0.00012971210453281674, + "loss": 0.7968, + "step": 132 + }, + { + "epoch": 0.4256, + "grad_norm": 0.3777293731399955, + "learning_rate": 0.000128717230790931, + "loss": 0.8476, + "step": 133 + }, + { + "epoch": 0.4288, + "grad_norm": 0.36565481085206214, + "learning_rate": 0.00012771924945341906, + "loss": 0.7424, + "step": 134 + }, + { + "epoch": 0.432, + "grad_norm": 0.40097648303660766, + "learning_rate": 0.00012671826851545851, + "loss": 0.8961, + "step": 135 + }, + { + "epoch": 0.4352, + "grad_norm": 0.4335469508570514, + "learning_rate": 0.0001257143962968246, + "loss": 0.9337, + "step": 136 + }, + { + "epoch": 0.4384, + "grad_norm": 0.3571253000150674, + "learning_rate": 0.00012470774143016853, + "loss": 0.7772, + "step": 137 + }, + { + "epoch": 0.4416, + "grad_norm": 0.4175995621863514, + "learning_rate": 0.00012369841284926188, + "loss": 0.7899, + "step": 138 + }, + { + "epoch": 0.4448, + "grad_norm": 0.3555584844534414, + "learning_rate": 0.00012268651977720866, + "loss": 0.8058, + "step": 139 + }, + { + "epoch": 0.448, + "grad_norm": 0.4478506698284743, + "learning_rate": 0.00012167217171462566, + "loss": 0.8602, + "step": 140 + }, + { + "epoch": 0.4512, + "grad_norm": 0.3924154628163152, + "learning_rate": 0.0001206554784277931, + "loss": 0.8566, + "step": 141 + }, + { + "epoch": 0.4544, + "grad_norm": 0.3413680221828951, + "learning_rate": 0.00011963654993677645, + "loss": 0.8442, + "step": 142 + }, + { + "epoch": 0.4576, + "grad_norm": 0.37597687671024504, + "learning_rate": 0.00011861549650352069, + "loss": 0.7896, + "step": 143 + }, + { + "epoch": 0.4608, + "grad_norm": 0.35508126757199976, + "learning_rate": 0.00011759242861991855, + "loss": 0.7702, + "step": 144 + }, + { + "epoch": 0.464, + "grad_norm": 0.35676098581829857, + "learning_rate": 0.00011656745699585371, + "loss": 0.7463, + "step": 145 + }, + { + "epoch": 0.4672, + "grad_norm": 0.34262057715324995, + "learning_rate": 0.00011554069254722051, + "loss": 0.8023, + "step": 146 + }, + { + "epoch": 0.4704, + "grad_norm": 0.3160775424152765, + "learning_rate": 0.00011451224638392129, + "loss": 0.753, + "step": 147 + }, + { + "epoch": 0.4736, + "grad_norm": 0.41408430747051145, + "learning_rate": 0.00011348222979784289, + "loss": 0.8722, + "step": 148 + }, + { + "epoch": 0.4768, + "grad_norm": 0.3807991104363706, + "learning_rate": 0.00011245075425081328, + "loss": 0.7902, + "step": 149 + }, + { + "epoch": 0.48, + "grad_norm": 0.3444267731692557, + "learning_rate": 0.00011141793136253986, + "loss": 0.7729, + "step": 150 + }, + { + "epoch": 0.4832, + "grad_norm": 0.3198788705719864, + "learning_rate": 0.0001103838728985307, + "loss": 0.7863, + "step": 151 + }, + { + "epoch": 0.4864, + "grad_norm": 0.41645947104870057, + "learning_rate": 0.000109348690758, + "loss": 0.8442, + "step": 152 + }, + { + "epoch": 0.4896, + "grad_norm": 0.3473642274949473, + "learning_rate": 0.00010831249696175918, + "loss": 0.7755, + "step": 153 + }, + { + "epoch": 0.4928, + "grad_norm": 0.351233087536269, + "learning_rate": 0.0001072754036400944, + "loss": 0.7799, + "step": 154 + }, + { + "epoch": 0.496, + "grad_norm": 0.39062438800982935, + "learning_rate": 0.00010623752302063283, + "loss": 0.8262, + "step": 155 + }, + { + "epoch": 0.4992, + "grad_norm": 0.4661216475962772, + "learning_rate": 0.00010519896741619803, + "loss": 0.8754, + "step": 156 + }, + { + "epoch": 0.5024, + "grad_norm": 0.3680543907349618, + "learning_rate": 0.00010415984921265609, + "loss": 0.8511, + "step": 157 + }, + { + "epoch": 0.5056, + "grad_norm": 0.3554554750391017, + "learning_rate": 0.00010312028085675391, + "loss": 0.7905, + "step": 158 + }, + { + "epoch": 0.5088, + "grad_norm": 0.34003331830378203, + "learning_rate": 0.00010208037484395114, + "loss": 0.8518, + "step": 159 + }, + { + "epoch": 0.512, + "grad_norm": 0.35691497758794416, + "learning_rate": 0.00010104024370624644, + "loss": 0.8042, + "step": 160 + }, + { + "epoch": 0.5152, + "grad_norm": 0.3680532317532297, + "learning_rate": 0.0001, + "loss": 0.826, + "step": 161 + }, + { + "epoch": 0.5184, + "grad_norm": 0.4306874440922751, + "learning_rate": 9.895975629375359e-05, + "loss": 0.8114, + "step": 162 + }, + { + "epoch": 0.5216, + "grad_norm": 0.3927827556296759, + "learning_rate": 9.791962515604887e-05, + "loss": 0.8477, + "step": 163 + }, + { + "epoch": 0.5248, + "grad_norm": 0.3688233147013582, + "learning_rate": 9.687971914324607e-05, + "loss": 0.787, + "step": 164 + }, + { + "epoch": 0.528, + "grad_norm": 0.3460381638988584, + "learning_rate": 9.584015078734395e-05, + "loss": 0.7861, + "step": 165 + }, + { + "epoch": 0.5312, + "grad_norm": 0.3293873896538617, + "learning_rate": 9.480103258380198e-05, + "loss": 0.7599, + "step": 166 + }, + { + "epoch": 0.5344, + "grad_norm": 0.33067678398241424, + "learning_rate": 9.376247697936719e-05, + "loss": 0.7788, + "step": 167 + }, + { + "epoch": 0.5376, + "grad_norm": 0.30272346196435956, + "learning_rate": 9.272459635990562e-05, + "loss": 0.7594, + "step": 168 + }, + { + "epoch": 0.5408, + "grad_norm": 0.35694173144440655, + "learning_rate": 9.168750303824084e-05, + "loss": 0.8066, + "step": 169 + }, + { + "epoch": 0.544, + "grad_norm": 0.36394962483417836, + "learning_rate": 9.065130924199998e-05, + "loss": 0.6855, + "step": 170 + }, + { + "epoch": 0.5472, + "grad_norm": 0.4052142574936814, + "learning_rate": 8.961612710146934e-05, + "loss": 0.8238, + "step": 171 + }, + { + "epoch": 0.5504, + "grad_norm": 0.4466361193230218, + "learning_rate": 8.858206863746018e-05, + "loss": 0.7949, + "step": 172 + }, + { + "epoch": 0.5536, + "grad_norm": 0.3691146825930911, + "learning_rate": 8.754924574918675e-05, + "loss": 0.7591, + "step": 173 + }, + { + "epoch": 0.5568, + "grad_norm": 0.3884324905017403, + "learning_rate": 8.651777020215712e-05, + "loss": 0.7671, + "step": 174 + }, + { + "epoch": 0.56, + "grad_norm": 0.37235345511558177, + "learning_rate": 8.548775361607872e-05, + "loss": 0.7954, + "step": 175 + }, + { + "epoch": 0.5632, + "grad_norm": 0.43342144747289674, + "learning_rate": 8.445930745277953e-05, + "loss": 0.7674, + "step": 176 + }, + { + "epoch": 0.5664, + "grad_norm": 0.3601056959148259, + "learning_rate": 8.343254300414628e-05, + "loss": 0.6814, + "step": 177 + }, + { + "epoch": 0.5696, + "grad_norm": 0.34765672643846884, + "learning_rate": 8.240757138008149e-05, + "loss": 0.8115, + "step": 178 + }, + { + "epoch": 0.5728, + "grad_norm": 0.3542918688839561, + "learning_rate": 8.138450349647936e-05, + "loss": 0.7751, + "step": 179 + }, + { + "epoch": 0.576, + "grad_norm": 0.43405703341062163, + "learning_rate": 8.036345006322359e-05, + "loss": 0.883, + "step": 180 + }, + { + "epoch": 0.5792, + "grad_norm": 0.35990538609349737, + "learning_rate": 7.934452157220694e-05, + "loss": 0.8274, + "step": 181 + }, + { + "epoch": 0.5824, + "grad_norm": 0.36566849513888866, + "learning_rate": 7.832782828537437e-05, + "loss": 0.8316, + "step": 182 + }, + { + "epoch": 0.5856, + "grad_norm": 0.3880712586476513, + "learning_rate": 7.731348022279134e-05, + "loss": 0.7823, + "step": 183 + }, + { + "epoch": 0.5888, + "grad_norm": 0.40070345592700185, + "learning_rate": 7.630158715073813e-05, + "loss": 0.8682, + "step": 184 + }, + { + "epoch": 0.592, + "grad_norm": 0.452126500075801, + "learning_rate": 7.52922585698315e-05, + "loss": 0.7349, + "step": 185 + }, + { + "epoch": 0.5952, + "grad_norm": 0.39133686156881714, + "learning_rate": 7.428560370317542e-05, + "loss": 0.8166, + "step": 186 + }, + { + "epoch": 0.5984, + "grad_norm": 0.30433046880932474, + "learning_rate": 7.328173148454151e-05, + "loss": 0.7799, + "step": 187 + }, + { + "epoch": 0.6016, + "grad_norm": 0.38912324970709505, + "learning_rate": 7.228075054658096e-05, + "loss": 0.874, + "step": 188 + }, + { + "epoch": 0.6048, + "grad_norm": 0.34571755403615634, + "learning_rate": 7.1282769209069e-05, + "loss": 0.7972, + "step": 189 + }, + { + "epoch": 0.608, + "grad_norm": 0.3759979893898139, + "learning_rate": 7.028789546718326e-05, + "loss": 0.7436, + "step": 190 + }, + { + "epoch": 0.6112, + "grad_norm": 0.3396623833976361, + "learning_rate": 6.929623697981718e-05, + "loss": 0.771, + "step": 191 + }, + { + "epoch": 0.6144, + "grad_norm": 0.3930329153264238, + "learning_rate": 6.830790105792973e-05, + "loss": 0.7294, + "step": 192 + }, + { + "epoch": 0.6176, + "grad_norm": 0.384939784498545, + "learning_rate": 6.732299465293322e-05, + "loss": 0.8072, + "step": 193 + }, + { + "epoch": 0.6208, + "grad_norm": 0.3670712491930503, + "learning_rate": 6.63416243451194e-05, + "loss": 0.8139, + "step": 194 + }, + { + "epoch": 0.624, + "grad_norm": 0.37471193485584176, + "learning_rate": 6.536389633212609e-05, + "loss": 0.8026, + "step": 195 + }, + { + "epoch": 0.6272, + "grad_norm": 0.3466481941938704, + "learning_rate": 6.43899164174453e-05, + "loss": 0.7856, + "step": 196 + }, + { + "epoch": 0.6304, + "grad_norm": 0.334681278560714, + "learning_rate": 6.341978999897365e-05, + "loss": 0.7457, + "step": 197 + }, + { + "epoch": 0.6336, + "grad_norm": 0.3580934295092547, + "learning_rate": 6.245362205760704e-05, + "loss": 0.8062, + "step": 198 + }, + { + "epoch": 0.6368, + "grad_norm": 0.41942104569534644, + "learning_rate": 6.149151714588009e-05, + "loss": 0.8307, + "step": 199 + }, + { + "epoch": 0.64, + "grad_norm": 0.3044902604615677, + "learning_rate": 6.053357937665237e-05, + "loss": 0.6707, + "step": 200 + }, + { + "epoch": 0.6432, + "grad_norm": 0.37144444775815594, + "learning_rate": 5.957991241184184e-05, + "loss": 0.7813, + "step": 201 + }, + { + "epoch": 0.6464, + "grad_norm": 0.34248491706444256, + "learning_rate": 5.863061945120719e-05, + "loss": 0.8051, + "step": 202 + }, + { + "epoch": 0.6496, + "grad_norm": 0.34748171044370163, + "learning_rate": 5.768580322118034e-05, + "loss": 0.7627, + "step": 203 + }, + { + "epoch": 0.6528, + "grad_norm": 0.38512453359767645, + "learning_rate": 5.6745565963749925e-05, + "loss": 0.8318, + "step": 204 + }, + { + "epoch": 0.656, + "grad_norm": 0.36707414698679175, + "learning_rate": 5.5810009425397294e-05, + "loss": 0.8305, + "step": 205 + }, + { + "epoch": 0.6592, + "grad_norm": 0.48217984342912773, + "learning_rate": 5.487923484608629e-05, + "loss": 0.9014, + "step": 206 + }, + { + "epoch": 0.6624, + "grad_norm": 0.38863801931906744, + "learning_rate": 5.395334294830765e-05, + "loss": 0.8525, + "step": 207 + }, + { + "epoch": 0.6656, + "grad_norm": 0.35518873432323794, + "learning_rate": 5.3032433926179395e-05, + "loss": 0.7778, + "step": 208 + }, + { + "epoch": 0.6688, + "grad_norm": 0.34123260040379993, + "learning_rate": 5.211660743460458e-05, + "loss": 0.775, + "step": 209 + }, + { + "epoch": 0.672, + "grad_norm": 0.4001757666899788, + "learning_rate": 5.1205962578487155e-05, + "loss": 0.7731, + "step": 210 + }, + { + "epoch": 0.6752, + "grad_norm": 0.3315496723721597, + "learning_rate": 5.030059790200756e-05, + "loss": 0.7922, + "step": 211 + }, + { + "epoch": 0.6784, + "grad_norm": 0.3258037685568246, + "learning_rate": 4.940061137795876e-05, + "loss": 0.699, + "step": 212 + }, + { + "epoch": 0.6816, + "grad_norm": 0.3884518590202934, + "learning_rate": 4.850610039714444e-05, + "loss": 0.7866, + "step": 213 + }, + { + "epoch": 0.6848, + "grad_norm": 0.7104627436824442, + "learning_rate": 4.761716175783989e-05, + "loss": 0.8089, + "step": 214 + }, + { + "epoch": 0.688, + "grad_norm": 0.3013941102836104, + "learning_rate": 4.673389165531714e-05, + "loss": 0.7054, + "step": 215 + }, + { + "epoch": 0.6912, + "grad_norm": 0.3731169801442719, + "learning_rate": 4.585638567143529e-05, + "loss": 0.7737, + "step": 216 + }, + { + "epoch": 0.6944, + "grad_norm": 0.30263267234838137, + "learning_rate": 4.498473876429726e-05, + "loss": 0.7029, + "step": 217 + }, + { + "epoch": 0.6976, + "grad_norm": 0.3659751206161679, + "learning_rate": 4.411904525797408e-05, + "loss": 0.8146, + "step": 218 + }, + { + "epoch": 0.7008, + "grad_norm": 0.36421909907458466, + "learning_rate": 4.325939883229766e-05, + "loss": 0.7769, + "step": 219 + }, + { + "epoch": 0.704, + "grad_norm": 0.3403495561782968, + "learning_rate": 4.240589251272342e-05, + "loss": 0.8074, + "step": 220 + }, + { + "epoch": 0.7072, + "grad_norm": 0.4116298314841219, + "learning_rate": 4.155861866026364e-05, + "loss": 0.7561, + "step": 221 + }, + { + "epoch": 0.7104, + "grad_norm": 0.4017877316418641, + "learning_rate": 4.071766896149273e-05, + "loss": 0.7802, + "step": 222 + }, + { + "epoch": 0.7136, + "grad_norm": 0.3933048903616485, + "learning_rate": 3.988313441862553e-05, + "loss": 0.8302, + "step": 223 + }, + { + "epoch": 0.7168, + "grad_norm": 0.4053820034136759, + "learning_rate": 3.9055105339669595e-05, + "loss": 0.7537, + "step": 224 + }, + { + "epoch": 0.72, + "grad_norm": 0.3422570001344906, + "learning_rate": 3.823367132865265e-05, + "loss": 0.7513, + "step": 225 + }, + { + "epoch": 0.7232, + "grad_norm": 0.3672321719467038, + "learning_rate": 3.741892127592625e-05, + "loss": 0.7552, + "step": 226 + }, + { + "epoch": 0.7264, + "grad_norm": 0.38655052693729053, + "learning_rate": 3.6610943348546526e-05, + "loss": 0.8076, + "step": 227 + }, + { + "epoch": 0.7296, + "grad_norm": 0.32046006517959386, + "learning_rate": 3.580982498073344e-05, + "loss": 0.7549, + "step": 228 + }, + { + "epoch": 0.7328, + "grad_norm": 0.45524335147940526, + "learning_rate": 3.501565286440914e-05, + "loss": 0.8619, + "step": 229 + }, + { + "epoch": 0.736, + "grad_norm": 0.29405543478538154, + "learning_rate": 3.422851293981676e-05, + "loss": 0.7585, + "step": 230 + }, + { + "epoch": 0.7392, + "grad_norm": 0.38285886702055727, + "learning_rate": 3.3448490386220355e-05, + "loss": 0.8357, + "step": 231 + }, + { + "epoch": 0.7424, + "grad_norm": 0.3801889952884557, + "learning_rate": 3.2675669612687565e-05, + "loss": 0.8901, + "step": 232 + }, + { + "epoch": 0.7456, + "grad_norm": 0.35279011141194516, + "learning_rate": 3.191013424895536e-05, + "loss": 0.7481, + "step": 233 + }, + { + "epoch": 0.7488, + "grad_norm": 0.3471907142593491, + "learning_rate": 3.115196713638e-05, + "loss": 0.7855, + "step": 234 + }, + { + "epoch": 0.752, + "grad_norm": 0.36619187308979667, + "learning_rate": 3.040125031897264e-05, + "loss": 0.7625, + "step": 235 + }, + { + "epoch": 0.7552, + "grad_norm": 0.3762428412156853, + "learning_rate": 2.9658065034520978e-05, + "loss": 0.8157, + "step": 236 + }, + { + "epoch": 0.7584, + "grad_norm": 0.3171161168873978, + "learning_rate": 2.892249170579826e-05, + "loss": 0.7799, + "step": 237 + }, + { + "epoch": 0.7616, + "grad_norm": 0.3592730455514312, + "learning_rate": 2.8194609931860316e-05, + "loss": 0.7478, + "step": 238 + }, + { + "epoch": 0.7648, + "grad_norm": 0.4134931519699558, + "learning_rate": 2.7474498479432087e-05, + "loss": 0.8437, + "step": 239 + }, + { + "epoch": 0.768, + "grad_norm": 0.30546020641842614, + "learning_rate": 2.6762235274383772e-05, + "loss": 0.7224, + "step": 240 + }, + { + "epoch": 0.7712, + "grad_norm": 0.3454408457555957, + "learning_rate": 2.6057897393298324e-05, + "loss": 0.7727, + "step": 241 + }, + { + "epoch": 0.7744, + "grad_norm": 0.3680677506255253, + "learning_rate": 2.536156105513062e-05, + "loss": 0.7745, + "step": 242 + }, + { + "epoch": 0.7776, + "grad_norm": 0.36274461235683025, + "learning_rate": 2.4673301612959654e-05, + "loss": 0.8305, + "step": 243 + }, + { + "epoch": 0.7808, + "grad_norm": 0.4459754472676995, + "learning_rate": 2.399319354583418e-05, + "loss": 0.8917, + "step": 244 + }, + { + "epoch": 0.784, + "grad_norm": 0.34453176080889125, + "learning_rate": 2.3321310450713062e-05, + "loss": 0.7476, + "step": 245 + }, + { + "epoch": 0.7872, + "grad_norm": 0.31196539201987367, + "learning_rate": 2.265772503450122e-05, + "loss": 0.7468, + "step": 246 + }, + { + "epoch": 0.7904, + "grad_norm": 0.3184608885787424, + "learning_rate": 2.2002509106181624e-05, + "loss": 0.7402, + "step": 247 + }, + { + "epoch": 0.7936, + "grad_norm": 0.36443373379957883, + "learning_rate": 2.1355733569044635e-05, + "loss": 0.7567, + "step": 248 + }, + { + "epoch": 0.7968, + "grad_norm": 0.4417924711660864, + "learning_rate": 2.0717468413015283e-05, + "loss": 0.823, + "step": 249 + }, + { + "epoch": 0.8, + "grad_norm": 0.3236759284366048, + "learning_rate": 2.008778270707944e-05, + "loss": 0.6885, + "step": 250 + }, + { + "epoch": 0.8032, + "grad_norm": 0.39242767874833534, + "learning_rate": 1.946674459180955e-05, + "loss": 0.9275, + "step": 251 + }, + { + "epoch": 0.8064, + "grad_norm": 0.3427482665469463, + "learning_rate": 1.8854421271990964e-05, + "loss": 0.7372, + "step": 252 + }, + { + "epoch": 0.8096, + "grad_norm": 0.3626863241157415, + "learning_rate": 1.8250879009349398e-05, + "loss": 0.8017, + "step": 253 + }, + { + "epoch": 0.8128, + "grad_norm": 0.34605782828987924, + "learning_rate": 1.7656183115380577e-05, + "loss": 0.8484, + "step": 254 + }, + { + "epoch": 0.816, + "grad_norm": 0.4383327443511539, + "learning_rate": 1.707039794428259e-05, + "loss": 0.8186, + "step": 255 + }, + { + "epoch": 0.8192, + "grad_norm": 0.3447222581311919, + "learning_rate": 1.649358688599191e-05, + "loss": 0.7213, + "step": 256 + }, + { + "epoch": 0.8224, + "grad_norm": 0.31970536995245263, + "learning_rate": 1.5925812359323745e-05, + "loss": 0.7434, + "step": 257 + }, + { + "epoch": 0.8256, + "grad_norm": 0.3656450810361097, + "learning_rate": 1.5367135805217458e-05, + "loss": 0.7676, + "step": 258 + }, + { + "epoch": 0.8288, + "grad_norm": 0.36298269369872477, + "learning_rate": 1.4817617680087825e-05, + "loss": 0.7487, + "step": 259 + }, + { + "epoch": 0.832, + "grad_norm": 0.42152653173895593, + "learning_rate": 1.4277317449282834e-05, + "loss": 0.8926, + "step": 260 + }, + { + "epoch": 0.8352, + "grad_norm": 0.38946242385235036, + "learning_rate": 1.3746293580648717e-05, + "loss": 0.8176, + "step": 261 + }, + { + "epoch": 0.8384, + "grad_norm": 0.41804903127540954, + "learning_rate": 1.3224603538202929e-05, + "loss": 0.7996, + "step": 262 + }, + { + "epoch": 0.8416, + "grad_norm": 0.31093486947817817, + "learning_rate": 1.2712303775915802e-05, + "loss": 0.7247, + "step": 263 + }, + { + "epoch": 0.8448, + "grad_norm": 0.3187140163567531, + "learning_rate": 1.220944973160133e-05, + "loss": 0.7473, + "step": 264 + }, + { + "epoch": 0.848, + "grad_norm": 0.3872254324451249, + "learning_rate": 1.1716095820918216e-05, + "loss": 0.8821, + "step": 265 + }, + { + "epoch": 0.8512, + "grad_norm": 0.4051984951107359, + "learning_rate": 1.1232295431481222e-05, + "loss": 0.7603, + "step": 266 + }, + { + "epoch": 0.8544, + "grad_norm": 0.40122800893302063, + "learning_rate": 1.0758100917083991e-05, + "loss": 0.8414, + "step": 267 + }, + { + "epoch": 0.8576, + "grad_norm": 0.35538652933955217, + "learning_rate": 1.0293563592033595e-05, + "loss": 0.7676, + "step": 268 + }, + { + "epoch": 0.8608, + "grad_norm": 0.3137491139738254, + "learning_rate": 9.838733725597615e-06, + "loss": 0.7501, + "step": 269 + }, + { + "epoch": 0.864, + "grad_norm": 0.45963025353581044, + "learning_rate": 9.393660536564408e-06, + "loss": 0.7936, + "step": 270 + }, + { + "epoch": 0.8672, + "grad_norm": 0.3348526950988372, + "learning_rate": 8.958392187916841e-06, + "loss": 0.736, + "step": 271 + }, + { + "epoch": 0.8704, + "grad_norm": 0.39896905950270944, + "learning_rate": 8.532975781620512e-06, + "loss": 0.7654, + "step": 272 + }, + { + "epoch": 0.8736, + "grad_norm": 0.37722430967890513, + "learning_rate": 8.117457353526625e-06, + "loss": 0.7327, + "step": 273 + }, + { + "epoch": 0.8768, + "grad_norm": 0.31478006761399024, + "learning_rate": 7.711881868390291e-06, + "loss": 0.7615, + "step": 274 + }, + { + "epoch": 0.88, + "grad_norm": 0.4188263703262344, + "learning_rate": 7.3162932150046885e-06, + "loss": 0.8783, + "step": 275 + }, + { + "epoch": 0.8832, + "grad_norm": 0.31942616334359136, + "learning_rate": 6.930734201451816e-06, + "loss": 0.725, + "step": 276 + }, + { + "epoch": 0.8864, + "grad_norm": 0.2997679274265762, + "learning_rate": 6.555246550469907e-06, + "loss": 0.7615, + "step": 277 + }, + { + "epoch": 0.8896, + "grad_norm": 0.36551094625988517, + "learning_rate": 6.189870894938587e-06, + "loss": 0.7969, + "step": 278 + }, + { + "epoch": 0.8928, + "grad_norm": 0.34379292762124014, + "learning_rate": 5.834646773481811e-06, + "loss": 0.8058, + "step": 279 + }, + { + "epoch": 0.896, + "grad_norm": 0.35790166821440883, + "learning_rate": 5.489612626189245e-06, + "loss": 0.7782, + "step": 280 + }, + { + "epoch": 0.8992, + "grad_norm": 0.33879760718838586, + "learning_rate": 5.154805790456485e-06, + "loss": 0.7636, + "step": 281 + }, + { + "epoch": 0.9024, + "grad_norm": 0.31407783216446544, + "learning_rate": 4.830262496944693e-06, + "loss": 0.7323, + "step": 282 + }, + { + "epoch": 0.9056, + "grad_norm": 0.38757334898188933, + "learning_rate": 4.516017865659949e-06, + "loss": 0.8443, + "step": 283 + }, + { + "epoch": 0.9088, + "grad_norm": 0.39258218690616686, + "learning_rate": 4.21210590215273e-06, + "loss": 0.8223, + "step": 284 + }, + { + "epoch": 0.912, + "grad_norm": 0.3313155165113673, + "learning_rate": 3.918559493838114e-06, + "loss": 0.7951, + "step": 285 + }, + { + "epoch": 0.9152, + "grad_norm": 0.32605323503196637, + "learning_rate": 3.6354104064368566e-06, + "loss": 0.799, + "step": 286 + }, + { + "epoch": 0.9184, + "grad_norm": 0.393570523991794, + "learning_rate": 3.3626892805379562e-06, + "loss": 0.741, + "step": 287 + }, + { + "epoch": 0.9216, + "grad_norm": 0.37378520088314604, + "learning_rate": 3.100425628282899e-06, + "loss": 0.7429, + "step": 288 + }, + { + "epoch": 0.9248, + "grad_norm": 0.3179898573134195, + "learning_rate": 2.848647830172024e-06, + "loss": 0.7852, + "step": 289 + }, + { + "epoch": 0.928, + "grad_norm": 0.33638562625723384, + "learning_rate": 2.607383131993424e-06, + "loss": 0.7518, + "step": 290 + }, + { + "epoch": 0.9312, + "grad_norm": 0.36906155605346674, + "learning_rate": 2.3766576418745022e-06, + "loss": 0.7948, + "step": 291 + }, + { + "epoch": 0.9344, + "grad_norm": 0.5865294985065448, + "learning_rate": 2.1564963274568027e-06, + "loss": 0.7865, + "step": 292 + }, + { + "epoch": 0.9376, + "grad_norm": 0.3357869205531904, + "learning_rate": 1.9469230131940907e-06, + "loss": 0.8067, + "step": 293 + }, + { + "epoch": 0.9408, + "grad_norm": 0.3214550435563662, + "learning_rate": 1.7479603777742938e-06, + "loss": 0.7578, + "step": 294 + }, + { + "epoch": 0.944, + "grad_norm": 0.3617697253780003, + "learning_rate": 1.559629951665298e-06, + "loss": 0.7783, + "step": 295 + }, + { + "epoch": 0.9472, + "grad_norm": 0.3851342510588961, + "learning_rate": 1.3819521147851123e-06, + "loss": 0.7932, + "step": 296 + }, + { + "epoch": 0.9504, + "grad_norm": 0.34426702777854773, + "learning_rate": 1.2149460942964098e-06, + "loss": 0.7609, + "step": 297 + }, + { + "epoch": 0.9536, + "grad_norm": 0.36253585088408663, + "learning_rate": 1.05862996252597e-06, + "loss": 0.7984, + "step": 298 + }, + { + "epoch": 0.9568, + "grad_norm": 0.4054570274935403, + "learning_rate": 9.130206350089765e-07, + "loss": 0.8, + "step": 299 + }, + { + "epoch": 0.96, + "grad_norm": 0.34938837916613863, + "learning_rate": 7.781338686584927e-07, + "loss": 0.8201, + "step": 300 + }, + { + "epoch": 0.9632, + "grad_norm": 0.37777115892854124, + "learning_rate": 6.539842600603918e-07, + "loss": 0.8213, + "step": 301 + }, + { + "epoch": 0.9664, + "grad_norm": 0.37766797195751295, + "learning_rate": 5.405852438937764e-07, + "loss": 0.7736, + "step": 302 + }, + { + "epoch": 0.9696, + "grad_norm": 0.34012917606966536, + "learning_rate": 4.3794909147720773e-07, + "loss": 0.7852, + "step": 303 + }, + { + "epoch": 0.9728, + "grad_norm": 0.39195816403535333, + "learning_rate": 3.4608690944071263e-07, + "loss": 0.7284, + "step": 304 + }, + { + "epoch": 0.976, + "grad_norm": 0.38813789832520107, + "learning_rate": 2.6500863852395584e-07, + "loss": 0.8645, + "step": 305 + }, + { + "epoch": 0.9792, + "grad_norm": 0.32915922567617034, + "learning_rate": 1.947230525005006e-07, + "loss": 0.7437, + "step": 306 + }, + { + "epoch": 0.9824, + "grad_norm": 0.3351156902277233, + "learning_rate": 1.3523775722834587e-07, + "loss": 0.7069, + "step": 307 + }, + { + "epoch": 0.9856, + "grad_norm": 0.32461016981926744, + "learning_rate": 8.655918982689581e-08, + "loss": 0.7698, + "step": 308 + }, + { + "epoch": 0.9888, + "grad_norm": 0.34243368327755425, + "learning_rate": 4.8692617980350406e-08, + "loss": 0.8092, + "step": 309 + }, + { + "epoch": 0.992, + "grad_norm": 0.35449923757603197, + "learning_rate": 2.164213936770576e-08, + "loss": 0.7537, + "step": 310 + }, + { + "epoch": 0.9952, + "grad_norm": 0.37595905935200546, + "learning_rate": 5.410681219286673e-09, + "loss": 0.7974, + "step": 311 + }, + { + "epoch": 0.9984, + "grad_norm": 0.3296041419119035, + "learning_rate": 0.0, + "loss": 0.7362, + "step": 312 + }, + { + "epoch": 0.9984, + "step": 312, + "total_flos": 730661894488064.0, + "train_loss": 0.8330854674180349, + "train_runtime": 9213.2243, + "train_samples_per_second": 1.085, + "train_steps_per_second": 0.034 + } + ], + "logging_steps": 1.0, + "max_steps": 312, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 730661894488064.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_2_epochs_1_GA_2_lora/README.md b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_2_epochs_1_GA_2_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_2_epochs_1_GA_2_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_2_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_2_epochs_1_GA_2_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..08e200f2a07bbf95ef9c06ac9018bcef73182e0b --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_2_epochs_1_GA_2_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "up_proj", + "o_proj", + "q_proj", + "down_proj", + "v_proj", + "k_proj", + "gate_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..20cd56d587ecd1f3fca7e27f515774a06db21e79 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb223c3820e4d3ba11a4de32d2e8233430f8ee290e3539d12aefaa3e3db8d9e5 +size 671150064 diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_2_epochs_1_GA_2_lora/config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_2_epochs_1_GA_2_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_2_epochs_1_GA_2_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..b37acf92c0d29b3c161d187f94539b24e07eb742 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a0dab0cefd97719e632dbe34186b207b24721cd439d644851337f06d4b9e2aa +size 918507402 diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_2_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_2_epochs_1_GA_2_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..b621d23102cafd7b8ffd5a04cba11d96194ecd15 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_2_epochs_1_GA_2_lora/trainer_state.json @@ -0,0 +1,4417 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 625, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0016, + "grad_norm": 1.0793138209817712, + "learning_rate": 1.0526315789473684e-05, + "loss": 1.4104, + "step": 1 + }, + { + "epoch": 0.0032, + "grad_norm": 0.8674660345497316, + "learning_rate": 2.105263157894737e-05, + "loss": 1.2554, + "step": 2 + }, + { + "epoch": 0.0048, + "grad_norm": 0.8985758119498529, + "learning_rate": 3.157894736842105e-05, + "loss": 1.3902, + "step": 3 + }, + { + "epoch": 0.0064, + "grad_norm": 0.7246031443112888, + "learning_rate": 4.210526315789474e-05, + "loss": 1.3101, + "step": 4 + }, + { + "epoch": 0.008, + "grad_norm": 0.6511872684793721, + "learning_rate": 5.2631578947368424e-05, + "loss": 1.1316, + "step": 5 + }, + { + "epoch": 0.0096, + "grad_norm": 0.763616935771609, + "learning_rate": 6.31578947368421e-05, + "loss": 1.3126, + "step": 6 + }, + { + "epoch": 0.0112, + "grad_norm": 0.6908154183889081, + "learning_rate": 7.368421052631579e-05, + "loss": 1.0942, + "step": 7 + }, + { + "epoch": 0.0128, + "grad_norm": 1.2335001982189193, + "learning_rate": 8.421052631578948e-05, + "loss": 1.3125, + "step": 8 + }, + { + "epoch": 0.0144, + "grad_norm": 0.9498545484386265, + "learning_rate": 9.473684210526316e-05, + "loss": 1.0454, + "step": 9 + }, + { + "epoch": 0.016, + "grad_norm": 1.0250421172188384, + "learning_rate": 0.00010526315789473685, + "loss": 1.1987, + "step": 10 + }, + { + "epoch": 0.0176, + "grad_norm": 1.0356218006105504, + "learning_rate": 0.00011578947368421053, + "loss": 1.1966, + "step": 11 + }, + { + "epoch": 0.0192, + "grad_norm": 0.7691232611946629, + "learning_rate": 0.0001263157894736842, + "loss": 1.1452, + "step": 12 + }, + { + "epoch": 0.0208, + "grad_norm": 0.6137120435062169, + "learning_rate": 0.0001368421052631579, + "loss": 0.9656, + "step": 13 + }, + { + "epoch": 0.0224, + "grad_norm": 0.6555096542988432, + "learning_rate": 0.00014736842105263158, + "loss": 1.0269, + "step": 14 + }, + { + "epoch": 0.024, + "grad_norm": 0.6836787264972417, + "learning_rate": 0.00015789473684210527, + "loss": 1.107, + "step": 15 + }, + { + "epoch": 0.0256, + "grad_norm": 0.5915370874369371, + "learning_rate": 0.00016842105263157895, + "loss": 0.979, + "step": 16 + }, + { + "epoch": 0.0272, + "grad_norm": 0.6193214806287105, + "learning_rate": 0.00017894736842105264, + "loss": 0.9109, + "step": 17 + }, + { + "epoch": 0.0288, + "grad_norm": 0.6553819808381135, + "learning_rate": 0.00018947368421052632, + "loss": 0.977, + "step": 18 + }, + { + "epoch": 0.0304, + "grad_norm": 0.6607466103071827, + "learning_rate": 0.0002, + "loss": 0.9003, + "step": 19 + }, + { + "epoch": 0.032, + "grad_norm": 0.577074883316804, + "learning_rate": 0.00019999865623437013, + "loss": 0.8983, + "step": 20 + }, + { + "epoch": 0.0336, + "grad_norm": 0.6291563797217955, + "learning_rate": 0.00019999462497359466, + "loss": 0.9503, + "step": 21 + }, + { + "epoch": 0.0352, + "grad_norm": 0.6851019336127845, + "learning_rate": 0.00019998790632601496, + "loss": 1.0646, + "step": 22 + }, + { + "epoch": 0.0368, + "grad_norm": 0.6473518797136671, + "learning_rate": 0.0001999785004721968, + "loss": 1.0297, + "step": 23 + }, + { + "epoch": 0.0384, + "grad_norm": 0.5756352725362215, + "learning_rate": 0.00019996640766492543, + "loss": 0.8767, + "step": 24 + }, + { + "epoch": 0.04, + "grad_norm": 0.6520852714809134, + "learning_rate": 0.00019995162822919883, + "loss": 0.9888, + "step": 25 + }, + { + "epoch": 0.0416, + "grad_norm": 0.5591575247915785, + "learning_rate": 0.00019993416256221895, + "loss": 0.9146, + "step": 26 + }, + { + "epoch": 0.0432, + "grad_norm": 0.5223616842749025, + "learning_rate": 0.00019991401113338104, + "loss": 0.8748, + "step": 27 + }, + { + "epoch": 0.0448, + "grad_norm": 0.5701781592261361, + "learning_rate": 0.00019989117448426108, + "loss": 0.9207, + "step": 28 + }, + { + "epoch": 0.0464, + "grad_norm": 0.6596981955668625, + "learning_rate": 0.00019986565322860115, + "loss": 0.9812, + "step": 29 + }, + { + "epoch": 0.048, + "grad_norm": 0.6389275851096986, + "learning_rate": 0.00019983744805229296, + "loss": 1.0121, + "step": 30 + }, + { + "epoch": 0.0496, + "grad_norm": 0.5692044640211616, + "learning_rate": 0.00019980655971335945, + "loss": 1.0032, + "step": 31 + }, + { + "epoch": 0.0512, + "grad_norm": 0.6172755546909754, + "learning_rate": 0.00019977298904193437, + "loss": 0.9598, + "step": 32 + }, + { + "epoch": 0.0528, + "grad_norm": 0.5780978905199465, + "learning_rate": 0.00019973673694024, + "loss": 0.8745, + "step": 33 + }, + { + "epoch": 0.0544, + "grad_norm": 0.49979542206264266, + "learning_rate": 0.00019969780438256293, + "loss": 0.8903, + "step": 34 + }, + { + "epoch": 0.056, + "grad_norm": 0.6544043706335699, + "learning_rate": 0.0001996561924152278, + "loss": 1.0492, + "step": 35 + }, + { + "epoch": 0.0576, + "grad_norm": 0.5269776535593357, + "learning_rate": 0.0001996119021565693, + "loss": 0.9326, + "step": 36 + }, + { + "epoch": 0.0592, + "grad_norm": 0.5703236802291971, + "learning_rate": 0.0001995649347969019, + "loss": 0.8848, + "step": 37 + }, + { + "epoch": 0.0608, + "grad_norm": 0.5748680136894786, + "learning_rate": 0.00019951529159848805, + "loss": 1.0248, + "step": 38 + }, + { + "epoch": 0.0624, + "grad_norm": 0.6715969458430749, + "learning_rate": 0.00019946297389550433, + "loss": 1.0063, + "step": 39 + }, + { + "epoch": 0.064, + "grad_norm": 0.532057621226445, + "learning_rate": 0.00019940798309400526, + "loss": 0.8946, + "step": 40 + }, + { + "epoch": 0.0656, + "grad_norm": 0.4833052914364651, + "learning_rate": 0.0001993503206718859, + "loss": 0.8794, + "step": 41 + }, + { + "epoch": 0.0672, + "grad_norm": 0.5473279088698683, + "learning_rate": 0.00019928998817884182, + "loss": 0.9075, + "step": 42 + }, + { + "epoch": 0.0688, + "grad_norm": 0.4883523970636091, + "learning_rate": 0.00019922698723632767, + "loss": 0.8992, + "step": 43 + }, + { + "epoch": 0.0704, + "grad_norm": 0.5340051810138325, + "learning_rate": 0.00019916131953751342, + "loss": 0.9149, + "step": 44 + }, + { + "epoch": 0.072, + "grad_norm": 0.5064485228548891, + "learning_rate": 0.00019909298684723904, + "loss": 0.8637, + "step": 45 + }, + { + "epoch": 0.0736, + "grad_norm": 0.7975601380952108, + "learning_rate": 0.00019902199100196697, + "loss": 1.0321, + "step": 46 + }, + { + "epoch": 0.0752, + "grad_norm": 0.44587397079465824, + "learning_rate": 0.00019894833390973266, + "loss": 0.8226, + "step": 47 + }, + { + "epoch": 0.0768, + "grad_norm": 0.48608845583225374, + "learning_rate": 0.00019887201755009357, + "loss": 0.8716, + "step": 48 + }, + { + "epoch": 0.0784, + "grad_norm": 0.5198898257438284, + "learning_rate": 0.0001987930439740757, + "loss": 0.8802, + "step": 49 + }, + { + "epoch": 0.08, + "grad_norm": 0.8158293298366603, + "learning_rate": 0.00019871141530411853, + "loss": 0.8988, + "step": 50 + }, + { + "epoch": 0.0816, + "grad_norm": 0.5781269309537154, + "learning_rate": 0.0001986271337340182, + "loss": 0.9523, + "step": 51 + }, + { + "epoch": 0.0832, + "grad_norm": 0.5945814159638966, + "learning_rate": 0.00019854020152886814, + "loss": 1.0096, + "step": 52 + }, + { + "epoch": 0.0848, + "grad_norm": 0.5087211255257024, + "learning_rate": 0.0001984506210249986, + "loss": 0.8375, + "step": 53 + }, + { + "epoch": 0.0864, + "grad_norm": 0.5910704844668925, + "learning_rate": 0.00019835839462991361, + "loss": 0.7949, + "step": 54 + }, + { + "epoch": 0.088, + "grad_norm": 0.6165737968134988, + "learning_rate": 0.00019826352482222638, + "loss": 0.9585, + "step": 55 + }, + { + "epoch": 0.0896, + "grad_norm": 0.5272556688243508, + "learning_rate": 0.00019816601415159263, + "loss": 0.8688, + "step": 56 + }, + { + "epoch": 0.0912, + "grad_norm": 0.48991461637614614, + "learning_rate": 0.0001980658652386421, + "loss": 0.9111, + "step": 57 + }, + { + "epoch": 0.0928, + "grad_norm": 0.605546363473904, + "learning_rate": 0.00019796308077490817, + "loss": 0.99, + "step": 58 + }, + { + "epoch": 0.0944, + "grad_norm": 0.46970358749198515, + "learning_rate": 0.00019785766352275542, + "loss": 0.889, + "step": 59 + }, + { + "epoch": 0.096, + "grad_norm": 0.4323321438964964, + "learning_rate": 0.00019774961631530545, + "loss": 0.8242, + "step": 60 + }, + { + "epoch": 0.0976, + "grad_norm": 0.6421355965110687, + "learning_rate": 0.00019763894205636072, + "loss": 1.0748, + "step": 61 + }, + { + "epoch": 0.0992, + "grad_norm": 0.4745832195507423, + "learning_rate": 0.00019752564372032657, + "loss": 0.8315, + "step": 62 + }, + { + "epoch": 0.1008, + "grad_norm": 0.6807110202412574, + "learning_rate": 0.00019740972435213115, + "loss": 0.9852, + "step": 63 + }, + { + "epoch": 0.1024, + "grad_norm": 0.5516621767923496, + "learning_rate": 0.00019729118706714375, + "loss": 0.9619, + "step": 64 + }, + { + "epoch": 0.104, + "grad_norm": 0.5442774338710398, + "learning_rate": 0.00019717003505109095, + "loss": 0.8259, + "step": 65 + }, + { + "epoch": 0.1056, + "grad_norm": 0.6137001844755123, + "learning_rate": 0.00019704627155997108, + "loss": 0.8962, + "step": 66 + }, + { + "epoch": 0.1072, + "grad_norm": 0.5438522046733677, + "learning_rate": 0.00019691989991996663, + "loss": 0.882, + "step": 67 + }, + { + "epoch": 0.1088, + "grad_norm": 0.6269568278040801, + "learning_rate": 0.0001967909235273549, + "loss": 1.0232, + "step": 68 + }, + { + "epoch": 0.1104, + "grad_norm": 0.5683434963903127, + "learning_rate": 0.00019665934584841682, + "loss": 0.8979, + "step": 69 + }, + { + "epoch": 0.112, + "grad_norm": 0.5196024255658687, + "learning_rate": 0.00019652517041934356, + "loss": 0.9263, + "step": 70 + }, + { + "epoch": 0.1136, + "grad_norm": 0.4243653568771807, + "learning_rate": 0.00019638840084614182, + "loss": 0.8549, + "step": 71 + }, + { + "epoch": 0.1152, + "grad_norm": 0.6071660525964858, + "learning_rate": 0.00019624904080453655, + "loss": 0.9677, + "step": 72 + }, + { + "epoch": 0.1168, + "grad_norm": 0.5017063724170426, + "learning_rate": 0.00019610709403987246, + "loss": 0.7994, + "step": 73 + }, + { + "epoch": 0.1184, + "grad_norm": 0.5670301642307584, + "learning_rate": 0.00019596256436701324, + "loss": 0.974, + "step": 74 + }, + { + "epoch": 0.12, + "grad_norm": 0.5174898485382943, + "learning_rate": 0.000195815455670239, + "loss": 0.9166, + "step": 75 + }, + { + "epoch": 0.1216, + "grad_norm": 0.5727910841053342, + "learning_rate": 0.00019566577190314197, + "loss": 0.9744, + "step": 76 + }, + { + "epoch": 0.1232, + "grad_norm": 0.5892412963128097, + "learning_rate": 0.0001955135170885202, + "loss": 0.9508, + "step": 77 + }, + { + "epoch": 0.1248, + "grad_norm": 0.642838513538222, + "learning_rate": 0.00019535869531826937, + "loss": 0.9288, + "step": 78 + }, + { + "epoch": 0.1264, + "grad_norm": 0.5078172724450312, + "learning_rate": 0.00019520131075327298, + "loss": 0.8302, + "step": 79 + }, + { + "epoch": 0.128, + "grad_norm": 0.6104299828421814, + "learning_rate": 0.00019504136762329047, + "loss": 0.9432, + "step": 80 + }, + { + "epoch": 0.1296, + "grad_norm": 0.5567938689379198, + "learning_rate": 0.00019487887022684336, + "loss": 0.9509, + "step": 81 + }, + { + "epoch": 0.1312, + "grad_norm": 0.5174359389523094, + "learning_rate": 0.00019471382293110003, + "loss": 0.8202, + "step": 82 + }, + { + "epoch": 0.1328, + "grad_norm": 0.6102149265809388, + "learning_rate": 0.00019454623017175812, + "loss": 0.931, + "step": 83 + }, + { + "epoch": 0.1344, + "grad_norm": 0.488489313365436, + "learning_rate": 0.00019437609645292546, + "loss": 0.9127, + "step": 84 + }, + { + "epoch": 0.136, + "grad_norm": 0.5191733814891298, + "learning_rate": 0.0001942034263469989, + "loss": 0.819, + "step": 85 + }, + { + "epoch": 0.1376, + "grad_norm": 0.49796814189560956, + "learning_rate": 0.00019402822449454153, + "loss": 0.8484, + "step": 86 + }, + { + "epoch": 0.1392, + "grad_norm": 0.5380837654524147, + "learning_rate": 0.00019385049560415794, + "loss": 0.908, + "step": 87 + }, + { + "epoch": 0.1408, + "grad_norm": 0.5019138128050206, + "learning_rate": 0.00019367024445236754, + "loss": 0.8827, + "step": 88 + }, + { + "epoch": 0.1424, + "grad_norm": 0.46651823153453, + "learning_rate": 0.00019348747588347637, + "loss": 0.7727, + "step": 89 + }, + { + "epoch": 0.144, + "grad_norm": 0.4986566878404386, + "learning_rate": 0.00019330219480944694, + "loss": 0.833, + "step": 90 + }, + { + "epoch": 0.1456, + "grad_norm": 0.4983672326086276, + "learning_rate": 0.00019311440620976597, + "loss": 0.7986, + "step": 91 + }, + { + "epoch": 0.1472, + "grad_norm": 0.6330160448213425, + "learning_rate": 0.0001929241151313108, + "loss": 0.9544, + "step": 92 + }, + { + "epoch": 0.1488, + "grad_norm": 0.44904646849391194, + "learning_rate": 0.00019273132668821364, + "loss": 0.7966, + "step": 93 + }, + { + "epoch": 0.1504, + "grad_norm": 0.4542692665928206, + "learning_rate": 0.00019253604606172417, + "loss": 0.8242, + "step": 94 + }, + { + "epoch": 0.152, + "grad_norm": 0.515424314483351, + "learning_rate": 0.00019233827850007027, + "loss": 0.9082, + "step": 95 + }, + { + "epoch": 0.1536, + "grad_norm": 0.5697447448543441, + "learning_rate": 0.00019213802931831696, + "loss": 0.9013, + "step": 96 + }, + { + "epoch": 0.1552, + "grad_norm": 0.5137764674379561, + "learning_rate": 0.00019193530389822363, + "loss": 0.8137, + "step": 97 + }, + { + "epoch": 0.1568, + "grad_norm": 0.479919016645595, + "learning_rate": 0.00019173010768809933, + "loss": 0.8398, + "step": 98 + }, + { + "epoch": 0.1584, + "grad_norm": 0.49647779658563373, + "learning_rate": 0.0001915224462026563, + "loss": 0.9328, + "step": 99 + }, + { + "epoch": 0.16, + "grad_norm": 0.6899167671368104, + "learning_rate": 0.00019131232502286188, + "loss": 0.8429, + "step": 100 + }, + { + "epoch": 0.1616, + "grad_norm": 0.61597299646533, + "learning_rate": 0.0001910997497957885, + "loss": 0.866, + "step": 101 + }, + { + "epoch": 0.1632, + "grad_norm": 0.5050759894276498, + "learning_rate": 0.00019088472623446183, + "loss": 0.9003, + "step": 102 + }, + { + "epoch": 0.1648, + "grad_norm": 0.5553537290912124, + "learning_rate": 0.00019066726011770726, + "loss": 0.9935, + "step": 103 + }, + { + "epoch": 0.1664, + "grad_norm": 0.5709926920654692, + "learning_rate": 0.0001904473572899947, + "loss": 0.933, + "step": 104 + }, + { + "epoch": 0.168, + "grad_norm": 0.5749511557771967, + "learning_rate": 0.00019022502366128135, + "loss": 0.8781, + "step": 105 + }, + { + "epoch": 0.1696, + "grad_norm": 0.574231287799876, + "learning_rate": 0.00019000026520685302, + "loss": 0.8559, + "step": 106 + }, + { + "epoch": 0.1712, + "grad_norm": 0.4575436155659373, + "learning_rate": 0.0001897730879671634, + "loss": 0.8057, + "step": 107 + }, + { + "epoch": 0.1728, + "grad_norm": 0.512997079291813, + "learning_rate": 0.00018954349804767184, + "loss": 0.9096, + "step": 108 + }, + { + "epoch": 0.1744, + "grad_norm": 0.4543950317580882, + "learning_rate": 0.00018931150161867916, + "loss": 0.8099, + "step": 109 + }, + { + "epoch": 0.176, + "grad_norm": 0.7415531993835159, + "learning_rate": 0.00018907710491516199, + "loss": 0.92, + "step": 110 + }, + { + "epoch": 0.1776, + "grad_norm": 0.5031015613723627, + "learning_rate": 0.0001888403142366049, + "loss": 0.8489, + "step": 111 + }, + { + "epoch": 0.1792, + "grad_norm": 0.5069331186376628, + "learning_rate": 0.00018860113594683148, + "loss": 0.8417, + "step": 112 + }, + { + "epoch": 0.1808, + "grad_norm": 0.5296095779215624, + "learning_rate": 0.00018835957647383303, + "loss": 0.8415, + "step": 113 + }, + { + "epoch": 0.1824, + "grad_norm": 0.6174068211456757, + "learning_rate": 0.00018811564230959588, + "loss": 1.014, + "step": 114 + }, + { + "epoch": 0.184, + "grad_norm": 0.5038692627979767, + "learning_rate": 0.00018786934000992688, + "loss": 0.9044, + "step": 115 + }, + { + "epoch": 0.1856, + "grad_norm": 0.5174851061554335, + "learning_rate": 0.00018762067619427746, + "loss": 0.858, + "step": 116 + }, + { + "epoch": 0.1872, + "grad_norm": 0.5117319530131302, + "learning_rate": 0.00018736965754556528, + "loss": 0.9151, + "step": 117 + }, + { + "epoch": 0.1888, + "grad_norm": 0.55146182969168, + "learning_rate": 0.00018711629080999504, + "loss": 0.9628, + "step": 118 + }, + { + "epoch": 0.1904, + "grad_norm": 0.47125698940698507, + "learning_rate": 0.00018686058279687698, + "loss": 0.7792, + "step": 119 + }, + { + "epoch": 0.192, + "grad_norm": 0.48183861719102145, + "learning_rate": 0.00018660254037844388, + "loss": 0.81, + "step": 120 + }, + { + "epoch": 0.1936, + "grad_norm": 0.7076884238374276, + "learning_rate": 0.00018634217048966637, + "loss": 0.9862, + "step": 121 + }, + { + "epoch": 0.1952, + "grad_norm": 0.5277239123951805, + "learning_rate": 0.0001860794801280666, + "loss": 0.8455, + "step": 122 + }, + { + "epoch": 0.1968, + "grad_norm": 0.5500019878771762, + "learning_rate": 0.0001858144763535302, + "loss": 0.9344, + "step": 123 + }, + { + "epoch": 0.1984, + "grad_norm": 0.5681596854522988, + "learning_rate": 0.0001855471662881164, + "loss": 0.8738, + "step": 124 + }, + { + "epoch": 0.2, + "grad_norm": 0.537228959356412, + "learning_rate": 0.00018527755711586678, + "loss": 0.9068, + "step": 125 + }, + { + "epoch": 0.2016, + "grad_norm": 0.4068580570219105, + "learning_rate": 0.00018500565608261214, + "loss": 0.7808, + "step": 126 + }, + { + "epoch": 0.2032, + "grad_norm": 0.5709251278452853, + "learning_rate": 0.00018473147049577774, + "loss": 0.9829, + "step": 127 + }, + { + "epoch": 0.2048, + "grad_norm": 0.5220868006365708, + "learning_rate": 0.00018445500772418697, + "loss": 0.8227, + "step": 128 + }, + { + "epoch": 0.2064, + "grad_norm": 0.4291156529108893, + "learning_rate": 0.00018417627519786315, + "loss": 0.7303, + "step": 129 + }, + { + "epoch": 0.208, + "grad_norm": 0.5244275087026982, + "learning_rate": 0.00018389528040783012, + "loss": 0.8909, + "step": 130 + }, + { + "epoch": 0.2096, + "grad_norm": 0.5749167648537551, + "learning_rate": 0.00018361203090591071, + "loss": 0.9396, + "step": 131 + }, + { + "epoch": 0.2112, + "grad_norm": 0.5006517894046707, + "learning_rate": 0.00018332653430452376, + "loss": 0.8603, + "step": 132 + }, + { + "epoch": 0.2128, + "grad_norm": 0.42912502006697223, + "learning_rate": 0.00018303879827647975, + "loss": 0.827, + "step": 133 + }, + { + "epoch": 0.2144, + "grad_norm": 0.6563250170391516, + "learning_rate": 0.00018274883055477436, + "loss": 1.0645, + "step": 134 + }, + { + "epoch": 0.216, + "grad_norm": 0.5071156739289818, + "learning_rate": 0.00018245663893238075, + "loss": 0.843, + "step": 135 + }, + { + "epoch": 0.2176, + "grad_norm": 0.4443624151432539, + "learning_rate": 0.00018216223126204007, + "loss": 0.7973, + "step": 136 + }, + { + "epoch": 0.2192, + "grad_norm": 0.5378395523138448, + "learning_rate": 0.00018186561545605054, + "loss": 0.8218, + "step": 137 + }, + { + "epoch": 0.2208, + "grad_norm": 0.4372134744896649, + "learning_rate": 0.00018156679948605467, + "loss": 0.8177, + "step": 138 + }, + { + "epoch": 0.2224, + "grad_norm": 0.5704501228806174, + "learning_rate": 0.00018126579138282503, + "loss": 0.8947, + "step": 139 + }, + { + "epoch": 0.224, + "grad_norm": 0.5534021620896453, + "learning_rate": 0.0001809625992360485, + "loss": 0.9012, + "step": 140 + }, + { + "epoch": 0.2256, + "grad_norm": 0.5088507443455628, + "learning_rate": 0.00018065723119410884, + "loss": 0.8315, + "step": 141 + }, + { + "epoch": 0.2272, + "grad_norm": 0.6305342270801653, + "learning_rate": 0.00018034969546386757, + "loss": 0.9733, + "step": 142 + }, + { + "epoch": 0.2288, + "grad_norm": 0.5556030663549947, + "learning_rate": 0.0001800400003104436, + "loss": 0.8774, + "step": 143 + }, + { + "epoch": 0.2304, + "grad_norm": 0.584957212445808, + "learning_rate": 0.00017972815405699103, + "loss": 0.8956, + "step": 144 + }, + { + "epoch": 0.232, + "grad_norm": 0.49116410611405603, + "learning_rate": 0.00017941416508447536, + "loss": 0.7761, + "step": 145 + }, + { + "epoch": 0.2336, + "grad_norm": 0.6558956525178432, + "learning_rate": 0.0001790980418314484, + "loss": 1.0159, + "step": 146 + }, + { + "epoch": 0.2352, + "grad_norm": 0.46229926694528667, + "learning_rate": 0.00017877979279382135, + "loss": 0.7958, + "step": 147 + }, + { + "epoch": 0.2368, + "grad_norm": 0.533918210765701, + "learning_rate": 0.0001784594265246366, + "loss": 0.8272, + "step": 148 + }, + { + "epoch": 0.2384, + "grad_norm": 0.5525524093954354, + "learning_rate": 0.0001781369516338378, + "loss": 0.8472, + "step": 149 + }, + { + "epoch": 0.24, + "grad_norm": 0.5268413236464946, + "learning_rate": 0.00017781237678803847, + "loss": 0.789, + "step": 150 + }, + { + "epoch": 0.2416, + "grad_norm": 0.6034098838618137, + "learning_rate": 0.000177485710710289, + "loss": 0.768, + "step": 151 + }, + { + "epoch": 0.2432, + "grad_norm": 0.5123820139406852, + "learning_rate": 0.00017715696217984235, + "loss": 0.8368, + "step": 152 + }, + { + "epoch": 0.2448, + "grad_norm": 0.5221297704321616, + "learning_rate": 0.00017682614003191807, + "loss": 0.876, + "step": 153 + }, + { + "epoch": 0.2464, + "grad_norm": 0.5443635828037163, + "learning_rate": 0.00017649325315746478, + "loss": 0.904, + "step": 154 + }, + { + "epoch": 0.248, + "grad_norm": 0.4127347153247367, + "learning_rate": 0.0001761583105029213, + "loss": 0.7049, + "step": 155 + }, + { + "epoch": 0.2496, + "grad_norm": 0.5058913241758101, + "learning_rate": 0.00017582132106997616, + "loss": 0.8628, + "step": 156 + }, + { + "epoch": 0.2512, + "grad_norm": 0.5241538262739248, + "learning_rate": 0.00017548229391532572, + "loss": 0.8588, + "step": 157 + }, + { + "epoch": 0.2528, + "grad_norm": 0.4339969221990366, + "learning_rate": 0.00017514123815043074, + "loss": 0.8594, + "step": 158 + }, + { + "epoch": 0.2544, + "grad_norm": 0.5410233290873173, + "learning_rate": 0.00017479816294127152, + "loss": 0.8184, + "step": 159 + }, + { + "epoch": 0.256, + "grad_norm": 0.5643039351700077, + "learning_rate": 0.0001744530775081015, + "loss": 0.8973, + "step": 160 + }, + { + "epoch": 0.2576, + "grad_norm": 0.5070570954794127, + "learning_rate": 0.0001741059911251997, + "loss": 0.9373, + "step": 161 + }, + { + "epoch": 0.2592, + "grad_norm": 0.6431993631294437, + "learning_rate": 0.000173756913120621, + "loss": 0.8834, + "step": 162 + }, + { + "epoch": 0.2608, + "grad_norm": 0.5463897311592459, + "learning_rate": 0.00017340585287594604, + "loss": 0.8516, + "step": 163 + }, + { + "epoch": 0.2624, + "grad_norm": 0.5510992888253026, + "learning_rate": 0.0001730528198260285, + "loss": 0.9278, + "step": 164 + }, + { + "epoch": 0.264, + "grad_norm": 0.48576945131497623, + "learning_rate": 0.00017269782345874203, + "loss": 0.8189, + "step": 165 + }, + { + "epoch": 0.2656, + "grad_norm": 0.5455089581962679, + "learning_rate": 0.00017234087331472497, + "loss": 0.849, + "step": 166 + }, + { + "epoch": 0.2672, + "grad_norm": 0.4671115204530441, + "learning_rate": 0.00017198197898712404, + "loss": 0.8106, + "step": 167 + }, + { + "epoch": 0.2688, + "grad_norm": 0.606453541968653, + "learning_rate": 0.00017162115012133643, + "loss": 0.8329, + "step": 168 + }, + { + "epoch": 0.2704, + "grad_norm": 0.5246409469178386, + "learning_rate": 0.00017125839641475072, + "loss": 0.8416, + "step": 169 + }, + { + "epoch": 0.272, + "grad_norm": 0.5325599880382337, + "learning_rate": 0.00017089372761648616, + "loss": 0.8032, + "step": 170 + }, + { + "epoch": 0.2736, + "grad_norm": 0.586907938828374, + "learning_rate": 0.00017052715352713075, + "loss": 0.8808, + "step": 171 + }, + { + "epoch": 0.2752, + "grad_norm": 0.6172095666111331, + "learning_rate": 0.00017015868399847768, + "loss": 0.8312, + "step": 172 + }, + { + "epoch": 0.2768, + "grad_norm": 0.6019656760906752, + "learning_rate": 0.00016978832893326074, + "loss": 0.8424, + "step": 173 + }, + { + "epoch": 0.2784, + "grad_norm": 0.49991172993633143, + "learning_rate": 0.00016941609828488807, + "loss": 0.8421, + "step": 174 + }, + { + "epoch": 0.28, + "grad_norm": 0.7448569391878513, + "learning_rate": 0.0001690420020571747, + "loss": 0.8937, + "step": 175 + }, + { + "epoch": 0.2816, + "grad_norm": 0.4519078160719926, + "learning_rate": 0.0001686660503040737, + "loss": 0.8147, + "step": 176 + }, + { + "epoch": 0.2832, + "grad_norm": 0.5294837761405404, + "learning_rate": 0.00016828825312940592, + "loss": 0.8715, + "step": 177 + }, + { + "epoch": 0.2848, + "grad_norm": 0.4543029332406574, + "learning_rate": 0.0001679086206865886, + "loss": 0.8011, + "step": 178 + }, + { + "epoch": 0.2864, + "grad_norm": 0.5030499995735945, + "learning_rate": 0.00016752716317836229, + "loss": 0.7599, + "step": 179 + }, + { + "epoch": 0.288, + "grad_norm": 0.5249044057354469, + "learning_rate": 0.0001671438908565167, + "loss": 0.7863, + "step": 180 + }, + { + "epoch": 0.2896, + "grad_norm": 0.5367192267025025, + "learning_rate": 0.00016675881402161536, + "loss": 0.7586, + "step": 181 + }, + { + "epoch": 0.2912, + "grad_norm": 0.6046994598165022, + "learning_rate": 0.0001663719430227186, + "loss": 0.8595, + "step": 182 + }, + { + "epoch": 0.2928, + "grad_norm": 0.5511975892576534, + "learning_rate": 0.00016598328825710533, + "loss": 0.8884, + "step": 183 + }, + { + "epoch": 0.2944, + "grad_norm": 0.47021844068648017, + "learning_rate": 0.000165592860169994, + "loss": 0.7914, + "step": 184 + }, + { + "epoch": 0.296, + "grad_norm": 0.5609037523683333, + "learning_rate": 0.00016520066925426144, + "loss": 0.8113, + "step": 185 + }, + { + "epoch": 0.2976, + "grad_norm": 0.5278715536138033, + "learning_rate": 0.0001648067260501611, + "loss": 0.8919, + "step": 186 + }, + { + "epoch": 0.2992, + "grad_norm": 0.47435584083177773, + "learning_rate": 0.0001644110411450398, + "loss": 0.7481, + "step": 187 + }, + { + "epoch": 0.3008, + "grad_norm": 0.668966210084083, + "learning_rate": 0.00016401362517305296, + "loss": 0.9043, + "step": 188 + }, + { + "epoch": 0.3024, + "grad_norm": 0.4942687214975865, + "learning_rate": 0.00016361448881487914, + "loss": 0.7966, + "step": 189 + }, + { + "epoch": 0.304, + "grad_norm": 0.4764422141715473, + "learning_rate": 0.00016321364279743266, + "loss": 0.7993, + "step": 190 + }, + { + "epoch": 0.3056, + "grad_norm": 0.6016523774814396, + "learning_rate": 0.0001628110978935756, + "loss": 0.9557, + "step": 191 + }, + { + "epoch": 0.3072, + "grad_norm": 0.4661173796072379, + "learning_rate": 0.00016240686492182804, + "loss": 0.8348, + "step": 192 + }, + { + "epoch": 0.3088, + "grad_norm": 0.6051015964428406, + "learning_rate": 0.00016200095474607753, + "loss": 0.9355, + "step": 193 + }, + { + "epoch": 0.3104, + "grad_norm": 0.4578947855365344, + "learning_rate": 0.00016159337827528685, + "loss": 0.8189, + "step": 194 + }, + { + "epoch": 0.312, + "grad_norm": 0.4509433112679857, + "learning_rate": 0.0001611841464632011, + "loss": 0.8225, + "step": 195 + }, + { + "epoch": 0.3136, + "grad_norm": 0.5235831584168316, + "learning_rate": 0.0001607732703080532, + "loss": 0.8879, + "step": 196 + }, + { + "epoch": 0.3152, + "grad_norm": 0.44986301429924835, + "learning_rate": 0.00016036076085226814, + "loss": 0.8279, + "step": 197 + }, + { + "epoch": 0.3168, + "grad_norm": 0.5490905986071285, + "learning_rate": 0.0001599466291821666, + "loss": 0.8937, + "step": 198 + }, + { + "epoch": 0.3184, + "grad_norm": 0.5282971310739915, + "learning_rate": 0.0001595308864276666, + "loss": 0.8387, + "step": 199 + }, + { + "epoch": 0.32, + "grad_norm": 0.5582842618729132, + "learning_rate": 0.0001591135437619847, + "loss": 0.8755, + "step": 200 + }, + { + "epoch": 0.3216, + "grad_norm": 0.4974797052185431, + "learning_rate": 0.0001586946124013354, + "loss": 0.702, + "step": 201 + }, + { + "epoch": 0.3232, + "grad_norm": 0.6767520835439186, + "learning_rate": 0.0001582741036046301, + "loss": 0.9832, + "step": 202 + }, + { + "epoch": 0.3248, + "grad_norm": 0.49096721822733147, + "learning_rate": 0.00015785202867317407, + "loss": 0.834, + "step": 203 + }, + { + "epoch": 0.3264, + "grad_norm": 0.4336794454311267, + "learning_rate": 0.00015742839895036305, + "loss": 0.7807, + "step": 204 + }, + { + "epoch": 0.328, + "grad_norm": 0.561256529665769, + "learning_rate": 0.00015700322582137827, + "loss": 0.8304, + "step": 205 + }, + { + "epoch": 0.3296, + "grad_norm": 0.5346326077710617, + "learning_rate": 0.0001565765207128805, + "loss": 0.8853, + "step": 206 + }, + { + "epoch": 0.3312, + "grad_norm": 0.5575705612708424, + "learning_rate": 0.0001561482950927029, + "loss": 0.845, + "step": 207 + }, + { + "epoch": 0.3328, + "grad_norm": 0.6113035100414476, + "learning_rate": 0.00015571856046954285, + "loss": 0.8369, + "step": 208 + }, + { + "epoch": 0.3344, + "grad_norm": 0.5531036051454451, + "learning_rate": 0.00015528732839265272, + "loss": 0.8495, + "step": 209 + }, + { + "epoch": 0.336, + "grad_norm": 0.5191241536194888, + "learning_rate": 0.0001548546104515294, + "loss": 0.8543, + "step": 210 + }, + { + "epoch": 0.3376, + "grad_norm": 0.6609229189584678, + "learning_rate": 0.00015442041827560274, + "loss": 0.9303, + "step": 211 + }, + { + "epoch": 0.3392, + "grad_norm": 0.523613656193414, + "learning_rate": 0.00015398476353392323, + "loss": 0.8125, + "step": 212 + }, + { + "epoch": 0.3408, + "grad_norm": 0.6000257986217872, + "learning_rate": 0.00015354765793484834, + "loss": 0.9602, + "step": 213 + }, + { + "epoch": 0.3424, + "grad_norm": 0.5284405089059923, + "learning_rate": 0.00015310911322572753, + "loss": 0.8679, + "step": 214 + }, + { + "epoch": 0.344, + "grad_norm": 0.784321921220287, + "learning_rate": 0.000152669141192587, + "loss": 1.0552, + "step": 215 + }, + { + "epoch": 0.3456, + "grad_norm": 0.5348576999816136, + "learning_rate": 0.00015222775365981273, + "loss": 0.8425, + "step": 216 + }, + { + "epoch": 0.3472, + "grad_norm": 0.7391155392801806, + "learning_rate": 0.00015178496248983254, + "loss": 0.9355, + "step": 217 + }, + { + "epoch": 0.3488, + "grad_norm": 0.46575258380512435, + "learning_rate": 0.00015134077958279765, + "loss": 0.76, + "step": 218 + }, + { + "epoch": 0.3504, + "grad_norm": 0.48666315272804594, + "learning_rate": 0.00015089521687626243, + "loss": 0.8717, + "step": 219 + }, + { + "epoch": 0.352, + "grad_norm": 0.565648946468228, + "learning_rate": 0.000150448286344864, + "loss": 0.8518, + "step": 220 + }, + { + "epoch": 0.3536, + "grad_norm": 0.5309309049154313, + "learning_rate": 0.00015000000000000001, + "loss": 0.8486, + "step": 221 + }, + { + "epoch": 0.3552, + "grad_norm": 0.4684232580120976, + "learning_rate": 0.00014955036988950618, + "loss": 0.8592, + "step": 222 + }, + { + "epoch": 0.3568, + "grad_norm": 0.4764313022388415, + "learning_rate": 0.00014909940809733222, + "loss": 0.8644, + "step": 223 + }, + { + "epoch": 0.3584, + "grad_norm": 0.8136632652056193, + "learning_rate": 0.00014864712674321734, + "loss": 0.9112, + "step": 224 + }, + { + "epoch": 0.36, + "grad_norm": 0.6438521631504122, + "learning_rate": 0.00014819353798236427, + "loss": 0.8526, + "step": 225 + }, + { + "epoch": 0.3616, + "grad_norm": 0.5102341931924466, + "learning_rate": 0.00014773865400511272, + "loss": 0.7808, + "step": 226 + }, + { + "epoch": 0.3632, + "grad_norm": 0.5940550280405117, + "learning_rate": 0.00014728248703661182, + "loss": 0.8275, + "step": 227 + }, + { + "epoch": 0.3648, + "grad_norm": 0.623552086471691, + "learning_rate": 0.00014682504933649144, + "loss": 0.9372, + "step": 228 + }, + { + "epoch": 0.3664, + "grad_norm": 0.5378567905788513, + "learning_rate": 0.00014636635319853275, + "loss": 0.8185, + "step": 229 + }, + { + "epoch": 0.368, + "grad_norm": 0.4925727020488738, + "learning_rate": 0.00014590641095033787, + "loss": 0.8259, + "step": 230 + }, + { + "epoch": 0.3696, + "grad_norm": 0.6315840232859294, + "learning_rate": 0.00014544523495299842, + "loss": 0.8099, + "step": 231 + }, + { + "epoch": 0.3712, + "grad_norm": 0.5932994881382887, + "learning_rate": 0.0001449828376007636, + "loss": 0.876, + "step": 232 + }, + { + "epoch": 0.3728, + "grad_norm": 0.6004134023890813, + "learning_rate": 0.0001445192313207067, + "loss": 0.869, + "step": 233 + }, + { + "epoch": 0.3744, + "grad_norm": 0.5748874245676628, + "learning_rate": 0.0001440544285723915, + "loss": 0.8937, + "step": 234 + }, + { + "epoch": 0.376, + "grad_norm": 0.4777508163658265, + "learning_rate": 0.00014358844184753712, + "loss": 0.84, + "step": 235 + }, + { + "epoch": 0.3776, + "grad_norm": 0.5437019804171094, + "learning_rate": 0.00014312128366968243, + "loss": 0.8783, + "step": 236 + }, + { + "epoch": 0.3792, + "grad_norm": 0.6389138378535143, + "learning_rate": 0.00014265296659384956, + "loss": 0.8807, + "step": 237 + }, + { + "epoch": 0.3808, + "grad_norm": 0.4882309118075003, + "learning_rate": 0.00014218350320620624, + "loss": 0.8168, + "step": 238 + }, + { + "epoch": 0.3824, + "grad_norm": 0.5168071744262905, + "learning_rate": 0.0001417129061237278, + "loss": 0.8979, + "step": 239 + }, + { + "epoch": 0.384, + "grad_norm": 0.49827826762164956, + "learning_rate": 0.00014124118799385796, + "loss": 0.8399, + "step": 240 + }, + { + "epoch": 0.3856, + "grad_norm": 0.5702744992964714, + "learning_rate": 0.00014076836149416887, + "loss": 0.8623, + "step": 241 + }, + { + "epoch": 0.3872, + "grad_norm": 0.5576641561434929, + "learning_rate": 0.0001402944393320206, + "loss": 0.8827, + "step": 242 + }, + { + "epoch": 0.3888, + "grad_norm": 0.46111048315752434, + "learning_rate": 0.00013981943424421932, + "loss": 0.7337, + "step": 243 + }, + { + "epoch": 0.3904, + "grad_norm": 0.6104351142056681, + "learning_rate": 0.00013934335899667527, + "loss": 0.7917, + "step": 244 + }, + { + "epoch": 0.392, + "grad_norm": 0.6977294844697014, + "learning_rate": 0.00013886622638405952, + "loss": 0.9764, + "step": 245 + }, + { + "epoch": 0.3936, + "grad_norm": 0.5464742075267145, + "learning_rate": 0.00013838804922946027, + "loss": 0.8489, + "step": 246 + }, + { + "epoch": 0.3952, + "grad_norm": 0.4408710931040891, + "learning_rate": 0.00013790884038403795, + "loss": 0.7347, + "step": 247 + }, + { + "epoch": 0.3968, + "grad_norm": 0.4550331663208126, + "learning_rate": 0.00013742861272668012, + "loss": 0.8172, + "step": 248 + }, + { + "epoch": 0.3984, + "grad_norm": 0.47431682121182805, + "learning_rate": 0.00013694737916365517, + "loss": 0.755, + "step": 249 + }, + { + "epoch": 0.4, + "grad_norm": 0.6035254807715448, + "learning_rate": 0.00013646515262826552, + "loss": 0.9423, + "step": 250 + }, + { + "epoch": 0.4016, + "grad_norm": 0.3751226324854456, + "learning_rate": 0.0001359819460805001, + "loss": 0.6642, + "step": 251 + }, + { + "epoch": 0.4032, + "grad_norm": 0.45736782627361144, + "learning_rate": 0.0001354977725066859, + "loss": 0.7436, + "step": 252 + }, + { + "epoch": 0.4048, + "grad_norm": 0.5145611235864497, + "learning_rate": 0.00013501264491913906, + "loss": 0.8351, + "step": 253 + }, + { + "epoch": 0.4064, + "grad_norm": 0.9386687957598261, + "learning_rate": 0.0001345265763558152, + "loss": 0.8669, + "step": 254 + }, + { + "epoch": 0.408, + "grad_norm": 0.5862045163334885, + "learning_rate": 0.00013403957987995882, + "loss": 0.9092, + "step": 255 + }, + { + "epoch": 0.4096, + "grad_norm": 0.7037646177150105, + "learning_rate": 0.0001335516685797525, + "loss": 0.8805, + "step": 256 + }, + { + "epoch": 0.4112, + "grad_norm": 0.8613306434547313, + "learning_rate": 0.00013306285556796495, + "loss": 0.9224, + "step": 257 + }, + { + "epoch": 0.4128, + "grad_norm": 0.43063135054561263, + "learning_rate": 0.00013257315398159864, + "loss": 0.7473, + "step": 258 + }, + { + "epoch": 0.4144, + "grad_norm": 0.4726821562111012, + "learning_rate": 0.00013208257698153677, + "loss": 0.7395, + "step": 259 + }, + { + "epoch": 0.416, + "grad_norm": 0.5654880939742353, + "learning_rate": 0.00013159113775218964, + "loss": 0.7463, + "step": 260 + }, + { + "epoch": 0.4176, + "grad_norm": 0.532729889223182, + "learning_rate": 0.00013109884950114007, + "loss": 0.8258, + "step": 261 + }, + { + "epoch": 0.4192, + "grad_norm": 0.4296463182758616, + "learning_rate": 0.00013060572545878875, + "loss": 0.773, + "step": 262 + }, + { + "epoch": 0.4208, + "grad_norm": 0.3947029204695878, + "learning_rate": 0.00013011177887799845, + "loss": 0.7224, + "step": 263 + }, + { + "epoch": 0.4224, + "grad_norm": 0.46957175369507925, + "learning_rate": 0.00012961702303373795, + "loss": 0.8126, + "step": 264 + }, + { + "epoch": 0.424, + "grad_norm": 0.5067396710596804, + "learning_rate": 0.00012912147122272523, + "loss": 0.8827, + "step": 265 + }, + { + "epoch": 0.4256, + "grad_norm": 0.4153569581713736, + "learning_rate": 0.00012862513676307008, + "loss": 0.7829, + "step": 266 + }, + { + "epoch": 0.4272, + "grad_norm": 0.5163079913910601, + "learning_rate": 0.00012812803299391628, + "loss": 0.7452, + "step": 267 + }, + { + "epoch": 0.4288, + "grad_norm": 0.47426234833369857, + "learning_rate": 0.00012763017327508305, + "loss": 0.7982, + "step": 268 + }, + { + "epoch": 0.4304, + "grad_norm": 0.5988256168061469, + "learning_rate": 0.0001271315709867059, + "loss": 0.9765, + "step": 269 + }, + { + "epoch": 0.432, + "grad_norm": 0.4728667413328611, + "learning_rate": 0.00012663223952887723, + "loss": 0.7693, + "step": 270 + }, + { + "epoch": 0.4336, + "grad_norm": 0.4831701154962461, + "learning_rate": 0.00012613219232128608, + "loss": 0.8936, + "step": 271 + }, + { + "epoch": 0.4352, + "grad_norm": 0.6182426306462532, + "learning_rate": 0.00012563144280285741, + "loss": 0.9779, + "step": 272 + }, + { + "epoch": 0.4368, + "grad_norm": 0.5004947438392414, + "learning_rate": 0.00012513000443139112, + "loss": 0.8361, + "step": 273 + }, + { + "epoch": 0.4384, + "grad_norm": 0.4575584899019911, + "learning_rate": 0.00012462789068320017, + "loss": 0.7898, + "step": 274 + }, + { + "epoch": 0.44, + "grad_norm": 0.5593064170621754, + "learning_rate": 0.00012412511505274844, + "loss": 0.78, + "step": 275 + }, + { + "epoch": 0.4416, + "grad_norm": 0.48521475508879675, + "learning_rate": 0.00012362169105228826, + "loss": 0.8706, + "step": 276 + }, + { + "epoch": 0.4432, + "grad_norm": 0.5383484682602684, + "learning_rate": 0.000123117632211497, + "loss": 0.7173, + "step": 277 + }, + { + "epoch": 0.4448, + "grad_norm": 0.46203346435210657, + "learning_rate": 0.00012261295207711346, + "loss": 0.7481, + "step": 278 + }, + { + "epoch": 0.4464, + "grad_norm": 0.6076992808818641, + "learning_rate": 0.0001221076642125742, + "loss": 0.8122, + "step": 279 + }, + { + "epoch": 0.448, + "grad_norm": 0.5864862009753786, + "learning_rate": 0.00012160178219764837, + "loss": 0.9199, + "step": 280 + }, + { + "epoch": 0.4496, + "grad_norm": 0.5008180806943826, + "learning_rate": 0.00012109531962807332, + "loss": 0.739, + "step": 281 + }, + { + "epoch": 0.4512, + "grad_norm": 0.46245114842878776, + "learning_rate": 0.00012058829011518896, + "loss": 0.7855, + "step": 282 + }, + { + "epoch": 0.4528, + "grad_norm": 0.5635026031048164, + "learning_rate": 0.00012008070728557186, + "loss": 0.8507, + "step": 283 + }, + { + "epoch": 0.4544, + "grad_norm": 0.633749384863932, + "learning_rate": 0.00011957258478066931, + "loss": 0.7524, + "step": 284 + }, + { + "epoch": 0.456, + "grad_norm": 0.48959796070667255, + "learning_rate": 0.00011906393625643244, + "loss": 0.8805, + "step": 285 + }, + { + "epoch": 0.4576, + "grad_norm": 0.599644675537256, + "learning_rate": 0.00011855477538294935, + "loss": 0.9511, + "step": 286 + }, + { + "epoch": 0.4592, + "grad_norm": 0.39557107991372126, + "learning_rate": 0.00011804511584407763, + "loss": 0.6824, + "step": 287 + }, + { + "epoch": 0.4608, + "grad_norm": 0.5114523104146805, + "learning_rate": 0.00011753497133707679, + "loss": 0.8748, + "step": 288 + }, + { + "epoch": 0.4624, + "grad_norm": 0.4603270033213667, + "learning_rate": 0.00011702435557223987, + "loss": 0.7276, + "step": 289 + }, + { + "epoch": 0.464, + "grad_norm": 0.4850670420577096, + "learning_rate": 0.00011651328227252517, + "loss": 0.7544, + "step": 290 + }, + { + "epoch": 0.4656, + "grad_norm": 0.4615560420158054, + "learning_rate": 0.00011600176517318741, + "loss": 0.8094, + "step": 291 + }, + { + "epoch": 0.4672, + "grad_norm": 0.5199828460695506, + "learning_rate": 0.00011548981802140848, + "loss": 0.8646, + "step": 292 + }, + { + "epoch": 0.4688, + "grad_norm": 0.4706257553077559, + "learning_rate": 0.00011497745457592816, + "loss": 0.8289, + "step": 293 + }, + { + "epoch": 0.4704, + "grad_norm": 0.41943877343889957, + "learning_rate": 0.00011446468860667421, + "loss": 0.7521, + "step": 294 + }, + { + "epoch": 0.472, + "grad_norm": 0.4716217258820339, + "learning_rate": 0.00011395153389439233, + "loss": 0.7627, + "step": 295 + }, + { + "epoch": 0.4736, + "grad_norm": 0.6786834378679842, + "learning_rate": 0.00011343800423027582, + "loss": 0.8907, + "step": 296 + }, + { + "epoch": 0.4752, + "grad_norm": 0.43217816337126086, + "learning_rate": 0.0001129241134155949, + "loss": 0.7732, + "step": 297 + }, + { + "epoch": 0.4768, + "grad_norm": 0.46949227376853764, + "learning_rate": 0.00011240987526132594, + "loss": 0.9017, + "step": 298 + }, + { + "epoch": 0.4784, + "grad_norm": 0.4661775390489814, + "learning_rate": 0.00011189530358778005, + "loss": 0.714, + "step": 299 + }, + { + "epoch": 0.48, + "grad_norm": 0.46140392359972254, + "learning_rate": 0.00011138041222423177, + "loss": 0.7431, + "step": 300 + }, + { + "epoch": 0.4816, + "grad_norm": 0.4550459454163621, + "learning_rate": 0.00011086521500854745, + "loss": 0.8215, + "step": 301 + }, + { + "epoch": 0.4832, + "grad_norm": 0.4415086849559902, + "learning_rate": 0.00011034972578681338, + "loss": 0.8797, + "step": 302 + }, + { + "epoch": 0.4848, + "grad_norm": 0.598195900084871, + "learning_rate": 0.00010983395841296348, + "loss": 0.8701, + "step": 303 + }, + { + "epoch": 0.4864, + "grad_norm": 0.4465807702003201, + "learning_rate": 0.00010931792674840718, + "loss": 0.7488, + "step": 304 + }, + { + "epoch": 0.488, + "grad_norm": 0.6155362524306691, + "learning_rate": 0.00010880164466165674, + "loss": 0.8282, + "step": 305 + }, + { + "epoch": 0.4896, + "grad_norm": 0.4180641928461069, + "learning_rate": 0.00010828512602795462, + "loss": 0.6957, + "step": 306 + }, + { + "epoch": 0.4912, + "grad_norm": 0.5513503369328582, + "learning_rate": 0.00010776838472890065, + "loss": 0.7914, + "step": 307 + }, + { + "epoch": 0.4928, + "grad_norm": 0.4842650496159258, + "learning_rate": 0.00010725143465207867, + "loss": 0.7634, + "step": 308 + }, + { + "epoch": 0.4944, + "grad_norm": 0.5263091081666915, + "learning_rate": 0.00010673428969068364, + "loss": 0.8129, + "step": 309 + }, + { + "epoch": 0.496, + "grad_norm": 0.5938506329556743, + "learning_rate": 0.00010621696374314807, + "loss": 0.9201, + "step": 310 + }, + { + "epoch": 0.4976, + "grad_norm": 0.551330984952594, + "learning_rate": 0.00010569947071276847, + "loss": 0.8799, + "step": 311 + }, + { + "epoch": 0.4992, + "grad_norm": 0.7601199557677949, + "learning_rate": 0.00010518182450733186, + "loss": 0.9105, + "step": 312 + }, + { + "epoch": 0.5008, + "grad_norm": 0.4303591705924048, + "learning_rate": 0.00010466403903874176, + "loss": 0.7641, + "step": 313 + }, + { + "epoch": 0.5024, + "grad_norm": 0.5397792424413276, + "learning_rate": 0.00010414612822264455, + "loss": 0.9404, + "step": 314 + }, + { + "epoch": 0.504, + "grad_norm": 0.4887982118564506, + "learning_rate": 0.00010362810597805526, + "loss": 0.8053, + "step": 315 + }, + { + "epoch": 0.5056, + "grad_norm": 0.4673734981798477, + "learning_rate": 0.0001031099862269837, + "loss": 0.7912, + "step": 316 + }, + { + "epoch": 0.5072, + "grad_norm": 0.4851329513640223, + "learning_rate": 0.00010259178289406011, + "loss": 0.7895, + "step": 317 + }, + { + "epoch": 0.5088, + "grad_norm": 0.4348341884313913, + "learning_rate": 0.00010207350990616107, + "loss": 0.7757, + "step": 318 + }, + { + "epoch": 0.5104, + "grad_norm": 0.5619573417540332, + "learning_rate": 0.0001015551811920351, + "loss": 0.8219, + "step": 319 + }, + { + "epoch": 0.512, + "grad_norm": 0.4448837026067695, + "learning_rate": 0.00010103681068192845, + "loss": 0.8214, + "step": 320 + }, + { + "epoch": 0.5136, + "grad_norm": 0.4412528222008381, + "learning_rate": 0.00010051841230721065, + "loss": 0.8126, + "step": 321 + }, + { + "epoch": 0.5152, + "grad_norm": 0.49795418352279724, + "learning_rate": 0.0001, + "loss": 0.8511, + "step": 322 + }, + { + "epoch": 0.5168, + "grad_norm": 0.6646338270787656, + "learning_rate": 9.948158769278939e-05, + "loss": 0.9252, + "step": 323 + }, + { + "epoch": 0.5184, + "grad_norm": 0.6047251048973974, + "learning_rate": 9.896318931807155e-05, + "loss": 0.8856, + "step": 324 + }, + { + "epoch": 0.52, + "grad_norm": 0.4940920844742925, + "learning_rate": 9.844481880796491e-05, + "loss": 0.8153, + "step": 325 + }, + { + "epoch": 0.5216, + "grad_norm": 0.6814053429906266, + "learning_rate": 9.792649009383899e-05, + "loss": 0.8841, + "step": 326 + }, + { + "epoch": 0.5232, + "grad_norm": 0.433215558732901, + "learning_rate": 9.740821710593989e-05, + "loss": 0.6603, + "step": 327 + }, + { + "epoch": 0.5248, + "grad_norm": 0.44957536906911494, + "learning_rate": 9.689001377301633e-05, + "loss": 0.716, + "step": 328 + }, + { + "epoch": 0.5264, + "grad_norm": 0.4914217112657972, + "learning_rate": 9.637189402194476e-05, + "loss": 0.8497, + "step": 329 + }, + { + "epoch": 0.528, + "grad_norm": 0.4750281054941176, + "learning_rate": 9.585387177735547e-05, + "loss": 0.7759, + "step": 330 + }, + { + "epoch": 0.5296, + "grad_norm": 0.4605094039390799, + "learning_rate": 9.533596096125825e-05, + "loss": 0.7512, + "step": 331 + }, + { + "epoch": 0.5312, + "grad_norm": 0.5301471784253153, + "learning_rate": 9.481817549266817e-05, + "loss": 0.7561, + "step": 332 + }, + { + "epoch": 0.5328, + "grad_norm": 0.5156702413525251, + "learning_rate": 9.430052928723153e-05, + "loss": 0.8423, + "step": 333 + }, + { + "epoch": 0.5344, + "grad_norm": 0.44662040635807865, + "learning_rate": 9.378303625685195e-05, + "loss": 0.7822, + "step": 334 + }, + { + "epoch": 0.536, + "grad_norm": 0.4208428464465991, + "learning_rate": 9.326571030931637e-05, + "loss": 0.7097, + "step": 335 + }, + { + "epoch": 0.5376, + "grad_norm": 0.4355712137687321, + "learning_rate": 9.274856534792138e-05, + "loss": 0.7986, + "step": 336 + }, + { + "epoch": 0.5392, + "grad_norm": 0.4928953933111074, + "learning_rate": 9.223161527109937e-05, + "loss": 0.7569, + "step": 337 + }, + { + "epoch": 0.5408, + "grad_norm": 0.5224733963670573, + "learning_rate": 9.171487397204539e-05, + "loss": 0.8273, + "step": 338 + }, + { + "epoch": 0.5424, + "grad_norm": 0.5771379087578413, + "learning_rate": 9.119835533834331e-05, + "loss": 0.8248, + "step": 339 + }, + { + "epoch": 0.544, + "grad_norm": 0.48450806448263845, + "learning_rate": 9.068207325159284e-05, + "loss": 0.8014, + "step": 340 + }, + { + "epoch": 0.5456, + "grad_norm": 0.481103005933363, + "learning_rate": 9.016604158703654e-05, + "loss": 0.8022, + "step": 341 + }, + { + "epoch": 0.5472, + "grad_norm": 0.5538219740275729, + "learning_rate": 8.965027421318665e-05, + "loss": 0.7581, + "step": 342 + }, + { + "epoch": 0.5488, + "grad_norm": 0.4418676812397226, + "learning_rate": 8.913478499145254e-05, + "loss": 0.7544, + "step": 343 + }, + { + "epoch": 0.5504, + "grad_norm": 0.5050698181061205, + "learning_rate": 8.861958777576827e-05, + "loss": 0.7913, + "step": 344 + }, + { + "epoch": 0.552, + "grad_norm": 0.4767268572887257, + "learning_rate": 8.810469641222001e-05, + "loss": 0.8206, + "step": 345 + }, + { + "epoch": 0.5536, + "grad_norm": 0.4146839368127928, + "learning_rate": 8.759012473867407e-05, + "loss": 0.8135, + "step": 346 + }, + { + "epoch": 0.5552, + "grad_norm": 0.49714856909001787, + "learning_rate": 8.707588658440511e-05, + "loss": 0.7661, + "step": 347 + }, + { + "epoch": 0.5568, + "grad_norm": 0.4645738096617371, + "learning_rate": 8.656199576972423e-05, + "loss": 0.7932, + "step": 348 + }, + { + "epoch": 0.5584, + "grad_norm": 0.5418003671606643, + "learning_rate": 8.604846610560771e-05, + "loss": 0.8437, + "step": 349 + }, + { + "epoch": 0.56, + "grad_norm": 0.49298359240777184, + "learning_rate": 8.553531139332582e-05, + "loss": 0.7813, + "step": 350 + }, + { + "epoch": 0.5616, + "grad_norm": 0.7609953729698271, + "learning_rate": 8.502254542407186e-05, + "loss": 0.8185, + "step": 351 + }, + { + "epoch": 0.5632, + "grad_norm": 0.44183019732999196, + "learning_rate": 8.451018197859153e-05, + "loss": 0.7714, + "step": 352 + }, + { + "epoch": 0.5648, + "grad_norm": 0.4088592388689686, + "learning_rate": 8.399823482681262e-05, + "loss": 0.7494, + "step": 353 + }, + { + "epoch": 0.5664, + "grad_norm": 0.40975797453002993, + "learning_rate": 8.348671772747487e-05, + "loss": 0.7723, + "step": 354 + }, + { + "epoch": 0.568, + "grad_norm": 0.500990734539649, + "learning_rate": 8.297564442776014e-05, + "loss": 0.7737, + "step": 355 + }, + { + "epoch": 0.5696, + "grad_norm": 0.8763033227557493, + "learning_rate": 8.246502866292324e-05, + "loss": 0.8078, + "step": 356 + }, + { + "epoch": 0.5712, + "grad_norm": 0.4098708175269476, + "learning_rate": 8.195488415592238e-05, + "loss": 0.7038, + "step": 357 + }, + { + "epoch": 0.5728, + "grad_norm": 0.6136809658421051, + "learning_rate": 8.144522461705067e-05, + "loss": 0.8555, + "step": 358 + }, + { + "epoch": 0.5744, + "grad_norm": 0.6039742316669718, + "learning_rate": 8.093606374356759e-05, + "loss": 0.8918, + "step": 359 + }, + { + "epoch": 0.576, + "grad_norm": 0.5946975858000318, + "learning_rate": 8.042741521933071e-05, + "loss": 0.7445, + "step": 360 + }, + { + "epoch": 0.5776, + "grad_norm": 0.5243833434113363, + "learning_rate": 7.991929271442817e-05, + "loss": 0.8365, + "step": 361 + }, + { + "epoch": 0.5792, + "grad_norm": 0.696009625451111, + "learning_rate": 7.941170988481108e-05, + "loss": 0.8724, + "step": 362 + }, + { + "epoch": 0.5808, + "grad_norm": 0.6203198423638049, + "learning_rate": 7.89046803719267e-05, + "loss": 0.8991, + "step": 363 + }, + { + "epoch": 0.5824, + "grad_norm": 0.431515175621456, + "learning_rate": 7.839821780235168e-05, + "loss": 0.6857, + "step": 364 + }, + { + "epoch": 0.584, + "grad_norm": 0.519278586203387, + "learning_rate": 7.789233578742582e-05, + "loss": 0.784, + "step": 365 + }, + { + "epoch": 0.5856, + "grad_norm": 0.49743930237280587, + "learning_rate": 7.738704792288655e-05, + "loss": 0.8482, + "step": 366 + }, + { + "epoch": 0.5872, + "grad_norm": 0.5849132524626789, + "learning_rate": 7.688236778850306e-05, + "loss": 0.8286, + "step": 367 + }, + { + "epoch": 0.5888, + "grad_norm": 0.46552588262598193, + "learning_rate": 7.637830894771175e-05, + "loss": 0.856, + "step": 368 + }, + { + "epoch": 0.5904, + "grad_norm": 0.4482646417650763, + "learning_rate": 7.587488494725157e-05, + "loss": 0.7512, + "step": 369 + }, + { + "epoch": 0.592, + "grad_norm": 0.42486744603141685, + "learning_rate": 7.537210931679987e-05, + "loss": 0.6975, + "step": 370 + }, + { + "epoch": 0.5936, + "grad_norm": 0.43976643777741153, + "learning_rate": 7.48699955686089e-05, + "loss": 0.7258, + "step": 371 + }, + { + "epoch": 0.5952, + "grad_norm": 0.6272719537569044, + "learning_rate": 7.43685571971426e-05, + "loss": 0.899, + "step": 372 + }, + { + "epoch": 0.5968, + "grad_norm": 0.3986327415141386, + "learning_rate": 7.386780767871397e-05, + "loss": 0.7104, + "step": 373 + }, + { + "epoch": 0.5984, + "grad_norm": 0.4955115549656417, + "learning_rate": 7.336776047112276e-05, + "loss": 0.8892, + "step": 374 + }, + { + "epoch": 0.6, + "grad_norm": 0.49512595703273493, + "learning_rate": 7.286842901329412e-05, + "loss": 0.7756, + "step": 375 + }, + { + "epoch": 0.6016, + "grad_norm": 0.5905507667064706, + "learning_rate": 7.236982672491698e-05, + "loss": 0.8908, + "step": 376 + }, + { + "epoch": 0.6032, + "grad_norm": 0.44001761796947125, + "learning_rate": 7.187196700608373e-05, + "loss": 0.7847, + "step": 377 + }, + { + "epoch": 0.6048, + "grad_norm": 0.484668167723542, + "learning_rate": 7.137486323692995e-05, + "loss": 0.8202, + "step": 378 + }, + { + "epoch": 0.6064, + "grad_norm": 0.5061493652802893, + "learning_rate": 7.087852877727481e-05, + "loss": 0.7659, + "step": 379 + }, + { + "epoch": 0.608, + "grad_norm": 0.459908723396548, + "learning_rate": 7.038297696626206e-05, + "loss": 0.7429, + "step": 380 + }, + { + "epoch": 0.6096, + "grad_norm": 0.5311269649573576, + "learning_rate": 6.988822112200156e-05, + "loss": 0.8575, + "step": 381 + }, + { + "epoch": 0.6112, + "grad_norm": 0.4272449890215387, + "learning_rate": 6.939427454121128e-05, + "loss": 0.7761, + "step": 382 + }, + { + "epoch": 0.6128, + "grad_norm": 0.47308670849243734, + "learning_rate": 6.890115049885994e-05, + "loss": 0.8051, + "step": 383 + }, + { + "epoch": 0.6144, + "grad_norm": 0.47459530007763184, + "learning_rate": 6.84088622478104e-05, + "loss": 0.6891, + "step": 384 + }, + { + "epoch": 0.616, + "grad_norm": 0.42918346769674953, + "learning_rate": 6.791742301846326e-05, + "loss": 0.7558, + "step": 385 + }, + { + "epoch": 0.6176, + "grad_norm": 0.4843615427233137, + "learning_rate": 6.742684601840141e-05, + "loss": 0.7177, + "step": 386 + }, + { + "epoch": 0.6192, + "grad_norm": 0.3931012009976337, + "learning_rate": 6.693714443203507e-05, + "loss": 0.6856, + "step": 387 + }, + { + "epoch": 0.6208, + "grad_norm": 0.6674786874769335, + "learning_rate": 6.644833142024751e-05, + "loss": 0.9333, + "step": 388 + }, + { + "epoch": 0.6224, + "grad_norm": 0.403434388808353, + "learning_rate": 6.59604201200412e-05, + "loss": 0.6809, + "step": 389 + }, + { + "epoch": 0.624, + "grad_norm": 0.42865534645955083, + "learning_rate": 6.547342364418481e-05, + "loss": 0.7807, + "step": 390 + }, + { + "epoch": 0.6256, + "grad_norm": 0.5531121427660198, + "learning_rate": 6.498735508086093e-05, + "loss": 0.8009, + "step": 391 + }, + { + "epoch": 0.6272, + "grad_norm": 0.40286712803632035, + "learning_rate": 6.450222749331414e-05, + "loss": 0.836, + "step": 392 + }, + { + "epoch": 0.6288, + "grad_norm": 0.4687974343547248, + "learning_rate": 6.40180539194999e-05, + "loss": 0.8253, + "step": 393 + }, + { + "epoch": 0.6304, + "grad_norm": 0.47197844007080497, + "learning_rate": 6.35348473717345e-05, + "loss": 0.7944, + "step": 394 + }, + { + "epoch": 0.632, + "grad_norm": 0.486408237962315, + "learning_rate": 6.305262083634488e-05, + "loss": 0.7509, + "step": 395 + }, + { + "epoch": 0.6336, + "grad_norm": 0.5869738610437195, + "learning_rate": 6.25713872733199e-05, + "loss": 0.7953, + "step": 396 + }, + { + "epoch": 0.6352, + "grad_norm": 0.43457878847863257, + "learning_rate": 6.209115961596208e-05, + "loss": 0.6897, + "step": 397 + }, + { + "epoch": 0.6368, + "grad_norm": 0.5764860701204353, + "learning_rate": 6.161195077053976e-05, + "loss": 0.9262, + "step": 398 + }, + { + "epoch": 0.6384, + "grad_norm": 0.4151703618751473, + "learning_rate": 6.113377361594049e-05, + "loss": 0.7574, + "step": 399 + }, + { + "epoch": 0.64, + "grad_norm": 0.46799090298160206, + "learning_rate": 6.065664100332478e-05, + "loss": 0.7309, + "step": 400 + }, + { + "epoch": 0.6416, + "grad_norm": 0.4767199781872603, + "learning_rate": 6.018056575578075e-05, + "loss": 0.7432, + "step": 401 + }, + { + "epoch": 0.6432, + "grad_norm": 0.5338210338772973, + "learning_rate": 5.970556066797941e-05, + "loss": 0.8245, + "step": 402 + }, + { + "epoch": 0.6448, + "grad_norm": 0.44315164336724483, + "learning_rate": 5.923163850583113e-05, + "loss": 0.7933, + "step": 403 + }, + { + "epoch": 0.6464, + "grad_norm": 0.5232242045280784, + "learning_rate": 5.875881200614207e-05, + "loss": 0.8696, + "step": 404 + }, + { + "epoch": 0.648, + "grad_norm": 0.3761336773443223, + "learning_rate": 5.828709387627218e-05, + "loss": 0.7843, + "step": 405 + }, + { + "epoch": 0.6496, + "grad_norm": 0.49015333408077183, + "learning_rate": 5.781649679379378e-05, + "loss": 0.8156, + "step": 406 + }, + { + "epoch": 0.6512, + "grad_norm": 0.4949869156738355, + "learning_rate": 5.73470334061505e-05, + "loss": 0.8448, + "step": 407 + }, + { + "epoch": 0.6528, + "grad_norm": 0.5513926873484194, + "learning_rate": 5.687871633031754e-05, + "loss": 0.944, + "step": 408 + }, + { + "epoch": 0.6544, + "grad_norm": 0.5076373807708922, + "learning_rate": 5.6411558152462894e-05, + "loss": 0.8312, + "step": 409 + }, + { + "epoch": 0.656, + "grad_norm": 0.5221652087488239, + "learning_rate": 5.5945571427608526e-05, + "loss": 0.8048, + "step": 410 + }, + { + "epoch": 0.6576, + "grad_norm": 0.7909223310975126, + "learning_rate": 5.54807686792933e-05, + "loss": 0.9155, + "step": 411 + }, + { + "epoch": 0.6592, + "grad_norm": 0.5050567694945293, + "learning_rate": 5.501716239923642e-05, + "loss": 0.7902, + "step": 412 + }, + { + "epoch": 0.6608, + "grad_norm": 0.4933778254758736, + "learning_rate": 5.4554765047001613e-05, + "loss": 0.8708, + "step": 413 + }, + { + "epoch": 0.6624, + "grad_norm": 0.6085020099507188, + "learning_rate": 5.4093589049662175e-05, + "loss": 0.9631, + "step": 414 + }, + { + "epoch": 0.664, + "grad_norm": 0.48881092432081136, + "learning_rate": 5.363364680146725e-05, + "loss": 0.7942, + "step": 415 + }, + { + "epoch": 0.6656, + "grad_norm": 0.5328506602391134, + "learning_rate": 5.31749506635086e-05, + "loss": 0.8172, + "step": 416 + }, + { + "epoch": 0.6672, + "grad_norm": 0.49480227356004286, + "learning_rate": 5.271751296338823e-05, + "loss": 0.7833, + "step": 417 + }, + { + "epoch": 0.6688, + "grad_norm": 0.48542832605280173, + "learning_rate": 5.226134599488728e-05, + "loss": 0.8107, + "step": 418 + }, + { + "epoch": 0.6704, + "grad_norm": 0.47554761567105164, + "learning_rate": 5.180646201763577e-05, + "loss": 0.712, + "step": 419 + }, + { + "epoch": 0.672, + "grad_norm": 0.5865668935576726, + "learning_rate": 5.135287325678271e-05, + "loss": 0.9088, + "step": 420 + }, + { + "epoch": 0.6736, + "grad_norm": 0.41550042607241994, + "learning_rate": 5.090059190266779e-05, + "loss": 0.7112, + "step": 421 + }, + { + "epoch": 0.6752, + "grad_norm": 0.40232320456189885, + "learning_rate": 5.0449630110493836e-05, + "loss": 0.7461, + "step": 422 + }, + { + "epoch": 0.6768, + "grad_norm": 0.43558339499848475, + "learning_rate": 5.000000000000002e-05, + "loss": 0.6831, + "step": 423 + }, + { + "epoch": 0.6784, + "grad_norm": 0.44795635239543685, + "learning_rate": 4.955171365513603e-05, + "loss": 0.7535, + "step": 424 + }, + { + "epoch": 0.68, + "grad_norm": 0.5688062900456592, + "learning_rate": 4.9104783123737566e-05, + "loss": 0.7766, + "step": 425 + }, + { + "epoch": 0.6816, + "grad_norm": 0.5411100645268582, + "learning_rate": 4.865922041720239e-05, + "loss": 0.8379, + "step": 426 + }, + { + "epoch": 0.6832, + "grad_norm": 0.785173955175749, + "learning_rate": 4.821503751016746e-05, + "loss": 0.8736, + "step": 427 + }, + { + "epoch": 0.6848, + "grad_norm": 0.5694372419439617, + "learning_rate": 4.777224634018732e-05, + "loss": 0.8607, + "step": 428 + }, + { + "epoch": 0.6864, + "grad_norm": 0.44414458225125525, + "learning_rate": 4.733085880741301e-05, + "loss": 0.7479, + "step": 429 + }, + { + "epoch": 0.688, + "grad_norm": 0.38581058519584893, + "learning_rate": 4.689088677427249e-05, + "loss": 0.6847, + "step": 430 + }, + { + "epoch": 0.6896, + "grad_norm": 0.5764557383958909, + "learning_rate": 4.645234206515171e-05, + "loss": 0.8552, + "step": 431 + }, + { + "epoch": 0.6912, + "grad_norm": 0.44431394707674726, + "learning_rate": 4.6015236466076747e-05, + "loss": 0.7253, + "step": 432 + }, + { + "epoch": 0.6928, + "grad_norm": 0.46415861733429314, + "learning_rate": 4.5579581724397255e-05, + "loss": 0.8071, + "step": 433 + }, + { + "epoch": 0.6944, + "grad_norm": 0.4040067644632014, + "learning_rate": 4.514538954847064e-05, + "loss": 0.651, + "step": 434 + }, + { + "epoch": 0.696, + "grad_norm": 0.47156609100041014, + "learning_rate": 4.471267160734731e-05, + "loss": 0.7547, + "step": 435 + }, + { + "epoch": 0.6976, + "grad_norm": 0.4751331987703744, + "learning_rate": 4.428143953045717e-05, + "loss": 0.7702, + "step": 436 + }, + { + "epoch": 0.6992, + "grad_norm": 0.5570235075279028, + "learning_rate": 4.385170490729712e-05, + "loss": 0.8715, + "step": 437 + }, + { + "epoch": 0.7008, + "grad_norm": 0.4242586984566103, + "learning_rate": 4.342347928711953e-05, + "loss": 0.7714, + "step": 438 + }, + { + "epoch": 0.7024, + "grad_norm": 0.4285280557058061, + "learning_rate": 4.2996774178621736e-05, + "loss": 0.7948, + "step": 439 + }, + { + "epoch": 0.704, + "grad_norm": 0.4644510958600794, + "learning_rate": 4.257160104963696e-05, + "loss": 0.7333, + "step": 440 + }, + { + "epoch": 0.7056, + "grad_norm": 0.4521746592306217, + "learning_rate": 4.2147971326825966e-05, + "loss": 0.7224, + "step": 441 + }, + { + "epoch": 0.7072, + "grad_norm": 0.5913011471811631, + "learning_rate": 4.172589639536991e-05, + "loss": 0.7531, + "step": 442 + }, + { + "epoch": 0.7088, + "grad_norm": 0.453332627125057, + "learning_rate": 4.130538759866457e-05, + "loss": 0.7774, + "step": 443 + }, + { + "epoch": 0.7104, + "grad_norm": 0.5313908009606292, + "learning_rate": 4.088645623801534e-05, + "loss": 0.7977, + "step": 444 + }, + { + "epoch": 0.712, + "grad_norm": 0.44955174895943734, + "learning_rate": 4.046911357233343e-05, + "loss": 0.7746, + "step": 445 + }, + { + "epoch": 0.7136, + "grad_norm": 0.6126618976150218, + "learning_rate": 4.00533708178334e-05, + "loss": 0.8736, + "step": 446 + }, + { + "epoch": 0.7152, + "grad_norm": 0.5822202712721202, + "learning_rate": 3.963923914773187e-05, + "loss": 0.9148, + "step": 447 + }, + { + "epoch": 0.7168, + "grad_norm": 0.4986228890404806, + "learning_rate": 3.922672969194686e-05, + "loss": 0.7716, + "step": 448 + }, + { + "epoch": 0.7184, + "grad_norm": 0.4007906896378964, + "learning_rate": 3.8815853536798904e-05, + "loss": 0.6834, + "step": 449 + }, + { + "epoch": 0.72, + "grad_norm": 0.5493506440801259, + "learning_rate": 3.840662172471315e-05, + "loss": 0.8659, + "step": 450 + }, + { + "epoch": 0.7216, + "grad_norm": 0.49276563271224727, + "learning_rate": 3.79990452539225e-05, + "loss": 0.7534, + "step": 451 + }, + { + "epoch": 0.7232, + "grad_norm": 0.48320789627580496, + "learning_rate": 3.759313507817196e-05, + "loss": 0.8172, + "step": 452 + }, + { + "epoch": 0.7248, + "grad_norm": 0.41936074078974206, + "learning_rate": 3.7188902106424416e-05, + "loss": 0.7933, + "step": 453 + }, + { + "epoch": 0.7264, + "grad_norm": 0.5372564975322116, + "learning_rate": 3.678635720256737e-05, + "loss": 0.8315, + "step": 454 + }, + { + "epoch": 0.728, + "grad_norm": 0.5801661809847876, + "learning_rate": 3.638551118512089e-05, + "loss": 0.7001, + "step": 455 + }, + { + "epoch": 0.7296, + "grad_norm": 0.4837845296948569, + "learning_rate": 3.5986374826947066e-05, + "loss": 0.8001, + "step": 456 + }, + { + "epoch": 0.7312, + "grad_norm": 0.7266773227848353, + "learning_rate": 3.558895885496023e-05, + "loss": 0.94, + "step": 457 + }, + { + "epoch": 0.7328, + "grad_norm": 0.550336617198042, + "learning_rate": 3.519327394983888e-05, + "loss": 0.8747, + "step": 458 + }, + { + "epoch": 0.7344, + "grad_norm": 0.5502687804150279, + "learning_rate": 3.479933074573858e-05, + "loss": 0.8737, + "step": 459 + }, + { + "epoch": 0.736, + "grad_norm": 0.3770404511005428, + "learning_rate": 3.440713983000601e-05, + "loss": 0.658, + "step": 460 + }, + { + "epoch": 0.7376, + "grad_norm": 0.5400732162785093, + "learning_rate": 3.401671174289469e-05, + "loss": 0.8436, + "step": 461 + }, + { + "epoch": 0.7392, + "grad_norm": 0.5625985678957215, + "learning_rate": 3.362805697728145e-05, + "loss": 0.8116, + "step": 462 + }, + { + "epoch": 0.7408, + "grad_norm": 0.4990853595000655, + "learning_rate": 3.324118597838464e-05, + "loss": 0.8323, + "step": 463 + }, + { + "epoch": 0.7424, + "grad_norm": 0.5413158096901148, + "learning_rate": 3.285610914348332e-05, + "loss": 0.8623, + "step": 464 + }, + { + "epoch": 0.744, + "grad_norm": 0.44364974048301387, + "learning_rate": 3.2472836821637744e-05, + "loss": 0.7717, + "step": 465 + }, + { + "epoch": 0.7456, + "grad_norm": 0.4818165245901257, + "learning_rate": 3.209137931341143e-05, + "loss": 0.8421, + "step": 466 + }, + { + "epoch": 0.7472, + "grad_norm": 0.48078247572273736, + "learning_rate": 3.1711746870594086e-05, + "loss": 0.7534, + "step": 467 + }, + { + "epoch": 0.7488, + "grad_norm": 0.7066838487989124, + "learning_rate": 3.1333949695926324e-05, + "loss": 0.8655, + "step": 468 + }, + { + "epoch": 0.7504, + "grad_norm": 0.5251245254632664, + "learning_rate": 3.0957997942825336e-05, + "loss": 0.8004, + "step": 469 + }, + { + "epoch": 0.752, + "grad_norm": 0.5895314067131298, + "learning_rate": 3.058390171511196e-05, + "loss": 0.7863, + "step": 470 + }, + { + "epoch": 0.7536, + "grad_norm": 0.42127664447603985, + "learning_rate": 3.021167106673928e-05, + "loss": 0.6784, + "step": 471 + }, + { + "epoch": 0.7552, + "grad_norm": 0.4691636152593432, + "learning_rate": 2.9841316001522347e-05, + "loss": 0.8385, + "step": 472 + }, + { + "epoch": 0.7568, + "grad_norm": 0.4501354628353762, + "learning_rate": 2.9472846472869298e-05, + "loss": 0.7619, + "step": 473 + }, + { + "epoch": 0.7584, + "grad_norm": 0.415124794643123, + "learning_rate": 2.9106272383513835e-05, + "loss": 0.7181, + "step": 474 + }, + { + "epoch": 0.76, + "grad_norm": 0.5627594769488438, + "learning_rate": 2.874160358524931e-05, + "loss": 0.9042, + "step": 475 + }, + { + "epoch": 0.7616, + "grad_norm": 0.44490947481959, + "learning_rate": 2.8378849878663628e-05, + "loss": 0.6357, + "step": 476 + }, + { + "epoch": 0.7632, + "grad_norm": 0.5817279105746219, + "learning_rate": 2.8018021012875994e-05, + "loss": 0.8829, + "step": 477 + }, + { + "epoch": 0.7648, + "grad_norm": 0.6587378305549441, + "learning_rate": 2.7659126685275027e-05, + "loss": 0.8005, + "step": 478 + }, + { + "epoch": 0.7664, + "grad_norm": 0.42424532031828477, + "learning_rate": 2.7302176541257986e-05, + "loss": 0.7305, + "step": 479 + }, + { + "epoch": 0.768, + "grad_norm": 0.40489764871202116, + "learning_rate": 2.6947180173971508e-05, + "loss": 0.7282, + "step": 480 + }, + { + "epoch": 0.7696, + "grad_norm": 0.5056254245102135, + "learning_rate": 2.659414712405398e-05, + "loss": 0.6816, + "step": 481 + }, + { + "epoch": 0.7712, + "grad_norm": 0.44314862665401866, + "learning_rate": 2.6243086879379e-05, + "loss": 0.7087, + "step": 482 + }, + { + "epoch": 0.7728, + "grad_norm": 0.4988712835436097, + "learning_rate": 2.5894008874800325e-05, + "loss": 0.7699, + "step": 483 + }, + { + "epoch": 0.7744, + "grad_norm": 0.5194393424400168, + "learning_rate": 2.5546922491898495e-05, + "loss": 0.7883, + "step": 484 + }, + { + "epoch": 0.776, + "grad_norm": 0.5655293463573838, + "learning_rate": 2.5201837058728505e-05, + "loss": 0.8512, + "step": 485 + }, + { + "epoch": 0.7776, + "grad_norm": 0.4573170674666667, + "learning_rate": 2.485876184956928e-05, + "loss": 0.7258, + "step": 486 + }, + { + "epoch": 0.7792, + "grad_norm": 0.4717436563473585, + "learning_rate": 2.451770608467432e-05, + "loss": 0.7664, + "step": 487 + }, + { + "epoch": 0.7808, + "grad_norm": 0.7600668417367207, + "learning_rate": 2.417867893002387e-05, + "loss": 0.9817, + "step": 488 + }, + { + "epoch": 0.7824, + "grad_norm": 0.454285204781225, + "learning_rate": 2.3841689497078746e-05, + "loss": 0.6792, + "step": 489 + }, + { + "epoch": 0.784, + "grad_norm": 0.5134769941764181, + "learning_rate": 2.3506746842535242e-05, + "loss": 0.7161, + "step": 490 + }, + { + "epoch": 0.7856, + "grad_norm": 0.41445927327481324, + "learning_rate": 2.3173859968081944e-05, + "loss": 0.7334, + "step": 491 + }, + { + "epoch": 0.7872, + "grad_norm": 0.4469742005032891, + "learning_rate": 2.2843037820157675e-05, + "loss": 0.7311, + "step": 492 + }, + { + "epoch": 0.7888, + "grad_norm": 0.4412576821600429, + "learning_rate": 2.251428928971102e-05, + "loss": 0.7303, + "step": 493 + }, + { + "epoch": 0.7904, + "grad_norm": 0.46287664763589803, + "learning_rate": 2.2187623211961562e-05, + "loss": 0.816, + "step": 494 + }, + { + "epoch": 0.792, + "grad_norm": 0.45669162759999254, + "learning_rate": 2.1863048366162208e-05, + "loss": 0.7279, + "step": 495 + }, + { + "epoch": 0.7936, + "grad_norm": 0.36611990224798674, + "learning_rate": 2.1540573475363402e-05, + "loss": 0.7352, + "step": 496 + }, + { + "epoch": 0.7952, + "grad_norm": 0.5656992758486998, + "learning_rate": 2.1220207206178688e-05, + "loss": 0.7765, + "step": 497 + }, + { + "epoch": 0.7968, + "grad_norm": 0.6842431356491339, + "learning_rate": 2.0901958168551638e-05, + "loss": 0.8718, + "step": 498 + }, + { + "epoch": 0.7984, + "grad_norm": 0.4800259822183707, + "learning_rate": 2.058583491552465e-05, + "loss": 0.8331, + "step": 499 + }, + { + "epoch": 0.8, + "grad_norm": 0.4744162443639759, + "learning_rate": 2.027184594300898e-05, + "loss": 0.7533, + "step": 500 + }, + { + "epoch": 0.8016, + "grad_norm": 0.529271868197914, + "learning_rate": 1.995999968955641e-05, + "loss": 0.9258, + "step": 501 + }, + { + "epoch": 0.8032, + "grad_norm": 0.5646985855066731, + "learning_rate": 1.9650304536132426e-05, + "loss": 0.8823, + "step": 502 + }, + { + "epoch": 0.8048, + "grad_norm": 0.4240601302452028, + "learning_rate": 1.9342768805891178e-05, + "loss": 0.7133, + "step": 503 + }, + { + "epoch": 0.8064, + "grad_norm": 0.4573956395152056, + "learning_rate": 1.903740076395151e-05, + "loss": 0.7949, + "step": 504 + }, + { + "epoch": 0.808, + "grad_norm": 0.4769725950279453, + "learning_rate": 1.8734208617174988e-05, + "loss": 0.7622, + "step": 505 + }, + { + "epoch": 0.8096, + "grad_norm": 0.5510634691154779, + "learning_rate": 1.8433200513945337e-05, + "loss": 0.8081, + "step": 506 + }, + { + "epoch": 0.8112, + "grad_norm": 0.5682105476060857, + "learning_rate": 1.8134384543949478e-05, + "loss": 0.917, + "step": 507 + }, + { + "epoch": 0.8128, + "grad_norm": 0.5277516399630586, + "learning_rate": 1.783776873795994e-05, + "loss": 0.8857, + "step": 508 + }, + { + "epoch": 0.8144, + "grad_norm": 0.7500931605600588, + "learning_rate": 1.754336106761927e-05, + "loss": 0.9307, + "step": 509 + }, + { + "epoch": 0.816, + "grad_norm": 0.5040665929325822, + "learning_rate": 1.7251169445225657e-05, + "loss": 0.841, + "step": 510 + }, + { + "epoch": 0.8176, + "grad_norm": 0.39455094792522016, + "learning_rate": 1.696120172352025e-05, + "loss": 0.6728, + "step": 511 + }, + { + "epoch": 0.8192, + "grad_norm": 0.5572000427210648, + "learning_rate": 1.6673465695476232e-05, + "loss": 0.7578, + "step": 512 + }, + { + "epoch": 0.8208, + "grad_norm": 0.5187875614390844, + "learning_rate": 1.6387969094089316e-05, + "loss": 0.774, + "step": 513 + }, + { + "epoch": 0.8224, + "grad_norm": 0.4405333764702291, + "learning_rate": 1.6104719592169902e-05, + "loss": 0.7851, + "step": 514 + }, + { + "epoch": 0.824, + "grad_norm": 0.6529238235867248, + "learning_rate": 1.5823724802136865e-05, + "loss": 0.7798, + "step": 515 + }, + { + "epoch": 0.8256, + "grad_norm": 0.4882094578728378, + "learning_rate": 1.5544992275813053e-05, + "loss": 0.7813, + "step": 516 + }, + { + "epoch": 0.8272, + "grad_norm": 0.47297380654401905, + "learning_rate": 1.526852950422226e-05, + "loss": 0.783, + "step": 517 + }, + { + "epoch": 0.8288, + "grad_norm": 0.41791290137820514, + "learning_rate": 1.4994343917387854e-05, + "loss": 0.7383, + "step": 518 + }, + { + "epoch": 0.8304, + "grad_norm": 0.614821635899501, + "learning_rate": 1.4722442884133214e-05, + "loss": 0.8027, + "step": 519 + }, + { + "epoch": 0.832, + "grad_norm": 0.5629745897752465, + "learning_rate": 1.4452833711883628e-05, + "loss": 0.773, + "step": 520 + }, + { + "epoch": 0.8336, + "grad_norm": 0.6034340485024299, + "learning_rate": 1.4185523646469822e-05, + "loss": 0.9058, + "step": 521 + }, + { + "epoch": 0.8352, + "grad_norm": 0.45977861182601576, + "learning_rate": 1.3920519871933424e-05, + "loss": 0.8149, + "step": 522 + }, + { + "epoch": 0.8368, + "grad_norm": 0.50221813379731, + "learning_rate": 1.3657829510333654e-05, + "loss": 0.736, + "step": 523 + }, + { + "epoch": 0.8384, + "grad_norm": 0.5073291923422781, + "learning_rate": 1.339745962155613e-05, + "loss": 0.7513, + "step": 524 + }, + { + "epoch": 0.84, + "grad_norm": 0.5072707035921479, + "learning_rate": 1.3139417203123027e-05, + "loss": 0.7421, + "step": 525 + }, + { + "epoch": 0.8416, + "grad_norm": 0.46582944938080906, + "learning_rate": 1.2883709190004955e-05, + "loss": 0.717, + "step": 526 + }, + { + "epoch": 0.8432, + "grad_norm": 0.3939880605472184, + "learning_rate": 1.263034245443473e-05, + "loss": 0.7486, + "step": 527 + }, + { + "epoch": 0.8448, + "grad_norm": 0.41842556533290937, + "learning_rate": 1.2379323805722576e-05, + "loss": 0.7163, + "step": 528 + }, + { + "epoch": 0.8464, + "grad_norm": 0.45976787759587967, + "learning_rate": 1.2130659990073146e-05, + "loss": 0.666, + "step": 529 + }, + { + "epoch": 0.848, + "grad_norm": 0.5741122959132184, + "learning_rate": 1.1884357690404158e-05, + "loss": 0.7962, + "step": 530 + }, + { + "epoch": 0.8496, + "grad_norm": 0.5574064273608463, + "learning_rate": 1.1640423526166988e-05, + "loss": 0.785, + "step": 531 + }, + { + "epoch": 0.8512, + "grad_norm": 0.5863087560710555, + "learning_rate": 1.1398864053168534e-05, + "loss": 0.819, + "step": 532 + }, + { + "epoch": 0.8528, + "grad_norm": 0.5600281242293509, + "learning_rate": 1.1159685763395111e-05, + "loss": 0.8736, + "step": 533 + }, + { + "epoch": 0.8544, + "grad_norm": 0.461928830964332, + "learning_rate": 1.0922895084838037e-05, + "loss": 0.6682, + "step": 534 + }, + { + "epoch": 0.856, + "grad_norm": 0.4762813675960855, + "learning_rate": 1.0688498381320855e-05, + "loss": 0.823, + "step": 535 + }, + { + "epoch": 0.8576, + "grad_norm": 0.580988891335986, + "learning_rate": 1.045650195232819e-05, + "loss": 0.7691, + "step": 536 + }, + { + "epoch": 0.8592, + "grad_norm": 0.4721345192605899, + "learning_rate": 1.0226912032836611e-05, + "loss": 0.7923, + "step": 537 + }, + { + "epoch": 0.8608, + "grad_norm": 0.4660525584882778, + "learning_rate": 9.999734793146998e-06, + "loss": 0.7715, + "step": 538 + }, + { + "epoch": 0.8624, + "grad_norm": 0.532626625364438, + "learning_rate": 9.774976338718677e-06, + "loss": 0.6877, + "step": 539 + }, + { + "epoch": 0.864, + "grad_norm": 0.5857105608963418, + "learning_rate": 9.552642710005299e-06, + "loss": 0.8983, + "step": 540 + }, + { + "epoch": 0.8656, + "grad_norm": 0.46271941466564903, + "learning_rate": 9.332739882292752e-06, + "loss": 0.714, + "step": 541 + }, + { + "epoch": 0.8672, + "grad_norm": 0.4497277199017365, + "learning_rate": 9.115273765538202e-06, + "loss": 0.7407, + "step": 542 + }, + { + "epoch": 0.8688, + "grad_norm": 0.5850063104492209, + "learning_rate": 8.900250204211514e-06, + "loss": 0.7714, + "step": 543 + }, + { + "epoch": 0.8704, + "grad_norm": 0.5618659006849629, + "learning_rate": 8.687674977138116e-06, + "loss": 0.7518, + "step": 544 + }, + { + "epoch": 0.872, + "grad_norm": 0.511506487763565, + "learning_rate": 8.47755379734373e-06, + "loss": 0.7431, + "step": 545 + }, + { + "epoch": 0.8736, + "grad_norm": 0.450060820779741, + "learning_rate": 8.269892311900696e-06, + "loss": 0.6743, + "step": 546 + }, + { + "epoch": 0.8752, + "grad_norm": 0.40282916698031773, + "learning_rate": 8.064696101776358e-06, + "loss": 0.6709, + "step": 547 + }, + { + "epoch": 0.8768, + "grad_norm": 0.41181490509250535, + "learning_rate": 7.861970681683051e-06, + "loss": 0.6955, + "step": 548 + }, + { + "epoch": 0.8784, + "grad_norm": 0.5045281883413828, + "learning_rate": 7.661721499929753e-06, + "loss": 0.7732, + "step": 549 + }, + { + "epoch": 0.88, + "grad_norm": 0.5191836427394763, + "learning_rate": 7.463953938275858e-06, + "loss": 0.8452, + "step": 550 + }, + { + "epoch": 0.8816, + "grad_norm": 0.428564300937728, + "learning_rate": 7.2686733117863784e-06, + "loss": 0.717, + "step": 551 + }, + { + "epoch": 0.8832, + "grad_norm": 0.5153289123449012, + "learning_rate": 7.07588486868922e-06, + "loss": 0.6808, + "step": 552 + }, + { + "epoch": 0.8848, + "grad_norm": 0.4616447424916075, + "learning_rate": 6.8855937902340576e-06, + "loss": 0.7531, + "step": 553 + }, + { + "epoch": 0.8864, + "grad_norm": 0.39681321635925004, + "learning_rate": 6.6978051905530855e-06, + "loss": 0.7446, + "step": 554 + }, + { + "epoch": 0.888, + "grad_norm": 0.7129929489595015, + "learning_rate": 6.512524116523633e-06, + "loss": 0.8922, + "step": 555 + }, + { + "epoch": 0.8896, + "grad_norm": 0.44793949504325403, + "learning_rate": 6.329755547632499e-06, + "loss": 0.7386, + "step": 556 + }, + { + "epoch": 0.8912, + "grad_norm": 0.5872252829296346, + "learning_rate": 6.149504395842087e-06, + "loss": 0.8107, + "step": 557 + }, + { + "epoch": 0.8928, + "grad_norm": 0.41757767107027727, + "learning_rate": 5.971775505458444e-06, + "loss": 0.7552, + "step": 558 + }, + { + "epoch": 0.8944, + "grad_norm": 0.5278586013338012, + "learning_rate": 5.7965736530010916e-06, + "loss": 0.8128, + "step": 559 + }, + { + "epoch": 0.896, + "grad_norm": 0.4764369269232915, + "learning_rate": 5.623903547074549e-06, + "loss": 0.8243, + "step": 560 + }, + { + "epoch": 0.8976, + "grad_norm": 0.445195581809496, + "learning_rate": 5.453769828241872e-06, + "loss": 0.6867, + "step": 561 + }, + { + "epoch": 0.8992, + "grad_norm": 0.49976681029923836, + "learning_rate": 5.286177068899989e-06, + "loss": 0.7059, + "step": 562 + }, + { + "epoch": 0.9008, + "grad_norm": 0.4124258078264793, + "learning_rate": 5.121129773156663e-06, + "loss": 0.7264, + "step": 563 + }, + { + "epoch": 0.9024, + "grad_norm": 0.4499532492200171, + "learning_rate": 4.95863237670956e-06, + "loss": 0.79, + "step": 564 + }, + { + "epoch": 0.904, + "grad_norm": 0.47209635231334507, + "learning_rate": 4.798689246727006e-06, + "loss": 0.6334, + "step": 565 + }, + { + "epoch": 0.9056, + "grad_norm": 0.4856682554711025, + "learning_rate": 4.641304681730641e-06, + "loss": 0.7553, + "step": 566 + }, + { + "epoch": 0.9072, + "grad_norm": 0.5662153645220434, + "learning_rate": 4.486482911479839e-06, + "loss": 0.8279, + "step": 567 + }, + { + "epoch": 0.9088, + "grad_norm": 0.4090240938346095, + "learning_rate": 4.3342280968580285e-06, + "loss": 0.7344, + "step": 568 + }, + { + "epoch": 0.9104, + "grad_norm": 0.537789948994782, + "learning_rate": 4.184544329761009e-06, + "loss": 0.828, + "step": 569 + }, + { + "epoch": 0.912, + "grad_norm": 0.4332640950791847, + "learning_rate": 4.037435632986786e-06, + "loss": 0.8001, + "step": 570 + }, + { + "epoch": 0.9136, + "grad_norm": 0.43843217972504145, + "learning_rate": 3.892905960127546e-06, + "loss": 0.7406, + "step": 571 + }, + { + "epoch": 0.9152, + "grad_norm": 0.445514088395683, + "learning_rate": 3.750959195463466e-06, + "loss": 0.7595, + "step": 572 + }, + { + "epoch": 0.9168, + "grad_norm": 0.4275969003141511, + "learning_rate": 3.611599153858214e-06, + "loss": 0.7319, + "step": 573 + }, + { + "epoch": 0.9184, + "grad_norm": 0.6181916419087072, + "learning_rate": 3.4748295806564356e-06, + "loss": 0.8866, + "step": 574 + }, + { + "epoch": 0.92, + "grad_norm": 0.4964113303224607, + "learning_rate": 3.3406541515832003e-06, + "loss": 0.7641, + "step": 575 + }, + { + "epoch": 0.9216, + "grad_norm": 0.5931750787393189, + "learning_rate": 3.209076472645112e-06, + "loss": 0.75, + "step": 576 + }, + { + "epoch": 0.9232, + "grad_norm": 0.4920843039984812, + "learning_rate": 3.0801000800333877e-06, + "loss": 0.7729, + "step": 577 + }, + { + "epoch": 0.9248, + "grad_norm": 0.37980050845658425, + "learning_rate": 2.9537284400289355e-06, + "loss": 0.6829, + "step": 578 + }, + { + "epoch": 0.9264, + "grad_norm": 0.3963894276679346, + "learning_rate": 2.8299649489090475e-06, + "loss": 0.6427, + "step": 579 + }, + { + "epoch": 0.928, + "grad_norm": 0.4154526430122979, + "learning_rate": 2.708812932856253e-06, + "loss": 0.6851, + "step": 580 + }, + { + "epoch": 0.9296, + "grad_norm": 0.5199993448362794, + "learning_rate": 2.590275647868867e-06, + "loss": 0.7961, + "step": 581 + }, + { + "epoch": 0.9312, + "grad_norm": 0.5750088863420132, + "learning_rate": 2.4743562796734622e-06, + "loss": 0.8081, + "step": 582 + }, + { + "epoch": 0.9328, + "grad_norm": 0.5140225691311261, + "learning_rate": 2.3610579436393e-06, + "loss": 0.8013, + "step": 583 + }, + { + "epoch": 0.9344, + "grad_norm": 0.49898354854720056, + "learning_rate": 2.250383684694579e-06, + "loss": 0.7381, + "step": 584 + }, + { + "epoch": 0.936, + "grad_norm": 0.5249545174252546, + "learning_rate": 2.1423364772445887e-06, + "loss": 0.729, + "step": 585 + }, + { + "epoch": 0.9376, + "grad_norm": 0.4401588184940419, + "learning_rate": 2.036919225091827e-06, + "loss": 0.7629, + "step": 586 + }, + { + "epoch": 0.9392, + "grad_norm": 0.4189629890238096, + "learning_rate": 1.9341347613579087e-06, + "loss": 0.7666, + "step": 587 + }, + { + "epoch": 0.9408, + "grad_norm": 0.5006645773204051, + "learning_rate": 1.8339858484073935e-06, + "loss": 0.6968, + "step": 588 + }, + { + "epoch": 0.9424, + "grad_norm": 0.5632707912819288, + "learning_rate": 1.7364751777736332e-06, + "loss": 0.8454, + "step": 589 + }, + { + "epoch": 0.944, + "grad_norm": 0.6024324926308866, + "learning_rate": 1.6416053700863964e-06, + "loss": 0.8279, + "step": 590 + }, + { + "epoch": 0.9456, + "grad_norm": 0.582047873817297, + "learning_rate": 1.5493789750014031e-06, + "loss": 0.7981, + "step": 591 + }, + { + "epoch": 0.9472, + "grad_norm": 0.4564155317343173, + "learning_rate": 1.459798471131868e-06, + "loss": 0.7074, + "step": 592 + }, + { + "epoch": 0.9488, + "grad_norm": 0.6058058693949488, + "learning_rate": 1.3728662659818204e-06, + "loss": 0.8158, + "step": 593 + }, + { + "epoch": 0.9504, + "grad_norm": 0.4181397209604915, + "learning_rate": 1.2885846958814673e-06, + "loss": 0.7312, + "step": 594 + }, + { + "epoch": 0.952, + "grad_norm": 0.4614300144965895, + "learning_rate": 1.2069560259243328e-06, + "loss": 0.7338, + "step": 595 + }, + { + "epoch": 0.9536, + "grad_norm": 0.4846363832282807, + "learning_rate": 1.1279824499064396e-06, + "loss": 0.8036, + "step": 596 + }, + { + "epoch": 0.9552, + "grad_norm": 0.4609145529752502, + "learning_rate": 1.0516660902673448e-06, + "loss": 0.6841, + "step": 597 + }, + { + "epoch": 0.9568, + "grad_norm": 0.5076128767123109, + "learning_rate": 9.780089980330642e-07, + "loss": 0.8411, + "step": 598 + }, + { + "epoch": 0.9584, + "grad_norm": 0.485767970970762, + "learning_rate": 9.070131527609604e-07, + "loss": 0.761, + "step": 599 + }, + { + "epoch": 0.96, + "grad_norm": 0.5029976589432248, + "learning_rate": 8.386804624865851e-07, + "loss": 0.7573, + "step": 600 + }, + { + "epoch": 0.9616, + "grad_norm": 0.46078550545735925, + "learning_rate": 7.730127636723539e-07, + "loss": 0.8573, + "step": 601 + }, + { + "epoch": 0.9632, + "grad_norm": 0.5125683254773828, + "learning_rate": 7.100118211581852e-07, + "loss": 0.8219, + "step": 602 + }, + { + "epoch": 0.9648, + "grad_norm": 0.7099555205627927, + "learning_rate": 6.496793281141056e-07, + "loss": 0.8329, + "step": 603 + }, + { + "epoch": 0.9664, + "grad_norm": 0.5257512987120833, + "learning_rate": 5.920169059947411e-07, + "loss": 0.7691, + "step": 604 + }, + { + "epoch": 0.968, + "grad_norm": 0.44955284490495473, + "learning_rate": 5.370261044956971e-07, + "loss": 0.6724, + "step": 605 + }, + { + "epoch": 0.9696, + "grad_norm": 0.43068643981099336, + "learning_rate": 4.847084015119574e-07, + "loss": 0.7384, + "step": 606 + }, + { + "epoch": 0.9712, + "grad_norm": 0.7404433663435687, + "learning_rate": 4.3506520309813947e-07, + "loss": 0.8677, + "step": 607 + }, + { + "epoch": 0.9728, + "grad_norm": 0.5401203008083829, + "learning_rate": 3.8809784343072366e-07, + "loss": 0.727, + "step": 608 + }, + { + "epoch": 0.9744, + "grad_norm": 0.5506770140027683, + "learning_rate": 3.4380758477219333e-07, + "loss": 0.8123, + "step": 609 + }, + { + "epoch": 0.976, + "grad_norm": 0.5093788680691201, + "learning_rate": 3.0219561743707326e-07, + "loss": 0.83, + "step": 610 + }, + { + "epoch": 0.9776, + "grad_norm": 0.6168736957077788, + "learning_rate": 2.6326305976001055e-07, + "loss": 0.9002, + "step": 611 + }, + { + "epoch": 0.9792, + "grad_norm": 0.4196641464596395, + "learning_rate": 2.2701095806565432e-07, + "loss": 0.77, + "step": 612 + }, + { + "epoch": 0.9808, + "grad_norm": 0.4784595580207899, + "learning_rate": 1.9344028664056713e-07, + "loss": 0.7212, + "step": 613 + }, + { + "epoch": 0.9824, + "grad_norm": 0.48297015578133545, + "learning_rate": 1.6255194770704586e-07, + "loss": 0.754, + "step": 614 + }, + { + "epoch": 0.984, + "grad_norm": 0.3956503814631328, + "learning_rate": 1.3434677139885222e-07, + "loss": 0.7458, + "step": 615 + }, + { + "epoch": 0.9856, + "grad_norm": 0.5010590810156292, + "learning_rate": 1.0882551573891953e-07, + "loss": 0.8102, + "step": 616 + }, + { + "epoch": 0.9872, + "grad_norm": 0.4906446661999585, + "learning_rate": 8.598886661895788e-08, + "loss": 0.7993, + "step": 617 + }, + { + "epoch": 0.9888, + "grad_norm": 0.517825027389168, + "learning_rate": 6.583743778106887e-08, + "loss": 0.8563, + "step": 618 + }, + { + "epoch": 0.9904, + "grad_norm": 0.4786367198289518, + "learning_rate": 4.837177080119215e-08, + "loss": 0.7457, + "step": 619 + }, + { + "epoch": 0.992, + "grad_norm": 0.48317436212573805, + "learning_rate": 3.359233507459481e-08, + "loss": 0.7838, + "step": 620 + }, + { + "epoch": 0.9936, + "grad_norm": 0.41016938444393536, + "learning_rate": 2.1499527803214846e-08, + "loss": 0.6589, + "step": 621 + }, + { + "epoch": 0.9952, + "grad_norm": 0.5792843604463185, + "learning_rate": 1.209367398504746e-08, + "loss": 0.8008, + "step": 622 + }, + { + "epoch": 0.9968, + "grad_norm": 0.4562185264492372, + "learning_rate": 5.375026405352035e-09, + "loss": 0.7481, + "step": 623 + }, + { + "epoch": 0.9984, + "grad_norm": 0.5315806159191756, + "learning_rate": 1.3437656298687097e-09, + "loss": 0.8277, + "step": 624 + }, + { + "epoch": 1.0, + "grad_norm": 0.4996324184516367, + "learning_rate": 0.0, + "loss": 0.7711, + "step": 625 + }, + { + "epoch": 1.0, + "step": 625, + "total_flos": 507159541022720.0, + "train_loss": 0.8328443063735962, + "train_runtime": 9270.2391, + "train_samples_per_second": 1.079, + "train_steps_per_second": 0.067 + } + ], + "logging_steps": 1.0, + "max_steps": 625, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 507159541022720.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_2_epochs_1_GA_4_lora/README.md b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_2_epochs_1_GA_4_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_2_epochs_1_GA_4_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_2_epochs_1_GA_4_lora/adapter_config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_2_epochs_1_GA_4_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..04695a99b4d17a66e0122de85a215f376b3dc301 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_2_epochs_1_GA_4_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "up_proj", + "k_proj", + "o_proj", + "v_proj", + "down_proj", + "gate_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_2_epochs_1_GA_4_lora/adapter_model.safetensors b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_2_epochs_1_GA_4_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7ef969b1572f1f08788458dca64baed71cf5ca1b --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_2_epochs_1_GA_4_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b97c5d7de491d336dcc2b1c913eec57bc2ade51a9e0214178292b044b106f4bf +size 671150064 diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_2_epochs_1_GA_4_lora/config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_2_epochs_1_GA_4_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_2_epochs_1_GA_4_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_2_epochs_1_GA_4_lora/non_lora_trainables.bin b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_2_epochs_1_GA_4_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..71c5456d9b80fcede1abb6e90ce59e959383fae8 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_2_epochs_1_GA_4_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:510b7354cbdb44580b8fcff4c0cba0358329aad7b5f7e48dcb45928b558fe28f +size 918507402 diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_2_epochs_1_GA_4_lora/trainer_state.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_2_epochs_1_GA_4_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..03527f65109970f26c67a8a5de13dcee2ea45072 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_2_epochs_1_GA_4_lora/trainer_state.json @@ -0,0 +1,2226 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9984, + "eval_steps": 500, + "global_step": 312, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0032, + "grad_norm": 0.8210982159609682, + "learning_rate": 2e-05, + "loss": 1.3329, + "step": 1 + }, + { + "epoch": 0.0064, + "grad_norm": 0.7751787218518346, + "learning_rate": 4e-05, + "loss": 1.3721, + "step": 2 + }, + { + "epoch": 0.0096, + "grad_norm": 0.6646539221769208, + "learning_rate": 6e-05, + "loss": 1.3088, + "step": 3 + }, + { + "epoch": 0.0128, + "grad_norm": 0.7036195246304789, + "learning_rate": 8e-05, + "loss": 1.3377, + "step": 4 + }, + { + "epoch": 0.016, + "grad_norm": 0.6168233543536914, + "learning_rate": 0.0001, + "loss": 1.1919, + "step": 5 + }, + { + "epoch": 0.0192, + "grad_norm": 0.8492868526153039, + "learning_rate": 0.00012, + "loss": 1.2512, + "step": 6 + }, + { + "epoch": 0.0224, + "grad_norm": 0.7936611575617178, + "learning_rate": 0.00014, + "loss": 1.0697, + "step": 7 + }, + { + "epoch": 0.0256, + "grad_norm": 0.5862560941923312, + "learning_rate": 0.00016, + "loss": 1.0911, + "step": 8 + }, + { + "epoch": 0.0288, + "grad_norm": 0.49328071163938675, + "learning_rate": 0.00018, + "loss": 0.9762, + "step": 9 + }, + { + "epoch": 0.032, + "grad_norm": 0.4796953108328627, + "learning_rate": 0.0002, + "loss": 0.9269, + "step": 10 + }, + { + "epoch": 0.0352, + "grad_norm": 0.47431010608538954, + "learning_rate": 0.00019999458931878073, + "loss": 1.032, + "step": 11 + }, + { + "epoch": 0.0384, + "grad_norm": 0.4707062597281157, + "learning_rate": 0.0001999783578606323, + "loss": 0.9748, + "step": 12 + }, + { + "epoch": 0.0416, + "grad_norm": 0.4880961566615289, + "learning_rate": 0.00019995130738201966, + "loss": 0.9767, + "step": 13 + }, + { + "epoch": 0.0448, + "grad_norm": 0.4658440438656818, + "learning_rate": 0.0001999134408101731, + "loss": 0.9179, + "step": 14 + }, + { + "epoch": 0.048, + "grad_norm": 0.5344941086238363, + "learning_rate": 0.00019986476224277165, + "loss": 1.0129, + "step": 15 + }, + { + "epoch": 0.0512, + "grad_norm": 0.5007192463357719, + "learning_rate": 0.00019980527694749952, + "loss": 0.9953, + "step": 16 + }, + { + "epoch": 0.0544, + "grad_norm": 0.44861347739015633, + "learning_rate": 0.00019973499136147606, + "loss": 0.8987, + "step": 17 + }, + { + "epoch": 0.0576, + "grad_norm": 0.40432515099975497, + "learning_rate": 0.0001996539130905593, + "loss": 0.9949, + "step": 18 + }, + { + "epoch": 0.0608, + "grad_norm": 0.42728242685352247, + "learning_rate": 0.0001995620509085228, + "loss": 0.9649, + "step": 19 + }, + { + "epoch": 0.064, + "grad_norm": 0.4641229091681979, + "learning_rate": 0.00019945941475610623, + "loss": 0.9609, + "step": 20 + }, + { + "epoch": 0.0672, + "grad_norm": 0.3913009193318844, + "learning_rate": 0.0001993460157399396, + "loss": 0.8997, + "step": 21 + }, + { + "epoch": 0.0704, + "grad_norm": 0.3960221279711468, + "learning_rate": 0.0001992218661313415, + "loss": 0.9046, + "step": 22 + }, + { + "epoch": 0.0736, + "grad_norm": 0.4893549306818581, + "learning_rate": 0.00019908697936499103, + "loss": 0.9538, + "step": 23 + }, + { + "epoch": 0.0768, + "grad_norm": 0.3943436800759236, + "learning_rate": 0.00019894137003747403, + "loss": 0.85, + "step": 24 + }, + { + "epoch": 0.08, + "grad_norm": 0.49281687959276665, + "learning_rate": 0.00019878505390570362, + "loss": 0.8904, + "step": 25 + }, + { + "epoch": 0.0832, + "grad_norm": 0.45164645594942804, + "learning_rate": 0.00019861804788521493, + "loss": 0.9793, + "step": 26 + }, + { + "epoch": 0.0864, + "grad_norm": 0.3884822532677797, + "learning_rate": 0.00019844037004833473, + "loss": 0.8047, + "step": 27 + }, + { + "epoch": 0.0896, + "grad_norm": 0.41101776336511836, + "learning_rate": 0.00019825203962222572, + "loss": 0.9077, + "step": 28 + }, + { + "epoch": 0.0928, + "grad_norm": 1.3086028478450786, + "learning_rate": 0.0001980530769868059, + "loss": 0.9363, + "step": 29 + }, + { + "epoch": 0.096, + "grad_norm": 0.3375555003941169, + "learning_rate": 0.00019784350367254322, + "loss": 0.8523, + "step": 30 + }, + { + "epoch": 0.0992, + "grad_norm": 0.4356929720245981, + "learning_rate": 0.0001976233423581255, + "loss": 0.9504, + "step": 31 + }, + { + "epoch": 0.1024, + "grad_norm": 0.4558256010915256, + "learning_rate": 0.0001973926168680066, + "loss": 0.9619, + "step": 32 + }, + { + "epoch": 0.1056, + "grad_norm": 0.4182066475020067, + "learning_rate": 0.00019715135216982798, + "loss": 0.853, + "step": 33 + }, + { + "epoch": 0.1088, + "grad_norm": 0.42581671871982196, + "learning_rate": 0.0001968995743717171, + "loss": 0.9528, + "step": 34 + }, + { + "epoch": 0.112, + "grad_norm": 0.39591765873363666, + "learning_rate": 0.00019663731071946206, + "loss": 0.9005, + "step": 35 + }, + { + "epoch": 0.1152, + "grad_norm": 0.39119354380322896, + "learning_rate": 0.00019636458959356316, + "loss": 0.8988, + "step": 36 + }, + { + "epoch": 0.1184, + "grad_norm": 0.40090914965423335, + "learning_rate": 0.0001960814405061619, + "loss": 0.8758, + "step": 37 + }, + { + "epoch": 0.1216, + "grad_norm": 0.3945361491838484, + "learning_rate": 0.00019578789409784727, + "loss": 0.9393, + "step": 38 + }, + { + "epoch": 0.1248, + "grad_norm": 0.4394341724704068, + "learning_rate": 0.00019548398213434007, + "loss": 0.9428, + "step": 39 + }, + { + "epoch": 0.128, + "grad_norm": 0.3727456121947463, + "learning_rate": 0.00019516973750305532, + "loss": 0.8748, + "step": 40 + }, + { + "epoch": 0.1312, + "grad_norm": 0.37234199586270694, + "learning_rate": 0.00019484519420954354, + "loss": 0.8736, + "step": 41 + }, + { + "epoch": 0.1344, + "grad_norm": 0.39842513634466276, + "learning_rate": 0.00019451038737381077, + "loss": 0.9022, + "step": 42 + }, + { + "epoch": 0.1376, + "grad_norm": 0.38128602501461806, + "learning_rate": 0.00019416535322651818, + "loss": 0.8208, + "step": 43 + }, + { + "epoch": 0.1408, + "grad_norm": 0.38025302062603994, + "learning_rate": 0.00019381012910506146, + "loss": 0.8832, + "step": 44 + }, + { + "epoch": 0.144, + "grad_norm": 0.34279246176384087, + "learning_rate": 0.00019344475344953012, + "loss": 0.79, + "step": 45 + }, + { + "epoch": 0.1472, + "grad_norm": 0.39891015144038994, + "learning_rate": 0.00019306926579854821, + "loss": 0.8637, + "step": 46 + }, + { + "epoch": 0.1504, + "grad_norm": 0.33599563185761405, + "learning_rate": 0.00019268370678499533, + "loss": 0.7904, + "step": 47 + }, + { + "epoch": 0.1536, + "grad_norm": 0.405639307195739, + "learning_rate": 0.0001922881181316097, + "loss": 0.8904, + "step": 48 + }, + { + "epoch": 0.1568, + "grad_norm": 0.347008173492179, + "learning_rate": 0.00019188254264647337, + "loss": 0.8188, + "step": 49 + }, + { + "epoch": 0.16, + "grad_norm": 0.4053373760955314, + "learning_rate": 0.0001914670242183795, + "loss": 0.8767, + "step": 50 + }, + { + "epoch": 0.1632, + "grad_norm": 0.4071076068532092, + "learning_rate": 0.0001910416078120832, + "loss": 0.8753, + "step": 51 + }, + { + "epoch": 0.1664, + "grad_norm": 0.3947573279192353, + "learning_rate": 0.0001906063394634356, + "loss": 0.9461, + "step": 52 + }, + { + "epoch": 0.1696, + "grad_norm": 0.41593394777293125, + "learning_rate": 0.00019016126627440237, + "loss": 0.8543, + "step": 53 + }, + { + "epoch": 0.1728, + "grad_norm": 0.34556265770047007, + "learning_rate": 0.00018970643640796642, + "loss": 0.8457, + "step": 54 + }, + { + "epoch": 0.176, + "grad_norm": 0.4135992491712443, + "learning_rate": 0.000189241899082916, + "loss": 0.8471, + "step": 55 + }, + { + "epoch": 0.1792, + "grad_norm": 0.36656186910259925, + "learning_rate": 0.00018876770456851877, + "loss": 0.8315, + "step": 56 + }, + { + "epoch": 0.1824, + "grad_norm": 0.41888747530592474, + "learning_rate": 0.0001882839041790818, + "loss": 0.9118, + "step": 57 + }, + { + "epoch": 0.1856, + "grad_norm": 0.36251863953206737, + "learning_rate": 0.00018779055026839868, + "loss": 0.8626, + "step": 58 + }, + { + "epoch": 0.1888, + "grad_norm": 0.3748079918860433, + "learning_rate": 0.00018728769622408423, + "loss": 0.9207, + "step": 59 + }, + { + "epoch": 0.192, + "grad_norm": 0.34699582513061866, + "learning_rate": 0.00018677539646179707, + "loss": 0.7823, + "step": 60 + }, + { + "epoch": 0.1952, + "grad_norm": 0.4324436996517203, + "learning_rate": 0.00018625370641935129, + "loss": 0.9012, + "step": 61 + }, + { + "epoch": 0.1984, + "grad_norm": 0.40830183945729825, + "learning_rate": 0.00018572268255071718, + "loss": 0.8866, + "step": 62 + }, + { + "epoch": 0.2016, + "grad_norm": 0.35645475792133624, + "learning_rate": 0.00018518238231991218, + "loss": 0.8246, + "step": 63 + }, + { + "epoch": 0.2048, + "grad_norm": 0.39501413519318324, + "learning_rate": 0.00018463286419478255, + "loss": 0.8903, + "step": 64 + }, + { + "epoch": 0.208, + "grad_norm": 0.3403250694314037, + "learning_rate": 0.00018407418764067627, + "loss": 0.8002, + "step": 65 + }, + { + "epoch": 0.2112, + "grad_norm": 0.38578492015884985, + "learning_rate": 0.00018350641311400812, + "loss": 0.8823, + "step": 66 + }, + { + "epoch": 0.2144, + "grad_norm": 0.42912996214481597, + "learning_rate": 0.0001829296020557174, + "loss": 0.9337, + "step": 67 + }, + { + "epoch": 0.2176, + "grad_norm": 0.33840309416398107, + "learning_rate": 0.00018234381688461942, + "loss": 0.8152, + "step": 68 + }, + { + "epoch": 0.2208, + "grad_norm": 0.36261675945692257, + "learning_rate": 0.0001817491209906506, + "loss": 0.8119, + "step": 69 + }, + { + "epoch": 0.224, + "grad_norm": 0.3978630764800173, + "learning_rate": 0.00018114557872800905, + "loss": 0.8884, + "step": 70 + }, + { + "epoch": 0.2272, + "grad_norm": 0.4067690308983075, + "learning_rate": 0.00018053325540819045, + "loss": 0.888, + "step": 71 + }, + { + "epoch": 0.2304, + "grad_norm": 0.39761946908218565, + "learning_rate": 0.0001799122172929206, + "loss": 0.8734, + "step": 72 + }, + { + "epoch": 0.2336, + "grad_norm": 0.416514669804446, + "learning_rate": 0.00017928253158698473, + "loss": 0.8813, + "step": 73 + }, + { + "epoch": 0.2368, + "grad_norm": 0.3831820092966067, + "learning_rate": 0.0001786442664309554, + "loss": 0.8015, + "step": 74 + }, + { + "epoch": 0.24, + "grad_norm": 0.40531349166429315, + "learning_rate": 0.0001779974908938184, + "loss": 0.8021, + "step": 75 + }, + { + "epoch": 0.2432, + "grad_norm": 0.3757725806826467, + "learning_rate": 0.0001773422749654988, + "loss": 0.7819, + "step": 76 + }, + { + "epoch": 0.2464, + "grad_norm": 0.3973182683957758, + "learning_rate": 0.00017667868954928694, + "loss": 0.8807, + "step": 77 + }, + { + "epoch": 0.2496, + "grad_norm": 0.3251584668348043, + "learning_rate": 0.00017600680645416583, + "loss": 0.7627, + "step": 78 + }, + { + "epoch": 0.2528, + "grad_norm": 0.41127123846488717, + "learning_rate": 0.00017532669838704035, + "loss": 0.8387, + "step": 79 + }, + { + "epoch": 0.256, + "grad_norm": 0.37661663005517965, + "learning_rate": 0.00017463843894486937, + "loss": 0.8423, + "step": 80 + }, + { + "epoch": 0.2592, + "grad_norm": 0.5786386678521719, + "learning_rate": 0.0001739421026067017, + "loss": 0.8949, + "step": 81 + }, + { + "epoch": 0.2624, + "grad_norm": 0.362252109033277, + "learning_rate": 0.00017323776472561627, + "loss": 0.8644, + "step": 82 + }, + { + "epoch": 0.2656, + "grad_norm": 0.3786074251460458, + "learning_rate": 0.00017252550152056795, + "loss": 0.8161, + "step": 83 + }, + { + "epoch": 0.2688, + "grad_norm": 0.38704773419959004, + "learning_rate": 0.0001718053900681397, + "loss": 0.8054, + "step": 84 + }, + { + "epoch": 0.272, + "grad_norm": 0.36687924172159914, + "learning_rate": 0.00017107750829420176, + "loss": 0.8053, + "step": 85 + }, + { + "epoch": 0.2752, + "grad_norm": 0.4532905914552837, + "learning_rate": 0.00017034193496547902, + "loss": 0.8496, + "step": 86 + }, + { + "epoch": 0.2784, + "grad_norm": 0.3868107415971005, + "learning_rate": 0.00016959874968102735, + "loss": 0.8292, + "step": 87 + }, + { + "epoch": 0.2816, + "grad_norm": 0.45004745878752395, + "learning_rate": 0.00016884803286362, + "loss": 0.8386, + "step": 88 + }, + { + "epoch": 0.2848, + "grad_norm": 0.3601950439013075, + "learning_rate": 0.00016808986575104465, + "loss": 0.8144, + "step": 89 + }, + { + "epoch": 0.288, + "grad_norm": 0.39414164051104805, + "learning_rate": 0.00016732433038731242, + "loss": 0.7563, + "step": 90 + }, + { + "epoch": 0.2912, + "grad_norm": 0.4162252396854745, + "learning_rate": 0.0001665515096137797, + "loss": 0.79, + "step": 91 + }, + { + "epoch": 0.2944, + "grad_norm": 0.37960017989400824, + "learning_rate": 0.00016577148706018328, + "loss": 0.8201, + "step": 92 + }, + { + "epoch": 0.2976, + "grad_norm": 0.3838649671026415, + "learning_rate": 0.00016498434713559088, + "loss": 0.8362, + "step": 93 + }, + { + "epoch": 0.3008, + "grad_norm": 0.43789279418131016, + "learning_rate": 0.00016419017501926656, + "loss": 0.8057, + "step": 94 + }, + { + "epoch": 0.304, + "grad_norm": 0.33824523290379066, + "learning_rate": 0.0001633890566514535, + "loss": 0.7832, + "step": 95 + }, + { + "epoch": 0.3072, + "grad_norm": 0.38508464229871187, + "learning_rate": 0.00016258107872407375, + "loss": 0.8825, + "step": 96 + }, + { + "epoch": 0.3104, + "grad_norm": 0.3872280278580832, + "learning_rate": 0.0001617663286713474, + "loss": 0.8568, + "step": 97 + }, + { + "epoch": 0.3136, + "grad_norm": 0.3710845404668928, + "learning_rate": 0.00016094489466033043, + "loss": 0.8433, + "step": 98 + }, + { + "epoch": 0.3168, + "grad_norm": 0.3601850097818311, + "learning_rate": 0.00016011686558137448, + "loss": 0.8426, + "step": 99 + }, + { + "epoch": 0.32, + "grad_norm": 0.36336268281281814, + "learning_rate": 0.0001592823310385073, + "loss": 0.8459, + "step": 100 + }, + { + "epoch": 0.3232, + "grad_norm": 0.4007311050026622, + "learning_rate": 0.0001584413813397364, + "loss": 0.8332, + "step": 101 + }, + { + "epoch": 0.3264, + "grad_norm": 0.3279742689663737, + "learning_rate": 0.00015759410748727662, + "loss": 0.7898, + "step": 102 + }, + { + "epoch": 0.3296, + "grad_norm": 0.3937884919475543, + "learning_rate": 0.00015674060116770236, + "loss": 0.8439, + "step": 103 + }, + { + "epoch": 0.3328, + "grad_norm": 0.4143083432212905, + "learning_rate": 0.00015588095474202595, + "loss": 0.8246, + "step": 104 + }, + { + "epoch": 0.336, + "grad_norm": 0.35676836384855826, + "learning_rate": 0.00015501526123570277, + "loss": 0.8377, + "step": 105 + }, + { + "epoch": 0.3392, + "grad_norm": 0.4418839450505997, + "learning_rate": 0.00015414361432856475, + "loss": 0.8546, + "step": 106 + }, + { + "epoch": 0.3424, + "grad_norm": 0.38722861928111846, + "learning_rate": 0.0001532661083446829, + "loss": 0.8938, + "step": 107 + }, + { + "epoch": 0.3456, + "grad_norm": 0.4798810748731012, + "learning_rate": 0.00015238283824216015, + "loss": 0.933, + "step": 108 + }, + { + "epoch": 0.3488, + "grad_norm": 0.4192610184794806, + "learning_rate": 0.00015149389960285558, + "loss": 0.8314, + "step": 109 + }, + { + "epoch": 0.352, + "grad_norm": 0.37992908787234797, + "learning_rate": 0.00015059938862204127, + "loss": 0.8502, + "step": 110 + }, + { + "epoch": 0.3552, + "grad_norm": 0.3537079585046425, + "learning_rate": 0.00014969940209799248, + "loss": 0.837, + "step": 111 + }, + { + "epoch": 0.3584, + "grad_norm": 0.4947879619869325, + "learning_rate": 0.00014879403742151283, + "loss": 0.8655, + "step": 112 + }, + { + "epoch": 0.3616, + "grad_norm": 0.4144273342987794, + "learning_rate": 0.00014788339256539544, + "loss": 0.8024, + "step": 113 + }, + { + "epoch": 0.3648, + "grad_norm": 0.42810201253462243, + "learning_rate": 0.0001469675660738206, + "loss": 0.8646, + "step": 114 + }, + { + "epoch": 0.368, + "grad_norm": 0.43495581513946346, + "learning_rate": 0.00014604665705169237, + "loss": 0.8019, + "step": 115 + }, + { + "epoch": 0.3712, + "grad_norm": 0.40514388555481956, + "learning_rate": 0.00014512076515391375, + "loss": 0.8307, + "step": 116 + }, + { + "epoch": 0.3744, + "grad_norm": 0.5344793506502119, + "learning_rate": 0.00014418999057460276, + "loss": 0.8583, + "step": 117 + }, + { + "epoch": 0.3776, + "grad_norm": 0.3591362856868415, + "learning_rate": 0.0001432544340362501, + "loss": 0.8422, + "step": 118 + }, + { + "epoch": 0.3808, + "grad_norm": 0.43800340514585073, + "learning_rate": 0.00014231419677881966, + "loss": 0.829, + "step": 119 + }, + { + "epoch": 0.384, + "grad_norm": 0.3546231000833001, + "learning_rate": 0.00014136938054879283, + "loss": 0.8502, + "step": 120 + }, + { + "epoch": 0.3872, + "grad_norm": 0.3824038358260862, + "learning_rate": 0.00014042008758815818, + "loss": 0.8601, + "step": 121 + }, + { + "epoch": 0.3904, + "grad_norm": 0.3773025369439645, + "learning_rate": 0.00013946642062334766, + "loss": 0.7483, + "step": 122 + }, + { + "epoch": 0.3936, + "grad_norm": 0.40841295198664107, + "learning_rate": 0.00013850848285411994, + "loss": 0.8981, + "step": 123 + }, + { + "epoch": 0.3968, + "grad_norm": 0.32297152126914114, + "learning_rate": 0.000137546377942393, + "loss": 0.7604, + "step": 124 + }, + { + "epoch": 0.4, + "grad_norm": 0.38908595736780166, + "learning_rate": 0.00013658021000102636, + "loss": 0.8377, + "step": 125 + }, + { + "epoch": 0.4032, + "grad_norm": 0.30749072492921053, + "learning_rate": 0.00013561008358255468, + "loss": 0.6953, + "step": 126 + }, + { + "epoch": 0.4064, + "grad_norm": 0.42279066677539, + "learning_rate": 0.00013463610366787392, + "loss": 0.8381, + "step": 127 + }, + { + "epoch": 0.4096, + "grad_norm": 0.46835688525644387, + "learning_rate": 0.00013365837565488064, + "loss": 0.8801, + "step": 128 + }, + { + "epoch": 0.4128, + "grad_norm": 0.38170959986063163, + "learning_rate": 0.0001326770053470668, + "loss": 0.8177, + "step": 129 + }, + { + "epoch": 0.416, + "grad_norm": 0.3748992249270009, + "learning_rate": 0.0001316920989420703, + "loss": 0.7318, + "step": 130 + }, + { + "epoch": 0.4192, + "grad_norm": 0.32117252663093104, + "learning_rate": 0.00013070376302018287, + "loss": 0.7911, + "step": 131 + }, + { + "epoch": 0.4224, + "grad_norm": 0.32643252918835663, + "learning_rate": 0.00012971210453281674, + "loss": 0.7565, + "step": 132 + }, + { + "epoch": 0.4256, + "grad_norm": 0.34707082290937796, + "learning_rate": 0.000128717230790931, + "loss": 0.8179, + "step": 133 + }, + { + "epoch": 0.4288, + "grad_norm": 0.37747643787259155, + "learning_rate": 0.00012771924945341906, + "loss": 0.7647, + "step": 134 + }, + { + "epoch": 0.432, + "grad_norm": 0.4042549495002011, + "learning_rate": 0.00012671826851545851, + "loss": 0.8638, + "step": 135 + }, + { + "epoch": 0.4352, + "grad_norm": 0.40682592869078416, + "learning_rate": 0.0001257143962968246, + "loss": 0.9235, + "step": 136 + }, + { + "epoch": 0.4384, + "grad_norm": 0.3394181019470551, + "learning_rate": 0.00012470774143016853, + "loss": 0.8029, + "step": 137 + }, + { + "epoch": 0.4416, + "grad_norm": 0.37379683333378905, + "learning_rate": 0.00012369841284926188, + "loss": 0.8183, + "step": 138 + }, + { + "epoch": 0.4448, + "grad_norm": 0.37760782532653564, + "learning_rate": 0.00012268651977720866, + "loss": 0.732, + "step": 139 + }, + { + "epoch": 0.448, + "grad_norm": 0.42926915905667945, + "learning_rate": 0.00012167217171462566, + "loss": 0.8645, + "step": 140 + }, + { + "epoch": 0.4512, + "grad_norm": 0.3353578654728262, + "learning_rate": 0.0001206554784277931, + "loss": 0.7573, + "step": 141 + }, + { + "epoch": 0.4544, + "grad_norm": 0.36702732140537525, + "learning_rate": 0.00011963654993677645, + "loss": 0.7855, + "step": 142 + }, + { + "epoch": 0.4576, + "grad_norm": 0.3945829288474564, + "learning_rate": 0.00011861549650352069, + "loss": 0.9015, + "step": 143 + }, + { + "epoch": 0.4608, + "grad_norm": 0.3384991496539021, + "learning_rate": 0.00011759242861991855, + "loss": 0.7672, + "step": 144 + }, + { + "epoch": 0.464, + "grad_norm": 0.36235446616831335, + "learning_rate": 0.00011656745699585371, + "loss": 0.7298, + "step": 145 + }, + { + "epoch": 0.4672, + "grad_norm": 0.37275162386027966, + "learning_rate": 0.00011554069254722051, + "loss": 0.8231, + "step": 146 + }, + { + "epoch": 0.4704, + "grad_norm": 0.3351601172488379, + "learning_rate": 0.00011451224638392129, + "loss": 0.78, + "step": 147 + }, + { + "epoch": 0.4736, + "grad_norm": 0.4177215304468739, + "learning_rate": 0.00011348222979784289, + "loss": 0.8148, + "step": 148 + }, + { + "epoch": 0.4768, + "grad_norm": 0.31882281813401364, + "learning_rate": 0.00011245075425081328, + "loss": 0.8256, + "step": 149 + }, + { + "epoch": 0.48, + "grad_norm": 0.34927524739208016, + "learning_rate": 0.00011141793136253986, + "loss": 0.7206, + "step": 150 + }, + { + "epoch": 0.4832, + "grad_norm": 0.3272143865687314, + "learning_rate": 0.0001103838728985307, + "loss": 0.8422, + "step": 151 + }, + { + "epoch": 0.4864, + "grad_norm": 0.38082561425167516, + "learning_rate": 0.000109348690758, + "loss": 0.801, + "step": 152 + }, + { + "epoch": 0.4896, + "grad_norm": 0.3683618334975772, + "learning_rate": 0.00010831249696175918, + "loss": 0.754, + "step": 153 + }, + { + "epoch": 0.4928, + "grad_norm": 0.3460529686559446, + "learning_rate": 0.0001072754036400944, + "loss": 0.7703, + "step": 154 + }, + { + "epoch": 0.496, + "grad_norm": 0.3906059155144969, + "learning_rate": 0.00010623752302063283, + "loss": 0.8635, + "step": 155 + }, + { + "epoch": 0.4992, + "grad_norm": 0.4498300529094466, + "learning_rate": 0.00010519896741619803, + "loss": 0.887, + "step": 156 + }, + { + "epoch": 0.5024, + "grad_norm": 0.34890284462378934, + "learning_rate": 0.00010415984921265609, + "loss": 0.8415, + "step": 157 + }, + { + "epoch": 0.5056, + "grad_norm": 0.35283761184137985, + "learning_rate": 0.00010312028085675391, + "loss": 0.7897, + "step": 158 + }, + { + "epoch": 0.5088, + "grad_norm": 0.3383893586354043, + "learning_rate": 0.00010208037484395114, + "loss": 0.7699, + "step": 159 + }, + { + "epoch": 0.512, + "grad_norm": 0.37944587951981884, + "learning_rate": 0.00010104024370624644, + "loss": 0.8168, + "step": 160 + }, + { + "epoch": 0.5152, + "grad_norm": 0.35361574612426283, + "learning_rate": 0.0001, + "loss": 0.8237, + "step": 161 + }, + { + "epoch": 0.5184, + "grad_norm": 0.49601519414630724, + "learning_rate": 9.895975629375359e-05, + "loss": 0.8944, + "step": 162 + }, + { + "epoch": 0.5216, + "grad_norm": 0.4329854234900553, + "learning_rate": 9.791962515604887e-05, + "loss": 0.8409, + "step": 163 + }, + { + "epoch": 0.5248, + "grad_norm": 0.33753405466010045, + "learning_rate": 9.687971914324607e-05, + "loss": 0.6823, + "step": 164 + }, + { + "epoch": 0.528, + "grad_norm": 0.3641968735452542, + "learning_rate": 9.584015078734395e-05, + "loss": 0.7999, + "step": 165 + }, + { + "epoch": 0.5312, + "grad_norm": 0.3412252144703986, + "learning_rate": 9.480103258380198e-05, + "loss": 0.7485, + "step": 166 + }, + { + "epoch": 0.5344, + "grad_norm": 0.36557961816073514, + "learning_rate": 9.376247697936719e-05, + "loss": 0.8047, + "step": 167 + }, + { + "epoch": 0.5376, + "grad_norm": 0.3143242351094908, + "learning_rate": 9.272459635990562e-05, + "loss": 0.7417, + "step": 168 + }, + { + "epoch": 0.5408, + "grad_norm": 0.37740381793108085, + "learning_rate": 9.168750303824084e-05, + "loss": 0.7847, + "step": 169 + }, + { + "epoch": 0.544, + "grad_norm": 0.3756001432572221, + "learning_rate": 9.065130924199998e-05, + "loss": 0.8045, + "step": 170 + }, + { + "epoch": 0.5472, + "grad_norm": 0.3886531783560117, + "learning_rate": 8.961612710146934e-05, + "loss": 0.7739, + "step": 171 + }, + { + "epoch": 0.5504, + "grad_norm": 0.35961818663851197, + "learning_rate": 8.858206863746018e-05, + "loss": 0.7682, + "step": 172 + }, + { + "epoch": 0.5536, + "grad_norm": 0.34291998146787495, + "learning_rate": 8.754924574918675e-05, + "loss": 0.8154, + "step": 173 + }, + { + "epoch": 0.5568, + "grad_norm": 0.365623213677616, + "learning_rate": 8.651777020215712e-05, + "loss": 0.7723, + "step": 174 + }, + { + "epoch": 0.56, + "grad_norm": 0.4154693198085254, + "learning_rate": 8.548775361607872e-05, + "loss": 0.8059, + "step": 175 + }, + { + "epoch": 0.5632, + "grad_norm": 0.41853675355977066, + "learning_rate": 8.445930745277953e-05, + "loss": 0.7936, + "step": 176 + }, + { + "epoch": 0.5664, + "grad_norm": 0.3029988536971098, + "learning_rate": 8.343254300414628e-05, + "loss": 0.7538, + "step": 177 + }, + { + "epoch": 0.5696, + "grad_norm": 0.35015284331125907, + "learning_rate": 8.240757138008149e-05, + "loss": 0.784, + "step": 178 + }, + { + "epoch": 0.5728, + "grad_norm": 0.36899119805196373, + "learning_rate": 8.138450349647936e-05, + "loss": 0.7712, + "step": 179 + }, + { + "epoch": 0.576, + "grad_norm": 0.4635909482238713, + "learning_rate": 8.036345006322359e-05, + "loss": 0.8151, + "step": 180 + }, + { + "epoch": 0.5792, + "grad_norm": 0.39324815381124656, + "learning_rate": 7.934452157220694e-05, + "loss": 0.8479, + "step": 181 + }, + { + "epoch": 0.5824, + "grad_norm": 0.3790446343866871, + "learning_rate": 7.832782828537437e-05, + "loss": 0.7849, + "step": 182 + }, + { + "epoch": 0.5856, + "grad_norm": 0.38585647860183997, + "learning_rate": 7.731348022279134e-05, + "loss": 0.8062, + "step": 183 + }, + { + "epoch": 0.5888, + "grad_norm": 0.3851371620272845, + "learning_rate": 7.630158715073813e-05, + "loss": 0.8341, + "step": 184 + }, + { + "epoch": 0.592, + "grad_norm": 0.33007054542330805, + "learning_rate": 7.52922585698315e-05, + "loss": 0.7197, + "step": 185 + }, + { + "epoch": 0.5952, + "grad_norm": 0.42174584227102946, + "learning_rate": 7.428560370317542e-05, + "loss": 0.8017, + "step": 186 + }, + { + "epoch": 0.5984, + "grad_norm": 0.33067818645649066, + "learning_rate": 7.328173148454151e-05, + "loss": 0.7886, + "step": 187 + }, + { + "epoch": 0.6016, + "grad_norm": 0.42265554001971417, + "learning_rate": 7.228075054658096e-05, + "loss": 0.8238, + "step": 188 + }, + { + "epoch": 0.6048, + "grad_norm": 0.34908037692220767, + "learning_rate": 7.1282769209069e-05, + "loss": 0.7928, + "step": 189 + }, + { + "epoch": 0.608, + "grad_norm": 0.3700495310823468, + "learning_rate": 7.028789546718326e-05, + "loss": 0.7491, + "step": 190 + }, + { + "epoch": 0.6112, + "grad_norm": 0.3535456489525407, + "learning_rate": 6.929623697981718e-05, + "loss": 0.8094, + "step": 191 + }, + { + "epoch": 0.6144, + "grad_norm": 0.35992816828147584, + "learning_rate": 6.830790105792973e-05, + "loss": 0.747, + "step": 192 + }, + { + "epoch": 0.6176, + "grad_norm": 0.34357552011988995, + "learning_rate": 6.732299465293322e-05, + "loss": 0.7329, + "step": 193 + }, + { + "epoch": 0.6208, + "grad_norm": 0.4017091190478587, + "learning_rate": 6.63416243451194e-05, + "loss": 0.807, + "step": 194 + }, + { + "epoch": 0.624, + "grad_norm": 0.3092472351482309, + "learning_rate": 6.536389633212609e-05, + "loss": 0.7309, + "step": 195 + }, + { + "epoch": 0.6272, + "grad_norm": 0.3737027071808558, + "learning_rate": 6.43899164174453e-05, + "loss": 0.8191, + "step": 196 + }, + { + "epoch": 0.6304, + "grad_norm": 0.36607238095868505, + "learning_rate": 6.341978999897365e-05, + "loss": 0.7998, + "step": 197 + }, + { + "epoch": 0.6336, + "grad_norm": 0.4042496034179054, + "learning_rate": 6.245362205760704e-05, + "loss": 0.773, + "step": 198 + }, + { + "epoch": 0.6368, + "grad_norm": 0.37764639494375285, + "learning_rate": 6.149151714588009e-05, + "loss": 0.8055, + "step": 199 + }, + { + "epoch": 0.64, + "grad_norm": 0.3348761239337774, + "learning_rate": 6.053357937665237e-05, + "loss": 0.7443, + "step": 200 + }, + { + "epoch": 0.6432, + "grad_norm": 0.37626992146366794, + "learning_rate": 5.957991241184184e-05, + "loss": 0.7826, + "step": 201 + }, + { + "epoch": 0.6464, + "grad_norm": 0.3665662199313722, + "learning_rate": 5.863061945120719e-05, + "loss": 0.8335, + "step": 202 + }, + { + "epoch": 0.6496, + "grad_norm": 0.33785931924439677, + "learning_rate": 5.768580322118034e-05, + "loss": 0.7972, + "step": 203 + }, + { + "epoch": 0.6528, + "grad_norm": 0.3880113242436853, + "learning_rate": 5.6745565963749925e-05, + "loss": 0.8896, + "step": 204 + }, + { + "epoch": 0.656, + "grad_norm": 0.3721493819848271, + "learning_rate": 5.5810009425397294e-05, + "loss": 0.8183, + "step": 205 + }, + { + "epoch": 0.6592, + "grad_norm": 0.5198646512996709, + "learning_rate": 5.487923484608629e-05, + "loss": 0.8566, + "step": 206 + }, + { + "epoch": 0.6624, + "grad_norm": 0.4065329950447661, + "learning_rate": 5.395334294830765e-05, + "loss": 0.9147, + "step": 207 + }, + { + "epoch": 0.6656, + "grad_norm": 0.37859745509497567, + "learning_rate": 5.3032433926179395e-05, + "loss": 0.8042, + "step": 208 + }, + { + "epoch": 0.6688, + "grad_norm": 0.3607765773764099, + "learning_rate": 5.211660743460458e-05, + "loss": 0.7938, + "step": 209 + }, + { + "epoch": 0.672, + "grad_norm": 0.4092950714526401, + "learning_rate": 5.1205962578487155e-05, + "loss": 0.806, + "step": 210 + }, + { + "epoch": 0.6752, + "grad_norm": 0.30432228901089703, + "learning_rate": 5.030059790200756e-05, + "loss": 0.7262, + "step": 211 + }, + { + "epoch": 0.6784, + "grad_norm": 0.37088903131596684, + "learning_rate": 4.940061137795876e-05, + "loss": 0.7169, + "step": 212 + }, + { + "epoch": 0.6816, + "grad_norm": 0.3904681317398879, + "learning_rate": 4.850610039714444e-05, + "loss": 0.8104, + "step": 213 + }, + { + "epoch": 0.6848, + "grad_norm": 0.46898322895625755, + "learning_rate": 4.761716175783989e-05, + "loss": 0.8688, + "step": 214 + }, + { + "epoch": 0.688, + "grad_norm": 0.3006038153490141, + "learning_rate": 4.673389165531714e-05, + "loss": 0.7169, + "step": 215 + }, + { + "epoch": 0.6912, + "grad_norm": 0.37810250308131216, + "learning_rate": 4.585638567143529e-05, + "loss": 0.789, + "step": 216 + }, + { + "epoch": 0.6944, + "grad_norm": 0.2989111481815409, + "learning_rate": 4.498473876429726e-05, + "loss": 0.7303, + "step": 217 + }, + { + "epoch": 0.6976, + "grad_norm": 0.32911143249763547, + "learning_rate": 4.411904525797408e-05, + "loss": 0.7592, + "step": 218 + }, + { + "epoch": 0.7008, + "grad_norm": 0.35549746840897417, + "learning_rate": 4.325939883229766e-05, + "loss": 0.8175, + "step": 219 + }, + { + "epoch": 0.704, + "grad_norm": 0.31577828848903333, + "learning_rate": 4.240589251272342e-05, + "loss": 0.7581, + "step": 220 + }, + { + "epoch": 0.7072, + "grad_norm": 0.38150936709065336, + "learning_rate": 4.155861866026364e-05, + "loss": 0.7383, + "step": 221 + }, + { + "epoch": 0.7104, + "grad_norm": 0.352873952674236, + "learning_rate": 4.071766896149273e-05, + "loss": 0.7899, + "step": 222 + }, + { + "epoch": 0.7136, + "grad_norm": 0.3729395002008322, + "learning_rate": 3.988313441862553e-05, + "loss": 0.8267, + "step": 223 + }, + { + "epoch": 0.7168, + "grad_norm": 0.38612103435747785, + "learning_rate": 3.9055105339669595e-05, + "loss": 0.8438, + "step": 224 + }, + { + "epoch": 0.72, + "grad_norm": 0.3343773300394658, + "learning_rate": 3.823367132865265e-05, + "loss": 0.7766, + "step": 225 + }, + { + "epoch": 0.7232, + "grad_norm": 0.34821484213233017, + "learning_rate": 3.741892127592625e-05, + "loss": 0.7869, + "step": 226 + }, + { + "epoch": 0.7264, + "grad_norm": 0.34452187805524936, + "learning_rate": 3.6610943348546526e-05, + "loss": 0.8142, + "step": 227 + }, + { + "epoch": 0.7296, + "grad_norm": 0.389690433062319, + "learning_rate": 3.580982498073344e-05, + "loss": 0.75, + "step": 228 + }, + { + "epoch": 0.7328, + "grad_norm": 0.4645823764551201, + "learning_rate": 3.501565286440914e-05, + "loss": 0.9086, + "step": 229 + }, + { + "epoch": 0.736, + "grad_norm": 0.3374398934171241, + "learning_rate": 3.422851293981676e-05, + "loss": 0.7671, + "step": 230 + }, + { + "epoch": 0.7392, + "grad_norm": 0.4111564099874274, + "learning_rate": 3.3448490386220355e-05, + "loss": 0.83, + "step": 231 + }, + { + "epoch": 0.7424, + "grad_norm": 0.4082259005237418, + "learning_rate": 3.2675669612687565e-05, + "loss": 0.8499, + "step": 232 + }, + { + "epoch": 0.7456, + "grad_norm": 0.35262858393716395, + "learning_rate": 3.191013424895536e-05, + "loss": 0.8093, + "step": 233 + }, + { + "epoch": 0.7488, + "grad_norm": 0.41355266235382226, + "learning_rate": 3.115196713638e-05, + "loss": 0.8087, + "step": 234 + }, + { + "epoch": 0.752, + "grad_norm": 0.41241448924179946, + "learning_rate": 3.040125031897264e-05, + "loss": 0.7911, + "step": 235 + }, + { + "epoch": 0.7552, + "grad_norm": 0.3279954208792422, + "learning_rate": 2.9658065034520978e-05, + "loss": 0.7616, + "step": 236 + }, + { + "epoch": 0.7584, + "grad_norm": 0.3710790783951858, + "learning_rate": 2.892249170579826e-05, + "loss": 0.7417, + "step": 237 + }, + { + "epoch": 0.7616, + "grad_norm": 0.3638956295243486, + "learning_rate": 2.8194609931860316e-05, + "loss": 0.7734, + "step": 238 + }, + { + "epoch": 0.7648, + "grad_norm": 0.45340861210629174, + "learning_rate": 2.7474498479432087e-05, + "loss": 0.845, + "step": 239 + }, + { + "epoch": 0.768, + "grad_norm": 0.32432848532039027, + "learning_rate": 2.6762235274383772e-05, + "loss": 0.7292, + "step": 240 + }, + { + "epoch": 0.7712, + "grad_norm": 0.33285800890422707, + "learning_rate": 2.6057897393298324e-05, + "loss": 0.6951, + "step": 241 + }, + { + "epoch": 0.7744, + "grad_norm": 0.36168512959628546, + "learning_rate": 2.536156105513062e-05, + "loss": 0.7777, + "step": 242 + }, + { + "epoch": 0.7776, + "grad_norm": 0.3755110018820757, + "learning_rate": 2.4673301612959654e-05, + "loss": 0.7907, + "step": 243 + }, + { + "epoch": 0.7808, + "grad_norm": 0.4746969071550545, + "learning_rate": 2.399319354583418e-05, + "loss": 0.8731, + "step": 244 + }, + { + "epoch": 0.784, + "grad_norm": 0.3485421714780257, + "learning_rate": 2.3321310450713062e-05, + "loss": 0.7017, + "step": 245 + }, + { + "epoch": 0.7872, + "grad_norm": 0.3522895092881672, + "learning_rate": 2.265772503450122e-05, + "loss": 0.7341, + "step": 246 + }, + { + "epoch": 0.7904, + "grad_norm": 0.3171714951862727, + "learning_rate": 2.2002509106181624e-05, + "loss": 0.778, + "step": 247 + }, + { + "epoch": 0.7936, + "grad_norm": 0.30854770168466894, + "learning_rate": 2.1355733569044635e-05, + "loss": 0.7326, + "step": 248 + }, + { + "epoch": 0.7968, + "grad_norm": 0.4491439626115905, + "learning_rate": 2.0717468413015283e-05, + "loss": 0.8296, + "step": 249 + }, + { + "epoch": 0.8, + "grad_norm": 0.37090037131233355, + "learning_rate": 2.008778270707944e-05, + "loss": 0.7953, + "step": 250 + }, + { + "epoch": 0.8032, + "grad_norm": 0.3834261067678261, + "learning_rate": 1.946674459180955e-05, + "loss": 0.9055, + "step": 251 + }, + { + "epoch": 0.8064, + "grad_norm": 0.3255432432056942, + "learning_rate": 1.8854421271990964e-05, + "loss": 0.7592, + "step": 252 + }, + { + "epoch": 0.8096, + "grad_norm": 0.38151180353083647, + "learning_rate": 1.8250879009349398e-05, + "loss": 0.7883, + "step": 253 + }, + { + "epoch": 0.8128, + "grad_norm": 0.43340018294551996, + "learning_rate": 1.7656183115380577e-05, + "loss": 0.9072, + "step": 254 + }, + { + "epoch": 0.816, + "grad_norm": 0.4692009486980658, + "learning_rate": 1.707039794428259e-05, + "loss": 0.8892, + "step": 255 + }, + { + "epoch": 0.8192, + "grad_norm": 0.34906122041207605, + "learning_rate": 1.649358688599191e-05, + "loss": 0.7177, + "step": 256 + }, + { + "epoch": 0.8224, + "grad_norm": 0.357075923035099, + "learning_rate": 1.5925812359323745e-05, + "loss": 0.7832, + "step": 257 + }, + { + "epoch": 0.8256, + "grad_norm": 0.3853853663460861, + "learning_rate": 1.5367135805217458e-05, + "loss": 0.7875, + "step": 258 + }, + { + "epoch": 0.8288, + "grad_norm": 0.3162997323586251, + "learning_rate": 1.4817617680087825e-05, + "loss": 0.7652, + "step": 259 + }, + { + "epoch": 0.832, + "grad_norm": 0.5058370501510021, + "learning_rate": 1.4277317449282834e-05, + "loss": 0.788, + "step": 260 + }, + { + "epoch": 0.8352, + "grad_norm": 0.38934979473108083, + "learning_rate": 1.3746293580648717e-05, + "loss": 0.8643, + "step": 261 + }, + { + "epoch": 0.8384, + "grad_norm": 0.36714032393813667, + "learning_rate": 1.3224603538202929e-05, + "loss": 0.7469, + "step": 262 + }, + { + "epoch": 0.8416, + "grad_norm": 0.3488248102724531, + "learning_rate": 1.2712303775915802e-05, + "loss": 0.7318, + "step": 263 + }, + { + "epoch": 0.8448, + "grad_norm": 0.294650221806305, + "learning_rate": 1.220944973160133e-05, + "loss": 0.7344, + "step": 264 + }, + { + "epoch": 0.848, + "grad_norm": 0.3540730290394561, + "learning_rate": 1.1716095820918216e-05, + "loss": 0.7354, + "step": 265 + }, + { + "epoch": 0.8512, + "grad_norm": 0.4226203562757057, + "learning_rate": 1.1232295431481222e-05, + "loss": 0.808, + "step": 266 + }, + { + "epoch": 0.8544, + "grad_norm": 0.4014616869239377, + "learning_rate": 1.0758100917083991e-05, + "loss": 0.7749, + "step": 267 + }, + { + "epoch": 0.8576, + "grad_norm": 0.3934215620886321, + "learning_rate": 1.0293563592033595e-05, + "loss": 0.8003, + "step": 268 + }, + { + "epoch": 0.8608, + "grad_norm": 0.3353659014526165, + "learning_rate": 9.838733725597615e-06, + "loss": 0.7851, + "step": 269 + }, + { + "epoch": 0.864, + "grad_norm": 0.3700720016155327, + "learning_rate": 9.393660536564408e-06, + "loss": 0.7941, + "step": 270 + }, + { + "epoch": 0.8672, + "grad_norm": 0.31100504311573046, + "learning_rate": 8.958392187916841e-06, + "loss": 0.7298, + "step": 271 + }, + { + "epoch": 0.8704, + "grad_norm": 0.4172712953645561, + "learning_rate": 8.532975781620512e-06, + "loss": 0.7686, + "step": 272 + }, + { + "epoch": 0.8736, + "grad_norm": 0.34667441316708114, + "learning_rate": 8.117457353526625e-06, + "loss": 0.7106, + "step": 273 + }, + { + "epoch": 0.8768, + "grad_norm": 0.3004518824396042, + "learning_rate": 7.711881868390291e-06, + "loss": 0.6887, + "step": 274 + }, + { + "epoch": 0.88, + "grad_norm": 0.38133602184118887, + "learning_rate": 7.3162932150046885e-06, + "loss": 0.8172, + "step": 275 + }, + { + "epoch": 0.8832, + "grad_norm": 0.33116090594086295, + "learning_rate": 6.930734201451816e-06, + "loss": 0.7005, + "step": 276 + }, + { + "epoch": 0.8864, + "grad_norm": 0.3188298094332972, + "learning_rate": 6.555246550469907e-06, + "loss": 0.7534, + "step": 277 + }, + { + "epoch": 0.8896, + "grad_norm": 0.4403999231950908, + "learning_rate": 6.189870894938587e-06, + "loss": 0.8213, + "step": 278 + }, + { + "epoch": 0.8928, + "grad_norm": 0.3608087635662593, + "learning_rate": 5.834646773481811e-06, + "loss": 0.7844, + "step": 279 + }, + { + "epoch": 0.896, + "grad_norm": 0.40474388045501003, + "learning_rate": 5.489612626189245e-06, + "loss": 0.8244, + "step": 280 + }, + { + "epoch": 0.8992, + "grad_norm": 0.3389637770265784, + "learning_rate": 5.154805790456485e-06, + "loss": 0.7018, + "step": 281 + }, + { + "epoch": 0.9024, + "grad_norm": 0.30643971186231883, + "learning_rate": 4.830262496944693e-06, + "loss": 0.7619, + "step": 282 + }, + { + "epoch": 0.9056, + "grad_norm": 0.3661184834317572, + "learning_rate": 4.516017865659949e-06, + "loss": 0.703, + "step": 283 + }, + { + "epoch": 0.9088, + "grad_norm": 0.3671129154556767, + "learning_rate": 4.21210590215273e-06, + "loss": 0.7824, + "step": 284 + }, + { + "epoch": 0.912, + "grad_norm": 0.34883381196841734, + "learning_rate": 3.918559493838114e-06, + "loss": 0.8148, + "step": 285 + }, + { + "epoch": 0.9152, + "grad_norm": 0.32439361737968153, + "learning_rate": 3.6354104064368566e-06, + "loss": 0.7575, + "step": 286 + }, + { + "epoch": 0.9184, + "grad_norm": 0.3866839936796524, + "learning_rate": 3.3626892805379562e-06, + "loss": 0.8134, + "step": 287 + }, + { + "epoch": 0.9216, + "grad_norm": 0.3825547694903366, + "learning_rate": 3.100425628282899e-06, + "loss": 0.7618, + "step": 288 + }, + { + "epoch": 0.9248, + "grad_norm": 0.31827546790247363, + "learning_rate": 2.848647830172024e-06, + "loss": 0.7321, + "step": 289 + }, + { + "epoch": 0.928, + "grad_norm": 0.29623028045498556, + "learning_rate": 2.607383131993424e-06, + "loss": 0.6688, + "step": 290 + }, + { + "epoch": 0.9312, + "grad_norm": 0.40429655628990896, + "learning_rate": 2.3766576418745022e-06, + "loss": 0.8106, + "step": 291 + }, + { + "epoch": 0.9344, + "grad_norm": 0.3650008962489634, + "learning_rate": 2.1564963274568027e-06, + "loss": 0.7758, + "step": 292 + }, + { + "epoch": 0.9376, + "grad_norm": 0.34654671660631003, + "learning_rate": 1.9469230131940907e-06, + "loss": 0.7488, + "step": 293 + }, + { + "epoch": 0.9408, + "grad_norm": 0.3417601789633415, + "learning_rate": 1.7479603777742938e-06, + "loss": 0.7376, + "step": 294 + }, + { + "epoch": 0.944, + "grad_norm": 0.4766922892989375, + "learning_rate": 1.559629951665298e-06, + "loss": 0.8448, + "step": 295 + }, + { + "epoch": 0.9472, + "grad_norm": 0.3764754677513324, + "learning_rate": 1.3819521147851123e-06, + "loss": 0.756, + "step": 296 + }, + { + "epoch": 0.9504, + "grad_norm": 0.3744144624463219, + "learning_rate": 1.2149460942964098e-06, + "loss": 0.7769, + "step": 297 + }, + { + "epoch": 0.9536, + "grad_norm": 0.41513981451373816, + "learning_rate": 1.05862996252597e-06, + "loss": 0.7734, + "step": 298 + }, + { + "epoch": 0.9568, + "grad_norm": 0.36679556444218603, + "learning_rate": 9.130206350089765e-07, + "loss": 0.7664, + "step": 299 + }, + { + "epoch": 0.96, + "grad_norm": 0.3859580111548229, + "learning_rate": 7.781338686584927e-07, + "loss": 0.7639, + "step": 300 + }, + { + "epoch": 0.9632, + "grad_norm": 0.34312029909834285, + "learning_rate": 6.539842600603918e-07, + "loss": 0.8412, + "step": 301 + }, + { + "epoch": 0.9664, + "grad_norm": 0.4680442538250724, + "learning_rate": 5.405852438937764e-07, + "loss": 0.8093, + "step": 302 + }, + { + "epoch": 0.9696, + "grad_norm": 0.3160450323270329, + "learning_rate": 4.3794909147720773e-07, + "loss": 0.709, + "step": 303 + }, + { + "epoch": 0.9728, + "grad_norm": 0.45915080458361, + "learning_rate": 3.4608690944071263e-07, + "loss": 0.8064, + "step": 304 + }, + { + "epoch": 0.976, + "grad_norm": 0.3737969883986542, + "learning_rate": 2.6500863852395584e-07, + "loss": 0.8277, + "step": 305 + }, + { + "epoch": 0.9792, + "grad_norm": 0.3789680381062684, + "learning_rate": 1.947230525005006e-07, + "loss": 0.8404, + "step": 306 + }, + { + "epoch": 0.9824, + "grad_norm": 0.35233074082938165, + "learning_rate": 1.3523775722834587e-07, + "loss": 0.7446, + "step": 307 + }, + { + "epoch": 0.9856, + "grad_norm": 0.3337846639484774, + "learning_rate": 8.655918982689581e-08, + "loss": 0.7811, + "step": 308 + }, + { + "epoch": 0.9888, + "grad_norm": 0.35005314677512966, + "learning_rate": 4.8692617980350406e-08, + "loss": 0.8336, + "step": 309 + }, + { + "epoch": 0.992, + "grad_norm": 0.38107841486064986, + "learning_rate": 2.164213936770576e-08, + "loss": 0.768, + "step": 310 + }, + { + "epoch": 0.9952, + "grad_norm": 0.37188047064663365, + "learning_rate": 5.410681219286673e-09, + "loss": 0.7323, + "step": 311 + }, + { + "epoch": 0.9984, + "grad_norm": 0.3528962453383034, + "learning_rate": 0.0, + "loss": 0.7924, + "step": 312 + }, + { + "epoch": 0.9984, + "step": 312, + "total_flos": 736497759813632.0, + "train_loss": 0.8298983348485751, + "train_runtime": 9149.0946, + "train_samples_per_second": 1.093, + "train_steps_per_second": 0.034 + } + ], + "logging_steps": 1.0, + "max_steps": 312, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 736497759813632.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_3_epochs_1_GA_2_lora/README.md b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_3_epochs_1_GA_2_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_3_epochs_1_GA_2_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_3_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_3_epochs_1_GA_2_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6d4056ad0dbfa8e52c3d650ef42c4176842a29a2 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_3_epochs_1_GA_2_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "up_proj", + "k_proj", + "down_proj", + "o_proj", + "gate_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..bd05a13aa1ce47d33a2febabc444d886ceb92f02 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61659b7548812b91a870a85aa48869803be2f9ca23408f3b764320bcca5a24cc +size 671150064 diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_3_epochs_1_GA_2_lora/config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_3_epochs_1_GA_2_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_3_epochs_1_GA_2_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..df16aa2da0167ff7227b487b3ad71db9acb74e88 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf545e27d23f76953d9bdd19703edae1ee5f1f56973e863e6006b8a69c875de6 +size 918507402 diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_3_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_3_epochs_1_GA_2_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..fac7f60a2000570e9c2cc10cf22fb9865077b415 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_3_epochs_1_GA_2_lora/trainer_state.json @@ -0,0 +1,4417 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 625, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0016, + "grad_norm": 1.0096366749232497, + "learning_rate": 1.0526315789473684e-05, + "loss": 1.4374, + "step": 1 + }, + { + "epoch": 0.0032, + "grad_norm": 0.8948340509042738, + "learning_rate": 2.105263157894737e-05, + "loss": 1.2714, + "step": 2 + }, + { + "epoch": 0.0048, + "grad_norm": 0.9673649083623338, + "learning_rate": 3.157894736842105e-05, + "loss": 1.4828, + "step": 3 + }, + { + "epoch": 0.0064, + "grad_norm": 0.8440321862225117, + "learning_rate": 4.210526315789474e-05, + "loss": 1.3253, + "step": 4 + }, + { + "epoch": 0.008, + "grad_norm": 0.7091743584914151, + "learning_rate": 5.2631578947368424e-05, + "loss": 1.0561, + "step": 5 + }, + { + "epoch": 0.0096, + "grad_norm": 0.8684015209033978, + "learning_rate": 6.31578947368421e-05, + "loss": 1.262, + "step": 6 + }, + { + "epoch": 0.0112, + "grad_norm": 0.8160514552806475, + "learning_rate": 7.368421052631579e-05, + "loss": 1.137, + "step": 7 + }, + { + "epoch": 0.0128, + "grad_norm": 1.0206577145842695, + "learning_rate": 8.421052631578948e-05, + "loss": 1.2182, + "step": 8 + }, + { + "epoch": 0.0144, + "grad_norm": 1.2355330127135409, + "learning_rate": 9.473684210526316e-05, + "loss": 1.0205, + "step": 9 + }, + { + "epoch": 0.016, + "grad_norm": 1.3760136594787162, + "learning_rate": 0.00010526315789473685, + "loss": 1.1951, + "step": 10 + }, + { + "epoch": 0.0176, + "grad_norm": 0.9226219355030841, + "learning_rate": 0.00011578947368421053, + "loss": 1.2095, + "step": 11 + }, + { + "epoch": 0.0192, + "grad_norm": 0.6047620007662615, + "learning_rate": 0.0001263157894736842, + "loss": 1.0607, + "step": 12 + }, + { + "epoch": 0.0208, + "grad_norm": 0.6223476017826282, + "learning_rate": 0.0001368421052631579, + "loss": 0.9295, + "step": 13 + }, + { + "epoch": 0.0224, + "grad_norm": 0.6634867557367036, + "learning_rate": 0.00014736842105263158, + "loss": 1.0089, + "step": 14 + }, + { + "epoch": 0.024, + "grad_norm": 0.6942121550576138, + "learning_rate": 0.00015789473684210527, + "loss": 1.1134, + "step": 15 + }, + { + "epoch": 0.0256, + "grad_norm": 0.6517191782439055, + "learning_rate": 0.00016842105263157895, + "loss": 0.9435, + "step": 16 + }, + { + "epoch": 0.0272, + "grad_norm": 0.5988935455903787, + "learning_rate": 0.00017894736842105264, + "loss": 0.9725, + "step": 17 + }, + { + "epoch": 0.0288, + "grad_norm": 0.7282000378549868, + "learning_rate": 0.00018947368421052632, + "loss": 0.9245, + "step": 18 + }, + { + "epoch": 0.0304, + "grad_norm": 0.6923516692944924, + "learning_rate": 0.0002, + "loss": 0.8958, + "step": 19 + }, + { + "epoch": 0.032, + "grad_norm": 0.7377312637478807, + "learning_rate": 0.00019999865623437013, + "loss": 0.9923, + "step": 20 + }, + { + "epoch": 0.0336, + "grad_norm": 0.6991529429653036, + "learning_rate": 0.00019999462497359466, + "loss": 0.999, + "step": 21 + }, + { + "epoch": 0.0352, + "grad_norm": 0.8442525167505112, + "learning_rate": 0.00019998790632601496, + "loss": 0.9775, + "step": 22 + }, + { + "epoch": 0.0368, + "grad_norm": 0.6807612405968998, + "learning_rate": 0.0001999785004721968, + "loss": 1.0393, + "step": 23 + }, + { + "epoch": 0.0384, + "grad_norm": 0.615363691022571, + "learning_rate": 0.00019996640766492543, + "loss": 0.9465, + "step": 24 + }, + { + "epoch": 0.04, + "grad_norm": 0.6560600622456343, + "learning_rate": 0.00019995162822919883, + "loss": 0.9816, + "step": 25 + }, + { + "epoch": 0.0416, + "grad_norm": 0.5510437232602844, + "learning_rate": 0.00019993416256221895, + "loss": 0.9114, + "step": 26 + }, + { + "epoch": 0.0432, + "grad_norm": 0.4954059060906038, + "learning_rate": 0.00019991401113338104, + "loss": 0.898, + "step": 27 + }, + { + "epoch": 0.0448, + "grad_norm": 0.6151884500603105, + "learning_rate": 0.00019989117448426108, + "loss": 0.9211, + "step": 28 + }, + { + "epoch": 0.0464, + "grad_norm": 0.6750804410936759, + "learning_rate": 0.00019986565322860115, + "loss": 0.8359, + "step": 29 + }, + { + "epoch": 0.048, + "grad_norm": 0.6368391831488008, + "learning_rate": 0.00019983744805229296, + "loss": 1.0608, + "step": 30 + }, + { + "epoch": 0.0496, + "grad_norm": 0.5590401200511067, + "learning_rate": 0.00019980655971335945, + "loss": 0.8982, + "step": 31 + }, + { + "epoch": 0.0512, + "grad_norm": 0.580488933744179, + "learning_rate": 0.00019977298904193437, + "loss": 0.829, + "step": 32 + }, + { + "epoch": 0.0528, + "grad_norm": 0.5417957562757602, + "learning_rate": 0.00019973673694024, + "loss": 0.8353, + "step": 33 + }, + { + "epoch": 0.0544, + "grad_norm": 0.5212173330120858, + "learning_rate": 0.00019969780438256293, + "loss": 0.9113, + "step": 34 + }, + { + "epoch": 0.056, + "grad_norm": 0.5403546577518331, + "learning_rate": 0.0001996561924152278, + "loss": 0.9352, + "step": 35 + }, + { + "epoch": 0.0576, + "grad_norm": 0.5861554161890207, + "learning_rate": 0.0001996119021565693, + "loss": 0.7743, + "step": 36 + }, + { + "epoch": 0.0592, + "grad_norm": 0.47500481352954615, + "learning_rate": 0.0001995649347969019, + "loss": 0.8628, + "step": 37 + }, + { + "epoch": 0.0608, + "grad_norm": 0.5702370795138746, + "learning_rate": 0.00019951529159848805, + "loss": 0.969, + "step": 38 + }, + { + "epoch": 0.0624, + "grad_norm": 0.699121456839057, + "learning_rate": 0.00019946297389550433, + "loss": 0.9597, + "step": 39 + }, + { + "epoch": 0.064, + "grad_norm": 0.5030183653307678, + "learning_rate": 0.00019940798309400526, + "loss": 0.881, + "step": 40 + }, + { + "epoch": 0.0656, + "grad_norm": 0.4884693592955745, + "learning_rate": 0.0001993503206718859, + "loss": 0.8358, + "step": 41 + }, + { + "epoch": 0.0672, + "grad_norm": 0.5321361462838011, + "learning_rate": 0.00019928998817884182, + "loss": 0.955, + "step": 42 + }, + { + "epoch": 0.0688, + "grad_norm": 0.5495909582071099, + "learning_rate": 0.00019922698723632767, + "loss": 0.8622, + "step": 43 + }, + { + "epoch": 0.0704, + "grad_norm": 0.6073671597800279, + "learning_rate": 0.00019916131953751342, + "loss": 1.0347, + "step": 44 + }, + { + "epoch": 0.072, + "grad_norm": 0.4728710468686486, + "learning_rate": 0.00019909298684723904, + "loss": 0.7384, + "step": 45 + }, + { + "epoch": 0.0736, + "grad_norm": 0.7776863529098328, + "learning_rate": 0.00019902199100196697, + "loss": 0.9723, + "step": 46 + }, + { + "epoch": 0.0752, + "grad_norm": 0.4965532465566332, + "learning_rate": 0.00019894833390973266, + "loss": 0.8489, + "step": 47 + }, + { + "epoch": 0.0768, + "grad_norm": 0.6419371462528843, + "learning_rate": 0.00019887201755009357, + "loss": 0.9119, + "step": 48 + }, + { + "epoch": 0.0784, + "grad_norm": 0.4900525424240958, + "learning_rate": 0.0001987930439740757, + "loss": 0.918, + "step": 49 + }, + { + "epoch": 0.08, + "grad_norm": 0.7854429462112946, + "learning_rate": 0.00019871141530411853, + "loss": 1.0194, + "step": 50 + }, + { + "epoch": 0.0816, + "grad_norm": 0.6104771933997867, + "learning_rate": 0.0001986271337340182, + "loss": 0.9132, + "step": 51 + }, + { + "epoch": 0.0832, + "grad_norm": 0.6330639296462902, + "learning_rate": 0.00019854020152886814, + "loss": 0.9517, + "step": 52 + }, + { + "epoch": 0.0848, + "grad_norm": 0.5137018102790541, + "learning_rate": 0.0001984506210249986, + "loss": 0.8625, + "step": 53 + }, + { + "epoch": 0.0864, + "grad_norm": 0.567695702179213, + "learning_rate": 0.00019835839462991361, + "loss": 0.7726, + "step": 54 + }, + { + "epoch": 0.088, + "grad_norm": 0.6488471177320068, + "learning_rate": 0.00019826352482222638, + "loss": 0.9213, + "step": 55 + }, + { + "epoch": 0.0896, + "grad_norm": 0.5733470182435702, + "learning_rate": 0.00019816601415159263, + "loss": 0.9435, + "step": 56 + }, + { + "epoch": 0.0912, + "grad_norm": 0.5921455538031671, + "learning_rate": 0.0001980658652386421, + "loss": 0.8527, + "step": 57 + }, + { + "epoch": 0.0928, + "grad_norm": 0.6450522077889146, + "learning_rate": 0.00019796308077490817, + "loss": 0.9521, + "step": 58 + }, + { + "epoch": 0.0944, + "grad_norm": 0.4846588081306062, + "learning_rate": 0.00019785766352275542, + "loss": 0.8395, + "step": 59 + }, + { + "epoch": 0.096, + "grad_norm": 0.5775568058546156, + "learning_rate": 0.00019774961631530545, + "loss": 0.9154, + "step": 60 + }, + { + "epoch": 0.0976, + "grad_norm": 0.7377113840637111, + "learning_rate": 0.00019763894205636072, + "loss": 1.0698, + "step": 61 + }, + { + "epoch": 0.0992, + "grad_norm": 0.5605229143638871, + "learning_rate": 0.00019752564372032657, + "loss": 0.8817, + "step": 62 + }, + { + "epoch": 0.1008, + "grad_norm": 0.7495342647669407, + "learning_rate": 0.00019740972435213115, + "loss": 1.0097, + "step": 63 + }, + { + "epoch": 0.1024, + "grad_norm": 0.550991870286073, + "learning_rate": 0.00019729118706714375, + "loss": 0.9113, + "step": 64 + }, + { + "epoch": 0.104, + "grad_norm": 0.5425725126763408, + "learning_rate": 0.00019717003505109095, + "loss": 0.8622, + "step": 65 + }, + { + "epoch": 0.1056, + "grad_norm": 0.6512932570483941, + "learning_rate": 0.00019704627155997108, + "loss": 0.9442, + "step": 66 + }, + { + "epoch": 0.1072, + "grad_norm": 0.5568640098957766, + "learning_rate": 0.00019691989991996663, + "loss": 1.0085, + "step": 67 + }, + { + "epoch": 0.1088, + "grad_norm": 0.5735714231784806, + "learning_rate": 0.0001967909235273549, + "loss": 0.9677, + "step": 68 + }, + { + "epoch": 0.1104, + "grad_norm": 0.5629368451300993, + "learning_rate": 0.00019665934584841682, + "loss": 0.9292, + "step": 69 + }, + { + "epoch": 0.112, + "grad_norm": 0.5221544806967634, + "learning_rate": 0.00019652517041934356, + "loss": 0.807, + "step": 70 + }, + { + "epoch": 0.1136, + "grad_norm": 0.5097003015787136, + "learning_rate": 0.00019638840084614182, + "loss": 0.8862, + "step": 71 + }, + { + "epoch": 0.1152, + "grad_norm": 0.7130106931007483, + "learning_rate": 0.00019624904080453655, + "loss": 1.0205, + "step": 72 + }, + { + "epoch": 0.1168, + "grad_norm": 0.5909339026171039, + "learning_rate": 0.00019610709403987246, + "loss": 0.8005, + "step": 73 + }, + { + "epoch": 0.1184, + "grad_norm": 0.6205480027240491, + "learning_rate": 0.00019596256436701324, + "loss": 0.9101, + "step": 74 + }, + { + "epoch": 0.12, + "grad_norm": 0.5936367748834522, + "learning_rate": 0.000195815455670239, + "loss": 0.8914, + "step": 75 + }, + { + "epoch": 0.1216, + "grad_norm": 0.6143835874726807, + "learning_rate": 0.00019566577190314197, + "loss": 1.0257, + "step": 76 + }, + { + "epoch": 0.1232, + "grad_norm": 0.628841593688209, + "learning_rate": 0.0001955135170885202, + "loss": 0.8988, + "step": 77 + }, + { + "epoch": 0.1248, + "grad_norm": 0.65749905037523, + "learning_rate": 0.00019535869531826937, + "loss": 0.9804, + "step": 78 + }, + { + "epoch": 0.1264, + "grad_norm": 0.512109193461066, + "learning_rate": 0.00019520131075327298, + "loss": 0.8041, + "step": 79 + }, + { + "epoch": 0.128, + "grad_norm": 0.5743118892441462, + "learning_rate": 0.00019504136762329047, + "loss": 0.9568, + "step": 80 + }, + { + "epoch": 0.1296, + "grad_norm": 0.5843364516301627, + "learning_rate": 0.00019487887022684336, + "loss": 1.0273, + "step": 81 + }, + { + "epoch": 0.1312, + "grad_norm": 0.48304850535774974, + "learning_rate": 0.00019471382293110003, + "loss": 0.8177, + "step": 82 + }, + { + "epoch": 0.1328, + "grad_norm": 0.5673332343154109, + "learning_rate": 0.00019454623017175812, + "loss": 0.9013, + "step": 83 + }, + { + "epoch": 0.1344, + "grad_norm": 0.5256210172682816, + "learning_rate": 0.00019437609645292546, + "loss": 0.9962, + "step": 84 + }, + { + "epoch": 0.136, + "grad_norm": 0.5183245117161408, + "learning_rate": 0.0001942034263469989, + "loss": 0.879, + "step": 85 + }, + { + "epoch": 0.1376, + "grad_norm": 0.5788927945560342, + "learning_rate": 0.00019402822449454153, + "loss": 0.98, + "step": 86 + }, + { + "epoch": 0.1392, + "grad_norm": 0.5578708655181941, + "learning_rate": 0.00019385049560415794, + "loss": 0.8087, + "step": 87 + }, + { + "epoch": 0.1408, + "grad_norm": 0.5111618044176756, + "learning_rate": 0.00019367024445236754, + "loss": 0.8549, + "step": 88 + }, + { + "epoch": 0.1424, + "grad_norm": 0.5162760624599195, + "learning_rate": 0.00019348747588347637, + "loss": 0.8345, + "step": 89 + }, + { + "epoch": 0.144, + "grad_norm": 0.6038786090279482, + "learning_rate": 0.00019330219480944694, + "loss": 0.8071, + "step": 90 + }, + { + "epoch": 0.1456, + "grad_norm": 0.5764856734582621, + "learning_rate": 0.00019311440620976597, + "loss": 0.8512, + "step": 91 + }, + { + "epoch": 0.1472, + "grad_norm": 0.6015667973666334, + "learning_rate": 0.0001929241151313108, + "loss": 0.9051, + "step": 92 + }, + { + "epoch": 0.1488, + "grad_norm": 0.5150471254958291, + "learning_rate": 0.00019273132668821364, + "loss": 0.8612, + "step": 93 + }, + { + "epoch": 0.1504, + "grad_norm": 0.5666214066987976, + "learning_rate": 0.00019253604606172417, + "loss": 0.8885, + "step": 94 + }, + { + "epoch": 0.152, + "grad_norm": 0.5588852331852312, + "learning_rate": 0.00019233827850007027, + "loss": 0.8689, + "step": 95 + }, + { + "epoch": 0.1536, + "grad_norm": 0.619713421628621, + "learning_rate": 0.00019213802931831696, + "loss": 0.9909, + "step": 96 + }, + { + "epoch": 0.1552, + "grad_norm": 0.5268564220803525, + "learning_rate": 0.00019193530389822363, + "loss": 0.8578, + "step": 97 + }, + { + "epoch": 0.1568, + "grad_norm": 0.5514584533393524, + "learning_rate": 0.00019173010768809933, + "loss": 0.8725, + "step": 98 + }, + { + "epoch": 0.1584, + "grad_norm": 0.47718826031070855, + "learning_rate": 0.0001915224462026563, + "loss": 0.8619, + "step": 99 + }, + { + "epoch": 0.16, + "grad_norm": 0.6802385479715022, + "learning_rate": 0.00019131232502286188, + "loss": 0.8541, + "step": 100 + }, + { + "epoch": 0.1616, + "grad_norm": 0.5706682607862557, + "learning_rate": 0.0001910997497957885, + "loss": 0.8378, + "step": 101 + }, + { + "epoch": 0.1632, + "grad_norm": 0.5547872781612694, + "learning_rate": 0.00019088472623446183, + "loss": 0.8905, + "step": 102 + }, + { + "epoch": 0.1648, + "grad_norm": 0.6894764479614839, + "learning_rate": 0.00019066726011770726, + "loss": 0.9693, + "step": 103 + }, + { + "epoch": 0.1664, + "grad_norm": 0.5932521455664701, + "learning_rate": 0.0001904473572899947, + "loss": 0.9117, + "step": 104 + }, + { + "epoch": 0.168, + "grad_norm": 0.5870941936867601, + "learning_rate": 0.00019022502366128135, + "loss": 0.8487, + "step": 105 + }, + { + "epoch": 0.1696, + "grad_norm": 0.6797262861796305, + "learning_rate": 0.00019000026520685302, + "loss": 0.9535, + "step": 106 + }, + { + "epoch": 0.1712, + "grad_norm": 0.4716956058454649, + "learning_rate": 0.0001897730879671634, + "loss": 0.8018, + "step": 107 + }, + { + "epoch": 0.1728, + "grad_norm": 0.49547117978913946, + "learning_rate": 0.00018954349804767184, + "loss": 0.8445, + "step": 108 + }, + { + "epoch": 0.1744, + "grad_norm": 0.4593534956395072, + "learning_rate": 0.00018931150161867916, + "loss": 0.8172, + "step": 109 + }, + { + "epoch": 0.176, + "grad_norm": 0.6000493180254652, + "learning_rate": 0.00018907710491516199, + "loss": 0.8511, + "step": 110 + }, + { + "epoch": 0.1776, + "grad_norm": 0.5423254674909876, + "learning_rate": 0.0001888403142366049, + "loss": 0.9555, + "step": 111 + }, + { + "epoch": 0.1792, + "grad_norm": 0.49179013108092784, + "learning_rate": 0.00018860113594683148, + "loss": 0.8105, + "step": 112 + }, + { + "epoch": 0.1808, + "grad_norm": 0.5168368220680261, + "learning_rate": 0.00018835957647383303, + "loss": 0.8747, + "step": 113 + }, + { + "epoch": 0.1824, + "grad_norm": 0.6167191146008828, + "learning_rate": 0.00018811564230959588, + "loss": 0.9368, + "step": 114 + }, + { + "epoch": 0.184, + "grad_norm": 0.5340237099838037, + "learning_rate": 0.00018786934000992688, + "loss": 0.9434, + "step": 115 + }, + { + "epoch": 0.1856, + "grad_norm": 0.46618744924150934, + "learning_rate": 0.00018762067619427746, + "loss": 0.799, + "step": 116 + }, + { + "epoch": 0.1872, + "grad_norm": 0.45713405322430556, + "learning_rate": 0.00018736965754556528, + "loss": 0.8733, + "step": 117 + }, + { + "epoch": 0.1888, + "grad_norm": 0.5280690404047126, + "learning_rate": 0.00018711629080999504, + "loss": 0.8664, + "step": 118 + }, + { + "epoch": 0.1904, + "grad_norm": 0.4986696223613787, + "learning_rate": 0.00018686058279687698, + "loss": 0.9274, + "step": 119 + }, + { + "epoch": 0.192, + "grad_norm": 0.482812371619118, + "learning_rate": 0.00018660254037844388, + "loss": 0.8653, + "step": 120 + }, + { + "epoch": 0.1936, + "grad_norm": 0.6464756221598743, + "learning_rate": 0.00018634217048966637, + "loss": 0.9778, + "step": 121 + }, + { + "epoch": 0.1952, + "grad_norm": 0.49372856222969597, + "learning_rate": 0.0001860794801280666, + "loss": 0.9034, + "step": 122 + }, + { + "epoch": 0.1968, + "grad_norm": 0.6101374114608618, + "learning_rate": 0.0001858144763535302, + "loss": 1.0121, + "step": 123 + }, + { + "epoch": 0.1984, + "grad_norm": 0.5451785990596633, + "learning_rate": 0.0001855471662881164, + "loss": 0.9122, + "step": 124 + }, + { + "epoch": 0.2, + "grad_norm": 0.5325921981933636, + "learning_rate": 0.00018527755711586678, + "loss": 0.8264, + "step": 125 + }, + { + "epoch": 0.2016, + "grad_norm": 0.4374769818891512, + "learning_rate": 0.00018500565608261214, + "loss": 0.7988, + "step": 126 + }, + { + "epoch": 0.2032, + "grad_norm": 0.5792771553367638, + "learning_rate": 0.00018473147049577774, + "loss": 1.0061, + "step": 127 + }, + { + "epoch": 0.2048, + "grad_norm": 0.5482916874485357, + "learning_rate": 0.00018445500772418697, + "loss": 0.8481, + "step": 128 + }, + { + "epoch": 0.2064, + "grad_norm": 0.4732609287832736, + "learning_rate": 0.00018417627519786315, + "loss": 0.7663, + "step": 129 + }, + { + "epoch": 0.208, + "grad_norm": 0.568902212282601, + "learning_rate": 0.00018389528040783012, + "loss": 0.9781, + "step": 130 + }, + { + "epoch": 0.2096, + "grad_norm": 0.5611066018344876, + "learning_rate": 0.00018361203090591071, + "loss": 0.9428, + "step": 131 + }, + { + "epoch": 0.2112, + "grad_norm": 0.569003642407584, + "learning_rate": 0.00018332653430452376, + "loss": 0.9228, + "step": 132 + }, + { + "epoch": 0.2128, + "grad_norm": 0.49019024925155885, + "learning_rate": 0.00018303879827647975, + "loss": 0.9015, + "step": 133 + }, + { + "epoch": 0.2144, + "grad_norm": 0.7476848004489727, + "learning_rate": 0.00018274883055477436, + "loss": 1.101, + "step": 134 + }, + { + "epoch": 0.216, + "grad_norm": 0.4729681535702118, + "learning_rate": 0.00018245663893238075, + "loss": 0.8703, + "step": 135 + }, + { + "epoch": 0.2176, + "grad_norm": 0.41394830893342704, + "learning_rate": 0.00018216223126204007, + "loss": 0.7054, + "step": 136 + }, + { + "epoch": 0.2192, + "grad_norm": 0.5595697193386046, + "learning_rate": 0.00018186561545605054, + "loss": 0.863, + "step": 137 + }, + { + "epoch": 0.2208, + "grad_norm": 0.49087790464884334, + "learning_rate": 0.00018156679948605467, + "loss": 0.8048, + "step": 138 + }, + { + "epoch": 0.2224, + "grad_norm": 0.5921116083659788, + "learning_rate": 0.00018126579138282503, + "loss": 0.9277, + "step": 139 + }, + { + "epoch": 0.224, + "grad_norm": 0.5938076846018375, + "learning_rate": 0.0001809625992360485, + "loss": 0.8797, + "step": 140 + }, + { + "epoch": 0.2256, + "grad_norm": 0.5213244559998247, + "learning_rate": 0.00018065723119410884, + "loss": 0.8714, + "step": 141 + }, + { + "epoch": 0.2272, + "grad_norm": 0.7057083039155069, + "learning_rate": 0.00018034969546386757, + "loss": 1.0481, + "step": 142 + }, + { + "epoch": 0.2288, + "grad_norm": 0.47407514110812166, + "learning_rate": 0.0001800400003104436, + "loss": 0.8425, + "step": 143 + }, + { + "epoch": 0.2304, + "grad_norm": 0.5417477528697916, + "learning_rate": 0.00017972815405699103, + "loss": 0.8626, + "step": 144 + }, + { + "epoch": 0.232, + "grad_norm": 0.41983025228173904, + "learning_rate": 0.00017941416508447536, + "loss": 0.755, + "step": 145 + }, + { + "epoch": 0.2336, + "grad_norm": 0.7076736950806536, + "learning_rate": 0.0001790980418314484, + "loss": 0.9116, + "step": 146 + }, + { + "epoch": 0.2352, + "grad_norm": 0.5234432801669066, + "learning_rate": 0.00017877979279382135, + "loss": 0.811, + "step": 147 + }, + { + "epoch": 0.2368, + "grad_norm": 0.5968616055060801, + "learning_rate": 0.0001784594265246366, + "loss": 0.895, + "step": 148 + }, + { + "epoch": 0.2384, + "grad_norm": 0.793915770912191, + "learning_rate": 0.0001781369516338378, + "loss": 0.8595, + "step": 149 + }, + { + "epoch": 0.24, + "grad_norm": 0.6725638184434034, + "learning_rate": 0.00017781237678803847, + "loss": 0.8935, + "step": 150 + }, + { + "epoch": 0.2416, + "grad_norm": 0.6972977360166016, + "learning_rate": 0.000177485710710289, + "loss": 0.8783, + "step": 151 + }, + { + "epoch": 0.2432, + "grad_norm": 0.5313749491420932, + "learning_rate": 0.00017715696217984235, + "loss": 0.8844, + "step": 152 + }, + { + "epoch": 0.2448, + "grad_norm": 0.5373046766235955, + "learning_rate": 0.00017682614003191807, + "loss": 0.8982, + "step": 153 + }, + { + "epoch": 0.2464, + "grad_norm": 0.5335095314794713, + "learning_rate": 0.00017649325315746478, + "loss": 0.8746, + "step": 154 + }, + { + "epoch": 0.248, + "grad_norm": 0.4444566104179798, + "learning_rate": 0.0001761583105029213, + "loss": 0.7231, + "step": 155 + }, + { + "epoch": 0.2496, + "grad_norm": 0.5028375661309075, + "learning_rate": 0.00017582132106997616, + "loss": 0.8662, + "step": 156 + }, + { + "epoch": 0.2512, + "grad_norm": 0.4991388775797579, + "learning_rate": 0.00017548229391532572, + "loss": 0.8282, + "step": 157 + }, + { + "epoch": 0.2528, + "grad_norm": 0.47893484287330357, + "learning_rate": 0.00017514123815043074, + "loss": 0.821, + "step": 158 + }, + { + "epoch": 0.2544, + "grad_norm": 0.5067770890271315, + "learning_rate": 0.00017479816294127152, + "loss": 0.7882, + "step": 159 + }, + { + "epoch": 0.256, + "grad_norm": 0.5165413509356288, + "learning_rate": 0.0001744530775081015, + "loss": 0.8471, + "step": 160 + }, + { + "epoch": 0.2576, + "grad_norm": 0.5066161228181565, + "learning_rate": 0.0001741059911251997, + "loss": 0.9031, + "step": 161 + }, + { + "epoch": 0.2592, + "grad_norm": 0.547029143598491, + "learning_rate": 0.000173756913120621, + "loss": 0.8558, + "step": 162 + }, + { + "epoch": 0.2608, + "grad_norm": 0.5612378280968916, + "learning_rate": 0.00017340585287594604, + "loss": 0.9072, + "step": 163 + }, + { + "epoch": 0.2624, + "grad_norm": 0.6076830542007384, + "learning_rate": 0.0001730528198260285, + "loss": 0.9033, + "step": 164 + }, + { + "epoch": 0.264, + "grad_norm": 0.5216970388907911, + "learning_rate": 0.00017269782345874203, + "loss": 0.7836, + "step": 165 + }, + { + "epoch": 0.2656, + "grad_norm": 0.54503973121516, + "learning_rate": 0.00017234087331472497, + "loss": 0.9014, + "step": 166 + }, + { + "epoch": 0.2672, + "grad_norm": 0.5231650990525085, + "learning_rate": 0.00017198197898712404, + "loss": 0.8632, + "step": 167 + }, + { + "epoch": 0.2688, + "grad_norm": 0.5497366992380714, + "learning_rate": 0.00017162115012133643, + "loss": 0.9476, + "step": 168 + }, + { + "epoch": 0.2704, + "grad_norm": 0.487884829363121, + "learning_rate": 0.00017125839641475072, + "loss": 0.923, + "step": 169 + }, + { + "epoch": 0.272, + "grad_norm": 0.4527026078347616, + "learning_rate": 0.00017089372761648616, + "loss": 0.7776, + "step": 170 + }, + { + "epoch": 0.2736, + "grad_norm": 0.5636470844990181, + "learning_rate": 0.00017052715352713075, + "loss": 0.9261, + "step": 171 + }, + { + "epoch": 0.2752, + "grad_norm": 0.5861311972784812, + "learning_rate": 0.00017015868399847768, + "loss": 0.9221, + "step": 172 + }, + { + "epoch": 0.2768, + "grad_norm": 0.5761782641318623, + "learning_rate": 0.00016978832893326074, + "loss": 0.8741, + "step": 173 + }, + { + "epoch": 0.2784, + "grad_norm": 0.4680203522610832, + "learning_rate": 0.00016941609828488807, + "loss": 0.8434, + "step": 174 + }, + { + "epoch": 0.28, + "grad_norm": 0.6785300369860278, + "learning_rate": 0.0001690420020571747, + "loss": 0.9182, + "step": 175 + }, + { + "epoch": 0.2816, + "grad_norm": 0.4488327447605338, + "learning_rate": 0.0001686660503040737, + "loss": 0.8132, + "step": 176 + }, + { + "epoch": 0.2832, + "grad_norm": 0.5089109530270849, + "learning_rate": 0.00016828825312940592, + "loss": 0.9513, + "step": 177 + }, + { + "epoch": 0.2848, + "grad_norm": 0.5302057448662424, + "learning_rate": 0.0001679086206865886, + "loss": 0.9494, + "step": 178 + }, + { + "epoch": 0.2864, + "grad_norm": 0.46947064555349594, + "learning_rate": 0.00016752716317836229, + "loss": 0.7372, + "step": 179 + }, + { + "epoch": 0.288, + "grad_norm": 0.5028005654161093, + "learning_rate": 0.0001671438908565167, + "loss": 0.7989, + "step": 180 + }, + { + "epoch": 0.2896, + "grad_norm": 0.567498817932682, + "learning_rate": 0.00016675881402161536, + "loss": 0.8853, + "step": 181 + }, + { + "epoch": 0.2912, + "grad_norm": 0.4942081354946293, + "learning_rate": 0.0001663719430227186, + "loss": 0.8184, + "step": 182 + }, + { + "epoch": 0.2928, + "grad_norm": 0.5062989816972167, + "learning_rate": 0.00016598328825710533, + "loss": 0.8978, + "step": 183 + }, + { + "epoch": 0.2944, + "grad_norm": 0.4493194997623283, + "learning_rate": 0.000165592860169994, + "loss": 0.7454, + "step": 184 + }, + { + "epoch": 0.296, + "grad_norm": 0.4896729559250561, + "learning_rate": 0.00016520066925426144, + "loss": 0.835, + "step": 185 + }, + { + "epoch": 0.2976, + "grad_norm": 0.547081954048003, + "learning_rate": 0.0001648067260501611, + "loss": 0.9449, + "step": 186 + }, + { + "epoch": 0.2992, + "grad_norm": 0.5275769947570887, + "learning_rate": 0.0001644110411450398, + "loss": 0.8247, + "step": 187 + }, + { + "epoch": 0.3008, + "grad_norm": 0.6796089522335953, + "learning_rate": 0.00016401362517305296, + "loss": 0.8879, + "step": 188 + }, + { + "epoch": 0.3024, + "grad_norm": 0.5234490468144752, + "learning_rate": 0.00016361448881487914, + "loss": 0.8826, + "step": 189 + }, + { + "epoch": 0.304, + "grad_norm": 0.5277313141615699, + "learning_rate": 0.00016321364279743266, + "loss": 0.8201, + "step": 190 + }, + { + "epoch": 0.3056, + "grad_norm": 0.5308683592824113, + "learning_rate": 0.0001628110978935756, + "loss": 0.8342, + "step": 191 + }, + { + "epoch": 0.3072, + "grad_norm": 0.48418803817401984, + "learning_rate": 0.00016240686492182804, + "loss": 0.8408, + "step": 192 + }, + { + "epoch": 0.3088, + "grad_norm": 0.5928255690072303, + "learning_rate": 0.00016200095474607753, + "loss": 0.9769, + "step": 193 + }, + { + "epoch": 0.3104, + "grad_norm": 0.5171192071858057, + "learning_rate": 0.00016159337827528685, + "loss": 0.772, + "step": 194 + }, + { + "epoch": 0.312, + "grad_norm": 0.4809784376625068, + "learning_rate": 0.0001611841464632011, + "loss": 0.8239, + "step": 195 + }, + { + "epoch": 0.3136, + "grad_norm": 0.5779211230055995, + "learning_rate": 0.0001607732703080532, + "loss": 0.8529, + "step": 196 + }, + { + "epoch": 0.3152, + "grad_norm": 0.47891105435481995, + "learning_rate": 0.00016036076085226814, + "loss": 0.8373, + "step": 197 + }, + { + "epoch": 0.3168, + "grad_norm": 0.4735620706120006, + "learning_rate": 0.0001599466291821666, + "loss": 0.8068, + "step": 198 + }, + { + "epoch": 0.3184, + "grad_norm": 0.495276098160408, + "learning_rate": 0.0001595308864276666, + "loss": 0.8535, + "step": 199 + }, + { + "epoch": 0.32, + "grad_norm": 0.5419221465233577, + "learning_rate": 0.0001591135437619847, + "loss": 0.9133, + "step": 200 + }, + { + "epoch": 0.3216, + "grad_norm": 0.4504332420679137, + "learning_rate": 0.0001586946124013354, + "loss": 0.7281, + "step": 201 + }, + { + "epoch": 0.3232, + "grad_norm": 0.5600527896983009, + "learning_rate": 0.0001582741036046301, + "loss": 0.92, + "step": 202 + }, + { + "epoch": 0.3248, + "grad_norm": 0.6393077989847463, + "learning_rate": 0.00015785202867317407, + "loss": 0.8677, + "step": 203 + }, + { + "epoch": 0.3264, + "grad_norm": 0.41150204685915415, + "learning_rate": 0.00015742839895036305, + "loss": 0.7436, + "step": 204 + }, + { + "epoch": 0.328, + "grad_norm": 0.556864628656174, + "learning_rate": 0.00015700322582137827, + "loss": 0.8779, + "step": 205 + }, + { + "epoch": 0.3296, + "grad_norm": 0.5291233938095491, + "learning_rate": 0.0001565765207128805, + "loss": 0.7963, + "step": 206 + }, + { + "epoch": 0.3312, + "grad_norm": 0.5046584072913672, + "learning_rate": 0.0001561482950927029, + "loss": 0.8654, + "step": 207 + }, + { + "epoch": 0.3328, + "grad_norm": 0.5240103584920848, + "learning_rate": 0.00015571856046954285, + "loss": 0.8112, + "step": 208 + }, + { + "epoch": 0.3344, + "grad_norm": 0.5044515321240335, + "learning_rate": 0.00015528732839265272, + "loss": 0.8705, + "step": 209 + }, + { + "epoch": 0.336, + "grad_norm": 0.4938024171655887, + "learning_rate": 0.0001548546104515294, + "loss": 0.8458, + "step": 210 + }, + { + "epoch": 0.3376, + "grad_norm": 0.5781772950745199, + "learning_rate": 0.00015442041827560274, + "loss": 0.9357, + "step": 211 + }, + { + "epoch": 0.3392, + "grad_norm": 0.4885521758927064, + "learning_rate": 0.00015398476353392323, + "loss": 0.8204, + "step": 212 + }, + { + "epoch": 0.3408, + "grad_norm": 0.5716873482544789, + "learning_rate": 0.00015354765793484834, + "loss": 0.9514, + "step": 213 + }, + { + "epoch": 0.3424, + "grad_norm": 0.5310263888459398, + "learning_rate": 0.00015310911322572753, + "loss": 0.8385, + "step": 214 + }, + { + "epoch": 0.344, + "grad_norm": 0.7282643031114008, + "learning_rate": 0.000152669141192587, + "loss": 1.052, + "step": 215 + }, + { + "epoch": 0.3456, + "grad_norm": 0.5388177337573673, + "learning_rate": 0.00015222775365981273, + "loss": 0.8633, + "step": 216 + }, + { + "epoch": 0.3472, + "grad_norm": 0.6082153413889909, + "learning_rate": 0.00015178496248983254, + "loss": 0.8942, + "step": 217 + }, + { + "epoch": 0.3488, + "grad_norm": 0.4464166985290885, + "learning_rate": 0.00015134077958279765, + "loss": 0.7651, + "step": 218 + }, + { + "epoch": 0.3504, + "grad_norm": 0.5067701779711128, + "learning_rate": 0.00015089521687626243, + "loss": 0.8417, + "step": 219 + }, + { + "epoch": 0.352, + "grad_norm": 0.5422232377837102, + "learning_rate": 0.000150448286344864, + "loss": 0.8779, + "step": 220 + }, + { + "epoch": 0.3536, + "grad_norm": 0.5080300860627728, + "learning_rate": 0.00015000000000000001, + "loss": 0.8594, + "step": 221 + }, + { + "epoch": 0.3552, + "grad_norm": 0.5405630172426878, + "learning_rate": 0.00014955036988950618, + "loss": 0.8484, + "step": 222 + }, + { + "epoch": 0.3568, + "grad_norm": 0.469049466904632, + "learning_rate": 0.00014909940809733222, + "loss": 0.7741, + "step": 223 + }, + { + "epoch": 0.3584, + "grad_norm": 0.8183089260837014, + "learning_rate": 0.00014864712674321734, + "loss": 1.0074, + "step": 224 + }, + { + "epoch": 0.36, + "grad_norm": 0.565514371396207, + "learning_rate": 0.00014819353798236427, + "loss": 0.7484, + "step": 225 + }, + { + "epoch": 0.3616, + "grad_norm": 0.4560321672187736, + "learning_rate": 0.00014773865400511272, + "loss": 0.8308, + "step": 226 + }, + { + "epoch": 0.3632, + "grad_norm": 0.5094977032065641, + "learning_rate": 0.00014728248703661182, + "loss": 0.7723, + "step": 227 + }, + { + "epoch": 0.3648, + "grad_norm": 0.5465252160330539, + "learning_rate": 0.00014682504933649144, + "loss": 0.8305, + "step": 228 + }, + { + "epoch": 0.3664, + "grad_norm": 0.49877118353457706, + "learning_rate": 0.00014636635319853275, + "loss": 0.8814, + "step": 229 + }, + { + "epoch": 0.368, + "grad_norm": 0.4757523663635892, + "learning_rate": 0.00014590641095033787, + "loss": 0.8468, + "step": 230 + }, + { + "epoch": 0.3696, + "grad_norm": 0.570445975575914, + "learning_rate": 0.00014544523495299842, + "loss": 0.8768, + "step": 231 + }, + { + "epoch": 0.3712, + "grad_norm": 0.5220926560957806, + "learning_rate": 0.0001449828376007636, + "loss": 0.8339, + "step": 232 + }, + { + "epoch": 0.3728, + "grad_norm": 0.5963199291731878, + "learning_rate": 0.0001445192313207067, + "loss": 0.8413, + "step": 233 + }, + { + "epoch": 0.3744, + "grad_norm": 0.6539066454841529, + "learning_rate": 0.0001440544285723915, + "loss": 0.9081, + "step": 234 + }, + { + "epoch": 0.376, + "grad_norm": 0.5330887389011021, + "learning_rate": 0.00014358844184753712, + "loss": 0.8046, + "step": 235 + }, + { + "epoch": 0.3776, + "grad_norm": 0.5286425366443326, + "learning_rate": 0.00014312128366968243, + "loss": 0.8227, + "step": 236 + }, + { + "epoch": 0.3792, + "grad_norm": 0.5786370821844616, + "learning_rate": 0.00014265296659384956, + "loss": 0.848, + "step": 237 + }, + { + "epoch": 0.3808, + "grad_norm": 0.4410630748313712, + "learning_rate": 0.00014218350320620624, + "loss": 0.7533, + "step": 238 + }, + { + "epoch": 0.3824, + "grad_norm": 0.5041131428774268, + "learning_rate": 0.0001417129061237278, + "loss": 0.9019, + "step": 239 + }, + { + "epoch": 0.384, + "grad_norm": 0.5778662455305288, + "learning_rate": 0.00014124118799385796, + "loss": 0.9103, + "step": 240 + }, + { + "epoch": 0.3856, + "grad_norm": 0.5195096672679763, + "learning_rate": 0.00014076836149416887, + "loss": 0.7727, + "step": 241 + }, + { + "epoch": 0.3872, + "grad_norm": 0.5788932105493327, + "learning_rate": 0.0001402944393320206, + "loss": 0.9388, + "step": 242 + }, + { + "epoch": 0.3888, + "grad_norm": 0.5003143934702647, + "learning_rate": 0.00013981943424421932, + "loss": 0.844, + "step": 243 + }, + { + "epoch": 0.3904, + "grad_norm": 0.5090658710909317, + "learning_rate": 0.00013934335899667527, + "loss": 0.8011, + "step": 244 + }, + { + "epoch": 0.392, + "grad_norm": 0.6686484237982562, + "learning_rate": 0.00013886622638405952, + "loss": 0.8939, + "step": 245 + }, + { + "epoch": 0.3936, + "grad_norm": 0.4700819361928644, + "learning_rate": 0.00013838804922946027, + "loss": 0.7804, + "step": 246 + }, + { + "epoch": 0.3952, + "grad_norm": 0.477707037218358, + "learning_rate": 0.00013790884038403795, + "loss": 0.8647, + "step": 247 + }, + { + "epoch": 0.3968, + "grad_norm": 0.5100411750377246, + "learning_rate": 0.00013742861272668012, + "loss": 0.8308, + "step": 248 + }, + { + "epoch": 0.3984, + "grad_norm": 0.5520174664334276, + "learning_rate": 0.00013694737916365517, + "loss": 0.7925, + "step": 249 + }, + { + "epoch": 0.4, + "grad_norm": 0.5345884806512073, + "learning_rate": 0.00013646515262826552, + "loss": 0.8837, + "step": 250 + }, + { + "epoch": 0.4016, + "grad_norm": 0.4212316530799322, + "learning_rate": 0.0001359819460805001, + "loss": 0.732, + "step": 251 + }, + { + "epoch": 0.4032, + "grad_norm": 0.4897164868688977, + "learning_rate": 0.0001354977725066859, + "loss": 0.825, + "step": 252 + }, + { + "epoch": 0.4048, + "grad_norm": 0.49605395233621785, + "learning_rate": 0.00013501264491913906, + "loss": 0.865, + "step": 253 + }, + { + "epoch": 0.4064, + "grad_norm": 0.6625415135845697, + "learning_rate": 0.0001345265763558152, + "loss": 0.8593, + "step": 254 + }, + { + "epoch": 0.408, + "grad_norm": 0.553170643987391, + "learning_rate": 0.00013403957987995882, + "loss": 0.8705, + "step": 255 + }, + { + "epoch": 0.4096, + "grad_norm": 0.6643025128072408, + "learning_rate": 0.0001335516685797525, + "loss": 0.9046, + "step": 256 + }, + { + "epoch": 0.4112, + "grad_norm": 0.5894795402676914, + "learning_rate": 0.00013306285556796495, + "loss": 0.9416, + "step": 257 + }, + { + "epoch": 0.4128, + "grad_norm": 0.48140328572847857, + "learning_rate": 0.00013257315398159864, + "loss": 0.8482, + "step": 258 + }, + { + "epoch": 0.4144, + "grad_norm": 0.40453568550694935, + "learning_rate": 0.00013208257698153677, + "loss": 0.7391, + "step": 259 + }, + { + "epoch": 0.416, + "grad_norm": 0.5164507723111476, + "learning_rate": 0.00013159113775218964, + "loss": 0.7956, + "step": 260 + }, + { + "epoch": 0.4176, + "grad_norm": 0.4615978212018176, + "learning_rate": 0.00013109884950114007, + "loss": 0.8318, + "step": 261 + }, + { + "epoch": 0.4192, + "grad_norm": 0.3542694167031475, + "learning_rate": 0.00013060572545878875, + "loss": 0.7276, + "step": 262 + }, + { + "epoch": 0.4208, + "grad_norm": 0.4704020278094193, + "learning_rate": 0.00013011177887799845, + "loss": 0.7627, + "step": 263 + }, + { + "epoch": 0.4224, + "grad_norm": 0.51176404471867, + "learning_rate": 0.00012961702303373795, + "loss": 0.7088, + "step": 264 + }, + { + "epoch": 0.424, + "grad_norm": 0.5172875242169087, + "learning_rate": 0.00012912147122272523, + "loss": 0.8318, + "step": 265 + }, + { + "epoch": 0.4256, + "grad_norm": 0.4818432249996395, + "learning_rate": 0.00012862513676307008, + "loss": 0.8178, + "step": 266 + }, + { + "epoch": 0.4272, + "grad_norm": 0.53278599717518, + "learning_rate": 0.00012812803299391628, + "loss": 0.793, + "step": 267 + }, + { + "epoch": 0.4288, + "grad_norm": 0.5089707307645357, + "learning_rate": 0.00012763017327508305, + "loss": 0.7884, + "step": 268 + }, + { + "epoch": 0.4304, + "grad_norm": 0.5844341690431064, + "learning_rate": 0.0001271315709867059, + "loss": 0.9081, + "step": 269 + }, + { + "epoch": 0.432, + "grad_norm": 0.47824424220999784, + "learning_rate": 0.00012663223952887723, + "loss": 0.7801, + "step": 270 + }, + { + "epoch": 0.4336, + "grad_norm": 0.4688946946883383, + "learning_rate": 0.00012613219232128608, + "loss": 0.8726, + "step": 271 + }, + { + "epoch": 0.4352, + "grad_norm": 0.5863951665528098, + "learning_rate": 0.00012563144280285741, + "loss": 0.9242, + "step": 272 + }, + { + "epoch": 0.4368, + "grad_norm": 0.4986765195767016, + "learning_rate": 0.00012513000443139112, + "loss": 0.8647, + "step": 273 + }, + { + "epoch": 0.4384, + "grad_norm": 0.44194973191631093, + "learning_rate": 0.00012462789068320017, + "loss": 0.7731, + "step": 274 + }, + { + "epoch": 0.44, + "grad_norm": 0.524763930809003, + "learning_rate": 0.00012412511505274844, + "loss": 0.7547, + "step": 275 + }, + { + "epoch": 0.4416, + "grad_norm": 0.4791715166387513, + "learning_rate": 0.00012362169105228826, + "loss": 0.8751, + "step": 276 + }, + { + "epoch": 0.4432, + "grad_norm": 0.5291947028572765, + "learning_rate": 0.000123117632211497, + "loss": 0.8063, + "step": 277 + }, + { + "epoch": 0.4448, + "grad_norm": 0.5291253034564499, + "learning_rate": 0.00012261295207711346, + "loss": 0.8164, + "step": 278 + }, + { + "epoch": 0.4464, + "grad_norm": 0.6087123742013226, + "learning_rate": 0.0001221076642125742, + "loss": 0.8764, + "step": 279 + }, + { + "epoch": 0.448, + "grad_norm": 0.5088002178570323, + "learning_rate": 0.00012160178219764837, + "loss": 0.9018, + "step": 280 + }, + { + "epoch": 0.4496, + "grad_norm": 0.5482575319396714, + "learning_rate": 0.00012109531962807332, + "loss": 0.7488, + "step": 281 + }, + { + "epoch": 0.4512, + "grad_norm": 0.4918049872566382, + "learning_rate": 0.00012058829011518896, + "loss": 0.785, + "step": 282 + }, + { + "epoch": 0.4528, + "grad_norm": 0.48992359397276064, + "learning_rate": 0.00012008070728557186, + "loss": 0.8477, + "step": 283 + }, + { + "epoch": 0.4544, + "grad_norm": 0.47183065490104775, + "learning_rate": 0.00011957258478066931, + "loss": 0.7776, + "step": 284 + }, + { + "epoch": 0.456, + "grad_norm": 0.48219118142939943, + "learning_rate": 0.00011906393625643244, + "loss": 0.781, + "step": 285 + }, + { + "epoch": 0.4576, + "grad_norm": 0.5623320974572162, + "learning_rate": 0.00011855477538294935, + "loss": 0.8046, + "step": 286 + }, + { + "epoch": 0.4592, + "grad_norm": 0.4304120017785138, + "learning_rate": 0.00011804511584407763, + "loss": 0.7408, + "step": 287 + }, + { + "epoch": 0.4608, + "grad_norm": 0.5138640645168956, + "learning_rate": 0.00011753497133707679, + "loss": 0.8282, + "step": 288 + }, + { + "epoch": 0.4624, + "grad_norm": 0.5168258684089355, + "learning_rate": 0.00011702435557223987, + "loss": 0.8046, + "step": 289 + }, + { + "epoch": 0.464, + "grad_norm": 0.4901583230171197, + "learning_rate": 0.00011651328227252517, + "loss": 0.7845, + "step": 290 + }, + { + "epoch": 0.4656, + "grad_norm": 0.4501375535429379, + "learning_rate": 0.00011600176517318741, + "loss": 0.8194, + "step": 291 + }, + { + "epoch": 0.4672, + "grad_norm": 0.5752810272957825, + "learning_rate": 0.00011548981802140848, + "loss": 0.9429, + "step": 292 + }, + { + "epoch": 0.4688, + "grad_norm": 0.4843449013623304, + "learning_rate": 0.00011497745457592816, + "loss": 0.7618, + "step": 293 + }, + { + "epoch": 0.4704, + "grad_norm": 0.4389650913586527, + "learning_rate": 0.00011446468860667421, + "loss": 0.8281, + "step": 294 + }, + { + "epoch": 0.472, + "grad_norm": 0.38966803549086076, + "learning_rate": 0.00011395153389439233, + "loss": 0.6863, + "step": 295 + }, + { + "epoch": 0.4736, + "grad_norm": 0.6881335554250402, + "learning_rate": 0.00011343800423027582, + "loss": 0.8981, + "step": 296 + }, + { + "epoch": 0.4752, + "grad_norm": 0.4368376239595057, + "learning_rate": 0.0001129241134155949, + "loss": 0.8062, + "step": 297 + }, + { + "epoch": 0.4768, + "grad_norm": 0.47980937375435045, + "learning_rate": 0.00011240987526132594, + "loss": 0.8022, + "step": 298 + }, + { + "epoch": 0.4784, + "grad_norm": 0.5063687813395686, + "learning_rate": 0.00011189530358778005, + "loss": 0.8474, + "step": 299 + }, + { + "epoch": 0.48, + "grad_norm": 0.517176521428431, + "learning_rate": 0.00011138041222423177, + "loss": 0.8625, + "step": 300 + }, + { + "epoch": 0.4816, + "grad_norm": 0.525563910745772, + "learning_rate": 0.00011086521500854745, + "loss": 0.8449, + "step": 301 + }, + { + "epoch": 0.4832, + "grad_norm": 0.43808905380694463, + "learning_rate": 0.00011034972578681338, + "loss": 0.8178, + "step": 302 + }, + { + "epoch": 0.4848, + "grad_norm": 0.5380704534450393, + "learning_rate": 0.00010983395841296348, + "loss": 0.8145, + "step": 303 + }, + { + "epoch": 0.4864, + "grad_norm": 0.46864468049328967, + "learning_rate": 0.00010931792674840718, + "loss": 0.8781, + "step": 304 + }, + { + "epoch": 0.488, + "grad_norm": 0.6074831901623694, + "learning_rate": 0.00010880164466165674, + "loss": 0.9485, + "step": 305 + }, + { + "epoch": 0.4896, + "grad_norm": 0.41120141930841214, + "learning_rate": 0.00010828512602795462, + "loss": 0.6888, + "step": 306 + }, + { + "epoch": 0.4912, + "grad_norm": 0.5125835564124221, + "learning_rate": 0.00010776838472890065, + "loss": 0.7902, + "step": 307 + }, + { + "epoch": 0.4928, + "grad_norm": 0.4531825151746322, + "learning_rate": 0.00010725143465207867, + "loss": 0.7998, + "step": 308 + }, + { + "epoch": 0.4944, + "grad_norm": 0.5178564006244549, + "learning_rate": 0.00010673428969068364, + "loss": 0.8318, + "step": 309 + }, + { + "epoch": 0.496, + "grad_norm": 0.5414206162664771, + "learning_rate": 0.00010621696374314807, + "loss": 0.8915, + "step": 310 + }, + { + "epoch": 0.4976, + "grad_norm": 0.5253911905963955, + "learning_rate": 0.00010569947071276847, + "loss": 0.839, + "step": 311 + }, + { + "epoch": 0.4992, + "grad_norm": 0.624540846119554, + "learning_rate": 0.00010518182450733186, + "loss": 0.95, + "step": 312 + }, + { + "epoch": 0.5008, + "grad_norm": 0.39705628784380886, + "learning_rate": 0.00010466403903874176, + "loss": 0.6871, + "step": 313 + }, + { + "epoch": 0.5024, + "grad_norm": 0.4799080580696934, + "learning_rate": 0.00010414612822264455, + "loss": 0.9107, + "step": 314 + }, + { + "epoch": 0.504, + "grad_norm": 0.5105695206974173, + "learning_rate": 0.00010362810597805526, + "loss": 0.8726, + "step": 315 + }, + { + "epoch": 0.5056, + "grad_norm": 0.49379073641651894, + "learning_rate": 0.0001031099862269837, + "loss": 0.7204, + "step": 316 + }, + { + "epoch": 0.5072, + "grad_norm": 0.4591049058322344, + "learning_rate": 0.00010259178289406011, + "loss": 0.8084, + "step": 317 + }, + { + "epoch": 0.5088, + "grad_norm": 0.48025358544189095, + "learning_rate": 0.00010207350990616107, + "loss": 0.8382, + "step": 318 + }, + { + "epoch": 0.5104, + "grad_norm": 0.5409338192777509, + "learning_rate": 0.0001015551811920351, + "loss": 0.813, + "step": 319 + }, + { + "epoch": 0.512, + "grad_norm": 0.5043038446677717, + "learning_rate": 0.00010103681068192845, + "loss": 0.8907, + "step": 320 + }, + { + "epoch": 0.5136, + "grad_norm": 0.48909846693864667, + "learning_rate": 0.00010051841230721065, + "loss": 0.7788, + "step": 321 + }, + { + "epoch": 0.5152, + "grad_norm": 0.42189383231534316, + "learning_rate": 0.0001, + "loss": 0.7254, + "step": 322 + }, + { + "epoch": 0.5168, + "grad_norm": 0.6498002750364892, + "learning_rate": 9.948158769278939e-05, + "loss": 0.9458, + "step": 323 + }, + { + "epoch": 0.5184, + "grad_norm": 0.5860702994817408, + "learning_rate": 9.896318931807155e-05, + "loss": 0.8266, + "step": 324 + }, + { + "epoch": 0.52, + "grad_norm": 0.5226966210384961, + "learning_rate": 9.844481880796491e-05, + "loss": 0.8119, + "step": 325 + }, + { + "epoch": 0.5216, + "grad_norm": 0.6643669732939301, + "learning_rate": 9.792649009383899e-05, + "loss": 0.9326, + "step": 326 + }, + { + "epoch": 0.5232, + "grad_norm": 0.43800478347384875, + "learning_rate": 9.740821710593989e-05, + "loss": 0.6661, + "step": 327 + }, + { + "epoch": 0.5248, + "grad_norm": 0.5243694827992629, + "learning_rate": 9.689001377301633e-05, + "loss": 0.7644, + "step": 328 + }, + { + "epoch": 0.5264, + "grad_norm": 0.45520338789668585, + "learning_rate": 9.637189402194476e-05, + "loss": 0.7966, + "step": 329 + }, + { + "epoch": 0.528, + "grad_norm": 0.43992966853064996, + "learning_rate": 9.585387177735547e-05, + "loss": 0.708, + "step": 330 + }, + { + "epoch": 0.5296, + "grad_norm": 0.48807994095775004, + "learning_rate": 9.533596096125825e-05, + "loss": 0.7901, + "step": 331 + }, + { + "epoch": 0.5312, + "grad_norm": 0.40728153509013215, + "learning_rate": 9.481817549266817e-05, + "loss": 0.6695, + "step": 332 + }, + { + "epoch": 0.5328, + "grad_norm": 0.5246214434675819, + "learning_rate": 9.430052928723153e-05, + "loss": 0.8237, + "step": 333 + }, + { + "epoch": 0.5344, + "grad_norm": 0.4145944652837226, + "learning_rate": 9.378303625685195e-05, + "loss": 0.7821, + "step": 334 + }, + { + "epoch": 0.536, + "grad_norm": 0.385472132727159, + "learning_rate": 9.326571030931637e-05, + "loss": 0.7369, + "step": 335 + }, + { + "epoch": 0.5376, + "grad_norm": 0.4082563502752439, + "learning_rate": 9.274856534792138e-05, + "loss": 0.7811, + "step": 336 + }, + { + "epoch": 0.5392, + "grad_norm": 0.4961172882055245, + "learning_rate": 9.223161527109937e-05, + "loss": 0.7675, + "step": 337 + }, + { + "epoch": 0.5408, + "grad_norm": 0.5595008692081654, + "learning_rate": 9.171487397204539e-05, + "loss": 0.8228, + "step": 338 + }, + { + "epoch": 0.5424, + "grad_norm": 0.5736206767177207, + "learning_rate": 9.119835533834331e-05, + "loss": 0.7259, + "step": 339 + }, + { + "epoch": 0.544, + "grad_norm": 0.48020249717554114, + "learning_rate": 9.068207325159284e-05, + "loss": 0.7927, + "step": 340 + }, + { + "epoch": 0.5456, + "grad_norm": 0.45753509008065996, + "learning_rate": 9.016604158703654e-05, + "loss": 0.742, + "step": 341 + }, + { + "epoch": 0.5472, + "grad_norm": 0.5851829562502802, + "learning_rate": 8.965027421318665e-05, + "loss": 0.9126, + "step": 342 + }, + { + "epoch": 0.5488, + "grad_norm": 0.4498037450909005, + "learning_rate": 8.913478499145254e-05, + "loss": 0.7648, + "step": 343 + }, + { + "epoch": 0.5504, + "grad_norm": 0.4802454965679165, + "learning_rate": 8.861958777576827e-05, + "loss": 0.7847, + "step": 344 + }, + { + "epoch": 0.552, + "grad_norm": 0.541887107283236, + "learning_rate": 8.810469641222001e-05, + "loss": 0.7783, + "step": 345 + }, + { + "epoch": 0.5536, + "grad_norm": 0.37235854465942453, + "learning_rate": 8.759012473867407e-05, + "loss": 0.715, + "step": 346 + }, + { + "epoch": 0.5552, + "grad_norm": 0.5467283085990748, + "learning_rate": 8.707588658440511e-05, + "loss": 0.808, + "step": 347 + }, + { + "epoch": 0.5568, + "grad_norm": 0.4112013023922369, + "learning_rate": 8.656199576972423e-05, + "loss": 0.7234, + "step": 348 + }, + { + "epoch": 0.5584, + "grad_norm": 0.5217650746703775, + "learning_rate": 8.604846610560771e-05, + "loss": 0.8946, + "step": 349 + }, + { + "epoch": 0.56, + "grad_norm": 0.5472679043608458, + "learning_rate": 8.553531139332582e-05, + "loss": 0.9063, + "step": 350 + }, + { + "epoch": 0.5616, + "grad_norm": 0.5696625213498148, + "learning_rate": 8.502254542407186e-05, + "loss": 0.7161, + "step": 351 + }, + { + "epoch": 0.5632, + "grad_norm": 0.4604926826480909, + "learning_rate": 8.451018197859153e-05, + "loss": 0.899, + "step": 352 + }, + { + "epoch": 0.5648, + "grad_norm": 0.398273916569123, + "learning_rate": 8.399823482681262e-05, + "loss": 0.7019, + "step": 353 + }, + { + "epoch": 0.5664, + "grad_norm": 0.4701540042555817, + "learning_rate": 8.348671772747487e-05, + "loss": 0.7066, + "step": 354 + }, + { + "epoch": 0.568, + "grad_norm": 0.5286874784470306, + "learning_rate": 8.297564442776014e-05, + "loss": 0.8726, + "step": 355 + }, + { + "epoch": 0.5696, + "grad_norm": 0.4697754629410804, + "learning_rate": 8.246502866292324e-05, + "loss": 0.7847, + "step": 356 + }, + { + "epoch": 0.5712, + "grad_norm": 0.47148247200066923, + "learning_rate": 8.195488415592238e-05, + "loss": 0.7888, + "step": 357 + }, + { + "epoch": 0.5728, + "grad_norm": 0.7228789592171904, + "learning_rate": 8.144522461705067e-05, + "loss": 0.8493, + "step": 358 + }, + { + "epoch": 0.5744, + "grad_norm": 0.571409757185734, + "learning_rate": 8.093606374356759e-05, + "loss": 0.9002, + "step": 359 + }, + { + "epoch": 0.576, + "grad_norm": 0.6996011540301896, + "learning_rate": 8.042741521933071e-05, + "loss": 0.9761, + "step": 360 + }, + { + "epoch": 0.5776, + "grad_norm": 0.5315193279605486, + "learning_rate": 7.991929271442817e-05, + "loss": 0.9298, + "step": 361 + }, + { + "epoch": 0.5792, + "grad_norm": 0.5173413967085707, + "learning_rate": 7.941170988481108e-05, + "loss": 0.7964, + "step": 362 + }, + { + "epoch": 0.5808, + "grad_norm": 0.49680647790677457, + "learning_rate": 7.89046803719267e-05, + "loss": 0.8238, + "step": 363 + }, + { + "epoch": 0.5824, + "grad_norm": 0.4823969047811752, + "learning_rate": 7.839821780235168e-05, + "loss": 0.7636, + "step": 364 + }, + { + "epoch": 0.584, + "grad_norm": 0.5240306301645788, + "learning_rate": 7.789233578742582e-05, + "loss": 0.824, + "step": 365 + }, + { + "epoch": 0.5856, + "grad_norm": 0.5044596704389451, + "learning_rate": 7.738704792288655e-05, + "loss": 0.8629, + "step": 366 + }, + { + "epoch": 0.5872, + "grad_norm": 0.6192386484258986, + "learning_rate": 7.688236778850306e-05, + "loss": 0.8826, + "step": 367 + }, + { + "epoch": 0.5888, + "grad_norm": 0.44443525426370883, + "learning_rate": 7.637830894771175e-05, + "loss": 0.826, + "step": 368 + }, + { + "epoch": 0.5904, + "grad_norm": 0.3859963085998379, + "learning_rate": 7.587488494725157e-05, + "loss": 0.693, + "step": 369 + }, + { + "epoch": 0.592, + "grad_norm": 0.4070405251672028, + "learning_rate": 7.537210931679987e-05, + "loss": 0.717, + "step": 370 + }, + { + "epoch": 0.5936, + "grad_norm": 0.5190833271778817, + "learning_rate": 7.48699955686089e-05, + "loss": 0.7967, + "step": 371 + }, + { + "epoch": 0.5952, + "grad_norm": 0.547840043801605, + "learning_rate": 7.43685571971426e-05, + "loss": 0.8314, + "step": 372 + }, + { + "epoch": 0.5968, + "grad_norm": 0.3432880095460033, + "learning_rate": 7.386780767871397e-05, + "loss": 0.647, + "step": 373 + }, + { + "epoch": 0.5984, + "grad_norm": 0.5316470427477119, + "learning_rate": 7.336776047112276e-05, + "loss": 0.863, + "step": 374 + }, + { + "epoch": 0.6, + "grad_norm": 0.4811809918367705, + "learning_rate": 7.286842901329412e-05, + "loss": 0.8589, + "step": 375 + }, + { + "epoch": 0.6016, + "grad_norm": 0.587522593255453, + "learning_rate": 7.236982672491698e-05, + "loss": 0.9239, + "step": 376 + }, + { + "epoch": 0.6032, + "grad_norm": 0.5101390040859074, + "learning_rate": 7.187196700608373e-05, + "loss": 0.8205, + "step": 377 + }, + { + "epoch": 0.6048, + "grad_norm": 0.47299161769213066, + "learning_rate": 7.137486323692995e-05, + "loss": 0.7769, + "step": 378 + }, + { + "epoch": 0.6064, + "grad_norm": 0.4980906621494592, + "learning_rate": 7.087852877727481e-05, + "loss": 0.7362, + "step": 379 + }, + { + "epoch": 0.608, + "grad_norm": 0.45084007543847454, + "learning_rate": 7.038297696626206e-05, + "loss": 0.8285, + "step": 380 + }, + { + "epoch": 0.6096, + "grad_norm": 0.4836606210247619, + "learning_rate": 6.988822112200156e-05, + "loss": 0.7968, + "step": 381 + }, + { + "epoch": 0.6112, + "grad_norm": 0.3951160758966632, + "learning_rate": 6.939427454121128e-05, + "loss": 0.7392, + "step": 382 + }, + { + "epoch": 0.6128, + "grad_norm": 0.45778817160921503, + "learning_rate": 6.890115049885994e-05, + "loss": 0.7986, + "step": 383 + }, + { + "epoch": 0.6144, + "grad_norm": 0.4843763278989223, + "learning_rate": 6.84088622478104e-05, + "loss": 0.7425, + "step": 384 + }, + { + "epoch": 0.616, + "grad_norm": 0.45512976181686055, + "learning_rate": 6.791742301846326e-05, + "loss": 0.7525, + "step": 385 + }, + { + "epoch": 0.6176, + "grad_norm": 0.7081591028683811, + "learning_rate": 6.742684601840141e-05, + "loss": 0.7895, + "step": 386 + }, + { + "epoch": 0.6192, + "grad_norm": 0.39282677609329236, + "learning_rate": 6.693714443203507e-05, + "loss": 0.7479, + "step": 387 + }, + { + "epoch": 0.6208, + "grad_norm": 0.5164174239408281, + "learning_rate": 6.644833142024751e-05, + "loss": 0.8474, + "step": 388 + }, + { + "epoch": 0.6224, + "grad_norm": 0.45594928656170175, + "learning_rate": 6.59604201200412e-05, + "loss": 0.7878, + "step": 389 + }, + { + "epoch": 0.624, + "grad_norm": 0.45416952920551357, + "learning_rate": 6.547342364418481e-05, + "loss": 0.7734, + "step": 390 + }, + { + "epoch": 0.6256, + "grad_norm": 0.5455294041502883, + "learning_rate": 6.498735508086093e-05, + "loss": 0.8201, + "step": 391 + }, + { + "epoch": 0.6272, + "grad_norm": 0.35790769735856265, + "learning_rate": 6.450222749331414e-05, + "loss": 0.7394, + "step": 392 + }, + { + "epoch": 0.6288, + "grad_norm": 0.5229477084868099, + "learning_rate": 6.40180539194999e-05, + "loss": 0.7682, + "step": 393 + }, + { + "epoch": 0.6304, + "grad_norm": 0.47717369586725983, + "learning_rate": 6.35348473717345e-05, + "loss": 0.7614, + "step": 394 + }, + { + "epoch": 0.632, + "grad_norm": 0.5037374425648201, + "learning_rate": 6.305262083634488e-05, + "loss": 0.7593, + "step": 395 + }, + { + "epoch": 0.6336, + "grad_norm": 0.5954151705753504, + "learning_rate": 6.25713872733199e-05, + "loss": 0.8331, + "step": 396 + }, + { + "epoch": 0.6352, + "grad_norm": 0.40732967945556364, + "learning_rate": 6.209115961596208e-05, + "loss": 0.6671, + "step": 397 + }, + { + "epoch": 0.6368, + "grad_norm": 0.5883567547120468, + "learning_rate": 6.161195077053976e-05, + "loss": 1.0437, + "step": 398 + }, + { + "epoch": 0.6384, + "grad_norm": 0.36897944283799283, + "learning_rate": 6.113377361594049e-05, + "loss": 0.669, + "step": 399 + }, + { + "epoch": 0.64, + "grad_norm": 0.5462323767313574, + "learning_rate": 6.065664100332478e-05, + "loss": 0.7599, + "step": 400 + }, + { + "epoch": 0.6416, + "grad_norm": 0.5457773022621287, + "learning_rate": 6.018056575578075e-05, + "loss": 0.8281, + "step": 401 + }, + { + "epoch": 0.6432, + "grad_norm": 0.5010164358058302, + "learning_rate": 5.970556066797941e-05, + "loss": 0.6825, + "step": 402 + }, + { + "epoch": 0.6448, + "grad_norm": 0.4512123705344787, + "learning_rate": 5.923163850583113e-05, + "loss": 0.7603, + "step": 403 + }, + { + "epoch": 0.6464, + "grad_norm": 0.48649071526542725, + "learning_rate": 5.875881200614207e-05, + "loss": 0.898, + "step": 404 + }, + { + "epoch": 0.648, + "grad_norm": 0.39953097613155114, + "learning_rate": 5.828709387627218e-05, + "loss": 0.77, + "step": 405 + }, + { + "epoch": 0.6496, + "grad_norm": 0.4872181690140072, + "learning_rate": 5.781649679379378e-05, + "loss": 0.8275, + "step": 406 + }, + { + "epoch": 0.6512, + "grad_norm": 0.43255402627612966, + "learning_rate": 5.73470334061505e-05, + "loss": 0.7495, + "step": 407 + }, + { + "epoch": 0.6528, + "grad_norm": 0.6018477874503864, + "learning_rate": 5.687871633031754e-05, + "loss": 0.8628, + "step": 408 + }, + { + "epoch": 0.6544, + "grad_norm": 0.5181961379417853, + "learning_rate": 5.6411558152462894e-05, + "loss": 0.8858, + "step": 409 + }, + { + "epoch": 0.656, + "grad_norm": 0.47762880599177837, + "learning_rate": 5.5945571427608526e-05, + "loss": 0.8257, + "step": 410 + }, + { + "epoch": 0.6576, + "grad_norm": 0.666641390882161, + "learning_rate": 5.54807686792933e-05, + "loss": 0.8793, + "step": 411 + }, + { + "epoch": 0.6592, + "grad_norm": 0.5921649166111956, + "learning_rate": 5.501716239923642e-05, + "loss": 0.8918, + "step": 412 + }, + { + "epoch": 0.6608, + "grad_norm": 0.5273286616462989, + "learning_rate": 5.4554765047001613e-05, + "loss": 0.8519, + "step": 413 + }, + { + "epoch": 0.6624, + "grad_norm": 0.5399503768360546, + "learning_rate": 5.4093589049662175e-05, + "loss": 0.9754, + "step": 414 + }, + { + "epoch": 0.664, + "grad_norm": 0.4764747755157641, + "learning_rate": 5.363364680146725e-05, + "loss": 0.7807, + "step": 415 + }, + { + "epoch": 0.6656, + "grad_norm": 0.4952819078850315, + "learning_rate": 5.31749506635086e-05, + "loss": 0.7823, + "step": 416 + }, + { + "epoch": 0.6672, + "grad_norm": 0.48267213098026895, + "learning_rate": 5.271751296338823e-05, + "loss": 0.7564, + "step": 417 + }, + { + "epoch": 0.6688, + "grad_norm": 0.4463758435676959, + "learning_rate": 5.226134599488728e-05, + "loss": 0.7389, + "step": 418 + }, + { + "epoch": 0.6704, + "grad_norm": 0.43327085763464696, + "learning_rate": 5.180646201763577e-05, + "loss": 0.6847, + "step": 419 + }, + { + "epoch": 0.672, + "grad_norm": 0.6036131191772912, + "learning_rate": 5.135287325678271e-05, + "loss": 0.8528, + "step": 420 + }, + { + "epoch": 0.6736, + "grad_norm": 0.42564486228965653, + "learning_rate": 5.090059190266779e-05, + "loss": 0.7601, + "step": 421 + }, + { + "epoch": 0.6752, + "grad_norm": 0.4677345793076768, + "learning_rate": 5.0449630110493836e-05, + "loss": 0.7813, + "step": 422 + }, + { + "epoch": 0.6768, + "grad_norm": 0.4323200788320825, + "learning_rate": 5.000000000000002e-05, + "loss": 0.6346, + "step": 423 + }, + { + "epoch": 0.6784, + "grad_norm": 0.4308229821971401, + "learning_rate": 4.955171365513603e-05, + "loss": 0.7071, + "step": 424 + }, + { + "epoch": 0.68, + "grad_norm": 0.5102181049077796, + "learning_rate": 4.9104783123737566e-05, + "loss": 0.8207, + "step": 425 + }, + { + "epoch": 0.6816, + "grad_norm": 0.5516546810893967, + "learning_rate": 4.865922041720239e-05, + "loss": 0.7675, + "step": 426 + }, + { + "epoch": 0.6832, + "grad_norm": 0.6743727371325823, + "learning_rate": 4.821503751016746e-05, + "loss": 0.8203, + "step": 427 + }, + { + "epoch": 0.6848, + "grad_norm": 0.6068926449512423, + "learning_rate": 4.777224634018732e-05, + "loss": 0.794, + "step": 428 + }, + { + "epoch": 0.6864, + "grad_norm": 0.7362811712148732, + "learning_rate": 4.733085880741301e-05, + "loss": 0.7252, + "step": 429 + }, + { + "epoch": 0.688, + "grad_norm": 0.4210000906061205, + "learning_rate": 4.689088677427249e-05, + "loss": 0.7346, + "step": 430 + }, + { + "epoch": 0.6896, + "grad_norm": 0.5725440779433182, + "learning_rate": 4.645234206515171e-05, + "loss": 0.7607, + "step": 431 + }, + { + "epoch": 0.6912, + "grad_norm": 0.5351904912564988, + "learning_rate": 4.6015236466076747e-05, + "loss": 0.6999, + "step": 432 + }, + { + "epoch": 0.6928, + "grad_norm": 0.425204369840092, + "learning_rate": 4.5579581724397255e-05, + "loss": 0.7229, + "step": 433 + }, + { + "epoch": 0.6944, + "grad_norm": 0.4356200944892181, + "learning_rate": 4.514538954847064e-05, + "loss": 0.7624, + "step": 434 + }, + { + "epoch": 0.696, + "grad_norm": 0.46971976933807047, + "learning_rate": 4.471267160734731e-05, + "loss": 0.7964, + "step": 435 + }, + { + "epoch": 0.6976, + "grad_norm": 0.4742280691806812, + "learning_rate": 4.428143953045717e-05, + "loss": 0.7456, + "step": 436 + }, + { + "epoch": 0.6992, + "grad_norm": 0.5749318956736674, + "learning_rate": 4.385170490729712e-05, + "loss": 0.9022, + "step": 437 + }, + { + "epoch": 0.7008, + "grad_norm": 0.44099195153929255, + "learning_rate": 4.342347928711953e-05, + "loss": 0.7516, + "step": 438 + }, + { + "epoch": 0.7024, + "grad_norm": 0.45087710875438175, + "learning_rate": 4.2996774178621736e-05, + "loss": 0.7756, + "step": 439 + }, + { + "epoch": 0.704, + "grad_norm": 0.4574012541894237, + "learning_rate": 4.257160104963696e-05, + "loss": 0.7861, + "step": 440 + }, + { + "epoch": 0.7056, + "grad_norm": 0.4044503407198491, + "learning_rate": 4.2147971326825966e-05, + "loss": 0.687, + "step": 441 + }, + { + "epoch": 0.7072, + "grad_norm": 0.5482144557534906, + "learning_rate": 4.172589639536991e-05, + "loss": 0.6576, + "step": 442 + }, + { + "epoch": 0.7088, + "grad_norm": 0.4072979799478154, + "learning_rate": 4.130538759866457e-05, + "loss": 0.7551, + "step": 443 + }, + { + "epoch": 0.7104, + "grad_norm": 0.5519913893567424, + "learning_rate": 4.088645623801534e-05, + "loss": 0.7751, + "step": 444 + }, + { + "epoch": 0.712, + "grad_norm": 0.427572422181255, + "learning_rate": 4.046911357233343e-05, + "loss": 0.7599, + "step": 445 + }, + { + "epoch": 0.7136, + "grad_norm": 0.5955275501300678, + "learning_rate": 4.00533708178334e-05, + "loss": 0.8783, + "step": 446 + }, + { + "epoch": 0.7152, + "grad_norm": 0.5517377691278346, + "learning_rate": 3.963923914773187e-05, + "loss": 0.8424, + "step": 447 + }, + { + "epoch": 0.7168, + "grad_norm": 0.5541228187635037, + "learning_rate": 3.922672969194686e-05, + "loss": 0.7198, + "step": 448 + }, + { + "epoch": 0.7184, + "grad_norm": 0.3991511164885935, + "learning_rate": 3.8815853536798904e-05, + "loss": 0.732, + "step": 449 + }, + { + "epoch": 0.72, + "grad_norm": 0.5060165936708827, + "learning_rate": 3.840662172471315e-05, + "loss": 0.7768, + "step": 450 + }, + { + "epoch": 0.7216, + "grad_norm": 0.48672533089925063, + "learning_rate": 3.79990452539225e-05, + "loss": 0.8013, + "step": 451 + }, + { + "epoch": 0.7232, + "grad_norm": 0.5253671257368212, + "learning_rate": 3.759313507817196e-05, + "loss": 0.7999, + "step": 452 + }, + { + "epoch": 0.7248, + "grad_norm": 0.3633770596188525, + "learning_rate": 3.7188902106424416e-05, + "loss": 0.7074, + "step": 453 + }, + { + "epoch": 0.7264, + "grad_norm": 0.449494471923052, + "learning_rate": 3.678635720256737e-05, + "loss": 0.8176, + "step": 454 + }, + { + "epoch": 0.728, + "grad_norm": 0.432320602559632, + "learning_rate": 3.638551118512089e-05, + "loss": 0.684, + "step": 455 + }, + { + "epoch": 0.7296, + "grad_norm": 0.4434694550352049, + "learning_rate": 3.5986374826947066e-05, + "loss": 0.7871, + "step": 456 + }, + { + "epoch": 0.7312, + "grad_norm": 0.8206516083660113, + "learning_rate": 3.558895885496023e-05, + "loss": 0.9475, + "step": 457 + }, + { + "epoch": 0.7328, + "grad_norm": 0.4937657086859226, + "learning_rate": 3.519327394983888e-05, + "loss": 0.7522, + "step": 458 + }, + { + "epoch": 0.7344, + "grad_norm": 0.49440865555601776, + "learning_rate": 3.479933074573858e-05, + "loss": 0.7201, + "step": 459 + }, + { + "epoch": 0.736, + "grad_norm": 0.4175034107778847, + "learning_rate": 3.440713983000601e-05, + "loss": 0.7441, + "step": 460 + }, + { + "epoch": 0.7376, + "grad_norm": 0.5157776265243595, + "learning_rate": 3.401671174289469e-05, + "loss": 0.8169, + "step": 461 + }, + { + "epoch": 0.7392, + "grad_norm": 0.5661252354929321, + "learning_rate": 3.362805697728145e-05, + "loss": 0.8418, + "step": 462 + }, + { + "epoch": 0.7408, + "grad_norm": 0.4601424298620791, + "learning_rate": 3.324118597838464e-05, + "loss": 0.7333, + "step": 463 + }, + { + "epoch": 0.7424, + "grad_norm": 0.526257932261844, + "learning_rate": 3.285610914348332e-05, + "loss": 0.9005, + "step": 464 + }, + { + "epoch": 0.744, + "grad_norm": 0.49758181851348093, + "learning_rate": 3.2472836821637744e-05, + "loss": 0.7995, + "step": 465 + }, + { + "epoch": 0.7456, + "grad_norm": 0.4743048138130881, + "learning_rate": 3.209137931341143e-05, + "loss": 0.7818, + "step": 466 + }, + { + "epoch": 0.7472, + "grad_norm": 0.4107871656143851, + "learning_rate": 3.1711746870594086e-05, + "loss": 0.7582, + "step": 467 + }, + { + "epoch": 0.7488, + "grad_norm": 0.6125756169265194, + "learning_rate": 3.1333949695926324e-05, + "loss": 0.8057, + "step": 468 + }, + { + "epoch": 0.7504, + "grad_norm": 0.4956432012606571, + "learning_rate": 3.0957997942825336e-05, + "loss": 0.768, + "step": 469 + }, + { + "epoch": 0.752, + "grad_norm": 0.5597263000321794, + "learning_rate": 3.058390171511196e-05, + "loss": 0.9382, + "step": 470 + }, + { + "epoch": 0.7536, + "grad_norm": 0.39974074026990225, + "learning_rate": 3.021167106673928e-05, + "loss": 0.7135, + "step": 471 + }, + { + "epoch": 0.7552, + "grad_norm": 0.4944138228262104, + "learning_rate": 2.9841316001522347e-05, + "loss": 0.7995, + "step": 472 + }, + { + "epoch": 0.7568, + "grad_norm": 0.45886221709901115, + "learning_rate": 2.9472846472869298e-05, + "loss": 0.7852, + "step": 473 + }, + { + "epoch": 0.7584, + "grad_norm": 0.4516610392627582, + "learning_rate": 2.9106272383513835e-05, + "loss": 0.7456, + "step": 474 + }, + { + "epoch": 0.76, + "grad_norm": 0.557783936084678, + "learning_rate": 2.874160358524931e-05, + "loss": 0.8968, + "step": 475 + }, + { + "epoch": 0.7616, + "grad_norm": 0.4856834956172401, + "learning_rate": 2.8378849878663628e-05, + "loss": 0.7805, + "step": 476 + }, + { + "epoch": 0.7632, + "grad_norm": 0.6565524280130758, + "learning_rate": 2.8018021012875994e-05, + "loss": 0.8707, + "step": 477 + }, + { + "epoch": 0.7648, + "grad_norm": 0.6710177605881805, + "learning_rate": 2.7659126685275027e-05, + "loss": 0.7805, + "step": 478 + }, + { + "epoch": 0.7664, + "grad_norm": 0.42889943601873365, + "learning_rate": 2.7302176541257986e-05, + "loss": 0.7535, + "step": 479 + }, + { + "epoch": 0.768, + "grad_norm": 0.4333118125395268, + "learning_rate": 2.6947180173971508e-05, + "loss": 0.7255, + "step": 480 + }, + { + "epoch": 0.7696, + "grad_norm": 0.5839841393173936, + "learning_rate": 2.659414712405398e-05, + "loss": 0.7486, + "step": 481 + }, + { + "epoch": 0.7712, + "grad_norm": 0.4521895534934514, + "learning_rate": 2.6243086879379e-05, + "loss": 0.7484, + "step": 482 + }, + { + "epoch": 0.7728, + "grad_norm": 0.520280552073544, + "learning_rate": 2.5894008874800325e-05, + "loss": 0.812, + "step": 483 + }, + { + "epoch": 0.7744, + "grad_norm": 0.45670805261710384, + "learning_rate": 2.5546922491898495e-05, + "loss": 0.8, + "step": 484 + }, + { + "epoch": 0.776, + "grad_norm": 0.5335285014546386, + "learning_rate": 2.5201837058728505e-05, + "loss": 0.922, + "step": 485 + }, + { + "epoch": 0.7776, + "grad_norm": 0.4881079791204276, + "learning_rate": 2.485876184956928e-05, + "loss": 0.7751, + "step": 486 + }, + { + "epoch": 0.7792, + "grad_norm": 0.44987938879675854, + "learning_rate": 2.451770608467432e-05, + "loss": 0.7279, + "step": 487 + }, + { + "epoch": 0.7808, + "grad_norm": 0.6833009708779766, + "learning_rate": 2.417867893002387e-05, + "loss": 0.9617, + "step": 488 + }, + { + "epoch": 0.7824, + "grad_norm": 0.4250791221610793, + "learning_rate": 2.3841689497078746e-05, + "loss": 0.7148, + "step": 489 + }, + { + "epoch": 0.784, + "grad_norm": 0.5399826159178152, + "learning_rate": 2.3506746842535242e-05, + "loss": 0.7702, + "step": 490 + }, + { + "epoch": 0.7856, + "grad_norm": 0.43743477925604757, + "learning_rate": 2.3173859968081944e-05, + "loss": 0.7157, + "step": 491 + }, + { + "epoch": 0.7872, + "grad_norm": 0.40297369122305954, + "learning_rate": 2.2843037820157675e-05, + "loss": 0.7481, + "step": 492 + }, + { + "epoch": 0.7888, + "grad_norm": 0.49175256105542803, + "learning_rate": 2.251428928971102e-05, + "loss": 0.8124, + "step": 493 + }, + { + "epoch": 0.7904, + "grad_norm": 0.4373359544357551, + "learning_rate": 2.2187623211961562e-05, + "loss": 0.737, + "step": 494 + }, + { + "epoch": 0.792, + "grad_norm": 0.54633843763496, + "learning_rate": 2.1863048366162208e-05, + "loss": 0.7988, + "step": 495 + }, + { + "epoch": 0.7936, + "grad_norm": 0.41604550782919175, + "learning_rate": 2.1540573475363402e-05, + "loss": 0.6769, + "step": 496 + }, + { + "epoch": 0.7952, + "grad_norm": 0.5664311821278589, + "learning_rate": 2.1220207206178688e-05, + "loss": 0.7408, + "step": 497 + }, + { + "epoch": 0.7968, + "grad_norm": 0.5897444993781782, + "learning_rate": 2.0901958168551638e-05, + "loss": 0.8862, + "step": 498 + }, + { + "epoch": 0.7984, + "grad_norm": 0.5089631988157485, + "learning_rate": 2.058583491552465e-05, + "loss": 0.7385, + "step": 499 + }, + { + "epoch": 0.8, + "grad_norm": 0.5607696948444847, + "learning_rate": 2.027184594300898e-05, + "loss": 0.8308, + "step": 500 + }, + { + "epoch": 0.8016, + "grad_norm": 0.5036784353906988, + "learning_rate": 1.995999968955641e-05, + "loss": 0.8341, + "step": 501 + }, + { + "epoch": 0.8032, + "grad_norm": 0.4874499905555397, + "learning_rate": 1.9650304536132426e-05, + "loss": 0.7482, + "step": 502 + }, + { + "epoch": 0.8048, + "grad_norm": 0.4893206979563326, + "learning_rate": 1.9342768805891178e-05, + "loss": 0.7605, + "step": 503 + }, + { + "epoch": 0.8064, + "grad_norm": 0.46314958256468436, + "learning_rate": 1.903740076395151e-05, + "loss": 0.7855, + "step": 504 + }, + { + "epoch": 0.808, + "grad_norm": 0.461567993549935, + "learning_rate": 1.8734208617174988e-05, + "loss": 0.6404, + "step": 505 + }, + { + "epoch": 0.8096, + "grad_norm": 0.6181078155420232, + "learning_rate": 1.8433200513945337e-05, + "loss": 0.8067, + "step": 506 + }, + { + "epoch": 0.8112, + "grad_norm": 0.4750427640337518, + "learning_rate": 1.8134384543949478e-05, + "loss": 0.6885, + "step": 507 + }, + { + "epoch": 0.8128, + "grad_norm": 0.4778704552565716, + "learning_rate": 1.783776873795994e-05, + "loss": 0.7967, + "step": 508 + }, + { + "epoch": 0.8144, + "grad_norm": 0.8052259392554215, + "learning_rate": 1.754336106761927e-05, + "loss": 1.1212, + "step": 509 + }, + { + "epoch": 0.816, + "grad_norm": 0.6065221233351177, + "learning_rate": 1.7251169445225657e-05, + "loss": 0.7793, + "step": 510 + }, + { + "epoch": 0.8176, + "grad_norm": 0.4023324353845839, + "learning_rate": 1.696120172352025e-05, + "loss": 0.7498, + "step": 511 + }, + { + "epoch": 0.8192, + "grad_norm": 0.43096596907532625, + "learning_rate": 1.6673465695476232e-05, + "loss": 0.6619, + "step": 512 + }, + { + "epoch": 0.8208, + "grad_norm": 0.5331475367249551, + "learning_rate": 1.6387969094089316e-05, + "loss": 0.7365, + "step": 513 + }, + { + "epoch": 0.8224, + "grad_norm": 0.46125235258257047, + "learning_rate": 1.6104719592169902e-05, + "loss": 0.7711, + "step": 514 + }, + { + "epoch": 0.824, + "grad_norm": 0.5770787921259529, + "learning_rate": 1.5823724802136865e-05, + "loss": 0.8073, + "step": 515 + }, + { + "epoch": 0.8256, + "grad_norm": 0.35681059458476244, + "learning_rate": 1.5544992275813053e-05, + "loss": 0.6854, + "step": 516 + }, + { + "epoch": 0.8272, + "grad_norm": 0.47730334936577407, + "learning_rate": 1.526852950422226e-05, + "loss": 0.8211, + "step": 517 + }, + { + "epoch": 0.8288, + "grad_norm": 0.446073273341204, + "learning_rate": 1.4994343917387854e-05, + "loss": 0.7873, + "step": 518 + }, + { + "epoch": 0.8304, + "grad_norm": 0.5828635553989929, + "learning_rate": 1.4722442884133214e-05, + "loss": 0.9511, + "step": 519 + }, + { + "epoch": 0.832, + "grad_norm": 0.6094073934049821, + "learning_rate": 1.4452833711883628e-05, + "loss": 0.794, + "step": 520 + }, + { + "epoch": 0.8336, + "grad_norm": 0.6164796392342045, + "learning_rate": 1.4185523646469822e-05, + "loss": 0.9665, + "step": 521 + }, + { + "epoch": 0.8352, + "grad_norm": 0.42342672248834473, + "learning_rate": 1.3920519871933424e-05, + "loss": 0.7665, + "step": 522 + }, + { + "epoch": 0.8368, + "grad_norm": 0.6765032297992837, + "learning_rate": 1.3657829510333654e-05, + "loss": 0.8345, + "step": 523 + }, + { + "epoch": 0.8384, + "grad_norm": 0.5064304818411385, + "learning_rate": 1.339745962155613e-05, + "loss": 0.707, + "step": 524 + }, + { + "epoch": 0.84, + "grad_norm": 0.40089275275048747, + "learning_rate": 1.3139417203123027e-05, + "loss": 0.6586, + "step": 525 + }, + { + "epoch": 0.8416, + "grad_norm": 0.5483517201907702, + "learning_rate": 1.2883709190004955e-05, + "loss": 0.7828, + "step": 526 + }, + { + "epoch": 0.8432, + "grad_norm": 0.360374178412597, + "learning_rate": 1.263034245443473e-05, + "loss": 0.6721, + "step": 527 + }, + { + "epoch": 0.8448, + "grad_norm": 0.4454048336442163, + "learning_rate": 1.2379323805722576e-05, + "loss": 0.774, + "step": 528 + }, + { + "epoch": 0.8464, + "grad_norm": 0.3721989071970144, + "learning_rate": 1.2130659990073146e-05, + "loss": 0.6956, + "step": 529 + }, + { + "epoch": 0.848, + "grad_norm": 0.5985047489125143, + "learning_rate": 1.1884357690404158e-05, + "loss": 0.9089, + "step": 530 + }, + { + "epoch": 0.8496, + "grad_norm": 0.5237948237809888, + "learning_rate": 1.1640423526166988e-05, + "loss": 0.8306, + "step": 531 + }, + { + "epoch": 0.8512, + "grad_norm": 0.8648512711778988, + "learning_rate": 1.1398864053168534e-05, + "loss": 0.7997, + "step": 532 + }, + { + "epoch": 0.8528, + "grad_norm": 0.6197763413747968, + "learning_rate": 1.1159685763395111e-05, + "loss": 0.7867, + "step": 533 + }, + { + "epoch": 0.8544, + "grad_norm": 0.42470141034649467, + "learning_rate": 1.0922895084838037e-05, + "loss": 0.6804, + "step": 534 + }, + { + "epoch": 0.856, + "grad_norm": 0.4856713864563471, + "learning_rate": 1.0688498381320855e-05, + "loss": 0.8431, + "step": 535 + }, + { + "epoch": 0.8576, + "grad_norm": 0.5330578663809168, + "learning_rate": 1.045650195232819e-05, + "loss": 0.8066, + "step": 536 + }, + { + "epoch": 0.8592, + "grad_norm": 0.4947389161075935, + "learning_rate": 1.0226912032836611e-05, + "loss": 0.7729, + "step": 537 + }, + { + "epoch": 0.8608, + "grad_norm": 0.41545998015118474, + "learning_rate": 9.999734793146998e-06, + "loss": 0.7083, + "step": 538 + }, + { + "epoch": 0.8624, + "grad_norm": 0.485544883719098, + "learning_rate": 9.774976338718677e-06, + "loss": 0.7167, + "step": 539 + }, + { + "epoch": 0.864, + "grad_norm": 0.6386894810307258, + "learning_rate": 9.552642710005299e-06, + "loss": 0.7929, + "step": 540 + }, + { + "epoch": 0.8656, + "grad_norm": 0.46002750542797544, + "learning_rate": 9.332739882292752e-06, + "loss": 0.7106, + "step": 541 + }, + { + "epoch": 0.8672, + "grad_norm": 0.46868454363851236, + "learning_rate": 9.115273765538202e-06, + "loss": 0.7373, + "step": 542 + }, + { + "epoch": 0.8688, + "grad_norm": 0.7144007673862546, + "learning_rate": 8.900250204211514e-06, + "loss": 0.7063, + "step": 543 + }, + { + "epoch": 0.8704, + "grad_norm": 0.5069051513657338, + "learning_rate": 8.687674977138116e-06, + "loss": 0.7728, + "step": 544 + }, + { + "epoch": 0.872, + "grad_norm": 0.5803596449574296, + "learning_rate": 8.47755379734373e-06, + "loss": 0.767, + "step": 545 + }, + { + "epoch": 0.8736, + "grad_norm": 0.4182408173362448, + "learning_rate": 8.269892311900696e-06, + "loss": 0.7044, + "step": 546 + }, + { + "epoch": 0.8752, + "grad_norm": 0.4354868248843499, + "learning_rate": 8.064696101776358e-06, + "loss": 0.7385, + "step": 547 + }, + { + "epoch": 0.8768, + "grad_norm": 0.41941309017306944, + "learning_rate": 7.861970681683051e-06, + "loss": 0.7395, + "step": 548 + }, + { + "epoch": 0.8784, + "grad_norm": 0.5371051126727617, + "learning_rate": 7.661721499929753e-06, + "loss": 0.7964, + "step": 549 + }, + { + "epoch": 0.88, + "grad_norm": 0.5771696475203609, + "learning_rate": 7.463953938275858e-06, + "loss": 0.8346, + "step": 550 + }, + { + "epoch": 0.8816, + "grad_norm": 0.4382500643697019, + "learning_rate": 7.2686733117863784e-06, + "loss": 0.693, + "step": 551 + }, + { + "epoch": 0.8832, + "grad_norm": 0.43605868921778695, + "learning_rate": 7.07588486868922e-06, + "loss": 0.6877, + "step": 552 + }, + { + "epoch": 0.8848, + "grad_norm": 0.4165356960116476, + "learning_rate": 6.8855937902340576e-06, + "loss": 0.6415, + "step": 553 + }, + { + "epoch": 0.8864, + "grad_norm": 0.4129667645934099, + "learning_rate": 6.6978051905530855e-06, + "loss": 0.7575, + "step": 554 + }, + { + "epoch": 0.888, + "grad_norm": 0.6351768163984158, + "learning_rate": 6.512524116523633e-06, + "loss": 0.9496, + "step": 555 + }, + { + "epoch": 0.8896, + "grad_norm": 0.41141560339732675, + "learning_rate": 6.329755547632499e-06, + "loss": 0.7321, + "step": 556 + }, + { + "epoch": 0.8912, + "grad_norm": 0.5023405131637189, + "learning_rate": 6.149504395842087e-06, + "loss": 0.7426, + "step": 557 + }, + { + "epoch": 0.8928, + "grad_norm": 0.42821444755882565, + "learning_rate": 5.971775505458444e-06, + "loss": 0.7429, + "step": 558 + }, + { + "epoch": 0.8944, + "grad_norm": 0.5459880627672273, + "learning_rate": 5.7965736530010916e-06, + "loss": 0.9019, + "step": 559 + }, + { + "epoch": 0.896, + "grad_norm": 0.46115758430714615, + "learning_rate": 5.623903547074549e-06, + "loss": 0.7585, + "step": 560 + }, + { + "epoch": 0.8976, + "grad_norm": 0.44139000235319503, + "learning_rate": 5.453769828241872e-06, + "loss": 0.6966, + "step": 561 + }, + { + "epoch": 0.8992, + "grad_norm": 0.4543809462072909, + "learning_rate": 5.286177068899989e-06, + "loss": 0.7594, + "step": 562 + }, + { + "epoch": 0.9008, + "grad_norm": 0.4495748022007052, + "learning_rate": 5.121129773156663e-06, + "loss": 0.8066, + "step": 563 + }, + { + "epoch": 0.9024, + "grad_norm": 0.46347153258550544, + "learning_rate": 4.95863237670956e-06, + "loss": 0.7445, + "step": 564 + }, + { + "epoch": 0.904, + "grad_norm": 0.5280086571585247, + "learning_rate": 4.798689246727006e-06, + "loss": 0.7395, + "step": 565 + }, + { + "epoch": 0.9056, + "grad_norm": 0.5660248162122429, + "learning_rate": 4.641304681730641e-06, + "loss": 0.8576, + "step": 566 + }, + { + "epoch": 0.9072, + "grad_norm": 0.6919236328983153, + "learning_rate": 4.486482911479839e-06, + "loss": 0.8169, + "step": 567 + }, + { + "epoch": 0.9088, + "grad_norm": 0.3978352567607781, + "learning_rate": 4.3342280968580285e-06, + "loss": 0.7173, + "step": 568 + }, + { + "epoch": 0.9104, + "grad_norm": 0.46468456208961195, + "learning_rate": 4.184544329761009e-06, + "loss": 0.7637, + "step": 569 + }, + { + "epoch": 0.912, + "grad_norm": 0.5054514384703425, + "learning_rate": 4.037435632986786e-06, + "loss": 0.8906, + "step": 570 + }, + { + "epoch": 0.9136, + "grad_norm": 0.42486805180548026, + "learning_rate": 3.892905960127546e-06, + "loss": 0.7185, + "step": 571 + }, + { + "epoch": 0.9152, + "grad_norm": 0.49231465713121064, + "learning_rate": 3.750959195463466e-06, + "loss": 0.8126, + "step": 572 + }, + { + "epoch": 0.9168, + "grad_norm": 0.4264580566405121, + "learning_rate": 3.611599153858214e-06, + "loss": 0.6775, + "step": 573 + }, + { + "epoch": 0.9184, + "grad_norm": 0.5889294212994212, + "learning_rate": 3.4748295806564356e-06, + "loss": 0.8009, + "step": 574 + }, + { + "epoch": 0.92, + "grad_norm": 0.5146199091910297, + "learning_rate": 3.3406541515832003e-06, + "loss": 0.7888, + "step": 575 + }, + { + "epoch": 0.9216, + "grad_norm": 0.5839724540785394, + "learning_rate": 3.209076472645112e-06, + "loss": 0.7026, + "step": 576 + }, + { + "epoch": 0.9232, + "grad_norm": 0.4683585152611421, + "learning_rate": 3.0801000800333877e-06, + "loss": 0.7459, + "step": 577 + }, + { + "epoch": 0.9248, + "grad_norm": 0.4079869469017918, + "learning_rate": 2.9537284400289355e-06, + "loss": 0.7465, + "step": 578 + }, + { + "epoch": 0.9264, + "grad_norm": 0.46769691407696456, + "learning_rate": 2.8299649489090475e-06, + "loss": 0.7662, + "step": 579 + }, + { + "epoch": 0.928, + "grad_norm": 0.4261406956454604, + "learning_rate": 2.708812932856253e-06, + "loss": 0.7229, + "step": 580 + }, + { + "epoch": 0.9296, + "grad_norm": 0.4667058373952707, + "learning_rate": 2.590275647868867e-06, + "loss": 0.7459, + "step": 581 + }, + { + "epoch": 0.9312, + "grad_norm": 0.6382869126429721, + "learning_rate": 2.4743562796734622e-06, + "loss": 0.8836, + "step": 582 + }, + { + "epoch": 0.9328, + "grad_norm": 0.4921557339698548, + "learning_rate": 2.3610579436393e-06, + "loss": 0.7706, + "step": 583 + }, + { + "epoch": 0.9344, + "grad_norm": 0.48839344968871246, + "learning_rate": 2.250383684694579e-06, + "loss": 0.7637, + "step": 584 + }, + { + "epoch": 0.936, + "grad_norm": 0.5485656778087554, + "learning_rate": 2.1423364772445887e-06, + "loss": 0.7947, + "step": 585 + }, + { + "epoch": 0.9376, + "grad_norm": 0.5328073776988045, + "learning_rate": 2.036919225091827e-06, + "loss": 0.8176, + "step": 586 + }, + { + "epoch": 0.9392, + "grad_norm": 0.4746265132334152, + "learning_rate": 1.9341347613579087e-06, + "loss": 0.7356, + "step": 587 + }, + { + "epoch": 0.9408, + "grad_norm": 0.5354721956600221, + "learning_rate": 1.8339858484073935e-06, + "loss": 0.8329, + "step": 588 + }, + { + "epoch": 0.9424, + "grad_norm": 0.5196997436322807, + "learning_rate": 1.7364751777736332e-06, + "loss": 0.8083, + "step": 589 + }, + { + "epoch": 0.944, + "grad_norm": 0.4919705407620299, + "learning_rate": 1.6416053700863964e-06, + "loss": 0.71, + "step": 590 + }, + { + "epoch": 0.9456, + "grad_norm": 0.5509294451771631, + "learning_rate": 1.5493789750014031e-06, + "loss": 0.9046, + "step": 591 + }, + { + "epoch": 0.9472, + "grad_norm": 0.4322044034560731, + "learning_rate": 1.459798471131868e-06, + "loss": 0.7115, + "step": 592 + }, + { + "epoch": 0.9488, + "grad_norm": 0.5441241973237635, + "learning_rate": 1.3728662659818204e-06, + "loss": 0.7648, + "step": 593 + }, + { + "epoch": 0.9504, + "grad_norm": 0.42712086843003794, + "learning_rate": 1.2885846958814673e-06, + "loss": 0.7394, + "step": 594 + }, + { + "epoch": 0.952, + "grad_norm": 0.5224691303715251, + "learning_rate": 1.2069560259243328e-06, + "loss": 0.776, + "step": 595 + }, + { + "epoch": 0.9536, + "grad_norm": 0.5054549573817237, + "learning_rate": 1.1279824499064396e-06, + "loss": 0.8071, + "step": 596 + }, + { + "epoch": 0.9552, + "grad_norm": 0.5423938658985061, + "learning_rate": 1.0516660902673448e-06, + "loss": 0.7444, + "step": 597 + }, + { + "epoch": 0.9568, + "grad_norm": 0.5593259969244396, + "learning_rate": 9.780089980330642e-07, + "loss": 0.8354, + "step": 598 + }, + { + "epoch": 0.9584, + "grad_norm": 0.6699218671190321, + "learning_rate": 9.070131527609604e-07, + "loss": 0.717, + "step": 599 + }, + { + "epoch": 0.96, + "grad_norm": 0.549979447808889, + "learning_rate": 8.386804624865851e-07, + "loss": 0.8225, + "step": 600 + }, + { + "epoch": 0.9616, + "grad_norm": 0.43589293457086764, + "learning_rate": 7.730127636723539e-07, + "loss": 0.7761, + "step": 601 + }, + { + "epoch": 0.9632, + "grad_norm": 0.5393452630062003, + "learning_rate": 7.100118211581852e-07, + "loss": 0.8387, + "step": 602 + }, + { + "epoch": 0.9648, + "grad_norm": 0.7906770173799234, + "learning_rate": 6.496793281141056e-07, + "loss": 0.785, + "step": 603 + }, + { + "epoch": 0.9664, + "grad_norm": 0.45367166208698123, + "learning_rate": 5.920169059947411e-07, + "loss": 0.6509, + "step": 604 + }, + { + "epoch": 0.968, + "grad_norm": 0.41705664578435075, + "learning_rate": 5.370261044956971e-07, + "loss": 0.6741, + "step": 605 + }, + { + "epoch": 0.9696, + "grad_norm": 0.5138034548727115, + "learning_rate": 4.847084015119574e-07, + "loss": 0.7611, + "step": 606 + }, + { + "epoch": 0.9712, + "grad_norm": 0.6706363126180388, + "learning_rate": 4.3506520309813947e-07, + "loss": 0.8844, + "step": 607 + }, + { + "epoch": 0.9728, + "grad_norm": 0.5175012983585358, + "learning_rate": 3.8809784343072366e-07, + "loss": 0.7132, + "step": 608 + }, + { + "epoch": 0.9744, + "grad_norm": 0.47568378499503156, + "learning_rate": 3.4380758477219333e-07, + "loss": 0.7699, + "step": 609 + }, + { + "epoch": 0.976, + "grad_norm": 0.5267143620594829, + "learning_rate": 3.0219561743707326e-07, + "loss": 0.7824, + "step": 610 + }, + { + "epoch": 0.9776, + "grad_norm": 0.5540117150452298, + "learning_rate": 2.6326305976001055e-07, + "loss": 0.8809, + "step": 611 + }, + { + "epoch": 0.9792, + "grad_norm": 0.36858401023066945, + "learning_rate": 2.2701095806565432e-07, + "loss": 0.7154, + "step": 612 + }, + { + "epoch": 0.9808, + "grad_norm": 0.4195343631350608, + "learning_rate": 1.9344028664056713e-07, + "loss": 0.6821, + "step": 613 + }, + { + "epoch": 0.9824, + "grad_norm": 0.4821407975273321, + "learning_rate": 1.6255194770704586e-07, + "loss": 0.8118, + "step": 614 + }, + { + "epoch": 0.984, + "grad_norm": 0.4541170099073794, + "learning_rate": 1.3434677139885222e-07, + "loss": 0.769, + "step": 615 + }, + { + "epoch": 0.9856, + "grad_norm": 0.5000787751047758, + "learning_rate": 1.0882551573891953e-07, + "loss": 0.8309, + "step": 616 + }, + { + "epoch": 0.9872, + "grad_norm": 0.46285766823963, + "learning_rate": 8.598886661895788e-08, + "loss": 0.7921, + "step": 617 + }, + { + "epoch": 0.9888, + "grad_norm": 0.48469196593187697, + "learning_rate": 6.583743778106887e-08, + "loss": 0.8265, + "step": 618 + }, + { + "epoch": 0.9904, + "grad_norm": 0.5571591206945543, + "learning_rate": 4.837177080119215e-08, + "loss": 0.6982, + "step": 619 + }, + { + "epoch": 0.992, + "grad_norm": 0.43003274517721807, + "learning_rate": 3.359233507459481e-08, + "loss": 0.7066, + "step": 620 + }, + { + "epoch": 0.9936, + "grad_norm": 0.4022428797384306, + "learning_rate": 2.1499527803214846e-08, + "loss": 0.6786, + "step": 621 + }, + { + "epoch": 0.9952, + "grad_norm": 0.5569976399581428, + "learning_rate": 1.209367398504746e-08, + "loss": 0.7958, + "step": 622 + }, + { + "epoch": 0.9968, + "grad_norm": 0.45222096782817917, + "learning_rate": 5.375026405352035e-09, + "loss": 0.7226, + "step": 623 + }, + { + "epoch": 0.9984, + "grad_norm": 0.48513423816842716, + "learning_rate": 1.3437656298687097e-09, + "loss": 0.7292, + "step": 624 + }, + { + "epoch": 1.0, + "grad_norm": 0.47414292362264654, + "learning_rate": 0.0, + "loss": 0.7807, + "step": 625 + }, + { + "epoch": 1.0, + "step": 625, + "total_flos": 508727619158016.0, + "train_loss": 0.8337012174606323, + "train_runtime": 9258.3008, + "train_samples_per_second": 1.08, + "train_steps_per_second": 0.068 + } + ], + "logging_steps": 1.0, + "max_steps": 625, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 508727619158016.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_3_epochs_1_GA_4_lora/README.md b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_3_epochs_1_GA_4_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_3_epochs_1_GA_4_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_3_epochs_1_GA_4_lora/adapter_config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_3_epochs_1_GA_4_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..13695ff226df5a430b650c69efd6adeabb849f6d --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_3_epochs_1_GA_4_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "up_proj", + "q_proj", + "o_proj", + "v_proj", + "down_proj", + "gate_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_3_epochs_1_GA_4_lora/adapter_model.safetensors b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_3_epochs_1_GA_4_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3da89ebe1059f2e50e626c9fc0b98bdbce1e8bf8 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_3_epochs_1_GA_4_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60e78440e8ad5c216e1bbb491649fb73d8e9423283d6831d816658f013ddea60 +size 671150064 diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_3_epochs_1_GA_4_lora/config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_3_epochs_1_GA_4_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_3_epochs_1_GA_4_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_3_epochs_1_GA_4_lora/non_lora_trainables.bin b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_3_epochs_1_GA_4_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..a041162bf517056ff0271fabae4667a07b114ee8 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_3_epochs_1_GA_4_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d89c8f41acde27e42e9a8a3b4e1df3f1d80d5199b474eab8c9904c815c8b6d0 +size 918507402 diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_3_epochs_1_GA_4_lora/trainer_state.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_3_epochs_1_GA_4_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..180381c7f91a190e1985a0295e94966b0b2a4bcc --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_10000_repeat_3_epochs_1_GA_4_lora/trainer_state.json @@ -0,0 +1,2226 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9984, + "eval_steps": 500, + "global_step": 312, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0032, + "grad_norm": 0.7954520844133007, + "learning_rate": 2e-05, + "loss": 1.3544, + "step": 1 + }, + { + "epoch": 0.0064, + "grad_norm": 0.8486736733468345, + "learning_rate": 4e-05, + "loss": 1.4258, + "step": 2 + }, + { + "epoch": 0.0096, + "grad_norm": 0.7231129434225938, + "learning_rate": 6e-05, + "loss": 1.2565, + "step": 3 + }, + { + "epoch": 0.0128, + "grad_norm": 0.6549525568483539, + "learning_rate": 8e-05, + "loss": 1.301, + "step": 4 + }, + { + "epoch": 0.016, + "grad_norm": 0.6713937782391836, + "learning_rate": 0.0001, + "loss": 1.1783, + "step": 5 + }, + { + "epoch": 0.0192, + "grad_norm": 0.7775279811022765, + "learning_rate": 0.00012, + "loss": 1.2018, + "step": 6 + }, + { + "epoch": 0.0224, + "grad_norm": 0.8210500566974728, + "learning_rate": 0.00014, + "loss": 1.0322, + "step": 7 + }, + { + "epoch": 0.0256, + "grad_norm": 0.582072473996548, + "learning_rate": 0.00016, + "loss": 1.0719, + "step": 8 + }, + { + "epoch": 0.0288, + "grad_norm": 0.5332524469921555, + "learning_rate": 0.00018, + "loss": 0.9937, + "step": 9 + }, + { + "epoch": 0.032, + "grad_norm": 0.4652344399947701, + "learning_rate": 0.0002, + "loss": 0.9582, + "step": 10 + }, + { + "epoch": 0.0352, + "grad_norm": 0.528479459724629, + "learning_rate": 0.00019999458931878073, + "loss": 1.0025, + "step": 11 + }, + { + "epoch": 0.0384, + "grad_norm": 0.48862298453038183, + "learning_rate": 0.0001999783578606323, + "loss": 1.0045, + "step": 12 + }, + { + "epoch": 0.0416, + "grad_norm": 0.4788245246121141, + "learning_rate": 0.00019995130738201966, + "loss": 0.9681, + "step": 13 + }, + { + "epoch": 0.0448, + "grad_norm": 0.44036289873008344, + "learning_rate": 0.0001999134408101731, + "loss": 0.9215, + "step": 14 + }, + { + "epoch": 0.048, + "grad_norm": 0.5449869090192805, + "learning_rate": 0.00019986476224277165, + "loss": 0.9593, + "step": 15 + }, + { + "epoch": 0.0512, + "grad_norm": 0.4237521722332184, + "learning_rate": 0.00019980527694749952, + "loss": 0.8736, + "step": 16 + }, + { + "epoch": 0.0544, + "grad_norm": 0.4168713872622836, + "learning_rate": 0.00019973499136147606, + "loss": 0.8863, + "step": 17 + }, + { + "epoch": 0.0576, + "grad_norm": 0.45264982727147834, + "learning_rate": 0.0001996539130905593, + "loss": 0.8696, + "step": 18 + }, + { + "epoch": 0.0608, + "grad_norm": 0.3929576698684051, + "learning_rate": 0.0001995620509085228, + "loss": 0.916, + "step": 19 + }, + { + "epoch": 0.064, + "grad_norm": 0.4415540009570294, + "learning_rate": 0.00019945941475610623, + "loss": 0.9198, + "step": 20 + }, + { + "epoch": 0.0672, + "grad_norm": 0.3812265501303348, + "learning_rate": 0.0001993460157399396, + "loss": 0.8966, + "step": 21 + }, + { + "epoch": 0.0704, + "grad_norm": 0.4047740643644689, + "learning_rate": 0.0001992218661313415, + "loss": 0.9509, + "step": 22 + }, + { + "epoch": 0.0736, + "grad_norm": 0.394670597027155, + "learning_rate": 0.00019908697936499103, + "loss": 0.8582, + "step": 23 + }, + { + "epoch": 0.0768, + "grad_norm": 0.40883800080500177, + "learning_rate": 0.00019894137003747403, + "loss": 0.8873, + "step": 24 + }, + { + "epoch": 0.08, + "grad_norm": 0.4945237937729557, + "learning_rate": 0.00019878505390570362, + "loss": 0.9728, + "step": 25 + }, + { + "epoch": 0.0832, + "grad_norm": 0.4621016693300622, + "learning_rate": 0.00019861804788521493, + "loss": 0.9232, + "step": 26 + }, + { + "epoch": 0.0864, + "grad_norm": 0.6732021018661585, + "learning_rate": 0.00019844037004833473, + "loss": 0.8132, + "step": 27 + }, + { + "epoch": 0.0896, + "grad_norm": 0.4422468298010891, + "learning_rate": 0.00019825203962222572, + "loss": 0.9266, + "step": 28 + }, + { + "epoch": 0.0928, + "grad_norm": 0.40220173065598824, + "learning_rate": 0.0001980530769868059, + "loss": 0.8905, + "step": 29 + }, + { + "epoch": 0.096, + "grad_norm": 0.36918169793645084, + "learning_rate": 0.00019784350367254322, + "loss": 0.8733, + "step": 30 + }, + { + "epoch": 0.0992, + "grad_norm": 0.5597457386366598, + "learning_rate": 0.0001976233423581255, + "loss": 0.9705, + "step": 31 + }, + { + "epoch": 0.1024, + "grad_norm": 0.5595809993867294, + "learning_rate": 0.0001973926168680066, + "loss": 0.9519, + "step": 32 + }, + { + "epoch": 0.1056, + "grad_norm": 0.4605324530129531, + "learning_rate": 0.00019715135216982798, + "loss": 0.895, + "step": 33 + }, + { + "epoch": 0.1088, + "grad_norm": 0.42815914241954167, + "learning_rate": 0.0001968995743717171, + "loss": 0.9781, + "step": 34 + }, + { + "epoch": 0.112, + "grad_norm": 0.4124008245937204, + "learning_rate": 0.00019663731071946206, + "loss": 0.8654, + "step": 35 + }, + { + "epoch": 0.1152, + "grad_norm": 0.4388119798381529, + "learning_rate": 0.00019636458959356316, + "loss": 0.9486, + "step": 36 + }, + { + "epoch": 0.1184, + "grad_norm": 0.41876874470531383, + "learning_rate": 0.0001960814405061619, + "loss": 0.8528, + "step": 37 + }, + { + "epoch": 0.1216, + "grad_norm": 0.4063934253065074, + "learning_rate": 0.00019578789409784727, + "loss": 0.9519, + "step": 38 + }, + { + "epoch": 0.1248, + "grad_norm": 0.4281987478958129, + "learning_rate": 0.00019548398213434007, + "loss": 0.931, + "step": 39 + }, + { + "epoch": 0.128, + "grad_norm": 0.3995519502824548, + "learning_rate": 0.00019516973750305532, + "loss": 0.8706, + "step": 40 + }, + { + "epoch": 0.1312, + "grad_norm": 0.39938546863455104, + "learning_rate": 0.00019484519420954354, + "loss": 0.9081, + "step": 41 + }, + { + "epoch": 0.1344, + "grad_norm": 0.41849656692135495, + "learning_rate": 0.00019451038737381077, + "loss": 0.9316, + "step": 42 + }, + { + "epoch": 0.1376, + "grad_norm": 0.38704750788751674, + "learning_rate": 0.00019416535322651818, + "loss": 0.9122, + "step": 43 + }, + { + "epoch": 0.1408, + "grad_norm": 0.36713908717782284, + "learning_rate": 0.00019381012910506146, + "loss": 0.8173, + "step": 44 + }, + { + "epoch": 0.144, + "grad_norm": 0.386332629442072, + "learning_rate": 0.00019344475344953012, + "loss": 0.8054, + "step": 45 + }, + { + "epoch": 0.1472, + "grad_norm": 0.40175758635877834, + "learning_rate": 0.00019306926579854821, + "loss": 0.871, + "step": 46 + }, + { + "epoch": 0.1504, + "grad_norm": 0.3980262653132001, + "learning_rate": 0.00019268370678499533, + "loss": 0.864, + "step": 47 + }, + { + "epoch": 0.1536, + "grad_norm": 0.4340588852915137, + "learning_rate": 0.0001922881181316097, + "loss": 0.9131, + "step": 48 + }, + { + "epoch": 0.1568, + "grad_norm": 0.424408064468781, + "learning_rate": 0.00019188254264647337, + "loss": 0.8575, + "step": 49 + }, + { + "epoch": 0.16, + "grad_norm": 0.42662287831550505, + "learning_rate": 0.0001914670242183795, + "loss": 0.8401, + "step": 50 + }, + { + "epoch": 0.1632, + "grad_norm": 0.4018922149491412, + "learning_rate": 0.0001910416078120832, + "loss": 0.8477, + "step": 51 + }, + { + "epoch": 0.1664, + "grad_norm": 0.4547104194477866, + "learning_rate": 0.0001906063394634356, + "loss": 0.9216, + "step": 52 + }, + { + "epoch": 0.1696, + "grad_norm": 0.46878889945327834, + "learning_rate": 0.00019016126627440237, + "loss": 0.8866, + "step": 53 + }, + { + "epoch": 0.1728, + "grad_norm": 0.3901101154924772, + "learning_rate": 0.00018970643640796642, + "loss": 0.8071, + "step": 54 + }, + { + "epoch": 0.176, + "grad_norm": 0.3786177626239962, + "learning_rate": 0.000189241899082916, + "loss": 0.8155, + "step": 55 + }, + { + "epoch": 0.1792, + "grad_norm": 0.37399747556550167, + "learning_rate": 0.00018876770456851877, + "loss": 0.8673, + "step": 56 + }, + { + "epoch": 0.1824, + "grad_norm": 0.4476713311081534, + "learning_rate": 0.0001882839041790818, + "loss": 0.8913, + "step": 57 + }, + { + "epoch": 0.1856, + "grad_norm": 0.3695813169482114, + "learning_rate": 0.00018779055026839868, + "loss": 0.8532, + "step": 58 + }, + { + "epoch": 0.1888, + "grad_norm": 0.3740852677452882, + "learning_rate": 0.00018728769622408423, + "loss": 0.851, + "step": 59 + }, + { + "epoch": 0.192, + "grad_norm": 0.37805895790025723, + "learning_rate": 0.00018677539646179707, + "loss": 0.8893, + "step": 60 + }, + { + "epoch": 0.1952, + "grad_norm": 0.48550367409643636, + "learning_rate": 0.00018625370641935129, + "loss": 0.9271, + "step": 61 + }, + { + "epoch": 0.1984, + "grad_norm": 0.4261985087419851, + "learning_rate": 0.00018572268255071718, + "loss": 0.9485, + "step": 62 + }, + { + "epoch": 0.2016, + "grad_norm": 0.3506821138816665, + "learning_rate": 0.00018518238231991218, + "loss": 0.7993, + "step": 63 + }, + { + "epoch": 0.2048, + "grad_norm": 0.39926954240428475, + "learning_rate": 0.00018463286419478255, + "loss": 0.9094, + "step": 64 + }, + { + "epoch": 0.208, + "grad_norm": 0.369474669842929, + "learning_rate": 0.00018407418764067627, + "loss": 0.8639, + "step": 65 + }, + { + "epoch": 0.2112, + "grad_norm": 0.38412612507425553, + "learning_rate": 0.00018350641311400812, + "loss": 0.92, + "step": 66 + }, + { + "epoch": 0.2144, + "grad_norm": 0.6236483911477445, + "learning_rate": 0.0001829296020557174, + "loss": 0.9896, + "step": 67 + }, + { + "epoch": 0.2176, + "grad_norm": 0.40709892702353573, + "learning_rate": 0.00018234381688461942, + "loss": 0.7737, + "step": 68 + }, + { + "epoch": 0.2208, + "grad_norm": 0.38384691117683545, + "learning_rate": 0.0001817491209906506, + "loss": 0.8189, + "step": 69 + }, + { + "epoch": 0.224, + "grad_norm": 0.419002960060546, + "learning_rate": 0.00018114557872800905, + "loss": 0.8892, + "step": 70 + }, + { + "epoch": 0.2272, + "grad_norm": 0.4317304658707876, + "learning_rate": 0.00018053325540819045, + "loss": 0.9425, + "step": 71 + }, + { + "epoch": 0.2304, + "grad_norm": 0.3605359976609917, + "learning_rate": 0.0001799122172929206, + "loss": 0.8432, + "step": 72 + }, + { + "epoch": 0.2336, + "grad_norm": 0.38387195033721394, + "learning_rate": 0.00017928253158698473, + "loss": 0.8127, + "step": 73 + }, + { + "epoch": 0.2368, + "grad_norm": 0.4115773141922566, + "learning_rate": 0.0001786442664309554, + "loss": 0.835, + "step": 74 + }, + { + "epoch": 0.24, + "grad_norm": 0.41682944760695084, + "learning_rate": 0.0001779974908938184, + "loss": 0.8527, + "step": 75 + }, + { + "epoch": 0.2432, + "grad_norm": 0.44021561768781686, + "learning_rate": 0.0001773422749654988, + "loss": 0.8634, + "step": 76 + }, + { + "epoch": 0.2464, + "grad_norm": 0.3788641404846892, + "learning_rate": 0.00017667868954928694, + "loss": 0.8657, + "step": 77 + }, + { + "epoch": 0.2496, + "grad_norm": 0.35062726715426834, + "learning_rate": 0.00017600680645416583, + "loss": 0.776, + "step": 78 + }, + { + "epoch": 0.2528, + "grad_norm": 0.3640559161150146, + "learning_rate": 0.00017532669838704035, + "loss": 0.8121, + "step": 79 + }, + { + "epoch": 0.256, + "grad_norm": 0.35673860083372294, + "learning_rate": 0.00017463843894486937, + "loss": 0.8013, + "step": 80 + }, + { + "epoch": 0.2592, + "grad_norm": 0.36719254075722085, + "learning_rate": 0.0001739421026067017, + "loss": 0.864, + "step": 81 + }, + { + "epoch": 0.2624, + "grad_norm": 0.39931819574163646, + "learning_rate": 0.00017323776472561627, + "loss": 0.8932, + "step": 82 + }, + { + "epoch": 0.2656, + "grad_norm": 0.3982022670626305, + "learning_rate": 0.00017252550152056795, + "loss": 0.8254, + "step": 83 + }, + { + "epoch": 0.2688, + "grad_norm": 0.3997476767736355, + "learning_rate": 0.0001718053900681397, + "loss": 0.8817, + "step": 84 + }, + { + "epoch": 0.272, + "grad_norm": 0.3463507690011195, + "learning_rate": 0.00017107750829420176, + "loss": 0.8348, + "step": 85 + }, + { + "epoch": 0.2752, + "grad_norm": 0.4200196272874818, + "learning_rate": 0.00017034193496547902, + "loss": 0.9056, + "step": 86 + }, + { + "epoch": 0.2784, + "grad_norm": 0.3755782720326349, + "learning_rate": 0.00016959874968102735, + "loss": 0.8419, + "step": 87 + }, + { + "epoch": 0.2816, + "grad_norm": 0.3981270184947585, + "learning_rate": 0.00016884803286362, + "loss": 0.8491, + "step": 88 + }, + { + "epoch": 0.2848, + "grad_norm": 0.37218427069658755, + "learning_rate": 0.00016808986575104465, + "loss": 0.9352, + "step": 89 + }, + { + "epoch": 0.288, + "grad_norm": 0.3395579879759133, + "learning_rate": 0.00016732433038731242, + "loss": 0.7546, + "step": 90 + }, + { + "epoch": 0.2912, + "grad_norm": 0.3673793454918801, + "learning_rate": 0.0001665515096137797, + "loss": 0.833, + "step": 91 + }, + { + "epoch": 0.2944, + "grad_norm": 0.34780686537520417, + "learning_rate": 0.00016577148706018328, + "loss": 0.8021, + "step": 92 + }, + { + "epoch": 0.2976, + "grad_norm": 0.38425962561813065, + "learning_rate": 0.00016498434713559088, + "loss": 0.8679, + "step": 93 + }, + { + "epoch": 0.3008, + "grad_norm": 0.42936546302655365, + "learning_rate": 0.00016419017501926656, + "loss": 0.8377, + "step": 94 + }, + { + "epoch": 0.304, + "grad_norm": 0.4267179181570991, + "learning_rate": 0.0001633890566514535, + "loss": 0.8357, + "step": 95 + }, + { + "epoch": 0.3072, + "grad_norm": 0.36916905077663476, + "learning_rate": 0.00016258107872407375, + "loss": 0.8232, + "step": 96 + }, + { + "epoch": 0.3104, + "grad_norm": 0.38729223078060027, + "learning_rate": 0.0001617663286713474, + "loss": 0.8608, + "step": 97 + }, + { + "epoch": 0.3136, + "grad_norm": 0.4065510182299363, + "learning_rate": 0.00016094489466033043, + "loss": 0.8215, + "step": 98 + }, + { + "epoch": 0.3168, + "grad_norm": 0.35357551053351133, + "learning_rate": 0.00016011686558137448, + "loss": 0.8115, + "step": 99 + }, + { + "epoch": 0.32, + "grad_norm": 1.0001838046966431, + "learning_rate": 0.0001592823310385073, + "loss": 0.8702, + "step": 100 + }, + { + "epoch": 0.3232, + "grad_norm": 0.40193375582947716, + "learning_rate": 0.0001584413813397364, + "loss": 0.8128, + "step": 101 + }, + { + "epoch": 0.3264, + "grad_norm": 0.3935235182421146, + "learning_rate": 0.00015759410748727662, + "loss": 0.7955, + "step": 102 + }, + { + "epoch": 0.3296, + "grad_norm": 0.6013285238613454, + "learning_rate": 0.00015674060116770236, + "loss": 0.8225, + "step": 103 + }, + { + "epoch": 0.3328, + "grad_norm": 0.3787137979507313, + "learning_rate": 0.00015588095474202595, + "loss": 0.8257, + "step": 104 + }, + { + "epoch": 0.336, + "grad_norm": 0.38425572159674526, + "learning_rate": 0.00015501526123570277, + "loss": 0.8475, + "step": 105 + }, + { + "epoch": 0.3392, + "grad_norm": 0.4006328412662351, + "learning_rate": 0.00015414361432856475, + "loss": 0.8648, + "step": 106 + }, + { + "epoch": 0.3424, + "grad_norm": 0.4086678609977253, + "learning_rate": 0.0001532661083446829, + "loss": 0.8863, + "step": 107 + }, + { + "epoch": 0.3456, + "grad_norm": 0.4881485864286447, + "learning_rate": 0.00015238283824216015, + "loss": 0.9356, + "step": 108 + }, + { + "epoch": 0.3488, + "grad_norm": 0.4025668507825546, + "learning_rate": 0.00015149389960285558, + "loss": 0.8127, + "step": 109 + }, + { + "epoch": 0.352, + "grad_norm": 0.3710129691466552, + "learning_rate": 0.00015059938862204127, + "loss": 0.8428, + "step": 110 + }, + { + "epoch": 0.3552, + "grad_norm": 0.4003824576144442, + "learning_rate": 0.00014969940209799248, + "loss": 0.8425, + "step": 111 + }, + { + "epoch": 0.3584, + "grad_norm": 0.5018583001311245, + "learning_rate": 0.00014879403742151283, + "loss": 0.8779, + "step": 112 + }, + { + "epoch": 0.3616, + "grad_norm": 0.36121263099811374, + "learning_rate": 0.00014788339256539544, + "loss": 0.7713, + "step": 113 + }, + { + "epoch": 0.3648, + "grad_norm": 0.4251687064188239, + "learning_rate": 0.0001469675660738206, + "loss": 0.7802, + "step": 114 + }, + { + "epoch": 0.368, + "grad_norm": 0.3569667148399426, + "learning_rate": 0.00014604665705169237, + "loss": 0.8449, + "step": 115 + }, + { + "epoch": 0.3712, + "grad_norm": 0.4066427039698594, + "learning_rate": 0.00014512076515391375, + "loss": 0.8426, + "step": 116 + }, + { + "epoch": 0.3744, + "grad_norm": 0.44110171219803324, + "learning_rate": 0.00014418999057460276, + "loss": 0.8606, + "step": 117 + }, + { + "epoch": 0.3776, + "grad_norm": 0.37113662406191406, + "learning_rate": 0.0001432544340362501, + "loss": 0.8036, + "step": 118 + }, + { + "epoch": 0.3808, + "grad_norm": 0.36927548404036525, + "learning_rate": 0.00014231419677881966, + "loss": 0.7892, + "step": 119 + }, + { + "epoch": 0.384, + "grad_norm": 0.3628801931909292, + "learning_rate": 0.00014136938054879283, + "loss": 0.8945, + "step": 120 + }, + { + "epoch": 0.3872, + "grad_norm": 0.3881852305317355, + "learning_rate": 0.00014042008758815818, + "loss": 0.8457, + "step": 121 + }, + { + "epoch": 0.3904, + "grad_norm": 0.3493063856843603, + "learning_rate": 0.00013946642062334766, + "loss": 0.8096, + "step": 122 + }, + { + "epoch": 0.3936, + "grad_norm": 0.4238637282608415, + "learning_rate": 0.00013850848285411994, + "loss": 0.8239, + "step": 123 + }, + { + "epoch": 0.3968, + "grad_norm": 0.35268751443414337, + "learning_rate": 0.000137546377942393, + "loss": 0.8348, + "step": 124 + }, + { + "epoch": 0.4, + "grad_norm": 0.39291285524686237, + "learning_rate": 0.00013658021000102636, + "loss": 0.8241, + "step": 125 + }, + { + "epoch": 0.4032, + "grad_norm": 0.34506213306561334, + "learning_rate": 0.00013561008358255468, + "loss": 0.7677, + "step": 126 + }, + { + "epoch": 0.4064, + "grad_norm": 0.4203851880192952, + "learning_rate": 0.00013463610366787392, + "loss": 0.8461, + "step": 127 + }, + { + "epoch": 0.4096, + "grad_norm": 0.4474038709406084, + "learning_rate": 0.00013365837565488064, + "loss": 0.8768, + "step": 128 + }, + { + "epoch": 0.4128, + "grad_norm": 0.3778914426775528, + "learning_rate": 0.0001326770053470668, + "loss": 0.8762, + "step": 129 + }, + { + "epoch": 0.416, + "grad_norm": 0.3583604676991779, + "learning_rate": 0.0001316920989420703, + "loss": 0.749, + "step": 130 + }, + { + "epoch": 0.4192, + "grad_norm": 0.3201977553502979, + "learning_rate": 0.00013070376302018287, + "loss": 0.771, + "step": 131 + }, + { + "epoch": 0.4224, + "grad_norm": 0.327874635613636, + "learning_rate": 0.00012971210453281674, + "loss": 0.7247, + "step": 132 + }, + { + "epoch": 0.4256, + "grad_norm": 0.36654127787678314, + "learning_rate": 0.000128717230790931, + "loss": 0.8057, + "step": 133 + }, + { + "epoch": 0.4288, + "grad_norm": 0.37665903879585244, + "learning_rate": 0.00012771924945341906, + "loss": 0.7767, + "step": 134 + }, + { + "epoch": 0.432, + "grad_norm": 0.4034764628456416, + "learning_rate": 0.00012671826851545851, + "loss": 0.8348, + "step": 135 + }, + { + "epoch": 0.4352, + "grad_norm": 0.3907539802126158, + "learning_rate": 0.0001257143962968246, + "loss": 0.8872, + "step": 136 + }, + { + "epoch": 0.4384, + "grad_norm": 0.3569846944905879, + "learning_rate": 0.00012470774143016853, + "loss": 0.8112, + "step": 137 + }, + { + "epoch": 0.4416, + "grad_norm": 0.36188126942655424, + "learning_rate": 0.00012369841284926188, + "loss": 0.8019, + "step": 138 + }, + { + "epoch": 0.4448, + "grad_norm": 0.379019802148045, + "learning_rate": 0.00012268651977720866, + "loss": 0.7994, + "step": 139 + }, + { + "epoch": 0.448, + "grad_norm": 0.41234904215035534, + "learning_rate": 0.00012167217171462566, + "loss": 0.8809, + "step": 140 + }, + { + "epoch": 0.4512, + "grad_norm": 0.38868434007296393, + "learning_rate": 0.0001206554784277931, + "loss": 0.7578, + "step": 141 + }, + { + "epoch": 0.4544, + "grad_norm": 0.350346834606368, + "learning_rate": 0.00011963654993677645, + "loss": 0.8003, + "step": 142 + }, + { + "epoch": 0.4576, + "grad_norm": 0.4135182041369236, + "learning_rate": 0.00011861549650352069, + "loss": 0.7891, + "step": 143 + }, + { + "epoch": 0.4608, + "grad_norm": 0.3447428610535347, + "learning_rate": 0.00011759242861991855, + "loss": 0.7747, + "step": 144 + }, + { + "epoch": 0.464, + "grad_norm": 0.3659043229276162, + "learning_rate": 0.00011656745699585371, + "loss": 0.7847, + "step": 145 + }, + { + "epoch": 0.4672, + "grad_norm": 0.3627598928666497, + "learning_rate": 0.00011554069254722051, + "loss": 0.8729, + "step": 146 + }, + { + "epoch": 0.4704, + "grad_norm": 0.3316910349747776, + "learning_rate": 0.00011451224638392129, + "loss": 0.7812, + "step": 147 + }, + { + "epoch": 0.4736, + "grad_norm": 0.3946570922891007, + "learning_rate": 0.00011348222979784289, + "loss": 0.7865, + "step": 148 + }, + { + "epoch": 0.4768, + "grad_norm": 0.3359636799382807, + "learning_rate": 0.00011245075425081328, + "loss": 0.7965, + "step": 149 + }, + { + "epoch": 0.48, + "grad_norm": 0.37137439376427284, + "learning_rate": 0.00011141793136253986, + "loss": 0.849, + "step": 150 + }, + { + "epoch": 0.4832, + "grad_norm": 0.3574031642165924, + "learning_rate": 0.0001103838728985307, + "loss": 0.825, + "step": 151 + }, + { + "epoch": 0.4864, + "grad_norm": 0.38556543326751924, + "learning_rate": 0.000109348690758, + "loss": 0.8379, + "step": 152 + }, + { + "epoch": 0.4896, + "grad_norm": 0.38805213476282924, + "learning_rate": 0.00010831249696175918, + "loss": 0.8097, + "step": 153 + }, + { + "epoch": 0.4928, + "grad_norm": 0.3574217944877886, + "learning_rate": 0.0001072754036400944, + "loss": 0.7886, + "step": 154 + }, + { + "epoch": 0.496, + "grad_norm": 0.39271223057544774, + "learning_rate": 0.00010623752302063283, + "loss": 0.8542, + "step": 155 + }, + { + "epoch": 0.4992, + "grad_norm": 0.4324299524551468, + "learning_rate": 0.00010519896741619803, + "loss": 0.8882, + "step": 156 + }, + { + "epoch": 0.5024, + "grad_norm": 0.32450275667877587, + "learning_rate": 0.00010415984921265609, + "loss": 0.7915, + "step": 157 + }, + { + "epoch": 0.5056, + "grad_norm": 0.37732713298950105, + "learning_rate": 0.00010312028085675391, + "loss": 0.7915, + "step": 158 + }, + { + "epoch": 0.5088, + "grad_norm": 0.3406928749347663, + "learning_rate": 0.00010208037484395114, + "loss": 0.817, + "step": 159 + }, + { + "epoch": 0.512, + "grad_norm": 0.3751715626743408, + "learning_rate": 0.00010104024370624644, + "loss": 0.8437, + "step": 160 + }, + { + "epoch": 0.5152, + "grad_norm": 0.33757211373718954, + "learning_rate": 0.0001, + "loss": 0.7485, + "step": 161 + }, + { + "epoch": 0.5184, + "grad_norm": 0.46079705496359785, + "learning_rate": 9.895975629375359e-05, + "loss": 0.8788, + "step": 162 + }, + { + "epoch": 0.5216, + "grad_norm": 0.4271610568124271, + "learning_rate": 9.791962515604887e-05, + "loss": 0.8616, + "step": 163 + }, + { + "epoch": 0.5248, + "grad_norm": 0.3730244116708226, + "learning_rate": 9.687971914324607e-05, + "loss": 0.713, + "step": 164 + }, + { + "epoch": 0.528, + "grad_norm": 0.32767683574358164, + "learning_rate": 9.584015078734395e-05, + "loss": 0.7503, + "step": 165 + }, + { + "epoch": 0.5312, + "grad_norm": 0.32244567576195554, + "learning_rate": 9.480103258380198e-05, + "loss": 0.7239, + "step": 166 + }, + { + "epoch": 0.5344, + "grad_norm": 0.3397126044271277, + "learning_rate": 9.376247697936719e-05, + "loss": 0.7984, + "step": 167 + }, + { + "epoch": 0.5376, + "grad_norm": 0.3012960591552935, + "learning_rate": 9.272459635990562e-05, + "loss": 0.7531, + "step": 168 + }, + { + "epoch": 0.5408, + "grad_norm": 0.37938450909301, + "learning_rate": 9.168750303824084e-05, + "loss": 0.7915, + "step": 169 + }, + { + "epoch": 0.544, + "grad_norm": 0.38701381132864615, + "learning_rate": 9.065130924199998e-05, + "loss": 0.7544, + "step": 170 + }, + { + "epoch": 0.5472, + "grad_norm": 0.38690509598881456, + "learning_rate": 8.961612710146934e-05, + "loss": 0.8261, + "step": 171 + }, + { + "epoch": 0.5504, + "grad_norm": 0.3662178194553636, + "learning_rate": 8.858206863746018e-05, + "loss": 0.7712, + "step": 172 + }, + { + "epoch": 0.5536, + "grad_norm": 0.325465438950959, + "learning_rate": 8.754924574918675e-05, + "loss": 0.7461, + "step": 173 + }, + { + "epoch": 0.5568, + "grad_norm": 0.36372519207365633, + "learning_rate": 8.651777020215712e-05, + "loss": 0.7577, + "step": 174 + }, + { + "epoch": 0.56, + "grad_norm": 0.39791618344605173, + "learning_rate": 8.548775361607872e-05, + "loss": 0.8952, + "step": 175 + }, + { + "epoch": 0.5632, + "grad_norm": 0.39106877210397273, + "learning_rate": 8.445930745277953e-05, + "loss": 0.8067, + "step": 176 + }, + { + "epoch": 0.5664, + "grad_norm": 0.32165292969442727, + "learning_rate": 8.343254300414628e-05, + "loss": 0.704, + "step": 177 + }, + { + "epoch": 0.5696, + "grad_norm": 0.36154222910864736, + "learning_rate": 8.240757138008149e-05, + "loss": 0.8184, + "step": 178 + }, + { + "epoch": 0.5728, + "grad_norm": 0.42719982445728705, + "learning_rate": 8.138450349647936e-05, + "loss": 0.8165, + "step": 179 + }, + { + "epoch": 0.576, + "grad_norm": 0.5246656865774878, + "learning_rate": 8.036345006322359e-05, + "loss": 0.9235, + "step": 180 + }, + { + "epoch": 0.5792, + "grad_norm": 0.3889988384699533, + "learning_rate": 7.934452157220694e-05, + "loss": 0.8564, + "step": 181 + }, + { + "epoch": 0.5824, + "grad_norm": 0.34293279991160774, + "learning_rate": 7.832782828537437e-05, + "loss": 0.7917, + "step": 182 + }, + { + "epoch": 0.5856, + "grad_norm": 0.43448804859625373, + "learning_rate": 7.731348022279134e-05, + "loss": 0.8397, + "step": 183 + }, + { + "epoch": 0.5888, + "grad_norm": 0.37332421671048194, + "learning_rate": 7.630158715073813e-05, + "loss": 0.847, + "step": 184 + }, + { + "epoch": 0.592, + "grad_norm": 0.2847408908726539, + "learning_rate": 7.52922585698315e-05, + "loss": 0.7008, + "step": 185 + }, + { + "epoch": 0.5952, + "grad_norm": 0.3793630586511894, + "learning_rate": 7.428560370317542e-05, + "loss": 0.8073, + "step": 186 + }, + { + "epoch": 0.5984, + "grad_norm": 0.3314978495730383, + "learning_rate": 7.328173148454151e-05, + "loss": 0.7501, + "step": 187 + }, + { + "epoch": 0.6016, + "grad_norm": 0.38192244839668066, + "learning_rate": 7.228075054658096e-05, + "loss": 0.8806, + "step": 188 + }, + { + "epoch": 0.6048, + "grad_norm": 0.35144673379944147, + "learning_rate": 7.1282769209069e-05, + "loss": 0.7983, + "step": 189 + }, + { + "epoch": 0.608, + "grad_norm": 0.3320272570958794, + "learning_rate": 7.028789546718326e-05, + "loss": 0.7778, + "step": 190 + }, + { + "epoch": 0.6112, + "grad_norm": 0.3245100936187699, + "learning_rate": 6.929623697981718e-05, + "loss": 0.7638, + "step": 191 + }, + { + "epoch": 0.6144, + "grad_norm": 0.3487129086320174, + "learning_rate": 6.830790105792973e-05, + "loss": 0.771, + "step": 192 + }, + { + "epoch": 0.6176, + "grad_norm": 0.35931606259571913, + "learning_rate": 6.732299465293322e-05, + "loss": 0.7663, + "step": 193 + }, + { + "epoch": 0.6208, + "grad_norm": 0.3367390991318867, + "learning_rate": 6.63416243451194e-05, + "loss": 0.7966, + "step": 194 + }, + { + "epoch": 0.624, + "grad_norm": 0.33183255910992066, + "learning_rate": 6.536389633212609e-05, + "loss": 0.78, + "step": 195 + }, + { + "epoch": 0.6272, + "grad_norm": 0.3264528270630262, + "learning_rate": 6.43899164174453e-05, + "loss": 0.7772, + "step": 196 + }, + { + "epoch": 0.6304, + "grad_norm": 0.3517378205790495, + "learning_rate": 6.341978999897365e-05, + "loss": 0.7642, + "step": 197 + }, + { + "epoch": 0.6336, + "grad_norm": 0.431338962220006, + "learning_rate": 6.245362205760704e-05, + "loss": 0.7927, + "step": 198 + }, + { + "epoch": 0.6368, + "grad_norm": 0.3731631592093385, + "learning_rate": 6.149151714588009e-05, + "loss": 0.8503, + "step": 199 + }, + { + "epoch": 0.64, + "grad_norm": 0.33909105312590193, + "learning_rate": 6.053357937665237e-05, + "loss": 0.7154, + "step": 200 + }, + { + "epoch": 0.6432, + "grad_norm": 0.36200420260594113, + "learning_rate": 5.957991241184184e-05, + "loss": 0.7569, + "step": 201 + }, + { + "epoch": 0.6464, + "grad_norm": 0.4149004455820632, + "learning_rate": 5.863061945120719e-05, + "loss": 0.8286, + "step": 202 + }, + { + "epoch": 0.6496, + "grad_norm": 0.32292850370856624, + "learning_rate": 5.768580322118034e-05, + "loss": 0.8015, + "step": 203 + }, + { + "epoch": 0.6528, + "grad_norm": 0.3771935747682809, + "learning_rate": 5.6745565963749925e-05, + "loss": 0.803, + "step": 204 + }, + { + "epoch": 0.656, + "grad_norm": 0.3541848583253109, + "learning_rate": 5.5810009425397294e-05, + "loss": 0.8545, + "step": 205 + }, + { + "epoch": 0.6592, + "grad_norm": 0.44832715565151354, + "learning_rate": 5.487923484608629e-05, + "loss": 0.8812, + "step": 206 + }, + { + "epoch": 0.6624, + "grad_norm": 0.37472326339826867, + "learning_rate": 5.395334294830765e-05, + "loss": 0.9129, + "step": 207 + }, + { + "epoch": 0.6656, + "grad_norm": 0.35228666839882417, + "learning_rate": 5.3032433926179395e-05, + "loss": 0.7832, + "step": 208 + }, + { + "epoch": 0.6688, + "grad_norm": 0.3350939709150189, + "learning_rate": 5.211660743460458e-05, + "loss": 0.7491, + "step": 209 + }, + { + "epoch": 0.672, + "grad_norm": 0.38948109383274765, + "learning_rate": 5.1205962578487155e-05, + "loss": 0.7651, + "step": 210 + }, + { + "epoch": 0.6752, + "grad_norm": 0.32094235443298985, + "learning_rate": 5.030059790200756e-05, + "loss": 0.769, + "step": 211 + }, + { + "epoch": 0.6784, + "grad_norm": 0.30333546694103175, + "learning_rate": 4.940061137795876e-05, + "loss": 0.6676, + "step": 212 + }, + { + "epoch": 0.6816, + "grad_norm": 0.36011644742045107, + "learning_rate": 4.850610039714444e-05, + "loss": 0.7948, + "step": 213 + }, + { + "epoch": 0.6848, + "grad_norm": 0.4501270149702274, + "learning_rate": 4.761716175783989e-05, + "loss": 0.8075, + "step": 214 + }, + { + "epoch": 0.688, + "grad_norm": 0.32443270170477734, + "learning_rate": 4.673389165531714e-05, + "loss": 0.7327, + "step": 215 + }, + { + "epoch": 0.6912, + "grad_norm": 0.35437103043306173, + "learning_rate": 4.585638567143529e-05, + "loss": 0.7306, + "step": 216 + }, + { + "epoch": 0.6944, + "grad_norm": 0.32026538320410597, + "learning_rate": 4.498473876429726e-05, + "loss": 0.7387, + "step": 217 + }, + { + "epoch": 0.6976, + "grad_norm": 0.3565566202307532, + "learning_rate": 4.411904525797408e-05, + "loss": 0.7722, + "step": 218 + }, + { + "epoch": 0.7008, + "grad_norm": 0.3770279902011318, + "learning_rate": 4.325939883229766e-05, + "loss": 0.8262, + "step": 219 + }, + { + "epoch": 0.704, + "grad_norm": 0.3225427774272655, + "learning_rate": 4.240589251272342e-05, + "loss": 0.7808, + "step": 220 + }, + { + "epoch": 0.7072, + "grad_norm": 0.3643305874741898, + "learning_rate": 4.155861866026364e-05, + "loss": 0.6751, + "step": 221 + }, + { + "epoch": 0.7104, + "grad_norm": 0.3495365472341216, + "learning_rate": 4.071766896149273e-05, + "loss": 0.7634, + "step": 222 + }, + { + "epoch": 0.7136, + "grad_norm": 0.3865725013932627, + "learning_rate": 3.988313441862553e-05, + "loss": 0.8201, + "step": 223 + }, + { + "epoch": 0.7168, + "grad_norm": 0.39371326689707414, + "learning_rate": 3.9055105339669595e-05, + "loss": 0.783, + "step": 224 + }, + { + "epoch": 0.72, + "grad_norm": 0.3343399455415386, + "learning_rate": 3.823367132865265e-05, + "loss": 0.7558, + "step": 225 + }, + { + "epoch": 0.7232, + "grad_norm": 0.37018368726419704, + "learning_rate": 3.741892127592625e-05, + "loss": 0.8034, + "step": 226 + }, + { + "epoch": 0.7264, + "grad_norm": 0.2917671574400929, + "learning_rate": 3.6610943348546526e-05, + "loss": 0.7608, + "step": 227 + }, + { + "epoch": 0.7296, + "grad_norm": 0.3149055123541856, + "learning_rate": 3.580982498073344e-05, + "loss": 0.7376, + "step": 228 + }, + { + "epoch": 0.7328, + "grad_norm": 0.48194732886134795, + "learning_rate": 3.501565286440914e-05, + "loss": 0.8543, + "step": 229 + }, + { + "epoch": 0.736, + "grad_norm": 0.3201762925229535, + "learning_rate": 3.422851293981676e-05, + "loss": 0.7311, + "step": 230 + }, + { + "epoch": 0.7392, + "grad_norm": 0.3832969139432672, + "learning_rate": 3.3448490386220355e-05, + "loss": 0.8361, + "step": 231 + }, + { + "epoch": 0.7424, + "grad_norm": 0.3591778802980467, + "learning_rate": 3.2675669612687565e-05, + "loss": 0.8246, + "step": 232 + }, + { + "epoch": 0.7456, + "grad_norm": 0.33711687357568193, + "learning_rate": 3.191013424895536e-05, + "loss": 0.7911, + "step": 233 + }, + { + "epoch": 0.7488, + "grad_norm": 0.3992050715809478, + "learning_rate": 3.115196713638e-05, + "loss": 0.784, + "step": 234 + }, + { + "epoch": 0.752, + "grad_norm": 0.3717791655175765, + "learning_rate": 3.040125031897264e-05, + "loss": 0.8544, + "step": 235 + }, + { + "epoch": 0.7552, + "grad_norm": 0.3266902771814692, + "learning_rate": 2.9658065034520978e-05, + "loss": 0.758, + "step": 236 + }, + { + "epoch": 0.7584, + "grad_norm": 0.35563021204215095, + "learning_rate": 2.892249170579826e-05, + "loss": 0.7699, + "step": 237 + }, + { + "epoch": 0.7616, + "grad_norm": 0.379714101588036, + "learning_rate": 2.8194609931860316e-05, + "loss": 0.8409, + "step": 238 + }, + { + "epoch": 0.7648, + "grad_norm": 0.47451149824579747, + "learning_rate": 2.7474498479432087e-05, + "loss": 0.8289, + "step": 239 + }, + { + "epoch": 0.768, + "grad_norm": 0.3188599450071592, + "learning_rate": 2.6762235274383772e-05, + "loss": 0.7431, + "step": 240 + }, + { + "epoch": 0.7712, + "grad_norm": 0.3537473485993502, + "learning_rate": 2.6057897393298324e-05, + "loss": 0.7519, + "step": 241 + }, + { + "epoch": 0.7744, + "grad_norm": 0.35260123842359997, + "learning_rate": 2.536156105513062e-05, + "loss": 0.8052, + "step": 242 + }, + { + "epoch": 0.7776, + "grad_norm": 0.3782134918689213, + "learning_rate": 2.4673301612959654e-05, + "loss": 0.8498, + "step": 243 + }, + { + "epoch": 0.7808, + "grad_norm": 0.4240797008066036, + "learning_rate": 2.399319354583418e-05, + "loss": 0.8504, + "step": 244 + }, + { + "epoch": 0.784, + "grad_norm": 0.3493922027357962, + "learning_rate": 2.3321310450713062e-05, + "loss": 0.748, + "step": 245 + }, + { + "epoch": 0.7872, + "grad_norm": 0.3109975196467063, + "learning_rate": 2.265772503450122e-05, + "loss": 0.7332, + "step": 246 + }, + { + "epoch": 0.7904, + "grad_norm": 0.3490386685549296, + "learning_rate": 2.2002509106181624e-05, + "loss": 0.7745, + "step": 247 + }, + { + "epoch": 0.7936, + "grad_norm": 0.36175624431520176, + "learning_rate": 2.1355733569044635e-05, + "loss": 0.7388, + "step": 248 + }, + { + "epoch": 0.7968, + "grad_norm": 0.4367153475876239, + "learning_rate": 2.0717468413015283e-05, + "loss": 0.8179, + "step": 249 + }, + { + "epoch": 0.8, + "grad_norm": 0.39966660383957925, + "learning_rate": 2.008778270707944e-05, + "loss": 0.7862, + "step": 250 + }, + { + "epoch": 0.8032, + "grad_norm": 0.39454280119505963, + "learning_rate": 1.946674459180955e-05, + "loss": 0.7943, + "step": 251 + }, + { + "epoch": 0.8064, + "grad_norm": 0.34240338202982135, + "learning_rate": 1.8854421271990964e-05, + "loss": 0.7764, + "step": 252 + }, + { + "epoch": 0.8096, + "grad_norm": 0.3836219267710939, + "learning_rate": 1.8250879009349398e-05, + "loss": 0.7249, + "step": 253 + }, + { + "epoch": 0.8128, + "grad_norm": 0.3545049008632808, + "learning_rate": 1.7656183115380577e-05, + "loss": 0.745, + "step": 254 + }, + { + "epoch": 0.816, + "grad_norm": 0.4782319117565856, + "learning_rate": 1.707039794428259e-05, + "loss": 0.9546, + "step": 255 + }, + { + "epoch": 0.8192, + "grad_norm": 0.30912276688779844, + "learning_rate": 1.649358688599191e-05, + "loss": 0.7098, + "step": 256 + }, + { + "epoch": 0.8224, + "grad_norm": 0.3564793276918867, + "learning_rate": 1.5925812359323745e-05, + "loss": 0.7553, + "step": 257 + }, + { + "epoch": 0.8256, + "grad_norm": 0.34525121870000597, + "learning_rate": 1.5367135805217458e-05, + "loss": 0.7499, + "step": 258 + }, + { + "epoch": 0.8288, + "grad_norm": 0.3349420082155695, + "learning_rate": 1.4817617680087825e-05, + "loss": 0.805, + "step": 259 + }, + { + "epoch": 0.832, + "grad_norm": 0.433540567668089, + "learning_rate": 1.4277317449282834e-05, + "loss": 0.877, + "step": 260 + }, + { + "epoch": 0.8352, + "grad_norm": 0.3801150070687427, + "learning_rate": 1.3746293580648717e-05, + "loss": 0.8694, + "step": 261 + }, + { + "epoch": 0.8384, + "grad_norm": 0.3738768456908641, + "learning_rate": 1.3224603538202929e-05, + "loss": 0.7737, + "step": 262 + }, + { + "epoch": 0.8416, + "grad_norm": 0.3801506817792511, + "learning_rate": 1.2712303775915802e-05, + "loss": 0.7256, + "step": 263 + }, + { + "epoch": 0.8448, + "grad_norm": 0.29294064483194054, + "learning_rate": 1.220944973160133e-05, + "loss": 0.7278, + "step": 264 + }, + { + "epoch": 0.848, + "grad_norm": 0.3574084890505441, + "learning_rate": 1.1716095820918216e-05, + "loss": 0.8017, + "step": 265 + }, + { + "epoch": 0.8512, + "grad_norm": 0.44325322418927066, + "learning_rate": 1.1232295431481222e-05, + "loss": 0.8188, + "step": 266 + }, + { + "epoch": 0.8544, + "grad_norm": 0.33465511889302607, + "learning_rate": 1.0758100917083991e-05, + "loss": 0.7404, + "step": 267 + }, + { + "epoch": 0.8576, + "grad_norm": 0.35582143701746155, + "learning_rate": 1.0293563592033595e-05, + "loss": 0.828, + "step": 268 + }, + { + "epoch": 0.8608, + "grad_norm": 0.32908366694518915, + "learning_rate": 9.838733725597615e-06, + "loss": 0.7496, + "step": 269 + }, + { + "epoch": 0.864, + "grad_norm": 0.3923286422615176, + "learning_rate": 9.393660536564408e-06, + "loss": 0.7605, + "step": 270 + }, + { + "epoch": 0.8672, + "grad_norm": 0.32871379992022076, + "learning_rate": 8.958392187916841e-06, + "loss": 0.7252, + "step": 271 + }, + { + "epoch": 0.8704, + "grad_norm": 0.4464878317171815, + "learning_rate": 8.532975781620512e-06, + "loss": 0.7487, + "step": 272 + }, + { + "epoch": 0.8736, + "grad_norm": 0.37431391710968503, + "learning_rate": 8.117457353526625e-06, + "loss": 0.7393, + "step": 273 + }, + { + "epoch": 0.8768, + "grad_norm": 0.31383965433569194, + "learning_rate": 7.711881868390291e-06, + "loss": 0.7417, + "step": 274 + }, + { + "epoch": 0.88, + "grad_norm": 0.40622594368739234, + "learning_rate": 7.3162932150046885e-06, + "loss": 0.8212, + "step": 275 + }, + { + "epoch": 0.8832, + "grad_norm": 0.312688075077189, + "learning_rate": 6.930734201451816e-06, + "loss": 0.6931, + "step": 276 + }, + { + "epoch": 0.8864, + "grad_norm": 0.2951795837647385, + "learning_rate": 6.555246550469907e-06, + "loss": 0.7062, + "step": 277 + }, + { + "epoch": 0.8896, + "grad_norm": 0.3906241449752177, + "learning_rate": 6.189870894938587e-06, + "loss": 0.8502, + "step": 278 + }, + { + "epoch": 0.8928, + "grad_norm": 0.33976393181898756, + "learning_rate": 5.834646773481811e-06, + "loss": 0.7466, + "step": 279 + }, + { + "epoch": 0.896, + "grad_norm": 0.3645631566569671, + "learning_rate": 5.489612626189245e-06, + "loss": 0.8331, + "step": 280 + }, + { + "epoch": 0.8992, + "grad_norm": 0.32171725161543063, + "learning_rate": 5.154805790456485e-06, + "loss": 0.7301, + "step": 281 + }, + { + "epoch": 0.9024, + "grad_norm": 0.3215222867159588, + "learning_rate": 4.830262496944693e-06, + "loss": 0.7803, + "step": 282 + }, + { + "epoch": 0.9056, + "grad_norm": 0.3930936220455362, + "learning_rate": 4.516017865659949e-06, + "loss": 0.8015, + "step": 283 + }, + { + "epoch": 0.9088, + "grad_norm": 0.4009757739317676, + "learning_rate": 4.21210590215273e-06, + "loss": 0.7702, + "step": 284 + }, + { + "epoch": 0.912, + "grad_norm": 0.3509902652942518, + "learning_rate": 3.918559493838114e-06, + "loss": 0.8313, + "step": 285 + }, + { + "epoch": 0.9152, + "grad_norm": 0.4823527769540213, + "learning_rate": 3.6354104064368566e-06, + "loss": 0.7737, + "step": 286 + }, + { + "epoch": 0.9184, + "grad_norm": 0.37189019955946206, + "learning_rate": 3.3626892805379562e-06, + "loss": 0.7485, + "step": 287 + }, + { + "epoch": 0.9216, + "grad_norm": 0.3929380712488443, + "learning_rate": 3.100425628282899e-06, + "loss": 0.7541, + "step": 288 + }, + { + "epoch": 0.9248, + "grad_norm": 0.3230172315650001, + "learning_rate": 2.848647830172024e-06, + "loss": 0.7514, + "step": 289 + }, + { + "epoch": 0.928, + "grad_norm": 0.331298596116422, + "learning_rate": 2.607383131993424e-06, + "loss": 0.7476, + "step": 290 + }, + { + "epoch": 0.9312, + "grad_norm": 0.41208820254007095, + "learning_rate": 2.3766576418745022e-06, + "loss": 0.8253, + "step": 291 + }, + { + "epoch": 0.9344, + "grad_norm": 0.3540797011234827, + "learning_rate": 2.1564963274568027e-06, + "loss": 0.7754, + "step": 292 + }, + { + "epoch": 0.9376, + "grad_norm": 0.40367747859127645, + "learning_rate": 1.9469230131940907e-06, + "loss": 0.8121, + "step": 293 + }, + { + "epoch": 0.9408, + "grad_norm": 0.36985043876259127, + "learning_rate": 1.7479603777742938e-06, + "loss": 0.7875, + "step": 294 + }, + { + "epoch": 0.944, + "grad_norm": 0.3757213402313114, + "learning_rate": 1.559629951665298e-06, + "loss": 0.7626, + "step": 295 + }, + { + "epoch": 0.9472, + "grad_norm": 0.3519319526157233, + "learning_rate": 1.3819521147851123e-06, + "loss": 0.8127, + "step": 296 + }, + { + "epoch": 0.9504, + "grad_norm": 0.3733908580196815, + "learning_rate": 1.2149460942964098e-06, + "loss": 0.7547, + "step": 297 + }, + { + "epoch": 0.9536, + "grad_norm": 0.36965768774397506, + "learning_rate": 1.05862996252597e-06, + "loss": 0.7968, + "step": 298 + }, + { + "epoch": 0.9568, + "grad_norm": 0.392672840473129, + "learning_rate": 9.130206350089765e-07, + "loss": 0.7971, + "step": 299 + }, + { + "epoch": 0.96, + "grad_norm": 0.35145171210243764, + "learning_rate": 7.781338686584927e-07, + "loss": 0.7712, + "step": 300 + }, + { + "epoch": 0.9632, + "grad_norm": 0.3557470819008264, + "learning_rate": 6.539842600603918e-07, + "loss": 0.8122, + "step": 301 + }, + { + "epoch": 0.9664, + "grad_norm": 0.39875493221618996, + "learning_rate": 5.405852438937764e-07, + "loss": 0.7222, + "step": 302 + }, + { + "epoch": 0.9696, + "grad_norm": 0.3398025671483795, + "learning_rate": 4.3794909147720773e-07, + "loss": 0.7241, + "step": 303 + }, + { + "epoch": 0.9728, + "grad_norm": 0.44734223609044316, + "learning_rate": 3.4608690944071263e-07, + "loss": 0.8064, + "step": 304 + }, + { + "epoch": 0.976, + "grad_norm": 0.36261141044117756, + "learning_rate": 2.6500863852395584e-07, + "loss": 0.7815, + "step": 305 + }, + { + "epoch": 0.9792, + "grad_norm": 1.1171234591773465, + "learning_rate": 1.947230525005006e-07, + "loss": 0.7992, + "step": 306 + }, + { + "epoch": 0.9824, + "grad_norm": 0.3255664161802146, + "learning_rate": 1.3523775722834587e-07, + "loss": 0.7498, + "step": 307 + }, + { + "epoch": 0.9856, + "grad_norm": 0.34989009087457607, + "learning_rate": 8.655918982689581e-08, + "loss": 0.8037, + "step": 308 + }, + { + "epoch": 0.9888, + "grad_norm": 0.33217965843871977, + "learning_rate": 4.8692617980350406e-08, + "loss": 0.8156, + "step": 309 + }, + { + "epoch": 0.992, + "grad_norm": 0.3603261858382364, + "learning_rate": 2.164213936770576e-08, + "loss": 0.7071, + "step": 310 + }, + { + "epoch": 0.9952, + "grad_norm": 0.3558631704357258, + "learning_rate": 5.410681219286673e-09, + "loss": 0.742, + "step": 311 + }, + { + "epoch": 0.9984, + "grad_norm": 0.3200726372631764, + "learning_rate": 0.0, + "loss": 0.7306, + "step": 312 + }, + { + "epoch": 0.9984, + "step": 312, + "total_flos": 739882583326720.0, + "train_loss": 0.8309416449986972, + "train_runtime": 9163.8195, + "train_samples_per_second": 1.091, + "train_steps_per_second": 0.034 + } + ], + "logging_steps": 1.0, + "max_steps": 312, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 739882583326720.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_1_epochs_1_GA_2_lora/README.md b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_1_epochs_1_GA_2_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_1_epochs_1_GA_2_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_1_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_1_epochs_1_GA_2_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e03b45cf925e222cf06ea1a1800cc06b498db25f --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_1_epochs_1_GA_2_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj", + "gate_proj", + "k_proj", + "o_proj", + "down_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a927f46d45d5d66d44ba52e15be6346557d16204 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:807df09410a8bac8441b10edd939efedcbdf49e2206c5d73aed6963a3087c65e +size 671150064 diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_1_epochs_1_GA_2_lora/config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_1_epochs_1_GA_2_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_1_epochs_1_GA_2_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..2c15821cbe92871abfc72a15ea48fb61c63f13ec --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a957a55853852e70c4e4559e96f956dcefc8993421eea81bbc558164c8b426a +size 918507402 diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_1_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_1_epochs_1_GA_2_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..8246e58fb9fa0d592b4db2af835e1500da02c9f0 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_1_epochs_1_GA_2_lora/trainer_state.json @@ -0,0 +1,8792 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 1250, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0008, + "grad_norm": 0.8254008371684874, + "learning_rate": 5.263157894736842e-06, + "loss": 1.2336, + "step": 1 + }, + { + "epoch": 0.0016, + "grad_norm": 0.9001692078599441, + "learning_rate": 1.0526315789473684e-05, + "loss": 1.1506, + "step": 2 + }, + { + "epoch": 0.0024, + "grad_norm": 1.1122164974039503, + "learning_rate": 1.5789473684210526e-05, + "loss": 1.4268, + "step": 3 + }, + { + "epoch": 0.0032, + "grad_norm": 0.8966554613374759, + "learning_rate": 2.105263157894737e-05, + "loss": 1.4789, + "step": 4 + }, + { + "epoch": 0.004, + "grad_norm": 0.8525456477268124, + "learning_rate": 2.6315789473684212e-05, + "loss": 1.1729, + "step": 5 + }, + { + "epoch": 0.0048, + "grad_norm": 0.7669096680011646, + "learning_rate": 3.157894736842105e-05, + "loss": 1.354, + "step": 6 + }, + { + "epoch": 0.0056, + "grad_norm": 0.7345567162332799, + "learning_rate": 3.6842105263157895e-05, + "loss": 1.2388, + "step": 7 + }, + { + "epoch": 0.0064, + "grad_norm": 0.9536980255199109, + "learning_rate": 4.210526315789474e-05, + "loss": 1.4101, + "step": 8 + }, + { + "epoch": 0.0072, + "grad_norm": 0.9415953438208667, + "learning_rate": 4.736842105263158e-05, + "loss": 1.145, + "step": 9 + }, + { + "epoch": 0.008, + "grad_norm": 0.8529883237546517, + "learning_rate": 5.2631578947368424e-05, + "loss": 1.1384, + "step": 10 + }, + { + "epoch": 0.0088, + "grad_norm": 1.126915375873681, + "learning_rate": 5.789473684210527e-05, + "loss": 1.2245, + "step": 11 + }, + { + "epoch": 0.0096, + "grad_norm": 0.9167812550021163, + "learning_rate": 6.31578947368421e-05, + "loss": 1.127, + "step": 12 + }, + { + "epoch": 0.0104, + "grad_norm": 0.8528831790324976, + "learning_rate": 6.842105263157895e-05, + "loss": 1.0553, + "step": 13 + }, + { + "epoch": 0.0112, + "grad_norm": 0.9532646265952134, + "learning_rate": 7.368421052631579e-05, + "loss": 1.1337, + "step": 14 + }, + { + "epoch": 0.012, + "grad_norm": 0.880662805694576, + "learning_rate": 7.894736842105263e-05, + "loss": 1.1473, + "step": 15 + }, + { + "epoch": 0.0128, + "grad_norm": 0.9702786952678096, + "learning_rate": 8.421052631578948e-05, + "loss": 0.9314, + "step": 16 + }, + { + "epoch": 0.0136, + "grad_norm": 0.7029363111637975, + "learning_rate": 8.947368421052632e-05, + "loss": 0.9208, + "step": 17 + }, + { + "epoch": 0.0144, + "grad_norm": 0.5266694454511093, + "learning_rate": 9.473684210526316e-05, + "loss": 0.8928, + "step": 18 + }, + { + "epoch": 0.0152, + "grad_norm": 0.682516304944438, + "learning_rate": 0.0001, + "loss": 0.9943, + "step": 19 + }, + { + "epoch": 0.016, + "grad_norm": 0.5953207994894946, + "learning_rate": 0.00010526315789473685, + "loss": 0.8949, + "step": 20 + }, + { + "epoch": 0.0168, + "grad_norm": 0.6906825492076013, + "learning_rate": 0.0001105263157894737, + "loss": 1.1288, + "step": 21 + }, + { + "epoch": 0.0176, + "grad_norm": 0.671075887310189, + "learning_rate": 0.00011578947368421053, + "loss": 0.9903, + "step": 22 + }, + { + "epoch": 0.0184, + "grad_norm": 0.5100061823856961, + "learning_rate": 0.00012105263157894738, + "loss": 0.8214, + "step": 23 + }, + { + "epoch": 0.0192, + "grad_norm": 0.5954574516082615, + "learning_rate": 0.0001263157894736842, + "loss": 0.8619, + "step": 24 + }, + { + "epoch": 0.02, + "grad_norm": 0.4735325474803045, + "learning_rate": 0.00013157894736842108, + "loss": 0.794, + "step": 25 + }, + { + "epoch": 0.0208, + "grad_norm": 0.5792859050154008, + "learning_rate": 0.0001368421052631579, + "loss": 0.8778, + "step": 26 + }, + { + "epoch": 0.0216, + "grad_norm": 0.8063892780047107, + "learning_rate": 0.00014210526315789474, + "loss": 1.0306, + "step": 27 + }, + { + "epoch": 0.0224, + "grad_norm": 0.6768891635446834, + "learning_rate": 0.00014736842105263158, + "loss": 1.0523, + "step": 28 + }, + { + "epoch": 0.0232, + "grad_norm": 0.5673538618908092, + "learning_rate": 0.00015263157894736845, + "loss": 0.9252, + "step": 29 + }, + { + "epoch": 0.024, + "grad_norm": 0.7017082041353666, + "learning_rate": 0.00015789473684210527, + "loss": 0.8449, + "step": 30 + }, + { + "epoch": 0.0248, + "grad_norm": 0.657083612318453, + "learning_rate": 0.0001631578947368421, + "loss": 0.9792, + "step": 31 + }, + { + "epoch": 0.0256, + "grad_norm": 0.5097573928562225, + "learning_rate": 0.00016842105263157895, + "loss": 0.8692, + "step": 32 + }, + { + "epoch": 0.0264, + "grad_norm": 0.7351668590114271, + "learning_rate": 0.0001736842105263158, + "loss": 1.0574, + "step": 33 + }, + { + "epoch": 0.0272, + "grad_norm": 0.45662861605853267, + "learning_rate": 0.00017894736842105264, + "loss": 0.8089, + "step": 34 + }, + { + "epoch": 0.028, + "grad_norm": 0.63199243924169, + "learning_rate": 0.00018421052631578948, + "loss": 0.9736, + "step": 35 + }, + { + "epoch": 0.0288, + "grad_norm": 0.5285096956312785, + "learning_rate": 0.00018947368421052632, + "loss": 0.863, + "step": 36 + }, + { + "epoch": 0.0296, + "grad_norm": 0.4865534099301057, + "learning_rate": 0.00019473684210526317, + "loss": 0.8491, + "step": 37 + }, + { + "epoch": 0.0304, + "grad_norm": 0.7465752627954467, + "learning_rate": 0.0002, + "loss": 1.0501, + "step": 38 + }, + { + "epoch": 0.0312, + "grad_norm": 0.6777157931085108, + "learning_rate": 0.00019999966405802826, + "loss": 0.9819, + "step": 39 + }, + { + "epoch": 0.032, + "grad_norm": 0.48012712623173426, + "learning_rate": 0.00019999865623437013, + "loss": 0.7824, + "step": 40 + }, + { + "epoch": 0.0328, + "grad_norm": 0.7025851138868436, + "learning_rate": 0.00019999697653579705, + "loss": 0.8443, + "step": 41 + }, + { + "epoch": 0.0336, + "grad_norm": 0.5804040068154181, + "learning_rate": 0.00019999462497359466, + "loss": 0.9188, + "step": 42 + }, + { + "epoch": 0.0344, + "grad_norm": 0.6174137093451777, + "learning_rate": 0.0001999916015635627, + "loss": 0.8948, + "step": 43 + }, + { + "epoch": 0.0352, + "grad_norm": 0.565237666232095, + "learning_rate": 0.00019998790632601496, + "loss": 0.8635, + "step": 44 + }, + { + "epoch": 0.036, + "grad_norm": 0.7716355173434238, + "learning_rate": 0.00019998353928577919, + "loss": 0.9928, + "step": 45 + }, + { + "epoch": 0.0368, + "grad_norm": 0.7912711363450475, + "learning_rate": 0.0001999785004721968, + "loss": 1.0359, + "step": 46 + }, + { + "epoch": 0.0376, + "grad_norm": 0.6700910817301969, + "learning_rate": 0.0001999727899191228, + "loss": 0.9516, + "step": 47 + }, + { + "epoch": 0.0384, + "grad_norm": 0.585045306082608, + "learning_rate": 0.00019996640766492543, + "loss": 0.9152, + "step": 48 + }, + { + "epoch": 0.0392, + "grad_norm": 0.5846940869630389, + "learning_rate": 0.00019995935375248606, + "loss": 0.983, + "step": 49 + }, + { + "epoch": 0.04, + "grad_norm": 0.6064015603095227, + "learning_rate": 0.00019995162822919883, + "loss": 0.9917, + "step": 50 + }, + { + "epoch": 0.0408, + "grad_norm": 0.5600487528206084, + "learning_rate": 0.00019994323114697022, + "loss": 0.9052, + "step": 51 + }, + { + "epoch": 0.0416, + "grad_norm": 0.637614919803038, + "learning_rate": 0.00019993416256221895, + "loss": 0.9627, + "step": 52 + }, + { + "epoch": 0.0424, + "grad_norm": 0.5057824749130014, + "learning_rate": 0.0001999244225358753, + "loss": 0.8209, + "step": 53 + }, + { + "epoch": 0.0432, + "grad_norm": 0.6987392056027488, + "learning_rate": 0.00019991401113338104, + "loss": 0.8004, + "step": 54 + }, + { + "epoch": 0.044, + "grad_norm": 0.5920113344336018, + "learning_rate": 0.00019990292842468868, + "loss": 0.9805, + "step": 55 + }, + { + "epoch": 0.0448, + "grad_norm": 0.6142815169052033, + "learning_rate": 0.00019989117448426108, + "loss": 1.0478, + "step": 56 + }, + { + "epoch": 0.0456, + "grad_norm": 0.5068824296441238, + "learning_rate": 0.0001998787493910712, + "loss": 0.9216, + "step": 57 + }, + { + "epoch": 0.0464, + "grad_norm": 0.5597225389448093, + "learning_rate": 0.00019986565322860115, + "loss": 0.8183, + "step": 58 + }, + { + "epoch": 0.0472, + "grad_norm": 0.5763303749670541, + "learning_rate": 0.000199851886084842, + "loss": 0.945, + "step": 59 + }, + { + "epoch": 0.048, + "grad_norm": 0.5366233300766187, + "learning_rate": 0.00019983744805229296, + "loss": 0.8461, + "step": 60 + }, + { + "epoch": 0.0488, + "grad_norm": 0.6596429498464922, + "learning_rate": 0.00019982233922796085, + "loss": 0.9229, + "step": 61 + }, + { + "epoch": 0.0496, + "grad_norm": 0.5976747715695442, + "learning_rate": 0.00019980655971335945, + "loss": 0.9659, + "step": 62 + }, + { + "epoch": 0.0504, + "grad_norm": 0.5164856598070255, + "learning_rate": 0.00019979010961450878, + "loss": 0.8142, + "step": 63 + }, + { + "epoch": 0.0512, + "grad_norm": 0.5430531234847793, + "learning_rate": 0.00019977298904193437, + "loss": 0.8592, + "step": 64 + }, + { + "epoch": 0.052, + "grad_norm": 0.5091259024865504, + "learning_rate": 0.00019975519811066663, + "loss": 0.8416, + "step": 65 + }, + { + "epoch": 0.0528, + "grad_norm": 0.46998804920242443, + "learning_rate": 0.00019973673694024, + "loss": 0.8344, + "step": 66 + }, + { + "epoch": 0.0536, + "grad_norm": 0.5619349448714933, + "learning_rate": 0.0001997176056546921, + "loss": 0.9138, + "step": 67 + }, + { + "epoch": 0.0544, + "grad_norm": 0.5929775120851979, + "learning_rate": 0.00019969780438256293, + "loss": 0.892, + "step": 68 + }, + { + "epoch": 0.0552, + "grad_norm": 0.552548233857106, + "learning_rate": 0.0001996773332568941, + "loss": 0.9087, + "step": 69 + }, + { + "epoch": 0.056, + "grad_norm": 0.860017864083892, + "learning_rate": 0.0001996561924152278, + "loss": 0.9755, + "step": 70 + }, + { + "epoch": 0.0568, + "grad_norm": 0.4914721627776415, + "learning_rate": 0.00019963438199960599, + "loss": 0.8207, + "step": 71 + }, + { + "epoch": 0.0576, + "grad_norm": 0.4573685117193804, + "learning_rate": 0.0001996119021565693, + "loss": 0.7282, + "step": 72 + }, + { + "epoch": 0.0584, + "grad_norm": 0.5094000388508044, + "learning_rate": 0.00019958875303715615, + "loss": 0.9036, + "step": 73 + }, + { + "epoch": 0.0592, + "grad_norm": 0.7188280395095236, + "learning_rate": 0.0001995649347969019, + "loss": 1.0293, + "step": 74 + }, + { + "epoch": 0.06, + "grad_norm": 0.5685404562203534, + "learning_rate": 0.0001995404475958373, + "loss": 0.9022, + "step": 75 + }, + { + "epoch": 0.0608, + "grad_norm": 0.56915513588708, + "learning_rate": 0.00019951529159848805, + "loss": 0.9357, + "step": 76 + }, + { + "epoch": 0.0616, + "grad_norm": 0.6290734267531265, + "learning_rate": 0.0001994894669738732, + "loss": 0.9204, + "step": 77 + }, + { + "epoch": 0.0624, + "grad_norm": 0.5231699704733803, + "learning_rate": 0.00019946297389550433, + "loss": 0.8465, + "step": 78 + }, + { + "epoch": 0.0632, + "grad_norm": 0.552839164836695, + "learning_rate": 0.0001994358125413841, + "loss": 0.8066, + "step": 79 + }, + { + "epoch": 0.064, + "grad_norm": 0.5533739578556125, + "learning_rate": 0.00019940798309400526, + "loss": 0.8253, + "step": 80 + }, + { + "epoch": 0.0648, + "grad_norm": 0.5441874080378364, + "learning_rate": 0.0001993794857403495, + "loss": 0.8499, + "step": 81 + }, + { + "epoch": 0.0656, + "grad_norm": 0.5783011755944921, + "learning_rate": 0.0001993503206718859, + "loss": 0.8391, + "step": 82 + }, + { + "epoch": 0.0664, + "grad_norm": 0.5227658869368782, + "learning_rate": 0.0001993204880845699, + "loss": 0.8002, + "step": 83 + }, + { + "epoch": 0.0672, + "grad_norm": 0.6353336941951139, + "learning_rate": 0.00019928998817884182, + "loss": 0.952, + "step": 84 + }, + { + "epoch": 0.068, + "grad_norm": 0.4849015113603158, + "learning_rate": 0.00019925882115962568, + "loss": 0.8252, + "step": 85 + }, + { + "epoch": 0.0688, + "grad_norm": 0.5612381543734479, + "learning_rate": 0.00019922698723632767, + "loss": 0.9493, + "step": 86 + }, + { + "epoch": 0.0696, + "grad_norm": 0.49033130992014856, + "learning_rate": 0.00019919448662283478, + "loss": 0.8418, + "step": 87 + }, + { + "epoch": 0.0704, + "grad_norm": 0.6398349798953992, + "learning_rate": 0.00019916131953751342, + "loss": 1.0504, + "step": 88 + }, + { + "epoch": 0.0712, + "grad_norm": 0.5526787869064176, + "learning_rate": 0.00019912748620320794, + "loss": 0.877, + "step": 89 + }, + { + "epoch": 0.072, + "grad_norm": 0.5521455522404153, + "learning_rate": 0.00019909298684723904, + "loss": 0.8892, + "step": 90 + }, + { + "epoch": 0.0728, + "grad_norm": 0.519755619878471, + "learning_rate": 0.00019905782170140238, + "loss": 0.899, + "step": 91 + }, + { + "epoch": 0.0736, + "grad_norm": 0.46629053718261354, + "learning_rate": 0.00019902199100196697, + "loss": 0.7694, + "step": 92 + }, + { + "epoch": 0.0744, + "grad_norm": 0.6171050549681674, + "learning_rate": 0.00019898549498967343, + "loss": 0.9466, + "step": 93 + }, + { + "epoch": 0.0752, + "grad_norm": 0.49479909567521657, + "learning_rate": 0.00019894833390973266, + "loss": 0.8541, + "step": 94 + }, + { + "epoch": 0.076, + "grad_norm": 0.5780972034978296, + "learning_rate": 0.000198910508011824, + "loss": 0.9026, + "step": 95 + }, + { + "epoch": 0.0768, + "grad_norm": 0.580664933003876, + "learning_rate": 0.00019887201755009357, + "loss": 0.9675, + "step": 96 + }, + { + "epoch": 0.0776, + "grad_norm": 0.6179081879462358, + "learning_rate": 0.00019883286278315262, + "loss": 0.8656, + "step": 97 + }, + { + "epoch": 0.0784, + "grad_norm": 0.6391035686987561, + "learning_rate": 0.0001987930439740757, + "loss": 0.9212, + "step": 98 + }, + { + "epoch": 0.0792, + "grad_norm": 0.5877945457338969, + "learning_rate": 0.00019875256139039902, + "loss": 0.9321, + "step": 99 + }, + { + "epoch": 0.08, + "grad_norm": 0.44098932185732936, + "learning_rate": 0.00019871141530411853, + "loss": 0.7673, + "step": 100 + }, + { + "epoch": 0.0808, + "grad_norm": 0.6794514741827338, + "learning_rate": 0.00019866960599168826, + "loss": 0.923, + "step": 101 + }, + { + "epoch": 0.0816, + "grad_norm": 0.5650947115406529, + "learning_rate": 0.0001986271337340182, + "loss": 0.945, + "step": 102 + }, + { + "epoch": 0.0824, + "grad_norm": 0.5554518672223475, + "learning_rate": 0.0001985839988164726, + "loss": 0.895, + "step": 103 + }, + { + "epoch": 0.0832, + "grad_norm": 0.6049279312276732, + "learning_rate": 0.00019854020152886814, + "loss": 0.9053, + "step": 104 + }, + { + "epoch": 0.084, + "grad_norm": 0.7746043824252926, + "learning_rate": 0.00019849574216547171, + "loss": 0.9953, + "step": 105 + }, + { + "epoch": 0.0848, + "grad_norm": 0.6332061215183638, + "learning_rate": 0.0001984506210249986, + "loss": 0.9304, + "step": 106 + }, + { + "epoch": 0.0856, + "grad_norm": 0.5108607839446464, + "learning_rate": 0.00019840483841061058, + "loss": 0.8559, + "step": 107 + }, + { + "epoch": 0.0864, + "grad_norm": 0.6926921479668185, + "learning_rate": 0.00019835839462991361, + "loss": 0.9331, + "step": 108 + }, + { + "epoch": 0.0872, + "grad_norm": 0.6264027420575131, + "learning_rate": 0.00019831128999495606, + "loss": 0.9285, + "step": 109 + }, + { + "epoch": 0.088, + "grad_norm": 0.6330186637950089, + "learning_rate": 0.00019826352482222638, + "loss": 0.791, + "step": 110 + }, + { + "epoch": 0.0888, + "grad_norm": 0.6052528890139297, + "learning_rate": 0.0001982150994326511, + "loss": 0.9436, + "step": 111 + }, + { + "epoch": 0.0896, + "grad_norm": 0.6075773668650208, + "learning_rate": 0.00019816601415159263, + "loss": 0.9489, + "step": 112 + }, + { + "epoch": 0.0904, + "grad_norm": 0.47079173468326774, + "learning_rate": 0.0001981162693088471, + "loss": 0.8329, + "step": 113 + }, + { + "epoch": 0.0912, + "grad_norm": 0.5974572151753968, + "learning_rate": 0.0001980658652386421, + "loss": 0.8849, + "step": 114 + }, + { + "epoch": 0.092, + "grad_norm": 0.6647289755145273, + "learning_rate": 0.0001980148022796345, + "loss": 0.8816, + "step": 115 + }, + { + "epoch": 0.0928, + "grad_norm": 0.5682715106131008, + "learning_rate": 0.00019796308077490817, + "loss": 0.8941, + "step": 116 + }, + { + "epoch": 0.0936, + "grad_norm": 0.482152650595725, + "learning_rate": 0.00019791070107197153, + "loss": 0.8777, + "step": 117 + }, + { + "epoch": 0.0944, + "grad_norm": 0.5165789823618901, + "learning_rate": 0.00019785766352275542, + "loss": 0.941, + "step": 118 + }, + { + "epoch": 0.0952, + "grad_norm": 0.5206375165586081, + "learning_rate": 0.0001978039684836106, + "loss": 0.848, + "step": 119 + }, + { + "epoch": 0.096, + "grad_norm": 0.5473354335363099, + "learning_rate": 0.00019774961631530545, + "loss": 0.8837, + "step": 120 + }, + { + "epoch": 0.0968, + "grad_norm": 0.5867533232230686, + "learning_rate": 0.0001976946073830234, + "loss": 0.8873, + "step": 121 + }, + { + "epoch": 0.0976, + "grad_norm": 0.5004520879149664, + "learning_rate": 0.00019763894205636072, + "loss": 0.9032, + "step": 122 + }, + { + "epoch": 0.0984, + "grad_norm": 0.5660056802785558, + "learning_rate": 0.00019758262070932375, + "loss": 0.8528, + "step": 123 + }, + { + "epoch": 0.0992, + "grad_norm": 0.47198566166403494, + "learning_rate": 0.00019752564372032657, + "loss": 0.8371, + "step": 124 + }, + { + "epoch": 0.1, + "grad_norm": 0.531985413922441, + "learning_rate": 0.00019746801147218842, + "loss": 0.9518, + "step": 125 + }, + { + "epoch": 0.1008, + "grad_norm": 0.6590497176544988, + "learning_rate": 0.00019740972435213115, + "loss": 0.9315, + "step": 126 + }, + { + "epoch": 0.1016, + "grad_norm": 0.5399341149414711, + "learning_rate": 0.00019735078275177654, + "loss": 0.8771, + "step": 127 + }, + { + "epoch": 0.1024, + "grad_norm": 0.7152022612823268, + "learning_rate": 0.00019729118706714375, + "loss": 0.9737, + "step": 128 + }, + { + "epoch": 0.1032, + "grad_norm": 0.5632111210435217, + "learning_rate": 0.00019723093769864663, + "loss": 0.9494, + "step": 129 + }, + { + "epoch": 0.104, + "grad_norm": 0.6494432815425615, + "learning_rate": 0.00019717003505109095, + "loss": 0.9395, + "step": 130 + }, + { + "epoch": 0.1048, + "grad_norm": 0.645820404697124, + "learning_rate": 0.0001971084795336719, + "loss": 1.0063, + "step": 131 + }, + { + "epoch": 0.1056, + "grad_norm": 0.6386019532827905, + "learning_rate": 0.00019704627155997108, + "loss": 0.8572, + "step": 132 + }, + { + "epoch": 0.1064, + "grad_norm": 0.7541722440377103, + "learning_rate": 0.00019698341154795389, + "loss": 0.9837, + "step": 133 + }, + { + "epoch": 0.1072, + "grad_norm": 0.7130713099133933, + "learning_rate": 0.00019691989991996663, + "loss": 0.9495, + "step": 134 + }, + { + "epoch": 0.108, + "grad_norm": 1.0102407806146998, + "learning_rate": 0.00019685573710273376, + "loss": 1.1071, + "step": 135 + }, + { + "epoch": 0.1088, + "grad_norm": 0.6399714005772517, + "learning_rate": 0.0001967909235273549, + "loss": 1.0277, + "step": 136 + }, + { + "epoch": 0.1096, + "grad_norm": 0.5520647074833175, + "learning_rate": 0.00019672545962930215, + "loss": 0.8267, + "step": 137 + }, + { + "epoch": 0.1104, + "grad_norm": 0.6811703160571222, + "learning_rate": 0.00019665934584841682, + "loss": 0.9764, + "step": 138 + }, + { + "epoch": 0.1112, + "grad_norm": 0.4951819776586182, + "learning_rate": 0.00019659258262890683, + "loss": 0.8814, + "step": 139 + }, + { + "epoch": 0.112, + "grad_norm": 0.7252144114525896, + "learning_rate": 0.00019652517041934356, + "loss": 0.8994, + "step": 140 + }, + { + "epoch": 0.1128, + "grad_norm": 0.6231396634246661, + "learning_rate": 0.00019645710967265882, + "loss": 0.9026, + "step": 141 + }, + { + "epoch": 0.1136, + "grad_norm": 0.5406149720059259, + "learning_rate": 0.00019638840084614182, + "loss": 0.8788, + "step": 142 + }, + { + "epoch": 0.1144, + "grad_norm": 0.59254392448039, + "learning_rate": 0.00019631904440143612, + "loss": 0.8767, + "step": 143 + }, + { + "epoch": 0.1152, + "grad_norm": 0.581699642170081, + "learning_rate": 0.00019624904080453655, + "loss": 0.9353, + "step": 144 + }, + { + "epoch": 0.116, + "grad_norm": 0.5105455362571412, + "learning_rate": 0.00019617839052578603, + "loss": 0.903, + "step": 145 + }, + { + "epoch": 0.1168, + "grad_norm": 0.5100626366765849, + "learning_rate": 0.00019610709403987246, + "loss": 0.8864, + "step": 146 + }, + { + "epoch": 0.1176, + "grad_norm": 0.5844083492003826, + "learning_rate": 0.0001960351518258255, + "loss": 0.9265, + "step": 147 + }, + { + "epoch": 0.1184, + "grad_norm": 0.5203118186526728, + "learning_rate": 0.00019596256436701324, + "loss": 0.8405, + "step": 148 + }, + { + "epoch": 0.1192, + "grad_norm": 0.567099504639143, + "learning_rate": 0.00019588933215113926, + "loss": 0.9201, + "step": 149 + }, + { + "epoch": 0.12, + "grad_norm": 0.600935593665811, + "learning_rate": 0.000195815455670239, + "loss": 0.7433, + "step": 150 + }, + { + "epoch": 0.1208, + "grad_norm": 0.6859220255955671, + "learning_rate": 0.00019574093542067673, + "loss": 1.0724, + "step": 151 + }, + { + "epoch": 0.1216, + "grad_norm": 0.7133179044607507, + "learning_rate": 0.00019566577190314197, + "loss": 0.8981, + "step": 152 + }, + { + "epoch": 0.1224, + "grad_norm": 0.4567081431598064, + "learning_rate": 0.0001955899656226464, + "loss": 0.8523, + "step": 153 + }, + { + "epoch": 0.1232, + "grad_norm": 0.6231102612177877, + "learning_rate": 0.0001955135170885202, + "loss": 0.8895, + "step": 154 + }, + { + "epoch": 0.124, + "grad_norm": 0.5222674271560935, + "learning_rate": 0.0001954364268144088, + "loss": 0.8027, + "step": 155 + }, + { + "epoch": 0.1248, + "grad_norm": 0.5662569267762858, + "learning_rate": 0.00019535869531826937, + "loss": 0.8573, + "step": 156 + }, + { + "epoch": 0.1256, + "grad_norm": 0.4781599460417116, + "learning_rate": 0.00019528032312236736, + "loss": 0.8367, + "step": 157 + }, + { + "epoch": 0.1264, + "grad_norm": 0.7033651527431625, + "learning_rate": 0.00019520131075327298, + "loss": 0.9358, + "step": 158 + }, + { + "epoch": 0.1272, + "grad_norm": 0.8097342224747683, + "learning_rate": 0.00019512165874185767, + "loss": 1.0317, + "step": 159 + }, + { + "epoch": 0.128, + "grad_norm": 0.5806969757982609, + "learning_rate": 0.00019504136762329047, + "loss": 0.8729, + "step": 160 + }, + { + "epoch": 0.1288, + "grad_norm": 0.5303743310414841, + "learning_rate": 0.0001949604379370345, + "loss": 0.8312, + "step": 161 + }, + { + "epoch": 0.1296, + "grad_norm": 0.4256547422428516, + "learning_rate": 0.00019487887022684336, + "loss": 0.7859, + "step": 162 + }, + { + "epoch": 0.1304, + "grad_norm": 0.4653065719768864, + "learning_rate": 0.00019479666504075736, + "loss": 0.8421, + "step": 163 + }, + { + "epoch": 0.1312, + "grad_norm": 0.4996515574568142, + "learning_rate": 0.00019471382293110003, + "loss": 0.7827, + "step": 164 + }, + { + "epoch": 0.132, + "grad_norm": 0.5801231538350596, + "learning_rate": 0.0001946303444544741, + "loss": 0.9412, + "step": 165 + }, + { + "epoch": 0.1328, + "grad_norm": 0.7326758199494244, + "learning_rate": 0.00019454623017175812, + "loss": 0.8973, + "step": 166 + }, + { + "epoch": 0.1336, + "grad_norm": 0.5444005275625907, + "learning_rate": 0.00019446148064810242, + "loss": 0.7796, + "step": 167 + }, + { + "epoch": 0.1344, + "grad_norm": 0.5803129383288371, + "learning_rate": 0.00019437609645292546, + "loss": 0.8867, + "step": 168 + }, + { + "epoch": 0.1352, + "grad_norm": 0.5135925841351157, + "learning_rate": 0.00019429007815990993, + "loss": 0.8538, + "step": 169 + }, + { + "epoch": 0.136, + "grad_norm": 0.6383901031960644, + "learning_rate": 0.0001942034263469989, + "loss": 0.7942, + "step": 170 + }, + { + "epoch": 0.1368, + "grad_norm": 0.6122442084128001, + "learning_rate": 0.00019411614159639204, + "loss": 0.9368, + "step": 171 + }, + { + "epoch": 0.1376, + "grad_norm": 0.6087794842053829, + "learning_rate": 0.00019402822449454153, + "loss": 0.8404, + "step": 172 + }, + { + "epoch": 0.1384, + "grad_norm": 0.5564528142757547, + "learning_rate": 0.00019393967563214833, + "loss": 0.8325, + "step": 173 + }, + { + "epoch": 0.1392, + "grad_norm": 0.4850567265061148, + "learning_rate": 0.00019385049560415794, + "loss": 0.8228, + "step": 174 + }, + { + "epoch": 0.14, + "grad_norm": 0.5221162064849709, + "learning_rate": 0.00019376068500975667, + "loss": 0.8094, + "step": 175 + }, + { + "epoch": 0.1408, + "grad_norm": 0.5151874566600034, + "learning_rate": 0.00019367024445236754, + "loss": 0.8858, + "step": 176 + }, + { + "epoch": 0.1416, + "grad_norm": 0.4878709551254769, + "learning_rate": 0.000193579174539646, + "loss": 0.787, + "step": 177 + }, + { + "epoch": 0.1424, + "grad_norm": 0.7233030595933002, + "learning_rate": 0.00019348747588347637, + "loss": 0.9651, + "step": 178 + }, + { + "epoch": 0.1432, + "grad_norm": 0.5130775448030487, + "learning_rate": 0.00019339514909996706, + "loss": 0.8825, + "step": 179 + }, + { + "epoch": 0.144, + "grad_norm": 0.44839345672476477, + "learning_rate": 0.00019330219480944694, + "loss": 0.8117, + "step": 180 + }, + { + "epoch": 0.1448, + "grad_norm": 0.5883691759941093, + "learning_rate": 0.00019320861363646095, + "loss": 0.8219, + "step": 181 + }, + { + "epoch": 0.1456, + "grad_norm": 0.5423548373179294, + "learning_rate": 0.00019311440620976597, + "loss": 0.896, + "step": 182 + }, + { + "epoch": 0.1464, + "grad_norm": 0.5423744889933809, + "learning_rate": 0.00019301957316232658, + "loss": 0.9858, + "step": 183 + }, + { + "epoch": 0.1472, + "grad_norm": 0.5418330307570707, + "learning_rate": 0.0001929241151313108, + "loss": 0.8639, + "step": 184 + }, + { + "epoch": 0.148, + "grad_norm": 0.6298302748895126, + "learning_rate": 0.0001928280327580858, + "loss": 0.8705, + "step": 185 + }, + { + "epoch": 0.1488, + "grad_norm": 0.49816030141794515, + "learning_rate": 0.00019273132668821364, + "loss": 0.804, + "step": 186 + }, + { + "epoch": 0.1496, + "grad_norm": 0.5939853111214843, + "learning_rate": 0.00019263399757144683, + "loss": 0.9174, + "step": 187 + }, + { + "epoch": 0.1504, + "grad_norm": 0.48692067227331565, + "learning_rate": 0.00019253604606172417, + "loss": 0.8227, + "step": 188 + }, + { + "epoch": 0.1512, + "grad_norm": 0.5617378319201108, + "learning_rate": 0.000192437472817166, + "loss": 0.8516, + "step": 189 + }, + { + "epoch": 0.152, + "grad_norm": 0.5479004944647015, + "learning_rate": 0.00019233827850007027, + "loss": 0.8182, + "step": 190 + }, + { + "epoch": 0.1528, + "grad_norm": 0.5583539591117561, + "learning_rate": 0.00019223846377690754, + "loss": 0.8833, + "step": 191 + }, + { + "epoch": 0.1536, + "grad_norm": 0.628958491801015, + "learning_rate": 0.00019213802931831696, + "loss": 0.967, + "step": 192 + }, + { + "epoch": 0.1544, + "grad_norm": 0.527302060387629, + "learning_rate": 0.00019203697579910154, + "loss": 0.8472, + "step": 193 + }, + { + "epoch": 0.1552, + "grad_norm": 0.6949486987354203, + "learning_rate": 0.00019193530389822363, + "loss": 0.9665, + "step": 194 + }, + { + "epoch": 0.156, + "grad_norm": 0.459939602241374, + "learning_rate": 0.00019183301429880043, + "loss": 0.7723, + "step": 195 + }, + { + "epoch": 0.1568, + "grad_norm": 0.48937827881685525, + "learning_rate": 0.00019173010768809933, + "loss": 0.751, + "step": 196 + }, + { + "epoch": 0.1576, + "grad_norm": 0.5767992998755989, + "learning_rate": 0.00019162658475753327, + "loss": 0.84, + "step": 197 + }, + { + "epoch": 0.1584, + "grad_norm": 0.4694125748671859, + "learning_rate": 0.0001915224462026563, + "loss": 0.8372, + "step": 198 + }, + { + "epoch": 0.1592, + "grad_norm": 0.6001006685725776, + "learning_rate": 0.00019141769272315858, + "loss": 0.9252, + "step": 199 + }, + { + "epoch": 0.16, + "grad_norm": 0.5144273620084883, + "learning_rate": 0.00019131232502286188, + "loss": 0.9258, + "step": 200 + }, + { + "epoch": 0.1608, + "grad_norm": 0.43796251272775827, + "learning_rate": 0.00019120634380971496, + "loss": 0.7584, + "step": 201 + }, + { + "epoch": 0.1616, + "grad_norm": 0.5936578788134119, + "learning_rate": 0.0001910997497957885, + "loss": 0.9065, + "step": 202 + }, + { + "epoch": 0.1624, + "grad_norm": 0.5598625984686376, + "learning_rate": 0.0001909925436972706, + "loss": 0.8307, + "step": 203 + }, + { + "epoch": 0.1632, + "grad_norm": 0.5443220379494453, + "learning_rate": 0.00019088472623446183, + "loss": 0.9247, + "step": 204 + }, + { + "epoch": 0.164, + "grad_norm": 0.8184433595636279, + "learning_rate": 0.00019077629813177036, + "loss": 1.0944, + "step": 205 + }, + { + "epoch": 0.1648, + "grad_norm": 0.6500568742123364, + "learning_rate": 0.00019066726011770726, + "loss": 0.9583, + "step": 206 + }, + { + "epoch": 0.1656, + "grad_norm": 0.5160545784531383, + "learning_rate": 0.00019055761292488142, + "loss": 0.8555, + "step": 207 + }, + { + "epoch": 0.1664, + "grad_norm": 0.5811791862451305, + "learning_rate": 0.0001904473572899947, + "loss": 0.921, + "step": 208 + }, + { + "epoch": 0.1672, + "grad_norm": 0.5369674097145263, + "learning_rate": 0.00019033649395383702, + "loss": 0.8913, + "step": 209 + }, + { + "epoch": 0.168, + "grad_norm": 0.5059613450805891, + "learning_rate": 0.00019022502366128135, + "loss": 0.7337, + "step": 210 + }, + { + "epoch": 0.1688, + "grad_norm": 0.6116792895453856, + "learning_rate": 0.00019011294716127867, + "loss": 0.8973, + "step": 211 + }, + { + "epoch": 0.1696, + "grad_norm": 0.6389591277804562, + "learning_rate": 0.00019000026520685302, + "loss": 0.9158, + "step": 212 + }, + { + "epoch": 0.1704, + "grad_norm": 0.4871039962986681, + "learning_rate": 0.0001898869785550963, + "loss": 0.8081, + "step": 213 + }, + { + "epoch": 0.1712, + "grad_norm": 0.49726856016557125, + "learning_rate": 0.0001897730879671634, + "loss": 0.8494, + "step": 214 + }, + { + "epoch": 0.172, + "grad_norm": 0.47202207333137924, + "learning_rate": 0.00018965859420826684, + "loss": 0.7875, + "step": 215 + }, + { + "epoch": 0.1728, + "grad_norm": 0.5000434141921354, + "learning_rate": 0.00018954349804767184, + "loss": 0.8727, + "step": 216 + }, + { + "epoch": 0.1736, + "grad_norm": 0.6404336077945951, + "learning_rate": 0.00018942780025869098, + "loss": 0.9935, + "step": 217 + }, + { + "epoch": 0.1744, + "grad_norm": 0.7932723905343942, + "learning_rate": 0.00018931150161867916, + "loss": 0.8474, + "step": 218 + }, + { + "epoch": 0.1752, + "grad_norm": 0.5249879319383836, + "learning_rate": 0.00018919460290902826, + "loss": 0.8568, + "step": 219 + }, + { + "epoch": 0.176, + "grad_norm": 0.6846471757505506, + "learning_rate": 0.00018907710491516199, + "loss": 0.9763, + "step": 220 + }, + { + "epoch": 0.1768, + "grad_norm": 0.6000463340587131, + "learning_rate": 0.0001889590084265304, + "loss": 0.8808, + "step": 221 + }, + { + "epoch": 0.1776, + "grad_norm": 0.7270437314662276, + "learning_rate": 0.0001888403142366049, + "loss": 0.9453, + "step": 222 + }, + { + "epoch": 0.1784, + "grad_norm": 0.7129826804040741, + "learning_rate": 0.0001887210231428727, + "loss": 0.8448, + "step": 223 + }, + { + "epoch": 0.1792, + "grad_norm": 0.5044831020418066, + "learning_rate": 0.00018860113594683148, + "loss": 0.8267, + "step": 224 + }, + { + "epoch": 0.18, + "grad_norm": 0.5375845762581131, + "learning_rate": 0.0001884806534539841, + "loss": 0.9102, + "step": 225 + }, + { + "epoch": 0.1808, + "grad_norm": 0.4715329765658659, + "learning_rate": 0.00018835957647383303, + "loss": 0.8088, + "step": 226 + }, + { + "epoch": 0.1816, + "grad_norm": 0.603833247123497, + "learning_rate": 0.0001882379058198751, + "loss": 0.8914, + "step": 227 + }, + { + "epoch": 0.1824, + "grad_norm": 0.6322228261977552, + "learning_rate": 0.00018811564230959588, + "loss": 0.8906, + "step": 228 + }, + { + "epoch": 0.1832, + "grad_norm": 0.6331290721160587, + "learning_rate": 0.00018799278676446423, + "loss": 0.8661, + "step": 229 + }, + { + "epoch": 0.184, + "grad_norm": 0.5569009553200088, + "learning_rate": 0.00018786934000992688, + "loss": 0.8732, + "step": 230 + }, + { + "epoch": 0.1848, + "grad_norm": 0.6660678270910944, + "learning_rate": 0.00018774530287540278, + "loss": 0.7198, + "step": 231 + }, + { + "epoch": 0.1856, + "grad_norm": 0.6353279140108365, + "learning_rate": 0.00018762067619427746, + "loss": 0.8603, + "step": 232 + }, + { + "epoch": 0.1864, + "grad_norm": 0.46124480588647165, + "learning_rate": 0.00018749546080389757, + "loss": 0.727, + "step": 233 + }, + { + "epoch": 0.1872, + "grad_norm": 0.5928128178155065, + "learning_rate": 0.00018736965754556528, + "loss": 0.8627, + "step": 234 + }, + { + "epoch": 0.188, + "grad_norm": 0.5546350490698052, + "learning_rate": 0.00018724326726453244, + "loss": 0.9084, + "step": 235 + }, + { + "epoch": 0.1888, + "grad_norm": 0.7639344549876476, + "learning_rate": 0.00018711629080999504, + "loss": 1.0281, + "step": 236 + }, + { + "epoch": 0.1896, + "grad_norm": 0.6056069204561181, + "learning_rate": 0.00018698872903508755, + "loss": 0.952, + "step": 237 + }, + { + "epoch": 0.1904, + "grad_norm": 0.447404951110955, + "learning_rate": 0.00018686058279687698, + "loss": 0.7797, + "step": 238 + }, + { + "epoch": 0.1912, + "grad_norm": 0.4505850429200494, + "learning_rate": 0.0001867318529563574, + "loss": 0.7605, + "step": 239 + }, + { + "epoch": 0.192, + "grad_norm": 0.528153338217792, + "learning_rate": 0.00018660254037844388, + "loss": 0.8871, + "step": 240 + }, + { + "epoch": 0.1928, + "grad_norm": 0.4960386278474408, + "learning_rate": 0.00018647264593196688, + "loss": 0.8679, + "step": 241 + }, + { + "epoch": 0.1936, + "grad_norm": 1.0266285355191271, + "learning_rate": 0.00018634217048966637, + "loss": 0.8459, + "step": 242 + }, + { + "epoch": 0.1944, + "grad_norm": 0.5261590632654706, + "learning_rate": 0.00018621111492818585, + "loss": 0.8645, + "step": 243 + }, + { + "epoch": 0.1952, + "grad_norm": 0.5402321028707199, + "learning_rate": 0.0001860794801280666, + "loss": 0.8604, + "step": 244 + }, + { + "epoch": 0.196, + "grad_norm": 0.4688281180091872, + "learning_rate": 0.00018594726697374175, + "loss": 0.8419, + "step": 245 + }, + { + "epoch": 0.1968, + "grad_norm": 0.47272540628058707, + "learning_rate": 0.0001858144763535302, + "loss": 0.7914, + "step": 246 + }, + { + "epoch": 0.1976, + "grad_norm": 0.6073714592230718, + "learning_rate": 0.0001856811091596308, + "loss": 0.8612, + "step": 247 + }, + { + "epoch": 0.1984, + "grad_norm": 0.40992042568299897, + "learning_rate": 0.0001855471662881164, + "loss": 0.7814, + "step": 248 + }, + { + "epoch": 0.1992, + "grad_norm": 0.5171420488179562, + "learning_rate": 0.00018541264863892754, + "loss": 0.8596, + "step": 249 + }, + { + "epoch": 0.2, + "grad_norm": 0.5810998337097157, + "learning_rate": 0.00018527755711586678, + "loss": 0.9191, + "step": 250 + }, + { + "epoch": 0.2008, + "grad_norm": 0.5016026004732917, + "learning_rate": 0.00018514189262659235, + "loss": 0.7857, + "step": 251 + }, + { + "epoch": 0.2016, + "grad_norm": 0.5290998464684912, + "learning_rate": 0.00018500565608261214, + "loss": 0.8011, + "step": 252 + }, + { + "epoch": 0.2024, + "grad_norm": 0.47571676983361944, + "learning_rate": 0.00018486884839927768, + "loss": 0.7898, + "step": 253 + }, + { + "epoch": 0.2032, + "grad_norm": 0.5693979765389368, + "learning_rate": 0.00018473147049577774, + "loss": 0.9489, + "step": 254 + }, + { + "epoch": 0.204, + "grad_norm": 0.49758148747256714, + "learning_rate": 0.0001845935232951325, + "loss": 0.7966, + "step": 255 + }, + { + "epoch": 0.2048, + "grad_norm": 0.6590744659282638, + "learning_rate": 0.00018445500772418697, + "loss": 1.001, + "step": 256 + }, + { + "epoch": 0.2056, + "grad_norm": 0.6070782655473008, + "learning_rate": 0.00018431592471360503, + "loss": 0.9436, + "step": 257 + }, + { + "epoch": 0.2064, + "grad_norm": 0.4765173100361963, + "learning_rate": 0.00018417627519786315, + "loss": 0.7486, + "step": 258 + }, + { + "epoch": 0.2072, + "grad_norm": 0.5831060410544467, + "learning_rate": 0.000184036060115244, + "loss": 0.8634, + "step": 259 + }, + { + "epoch": 0.208, + "grad_norm": 0.48581764512842424, + "learning_rate": 0.00018389528040783012, + "loss": 0.8026, + "step": 260 + }, + { + "epoch": 0.2088, + "grad_norm": 0.5695229798504049, + "learning_rate": 0.00018375393702149787, + "loss": 0.8629, + "step": 261 + }, + { + "epoch": 0.2096, + "grad_norm": 0.6269406052489828, + "learning_rate": 0.00018361203090591071, + "loss": 0.9387, + "step": 262 + }, + { + "epoch": 0.2104, + "grad_norm": 0.5365713230097443, + "learning_rate": 0.00018346956301451304, + "loss": 0.8277, + "step": 263 + }, + { + "epoch": 0.2112, + "grad_norm": 0.5428528320354593, + "learning_rate": 0.00018332653430452376, + "loss": 0.9152, + "step": 264 + }, + { + "epoch": 0.212, + "grad_norm": 0.7005906897335021, + "learning_rate": 0.00018318294573692985, + "loss": 1.0014, + "step": 265 + }, + { + "epoch": 0.2128, + "grad_norm": 0.6246304282232839, + "learning_rate": 0.00018303879827647975, + "loss": 0.8466, + "step": 266 + }, + { + "epoch": 0.2136, + "grad_norm": 0.5813641119949418, + "learning_rate": 0.0001828940928916772, + "loss": 0.9221, + "step": 267 + }, + { + "epoch": 0.2144, + "grad_norm": 0.5936386939998907, + "learning_rate": 0.00018274883055477436, + "loss": 0.8933, + "step": 268 + }, + { + "epoch": 0.2152, + "grad_norm": 0.6682132049959943, + "learning_rate": 0.00018260301224176558, + "loss": 1.012, + "step": 269 + }, + { + "epoch": 0.216, + "grad_norm": 0.5003392083890684, + "learning_rate": 0.00018245663893238075, + "loss": 0.8073, + "step": 270 + }, + { + "epoch": 0.2168, + "grad_norm": 0.4823222570626458, + "learning_rate": 0.00018230971161007853, + "loss": 0.8353, + "step": 271 + }, + { + "epoch": 0.2176, + "grad_norm": 0.4836212855322634, + "learning_rate": 0.00018216223126204007, + "loss": 0.6703, + "step": 272 + }, + { + "epoch": 0.2184, + "grad_norm": 0.570778634672041, + "learning_rate": 0.00018201419887916214, + "loss": 0.9101, + "step": 273 + }, + { + "epoch": 0.2192, + "grad_norm": 0.7716364793392765, + "learning_rate": 0.00018186561545605054, + "loss": 1.0361, + "step": 274 + }, + { + "epoch": 0.22, + "grad_norm": 0.5747014332332541, + "learning_rate": 0.00018171648199101346, + "loss": 0.8027, + "step": 275 + }, + { + "epoch": 0.2208, + "grad_norm": 0.459727165390823, + "learning_rate": 0.00018156679948605467, + "loss": 0.7691, + "step": 276 + }, + { + "epoch": 0.2216, + "grad_norm": 0.5631044109219594, + "learning_rate": 0.00018141656894686689, + "loss": 0.8622, + "step": 277 + }, + { + "epoch": 0.2224, + "grad_norm": 0.6170736779394916, + "learning_rate": 0.00018126579138282503, + "loss": 0.9833, + "step": 278 + }, + { + "epoch": 0.2232, + "grad_norm": 0.5320835110335193, + "learning_rate": 0.00018111446780697929, + "loss": 0.8726, + "step": 279 + }, + { + "epoch": 0.224, + "grad_norm": 0.7106108656925253, + "learning_rate": 0.0001809625992360485, + "loss": 0.8326, + "step": 280 + }, + { + "epoch": 0.2248, + "grad_norm": 0.5699051480731491, + "learning_rate": 0.00018081018669041324, + "loss": 0.9784, + "step": 281 + }, + { + "epoch": 0.2256, + "grad_norm": 0.565038805973298, + "learning_rate": 0.00018065723119410884, + "loss": 0.7859, + "step": 282 + }, + { + "epoch": 0.2264, + "grad_norm": 0.5309404108508301, + "learning_rate": 0.00018050373377481878, + "loss": 0.8548, + "step": 283 + }, + { + "epoch": 0.2272, + "grad_norm": 0.49634103976736577, + "learning_rate": 0.00018034969546386757, + "loss": 0.7807, + "step": 284 + }, + { + "epoch": 0.228, + "grad_norm": 0.4211605885290296, + "learning_rate": 0.0001801951172962139, + "loss": 0.7503, + "step": 285 + }, + { + "epoch": 0.2288, + "grad_norm": 0.5065282499152559, + "learning_rate": 0.0001800400003104436, + "loss": 0.8007, + "step": 286 + }, + { + "epoch": 0.2296, + "grad_norm": 0.5023885538954765, + "learning_rate": 0.0001798843455487629, + "loss": 0.848, + "step": 287 + }, + { + "epoch": 0.2304, + "grad_norm": 0.5619787151600203, + "learning_rate": 0.00017972815405699103, + "loss": 0.9279, + "step": 288 + }, + { + "epoch": 0.2312, + "grad_norm": 0.6090409508083451, + "learning_rate": 0.00017957142688455362, + "loss": 0.882, + "step": 289 + }, + { + "epoch": 0.232, + "grad_norm": 0.5815566266550105, + "learning_rate": 0.00017941416508447536, + "loss": 0.9173, + "step": 290 + }, + { + "epoch": 0.2328, + "grad_norm": 0.5107579093746113, + "learning_rate": 0.00017925636971337304, + "loss": 0.8693, + "step": 291 + }, + { + "epoch": 0.2336, + "grad_norm": 0.5525656058505722, + "learning_rate": 0.0001790980418314484, + "loss": 0.9301, + "step": 292 + }, + { + "epoch": 0.2344, + "grad_norm": 0.5387814059324538, + "learning_rate": 0.00017893918250248104, + "loss": 0.9038, + "step": 293 + }, + { + "epoch": 0.2352, + "grad_norm": 0.5241245952403185, + "learning_rate": 0.00017877979279382135, + "loss": 0.8477, + "step": 294 + }, + { + "epoch": 0.236, + "grad_norm": 0.5922467143505036, + "learning_rate": 0.00017861987377638312, + "loss": 0.826, + "step": 295 + }, + { + "epoch": 0.2368, + "grad_norm": 0.4164260795172474, + "learning_rate": 0.0001784594265246366, + "loss": 0.7151, + "step": 296 + }, + { + "epoch": 0.2376, + "grad_norm": 0.6322872381363032, + "learning_rate": 0.0001782984521166011, + "loss": 0.9421, + "step": 297 + }, + { + "epoch": 0.2384, + "grad_norm": 0.6822273627206625, + "learning_rate": 0.0001781369516338378, + "loss": 1.056, + "step": 298 + }, + { + "epoch": 0.2392, + "grad_norm": 0.5456287789046688, + "learning_rate": 0.00017797492616144256, + "loss": 0.8608, + "step": 299 + }, + { + "epoch": 0.24, + "grad_norm": 0.6332916232764236, + "learning_rate": 0.00017781237678803847, + "loss": 0.8991, + "step": 300 + }, + { + "epoch": 0.2408, + "grad_norm": 0.48183332292100656, + "learning_rate": 0.00017764930460576866, + "loss": 0.7591, + "step": 301 + }, + { + "epoch": 0.2416, + "grad_norm": 0.5164293419792554, + "learning_rate": 0.000177485710710289, + "loss": 0.7607, + "step": 302 + }, + { + "epoch": 0.2424, + "grad_norm": 0.6776115879178484, + "learning_rate": 0.00017732159620076053, + "loss": 0.9309, + "step": 303 + }, + { + "epoch": 0.2432, + "grad_norm": 0.41893179244042816, + "learning_rate": 0.00017715696217984235, + "loss": 0.8024, + "step": 304 + }, + { + "epoch": 0.244, + "grad_norm": 0.4524838840827557, + "learning_rate": 0.00017699180975368396, + "loss": 0.7879, + "step": 305 + }, + { + "epoch": 0.2448, + "grad_norm": 0.504512145895528, + "learning_rate": 0.00017682614003191807, + "loss": 0.7544, + "step": 306 + }, + { + "epoch": 0.2456, + "grad_norm": 0.5604924118321215, + "learning_rate": 0.00017665995412765285, + "loss": 0.8203, + "step": 307 + }, + { + "epoch": 0.2464, + "grad_norm": 0.5185653394154404, + "learning_rate": 0.00017649325315746478, + "loss": 0.7878, + "step": 308 + }, + { + "epoch": 0.2472, + "grad_norm": 0.7509584334458808, + "learning_rate": 0.00017632603824139085, + "loss": 1.1157, + "step": 309 + }, + { + "epoch": 0.248, + "grad_norm": 0.45356992265873464, + "learning_rate": 0.0001761583105029213, + "loss": 0.744, + "step": 310 + }, + { + "epoch": 0.2488, + "grad_norm": 0.6062594546758928, + "learning_rate": 0.0001759900710689918, + "loss": 0.9007, + "step": 311 + }, + { + "epoch": 0.2496, + "grad_norm": 0.459860351968077, + "learning_rate": 0.00017582132106997616, + "loss": 0.7948, + "step": 312 + }, + { + "epoch": 0.2504, + "grad_norm": 0.4639688266152528, + "learning_rate": 0.00017565206163967846, + "loss": 0.7786, + "step": 313 + }, + { + "epoch": 0.2512, + "grad_norm": 0.4997818630302224, + "learning_rate": 0.00017548229391532572, + "loss": 0.8903, + "step": 314 + }, + { + "epoch": 0.252, + "grad_norm": 0.6224593209313116, + "learning_rate": 0.00017531201903755994, + "loss": 0.8202, + "step": 315 + }, + { + "epoch": 0.2528, + "grad_norm": 0.4552829264775281, + "learning_rate": 0.00017514123815043074, + "loss": 0.8474, + "step": 316 + }, + { + "epoch": 0.2536, + "grad_norm": 0.5088192201038029, + "learning_rate": 0.00017496995240138744, + "loss": 0.7795, + "step": 317 + }, + { + "epoch": 0.2544, + "grad_norm": 0.49495732607506265, + "learning_rate": 0.00017479816294127152, + "loss": 0.807, + "step": 318 + }, + { + "epoch": 0.2552, + "grad_norm": 0.5428676966586734, + "learning_rate": 0.00017462587092430875, + "loss": 0.9178, + "step": 319 + }, + { + "epoch": 0.256, + "grad_norm": 0.518535517881662, + "learning_rate": 0.0001744530775081015, + "loss": 0.7665, + "step": 320 + }, + { + "epoch": 0.2568, + "grad_norm": 0.6454046836263292, + "learning_rate": 0.00017427978385362112, + "loss": 0.9844, + "step": 321 + }, + { + "epoch": 0.2576, + "grad_norm": 0.47639684324202214, + "learning_rate": 0.0001741059911251997, + "loss": 0.8325, + "step": 322 + }, + { + "epoch": 0.2584, + "grad_norm": 0.5120781918332273, + "learning_rate": 0.0001739317004905227, + "loss": 0.8637, + "step": 323 + }, + { + "epoch": 0.2592, + "grad_norm": 0.5079393333553627, + "learning_rate": 0.000173756913120621, + "loss": 0.8289, + "step": 324 + }, + { + "epoch": 0.26, + "grad_norm": 0.4542924014395214, + "learning_rate": 0.00017358163018986282, + "loss": 0.8024, + "step": 325 + }, + { + "epoch": 0.2608, + "grad_norm": 0.578089965059899, + "learning_rate": 0.00017340585287594604, + "loss": 0.8754, + "step": 326 + }, + { + "epoch": 0.2616, + "grad_norm": 0.5061039518358071, + "learning_rate": 0.00017322958235989016, + "loss": 0.8667, + "step": 327 + }, + { + "epoch": 0.2624, + "grad_norm": 0.5103291942577219, + "learning_rate": 0.0001730528198260285, + "loss": 0.7871, + "step": 328 + }, + { + "epoch": 0.2632, + "grad_norm": 0.5008286477772609, + "learning_rate": 0.00017287556646200018, + "loss": 0.8479, + "step": 329 + }, + { + "epoch": 0.264, + "grad_norm": 0.5921120659918616, + "learning_rate": 0.00017269782345874203, + "loss": 0.8689, + "step": 330 + }, + { + "epoch": 0.2648, + "grad_norm": 0.5371446072013113, + "learning_rate": 0.00017251959201048083, + "loss": 0.9321, + "step": 331 + }, + { + "epoch": 0.2656, + "grad_norm": 0.8025587927358921, + "learning_rate": 0.00017234087331472497, + "loss": 1.092, + "step": 332 + }, + { + "epoch": 0.2664, + "grad_norm": 0.5290978362007965, + "learning_rate": 0.00017216166857225674, + "loss": 0.8358, + "step": 333 + }, + { + "epoch": 0.2672, + "grad_norm": 0.4750956587014473, + "learning_rate": 0.00017198197898712404, + "loss": 0.8325, + "step": 334 + }, + { + "epoch": 0.268, + "grad_norm": 0.5429519797737313, + "learning_rate": 0.00017180180576663228, + "loss": 0.7595, + "step": 335 + }, + { + "epoch": 0.2688, + "grad_norm": 0.5443289856538186, + "learning_rate": 0.00017162115012133643, + "loss": 0.862, + "step": 336 + }, + { + "epoch": 0.2696, + "grad_norm": 0.477175604369523, + "learning_rate": 0.00017144001326503273, + "loss": 0.7886, + "step": 337 + }, + { + "epoch": 0.2704, + "grad_norm": 0.4567904137938678, + "learning_rate": 0.00017125839641475072, + "loss": 0.7842, + "step": 338 + }, + { + "epoch": 0.2712, + "grad_norm": 0.5370099874742391, + "learning_rate": 0.00017107630079074478, + "loss": 0.7789, + "step": 339 + }, + { + "epoch": 0.272, + "grad_norm": 0.6116928652479303, + "learning_rate": 0.00017089372761648616, + "loss": 0.7751, + "step": 340 + }, + { + "epoch": 0.2728, + "grad_norm": 0.6534658855619526, + "learning_rate": 0.00017071067811865476, + "loss": 0.9485, + "step": 341 + }, + { + "epoch": 0.2736, + "grad_norm": 0.5267548915654874, + "learning_rate": 0.00017052715352713075, + "loss": 0.7644, + "step": 342 + }, + { + "epoch": 0.2744, + "grad_norm": 0.546158562900475, + "learning_rate": 0.00017034315507498635, + "loss": 0.8842, + "step": 343 + }, + { + "epoch": 0.2752, + "grad_norm": 0.49734406840969075, + "learning_rate": 0.00017015868399847768, + "loss": 0.8491, + "step": 344 + }, + { + "epoch": 0.276, + "grad_norm": 0.503221592285957, + "learning_rate": 0.00016997374153703625, + "loss": 0.7329, + "step": 345 + }, + { + "epoch": 0.2768, + "grad_norm": 0.6020286777353191, + "learning_rate": 0.00016978832893326074, + "loss": 0.8598, + "step": 346 + }, + { + "epoch": 0.2776, + "grad_norm": 0.6425221403272974, + "learning_rate": 0.00016960244743290868, + "loss": 0.9093, + "step": 347 + }, + { + "epoch": 0.2784, + "grad_norm": 0.4992867409454385, + "learning_rate": 0.00016941609828488807, + "loss": 0.8603, + "step": 348 + }, + { + "epoch": 0.2792, + "grad_norm": 0.4975241441241856, + "learning_rate": 0.00016922928274124886, + "loss": 0.8822, + "step": 349 + }, + { + "epoch": 0.28, + "grad_norm": 0.5445502485433601, + "learning_rate": 0.0001690420020571747, + "loss": 0.9137, + "step": 350 + }, + { + "epoch": 0.2808, + "grad_norm": 0.4500183589656974, + "learning_rate": 0.00016885425749097444, + "loss": 0.7316, + "step": 351 + }, + { + "epoch": 0.2816, + "grad_norm": 0.46361537222619187, + "learning_rate": 0.0001686660503040737, + "loss": 0.7427, + "step": 352 + }, + { + "epoch": 0.2824, + "grad_norm": 0.5617104806372842, + "learning_rate": 0.00016847738176100632, + "loss": 0.9108, + "step": 353 + }, + { + "epoch": 0.2832, + "grad_norm": 0.5059886172049418, + "learning_rate": 0.00016828825312940592, + "loss": 0.8165, + "step": 354 + }, + { + "epoch": 0.284, + "grad_norm": 0.45585084441524354, + "learning_rate": 0.0001680986656799975, + "loss": 0.8853, + "step": 355 + }, + { + "epoch": 0.2848, + "grad_norm": 0.6680443221863649, + "learning_rate": 0.0001679086206865886, + "loss": 0.9626, + "step": 356 + }, + { + "epoch": 0.2856, + "grad_norm": 0.5145275675086313, + "learning_rate": 0.00016771811942606108, + "loss": 0.8972, + "step": 357 + }, + { + "epoch": 0.2864, + "grad_norm": 0.4111239629894829, + "learning_rate": 0.00016752716317836229, + "loss": 0.6526, + "step": 358 + }, + { + "epoch": 0.2872, + "grad_norm": 0.4511587296844328, + "learning_rate": 0.00016733575322649657, + "loss": 0.7845, + "step": 359 + }, + { + "epoch": 0.288, + "grad_norm": 0.42424116750708446, + "learning_rate": 0.0001671438908565167, + "loss": 0.765, + "step": 360 + }, + { + "epoch": 0.2888, + "grad_norm": 0.4912814335522475, + "learning_rate": 0.00016695157735751513, + "loss": 0.769, + "step": 361 + }, + { + "epoch": 0.2896, + "grad_norm": 0.3983467674115582, + "learning_rate": 0.00016675881402161536, + "loss": 0.7489, + "step": 362 + }, + { + "epoch": 0.2904, + "grad_norm": 0.5625653119972895, + "learning_rate": 0.0001665656021439633, + "loss": 0.7935, + "step": 363 + }, + { + "epoch": 0.2912, + "grad_norm": 0.5552548657685277, + "learning_rate": 0.0001663719430227186, + "loss": 0.8309, + "step": 364 + }, + { + "epoch": 0.292, + "grad_norm": 0.4300047148160572, + "learning_rate": 0.00016617783795904565, + "loss": 0.7869, + "step": 365 + }, + { + "epoch": 0.2928, + "grad_norm": 0.5817896749539205, + "learning_rate": 0.00016598328825710533, + "loss": 0.8336, + "step": 366 + }, + { + "epoch": 0.2936, + "grad_norm": 0.7112047804693892, + "learning_rate": 0.00016578829522404583, + "loss": 0.9238, + "step": 367 + }, + { + "epoch": 0.2944, + "grad_norm": 0.5150828670492442, + "learning_rate": 0.000165592860169994, + "loss": 0.8975, + "step": 368 + }, + { + "epoch": 0.2952, + "grad_norm": 0.4782176505007454, + "learning_rate": 0.00016539698440804661, + "loss": 0.768, + "step": 369 + }, + { + "epoch": 0.296, + "grad_norm": 0.4916380083078038, + "learning_rate": 0.00016520066925426144, + "loss": 0.7919, + "step": 370 + }, + { + "epoch": 0.2968, + "grad_norm": 0.4919559785281054, + "learning_rate": 0.0001650039160276485, + "loss": 0.7459, + "step": 371 + }, + { + "epoch": 0.2976, + "grad_norm": 0.428628949999772, + "learning_rate": 0.0001648067260501611, + "loss": 0.7457, + "step": 372 + }, + { + "epoch": 0.2984, + "grad_norm": 0.563634981100943, + "learning_rate": 0.0001646091006466871, + "loss": 0.8152, + "step": 373 + }, + { + "epoch": 0.2992, + "grad_norm": 0.5402069948290369, + "learning_rate": 0.0001644110411450398, + "loss": 0.8678, + "step": 374 + }, + { + "epoch": 0.3, + "grad_norm": 0.4399607273299524, + "learning_rate": 0.00016421254887594917, + "loss": 0.7285, + "step": 375 + }, + { + "epoch": 0.3008, + "grad_norm": 0.6242587539736574, + "learning_rate": 0.00016401362517305296, + "loss": 0.9364, + "step": 376 + }, + { + "epoch": 0.3016, + "grad_norm": 0.562721845396249, + "learning_rate": 0.00016381427137288754, + "loss": 0.8331, + "step": 377 + }, + { + "epoch": 0.3024, + "grad_norm": 0.5214076502423438, + "learning_rate": 0.00016361448881487914, + "loss": 0.8743, + "step": 378 + }, + { + "epoch": 0.3032, + "grad_norm": 0.476645637993095, + "learning_rate": 0.0001634142788413346, + "loss": 0.7904, + "step": 379 + }, + { + "epoch": 0.304, + "grad_norm": 0.5193954849687584, + "learning_rate": 0.00016321364279743266, + "loss": 0.8551, + "step": 380 + }, + { + "epoch": 0.3048, + "grad_norm": 0.5462745527959467, + "learning_rate": 0.00016301258203121462, + "loss": 0.9005, + "step": 381 + }, + { + "epoch": 0.3056, + "grad_norm": 0.42695216471586, + "learning_rate": 0.0001628110978935756, + "loss": 0.726, + "step": 382 + }, + { + "epoch": 0.3064, + "grad_norm": 0.5022640697248626, + "learning_rate": 0.00016260919173825508, + "loss": 0.7257, + "step": 383 + }, + { + "epoch": 0.3072, + "grad_norm": 0.4812952467049974, + "learning_rate": 0.00016240686492182804, + "loss": 0.8171, + "step": 384 + }, + { + "epoch": 0.308, + "grad_norm": 0.42488386188997473, + "learning_rate": 0.00016220411880369601, + "loss": 0.7608, + "step": 385 + }, + { + "epoch": 0.3088, + "grad_norm": 0.5808555818663869, + "learning_rate": 0.00016200095474607753, + "loss": 0.8605, + "step": 386 + }, + { + "epoch": 0.3096, + "grad_norm": 0.5665458402725879, + "learning_rate": 0.00016179737411399926, + "loss": 0.9286, + "step": 387 + }, + { + "epoch": 0.3104, + "grad_norm": 0.48113860245205914, + "learning_rate": 0.00016159337827528685, + "loss": 0.7679, + "step": 388 + }, + { + "epoch": 0.3112, + "grad_norm": 0.5777103511288991, + "learning_rate": 0.00016138896860055555, + "loss": 0.9326, + "step": 389 + }, + { + "epoch": 0.312, + "grad_norm": 0.48625375995616793, + "learning_rate": 0.0001611841464632011, + "loss": 0.8427, + "step": 390 + }, + { + "epoch": 0.3128, + "grad_norm": 0.4386903291683638, + "learning_rate": 0.00016097891323939062, + "loss": 0.7884, + "step": 391 + }, + { + "epoch": 0.3136, + "grad_norm": 0.46327681828111467, + "learning_rate": 0.0001607732703080532, + "loss": 0.8286, + "step": 392 + }, + { + "epoch": 0.3144, + "grad_norm": 0.5601019486968891, + "learning_rate": 0.00016056721905087056, + "loss": 0.8159, + "step": 393 + }, + { + "epoch": 0.3152, + "grad_norm": 0.4301418905400167, + "learning_rate": 0.00016036076085226814, + "loss": 0.7878, + "step": 394 + }, + { + "epoch": 0.316, + "grad_norm": 0.5026133069983575, + "learning_rate": 0.00016015389709940538, + "loss": 0.8828, + "step": 395 + }, + { + "epoch": 0.3168, + "grad_norm": 0.5299503679061817, + "learning_rate": 0.0001599466291821666, + "loss": 0.8348, + "step": 396 + }, + { + "epoch": 0.3176, + "grad_norm": 0.581319730296141, + "learning_rate": 0.0001597389584931517, + "loss": 0.8024, + "step": 397 + }, + { + "epoch": 0.3184, + "grad_norm": 0.5946632338435576, + "learning_rate": 0.0001595308864276666, + "loss": 0.8225, + "step": 398 + }, + { + "epoch": 0.3192, + "grad_norm": 0.4841597405281517, + "learning_rate": 0.0001593224143837142, + "loss": 0.8143, + "step": 399 + }, + { + "epoch": 0.32, + "grad_norm": 0.5492608521995358, + "learning_rate": 0.0001591135437619847, + "loss": 0.7956, + "step": 400 + }, + { + "epoch": 0.3208, + "grad_norm": 0.45387668461942016, + "learning_rate": 0.00015890427596584617, + "loss": 0.7372, + "step": 401 + }, + { + "epoch": 0.3216, + "grad_norm": 0.5297681314438389, + "learning_rate": 0.0001586946124013354, + "loss": 0.8176, + "step": 402 + }, + { + "epoch": 0.3224, + "grad_norm": 0.5237874007801877, + "learning_rate": 0.00015848455447714822, + "loss": 0.8855, + "step": 403 + }, + { + "epoch": 0.3232, + "grad_norm": 0.4203172607528882, + "learning_rate": 0.0001582741036046301, + "loss": 0.7114, + "step": 404 + }, + { + "epoch": 0.324, + "grad_norm": 0.5071416412021843, + "learning_rate": 0.00015806326119776663, + "loss": 0.8688, + "step": 405 + }, + { + "epoch": 0.3248, + "grad_norm": 0.46162030906175094, + "learning_rate": 0.00015785202867317407, + "loss": 0.7411, + "step": 406 + }, + { + "epoch": 0.3256, + "grad_norm": 0.4894583703431277, + "learning_rate": 0.00015764040745008988, + "loss": 0.801, + "step": 407 + }, + { + "epoch": 0.3264, + "grad_norm": 0.4711268262898016, + "learning_rate": 0.00015742839895036305, + "loss": 0.7989, + "step": 408 + }, + { + "epoch": 0.3272, + "grad_norm": 0.6222170391946266, + "learning_rate": 0.00015721600459844468, + "loss": 0.8928, + "step": 409 + }, + { + "epoch": 0.328, + "grad_norm": 0.5537514579365937, + "learning_rate": 0.00015700322582137827, + "loss": 0.8917, + "step": 410 + }, + { + "epoch": 0.3288, + "grad_norm": 0.6053596812657615, + "learning_rate": 0.00015679006404879033, + "loss": 0.9436, + "step": 411 + }, + { + "epoch": 0.3296, + "grad_norm": 0.5368770833791756, + "learning_rate": 0.0001565765207128805, + "loss": 0.8546, + "step": 412 + }, + { + "epoch": 0.3304, + "grad_norm": 0.6557971237633967, + "learning_rate": 0.00015636259724841222, + "loss": 0.9259, + "step": 413 + }, + { + "epoch": 0.3312, + "grad_norm": 0.416288866882269, + "learning_rate": 0.0001561482950927029, + "loss": 0.7353, + "step": 414 + }, + { + "epoch": 0.332, + "grad_norm": 0.5320282584159587, + "learning_rate": 0.00015593361568561428, + "loss": 0.9659, + "step": 415 + }, + { + "epoch": 0.3328, + "grad_norm": 0.541563183445848, + "learning_rate": 0.00015571856046954285, + "loss": 0.8598, + "step": 416 + }, + { + "epoch": 0.3336, + "grad_norm": 0.573944781458947, + "learning_rate": 0.0001555031308894101, + "loss": 0.8327, + "step": 417 + }, + { + "epoch": 0.3344, + "grad_norm": 0.48636608196368486, + "learning_rate": 0.00015528732839265272, + "loss": 0.7427, + "step": 418 + }, + { + "epoch": 0.3352, + "grad_norm": 0.6118686890019372, + "learning_rate": 0.0001550711544292131, + "loss": 0.8806, + "step": 419 + }, + { + "epoch": 0.336, + "grad_norm": 0.526158680155827, + "learning_rate": 0.0001548546104515294, + "loss": 0.8303, + "step": 420 + }, + { + "epoch": 0.3368, + "grad_norm": 0.585820456566811, + "learning_rate": 0.00015463769791452574, + "loss": 0.8888, + "step": 421 + }, + { + "epoch": 0.3376, + "grad_norm": 0.5641310017175182, + "learning_rate": 0.00015442041827560274, + "loss": 0.8957, + "step": 422 + }, + { + "epoch": 0.3384, + "grad_norm": 0.5515760176220771, + "learning_rate": 0.00015420277299462736, + "loss": 0.8979, + "step": 423 + }, + { + "epoch": 0.3392, + "grad_norm": 0.6231208105999786, + "learning_rate": 0.00015398476353392323, + "loss": 1.0441, + "step": 424 + }, + { + "epoch": 0.34, + "grad_norm": 0.7365207038290411, + "learning_rate": 0.00015376639135826107, + "loss": 0.9749, + "step": 425 + }, + { + "epoch": 0.3408, + "grad_norm": 0.4273855647133252, + "learning_rate": 0.00015354765793484834, + "loss": 0.7737, + "step": 426 + }, + { + "epoch": 0.3416, + "grad_norm": 0.5059365066578263, + "learning_rate": 0.00015332856473331978, + "loss": 0.8877, + "step": 427 + }, + { + "epoch": 0.3424, + "grad_norm": 0.503614973058092, + "learning_rate": 0.00015310911322572753, + "loss": 0.7642, + "step": 428 + }, + { + "epoch": 0.3432, + "grad_norm": 0.6730166765511943, + "learning_rate": 0.00015288930488653094, + "loss": 0.9213, + "step": 429 + }, + { + "epoch": 0.344, + "grad_norm": 0.46396620660917376, + "learning_rate": 0.000152669141192587, + "loss": 0.7674, + "step": 430 + }, + { + "epoch": 0.3448, + "grad_norm": 0.5034901297109798, + "learning_rate": 0.0001524486236231402, + "loss": 0.8469, + "step": 431 + }, + { + "epoch": 0.3456, + "grad_norm": 0.5123756459992598, + "learning_rate": 0.00015222775365981273, + "loss": 0.8319, + "step": 432 + }, + { + "epoch": 0.3464, + "grad_norm": 0.5387604003324437, + "learning_rate": 0.00015200653278659432, + "loss": 0.9073, + "step": 433 + }, + { + "epoch": 0.3472, + "grad_norm": 0.4725307584661988, + "learning_rate": 0.00015178496248983254, + "loss": 0.7775, + "step": 434 + }, + { + "epoch": 0.348, + "grad_norm": 0.4878781719567979, + "learning_rate": 0.00015156304425822267, + "loss": 0.7366, + "step": 435 + }, + { + "epoch": 0.3488, + "grad_norm": 0.455230826737341, + "learning_rate": 0.00015134077958279765, + "loss": 0.8115, + "step": 436 + }, + { + "epoch": 0.3496, + "grad_norm": 0.5261328242485291, + "learning_rate": 0.00015111816995691809, + "loss": 0.7685, + "step": 437 + }, + { + "epoch": 0.3504, + "grad_norm": 0.5064275726334223, + "learning_rate": 0.00015089521687626243, + "loss": 0.8493, + "step": 438 + }, + { + "epoch": 0.3512, + "grad_norm": 0.468520711844833, + "learning_rate": 0.00015067192183881658, + "loss": 0.6597, + "step": 439 + }, + { + "epoch": 0.352, + "grad_norm": 0.44373890533763766, + "learning_rate": 0.000150448286344864, + "loss": 0.7828, + "step": 440 + }, + { + "epoch": 0.3528, + "grad_norm": 0.3758784537131547, + "learning_rate": 0.00015022431189697568, + "loss": 0.7145, + "step": 441 + }, + { + "epoch": 0.3536, + "grad_norm": 0.5414512159240334, + "learning_rate": 0.00015000000000000001, + "loss": 0.8886, + "step": 442 + }, + { + "epoch": 0.3544, + "grad_norm": 0.46836934499681643, + "learning_rate": 0.0001497753521610526, + "loss": 0.8519, + "step": 443 + }, + { + "epoch": 0.3552, + "grad_norm": 0.5478216473576355, + "learning_rate": 0.00014955036988950618, + "loss": 0.8393, + "step": 444 + }, + { + "epoch": 0.356, + "grad_norm": 0.6188204660715885, + "learning_rate": 0.00014932505469698052, + "loss": 0.9017, + "step": 445 + }, + { + "epoch": 0.3568, + "grad_norm": 0.520421586764439, + "learning_rate": 0.00014909940809733222, + "loss": 0.8611, + "step": 446 + }, + { + "epoch": 0.3576, + "grad_norm": 0.48349042295159567, + "learning_rate": 0.0001488734316066446, + "loss": 0.8638, + "step": 447 + }, + { + "epoch": 0.3584, + "grad_norm": 0.6761883802868082, + "learning_rate": 0.00014864712674321734, + "loss": 0.7786, + "step": 448 + }, + { + "epoch": 0.3592, + "grad_norm": 0.4161033990297297, + "learning_rate": 0.0001484204950275565, + "loss": 0.726, + "step": 449 + }, + { + "epoch": 0.36, + "grad_norm": 0.5637465759314259, + "learning_rate": 0.00014819353798236427, + "loss": 0.8283, + "step": 450 + }, + { + "epoch": 0.3608, + "grad_norm": 0.4763183478858337, + "learning_rate": 0.00014796625713252848, + "loss": 0.7195, + "step": 451 + }, + { + "epoch": 0.3616, + "grad_norm": 0.5708493331226704, + "learning_rate": 0.00014773865400511272, + "loss": 0.8474, + "step": 452 + }, + { + "epoch": 0.3624, + "grad_norm": 0.5242772582100799, + "learning_rate": 0.00014751073012934587, + "loss": 0.8982, + "step": 453 + }, + { + "epoch": 0.3632, + "grad_norm": 0.4342083988934154, + "learning_rate": 0.00014728248703661182, + "loss": 0.6962, + "step": 454 + }, + { + "epoch": 0.364, + "grad_norm": 0.4241316584309476, + "learning_rate": 0.0001470539262604393, + "loss": 0.779, + "step": 455 + }, + { + "epoch": 0.3648, + "grad_norm": 0.5512281159501403, + "learning_rate": 0.00014682504933649144, + "loss": 0.9414, + "step": 456 + }, + { + "epoch": 0.3656, + "grad_norm": 0.47281763704161467, + "learning_rate": 0.00014659585780255556, + "loss": 0.7479, + "step": 457 + }, + { + "epoch": 0.3664, + "grad_norm": 0.5253262322318374, + "learning_rate": 0.00014636635319853275, + "loss": 0.9399, + "step": 458 + }, + { + "epoch": 0.3672, + "grad_norm": 0.4713053988398959, + "learning_rate": 0.0001461365370664276, + "loss": 0.751, + "step": 459 + }, + { + "epoch": 0.368, + "grad_norm": 0.48804668492107184, + "learning_rate": 0.00014590641095033787, + "loss": 0.7617, + "step": 460 + }, + { + "epoch": 0.3688, + "grad_norm": 0.4894482088008986, + "learning_rate": 0.00014567597639644387, + "loss": 0.8002, + "step": 461 + }, + { + "epoch": 0.3696, + "grad_norm": 0.4960660530804454, + "learning_rate": 0.00014544523495299842, + "loss": 0.8239, + "step": 462 + }, + { + "epoch": 0.3704, + "grad_norm": 0.4336854030947481, + "learning_rate": 0.00014521418817031628, + "loss": 0.7727, + "step": 463 + }, + { + "epoch": 0.3712, + "grad_norm": 0.5365196664802592, + "learning_rate": 0.0001449828376007636, + "loss": 0.7533, + "step": 464 + }, + { + "epoch": 0.372, + "grad_norm": 0.46916503004233767, + "learning_rate": 0.00014475118479874774, + "loss": 0.7548, + "step": 465 + }, + { + "epoch": 0.3728, + "grad_norm": 0.7009035094639057, + "learning_rate": 0.0001445192313207067, + "loss": 0.7939, + "step": 466 + }, + { + "epoch": 0.3736, + "grad_norm": 0.45369423347572424, + "learning_rate": 0.0001442869787250987, + "loss": 0.7162, + "step": 467 + }, + { + "epoch": 0.3744, + "grad_norm": 0.5406665246930531, + "learning_rate": 0.0001440544285723915, + "loss": 0.8502, + "step": 468 + }, + { + "epoch": 0.3752, + "grad_norm": 0.4098289899450824, + "learning_rate": 0.00014382158242505234, + "loss": 0.7633, + "step": 469 + }, + { + "epoch": 0.376, + "grad_norm": 0.5308195138806584, + "learning_rate": 0.00014358844184753712, + "loss": 0.8122, + "step": 470 + }, + { + "epoch": 0.3768, + "grad_norm": 0.5017791079272266, + "learning_rate": 0.00014335500840627986, + "loss": 0.8365, + "step": 471 + }, + { + "epoch": 0.3776, + "grad_norm": 0.6353701531931035, + "learning_rate": 0.00014312128366968243, + "loss": 0.8842, + "step": 472 + }, + { + "epoch": 0.3784, + "grad_norm": 0.5828084171271591, + "learning_rate": 0.0001428872692081038, + "loss": 0.8396, + "step": 473 + }, + { + "epoch": 0.3792, + "grad_norm": 0.49085125369595195, + "learning_rate": 0.00014265296659384956, + "loss": 0.8298, + "step": 474 + }, + { + "epoch": 0.38, + "grad_norm": 0.5753743296959409, + "learning_rate": 0.00014241837740116132, + "loss": 0.854, + "step": 475 + }, + { + "epoch": 0.3808, + "grad_norm": 0.4753494413053485, + "learning_rate": 0.00014218350320620624, + "loss": 0.7404, + "step": 476 + }, + { + "epoch": 0.3816, + "grad_norm": 0.5012155769884424, + "learning_rate": 0.00014194834558706632, + "loss": 0.8808, + "step": 477 + }, + { + "epoch": 0.3824, + "grad_norm": 0.4641152707086399, + "learning_rate": 0.0001417129061237278, + "loss": 0.7923, + "step": 478 + }, + { + "epoch": 0.3832, + "grad_norm": 0.6036177751051387, + "learning_rate": 0.0001414771863980707, + "loss": 0.8972, + "step": 479 + }, + { + "epoch": 0.384, + "grad_norm": 0.49537016981133947, + "learning_rate": 0.00014124118799385796, + "loss": 0.8381, + "step": 480 + }, + { + "epoch": 0.3848, + "grad_norm": 0.47387744660741005, + "learning_rate": 0.00014100491249672498, + "loss": 0.8017, + "step": 481 + }, + { + "epoch": 0.3856, + "grad_norm": 0.5346572691831166, + "learning_rate": 0.00014076836149416887, + "loss": 0.7592, + "step": 482 + }, + { + "epoch": 0.3864, + "grad_norm": 0.39518148845260165, + "learning_rate": 0.0001405315365755379, + "loss": 0.7655, + "step": 483 + }, + { + "epoch": 0.3872, + "grad_norm": 0.5086735680516187, + "learning_rate": 0.0001402944393320206, + "loss": 0.9133, + "step": 484 + }, + { + "epoch": 0.388, + "grad_norm": 0.5183446764477347, + "learning_rate": 0.00014005707135663527, + "loss": 0.8268, + "step": 485 + }, + { + "epoch": 0.3888, + "grad_norm": 0.5079181549084607, + "learning_rate": 0.00013981943424421932, + "loss": 0.7564, + "step": 486 + }, + { + "epoch": 0.3896, + "grad_norm": 0.5723226508908508, + "learning_rate": 0.00013958152959141825, + "loss": 0.8547, + "step": 487 + }, + { + "epoch": 0.3904, + "grad_norm": 0.43762737232927784, + "learning_rate": 0.00013934335899667527, + "loss": 0.7613, + "step": 488 + }, + { + "epoch": 0.3912, + "grad_norm": 0.5777902349863276, + "learning_rate": 0.00013910492406022033, + "loss": 0.8169, + "step": 489 + }, + { + "epoch": 0.392, + "grad_norm": 0.46119697962249157, + "learning_rate": 0.00013886622638405952, + "loss": 0.693, + "step": 490 + }, + { + "epoch": 0.3928, + "grad_norm": 0.5679592046461474, + "learning_rate": 0.0001386272675719642, + "loss": 0.8662, + "step": 491 + }, + { + "epoch": 0.3936, + "grad_norm": 0.5143643118251245, + "learning_rate": 0.00013838804922946027, + "loss": 0.784, + "step": 492 + }, + { + "epoch": 0.3944, + "grad_norm": 0.5327345799873556, + "learning_rate": 0.00013814857296381728, + "loss": 0.8125, + "step": 493 + }, + { + "epoch": 0.3952, + "grad_norm": 0.4595260751518562, + "learning_rate": 0.00013790884038403795, + "loss": 0.7549, + "step": 494 + }, + { + "epoch": 0.396, + "grad_norm": 0.5045007852387343, + "learning_rate": 0.00013766885310084688, + "loss": 0.8154, + "step": 495 + }, + { + "epoch": 0.3968, + "grad_norm": 0.4463937954372007, + "learning_rate": 0.00013742861272668012, + "loss": 0.6961, + "step": 496 + }, + { + "epoch": 0.3976, + "grad_norm": 0.552245733737881, + "learning_rate": 0.00013718812087567414, + "loss": 0.8687, + "step": 497 + }, + { + "epoch": 0.3984, + "grad_norm": 0.49848544400728884, + "learning_rate": 0.00013694737916365517, + "loss": 0.7747, + "step": 498 + }, + { + "epoch": 0.3992, + "grad_norm": 0.5223353961366203, + "learning_rate": 0.000136706389208128, + "loss": 0.7559, + "step": 499 + }, + { + "epoch": 0.4, + "grad_norm": 0.6075208329067229, + "learning_rate": 0.00013646515262826552, + "loss": 0.8909, + "step": 500 + }, + { + "epoch": 0.4008, + "grad_norm": 0.815476728266723, + "learning_rate": 0.00013622367104489756, + "loss": 0.9289, + "step": 501 + }, + { + "epoch": 0.4016, + "grad_norm": 0.643028271104798, + "learning_rate": 0.0001359819460805001, + "loss": 0.8244, + "step": 502 + }, + { + "epoch": 0.4024, + "grad_norm": 0.48280726217974457, + "learning_rate": 0.0001357399793591844, + "loss": 0.828, + "step": 503 + }, + { + "epoch": 0.4032, + "grad_norm": 0.5439674438982885, + "learning_rate": 0.0001354977725066859, + "loss": 0.8534, + "step": 504 + }, + { + "epoch": 0.404, + "grad_norm": 0.4687977100600042, + "learning_rate": 0.00013525532715035366, + "loss": 0.7626, + "step": 505 + }, + { + "epoch": 0.4048, + "grad_norm": 0.433287770246132, + "learning_rate": 0.00013501264491913906, + "loss": 0.7956, + "step": 506 + }, + { + "epoch": 0.4056, + "grad_norm": 0.5126930198527264, + "learning_rate": 0.00013476972744358507, + "loss": 0.832, + "step": 507 + }, + { + "epoch": 0.4064, + "grad_norm": 0.6354646818331343, + "learning_rate": 0.0001345265763558152, + "loss": 0.9188, + "step": 508 + }, + { + "epoch": 0.4072, + "grad_norm": 0.6326475734716094, + "learning_rate": 0.00013428319328952253, + "loss": 0.9524, + "step": 509 + }, + { + "epoch": 0.408, + "grad_norm": 0.41664299189410514, + "learning_rate": 0.00013403957987995882, + "loss": 0.7067, + "step": 510 + }, + { + "epoch": 0.4088, + "grad_norm": 0.4740842085361335, + "learning_rate": 0.0001337957377639235, + "loss": 0.6943, + "step": 511 + }, + { + "epoch": 0.4096, + "grad_norm": 0.48662916484067625, + "learning_rate": 0.0001335516685797525, + "loss": 0.7801, + "step": 512 + }, + { + "epoch": 0.4104, + "grad_norm": 0.4918517295772121, + "learning_rate": 0.0001333073739673076, + "loss": 0.7931, + "step": 513 + }, + { + "epoch": 0.4112, + "grad_norm": 0.4631998368398158, + "learning_rate": 0.00013306285556796495, + "loss": 0.7924, + "step": 514 + }, + { + "epoch": 0.412, + "grad_norm": 0.6379201450826519, + "learning_rate": 0.0001328181150246045, + "loss": 0.9894, + "step": 515 + }, + { + "epoch": 0.4128, + "grad_norm": 0.4242585975126765, + "learning_rate": 0.00013257315398159864, + "loss": 0.7472, + "step": 516 + }, + { + "epoch": 0.4136, + "grad_norm": 0.5478588902325013, + "learning_rate": 0.00013232797408480127, + "loss": 0.9123, + "step": 517 + }, + { + "epoch": 0.4144, + "grad_norm": 0.5389180561217947, + "learning_rate": 0.00013208257698153677, + "loss": 0.8135, + "step": 518 + }, + { + "epoch": 0.4152, + "grad_norm": 0.585498259232321, + "learning_rate": 0.00013183696432058888, + "loss": 0.9488, + "step": 519 + }, + { + "epoch": 0.416, + "grad_norm": 0.5871634185845148, + "learning_rate": 0.00013159113775218964, + "loss": 0.9312, + "step": 520 + }, + { + "epoch": 0.4168, + "grad_norm": 0.562709572212109, + "learning_rate": 0.00013134509892800822, + "loss": 0.8945, + "step": 521 + }, + { + "epoch": 0.4176, + "grad_norm": 0.4825365262187394, + "learning_rate": 0.00013109884950114007, + "loss": 0.78, + "step": 522 + }, + { + "epoch": 0.4184, + "grad_norm": 0.5558307139548955, + "learning_rate": 0.00013085239112609547, + "loss": 0.8581, + "step": 523 + }, + { + "epoch": 0.4192, + "grad_norm": 0.5099124822237073, + "learning_rate": 0.00013060572545878875, + "loss": 0.8459, + "step": 524 + }, + { + "epoch": 0.42, + "grad_norm": 0.5651918845767404, + "learning_rate": 0.00013035885415652685, + "loss": 0.8585, + "step": 525 + }, + { + "epoch": 0.4208, + "grad_norm": 0.594763452897937, + "learning_rate": 0.00013011177887799845, + "loss": 0.9477, + "step": 526 + }, + { + "epoch": 0.4216, + "grad_norm": 0.5108837346519073, + "learning_rate": 0.00012986450128326266, + "loss": 0.8448, + "step": 527 + }, + { + "epoch": 0.4224, + "grad_norm": 0.6149527213658991, + "learning_rate": 0.00012961702303373795, + "loss": 0.8743, + "step": 528 + }, + { + "epoch": 0.4232, + "grad_norm": 0.5190735988917229, + "learning_rate": 0.00012936934579219094, + "loss": 0.8117, + "step": 529 + }, + { + "epoch": 0.424, + "grad_norm": 0.6081118249705028, + "learning_rate": 0.00012912147122272523, + "loss": 0.8562, + "step": 530 + }, + { + "epoch": 0.4248, + "grad_norm": 0.4608503311720638, + "learning_rate": 0.00012887340099077024, + "loss": 0.7288, + "step": 531 + }, + { + "epoch": 0.4256, + "grad_norm": 0.6559202185899272, + "learning_rate": 0.00012862513676307008, + "loss": 0.8916, + "step": 532 + }, + { + "epoch": 0.4264, + "grad_norm": 0.47914269014723737, + "learning_rate": 0.0001283766802076722, + "loss": 0.7825, + "step": 533 + }, + { + "epoch": 0.4272, + "grad_norm": 0.569294625307409, + "learning_rate": 0.00012812803299391628, + "loss": 0.8941, + "step": 534 + }, + { + "epoch": 0.428, + "grad_norm": 0.5115071926180038, + "learning_rate": 0.00012787919679242306, + "loss": 0.8166, + "step": 535 + }, + { + "epoch": 0.4288, + "grad_norm": 0.43673382782497544, + "learning_rate": 0.00012763017327508305, + "loss": 0.8108, + "step": 536 + }, + { + "epoch": 0.4296, + "grad_norm": 0.4193508222034899, + "learning_rate": 0.00012738096411504522, + "loss": 0.6614, + "step": 537 + }, + { + "epoch": 0.4304, + "grad_norm": 0.5444916476669717, + "learning_rate": 0.0001271315709867059, + "loss": 0.8307, + "step": 538 + }, + { + "epoch": 0.4312, + "grad_norm": 0.39709390155112134, + "learning_rate": 0.00012688199556569753, + "loss": 0.7885, + "step": 539 + }, + { + "epoch": 0.432, + "grad_norm": 0.4663528741990849, + "learning_rate": 0.00012663223952887723, + "loss": 0.7717, + "step": 540 + }, + { + "epoch": 0.4328, + "grad_norm": 0.44767087061840327, + "learning_rate": 0.0001263823045543158, + "loss": 0.7243, + "step": 541 + }, + { + "epoch": 0.4336, + "grad_norm": 0.5400025740138015, + "learning_rate": 0.00012613219232128608, + "loss": 0.9107, + "step": 542 + }, + { + "epoch": 0.4344, + "grad_norm": 0.7242760382075248, + "learning_rate": 0.00012588190451025207, + "loss": 0.928, + "step": 543 + }, + { + "epoch": 0.4352, + "grad_norm": 0.4699883135672619, + "learning_rate": 0.00012563144280285741, + "loss": 0.8147, + "step": 544 + }, + { + "epoch": 0.436, + "grad_norm": 0.4507172457148455, + "learning_rate": 0.00012538080888191408, + "loss": 0.7147, + "step": 545 + }, + { + "epoch": 0.4368, + "grad_norm": 0.4477273850438377, + "learning_rate": 0.00012513000443139112, + "loss": 0.7828, + "step": 546 + }, + { + "epoch": 0.4376, + "grad_norm": 0.573213467575292, + "learning_rate": 0.00012487903113640337, + "loss": 0.9016, + "step": 547 + }, + { + "epoch": 0.4384, + "grad_norm": 0.7395294482787395, + "learning_rate": 0.00012462789068320017, + "loss": 0.8013, + "step": 548 + }, + { + "epoch": 0.4392, + "grad_norm": 0.4278327164826983, + "learning_rate": 0.00012437658475915377, + "loss": 0.722, + "step": 549 + }, + { + "epoch": 0.44, + "grad_norm": 0.5481371702328152, + "learning_rate": 0.00012412511505274844, + "loss": 0.7552, + "step": 550 + }, + { + "epoch": 0.4408, + "grad_norm": 0.4438702046348752, + "learning_rate": 0.00012387348325356874, + "loss": 0.7923, + "step": 551 + }, + { + "epoch": 0.4416, + "grad_norm": 0.49339323979189215, + "learning_rate": 0.00012362169105228826, + "loss": 0.6711, + "step": 552 + }, + { + "epoch": 0.4424, + "grad_norm": 0.43469574787777077, + "learning_rate": 0.00012336974014065844, + "loss": 0.7608, + "step": 553 + }, + { + "epoch": 0.4432, + "grad_norm": 0.45469460771454245, + "learning_rate": 0.000123117632211497, + "loss": 0.7548, + "step": 554 + }, + { + "epoch": 0.444, + "grad_norm": 0.4406509515165732, + "learning_rate": 0.00012286536895867654, + "loss": 0.7219, + "step": 555 + }, + { + "epoch": 0.4448, + "grad_norm": 0.49441488602329037, + "learning_rate": 0.00012261295207711346, + "loss": 0.7927, + "step": 556 + }, + { + "epoch": 0.4456, + "grad_norm": 0.7379867681521697, + "learning_rate": 0.00012236038326275626, + "loss": 0.9384, + "step": 557 + }, + { + "epoch": 0.4464, + "grad_norm": 0.4419917602257873, + "learning_rate": 0.0001221076642125742, + "loss": 0.6832, + "step": 558 + }, + { + "epoch": 0.4472, + "grad_norm": 0.5889559609480378, + "learning_rate": 0.00012185479662454595, + "loss": 0.7841, + "step": 559 + }, + { + "epoch": 0.448, + "grad_norm": 0.40878678657385226, + "learning_rate": 0.00012160178219764837, + "loss": 0.7209, + "step": 560 + }, + { + "epoch": 0.4488, + "grad_norm": 0.4902067343879289, + "learning_rate": 0.00012134862263184467, + "loss": 0.7689, + "step": 561 + }, + { + "epoch": 0.4496, + "grad_norm": 0.39543419743016395, + "learning_rate": 0.00012109531962807332, + "loss": 0.7314, + "step": 562 + }, + { + "epoch": 0.4504, + "grad_norm": 0.5023813018744453, + "learning_rate": 0.00012084187488823657, + "loss": 0.7474, + "step": 563 + }, + { + "epoch": 0.4512, + "grad_norm": 0.49204425833848103, + "learning_rate": 0.00012058829011518896, + "loss": 0.8135, + "step": 564 + }, + { + "epoch": 0.452, + "grad_norm": 0.5434543681211457, + "learning_rate": 0.00012033456701272576, + "loss": 0.9608, + "step": 565 + }, + { + "epoch": 0.4528, + "grad_norm": 0.3584608607245962, + "learning_rate": 0.00012008070728557186, + "loss": 0.7393, + "step": 566 + }, + { + "epoch": 0.4536, + "grad_norm": 0.5421303928890512, + "learning_rate": 0.00011982671263936995, + "loss": 0.8541, + "step": 567 + }, + { + "epoch": 0.4544, + "grad_norm": 0.5041842986860352, + "learning_rate": 0.00011957258478066931, + "loss": 0.7802, + "step": 568 + }, + { + "epoch": 0.4552, + "grad_norm": 0.5686617552697284, + "learning_rate": 0.00011931832541691418, + "loss": 0.7993, + "step": 569 + }, + { + "epoch": 0.456, + "grad_norm": 0.5174441794883738, + "learning_rate": 0.00011906393625643244, + "loss": 0.8221, + "step": 570 + }, + { + "epoch": 0.4568, + "grad_norm": 0.4343168867662223, + "learning_rate": 0.00011880941900842397, + "loss": 0.7197, + "step": 571 + }, + { + "epoch": 0.4576, + "grad_norm": 0.6260800137721261, + "learning_rate": 0.00011855477538294935, + "loss": 0.8975, + "step": 572 + }, + { + "epoch": 0.4584, + "grad_norm": 0.4406198345363788, + "learning_rate": 0.00011830000709091815, + "loss": 0.7761, + "step": 573 + }, + { + "epoch": 0.4592, + "grad_norm": 0.46177843371607763, + "learning_rate": 0.00011804511584407763, + "loss": 0.7759, + "step": 574 + }, + { + "epoch": 0.46, + "grad_norm": 0.5577682154881546, + "learning_rate": 0.0001177901033550012, + "loss": 0.8044, + "step": 575 + }, + { + "epoch": 0.4608, + "grad_norm": 0.5629217703960968, + "learning_rate": 0.00011753497133707679, + "loss": 0.9484, + "step": 576 + }, + { + "epoch": 0.4616, + "grad_norm": 0.47427370070049796, + "learning_rate": 0.00011727972150449544, + "loss": 0.8189, + "step": 577 + }, + { + "epoch": 0.4624, + "grad_norm": 0.43541803965039383, + "learning_rate": 0.00011702435557223987, + "loss": 0.7736, + "step": 578 + }, + { + "epoch": 0.4632, + "grad_norm": 0.5604636650499492, + "learning_rate": 0.00011676887525607271, + "loss": 0.7894, + "step": 579 + }, + { + "epoch": 0.464, + "grad_norm": 0.40486175581997225, + "learning_rate": 0.00011651328227252517, + "loss": 0.7001, + "step": 580 + }, + { + "epoch": 0.4648, + "grad_norm": 0.6116138036064188, + "learning_rate": 0.00011625757833888551, + "loss": 0.968, + "step": 581 + }, + { + "epoch": 0.4656, + "grad_norm": 0.47050393754892433, + "learning_rate": 0.00011600176517318741, + "loss": 0.7368, + "step": 582 + }, + { + "epoch": 0.4664, + "grad_norm": 0.6197201726874729, + "learning_rate": 0.0001157458444941984, + "loss": 0.8447, + "step": 583 + }, + { + "epoch": 0.4672, + "grad_norm": 0.43127498601075004, + "learning_rate": 0.00011548981802140848, + "loss": 0.7601, + "step": 584 + }, + { + "epoch": 0.468, + "grad_norm": 0.5239926917161649, + "learning_rate": 0.00011523368747501839, + "loss": 0.7749, + "step": 585 + }, + { + "epoch": 0.4688, + "grad_norm": 0.5557068205548962, + "learning_rate": 0.00011497745457592816, + "loss": 0.7991, + "step": 586 + }, + { + "epoch": 0.4696, + "grad_norm": 0.38524225379152355, + "learning_rate": 0.00011472112104572547, + "loss": 0.6911, + "step": 587 + }, + { + "epoch": 0.4704, + "grad_norm": 0.5867606517164021, + "learning_rate": 0.00011446468860667421, + "loss": 0.8164, + "step": 588 + }, + { + "epoch": 0.4712, + "grad_norm": 0.5524784254512559, + "learning_rate": 0.0001142081589817027, + "loss": 0.8913, + "step": 589 + }, + { + "epoch": 0.472, + "grad_norm": 0.5090520257090125, + "learning_rate": 0.00011395153389439233, + "loss": 0.7865, + "step": 590 + }, + { + "epoch": 0.4728, + "grad_norm": 0.4061404196162076, + "learning_rate": 0.00011369481506896582, + "loss": 0.7403, + "step": 591 + }, + { + "epoch": 0.4736, + "grad_norm": 0.3887932548504206, + "learning_rate": 0.00011343800423027582, + "loss": 0.7083, + "step": 592 + }, + { + "epoch": 0.4744, + "grad_norm": 0.4300486673673649, + "learning_rate": 0.00011318110310379301, + "loss": 0.7345, + "step": 593 + }, + { + "epoch": 0.4752, + "grad_norm": 0.44405217119720175, + "learning_rate": 0.0001129241134155949, + "loss": 0.7479, + "step": 594 + }, + { + "epoch": 0.476, + "grad_norm": 0.5407904878583915, + "learning_rate": 0.00011266703689235394, + "loss": 0.8068, + "step": 595 + }, + { + "epoch": 0.4768, + "grad_norm": 0.6441015602700922, + "learning_rate": 0.00011240987526132594, + "loss": 0.883, + "step": 596 + }, + { + "epoch": 0.4776, + "grad_norm": 0.48156565405133755, + "learning_rate": 0.00011215263025033869, + "loss": 0.7746, + "step": 597 + }, + { + "epoch": 0.4784, + "grad_norm": 0.5509487651952958, + "learning_rate": 0.00011189530358778005, + "loss": 0.7485, + "step": 598 + }, + { + "epoch": 0.4792, + "grad_norm": 0.6022651603352311, + "learning_rate": 0.00011163789700258655, + "loss": 0.9138, + "step": 599 + }, + { + "epoch": 0.48, + "grad_norm": 0.5698457697147494, + "learning_rate": 0.00011138041222423177, + "loss": 0.824, + "step": 600 + }, + { + "epoch": 0.4808, + "grad_norm": 0.37478841358951825, + "learning_rate": 0.00011112285098271451, + "loss": 0.7013, + "step": 601 + }, + { + "epoch": 0.4816, + "grad_norm": 0.5937745460335725, + "learning_rate": 0.00011086521500854745, + "loss": 0.9156, + "step": 602 + }, + { + "epoch": 0.4824, + "grad_norm": 0.4117617011434398, + "learning_rate": 0.00011060750603274535, + "loss": 0.7609, + "step": 603 + }, + { + "epoch": 0.4832, + "grad_norm": 0.452191062612939, + "learning_rate": 0.00011034972578681338, + "loss": 0.7114, + "step": 604 + }, + { + "epoch": 0.484, + "grad_norm": 0.5080631766111878, + "learning_rate": 0.00011009187600273566, + "loss": 0.7616, + "step": 605 + }, + { + "epoch": 0.4848, + "grad_norm": 0.46197159052348113, + "learning_rate": 0.00010983395841296348, + "loss": 0.7823, + "step": 606 + }, + { + "epoch": 0.4856, + "grad_norm": 0.5913203255644318, + "learning_rate": 0.00010957597475040373, + "loss": 0.8534, + "step": 607 + }, + { + "epoch": 0.4864, + "grad_norm": 0.4913816367545617, + "learning_rate": 0.00010931792674840718, + "loss": 0.8438, + "step": 608 + }, + { + "epoch": 0.4872, + "grad_norm": 0.4321676807811443, + "learning_rate": 0.00010905981614075693, + "loss": 0.6898, + "step": 609 + }, + { + "epoch": 0.488, + "grad_norm": 0.43210841415093076, + "learning_rate": 0.00010880164466165674, + "loss": 0.7677, + "step": 610 + }, + { + "epoch": 0.4888, + "grad_norm": 0.4399667333715279, + "learning_rate": 0.00010854341404571928, + "loss": 0.7211, + "step": 611 + }, + { + "epoch": 0.4896, + "grad_norm": 0.5513206722172173, + "learning_rate": 0.00010828512602795462, + "loss": 0.8129, + "step": 612 + }, + { + "epoch": 0.4904, + "grad_norm": 0.4525528282854611, + "learning_rate": 0.00010802678234375851, + "loss": 0.7782, + "step": 613 + }, + { + "epoch": 0.4912, + "grad_norm": 0.38156065752706614, + "learning_rate": 0.00010776838472890065, + "loss": 0.6655, + "step": 614 + }, + { + "epoch": 0.492, + "grad_norm": 0.7079666217225821, + "learning_rate": 0.0001075099349195131, + "loss": 0.9053, + "step": 615 + }, + { + "epoch": 0.4928, + "grad_norm": 0.584023598611178, + "learning_rate": 0.00010725143465207867, + "loss": 0.8989, + "step": 616 + }, + { + "epoch": 0.4936, + "grad_norm": 0.5537053421798345, + "learning_rate": 0.00010699288566341914, + "loss": 0.8031, + "step": 617 + }, + { + "epoch": 0.4944, + "grad_norm": 0.5226360487155243, + "learning_rate": 0.00010673428969068364, + "loss": 0.8439, + "step": 618 + }, + { + "epoch": 0.4952, + "grad_norm": 0.4522843280300031, + "learning_rate": 0.000106475648471337, + "loss": 0.7835, + "step": 619 + }, + { + "epoch": 0.496, + "grad_norm": 0.48848173792589555, + "learning_rate": 0.00010621696374314807, + "loss": 0.8099, + "step": 620 + }, + { + "epoch": 0.4968, + "grad_norm": 0.45958022507217944, + "learning_rate": 0.00010595823724417795, + "loss": 0.7018, + "step": 621 + }, + { + "epoch": 0.4976, + "grad_norm": 0.41466007031314833, + "learning_rate": 0.00010569947071276847, + "loss": 0.7065, + "step": 622 + }, + { + "epoch": 0.4984, + "grad_norm": 0.425364937392599, + "learning_rate": 0.00010544066588753044, + "loss": 0.7512, + "step": 623 + }, + { + "epoch": 0.4992, + "grad_norm": 0.5096590210775217, + "learning_rate": 0.00010518182450733186, + "loss": 0.7954, + "step": 624 + }, + { + "epoch": 0.5, + "grad_norm": 0.48648410550376947, + "learning_rate": 0.00010492294831128641, + "loss": 0.8412, + "step": 625 + }, + { + "epoch": 0.5008, + "grad_norm": 0.623289539500742, + "learning_rate": 0.00010466403903874176, + "loss": 0.8941, + "step": 626 + }, + { + "epoch": 0.5016, + "grad_norm": 0.5677803983975525, + "learning_rate": 0.00010440509842926767, + "loss": 0.841, + "step": 627 + }, + { + "epoch": 0.5024, + "grad_norm": 0.34105874371506884, + "learning_rate": 0.00010414612822264455, + "loss": 0.5849, + "step": 628 + }, + { + "epoch": 0.5032, + "grad_norm": 0.6244999281906921, + "learning_rate": 0.00010388713015885161, + "loss": 0.8543, + "step": 629 + }, + { + "epoch": 0.504, + "grad_norm": 0.5141705732697385, + "learning_rate": 0.00010362810597805526, + "loss": 0.8356, + "step": 630 + }, + { + "epoch": 0.5048, + "grad_norm": 0.5386242018582555, + "learning_rate": 0.00010336905742059742, + "loss": 0.7775, + "step": 631 + }, + { + "epoch": 0.5056, + "grad_norm": 0.6612636754644633, + "learning_rate": 0.0001031099862269837, + "loss": 0.9341, + "step": 632 + }, + { + "epoch": 0.5064, + "grad_norm": 0.5066295764943997, + "learning_rate": 0.0001028508941378719, + "loss": 0.7679, + "step": 633 + }, + { + "epoch": 0.5072, + "grad_norm": 0.5929415431147723, + "learning_rate": 0.00010259178289406011, + "loss": 0.8538, + "step": 634 + }, + { + "epoch": 0.508, + "grad_norm": 0.4823545097198918, + "learning_rate": 0.00010233265423647523, + "loss": 0.8509, + "step": 635 + }, + { + "epoch": 0.5088, + "grad_norm": 0.45011328394148964, + "learning_rate": 0.00010207350990616107, + "loss": 0.7047, + "step": 636 + }, + { + "epoch": 0.5096, + "grad_norm": 0.4957158187778744, + "learning_rate": 0.00010181435164426676, + "loss": 0.8761, + "step": 637 + }, + { + "epoch": 0.5104, + "grad_norm": 0.5131984276569533, + "learning_rate": 0.0001015551811920351, + "loss": 0.8019, + "step": 638 + }, + { + "epoch": 0.5112, + "grad_norm": 0.6623000702243821, + "learning_rate": 0.00010129600029079072, + "loss": 0.8546, + "step": 639 + }, + { + "epoch": 0.512, + "grad_norm": 0.372888034511951, + "learning_rate": 0.00010103681068192845, + "loss": 0.719, + "step": 640 + }, + { + "epoch": 0.5128, + "grad_norm": 0.48089725992376436, + "learning_rate": 0.00010077761410690172, + "loss": 0.7641, + "step": 641 + }, + { + "epoch": 0.5136, + "grad_norm": 0.4869011180856098, + "learning_rate": 0.00010051841230721065, + "loss": 0.8183, + "step": 642 + }, + { + "epoch": 0.5144, + "grad_norm": 0.6683558921353259, + "learning_rate": 0.00010025920702439051, + "loss": 0.8939, + "step": 643 + }, + { + "epoch": 0.5152, + "grad_norm": 0.510736639991274, + "learning_rate": 0.0001, + "loss": 0.7993, + "step": 644 + }, + { + "epoch": 0.516, + "grad_norm": 0.5055250734081491, + "learning_rate": 9.97407929756095e-05, + "loss": 0.808, + "step": 645 + }, + { + "epoch": 0.5168, + "grad_norm": 0.6225426629122933, + "learning_rate": 9.948158769278939e-05, + "loss": 0.9013, + "step": 646 + }, + { + "epoch": 0.5176, + "grad_norm": 0.5637350549578577, + "learning_rate": 9.92223858930983e-05, + "loss": 0.7932, + "step": 647 + }, + { + "epoch": 0.5184, + "grad_norm": 0.46993838974226765, + "learning_rate": 9.896318931807155e-05, + "loss": 0.7825, + "step": 648 + }, + { + "epoch": 0.5192, + "grad_norm": 0.5196216125995946, + "learning_rate": 9.870399970920932e-05, + "loss": 0.9727, + "step": 649 + }, + { + "epoch": 0.52, + "grad_norm": 0.5130227828649497, + "learning_rate": 9.844481880796491e-05, + "loss": 0.718, + "step": 650 + }, + { + "epoch": 0.5208, + "grad_norm": 0.5286992190561343, + "learning_rate": 9.818564835573323e-05, + "loss": 0.7683, + "step": 651 + }, + { + "epoch": 0.5216, + "grad_norm": 0.6290709225831332, + "learning_rate": 9.792649009383899e-05, + "loss": 0.8726, + "step": 652 + }, + { + "epoch": 0.5224, + "grad_norm": 0.40958606639063, + "learning_rate": 9.766734576352478e-05, + "loss": 0.6989, + "step": 653 + }, + { + "epoch": 0.5232, + "grad_norm": 0.5992144491797667, + "learning_rate": 9.740821710593989e-05, + "loss": 0.8579, + "step": 654 + }, + { + "epoch": 0.524, + "grad_norm": 0.5408709821613941, + "learning_rate": 9.714910586212816e-05, + "loss": 0.7791, + "step": 655 + }, + { + "epoch": 0.5248, + "grad_norm": 0.5603092739646592, + "learning_rate": 9.689001377301633e-05, + "loss": 0.887, + "step": 656 + }, + { + "epoch": 0.5256, + "grad_norm": 0.4076298331910016, + "learning_rate": 9.663094257940258e-05, + "loss": 0.7167, + "step": 657 + }, + { + "epoch": 0.5264, + "grad_norm": 0.48742935888812106, + "learning_rate": 9.637189402194476e-05, + "loss": 0.7307, + "step": 658 + }, + { + "epoch": 0.5272, + "grad_norm": 0.6328343040471255, + "learning_rate": 9.611286984114841e-05, + "loss": 0.9219, + "step": 659 + }, + { + "epoch": 0.528, + "grad_norm": 0.525172619120507, + "learning_rate": 9.585387177735547e-05, + "loss": 0.7884, + "step": 660 + }, + { + "epoch": 0.5288, + "grad_norm": 0.5533911039815641, + "learning_rate": 9.559490157073236e-05, + "loss": 0.7802, + "step": 661 + }, + { + "epoch": 0.5296, + "grad_norm": 0.5248090780104834, + "learning_rate": 9.533596096125825e-05, + "loss": 0.9503, + "step": 662 + }, + { + "epoch": 0.5304, + "grad_norm": 0.4546716340365706, + "learning_rate": 9.507705168871358e-05, + "loss": 0.6962, + "step": 663 + }, + { + "epoch": 0.5312, + "grad_norm": 0.539918735292058, + "learning_rate": 9.481817549266817e-05, + "loss": 0.9195, + "step": 664 + }, + { + "epoch": 0.532, + "grad_norm": 0.461873164842901, + "learning_rate": 9.455933411246958e-05, + "loss": 0.7413, + "step": 665 + }, + { + "epoch": 0.5328, + "grad_norm": 0.5404774421220055, + "learning_rate": 9.430052928723153e-05, + "loss": 0.7735, + "step": 666 + }, + { + "epoch": 0.5336, + "grad_norm": 0.48150403308113476, + "learning_rate": 9.404176275582208e-05, + "loss": 0.776, + "step": 667 + }, + { + "epoch": 0.5344, + "grad_norm": 0.4292803600015022, + "learning_rate": 9.378303625685195e-05, + "loss": 0.7823, + "step": 668 + }, + { + "epoch": 0.5352, + "grad_norm": 0.521600142890234, + "learning_rate": 9.352435152866298e-05, + "loss": 0.7734, + "step": 669 + }, + { + "epoch": 0.536, + "grad_norm": 0.46376561189326265, + "learning_rate": 9.326571030931637e-05, + "loss": 0.8263, + "step": 670 + }, + { + "epoch": 0.5368, + "grad_norm": 0.509629506645239, + "learning_rate": 9.300711433658087e-05, + "loss": 0.8912, + "step": 671 + }, + { + "epoch": 0.5376, + "grad_norm": 0.4393346835557655, + "learning_rate": 9.274856534792138e-05, + "loss": 0.7385, + "step": 672 + }, + { + "epoch": 0.5384, + "grad_norm": 0.40822658272155166, + "learning_rate": 9.249006508048694e-05, + "loss": 0.7319, + "step": 673 + }, + { + "epoch": 0.5392, + "grad_norm": 0.5102282881706691, + "learning_rate": 9.223161527109937e-05, + "loss": 0.7441, + "step": 674 + }, + { + "epoch": 0.54, + "grad_norm": 0.500285468354489, + "learning_rate": 9.197321765624152e-05, + "loss": 0.8649, + "step": 675 + }, + { + "epoch": 0.5408, + "grad_norm": 0.5955534441368605, + "learning_rate": 9.171487397204539e-05, + "loss": 0.7089, + "step": 676 + }, + { + "epoch": 0.5416, + "grad_norm": 0.514751343769818, + "learning_rate": 9.145658595428074e-05, + "loss": 0.7919, + "step": 677 + }, + { + "epoch": 0.5424, + "grad_norm": 0.6202458274255608, + "learning_rate": 9.119835533834331e-05, + "loss": 0.8531, + "step": 678 + }, + { + "epoch": 0.5432, + "grad_norm": 0.5370911720219436, + "learning_rate": 9.09401838592431e-05, + "loss": 0.8021, + "step": 679 + }, + { + "epoch": 0.544, + "grad_norm": 0.4596200244567425, + "learning_rate": 9.068207325159284e-05, + "loss": 0.7639, + "step": 680 + }, + { + "epoch": 0.5448, + "grad_norm": 0.4432096354968599, + "learning_rate": 9.04240252495963e-05, + "loss": 0.7892, + "step": 681 + }, + { + "epoch": 0.5456, + "grad_norm": 0.6913138897781747, + "learning_rate": 9.016604158703654e-05, + "loss": 0.9536, + "step": 682 + }, + { + "epoch": 0.5464, + "grad_norm": 0.5368223534102192, + "learning_rate": 8.990812399726435e-05, + "loss": 0.7887, + "step": 683 + }, + { + "epoch": 0.5472, + "grad_norm": 0.4470921354684901, + "learning_rate": 8.965027421318665e-05, + "loss": 0.6637, + "step": 684 + }, + { + "epoch": 0.548, + "grad_norm": 0.5052879442478138, + "learning_rate": 8.939249396725467e-05, + "loss": 0.7381, + "step": 685 + }, + { + "epoch": 0.5488, + "grad_norm": 0.6471533979972708, + "learning_rate": 8.913478499145254e-05, + "loss": 0.8575, + "step": 686 + }, + { + "epoch": 0.5496, + "grad_norm": 0.46346575110280064, + "learning_rate": 8.887714901728551e-05, + "loss": 0.715, + "step": 687 + }, + { + "epoch": 0.5504, + "grad_norm": 0.40234694118173886, + "learning_rate": 8.861958777576827e-05, + "loss": 0.7428, + "step": 688 + }, + { + "epoch": 0.5512, + "grad_norm": 0.5984049938457736, + "learning_rate": 8.836210299741346e-05, + "loss": 0.9005, + "step": 689 + }, + { + "epoch": 0.552, + "grad_norm": 0.5026897204768275, + "learning_rate": 8.810469641222001e-05, + "loss": 0.7645, + "step": 690 + }, + { + "epoch": 0.5528, + "grad_norm": 0.510315330137784, + "learning_rate": 8.784736974966135e-05, + "loss": 0.7703, + "step": 691 + }, + { + "epoch": 0.5536, + "grad_norm": 0.43250426602522235, + "learning_rate": 8.759012473867407e-05, + "loss": 0.7601, + "step": 692 + }, + { + "epoch": 0.5544, + "grad_norm": 0.4267727818094612, + "learning_rate": 8.733296310764611e-05, + "loss": 0.7808, + "step": 693 + }, + { + "epoch": 0.5552, + "grad_norm": 0.6966319221424616, + "learning_rate": 8.707588658440511e-05, + "loss": 0.9314, + "step": 694 + }, + { + "epoch": 0.556, + "grad_norm": 0.5901768920926003, + "learning_rate": 8.6818896896207e-05, + "loss": 0.8581, + "step": 695 + }, + { + "epoch": 0.5568, + "grad_norm": 0.5357043052840119, + "learning_rate": 8.656199576972423e-05, + "loss": 0.7986, + "step": 696 + }, + { + "epoch": 0.5576, + "grad_norm": 0.48810903994513444, + "learning_rate": 8.63051849310342e-05, + "loss": 0.8064, + "step": 697 + }, + { + "epoch": 0.5584, + "grad_norm": 0.4772440063206294, + "learning_rate": 8.604846610560771e-05, + "loss": 0.7047, + "step": 698 + }, + { + "epoch": 0.5592, + "grad_norm": 0.42291427247242586, + "learning_rate": 8.579184101829734e-05, + "loss": 0.7234, + "step": 699 + }, + { + "epoch": 0.56, + "grad_norm": 0.5168772230534443, + "learning_rate": 8.553531139332582e-05, + "loss": 0.7791, + "step": 700 + }, + { + "epoch": 0.5608, + "grad_norm": 0.5716681716869837, + "learning_rate": 8.527887895427454e-05, + "loss": 0.8252, + "step": 701 + }, + { + "epoch": 0.5616, + "grad_norm": 0.5754784563452197, + "learning_rate": 8.502254542407186e-05, + "loss": 0.8623, + "step": 702 + }, + { + "epoch": 0.5624, + "grad_norm": 0.6061252279312543, + "learning_rate": 8.476631252498162e-05, + "loss": 0.8743, + "step": 703 + }, + { + "epoch": 0.5632, + "grad_norm": 0.5159057110672841, + "learning_rate": 8.451018197859153e-05, + "loss": 0.7723, + "step": 704 + }, + { + "epoch": 0.564, + "grad_norm": 0.6158818624972439, + "learning_rate": 8.425415550580162e-05, + "loss": 0.7882, + "step": 705 + }, + { + "epoch": 0.5648, + "grad_norm": 0.6907244050654523, + "learning_rate": 8.399823482681262e-05, + "loss": 0.8958, + "step": 706 + }, + { + "epoch": 0.5656, + "grad_norm": 0.5451679905966559, + "learning_rate": 8.374242166111448e-05, + "loss": 0.7427, + "step": 707 + }, + { + "epoch": 0.5664, + "grad_norm": 0.5869039308703426, + "learning_rate": 8.348671772747487e-05, + "loss": 0.7315, + "step": 708 + }, + { + "epoch": 0.5672, + "grad_norm": 0.5163326015460028, + "learning_rate": 8.323112474392731e-05, + "loss": 0.8171, + "step": 709 + }, + { + "epoch": 0.568, + "grad_norm": 0.4555044357400919, + "learning_rate": 8.297564442776014e-05, + "loss": 0.7902, + "step": 710 + }, + { + "epoch": 0.5688, + "grad_norm": 0.5179276976344805, + "learning_rate": 8.272027849550457e-05, + "loss": 0.7577, + "step": 711 + }, + { + "epoch": 0.5696, + "grad_norm": 0.5098795094050994, + "learning_rate": 8.246502866292324e-05, + "loss": 0.7619, + "step": 712 + }, + { + "epoch": 0.5704, + "grad_norm": 0.5744928900086157, + "learning_rate": 8.220989664499878e-05, + "loss": 0.8035, + "step": 713 + }, + { + "epoch": 0.5712, + "grad_norm": 0.4339301950666902, + "learning_rate": 8.195488415592238e-05, + "loss": 0.7728, + "step": 714 + }, + { + "epoch": 0.572, + "grad_norm": 0.48392687897486975, + "learning_rate": 8.169999290908188e-05, + "loss": 0.7994, + "step": 715 + }, + { + "epoch": 0.5728, + "grad_norm": 0.5577276559513615, + "learning_rate": 8.144522461705067e-05, + "loss": 0.9411, + "step": 716 + }, + { + "epoch": 0.5736, + "grad_norm": 0.3813800760512216, + "learning_rate": 8.119058099157604e-05, + "loss": 0.7255, + "step": 717 + }, + { + "epoch": 0.5744, + "grad_norm": 0.44710205088526395, + "learning_rate": 8.093606374356759e-05, + "loss": 0.6141, + "step": 718 + }, + { + "epoch": 0.5752, + "grad_norm": 0.5734428231196345, + "learning_rate": 8.068167458308582e-05, + "loss": 0.8319, + "step": 719 + }, + { + "epoch": 0.576, + "grad_norm": 0.471162491579229, + "learning_rate": 8.042741521933071e-05, + "loss": 0.707, + "step": 720 + }, + { + "epoch": 0.5768, + "grad_norm": 0.5268331597214416, + "learning_rate": 8.017328736063006e-05, + "loss": 0.7831, + "step": 721 + }, + { + "epoch": 0.5776, + "grad_norm": 0.5512066366460038, + "learning_rate": 7.991929271442817e-05, + "loss": 0.8724, + "step": 722 + }, + { + "epoch": 0.5784, + "grad_norm": 0.45159331766129185, + "learning_rate": 7.966543298727425e-05, + "loss": 0.7226, + "step": 723 + }, + { + "epoch": 0.5792, + "grad_norm": 0.5628669189861594, + "learning_rate": 7.941170988481108e-05, + "loss": 0.8246, + "step": 724 + }, + { + "epoch": 0.58, + "grad_norm": 0.4489717846993135, + "learning_rate": 7.915812511176347e-05, + "loss": 0.7273, + "step": 725 + }, + { + "epoch": 0.5808, + "grad_norm": 0.5931957639193516, + "learning_rate": 7.89046803719267e-05, + "loss": 1.016, + "step": 726 + }, + { + "epoch": 0.5816, + "grad_norm": 0.6878979745323104, + "learning_rate": 7.865137736815535e-05, + "loss": 0.8393, + "step": 727 + }, + { + "epoch": 0.5824, + "grad_norm": 0.48279145262903267, + "learning_rate": 7.839821780235168e-05, + "loss": 0.7742, + "step": 728 + }, + { + "epoch": 0.5832, + "grad_norm": 0.45784299054923344, + "learning_rate": 7.814520337545406e-05, + "loss": 0.7713, + "step": 729 + }, + { + "epoch": 0.584, + "grad_norm": 0.6114375823080654, + "learning_rate": 7.789233578742582e-05, + "loss": 0.8754, + "step": 730 + }, + { + "epoch": 0.5848, + "grad_norm": 0.5131715129078588, + "learning_rate": 7.763961673724379e-05, + "loss": 0.7944, + "step": 731 + }, + { + "epoch": 0.5856, + "grad_norm": 0.5131994906415137, + "learning_rate": 7.738704792288655e-05, + "loss": 0.797, + "step": 732 + }, + { + "epoch": 0.5864, + "grad_norm": 0.5822445031872177, + "learning_rate": 7.713463104132345e-05, + "loss": 0.8634, + "step": 733 + }, + { + "epoch": 0.5872, + "grad_norm": 0.43485343215141814, + "learning_rate": 7.688236778850306e-05, + "loss": 0.7476, + "step": 734 + }, + { + "epoch": 0.588, + "grad_norm": 0.4613738032755144, + "learning_rate": 7.663025985934158e-05, + "loss": 0.7776, + "step": 735 + }, + { + "epoch": 0.5888, + "grad_norm": 0.48501401501599023, + "learning_rate": 7.637830894771175e-05, + "loss": 0.7265, + "step": 736 + }, + { + "epoch": 0.5896, + "grad_norm": 0.37660273663796184, + "learning_rate": 7.61265167464313e-05, + "loss": 0.6849, + "step": 737 + }, + { + "epoch": 0.5904, + "grad_norm": 0.5386722069111288, + "learning_rate": 7.587488494725157e-05, + "loss": 0.8196, + "step": 738 + }, + { + "epoch": 0.5912, + "grad_norm": 0.4242962757171859, + "learning_rate": 7.562341524084623e-05, + "loss": 0.7498, + "step": 739 + }, + { + "epoch": 0.592, + "grad_norm": 0.5741200059037986, + "learning_rate": 7.537210931679987e-05, + "loss": 0.7797, + "step": 740 + }, + { + "epoch": 0.5928, + "grad_norm": 0.5204073806313823, + "learning_rate": 7.512096886359664e-05, + "loss": 0.8835, + "step": 741 + }, + { + "epoch": 0.5936, + "grad_norm": 0.5076601851050411, + "learning_rate": 7.48699955686089e-05, + "loss": 0.8621, + "step": 742 + }, + { + "epoch": 0.5944, + "grad_norm": 0.5179380741278033, + "learning_rate": 7.461919111808595e-05, + "loss": 0.7959, + "step": 743 + }, + { + "epoch": 0.5952, + "grad_norm": 0.5888857086694583, + "learning_rate": 7.43685571971426e-05, + "loss": 0.8564, + "step": 744 + }, + { + "epoch": 0.596, + "grad_norm": 0.5008378451925248, + "learning_rate": 7.411809548974792e-05, + "loss": 0.7997, + "step": 745 + }, + { + "epoch": 0.5968, + "grad_norm": 0.44589256233006075, + "learning_rate": 7.386780767871397e-05, + "loss": 0.7606, + "step": 746 + }, + { + "epoch": 0.5976, + "grad_norm": 0.4116672820082607, + "learning_rate": 7.361769544568425e-05, + "loss": 0.6629, + "step": 747 + }, + { + "epoch": 0.5984, + "grad_norm": 0.515604754170254, + "learning_rate": 7.336776047112276e-05, + "loss": 0.6872, + "step": 748 + }, + { + "epoch": 0.5992, + "grad_norm": 0.449656904421517, + "learning_rate": 7.311800443430251e-05, + "loss": 0.7218, + "step": 749 + }, + { + "epoch": 0.6, + "grad_norm": 0.4423761588323261, + "learning_rate": 7.286842901329412e-05, + "loss": 0.7004, + "step": 750 + }, + { + "epoch": 0.6008, + "grad_norm": 0.43304351381495504, + "learning_rate": 7.26190358849548e-05, + "loss": 0.7544, + "step": 751 + }, + { + "epoch": 0.6016, + "grad_norm": 0.388949384344476, + "learning_rate": 7.236982672491698e-05, + "loss": 0.7087, + "step": 752 + }, + { + "epoch": 0.6024, + "grad_norm": 0.45049895544234086, + "learning_rate": 7.212080320757695e-05, + "loss": 0.7291, + "step": 753 + }, + { + "epoch": 0.6032, + "grad_norm": 0.7366164282084258, + "learning_rate": 7.187196700608373e-05, + "loss": 0.9146, + "step": 754 + }, + { + "epoch": 0.604, + "grad_norm": 0.46491414551503013, + "learning_rate": 7.162331979232783e-05, + "loss": 0.7666, + "step": 755 + }, + { + "epoch": 0.6048, + "grad_norm": 0.42867949944378453, + "learning_rate": 7.137486323692995e-05, + "loss": 0.7032, + "step": 756 + }, + { + "epoch": 0.6056, + "grad_norm": 0.5147435436229524, + "learning_rate": 7.112659900922976e-05, + "loss": 0.8262, + "step": 757 + }, + { + "epoch": 0.6064, + "grad_norm": 0.5367661938820013, + "learning_rate": 7.087852877727481e-05, + "loss": 0.7697, + "step": 758 + }, + { + "epoch": 0.6072, + "grad_norm": 0.5824157256969407, + "learning_rate": 7.06306542078091e-05, + "loss": 0.7554, + "step": 759 + }, + { + "epoch": 0.608, + "grad_norm": 0.38534545354199295, + "learning_rate": 7.038297696626206e-05, + "loss": 0.6849, + "step": 760 + }, + { + "epoch": 0.6088, + "grad_norm": 0.5284622775081942, + "learning_rate": 7.013549871673736e-05, + "loss": 0.8728, + "step": 761 + }, + { + "epoch": 0.6096, + "grad_norm": 0.480831013120713, + "learning_rate": 6.988822112200156e-05, + "loss": 0.7724, + "step": 762 + }, + { + "epoch": 0.6104, + "grad_norm": 0.4294159155043536, + "learning_rate": 6.964114584347316e-05, + "loss": 0.6737, + "step": 763 + }, + { + "epoch": 0.6112, + "grad_norm": 0.5485800019065085, + "learning_rate": 6.939427454121128e-05, + "loss": 0.6973, + "step": 764 + }, + { + "epoch": 0.612, + "grad_norm": 0.5378477773759867, + "learning_rate": 6.914760887390452e-05, + "loss": 0.8079, + "step": 765 + }, + { + "epoch": 0.6128, + "grad_norm": 0.6023163155494816, + "learning_rate": 6.890115049885994e-05, + "loss": 0.8248, + "step": 766 + }, + { + "epoch": 0.6136, + "grad_norm": 0.5365760352930691, + "learning_rate": 6.865490107199181e-05, + "loss": 0.7639, + "step": 767 + }, + { + "epoch": 0.6144, + "grad_norm": 0.4530385017080909, + "learning_rate": 6.84088622478104e-05, + "loss": 0.7566, + "step": 768 + }, + { + "epoch": 0.6152, + "grad_norm": 0.5552262914060541, + "learning_rate": 6.816303567941112e-05, + "loss": 0.7683, + "step": 769 + }, + { + "epoch": 0.616, + "grad_norm": 0.5542391995409183, + "learning_rate": 6.791742301846326e-05, + "loss": 0.9047, + "step": 770 + }, + { + "epoch": 0.6168, + "grad_norm": 0.49364800684924487, + "learning_rate": 6.767202591519875e-05, + "loss": 0.6982, + "step": 771 + }, + { + "epoch": 0.6176, + "grad_norm": 0.474981388028254, + "learning_rate": 6.742684601840141e-05, + "loss": 0.7669, + "step": 772 + }, + { + "epoch": 0.6184, + "grad_norm": 0.4926565804261099, + "learning_rate": 6.718188497539554e-05, + "loss": 0.7552, + "step": 773 + }, + { + "epoch": 0.6192, + "grad_norm": 0.6103439077637114, + "learning_rate": 6.693714443203507e-05, + "loss": 0.805, + "step": 774 + }, + { + "epoch": 0.62, + "grad_norm": 0.7709192308181996, + "learning_rate": 6.669262603269246e-05, + "loss": 0.915, + "step": 775 + }, + { + "epoch": 0.6208, + "grad_norm": 0.6386787972460591, + "learning_rate": 6.644833142024751e-05, + "loss": 0.7317, + "step": 776 + }, + { + "epoch": 0.6216, + "grad_norm": 0.5065628984406876, + "learning_rate": 6.620426223607654e-05, + "loss": 0.7835, + "step": 777 + }, + { + "epoch": 0.6224, + "grad_norm": 0.5436850226643325, + "learning_rate": 6.59604201200412e-05, + "loss": 0.7782, + "step": 778 + }, + { + "epoch": 0.6232, + "grad_norm": 0.5753151796667159, + "learning_rate": 6.571680671047749e-05, + "loss": 0.7177, + "step": 779 + }, + { + "epoch": 0.624, + "grad_norm": 0.5575765980076977, + "learning_rate": 6.547342364418481e-05, + "loss": 0.8319, + "step": 780 + }, + { + "epoch": 0.6248, + "grad_norm": 0.47209779716940015, + "learning_rate": 6.523027255641493e-05, + "loss": 0.8046, + "step": 781 + }, + { + "epoch": 0.6256, + "grad_norm": 0.4895384765337746, + "learning_rate": 6.498735508086093e-05, + "loss": 0.7902, + "step": 782 + }, + { + "epoch": 0.6264, + "grad_norm": 0.454757882406579, + "learning_rate": 6.474467284964634e-05, + "loss": 0.7323, + "step": 783 + }, + { + "epoch": 0.6272, + "grad_norm": 0.5787640937084728, + "learning_rate": 6.450222749331414e-05, + "loss": 0.8592, + "step": 784 + }, + { + "epoch": 0.628, + "grad_norm": 0.43575556952847144, + "learning_rate": 6.426002064081565e-05, + "loss": 0.6981, + "step": 785 + }, + { + "epoch": 0.6288, + "grad_norm": 0.5599270657727717, + "learning_rate": 6.40180539194999e-05, + "loss": 0.8224, + "step": 786 + }, + { + "epoch": 0.6296, + "grad_norm": 0.3927164317907513, + "learning_rate": 6.377632895510248e-05, + "loss": 0.7449, + "step": 787 + }, + { + "epoch": 0.6304, + "grad_norm": 0.5876453267453996, + "learning_rate": 6.35348473717345e-05, + "loss": 0.7211, + "step": 788 + }, + { + "epoch": 0.6312, + "grad_norm": 0.5284880195456529, + "learning_rate": 6.329361079187199e-05, + "loss": 0.7267, + "step": 789 + }, + { + "epoch": 0.632, + "grad_norm": 0.4344724546277582, + "learning_rate": 6.305262083634488e-05, + "loss": 0.7286, + "step": 790 + }, + { + "epoch": 0.6328, + "grad_norm": 0.47331754558605854, + "learning_rate": 6.281187912432587e-05, + "loss": 0.7227, + "step": 791 + }, + { + "epoch": 0.6336, + "grad_norm": 0.42289814294337735, + "learning_rate": 6.25713872733199e-05, + "loss": 0.7804, + "step": 792 + }, + { + "epoch": 0.6344, + "grad_norm": 0.4597744885394716, + "learning_rate": 6.233114689915316e-05, + "loss": 0.7347, + "step": 793 + }, + { + "epoch": 0.6352, + "grad_norm": 0.6886867743546877, + "learning_rate": 6.209115961596208e-05, + "loss": 0.8761, + "step": 794 + }, + { + "epoch": 0.636, + "grad_norm": 0.5704387953894104, + "learning_rate": 6.18514270361827e-05, + "loss": 0.7524, + "step": 795 + }, + { + "epoch": 0.6368, + "grad_norm": 0.7980508446517586, + "learning_rate": 6.161195077053976e-05, + "loss": 0.8903, + "step": 796 + }, + { + "epoch": 0.6376, + "grad_norm": 0.6561748021138134, + "learning_rate": 6.13727324280358e-05, + "loss": 0.906, + "step": 797 + }, + { + "epoch": 0.6384, + "grad_norm": 0.3997245264461394, + "learning_rate": 6.113377361594049e-05, + "loss": 0.7133, + "step": 798 + }, + { + "epoch": 0.6392, + "grad_norm": 0.49172147669387856, + "learning_rate": 6.08950759397797e-05, + "loss": 0.7171, + "step": 799 + }, + { + "epoch": 0.64, + "grad_norm": 0.47494727743860726, + "learning_rate": 6.065664100332478e-05, + "loss": 0.6789, + "step": 800 + }, + { + "epoch": 0.6408, + "grad_norm": 0.509205059753719, + "learning_rate": 6.0418470408581774e-05, + "loss": 0.7476, + "step": 801 + }, + { + "epoch": 0.6416, + "grad_norm": 0.44098130040514394, + "learning_rate": 6.018056575578075e-05, + "loss": 0.8466, + "step": 802 + }, + { + "epoch": 0.6424, + "grad_norm": 0.4554178807918066, + "learning_rate": 5.9942928643364724e-05, + "loss": 0.8006, + "step": 803 + }, + { + "epoch": 0.6432, + "grad_norm": 0.5317635887247807, + "learning_rate": 5.970556066797941e-05, + "loss": 0.9353, + "step": 804 + }, + { + "epoch": 0.644, + "grad_norm": 0.37059723794483423, + "learning_rate": 5.946846342446214e-05, + "loss": 0.6507, + "step": 805 + }, + { + "epoch": 0.6448, + "grad_norm": 0.4702183008942206, + "learning_rate": 5.923163850583113e-05, + "loss": 0.792, + "step": 806 + }, + { + "epoch": 0.6456, + "grad_norm": 0.4551255320821976, + "learning_rate": 5.899508750327501e-05, + "loss": 0.7764, + "step": 807 + }, + { + "epoch": 0.6464, + "grad_norm": 0.6339311550272567, + "learning_rate": 5.875881200614207e-05, + "loss": 0.8613, + "step": 808 + }, + { + "epoch": 0.6472, + "grad_norm": 0.4761527444118468, + "learning_rate": 5.8522813601929324e-05, + "loss": 0.6806, + "step": 809 + }, + { + "epoch": 0.648, + "grad_norm": 0.4615038937896931, + "learning_rate": 5.828709387627218e-05, + "loss": 0.7258, + "step": 810 + }, + { + "epoch": 0.6488, + "grad_norm": 0.4606585651526168, + "learning_rate": 5.80516544129337e-05, + "loss": 0.7436, + "step": 811 + }, + { + "epoch": 0.6496, + "grad_norm": 0.4253710733831671, + "learning_rate": 5.781649679379378e-05, + "loss": 0.6899, + "step": 812 + }, + { + "epoch": 0.6504, + "grad_norm": 0.48024005565934813, + "learning_rate": 5.758162259883867e-05, + "loss": 0.718, + "step": 813 + }, + { + "epoch": 0.6512, + "grad_norm": 0.5152988534627136, + "learning_rate": 5.73470334061505e-05, + "loss": 0.889, + "step": 814 + }, + { + "epoch": 0.652, + "grad_norm": 0.5324023326915394, + "learning_rate": 5.7112730791896207e-05, + "loss": 0.7591, + "step": 815 + }, + { + "epoch": 0.6528, + "grad_norm": 0.44548172271198205, + "learning_rate": 5.687871633031754e-05, + "loss": 0.7246, + "step": 816 + }, + { + "epoch": 0.6536, + "grad_norm": 0.453129656700259, + "learning_rate": 5.664499159372017e-05, + "loss": 0.6994, + "step": 817 + }, + { + "epoch": 0.6544, + "grad_norm": 0.6138800607670666, + "learning_rate": 5.6411558152462894e-05, + "loss": 0.8983, + "step": 818 + }, + { + "epoch": 0.6552, + "grad_norm": 0.5924138427272913, + "learning_rate": 5.617841757494762e-05, + "loss": 0.7872, + "step": 819 + }, + { + "epoch": 0.656, + "grad_norm": 0.41836352920609127, + "learning_rate": 5.5945571427608526e-05, + "loss": 0.7172, + "step": 820 + }, + { + "epoch": 0.6568, + "grad_norm": 0.5060582567527775, + "learning_rate": 5.5713021274901335e-05, + "loss": 0.7465, + "step": 821 + }, + { + "epoch": 0.6576, + "grad_norm": 0.5100761142075639, + "learning_rate": 5.54807686792933e-05, + "loss": 0.7791, + "step": 822 + }, + { + "epoch": 0.6584, + "grad_norm": 0.4748718110304019, + "learning_rate": 5.524881520125229e-05, + "loss": 0.7087, + "step": 823 + }, + { + "epoch": 0.6592, + "grad_norm": 0.4781754943952788, + "learning_rate": 5.501716239923642e-05, + "loss": 0.7846, + "step": 824 + }, + { + "epoch": 0.66, + "grad_norm": 0.5116215246876256, + "learning_rate": 5.4785811829683764e-05, + "loss": 0.9123, + "step": 825 + }, + { + "epoch": 0.6608, + "grad_norm": 0.5653042070555684, + "learning_rate": 5.4554765047001613e-05, + "loss": 0.8254, + "step": 826 + }, + { + "epoch": 0.6616, + "grad_norm": 0.4099161045841414, + "learning_rate": 5.432402360355615e-05, + "loss": 0.77, + "step": 827 + }, + { + "epoch": 0.6624, + "grad_norm": 0.5907690848546384, + "learning_rate": 5.4093589049662175e-05, + "loss": 0.8821, + "step": 828 + }, + { + "epoch": 0.6632, + "grad_norm": 0.38933980873615237, + "learning_rate": 5.386346293357242e-05, + "loss": 0.6871, + "step": 829 + }, + { + "epoch": 0.664, + "grad_norm": 0.445542876932296, + "learning_rate": 5.363364680146725e-05, + "loss": 0.7841, + "step": 830 + }, + { + "epoch": 0.6648, + "grad_norm": 0.6335858845221375, + "learning_rate": 5.3404142197444506e-05, + "loss": 0.997, + "step": 831 + }, + { + "epoch": 0.6656, + "grad_norm": 0.4803421356568687, + "learning_rate": 5.31749506635086e-05, + "loss": 0.7519, + "step": 832 + }, + { + "epoch": 0.6664, + "grad_norm": 0.6064940252059536, + "learning_rate": 5.2946073739560706e-05, + "loss": 0.8153, + "step": 833 + }, + { + "epoch": 0.6672, + "grad_norm": 0.4128777697333255, + "learning_rate": 5.271751296338823e-05, + "loss": 0.6295, + "step": 834 + }, + { + "epoch": 0.668, + "grad_norm": 0.4591857654670067, + "learning_rate": 5.248926987065417e-05, + "loss": 0.7009, + "step": 835 + }, + { + "epoch": 0.6688, + "grad_norm": 0.5819461821891235, + "learning_rate": 5.226134599488728e-05, + "loss": 0.7929, + "step": 836 + }, + { + "epoch": 0.6696, + "grad_norm": 0.4556561799817687, + "learning_rate": 5.203374286747158e-05, + "loss": 0.7401, + "step": 837 + }, + { + "epoch": 0.6704, + "grad_norm": 0.511193692242379, + "learning_rate": 5.180646201763577e-05, + "loss": 0.7604, + "step": 838 + }, + { + "epoch": 0.6712, + "grad_norm": 0.5125207401474582, + "learning_rate": 5.15795049724435e-05, + "loss": 0.7768, + "step": 839 + }, + { + "epoch": 0.672, + "grad_norm": 0.43482903797591776, + "learning_rate": 5.135287325678271e-05, + "loss": 0.73, + "step": 840 + }, + { + "epoch": 0.6728, + "grad_norm": 0.40158455350863376, + "learning_rate": 5.112656839335543e-05, + "loss": 0.6212, + "step": 841 + }, + { + "epoch": 0.6736, + "grad_norm": 0.4794802313794276, + "learning_rate": 5.090059190266779e-05, + "loss": 0.6973, + "step": 842 + }, + { + "epoch": 0.6744, + "grad_norm": 0.6134753346845444, + "learning_rate": 5.0674945303019526e-05, + "loss": 0.7147, + "step": 843 + }, + { + "epoch": 0.6752, + "grad_norm": 0.4247728899241009, + "learning_rate": 5.0449630110493836e-05, + "loss": 0.7204, + "step": 844 + }, + { + "epoch": 0.676, + "grad_norm": 0.4746055926663477, + "learning_rate": 5.022464783894744e-05, + "loss": 0.8316, + "step": 845 + }, + { + "epoch": 0.6768, + "grad_norm": 0.4563585858381452, + "learning_rate": 5.000000000000002e-05, + "loss": 0.7573, + "step": 846 + }, + { + "epoch": 0.6776, + "grad_norm": 0.6514548402961644, + "learning_rate": 4.977568810302432e-05, + "loss": 0.8697, + "step": 847 + }, + { + "epoch": 0.6784, + "grad_norm": 0.4395184935238493, + "learning_rate": 4.955171365513603e-05, + "loss": 0.7215, + "step": 848 + }, + { + "epoch": 0.6792, + "grad_norm": 0.4252562174952073, + "learning_rate": 4.9328078161183464e-05, + "loss": 0.7515, + "step": 849 + }, + { + "epoch": 0.68, + "grad_norm": 0.4227675902507199, + "learning_rate": 4.9104783123737566e-05, + "loss": 0.7169, + "step": 850 + }, + { + "epoch": 0.6808, + "grad_norm": 0.4751010569172859, + "learning_rate": 4.88818300430819e-05, + "loss": 0.6626, + "step": 851 + }, + { + "epoch": 0.6816, + "grad_norm": 0.44882287012285954, + "learning_rate": 4.865922041720239e-05, + "loss": 0.7743, + "step": 852 + }, + { + "epoch": 0.6824, + "grad_norm": 0.56196232380045, + "learning_rate": 4.843695574177737e-05, + "loss": 0.7914, + "step": 853 + }, + { + "epoch": 0.6832, + "grad_norm": 0.5483852356476023, + "learning_rate": 4.821503751016746e-05, + "loss": 0.7902, + "step": 854 + }, + { + "epoch": 0.684, + "grad_norm": 0.6439704028827931, + "learning_rate": 4.7993467213405706e-05, + "loss": 0.899, + "step": 855 + }, + { + "epoch": 0.6848, + "grad_norm": 0.5663217598956615, + "learning_rate": 4.777224634018732e-05, + "loss": 0.7919, + "step": 856 + }, + { + "epoch": 0.6856, + "grad_norm": 0.49501494595816853, + "learning_rate": 4.755137637685979e-05, + "loss": 0.7635, + "step": 857 + }, + { + "epoch": 0.6864, + "grad_norm": 0.5930865783379573, + "learning_rate": 4.733085880741301e-05, + "loss": 0.8109, + "step": 858 + }, + { + "epoch": 0.6872, + "grad_norm": 0.5111538383222523, + "learning_rate": 4.7110695113469085e-05, + "loss": 0.8551, + "step": 859 + }, + { + "epoch": 0.688, + "grad_norm": 0.4724704337446228, + "learning_rate": 4.689088677427249e-05, + "loss": 0.7325, + "step": 860 + }, + { + "epoch": 0.6888, + "grad_norm": 0.4676027869294293, + "learning_rate": 4.6671435266680216e-05, + "loss": 0.7118, + "step": 861 + }, + { + "epoch": 0.6896, + "grad_norm": 0.5366310858073888, + "learning_rate": 4.645234206515171e-05, + "loss": 0.9235, + "step": 862 + }, + { + "epoch": 0.6904, + "grad_norm": 0.45322326200589436, + "learning_rate": 4.623360864173893e-05, + "loss": 0.6605, + "step": 863 + }, + { + "epoch": 0.6912, + "grad_norm": 0.5001990346277503, + "learning_rate": 4.6015236466076747e-05, + "loss": 0.7978, + "step": 864 + }, + { + "epoch": 0.692, + "grad_norm": 0.40182637606093063, + "learning_rate": 4.579722700537268e-05, + "loss": 0.6705, + "step": 865 + }, + { + "epoch": 0.6928, + "grad_norm": 0.46906673716168085, + "learning_rate": 4.5579581724397255e-05, + "loss": 0.7007, + "step": 866 + }, + { + "epoch": 0.6936, + "grad_norm": 0.5362371092900002, + "learning_rate": 4.5362302085474254e-05, + "loss": 0.7112, + "step": 867 + }, + { + "epoch": 0.6944, + "grad_norm": 0.5078116208393267, + "learning_rate": 4.514538954847064e-05, + "loss": 0.795, + "step": 868 + }, + { + "epoch": 0.6952, + "grad_norm": 0.4556106823335202, + "learning_rate": 4.492884557078688e-05, + "loss": 0.6837, + "step": 869 + }, + { + "epoch": 0.696, + "grad_norm": 0.6318423018303347, + "learning_rate": 4.471267160734731e-05, + "loss": 0.8347, + "step": 870 + }, + { + "epoch": 0.6968, + "grad_norm": 0.6495688410104123, + "learning_rate": 4.449686911058992e-05, + "loss": 0.8639, + "step": 871 + }, + { + "epoch": 0.6976, + "grad_norm": 0.46876871290667643, + "learning_rate": 4.428143953045717e-05, + "loss": 0.7366, + "step": 872 + }, + { + "epoch": 0.6984, + "grad_norm": 0.4011594116666654, + "learning_rate": 4.406638431438576e-05, + "loss": 0.7196, + "step": 873 + }, + { + "epoch": 0.6992, + "grad_norm": 0.49479019610079433, + "learning_rate": 4.385170490729712e-05, + "loss": 0.6864, + "step": 874 + }, + { + "epoch": 0.7, + "grad_norm": 0.4815943792182797, + "learning_rate": 4.36374027515878e-05, + "loss": 0.775, + "step": 875 + }, + { + "epoch": 0.7008, + "grad_norm": 0.4579322955767885, + "learning_rate": 4.342347928711953e-05, + "loss": 0.7325, + "step": 876 + }, + { + "epoch": 0.7016, + "grad_norm": 0.4991315092613472, + "learning_rate": 4.320993595120969e-05, + "loss": 0.7522, + "step": 877 + }, + { + "epoch": 0.7024, + "grad_norm": 0.48310010668510406, + "learning_rate": 4.2996774178621736e-05, + "loss": 0.7525, + "step": 878 + }, + { + "epoch": 0.7032, + "grad_norm": 0.5308408656927481, + "learning_rate": 4.278399540155536e-05, + "loss": 0.6725, + "step": 879 + }, + { + "epoch": 0.704, + "grad_norm": 0.4659049002583655, + "learning_rate": 4.257160104963696e-05, + "loss": 0.7696, + "step": 880 + }, + { + "epoch": 0.7048, + "grad_norm": 0.4987725525086302, + "learning_rate": 4.2359592549910145e-05, + "loss": 0.6749, + "step": 881 + }, + { + "epoch": 0.7056, + "grad_norm": 0.4604305217685288, + "learning_rate": 4.2147971326825966e-05, + "loss": 0.6969, + "step": 882 + }, + { + "epoch": 0.7064, + "grad_norm": 0.3745632467390563, + "learning_rate": 4.193673880223339e-05, + "loss": 0.687, + "step": 883 + }, + { + "epoch": 0.7072, + "grad_norm": 0.4212724462363019, + "learning_rate": 4.172589639536991e-05, + "loss": 0.7271, + "step": 884 + }, + { + "epoch": 0.708, + "grad_norm": 0.5905873954325088, + "learning_rate": 4.1515445522851784e-05, + "loss": 0.7435, + "step": 885 + }, + { + "epoch": 0.7088, + "grad_norm": 0.43581393035533844, + "learning_rate": 4.130538759866457e-05, + "loss": 0.8126, + "step": 886 + }, + { + "epoch": 0.7096, + "grad_norm": 0.43467533201659003, + "learning_rate": 4.109572403415386e-05, + "loss": 0.6746, + "step": 887 + }, + { + "epoch": 0.7104, + "grad_norm": 0.4294076965433029, + "learning_rate": 4.088645623801534e-05, + "loss": 0.7496, + "step": 888 + }, + { + "epoch": 0.7112, + "grad_norm": 0.8322364025954283, + "learning_rate": 4.0677585616285774e-05, + "loss": 0.898, + "step": 889 + }, + { + "epoch": 0.712, + "grad_norm": 0.4999601809894697, + "learning_rate": 4.046911357233343e-05, + "loss": 0.7144, + "step": 890 + }, + { + "epoch": 0.7128, + "grad_norm": 0.49987777257007493, + "learning_rate": 4.026104150684835e-05, + "loss": 0.7348, + "step": 891 + }, + { + "epoch": 0.7136, + "grad_norm": 0.5329345377491767, + "learning_rate": 4.00533708178334e-05, + "loss": 0.7592, + "step": 892 + }, + { + "epoch": 0.7144, + "grad_norm": 0.38317445403290057, + "learning_rate": 3.984610290059467e-05, + "loss": 0.6683, + "step": 893 + }, + { + "epoch": 0.7152, + "grad_norm": 0.5080676931856752, + "learning_rate": 3.963923914773187e-05, + "loss": 0.7267, + "step": 894 + }, + { + "epoch": 0.716, + "grad_norm": 0.47221572673796364, + "learning_rate": 3.943278094912946e-05, + "loss": 0.7896, + "step": 895 + }, + { + "epoch": 0.7168, + "grad_norm": 0.4694812272910821, + "learning_rate": 3.922672969194686e-05, + "loss": 0.7261, + "step": 896 + }, + { + "epoch": 0.7176, + "grad_norm": 0.4662327987492834, + "learning_rate": 3.902108676060937e-05, + "loss": 0.7037, + "step": 897 + }, + { + "epoch": 0.7184, + "grad_norm": 0.5021252185446033, + "learning_rate": 3.8815853536798904e-05, + "loss": 0.8143, + "step": 898 + }, + { + "epoch": 0.7192, + "grad_norm": 0.5225033992837377, + "learning_rate": 3.861103139944449e-05, + "loss": 0.7233, + "step": 899 + }, + { + "epoch": 0.72, + "grad_norm": 0.4105701621753054, + "learning_rate": 3.840662172471315e-05, + "loss": 0.7242, + "step": 900 + }, + { + "epoch": 0.7208, + "grad_norm": 0.5306223649945033, + "learning_rate": 3.820262588600074e-05, + "loss": 0.8186, + "step": 901 + }, + { + "epoch": 0.7216, + "grad_norm": 0.4894162749572337, + "learning_rate": 3.79990452539225e-05, + "loss": 0.7235, + "step": 902 + }, + { + "epoch": 0.7224, + "grad_norm": 0.46277641935711217, + "learning_rate": 3.7795881196303995e-05, + "loss": 0.7005, + "step": 903 + }, + { + "epoch": 0.7232, + "grad_norm": 0.6991867546461775, + "learning_rate": 3.759313507817196e-05, + "loss": 1.0024, + "step": 904 + }, + { + "epoch": 0.724, + "grad_norm": 0.49594568279315143, + "learning_rate": 3.739080826174498e-05, + "loss": 0.712, + "step": 905 + }, + { + "epoch": 0.7248, + "grad_norm": 0.46694755255082615, + "learning_rate": 3.7188902106424416e-05, + "loss": 0.782, + "step": 906 + }, + { + "epoch": 0.7256, + "grad_norm": 0.5100406197855876, + "learning_rate": 3.6987417968785366e-05, + "loss": 0.8262, + "step": 907 + }, + { + "epoch": 0.7264, + "grad_norm": 0.5020707616239913, + "learning_rate": 3.678635720256737e-05, + "loss": 0.7464, + "step": 908 + }, + { + "epoch": 0.7272, + "grad_norm": 0.4353773917613402, + "learning_rate": 3.658572115866541e-05, + "loss": 0.7117, + "step": 909 + }, + { + "epoch": 0.728, + "grad_norm": 0.4619046343314709, + "learning_rate": 3.638551118512089e-05, + "loss": 0.7834, + "step": 910 + }, + { + "epoch": 0.7288, + "grad_norm": 0.43278550793305576, + "learning_rate": 3.618572862711247e-05, + "loss": 0.6557, + "step": 911 + }, + { + "epoch": 0.7296, + "grad_norm": 0.48220896332287105, + "learning_rate": 3.5986374826947066e-05, + "loss": 0.7391, + "step": 912 + }, + { + "epoch": 0.7304, + "grad_norm": 0.5493168603240508, + "learning_rate": 3.578745112405083e-05, + "loss": 0.6951, + "step": 913 + }, + { + "epoch": 0.7312, + "grad_norm": 0.45094635972687336, + "learning_rate": 3.558895885496023e-05, + "loss": 0.816, + "step": 914 + }, + { + "epoch": 0.732, + "grad_norm": 0.4891259920727296, + "learning_rate": 3.539089935331294e-05, + "loss": 0.7715, + "step": 915 + }, + { + "epoch": 0.7328, + "grad_norm": 0.4375019371148813, + "learning_rate": 3.519327394983888e-05, + "loss": 0.7016, + "step": 916 + }, + { + "epoch": 0.7336, + "grad_norm": 0.41909776259069126, + "learning_rate": 3.4996083972351515e-05, + "loss": 0.7253, + "step": 917 + }, + { + "epoch": 0.7344, + "grad_norm": 0.4234543390903855, + "learning_rate": 3.479933074573858e-05, + "loss": 0.665, + "step": 918 + }, + { + "epoch": 0.7352, + "grad_norm": 0.46238881696974216, + "learning_rate": 3.4603015591953395e-05, + "loss": 0.7774, + "step": 919 + }, + { + "epoch": 0.736, + "grad_norm": 0.5023095345955012, + "learning_rate": 3.440713983000601e-05, + "loss": 0.7471, + "step": 920 + }, + { + "epoch": 0.7368, + "grad_norm": 0.42649563909059657, + "learning_rate": 3.421170477595419e-05, + "loss": 0.6287, + "step": 921 + }, + { + "epoch": 0.7376, + "grad_norm": 0.4622928542748576, + "learning_rate": 3.401671174289469e-05, + "loss": 0.7827, + "step": 922 + }, + { + "epoch": 0.7384, + "grad_norm": 0.5917202929644055, + "learning_rate": 3.3822162040954354e-05, + "loss": 0.7612, + "step": 923 + }, + { + "epoch": 0.7392, + "grad_norm": 0.4715229666077649, + "learning_rate": 3.362805697728145e-05, + "loss": 0.7823, + "step": 924 + }, + { + "epoch": 0.74, + "grad_norm": 0.4382219635122664, + "learning_rate": 3.34343978560367e-05, + "loss": 0.734, + "step": 925 + }, + { + "epoch": 0.7408, + "grad_norm": 0.5137357092717723, + "learning_rate": 3.324118597838464e-05, + "loss": 0.7479, + "step": 926 + }, + { + "epoch": 0.7416, + "grad_norm": 0.5566119421838533, + "learning_rate": 3.3048422642484886e-05, + "loss": 0.8414, + "step": 927 + }, + { + "epoch": 0.7424, + "grad_norm": 0.48515834899726845, + "learning_rate": 3.285610914348332e-05, + "loss": 0.7669, + "step": 928 + }, + { + "epoch": 0.7432, + "grad_norm": 0.5399448894800771, + "learning_rate": 3.266424677350346e-05, + "loss": 0.7363, + "step": 929 + }, + { + "epoch": 0.744, + "grad_norm": 0.4218218798399454, + "learning_rate": 3.2472836821637744e-05, + "loss": 0.6361, + "step": 930 + }, + { + "epoch": 0.7448, + "grad_norm": 0.4702185424668213, + "learning_rate": 3.228188057393895e-05, + "loss": 0.8182, + "step": 931 + }, + { + "epoch": 0.7456, + "grad_norm": 0.6263706758365083, + "learning_rate": 3.209137931341143e-05, + "loss": 0.8523, + "step": 932 + }, + { + "epoch": 0.7464, + "grad_norm": 0.6424858002914683, + "learning_rate": 3.190133432000252e-05, + "loss": 0.8708, + "step": 933 + }, + { + "epoch": 0.7472, + "grad_norm": 0.5013551193313562, + "learning_rate": 3.1711746870594086e-05, + "loss": 0.8237, + "step": 934 + }, + { + "epoch": 0.748, + "grad_norm": 0.592697303217561, + "learning_rate": 3.1522618238993725e-05, + "loss": 0.8038, + "step": 935 + }, + { + "epoch": 0.7488, + "grad_norm": 0.5095286786464307, + "learning_rate": 3.1333949695926324e-05, + "loss": 0.7275, + "step": 936 + }, + { + "epoch": 0.7496, + "grad_norm": 0.41610246174986487, + "learning_rate": 3.114574250902558e-05, + "loss": 0.7124, + "step": 937 + }, + { + "epoch": 0.7504, + "grad_norm": 0.525044257671557, + "learning_rate": 3.0957997942825336e-05, + "loss": 0.681, + "step": 938 + }, + { + "epoch": 0.7512, + "grad_norm": 0.5187728408748397, + "learning_rate": 3.077071725875116e-05, + "loss": 0.7496, + "step": 939 + }, + { + "epoch": 0.752, + "grad_norm": 0.5954039487911745, + "learning_rate": 3.058390171511196e-05, + "loss": 0.7171, + "step": 940 + }, + { + "epoch": 0.7528, + "grad_norm": 0.45051436916443294, + "learning_rate": 3.0397552567091337e-05, + "loss": 0.7073, + "step": 941 + }, + { + "epoch": 0.7536, + "grad_norm": 0.5648027876929638, + "learning_rate": 3.021167106673928e-05, + "loss": 0.7821, + "step": 942 + }, + { + "epoch": 0.7544, + "grad_norm": 0.47074310904855954, + "learning_rate": 3.0026258462963787e-05, + "loss": 0.7567, + "step": 943 + }, + { + "epoch": 0.7552, + "grad_norm": 0.36734367946250823, + "learning_rate": 2.9841316001522347e-05, + "loss": 0.6361, + "step": 944 + }, + { + "epoch": 0.756, + "grad_norm": 0.6807230093209986, + "learning_rate": 2.9656844925013637e-05, + "loss": 0.9965, + "step": 945 + }, + { + "epoch": 0.7568, + "grad_norm": 0.4230860966116503, + "learning_rate": 2.9472846472869298e-05, + "loss": 0.7158, + "step": 946 + }, + { + "epoch": 0.7576, + "grad_norm": 0.4171111061257004, + "learning_rate": 2.9289321881345254e-05, + "loss": 0.6374, + "step": 947 + }, + { + "epoch": 0.7584, + "grad_norm": 0.48253610831141586, + "learning_rate": 2.9106272383513835e-05, + "loss": 0.8264, + "step": 948 + }, + { + "epoch": 0.7592, + "grad_norm": 0.4123366523890242, + "learning_rate": 2.8923699209255284e-05, + "loss": 0.6718, + "step": 949 + }, + { + "epoch": 0.76, + "grad_norm": 0.49828993487937395, + "learning_rate": 2.874160358524931e-05, + "loss": 0.7665, + "step": 950 + }, + { + "epoch": 0.7608, + "grad_norm": 0.45664615319919927, + "learning_rate": 2.8559986734967282e-05, + "loss": 0.7114, + "step": 951 + }, + { + "epoch": 0.7616, + "grad_norm": 0.48335649902965766, + "learning_rate": 2.8378849878663628e-05, + "loss": 0.6716, + "step": 952 + }, + { + "epoch": 0.7624, + "grad_norm": 0.4847550301151093, + "learning_rate": 2.819819423336775e-05, + "loss": 0.7474, + "step": 953 + }, + { + "epoch": 0.7632, + "grad_norm": 0.5234526590176337, + "learning_rate": 2.8018021012875994e-05, + "loss": 0.772, + "step": 954 + }, + { + "epoch": 0.764, + "grad_norm": 0.49879514783701684, + "learning_rate": 2.7838331427743282e-05, + "loss": 0.7406, + "step": 955 + }, + { + "epoch": 0.7648, + "grad_norm": 0.5033085718387407, + "learning_rate": 2.7659126685275027e-05, + "loss": 0.7003, + "step": 956 + }, + { + "epoch": 0.7656, + "grad_norm": 0.38764469155216236, + "learning_rate": 2.7480407989519198e-05, + "loss": 0.6239, + "step": 957 + }, + { + "epoch": 0.7664, + "grad_norm": 0.5703766646578441, + "learning_rate": 2.7302176541257986e-05, + "loss": 0.7723, + "step": 958 + }, + { + "epoch": 0.7672, + "grad_norm": 0.4234542614190962, + "learning_rate": 2.712443353799984e-05, + "loss": 0.6501, + "step": 959 + }, + { + "epoch": 0.768, + "grad_norm": 0.5008835886481774, + "learning_rate": 2.6947180173971508e-05, + "loss": 0.8263, + "step": 960 + }, + { + "epoch": 0.7688, + "grad_norm": 0.599495073220037, + "learning_rate": 2.677041764010988e-05, + "loss": 0.7097, + "step": 961 + }, + { + "epoch": 0.7696, + "grad_norm": 0.5844611495743196, + "learning_rate": 2.659414712405398e-05, + "loss": 0.7835, + "step": 962 + }, + { + "epoch": 0.7704, + "grad_norm": 0.43476950878045484, + "learning_rate": 2.6418369810137188e-05, + "loss": 0.6799, + "step": 963 + }, + { + "epoch": 0.7712, + "grad_norm": 0.48616257006951086, + "learning_rate": 2.6243086879379e-05, + "loss": 0.7521, + "step": 964 + }, + { + "epoch": 0.772, + "grad_norm": 0.5436846517632036, + "learning_rate": 2.6068299509477266e-05, + "loss": 0.7803, + "step": 965 + }, + { + "epoch": 0.7728, + "grad_norm": 0.47154203209532536, + "learning_rate": 2.5894008874800325e-05, + "loss": 0.6548, + "step": 966 + }, + { + "epoch": 0.7736, + "grad_norm": 0.45566972553858986, + "learning_rate": 2.5720216146378917e-05, + "loss": 0.6729, + "step": 967 + }, + { + "epoch": 0.7744, + "grad_norm": 0.7693258124254059, + "learning_rate": 2.5546922491898495e-05, + "loss": 0.7057, + "step": 968 + }, + { + "epoch": 0.7752, + "grad_norm": 0.6801760699430367, + "learning_rate": 2.5374129075691265e-05, + "loss": 0.7413, + "step": 969 + }, + { + "epoch": 0.776, + "grad_norm": 0.6048820714864578, + "learning_rate": 2.5201837058728505e-05, + "loss": 0.7702, + "step": 970 + }, + { + "epoch": 0.7768, + "grad_norm": 0.48961143100612164, + "learning_rate": 2.503004759861258e-05, + "loss": 0.8139, + "step": 971 + }, + { + "epoch": 0.7776, + "grad_norm": 0.4121627801544965, + "learning_rate": 2.485876184956928e-05, + "loss": 0.7368, + "step": 972 + }, + { + "epoch": 0.7784, + "grad_norm": 0.44146407445745256, + "learning_rate": 2.4687980962440072e-05, + "loss": 0.7054, + "step": 973 + }, + { + "epoch": 0.7792, + "grad_norm": 0.49447203885631874, + "learning_rate": 2.451770608467432e-05, + "loss": 0.7304, + "step": 974 + }, + { + "epoch": 0.78, + "grad_norm": 0.599025188481658, + "learning_rate": 2.4347938360321566e-05, + "loss": 0.8518, + "step": 975 + }, + { + "epoch": 0.7808, + "grad_norm": 0.5079607966395224, + "learning_rate": 2.417867893002387e-05, + "loss": 0.7569, + "step": 976 + }, + { + "epoch": 0.7816, + "grad_norm": 0.5186789106045554, + "learning_rate": 2.400992893100822e-05, + "loss": 0.7706, + "step": 977 + }, + { + "epoch": 0.7824, + "grad_norm": 0.5603537997364025, + "learning_rate": 2.3841689497078746e-05, + "loss": 0.7536, + "step": 978 + }, + { + "epoch": 0.7832, + "grad_norm": 0.5791561150960993, + "learning_rate": 2.3673961758609152e-05, + "loss": 0.8087, + "step": 979 + }, + { + "epoch": 0.784, + "grad_norm": 0.43902036476656336, + "learning_rate": 2.3506746842535242e-05, + "loss": 0.7452, + "step": 980 + }, + { + "epoch": 0.7848, + "grad_norm": 0.7575912669682205, + "learning_rate": 2.334004587234717e-05, + "loss": 0.8813, + "step": 981 + }, + { + "epoch": 0.7856, + "grad_norm": 0.42945935489036363, + "learning_rate": 2.3173859968081944e-05, + "loss": 0.6217, + "step": 982 + }, + { + "epoch": 0.7864, + "grad_norm": 0.44750100331178905, + "learning_rate": 2.300819024631603e-05, + "loss": 0.7209, + "step": 983 + }, + { + "epoch": 0.7872, + "grad_norm": 0.4409845770875295, + "learning_rate": 2.2843037820157675e-05, + "loss": 0.6433, + "step": 984 + }, + { + "epoch": 0.788, + "grad_norm": 0.46647686163124646, + "learning_rate": 2.26784037992395e-05, + "loss": 0.7116, + "step": 985 + }, + { + "epoch": 0.7888, + "grad_norm": 0.5162949726365446, + "learning_rate": 2.251428928971102e-05, + "loss": 0.8006, + "step": 986 + }, + { + "epoch": 0.7896, + "grad_norm": 0.5114649080326217, + "learning_rate": 2.2350695394231345e-05, + "loss": 0.8247, + "step": 987 + }, + { + "epoch": 0.7904, + "grad_norm": 0.49001671731893587, + "learning_rate": 2.2187623211961562e-05, + "loss": 0.7786, + "step": 988 + }, + { + "epoch": 0.7912, + "grad_norm": 0.4223229927621443, + "learning_rate": 2.2025073838557454e-05, + "loss": 0.6628, + "step": 989 + }, + { + "epoch": 0.792, + "grad_norm": 0.4803671003840686, + "learning_rate": 2.1863048366162208e-05, + "loss": 0.7072, + "step": 990 + }, + { + "epoch": 0.7928, + "grad_norm": 0.5532220981211563, + "learning_rate": 2.1701547883398922e-05, + "loss": 0.8015, + "step": 991 + }, + { + "epoch": 0.7936, + "grad_norm": 0.43704204658882784, + "learning_rate": 2.1540573475363402e-05, + "loss": 0.7021, + "step": 992 + }, + { + "epoch": 0.7944, + "grad_norm": 0.5544186398509853, + "learning_rate": 2.138012622361689e-05, + "loss": 0.7479, + "step": 993 + }, + { + "epoch": 0.7952, + "grad_norm": 1.0685852325949345, + "learning_rate": 2.1220207206178688e-05, + "loss": 0.8271, + "step": 994 + }, + { + "epoch": 0.796, + "grad_norm": 0.4580950809239211, + "learning_rate": 2.106081749751897e-05, + "loss": 0.6717, + "step": 995 + }, + { + "epoch": 0.7968, + "grad_norm": 0.4178006662670166, + "learning_rate": 2.0901958168551638e-05, + "loss": 0.6687, + "step": 996 + }, + { + "epoch": 0.7976, + "grad_norm": 0.5273656844850789, + "learning_rate": 2.0743630286627002e-05, + "loss": 0.6818, + "step": 997 + }, + { + "epoch": 0.7984, + "grad_norm": 0.43466594074759396, + "learning_rate": 2.058583491552465e-05, + "loss": 0.649, + "step": 998 + }, + { + "epoch": 0.7992, + "grad_norm": 0.43858172586530747, + "learning_rate": 2.0428573115446392e-05, + "loss": 0.6802, + "step": 999 + }, + { + "epoch": 0.8, + "grad_norm": 0.4186937573931191, + "learning_rate": 2.027184594300898e-05, + "loss": 0.6171, + "step": 1000 + }, + { + "epoch": 0.8008, + "grad_norm": 0.5520115829546185, + "learning_rate": 2.011565445123711e-05, + "loss": 0.8476, + "step": 1001 + }, + { + "epoch": 0.8016, + "grad_norm": 0.4708825634329752, + "learning_rate": 1.995999968955641e-05, + "loss": 0.7885, + "step": 1002 + }, + { + "epoch": 0.8024, + "grad_norm": 0.48267036334656843, + "learning_rate": 1.980488270378612e-05, + "loss": 0.7612, + "step": 1003 + }, + { + "epoch": 0.8032, + "grad_norm": 0.5077422497180833, + "learning_rate": 1.9650304536132426e-05, + "loss": 0.6632, + "step": 1004 + }, + { + "epoch": 0.804, + "grad_norm": 0.539435837601806, + "learning_rate": 1.9496266225181248e-05, + "loss": 0.833, + "step": 1005 + }, + { + "epoch": 0.8048, + "grad_norm": 0.4419758101276182, + "learning_rate": 1.9342768805891178e-05, + "loss": 0.7182, + "step": 1006 + }, + { + "epoch": 0.8056, + "grad_norm": 0.46706717847608514, + "learning_rate": 1.918981330958678e-05, + "loss": 0.7399, + "step": 1007 + }, + { + "epoch": 0.8064, + "grad_norm": 0.3942868612201381, + "learning_rate": 1.903740076395151e-05, + "loss": 0.6905, + "step": 1008 + }, + { + "epoch": 0.8072, + "grad_norm": 0.823933864388514, + "learning_rate": 1.8885532193020704e-05, + "loss": 0.7783, + "step": 1009 + }, + { + "epoch": 0.808, + "grad_norm": 0.4728779675632251, + "learning_rate": 1.8734208617174988e-05, + "loss": 0.7123, + "step": 1010 + }, + { + "epoch": 0.8088, + "grad_norm": 0.4374541321566614, + "learning_rate": 1.8583431053133127e-05, + "loss": 0.5856, + "step": 1011 + }, + { + "epoch": 0.8096, + "grad_norm": 0.4985701255460996, + "learning_rate": 1.8433200513945337e-05, + "loss": 0.6754, + "step": 1012 + }, + { + "epoch": 0.8104, + "grad_norm": 0.46546386224577047, + "learning_rate": 1.8283518008986567e-05, + "loss": 0.7292, + "step": 1013 + }, + { + "epoch": 0.8112, + "grad_norm": 0.5099137877528729, + "learning_rate": 1.8134384543949478e-05, + "loss": 0.7328, + "step": 1014 + }, + { + "epoch": 0.812, + "grad_norm": 0.46783382282539937, + "learning_rate": 1.7985801120837865e-05, + "loss": 0.6792, + "step": 1015 + }, + { + "epoch": 0.8128, + "grad_norm": 0.46221528019887015, + "learning_rate": 1.783776873795994e-05, + "loss": 0.7198, + "step": 1016 + }, + { + "epoch": 0.8136, + "grad_norm": 0.4657759004524181, + "learning_rate": 1.7690288389921493e-05, + "loss": 0.7583, + "step": 1017 + }, + { + "epoch": 0.8144, + "grad_norm": 0.4755406411752139, + "learning_rate": 1.754336106761927e-05, + "loss": 0.7101, + "step": 1018 + }, + { + "epoch": 0.8152, + "grad_norm": 0.45146092263453164, + "learning_rate": 1.739698775823442e-05, + "loss": 0.7212, + "step": 1019 + }, + { + "epoch": 0.816, + "grad_norm": 0.48181166430899425, + "learning_rate": 1.7251169445225657e-05, + "loss": 0.6708, + "step": 1020 + }, + { + "epoch": 0.8168, + "grad_norm": 0.5121672276622006, + "learning_rate": 1.7105907108322816e-05, + "loss": 0.7762, + "step": 1021 + }, + { + "epoch": 0.8176, + "grad_norm": 0.5347895896829294, + "learning_rate": 1.696120172352025e-05, + "loss": 0.7214, + "step": 1022 + }, + { + "epoch": 0.8184, + "grad_norm": 0.5062691297853303, + "learning_rate": 1.6817054263070174e-05, + "loss": 0.7986, + "step": 1023 + }, + { + "epoch": 0.8192, + "grad_norm": 0.45470665182666015, + "learning_rate": 1.6673465695476232e-05, + "loss": 0.7639, + "step": 1024 + }, + { + "epoch": 0.82, + "grad_norm": 0.5633270465411959, + "learning_rate": 1.6530436985486996e-05, + "loss": 0.7965, + "step": 1025 + }, + { + "epoch": 0.8208, + "grad_norm": 0.4632187262935168, + "learning_rate": 1.6387969094089316e-05, + "loss": 0.6665, + "step": 1026 + }, + { + "epoch": 0.8216, + "grad_norm": 0.574279183736144, + "learning_rate": 1.6246062978502164e-05, + "loss": 0.8285, + "step": 1027 + }, + { + "epoch": 0.8224, + "grad_norm": 0.44387512333929124, + "learning_rate": 1.6104719592169902e-05, + "loss": 0.7701, + "step": 1028 + }, + { + "epoch": 0.8232, + "grad_norm": 0.5253025020458589, + "learning_rate": 1.5963939884756042e-05, + "loss": 0.7622, + "step": 1029 + }, + { + "epoch": 0.824, + "grad_norm": 0.35988195955396635, + "learning_rate": 1.5823724802136865e-05, + "loss": 0.6074, + "step": 1030 + }, + { + "epoch": 0.8248, + "grad_norm": 0.44751945613534244, + "learning_rate": 1.5684075286394985e-05, + "loss": 0.7025, + "step": 1031 + }, + { + "epoch": 0.8256, + "grad_norm": 0.4525032309828442, + "learning_rate": 1.5544992275813053e-05, + "loss": 0.7061, + "step": 1032 + }, + { + "epoch": 0.8264, + "grad_norm": 0.6537877188354023, + "learning_rate": 1.5406476704867524e-05, + "loss": 0.9126, + "step": 1033 + }, + { + "epoch": 0.8272, + "grad_norm": 0.4864319539069579, + "learning_rate": 1.526852950422226e-05, + "loss": 0.7534, + "step": 1034 + }, + { + "epoch": 0.828, + "grad_norm": 0.4876601102451837, + "learning_rate": 1.5131151600722337e-05, + "loss": 0.7088, + "step": 1035 + }, + { + "epoch": 0.8288, + "grad_norm": 0.40757454727223125, + "learning_rate": 1.4994343917387854e-05, + "loss": 0.787, + "step": 1036 + }, + { + "epoch": 0.8296, + "grad_norm": 0.47892847598073485, + "learning_rate": 1.485810737340767e-05, + "loss": 0.7728, + "step": 1037 + }, + { + "epoch": 0.8304, + "grad_norm": 0.4757832511827876, + "learning_rate": 1.4722442884133214e-05, + "loss": 0.7495, + "step": 1038 + }, + { + "epoch": 0.8312, + "grad_norm": 0.5231076380426355, + "learning_rate": 1.4587351361072454e-05, + "loss": 0.8446, + "step": 1039 + }, + { + "epoch": 0.832, + "grad_norm": 0.4584636561692689, + "learning_rate": 1.4452833711883628e-05, + "loss": 0.7148, + "step": 1040 + }, + { + "epoch": 0.8328, + "grad_norm": 0.4249478616706151, + "learning_rate": 1.4318890840369182e-05, + "loss": 0.6406, + "step": 1041 + }, + { + "epoch": 0.8336, + "grad_norm": 0.5974664972050698, + "learning_rate": 1.4185523646469822e-05, + "loss": 0.7458, + "step": 1042 + }, + { + "epoch": 0.8344, + "grad_norm": 0.43967215657372255, + "learning_rate": 1.4052733026258281e-05, + "loss": 0.7047, + "step": 1043 + }, + { + "epoch": 0.8352, + "grad_norm": 0.3775520915989605, + "learning_rate": 1.3920519871933424e-05, + "loss": 0.6716, + "step": 1044 + }, + { + "epoch": 0.836, + "grad_norm": 0.5605163401425338, + "learning_rate": 1.3788885071814172e-05, + "loss": 0.6772, + "step": 1045 + }, + { + "epoch": 0.8368, + "grad_norm": 0.5621911471363742, + "learning_rate": 1.3657829510333654e-05, + "loss": 0.8153, + "step": 1046 + }, + { + "epoch": 0.8376, + "grad_norm": 0.42912955154901866, + "learning_rate": 1.3527354068033139e-05, + "loss": 0.66, + "step": 1047 + }, + { + "epoch": 0.8384, + "grad_norm": 0.5202569987929434, + "learning_rate": 1.339745962155613e-05, + "loss": 0.7066, + "step": 1048 + }, + { + "epoch": 0.8392, + "grad_norm": 0.4807687247104095, + "learning_rate": 1.326814704364262e-05, + "loss": 0.7121, + "step": 1049 + }, + { + "epoch": 0.84, + "grad_norm": 0.4986644316693339, + "learning_rate": 1.3139417203123027e-05, + "loss": 0.7457, + "step": 1050 + }, + { + "epoch": 0.8408, + "grad_norm": 0.47792773262845173, + "learning_rate": 1.3011270964912459e-05, + "loss": 0.6795, + "step": 1051 + }, + { + "epoch": 0.8416, + "grad_norm": 0.46685496422213985, + "learning_rate": 1.2883709190004955e-05, + "loss": 0.7332, + "step": 1052 + }, + { + "epoch": 0.8424, + "grad_norm": 0.48669661361282085, + "learning_rate": 1.275673273546758e-05, + "loss": 0.7764, + "step": 1053 + }, + { + "epoch": 0.8432, + "grad_norm": 0.4235981012865011, + "learning_rate": 1.263034245443473e-05, + "loss": 0.6397, + "step": 1054 + }, + { + "epoch": 0.844, + "grad_norm": 0.4904792258557603, + "learning_rate": 1.2504539196102439e-05, + "loss": 0.8163, + "step": 1055 + }, + { + "epoch": 0.8448, + "grad_norm": 0.6050020644586003, + "learning_rate": 1.2379323805722576e-05, + "loss": 0.7383, + "step": 1056 + }, + { + "epoch": 0.8456, + "grad_norm": 0.5121518171335521, + "learning_rate": 1.2254697124597237e-05, + "loss": 0.762, + "step": 1057 + }, + { + "epoch": 0.8464, + "grad_norm": 0.5721883730616573, + "learning_rate": 1.2130659990073146e-05, + "loss": 0.8488, + "step": 1058 + }, + { + "epoch": 0.8472, + "grad_norm": 0.5478659357304045, + "learning_rate": 1.2007213235535786e-05, + "loss": 0.7503, + "step": 1059 + }, + { + "epoch": 0.848, + "grad_norm": 0.43895362313133507, + "learning_rate": 1.1884357690404158e-05, + "loss": 0.7001, + "step": 1060 + }, + { + "epoch": 0.8488, + "grad_norm": 0.5635448355820784, + "learning_rate": 1.176209418012495e-05, + "loss": 0.7955, + "step": 1061 + }, + { + "epoch": 0.8496, + "grad_norm": 0.49276395642104454, + "learning_rate": 1.1640423526166988e-05, + "loss": 0.7104, + "step": 1062 + }, + { + "epoch": 0.8504, + "grad_norm": 0.47743783074570756, + "learning_rate": 1.1519346546015907e-05, + "loss": 0.7628, + "step": 1063 + }, + { + "epoch": 0.8512, + "grad_norm": 0.5468138659604102, + "learning_rate": 1.1398864053168534e-05, + "loss": 0.738, + "step": 1064 + }, + { + "epoch": 0.852, + "grad_norm": 0.59119847348091, + "learning_rate": 1.1278976857127311e-05, + "loss": 0.7701, + "step": 1065 + }, + { + "epoch": 0.8528, + "grad_norm": 0.7471754912626668, + "learning_rate": 1.1159685763395111e-05, + "loss": 0.7783, + "step": 1066 + }, + { + "epoch": 0.8536, + "grad_norm": 0.6290087709600349, + "learning_rate": 1.1040991573469629e-05, + "loss": 0.8718, + "step": 1067 + }, + { + "epoch": 0.8544, + "grad_norm": 0.4751492256677272, + "learning_rate": 1.0922895084838037e-05, + "loss": 0.7373, + "step": 1068 + }, + { + "epoch": 0.8552, + "grad_norm": 0.5120311906692161, + "learning_rate": 1.0805397090971737e-05, + "loss": 0.7467, + "step": 1069 + }, + { + "epoch": 0.856, + "grad_norm": 0.5690162969284462, + "learning_rate": 1.0688498381320855e-05, + "loss": 0.7774, + "step": 1070 + }, + { + "epoch": 0.8568, + "grad_norm": 0.655117479287365, + "learning_rate": 1.057219974130903e-05, + "loss": 0.8564, + "step": 1071 + }, + { + "epoch": 0.8576, + "grad_norm": 0.5215038621038415, + "learning_rate": 1.045650195232819e-05, + "loss": 0.6966, + "step": 1072 + }, + { + "epoch": 0.8584, + "grad_norm": 0.578325587613857, + "learning_rate": 1.0341405791733183e-05, + "loss": 0.8556, + "step": 1073 + }, + { + "epoch": 0.8592, + "grad_norm": 0.4465886903437484, + "learning_rate": 1.0226912032836611e-05, + "loss": 0.7757, + "step": 1074 + }, + { + "epoch": 0.86, + "grad_norm": 0.46492410138691087, + "learning_rate": 1.0113021444903726e-05, + "loss": 0.7337, + "step": 1075 + }, + { + "epoch": 0.8608, + "grad_norm": 0.49953516173947843, + "learning_rate": 9.999734793146998e-06, + "loss": 0.7506, + "step": 1076 + }, + { + "epoch": 0.8616, + "grad_norm": 0.44577082171877463, + "learning_rate": 9.887052838721322e-06, + "loss": 0.6879, + "step": 1077 + }, + { + "epoch": 0.8624, + "grad_norm": 0.4685474938489625, + "learning_rate": 9.774976338718677e-06, + "loss": 0.7219, + "step": 1078 + }, + { + "epoch": 0.8632, + "grad_norm": 0.4658071939604654, + "learning_rate": 9.663506046162985e-06, + "loss": 0.6495, + "step": 1079 + }, + { + "epoch": 0.864, + "grad_norm": 0.6385767458322889, + "learning_rate": 9.552642710005299e-06, + "loss": 0.8595, + "step": 1080 + }, + { + "epoch": 0.8648, + "grad_norm": 0.43942390825182814, + "learning_rate": 9.44238707511862e-06, + "loss": 0.6651, + "step": 1081 + }, + { + "epoch": 0.8656, + "grad_norm": 0.5034642463474058, + "learning_rate": 9.332739882292752e-06, + "loss": 0.7372, + "step": 1082 + }, + { + "epoch": 0.8664, + "grad_norm": 0.5006545717086979, + "learning_rate": 9.22370186822965e-06, + "loss": 0.7398, + "step": 1083 + }, + { + "epoch": 0.8672, + "grad_norm": 0.44933894560864746, + "learning_rate": 9.115273765538202e-06, + "loss": 0.623, + "step": 1084 + }, + { + "epoch": 0.868, + "grad_norm": 0.5840445867283482, + "learning_rate": 9.0074563027294e-06, + "loss": 0.8533, + "step": 1085 + }, + { + "epoch": 0.8688, + "grad_norm": 0.5706035386100896, + "learning_rate": 8.900250204211514e-06, + "loss": 0.6921, + "step": 1086 + }, + { + "epoch": 0.8696, + "grad_norm": 0.46084713291319324, + "learning_rate": 8.79365619028507e-06, + "loss": 0.7625, + "step": 1087 + }, + { + "epoch": 0.8704, + "grad_norm": 0.5420705220649868, + "learning_rate": 8.687674977138116e-06, + "loss": 0.7185, + "step": 1088 + }, + { + "epoch": 0.8712, + "grad_norm": 0.44589867765790675, + "learning_rate": 8.582307276841462e-06, + "loss": 0.6253, + "step": 1089 + }, + { + "epoch": 0.872, + "grad_norm": 0.4607921796004629, + "learning_rate": 8.47755379734373e-06, + "loss": 0.6485, + "step": 1090 + }, + { + "epoch": 0.8728, + "grad_norm": 1.0893025371996412, + "learning_rate": 8.37341524246672e-06, + "loss": 0.7809, + "step": 1091 + }, + { + "epoch": 0.8736, + "grad_norm": 0.4811219887426921, + "learning_rate": 8.269892311900696e-06, + "loss": 0.807, + "step": 1092 + }, + { + "epoch": 0.8744, + "grad_norm": 0.4840226488388807, + "learning_rate": 8.166985701199582e-06, + "loss": 0.6789, + "step": 1093 + }, + { + "epoch": 0.8752, + "grad_norm": 0.5135873716292094, + "learning_rate": 8.064696101776358e-06, + "loss": 0.853, + "step": 1094 + }, + { + "epoch": 0.876, + "grad_norm": 0.36562653186171645, + "learning_rate": 7.963024200898462e-06, + "loss": 0.6781, + "step": 1095 + }, + { + "epoch": 0.8768, + "grad_norm": 0.42593289151012326, + "learning_rate": 7.861970681683051e-06, + "loss": 0.6166, + "step": 1096 + }, + { + "epoch": 0.8776, + "grad_norm": 0.5315582194594366, + "learning_rate": 7.761536223092458e-06, + "loss": 0.6649, + "step": 1097 + }, + { + "epoch": 0.8784, + "grad_norm": 0.6127137326307142, + "learning_rate": 7.661721499929753e-06, + "loss": 0.7117, + "step": 1098 + }, + { + "epoch": 0.8792, + "grad_norm": 0.5303908062206815, + "learning_rate": 7.562527182833978e-06, + "loss": 0.8552, + "step": 1099 + }, + { + "epoch": 0.88, + "grad_norm": 0.47838774189460087, + "learning_rate": 7.463953938275858e-06, + "loss": 0.6406, + "step": 1100 + }, + { + "epoch": 0.8808, + "grad_norm": 0.3913538973837513, + "learning_rate": 7.366002428553153e-06, + "loss": 0.6891, + "step": 1101 + }, + { + "epoch": 0.8816, + "grad_norm": 0.5334598670020214, + "learning_rate": 7.2686733117863784e-06, + "loss": 0.6587, + "step": 1102 + }, + { + "epoch": 0.8824, + "grad_norm": 0.46587118676131073, + "learning_rate": 7.171967241914224e-06, + "loss": 0.6475, + "step": 1103 + }, + { + "epoch": 0.8832, + "grad_norm": 0.4486855546434353, + "learning_rate": 7.07588486868922e-06, + "loss": 0.6735, + "step": 1104 + }, + { + "epoch": 0.884, + "grad_norm": 0.39660319163011865, + "learning_rate": 6.980426837673437e-06, + "loss": 0.5958, + "step": 1105 + }, + { + "epoch": 0.8848, + "grad_norm": 0.48039135395366067, + "learning_rate": 6.8855937902340576e-06, + "loss": 0.6764, + "step": 1106 + }, + { + "epoch": 0.8856, + "grad_norm": 0.39495013034193327, + "learning_rate": 6.791386363539065e-06, + "loss": 0.6095, + "step": 1107 + }, + { + "epoch": 0.8864, + "grad_norm": 0.4717751676173904, + "learning_rate": 6.6978051905530855e-06, + "loss": 0.7248, + "step": 1108 + }, + { + "epoch": 0.8872, + "grad_norm": 0.5150801453634942, + "learning_rate": 6.604850900032955e-06, + "loss": 0.7773, + "step": 1109 + }, + { + "epoch": 0.888, + "grad_norm": 0.6082428013992502, + "learning_rate": 6.512524116523633e-06, + "loss": 0.8776, + "step": 1110 + }, + { + "epoch": 0.8888, + "grad_norm": 0.4406367241712505, + "learning_rate": 6.420825460353974e-06, + "loss": 0.6803, + "step": 1111 + }, + { + "epoch": 0.8896, + "grad_norm": 0.46665206191644454, + "learning_rate": 6.329755547632499e-06, + "loss": 0.671, + "step": 1112 + }, + { + "epoch": 0.8904, + "grad_norm": 0.5460304496293431, + "learning_rate": 6.239314990243339e-06, + "loss": 0.691, + "step": 1113 + }, + { + "epoch": 0.8912, + "grad_norm": 0.47705943421423336, + "learning_rate": 6.149504395842087e-06, + "loss": 0.7156, + "step": 1114 + }, + { + "epoch": 0.892, + "grad_norm": 0.6375269617497463, + "learning_rate": 6.0603243678516995e-06, + "loss": 0.6572, + "step": 1115 + }, + { + "epoch": 0.8928, + "grad_norm": 0.4630754577437863, + "learning_rate": 5.971775505458444e-06, + "loss": 0.6607, + "step": 1116 + }, + { + "epoch": 0.8936, + "grad_norm": 0.4469535198120172, + "learning_rate": 5.883858403607967e-06, + "loss": 0.6361, + "step": 1117 + }, + { + "epoch": 0.8944, + "grad_norm": 0.5132241096703618, + "learning_rate": 5.7965736530010916e-06, + "loss": 0.8124, + "step": 1118 + }, + { + "epoch": 0.8952, + "grad_norm": 0.41536852207540703, + "learning_rate": 5.7099218400900716e-06, + "loss": 0.6163, + "step": 1119 + }, + { + "epoch": 0.896, + "grad_norm": 0.40779479029246263, + "learning_rate": 5.623903547074549e-06, + "loss": 0.6948, + "step": 1120 + }, + { + "epoch": 0.8968, + "grad_norm": 0.45953098037180146, + "learning_rate": 5.538519351897575e-06, + "loss": 0.7308, + "step": 1121 + }, + { + "epoch": 0.8976, + "grad_norm": 0.47649432472023995, + "learning_rate": 5.453769828241872e-06, + "loss": 0.7045, + "step": 1122 + }, + { + "epoch": 0.8984, + "grad_norm": 0.4450325910963035, + "learning_rate": 5.369655545525909e-06, + "loss": 0.7013, + "step": 1123 + }, + { + "epoch": 0.8992, + "grad_norm": 0.5813744234646867, + "learning_rate": 5.286177068899989e-06, + "loss": 0.8248, + "step": 1124 + }, + { + "epoch": 0.9, + "grad_norm": 0.5214211863700414, + "learning_rate": 5.2033349592426335e-06, + "loss": 0.7779, + "step": 1125 + }, + { + "epoch": 0.9008, + "grad_norm": 0.48349638498519676, + "learning_rate": 5.121129773156663e-06, + "loss": 0.6961, + "step": 1126 + }, + { + "epoch": 0.9016, + "grad_norm": 0.5056775048112797, + "learning_rate": 5.039562062965508e-06, + "loss": 0.6622, + "step": 1127 + }, + { + "epoch": 0.9024, + "grad_norm": 0.5647896557856095, + "learning_rate": 4.95863237670956e-06, + "loss": 0.756, + "step": 1128 + }, + { + "epoch": 0.9032, + "grad_norm": 0.4349903928065448, + "learning_rate": 4.87834125814235e-06, + "loss": 0.693, + "step": 1129 + }, + { + "epoch": 0.904, + "grad_norm": 0.5798443024117006, + "learning_rate": 4.798689246727006e-06, + "loss": 0.802, + "step": 1130 + }, + { + "epoch": 0.9048, + "grad_norm": 0.4267548379955475, + "learning_rate": 4.719676877632639e-06, + "loss": 0.6816, + "step": 1131 + }, + { + "epoch": 0.9056, + "grad_norm": 0.45202988249577536, + "learning_rate": 4.641304681730641e-06, + "loss": 0.7188, + "step": 1132 + }, + { + "epoch": 0.9064, + "grad_norm": 0.44678846932757, + "learning_rate": 4.563573185591219e-06, + "loss": 0.6814, + "step": 1133 + }, + { + "epoch": 0.9072, + "grad_norm": 0.4836421448666474, + "learning_rate": 4.486482911479839e-06, + "loss": 0.6817, + "step": 1134 + }, + { + "epoch": 0.908, + "grad_norm": 0.5740021878837424, + "learning_rate": 4.4100343773536225e-06, + "loss": 0.7962, + "step": 1135 + }, + { + "epoch": 0.9088, + "grad_norm": 0.4784481331451972, + "learning_rate": 4.3342280968580285e-06, + "loss": 0.6667, + "step": 1136 + }, + { + "epoch": 0.9096, + "grad_norm": 0.4432807695617788, + "learning_rate": 4.259064579323302e-06, + "loss": 0.7471, + "step": 1137 + }, + { + "epoch": 0.9104, + "grad_norm": 0.4177194996243855, + "learning_rate": 4.184544329761009e-06, + "loss": 0.6628, + "step": 1138 + }, + { + "epoch": 0.9112, + "grad_norm": 0.4282931762857877, + "learning_rate": 4.1106678488607495e-06, + "loss": 0.7124, + "step": 1139 + }, + { + "epoch": 0.912, + "grad_norm": 0.4262061012697565, + "learning_rate": 4.037435632986786e-06, + "loss": 0.717, + "step": 1140 + }, + { + "epoch": 0.9128, + "grad_norm": 0.4514805942685446, + "learning_rate": 3.964848174174541e-06, + "loss": 0.6876, + "step": 1141 + }, + { + "epoch": 0.9136, + "grad_norm": 0.5303649954638886, + "learning_rate": 3.892905960127546e-06, + "loss": 0.7667, + "step": 1142 + }, + { + "epoch": 0.9144, + "grad_norm": 0.4918631618165322, + "learning_rate": 3.821609474213983e-06, + "loss": 0.6502, + "step": 1143 + }, + { + "epoch": 0.9152, + "grad_norm": 0.5700142202967081, + "learning_rate": 3.750959195463466e-06, + "loss": 0.8474, + "step": 1144 + }, + { + "epoch": 0.916, + "grad_norm": 0.4991967240869772, + "learning_rate": 3.6809555985639068e-06, + "loss": 0.7511, + "step": 1145 + }, + { + "epoch": 0.9168, + "grad_norm": 0.40217138492730187, + "learning_rate": 3.611599153858214e-06, + "loss": 0.682, + "step": 1146 + }, + { + "epoch": 0.9176, + "grad_norm": 0.48229462822619174, + "learning_rate": 3.5428903273411863e-06, + "loss": 0.7206, + "step": 1147 + }, + { + "epoch": 0.9184, + "grad_norm": 0.47550104145425826, + "learning_rate": 3.4748295806564356e-06, + "loss": 0.7343, + "step": 1148 + }, + { + "epoch": 0.9192, + "grad_norm": 0.45188720769842533, + "learning_rate": 3.40741737109318e-06, + "loss": 0.7025, + "step": 1149 + }, + { + "epoch": 0.92, + "grad_norm": 0.7668311673684182, + "learning_rate": 3.3406541515832003e-06, + "loss": 0.8834, + "step": 1150 + }, + { + "epoch": 0.9208, + "grad_norm": 0.5669737602421924, + "learning_rate": 3.2745403706978872e-06, + "loss": 0.7298, + "step": 1151 + }, + { + "epoch": 0.9216, + "grad_norm": 0.5137507586726185, + "learning_rate": 3.209076472645112e-06, + "loss": 0.6404, + "step": 1152 + }, + { + "epoch": 0.9224, + "grad_norm": 0.5034102659190928, + "learning_rate": 3.1442628972662704e-06, + "loss": 0.8691, + "step": 1153 + }, + { + "epoch": 0.9232, + "grad_norm": 0.514897507309948, + "learning_rate": 3.0801000800333877e-06, + "loss": 0.6907, + "step": 1154 + }, + { + "epoch": 0.924, + "grad_norm": 0.38352291014196305, + "learning_rate": 3.0165884520461316e-06, + "loss": 0.6594, + "step": 1155 + }, + { + "epoch": 0.9248, + "grad_norm": 0.5516929282475472, + "learning_rate": 2.9537284400289355e-06, + "loss": 0.7701, + "step": 1156 + }, + { + "epoch": 0.9256, + "grad_norm": 0.5482142752963403, + "learning_rate": 2.8915204663281013e-06, + "loss": 0.759, + "step": 1157 + }, + { + "epoch": 0.9264, + "grad_norm": 0.5054509558775777, + "learning_rate": 2.8299649489090475e-06, + "loss": 0.801, + "step": 1158 + }, + { + "epoch": 0.9272, + "grad_norm": 0.4659084921053555, + "learning_rate": 2.7690623013533976e-06, + "loss": 0.761, + "step": 1159 + }, + { + "epoch": 0.928, + "grad_norm": 0.47021240701730616, + "learning_rate": 2.708812932856253e-06, + "loss": 0.7001, + "step": 1160 + }, + { + "epoch": 0.9288, + "grad_norm": 0.49700801293005603, + "learning_rate": 2.649217248223468e-06, + "loss": 0.8184, + "step": 1161 + }, + { + "epoch": 0.9296, + "grad_norm": 0.7469452107817106, + "learning_rate": 2.590275647868867e-06, + "loss": 0.7729, + "step": 1162 + }, + { + "epoch": 0.9304, + "grad_norm": 0.4532722798183533, + "learning_rate": 2.5319885278115906e-06, + "loss": 0.6464, + "step": 1163 + }, + { + "epoch": 0.9312, + "grad_norm": 0.40494768586241986, + "learning_rate": 2.4743562796734622e-06, + "loss": 0.664, + "step": 1164 + }, + { + "epoch": 0.932, + "grad_norm": 0.48079459836260613, + "learning_rate": 2.4173792906762804e-06, + "loss": 0.7605, + "step": 1165 + }, + { + "epoch": 0.9328, + "grad_norm": 0.507345904652204, + "learning_rate": 2.3610579436393e-06, + "loss": 0.777, + "step": 1166 + }, + { + "epoch": 0.9336, + "grad_norm": 0.48410887979651696, + "learning_rate": 2.3053926169765984e-06, + "loss": 0.7629, + "step": 1167 + }, + { + "epoch": 0.9344, + "grad_norm": 0.5838788418864651, + "learning_rate": 2.250383684694579e-06, + "loss": 0.7766, + "step": 1168 + }, + { + "epoch": 0.9352, + "grad_norm": 0.4743951487176639, + "learning_rate": 2.1960315163894075e-06, + "loss": 0.8391, + "step": 1169 + }, + { + "epoch": 0.936, + "grad_norm": 0.5560881695486783, + "learning_rate": 2.1423364772445887e-06, + "loss": 0.8358, + "step": 1170 + }, + { + "epoch": 0.9368, + "grad_norm": 0.4701398204578433, + "learning_rate": 2.0892989280284823e-06, + "loss": 0.729, + "step": 1171 + }, + { + "epoch": 0.9376, + "grad_norm": 0.6364855840237912, + "learning_rate": 2.036919225091827e-06, + "loss": 0.7894, + "step": 1172 + }, + { + "epoch": 0.9384, + "grad_norm": 0.5991736751608101, + "learning_rate": 1.9851977203654835e-06, + "loss": 0.8437, + "step": 1173 + }, + { + "epoch": 0.9392, + "grad_norm": 0.5354023370634079, + "learning_rate": 1.9341347613579087e-06, + "loss": 0.7529, + "step": 1174 + }, + { + "epoch": 0.94, + "grad_norm": 0.6862494322593848, + "learning_rate": 1.8837306911529184e-06, + "loss": 0.8039, + "step": 1175 + }, + { + "epoch": 0.9408, + "grad_norm": 0.4778786680609419, + "learning_rate": 1.8339858484073935e-06, + "loss": 0.6964, + "step": 1176 + }, + { + "epoch": 0.9416, + "grad_norm": 0.49107623760081326, + "learning_rate": 1.7849005673489127e-06, + "loss": 0.7291, + "step": 1177 + }, + { + "epoch": 0.9424, + "grad_norm": 0.4364131865010124, + "learning_rate": 1.7364751777736332e-06, + "loss": 0.6781, + "step": 1178 + }, + { + "epoch": 0.9432, + "grad_norm": 0.413089179648038, + "learning_rate": 1.6887100050439587e-06, + "loss": 0.6462, + "step": 1179 + }, + { + "epoch": 0.944, + "grad_norm": 0.44454951240419877, + "learning_rate": 1.6416053700863964e-06, + "loss": 0.6908, + "step": 1180 + }, + { + "epoch": 0.9448, + "grad_norm": 0.5037902192703654, + "learning_rate": 1.595161589389449e-06, + "loss": 0.7678, + "step": 1181 + }, + { + "epoch": 0.9456, + "grad_norm": 0.3803449954705906, + "learning_rate": 1.5493789750014031e-06, + "loss": 0.6427, + "step": 1182 + }, + { + "epoch": 0.9464, + "grad_norm": 0.5063374888963099, + "learning_rate": 1.5042578345283108e-06, + "loss": 0.7234, + "step": 1183 + }, + { + "epoch": 0.9472, + "grad_norm": 0.4950748093427182, + "learning_rate": 1.459798471131868e-06, + "loss": 0.7362, + "step": 1184 + }, + { + "epoch": 0.948, + "grad_norm": 0.4298150720578157, + "learning_rate": 1.4160011835273934e-06, + "loss": 0.6113, + "step": 1185 + }, + { + "epoch": 0.9488, + "grad_norm": 0.4484073192872167, + "learning_rate": 1.3728662659818204e-06, + "loss": 0.702, + "step": 1186 + }, + { + "epoch": 0.9496, + "grad_norm": 0.48076142652847925, + "learning_rate": 1.3303940083117527e-06, + "loss": 0.6509, + "step": 1187 + }, + { + "epoch": 0.9504, + "grad_norm": 0.5815289608293004, + "learning_rate": 1.2885846958814673e-06, + "loss": 0.7593, + "step": 1188 + }, + { + "epoch": 0.9512, + "grad_norm": 0.47033738739691816, + "learning_rate": 1.2474386096010039e-06, + "loss": 0.7879, + "step": 1189 + }, + { + "epoch": 0.952, + "grad_norm": 0.4095355712885498, + "learning_rate": 1.2069560259243328e-06, + "loss": 0.6214, + "step": 1190 + }, + { + "epoch": 0.9528, + "grad_norm": 0.5139860956680408, + "learning_rate": 1.1671372168474138e-06, + "loss": 0.7745, + "step": 1191 + }, + { + "epoch": 0.9536, + "grad_norm": 0.5828420082000267, + "learning_rate": 1.1279824499064396e-06, + "loss": 0.7234, + "step": 1192 + }, + { + "epoch": 0.9544, + "grad_norm": 0.45752920918524914, + "learning_rate": 1.089491988176017e-06, + "loss": 0.7287, + "step": 1193 + }, + { + "epoch": 0.9552, + "grad_norm": 0.4996228279183593, + "learning_rate": 1.0516660902673448e-06, + "loss": 0.722, + "step": 1194 + }, + { + "epoch": 0.956, + "grad_norm": 0.4128316143700129, + "learning_rate": 1.014505010326583e-06, + "loss": 0.6079, + "step": 1195 + }, + { + "epoch": 0.9568, + "grad_norm": 0.6235720276309589, + "learning_rate": 9.780089980330642e-07, + "loss": 0.7944, + "step": 1196 + }, + { + "epoch": 0.9576, + "grad_norm": 0.5324177522890242, + "learning_rate": 9.421782985976068e-07, + "loss": 0.7151, + "step": 1197 + }, + { + "epoch": 0.9584, + "grad_norm": 0.4373834013581542, + "learning_rate": 9.070131527609604e-07, + "loss": 0.6744, + "step": 1198 + }, + { + "epoch": 0.9592, + "grad_norm": 0.48830621150963666, + "learning_rate": 8.725137967920738e-07, + "loss": 0.7439, + "step": 1199 + }, + { + "epoch": 0.96, + "grad_norm": 0.5172978854452933, + "learning_rate": 8.386804624865851e-07, + "loss": 0.7225, + "step": 1200 + }, + { + "epoch": 0.9608, + "grad_norm": 0.5055221153296886, + "learning_rate": 8.055133771652345e-07, + "loss": 0.7656, + "step": 1201 + }, + { + "epoch": 0.9616, + "grad_norm": 0.5802956841191637, + "learning_rate": 7.730127636723539e-07, + "loss": 0.8947, + "step": 1202 + }, + { + "epoch": 0.9624, + "grad_norm": 0.5737272556988163, + "learning_rate": 7.411788403743237e-07, + "loss": 0.76, + "step": 1203 + }, + { + "epoch": 0.9632, + "grad_norm": 0.43047610633564, + "learning_rate": 7.100118211581852e-07, + "loss": 0.7098, + "step": 1204 + }, + { + "epoch": 0.964, + "grad_norm": 0.49059844294427335, + "learning_rate": 6.7951191543012e-07, + "loss": 0.7956, + "step": 1205 + }, + { + "epoch": 0.9648, + "grad_norm": 0.45196699568062104, + "learning_rate": 6.496793281141056e-07, + "loss": 0.7491, + "step": 1206 + }, + { + "epoch": 0.9656, + "grad_norm": 0.460241222532305, + "learning_rate": 6.205142596505176e-07, + "loss": 0.7106, + "step": 1207 + }, + { + "epoch": 0.9664, + "grad_norm": 0.3593871411948802, + "learning_rate": 5.920169059947411e-07, + "loss": 0.6234, + "step": 1208 + }, + { + "epoch": 0.9672, + "grad_norm": 0.7025232576691512, + "learning_rate": 5.64187458615939e-07, + "loss": 0.8086, + "step": 1209 + }, + { + "epoch": 0.968, + "grad_norm": 0.5658939556876554, + "learning_rate": 5.370261044956971e-07, + "loss": 0.7155, + "step": 1210 + }, + { + "epoch": 0.9688, + "grad_norm": 0.48882639524211474, + "learning_rate": 5.105330261267916e-07, + "loss": 0.7347, + "step": 1211 + }, + { + "epoch": 0.9696, + "grad_norm": 0.4601932878135786, + "learning_rate": 4.847084015119574e-07, + "loss": 0.7215, + "step": 1212 + }, + { + "epoch": 0.9704, + "grad_norm": 0.4529043738632631, + "learning_rate": 4.5955240416271084e-07, + "loss": 0.6856, + "step": 1213 + }, + { + "epoch": 0.9712, + "grad_norm": 0.501067794848288, + "learning_rate": 4.3506520309813947e-07, + "loss": 0.664, + "step": 1214 + }, + { + "epoch": 0.972, + "grad_norm": 0.679684097281984, + "learning_rate": 4.112469628438365e-07, + "loss": 0.8777, + "step": 1215 + }, + { + "epoch": 0.9728, + "grad_norm": 0.49247108010902707, + "learning_rate": 3.8809784343072366e-07, + "loss": 0.646, + "step": 1216 + }, + { + "epoch": 0.9736, + "grad_norm": 0.7252832657307932, + "learning_rate": 3.6561800039403016e-07, + "loss": 0.8622, + "step": 1217 + }, + { + "epoch": 0.9744, + "grad_norm": 0.5300950176234022, + "learning_rate": 3.4380758477219333e-07, + "loss": 0.8567, + "step": 1218 + }, + { + "epoch": 0.9752, + "grad_norm": 0.8460453494721898, + "learning_rate": 3.2266674310589273e-07, + "loss": 0.6174, + "step": 1219 + }, + { + "epoch": 0.976, + "grad_norm": 0.4996024193406382, + "learning_rate": 3.0219561743707326e-07, + "loss": 0.8069, + "step": 1220 + }, + { + "epoch": 0.9768, + "grad_norm": 0.4664523979748622, + "learning_rate": 2.8239434530792365e-07, + "loss": 0.7181, + "step": 1221 + }, + { + "epoch": 0.9776, + "grad_norm": 0.47115617791424774, + "learning_rate": 2.6326305976001055e-07, + "loss": 0.6722, + "step": 1222 + }, + { + "epoch": 0.9784, + "grad_norm": 0.4420849651630586, + "learning_rate": 2.448018893333681e-07, + "loss": 0.719, + "step": 1223 + }, + { + "epoch": 0.9792, + "grad_norm": 0.48151919354073947, + "learning_rate": 2.2701095806565432e-07, + "loss": 0.7406, + "step": 1224 + }, + { + "epoch": 0.98, + "grad_norm": 0.4976385289428759, + "learning_rate": 2.098903854912515e-07, + "loss": 0.808, + "step": 1225 + }, + { + "epoch": 0.9808, + "grad_norm": 0.48447004710712116, + "learning_rate": 1.9344028664056713e-07, + "loss": 0.7694, + "step": 1226 + }, + { + "epoch": 0.9816, + "grad_norm": 0.4476502108264484, + "learning_rate": 1.7766077203915655e-07, + "loss": 0.7376, + "step": 1227 + }, + { + "epoch": 0.9824, + "grad_norm": 0.5955771747704717, + "learning_rate": 1.6255194770704586e-07, + "loss": 0.9274, + "step": 1228 + }, + { + "epoch": 0.9832, + "grad_norm": 0.4079829047595218, + "learning_rate": 1.481139151579991e-07, + "loss": 0.7124, + "step": 1229 + }, + { + "epoch": 0.984, + "grad_norm": 0.5028524878332806, + "learning_rate": 1.3434677139885222e-07, + "loss": 0.728, + "step": 1230 + }, + { + "epoch": 0.9848, + "grad_norm": 0.4703890920466158, + "learning_rate": 1.2125060892881346e-07, + "loss": 0.6841, + "step": 1231 + }, + { + "epoch": 0.9856, + "grad_norm": 0.48867952784400126, + "learning_rate": 1.0882551573891953e-07, + "loss": 0.7001, + "step": 1232 + }, + { + "epoch": 0.9864, + "grad_norm": 0.5451033805868799, + "learning_rate": 9.707157531134713e-08, + "loss": 0.7471, + "step": 1233 + }, + { + "epoch": 0.9872, + "grad_norm": 0.6460001731465164, + "learning_rate": 8.598886661895788e-08, + "loss": 0.7885, + "step": 1234 + }, + { + "epoch": 0.988, + "grad_norm": 0.45891982107820856, + "learning_rate": 7.557746412468758e-08, + "loss": 0.6772, + "step": 1235 + }, + { + "epoch": 0.9888, + "grad_norm": 0.4222374108765135, + "learning_rate": 6.583743778106887e-08, + "loss": 0.6913, + "step": 1236 + }, + { + "epoch": 0.9896, + "grad_norm": 0.47249096967220855, + "learning_rate": 5.6768853029787184e-08, + "loss": 0.7293, + "step": 1237 + }, + { + "epoch": 0.9904, + "grad_norm": 0.6296538495733958, + "learning_rate": 4.837177080119215e-08, + "loss": 0.7455, + "step": 1238 + }, + { + "epoch": 0.9912, + "grad_norm": 0.5041542749156558, + "learning_rate": 4.064624751394242e-08, + "loss": 0.6603, + "step": 1239 + }, + { + "epoch": 0.992, + "grad_norm": 0.4879198104782223, + "learning_rate": 3.359233507459481e-08, + "loss": 0.7456, + "step": 1240 + }, + { + "epoch": 0.9928, + "grad_norm": 0.533299422297126, + "learning_rate": 2.7210080877237976e-08, + "loss": 0.6959, + "step": 1241 + }, + { + "epoch": 0.9936, + "grad_norm": 0.42419472398628383, + "learning_rate": 2.1499527803214846e-08, + "loss": 0.6748, + "step": 1242 + }, + { + "epoch": 0.9944, + "grad_norm": 0.5388702644620558, + "learning_rate": 1.646071422083395e-08, + "loss": 0.7351, + "step": 1243 + }, + { + "epoch": 0.9952, + "grad_norm": 0.45194480277494764, + "learning_rate": 1.209367398504746e-08, + "loss": 0.7659, + "step": 1244 + }, + { + "epoch": 0.996, + "grad_norm": 0.5266213337050553, + "learning_rate": 8.398436437317969e-09, + "loss": 0.7255, + "step": 1245 + }, + { + "epoch": 0.9968, + "grad_norm": 0.4521052692283909, + "learning_rate": 5.375026405352035e-09, + "loss": 0.6672, + "step": 1246 + }, + { + "epoch": 0.9976, + "grad_norm": 0.4367416807703453, + "learning_rate": 3.023464202944748e-09, + "loss": 0.6763, + "step": 1247 + }, + { + "epoch": 0.9984, + "grad_norm": 0.5425671735354565, + "learning_rate": 1.3437656298687097e-09, + "loss": 0.8357, + "step": 1248 + }, + { + "epoch": 0.9992, + "grad_norm": 0.39197805688782217, + "learning_rate": 3.3594197175190745e-10, + "loss": 0.6252, + "step": 1249 + }, + { + "epoch": 1.0, + "grad_norm": 0.636152187807064, + "learning_rate": 0.0, + "loss": 0.7529, + "step": 1250 + }, + { + "epoch": 1.0, + "step": 1250, + "total_flos": 1013983240585216.0, + "train_loss": 0.8066569664001465, + "train_runtime": 18635.1825, + "train_samples_per_second": 1.073, + "train_steps_per_second": 0.067 + } + ], + "logging_steps": 1.0, + "max_steps": 1250, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1013983240585216.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_2_epochs_1_GA_2_lora/README.md b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_2_epochs_1_GA_2_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_2_epochs_1_GA_2_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_2_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_2_epochs_1_GA_2_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a94241f4cbc407ab7bd263a285354f3ced348326 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_2_epochs_1_GA_2_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "q_proj", + "down_proj", + "up_proj", + "o_proj", + "v_proj", + "gate_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1d1a624e9e1e573417e2e92c0582dc8619a5d9bf --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:222d6020a814fe4a9d38c7187c52bcc5c8fab80c418dd58f37f19789d5d3258e +size 671150064 diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_2_epochs_1_GA_2_lora/config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_2_epochs_1_GA_2_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_2_epochs_1_GA_2_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..6abfdc94db754ee2253717394fccd92e378a5293 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48d577f56281a55d9ebce8783da6a9f998306862aa87ce27b012361b7d4c3a07 +size 918507402 diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_2_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_2_epochs_1_GA_2_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..40a4398bb2113ddb8bdc3b4d774db352dfc0b074 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_2_epochs_1_GA_2_lora/trainer_state.json @@ -0,0 +1,8792 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 1250, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0008, + "grad_norm": 0.8986589092020809, + "learning_rate": 5.263157894736842e-06, + "loss": 1.305, + "step": 1 + }, + { + "epoch": 0.0016, + "grad_norm": 0.830293875624169, + "learning_rate": 1.0526315789473684e-05, + "loss": 1.0948, + "step": 2 + }, + { + "epoch": 0.0024, + "grad_norm": 1.0859176979111835, + "learning_rate": 1.5789473684210526e-05, + "loss": 1.4743, + "step": 3 + }, + { + "epoch": 0.0032, + "grad_norm": 0.8491307507912119, + "learning_rate": 2.105263157894737e-05, + "loss": 1.2398, + "step": 4 + }, + { + "epoch": 0.004, + "grad_norm": 0.821827783074679, + "learning_rate": 2.6315789473684212e-05, + "loss": 1.3072, + "step": 5 + }, + { + "epoch": 0.0048, + "grad_norm": 0.824226467468147, + "learning_rate": 3.157894736842105e-05, + "loss": 1.2926, + "step": 6 + }, + { + "epoch": 0.0056, + "grad_norm": 0.7462316432208581, + "learning_rate": 3.6842105263157895e-05, + "loss": 1.1826, + "step": 7 + }, + { + "epoch": 0.0064, + "grad_norm": 0.9694515614371708, + "learning_rate": 4.210526315789474e-05, + "loss": 1.329, + "step": 8 + }, + { + "epoch": 0.0072, + "grad_norm": 1.355360401054164, + "learning_rate": 4.736842105263158e-05, + "loss": 1.2415, + "step": 9 + }, + { + "epoch": 0.008, + "grad_norm": 0.7877105211597762, + "learning_rate": 5.2631578947368424e-05, + "loss": 1.0667, + "step": 10 + }, + { + "epoch": 0.0088, + "grad_norm": 1.1658824689297784, + "learning_rate": 5.789473684210527e-05, + "loss": 1.2091, + "step": 11 + }, + { + "epoch": 0.0096, + "grad_norm": 0.9327083427649957, + "learning_rate": 6.31578947368421e-05, + "loss": 1.144, + "step": 12 + }, + { + "epoch": 0.0104, + "grad_norm": 0.8541289719054126, + "learning_rate": 6.842105263157895e-05, + "loss": 1.0759, + "step": 13 + }, + { + "epoch": 0.0112, + "grad_norm": 0.7901468926109594, + "learning_rate": 7.368421052631579e-05, + "loss": 1.1089, + "step": 14 + }, + { + "epoch": 0.012, + "grad_norm": 0.9870845593100134, + "learning_rate": 7.894736842105263e-05, + "loss": 1.1508, + "step": 15 + }, + { + "epoch": 0.0128, + "grad_norm": 0.8366002733618001, + "learning_rate": 8.421052631578948e-05, + "loss": 0.9787, + "step": 16 + }, + { + "epoch": 0.0136, + "grad_norm": 0.5647308493323178, + "learning_rate": 8.947368421052632e-05, + "loss": 0.8868, + "step": 17 + }, + { + "epoch": 0.0144, + "grad_norm": 0.5265349634968662, + "learning_rate": 9.473684210526316e-05, + "loss": 0.9564, + "step": 18 + }, + { + "epoch": 0.0152, + "grad_norm": 0.6688334553966441, + "learning_rate": 0.0001, + "loss": 0.9151, + "step": 19 + }, + { + "epoch": 0.016, + "grad_norm": 0.8331128891603933, + "learning_rate": 0.00010526315789473685, + "loss": 1.0021, + "step": 20 + }, + { + "epoch": 0.0168, + "grad_norm": 0.6671520676918071, + "learning_rate": 0.0001105263157894737, + "loss": 1.0317, + "step": 21 + }, + { + "epoch": 0.0176, + "grad_norm": 0.5400337392126402, + "learning_rate": 0.00011578947368421053, + "loss": 0.9324, + "step": 22 + }, + { + "epoch": 0.0184, + "grad_norm": 0.5516519660957107, + "learning_rate": 0.00012105263157894738, + "loss": 0.8616, + "step": 23 + }, + { + "epoch": 0.0192, + "grad_norm": 0.6051849477286368, + "learning_rate": 0.0001263157894736842, + "loss": 0.9653, + "step": 24 + }, + { + "epoch": 0.02, + "grad_norm": 0.5220444727960437, + "learning_rate": 0.00013157894736842108, + "loss": 0.8718, + "step": 25 + }, + { + "epoch": 0.0208, + "grad_norm": 0.6123955645188963, + "learning_rate": 0.0001368421052631579, + "loss": 0.9204, + "step": 26 + }, + { + "epoch": 0.0216, + "grad_norm": 0.739824439009589, + "learning_rate": 0.00014210526315789474, + "loss": 0.9997, + "step": 27 + }, + { + "epoch": 0.0224, + "grad_norm": 1.0035094814749785, + "learning_rate": 0.00014736842105263158, + "loss": 1.102, + "step": 28 + }, + { + "epoch": 0.0232, + "grad_norm": 0.49200935834599646, + "learning_rate": 0.00015263157894736845, + "loss": 0.8157, + "step": 29 + }, + { + "epoch": 0.024, + "grad_norm": 0.5254439825147661, + "learning_rate": 0.00015789473684210527, + "loss": 0.8688, + "step": 30 + }, + { + "epoch": 0.0248, + "grad_norm": 0.7579956292478108, + "learning_rate": 0.0001631578947368421, + "loss": 0.9638, + "step": 31 + }, + { + "epoch": 0.0256, + "grad_norm": 0.5490449797222368, + "learning_rate": 0.00016842105263157895, + "loss": 0.9614, + "step": 32 + }, + { + "epoch": 0.0264, + "grad_norm": 0.7159607383012427, + "learning_rate": 0.0001736842105263158, + "loss": 1.1859, + "step": 33 + }, + { + "epoch": 0.0272, + "grad_norm": 0.5031259092114513, + "learning_rate": 0.00017894736842105264, + "loss": 0.8207, + "step": 34 + }, + { + "epoch": 0.028, + "grad_norm": 0.6530032773796935, + "learning_rate": 0.00018421052631578948, + "loss": 0.9945, + "step": 35 + }, + { + "epoch": 0.0288, + "grad_norm": 0.5311755174792404, + "learning_rate": 0.00018947368421052632, + "loss": 0.8765, + "step": 36 + }, + { + "epoch": 0.0296, + "grad_norm": 0.5265268927327297, + "learning_rate": 0.00019473684210526317, + "loss": 0.8142, + "step": 37 + }, + { + "epoch": 0.0304, + "grad_norm": 0.6594978793784297, + "learning_rate": 0.0002, + "loss": 0.9188, + "step": 38 + }, + { + "epoch": 0.0312, + "grad_norm": 0.6108156444196295, + "learning_rate": 0.00019999966405802826, + "loss": 0.9996, + "step": 39 + }, + { + "epoch": 0.032, + "grad_norm": 0.5207524633266147, + "learning_rate": 0.00019999865623437013, + "loss": 0.8505, + "step": 40 + }, + { + "epoch": 0.0328, + "grad_norm": 0.5945794453910999, + "learning_rate": 0.00019999697653579705, + "loss": 0.8842, + "step": 41 + }, + { + "epoch": 0.0336, + "grad_norm": 0.6835069109971997, + "learning_rate": 0.00019999462497359466, + "loss": 1.0261, + "step": 42 + }, + { + "epoch": 0.0344, + "grad_norm": 0.5822115055269951, + "learning_rate": 0.0001999916015635627, + "loss": 0.8708, + "step": 43 + }, + { + "epoch": 0.0352, + "grad_norm": 0.5409224848399561, + "learning_rate": 0.00019998790632601496, + "loss": 0.8429, + "step": 44 + }, + { + "epoch": 0.036, + "grad_norm": 0.6552700099296889, + "learning_rate": 0.00019998353928577919, + "loss": 1.0053, + "step": 45 + }, + { + "epoch": 0.0368, + "grad_norm": 0.6259818352901533, + "learning_rate": 0.0001999785004721968, + "loss": 1.0065, + "step": 46 + }, + { + "epoch": 0.0376, + "grad_norm": 0.5816629608848023, + "learning_rate": 0.0001999727899191228, + "loss": 0.9192, + "step": 47 + }, + { + "epoch": 0.0384, + "grad_norm": 0.5455571314604929, + "learning_rate": 0.00019996640766492543, + "loss": 0.9025, + "step": 48 + }, + { + "epoch": 0.0392, + "grad_norm": 0.5678393942881697, + "learning_rate": 0.00019995935375248606, + "loss": 1.0135, + "step": 49 + }, + { + "epoch": 0.04, + "grad_norm": 0.583023033404638, + "learning_rate": 0.00019995162822919883, + "loss": 0.9791, + "step": 50 + }, + { + "epoch": 0.0408, + "grad_norm": 0.5607331923054734, + "learning_rate": 0.00019994323114697022, + "loss": 0.8702, + "step": 51 + }, + { + "epoch": 0.0416, + "grad_norm": 0.5631397290394965, + "learning_rate": 0.00019993416256221895, + "loss": 0.9171, + "step": 52 + }, + { + "epoch": 0.0424, + "grad_norm": 0.4958191335673494, + "learning_rate": 0.0001999244225358753, + "loss": 0.7935, + "step": 53 + }, + { + "epoch": 0.0432, + "grad_norm": 0.6636439021012339, + "learning_rate": 0.00019991401113338104, + "loss": 0.8472, + "step": 54 + }, + { + "epoch": 0.044, + "grad_norm": 0.6049671251396802, + "learning_rate": 0.00019990292842468868, + "loss": 0.9366, + "step": 55 + }, + { + "epoch": 0.0448, + "grad_norm": 0.6812943564570575, + "learning_rate": 0.00019989117448426108, + "loss": 1.0725, + "step": 56 + }, + { + "epoch": 0.0456, + "grad_norm": 0.5778111405788794, + "learning_rate": 0.0001998787493910712, + "loss": 0.8981, + "step": 57 + }, + { + "epoch": 0.0464, + "grad_norm": 0.5423108759849826, + "learning_rate": 0.00019986565322860115, + "loss": 0.8489, + "step": 58 + }, + { + "epoch": 0.0472, + "grad_norm": 0.6029690474490851, + "learning_rate": 0.000199851886084842, + "loss": 0.92, + "step": 59 + }, + { + "epoch": 0.048, + "grad_norm": 0.527992273229674, + "learning_rate": 0.00019983744805229296, + "loss": 0.7874, + "step": 60 + }, + { + "epoch": 0.0488, + "grad_norm": 0.5350764937310879, + "learning_rate": 0.00019982233922796085, + "loss": 0.9199, + "step": 61 + }, + { + "epoch": 0.0496, + "grad_norm": 0.5796700937405846, + "learning_rate": 0.00019980655971335945, + "loss": 0.8645, + "step": 62 + }, + { + "epoch": 0.0504, + "grad_norm": 0.5215364379687121, + "learning_rate": 0.00019979010961450878, + "loss": 0.8547, + "step": 63 + }, + { + "epoch": 0.0512, + "grad_norm": 0.6062129459008473, + "learning_rate": 0.00019977298904193437, + "loss": 0.8867, + "step": 64 + }, + { + "epoch": 0.052, + "grad_norm": 0.5605748343295668, + "learning_rate": 0.00019975519811066663, + "loss": 0.8111, + "step": 65 + }, + { + "epoch": 0.0528, + "grad_norm": 0.576459556463628, + "learning_rate": 0.00019973673694024, + "loss": 0.8717, + "step": 66 + }, + { + "epoch": 0.0536, + "grad_norm": 0.5324349668538816, + "learning_rate": 0.0001997176056546921, + "loss": 0.8282, + "step": 67 + }, + { + "epoch": 0.0544, + "grad_norm": 0.5138522744442616, + "learning_rate": 0.00019969780438256293, + "loss": 0.8646, + "step": 68 + }, + { + "epoch": 0.0552, + "grad_norm": 0.5010041056712672, + "learning_rate": 0.0001996773332568941, + "loss": 0.812, + "step": 69 + }, + { + "epoch": 0.056, + "grad_norm": 0.7886740692553563, + "learning_rate": 0.0001996561924152278, + "loss": 1.0595, + "step": 70 + }, + { + "epoch": 0.0568, + "grad_norm": 0.5457095602344805, + "learning_rate": 0.00019963438199960599, + "loss": 0.8222, + "step": 71 + }, + { + "epoch": 0.0576, + "grad_norm": 0.49123538118979226, + "learning_rate": 0.0001996119021565693, + "loss": 0.7757, + "step": 72 + }, + { + "epoch": 0.0584, + "grad_norm": 0.5115269662398529, + "learning_rate": 0.00019958875303715615, + "loss": 0.9046, + "step": 73 + }, + { + "epoch": 0.0592, + "grad_norm": 0.6578445782195319, + "learning_rate": 0.0001995649347969019, + "loss": 0.9942, + "step": 74 + }, + { + "epoch": 0.06, + "grad_norm": 0.5954139274878782, + "learning_rate": 0.0001995404475958373, + "loss": 0.903, + "step": 75 + }, + { + "epoch": 0.0608, + "grad_norm": 0.618133330783266, + "learning_rate": 0.00019951529159848805, + "loss": 0.9027, + "step": 76 + }, + { + "epoch": 0.0616, + "grad_norm": 0.5209779171701726, + "learning_rate": 0.0001994894669738732, + "loss": 0.8011, + "step": 77 + }, + { + "epoch": 0.0624, + "grad_norm": 0.5721584497274415, + "learning_rate": 0.00019946297389550433, + "loss": 0.8403, + "step": 78 + }, + { + "epoch": 0.0632, + "grad_norm": 0.45441592205501147, + "learning_rate": 0.0001994358125413841, + "loss": 0.7716, + "step": 79 + }, + { + "epoch": 0.064, + "grad_norm": 0.5903525833772922, + "learning_rate": 0.00019940798309400526, + "loss": 0.8783, + "step": 80 + }, + { + "epoch": 0.0648, + "grad_norm": 0.5303726745446902, + "learning_rate": 0.0001993794857403495, + "loss": 0.8185, + "step": 81 + }, + { + "epoch": 0.0656, + "grad_norm": 0.772029898870653, + "learning_rate": 0.0001993503206718859, + "loss": 0.8899, + "step": 82 + }, + { + "epoch": 0.0664, + "grad_norm": 0.641589294357296, + "learning_rate": 0.0001993204880845699, + "loss": 0.9059, + "step": 83 + }, + { + "epoch": 0.0672, + "grad_norm": 0.6217613426147984, + "learning_rate": 0.00019928998817884182, + "loss": 0.9168, + "step": 84 + }, + { + "epoch": 0.068, + "grad_norm": 0.447177478722229, + "learning_rate": 0.00019925882115962568, + "loss": 0.809, + "step": 85 + }, + { + "epoch": 0.0688, + "grad_norm": 0.552806211756663, + "learning_rate": 0.00019922698723632767, + "loss": 0.9089, + "step": 86 + }, + { + "epoch": 0.0696, + "grad_norm": 0.464126069407049, + "learning_rate": 0.00019919448662283478, + "loss": 0.8101, + "step": 87 + }, + { + "epoch": 0.0704, + "grad_norm": 0.6040238156148237, + "learning_rate": 0.00019916131953751342, + "loss": 0.9611, + "step": 88 + }, + { + "epoch": 0.0712, + "grad_norm": 0.6338051158624403, + "learning_rate": 0.00019912748620320794, + "loss": 0.9381, + "step": 89 + }, + { + "epoch": 0.072, + "grad_norm": 0.5623887909587395, + "learning_rate": 0.00019909298684723904, + "loss": 0.8903, + "step": 90 + }, + { + "epoch": 0.0728, + "grad_norm": 0.5592043890991619, + "learning_rate": 0.00019905782170140238, + "loss": 0.9391, + "step": 91 + }, + { + "epoch": 0.0736, + "grad_norm": 0.4635024578463063, + "learning_rate": 0.00019902199100196697, + "loss": 0.8363, + "step": 92 + }, + { + "epoch": 0.0744, + "grad_norm": 0.5898751617245477, + "learning_rate": 0.00019898549498967343, + "loss": 0.9501, + "step": 93 + }, + { + "epoch": 0.0752, + "grad_norm": 0.5054290458812549, + "learning_rate": 0.00019894833390973266, + "loss": 0.8782, + "step": 94 + }, + { + "epoch": 0.076, + "grad_norm": 0.6134646974943796, + "learning_rate": 0.000198910508011824, + "loss": 0.9037, + "step": 95 + }, + { + "epoch": 0.0768, + "grad_norm": 0.5717677399296033, + "learning_rate": 0.00019887201755009357, + "loss": 0.9824, + "step": 96 + }, + { + "epoch": 0.0776, + "grad_norm": 0.6370817907582318, + "learning_rate": 0.00019883286278315262, + "loss": 0.804, + "step": 97 + }, + { + "epoch": 0.0784, + "grad_norm": 0.6284576519885868, + "learning_rate": 0.0001987930439740757, + "loss": 0.9213, + "step": 98 + }, + { + "epoch": 0.0792, + "grad_norm": 0.5832943669530816, + "learning_rate": 0.00019875256139039902, + "loss": 0.9025, + "step": 99 + }, + { + "epoch": 0.08, + "grad_norm": 0.6280556178077573, + "learning_rate": 0.00019871141530411853, + "loss": 0.8009, + "step": 100 + }, + { + "epoch": 0.0808, + "grad_norm": 0.6784617300670089, + "learning_rate": 0.00019866960599168826, + "loss": 0.9593, + "step": 101 + }, + { + "epoch": 0.0816, + "grad_norm": 0.5699631314098912, + "learning_rate": 0.0001986271337340182, + "loss": 0.9014, + "step": 102 + }, + { + "epoch": 0.0824, + "grad_norm": 0.5005466864303845, + "learning_rate": 0.0001985839988164726, + "loss": 0.8087, + "step": 103 + }, + { + "epoch": 0.0832, + "grad_norm": 0.5819193260487425, + "learning_rate": 0.00019854020152886814, + "loss": 0.9133, + "step": 104 + }, + { + "epoch": 0.084, + "grad_norm": 0.7866528561879383, + "learning_rate": 0.00019849574216547171, + "loss": 1.0283, + "step": 105 + }, + { + "epoch": 0.0848, + "grad_norm": 0.5642528780685404, + "learning_rate": 0.0001984506210249986, + "loss": 0.8159, + "step": 106 + }, + { + "epoch": 0.0856, + "grad_norm": 0.6208922393037105, + "learning_rate": 0.00019840483841061058, + "loss": 0.9366, + "step": 107 + }, + { + "epoch": 0.0864, + "grad_norm": 0.675109352046293, + "learning_rate": 0.00019835839462991361, + "loss": 0.9294, + "step": 108 + }, + { + "epoch": 0.0872, + "grad_norm": 0.641474898133526, + "learning_rate": 0.00019831128999495606, + "loss": 0.925, + "step": 109 + }, + { + "epoch": 0.088, + "grad_norm": 0.8968056261287591, + "learning_rate": 0.00019826352482222638, + "loss": 0.9321, + "step": 110 + }, + { + "epoch": 0.0888, + "grad_norm": 0.6247995474162983, + "learning_rate": 0.0001982150994326511, + "loss": 0.9779, + "step": 111 + }, + { + "epoch": 0.0896, + "grad_norm": 0.5600887864975691, + "learning_rate": 0.00019816601415159263, + "loss": 0.9847, + "step": 112 + }, + { + "epoch": 0.0904, + "grad_norm": 0.521242776088519, + "learning_rate": 0.0001981162693088471, + "loss": 0.8781, + "step": 113 + }, + { + "epoch": 0.0912, + "grad_norm": 0.6125366480896934, + "learning_rate": 0.0001980658652386421, + "loss": 0.8999, + "step": 114 + }, + { + "epoch": 0.092, + "grad_norm": 0.6533464104827684, + "learning_rate": 0.0001980148022796345, + "loss": 0.864, + "step": 115 + }, + { + "epoch": 0.0928, + "grad_norm": 0.552235681308966, + "learning_rate": 0.00019796308077490817, + "loss": 0.9333, + "step": 116 + }, + { + "epoch": 0.0936, + "grad_norm": 0.4431325972855391, + "learning_rate": 0.00019791070107197153, + "loss": 0.8051, + "step": 117 + }, + { + "epoch": 0.0944, + "grad_norm": 0.5453940866305156, + "learning_rate": 0.00019785766352275542, + "loss": 0.9783, + "step": 118 + }, + { + "epoch": 0.0952, + "grad_norm": 0.48508766719121615, + "learning_rate": 0.0001978039684836106, + "loss": 0.7914, + "step": 119 + }, + { + "epoch": 0.096, + "grad_norm": 0.5031532876965941, + "learning_rate": 0.00019774961631530545, + "loss": 0.8062, + "step": 120 + }, + { + "epoch": 0.0968, + "grad_norm": 0.580797968287359, + "learning_rate": 0.0001976946073830234, + "loss": 0.8684, + "step": 121 + }, + { + "epoch": 0.0976, + "grad_norm": 0.49221239109286524, + "learning_rate": 0.00019763894205636072, + "loss": 0.9073, + "step": 122 + }, + { + "epoch": 0.0984, + "grad_norm": 0.5681710105726208, + "learning_rate": 0.00019758262070932375, + "loss": 0.8279, + "step": 123 + }, + { + "epoch": 0.0992, + "grad_norm": 0.5221819706798567, + "learning_rate": 0.00019752564372032657, + "loss": 0.8291, + "step": 124 + }, + { + "epoch": 0.1, + "grad_norm": 0.5050274477718419, + "learning_rate": 0.00019746801147218842, + "loss": 0.8889, + "step": 125 + }, + { + "epoch": 0.1008, + "grad_norm": 0.7052424313505582, + "learning_rate": 0.00019740972435213115, + "loss": 0.9742, + "step": 126 + }, + { + "epoch": 0.1016, + "grad_norm": 0.5825164051491847, + "learning_rate": 0.00019735078275177654, + "loss": 0.9465, + "step": 127 + }, + { + "epoch": 0.1024, + "grad_norm": 0.7561470692223455, + "learning_rate": 0.00019729118706714375, + "loss": 1.0117, + "step": 128 + }, + { + "epoch": 0.1032, + "grad_norm": 0.5772282257061512, + "learning_rate": 0.00019723093769864663, + "loss": 0.7795, + "step": 129 + }, + { + "epoch": 0.104, + "grad_norm": 0.5297039548484948, + "learning_rate": 0.00019717003505109095, + "loss": 0.8911, + "step": 130 + }, + { + "epoch": 0.1048, + "grad_norm": 0.6347690410009654, + "learning_rate": 0.0001971084795336719, + "loss": 1.0346, + "step": 131 + }, + { + "epoch": 0.1056, + "grad_norm": 0.7027346281966333, + "learning_rate": 0.00019704627155997108, + "loss": 0.9865, + "step": 132 + }, + { + "epoch": 0.1064, + "grad_norm": 0.6417895499509113, + "learning_rate": 0.00019698341154795389, + "loss": 0.9555, + "step": 133 + }, + { + "epoch": 0.1072, + "grad_norm": 0.5397045126751981, + "learning_rate": 0.00019691989991996663, + "loss": 0.9009, + "step": 134 + }, + { + "epoch": 0.108, + "grad_norm": 0.8677659845405544, + "learning_rate": 0.00019685573710273376, + "loss": 0.9189, + "step": 135 + }, + { + "epoch": 0.1088, + "grad_norm": 0.7541996003157508, + "learning_rate": 0.0001967909235273549, + "loss": 1.0182, + "step": 136 + }, + { + "epoch": 0.1096, + "grad_norm": 0.4763386277168582, + "learning_rate": 0.00019672545962930215, + "loss": 0.7819, + "step": 137 + }, + { + "epoch": 0.1104, + "grad_norm": 0.601268655373242, + "learning_rate": 0.00019665934584841682, + "loss": 0.9141, + "step": 138 + }, + { + "epoch": 0.1112, + "grad_norm": 0.5119877362673206, + "learning_rate": 0.00019659258262890683, + "loss": 0.8581, + "step": 139 + }, + { + "epoch": 0.112, + "grad_norm": 0.5741522809403841, + "learning_rate": 0.00019652517041934356, + "loss": 0.8725, + "step": 140 + }, + { + "epoch": 0.1128, + "grad_norm": 0.5046784102658995, + "learning_rate": 0.00019645710967265882, + "loss": 0.8491, + "step": 141 + }, + { + "epoch": 0.1136, + "grad_norm": 0.7807045825377874, + "learning_rate": 0.00019638840084614182, + "loss": 0.8545, + "step": 142 + }, + { + "epoch": 0.1144, + "grad_norm": 0.5587935402064526, + "learning_rate": 0.00019631904440143612, + "loss": 0.9317, + "step": 143 + }, + { + "epoch": 0.1152, + "grad_norm": 0.5252305623416647, + "learning_rate": 0.00019624904080453655, + "loss": 0.9038, + "step": 144 + }, + { + "epoch": 0.116, + "grad_norm": 0.7861363392166195, + "learning_rate": 0.00019617839052578603, + "loss": 0.9387, + "step": 145 + }, + { + "epoch": 0.1168, + "grad_norm": 0.5612971617745024, + "learning_rate": 0.00019610709403987246, + "loss": 0.8797, + "step": 146 + }, + { + "epoch": 0.1176, + "grad_norm": 0.7073398481937953, + "learning_rate": 0.0001960351518258255, + "loss": 1.0086, + "step": 147 + }, + { + "epoch": 0.1184, + "grad_norm": 0.49866728031534496, + "learning_rate": 0.00019596256436701324, + "loss": 0.7951, + "step": 148 + }, + { + "epoch": 0.1192, + "grad_norm": 0.6099236862660113, + "learning_rate": 0.00019588933215113926, + "loss": 0.9616, + "step": 149 + }, + { + "epoch": 0.12, + "grad_norm": 0.6027224592471235, + "learning_rate": 0.000195815455670239, + "loss": 0.8058, + "step": 150 + }, + { + "epoch": 0.1208, + "grad_norm": 0.5781441720245145, + "learning_rate": 0.00019574093542067673, + "loss": 0.9271, + "step": 151 + }, + { + "epoch": 0.1216, + "grad_norm": 0.5813183201401009, + "learning_rate": 0.00019566577190314197, + "loss": 0.8407, + "step": 152 + }, + { + "epoch": 0.1224, + "grad_norm": 0.4872919673337898, + "learning_rate": 0.0001955899656226464, + "loss": 0.8195, + "step": 153 + }, + { + "epoch": 0.1232, + "grad_norm": 0.6324964991159663, + "learning_rate": 0.0001955135170885202, + "loss": 0.8517, + "step": 154 + }, + { + "epoch": 0.124, + "grad_norm": 0.588729212998381, + "learning_rate": 0.0001954364268144088, + "loss": 0.878, + "step": 155 + }, + { + "epoch": 0.1248, + "grad_norm": 0.6264412787638441, + "learning_rate": 0.00019535869531826937, + "loss": 0.8654, + "step": 156 + }, + { + "epoch": 0.1256, + "grad_norm": 0.57769459491213, + "learning_rate": 0.00019528032312236736, + "loss": 0.8709, + "step": 157 + }, + { + "epoch": 0.1264, + "grad_norm": 0.6357337602139279, + "learning_rate": 0.00019520131075327298, + "loss": 0.9951, + "step": 158 + }, + { + "epoch": 0.1272, + "grad_norm": 0.9163531621840607, + "learning_rate": 0.00019512165874185767, + "loss": 1.0036, + "step": 159 + }, + { + "epoch": 0.128, + "grad_norm": 0.541890790204263, + "learning_rate": 0.00019504136762329047, + "loss": 0.8461, + "step": 160 + }, + { + "epoch": 0.1288, + "grad_norm": 0.6106261496261902, + "learning_rate": 0.0001949604379370345, + "loss": 0.9491, + "step": 161 + }, + { + "epoch": 0.1296, + "grad_norm": 0.45323037858604226, + "learning_rate": 0.00019487887022684336, + "loss": 0.8413, + "step": 162 + }, + { + "epoch": 0.1304, + "grad_norm": 0.5103765969188921, + "learning_rate": 0.00019479666504075736, + "loss": 0.8609, + "step": 163 + }, + { + "epoch": 0.1312, + "grad_norm": 0.5510921340396624, + "learning_rate": 0.00019471382293110003, + "loss": 0.8158, + "step": 164 + }, + { + "epoch": 0.132, + "grad_norm": 0.6175356227388614, + "learning_rate": 0.0001946303444544741, + "loss": 1.0139, + "step": 165 + }, + { + "epoch": 0.1328, + "grad_norm": 0.6370307555105761, + "learning_rate": 0.00019454623017175812, + "loss": 0.8622, + "step": 166 + }, + { + "epoch": 0.1336, + "grad_norm": 0.6811171081420755, + "learning_rate": 0.00019446148064810242, + "loss": 0.9372, + "step": 167 + }, + { + "epoch": 0.1344, + "grad_norm": 0.5598562796566181, + "learning_rate": 0.00019437609645292546, + "loss": 0.8956, + "step": 168 + }, + { + "epoch": 0.1352, + "grad_norm": 0.4617016786400735, + "learning_rate": 0.00019429007815990993, + "loss": 0.8088, + "step": 169 + }, + { + "epoch": 0.136, + "grad_norm": 0.6102848905343905, + "learning_rate": 0.0001942034263469989, + "loss": 0.882, + "step": 170 + }, + { + "epoch": 0.1368, + "grad_norm": 0.6962296432424184, + "learning_rate": 0.00019411614159639204, + "loss": 0.9596, + "step": 171 + }, + { + "epoch": 0.1376, + "grad_norm": 0.5750923928215433, + "learning_rate": 0.00019402822449454153, + "loss": 0.8366, + "step": 172 + }, + { + "epoch": 0.1384, + "grad_norm": 0.6265326368878519, + "learning_rate": 0.00019393967563214833, + "loss": 0.7805, + "step": 173 + }, + { + "epoch": 0.1392, + "grad_norm": 0.5381935667615324, + "learning_rate": 0.00019385049560415794, + "loss": 0.847, + "step": 174 + }, + { + "epoch": 0.14, + "grad_norm": 0.47642701981179963, + "learning_rate": 0.00019376068500975667, + "loss": 0.7575, + "step": 175 + }, + { + "epoch": 0.1408, + "grad_norm": 0.5185766673031289, + "learning_rate": 0.00019367024445236754, + "loss": 0.7912, + "step": 176 + }, + { + "epoch": 0.1416, + "grad_norm": 0.5960095358549333, + "learning_rate": 0.000193579174539646, + "loss": 0.9236, + "step": 177 + }, + { + "epoch": 0.1424, + "grad_norm": 0.7084179815147453, + "learning_rate": 0.00019348747588347637, + "loss": 1.0528, + "step": 178 + }, + { + "epoch": 0.1432, + "grad_norm": 0.5365346049625459, + "learning_rate": 0.00019339514909996706, + "loss": 0.8705, + "step": 179 + }, + { + "epoch": 0.144, + "grad_norm": 0.44830967852358555, + "learning_rate": 0.00019330219480944694, + "loss": 0.8658, + "step": 180 + }, + { + "epoch": 0.1448, + "grad_norm": 0.5799425229989558, + "learning_rate": 0.00019320861363646095, + "loss": 0.8809, + "step": 181 + }, + { + "epoch": 0.1456, + "grad_norm": 0.5266534061420898, + "learning_rate": 0.00019311440620976597, + "loss": 0.8845, + "step": 182 + }, + { + "epoch": 0.1464, + "grad_norm": 0.49940366689432303, + "learning_rate": 0.00019301957316232658, + "loss": 0.8514, + "step": 183 + }, + { + "epoch": 0.1472, + "grad_norm": 0.5502261680204071, + "learning_rate": 0.0001929241151313108, + "loss": 0.9158, + "step": 184 + }, + { + "epoch": 0.148, + "grad_norm": 0.6852350642167058, + "learning_rate": 0.0001928280327580858, + "loss": 0.9311, + "step": 185 + }, + { + "epoch": 0.1488, + "grad_norm": 0.460573911082788, + "learning_rate": 0.00019273132668821364, + "loss": 0.7992, + "step": 186 + }, + { + "epoch": 0.1496, + "grad_norm": 0.5827491189696308, + "learning_rate": 0.00019263399757144683, + "loss": 0.8715, + "step": 187 + }, + { + "epoch": 0.1504, + "grad_norm": 0.47097245497372964, + "learning_rate": 0.00019253604606172417, + "loss": 0.8452, + "step": 188 + }, + { + "epoch": 0.1512, + "grad_norm": 0.6304041203761992, + "learning_rate": 0.000192437472817166, + "loss": 0.953, + "step": 189 + }, + { + "epoch": 0.152, + "grad_norm": 0.5043229786824451, + "learning_rate": 0.00019233827850007027, + "loss": 0.8501, + "step": 190 + }, + { + "epoch": 0.1528, + "grad_norm": 0.5534866676999771, + "learning_rate": 0.00019223846377690754, + "loss": 0.8797, + "step": 191 + }, + { + "epoch": 0.1536, + "grad_norm": 0.6283272982521915, + "learning_rate": 0.00019213802931831696, + "loss": 0.8713, + "step": 192 + }, + { + "epoch": 0.1544, + "grad_norm": 0.48771113456670234, + "learning_rate": 0.00019203697579910154, + "loss": 0.8375, + "step": 193 + }, + { + "epoch": 0.1552, + "grad_norm": 0.6286083414158488, + "learning_rate": 0.00019193530389822363, + "loss": 0.9169, + "step": 194 + }, + { + "epoch": 0.156, + "grad_norm": 0.5115509368730083, + "learning_rate": 0.00019183301429880043, + "loss": 0.9247, + "step": 195 + }, + { + "epoch": 0.1568, + "grad_norm": 0.47098215222574896, + "learning_rate": 0.00019173010768809933, + "loss": 0.8131, + "step": 196 + }, + { + "epoch": 0.1576, + "grad_norm": 0.5556862923247421, + "learning_rate": 0.00019162658475753327, + "loss": 0.8602, + "step": 197 + }, + { + "epoch": 0.1584, + "grad_norm": 0.4841783174408543, + "learning_rate": 0.0001915224462026563, + "loss": 0.8302, + "step": 198 + }, + { + "epoch": 0.1592, + "grad_norm": 0.6708171063402733, + "learning_rate": 0.00019141769272315858, + "loss": 0.9678, + "step": 199 + }, + { + "epoch": 0.16, + "grad_norm": 0.4849407325306258, + "learning_rate": 0.00019131232502286188, + "loss": 0.8129, + "step": 200 + }, + { + "epoch": 0.1608, + "grad_norm": 0.47576669125357923, + "learning_rate": 0.00019120634380971496, + "loss": 0.8517, + "step": 201 + }, + { + "epoch": 0.1616, + "grad_norm": 0.4855825169345397, + "learning_rate": 0.0001910997497957885, + "loss": 0.8717, + "step": 202 + }, + { + "epoch": 0.1624, + "grad_norm": 0.557922956246443, + "learning_rate": 0.0001909925436972706, + "loss": 0.9238, + "step": 203 + }, + { + "epoch": 0.1632, + "grad_norm": 0.4979915047932215, + "learning_rate": 0.00019088472623446183, + "loss": 0.8587, + "step": 204 + }, + { + "epoch": 0.164, + "grad_norm": 0.8209533560536886, + "learning_rate": 0.00019077629813177036, + "loss": 1.1452, + "step": 205 + }, + { + "epoch": 0.1648, + "grad_norm": 0.6042494889849994, + "learning_rate": 0.00019066726011770726, + "loss": 0.9018, + "step": 206 + }, + { + "epoch": 0.1656, + "grad_norm": 0.5365944913148071, + "learning_rate": 0.00019055761292488142, + "loss": 0.9367, + "step": 207 + }, + { + "epoch": 0.1664, + "grad_norm": 0.4880610297558772, + "learning_rate": 0.0001904473572899947, + "loss": 0.9145, + "step": 208 + }, + { + "epoch": 0.1672, + "grad_norm": 0.5096484059564002, + "learning_rate": 0.00019033649395383702, + "loss": 0.8918, + "step": 209 + }, + { + "epoch": 0.168, + "grad_norm": 0.540369934145095, + "learning_rate": 0.00019022502366128135, + "loss": 0.8379, + "step": 210 + }, + { + "epoch": 0.1688, + "grad_norm": 0.5280688976883253, + "learning_rate": 0.00019011294716127867, + "loss": 0.8826, + "step": 211 + }, + { + "epoch": 0.1696, + "grad_norm": 0.5988073761605279, + "learning_rate": 0.00019000026520685302, + "loss": 0.8368, + "step": 212 + }, + { + "epoch": 0.1704, + "grad_norm": 0.47933975417162, + "learning_rate": 0.0001898869785550963, + "loss": 0.8315, + "step": 213 + }, + { + "epoch": 0.1712, + "grad_norm": 0.5192695982409684, + "learning_rate": 0.0001897730879671634, + "loss": 0.9, + "step": 214 + }, + { + "epoch": 0.172, + "grad_norm": 0.5312935651865488, + "learning_rate": 0.00018965859420826684, + "loss": 0.7853, + "step": 215 + }, + { + "epoch": 0.1728, + "grad_norm": 0.5376996711618321, + "learning_rate": 0.00018954349804767184, + "loss": 0.9059, + "step": 216 + }, + { + "epoch": 0.1736, + "grad_norm": 0.6103998381142328, + "learning_rate": 0.00018942780025869098, + "loss": 1.0594, + "step": 217 + }, + { + "epoch": 0.1744, + "grad_norm": 0.6018626550476088, + "learning_rate": 0.00018931150161867916, + "loss": 0.9539, + "step": 218 + }, + { + "epoch": 0.1752, + "grad_norm": 0.49601363247290925, + "learning_rate": 0.00018919460290902826, + "loss": 0.893, + "step": 219 + }, + { + "epoch": 0.176, + "grad_norm": 0.7674521964202011, + "learning_rate": 0.00018907710491516199, + "loss": 0.9014, + "step": 220 + }, + { + "epoch": 0.1768, + "grad_norm": 0.5336013967413517, + "learning_rate": 0.0001889590084265304, + "loss": 0.7981, + "step": 221 + }, + { + "epoch": 0.1776, + "grad_norm": 0.7835491538209578, + "learning_rate": 0.0001888403142366049, + "loss": 0.8743, + "step": 222 + }, + { + "epoch": 0.1784, + "grad_norm": 0.6692990893335212, + "learning_rate": 0.0001887210231428727, + "loss": 0.797, + "step": 223 + }, + { + "epoch": 0.1792, + "grad_norm": 0.5554694978076148, + "learning_rate": 0.00018860113594683148, + "loss": 0.9045, + "step": 224 + }, + { + "epoch": 0.18, + "grad_norm": 0.7018575579850007, + "learning_rate": 0.0001884806534539841, + "loss": 0.9661, + "step": 225 + }, + { + "epoch": 0.1808, + "grad_norm": 0.5732712171563312, + "learning_rate": 0.00018835957647383303, + "loss": 0.8338, + "step": 226 + }, + { + "epoch": 0.1816, + "grad_norm": 0.6257272102104295, + "learning_rate": 0.0001882379058198751, + "loss": 0.9729, + "step": 227 + }, + { + "epoch": 0.1824, + "grad_norm": 0.5594595787785903, + "learning_rate": 0.00018811564230959588, + "loss": 0.8213, + "step": 228 + }, + { + "epoch": 0.1832, + "grad_norm": 0.6979267188451503, + "learning_rate": 0.00018799278676446423, + "loss": 0.9913, + "step": 229 + }, + { + "epoch": 0.184, + "grad_norm": 0.5994165349249353, + "learning_rate": 0.00018786934000992688, + "loss": 0.9171, + "step": 230 + }, + { + "epoch": 0.1848, + "grad_norm": 0.4716135434126101, + "learning_rate": 0.00018774530287540278, + "loss": 0.6811, + "step": 231 + }, + { + "epoch": 0.1856, + "grad_norm": 0.7599573528458343, + "learning_rate": 0.00018762067619427746, + "loss": 0.9036, + "step": 232 + }, + { + "epoch": 0.1864, + "grad_norm": 0.5150015644946726, + "learning_rate": 0.00018749546080389757, + "loss": 0.7706, + "step": 233 + }, + { + "epoch": 0.1872, + "grad_norm": 0.5200450168942828, + "learning_rate": 0.00018736965754556528, + "loss": 0.7307, + "step": 234 + }, + { + "epoch": 0.188, + "grad_norm": 0.47433937688998185, + "learning_rate": 0.00018724326726453244, + "loss": 0.79, + "step": 235 + }, + { + "epoch": 0.1888, + "grad_norm": 0.8329159675900423, + "learning_rate": 0.00018711629080999504, + "loss": 1.0186, + "step": 236 + }, + { + "epoch": 0.1896, + "grad_norm": 0.575539326963917, + "learning_rate": 0.00018698872903508755, + "loss": 0.9364, + "step": 237 + }, + { + "epoch": 0.1904, + "grad_norm": 0.4530633216315929, + "learning_rate": 0.00018686058279687698, + "loss": 0.7354, + "step": 238 + }, + { + "epoch": 0.1912, + "grad_norm": 0.5869814040574673, + "learning_rate": 0.0001867318529563574, + "loss": 0.8727, + "step": 239 + }, + { + "epoch": 0.192, + "grad_norm": 0.5449803815964556, + "learning_rate": 0.00018660254037844388, + "loss": 0.8626, + "step": 240 + }, + { + "epoch": 0.1928, + "grad_norm": 0.4894517147542963, + "learning_rate": 0.00018647264593196688, + "loss": 0.8402, + "step": 241 + }, + { + "epoch": 0.1936, + "grad_norm": 0.6292559711803009, + "learning_rate": 0.00018634217048966637, + "loss": 0.9694, + "step": 242 + }, + { + "epoch": 0.1944, + "grad_norm": 0.6147704342501734, + "learning_rate": 0.00018621111492818585, + "loss": 0.8603, + "step": 243 + }, + { + "epoch": 0.1952, + "grad_norm": 0.5431473970139671, + "learning_rate": 0.0001860794801280666, + "loss": 0.8142, + "step": 244 + }, + { + "epoch": 0.196, + "grad_norm": 0.5496557815725536, + "learning_rate": 0.00018594726697374175, + "loss": 0.7979, + "step": 245 + }, + { + "epoch": 0.1968, + "grad_norm": 0.5072773852005839, + "learning_rate": 0.0001858144763535302, + "loss": 0.8802, + "step": 246 + }, + { + "epoch": 0.1976, + "grad_norm": 0.6101453917715832, + "learning_rate": 0.0001856811091596308, + "loss": 0.913, + "step": 247 + }, + { + "epoch": 0.1984, + "grad_norm": 0.45322241352839027, + "learning_rate": 0.0001855471662881164, + "loss": 0.8332, + "step": 248 + }, + { + "epoch": 0.1992, + "grad_norm": 0.554767308131564, + "learning_rate": 0.00018541264863892754, + "loss": 0.7633, + "step": 249 + }, + { + "epoch": 0.2, + "grad_norm": 0.5912783933770612, + "learning_rate": 0.00018527755711586678, + "loss": 0.867, + "step": 250 + }, + { + "epoch": 0.2008, + "grad_norm": 0.5891468598140162, + "learning_rate": 0.00018514189262659235, + "loss": 0.9687, + "step": 251 + }, + { + "epoch": 0.2016, + "grad_norm": 0.4997613391149276, + "learning_rate": 0.00018500565608261214, + "loss": 0.7937, + "step": 252 + }, + { + "epoch": 0.2024, + "grad_norm": 0.5028289046293992, + "learning_rate": 0.00018486884839927768, + "loss": 0.7867, + "step": 253 + }, + { + "epoch": 0.2032, + "grad_norm": 0.5815838698745088, + "learning_rate": 0.00018473147049577774, + "loss": 0.8327, + "step": 254 + }, + { + "epoch": 0.204, + "grad_norm": 0.49985488612870416, + "learning_rate": 0.0001845935232951325, + "loss": 0.7991, + "step": 255 + }, + { + "epoch": 0.2048, + "grad_norm": 0.6216224645165545, + "learning_rate": 0.00018445500772418697, + "loss": 1.0184, + "step": 256 + }, + { + "epoch": 0.2056, + "grad_norm": 0.6116470706461582, + "learning_rate": 0.00018431592471360503, + "loss": 1.0301, + "step": 257 + }, + { + "epoch": 0.2064, + "grad_norm": 0.5531047775225604, + "learning_rate": 0.00018417627519786315, + "loss": 0.8914, + "step": 258 + }, + { + "epoch": 0.2072, + "grad_norm": 0.5079295779498472, + "learning_rate": 0.000184036060115244, + "loss": 0.9567, + "step": 259 + }, + { + "epoch": 0.208, + "grad_norm": 0.4454675416660914, + "learning_rate": 0.00018389528040783012, + "loss": 0.7465, + "step": 260 + }, + { + "epoch": 0.2088, + "grad_norm": 0.5493225471484111, + "learning_rate": 0.00018375393702149787, + "loss": 0.9337, + "step": 261 + }, + { + "epoch": 0.2096, + "grad_norm": 0.5991957073365373, + "learning_rate": 0.00018361203090591071, + "loss": 0.9526, + "step": 262 + }, + { + "epoch": 0.2104, + "grad_norm": 0.5348809803871967, + "learning_rate": 0.00018346956301451304, + "loss": 0.7977, + "step": 263 + }, + { + "epoch": 0.2112, + "grad_norm": 0.5186383297042673, + "learning_rate": 0.00018332653430452376, + "loss": 0.8872, + "step": 264 + }, + { + "epoch": 0.212, + "grad_norm": 0.5960123170128296, + "learning_rate": 0.00018318294573692985, + "loss": 0.9598, + "step": 265 + }, + { + "epoch": 0.2128, + "grad_norm": 0.6515494979551818, + "learning_rate": 0.00018303879827647975, + "loss": 0.8145, + "step": 266 + }, + { + "epoch": 0.2136, + "grad_norm": 0.6080928936698701, + "learning_rate": 0.0001828940928916772, + "loss": 0.9205, + "step": 267 + }, + { + "epoch": 0.2144, + "grad_norm": 0.5504865158312073, + "learning_rate": 0.00018274883055477436, + "loss": 0.8171, + "step": 268 + }, + { + "epoch": 0.2152, + "grad_norm": 0.7062526284859021, + "learning_rate": 0.00018260301224176558, + "loss": 0.9854, + "step": 269 + }, + { + "epoch": 0.216, + "grad_norm": 0.4843625073655576, + "learning_rate": 0.00018245663893238075, + "loss": 0.8036, + "step": 270 + }, + { + "epoch": 0.2168, + "grad_norm": 0.521267876354061, + "learning_rate": 0.00018230971161007853, + "loss": 0.7845, + "step": 271 + }, + { + "epoch": 0.2176, + "grad_norm": 0.5890747746964897, + "learning_rate": 0.00018216223126204007, + "loss": 0.8138, + "step": 272 + }, + { + "epoch": 0.2184, + "grad_norm": 0.5239600933357469, + "learning_rate": 0.00018201419887916214, + "loss": 0.8446, + "step": 273 + }, + { + "epoch": 0.2192, + "grad_norm": 0.7687056630992152, + "learning_rate": 0.00018186561545605054, + "loss": 1.1336, + "step": 274 + }, + { + "epoch": 0.22, + "grad_norm": 0.5885758299804096, + "learning_rate": 0.00018171648199101346, + "loss": 0.8736, + "step": 275 + }, + { + "epoch": 0.2208, + "grad_norm": 0.4545727094517842, + "learning_rate": 0.00018156679948605467, + "loss": 0.8202, + "step": 276 + }, + { + "epoch": 0.2216, + "grad_norm": 0.4853582138003786, + "learning_rate": 0.00018141656894686689, + "loss": 0.8451, + "step": 277 + }, + { + "epoch": 0.2224, + "grad_norm": 0.6297161632990195, + "learning_rate": 0.00018126579138282503, + "loss": 0.9326, + "step": 278 + }, + { + "epoch": 0.2232, + "grad_norm": 0.6520961746658375, + "learning_rate": 0.00018111446780697929, + "loss": 0.8899, + "step": 279 + }, + { + "epoch": 0.224, + "grad_norm": 0.5798076905396926, + "learning_rate": 0.0001809625992360485, + "loss": 0.9056, + "step": 280 + }, + { + "epoch": 0.2248, + "grad_norm": 0.6156072852450034, + "learning_rate": 0.00018081018669041324, + "loss": 0.8909, + "step": 281 + }, + { + "epoch": 0.2256, + "grad_norm": 0.5732894197464338, + "learning_rate": 0.00018065723119410884, + "loss": 0.9058, + "step": 282 + }, + { + "epoch": 0.2264, + "grad_norm": 0.5727194681440995, + "learning_rate": 0.00018050373377481878, + "loss": 0.9331, + "step": 283 + }, + { + "epoch": 0.2272, + "grad_norm": 0.6256855912463656, + "learning_rate": 0.00018034969546386757, + "loss": 0.9958, + "step": 284 + }, + { + "epoch": 0.228, + "grad_norm": 0.5141184537751282, + "learning_rate": 0.0001801951172962139, + "loss": 0.8346, + "step": 285 + }, + { + "epoch": 0.2288, + "grad_norm": 0.5626344634126693, + "learning_rate": 0.0001800400003104436, + "loss": 0.8372, + "step": 286 + }, + { + "epoch": 0.2296, + "grad_norm": 0.44884469985131437, + "learning_rate": 0.0001798843455487629, + "loss": 0.8172, + "step": 287 + }, + { + "epoch": 0.2304, + "grad_norm": 0.5857688983623034, + "learning_rate": 0.00017972815405699103, + "loss": 0.8494, + "step": 288 + }, + { + "epoch": 0.2312, + "grad_norm": 0.5818393340233482, + "learning_rate": 0.00017957142688455362, + "loss": 0.9132, + "step": 289 + }, + { + "epoch": 0.232, + "grad_norm": 0.5189677144113589, + "learning_rate": 0.00017941416508447536, + "loss": 0.7677, + "step": 290 + }, + { + "epoch": 0.2328, + "grad_norm": 0.5373850787765176, + "learning_rate": 0.00017925636971337304, + "loss": 0.9232, + "step": 291 + }, + { + "epoch": 0.2336, + "grad_norm": 0.5336004325307646, + "learning_rate": 0.0001790980418314484, + "loss": 0.955, + "step": 292 + }, + { + "epoch": 0.2344, + "grad_norm": 0.4479741697852256, + "learning_rate": 0.00017893918250248104, + "loss": 0.7711, + "step": 293 + }, + { + "epoch": 0.2352, + "grad_norm": 0.5892862205502235, + "learning_rate": 0.00017877979279382135, + "loss": 0.9122, + "step": 294 + }, + { + "epoch": 0.236, + "grad_norm": 0.7034529147174017, + "learning_rate": 0.00017861987377638312, + "loss": 0.9757, + "step": 295 + }, + { + "epoch": 0.2368, + "grad_norm": 0.4920748092757167, + "learning_rate": 0.0001784594265246366, + "loss": 0.8567, + "step": 296 + }, + { + "epoch": 0.2376, + "grad_norm": 0.6935746977422499, + "learning_rate": 0.0001782984521166011, + "loss": 0.9767, + "step": 297 + }, + { + "epoch": 0.2384, + "grad_norm": 0.6857207812627873, + "learning_rate": 0.0001781369516338378, + "loss": 0.9712, + "step": 298 + }, + { + "epoch": 0.2392, + "grad_norm": 0.5575033476384434, + "learning_rate": 0.00017797492616144256, + "loss": 0.8163, + "step": 299 + }, + { + "epoch": 0.24, + "grad_norm": 0.5723045438689641, + "learning_rate": 0.00017781237678803847, + "loss": 0.9316, + "step": 300 + }, + { + "epoch": 0.2408, + "grad_norm": 0.47386175233905387, + "learning_rate": 0.00017764930460576866, + "loss": 0.8197, + "step": 301 + }, + { + "epoch": 0.2416, + "grad_norm": 0.5586605608011292, + "learning_rate": 0.000177485710710289, + "loss": 0.8275, + "step": 302 + }, + { + "epoch": 0.2424, + "grad_norm": 0.6728938504797903, + "learning_rate": 0.00017732159620076053, + "loss": 0.9101, + "step": 303 + }, + { + "epoch": 0.2432, + "grad_norm": 0.49103317186866774, + "learning_rate": 0.00017715696217984235, + "loss": 0.8359, + "step": 304 + }, + { + "epoch": 0.244, + "grad_norm": 0.49217192479223953, + "learning_rate": 0.00017699180975368396, + "loss": 0.7596, + "step": 305 + }, + { + "epoch": 0.2448, + "grad_norm": 0.5218476405354987, + "learning_rate": 0.00017682614003191807, + "loss": 0.7309, + "step": 306 + }, + { + "epoch": 0.2456, + "grad_norm": 0.608874260267828, + "learning_rate": 0.00017665995412765285, + "loss": 0.8727, + "step": 307 + }, + { + "epoch": 0.2464, + "grad_norm": 0.5522196870339003, + "learning_rate": 0.00017649325315746478, + "loss": 0.9198, + "step": 308 + }, + { + "epoch": 0.2472, + "grad_norm": 0.7229557914044292, + "learning_rate": 0.00017632603824139085, + "loss": 0.979, + "step": 309 + }, + { + "epoch": 0.248, + "grad_norm": 0.5348698413188704, + "learning_rate": 0.0001761583105029213, + "loss": 0.8814, + "step": 310 + }, + { + "epoch": 0.2488, + "grad_norm": 0.5622567164554441, + "learning_rate": 0.0001759900710689918, + "loss": 0.8156, + "step": 311 + }, + { + "epoch": 0.2496, + "grad_norm": 0.5093756906036769, + "learning_rate": 0.00017582132106997616, + "loss": 0.8298, + "step": 312 + }, + { + "epoch": 0.2504, + "grad_norm": 0.46793398379285495, + "learning_rate": 0.00017565206163967846, + "loss": 0.7569, + "step": 313 + }, + { + "epoch": 0.2512, + "grad_norm": 0.4688320329106946, + "learning_rate": 0.00017548229391532572, + "loss": 0.7834, + "step": 314 + }, + { + "epoch": 0.252, + "grad_norm": 0.5708290914468318, + "learning_rate": 0.00017531201903755994, + "loss": 0.8218, + "step": 315 + }, + { + "epoch": 0.2528, + "grad_norm": 0.4704229420649958, + "learning_rate": 0.00017514123815043074, + "loss": 0.8628, + "step": 316 + }, + { + "epoch": 0.2536, + "grad_norm": 0.4573714211278024, + "learning_rate": 0.00017496995240138744, + "loss": 0.8404, + "step": 317 + }, + { + "epoch": 0.2544, + "grad_norm": 0.5018743555661164, + "learning_rate": 0.00017479816294127152, + "loss": 0.8737, + "step": 318 + }, + { + "epoch": 0.2552, + "grad_norm": 0.524828796964741, + "learning_rate": 0.00017462587092430875, + "loss": 0.8281, + "step": 319 + }, + { + "epoch": 0.256, + "grad_norm": 0.579486821084244, + "learning_rate": 0.0001744530775081015, + "loss": 0.85, + "step": 320 + }, + { + "epoch": 0.2568, + "grad_norm": 0.5740604251572758, + "learning_rate": 0.00017427978385362112, + "loss": 0.8501, + "step": 321 + }, + { + "epoch": 0.2576, + "grad_norm": 0.4379389482626325, + "learning_rate": 0.0001741059911251997, + "loss": 0.7899, + "step": 322 + }, + { + "epoch": 0.2584, + "grad_norm": 0.5143736514553539, + "learning_rate": 0.0001739317004905227, + "loss": 0.8614, + "step": 323 + }, + { + "epoch": 0.2592, + "grad_norm": 0.4787207925108045, + "learning_rate": 0.000173756913120621, + "loss": 0.8104, + "step": 324 + }, + { + "epoch": 0.26, + "grad_norm": 0.4652530210510522, + "learning_rate": 0.00017358163018986282, + "loss": 0.7544, + "step": 325 + }, + { + "epoch": 0.2608, + "grad_norm": 0.439584122847283, + "learning_rate": 0.00017340585287594604, + "loss": 0.7797, + "step": 326 + }, + { + "epoch": 0.2616, + "grad_norm": 0.5785898710901757, + "learning_rate": 0.00017322958235989016, + "loss": 0.8617, + "step": 327 + }, + { + "epoch": 0.2624, + "grad_norm": 0.5178898755449122, + "learning_rate": 0.0001730528198260285, + "loss": 0.7682, + "step": 328 + }, + { + "epoch": 0.2632, + "grad_norm": 0.4952121878341345, + "learning_rate": 0.00017287556646200018, + "loss": 0.7104, + "step": 329 + }, + { + "epoch": 0.264, + "grad_norm": 0.5343104495072938, + "learning_rate": 0.00017269782345874203, + "loss": 0.7876, + "step": 330 + }, + { + "epoch": 0.2648, + "grad_norm": 0.4808948891989481, + "learning_rate": 0.00017251959201048083, + "loss": 0.7816, + "step": 331 + }, + { + "epoch": 0.2656, + "grad_norm": 0.6958866218686733, + "learning_rate": 0.00017234087331472497, + "loss": 0.9808, + "step": 332 + }, + { + "epoch": 0.2664, + "grad_norm": 0.5846119691054286, + "learning_rate": 0.00017216166857225674, + "loss": 0.8052, + "step": 333 + }, + { + "epoch": 0.2672, + "grad_norm": 0.46570870144186904, + "learning_rate": 0.00017198197898712404, + "loss": 0.8445, + "step": 334 + }, + { + "epoch": 0.268, + "grad_norm": 0.7693973221460947, + "learning_rate": 0.00017180180576663228, + "loss": 0.9614, + "step": 335 + }, + { + "epoch": 0.2688, + "grad_norm": 0.4704975865295557, + "learning_rate": 0.00017162115012133643, + "loss": 0.8044, + "step": 336 + }, + { + "epoch": 0.2696, + "grad_norm": 0.4206300887289306, + "learning_rate": 0.00017144001326503273, + "loss": 0.7798, + "step": 337 + }, + { + "epoch": 0.2704, + "grad_norm": 0.46078013380689686, + "learning_rate": 0.00017125839641475072, + "loss": 0.8296, + "step": 338 + }, + { + "epoch": 0.2712, + "grad_norm": 0.4442810770381299, + "learning_rate": 0.00017107630079074478, + "loss": 0.7575, + "step": 339 + }, + { + "epoch": 0.272, + "grad_norm": 0.7175217922445156, + "learning_rate": 0.00017089372761648616, + "loss": 0.9168, + "step": 340 + }, + { + "epoch": 0.2728, + "grad_norm": 0.6585571860135052, + "learning_rate": 0.00017071067811865476, + "loss": 0.9008, + "step": 341 + }, + { + "epoch": 0.2736, + "grad_norm": 0.5223436809700848, + "learning_rate": 0.00017052715352713075, + "loss": 0.7575, + "step": 342 + }, + { + "epoch": 0.2744, + "grad_norm": 0.46856118841401995, + "learning_rate": 0.00017034315507498635, + "loss": 0.8129, + "step": 343 + }, + { + "epoch": 0.2752, + "grad_norm": 0.43582582860860897, + "learning_rate": 0.00017015868399847768, + "loss": 0.7806, + "step": 344 + }, + { + "epoch": 0.276, + "grad_norm": 0.534652749247743, + "learning_rate": 0.00016997374153703625, + "loss": 0.7925, + "step": 345 + }, + { + "epoch": 0.2768, + "grad_norm": 0.5343850961920595, + "learning_rate": 0.00016978832893326074, + "loss": 0.9008, + "step": 346 + }, + { + "epoch": 0.2776, + "grad_norm": 0.5018036235407817, + "learning_rate": 0.00016960244743290868, + "loss": 0.8923, + "step": 347 + }, + { + "epoch": 0.2784, + "grad_norm": 0.5433844630930597, + "learning_rate": 0.00016941609828488807, + "loss": 0.8905, + "step": 348 + }, + { + "epoch": 0.2792, + "grad_norm": 0.45363314802800025, + "learning_rate": 0.00016922928274124886, + "loss": 0.7348, + "step": 349 + }, + { + "epoch": 0.28, + "grad_norm": 0.5800104410459153, + "learning_rate": 0.0001690420020571747, + "loss": 0.897, + "step": 350 + }, + { + "epoch": 0.2808, + "grad_norm": 0.3607190442404761, + "learning_rate": 0.00016885425749097444, + "loss": 0.6753, + "step": 351 + }, + { + "epoch": 0.2816, + "grad_norm": 0.5796044387108275, + "learning_rate": 0.0001686660503040737, + "loss": 0.79, + "step": 352 + }, + { + "epoch": 0.2824, + "grad_norm": 0.5288070170976834, + "learning_rate": 0.00016847738176100632, + "loss": 0.8346, + "step": 353 + }, + { + "epoch": 0.2832, + "grad_norm": 0.4642039164416691, + "learning_rate": 0.00016828825312940592, + "loss": 0.8436, + "step": 354 + }, + { + "epoch": 0.284, + "grad_norm": 0.47729002802695314, + "learning_rate": 0.0001680986656799975, + "loss": 0.8802, + "step": 355 + }, + { + "epoch": 0.2848, + "grad_norm": 0.6752672490776792, + "learning_rate": 0.0001679086206865886, + "loss": 1.0189, + "step": 356 + }, + { + "epoch": 0.2856, + "grad_norm": 0.5408019204341415, + "learning_rate": 0.00016771811942606108, + "loss": 0.8333, + "step": 357 + }, + { + "epoch": 0.2864, + "grad_norm": 0.46012779574926227, + "learning_rate": 0.00016752716317836229, + "loss": 0.77, + "step": 358 + }, + { + "epoch": 0.2872, + "grad_norm": 0.44564761511274714, + "learning_rate": 0.00016733575322649657, + "loss": 0.7955, + "step": 359 + }, + { + "epoch": 0.288, + "grad_norm": 0.4397975238603909, + "learning_rate": 0.0001671438908565167, + "loss": 0.7309, + "step": 360 + }, + { + "epoch": 0.2888, + "grad_norm": 0.4601391505956761, + "learning_rate": 0.00016695157735751513, + "loss": 0.7598, + "step": 361 + }, + { + "epoch": 0.2896, + "grad_norm": 0.43078743526528973, + "learning_rate": 0.00016675881402161536, + "loss": 0.7514, + "step": 362 + }, + { + "epoch": 0.2904, + "grad_norm": 0.6943844358517226, + "learning_rate": 0.0001665656021439633, + "loss": 0.9688, + "step": 363 + }, + { + "epoch": 0.2912, + "grad_norm": 0.6199289848077439, + "learning_rate": 0.0001663719430227186, + "loss": 0.7856, + "step": 364 + }, + { + "epoch": 0.292, + "grad_norm": 0.46086670594540174, + "learning_rate": 0.00016617783795904565, + "loss": 0.8119, + "step": 365 + }, + { + "epoch": 0.2928, + "grad_norm": 0.4933704290625319, + "learning_rate": 0.00016598328825710533, + "loss": 0.8066, + "step": 366 + }, + { + "epoch": 0.2936, + "grad_norm": 0.6226839151315376, + "learning_rate": 0.00016578829522404583, + "loss": 0.9057, + "step": 367 + }, + { + "epoch": 0.2944, + "grad_norm": 0.47510078911012704, + "learning_rate": 0.000165592860169994, + "loss": 0.7527, + "step": 368 + }, + { + "epoch": 0.2952, + "grad_norm": 0.5487304743269249, + "learning_rate": 0.00016539698440804661, + "loss": 0.9311, + "step": 369 + }, + { + "epoch": 0.296, + "grad_norm": 0.5243769562453834, + "learning_rate": 0.00016520066925426144, + "loss": 0.7744, + "step": 370 + }, + { + "epoch": 0.2968, + "grad_norm": 0.6060607560363251, + "learning_rate": 0.0001650039160276485, + "loss": 0.9562, + "step": 371 + }, + { + "epoch": 0.2976, + "grad_norm": 0.4447612102652895, + "learning_rate": 0.0001648067260501611, + "loss": 0.799, + "step": 372 + }, + { + "epoch": 0.2984, + "grad_norm": 0.5043201921155803, + "learning_rate": 0.0001646091006466871, + "loss": 0.777, + "step": 373 + }, + { + "epoch": 0.2992, + "grad_norm": 0.6474988233382649, + "learning_rate": 0.0001644110411450398, + "loss": 0.9045, + "step": 374 + }, + { + "epoch": 0.3, + "grad_norm": 0.4340313396759206, + "learning_rate": 0.00016421254887594917, + "loss": 0.7521, + "step": 375 + }, + { + "epoch": 0.3008, + "grad_norm": 0.5615075768410082, + "learning_rate": 0.00016401362517305296, + "loss": 0.9001, + "step": 376 + }, + { + "epoch": 0.3016, + "grad_norm": 0.5607387146117787, + "learning_rate": 0.00016381427137288754, + "loss": 0.8576, + "step": 377 + }, + { + "epoch": 0.3024, + "grad_norm": 0.4731339230325628, + "learning_rate": 0.00016361448881487914, + "loss": 0.768, + "step": 378 + }, + { + "epoch": 0.3032, + "grad_norm": 0.49559062627473566, + "learning_rate": 0.0001634142788413346, + "loss": 0.8063, + "step": 379 + }, + { + "epoch": 0.304, + "grad_norm": 0.4455448205556065, + "learning_rate": 0.00016321364279743266, + "loss": 0.803, + "step": 380 + }, + { + "epoch": 0.3048, + "grad_norm": 0.5188763316404755, + "learning_rate": 0.00016301258203121462, + "loss": 0.8693, + "step": 381 + }, + { + "epoch": 0.3056, + "grad_norm": 0.4788968930738808, + "learning_rate": 0.0001628110978935756, + "loss": 0.8193, + "step": 382 + }, + { + "epoch": 0.3064, + "grad_norm": 0.565510388051632, + "learning_rate": 0.00016260919173825508, + "loss": 0.7964, + "step": 383 + }, + { + "epoch": 0.3072, + "grad_norm": 0.456549323203182, + "learning_rate": 0.00016240686492182804, + "loss": 0.798, + "step": 384 + }, + { + "epoch": 0.308, + "grad_norm": 0.43220854449176305, + "learning_rate": 0.00016220411880369601, + "loss": 0.7546, + "step": 385 + }, + { + "epoch": 0.3088, + "grad_norm": 0.5894674924180361, + "learning_rate": 0.00016200095474607753, + "loss": 0.8805, + "step": 386 + }, + { + "epoch": 0.3096, + "grad_norm": 0.4867511866315462, + "learning_rate": 0.00016179737411399926, + "loss": 0.8065, + "step": 387 + }, + { + "epoch": 0.3104, + "grad_norm": 0.45750857416095064, + "learning_rate": 0.00016159337827528685, + "loss": 0.8482, + "step": 388 + }, + { + "epoch": 0.3112, + "grad_norm": 0.51811947034319, + "learning_rate": 0.00016138896860055555, + "loss": 0.852, + "step": 389 + }, + { + "epoch": 0.312, + "grad_norm": 0.542414398039288, + "learning_rate": 0.0001611841464632011, + "loss": 0.7855, + "step": 390 + }, + { + "epoch": 0.3128, + "grad_norm": 0.4177555443021668, + "learning_rate": 0.00016097891323939062, + "loss": 0.7254, + "step": 391 + }, + { + "epoch": 0.3136, + "grad_norm": 0.5122162363581776, + "learning_rate": 0.0001607732703080532, + "loss": 0.7879, + "step": 392 + }, + { + "epoch": 0.3144, + "grad_norm": 0.521585064075366, + "learning_rate": 0.00016056721905087056, + "loss": 0.8425, + "step": 393 + }, + { + "epoch": 0.3152, + "grad_norm": 0.5337676365319911, + "learning_rate": 0.00016036076085226814, + "loss": 0.8443, + "step": 394 + }, + { + "epoch": 0.316, + "grad_norm": 0.5532009911621519, + "learning_rate": 0.00016015389709940538, + "loss": 0.8758, + "step": 395 + }, + { + "epoch": 0.3168, + "grad_norm": 0.5728067440563523, + "learning_rate": 0.0001599466291821666, + "loss": 0.7819, + "step": 396 + }, + { + "epoch": 0.3176, + "grad_norm": 0.5210103844247099, + "learning_rate": 0.0001597389584931517, + "loss": 0.8325, + "step": 397 + }, + { + "epoch": 0.3184, + "grad_norm": 0.5087466797181295, + "learning_rate": 0.0001595308864276666, + "loss": 0.8146, + "step": 398 + }, + { + "epoch": 0.3192, + "grad_norm": 0.41931715800933833, + "learning_rate": 0.0001593224143837142, + "loss": 0.7942, + "step": 399 + }, + { + "epoch": 0.32, + "grad_norm": 0.4901933553582932, + "learning_rate": 0.0001591135437619847, + "loss": 0.865, + "step": 400 + }, + { + "epoch": 0.3208, + "grad_norm": 0.4765670951576811, + "learning_rate": 0.00015890427596584617, + "loss": 0.8184, + "step": 401 + }, + { + "epoch": 0.3216, + "grad_norm": 0.46904520645493447, + "learning_rate": 0.0001586946124013354, + "loss": 0.8152, + "step": 402 + }, + { + "epoch": 0.3224, + "grad_norm": 0.4959381631072489, + "learning_rate": 0.00015848455447714822, + "loss": 0.8772, + "step": 403 + }, + { + "epoch": 0.3232, + "grad_norm": 0.43533764493862964, + "learning_rate": 0.0001582741036046301, + "loss": 0.8032, + "step": 404 + }, + { + "epoch": 0.324, + "grad_norm": 0.5203866447099498, + "learning_rate": 0.00015806326119776663, + "loss": 0.8453, + "step": 405 + }, + { + "epoch": 0.3248, + "grad_norm": 0.49538698755881333, + "learning_rate": 0.00015785202867317407, + "loss": 0.7219, + "step": 406 + }, + { + "epoch": 0.3256, + "grad_norm": 0.5626546685841983, + "learning_rate": 0.00015764040745008988, + "loss": 0.763, + "step": 407 + }, + { + "epoch": 0.3264, + "grad_norm": 0.5463990462052876, + "learning_rate": 0.00015742839895036305, + "loss": 0.8225, + "step": 408 + }, + { + "epoch": 0.3272, + "grad_norm": 0.6738808055422155, + "learning_rate": 0.00015721600459844468, + "loss": 0.9124, + "step": 409 + }, + { + "epoch": 0.328, + "grad_norm": 0.5566517665237946, + "learning_rate": 0.00015700322582137827, + "loss": 0.857, + "step": 410 + }, + { + "epoch": 0.3288, + "grad_norm": 0.5818652259183016, + "learning_rate": 0.00015679006404879033, + "loss": 0.8892, + "step": 411 + }, + { + "epoch": 0.3296, + "grad_norm": 0.5926880656892722, + "learning_rate": 0.0001565765207128805, + "loss": 0.9134, + "step": 412 + }, + { + "epoch": 0.3304, + "grad_norm": 0.6825984184560483, + "learning_rate": 0.00015636259724841222, + "loss": 0.9489, + "step": 413 + }, + { + "epoch": 0.3312, + "grad_norm": 0.44950861810545384, + "learning_rate": 0.0001561482950927029, + "loss": 0.7511, + "step": 414 + }, + { + "epoch": 0.332, + "grad_norm": 0.5198359021561324, + "learning_rate": 0.00015593361568561428, + "loss": 0.8457, + "step": 415 + }, + { + "epoch": 0.3328, + "grad_norm": 0.5696549519021159, + "learning_rate": 0.00015571856046954285, + "loss": 0.7875, + "step": 416 + }, + { + "epoch": 0.3336, + "grad_norm": 0.546694860620974, + "learning_rate": 0.0001555031308894101, + "loss": 0.8445, + "step": 417 + }, + { + "epoch": 0.3344, + "grad_norm": 0.5380282710970803, + "learning_rate": 0.00015528732839265272, + "loss": 0.8209, + "step": 418 + }, + { + "epoch": 0.3352, + "grad_norm": 0.5937082015021489, + "learning_rate": 0.0001550711544292131, + "loss": 0.89, + "step": 419 + }, + { + "epoch": 0.336, + "grad_norm": 0.5287257831042314, + "learning_rate": 0.0001548546104515294, + "loss": 0.8345, + "step": 420 + }, + { + "epoch": 0.3368, + "grad_norm": 0.6103183473899669, + "learning_rate": 0.00015463769791452574, + "loss": 0.8317, + "step": 421 + }, + { + "epoch": 0.3376, + "grad_norm": 0.5094586387221397, + "learning_rate": 0.00015442041827560274, + "loss": 0.7889, + "step": 422 + }, + { + "epoch": 0.3384, + "grad_norm": 0.6107261454152543, + "learning_rate": 0.00015420277299462736, + "loss": 0.9488, + "step": 423 + }, + { + "epoch": 0.3392, + "grad_norm": 0.6065117264413042, + "learning_rate": 0.00015398476353392323, + "loss": 0.9215, + "step": 424 + }, + { + "epoch": 0.34, + "grad_norm": 0.7053169198061089, + "learning_rate": 0.00015376639135826107, + "loss": 1.0075, + "step": 425 + }, + { + "epoch": 0.3408, + "grad_norm": 0.4228898589354549, + "learning_rate": 0.00015354765793484834, + "loss": 0.8079, + "step": 426 + }, + { + "epoch": 0.3416, + "grad_norm": 0.487172024547341, + "learning_rate": 0.00015332856473331978, + "loss": 0.8748, + "step": 427 + }, + { + "epoch": 0.3424, + "grad_norm": 0.47144041066438463, + "learning_rate": 0.00015310911322572753, + "loss": 0.732, + "step": 428 + }, + { + "epoch": 0.3432, + "grad_norm": 0.6818675806798845, + "learning_rate": 0.00015288930488653094, + "loss": 0.9766, + "step": 429 + }, + { + "epoch": 0.344, + "grad_norm": 0.48689971024583084, + "learning_rate": 0.000152669141192587, + "loss": 0.8083, + "step": 430 + }, + { + "epoch": 0.3448, + "grad_norm": 0.47651144680168256, + "learning_rate": 0.0001524486236231402, + "loss": 0.8149, + "step": 431 + }, + { + "epoch": 0.3456, + "grad_norm": 0.5814637437027524, + "learning_rate": 0.00015222775365981273, + "loss": 0.8853, + "step": 432 + }, + { + "epoch": 0.3464, + "grad_norm": 0.4746640362343065, + "learning_rate": 0.00015200653278659432, + "loss": 0.8162, + "step": 433 + }, + { + "epoch": 0.3472, + "grad_norm": 0.40043688621775886, + "learning_rate": 0.00015178496248983254, + "loss": 0.7352, + "step": 434 + }, + { + "epoch": 0.348, + "grad_norm": 0.4833429780552123, + "learning_rate": 0.00015156304425822267, + "loss": 0.7851, + "step": 435 + }, + { + "epoch": 0.3488, + "grad_norm": 0.4239299991049108, + "learning_rate": 0.00015134077958279765, + "loss": 0.7385, + "step": 436 + }, + { + "epoch": 0.3496, + "grad_norm": 0.5172926681098085, + "learning_rate": 0.00015111816995691809, + "loss": 0.8414, + "step": 437 + }, + { + "epoch": 0.3504, + "grad_norm": 0.5120086530270151, + "learning_rate": 0.00015089521687626243, + "loss": 0.8446, + "step": 438 + }, + { + "epoch": 0.3512, + "grad_norm": 0.5669471654453373, + "learning_rate": 0.00015067192183881658, + "loss": 0.7746, + "step": 439 + }, + { + "epoch": 0.352, + "grad_norm": 0.5054731462512156, + "learning_rate": 0.000150448286344864, + "loss": 0.7986, + "step": 440 + }, + { + "epoch": 0.3528, + "grad_norm": 0.4352437564325993, + "learning_rate": 0.00015022431189697568, + "loss": 0.7217, + "step": 441 + }, + { + "epoch": 0.3536, + "grad_norm": 0.5675270179816488, + "learning_rate": 0.00015000000000000001, + "loss": 0.8416, + "step": 442 + }, + { + "epoch": 0.3544, + "grad_norm": 0.4750355240151204, + "learning_rate": 0.0001497753521610526, + "loss": 0.8545, + "step": 443 + }, + { + "epoch": 0.3552, + "grad_norm": 0.5129569536686879, + "learning_rate": 0.00014955036988950618, + "loss": 0.754, + "step": 444 + }, + { + "epoch": 0.356, + "grad_norm": 0.6952488173803737, + "learning_rate": 0.00014932505469698052, + "loss": 0.8875, + "step": 445 + }, + { + "epoch": 0.3568, + "grad_norm": 0.460637684999673, + "learning_rate": 0.00014909940809733222, + "loss": 0.7656, + "step": 446 + }, + { + "epoch": 0.3576, + "grad_norm": 0.4930923924746972, + "learning_rate": 0.0001488734316066446, + "loss": 0.8337, + "step": 447 + }, + { + "epoch": 0.3584, + "grad_norm": 0.5601297336887241, + "learning_rate": 0.00014864712674321734, + "loss": 0.7948, + "step": 448 + }, + { + "epoch": 0.3592, + "grad_norm": 0.4207991716017091, + "learning_rate": 0.0001484204950275565, + "loss": 0.7117, + "step": 449 + }, + { + "epoch": 0.36, + "grad_norm": 0.5585803045272063, + "learning_rate": 0.00014819353798236427, + "loss": 0.7722, + "step": 450 + }, + { + "epoch": 0.3608, + "grad_norm": 0.4701896450197255, + "learning_rate": 0.00014796625713252848, + "loss": 0.8381, + "step": 451 + }, + { + "epoch": 0.3616, + "grad_norm": 0.7171183686662301, + "learning_rate": 0.00014773865400511272, + "loss": 0.8059, + "step": 452 + }, + { + "epoch": 0.3624, + "grad_norm": 0.5987006577708592, + "learning_rate": 0.00014751073012934587, + "loss": 0.9456, + "step": 453 + }, + { + "epoch": 0.3632, + "grad_norm": 0.46718902609560226, + "learning_rate": 0.00014728248703661182, + "loss": 0.7905, + "step": 454 + }, + { + "epoch": 0.364, + "grad_norm": 0.4259181214410821, + "learning_rate": 0.0001470539262604393, + "loss": 0.7584, + "step": 455 + }, + { + "epoch": 0.3648, + "grad_norm": 0.6194362987661196, + "learning_rate": 0.00014682504933649144, + "loss": 0.94, + "step": 456 + }, + { + "epoch": 0.3656, + "grad_norm": 0.4595245370288295, + "learning_rate": 0.00014659585780255556, + "loss": 0.8245, + "step": 457 + }, + { + "epoch": 0.3664, + "grad_norm": 0.5224041089925482, + "learning_rate": 0.00014636635319853275, + "loss": 0.8764, + "step": 458 + }, + { + "epoch": 0.3672, + "grad_norm": 0.5138752347477095, + "learning_rate": 0.0001461365370664276, + "loss": 0.8138, + "step": 459 + }, + { + "epoch": 0.368, + "grad_norm": 0.49270360152003395, + "learning_rate": 0.00014590641095033787, + "loss": 0.82, + "step": 460 + }, + { + "epoch": 0.3688, + "grad_norm": 0.5344931409250059, + "learning_rate": 0.00014567597639644387, + "loss": 0.873, + "step": 461 + }, + { + "epoch": 0.3696, + "grad_norm": 0.44632385553108306, + "learning_rate": 0.00014544523495299842, + "loss": 0.8228, + "step": 462 + }, + { + "epoch": 0.3704, + "grad_norm": 0.444515202856901, + "learning_rate": 0.00014521418817031628, + "loss": 0.7699, + "step": 463 + }, + { + "epoch": 0.3712, + "grad_norm": 0.520110469642323, + "learning_rate": 0.0001449828376007636, + "loss": 0.7739, + "step": 464 + }, + { + "epoch": 0.372, + "grad_norm": 0.5023472914845316, + "learning_rate": 0.00014475118479874774, + "loss": 0.7977, + "step": 465 + }, + { + "epoch": 0.3728, + "grad_norm": 0.597348091144304, + "learning_rate": 0.0001445192313207067, + "loss": 0.8887, + "step": 466 + }, + { + "epoch": 0.3736, + "grad_norm": 0.4302257534061434, + "learning_rate": 0.0001442869787250987, + "loss": 0.7073, + "step": 467 + }, + { + "epoch": 0.3744, + "grad_norm": 0.5753588957288104, + "learning_rate": 0.0001440544285723915, + "loss": 0.7243, + "step": 468 + }, + { + "epoch": 0.3752, + "grad_norm": 0.4255194197110321, + "learning_rate": 0.00014382158242505234, + "loss": 0.7516, + "step": 469 + }, + { + "epoch": 0.376, + "grad_norm": 0.49607289838443813, + "learning_rate": 0.00014358844184753712, + "loss": 0.8165, + "step": 470 + }, + { + "epoch": 0.3768, + "grad_norm": 0.4420078578976846, + "learning_rate": 0.00014335500840627986, + "loss": 0.7923, + "step": 471 + }, + { + "epoch": 0.3776, + "grad_norm": 0.6061627696050104, + "learning_rate": 0.00014312128366968243, + "loss": 0.8236, + "step": 472 + }, + { + "epoch": 0.3784, + "grad_norm": 0.5509076824112138, + "learning_rate": 0.0001428872692081038, + "loss": 0.8348, + "step": 473 + }, + { + "epoch": 0.3792, + "grad_norm": 0.4781353200177167, + "learning_rate": 0.00014265296659384956, + "loss": 0.8636, + "step": 474 + }, + { + "epoch": 0.38, + "grad_norm": 0.4993768599320977, + "learning_rate": 0.00014241837740116132, + "loss": 0.8997, + "step": 475 + }, + { + "epoch": 0.3808, + "grad_norm": 0.4531237317735709, + "learning_rate": 0.00014218350320620624, + "loss": 0.6856, + "step": 476 + }, + { + "epoch": 0.3816, + "grad_norm": 0.470354206776997, + "learning_rate": 0.00014194834558706632, + "loss": 0.9099, + "step": 477 + }, + { + "epoch": 0.3824, + "grad_norm": 0.5095269991795404, + "learning_rate": 0.0001417129061237278, + "loss": 0.8303, + "step": 478 + }, + { + "epoch": 0.3832, + "grad_norm": 0.5883162584844065, + "learning_rate": 0.0001414771863980707, + "loss": 0.9061, + "step": 479 + }, + { + "epoch": 0.384, + "grad_norm": 0.439843653168383, + "learning_rate": 0.00014124118799385796, + "loss": 0.7372, + "step": 480 + }, + { + "epoch": 0.3848, + "grad_norm": 0.43366569264201593, + "learning_rate": 0.00014100491249672498, + "loss": 0.7139, + "step": 481 + }, + { + "epoch": 0.3856, + "grad_norm": 0.521527189637558, + "learning_rate": 0.00014076836149416887, + "loss": 0.7783, + "step": 482 + }, + { + "epoch": 0.3864, + "grad_norm": 0.3994649107251338, + "learning_rate": 0.0001405315365755379, + "loss": 0.7339, + "step": 483 + }, + { + "epoch": 0.3872, + "grad_norm": 0.5860923480126341, + "learning_rate": 0.0001402944393320206, + "loss": 0.9247, + "step": 484 + }, + { + "epoch": 0.388, + "grad_norm": 0.49501107106674097, + "learning_rate": 0.00014005707135663527, + "loss": 0.7976, + "step": 485 + }, + { + "epoch": 0.3888, + "grad_norm": 0.5062371266281463, + "learning_rate": 0.00013981943424421932, + "loss": 0.7824, + "step": 486 + }, + { + "epoch": 0.3896, + "grad_norm": 0.4713455662305919, + "learning_rate": 0.00013958152959141825, + "loss": 0.7888, + "step": 487 + }, + { + "epoch": 0.3904, + "grad_norm": 0.46974333470995056, + "learning_rate": 0.00013934335899667527, + "loss": 0.817, + "step": 488 + }, + { + "epoch": 0.3912, + "grad_norm": 0.5424012949016316, + "learning_rate": 0.00013910492406022033, + "loss": 0.7656, + "step": 489 + }, + { + "epoch": 0.392, + "grad_norm": 0.445227479307242, + "learning_rate": 0.00013886622638405952, + "loss": 0.7223, + "step": 490 + }, + { + "epoch": 0.3928, + "grad_norm": 0.5655662279610364, + "learning_rate": 0.0001386272675719642, + "loss": 0.8941, + "step": 491 + }, + { + "epoch": 0.3936, + "grad_norm": 0.5181512905899659, + "learning_rate": 0.00013838804922946027, + "loss": 0.7798, + "step": 492 + }, + { + "epoch": 0.3944, + "grad_norm": 0.4944307879552544, + "learning_rate": 0.00013814857296381728, + "loss": 0.8009, + "step": 493 + }, + { + "epoch": 0.3952, + "grad_norm": 0.4431718654723363, + "learning_rate": 0.00013790884038403795, + "loss": 0.6793, + "step": 494 + }, + { + "epoch": 0.396, + "grad_norm": 0.47978384089329723, + "learning_rate": 0.00013766885310084688, + "loss": 0.8717, + "step": 495 + }, + { + "epoch": 0.3968, + "grad_norm": 0.41275828014932414, + "learning_rate": 0.00013742861272668012, + "loss": 0.7453, + "step": 496 + }, + { + "epoch": 0.3976, + "grad_norm": 0.40167635056969075, + "learning_rate": 0.00013718812087567414, + "loss": 0.7552, + "step": 497 + }, + { + "epoch": 0.3984, + "grad_norm": 0.4768311161085942, + "learning_rate": 0.00013694737916365517, + "loss": 0.7477, + "step": 498 + }, + { + "epoch": 0.3992, + "grad_norm": 0.5969868558835713, + "learning_rate": 0.000136706389208128, + "loss": 0.8123, + "step": 499 + }, + { + "epoch": 0.4, + "grad_norm": 0.5509063771489698, + "learning_rate": 0.00013646515262826552, + "loss": 0.8173, + "step": 500 + }, + { + "epoch": 0.4008, + "grad_norm": 0.7314088969666789, + "learning_rate": 0.00013622367104489756, + "loss": 0.8875, + "step": 501 + }, + { + "epoch": 0.4016, + "grad_norm": 0.6703491668087395, + "learning_rate": 0.0001359819460805001, + "loss": 0.743, + "step": 502 + }, + { + "epoch": 0.4024, + "grad_norm": 0.4617839957159876, + "learning_rate": 0.0001357399793591844, + "loss": 0.7303, + "step": 503 + }, + { + "epoch": 0.4032, + "grad_norm": 0.5592130260304664, + "learning_rate": 0.0001354977725066859, + "loss": 0.8657, + "step": 504 + }, + { + "epoch": 0.404, + "grad_norm": 0.5487572328437771, + "learning_rate": 0.00013525532715035366, + "loss": 0.8064, + "step": 505 + }, + { + "epoch": 0.4048, + "grad_norm": 0.5293588109138585, + "learning_rate": 0.00013501264491913906, + "loss": 0.8289, + "step": 506 + }, + { + "epoch": 0.4056, + "grad_norm": 0.44982002722735015, + "learning_rate": 0.00013476972744358507, + "loss": 0.7077, + "step": 507 + }, + { + "epoch": 0.4064, + "grad_norm": 0.7512862270318826, + "learning_rate": 0.0001345265763558152, + "loss": 1.1271, + "step": 508 + }, + { + "epoch": 0.4072, + "grad_norm": 0.5766551507517041, + "learning_rate": 0.00013428319328952253, + "loss": 0.842, + "step": 509 + }, + { + "epoch": 0.408, + "grad_norm": 0.47528773107321093, + "learning_rate": 0.00013403957987995882, + "loss": 0.7522, + "step": 510 + }, + { + "epoch": 0.4088, + "grad_norm": 0.5058145440794231, + "learning_rate": 0.0001337957377639235, + "loss": 0.8256, + "step": 511 + }, + { + "epoch": 0.4096, + "grad_norm": 0.4909749501965408, + "learning_rate": 0.0001335516685797525, + "loss": 0.8331, + "step": 512 + }, + { + "epoch": 0.4104, + "grad_norm": 0.5194354145101123, + "learning_rate": 0.0001333073739673076, + "loss": 0.7576, + "step": 513 + }, + { + "epoch": 0.4112, + "grad_norm": 0.46875827617231014, + "learning_rate": 0.00013306285556796495, + "loss": 0.8161, + "step": 514 + }, + { + "epoch": 0.412, + "grad_norm": 0.5693573690198975, + "learning_rate": 0.0001328181150246045, + "loss": 0.9164, + "step": 515 + }, + { + "epoch": 0.4128, + "grad_norm": 0.46935357691176377, + "learning_rate": 0.00013257315398159864, + "loss": 0.7485, + "step": 516 + }, + { + "epoch": 0.4136, + "grad_norm": 0.5948283196167345, + "learning_rate": 0.00013232797408480127, + "loss": 0.902, + "step": 517 + }, + { + "epoch": 0.4144, + "grad_norm": 0.4766212239023237, + "learning_rate": 0.00013208257698153677, + "loss": 0.7893, + "step": 518 + }, + { + "epoch": 0.4152, + "grad_norm": 0.607765090026707, + "learning_rate": 0.00013183696432058888, + "loss": 0.9624, + "step": 519 + }, + { + "epoch": 0.416, + "grad_norm": 0.5326289521595492, + "learning_rate": 0.00013159113775218964, + "loss": 0.8394, + "step": 520 + }, + { + "epoch": 0.4168, + "grad_norm": 0.6048705341164726, + "learning_rate": 0.00013134509892800822, + "loss": 0.8717, + "step": 521 + }, + { + "epoch": 0.4176, + "grad_norm": 0.4825858005691562, + "learning_rate": 0.00013109884950114007, + "loss": 0.79, + "step": 522 + }, + { + "epoch": 0.4184, + "grad_norm": 0.5709289382929376, + "learning_rate": 0.00013085239112609547, + "loss": 0.9274, + "step": 523 + }, + { + "epoch": 0.4192, + "grad_norm": 0.4588906625153007, + "learning_rate": 0.00013060572545878875, + "loss": 0.8087, + "step": 524 + }, + { + "epoch": 0.42, + "grad_norm": 0.5509718129764554, + "learning_rate": 0.00013035885415652685, + "loss": 0.8854, + "step": 525 + }, + { + "epoch": 0.4208, + "grad_norm": 0.7697820546447985, + "learning_rate": 0.00013011177887799845, + "loss": 0.8984, + "step": 526 + }, + { + "epoch": 0.4216, + "grad_norm": 0.5002318272586276, + "learning_rate": 0.00012986450128326266, + "loss": 0.8744, + "step": 527 + }, + { + "epoch": 0.4224, + "grad_norm": 0.5419619630601956, + "learning_rate": 0.00012961702303373795, + "loss": 0.8159, + "step": 528 + }, + { + "epoch": 0.4232, + "grad_norm": 0.47735292588209827, + "learning_rate": 0.00012936934579219094, + "loss": 0.8128, + "step": 529 + }, + { + "epoch": 0.424, + "grad_norm": 0.5990287239162032, + "learning_rate": 0.00012912147122272523, + "loss": 0.8258, + "step": 530 + }, + { + "epoch": 0.4248, + "grad_norm": 0.49600692051562495, + "learning_rate": 0.00012887340099077024, + "loss": 0.8125, + "step": 531 + }, + { + "epoch": 0.4256, + "grad_norm": 0.6293896630873944, + "learning_rate": 0.00012862513676307008, + "loss": 0.9108, + "step": 532 + }, + { + "epoch": 0.4264, + "grad_norm": 0.4947857180649281, + "learning_rate": 0.0001283766802076722, + "loss": 0.8444, + "step": 533 + }, + { + "epoch": 0.4272, + "grad_norm": 0.5639879587324572, + "learning_rate": 0.00012812803299391628, + "loss": 0.8394, + "step": 534 + }, + { + "epoch": 0.428, + "grad_norm": 0.5397476844949073, + "learning_rate": 0.00012787919679242306, + "loss": 0.8269, + "step": 535 + }, + { + "epoch": 0.4288, + "grad_norm": 0.4232618190448643, + "learning_rate": 0.00012763017327508305, + "loss": 0.7834, + "step": 536 + }, + { + "epoch": 0.4296, + "grad_norm": 0.49956002834638136, + "learning_rate": 0.00012738096411504522, + "loss": 0.7525, + "step": 537 + }, + { + "epoch": 0.4304, + "grad_norm": 0.6187204256102857, + "learning_rate": 0.0001271315709867059, + "loss": 0.8394, + "step": 538 + }, + { + "epoch": 0.4312, + "grad_norm": 0.4215636842122824, + "learning_rate": 0.00012688199556569753, + "loss": 0.768, + "step": 539 + }, + { + "epoch": 0.432, + "grad_norm": 0.45879407465937844, + "learning_rate": 0.00012663223952887723, + "loss": 0.8202, + "step": 540 + }, + { + "epoch": 0.4328, + "grad_norm": 0.43491023553817093, + "learning_rate": 0.0001263823045543158, + "loss": 0.682, + "step": 541 + }, + { + "epoch": 0.4336, + "grad_norm": 0.5457799956740407, + "learning_rate": 0.00012613219232128608, + "loss": 0.8866, + "step": 542 + }, + { + "epoch": 0.4344, + "grad_norm": 0.5041001973150158, + "learning_rate": 0.00012588190451025207, + "loss": 0.8291, + "step": 543 + }, + { + "epoch": 0.4352, + "grad_norm": 0.4610663851280987, + "learning_rate": 0.00012563144280285741, + "loss": 0.7835, + "step": 544 + }, + { + "epoch": 0.436, + "grad_norm": 0.49938601271838723, + "learning_rate": 0.00012538080888191408, + "loss": 0.7862, + "step": 545 + }, + { + "epoch": 0.4368, + "grad_norm": 0.4424713587817567, + "learning_rate": 0.00012513000443139112, + "loss": 0.79, + "step": 546 + }, + { + "epoch": 0.4376, + "grad_norm": 0.5091447721592515, + "learning_rate": 0.00012487903113640337, + "loss": 0.7629, + "step": 547 + }, + { + "epoch": 0.4384, + "grad_norm": 0.7851193524554307, + "learning_rate": 0.00012462789068320017, + "loss": 0.9942, + "step": 548 + }, + { + "epoch": 0.4392, + "grad_norm": 0.4627528445492815, + "learning_rate": 0.00012437658475915377, + "loss": 0.8194, + "step": 549 + }, + { + "epoch": 0.44, + "grad_norm": 0.5108947689777327, + "learning_rate": 0.00012412511505274844, + "loss": 0.8145, + "step": 550 + }, + { + "epoch": 0.4408, + "grad_norm": 0.4296843232450707, + "learning_rate": 0.00012387348325356874, + "loss": 0.7779, + "step": 551 + }, + { + "epoch": 0.4416, + "grad_norm": 0.5198850742422576, + "learning_rate": 0.00012362169105228826, + "loss": 0.7927, + "step": 552 + }, + { + "epoch": 0.4424, + "grad_norm": 0.44985237069445677, + "learning_rate": 0.00012336974014065844, + "loss": 0.7671, + "step": 553 + }, + { + "epoch": 0.4432, + "grad_norm": 0.44158884644247104, + "learning_rate": 0.000123117632211497, + "loss": 0.6889, + "step": 554 + }, + { + "epoch": 0.444, + "grad_norm": 0.4265756896029439, + "learning_rate": 0.00012286536895867654, + "loss": 0.739, + "step": 555 + }, + { + "epoch": 0.4448, + "grad_norm": 0.5390316290711439, + "learning_rate": 0.00012261295207711346, + "loss": 0.8385, + "step": 556 + }, + { + "epoch": 0.4456, + "grad_norm": 0.73606467368334, + "learning_rate": 0.00012236038326275626, + "loss": 0.9169, + "step": 557 + }, + { + "epoch": 0.4464, + "grad_norm": 0.5547510346684561, + "learning_rate": 0.0001221076642125742, + "loss": 0.8029, + "step": 558 + }, + { + "epoch": 0.4472, + "grad_norm": 0.6157708117641697, + "learning_rate": 0.00012185479662454595, + "loss": 0.8162, + "step": 559 + }, + { + "epoch": 0.448, + "grad_norm": 0.4592158639221032, + "learning_rate": 0.00012160178219764837, + "loss": 0.8314, + "step": 560 + }, + { + "epoch": 0.4488, + "grad_norm": 0.5301100515025033, + "learning_rate": 0.00012134862263184467, + "loss": 0.834, + "step": 561 + }, + { + "epoch": 0.4496, + "grad_norm": 0.48034266500540934, + "learning_rate": 0.00012109531962807332, + "loss": 0.8387, + "step": 562 + }, + { + "epoch": 0.4504, + "grad_norm": 0.5807002869606578, + "learning_rate": 0.00012084187488823657, + "loss": 0.806, + "step": 563 + }, + { + "epoch": 0.4512, + "grad_norm": 0.48886470315743324, + "learning_rate": 0.00012058829011518896, + "loss": 0.8138, + "step": 564 + }, + { + "epoch": 0.452, + "grad_norm": 0.6142185112064632, + "learning_rate": 0.00012033456701272576, + "loss": 0.8387, + "step": 565 + }, + { + "epoch": 0.4528, + "grad_norm": 0.4425976264319214, + "learning_rate": 0.00012008070728557186, + "loss": 0.8137, + "step": 566 + }, + { + "epoch": 0.4536, + "grad_norm": 0.46162394263837786, + "learning_rate": 0.00011982671263936995, + "loss": 0.7112, + "step": 567 + }, + { + "epoch": 0.4544, + "grad_norm": 0.5274038551293228, + "learning_rate": 0.00011957258478066931, + "loss": 0.8424, + "step": 568 + }, + { + "epoch": 0.4552, + "grad_norm": 0.5630999009119136, + "learning_rate": 0.00011931832541691418, + "loss": 0.8762, + "step": 569 + }, + { + "epoch": 0.456, + "grad_norm": 0.5849020792876066, + "learning_rate": 0.00011906393625643244, + "loss": 0.7851, + "step": 570 + }, + { + "epoch": 0.4568, + "grad_norm": 0.4198420126887951, + "learning_rate": 0.00011880941900842397, + "loss": 0.7423, + "step": 571 + }, + { + "epoch": 0.4576, + "grad_norm": 0.5912880602623347, + "learning_rate": 0.00011855477538294935, + "loss": 0.7853, + "step": 572 + }, + { + "epoch": 0.4584, + "grad_norm": 0.39988609255560875, + "learning_rate": 0.00011830000709091815, + "loss": 0.7304, + "step": 573 + }, + { + "epoch": 0.4592, + "grad_norm": 0.40463911888659115, + "learning_rate": 0.00011804511584407763, + "loss": 0.7231, + "step": 574 + }, + { + "epoch": 0.46, + "grad_norm": 0.4796130771098071, + "learning_rate": 0.0001177901033550012, + "loss": 0.8143, + "step": 575 + }, + { + "epoch": 0.4608, + "grad_norm": 0.6433056888849106, + "learning_rate": 0.00011753497133707679, + "loss": 1.0016, + "step": 576 + }, + { + "epoch": 0.4616, + "grad_norm": 0.4127734266327962, + "learning_rate": 0.00011727972150449544, + "loss": 0.7422, + "step": 577 + }, + { + "epoch": 0.4624, + "grad_norm": 0.44914437089719506, + "learning_rate": 0.00011702435557223987, + "loss": 0.7655, + "step": 578 + }, + { + "epoch": 0.4632, + "grad_norm": 0.6011966541551302, + "learning_rate": 0.00011676887525607271, + "loss": 0.8456, + "step": 579 + }, + { + "epoch": 0.464, + "grad_norm": 0.3849250338300349, + "learning_rate": 0.00011651328227252517, + "loss": 0.6709, + "step": 580 + }, + { + "epoch": 0.4648, + "grad_norm": 0.5141797468340683, + "learning_rate": 0.00011625757833888551, + "loss": 0.8416, + "step": 581 + }, + { + "epoch": 0.4656, + "grad_norm": 0.445264622635813, + "learning_rate": 0.00011600176517318741, + "loss": 0.7007, + "step": 582 + }, + { + "epoch": 0.4664, + "grad_norm": 0.5825263970937105, + "learning_rate": 0.0001157458444941984, + "loss": 0.8128, + "step": 583 + }, + { + "epoch": 0.4672, + "grad_norm": 0.4971690811475642, + "learning_rate": 0.00011548981802140848, + "loss": 0.7037, + "step": 584 + }, + { + "epoch": 0.468, + "grad_norm": 0.4421032507562734, + "learning_rate": 0.00011523368747501839, + "loss": 0.7006, + "step": 585 + }, + { + "epoch": 0.4688, + "grad_norm": 0.4505115695438612, + "learning_rate": 0.00011497745457592816, + "loss": 0.7256, + "step": 586 + }, + { + "epoch": 0.4696, + "grad_norm": 0.5301309168789724, + "learning_rate": 0.00011472112104572547, + "loss": 0.8689, + "step": 587 + }, + { + "epoch": 0.4704, + "grad_norm": 0.6159699802495321, + "learning_rate": 0.00011446468860667421, + "loss": 0.9241, + "step": 588 + }, + { + "epoch": 0.4712, + "grad_norm": 0.48592950799355167, + "learning_rate": 0.0001142081589817027, + "loss": 0.7434, + "step": 589 + }, + { + "epoch": 0.472, + "grad_norm": 0.47418559732631554, + "learning_rate": 0.00011395153389439233, + "loss": 0.7166, + "step": 590 + }, + { + "epoch": 0.4728, + "grad_norm": 0.41718431814920304, + "learning_rate": 0.00011369481506896582, + "loss": 0.7668, + "step": 591 + }, + { + "epoch": 0.4736, + "grad_norm": 0.4155365826767906, + "learning_rate": 0.00011343800423027582, + "loss": 0.7643, + "step": 592 + }, + { + "epoch": 0.4744, + "grad_norm": 0.4400801636486908, + "learning_rate": 0.00011318110310379301, + "loss": 0.7299, + "step": 593 + }, + { + "epoch": 0.4752, + "grad_norm": 0.44501033272795804, + "learning_rate": 0.0001129241134155949, + "loss": 0.6985, + "step": 594 + }, + { + "epoch": 0.476, + "grad_norm": 0.5296054918048602, + "learning_rate": 0.00011266703689235394, + "loss": 0.7756, + "step": 595 + }, + { + "epoch": 0.4768, + "grad_norm": 0.7416615624398768, + "learning_rate": 0.00011240987526132594, + "loss": 0.9492, + "step": 596 + }, + { + "epoch": 0.4776, + "grad_norm": 0.5162215630640894, + "learning_rate": 0.00011215263025033869, + "loss": 0.8065, + "step": 597 + }, + { + "epoch": 0.4784, + "grad_norm": 0.5324339114918692, + "learning_rate": 0.00011189530358778005, + "loss": 0.769, + "step": 598 + }, + { + "epoch": 0.4792, + "grad_norm": 0.6489662996797839, + "learning_rate": 0.00011163789700258655, + "loss": 0.8399, + "step": 599 + }, + { + "epoch": 0.48, + "grad_norm": 0.5316210959628527, + "learning_rate": 0.00011138041222423177, + "loss": 0.8054, + "step": 600 + }, + { + "epoch": 0.4808, + "grad_norm": 0.5450723157158529, + "learning_rate": 0.00011112285098271451, + "loss": 0.7204, + "step": 601 + }, + { + "epoch": 0.4816, + "grad_norm": 0.6653921394791508, + "learning_rate": 0.00011086521500854745, + "loss": 0.952, + "step": 602 + }, + { + "epoch": 0.4824, + "grad_norm": 0.42758237423704815, + "learning_rate": 0.00011060750603274535, + "loss": 0.7746, + "step": 603 + }, + { + "epoch": 0.4832, + "grad_norm": 0.37970567907221614, + "learning_rate": 0.00011034972578681338, + "loss": 0.6783, + "step": 604 + }, + { + "epoch": 0.484, + "grad_norm": 0.5306101545577548, + "learning_rate": 0.00011009187600273566, + "loss": 0.7251, + "step": 605 + }, + { + "epoch": 0.4848, + "grad_norm": 0.4879322277705361, + "learning_rate": 0.00010983395841296348, + "loss": 0.7993, + "step": 606 + }, + { + "epoch": 0.4856, + "grad_norm": 0.5860036003574343, + "learning_rate": 0.00010957597475040373, + "loss": 0.8047, + "step": 607 + }, + { + "epoch": 0.4864, + "grad_norm": 0.5409230231224406, + "learning_rate": 0.00010931792674840718, + "loss": 0.914, + "step": 608 + }, + { + "epoch": 0.4872, + "grad_norm": 0.5180729089295505, + "learning_rate": 0.00010905981614075693, + "loss": 0.712, + "step": 609 + }, + { + "epoch": 0.488, + "grad_norm": 0.4638419079807667, + "learning_rate": 0.00010880164466165674, + "loss": 0.7917, + "step": 610 + }, + { + "epoch": 0.4888, + "grad_norm": 0.498367616853681, + "learning_rate": 0.00010854341404571928, + "loss": 0.7935, + "step": 611 + }, + { + "epoch": 0.4896, + "grad_norm": 0.5020078746326063, + "learning_rate": 0.00010828512602795462, + "loss": 0.8474, + "step": 612 + }, + { + "epoch": 0.4904, + "grad_norm": 0.49792991575536666, + "learning_rate": 0.00010802678234375851, + "loss": 0.7891, + "step": 613 + }, + { + "epoch": 0.4912, + "grad_norm": 0.42927436111692624, + "learning_rate": 0.00010776838472890065, + "loss": 0.7354, + "step": 614 + }, + { + "epoch": 0.492, + "grad_norm": 0.6189941094998067, + "learning_rate": 0.0001075099349195131, + "loss": 0.9778, + "step": 615 + }, + { + "epoch": 0.4928, + "grad_norm": 0.535158019745401, + "learning_rate": 0.00010725143465207867, + "loss": 0.8887, + "step": 616 + }, + { + "epoch": 0.4936, + "grad_norm": 0.5421336286292466, + "learning_rate": 0.00010699288566341914, + "loss": 0.726, + "step": 617 + }, + { + "epoch": 0.4944, + "grad_norm": 0.4774234587003961, + "learning_rate": 0.00010673428969068364, + "loss": 0.8258, + "step": 618 + }, + { + "epoch": 0.4952, + "grad_norm": 0.4393486984262681, + "learning_rate": 0.000106475648471337, + "loss": 0.8249, + "step": 619 + }, + { + "epoch": 0.496, + "grad_norm": 0.5168133456503567, + "learning_rate": 0.00010621696374314807, + "loss": 0.7769, + "step": 620 + }, + { + "epoch": 0.4968, + "grad_norm": 0.4805966027030343, + "learning_rate": 0.00010595823724417795, + "loss": 0.7747, + "step": 621 + }, + { + "epoch": 0.4976, + "grad_norm": 0.36572368436725333, + "learning_rate": 0.00010569947071276847, + "loss": 0.712, + "step": 622 + }, + { + "epoch": 0.4984, + "grad_norm": 0.4446834682288979, + "learning_rate": 0.00010544066588753044, + "loss": 0.7308, + "step": 623 + }, + { + "epoch": 0.4992, + "grad_norm": 0.5556975393599263, + "learning_rate": 0.00010518182450733186, + "loss": 0.7701, + "step": 624 + }, + { + "epoch": 0.5, + "grad_norm": 0.4251983124162712, + "learning_rate": 0.00010492294831128641, + "loss": 0.7161, + "step": 625 + }, + { + "epoch": 0.5008, + "grad_norm": 0.6331971990969477, + "learning_rate": 0.00010466403903874176, + "loss": 0.88, + "step": 626 + }, + { + "epoch": 0.5016, + "grad_norm": 0.5519027354901856, + "learning_rate": 0.00010440509842926767, + "loss": 0.7777, + "step": 627 + }, + { + "epoch": 0.5024, + "grad_norm": 0.423190616566324, + "learning_rate": 0.00010414612822264455, + "loss": 0.6877, + "step": 628 + }, + { + "epoch": 0.5032, + "grad_norm": 0.5503702795446749, + "learning_rate": 0.00010388713015885161, + "loss": 0.8817, + "step": 629 + }, + { + "epoch": 0.504, + "grad_norm": 0.5388373266987113, + "learning_rate": 0.00010362810597805526, + "loss": 0.8474, + "step": 630 + }, + { + "epoch": 0.5048, + "grad_norm": 0.6012561415901527, + "learning_rate": 0.00010336905742059742, + "loss": 0.8188, + "step": 631 + }, + { + "epoch": 0.5056, + "grad_norm": 0.6950778812475392, + "learning_rate": 0.0001031099862269837, + "loss": 0.9335, + "step": 632 + }, + { + "epoch": 0.5064, + "grad_norm": 0.5187122891657171, + "learning_rate": 0.0001028508941378719, + "loss": 0.801, + "step": 633 + }, + { + "epoch": 0.5072, + "grad_norm": 0.5667565102338985, + "learning_rate": 0.00010259178289406011, + "loss": 0.8518, + "step": 634 + }, + { + "epoch": 0.508, + "grad_norm": 0.44921171437003526, + "learning_rate": 0.00010233265423647523, + "loss": 0.727, + "step": 635 + }, + { + "epoch": 0.5088, + "grad_norm": 0.44041090396110116, + "learning_rate": 0.00010207350990616107, + "loss": 0.7409, + "step": 636 + }, + { + "epoch": 0.5096, + "grad_norm": 0.4635285462260514, + "learning_rate": 0.00010181435164426676, + "loss": 0.8113, + "step": 637 + }, + { + "epoch": 0.5104, + "grad_norm": 0.4351640872935251, + "learning_rate": 0.0001015551811920351, + "loss": 0.684, + "step": 638 + }, + { + "epoch": 0.5112, + "grad_norm": 0.7346651886825838, + "learning_rate": 0.00010129600029079072, + "loss": 0.8041, + "step": 639 + }, + { + "epoch": 0.512, + "grad_norm": 0.43179688684376, + "learning_rate": 0.00010103681068192845, + "loss": 0.7608, + "step": 640 + }, + { + "epoch": 0.5128, + "grad_norm": 0.44408111726364724, + "learning_rate": 0.00010077761410690172, + "loss": 0.6759, + "step": 641 + }, + { + "epoch": 0.5136, + "grad_norm": 0.48160792171232253, + "learning_rate": 0.00010051841230721065, + "loss": 0.7908, + "step": 642 + }, + { + "epoch": 0.5144, + "grad_norm": 0.6539498144594257, + "learning_rate": 0.00010025920702439051, + "loss": 0.8314, + "step": 643 + }, + { + "epoch": 0.5152, + "grad_norm": 0.44770276471829223, + "learning_rate": 0.0001, + "loss": 0.7071, + "step": 644 + }, + { + "epoch": 0.516, + "grad_norm": 0.5152156674652172, + "learning_rate": 9.97407929756095e-05, + "loss": 0.8245, + "step": 645 + }, + { + "epoch": 0.5168, + "grad_norm": 0.624011380930772, + "learning_rate": 9.948158769278939e-05, + "loss": 0.7836, + "step": 646 + }, + { + "epoch": 0.5176, + "grad_norm": 0.56835724540912, + "learning_rate": 9.92223858930983e-05, + "loss": 0.7877, + "step": 647 + }, + { + "epoch": 0.5184, + "grad_norm": 0.4897329961960285, + "learning_rate": 9.896318931807155e-05, + "loss": 0.8135, + "step": 648 + }, + { + "epoch": 0.5192, + "grad_norm": 0.49578090759811055, + "learning_rate": 9.870399970920932e-05, + "loss": 0.8754, + "step": 649 + }, + { + "epoch": 0.52, + "grad_norm": 0.48731142631830265, + "learning_rate": 9.844481880796491e-05, + "loss": 0.7789, + "step": 650 + }, + { + "epoch": 0.5208, + "grad_norm": 0.5673459149137637, + "learning_rate": 9.818564835573323e-05, + "loss": 0.8433, + "step": 651 + }, + { + "epoch": 0.5216, + "grad_norm": 0.6188075664323361, + "learning_rate": 9.792649009383899e-05, + "loss": 0.8938, + "step": 652 + }, + { + "epoch": 0.5224, + "grad_norm": 0.46873132471454837, + "learning_rate": 9.766734576352478e-05, + "loss": 0.8145, + "step": 653 + }, + { + "epoch": 0.5232, + "grad_norm": 0.586928798556376, + "learning_rate": 9.740821710593989e-05, + "loss": 0.8484, + "step": 654 + }, + { + "epoch": 0.524, + "grad_norm": 0.5255526240068161, + "learning_rate": 9.714910586212816e-05, + "loss": 0.7714, + "step": 655 + }, + { + "epoch": 0.5248, + "grad_norm": 0.5059476711446602, + "learning_rate": 9.689001377301633e-05, + "loss": 0.8066, + "step": 656 + }, + { + "epoch": 0.5256, + "grad_norm": 0.49582010272907506, + "learning_rate": 9.663094257940258e-05, + "loss": 0.7973, + "step": 657 + }, + { + "epoch": 0.5264, + "grad_norm": 0.4372289417856676, + "learning_rate": 9.637189402194476e-05, + "loss": 0.6713, + "step": 658 + }, + { + "epoch": 0.5272, + "grad_norm": 0.5851419845977731, + "learning_rate": 9.611286984114841e-05, + "loss": 0.8903, + "step": 659 + }, + { + "epoch": 0.528, + "grad_norm": 0.6412073563576178, + "learning_rate": 9.585387177735547e-05, + "loss": 0.8629, + "step": 660 + }, + { + "epoch": 0.5288, + "grad_norm": 0.5336461120146851, + "learning_rate": 9.559490157073236e-05, + "loss": 0.7686, + "step": 661 + }, + { + "epoch": 0.5296, + "grad_norm": 0.5473242230872678, + "learning_rate": 9.533596096125825e-05, + "loss": 0.8852, + "step": 662 + }, + { + "epoch": 0.5304, + "grad_norm": 0.4835820367008371, + "learning_rate": 9.507705168871358e-05, + "loss": 0.7318, + "step": 663 + }, + { + "epoch": 0.5312, + "grad_norm": 0.5887152325895428, + "learning_rate": 9.481817549266817e-05, + "loss": 0.888, + "step": 664 + }, + { + "epoch": 0.532, + "grad_norm": 0.4813882793581241, + "learning_rate": 9.455933411246958e-05, + "loss": 0.8032, + "step": 665 + }, + { + "epoch": 0.5328, + "grad_norm": 0.5416113889145342, + "learning_rate": 9.430052928723153e-05, + "loss": 0.7934, + "step": 666 + }, + { + "epoch": 0.5336, + "grad_norm": 0.4783917145856643, + "learning_rate": 9.404176275582208e-05, + "loss": 0.7595, + "step": 667 + }, + { + "epoch": 0.5344, + "grad_norm": 0.41668235251938335, + "learning_rate": 9.378303625685195e-05, + "loss": 0.7914, + "step": 668 + }, + { + "epoch": 0.5352, + "grad_norm": 0.505696911444653, + "learning_rate": 9.352435152866298e-05, + "loss": 0.722, + "step": 669 + }, + { + "epoch": 0.536, + "grad_norm": 0.42344148579580215, + "learning_rate": 9.326571030931637e-05, + "loss": 0.7278, + "step": 670 + }, + { + "epoch": 0.5368, + "grad_norm": 0.5015991421348351, + "learning_rate": 9.300711433658087e-05, + "loss": 0.869, + "step": 671 + }, + { + "epoch": 0.5376, + "grad_norm": 0.4885835248231759, + "learning_rate": 9.274856534792138e-05, + "loss": 0.8106, + "step": 672 + }, + { + "epoch": 0.5384, + "grad_norm": 0.4210617527552405, + "learning_rate": 9.249006508048694e-05, + "loss": 0.7295, + "step": 673 + }, + { + "epoch": 0.5392, + "grad_norm": 0.4563666120496189, + "learning_rate": 9.223161527109937e-05, + "loss": 0.7539, + "step": 674 + }, + { + "epoch": 0.54, + "grad_norm": 0.6281822494421835, + "learning_rate": 9.197321765624152e-05, + "loss": 0.8547, + "step": 675 + }, + { + "epoch": 0.5408, + "grad_norm": 0.5227123231829046, + "learning_rate": 9.171487397204539e-05, + "loss": 0.7135, + "step": 676 + }, + { + "epoch": 0.5416, + "grad_norm": 0.5436077236107482, + "learning_rate": 9.145658595428074e-05, + "loss": 0.8205, + "step": 677 + }, + { + "epoch": 0.5424, + "grad_norm": 0.6576452598843193, + "learning_rate": 9.119835533834331e-05, + "loss": 0.7732, + "step": 678 + }, + { + "epoch": 0.5432, + "grad_norm": 0.5460176922167431, + "learning_rate": 9.09401838592431e-05, + "loss": 0.8402, + "step": 679 + }, + { + "epoch": 0.544, + "grad_norm": 0.5169191615165644, + "learning_rate": 9.068207325159284e-05, + "loss": 0.715, + "step": 680 + }, + { + "epoch": 0.5448, + "grad_norm": 0.4735339701945484, + "learning_rate": 9.04240252495963e-05, + "loss": 0.797, + "step": 681 + }, + { + "epoch": 0.5456, + "grad_norm": 0.583929608657358, + "learning_rate": 9.016604158703654e-05, + "loss": 0.8244, + "step": 682 + }, + { + "epoch": 0.5464, + "grad_norm": 0.5857100895089427, + "learning_rate": 8.990812399726435e-05, + "loss": 0.8555, + "step": 683 + }, + { + "epoch": 0.5472, + "grad_norm": 0.4287227609471326, + "learning_rate": 8.965027421318665e-05, + "loss": 0.7004, + "step": 684 + }, + { + "epoch": 0.548, + "grad_norm": 0.5591560903597864, + "learning_rate": 8.939249396725467e-05, + "loss": 0.8802, + "step": 685 + }, + { + "epoch": 0.5488, + "grad_norm": 0.5709463423227509, + "learning_rate": 8.913478499145254e-05, + "loss": 0.9058, + "step": 686 + }, + { + "epoch": 0.5496, + "grad_norm": 0.47814781051246497, + "learning_rate": 8.887714901728551e-05, + "loss": 0.7459, + "step": 687 + }, + { + "epoch": 0.5504, + "grad_norm": 0.3991936136993612, + "learning_rate": 8.861958777576827e-05, + "loss": 0.6433, + "step": 688 + }, + { + "epoch": 0.5512, + "grad_norm": 0.6474370255263097, + "learning_rate": 8.836210299741346e-05, + "loss": 0.9395, + "step": 689 + }, + { + "epoch": 0.552, + "grad_norm": 0.47354372366850817, + "learning_rate": 8.810469641222001e-05, + "loss": 0.7087, + "step": 690 + }, + { + "epoch": 0.5528, + "grad_norm": 0.5204739940092096, + "learning_rate": 8.784736974966135e-05, + "loss": 0.7485, + "step": 691 + }, + { + "epoch": 0.5536, + "grad_norm": 0.4024037903604234, + "learning_rate": 8.759012473867407e-05, + "loss": 0.6583, + "step": 692 + }, + { + "epoch": 0.5544, + "grad_norm": 0.4328957613797475, + "learning_rate": 8.733296310764611e-05, + "loss": 0.7733, + "step": 693 + }, + { + "epoch": 0.5552, + "grad_norm": 0.5988603496036152, + "learning_rate": 8.707588658440511e-05, + "loss": 0.8826, + "step": 694 + }, + { + "epoch": 0.556, + "grad_norm": 0.5959098740282872, + "learning_rate": 8.6818896896207e-05, + "loss": 0.7896, + "step": 695 + }, + { + "epoch": 0.5568, + "grad_norm": 0.5481226538243807, + "learning_rate": 8.656199576972423e-05, + "loss": 0.7953, + "step": 696 + }, + { + "epoch": 0.5576, + "grad_norm": 0.4418655243673763, + "learning_rate": 8.63051849310342e-05, + "loss": 0.7368, + "step": 697 + }, + { + "epoch": 0.5584, + "grad_norm": 0.5465076665795701, + "learning_rate": 8.604846610560771e-05, + "loss": 0.7109, + "step": 698 + }, + { + "epoch": 0.5592, + "grad_norm": 0.43624854053643497, + "learning_rate": 8.579184101829734e-05, + "loss": 0.7559, + "step": 699 + }, + { + "epoch": 0.56, + "grad_norm": 0.5818695364312464, + "learning_rate": 8.553531139332582e-05, + "loss": 0.857, + "step": 700 + }, + { + "epoch": 0.5608, + "grad_norm": 0.5649071509926393, + "learning_rate": 8.527887895427454e-05, + "loss": 0.7932, + "step": 701 + }, + { + "epoch": 0.5616, + "grad_norm": 0.5352732305024815, + "learning_rate": 8.502254542407186e-05, + "loss": 0.801, + "step": 702 + }, + { + "epoch": 0.5624, + "grad_norm": 0.5599173239277646, + "learning_rate": 8.476631252498162e-05, + "loss": 0.8714, + "step": 703 + }, + { + "epoch": 0.5632, + "grad_norm": 0.4900154560625499, + "learning_rate": 8.451018197859153e-05, + "loss": 0.7859, + "step": 704 + }, + { + "epoch": 0.564, + "grad_norm": 0.5502231586252436, + "learning_rate": 8.425415550580162e-05, + "loss": 0.8628, + "step": 705 + }, + { + "epoch": 0.5648, + "grad_norm": 0.6632543823507182, + "learning_rate": 8.399823482681262e-05, + "loss": 0.8263, + "step": 706 + }, + { + "epoch": 0.5656, + "grad_norm": 0.5520641901799846, + "learning_rate": 8.374242166111448e-05, + "loss": 0.8177, + "step": 707 + }, + { + "epoch": 0.5664, + "grad_norm": 0.513119056715211, + "learning_rate": 8.348671772747487e-05, + "loss": 0.773, + "step": 708 + }, + { + "epoch": 0.5672, + "grad_norm": 0.5116341203636527, + "learning_rate": 8.323112474392731e-05, + "loss": 0.7843, + "step": 709 + }, + { + "epoch": 0.568, + "grad_norm": 0.4503758921300252, + "learning_rate": 8.297564442776014e-05, + "loss": 0.7573, + "step": 710 + }, + { + "epoch": 0.5688, + "grad_norm": 0.4900627207577956, + "learning_rate": 8.272027849550457e-05, + "loss": 0.7897, + "step": 711 + }, + { + "epoch": 0.5696, + "grad_norm": 0.4703910310994137, + "learning_rate": 8.246502866292324e-05, + "loss": 0.8865, + "step": 712 + }, + { + "epoch": 0.5704, + "grad_norm": 0.5531414201362082, + "learning_rate": 8.220989664499878e-05, + "loss": 0.8001, + "step": 713 + }, + { + "epoch": 0.5712, + "grad_norm": 0.4457053862521107, + "learning_rate": 8.195488415592238e-05, + "loss": 0.7449, + "step": 714 + }, + { + "epoch": 0.572, + "grad_norm": 0.45835200222634626, + "learning_rate": 8.169999290908188e-05, + "loss": 0.7697, + "step": 715 + }, + { + "epoch": 0.5728, + "grad_norm": 0.5391822751157387, + "learning_rate": 8.144522461705067e-05, + "loss": 0.9577, + "step": 716 + }, + { + "epoch": 0.5736, + "grad_norm": 0.41121558299077177, + "learning_rate": 8.119058099157604e-05, + "loss": 0.698, + "step": 717 + }, + { + "epoch": 0.5744, + "grad_norm": 0.5643814352461478, + "learning_rate": 8.093606374356759e-05, + "loss": 0.735, + "step": 718 + }, + { + "epoch": 0.5752, + "grad_norm": 0.5568828460124492, + "learning_rate": 8.068167458308582e-05, + "loss": 0.9282, + "step": 719 + }, + { + "epoch": 0.576, + "grad_norm": 0.5539915368104986, + "learning_rate": 8.042741521933071e-05, + "loss": 0.8599, + "step": 720 + }, + { + "epoch": 0.5768, + "grad_norm": 0.45352224561414184, + "learning_rate": 8.017328736063006e-05, + "loss": 0.7158, + "step": 721 + }, + { + "epoch": 0.5776, + "grad_norm": 0.517710231897813, + "learning_rate": 7.991929271442817e-05, + "loss": 0.895, + "step": 722 + }, + { + "epoch": 0.5784, + "grad_norm": 0.4794346129983727, + "learning_rate": 7.966543298727425e-05, + "loss": 0.6596, + "step": 723 + }, + { + "epoch": 0.5792, + "grad_norm": 0.5452911706754874, + "learning_rate": 7.941170988481108e-05, + "loss": 0.7885, + "step": 724 + }, + { + "epoch": 0.58, + "grad_norm": 0.43227433392495757, + "learning_rate": 7.915812511176347e-05, + "loss": 0.744, + "step": 725 + }, + { + "epoch": 0.5808, + "grad_norm": 0.5783187900232767, + "learning_rate": 7.89046803719267e-05, + "loss": 0.9126, + "step": 726 + }, + { + "epoch": 0.5816, + "grad_norm": 0.6757420671981376, + "learning_rate": 7.865137736815535e-05, + "loss": 0.8696, + "step": 727 + }, + { + "epoch": 0.5824, + "grad_norm": 0.5289380766238764, + "learning_rate": 7.839821780235168e-05, + "loss": 0.8126, + "step": 728 + }, + { + "epoch": 0.5832, + "grad_norm": 0.45917046745303486, + "learning_rate": 7.814520337545406e-05, + "loss": 0.7065, + "step": 729 + }, + { + "epoch": 0.584, + "grad_norm": 0.5996754384273839, + "learning_rate": 7.789233578742582e-05, + "loss": 0.8266, + "step": 730 + }, + { + "epoch": 0.5848, + "grad_norm": 0.4390117214376881, + "learning_rate": 7.763961673724379e-05, + "loss": 0.6859, + "step": 731 + }, + { + "epoch": 0.5856, + "grad_norm": 0.5049988200186094, + "learning_rate": 7.738704792288655e-05, + "loss": 0.8494, + "step": 732 + }, + { + "epoch": 0.5864, + "grad_norm": 0.5542254634032855, + "learning_rate": 7.713463104132345e-05, + "loss": 0.7942, + "step": 733 + }, + { + "epoch": 0.5872, + "grad_norm": 0.471479053679602, + "learning_rate": 7.688236778850306e-05, + "loss": 0.7559, + "step": 734 + }, + { + "epoch": 0.588, + "grad_norm": 0.5412297686164421, + "learning_rate": 7.663025985934158e-05, + "loss": 0.8575, + "step": 735 + }, + { + "epoch": 0.5888, + "grad_norm": 0.47960063289980687, + "learning_rate": 7.637830894771175e-05, + "loss": 0.7282, + "step": 736 + }, + { + "epoch": 0.5896, + "grad_norm": 0.41780012519923804, + "learning_rate": 7.61265167464313e-05, + "loss": 0.7839, + "step": 737 + }, + { + "epoch": 0.5904, + "grad_norm": 0.5202860281173485, + "learning_rate": 7.587488494725157e-05, + "loss": 0.8104, + "step": 738 + }, + { + "epoch": 0.5912, + "grad_norm": 0.769126208089339, + "learning_rate": 7.562341524084623e-05, + "loss": 0.8103, + "step": 739 + }, + { + "epoch": 0.592, + "grad_norm": 0.5996068989135162, + "learning_rate": 7.537210931679987e-05, + "loss": 0.8862, + "step": 740 + }, + { + "epoch": 0.5928, + "grad_norm": 0.5094074554830536, + "learning_rate": 7.512096886359664e-05, + "loss": 0.7985, + "step": 741 + }, + { + "epoch": 0.5936, + "grad_norm": 0.5087296490744276, + "learning_rate": 7.48699955686089e-05, + "loss": 0.8255, + "step": 742 + }, + { + "epoch": 0.5944, + "grad_norm": 0.587661220344711, + "learning_rate": 7.461919111808595e-05, + "loss": 0.8471, + "step": 743 + }, + { + "epoch": 0.5952, + "grad_norm": 0.5185898319881128, + "learning_rate": 7.43685571971426e-05, + "loss": 0.7476, + "step": 744 + }, + { + "epoch": 0.596, + "grad_norm": 0.544469500243794, + "learning_rate": 7.411809548974792e-05, + "loss": 0.8783, + "step": 745 + }, + { + "epoch": 0.5968, + "grad_norm": 0.5254123005813613, + "learning_rate": 7.386780767871397e-05, + "loss": 0.8087, + "step": 746 + }, + { + "epoch": 0.5976, + "grad_norm": 0.4406116146332978, + "learning_rate": 7.361769544568425e-05, + "loss": 0.7286, + "step": 747 + }, + { + "epoch": 0.5984, + "grad_norm": 0.5028576842819465, + "learning_rate": 7.336776047112276e-05, + "loss": 0.733, + "step": 748 + }, + { + "epoch": 0.5992, + "grad_norm": 0.41551135895499197, + "learning_rate": 7.311800443430251e-05, + "loss": 0.6644, + "step": 749 + }, + { + "epoch": 0.6, + "grad_norm": 0.4696895218060643, + "learning_rate": 7.286842901329412e-05, + "loss": 0.7187, + "step": 750 + }, + { + "epoch": 0.6008, + "grad_norm": 0.41672726956612643, + "learning_rate": 7.26190358849548e-05, + "loss": 0.7584, + "step": 751 + }, + { + "epoch": 0.6016, + "grad_norm": 0.41873577517602656, + "learning_rate": 7.236982672491698e-05, + "loss": 0.7304, + "step": 752 + }, + { + "epoch": 0.6024, + "grad_norm": 0.4696259863617647, + "learning_rate": 7.212080320757695e-05, + "loss": 0.8508, + "step": 753 + }, + { + "epoch": 0.6032, + "grad_norm": 0.6143166042160803, + "learning_rate": 7.187196700608373e-05, + "loss": 0.9485, + "step": 754 + }, + { + "epoch": 0.604, + "grad_norm": 0.4613086902582385, + "learning_rate": 7.162331979232783e-05, + "loss": 0.7219, + "step": 755 + }, + { + "epoch": 0.6048, + "grad_norm": 0.39347765856491007, + "learning_rate": 7.137486323692995e-05, + "loss": 0.6624, + "step": 756 + }, + { + "epoch": 0.6056, + "grad_norm": 0.725989550678472, + "learning_rate": 7.112659900922976e-05, + "loss": 0.7544, + "step": 757 + }, + { + "epoch": 0.6064, + "grad_norm": 0.5415172592089366, + "learning_rate": 7.087852877727481e-05, + "loss": 0.7987, + "step": 758 + }, + { + "epoch": 0.6072, + "grad_norm": 0.5502217968259229, + "learning_rate": 7.06306542078091e-05, + "loss": 0.7951, + "step": 759 + }, + { + "epoch": 0.608, + "grad_norm": 0.3424383843607733, + "learning_rate": 7.038297696626206e-05, + "loss": 0.6563, + "step": 760 + }, + { + "epoch": 0.6088, + "grad_norm": 0.5428890927439577, + "learning_rate": 7.013549871673736e-05, + "loss": 0.8372, + "step": 761 + }, + { + "epoch": 0.6096, + "grad_norm": 0.4765435615003908, + "learning_rate": 6.988822112200156e-05, + "loss": 0.7171, + "step": 762 + }, + { + "epoch": 0.6104, + "grad_norm": 0.44134979864312174, + "learning_rate": 6.964114584347316e-05, + "loss": 0.6679, + "step": 763 + }, + { + "epoch": 0.6112, + "grad_norm": 0.6268717622111956, + "learning_rate": 6.939427454121128e-05, + "loss": 0.9558, + "step": 764 + }, + { + "epoch": 0.612, + "grad_norm": 0.5844528780224308, + "learning_rate": 6.914760887390452e-05, + "loss": 0.7697, + "step": 765 + }, + { + "epoch": 0.6128, + "grad_norm": 0.5941143028746608, + "learning_rate": 6.890115049885994e-05, + "loss": 0.7959, + "step": 766 + }, + { + "epoch": 0.6136, + "grad_norm": 0.46393220370096555, + "learning_rate": 6.865490107199181e-05, + "loss": 0.7649, + "step": 767 + }, + { + "epoch": 0.6144, + "grad_norm": 0.4765025939352185, + "learning_rate": 6.84088622478104e-05, + "loss": 0.7156, + "step": 768 + }, + { + "epoch": 0.6152, + "grad_norm": 0.542382071248004, + "learning_rate": 6.816303567941112e-05, + "loss": 0.7514, + "step": 769 + }, + { + "epoch": 0.616, + "grad_norm": 0.5272328764131159, + "learning_rate": 6.791742301846326e-05, + "loss": 0.8734, + "step": 770 + }, + { + "epoch": 0.6168, + "grad_norm": 0.49204765303119435, + "learning_rate": 6.767202591519875e-05, + "loss": 0.7192, + "step": 771 + }, + { + "epoch": 0.6176, + "grad_norm": 0.5490594184428006, + "learning_rate": 6.742684601840141e-05, + "loss": 0.8106, + "step": 772 + }, + { + "epoch": 0.6184, + "grad_norm": 0.4904484145501484, + "learning_rate": 6.718188497539554e-05, + "loss": 0.7916, + "step": 773 + }, + { + "epoch": 0.6192, + "grad_norm": 0.67000629408774, + "learning_rate": 6.693714443203507e-05, + "loss": 0.7784, + "step": 774 + }, + { + "epoch": 0.62, + "grad_norm": 0.6480219038042772, + "learning_rate": 6.669262603269246e-05, + "loss": 0.7911, + "step": 775 + }, + { + "epoch": 0.6208, + "grad_norm": 0.6361964397959794, + "learning_rate": 6.644833142024751e-05, + "loss": 0.8337, + "step": 776 + }, + { + "epoch": 0.6216, + "grad_norm": 0.5253640742734401, + "learning_rate": 6.620426223607654e-05, + "loss": 0.754, + "step": 777 + }, + { + "epoch": 0.6224, + "grad_norm": 0.5162740009828527, + "learning_rate": 6.59604201200412e-05, + "loss": 0.7588, + "step": 778 + }, + { + "epoch": 0.6232, + "grad_norm": 0.5134935237825705, + "learning_rate": 6.571680671047749e-05, + "loss": 0.7404, + "step": 779 + }, + { + "epoch": 0.624, + "grad_norm": 0.5766140612672064, + "learning_rate": 6.547342364418481e-05, + "loss": 0.9184, + "step": 780 + }, + { + "epoch": 0.6248, + "grad_norm": 0.4663832143024057, + "learning_rate": 6.523027255641493e-05, + "loss": 0.762, + "step": 781 + }, + { + "epoch": 0.6256, + "grad_norm": 0.48911355189488026, + "learning_rate": 6.498735508086093e-05, + "loss": 0.7633, + "step": 782 + }, + { + "epoch": 0.6264, + "grad_norm": 0.6694129803671309, + "learning_rate": 6.474467284964634e-05, + "loss": 0.8236, + "step": 783 + }, + { + "epoch": 0.6272, + "grad_norm": 0.5189762766415502, + "learning_rate": 6.450222749331414e-05, + "loss": 0.7676, + "step": 784 + }, + { + "epoch": 0.628, + "grad_norm": 0.4161464231880787, + "learning_rate": 6.426002064081565e-05, + "loss": 0.6936, + "step": 785 + }, + { + "epoch": 0.6288, + "grad_norm": 0.4984293204783021, + "learning_rate": 6.40180539194999e-05, + "loss": 0.8314, + "step": 786 + }, + { + "epoch": 0.6296, + "grad_norm": 0.39903904223045994, + "learning_rate": 6.377632895510248e-05, + "loss": 0.7616, + "step": 787 + }, + { + "epoch": 0.6304, + "grad_norm": 0.43805121875966846, + "learning_rate": 6.35348473717345e-05, + "loss": 0.6689, + "step": 788 + }, + { + "epoch": 0.6312, + "grad_norm": 0.4895082281505452, + "learning_rate": 6.329361079187199e-05, + "loss": 0.7549, + "step": 789 + }, + { + "epoch": 0.632, + "grad_norm": 0.46576964895451856, + "learning_rate": 6.305262083634488e-05, + "loss": 0.7616, + "step": 790 + }, + { + "epoch": 0.6328, + "grad_norm": 0.4899997620883344, + "learning_rate": 6.281187912432587e-05, + "loss": 0.7636, + "step": 791 + }, + { + "epoch": 0.6336, + "grad_norm": 0.4278249266877664, + "learning_rate": 6.25713872733199e-05, + "loss": 0.6874, + "step": 792 + }, + { + "epoch": 0.6344, + "grad_norm": 0.4422267158793512, + "learning_rate": 6.233114689915316e-05, + "loss": 0.7535, + "step": 793 + }, + { + "epoch": 0.6352, + "grad_norm": 0.592090602720976, + "learning_rate": 6.209115961596208e-05, + "loss": 0.7782, + "step": 794 + }, + { + "epoch": 0.636, + "grad_norm": 0.6157235118768961, + "learning_rate": 6.18514270361827e-05, + "loss": 0.8226, + "step": 795 + }, + { + "epoch": 0.6368, + "grad_norm": 0.6727932260245815, + "learning_rate": 6.161195077053976e-05, + "loss": 0.8099, + "step": 796 + }, + { + "epoch": 0.6376, + "grad_norm": 0.5703922364388165, + "learning_rate": 6.13727324280358e-05, + "loss": 0.7134, + "step": 797 + }, + { + "epoch": 0.6384, + "grad_norm": 0.4149199530047408, + "learning_rate": 6.113377361594049e-05, + "loss": 0.7523, + "step": 798 + }, + { + "epoch": 0.6392, + "grad_norm": 0.42823391157851937, + "learning_rate": 6.08950759397797e-05, + "loss": 0.6743, + "step": 799 + }, + { + "epoch": 0.64, + "grad_norm": 0.9501696638008198, + "learning_rate": 6.065664100332478e-05, + "loss": 0.7228, + "step": 800 + }, + { + "epoch": 0.6408, + "grad_norm": 0.46931384274994215, + "learning_rate": 6.0418470408581774e-05, + "loss": 0.7168, + "step": 801 + }, + { + "epoch": 0.6416, + "grad_norm": 0.44711369177258536, + "learning_rate": 6.018056575578075e-05, + "loss": 0.8109, + "step": 802 + }, + { + "epoch": 0.6424, + "grad_norm": 0.44288117617273876, + "learning_rate": 5.9942928643364724e-05, + "loss": 0.7881, + "step": 803 + }, + { + "epoch": 0.6432, + "grad_norm": 0.6499248586366762, + "learning_rate": 5.970556066797941e-05, + "loss": 0.8619, + "step": 804 + }, + { + "epoch": 0.644, + "grad_norm": 0.41703322618341265, + "learning_rate": 5.946846342446214e-05, + "loss": 0.7083, + "step": 805 + }, + { + "epoch": 0.6448, + "grad_norm": 0.5095449365168471, + "learning_rate": 5.923163850583113e-05, + "loss": 0.756, + "step": 806 + }, + { + "epoch": 0.6456, + "grad_norm": 0.41427632543743154, + "learning_rate": 5.899508750327501e-05, + "loss": 0.7486, + "step": 807 + }, + { + "epoch": 0.6464, + "grad_norm": 0.6832171979501523, + "learning_rate": 5.875881200614207e-05, + "loss": 0.9427, + "step": 808 + }, + { + "epoch": 0.6472, + "grad_norm": 0.44443690827639093, + "learning_rate": 5.8522813601929324e-05, + "loss": 0.7332, + "step": 809 + }, + { + "epoch": 0.648, + "grad_norm": 0.48159945658139397, + "learning_rate": 5.828709387627218e-05, + "loss": 0.7668, + "step": 810 + }, + { + "epoch": 0.6488, + "grad_norm": 0.4923218317336922, + "learning_rate": 5.80516544129337e-05, + "loss": 0.7557, + "step": 811 + }, + { + "epoch": 0.6496, + "grad_norm": 0.3999444938598147, + "learning_rate": 5.781649679379378e-05, + "loss": 0.6873, + "step": 812 + }, + { + "epoch": 0.6504, + "grad_norm": 0.5202580334267757, + "learning_rate": 5.758162259883867e-05, + "loss": 0.7289, + "step": 813 + }, + { + "epoch": 0.6512, + "grad_norm": 0.526995723483989, + "learning_rate": 5.73470334061505e-05, + "loss": 0.8378, + "step": 814 + }, + { + "epoch": 0.652, + "grad_norm": 0.5259009754185895, + "learning_rate": 5.7112730791896207e-05, + "loss": 0.8784, + "step": 815 + }, + { + "epoch": 0.6528, + "grad_norm": 0.4148487564467498, + "learning_rate": 5.687871633031754e-05, + "loss": 0.7354, + "step": 816 + }, + { + "epoch": 0.6536, + "grad_norm": 0.3892718151581878, + "learning_rate": 5.664499159372017e-05, + "loss": 0.65, + "step": 817 + }, + { + "epoch": 0.6544, + "grad_norm": 0.5008382317275297, + "learning_rate": 5.6411558152462894e-05, + "loss": 0.7473, + "step": 818 + }, + { + "epoch": 0.6552, + "grad_norm": 0.5651862447253267, + "learning_rate": 5.617841757494762e-05, + "loss": 0.8362, + "step": 819 + }, + { + "epoch": 0.656, + "grad_norm": 0.46058756838487114, + "learning_rate": 5.5945571427608526e-05, + "loss": 0.6934, + "step": 820 + }, + { + "epoch": 0.6568, + "grad_norm": 0.4993820774612727, + "learning_rate": 5.5713021274901335e-05, + "loss": 0.7971, + "step": 821 + }, + { + "epoch": 0.6576, + "grad_norm": 0.5434804356503512, + "learning_rate": 5.54807686792933e-05, + "loss": 0.798, + "step": 822 + }, + { + "epoch": 0.6584, + "grad_norm": 0.43500365960349463, + "learning_rate": 5.524881520125229e-05, + "loss": 0.6257, + "step": 823 + }, + { + "epoch": 0.6592, + "grad_norm": 0.49070277501854065, + "learning_rate": 5.501716239923642e-05, + "loss": 0.8306, + "step": 824 + }, + { + "epoch": 0.66, + "grad_norm": 0.5631963321595915, + "learning_rate": 5.4785811829683764e-05, + "loss": 0.8129, + "step": 825 + }, + { + "epoch": 0.6608, + "grad_norm": 0.5609443791431765, + "learning_rate": 5.4554765047001613e-05, + "loss": 0.7962, + "step": 826 + }, + { + "epoch": 0.6616, + "grad_norm": 0.3791295673912737, + "learning_rate": 5.432402360355615e-05, + "loss": 0.7109, + "step": 827 + }, + { + "epoch": 0.6624, + "grad_norm": 0.5859099014370488, + "learning_rate": 5.4093589049662175e-05, + "loss": 0.9052, + "step": 828 + }, + { + "epoch": 0.6632, + "grad_norm": 0.37561326410012924, + "learning_rate": 5.386346293357242e-05, + "loss": 0.6309, + "step": 829 + }, + { + "epoch": 0.664, + "grad_norm": 0.5266492046904344, + "learning_rate": 5.363364680146725e-05, + "loss": 0.7868, + "step": 830 + }, + { + "epoch": 0.6648, + "grad_norm": 0.5867416216649431, + "learning_rate": 5.3404142197444506e-05, + "loss": 0.8795, + "step": 831 + }, + { + "epoch": 0.6656, + "grad_norm": 0.4240560451003615, + "learning_rate": 5.31749506635086e-05, + "loss": 0.6787, + "step": 832 + }, + { + "epoch": 0.6664, + "grad_norm": 0.6062473435601824, + "learning_rate": 5.2946073739560706e-05, + "loss": 0.8306, + "step": 833 + }, + { + "epoch": 0.6672, + "grad_norm": 0.4980079779028096, + "learning_rate": 5.271751296338823e-05, + "loss": 0.7298, + "step": 834 + }, + { + "epoch": 0.668, + "grad_norm": 0.4661762480561208, + "learning_rate": 5.248926987065417e-05, + "loss": 0.7258, + "step": 835 + }, + { + "epoch": 0.6688, + "grad_norm": 0.5948880473411658, + "learning_rate": 5.226134599488728e-05, + "loss": 0.7926, + "step": 836 + }, + { + "epoch": 0.6696, + "grad_norm": 0.48975084576834266, + "learning_rate": 5.203374286747158e-05, + "loss": 0.7845, + "step": 837 + }, + { + "epoch": 0.6704, + "grad_norm": 0.5167217184044314, + "learning_rate": 5.180646201763577e-05, + "loss": 0.7723, + "step": 838 + }, + { + "epoch": 0.6712, + "grad_norm": 0.5239428466594733, + "learning_rate": 5.15795049724435e-05, + "loss": 0.7706, + "step": 839 + }, + { + "epoch": 0.672, + "grad_norm": 0.5806756500414043, + "learning_rate": 5.135287325678271e-05, + "loss": 0.8032, + "step": 840 + }, + { + "epoch": 0.6728, + "grad_norm": 0.4596292999499451, + "learning_rate": 5.112656839335543e-05, + "loss": 0.7199, + "step": 841 + }, + { + "epoch": 0.6736, + "grad_norm": 0.5596227848306364, + "learning_rate": 5.090059190266779e-05, + "loss": 0.8312, + "step": 842 + }, + { + "epoch": 0.6744, + "grad_norm": 0.5591963136625346, + "learning_rate": 5.0674945303019526e-05, + "loss": 0.8315, + "step": 843 + }, + { + "epoch": 0.6752, + "grad_norm": 0.4449170323794898, + "learning_rate": 5.0449630110493836e-05, + "loss": 0.6475, + "step": 844 + }, + { + "epoch": 0.676, + "grad_norm": 0.47705498742937347, + "learning_rate": 5.022464783894744e-05, + "loss": 0.8322, + "step": 845 + }, + { + "epoch": 0.6768, + "grad_norm": 0.4786649964050982, + "learning_rate": 5.000000000000002e-05, + "loss": 0.7813, + "step": 846 + }, + { + "epoch": 0.6776, + "grad_norm": 0.7301904758939219, + "learning_rate": 4.977568810302432e-05, + "loss": 0.8948, + "step": 847 + }, + { + "epoch": 0.6784, + "grad_norm": 0.43763461326643843, + "learning_rate": 4.955171365513603e-05, + "loss": 0.6743, + "step": 848 + }, + { + "epoch": 0.6792, + "grad_norm": 0.4314007227657756, + "learning_rate": 4.9328078161183464e-05, + "loss": 0.7339, + "step": 849 + }, + { + "epoch": 0.68, + "grad_norm": 0.4134202505028603, + "learning_rate": 4.9104783123737566e-05, + "loss": 0.6809, + "step": 850 + }, + { + "epoch": 0.6808, + "grad_norm": 0.5127035873811302, + "learning_rate": 4.88818300430819e-05, + "loss": 0.8072, + "step": 851 + }, + { + "epoch": 0.6816, + "grad_norm": 0.5157247141627943, + "learning_rate": 4.865922041720239e-05, + "loss": 0.6914, + "step": 852 + }, + { + "epoch": 0.6824, + "grad_norm": 0.5538438779175323, + "learning_rate": 4.843695574177737e-05, + "loss": 0.853, + "step": 853 + }, + { + "epoch": 0.6832, + "grad_norm": 0.5693373524521677, + "learning_rate": 4.821503751016746e-05, + "loss": 0.82, + "step": 854 + }, + { + "epoch": 0.684, + "grad_norm": 0.613013512189452, + "learning_rate": 4.7993467213405706e-05, + "loss": 0.8581, + "step": 855 + }, + { + "epoch": 0.6848, + "grad_norm": 0.5782273687721763, + "learning_rate": 4.777224634018732e-05, + "loss": 0.7864, + "step": 856 + }, + { + "epoch": 0.6856, + "grad_norm": 0.4832023140562939, + "learning_rate": 4.755137637685979e-05, + "loss": 0.7596, + "step": 857 + }, + { + "epoch": 0.6864, + "grad_norm": 0.5893373654741814, + "learning_rate": 4.733085880741301e-05, + "loss": 0.8659, + "step": 858 + }, + { + "epoch": 0.6872, + "grad_norm": 0.5728356489192235, + "learning_rate": 4.7110695113469085e-05, + "loss": 0.7982, + "step": 859 + }, + { + "epoch": 0.688, + "grad_norm": 0.4963888013123301, + "learning_rate": 4.689088677427249e-05, + "loss": 0.7811, + "step": 860 + }, + { + "epoch": 0.6888, + "grad_norm": 0.4593544644255751, + "learning_rate": 4.6671435266680216e-05, + "loss": 0.7239, + "step": 861 + }, + { + "epoch": 0.6896, + "grad_norm": 0.5292808478828301, + "learning_rate": 4.645234206515171e-05, + "loss": 0.777, + "step": 862 + }, + { + "epoch": 0.6904, + "grad_norm": 0.47716264961011856, + "learning_rate": 4.623360864173893e-05, + "loss": 0.7799, + "step": 863 + }, + { + "epoch": 0.6912, + "grad_norm": 0.48102047219338884, + "learning_rate": 4.6015236466076747e-05, + "loss": 0.7952, + "step": 864 + }, + { + "epoch": 0.692, + "grad_norm": 0.5930481959978581, + "learning_rate": 4.579722700537268e-05, + "loss": 0.7012, + "step": 865 + }, + { + "epoch": 0.6928, + "grad_norm": 0.4680186213844873, + "learning_rate": 4.5579581724397255e-05, + "loss": 0.7274, + "step": 866 + }, + { + "epoch": 0.6936, + "grad_norm": 0.481882307486294, + "learning_rate": 4.5362302085474254e-05, + "loss": 0.6557, + "step": 867 + }, + { + "epoch": 0.6944, + "grad_norm": 0.5636609433316219, + "learning_rate": 4.514538954847064e-05, + "loss": 0.7124, + "step": 868 + }, + { + "epoch": 0.6952, + "grad_norm": 0.42374086491731305, + "learning_rate": 4.492884557078688e-05, + "loss": 0.6853, + "step": 869 + }, + { + "epoch": 0.696, + "grad_norm": 0.5122385038575532, + "learning_rate": 4.471267160734731e-05, + "loss": 0.7265, + "step": 870 + }, + { + "epoch": 0.6968, + "grad_norm": 0.5590270489108338, + "learning_rate": 4.449686911058992e-05, + "loss": 0.7621, + "step": 871 + }, + { + "epoch": 0.6976, + "grad_norm": 0.4817673474442853, + "learning_rate": 4.428143953045717e-05, + "loss": 0.748, + "step": 872 + }, + { + "epoch": 0.6984, + "grad_norm": 0.44675247122412026, + "learning_rate": 4.406638431438576e-05, + "loss": 0.7411, + "step": 873 + }, + { + "epoch": 0.6992, + "grad_norm": 0.42361063787076775, + "learning_rate": 4.385170490729712e-05, + "loss": 0.6925, + "step": 874 + }, + { + "epoch": 0.7, + "grad_norm": 0.41144769791298424, + "learning_rate": 4.36374027515878e-05, + "loss": 0.6548, + "step": 875 + }, + { + "epoch": 0.7008, + "grad_norm": 0.47171016221244494, + "learning_rate": 4.342347928711953e-05, + "loss": 0.703, + "step": 876 + }, + { + "epoch": 0.7016, + "grad_norm": 0.6101698325485143, + "learning_rate": 4.320993595120969e-05, + "loss": 0.8228, + "step": 877 + }, + { + "epoch": 0.7024, + "grad_norm": 0.5041520888592116, + "learning_rate": 4.2996774178621736e-05, + "loss": 0.7677, + "step": 878 + }, + { + "epoch": 0.7032, + "grad_norm": 0.7814869692964698, + "learning_rate": 4.278399540155536e-05, + "loss": 0.683, + "step": 879 + }, + { + "epoch": 0.704, + "grad_norm": 0.46844930916310995, + "learning_rate": 4.257160104963696e-05, + "loss": 0.7473, + "step": 880 + }, + { + "epoch": 0.7048, + "grad_norm": 0.5959483088682741, + "learning_rate": 4.2359592549910145e-05, + "loss": 0.6458, + "step": 881 + }, + { + "epoch": 0.7056, + "grad_norm": 0.5063605399129267, + "learning_rate": 4.2147971326825966e-05, + "loss": 0.7905, + "step": 882 + }, + { + "epoch": 0.7064, + "grad_norm": 0.3658666952668689, + "learning_rate": 4.193673880223339e-05, + "loss": 0.6757, + "step": 883 + }, + { + "epoch": 0.7072, + "grad_norm": 0.4564275633578927, + "learning_rate": 4.172589639536991e-05, + "loss": 0.7749, + "step": 884 + }, + { + "epoch": 0.708, + "grad_norm": 0.5076235699518854, + "learning_rate": 4.1515445522851784e-05, + "loss": 0.7292, + "step": 885 + }, + { + "epoch": 0.7088, + "grad_norm": 0.46941124958967184, + "learning_rate": 4.130538759866457e-05, + "loss": 0.8078, + "step": 886 + }, + { + "epoch": 0.7096, + "grad_norm": 0.42706239756817554, + "learning_rate": 4.109572403415386e-05, + "loss": 0.7113, + "step": 887 + }, + { + "epoch": 0.7104, + "grad_norm": 0.6161008203576525, + "learning_rate": 4.088645623801534e-05, + "loss": 0.787, + "step": 888 + }, + { + "epoch": 0.7112, + "grad_norm": 0.8534797844851879, + "learning_rate": 4.0677585616285774e-05, + "loss": 0.8427, + "step": 889 + }, + { + "epoch": 0.712, + "grad_norm": 0.5520642001035455, + "learning_rate": 4.046911357233343e-05, + "loss": 0.6774, + "step": 890 + }, + { + "epoch": 0.7128, + "grad_norm": 0.4996239606791152, + "learning_rate": 4.026104150684835e-05, + "loss": 0.7721, + "step": 891 + }, + { + "epoch": 0.7136, + "grad_norm": 0.4880290230071659, + "learning_rate": 4.00533708178334e-05, + "loss": 0.7185, + "step": 892 + }, + { + "epoch": 0.7144, + "grad_norm": 0.37064325461692554, + "learning_rate": 3.984610290059467e-05, + "loss": 0.7046, + "step": 893 + }, + { + "epoch": 0.7152, + "grad_norm": 0.5034607956958921, + "learning_rate": 3.963923914773187e-05, + "loss": 0.7567, + "step": 894 + }, + { + "epoch": 0.716, + "grad_norm": 0.5465610376232676, + "learning_rate": 3.943278094912946e-05, + "loss": 0.7808, + "step": 895 + }, + { + "epoch": 0.7168, + "grad_norm": 0.4809947903038082, + "learning_rate": 3.922672969194686e-05, + "loss": 0.7492, + "step": 896 + }, + { + "epoch": 0.7176, + "grad_norm": 0.44723047852010356, + "learning_rate": 3.902108676060937e-05, + "loss": 0.6718, + "step": 897 + }, + { + "epoch": 0.7184, + "grad_norm": 0.4658776704784212, + "learning_rate": 3.8815853536798904e-05, + "loss": 0.7193, + "step": 898 + }, + { + "epoch": 0.7192, + "grad_norm": 0.5914359748097043, + "learning_rate": 3.861103139944449e-05, + "loss": 0.8102, + "step": 899 + }, + { + "epoch": 0.72, + "grad_norm": 0.489289904908363, + "learning_rate": 3.840662172471315e-05, + "loss": 0.7612, + "step": 900 + }, + { + "epoch": 0.7208, + "grad_norm": 0.576337648589474, + "learning_rate": 3.820262588600074e-05, + "loss": 0.8564, + "step": 901 + }, + { + "epoch": 0.7216, + "grad_norm": 0.5813448675355454, + "learning_rate": 3.79990452539225e-05, + "loss": 0.8075, + "step": 902 + }, + { + "epoch": 0.7224, + "grad_norm": 0.5253527903268184, + "learning_rate": 3.7795881196303995e-05, + "loss": 0.7886, + "step": 903 + }, + { + "epoch": 0.7232, + "grad_norm": 0.6103612355242102, + "learning_rate": 3.759313507817196e-05, + "loss": 0.9397, + "step": 904 + }, + { + "epoch": 0.724, + "grad_norm": 0.4577395100753244, + "learning_rate": 3.739080826174498e-05, + "loss": 0.7448, + "step": 905 + }, + { + "epoch": 0.7248, + "grad_norm": 0.4825704700026192, + "learning_rate": 3.7188902106424416e-05, + "loss": 0.8243, + "step": 906 + }, + { + "epoch": 0.7256, + "grad_norm": 0.5219110921451579, + "learning_rate": 3.6987417968785366e-05, + "loss": 0.7542, + "step": 907 + }, + { + "epoch": 0.7264, + "grad_norm": 0.7871936069186822, + "learning_rate": 3.678635720256737e-05, + "loss": 0.8077, + "step": 908 + }, + { + "epoch": 0.7272, + "grad_norm": 0.47315481292649386, + "learning_rate": 3.658572115866541e-05, + "loss": 0.746, + "step": 909 + }, + { + "epoch": 0.728, + "grad_norm": 0.4748880749384918, + "learning_rate": 3.638551118512089e-05, + "loss": 0.8369, + "step": 910 + }, + { + "epoch": 0.7288, + "grad_norm": 0.4080798085611525, + "learning_rate": 3.618572862711247e-05, + "loss": 0.6531, + "step": 911 + }, + { + "epoch": 0.7296, + "grad_norm": 0.5119253583646589, + "learning_rate": 3.5986374826947066e-05, + "loss": 0.7468, + "step": 912 + }, + { + "epoch": 0.7304, + "grad_norm": 0.6212700792170417, + "learning_rate": 3.578745112405083e-05, + "loss": 0.762, + "step": 913 + }, + { + "epoch": 0.7312, + "grad_norm": 0.5282916644005026, + "learning_rate": 3.558895885496023e-05, + "loss": 0.8204, + "step": 914 + }, + { + "epoch": 0.732, + "grad_norm": 0.5071041999449878, + "learning_rate": 3.539089935331294e-05, + "loss": 0.8328, + "step": 915 + }, + { + "epoch": 0.7328, + "grad_norm": 0.4543491551374726, + "learning_rate": 3.519327394983888e-05, + "loss": 0.676, + "step": 916 + }, + { + "epoch": 0.7336, + "grad_norm": 0.4303808812872645, + "learning_rate": 3.4996083972351515e-05, + "loss": 0.6649, + "step": 917 + }, + { + "epoch": 0.7344, + "grad_norm": 0.4145040254576927, + "learning_rate": 3.479933074573858e-05, + "loss": 0.6757, + "step": 918 + }, + { + "epoch": 0.7352, + "grad_norm": 0.46752384609791203, + "learning_rate": 3.4603015591953395e-05, + "loss": 0.8547, + "step": 919 + }, + { + "epoch": 0.736, + "grad_norm": 0.5248016141240613, + "learning_rate": 3.440713983000601e-05, + "loss": 0.8223, + "step": 920 + }, + { + "epoch": 0.7368, + "grad_norm": 0.40006990522502706, + "learning_rate": 3.421170477595419e-05, + "loss": 0.6361, + "step": 921 + }, + { + "epoch": 0.7376, + "grad_norm": 0.41418076269554743, + "learning_rate": 3.401671174289469e-05, + "loss": 0.7049, + "step": 922 + }, + { + "epoch": 0.7384, + "grad_norm": 0.4584166617163971, + "learning_rate": 3.3822162040954354e-05, + "loss": 0.7265, + "step": 923 + }, + { + "epoch": 0.7392, + "grad_norm": 0.5444286078527143, + "learning_rate": 3.362805697728145e-05, + "loss": 0.7774, + "step": 924 + }, + { + "epoch": 0.74, + "grad_norm": 0.45218769533175474, + "learning_rate": 3.34343978560367e-05, + "loss": 0.8481, + "step": 925 + }, + { + "epoch": 0.7408, + "grad_norm": 0.4864396689150151, + "learning_rate": 3.324118597838464e-05, + "loss": 0.8067, + "step": 926 + }, + { + "epoch": 0.7416, + "grad_norm": 0.44434463233048926, + "learning_rate": 3.3048422642484886e-05, + "loss": 0.6853, + "step": 927 + }, + { + "epoch": 0.7424, + "grad_norm": 0.41700659337429324, + "learning_rate": 3.285610914348332e-05, + "loss": 0.6749, + "step": 928 + }, + { + "epoch": 0.7432, + "grad_norm": 0.5993802512462391, + "learning_rate": 3.266424677350346e-05, + "loss": 0.7137, + "step": 929 + }, + { + "epoch": 0.744, + "grad_norm": 0.44931539515794827, + "learning_rate": 3.2472836821637744e-05, + "loss": 0.6867, + "step": 930 + }, + { + "epoch": 0.7448, + "grad_norm": 0.45909105181412047, + "learning_rate": 3.228188057393895e-05, + "loss": 0.759, + "step": 931 + }, + { + "epoch": 0.7456, + "grad_norm": 0.681985951504881, + "learning_rate": 3.209137931341143e-05, + "loss": 0.8368, + "step": 932 + }, + { + "epoch": 0.7464, + "grad_norm": 0.8366986354033052, + "learning_rate": 3.190133432000252e-05, + "loss": 0.8576, + "step": 933 + }, + { + "epoch": 0.7472, + "grad_norm": 0.4921233294818476, + "learning_rate": 3.1711746870594086e-05, + "loss": 0.7727, + "step": 934 + }, + { + "epoch": 0.748, + "grad_norm": 0.4771865298525171, + "learning_rate": 3.1522618238993725e-05, + "loss": 0.7724, + "step": 935 + }, + { + "epoch": 0.7488, + "grad_norm": 0.527380279188646, + "learning_rate": 3.1333949695926324e-05, + "loss": 0.7268, + "step": 936 + }, + { + "epoch": 0.7496, + "grad_norm": 0.3751049915807199, + "learning_rate": 3.114574250902558e-05, + "loss": 0.6258, + "step": 937 + }, + { + "epoch": 0.7504, + "grad_norm": 0.5285171071742202, + "learning_rate": 3.0957997942825336e-05, + "loss": 0.6757, + "step": 938 + }, + { + "epoch": 0.7512, + "grad_norm": 0.5029451702331986, + "learning_rate": 3.077071725875116e-05, + "loss": 0.7242, + "step": 939 + }, + { + "epoch": 0.752, + "grad_norm": 0.5881607167547737, + "learning_rate": 3.058390171511196e-05, + "loss": 0.7358, + "step": 940 + }, + { + "epoch": 0.7528, + "grad_norm": 0.44888944202160613, + "learning_rate": 3.0397552567091337e-05, + "loss": 0.7542, + "step": 941 + }, + { + "epoch": 0.7536, + "grad_norm": 0.5517416672102848, + "learning_rate": 3.021167106673928e-05, + "loss": 0.7485, + "step": 942 + }, + { + "epoch": 0.7544, + "grad_norm": 0.5063472555628153, + "learning_rate": 3.0026258462963787e-05, + "loss": 0.6773, + "step": 943 + }, + { + "epoch": 0.7552, + "grad_norm": 0.4078174656869059, + "learning_rate": 2.9841316001522347e-05, + "loss": 0.793, + "step": 944 + }, + { + "epoch": 0.756, + "grad_norm": 0.6174211087839312, + "learning_rate": 2.9656844925013637e-05, + "loss": 0.8971, + "step": 945 + }, + { + "epoch": 0.7568, + "grad_norm": 0.4797403859123356, + "learning_rate": 2.9472846472869298e-05, + "loss": 0.7631, + "step": 946 + }, + { + "epoch": 0.7576, + "grad_norm": 0.41077179789727625, + "learning_rate": 2.9289321881345254e-05, + "loss": 0.7006, + "step": 947 + }, + { + "epoch": 0.7584, + "grad_norm": 0.4511949612040704, + "learning_rate": 2.9106272383513835e-05, + "loss": 0.711, + "step": 948 + }, + { + "epoch": 0.7592, + "grad_norm": 0.42043055532718043, + "learning_rate": 2.8923699209255284e-05, + "loss": 0.6621, + "step": 949 + }, + { + "epoch": 0.76, + "grad_norm": 0.5378329077146456, + "learning_rate": 2.874160358524931e-05, + "loss": 0.9106, + "step": 950 + }, + { + "epoch": 0.7608, + "grad_norm": 0.44452033249967243, + "learning_rate": 2.8559986734967282e-05, + "loss": 0.6982, + "step": 951 + }, + { + "epoch": 0.7616, + "grad_norm": 0.5084625352483807, + "learning_rate": 2.8378849878663628e-05, + "loss": 0.7144, + "step": 952 + }, + { + "epoch": 0.7624, + "grad_norm": 0.46768349941192616, + "learning_rate": 2.819819423336775e-05, + "loss": 0.7016, + "step": 953 + }, + { + "epoch": 0.7632, + "grad_norm": 0.47694367327979464, + "learning_rate": 2.8018021012875994e-05, + "loss": 0.7479, + "step": 954 + }, + { + "epoch": 0.764, + "grad_norm": 0.4765841625519169, + "learning_rate": 2.7838331427743282e-05, + "loss": 0.7587, + "step": 955 + }, + { + "epoch": 0.7648, + "grad_norm": 0.5965387934471909, + "learning_rate": 2.7659126685275027e-05, + "loss": 0.7642, + "step": 956 + }, + { + "epoch": 0.7656, + "grad_norm": 0.456663109129941, + "learning_rate": 2.7480407989519198e-05, + "loss": 0.6274, + "step": 957 + }, + { + "epoch": 0.7664, + "grad_norm": 0.5917085485189117, + "learning_rate": 2.7302176541257986e-05, + "loss": 0.7109, + "step": 958 + }, + { + "epoch": 0.7672, + "grad_norm": 0.4264667225940256, + "learning_rate": 2.712443353799984e-05, + "loss": 0.682, + "step": 959 + }, + { + "epoch": 0.768, + "grad_norm": 0.49031682812728306, + "learning_rate": 2.6947180173971508e-05, + "loss": 0.8333, + "step": 960 + }, + { + "epoch": 0.7688, + "grad_norm": 0.5445763973198763, + "learning_rate": 2.677041764010988e-05, + "loss": 0.588, + "step": 961 + }, + { + "epoch": 0.7696, + "grad_norm": 0.5167494042052067, + "learning_rate": 2.659414712405398e-05, + "loss": 0.7588, + "step": 962 + }, + { + "epoch": 0.7704, + "grad_norm": 0.4109214999293328, + "learning_rate": 2.6418369810137188e-05, + "loss": 0.6259, + "step": 963 + }, + { + "epoch": 0.7712, + "grad_norm": 0.46313638523489997, + "learning_rate": 2.6243086879379e-05, + "loss": 0.7309, + "step": 964 + }, + { + "epoch": 0.772, + "grad_norm": 0.5405431360774156, + "learning_rate": 2.6068299509477266e-05, + "loss": 0.8372, + "step": 965 + }, + { + "epoch": 0.7728, + "grad_norm": 0.46212905775295887, + "learning_rate": 2.5894008874800325e-05, + "loss": 0.6678, + "step": 966 + }, + { + "epoch": 0.7736, + "grad_norm": 0.481123042266737, + "learning_rate": 2.5720216146378917e-05, + "loss": 0.6835, + "step": 967 + }, + { + "epoch": 0.7744, + "grad_norm": 0.4226882956652361, + "learning_rate": 2.5546922491898495e-05, + "loss": 0.6223, + "step": 968 + }, + { + "epoch": 0.7752, + "grad_norm": 0.49527409119365456, + "learning_rate": 2.5374129075691265e-05, + "loss": 0.7526, + "step": 969 + }, + { + "epoch": 0.776, + "grad_norm": 0.5038293645904581, + "learning_rate": 2.5201837058728505e-05, + "loss": 0.8102, + "step": 970 + }, + { + "epoch": 0.7768, + "grad_norm": 0.46762425327328927, + "learning_rate": 2.503004759861258e-05, + "loss": 0.6715, + "step": 971 + }, + { + "epoch": 0.7776, + "grad_norm": 0.4714172170588027, + "learning_rate": 2.485876184956928e-05, + "loss": 0.7877, + "step": 972 + }, + { + "epoch": 0.7784, + "grad_norm": 0.41543249343540894, + "learning_rate": 2.4687980962440072e-05, + "loss": 0.682, + "step": 973 + }, + { + "epoch": 0.7792, + "grad_norm": 0.41678514930683885, + "learning_rate": 2.451770608467432e-05, + "loss": 0.6574, + "step": 974 + }, + { + "epoch": 0.78, + "grad_norm": 0.5473185055431781, + "learning_rate": 2.4347938360321566e-05, + "loss": 0.841, + "step": 975 + }, + { + "epoch": 0.7808, + "grad_norm": 0.5081509533859854, + "learning_rate": 2.417867893002387e-05, + "loss": 0.7358, + "step": 976 + }, + { + "epoch": 0.7816, + "grad_norm": 0.4717804951268603, + "learning_rate": 2.400992893100822e-05, + "loss": 0.8328, + "step": 977 + }, + { + "epoch": 0.7824, + "grad_norm": 0.5279633817232753, + "learning_rate": 2.3841689497078746e-05, + "loss": 0.7876, + "step": 978 + }, + { + "epoch": 0.7832, + "grad_norm": 0.5469059591551831, + "learning_rate": 2.3673961758609152e-05, + "loss": 0.8143, + "step": 979 + }, + { + "epoch": 0.784, + "grad_norm": 0.48881655675917796, + "learning_rate": 2.3506746842535242e-05, + "loss": 0.7419, + "step": 980 + }, + { + "epoch": 0.7848, + "grad_norm": 0.8183661784871328, + "learning_rate": 2.334004587234717e-05, + "loss": 0.9628, + "step": 981 + }, + { + "epoch": 0.7856, + "grad_norm": 0.4443008674328188, + "learning_rate": 2.3173859968081944e-05, + "loss": 0.6465, + "step": 982 + }, + { + "epoch": 0.7864, + "grad_norm": 0.491375950496024, + "learning_rate": 2.300819024631603e-05, + "loss": 0.7629, + "step": 983 + }, + { + "epoch": 0.7872, + "grad_norm": 0.38746514211583577, + "learning_rate": 2.2843037820157675e-05, + "loss": 0.6489, + "step": 984 + }, + { + "epoch": 0.788, + "grad_norm": 0.5123092130360583, + "learning_rate": 2.26784037992395e-05, + "loss": 0.8181, + "step": 985 + }, + { + "epoch": 0.7888, + "grad_norm": 0.5640094489556691, + "learning_rate": 2.251428928971102e-05, + "loss": 0.7797, + "step": 986 + }, + { + "epoch": 0.7896, + "grad_norm": 0.4642024801709126, + "learning_rate": 2.2350695394231345e-05, + "loss": 0.8212, + "step": 987 + }, + { + "epoch": 0.7904, + "grad_norm": 0.6556017469175548, + "learning_rate": 2.2187623211961562e-05, + "loss": 0.7891, + "step": 988 + }, + { + "epoch": 0.7912, + "grad_norm": 0.4015401241516933, + "learning_rate": 2.2025073838557454e-05, + "loss": 0.6336, + "step": 989 + }, + { + "epoch": 0.792, + "grad_norm": 0.4381193049202663, + "learning_rate": 2.1863048366162208e-05, + "loss": 0.7072, + "step": 990 + }, + { + "epoch": 0.7928, + "grad_norm": 0.4536050378109064, + "learning_rate": 2.1701547883398922e-05, + "loss": 0.6754, + "step": 991 + }, + { + "epoch": 0.7936, + "grad_norm": 0.368242729052764, + "learning_rate": 2.1540573475363402e-05, + "loss": 0.5968, + "step": 992 + }, + { + "epoch": 0.7944, + "grad_norm": 0.5120152908755962, + "learning_rate": 2.138012622361689e-05, + "loss": 0.7909, + "step": 993 + }, + { + "epoch": 0.7952, + "grad_norm": 0.6537452367657269, + "learning_rate": 2.1220207206178688e-05, + "loss": 0.7993, + "step": 994 + }, + { + "epoch": 0.796, + "grad_norm": 0.44996218961547835, + "learning_rate": 2.106081749751897e-05, + "loss": 0.7202, + "step": 995 + }, + { + "epoch": 0.7968, + "grad_norm": 0.45925587252682554, + "learning_rate": 2.0901958168551638e-05, + "loss": 0.7205, + "step": 996 + }, + { + "epoch": 0.7976, + "grad_norm": 0.4784011669048103, + "learning_rate": 2.0743630286627002e-05, + "loss": 0.7557, + "step": 997 + }, + { + "epoch": 0.7984, + "grad_norm": 0.38377975270286485, + "learning_rate": 2.058583491552465e-05, + "loss": 0.6405, + "step": 998 + }, + { + "epoch": 0.7992, + "grad_norm": 0.3738615814125053, + "learning_rate": 2.0428573115446392e-05, + "loss": 0.6211, + "step": 999 + }, + { + "epoch": 0.8, + "grad_norm": 0.414407263661106, + "learning_rate": 2.027184594300898e-05, + "loss": 0.6923, + "step": 1000 + }, + { + "epoch": 0.8008, + "grad_norm": 0.546862686175386, + "learning_rate": 2.011565445123711e-05, + "loss": 0.8074, + "step": 1001 + }, + { + "epoch": 0.8016, + "grad_norm": 0.4721171141095232, + "learning_rate": 1.995999968955641e-05, + "loss": 0.722, + "step": 1002 + }, + { + "epoch": 0.8024, + "grad_norm": 0.5250980188045624, + "learning_rate": 1.980488270378612e-05, + "loss": 0.7206, + "step": 1003 + }, + { + "epoch": 0.8032, + "grad_norm": 0.39319681519493027, + "learning_rate": 1.9650304536132426e-05, + "loss": 0.6868, + "step": 1004 + }, + { + "epoch": 0.804, + "grad_norm": 0.5298182028077267, + "learning_rate": 1.9496266225181248e-05, + "loss": 0.6903, + "step": 1005 + }, + { + "epoch": 0.8048, + "grad_norm": 0.48201347038805087, + "learning_rate": 1.9342768805891178e-05, + "loss": 0.7593, + "step": 1006 + }, + { + "epoch": 0.8056, + "grad_norm": 0.5282216413510522, + "learning_rate": 1.918981330958678e-05, + "loss": 0.7375, + "step": 1007 + }, + { + "epoch": 0.8064, + "grad_norm": 0.38140745448157326, + "learning_rate": 1.903740076395151e-05, + "loss": 0.6859, + "step": 1008 + }, + { + "epoch": 0.8072, + "grad_norm": 0.720905548280619, + "learning_rate": 1.8885532193020704e-05, + "loss": 0.7977, + "step": 1009 + }, + { + "epoch": 0.808, + "grad_norm": 0.5943362314470606, + "learning_rate": 1.8734208617174988e-05, + "loss": 0.7603, + "step": 1010 + }, + { + "epoch": 0.8088, + "grad_norm": 0.44026458461341533, + "learning_rate": 1.8583431053133127e-05, + "loss": 0.6954, + "step": 1011 + }, + { + "epoch": 0.8096, + "grad_norm": 0.39168038653597353, + "learning_rate": 1.8433200513945337e-05, + "loss": 0.7052, + "step": 1012 + }, + { + "epoch": 0.8104, + "grad_norm": 0.5051585049106107, + "learning_rate": 1.8283518008986567e-05, + "loss": 0.803, + "step": 1013 + }, + { + "epoch": 0.8112, + "grad_norm": 0.5093371160760573, + "learning_rate": 1.8134384543949478e-05, + "loss": 0.6966, + "step": 1014 + }, + { + "epoch": 0.812, + "grad_norm": 0.44492165541172995, + "learning_rate": 1.7985801120837865e-05, + "loss": 0.6336, + "step": 1015 + }, + { + "epoch": 0.8128, + "grad_norm": 0.4586654762573073, + "learning_rate": 1.783776873795994e-05, + "loss": 0.7204, + "step": 1016 + }, + { + "epoch": 0.8136, + "grad_norm": 0.475042880175623, + "learning_rate": 1.7690288389921493e-05, + "loss": 0.6966, + "step": 1017 + }, + { + "epoch": 0.8144, + "grad_norm": 0.48692832005893966, + "learning_rate": 1.754336106761927e-05, + "loss": 0.7027, + "step": 1018 + }, + { + "epoch": 0.8152, + "grad_norm": 0.6533590380966955, + "learning_rate": 1.739698775823442e-05, + "loss": 0.7991, + "step": 1019 + }, + { + "epoch": 0.816, + "grad_norm": 0.5088543228951543, + "learning_rate": 1.7251169445225657e-05, + "loss": 0.6792, + "step": 1020 + }, + { + "epoch": 0.8168, + "grad_norm": 0.5365077711844575, + "learning_rate": 1.7105907108322816e-05, + "loss": 0.8042, + "step": 1021 + }, + { + "epoch": 0.8176, + "grad_norm": 0.607910990415707, + "learning_rate": 1.696120172352025e-05, + "loss": 0.709, + "step": 1022 + }, + { + "epoch": 0.8184, + "grad_norm": 0.543830616087409, + "learning_rate": 1.6817054263070174e-05, + "loss": 0.8347, + "step": 1023 + }, + { + "epoch": 0.8192, + "grad_norm": 0.4479815487693433, + "learning_rate": 1.6673465695476232e-05, + "loss": 0.6691, + "step": 1024 + }, + { + "epoch": 0.82, + "grad_norm": 0.5575118745140125, + "learning_rate": 1.6530436985486996e-05, + "loss": 0.7793, + "step": 1025 + }, + { + "epoch": 0.8208, + "grad_norm": 0.4623527812061081, + "learning_rate": 1.6387969094089316e-05, + "loss": 0.691, + "step": 1026 + }, + { + "epoch": 0.8216, + "grad_norm": 0.47170960129034645, + "learning_rate": 1.6246062978502164e-05, + "loss": 0.7664, + "step": 1027 + }, + { + "epoch": 0.8224, + "grad_norm": 0.40589211723619584, + "learning_rate": 1.6104719592169902e-05, + "loss": 0.695, + "step": 1028 + }, + { + "epoch": 0.8232, + "grad_norm": 0.5595609747168507, + "learning_rate": 1.5963939884756042e-05, + "loss": 0.7339, + "step": 1029 + }, + { + "epoch": 0.824, + "grad_norm": 0.331177129924808, + "learning_rate": 1.5823724802136865e-05, + "loss": 0.6259, + "step": 1030 + }, + { + "epoch": 0.8248, + "grad_norm": 0.42842244367933835, + "learning_rate": 1.5684075286394985e-05, + "loss": 0.6795, + "step": 1031 + }, + { + "epoch": 0.8256, + "grad_norm": 0.5303146911759715, + "learning_rate": 1.5544992275813053e-05, + "loss": 0.7196, + "step": 1032 + }, + { + "epoch": 0.8264, + "grad_norm": 0.6413552273769738, + "learning_rate": 1.5406476704867524e-05, + "loss": 0.8235, + "step": 1033 + }, + { + "epoch": 0.8272, + "grad_norm": 0.4896679986097937, + "learning_rate": 1.526852950422226e-05, + "loss": 0.6268, + "step": 1034 + }, + { + "epoch": 0.828, + "grad_norm": 0.49353178307848933, + "learning_rate": 1.5131151600722337e-05, + "loss": 0.7713, + "step": 1035 + }, + { + "epoch": 0.8288, + "grad_norm": 0.42714742055982385, + "learning_rate": 1.4994343917387854e-05, + "loss": 0.7156, + "step": 1036 + }, + { + "epoch": 0.8296, + "grad_norm": 0.47129628936070617, + "learning_rate": 1.485810737340767e-05, + "loss": 0.7049, + "step": 1037 + }, + { + "epoch": 0.8304, + "grad_norm": 0.5655082901936167, + "learning_rate": 1.4722442884133214e-05, + "loss": 0.7203, + "step": 1038 + }, + { + "epoch": 0.8312, + "grad_norm": 0.4797701282259538, + "learning_rate": 1.4587351361072454e-05, + "loss": 0.772, + "step": 1039 + }, + { + "epoch": 0.832, + "grad_norm": 0.483579335699633, + "learning_rate": 1.4452833711883628e-05, + "loss": 0.7338, + "step": 1040 + }, + { + "epoch": 0.8328, + "grad_norm": 0.4856004197153419, + "learning_rate": 1.4318890840369182e-05, + "loss": 0.7339, + "step": 1041 + }, + { + "epoch": 0.8336, + "grad_norm": 0.4844727919392016, + "learning_rate": 1.4185523646469822e-05, + "loss": 0.6873, + "step": 1042 + }, + { + "epoch": 0.8344, + "grad_norm": 0.4354352674543203, + "learning_rate": 1.4052733026258281e-05, + "loss": 0.6916, + "step": 1043 + }, + { + "epoch": 0.8352, + "grad_norm": 0.4382216003346178, + "learning_rate": 1.3920519871933424e-05, + "loss": 0.6948, + "step": 1044 + }, + { + "epoch": 0.836, + "grad_norm": 0.6770346482301582, + "learning_rate": 1.3788885071814172e-05, + "loss": 0.7415, + "step": 1045 + }, + { + "epoch": 0.8368, + "grad_norm": 0.5115812748312439, + "learning_rate": 1.3657829510333654e-05, + "loss": 0.7453, + "step": 1046 + }, + { + "epoch": 0.8376, + "grad_norm": 0.40621159507540683, + "learning_rate": 1.3527354068033139e-05, + "loss": 0.6829, + "step": 1047 + }, + { + "epoch": 0.8384, + "grad_norm": 0.41199024634843723, + "learning_rate": 1.339745962155613e-05, + "loss": 0.6893, + "step": 1048 + }, + { + "epoch": 0.8392, + "grad_norm": 0.49634731217081535, + "learning_rate": 1.326814704364262e-05, + "loss": 0.67, + "step": 1049 + }, + { + "epoch": 0.84, + "grad_norm": 0.4959416286449828, + "learning_rate": 1.3139417203123027e-05, + "loss": 0.732, + "step": 1050 + }, + { + "epoch": 0.8408, + "grad_norm": 0.5879885666504012, + "learning_rate": 1.3011270964912459e-05, + "loss": 0.7008, + "step": 1051 + }, + { + "epoch": 0.8416, + "grad_norm": 0.5017874717397822, + "learning_rate": 1.2883709190004955e-05, + "loss": 0.7023, + "step": 1052 + }, + { + "epoch": 0.8424, + "grad_norm": 0.4889156027327041, + "learning_rate": 1.275673273546758e-05, + "loss": 0.6859, + "step": 1053 + }, + { + "epoch": 0.8432, + "grad_norm": 0.41541902496673644, + "learning_rate": 1.263034245443473e-05, + "loss": 0.7111, + "step": 1054 + }, + { + "epoch": 0.844, + "grad_norm": 0.47242855088569513, + "learning_rate": 1.2504539196102439e-05, + "loss": 0.7048, + "step": 1055 + }, + { + "epoch": 0.8448, + "grad_norm": 0.5848713015965614, + "learning_rate": 1.2379323805722576e-05, + "loss": 0.8022, + "step": 1056 + }, + { + "epoch": 0.8456, + "grad_norm": 0.44615163121800333, + "learning_rate": 1.2254697124597237e-05, + "loss": 0.7684, + "step": 1057 + }, + { + "epoch": 0.8464, + "grad_norm": 0.54850319105347, + "learning_rate": 1.2130659990073146e-05, + "loss": 0.7982, + "step": 1058 + }, + { + "epoch": 0.8472, + "grad_norm": 0.5625949181314011, + "learning_rate": 1.2007213235535786e-05, + "loss": 0.7601, + "step": 1059 + }, + { + "epoch": 0.848, + "grad_norm": 0.5050525040384994, + "learning_rate": 1.1884357690404158e-05, + "loss": 0.7682, + "step": 1060 + }, + { + "epoch": 0.8488, + "grad_norm": 0.7905231526074173, + "learning_rate": 1.176209418012495e-05, + "loss": 0.7779, + "step": 1061 + }, + { + "epoch": 0.8496, + "grad_norm": 0.4542040021729755, + "learning_rate": 1.1640423526166988e-05, + "loss": 0.6861, + "step": 1062 + }, + { + "epoch": 0.8504, + "grad_norm": 0.4644381199480783, + "learning_rate": 1.1519346546015907e-05, + "loss": 0.7356, + "step": 1063 + }, + { + "epoch": 0.8512, + "grad_norm": 0.5818575190161742, + "learning_rate": 1.1398864053168534e-05, + "loss": 0.7372, + "step": 1064 + }, + { + "epoch": 0.852, + "grad_norm": 0.5567751820347714, + "learning_rate": 1.1278976857127311e-05, + "loss": 0.7911, + "step": 1065 + }, + { + "epoch": 0.8528, + "grad_norm": 0.60641735952812, + "learning_rate": 1.1159685763395111e-05, + "loss": 0.8726, + "step": 1066 + }, + { + "epoch": 0.8536, + "grad_norm": 0.5685906303235803, + "learning_rate": 1.1040991573469629e-05, + "loss": 0.7652, + "step": 1067 + }, + { + "epoch": 0.8544, + "grad_norm": 0.4818487663254473, + "learning_rate": 1.0922895084838037e-05, + "loss": 0.719, + "step": 1068 + }, + { + "epoch": 0.8552, + "grad_norm": 0.5263438363503978, + "learning_rate": 1.0805397090971737e-05, + "loss": 0.7884, + "step": 1069 + }, + { + "epoch": 0.856, + "grad_norm": 0.5688085645171608, + "learning_rate": 1.0688498381320855e-05, + "loss": 0.7585, + "step": 1070 + }, + { + "epoch": 0.8568, + "grad_norm": 0.6889185312449415, + "learning_rate": 1.057219974130903e-05, + "loss": 0.8714, + "step": 1071 + }, + { + "epoch": 0.8576, + "grad_norm": 0.4431544075081975, + "learning_rate": 1.045650195232819e-05, + "loss": 0.7293, + "step": 1072 + }, + { + "epoch": 0.8584, + "grad_norm": 0.4892432936929515, + "learning_rate": 1.0341405791733183e-05, + "loss": 0.7583, + "step": 1073 + }, + { + "epoch": 0.8592, + "grad_norm": 0.5363432561645519, + "learning_rate": 1.0226912032836611e-05, + "loss": 0.7062, + "step": 1074 + }, + { + "epoch": 0.86, + "grad_norm": 0.48599545437169195, + "learning_rate": 1.0113021444903726e-05, + "loss": 0.7263, + "step": 1075 + }, + { + "epoch": 0.8608, + "grad_norm": 0.4877492160507757, + "learning_rate": 9.999734793146998e-06, + "loss": 0.8108, + "step": 1076 + }, + { + "epoch": 0.8616, + "grad_norm": 0.38602828530404476, + "learning_rate": 9.887052838721322e-06, + "loss": 0.6334, + "step": 1077 + }, + { + "epoch": 0.8624, + "grad_norm": 0.44578160607841233, + "learning_rate": 9.774976338718677e-06, + "loss": 0.6839, + "step": 1078 + }, + { + "epoch": 0.8632, + "grad_norm": 0.4313393619764014, + "learning_rate": 9.663506046162985e-06, + "loss": 0.6248, + "step": 1079 + }, + { + "epoch": 0.864, + "grad_norm": 0.5766723504134088, + "learning_rate": 9.552642710005299e-06, + "loss": 0.848, + "step": 1080 + }, + { + "epoch": 0.8648, + "grad_norm": 0.41382160420692804, + "learning_rate": 9.44238707511862e-06, + "loss": 0.6548, + "step": 1081 + }, + { + "epoch": 0.8656, + "grad_norm": 0.5187264244623699, + "learning_rate": 9.332739882292752e-06, + "loss": 0.7366, + "step": 1082 + }, + { + "epoch": 0.8664, + "grad_norm": 0.63150061116092, + "learning_rate": 9.22370186822965e-06, + "loss": 0.7477, + "step": 1083 + }, + { + "epoch": 0.8672, + "grad_norm": 0.43520756347287115, + "learning_rate": 9.115273765538202e-06, + "loss": 0.6083, + "step": 1084 + }, + { + "epoch": 0.868, + "grad_norm": 0.6954729281581934, + "learning_rate": 9.0074563027294e-06, + "loss": 0.87, + "step": 1085 + }, + { + "epoch": 0.8688, + "grad_norm": 0.6523736879654485, + "learning_rate": 8.900250204211514e-06, + "loss": 0.8212, + "step": 1086 + }, + { + "epoch": 0.8696, + "grad_norm": 0.4351809267256565, + "learning_rate": 8.79365619028507e-06, + "loss": 0.7426, + "step": 1087 + }, + { + "epoch": 0.8704, + "grad_norm": 0.49734748006175955, + "learning_rate": 8.687674977138116e-06, + "loss": 0.7176, + "step": 1088 + }, + { + "epoch": 0.8712, + "grad_norm": 0.42863184313110836, + "learning_rate": 8.582307276841462e-06, + "loss": 0.6746, + "step": 1089 + }, + { + "epoch": 0.872, + "grad_norm": 0.39624365515091226, + "learning_rate": 8.47755379734373e-06, + "loss": 0.6346, + "step": 1090 + }, + { + "epoch": 0.8728, + "grad_norm": 0.6070474678947206, + "learning_rate": 8.37341524246672e-06, + "loss": 0.8173, + "step": 1091 + }, + { + "epoch": 0.8736, + "grad_norm": 0.6152814779816568, + "learning_rate": 8.269892311900696e-06, + "loss": 0.7342, + "step": 1092 + }, + { + "epoch": 0.8744, + "grad_norm": 0.4735777668957982, + "learning_rate": 8.166985701199582e-06, + "loss": 0.7085, + "step": 1093 + }, + { + "epoch": 0.8752, + "grad_norm": 0.44779780181339546, + "learning_rate": 8.064696101776358e-06, + "loss": 0.7101, + "step": 1094 + }, + { + "epoch": 0.876, + "grad_norm": 0.3927049863563379, + "learning_rate": 7.963024200898462e-06, + "loss": 0.7166, + "step": 1095 + }, + { + "epoch": 0.8768, + "grad_norm": 0.44215943404038016, + "learning_rate": 7.861970681683051e-06, + "loss": 0.7443, + "step": 1096 + }, + { + "epoch": 0.8776, + "grad_norm": 0.5161580910333338, + "learning_rate": 7.761536223092458e-06, + "loss": 0.7128, + "step": 1097 + }, + { + "epoch": 0.8784, + "grad_norm": 0.4904210254954027, + "learning_rate": 7.661721499929753e-06, + "loss": 0.746, + "step": 1098 + }, + { + "epoch": 0.8792, + "grad_norm": 0.6310426985660955, + "learning_rate": 7.562527182833978e-06, + "loss": 0.73, + "step": 1099 + }, + { + "epoch": 0.88, + "grad_norm": 0.484804684526844, + "learning_rate": 7.463953938275858e-06, + "loss": 0.7435, + "step": 1100 + }, + { + "epoch": 0.8808, + "grad_norm": 0.4031087847091995, + "learning_rate": 7.366002428553153e-06, + "loss": 0.6395, + "step": 1101 + }, + { + "epoch": 0.8816, + "grad_norm": 0.48903277198413786, + "learning_rate": 7.2686733117863784e-06, + "loss": 0.639, + "step": 1102 + }, + { + "epoch": 0.8824, + "grad_norm": 0.7331550646221433, + "learning_rate": 7.171967241914224e-06, + "loss": 0.814, + "step": 1103 + }, + { + "epoch": 0.8832, + "grad_norm": 0.4628809485755903, + "learning_rate": 7.07588486868922e-06, + "loss": 0.6781, + "step": 1104 + }, + { + "epoch": 0.884, + "grad_norm": 0.3896616608964118, + "learning_rate": 6.980426837673437e-06, + "loss": 0.6409, + "step": 1105 + }, + { + "epoch": 0.8848, + "grad_norm": 0.4596513087094253, + "learning_rate": 6.8855937902340576e-06, + "loss": 0.6971, + "step": 1106 + }, + { + "epoch": 0.8856, + "grad_norm": 0.48381308759059494, + "learning_rate": 6.791386363539065e-06, + "loss": 0.6992, + "step": 1107 + }, + { + "epoch": 0.8864, + "grad_norm": 0.4567329216654885, + "learning_rate": 6.6978051905530855e-06, + "loss": 0.7324, + "step": 1108 + }, + { + "epoch": 0.8872, + "grad_norm": 0.5193372823300771, + "learning_rate": 6.604850900032955e-06, + "loss": 0.6888, + "step": 1109 + }, + { + "epoch": 0.888, + "grad_norm": 0.6117060657697789, + "learning_rate": 6.512524116523633e-06, + "loss": 0.8634, + "step": 1110 + }, + { + "epoch": 0.8888, + "grad_norm": 0.44100918029650976, + "learning_rate": 6.420825460353974e-06, + "loss": 0.6763, + "step": 1111 + }, + { + "epoch": 0.8896, + "grad_norm": 0.37502419948636284, + "learning_rate": 6.329755547632499e-06, + "loss": 0.6432, + "step": 1112 + }, + { + "epoch": 0.8904, + "grad_norm": 0.5334434133365746, + "learning_rate": 6.239314990243339e-06, + "loss": 0.6804, + "step": 1113 + }, + { + "epoch": 0.8912, + "grad_norm": 0.47365930471855267, + "learning_rate": 6.149504395842087e-06, + "loss": 0.7115, + "step": 1114 + }, + { + "epoch": 0.892, + "grad_norm": 0.6031283485154669, + "learning_rate": 6.0603243678516995e-06, + "loss": 0.6359, + "step": 1115 + }, + { + "epoch": 0.8928, + "grad_norm": 0.4574835901358478, + "learning_rate": 5.971775505458444e-06, + "loss": 0.7119, + "step": 1116 + }, + { + "epoch": 0.8936, + "grad_norm": 0.5026677386432215, + "learning_rate": 5.883858403607967e-06, + "loss": 0.6995, + "step": 1117 + }, + { + "epoch": 0.8944, + "grad_norm": 0.543562020854025, + "learning_rate": 5.7965736530010916e-06, + "loss": 0.7052, + "step": 1118 + }, + { + "epoch": 0.8952, + "grad_norm": 0.5899728969075093, + "learning_rate": 5.7099218400900716e-06, + "loss": 0.731, + "step": 1119 + }, + { + "epoch": 0.896, + "grad_norm": 0.37569145101022755, + "learning_rate": 5.623903547074549e-06, + "loss": 0.6681, + "step": 1120 + }, + { + "epoch": 0.8968, + "grad_norm": 0.5412787308770931, + "learning_rate": 5.538519351897575e-06, + "loss": 0.7451, + "step": 1121 + }, + { + "epoch": 0.8976, + "grad_norm": 0.4345081969623715, + "learning_rate": 5.453769828241872e-06, + "loss": 0.722, + "step": 1122 + }, + { + "epoch": 0.8984, + "grad_norm": 0.3891793799421602, + "learning_rate": 5.369655545525909e-06, + "loss": 0.641, + "step": 1123 + }, + { + "epoch": 0.8992, + "grad_norm": 0.6988491234269987, + "learning_rate": 5.286177068899989e-06, + "loss": 0.8525, + "step": 1124 + }, + { + "epoch": 0.9, + "grad_norm": 0.4721345087077115, + "learning_rate": 5.2033349592426335e-06, + "loss": 0.7629, + "step": 1125 + }, + { + "epoch": 0.9008, + "grad_norm": 0.5047474078895655, + "learning_rate": 5.121129773156663e-06, + "loss": 0.7768, + "step": 1126 + }, + { + "epoch": 0.9016, + "grad_norm": 0.6228980544298343, + "learning_rate": 5.039562062965508e-06, + "loss": 0.8038, + "step": 1127 + }, + { + "epoch": 0.9024, + "grad_norm": 0.5415475953360327, + "learning_rate": 4.95863237670956e-06, + "loss": 0.7996, + "step": 1128 + }, + { + "epoch": 0.9032, + "grad_norm": 0.47207787846432325, + "learning_rate": 4.87834125814235e-06, + "loss": 0.7772, + "step": 1129 + }, + { + "epoch": 0.904, + "grad_norm": 0.5748806476455358, + "learning_rate": 4.798689246727006e-06, + "loss": 0.811, + "step": 1130 + }, + { + "epoch": 0.9048, + "grad_norm": 0.4220631995351488, + "learning_rate": 4.719676877632639e-06, + "loss": 0.6474, + "step": 1131 + }, + { + "epoch": 0.9056, + "grad_norm": 0.446191054611265, + "learning_rate": 4.641304681730641e-06, + "loss": 0.6904, + "step": 1132 + }, + { + "epoch": 0.9064, + "grad_norm": 0.4354242087444171, + "learning_rate": 4.563573185591219e-06, + "loss": 0.6407, + "step": 1133 + }, + { + "epoch": 0.9072, + "grad_norm": 0.5315934381080039, + "learning_rate": 4.486482911479839e-06, + "loss": 0.7782, + "step": 1134 + }, + { + "epoch": 0.908, + "grad_norm": 0.5308006888632717, + "learning_rate": 4.4100343773536225e-06, + "loss": 0.7973, + "step": 1135 + }, + { + "epoch": 0.9088, + "grad_norm": 0.4714340989308455, + "learning_rate": 4.3342280968580285e-06, + "loss": 0.6507, + "step": 1136 + }, + { + "epoch": 0.9096, + "grad_norm": 0.4665239385162486, + "learning_rate": 4.259064579323302e-06, + "loss": 0.672, + "step": 1137 + }, + { + "epoch": 0.9104, + "grad_norm": 0.4634796827042525, + "learning_rate": 4.184544329761009e-06, + "loss": 0.6559, + "step": 1138 + }, + { + "epoch": 0.9112, + "grad_norm": 0.4419499418289889, + "learning_rate": 4.1106678488607495e-06, + "loss": 0.6888, + "step": 1139 + }, + { + "epoch": 0.912, + "grad_norm": 0.49012435287452877, + "learning_rate": 4.037435632986786e-06, + "loss": 0.8121, + "step": 1140 + }, + { + "epoch": 0.9128, + "grad_norm": 0.41619064786271287, + "learning_rate": 3.964848174174541e-06, + "loss": 0.7139, + "step": 1141 + }, + { + "epoch": 0.9136, + "grad_norm": 0.4668223284459053, + "learning_rate": 3.892905960127546e-06, + "loss": 0.7529, + "step": 1142 + }, + { + "epoch": 0.9144, + "grad_norm": 0.5075018339710508, + "learning_rate": 3.821609474213983e-06, + "loss": 0.7548, + "step": 1143 + }, + { + "epoch": 0.9152, + "grad_norm": 0.5599300622049045, + "learning_rate": 3.750959195463466e-06, + "loss": 0.7873, + "step": 1144 + }, + { + "epoch": 0.916, + "grad_norm": 0.47822143605307643, + "learning_rate": 3.6809555985639068e-06, + "loss": 0.7526, + "step": 1145 + }, + { + "epoch": 0.9168, + "grad_norm": 0.383553168711321, + "learning_rate": 3.611599153858214e-06, + "loss": 0.6544, + "step": 1146 + }, + { + "epoch": 0.9176, + "grad_norm": 0.5037697991826849, + "learning_rate": 3.5428903273411863e-06, + "loss": 0.7349, + "step": 1147 + }, + { + "epoch": 0.9184, + "grad_norm": 0.5003655799335224, + "learning_rate": 3.4748295806564356e-06, + "loss": 0.7502, + "step": 1148 + }, + { + "epoch": 0.9192, + "grad_norm": 0.5214482615730851, + "learning_rate": 3.40741737109318e-06, + "loss": 0.7391, + "step": 1149 + }, + { + "epoch": 0.92, + "grad_norm": 0.5721289899083863, + "learning_rate": 3.3406541515832003e-06, + "loss": 0.7481, + "step": 1150 + }, + { + "epoch": 0.9208, + "grad_norm": 0.5026848248839152, + "learning_rate": 3.2745403706978872e-06, + "loss": 0.7492, + "step": 1151 + }, + { + "epoch": 0.9216, + "grad_norm": 0.5282502110567238, + "learning_rate": 3.209076472645112e-06, + "loss": 0.6344, + "step": 1152 + }, + { + "epoch": 0.9224, + "grad_norm": 0.5862578699477898, + "learning_rate": 3.1442628972662704e-06, + "loss": 0.8543, + "step": 1153 + }, + { + "epoch": 0.9232, + "grad_norm": 0.41667306780092783, + "learning_rate": 3.0801000800333877e-06, + "loss": 0.6662, + "step": 1154 + }, + { + "epoch": 0.924, + "grad_norm": 0.40082055089727653, + "learning_rate": 3.0165884520461316e-06, + "loss": 0.6595, + "step": 1155 + }, + { + "epoch": 0.9248, + "grad_norm": 0.550060804819665, + "learning_rate": 2.9537284400289355e-06, + "loss": 0.847, + "step": 1156 + }, + { + "epoch": 0.9256, + "grad_norm": 0.5639909001781118, + "learning_rate": 2.8915204663281013e-06, + "loss": 0.7041, + "step": 1157 + }, + { + "epoch": 0.9264, + "grad_norm": 0.5914599041965697, + "learning_rate": 2.8299649489090475e-06, + "loss": 0.739, + "step": 1158 + }, + { + "epoch": 0.9272, + "grad_norm": 0.5320383614652344, + "learning_rate": 2.7690623013533976e-06, + "loss": 0.7595, + "step": 1159 + }, + { + "epoch": 0.928, + "grad_norm": 0.5121040065383884, + "learning_rate": 2.708812932856253e-06, + "loss": 0.649, + "step": 1160 + }, + { + "epoch": 0.9288, + "grad_norm": 0.5353742601330876, + "learning_rate": 2.649217248223468e-06, + "loss": 0.8626, + "step": 1161 + }, + { + "epoch": 0.9296, + "grad_norm": 0.5437078836728698, + "learning_rate": 2.590275647868867e-06, + "loss": 0.8018, + "step": 1162 + }, + { + "epoch": 0.9304, + "grad_norm": 0.4711045843340895, + "learning_rate": 2.5319885278115906e-06, + "loss": 0.6585, + "step": 1163 + }, + { + "epoch": 0.9312, + "grad_norm": 0.41446496435810776, + "learning_rate": 2.4743562796734622e-06, + "loss": 0.8136, + "step": 1164 + }, + { + "epoch": 0.932, + "grad_norm": 0.47359698246292387, + "learning_rate": 2.4173792906762804e-06, + "loss": 0.6875, + "step": 1165 + }, + { + "epoch": 0.9328, + "grad_norm": 0.481888636253016, + "learning_rate": 2.3610579436393e-06, + "loss": 0.7655, + "step": 1166 + }, + { + "epoch": 0.9336, + "grad_norm": 0.48866025654384593, + "learning_rate": 2.3053926169765984e-06, + "loss": 0.7292, + "step": 1167 + }, + { + "epoch": 0.9344, + "grad_norm": 0.5234734449607475, + "learning_rate": 2.250383684694579e-06, + "loss": 0.7774, + "step": 1168 + }, + { + "epoch": 0.9352, + "grad_norm": 0.45442866945514243, + "learning_rate": 2.1960315163894075e-06, + "loss": 0.744, + "step": 1169 + }, + { + "epoch": 0.936, + "grad_norm": 0.5707376554409354, + "learning_rate": 2.1423364772445887e-06, + "loss": 0.7974, + "step": 1170 + }, + { + "epoch": 0.9368, + "grad_norm": 0.47615273754341614, + "learning_rate": 2.0892989280284823e-06, + "loss": 0.722, + "step": 1171 + }, + { + "epoch": 0.9376, + "grad_norm": 0.5980890476489649, + "learning_rate": 2.036919225091827e-06, + "loss": 0.7247, + "step": 1172 + }, + { + "epoch": 0.9384, + "grad_norm": 0.5078096465834021, + "learning_rate": 1.9851977203654835e-06, + "loss": 0.8045, + "step": 1173 + }, + { + "epoch": 0.9392, + "grad_norm": 0.4942281025658662, + "learning_rate": 1.9341347613579087e-06, + "loss": 0.8084, + "step": 1174 + }, + { + "epoch": 0.94, + "grad_norm": 0.5603283271593273, + "learning_rate": 1.8837306911529184e-06, + "loss": 0.7097, + "step": 1175 + }, + { + "epoch": 0.9408, + "grad_norm": 0.524348482958465, + "learning_rate": 1.8339858484073935e-06, + "loss": 0.736, + "step": 1176 + }, + { + "epoch": 0.9416, + "grad_norm": 0.5103150111409667, + "learning_rate": 1.7849005673489127e-06, + "loss": 0.7533, + "step": 1177 + }, + { + "epoch": 0.9424, + "grad_norm": 0.47126320658347026, + "learning_rate": 1.7364751777736332e-06, + "loss": 0.6255, + "step": 1178 + }, + { + "epoch": 0.9432, + "grad_norm": 0.4374595099907046, + "learning_rate": 1.6887100050439587e-06, + "loss": 0.6519, + "step": 1179 + }, + { + "epoch": 0.944, + "grad_norm": 0.4303500396061869, + "learning_rate": 1.6416053700863964e-06, + "loss": 0.6718, + "step": 1180 + }, + { + "epoch": 0.9448, + "grad_norm": 0.5027048542692559, + "learning_rate": 1.595161589389449e-06, + "loss": 0.7632, + "step": 1181 + }, + { + "epoch": 0.9456, + "grad_norm": 0.40345310504503545, + "learning_rate": 1.5493789750014031e-06, + "loss": 0.6327, + "step": 1182 + }, + { + "epoch": 0.9464, + "grad_norm": 0.5235635729752205, + "learning_rate": 1.5042578345283108e-06, + "loss": 0.7656, + "step": 1183 + }, + { + "epoch": 0.9472, + "grad_norm": 0.46775735024885157, + "learning_rate": 1.459798471131868e-06, + "loss": 0.6709, + "step": 1184 + }, + { + "epoch": 0.948, + "grad_norm": 0.44824632687201565, + "learning_rate": 1.4160011835273934e-06, + "loss": 0.6463, + "step": 1185 + }, + { + "epoch": 0.9488, + "grad_norm": 0.43119055221852337, + "learning_rate": 1.3728662659818204e-06, + "loss": 0.7009, + "step": 1186 + }, + { + "epoch": 0.9496, + "grad_norm": 0.49913531126759625, + "learning_rate": 1.3303940083117527e-06, + "loss": 0.6509, + "step": 1187 + }, + { + "epoch": 0.9504, + "grad_norm": 0.5102335232179838, + "learning_rate": 1.2885846958814673e-06, + "loss": 0.7075, + "step": 1188 + }, + { + "epoch": 0.9512, + "grad_norm": 0.4969901617512161, + "learning_rate": 1.2474386096010039e-06, + "loss": 0.8466, + "step": 1189 + }, + { + "epoch": 0.952, + "grad_norm": 0.3780136813925014, + "learning_rate": 1.2069560259243328e-06, + "loss": 0.6087, + "step": 1190 + }, + { + "epoch": 0.9528, + "grad_norm": 0.5465231620290661, + "learning_rate": 1.1671372168474138e-06, + "loss": 0.7149, + "step": 1191 + }, + { + "epoch": 0.9536, + "grad_norm": 0.49426935806759537, + "learning_rate": 1.1279824499064396e-06, + "loss": 0.7381, + "step": 1192 + }, + { + "epoch": 0.9544, + "grad_norm": 0.4569170953485425, + "learning_rate": 1.089491988176017e-06, + "loss": 0.7881, + "step": 1193 + }, + { + "epoch": 0.9552, + "grad_norm": 0.5041880961100881, + "learning_rate": 1.0516660902673448e-06, + "loss": 0.7477, + "step": 1194 + }, + { + "epoch": 0.956, + "grad_norm": 0.40728285040341056, + "learning_rate": 1.014505010326583e-06, + "loss": 0.7686, + "step": 1195 + }, + { + "epoch": 0.9568, + "grad_norm": 0.5505105644553496, + "learning_rate": 9.780089980330642e-07, + "loss": 0.8215, + "step": 1196 + }, + { + "epoch": 0.9576, + "grad_norm": 0.5172856369067419, + "learning_rate": 9.421782985976068e-07, + "loss": 0.7142, + "step": 1197 + }, + { + "epoch": 0.9584, + "grad_norm": 0.499399811683604, + "learning_rate": 9.070131527609604e-07, + "loss": 0.7746, + "step": 1198 + }, + { + "epoch": 0.9592, + "grad_norm": 0.5647289041005384, + "learning_rate": 8.725137967920738e-07, + "loss": 0.8716, + "step": 1199 + }, + { + "epoch": 0.96, + "grad_norm": 0.558860908349905, + "learning_rate": 8.386804624865851e-07, + "loss": 0.7367, + "step": 1200 + }, + { + "epoch": 0.9608, + "grad_norm": 0.4962091966102615, + "learning_rate": 8.055133771652345e-07, + "loss": 0.718, + "step": 1201 + }, + { + "epoch": 0.9616, + "grad_norm": 0.5761548093582705, + "learning_rate": 7.730127636723539e-07, + "loss": 0.7649, + "step": 1202 + }, + { + "epoch": 0.9624, + "grad_norm": 0.489217874580784, + "learning_rate": 7.411788403743237e-07, + "loss": 0.6173, + "step": 1203 + }, + { + "epoch": 0.9632, + "grad_norm": 0.49397258325557336, + "learning_rate": 7.100118211581852e-07, + "loss": 0.7394, + "step": 1204 + }, + { + "epoch": 0.964, + "grad_norm": 0.4770680027485727, + "learning_rate": 6.7951191543012e-07, + "loss": 0.7152, + "step": 1205 + }, + { + "epoch": 0.9648, + "grad_norm": 0.408646861827668, + "learning_rate": 6.496793281141056e-07, + "loss": 0.6599, + "step": 1206 + }, + { + "epoch": 0.9656, + "grad_norm": 0.4389837381985532, + "learning_rate": 6.205142596505176e-07, + "loss": 0.7895, + "step": 1207 + }, + { + "epoch": 0.9664, + "grad_norm": 0.44752592832878274, + "learning_rate": 5.920169059947411e-07, + "loss": 0.6156, + "step": 1208 + }, + { + "epoch": 0.9672, + "grad_norm": 0.5836464970149965, + "learning_rate": 5.64187458615939e-07, + "loss": 0.8533, + "step": 1209 + }, + { + "epoch": 0.968, + "grad_norm": 0.45473012560542037, + "learning_rate": 5.370261044956971e-07, + "loss": 0.7159, + "step": 1210 + }, + { + "epoch": 0.9688, + "grad_norm": 0.5057251789768997, + "learning_rate": 5.105330261267916e-07, + "loss": 0.6794, + "step": 1211 + }, + { + "epoch": 0.9696, + "grad_norm": 0.4317512882835771, + "learning_rate": 4.847084015119574e-07, + "loss": 0.6841, + "step": 1212 + }, + { + "epoch": 0.9704, + "grad_norm": 0.5449199765588195, + "learning_rate": 4.5955240416271084e-07, + "loss": 0.8055, + "step": 1213 + }, + { + "epoch": 0.9712, + "grad_norm": 0.49405887408280086, + "learning_rate": 4.3506520309813947e-07, + "loss": 0.7087, + "step": 1214 + }, + { + "epoch": 0.972, + "grad_norm": 0.7519298947851403, + "learning_rate": 4.112469628438365e-07, + "loss": 1.0253, + "step": 1215 + }, + { + "epoch": 0.9728, + "grad_norm": 0.47327704708410107, + "learning_rate": 3.8809784343072366e-07, + "loss": 0.6788, + "step": 1216 + }, + { + "epoch": 0.9736, + "grad_norm": 0.5334523235482905, + "learning_rate": 3.6561800039403016e-07, + "loss": 0.6963, + "step": 1217 + }, + { + "epoch": 0.9744, + "grad_norm": 0.527512599760187, + "learning_rate": 3.4380758477219333e-07, + "loss": 0.8435, + "step": 1218 + }, + { + "epoch": 0.9752, + "grad_norm": 0.7552247746233401, + "learning_rate": 3.2266674310589273e-07, + "loss": 0.8224, + "step": 1219 + }, + { + "epoch": 0.976, + "grad_norm": 0.4870892290302616, + "learning_rate": 3.0219561743707326e-07, + "loss": 0.76, + "step": 1220 + }, + { + "epoch": 0.9768, + "grad_norm": 0.504087526922365, + "learning_rate": 2.8239434530792365e-07, + "loss": 0.6952, + "step": 1221 + }, + { + "epoch": 0.9776, + "grad_norm": 0.4738775389398773, + "learning_rate": 2.6326305976001055e-07, + "loss": 0.6668, + "step": 1222 + }, + { + "epoch": 0.9784, + "grad_norm": 0.45543660736999125, + "learning_rate": 2.448018893333681e-07, + "loss": 0.7908, + "step": 1223 + }, + { + "epoch": 0.9792, + "grad_norm": 0.4975029393577986, + "learning_rate": 2.2701095806565432e-07, + "loss": 0.7367, + "step": 1224 + }, + { + "epoch": 0.98, + "grad_norm": 0.5299062482370205, + "learning_rate": 2.098903854912515e-07, + "loss": 0.7523, + "step": 1225 + }, + { + "epoch": 0.9808, + "grad_norm": 0.4619442596125995, + "learning_rate": 1.9344028664056713e-07, + "loss": 0.763, + "step": 1226 + }, + { + "epoch": 0.9816, + "grad_norm": 0.4146244321982028, + "learning_rate": 1.7766077203915655e-07, + "loss": 0.6935, + "step": 1227 + }, + { + "epoch": 0.9824, + "grad_norm": 0.5298289674346027, + "learning_rate": 1.6255194770704586e-07, + "loss": 0.7382, + "step": 1228 + }, + { + "epoch": 0.9832, + "grad_norm": 0.3724034691465779, + "learning_rate": 1.481139151579991e-07, + "loss": 0.6287, + "step": 1229 + }, + { + "epoch": 0.984, + "grad_norm": 0.5051314347173688, + "learning_rate": 1.3434677139885222e-07, + "loss": 0.7146, + "step": 1230 + }, + { + "epoch": 0.9848, + "grad_norm": 0.44418282139927207, + "learning_rate": 1.2125060892881346e-07, + "loss": 0.6672, + "step": 1231 + }, + { + "epoch": 0.9856, + "grad_norm": 0.5281467347436645, + "learning_rate": 1.0882551573891953e-07, + "loss": 0.6843, + "step": 1232 + }, + { + "epoch": 0.9864, + "grad_norm": 0.5394669528581812, + "learning_rate": 9.707157531134713e-08, + "loss": 0.8433, + "step": 1233 + }, + { + "epoch": 0.9872, + "grad_norm": 0.673443803006134, + "learning_rate": 8.598886661895788e-08, + "loss": 0.7906, + "step": 1234 + }, + { + "epoch": 0.988, + "grad_norm": 0.473550295777955, + "learning_rate": 7.557746412468758e-08, + "loss": 0.6912, + "step": 1235 + }, + { + "epoch": 0.9888, + "grad_norm": 0.49779496772024334, + "learning_rate": 6.583743778106887e-08, + "loss": 0.7571, + "step": 1236 + }, + { + "epoch": 0.9896, + "grad_norm": 0.4834002939644358, + "learning_rate": 5.6768853029787184e-08, + "loss": 0.7499, + "step": 1237 + }, + { + "epoch": 0.9904, + "grad_norm": 0.5449031268397762, + "learning_rate": 4.837177080119215e-08, + "loss": 0.777, + "step": 1238 + }, + { + "epoch": 0.9912, + "grad_norm": 0.46602851849871574, + "learning_rate": 4.064624751394242e-08, + "loss": 0.7342, + "step": 1239 + }, + { + "epoch": 0.992, + "grad_norm": 0.5296666070983255, + "learning_rate": 3.359233507459481e-08, + "loss": 0.7446, + "step": 1240 + }, + { + "epoch": 0.9928, + "grad_norm": 0.47795206546734526, + "learning_rate": 2.7210080877237976e-08, + "loss": 0.6973, + "step": 1241 + }, + { + "epoch": 0.9936, + "grad_norm": 0.48428904753018837, + "learning_rate": 2.1499527803214846e-08, + "loss": 0.6719, + "step": 1242 + }, + { + "epoch": 0.9944, + "grad_norm": 0.5101617903481329, + "learning_rate": 1.646071422083395e-08, + "loss": 0.7046, + "step": 1243 + }, + { + "epoch": 0.9952, + "grad_norm": 0.5035560136801784, + "learning_rate": 1.209367398504746e-08, + "loss": 0.7437, + "step": 1244 + }, + { + "epoch": 0.996, + "grad_norm": 0.4949316018847181, + "learning_rate": 8.398436437317969e-09, + "loss": 0.8164, + "step": 1245 + }, + { + "epoch": 0.9968, + "grad_norm": 0.4514666113382206, + "learning_rate": 5.375026405352035e-09, + "loss": 0.7741, + "step": 1246 + }, + { + "epoch": 0.9976, + "grad_norm": 0.4555143143258106, + "learning_rate": 3.023464202944748e-09, + "loss": 0.7282, + "step": 1247 + }, + { + "epoch": 0.9984, + "grad_norm": 0.5364032120210537, + "learning_rate": 1.3437656298687097e-09, + "loss": 0.781, + "step": 1248 + }, + { + "epoch": 0.9992, + "grad_norm": 0.4292570766269159, + "learning_rate": 3.3594197175190745e-10, + "loss": 0.6679, + "step": 1249 + }, + { + "epoch": 1.0, + "grad_norm": 0.49169442964223203, + "learning_rate": 0.0, + "loss": 0.7719, + "step": 1250 + }, + { + "epoch": 1.0, + "step": 1250, + "total_flos": 1011338945232896.0, + "train_loss": 0.8072394575595856, + "train_runtime": 18634.52, + "train_samples_per_second": 1.073, + "train_steps_per_second": 0.067 + } + ], + "logging_steps": 1.0, + "max_steps": 1250, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1011338945232896.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_3_epochs_1_GA_2_lora/README.md b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_3_epochs_1_GA_2_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_3_epochs_1_GA_2_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_3_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_3_epochs_1_GA_2_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..510a6be2b102e7665c940a276211fb291b0a8b18 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_3_epochs_1_GA_2_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "down_proj", + "up_proj", + "v_proj", + "q_proj", + "o_proj", + "gate_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7f54ce1a27d1f0d933d828f6ccd9ba59740825b4 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b28e891b8dc636a042936948931b6fd50b267e56a0860ba391f734347487bd74 +size 671150064 diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_3_epochs_1_GA_2_lora/config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_3_epochs_1_GA_2_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_3_epochs_1_GA_2_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..c8ab5f20ba85db1903a8ade3f488519eae8f383a --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd8b411e68e555a9e54f30cd99932b89ebc7ea20d16427f6b11e184de776b61d +size 918507402 diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_3_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_3_epochs_1_GA_2_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..503ead140761ab7a0e61d00bee04ac438dcc29c4 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_20000_repeat_3_epochs_1_GA_2_lora/trainer_state.json @@ -0,0 +1,8792 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 1250, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0008, + "grad_norm": 0.9080813842870766, + "learning_rate": 5.263157894736842e-06, + "loss": 1.3037, + "step": 1 + }, + { + "epoch": 0.0016, + "grad_norm": 0.8041941309224327, + "learning_rate": 1.0526315789473684e-05, + "loss": 1.1017, + "step": 2 + }, + { + "epoch": 0.0024, + "grad_norm": 1.2219852888632978, + "learning_rate": 1.5789473684210526e-05, + "loss": 1.4947, + "step": 3 + }, + { + "epoch": 0.0032, + "grad_norm": 1.0625205619778837, + "learning_rate": 2.105263157894737e-05, + "loss": 1.405, + "step": 4 + }, + { + "epoch": 0.004, + "grad_norm": 0.9969124865984521, + "learning_rate": 2.6315789473684212e-05, + "loss": 1.3029, + "step": 5 + }, + { + "epoch": 0.0048, + "grad_norm": 0.7206323835164766, + "learning_rate": 3.157894736842105e-05, + "loss": 1.3023, + "step": 6 + }, + { + "epoch": 0.0056, + "grad_norm": 0.7470650917108264, + "learning_rate": 3.6842105263157895e-05, + "loss": 1.2665, + "step": 7 + }, + { + "epoch": 0.0064, + "grad_norm": 0.948707380340728, + "learning_rate": 4.210526315789474e-05, + "loss": 1.3803, + "step": 8 + }, + { + "epoch": 0.0072, + "grad_norm": 0.8089592105967153, + "learning_rate": 4.736842105263158e-05, + "loss": 1.1756, + "step": 9 + }, + { + "epoch": 0.008, + "grad_norm": 0.8676067765283894, + "learning_rate": 5.2631578947368424e-05, + "loss": 1.0588, + "step": 10 + }, + { + "epoch": 0.0088, + "grad_norm": 1.263092953497496, + "learning_rate": 5.789473684210527e-05, + "loss": 1.248, + "step": 11 + }, + { + "epoch": 0.0096, + "grad_norm": 0.9473292732665697, + "learning_rate": 6.31578947368421e-05, + "loss": 1.1317, + "step": 12 + }, + { + "epoch": 0.0104, + "grad_norm": 0.8477610058044885, + "learning_rate": 6.842105263157895e-05, + "loss": 1.0623, + "step": 13 + }, + { + "epoch": 0.0112, + "grad_norm": 0.7667070066497372, + "learning_rate": 7.368421052631579e-05, + "loss": 1.0126, + "step": 14 + }, + { + "epoch": 0.012, + "grad_norm": 0.9625049180961249, + "learning_rate": 7.894736842105263e-05, + "loss": 1.1966, + "step": 15 + }, + { + "epoch": 0.0128, + "grad_norm": 0.8126893313366784, + "learning_rate": 8.421052631578948e-05, + "loss": 0.9224, + "step": 16 + }, + { + "epoch": 0.0136, + "grad_norm": 0.5998893295288562, + "learning_rate": 8.947368421052632e-05, + "loss": 0.8809, + "step": 17 + }, + { + "epoch": 0.0144, + "grad_norm": 0.5525965962771149, + "learning_rate": 9.473684210526316e-05, + "loss": 0.9377, + "step": 18 + }, + { + "epoch": 0.0152, + "grad_norm": 0.6807763485523121, + "learning_rate": 0.0001, + "loss": 1.0672, + "step": 19 + }, + { + "epoch": 0.016, + "grad_norm": 0.5174866286955669, + "learning_rate": 0.00010526315789473685, + "loss": 0.9056, + "step": 20 + }, + { + "epoch": 0.0168, + "grad_norm": 0.6592880038492572, + "learning_rate": 0.0001105263157894737, + "loss": 1.0465, + "step": 21 + }, + { + "epoch": 0.0176, + "grad_norm": 0.6491568093567391, + "learning_rate": 0.00011578947368421053, + "loss": 1.0704, + "step": 22 + }, + { + "epoch": 0.0184, + "grad_norm": 0.5331666347350892, + "learning_rate": 0.00012105263157894738, + "loss": 0.8278, + "step": 23 + }, + { + "epoch": 0.0192, + "grad_norm": 0.5730405073227411, + "learning_rate": 0.0001263157894736842, + "loss": 0.8971, + "step": 24 + }, + { + "epoch": 0.02, + "grad_norm": 0.5443068513791588, + "learning_rate": 0.00013157894736842108, + "loss": 0.8812, + "step": 25 + }, + { + "epoch": 0.0208, + "grad_norm": 0.5864765390444897, + "learning_rate": 0.0001368421052631579, + "loss": 0.9682, + "step": 26 + }, + { + "epoch": 0.0216, + "grad_norm": 0.6398113996329627, + "learning_rate": 0.00014210526315789474, + "loss": 0.929, + "step": 27 + }, + { + "epoch": 0.0224, + "grad_norm": 0.6544333191146848, + "learning_rate": 0.00014736842105263158, + "loss": 1.0644, + "step": 28 + }, + { + "epoch": 0.0232, + "grad_norm": 0.56998721759652, + "learning_rate": 0.00015263157894736845, + "loss": 0.8449, + "step": 29 + }, + { + "epoch": 0.024, + "grad_norm": 0.576332085668357, + "learning_rate": 0.00015789473684210527, + "loss": 0.8513, + "step": 30 + }, + { + "epoch": 0.0248, + "grad_norm": 0.6675100680815883, + "learning_rate": 0.0001631578947368421, + "loss": 0.94, + "step": 31 + }, + { + "epoch": 0.0256, + "grad_norm": 0.5728044946917537, + "learning_rate": 0.00016842105263157895, + "loss": 0.8683, + "step": 32 + }, + { + "epoch": 0.0264, + "grad_norm": 0.8221054156386518, + "learning_rate": 0.0001736842105263158, + "loss": 1.0743, + "step": 33 + }, + { + "epoch": 0.0272, + "grad_norm": 0.4980006872192115, + "learning_rate": 0.00017894736842105264, + "loss": 0.9559, + "step": 34 + }, + { + "epoch": 0.028, + "grad_norm": 0.6054537884300073, + "learning_rate": 0.00018421052631578948, + "loss": 1.0058, + "step": 35 + }, + { + "epoch": 0.0288, + "grad_norm": 0.5007100149127811, + "learning_rate": 0.00018947368421052632, + "loss": 0.8745, + "step": 36 + }, + { + "epoch": 0.0296, + "grad_norm": 0.5122902755716746, + "learning_rate": 0.00019473684210526317, + "loss": 0.8939, + "step": 37 + }, + { + "epoch": 0.0304, + "grad_norm": 0.6651795910026952, + "learning_rate": 0.0002, + "loss": 0.9332, + "step": 38 + }, + { + "epoch": 0.0312, + "grad_norm": 0.7491880786339893, + "learning_rate": 0.00019999966405802826, + "loss": 1.0938, + "step": 39 + }, + { + "epoch": 0.032, + "grad_norm": 0.5636400991503409, + "learning_rate": 0.00019999865623437013, + "loss": 0.8543, + "step": 40 + }, + { + "epoch": 0.0328, + "grad_norm": 0.6360793584421368, + "learning_rate": 0.00019999697653579705, + "loss": 0.9355, + "step": 41 + }, + { + "epoch": 0.0336, + "grad_norm": 0.5589998853524988, + "learning_rate": 0.00019999462497359466, + "loss": 0.9103, + "step": 42 + }, + { + "epoch": 0.0344, + "grad_norm": 0.5226562362394936, + "learning_rate": 0.0001999916015635627, + "loss": 0.7912, + "step": 43 + }, + { + "epoch": 0.0352, + "grad_norm": 0.540808102334626, + "learning_rate": 0.00019998790632601496, + "loss": 0.8746, + "step": 44 + }, + { + "epoch": 0.036, + "grad_norm": 0.6609373211062586, + "learning_rate": 0.00019998353928577919, + "loss": 0.9303, + "step": 45 + }, + { + "epoch": 0.0368, + "grad_norm": 0.6490434451115461, + "learning_rate": 0.0001999785004721968, + "loss": 0.9519, + "step": 46 + }, + { + "epoch": 0.0376, + "grad_norm": 0.6205685785171462, + "learning_rate": 0.0001999727899191228, + "loss": 0.985, + "step": 47 + }, + { + "epoch": 0.0384, + "grad_norm": 0.6233932799073175, + "learning_rate": 0.00019996640766492543, + "loss": 0.9277, + "step": 48 + }, + { + "epoch": 0.0392, + "grad_norm": 0.5532631479296956, + "learning_rate": 0.00019995935375248606, + "loss": 0.9422, + "step": 49 + }, + { + "epoch": 0.04, + "grad_norm": 0.5500345487184016, + "learning_rate": 0.00019995162822919883, + "loss": 0.8876, + "step": 50 + }, + { + "epoch": 0.0408, + "grad_norm": 0.5753158846253149, + "learning_rate": 0.00019994323114697022, + "loss": 0.9135, + "step": 51 + }, + { + "epoch": 0.0416, + "grad_norm": 0.5410274988154797, + "learning_rate": 0.00019993416256221895, + "loss": 0.9717, + "step": 52 + }, + { + "epoch": 0.0424, + "grad_norm": 0.4639919243788228, + "learning_rate": 0.0001999244225358753, + "loss": 0.8245, + "step": 53 + }, + { + "epoch": 0.0432, + "grad_norm": 0.6341811700951703, + "learning_rate": 0.00019991401113338104, + "loss": 0.8575, + "step": 54 + }, + { + "epoch": 0.044, + "grad_norm": 0.6186586598120916, + "learning_rate": 0.00019990292842468868, + "loss": 1.0047, + "step": 55 + }, + { + "epoch": 0.0448, + "grad_norm": 0.6975096235960725, + "learning_rate": 0.00019989117448426108, + "loss": 1.0039, + "step": 56 + }, + { + "epoch": 0.0456, + "grad_norm": 0.6137359431792841, + "learning_rate": 0.0001998787493910712, + "loss": 0.9686, + "step": 57 + }, + { + "epoch": 0.0464, + "grad_norm": 0.56801863237565, + "learning_rate": 0.00019986565322860115, + "loss": 0.9022, + "step": 58 + }, + { + "epoch": 0.0472, + "grad_norm": 0.6280518664851908, + "learning_rate": 0.000199851886084842, + "loss": 0.9231, + "step": 59 + }, + { + "epoch": 0.048, + "grad_norm": 0.47910875481119425, + "learning_rate": 0.00019983744805229296, + "loss": 0.8217, + "step": 60 + }, + { + "epoch": 0.0488, + "grad_norm": 0.5688670302130326, + "learning_rate": 0.00019982233922796085, + "loss": 0.9648, + "step": 61 + }, + { + "epoch": 0.0496, + "grad_norm": 0.47222584023221503, + "learning_rate": 0.00019980655971335945, + "loss": 0.8544, + "step": 62 + }, + { + "epoch": 0.0504, + "grad_norm": 0.48411043524638503, + "learning_rate": 0.00019979010961450878, + "loss": 0.8043, + "step": 63 + }, + { + "epoch": 0.0512, + "grad_norm": 0.5523628206241382, + "learning_rate": 0.00019977298904193437, + "loss": 0.7661, + "step": 64 + }, + { + "epoch": 0.052, + "grad_norm": 0.606102474993822, + "learning_rate": 0.00019975519811066663, + "loss": 0.88, + "step": 65 + }, + { + "epoch": 0.0528, + "grad_norm": 0.47264111938475795, + "learning_rate": 0.00019973673694024, + "loss": 0.7791, + "step": 66 + }, + { + "epoch": 0.0536, + "grad_norm": 0.6030545368737588, + "learning_rate": 0.0001997176056546921, + "loss": 0.7877, + "step": 67 + }, + { + "epoch": 0.0544, + "grad_norm": 0.5380818452529288, + "learning_rate": 0.00019969780438256293, + "loss": 0.8597, + "step": 68 + }, + { + "epoch": 0.0552, + "grad_norm": 0.5668178890984674, + "learning_rate": 0.0001996773332568941, + "loss": 0.9309, + "step": 69 + }, + { + "epoch": 0.056, + "grad_norm": 0.7492229436319809, + "learning_rate": 0.0001996561924152278, + "loss": 0.8776, + "step": 70 + }, + { + "epoch": 0.0568, + "grad_norm": 0.45594386398155734, + "learning_rate": 0.00019963438199960599, + "loss": 0.8762, + "step": 71 + }, + { + "epoch": 0.0576, + "grad_norm": 0.6689489280264908, + "learning_rate": 0.0001996119021565693, + "loss": 0.7738, + "step": 72 + }, + { + "epoch": 0.0584, + "grad_norm": 0.5824962388090401, + "learning_rate": 0.00019958875303715615, + "loss": 0.9494, + "step": 73 + }, + { + "epoch": 0.0592, + "grad_norm": 0.6651407546192796, + "learning_rate": 0.0001995649347969019, + "loss": 0.9906, + "step": 74 + }, + { + "epoch": 0.06, + "grad_norm": 0.5912319175461134, + "learning_rate": 0.0001995404475958373, + "loss": 0.9225, + "step": 75 + }, + { + "epoch": 0.0608, + "grad_norm": 0.5877386519859602, + "learning_rate": 0.00019951529159848805, + "loss": 0.8798, + "step": 76 + }, + { + "epoch": 0.0616, + "grad_norm": 0.586893525784064, + "learning_rate": 0.0001994894669738732, + "loss": 0.8399, + "step": 77 + }, + { + "epoch": 0.0624, + "grad_norm": 0.4990675239565995, + "learning_rate": 0.00019946297389550433, + "loss": 0.8533, + "step": 78 + }, + { + "epoch": 0.0632, + "grad_norm": 0.5485876587251447, + "learning_rate": 0.0001994358125413841, + "loss": 0.8259, + "step": 79 + }, + { + "epoch": 0.064, + "grad_norm": 0.5101625540764176, + "learning_rate": 0.00019940798309400526, + "loss": 0.8224, + "step": 80 + }, + { + "epoch": 0.0648, + "grad_norm": 0.5053166186801189, + "learning_rate": 0.0001993794857403495, + "loss": 0.8231, + "step": 81 + }, + { + "epoch": 0.0656, + "grad_norm": 0.6467634237745158, + "learning_rate": 0.0001993503206718859, + "loss": 0.9116, + "step": 82 + }, + { + "epoch": 0.0664, + "grad_norm": 0.5479339960363587, + "learning_rate": 0.0001993204880845699, + "loss": 0.9351, + "step": 83 + }, + { + "epoch": 0.0672, + "grad_norm": 0.5165396779025556, + "learning_rate": 0.00019928998817884182, + "loss": 0.8045, + "step": 84 + }, + { + "epoch": 0.068, + "grad_norm": 0.4662633534647873, + "learning_rate": 0.00019925882115962568, + "loss": 0.8121, + "step": 85 + }, + { + "epoch": 0.0688, + "grad_norm": 0.548071998854012, + "learning_rate": 0.00019922698723632767, + "loss": 0.8916, + "step": 86 + }, + { + "epoch": 0.0696, + "grad_norm": 0.4529264571570421, + "learning_rate": 0.00019919448662283478, + "loss": 0.8347, + "step": 87 + }, + { + "epoch": 0.0704, + "grad_norm": 0.6098404262902024, + "learning_rate": 0.00019916131953751342, + "loss": 0.9271, + "step": 88 + }, + { + "epoch": 0.0712, + "grad_norm": 0.5601937657742456, + "learning_rate": 0.00019912748620320794, + "loss": 0.8876, + "step": 89 + }, + { + "epoch": 0.072, + "grad_norm": 0.522349236836615, + "learning_rate": 0.00019909298684723904, + "loss": 0.8808, + "step": 90 + }, + { + "epoch": 0.0728, + "grad_norm": 0.5918162051866122, + "learning_rate": 0.00019905782170140238, + "loss": 0.9784, + "step": 91 + }, + { + "epoch": 0.0736, + "grad_norm": 0.48353602339655577, + "learning_rate": 0.00019902199100196697, + "loss": 0.8732, + "step": 92 + }, + { + "epoch": 0.0744, + "grad_norm": 0.563166983744731, + "learning_rate": 0.00019898549498967343, + "loss": 0.9462, + "step": 93 + }, + { + "epoch": 0.0752, + "grad_norm": 0.48223871559795334, + "learning_rate": 0.00019894833390973266, + "loss": 0.8935, + "step": 94 + }, + { + "epoch": 0.076, + "grad_norm": 0.5142521212646851, + "learning_rate": 0.000198910508011824, + "loss": 0.8258, + "step": 95 + }, + { + "epoch": 0.0768, + "grad_norm": 0.6175859185606641, + "learning_rate": 0.00019887201755009357, + "loss": 0.9889, + "step": 96 + }, + { + "epoch": 0.0776, + "grad_norm": 0.5818806624503104, + "learning_rate": 0.00019883286278315262, + "loss": 0.8124, + "step": 97 + }, + { + "epoch": 0.0784, + "grad_norm": 0.559895831107885, + "learning_rate": 0.0001987930439740757, + "loss": 0.868, + "step": 98 + }, + { + "epoch": 0.0792, + "grad_norm": 0.6709827607983061, + "learning_rate": 0.00019875256139039902, + "loss": 1.036, + "step": 99 + }, + { + "epoch": 0.08, + "grad_norm": 0.45243485733229544, + "learning_rate": 0.00019871141530411853, + "loss": 0.7627, + "step": 100 + }, + { + "epoch": 0.0808, + "grad_norm": 0.683941710270608, + "learning_rate": 0.00019866960599168826, + "loss": 0.9704, + "step": 101 + }, + { + "epoch": 0.0816, + "grad_norm": 0.6248216468352323, + "learning_rate": 0.0001986271337340182, + "loss": 1.1019, + "step": 102 + }, + { + "epoch": 0.0824, + "grad_norm": 0.5535038671388882, + "learning_rate": 0.0001985839988164726, + "loss": 0.8644, + "step": 103 + }, + { + "epoch": 0.0832, + "grad_norm": 0.5286991560003955, + "learning_rate": 0.00019854020152886814, + "loss": 0.8586, + "step": 104 + }, + { + "epoch": 0.084, + "grad_norm": 0.7117394642571703, + "learning_rate": 0.00019849574216547171, + "loss": 0.9799, + "step": 105 + }, + { + "epoch": 0.0848, + "grad_norm": 0.5948683491083182, + "learning_rate": 0.0001984506210249986, + "loss": 0.8904, + "step": 106 + }, + { + "epoch": 0.0856, + "grad_norm": 0.5310452065515473, + "learning_rate": 0.00019840483841061058, + "loss": 0.8672, + "step": 107 + }, + { + "epoch": 0.0864, + "grad_norm": 0.6362836731096708, + "learning_rate": 0.00019835839462991361, + "loss": 0.8982, + "step": 108 + }, + { + "epoch": 0.0872, + "grad_norm": 0.6631034346390334, + "learning_rate": 0.00019831128999495606, + "loss": 0.9353, + "step": 109 + }, + { + "epoch": 0.088, + "grad_norm": 0.596396075120275, + "learning_rate": 0.00019826352482222638, + "loss": 0.7807, + "step": 110 + }, + { + "epoch": 0.0888, + "grad_norm": 0.6183944289155431, + "learning_rate": 0.0001982150994326511, + "loss": 0.9543, + "step": 111 + }, + { + "epoch": 0.0896, + "grad_norm": 0.559201812127621, + "learning_rate": 0.00019816601415159263, + "loss": 0.9205, + "step": 112 + }, + { + "epoch": 0.0904, + "grad_norm": 0.5083531814012113, + "learning_rate": 0.0001981162693088471, + "loss": 0.7971, + "step": 113 + }, + { + "epoch": 0.0912, + "grad_norm": 0.5671801338505673, + "learning_rate": 0.0001980658652386421, + "loss": 0.924, + "step": 114 + }, + { + "epoch": 0.092, + "grad_norm": 0.6865308042995342, + "learning_rate": 0.0001980148022796345, + "loss": 0.8445, + "step": 115 + }, + { + "epoch": 0.0928, + "grad_norm": 0.5813178583612161, + "learning_rate": 0.00019796308077490817, + "loss": 0.9226, + "step": 116 + }, + { + "epoch": 0.0936, + "grad_norm": 0.5518225766620017, + "learning_rate": 0.00019791070107197153, + "loss": 0.8605, + "step": 117 + }, + { + "epoch": 0.0944, + "grad_norm": 0.5459956040550069, + "learning_rate": 0.00019785766352275542, + "loss": 0.8972, + "step": 118 + }, + { + "epoch": 0.0952, + "grad_norm": 0.5806101685858397, + "learning_rate": 0.0001978039684836106, + "loss": 0.9246, + "step": 119 + }, + { + "epoch": 0.096, + "grad_norm": 0.6021871429951688, + "learning_rate": 0.00019774961631530545, + "loss": 0.9122, + "step": 120 + }, + { + "epoch": 0.0968, + "grad_norm": 0.5962927746824843, + "learning_rate": 0.0001976946073830234, + "loss": 0.9049, + "step": 121 + }, + { + "epoch": 0.0976, + "grad_norm": 0.4523411532532136, + "learning_rate": 0.00019763894205636072, + "loss": 0.8217, + "step": 122 + }, + { + "epoch": 0.0984, + "grad_norm": 0.5865078340068325, + "learning_rate": 0.00019758262070932375, + "loss": 0.9294, + "step": 123 + }, + { + "epoch": 0.0992, + "grad_norm": 0.45349956904909167, + "learning_rate": 0.00019752564372032657, + "loss": 0.7856, + "step": 124 + }, + { + "epoch": 0.1, + "grad_norm": 0.517464263535899, + "learning_rate": 0.00019746801147218842, + "loss": 0.9466, + "step": 125 + }, + { + "epoch": 0.1008, + "grad_norm": 0.6575914482573735, + "learning_rate": 0.00019740972435213115, + "loss": 1.0001, + "step": 126 + }, + { + "epoch": 0.1016, + "grad_norm": 0.5446715985980961, + "learning_rate": 0.00019735078275177654, + "loss": 0.943, + "step": 127 + }, + { + "epoch": 0.1024, + "grad_norm": 0.7043591135383009, + "learning_rate": 0.00019729118706714375, + "loss": 0.9803, + "step": 128 + }, + { + "epoch": 0.1032, + "grad_norm": 0.6092843978143063, + "learning_rate": 0.00019723093769864663, + "loss": 0.8894, + "step": 129 + }, + { + "epoch": 0.104, + "grad_norm": 0.6176019096628385, + "learning_rate": 0.00019717003505109095, + "loss": 0.9069, + "step": 130 + }, + { + "epoch": 0.1048, + "grad_norm": 0.6835264015786288, + "learning_rate": 0.0001971084795336719, + "loss": 1.055, + "step": 131 + }, + { + "epoch": 0.1056, + "grad_norm": 0.6602792610715091, + "learning_rate": 0.00019704627155997108, + "loss": 0.9217, + "step": 132 + }, + { + "epoch": 0.1064, + "grad_norm": 0.7051153166006852, + "learning_rate": 0.00019698341154795389, + "loss": 0.9149, + "step": 133 + }, + { + "epoch": 0.1072, + "grad_norm": 0.5291715643966506, + "learning_rate": 0.00019691989991996663, + "loss": 0.8375, + "step": 134 + }, + { + "epoch": 0.108, + "grad_norm": 0.8489485432390967, + "learning_rate": 0.00019685573710273376, + "loss": 0.9701, + "step": 135 + }, + { + "epoch": 0.1088, + "grad_norm": 0.7030018259258961, + "learning_rate": 0.0001967909235273549, + "loss": 1.0476, + "step": 136 + }, + { + "epoch": 0.1096, + "grad_norm": 0.41150624043839457, + "learning_rate": 0.00019672545962930215, + "loss": 0.7894, + "step": 137 + }, + { + "epoch": 0.1104, + "grad_norm": 0.6158901773012407, + "learning_rate": 0.00019665934584841682, + "loss": 0.9524, + "step": 138 + }, + { + "epoch": 0.1112, + "grad_norm": 0.5306678189960342, + "learning_rate": 0.00019659258262890683, + "loss": 0.8594, + "step": 139 + }, + { + "epoch": 0.112, + "grad_norm": 0.5973439965714, + "learning_rate": 0.00019652517041934356, + "loss": 0.8997, + "step": 140 + }, + { + "epoch": 0.1128, + "grad_norm": 0.562538296474552, + "learning_rate": 0.00019645710967265882, + "loss": 0.8929, + "step": 141 + }, + { + "epoch": 0.1136, + "grad_norm": 0.5964221253209941, + "learning_rate": 0.00019638840084614182, + "loss": 0.9702, + "step": 142 + }, + { + "epoch": 0.1144, + "grad_norm": 0.6133126092378702, + "learning_rate": 0.00019631904440143612, + "loss": 0.9131, + "step": 143 + }, + { + "epoch": 0.1152, + "grad_norm": 0.5354137818878005, + "learning_rate": 0.00019624904080453655, + "loss": 0.8523, + "step": 144 + }, + { + "epoch": 0.116, + "grad_norm": 0.4640625679295804, + "learning_rate": 0.00019617839052578603, + "loss": 0.7739, + "step": 145 + }, + { + "epoch": 0.1168, + "grad_norm": 0.5512099210314915, + "learning_rate": 0.00019610709403987246, + "loss": 0.9539, + "step": 146 + }, + { + "epoch": 0.1176, + "grad_norm": 0.5112422245899192, + "learning_rate": 0.0001960351518258255, + "loss": 0.809, + "step": 147 + }, + { + "epoch": 0.1184, + "grad_norm": 0.5137823206631175, + "learning_rate": 0.00019596256436701324, + "loss": 0.8841, + "step": 148 + }, + { + "epoch": 0.1192, + "grad_norm": 0.5535363768420065, + "learning_rate": 0.00019588933215113926, + "loss": 0.9786, + "step": 149 + }, + { + "epoch": 0.12, + "grad_norm": 0.716718046449404, + "learning_rate": 0.000195815455670239, + "loss": 0.9377, + "step": 150 + }, + { + "epoch": 0.1208, + "grad_norm": 0.5273600234368598, + "learning_rate": 0.00019574093542067673, + "loss": 0.8908, + "step": 151 + }, + { + "epoch": 0.1216, + "grad_norm": 0.5659469100313295, + "learning_rate": 0.00019566577190314197, + "loss": 0.9273, + "step": 152 + }, + { + "epoch": 0.1224, + "grad_norm": 0.5148339514433602, + "learning_rate": 0.0001955899656226464, + "loss": 0.8526, + "step": 153 + }, + { + "epoch": 0.1232, + "grad_norm": 0.5668077761671138, + "learning_rate": 0.0001955135170885202, + "loss": 0.8812, + "step": 154 + }, + { + "epoch": 0.124, + "grad_norm": 0.5116988392192973, + "learning_rate": 0.0001954364268144088, + "loss": 0.8693, + "step": 155 + }, + { + "epoch": 0.1248, + "grad_norm": 0.5905348006861363, + "learning_rate": 0.00019535869531826937, + "loss": 0.9094, + "step": 156 + }, + { + "epoch": 0.1256, + "grad_norm": 0.5206388139216594, + "learning_rate": 0.00019528032312236736, + "loss": 0.8352, + "step": 157 + }, + { + "epoch": 0.1264, + "grad_norm": 0.6214901397689111, + "learning_rate": 0.00019520131075327298, + "loss": 0.9938, + "step": 158 + }, + { + "epoch": 0.1272, + "grad_norm": 0.737160634050996, + "learning_rate": 0.00019512165874185767, + "loss": 0.9611, + "step": 159 + }, + { + "epoch": 0.128, + "grad_norm": 0.520320642155761, + "learning_rate": 0.00019504136762329047, + "loss": 0.824, + "step": 160 + }, + { + "epoch": 0.1288, + "grad_norm": 0.6084476825668118, + "learning_rate": 0.0001949604379370345, + "loss": 0.8687, + "step": 161 + }, + { + "epoch": 0.1296, + "grad_norm": 0.47688399456411, + "learning_rate": 0.00019487887022684336, + "loss": 0.8424, + "step": 162 + }, + { + "epoch": 0.1304, + "grad_norm": 0.48499020012350524, + "learning_rate": 0.00019479666504075736, + "loss": 0.78, + "step": 163 + }, + { + "epoch": 0.1312, + "grad_norm": 0.540156343662923, + "learning_rate": 0.00019471382293110003, + "loss": 0.9063, + "step": 164 + }, + { + "epoch": 0.132, + "grad_norm": 0.5516619124746854, + "learning_rate": 0.0001946303444544741, + "loss": 1.0197, + "step": 165 + }, + { + "epoch": 0.1328, + "grad_norm": 0.6793270520647662, + "learning_rate": 0.00019454623017175812, + "loss": 0.9356, + "step": 166 + }, + { + "epoch": 0.1336, + "grad_norm": 0.6449131975857184, + "learning_rate": 0.00019446148064810242, + "loss": 0.9387, + "step": 167 + }, + { + "epoch": 0.1344, + "grad_norm": 0.5909935070229813, + "learning_rate": 0.00019437609645292546, + "loss": 1.0363, + "step": 168 + }, + { + "epoch": 0.1352, + "grad_norm": 0.48583088614787706, + "learning_rate": 0.00019429007815990993, + "loss": 0.8313, + "step": 169 + }, + { + "epoch": 0.136, + "grad_norm": 0.628898629621968, + "learning_rate": 0.0001942034263469989, + "loss": 0.8713, + "step": 170 + }, + { + "epoch": 0.1368, + "grad_norm": 0.6085125380724073, + "learning_rate": 0.00019411614159639204, + "loss": 0.8776, + "step": 171 + }, + { + "epoch": 0.1376, + "grad_norm": 0.5975998757056563, + "learning_rate": 0.00019402822449454153, + "loss": 0.819, + "step": 172 + }, + { + "epoch": 0.1384, + "grad_norm": 0.6390380149254054, + "learning_rate": 0.00019393967563214833, + "loss": 0.8032, + "step": 173 + }, + { + "epoch": 0.1392, + "grad_norm": 0.46959578227076104, + "learning_rate": 0.00019385049560415794, + "loss": 0.7833, + "step": 174 + }, + { + "epoch": 0.14, + "grad_norm": 0.4762964480716448, + "learning_rate": 0.00019376068500975667, + "loss": 0.837, + "step": 175 + }, + { + "epoch": 0.1408, + "grad_norm": 0.5105373450821388, + "learning_rate": 0.00019367024445236754, + "loss": 0.8487, + "step": 176 + }, + { + "epoch": 0.1416, + "grad_norm": 0.5468553130094058, + "learning_rate": 0.000193579174539646, + "loss": 0.8661, + "step": 177 + }, + { + "epoch": 0.1424, + "grad_norm": 0.6907998813306322, + "learning_rate": 0.00019348747588347637, + "loss": 1.02, + "step": 178 + }, + { + "epoch": 0.1432, + "grad_norm": 0.6055411372103042, + "learning_rate": 0.00019339514909996706, + "loss": 0.8596, + "step": 179 + }, + { + "epoch": 0.144, + "grad_norm": 0.49086621144765663, + "learning_rate": 0.00019330219480944694, + "loss": 0.8739, + "step": 180 + }, + { + "epoch": 0.1448, + "grad_norm": 0.5263248779575109, + "learning_rate": 0.00019320861363646095, + "loss": 0.8587, + "step": 181 + }, + { + "epoch": 0.1456, + "grad_norm": 0.5200412219564152, + "learning_rate": 0.00019311440620976597, + "loss": 0.9819, + "step": 182 + }, + { + "epoch": 0.1464, + "grad_norm": 0.4774845980916723, + "learning_rate": 0.00019301957316232658, + "loss": 0.8869, + "step": 183 + }, + { + "epoch": 0.1472, + "grad_norm": 0.5959662925959347, + "learning_rate": 0.0001929241151313108, + "loss": 0.9383, + "step": 184 + }, + { + "epoch": 0.148, + "grad_norm": 0.9842793757570049, + "learning_rate": 0.0001928280327580858, + "loss": 0.8066, + "step": 185 + }, + { + "epoch": 0.1488, + "grad_norm": 0.522478559416786, + "learning_rate": 0.00019273132668821364, + "loss": 0.8053, + "step": 186 + }, + { + "epoch": 0.1496, + "grad_norm": 0.6353334165247837, + "learning_rate": 0.00019263399757144683, + "loss": 0.9887, + "step": 187 + }, + { + "epoch": 0.1504, + "grad_norm": 0.4735932083368219, + "learning_rate": 0.00019253604606172417, + "loss": 0.7937, + "step": 188 + }, + { + "epoch": 0.1512, + "grad_norm": 0.675790623641156, + "learning_rate": 0.000192437472817166, + "loss": 0.9149, + "step": 189 + }, + { + "epoch": 0.152, + "grad_norm": 0.5456571446884041, + "learning_rate": 0.00019233827850007027, + "loss": 0.8176, + "step": 190 + }, + { + "epoch": 0.1528, + "grad_norm": 0.6396353944202722, + "learning_rate": 0.00019223846377690754, + "loss": 0.8454, + "step": 191 + }, + { + "epoch": 0.1536, + "grad_norm": 0.6232851227873076, + "learning_rate": 0.00019213802931831696, + "loss": 0.9224, + "step": 192 + }, + { + "epoch": 0.1544, + "grad_norm": 0.5150570016172883, + "learning_rate": 0.00019203697579910154, + "loss": 0.7933, + "step": 193 + }, + { + "epoch": 0.1552, + "grad_norm": 0.528689330359432, + "learning_rate": 0.00019193530389822363, + "loss": 0.812, + "step": 194 + }, + { + "epoch": 0.156, + "grad_norm": 0.4711657231777871, + "learning_rate": 0.00019183301429880043, + "loss": 0.8769, + "step": 195 + }, + { + "epoch": 0.1568, + "grad_norm": 0.5031768770285683, + "learning_rate": 0.00019173010768809933, + "loss": 0.7461, + "step": 196 + }, + { + "epoch": 0.1576, + "grad_norm": 0.5753357539717228, + "learning_rate": 0.00019162658475753327, + "loss": 0.8839, + "step": 197 + }, + { + "epoch": 0.1584, + "grad_norm": 0.47826080000077054, + "learning_rate": 0.0001915224462026563, + "loss": 0.7602, + "step": 198 + }, + { + "epoch": 0.1592, + "grad_norm": 0.5331109078884061, + "learning_rate": 0.00019141769272315858, + "loss": 0.9127, + "step": 199 + }, + { + "epoch": 0.16, + "grad_norm": 0.513453303514768, + "learning_rate": 0.00019131232502286188, + "loss": 0.9073, + "step": 200 + }, + { + "epoch": 0.1608, + "grad_norm": 0.5306547540075643, + "learning_rate": 0.00019120634380971496, + "loss": 0.909, + "step": 201 + }, + { + "epoch": 0.1616, + "grad_norm": 0.5097127261566166, + "learning_rate": 0.0001910997497957885, + "loss": 0.9403, + "step": 202 + }, + { + "epoch": 0.1624, + "grad_norm": 0.5219176059890506, + "learning_rate": 0.0001909925436972706, + "loss": 0.7665, + "step": 203 + }, + { + "epoch": 0.1632, + "grad_norm": 0.47743145543637183, + "learning_rate": 0.00019088472623446183, + "loss": 0.8187, + "step": 204 + }, + { + "epoch": 0.164, + "grad_norm": 0.8883065430241422, + "learning_rate": 0.00019077629813177036, + "loss": 1.0057, + "step": 205 + }, + { + "epoch": 0.1648, + "grad_norm": 0.7178760236368346, + "learning_rate": 0.00019066726011770726, + "loss": 1.003, + "step": 206 + }, + { + "epoch": 0.1656, + "grad_norm": 0.5944311916972052, + "learning_rate": 0.00019055761292488142, + "loss": 0.8245, + "step": 207 + }, + { + "epoch": 0.1664, + "grad_norm": 0.5477617007454727, + "learning_rate": 0.0001904473572899947, + "loss": 0.7779, + "step": 208 + }, + { + "epoch": 0.1672, + "grad_norm": 0.4945521838206525, + "learning_rate": 0.00019033649395383702, + "loss": 0.8471, + "step": 209 + }, + { + "epoch": 0.168, + "grad_norm": 0.49935690116121134, + "learning_rate": 0.00019022502366128135, + "loss": 0.8117, + "step": 210 + }, + { + "epoch": 0.1688, + "grad_norm": 0.5422040702495826, + "learning_rate": 0.00019011294716127867, + "loss": 0.8864, + "step": 211 + }, + { + "epoch": 0.1696, + "grad_norm": 0.5525455231214514, + "learning_rate": 0.00019000026520685302, + "loss": 0.8636, + "step": 212 + }, + { + "epoch": 0.1704, + "grad_norm": 0.4661463876181464, + "learning_rate": 0.0001898869785550963, + "loss": 0.8049, + "step": 213 + }, + { + "epoch": 0.1712, + "grad_norm": 0.5076418761116616, + "learning_rate": 0.0001897730879671634, + "loss": 0.7897, + "step": 214 + }, + { + "epoch": 0.172, + "grad_norm": 0.4782565516179986, + "learning_rate": 0.00018965859420826684, + "loss": 0.8408, + "step": 215 + }, + { + "epoch": 0.1728, + "grad_norm": 0.4858303386449708, + "learning_rate": 0.00018954349804767184, + "loss": 0.8167, + "step": 216 + }, + { + "epoch": 0.1736, + "grad_norm": 0.6632669237035178, + "learning_rate": 0.00018942780025869098, + "loss": 0.9443, + "step": 217 + }, + { + "epoch": 0.1744, + "grad_norm": 0.5484973819890846, + "learning_rate": 0.00018931150161867916, + "loss": 0.8538, + "step": 218 + }, + { + "epoch": 0.1752, + "grad_norm": 0.4925168999699814, + "learning_rate": 0.00018919460290902826, + "loss": 0.8905, + "step": 219 + }, + { + "epoch": 0.176, + "grad_norm": 0.5876668431809312, + "learning_rate": 0.00018907710491516199, + "loss": 0.8862, + "step": 220 + }, + { + "epoch": 0.1768, + "grad_norm": 0.5314406517026361, + "learning_rate": 0.0001889590084265304, + "loss": 0.7883, + "step": 221 + }, + { + "epoch": 0.1776, + "grad_norm": 0.6817510756689252, + "learning_rate": 0.0001888403142366049, + "loss": 0.9403, + "step": 222 + }, + { + "epoch": 0.1784, + "grad_norm": 0.6989926738514176, + "learning_rate": 0.0001887210231428727, + "loss": 0.8941, + "step": 223 + }, + { + "epoch": 0.1792, + "grad_norm": 0.5252580793669265, + "learning_rate": 0.00018860113594683148, + "loss": 0.8209, + "step": 224 + }, + { + "epoch": 0.18, + "grad_norm": 0.5850348971319385, + "learning_rate": 0.0001884806534539841, + "loss": 0.8525, + "step": 225 + }, + { + "epoch": 0.1808, + "grad_norm": 0.5166293535713776, + "learning_rate": 0.00018835957647383303, + "loss": 0.7429, + "step": 226 + }, + { + "epoch": 0.1816, + "grad_norm": 0.70187109719964, + "learning_rate": 0.0001882379058198751, + "loss": 0.891, + "step": 227 + }, + { + "epoch": 0.1824, + "grad_norm": 0.5867245972162806, + "learning_rate": 0.00018811564230959588, + "loss": 0.8451, + "step": 228 + }, + { + "epoch": 0.1832, + "grad_norm": 0.5464230292978006, + "learning_rate": 0.00018799278676446423, + "loss": 0.8528, + "step": 229 + }, + { + "epoch": 0.184, + "grad_norm": 0.6041812836460903, + "learning_rate": 0.00018786934000992688, + "loss": 0.8383, + "step": 230 + }, + { + "epoch": 0.1848, + "grad_norm": 0.4595506522936158, + "learning_rate": 0.00018774530287540278, + "loss": 0.789, + "step": 231 + }, + { + "epoch": 0.1856, + "grad_norm": 0.6382950989160578, + "learning_rate": 0.00018762067619427746, + "loss": 0.8557, + "step": 232 + }, + { + "epoch": 0.1864, + "grad_norm": 0.4611123108062563, + "learning_rate": 0.00018749546080389757, + "loss": 0.7605, + "step": 233 + }, + { + "epoch": 0.1872, + "grad_norm": 0.595657447518986, + "learning_rate": 0.00018736965754556528, + "loss": 0.7815, + "step": 234 + }, + { + "epoch": 0.188, + "grad_norm": 0.4633808160151315, + "learning_rate": 0.00018724326726453244, + "loss": 0.8074, + "step": 235 + }, + { + "epoch": 0.1888, + "grad_norm": 0.6901121421546078, + "learning_rate": 0.00018711629080999504, + "loss": 0.8163, + "step": 236 + }, + { + "epoch": 0.1896, + "grad_norm": 0.5616861858844762, + "learning_rate": 0.00018698872903508755, + "loss": 0.9254, + "step": 237 + }, + { + "epoch": 0.1904, + "grad_norm": 0.4155470020456329, + "learning_rate": 0.00018686058279687698, + "loss": 0.7098, + "step": 238 + }, + { + "epoch": 0.1912, + "grad_norm": 0.5129706886346935, + "learning_rate": 0.0001867318529563574, + "loss": 0.9079, + "step": 239 + }, + { + "epoch": 0.192, + "grad_norm": 0.518005849891447, + "learning_rate": 0.00018660254037844388, + "loss": 0.8289, + "step": 240 + }, + { + "epoch": 0.1928, + "grad_norm": 0.4078707555014803, + "learning_rate": 0.00018647264593196688, + "loss": 0.7314, + "step": 241 + }, + { + "epoch": 0.1936, + "grad_norm": 0.6197715152766672, + "learning_rate": 0.00018634217048966637, + "loss": 0.8875, + "step": 242 + }, + { + "epoch": 0.1944, + "grad_norm": 0.5114345849769838, + "learning_rate": 0.00018621111492818585, + "loss": 0.8891, + "step": 243 + }, + { + "epoch": 0.1952, + "grad_norm": 0.48995688769097845, + "learning_rate": 0.0001860794801280666, + "loss": 0.8496, + "step": 244 + }, + { + "epoch": 0.196, + "grad_norm": 0.44177139312483277, + "learning_rate": 0.00018594726697374175, + "loss": 0.7739, + "step": 245 + }, + { + "epoch": 0.1968, + "grad_norm": 0.5348604710766836, + "learning_rate": 0.0001858144763535302, + "loss": 0.8421, + "step": 246 + }, + { + "epoch": 0.1976, + "grad_norm": 0.6622529640570501, + "learning_rate": 0.0001856811091596308, + "loss": 0.8967, + "step": 247 + }, + { + "epoch": 0.1984, + "grad_norm": 0.42488582416266113, + "learning_rate": 0.0001855471662881164, + "loss": 0.7882, + "step": 248 + }, + { + "epoch": 0.1992, + "grad_norm": 0.5272002452985016, + "learning_rate": 0.00018541264863892754, + "loss": 0.8122, + "step": 249 + }, + { + "epoch": 0.2, + "grad_norm": 0.6144604512161106, + "learning_rate": 0.00018527755711586678, + "loss": 0.8928, + "step": 250 + }, + { + "epoch": 0.2008, + "grad_norm": 0.5690544597134083, + "learning_rate": 0.00018514189262659235, + "loss": 0.8953, + "step": 251 + }, + { + "epoch": 0.2016, + "grad_norm": 0.5218802931219912, + "learning_rate": 0.00018500565608261214, + "loss": 0.8486, + "step": 252 + }, + { + "epoch": 0.2024, + "grad_norm": 0.4688014307919003, + "learning_rate": 0.00018486884839927768, + "loss": 0.8059, + "step": 253 + }, + { + "epoch": 0.2032, + "grad_norm": 0.5704523846582193, + "learning_rate": 0.00018473147049577774, + "loss": 0.8298, + "step": 254 + }, + { + "epoch": 0.204, + "grad_norm": 0.5382313464182708, + "learning_rate": 0.0001845935232951325, + "loss": 0.8604, + "step": 255 + }, + { + "epoch": 0.2048, + "grad_norm": 0.7253872353623282, + "learning_rate": 0.00018445500772418697, + "loss": 1.0071, + "step": 256 + }, + { + "epoch": 0.2056, + "grad_norm": 0.5985727808398382, + "learning_rate": 0.00018431592471360503, + "loss": 0.9913, + "step": 257 + }, + { + "epoch": 0.2064, + "grad_norm": 0.5356362639095812, + "learning_rate": 0.00018417627519786315, + "loss": 0.8476, + "step": 258 + }, + { + "epoch": 0.2072, + "grad_norm": 0.48684948291038294, + "learning_rate": 0.000184036060115244, + "loss": 0.8489, + "step": 259 + }, + { + "epoch": 0.208, + "grad_norm": 0.44091418628157353, + "learning_rate": 0.00018389528040783012, + "loss": 0.8342, + "step": 260 + }, + { + "epoch": 0.2088, + "grad_norm": 0.45212614639546883, + "learning_rate": 0.00018375393702149787, + "loss": 0.7955, + "step": 261 + }, + { + "epoch": 0.2096, + "grad_norm": 0.5450685948821071, + "learning_rate": 0.00018361203090591071, + "loss": 0.8632, + "step": 262 + }, + { + "epoch": 0.2104, + "grad_norm": 0.4666481729146395, + "learning_rate": 0.00018346956301451304, + "loss": 0.7784, + "step": 263 + }, + { + "epoch": 0.2112, + "grad_norm": 0.5425279104506674, + "learning_rate": 0.00018332653430452376, + "loss": 0.8956, + "step": 264 + }, + { + "epoch": 0.212, + "grad_norm": 0.6870473826461702, + "learning_rate": 0.00018318294573692985, + "loss": 1.0676, + "step": 265 + }, + { + "epoch": 0.2128, + "grad_norm": 0.5810030337078149, + "learning_rate": 0.00018303879827647975, + "loss": 0.7934, + "step": 266 + }, + { + "epoch": 0.2136, + "grad_norm": 0.6519705109175864, + "learning_rate": 0.0001828940928916772, + "loss": 1.0016, + "step": 267 + }, + { + "epoch": 0.2144, + "grad_norm": 0.6754890438239762, + "learning_rate": 0.00018274883055477436, + "loss": 0.8099, + "step": 268 + }, + { + "epoch": 0.2152, + "grad_norm": 0.721804852693335, + "learning_rate": 0.00018260301224176558, + "loss": 0.942, + "step": 269 + }, + { + "epoch": 0.216, + "grad_norm": 0.5476317566123734, + "learning_rate": 0.00018245663893238075, + "loss": 0.9227, + "step": 270 + }, + { + "epoch": 0.2168, + "grad_norm": 0.5793651042918275, + "learning_rate": 0.00018230971161007853, + "loss": 0.8189, + "step": 271 + }, + { + "epoch": 0.2176, + "grad_norm": 0.5445884292655935, + "learning_rate": 0.00018216223126204007, + "loss": 0.7936, + "step": 272 + }, + { + "epoch": 0.2184, + "grad_norm": 0.523425477223313, + "learning_rate": 0.00018201419887916214, + "loss": 0.8962, + "step": 273 + }, + { + "epoch": 0.2192, + "grad_norm": 0.7693345684662526, + "learning_rate": 0.00018186561545605054, + "loss": 0.988, + "step": 274 + }, + { + "epoch": 0.22, + "grad_norm": 0.6309561995456173, + "learning_rate": 0.00018171648199101346, + "loss": 0.8887, + "step": 275 + }, + { + "epoch": 0.2208, + "grad_norm": 0.4918388328990002, + "learning_rate": 0.00018156679948605467, + "loss": 0.8453, + "step": 276 + }, + { + "epoch": 0.2216, + "grad_norm": 0.5032161536321187, + "learning_rate": 0.00018141656894686689, + "loss": 0.8303, + "step": 277 + }, + { + "epoch": 0.2224, + "grad_norm": 0.6350141023434128, + "learning_rate": 0.00018126579138282503, + "loss": 0.9384, + "step": 278 + }, + { + "epoch": 0.2232, + "grad_norm": 0.5046679114657476, + "learning_rate": 0.00018111446780697929, + "loss": 0.8422, + "step": 279 + }, + { + "epoch": 0.224, + "grad_norm": 0.6290049439223223, + "learning_rate": 0.0001809625992360485, + "loss": 0.9196, + "step": 280 + }, + { + "epoch": 0.2248, + "grad_norm": 0.5724433204626115, + "learning_rate": 0.00018081018669041324, + "loss": 0.8146, + "step": 281 + }, + { + "epoch": 0.2256, + "grad_norm": 0.5571221248924423, + "learning_rate": 0.00018065723119410884, + "loss": 0.8749, + "step": 282 + }, + { + "epoch": 0.2264, + "grad_norm": 0.6030624668703969, + "learning_rate": 0.00018050373377481878, + "loss": 0.9406, + "step": 283 + }, + { + "epoch": 0.2272, + "grad_norm": 0.5112945820734779, + "learning_rate": 0.00018034969546386757, + "loss": 0.8608, + "step": 284 + }, + { + "epoch": 0.228, + "grad_norm": 0.4452627787024938, + "learning_rate": 0.0001801951172962139, + "loss": 0.7551, + "step": 285 + }, + { + "epoch": 0.2288, + "grad_norm": 0.5760706488202555, + "learning_rate": 0.0001800400003104436, + "loss": 0.8301, + "step": 286 + }, + { + "epoch": 0.2296, + "grad_norm": 0.4719479158394112, + "learning_rate": 0.0001798843455487629, + "loss": 0.8355, + "step": 287 + }, + { + "epoch": 0.2304, + "grad_norm": 0.5311096217325775, + "learning_rate": 0.00017972815405699103, + "loss": 0.8228, + "step": 288 + }, + { + "epoch": 0.2312, + "grad_norm": 0.6070868831248841, + "learning_rate": 0.00017957142688455362, + "loss": 0.8855, + "step": 289 + }, + { + "epoch": 0.232, + "grad_norm": 0.5910494736537589, + "learning_rate": 0.00017941416508447536, + "loss": 0.8853, + "step": 290 + }, + { + "epoch": 0.2328, + "grad_norm": 0.49561716542204515, + "learning_rate": 0.00017925636971337304, + "loss": 0.8828, + "step": 291 + }, + { + "epoch": 0.2336, + "grad_norm": 0.5539093378586168, + "learning_rate": 0.0001790980418314484, + "loss": 0.9094, + "step": 292 + }, + { + "epoch": 0.2344, + "grad_norm": 0.5477538088346802, + "learning_rate": 0.00017893918250248104, + "loss": 0.8865, + "step": 293 + }, + { + "epoch": 0.2352, + "grad_norm": 0.4932579666169797, + "learning_rate": 0.00017877979279382135, + "loss": 0.7896, + "step": 294 + }, + { + "epoch": 0.236, + "grad_norm": 0.5939639494694906, + "learning_rate": 0.00017861987377638312, + "loss": 0.8902, + "step": 295 + }, + { + "epoch": 0.2368, + "grad_norm": 0.43342525098781615, + "learning_rate": 0.0001784594265246366, + "loss": 0.7462, + "step": 296 + }, + { + "epoch": 0.2376, + "grad_norm": 0.5679787973019467, + "learning_rate": 0.0001782984521166011, + "loss": 0.9087, + "step": 297 + }, + { + "epoch": 0.2384, + "grad_norm": 0.7676538546927215, + "learning_rate": 0.0001781369516338378, + "loss": 0.9968, + "step": 298 + }, + { + "epoch": 0.2392, + "grad_norm": 0.5628902043632186, + "learning_rate": 0.00017797492616144256, + "loss": 0.8499, + "step": 299 + }, + { + "epoch": 0.24, + "grad_norm": 0.6251348859155498, + "learning_rate": 0.00017781237678803847, + "loss": 0.8519, + "step": 300 + }, + { + "epoch": 0.2408, + "grad_norm": 0.5419026827932641, + "learning_rate": 0.00017764930460576866, + "loss": 0.8473, + "step": 301 + }, + { + "epoch": 0.2416, + "grad_norm": 0.4681480734347631, + "learning_rate": 0.000177485710710289, + "loss": 0.7225, + "step": 302 + }, + { + "epoch": 0.2424, + "grad_norm": 0.7028670736176164, + "learning_rate": 0.00017732159620076053, + "loss": 0.879, + "step": 303 + }, + { + "epoch": 0.2432, + "grad_norm": 0.5177270541055513, + "learning_rate": 0.00017715696217984235, + "loss": 0.7814, + "step": 304 + }, + { + "epoch": 0.244, + "grad_norm": 0.4979783679728938, + "learning_rate": 0.00017699180975368396, + "loss": 0.8974, + "step": 305 + }, + { + "epoch": 0.2448, + "grad_norm": 0.46330771611366944, + "learning_rate": 0.00017682614003191807, + "loss": 0.7494, + "step": 306 + }, + { + "epoch": 0.2456, + "grad_norm": 0.5875281058129088, + "learning_rate": 0.00017665995412765285, + "loss": 0.8414, + "step": 307 + }, + { + "epoch": 0.2464, + "grad_norm": 0.5336432400910488, + "learning_rate": 0.00017649325315746478, + "loss": 0.8316, + "step": 308 + }, + { + "epoch": 0.2472, + "grad_norm": 0.6295129876568483, + "learning_rate": 0.00017632603824139085, + "loss": 0.9598, + "step": 309 + }, + { + "epoch": 0.248, + "grad_norm": 0.5151871712060029, + "learning_rate": 0.0001761583105029213, + "loss": 0.826, + "step": 310 + }, + { + "epoch": 0.2488, + "grad_norm": 0.637025894741612, + "learning_rate": 0.0001759900710689918, + "loss": 0.8772, + "step": 311 + }, + { + "epoch": 0.2496, + "grad_norm": 0.44883188465552115, + "learning_rate": 0.00017582132106997616, + "loss": 0.7352, + "step": 312 + }, + { + "epoch": 0.2504, + "grad_norm": 0.5037347267609362, + "learning_rate": 0.00017565206163967846, + "loss": 0.7849, + "step": 313 + }, + { + "epoch": 0.2512, + "grad_norm": 0.46999643690579695, + "learning_rate": 0.00017548229391532572, + "loss": 0.8446, + "step": 314 + }, + { + "epoch": 0.252, + "grad_norm": 0.5486126383321374, + "learning_rate": 0.00017531201903755994, + "loss": 0.793, + "step": 315 + }, + { + "epoch": 0.2528, + "grad_norm": 0.4690195039420612, + "learning_rate": 0.00017514123815043074, + "loss": 0.7481, + "step": 316 + }, + { + "epoch": 0.2536, + "grad_norm": 0.4469499837433491, + "learning_rate": 0.00017496995240138744, + "loss": 0.8287, + "step": 317 + }, + { + "epoch": 0.2544, + "grad_norm": 0.4704646478693022, + "learning_rate": 0.00017479816294127152, + "loss": 0.7376, + "step": 318 + }, + { + "epoch": 0.2552, + "grad_norm": 0.5057383073850039, + "learning_rate": 0.00017462587092430875, + "loss": 0.8314, + "step": 319 + }, + { + "epoch": 0.256, + "grad_norm": 0.6310987292783401, + "learning_rate": 0.0001744530775081015, + "loss": 0.8951, + "step": 320 + }, + { + "epoch": 0.2568, + "grad_norm": 0.5449317163004509, + "learning_rate": 0.00017427978385362112, + "loss": 0.9102, + "step": 321 + }, + { + "epoch": 0.2576, + "grad_norm": 0.514102594990965, + "learning_rate": 0.0001741059911251997, + "loss": 0.8377, + "step": 322 + }, + { + "epoch": 0.2584, + "grad_norm": 0.5559467329804153, + "learning_rate": 0.0001739317004905227, + "loss": 0.812, + "step": 323 + }, + { + "epoch": 0.2592, + "grad_norm": 0.48821193891200365, + "learning_rate": 0.000173756913120621, + "loss": 0.8487, + "step": 324 + }, + { + "epoch": 0.26, + "grad_norm": 0.551004542218846, + "learning_rate": 0.00017358163018986282, + "loss": 0.8464, + "step": 325 + }, + { + "epoch": 0.2608, + "grad_norm": 0.5397512306039455, + "learning_rate": 0.00017340585287594604, + "loss": 0.83, + "step": 326 + }, + { + "epoch": 0.2616, + "grad_norm": 0.48540731321502306, + "learning_rate": 0.00017322958235989016, + "loss": 0.809, + "step": 327 + }, + { + "epoch": 0.2624, + "grad_norm": 0.861999686549182, + "learning_rate": 0.0001730528198260285, + "loss": 0.8021, + "step": 328 + }, + { + "epoch": 0.2632, + "grad_norm": 0.5336204933791293, + "learning_rate": 0.00017287556646200018, + "loss": 0.8654, + "step": 329 + }, + { + "epoch": 0.264, + "grad_norm": 0.6022331354048954, + "learning_rate": 0.00017269782345874203, + "loss": 0.8526, + "step": 330 + }, + { + "epoch": 0.2648, + "grad_norm": 0.4690650461656505, + "learning_rate": 0.00017251959201048083, + "loss": 0.8373, + "step": 331 + }, + { + "epoch": 0.2656, + "grad_norm": 0.6677068500987228, + "learning_rate": 0.00017234087331472497, + "loss": 0.9869, + "step": 332 + }, + { + "epoch": 0.2664, + "grad_norm": 0.5612166144020337, + "learning_rate": 0.00017216166857225674, + "loss": 0.8014, + "step": 333 + }, + { + "epoch": 0.2672, + "grad_norm": 0.5268558472286875, + "learning_rate": 0.00017198197898712404, + "loss": 0.8929, + "step": 334 + }, + { + "epoch": 0.268, + "grad_norm": 0.5530749106640147, + "learning_rate": 0.00017180180576663228, + "loss": 0.8626, + "step": 335 + }, + { + "epoch": 0.2688, + "grad_norm": 0.5059780985423626, + "learning_rate": 0.00017162115012133643, + "loss": 0.8821, + "step": 336 + }, + { + "epoch": 0.2696, + "grad_norm": 0.47676617124447485, + "learning_rate": 0.00017144001326503273, + "loss": 0.8224, + "step": 337 + }, + { + "epoch": 0.2704, + "grad_norm": 0.4620426249713613, + "learning_rate": 0.00017125839641475072, + "loss": 0.8328, + "step": 338 + }, + { + "epoch": 0.2712, + "grad_norm": 0.5028953666722216, + "learning_rate": 0.00017107630079074478, + "loss": 0.8562, + "step": 339 + }, + { + "epoch": 0.272, + "grad_norm": 0.47652443163944547, + "learning_rate": 0.00017089372761648616, + "loss": 0.7072, + "step": 340 + }, + { + "epoch": 0.2728, + "grad_norm": 0.6066152782490621, + "learning_rate": 0.00017071067811865476, + "loss": 0.9276, + "step": 341 + }, + { + "epoch": 0.2736, + "grad_norm": 0.5528112731017201, + "learning_rate": 0.00017052715352713075, + "loss": 0.8803, + "step": 342 + }, + { + "epoch": 0.2744, + "grad_norm": 0.4918047438378406, + "learning_rate": 0.00017034315507498635, + "loss": 0.8163, + "step": 343 + }, + { + "epoch": 0.2752, + "grad_norm": 0.4535225066788238, + "learning_rate": 0.00017015868399847768, + "loss": 0.7531, + "step": 344 + }, + { + "epoch": 0.276, + "grad_norm": 0.5375438968538784, + "learning_rate": 0.00016997374153703625, + "loss": 0.8982, + "step": 345 + }, + { + "epoch": 0.2768, + "grad_norm": 0.5308274941140089, + "learning_rate": 0.00016978832893326074, + "loss": 0.8181, + "step": 346 + }, + { + "epoch": 0.2776, + "grad_norm": 0.5609607519764662, + "learning_rate": 0.00016960244743290868, + "loss": 0.9398, + "step": 347 + }, + { + "epoch": 0.2784, + "grad_norm": 0.5618809545792961, + "learning_rate": 0.00016941609828488807, + "loss": 0.9222, + "step": 348 + }, + { + "epoch": 0.2792, + "grad_norm": 0.49267474774738795, + "learning_rate": 0.00016922928274124886, + "loss": 0.7758, + "step": 349 + }, + { + "epoch": 0.28, + "grad_norm": 0.5455689194244441, + "learning_rate": 0.0001690420020571747, + "loss": 0.8552, + "step": 350 + }, + { + "epoch": 0.2808, + "grad_norm": 0.4429486571180159, + "learning_rate": 0.00016885425749097444, + "loss": 0.7324, + "step": 351 + }, + { + "epoch": 0.2816, + "grad_norm": 0.5115503357592327, + "learning_rate": 0.0001686660503040737, + "loss": 0.7175, + "step": 352 + }, + { + "epoch": 0.2824, + "grad_norm": 0.542034705746471, + "learning_rate": 0.00016847738176100632, + "loss": 0.8652, + "step": 353 + }, + { + "epoch": 0.2832, + "grad_norm": 0.5700986682212333, + "learning_rate": 0.00016828825312940592, + "loss": 0.8818, + "step": 354 + }, + { + "epoch": 0.284, + "grad_norm": 0.6164349413931524, + "learning_rate": 0.0001680986656799975, + "loss": 0.8666, + "step": 355 + }, + { + "epoch": 0.2848, + "grad_norm": 0.7599405188320725, + "learning_rate": 0.0001679086206865886, + "loss": 1.055, + "step": 356 + }, + { + "epoch": 0.2856, + "grad_norm": 0.5167876671609728, + "learning_rate": 0.00016771811942606108, + "loss": 0.8582, + "step": 357 + }, + { + "epoch": 0.2864, + "grad_norm": 0.5068026331777621, + "learning_rate": 0.00016752716317836229, + "loss": 0.7654, + "step": 358 + }, + { + "epoch": 0.2872, + "grad_norm": 0.44707436209700097, + "learning_rate": 0.00016733575322649657, + "loss": 0.7804, + "step": 359 + }, + { + "epoch": 0.288, + "grad_norm": 0.4080835450589432, + "learning_rate": 0.0001671438908565167, + "loss": 0.7625, + "step": 360 + }, + { + "epoch": 0.2888, + "grad_norm": 0.43926437412321234, + "learning_rate": 0.00016695157735751513, + "loss": 0.7466, + "step": 361 + }, + { + "epoch": 0.2896, + "grad_norm": 0.4050385558699055, + "learning_rate": 0.00016675881402161536, + "loss": 0.7511, + "step": 362 + }, + { + "epoch": 0.2904, + "grad_norm": 0.5654226910960457, + "learning_rate": 0.0001665656021439633, + "loss": 0.8699, + "step": 363 + }, + { + "epoch": 0.2912, + "grad_norm": 0.5628593254082522, + "learning_rate": 0.0001663719430227186, + "loss": 0.8403, + "step": 364 + }, + { + "epoch": 0.292, + "grad_norm": 0.4425185566602141, + "learning_rate": 0.00016617783795904565, + "loss": 0.7543, + "step": 365 + }, + { + "epoch": 0.2928, + "grad_norm": 0.5181238955708138, + "learning_rate": 0.00016598328825710533, + "loss": 0.7846, + "step": 366 + }, + { + "epoch": 0.2936, + "grad_norm": 0.6049124328529043, + "learning_rate": 0.00016578829522404583, + "loss": 0.9702, + "step": 367 + }, + { + "epoch": 0.2944, + "grad_norm": 0.4578067005222997, + "learning_rate": 0.000165592860169994, + "loss": 0.7539, + "step": 368 + }, + { + "epoch": 0.2952, + "grad_norm": 0.49441672286942956, + "learning_rate": 0.00016539698440804661, + "loss": 0.8168, + "step": 369 + }, + { + "epoch": 0.296, + "grad_norm": 0.4076323538241769, + "learning_rate": 0.00016520066925426144, + "loss": 0.7347, + "step": 370 + }, + { + "epoch": 0.2968, + "grad_norm": 0.5343987743723532, + "learning_rate": 0.0001650039160276485, + "loss": 0.8339, + "step": 371 + }, + { + "epoch": 0.2976, + "grad_norm": 0.3896772629440146, + "learning_rate": 0.0001648067260501611, + "loss": 0.7062, + "step": 372 + }, + { + "epoch": 0.2984, + "grad_norm": 0.5018808715344389, + "learning_rate": 0.0001646091006466871, + "loss": 0.7814, + "step": 373 + }, + { + "epoch": 0.2992, + "grad_norm": 0.6303121820253016, + "learning_rate": 0.0001644110411450398, + "loss": 0.8944, + "step": 374 + }, + { + "epoch": 0.3, + "grad_norm": 0.4571025291047099, + "learning_rate": 0.00016421254887594917, + "loss": 0.749, + "step": 375 + }, + { + "epoch": 0.3008, + "grad_norm": 0.6095006648443052, + "learning_rate": 0.00016401362517305296, + "loss": 0.9778, + "step": 376 + }, + { + "epoch": 0.3016, + "grad_norm": 0.41866928759649463, + "learning_rate": 0.00016381427137288754, + "loss": 0.6948, + "step": 377 + }, + { + "epoch": 0.3024, + "grad_norm": 0.5119080641088787, + "learning_rate": 0.00016361448881487914, + "loss": 0.8283, + "step": 378 + }, + { + "epoch": 0.3032, + "grad_norm": 0.4479758812016671, + "learning_rate": 0.0001634142788413346, + "loss": 0.7014, + "step": 379 + }, + { + "epoch": 0.304, + "grad_norm": 0.5338053773935066, + "learning_rate": 0.00016321364279743266, + "loss": 0.8415, + "step": 380 + }, + { + "epoch": 0.3048, + "grad_norm": 0.5426629713846673, + "learning_rate": 0.00016301258203121462, + "loss": 0.9578, + "step": 381 + }, + { + "epoch": 0.3056, + "grad_norm": 0.4765158057171693, + "learning_rate": 0.0001628110978935756, + "loss": 0.8188, + "step": 382 + }, + { + "epoch": 0.3064, + "grad_norm": 0.606064025013686, + "learning_rate": 0.00016260919173825508, + "loss": 0.7272, + "step": 383 + }, + { + "epoch": 0.3072, + "grad_norm": 0.46583597442321, + "learning_rate": 0.00016240686492182804, + "loss": 0.747, + "step": 384 + }, + { + "epoch": 0.308, + "grad_norm": 0.49170481724700105, + "learning_rate": 0.00016220411880369601, + "loss": 0.7232, + "step": 385 + }, + { + "epoch": 0.3088, + "grad_norm": 0.6127045362104712, + "learning_rate": 0.00016200095474607753, + "loss": 0.8813, + "step": 386 + }, + { + "epoch": 0.3096, + "grad_norm": 0.5205613144936889, + "learning_rate": 0.00016179737411399926, + "loss": 0.8447, + "step": 387 + }, + { + "epoch": 0.3104, + "grad_norm": 0.5048757231973682, + "learning_rate": 0.00016159337827528685, + "loss": 0.7569, + "step": 388 + }, + { + "epoch": 0.3112, + "grad_norm": 0.5623118605368121, + "learning_rate": 0.00016138896860055555, + "loss": 0.855, + "step": 389 + }, + { + "epoch": 0.312, + "grad_norm": 0.43956424774060915, + "learning_rate": 0.0001611841464632011, + "loss": 0.7265, + "step": 390 + }, + { + "epoch": 0.3128, + "grad_norm": 0.4214455019569896, + "learning_rate": 0.00016097891323939062, + "loss": 0.7722, + "step": 391 + }, + { + "epoch": 0.3136, + "grad_norm": 0.4784273054772774, + "learning_rate": 0.0001607732703080532, + "loss": 0.7692, + "step": 392 + }, + { + "epoch": 0.3144, + "grad_norm": 0.5834222022611343, + "learning_rate": 0.00016056721905087056, + "loss": 0.8919, + "step": 393 + }, + { + "epoch": 0.3152, + "grad_norm": 0.5106219005997958, + "learning_rate": 0.00016036076085226814, + "loss": 0.8209, + "step": 394 + }, + { + "epoch": 0.316, + "grad_norm": 0.569192961747009, + "learning_rate": 0.00016015389709940538, + "loss": 0.907, + "step": 395 + }, + { + "epoch": 0.3168, + "grad_norm": 0.5621626398180208, + "learning_rate": 0.0001599466291821666, + "loss": 0.8135, + "step": 396 + }, + { + "epoch": 0.3176, + "grad_norm": 0.48478603551166005, + "learning_rate": 0.0001597389584931517, + "loss": 0.737, + "step": 397 + }, + { + "epoch": 0.3184, + "grad_norm": 0.5159755890673956, + "learning_rate": 0.0001595308864276666, + "loss": 0.8217, + "step": 398 + }, + { + "epoch": 0.3192, + "grad_norm": 0.4904250982213549, + "learning_rate": 0.0001593224143837142, + "loss": 0.8346, + "step": 399 + }, + { + "epoch": 0.32, + "grad_norm": 0.5040140792147169, + "learning_rate": 0.0001591135437619847, + "loss": 0.9222, + "step": 400 + }, + { + "epoch": 0.3208, + "grad_norm": 0.45725979760283203, + "learning_rate": 0.00015890427596584617, + "loss": 0.7994, + "step": 401 + }, + { + "epoch": 0.3216, + "grad_norm": 0.46310184538021054, + "learning_rate": 0.0001586946124013354, + "loss": 0.7577, + "step": 402 + }, + { + "epoch": 0.3224, + "grad_norm": 0.4992877615928166, + "learning_rate": 0.00015848455447714822, + "loss": 0.8552, + "step": 403 + }, + { + "epoch": 0.3232, + "grad_norm": 0.4694773773028324, + "learning_rate": 0.0001582741036046301, + "loss": 0.8345, + "step": 404 + }, + { + "epoch": 0.324, + "grad_norm": 0.5713594558988307, + "learning_rate": 0.00015806326119776663, + "loss": 0.9252, + "step": 405 + }, + { + "epoch": 0.3248, + "grad_norm": 0.4806296902614307, + "learning_rate": 0.00015785202867317407, + "loss": 0.7922, + "step": 406 + }, + { + "epoch": 0.3256, + "grad_norm": 0.48841973051979015, + "learning_rate": 0.00015764040745008988, + "loss": 0.7932, + "step": 407 + }, + { + "epoch": 0.3264, + "grad_norm": 0.4759306612833967, + "learning_rate": 0.00015742839895036305, + "loss": 0.8519, + "step": 408 + }, + { + "epoch": 0.3272, + "grad_norm": 0.7113908699203894, + "learning_rate": 0.00015721600459844468, + "loss": 0.9058, + "step": 409 + }, + { + "epoch": 0.328, + "grad_norm": 0.5201090363863141, + "learning_rate": 0.00015700322582137827, + "loss": 0.8838, + "step": 410 + }, + { + "epoch": 0.3288, + "grad_norm": 0.5090961313247779, + "learning_rate": 0.00015679006404879033, + "loss": 0.8772, + "step": 411 + }, + { + "epoch": 0.3296, + "grad_norm": 0.6185209914381692, + "learning_rate": 0.0001565765207128805, + "loss": 0.9263, + "step": 412 + }, + { + "epoch": 0.3304, + "grad_norm": 0.5579821090590285, + "learning_rate": 0.00015636259724841222, + "loss": 0.8662, + "step": 413 + }, + { + "epoch": 0.3312, + "grad_norm": 0.4182061811503812, + "learning_rate": 0.0001561482950927029, + "loss": 0.7299, + "step": 414 + }, + { + "epoch": 0.332, + "grad_norm": 0.5313234861260732, + "learning_rate": 0.00015593361568561428, + "loss": 0.9152, + "step": 415 + }, + { + "epoch": 0.3328, + "grad_norm": 0.5699139186225048, + "learning_rate": 0.00015571856046954285, + "loss": 0.6973, + "step": 416 + }, + { + "epoch": 0.3336, + "grad_norm": 0.539483811029819, + "learning_rate": 0.0001555031308894101, + "loss": 0.899, + "step": 417 + }, + { + "epoch": 0.3344, + "grad_norm": 0.5149907818436924, + "learning_rate": 0.00015528732839265272, + "loss": 0.7605, + "step": 418 + }, + { + "epoch": 0.3352, + "grad_norm": 0.6473197823751683, + "learning_rate": 0.0001550711544292131, + "loss": 0.8751, + "step": 419 + }, + { + "epoch": 0.336, + "grad_norm": 0.5248937236530749, + "learning_rate": 0.0001548546104515294, + "loss": 0.8775, + "step": 420 + }, + { + "epoch": 0.3368, + "grad_norm": 0.5163199723121221, + "learning_rate": 0.00015463769791452574, + "loss": 0.8303, + "step": 421 + }, + { + "epoch": 0.3376, + "grad_norm": 0.5353908172771882, + "learning_rate": 0.00015442041827560274, + "loss": 0.813, + "step": 422 + }, + { + "epoch": 0.3384, + "grad_norm": 0.4946626196922246, + "learning_rate": 0.00015420277299462736, + "loss": 0.8639, + "step": 423 + }, + { + "epoch": 0.3392, + "grad_norm": 0.5917448093868592, + "learning_rate": 0.00015398476353392323, + "loss": 0.9257, + "step": 424 + }, + { + "epoch": 0.34, + "grad_norm": 0.6444406512531591, + "learning_rate": 0.00015376639135826107, + "loss": 0.9207, + "step": 425 + }, + { + "epoch": 0.3408, + "grad_norm": 0.45635652337347343, + "learning_rate": 0.00015354765793484834, + "loss": 0.781, + "step": 426 + }, + { + "epoch": 0.3416, + "grad_norm": 0.4949993574665331, + "learning_rate": 0.00015332856473331978, + "loss": 0.8141, + "step": 427 + }, + { + "epoch": 0.3424, + "grad_norm": 0.49600367041083593, + "learning_rate": 0.00015310911322572753, + "loss": 0.7994, + "step": 428 + }, + { + "epoch": 0.3432, + "grad_norm": 0.7260207881282403, + "learning_rate": 0.00015288930488653094, + "loss": 0.9608, + "step": 429 + }, + { + "epoch": 0.344, + "grad_norm": 0.4720694087199054, + "learning_rate": 0.000152669141192587, + "loss": 0.8102, + "step": 430 + }, + { + "epoch": 0.3448, + "grad_norm": 0.4847251100010436, + "learning_rate": 0.0001524486236231402, + "loss": 0.8206, + "step": 431 + }, + { + "epoch": 0.3456, + "grad_norm": 0.6339699151906085, + "learning_rate": 0.00015222775365981273, + "loss": 0.7534, + "step": 432 + }, + { + "epoch": 0.3464, + "grad_norm": 0.5331544476642404, + "learning_rate": 0.00015200653278659432, + "loss": 0.8609, + "step": 433 + }, + { + "epoch": 0.3472, + "grad_norm": 0.381371833096432, + "learning_rate": 0.00015178496248983254, + "loss": 0.6884, + "step": 434 + }, + { + "epoch": 0.348, + "grad_norm": 0.45499991014417324, + "learning_rate": 0.00015156304425822267, + "loss": 0.7211, + "step": 435 + }, + { + "epoch": 0.3488, + "grad_norm": 0.4305793653382324, + "learning_rate": 0.00015134077958279765, + "loss": 0.7718, + "step": 436 + }, + { + "epoch": 0.3496, + "grad_norm": 0.4969883493881603, + "learning_rate": 0.00015111816995691809, + "loss": 0.7574, + "step": 437 + }, + { + "epoch": 0.3504, + "grad_norm": 0.5599147758204644, + "learning_rate": 0.00015089521687626243, + "loss": 0.8541, + "step": 438 + }, + { + "epoch": 0.3512, + "grad_norm": 0.7321788107994219, + "learning_rate": 0.00015067192183881658, + "loss": 0.8167, + "step": 439 + }, + { + "epoch": 0.352, + "grad_norm": 0.5037162560823695, + "learning_rate": 0.000150448286344864, + "loss": 0.7933, + "step": 440 + }, + { + "epoch": 0.3528, + "grad_norm": 0.3798838749793671, + "learning_rate": 0.00015022431189697568, + "loss": 0.6813, + "step": 441 + }, + { + "epoch": 0.3536, + "grad_norm": 0.6728978647696912, + "learning_rate": 0.00015000000000000001, + "loss": 0.9999, + "step": 442 + }, + { + "epoch": 0.3544, + "grad_norm": 0.48579585049914625, + "learning_rate": 0.0001497753521610526, + "loss": 0.8079, + "step": 443 + }, + { + "epoch": 0.3552, + "grad_norm": 0.6008468765117895, + "learning_rate": 0.00014955036988950618, + "loss": 0.8463, + "step": 444 + }, + { + "epoch": 0.356, + "grad_norm": 0.669263341740366, + "learning_rate": 0.00014932505469698052, + "loss": 1.0313, + "step": 445 + }, + { + "epoch": 0.3568, + "grad_norm": 0.521168152258116, + "learning_rate": 0.00014909940809733222, + "loss": 0.8429, + "step": 446 + }, + { + "epoch": 0.3576, + "grad_norm": 0.4890716157709086, + "learning_rate": 0.0001488734316066446, + "loss": 0.8428, + "step": 447 + }, + { + "epoch": 0.3584, + "grad_norm": 0.5917499099002614, + "learning_rate": 0.00014864712674321734, + "loss": 0.7181, + "step": 448 + }, + { + "epoch": 0.3592, + "grad_norm": 0.3608371868458814, + "learning_rate": 0.0001484204950275565, + "loss": 0.6941, + "step": 449 + }, + { + "epoch": 0.36, + "grad_norm": 0.6327154948077564, + "learning_rate": 0.00014819353798236427, + "loss": 0.8313, + "step": 450 + }, + { + "epoch": 0.3608, + "grad_norm": 0.5004531913410212, + "learning_rate": 0.00014796625713252848, + "loss": 0.747, + "step": 451 + }, + { + "epoch": 0.3616, + "grad_norm": 0.5275165881107703, + "learning_rate": 0.00014773865400511272, + "loss": 0.7813, + "step": 452 + }, + { + "epoch": 0.3624, + "grad_norm": 0.5222678604579563, + "learning_rate": 0.00014751073012934587, + "loss": 0.8047, + "step": 453 + }, + { + "epoch": 0.3632, + "grad_norm": 0.5502306110907442, + "learning_rate": 0.00014728248703661182, + "loss": 0.7823, + "step": 454 + }, + { + "epoch": 0.364, + "grad_norm": 0.4002829564402694, + "learning_rate": 0.0001470539262604393, + "loss": 0.6921, + "step": 455 + }, + { + "epoch": 0.3648, + "grad_norm": 0.5955630543614778, + "learning_rate": 0.00014682504933649144, + "loss": 0.9125, + "step": 456 + }, + { + "epoch": 0.3656, + "grad_norm": 0.4775347402897507, + "learning_rate": 0.00014659585780255556, + "loss": 0.8071, + "step": 457 + }, + { + "epoch": 0.3664, + "grad_norm": 0.5417093409333327, + "learning_rate": 0.00014636635319853275, + "loss": 0.8894, + "step": 458 + }, + { + "epoch": 0.3672, + "grad_norm": 0.4558793192299627, + "learning_rate": 0.0001461365370664276, + "loss": 0.7882, + "step": 459 + }, + { + "epoch": 0.368, + "grad_norm": 0.5501540489717972, + "learning_rate": 0.00014590641095033787, + "loss": 0.8402, + "step": 460 + }, + { + "epoch": 0.3688, + "grad_norm": 0.5412326751618114, + "learning_rate": 0.00014567597639644387, + "loss": 0.8552, + "step": 461 + }, + { + "epoch": 0.3696, + "grad_norm": 0.48300319301871625, + "learning_rate": 0.00014544523495299842, + "loss": 0.8592, + "step": 462 + }, + { + "epoch": 0.3704, + "grad_norm": 0.439939727513309, + "learning_rate": 0.00014521418817031628, + "loss": 0.7545, + "step": 463 + }, + { + "epoch": 0.3712, + "grad_norm": 0.5902929394067173, + "learning_rate": 0.0001449828376007636, + "loss": 0.8344, + "step": 464 + }, + { + "epoch": 0.372, + "grad_norm": 0.4993027137143001, + "learning_rate": 0.00014475118479874774, + "loss": 0.787, + "step": 465 + }, + { + "epoch": 0.3728, + "grad_norm": 0.6600975386600476, + "learning_rate": 0.0001445192313207067, + "loss": 0.8586, + "step": 466 + }, + { + "epoch": 0.3736, + "grad_norm": 0.5292932095632575, + "learning_rate": 0.0001442869787250987, + "loss": 0.7707, + "step": 467 + }, + { + "epoch": 0.3744, + "grad_norm": 0.5408992613553245, + "learning_rate": 0.0001440544285723915, + "loss": 0.7343, + "step": 468 + }, + { + "epoch": 0.3752, + "grad_norm": 0.4303682374368871, + "learning_rate": 0.00014382158242505234, + "loss": 0.7495, + "step": 469 + }, + { + "epoch": 0.376, + "grad_norm": 0.44958426243468036, + "learning_rate": 0.00014358844184753712, + "loss": 0.7517, + "step": 470 + }, + { + "epoch": 0.3768, + "grad_norm": 0.4935174463060379, + "learning_rate": 0.00014335500840627986, + "loss": 0.7619, + "step": 471 + }, + { + "epoch": 0.3776, + "grad_norm": 0.5732863643126523, + "learning_rate": 0.00014312128366968243, + "loss": 0.9159, + "step": 472 + }, + { + "epoch": 0.3784, + "grad_norm": 0.5341629172113251, + "learning_rate": 0.0001428872692081038, + "loss": 0.8164, + "step": 473 + }, + { + "epoch": 0.3792, + "grad_norm": 0.4983179961540753, + "learning_rate": 0.00014265296659384956, + "loss": 0.7818, + "step": 474 + }, + { + "epoch": 0.38, + "grad_norm": 0.5476769205838636, + "learning_rate": 0.00014241837740116132, + "loss": 0.8014, + "step": 475 + }, + { + "epoch": 0.3808, + "grad_norm": 0.519853059327151, + "learning_rate": 0.00014218350320620624, + "loss": 0.78, + "step": 476 + }, + { + "epoch": 0.3816, + "grad_norm": 0.5122673066399209, + "learning_rate": 0.00014194834558706632, + "loss": 0.9465, + "step": 477 + }, + { + "epoch": 0.3824, + "grad_norm": 0.4884806177478218, + "learning_rate": 0.0001417129061237278, + "loss": 0.7465, + "step": 478 + }, + { + "epoch": 0.3832, + "grad_norm": 0.5507582946850637, + "learning_rate": 0.0001414771863980707, + "loss": 0.8738, + "step": 479 + }, + { + "epoch": 0.384, + "grad_norm": 0.42981490061468713, + "learning_rate": 0.00014124118799385796, + "loss": 0.7587, + "step": 480 + }, + { + "epoch": 0.3848, + "grad_norm": 0.4197307756866334, + "learning_rate": 0.00014100491249672498, + "loss": 0.7554, + "step": 481 + }, + { + "epoch": 0.3856, + "grad_norm": 0.505519660624182, + "learning_rate": 0.00014076836149416887, + "loss": 0.6974, + "step": 482 + }, + { + "epoch": 0.3864, + "grad_norm": 0.39481543226001375, + "learning_rate": 0.0001405315365755379, + "loss": 0.7646, + "step": 483 + }, + { + "epoch": 0.3872, + "grad_norm": 0.5922524035330967, + "learning_rate": 0.0001402944393320206, + "loss": 0.8994, + "step": 484 + }, + { + "epoch": 0.388, + "grad_norm": 0.42693152478544344, + "learning_rate": 0.00014005707135663527, + "loss": 0.6892, + "step": 485 + }, + { + "epoch": 0.3888, + "grad_norm": 0.8510976031521688, + "learning_rate": 0.00013981943424421932, + "loss": 0.9395, + "step": 486 + }, + { + "epoch": 0.3896, + "grad_norm": 0.5474386064002674, + "learning_rate": 0.00013958152959141825, + "loss": 0.8637, + "step": 487 + }, + { + "epoch": 0.3904, + "grad_norm": 0.4789876152303442, + "learning_rate": 0.00013934335899667527, + "loss": 0.7921, + "step": 488 + }, + { + "epoch": 0.3912, + "grad_norm": 0.5376569606274728, + "learning_rate": 0.00013910492406022033, + "loss": 0.839, + "step": 489 + }, + { + "epoch": 0.392, + "grad_norm": 0.4426845745081761, + "learning_rate": 0.00013886622638405952, + "loss": 0.6722, + "step": 490 + }, + { + "epoch": 0.3928, + "grad_norm": 0.6345291879942117, + "learning_rate": 0.0001386272675719642, + "loss": 0.9571, + "step": 491 + }, + { + "epoch": 0.3936, + "grad_norm": 0.5118545731832741, + "learning_rate": 0.00013838804922946027, + "loss": 0.7311, + "step": 492 + }, + { + "epoch": 0.3944, + "grad_norm": 0.4960660282057398, + "learning_rate": 0.00013814857296381728, + "loss": 0.8514, + "step": 493 + }, + { + "epoch": 0.3952, + "grad_norm": 0.539674335810636, + "learning_rate": 0.00013790884038403795, + "loss": 0.824, + "step": 494 + }, + { + "epoch": 0.396, + "grad_norm": 0.4800850704177175, + "learning_rate": 0.00013766885310084688, + "loss": 0.8428, + "step": 495 + }, + { + "epoch": 0.3968, + "grad_norm": 0.4351799728360236, + "learning_rate": 0.00013742861272668012, + "loss": 0.7292, + "step": 496 + }, + { + "epoch": 0.3976, + "grad_norm": 0.43285418219243915, + "learning_rate": 0.00013718812087567414, + "loss": 0.8136, + "step": 497 + }, + { + "epoch": 0.3984, + "grad_norm": 0.5216824155883797, + "learning_rate": 0.00013694737916365517, + "loss": 0.7571, + "step": 498 + }, + { + "epoch": 0.3992, + "grad_norm": 0.5413483672953808, + "learning_rate": 0.000136706389208128, + "loss": 0.8638, + "step": 499 + }, + { + "epoch": 0.4, + "grad_norm": 0.6398444932660209, + "learning_rate": 0.00013646515262826552, + "loss": 0.9068, + "step": 500 + }, + { + "epoch": 0.4008, + "grad_norm": 0.7444916635272512, + "learning_rate": 0.00013622367104489756, + "loss": 0.8793, + "step": 501 + }, + { + "epoch": 0.4016, + "grad_norm": 0.6656926945246038, + "learning_rate": 0.0001359819460805001, + "loss": 0.9014, + "step": 502 + }, + { + "epoch": 0.4024, + "grad_norm": 0.4457908869026868, + "learning_rate": 0.0001357399793591844, + "loss": 0.7344, + "step": 503 + }, + { + "epoch": 0.4032, + "grad_norm": 0.5697359549783958, + "learning_rate": 0.0001354977725066859, + "loss": 0.8217, + "step": 504 + }, + { + "epoch": 0.404, + "grad_norm": 0.5056678280708421, + "learning_rate": 0.00013525532715035366, + "loss": 0.8574, + "step": 505 + }, + { + "epoch": 0.4048, + "grad_norm": 0.5484756774270771, + "learning_rate": 0.00013501264491913906, + "loss": 0.8512, + "step": 506 + }, + { + "epoch": 0.4056, + "grad_norm": 0.46505218004814153, + "learning_rate": 0.00013476972744358507, + "loss": 0.7835, + "step": 507 + }, + { + "epoch": 0.4064, + "grad_norm": 0.7119745829060073, + "learning_rate": 0.0001345265763558152, + "loss": 0.96, + "step": 508 + }, + { + "epoch": 0.4072, + "grad_norm": 0.5449520855135285, + "learning_rate": 0.00013428319328952253, + "loss": 0.8196, + "step": 509 + }, + { + "epoch": 0.408, + "grad_norm": 0.47892233269452206, + "learning_rate": 0.00013403957987995882, + "loss": 0.7682, + "step": 510 + }, + { + "epoch": 0.4088, + "grad_norm": 0.4848983418672802, + "learning_rate": 0.0001337957377639235, + "loss": 0.7724, + "step": 511 + }, + { + "epoch": 0.4096, + "grad_norm": 0.5019430321638111, + "learning_rate": 0.0001335516685797525, + "loss": 0.83, + "step": 512 + }, + { + "epoch": 0.4104, + "grad_norm": 0.4731872095334749, + "learning_rate": 0.0001333073739673076, + "loss": 0.7831, + "step": 513 + }, + { + "epoch": 0.4112, + "grad_norm": 0.5077071364078982, + "learning_rate": 0.00013306285556796495, + "loss": 0.8324, + "step": 514 + }, + { + "epoch": 0.412, + "grad_norm": 0.5813701285846412, + "learning_rate": 0.0001328181150246045, + "loss": 0.8837, + "step": 515 + }, + { + "epoch": 0.4128, + "grad_norm": 0.4799917369825802, + "learning_rate": 0.00013257315398159864, + "loss": 0.8781, + "step": 516 + }, + { + "epoch": 0.4136, + "grad_norm": 0.5159308332117593, + "learning_rate": 0.00013232797408480127, + "loss": 0.8789, + "step": 517 + }, + { + "epoch": 0.4144, + "grad_norm": 0.5270262593076608, + "learning_rate": 0.00013208257698153677, + "loss": 0.8081, + "step": 518 + }, + { + "epoch": 0.4152, + "grad_norm": 0.6180301887875884, + "learning_rate": 0.00013183696432058888, + "loss": 0.9297, + "step": 519 + }, + { + "epoch": 0.416, + "grad_norm": 0.6708371860981875, + "learning_rate": 0.00013159113775218964, + "loss": 0.8549, + "step": 520 + }, + { + "epoch": 0.4168, + "grad_norm": 0.5707816679612164, + "learning_rate": 0.00013134509892800822, + "loss": 0.8378, + "step": 521 + }, + { + "epoch": 0.4176, + "grad_norm": 0.4923902211709554, + "learning_rate": 0.00013109884950114007, + "loss": 0.8792, + "step": 522 + }, + { + "epoch": 0.4184, + "grad_norm": 0.5531441500947429, + "learning_rate": 0.00013085239112609547, + "loss": 0.9107, + "step": 523 + }, + { + "epoch": 0.4192, + "grad_norm": 0.4419632815828972, + "learning_rate": 0.00013060572545878875, + "loss": 0.7463, + "step": 524 + }, + { + "epoch": 0.42, + "grad_norm": 0.6061866078260476, + "learning_rate": 0.00013035885415652685, + "loss": 0.911, + "step": 525 + }, + { + "epoch": 0.4208, + "grad_norm": 0.6963024819971977, + "learning_rate": 0.00013011177887799845, + "loss": 1.0253, + "step": 526 + }, + { + "epoch": 0.4216, + "grad_norm": 0.49940481513481, + "learning_rate": 0.00012986450128326266, + "loss": 0.845, + "step": 527 + }, + { + "epoch": 0.4224, + "grad_norm": 0.5293551303353214, + "learning_rate": 0.00012961702303373795, + "loss": 0.8954, + "step": 528 + }, + { + "epoch": 0.4232, + "grad_norm": 0.5063413990157742, + "learning_rate": 0.00012936934579219094, + "loss": 0.8538, + "step": 529 + }, + { + "epoch": 0.424, + "grad_norm": 0.5562949404718519, + "learning_rate": 0.00012912147122272523, + "loss": 0.7987, + "step": 530 + }, + { + "epoch": 0.4248, + "grad_norm": 0.48131314061880803, + "learning_rate": 0.00012887340099077024, + "loss": 0.8152, + "step": 531 + }, + { + "epoch": 0.4256, + "grad_norm": 0.5870675309038963, + "learning_rate": 0.00012862513676307008, + "loss": 1.0352, + "step": 532 + }, + { + "epoch": 0.4264, + "grad_norm": 0.45971466569011993, + "learning_rate": 0.0001283766802076722, + "loss": 0.7168, + "step": 533 + }, + { + "epoch": 0.4272, + "grad_norm": 0.6035126308260672, + "learning_rate": 0.00012812803299391628, + "loss": 0.8577, + "step": 534 + }, + { + "epoch": 0.428, + "grad_norm": 0.45144590547708024, + "learning_rate": 0.00012787919679242306, + "loss": 0.7752, + "step": 535 + }, + { + "epoch": 0.4288, + "grad_norm": 0.39970917530382416, + "learning_rate": 0.00012763017327508305, + "loss": 0.7236, + "step": 536 + }, + { + "epoch": 0.4296, + "grad_norm": 0.623590914707238, + "learning_rate": 0.00012738096411504522, + "loss": 0.7703, + "step": 537 + }, + { + "epoch": 0.4304, + "grad_norm": 0.7515130857546064, + "learning_rate": 0.0001271315709867059, + "loss": 0.8271, + "step": 538 + }, + { + "epoch": 0.4312, + "grad_norm": 0.4347546521826034, + "learning_rate": 0.00012688199556569753, + "loss": 0.7611, + "step": 539 + }, + { + "epoch": 0.432, + "grad_norm": 0.4761557601343759, + "learning_rate": 0.00012663223952887723, + "loss": 0.7657, + "step": 540 + }, + { + "epoch": 0.4328, + "grad_norm": 0.44633336885236097, + "learning_rate": 0.0001263823045543158, + "loss": 0.7421, + "step": 541 + }, + { + "epoch": 0.4336, + "grad_norm": 0.5563343452610224, + "learning_rate": 0.00012613219232128608, + "loss": 0.9967, + "step": 542 + }, + { + "epoch": 0.4344, + "grad_norm": 0.6471019126729325, + "learning_rate": 0.00012588190451025207, + "loss": 0.8541, + "step": 543 + }, + { + "epoch": 0.4352, + "grad_norm": 0.47340527937997834, + "learning_rate": 0.00012563144280285741, + "loss": 0.8474, + "step": 544 + }, + { + "epoch": 0.436, + "grad_norm": 0.46157400551191163, + "learning_rate": 0.00012538080888191408, + "loss": 0.7955, + "step": 545 + }, + { + "epoch": 0.4368, + "grad_norm": 0.45508509589196, + "learning_rate": 0.00012513000443139112, + "loss": 0.7653, + "step": 546 + }, + { + "epoch": 0.4376, + "grad_norm": 0.5077739722759781, + "learning_rate": 0.00012487903113640337, + "loss": 0.8425, + "step": 547 + }, + { + "epoch": 0.4384, + "grad_norm": 0.8141208292900931, + "learning_rate": 0.00012462789068320017, + "loss": 0.948, + "step": 548 + }, + { + "epoch": 0.4392, + "grad_norm": 0.4076706826329385, + "learning_rate": 0.00012437658475915377, + "loss": 0.6998, + "step": 549 + }, + { + "epoch": 0.44, + "grad_norm": 0.5118664937647278, + "learning_rate": 0.00012412511505274844, + "loss": 0.7696, + "step": 550 + }, + { + "epoch": 0.4408, + "grad_norm": 0.4816229683616941, + "learning_rate": 0.00012387348325356874, + "loss": 0.7504, + "step": 551 + }, + { + "epoch": 0.4416, + "grad_norm": 0.48930481676711723, + "learning_rate": 0.00012362169105228826, + "loss": 0.7651, + "step": 552 + }, + { + "epoch": 0.4424, + "grad_norm": 0.45561086673747714, + "learning_rate": 0.00012336974014065844, + "loss": 0.7319, + "step": 553 + }, + { + "epoch": 0.4432, + "grad_norm": 0.4416422478225898, + "learning_rate": 0.000123117632211497, + "loss": 0.7528, + "step": 554 + }, + { + "epoch": 0.444, + "grad_norm": 0.47418182478577525, + "learning_rate": 0.00012286536895867654, + "loss": 0.7091, + "step": 555 + }, + { + "epoch": 0.4448, + "grad_norm": 0.6401681485409784, + "learning_rate": 0.00012261295207711346, + "loss": 0.9049, + "step": 556 + }, + { + "epoch": 0.4456, + "grad_norm": 0.6982634373095091, + "learning_rate": 0.00012236038326275626, + "loss": 0.9297, + "step": 557 + }, + { + "epoch": 0.4464, + "grad_norm": 0.507055788697773, + "learning_rate": 0.0001221076642125742, + "loss": 0.7311, + "step": 558 + }, + { + "epoch": 0.4472, + "grad_norm": 0.5807270252926734, + "learning_rate": 0.00012185479662454595, + "loss": 0.7908, + "step": 559 + }, + { + "epoch": 0.448, + "grad_norm": 0.4431858492829446, + "learning_rate": 0.00012160178219764837, + "loss": 0.8259, + "step": 560 + }, + { + "epoch": 0.4488, + "grad_norm": 0.4849528454606528, + "learning_rate": 0.00012134862263184467, + "loss": 0.7853, + "step": 561 + }, + { + "epoch": 0.4496, + "grad_norm": 0.49166739471207144, + "learning_rate": 0.00012109531962807332, + "loss": 0.7069, + "step": 562 + }, + { + "epoch": 0.4504, + "grad_norm": 0.5793994112069623, + "learning_rate": 0.00012084187488823657, + "loss": 0.8024, + "step": 563 + }, + { + "epoch": 0.4512, + "grad_norm": 0.5179173280187043, + "learning_rate": 0.00012058829011518896, + "loss": 0.8411, + "step": 564 + }, + { + "epoch": 0.452, + "grad_norm": 0.6588825640045888, + "learning_rate": 0.00012033456701272576, + "loss": 0.8524, + "step": 565 + }, + { + "epoch": 0.4528, + "grad_norm": 0.4623706090981975, + "learning_rate": 0.00012008070728557186, + "loss": 0.7722, + "step": 566 + }, + { + "epoch": 0.4536, + "grad_norm": 0.47102123092467946, + "learning_rate": 0.00011982671263936995, + "loss": 0.8397, + "step": 567 + }, + { + "epoch": 0.4544, + "grad_norm": 0.4946138787683921, + "learning_rate": 0.00011957258478066931, + "loss": 0.8148, + "step": 568 + }, + { + "epoch": 0.4552, + "grad_norm": 0.5486114882605688, + "learning_rate": 0.00011931832541691418, + "loss": 0.7791, + "step": 569 + }, + { + "epoch": 0.456, + "grad_norm": 0.4755744045856606, + "learning_rate": 0.00011906393625643244, + "loss": 0.7625, + "step": 570 + }, + { + "epoch": 0.4568, + "grad_norm": 0.497584922002352, + "learning_rate": 0.00011880941900842397, + "loss": 0.8277, + "step": 571 + }, + { + "epoch": 0.4576, + "grad_norm": 0.6326791729007039, + "learning_rate": 0.00011855477538294935, + "loss": 0.8543, + "step": 572 + }, + { + "epoch": 0.4584, + "grad_norm": 0.45632696349487545, + "learning_rate": 0.00011830000709091815, + "loss": 0.801, + "step": 573 + }, + { + "epoch": 0.4592, + "grad_norm": 0.4239880741040958, + "learning_rate": 0.00011804511584407763, + "loss": 0.7306, + "step": 574 + }, + { + "epoch": 0.46, + "grad_norm": 0.5627946101804462, + "learning_rate": 0.0001177901033550012, + "loss": 0.7922, + "step": 575 + }, + { + "epoch": 0.4608, + "grad_norm": 0.5875614063404327, + "learning_rate": 0.00011753497133707679, + "loss": 0.9845, + "step": 576 + }, + { + "epoch": 0.4616, + "grad_norm": 0.4552509273066403, + "learning_rate": 0.00011727972150449544, + "loss": 0.7373, + "step": 577 + }, + { + "epoch": 0.4624, + "grad_norm": 0.4164798887182463, + "learning_rate": 0.00011702435557223987, + "loss": 0.7783, + "step": 578 + }, + { + "epoch": 0.4632, + "grad_norm": 0.5657442455430117, + "learning_rate": 0.00011676887525607271, + "loss": 0.7653, + "step": 579 + }, + { + "epoch": 0.464, + "grad_norm": 0.4009350967174863, + "learning_rate": 0.00011651328227252517, + "loss": 0.7872, + "step": 580 + }, + { + "epoch": 0.4648, + "grad_norm": 0.5234546794812653, + "learning_rate": 0.00011625757833888551, + "loss": 0.8488, + "step": 581 + }, + { + "epoch": 0.4656, + "grad_norm": 0.4983773846295474, + "learning_rate": 0.00011600176517318741, + "loss": 0.7584, + "step": 582 + }, + { + "epoch": 0.4664, + "grad_norm": 0.5481641154012888, + "learning_rate": 0.0001157458444941984, + "loss": 0.7982, + "step": 583 + }, + { + "epoch": 0.4672, + "grad_norm": 0.4353146816985957, + "learning_rate": 0.00011548981802140848, + "loss": 0.7253, + "step": 584 + }, + { + "epoch": 0.468, + "grad_norm": 0.451915751410752, + "learning_rate": 0.00011523368747501839, + "loss": 0.74, + "step": 585 + }, + { + "epoch": 0.4688, + "grad_norm": 0.643640556182646, + "learning_rate": 0.00011497745457592816, + "loss": 0.8034, + "step": 586 + }, + { + "epoch": 0.4696, + "grad_norm": 0.4514025547080541, + "learning_rate": 0.00011472112104572547, + "loss": 0.7941, + "step": 587 + }, + { + "epoch": 0.4704, + "grad_norm": 0.5906026912192136, + "learning_rate": 0.00011446468860667421, + "loss": 0.8287, + "step": 588 + }, + { + "epoch": 0.4712, + "grad_norm": 0.5653389985830627, + "learning_rate": 0.0001142081589817027, + "loss": 0.8367, + "step": 589 + }, + { + "epoch": 0.472, + "grad_norm": 0.5052738515343731, + "learning_rate": 0.00011395153389439233, + "loss": 0.7988, + "step": 590 + }, + { + "epoch": 0.4728, + "grad_norm": 0.5161892275310288, + "learning_rate": 0.00011369481506896582, + "loss": 0.7462, + "step": 591 + }, + { + "epoch": 0.4736, + "grad_norm": 0.42183424458748847, + "learning_rate": 0.00011343800423027582, + "loss": 0.7683, + "step": 592 + }, + { + "epoch": 0.4744, + "grad_norm": 0.4048096584332975, + "learning_rate": 0.00011318110310379301, + "loss": 0.7521, + "step": 593 + }, + { + "epoch": 0.4752, + "grad_norm": 0.42705254193421605, + "learning_rate": 0.0001129241134155949, + "loss": 0.7286, + "step": 594 + }, + { + "epoch": 0.476, + "grad_norm": 0.436739457475295, + "learning_rate": 0.00011266703689235394, + "loss": 0.7054, + "step": 595 + }, + { + "epoch": 0.4768, + "grad_norm": 0.7333250606069377, + "learning_rate": 0.00011240987526132594, + "loss": 0.911, + "step": 596 + }, + { + "epoch": 0.4776, + "grad_norm": 0.5989584703058212, + "learning_rate": 0.00011215263025033869, + "loss": 0.8636, + "step": 597 + }, + { + "epoch": 0.4784, + "grad_norm": 0.4987136821184533, + "learning_rate": 0.00011189530358778005, + "loss": 0.7914, + "step": 598 + }, + { + "epoch": 0.4792, + "grad_norm": 0.5538093469915446, + "learning_rate": 0.00011163789700258655, + "loss": 0.8548, + "step": 599 + }, + { + "epoch": 0.48, + "grad_norm": 0.6225202094050077, + "learning_rate": 0.00011138041222423177, + "loss": 0.8493, + "step": 600 + }, + { + "epoch": 0.4808, + "grad_norm": 0.48334131016487303, + "learning_rate": 0.00011112285098271451, + "loss": 0.7604, + "step": 601 + }, + { + "epoch": 0.4816, + "grad_norm": 0.6200355119258774, + "learning_rate": 0.00011086521500854745, + "loss": 0.9533, + "step": 602 + }, + { + "epoch": 0.4824, + "grad_norm": 0.4299026309261422, + "learning_rate": 0.00011060750603274535, + "loss": 0.7193, + "step": 603 + }, + { + "epoch": 0.4832, + "grad_norm": 0.38030842079196864, + "learning_rate": 0.00011034972578681338, + "loss": 0.6811, + "step": 604 + }, + { + "epoch": 0.484, + "grad_norm": 0.5467270629092524, + "learning_rate": 0.00011009187600273566, + "loss": 0.8047, + "step": 605 + }, + { + "epoch": 0.4848, + "grad_norm": 0.5068541306344564, + "learning_rate": 0.00010983395841296348, + "loss": 0.8656, + "step": 606 + }, + { + "epoch": 0.4856, + "grad_norm": 0.6618083084463389, + "learning_rate": 0.00010957597475040373, + "loss": 0.8956, + "step": 607 + }, + { + "epoch": 0.4864, + "grad_norm": 0.48105475118151797, + "learning_rate": 0.00010931792674840718, + "loss": 0.7298, + "step": 608 + }, + { + "epoch": 0.4872, + "grad_norm": 0.5281532410356332, + "learning_rate": 0.00010905981614075693, + "loss": 0.7413, + "step": 609 + }, + { + "epoch": 0.488, + "grad_norm": 0.47052106414012906, + "learning_rate": 0.00010880164466165674, + "loss": 0.7982, + "step": 610 + }, + { + "epoch": 0.4888, + "grad_norm": 0.5108428806336127, + "learning_rate": 0.00010854341404571928, + "loss": 0.8337, + "step": 611 + }, + { + "epoch": 0.4896, + "grad_norm": 0.49989487284593914, + "learning_rate": 0.00010828512602795462, + "loss": 0.8609, + "step": 612 + }, + { + "epoch": 0.4904, + "grad_norm": 0.47018363591967, + "learning_rate": 0.00010802678234375851, + "loss": 0.7778, + "step": 613 + }, + { + "epoch": 0.4912, + "grad_norm": 0.4238498581149461, + "learning_rate": 0.00010776838472890065, + "loss": 0.7382, + "step": 614 + }, + { + "epoch": 0.492, + "grad_norm": 0.690646476671765, + "learning_rate": 0.0001075099349195131, + "loss": 0.9422, + "step": 615 + }, + { + "epoch": 0.4928, + "grad_norm": 0.5640038941521277, + "learning_rate": 0.00010725143465207867, + "loss": 0.9124, + "step": 616 + }, + { + "epoch": 0.4936, + "grad_norm": 0.5565706526595023, + "learning_rate": 0.00010699288566341914, + "loss": 0.7769, + "step": 617 + }, + { + "epoch": 0.4944, + "grad_norm": 0.5550763692749047, + "learning_rate": 0.00010673428969068364, + "loss": 0.9181, + "step": 618 + }, + { + "epoch": 0.4952, + "grad_norm": 0.45104601971076125, + "learning_rate": 0.000106475648471337, + "loss": 0.7894, + "step": 619 + }, + { + "epoch": 0.496, + "grad_norm": 0.4876402253325286, + "learning_rate": 0.00010621696374314807, + "loss": 0.7976, + "step": 620 + }, + { + "epoch": 0.4968, + "grad_norm": 0.5019640463439436, + "learning_rate": 0.00010595823724417795, + "loss": 0.7927, + "step": 621 + }, + { + "epoch": 0.4976, + "grad_norm": 0.43433623562478296, + "learning_rate": 0.00010569947071276847, + "loss": 0.7525, + "step": 622 + }, + { + "epoch": 0.4984, + "grad_norm": 0.4234622503940986, + "learning_rate": 0.00010544066588753044, + "loss": 0.731, + "step": 623 + }, + { + "epoch": 0.4992, + "grad_norm": 0.48766634284633525, + "learning_rate": 0.00010518182450733186, + "loss": 0.8656, + "step": 624 + }, + { + "epoch": 0.5, + "grad_norm": 0.4669931847818778, + "learning_rate": 0.00010492294831128641, + "loss": 0.7783, + "step": 625 + }, + { + "epoch": 0.5008, + "grad_norm": 0.5642451353933601, + "learning_rate": 0.00010466403903874176, + "loss": 0.9083, + "step": 626 + }, + { + "epoch": 0.5016, + "grad_norm": 0.583895853150987, + "learning_rate": 0.00010440509842926767, + "loss": 0.908, + "step": 627 + }, + { + "epoch": 0.5024, + "grad_norm": 0.367874215358589, + "learning_rate": 0.00010414612822264455, + "loss": 0.6426, + "step": 628 + }, + { + "epoch": 0.5032, + "grad_norm": 0.48006836951143605, + "learning_rate": 0.00010388713015885161, + "loss": 0.8314, + "step": 629 + }, + { + "epoch": 0.504, + "grad_norm": 0.6162029310047483, + "learning_rate": 0.00010362810597805526, + "loss": 0.8896, + "step": 630 + }, + { + "epoch": 0.5048, + "grad_norm": 0.5537137865423145, + "learning_rate": 0.00010336905742059742, + "loss": 0.7685, + "step": 631 + }, + { + "epoch": 0.5056, + "grad_norm": 0.6992075775089436, + "learning_rate": 0.0001031099862269837, + "loss": 0.9342, + "step": 632 + }, + { + "epoch": 0.5064, + "grad_norm": 0.48711313691159946, + "learning_rate": 0.0001028508941378719, + "loss": 0.6825, + "step": 633 + }, + { + "epoch": 0.5072, + "grad_norm": 0.5367535076245012, + "learning_rate": 0.00010259178289406011, + "loss": 0.7888, + "step": 634 + }, + { + "epoch": 0.508, + "grad_norm": 0.5051390864160392, + "learning_rate": 0.00010233265423647523, + "loss": 0.7454, + "step": 635 + }, + { + "epoch": 0.5088, + "grad_norm": 0.4393702088050236, + "learning_rate": 0.00010207350990616107, + "loss": 0.7754, + "step": 636 + }, + { + "epoch": 0.5096, + "grad_norm": 0.4357646948380654, + "learning_rate": 0.00010181435164426676, + "loss": 0.8445, + "step": 637 + }, + { + "epoch": 0.5104, + "grad_norm": 0.5655160401352094, + "learning_rate": 0.0001015551811920351, + "loss": 0.8535, + "step": 638 + }, + { + "epoch": 0.5112, + "grad_norm": 0.7101369418409721, + "learning_rate": 0.00010129600029079072, + "loss": 0.8571, + "step": 639 + }, + { + "epoch": 0.512, + "grad_norm": 0.40500970171845685, + "learning_rate": 0.00010103681068192845, + "loss": 0.7254, + "step": 640 + }, + { + "epoch": 0.5128, + "grad_norm": 0.4581178610983195, + "learning_rate": 0.00010077761410690172, + "loss": 0.7067, + "step": 641 + }, + { + "epoch": 0.5136, + "grad_norm": 0.5095812716333972, + "learning_rate": 0.00010051841230721065, + "loss": 0.7643, + "step": 642 + }, + { + "epoch": 0.5144, + "grad_norm": 0.6242672143526947, + "learning_rate": 0.00010025920702439051, + "loss": 0.8023, + "step": 643 + }, + { + "epoch": 0.5152, + "grad_norm": 0.5095925423914407, + "learning_rate": 0.0001, + "loss": 0.8115, + "step": 644 + }, + { + "epoch": 0.516, + "grad_norm": 0.5205824194286691, + "learning_rate": 9.97407929756095e-05, + "loss": 0.8013, + "step": 645 + }, + { + "epoch": 0.5168, + "grad_norm": 0.527927691195445, + "learning_rate": 9.948158769278939e-05, + "loss": 0.6588, + "step": 646 + }, + { + "epoch": 0.5176, + "grad_norm": 0.5480290855603533, + "learning_rate": 9.92223858930983e-05, + "loss": 0.7971, + "step": 647 + }, + { + "epoch": 0.5184, + "grad_norm": 0.4645180080695204, + "learning_rate": 9.896318931807155e-05, + "loss": 0.7798, + "step": 648 + }, + { + "epoch": 0.5192, + "grad_norm": 0.5170689062427818, + "learning_rate": 9.870399970920932e-05, + "loss": 0.8951, + "step": 649 + }, + { + "epoch": 0.52, + "grad_norm": 0.4998292336055176, + "learning_rate": 9.844481880796491e-05, + "loss": 0.743, + "step": 650 + }, + { + "epoch": 0.5208, + "grad_norm": 0.5147063173006544, + "learning_rate": 9.818564835573323e-05, + "loss": 0.7651, + "step": 651 + }, + { + "epoch": 0.5216, + "grad_norm": 0.5726403336065479, + "learning_rate": 9.792649009383899e-05, + "loss": 0.7562, + "step": 652 + }, + { + "epoch": 0.5224, + "grad_norm": 0.4208878882710979, + "learning_rate": 9.766734576352478e-05, + "loss": 0.6844, + "step": 653 + }, + { + "epoch": 0.5232, + "grad_norm": 0.5276694996108304, + "learning_rate": 9.740821710593989e-05, + "loss": 0.7678, + "step": 654 + }, + { + "epoch": 0.524, + "grad_norm": 0.6036959186877415, + "learning_rate": 9.714910586212816e-05, + "loss": 0.8126, + "step": 655 + }, + { + "epoch": 0.5248, + "grad_norm": 0.48157575829024774, + "learning_rate": 9.689001377301633e-05, + "loss": 0.8403, + "step": 656 + }, + { + "epoch": 0.5256, + "grad_norm": 0.43205050772310427, + "learning_rate": 9.663094257940258e-05, + "loss": 0.6966, + "step": 657 + }, + { + "epoch": 0.5264, + "grad_norm": 0.4741793002877333, + "learning_rate": 9.637189402194476e-05, + "loss": 0.736, + "step": 658 + }, + { + "epoch": 0.5272, + "grad_norm": 0.7250128268388728, + "learning_rate": 9.611286984114841e-05, + "loss": 0.9711, + "step": 659 + }, + { + "epoch": 0.528, + "grad_norm": 0.5192983059700997, + "learning_rate": 9.585387177735547e-05, + "loss": 0.7779, + "step": 660 + }, + { + "epoch": 0.5288, + "grad_norm": 0.4821371547428007, + "learning_rate": 9.559490157073236e-05, + "loss": 0.7436, + "step": 661 + }, + { + "epoch": 0.5296, + "grad_norm": 0.5253102273548034, + "learning_rate": 9.533596096125825e-05, + "loss": 0.9746, + "step": 662 + }, + { + "epoch": 0.5304, + "grad_norm": 0.4640170393820187, + "learning_rate": 9.507705168871358e-05, + "loss": 0.7189, + "step": 663 + }, + { + "epoch": 0.5312, + "grad_norm": 0.5415548697443237, + "learning_rate": 9.481817549266817e-05, + "loss": 0.9317, + "step": 664 + }, + { + "epoch": 0.532, + "grad_norm": 0.5463924966747006, + "learning_rate": 9.455933411246958e-05, + "loss": 0.9206, + "step": 665 + }, + { + "epoch": 0.5328, + "grad_norm": 0.5597937045959331, + "learning_rate": 9.430052928723153e-05, + "loss": 0.9472, + "step": 666 + }, + { + "epoch": 0.5336, + "grad_norm": 0.47190876781340824, + "learning_rate": 9.404176275582208e-05, + "loss": 0.7587, + "step": 667 + }, + { + "epoch": 0.5344, + "grad_norm": 0.49711210331072075, + "learning_rate": 9.378303625685195e-05, + "loss": 0.878, + "step": 668 + }, + { + "epoch": 0.5352, + "grad_norm": 0.5562096756407807, + "learning_rate": 9.352435152866298e-05, + "loss": 0.7232, + "step": 669 + }, + { + "epoch": 0.536, + "grad_norm": 0.38207380026345156, + "learning_rate": 9.326571030931637e-05, + "loss": 0.6733, + "step": 670 + }, + { + "epoch": 0.5368, + "grad_norm": 0.4911593464807909, + "learning_rate": 9.300711433658087e-05, + "loss": 0.848, + "step": 671 + }, + { + "epoch": 0.5376, + "grad_norm": 0.49585021140361113, + "learning_rate": 9.274856534792138e-05, + "loss": 0.7304, + "step": 672 + }, + { + "epoch": 0.5384, + "grad_norm": 0.4361997781441195, + "learning_rate": 9.249006508048694e-05, + "loss": 0.8047, + "step": 673 + }, + { + "epoch": 0.5392, + "grad_norm": 0.48749707290843575, + "learning_rate": 9.223161527109937e-05, + "loss": 0.7379, + "step": 674 + }, + { + "epoch": 0.54, + "grad_norm": 0.5585774710293935, + "learning_rate": 9.197321765624152e-05, + "loss": 0.8264, + "step": 675 + }, + { + "epoch": 0.5408, + "grad_norm": 0.6017964959135491, + "learning_rate": 9.171487397204539e-05, + "loss": 0.7454, + "step": 676 + }, + { + "epoch": 0.5416, + "grad_norm": 0.4903811752775937, + "learning_rate": 9.145658595428074e-05, + "loss": 0.7924, + "step": 677 + }, + { + "epoch": 0.5424, + "grad_norm": 0.5826468987457163, + "learning_rate": 9.119835533834331e-05, + "loss": 0.8853, + "step": 678 + }, + { + "epoch": 0.5432, + "grad_norm": 0.5576332717239038, + "learning_rate": 9.09401838592431e-05, + "loss": 0.8411, + "step": 679 + }, + { + "epoch": 0.544, + "grad_norm": 0.4666424712498503, + "learning_rate": 9.068207325159284e-05, + "loss": 0.7513, + "step": 680 + }, + { + "epoch": 0.5448, + "grad_norm": 0.392175235102589, + "learning_rate": 9.04240252495963e-05, + "loss": 0.7406, + "step": 681 + }, + { + "epoch": 0.5456, + "grad_norm": 0.5988598795183644, + "learning_rate": 9.016604158703654e-05, + "loss": 0.8706, + "step": 682 + }, + { + "epoch": 0.5464, + "grad_norm": 0.4549947867456565, + "learning_rate": 8.990812399726435e-05, + "loss": 0.6844, + "step": 683 + }, + { + "epoch": 0.5472, + "grad_norm": 0.4391528918297825, + "learning_rate": 8.965027421318665e-05, + "loss": 0.7775, + "step": 684 + }, + { + "epoch": 0.548, + "grad_norm": 0.4968950226404202, + "learning_rate": 8.939249396725467e-05, + "loss": 0.7743, + "step": 685 + }, + { + "epoch": 0.5488, + "grad_norm": 0.6191210643891846, + "learning_rate": 8.913478499145254e-05, + "loss": 0.9487, + "step": 686 + }, + { + "epoch": 0.5496, + "grad_norm": 0.42276547342130816, + "learning_rate": 8.887714901728551e-05, + "loss": 0.7004, + "step": 687 + }, + { + "epoch": 0.5504, + "grad_norm": 0.38049403358329675, + "learning_rate": 8.861958777576827e-05, + "loss": 0.7383, + "step": 688 + }, + { + "epoch": 0.5512, + "grad_norm": 0.6769621657629243, + "learning_rate": 8.836210299741346e-05, + "loss": 0.7771, + "step": 689 + }, + { + "epoch": 0.552, + "grad_norm": 0.8127544843604503, + "learning_rate": 8.810469641222001e-05, + "loss": 0.727, + "step": 690 + }, + { + "epoch": 0.5528, + "grad_norm": 0.44351221474892843, + "learning_rate": 8.784736974966135e-05, + "loss": 0.6973, + "step": 691 + }, + { + "epoch": 0.5536, + "grad_norm": 0.4688048747951437, + "learning_rate": 8.759012473867407e-05, + "loss": 0.7191, + "step": 692 + }, + { + "epoch": 0.5544, + "grad_norm": 0.407535323330731, + "learning_rate": 8.733296310764611e-05, + "loss": 0.7516, + "step": 693 + }, + { + "epoch": 0.5552, + "grad_norm": 0.6488950511252963, + "learning_rate": 8.707588658440511e-05, + "loss": 0.9266, + "step": 694 + }, + { + "epoch": 0.556, + "grad_norm": 0.5090581336710193, + "learning_rate": 8.6818896896207e-05, + "loss": 0.8334, + "step": 695 + }, + { + "epoch": 0.5568, + "grad_norm": 0.5102832703365258, + "learning_rate": 8.656199576972423e-05, + "loss": 0.7681, + "step": 696 + }, + { + "epoch": 0.5576, + "grad_norm": 0.4691147327015194, + "learning_rate": 8.63051849310342e-05, + "loss": 0.8435, + "step": 697 + }, + { + "epoch": 0.5584, + "grad_norm": 0.4425746375769298, + "learning_rate": 8.604846610560771e-05, + "loss": 0.6959, + "step": 698 + }, + { + "epoch": 0.5592, + "grad_norm": 0.47816355139058525, + "learning_rate": 8.579184101829734e-05, + "loss": 0.8025, + "step": 699 + }, + { + "epoch": 0.56, + "grad_norm": 0.45335055516153394, + "learning_rate": 8.553531139332582e-05, + "loss": 0.7181, + "step": 700 + }, + { + "epoch": 0.5608, + "grad_norm": 0.5544787082237572, + "learning_rate": 8.527887895427454e-05, + "loss": 0.8752, + "step": 701 + }, + { + "epoch": 0.5616, + "grad_norm": 0.5453965897883242, + "learning_rate": 8.502254542407186e-05, + "loss": 0.7189, + "step": 702 + }, + { + "epoch": 0.5624, + "grad_norm": 0.5688413162790307, + "learning_rate": 8.476631252498162e-05, + "loss": 0.9164, + "step": 703 + }, + { + "epoch": 0.5632, + "grad_norm": 0.5261329886502011, + "learning_rate": 8.451018197859153e-05, + "loss": 0.7751, + "step": 704 + }, + { + "epoch": 0.564, + "grad_norm": 0.5893462892684176, + "learning_rate": 8.425415550580162e-05, + "loss": 0.9216, + "step": 705 + }, + { + "epoch": 0.5648, + "grad_norm": 0.6318813315358957, + "learning_rate": 8.399823482681262e-05, + "loss": 0.8991, + "step": 706 + }, + { + "epoch": 0.5656, + "grad_norm": 0.5114713319230283, + "learning_rate": 8.374242166111448e-05, + "loss": 0.7628, + "step": 707 + }, + { + "epoch": 0.5664, + "grad_norm": 0.4994537707955278, + "learning_rate": 8.348671772747487e-05, + "loss": 0.7752, + "step": 708 + }, + { + "epoch": 0.5672, + "grad_norm": 0.5040668896437293, + "learning_rate": 8.323112474392731e-05, + "loss": 0.8209, + "step": 709 + }, + { + "epoch": 0.568, + "grad_norm": 0.42331110495934854, + "learning_rate": 8.297564442776014e-05, + "loss": 0.7614, + "step": 710 + }, + { + "epoch": 0.5688, + "grad_norm": 0.49310945050722876, + "learning_rate": 8.272027849550457e-05, + "loss": 0.8032, + "step": 711 + }, + { + "epoch": 0.5696, + "grad_norm": 0.4438640468537216, + "learning_rate": 8.246502866292324e-05, + "loss": 0.7209, + "step": 712 + }, + { + "epoch": 0.5704, + "grad_norm": 0.5539746222516068, + "learning_rate": 8.220989664499878e-05, + "loss": 0.8538, + "step": 713 + }, + { + "epoch": 0.5712, + "grad_norm": 0.46660431947531117, + "learning_rate": 8.195488415592238e-05, + "loss": 0.8591, + "step": 714 + }, + { + "epoch": 0.572, + "grad_norm": 0.38022037294727395, + "learning_rate": 8.169999290908188e-05, + "loss": 0.6972, + "step": 715 + }, + { + "epoch": 0.5728, + "grad_norm": 0.5449499620640735, + "learning_rate": 8.144522461705067e-05, + "loss": 1.0068, + "step": 716 + }, + { + "epoch": 0.5736, + "grad_norm": 0.4169898296910137, + "learning_rate": 8.119058099157604e-05, + "loss": 0.7496, + "step": 717 + }, + { + "epoch": 0.5744, + "grad_norm": 0.5203062122624942, + "learning_rate": 8.093606374356759e-05, + "loss": 0.7352, + "step": 718 + }, + { + "epoch": 0.5752, + "grad_norm": 0.5865678547802423, + "learning_rate": 8.068167458308582e-05, + "loss": 0.8105, + "step": 719 + }, + { + "epoch": 0.576, + "grad_norm": 0.44787543970841115, + "learning_rate": 8.042741521933071e-05, + "loss": 0.6951, + "step": 720 + }, + { + "epoch": 0.5768, + "grad_norm": 0.47026442836960747, + "learning_rate": 8.017328736063006e-05, + "loss": 0.7889, + "step": 721 + }, + { + "epoch": 0.5776, + "grad_norm": 0.7319729212196735, + "learning_rate": 7.991929271442817e-05, + "loss": 0.8015, + "step": 722 + }, + { + "epoch": 0.5784, + "grad_norm": 0.49252875503080346, + "learning_rate": 7.966543298727425e-05, + "loss": 0.8292, + "step": 723 + }, + { + "epoch": 0.5792, + "grad_norm": 0.5425899442850941, + "learning_rate": 7.941170988481108e-05, + "loss": 0.8161, + "step": 724 + }, + { + "epoch": 0.58, + "grad_norm": 0.4573563520297278, + "learning_rate": 7.915812511176347e-05, + "loss": 0.6923, + "step": 725 + }, + { + "epoch": 0.5808, + "grad_norm": 0.6037199037531603, + "learning_rate": 7.89046803719267e-05, + "loss": 0.8714, + "step": 726 + }, + { + "epoch": 0.5816, + "grad_norm": 0.6271526509531546, + "learning_rate": 7.865137736815535e-05, + "loss": 0.8068, + "step": 727 + }, + { + "epoch": 0.5824, + "grad_norm": 0.5301535703360196, + "learning_rate": 7.839821780235168e-05, + "loss": 0.8232, + "step": 728 + }, + { + "epoch": 0.5832, + "grad_norm": 0.5787282734319904, + "learning_rate": 7.814520337545406e-05, + "loss": 0.7647, + "step": 729 + }, + { + "epoch": 0.584, + "grad_norm": 0.6614524640109779, + "learning_rate": 7.789233578742582e-05, + "loss": 0.9463, + "step": 730 + }, + { + "epoch": 0.5848, + "grad_norm": 0.4926108007256998, + "learning_rate": 7.763961673724379e-05, + "loss": 0.7626, + "step": 731 + }, + { + "epoch": 0.5856, + "grad_norm": 0.4804850373978476, + "learning_rate": 7.738704792288655e-05, + "loss": 0.7277, + "step": 732 + }, + { + "epoch": 0.5864, + "grad_norm": 0.6747587556763391, + "learning_rate": 7.713463104132345e-05, + "loss": 0.8613, + "step": 733 + }, + { + "epoch": 0.5872, + "grad_norm": 0.4768374021425166, + "learning_rate": 7.688236778850306e-05, + "loss": 0.7535, + "step": 734 + }, + { + "epoch": 0.588, + "grad_norm": 0.42548895965153943, + "learning_rate": 7.663025985934158e-05, + "loss": 0.7386, + "step": 735 + }, + { + "epoch": 0.5888, + "grad_norm": 0.527810233538241, + "learning_rate": 7.637830894771175e-05, + "loss": 0.7931, + "step": 736 + }, + { + "epoch": 0.5896, + "grad_norm": 0.3945006668802455, + "learning_rate": 7.61265167464313e-05, + "loss": 0.7499, + "step": 737 + }, + { + "epoch": 0.5904, + "grad_norm": 0.5018677373299719, + "learning_rate": 7.587488494725157e-05, + "loss": 0.7797, + "step": 738 + }, + { + "epoch": 0.5912, + "grad_norm": 0.42433532115712186, + "learning_rate": 7.562341524084623e-05, + "loss": 0.7494, + "step": 739 + }, + { + "epoch": 0.592, + "grad_norm": 0.534041498650102, + "learning_rate": 7.537210931679987e-05, + "loss": 0.7859, + "step": 740 + }, + { + "epoch": 0.5928, + "grad_norm": 0.512688615851546, + "learning_rate": 7.512096886359664e-05, + "loss": 0.8271, + "step": 741 + }, + { + "epoch": 0.5936, + "grad_norm": 0.511471743976831, + "learning_rate": 7.48699955686089e-05, + "loss": 0.8376, + "step": 742 + }, + { + "epoch": 0.5944, + "grad_norm": 0.4894391809863964, + "learning_rate": 7.461919111808595e-05, + "loss": 0.8335, + "step": 743 + }, + { + "epoch": 0.5952, + "grad_norm": 0.5514529144557104, + "learning_rate": 7.43685571971426e-05, + "loss": 0.8408, + "step": 744 + }, + { + "epoch": 0.596, + "grad_norm": 0.7223618761573463, + "learning_rate": 7.411809548974792e-05, + "loss": 0.8607, + "step": 745 + }, + { + "epoch": 0.5968, + "grad_norm": 0.5178506188237251, + "learning_rate": 7.386780767871397e-05, + "loss": 0.8074, + "step": 746 + }, + { + "epoch": 0.5976, + "grad_norm": 0.42340531056837144, + "learning_rate": 7.361769544568425e-05, + "loss": 0.7043, + "step": 747 + }, + { + "epoch": 0.5984, + "grad_norm": 0.4964289596754657, + "learning_rate": 7.336776047112276e-05, + "loss": 0.728, + "step": 748 + }, + { + "epoch": 0.5992, + "grad_norm": 0.47985426517246027, + "learning_rate": 7.311800443430251e-05, + "loss": 0.7996, + "step": 749 + }, + { + "epoch": 0.6, + "grad_norm": 0.4588102653566252, + "learning_rate": 7.286842901329412e-05, + "loss": 0.7243, + "step": 750 + }, + { + "epoch": 0.6008, + "grad_norm": 0.42569781572618853, + "learning_rate": 7.26190358849548e-05, + "loss": 0.7516, + "step": 751 + }, + { + "epoch": 0.6016, + "grad_norm": 0.436780018977493, + "learning_rate": 7.236982672491698e-05, + "loss": 0.7312, + "step": 752 + }, + { + "epoch": 0.6024, + "grad_norm": 0.4628449287698282, + "learning_rate": 7.212080320757695e-05, + "loss": 0.7581, + "step": 753 + }, + { + "epoch": 0.6032, + "grad_norm": 0.6621084612531626, + "learning_rate": 7.187196700608373e-05, + "loss": 0.895, + "step": 754 + }, + { + "epoch": 0.604, + "grad_norm": 0.5027121856510675, + "learning_rate": 7.162331979232783e-05, + "loss": 0.7909, + "step": 755 + }, + { + "epoch": 0.6048, + "grad_norm": 0.48726573400549317, + "learning_rate": 7.137486323692995e-05, + "loss": 0.6746, + "step": 756 + }, + { + "epoch": 0.6056, + "grad_norm": 0.437989626752331, + "learning_rate": 7.112659900922976e-05, + "loss": 0.7123, + "step": 757 + }, + { + "epoch": 0.6064, + "grad_norm": 0.5906032482168247, + "learning_rate": 7.087852877727481e-05, + "loss": 0.7483, + "step": 758 + }, + { + "epoch": 0.6072, + "grad_norm": 0.5096323144956553, + "learning_rate": 7.06306542078091e-05, + "loss": 0.7342, + "step": 759 + }, + { + "epoch": 0.608, + "grad_norm": 0.404384369704306, + "learning_rate": 7.038297696626206e-05, + "loss": 0.7219, + "step": 760 + }, + { + "epoch": 0.6088, + "grad_norm": 0.47916413300610006, + "learning_rate": 7.013549871673736e-05, + "loss": 0.8172, + "step": 761 + }, + { + "epoch": 0.6096, + "grad_norm": 0.5062713721851729, + "learning_rate": 6.988822112200156e-05, + "loss": 0.8235, + "step": 762 + }, + { + "epoch": 0.6104, + "grad_norm": 0.4565530114216757, + "learning_rate": 6.964114584347316e-05, + "loss": 0.8377, + "step": 763 + }, + { + "epoch": 0.6112, + "grad_norm": 0.5444683418349793, + "learning_rate": 6.939427454121128e-05, + "loss": 0.8577, + "step": 764 + }, + { + "epoch": 0.612, + "grad_norm": 0.4475498314739535, + "learning_rate": 6.914760887390452e-05, + "loss": 0.668, + "step": 765 + }, + { + "epoch": 0.6128, + "grad_norm": 0.6425774157029098, + "learning_rate": 6.890115049885994e-05, + "loss": 0.9029, + "step": 766 + }, + { + "epoch": 0.6136, + "grad_norm": 0.5034987160910696, + "learning_rate": 6.865490107199181e-05, + "loss": 0.7365, + "step": 767 + }, + { + "epoch": 0.6144, + "grad_norm": 0.6917235904630508, + "learning_rate": 6.84088622478104e-05, + "loss": 0.7839, + "step": 768 + }, + { + "epoch": 0.6152, + "grad_norm": 0.5605017746427178, + "learning_rate": 6.816303567941112e-05, + "loss": 0.713, + "step": 769 + }, + { + "epoch": 0.616, + "grad_norm": 0.4738673000646273, + "learning_rate": 6.791742301846326e-05, + "loss": 0.7566, + "step": 770 + }, + { + "epoch": 0.6168, + "grad_norm": 0.46540916421865014, + "learning_rate": 6.767202591519875e-05, + "loss": 0.6973, + "step": 771 + }, + { + "epoch": 0.6176, + "grad_norm": 0.5410959645351576, + "learning_rate": 6.742684601840141e-05, + "loss": 0.8646, + "step": 772 + }, + { + "epoch": 0.6184, + "grad_norm": 0.5960088786759402, + "learning_rate": 6.718188497539554e-05, + "loss": 0.8148, + "step": 773 + }, + { + "epoch": 0.6192, + "grad_norm": 0.5077873608234905, + "learning_rate": 6.693714443203507e-05, + "loss": 0.7357, + "step": 774 + }, + { + "epoch": 0.62, + "grad_norm": 0.6215644323546918, + "learning_rate": 6.669262603269246e-05, + "loss": 0.8109, + "step": 775 + }, + { + "epoch": 0.6208, + "grad_norm": 0.6707416014463301, + "learning_rate": 6.644833142024751e-05, + "loss": 0.8331, + "step": 776 + }, + { + "epoch": 0.6216, + "grad_norm": 0.5176939198395552, + "learning_rate": 6.620426223607654e-05, + "loss": 0.7384, + "step": 777 + }, + { + "epoch": 0.6224, + "grad_norm": 0.4708689942638851, + "learning_rate": 6.59604201200412e-05, + "loss": 0.7224, + "step": 778 + }, + { + "epoch": 0.6232, + "grad_norm": 0.4930737896490059, + "learning_rate": 6.571680671047749e-05, + "loss": 0.8116, + "step": 779 + }, + { + "epoch": 0.624, + "grad_norm": 0.5892752746539001, + "learning_rate": 6.547342364418481e-05, + "loss": 0.9113, + "step": 780 + }, + { + "epoch": 0.6248, + "grad_norm": 0.5044815712064862, + "learning_rate": 6.523027255641493e-05, + "loss": 0.7958, + "step": 781 + }, + { + "epoch": 0.6256, + "grad_norm": 0.43719563585620835, + "learning_rate": 6.498735508086093e-05, + "loss": 0.7293, + "step": 782 + }, + { + "epoch": 0.6264, + "grad_norm": 0.5366190725514896, + "learning_rate": 6.474467284964634e-05, + "loss": 0.8898, + "step": 783 + }, + { + "epoch": 0.6272, + "grad_norm": 0.5082613561131746, + "learning_rate": 6.450222749331414e-05, + "loss": 0.7893, + "step": 784 + }, + { + "epoch": 0.628, + "grad_norm": 0.41749140031129167, + "learning_rate": 6.426002064081565e-05, + "loss": 0.7309, + "step": 785 + }, + { + "epoch": 0.6288, + "grad_norm": 0.5170497650763862, + "learning_rate": 6.40180539194999e-05, + "loss": 0.7839, + "step": 786 + }, + { + "epoch": 0.6296, + "grad_norm": 0.3961332652107408, + "learning_rate": 6.377632895510248e-05, + "loss": 0.7366, + "step": 787 + }, + { + "epoch": 0.6304, + "grad_norm": 0.49754985494595544, + "learning_rate": 6.35348473717345e-05, + "loss": 0.7653, + "step": 788 + }, + { + "epoch": 0.6312, + "grad_norm": 0.548924998318957, + "learning_rate": 6.329361079187199e-05, + "loss": 0.7745, + "step": 789 + }, + { + "epoch": 0.632, + "grad_norm": 0.478178654903062, + "learning_rate": 6.305262083634488e-05, + "loss": 0.7816, + "step": 790 + }, + { + "epoch": 0.6328, + "grad_norm": 0.49895033099227376, + "learning_rate": 6.281187912432587e-05, + "loss": 0.8334, + "step": 791 + }, + { + "epoch": 0.6336, + "grad_norm": 0.44381124234760644, + "learning_rate": 6.25713872733199e-05, + "loss": 0.8153, + "step": 792 + }, + { + "epoch": 0.6344, + "grad_norm": 0.4529039481898601, + "learning_rate": 6.233114689915316e-05, + "loss": 0.7455, + "step": 793 + }, + { + "epoch": 0.6352, + "grad_norm": 0.5426609709679072, + "learning_rate": 6.209115961596208e-05, + "loss": 0.7634, + "step": 794 + }, + { + "epoch": 0.636, + "grad_norm": 0.6334191463283818, + "learning_rate": 6.18514270361827e-05, + "loss": 0.8432, + "step": 795 + }, + { + "epoch": 0.6368, + "grad_norm": 0.6944304196751075, + "learning_rate": 6.161195077053976e-05, + "loss": 0.7671, + "step": 796 + }, + { + "epoch": 0.6376, + "grad_norm": 0.5278882305384767, + "learning_rate": 6.13727324280358e-05, + "loss": 0.7939, + "step": 797 + }, + { + "epoch": 0.6384, + "grad_norm": 0.36665114582951297, + "learning_rate": 6.113377361594049e-05, + "loss": 0.6488, + "step": 798 + }, + { + "epoch": 0.6392, + "grad_norm": 0.4360921501249804, + "learning_rate": 6.08950759397797e-05, + "loss": 0.6543, + "step": 799 + }, + { + "epoch": 0.64, + "grad_norm": 0.46621713569092255, + "learning_rate": 6.065664100332478e-05, + "loss": 0.6824, + "step": 800 + }, + { + "epoch": 0.6408, + "grad_norm": 0.5515014444454384, + "learning_rate": 6.0418470408581774e-05, + "loss": 0.7921, + "step": 801 + }, + { + "epoch": 0.6416, + "grad_norm": 0.41036853011171437, + "learning_rate": 6.018056575578075e-05, + "loss": 0.6904, + "step": 802 + }, + { + "epoch": 0.6424, + "grad_norm": 0.5171660362125989, + "learning_rate": 5.9942928643364724e-05, + "loss": 0.8403, + "step": 803 + }, + { + "epoch": 0.6432, + "grad_norm": 0.5072245001344229, + "learning_rate": 5.970556066797941e-05, + "loss": 0.7688, + "step": 804 + }, + { + "epoch": 0.644, + "grad_norm": 0.37861571404640515, + "learning_rate": 5.946846342446214e-05, + "loss": 0.6413, + "step": 805 + }, + { + "epoch": 0.6448, + "grad_norm": 0.5380753036323828, + "learning_rate": 5.923163850583113e-05, + "loss": 0.7867, + "step": 806 + }, + { + "epoch": 0.6456, + "grad_norm": 0.4303997532305666, + "learning_rate": 5.899508750327501e-05, + "loss": 0.726, + "step": 807 + }, + { + "epoch": 0.6464, + "grad_norm": 0.6367177088524028, + "learning_rate": 5.875881200614207e-05, + "loss": 0.8739, + "step": 808 + }, + { + "epoch": 0.6472, + "grad_norm": 0.4814612755099226, + "learning_rate": 5.8522813601929324e-05, + "loss": 0.7338, + "step": 809 + }, + { + "epoch": 0.648, + "grad_norm": 0.5188486822929445, + "learning_rate": 5.828709387627218e-05, + "loss": 0.7792, + "step": 810 + }, + { + "epoch": 0.6488, + "grad_norm": 0.46625320330652303, + "learning_rate": 5.80516544129337e-05, + "loss": 0.697, + "step": 811 + }, + { + "epoch": 0.6496, + "grad_norm": 0.45902185863682476, + "learning_rate": 5.781649679379378e-05, + "loss": 0.6859, + "step": 812 + }, + { + "epoch": 0.6504, + "grad_norm": 0.4638535553430814, + "learning_rate": 5.758162259883867e-05, + "loss": 0.7149, + "step": 813 + }, + { + "epoch": 0.6512, + "grad_norm": 0.48348600053977475, + "learning_rate": 5.73470334061505e-05, + "loss": 0.7693, + "step": 814 + }, + { + "epoch": 0.652, + "grad_norm": 0.4932746662890735, + "learning_rate": 5.7112730791896207e-05, + "loss": 0.8263, + "step": 815 + }, + { + "epoch": 0.6528, + "grad_norm": 0.5570560204822728, + "learning_rate": 5.687871633031754e-05, + "loss": 0.7424, + "step": 816 + }, + { + "epoch": 0.6536, + "grad_norm": 0.4687109667729972, + "learning_rate": 5.664499159372017e-05, + "loss": 0.6849, + "step": 817 + }, + { + "epoch": 0.6544, + "grad_norm": 0.5483076820634255, + "learning_rate": 5.6411558152462894e-05, + "loss": 0.8234, + "step": 818 + }, + { + "epoch": 0.6552, + "grad_norm": 0.5445418867646071, + "learning_rate": 5.617841757494762e-05, + "loss": 0.8508, + "step": 819 + }, + { + "epoch": 0.656, + "grad_norm": 0.4614406941304066, + "learning_rate": 5.5945571427608526e-05, + "loss": 0.7635, + "step": 820 + }, + { + "epoch": 0.6568, + "grad_norm": 0.5143174747810663, + "learning_rate": 5.5713021274901335e-05, + "loss": 0.7847, + "step": 821 + }, + { + "epoch": 0.6576, + "grad_norm": 0.4994639294282618, + "learning_rate": 5.54807686792933e-05, + "loss": 0.7482, + "step": 822 + }, + { + "epoch": 0.6584, + "grad_norm": 0.48428453212647665, + "learning_rate": 5.524881520125229e-05, + "loss": 0.6998, + "step": 823 + }, + { + "epoch": 0.6592, + "grad_norm": 0.5057027434820639, + "learning_rate": 5.501716239923642e-05, + "loss": 0.7062, + "step": 824 + }, + { + "epoch": 0.66, + "grad_norm": 0.49891374589042337, + "learning_rate": 5.4785811829683764e-05, + "loss": 0.8326, + "step": 825 + }, + { + "epoch": 0.6608, + "grad_norm": 0.5038228377305515, + "learning_rate": 5.4554765047001613e-05, + "loss": 0.7358, + "step": 826 + }, + { + "epoch": 0.6616, + "grad_norm": 0.47454180105446425, + "learning_rate": 5.432402360355615e-05, + "loss": 0.7259, + "step": 827 + }, + { + "epoch": 0.6624, + "grad_norm": 0.6162418744380285, + "learning_rate": 5.4093589049662175e-05, + "loss": 0.7957, + "step": 828 + }, + { + "epoch": 0.6632, + "grad_norm": 0.3760689487003604, + "learning_rate": 5.386346293357242e-05, + "loss": 0.6411, + "step": 829 + }, + { + "epoch": 0.664, + "grad_norm": 0.5022764707957805, + "learning_rate": 5.363364680146725e-05, + "loss": 0.6897, + "step": 830 + }, + { + "epoch": 0.6648, + "grad_norm": 0.6651497049557669, + "learning_rate": 5.3404142197444506e-05, + "loss": 0.9318, + "step": 831 + }, + { + "epoch": 0.6656, + "grad_norm": 0.43376355417024504, + "learning_rate": 5.31749506635086e-05, + "loss": 0.6982, + "step": 832 + }, + { + "epoch": 0.6664, + "grad_norm": 0.6474226677331079, + "learning_rate": 5.2946073739560706e-05, + "loss": 0.8946, + "step": 833 + }, + { + "epoch": 0.6672, + "grad_norm": 0.48007278214523885, + "learning_rate": 5.271751296338823e-05, + "loss": 0.7405, + "step": 834 + }, + { + "epoch": 0.668, + "grad_norm": 0.4374660499808529, + "learning_rate": 5.248926987065417e-05, + "loss": 0.7482, + "step": 835 + }, + { + "epoch": 0.6688, + "grad_norm": 0.517081980205305, + "learning_rate": 5.226134599488728e-05, + "loss": 0.7528, + "step": 836 + }, + { + "epoch": 0.6696, + "grad_norm": 0.48323090918397094, + "learning_rate": 5.203374286747158e-05, + "loss": 0.7315, + "step": 837 + }, + { + "epoch": 0.6704, + "grad_norm": 0.5021286628734111, + "learning_rate": 5.180646201763577e-05, + "loss": 0.8212, + "step": 838 + }, + { + "epoch": 0.6712, + "grad_norm": 0.5141349387177144, + "learning_rate": 5.15795049724435e-05, + "loss": 0.7111, + "step": 839 + }, + { + "epoch": 0.672, + "grad_norm": 0.5084201956002226, + "learning_rate": 5.135287325678271e-05, + "loss": 0.8196, + "step": 840 + }, + { + "epoch": 0.6728, + "grad_norm": 0.4490843664890864, + "learning_rate": 5.112656839335543e-05, + "loss": 0.7306, + "step": 841 + }, + { + "epoch": 0.6736, + "grad_norm": 0.4695671353572929, + "learning_rate": 5.090059190266779e-05, + "loss": 0.7991, + "step": 842 + }, + { + "epoch": 0.6744, + "grad_norm": 0.5271477922335337, + "learning_rate": 5.0674945303019526e-05, + "loss": 0.8185, + "step": 843 + }, + { + "epoch": 0.6752, + "grad_norm": 0.4195106319806952, + "learning_rate": 5.0449630110493836e-05, + "loss": 0.6767, + "step": 844 + }, + { + "epoch": 0.676, + "grad_norm": 0.42742675495065485, + "learning_rate": 5.022464783894744e-05, + "loss": 0.7576, + "step": 845 + }, + { + "epoch": 0.6768, + "grad_norm": 0.5216083687371429, + "learning_rate": 5.000000000000002e-05, + "loss": 0.7573, + "step": 846 + }, + { + "epoch": 0.6776, + "grad_norm": 0.6288375895385837, + "learning_rate": 4.977568810302432e-05, + "loss": 0.8401, + "step": 847 + }, + { + "epoch": 0.6784, + "grad_norm": 0.4390594149564587, + "learning_rate": 4.955171365513603e-05, + "loss": 0.7256, + "step": 848 + }, + { + "epoch": 0.6792, + "grad_norm": 0.42755673057491517, + "learning_rate": 4.9328078161183464e-05, + "loss": 0.7145, + "step": 849 + }, + { + "epoch": 0.68, + "grad_norm": 0.42681072223664257, + "learning_rate": 4.9104783123737566e-05, + "loss": 0.7017, + "step": 850 + }, + { + "epoch": 0.6808, + "grad_norm": 0.43743370476074006, + "learning_rate": 4.88818300430819e-05, + "loss": 0.7513, + "step": 851 + }, + { + "epoch": 0.6816, + "grad_norm": 0.4737350149845334, + "learning_rate": 4.865922041720239e-05, + "loss": 0.7188, + "step": 852 + }, + { + "epoch": 0.6824, + "grad_norm": 0.6271243129210922, + "learning_rate": 4.843695574177737e-05, + "loss": 0.7849, + "step": 853 + }, + { + "epoch": 0.6832, + "grad_norm": 0.5890111805244727, + "learning_rate": 4.821503751016746e-05, + "loss": 0.7677, + "step": 854 + }, + { + "epoch": 0.684, + "grad_norm": 0.6277875462939261, + "learning_rate": 4.7993467213405706e-05, + "loss": 0.9239, + "step": 855 + }, + { + "epoch": 0.6848, + "grad_norm": 0.5372185883245542, + "learning_rate": 4.777224634018732e-05, + "loss": 0.8173, + "step": 856 + }, + { + "epoch": 0.6856, + "grad_norm": 0.46421961606055084, + "learning_rate": 4.755137637685979e-05, + "loss": 0.7272, + "step": 857 + }, + { + "epoch": 0.6864, + "grad_norm": 0.5514663286606182, + "learning_rate": 4.733085880741301e-05, + "loss": 0.7869, + "step": 858 + }, + { + "epoch": 0.6872, + "grad_norm": 0.5732940160158861, + "learning_rate": 4.7110695113469085e-05, + "loss": 0.8239, + "step": 859 + }, + { + "epoch": 0.688, + "grad_norm": 0.47141455825534484, + "learning_rate": 4.689088677427249e-05, + "loss": 0.7133, + "step": 860 + }, + { + "epoch": 0.6888, + "grad_norm": 0.42295740322072267, + "learning_rate": 4.6671435266680216e-05, + "loss": 0.7267, + "step": 861 + }, + { + "epoch": 0.6896, + "grad_norm": 0.5371004747077944, + "learning_rate": 4.645234206515171e-05, + "loss": 0.8628, + "step": 862 + }, + { + "epoch": 0.6904, + "grad_norm": 0.45922615123491084, + "learning_rate": 4.623360864173893e-05, + "loss": 0.7993, + "step": 863 + }, + { + "epoch": 0.6912, + "grad_norm": 0.4484668906783044, + "learning_rate": 4.6015236466076747e-05, + "loss": 0.7552, + "step": 864 + }, + { + "epoch": 0.692, + "grad_norm": 0.4361054783759581, + "learning_rate": 4.579722700537268e-05, + "loss": 0.6587, + "step": 865 + }, + { + "epoch": 0.6928, + "grad_norm": 0.48593883605117566, + "learning_rate": 4.5579581724397255e-05, + "loss": 0.7353, + "step": 866 + }, + { + "epoch": 0.6936, + "grad_norm": 0.5085759460788811, + "learning_rate": 4.5362302085474254e-05, + "loss": 0.786, + "step": 867 + }, + { + "epoch": 0.6944, + "grad_norm": 0.550456404953214, + "learning_rate": 4.514538954847064e-05, + "loss": 0.8363, + "step": 868 + }, + { + "epoch": 0.6952, + "grad_norm": 0.448597836416631, + "learning_rate": 4.492884557078688e-05, + "loss": 0.7263, + "step": 869 + }, + { + "epoch": 0.696, + "grad_norm": 0.5920039933884523, + "learning_rate": 4.471267160734731e-05, + "loss": 0.8562, + "step": 870 + }, + { + "epoch": 0.6968, + "grad_norm": 0.6058660173901074, + "learning_rate": 4.449686911058992e-05, + "loss": 0.9037, + "step": 871 + }, + { + "epoch": 0.6976, + "grad_norm": 0.4854403817693627, + "learning_rate": 4.428143953045717e-05, + "loss": 0.7899, + "step": 872 + }, + { + "epoch": 0.6984, + "grad_norm": 0.4452987540370817, + "learning_rate": 4.406638431438576e-05, + "loss": 0.6692, + "step": 873 + }, + { + "epoch": 0.6992, + "grad_norm": 0.4070353886719388, + "learning_rate": 4.385170490729712e-05, + "loss": 0.659, + "step": 874 + }, + { + "epoch": 0.7, + "grad_norm": 0.46881195103092205, + "learning_rate": 4.36374027515878e-05, + "loss": 0.769, + "step": 875 + }, + { + "epoch": 0.7008, + "grad_norm": 0.4511038364938884, + "learning_rate": 4.342347928711953e-05, + "loss": 0.6831, + "step": 876 + }, + { + "epoch": 0.7016, + "grad_norm": 0.5106310960872966, + "learning_rate": 4.320993595120969e-05, + "loss": 0.8039, + "step": 877 + }, + { + "epoch": 0.7024, + "grad_norm": 0.48236286700585296, + "learning_rate": 4.2996774178621736e-05, + "loss": 0.7366, + "step": 878 + }, + { + "epoch": 0.7032, + "grad_norm": 0.5531838732376217, + "learning_rate": 4.278399540155536e-05, + "loss": 0.7126, + "step": 879 + }, + { + "epoch": 0.704, + "grad_norm": 0.4169564640631747, + "learning_rate": 4.257160104963696e-05, + "loss": 0.7581, + "step": 880 + }, + { + "epoch": 0.7048, + "grad_norm": 0.6070793415565825, + "learning_rate": 4.2359592549910145e-05, + "loss": 0.7351, + "step": 881 + }, + { + "epoch": 0.7056, + "grad_norm": 0.5040425072210082, + "learning_rate": 4.2147971326825966e-05, + "loss": 0.7782, + "step": 882 + }, + { + "epoch": 0.7064, + "grad_norm": 0.3836134411498887, + "learning_rate": 4.193673880223339e-05, + "loss": 0.6477, + "step": 883 + }, + { + "epoch": 0.7072, + "grad_norm": 0.49566390695142093, + "learning_rate": 4.172589639536991e-05, + "loss": 0.7286, + "step": 884 + }, + { + "epoch": 0.708, + "grad_norm": 0.5542461919792804, + "learning_rate": 4.1515445522851784e-05, + "loss": 0.7657, + "step": 885 + }, + { + "epoch": 0.7088, + "grad_norm": 0.47384573394363444, + "learning_rate": 4.130538759866457e-05, + "loss": 0.7194, + "step": 886 + }, + { + "epoch": 0.7096, + "grad_norm": 0.40625182616849714, + "learning_rate": 4.109572403415386e-05, + "loss": 0.6802, + "step": 887 + }, + { + "epoch": 0.7104, + "grad_norm": 0.550120899048214, + "learning_rate": 4.088645623801534e-05, + "loss": 0.7858, + "step": 888 + }, + { + "epoch": 0.7112, + "grad_norm": 0.9658197812369057, + "learning_rate": 4.0677585616285774e-05, + "loss": 0.8226, + "step": 889 + }, + { + "epoch": 0.712, + "grad_norm": 0.5011003842913042, + "learning_rate": 4.046911357233343e-05, + "loss": 0.6772, + "step": 890 + }, + { + "epoch": 0.7128, + "grad_norm": 0.5464217346303958, + "learning_rate": 4.026104150684835e-05, + "loss": 0.7787, + "step": 891 + }, + { + "epoch": 0.7136, + "grad_norm": 0.514591387424676, + "learning_rate": 4.00533708178334e-05, + "loss": 0.7904, + "step": 892 + }, + { + "epoch": 0.7144, + "grad_norm": 0.41602117726960824, + "learning_rate": 3.984610290059467e-05, + "loss": 0.6735, + "step": 893 + }, + { + "epoch": 0.7152, + "grad_norm": 0.5684147563389843, + "learning_rate": 3.963923914773187e-05, + "loss": 0.7678, + "step": 894 + }, + { + "epoch": 0.716, + "grad_norm": 0.5427250342476214, + "learning_rate": 3.943278094912946e-05, + "loss": 0.8478, + "step": 895 + }, + { + "epoch": 0.7168, + "grad_norm": 0.4462442901021536, + "learning_rate": 3.922672969194686e-05, + "loss": 0.7516, + "step": 896 + }, + { + "epoch": 0.7176, + "grad_norm": 0.47720348237429494, + "learning_rate": 3.902108676060937e-05, + "loss": 0.7417, + "step": 897 + }, + { + "epoch": 0.7184, + "grad_norm": 0.5340837883423885, + "learning_rate": 3.8815853536798904e-05, + "loss": 0.748, + "step": 898 + }, + { + "epoch": 0.7192, + "grad_norm": 0.469677944178968, + "learning_rate": 3.861103139944449e-05, + "loss": 0.7223, + "step": 899 + }, + { + "epoch": 0.72, + "grad_norm": 0.4166461423938719, + "learning_rate": 3.840662172471315e-05, + "loss": 0.7077, + "step": 900 + }, + { + "epoch": 0.7208, + "grad_norm": 0.5014958715161436, + "learning_rate": 3.820262588600074e-05, + "loss": 0.7866, + "step": 901 + }, + { + "epoch": 0.7216, + "grad_norm": 0.4834069293364297, + "learning_rate": 3.79990452539225e-05, + "loss": 0.6782, + "step": 902 + }, + { + "epoch": 0.7224, + "grad_norm": 0.45015960082810896, + "learning_rate": 3.7795881196303995e-05, + "loss": 0.7348, + "step": 903 + }, + { + "epoch": 0.7232, + "grad_norm": 0.6624099916197879, + "learning_rate": 3.759313507817196e-05, + "loss": 0.9578, + "step": 904 + }, + { + "epoch": 0.724, + "grad_norm": 0.4489711710544979, + "learning_rate": 3.739080826174498e-05, + "loss": 0.7236, + "step": 905 + }, + { + "epoch": 0.7248, + "grad_norm": 0.45833194341547684, + "learning_rate": 3.7188902106424416e-05, + "loss": 0.7572, + "step": 906 + }, + { + "epoch": 0.7256, + "grad_norm": 0.4564155732637293, + "learning_rate": 3.6987417968785366e-05, + "loss": 0.7495, + "step": 907 + }, + { + "epoch": 0.7264, + "grad_norm": 0.5571210878690318, + "learning_rate": 3.678635720256737e-05, + "loss": 0.7802, + "step": 908 + }, + { + "epoch": 0.7272, + "grad_norm": 0.5350746748027275, + "learning_rate": 3.658572115866541e-05, + "loss": 0.8594, + "step": 909 + }, + { + "epoch": 0.728, + "grad_norm": 0.47110540715206695, + "learning_rate": 3.638551118512089e-05, + "loss": 0.7354, + "step": 910 + }, + { + "epoch": 0.7288, + "grad_norm": 0.40285950217554056, + "learning_rate": 3.618572862711247e-05, + "loss": 0.6676, + "step": 911 + }, + { + "epoch": 0.7296, + "grad_norm": 0.47110131286432433, + "learning_rate": 3.5986374826947066e-05, + "loss": 0.7773, + "step": 912 + }, + { + "epoch": 0.7304, + "grad_norm": 0.6065167854178469, + "learning_rate": 3.578745112405083e-05, + "loss": 0.7721, + "step": 913 + }, + { + "epoch": 0.7312, + "grad_norm": 0.47817847440544814, + "learning_rate": 3.558895885496023e-05, + "loss": 0.7187, + "step": 914 + }, + { + "epoch": 0.732, + "grad_norm": 0.5278482236080575, + "learning_rate": 3.539089935331294e-05, + "loss": 0.8376, + "step": 915 + }, + { + "epoch": 0.7328, + "grad_norm": 0.4923776927714505, + "learning_rate": 3.519327394983888e-05, + "loss": 0.6548, + "step": 916 + }, + { + "epoch": 0.7336, + "grad_norm": 0.4171682018933475, + "learning_rate": 3.4996083972351515e-05, + "loss": 0.6879, + "step": 917 + }, + { + "epoch": 0.7344, + "grad_norm": 0.5248488852388212, + "learning_rate": 3.479933074573858e-05, + "loss": 0.7632, + "step": 918 + }, + { + "epoch": 0.7352, + "grad_norm": 0.4982641825250908, + "learning_rate": 3.4603015591953395e-05, + "loss": 0.7282, + "step": 919 + }, + { + "epoch": 0.736, + "grad_norm": 0.5664112883196568, + "learning_rate": 3.440713983000601e-05, + "loss": 0.8043, + "step": 920 + }, + { + "epoch": 0.7368, + "grad_norm": 0.38128669321258, + "learning_rate": 3.421170477595419e-05, + "loss": 0.6371, + "step": 921 + }, + { + "epoch": 0.7376, + "grad_norm": 0.43300831689783975, + "learning_rate": 3.401671174289469e-05, + "loss": 0.7134, + "step": 922 + }, + { + "epoch": 0.7384, + "grad_norm": 0.5295026247935329, + "learning_rate": 3.3822162040954354e-05, + "loss": 0.6705, + "step": 923 + }, + { + "epoch": 0.7392, + "grad_norm": 0.5457004272916515, + "learning_rate": 3.362805697728145e-05, + "loss": 0.8528, + "step": 924 + }, + { + "epoch": 0.74, + "grad_norm": 0.48791989857955453, + "learning_rate": 3.34343978560367e-05, + "loss": 0.8148, + "step": 925 + }, + { + "epoch": 0.7408, + "grad_norm": 0.5319220887225498, + "learning_rate": 3.324118597838464e-05, + "loss": 0.7641, + "step": 926 + }, + { + "epoch": 0.7416, + "grad_norm": 0.48648570106610073, + "learning_rate": 3.3048422642484886e-05, + "loss": 0.7222, + "step": 927 + }, + { + "epoch": 0.7424, + "grad_norm": 0.4097223911261753, + "learning_rate": 3.285610914348332e-05, + "loss": 0.6958, + "step": 928 + }, + { + "epoch": 0.7432, + "grad_norm": 0.45502173920914035, + "learning_rate": 3.266424677350346e-05, + "loss": 0.6441, + "step": 929 + }, + { + "epoch": 0.744, + "grad_norm": 0.487560468625458, + "learning_rate": 3.2472836821637744e-05, + "loss": 0.7631, + "step": 930 + }, + { + "epoch": 0.7448, + "grad_norm": 0.6753084605502865, + "learning_rate": 3.228188057393895e-05, + "loss": 0.7528, + "step": 931 + }, + { + "epoch": 0.7456, + "grad_norm": 0.634429477646962, + "learning_rate": 3.209137931341143e-05, + "loss": 0.8167, + "step": 932 + }, + { + "epoch": 0.7464, + "grad_norm": 0.6032494837975706, + "learning_rate": 3.190133432000252e-05, + "loss": 0.924, + "step": 933 + }, + { + "epoch": 0.7472, + "grad_norm": 0.5320878056424246, + "learning_rate": 3.1711746870594086e-05, + "loss": 0.8351, + "step": 934 + }, + { + "epoch": 0.748, + "grad_norm": 0.5302958373609717, + "learning_rate": 3.1522618238993725e-05, + "loss": 0.7477, + "step": 935 + }, + { + "epoch": 0.7488, + "grad_norm": 0.5154687813376203, + "learning_rate": 3.1333949695926324e-05, + "loss": 0.767, + "step": 936 + }, + { + "epoch": 0.7496, + "grad_norm": 0.4194848238512301, + "learning_rate": 3.114574250902558e-05, + "loss": 0.7517, + "step": 937 + }, + { + "epoch": 0.7504, + "grad_norm": 0.5657590494786042, + "learning_rate": 3.0957997942825336e-05, + "loss": 0.7338, + "step": 938 + }, + { + "epoch": 0.7512, + "grad_norm": 0.46394036128855876, + "learning_rate": 3.077071725875116e-05, + "loss": 0.6656, + "step": 939 + }, + { + "epoch": 0.752, + "grad_norm": 0.6585457520430493, + "learning_rate": 3.058390171511196e-05, + "loss": 0.7434, + "step": 940 + }, + { + "epoch": 0.7528, + "grad_norm": 0.45631314099479964, + "learning_rate": 3.0397552567091337e-05, + "loss": 0.6775, + "step": 941 + }, + { + "epoch": 0.7536, + "grad_norm": 0.5670975321636078, + "learning_rate": 3.021167106673928e-05, + "loss": 0.7364, + "step": 942 + }, + { + "epoch": 0.7544, + "grad_norm": 0.4286433402915187, + "learning_rate": 3.0026258462963787e-05, + "loss": 0.665, + "step": 943 + }, + { + "epoch": 0.7552, + "grad_norm": 0.42347337598355206, + "learning_rate": 2.9841316001522347e-05, + "loss": 0.7033, + "step": 944 + }, + { + "epoch": 0.756, + "grad_norm": 0.6688146671083809, + "learning_rate": 2.9656844925013637e-05, + "loss": 1.0517, + "step": 945 + }, + { + "epoch": 0.7568, + "grad_norm": 0.43655168283507745, + "learning_rate": 2.9472846472869298e-05, + "loss": 0.6696, + "step": 946 + }, + { + "epoch": 0.7576, + "grad_norm": 0.4264804446317368, + "learning_rate": 2.9289321881345254e-05, + "loss": 0.7025, + "step": 947 + }, + { + "epoch": 0.7584, + "grad_norm": 0.47865370372653143, + "learning_rate": 2.9106272383513835e-05, + "loss": 0.7164, + "step": 948 + }, + { + "epoch": 0.7592, + "grad_norm": 0.43743867825112825, + "learning_rate": 2.8923699209255284e-05, + "loss": 0.6818, + "step": 949 + }, + { + "epoch": 0.76, + "grad_norm": 0.5586615885344863, + "learning_rate": 2.874160358524931e-05, + "loss": 0.7692, + "step": 950 + }, + { + "epoch": 0.7608, + "grad_norm": 0.41130995308389967, + "learning_rate": 2.8559986734967282e-05, + "loss": 0.6293, + "step": 951 + }, + { + "epoch": 0.7616, + "grad_norm": 0.615142734187489, + "learning_rate": 2.8378849878663628e-05, + "loss": 0.7167, + "step": 952 + }, + { + "epoch": 0.7624, + "grad_norm": 0.47711098917288874, + "learning_rate": 2.819819423336775e-05, + "loss": 0.7416, + "step": 953 + }, + { + "epoch": 0.7632, + "grad_norm": 0.4113701672329664, + "learning_rate": 2.8018021012875994e-05, + "loss": 0.6607, + "step": 954 + }, + { + "epoch": 0.764, + "grad_norm": 0.5226142070356697, + "learning_rate": 2.7838331427743282e-05, + "loss": 0.7642, + "step": 955 + }, + { + "epoch": 0.7648, + "grad_norm": 0.4994550804967028, + "learning_rate": 2.7659126685275027e-05, + "loss": 0.7138, + "step": 956 + }, + { + "epoch": 0.7656, + "grad_norm": 0.4357954776441535, + "learning_rate": 2.7480407989519198e-05, + "loss": 0.7087, + "step": 957 + }, + { + "epoch": 0.7664, + "grad_norm": 0.5880842155071231, + "learning_rate": 2.7302176541257986e-05, + "loss": 0.8285, + "step": 958 + }, + { + "epoch": 0.7672, + "grad_norm": 0.4636436847203281, + "learning_rate": 2.712443353799984e-05, + "loss": 0.7134, + "step": 959 + }, + { + "epoch": 0.768, + "grad_norm": 0.48765809255480125, + "learning_rate": 2.6947180173971508e-05, + "loss": 0.7824, + "step": 960 + }, + { + "epoch": 0.7688, + "grad_norm": 0.6074494031177282, + "learning_rate": 2.677041764010988e-05, + "loss": 0.79, + "step": 961 + }, + { + "epoch": 0.7696, + "grad_norm": 0.5426320419053459, + "learning_rate": 2.659414712405398e-05, + "loss": 0.8803, + "step": 962 + }, + { + "epoch": 0.7704, + "grad_norm": 0.43179728700368436, + "learning_rate": 2.6418369810137188e-05, + "loss": 0.6927, + "step": 963 + }, + { + "epoch": 0.7712, + "grad_norm": 0.4870535985635999, + "learning_rate": 2.6243086879379e-05, + "loss": 0.7859, + "step": 964 + }, + { + "epoch": 0.772, + "grad_norm": 0.5152243606756598, + "learning_rate": 2.6068299509477266e-05, + "loss": 0.7925, + "step": 965 + }, + { + "epoch": 0.7728, + "grad_norm": 0.4942498742428769, + "learning_rate": 2.5894008874800325e-05, + "loss": 0.6532, + "step": 966 + }, + { + "epoch": 0.7736, + "grad_norm": 0.49471572909917017, + "learning_rate": 2.5720216146378917e-05, + "loss": 0.7355, + "step": 967 + }, + { + "epoch": 0.7744, + "grad_norm": 0.515602610898312, + "learning_rate": 2.5546922491898495e-05, + "loss": 0.8216, + "step": 968 + }, + { + "epoch": 0.7752, + "grad_norm": 0.48759582476970237, + "learning_rate": 2.5374129075691265e-05, + "loss": 0.7691, + "step": 969 + }, + { + "epoch": 0.776, + "grad_norm": 0.5319259723239352, + "learning_rate": 2.5201837058728505e-05, + "loss": 0.6913, + "step": 970 + }, + { + "epoch": 0.7768, + "grad_norm": 0.5226626351276598, + "learning_rate": 2.503004759861258e-05, + "loss": 0.7946, + "step": 971 + }, + { + "epoch": 0.7776, + "grad_norm": 0.4222056092368926, + "learning_rate": 2.485876184956928e-05, + "loss": 0.6735, + "step": 972 + }, + { + "epoch": 0.7784, + "grad_norm": 0.5001031163121298, + "learning_rate": 2.4687980962440072e-05, + "loss": 0.6677, + "step": 973 + }, + { + "epoch": 0.7792, + "grad_norm": 0.4506197692500532, + "learning_rate": 2.451770608467432e-05, + "loss": 0.6784, + "step": 974 + }, + { + "epoch": 0.78, + "grad_norm": 0.5095624169639343, + "learning_rate": 2.4347938360321566e-05, + "loss": 0.8214, + "step": 975 + }, + { + "epoch": 0.7808, + "grad_norm": 0.5021497321557922, + "learning_rate": 2.417867893002387e-05, + "loss": 0.79, + "step": 976 + }, + { + "epoch": 0.7816, + "grad_norm": 0.5481051893245936, + "learning_rate": 2.400992893100822e-05, + "loss": 0.6788, + "step": 977 + }, + { + "epoch": 0.7824, + "grad_norm": 0.6346051133611106, + "learning_rate": 2.3841689497078746e-05, + "loss": 0.8301, + "step": 978 + }, + { + "epoch": 0.7832, + "grad_norm": 0.5582374326562646, + "learning_rate": 2.3673961758609152e-05, + "loss": 0.78, + "step": 979 + }, + { + "epoch": 0.784, + "grad_norm": 0.48300829039305565, + "learning_rate": 2.3506746842535242e-05, + "loss": 0.8242, + "step": 980 + }, + { + "epoch": 0.7848, + "grad_norm": 0.9117293868139854, + "learning_rate": 2.334004587234717e-05, + "loss": 0.8505, + "step": 981 + }, + { + "epoch": 0.7856, + "grad_norm": 0.39984004320142186, + "learning_rate": 2.3173859968081944e-05, + "loss": 0.5946, + "step": 982 + }, + { + "epoch": 0.7864, + "grad_norm": 0.4204587040766119, + "learning_rate": 2.300819024631603e-05, + "loss": 0.7047, + "step": 983 + }, + { + "epoch": 0.7872, + "grad_norm": 0.42752357682290293, + "learning_rate": 2.2843037820157675e-05, + "loss": 0.6708, + "step": 984 + }, + { + "epoch": 0.788, + "grad_norm": 0.51859445846189, + "learning_rate": 2.26784037992395e-05, + "loss": 0.8237, + "step": 985 + }, + { + "epoch": 0.7888, + "grad_norm": 0.6608745813205704, + "learning_rate": 2.251428928971102e-05, + "loss": 0.9515, + "step": 986 + }, + { + "epoch": 0.7896, + "grad_norm": 0.46954238976179, + "learning_rate": 2.2350695394231345e-05, + "loss": 0.7997, + "step": 987 + }, + { + "epoch": 0.7904, + "grad_norm": 0.5187667828336188, + "learning_rate": 2.2187623211961562e-05, + "loss": 0.748, + "step": 988 + }, + { + "epoch": 0.7912, + "grad_norm": 0.5233972597576384, + "learning_rate": 2.2025073838557454e-05, + "loss": 0.6623, + "step": 989 + }, + { + "epoch": 0.792, + "grad_norm": 0.46699149175851185, + "learning_rate": 2.1863048366162208e-05, + "loss": 0.6952, + "step": 990 + }, + { + "epoch": 0.7928, + "grad_norm": 0.5146291467073221, + "learning_rate": 2.1701547883398922e-05, + "loss": 0.7285, + "step": 991 + }, + { + "epoch": 0.7936, + "grad_norm": 0.41233758034817963, + "learning_rate": 2.1540573475363402e-05, + "loss": 0.6995, + "step": 992 + }, + { + "epoch": 0.7944, + "grad_norm": 0.45338313191059965, + "learning_rate": 2.138012622361689e-05, + "loss": 0.7164, + "step": 993 + }, + { + "epoch": 0.7952, + "grad_norm": 0.6167369290491044, + "learning_rate": 2.1220207206178688e-05, + "loss": 0.7874, + "step": 994 + }, + { + "epoch": 0.796, + "grad_norm": 0.4608400987280651, + "learning_rate": 2.106081749751897e-05, + "loss": 0.7454, + "step": 995 + }, + { + "epoch": 0.7968, + "grad_norm": 0.5012559421002032, + "learning_rate": 2.0901958168551638e-05, + "loss": 0.7885, + "step": 996 + }, + { + "epoch": 0.7976, + "grad_norm": 0.4750785039809705, + "learning_rate": 2.0743630286627002e-05, + "loss": 0.7909, + "step": 997 + }, + { + "epoch": 0.7984, + "grad_norm": 0.45985630757470286, + "learning_rate": 2.058583491552465e-05, + "loss": 0.6701, + "step": 998 + }, + { + "epoch": 0.7992, + "grad_norm": 0.4270953884966443, + "learning_rate": 2.0428573115446392e-05, + "loss": 0.731, + "step": 999 + }, + { + "epoch": 0.8, + "grad_norm": 0.4461302187521474, + "learning_rate": 2.027184594300898e-05, + "loss": 0.6372, + "step": 1000 + }, + { + "epoch": 0.8008, + "grad_norm": 0.5341156075049326, + "learning_rate": 2.011565445123711e-05, + "loss": 0.8135, + "step": 1001 + }, + { + "epoch": 0.8016, + "grad_norm": 0.473636341286951, + "learning_rate": 1.995999968955641e-05, + "loss": 0.7409, + "step": 1002 + }, + { + "epoch": 0.8024, + "grad_norm": 0.5258329610105286, + "learning_rate": 1.980488270378612e-05, + "loss": 0.7215, + "step": 1003 + }, + { + "epoch": 0.8032, + "grad_norm": 0.5026266818314066, + "learning_rate": 1.9650304536132426e-05, + "loss": 0.6262, + "step": 1004 + }, + { + "epoch": 0.804, + "grad_norm": 0.5681522310035081, + "learning_rate": 1.9496266225181248e-05, + "loss": 0.7847, + "step": 1005 + }, + { + "epoch": 0.8048, + "grad_norm": 0.5040600121056366, + "learning_rate": 1.9342768805891178e-05, + "loss": 0.727, + "step": 1006 + }, + { + "epoch": 0.8056, + "grad_norm": 0.42975482071747706, + "learning_rate": 1.918981330958678e-05, + "loss": 0.6897, + "step": 1007 + }, + { + "epoch": 0.8064, + "grad_norm": 0.39088474908116566, + "learning_rate": 1.903740076395151e-05, + "loss": 0.7154, + "step": 1008 + }, + { + "epoch": 0.8072, + "grad_norm": 0.63331093049371, + "learning_rate": 1.8885532193020704e-05, + "loss": 0.7877, + "step": 1009 + }, + { + "epoch": 0.808, + "grad_norm": 0.5440867686986005, + "learning_rate": 1.8734208617174988e-05, + "loss": 0.7196, + "step": 1010 + }, + { + "epoch": 0.8088, + "grad_norm": 0.43657119763585545, + "learning_rate": 1.8583431053133127e-05, + "loss": 0.6968, + "step": 1011 + }, + { + "epoch": 0.8096, + "grad_norm": 0.43726062031161556, + "learning_rate": 1.8433200513945337e-05, + "loss": 0.6916, + "step": 1012 + }, + { + "epoch": 0.8104, + "grad_norm": 0.49697995524720795, + "learning_rate": 1.8283518008986567e-05, + "loss": 0.7325, + "step": 1013 + }, + { + "epoch": 0.8112, + "grad_norm": 0.642165149449136, + "learning_rate": 1.8134384543949478e-05, + "loss": 0.8114, + "step": 1014 + }, + { + "epoch": 0.812, + "grad_norm": 0.44671659843499995, + "learning_rate": 1.7985801120837865e-05, + "loss": 0.62, + "step": 1015 + }, + { + "epoch": 0.8128, + "grad_norm": 0.4750374625031183, + "learning_rate": 1.783776873795994e-05, + "loss": 0.7681, + "step": 1016 + }, + { + "epoch": 0.8136, + "grad_norm": 0.5677882577982228, + "learning_rate": 1.7690288389921493e-05, + "loss": 0.6878, + "step": 1017 + }, + { + "epoch": 0.8144, + "grad_norm": 0.46059699211572874, + "learning_rate": 1.754336106761927e-05, + "loss": 0.8066, + "step": 1018 + }, + { + "epoch": 0.8152, + "grad_norm": 0.4889780511296242, + "learning_rate": 1.739698775823442e-05, + "loss": 0.6568, + "step": 1019 + }, + { + "epoch": 0.816, + "grad_norm": 0.538242116224818, + "learning_rate": 1.7251169445225657e-05, + "loss": 0.7828, + "step": 1020 + }, + { + "epoch": 0.8168, + "grad_norm": 1.7775886069499676, + "learning_rate": 1.7105907108322816e-05, + "loss": 0.7322, + "step": 1021 + }, + { + "epoch": 0.8176, + "grad_norm": 0.522496704085688, + "learning_rate": 1.696120172352025e-05, + "loss": 0.7564, + "step": 1022 + }, + { + "epoch": 0.8184, + "grad_norm": 0.5101087553943181, + "learning_rate": 1.6817054263070174e-05, + "loss": 0.8344, + "step": 1023 + }, + { + "epoch": 0.8192, + "grad_norm": 0.4908868315801233, + "learning_rate": 1.6673465695476232e-05, + "loss": 0.7016, + "step": 1024 + }, + { + "epoch": 0.82, + "grad_norm": 0.5870671112428668, + "learning_rate": 1.6530436985486996e-05, + "loss": 0.789, + "step": 1025 + }, + { + "epoch": 0.8208, + "grad_norm": 0.41822305107363533, + "learning_rate": 1.6387969094089316e-05, + "loss": 0.6779, + "step": 1026 + }, + { + "epoch": 0.8216, + "grad_norm": 0.5044443801201322, + "learning_rate": 1.6246062978502164e-05, + "loss": 0.7608, + "step": 1027 + }, + { + "epoch": 0.8224, + "grad_norm": 0.4144813264330897, + "learning_rate": 1.6104719592169902e-05, + "loss": 0.6744, + "step": 1028 + }, + { + "epoch": 0.8232, + "grad_norm": 0.4945295553903849, + "learning_rate": 1.5963939884756042e-05, + "loss": 0.7556, + "step": 1029 + }, + { + "epoch": 0.824, + "grad_norm": 0.3674677670770876, + "learning_rate": 1.5823724802136865e-05, + "loss": 0.661, + "step": 1030 + }, + { + "epoch": 0.8248, + "grad_norm": 0.4172182450550059, + "learning_rate": 1.5684075286394985e-05, + "loss": 0.6737, + "step": 1031 + }, + { + "epoch": 0.8256, + "grad_norm": 0.5509059310722004, + "learning_rate": 1.5544992275813053e-05, + "loss": 0.8365, + "step": 1032 + }, + { + "epoch": 0.8264, + "grad_norm": 0.7240049071896496, + "learning_rate": 1.5406476704867524e-05, + "loss": 0.898, + "step": 1033 + }, + { + "epoch": 0.8272, + "grad_norm": 0.43993964975766325, + "learning_rate": 1.526852950422226e-05, + "loss": 0.7361, + "step": 1034 + }, + { + "epoch": 0.828, + "grad_norm": 0.5558844401182051, + "learning_rate": 1.5131151600722337e-05, + "loss": 0.8276, + "step": 1035 + }, + { + "epoch": 0.8288, + "grad_norm": 0.4327273912279101, + "learning_rate": 1.4994343917387854e-05, + "loss": 0.7568, + "step": 1036 + }, + { + "epoch": 0.8296, + "grad_norm": 0.4133468213755012, + "learning_rate": 1.485810737340767e-05, + "loss": 0.7052, + "step": 1037 + }, + { + "epoch": 0.8304, + "grad_norm": 0.495577941601173, + "learning_rate": 1.4722442884133214e-05, + "loss": 0.8321, + "step": 1038 + }, + { + "epoch": 0.8312, + "grad_norm": 0.4883198956140364, + "learning_rate": 1.4587351361072454e-05, + "loss": 0.7206, + "step": 1039 + }, + { + "epoch": 0.832, + "grad_norm": 0.4977080357466427, + "learning_rate": 1.4452833711883628e-05, + "loss": 0.7019, + "step": 1040 + }, + { + "epoch": 0.8328, + "grad_norm": 0.4608703927284041, + "learning_rate": 1.4318890840369182e-05, + "loss": 0.697, + "step": 1041 + }, + { + "epoch": 0.8336, + "grad_norm": 0.5371185255152909, + "learning_rate": 1.4185523646469822e-05, + "loss": 0.7261, + "step": 1042 + }, + { + "epoch": 0.8344, + "grad_norm": 0.4258325901526125, + "learning_rate": 1.4052733026258281e-05, + "loss": 0.7144, + "step": 1043 + }, + { + "epoch": 0.8352, + "grad_norm": 0.4440874414866273, + "learning_rate": 1.3920519871933424e-05, + "loss": 0.6816, + "step": 1044 + }, + { + "epoch": 0.836, + "grad_norm": 0.6526896713751453, + "learning_rate": 1.3788885071814172e-05, + "loss": 0.7963, + "step": 1045 + }, + { + "epoch": 0.8368, + "grad_norm": 0.47426110043298514, + "learning_rate": 1.3657829510333654e-05, + "loss": 0.7609, + "step": 1046 + }, + { + "epoch": 0.8376, + "grad_norm": 0.39054648135296366, + "learning_rate": 1.3527354068033139e-05, + "loss": 0.6764, + "step": 1047 + }, + { + "epoch": 0.8384, + "grad_norm": 0.42222606460520595, + "learning_rate": 1.339745962155613e-05, + "loss": 0.6633, + "step": 1048 + }, + { + "epoch": 0.8392, + "grad_norm": 0.6289629979808175, + "learning_rate": 1.326814704364262e-05, + "loss": 0.7406, + "step": 1049 + }, + { + "epoch": 0.84, + "grad_norm": 0.4661898031533964, + "learning_rate": 1.3139417203123027e-05, + "loss": 0.7252, + "step": 1050 + }, + { + "epoch": 0.8408, + "grad_norm": 0.4896951405228761, + "learning_rate": 1.3011270964912459e-05, + "loss": 0.7309, + "step": 1051 + }, + { + "epoch": 0.8416, + "grad_norm": 0.5683741886870963, + "learning_rate": 1.2883709190004955e-05, + "loss": 0.7927, + "step": 1052 + }, + { + "epoch": 0.8424, + "grad_norm": 0.4927990120306898, + "learning_rate": 1.275673273546758e-05, + "loss": 0.7427, + "step": 1053 + }, + { + "epoch": 0.8432, + "grad_norm": 0.46204624575369263, + "learning_rate": 1.263034245443473e-05, + "loss": 0.6762, + "step": 1054 + }, + { + "epoch": 0.844, + "grad_norm": 0.5224452763300297, + "learning_rate": 1.2504539196102439e-05, + "loss": 0.7033, + "step": 1055 + }, + { + "epoch": 0.8448, + "grad_norm": 0.4839244360898422, + "learning_rate": 1.2379323805722576e-05, + "loss": 0.7519, + "step": 1056 + }, + { + "epoch": 0.8456, + "grad_norm": 0.48478311455515977, + "learning_rate": 1.2254697124597237e-05, + "loss": 0.7926, + "step": 1057 + }, + { + "epoch": 0.8464, + "grad_norm": 0.5675322597242162, + "learning_rate": 1.2130659990073146e-05, + "loss": 0.7476, + "step": 1058 + }, + { + "epoch": 0.8472, + "grad_norm": 0.5558835033517052, + "learning_rate": 1.2007213235535786e-05, + "loss": 0.788, + "step": 1059 + }, + { + "epoch": 0.848, + "grad_norm": 0.5021518913226307, + "learning_rate": 1.1884357690404158e-05, + "loss": 0.7328, + "step": 1060 + }, + { + "epoch": 0.8488, + "grad_norm": 0.7089194156298404, + "learning_rate": 1.176209418012495e-05, + "loss": 0.7407, + "step": 1061 + }, + { + "epoch": 0.8496, + "grad_norm": 0.44794808070935765, + "learning_rate": 1.1640423526166988e-05, + "loss": 0.7134, + "step": 1062 + }, + { + "epoch": 0.8504, + "grad_norm": 0.5163707520447636, + "learning_rate": 1.1519346546015907e-05, + "loss": 0.8099, + "step": 1063 + }, + { + "epoch": 0.8512, + "grad_norm": 0.48425426097146074, + "learning_rate": 1.1398864053168534e-05, + "loss": 0.6852, + "step": 1064 + }, + { + "epoch": 0.852, + "grad_norm": 0.5636073280069763, + "learning_rate": 1.1278976857127311e-05, + "loss": 0.7657, + "step": 1065 + }, + { + "epoch": 0.8528, + "grad_norm": 0.6161920215418742, + "learning_rate": 1.1159685763395111e-05, + "loss": 0.8373, + "step": 1066 + }, + { + "epoch": 0.8536, + "grad_norm": 0.6067370788621518, + "learning_rate": 1.1040991573469629e-05, + "loss": 0.6843, + "step": 1067 + }, + { + "epoch": 0.8544, + "grad_norm": 0.49878892018775045, + "learning_rate": 1.0922895084838037e-05, + "loss": 0.7251, + "step": 1068 + }, + { + "epoch": 0.8552, + "grad_norm": 0.5538931343889086, + "learning_rate": 1.0805397090971737e-05, + "loss": 0.7267, + "step": 1069 + }, + { + "epoch": 0.856, + "grad_norm": 0.5634469383423061, + "learning_rate": 1.0688498381320855e-05, + "loss": 0.682, + "step": 1070 + }, + { + "epoch": 0.8568, + "grad_norm": 0.6756498171458365, + "learning_rate": 1.057219974130903e-05, + "loss": 0.8469, + "step": 1071 + }, + { + "epoch": 0.8576, + "grad_norm": 0.5069613327102218, + "learning_rate": 1.045650195232819e-05, + "loss": 0.7163, + "step": 1072 + }, + { + "epoch": 0.8584, + "grad_norm": 0.5896151746440772, + "learning_rate": 1.0341405791733183e-05, + "loss": 0.8184, + "step": 1073 + }, + { + "epoch": 0.8592, + "grad_norm": 0.4194288233770586, + "learning_rate": 1.0226912032836611e-05, + "loss": 0.6887, + "step": 1074 + }, + { + "epoch": 0.86, + "grad_norm": 0.47883310643741056, + "learning_rate": 1.0113021444903726e-05, + "loss": 0.6584, + "step": 1075 + }, + { + "epoch": 0.8608, + "grad_norm": 0.5616475742536037, + "learning_rate": 9.999734793146998e-06, + "loss": 0.7524, + "step": 1076 + }, + { + "epoch": 0.8616, + "grad_norm": 0.43184922997628633, + "learning_rate": 9.887052838721322e-06, + "loss": 0.6764, + "step": 1077 + }, + { + "epoch": 0.8624, + "grad_norm": 0.5000781498918973, + "learning_rate": 9.774976338718677e-06, + "loss": 0.8025, + "step": 1078 + }, + { + "epoch": 0.8632, + "grad_norm": 0.685194137409394, + "learning_rate": 9.663506046162985e-06, + "loss": 0.6676, + "step": 1079 + }, + { + "epoch": 0.864, + "grad_norm": 0.5853873908548777, + "learning_rate": 9.552642710005299e-06, + "loss": 0.8585, + "step": 1080 + }, + { + "epoch": 0.8648, + "grad_norm": 0.42778937703206454, + "learning_rate": 9.44238707511862e-06, + "loss": 0.7184, + "step": 1081 + }, + { + "epoch": 0.8656, + "grad_norm": 0.47639955095676556, + "learning_rate": 9.332739882292752e-06, + "loss": 0.706, + "step": 1082 + }, + { + "epoch": 0.8664, + "grad_norm": 0.618402241111424, + "learning_rate": 9.22370186822965e-06, + "loss": 0.7574, + "step": 1083 + }, + { + "epoch": 0.8672, + "grad_norm": 0.44633938289465214, + "learning_rate": 9.115273765538202e-06, + "loss": 0.6354, + "step": 1084 + }, + { + "epoch": 0.868, + "grad_norm": 0.5875385436189163, + "learning_rate": 9.0074563027294e-06, + "loss": 0.8067, + "step": 1085 + }, + { + "epoch": 0.8688, + "grad_norm": 0.5605745440514615, + "learning_rate": 8.900250204211514e-06, + "loss": 0.8492, + "step": 1086 + }, + { + "epoch": 0.8696, + "grad_norm": 0.4730052711787034, + "learning_rate": 8.79365619028507e-06, + "loss": 0.7172, + "step": 1087 + }, + { + "epoch": 0.8704, + "grad_norm": 0.5580800303772229, + "learning_rate": 8.687674977138116e-06, + "loss": 0.7554, + "step": 1088 + }, + { + "epoch": 0.8712, + "grad_norm": 0.4254970757930749, + "learning_rate": 8.582307276841462e-06, + "loss": 0.585, + "step": 1089 + }, + { + "epoch": 0.872, + "grad_norm": 0.45978322037020936, + "learning_rate": 8.47755379734373e-06, + "loss": 0.7118, + "step": 1090 + }, + { + "epoch": 0.8728, + "grad_norm": 0.5907117314407938, + "learning_rate": 8.37341524246672e-06, + "loss": 0.7398, + "step": 1091 + }, + { + "epoch": 0.8736, + "grad_norm": 0.4412387518837164, + "learning_rate": 8.269892311900696e-06, + "loss": 0.704, + "step": 1092 + }, + { + "epoch": 0.8744, + "grad_norm": 0.4995045790717866, + "learning_rate": 8.166985701199582e-06, + "loss": 0.7067, + "step": 1093 + }, + { + "epoch": 0.8752, + "grad_norm": 0.5051232470835921, + "learning_rate": 8.064696101776358e-06, + "loss": 0.8006, + "step": 1094 + }, + { + "epoch": 0.876, + "grad_norm": 0.3971957363970268, + "learning_rate": 7.963024200898462e-06, + "loss": 0.6495, + "step": 1095 + }, + { + "epoch": 0.8768, + "grad_norm": 0.49043589782127034, + "learning_rate": 7.861970681683051e-06, + "loss": 0.6654, + "step": 1096 + }, + { + "epoch": 0.8776, + "grad_norm": 0.5428837478231188, + "learning_rate": 7.761536223092458e-06, + "loss": 0.6855, + "step": 1097 + }, + { + "epoch": 0.8784, + "grad_norm": 0.5282569591654496, + "learning_rate": 7.661721499929753e-06, + "loss": 0.7237, + "step": 1098 + }, + { + "epoch": 0.8792, + "grad_norm": 0.5818019327247147, + "learning_rate": 7.562527182833978e-06, + "loss": 0.7772, + "step": 1099 + }, + { + "epoch": 0.88, + "grad_norm": 0.47712081603654866, + "learning_rate": 7.463953938275858e-06, + "loss": 0.7363, + "step": 1100 + }, + { + "epoch": 0.8808, + "grad_norm": 0.34622702827014823, + "learning_rate": 7.366002428553153e-06, + "loss": 0.6029, + "step": 1101 + }, + { + "epoch": 0.8816, + "grad_norm": 0.6092477978535284, + "learning_rate": 7.2686733117863784e-06, + "loss": 0.6862, + "step": 1102 + }, + { + "epoch": 0.8824, + "grad_norm": 0.5636533235759575, + "learning_rate": 7.171967241914224e-06, + "loss": 0.7414, + "step": 1103 + }, + { + "epoch": 0.8832, + "grad_norm": 0.7189132278455704, + "learning_rate": 7.07588486868922e-06, + "loss": 0.8008, + "step": 1104 + }, + { + "epoch": 0.884, + "grad_norm": 0.4069774053525663, + "learning_rate": 6.980426837673437e-06, + "loss": 0.6373, + "step": 1105 + }, + { + "epoch": 0.8848, + "grad_norm": 0.41969446413223566, + "learning_rate": 6.8855937902340576e-06, + "loss": 0.6691, + "step": 1106 + }, + { + "epoch": 0.8856, + "grad_norm": 0.43341089210286354, + "learning_rate": 6.791386363539065e-06, + "loss": 0.7241, + "step": 1107 + }, + { + "epoch": 0.8864, + "grad_norm": 0.4763150523225858, + "learning_rate": 6.6978051905530855e-06, + "loss": 0.7293, + "step": 1108 + }, + { + "epoch": 0.8872, + "grad_norm": 0.5243786982758026, + "learning_rate": 6.604850900032955e-06, + "loss": 0.7858, + "step": 1109 + }, + { + "epoch": 0.888, + "grad_norm": 0.6383878187859945, + "learning_rate": 6.512524116523633e-06, + "loss": 0.8287, + "step": 1110 + }, + { + "epoch": 0.8888, + "grad_norm": 0.47023598685149814, + "learning_rate": 6.420825460353974e-06, + "loss": 0.784, + "step": 1111 + }, + { + "epoch": 0.8896, + "grad_norm": 0.4655342507169999, + "learning_rate": 6.329755547632499e-06, + "loss": 0.6534, + "step": 1112 + }, + { + "epoch": 0.8904, + "grad_norm": 0.5541439582539616, + "learning_rate": 6.239314990243339e-06, + "loss": 0.8094, + "step": 1113 + }, + { + "epoch": 0.8912, + "grad_norm": 0.5354696327059384, + "learning_rate": 6.149504395842087e-06, + "loss": 0.7385, + "step": 1114 + }, + { + "epoch": 0.892, + "grad_norm": 0.6277838369350939, + "learning_rate": 6.0603243678516995e-06, + "loss": 0.767, + "step": 1115 + }, + { + "epoch": 0.8928, + "grad_norm": 0.41775621115792394, + "learning_rate": 5.971775505458444e-06, + "loss": 0.6789, + "step": 1116 + }, + { + "epoch": 0.8936, + "grad_norm": 0.4706322350516361, + "learning_rate": 5.883858403607967e-06, + "loss": 0.6697, + "step": 1117 + }, + { + "epoch": 0.8944, + "grad_norm": 0.5686920934875289, + "learning_rate": 5.7965736530010916e-06, + "loss": 0.8155, + "step": 1118 + }, + { + "epoch": 0.8952, + "grad_norm": 0.4840851706332836, + "learning_rate": 5.7099218400900716e-06, + "loss": 0.7131, + "step": 1119 + }, + { + "epoch": 0.896, + "grad_norm": 0.38741107870169045, + "learning_rate": 5.623903547074549e-06, + "loss": 0.6598, + "step": 1120 + }, + { + "epoch": 0.8968, + "grad_norm": 0.5555666846750233, + "learning_rate": 5.538519351897575e-06, + "loss": 0.7491, + "step": 1121 + }, + { + "epoch": 0.8976, + "grad_norm": 0.4618925596656953, + "learning_rate": 5.453769828241872e-06, + "loss": 0.7333, + "step": 1122 + }, + { + "epoch": 0.8984, + "grad_norm": 0.5138487900812424, + "learning_rate": 5.369655545525909e-06, + "loss": 0.762, + "step": 1123 + }, + { + "epoch": 0.8992, + "grad_norm": 0.6272940777747218, + "learning_rate": 5.286177068899989e-06, + "loss": 0.7763, + "step": 1124 + }, + { + "epoch": 0.9, + "grad_norm": 0.5078429540376489, + "learning_rate": 5.2033349592426335e-06, + "loss": 0.7963, + "step": 1125 + }, + { + "epoch": 0.9008, + "grad_norm": 0.4946979306301698, + "learning_rate": 5.121129773156663e-06, + "loss": 0.7682, + "step": 1126 + }, + { + "epoch": 0.9016, + "grad_norm": 0.5182946181907617, + "learning_rate": 5.039562062965508e-06, + "loss": 0.6991, + "step": 1127 + }, + { + "epoch": 0.9024, + "grad_norm": 0.5235203112797608, + "learning_rate": 4.95863237670956e-06, + "loss": 0.8014, + "step": 1128 + }, + { + "epoch": 0.9032, + "grad_norm": 0.4425756060303903, + "learning_rate": 4.87834125814235e-06, + "loss": 0.6503, + "step": 1129 + }, + { + "epoch": 0.904, + "grad_norm": 0.6513597876716953, + "learning_rate": 4.798689246727006e-06, + "loss": 0.9007, + "step": 1130 + }, + { + "epoch": 0.9048, + "grad_norm": 0.8321938932815445, + "learning_rate": 4.719676877632639e-06, + "loss": 0.7324, + "step": 1131 + }, + { + "epoch": 0.9056, + "grad_norm": 0.46951710995420187, + "learning_rate": 4.641304681730641e-06, + "loss": 0.7557, + "step": 1132 + }, + { + "epoch": 0.9064, + "grad_norm": 0.46986659494656524, + "learning_rate": 4.563573185591219e-06, + "loss": 0.6842, + "step": 1133 + }, + { + "epoch": 0.9072, + "grad_norm": 0.46907519031176875, + "learning_rate": 4.486482911479839e-06, + "loss": 0.6883, + "step": 1134 + }, + { + "epoch": 0.908, + "grad_norm": 0.5157370874627774, + "learning_rate": 4.4100343773536225e-06, + "loss": 0.7977, + "step": 1135 + }, + { + "epoch": 0.9088, + "grad_norm": 0.4449648889231507, + "learning_rate": 4.3342280968580285e-06, + "loss": 0.669, + "step": 1136 + }, + { + "epoch": 0.9096, + "grad_norm": 0.5722331027119578, + "learning_rate": 4.259064579323302e-06, + "loss": 0.7645, + "step": 1137 + }, + { + "epoch": 0.9104, + "grad_norm": 0.4162921246583935, + "learning_rate": 4.184544329761009e-06, + "loss": 0.6824, + "step": 1138 + }, + { + "epoch": 0.9112, + "grad_norm": 0.4815905410333443, + "learning_rate": 4.1106678488607495e-06, + "loss": 0.7512, + "step": 1139 + }, + { + "epoch": 0.912, + "grad_norm": 0.5050075775858367, + "learning_rate": 4.037435632986786e-06, + "loss": 0.8343, + "step": 1140 + }, + { + "epoch": 0.9128, + "grad_norm": 0.456676045657739, + "learning_rate": 3.964848174174541e-06, + "loss": 0.6766, + "step": 1141 + }, + { + "epoch": 0.9136, + "grad_norm": 0.453441285772829, + "learning_rate": 3.892905960127546e-06, + "loss": 0.728, + "step": 1142 + }, + { + "epoch": 0.9144, + "grad_norm": 0.5350497495986325, + "learning_rate": 3.821609474213983e-06, + "loss": 0.7291, + "step": 1143 + }, + { + "epoch": 0.9152, + "grad_norm": 0.49917331824024186, + "learning_rate": 3.750959195463466e-06, + "loss": 0.7508, + "step": 1144 + }, + { + "epoch": 0.916, + "grad_norm": 0.48142429639175954, + "learning_rate": 3.6809555985639068e-06, + "loss": 0.7127, + "step": 1145 + }, + { + "epoch": 0.9168, + "grad_norm": 0.4142110499243709, + "learning_rate": 3.611599153858214e-06, + "loss": 0.6757, + "step": 1146 + }, + { + "epoch": 0.9176, + "grad_norm": 0.5430439435262364, + "learning_rate": 3.5428903273411863e-06, + "loss": 0.6672, + "step": 1147 + }, + { + "epoch": 0.9184, + "grad_norm": 0.44330305598763314, + "learning_rate": 3.4748295806564356e-06, + "loss": 0.7225, + "step": 1148 + }, + { + "epoch": 0.9192, + "grad_norm": 0.5096423082478944, + "learning_rate": 3.40741737109318e-06, + "loss": 0.7237, + "step": 1149 + }, + { + "epoch": 0.92, + "grad_norm": 0.6415859400900558, + "learning_rate": 3.3406541515832003e-06, + "loss": 0.7913, + "step": 1150 + }, + { + "epoch": 0.9208, + "grad_norm": 0.5364277875549572, + "learning_rate": 3.2745403706978872e-06, + "loss": 0.8011, + "step": 1151 + }, + { + "epoch": 0.9216, + "grad_norm": 0.4728954859163424, + "learning_rate": 3.209076472645112e-06, + "loss": 0.6631, + "step": 1152 + }, + { + "epoch": 0.9224, + "grad_norm": 0.5739372532945478, + "learning_rate": 3.1442628972662704e-06, + "loss": 0.8542, + "step": 1153 + }, + { + "epoch": 0.9232, + "grad_norm": 0.4742345238629844, + "learning_rate": 3.0801000800333877e-06, + "loss": 0.808, + "step": 1154 + }, + { + "epoch": 0.924, + "grad_norm": 0.4364079485028247, + "learning_rate": 3.0165884520461316e-06, + "loss": 0.7793, + "step": 1155 + }, + { + "epoch": 0.9248, + "grad_norm": 0.5331155865940347, + "learning_rate": 2.9537284400289355e-06, + "loss": 0.7316, + "step": 1156 + }, + { + "epoch": 0.9256, + "grad_norm": 0.5758016730532783, + "learning_rate": 2.8915204663281013e-06, + "loss": 0.6958, + "step": 1157 + }, + { + "epoch": 0.9264, + "grad_norm": 0.5318202745282977, + "learning_rate": 2.8299649489090475e-06, + "loss": 0.753, + "step": 1158 + }, + { + "epoch": 0.9272, + "grad_norm": 0.45960030309214356, + "learning_rate": 2.7690623013533976e-06, + "loss": 0.6986, + "step": 1159 + }, + { + "epoch": 0.928, + "grad_norm": 0.4748676117771573, + "learning_rate": 2.708812932856253e-06, + "loss": 0.6698, + "step": 1160 + }, + { + "epoch": 0.9288, + "grad_norm": 0.49488892162902537, + "learning_rate": 2.649217248223468e-06, + "loss": 0.7741, + "step": 1161 + }, + { + "epoch": 0.9296, + "grad_norm": 0.6756885390812936, + "learning_rate": 2.590275647868867e-06, + "loss": 0.9493, + "step": 1162 + }, + { + "epoch": 0.9304, + "grad_norm": 0.4292986738342194, + "learning_rate": 2.5319885278115906e-06, + "loss": 0.6534, + "step": 1163 + }, + { + "epoch": 0.9312, + "grad_norm": 0.46699781607588653, + "learning_rate": 2.4743562796734622e-06, + "loss": 0.6717, + "step": 1164 + }, + { + "epoch": 0.932, + "grad_norm": 0.4680392648836856, + "learning_rate": 2.4173792906762804e-06, + "loss": 0.6896, + "step": 1165 + }, + { + "epoch": 0.9328, + "grad_norm": 0.48369802762826214, + "learning_rate": 2.3610579436393e-06, + "loss": 0.7021, + "step": 1166 + }, + { + "epoch": 0.9336, + "grad_norm": 0.6039238486058927, + "learning_rate": 2.3053926169765984e-06, + "loss": 0.7789, + "step": 1167 + }, + { + "epoch": 0.9344, + "grad_norm": 0.5113843239607643, + "learning_rate": 2.250383684694579e-06, + "loss": 0.7344, + "step": 1168 + }, + { + "epoch": 0.9352, + "grad_norm": 0.4748128856557313, + "learning_rate": 2.1960315163894075e-06, + "loss": 0.7957, + "step": 1169 + }, + { + "epoch": 0.936, + "grad_norm": 0.6193323473187806, + "learning_rate": 2.1423364772445887e-06, + "loss": 0.7601, + "step": 1170 + }, + { + "epoch": 0.9368, + "grad_norm": 0.5466716178846842, + "learning_rate": 2.0892989280284823e-06, + "loss": 0.7283, + "step": 1171 + }, + { + "epoch": 0.9376, + "grad_norm": 0.6398194238066671, + "learning_rate": 2.036919225091827e-06, + "loss": 0.819, + "step": 1172 + }, + { + "epoch": 0.9384, + "grad_norm": 0.5169431248383446, + "learning_rate": 1.9851977203654835e-06, + "loss": 0.7962, + "step": 1173 + }, + { + "epoch": 0.9392, + "grad_norm": 0.5401491941477569, + "learning_rate": 1.9341347613579087e-06, + "loss": 0.7109, + "step": 1174 + }, + { + "epoch": 0.94, + "grad_norm": 0.7791898394432363, + "learning_rate": 1.8837306911529184e-06, + "loss": 0.7562, + "step": 1175 + }, + { + "epoch": 0.9408, + "grad_norm": 0.47300366205848676, + "learning_rate": 1.8339858484073935e-06, + "loss": 0.6712, + "step": 1176 + }, + { + "epoch": 0.9416, + "grad_norm": 0.5138926803184707, + "learning_rate": 1.7849005673489127e-06, + "loss": 0.7554, + "step": 1177 + }, + { + "epoch": 0.9424, + "grad_norm": 0.44476488021254323, + "learning_rate": 1.7364751777736332e-06, + "loss": 0.691, + "step": 1178 + }, + { + "epoch": 0.9432, + "grad_norm": 0.4243693394099248, + "learning_rate": 1.6887100050439587e-06, + "loss": 0.6572, + "step": 1179 + }, + { + "epoch": 0.944, + "grad_norm": 0.5404335045720218, + "learning_rate": 1.6416053700863964e-06, + "loss": 0.7063, + "step": 1180 + }, + { + "epoch": 0.9448, + "grad_norm": 0.48037402819702374, + "learning_rate": 1.595161589389449e-06, + "loss": 0.6838, + "step": 1181 + }, + { + "epoch": 0.9456, + "grad_norm": 0.4427890539725505, + "learning_rate": 1.5493789750014031e-06, + "loss": 0.6735, + "step": 1182 + }, + { + "epoch": 0.9464, + "grad_norm": 0.46952013285789856, + "learning_rate": 1.5042578345283108e-06, + "loss": 0.6838, + "step": 1183 + }, + { + "epoch": 0.9472, + "grad_norm": 0.46652083777150416, + "learning_rate": 1.459798471131868e-06, + "loss": 0.7435, + "step": 1184 + }, + { + "epoch": 0.948, + "grad_norm": 0.5603154803427823, + "learning_rate": 1.4160011835273934e-06, + "loss": 0.7164, + "step": 1185 + }, + { + "epoch": 0.9488, + "grad_norm": 0.43953229098015056, + "learning_rate": 1.3728662659818204e-06, + "loss": 0.7354, + "step": 1186 + }, + { + "epoch": 0.9496, + "grad_norm": 0.5030118359576425, + "learning_rate": 1.3303940083117527e-06, + "loss": 0.7117, + "step": 1187 + }, + { + "epoch": 0.9504, + "grad_norm": 0.6079376020902644, + "learning_rate": 1.2885846958814673e-06, + "loss": 0.7948, + "step": 1188 + }, + { + "epoch": 0.9512, + "grad_norm": 0.6204696509532744, + "learning_rate": 1.2474386096010039e-06, + "loss": 0.7316, + "step": 1189 + }, + { + "epoch": 0.952, + "grad_norm": 0.35035978662590106, + "learning_rate": 1.2069560259243328e-06, + "loss": 0.6629, + "step": 1190 + }, + { + "epoch": 0.9528, + "grad_norm": 0.5372213133397009, + "learning_rate": 1.1671372168474138e-06, + "loss": 0.7294, + "step": 1191 + }, + { + "epoch": 0.9536, + "grad_norm": 0.5643803187500273, + "learning_rate": 1.1279824499064396e-06, + "loss": 0.6886, + "step": 1192 + }, + { + "epoch": 0.9544, + "grad_norm": 0.4306253841050723, + "learning_rate": 1.089491988176017e-06, + "loss": 0.751, + "step": 1193 + }, + { + "epoch": 0.9552, + "grad_norm": 1.0330732280513122, + "learning_rate": 1.0516660902673448e-06, + "loss": 0.671, + "step": 1194 + }, + { + "epoch": 0.956, + "grad_norm": 0.4447517153562386, + "learning_rate": 1.014505010326583e-06, + "loss": 0.7121, + "step": 1195 + }, + { + "epoch": 0.9568, + "grad_norm": 0.6492945091666067, + "learning_rate": 9.780089980330642e-07, + "loss": 0.7758, + "step": 1196 + }, + { + "epoch": 0.9576, + "grad_norm": 0.5360553457492165, + "learning_rate": 9.421782985976068e-07, + "loss": 0.7489, + "step": 1197 + }, + { + "epoch": 0.9584, + "grad_norm": 0.4555753036082643, + "learning_rate": 9.070131527609604e-07, + "loss": 0.6926, + "step": 1198 + }, + { + "epoch": 0.9592, + "grad_norm": 0.5126847788360619, + "learning_rate": 8.725137967920738e-07, + "loss": 0.7871, + "step": 1199 + }, + { + "epoch": 0.96, + "grad_norm": 0.5762488567433169, + "learning_rate": 8.386804624865851e-07, + "loss": 0.7813, + "step": 1200 + }, + { + "epoch": 0.9608, + "grad_norm": 0.5245685500496474, + "learning_rate": 8.055133771652345e-07, + "loss": 0.7394, + "step": 1201 + }, + { + "epoch": 0.9616, + "grad_norm": 0.6669227035607159, + "learning_rate": 7.730127636723539e-07, + "loss": 0.8453, + "step": 1202 + }, + { + "epoch": 0.9624, + "grad_norm": 0.8007736012426442, + "learning_rate": 7.411788403743237e-07, + "loss": 0.859, + "step": 1203 + }, + { + "epoch": 0.9632, + "grad_norm": 0.4503205112473484, + "learning_rate": 7.100118211581852e-07, + "loss": 0.7615, + "step": 1204 + }, + { + "epoch": 0.964, + "grad_norm": 0.49124340433023866, + "learning_rate": 6.7951191543012e-07, + "loss": 0.7482, + "step": 1205 + }, + { + "epoch": 0.9648, + "grad_norm": 0.42003071858441315, + "learning_rate": 6.496793281141056e-07, + "loss": 0.6472, + "step": 1206 + }, + { + "epoch": 0.9656, + "grad_norm": 0.4920302625092041, + "learning_rate": 6.205142596505176e-07, + "loss": 0.7542, + "step": 1207 + }, + { + "epoch": 0.9664, + "grad_norm": 0.37614228484179074, + "learning_rate": 5.920169059947411e-07, + "loss": 0.5923, + "step": 1208 + }, + { + "epoch": 0.9672, + "grad_norm": 0.8642822948189379, + "learning_rate": 5.64187458615939e-07, + "loss": 0.7638, + "step": 1209 + }, + { + "epoch": 0.968, + "grad_norm": 0.46753756589344825, + "learning_rate": 5.370261044956971e-07, + "loss": 0.7803, + "step": 1210 + }, + { + "epoch": 0.9688, + "grad_norm": 0.5012484711267575, + "learning_rate": 5.105330261267916e-07, + "loss": 0.7366, + "step": 1211 + }, + { + "epoch": 0.9696, + "grad_norm": 0.4652899199121445, + "learning_rate": 4.847084015119574e-07, + "loss": 0.7007, + "step": 1212 + }, + { + "epoch": 0.9704, + "grad_norm": 0.4954135796719314, + "learning_rate": 4.5955240416271084e-07, + "loss": 0.7854, + "step": 1213 + }, + { + "epoch": 0.9712, + "grad_norm": 0.49258990364938865, + "learning_rate": 4.3506520309813947e-07, + "loss": 0.6938, + "step": 1214 + }, + { + "epoch": 0.972, + "grad_norm": 0.7968378755440436, + "learning_rate": 4.112469628438365e-07, + "loss": 0.9773, + "step": 1215 + }, + { + "epoch": 0.9728, + "grad_norm": 0.5009259157520817, + "learning_rate": 3.8809784343072366e-07, + "loss": 0.6898, + "step": 1216 + }, + { + "epoch": 0.9736, + "grad_norm": 0.6449277388507836, + "learning_rate": 3.6561800039403016e-07, + "loss": 0.7816, + "step": 1217 + }, + { + "epoch": 0.9744, + "grad_norm": 0.5317943900934, + "learning_rate": 3.4380758477219333e-07, + "loss": 0.8529, + "step": 1218 + }, + { + "epoch": 0.9752, + "grad_norm": 0.713439857473323, + "learning_rate": 3.2266674310589273e-07, + "loss": 0.7477, + "step": 1219 + }, + { + "epoch": 0.976, + "grad_norm": 0.4758822624453974, + "learning_rate": 3.0219561743707326e-07, + "loss": 0.8277, + "step": 1220 + }, + { + "epoch": 0.9768, + "grad_norm": 0.4926167423651641, + "learning_rate": 2.8239434530792365e-07, + "loss": 0.7139, + "step": 1221 + }, + { + "epoch": 0.9776, + "grad_norm": 0.5341949380143953, + "learning_rate": 2.6326305976001055e-07, + "loss": 0.7415, + "step": 1222 + }, + { + "epoch": 0.9784, + "grad_norm": 0.426429622925696, + "learning_rate": 2.448018893333681e-07, + "loss": 0.7606, + "step": 1223 + }, + { + "epoch": 0.9792, + "grad_norm": 0.5305430558327908, + "learning_rate": 2.2701095806565432e-07, + "loss": 0.7863, + "step": 1224 + }, + { + "epoch": 0.98, + "grad_norm": 0.45737929758712137, + "learning_rate": 2.098903854912515e-07, + "loss": 0.7229, + "step": 1225 + }, + { + "epoch": 0.9808, + "grad_norm": 0.4494166283772884, + "learning_rate": 1.9344028664056713e-07, + "loss": 0.6684, + "step": 1226 + }, + { + "epoch": 0.9816, + "grad_norm": 0.4768000600089465, + "learning_rate": 1.7766077203915655e-07, + "loss": 0.772, + "step": 1227 + }, + { + "epoch": 0.9824, + "grad_norm": 0.4988486343729917, + "learning_rate": 1.6255194770704586e-07, + "loss": 0.7593, + "step": 1228 + }, + { + "epoch": 0.9832, + "grad_norm": 0.40423663767248535, + "learning_rate": 1.481139151579991e-07, + "loss": 0.669, + "step": 1229 + }, + { + "epoch": 0.984, + "grad_norm": 0.508845487296791, + "learning_rate": 1.3434677139885222e-07, + "loss": 0.8378, + "step": 1230 + }, + { + "epoch": 0.9848, + "grad_norm": 0.4633942933703335, + "learning_rate": 1.2125060892881346e-07, + "loss": 0.6417, + "step": 1231 + }, + { + "epoch": 0.9856, + "grad_norm": 0.476644606850743, + "learning_rate": 1.0882551573891953e-07, + "loss": 0.7603, + "step": 1232 + }, + { + "epoch": 0.9864, + "grad_norm": 0.5714997209934206, + "learning_rate": 9.707157531134713e-08, + "loss": 0.8296, + "step": 1233 + }, + { + "epoch": 0.9872, + "grad_norm": 0.6631071655175154, + "learning_rate": 8.598886661895788e-08, + "loss": 0.8803, + "step": 1234 + }, + { + "epoch": 0.988, + "grad_norm": 0.48653574690084533, + "learning_rate": 7.557746412468758e-08, + "loss": 0.7311, + "step": 1235 + }, + { + "epoch": 0.9888, + "grad_norm": 0.4000816322132727, + "learning_rate": 6.583743778106887e-08, + "loss": 0.6838, + "step": 1236 + }, + { + "epoch": 0.9896, + "grad_norm": 0.45438122903368056, + "learning_rate": 5.6768853029787184e-08, + "loss": 0.6974, + "step": 1237 + }, + { + "epoch": 0.9904, + "grad_norm": 0.508389340822278, + "learning_rate": 4.837177080119215e-08, + "loss": 0.7088, + "step": 1238 + }, + { + "epoch": 0.9912, + "grad_norm": 0.8140920471901532, + "learning_rate": 4.064624751394242e-08, + "loss": 0.7153, + "step": 1239 + }, + { + "epoch": 0.992, + "grad_norm": 0.5118056867395696, + "learning_rate": 3.359233507459481e-08, + "loss": 0.7829, + "step": 1240 + }, + { + "epoch": 0.9928, + "grad_norm": 0.4814940872961647, + "learning_rate": 2.7210080877237976e-08, + "loss": 0.7192, + "step": 1241 + }, + { + "epoch": 0.9936, + "grad_norm": 0.456682871619528, + "learning_rate": 2.1499527803214846e-08, + "loss": 0.7038, + "step": 1242 + }, + { + "epoch": 0.9944, + "grad_norm": 0.6349546519149382, + "learning_rate": 1.646071422083395e-08, + "loss": 0.7399, + "step": 1243 + }, + { + "epoch": 0.9952, + "grad_norm": 0.49693359188520736, + "learning_rate": 1.209367398504746e-08, + "loss": 0.7429, + "step": 1244 + }, + { + "epoch": 0.996, + "grad_norm": 0.4505753949008942, + "learning_rate": 8.398436437317969e-09, + "loss": 0.7351, + "step": 1245 + }, + { + "epoch": 0.9968, + "grad_norm": 0.5128317699007771, + "learning_rate": 5.375026405352035e-09, + "loss": 0.7282, + "step": 1246 + }, + { + "epoch": 0.9976, + "grad_norm": 0.5283093131821548, + "learning_rate": 3.023464202944748e-09, + "loss": 0.7268, + "step": 1247 + }, + { + "epoch": 0.9984, + "grad_norm": 0.523943892468989, + "learning_rate": 1.3437656298687097e-09, + "loss": 0.7612, + "step": 1248 + }, + { + "epoch": 0.9992, + "grad_norm": 0.46158018682087965, + "learning_rate": 3.3594197175190745e-10, + "loss": 0.6429, + "step": 1249 + }, + { + "epoch": 1.0, + "grad_norm": 0.5046438985591529, + "learning_rate": 0.0, + "loss": 0.7855, + "step": 1250 + }, + { + "epoch": 1.0, + "step": 1250, + "total_flos": 1026379869290496.0, + "train_loss": 0.8078127761363983, + "train_runtime": 18699.9059, + "train_samples_per_second": 1.07, + "train_steps_per_second": 0.067 + } + ], + "logging_steps": 1.0, + "max_steps": 1250, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1026379869290496.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_1_epochs_1_GA_2_lora/README.md b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_1_epochs_1_GA_2_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_1_epochs_1_GA_2_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_1_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_1_epochs_1_GA_2_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..65a18ac0ddc625e9b82c8f5e53d9256de168fde4 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_1_epochs_1_GA_2_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "up_proj", + "o_proj", + "q_proj", + "gate_proj", + "k_proj", + "v_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fa0c2839efffa06861cceb07cf9fb3120741d395 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a225dc5d95a6ae1c7bf5588ef1bf4ec64ac15f287f3a3160363872eab388aa1b +size 671150064 diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_1_epochs_1_GA_2_lora/config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_1_epochs_1_GA_2_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_1_epochs_1_GA_2_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..9f3a8904ce65dcf46592de761d9190b14e4d6f3a --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:256a8d5f058d7d4982bc5b91fb56e78638bba9a1a33d405895391adf2f6adabd +size 918507402 diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_1_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_1_epochs_1_GA_2_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..227027f4723ce951a6382010afd8937e12b3aede --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_1_epochs_1_GA_2_lora/trainer_state.json @@ -0,0 +1,917 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 125, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.008, + "grad_norm": 1.002475123335683, + "learning_rate": 5e-05, + "loss": 1.4698, + "step": 1 + }, + { + "epoch": 0.016, + "grad_norm": 0.8049937709697602, + "learning_rate": 0.0001, + "loss": 1.1835, + "step": 2 + }, + { + "epoch": 0.024, + "grad_norm": 0.8183093836391889, + "learning_rate": 0.00015000000000000001, + "loss": 1.2271, + "step": 3 + }, + { + "epoch": 0.032, + "grad_norm": 0.7683921707563225, + "learning_rate": 0.0002, + "loss": 1.1947, + "step": 4 + }, + { + "epoch": 0.04, + "grad_norm": 1.0510309251197087, + "learning_rate": 0.00019996629653035126, + "loss": 1.2203, + "step": 5 + }, + { + "epoch": 0.048, + "grad_norm": 0.8134488573383152, + "learning_rate": 0.00019986520883988232, + "loss": 1.0905, + "step": 6 + }, + { + "epoch": 0.056, + "grad_norm": 0.6526412161282288, + "learning_rate": 0.00019969680506871137, + "loss": 1.0465, + "step": 7 + }, + { + "epoch": 0.064, + "grad_norm": 0.5204967415290006, + "learning_rate": 0.00019946119873266613, + "loss": 0.9215, + "step": 8 + }, + { + "epoch": 0.072, + "grad_norm": 0.6511808300296046, + "learning_rate": 0.00019915854864676664, + "loss": 0.988, + "step": 9 + }, + { + "epoch": 0.08, + "grad_norm": 0.813306426855869, + "learning_rate": 0.00019878905881817252, + "loss": 0.9748, + "step": 10 + }, + { + "epoch": 0.088, + "grad_norm": 0.6112244073376245, + "learning_rate": 0.00019835297830866826, + "loss": 0.9328, + "step": 11 + }, + { + "epoch": 0.096, + "grad_norm": 0.6296798323148467, + "learning_rate": 0.00019785060106677818, + "loss": 1.0615, + "step": 12 + }, + { + "epoch": 0.104, + "grad_norm": 0.5638071334528209, + "learning_rate": 0.00019728226572962473, + "loss": 0.8301, + "step": 13 + }, + { + "epoch": 0.112, + "grad_norm": 0.7128440290106496, + "learning_rate": 0.0001966483553946637, + "loss": 0.846, + "step": 14 + }, + { + "epoch": 0.12, + "grad_norm": 0.6380147511290136, + "learning_rate": 0.00019594929736144976, + "loss": 0.9178, + "step": 15 + }, + { + "epoch": 0.128, + "grad_norm": 0.6283060315082349, + "learning_rate": 0.00019518556284360696, + "loss": 0.991, + "step": 16 + }, + { + "epoch": 0.136, + "grad_norm": 0.5178639087438065, + "learning_rate": 0.0001943576666511982, + "loss": 0.9272, + "step": 17 + }, + { + "epoch": 0.144, + "grad_norm": 0.551827741755611, + "learning_rate": 0.0001934661668437073, + "loss": 0.9921, + "step": 18 + }, + { + "epoch": 0.152, + "grad_norm": 0.4462509568812092, + "learning_rate": 0.0001925116643538684, + "loss": 0.8086, + "step": 19 + }, + { + "epoch": 0.16, + "grad_norm": 0.6584174654719149, + "learning_rate": 0.00019149480258259533, + "loss": 0.9826, + "step": 20 + }, + { + "epoch": 0.168, + "grad_norm": 0.5995121455746469, + "learning_rate": 0.00019041626696528503, + "loss": 0.9427, + "step": 21 + }, + { + "epoch": 0.176, + "grad_norm": 0.5839687699018469, + "learning_rate": 0.0001892767845097864, + "loss": 0.9771, + "step": 22 + }, + { + "epoch": 0.184, + "grad_norm": 0.5688782708463275, + "learning_rate": 0.00018807712330634642, + "loss": 0.8553, + "step": 23 + }, + { + "epoch": 0.192, + "grad_norm": 0.550008976055812, + "learning_rate": 0.0001868180920098644, + "loss": 0.9807, + "step": 24 + }, + { + "epoch": 0.2, + "grad_norm": 0.5891229610132139, + "learning_rate": 0.00018550053929480202, + "loss": 1.07, + "step": 25 + }, + { + "epoch": 0.208, + "grad_norm": 0.557817217847609, + "learning_rate": 0.00018412535328311814, + "loss": 1.0502, + "step": 26 + }, + { + "epoch": 0.216, + "grad_norm": 0.4672201450304275, + "learning_rate": 0.0001826934609456129, + "loss": 0.7614, + "step": 27 + }, + { + "epoch": 0.224, + "grad_norm": 0.583233571588004, + "learning_rate": 0.00018120582747708502, + "loss": 0.8497, + "step": 28 + }, + { + "epoch": 0.232, + "grad_norm": 0.5361346707919389, + "learning_rate": 0.0001796634556457236, + "loss": 0.9454, + "step": 29 + }, + { + "epoch": 0.24, + "grad_norm": 0.7578504266330326, + "learning_rate": 0.0001780673851171728, + "loss": 0.9485, + "step": 30 + }, + { + "epoch": 0.248, + "grad_norm": 0.7930891130981601, + "learning_rate": 0.00017641869175372493, + "loss": 1.0536, + "step": 31 + }, + { + "epoch": 0.256, + "grad_norm": 0.5180379940487552, + "learning_rate": 0.00017471848688911464, + "loss": 0.9213, + "step": 32 + }, + { + "epoch": 0.264, + "grad_norm": 0.5701771012145226, + "learning_rate": 0.000172967916579403, + "loss": 0.8956, + "step": 33 + }, + { + "epoch": 0.272, + "grad_norm": 0.7284617561125063, + "learning_rate": 0.00017116816083045602, + "loss": 0.993, + "step": 34 + }, + { + "epoch": 0.28, + "grad_norm": 2.4134043312534286, + "learning_rate": 0.0001693204328025389, + "loss": 0.9732, + "step": 35 + }, + { + "epoch": 0.288, + "grad_norm": 0.42496793413863926, + "learning_rate": 0.00016742597799256182, + "loss": 0.7763, + "step": 36 + }, + { + "epoch": 0.296, + "grad_norm": 0.6619793121099012, + "learning_rate": 0.00016548607339452853, + "loss": 0.9008, + "step": 37 + }, + { + "epoch": 0.304, + "grad_norm": 0.6522244445478148, + "learning_rate": 0.00016350202663875386, + "loss": 1.0672, + "step": 38 + }, + { + "epoch": 0.312, + "grad_norm": 0.5462383817752346, + "learning_rate": 0.0001614751751104301, + "loss": 0.9031, + "step": 39 + }, + { + "epoch": 0.32, + "grad_norm": 0.5676805053548298, + "learning_rate": 0.00015940688504813662, + "loss": 0.8455, + "step": 40 + }, + { + "epoch": 0.328, + "grad_norm": 0.5240811947165249, + "learning_rate": 0.00015729855062290022, + "loss": 0.902, + "step": 41 + }, + { + "epoch": 0.336, + "grad_norm": 0.4613906058985293, + "learning_rate": 0.00015515159299842707, + "loss": 0.8615, + "step": 42 + }, + { + "epoch": 0.344, + "grad_norm": 0.4646253048462273, + "learning_rate": 0.00015296745937313987, + "loss": 0.8244, + "step": 43 + }, + { + "epoch": 0.352, + "grad_norm": 0.529737159176755, + "learning_rate": 0.00015074762200466556, + "loss": 0.8921, + "step": 44 + }, + { + "epoch": 0.36, + "grad_norm": 0.4771248270697291, + "learning_rate": 0.00014849357721743168, + "loss": 0.8796, + "step": 45 + }, + { + "epoch": 0.368, + "grad_norm": 0.6050629894691126, + "learning_rate": 0.00014620684439403962, + "loss": 0.9151, + "step": 46 + }, + { + "epoch": 0.376, + "grad_norm": 0.7838777188238114, + "learning_rate": 0.0001438889649510956, + "loss": 0.8912, + "step": 47 + }, + { + "epoch": 0.384, + "grad_norm": 0.6902887239461467, + "learning_rate": 0.00014154150130018866, + "loss": 0.9096, + "step": 48 + }, + { + "epoch": 0.392, + "grad_norm": 0.5683919970051982, + "learning_rate": 0.00013916603579471705, + "loss": 0.9269, + "step": 49 + }, + { + "epoch": 0.4, + "grad_norm": 0.5290672853989341, + "learning_rate": 0.000136764169663272, + "loss": 0.8106, + "step": 50 + }, + { + "epoch": 0.408, + "grad_norm": 0.6170953146475162, + "learning_rate": 0.00013433752193029886, + "loss": 0.965, + "step": 51 + }, + { + "epoch": 0.416, + "grad_norm": 0.4521329285137652, + "learning_rate": 0.00013188772832476188, + "loss": 0.9227, + "step": 52 + }, + { + "epoch": 0.424, + "grad_norm": 0.5521583271278084, + "learning_rate": 0.00012941644017754964, + "loss": 0.9496, + "step": 53 + }, + { + "epoch": 0.432, + "grad_norm": 0.4666459577181258, + "learning_rate": 0.00012692532330836346, + "loss": 0.8592, + "step": 54 + }, + { + "epoch": 0.44, + "grad_norm": 0.4292954670514068, + "learning_rate": 0.00012441605690283915, + "loss": 0.7974, + "step": 55 + }, + { + "epoch": 0.448, + "grad_norm": 0.5466551052523965, + "learning_rate": 0.0001218903323806595, + "loss": 0.8403, + "step": 56 + }, + { + "epoch": 0.456, + "grad_norm": 0.5883302435010964, + "learning_rate": 0.00011934985225541998, + "loss": 0.8502, + "step": 57 + }, + { + "epoch": 0.464, + "grad_norm": 0.557397470521136, + "learning_rate": 0.00011679632898701649, + "loss": 0.9663, + "step": 58 + }, + { + "epoch": 0.472, + "grad_norm": 0.5412293057735887, + "learning_rate": 0.00011423148382732853, + "loss": 0.9034, + "step": 59 + }, + { + "epoch": 0.48, + "grad_norm": 0.489899862903999, + "learning_rate": 0.00011165704565997593, + "loss": 0.9012, + "step": 60 + }, + { + "epoch": 0.488, + "grad_norm": 0.5260608475920923, + "learning_rate": 0.00010907474983493144, + "loss": 0.8843, + "step": 61 + }, + { + "epoch": 0.496, + "grad_norm": 0.6241407824627501, + "learning_rate": 0.0001064863369987743, + "loss": 0.8676, + "step": 62 + }, + { + "epoch": 0.504, + "grad_norm": 0.5395216158918755, + "learning_rate": 0.00010389355192137377, + "loss": 0.8135, + "step": 63 + }, + { + "epoch": 0.512, + "grad_norm": 0.49240885512518473, + "learning_rate": 0.0001012981423197931, + "loss": 0.8643, + "step": 64 + }, + { + "epoch": 0.52, + "grad_norm": 0.5511692850840025, + "learning_rate": 9.870185768020693e-05, + "loss": 0.9358, + "step": 65 + }, + { + "epoch": 0.528, + "grad_norm": 0.5114676398612253, + "learning_rate": 9.610644807862625e-05, + "loss": 0.8654, + "step": 66 + }, + { + "epoch": 0.536, + "grad_norm": 0.5612846646984437, + "learning_rate": 9.35136630012257e-05, + "loss": 0.9446, + "step": 67 + }, + { + "epoch": 0.544, + "grad_norm": 0.5697157540640136, + "learning_rate": 9.092525016506858e-05, + "loss": 0.9043, + "step": 68 + }, + { + "epoch": 0.552, + "grad_norm": 0.6772087664591744, + "learning_rate": 8.83429543400241e-05, + "loss": 0.9843, + "step": 69 + }, + { + "epoch": 0.56, + "grad_norm": 0.3986979971884232, + "learning_rate": 8.57685161726715e-05, + "loss": 0.7799, + "step": 70 + }, + { + "epoch": 0.568, + "grad_norm": 0.526004756548793, + "learning_rate": 8.320367101298351e-05, + "loss": 0.8681, + "step": 71 + }, + { + "epoch": 0.576, + "grad_norm": 0.5035036852178381, + "learning_rate": 8.065014774458003e-05, + "loss": 0.8732, + "step": 72 + }, + { + "epoch": 0.584, + "grad_norm": 0.6202080509189101, + "learning_rate": 7.810966761934053e-05, + "loss": 0.9516, + "step": 73 + }, + { + "epoch": 0.592, + "grad_norm": 0.5163561353247864, + "learning_rate": 7.558394309716088e-05, + "loss": 0.8012, + "step": 74 + }, + { + "epoch": 0.6, + "grad_norm": 0.4611214523140949, + "learning_rate": 7.307467669163655e-05, + "loss": 0.7428, + "step": 75 + }, + { + "epoch": 0.608, + "grad_norm": 0.4680448157741042, + "learning_rate": 7.058355982245037e-05, + "loss": 0.8145, + "step": 76 + }, + { + "epoch": 0.616, + "grad_norm": 0.5157699384388195, + "learning_rate": 6.811227167523815e-05, + "loss": 0.8448, + "step": 77 + }, + { + "epoch": 0.624, + "grad_norm": 0.42118567269139934, + "learning_rate": 6.566247806970119e-05, + "loss": 0.7818, + "step": 78 + }, + { + "epoch": 0.632, + "grad_norm": 0.4084175987467916, + "learning_rate": 6.323583033672799e-05, + "loss": 0.7473, + "step": 79 + }, + { + "epoch": 0.64, + "grad_norm": 0.7765002711996135, + "learning_rate": 6.083396420528298e-05, + "loss": 1.0617, + "step": 80 + }, + { + "epoch": 0.648, + "grad_norm": 0.5141505553150338, + "learning_rate": 5.845849869981137e-05, + "loss": 0.8848, + "step": 81 + }, + { + "epoch": 0.656, + "grad_norm": 0.4665505106546281, + "learning_rate": 5.611103504890444e-05, + "loss": 0.7238, + "step": 82 + }, + { + "epoch": 0.664, + "grad_norm": 0.497247057356739, + "learning_rate": 5.379315560596038e-05, + "loss": 0.8234, + "step": 83 + }, + { + "epoch": 0.672, + "grad_norm": 0.4914760684720225, + "learning_rate": 5.1506422782568345e-05, + "loss": 0.8842, + "step": 84 + }, + { + "epoch": 0.68, + "grad_norm": 0.5224002630076798, + "learning_rate": 4.9252377995334444e-05, + "loss": 0.7794, + "step": 85 + }, + { + "epoch": 0.688, + "grad_norm": 0.5024783519544875, + "learning_rate": 4.703254062686017e-05, + "loss": 0.889, + "step": 86 + }, + { + "epoch": 0.696, + "grad_norm": 0.6938810000753591, + "learning_rate": 4.484840700157295e-05, + "loss": 0.8999, + "step": 87 + }, + { + "epoch": 0.704, + "grad_norm": 0.47416623333153984, + "learning_rate": 4.270144937709981e-05, + "loss": 0.8048, + "step": 88 + }, + { + "epoch": 0.712, + "grad_norm": 0.4149860597614193, + "learning_rate": 4.059311495186338e-05, + "loss": 0.8108, + "step": 89 + }, + { + "epoch": 0.72, + "grad_norm": 0.7010817596406334, + "learning_rate": 3.852482488956992e-05, + "loss": 1.0248, + "step": 90 + }, + { + "epoch": 0.728, + "grad_norm": 0.6575269786064363, + "learning_rate": 3.649797336124615e-05, + "loss": 0.9302, + "step": 91 + }, + { + "epoch": 0.736, + "grad_norm": 0.5316026929850952, + "learning_rate": 3.45139266054715e-05, + "loss": 0.9774, + "step": 92 + }, + { + "epoch": 0.744, + "grad_norm": 0.4776089538625679, + "learning_rate": 3.257402200743821e-05, + "loss": 0.8102, + "step": 93 + }, + { + "epoch": 0.752, + "grad_norm": 0.5600853475954763, + "learning_rate": 3.0679567197461134e-05, + "loss": 0.8381, + "step": 94 + }, + { + "epoch": 0.76, + "grad_norm": 0.41374368607395756, + "learning_rate": 2.8831839169543996e-05, + "loss": 0.7665, + "step": 95 + }, + { + "epoch": 0.768, + "grad_norm": 0.4837638603469293, + "learning_rate": 2.7032083420597e-05, + "loss": 0.8811, + "step": 96 + }, + { + "epoch": 0.776, + "grad_norm": 0.4972027931391034, + "learning_rate": 2.528151311088537e-05, + "loss": 0.8563, + "step": 97 + }, + { + "epoch": 0.784, + "grad_norm": 0.4990914932563448, + "learning_rate": 2.3581308246275103e-05, + "loss": 0.8019, + "step": 98 + }, + { + "epoch": 0.792, + "grad_norm": 0.4319357944618215, + "learning_rate": 2.1932614882827197e-05, + "loss": 0.7844, + "step": 99 + }, + { + "epoch": 0.8, + "grad_norm": 0.5958916752528824, + "learning_rate": 2.03365443542764e-05, + "loss": 1.026, + "step": 100 + }, + { + "epoch": 0.808, + "grad_norm": 0.5311068727150814, + "learning_rate": 1.879417252291502e-05, + "loss": 0.8616, + "step": 101 + }, + { + "epoch": 0.816, + "grad_norm": 0.6460726823451469, + "learning_rate": 1.730653905438714e-05, + "loss": 1.0715, + "step": 102 + }, + { + "epoch": 0.824, + "grad_norm": 0.5343341046453493, + "learning_rate": 1.587464671688187e-05, + "loss": 0.8666, + "step": 103 + }, + { + "epoch": 0.832, + "grad_norm": 0.4809311266333434, + "learning_rate": 1.4499460705197998e-05, + "loss": 0.8524, + "step": 104 + }, + { + "epoch": 0.84, + "grad_norm": 0.47321674516129725, + "learning_rate": 1.3181907990135622e-05, + "loss": 0.8301, + "step": 105 + }, + { + "epoch": 0.848, + "grad_norm": 0.5182733171026215, + "learning_rate": 1.1922876693653585e-05, + "loss": 0.8791, + "step": 106 + }, + { + "epoch": 0.856, + "grad_norm": 0.4645511558470242, + "learning_rate": 1.0723215490213634e-05, + "loss": 0.891, + "step": 107 + }, + { + "epoch": 0.864, + "grad_norm": 0.47446677400627907, + "learning_rate": 9.583733034714981e-06, + "loss": 0.8293, + "step": 108 + }, + { + "epoch": 0.872, + "grad_norm": 0.4163475564341525, + "learning_rate": 8.505197417404687e-06, + "loss": 0.7825, + "step": 109 + }, + { + "epoch": 0.88, + "grad_norm": 0.49642122885816725, + "learning_rate": 7.488335646131628e-06, + "loss": 0.8895, + "step": 110 + }, + { + "epoch": 0.888, + "grad_norm": 0.5637361463741167, + "learning_rate": 6.533833156292679e-06, + "loss": 0.7237, + "step": 111 + }, + { + "epoch": 0.896, + "grad_norm": 0.5863037005074132, + "learning_rate": 5.6423333488018095e-06, + "loss": 0.9806, + "step": 112 + }, + { + "epoch": 0.904, + "grad_norm": 0.46124574810331515, + "learning_rate": 4.8144371563930476e-06, + "loss": 0.7719, + "step": 113 + }, + { + "epoch": 0.912, + "grad_norm": 0.5926944784931045, + "learning_rate": 4.050702638550275e-06, + "loss": 0.875, + "step": 114 + }, + { + "epoch": 0.92, + "grad_norm": 0.5545491147233557, + "learning_rate": 3.3516446053363015e-06, + "loss": 0.8167, + "step": 115 + }, + { + "epoch": 0.928, + "grad_norm": 0.43671807283873015, + "learning_rate": 2.717734270375272e-06, + "loss": 0.8003, + "step": 116 + }, + { + "epoch": 0.936, + "grad_norm": 0.6098249666296813, + "learning_rate": 2.1493989332218468e-06, + "loss": 0.8736, + "step": 117 + }, + { + "epoch": 0.944, + "grad_norm": 0.4601261313002717, + "learning_rate": 1.6470216913317626e-06, + "loss": 0.8029, + "step": 118 + }, + { + "epoch": 0.952, + "grad_norm": 0.4248790991858984, + "learning_rate": 1.2109411818274852e-06, + "loss": 0.8119, + "step": 119 + }, + { + "epoch": 0.96, + "grad_norm": 0.5713345517214953, + "learning_rate": 8.41451353233369e-07, + "loss": 0.961, + "step": 120 + }, + { + "epoch": 0.968, + "grad_norm": 0.42342878395245415, + "learning_rate": 5.388012673338661e-07, + "loss": 0.834, + "step": 121 + }, + { + "epoch": 0.976, + "grad_norm": 0.4709530957161904, + "learning_rate": 3.0319493128866396e-07, + "loss": 0.7841, + "step": 122 + }, + { + "epoch": 0.984, + "grad_norm": 0.6171273556109618, + "learning_rate": 1.3479116011769767e-07, + "loss": 1.0197, + "step": 123 + }, + { + "epoch": 0.992, + "grad_norm": 0.4367515608451683, + "learning_rate": 3.370346964876036e-08, + "loss": 0.7927, + "step": 124 + }, + { + "epoch": 1.0, + "grad_norm": 0.6110106581034457, + "learning_rate": 0.0, + "loss": 0.9813, + "step": 125 + }, + { + "epoch": 1.0, + "step": 125, + "total_flos": 97576377352192.0, + "train_loss": 0.9055721774101257, + "train_runtime": 1848.2224, + "train_samples_per_second": 1.082, + "train_steps_per_second": 0.068 + } + ], + "logging_steps": 1.0, + "max_steps": 125, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 97576377352192.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_1_epochs_1_GA_4_lora/README.md b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_1_epochs_1_GA_4_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_1_epochs_1_GA_4_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_1_epochs_1_GA_4_lora/adapter_config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_1_epochs_1_GA_4_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b9aee7ed8abfbef9fe7c30417e210c0d1bcc6e33 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_1_epochs_1_GA_4_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "up_proj", + "k_proj", + "o_proj", + "v_proj", + "q_proj", + "gate_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_1_epochs_1_GA_4_lora/adapter_model.safetensors b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_1_epochs_1_GA_4_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..27983adf166e44fc657e2e63a687702682d388e2 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_1_epochs_1_GA_4_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46ebc18e44273048ffabcbd6989f0070ad2770ebe3b610b1f6233da3c0473f0e +size 671150064 diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_1_epochs_1_GA_4_lora/config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_1_epochs_1_GA_4_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_1_epochs_1_GA_4_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_1_epochs_1_GA_4_lora/non_lora_trainables.bin b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_1_epochs_1_GA_4_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..b19207afc66091ffecde940adcdab111c0a2b02e --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_1_epochs_1_GA_4_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:53050a5bad14abe47326aa75a3b469ebb320c2737545f12d905bb248e1a3a2e3 +size 918507402 diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_1_epochs_1_GA_4_lora/trainer_state.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_1_epochs_1_GA_4_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2a2990cea99da5b26aa69d7c9339df54f5d331e6 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_1_epochs_1_GA_4_lora/trainer_state.json @@ -0,0 +1,476 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.992, + "eval_steps": 500, + "global_step": 62, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.016, + "grad_norm": 0.7732680457900044, + "learning_rate": 0.0001, + "loss": 1.3267, + "step": 1 + }, + { + "epoch": 0.032, + "grad_norm": 0.7315180877831058, + "learning_rate": 0.0002, + "loss": 1.3075, + "step": 2 + }, + { + "epoch": 0.048, + "grad_norm": 0.5820563140825258, + "learning_rate": 0.0001998629534754574, + "loss": 1.2362, + "step": 3 + }, + { + "epoch": 0.064, + "grad_norm": 1.0746012742648456, + "learning_rate": 0.00019945218953682734, + "loss": 1.1503, + "step": 4 + }, + { + "epoch": 0.08, + "grad_norm": 0.8470915908655696, + "learning_rate": 0.00019876883405951377, + "loss": 1.0586, + "step": 5 + }, + { + "epoch": 0.096, + "grad_norm": 0.44503583996845364, + "learning_rate": 0.00019781476007338058, + "loss": 1.0279, + "step": 6 + }, + { + "epoch": 0.112, + "grad_norm": 0.44536491624905683, + "learning_rate": 0.00019659258262890683, + "loss": 0.8624, + "step": 7 + }, + { + "epoch": 0.128, + "grad_norm": 0.5470997705620867, + "learning_rate": 0.00019510565162951537, + "loss": 0.9922, + "step": 8 + }, + { + "epoch": 0.144, + "grad_norm": 0.5609845265919665, + "learning_rate": 0.00019335804264972018, + "loss": 0.9773, + "step": 9 + }, + { + "epoch": 0.16, + "grad_norm": 0.4590082649410531, + "learning_rate": 0.0001913545457642601, + "loss": 0.9101, + "step": 10 + }, + { + "epoch": 0.176, + "grad_norm": 0.5114746549361595, + "learning_rate": 0.0001891006524188368, + "loss": 0.979, + "step": 11 + }, + { + "epoch": 0.192, + "grad_norm": 0.4431229621480573, + "learning_rate": 0.00018660254037844388, + "loss": 0.9336, + "step": 12 + }, + { + "epoch": 0.208, + "grad_norm": 0.4668034216043737, + "learning_rate": 0.00018386705679454242, + "loss": 1.0725, + "step": 13 + }, + { + "epoch": 0.224, + "grad_norm": 0.39231345197066647, + "learning_rate": 0.00018090169943749476, + "loss": 0.8184, + "step": 14 + }, + { + "epoch": 0.24, + "grad_norm": 0.48905103020214946, + "learning_rate": 0.0001777145961456971, + "loss": 0.9652, + "step": 15 + }, + { + "epoch": 0.256, + "grad_norm": 0.518970830511111, + "learning_rate": 0.00017431448254773944, + "loss": 1.0049, + "step": 16 + }, + { + "epoch": 0.272, + "grad_norm": 0.47750721567014115, + "learning_rate": 0.00017071067811865476, + "loss": 0.9495, + "step": 17 + }, + { + "epoch": 0.288, + "grad_norm": 0.36048296396596413, + "learning_rate": 0.00016691306063588583, + "loss": 0.8709, + "step": 18 + }, + { + "epoch": 0.304, + "grad_norm": 0.4572367034332387, + "learning_rate": 0.00016293203910498376, + "loss": 0.9819, + "step": 19 + }, + { + "epoch": 0.32, + "grad_norm": 0.3922447630076946, + "learning_rate": 0.00015877852522924732, + "loss": 0.878, + "step": 20 + }, + { + "epoch": 0.336, + "grad_norm": 0.34455434430176324, + "learning_rate": 0.00015446390350150273, + "loss": 0.8797, + "step": 21 + }, + { + "epoch": 0.352, + "grad_norm": 0.3481663865777293, + "learning_rate": 0.00015000000000000001, + "loss": 0.8555, + "step": 22 + }, + { + "epoch": 0.368, + "grad_norm": 0.3951295872344038, + "learning_rate": 0.00014539904997395468, + "loss": 0.9022, + "step": 23 + }, + { + "epoch": 0.384, + "grad_norm": 0.45425405167930893, + "learning_rate": 0.00014067366430758004, + "loss": 0.9084, + "step": 24 + }, + { + "epoch": 0.4, + "grad_norm": 0.3888790701288202, + "learning_rate": 0.00013583679495453, + "loss": 0.8691, + "step": 25 + }, + { + "epoch": 0.416, + "grad_norm": 0.3859572309538235, + "learning_rate": 0.00013090169943749476, + "loss": 0.9427, + "step": 26 + }, + { + "epoch": 0.432, + "grad_norm": 0.3750342958666849, + "learning_rate": 0.00012588190451025207, + "loss": 0.8988, + "step": 27 + }, + { + "epoch": 0.448, + "grad_norm": 0.42707945856135066, + "learning_rate": 0.00012079116908177593, + "loss": 0.8242, + "step": 28 + }, + { + "epoch": 0.464, + "grad_norm": 0.3569583535912753, + "learning_rate": 0.0001156434465040231, + "loss": 0.9102, + "step": 29 + }, + { + "epoch": 0.48, + "grad_norm": 0.3975920783643425, + "learning_rate": 0.00011045284632676536, + "loss": 0.9073, + "step": 30 + }, + { + "epoch": 0.496, + "grad_norm": 0.4446733801857913, + "learning_rate": 0.0001052335956242944, + "loss": 0.8722, + "step": 31 + }, + { + "epoch": 0.512, + "grad_norm": 0.36800471454131656, + "learning_rate": 0.0001, + "loss": 0.8467, + "step": 32 + }, + { + "epoch": 0.528, + "grad_norm": 0.3721984511543991, + "learning_rate": 9.476640437570562e-05, + "loss": 0.9088, + "step": 33 + }, + { + "epoch": 0.544, + "grad_norm": 0.38777813205326916, + "learning_rate": 8.954715367323468e-05, + "loss": 0.9306, + "step": 34 + }, + { + "epoch": 0.56, + "grad_norm": 0.35423324887463065, + "learning_rate": 8.435655349597689e-05, + "loss": 0.8836, + "step": 35 + }, + { + "epoch": 0.576, + "grad_norm": 0.36569409428500577, + "learning_rate": 7.920883091822408e-05, + "loss": 0.8736, + "step": 36 + }, + { + "epoch": 0.592, + "grad_norm": 0.4578563744683934, + "learning_rate": 7.411809548974792e-05, + "loss": 0.8879, + "step": 37 + }, + { + "epoch": 0.608, + "grad_norm": 0.3501501062668216, + "learning_rate": 6.909830056250527e-05, + "loss": 0.7837, + "step": 38 + }, + { + "epoch": 0.624, + "grad_norm": 0.33086131280733955, + "learning_rate": 6.416320504546997e-05, + "loss": 0.8213, + "step": 39 + }, + { + "epoch": 0.64, + "grad_norm": 0.45835208970698804, + "learning_rate": 5.9326335692419995e-05, + "loss": 0.9113, + "step": 40 + }, + { + "epoch": 0.656, + "grad_norm": 0.36550765994451895, + "learning_rate": 5.4600950026045326e-05, + "loss": 0.8105, + "step": 41 + }, + { + "epoch": 0.672, + "grad_norm": 0.3361874225881034, + "learning_rate": 5.000000000000002e-05, + "loss": 0.8611, + "step": 42 + }, + { + "epoch": 0.688, + "grad_norm": 0.3722661952123568, + "learning_rate": 4.5536096498497295e-05, + "loss": 0.8403, + "step": 43 + }, + { + "epoch": 0.704, + "grad_norm": 0.4210787324219839, + "learning_rate": 4.12214747707527e-05, + "loss": 0.857, + "step": 44 + }, + { + "epoch": 0.72, + "grad_norm": 0.4282075988975985, + "learning_rate": 3.7067960895016275e-05, + "loss": 0.9317, + "step": 45 + }, + { + "epoch": 0.736, + "grad_norm": 0.43157948987013084, + "learning_rate": 3.308693936411421e-05, + "loss": 0.9615, + "step": 46 + }, + { + "epoch": 0.752, + "grad_norm": 0.38491521771794995, + "learning_rate": 2.9289321881345254e-05, + "loss": 0.833, + "step": 47 + }, + { + "epoch": 0.768, + "grad_norm": 0.3171022379562048, + "learning_rate": 2.5685517452260567e-05, + "loss": 0.8299, + "step": 48 + }, + { + "epoch": 0.784, + "grad_norm": 0.4273540932961301, + "learning_rate": 2.2285403854302912e-05, + "loss": 0.8365, + "step": 49 + }, + { + "epoch": 0.8, + "grad_norm": 0.3909218340959966, + "learning_rate": 1.9098300562505266e-05, + "loss": 0.9118, + "step": 50 + }, + { + "epoch": 0.816, + "grad_norm": 0.44627480284108617, + "learning_rate": 1.6132943205457606e-05, + "loss": 0.9798, + "step": 51 + }, + { + "epoch": 0.832, + "grad_norm": 0.37564635907441385, + "learning_rate": 1.339745962155613e-05, + "loss": 0.8687, + "step": 52 + }, + { + "epoch": 0.848, + "grad_norm": 0.3964836458446199, + "learning_rate": 1.0899347581163221e-05, + "loss": 0.8678, + "step": 53 + }, + { + "epoch": 0.864, + "grad_norm": 0.3335011216773086, + "learning_rate": 8.645454235739903e-06, + "loss": 0.8725, + "step": 54 + }, + { + "epoch": 0.88, + "grad_norm": 0.353338514931596, + "learning_rate": 6.6419573502798374e-06, + "loss": 0.8435, + "step": 55 + }, + { + "epoch": 0.896, + "grad_norm": 0.40299785540917815, + "learning_rate": 4.8943483704846475e-06, + "loss": 0.8651, + "step": 56 + }, + { + "epoch": 0.912, + "grad_norm": 0.3805449892675675, + "learning_rate": 3.40741737109318e-06, + "loss": 0.8376, + "step": 57 + }, + { + "epoch": 0.928, + "grad_norm": 0.354414880275706, + "learning_rate": 2.1852399266194314e-06, + "loss": 0.822, + "step": 58 + }, + { + "epoch": 0.944, + "grad_norm": 0.40618551125903773, + "learning_rate": 1.231165940486234e-06, + "loss": 0.8489, + "step": 59 + }, + { + "epoch": 0.96, + "grad_norm": 0.340197098413167, + "learning_rate": 5.478104631726711e-07, + "loss": 0.8971, + "step": 60 + }, + { + "epoch": 0.976, + "grad_norm": 0.30969710317886306, + "learning_rate": 1.3704652454261668e-07, + "loss": 0.8196, + "step": 61 + }, + { + "epoch": 0.992, + "grad_norm": 0.41364963711720143, + "learning_rate": 0.0, + "loss": 0.9221, + "step": 62 + }, + { + "epoch": 0.992, + "step": 62, + "total_flos": 142969741377536.0, + "train_loss": 0.9199699201891499, + "train_runtime": 1832.8279, + "train_samples_per_second": 1.091, + "train_steps_per_second": 0.034 + } + ], + "logging_steps": 1.0, + "max_steps": 62, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 142969741377536.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_2_epochs_1_GA_2_lora/README.md b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_2_epochs_1_GA_2_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_2_epochs_1_GA_2_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_2_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_2_epochs_1_GA_2_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b400cc756ab92d6dd0bbdd0ee89424d3f3eda6bd --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_2_epochs_1_GA_2_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "up_proj", + "down_proj", + "o_proj", + "q_proj", + "k_proj", + "gate_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8bc85224d0cd7e560b907506e636b2b16b5e3b17 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca558bafced7c7b27a9208da4ee3181a6b8d61c75e762ec47d008fe6fbd2140c +size 671150064 diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_2_epochs_1_GA_2_lora/config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_2_epochs_1_GA_2_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_2_epochs_1_GA_2_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..a78cbc4c43ab4be89716e7aa80974519654f631b --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc284ccd6a5a914da3aaf3a30045df73f4add33df7f51fb65b6511edb9adabc5 +size 918507402 diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_2_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_2_epochs_1_GA_2_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..37cbc589e91f12d02d4552c4f67eeb5e4d9d585f --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_2_epochs_1_GA_2_lora/trainer_state.json @@ -0,0 +1,917 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 125, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.008, + "grad_norm": 1.0256564435424111, + "learning_rate": 5e-05, + "loss": 1.4655, + "step": 1 + }, + { + "epoch": 0.016, + "grad_norm": 0.7552355489285593, + "learning_rate": 0.0001, + "loss": 1.2215, + "step": 2 + }, + { + "epoch": 0.024, + "grad_norm": 0.6883786739819368, + "learning_rate": 0.00015000000000000001, + "loss": 1.1932, + "step": 3 + }, + { + "epoch": 0.032, + "grad_norm": 0.7970757164530996, + "learning_rate": 0.0002, + "loss": 1.2414, + "step": 4 + }, + { + "epoch": 0.04, + "grad_norm": 0.900673855041327, + "learning_rate": 0.00019996629653035126, + "loss": 1.1636, + "step": 5 + }, + { + "epoch": 0.048, + "grad_norm": 0.8148780701606347, + "learning_rate": 0.00019986520883988232, + "loss": 1.0844, + "step": 6 + }, + { + "epoch": 0.056, + "grad_norm": 0.677259070559955, + "learning_rate": 0.00019969680506871137, + "loss": 0.9484, + "step": 7 + }, + { + "epoch": 0.064, + "grad_norm": 0.5546823400726815, + "learning_rate": 0.00019946119873266613, + "loss": 0.981, + "step": 8 + }, + { + "epoch": 0.072, + "grad_norm": 0.8640394420809945, + "learning_rate": 0.00019915854864676664, + "loss": 1.0034, + "step": 9 + }, + { + "epoch": 0.08, + "grad_norm": 0.7573926449411329, + "learning_rate": 0.00019878905881817252, + "loss": 1.1, + "step": 10 + }, + { + "epoch": 0.088, + "grad_norm": 0.5785453292954227, + "learning_rate": 0.00019835297830866826, + "loss": 0.8881, + "step": 11 + }, + { + "epoch": 0.096, + "grad_norm": 0.6166851826434284, + "learning_rate": 0.00019785060106677818, + "loss": 0.9721, + "step": 12 + }, + { + "epoch": 0.104, + "grad_norm": 0.5915963542639949, + "learning_rate": 0.00019728226572962473, + "loss": 0.9505, + "step": 13 + }, + { + "epoch": 0.112, + "grad_norm": 0.5694992463916143, + "learning_rate": 0.0001966483553946637, + "loss": 0.9008, + "step": 14 + }, + { + "epoch": 0.12, + "grad_norm": 0.5813339466623496, + "learning_rate": 0.00019594929736144976, + "loss": 0.9677, + "step": 15 + }, + { + "epoch": 0.128, + "grad_norm": 0.631903506762042, + "learning_rate": 0.00019518556284360696, + "loss": 1.0503, + "step": 16 + }, + { + "epoch": 0.136, + "grad_norm": 0.49686139134072715, + "learning_rate": 0.0001943576666511982, + "loss": 0.8793, + "step": 17 + }, + { + "epoch": 0.144, + "grad_norm": 0.5712952636157388, + "learning_rate": 0.0001934661668437073, + "loss": 0.9202, + "step": 18 + }, + { + "epoch": 0.152, + "grad_norm": 0.47221550019952907, + "learning_rate": 0.0001925116643538684, + "loss": 0.9442, + "step": 19 + }, + { + "epoch": 0.16, + "grad_norm": 0.6481338782372316, + "learning_rate": 0.00019149480258259533, + "loss": 0.9982, + "step": 20 + }, + { + "epoch": 0.168, + "grad_norm": 0.49626606548379415, + "learning_rate": 0.00019041626696528503, + "loss": 0.8958, + "step": 21 + }, + { + "epoch": 0.176, + "grad_norm": 0.5263065418122417, + "learning_rate": 0.0001892767845097864, + "loss": 0.9311, + "step": 22 + }, + { + "epoch": 0.184, + "grad_norm": 0.5978993780520426, + "learning_rate": 0.00018807712330634642, + "loss": 0.8855, + "step": 23 + }, + { + "epoch": 0.192, + "grad_norm": 0.48422608170661724, + "learning_rate": 0.0001868180920098644, + "loss": 0.8992, + "step": 24 + }, + { + "epoch": 0.2, + "grad_norm": 0.5583000466790079, + "learning_rate": 0.00018550053929480202, + "loss": 0.9376, + "step": 25 + }, + { + "epoch": 0.208, + "grad_norm": 0.571615352349519, + "learning_rate": 0.00018412535328311814, + "loss": 0.9632, + "step": 26 + }, + { + "epoch": 0.216, + "grad_norm": 0.4227469990522799, + "learning_rate": 0.0001826934609456129, + "loss": 0.7566, + "step": 27 + }, + { + "epoch": 0.224, + "grad_norm": 0.4975002354693412, + "learning_rate": 0.00018120582747708502, + "loss": 0.8079, + "step": 28 + }, + { + "epoch": 0.232, + "grad_norm": 0.5206883816667652, + "learning_rate": 0.0001796634556457236, + "loss": 0.9136, + "step": 29 + }, + { + "epoch": 0.24, + "grad_norm": 0.6640775128452036, + "learning_rate": 0.0001780673851171728, + "loss": 1.002, + "step": 30 + }, + { + "epoch": 0.248, + "grad_norm": 0.7499661444791574, + "learning_rate": 0.00017641869175372493, + "loss": 1.0519, + "step": 31 + }, + { + "epoch": 0.256, + "grad_norm": 0.5344465855257942, + "learning_rate": 0.00017471848688911464, + "loss": 0.8921, + "step": 32 + }, + { + "epoch": 0.264, + "grad_norm": 0.5469745110497275, + "learning_rate": 0.000172967916579403, + "loss": 0.8649, + "step": 33 + }, + { + "epoch": 0.272, + "grad_norm": 0.7278110424470223, + "learning_rate": 0.00017116816083045602, + "loss": 1.0113, + "step": 34 + }, + { + "epoch": 0.28, + "grad_norm": 0.5756491784352804, + "learning_rate": 0.0001693204328025389, + "loss": 0.9774, + "step": 35 + }, + { + "epoch": 0.288, + "grad_norm": 0.4363257043622499, + "learning_rate": 0.00016742597799256182, + "loss": 0.8408, + "step": 36 + }, + { + "epoch": 0.296, + "grad_norm": 0.6394076037709749, + "learning_rate": 0.00016548607339452853, + "loss": 0.9368, + "step": 37 + }, + { + "epoch": 0.304, + "grad_norm": 0.5240474084698316, + "learning_rate": 0.00016350202663875386, + "loss": 0.9945, + "step": 38 + }, + { + "epoch": 0.312, + "grad_norm": 0.47424846415648575, + "learning_rate": 0.0001614751751104301, + "loss": 0.9055, + "step": 39 + }, + { + "epoch": 0.32, + "grad_norm": 0.6321195449640729, + "learning_rate": 0.00015940688504813662, + "loss": 0.9368, + "step": 40 + }, + { + "epoch": 0.328, + "grad_norm": 0.46626876205754336, + "learning_rate": 0.00015729855062290022, + "loss": 0.8828, + "step": 41 + }, + { + "epoch": 0.336, + "grad_norm": 0.4697338340572203, + "learning_rate": 0.00015515159299842707, + "loss": 0.8327, + "step": 42 + }, + { + "epoch": 0.344, + "grad_norm": 0.5418381396777817, + "learning_rate": 0.00015296745937313987, + "loss": 0.8599, + "step": 43 + }, + { + "epoch": 0.352, + "grad_norm": 0.5393492791835172, + "learning_rate": 0.00015074762200466556, + "loss": 0.9383, + "step": 44 + }, + { + "epoch": 0.36, + "grad_norm": 0.4327656946256297, + "learning_rate": 0.00014849357721743168, + "loss": 0.8041, + "step": 45 + }, + { + "epoch": 0.368, + "grad_norm": 0.5974899791865946, + "learning_rate": 0.00014620684439403962, + "loss": 0.9177, + "step": 46 + }, + { + "epoch": 0.376, + "grad_norm": 0.6401617960325086, + "learning_rate": 0.0001438889649510956, + "loss": 0.9027, + "step": 47 + }, + { + "epoch": 0.384, + "grad_norm": 0.5481131970503021, + "learning_rate": 0.00014154150130018866, + "loss": 0.8235, + "step": 48 + }, + { + "epoch": 0.392, + "grad_norm": 0.6073544798834368, + "learning_rate": 0.00013916603579471705, + "loss": 0.9103, + "step": 49 + }, + { + "epoch": 0.4, + "grad_norm": 0.565209541323449, + "learning_rate": 0.000136764169663272, + "loss": 0.7908, + "step": 50 + }, + { + "epoch": 0.408, + "grad_norm": 0.5448502238275968, + "learning_rate": 0.00013433752193029886, + "loss": 0.8594, + "step": 51 + }, + { + "epoch": 0.416, + "grad_norm": 0.5523586951983569, + "learning_rate": 0.00013188772832476188, + "loss": 0.9575, + "step": 52 + }, + { + "epoch": 0.424, + "grad_norm": 0.5250307199317558, + "learning_rate": 0.00012941644017754964, + "loss": 0.9733, + "step": 53 + }, + { + "epoch": 0.432, + "grad_norm": 0.5532536577531625, + "learning_rate": 0.00012692532330836346, + "loss": 0.9504, + "step": 54 + }, + { + "epoch": 0.44, + "grad_norm": 0.5052948111209579, + "learning_rate": 0.00012441605690283915, + "loss": 0.8633, + "step": 55 + }, + { + "epoch": 0.448, + "grad_norm": 0.4805987502443205, + "learning_rate": 0.0001218903323806595, + "loss": 0.8458, + "step": 56 + }, + { + "epoch": 0.456, + "grad_norm": 0.46115274580823334, + "learning_rate": 0.00011934985225541998, + "loss": 0.899, + "step": 57 + }, + { + "epoch": 0.464, + "grad_norm": 0.5760045108097972, + "learning_rate": 0.00011679632898701649, + "loss": 1.0322, + "step": 58 + }, + { + "epoch": 0.472, + "grad_norm": 0.5570620890429253, + "learning_rate": 0.00011423148382732853, + "loss": 0.9192, + "step": 59 + }, + { + "epoch": 0.48, + "grad_norm": 0.3867470289209327, + "learning_rate": 0.00011165704565997593, + "loss": 0.7405, + "step": 60 + }, + { + "epoch": 0.488, + "grad_norm": 0.5035270531240534, + "learning_rate": 0.00010907474983493144, + "loss": 0.8517, + "step": 61 + }, + { + "epoch": 0.496, + "grad_norm": 0.7185476011366976, + "learning_rate": 0.0001064863369987743, + "loss": 1.0731, + "step": 62 + }, + { + "epoch": 0.504, + "grad_norm": 0.5910265699485117, + "learning_rate": 0.00010389355192137377, + "loss": 0.853, + "step": 63 + }, + { + "epoch": 0.512, + "grad_norm": 0.5851423007341354, + "learning_rate": 0.0001012981423197931, + "loss": 0.8727, + "step": 64 + }, + { + "epoch": 0.52, + "grad_norm": 0.7056454049413521, + "learning_rate": 9.870185768020693e-05, + "loss": 0.9447, + "step": 65 + }, + { + "epoch": 0.528, + "grad_norm": 0.4891741063331434, + "learning_rate": 9.610644807862625e-05, + "loss": 0.8942, + "step": 66 + }, + { + "epoch": 0.536, + "grad_norm": 0.5732278436397883, + "learning_rate": 9.35136630012257e-05, + "loss": 0.8911, + "step": 67 + }, + { + "epoch": 0.544, + "grad_norm": 0.485622139957875, + "learning_rate": 9.092525016506858e-05, + "loss": 0.8543, + "step": 68 + }, + { + "epoch": 0.552, + "grad_norm": 0.6907666024932004, + "learning_rate": 8.83429543400241e-05, + "loss": 0.8492, + "step": 69 + }, + { + "epoch": 0.56, + "grad_norm": 0.33329903514610504, + "learning_rate": 8.57685161726715e-05, + "loss": 0.6644, + "step": 70 + }, + { + "epoch": 0.568, + "grad_norm": 0.4918765786267516, + "learning_rate": 8.320367101298351e-05, + "loss": 0.8767, + "step": 71 + }, + { + "epoch": 0.576, + "grad_norm": 0.46601601831477785, + "learning_rate": 8.065014774458003e-05, + "loss": 0.8073, + "step": 72 + }, + { + "epoch": 0.584, + "grad_norm": 0.6301470263304556, + "learning_rate": 7.810966761934053e-05, + "loss": 0.9524, + "step": 73 + }, + { + "epoch": 0.592, + "grad_norm": 0.4644794063278805, + "learning_rate": 7.558394309716088e-05, + "loss": 0.769, + "step": 74 + }, + { + "epoch": 0.6, + "grad_norm": 0.4642745345431068, + "learning_rate": 7.307467669163655e-05, + "loss": 0.8602, + "step": 75 + }, + { + "epoch": 0.608, + "grad_norm": 0.5431808435710237, + "learning_rate": 7.058355982245037e-05, + "loss": 0.9022, + "step": 76 + }, + { + "epoch": 0.616, + "grad_norm": 0.4667366354704192, + "learning_rate": 6.811227167523815e-05, + "loss": 0.8729, + "step": 77 + }, + { + "epoch": 0.624, + "grad_norm": 0.4262573768737331, + "learning_rate": 6.566247806970119e-05, + "loss": 0.856, + "step": 78 + }, + { + "epoch": 0.632, + "grad_norm": 0.4153950446060949, + "learning_rate": 6.323583033672799e-05, + "loss": 0.7869, + "step": 79 + }, + { + "epoch": 0.64, + "grad_norm": 0.7707225868989106, + "learning_rate": 6.083396420528298e-05, + "loss": 1.131, + "step": 80 + }, + { + "epoch": 0.648, + "grad_norm": 0.4561390581971247, + "learning_rate": 5.845849869981137e-05, + "loss": 0.7539, + "step": 81 + }, + { + "epoch": 0.656, + "grad_norm": 0.4794829311105941, + "learning_rate": 5.611103504890444e-05, + "loss": 0.786, + "step": 82 + }, + { + "epoch": 0.664, + "grad_norm": 0.4895576383157025, + "learning_rate": 5.379315560596038e-05, + "loss": 0.8944, + "step": 83 + }, + { + "epoch": 0.672, + "grad_norm": 0.4273926670520935, + "learning_rate": 5.1506422782568345e-05, + "loss": 0.8019, + "step": 84 + }, + { + "epoch": 0.68, + "grad_norm": 0.556449141244985, + "learning_rate": 4.9252377995334444e-05, + "loss": 0.8711, + "step": 85 + }, + { + "epoch": 0.688, + "grad_norm": 0.47015114999902236, + "learning_rate": 4.703254062686017e-05, + "loss": 0.8838, + "step": 86 + }, + { + "epoch": 0.696, + "grad_norm": 0.5704985671822204, + "learning_rate": 4.484840700157295e-05, + "loss": 0.8481, + "step": 87 + }, + { + "epoch": 0.704, + "grad_norm": 0.4365452710110725, + "learning_rate": 4.270144937709981e-05, + "loss": 0.8386, + "step": 88 + }, + { + "epoch": 0.712, + "grad_norm": 0.391305650590876, + "learning_rate": 4.059311495186338e-05, + "loss": 0.8043, + "step": 89 + }, + { + "epoch": 0.72, + "grad_norm": 0.6824405839379895, + "learning_rate": 3.852482488956992e-05, + "loss": 0.9423, + "step": 90 + }, + { + "epoch": 0.728, + "grad_norm": 0.660416355913274, + "learning_rate": 3.649797336124615e-05, + "loss": 0.8834, + "step": 91 + }, + { + "epoch": 0.736, + "grad_norm": 0.6227410059725017, + "learning_rate": 3.45139266054715e-05, + "loss": 0.9125, + "step": 92 + }, + { + "epoch": 0.744, + "grad_norm": 0.4583331907499313, + "learning_rate": 3.257402200743821e-05, + "loss": 0.8201, + "step": 93 + }, + { + "epoch": 0.752, + "grad_norm": 0.5392615246175726, + "learning_rate": 3.0679567197461134e-05, + "loss": 0.7673, + "step": 94 + }, + { + "epoch": 0.76, + "grad_norm": 0.4023904332506835, + "learning_rate": 2.8831839169543996e-05, + "loss": 0.6999, + "step": 95 + }, + { + "epoch": 0.768, + "grad_norm": 0.48547058856396635, + "learning_rate": 2.7032083420597e-05, + "loss": 0.885, + "step": 96 + }, + { + "epoch": 0.776, + "grad_norm": 0.4544749425094669, + "learning_rate": 2.528151311088537e-05, + "loss": 0.792, + "step": 97 + }, + { + "epoch": 0.784, + "grad_norm": 0.48299342465590717, + "learning_rate": 2.3581308246275103e-05, + "loss": 0.7639, + "step": 98 + }, + { + "epoch": 0.792, + "grad_norm": 0.39134288979882637, + "learning_rate": 2.1932614882827197e-05, + "loss": 0.7161, + "step": 99 + }, + { + "epoch": 0.8, + "grad_norm": 0.5137604085244599, + "learning_rate": 2.03365443542764e-05, + "loss": 0.8608, + "step": 100 + }, + { + "epoch": 0.808, + "grad_norm": 0.47536392660264287, + "learning_rate": 1.879417252291502e-05, + "loss": 0.7838, + "step": 101 + }, + { + "epoch": 0.816, + "grad_norm": 0.564710086900722, + "learning_rate": 1.730653905438714e-05, + "loss": 0.9152, + "step": 102 + }, + { + "epoch": 0.824, + "grad_norm": 0.4969437135139688, + "learning_rate": 1.587464671688187e-05, + "loss": 0.8222, + "step": 103 + }, + { + "epoch": 0.832, + "grad_norm": 0.49254817501894227, + "learning_rate": 1.4499460705197998e-05, + "loss": 0.9312, + "step": 104 + }, + { + "epoch": 0.84, + "grad_norm": 0.48952543980152713, + "learning_rate": 1.3181907990135622e-05, + "loss": 0.8452, + "step": 105 + }, + { + "epoch": 0.848, + "grad_norm": 0.4424646360215586, + "learning_rate": 1.1922876693653585e-05, + "loss": 0.8221, + "step": 106 + }, + { + "epoch": 0.856, + "grad_norm": 0.4175512352633237, + "learning_rate": 1.0723215490213634e-05, + "loss": 0.7827, + "step": 107 + }, + { + "epoch": 0.864, + "grad_norm": 0.5831510931897785, + "learning_rate": 9.583733034714981e-06, + "loss": 0.9001, + "step": 108 + }, + { + "epoch": 0.872, + "grad_norm": 0.5059003584438967, + "learning_rate": 8.505197417404687e-06, + "loss": 0.8749, + "step": 109 + }, + { + "epoch": 0.88, + "grad_norm": 0.4736929126239816, + "learning_rate": 7.488335646131628e-06, + "loss": 0.7588, + "step": 110 + }, + { + "epoch": 0.888, + "grad_norm": 0.430356609626266, + "learning_rate": 6.533833156292679e-06, + "loss": 0.7268, + "step": 111 + }, + { + "epoch": 0.896, + "grad_norm": 0.46958716530124167, + "learning_rate": 5.6423333488018095e-06, + "loss": 0.8987, + "step": 112 + }, + { + "epoch": 0.904, + "grad_norm": 0.4162473319543384, + "learning_rate": 4.8144371563930476e-06, + "loss": 0.8112, + "step": 113 + }, + { + "epoch": 0.912, + "grad_norm": 0.5341610230495429, + "learning_rate": 4.050702638550275e-06, + "loss": 0.9766, + "step": 114 + }, + { + "epoch": 0.92, + "grad_norm": 0.5720919227278933, + "learning_rate": 3.3516446053363015e-06, + "loss": 0.8654, + "step": 115 + }, + { + "epoch": 0.928, + "grad_norm": 0.4560340817159988, + "learning_rate": 2.717734270375272e-06, + "loss": 0.776, + "step": 116 + }, + { + "epoch": 0.936, + "grad_norm": 0.6335819308236421, + "learning_rate": 2.1493989332218468e-06, + "loss": 0.8167, + "step": 117 + }, + { + "epoch": 0.944, + "grad_norm": 0.43926428679607193, + "learning_rate": 1.6470216913317626e-06, + "loss": 0.8853, + "step": 118 + }, + { + "epoch": 0.952, + "grad_norm": 0.4925124228161911, + "learning_rate": 1.2109411818274852e-06, + "loss": 0.883, + "step": 119 + }, + { + "epoch": 0.96, + "grad_norm": 0.41754865671962843, + "learning_rate": 8.41451353233369e-07, + "loss": 0.8502, + "step": 120 + }, + { + "epoch": 0.968, + "grad_norm": 0.47875102943981157, + "learning_rate": 5.388012673338661e-07, + "loss": 0.9114, + "step": 121 + }, + { + "epoch": 0.976, + "grad_norm": 0.3997925650898424, + "learning_rate": 3.0319493128866396e-07, + "loss": 0.7552, + "step": 122 + }, + { + "epoch": 0.984, + "grad_norm": 0.6125224289311584, + "learning_rate": 1.3479116011769767e-07, + "loss": 0.9852, + "step": 123 + }, + { + "epoch": 0.992, + "grad_norm": 0.4513526909585683, + "learning_rate": 3.370346964876036e-08, + "loss": 0.8328, + "step": 124 + }, + { + "epoch": 1.0, + "grad_norm": 0.5018800693252005, + "learning_rate": 0.0, + "loss": 0.7772, + "step": 125 + }, + { + "epoch": 1.0, + "step": 125, + "total_flos": 102017789034496.0, + "train_loss": 0.8980674777030945, + "train_runtime": 1842.54, + "train_samples_per_second": 1.085, + "train_steps_per_second": 0.068 + } + ], + "logging_steps": 1.0, + "max_steps": 125, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 102017789034496.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_2_epochs_1_GA_4_lora/README.md b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_2_epochs_1_GA_4_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_2_epochs_1_GA_4_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_2_epochs_1_GA_4_lora/adapter_config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_2_epochs_1_GA_4_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..00f124d2409ee80e0a03741e688c81d3af18b475 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_2_epochs_1_GA_4_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "k_proj", + "o_proj", + "q_proj", + "up_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_2_epochs_1_GA_4_lora/adapter_model.safetensors b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_2_epochs_1_GA_4_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9832d429f53322da94e68110a0081c2e8aff015b --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_2_epochs_1_GA_4_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26036b497c88ae220df32bcfed67582fd83decba854b1fbb7306af08bba9cc9a +size 671150064 diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_2_epochs_1_GA_4_lora/config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_2_epochs_1_GA_4_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_2_epochs_1_GA_4_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_2_epochs_1_GA_4_lora/non_lora_trainables.bin b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_2_epochs_1_GA_4_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..f53428b1fc659506742fbd14baa7edaea389476e --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_2_epochs_1_GA_4_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a55781e86d42bff8bcbd7ff9f97ecbb1b2e38408ccabd43d24ee3686edaad85 +size 918507402 diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_2_epochs_1_GA_4_lora/trainer_state.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_2_epochs_1_GA_4_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a4101fcf05ffb025b481e90ecab1245ff5a0b891 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_2_epochs_1_GA_4_lora/trainer_state.json @@ -0,0 +1,476 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.992, + "eval_steps": 500, + "global_step": 62, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.016, + "grad_norm": 0.782165330544756, + "learning_rate": 0.0001, + "loss": 1.3435, + "step": 1 + }, + { + "epoch": 0.032, + "grad_norm": 0.754070307459071, + "learning_rate": 0.0002, + "loss": 1.3293, + "step": 2 + }, + { + "epoch": 0.048, + "grad_norm": 0.5461919719204326, + "learning_rate": 0.0001998629534754574, + "loss": 1.1966, + "step": 3 + }, + { + "epoch": 0.064, + "grad_norm": 0.9835561940313817, + "learning_rate": 0.00019945218953682734, + "loss": 1.114, + "step": 4 + }, + { + "epoch": 0.08, + "grad_norm": 0.9635137567561632, + "learning_rate": 0.00019876883405951377, + "loss": 1.129, + "step": 5 + }, + { + "epoch": 0.096, + "grad_norm": 0.45232540275106836, + "learning_rate": 0.00019781476007338058, + "loss": 0.9551, + "step": 6 + }, + { + "epoch": 0.112, + "grad_norm": 0.46885643899795243, + "learning_rate": 0.00019659258262890683, + "loss": 0.9536, + "step": 7 + }, + { + "epoch": 0.128, + "grad_norm": 0.47240411115859143, + "learning_rate": 0.00019510565162951537, + "loss": 1.0253, + "step": 8 + }, + { + "epoch": 0.144, + "grad_norm": 0.469293314652227, + "learning_rate": 0.00019335804264972018, + "loss": 0.9232, + "step": 9 + }, + { + "epoch": 0.16, + "grad_norm": 0.5104546446757983, + "learning_rate": 0.0001913545457642601, + "loss": 0.985, + "step": 10 + }, + { + "epoch": 0.176, + "grad_norm": 0.4601532263027663, + "learning_rate": 0.0001891006524188368, + "loss": 0.9291, + "step": 11 + }, + { + "epoch": 0.192, + "grad_norm": 0.4486806246135983, + "learning_rate": 0.00018660254037844388, + "loss": 0.9031, + "step": 12 + }, + { + "epoch": 0.208, + "grad_norm": 0.4571343318182727, + "learning_rate": 0.00018386705679454242, + "loss": 0.9608, + "step": 13 + }, + { + "epoch": 0.224, + "grad_norm": 0.36898115931198266, + "learning_rate": 0.00018090169943749476, + "loss": 0.7977, + "step": 14 + }, + { + "epoch": 0.24, + "grad_norm": 0.45407015287167707, + "learning_rate": 0.0001777145961456971, + "loss": 0.9692, + "step": 15 + }, + { + "epoch": 0.256, + "grad_norm": 0.4624428629659114, + "learning_rate": 0.00017431448254773944, + "loss": 0.9795, + "step": 16 + }, + { + "epoch": 0.272, + "grad_norm": 0.43933391342487677, + "learning_rate": 0.00017071067811865476, + "loss": 0.9406, + "step": 17 + }, + { + "epoch": 0.288, + "grad_norm": 0.38659300160601445, + "learning_rate": 0.00016691306063588583, + "loss": 0.9119, + "step": 18 + }, + { + "epoch": 0.304, + "grad_norm": 0.44012941016542223, + "learning_rate": 0.00016293203910498376, + "loss": 0.9715, + "step": 19 + }, + { + "epoch": 0.32, + "grad_norm": 0.41558944105943874, + "learning_rate": 0.00015877852522924732, + "loss": 0.9331, + "step": 20 + }, + { + "epoch": 0.336, + "grad_norm": 0.37477800540519907, + "learning_rate": 0.00015446390350150273, + "loss": 0.8592, + "step": 21 + }, + { + "epoch": 0.352, + "grad_norm": 0.3865244154226343, + "learning_rate": 0.00015000000000000001, + "loss": 0.9036, + "step": 22 + }, + { + "epoch": 0.368, + "grad_norm": 0.3723977046786962, + "learning_rate": 0.00014539904997395468, + "loss": 0.8628, + "step": 23 + }, + { + "epoch": 0.384, + "grad_norm": 0.5174938737735406, + "learning_rate": 0.00014067366430758004, + "loss": 0.8709, + "step": 24 + }, + { + "epoch": 0.4, + "grad_norm": 0.4309191480685511, + "learning_rate": 0.00013583679495453, + "loss": 0.8449, + "step": 25 + }, + { + "epoch": 0.416, + "grad_norm": 0.3858404886322895, + "learning_rate": 0.00013090169943749476, + "loss": 0.9132, + "step": 26 + }, + { + "epoch": 0.432, + "grad_norm": 0.3947316248290328, + "learning_rate": 0.00012588190451025207, + "loss": 0.9646, + "step": 27 + }, + { + "epoch": 0.448, + "grad_norm": 0.3732869809484617, + "learning_rate": 0.00012079116908177593, + "loss": 0.8615, + "step": 28 + }, + { + "epoch": 0.464, + "grad_norm": 0.40598163636604373, + "learning_rate": 0.0001156434465040231, + "loss": 0.9674, + "step": 29 + }, + { + "epoch": 0.48, + "grad_norm": 0.3805757812339103, + "learning_rate": 0.00011045284632676536, + "loss": 0.8304, + "step": 30 + }, + { + "epoch": 0.496, + "grad_norm": 0.43863699893987973, + "learning_rate": 0.0001052335956242944, + "loss": 0.9668, + "step": 31 + }, + { + "epoch": 0.512, + "grad_norm": 0.4098167304825637, + "learning_rate": 0.0001, + "loss": 0.8626, + "step": 32 + }, + { + "epoch": 0.528, + "grad_norm": 0.4571546550706454, + "learning_rate": 9.476640437570562e-05, + "loss": 0.9252, + "step": 33 + }, + { + "epoch": 0.544, + "grad_norm": 0.4084843884323696, + "learning_rate": 8.954715367323468e-05, + "loss": 0.8752, + "step": 34 + }, + { + "epoch": 0.56, + "grad_norm": 0.3636969358131221, + "learning_rate": 8.435655349597689e-05, + "loss": 0.7612, + "step": 35 + }, + { + "epoch": 0.576, + "grad_norm": 0.35072568532082526, + "learning_rate": 7.920883091822408e-05, + "loss": 0.847, + "step": 36 + }, + { + "epoch": 0.592, + "grad_norm": 0.4076963271445678, + "learning_rate": 7.411809548974792e-05, + "loss": 0.8668, + "step": 37 + }, + { + "epoch": 0.608, + "grad_norm": 0.3794362951488034, + "learning_rate": 6.909830056250527e-05, + "loss": 0.887, + "step": 38 + }, + { + "epoch": 0.624, + "grad_norm": 0.35306265395704695, + "learning_rate": 6.416320504546997e-05, + "loss": 0.8711, + "step": 39 + }, + { + "epoch": 0.64, + "grad_norm": 0.4539374345769442, + "learning_rate": 5.9326335692419995e-05, + "loss": 0.9706, + "step": 40 + }, + { + "epoch": 0.656, + "grad_norm": 0.3460724552816038, + "learning_rate": 5.4600950026045326e-05, + "loss": 0.7806, + "step": 41 + }, + { + "epoch": 0.672, + "grad_norm": 0.33225221207720107, + "learning_rate": 5.000000000000002e-05, + "loss": 0.8534, + "step": 42 + }, + { + "epoch": 0.688, + "grad_norm": 0.3773553326234811, + "learning_rate": 4.5536096498497295e-05, + "loss": 0.8824, + "step": 43 + }, + { + "epoch": 0.704, + "grad_norm": 0.3721320667635631, + "learning_rate": 4.12214747707527e-05, + "loss": 0.8505, + "step": 44 + }, + { + "epoch": 0.72, + "grad_norm": 0.3952460893667804, + "learning_rate": 3.7067960895016275e-05, + "loss": 0.8821, + "step": 45 + }, + { + "epoch": 0.736, + "grad_norm": 0.4583596405040786, + "learning_rate": 3.308693936411421e-05, + "loss": 0.9099, + "step": 46 + }, + { + "epoch": 0.752, + "grad_norm": 0.34318822344159067, + "learning_rate": 2.9289321881345254e-05, + "loss": 0.8046, + "step": 47 + }, + { + "epoch": 0.768, + "grad_norm": 0.3974864621077287, + "learning_rate": 2.5685517452260567e-05, + "loss": 0.8029, + "step": 48 + }, + { + "epoch": 0.784, + "grad_norm": 0.33691562279115866, + "learning_rate": 2.2285403854302912e-05, + "loss": 0.7858, + "step": 49 + }, + { + "epoch": 0.8, + "grad_norm": 0.3468959629694207, + "learning_rate": 1.9098300562505266e-05, + "loss": 0.7976, + "step": 50 + }, + { + "epoch": 0.816, + "grad_norm": 0.4023248117346016, + "learning_rate": 1.6132943205457606e-05, + "loss": 0.8573, + "step": 51 + }, + { + "epoch": 0.832, + "grad_norm": 0.3788853906081972, + "learning_rate": 1.339745962155613e-05, + "loss": 0.8905, + "step": 52 + }, + { + "epoch": 0.848, + "grad_norm": 0.33720185569187455, + "learning_rate": 1.0899347581163221e-05, + "loss": 0.8469, + "step": 53 + }, + { + "epoch": 0.864, + "grad_norm": 0.3739374025884539, + "learning_rate": 8.645454235739903e-06, + "loss": 0.8505, + "step": 54 + }, + { + "epoch": 0.88, + "grad_norm": 0.3505596171112871, + "learning_rate": 6.6419573502798374e-06, + "loss": 0.8288, + "step": 55 + }, + { + "epoch": 0.896, + "grad_norm": 0.3417938856878257, + "learning_rate": 4.8943483704846475e-06, + "loss": 0.822, + "step": 56 + }, + { + "epoch": 0.912, + "grad_norm": 0.3688386882858889, + "learning_rate": 3.40741737109318e-06, + "loss": 0.9049, + "step": 57 + }, + { + "epoch": 0.928, + "grad_norm": 0.3665705430510355, + "learning_rate": 2.1852399266194314e-06, + "loss": 0.8364, + "step": 58 + }, + { + "epoch": 0.944, + "grad_norm": 0.37903546324196835, + "learning_rate": 1.231165940486234e-06, + "loss": 0.864, + "step": 59 + }, + { + "epoch": 0.96, + "grad_norm": 0.33909260997680446, + "learning_rate": 5.478104631726711e-07, + "loss": 0.8762, + "step": 60 + }, + { + "epoch": 0.976, + "grad_norm": 0.3213467139172359, + "learning_rate": 1.3704652454261668e-07, + "loss": 0.8468, + "step": 61 + }, + { + "epoch": 0.992, + "grad_norm": 0.3981209076516244, + "learning_rate": 0.0, + "loss": 0.9223, + "step": 62 + }, + { + "epoch": 0.992, + "step": 62, + "total_flos": 146045366304768.0, + "train_loss": 0.9138678091187631, + "train_runtime": 1818.899, + "train_samples_per_second": 1.1, + "train_steps_per_second": 0.034 + } + ], + "logging_steps": 1.0, + "max_steps": 62, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 146045366304768.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_3_epochs_1_GA_2_lora/README.md b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_3_epochs_1_GA_2_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_3_epochs_1_GA_2_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_3_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_3_epochs_1_GA_2_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..505d4ac1d30fa33adf5f438cc8f88c29ee45a54c --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_3_epochs_1_GA_2_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "q_proj", + "up_proj", + "o_proj", + "v_proj", + "down_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..dfabb4564d1142a817d39e880f869095d8b7fa46 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30a3447fe5b036a4f8feb09dcd886357112019ee0b7e6259eb0b48aeae510bfe +size 671150064 diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_3_epochs_1_GA_2_lora/config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_3_epochs_1_GA_2_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_3_epochs_1_GA_2_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..fdb0860c5097fc8eda2d1915d07515d5c4e36d7f --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4087778e60c19a0c01403e8aea7fb0ce9d0d3525e435d56f9ea21771a9c2aaa5 +size 918507402 diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_3_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_3_epochs_1_GA_2_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c936d5f48a57435cebbfb7dcbc1584e2d29abfdb --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_3_epochs_1_GA_2_lora/trainer_state.json @@ -0,0 +1,917 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 125, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.008, + "grad_norm": 1.0332716441410472, + "learning_rate": 5e-05, + "loss": 1.453, + "step": 1 + }, + { + "epoch": 0.016, + "grad_norm": 0.8514892777847967, + "learning_rate": 0.0001, + "loss": 1.3234, + "step": 2 + }, + { + "epoch": 0.024, + "grad_norm": 0.6959198723787812, + "learning_rate": 0.00015000000000000001, + "loss": 1.2067, + "step": 3 + }, + { + "epoch": 0.032, + "grad_norm": 0.8987235917741394, + "learning_rate": 0.0002, + "loss": 1.1664, + "step": 4 + }, + { + "epoch": 0.04, + "grad_norm": 1.1299456614468177, + "learning_rate": 0.00019996629653035126, + "loss": 1.1911, + "step": 5 + }, + { + "epoch": 0.048, + "grad_norm": 0.793226445782373, + "learning_rate": 0.00019986520883988232, + "loss": 1.1325, + "step": 6 + }, + { + "epoch": 0.056, + "grad_norm": 0.9253618295353283, + "learning_rate": 0.00019969680506871137, + "loss": 1.0616, + "step": 7 + }, + { + "epoch": 0.064, + "grad_norm": 0.5422074768572798, + "learning_rate": 0.00019946119873266613, + "loss": 0.9376, + "step": 8 + }, + { + "epoch": 0.072, + "grad_norm": 0.7037637052231442, + "learning_rate": 0.00019915854864676664, + "loss": 1.1079, + "step": 9 + }, + { + "epoch": 0.08, + "grad_norm": 0.7361037631831095, + "learning_rate": 0.00019878905881817252, + "loss": 1.0008, + "step": 10 + }, + { + "epoch": 0.088, + "grad_norm": 0.5450700968763571, + "learning_rate": 0.00019835297830866826, + "loss": 0.8945, + "step": 11 + }, + { + "epoch": 0.096, + "grad_norm": 0.6663281248179639, + "learning_rate": 0.00019785060106677818, + "loss": 1.0249, + "step": 12 + }, + { + "epoch": 0.104, + "grad_norm": 0.5780218413581901, + "learning_rate": 0.00019728226572962473, + "loss": 0.9906, + "step": 13 + }, + { + "epoch": 0.112, + "grad_norm": 0.5939027347752556, + "learning_rate": 0.0001966483553946637, + "loss": 0.8611, + "step": 14 + }, + { + "epoch": 0.12, + "grad_norm": 0.5385064078264016, + "learning_rate": 0.00019594929736144976, + "loss": 0.9205, + "step": 15 + }, + { + "epoch": 0.128, + "grad_norm": 0.5649023377388397, + "learning_rate": 0.00019518556284360696, + "loss": 0.9865, + "step": 16 + }, + { + "epoch": 0.136, + "grad_norm": 0.4899049124834227, + "learning_rate": 0.0001943576666511982, + "loss": 0.9336, + "step": 17 + }, + { + "epoch": 0.144, + "grad_norm": 0.5134066015087574, + "learning_rate": 0.0001934661668437073, + "loss": 0.8971, + "step": 18 + }, + { + "epoch": 0.152, + "grad_norm": 0.49291923410925337, + "learning_rate": 0.0001925116643538684, + "loss": 0.865, + "step": 19 + }, + { + "epoch": 0.16, + "grad_norm": 0.5925651438966584, + "learning_rate": 0.00019149480258259533, + "loss": 0.9769, + "step": 20 + }, + { + "epoch": 0.168, + "grad_norm": 0.5320287155420453, + "learning_rate": 0.00019041626696528503, + "loss": 0.9529, + "step": 21 + }, + { + "epoch": 0.176, + "grad_norm": 0.5644447042868125, + "learning_rate": 0.0001892767845097864, + "loss": 0.9704, + "step": 22 + }, + { + "epoch": 0.184, + "grad_norm": 0.614822477245167, + "learning_rate": 0.00018807712330634642, + "loss": 0.9425, + "step": 23 + }, + { + "epoch": 0.192, + "grad_norm": 0.49909622212960736, + "learning_rate": 0.0001868180920098644, + "loss": 0.9296, + "step": 24 + }, + { + "epoch": 0.2, + "grad_norm": 0.5414158089755281, + "learning_rate": 0.00018550053929480202, + "loss": 1.0019, + "step": 25 + }, + { + "epoch": 0.208, + "grad_norm": 0.6410161194084684, + "learning_rate": 0.00018412535328311814, + "loss": 0.9462, + "step": 26 + }, + { + "epoch": 0.216, + "grad_norm": 0.5017741156898022, + "learning_rate": 0.0001826934609456129, + "loss": 0.7758, + "step": 27 + }, + { + "epoch": 0.224, + "grad_norm": 0.5629101098935121, + "learning_rate": 0.00018120582747708502, + "loss": 0.9592, + "step": 28 + }, + { + "epoch": 0.232, + "grad_norm": 0.7337176164220769, + "learning_rate": 0.0001796634556457236, + "loss": 1.0029, + "step": 29 + }, + { + "epoch": 0.24, + "grad_norm": 0.6164075654413312, + "learning_rate": 0.0001780673851171728, + "loss": 0.9299, + "step": 30 + }, + { + "epoch": 0.248, + "grad_norm": 0.7419949695157417, + "learning_rate": 0.00017641869175372493, + "loss": 1.0221, + "step": 31 + }, + { + "epoch": 0.256, + "grad_norm": 0.5299112205217898, + "learning_rate": 0.00017471848688911464, + "loss": 0.9308, + "step": 32 + }, + { + "epoch": 0.264, + "grad_norm": 0.5776412989569846, + "learning_rate": 0.000172967916579403, + "loss": 0.9942, + "step": 33 + }, + { + "epoch": 0.272, + "grad_norm": 0.6313052845956789, + "learning_rate": 0.00017116816083045602, + "loss": 1.0971, + "step": 34 + }, + { + "epoch": 0.28, + "grad_norm": 0.5003061050107913, + "learning_rate": 0.0001693204328025389, + "loss": 0.8396, + "step": 35 + }, + { + "epoch": 0.288, + "grad_norm": 0.4591443010727368, + "learning_rate": 0.00016742597799256182, + "loss": 0.7597, + "step": 36 + }, + { + "epoch": 0.296, + "grad_norm": 0.5911488491446177, + "learning_rate": 0.00016548607339452853, + "loss": 0.8529, + "step": 37 + }, + { + "epoch": 0.304, + "grad_norm": 0.5989227409817768, + "learning_rate": 0.00016350202663875386, + "loss": 0.9804, + "step": 38 + }, + { + "epoch": 0.312, + "grad_norm": 0.563063337709486, + "learning_rate": 0.0001614751751104301, + "loss": 0.9114, + "step": 39 + }, + { + "epoch": 0.32, + "grad_norm": 0.730771230167327, + "learning_rate": 0.00015940688504813662, + "loss": 0.9354, + "step": 40 + }, + { + "epoch": 0.328, + "grad_norm": 0.4639652152528779, + "learning_rate": 0.00015729855062290022, + "loss": 0.8546, + "step": 41 + }, + { + "epoch": 0.336, + "grad_norm": 0.46037957857450756, + "learning_rate": 0.00015515159299842707, + "loss": 0.8659, + "step": 42 + }, + { + "epoch": 0.344, + "grad_norm": 0.5473058205736925, + "learning_rate": 0.00015296745937313987, + "loss": 0.8318, + "step": 43 + }, + { + "epoch": 0.352, + "grad_norm": 0.46545563704205734, + "learning_rate": 0.00015074762200466556, + "loss": 0.9029, + "step": 44 + }, + { + "epoch": 0.36, + "grad_norm": 0.46677463624164733, + "learning_rate": 0.00014849357721743168, + "loss": 0.8779, + "step": 45 + }, + { + "epoch": 0.368, + "grad_norm": 0.6437448823503382, + "learning_rate": 0.00014620684439403962, + "loss": 0.9853, + "step": 46 + }, + { + "epoch": 0.376, + "grad_norm": 0.5912726223579762, + "learning_rate": 0.0001438889649510956, + "loss": 0.9654, + "step": 47 + }, + { + "epoch": 0.384, + "grad_norm": 0.5474299182817363, + "learning_rate": 0.00014154150130018866, + "loss": 0.8757, + "step": 48 + }, + { + "epoch": 0.392, + "grad_norm": 0.6580789970255577, + "learning_rate": 0.00013916603579471705, + "loss": 0.8612, + "step": 49 + }, + { + "epoch": 0.4, + "grad_norm": 0.5334122921747871, + "learning_rate": 0.000136764169663272, + "loss": 0.8773, + "step": 50 + }, + { + "epoch": 0.408, + "grad_norm": 0.6562301905218774, + "learning_rate": 0.00013433752193029886, + "loss": 0.9756, + "step": 51 + }, + { + "epoch": 0.416, + "grad_norm": 0.5101667749739541, + "learning_rate": 0.00013188772832476188, + "loss": 0.928, + "step": 52 + }, + { + "epoch": 0.424, + "grad_norm": 0.5565479940344975, + "learning_rate": 0.00012941644017754964, + "loss": 1.0204, + "step": 53 + }, + { + "epoch": 0.432, + "grad_norm": 0.5638515277819136, + "learning_rate": 0.00012692532330836346, + "loss": 0.9225, + "step": 54 + }, + { + "epoch": 0.44, + "grad_norm": 0.43460188063783134, + "learning_rate": 0.00012441605690283915, + "loss": 0.7718, + "step": 55 + }, + { + "epoch": 0.448, + "grad_norm": 0.4554073002029318, + "learning_rate": 0.0001218903323806595, + "loss": 0.8008, + "step": 56 + }, + { + "epoch": 0.456, + "grad_norm": 0.5208371009471047, + "learning_rate": 0.00011934985225541998, + "loss": 0.8889, + "step": 57 + }, + { + "epoch": 0.464, + "grad_norm": 0.5593229063063273, + "learning_rate": 0.00011679632898701649, + "loss": 0.9384, + "step": 58 + }, + { + "epoch": 0.472, + "grad_norm": 0.54960098782361, + "learning_rate": 0.00011423148382732853, + "loss": 0.9036, + "step": 59 + }, + { + "epoch": 0.48, + "grad_norm": 0.48884855977815644, + "learning_rate": 0.00011165704565997593, + "loss": 0.8171, + "step": 60 + }, + { + "epoch": 0.488, + "grad_norm": 0.46435529530768666, + "learning_rate": 0.00010907474983493144, + "loss": 0.8391, + "step": 61 + }, + { + "epoch": 0.496, + "grad_norm": 0.6628142660135761, + "learning_rate": 0.0001064863369987743, + "loss": 0.8476, + "step": 62 + }, + { + "epoch": 0.504, + "grad_norm": 0.6122798204602851, + "learning_rate": 0.00010389355192137377, + "loss": 0.88, + "step": 63 + }, + { + "epoch": 0.512, + "grad_norm": 0.49046332010661803, + "learning_rate": 0.0001012981423197931, + "loss": 0.8288, + "step": 64 + }, + { + "epoch": 0.52, + "grad_norm": 0.6136093202470114, + "learning_rate": 9.870185768020693e-05, + "loss": 1.0322, + "step": 65 + }, + { + "epoch": 0.528, + "grad_norm": 0.5382687359600139, + "learning_rate": 9.610644807862625e-05, + "loss": 0.9218, + "step": 66 + }, + { + "epoch": 0.536, + "grad_norm": 0.5377382533156001, + "learning_rate": 9.35136630012257e-05, + "loss": 1.024, + "step": 67 + }, + { + "epoch": 0.544, + "grad_norm": 0.5201987253132132, + "learning_rate": 9.092525016506858e-05, + "loss": 0.9424, + "step": 68 + }, + { + "epoch": 0.552, + "grad_norm": 0.7127669438364177, + "learning_rate": 8.83429543400241e-05, + "loss": 0.8807, + "step": 69 + }, + { + "epoch": 0.56, + "grad_norm": 0.37575821411555355, + "learning_rate": 8.57685161726715e-05, + "loss": 0.7847, + "step": 70 + }, + { + "epoch": 0.568, + "grad_norm": 0.49383393324752206, + "learning_rate": 8.320367101298351e-05, + "loss": 0.8894, + "step": 71 + }, + { + "epoch": 0.576, + "grad_norm": 0.523020599533793, + "learning_rate": 8.065014774458003e-05, + "loss": 0.8799, + "step": 72 + }, + { + "epoch": 0.584, + "grad_norm": 0.5727461338955292, + "learning_rate": 7.810966761934053e-05, + "loss": 0.9956, + "step": 73 + }, + { + "epoch": 0.592, + "grad_norm": 0.5591412806115048, + "learning_rate": 7.558394309716088e-05, + "loss": 0.8745, + "step": 74 + }, + { + "epoch": 0.6, + "grad_norm": 0.4893113572410945, + "learning_rate": 7.307467669163655e-05, + "loss": 0.8439, + "step": 75 + }, + { + "epoch": 0.608, + "grad_norm": 0.4501928716993276, + "learning_rate": 7.058355982245037e-05, + "loss": 0.8346, + "step": 76 + }, + { + "epoch": 0.616, + "grad_norm": 0.4411794345564976, + "learning_rate": 6.811227167523815e-05, + "loss": 0.7823, + "step": 77 + }, + { + "epoch": 0.624, + "grad_norm": 0.4815928556011765, + "learning_rate": 6.566247806970119e-05, + "loss": 0.9199, + "step": 78 + }, + { + "epoch": 0.632, + "grad_norm": 0.45213927151139305, + "learning_rate": 6.323583033672799e-05, + "loss": 0.7929, + "step": 79 + }, + { + "epoch": 0.64, + "grad_norm": 0.8202385268416413, + "learning_rate": 6.083396420528298e-05, + "loss": 1.1149, + "step": 80 + }, + { + "epoch": 0.648, + "grad_norm": 0.43244185742587343, + "learning_rate": 5.845849869981137e-05, + "loss": 0.7356, + "step": 81 + }, + { + "epoch": 0.656, + "grad_norm": 0.4645090449706881, + "learning_rate": 5.611103504890444e-05, + "loss": 0.8437, + "step": 82 + }, + { + "epoch": 0.664, + "grad_norm": 0.43524775119682985, + "learning_rate": 5.379315560596038e-05, + "loss": 0.8441, + "step": 83 + }, + { + "epoch": 0.672, + "grad_norm": 0.4929646970860245, + "learning_rate": 5.1506422782568345e-05, + "loss": 0.8442, + "step": 84 + }, + { + "epoch": 0.68, + "grad_norm": 0.5681369158524896, + "learning_rate": 4.9252377995334444e-05, + "loss": 0.939, + "step": 85 + }, + { + "epoch": 0.688, + "grad_norm": 0.49771416915403976, + "learning_rate": 4.703254062686017e-05, + "loss": 0.8819, + "step": 86 + }, + { + "epoch": 0.696, + "grad_norm": 0.6441986856235902, + "learning_rate": 4.484840700157295e-05, + "loss": 0.9392, + "step": 87 + }, + { + "epoch": 0.704, + "grad_norm": 0.41313256987360597, + "learning_rate": 4.270144937709981e-05, + "loss": 0.7875, + "step": 88 + }, + { + "epoch": 0.712, + "grad_norm": 0.4259263131590564, + "learning_rate": 4.059311495186338e-05, + "loss": 0.808, + "step": 89 + }, + { + "epoch": 0.72, + "grad_norm": 0.6227537569017948, + "learning_rate": 3.852482488956992e-05, + "loss": 1.0245, + "step": 90 + }, + { + "epoch": 0.728, + "grad_norm": 0.588509982783627, + "learning_rate": 3.649797336124615e-05, + "loss": 0.7623, + "step": 91 + }, + { + "epoch": 0.736, + "grad_norm": 0.5622760139848599, + "learning_rate": 3.45139266054715e-05, + "loss": 0.9896, + "step": 92 + }, + { + "epoch": 0.744, + "grad_norm": 0.48763232574296367, + "learning_rate": 3.257402200743821e-05, + "loss": 0.8559, + "step": 93 + }, + { + "epoch": 0.752, + "grad_norm": 0.4667304015024215, + "learning_rate": 3.0679567197461134e-05, + "loss": 0.8267, + "step": 94 + }, + { + "epoch": 0.76, + "grad_norm": 0.4261148874220418, + "learning_rate": 2.8831839169543996e-05, + "loss": 0.8342, + "step": 95 + }, + { + "epoch": 0.768, + "grad_norm": 0.4603737051292125, + "learning_rate": 2.7032083420597e-05, + "loss": 0.8217, + "step": 96 + }, + { + "epoch": 0.776, + "grad_norm": 0.4633578614777036, + "learning_rate": 2.528151311088537e-05, + "loss": 0.8426, + "step": 97 + }, + { + "epoch": 0.784, + "grad_norm": 0.4740449779874253, + "learning_rate": 2.3581308246275103e-05, + "loss": 0.7619, + "step": 98 + }, + { + "epoch": 0.792, + "grad_norm": 0.5256639252782974, + "learning_rate": 2.1932614882827197e-05, + "loss": 0.9056, + "step": 99 + }, + { + "epoch": 0.8, + "grad_norm": 0.5471215610899327, + "learning_rate": 2.03365443542764e-05, + "loss": 0.9285, + "step": 100 + }, + { + "epoch": 0.808, + "grad_norm": 0.535860622346642, + "learning_rate": 1.879417252291502e-05, + "loss": 0.8971, + "step": 101 + }, + { + "epoch": 0.816, + "grad_norm": 0.6307994094530405, + "learning_rate": 1.730653905438714e-05, + "loss": 1.0222, + "step": 102 + }, + { + "epoch": 0.824, + "grad_norm": 0.5641556223888017, + "learning_rate": 1.587464671688187e-05, + "loss": 0.9874, + "step": 103 + }, + { + "epoch": 0.832, + "grad_norm": 0.597451406822819, + "learning_rate": 1.4499460705197998e-05, + "loss": 0.8467, + "step": 104 + }, + { + "epoch": 0.84, + "grad_norm": 0.47083496691750065, + "learning_rate": 1.3181907990135622e-05, + "loss": 0.8349, + "step": 105 + }, + { + "epoch": 0.848, + "grad_norm": 0.5075295600997802, + "learning_rate": 1.1922876693653585e-05, + "loss": 0.9282, + "step": 106 + }, + { + "epoch": 0.856, + "grad_norm": 0.5065339725265197, + "learning_rate": 1.0723215490213634e-05, + "loss": 0.7942, + "step": 107 + }, + { + "epoch": 0.864, + "grad_norm": 0.5116839630404477, + "learning_rate": 9.583733034714981e-06, + "loss": 0.8678, + "step": 108 + }, + { + "epoch": 0.872, + "grad_norm": 0.5642318524566664, + "learning_rate": 8.505197417404687e-06, + "loss": 0.8711, + "step": 109 + }, + { + "epoch": 0.88, + "grad_norm": 0.5469422927536324, + "learning_rate": 7.488335646131628e-06, + "loss": 0.8471, + "step": 110 + }, + { + "epoch": 0.888, + "grad_norm": 0.42867885430913916, + "learning_rate": 6.533833156292679e-06, + "loss": 0.7824, + "step": 111 + }, + { + "epoch": 0.896, + "grad_norm": 0.4815338643224403, + "learning_rate": 5.6423333488018095e-06, + "loss": 0.8101, + "step": 112 + }, + { + "epoch": 0.904, + "grad_norm": 0.4968549860424203, + "learning_rate": 4.8144371563930476e-06, + "loss": 0.7646, + "step": 113 + }, + { + "epoch": 0.912, + "grad_norm": 0.4976065822884428, + "learning_rate": 4.050702638550275e-06, + "loss": 0.8364, + "step": 114 + }, + { + "epoch": 0.92, + "grad_norm": 0.5250560990287962, + "learning_rate": 3.3516446053363015e-06, + "loss": 0.8047, + "step": 115 + }, + { + "epoch": 0.928, + "grad_norm": 0.4690379102326673, + "learning_rate": 2.717734270375272e-06, + "loss": 0.8484, + "step": 116 + }, + { + "epoch": 0.936, + "grad_norm": 0.5793237985664929, + "learning_rate": 2.1493989332218468e-06, + "loss": 0.9048, + "step": 117 + }, + { + "epoch": 0.944, + "grad_norm": 0.4269403095096772, + "learning_rate": 1.6470216913317626e-06, + "loss": 0.7737, + "step": 118 + }, + { + "epoch": 0.952, + "grad_norm": 0.46070532681301707, + "learning_rate": 1.2109411818274852e-06, + "loss": 0.774, + "step": 119 + }, + { + "epoch": 0.96, + "grad_norm": 0.46255760757400677, + "learning_rate": 8.41451353233369e-07, + "loss": 0.9062, + "step": 120 + }, + { + "epoch": 0.968, + "grad_norm": 0.4638669355270497, + "learning_rate": 5.388012673338661e-07, + "loss": 0.8366, + "step": 121 + }, + { + "epoch": 0.976, + "grad_norm": 0.39893326777206606, + "learning_rate": 3.0319493128866396e-07, + "loss": 0.7951, + "step": 122 + }, + { + "epoch": 0.984, + "grad_norm": 0.6063856963402054, + "learning_rate": 1.3479116011769767e-07, + "loss": 0.8833, + "step": 123 + }, + { + "epoch": 0.992, + "grad_norm": 0.48929628653621665, + "learning_rate": 3.370346964876036e-08, + "loss": 0.894, + "step": 124 + }, + { + "epoch": 1.0, + "grad_norm": 0.5869716432080327, + "learning_rate": 0.0, + "loss": 0.9297, + "step": 125 + }, + { + "epoch": 1.0, + "step": 125, + "total_flos": 101132938903552.0, + "train_loss": 0.9124656667709351, + "train_runtime": 1847.9032, + "train_samples_per_second": 1.082, + "train_steps_per_second": 0.068 + } + ], + "logging_steps": 1.0, + "max_steps": 125, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 101132938903552.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_3_epochs_1_GA_4_lora/README.md b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_3_epochs_1_GA_4_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_3_epochs_1_GA_4_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_3_epochs_1_GA_4_lora/adapter_config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_3_epochs_1_GA_4_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d454d3aa3d035dd52481edda13024c9006e2ae5e --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_3_epochs_1_GA_4_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "k_proj", + "q_proj", + "down_proj", + "o_proj", + "up_proj", + "gate_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_3_epochs_1_GA_4_lora/adapter_model.safetensors b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_3_epochs_1_GA_4_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..689a602f2a626b38c8fd9f3480f1f0369532ccc8 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_3_epochs_1_GA_4_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1193313d43dcfadaebfbe1f934c944b44e81d1cc55efe9e59b9caba49b9348dd +size 671150064 diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_3_epochs_1_GA_4_lora/config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_3_epochs_1_GA_4_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_3_epochs_1_GA_4_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_3_epochs_1_GA_4_lora/non_lora_trainables.bin b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_3_epochs_1_GA_4_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..d85cb3c0d0899ced2143e96dd10db65277bb979e --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_3_epochs_1_GA_4_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b963a4ff177f9c1ffb786e646e68709ebae25e6b73263d2b0fb7f24eb75bdaba +size 918507402 diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_3_epochs_1_GA_4_lora/trainer_state.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_3_epochs_1_GA_4_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..3bbb77963f32d428ddecf087dc5b022f97043555 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_2000_repeat_3_epochs_1_GA_4_lora/trainer_state.json @@ -0,0 +1,476 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.992, + "eval_steps": 500, + "global_step": 62, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.016, + "grad_norm": 0.8115376946117644, + "learning_rate": 0.0001, + "loss": 1.3882, + "step": 1 + }, + { + "epoch": 0.032, + "grad_norm": 0.7379596036373107, + "learning_rate": 0.0002, + "loss": 1.2875, + "step": 2 + }, + { + "epoch": 0.048, + "grad_norm": 0.6516716706764099, + "learning_rate": 0.0001998629534754574, + "loss": 1.2321, + "step": 3 + }, + { + "epoch": 0.064, + "grad_norm": 0.8045523995759459, + "learning_rate": 0.00019945218953682734, + "loss": 1.1171, + "step": 4 + }, + { + "epoch": 0.08, + "grad_norm": 0.9748341310102832, + "learning_rate": 0.00019876883405951377, + "loss": 1.1203, + "step": 5 + }, + { + "epoch": 0.096, + "grad_norm": 0.43262498003397853, + "learning_rate": 0.00019781476007338058, + "loss": 0.9855, + "step": 6 + }, + { + "epoch": 0.112, + "grad_norm": 0.43710555081674196, + "learning_rate": 0.00019659258262890683, + "loss": 0.9553, + "step": 7 + }, + { + "epoch": 0.128, + "grad_norm": 0.4299584668831962, + "learning_rate": 0.00019510565162951537, + "loss": 0.9737, + "step": 8 + }, + { + "epoch": 0.144, + "grad_norm": 0.40784014191864315, + "learning_rate": 0.00019335804264972018, + "loss": 0.9345, + "step": 9 + }, + { + "epoch": 0.16, + "grad_norm": 0.4890797924633881, + "learning_rate": 0.0001913545457642601, + "loss": 0.942, + "step": 10 + }, + { + "epoch": 0.176, + "grad_norm": 0.47290779258642107, + "learning_rate": 0.0001891006524188368, + "loss": 0.9837, + "step": 11 + }, + { + "epoch": 0.192, + "grad_norm": 0.45166657752550265, + "learning_rate": 0.00018660254037844388, + "loss": 0.9566, + "step": 12 + }, + { + "epoch": 0.208, + "grad_norm": 0.4802427221669656, + "learning_rate": 0.00018386705679454242, + "loss": 0.998, + "step": 13 + }, + { + "epoch": 0.224, + "grad_norm": 0.39526045068707355, + "learning_rate": 0.00018090169943749476, + "loss": 0.8799, + "step": 14 + }, + { + "epoch": 0.24, + "grad_norm": 0.4410743595660131, + "learning_rate": 0.0001777145961456971, + "loss": 0.9714, + "step": 15 + }, + { + "epoch": 0.256, + "grad_norm": 0.46620032478721474, + "learning_rate": 0.00017431448254773944, + "loss": 0.9765, + "step": 16 + }, + { + "epoch": 0.272, + "grad_norm": 0.47112777607913847, + "learning_rate": 0.00017071067811865476, + "loss": 1.0516, + "step": 17 + }, + { + "epoch": 0.288, + "grad_norm": 0.34959905040630146, + "learning_rate": 0.00016691306063588583, + "loss": 0.8071, + "step": 18 + }, + { + "epoch": 0.304, + "grad_norm": 0.42026296264872304, + "learning_rate": 0.00016293203910498376, + "loss": 0.9082, + "step": 19 + }, + { + "epoch": 0.32, + "grad_norm": 0.46200194126278427, + "learning_rate": 0.00015877852522924732, + "loss": 0.9288, + "step": 20 + }, + { + "epoch": 0.336, + "grad_norm": 0.34456840268198746, + "learning_rate": 0.00015446390350150273, + "loss": 0.858, + "step": 21 + }, + { + "epoch": 0.352, + "grad_norm": 0.3515579855408626, + "learning_rate": 0.00015000000000000001, + "loss": 0.8691, + "step": 22 + }, + { + "epoch": 0.368, + "grad_norm": 0.38030364310331255, + "learning_rate": 0.00014539904997395468, + "loss": 0.9363, + "step": 23 + }, + { + "epoch": 0.384, + "grad_norm": 0.42311870140610947, + "learning_rate": 0.00014067366430758004, + "loss": 0.9232, + "step": 24 + }, + { + "epoch": 0.4, + "grad_norm": 0.39866994027687425, + "learning_rate": 0.00013583679495453, + "loss": 0.8756, + "step": 25 + }, + { + "epoch": 0.416, + "grad_norm": 0.41257440907704074, + "learning_rate": 0.00013090169943749476, + "loss": 0.956, + "step": 26 + }, + { + "epoch": 0.432, + "grad_norm": 0.40270214195226256, + "learning_rate": 0.00012588190451025207, + "loss": 0.9732, + "step": 27 + }, + { + "epoch": 0.448, + "grad_norm": 0.3252586417943798, + "learning_rate": 0.00012079116908177593, + "loss": 0.7926, + "step": 28 + }, + { + "epoch": 0.464, + "grad_norm": 0.40219994775926615, + "learning_rate": 0.0001156434465040231, + "loss": 0.9141, + "step": 29 + }, + { + "epoch": 0.48, + "grad_norm": 0.4617984370404977, + "learning_rate": 0.00011045284632676536, + "loss": 0.8637, + "step": 30 + }, + { + "epoch": 0.496, + "grad_norm": 0.4517632713189801, + "learning_rate": 0.0001052335956242944, + "loss": 0.84, + "step": 31 + }, + { + "epoch": 0.512, + "grad_norm": 0.39591319078574316, + "learning_rate": 0.0001, + "loss": 0.8572, + "step": 32 + }, + { + "epoch": 0.528, + "grad_norm": 0.43145688796857856, + "learning_rate": 9.476640437570562e-05, + "loss": 0.9818, + "step": 33 + }, + { + "epoch": 0.544, + "grad_norm": 0.48356961743265936, + "learning_rate": 8.954715367323468e-05, + "loss": 0.9811, + "step": 34 + }, + { + "epoch": 0.56, + "grad_norm": 0.4235867054801886, + "learning_rate": 8.435655349597689e-05, + "loss": 0.8341, + "step": 35 + }, + { + "epoch": 0.576, + "grad_norm": 0.3838193770356642, + "learning_rate": 7.920883091822408e-05, + "loss": 0.8893, + "step": 36 + }, + { + "epoch": 0.592, + "grad_norm": 0.4037097044310178, + "learning_rate": 7.411809548974792e-05, + "loss": 0.9402, + "step": 37 + }, + { + "epoch": 0.608, + "grad_norm": 0.35482227410871914, + "learning_rate": 6.909830056250527e-05, + "loss": 0.8438, + "step": 38 + }, + { + "epoch": 0.624, + "grad_norm": 0.33458547670959093, + "learning_rate": 6.416320504546997e-05, + "loss": 0.8558, + "step": 39 + }, + { + "epoch": 0.64, + "grad_norm": 0.48202841998230445, + "learning_rate": 5.9326335692419995e-05, + "loss": 0.9616, + "step": 40 + }, + { + "epoch": 0.656, + "grad_norm": 0.33135359355210775, + "learning_rate": 5.4600950026045326e-05, + "loss": 0.7899, + "step": 41 + }, + { + "epoch": 0.672, + "grad_norm": 0.3655941362699932, + "learning_rate": 5.000000000000002e-05, + "loss": 0.8469, + "step": 42 + }, + { + "epoch": 0.688, + "grad_norm": 0.4416572733780382, + "learning_rate": 4.5536096498497295e-05, + "loss": 0.9167, + "step": 43 + }, + { + "epoch": 0.704, + "grad_norm": 0.3963306861215452, + "learning_rate": 4.12214747707527e-05, + "loss": 0.8726, + "step": 44 + }, + { + "epoch": 0.72, + "grad_norm": 0.403752781263034, + "learning_rate": 3.7067960895016275e-05, + "loss": 0.9247, + "step": 45 + }, + { + "epoch": 0.736, + "grad_norm": 0.4429767955337501, + "learning_rate": 3.308693936411421e-05, + "loss": 0.8834, + "step": 46 + }, + { + "epoch": 0.752, + "grad_norm": 0.3854408763441377, + "learning_rate": 2.9289321881345254e-05, + "loss": 0.8498, + "step": 47 + }, + { + "epoch": 0.768, + "grad_norm": 0.32318832922926677, + "learning_rate": 2.5685517452260567e-05, + "loss": 0.8347, + "step": 48 + }, + { + "epoch": 0.784, + "grad_norm": 0.35118024180739127, + "learning_rate": 2.2285403854302912e-05, + "loss": 0.8105, + "step": 49 + }, + { + "epoch": 0.8, + "grad_norm": 0.4052987291396471, + "learning_rate": 1.9098300562505266e-05, + "loss": 0.9288, + "step": 50 + }, + { + "epoch": 0.816, + "grad_norm": 0.44147568315067864, + "learning_rate": 1.6132943205457606e-05, + "loss": 0.9733, + "step": 51 + }, + { + "epoch": 0.832, + "grad_norm": 0.39868663009902816, + "learning_rate": 1.339745962155613e-05, + "loss": 0.9249, + "step": 52 + }, + { + "epoch": 0.848, + "grad_norm": 0.3512200707436714, + "learning_rate": 1.0899347581163221e-05, + "loss": 0.8878, + "step": 53 + }, + { + "epoch": 0.864, + "grad_norm": 0.36779032235091047, + "learning_rate": 8.645454235739903e-06, + "loss": 0.8425, + "step": 54 + }, + { + "epoch": 0.88, + "grad_norm": 0.4039621504635964, + "learning_rate": 6.6419573502798374e-06, + "loss": 0.8691, + "step": 55 + }, + { + "epoch": 0.896, + "grad_norm": 0.6982928092645577, + "learning_rate": 4.8943483704846475e-06, + "loss": 0.8083, + "step": 56 + }, + { + "epoch": 0.912, + "grad_norm": 0.3586377109842292, + "learning_rate": 3.40741737109318e-06, + "loss": 0.8086, + "step": 57 + }, + { + "epoch": 0.928, + "grad_norm": 0.3752504895539116, + "learning_rate": 2.1852399266194314e-06, + "loss": 0.8385, + "step": 58 + }, + { + "epoch": 0.944, + "grad_norm": 0.3658655707367418, + "learning_rate": 1.231165940486234e-06, + "loss": 0.847, + "step": 59 + }, + { + "epoch": 0.96, + "grad_norm": 0.3096899200162482, + "learning_rate": 5.478104631726711e-07, + "loss": 0.8499, + "step": 60 + }, + { + "epoch": 0.976, + "grad_norm": 0.3203032261580068, + "learning_rate": 1.3704652454261668e-07, + "loss": 0.8257, + "step": 61 + }, + { + "epoch": 0.992, + "grad_norm": 0.40910150612194435, + "learning_rate": 0.0, + "loss": 0.9072, + "step": 62 + }, + { + "epoch": 0.992, + "step": 62, + "total_flos": 147227655536640.0, + "train_loss": 0.9255667328834534, + "train_runtime": 1838.2528, + "train_samples_per_second": 1.088, + "train_steps_per_second": 0.034 + } + ], + "logging_steps": 1.0, + "max_steps": 62, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 147227655536640.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_1_epochs_1_GA_2_lora/README.md b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_1_epochs_1_GA_2_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_1_epochs_1_GA_2_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_1_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_1_epochs_1_GA_2_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..54aea61e623644bcee43f27e87281877e749b36a --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_1_epochs_1_GA_2_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj", + "gate_proj", + "down_proj", + "o_proj", + "k_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1f5e3ef0031d3e669ba9381578250bc72c67aaed --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7459044d10fe89f47d2625e747c6f65abe2e2949be54a8120cbebc9fa49f5b99 +size 671150064 diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_1_epochs_1_GA_2_lora/config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_1_epochs_1_GA_2_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_1_epochs_1_GA_2_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..077c8ef7f96e606115c6a8bb9c09fc18fce0ec9e --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69001178fa6f4f8a27fe76d6457aff758561b0096c67de38dbde021927b77345 +size 918507402 diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_1_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_1_epochs_1_GA_2_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a1ab08c470f54e229f81d15dd86f84324317d096 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_1_epochs_1_GA_2_lora/trainer_state.json @@ -0,0 +1,13167 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 1875, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005333333333333334, + "grad_norm": 0.7478160203624254, + "learning_rate": 3.5087719298245615e-06, + "loss": 1.2281, + "step": 1 + }, + { + "epoch": 0.0010666666666666667, + "grad_norm": 1.0422662241870386, + "learning_rate": 7.017543859649123e-06, + "loss": 1.1893, + "step": 2 + }, + { + "epoch": 0.0016, + "grad_norm": 0.898821327534039, + "learning_rate": 1.0526315789473684e-05, + "loss": 1.2598, + "step": 3 + }, + { + "epoch": 0.0021333333333333334, + "grad_norm": 0.9404599215806203, + "learning_rate": 1.4035087719298246e-05, + "loss": 1.3365, + "step": 4 + }, + { + "epoch": 0.0026666666666666666, + "grad_norm": 0.8271399266416969, + "learning_rate": 1.7543859649122806e-05, + "loss": 1.4094, + "step": 5 + }, + { + "epoch": 0.0032, + "grad_norm": 0.7523511046413769, + "learning_rate": 2.105263157894737e-05, + "loss": 1.1267, + "step": 6 + }, + { + "epoch": 0.0037333333333333333, + "grad_norm": 0.8181154938413164, + "learning_rate": 2.456140350877193e-05, + "loss": 1.2057, + "step": 7 + }, + { + "epoch": 0.004266666666666667, + "grad_norm": 0.6888887133244778, + "learning_rate": 2.8070175438596492e-05, + "loss": 1.2002, + "step": 8 + }, + { + "epoch": 0.0048, + "grad_norm": 0.6372000597807975, + "learning_rate": 3.157894736842105e-05, + "loss": 1.1094, + "step": 9 + }, + { + "epoch": 0.005333333333333333, + "grad_norm": 0.8311538549618501, + "learning_rate": 3.508771929824561e-05, + "loss": 1.2097, + "step": 10 + }, + { + "epoch": 0.005866666666666667, + "grad_norm": 0.7787950909551065, + "learning_rate": 3.859649122807018e-05, + "loss": 1.0411, + "step": 11 + }, + { + "epoch": 0.0064, + "grad_norm": 0.8628953898774414, + "learning_rate": 4.210526315789474e-05, + "loss": 1.1819, + "step": 12 + }, + { + "epoch": 0.006933333333333333, + "grad_norm": 0.7207552828246774, + "learning_rate": 4.56140350877193e-05, + "loss": 1.0764, + "step": 13 + }, + { + "epoch": 0.007466666666666667, + "grad_norm": 0.8317473561875744, + "learning_rate": 4.912280701754386e-05, + "loss": 1.1077, + "step": 14 + }, + { + "epoch": 0.008, + "grad_norm": 1.0880818053614876, + "learning_rate": 5.2631578947368424e-05, + "loss": 1.2038, + "step": 15 + }, + { + "epoch": 0.008533333333333334, + "grad_norm": 0.7101685723179632, + "learning_rate": 5.6140350877192984e-05, + "loss": 1.038, + "step": 16 + }, + { + "epoch": 0.009066666666666667, + "grad_norm": 0.7977818474566017, + "learning_rate": 5.9649122807017544e-05, + "loss": 1.0586, + "step": 17 + }, + { + "epoch": 0.0096, + "grad_norm": 0.6964640262892129, + "learning_rate": 6.31578947368421e-05, + "loss": 1.017, + "step": 18 + }, + { + "epoch": 0.010133333333333333, + "grad_norm": 0.6772461845492532, + "learning_rate": 6.666666666666667e-05, + "loss": 0.9125, + "step": 19 + }, + { + "epoch": 0.010666666666666666, + "grad_norm": 0.8132239433117805, + "learning_rate": 7.017543859649122e-05, + "loss": 0.9926, + "step": 20 + }, + { + "epoch": 0.0112, + "grad_norm": 0.7794313373761221, + "learning_rate": 7.368421052631579e-05, + "loss": 1.1181, + "step": 21 + }, + { + "epoch": 0.011733333333333333, + "grad_norm": 0.5293596087693633, + "learning_rate": 7.719298245614036e-05, + "loss": 0.8762, + "step": 22 + }, + { + "epoch": 0.012266666666666667, + "grad_norm": 0.7665557573044675, + "learning_rate": 8.070175438596491e-05, + "loss": 0.8919, + "step": 23 + }, + { + "epoch": 0.0128, + "grad_norm": 0.6499984197559702, + "learning_rate": 8.421052631578948e-05, + "loss": 0.9798, + "step": 24 + }, + { + "epoch": 0.013333333333333334, + "grad_norm": 0.5797362185097485, + "learning_rate": 8.771929824561403e-05, + "loss": 0.9479, + "step": 25 + }, + { + "epoch": 0.013866666666666666, + "grad_norm": 0.6285858322846415, + "learning_rate": 9.12280701754386e-05, + "loss": 0.7771, + "step": 26 + }, + { + "epoch": 0.0144, + "grad_norm": 0.685266749813973, + "learning_rate": 9.473684210526316e-05, + "loss": 0.9645, + "step": 27 + }, + { + "epoch": 0.014933333333333333, + "grad_norm": 0.6392698438243327, + "learning_rate": 9.824561403508771e-05, + "loss": 0.9325, + "step": 28 + }, + { + "epoch": 0.015466666666666667, + "grad_norm": 0.663102762162956, + "learning_rate": 0.0001017543859649123, + "loss": 0.9508, + "step": 29 + }, + { + "epoch": 0.016, + "grad_norm": 0.5064758958958117, + "learning_rate": 0.00010526315789473685, + "loss": 0.8695, + "step": 30 + }, + { + "epoch": 0.016533333333333334, + "grad_norm": 0.5992180434134455, + "learning_rate": 0.00010877192982456141, + "loss": 0.9056, + "step": 31 + }, + { + "epoch": 0.017066666666666667, + "grad_norm": 0.5062578401046487, + "learning_rate": 0.00011228070175438597, + "loss": 0.8436, + "step": 32 + }, + { + "epoch": 0.0176, + "grad_norm": 0.7926225185864879, + "learning_rate": 0.00011578947368421053, + "loss": 0.9324, + "step": 33 + }, + { + "epoch": 0.018133333333333335, + "grad_norm": 0.7470260490190631, + "learning_rate": 0.00011929824561403509, + "loss": 1.0706, + "step": 34 + }, + { + "epoch": 0.018666666666666668, + "grad_norm": 0.5382330100027091, + "learning_rate": 0.00012280701754385965, + "loss": 0.8687, + "step": 35 + }, + { + "epoch": 0.0192, + "grad_norm": 0.6553871123688179, + "learning_rate": 0.0001263157894736842, + "loss": 0.8886, + "step": 36 + }, + { + "epoch": 0.019733333333333332, + "grad_norm": 0.7528426065938322, + "learning_rate": 0.0001298245614035088, + "loss": 0.9648, + "step": 37 + }, + { + "epoch": 0.020266666666666665, + "grad_norm": 0.5463385508806565, + "learning_rate": 0.00013333333333333334, + "loss": 0.8942, + "step": 38 + }, + { + "epoch": 0.0208, + "grad_norm": 0.6764033329582932, + "learning_rate": 0.0001368421052631579, + "loss": 0.9313, + "step": 39 + }, + { + "epoch": 0.021333333333333333, + "grad_norm": 0.45754262448789773, + "learning_rate": 0.00014035087719298245, + "loss": 0.8133, + "step": 40 + }, + { + "epoch": 0.021866666666666666, + "grad_norm": 0.5362275391995662, + "learning_rate": 0.00014385964912280703, + "loss": 0.885, + "step": 41 + }, + { + "epoch": 0.0224, + "grad_norm": 0.6570143061405205, + "learning_rate": 0.00014736842105263158, + "loss": 0.9815, + "step": 42 + }, + { + "epoch": 0.022933333333333333, + "grad_norm": 0.680711661682576, + "learning_rate": 0.00015087719298245616, + "loss": 0.9644, + "step": 43 + }, + { + "epoch": 0.023466666666666667, + "grad_norm": 0.6324241814563605, + "learning_rate": 0.0001543859649122807, + "loss": 0.9316, + "step": 44 + }, + { + "epoch": 0.024, + "grad_norm": 0.5298714490811897, + "learning_rate": 0.00015789473684210527, + "loss": 0.8209, + "step": 45 + }, + { + "epoch": 0.024533333333333334, + "grad_norm": 0.5873277960396065, + "learning_rate": 0.00016140350877192982, + "loss": 0.8341, + "step": 46 + }, + { + "epoch": 0.025066666666666668, + "grad_norm": 0.7213114786508069, + "learning_rate": 0.0001649122807017544, + "loss": 1.0156, + "step": 47 + }, + { + "epoch": 0.0256, + "grad_norm": 0.6916436592160714, + "learning_rate": 0.00016842105263157895, + "loss": 0.9357, + "step": 48 + }, + { + "epoch": 0.026133333333333335, + "grad_norm": 0.7716424015946091, + "learning_rate": 0.00017192982456140353, + "loss": 1.006, + "step": 49 + }, + { + "epoch": 0.02666666666666667, + "grad_norm": 0.44992446044036816, + "learning_rate": 0.00017543859649122806, + "loss": 0.7826, + "step": 50 + }, + { + "epoch": 0.0272, + "grad_norm": 0.7427382532262721, + "learning_rate": 0.00017894736842105264, + "loss": 1.041, + "step": 51 + }, + { + "epoch": 0.027733333333333332, + "grad_norm": 0.7401724097633609, + "learning_rate": 0.0001824561403508772, + "loss": 0.9546, + "step": 52 + }, + { + "epoch": 0.028266666666666666, + "grad_norm": 0.5319230954044463, + "learning_rate": 0.00018596491228070177, + "loss": 0.8394, + "step": 53 + }, + { + "epoch": 0.0288, + "grad_norm": 0.5148420974205532, + "learning_rate": 0.00018947368421052632, + "loss": 0.8594, + "step": 54 + }, + { + "epoch": 0.029333333333333333, + "grad_norm": 0.6043264083005266, + "learning_rate": 0.00019298245614035088, + "loss": 0.8999, + "step": 55 + }, + { + "epoch": 0.029866666666666666, + "grad_norm": 0.4758608899930653, + "learning_rate": 0.00019649122807017543, + "loss": 0.8195, + "step": 56 + }, + { + "epoch": 0.0304, + "grad_norm": 0.5043119387684599, + "learning_rate": 0.0002, + "loss": 0.828, + "step": 57 + }, + { + "epoch": 0.030933333333333334, + "grad_norm": 0.6030587988789778, + "learning_rate": 0.00019999985069241055, + "loss": 0.9206, + "step": 58 + }, + { + "epoch": 0.031466666666666664, + "grad_norm": 0.49317157797189015, + "learning_rate": 0.00019999940277008808, + "loss": 0.859, + "step": 59 + }, + { + "epoch": 0.032, + "grad_norm": 0.4964622434425981, + "learning_rate": 0.00019999865623437013, + "loss": 0.8302, + "step": 60 + }, + { + "epoch": 0.03253333333333333, + "grad_norm": 0.6122813129682467, + "learning_rate": 0.00019999761108748597, + "loss": 0.979, + "step": 61 + }, + { + "epoch": 0.03306666666666667, + "grad_norm": 0.6301970834502888, + "learning_rate": 0.00019999626733255662, + "loss": 0.9665, + "step": 62 + }, + { + "epoch": 0.0336, + "grad_norm": 0.6251235739078242, + "learning_rate": 0.00019999462497359466, + "loss": 0.8917, + "step": 63 + }, + { + "epoch": 0.034133333333333335, + "grad_norm": 0.5546239147411945, + "learning_rate": 0.00019999268401550447, + "loss": 0.8638, + "step": 64 + }, + { + "epoch": 0.034666666666666665, + "grad_norm": 0.6237721194072088, + "learning_rate": 0.000199990444464082, + "loss": 0.9551, + "step": 65 + }, + { + "epoch": 0.0352, + "grad_norm": 0.5219863685079097, + "learning_rate": 0.00019998790632601496, + "loss": 0.8254, + "step": 66 + }, + { + "epoch": 0.03573333333333333, + "grad_norm": 0.5840916216178993, + "learning_rate": 0.00019998506960888256, + "loss": 0.9045, + "step": 67 + }, + { + "epoch": 0.03626666666666667, + "grad_norm": 0.5337180935661104, + "learning_rate": 0.00019998193432115572, + "loss": 0.7799, + "step": 68 + }, + { + "epoch": 0.0368, + "grad_norm": 0.8851875594074589, + "learning_rate": 0.0001999785004721968, + "loss": 0.9175, + "step": 69 + }, + { + "epoch": 0.037333333333333336, + "grad_norm": 0.6718075048302122, + "learning_rate": 0.00019997476807225985, + "loss": 0.8535, + "step": 70 + }, + { + "epoch": 0.037866666666666667, + "grad_norm": 0.5459572976567617, + "learning_rate": 0.0001999707371324904, + "loss": 0.8797, + "step": 71 + }, + { + "epoch": 0.0384, + "grad_norm": 0.6945402602726587, + "learning_rate": 0.00019996640766492543, + "loss": 1.0184, + "step": 72 + }, + { + "epoch": 0.038933333333333334, + "grad_norm": 0.5387553405804502, + "learning_rate": 0.00019996177968249334, + "loss": 0.9378, + "step": 73 + }, + { + "epoch": 0.039466666666666664, + "grad_norm": 0.5008798819123914, + "learning_rate": 0.0001999568531990141, + "loss": 0.8913, + "step": 74 + }, + { + "epoch": 0.04, + "grad_norm": 0.6145477921081341, + "learning_rate": 0.00019995162822919883, + "loss": 0.9477, + "step": 75 + }, + { + "epoch": 0.04053333333333333, + "grad_norm": 0.547190001995045, + "learning_rate": 0.00019994610478865011, + "loss": 0.8734, + "step": 76 + }, + { + "epoch": 0.04106666666666667, + "grad_norm": 0.6009466383707704, + "learning_rate": 0.0001999402828938618, + "loss": 0.8684, + "step": 77 + }, + { + "epoch": 0.0416, + "grad_norm": 0.5839831615615629, + "learning_rate": 0.00019993416256221895, + "loss": 0.953, + "step": 78 + }, + { + "epoch": 0.042133333333333335, + "grad_norm": 0.584207906860248, + "learning_rate": 0.00019992774381199778, + "loss": 0.9451, + "step": 79 + }, + { + "epoch": 0.042666666666666665, + "grad_norm": 0.7531770577811897, + "learning_rate": 0.00019992102666236566, + "loss": 0.9219, + "step": 80 + }, + { + "epoch": 0.0432, + "grad_norm": 0.5091828156406875, + "learning_rate": 0.00019991401113338104, + "loss": 0.7954, + "step": 81 + }, + { + "epoch": 0.04373333333333333, + "grad_norm": 0.6752524298420287, + "learning_rate": 0.00019990669724599336, + "loss": 0.9881, + "step": 82 + }, + { + "epoch": 0.04426666666666667, + "grad_norm": 0.7131387010883815, + "learning_rate": 0.00019989908502204292, + "loss": 1.0035, + "step": 83 + }, + { + "epoch": 0.0448, + "grad_norm": 0.6297246197220789, + "learning_rate": 0.00019989117448426108, + "loss": 0.8603, + "step": 84 + }, + { + "epoch": 0.04533333333333334, + "grad_norm": 0.5547873349046207, + "learning_rate": 0.00019988296565626987, + "loss": 0.8457, + "step": 85 + }, + { + "epoch": 0.04586666666666667, + "grad_norm": 0.5011787007865315, + "learning_rate": 0.00019987445856258206, + "loss": 0.8908, + "step": 86 + }, + { + "epoch": 0.0464, + "grad_norm": 0.4792485673023386, + "learning_rate": 0.00019986565322860115, + "loss": 0.8115, + "step": 87 + }, + { + "epoch": 0.046933333333333334, + "grad_norm": 0.47356663334432353, + "learning_rate": 0.00019985654968062122, + "loss": 0.838, + "step": 88 + }, + { + "epoch": 0.047466666666666664, + "grad_norm": 0.4668409126247973, + "learning_rate": 0.00019984714794582683, + "loss": 0.8978, + "step": 89 + }, + { + "epoch": 0.048, + "grad_norm": 0.5711282533560464, + "learning_rate": 0.00019983744805229296, + "loss": 0.8763, + "step": 90 + }, + { + "epoch": 0.04853333333333333, + "grad_norm": 0.5850757709665906, + "learning_rate": 0.000199827450028985, + "loss": 0.7986, + "step": 91 + }, + { + "epoch": 0.04906666666666667, + "grad_norm": 0.6035424102863467, + "learning_rate": 0.00019981715390575858, + "loss": 0.915, + "step": 92 + }, + { + "epoch": 0.0496, + "grad_norm": 0.7741143540791687, + "learning_rate": 0.00019980655971335945, + "loss": 0.967, + "step": 93 + }, + { + "epoch": 0.050133333333333335, + "grad_norm": 0.7394279561513039, + "learning_rate": 0.00019979566748342347, + "loss": 0.9477, + "step": 94 + }, + { + "epoch": 0.050666666666666665, + "grad_norm": 0.5187414772692578, + "learning_rate": 0.00019978447724847652, + "loss": 0.8944, + "step": 95 + }, + { + "epoch": 0.0512, + "grad_norm": 0.5040103640649286, + "learning_rate": 0.00019977298904193437, + "loss": 0.8896, + "step": 96 + }, + { + "epoch": 0.05173333333333333, + "grad_norm": 0.615423569645884, + "learning_rate": 0.00019976120289810247, + "loss": 1.0496, + "step": 97 + }, + { + "epoch": 0.05226666666666667, + "grad_norm": 0.5060767129482728, + "learning_rate": 0.00019974911885217608, + "loss": 0.859, + "step": 98 + }, + { + "epoch": 0.0528, + "grad_norm": 0.5607251767336862, + "learning_rate": 0.00019973673694024, + "loss": 0.8638, + "step": 99 + }, + { + "epoch": 0.05333333333333334, + "grad_norm": 0.5833206272945435, + "learning_rate": 0.0001997240571992685, + "loss": 0.9576, + "step": 100 + }, + { + "epoch": 0.05386666666666667, + "grad_norm": 0.6127131832061536, + "learning_rate": 0.00019971107966712518, + "loss": 0.9095, + "step": 101 + }, + { + "epoch": 0.0544, + "grad_norm": 0.6063282263672138, + "learning_rate": 0.00019969780438256293, + "loss": 0.881, + "step": 102 + }, + { + "epoch": 0.054933333333333334, + "grad_norm": 0.5593957155103715, + "learning_rate": 0.0001996842313852238, + "loss": 0.8878, + "step": 103 + }, + { + "epoch": 0.055466666666666664, + "grad_norm": 0.5384031206814662, + "learning_rate": 0.00019967036071563877, + "loss": 0.8585, + "step": 104 + }, + { + "epoch": 0.056, + "grad_norm": 0.4974118502751297, + "learning_rate": 0.0001996561924152278, + "loss": 0.8453, + "step": 105 + }, + { + "epoch": 0.05653333333333333, + "grad_norm": 0.6566801387881922, + "learning_rate": 0.0001996417265262996, + "loss": 0.8566, + "step": 106 + }, + { + "epoch": 0.05706666666666667, + "grad_norm": 0.6465561263035702, + "learning_rate": 0.00019962696309205148, + "loss": 0.9044, + "step": 107 + }, + { + "epoch": 0.0576, + "grad_norm": 0.6948653356023212, + "learning_rate": 0.0001996119021565693, + "loss": 0.9912, + "step": 108 + }, + { + "epoch": 0.058133333333333335, + "grad_norm": 0.6676689265229045, + "learning_rate": 0.0001995965437648273, + "loss": 1.052, + "step": 109 + }, + { + "epoch": 0.058666666666666666, + "grad_norm": 0.4810496727394801, + "learning_rate": 0.00019958088796268793, + "loss": 0.7299, + "step": 110 + }, + { + "epoch": 0.0592, + "grad_norm": 0.690134541637704, + "learning_rate": 0.0001995649347969019, + "loss": 0.9363, + "step": 111 + }, + { + "epoch": 0.05973333333333333, + "grad_norm": 0.6199958083338022, + "learning_rate": 0.00019954868431510764, + "loss": 0.7922, + "step": 112 + }, + { + "epoch": 0.06026666666666667, + "grad_norm": 0.7080924436502963, + "learning_rate": 0.00019953213656583168, + "loss": 1.0961, + "step": 113 + }, + { + "epoch": 0.0608, + "grad_norm": 0.646788939725199, + "learning_rate": 0.00019951529159848805, + "loss": 0.8522, + "step": 114 + }, + { + "epoch": 0.06133333333333333, + "grad_norm": 0.4989726146441654, + "learning_rate": 0.00019949814946337838, + "loss": 0.8339, + "step": 115 + }, + { + "epoch": 0.06186666666666667, + "grad_norm": 0.6511852930206385, + "learning_rate": 0.00019948071021169174, + "loss": 0.8986, + "step": 116 + }, + { + "epoch": 0.0624, + "grad_norm": 0.6161598892774144, + "learning_rate": 0.00019946297389550433, + "loss": 0.9124, + "step": 117 + }, + { + "epoch": 0.06293333333333333, + "grad_norm": 0.5033677421045472, + "learning_rate": 0.00019944494056777946, + "loss": 0.8814, + "step": 118 + }, + { + "epoch": 0.06346666666666667, + "grad_norm": 0.5746517616929623, + "learning_rate": 0.00019942661028236745, + "loss": 0.9185, + "step": 119 + }, + { + "epoch": 0.064, + "grad_norm": 0.6343958142028976, + "learning_rate": 0.00019940798309400526, + "loss": 0.9325, + "step": 120 + }, + { + "epoch": 0.06453333333333333, + "grad_norm": 0.5994397750129418, + "learning_rate": 0.00019938905905831654, + "loss": 0.8876, + "step": 121 + }, + { + "epoch": 0.06506666666666666, + "grad_norm": 0.5107450721069634, + "learning_rate": 0.00019936983823181132, + "loss": 0.8342, + "step": 122 + }, + { + "epoch": 0.0656, + "grad_norm": 0.7435486373478364, + "learning_rate": 0.0001993503206718859, + "loss": 1.0186, + "step": 123 + }, + { + "epoch": 0.06613333333333334, + "grad_norm": 0.809182825238528, + "learning_rate": 0.00019933050643682269, + "loss": 1.0035, + "step": 124 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 0.5819253848159949, + "learning_rate": 0.00019931039558578997, + "loss": 0.8812, + "step": 125 + }, + { + "epoch": 0.0672, + "grad_norm": 0.6508723013254191, + "learning_rate": 0.00019928998817884182, + "loss": 0.9668, + "step": 126 + }, + { + "epoch": 0.06773333333333334, + "grad_norm": 0.5796337333621042, + "learning_rate": 0.00019926928427691786, + "loss": 0.9341, + "step": 127 + }, + { + "epoch": 0.06826666666666667, + "grad_norm": 0.5420371988390913, + "learning_rate": 0.00019924828394184306, + "loss": 0.8384, + "step": 128 + }, + { + "epoch": 0.0688, + "grad_norm": 0.6320018365199501, + "learning_rate": 0.00019922698723632767, + "loss": 0.8598, + "step": 129 + }, + { + "epoch": 0.06933333333333333, + "grad_norm": 0.5647761224706593, + "learning_rate": 0.0001992053942239668, + "loss": 0.8693, + "step": 130 + }, + { + "epoch": 0.06986666666666666, + "grad_norm": 0.5501288243476312, + "learning_rate": 0.0001991835049692405, + "loss": 0.8387, + "step": 131 + }, + { + "epoch": 0.0704, + "grad_norm": 0.5178460421740548, + "learning_rate": 0.00019916131953751342, + "loss": 0.914, + "step": 132 + }, + { + "epoch": 0.07093333333333333, + "grad_norm": 0.5346632219276891, + "learning_rate": 0.0001991388379950346, + "loss": 0.8322, + "step": 133 + }, + { + "epoch": 0.07146666666666666, + "grad_norm": 0.6523935549795017, + "learning_rate": 0.0001991160604089374, + "loss": 0.8774, + "step": 134 + }, + { + "epoch": 0.072, + "grad_norm": 0.5679617364387883, + "learning_rate": 0.00019909298684723904, + "loss": 0.8505, + "step": 135 + }, + { + "epoch": 0.07253333333333334, + "grad_norm": 0.49663539815551794, + "learning_rate": 0.00019906961737884077, + "loss": 0.8177, + "step": 136 + }, + { + "epoch": 0.07306666666666667, + "grad_norm": 0.5217944742403843, + "learning_rate": 0.00019904595207352737, + "loss": 0.8152, + "step": 137 + }, + { + "epoch": 0.0736, + "grad_norm": 0.5537561506732283, + "learning_rate": 0.00019902199100196697, + "loss": 0.8768, + "step": 138 + }, + { + "epoch": 0.07413333333333333, + "grad_norm": 0.6064073082702489, + "learning_rate": 0.000198997734235711, + "loss": 0.8915, + "step": 139 + }, + { + "epoch": 0.07466666666666667, + "grad_norm": 0.5488497227462218, + "learning_rate": 0.00019897318184719385, + "loss": 0.9196, + "step": 140 + }, + { + "epoch": 0.0752, + "grad_norm": 0.6528111270053313, + "learning_rate": 0.00019894833390973266, + "loss": 0.8861, + "step": 141 + }, + { + "epoch": 0.07573333333333333, + "grad_norm": 0.5279519427408583, + "learning_rate": 0.0001989231904975272, + "loss": 0.87, + "step": 142 + }, + { + "epoch": 0.07626666666666666, + "grad_norm": 0.6185666663340771, + "learning_rate": 0.00019889775168565943, + "loss": 0.8429, + "step": 143 + }, + { + "epoch": 0.0768, + "grad_norm": 0.6171065838197844, + "learning_rate": 0.00019887201755009357, + "loss": 0.8895, + "step": 144 + }, + { + "epoch": 0.07733333333333334, + "grad_norm": 0.49653402021215537, + "learning_rate": 0.00019884598816767563, + "loss": 0.8521, + "step": 145 + }, + { + "epoch": 0.07786666666666667, + "grad_norm": 0.4832143821851556, + "learning_rate": 0.0001988196636161333, + "loss": 0.8467, + "step": 146 + }, + { + "epoch": 0.0784, + "grad_norm": 0.5930464864818148, + "learning_rate": 0.0001987930439740757, + "loss": 0.8942, + "step": 147 + }, + { + "epoch": 0.07893333333333333, + "grad_norm": 0.6406968641975115, + "learning_rate": 0.00019876612932099308, + "loss": 0.9863, + "step": 148 + }, + { + "epoch": 0.07946666666666667, + "grad_norm": 0.5380431693069204, + "learning_rate": 0.0001987389197372567, + "loss": 1.0119, + "step": 149 + }, + { + "epoch": 0.08, + "grad_norm": 0.47697223182531917, + "learning_rate": 0.00019871141530411853, + "loss": 0.8244, + "step": 150 + }, + { + "epoch": 0.08053333333333333, + "grad_norm": 0.6027764246637539, + "learning_rate": 0.00019868361610371097, + "loss": 0.9495, + "step": 151 + }, + { + "epoch": 0.08106666666666666, + "grad_norm": 0.4223573475293651, + "learning_rate": 0.00019865552221904665, + "loss": 0.7734, + "step": 152 + }, + { + "epoch": 0.0816, + "grad_norm": 0.6816218459869196, + "learning_rate": 0.0001986271337340182, + "loss": 1.0553, + "step": 153 + }, + { + "epoch": 0.08213333333333334, + "grad_norm": 0.5288915810363807, + "learning_rate": 0.00019859845073339787, + "loss": 0.8751, + "step": 154 + }, + { + "epoch": 0.08266666666666667, + "grad_norm": 0.4947913647208401, + "learning_rate": 0.00019856947330283752, + "loss": 0.804, + "step": 155 + }, + { + "epoch": 0.0832, + "grad_norm": 0.6904189838107081, + "learning_rate": 0.00019854020152886814, + "loss": 0.9027, + "step": 156 + }, + { + "epoch": 0.08373333333333334, + "grad_norm": 0.5992964636587961, + "learning_rate": 0.0001985106354988997, + "loss": 0.8853, + "step": 157 + }, + { + "epoch": 0.08426666666666667, + "grad_norm": 0.5348320967845114, + "learning_rate": 0.00019848077530122083, + "loss": 0.8075, + "step": 158 + }, + { + "epoch": 0.0848, + "grad_norm": 0.6298550790096009, + "learning_rate": 0.0001984506210249986, + "loss": 0.9349, + "step": 159 + }, + { + "epoch": 0.08533333333333333, + "grad_norm": 1.9331938077792006, + "learning_rate": 0.00019842017276027832, + "loss": 0.8624, + "step": 160 + }, + { + "epoch": 0.08586666666666666, + "grad_norm": 0.5236801578526996, + "learning_rate": 0.00019838943059798304, + "loss": 0.8768, + "step": 161 + }, + { + "epoch": 0.0864, + "grad_norm": 0.4574937323292251, + "learning_rate": 0.00019835839462991361, + "loss": 0.8148, + "step": 162 + }, + { + "epoch": 0.08693333333333333, + "grad_norm": 0.6136683690125613, + "learning_rate": 0.0001983270649487481, + "loss": 0.9091, + "step": 163 + }, + { + "epoch": 0.08746666666666666, + "grad_norm": 0.4514678873411745, + "learning_rate": 0.0001982954416480417, + "loss": 0.7938, + "step": 164 + }, + { + "epoch": 0.088, + "grad_norm": 0.4402770181429307, + "learning_rate": 0.00019826352482222638, + "loss": 0.7222, + "step": 165 + }, + { + "epoch": 0.08853333333333334, + "grad_norm": 0.548887864087565, + "learning_rate": 0.00019823131456661063, + "loss": 0.8248, + "step": 166 + }, + { + "epoch": 0.08906666666666667, + "grad_norm": 0.5562661508499465, + "learning_rate": 0.00019819881097737915, + "loss": 0.9384, + "step": 167 + }, + { + "epoch": 0.0896, + "grad_norm": 0.6033629683305896, + "learning_rate": 0.00019816601415159263, + "loss": 0.9273, + "step": 168 + }, + { + "epoch": 0.09013333333333333, + "grad_norm": 0.8342562362215857, + "learning_rate": 0.00019813292418718732, + "loss": 1.1554, + "step": 169 + }, + { + "epoch": 0.09066666666666667, + "grad_norm": 0.5069447856597911, + "learning_rate": 0.0001980995411829749, + "loss": 0.9092, + "step": 170 + }, + { + "epoch": 0.0912, + "grad_norm": 0.5734351306353567, + "learning_rate": 0.0001980658652386421, + "loss": 0.8657, + "step": 171 + }, + { + "epoch": 0.09173333333333333, + "grad_norm": 0.48674893805867275, + "learning_rate": 0.0001980318964547504, + "loss": 0.8155, + "step": 172 + }, + { + "epoch": 0.09226666666666666, + "grad_norm": 0.5894232151359743, + "learning_rate": 0.0001979976349327357, + "loss": 0.9222, + "step": 173 + }, + { + "epoch": 0.0928, + "grad_norm": 0.6063045680692516, + "learning_rate": 0.00019796308077490817, + "loss": 0.9494, + "step": 174 + }, + { + "epoch": 0.09333333333333334, + "grad_norm": 0.5938132305924131, + "learning_rate": 0.00019792823408445174, + "loss": 0.9302, + "step": 175 + }, + { + "epoch": 0.09386666666666667, + "grad_norm": 0.5913801841305889, + "learning_rate": 0.0001978930949654239, + "loss": 0.8658, + "step": 176 + }, + { + "epoch": 0.0944, + "grad_norm": 0.9822282297619556, + "learning_rate": 0.00019785766352275542, + "loss": 1.0164, + "step": 177 + }, + { + "epoch": 0.09493333333333333, + "grad_norm": 0.6757553741126089, + "learning_rate": 0.00019782193986224995, + "loss": 0.9826, + "step": 178 + }, + { + "epoch": 0.09546666666666667, + "grad_norm": 1.233905974997222, + "learning_rate": 0.00019778592409058378, + "loss": 0.8634, + "step": 179 + }, + { + "epoch": 0.096, + "grad_norm": 0.7286142841214036, + "learning_rate": 0.00019774961631530545, + "loss": 0.9223, + "step": 180 + }, + { + "epoch": 0.09653333333333333, + "grad_norm": 0.45484263786353385, + "learning_rate": 0.0001977130166448355, + "loss": 0.7534, + "step": 181 + }, + { + "epoch": 0.09706666666666666, + "grad_norm": 0.5711816184206053, + "learning_rate": 0.00019767612518846608, + "loss": 0.8735, + "step": 182 + }, + { + "epoch": 0.0976, + "grad_norm": 0.5418657935134991, + "learning_rate": 0.00019763894205636072, + "loss": 0.9992, + "step": 183 + }, + { + "epoch": 0.09813333333333334, + "grad_norm": 0.6185597952660713, + "learning_rate": 0.00019760146735955388, + "loss": 0.8937, + "step": 184 + }, + { + "epoch": 0.09866666666666667, + "grad_norm": 0.516031279053156, + "learning_rate": 0.00019756370120995066, + "loss": 0.9806, + "step": 185 + }, + { + "epoch": 0.0992, + "grad_norm": 0.5599066265752062, + "learning_rate": 0.00019752564372032657, + "loss": 0.8679, + "step": 186 + }, + { + "epoch": 0.09973333333333333, + "grad_norm": 0.5630464137659468, + "learning_rate": 0.000197487295004327, + "loss": 0.8109, + "step": 187 + }, + { + "epoch": 0.10026666666666667, + "grad_norm": 0.4814995495211024, + "learning_rate": 0.00019744865517646706, + "loss": 0.7618, + "step": 188 + }, + { + "epoch": 0.1008, + "grad_norm": 0.4970701269923393, + "learning_rate": 0.00019740972435213115, + "loss": 0.7755, + "step": 189 + }, + { + "epoch": 0.10133333333333333, + "grad_norm": 0.6707479462374684, + "learning_rate": 0.0001973705026475726, + "loss": 0.9362, + "step": 190 + }, + { + "epoch": 0.10186666666666666, + "grad_norm": 0.645234890474023, + "learning_rate": 0.00019733099017991341, + "loss": 0.9033, + "step": 191 + }, + { + "epoch": 0.1024, + "grad_norm": 0.5395230204710006, + "learning_rate": 0.00019729118706714375, + "loss": 0.7949, + "step": 192 + }, + { + "epoch": 0.10293333333333334, + "grad_norm": 0.550382701125016, + "learning_rate": 0.0001972510934281218, + "loss": 0.8044, + "step": 193 + }, + { + "epoch": 0.10346666666666667, + "grad_norm": 0.5418835757233681, + "learning_rate": 0.00019721070938257324, + "loss": 0.9103, + "step": 194 + }, + { + "epoch": 0.104, + "grad_norm": 0.5149849534792273, + "learning_rate": 0.00019717003505109095, + "loss": 0.8234, + "step": 195 + }, + { + "epoch": 0.10453333333333334, + "grad_norm": 0.6222034792808916, + "learning_rate": 0.0001971290705551347, + "loss": 0.9314, + "step": 196 + }, + { + "epoch": 0.10506666666666667, + "grad_norm": 0.5020387021600372, + "learning_rate": 0.00019708781601703065, + "loss": 0.8834, + "step": 197 + }, + { + "epoch": 0.1056, + "grad_norm": 0.44609067118781853, + "learning_rate": 0.00019704627155997108, + "loss": 0.7588, + "step": 198 + }, + { + "epoch": 0.10613333333333333, + "grad_norm": 0.59475354515924, + "learning_rate": 0.00019700443730801413, + "loss": 0.8931, + "step": 199 + }, + { + "epoch": 0.10666666666666667, + "grad_norm": 0.6103536228444102, + "learning_rate": 0.00019696231338608316, + "loss": 0.967, + "step": 200 + }, + { + "epoch": 0.1072, + "grad_norm": 0.4276077516316479, + "learning_rate": 0.00019691989991996663, + "loss": 0.7639, + "step": 201 + }, + { + "epoch": 0.10773333333333333, + "grad_norm": 0.42679574249455227, + "learning_rate": 0.00019687719703631755, + "loss": 0.7505, + "step": 202 + }, + { + "epoch": 0.10826666666666666, + "grad_norm": 0.5195748925315388, + "learning_rate": 0.00019683420486265327, + "loss": 0.892, + "step": 203 + }, + { + "epoch": 0.1088, + "grad_norm": 0.5430095858621583, + "learning_rate": 0.0001967909235273549, + "loss": 0.8572, + "step": 204 + }, + { + "epoch": 0.10933333333333334, + "grad_norm": 0.5490698318145699, + "learning_rate": 0.0001967473531596671, + "loss": 0.7836, + "step": 205 + }, + { + "epoch": 0.10986666666666667, + "grad_norm": 0.5810676797287423, + "learning_rate": 0.0001967034938896976, + "loss": 0.9217, + "step": 206 + }, + { + "epoch": 0.1104, + "grad_norm": 0.5397056603391304, + "learning_rate": 0.00019665934584841682, + "loss": 0.8318, + "step": 207 + }, + { + "epoch": 0.11093333333333333, + "grad_norm": 0.6852953334227522, + "learning_rate": 0.0001966149091676575, + "loss": 0.987, + "step": 208 + }, + { + "epoch": 0.11146666666666667, + "grad_norm": 0.5397841521812882, + "learning_rate": 0.00019657018398011434, + "loss": 0.8567, + "step": 209 + }, + { + "epoch": 0.112, + "grad_norm": 0.5558183153152181, + "learning_rate": 0.00019652517041934356, + "loss": 0.889, + "step": 210 + }, + { + "epoch": 0.11253333333333333, + "grad_norm": 0.5323335553062883, + "learning_rate": 0.00019647986861976246, + "loss": 0.8292, + "step": 211 + }, + { + "epoch": 0.11306666666666666, + "grad_norm": 0.6359392350933629, + "learning_rate": 0.0001964342787166491, + "loss": 0.8789, + "step": 212 + }, + { + "epoch": 0.1136, + "grad_norm": 0.5157272312694805, + "learning_rate": 0.00019638840084614182, + "loss": 0.8382, + "step": 213 + }, + { + "epoch": 0.11413333333333334, + "grad_norm": 0.4955543768095666, + "learning_rate": 0.0001963422351452389, + "loss": 0.827, + "step": 214 + }, + { + "epoch": 0.11466666666666667, + "grad_norm": 0.5185766880013812, + "learning_rate": 0.0001962957817517982, + "loss": 0.8525, + "step": 215 + }, + { + "epoch": 0.1152, + "grad_norm": 0.5799216700682058, + "learning_rate": 0.00019624904080453655, + "loss": 0.8949, + "step": 216 + }, + { + "epoch": 0.11573333333333333, + "grad_norm": 0.5746162775961909, + "learning_rate": 0.00019620201244302952, + "loss": 0.9068, + "step": 217 + }, + { + "epoch": 0.11626666666666667, + "grad_norm": 0.5942962281260388, + "learning_rate": 0.00019615469680771096, + "loss": 0.9105, + "step": 218 + }, + { + "epoch": 0.1168, + "grad_norm": 0.4822965949841608, + "learning_rate": 0.00019610709403987246, + "loss": 0.8006, + "step": 219 + }, + { + "epoch": 0.11733333333333333, + "grad_norm": 0.5285129624324617, + "learning_rate": 0.00019605920428166323, + "loss": 0.8916, + "step": 220 + }, + { + "epoch": 0.11786666666666666, + "grad_norm": 0.578767555547315, + "learning_rate": 0.00019601102767608923, + "loss": 0.8295, + "step": 221 + }, + { + "epoch": 0.1184, + "grad_norm": 0.5726253586422992, + "learning_rate": 0.00019596256436701324, + "loss": 0.7637, + "step": 222 + }, + { + "epoch": 0.11893333333333334, + "grad_norm": 0.5803865400279506, + "learning_rate": 0.00019591381449915397, + "loss": 0.847, + "step": 223 + }, + { + "epoch": 0.11946666666666667, + "grad_norm": 0.6238889881153514, + "learning_rate": 0.00019586477821808597, + "loss": 0.9309, + "step": 224 + }, + { + "epoch": 0.12, + "grad_norm": 0.5693911379870041, + "learning_rate": 0.000195815455670239, + "loss": 0.8328, + "step": 225 + }, + { + "epoch": 0.12053333333333334, + "grad_norm": 0.6019961384633078, + "learning_rate": 0.00019576584700289768, + "loss": 0.739, + "step": 226 + }, + { + "epoch": 0.12106666666666667, + "grad_norm": 0.49021989674917477, + "learning_rate": 0.00019571595236420102, + "loss": 0.8646, + "step": 227 + }, + { + "epoch": 0.1216, + "grad_norm": 0.5152822790823023, + "learning_rate": 0.00019566577190314197, + "loss": 0.8294, + "step": 228 + }, + { + "epoch": 0.12213333333333333, + "grad_norm": 0.49830881392059656, + "learning_rate": 0.00019561530576956703, + "loss": 0.849, + "step": 229 + }, + { + "epoch": 0.12266666666666666, + "grad_norm": 0.6353438291466692, + "learning_rate": 0.00019556455411417573, + "loss": 0.8289, + "step": 230 + }, + { + "epoch": 0.1232, + "grad_norm": 0.5496078604729235, + "learning_rate": 0.0001955135170885202, + "loss": 0.855, + "step": 231 + }, + { + "epoch": 0.12373333333333333, + "grad_norm": 0.5328663723300848, + "learning_rate": 0.00019546219484500475, + "loss": 0.8204, + "step": 232 + }, + { + "epoch": 0.12426666666666666, + "grad_norm": 0.6044824618670902, + "learning_rate": 0.00019541058753688538, + "loss": 0.9029, + "step": 233 + }, + { + "epoch": 0.1248, + "grad_norm": 0.562340865698981, + "learning_rate": 0.00019535869531826937, + "loss": 0.8995, + "step": 234 + }, + { + "epoch": 0.12533333333333332, + "grad_norm": 0.5694336641591617, + "learning_rate": 0.00019530651834411474, + "loss": 0.8111, + "step": 235 + }, + { + "epoch": 0.12586666666666665, + "grad_norm": 0.5493525419976079, + "learning_rate": 0.00019525405677022989, + "loss": 0.8504, + "step": 236 + }, + { + "epoch": 0.1264, + "grad_norm": 0.629488795640096, + "learning_rate": 0.00019520131075327298, + "loss": 0.8496, + "step": 237 + }, + { + "epoch": 0.12693333333333334, + "grad_norm": 0.4705893683305386, + "learning_rate": 0.0001951482804507517, + "loss": 0.7404, + "step": 238 + }, + { + "epoch": 0.12746666666666667, + "grad_norm": 0.514550204082626, + "learning_rate": 0.00019509496602102252, + "loss": 0.8082, + "step": 239 + }, + { + "epoch": 0.128, + "grad_norm": 0.6060127960470112, + "learning_rate": 0.00019504136762329047, + "loss": 0.8902, + "step": 240 + }, + { + "epoch": 0.12853333333333333, + "grad_norm": 0.4152489103408303, + "learning_rate": 0.00019498748541760846, + "loss": 0.8229, + "step": 241 + }, + { + "epoch": 0.12906666666666666, + "grad_norm": 0.45642913586332107, + "learning_rate": 0.0001949333195648769, + "loss": 0.7538, + "step": 242 + }, + { + "epoch": 0.1296, + "grad_norm": 0.5557390666476755, + "learning_rate": 0.00019487887022684336, + "loss": 0.8656, + "step": 243 + }, + { + "epoch": 0.13013333333333332, + "grad_norm": 0.5483083085505757, + "learning_rate": 0.00019482413756610173, + "loss": 0.8621, + "step": 244 + }, + { + "epoch": 0.13066666666666665, + "grad_norm": 0.6217623928148703, + "learning_rate": 0.0001947691217460921, + "loss": 0.9082, + "step": 245 + }, + { + "epoch": 0.1312, + "grad_norm": 0.5544728011159186, + "learning_rate": 0.00019471382293110003, + "loss": 0.9426, + "step": 246 + }, + { + "epoch": 0.13173333333333334, + "grad_norm": 0.6386585001110127, + "learning_rate": 0.00019465824128625617, + "loss": 0.8571, + "step": 247 + }, + { + "epoch": 0.13226666666666667, + "grad_norm": 0.5364273119583183, + "learning_rate": 0.00019460237697753577, + "loss": 0.8107, + "step": 248 + }, + { + "epoch": 0.1328, + "grad_norm": 0.4912213309565991, + "learning_rate": 0.00019454623017175812, + "loss": 0.8051, + "step": 249 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 0.5891785869772029, + "learning_rate": 0.00019448980103658613, + "loss": 0.915, + "step": 250 + }, + { + "epoch": 0.13386666666666666, + "grad_norm": 0.48733214475456743, + "learning_rate": 0.0001944330897405257, + "loss": 0.8577, + "step": 251 + }, + { + "epoch": 0.1344, + "grad_norm": 0.5799548544326986, + "learning_rate": 0.00019437609645292546, + "loss": 0.825, + "step": 252 + }, + { + "epoch": 0.13493333333333332, + "grad_norm": 0.4788858732810099, + "learning_rate": 0.00019431882134397598, + "loss": 0.9298, + "step": 253 + }, + { + "epoch": 0.13546666666666668, + "grad_norm": 0.5883545906221429, + "learning_rate": 0.00019426126458470936, + "loss": 0.9316, + "step": 254 + }, + { + "epoch": 0.136, + "grad_norm": 0.4904291459104027, + "learning_rate": 0.0001942034263469989, + "loss": 0.8268, + "step": 255 + }, + { + "epoch": 0.13653333333333334, + "grad_norm": 0.7115946528102219, + "learning_rate": 0.00019414530680355837, + "loss": 1.026, + "step": 256 + }, + { + "epoch": 0.13706666666666667, + "grad_norm": 0.40575516082874447, + "learning_rate": 0.00019408690612794148, + "loss": 0.7482, + "step": 257 + }, + { + "epoch": 0.1376, + "grad_norm": 0.5403346585965954, + "learning_rate": 0.00019402822449454153, + "loss": 0.9077, + "step": 258 + }, + { + "epoch": 0.13813333333333333, + "grad_norm": 0.6662237887549459, + "learning_rate": 0.00019396926207859084, + "loss": 0.9702, + "step": 259 + }, + { + "epoch": 0.13866666666666666, + "grad_norm": 0.6942651389363136, + "learning_rate": 0.0001939100190561601, + "loss": 0.9222, + "step": 260 + }, + { + "epoch": 0.1392, + "grad_norm": 0.6579007152763529, + "learning_rate": 0.00019385049560415794, + "loss": 0.9697, + "step": 261 + }, + { + "epoch": 0.13973333333333332, + "grad_norm": 0.5496149330120147, + "learning_rate": 0.0001937906919003304, + "loss": 0.9229, + "step": 262 + }, + { + "epoch": 0.14026666666666668, + "grad_norm": 0.4490156799705011, + "learning_rate": 0.00019373060812326052, + "loss": 0.7789, + "step": 263 + }, + { + "epoch": 0.1408, + "grad_norm": 0.4781576375838312, + "learning_rate": 0.00019367024445236754, + "loss": 0.8383, + "step": 264 + }, + { + "epoch": 0.14133333333333334, + "grad_norm": 0.4795905566479139, + "learning_rate": 0.00019360960106790643, + "loss": 0.8123, + "step": 265 + }, + { + "epoch": 0.14186666666666667, + "grad_norm": 0.8181773682867574, + "learning_rate": 0.0001935486781509677, + "loss": 1.0112, + "step": 266 + }, + { + "epoch": 0.1424, + "grad_norm": 0.522706657600922, + "learning_rate": 0.00019348747588347637, + "loss": 0.8527, + "step": 267 + }, + { + "epoch": 0.14293333333333333, + "grad_norm": 0.5135054349379268, + "learning_rate": 0.00019342599444819168, + "loss": 0.7414, + "step": 268 + }, + { + "epoch": 0.14346666666666666, + "grad_norm": 0.5130302385142596, + "learning_rate": 0.00019336423402870653, + "loss": 0.7821, + "step": 269 + }, + { + "epoch": 0.144, + "grad_norm": 0.5547056633661651, + "learning_rate": 0.00019330219480944694, + "loss": 0.9122, + "step": 270 + }, + { + "epoch": 0.14453333333333335, + "grad_norm": 0.6441345254417032, + "learning_rate": 0.0001932398769756714, + "loss": 1.0426, + "step": 271 + }, + { + "epoch": 0.14506666666666668, + "grad_norm": 0.5273823414095974, + "learning_rate": 0.0001931772807134704, + "loss": 0.9104, + "step": 272 + }, + { + "epoch": 0.1456, + "grad_norm": 0.46069046608844544, + "learning_rate": 0.00019311440620976597, + "loss": 0.7569, + "step": 273 + }, + { + "epoch": 0.14613333333333334, + "grad_norm": 0.5381318342085369, + "learning_rate": 0.00019305125365231084, + "loss": 0.9187, + "step": 274 + }, + { + "epoch": 0.14666666666666667, + "grad_norm": 0.7487514917933524, + "learning_rate": 0.00019298782322968815, + "loss": 1.0041, + "step": 275 + }, + { + "epoch": 0.1472, + "grad_norm": 0.47966687804112507, + "learning_rate": 0.0001929241151313108, + "loss": 0.8117, + "step": 276 + }, + { + "epoch": 0.14773333333333333, + "grad_norm": 0.6273165405318113, + "learning_rate": 0.0001928601295474208, + "loss": 1.0189, + "step": 277 + }, + { + "epoch": 0.14826666666666666, + "grad_norm": 0.5670914286563735, + "learning_rate": 0.00019279586666908884, + "loss": 0.8833, + "step": 278 + }, + { + "epoch": 0.1488, + "grad_norm": 0.7232675325632234, + "learning_rate": 0.00019273132668821364, + "loss": 0.8313, + "step": 279 + }, + { + "epoch": 0.14933333333333335, + "grad_norm": 0.5121007508125893, + "learning_rate": 0.00019266650979752136, + "loss": 0.8204, + "step": 280 + }, + { + "epoch": 0.14986666666666668, + "grad_norm": 0.5128296692744078, + "learning_rate": 0.00019260141619056507, + "loss": 0.8674, + "step": 281 + }, + { + "epoch": 0.1504, + "grad_norm": 0.5204816772309242, + "learning_rate": 0.00019253604606172417, + "loss": 0.8232, + "step": 282 + }, + { + "epoch": 0.15093333333333334, + "grad_norm": 0.5473040774478453, + "learning_rate": 0.0001924703996062038, + "loss": 0.8777, + "step": 283 + }, + { + "epoch": 0.15146666666666667, + "grad_norm": 0.6102177275322413, + "learning_rate": 0.0001924044770200342, + "loss": 1.04, + "step": 284 + }, + { + "epoch": 0.152, + "grad_norm": 0.5382713447028439, + "learning_rate": 0.00019233827850007027, + "loss": 0.759, + "step": 285 + }, + { + "epoch": 0.15253333333333333, + "grad_norm": 0.5637183014231405, + "learning_rate": 0.0001922718042439908, + "loss": 0.7745, + "step": 286 + }, + { + "epoch": 0.15306666666666666, + "grad_norm": 0.5005955166838629, + "learning_rate": 0.000192205054450298, + "loss": 0.8215, + "step": 287 + }, + { + "epoch": 0.1536, + "grad_norm": 0.5647214750330327, + "learning_rate": 0.00019213802931831696, + "loss": 0.8736, + "step": 288 + }, + { + "epoch": 0.15413333333333334, + "grad_norm": 0.5430735915648874, + "learning_rate": 0.00019207072904819486, + "loss": 0.8685, + "step": 289 + }, + { + "epoch": 0.15466666666666667, + "grad_norm": 0.4985562198119292, + "learning_rate": 0.00019200315384090044, + "loss": 0.8372, + "step": 290 + }, + { + "epoch": 0.1552, + "grad_norm": 0.5410096511416426, + "learning_rate": 0.00019193530389822363, + "loss": 0.9444, + "step": 291 + }, + { + "epoch": 0.15573333333333333, + "grad_norm": 0.5400865975872126, + "learning_rate": 0.00019186717942277462, + "loss": 0.828, + "step": 292 + }, + { + "epoch": 0.15626666666666666, + "grad_norm": 0.5199681853342951, + "learning_rate": 0.00019179878061798347, + "loss": 0.7628, + "step": 293 + }, + { + "epoch": 0.1568, + "grad_norm": 0.6263122336192022, + "learning_rate": 0.00019173010768809933, + "loss": 0.9499, + "step": 294 + }, + { + "epoch": 0.15733333333333333, + "grad_norm": 0.5515913599293975, + "learning_rate": 0.00019166116083819002, + "loss": 0.902, + "step": 295 + }, + { + "epoch": 0.15786666666666666, + "grad_norm": 0.540077229662397, + "learning_rate": 0.00019159194027414128, + "loss": 0.8651, + "step": 296 + }, + { + "epoch": 0.1584, + "grad_norm": 0.5105953773814097, + "learning_rate": 0.0001915224462026563, + "loss": 0.8124, + "step": 297 + }, + { + "epoch": 0.15893333333333334, + "grad_norm": 0.5885775121761744, + "learning_rate": 0.00019145267883125482, + "loss": 0.8779, + "step": 298 + }, + { + "epoch": 0.15946666666666667, + "grad_norm": 0.5468317477972652, + "learning_rate": 0.00019138263836827288, + "loss": 0.9038, + "step": 299 + }, + { + "epoch": 0.16, + "grad_norm": 0.4774898883273422, + "learning_rate": 0.00019131232502286188, + "loss": 0.7347, + "step": 300 + }, + { + "epoch": 0.16053333333333333, + "grad_norm": 0.6546369687247534, + "learning_rate": 0.00019124173900498818, + "loss": 1.074, + "step": 301 + }, + { + "epoch": 0.16106666666666666, + "grad_norm": 0.5746219704096867, + "learning_rate": 0.00019117088052543233, + "loss": 0.8785, + "step": 302 + }, + { + "epoch": 0.1616, + "grad_norm": 0.512783055857806, + "learning_rate": 0.0001910997497957885, + "loss": 0.8484, + "step": 303 + }, + { + "epoch": 0.16213333333333332, + "grad_norm": 0.48318252093811825, + "learning_rate": 0.00019102834702846387, + "loss": 0.8693, + "step": 304 + }, + { + "epoch": 0.16266666666666665, + "grad_norm": 0.4552038052381454, + "learning_rate": 0.0001909566724366779, + "loss": 0.8363, + "step": 305 + }, + { + "epoch": 0.1632, + "grad_norm": 0.48773217916485373, + "learning_rate": 0.00019088472623446183, + "loss": 0.799, + "step": 306 + }, + { + "epoch": 0.16373333333333334, + "grad_norm": 0.6014093204013539, + "learning_rate": 0.00019081250863665794, + "loss": 0.8907, + "step": 307 + }, + { + "epoch": 0.16426666666666667, + "grad_norm": 0.6155458246007832, + "learning_rate": 0.0001907400198589189, + "loss": 0.9374, + "step": 308 + }, + { + "epoch": 0.1648, + "grad_norm": 0.42801125846418625, + "learning_rate": 0.00019066726011770726, + "loss": 0.7845, + "step": 309 + }, + { + "epoch": 0.16533333333333333, + "grad_norm": 0.4949972099285363, + "learning_rate": 0.00019059422963029464, + "loss": 0.872, + "step": 310 + }, + { + "epoch": 0.16586666666666666, + "grad_norm": 0.5035166804923811, + "learning_rate": 0.0001905209286147611, + "loss": 0.9066, + "step": 311 + }, + { + "epoch": 0.1664, + "grad_norm": 0.47353741325219184, + "learning_rate": 0.0001904473572899947, + "loss": 0.7517, + "step": 312 + }, + { + "epoch": 0.16693333333333332, + "grad_norm": 0.5736697688223861, + "learning_rate": 0.0001903735158756905, + "loss": 0.958, + "step": 313 + }, + { + "epoch": 0.16746666666666668, + "grad_norm": 0.53247859954624, + "learning_rate": 0.0001902994045923502, + "loss": 0.8478, + "step": 314 + }, + { + "epoch": 0.168, + "grad_norm": 0.6010734943757549, + "learning_rate": 0.00019022502366128135, + "loss": 0.8478, + "step": 315 + }, + { + "epoch": 0.16853333333333334, + "grad_norm": 0.5121525604394572, + "learning_rate": 0.0001901503733045967, + "loss": 0.7919, + "step": 316 + }, + { + "epoch": 0.16906666666666667, + "grad_norm": 0.46694379864441826, + "learning_rate": 0.00019007545374521355, + "loss": 0.7731, + "step": 317 + }, + { + "epoch": 0.1696, + "grad_norm": 0.5485906739730657, + "learning_rate": 0.00019000026520685302, + "loss": 0.8151, + "step": 318 + }, + { + "epoch": 0.17013333333333333, + "grad_norm": 0.4585234757479113, + "learning_rate": 0.00018992480791403958, + "loss": 0.7339, + "step": 319 + }, + { + "epoch": 0.17066666666666666, + "grad_norm": 0.5940002138588427, + "learning_rate": 0.0001898490820921001, + "loss": 0.8679, + "step": 320 + }, + { + "epoch": 0.1712, + "grad_norm": 0.6057502149185376, + "learning_rate": 0.0001897730879671634, + "loss": 0.8719, + "step": 321 + }, + { + "epoch": 0.17173333333333332, + "grad_norm": 0.5810819000531806, + "learning_rate": 0.0001896968257661595, + "loss": 0.9198, + "step": 322 + }, + { + "epoch": 0.17226666666666668, + "grad_norm": 0.6083936326137559, + "learning_rate": 0.00018962029571681886, + "loss": 0.86, + "step": 323 + }, + { + "epoch": 0.1728, + "grad_norm": 0.6865828858997022, + "learning_rate": 0.00018954349804767184, + "loss": 0.9054, + "step": 324 + }, + { + "epoch": 0.17333333333333334, + "grad_norm": 0.566563080612365, + "learning_rate": 0.00018946643298804793, + "loss": 0.8687, + "step": 325 + }, + { + "epoch": 0.17386666666666667, + "grad_norm": 0.5006268810917645, + "learning_rate": 0.00018938910076807513, + "loss": 0.7906, + "step": 326 + }, + { + "epoch": 0.1744, + "grad_norm": 0.5018590669905312, + "learning_rate": 0.00018931150161867916, + "loss": 0.8252, + "step": 327 + }, + { + "epoch": 0.17493333333333333, + "grad_norm": 0.5965473876338885, + "learning_rate": 0.0001892336357715829, + "loss": 0.986, + "step": 328 + }, + { + "epoch": 0.17546666666666666, + "grad_norm": 0.4330352467359094, + "learning_rate": 0.0001891555034593055, + "loss": 0.7868, + "step": 329 + }, + { + "epoch": 0.176, + "grad_norm": 0.6268359083673914, + "learning_rate": 0.00018907710491516199, + "loss": 0.9152, + "step": 330 + }, + { + "epoch": 0.17653333333333332, + "grad_norm": 0.580180599972275, + "learning_rate": 0.00018899844037326225, + "loss": 0.885, + "step": 331 + }, + { + "epoch": 0.17706666666666668, + "grad_norm": 0.4364646368352817, + "learning_rate": 0.0001889195100685106, + "loss": 0.7072, + "step": 332 + }, + { + "epoch": 0.1776, + "grad_norm": 0.5736322025691719, + "learning_rate": 0.0001888403142366049, + "loss": 0.7942, + "step": 333 + }, + { + "epoch": 0.17813333333333334, + "grad_norm": 0.6768535971383288, + "learning_rate": 0.00018876085311403593, + "loss": 0.9545, + "step": 334 + }, + { + "epoch": 0.17866666666666667, + "grad_norm": 0.4679683938709085, + "learning_rate": 0.00018868112693808665, + "loss": 0.7854, + "step": 335 + }, + { + "epoch": 0.1792, + "grad_norm": 0.4719121618259123, + "learning_rate": 0.00018860113594683148, + "loss": 0.7323, + "step": 336 + }, + { + "epoch": 0.17973333333333333, + "grad_norm": 0.4632021599885029, + "learning_rate": 0.00018852088037913577, + "loss": 0.7609, + "step": 337 + }, + { + "epoch": 0.18026666666666666, + "grad_norm": 0.5804129077627767, + "learning_rate": 0.0001884403604746547, + "loss": 0.9667, + "step": 338 + }, + { + "epoch": 0.1808, + "grad_norm": 0.4169465814255522, + "learning_rate": 0.00018835957647383303, + "loss": 0.7773, + "step": 339 + }, + { + "epoch": 0.18133333333333335, + "grad_norm": 0.6055416889203082, + "learning_rate": 0.00018827852861790398, + "loss": 0.9501, + "step": 340 + }, + { + "epoch": 0.18186666666666668, + "grad_norm": 0.47097022652856235, + "learning_rate": 0.00018819721714888877, + "loss": 0.8522, + "step": 341 + }, + { + "epoch": 0.1824, + "grad_norm": 0.4711804717392996, + "learning_rate": 0.00018811564230959588, + "loss": 0.729, + "step": 342 + }, + { + "epoch": 0.18293333333333334, + "grad_norm": 0.5833928473869843, + "learning_rate": 0.00018803380434362, + "loss": 0.9746, + "step": 343 + }, + { + "epoch": 0.18346666666666667, + "grad_norm": 0.43197057031279645, + "learning_rate": 0.0001879517034953418, + "loss": 0.7755, + "step": 344 + }, + { + "epoch": 0.184, + "grad_norm": 0.6187166684792078, + "learning_rate": 0.00018786934000992688, + "loss": 0.85, + "step": 345 + }, + { + "epoch": 0.18453333333333333, + "grad_norm": 0.4964059310236803, + "learning_rate": 0.00018778671413332513, + "loss": 0.7622, + "step": 346 + }, + { + "epoch": 0.18506666666666666, + "grad_norm": 0.503061098176286, + "learning_rate": 0.00018770382611226987, + "loss": 0.8098, + "step": 347 + }, + { + "epoch": 0.1856, + "grad_norm": 0.4621277025640317, + "learning_rate": 0.00018762067619427746, + "loss": 0.786, + "step": 348 + }, + { + "epoch": 0.18613333333333335, + "grad_norm": 0.5436191215591101, + "learning_rate": 0.000187537264627646, + "loss": 0.9056, + "step": 349 + }, + { + "epoch": 0.18666666666666668, + "grad_norm": 0.48810332750586005, + "learning_rate": 0.00018745359166145523, + "loss": 0.7994, + "step": 350 + }, + { + "epoch": 0.1872, + "grad_norm": 0.473568239322871, + "learning_rate": 0.00018736965754556528, + "loss": 0.8444, + "step": 351 + }, + { + "epoch": 0.18773333333333334, + "grad_norm": 0.5586622378076873, + "learning_rate": 0.00018728546253061614, + "loss": 0.8522, + "step": 352 + }, + { + "epoch": 0.18826666666666667, + "grad_norm": 0.4281119912363646, + "learning_rate": 0.00018720100686802694, + "loss": 0.7804, + "step": 353 + }, + { + "epoch": 0.1888, + "grad_norm": 0.5715157096974114, + "learning_rate": 0.00018711629080999504, + "loss": 0.8718, + "step": 354 + }, + { + "epoch": 0.18933333333333333, + "grad_norm": 0.5316707554900694, + "learning_rate": 0.00018703131460949554, + "loss": 0.8543, + "step": 355 + }, + { + "epoch": 0.18986666666666666, + "grad_norm": 0.5438502162254243, + "learning_rate": 0.0001869460785202802, + "loss": 0.8143, + "step": 356 + }, + { + "epoch": 0.1904, + "grad_norm": 0.4961794864266288, + "learning_rate": 0.00018686058279687698, + "loss": 0.7625, + "step": 357 + }, + { + "epoch": 0.19093333333333334, + "grad_norm": 0.5627823210958318, + "learning_rate": 0.00018677482769458904, + "loss": 0.8573, + "step": 358 + }, + { + "epoch": 0.19146666666666667, + "grad_norm": 0.8146216841274553, + "learning_rate": 0.00018668881346949417, + "loss": 0.9443, + "step": 359 + }, + { + "epoch": 0.192, + "grad_norm": 0.6009169654207878, + "learning_rate": 0.00018660254037844388, + "loss": 0.9283, + "step": 360 + }, + { + "epoch": 0.19253333333333333, + "grad_norm": 0.41578777077511664, + "learning_rate": 0.00018651600867906272, + "loss": 0.7357, + "step": 361 + }, + { + "epoch": 0.19306666666666666, + "grad_norm": 0.666050224471447, + "learning_rate": 0.00018642921862974742, + "loss": 0.8986, + "step": 362 + }, + { + "epoch": 0.1936, + "grad_norm": 0.47970987588672004, + "learning_rate": 0.00018634217048966637, + "loss": 0.8287, + "step": 363 + }, + { + "epoch": 0.19413333333333332, + "grad_norm": 0.479860868749402, + "learning_rate": 0.00018625486451875843, + "loss": 0.8108, + "step": 364 + }, + { + "epoch": 0.19466666666666665, + "grad_norm": 0.5884506522951436, + "learning_rate": 0.0001861673009777325, + "loss": 0.9642, + "step": 365 + }, + { + "epoch": 0.1952, + "grad_norm": 0.5510794845241416, + "learning_rate": 0.0001860794801280666, + "loss": 0.8743, + "step": 366 + }, + { + "epoch": 0.19573333333333334, + "grad_norm": 0.5795214491241171, + "learning_rate": 0.00018599140223200716, + "loss": 0.8866, + "step": 367 + }, + { + "epoch": 0.19626666666666667, + "grad_norm": 0.5344279213250357, + "learning_rate": 0.0001859030675525681, + "loss": 0.75, + "step": 368 + }, + { + "epoch": 0.1968, + "grad_norm": 0.5005010601614996, + "learning_rate": 0.0001858144763535302, + "loss": 0.838, + "step": 369 + }, + { + "epoch": 0.19733333333333333, + "grad_norm": 0.6184079702216735, + "learning_rate": 0.0001857256288994402, + "loss": 0.9063, + "step": 370 + }, + { + "epoch": 0.19786666666666666, + "grad_norm": 0.5707216840794579, + "learning_rate": 0.00018563652545561013, + "loss": 0.9228, + "step": 371 + }, + { + "epoch": 0.1984, + "grad_norm": 0.45330242748343846, + "learning_rate": 0.0001855471662881164, + "loss": 0.7376, + "step": 372 + }, + { + "epoch": 0.19893333333333332, + "grad_norm": 0.6088262432419314, + "learning_rate": 0.000185457551663799, + "loss": 0.9296, + "step": 373 + }, + { + "epoch": 0.19946666666666665, + "grad_norm": 0.4531406961541151, + "learning_rate": 0.00018536768185026083, + "loss": 0.7553, + "step": 374 + }, + { + "epoch": 0.2, + "grad_norm": 0.5647055757625112, + "learning_rate": 0.00018527755711586678, + "loss": 0.8885, + "step": 375 + }, + { + "epoch": 0.20053333333333334, + "grad_norm": 0.5882369341941862, + "learning_rate": 0.00018518717772974302, + "loss": 0.8576, + "step": 376 + }, + { + "epoch": 0.20106666666666667, + "grad_norm": 0.5400256164599937, + "learning_rate": 0.00018509654396177609, + "loss": 0.8778, + "step": 377 + }, + { + "epoch": 0.2016, + "grad_norm": 0.676537394010591, + "learning_rate": 0.00018500565608261214, + "loss": 0.9723, + "step": 378 + }, + { + "epoch": 0.20213333333333333, + "grad_norm": 0.5167315801994107, + "learning_rate": 0.00018491451436365627, + "loss": 0.9023, + "step": 379 + }, + { + "epoch": 0.20266666666666666, + "grad_norm": 0.47234634295211225, + "learning_rate": 0.0001848231190770714, + "loss": 0.7748, + "step": 380 + }, + { + "epoch": 0.2032, + "grad_norm": 0.5752326204538213, + "learning_rate": 0.00018473147049577774, + "loss": 0.8052, + "step": 381 + }, + { + "epoch": 0.20373333333333332, + "grad_norm": 0.48278342600159446, + "learning_rate": 0.00018463956889345194, + "loss": 0.8469, + "step": 382 + }, + { + "epoch": 0.20426666666666668, + "grad_norm": 0.5747001961815094, + "learning_rate": 0.00018454741454452603, + "loss": 0.884, + "step": 383 + }, + { + "epoch": 0.2048, + "grad_norm": 0.63448866592384, + "learning_rate": 0.00018445500772418697, + "loss": 0.8585, + "step": 384 + }, + { + "epoch": 0.20533333333333334, + "grad_norm": 0.4819688568653652, + "learning_rate": 0.00018436234870837547, + "loss": 0.7211, + "step": 385 + }, + { + "epoch": 0.20586666666666667, + "grad_norm": 0.5435821237712569, + "learning_rate": 0.00018426943777378552, + "loss": 0.8053, + "step": 386 + }, + { + "epoch": 0.2064, + "grad_norm": 0.4670677897534823, + "learning_rate": 0.00018417627519786315, + "loss": 0.8869, + "step": 387 + }, + { + "epoch": 0.20693333333333333, + "grad_norm": 0.5443572205211442, + "learning_rate": 0.00018408286125880604, + "loss": 0.9102, + "step": 388 + }, + { + "epoch": 0.20746666666666666, + "grad_norm": 0.5908846609114969, + "learning_rate": 0.00018398919623556238, + "loss": 0.7811, + "step": 389 + }, + { + "epoch": 0.208, + "grad_norm": 0.5765923750964485, + "learning_rate": 0.00018389528040783012, + "loss": 1.0941, + "step": 390 + }, + { + "epoch": 0.20853333333333332, + "grad_norm": 0.5138817115120994, + "learning_rate": 0.0001838011140560562, + "loss": 0.7936, + "step": 391 + }, + { + "epoch": 0.20906666666666668, + "grad_norm": 0.5995044651962633, + "learning_rate": 0.00018370669746143564, + "loss": 0.8256, + "step": 392 + }, + { + "epoch": 0.2096, + "grad_norm": 0.5285618582126465, + "learning_rate": 0.00018361203090591071, + "loss": 0.8912, + "step": 393 + }, + { + "epoch": 0.21013333333333334, + "grad_norm": 0.6023859216088946, + "learning_rate": 0.0001835171146721701, + "loss": 0.7781, + "step": 394 + }, + { + "epoch": 0.21066666666666667, + "grad_norm": 0.5007290710095313, + "learning_rate": 0.00018342194904364813, + "loss": 0.7527, + "step": 395 + }, + { + "epoch": 0.2112, + "grad_norm": 0.5228675855644663, + "learning_rate": 0.00018332653430452376, + "loss": 0.8552, + "step": 396 + }, + { + "epoch": 0.21173333333333333, + "grad_norm": 0.5496498789032858, + "learning_rate": 0.00018323087073971993, + "loss": 0.8445, + "step": 397 + }, + { + "epoch": 0.21226666666666666, + "grad_norm": 0.5220544188374293, + "learning_rate": 0.00018313495863490258, + "loss": 0.8247, + "step": 398 + }, + { + "epoch": 0.2128, + "grad_norm": 0.5067334422083659, + "learning_rate": 0.00018303879827647975, + "loss": 0.8302, + "step": 399 + }, + { + "epoch": 0.21333333333333335, + "grad_norm": 0.4691172286804912, + "learning_rate": 0.00018294238995160094, + "loss": 0.7837, + "step": 400 + }, + { + "epoch": 0.21386666666666668, + "grad_norm": 0.5330880084610178, + "learning_rate": 0.00018284573394815597, + "loss": 0.8542, + "step": 401 + }, + { + "epoch": 0.2144, + "grad_norm": 0.4642122947986411, + "learning_rate": 0.00018274883055477436, + "loss": 0.7679, + "step": 402 + }, + { + "epoch": 0.21493333333333334, + "grad_norm": 0.43891395287461854, + "learning_rate": 0.00018265168006082437, + "loss": 0.7624, + "step": 403 + }, + { + "epoch": 0.21546666666666667, + "grad_norm": 0.4237105239650998, + "learning_rate": 0.00018255428275641214, + "loss": 0.7488, + "step": 404 + }, + { + "epoch": 0.216, + "grad_norm": 0.5130046523239726, + "learning_rate": 0.00018245663893238075, + "loss": 0.7836, + "step": 405 + }, + { + "epoch": 0.21653333333333333, + "grad_norm": 0.6317362602782209, + "learning_rate": 0.0001823587488803095, + "loss": 0.8742, + "step": 406 + }, + { + "epoch": 0.21706666666666666, + "grad_norm": 0.5580357630858312, + "learning_rate": 0.00018226061289251298, + "loss": 0.738, + "step": 407 + }, + { + "epoch": 0.2176, + "grad_norm": 0.5387490413701905, + "learning_rate": 0.00018216223126204007, + "loss": 0.8758, + "step": 408 + }, + { + "epoch": 0.21813333333333335, + "grad_norm": 0.5047434117071069, + "learning_rate": 0.00018206360428267332, + "loss": 0.874, + "step": 409 + }, + { + "epoch": 0.21866666666666668, + "grad_norm": 0.5941421898634975, + "learning_rate": 0.00018196473224892784, + "loss": 0.8613, + "step": 410 + }, + { + "epoch": 0.2192, + "grad_norm": 0.5574177582023201, + "learning_rate": 0.00018186561545605054, + "loss": 0.8667, + "step": 411 + }, + { + "epoch": 0.21973333333333334, + "grad_norm": 0.5173676833274382, + "learning_rate": 0.0001817662542000192, + "loss": 0.8572, + "step": 412 + }, + { + "epoch": 0.22026666666666667, + "grad_norm": 0.4860014045365757, + "learning_rate": 0.0001816666487775416, + "loss": 0.7569, + "step": 413 + }, + { + "epoch": 0.2208, + "grad_norm": 0.6180223002740645, + "learning_rate": 0.00018156679948605467, + "loss": 0.8477, + "step": 414 + }, + { + "epoch": 0.22133333333333333, + "grad_norm": 0.5609024155241303, + "learning_rate": 0.00018146670662372354, + "loss": 0.8657, + "step": 415 + }, + { + "epoch": 0.22186666666666666, + "grad_norm": 0.47057323212256735, + "learning_rate": 0.0001813663704894407, + "loss": 0.7562, + "step": 416 + }, + { + "epoch": 0.2224, + "grad_norm": 0.4945005490935583, + "learning_rate": 0.00018126579138282503, + "loss": 0.8207, + "step": 417 + }, + { + "epoch": 0.22293333333333334, + "grad_norm": 0.6133187947117877, + "learning_rate": 0.00018116496960422107, + "loss": 0.8199, + "step": 418 + }, + { + "epoch": 0.22346666666666667, + "grad_norm": 0.6911757752607458, + "learning_rate": 0.00018106390545469795, + "loss": 0.87, + "step": 419 + }, + { + "epoch": 0.224, + "grad_norm": 0.6137929993298187, + "learning_rate": 0.0001809625992360485, + "loss": 0.837, + "step": 420 + }, + { + "epoch": 0.22453333333333333, + "grad_norm": 0.5501024759025723, + "learning_rate": 0.00018086105125078857, + "loss": 0.7912, + "step": 421 + }, + { + "epoch": 0.22506666666666666, + "grad_norm": 0.5063033271652045, + "learning_rate": 0.00018075926180215576, + "loss": 0.7805, + "step": 422 + }, + { + "epoch": 0.2256, + "grad_norm": 0.510137173717112, + "learning_rate": 0.00018065723119410884, + "loss": 0.7797, + "step": 423 + }, + { + "epoch": 0.22613333333333333, + "grad_norm": 0.6157392638015244, + "learning_rate": 0.0001805549597313267, + "loss": 0.9578, + "step": 424 + }, + { + "epoch": 0.22666666666666666, + "grad_norm": 0.622176074552984, + "learning_rate": 0.0001804524477192075, + "loss": 0.8868, + "step": 425 + }, + { + "epoch": 0.2272, + "grad_norm": 0.6176511393183507, + "learning_rate": 0.00018034969546386757, + "loss": 0.7839, + "step": 426 + }, + { + "epoch": 0.22773333333333334, + "grad_norm": 0.6022582377918664, + "learning_rate": 0.00018024670327214084, + "loss": 0.9504, + "step": 427 + }, + { + "epoch": 0.22826666666666667, + "grad_norm": 0.6117420394311651, + "learning_rate": 0.00018014347145157755, + "loss": 0.8724, + "step": 428 + }, + { + "epoch": 0.2288, + "grad_norm": 0.4613306447590751, + "learning_rate": 0.0001800400003104436, + "loss": 0.7111, + "step": 429 + }, + { + "epoch": 0.22933333333333333, + "grad_norm": 0.49274299642677505, + "learning_rate": 0.0001799362901577196, + "loss": 0.9149, + "step": 430 + }, + { + "epoch": 0.22986666666666666, + "grad_norm": 0.5144909057081801, + "learning_rate": 0.00017983234130309968, + "loss": 0.9043, + "step": 431 + }, + { + "epoch": 0.2304, + "grad_norm": 0.4574806071235767, + "learning_rate": 0.00017972815405699103, + "loss": 0.824, + "step": 432 + }, + { + "epoch": 0.23093333333333332, + "grad_norm": 0.6321660311909871, + "learning_rate": 0.00017962372873051252, + "loss": 0.9061, + "step": 433 + }, + { + "epoch": 0.23146666666666665, + "grad_norm": 0.43513411301300586, + "learning_rate": 0.00017951906563549397, + "loss": 0.7199, + "step": 434 + }, + { + "epoch": 0.232, + "grad_norm": 0.6248157856692813, + "learning_rate": 0.00017941416508447536, + "loss": 0.8101, + "step": 435 + }, + { + "epoch": 0.23253333333333334, + "grad_norm": 0.47038352398190103, + "learning_rate": 0.00017930902739070562, + "loss": 0.7837, + "step": 436 + }, + { + "epoch": 0.23306666666666667, + "grad_norm": 0.5856396926417063, + "learning_rate": 0.00017920365286814183, + "loss": 0.8927, + "step": 437 + }, + { + "epoch": 0.2336, + "grad_norm": 0.4976973531467887, + "learning_rate": 0.0001790980418314484, + "loss": 0.8334, + "step": 438 + }, + { + "epoch": 0.23413333333333333, + "grad_norm": 0.4818085061614485, + "learning_rate": 0.0001789921945959958, + "loss": 0.7827, + "step": 439 + }, + { + "epoch": 0.23466666666666666, + "grad_norm": 0.47692623522400623, + "learning_rate": 0.00017888611147786002, + "loss": 0.7826, + "step": 440 + }, + { + "epoch": 0.2352, + "grad_norm": 0.5466326203656854, + "learning_rate": 0.00017877979279382135, + "loss": 0.8243, + "step": 441 + }, + { + "epoch": 0.23573333333333332, + "grad_norm": 0.5152162988773265, + "learning_rate": 0.00017867323886136348, + "loss": 0.7567, + "step": 442 + }, + { + "epoch": 0.23626666666666668, + "grad_norm": 0.45388835284062296, + "learning_rate": 0.00017856644999867264, + "loss": 0.7695, + "step": 443 + }, + { + "epoch": 0.2368, + "grad_norm": 0.5853618309402371, + "learning_rate": 0.0001784594265246366, + "loss": 0.8208, + "step": 444 + }, + { + "epoch": 0.23733333333333334, + "grad_norm": 0.6106932999990901, + "learning_rate": 0.00017835216875884368, + "loss": 0.867, + "step": 445 + }, + { + "epoch": 0.23786666666666667, + "grad_norm": 0.4874600419982771, + "learning_rate": 0.0001782446770215819, + "loss": 0.7949, + "step": 446 + }, + { + "epoch": 0.2384, + "grad_norm": 0.4817403222328486, + "learning_rate": 0.0001781369516338378, + "loss": 0.7809, + "step": 447 + }, + { + "epoch": 0.23893333333333333, + "grad_norm": 0.4685691151727727, + "learning_rate": 0.00017802899291729585, + "loss": 0.7372, + "step": 448 + }, + { + "epoch": 0.23946666666666666, + "grad_norm": 0.6095969024234741, + "learning_rate": 0.0001779208011943371, + "loss": 0.7889, + "step": 449 + }, + { + "epoch": 0.24, + "grad_norm": 0.6301026106350284, + "learning_rate": 0.00017781237678803847, + "loss": 0.9439, + "step": 450 + }, + { + "epoch": 0.24053333333333332, + "grad_norm": 0.558599760329991, + "learning_rate": 0.00017770372002217172, + "loss": 0.8016, + "step": 451 + }, + { + "epoch": 0.24106666666666668, + "grad_norm": 0.5867696843362354, + "learning_rate": 0.00017759483122120238, + "loss": 0.9291, + "step": 452 + }, + { + "epoch": 0.2416, + "grad_norm": 0.4542271311118376, + "learning_rate": 0.000177485710710289, + "loss": 0.831, + "step": 453 + }, + { + "epoch": 0.24213333333333334, + "grad_norm": 0.48078456375330836, + "learning_rate": 0.00017737635881528196, + "loss": 0.8061, + "step": 454 + }, + { + "epoch": 0.24266666666666667, + "grad_norm": 0.5837470567030518, + "learning_rate": 0.00017726677586272263, + "loss": 0.8194, + "step": 455 + }, + { + "epoch": 0.2432, + "grad_norm": 0.590019028646285, + "learning_rate": 0.00017715696217984235, + "loss": 0.8989, + "step": 456 + }, + { + "epoch": 0.24373333333333333, + "grad_norm": 0.5201869075062091, + "learning_rate": 0.00017704691809456143, + "loss": 0.8389, + "step": 457 + }, + { + "epoch": 0.24426666666666666, + "grad_norm": 0.5456899747750916, + "learning_rate": 0.0001769366439354882, + "loss": 0.8175, + "step": 458 + }, + { + "epoch": 0.2448, + "grad_norm": 0.5379656453512448, + "learning_rate": 0.00017682614003191807, + "loss": 0.8548, + "step": 459 + }, + { + "epoch": 0.24533333333333332, + "grad_norm": 0.504279218630493, + "learning_rate": 0.00017671540671383243, + "loss": 0.8189, + "step": 460 + }, + { + "epoch": 0.24586666666666668, + "grad_norm": 0.5100218415175828, + "learning_rate": 0.0001766044443118978, + "loss": 0.7222, + "step": 461 + }, + { + "epoch": 0.2464, + "grad_norm": 0.5356827307519676, + "learning_rate": 0.00017649325315746478, + "loss": 0.9023, + "step": 462 + }, + { + "epoch": 0.24693333333333334, + "grad_norm": 0.4764855230388838, + "learning_rate": 0.00017638183358256696, + "loss": 0.8479, + "step": 463 + }, + { + "epoch": 0.24746666666666667, + "grad_norm": 0.575984961535713, + "learning_rate": 0.00017627018591992018, + "loss": 0.7998, + "step": 464 + }, + { + "epoch": 0.248, + "grad_norm": 0.5896885652530898, + "learning_rate": 0.0001761583105029213, + "loss": 0.8793, + "step": 465 + }, + { + "epoch": 0.24853333333333333, + "grad_norm": 0.4369359609408987, + "learning_rate": 0.00017604620766564723, + "loss": 0.7633, + "step": 466 + }, + { + "epoch": 0.24906666666666666, + "grad_norm": 0.6604020258812308, + "learning_rate": 0.00017593387774285412, + "loss": 1.0006, + "step": 467 + }, + { + "epoch": 0.2496, + "grad_norm": 0.6622083940456467, + "learning_rate": 0.00017582132106997616, + "loss": 0.8981, + "step": 468 + }, + { + "epoch": 0.2501333333333333, + "grad_norm": 0.6256728641179665, + "learning_rate": 0.0001757085379831246, + "loss": 0.9839, + "step": 469 + }, + { + "epoch": 0.25066666666666665, + "grad_norm": 0.6290910760399225, + "learning_rate": 0.00017559552881908695, + "loss": 0.8761, + "step": 470 + }, + { + "epoch": 0.2512, + "grad_norm": 0.5272851956074754, + "learning_rate": 0.00017548229391532572, + "loss": 0.8193, + "step": 471 + }, + { + "epoch": 0.2517333333333333, + "grad_norm": 0.46195198232319007, + "learning_rate": 0.00017536883360997743, + "loss": 0.7859, + "step": 472 + }, + { + "epoch": 0.25226666666666664, + "grad_norm": 0.520073020384154, + "learning_rate": 0.00017525514824185185, + "loss": 0.8488, + "step": 473 + }, + { + "epoch": 0.2528, + "grad_norm": 0.6085112558608898, + "learning_rate": 0.00017514123815043074, + "loss": 0.868, + "step": 474 + }, + { + "epoch": 0.25333333333333335, + "grad_norm": 0.5207737793312851, + "learning_rate": 0.00017502710367586687, + "loss": 0.8593, + "step": 475 + }, + { + "epoch": 0.2538666666666667, + "grad_norm": 0.5767838902749421, + "learning_rate": 0.0001749127451589832, + "loss": 0.7459, + "step": 476 + }, + { + "epoch": 0.2544, + "grad_norm": 0.3947273347232596, + "learning_rate": 0.00017479816294127152, + "loss": 0.7597, + "step": 477 + }, + { + "epoch": 0.25493333333333335, + "grad_norm": 0.5804785627383354, + "learning_rate": 0.00017468335736489177, + "loss": 0.8237, + "step": 478 + }, + { + "epoch": 0.2554666666666667, + "grad_norm": 0.4714559275309985, + "learning_rate": 0.00017456832877267084, + "loss": 0.7953, + "step": 479 + }, + { + "epoch": 0.256, + "grad_norm": 0.47567972325860264, + "learning_rate": 0.0001744530775081015, + "loss": 0.8037, + "step": 480 + }, + { + "epoch": 0.25653333333333334, + "grad_norm": 0.8071744068960827, + "learning_rate": 0.00017433760391534167, + "loss": 1.1113, + "step": 481 + }, + { + "epoch": 0.25706666666666667, + "grad_norm": 0.47927549431497496, + "learning_rate": 0.00017422190833921283, + "loss": 0.7565, + "step": 482 + }, + { + "epoch": 0.2576, + "grad_norm": 0.542472938443313, + "learning_rate": 0.0001741059911251997, + "loss": 0.8184, + "step": 483 + }, + { + "epoch": 0.2581333333333333, + "grad_norm": 0.7695927800463104, + "learning_rate": 0.00017398985261944856, + "loss": 0.9872, + "step": 484 + }, + { + "epoch": 0.25866666666666666, + "grad_norm": 0.6358222932380025, + "learning_rate": 0.00017387349316876666, + "loss": 0.8552, + "step": 485 + }, + { + "epoch": 0.2592, + "grad_norm": 0.5226165286628119, + "learning_rate": 0.000173756913120621, + "loss": 0.8495, + "step": 486 + }, + { + "epoch": 0.2597333333333333, + "grad_norm": 0.5529408183593187, + "learning_rate": 0.0001736401128231373, + "loss": 0.8307, + "step": 487 + }, + { + "epoch": 0.26026666666666665, + "grad_norm": 0.5398514940998124, + "learning_rate": 0.00017352309262509894, + "loss": 0.8691, + "step": 488 + }, + { + "epoch": 0.2608, + "grad_norm": 0.499866112005662, + "learning_rate": 0.00017340585287594604, + "loss": 0.8597, + "step": 489 + }, + { + "epoch": 0.2613333333333333, + "grad_norm": 0.4269871088798527, + "learning_rate": 0.0001732883939257742, + "loss": 0.7545, + "step": 490 + }, + { + "epoch": 0.2618666666666667, + "grad_norm": 0.5058226215124434, + "learning_rate": 0.0001731707161253338, + "loss": 0.8474, + "step": 491 + }, + { + "epoch": 0.2624, + "grad_norm": 0.5749674216515905, + "learning_rate": 0.0001730528198260285, + "loss": 0.8277, + "step": 492 + }, + { + "epoch": 0.26293333333333335, + "grad_norm": 0.5020270362004025, + "learning_rate": 0.00017293470537991463, + "loss": 0.8762, + "step": 493 + }, + { + "epoch": 0.2634666666666667, + "grad_norm": 0.5750060859703835, + "learning_rate": 0.00017281637313969978, + "loss": 0.854, + "step": 494 + }, + { + "epoch": 0.264, + "grad_norm": 0.544859109216088, + "learning_rate": 0.00017269782345874203, + "loss": 0.8573, + "step": 495 + }, + { + "epoch": 0.26453333333333334, + "grad_norm": 0.42039803428169764, + "learning_rate": 0.00017257905669104874, + "loss": 0.7308, + "step": 496 + }, + { + "epoch": 0.2650666666666667, + "grad_norm": 0.514802162289825, + "learning_rate": 0.00017246007319127545, + "loss": 0.688, + "step": 497 + }, + { + "epoch": 0.2656, + "grad_norm": 0.5132339873084991, + "learning_rate": 0.00017234087331472497, + "loss": 0.7364, + "step": 498 + }, + { + "epoch": 0.26613333333333333, + "grad_norm": 0.6706884030316487, + "learning_rate": 0.00017222145741734626, + "loss": 0.8348, + "step": 499 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 0.5781843065650732, + "learning_rate": 0.00017210182585573327, + "loss": 0.8212, + "step": 500 + }, + { + "epoch": 0.2672, + "grad_norm": 0.5275680410726457, + "learning_rate": 0.00017198197898712404, + "loss": 0.8587, + "step": 501 + }, + { + "epoch": 0.2677333333333333, + "grad_norm": 0.6370838075439973, + "learning_rate": 0.00017186191716939944, + "loss": 0.9092, + "step": 502 + }, + { + "epoch": 0.26826666666666665, + "grad_norm": 0.5015890105789446, + "learning_rate": 0.0001717416407610824, + "loss": 0.8558, + "step": 503 + }, + { + "epoch": 0.2688, + "grad_norm": 0.4962255759207231, + "learning_rate": 0.00017162115012133643, + "loss": 0.8169, + "step": 504 + }, + { + "epoch": 0.2693333333333333, + "grad_norm": 0.4629324862616736, + "learning_rate": 0.00017150044560996488, + "loss": 0.7627, + "step": 505 + }, + { + "epoch": 0.26986666666666664, + "grad_norm": 0.4790462376610464, + "learning_rate": 0.00017137952758740978, + "loss": 0.7806, + "step": 506 + }, + { + "epoch": 0.2704, + "grad_norm": 0.4589193456933277, + "learning_rate": 0.00017125839641475072, + "loss": 0.8131, + "step": 507 + }, + { + "epoch": 0.27093333333333336, + "grad_norm": 0.52371605378381, + "learning_rate": 0.00017113705245370368, + "loss": 0.8487, + "step": 508 + }, + { + "epoch": 0.2714666666666667, + "grad_norm": 0.6854660099008522, + "learning_rate": 0.00017101549606662024, + "loss": 1.0226, + "step": 509 + }, + { + "epoch": 0.272, + "grad_norm": 0.58934966996955, + "learning_rate": 0.00017089372761648616, + "loss": 0.9247, + "step": 510 + }, + { + "epoch": 0.27253333333333335, + "grad_norm": 0.5402264313064337, + "learning_rate": 0.00017077174746692056, + "loss": 0.8788, + "step": 511 + }, + { + "epoch": 0.2730666666666667, + "grad_norm": 0.47957437403783126, + "learning_rate": 0.00017064955598217462, + "loss": 0.8203, + "step": 512 + }, + { + "epoch": 0.2736, + "grad_norm": 0.44709841455364296, + "learning_rate": 0.00017052715352713075, + "loss": 0.7414, + "step": 513 + }, + { + "epoch": 0.27413333333333334, + "grad_norm": 0.5332362597845244, + "learning_rate": 0.00017040454046730115, + "loss": 0.8195, + "step": 514 + }, + { + "epoch": 0.27466666666666667, + "grad_norm": 0.5904914584094229, + "learning_rate": 0.00017028171716882714, + "loss": 0.9734, + "step": 515 + }, + { + "epoch": 0.2752, + "grad_norm": 0.5342974518753195, + "learning_rate": 0.00017015868399847768, + "loss": 0.8036, + "step": 516 + }, + { + "epoch": 0.27573333333333333, + "grad_norm": 0.6386296051221013, + "learning_rate": 0.00017003544132364846, + "loss": 0.8332, + "step": 517 + }, + { + "epoch": 0.27626666666666666, + "grad_norm": 0.5264806057403153, + "learning_rate": 0.00016991198951236088, + "loss": 0.8166, + "step": 518 + }, + { + "epoch": 0.2768, + "grad_norm": 0.6546698223767454, + "learning_rate": 0.00016978832893326074, + "loss": 0.8983, + "step": 519 + }, + { + "epoch": 0.2773333333333333, + "grad_norm": 0.5634840483102529, + "learning_rate": 0.00016966445995561727, + "loss": 0.8522, + "step": 520 + }, + { + "epoch": 0.27786666666666665, + "grad_norm": 0.4733268974825999, + "learning_rate": 0.00016954038294932216, + "loss": 0.7678, + "step": 521 + }, + { + "epoch": 0.2784, + "grad_norm": 0.5655096983871343, + "learning_rate": 0.00016941609828488807, + "loss": 0.7449, + "step": 522 + }, + { + "epoch": 0.2789333333333333, + "grad_norm": 0.47868263433954594, + "learning_rate": 0.0001692916063334479, + "loss": 0.8203, + "step": 523 + }, + { + "epoch": 0.27946666666666664, + "grad_norm": 0.5266609333720934, + "learning_rate": 0.0001691669074667535, + "loss": 0.82, + "step": 524 + }, + { + "epoch": 0.28, + "grad_norm": 0.6157752040080987, + "learning_rate": 0.0001690420020571747, + "loss": 0.7982, + "step": 525 + }, + { + "epoch": 0.28053333333333336, + "grad_norm": 0.559279468125684, + "learning_rate": 0.0001689168904776979, + "loss": 0.7989, + "step": 526 + }, + { + "epoch": 0.2810666666666667, + "grad_norm": 0.5360024942522439, + "learning_rate": 0.00016879157310192535, + "loss": 0.7937, + "step": 527 + }, + { + "epoch": 0.2816, + "grad_norm": 0.5096612921083854, + "learning_rate": 0.0001686660503040737, + "loss": 0.7727, + "step": 528 + }, + { + "epoch": 0.28213333333333335, + "grad_norm": 0.8784262818664934, + "learning_rate": 0.00016854032245897308, + "loss": 0.9953, + "step": 529 + }, + { + "epoch": 0.2826666666666667, + "grad_norm": 0.6994689219107383, + "learning_rate": 0.00016841438994206595, + "loss": 0.9399, + "step": 530 + }, + { + "epoch": 0.2832, + "grad_norm": 0.46459927394998424, + "learning_rate": 0.00016828825312940592, + "loss": 0.7893, + "step": 531 + }, + { + "epoch": 0.28373333333333334, + "grad_norm": 0.5935554208834619, + "learning_rate": 0.00016816191239765667, + "loss": 0.896, + "step": 532 + }, + { + "epoch": 0.28426666666666667, + "grad_norm": 0.4270487738727027, + "learning_rate": 0.00016803536812409075, + "loss": 0.7317, + "step": 533 + }, + { + "epoch": 0.2848, + "grad_norm": 0.4996296503595494, + "learning_rate": 0.0001679086206865886, + "loss": 0.8214, + "step": 534 + }, + { + "epoch": 0.2853333333333333, + "grad_norm": 0.5982774004385281, + "learning_rate": 0.00016778167046363734, + "loss": 0.9189, + "step": 535 + }, + { + "epoch": 0.28586666666666666, + "grad_norm": 0.4706957694534351, + "learning_rate": 0.00016765451783432953, + "loss": 0.8101, + "step": 536 + }, + { + "epoch": 0.2864, + "grad_norm": 0.549213850147289, + "learning_rate": 0.00016752716317836229, + "loss": 0.8732, + "step": 537 + }, + { + "epoch": 0.2869333333333333, + "grad_norm": 0.5857748532390733, + "learning_rate": 0.0001673996068760359, + "loss": 0.7735, + "step": 538 + }, + { + "epoch": 0.28746666666666665, + "grad_norm": 0.5086801122280594, + "learning_rate": 0.00016727184930825288, + "loss": 0.7906, + "step": 539 + }, + { + "epoch": 0.288, + "grad_norm": 0.5138457768924369, + "learning_rate": 0.0001671438908565167, + "loss": 0.8575, + "step": 540 + }, + { + "epoch": 0.2885333333333333, + "grad_norm": 0.5113291229424737, + "learning_rate": 0.00016701573190293077, + "loss": 0.8191, + "step": 541 + }, + { + "epoch": 0.2890666666666667, + "grad_norm": 0.5520767593214518, + "learning_rate": 0.00016688737283019706, + "loss": 0.8266, + "step": 542 + }, + { + "epoch": 0.2896, + "grad_norm": 0.47561911014972424, + "learning_rate": 0.00016675881402161536, + "loss": 0.7163, + "step": 543 + }, + { + "epoch": 0.29013333333333335, + "grad_norm": 0.5824368385480034, + "learning_rate": 0.00016663005586108176, + "loss": 0.9457, + "step": 544 + }, + { + "epoch": 0.2906666666666667, + "grad_norm": 0.5586463030276689, + "learning_rate": 0.00016650109873308765, + "loss": 0.8331, + "step": 545 + }, + { + "epoch": 0.2912, + "grad_norm": 0.4638270088383452, + "learning_rate": 0.0001663719430227186, + "loss": 0.8051, + "step": 546 + }, + { + "epoch": 0.29173333333333334, + "grad_norm": 0.5286869926090003, + "learning_rate": 0.0001662425891156531, + "loss": 0.8007, + "step": 547 + }, + { + "epoch": 0.2922666666666667, + "grad_norm": 0.4357724563674736, + "learning_rate": 0.00016611303739816168, + "loss": 0.7317, + "step": 548 + }, + { + "epoch": 0.2928, + "grad_norm": 0.37164510925752114, + "learning_rate": 0.00016598328825710533, + "loss": 0.7168, + "step": 549 + }, + { + "epoch": 0.29333333333333333, + "grad_norm": 0.48288239001177796, + "learning_rate": 0.00016585334207993476, + "loss": 0.801, + "step": 550 + }, + { + "epoch": 0.29386666666666666, + "grad_norm": 0.5043367237233689, + "learning_rate": 0.00016572319925468892, + "loss": 0.7983, + "step": 551 + }, + { + "epoch": 0.2944, + "grad_norm": 0.5419864242698199, + "learning_rate": 0.000165592860169994, + "loss": 0.8818, + "step": 552 + }, + { + "epoch": 0.2949333333333333, + "grad_norm": 0.43418777180724477, + "learning_rate": 0.0001654623252150624, + "loss": 0.7404, + "step": 553 + }, + { + "epoch": 0.29546666666666666, + "grad_norm": 0.5173208460101304, + "learning_rate": 0.00016533159477969122, + "loss": 0.8224, + "step": 554 + }, + { + "epoch": 0.296, + "grad_norm": 0.5504684985819356, + "learning_rate": 0.00016520066925426144, + "loss": 0.7496, + "step": 555 + }, + { + "epoch": 0.2965333333333333, + "grad_norm": 0.42604835139136, + "learning_rate": 0.00016506954902973655, + "loss": 0.8319, + "step": 556 + }, + { + "epoch": 0.29706666666666665, + "grad_norm": 0.47856812166759943, + "learning_rate": 0.00016493823449766136, + "loss": 0.7996, + "step": 557 + }, + { + "epoch": 0.2976, + "grad_norm": 0.5218824605992826, + "learning_rate": 0.0001648067260501611, + "loss": 0.7575, + "step": 558 + }, + { + "epoch": 0.2981333333333333, + "grad_norm": 0.7257298399405144, + "learning_rate": 0.00016467502407993992, + "loss": 0.8362, + "step": 559 + }, + { + "epoch": 0.2986666666666667, + "grad_norm": 0.5599836703147796, + "learning_rate": 0.0001645431289802799, + "loss": 0.8223, + "step": 560 + }, + { + "epoch": 0.2992, + "grad_norm": 0.5962035431992888, + "learning_rate": 0.0001644110411450398, + "loss": 0.7909, + "step": 561 + }, + { + "epoch": 0.29973333333333335, + "grad_norm": 0.5824597748075987, + "learning_rate": 0.00016427876096865394, + "loss": 0.8891, + "step": 562 + }, + { + "epoch": 0.3002666666666667, + "grad_norm": 0.4907512680751975, + "learning_rate": 0.00016414628884613107, + "loss": 0.7922, + "step": 563 + }, + { + "epoch": 0.3008, + "grad_norm": 0.6603540202315609, + "learning_rate": 0.00016401362517305296, + "loss": 0.9001, + "step": 564 + }, + { + "epoch": 0.30133333333333334, + "grad_norm": 0.47200466542824876, + "learning_rate": 0.00016388077034557355, + "loss": 0.7159, + "step": 565 + }, + { + "epoch": 0.30186666666666667, + "grad_norm": 0.6394882361961985, + "learning_rate": 0.00016374772476041748, + "loss": 0.8674, + "step": 566 + }, + { + "epoch": 0.3024, + "grad_norm": 0.5228457915772745, + "learning_rate": 0.00016361448881487914, + "loss": 0.833, + "step": 567 + }, + { + "epoch": 0.30293333333333333, + "grad_norm": 0.49909695192505343, + "learning_rate": 0.00016348106290682118, + "loss": 0.8456, + "step": 568 + }, + { + "epoch": 0.30346666666666666, + "grad_norm": 0.5084880904551904, + "learning_rate": 0.00016334744743467364, + "loss": 0.7277, + "step": 569 + }, + { + "epoch": 0.304, + "grad_norm": 0.4544296233646207, + "learning_rate": 0.00016321364279743266, + "loss": 0.8133, + "step": 570 + }, + { + "epoch": 0.3045333333333333, + "grad_norm": 0.5851864836675709, + "learning_rate": 0.00016307964939465914, + "loss": 0.8358, + "step": 571 + }, + { + "epoch": 0.30506666666666665, + "grad_norm": 0.5763269464522621, + "learning_rate": 0.00016294546762647775, + "loss": 0.8423, + "step": 572 + }, + { + "epoch": 0.3056, + "grad_norm": 0.4996570666145586, + "learning_rate": 0.0001628110978935756, + "loss": 0.7754, + "step": 573 + }, + { + "epoch": 0.3061333333333333, + "grad_norm": 0.5984740080839147, + "learning_rate": 0.0001626765405972011, + "loss": 0.9079, + "step": 574 + }, + { + "epoch": 0.30666666666666664, + "grad_norm": 0.5767114718560038, + "learning_rate": 0.00016254179613916278, + "loss": 0.8105, + "step": 575 + }, + { + "epoch": 0.3072, + "grad_norm": 0.4912622950915509, + "learning_rate": 0.00016240686492182804, + "loss": 0.8452, + "step": 576 + }, + { + "epoch": 0.30773333333333336, + "grad_norm": 0.7038046059310531, + "learning_rate": 0.000162271747348122, + "loss": 0.8286, + "step": 577 + }, + { + "epoch": 0.3082666666666667, + "grad_norm": 0.7134046759703173, + "learning_rate": 0.0001621364438215262, + "loss": 1.0264, + "step": 578 + }, + { + "epoch": 0.3088, + "grad_norm": 0.49404490870992984, + "learning_rate": 0.00016200095474607753, + "loss": 0.7835, + "step": 579 + }, + { + "epoch": 0.30933333333333335, + "grad_norm": 0.623471111912179, + "learning_rate": 0.00016186528052636692, + "loss": 0.8711, + "step": 580 + }, + { + "epoch": 0.3098666666666667, + "grad_norm": 0.4804767904049977, + "learning_rate": 0.0001617294215675382, + "loss": 0.8387, + "step": 581 + }, + { + "epoch": 0.3104, + "grad_norm": 0.46761157040793505, + "learning_rate": 0.00016159337827528685, + "loss": 0.7845, + "step": 582 + }, + { + "epoch": 0.31093333333333334, + "grad_norm": 0.7806940162867643, + "learning_rate": 0.0001614571510558588, + "loss": 0.992, + "step": 583 + }, + { + "epoch": 0.31146666666666667, + "grad_norm": 0.4311619702018419, + "learning_rate": 0.00016132074031604917, + "loss": 0.8103, + "step": 584 + }, + { + "epoch": 0.312, + "grad_norm": 0.4994566659105595, + "learning_rate": 0.0001611841464632011, + "loss": 0.8189, + "step": 585 + }, + { + "epoch": 0.31253333333333333, + "grad_norm": 0.5846419690789595, + "learning_rate": 0.00016104736990520468, + "loss": 0.8395, + "step": 586 + }, + { + "epoch": 0.31306666666666666, + "grad_norm": 0.5866380456067939, + "learning_rate": 0.0001609104110504954, + "loss": 0.9201, + "step": 587 + }, + { + "epoch": 0.3136, + "grad_norm": 0.4853202162020479, + "learning_rate": 0.0001607732703080532, + "loss": 0.7618, + "step": 588 + }, + { + "epoch": 0.3141333333333333, + "grad_norm": 0.7194557992744496, + "learning_rate": 0.00016063594808740113, + "loss": 0.9393, + "step": 589 + }, + { + "epoch": 0.31466666666666665, + "grad_norm": 0.5960008271004776, + "learning_rate": 0.00016049844479860422, + "loss": 0.9787, + "step": 590 + }, + { + "epoch": 0.3152, + "grad_norm": 0.6403500254416795, + "learning_rate": 0.00016036076085226814, + "loss": 0.9428, + "step": 591 + }, + { + "epoch": 0.3157333333333333, + "grad_norm": 0.4931262065041592, + "learning_rate": 0.00016022289665953808, + "loss": 0.7831, + "step": 592 + }, + { + "epoch": 0.31626666666666664, + "grad_norm": 0.6453050711874272, + "learning_rate": 0.00016008485263209742, + "loss": 0.9162, + "step": 593 + }, + { + "epoch": 0.3168, + "grad_norm": 0.6674266833511433, + "learning_rate": 0.0001599466291821666, + "loss": 0.954, + "step": 594 + }, + { + "epoch": 0.31733333333333336, + "grad_norm": 0.46995150980533834, + "learning_rate": 0.0001598082267225018, + "loss": 0.7899, + "step": 595 + }, + { + "epoch": 0.3178666666666667, + "grad_norm": 0.480919148513137, + "learning_rate": 0.0001596696456663938, + "loss": 0.7898, + "step": 596 + }, + { + "epoch": 0.3184, + "grad_norm": 0.48233714744879935, + "learning_rate": 0.0001595308864276666, + "loss": 0.768, + "step": 597 + }, + { + "epoch": 0.31893333333333335, + "grad_norm": 0.5314895272602383, + "learning_rate": 0.00015939194942067646, + "loss": 0.8241, + "step": 598 + }, + { + "epoch": 0.3194666666666667, + "grad_norm": 0.549652819798972, + "learning_rate": 0.0001592528350603103, + "loss": 0.9037, + "step": 599 + }, + { + "epoch": 0.32, + "grad_norm": 0.4619070622568058, + "learning_rate": 0.0001591135437619847, + "loss": 0.8204, + "step": 600 + }, + { + "epoch": 0.32053333333333334, + "grad_norm": 0.5432509501817164, + "learning_rate": 0.00015897407594164467, + "loss": 0.7856, + "step": 601 + }, + { + "epoch": 0.32106666666666667, + "grad_norm": 0.5875370509312653, + "learning_rate": 0.00015883443201576225, + "loss": 0.8048, + "step": 602 + }, + { + "epoch": 0.3216, + "grad_norm": 0.4639712654536695, + "learning_rate": 0.0001586946124013354, + "loss": 0.7795, + "step": 603 + }, + { + "epoch": 0.3221333333333333, + "grad_norm": 0.4580442349982068, + "learning_rate": 0.00015855461751588677, + "loss": 0.7516, + "step": 604 + }, + { + "epoch": 0.32266666666666666, + "grad_norm": 0.4346865408481033, + "learning_rate": 0.0001584144477774623, + "loss": 0.7591, + "step": 605 + }, + { + "epoch": 0.3232, + "grad_norm": 0.494760556314188, + "learning_rate": 0.0001582741036046301, + "loss": 0.7833, + "step": 606 + }, + { + "epoch": 0.3237333333333333, + "grad_norm": 0.528352204234871, + "learning_rate": 0.00015813358541647915, + "loss": 0.8288, + "step": 607 + }, + { + "epoch": 0.32426666666666665, + "grad_norm": 0.6492585774807074, + "learning_rate": 0.00015799289363261813, + "loss": 0.9012, + "step": 608 + }, + { + "epoch": 0.3248, + "grad_norm": 0.5240550560844716, + "learning_rate": 0.00015785202867317407, + "loss": 0.8712, + "step": 609 + }, + { + "epoch": 0.3253333333333333, + "grad_norm": 0.5488479871941387, + "learning_rate": 0.00015771099095879108, + "loss": 0.8995, + "step": 610 + }, + { + "epoch": 0.3258666666666667, + "grad_norm": 0.5192865550604814, + "learning_rate": 0.0001575697809106292, + "loss": 0.8374, + "step": 611 + }, + { + "epoch": 0.3264, + "grad_norm": 0.5529828711217839, + "learning_rate": 0.00015742839895036305, + "loss": 0.8902, + "step": 612 + }, + { + "epoch": 0.32693333333333335, + "grad_norm": 0.4971428712265958, + "learning_rate": 0.00015728684550018064, + "loss": 0.8137, + "step": 613 + }, + { + "epoch": 0.3274666666666667, + "grad_norm": 0.5374827282262518, + "learning_rate": 0.0001571451209827821, + "loss": 0.798, + "step": 614 + }, + { + "epoch": 0.328, + "grad_norm": 0.49224084092290105, + "learning_rate": 0.00015700322582137827, + "loss": 0.7794, + "step": 615 + }, + { + "epoch": 0.32853333333333334, + "grad_norm": 0.48859078374389414, + "learning_rate": 0.00015686116043968972, + "loss": 0.8465, + "step": 616 + }, + { + "epoch": 0.3290666666666667, + "grad_norm": 0.58071595013088, + "learning_rate": 0.00015671892526194516, + "loss": 0.8189, + "step": 617 + }, + { + "epoch": 0.3296, + "grad_norm": 0.5070498492857128, + "learning_rate": 0.0001565765207128805, + "loss": 0.8166, + "step": 618 + }, + { + "epoch": 0.33013333333333333, + "grad_norm": 0.45243401011492024, + "learning_rate": 0.0001564339472177373, + "loss": 0.744, + "step": 619 + }, + { + "epoch": 0.33066666666666666, + "grad_norm": 0.4548224071388063, + "learning_rate": 0.00015629120520226165, + "loss": 0.7771, + "step": 620 + }, + { + "epoch": 0.3312, + "grad_norm": 0.6218642511004636, + "learning_rate": 0.0001561482950927029, + "loss": 0.9506, + "step": 621 + }, + { + "epoch": 0.3317333333333333, + "grad_norm": 0.5871105962110491, + "learning_rate": 0.0001560052173158123, + "loss": 0.8571, + "step": 622 + }, + { + "epoch": 0.33226666666666665, + "grad_norm": 0.4502345296443168, + "learning_rate": 0.00015586197229884184, + "loss": 0.6505, + "step": 623 + }, + { + "epoch": 0.3328, + "grad_norm": 0.5426367653877069, + "learning_rate": 0.00015571856046954285, + "loss": 0.8341, + "step": 624 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.5270219482527734, + "learning_rate": 0.00015557498225616487, + "loss": 0.8399, + "step": 625 + }, + { + "epoch": 0.33386666666666664, + "grad_norm": 0.45051557886044796, + "learning_rate": 0.0001554312380874542, + "loss": 0.7813, + "step": 626 + }, + { + "epoch": 0.3344, + "grad_norm": 0.4529058004894344, + "learning_rate": 0.00015528732839265272, + "loss": 0.7491, + "step": 627 + }, + { + "epoch": 0.33493333333333336, + "grad_norm": 0.4491210952944165, + "learning_rate": 0.00015514325360149668, + "loss": 0.6935, + "step": 628 + }, + { + "epoch": 0.3354666666666667, + "grad_norm": 0.47190486222078226, + "learning_rate": 0.0001549990141442153, + "loss": 0.8135, + "step": 629 + }, + { + "epoch": 0.336, + "grad_norm": 0.5297070644313344, + "learning_rate": 0.0001548546104515294, + "loss": 0.7777, + "step": 630 + }, + { + "epoch": 0.33653333333333335, + "grad_norm": 0.4681988968090716, + "learning_rate": 0.00015471004295465035, + "loss": 0.7446, + "step": 631 + }, + { + "epoch": 0.3370666666666667, + "grad_norm": 0.43319075533756046, + "learning_rate": 0.0001545653120852787, + "loss": 0.7399, + "step": 632 + }, + { + "epoch": 0.3376, + "grad_norm": 0.5434146752151499, + "learning_rate": 0.00015442041827560274, + "loss": 0.8054, + "step": 633 + }, + { + "epoch": 0.33813333333333334, + "grad_norm": 0.563507787244446, + "learning_rate": 0.00015427536195829742, + "loss": 0.8432, + "step": 634 + }, + { + "epoch": 0.33866666666666667, + "grad_norm": 0.4467567514677332, + "learning_rate": 0.00015413014356652286, + "loss": 0.7761, + "step": 635 + }, + { + "epoch": 0.3392, + "grad_norm": 0.7510957755652906, + "learning_rate": 0.00015398476353392323, + "loss": 0.9545, + "step": 636 + }, + { + "epoch": 0.33973333333333333, + "grad_norm": 0.5212395372222507, + "learning_rate": 0.00015383922229462549, + "loss": 0.7712, + "step": 637 + }, + { + "epoch": 0.34026666666666666, + "grad_norm": 0.5480976245264051, + "learning_rate": 0.00015369352028323774, + "loss": 0.8605, + "step": 638 + }, + { + "epoch": 0.3408, + "grad_norm": 0.4891950203200895, + "learning_rate": 0.00015354765793484834, + "loss": 0.8298, + "step": 639 + }, + { + "epoch": 0.3413333333333333, + "grad_norm": 0.6464062265699964, + "learning_rate": 0.0001534016356850244, + "loss": 0.9124, + "step": 640 + }, + { + "epoch": 0.34186666666666665, + "grad_norm": 0.6372004048385579, + "learning_rate": 0.0001532554539698105, + "loss": 0.8321, + "step": 641 + }, + { + "epoch": 0.3424, + "grad_norm": 0.6949770307779344, + "learning_rate": 0.00015310911322572753, + "loss": 0.8713, + "step": 642 + }, + { + "epoch": 0.3429333333333333, + "grad_norm": 0.5385662142958166, + "learning_rate": 0.00015296261388977108, + "loss": 0.7753, + "step": 643 + }, + { + "epoch": 0.34346666666666664, + "grad_norm": 0.4790502869005614, + "learning_rate": 0.0001528159563994104, + "loss": 0.6772, + "step": 644 + }, + { + "epoch": 0.344, + "grad_norm": 0.47222086088374976, + "learning_rate": 0.000152669141192587, + "loss": 0.7664, + "step": 645 + }, + { + "epoch": 0.34453333333333336, + "grad_norm": 0.5504109345744241, + "learning_rate": 0.00015252216870771345, + "loss": 0.7948, + "step": 646 + }, + { + "epoch": 0.3450666666666667, + "grad_norm": 0.555248741312122, + "learning_rate": 0.00015237503938367186, + "loss": 0.8657, + "step": 647 + }, + { + "epoch": 0.3456, + "grad_norm": 0.7073240775226617, + "learning_rate": 0.00015222775365981273, + "loss": 0.9117, + "step": 648 + }, + { + "epoch": 0.34613333333333335, + "grad_norm": 0.4173185557703972, + "learning_rate": 0.00015208031197595356, + "loss": 0.738, + "step": 649 + }, + { + "epoch": 0.3466666666666667, + "grad_norm": 0.5281107008645449, + "learning_rate": 0.0001519327147723776, + "loss": 0.8222, + "step": 650 + }, + { + "epoch": 0.3472, + "grad_norm": 0.45267513821679595, + "learning_rate": 0.00015178496248983254, + "loss": 0.7409, + "step": 651 + }, + { + "epoch": 0.34773333333333334, + "grad_norm": 0.5664719511836385, + "learning_rate": 0.0001516370555695291, + "loss": 0.8596, + "step": 652 + }, + { + "epoch": 0.34826666666666667, + "grad_norm": 0.5214844665016414, + "learning_rate": 0.00015148899445313981, + "loss": 0.7568, + "step": 653 + }, + { + "epoch": 0.3488, + "grad_norm": 1.0152205592076065, + "learning_rate": 0.00015134077958279765, + "loss": 0.8269, + "step": 654 + }, + { + "epoch": 0.34933333333333333, + "grad_norm": 0.4519594191860944, + "learning_rate": 0.00015119241140109467, + "loss": 0.8106, + "step": 655 + }, + { + "epoch": 0.34986666666666666, + "grad_norm": 0.6341795199762441, + "learning_rate": 0.00015104389035108077, + "loss": 0.7695, + "step": 656 + }, + { + "epoch": 0.3504, + "grad_norm": 0.4476920087048747, + "learning_rate": 0.00015089521687626243, + "loss": 0.8549, + "step": 657 + }, + { + "epoch": 0.3509333333333333, + "grad_norm": 0.7878868782854666, + "learning_rate": 0.0001507463914206012, + "loss": 0.9339, + "step": 658 + }, + { + "epoch": 0.35146666666666665, + "grad_norm": 0.5918566999530537, + "learning_rate": 0.0001505974144285124, + "loss": 0.8271, + "step": 659 + }, + { + "epoch": 0.352, + "grad_norm": 0.5530858544705913, + "learning_rate": 0.000150448286344864, + "loss": 0.8463, + "step": 660 + }, + { + "epoch": 0.3525333333333333, + "grad_norm": 0.509467346093617, + "learning_rate": 0.00015029900761497506, + "loss": 0.8852, + "step": 661 + }, + { + "epoch": 0.35306666666666664, + "grad_norm": 0.4810367462976718, + "learning_rate": 0.00015014957868461458, + "loss": 0.775, + "step": 662 + }, + { + "epoch": 0.3536, + "grad_norm": 0.555117207802824, + "learning_rate": 0.00015000000000000001, + "loss": 0.9463, + "step": 663 + }, + { + "epoch": 0.35413333333333336, + "grad_norm": 0.5475186506627697, + "learning_rate": 0.000149850272007796, + "loss": 0.8014, + "step": 664 + }, + { + "epoch": 0.3546666666666667, + "grad_norm": 0.549780187963014, + "learning_rate": 0.00014970039515511304, + "loss": 0.9234, + "step": 665 + }, + { + "epoch": 0.3552, + "grad_norm": 0.6521370981065189, + "learning_rate": 0.00014955036988950618, + "loss": 0.9162, + "step": 666 + }, + { + "epoch": 0.35573333333333335, + "grad_norm": 0.4915534290163208, + "learning_rate": 0.0001494001966589736, + "loss": 0.7963, + "step": 667 + }, + { + "epoch": 0.3562666666666667, + "grad_norm": 0.5822674771779652, + "learning_rate": 0.00014924987591195547, + "loss": 0.8058, + "step": 668 + }, + { + "epoch": 0.3568, + "grad_norm": 0.4050047986048172, + "learning_rate": 0.00014909940809733222, + "loss": 0.6743, + "step": 669 + }, + { + "epoch": 0.35733333333333334, + "grad_norm": 0.48441273420916153, + "learning_rate": 0.0001489487936644237, + "loss": 0.8787, + "step": 670 + }, + { + "epoch": 0.35786666666666667, + "grad_norm": 0.453001655086232, + "learning_rate": 0.00014879803306298736, + "loss": 0.7646, + "step": 671 + }, + { + "epoch": 0.3584, + "grad_norm": 0.6405269452110277, + "learning_rate": 0.00014864712674321734, + "loss": 0.875, + "step": 672 + }, + { + "epoch": 0.3589333333333333, + "grad_norm": 0.544293763208351, + "learning_rate": 0.00014849607515574276, + "loss": 0.8141, + "step": 673 + }, + { + "epoch": 0.35946666666666666, + "grad_norm": 0.5289104399576442, + "learning_rate": 0.00014834487875162657, + "loss": 0.8627, + "step": 674 + }, + { + "epoch": 0.36, + "grad_norm": 0.4858671702738135, + "learning_rate": 0.00014819353798236427, + "loss": 0.7289, + "step": 675 + }, + { + "epoch": 0.3605333333333333, + "grad_norm": 0.46105750944172125, + "learning_rate": 0.00014804205329988225, + "loss": 0.7136, + "step": 676 + }, + { + "epoch": 0.36106666666666665, + "grad_norm": 0.443082210947998, + "learning_rate": 0.00014789042515653687, + "loss": 0.7249, + "step": 677 + }, + { + "epoch": 0.3616, + "grad_norm": 0.47418210192702337, + "learning_rate": 0.00014773865400511272, + "loss": 0.7509, + "step": 678 + }, + { + "epoch": 0.3621333333333333, + "grad_norm": 0.45300590502898413, + "learning_rate": 0.00014758674029882152, + "loss": 0.7828, + "step": 679 + }, + { + "epoch": 0.3626666666666667, + "grad_norm": 0.5426220757134201, + "learning_rate": 0.00014743468449130063, + "loss": 0.8303, + "step": 680 + }, + { + "epoch": 0.3632, + "grad_norm": 0.4561823438528258, + "learning_rate": 0.00014728248703661182, + "loss": 0.7711, + "step": 681 + }, + { + "epoch": 0.36373333333333335, + "grad_norm": 0.5199329649530187, + "learning_rate": 0.00014713014838923976, + "loss": 0.7742, + "step": 682 + }, + { + "epoch": 0.3642666666666667, + "grad_norm": 0.5638152597453153, + "learning_rate": 0.00014697766900409074, + "loss": 0.8875, + "step": 683 + }, + { + "epoch": 0.3648, + "grad_norm": 0.5063226354788936, + "learning_rate": 0.00014682504933649144, + "loss": 0.6814, + "step": 684 + }, + { + "epoch": 0.36533333333333334, + "grad_norm": 0.5242501097120728, + "learning_rate": 0.0001466722898421873, + "loss": 0.8227, + "step": 685 + }, + { + "epoch": 0.3658666666666667, + "grad_norm": 0.5923674137806112, + "learning_rate": 0.0001465193909773413, + "loss": 0.8095, + "step": 686 + }, + { + "epoch": 0.3664, + "grad_norm": 0.5214849178482943, + "learning_rate": 0.00014636635319853275, + "loss": 0.8084, + "step": 687 + }, + { + "epoch": 0.36693333333333333, + "grad_norm": 0.5990569040221414, + "learning_rate": 0.00014621317696275564, + "loss": 0.7951, + "step": 688 + }, + { + "epoch": 0.36746666666666666, + "grad_norm": 0.5168001235866095, + "learning_rate": 0.00014605986272741748, + "loss": 0.8297, + "step": 689 + }, + { + "epoch": 0.368, + "grad_norm": 0.835047444638174, + "learning_rate": 0.00014590641095033787, + "loss": 0.7869, + "step": 690 + }, + { + "epoch": 0.3685333333333333, + "grad_norm": 0.5823636977054464, + "learning_rate": 0.00014575282208974702, + "loss": 0.8818, + "step": 691 + }, + { + "epoch": 0.36906666666666665, + "grad_norm": 0.44554725452601684, + "learning_rate": 0.00014559909660428468, + "loss": 0.728, + "step": 692 + }, + { + "epoch": 0.3696, + "grad_norm": 0.5020753127466948, + "learning_rate": 0.00014544523495299842, + "loss": 0.7315, + "step": 693 + }, + { + "epoch": 0.3701333333333333, + "grad_norm": 0.5704474888880798, + "learning_rate": 0.00014529123759534255, + "loss": 0.7228, + "step": 694 + }, + { + "epoch": 0.37066666666666664, + "grad_norm": 0.5384520328041145, + "learning_rate": 0.00014513710499117647, + "loss": 0.9527, + "step": 695 + }, + { + "epoch": 0.3712, + "grad_norm": 0.394233650703157, + "learning_rate": 0.0001449828376007636, + "loss": 0.7055, + "step": 696 + }, + { + "epoch": 0.37173333333333336, + "grad_norm": 0.47824908759666046, + "learning_rate": 0.00014482843588476974, + "loss": 0.7723, + "step": 697 + }, + { + "epoch": 0.3722666666666667, + "grad_norm": 0.5235905145789982, + "learning_rate": 0.00014467390030426186, + "loss": 0.843, + "step": 698 + }, + { + "epoch": 0.3728, + "grad_norm": 0.5732810130133138, + "learning_rate": 0.0001445192313207067, + "loss": 0.9212, + "step": 699 + }, + { + "epoch": 0.37333333333333335, + "grad_norm": 0.4244310157039686, + "learning_rate": 0.0001443644293959693, + "loss": 0.739, + "step": 700 + }, + { + "epoch": 0.3738666666666667, + "grad_norm": 0.5124127796057848, + "learning_rate": 0.00014420949499231172, + "loss": 0.761, + "step": 701 + }, + { + "epoch": 0.3744, + "grad_norm": 0.5486393317506724, + "learning_rate": 0.0001440544285723915, + "loss": 0.8438, + "step": 702 + }, + { + "epoch": 0.37493333333333334, + "grad_norm": 0.6586646039222367, + "learning_rate": 0.00014389923059926062, + "loss": 0.9941, + "step": 703 + }, + { + "epoch": 0.37546666666666667, + "grad_norm": 0.5441001490299535, + "learning_rate": 0.0001437439015363638, + "loss": 0.8729, + "step": 704 + }, + { + "epoch": 0.376, + "grad_norm": 0.54014081332669, + "learning_rate": 0.00014358844184753712, + "loss": 0.8425, + "step": 705 + }, + { + "epoch": 0.37653333333333333, + "grad_norm": 0.7196576958019154, + "learning_rate": 0.00014343285199700683, + "loss": 0.8626, + "step": 706 + }, + { + "epoch": 0.37706666666666666, + "grad_norm": 0.5374306315732337, + "learning_rate": 0.0001432771324493879, + "loss": 0.796, + "step": 707 + }, + { + "epoch": 0.3776, + "grad_norm": 0.45059029744056645, + "learning_rate": 0.00014312128366968243, + "loss": 0.6929, + "step": 708 + }, + { + "epoch": 0.3781333333333333, + "grad_norm": 0.682780798968389, + "learning_rate": 0.00014296530612327863, + "loss": 0.835, + "step": 709 + }, + { + "epoch": 0.37866666666666665, + "grad_norm": 0.5760745710423022, + "learning_rate": 0.00014280920027594907, + "loss": 0.9098, + "step": 710 + }, + { + "epoch": 0.3792, + "grad_norm": 0.6632918626070273, + "learning_rate": 0.00014265296659384956, + "loss": 0.8505, + "step": 711 + }, + { + "epoch": 0.3797333333333333, + "grad_norm": 0.5281403267195357, + "learning_rate": 0.00014249660554351752, + "loss": 0.809, + "step": 712 + }, + { + "epoch": 0.38026666666666664, + "grad_norm": 0.5641990601207183, + "learning_rate": 0.00014234011759187083, + "loss": 0.8848, + "step": 713 + }, + { + "epoch": 0.3808, + "grad_norm": 0.527565077400961, + "learning_rate": 0.00014218350320620624, + "loss": 0.7952, + "step": 714 + }, + { + "epoch": 0.38133333333333336, + "grad_norm": 0.5914854708228958, + "learning_rate": 0.00014202676285419812, + "loss": 0.9428, + "step": 715 + }, + { + "epoch": 0.3818666666666667, + "grad_norm": 0.5545782427824447, + "learning_rate": 0.00014186989700389687, + "loss": 0.7598, + "step": 716 + }, + { + "epoch": 0.3824, + "grad_norm": 0.5165600165575125, + "learning_rate": 0.0001417129061237278, + "loss": 0.8076, + "step": 717 + }, + { + "epoch": 0.38293333333333335, + "grad_norm": 0.5524505528132692, + "learning_rate": 0.0001415557906824895, + "loss": 0.81, + "step": 718 + }, + { + "epoch": 0.3834666666666667, + "grad_norm": 0.4404724358357859, + "learning_rate": 0.00014139855114935252, + "loss": 0.7321, + "step": 719 + }, + { + "epoch": 0.384, + "grad_norm": 0.5464096844465652, + "learning_rate": 0.00014124118799385796, + "loss": 0.863, + "step": 720 + }, + { + "epoch": 0.38453333333333334, + "grad_norm": 0.596326184176317, + "learning_rate": 0.0001410837016859161, + "loss": 0.9013, + "step": 721 + }, + { + "epoch": 0.38506666666666667, + "grad_norm": 0.5583953745799929, + "learning_rate": 0.00014092609269580496, + "loss": 0.825, + "step": 722 + }, + { + "epoch": 0.3856, + "grad_norm": 0.4659146569910716, + "learning_rate": 0.00014076836149416887, + "loss": 0.785, + "step": 723 + }, + { + "epoch": 0.38613333333333333, + "grad_norm": 0.594532375525093, + "learning_rate": 0.00014061050855201723, + "loss": 0.9042, + "step": 724 + }, + { + "epoch": 0.38666666666666666, + "grad_norm": 0.5711569809211288, + "learning_rate": 0.0001404525343407228, + "loss": 0.8858, + "step": 725 + }, + { + "epoch": 0.3872, + "grad_norm": 0.4619560652991561, + "learning_rate": 0.0001402944393320206, + "loss": 0.695, + "step": 726 + }, + { + "epoch": 0.3877333333333333, + "grad_norm": 0.4537332799771394, + "learning_rate": 0.00014013622399800627, + "loss": 0.7853, + "step": 727 + }, + { + "epoch": 0.38826666666666665, + "grad_norm": 0.4903248337452614, + "learning_rate": 0.00013997788881113489, + "loss": 0.7266, + "step": 728 + }, + { + "epoch": 0.3888, + "grad_norm": 0.5185749863376512, + "learning_rate": 0.00013981943424421932, + "loss": 0.8424, + "step": 729 + }, + { + "epoch": 0.3893333333333333, + "grad_norm": 0.5810223907787916, + "learning_rate": 0.0001396608607704289, + "loss": 0.7551, + "step": 730 + }, + { + "epoch": 0.38986666666666664, + "grad_norm": 0.5175629712730113, + "learning_rate": 0.0001395021688632882, + "loss": 0.8969, + "step": 731 + }, + { + "epoch": 0.3904, + "grad_norm": 0.508891684470168, + "learning_rate": 0.00013934335899667527, + "loss": 0.8234, + "step": 732 + }, + { + "epoch": 0.39093333333333335, + "grad_norm": 0.43832681951349495, + "learning_rate": 0.00013918443164482046, + "loss": 0.8012, + "step": 733 + }, + { + "epoch": 0.3914666666666667, + "grad_norm": 0.47721748897383354, + "learning_rate": 0.000139025387282305, + "loss": 0.6773, + "step": 734 + }, + { + "epoch": 0.392, + "grad_norm": 0.5518334509589206, + "learning_rate": 0.00013886622638405952, + "loss": 0.8515, + "step": 735 + }, + { + "epoch": 0.39253333333333335, + "grad_norm": 0.5932736192719247, + "learning_rate": 0.0001387069494253626, + "loss": 0.8884, + "step": 736 + }, + { + "epoch": 0.3930666666666667, + "grad_norm": 0.5016716850470819, + "learning_rate": 0.0001385475568818394, + "loss": 0.8362, + "step": 737 + }, + { + "epoch": 0.3936, + "grad_norm": 0.4408539969017525, + "learning_rate": 0.00013838804922946027, + "loss": 0.655, + "step": 738 + }, + { + "epoch": 0.39413333333333334, + "grad_norm": 0.5532651593950209, + "learning_rate": 0.00013822842694453924, + "loss": 0.7705, + "step": 739 + }, + { + "epoch": 0.39466666666666667, + "grad_norm": 0.7610366060314936, + "learning_rate": 0.0001380686905037327, + "loss": 0.8962, + "step": 740 + }, + { + "epoch": 0.3952, + "grad_norm": 0.5013309116849305, + "learning_rate": 0.00013790884038403795, + "loss": 0.8293, + "step": 741 + }, + { + "epoch": 0.3957333333333333, + "grad_norm": 0.6000546878969186, + "learning_rate": 0.00013774887706279165, + "loss": 0.9113, + "step": 742 + }, + { + "epoch": 0.39626666666666666, + "grad_norm": 0.5347278003291565, + "learning_rate": 0.0001375888010176686, + "loss": 0.7952, + "step": 743 + }, + { + "epoch": 0.3968, + "grad_norm": 0.45936417274121255, + "learning_rate": 0.00013742861272668012, + "loss": 0.7514, + "step": 744 + }, + { + "epoch": 0.3973333333333333, + "grad_norm": 0.5498682094516937, + "learning_rate": 0.00013726831266817278, + "loss": 0.8453, + "step": 745 + }, + { + "epoch": 0.39786666666666665, + "grad_norm": 0.431134911625742, + "learning_rate": 0.00013710790132082692, + "loss": 0.783, + "step": 746 + }, + { + "epoch": 0.3984, + "grad_norm": 0.45639860323450016, + "learning_rate": 0.00013694737916365517, + "loss": 0.6993, + "step": 747 + }, + { + "epoch": 0.3989333333333333, + "grad_norm": 0.4421287312564141, + "learning_rate": 0.00013678674667600102, + "loss": 0.7874, + "step": 748 + }, + { + "epoch": 0.3994666666666667, + "grad_norm": 0.5869186668573526, + "learning_rate": 0.00013662600433753745, + "loss": 0.8399, + "step": 749 + }, + { + "epoch": 0.4, + "grad_norm": 0.4910762154838812, + "learning_rate": 0.00013646515262826552, + "loss": 0.8655, + "step": 750 + }, + { + "epoch": 0.40053333333333335, + "grad_norm": 0.4270454417647534, + "learning_rate": 0.00013630419202851284, + "loss": 0.6779, + "step": 751 + }, + { + "epoch": 0.4010666666666667, + "grad_norm": 0.5167903722991387, + "learning_rate": 0.00013614312301893223, + "loss": 0.8096, + "step": 752 + }, + { + "epoch": 0.4016, + "grad_norm": 0.39608345285964125, + "learning_rate": 0.0001359819460805001, + "loss": 0.6431, + "step": 753 + }, + { + "epoch": 0.40213333333333334, + "grad_norm": 0.48275630343373616, + "learning_rate": 0.00013582066169451535, + "loss": 0.7902, + "step": 754 + }, + { + "epoch": 0.4026666666666667, + "grad_norm": 0.5767104270423933, + "learning_rate": 0.0001356592703425976, + "loss": 0.81, + "step": 755 + }, + { + "epoch": 0.4032, + "grad_norm": 0.49965591680653026, + "learning_rate": 0.0001354977725066859, + "loss": 0.7811, + "step": 756 + }, + { + "epoch": 0.40373333333333333, + "grad_norm": 0.5519634374737002, + "learning_rate": 0.00013533616866903735, + "loss": 0.7847, + "step": 757 + }, + { + "epoch": 0.40426666666666666, + "grad_norm": 0.444654791469125, + "learning_rate": 0.0001351744593122255, + "loss": 0.6873, + "step": 758 + }, + { + "epoch": 0.4048, + "grad_norm": 0.552838304876708, + "learning_rate": 0.00013501264491913906, + "loss": 0.8698, + "step": 759 + }, + { + "epoch": 0.4053333333333333, + "grad_norm": 0.6226914419710441, + "learning_rate": 0.00013485072597298038, + "loss": 0.7971, + "step": 760 + }, + { + "epoch": 0.40586666666666665, + "grad_norm": 0.4637695203653883, + "learning_rate": 0.00013468870295726398, + "loss": 0.7128, + "step": 761 + }, + { + "epoch": 0.4064, + "grad_norm": 0.4407982506043246, + "learning_rate": 0.0001345265763558152, + "loss": 0.7387, + "step": 762 + }, + { + "epoch": 0.4069333333333333, + "grad_norm": 0.49683090767362115, + "learning_rate": 0.00013436434665276865, + "loss": 0.8292, + "step": 763 + }, + { + "epoch": 0.40746666666666664, + "grad_norm": 0.41056446416752945, + "learning_rate": 0.00013420201433256689, + "loss": 0.7174, + "step": 764 + }, + { + "epoch": 0.408, + "grad_norm": 0.5436079044084973, + "learning_rate": 0.00013403957987995882, + "loss": 0.7716, + "step": 765 + }, + { + "epoch": 0.40853333333333336, + "grad_norm": 0.45038021533211064, + "learning_rate": 0.00013387704377999842, + "loss": 0.7074, + "step": 766 + }, + { + "epoch": 0.4090666666666667, + "grad_norm": 0.4342779506245978, + "learning_rate": 0.00013371440651804313, + "loss": 0.7158, + "step": 767 + }, + { + "epoch": 0.4096, + "grad_norm": 0.49226940999672253, + "learning_rate": 0.0001335516685797525, + "loss": 0.8152, + "step": 768 + }, + { + "epoch": 0.41013333333333335, + "grad_norm": 0.5220423047113417, + "learning_rate": 0.00013338883045108674, + "loss": 0.7654, + "step": 769 + }, + { + "epoch": 0.4106666666666667, + "grad_norm": 0.7758953736238604, + "learning_rate": 0.00013322589261830517, + "loss": 0.8961, + "step": 770 + }, + { + "epoch": 0.4112, + "grad_norm": 0.3854930807897228, + "learning_rate": 0.00013306285556796495, + "loss": 0.7132, + "step": 771 + }, + { + "epoch": 0.41173333333333334, + "grad_norm": 0.48707337361403186, + "learning_rate": 0.0001328997197869194, + "loss": 0.7717, + "step": 772 + }, + { + "epoch": 0.41226666666666667, + "grad_norm": 0.49695349665441296, + "learning_rate": 0.0001327364857623168, + "loss": 0.7762, + "step": 773 + }, + { + "epoch": 0.4128, + "grad_norm": 0.4476964992522201, + "learning_rate": 0.00013257315398159864, + "loss": 0.6505, + "step": 774 + }, + { + "epoch": 0.41333333333333333, + "grad_norm": 0.6202234359546465, + "learning_rate": 0.00013240972493249847, + "loss": 0.8655, + "step": 775 + }, + { + "epoch": 0.41386666666666666, + "grad_norm": 0.5924779401268684, + "learning_rate": 0.0001322461991030402, + "loss": 0.9234, + "step": 776 + }, + { + "epoch": 0.4144, + "grad_norm": 0.5628833346263253, + "learning_rate": 0.00013208257698153677, + "loss": 0.7812, + "step": 777 + }, + { + "epoch": 0.4149333333333333, + "grad_norm": 0.5541031075439173, + "learning_rate": 0.00013191885905658872, + "loss": 0.8099, + "step": 778 + }, + { + "epoch": 0.41546666666666665, + "grad_norm": 0.4432608568673146, + "learning_rate": 0.0001317550458170826, + "loss": 0.7587, + "step": 779 + }, + { + "epoch": 0.416, + "grad_norm": 0.5539590755326042, + "learning_rate": 0.00013159113775218964, + "loss": 0.8822, + "step": 780 + }, + { + "epoch": 0.4165333333333333, + "grad_norm": 0.5323410127256709, + "learning_rate": 0.00013142713535136414, + "loss": 0.823, + "step": 781 + }, + { + "epoch": 0.41706666666666664, + "grad_norm": 0.42560719757431825, + "learning_rate": 0.00013126303910434214, + "loss": 0.7662, + "step": 782 + }, + { + "epoch": 0.4176, + "grad_norm": 0.5261584777482855, + "learning_rate": 0.00013109884950114007, + "loss": 0.7926, + "step": 783 + }, + { + "epoch": 0.41813333333333336, + "grad_norm": 0.5012552941079297, + "learning_rate": 0.00013093456703205288, + "loss": 0.8183, + "step": 784 + }, + { + "epoch": 0.4186666666666667, + "grad_norm": 0.5678895279740954, + "learning_rate": 0.00013077019218765305, + "loss": 0.859, + "step": 785 + }, + { + "epoch": 0.4192, + "grad_norm": 0.4926381674080493, + "learning_rate": 0.00013060572545878875, + "loss": 0.7717, + "step": 786 + }, + { + "epoch": 0.41973333333333335, + "grad_norm": 0.7495165793130969, + "learning_rate": 0.0001304411673365826, + "loss": 0.9309, + "step": 787 + }, + { + "epoch": 0.4202666666666667, + "grad_norm": 0.6307274534361385, + "learning_rate": 0.0001302765183124302, + "loss": 0.9415, + "step": 788 + }, + { + "epoch": 0.4208, + "grad_norm": 0.48031720156492214, + "learning_rate": 0.00013011177887799845, + "loss": 0.8021, + "step": 789 + }, + { + "epoch": 0.42133333333333334, + "grad_norm": 0.44262945926543323, + "learning_rate": 0.00012994694952522435, + "loss": 0.7721, + "step": 790 + }, + { + "epoch": 0.42186666666666667, + "grad_norm": 0.4489673221358516, + "learning_rate": 0.00012978203074631334, + "loss": 0.74, + "step": 791 + }, + { + "epoch": 0.4224, + "grad_norm": 0.5636990162785608, + "learning_rate": 0.00012961702303373795, + "loss": 0.7533, + "step": 792 + }, + { + "epoch": 0.42293333333333333, + "grad_norm": 0.6190679087459675, + "learning_rate": 0.00012945192688023624, + "loss": 0.8796, + "step": 793 + }, + { + "epoch": 0.42346666666666666, + "grad_norm": 0.8275050823934263, + "learning_rate": 0.0001292867427788104, + "loss": 0.985, + "step": 794 + }, + { + "epoch": 0.424, + "grad_norm": 0.46408825605364606, + "learning_rate": 0.00012912147122272523, + "loss": 0.7622, + "step": 795 + }, + { + "epoch": 0.4245333333333333, + "grad_norm": 0.45975415834022415, + "learning_rate": 0.00012895611270550666, + "loss": 0.8291, + "step": 796 + }, + { + "epoch": 0.42506666666666665, + "grad_norm": 0.5169476805504977, + "learning_rate": 0.0001287906677209403, + "loss": 0.8317, + "step": 797 + }, + { + "epoch": 0.4256, + "grad_norm": 0.513118763839025, + "learning_rate": 0.00012862513676307008, + "loss": 0.7682, + "step": 798 + }, + { + "epoch": 0.4261333333333333, + "grad_norm": 0.447672363663369, + "learning_rate": 0.0001284595203261965, + "loss": 0.7256, + "step": 799 + }, + { + "epoch": 0.4266666666666667, + "grad_norm": 0.6414550481006086, + "learning_rate": 0.00012829381890487536, + "loss": 0.7853, + "step": 800 + }, + { + "epoch": 0.4272, + "grad_norm": 0.5001712911293971, + "learning_rate": 0.00012812803299391628, + "loss": 0.7769, + "step": 801 + }, + { + "epoch": 0.42773333333333335, + "grad_norm": 0.5350510273310185, + "learning_rate": 0.00012796216308838117, + "loss": 0.8501, + "step": 802 + }, + { + "epoch": 0.4282666666666667, + "grad_norm": 0.5960689101392795, + "learning_rate": 0.00012779620968358273, + "loss": 0.8967, + "step": 803 + }, + { + "epoch": 0.4288, + "grad_norm": 0.42735189779942234, + "learning_rate": 0.00012763017327508305, + "loss": 0.7208, + "step": 804 + }, + { + "epoch": 0.42933333333333334, + "grad_norm": 0.5989912309790166, + "learning_rate": 0.00012746405435869198, + "loss": 0.7989, + "step": 805 + }, + { + "epoch": 0.4298666666666667, + "grad_norm": 0.40593249035056556, + "learning_rate": 0.00012729785343046588, + "loss": 0.7625, + "step": 806 + }, + { + "epoch": 0.4304, + "grad_norm": 0.5900373156108883, + "learning_rate": 0.0001271315709867059, + "loss": 0.9246, + "step": 807 + }, + { + "epoch": 0.43093333333333333, + "grad_norm": 0.5520195141305115, + "learning_rate": 0.00012696520752395672, + "loss": 0.8108, + "step": 808 + }, + { + "epoch": 0.43146666666666667, + "grad_norm": 0.4856158173207746, + "learning_rate": 0.00012679876353900482, + "loss": 0.7981, + "step": 809 + }, + { + "epoch": 0.432, + "grad_norm": 0.5018996299240422, + "learning_rate": 0.00012663223952887723, + "loss": 0.8017, + "step": 810 + }, + { + "epoch": 0.4325333333333333, + "grad_norm": 0.44592198162703134, + "learning_rate": 0.00012646563599083996, + "loss": 0.7587, + "step": 811 + }, + { + "epoch": 0.43306666666666666, + "grad_norm": 0.5851073621552025, + "learning_rate": 0.00012629895342239643, + "loss": 0.8203, + "step": 812 + }, + { + "epoch": 0.4336, + "grad_norm": 0.6022549792651677, + "learning_rate": 0.00012613219232128608, + "loss": 0.7525, + "step": 813 + }, + { + "epoch": 0.4341333333333333, + "grad_norm": 0.418459635990812, + "learning_rate": 0.00012596535318548289, + "loss": 0.699, + "step": 814 + }, + { + "epoch": 0.43466666666666665, + "grad_norm": 0.5013376005889809, + "learning_rate": 0.0001257984365131938, + "loss": 0.8398, + "step": 815 + }, + { + "epoch": 0.4352, + "grad_norm": 0.6079407284267303, + "learning_rate": 0.00012563144280285741, + "loss": 0.8777, + "step": 816 + }, + { + "epoch": 0.4357333333333333, + "grad_norm": 0.4257371313410328, + "learning_rate": 0.00012546437255314222, + "loss": 0.7593, + "step": 817 + }, + { + "epoch": 0.4362666666666667, + "grad_norm": 0.6591808043227458, + "learning_rate": 0.0001252972262629454, + "loss": 0.9076, + "step": 818 + }, + { + "epoch": 0.4368, + "grad_norm": 0.6802700917475603, + "learning_rate": 0.00012513000443139112, + "loss": 0.8756, + "step": 819 + }, + { + "epoch": 0.43733333333333335, + "grad_norm": 0.5181596294153098, + "learning_rate": 0.00012496270755782914, + "loss": 0.8258, + "step": 820 + }, + { + "epoch": 0.4378666666666667, + "grad_norm": 0.5981191712702367, + "learning_rate": 0.00012479533614183334, + "loss": 0.8298, + "step": 821 + }, + { + "epoch": 0.4384, + "grad_norm": 0.4734819007822332, + "learning_rate": 0.00012462789068320017, + "loss": 0.6754, + "step": 822 + }, + { + "epoch": 0.43893333333333334, + "grad_norm": 0.5499751224532713, + "learning_rate": 0.00012446037168194714, + "loss": 0.8376, + "step": 823 + }, + { + "epoch": 0.43946666666666667, + "grad_norm": 0.8101400981147132, + "learning_rate": 0.00012429277963831148, + "loss": 0.8358, + "step": 824 + }, + { + "epoch": 0.44, + "grad_norm": 0.4981836652220473, + "learning_rate": 0.00012412511505274844, + "loss": 0.8621, + "step": 825 + }, + { + "epoch": 0.44053333333333333, + "grad_norm": 0.6068909278602164, + "learning_rate": 0.00012395737842592995, + "loss": 0.8989, + "step": 826 + }, + { + "epoch": 0.44106666666666666, + "grad_norm": 0.5063610479063229, + "learning_rate": 0.000123789570258743, + "loss": 0.8067, + "step": 827 + }, + { + "epoch": 0.4416, + "grad_norm": 0.6150433634808093, + "learning_rate": 0.00012362169105228826, + "loss": 0.7889, + "step": 828 + }, + { + "epoch": 0.4421333333333333, + "grad_norm": 0.6595179600986578, + "learning_rate": 0.00012345374130787854, + "loss": 0.9369, + "step": 829 + }, + { + "epoch": 0.44266666666666665, + "grad_norm": 0.5242205074387309, + "learning_rate": 0.00012328572152703725, + "loss": 0.8035, + "step": 830 + }, + { + "epoch": 0.4432, + "grad_norm": 0.5249171801917747, + "learning_rate": 0.000123117632211497, + "loss": 0.7271, + "step": 831 + }, + { + "epoch": 0.4437333333333333, + "grad_norm": 0.525253689840259, + "learning_rate": 0.00012294947386319794, + "loss": 0.8092, + "step": 832 + }, + { + "epoch": 0.44426666666666664, + "grad_norm": 0.5148719131426918, + "learning_rate": 0.0001227812469842864, + "loss": 0.7503, + "step": 833 + }, + { + "epoch": 0.4448, + "grad_norm": 2.263188923952998, + "learning_rate": 0.00012261295207711346, + "loss": 0.814, + "step": 834 + }, + { + "epoch": 0.44533333333333336, + "grad_norm": 0.576438018451827, + "learning_rate": 0.00012244458964423327, + "loss": 0.8351, + "step": 835 + }, + { + "epoch": 0.4458666666666667, + "grad_norm": 0.6799800154645094, + "learning_rate": 0.00012227616018840154, + "loss": 0.8366, + "step": 836 + }, + { + "epoch": 0.4464, + "grad_norm": 0.48282807516787113, + "learning_rate": 0.0001221076642125742, + "loss": 0.6809, + "step": 837 + }, + { + "epoch": 0.44693333333333335, + "grad_norm": 0.5971682835989159, + "learning_rate": 0.00012193910221990581, + "loss": 0.8206, + "step": 838 + }, + { + "epoch": 0.4474666666666667, + "grad_norm": 0.4830829692501156, + "learning_rate": 0.00012177047471374807, + "loss": 0.7988, + "step": 839 + }, + { + "epoch": 0.448, + "grad_norm": 0.48624698153396156, + "learning_rate": 0.00012160178219764837, + "loss": 0.7756, + "step": 840 + }, + { + "epoch": 0.44853333333333334, + "grad_norm": 0.5583476779499136, + "learning_rate": 0.0001214330251753481, + "loss": 0.8014, + "step": 841 + }, + { + "epoch": 0.44906666666666667, + "grad_norm": 0.5121053928194806, + "learning_rate": 0.00012126420415078132, + "loss": 0.8634, + "step": 842 + }, + { + "epoch": 0.4496, + "grad_norm": 0.5109827941220455, + "learning_rate": 0.00012109531962807332, + "loss": 0.8331, + "step": 843 + }, + { + "epoch": 0.45013333333333333, + "grad_norm": 0.45922568949074166, + "learning_rate": 0.00012092637211153885, + "loss": 0.7878, + "step": 844 + }, + { + "epoch": 0.45066666666666666, + "grad_norm": 0.491968494299683, + "learning_rate": 0.0001207573621056809, + "loss": 0.7838, + "step": 845 + }, + { + "epoch": 0.4512, + "grad_norm": 0.5340310158148845, + "learning_rate": 0.00012058829011518896, + "loss": 0.8843, + "step": 846 + }, + { + "epoch": 0.4517333333333333, + "grad_norm": 0.5213494917538501, + "learning_rate": 0.00012041915664493761, + "loss": 0.9101, + "step": 847 + }, + { + "epoch": 0.45226666666666665, + "grad_norm": 0.3655596990339082, + "learning_rate": 0.00012024996219998517, + "loss": 0.7014, + "step": 848 + }, + { + "epoch": 0.4528, + "grad_norm": 0.4226746379320319, + "learning_rate": 0.00012008070728557186, + "loss": 0.7779, + "step": 849 + }, + { + "epoch": 0.4533333333333333, + "grad_norm": 0.431341435554287, + "learning_rate": 0.00011991139240711857, + "loss": 0.8037, + "step": 850 + }, + { + "epoch": 0.45386666666666664, + "grad_norm": 0.47820557918463213, + "learning_rate": 0.00011974201807022525, + "loss": 0.7338, + "step": 851 + }, + { + "epoch": 0.4544, + "grad_norm": 0.4320850528449281, + "learning_rate": 0.00011957258478066931, + "loss": 0.7034, + "step": 852 + }, + { + "epoch": 0.45493333333333336, + "grad_norm": 0.5121373761865688, + "learning_rate": 0.00011940309304440433, + "loss": 0.785, + "step": 853 + }, + { + "epoch": 0.4554666666666667, + "grad_norm": 0.5195334690582885, + "learning_rate": 0.00011923354336755835, + "loss": 0.8547, + "step": 854 + }, + { + "epoch": 0.456, + "grad_norm": 0.47214121557243605, + "learning_rate": 0.00011906393625643244, + "loss": 0.764, + "step": 855 + }, + { + "epoch": 0.45653333333333335, + "grad_norm": 0.4517409213750945, + "learning_rate": 0.00011889427221749916, + "loss": 0.7222, + "step": 856 + }, + { + "epoch": 0.4570666666666667, + "grad_norm": 0.487304930266377, + "learning_rate": 0.00011872455175740112, + "loss": 0.7959, + "step": 857 + }, + { + "epoch": 0.4576, + "grad_norm": 0.4557153863646574, + "learning_rate": 0.00011855477538294935, + "loss": 0.8079, + "step": 858 + }, + { + "epoch": 0.45813333333333334, + "grad_norm": 0.3929571758539548, + "learning_rate": 0.00011838494360112185, + "loss": 0.7375, + "step": 859 + }, + { + "epoch": 0.45866666666666667, + "grad_norm": 0.6326251573267269, + "learning_rate": 0.00011821505691906216, + "loss": 0.9481, + "step": 860 + }, + { + "epoch": 0.4592, + "grad_norm": 0.4990926600937626, + "learning_rate": 0.00011804511584407763, + "loss": 0.7798, + "step": 861 + }, + { + "epoch": 0.4597333333333333, + "grad_norm": 0.5934156452636286, + "learning_rate": 0.00011787512088363817, + "loss": 0.7484, + "step": 862 + }, + { + "epoch": 0.46026666666666666, + "grad_norm": 0.4278808297111898, + "learning_rate": 0.00011770507254537453, + "loss": 0.6593, + "step": 863 + }, + { + "epoch": 0.4608, + "grad_norm": 0.536243830039993, + "learning_rate": 0.00011753497133707679, + "loss": 0.8349, + "step": 864 + }, + { + "epoch": 0.4613333333333333, + "grad_norm": 0.46465540209464534, + "learning_rate": 0.00011736481776669306, + "loss": 0.7015, + "step": 865 + }, + { + "epoch": 0.46186666666666665, + "grad_norm": 0.433642846301004, + "learning_rate": 0.00011719461234232764, + "loss": 0.7143, + "step": 866 + }, + { + "epoch": 0.4624, + "grad_norm": 0.42269971011810403, + "learning_rate": 0.00011702435557223987, + "loss": 0.7105, + "step": 867 + }, + { + "epoch": 0.4629333333333333, + "grad_norm": 0.4847553733063033, + "learning_rate": 0.00011685404796484225, + "loss": 0.6555, + "step": 868 + }, + { + "epoch": 0.4634666666666667, + "grad_norm": 0.5140368262100883, + "learning_rate": 0.00011668369002869912, + "loss": 0.8074, + "step": 869 + }, + { + "epoch": 0.464, + "grad_norm": 0.7552403345216738, + "learning_rate": 0.00011651328227252517, + "loss": 0.8453, + "step": 870 + }, + { + "epoch": 0.46453333333333335, + "grad_norm": 0.56376967482142, + "learning_rate": 0.00011634282520518383, + "loss": 0.8418, + "step": 871 + }, + { + "epoch": 0.4650666666666667, + "grad_norm": 0.5851998857854613, + "learning_rate": 0.00011617231933568578, + "loss": 0.85, + "step": 872 + }, + { + "epoch": 0.4656, + "grad_norm": 0.4105678044595301, + "learning_rate": 0.00011600176517318741, + "loss": 0.7372, + "step": 873 + }, + { + "epoch": 0.46613333333333334, + "grad_norm": 0.6190898553898988, + "learning_rate": 0.00011583116322698935, + "loss": 0.8528, + "step": 874 + }, + { + "epoch": 0.4666666666666667, + "grad_norm": 0.6340439413696632, + "learning_rate": 0.00011566051400653486, + "loss": 0.8552, + "step": 875 + }, + { + "epoch": 0.4672, + "grad_norm": 0.4871600449223175, + "learning_rate": 0.00011548981802140848, + "loss": 0.737, + "step": 876 + }, + { + "epoch": 0.46773333333333333, + "grad_norm": 0.46543459690668115, + "learning_rate": 0.00011531907578133429, + "loss": 0.7621, + "step": 877 + }, + { + "epoch": 0.46826666666666666, + "grad_norm": 0.45676040032070964, + "learning_rate": 0.00011514828779617459, + "loss": 0.6521, + "step": 878 + }, + { + "epoch": 0.4688, + "grad_norm": 0.4444505690264811, + "learning_rate": 0.00011497745457592816, + "loss": 0.7469, + "step": 879 + }, + { + "epoch": 0.4693333333333333, + "grad_norm": 0.4316611368918944, + "learning_rate": 0.00011480657663072896, + "loss": 0.6903, + "step": 880 + }, + { + "epoch": 0.46986666666666665, + "grad_norm": 0.48348628347312234, + "learning_rate": 0.00011463565447084445, + "loss": 0.7634, + "step": 881 + }, + { + "epoch": 0.4704, + "grad_norm": 0.5651308913118273, + "learning_rate": 0.00011446468860667421, + "loss": 0.8512, + "step": 882 + }, + { + "epoch": 0.4709333333333333, + "grad_norm": 0.4956568675602372, + "learning_rate": 0.00011429367954874819, + "loss": 0.7371, + "step": 883 + }, + { + "epoch": 0.47146666666666665, + "grad_norm": 0.5711445179616791, + "learning_rate": 0.0001141226278077254, + "loss": 0.793, + "step": 884 + }, + { + "epoch": 0.472, + "grad_norm": 0.49332492360055186, + "learning_rate": 0.00011395153389439233, + "loss": 0.7517, + "step": 885 + }, + { + "epoch": 0.47253333333333336, + "grad_norm": 0.43059417980440506, + "learning_rate": 0.00011378039831966134, + "loss": 0.6782, + "step": 886 + }, + { + "epoch": 0.4730666666666667, + "grad_norm": 0.515807728343383, + "learning_rate": 0.00011360922159456928, + "loss": 0.7495, + "step": 887 + }, + { + "epoch": 0.4736, + "grad_norm": 0.6139067016998846, + "learning_rate": 0.00011343800423027582, + "loss": 0.7653, + "step": 888 + }, + { + "epoch": 0.47413333333333335, + "grad_norm": 0.5228184194482335, + "learning_rate": 0.00011326674673806195, + "loss": 0.8436, + "step": 889 + }, + { + "epoch": 0.4746666666666667, + "grad_norm": 0.45164086282048727, + "learning_rate": 0.00011309544962932862, + "loss": 0.8154, + "step": 890 + }, + { + "epoch": 0.4752, + "grad_norm": 0.49664338982240125, + "learning_rate": 0.0001129241134155949, + "loss": 0.7513, + "step": 891 + }, + { + "epoch": 0.47573333333333334, + "grad_norm": 0.48084906692108864, + "learning_rate": 0.00011275273860849684, + "loss": 0.7482, + "step": 892 + }, + { + "epoch": 0.47626666666666667, + "grad_norm": 0.551448897376019, + "learning_rate": 0.00011258132571978555, + "loss": 0.8216, + "step": 893 + }, + { + "epoch": 0.4768, + "grad_norm": 0.4445395746743038, + "learning_rate": 0.00011240987526132594, + "loss": 0.7074, + "step": 894 + }, + { + "epoch": 0.47733333333333333, + "grad_norm": 0.5620772014856222, + "learning_rate": 0.00011223838774509514, + "loss": 0.8429, + "step": 895 + }, + { + "epoch": 0.47786666666666666, + "grad_norm": 0.4909096300506755, + "learning_rate": 0.00011206686368318086, + "loss": 0.7787, + "step": 896 + }, + { + "epoch": 0.4784, + "grad_norm": 0.4681599585765567, + "learning_rate": 0.00011189530358778005, + "loss": 0.7866, + "step": 897 + }, + { + "epoch": 0.4789333333333333, + "grad_norm": 0.5543045236010434, + "learning_rate": 0.00011172370797119712, + "loss": 0.8084, + "step": 898 + }, + { + "epoch": 0.47946666666666665, + "grad_norm": 0.45408125652307624, + "learning_rate": 0.00011155207734584263, + "loss": 0.7466, + "step": 899 + }, + { + "epoch": 0.48, + "grad_norm": 0.46233535011485677, + "learning_rate": 0.00011138041222423177, + "loss": 0.7628, + "step": 900 + }, + { + "epoch": 0.4805333333333333, + "grad_norm": 0.47013186928167217, + "learning_rate": 0.00011120871311898254, + "loss": 0.8193, + "step": 901 + }, + { + "epoch": 0.48106666666666664, + "grad_norm": 0.5887968982013919, + "learning_rate": 0.0001110369805428146, + "loss": 0.9226, + "step": 902 + }, + { + "epoch": 0.4816, + "grad_norm": 0.695042367869921, + "learning_rate": 0.00011086521500854745, + "loss": 0.8255, + "step": 903 + }, + { + "epoch": 0.48213333333333336, + "grad_norm": 0.4546170412410129, + "learning_rate": 0.0001106934170290991, + "loss": 0.7532, + "step": 904 + }, + { + "epoch": 0.4826666666666667, + "grad_norm": 0.5002388839330394, + "learning_rate": 0.00011052158711748434, + "loss": 0.7211, + "step": 905 + }, + { + "epoch": 0.4832, + "grad_norm": 0.46112879552081704, + "learning_rate": 0.00011034972578681338, + "loss": 0.761, + "step": 906 + }, + { + "epoch": 0.48373333333333335, + "grad_norm": 0.7208674165988788, + "learning_rate": 0.00011017783355029026, + "loss": 0.7312, + "step": 907 + }, + { + "epoch": 0.4842666666666667, + "grad_norm": 0.4663813215399649, + "learning_rate": 0.00011000591092121127, + "loss": 0.7875, + "step": 908 + }, + { + "epoch": 0.4848, + "grad_norm": 0.4829297395816714, + "learning_rate": 0.00010983395841296348, + "loss": 0.7529, + "step": 909 + }, + { + "epoch": 0.48533333333333334, + "grad_norm": 0.4493307430244323, + "learning_rate": 0.0001096619765390232, + "loss": 0.7081, + "step": 910 + }, + { + "epoch": 0.48586666666666667, + "grad_norm": 0.44985051226127615, + "learning_rate": 0.00010948996581295436, + "loss": 0.6445, + "step": 911 + }, + { + "epoch": 0.4864, + "grad_norm": 0.5586032923358022, + "learning_rate": 0.00010931792674840718, + "loss": 0.8904, + "step": 912 + }, + { + "epoch": 0.48693333333333333, + "grad_norm": 0.41189700186726963, + "learning_rate": 0.00010914585985911632, + "loss": 0.7298, + "step": 913 + }, + { + "epoch": 0.48746666666666666, + "grad_norm": 0.48701202749098055, + "learning_rate": 0.00010897376565889971, + "loss": 0.6895, + "step": 914 + }, + { + "epoch": 0.488, + "grad_norm": 0.5102273322747214, + "learning_rate": 0.00010880164466165674, + "loss": 0.8056, + "step": 915 + }, + { + "epoch": 0.4885333333333333, + "grad_norm": 0.5078299674739979, + "learning_rate": 0.00010862949738136681, + "loss": 0.7926, + "step": 916 + }, + { + "epoch": 0.48906666666666665, + "grad_norm": 0.3976489366799753, + "learning_rate": 0.00010845732433208779, + "loss": 0.6567, + "step": 917 + }, + { + "epoch": 0.4896, + "grad_norm": 0.505189628624692, + "learning_rate": 0.00010828512602795462, + "loss": 0.7802, + "step": 918 + }, + { + "epoch": 0.4901333333333333, + "grad_norm": 0.4651704738807796, + "learning_rate": 0.00010811290298317755, + "loss": 0.783, + "step": 919 + }, + { + "epoch": 0.49066666666666664, + "grad_norm": 0.5943459424529413, + "learning_rate": 0.00010794065571204072, + "loss": 0.7674, + "step": 920 + }, + { + "epoch": 0.4912, + "grad_norm": 0.38255583421035383, + "learning_rate": 0.00010776838472890065, + "loss": 0.6984, + "step": 921 + }, + { + "epoch": 0.49173333333333336, + "grad_norm": 0.669499610975014, + "learning_rate": 0.00010759609054818458, + "loss": 0.8616, + "step": 922 + }, + { + "epoch": 0.4922666666666667, + "grad_norm": 0.6167405918055653, + "learning_rate": 0.00010742377368438914, + "loss": 0.7848, + "step": 923 + }, + { + "epoch": 0.4928, + "grad_norm": 0.6810199151029439, + "learning_rate": 0.00010725143465207867, + "loss": 0.8911, + "step": 924 + }, + { + "epoch": 0.49333333333333335, + "grad_norm": 0.4851207421034525, + "learning_rate": 0.00010707907396588361, + "loss": 0.7019, + "step": 925 + }, + { + "epoch": 0.4938666666666667, + "grad_norm": 0.4675091668445895, + "learning_rate": 0.0001069066921404992, + "loss": 0.7666, + "step": 926 + }, + { + "epoch": 0.4944, + "grad_norm": 0.4372682694720911, + "learning_rate": 0.00010673428969068364, + "loss": 0.8016, + "step": 927 + }, + { + "epoch": 0.49493333333333334, + "grad_norm": 0.5569363401589187, + "learning_rate": 0.00010656186713125689, + "loss": 0.9226, + "step": 928 + }, + { + "epoch": 0.49546666666666667, + "grad_norm": 0.4437524237523622, + "learning_rate": 0.0001063894249770989, + "loss": 0.7747, + "step": 929 + }, + { + "epoch": 0.496, + "grad_norm": 0.38670304289499796, + "learning_rate": 0.00010621696374314807, + "loss": 0.6455, + "step": 930 + }, + { + "epoch": 0.4965333333333333, + "grad_norm": 0.4390419147479167, + "learning_rate": 0.00010604448394439983, + "loss": 0.7476, + "step": 931 + }, + { + "epoch": 0.49706666666666666, + "grad_norm": 0.48671406938555606, + "learning_rate": 0.00010587198609590505, + "loss": 0.7387, + "step": 932 + }, + { + "epoch": 0.4976, + "grad_norm": 0.452709425015621, + "learning_rate": 0.00010569947071276847, + "loss": 0.7399, + "step": 933 + }, + { + "epoch": 0.4981333333333333, + "grad_norm": 0.48169458759755385, + "learning_rate": 0.00010552693831014726, + "loss": 0.7716, + "step": 934 + }, + { + "epoch": 0.49866666666666665, + "grad_norm": 0.5810191696791625, + "learning_rate": 0.0001053543894032493, + "loss": 0.8712, + "step": 935 + }, + { + "epoch": 0.4992, + "grad_norm": 0.5765581151091413, + "learning_rate": 0.00010518182450733186, + "loss": 0.8819, + "step": 936 + }, + { + "epoch": 0.4997333333333333, + "grad_norm": 0.41820338774986837, + "learning_rate": 0.00010500924413769988, + "loss": 0.682, + "step": 937 + }, + { + "epoch": 0.5002666666666666, + "grad_norm": 0.46426557902311427, + "learning_rate": 0.00010483664880970457, + "loss": 0.7801, + "step": 938 + }, + { + "epoch": 0.5008, + "grad_norm": 0.4952533035100198, + "learning_rate": 0.00010466403903874176, + "loss": 0.7979, + "step": 939 + }, + { + "epoch": 0.5013333333333333, + "grad_norm": 0.4688203344646727, + "learning_rate": 0.00010449141534025045, + "loss": 0.7134, + "step": 940 + }, + { + "epoch": 0.5018666666666667, + "grad_norm": 0.43481292002665417, + "learning_rate": 0.00010431877822971117, + "loss": 0.7983, + "step": 941 + }, + { + "epoch": 0.5024, + "grad_norm": 0.6569565722752031, + "learning_rate": 0.00010414612822264455, + "loss": 0.7516, + "step": 942 + }, + { + "epoch": 0.5029333333333333, + "grad_norm": 0.5597575663379816, + "learning_rate": 0.00010397346583460971, + "loss": 0.8148, + "step": 943 + }, + { + "epoch": 0.5034666666666666, + "grad_norm": 0.6499924668627625, + "learning_rate": 0.0001038007915812028, + "loss": 0.8307, + "step": 944 + }, + { + "epoch": 0.504, + "grad_norm": 0.5532966267081787, + "learning_rate": 0.00010362810597805526, + "loss": 0.8149, + "step": 945 + }, + { + "epoch": 0.5045333333333333, + "grad_norm": 0.5472775181001878, + "learning_rate": 0.0001034554095408326, + "loss": 0.8534, + "step": 946 + }, + { + "epoch": 0.5050666666666667, + "grad_norm": 0.491042510513382, + "learning_rate": 0.00010328270278523256, + "loss": 0.8242, + "step": 947 + }, + { + "epoch": 0.5056, + "grad_norm": 0.5041185848643801, + "learning_rate": 0.0001031099862269837, + "loss": 0.7893, + "step": 948 + }, + { + "epoch": 0.5061333333333333, + "grad_norm": 0.5474803994533827, + "learning_rate": 0.00010293726038184393, + "loss": 0.749, + "step": 949 + }, + { + "epoch": 0.5066666666666667, + "grad_norm": 0.35925183829918744, + "learning_rate": 0.00010276452576559879, + "loss": 0.6674, + "step": 950 + }, + { + "epoch": 0.5072, + "grad_norm": 0.48099690864579764, + "learning_rate": 0.00010259178289406011, + "loss": 0.7598, + "step": 951 + }, + { + "epoch": 0.5077333333333334, + "grad_norm": 0.7751380156755314, + "learning_rate": 0.00010241903228306431, + "loss": 0.8022, + "step": 952 + }, + { + "epoch": 0.5082666666666666, + "grad_norm": 0.49938410552595913, + "learning_rate": 0.0001022462744484709, + "loss": 0.7422, + "step": 953 + }, + { + "epoch": 0.5088, + "grad_norm": 0.4882321906277085, + "learning_rate": 0.00010207350990616107, + "loss": 0.8031, + "step": 954 + }, + { + "epoch": 0.5093333333333333, + "grad_norm": 0.45454582469219645, + "learning_rate": 0.00010190073917203589, + "loss": 0.7319, + "step": 955 + }, + { + "epoch": 0.5098666666666667, + "grad_norm": 0.4934886271469217, + "learning_rate": 0.00010172796276201503, + "loss": 0.8239, + "step": 956 + }, + { + "epoch": 0.5104, + "grad_norm": 0.5424838119054968, + "learning_rate": 0.0001015551811920351, + "loss": 0.8179, + "step": 957 + }, + { + "epoch": 0.5109333333333334, + "grad_norm": 0.5110543967634006, + "learning_rate": 0.00010138239497804804, + "loss": 0.8246, + "step": 958 + }, + { + "epoch": 0.5114666666666666, + "grad_norm": 0.5787063388849131, + "learning_rate": 0.00010120960463601976, + "loss": 0.7412, + "step": 959 + }, + { + "epoch": 0.512, + "grad_norm": 0.5794064281337606, + "learning_rate": 0.00010103681068192845, + "loss": 0.806, + "step": 960 + }, + { + "epoch": 0.5125333333333333, + "grad_norm": 0.492692541135898, + "learning_rate": 0.00010086401363176305, + "loss": 0.7348, + "step": 961 + }, + { + "epoch": 0.5130666666666667, + "grad_norm": 0.46714017449595235, + "learning_rate": 0.00010069121400152181, + "loss": 0.7117, + "step": 962 + }, + { + "epoch": 0.5136, + "grad_norm": 0.5771394507869333, + "learning_rate": 0.00010051841230721065, + "loss": 0.8032, + "step": 963 + }, + { + "epoch": 0.5141333333333333, + "grad_norm": 0.5660541739325817, + "learning_rate": 0.0001003456090648416, + "loss": 0.8302, + "step": 964 + }, + { + "epoch": 0.5146666666666667, + "grad_norm": 0.5025679783937408, + "learning_rate": 0.00010017280479043147, + "loss": 0.7271, + "step": 965 + }, + { + "epoch": 0.5152, + "grad_norm": 0.5283901296934519, + "learning_rate": 0.0001, + "loss": 0.7609, + "step": 966 + }, + { + "epoch": 0.5157333333333334, + "grad_norm": 0.4821874746709882, + "learning_rate": 9.982719520956855e-05, + "loss": 0.8704, + "step": 967 + }, + { + "epoch": 0.5162666666666667, + "grad_norm": 0.5357850660811156, + "learning_rate": 9.965439093515841e-05, + "loss": 0.7379, + "step": 968 + }, + { + "epoch": 0.5168, + "grad_norm": 0.525442476682316, + "learning_rate": 9.948158769278939e-05, + "loss": 0.708, + "step": 969 + }, + { + "epoch": 0.5173333333333333, + "grad_norm": 0.3962009188355929, + "learning_rate": 9.930878599847821e-05, + "loss": 0.6954, + "step": 970 + }, + { + "epoch": 0.5178666666666667, + "grad_norm": 0.5389712448135058, + "learning_rate": 9.913598636823693e-05, + "loss": 0.798, + "step": 971 + }, + { + "epoch": 0.5184, + "grad_norm": 0.3881918146298721, + "learning_rate": 9.896318931807155e-05, + "loss": 0.6047, + "step": 972 + }, + { + "epoch": 0.5189333333333334, + "grad_norm": 0.5315277531548152, + "learning_rate": 9.879039536398024e-05, + "loss": 0.8672, + "step": 973 + }, + { + "epoch": 0.5194666666666666, + "grad_norm": 0.396419317534482, + "learning_rate": 9.861760502195197e-05, + "loss": 0.6909, + "step": 974 + }, + { + "epoch": 0.52, + "grad_norm": 0.4806724454908013, + "learning_rate": 9.844481880796491e-05, + "loss": 0.7973, + "step": 975 + }, + { + "epoch": 0.5205333333333333, + "grad_norm": 0.4697484761739775, + "learning_rate": 9.827203723798498e-05, + "loss": 0.7394, + "step": 976 + }, + { + "epoch": 0.5210666666666667, + "grad_norm": 0.5160014179048552, + "learning_rate": 9.809926082796415e-05, + "loss": 0.832, + "step": 977 + }, + { + "epoch": 0.5216, + "grad_norm": 0.591745687809906, + "learning_rate": 9.792649009383899e-05, + "loss": 0.7554, + "step": 978 + }, + { + "epoch": 0.5221333333333333, + "grad_norm": 0.4651574748926571, + "learning_rate": 9.775372555152912e-05, + "loss": 0.6775, + "step": 979 + }, + { + "epoch": 0.5226666666666666, + "grad_norm": 0.6072165626333691, + "learning_rate": 9.758096771693573e-05, + "loss": 0.9198, + "step": 980 + }, + { + "epoch": 0.5232, + "grad_norm": 0.5195555651371119, + "learning_rate": 9.740821710593989e-05, + "loss": 0.8281, + "step": 981 + }, + { + "epoch": 0.5237333333333334, + "grad_norm": 0.5442537680604943, + "learning_rate": 9.723547423440122e-05, + "loss": 0.8562, + "step": 982 + }, + { + "epoch": 0.5242666666666667, + "grad_norm": 0.6341924868794666, + "learning_rate": 9.70627396181561e-05, + "loss": 0.9075, + "step": 983 + }, + { + "epoch": 0.5248, + "grad_norm": 0.5003976190088961, + "learning_rate": 9.689001377301633e-05, + "loss": 0.7935, + "step": 984 + }, + { + "epoch": 0.5253333333333333, + "grad_norm": 0.43780679774099346, + "learning_rate": 9.671729721476746e-05, + "loss": 0.6798, + "step": 985 + }, + { + "epoch": 0.5258666666666667, + "grad_norm": 0.6084254517867603, + "learning_rate": 9.654459045916743e-05, + "loss": 0.8155, + "step": 986 + }, + { + "epoch": 0.5264, + "grad_norm": 0.5195380244556218, + "learning_rate": 9.637189402194476e-05, + "loss": 0.8254, + "step": 987 + }, + { + "epoch": 0.5269333333333334, + "grad_norm": 0.5814706420425718, + "learning_rate": 9.619920841879725e-05, + "loss": 0.8689, + "step": 988 + }, + { + "epoch": 0.5274666666666666, + "grad_norm": 0.563914936612008, + "learning_rate": 9.602653416539031e-05, + "loss": 0.7789, + "step": 989 + }, + { + "epoch": 0.528, + "grad_norm": 0.4711340862213265, + "learning_rate": 9.585387177735547e-05, + "loss": 0.759, + "step": 990 + }, + { + "epoch": 0.5285333333333333, + "grad_norm": 0.59213774648018, + "learning_rate": 9.568122177028884e-05, + "loss": 0.8744, + "step": 991 + }, + { + "epoch": 0.5290666666666667, + "grad_norm": 0.6243405291520489, + "learning_rate": 9.550858465974958e-05, + "loss": 0.8354, + "step": 992 + }, + { + "epoch": 0.5296, + "grad_norm": 0.5992797953079083, + "learning_rate": 9.533596096125825e-05, + "loss": 0.7935, + "step": 993 + }, + { + "epoch": 0.5301333333333333, + "grad_norm": 0.41773210721308607, + "learning_rate": 9.516335119029546e-05, + "loss": 0.779, + "step": 994 + }, + { + "epoch": 0.5306666666666666, + "grad_norm": 0.5192755859477955, + "learning_rate": 9.499075586230013e-05, + "loss": 0.8225, + "step": 995 + }, + { + "epoch": 0.5312, + "grad_norm": 0.5140188987129515, + "learning_rate": 9.481817549266817e-05, + "loss": 0.7594, + "step": 996 + }, + { + "epoch": 0.5317333333333333, + "grad_norm": 0.4609510429287488, + "learning_rate": 9.464561059675073e-05, + "loss": 0.7799, + "step": 997 + }, + { + "epoch": 0.5322666666666667, + "grad_norm": 0.5358185906609553, + "learning_rate": 9.44730616898528e-05, + "loss": 0.862, + "step": 998 + }, + { + "epoch": 0.5328, + "grad_norm": 0.574451280567273, + "learning_rate": 9.430052928723153e-05, + "loss": 0.8422, + "step": 999 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 0.42928921805115855, + "learning_rate": 9.412801390409497e-05, + "loss": 0.7623, + "step": 1000 + }, + { + "epoch": 0.5338666666666667, + "grad_norm": 0.7272438157788303, + "learning_rate": 9.395551605560018e-05, + "loss": 0.9059, + "step": 1001 + }, + { + "epoch": 0.5344, + "grad_norm": 0.5559463653954599, + "learning_rate": 9.378303625685195e-05, + "loss": 0.7792, + "step": 1002 + }, + { + "epoch": 0.5349333333333334, + "grad_norm": 0.4313360808757448, + "learning_rate": 9.361057502290113e-05, + "loss": 0.7717, + "step": 1003 + }, + { + "epoch": 0.5354666666666666, + "grad_norm": 0.47030954524984847, + "learning_rate": 9.343813286874312e-05, + "loss": 0.757, + "step": 1004 + }, + { + "epoch": 0.536, + "grad_norm": 0.442738354497151, + "learning_rate": 9.326571030931637e-05, + "loss": 0.7434, + "step": 1005 + }, + { + "epoch": 0.5365333333333333, + "grad_norm": 0.5667853061205357, + "learning_rate": 9.309330785950086e-05, + "loss": 0.8977, + "step": 1006 + }, + { + "epoch": 0.5370666666666667, + "grad_norm": 0.41132253844073, + "learning_rate": 9.292092603411641e-05, + "loss": 0.6707, + "step": 1007 + }, + { + "epoch": 0.5376, + "grad_norm": 0.7032423662806179, + "learning_rate": 9.274856534792138e-05, + "loss": 0.84, + "step": 1008 + }, + { + "epoch": 0.5381333333333334, + "grad_norm": 0.48968412442626585, + "learning_rate": 9.257622631561085e-05, + "loss": 0.7559, + "step": 1009 + }, + { + "epoch": 0.5386666666666666, + "grad_norm": 0.5422782601866228, + "learning_rate": 9.240390945181543e-05, + "loss": 0.721, + "step": 1010 + }, + { + "epoch": 0.5392, + "grad_norm": 0.6473133424013873, + "learning_rate": 9.223161527109937e-05, + "loss": 0.9128, + "step": 1011 + }, + { + "epoch": 0.5397333333333333, + "grad_norm": 0.5439588271027264, + "learning_rate": 9.205934428795929e-05, + "loss": 0.7603, + "step": 1012 + }, + { + "epoch": 0.5402666666666667, + "grad_norm": 0.4016782313446635, + "learning_rate": 9.188709701682247e-05, + "loss": 0.6788, + "step": 1013 + }, + { + "epoch": 0.5408, + "grad_norm": 0.5068484930555268, + "learning_rate": 9.171487397204539e-05, + "loss": 0.888, + "step": 1014 + }, + { + "epoch": 0.5413333333333333, + "grad_norm": 0.5145274137351125, + "learning_rate": 9.154267566791223e-05, + "loss": 0.7793, + "step": 1015 + }, + { + "epoch": 0.5418666666666667, + "grad_norm": 0.6751853358091369, + "learning_rate": 9.137050261863324e-05, + "loss": 0.8471, + "step": 1016 + }, + { + "epoch": 0.5424, + "grad_norm": 0.5064023549791228, + "learning_rate": 9.119835533834331e-05, + "loss": 0.7535, + "step": 1017 + }, + { + "epoch": 0.5429333333333334, + "grad_norm": 0.5281939437096247, + "learning_rate": 9.102623434110028e-05, + "loss": 0.7673, + "step": 1018 + }, + { + "epoch": 0.5434666666666667, + "grad_norm": 0.4409058470768508, + "learning_rate": 9.085414014088369e-05, + "loss": 0.6555, + "step": 1019 + }, + { + "epoch": 0.544, + "grad_norm": 0.4318211050051625, + "learning_rate": 9.068207325159284e-05, + "loss": 0.7061, + "step": 1020 + }, + { + "epoch": 0.5445333333333333, + "grad_norm": 0.4423890277498593, + "learning_rate": 9.051003418704565e-05, + "loss": 0.7312, + "step": 1021 + }, + { + "epoch": 0.5450666666666667, + "grad_norm": 0.5308839756869687, + "learning_rate": 9.033802346097682e-05, + "loss": 0.8195, + "step": 1022 + }, + { + "epoch": 0.5456, + "grad_norm": 0.5027576313133176, + "learning_rate": 9.016604158703654e-05, + "loss": 0.8071, + "step": 1023 + }, + { + "epoch": 0.5461333333333334, + "grad_norm": 0.4731949786966609, + "learning_rate": 8.999408907878877e-05, + "loss": 0.7497, + "step": 1024 + }, + { + "epoch": 0.5466666666666666, + "grad_norm": 0.5183385982817474, + "learning_rate": 8.982216644970979e-05, + "loss": 0.6428, + "step": 1025 + }, + { + "epoch": 0.5472, + "grad_norm": 0.5327655953051283, + "learning_rate": 8.965027421318665e-05, + "loss": 0.8299, + "step": 1026 + }, + { + "epoch": 0.5477333333333333, + "grad_norm": 0.5252812689417474, + "learning_rate": 8.947841288251568e-05, + "loss": 0.7438, + "step": 1027 + }, + { + "epoch": 0.5482666666666667, + "grad_norm": 0.4854605612544107, + "learning_rate": 8.930658297090091e-05, + "loss": 0.7781, + "step": 1028 + }, + { + "epoch": 0.5488, + "grad_norm": 0.47055365993939163, + "learning_rate": 8.913478499145254e-05, + "loss": 0.6582, + "step": 1029 + }, + { + "epoch": 0.5493333333333333, + "grad_norm": 0.5560444704582403, + "learning_rate": 8.896301945718541e-05, + "loss": 0.8353, + "step": 1030 + }, + { + "epoch": 0.5498666666666666, + "grad_norm": 0.41571277834164605, + "learning_rate": 8.879128688101749e-05, + "loss": 0.6936, + "step": 1031 + }, + { + "epoch": 0.5504, + "grad_norm": 0.4371639418042648, + "learning_rate": 8.861958777576827e-05, + "loss": 0.6978, + "step": 1032 + }, + { + "epoch": 0.5509333333333334, + "grad_norm": 0.6722608795134971, + "learning_rate": 8.844792265415738e-05, + "loss": 0.8802, + "step": 1033 + }, + { + "epoch": 0.5514666666666667, + "grad_norm": 0.4970904267426464, + "learning_rate": 8.827629202880293e-05, + "loss": 0.7537, + "step": 1034 + }, + { + "epoch": 0.552, + "grad_norm": 0.5680397577452019, + "learning_rate": 8.810469641222001e-05, + "loss": 0.7023, + "step": 1035 + }, + { + "epoch": 0.5525333333333333, + "grad_norm": 0.6607651748179223, + "learning_rate": 8.793313631681915e-05, + "loss": 0.8185, + "step": 1036 + }, + { + "epoch": 0.5530666666666667, + "grad_norm": 0.4320010725813437, + "learning_rate": 8.776161225490489e-05, + "loss": 0.7247, + "step": 1037 + }, + { + "epoch": 0.5536, + "grad_norm": 0.4379851357731281, + "learning_rate": 8.759012473867407e-05, + "loss": 0.7357, + "step": 1038 + }, + { + "epoch": 0.5541333333333334, + "grad_norm": 0.5864123812194793, + "learning_rate": 8.741867428021446e-05, + "loss": 0.9109, + "step": 1039 + }, + { + "epoch": 0.5546666666666666, + "grad_norm": 0.42591442989357187, + "learning_rate": 8.724726139150318e-05, + "loss": 0.673, + "step": 1040 + }, + { + "epoch": 0.5552, + "grad_norm": 0.5692366538803478, + "learning_rate": 8.707588658440511e-05, + "loss": 0.8681, + "step": 1041 + }, + { + "epoch": 0.5557333333333333, + "grad_norm": 0.5146718413776341, + "learning_rate": 8.690455037067141e-05, + "loss": 0.7601, + "step": 1042 + }, + { + "epoch": 0.5562666666666667, + "grad_norm": 0.6637949998918076, + "learning_rate": 8.673325326193806e-05, + "loss": 0.8332, + "step": 1043 + }, + { + "epoch": 0.5568, + "grad_norm": 0.542905102982066, + "learning_rate": 8.656199576972423e-05, + "loss": 0.7672, + "step": 1044 + }, + { + "epoch": 0.5573333333333333, + "grad_norm": 0.6532562664979465, + "learning_rate": 8.639077840543077e-05, + "loss": 0.8629, + "step": 1045 + }, + { + "epoch": 0.5578666666666666, + "grad_norm": 0.4526633920079731, + "learning_rate": 8.621960168033867e-05, + "loss": 0.7665, + "step": 1046 + }, + { + "epoch": 0.5584, + "grad_norm": 0.5367334612585443, + "learning_rate": 8.604846610560771e-05, + "loss": 0.7661, + "step": 1047 + }, + { + "epoch": 0.5589333333333333, + "grad_norm": 0.5950768846379804, + "learning_rate": 8.587737219227462e-05, + "loss": 0.7678, + "step": 1048 + }, + { + "epoch": 0.5594666666666667, + "grad_norm": 0.4301047695545847, + "learning_rate": 8.570632045125185e-05, + "loss": 0.6559, + "step": 1049 + }, + { + "epoch": 0.56, + "grad_norm": 0.7484350038467311, + "learning_rate": 8.553531139332582e-05, + "loss": 0.7962, + "step": 1050 + }, + { + "epoch": 0.5605333333333333, + "grad_norm": 0.41575632022589987, + "learning_rate": 8.536434552915556e-05, + "loss": 0.6308, + "step": 1051 + }, + { + "epoch": 0.5610666666666667, + "grad_norm": 0.4393636084782435, + "learning_rate": 8.519342336927105e-05, + "loss": 0.7342, + "step": 1052 + }, + { + "epoch": 0.5616, + "grad_norm": 0.41562609846030607, + "learning_rate": 8.502254542407186e-05, + "loss": 0.726, + "step": 1053 + }, + { + "epoch": 0.5621333333333334, + "grad_norm": 0.4477045164078837, + "learning_rate": 8.485171220382545e-05, + "loss": 0.6865, + "step": 1054 + }, + { + "epoch": 0.5626666666666666, + "grad_norm": 0.486071042296672, + "learning_rate": 8.468092421866573e-05, + "loss": 0.8573, + "step": 1055 + }, + { + "epoch": 0.5632, + "grad_norm": 0.4665545159433152, + "learning_rate": 8.451018197859153e-05, + "loss": 0.7984, + "step": 1056 + }, + { + "epoch": 0.5637333333333333, + "grad_norm": 0.5524757999976719, + "learning_rate": 8.433948599346516e-05, + "loss": 0.8859, + "step": 1057 + }, + { + "epoch": 0.5642666666666667, + "grad_norm": 0.45394390731347345, + "learning_rate": 8.416883677301069e-05, + "loss": 0.8273, + "step": 1058 + }, + { + "epoch": 0.5648, + "grad_norm": 0.8670666883863559, + "learning_rate": 8.399823482681262e-05, + "loss": 0.9476, + "step": 1059 + }, + { + "epoch": 0.5653333333333334, + "grad_norm": 0.45014162704566263, + "learning_rate": 8.382768066431425e-05, + "loss": 0.7477, + "step": 1060 + }, + { + "epoch": 0.5658666666666666, + "grad_norm": 0.47423961273501625, + "learning_rate": 8.36571747948162e-05, + "loss": 0.725, + "step": 1061 + }, + { + "epoch": 0.5664, + "grad_norm": 0.4950365048983719, + "learning_rate": 8.348671772747487e-05, + "loss": 0.7586, + "step": 1062 + }, + { + "epoch": 0.5669333333333333, + "grad_norm": 0.5029439088135188, + "learning_rate": 8.33163099713009e-05, + "loss": 0.7981, + "step": 1063 + }, + { + "epoch": 0.5674666666666667, + "grad_norm": 0.5897608031444519, + "learning_rate": 8.31459520351578e-05, + "loss": 0.9008, + "step": 1064 + }, + { + "epoch": 0.568, + "grad_norm": 0.6525085166965572, + "learning_rate": 8.297564442776014e-05, + "loss": 0.8962, + "step": 1065 + }, + { + "epoch": 0.5685333333333333, + "grad_norm": 0.5765106103125884, + "learning_rate": 8.280538765767235e-05, + "loss": 0.882, + "step": 1066 + }, + { + "epoch": 0.5690666666666667, + "grad_norm": 0.5661404452293268, + "learning_rate": 8.263518223330697e-05, + "loss": 0.8775, + "step": 1067 + }, + { + "epoch": 0.5696, + "grad_norm": 0.4399109717922037, + "learning_rate": 8.246502866292324e-05, + "loss": 0.6821, + "step": 1068 + }, + { + "epoch": 0.5701333333333334, + "grad_norm": 0.515553035736288, + "learning_rate": 8.22949274546255e-05, + "loss": 0.7302, + "step": 1069 + }, + { + "epoch": 0.5706666666666667, + "grad_norm": 0.5546251007029763, + "learning_rate": 8.212487911636184e-05, + "loss": 0.8414, + "step": 1070 + }, + { + "epoch": 0.5712, + "grad_norm": 0.36818646059975035, + "learning_rate": 8.195488415592238e-05, + "loss": 0.6362, + "step": 1071 + }, + { + "epoch": 0.5717333333333333, + "grad_norm": 0.5973642474673283, + "learning_rate": 8.178494308093789e-05, + "loss": 0.8576, + "step": 1072 + }, + { + "epoch": 0.5722666666666667, + "grad_norm": 0.5453914955495656, + "learning_rate": 8.161505639887817e-05, + "loss": 0.8003, + "step": 1073 + }, + { + "epoch": 0.5728, + "grad_norm": 0.4538127419270415, + "learning_rate": 8.144522461705067e-05, + "loss": 0.7232, + "step": 1074 + }, + { + "epoch": 0.5733333333333334, + "grad_norm": 0.5145454434088483, + "learning_rate": 8.127544824259889e-05, + "loss": 0.8297, + "step": 1075 + }, + { + "epoch": 0.5738666666666666, + "grad_norm": 0.457613168791868, + "learning_rate": 8.110572778250085e-05, + "loss": 0.7314, + "step": 1076 + }, + { + "epoch": 0.5744, + "grad_norm": 0.529026255615779, + "learning_rate": 8.093606374356759e-05, + "loss": 0.7671, + "step": 1077 + }, + { + "epoch": 0.5749333333333333, + "grad_norm": 0.5276732011135837, + "learning_rate": 8.076645663244168e-05, + "loss": 0.7988, + "step": 1078 + }, + { + "epoch": 0.5754666666666667, + "grad_norm": 0.5119811476084692, + "learning_rate": 8.059690695559568e-05, + "loss": 0.8729, + "step": 1079 + }, + { + "epoch": 0.576, + "grad_norm": 0.5669022170618133, + "learning_rate": 8.042741521933071e-05, + "loss": 0.838, + "step": 1080 + }, + { + "epoch": 0.5765333333333333, + "grad_norm": 0.47221332942467154, + "learning_rate": 8.025798192977481e-05, + "loss": 0.7096, + "step": 1081 + }, + { + "epoch": 0.5770666666666666, + "grad_norm": 0.4163881284245536, + "learning_rate": 8.008860759288147e-05, + "loss": 0.6801, + "step": 1082 + }, + { + "epoch": 0.5776, + "grad_norm": 0.5010530863802051, + "learning_rate": 7.991929271442817e-05, + "loss": 0.7289, + "step": 1083 + }, + { + "epoch": 0.5781333333333334, + "grad_norm": 0.48392985830710034, + "learning_rate": 7.975003780001485e-05, + "loss": 0.6489, + "step": 1084 + }, + { + "epoch": 0.5786666666666667, + "grad_norm": 0.4814123674902523, + "learning_rate": 7.958084335506239e-05, + "loss": 0.7967, + "step": 1085 + }, + { + "epoch": 0.5792, + "grad_norm": 0.45439283152827153, + "learning_rate": 7.941170988481108e-05, + "loss": 0.7022, + "step": 1086 + }, + { + "epoch": 0.5797333333333333, + "grad_norm": 0.5160712883620804, + "learning_rate": 7.924263789431912e-05, + "loss": 0.7722, + "step": 1087 + }, + { + "epoch": 0.5802666666666667, + "grad_norm": 0.4084949839651646, + "learning_rate": 7.907362788846116e-05, + "loss": 0.7663, + "step": 1088 + }, + { + "epoch": 0.5808, + "grad_norm": 0.48014982117628685, + "learning_rate": 7.89046803719267e-05, + "loss": 0.8165, + "step": 1089 + }, + { + "epoch": 0.5813333333333334, + "grad_norm": 0.4281813822547069, + "learning_rate": 7.873579584921869e-05, + "loss": 0.6895, + "step": 1090 + }, + { + "epoch": 0.5818666666666666, + "grad_norm": 0.5134186078998622, + "learning_rate": 7.856697482465196e-05, + "loss": 0.7335, + "step": 1091 + }, + { + "epoch": 0.5824, + "grad_norm": 0.6060821542053557, + "learning_rate": 7.839821780235168e-05, + "loss": 0.952, + "step": 1092 + }, + { + "epoch": 0.5829333333333333, + "grad_norm": 0.5428771989367646, + "learning_rate": 7.822952528625191e-05, + "loss": 0.7522, + "step": 1093 + }, + { + "epoch": 0.5834666666666667, + "grad_norm": 0.5649056681978917, + "learning_rate": 7.806089778009421e-05, + "loss": 0.785, + "step": 1094 + }, + { + "epoch": 0.584, + "grad_norm": 0.5410584645799422, + "learning_rate": 7.789233578742582e-05, + "loss": 0.7819, + "step": 1095 + }, + { + "epoch": 0.5845333333333333, + "grad_norm": 0.5297779787392666, + "learning_rate": 7.772383981159849e-05, + "loss": 0.7827, + "step": 1096 + }, + { + "epoch": 0.5850666666666666, + "grad_norm": 0.4981974984077256, + "learning_rate": 7.755541035576677e-05, + "loss": 0.7938, + "step": 1097 + }, + { + "epoch": 0.5856, + "grad_norm": 0.43965228584236327, + "learning_rate": 7.738704792288655e-05, + "loss": 0.7716, + "step": 1098 + }, + { + "epoch": 0.5861333333333333, + "grad_norm": 0.46078442248990276, + "learning_rate": 7.721875301571359e-05, + "loss": 0.7952, + "step": 1099 + }, + { + "epoch": 0.5866666666666667, + "grad_norm": 0.47099705722870444, + "learning_rate": 7.705052613680211e-05, + "loss": 0.6965, + "step": 1100 + }, + { + "epoch": 0.5872, + "grad_norm": 0.6014987937081797, + "learning_rate": 7.688236778850306e-05, + "loss": 0.7589, + "step": 1101 + }, + { + "epoch": 0.5877333333333333, + "grad_norm": 0.46239459077607387, + "learning_rate": 7.671427847296275e-05, + "loss": 0.7276, + "step": 1102 + }, + { + "epoch": 0.5882666666666667, + "grad_norm": 0.4697286751474339, + "learning_rate": 7.654625869212146e-05, + "loss": 0.7152, + "step": 1103 + }, + { + "epoch": 0.5888, + "grad_norm": 0.5210272405996849, + "learning_rate": 7.637830894771175e-05, + "loss": 0.8268, + "step": 1104 + }, + { + "epoch": 0.5893333333333334, + "grad_norm": 0.5118511288541373, + "learning_rate": 7.6210429741257e-05, + "loss": 0.7668, + "step": 1105 + }, + { + "epoch": 0.5898666666666667, + "grad_norm": 0.4711383662788997, + "learning_rate": 7.604262157407007e-05, + "loss": 0.8149, + "step": 1106 + }, + { + "epoch": 0.5904, + "grad_norm": 0.46408282702945824, + "learning_rate": 7.587488494725157e-05, + "loss": 0.7562, + "step": 1107 + }, + { + "epoch": 0.5909333333333333, + "grad_norm": 0.5394146489399196, + "learning_rate": 7.570722036168854e-05, + "loss": 0.7958, + "step": 1108 + }, + { + "epoch": 0.5914666666666667, + "grad_norm": 0.6869150004348163, + "learning_rate": 7.55396283180529e-05, + "loss": 0.7718, + "step": 1109 + }, + { + "epoch": 0.592, + "grad_norm": 0.49055454254608344, + "learning_rate": 7.537210931679987e-05, + "loss": 0.758, + "step": 1110 + }, + { + "epoch": 0.5925333333333334, + "grad_norm": 0.4458991499930382, + "learning_rate": 7.520466385816671e-05, + "loss": 0.672, + "step": 1111 + }, + { + "epoch": 0.5930666666666666, + "grad_norm": 0.4347672269064152, + "learning_rate": 7.503729244217086e-05, + "loss": 0.6597, + "step": 1112 + }, + { + "epoch": 0.5936, + "grad_norm": 0.493692750523553, + "learning_rate": 7.48699955686089e-05, + "loss": 0.828, + "step": 1113 + }, + { + "epoch": 0.5941333333333333, + "grad_norm": 0.5427876490200283, + "learning_rate": 7.470277373705461e-05, + "loss": 0.8172, + "step": 1114 + }, + { + "epoch": 0.5946666666666667, + "grad_norm": 0.49025422986820805, + "learning_rate": 7.453562744685778e-05, + "loss": 0.7671, + "step": 1115 + }, + { + "epoch": 0.5952, + "grad_norm": 0.6049657915730587, + "learning_rate": 7.43685571971426e-05, + "loss": 0.6777, + "step": 1116 + }, + { + "epoch": 0.5957333333333333, + "grad_norm": 0.4819646779616934, + "learning_rate": 7.42015634868062e-05, + "loss": 0.7578, + "step": 1117 + }, + { + "epoch": 0.5962666666666666, + "grad_norm": 0.5235721200251294, + "learning_rate": 7.403464681451715e-05, + "loss": 0.8077, + "step": 1118 + }, + { + "epoch": 0.5968, + "grad_norm": 0.6087209952321163, + "learning_rate": 7.386780767871397e-05, + "loss": 0.8524, + "step": 1119 + }, + { + "epoch": 0.5973333333333334, + "grad_norm": 0.4500483929554947, + "learning_rate": 7.370104657760361e-05, + "loss": 0.7262, + "step": 1120 + }, + { + "epoch": 0.5978666666666667, + "grad_norm": 0.45807234060644647, + "learning_rate": 7.353436400916004e-05, + "loss": 0.6825, + "step": 1121 + }, + { + "epoch": 0.5984, + "grad_norm": 0.4129702576367196, + "learning_rate": 7.336776047112276e-05, + "loss": 0.7258, + "step": 1122 + }, + { + "epoch": 0.5989333333333333, + "grad_norm": 0.4403723919576854, + "learning_rate": 7.320123646099519e-05, + "loss": 0.6541, + "step": 1123 + }, + { + "epoch": 0.5994666666666667, + "grad_norm": 0.6009178369133598, + "learning_rate": 7.303479247604332e-05, + "loss": 0.8899, + "step": 1124 + }, + { + "epoch": 0.6, + "grad_norm": 0.5239256047852181, + "learning_rate": 7.286842901329412e-05, + "loss": 0.8046, + "step": 1125 + }, + { + "epoch": 0.6005333333333334, + "grad_norm": 0.46279889651377026, + "learning_rate": 7.270214656953415e-05, + "loss": 0.631, + "step": 1126 + }, + { + "epoch": 0.6010666666666666, + "grad_norm": 0.5501400019892834, + "learning_rate": 7.253594564130804e-05, + "loss": 0.8407, + "step": 1127 + }, + { + "epoch": 0.6016, + "grad_norm": 0.4602923749840518, + "learning_rate": 7.236982672491698e-05, + "loss": 0.7225, + "step": 1128 + }, + { + "epoch": 0.6021333333333333, + "grad_norm": 0.5463141230994777, + "learning_rate": 7.22037903164173e-05, + "loss": 0.7215, + "step": 1129 + }, + { + "epoch": 0.6026666666666667, + "grad_norm": 0.5693472216906276, + "learning_rate": 7.203783691161883e-05, + "loss": 0.8745, + "step": 1130 + }, + { + "epoch": 0.6032, + "grad_norm": 0.41613892010850634, + "learning_rate": 7.187196700608373e-05, + "loss": 0.6888, + "step": 1131 + }, + { + "epoch": 0.6037333333333333, + "grad_norm": 0.577122066529536, + "learning_rate": 7.170618109512465e-05, + "loss": 0.7882, + "step": 1132 + }, + { + "epoch": 0.6042666666666666, + "grad_norm": 0.46998332690172934, + "learning_rate": 7.154047967380354e-05, + "loss": 0.8013, + "step": 1133 + }, + { + "epoch": 0.6048, + "grad_norm": 0.4383546462072333, + "learning_rate": 7.137486323692995e-05, + "loss": 0.6924, + "step": 1134 + }, + { + "epoch": 0.6053333333333333, + "grad_norm": 0.44865404369550144, + "learning_rate": 7.12093322790597e-05, + "loss": 0.7643, + "step": 1135 + }, + { + "epoch": 0.6058666666666667, + "grad_norm": 0.5925515225657153, + "learning_rate": 7.104388729449338e-05, + "loss": 0.9031, + "step": 1136 + }, + { + "epoch": 0.6064, + "grad_norm": 0.4257641971014278, + "learning_rate": 7.087852877727481e-05, + "loss": 0.7118, + "step": 1137 + }, + { + "epoch": 0.6069333333333333, + "grad_norm": 0.5206866535828725, + "learning_rate": 7.071325722118963e-05, + "loss": 0.7415, + "step": 1138 + }, + { + "epoch": 0.6074666666666667, + "grad_norm": 0.5707751370071729, + "learning_rate": 7.054807311976379e-05, + "loss": 0.7437, + "step": 1139 + }, + { + "epoch": 0.608, + "grad_norm": 0.40524442168153607, + "learning_rate": 7.038297696626206e-05, + "loss": 0.6977, + "step": 1140 + }, + { + "epoch": 0.6085333333333334, + "grad_norm": 0.49142407085977796, + "learning_rate": 7.021796925368667e-05, + "loss": 0.8295, + "step": 1141 + }, + { + "epoch": 0.6090666666666666, + "grad_norm": 0.5282617440640158, + "learning_rate": 7.005305047477566e-05, + "loss": 0.7301, + "step": 1142 + }, + { + "epoch": 0.6096, + "grad_norm": 0.5451141058517067, + "learning_rate": 6.988822112200156e-05, + "loss": 0.7744, + "step": 1143 + }, + { + "epoch": 0.6101333333333333, + "grad_norm": 0.40934033588093743, + "learning_rate": 6.972348168756983e-05, + "loss": 0.6475, + "step": 1144 + }, + { + "epoch": 0.6106666666666667, + "grad_norm": 0.48871504711122515, + "learning_rate": 6.955883266341741e-05, + "loss": 0.6969, + "step": 1145 + }, + { + "epoch": 0.6112, + "grad_norm": 0.5172307509140853, + "learning_rate": 6.939427454121128e-05, + "loss": 0.7246, + "step": 1146 + }, + { + "epoch": 0.6117333333333334, + "grad_norm": 0.5100529082891403, + "learning_rate": 6.922980781234699e-05, + "loss": 0.7351, + "step": 1147 + }, + { + "epoch": 0.6122666666666666, + "grad_norm": 0.5249875756447643, + "learning_rate": 6.906543296794714e-05, + "loss": 0.7637, + "step": 1148 + }, + { + "epoch": 0.6128, + "grad_norm": 0.47400722879437185, + "learning_rate": 6.890115049885994e-05, + "loss": 0.6943, + "step": 1149 + }, + { + "epoch": 0.6133333333333333, + "grad_norm": 0.45904504089238257, + "learning_rate": 6.873696089565786e-05, + "loss": 0.7526, + "step": 1150 + }, + { + "epoch": 0.6138666666666667, + "grad_norm": 0.4934751039540469, + "learning_rate": 6.85728646486359e-05, + "loss": 0.727, + "step": 1151 + }, + { + "epoch": 0.6144, + "grad_norm": 0.46594305801411295, + "learning_rate": 6.84088622478104e-05, + "loss": 0.7175, + "step": 1152 + }, + { + "epoch": 0.6149333333333333, + "grad_norm": 0.5510438341683022, + "learning_rate": 6.82449541829174e-05, + "loss": 0.7947, + "step": 1153 + }, + { + "epoch": 0.6154666666666667, + "grad_norm": 0.5352113629485974, + "learning_rate": 6.80811409434113e-05, + "loss": 0.8727, + "step": 1154 + }, + { + "epoch": 0.616, + "grad_norm": 0.48080489649554553, + "learning_rate": 6.791742301846326e-05, + "loss": 0.7542, + "step": 1155 + }, + { + "epoch": 0.6165333333333334, + "grad_norm": 0.5130889735164389, + "learning_rate": 6.775380089695986e-05, + "loss": 0.7984, + "step": 1156 + }, + { + "epoch": 0.6170666666666667, + "grad_norm": 0.5169453233769978, + "learning_rate": 6.759027506750158e-05, + "loss": 0.6668, + "step": 1157 + }, + { + "epoch": 0.6176, + "grad_norm": 0.6683968572616635, + "learning_rate": 6.742684601840141e-05, + "loss": 0.868, + "step": 1158 + }, + { + "epoch": 0.6181333333333333, + "grad_norm": 0.513319017914739, + "learning_rate": 6.726351423768322e-05, + "loss": 0.7778, + "step": 1159 + }, + { + "epoch": 0.6186666666666667, + "grad_norm": 0.50500207550488, + "learning_rate": 6.710028021308061e-05, + "loss": 0.8149, + "step": 1160 + }, + { + "epoch": 0.6192, + "grad_norm": 0.6871309726289309, + "learning_rate": 6.693714443203507e-05, + "loss": 0.8847, + "step": 1161 + }, + { + "epoch": 0.6197333333333334, + "grad_norm": 0.4828761250518926, + "learning_rate": 6.677410738169485e-05, + "loss": 0.8028, + "step": 1162 + }, + { + "epoch": 0.6202666666666666, + "grad_norm": 0.5402832248712572, + "learning_rate": 6.661116954891328e-05, + "loss": 0.7615, + "step": 1163 + }, + { + "epoch": 0.6208, + "grad_norm": 0.428383779948302, + "learning_rate": 6.644833142024751e-05, + "loss": 0.7426, + "step": 1164 + }, + { + "epoch": 0.6213333333333333, + "grad_norm": 0.4661800045858397, + "learning_rate": 6.62855934819569e-05, + "loss": 0.7363, + "step": 1165 + }, + { + "epoch": 0.6218666666666667, + "grad_norm": 0.4757002484645323, + "learning_rate": 6.612295622000162e-05, + "loss": 0.7165, + "step": 1166 + }, + { + "epoch": 0.6224, + "grad_norm": 0.5891581050601145, + "learning_rate": 6.59604201200412e-05, + "loss": 0.7801, + "step": 1167 + }, + { + "epoch": 0.6229333333333333, + "grad_norm": 0.5877017442395175, + "learning_rate": 6.579798566743314e-05, + "loss": 0.8208, + "step": 1168 + }, + { + "epoch": 0.6234666666666666, + "grad_norm": 0.4374020804860988, + "learning_rate": 6.563565334723134e-05, + "loss": 0.7092, + "step": 1169 + }, + { + "epoch": 0.624, + "grad_norm": 0.7680263524757133, + "learning_rate": 6.547342364418481e-05, + "loss": 0.8502, + "step": 1170 + }, + { + "epoch": 0.6245333333333334, + "grad_norm": 1.131632589727481, + "learning_rate": 6.531129704273604e-05, + "loss": 0.8092, + "step": 1171 + }, + { + "epoch": 0.6250666666666667, + "grad_norm": 0.43927601534850325, + "learning_rate": 6.514927402701964e-05, + "loss": 0.8155, + "step": 1172 + }, + { + "epoch": 0.6256, + "grad_norm": 0.6044194834712543, + "learning_rate": 6.498735508086093e-05, + "loss": 0.8709, + "step": 1173 + }, + { + "epoch": 0.6261333333333333, + "grad_norm": 0.6638143282370651, + "learning_rate": 6.48255406877745e-05, + "loss": 0.8182, + "step": 1174 + }, + { + "epoch": 0.6266666666666667, + "grad_norm": 0.41295433463974385, + "learning_rate": 6.466383133096267e-05, + "loss": 0.7358, + "step": 1175 + }, + { + "epoch": 0.6272, + "grad_norm": 0.4185388022872638, + "learning_rate": 6.450222749331414e-05, + "loss": 0.6662, + "step": 1176 + }, + { + "epoch": 0.6277333333333334, + "grad_norm": 0.5425672199599714, + "learning_rate": 6.434072965740242e-05, + "loss": 0.7387, + "step": 1177 + }, + { + "epoch": 0.6282666666666666, + "grad_norm": 0.48854563428955444, + "learning_rate": 6.417933830548467e-05, + "loss": 0.8425, + "step": 1178 + }, + { + "epoch": 0.6288, + "grad_norm": 0.43279089007911115, + "learning_rate": 6.40180539194999e-05, + "loss": 0.644, + "step": 1179 + }, + { + "epoch": 0.6293333333333333, + "grad_norm": 0.4624374296243218, + "learning_rate": 6.385687698106781e-05, + "loss": 0.7802, + "step": 1180 + }, + { + "epoch": 0.6298666666666667, + "grad_norm": 0.4287093400356675, + "learning_rate": 6.369580797148718e-05, + "loss": 0.698, + "step": 1181 + }, + { + "epoch": 0.6304, + "grad_norm": 0.43881856116583695, + "learning_rate": 6.35348473717345e-05, + "loss": 0.677, + "step": 1182 + }, + { + "epoch": 0.6309333333333333, + "grad_norm": 0.4724279515832686, + "learning_rate": 6.337399566246257e-05, + "loss": 0.8122, + "step": 1183 + }, + { + "epoch": 0.6314666666666666, + "grad_norm": 0.4632779979930065, + "learning_rate": 6.321325332399903e-05, + "loss": 0.6816, + "step": 1184 + }, + { + "epoch": 0.632, + "grad_norm": 0.41653471994783514, + "learning_rate": 6.305262083634488e-05, + "loss": 0.6731, + "step": 1185 + }, + { + "epoch": 0.6325333333333333, + "grad_norm": 0.4041036126580989, + "learning_rate": 6.289209867917312e-05, + "loss": 0.6171, + "step": 1186 + }, + { + "epoch": 0.6330666666666667, + "grad_norm": 0.4244669309942605, + "learning_rate": 6.273168733182722e-05, + "loss": 0.6509, + "step": 1187 + }, + { + "epoch": 0.6336, + "grad_norm": 0.4480670699233299, + "learning_rate": 6.25713872733199e-05, + "loss": 0.6782, + "step": 1188 + }, + { + "epoch": 0.6341333333333333, + "grad_norm": 0.5147817666607787, + "learning_rate": 6.241119898233144e-05, + "loss": 0.7843, + "step": 1189 + }, + { + "epoch": 0.6346666666666667, + "grad_norm": 0.6590523887277467, + "learning_rate": 6.225112293720836e-05, + "loss": 0.6622, + "step": 1190 + }, + { + "epoch": 0.6352, + "grad_norm": 0.5906784821557615, + "learning_rate": 6.209115961596208e-05, + "loss": 0.8182, + "step": 1191 + }, + { + "epoch": 0.6357333333333334, + "grad_norm": 0.79123920087716, + "learning_rate": 6.19313094962673e-05, + "loss": 0.7645, + "step": 1192 + }, + { + "epoch": 0.6362666666666666, + "grad_norm": 0.4934105295980472, + "learning_rate": 6.177157305546078e-05, + "loss": 0.6993, + "step": 1193 + }, + { + "epoch": 0.6368, + "grad_norm": 0.5169576112242033, + "learning_rate": 6.161195077053976e-05, + "loss": 0.7903, + "step": 1194 + }, + { + "epoch": 0.6373333333333333, + "grad_norm": 0.5951619801235911, + "learning_rate": 6.145244311816063e-05, + "loss": 0.8015, + "step": 1195 + }, + { + "epoch": 0.6378666666666667, + "grad_norm": 0.6270626637751038, + "learning_rate": 6.129305057463741e-05, + "loss": 0.8664, + "step": 1196 + }, + { + "epoch": 0.6384, + "grad_norm": 0.5411196988348133, + "learning_rate": 6.113377361594049e-05, + "loss": 0.8788, + "step": 1197 + }, + { + "epoch": 0.6389333333333334, + "grad_norm": 0.5145499844752968, + "learning_rate": 6.0974612717695004e-05, + "loss": 0.7333, + "step": 1198 + }, + { + "epoch": 0.6394666666666666, + "grad_norm": 0.45312931845329213, + "learning_rate": 6.0815568355179556e-05, + "loss": 0.7871, + "step": 1199 + }, + { + "epoch": 0.64, + "grad_norm": 0.479732080413242, + "learning_rate": 6.065664100332478e-05, + "loss": 0.7336, + "step": 1200 + }, + { + "epoch": 0.6405333333333333, + "grad_norm": 0.5463357924719869, + "learning_rate": 6.0497831136711836e-05, + "loss": 0.744, + "step": 1201 + }, + { + "epoch": 0.6410666666666667, + "grad_norm": 0.5279290163752073, + "learning_rate": 6.0339139229571116e-05, + "loss": 0.7531, + "step": 1202 + }, + { + "epoch": 0.6416, + "grad_norm": 0.49836728800476254, + "learning_rate": 6.018056575578075e-05, + "loss": 0.7327, + "step": 1203 + }, + { + "epoch": 0.6421333333333333, + "grad_norm": 0.4971802403963867, + "learning_rate": 6.002211118886514e-05, + "loss": 0.7987, + "step": 1204 + }, + { + "epoch": 0.6426666666666667, + "grad_norm": 0.48463827627897155, + "learning_rate": 5.986377600199371e-05, + "loss": 0.7463, + "step": 1205 + }, + { + "epoch": 0.6432, + "grad_norm": 0.5098697199096175, + "learning_rate": 5.970556066797941e-05, + "loss": 0.6947, + "step": 1206 + }, + { + "epoch": 0.6437333333333334, + "grad_norm": 0.5234828393057811, + "learning_rate": 5.9547465659277215e-05, + "loss": 0.7572, + "step": 1207 + }, + { + "epoch": 0.6442666666666667, + "grad_norm": 0.4458409726885109, + "learning_rate": 5.938949144798279e-05, + "loss": 0.7151, + "step": 1208 + }, + { + "epoch": 0.6448, + "grad_norm": 0.46498815645957925, + "learning_rate": 5.923163850583113e-05, + "loss": 0.7341, + "step": 1209 + }, + { + "epoch": 0.6453333333333333, + "grad_norm": 0.48012200293932256, + "learning_rate": 5.907390730419507e-05, + "loss": 0.7592, + "step": 1210 + }, + { + "epoch": 0.6458666666666667, + "grad_norm": 0.5805583747116773, + "learning_rate": 5.8916298314083915e-05, + "loss": 0.8637, + "step": 1211 + }, + { + "epoch": 0.6464, + "grad_norm": 0.4433313879267007, + "learning_rate": 5.875881200614207e-05, + "loss": 0.6725, + "step": 1212 + }, + { + "epoch": 0.6469333333333334, + "grad_norm": 0.5194855571239937, + "learning_rate": 5.860144885064751e-05, + "loss": 0.776, + "step": 1213 + }, + { + "epoch": 0.6474666666666666, + "grad_norm": 0.6018279111933093, + "learning_rate": 5.8444209317510514e-05, + "loss": 0.8028, + "step": 1214 + }, + { + "epoch": 0.648, + "grad_norm": 0.4240158310913475, + "learning_rate": 5.828709387627218e-05, + "loss": 0.6878, + "step": 1215 + }, + { + "epoch": 0.6485333333333333, + "grad_norm": 0.5762406227870703, + "learning_rate": 5.813010299610313e-05, + "loss": 0.8156, + "step": 1216 + }, + { + "epoch": 0.6490666666666667, + "grad_norm": 0.457905008831502, + "learning_rate": 5.797323714580192e-05, + "loss": 0.7923, + "step": 1217 + }, + { + "epoch": 0.6496, + "grad_norm": 0.4932329588950252, + "learning_rate": 5.781649679379378e-05, + "loss": 0.6956, + "step": 1218 + }, + { + "epoch": 0.6501333333333333, + "grad_norm": 0.49378929371510405, + "learning_rate": 5.765988240812921e-05, + "loss": 0.7152, + "step": 1219 + }, + { + "epoch": 0.6506666666666666, + "grad_norm": 0.6409762670531869, + "learning_rate": 5.750339445648252e-05, + "loss": 0.766, + "step": 1220 + }, + { + "epoch": 0.6512, + "grad_norm": 0.5376631985987848, + "learning_rate": 5.73470334061505e-05, + "loss": 0.7525, + "step": 1221 + }, + { + "epoch": 0.6517333333333334, + "grad_norm": 0.5240791531669154, + "learning_rate": 5.7190799724050924e-05, + "loss": 0.8279, + "step": 1222 + }, + { + "epoch": 0.6522666666666667, + "grad_norm": 0.44664839730064054, + "learning_rate": 5.7034693876721376e-05, + "loss": 0.78, + "step": 1223 + }, + { + "epoch": 0.6528, + "grad_norm": 0.4039073575383679, + "learning_rate": 5.687871633031754e-05, + "loss": 0.6781, + "step": 1224 + }, + { + "epoch": 0.6533333333333333, + "grad_norm": 0.450923506585266, + "learning_rate": 5.6722867550612116e-05, + "loss": 0.8058, + "step": 1225 + }, + { + "epoch": 0.6538666666666667, + "grad_norm": 0.48897683906804906, + "learning_rate": 5.6567148002993164e-05, + "loss": 0.7291, + "step": 1226 + }, + { + "epoch": 0.6544, + "grad_norm": 0.4808742667856329, + "learning_rate": 5.6411558152462894e-05, + "loss": 0.7342, + "step": 1227 + }, + { + "epoch": 0.6549333333333334, + "grad_norm": 0.6253985573620412, + "learning_rate": 5.625609846363622e-05, + "loss": 0.8818, + "step": 1228 + }, + { + "epoch": 0.6554666666666666, + "grad_norm": 0.4846287892781986, + "learning_rate": 5.6100769400739383e-05, + "loss": 0.6947, + "step": 1229 + }, + { + "epoch": 0.656, + "grad_norm": 0.478933595678575, + "learning_rate": 5.5945571427608526e-05, + "loss": 0.8098, + "step": 1230 + }, + { + "epoch": 0.6565333333333333, + "grad_norm": 0.4943540669969761, + "learning_rate": 5.579050500768836e-05, + "loss": 0.7402, + "step": 1231 + }, + { + "epoch": 0.6570666666666667, + "grad_norm": 0.5985943815299349, + "learning_rate": 5.5635570604030705e-05, + "loss": 0.7581, + "step": 1232 + }, + { + "epoch": 0.6576, + "grad_norm": 0.5936083742537585, + "learning_rate": 5.54807686792933e-05, + "loss": 0.8027, + "step": 1233 + }, + { + "epoch": 0.6581333333333333, + "grad_norm": 0.4802903816045552, + "learning_rate": 5.53260996957381e-05, + "loss": 0.7532, + "step": 1234 + }, + { + "epoch": 0.6586666666666666, + "grad_norm": 0.47401679138024017, + "learning_rate": 5.5171564115230254e-05, + "loss": 0.7888, + "step": 1235 + }, + { + "epoch": 0.6592, + "grad_norm": 0.5245939430829126, + "learning_rate": 5.501716239923642e-05, + "loss": 0.6565, + "step": 1236 + }, + { + "epoch": 0.6597333333333333, + "grad_norm": 0.6094963923285219, + "learning_rate": 5.486289500882355e-05, + "loss": 0.761, + "step": 1237 + }, + { + "epoch": 0.6602666666666667, + "grad_norm": 0.5166758468287893, + "learning_rate": 5.47087624046575e-05, + "loss": 0.7691, + "step": 1238 + }, + { + "epoch": 0.6608, + "grad_norm": 0.5986893700296746, + "learning_rate": 5.4554765047001613e-05, + "loss": 0.8324, + "step": 1239 + }, + { + "epoch": 0.6613333333333333, + "grad_norm": 0.5608641556504018, + "learning_rate": 5.4400903395715366e-05, + "loss": 0.744, + "step": 1240 + }, + { + "epoch": 0.6618666666666667, + "grad_norm": 0.43588479094760113, + "learning_rate": 5.424717791025302e-05, + "loss": 0.7413, + "step": 1241 + }, + { + "epoch": 0.6624, + "grad_norm": 0.5047332229040048, + "learning_rate": 5.4093589049662175e-05, + "loss": 0.7423, + "step": 1242 + }, + { + "epoch": 0.6629333333333334, + "grad_norm": 1.4880627722155477, + "learning_rate": 5.394013727258254e-05, + "loss": 0.7006, + "step": 1243 + }, + { + "epoch": 0.6634666666666666, + "grad_norm": 0.453024387315063, + "learning_rate": 5.378682303724435e-05, + "loss": 0.6529, + "step": 1244 + }, + { + "epoch": 0.664, + "grad_norm": 0.5195667651892474, + "learning_rate": 5.363364680146725e-05, + "loss": 0.8618, + "step": 1245 + }, + { + "epoch": 0.6645333333333333, + "grad_norm": 0.427840085987605, + "learning_rate": 5.348060902265871e-05, + "loss": 0.6182, + "step": 1246 + }, + { + "epoch": 0.6650666666666667, + "grad_norm": 0.6190438453635412, + "learning_rate": 5.332771015781275e-05, + "loss": 0.8217, + "step": 1247 + }, + { + "epoch": 0.6656, + "grad_norm": 0.49760488405259695, + "learning_rate": 5.31749506635086e-05, + "loss": 0.7374, + "step": 1248 + }, + { + "epoch": 0.6661333333333334, + "grad_norm": 0.4331316786801372, + "learning_rate": 5.302233099590928e-05, + "loss": 0.6793, + "step": 1249 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.5556857259709147, + "learning_rate": 5.286985161076029e-05, + "loss": 0.8043, + "step": 1250 + }, + { + "epoch": 0.6672, + "grad_norm": 0.48880450793228364, + "learning_rate": 5.271751296338823e-05, + "loss": 0.7714, + "step": 1251 + }, + { + "epoch": 0.6677333333333333, + "grad_norm": 0.48399177289629297, + "learning_rate": 5.2565315508699376e-05, + "loss": 0.6613, + "step": 1252 + }, + { + "epoch": 0.6682666666666667, + "grad_norm": 0.5238996308406841, + "learning_rate": 5.2413259701178505e-05, + "loss": 0.6827, + "step": 1253 + }, + { + "epoch": 0.6688, + "grad_norm": 0.6337195832994496, + "learning_rate": 5.226134599488728e-05, + "loss": 0.6894, + "step": 1254 + }, + { + "epoch": 0.6693333333333333, + "grad_norm": 0.6348308055489945, + "learning_rate": 5.210957484346314e-05, + "loss": 0.8044, + "step": 1255 + }, + { + "epoch": 0.6698666666666667, + "grad_norm": 0.4025760674722977, + "learning_rate": 5.195794670011776e-05, + "loss": 0.6456, + "step": 1256 + }, + { + "epoch": 0.6704, + "grad_norm": 0.4452860285112815, + "learning_rate": 5.180646201763577e-05, + "loss": 0.6119, + "step": 1257 + }, + { + "epoch": 0.6709333333333334, + "grad_norm": 0.5334380567489596, + "learning_rate": 5.165512124837344e-05, + "loss": 0.8085, + "step": 1258 + }, + { + "epoch": 0.6714666666666667, + "grad_norm": 0.514311178905084, + "learning_rate": 5.150392484425728e-05, + "loss": 0.6978, + "step": 1259 + }, + { + "epoch": 0.672, + "grad_norm": 0.4399321327958649, + "learning_rate": 5.135287325678271e-05, + "loss": 0.7175, + "step": 1260 + }, + { + "epoch": 0.6725333333333333, + "grad_norm": 0.4772773737004066, + "learning_rate": 5.120196693701267e-05, + "loss": 0.7418, + "step": 1261 + }, + { + "epoch": 0.6730666666666667, + "grad_norm": 0.5594951719696981, + "learning_rate": 5.105120633557634e-05, + "loss": 0.7048, + "step": 1262 + }, + { + "epoch": 0.6736, + "grad_norm": 0.4157946441972806, + "learning_rate": 5.090059190266779e-05, + "loss": 0.6922, + "step": 1263 + }, + { + "epoch": 0.6741333333333334, + "grad_norm": 0.4806133334704124, + "learning_rate": 5.075012408804458e-05, + "loss": 0.7044, + "step": 1264 + }, + { + "epoch": 0.6746666666666666, + "grad_norm": 0.5117520912077905, + "learning_rate": 5.059980334102637e-05, + "loss": 0.7487, + "step": 1265 + }, + { + "epoch": 0.6752, + "grad_norm": 0.569944929975138, + "learning_rate": 5.0449630110493836e-05, + "loss": 0.8501, + "step": 1266 + }, + { + "epoch": 0.6757333333333333, + "grad_norm": 0.5969408820785443, + "learning_rate": 5.0299604844886985e-05, + "loss": 0.8674, + "step": 1267 + }, + { + "epoch": 0.6762666666666667, + "grad_norm": 0.4941985486441663, + "learning_rate": 5.014972799220403e-05, + "loss": 0.82, + "step": 1268 + }, + { + "epoch": 0.6768, + "grad_norm": 0.6502519671746156, + "learning_rate": 5.000000000000002e-05, + "loss": 0.8603, + "step": 1269 + }, + { + "epoch": 0.6773333333333333, + "grad_norm": 0.42783689687814525, + "learning_rate": 4.985042131538545e-05, + "loss": 0.7035, + "step": 1270 + }, + { + "epoch": 0.6778666666666666, + "grad_norm": 0.39953505531503863, + "learning_rate": 4.9700992385024934e-05, + "loss": 0.6791, + "step": 1271 + }, + { + "epoch": 0.6784, + "grad_norm": 0.5454433994457187, + "learning_rate": 4.955171365513603e-05, + "loss": 0.6927, + "step": 1272 + }, + { + "epoch": 0.6789333333333334, + "grad_norm": 0.4846780456609193, + "learning_rate": 4.940258557148765e-05, + "loss": 0.7063, + "step": 1273 + }, + { + "epoch": 0.6794666666666667, + "grad_norm": 0.521882062564805, + "learning_rate": 4.9253608579398855e-05, + "loss": 0.7471, + "step": 1274 + }, + { + "epoch": 0.68, + "grad_norm": 0.5398532028341828, + "learning_rate": 4.9104783123737566e-05, + "loss": 0.7122, + "step": 1275 + }, + { + "epoch": 0.6805333333333333, + "grad_norm": 0.4532324599227224, + "learning_rate": 4.895610964891923e-05, + "loss": 0.7293, + "step": 1276 + }, + { + "epoch": 0.6810666666666667, + "grad_norm": 0.545815898973468, + "learning_rate": 4.880758859890536e-05, + "loss": 0.7505, + "step": 1277 + }, + { + "epoch": 0.6816, + "grad_norm": 0.47865165562544837, + "learning_rate": 4.865922041720239e-05, + "loss": 0.7213, + "step": 1278 + }, + { + "epoch": 0.6821333333333334, + "grad_norm": 0.4891032017620811, + "learning_rate": 4.851100554686021e-05, + "loss": 0.7227, + "step": 1279 + }, + { + "epoch": 0.6826666666666666, + "grad_norm": 0.6903768073965876, + "learning_rate": 4.836294443047088e-05, + "loss": 0.7913, + "step": 1280 + }, + { + "epoch": 0.6832, + "grad_norm": 0.5174446406993433, + "learning_rate": 4.821503751016746e-05, + "loss": 0.6764, + "step": 1281 + }, + { + "epoch": 0.6837333333333333, + "grad_norm": 0.5456937956755602, + "learning_rate": 4.8067285227622404e-05, + "loss": 0.756, + "step": 1282 + }, + { + "epoch": 0.6842666666666667, + "grad_norm": 0.48884842701907966, + "learning_rate": 4.791968802404648e-05, + "loss": 0.7131, + "step": 1283 + }, + { + "epoch": 0.6848, + "grad_norm": 0.46462372035273325, + "learning_rate": 4.777224634018732e-05, + "loss": 0.6943, + "step": 1284 + }, + { + "epoch": 0.6853333333333333, + "grad_norm": 0.37672762008403704, + "learning_rate": 4.762496061632814e-05, + "loss": 0.6181, + "step": 1285 + }, + { + "epoch": 0.6858666666666666, + "grad_norm": 0.5825245132202216, + "learning_rate": 4.747783129228656e-05, + "loss": 0.7794, + "step": 1286 + }, + { + "epoch": 0.6864, + "grad_norm": 0.45910540935082, + "learning_rate": 4.733085880741301e-05, + "loss": 0.7712, + "step": 1287 + }, + { + "epoch": 0.6869333333333333, + "grad_norm": 0.6243536274834849, + "learning_rate": 4.718404360058966e-05, + "loss": 0.8211, + "step": 1288 + }, + { + "epoch": 0.6874666666666667, + "grad_norm": 0.5785678929556791, + "learning_rate": 4.7037386110228985e-05, + "loss": 0.8172, + "step": 1289 + }, + { + "epoch": 0.688, + "grad_norm": 0.5632530690227657, + "learning_rate": 4.689088677427249e-05, + "loss": 0.8021, + "step": 1290 + }, + { + "epoch": 0.6885333333333333, + "grad_norm": 0.5924913339482613, + "learning_rate": 4.6744546030189486e-05, + "loss": 0.7137, + "step": 1291 + }, + { + "epoch": 0.6890666666666667, + "grad_norm": 0.6408171320388871, + "learning_rate": 4.659836431497563e-05, + "loss": 0.8802, + "step": 1292 + }, + { + "epoch": 0.6896, + "grad_norm": 0.43466130623589433, + "learning_rate": 4.645234206515171e-05, + "loss": 0.7327, + "step": 1293 + }, + { + "epoch": 0.6901333333333334, + "grad_norm": 0.41419333541385567, + "learning_rate": 4.630647971676232e-05, + "loss": 0.6299, + "step": 1294 + }, + { + "epoch": 0.6906666666666667, + "grad_norm": 0.6802342126781251, + "learning_rate": 4.6160777705374524e-05, + "loss": 0.8703, + "step": 1295 + }, + { + "epoch": 0.6912, + "grad_norm": 0.516618660644921, + "learning_rate": 4.6015236466076747e-05, + "loss": 0.7347, + "step": 1296 + }, + { + "epoch": 0.6917333333333333, + "grad_norm": 0.5224342097400351, + "learning_rate": 4.586985643347717e-05, + "loss": 0.657, + "step": 1297 + }, + { + "epoch": 0.6922666666666667, + "grad_norm": 0.482786023733216, + "learning_rate": 4.572463804170263e-05, + "loss": 0.6935, + "step": 1298 + }, + { + "epoch": 0.6928, + "grad_norm": 0.5440058879369418, + "learning_rate": 4.5579581724397255e-05, + "loss": 0.6983, + "step": 1299 + }, + { + "epoch": 0.6933333333333334, + "grad_norm": 0.5802944767597759, + "learning_rate": 4.543468791472131e-05, + "loss": 0.7421, + "step": 1300 + }, + { + "epoch": 0.6938666666666666, + "grad_norm": 0.5377633957272775, + "learning_rate": 4.5289957045349653e-05, + "loss": 0.7127, + "step": 1301 + }, + { + "epoch": 0.6944, + "grad_norm": 0.5207231642793138, + "learning_rate": 4.514538954847064e-05, + "loss": 0.816, + "step": 1302 + }, + { + "epoch": 0.6949333333333333, + "grad_norm": 0.46890830943659795, + "learning_rate": 4.5000985855784746e-05, + "loss": 0.7444, + "step": 1303 + }, + { + "epoch": 0.6954666666666667, + "grad_norm": 0.5239841467518617, + "learning_rate": 4.485674639850333e-05, + "loss": 0.7862, + "step": 1304 + }, + { + "epoch": 0.696, + "grad_norm": 0.4558894972380989, + "learning_rate": 4.471267160734731e-05, + "loss": 0.7325, + "step": 1305 + }, + { + "epoch": 0.6965333333333333, + "grad_norm": 0.37375074027943683, + "learning_rate": 4.456876191254582e-05, + "loss": 0.6262, + "step": 1306 + }, + { + "epoch": 0.6970666666666666, + "grad_norm": 0.5494650548069104, + "learning_rate": 4.442501774383515e-05, + "loss": 0.9288, + "step": 1307 + }, + { + "epoch": 0.6976, + "grad_norm": 0.4818770521338313, + "learning_rate": 4.428143953045717e-05, + "loss": 0.6742, + "step": 1308 + }, + { + "epoch": 0.6981333333333334, + "grad_norm": 0.3657880080152438, + "learning_rate": 4.413802770115816e-05, + "loss": 0.5971, + "step": 1309 + }, + { + "epoch": 0.6986666666666667, + "grad_norm": 0.4221079896962407, + "learning_rate": 4.399478268418771e-05, + "loss": 0.7058, + "step": 1310 + }, + { + "epoch": 0.6992, + "grad_norm": 0.4764515830267938, + "learning_rate": 4.385170490729712e-05, + "loss": 0.6907, + "step": 1311 + }, + { + "epoch": 0.6997333333333333, + "grad_norm": 0.4738166520918258, + "learning_rate": 4.3708794797738375e-05, + "loss": 0.8141, + "step": 1312 + }, + { + "epoch": 0.7002666666666667, + "grad_norm": 0.38835481161871727, + "learning_rate": 4.3566052782262735e-05, + "loss": 0.681, + "step": 1313 + }, + { + "epoch": 0.7008, + "grad_norm": 0.4565891315033268, + "learning_rate": 4.342347928711953e-05, + "loss": 0.7752, + "step": 1314 + }, + { + "epoch": 0.7013333333333334, + "grad_norm": 0.4608422462138753, + "learning_rate": 4.328107473805487e-05, + "loss": 0.6388, + "step": 1315 + }, + { + "epoch": 0.7018666666666666, + "grad_norm": 0.6591127677132887, + "learning_rate": 4.3138839560310303e-05, + "loss": 0.8727, + "step": 1316 + }, + { + "epoch": 0.7024, + "grad_norm": 0.4862142948347619, + "learning_rate": 4.2996774178621736e-05, + "loss": 0.6995, + "step": 1317 + }, + { + "epoch": 0.7029333333333333, + "grad_norm": 0.5528915723787529, + "learning_rate": 4.2854879017217894e-05, + "loss": 0.7565, + "step": 1318 + }, + { + "epoch": 0.7034666666666667, + "grad_norm": 0.5284686733306023, + "learning_rate": 4.271315449981934e-05, + "loss": 0.7969, + "step": 1319 + }, + { + "epoch": 0.704, + "grad_norm": 0.6239566996614822, + "learning_rate": 4.257160104963696e-05, + "loss": 0.8646, + "step": 1320 + }, + { + "epoch": 0.7045333333333333, + "grad_norm": 0.503868202525054, + "learning_rate": 4.2430219089370823e-05, + "loss": 0.7229, + "step": 1321 + }, + { + "epoch": 0.7050666666666666, + "grad_norm": 0.47973915900013236, + "learning_rate": 4.228900904120895e-05, + "loss": 0.7528, + "step": 1322 + }, + { + "epoch": 0.7056, + "grad_norm": 0.5042533217056323, + "learning_rate": 4.2147971326825966e-05, + "loss": 0.7988, + "step": 1323 + }, + { + "epoch": 0.7061333333333333, + "grad_norm": 0.5117440316579105, + "learning_rate": 4.200710636738189e-05, + "loss": 0.792, + "step": 1324 + }, + { + "epoch": 0.7066666666666667, + "grad_norm": 0.48090769570543257, + "learning_rate": 4.1866414583520877e-05, + "loss": 0.6092, + "step": 1325 + }, + { + "epoch": 0.7072, + "grad_norm": 0.4546460753128134, + "learning_rate": 4.172589639536991e-05, + "loss": 0.7465, + "step": 1326 + }, + { + "epoch": 0.7077333333333333, + "grad_norm": 0.4778685816741002, + "learning_rate": 4.158555222253771e-05, + "loss": 0.7818, + "step": 1327 + }, + { + "epoch": 0.7082666666666667, + "grad_norm": 0.4172931822775071, + "learning_rate": 4.14453824841132e-05, + "loss": 0.6907, + "step": 1328 + }, + { + "epoch": 0.7088, + "grad_norm": 0.4416266010096713, + "learning_rate": 4.130538759866457e-05, + "loss": 0.6804, + "step": 1329 + }, + { + "epoch": 0.7093333333333334, + "grad_norm": 0.41224064168095553, + "learning_rate": 4.1165567984237764e-05, + "loss": 0.6721, + "step": 1330 + }, + { + "epoch": 0.7098666666666666, + "grad_norm": 0.44723339787007826, + "learning_rate": 4.102592405835536e-05, + "loss": 0.7507, + "step": 1331 + }, + { + "epoch": 0.7104, + "grad_norm": 0.5353240653381068, + "learning_rate": 4.088645623801534e-05, + "loss": 0.7261, + "step": 1332 + }, + { + "epoch": 0.7109333333333333, + "grad_norm": 0.4561756949502303, + "learning_rate": 4.074716493968975e-05, + "loss": 0.6876, + "step": 1333 + }, + { + "epoch": 0.7114666666666667, + "grad_norm": 0.40559531788607533, + "learning_rate": 4.060805057932359e-05, + "loss": 0.6803, + "step": 1334 + }, + { + "epoch": 0.712, + "grad_norm": 0.42581332166379354, + "learning_rate": 4.046911357233343e-05, + "loss": 0.6682, + "step": 1335 + }, + { + "epoch": 0.7125333333333334, + "grad_norm": 0.48054075181260647, + "learning_rate": 4.0330354333606234e-05, + "loss": 0.6991, + "step": 1336 + }, + { + "epoch": 0.7130666666666666, + "grad_norm": 0.49404378864320575, + "learning_rate": 4.019177327749822e-05, + "loss": 0.7297, + "step": 1337 + }, + { + "epoch": 0.7136, + "grad_norm": 0.6032366633974174, + "learning_rate": 4.00533708178334e-05, + "loss": 0.8139, + "step": 1338 + }, + { + "epoch": 0.7141333333333333, + "grad_norm": 0.6478730032015604, + "learning_rate": 3.991514736790258e-05, + "loss": 0.6995, + "step": 1339 + }, + { + "epoch": 0.7146666666666667, + "grad_norm": 0.4753162644667982, + "learning_rate": 3.977710334046193e-05, + "loss": 0.7458, + "step": 1340 + }, + { + "epoch": 0.7152, + "grad_norm": 0.546975265700906, + "learning_rate": 3.963923914773187e-05, + "loss": 0.7909, + "step": 1341 + }, + { + "epoch": 0.7157333333333333, + "grad_norm": 0.38565874944226974, + "learning_rate": 3.950155520139581e-05, + "loss": 0.5954, + "step": 1342 + }, + { + "epoch": 0.7162666666666667, + "grad_norm": 0.5505491228691735, + "learning_rate": 3.936405191259891e-05, + "loss": 0.7357, + "step": 1343 + }, + { + "epoch": 0.7168, + "grad_norm": 0.46402518754307825, + "learning_rate": 3.922672969194686e-05, + "loss": 0.7473, + "step": 1344 + }, + { + "epoch": 0.7173333333333334, + "grad_norm": 0.5596790726024379, + "learning_rate": 3.9089588949504655e-05, + "loss": 0.7905, + "step": 1345 + }, + { + "epoch": 0.7178666666666667, + "grad_norm": 0.45655679397131826, + "learning_rate": 3.895263009479534e-05, + "loss": 0.7756, + "step": 1346 + }, + { + "epoch": 0.7184, + "grad_norm": 0.6093963165564299, + "learning_rate": 3.8815853536798904e-05, + "loss": 0.8257, + "step": 1347 + }, + { + "epoch": 0.7189333333333333, + "grad_norm": 0.4738455718490557, + "learning_rate": 3.867925968395085e-05, + "loss": 0.7141, + "step": 1348 + }, + { + "epoch": 0.7194666666666667, + "grad_norm": 0.5129489762341312, + "learning_rate": 3.854284894414122e-05, + "loss": 0.7099, + "step": 1349 + }, + { + "epoch": 0.72, + "grad_norm": 0.4753950044121046, + "learning_rate": 3.840662172471315e-05, + "loss": 0.7651, + "step": 1350 + }, + { + "epoch": 0.7205333333333334, + "grad_norm": 0.49435225998858384, + "learning_rate": 3.82705784324618e-05, + "loss": 0.6927, + "step": 1351 + }, + { + "epoch": 0.7210666666666666, + "grad_norm": 0.5149207631621087, + "learning_rate": 3.8134719473633094e-05, + "loss": 0.7084, + "step": 1352 + }, + { + "epoch": 0.7216, + "grad_norm": 0.48962380021064045, + "learning_rate": 3.79990452539225e-05, + "loss": 0.6561, + "step": 1353 + }, + { + "epoch": 0.7221333333333333, + "grad_norm": 0.4817734820863134, + "learning_rate": 3.786355617847385e-05, + "loss": 0.6415, + "step": 1354 + }, + { + "epoch": 0.7226666666666667, + "grad_norm": 0.46244492687186617, + "learning_rate": 3.772825265187802e-05, + "loss": 0.6482, + "step": 1355 + }, + { + "epoch": 0.7232, + "grad_norm": 0.4456649711346683, + "learning_rate": 3.759313507817196e-05, + "loss": 0.7377, + "step": 1356 + }, + { + "epoch": 0.7237333333333333, + "grad_norm": 0.47727175429766844, + "learning_rate": 3.7458203860837234e-05, + "loss": 0.7589, + "step": 1357 + }, + { + "epoch": 0.7242666666666666, + "grad_norm": 0.4771095634002514, + "learning_rate": 3.732345940279893e-05, + "loss": 0.7587, + "step": 1358 + }, + { + "epoch": 0.7248, + "grad_norm": 0.5224356015984947, + "learning_rate": 3.7188902106424416e-05, + "loss": 0.7609, + "step": 1359 + }, + { + "epoch": 0.7253333333333334, + "grad_norm": 0.47419551634832074, + "learning_rate": 3.705453237352227e-05, + "loss": 0.6966, + "step": 1360 + }, + { + "epoch": 0.7258666666666667, + "grad_norm": 0.6484886986470106, + "learning_rate": 3.692035060534088e-05, + "loss": 0.8549, + "step": 1361 + }, + { + "epoch": 0.7264, + "grad_norm": 0.40119852570672876, + "learning_rate": 3.678635720256737e-05, + "loss": 0.7171, + "step": 1362 + }, + { + "epoch": 0.7269333333333333, + "grad_norm": 0.5106224372744584, + "learning_rate": 3.665255256532638e-05, + "loss": 0.7613, + "step": 1363 + }, + { + "epoch": 0.7274666666666667, + "grad_norm": 0.5559868341959251, + "learning_rate": 3.651893709317887e-05, + "loss": 0.8186, + "step": 1364 + }, + { + "epoch": 0.728, + "grad_norm": 0.42241843133295354, + "learning_rate": 3.638551118512089e-05, + "loss": 0.6973, + "step": 1365 + }, + { + "epoch": 0.7285333333333334, + "grad_norm": 0.3997332202408528, + "learning_rate": 3.625227523958252e-05, + "loss": 0.7467, + "step": 1366 + }, + { + "epoch": 0.7290666666666666, + "grad_norm": 0.542573865655012, + "learning_rate": 3.611922965442648e-05, + "loss": 0.7384, + "step": 1367 + }, + { + "epoch": 0.7296, + "grad_norm": 0.4158894795983539, + "learning_rate": 3.5986374826947066e-05, + "loss": 0.6843, + "step": 1368 + }, + { + "epoch": 0.7301333333333333, + "grad_norm": 0.5562155006440779, + "learning_rate": 3.5853711153868965e-05, + "loss": 0.8085, + "step": 1369 + }, + { + "epoch": 0.7306666666666667, + "grad_norm": 0.44157287828678954, + "learning_rate": 3.5721239031346066e-05, + "loss": 0.6756, + "step": 1370 + }, + { + "epoch": 0.7312, + "grad_norm": 0.5523897884351046, + "learning_rate": 3.558895885496023e-05, + "loss": 0.7646, + "step": 1371 + }, + { + "epoch": 0.7317333333333333, + "grad_norm": 0.40132248104520923, + "learning_rate": 3.545687101972013e-05, + "loss": 0.6557, + "step": 1372 + }, + { + "epoch": 0.7322666666666666, + "grad_norm": 0.45195078908680764, + "learning_rate": 3.53249759200601e-05, + "loss": 0.6369, + "step": 1373 + }, + { + "epoch": 0.7328, + "grad_norm": 0.4884570689666154, + "learning_rate": 3.519327394983888e-05, + "loss": 0.8075, + "step": 1374 + }, + { + "epoch": 0.7333333333333333, + "grad_norm": 0.4884060026975244, + "learning_rate": 3.506176550233863e-05, + "loss": 0.7314, + "step": 1375 + }, + { + "epoch": 0.7338666666666667, + "grad_norm": 0.48232054442174005, + "learning_rate": 3.4930450970263485e-05, + "loss": 0.6823, + "step": 1376 + }, + { + "epoch": 0.7344, + "grad_norm": 0.4782472833326341, + "learning_rate": 3.479933074573858e-05, + "loss": 0.6893, + "step": 1377 + }, + { + "epoch": 0.7349333333333333, + "grad_norm": 0.587509602421774, + "learning_rate": 3.46684052203088e-05, + "loss": 0.8276, + "step": 1378 + }, + { + "epoch": 0.7354666666666667, + "grad_norm": 0.46554498264893784, + "learning_rate": 3.4537674784937614e-05, + "loss": 0.7039, + "step": 1379 + }, + { + "epoch": 0.736, + "grad_norm": 0.5771668265979287, + "learning_rate": 3.440713983000601e-05, + "loss": 0.6921, + "step": 1380 + }, + { + "epoch": 0.7365333333333334, + "grad_norm": 0.43425107121264933, + "learning_rate": 3.427680074531113e-05, + "loss": 0.6942, + "step": 1381 + }, + { + "epoch": 0.7370666666666666, + "grad_norm": 0.5690585351463997, + "learning_rate": 3.4146657920065285e-05, + "loss": 0.7757, + "step": 1382 + }, + { + "epoch": 0.7376, + "grad_norm": 0.5866616526886929, + "learning_rate": 3.401671174289469e-05, + "loss": 0.7548, + "step": 1383 + }, + { + "epoch": 0.7381333333333333, + "grad_norm": 0.3859883378448622, + "learning_rate": 3.388696260183832e-05, + "loss": 0.6484, + "step": 1384 + }, + { + "epoch": 0.7386666666666667, + "grad_norm": 0.5445627210962285, + "learning_rate": 3.3757410884346894e-05, + "loss": 0.843, + "step": 1385 + }, + { + "epoch": 0.7392, + "grad_norm": 0.5125557887469259, + "learning_rate": 3.362805697728145e-05, + "loss": 0.7498, + "step": 1386 + }, + { + "epoch": 0.7397333333333334, + "grad_norm": 0.5395159672227288, + "learning_rate": 3.3498901266912396e-05, + "loss": 0.7333, + "step": 1387 + }, + { + "epoch": 0.7402666666666666, + "grad_norm": 0.5829779547713551, + "learning_rate": 3.336994413891828e-05, + "loss": 0.733, + "step": 1388 + }, + { + "epoch": 0.7408, + "grad_norm": 0.44359651415082885, + "learning_rate": 3.324118597838464e-05, + "loss": 0.7156, + "step": 1389 + }, + { + "epoch": 0.7413333333333333, + "grad_norm": 0.4547494731971776, + "learning_rate": 3.3112627169802946e-05, + "loss": 0.684, + "step": 1390 + }, + { + "epoch": 0.7418666666666667, + "grad_norm": 0.5131654521142365, + "learning_rate": 3.298426809706928e-05, + "loss": 0.7467, + "step": 1391 + }, + { + "epoch": 0.7424, + "grad_norm": 0.7262209422589005, + "learning_rate": 3.285610914348332e-05, + "loss": 0.8295, + "step": 1392 + }, + { + "epoch": 0.7429333333333333, + "grad_norm": 0.4936651159819274, + "learning_rate": 3.2728150691747115e-05, + "loss": 0.6335, + "step": 1393 + }, + { + "epoch": 0.7434666666666667, + "grad_norm": 0.5554414008987123, + "learning_rate": 3.2600393123964113e-05, + "loss": 0.8386, + "step": 1394 + }, + { + "epoch": 0.744, + "grad_norm": 0.4408668117277434, + "learning_rate": 3.2472836821637744e-05, + "loss": 0.6766, + "step": 1395 + }, + { + "epoch": 0.7445333333333334, + "grad_norm": 0.6336439233367365, + "learning_rate": 3.234548216567049e-05, + "loss": 0.8759, + "step": 1396 + }, + { + "epoch": 0.7450666666666667, + "grad_norm": 0.539206563480282, + "learning_rate": 3.2218329536362704e-05, + "loss": 0.7304, + "step": 1397 + }, + { + "epoch": 0.7456, + "grad_norm": 0.5727836618844765, + "learning_rate": 3.209137931341143e-05, + "loss": 0.7783, + "step": 1398 + }, + { + "epoch": 0.7461333333333333, + "grad_norm": 0.43515016127065365, + "learning_rate": 3.196463187590929e-05, + "loss": 0.7522, + "step": 1399 + }, + { + "epoch": 0.7466666666666667, + "grad_norm": 0.48313578334329116, + "learning_rate": 3.1838087602343344e-05, + "loss": 0.7079, + "step": 1400 + }, + { + "epoch": 0.7472, + "grad_norm": 0.4296980252575872, + "learning_rate": 3.1711746870594086e-05, + "loss": 0.7411, + "step": 1401 + }, + { + "epoch": 0.7477333333333334, + "grad_norm": 0.5549622257031763, + "learning_rate": 3.158561005793402e-05, + "loss": 0.7094, + "step": 1402 + }, + { + "epoch": 0.7482666666666666, + "grad_norm": 0.4829372579543755, + "learning_rate": 3.145967754102691e-05, + "loss": 0.812, + "step": 1403 + }, + { + "epoch": 0.7488, + "grad_norm": 0.5241206593297438, + "learning_rate": 3.1333949695926324e-05, + "loss": 0.7042, + "step": 1404 + }, + { + "epoch": 0.7493333333333333, + "grad_norm": 0.5021517259541153, + "learning_rate": 3.120842689807468e-05, + "loss": 0.7994, + "step": 1405 + }, + { + "epoch": 0.7498666666666667, + "grad_norm": 0.45570593394802356, + "learning_rate": 3.108310952230212e-05, + "loss": 0.7288, + "step": 1406 + }, + { + "epoch": 0.7504, + "grad_norm": 0.4469788796959992, + "learning_rate": 3.0957997942825336e-05, + "loss": 0.7239, + "step": 1407 + }, + { + "epoch": 0.7509333333333333, + "grad_norm": 0.6211504895337312, + "learning_rate": 3.083309253324651e-05, + "loss": 0.8837, + "step": 1408 + }, + { + "epoch": 0.7514666666666666, + "grad_norm": 0.5173983081339508, + "learning_rate": 3.070839366655215e-05, + "loss": 0.7577, + "step": 1409 + }, + { + "epoch": 0.752, + "grad_norm": 0.6005497546462576, + "learning_rate": 3.058390171511196e-05, + "loss": 0.8438, + "step": 1410 + }, + { + "epoch": 0.7525333333333334, + "grad_norm": 0.6769735434298438, + "learning_rate": 3.0459617050677868e-05, + "loss": 0.7799, + "step": 1411 + }, + { + "epoch": 0.7530666666666667, + "grad_norm": 0.49118468134244414, + "learning_rate": 3.0335540044382694e-05, + "loss": 0.6405, + "step": 1412 + }, + { + "epoch": 0.7536, + "grad_norm": 0.7473252064539618, + "learning_rate": 3.021167106673928e-05, + "loss": 0.8578, + "step": 1413 + }, + { + "epoch": 0.7541333333333333, + "grad_norm": 0.5319378972264717, + "learning_rate": 3.008801048763914e-05, + "loss": 0.6932, + "step": 1414 + }, + { + "epoch": 0.7546666666666667, + "grad_norm": 0.4476301854869852, + "learning_rate": 2.996455867635155e-05, + "loss": 0.697, + "step": 1415 + }, + { + "epoch": 0.7552, + "grad_norm": 0.4885567824260128, + "learning_rate": 2.9841316001522347e-05, + "loss": 0.7583, + "step": 1416 + }, + { + "epoch": 0.7557333333333334, + "grad_norm": 0.5086670274026673, + "learning_rate": 2.9718282831172883e-05, + "loss": 0.7399, + "step": 1417 + }, + { + "epoch": 0.7562666666666666, + "grad_norm": 0.6873643208618484, + "learning_rate": 2.9595459532698854e-05, + "loss": 0.6774, + "step": 1418 + }, + { + "epoch": 0.7568, + "grad_norm": 0.5727613478332071, + "learning_rate": 2.9472846472869298e-05, + "loss": 0.835, + "step": 1419 + }, + { + "epoch": 0.7573333333333333, + "grad_norm": 0.4545663499750636, + "learning_rate": 2.9350444017825385e-05, + "loss": 0.633, + "step": 1420 + }, + { + "epoch": 0.7578666666666667, + "grad_norm": 0.5094745047780689, + "learning_rate": 2.922825253307947e-05, + "loss": 0.7591, + "step": 1421 + }, + { + "epoch": 0.7584, + "grad_norm": 0.5811358589874416, + "learning_rate": 2.9106272383513835e-05, + "loss": 0.7293, + "step": 1422 + }, + { + "epoch": 0.7589333333333333, + "grad_norm": 0.5202623326748433, + "learning_rate": 2.898450393337977e-05, + "loss": 0.7304, + "step": 1423 + }, + { + "epoch": 0.7594666666666666, + "grad_norm": 0.453238071655243, + "learning_rate": 2.8862947546296315e-05, + "loss": 0.7493, + "step": 1424 + }, + { + "epoch": 0.76, + "grad_norm": 0.47978793526594116, + "learning_rate": 2.874160358524931e-05, + "loss": 0.6386, + "step": 1425 + }, + { + "epoch": 0.7605333333333333, + "grad_norm": 0.5119181666210523, + "learning_rate": 2.8620472412590228e-05, + "loss": 0.7606, + "step": 1426 + }, + { + "epoch": 0.7610666666666667, + "grad_norm": 0.5439472122515798, + "learning_rate": 2.8499554390035143e-05, + "loss": 0.7197, + "step": 1427 + }, + { + "epoch": 0.7616, + "grad_norm": 0.5127205371166901, + "learning_rate": 2.8378849878663628e-05, + "loss": 0.7982, + "step": 1428 + }, + { + "epoch": 0.7621333333333333, + "grad_norm": 0.4721633286931549, + "learning_rate": 2.8258359238917665e-05, + "loss": 0.7265, + "step": 1429 + }, + { + "epoch": 0.7626666666666667, + "grad_norm": 0.48143401096482086, + "learning_rate": 2.8138082830600554e-05, + "loss": 0.696, + "step": 1430 + }, + { + "epoch": 0.7632, + "grad_norm": 0.4833593941086222, + "learning_rate": 2.8018021012875994e-05, + "loss": 0.6882, + "step": 1431 + }, + { + "epoch": 0.7637333333333334, + "grad_norm": 0.535180650617051, + "learning_rate": 2.7898174144266732e-05, + "loss": 0.7315, + "step": 1432 + }, + { + "epoch": 0.7642666666666666, + "grad_norm": 0.6944748078696262, + "learning_rate": 2.7778542582653744e-05, + "loss": 0.6996, + "step": 1433 + }, + { + "epoch": 0.7648, + "grad_norm": 0.5037578131433572, + "learning_rate": 2.7659126685275027e-05, + "loss": 0.7503, + "step": 1434 + }, + { + "epoch": 0.7653333333333333, + "grad_norm": 0.4495767237600222, + "learning_rate": 2.753992680872457e-05, + "loss": 0.7238, + "step": 1435 + }, + { + "epoch": 0.7658666666666667, + "grad_norm": 0.4483524790086883, + "learning_rate": 2.7420943308951284e-05, + "loss": 0.6971, + "step": 1436 + }, + { + "epoch": 0.7664, + "grad_norm": 0.5180856413717794, + "learning_rate": 2.7302176541257986e-05, + "loss": 0.7512, + "step": 1437 + }, + { + "epoch": 0.7669333333333334, + "grad_norm": 0.45358769303728175, + "learning_rate": 2.7183626860300247e-05, + "loss": 0.7662, + "step": 1438 + }, + { + "epoch": 0.7674666666666666, + "grad_norm": 0.6204504192047962, + "learning_rate": 2.7065294620085424e-05, + "loss": 0.8175, + "step": 1439 + }, + { + "epoch": 0.768, + "grad_norm": 0.582202298948077, + "learning_rate": 2.6947180173971508e-05, + "loss": 0.8074, + "step": 1440 + }, + { + "epoch": 0.7685333333333333, + "grad_norm": 0.5809861698330605, + "learning_rate": 2.6829283874666233e-05, + "loss": 0.6598, + "step": 1441 + }, + { + "epoch": 0.7690666666666667, + "grad_norm": 0.49271726527820786, + "learning_rate": 2.6711606074225782e-05, + "loss": 0.7314, + "step": 1442 + }, + { + "epoch": 0.7696, + "grad_norm": 0.5363551881959038, + "learning_rate": 2.659414712405398e-05, + "loss": 0.7172, + "step": 1443 + }, + { + "epoch": 0.7701333333333333, + "grad_norm": 0.4926777999825674, + "learning_rate": 2.647690737490106e-05, + "loss": 0.7787, + "step": 1444 + }, + { + "epoch": 0.7706666666666667, + "grad_norm": 0.46908259578162226, + "learning_rate": 2.6359887176862718e-05, + "loss": 0.7163, + "step": 1445 + }, + { + "epoch": 0.7712, + "grad_norm": 0.5397282744472077, + "learning_rate": 2.6243086879379e-05, + "loss": 0.814, + "step": 1446 + }, + { + "epoch": 0.7717333333333334, + "grad_norm": 0.4697132224477373, + "learning_rate": 2.6126506831233344e-05, + "loss": 0.5815, + "step": 1447 + }, + { + "epoch": 0.7722666666666667, + "grad_norm": 0.42579868398354936, + "learning_rate": 2.6010147380551475e-05, + "loss": 0.6658, + "step": 1448 + }, + { + "epoch": 0.7728, + "grad_norm": 0.4569791807916529, + "learning_rate": 2.5894008874800325e-05, + "loss": 0.7226, + "step": 1449 + }, + { + "epoch": 0.7733333333333333, + "grad_norm": 0.39669229549545293, + "learning_rate": 2.577809166078716e-05, + "loss": 0.7225, + "step": 1450 + }, + { + "epoch": 0.7738666666666667, + "grad_norm": 0.5297173041860715, + "learning_rate": 2.566239608465838e-05, + "loss": 0.7861, + "step": 1451 + }, + { + "epoch": 0.7744, + "grad_norm": 0.4864412981501662, + "learning_rate": 2.5546922491898495e-05, + "loss": 0.7326, + "step": 1452 + }, + { + "epoch": 0.7749333333333334, + "grad_norm": 0.5589885314689083, + "learning_rate": 2.543167122732918e-05, + "loss": 0.7526, + "step": 1453 + }, + { + "epoch": 0.7754666666666666, + "grad_norm": 0.4867758716241291, + "learning_rate": 2.5316642635108244e-05, + "loss": 0.7805, + "step": 1454 + }, + { + "epoch": 0.776, + "grad_norm": 0.5318053199685544, + "learning_rate": 2.5201837058728505e-05, + "loss": 0.7644, + "step": 1455 + }, + { + "epoch": 0.7765333333333333, + "grad_norm": 0.5354608205298255, + "learning_rate": 2.508725484101684e-05, + "loss": 0.682, + "step": 1456 + }, + { + "epoch": 0.7770666666666667, + "grad_norm": 0.38066758056826167, + "learning_rate": 2.4972896324133144e-05, + "loss": 0.6337, + "step": 1457 + }, + { + "epoch": 0.7776, + "grad_norm": 0.49654962314840656, + "learning_rate": 2.485876184956928e-05, + "loss": 0.738, + "step": 1458 + }, + { + "epoch": 0.7781333333333333, + "grad_norm": 0.4173555980908285, + "learning_rate": 2.4744851758148156e-05, + "loss": 0.6503, + "step": 1459 + }, + { + "epoch": 0.7786666666666666, + "grad_norm": 0.4833517675096117, + "learning_rate": 2.4631166390022574e-05, + "loss": 0.7633, + "step": 1460 + }, + { + "epoch": 0.7792, + "grad_norm": 0.4457017967605104, + "learning_rate": 2.451770608467432e-05, + "loss": 0.7198, + "step": 1461 + }, + { + "epoch": 0.7797333333333333, + "grad_norm": 0.48172126652085867, + "learning_rate": 2.4404471180913058e-05, + "loss": 0.7486, + "step": 1462 + }, + { + "epoch": 0.7802666666666667, + "grad_norm": 0.4691379373475714, + "learning_rate": 2.429146201687538e-05, + "loss": 0.7917, + "step": 1463 + }, + { + "epoch": 0.7808, + "grad_norm": 0.4366557171419435, + "learning_rate": 2.417867893002387e-05, + "loss": 0.7614, + "step": 1464 + }, + { + "epoch": 0.7813333333333333, + "grad_norm": 0.6389432991012376, + "learning_rate": 2.4066122257145894e-05, + "loss": 0.7562, + "step": 1465 + }, + { + "epoch": 0.7818666666666667, + "grad_norm": 0.45218296818007325, + "learning_rate": 2.3953792334352787e-05, + "loss": 0.6676, + "step": 1466 + }, + { + "epoch": 0.7824, + "grad_norm": 0.5127013510044014, + "learning_rate": 2.3841689497078746e-05, + "loss": 0.6803, + "step": 1467 + }, + { + "epoch": 0.7829333333333334, + "grad_norm": 0.550397023114835, + "learning_rate": 2.3729814080079816e-05, + "loss": 0.722, + "step": 1468 + }, + { + "epoch": 0.7834666666666666, + "grad_norm": 0.588260946539843, + "learning_rate": 2.361816641743303e-05, + "loss": 0.8028, + "step": 1469 + }, + { + "epoch": 0.784, + "grad_norm": 0.4293044340924837, + "learning_rate": 2.3506746842535242e-05, + "loss": 0.675, + "step": 1470 + }, + { + "epoch": 0.7845333333333333, + "grad_norm": 0.43436506152113274, + "learning_rate": 2.339555568810221e-05, + "loss": 0.704, + "step": 1471 + }, + { + "epoch": 0.7850666666666667, + "grad_norm": 0.4693231671304673, + "learning_rate": 2.328459328616759e-05, + "loss": 0.7876, + "step": 1472 + }, + { + "epoch": 0.7856, + "grad_norm": 0.4534259390563445, + "learning_rate": 2.3173859968081944e-05, + "loss": 0.7488, + "step": 1473 + }, + { + "epoch": 0.7861333333333334, + "grad_norm": 0.43524954894664175, + "learning_rate": 2.306335606451181e-05, + "loss": 0.6833, + "step": 1474 + }, + { + "epoch": 0.7866666666666666, + "grad_norm": 0.5173068347282955, + "learning_rate": 2.295308190543859e-05, + "loss": 0.6967, + "step": 1475 + }, + { + "epoch": 0.7872, + "grad_norm": 0.600135698112275, + "learning_rate": 2.2843037820157675e-05, + "loss": 0.7668, + "step": 1476 + }, + { + "epoch": 0.7877333333333333, + "grad_norm": 0.5785766990626001, + "learning_rate": 2.2733224137277366e-05, + "loss": 0.9031, + "step": 1477 + }, + { + "epoch": 0.7882666666666667, + "grad_norm": 0.4850535026822109, + "learning_rate": 2.262364118471805e-05, + "loss": 0.6891, + "step": 1478 + }, + { + "epoch": 0.7888, + "grad_norm": 0.9696176740359524, + "learning_rate": 2.251428928971102e-05, + "loss": 0.8852, + "step": 1479 + }, + { + "epoch": 0.7893333333333333, + "grad_norm": 0.6009802936351409, + "learning_rate": 2.2405168778797646e-05, + "loss": 0.807, + "step": 1480 + }, + { + "epoch": 0.7898666666666667, + "grad_norm": 0.4672352030261569, + "learning_rate": 2.2296279977828337e-05, + "loss": 0.7554, + "step": 1481 + }, + { + "epoch": 0.7904, + "grad_norm": 0.4218704627178395, + "learning_rate": 2.2187623211961562e-05, + "loss": 0.72, + "step": 1482 + }, + { + "epoch": 0.7909333333333334, + "grad_norm": 0.5059599658543962, + "learning_rate": 2.2079198805662914e-05, + "loss": 0.6995, + "step": 1483 + }, + { + "epoch": 0.7914666666666667, + "grad_norm": 0.5656753835448954, + "learning_rate": 2.1971007082704164e-05, + "loss": 0.7585, + "step": 1484 + }, + { + "epoch": 0.792, + "grad_norm": 0.4774116641105069, + "learning_rate": 2.1863048366162208e-05, + "loss": 0.6692, + "step": 1485 + }, + { + "epoch": 0.7925333333333333, + "grad_norm": 0.5271022801821177, + "learning_rate": 2.1755322978418137e-05, + "loss": 0.8116, + "step": 1486 + }, + { + "epoch": 0.7930666666666667, + "grad_norm": 0.5313883619232115, + "learning_rate": 2.1647831241156302e-05, + "loss": 0.8125, + "step": 1487 + }, + { + "epoch": 0.7936, + "grad_norm": 0.5071211777372121, + "learning_rate": 2.1540573475363402e-05, + "loss": 0.6735, + "step": 1488 + }, + { + "epoch": 0.7941333333333334, + "grad_norm": 0.4852707362890887, + "learning_rate": 2.1433550001327373e-05, + "loss": 0.5942, + "step": 1489 + }, + { + "epoch": 0.7946666666666666, + "grad_norm": 0.5550116153852038, + "learning_rate": 2.1326761138636553e-05, + "loss": 0.7453, + "step": 1490 + }, + { + "epoch": 0.7952, + "grad_norm": 0.43543146198658933, + "learning_rate": 2.1220207206178688e-05, + "loss": 0.6987, + "step": 1491 + }, + { + "epoch": 0.7957333333333333, + "grad_norm": 0.42424194634554874, + "learning_rate": 2.111388852214001e-05, + "loss": 0.677, + "step": 1492 + }, + { + "epoch": 0.7962666666666667, + "grad_norm": 0.6108351206781172, + "learning_rate": 2.1007805404004242e-05, + "loss": 0.7572, + "step": 1493 + }, + { + "epoch": 0.7968, + "grad_norm": 0.5559873692980049, + "learning_rate": 2.0901958168551638e-05, + "loss": 0.7539, + "step": 1494 + }, + { + "epoch": 0.7973333333333333, + "grad_norm": 0.3937613220805676, + "learning_rate": 2.0796347131858186e-05, + "loss": 0.5808, + "step": 1495 + }, + { + "epoch": 0.7978666666666666, + "grad_norm": 0.5023593030301023, + "learning_rate": 2.069097260929439e-05, + "loss": 0.709, + "step": 1496 + }, + { + "epoch": 0.7984, + "grad_norm": 0.45169725609074757, + "learning_rate": 2.058583491552465e-05, + "loss": 0.7311, + "step": 1497 + }, + { + "epoch": 0.7989333333333334, + "grad_norm": 0.5282539621355999, + "learning_rate": 2.048093436450603e-05, + "loss": 0.729, + "step": 1498 + }, + { + "epoch": 0.7994666666666667, + "grad_norm": 0.6313553350650196, + "learning_rate": 2.0376271269487514e-05, + "loss": 0.8341, + "step": 1499 + }, + { + "epoch": 0.8, + "grad_norm": 0.5219722982471708, + "learning_rate": 2.027184594300898e-05, + "loss": 0.8209, + "step": 1500 + }, + { + "epoch": 0.8005333333333333, + "grad_norm": 0.5833231239077713, + "learning_rate": 2.0167658696900317e-05, + "loss": 0.8449, + "step": 1501 + }, + { + "epoch": 0.8010666666666667, + "grad_norm": 0.43843108756571597, + "learning_rate": 2.0063709842280432e-05, + "loss": 0.6107, + "step": 1502 + }, + { + "epoch": 0.8016, + "grad_norm": 0.49291061450379847, + "learning_rate": 1.995999968955641e-05, + "loss": 0.7863, + "step": 1503 + }, + { + "epoch": 0.8021333333333334, + "grad_norm": 0.5327844658716501, + "learning_rate": 1.985652854842247e-05, + "loss": 0.7032, + "step": 1504 + }, + { + "epoch": 0.8026666666666666, + "grad_norm": 0.5629117337994617, + "learning_rate": 1.9753296727859195e-05, + "loss": 0.8057, + "step": 1505 + }, + { + "epoch": 0.8032, + "grad_norm": 0.43057064449013455, + "learning_rate": 1.9650304536132426e-05, + "loss": 0.6467, + "step": 1506 + }, + { + "epoch": 0.8037333333333333, + "grad_norm": 0.40144706648058803, + "learning_rate": 1.9547552280792524e-05, + "loss": 0.6154, + "step": 1507 + }, + { + "epoch": 0.8042666666666667, + "grad_norm": 0.41224528576852115, + "learning_rate": 1.9445040268673298e-05, + "loss": 0.7186, + "step": 1508 + }, + { + "epoch": 0.8048, + "grad_norm": 0.5493275235326429, + "learning_rate": 1.9342768805891178e-05, + "loss": 0.7174, + "step": 1509 + }, + { + "epoch": 0.8053333333333333, + "grad_norm": 0.4774079575940184, + "learning_rate": 1.9240738197844278e-05, + "loss": 0.6967, + "step": 1510 + }, + { + "epoch": 0.8058666666666666, + "grad_norm": 0.48463906773266147, + "learning_rate": 1.9138948749211472e-05, + "loss": 0.7209, + "step": 1511 + }, + { + "epoch": 0.8064, + "grad_norm": 0.5163153518005792, + "learning_rate": 1.903740076395151e-05, + "loss": 0.8135, + "step": 1512 + }, + { + "epoch": 0.8069333333333333, + "grad_norm": 0.5456312112526748, + "learning_rate": 1.8936094545302095e-05, + "loss": 0.748, + "step": 1513 + }, + { + "epoch": 0.8074666666666667, + "grad_norm": 0.4649146794818222, + "learning_rate": 1.883503039577894e-05, + "loss": 0.7371, + "step": 1514 + }, + { + "epoch": 0.808, + "grad_norm": 0.5167663833581301, + "learning_rate": 1.8734208617174988e-05, + "loss": 0.7598, + "step": 1515 + }, + { + "epoch": 0.8085333333333333, + "grad_norm": 0.39965746354745346, + "learning_rate": 1.8633629510559314e-05, + "loss": 0.6812, + "step": 1516 + }, + { + "epoch": 0.8090666666666667, + "grad_norm": 0.5372340369217614, + "learning_rate": 1.8533293376276472e-05, + "loss": 0.7217, + "step": 1517 + }, + { + "epoch": 0.8096, + "grad_norm": 0.4367158355141629, + "learning_rate": 1.8433200513945337e-05, + "loss": 0.7026, + "step": 1518 + }, + { + "epoch": 0.8101333333333334, + "grad_norm": 0.5245846395952943, + "learning_rate": 1.8333351222458407e-05, + "loss": 0.8301, + "step": 1519 + }, + { + "epoch": 0.8106666666666666, + "grad_norm": 0.44809015973199046, + "learning_rate": 1.8233745799980817e-05, + "loss": 0.6098, + "step": 1520 + }, + { + "epoch": 0.8112, + "grad_norm": 0.4896095179374265, + "learning_rate": 1.8134384543949478e-05, + "loss": 0.684, + "step": 1521 + }, + { + "epoch": 0.8117333333333333, + "grad_norm": 0.5234106203427703, + "learning_rate": 1.803526775107217e-05, + "loss": 0.7691, + "step": 1522 + }, + { + "epoch": 0.8122666666666667, + "grad_norm": 0.46476988376251316, + "learning_rate": 1.7936395717326704e-05, + "loss": 0.6239, + "step": 1523 + }, + { + "epoch": 0.8128, + "grad_norm": 0.49874973349599516, + "learning_rate": 1.783776873795994e-05, + "loss": 0.7547, + "step": 1524 + }, + { + "epoch": 0.8133333333333334, + "grad_norm": 0.5905184413825677, + "learning_rate": 1.773938710748706e-05, + "loss": 0.8502, + "step": 1525 + }, + { + "epoch": 0.8138666666666666, + "grad_norm": 0.5035949347611718, + "learning_rate": 1.7641251119690505e-05, + "loss": 0.7307, + "step": 1526 + }, + { + "epoch": 0.8144, + "grad_norm": 0.5738022003972325, + "learning_rate": 1.754336106761927e-05, + "loss": 0.7793, + "step": 1527 + }, + { + "epoch": 0.8149333333333333, + "grad_norm": 0.5424653390734194, + "learning_rate": 1.744571724358789e-05, + "loss": 0.7757, + "step": 1528 + }, + { + "epoch": 0.8154666666666667, + "grad_norm": 0.46252207283074476, + "learning_rate": 1.7348319939175637e-05, + "loss": 0.7, + "step": 1529 + }, + { + "epoch": 0.816, + "grad_norm": 0.4907851167397113, + "learning_rate": 1.7251169445225657e-05, + "loss": 0.7292, + "step": 1530 + }, + { + "epoch": 0.8165333333333333, + "grad_norm": 0.4178661965994772, + "learning_rate": 1.715426605184407e-05, + "loss": 0.6679, + "step": 1531 + }, + { + "epoch": 0.8170666666666667, + "grad_norm": 0.47754089253765014, + "learning_rate": 1.705761004839911e-05, + "loss": 0.6702, + "step": 1532 + }, + { + "epoch": 0.8176, + "grad_norm": 0.6633324760822404, + "learning_rate": 1.696120172352025e-05, + "loss": 0.7803, + "step": 1533 + }, + { + "epoch": 0.8181333333333334, + "grad_norm": 0.4727307422889256, + "learning_rate": 1.6865041365097435e-05, + "loss": 0.6659, + "step": 1534 + }, + { + "epoch": 0.8186666666666667, + "grad_norm": 0.5730428573362569, + "learning_rate": 1.676912926028007e-05, + "loss": 0.8064, + "step": 1535 + }, + { + "epoch": 0.8192, + "grad_norm": 0.5099587028969585, + "learning_rate": 1.6673465695476232e-05, + "loss": 0.746, + "step": 1536 + }, + { + "epoch": 0.8197333333333333, + "grad_norm": 0.5086193753246331, + "learning_rate": 1.6578050956351886e-05, + "loss": 0.7935, + "step": 1537 + }, + { + "epoch": 0.8202666666666667, + "grad_norm": 0.5240438660890212, + "learning_rate": 1.6482885327829913e-05, + "loss": 0.7281, + "step": 1538 + }, + { + "epoch": 0.8208, + "grad_norm": 0.4504134348879838, + "learning_rate": 1.6387969094089316e-05, + "loss": 0.7004, + "step": 1539 + }, + { + "epoch": 0.8213333333333334, + "grad_norm": 0.46641359685348005, + "learning_rate": 1.6293302538564382e-05, + "loss": 0.6662, + "step": 1540 + }, + { + "epoch": 0.8218666666666666, + "grad_norm": 0.5838658371077295, + "learning_rate": 1.619888594394382e-05, + "loss": 0.7176, + "step": 1541 + }, + { + "epoch": 0.8224, + "grad_norm": 0.7012391602970637, + "learning_rate": 1.6104719592169902e-05, + "loss": 0.872, + "step": 1542 + }, + { + "epoch": 0.8229333333333333, + "grad_norm": 0.45478183438206427, + "learning_rate": 1.601080376443763e-05, + "loss": 0.6577, + "step": 1543 + }, + { + "epoch": 0.8234666666666667, + "grad_norm": 0.5536165423047683, + "learning_rate": 1.5917138741193973e-05, + "loss": 0.7681, + "step": 1544 + }, + { + "epoch": 0.824, + "grad_norm": 0.6019648360041853, + "learning_rate": 1.5823724802136865e-05, + "loss": 0.8668, + "step": 1545 + }, + { + "epoch": 0.8245333333333333, + "grad_norm": 0.528126687311034, + "learning_rate": 1.573056222621453e-05, + "loss": 0.8946, + "step": 1546 + }, + { + "epoch": 0.8250666666666666, + "grad_norm": 0.5787961931296473, + "learning_rate": 1.5637651291624523e-05, + "loss": 0.7104, + "step": 1547 + }, + { + "epoch": 0.8256, + "grad_norm": 0.5584247292865336, + "learning_rate": 1.5544992275813053e-05, + "loss": 0.7912, + "step": 1548 + }, + { + "epoch": 0.8261333333333334, + "grad_norm": 0.5592127404421677, + "learning_rate": 1.5452585455473977e-05, + "loss": 0.8268, + "step": 1549 + }, + { + "epoch": 0.8266666666666667, + "grad_norm": 1.6459891604457142, + "learning_rate": 1.536043110654809e-05, + "loss": 0.7625, + "step": 1550 + }, + { + "epoch": 0.8272, + "grad_norm": 1.9460915202760372, + "learning_rate": 1.526852950422226e-05, + "loss": 0.7972, + "step": 1551 + }, + { + "epoch": 0.8277333333333333, + "grad_norm": 0.49389502057942186, + "learning_rate": 1.5176880922928616e-05, + "loss": 0.7326, + "step": 1552 + }, + { + "epoch": 0.8282666666666667, + "grad_norm": 0.5296872403780049, + "learning_rate": 1.5085485636343755e-05, + "loss": 0.7438, + "step": 1553 + }, + { + "epoch": 0.8288, + "grad_norm": 0.5206900777405126, + "learning_rate": 1.4994343917387854e-05, + "loss": 0.8099, + "step": 1554 + }, + { + "epoch": 0.8293333333333334, + "grad_norm": 0.562759680643968, + "learning_rate": 1.4903456038223939e-05, + "loss": 0.7302, + "step": 1555 + }, + { + "epoch": 0.8298666666666666, + "grad_norm": 0.43740427820467087, + "learning_rate": 1.4812822270257009e-05, + "loss": 0.642, + "step": 1556 + }, + { + "epoch": 0.8304, + "grad_norm": 0.5809060492495025, + "learning_rate": 1.4722442884133214e-05, + "loss": 0.7144, + "step": 1557 + }, + { + "epoch": 0.8309333333333333, + "grad_norm": 0.5078337671329715, + "learning_rate": 1.4632318149739177e-05, + "loss": 0.7088, + "step": 1558 + }, + { + "epoch": 0.8314666666666667, + "grad_norm": 0.4223935317492648, + "learning_rate": 1.454244833620102e-05, + "loss": 0.7237, + "step": 1559 + }, + { + "epoch": 0.832, + "grad_norm": 0.4306772984469156, + "learning_rate": 1.4452833711883628e-05, + "loss": 0.6317, + "step": 1560 + }, + { + "epoch": 0.8325333333333333, + "grad_norm": 0.48965342980711823, + "learning_rate": 1.4363474544389877e-05, + "loss": 0.6731, + "step": 1561 + }, + { + "epoch": 0.8330666666666666, + "grad_norm": 0.48871038552264057, + "learning_rate": 1.4274371100559791e-05, + "loss": 0.7343, + "step": 1562 + }, + { + "epoch": 0.8336, + "grad_norm": 0.48062667686149924, + "learning_rate": 1.4185523646469822e-05, + "loss": 0.7227, + "step": 1563 + }, + { + "epoch": 0.8341333333333333, + "grad_norm": 0.47089628085218616, + "learning_rate": 1.409693244743192e-05, + "loss": 0.6916, + "step": 1564 + }, + { + "epoch": 0.8346666666666667, + "grad_norm": 0.5779379966268988, + "learning_rate": 1.4008597767992871e-05, + "loss": 0.7505, + "step": 1565 + }, + { + "epoch": 0.8352, + "grad_norm": 0.6465429524456934, + "learning_rate": 1.3920519871933424e-05, + "loss": 0.9103, + "step": 1566 + }, + { + "epoch": 0.8357333333333333, + "grad_norm": 0.6218601157422986, + "learning_rate": 1.3832699022267515e-05, + "loss": 0.765, + "step": 1567 + }, + { + "epoch": 0.8362666666666667, + "grad_norm": 0.5156130773122601, + "learning_rate": 1.37451354812416e-05, + "loss": 0.7075, + "step": 1568 + }, + { + "epoch": 0.8368, + "grad_norm": 0.40010584341983213, + "learning_rate": 1.3657829510333654e-05, + "loss": 0.6547, + "step": 1569 + }, + { + "epoch": 0.8373333333333334, + "grad_norm": 0.6052419811423837, + "learning_rate": 1.3570781370252582e-05, + "loss": 0.7519, + "step": 1570 + }, + { + "epoch": 0.8378666666666666, + "grad_norm": 0.5380969282925244, + "learning_rate": 1.3483991320937306e-05, + "loss": 0.7498, + "step": 1571 + }, + { + "epoch": 0.8384, + "grad_norm": 0.610094785258102, + "learning_rate": 1.339745962155613e-05, + "loss": 0.8135, + "step": 1572 + }, + { + "epoch": 0.8389333333333333, + "grad_norm": 0.4548361760721386, + "learning_rate": 1.3311186530505838e-05, + "loss": 0.671, + "step": 1573 + }, + { + "epoch": 0.8394666666666667, + "grad_norm": 0.5626173142904772, + "learning_rate": 1.322517230541096e-05, + "loss": 0.8198, + "step": 1574 + }, + { + "epoch": 0.84, + "grad_norm": 0.5625778147640857, + "learning_rate": 1.3139417203123027e-05, + "loss": 0.7443, + "step": 1575 + }, + { + "epoch": 0.8405333333333334, + "grad_norm": 0.4606738315995986, + "learning_rate": 1.30539214797198e-05, + "loss": 0.6705, + "step": 1576 + }, + { + "epoch": 0.8410666666666666, + "grad_norm": 0.42348707737647995, + "learning_rate": 1.2968685390504465e-05, + "loss": 0.6206, + "step": 1577 + }, + { + "epoch": 0.8416, + "grad_norm": 0.7293607528561209, + "learning_rate": 1.2883709190004955e-05, + "loss": 0.7697, + "step": 1578 + }, + { + "epoch": 0.8421333333333333, + "grad_norm": 0.4803752752873544, + "learning_rate": 1.2798993131973091e-05, + "loss": 0.7195, + "step": 1579 + }, + { + "epoch": 0.8426666666666667, + "grad_norm": 0.4721616013466926, + "learning_rate": 1.2714537469383858e-05, + "loss": 0.7272, + "step": 1580 + }, + { + "epoch": 0.8432, + "grad_norm": 0.5749202610287232, + "learning_rate": 1.263034245443473e-05, + "loss": 0.7178, + "step": 1581 + }, + { + "epoch": 0.8437333333333333, + "grad_norm": 0.4918692092749649, + "learning_rate": 1.2546408338544769e-05, + "loss": 0.6898, + "step": 1582 + }, + { + "epoch": 0.8442666666666667, + "grad_norm": 0.43734853080318675, + "learning_rate": 1.2462735372353996e-05, + "loss": 0.6955, + "step": 1583 + }, + { + "epoch": 0.8448, + "grad_norm": 0.5224295333686861, + "learning_rate": 1.2379323805722576e-05, + "loss": 0.78, + "step": 1584 + }, + { + "epoch": 0.8453333333333334, + "grad_norm": 0.5922274092117513, + "learning_rate": 1.2296173887730123e-05, + "loss": 0.7297, + "step": 1585 + }, + { + "epoch": 0.8458666666666667, + "grad_norm": 0.5705340603856977, + "learning_rate": 1.2213285866674905e-05, + "loss": 0.8368, + "step": 1586 + }, + { + "epoch": 0.8464, + "grad_norm": 0.4910596221861991, + "learning_rate": 1.2130659990073146e-05, + "loss": 0.7026, + "step": 1587 + }, + { + "epoch": 0.8469333333333333, + "grad_norm": 0.4467032714737605, + "learning_rate": 1.2048296504658207e-05, + "loss": 0.6869, + "step": 1588 + }, + { + "epoch": 0.8474666666666667, + "grad_norm": 0.4207718435624483, + "learning_rate": 1.1966195656380031e-05, + "loss": 0.7002, + "step": 1589 + }, + { + "epoch": 0.848, + "grad_norm": 0.5165282223653747, + "learning_rate": 1.1884357690404158e-05, + "loss": 0.7054, + "step": 1590 + }, + { + "epoch": 0.8485333333333334, + "grad_norm": 0.4737730901307995, + "learning_rate": 1.1802782851111205e-05, + "loss": 0.6875, + "step": 1591 + }, + { + "epoch": 0.8490666666666666, + "grad_norm": 0.4563247614617513, + "learning_rate": 1.1721471382096027e-05, + "loss": 0.693, + "step": 1592 + }, + { + "epoch": 0.8496, + "grad_norm": 0.47543609508727075, + "learning_rate": 1.1640423526166988e-05, + "loss": 0.6393, + "step": 1593 + }, + { + "epoch": 0.8501333333333333, + "grad_norm": 0.3476281053216799, + "learning_rate": 1.1559639525345311e-05, + "loss": 0.6165, + "step": 1594 + }, + { + "epoch": 0.8506666666666667, + "grad_norm": 0.5202231870957309, + "learning_rate": 1.1479119620864276e-05, + "loss": 0.759, + "step": 1595 + }, + { + "epoch": 0.8512, + "grad_norm": 0.42654028759472845, + "learning_rate": 1.1398864053168534e-05, + "loss": 0.6388, + "step": 1596 + }, + { + "epoch": 0.8517333333333333, + "grad_norm": 0.46701105025080786, + "learning_rate": 1.1318873061913405e-05, + "loss": 0.6391, + "step": 1597 + }, + { + "epoch": 0.8522666666666666, + "grad_norm": 0.5536664806031145, + "learning_rate": 1.123914688596409e-05, + "loss": 0.7509, + "step": 1598 + }, + { + "epoch": 0.8528, + "grad_norm": 0.6799427233853165, + "learning_rate": 1.1159685763395111e-05, + "loss": 0.837, + "step": 1599 + }, + { + "epoch": 0.8533333333333334, + "grad_norm": 0.4786584684295808, + "learning_rate": 1.1080489931489391e-05, + "loss": 0.6551, + "step": 1600 + }, + { + "epoch": 0.8538666666666667, + "grad_norm": 0.6346363261580554, + "learning_rate": 1.1001559626737756e-05, + "loss": 0.7388, + "step": 1601 + }, + { + "epoch": 0.8544, + "grad_norm": 0.5249745108943299, + "learning_rate": 1.0922895084838037e-05, + "loss": 0.7307, + "step": 1602 + }, + { + "epoch": 0.8549333333333333, + "grad_norm": 0.5420943687474257, + "learning_rate": 1.0844496540694515e-05, + "loss": 0.81, + "step": 1603 + }, + { + "epoch": 0.8554666666666667, + "grad_norm": 0.44520089650783257, + "learning_rate": 1.0766364228417148e-05, + "loss": 0.707, + "step": 1604 + }, + { + "epoch": 0.856, + "grad_norm": 0.4415726863620001, + "learning_rate": 1.0688498381320855e-05, + "loss": 0.6431, + "step": 1605 + }, + { + "epoch": 0.8565333333333334, + "grad_norm": 0.5168783328821801, + "learning_rate": 1.0610899231924886e-05, + "loss": 0.6849, + "step": 1606 + }, + { + "epoch": 0.8570666666666666, + "grad_norm": 0.4408776737461188, + "learning_rate": 1.0533567011952094e-05, + "loss": 0.7219, + "step": 1607 + }, + { + "epoch": 0.8576, + "grad_norm": 0.5600784676932644, + "learning_rate": 1.045650195232819e-05, + "loss": 0.8664, + "step": 1608 + }, + { + "epoch": 0.8581333333333333, + "grad_norm": 0.465766389137664, + "learning_rate": 1.0379704283181179e-05, + "loss": 0.7233, + "step": 1609 + }, + { + "epoch": 0.8586666666666667, + "grad_norm": 0.5088576755246479, + "learning_rate": 1.0303174233840528e-05, + "loss": 0.7127, + "step": 1610 + }, + { + "epoch": 0.8592, + "grad_norm": 0.631801694369338, + "learning_rate": 1.0226912032836611e-05, + "loss": 0.7959, + "step": 1611 + }, + { + "epoch": 0.8597333333333333, + "grad_norm": 0.5218246264212684, + "learning_rate": 1.0150917907899926e-05, + "loss": 0.8172, + "step": 1612 + }, + { + "epoch": 0.8602666666666666, + "grad_norm": 0.4825524954277326, + "learning_rate": 1.007519208596045e-05, + "loss": 0.7186, + "step": 1613 + }, + { + "epoch": 0.8608, + "grad_norm": 0.5263756401894659, + "learning_rate": 9.999734793146998e-06, + "loss": 0.7588, + "step": 1614 + }, + { + "epoch": 0.8613333333333333, + "grad_norm": 0.5607804957473732, + "learning_rate": 9.924546254786493e-06, + "loss": 0.7896, + "step": 1615 + }, + { + "epoch": 0.8618666666666667, + "grad_norm": 0.5317198707514146, + "learning_rate": 9.849626695403324e-06, + "loss": 0.7555, + "step": 1616 + }, + { + "epoch": 0.8624, + "grad_norm": 0.4830066670495747, + "learning_rate": 9.774976338718677e-06, + "loss": 0.7157, + "step": 1617 + }, + { + "epoch": 0.8629333333333333, + "grad_norm": 0.4273456175724436, + "learning_rate": 9.700595407649805e-06, + "loss": 0.6735, + "step": 1618 + }, + { + "epoch": 0.8634666666666667, + "grad_norm": 0.45417802530711415, + "learning_rate": 9.62648412430951e-06, + "loss": 0.7218, + "step": 1619 + }, + { + "epoch": 0.864, + "grad_norm": 0.42860751675190184, + "learning_rate": 9.552642710005299e-06, + "loss": 0.6094, + "step": 1620 + }, + { + "epoch": 0.8645333333333334, + "grad_norm": 0.38947169025510137, + "learning_rate": 9.479071385238892e-06, + "loss": 0.6599, + "step": 1621 + }, + { + "epoch": 0.8650666666666667, + "grad_norm": 0.4075364372659137, + "learning_rate": 9.40577036970538e-06, + "loss": 0.6509, + "step": 1622 + }, + { + "epoch": 0.8656, + "grad_norm": 0.4131338362822137, + "learning_rate": 9.332739882292752e-06, + "loss": 0.6275, + "step": 1623 + }, + { + "epoch": 0.8661333333333333, + "grad_norm": 0.474524778672002, + "learning_rate": 9.259980141081115e-06, + "loss": 0.6994, + "step": 1624 + }, + { + "epoch": 0.8666666666666667, + "grad_norm": 0.4929519260488483, + "learning_rate": 9.187491363342093e-06, + "loss": 0.8072, + "step": 1625 + }, + { + "epoch": 0.8672, + "grad_norm": 0.5739079113452518, + "learning_rate": 9.115273765538202e-06, + "loss": 0.7298, + "step": 1626 + }, + { + "epoch": 0.8677333333333334, + "grad_norm": 0.4393783480975882, + "learning_rate": 9.043327563322112e-06, + "loss": 0.6485, + "step": 1627 + }, + { + "epoch": 0.8682666666666666, + "grad_norm": 0.492183761772122, + "learning_rate": 8.971652971536148e-06, + "loss": 0.7441, + "step": 1628 + }, + { + "epoch": 0.8688, + "grad_norm": 0.45490549116062656, + "learning_rate": 8.900250204211514e-06, + "loss": 0.6082, + "step": 1629 + }, + { + "epoch": 0.8693333333333333, + "grad_norm": 0.6727059960569023, + "learning_rate": 8.829119474567671e-06, + "loss": 0.7843, + "step": 1630 + }, + { + "epoch": 0.8698666666666667, + "grad_norm": 0.46021591435431813, + "learning_rate": 8.758260995011825e-06, + "loss": 0.7266, + "step": 1631 + }, + { + "epoch": 0.8704, + "grad_norm": 0.43787677770044686, + "learning_rate": 8.687674977138116e-06, + "loss": 0.6711, + "step": 1632 + }, + { + "epoch": 0.8709333333333333, + "grad_norm": 0.5054525066581391, + "learning_rate": 8.617361631727138e-06, + "loss": 0.6961, + "step": 1633 + }, + { + "epoch": 0.8714666666666666, + "grad_norm": 0.4219011331339254, + "learning_rate": 8.547321168745193e-06, + "loss": 0.6766, + "step": 1634 + }, + { + "epoch": 0.872, + "grad_norm": 0.6642900014098958, + "learning_rate": 8.47755379734373e-06, + "loss": 0.7591, + "step": 1635 + }, + { + "epoch": 0.8725333333333334, + "grad_norm": 0.5184000275481225, + "learning_rate": 8.408059725858719e-06, + "loss": 0.6511, + "step": 1636 + }, + { + "epoch": 0.8730666666666667, + "grad_norm": 0.48729918547181306, + "learning_rate": 8.338839161809997e-06, + "loss": 0.7974, + "step": 1637 + }, + { + "epoch": 0.8736, + "grad_norm": 0.4763858189554415, + "learning_rate": 8.269892311900696e-06, + "loss": 0.6517, + "step": 1638 + }, + { + "epoch": 0.8741333333333333, + "grad_norm": 0.4797922479036312, + "learning_rate": 8.201219382016556e-06, + "loss": 0.6274, + "step": 1639 + }, + { + "epoch": 0.8746666666666667, + "grad_norm": 0.5065831946679562, + "learning_rate": 8.132820577225387e-06, + "loss": 0.7318, + "step": 1640 + }, + { + "epoch": 0.8752, + "grad_norm": 0.5505070909082705, + "learning_rate": 8.064696101776358e-06, + "loss": 0.8353, + "step": 1641 + }, + { + "epoch": 0.8757333333333334, + "grad_norm": 0.48013622734320494, + "learning_rate": 7.996846159099557e-06, + "loss": 0.6355, + "step": 1642 + }, + { + "epoch": 0.8762666666666666, + "grad_norm": 0.48653054810125534, + "learning_rate": 7.929270951805178e-06, + "loss": 0.7309, + "step": 1643 + }, + { + "epoch": 0.8768, + "grad_norm": 0.6003193953384854, + "learning_rate": 7.861970681683051e-06, + "loss": 0.8524, + "step": 1644 + }, + { + "epoch": 0.8773333333333333, + "grad_norm": 0.5207762433264173, + "learning_rate": 7.794945549701993e-06, + "loss": 0.7493, + "step": 1645 + }, + { + "epoch": 0.8778666666666667, + "grad_norm": 0.5234557463199222, + "learning_rate": 7.728195756009204e-06, + "loss": 0.7591, + "step": 1646 + }, + { + "epoch": 0.8784, + "grad_norm": 0.506189512868477, + "learning_rate": 7.661721499929753e-06, + "loss": 0.741, + "step": 1647 + }, + { + "epoch": 0.8789333333333333, + "grad_norm": 0.40236452875488554, + "learning_rate": 7.595522979965819e-06, + "loss": 0.6004, + "step": 1648 + }, + { + "epoch": 0.8794666666666666, + "grad_norm": 0.48698958631856193, + "learning_rate": 7.529600393796232e-06, + "loss": 0.7003, + "step": 1649 + }, + { + "epoch": 0.88, + "grad_norm": 0.46135947514065956, + "learning_rate": 7.463953938275858e-06, + "loss": 0.6278, + "step": 1650 + }, + { + "epoch": 0.8805333333333333, + "grad_norm": 0.4282588139719773, + "learning_rate": 7.3985838094349444e-06, + "loss": 0.6055, + "step": 1651 + }, + { + "epoch": 0.8810666666666667, + "grad_norm": 0.49488868238005507, + "learning_rate": 7.333490202478666e-06, + "loss": 0.7311, + "step": 1652 + }, + { + "epoch": 0.8816, + "grad_norm": 0.49645661708457584, + "learning_rate": 7.2686733117863784e-06, + "loss": 0.6515, + "step": 1653 + }, + { + "epoch": 0.8821333333333333, + "grad_norm": 0.4632243612209577, + "learning_rate": 7.204133330911178e-06, + "loss": 0.6983, + "step": 1654 + }, + { + "epoch": 0.8826666666666667, + "grad_norm": 0.4394766733689113, + "learning_rate": 7.1398704525792e-06, + "loss": 0.6608, + "step": 1655 + }, + { + "epoch": 0.8832, + "grad_norm": 0.4621408336491037, + "learning_rate": 7.07588486868922e-06, + "loss": 0.6321, + "step": 1656 + }, + { + "epoch": 0.8837333333333334, + "grad_norm": 0.4748794449223368, + "learning_rate": 7.012176770311862e-06, + "loss": 0.8009, + "step": 1657 + }, + { + "epoch": 0.8842666666666666, + "grad_norm": 0.5811079235129704, + "learning_rate": 6.948746347689183e-06, + "loss": 0.7525, + "step": 1658 + }, + { + "epoch": 0.8848, + "grad_norm": 0.5536825290584947, + "learning_rate": 6.8855937902340576e-06, + "loss": 0.7958, + "step": 1659 + }, + { + "epoch": 0.8853333333333333, + "grad_norm": 0.4449735221813921, + "learning_rate": 6.8227192865295995e-06, + "loss": 0.6269, + "step": 1660 + }, + { + "epoch": 0.8858666666666667, + "grad_norm": 0.4889710443032619, + "learning_rate": 6.760123024328624e-06, + "loss": 0.7816, + "step": 1661 + }, + { + "epoch": 0.8864, + "grad_norm": 0.5349562942688116, + "learning_rate": 6.6978051905530855e-06, + "loss": 0.6645, + "step": 1662 + }, + { + "epoch": 0.8869333333333334, + "grad_norm": 0.4934466804295362, + "learning_rate": 6.635765971293484e-06, + "loss": 0.6681, + "step": 1663 + }, + { + "epoch": 0.8874666666666666, + "grad_norm": 0.9936281137081363, + "learning_rate": 6.5740055518083375e-06, + "loss": 0.6872, + "step": 1664 + }, + { + "epoch": 0.888, + "grad_norm": 0.488307043217961, + "learning_rate": 6.512524116523633e-06, + "loss": 0.7075, + "step": 1665 + }, + { + "epoch": 0.8885333333333333, + "grad_norm": 0.6075844527604656, + "learning_rate": 6.451321849032288e-06, + "loss": 0.8007, + "step": 1666 + }, + { + "epoch": 0.8890666666666667, + "grad_norm": 0.44454223076412264, + "learning_rate": 6.390398932093555e-06, + "loss": 0.7356, + "step": 1667 + }, + { + "epoch": 0.8896, + "grad_norm": 0.5519065208356204, + "learning_rate": 6.329755547632499e-06, + "loss": 0.771, + "step": 1668 + }, + { + "epoch": 0.8901333333333333, + "grad_norm": 0.5478271754599924, + "learning_rate": 6.269391876739495e-06, + "loss": 0.7592, + "step": 1669 + }, + { + "epoch": 0.8906666666666667, + "grad_norm": 0.4408424564893833, + "learning_rate": 6.209308099669597e-06, + "loss": 0.6955, + "step": 1670 + }, + { + "epoch": 0.8912, + "grad_norm": 0.6270738931351739, + "learning_rate": 6.149504395842087e-06, + "loss": 0.7421, + "step": 1671 + }, + { + "epoch": 0.8917333333333334, + "grad_norm": 0.9517628597807377, + "learning_rate": 6.089980943839924e-06, + "loss": 0.8323, + "step": 1672 + }, + { + "epoch": 0.8922666666666667, + "grad_norm": 0.5364284273406238, + "learning_rate": 6.030737921409169e-06, + "loss": 0.6865, + "step": 1673 + }, + { + "epoch": 0.8928, + "grad_norm": 0.6419546535789074, + "learning_rate": 5.971775505458444e-06, + "loss": 0.7216, + "step": 1674 + }, + { + "epoch": 0.8933333333333333, + "grad_norm": 0.5269843159023666, + "learning_rate": 5.913093872058528e-06, + "loss": 0.6767, + "step": 1675 + }, + { + "epoch": 0.8938666666666667, + "grad_norm": 0.4352999520696695, + "learning_rate": 5.854693196441641e-06, + "loss": 0.7933, + "step": 1676 + }, + { + "epoch": 0.8944, + "grad_norm": 0.5580198182493007, + "learning_rate": 5.7965736530010916e-06, + "loss": 0.8353, + "step": 1677 + }, + { + "epoch": 0.8949333333333334, + "grad_norm": 0.587909658106802, + "learning_rate": 5.738735415290642e-06, + "loss": 0.6288, + "step": 1678 + }, + { + "epoch": 0.8954666666666666, + "grad_norm": 0.5520557307410818, + "learning_rate": 5.681178656024055e-06, + "loss": 0.7293, + "step": 1679 + }, + { + "epoch": 0.896, + "grad_norm": 0.6114940005699151, + "learning_rate": 5.623903547074549e-06, + "loss": 0.7133, + "step": 1680 + }, + { + "epoch": 0.8965333333333333, + "grad_norm": 0.6103014329036469, + "learning_rate": 5.566910259474289e-06, + "loss": 0.8082, + "step": 1681 + }, + { + "epoch": 0.8970666666666667, + "grad_norm": 0.5046918194388451, + "learning_rate": 5.510198963413881e-06, + "loss": 0.6353, + "step": 1682 + }, + { + "epoch": 0.8976, + "grad_norm": 0.5830522248552155, + "learning_rate": 5.453769828241872e-06, + "loss": 0.6757, + "step": 1683 + }, + { + "epoch": 0.8981333333333333, + "grad_norm": 0.4094809100101818, + "learning_rate": 5.397623022464226e-06, + "loss": 0.6888, + "step": 1684 + }, + { + "epoch": 0.8986666666666666, + "grad_norm": 0.5414726816177821, + "learning_rate": 5.341758713743828e-06, + "loss": 0.7496, + "step": 1685 + }, + { + "epoch": 0.8992, + "grad_norm": 0.6107634014919001, + "learning_rate": 5.286177068899989e-06, + "loss": 0.6886, + "step": 1686 + }, + { + "epoch": 0.8997333333333334, + "grad_norm": 0.4106943693814969, + "learning_rate": 5.230878253907912e-06, + "loss": 0.6593, + "step": 1687 + }, + { + "epoch": 0.9002666666666667, + "grad_norm": 0.5230257324815101, + "learning_rate": 5.175862433898282e-06, + "loss": 0.6109, + "step": 1688 + }, + { + "epoch": 0.9008, + "grad_norm": 0.7562403955437415, + "learning_rate": 5.121129773156663e-06, + "loss": 0.9042, + "step": 1689 + }, + { + "epoch": 0.9013333333333333, + "grad_norm": 0.558874746914631, + "learning_rate": 5.066680435123106e-06, + "loss": 0.8388, + "step": 1690 + }, + { + "epoch": 0.9018666666666667, + "grad_norm": 0.48021688999202583, + "learning_rate": 5.012514582391592e-06, + "loss": 0.6812, + "step": 1691 + }, + { + "epoch": 0.9024, + "grad_norm": 0.4602437546671285, + "learning_rate": 4.95863237670956e-06, + "loss": 0.7249, + "step": 1692 + }, + { + "epoch": 0.9029333333333334, + "grad_norm": 0.47383038109374326, + "learning_rate": 4.905033978977491e-06, + "loss": 0.6403, + "step": 1693 + }, + { + "epoch": 0.9034666666666666, + "grad_norm": 0.5274325832002795, + "learning_rate": 4.851719549248301e-06, + "loss": 0.7773, + "step": 1694 + }, + { + "epoch": 0.904, + "grad_norm": 0.46136179468542826, + "learning_rate": 4.798689246727006e-06, + "loss": 0.6554, + "step": 1695 + }, + { + "epoch": 0.9045333333333333, + "grad_norm": 0.4991708505289876, + "learning_rate": 4.745943229770122e-06, + "loss": 0.7029, + "step": 1696 + }, + { + "epoch": 0.9050666666666667, + "grad_norm": 0.626880797898286, + "learning_rate": 4.693481655885257e-06, + "loss": 0.7866, + "step": 1697 + }, + { + "epoch": 0.9056, + "grad_norm": 0.5845448167722781, + "learning_rate": 4.641304681730641e-06, + "loss": 0.7247, + "step": 1698 + }, + { + "epoch": 0.9061333333333333, + "grad_norm": 0.4862178222397938, + "learning_rate": 4.58941246311464e-06, + "loss": 0.653, + "step": 1699 + }, + { + "epoch": 0.9066666666666666, + "grad_norm": 0.4861391733533597, + "learning_rate": 4.537805154995278e-06, + "loss": 0.6721, + "step": 1700 + }, + { + "epoch": 0.9072, + "grad_norm": 0.42820206581647385, + "learning_rate": 4.486482911479839e-06, + "loss": 0.6301, + "step": 1701 + }, + { + "epoch": 0.9077333333333333, + "grad_norm": 0.560034769193496, + "learning_rate": 4.435445885824285e-06, + "loss": 0.8402, + "step": 1702 + }, + { + "epoch": 0.9082666666666667, + "grad_norm": 0.570800689301257, + "learning_rate": 4.384694230432984e-06, + "loss": 0.6529, + "step": 1703 + }, + { + "epoch": 0.9088, + "grad_norm": 0.42588385382805777, + "learning_rate": 4.3342280968580285e-06, + "loss": 0.6535, + "step": 1704 + }, + { + "epoch": 0.9093333333333333, + "grad_norm": 0.5415931163910935, + "learning_rate": 4.2840476357989825e-06, + "loss": 0.7774, + "step": 1705 + }, + { + "epoch": 0.9098666666666667, + "grad_norm": 0.6272470407089946, + "learning_rate": 4.2341529971023255e-06, + "loss": 0.7145, + "step": 1706 + }, + { + "epoch": 0.9104, + "grad_norm": 0.4535081229895934, + "learning_rate": 4.184544329761009e-06, + "loss": 0.6937, + "step": 1707 + }, + { + "epoch": 0.9109333333333334, + "grad_norm": 0.6417022320016039, + "learning_rate": 4.135221781914034e-06, + "loss": 0.7383, + "step": 1708 + }, + { + "epoch": 0.9114666666666666, + "grad_norm": 0.45964017202954827, + "learning_rate": 4.0861855008460405e-06, + "loss": 0.6948, + "step": 1709 + }, + { + "epoch": 0.912, + "grad_norm": 0.5320742897519447, + "learning_rate": 4.037435632986786e-06, + "loss": 0.6478, + "step": 1710 + }, + { + "epoch": 0.9125333333333333, + "grad_norm": 0.4134215065349257, + "learning_rate": 3.988972323910778e-06, + "loss": 0.6269, + "step": 1711 + }, + { + "epoch": 0.9130666666666667, + "grad_norm": 0.607172942134624, + "learning_rate": 3.9407957183368095e-06, + "loss": 0.6956, + "step": 1712 + }, + { + "epoch": 0.9136, + "grad_norm": 0.5252893129734731, + "learning_rate": 3.892905960127546e-06, + "loss": 0.7253, + "step": 1713 + }, + { + "epoch": 0.9141333333333334, + "grad_norm": 0.46582545750354765, + "learning_rate": 3.845303192289074e-06, + "loss": 0.7394, + "step": 1714 + }, + { + "epoch": 0.9146666666666666, + "grad_norm": 0.4094338874129953, + "learning_rate": 3.797987556970495e-06, + "loss": 0.6637, + "step": 1715 + }, + { + "epoch": 0.9152, + "grad_norm": 0.4636293196252806, + "learning_rate": 3.750959195463466e-06, + "loss": 0.6563, + "step": 1716 + }, + { + "epoch": 0.9157333333333333, + "grad_norm": 0.5415115913571019, + "learning_rate": 3.7042182482018075e-06, + "loss": 0.8612, + "step": 1717 + }, + { + "epoch": 0.9162666666666667, + "grad_norm": 0.4522640549950337, + "learning_rate": 3.6577648547611033e-06, + "loss": 0.7838, + "step": 1718 + }, + { + "epoch": 0.9168, + "grad_norm": 0.38016753165518974, + "learning_rate": 3.611599153858214e-06, + "loss": 0.6309, + "step": 1719 + }, + { + "epoch": 0.9173333333333333, + "grad_norm": 0.39069729040605633, + "learning_rate": 3.565721283350931e-06, + "loss": 0.684, + "step": 1720 + }, + { + "epoch": 0.9178666666666667, + "grad_norm": 0.45806841285698824, + "learning_rate": 3.5201313802375456e-06, + "loss": 0.6847, + "step": 1721 + }, + { + "epoch": 0.9184, + "grad_norm": 0.43406155609265734, + "learning_rate": 3.4748295806564356e-06, + "loss": 0.6533, + "step": 1722 + }, + { + "epoch": 0.9189333333333334, + "grad_norm": 0.5676457830066718, + "learning_rate": 3.4298160198856568e-06, + "loss": 0.6769, + "step": 1723 + }, + { + "epoch": 0.9194666666666667, + "grad_norm": 0.4362044805517463, + "learning_rate": 3.3850908323424967e-06, + "loss": 0.7128, + "step": 1724 + }, + { + "epoch": 0.92, + "grad_norm": 0.4448641004513165, + "learning_rate": 3.3406541515832003e-06, + "loss": 0.6975, + "step": 1725 + }, + { + "epoch": 0.9205333333333333, + "grad_norm": 0.4245635378541507, + "learning_rate": 3.296506110302422e-06, + "loss": 0.6425, + "step": 1726 + }, + { + "epoch": 0.9210666666666667, + "grad_norm": 0.4422408012266385, + "learning_rate": 3.252646840332918e-06, + "loss": 0.7702, + "step": 1727 + }, + { + "epoch": 0.9216, + "grad_norm": 0.4657874956661579, + "learning_rate": 3.209076472645112e-06, + "loss": 0.6791, + "step": 1728 + }, + { + "epoch": 0.9221333333333334, + "grad_norm": 0.4812393443605173, + "learning_rate": 3.1657951373467497e-06, + "loss": 0.6028, + "step": 1729 + }, + { + "epoch": 0.9226666666666666, + "grad_norm": 0.43947797613439554, + "learning_rate": 3.1228029636824475e-06, + "loss": 0.6834, + "step": 1730 + }, + { + "epoch": 0.9232, + "grad_norm": 0.46533180700732835, + "learning_rate": 3.0801000800333877e-06, + "loss": 0.673, + "step": 1731 + }, + { + "epoch": 0.9237333333333333, + "grad_norm": 0.5505691048109808, + "learning_rate": 3.037686613916857e-06, + "loss": 0.6438, + "step": 1732 + }, + { + "epoch": 0.9242666666666667, + "grad_norm": 0.6099905659325539, + "learning_rate": 2.995562691985898e-06, + "loss": 0.7642, + "step": 1733 + }, + { + "epoch": 0.9248, + "grad_norm": 0.4772093942998137, + "learning_rate": 2.9537284400289355e-06, + "loss": 0.6966, + "step": 1734 + }, + { + "epoch": 0.9253333333333333, + "grad_norm": 0.8552915050342436, + "learning_rate": 2.912183982969385e-06, + "loss": 1.0042, + "step": 1735 + }, + { + "epoch": 0.9258666666666666, + "grad_norm": 0.47144045944851415, + "learning_rate": 2.8709294448653225e-06, + "loss": 0.6995, + "step": 1736 + }, + { + "epoch": 0.9264, + "grad_norm": 0.4764511314947748, + "learning_rate": 2.8299649489090475e-06, + "loss": 0.6833, + "step": 1737 + }, + { + "epoch": 0.9269333333333334, + "grad_norm": 0.40338923322749526, + "learning_rate": 2.789290617426765e-06, + "loss": 0.6012, + "step": 1738 + }, + { + "epoch": 0.9274666666666667, + "grad_norm": 0.5469624653093387, + "learning_rate": 2.748906571878207e-06, + "loss": 0.6508, + "step": 1739 + }, + { + "epoch": 0.928, + "grad_norm": 0.4033243451447917, + "learning_rate": 2.708812932856253e-06, + "loss": 0.5882, + "step": 1740 + }, + { + "epoch": 0.9285333333333333, + "grad_norm": 0.5650299736539638, + "learning_rate": 2.6690098200866098e-06, + "loss": 0.748, + "step": 1741 + }, + { + "epoch": 0.9290666666666667, + "grad_norm": 0.4378622632528653, + "learning_rate": 2.6294973524274125e-06, + "loss": 0.6493, + "step": 1742 + }, + { + "epoch": 0.9296, + "grad_norm": 0.4459629050866768, + "learning_rate": 2.590275647868867e-06, + "loss": 0.6236, + "step": 1743 + }, + { + "epoch": 0.9301333333333334, + "grad_norm": 0.6222359720620512, + "learning_rate": 2.551344823532964e-06, + "loss": 0.7108, + "step": 1744 + }, + { + "epoch": 0.9306666666666666, + "grad_norm": 0.5122806021831506, + "learning_rate": 2.5127049956730207e-06, + "loss": 0.7062, + "step": 1745 + }, + { + "epoch": 0.9312, + "grad_norm": 0.5414807022479352, + "learning_rate": 2.4743562796734622e-06, + "loss": 0.7937, + "step": 1746 + }, + { + "epoch": 0.9317333333333333, + "grad_norm": 0.5881527424573568, + "learning_rate": 2.436298790049363e-06, + "loss": 0.7238, + "step": 1747 + }, + { + "epoch": 0.9322666666666667, + "grad_norm": 0.4265515449250894, + "learning_rate": 2.3985326404461604e-06, + "loss": 0.644, + "step": 1748 + }, + { + "epoch": 0.9328, + "grad_norm": 0.5090537767426827, + "learning_rate": 2.3610579436393e-06, + "loss": 0.7131, + "step": 1749 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 0.4973165848843641, + "learning_rate": 2.3238748115339324e-06, + "loss": 0.7288, + "step": 1750 + }, + { + "epoch": 0.9338666666666666, + "grad_norm": 0.49711395640263095, + "learning_rate": 2.286983355164529e-06, + "loss": 0.8063, + "step": 1751 + }, + { + "epoch": 0.9344, + "grad_norm": 0.45003147938779353, + "learning_rate": 2.250383684694579e-06, + "loss": 0.6672, + "step": 1752 + }, + { + "epoch": 0.9349333333333333, + "grad_norm": 0.5263873461041867, + "learning_rate": 2.2140759094162467e-06, + "loss": 0.6554, + "step": 1753 + }, + { + "epoch": 0.9354666666666667, + "grad_norm": 0.4473989869953113, + "learning_rate": 2.178060137750071e-06, + "loss": 0.7079, + "step": 1754 + }, + { + "epoch": 0.936, + "grad_norm": 0.5555097211297417, + "learning_rate": 2.1423364772445887e-06, + "loss": 0.7683, + "step": 1755 + }, + { + "epoch": 0.9365333333333333, + "grad_norm": 0.4139824487629015, + "learning_rate": 2.106905034576112e-06, + "loss": 0.6547, + "step": 1756 + }, + { + "epoch": 0.9370666666666667, + "grad_norm": 0.6032150172121085, + "learning_rate": 2.0717659155482738e-06, + "loss": 0.7181, + "step": 1757 + }, + { + "epoch": 0.9376, + "grad_norm": 0.5419524456251257, + "learning_rate": 2.036919225091827e-06, + "loss": 0.6984, + "step": 1758 + }, + { + "epoch": 0.9381333333333334, + "grad_norm": 0.4466457886122112, + "learning_rate": 2.002365067264289e-06, + "loss": 0.7033, + "step": 1759 + }, + { + "epoch": 0.9386666666666666, + "grad_norm": 0.5912517841993868, + "learning_rate": 1.968103545249611e-06, + "loss": 0.7759, + "step": 1760 + }, + { + "epoch": 0.9392, + "grad_norm": 0.4169609800985717, + "learning_rate": 1.9341347613579087e-06, + "loss": 0.6302, + "step": 1761 + }, + { + "epoch": 0.9397333333333333, + "grad_norm": 0.3825795729910924, + "learning_rate": 1.900458817025097e-06, + "loss": 0.6144, + "step": 1762 + }, + { + "epoch": 0.9402666666666667, + "grad_norm": 0.46253675399641014, + "learning_rate": 1.8670758128126909e-06, + "loss": 0.7467, + "step": 1763 + }, + { + "epoch": 0.9408, + "grad_norm": 0.614394404605922, + "learning_rate": 1.8339858484073935e-06, + "loss": 0.7495, + "step": 1764 + }, + { + "epoch": 0.9413333333333334, + "grad_norm": 0.5042783672677922, + "learning_rate": 1.8011890226208527e-06, + "loss": 0.8186, + "step": 1765 + }, + { + "epoch": 0.9418666666666666, + "grad_norm": 0.5560448870125844, + "learning_rate": 1.7686854333893833e-06, + "loss": 0.7447, + "step": 1766 + }, + { + "epoch": 0.9424, + "grad_norm": 0.490352727204397, + "learning_rate": 1.7364751777736332e-06, + "loss": 0.7362, + "step": 1767 + }, + { + "epoch": 0.9429333333333333, + "grad_norm": 0.44728772727080823, + "learning_rate": 1.7045583519583074e-06, + "loss": 0.6484, + "step": 1768 + }, + { + "epoch": 0.9434666666666667, + "grad_norm": 0.49411244121884584, + "learning_rate": 1.6729350512519005e-06, + "loss": 0.7414, + "step": 1769 + }, + { + "epoch": 0.944, + "grad_norm": 0.3850409913369315, + "learning_rate": 1.6416053700863964e-06, + "loss": 0.5999, + "step": 1770 + }, + { + "epoch": 0.9445333333333333, + "grad_norm": 0.46460818555883665, + "learning_rate": 1.6105694020169593e-06, + "loss": 0.6602, + "step": 1771 + }, + { + "epoch": 0.9450666666666667, + "grad_norm": 0.4258599918242338, + "learning_rate": 1.5798272397217095e-06, + "loss": 0.6777, + "step": 1772 + }, + { + "epoch": 0.9456, + "grad_norm": 0.5053769678163469, + "learning_rate": 1.5493789750014031e-06, + "loss": 0.6584, + "step": 1773 + }, + { + "epoch": 0.9461333333333334, + "grad_norm": 0.4327361124299862, + "learning_rate": 1.5192246987791981e-06, + "loss": 0.651, + "step": 1774 + }, + { + "epoch": 0.9466666666666667, + "grad_norm": 0.4267839667595951, + "learning_rate": 1.489364501100332e-06, + "loss": 0.7154, + "step": 1775 + }, + { + "epoch": 0.9472, + "grad_norm": 0.5273772134438605, + "learning_rate": 1.459798471131868e-06, + "loss": 0.6509, + "step": 1776 + }, + { + "epoch": 0.9477333333333333, + "grad_norm": 0.47718116203120287, + "learning_rate": 1.430526697162482e-06, + "loss": 0.6662, + "step": 1777 + }, + { + "epoch": 0.9482666666666667, + "grad_norm": 0.5424380948717015, + "learning_rate": 1.4015492666021312e-06, + "loss": 0.7798, + "step": 1778 + }, + { + "epoch": 0.9488, + "grad_norm": 0.5015991268010247, + "learning_rate": 1.3728662659818204e-06, + "loss": 0.7824, + "step": 1779 + }, + { + "epoch": 0.9493333333333334, + "grad_norm": 0.723900086009267, + "learning_rate": 1.344477780953346e-06, + "loss": 0.7137, + "step": 1780 + }, + { + "epoch": 0.9498666666666666, + "grad_norm": 0.4085920312350529, + "learning_rate": 1.3163838962890195e-06, + "loss": 0.6213, + "step": 1781 + }, + { + "epoch": 0.9504, + "grad_norm": 0.49665624100972683, + "learning_rate": 1.2885846958814673e-06, + "loss": 0.7234, + "step": 1782 + }, + { + "epoch": 0.9509333333333333, + "grad_norm": 0.5304273846910255, + "learning_rate": 1.261080262743297e-06, + "loss": 0.7091, + "step": 1783 + }, + { + "epoch": 0.9514666666666667, + "grad_norm": 0.4961187437951281, + "learning_rate": 1.2338706790069431e-06, + "loss": 0.7463, + "step": 1784 + }, + { + "epoch": 0.952, + "grad_norm": 0.5448542537532, + "learning_rate": 1.2069560259243328e-06, + "loss": 0.7914, + "step": 1785 + }, + { + "epoch": 0.9525333333333333, + "grad_norm": 0.3958449956876178, + "learning_rate": 1.1803363838667092e-06, + "loss": 0.6833, + "step": 1786 + }, + { + "epoch": 0.9530666666666666, + "grad_norm": 0.402377124921106, + "learning_rate": 1.1540118323243865e-06, + "loss": 0.6247, + "step": 1787 + }, + { + "epoch": 0.9536, + "grad_norm": 0.4151752614619234, + "learning_rate": 1.1279824499064396e-06, + "loss": 0.6616, + "step": 1788 + }, + { + "epoch": 0.9541333333333334, + "grad_norm": 0.4802399299257456, + "learning_rate": 1.1022483143405705e-06, + "loss": 0.7318, + "step": 1789 + }, + { + "epoch": 0.9546666666666667, + "grad_norm": 0.3979602633245147, + "learning_rate": 1.076809502472831e-06, + "loss": 0.6792, + "step": 1790 + }, + { + "epoch": 0.9552, + "grad_norm": 0.4669863303336634, + "learning_rate": 1.0516660902673448e-06, + "loss": 0.7233, + "step": 1791 + }, + { + "epoch": 0.9557333333333333, + "grad_norm": 0.5489679394586826, + "learning_rate": 1.0268181528061749e-06, + "loss": 0.6343, + "step": 1792 + }, + { + "epoch": 0.9562666666666667, + "grad_norm": 0.5123990189343095, + "learning_rate": 1.0022657642890231e-06, + "loss": 0.6827, + "step": 1793 + }, + { + "epoch": 0.9568, + "grad_norm": 0.5575525400757082, + "learning_rate": 9.780089980330642e-07, + "loss": 0.7718, + "step": 1794 + }, + { + "epoch": 0.9573333333333334, + "grad_norm": 0.5732715043210574, + "learning_rate": 9.540479264726676e-07, + "loss": 0.7825, + "step": 1795 + }, + { + "epoch": 0.9578666666666666, + "grad_norm": 0.5111553239009662, + "learning_rate": 9.303826211592315e-07, + "loss": 0.6764, + "step": 1796 + }, + { + "epoch": 0.9584, + "grad_norm": 0.47616444580861356, + "learning_rate": 9.070131527609604e-07, + "loss": 0.6491, + "step": 1797 + }, + { + "epoch": 0.9589333333333333, + "grad_norm": 0.46051172278486796, + "learning_rate": 8.839395910626213e-07, + "loss": 0.6965, + "step": 1798 + }, + { + "epoch": 0.9594666666666667, + "grad_norm": 0.5059183466848223, + "learning_rate": 8.611620049653879e-07, + "loss": 0.731, + "step": 1799 + }, + { + "epoch": 0.96, + "grad_norm": 0.49106201960409346, + "learning_rate": 8.386804624865851e-07, + "loss": 0.6666, + "step": 1800 + }, + { + "epoch": 0.9605333333333334, + "grad_norm": 0.5782792211532685, + "learning_rate": 8.16495030759501e-07, + "loss": 0.7777, + "step": 1801 + }, + { + "epoch": 0.9610666666666666, + "grad_norm": 0.5543293060255573, + "learning_rate": 7.946057760332193e-07, + "loss": 0.7129, + "step": 1802 + }, + { + "epoch": 0.9616, + "grad_norm": 0.5528950661278335, + "learning_rate": 7.730127636723539e-07, + "loss": 0.7632, + "step": 1803 + }, + { + "epoch": 0.9621333333333333, + "grad_norm": 0.3778988008025402, + "learning_rate": 7.517160581569372e-07, + "loss": 0.56, + "step": 1804 + }, + { + "epoch": 0.9626666666666667, + "grad_norm": 0.4139560921395551, + "learning_rate": 7.307157230821426e-07, + "loss": 0.6697, + "step": 1805 + }, + { + "epoch": 0.9632, + "grad_norm": 0.5485450536674931, + "learning_rate": 7.100118211581852e-07, + "loss": 0.6698, + "step": 1806 + }, + { + "epoch": 0.9637333333333333, + "grad_norm": 0.44495726703478017, + "learning_rate": 6.896044142100433e-07, + "loss": 0.6781, + "step": 1807 + }, + { + "epoch": 0.9642666666666667, + "grad_norm": 0.6346426038661458, + "learning_rate": 6.694935631773258e-07, + "loss": 0.7519, + "step": 1808 + }, + { + "epoch": 0.9648, + "grad_norm": 0.435310939222078, + "learning_rate": 6.496793281141056e-07, + "loss": 0.6706, + "step": 1809 + }, + { + "epoch": 0.9653333333333334, + "grad_norm": 0.4741403213511459, + "learning_rate": 6.301617681886863e-07, + "loss": 0.685, + "step": 1810 + }, + { + "epoch": 0.9658666666666667, + "grad_norm": 0.47941646677875055, + "learning_rate": 6.109409416834688e-07, + "loss": 0.7037, + "step": 1811 + }, + { + "epoch": 0.9664, + "grad_norm": 0.5271890177485758, + "learning_rate": 5.920169059947411e-07, + "loss": 0.7678, + "step": 1812 + }, + { + "epoch": 0.9669333333333333, + "grad_norm": 0.5799828402558466, + "learning_rate": 5.733897176325665e-07, + "loss": 0.7509, + "step": 1813 + }, + { + "epoch": 0.9674666666666667, + "grad_norm": 0.49652524125300823, + "learning_rate": 5.550594322205504e-07, + "loss": 0.7167, + "step": 1814 + }, + { + "epoch": 0.968, + "grad_norm": 0.5713359605247931, + "learning_rate": 5.370261044956971e-07, + "loss": 0.7461, + "step": 1815 + }, + { + "epoch": 0.9685333333333334, + "grad_norm": 0.5331404008804096, + "learning_rate": 5.192897883082747e-07, + "loss": 0.8511, + "step": 1816 + }, + { + "epoch": 0.9690666666666666, + "grad_norm": 0.48242135683654225, + "learning_rate": 5.018505366216175e-07, + "loss": 0.6862, + "step": 1817 + }, + { + "epoch": 0.9696, + "grad_norm": 0.4957894244323012, + "learning_rate": 4.847084015119574e-07, + "loss": 0.6923, + "step": 1818 + }, + { + "epoch": 0.9701333333333333, + "grad_norm": 0.3579312113376426, + "learning_rate": 4.678634341683252e-07, + "loss": 0.5774, + "step": 1819 + }, + { + "epoch": 0.9706666666666667, + "grad_norm": 0.41496835638705143, + "learning_rate": 4.5131568489236166e-07, + "loss": 0.6858, + "step": 1820 + }, + { + "epoch": 0.9712, + "grad_norm": 0.4641364341036602, + "learning_rate": 4.3506520309813947e-07, + "loss": 0.6982, + "step": 1821 + }, + { + "epoch": 0.9717333333333333, + "grad_norm": 0.47077736559547806, + "learning_rate": 4.191120373120749e-07, + "loss": 0.6239, + "step": 1822 + }, + { + "epoch": 0.9722666666666666, + "grad_norm": 0.5389246898448312, + "learning_rate": 4.034562351727389e-07, + "loss": 0.7296, + "step": 1823 + }, + { + "epoch": 0.9728, + "grad_norm": 0.5032207629100807, + "learning_rate": 3.8809784343072366e-07, + "loss": 0.6301, + "step": 1824 + }, + { + "epoch": 0.9733333333333334, + "grad_norm": 0.4950965243486184, + "learning_rate": 3.73036907948543e-07, + "loss": 0.7424, + "step": 1825 + }, + { + "epoch": 0.9738666666666667, + "grad_norm": 1.3905925807886583, + "learning_rate": 3.582734737004101e-07, + "loss": 0.7117, + "step": 1826 + }, + { + "epoch": 0.9744, + "grad_norm": 0.4544486959060869, + "learning_rate": 3.4380758477219333e-07, + "loss": 0.686, + "step": 1827 + }, + { + "epoch": 0.9749333333333333, + "grad_norm": 0.43078132109881695, + "learning_rate": 3.296392843612273e-07, + "loss": 0.6374, + "step": 1828 + }, + { + "epoch": 0.9754666666666667, + "grad_norm": 0.4541643311828197, + "learning_rate": 3.1576861477621287e-07, + "loss": 0.7426, + "step": 1829 + }, + { + "epoch": 0.976, + "grad_norm": 0.46086034918164714, + "learning_rate": 3.0219561743707326e-07, + "loss": 0.6856, + "step": 1830 + }, + { + "epoch": 0.9765333333333334, + "grad_norm": 0.4963135460344512, + "learning_rate": 2.889203328748424e-07, + "loss": 0.6885, + "step": 1831 + }, + { + "epoch": 0.9770666666666666, + "grad_norm": 0.5880509527579628, + "learning_rate": 2.759428007315212e-07, + "loss": 0.7538, + "step": 1832 + }, + { + "epoch": 0.9776, + "grad_norm": 0.45817989964313643, + "learning_rate": 2.6326305976001055e-07, + "loss": 0.6883, + "step": 1833 + }, + { + "epoch": 0.9781333333333333, + "grad_norm": 0.47083292914551456, + "learning_rate": 2.5088114782392257e-07, + "loss": 0.7467, + "step": 1834 + }, + { + "epoch": 0.9786666666666667, + "grad_norm": 0.4270945612153523, + "learning_rate": 2.3879710189753656e-07, + "loss": 0.6991, + "step": 1835 + }, + { + "epoch": 0.9792, + "grad_norm": 0.43395210880349344, + "learning_rate": 2.2701095806565432e-07, + "loss": 0.6581, + "step": 1836 + }, + { + "epoch": 0.9797333333333333, + "grad_norm": 0.5120487977928043, + "learning_rate": 2.15522751523467e-07, + "loss": 0.7571, + "step": 1837 + }, + { + "epoch": 0.9802666666666666, + "grad_norm": 0.5426408727132688, + "learning_rate": 2.0433251657653308e-07, + "loss": 0.7361, + "step": 1838 + }, + { + "epoch": 0.9808, + "grad_norm": 0.5048709941315147, + "learning_rate": 1.9344028664056713e-07, + "loss": 0.6776, + "step": 1839 + }, + { + "epoch": 0.9813333333333333, + "grad_norm": 0.4406697455325257, + "learning_rate": 1.8284609424142895e-07, + "loss": 0.6452, + "step": 1840 + }, + { + "epoch": 0.9818666666666667, + "grad_norm": 0.49960312933395784, + "learning_rate": 1.7254997101500137e-07, + "loss": 0.7075, + "step": 1841 + }, + { + "epoch": 0.9824, + "grad_norm": 0.4827963100118666, + "learning_rate": 1.6255194770704586e-07, + "loss": 0.677, + "step": 1842 + }, + { + "epoch": 0.9829333333333333, + "grad_norm": 0.5025373556902175, + "learning_rate": 1.5285205417319149e-07, + "loss": 0.7196, + "step": 1843 + }, + { + "epoch": 0.9834666666666667, + "grad_norm": 0.4193837396130738, + "learning_rate": 1.4345031937879062e-07, + "loss": 0.6838, + "step": 1844 + }, + { + "epoch": 0.984, + "grad_norm": 0.44675396034287024, + "learning_rate": 1.3434677139885222e-07, + "loss": 0.7201, + "step": 1845 + }, + { + "epoch": 0.9845333333333334, + "grad_norm": 0.4900721781740654, + "learning_rate": 1.255414374179531e-07, + "loss": 0.6876, + "step": 1846 + }, + { + "epoch": 0.9850666666666666, + "grad_norm": 0.4506382278303298, + "learning_rate": 1.170343437301491e-07, + "loss": 0.7381, + "step": 1847 + }, + { + "epoch": 0.9856, + "grad_norm": 0.34987146431543165, + "learning_rate": 1.0882551573891953e-07, + "loss": 0.578, + "step": 1848 + }, + { + "epoch": 0.9861333333333333, + "grad_norm": 0.5034243662178048, + "learning_rate": 1.0091497795706728e-07, + "loss": 0.7319, + "step": 1849 + }, + { + "epoch": 0.9866666666666667, + "grad_norm": 0.4227055102320483, + "learning_rate": 9.330275400666332e-08, + "loss": 0.6679, + "step": 1850 + }, + { + "epoch": 0.9872, + "grad_norm": 0.45871344074784653, + "learning_rate": 8.598886661895788e-08, + "loss": 0.7228, + "step": 1851 + }, + { + "epoch": 0.9877333333333334, + "grad_norm": 0.57147546388469, + "learning_rate": 7.8973337634336e-08, + "loss": 0.7528, + "step": 1852 + }, + { + "epoch": 0.9882666666666666, + "grad_norm": 0.58725852317797, + "learning_rate": 7.225618800222877e-08, + "loss": 0.755, + "step": 1853 + }, + { + "epoch": 0.9888, + "grad_norm": 0.39234786416075723, + "learning_rate": 6.583743778106887e-08, + "loss": 0.6112, + "step": 1854 + }, + { + "epoch": 0.9893333333333333, + "grad_norm": 0.4341603371983377, + "learning_rate": 5.971710613821291e-08, + "loss": 0.6532, + "step": 1855 + }, + { + "epoch": 0.9898666666666667, + "grad_norm": 0.46665268783732106, + "learning_rate": 5.389521134989695e-08, + "loss": 0.7195, + "step": 1856 + }, + { + "epoch": 0.9904, + "grad_norm": 0.4343399933377864, + "learning_rate": 4.837177080119215e-08, + "loss": 0.6428, + "step": 1857 + }, + { + "epoch": 0.9909333333333333, + "grad_norm": 0.5401607092202143, + "learning_rate": 4.314680098592705e-08, + "loss": 0.7836, + "step": 1858 + }, + { + "epoch": 0.9914666666666667, + "grad_norm": 0.48561638655807204, + "learning_rate": 3.8220317506654226e-08, + "loss": 0.7433, + "step": 1859 + }, + { + "epoch": 0.992, + "grad_norm": 0.5705619733589412, + "learning_rate": 3.359233507459481e-08, + "loss": 0.7619, + "step": 1860 + }, + { + "epoch": 0.9925333333333334, + "grad_norm": 0.4723919850354279, + "learning_rate": 2.9262867509605163e-08, + "loss": 0.6844, + "step": 1861 + }, + { + "epoch": 0.9930666666666667, + "grad_norm": 0.6641172197701478, + "learning_rate": 2.5231927740154704e-08, + "loss": 0.8525, + "step": 1862 + }, + { + "epoch": 0.9936, + "grad_norm": 0.49334083555221586, + "learning_rate": 2.1499527803214846e-08, + "loss": 0.7167, + "step": 1863 + }, + { + "epoch": 0.9941333333333333, + "grad_norm": 0.4676196339068881, + "learning_rate": 1.8065678844314538e-08, + "loss": 0.6979, + "step": 1864 + }, + { + "epoch": 0.9946666666666667, + "grad_norm": 0.7208173416387388, + "learning_rate": 1.4930391117451426e-08, + "loss": 0.8211, + "step": 1865 + }, + { + "epoch": 0.9952, + "grad_norm": 0.47014724542518643, + "learning_rate": 1.209367398504746e-08, + "loss": 0.6786, + "step": 1866 + }, + { + "epoch": 0.9957333333333334, + "grad_norm": 0.8022893409373152, + "learning_rate": 9.555535917993297e-09, + "loss": 0.8995, + "step": 1867 + }, + { + "epoch": 0.9962666666666666, + "grad_norm": 0.6701439219566838, + "learning_rate": 7.315984495548378e-09, + "loss": 0.8955, + "step": 1868 + }, + { + "epoch": 0.9968, + "grad_norm": 0.43662100159815975, + "learning_rate": 5.375026405352035e-09, + "loss": 0.6931, + "step": 1869 + }, + { + "epoch": 0.9973333333333333, + "grad_norm": 0.5397248237775718, + "learning_rate": 3.732667443390181e-09, + "loss": 0.6742, + "step": 1870 + }, + { + "epoch": 0.9978666666666667, + "grad_norm": 0.4175834153515395, + "learning_rate": 2.388912514017516e-09, + "loss": 0.6579, + "step": 1871 + }, + { + "epoch": 0.9984, + "grad_norm": 0.42630390827681963, + "learning_rate": 1.3437656298687097e-09, + "loss": 0.6214, + "step": 1872 + }, + { + "epoch": 0.9989333333333333, + "grad_norm": 0.4994756342367005, + "learning_rate": 5.972299119250125e-10, + "loss": 0.7364, + "step": 1873 + }, + { + "epoch": 0.9994666666666666, + "grad_norm": 0.4771620720318906, + "learning_rate": 1.4930758944764479e-10, + "loss": 0.7127, + "step": 1874 + }, + { + "epoch": 1.0, + "grad_norm": 0.4090006633917449, + "learning_rate": 0.0, + "loss": 0.6534, + "step": 1875 + }, + { + "epoch": 1.0, + "step": 1875, + "total_flos": 1522821307072512.0, + "train_loss": 0.7911884547869364, + "train_runtime": 28059.907, + "train_samples_per_second": 1.069, + "train_steps_per_second": 0.067 + } + ], + "logging_steps": 1.0, + "max_steps": 1875, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1522821307072512.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_2_epochs_1_GA_2_lora/README.md b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_2_epochs_1_GA_2_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_2_epochs_1_GA_2_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_2_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_2_epochs_1_GA_2_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..06461bad72253ec3f573313ab1262f14be032f30 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_2_epochs_1_GA_2_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "up_proj", + "v_proj", + "k_proj", + "down_proj", + "gate_proj", + "o_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ddaf7c2b1b57ed0830d8135342e9e50886de2f53 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7fdcc810a4d0279ffcc37f0eb3cef201b38a742e5705de63aec05fbeaa6fedec +size 671150064 diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_2_epochs_1_GA_2_lora/config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_2_epochs_1_GA_2_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_2_epochs_1_GA_2_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..7b8f3008a98b36d12f915dbd60226f676e8ba091 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:201058e5be6f78f9b81917e024d5fcc86aa13da0e17b59942fca6ad12920fac6 +size 918507402 diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_2_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_2_epochs_1_GA_2_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..b0cc5d7d6ac85fdf80193e4cf5a62172db1a38d2 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_2_epochs_1_GA_2_lora/trainer_state.json @@ -0,0 +1,13167 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 1875, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005333333333333334, + "grad_norm": 0.75233063420523, + "learning_rate": 3.5087719298245615e-06, + "loss": 1.2255, + "step": 1 + }, + { + "epoch": 0.0010666666666666667, + "grad_norm": 1.0166309192256069, + "learning_rate": 7.017543859649123e-06, + "loss": 1.1534, + "step": 2 + }, + { + "epoch": 0.0016, + "grad_norm": 1.132417232096234, + "learning_rate": 1.0526315789473684e-05, + "loss": 1.2803, + "step": 3 + }, + { + "epoch": 0.0021333333333333334, + "grad_norm": 0.9352205978088273, + "learning_rate": 1.4035087719298246e-05, + "loss": 1.3408, + "step": 4 + }, + { + "epoch": 0.0026666666666666666, + "grad_norm": 0.8267363258300194, + "learning_rate": 1.7543859649122806e-05, + "loss": 1.353, + "step": 5 + }, + { + "epoch": 0.0032, + "grad_norm": 0.8607407611901363, + "learning_rate": 2.105263157894737e-05, + "loss": 1.2463, + "step": 6 + }, + { + "epoch": 0.0037333333333333333, + "grad_norm": 0.7587680789518995, + "learning_rate": 2.456140350877193e-05, + "loss": 1.2031, + "step": 7 + }, + { + "epoch": 0.004266666666666667, + "grad_norm": 0.7229754022476446, + "learning_rate": 2.8070175438596492e-05, + "loss": 1.1776, + "step": 8 + }, + { + "epoch": 0.0048, + "grad_norm": 0.7072269871649532, + "learning_rate": 3.157894736842105e-05, + "loss": 1.1774, + "step": 9 + }, + { + "epoch": 0.005333333333333333, + "grad_norm": 0.7979191350531996, + "learning_rate": 3.508771929824561e-05, + "loss": 1.2332, + "step": 10 + }, + { + "epoch": 0.005866666666666667, + "grad_norm": 0.732784118427125, + "learning_rate": 3.859649122807018e-05, + "loss": 1.0808, + "step": 11 + }, + { + "epoch": 0.0064, + "grad_norm": 0.8714380275232437, + "learning_rate": 4.210526315789474e-05, + "loss": 1.1643, + "step": 12 + }, + { + "epoch": 0.006933333333333333, + "grad_norm": 0.7759061513691938, + "learning_rate": 4.56140350877193e-05, + "loss": 1.045, + "step": 13 + }, + { + "epoch": 0.007466666666666667, + "grad_norm": 0.6991535365961644, + "learning_rate": 4.912280701754386e-05, + "loss": 0.9502, + "step": 14 + }, + { + "epoch": 0.008, + "grad_norm": 1.1604696086280564, + "learning_rate": 5.2631578947368424e-05, + "loss": 1.167, + "step": 15 + }, + { + "epoch": 0.008533333333333334, + "grad_norm": 0.7315881408984929, + "learning_rate": 5.6140350877192984e-05, + "loss": 1.0043, + "step": 16 + }, + { + "epoch": 0.009066666666666667, + "grad_norm": 0.8424740742961367, + "learning_rate": 5.9649122807017544e-05, + "loss": 1.1145, + "step": 17 + }, + { + "epoch": 0.0096, + "grad_norm": 0.7494124445848623, + "learning_rate": 6.31578947368421e-05, + "loss": 1.0957, + "step": 18 + }, + { + "epoch": 0.010133333333333333, + "grad_norm": 0.6606564603126561, + "learning_rate": 6.666666666666667e-05, + "loss": 0.8768, + "step": 19 + }, + { + "epoch": 0.010666666666666666, + "grad_norm": 0.8399242186106046, + "learning_rate": 7.017543859649122e-05, + "loss": 1.0244, + "step": 20 + }, + { + "epoch": 0.0112, + "grad_norm": 0.7826675090493671, + "learning_rate": 7.368421052631579e-05, + "loss": 1.0998, + "step": 21 + }, + { + "epoch": 0.011733333333333333, + "grad_norm": 0.5583109291998765, + "learning_rate": 7.719298245614036e-05, + "loss": 0.8725, + "step": 22 + }, + { + "epoch": 0.012266666666666667, + "grad_norm": 0.6256868157176779, + "learning_rate": 8.070175438596491e-05, + "loss": 1.0145, + "step": 23 + }, + { + "epoch": 0.0128, + "grad_norm": 0.7795845544031471, + "learning_rate": 8.421052631578948e-05, + "loss": 1.0163, + "step": 24 + }, + { + "epoch": 0.013333333333333334, + "grad_norm": 0.5955611793656965, + "learning_rate": 8.771929824561403e-05, + "loss": 1.0053, + "step": 25 + }, + { + "epoch": 0.013866666666666666, + "grad_norm": 0.4745775961075087, + "learning_rate": 9.12280701754386e-05, + "loss": 0.8087, + "step": 26 + }, + { + "epoch": 0.0144, + "grad_norm": 0.7200636042605665, + "learning_rate": 9.473684210526316e-05, + "loss": 0.9417, + "step": 27 + }, + { + "epoch": 0.014933333333333333, + "grad_norm": 0.6711680178676414, + "learning_rate": 9.824561403508771e-05, + "loss": 0.9923, + "step": 28 + }, + { + "epoch": 0.015466666666666667, + "grad_norm": 0.5953871452449566, + "learning_rate": 0.0001017543859649123, + "loss": 0.9618, + "step": 29 + }, + { + "epoch": 0.016, + "grad_norm": 0.6178325939358567, + "learning_rate": 0.00010526315789473685, + "loss": 0.9101, + "step": 30 + }, + { + "epoch": 0.016533333333333334, + "grad_norm": 0.6040273680854279, + "learning_rate": 0.00010877192982456141, + "loss": 0.9319, + "step": 31 + }, + { + "epoch": 0.017066666666666667, + "grad_norm": 0.46179099610956637, + "learning_rate": 0.00011228070175438597, + "loss": 0.7867, + "step": 32 + }, + { + "epoch": 0.0176, + "grad_norm": 0.7770792886943544, + "learning_rate": 0.00011578947368421053, + "loss": 1.0039, + "step": 33 + }, + { + "epoch": 0.018133333333333335, + "grad_norm": 0.6762096229135146, + "learning_rate": 0.00011929824561403509, + "loss": 1.006, + "step": 34 + }, + { + "epoch": 0.018666666666666668, + "grad_norm": 0.5679138696476846, + "learning_rate": 0.00012280701754385965, + "loss": 0.9809, + "step": 35 + }, + { + "epoch": 0.0192, + "grad_norm": 0.733382199403288, + "learning_rate": 0.0001263157894736842, + "loss": 0.8995, + "step": 36 + }, + { + "epoch": 0.019733333333333332, + "grad_norm": 0.7190720122239778, + "learning_rate": 0.0001298245614035088, + "loss": 0.984, + "step": 37 + }, + { + "epoch": 0.020266666666666665, + "grad_norm": 0.5398148623052902, + "learning_rate": 0.00013333333333333334, + "loss": 0.8568, + "step": 38 + }, + { + "epoch": 0.0208, + "grad_norm": 0.6012517612506856, + "learning_rate": 0.0001368421052631579, + "loss": 0.9214, + "step": 39 + }, + { + "epoch": 0.021333333333333333, + "grad_norm": 0.5024500935278182, + "learning_rate": 0.00014035087719298245, + "loss": 0.8186, + "step": 40 + }, + { + "epoch": 0.021866666666666666, + "grad_norm": 0.5079206358540236, + "learning_rate": 0.00014385964912280703, + "loss": 0.789, + "step": 41 + }, + { + "epoch": 0.0224, + "grad_norm": 0.5774736050190657, + "learning_rate": 0.00014736842105263158, + "loss": 0.9261, + "step": 42 + }, + { + "epoch": 0.022933333333333333, + "grad_norm": 0.608909561912015, + "learning_rate": 0.00015087719298245616, + "loss": 0.9369, + "step": 43 + }, + { + "epoch": 0.023466666666666667, + "grad_norm": 0.655766067594755, + "learning_rate": 0.0001543859649122807, + "loss": 0.9121, + "step": 44 + }, + { + "epoch": 0.024, + "grad_norm": 0.561485841015699, + "learning_rate": 0.00015789473684210527, + "loss": 0.9179, + "step": 45 + }, + { + "epoch": 0.024533333333333334, + "grad_norm": 0.544693161395086, + "learning_rate": 0.00016140350877192982, + "loss": 0.8253, + "step": 46 + }, + { + "epoch": 0.025066666666666668, + "grad_norm": 0.7240102546369898, + "learning_rate": 0.0001649122807017544, + "loss": 0.9902, + "step": 47 + }, + { + "epoch": 0.0256, + "grad_norm": 0.6136903666564417, + "learning_rate": 0.00016842105263157895, + "loss": 0.9077, + "step": 48 + }, + { + "epoch": 0.026133333333333335, + "grad_norm": 0.7395175295085202, + "learning_rate": 0.00017192982456140353, + "loss": 0.985, + "step": 49 + }, + { + "epoch": 0.02666666666666667, + "grad_norm": 0.42343314793329256, + "learning_rate": 0.00017543859649122806, + "loss": 0.746, + "step": 50 + }, + { + "epoch": 0.0272, + "grad_norm": 0.7340484577277879, + "learning_rate": 0.00017894736842105264, + "loss": 1.0382, + "step": 51 + }, + { + "epoch": 0.027733333333333332, + "grad_norm": 0.8434600088153975, + "learning_rate": 0.0001824561403508772, + "loss": 1.0219, + "step": 52 + }, + { + "epoch": 0.028266666666666666, + "grad_norm": 0.5230606202440629, + "learning_rate": 0.00018596491228070177, + "loss": 0.8054, + "step": 53 + }, + { + "epoch": 0.0288, + "grad_norm": 0.5088409387329998, + "learning_rate": 0.00018947368421052632, + "loss": 0.8681, + "step": 54 + }, + { + "epoch": 0.029333333333333333, + "grad_norm": 0.612321126960915, + "learning_rate": 0.00019298245614035088, + "loss": 0.8984, + "step": 55 + }, + { + "epoch": 0.029866666666666666, + "grad_norm": 0.48597178715632633, + "learning_rate": 0.00019649122807017543, + "loss": 0.861, + "step": 56 + }, + { + "epoch": 0.0304, + "grad_norm": 0.49815647600195784, + "learning_rate": 0.0002, + "loss": 0.8282, + "step": 57 + }, + { + "epoch": 0.030933333333333334, + "grad_norm": 0.5957716890357231, + "learning_rate": 0.00019999985069241055, + "loss": 0.8735, + "step": 58 + }, + { + "epoch": 0.031466666666666664, + "grad_norm": 0.5712024806738941, + "learning_rate": 0.00019999940277008808, + "loss": 0.8615, + "step": 59 + }, + { + "epoch": 0.032, + "grad_norm": 0.5149023413528725, + "learning_rate": 0.00019999865623437013, + "loss": 0.865, + "step": 60 + }, + { + "epoch": 0.03253333333333333, + "grad_norm": 0.6329754633640781, + "learning_rate": 0.00019999761108748597, + "loss": 0.9009, + "step": 61 + }, + { + "epoch": 0.03306666666666667, + "grad_norm": 0.6750849329772663, + "learning_rate": 0.00019999626733255662, + "loss": 1.0187, + "step": 62 + }, + { + "epoch": 0.0336, + "grad_norm": 0.7041355749011133, + "learning_rate": 0.00019999462497359466, + "loss": 0.9702, + "step": 63 + }, + { + "epoch": 0.034133333333333335, + "grad_norm": 0.6514821462486268, + "learning_rate": 0.00019999268401550447, + "loss": 0.9102, + "step": 64 + }, + { + "epoch": 0.034666666666666665, + "grad_norm": 0.6236717189775234, + "learning_rate": 0.000199990444464082, + "loss": 0.9897, + "step": 65 + }, + { + "epoch": 0.0352, + "grad_norm": 0.5320977040159945, + "learning_rate": 0.00019998790632601496, + "loss": 0.9185, + "step": 66 + }, + { + "epoch": 0.03573333333333333, + "grad_norm": 0.5634172509699379, + "learning_rate": 0.00019998506960888256, + "loss": 0.9061, + "step": 67 + }, + { + "epoch": 0.03626666666666667, + "grad_norm": 0.5067857180737967, + "learning_rate": 0.00019998193432115572, + "loss": 0.714, + "step": 68 + }, + { + "epoch": 0.0368, + "grad_norm": 0.8713818108473501, + "learning_rate": 0.0001999785004721968, + "loss": 0.9746, + "step": 69 + }, + { + "epoch": 0.037333333333333336, + "grad_norm": 0.6691172402828967, + "learning_rate": 0.00019997476807225985, + "loss": 0.8702, + "step": 70 + }, + { + "epoch": 0.037866666666666667, + "grad_norm": 0.5715105590453465, + "learning_rate": 0.0001999707371324904, + "loss": 0.8766, + "step": 71 + }, + { + "epoch": 0.0384, + "grad_norm": 0.7245869944292714, + "learning_rate": 0.00019996640766492543, + "loss": 1.0353, + "step": 72 + }, + { + "epoch": 0.038933333333333334, + "grad_norm": 0.5718973923822783, + "learning_rate": 0.00019996177968249334, + "loss": 0.8917, + "step": 73 + }, + { + "epoch": 0.039466666666666664, + "grad_norm": 0.5430722243096373, + "learning_rate": 0.0001999568531990141, + "loss": 0.8779, + "step": 74 + }, + { + "epoch": 0.04, + "grad_norm": 0.6601605568954748, + "learning_rate": 0.00019995162822919883, + "loss": 0.987, + "step": 75 + }, + { + "epoch": 0.04053333333333333, + "grad_norm": 0.5605403622572899, + "learning_rate": 0.00019994610478865011, + "loss": 0.8821, + "step": 76 + }, + { + "epoch": 0.04106666666666667, + "grad_norm": 0.5418175502646891, + "learning_rate": 0.0001999402828938618, + "loss": 0.7718, + "step": 77 + }, + { + "epoch": 0.0416, + "grad_norm": 0.6293702158665475, + "learning_rate": 0.00019993416256221895, + "loss": 0.9803, + "step": 78 + }, + { + "epoch": 0.042133333333333335, + "grad_norm": 0.6507407167279051, + "learning_rate": 0.00019992774381199778, + "loss": 0.9547, + "step": 79 + }, + { + "epoch": 0.042666666666666665, + "grad_norm": 0.6484754200572854, + "learning_rate": 0.00019992102666236566, + "loss": 0.9405, + "step": 80 + }, + { + "epoch": 0.0432, + "grad_norm": 0.884928118391995, + "learning_rate": 0.00019991401113338104, + "loss": 0.8783, + "step": 81 + }, + { + "epoch": 0.04373333333333333, + "grad_norm": 0.6908795419087032, + "learning_rate": 0.00019990669724599336, + "loss": 0.9867, + "step": 82 + }, + { + "epoch": 0.04426666666666667, + "grad_norm": 0.5824682851756094, + "learning_rate": 0.00019989908502204292, + "loss": 0.9331, + "step": 83 + }, + { + "epoch": 0.0448, + "grad_norm": 0.6316813393610857, + "learning_rate": 0.00019989117448426108, + "loss": 0.975, + "step": 84 + }, + { + "epoch": 0.04533333333333334, + "grad_norm": 0.5352489975081022, + "learning_rate": 0.00019988296565626987, + "loss": 0.8376, + "step": 85 + }, + { + "epoch": 0.04586666666666667, + "grad_norm": 0.47309528931486894, + "learning_rate": 0.00019987445856258206, + "loss": 0.8269, + "step": 86 + }, + { + "epoch": 0.0464, + "grad_norm": 0.5246505586243143, + "learning_rate": 0.00019986565322860115, + "loss": 0.9217, + "step": 87 + }, + { + "epoch": 0.046933333333333334, + "grad_norm": 0.4841678812444819, + "learning_rate": 0.00019985654968062122, + "loss": 0.8478, + "step": 88 + }, + { + "epoch": 0.047466666666666664, + "grad_norm": 0.4966241873726176, + "learning_rate": 0.00019984714794582683, + "loss": 0.85, + "step": 89 + }, + { + "epoch": 0.048, + "grad_norm": 0.5248880201307413, + "learning_rate": 0.00019983744805229296, + "loss": 0.8825, + "step": 90 + }, + { + "epoch": 0.04853333333333333, + "grad_norm": 0.5061920170969972, + "learning_rate": 0.000199827450028985, + "loss": 0.8386, + "step": 91 + }, + { + "epoch": 0.04906666666666667, + "grad_norm": 0.6017734401928818, + "learning_rate": 0.00019981715390575858, + "loss": 0.8824, + "step": 92 + }, + { + "epoch": 0.0496, + "grad_norm": 0.7623650810154796, + "learning_rate": 0.00019980655971335945, + "loss": 1.0427, + "step": 93 + }, + { + "epoch": 0.050133333333333335, + "grad_norm": 0.6642439159610598, + "learning_rate": 0.00019979566748342347, + "loss": 0.8763, + "step": 94 + }, + { + "epoch": 0.050666666666666665, + "grad_norm": 0.49909208678794403, + "learning_rate": 0.00019978447724847652, + "loss": 0.832, + "step": 95 + }, + { + "epoch": 0.0512, + "grad_norm": 0.46393029258690244, + "learning_rate": 0.00019977298904193437, + "loss": 0.8565, + "step": 96 + }, + { + "epoch": 0.05173333333333333, + "grad_norm": 0.5883008873326704, + "learning_rate": 0.00019976120289810247, + "loss": 1.044, + "step": 97 + }, + { + "epoch": 0.05226666666666667, + "grad_norm": 0.49221859513746163, + "learning_rate": 0.00019974911885217608, + "loss": 0.8893, + "step": 98 + }, + { + "epoch": 0.0528, + "grad_norm": 0.5871360596459384, + "learning_rate": 0.00019973673694024, + "loss": 0.8823, + "step": 99 + }, + { + "epoch": 0.05333333333333334, + "grad_norm": 0.5563645290456422, + "learning_rate": 0.0001997240571992685, + "loss": 0.8756, + "step": 100 + }, + { + "epoch": 0.05386666666666667, + "grad_norm": 0.5456951817526335, + "learning_rate": 0.00019971107966712518, + "loss": 0.8443, + "step": 101 + }, + { + "epoch": 0.0544, + "grad_norm": 0.5868406983232394, + "learning_rate": 0.00019969780438256293, + "loss": 0.9256, + "step": 102 + }, + { + "epoch": 0.054933333333333334, + "grad_norm": 0.6397539530587217, + "learning_rate": 0.0001996842313852238, + "loss": 1.0257, + "step": 103 + }, + { + "epoch": 0.055466666666666664, + "grad_norm": 0.48140395333780067, + "learning_rate": 0.00019967036071563877, + "loss": 0.7974, + "step": 104 + }, + { + "epoch": 0.056, + "grad_norm": 0.5224908629742993, + "learning_rate": 0.0001996561924152278, + "loss": 0.874, + "step": 105 + }, + { + "epoch": 0.05653333333333333, + "grad_norm": 0.6876692164439447, + "learning_rate": 0.0001996417265262996, + "loss": 0.9259, + "step": 106 + }, + { + "epoch": 0.05706666666666667, + "grad_norm": 0.558106466635048, + "learning_rate": 0.00019962696309205148, + "loss": 0.9256, + "step": 107 + }, + { + "epoch": 0.0576, + "grad_norm": 0.6866064916024611, + "learning_rate": 0.0001996119021565693, + "loss": 1.0133, + "step": 108 + }, + { + "epoch": 0.058133333333333335, + "grad_norm": 0.635731427860939, + "learning_rate": 0.0001995965437648273, + "loss": 0.9906, + "step": 109 + }, + { + "epoch": 0.058666666666666666, + "grad_norm": 0.4682780527874329, + "learning_rate": 0.00019958088796268793, + "loss": 0.8046, + "step": 110 + }, + { + "epoch": 0.0592, + "grad_norm": 0.5859345250297248, + "learning_rate": 0.0001995649347969019, + "loss": 0.9377, + "step": 111 + }, + { + "epoch": 0.05973333333333333, + "grad_norm": 0.6351657217287303, + "learning_rate": 0.00019954868431510764, + "loss": 0.9576, + "step": 112 + }, + { + "epoch": 0.06026666666666667, + "grad_norm": 0.6096188854419039, + "learning_rate": 0.00019953213656583168, + "loss": 0.9146, + "step": 113 + }, + { + "epoch": 0.0608, + "grad_norm": 0.6730316913770924, + "learning_rate": 0.00019951529159848805, + "loss": 0.948, + "step": 114 + }, + { + "epoch": 0.06133333333333333, + "grad_norm": 0.5963591112984713, + "learning_rate": 0.00019949814946337838, + "loss": 0.9112, + "step": 115 + }, + { + "epoch": 0.06186666666666667, + "grad_norm": 0.5243383010499729, + "learning_rate": 0.00019948071021169174, + "loss": 0.8733, + "step": 116 + }, + { + "epoch": 0.0624, + "grad_norm": 0.6218918641524634, + "learning_rate": 0.00019946297389550433, + "loss": 0.9845, + "step": 117 + }, + { + "epoch": 0.06293333333333333, + "grad_norm": 0.510062739859879, + "learning_rate": 0.00019944494056777946, + "loss": 0.8241, + "step": 118 + }, + { + "epoch": 0.06346666666666667, + "grad_norm": 0.5341151787626189, + "learning_rate": 0.00019942661028236745, + "loss": 0.8744, + "step": 119 + }, + { + "epoch": 0.064, + "grad_norm": 0.5059598231096198, + "learning_rate": 0.00019940798309400526, + "loss": 0.8382, + "step": 120 + }, + { + "epoch": 0.06453333333333333, + "grad_norm": 0.516674937056129, + "learning_rate": 0.00019938905905831654, + "loss": 0.8452, + "step": 121 + }, + { + "epoch": 0.06506666666666666, + "grad_norm": 0.5582815377012604, + "learning_rate": 0.00019936983823181132, + "loss": 0.8855, + "step": 122 + }, + { + "epoch": 0.0656, + "grad_norm": 0.7055345194572793, + "learning_rate": 0.0001993503206718859, + "loss": 0.9673, + "step": 123 + }, + { + "epoch": 0.06613333333333334, + "grad_norm": 0.8231949440189873, + "learning_rate": 0.00019933050643682269, + "loss": 1.015, + "step": 124 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 0.588128888793964, + "learning_rate": 0.00019931039558578997, + "loss": 0.8971, + "step": 125 + }, + { + "epoch": 0.0672, + "grad_norm": 0.6681449843660198, + "learning_rate": 0.00019928998817884182, + "loss": 0.9095, + "step": 126 + }, + { + "epoch": 0.06773333333333334, + "grad_norm": 0.5550207384306163, + "learning_rate": 0.00019926928427691786, + "loss": 0.9013, + "step": 127 + }, + { + "epoch": 0.06826666666666667, + "grad_norm": 0.524683912520627, + "learning_rate": 0.00019924828394184306, + "loss": 0.8574, + "step": 128 + }, + { + "epoch": 0.0688, + "grad_norm": 0.7748097181550015, + "learning_rate": 0.00019922698723632767, + "loss": 0.9668, + "step": 129 + }, + { + "epoch": 0.06933333333333333, + "grad_norm": 0.5283157806951922, + "learning_rate": 0.0001992053942239668, + "loss": 0.8686, + "step": 130 + }, + { + "epoch": 0.06986666666666666, + "grad_norm": 0.5616820687558489, + "learning_rate": 0.0001991835049692405, + "loss": 0.8975, + "step": 131 + }, + { + "epoch": 0.0704, + "grad_norm": 0.5054594593622966, + "learning_rate": 0.00019916131953751342, + "loss": 0.8434, + "step": 132 + }, + { + "epoch": 0.07093333333333333, + "grad_norm": 0.562220612042354, + "learning_rate": 0.0001991388379950346, + "loss": 0.7903, + "step": 133 + }, + { + "epoch": 0.07146666666666666, + "grad_norm": 0.5603148933465106, + "learning_rate": 0.0001991160604089374, + "loss": 0.8448, + "step": 134 + }, + { + "epoch": 0.072, + "grad_norm": 0.5569518546926293, + "learning_rate": 0.00019909298684723904, + "loss": 0.815, + "step": 135 + }, + { + "epoch": 0.07253333333333334, + "grad_norm": 0.5255991576983609, + "learning_rate": 0.00019906961737884077, + "loss": 0.8619, + "step": 136 + }, + { + "epoch": 0.07306666666666667, + "grad_norm": 0.47846179606609723, + "learning_rate": 0.00019904595207352737, + "loss": 0.705, + "step": 137 + }, + { + "epoch": 0.0736, + "grad_norm": 0.576217255091171, + "learning_rate": 0.00019902199100196697, + "loss": 0.9585, + "step": 138 + }, + { + "epoch": 0.07413333333333333, + "grad_norm": 0.5919574449406261, + "learning_rate": 0.000198997734235711, + "loss": 0.9191, + "step": 139 + }, + { + "epoch": 0.07466666666666667, + "grad_norm": 0.6072688062314806, + "learning_rate": 0.00019897318184719385, + "loss": 0.8699, + "step": 140 + }, + { + "epoch": 0.0752, + "grad_norm": 0.597010946898004, + "learning_rate": 0.00019894833390973266, + "loss": 0.846, + "step": 141 + }, + { + "epoch": 0.07573333333333333, + "grad_norm": 0.5809705626854813, + "learning_rate": 0.0001989231904975272, + "loss": 1.0107, + "step": 142 + }, + { + "epoch": 0.07626666666666666, + "grad_norm": 0.5932886371768736, + "learning_rate": 0.00019889775168565943, + "loss": 0.934, + "step": 143 + }, + { + "epoch": 0.0768, + "grad_norm": 0.6032409916519083, + "learning_rate": 0.00019887201755009357, + "loss": 0.9186, + "step": 144 + }, + { + "epoch": 0.07733333333333334, + "grad_norm": 0.5457310796425244, + "learning_rate": 0.00019884598816767563, + "loss": 0.8617, + "step": 145 + }, + { + "epoch": 0.07786666666666667, + "grad_norm": 0.476457612204842, + "learning_rate": 0.0001988196636161333, + "loss": 0.8619, + "step": 146 + }, + { + "epoch": 0.0784, + "grad_norm": 0.4799963625134939, + "learning_rate": 0.0001987930439740757, + "loss": 0.8132, + "step": 147 + }, + { + "epoch": 0.07893333333333333, + "grad_norm": 0.5463824490629723, + "learning_rate": 0.00019876612932099308, + "loss": 0.904, + "step": 148 + }, + { + "epoch": 0.07946666666666667, + "grad_norm": 0.58196447750242, + "learning_rate": 0.0001987389197372567, + "loss": 0.9932, + "step": 149 + }, + { + "epoch": 0.08, + "grad_norm": 0.4640382992323238, + "learning_rate": 0.00019871141530411853, + "loss": 0.7935, + "step": 150 + }, + { + "epoch": 0.08053333333333333, + "grad_norm": 0.5719287398508892, + "learning_rate": 0.00019868361610371097, + "loss": 0.8877, + "step": 151 + }, + { + "epoch": 0.08106666666666666, + "grad_norm": 0.5236602458110028, + "learning_rate": 0.00019865552221904665, + "loss": 0.8657, + "step": 152 + }, + { + "epoch": 0.0816, + "grad_norm": 0.6509209711208326, + "learning_rate": 0.0001986271337340182, + "loss": 0.9885, + "step": 153 + }, + { + "epoch": 0.08213333333333334, + "grad_norm": 0.5609187112002201, + "learning_rate": 0.00019859845073339787, + "loss": 0.8781, + "step": 154 + }, + { + "epoch": 0.08266666666666667, + "grad_norm": 0.492495205615016, + "learning_rate": 0.00019856947330283752, + "loss": 0.8598, + "step": 155 + }, + { + "epoch": 0.0832, + "grad_norm": 0.6043981736009975, + "learning_rate": 0.00019854020152886814, + "loss": 0.8453, + "step": 156 + }, + { + "epoch": 0.08373333333333334, + "grad_norm": 0.6610003850374823, + "learning_rate": 0.0001985106354988997, + "loss": 0.8916, + "step": 157 + }, + { + "epoch": 0.08426666666666667, + "grad_norm": 0.4795860359494709, + "learning_rate": 0.00019848077530122083, + "loss": 0.7991, + "step": 158 + }, + { + "epoch": 0.0848, + "grad_norm": 0.6337024654196615, + "learning_rate": 0.0001984506210249986, + "loss": 1.0085, + "step": 159 + }, + { + "epoch": 0.08533333333333333, + "grad_norm": 0.4997461207024708, + "learning_rate": 0.00019842017276027832, + "loss": 0.7467, + "step": 160 + }, + { + "epoch": 0.08586666666666666, + "grad_norm": 0.5286499908095579, + "learning_rate": 0.00019838943059798304, + "loss": 0.8765, + "step": 161 + }, + { + "epoch": 0.0864, + "grad_norm": 0.4704310497131496, + "learning_rate": 0.00019835839462991361, + "loss": 0.8461, + "step": 162 + }, + { + "epoch": 0.08693333333333333, + "grad_norm": 0.5937567267938444, + "learning_rate": 0.0001983270649487481, + "loss": 0.8693, + "step": 163 + }, + { + "epoch": 0.08746666666666666, + "grad_norm": 0.5371755014877978, + "learning_rate": 0.0001982954416480417, + "loss": 0.9063, + "step": 164 + }, + { + "epoch": 0.088, + "grad_norm": 0.5800336586723125, + "learning_rate": 0.00019826352482222638, + "loss": 0.9283, + "step": 165 + }, + { + "epoch": 0.08853333333333334, + "grad_norm": 0.4958033989577926, + "learning_rate": 0.00019823131456661063, + "loss": 0.8362, + "step": 166 + }, + { + "epoch": 0.08906666666666667, + "grad_norm": 0.5399180175782222, + "learning_rate": 0.00019819881097737915, + "loss": 0.9651, + "step": 167 + }, + { + "epoch": 0.0896, + "grad_norm": 0.5843106376237646, + "learning_rate": 0.00019816601415159263, + "loss": 0.9353, + "step": 168 + }, + { + "epoch": 0.09013333333333333, + "grad_norm": 0.6848308749700495, + "learning_rate": 0.00019813292418718732, + "loss": 0.9304, + "step": 169 + }, + { + "epoch": 0.09066666666666667, + "grad_norm": 0.4752685849578976, + "learning_rate": 0.0001980995411829749, + "loss": 0.8504, + "step": 170 + }, + { + "epoch": 0.0912, + "grad_norm": 0.5477850021150672, + "learning_rate": 0.0001980658652386421, + "loss": 0.9404, + "step": 171 + }, + { + "epoch": 0.09173333333333333, + "grad_norm": 0.5181254404743765, + "learning_rate": 0.0001980318964547504, + "loss": 0.8639, + "step": 172 + }, + { + "epoch": 0.09226666666666666, + "grad_norm": 0.6495017550231028, + "learning_rate": 0.0001979976349327357, + "loss": 0.9617, + "step": 173 + }, + { + "epoch": 0.0928, + "grad_norm": 0.6380134101944767, + "learning_rate": 0.00019796308077490817, + "loss": 0.9427, + "step": 174 + }, + { + "epoch": 0.09333333333333334, + "grad_norm": 0.5847268292402413, + "learning_rate": 0.00019792823408445174, + "loss": 0.9004, + "step": 175 + }, + { + "epoch": 0.09386666666666667, + "grad_norm": 0.496999883623956, + "learning_rate": 0.0001978930949654239, + "loss": 0.8021, + "step": 176 + }, + { + "epoch": 0.0944, + "grad_norm": 0.7499340485732703, + "learning_rate": 0.00019785766352275542, + "loss": 1.0304, + "step": 177 + }, + { + "epoch": 0.09493333333333333, + "grad_norm": 0.6816349545708028, + "learning_rate": 0.00019782193986224995, + "loss": 1.0057, + "step": 178 + }, + { + "epoch": 0.09546666666666667, + "grad_norm": 0.5705412165518122, + "learning_rate": 0.00019778592409058378, + "loss": 0.8076, + "step": 179 + }, + { + "epoch": 0.096, + "grad_norm": 0.8249649542063773, + "learning_rate": 0.00019774961631530545, + "loss": 1.0162, + "step": 180 + }, + { + "epoch": 0.09653333333333333, + "grad_norm": 0.5405554579229593, + "learning_rate": 0.0001977130166448355, + "loss": 0.8356, + "step": 181 + }, + { + "epoch": 0.09706666666666666, + "grad_norm": 0.6330093176321191, + "learning_rate": 0.00019767612518846608, + "loss": 0.9122, + "step": 182 + }, + { + "epoch": 0.0976, + "grad_norm": 0.5222870952333465, + "learning_rate": 0.00019763894205636072, + "loss": 0.8313, + "step": 183 + }, + { + "epoch": 0.09813333333333334, + "grad_norm": 0.6560197653961808, + "learning_rate": 0.00019760146735955388, + "loss": 0.8648, + "step": 184 + }, + { + "epoch": 0.09866666666666667, + "grad_norm": 0.630258194763127, + "learning_rate": 0.00019756370120995066, + "loss": 0.9679, + "step": 185 + }, + { + "epoch": 0.0992, + "grad_norm": 0.48276619536438653, + "learning_rate": 0.00019752564372032657, + "loss": 0.8696, + "step": 186 + }, + { + "epoch": 0.09973333333333333, + "grad_norm": 0.693055832960493, + "learning_rate": 0.000197487295004327, + "loss": 0.9758, + "step": 187 + }, + { + "epoch": 0.10026666666666667, + "grad_norm": 0.5404047908025655, + "learning_rate": 0.00019744865517646706, + "loss": 0.8055, + "step": 188 + }, + { + "epoch": 0.1008, + "grad_norm": 0.4932545259714465, + "learning_rate": 0.00019740972435213115, + "loss": 0.7993, + "step": 189 + }, + { + "epoch": 0.10133333333333333, + "grad_norm": 0.6539678194350204, + "learning_rate": 0.0001973705026475726, + "loss": 0.9077, + "step": 190 + }, + { + "epoch": 0.10186666666666666, + "grad_norm": 0.6892197304317585, + "learning_rate": 0.00019733099017991341, + "loss": 0.9111, + "step": 191 + }, + { + "epoch": 0.1024, + "grad_norm": 0.5834308356502489, + "learning_rate": 0.00019729118706714375, + "loss": 0.7855, + "step": 192 + }, + { + "epoch": 0.10293333333333334, + "grad_norm": 0.5512081138368022, + "learning_rate": 0.0001972510934281218, + "loss": 0.9188, + "step": 193 + }, + { + "epoch": 0.10346666666666667, + "grad_norm": 0.49902017114107877, + "learning_rate": 0.00019721070938257324, + "loss": 0.7896, + "step": 194 + }, + { + "epoch": 0.104, + "grad_norm": 0.5446308240927318, + "learning_rate": 0.00019717003505109095, + "loss": 0.8752, + "step": 195 + }, + { + "epoch": 0.10453333333333334, + "grad_norm": 0.5547189415434525, + "learning_rate": 0.0001971290705551347, + "loss": 0.8857, + "step": 196 + }, + { + "epoch": 0.10506666666666667, + "grad_norm": 0.5327935283971226, + "learning_rate": 0.00019708781601703065, + "loss": 0.8803, + "step": 197 + }, + { + "epoch": 0.1056, + "grad_norm": 0.444289401959508, + "learning_rate": 0.00019704627155997108, + "loss": 0.6973, + "step": 198 + }, + { + "epoch": 0.10613333333333333, + "grad_norm": 0.6306712871206122, + "learning_rate": 0.00019700443730801413, + "loss": 0.9123, + "step": 199 + }, + { + "epoch": 0.10666666666666667, + "grad_norm": 0.5659955054675453, + "learning_rate": 0.00019696231338608316, + "loss": 0.8996, + "step": 200 + }, + { + "epoch": 0.1072, + "grad_norm": 0.510245250500772, + "learning_rate": 0.00019691989991996663, + "loss": 0.9321, + "step": 201 + }, + { + "epoch": 0.10773333333333333, + "grad_norm": 0.5614492231253495, + "learning_rate": 0.00019687719703631755, + "loss": 0.8889, + "step": 202 + }, + { + "epoch": 0.10826666666666666, + "grad_norm": 0.5467141420862776, + "learning_rate": 0.00019683420486265327, + "loss": 0.823, + "step": 203 + }, + { + "epoch": 0.1088, + "grad_norm": 0.5093701579959352, + "learning_rate": 0.0001967909235273549, + "loss": 0.7595, + "step": 204 + }, + { + "epoch": 0.10933333333333334, + "grad_norm": 0.5750194113830609, + "learning_rate": 0.0001967473531596671, + "loss": 0.8937, + "step": 205 + }, + { + "epoch": 0.10986666666666667, + "grad_norm": 0.5673603349909078, + "learning_rate": 0.0001967034938896976, + "loss": 0.8523, + "step": 206 + }, + { + "epoch": 0.1104, + "grad_norm": 0.5895133750544301, + "learning_rate": 0.00019665934584841682, + "loss": 0.8462, + "step": 207 + }, + { + "epoch": 0.11093333333333333, + "grad_norm": 0.7750206260510349, + "learning_rate": 0.0001966149091676575, + "loss": 0.9396, + "step": 208 + }, + { + "epoch": 0.11146666666666667, + "grad_norm": 0.45157308135391144, + "learning_rate": 0.00019657018398011434, + "loss": 0.7762, + "step": 209 + }, + { + "epoch": 0.112, + "grad_norm": 0.5306904599961818, + "learning_rate": 0.00019652517041934356, + "loss": 0.8957, + "step": 210 + }, + { + "epoch": 0.11253333333333333, + "grad_norm": 0.5196935041544928, + "learning_rate": 0.00019647986861976246, + "loss": 0.8466, + "step": 211 + }, + { + "epoch": 0.11306666666666666, + "grad_norm": 0.551998514437722, + "learning_rate": 0.0001964342787166491, + "loss": 0.8251, + "step": 212 + }, + { + "epoch": 0.1136, + "grad_norm": 0.5231235609049675, + "learning_rate": 0.00019638840084614182, + "loss": 0.919, + "step": 213 + }, + { + "epoch": 0.11413333333333334, + "grad_norm": 0.545005886462309, + "learning_rate": 0.0001963422351452389, + "loss": 0.8389, + "step": 214 + }, + { + "epoch": 0.11466666666666667, + "grad_norm": 0.617161346116059, + "learning_rate": 0.0001962957817517982, + "loss": 0.8845, + "step": 215 + }, + { + "epoch": 0.1152, + "grad_norm": 0.533965318790452, + "learning_rate": 0.00019624904080453655, + "loss": 0.8025, + "step": 216 + }, + { + "epoch": 0.11573333333333333, + "grad_norm": 0.6120440370891225, + "learning_rate": 0.00019620201244302952, + "loss": 0.9859, + "step": 217 + }, + { + "epoch": 0.11626666666666667, + "grad_norm": 0.5430165201923808, + "learning_rate": 0.00019615469680771096, + "loss": 0.8282, + "step": 218 + }, + { + "epoch": 0.1168, + "grad_norm": 0.5189411745406076, + "learning_rate": 0.00019610709403987246, + "loss": 0.8734, + "step": 219 + }, + { + "epoch": 0.11733333333333333, + "grad_norm": 0.5416218607957094, + "learning_rate": 0.00019605920428166323, + "loss": 0.9556, + "step": 220 + }, + { + "epoch": 0.11786666666666666, + "grad_norm": 0.540844652707924, + "learning_rate": 0.00019601102767608923, + "loss": 0.9289, + "step": 221 + }, + { + "epoch": 0.1184, + "grad_norm": 0.6125147723856628, + "learning_rate": 0.00019596256436701324, + "loss": 0.8679, + "step": 222 + }, + { + "epoch": 0.11893333333333334, + "grad_norm": 0.5519197783369388, + "learning_rate": 0.00019591381449915397, + "loss": 0.8236, + "step": 223 + }, + { + "epoch": 0.11946666666666667, + "grad_norm": 0.6667872057607972, + "learning_rate": 0.00019586477821808597, + "loss": 0.9513, + "step": 224 + }, + { + "epoch": 0.12, + "grad_norm": 0.6665613951386896, + "learning_rate": 0.000195815455670239, + "loss": 0.977, + "step": 225 + }, + { + "epoch": 0.12053333333333334, + "grad_norm": 0.3899502563708877, + "learning_rate": 0.00019576584700289768, + "loss": 0.7104, + "step": 226 + }, + { + "epoch": 0.12106666666666667, + "grad_norm": 0.5289508773870094, + "learning_rate": 0.00019571595236420102, + "loss": 0.8844, + "step": 227 + }, + { + "epoch": 0.1216, + "grad_norm": 0.5085346436930963, + "learning_rate": 0.00019566577190314197, + "loss": 0.8637, + "step": 228 + }, + { + "epoch": 0.12213333333333333, + "grad_norm": 0.5272601306833273, + "learning_rate": 0.00019561530576956703, + "loss": 0.8849, + "step": 229 + }, + { + "epoch": 0.12266666666666666, + "grad_norm": 0.6525105891946158, + "learning_rate": 0.00019556455411417573, + "loss": 0.8833, + "step": 230 + }, + { + "epoch": 0.1232, + "grad_norm": 0.6234486059350676, + "learning_rate": 0.0001955135170885202, + "loss": 0.8809, + "step": 231 + }, + { + "epoch": 0.12373333333333333, + "grad_norm": 0.5296725270937737, + "learning_rate": 0.00019546219484500475, + "loss": 0.8608, + "step": 232 + }, + { + "epoch": 0.12426666666666666, + "grad_norm": 0.6297752187019564, + "learning_rate": 0.00019541058753688538, + "loss": 0.869, + "step": 233 + }, + { + "epoch": 0.1248, + "grad_norm": 0.5797725950739668, + "learning_rate": 0.00019535869531826937, + "loss": 0.9539, + "step": 234 + }, + { + "epoch": 0.12533333333333332, + "grad_norm": 0.5322587181419928, + "learning_rate": 0.00019530651834411474, + "loss": 0.8305, + "step": 235 + }, + { + "epoch": 0.12586666666666665, + "grad_norm": 0.574481904911282, + "learning_rate": 0.00019525405677022989, + "loss": 0.8696, + "step": 236 + }, + { + "epoch": 0.1264, + "grad_norm": 0.4799510430555526, + "learning_rate": 0.00019520131075327298, + "loss": 0.7752, + "step": 237 + }, + { + "epoch": 0.12693333333333334, + "grad_norm": 0.4547495523917592, + "learning_rate": 0.0001951482804507517, + "loss": 0.7918, + "step": 238 + }, + { + "epoch": 0.12746666666666667, + "grad_norm": 0.5263436873112797, + "learning_rate": 0.00019509496602102252, + "loss": 0.8603, + "step": 239 + }, + { + "epoch": 0.128, + "grad_norm": 0.6252578470714629, + "learning_rate": 0.00019504136762329047, + "loss": 0.9067, + "step": 240 + }, + { + "epoch": 0.12853333333333333, + "grad_norm": 0.4867148597600659, + "learning_rate": 0.00019498748541760846, + "loss": 0.9251, + "step": 241 + }, + { + "epoch": 0.12906666666666666, + "grad_norm": 0.5290662373217285, + "learning_rate": 0.0001949333195648769, + "loss": 0.7274, + "step": 242 + }, + { + "epoch": 0.1296, + "grad_norm": 0.5153066665623456, + "learning_rate": 0.00019487887022684336, + "loss": 0.9242, + "step": 243 + }, + { + "epoch": 0.13013333333333332, + "grad_norm": 0.6625691460737685, + "learning_rate": 0.00019482413756610173, + "loss": 0.8166, + "step": 244 + }, + { + "epoch": 0.13066666666666665, + "grad_norm": 0.670819395240693, + "learning_rate": 0.0001947691217460921, + "loss": 0.9512, + "step": 245 + }, + { + "epoch": 0.1312, + "grad_norm": 0.5151189803391039, + "learning_rate": 0.00019471382293110003, + "loss": 0.8542, + "step": 246 + }, + { + "epoch": 0.13173333333333334, + "grad_norm": 0.6362609947941695, + "learning_rate": 0.00019465824128625617, + "loss": 0.9869, + "step": 247 + }, + { + "epoch": 0.13226666666666667, + "grad_norm": 0.5341739523116258, + "learning_rate": 0.00019460237697753577, + "loss": 0.8713, + "step": 248 + }, + { + "epoch": 0.1328, + "grad_norm": 0.5080000356479262, + "learning_rate": 0.00019454623017175812, + "loss": 0.854, + "step": 249 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 0.5532806823745345, + "learning_rate": 0.00019448980103658613, + "loss": 0.9342, + "step": 250 + }, + { + "epoch": 0.13386666666666666, + "grad_norm": 0.5000267304985636, + "learning_rate": 0.0001944330897405257, + "loss": 0.892, + "step": 251 + }, + { + "epoch": 0.1344, + "grad_norm": 0.7752667597486199, + "learning_rate": 0.00019437609645292546, + "loss": 0.8867, + "step": 252 + }, + { + "epoch": 0.13493333333333332, + "grad_norm": 0.48210521426119607, + "learning_rate": 0.00019431882134397598, + "loss": 0.784, + "step": 253 + }, + { + "epoch": 0.13546666666666668, + "grad_norm": 0.726766942591237, + "learning_rate": 0.00019426126458470936, + "loss": 1.0296, + "step": 254 + }, + { + "epoch": 0.136, + "grad_norm": 0.48997429914486446, + "learning_rate": 0.0001942034263469989, + "loss": 0.8225, + "step": 255 + }, + { + "epoch": 0.13653333333333334, + "grad_norm": 0.6901249877257869, + "learning_rate": 0.00019414530680355837, + "loss": 0.9044, + "step": 256 + }, + { + "epoch": 0.13706666666666667, + "grad_norm": 0.435930713686654, + "learning_rate": 0.00019408690612794148, + "loss": 0.7588, + "step": 257 + }, + { + "epoch": 0.1376, + "grad_norm": 0.5974265537076754, + "learning_rate": 0.00019402822449454153, + "loss": 0.9269, + "step": 258 + }, + { + "epoch": 0.13813333333333333, + "grad_norm": 0.5342606116909296, + "learning_rate": 0.00019396926207859084, + "loss": 0.8764, + "step": 259 + }, + { + "epoch": 0.13866666666666666, + "grad_norm": 0.49358763046628124, + "learning_rate": 0.0001939100190561601, + "loss": 0.8179, + "step": 260 + }, + { + "epoch": 0.1392, + "grad_norm": 0.7336425072326883, + "learning_rate": 0.00019385049560415794, + "loss": 1.0536, + "step": 261 + }, + { + "epoch": 0.13973333333333332, + "grad_norm": 0.5743919866990923, + "learning_rate": 0.0001937906919003304, + "loss": 0.9086, + "step": 262 + }, + { + "epoch": 0.14026666666666668, + "grad_norm": 0.44425441099827134, + "learning_rate": 0.00019373060812326052, + "loss": 0.8629, + "step": 263 + }, + { + "epoch": 0.1408, + "grad_norm": 0.5164911733879413, + "learning_rate": 0.00019367024445236754, + "loss": 0.7725, + "step": 264 + }, + { + "epoch": 0.14133333333333334, + "grad_norm": 0.46286711402137637, + "learning_rate": 0.00019360960106790643, + "loss": 0.769, + "step": 265 + }, + { + "epoch": 0.14186666666666667, + "grad_norm": 1.399245238341527, + "learning_rate": 0.0001935486781509677, + "loss": 1.0842, + "step": 266 + }, + { + "epoch": 0.1424, + "grad_norm": 0.5045294232571674, + "learning_rate": 0.00019348747588347637, + "loss": 0.8658, + "step": 267 + }, + { + "epoch": 0.14293333333333333, + "grad_norm": 0.49840155683797577, + "learning_rate": 0.00019342599444819168, + "loss": 0.8284, + "step": 268 + }, + { + "epoch": 0.14346666666666666, + "grad_norm": 0.7117131658267387, + "learning_rate": 0.00019336423402870653, + "loss": 0.8987, + "step": 269 + }, + { + "epoch": 0.144, + "grad_norm": 0.5128146847876454, + "learning_rate": 0.00019330219480944694, + "loss": 0.8484, + "step": 270 + }, + { + "epoch": 0.14453333333333335, + "grad_norm": 0.7722720996758503, + "learning_rate": 0.0001932398769756714, + "loss": 1.1469, + "step": 271 + }, + { + "epoch": 0.14506666666666668, + "grad_norm": 0.5803708158094779, + "learning_rate": 0.0001931772807134704, + "loss": 0.8368, + "step": 272 + }, + { + "epoch": 0.1456, + "grad_norm": 0.42993346683667927, + "learning_rate": 0.00019311440620976597, + "loss": 0.8031, + "step": 273 + }, + { + "epoch": 0.14613333333333334, + "grad_norm": 0.6065197949647673, + "learning_rate": 0.00019305125365231084, + "loss": 0.9472, + "step": 274 + }, + { + "epoch": 0.14666666666666667, + "grad_norm": 0.6908488885342616, + "learning_rate": 0.00019298782322968815, + "loss": 0.9874, + "step": 275 + }, + { + "epoch": 0.1472, + "grad_norm": 0.5363581236326376, + "learning_rate": 0.0001929241151313108, + "loss": 0.847, + "step": 276 + }, + { + "epoch": 0.14773333333333333, + "grad_norm": 0.558700883977016, + "learning_rate": 0.0001928601295474208, + "loss": 0.8182, + "step": 277 + }, + { + "epoch": 0.14826666666666666, + "grad_norm": 0.5947475743975708, + "learning_rate": 0.00019279586666908884, + "loss": 0.9263, + "step": 278 + }, + { + "epoch": 0.1488, + "grad_norm": 0.5380157540832908, + "learning_rate": 0.00019273132668821364, + "loss": 0.722, + "step": 279 + }, + { + "epoch": 0.14933333333333335, + "grad_norm": 0.5538227064782307, + "learning_rate": 0.00019266650979752136, + "loss": 0.8307, + "step": 280 + }, + { + "epoch": 0.14986666666666668, + "grad_norm": 0.4832783044469024, + "learning_rate": 0.00019260141619056507, + "loss": 0.8599, + "step": 281 + }, + { + "epoch": 0.1504, + "grad_norm": 0.5262719970151286, + "learning_rate": 0.00019253604606172417, + "loss": 0.8081, + "step": 282 + }, + { + "epoch": 0.15093333333333334, + "grad_norm": 0.6223522228511364, + "learning_rate": 0.0001924703996062038, + "loss": 0.8736, + "step": 283 + }, + { + "epoch": 0.15146666666666667, + "grad_norm": 0.6213342599722824, + "learning_rate": 0.0001924044770200342, + "loss": 0.9558, + "step": 284 + }, + { + "epoch": 0.152, + "grad_norm": 0.5561811603373654, + "learning_rate": 0.00019233827850007027, + "loss": 0.9001, + "step": 285 + }, + { + "epoch": 0.15253333333333333, + "grad_norm": 0.7575560964349567, + "learning_rate": 0.0001922718042439908, + "loss": 1.0266, + "step": 286 + }, + { + "epoch": 0.15306666666666666, + "grad_norm": 0.5184421232657922, + "learning_rate": 0.000192205054450298, + "loss": 0.8553, + "step": 287 + }, + { + "epoch": 0.1536, + "grad_norm": 0.5837734427615423, + "learning_rate": 0.00019213802931831696, + "loss": 0.8718, + "step": 288 + }, + { + "epoch": 0.15413333333333334, + "grad_norm": 0.5577590220415555, + "learning_rate": 0.00019207072904819486, + "loss": 0.9378, + "step": 289 + }, + { + "epoch": 0.15466666666666667, + "grad_norm": 0.45233445244978454, + "learning_rate": 0.00019200315384090044, + "loss": 0.7427, + "step": 290 + }, + { + "epoch": 0.1552, + "grad_norm": 0.5558031950828102, + "learning_rate": 0.00019193530389822363, + "loss": 0.8903, + "step": 291 + }, + { + "epoch": 0.15573333333333333, + "grad_norm": 0.5093962275712094, + "learning_rate": 0.00019186717942277462, + "loss": 0.8061, + "step": 292 + }, + { + "epoch": 0.15626666666666666, + "grad_norm": 0.5050638192275981, + "learning_rate": 0.00019179878061798347, + "loss": 0.8559, + "step": 293 + }, + { + "epoch": 0.1568, + "grad_norm": 0.6324928777374244, + "learning_rate": 0.00019173010768809933, + "loss": 1.0162, + "step": 294 + }, + { + "epoch": 0.15733333333333333, + "grad_norm": 0.5195438049240261, + "learning_rate": 0.00019166116083819002, + "loss": 0.8277, + "step": 295 + }, + { + "epoch": 0.15786666666666666, + "grad_norm": 0.4761289332277623, + "learning_rate": 0.00019159194027414128, + "loss": 0.8011, + "step": 296 + }, + { + "epoch": 0.1584, + "grad_norm": 0.47188232850246264, + "learning_rate": 0.0001915224462026563, + "loss": 0.7454, + "step": 297 + }, + { + "epoch": 0.15893333333333334, + "grad_norm": 0.5162299384722681, + "learning_rate": 0.00019145267883125482, + "loss": 0.8799, + "step": 298 + }, + { + "epoch": 0.15946666666666667, + "grad_norm": 0.5238150633584967, + "learning_rate": 0.00019138263836827288, + "loss": 0.8069, + "step": 299 + }, + { + "epoch": 0.16, + "grad_norm": 0.46840724242613746, + "learning_rate": 0.00019131232502286188, + "loss": 0.7969, + "step": 300 + }, + { + "epoch": 0.16053333333333333, + "grad_norm": 0.6269601229860577, + "learning_rate": 0.00019124173900498818, + "loss": 1.0077, + "step": 301 + }, + { + "epoch": 0.16106666666666666, + "grad_norm": 0.5600203459620383, + "learning_rate": 0.00019117088052543233, + "loss": 0.9362, + "step": 302 + }, + { + "epoch": 0.1616, + "grad_norm": 0.491478107722541, + "learning_rate": 0.0001910997497957885, + "loss": 0.8715, + "step": 303 + }, + { + "epoch": 0.16213333333333332, + "grad_norm": 0.5271381347138833, + "learning_rate": 0.00019102834702846387, + "loss": 0.8597, + "step": 304 + }, + { + "epoch": 0.16266666666666665, + "grad_norm": 0.4824130420607745, + "learning_rate": 0.0001909566724366779, + "loss": 0.8014, + "step": 305 + }, + { + "epoch": 0.1632, + "grad_norm": 0.5134450187561762, + "learning_rate": 0.00019088472623446183, + "loss": 0.8139, + "step": 306 + }, + { + "epoch": 0.16373333333333334, + "grad_norm": 0.5823218443587672, + "learning_rate": 0.00019081250863665794, + "loss": 0.8517, + "step": 307 + }, + { + "epoch": 0.16426666666666667, + "grad_norm": 0.6349330124167318, + "learning_rate": 0.0001907400198589189, + "loss": 0.9057, + "step": 308 + }, + { + "epoch": 0.1648, + "grad_norm": 0.5295789449060487, + "learning_rate": 0.00019066726011770726, + "loss": 0.8535, + "step": 309 + }, + { + "epoch": 0.16533333333333333, + "grad_norm": 0.4831281030968079, + "learning_rate": 0.00019059422963029464, + "loss": 0.8261, + "step": 310 + }, + { + "epoch": 0.16586666666666666, + "grad_norm": 0.5910041302313082, + "learning_rate": 0.0001905209286147611, + "loss": 0.7828, + "step": 311 + }, + { + "epoch": 0.1664, + "grad_norm": 0.5392756185044956, + "learning_rate": 0.0001904473572899947, + "loss": 0.8113, + "step": 312 + }, + { + "epoch": 0.16693333333333332, + "grad_norm": 0.557452760417172, + "learning_rate": 0.0001903735158756905, + "loss": 1.0161, + "step": 313 + }, + { + "epoch": 0.16746666666666668, + "grad_norm": 0.522505954409253, + "learning_rate": 0.0001902994045923502, + "loss": 0.761, + "step": 314 + }, + { + "epoch": 0.168, + "grad_norm": 0.7233142924067804, + "learning_rate": 0.00019022502366128135, + "loss": 0.9351, + "step": 315 + }, + { + "epoch": 0.16853333333333334, + "grad_norm": 0.4676711125844684, + "learning_rate": 0.0001901503733045967, + "loss": 0.7071, + "step": 316 + }, + { + "epoch": 0.16906666666666667, + "grad_norm": 0.47685551246661534, + "learning_rate": 0.00019007545374521355, + "loss": 0.8177, + "step": 317 + }, + { + "epoch": 0.1696, + "grad_norm": 0.4533488930327886, + "learning_rate": 0.00019000026520685302, + "loss": 0.7436, + "step": 318 + }, + { + "epoch": 0.17013333333333333, + "grad_norm": 0.4533058132862425, + "learning_rate": 0.00018992480791403958, + "loss": 0.7291, + "step": 319 + }, + { + "epoch": 0.17066666666666666, + "grad_norm": 0.5458120540129042, + "learning_rate": 0.0001898490820921001, + "loss": 0.8137, + "step": 320 + }, + { + "epoch": 0.1712, + "grad_norm": 0.5643466697441072, + "learning_rate": 0.0001897730879671634, + "loss": 0.8801, + "step": 321 + }, + { + "epoch": 0.17173333333333332, + "grad_norm": 0.5380033786204588, + "learning_rate": 0.0001896968257661595, + "loss": 0.7599, + "step": 322 + }, + { + "epoch": 0.17226666666666668, + "grad_norm": 0.617314497027414, + "learning_rate": 0.00018962029571681886, + "loss": 0.8547, + "step": 323 + }, + { + "epoch": 0.1728, + "grad_norm": 0.6264038818277112, + "learning_rate": 0.00018954349804767184, + "loss": 0.8778, + "step": 324 + }, + { + "epoch": 0.17333333333333334, + "grad_norm": 0.5846003637502903, + "learning_rate": 0.00018946643298804793, + "loss": 0.9112, + "step": 325 + }, + { + "epoch": 0.17386666666666667, + "grad_norm": 0.47567665918466145, + "learning_rate": 0.00018938910076807513, + "loss": 0.8198, + "step": 326 + }, + { + "epoch": 0.1744, + "grad_norm": 0.49196104288845344, + "learning_rate": 0.00018931150161867916, + "loss": 0.8299, + "step": 327 + }, + { + "epoch": 0.17493333333333333, + "grad_norm": 0.5601486377476347, + "learning_rate": 0.0001892336357715829, + "loss": 0.8747, + "step": 328 + }, + { + "epoch": 0.17546666666666666, + "grad_norm": 0.43468064391038164, + "learning_rate": 0.0001891555034593055, + "loss": 0.7781, + "step": 329 + }, + { + "epoch": 0.176, + "grad_norm": 1.028194014871719, + "learning_rate": 0.00018907710491516199, + "loss": 0.991, + "step": 330 + }, + { + "epoch": 0.17653333333333332, + "grad_norm": 0.627738619519797, + "learning_rate": 0.00018899844037326225, + "loss": 0.8718, + "step": 331 + }, + { + "epoch": 0.17706666666666668, + "grad_norm": 0.47026200802077206, + "learning_rate": 0.0001889195100685106, + "loss": 0.8163, + "step": 332 + }, + { + "epoch": 0.1776, + "grad_norm": 0.5361118321266511, + "learning_rate": 0.0001888403142366049, + "loss": 0.8452, + "step": 333 + }, + { + "epoch": 0.17813333333333334, + "grad_norm": 0.5993619983899835, + "learning_rate": 0.00018876085311403593, + "loss": 0.942, + "step": 334 + }, + { + "epoch": 0.17866666666666667, + "grad_norm": 0.46253089860484015, + "learning_rate": 0.00018868112693808665, + "loss": 0.8561, + "step": 335 + }, + { + "epoch": 0.1792, + "grad_norm": 0.5110368561327658, + "learning_rate": 0.00018860113594683148, + "loss": 0.7982, + "step": 336 + }, + { + "epoch": 0.17973333333333333, + "grad_norm": 0.45178298535160205, + "learning_rate": 0.00018852088037913577, + "loss": 0.7952, + "step": 337 + }, + { + "epoch": 0.18026666666666666, + "grad_norm": 0.6283455503137422, + "learning_rate": 0.0001884403604746547, + "loss": 0.9345, + "step": 338 + }, + { + "epoch": 0.1808, + "grad_norm": 0.41016500264183564, + "learning_rate": 0.00018835957647383303, + "loss": 0.7344, + "step": 339 + }, + { + "epoch": 0.18133333333333335, + "grad_norm": 0.5778912728374959, + "learning_rate": 0.00018827852861790398, + "loss": 0.9244, + "step": 340 + }, + { + "epoch": 0.18186666666666668, + "grad_norm": 0.5307862089180293, + "learning_rate": 0.00018819721714888877, + "loss": 0.7725, + "step": 341 + }, + { + "epoch": 0.1824, + "grad_norm": 0.5408782628800713, + "learning_rate": 0.00018811564230959588, + "loss": 0.8191, + "step": 342 + }, + { + "epoch": 0.18293333333333334, + "grad_norm": 0.5343495537158315, + "learning_rate": 0.00018803380434362, + "loss": 0.9992, + "step": 343 + }, + { + "epoch": 0.18346666666666667, + "grad_norm": 0.4629573452278619, + "learning_rate": 0.0001879517034953418, + "loss": 0.7467, + "step": 344 + }, + { + "epoch": 0.184, + "grad_norm": 0.5432787902120778, + "learning_rate": 0.00018786934000992688, + "loss": 0.9322, + "step": 345 + }, + { + "epoch": 0.18453333333333333, + "grad_norm": 0.46361653410119125, + "learning_rate": 0.00018778671413332513, + "loss": 0.7921, + "step": 346 + }, + { + "epoch": 0.18506666666666666, + "grad_norm": 0.7190701351188497, + "learning_rate": 0.00018770382611226987, + "loss": 0.9846, + "step": 347 + }, + { + "epoch": 0.1856, + "grad_norm": 0.5165758662612719, + "learning_rate": 0.00018762067619427746, + "loss": 0.8487, + "step": 348 + }, + { + "epoch": 0.18613333333333335, + "grad_norm": 0.6198398172150162, + "learning_rate": 0.000187537264627646, + "loss": 0.8864, + "step": 349 + }, + { + "epoch": 0.18666666666666668, + "grad_norm": 0.5285512826336245, + "learning_rate": 0.00018745359166145523, + "loss": 0.8763, + "step": 350 + }, + { + "epoch": 0.1872, + "grad_norm": 0.42007644809898664, + "learning_rate": 0.00018736965754556528, + "loss": 0.7347, + "step": 351 + }, + { + "epoch": 0.18773333333333334, + "grad_norm": 0.5828585755855576, + "learning_rate": 0.00018728546253061614, + "loss": 0.8223, + "step": 352 + }, + { + "epoch": 0.18826666666666667, + "grad_norm": 0.42651129826500506, + "learning_rate": 0.00018720100686802694, + "loss": 0.7122, + "step": 353 + }, + { + "epoch": 0.1888, + "grad_norm": 0.5797530839376076, + "learning_rate": 0.00018711629080999504, + "loss": 0.9591, + "step": 354 + }, + { + "epoch": 0.18933333333333333, + "grad_norm": 0.544495478635436, + "learning_rate": 0.00018703131460949554, + "loss": 0.913, + "step": 355 + }, + { + "epoch": 0.18986666666666666, + "grad_norm": 0.4432574119212112, + "learning_rate": 0.0001869460785202802, + "loss": 0.752, + "step": 356 + }, + { + "epoch": 0.1904, + "grad_norm": 0.4798537314874882, + "learning_rate": 0.00018686058279687698, + "loss": 0.9229, + "step": 357 + }, + { + "epoch": 0.19093333333333334, + "grad_norm": 0.581634013330689, + "learning_rate": 0.00018677482769458904, + "loss": 0.9095, + "step": 358 + }, + { + "epoch": 0.19146666666666667, + "grad_norm": 0.6178175001226105, + "learning_rate": 0.00018668881346949417, + "loss": 0.8316, + "step": 359 + }, + { + "epoch": 0.192, + "grad_norm": 0.5503240721251285, + "learning_rate": 0.00018660254037844388, + "loss": 0.8607, + "step": 360 + }, + { + "epoch": 0.19253333333333333, + "grad_norm": 0.4425801376441001, + "learning_rate": 0.00018651600867906272, + "loss": 0.8211, + "step": 361 + }, + { + "epoch": 0.19306666666666666, + "grad_norm": 0.539226309972477, + "learning_rate": 0.00018642921862974742, + "loss": 0.7202, + "step": 362 + }, + { + "epoch": 0.1936, + "grad_norm": 0.5453302685171948, + "learning_rate": 0.00018634217048966637, + "loss": 0.8175, + "step": 363 + }, + { + "epoch": 0.19413333333333332, + "grad_norm": 0.4884968861852387, + "learning_rate": 0.00018625486451875843, + "loss": 0.8352, + "step": 364 + }, + { + "epoch": 0.19466666666666665, + "grad_norm": 0.596533126573078, + "learning_rate": 0.0001861673009777325, + "loss": 0.9176, + "step": 365 + }, + { + "epoch": 0.1952, + "grad_norm": 0.5116158046966843, + "learning_rate": 0.0001860794801280666, + "loss": 0.8144, + "step": 366 + }, + { + "epoch": 0.19573333333333334, + "grad_norm": 0.5085625162646913, + "learning_rate": 0.00018599140223200716, + "loss": 0.7584, + "step": 367 + }, + { + "epoch": 0.19626666666666667, + "grad_norm": 0.6091533276019632, + "learning_rate": 0.0001859030675525681, + "loss": 0.7884, + "step": 368 + }, + { + "epoch": 0.1968, + "grad_norm": 0.5443685151140606, + "learning_rate": 0.0001858144763535302, + "loss": 0.9256, + "step": 369 + }, + { + "epoch": 0.19733333333333333, + "grad_norm": 0.5236325125412632, + "learning_rate": 0.0001857256288994402, + "loss": 0.8713, + "step": 370 + }, + { + "epoch": 0.19786666666666666, + "grad_norm": 0.5867356801037134, + "learning_rate": 0.00018563652545561013, + "loss": 0.8452, + "step": 371 + }, + { + "epoch": 0.1984, + "grad_norm": 0.5095813956183218, + "learning_rate": 0.0001855471662881164, + "loss": 0.7325, + "step": 372 + }, + { + "epoch": 0.19893333333333332, + "grad_norm": 0.6069538436884427, + "learning_rate": 0.000185457551663799, + "loss": 0.8661, + "step": 373 + }, + { + "epoch": 0.19946666666666665, + "grad_norm": 0.40768137417188727, + "learning_rate": 0.00018536768185026083, + "loss": 0.7727, + "step": 374 + }, + { + "epoch": 0.2, + "grad_norm": 0.633628916149109, + "learning_rate": 0.00018527755711586678, + "loss": 0.9909, + "step": 375 + }, + { + "epoch": 0.20053333333333334, + "grad_norm": 0.4957646817286244, + "learning_rate": 0.00018518717772974302, + "loss": 0.8159, + "step": 376 + }, + { + "epoch": 0.20106666666666667, + "grad_norm": 0.47105987995879206, + "learning_rate": 0.00018509654396177609, + "loss": 0.7986, + "step": 377 + }, + { + "epoch": 0.2016, + "grad_norm": 0.6163522938264158, + "learning_rate": 0.00018500565608261214, + "loss": 0.9178, + "step": 378 + }, + { + "epoch": 0.20213333333333333, + "grad_norm": 0.5832180274336531, + "learning_rate": 0.00018491451436365627, + "loss": 0.8922, + "step": 379 + }, + { + "epoch": 0.20266666666666666, + "grad_norm": 0.509935329197866, + "learning_rate": 0.0001848231190770714, + "loss": 0.816, + "step": 380 + }, + { + "epoch": 0.2032, + "grad_norm": 0.669176687285736, + "learning_rate": 0.00018473147049577774, + "loss": 0.8657, + "step": 381 + }, + { + "epoch": 0.20373333333333332, + "grad_norm": 0.4752882620373262, + "learning_rate": 0.00018463956889345194, + "loss": 0.8092, + "step": 382 + }, + { + "epoch": 0.20426666666666668, + "grad_norm": 0.7051297197996066, + "learning_rate": 0.00018454741454452603, + "loss": 0.8506, + "step": 383 + }, + { + "epoch": 0.2048, + "grad_norm": 0.5577527025167552, + "learning_rate": 0.00018445500772418697, + "loss": 0.8511, + "step": 384 + }, + { + "epoch": 0.20533333333333334, + "grad_norm": 0.4139542702196724, + "learning_rate": 0.00018436234870837547, + "loss": 0.6828, + "step": 385 + }, + { + "epoch": 0.20586666666666667, + "grad_norm": 0.5182027854667481, + "learning_rate": 0.00018426943777378552, + "loss": 0.8287, + "step": 386 + }, + { + "epoch": 0.2064, + "grad_norm": 0.4299886279907164, + "learning_rate": 0.00018417627519786315, + "loss": 0.7708, + "step": 387 + }, + { + "epoch": 0.20693333333333333, + "grad_norm": 0.5143149967352305, + "learning_rate": 0.00018408286125880604, + "loss": 0.8828, + "step": 388 + }, + { + "epoch": 0.20746666666666666, + "grad_norm": 0.6078729480646635, + "learning_rate": 0.00018398919623556238, + "loss": 0.7933, + "step": 389 + }, + { + "epoch": 0.208, + "grad_norm": 0.5517572170783533, + "learning_rate": 0.00018389528040783012, + "loss": 0.8796, + "step": 390 + }, + { + "epoch": 0.20853333333333332, + "grad_norm": 0.48253470214967964, + "learning_rate": 0.0001838011140560562, + "loss": 0.7825, + "step": 391 + }, + { + "epoch": 0.20906666666666668, + "grad_norm": 0.5577749832898266, + "learning_rate": 0.00018370669746143564, + "loss": 0.8553, + "step": 392 + }, + { + "epoch": 0.2096, + "grad_norm": 0.5226852442696008, + "learning_rate": 0.00018361203090591071, + "loss": 0.8066, + "step": 393 + }, + { + "epoch": 0.21013333333333334, + "grad_norm": 0.5367652669016284, + "learning_rate": 0.0001835171146721701, + "loss": 0.8055, + "step": 394 + }, + { + "epoch": 0.21066666666666667, + "grad_norm": 0.5403403005109394, + "learning_rate": 0.00018342194904364813, + "loss": 0.8121, + "step": 395 + }, + { + "epoch": 0.2112, + "grad_norm": 0.48584158907093455, + "learning_rate": 0.00018332653430452376, + "loss": 0.7834, + "step": 396 + }, + { + "epoch": 0.21173333333333333, + "grad_norm": 0.499235660344172, + "learning_rate": 0.00018323087073971993, + "loss": 0.7473, + "step": 397 + }, + { + "epoch": 0.21226666666666666, + "grad_norm": 0.5367712416434521, + "learning_rate": 0.00018313495863490258, + "loss": 0.8446, + "step": 398 + }, + { + "epoch": 0.2128, + "grad_norm": 0.5824307197166317, + "learning_rate": 0.00018303879827647975, + "loss": 0.9132, + "step": 399 + }, + { + "epoch": 0.21333333333333335, + "grad_norm": 0.4978245923360915, + "learning_rate": 0.00018294238995160094, + "loss": 0.8089, + "step": 400 + }, + { + "epoch": 0.21386666666666668, + "grad_norm": 0.5328348890517718, + "learning_rate": 0.00018284573394815597, + "loss": 0.8156, + "step": 401 + }, + { + "epoch": 0.2144, + "grad_norm": 0.5082485708902648, + "learning_rate": 0.00018274883055477436, + "loss": 0.7834, + "step": 402 + }, + { + "epoch": 0.21493333333333334, + "grad_norm": 0.45628945910244506, + "learning_rate": 0.00018265168006082437, + "loss": 0.7999, + "step": 403 + }, + { + "epoch": 0.21546666666666667, + "grad_norm": 0.4163463024035274, + "learning_rate": 0.00018255428275641214, + "loss": 0.7293, + "step": 404 + }, + { + "epoch": 0.216, + "grad_norm": 0.5061406032370239, + "learning_rate": 0.00018245663893238075, + "loss": 0.8125, + "step": 405 + }, + { + "epoch": 0.21653333333333333, + "grad_norm": 0.6363393218642723, + "learning_rate": 0.0001823587488803095, + "loss": 0.9316, + "step": 406 + }, + { + "epoch": 0.21706666666666666, + "grad_norm": 0.5504945090482256, + "learning_rate": 0.00018226061289251298, + "loss": 0.8154, + "step": 407 + }, + { + "epoch": 0.2176, + "grad_norm": 0.5391672309228815, + "learning_rate": 0.00018216223126204007, + "loss": 0.8646, + "step": 408 + }, + { + "epoch": 0.21813333333333335, + "grad_norm": 0.4793822093382232, + "learning_rate": 0.00018206360428267332, + "loss": 0.8117, + "step": 409 + }, + { + "epoch": 0.21866666666666668, + "grad_norm": 0.5498770917213628, + "learning_rate": 0.00018196473224892784, + "loss": 0.8909, + "step": 410 + }, + { + "epoch": 0.2192, + "grad_norm": 0.5654925398384275, + "learning_rate": 0.00018186561545605054, + "loss": 0.9082, + "step": 411 + }, + { + "epoch": 0.21973333333333334, + "grad_norm": 0.5483893173014566, + "learning_rate": 0.0001817662542000192, + "loss": 0.9328, + "step": 412 + }, + { + "epoch": 0.22026666666666667, + "grad_norm": 0.5094528950078349, + "learning_rate": 0.0001816666487775416, + "loss": 0.751, + "step": 413 + }, + { + "epoch": 0.2208, + "grad_norm": 0.8131714559962321, + "learning_rate": 0.00018156679948605467, + "loss": 0.9416, + "step": 414 + }, + { + "epoch": 0.22133333333333333, + "grad_norm": 0.5625225241486854, + "learning_rate": 0.00018146670662372354, + "loss": 0.8736, + "step": 415 + }, + { + "epoch": 0.22186666666666666, + "grad_norm": 0.619670441403755, + "learning_rate": 0.0001813663704894407, + "loss": 0.8054, + "step": 416 + }, + { + "epoch": 0.2224, + "grad_norm": 0.520660453368968, + "learning_rate": 0.00018126579138282503, + "loss": 0.8804, + "step": 417 + }, + { + "epoch": 0.22293333333333334, + "grad_norm": 0.5590866916233668, + "learning_rate": 0.00018116496960422107, + "loss": 0.8684, + "step": 418 + }, + { + "epoch": 0.22346666666666667, + "grad_norm": 2.7958795352687495, + "learning_rate": 0.00018106390545469795, + "loss": 0.812, + "step": 419 + }, + { + "epoch": 0.224, + "grad_norm": 0.5850438335172121, + "learning_rate": 0.0001809625992360485, + "loss": 0.8854, + "step": 420 + }, + { + "epoch": 0.22453333333333333, + "grad_norm": 0.5500640216510291, + "learning_rate": 0.00018086105125078857, + "loss": 0.8491, + "step": 421 + }, + { + "epoch": 0.22506666666666666, + "grad_norm": 0.45447930553133686, + "learning_rate": 0.00018075926180215576, + "loss": 0.7655, + "step": 422 + }, + { + "epoch": 0.2256, + "grad_norm": 0.7192110122318098, + "learning_rate": 0.00018065723119410884, + "loss": 0.9129, + "step": 423 + }, + { + "epoch": 0.22613333333333333, + "grad_norm": 0.5894740072945653, + "learning_rate": 0.0001805549597313267, + "loss": 0.9886, + "step": 424 + }, + { + "epoch": 0.22666666666666666, + "grad_norm": 0.586829510685889, + "learning_rate": 0.0001804524477192075, + "loss": 0.9216, + "step": 425 + }, + { + "epoch": 0.2272, + "grad_norm": 0.5177084599142368, + "learning_rate": 0.00018034969546386757, + "loss": 0.8435, + "step": 426 + }, + { + "epoch": 0.22773333333333334, + "grad_norm": 0.5747541873884852, + "learning_rate": 0.00018024670327214084, + "loss": 0.8624, + "step": 427 + }, + { + "epoch": 0.22826666666666667, + "grad_norm": 0.6199485085369928, + "learning_rate": 0.00018014347145157755, + "loss": 0.878, + "step": 428 + }, + { + "epoch": 0.2288, + "grad_norm": 0.5666686132982203, + "learning_rate": 0.0001800400003104436, + "loss": 0.8008, + "step": 429 + }, + { + "epoch": 0.22933333333333333, + "grad_norm": 0.5010951768121458, + "learning_rate": 0.0001799362901577196, + "loss": 0.87, + "step": 430 + }, + { + "epoch": 0.22986666666666666, + "grad_norm": 0.5081946802813396, + "learning_rate": 0.00017983234130309968, + "loss": 0.8655, + "step": 431 + }, + { + "epoch": 0.2304, + "grad_norm": 0.46053639541147723, + "learning_rate": 0.00017972815405699103, + "loss": 0.7879, + "step": 432 + }, + { + "epoch": 0.23093333333333332, + "grad_norm": 0.6588624487113882, + "learning_rate": 0.00017962372873051252, + "loss": 0.9208, + "step": 433 + }, + { + "epoch": 0.23146666666666665, + "grad_norm": 0.434622738600863, + "learning_rate": 0.00017951906563549397, + "loss": 0.6974, + "step": 434 + }, + { + "epoch": 0.232, + "grad_norm": 0.6294823997970527, + "learning_rate": 0.00017941416508447536, + "loss": 0.892, + "step": 435 + }, + { + "epoch": 0.23253333333333334, + "grad_norm": 0.48523784665379055, + "learning_rate": 0.00017930902739070562, + "loss": 0.7763, + "step": 436 + }, + { + "epoch": 0.23306666666666667, + "grad_norm": 0.5373909243025393, + "learning_rate": 0.00017920365286814183, + "loss": 0.8374, + "step": 437 + }, + { + "epoch": 0.2336, + "grad_norm": 0.44329198405302284, + "learning_rate": 0.0001790980418314484, + "loss": 0.7473, + "step": 438 + }, + { + "epoch": 0.23413333333333333, + "grad_norm": 0.48584097444788127, + "learning_rate": 0.0001789921945959958, + "loss": 0.8479, + "step": 439 + }, + { + "epoch": 0.23466666666666666, + "grad_norm": 0.49974502928648723, + "learning_rate": 0.00017888611147786002, + "loss": 0.823, + "step": 440 + }, + { + "epoch": 0.2352, + "grad_norm": 0.5026833029263131, + "learning_rate": 0.00017877979279382135, + "loss": 0.7422, + "step": 441 + }, + { + "epoch": 0.23573333333333332, + "grad_norm": 0.5307811503098481, + "learning_rate": 0.00017867323886136348, + "loss": 0.7479, + "step": 442 + }, + { + "epoch": 0.23626666666666668, + "grad_norm": 0.5306542674659223, + "learning_rate": 0.00017856644999867264, + "loss": 0.8433, + "step": 443 + }, + { + "epoch": 0.2368, + "grad_norm": 0.5209472676621327, + "learning_rate": 0.0001784594265246366, + "loss": 0.7319, + "step": 444 + }, + { + "epoch": 0.23733333333333334, + "grad_norm": 0.6062715684433185, + "learning_rate": 0.00017835216875884368, + "loss": 0.8592, + "step": 445 + }, + { + "epoch": 0.23786666666666667, + "grad_norm": 0.45574144099355285, + "learning_rate": 0.0001782446770215819, + "loss": 0.7948, + "step": 446 + }, + { + "epoch": 0.2384, + "grad_norm": 0.47381837609008487, + "learning_rate": 0.0001781369516338378, + "loss": 0.7444, + "step": 447 + }, + { + "epoch": 0.23893333333333333, + "grad_norm": 0.49309845559709176, + "learning_rate": 0.00017802899291729585, + "loss": 0.7693, + "step": 448 + }, + { + "epoch": 0.23946666666666666, + "grad_norm": 0.5168908058408266, + "learning_rate": 0.0001779208011943371, + "loss": 0.74, + "step": 449 + }, + { + "epoch": 0.24, + "grad_norm": 0.5738186976667065, + "learning_rate": 0.00017781237678803847, + "loss": 0.9781, + "step": 450 + }, + { + "epoch": 0.24053333333333332, + "grad_norm": 0.49598478169438215, + "learning_rate": 0.00017770372002217172, + "loss": 0.7199, + "step": 451 + }, + { + "epoch": 0.24106666666666668, + "grad_norm": 0.7236920935018343, + "learning_rate": 0.00017759483122120238, + "loss": 0.9363, + "step": 452 + }, + { + "epoch": 0.2416, + "grad_norm": 0.41008240762151804, + "learning_rate": 0.000177485710710289, + "loss": 0.7539, + "step": 453 + }, + { + "epoch": 0.24213333333333334, + "grad_norm": 0.5116016775806168, + "learning_rate": 0.00017737635881528196, + "loss": 0.8255, + "step": 454 + }, + { + "epoch": 0.24266666666666667, + "grad_norm": 0.729540440321037, + "learning_rate": 0.00017726677586272263, + "loss": 0.9091, + "step": 455 + }, + { + "epoch": 0.2432, + "grad_norm": 0.5939518708163508, + "learning_rate": 0.00017715696217984235, + "loss": 0.8069, + "step": 456 + }, + { + "epoch": 0.24373333333333333, + "grad_norm": 0.47339713598201816, + "learning_rate": 0.00017704691809456143, + "loss": 0.7996, + "step": 457 + }, + { + "epoch": 0.24426666666666666, + "grad_norm": 0.6182943082393394, + "learning_rate": 0.0001769366439354882, + "loss": 0.8422, + "step": 458 + }, + { + "epoch": 0.2448, + "grad_norm": 0.4956243203684921, + "learning_rate": 0.00017682614003191807, + "loss": 0.8494, + "step": 459 + }, + { + "epoch": 0.24533333333333332, + "grad_norm": 0.508301548382395, + "learning_rate": 0.00017671540671383243, + "loss": 0.8665, + "step": 460 + }, + { + "epoch": 0.24586666666666668, + "grad_norm": 0.5509188319582408, + "learning_rate": 0.0001766044443118978, + "loss": 0.883, + "step": 461 + }, + { + "epoch": 0.2464, + "grad_norm": 0.548518166698976, + "learning_rate": 0.00017649325315746478, + "loss": 0.8754, + "step": 462 + }, + { + "epoch": 0.24693333333333334, + "grad_norm": 0.5297762745417619, + "learning_rate": 0.00017638183358256696, + "loss": 0.8727, + "step": 463 + }, + { + "epoch": 0.24746666666666667, + "grad_norm": 0.6384739843727559, + "learning_rate": 0.00017627018591992018, + "loss": 0.7691, + "step": 464 + }, + { + "epoch": 0.248, + "grad_norm": 0.5796092277488748, + "learning_rate": 0.0001761583105029213, + "loss": 0.8753, + "step": 465 + }, + { + "epoch": 0.24853333333333333, + "grad_norm": 0.4480463427628124, + "learning_rate": 0.00017604620766564723, + "loss": 0.7904, + "step": 466 + }, + { + "epoch": 0.24906666666666666, + "grad_norm": 0.7029097235032447, + "learning_rate": 0.00017593387774285412, + "loss": 0.9765, + "step": 467 + }, + { + "epoch": 0.2496, + "grad_norm": 0.6096410772724791, + "learning_rate": 0.00017582132106997616, + "loss": 0.7711, + "step": 468 + }, + { + "epoch": 0.2501333333333333, + "grad_norm": 0.6376346604968903, + "learning_rate": 0.0001757085379831246, + "loss": 1.021, + "step": 469 + }, + { + "epoch": 0.25066666666666665, + "grad_norm": 0.7241821384453352, + "learning_rate": 0.00017559552881908695, + "loss": 0.8957, + "step": 470 + }, + { + "epoch": 0.2512, + "grad_norm": 0.4989190729708759, + "learning_rate": 0.00017548229391532572, + "loss": 0.8056, + "step": 471 + }, + { + "epoch": 0.2517333333333333, + "grad_norm": 0.4224851417231786, + "learning_rate": 0.00017536883360997743, + "loss": 0.7753, + "step": 472 + }, + { + "epoch": 0.25226666666666664, + "grad_norm": 0.5785550065502728, + "learning_rate": 0.00017525514824185185, + "loss": 0.8712, + "step": 473 + }, + { + "epoch": 0.2528, + "grad_norm": 0.5346221293744353, + "learning_rate": 0.00017514123815043074, + "loss": 0.9194, + "step": 474 + }, + { + "epoch": 0.25333333333333335, + "grad_norm": 0.4976250533874276, + "learning_rate": 0.00017502710367586687, + "loss": 0.8288, + "step": 475 + }, + { + "epoch": 0.2538666666666667, + "grad_norm": 0.5605203286789534, + "learning_rate": 0.0001749127451589832, + "loss": 0.8594, + "step": 476 + }, + { + "epoch": 0.2544, + "grad_norm": 0.5002828789623779, + "learning_rate": 0.00017479816294127152, + "loss": 0.7608, + "step": 477 + }, + { + "epoch": 0.25493333333333335, + "grad_norm": 0.5440527531770649, + "learning_rate": 0.00017468335736489177, + "loss": 0.8472, + "step": 478 + }, + { + "epoch": 0.2554666666666667, + "grad_norm": 0.473801147894544, + "learning_rate": 0.00017456832877267084, + "loss": 0.7923, + "step": 479 + }, + { + "epoch": 0.256, + "grad_norm": 0.4472347584396617, + "learning_rate": 0.0001744530775081015, + "loss": 0.8068, + "step": 480 + }, + { + "epoch": 0.25653333333333334, + "grad_norm": 0.8784206403575792, + "learning_rate": 0.00017433760391534167, + "loss": 1.0083, + "step": 481 + }, + { + "epoch": 0.25706666666666667, + "grad_norm": 0.5100070260977737, + "learning_rate": 0.00017422190833921283, + "loss": 0.8667, + "step": 482 + }, + { + "epoch": 0.2576, + "grad_norm": 0.5209546159346587, + "learning_rate": 0.0001741059911251997, + "loss": 0.7523, + "step": 483 + }, + { + "epoch": 0.2581333333333333, + "grad_norm": 0.8013445926468988, + "learning_rate": 0.00017398985261944856, + "loss": 0.9211, + "step": 484 + }, + { + "epoch": 0.25866666666666666, + "grad_norm": 0.6181769629664103, + "learning_rate": 0.00017387349316876666, + "loss": 0.866, + "step": 485 + }, + { + "epoch": 0.2592, + "grad_norm": 0.5384962948779574, + "learning_rate": 0.000173756913120621, + "loss": 0.8699, + "step": 486 + }, + { + "epoch": 0.2597333333333333, + "grad_norm": 0.4955773239230858, + "learning_rate": 0.0001736401128231373, + "loss": 0.7785, + "step": 487 + }, + { + "epoch": 0.26026666666666665, + "grad_norm": 0.5699962373055503, + "learning_rate": 0.00017352309262509894, + "loss": 0.8766, + "step": 488 + }, + { + "epoch": 0.2608, + "grad_norm": 0.4874255045850295, + "learning_rate": 0.00017340585287594604, + "loss": 0.8068, + "step": 489 + }, + { + "epoch": 0.2613333333333333, + "grad_norm": 0.4963726627949967, + "learning_rate": 0.0001732883939257742, + "loss": 0.8507, + "step": 490 + }, + { + "epoch": 0.2618666666666667, + "grad_norm": 0.5182793668484437, + "learning_rate": 0.0001731707161253338, + "loss": 0.7986, + "step": 491 + }, + { + "epoch": 0.2624, + "grad_norm": 0.5967711305347155, + "learning_rate": 0.0001730528198260285, + "loss": 0.9506, + "step": 492 + }, + { + "epoch": 0.26293333333333335, + "grad_norm": 0.48206999659371763, + "learning_rate": 0.00017293470537991463, + "loss": 0.8616, + "step": 493 + }, + { + "epoch": 0.2634666666666667, + "grad_norm": 0.593173402003062, + "learning_rate": 0.00017281637313969978, + "loss": 0.8472, + "step": 494 + }, + { + "epoch": 0.264, + "grad_norm": 0.52689705134063, + "learning_rate": 0.00017269782345874203, + "loss": 0.8237, + "step": 495 + }, + { + "epoch": 0.26453333333333334, + "grad_norm": 0.40518253160702855, + "learning_rate": 0.00017257905669104874, + "loss": 0.6664, + "step": 496 + }, + { + "epoch": 0.2650666666666667, + "grad_norm": 0.40625602545500733, + "learning_rate": 0.00017246007319127545, + "loss": 0.6755, + "step": 497 + }, + { + "epoch": 0.2656, + "grad_norm": 0.4705409099200104, + "learning_rate": 0.00017234087331472497, + "loss": 0.7428, + "step": 498 + }, + { + "epoch": 0.26613333333333333, + "grad_norm": 0.6227074667393875, + "learning_rate": 0.00017222145741734626, + "loss": 0.923, + "step": 499 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 0.49906766278188625, + "learning_rate": 0.00017210182585573327, + "loss": 0.8099, + "step": 500 + }, + { + "epoch": 0.2672, + "grad_norm": 0.5369831859736955, + "learning_rate": 0.00017198197898712404, + "loss": 0.849, + "step": 501 + }, + { + "epoch": 0.2677333333333333, + "grad_norm": 0.5758348686795218, + "learning_rate": 0.00017186191716939944, + "loss": 0.872, + "step": 502 + }, + { + "epoch": 0.26826666666666665, + "grad_norm": 0.4432309288935212, + "learning_rate": 0.0001717416407610824, + "loss": 0.7717, + "step": 503 + }, + { + "epoch": 0.2688, + "grad_norm": 0.5423226988590671, + "learning_rate": 0.00017162115012133643, + "loss": 0.8917, + "step": 504 + }, + { + "epoch": 0.2693333333333333, + "grad_norm": 0.4924505451254493, + "learning_rate": 0.00017150044560996488, + "loss": 0.7686, + "step": 505 + }, + { + "epoch": 0.26986666666666664, + "grad_norm": 0.45580850054236843, + "learning_rate": 0.00017137952758740978, + "loss": 0.8152, + "step": 506 + }, + { + "epoch": 0.2704, + "grad_norm": 0.5358146105236021, + "learning_rate": 0.00017125839641475072, + "loss": 0.8728, + "step": 507 + }, + { + "epoch": 0.27093333333333336, + "grad_norm": 0.49970281242685627, + "learning_rate": 0.00017113705245370368, + "loss": 0.8341, + "step": 508 + }, + { + "epoch": 0.2714666666666667, + "grad_norm": 0.5905759371849212, + "learning_rate": 0.00017101549606662024, + "loss": 1.0342, + "step": 509 + }, + { + "epoch": 0.272, + "grad_norm": 0.5811480896043241, + "learning_rate": 0.00017089372761648616, + "loss": 0.7844, + "step": 510 + }, + { + "epoch": 0.27253333333333335, + "grad_norm": 0.5151835007482878, + "learning_rate": 0.00017077174746692056, + "loss": 0.7968, + "step": 511 + }, + { + "epoch": 0.2730666666666667, + "grad_norm": 0.5059531882182916, + "learning_rate": 0.00017064955598217462, + "loss": 0.8128, + "step": 512 + }, + { + "epoch": 0.2736, + "grad_norm": 0.4574818857154386, + "learning_rate": 0.00017052715352713075, + "loss": 0.7983, + "step": 513 + }, + { + "epoch": 0.27413333333333334, + "grad_norm": 0.47905678836689186, + "learning_rate": 0.00017040454046730115, + "loss": 0.8213, + "step": 514 + }, + { + "epoch": 0.27466666666666667, + "grad_norm": 0.6704893566972991, + "learning_rate": 0.00017028171716882714, + "loss": 1.0397, + "step": 515 + }, + { + "epoch": 0.2752, + "grad_norm": 0.4277287971472794, + "learning_rate": 0.00017015868399847768, + "loss": 0.7041, + "step": 516 + }, + { + "epoch": 0.27573333333333333, + "grad_norm": 0.7264039727777464, + "learning_rate": 0.00017003544132364846, + "loss": 0.8519, + "step": 517 + }, + { + "epoch": 0.27626666666666666, + "grad_norm": 0.4691708986843132, + "learning_rate": 0.00016991198951236088, + "loss": 0.8379, + "step": 518 + }, + { + "epoch": 0.2768, + "grad_norm": 0.6577844600409193, + "learning_rate": 0.00016978832893326074, + "loss": 0.9765, + "step": 519 + }, + { + "epoch": 0.2773333333333333, + "grad_norm": 0.5412708227465047, + "learning_rate": 0.00016966445995561727, + "loss": 0.7459, + "step": 520 + }, + { + "epoch": 0.27786666666666665, + "grad_norm": 0.4638379143755818, + "learning_rate": 0.00016954038294932216, + "loss": 0.7704, + "step": 521 + }, + { + "epoch": 0.2784, + "grad_norm": 0.5719164794068985, + "learning_rate": 0.00016941609828488807, + "loss": 0.8899, + "step": 522 + }, + { + "epoch": 0.2789333333333333, + "grad_norm": 0.44866976447629964, + "learning_rate": 0.0001692916063334479, + "loss": 0.7286, + "step": 523 + }, + { + "epoch": 0.27946666666666664, + "grad_norm": 0.5435761773073292, + "learning_rate": 0.0001691669074667535, + "loss": 0.8717, + "step": 524 + }, + { + "epoch": 0.28, + "grad_norm": 0.5419190131475101, + "learning_rate": 0.0001690420020571747, + "loss": 0.8028, + "step": 525 + }, + { + "epoch": 0.28053333333333336, + "grad_norm": 0.5653884038342555, + "learning_rate": 0.0001689168904776979, + "loss": 0.8189, + "step": 526 + }, + { + "epoch": 0.2810666666666667, + "grad_norm": 0.47985429486474734, + "learning_rate": 0.00016879157310192535, + "loss": 0.8004, + "step": 527 + }, + { + "epoch": 0.2816, + "grad_norm": 0.45725501060587703, + "learning_rate": 0.0001686660503040737, + "loss": 0.7343, + "step": 528 + }, + { + "epoch": 0.28213333333333335, + "grad_norm": 0.7636790180687056, + "learning_rate": 0.00016854032245897308, + "loss": 0.9407, + "step": 529 + }, + { + "epoch": 0.2826666666666667, + "grad_norm": 0.6454673200210016, + "learning_rate": 0.00016841438994206595, + "loss": 0.9414, + "step": 530 + }, + { + "epoch": 0.2832, + "grad_norm": 0.5018566675451702, + "learning_rate": 0.00016828825312940592, + "loss": 0.8254, + "step": 531 + }, + { + "epoch": 0.28373333333333334, + "grad_norm": 0.5313927747180329, + "learning_rate": 0.00016816191239765667, + "loss": 0.8057, + "step": 532 + }, + { + "epoch": 0.28426666666666667, + "grad_norm": 0.5257917504923237, + "learning_rate": 0.00016803536812409075, + "loss": 0.7753, + "step": 533 + }, + { + "epoch": 0.2848, + "grad_norm": 0.5240194109151427, + "learning_rate": 0.0001679086206865886, + "loss": 0.7936, + "step": 534 + }, + { + "epoch": 0.2853333333333333, + "grad_norm": 0.535866550069367, + "learning_rate": 0.00016778167046363734, + "loss": 0.8626, + "step": 535 + }, + { + "epoch": 0.28586666666666666, + "grad_norm": 0.48948103579799007, + "learning_rate": 0.00016765451783432953, + "loss": 0.796, + "step": 536 + }, + { + "epoch": 0.2864, + "grad_norm": 0.47524504416647967, + "learning_rate": 0.00016752716317836229, + "loss": 0.8374, + "step": 537 + }, + { + "epoch": 0.2869333333333333, + "grad_norm": 0.5012843780074685, + "learning_rate": 0.0001673996068760359, + "loss": 0.7437, + "step": 538 + }, + { + "epoch": 0.28746666666666665, + "grad_norm": 0.5262813096929368, + "learning_rate": 0.00016727184930825288, + "loss": 0.8007, + "step": 539 + }, + { + "epoch": 0.288, + "grad_norm": 0.47653902866981207, + "learning_rate": 0.0001671438908565167, + "loss": 0.8214, + "step": 540 + }, + { + "epoch": 0.2885333333333333, + "grad_norm": 0.5172011787522601, + "learning_rate": 0.00016701573190293077, + "loss": 0.8412, + "step": 541 + }, + { + "epoch": 0.2890666666666667, + "grad_norm": 0.5149235899195433, + "learning_rate": 0.00016688737283019706, + "loss": 0.8536, + "step": 542 + }, + { + "epoch": 0.2896, + "grad_norm": 0.4427009159931128, + "learning_rate": 0.00016675881402161536, + "loss": 0.759, + "step": 543 + }, + { + "epoch": 0.29013333333333335, + "grad_norm": 0.5292275381933234, + "learning_rate": 0.00016663005586108176, + "loss": 0.8827, + "step": 544 + }, + { + "epoch": 0.2906666666666667, + "grad_norm": 0.582521020164629, + "learning_rate": 0.00016650109873308765, + "loss": 0.8743, + "step": 545 + }, + { + "epoch": 0.2912, + "grad_norm": 0.5069519148688234, + "learning_rate": 0.0001663719430227186, + "loss": 0.8831, + "step": 546 + }, + { + "epoch": 0.29173333333333334, + "grad_norm": 0.48385860192439323, + "learning_rate": 0.0001662425891156531, + "loss": 0.8062, + "step": 547 + }, + { + "epoch": 0.2922666666666667, + "grad_norm": 0.4852897385486229, + "learning_rate": 0.00016611303739816168, + "loss": 0.7802, + "step": 548 + }, + { + "epoch": 0.2928, + "grad_norm": 0.42645874053163363, + "learning_rate": 0.00016598328825710533, + "loss": 0.7051, + "step": 549 + }, + { + "epoch": 0.29333333333333333, + "grad_norm": 0.49064710613686185, + "learning_rate": 0.00016585334207993476, + "loss": 0.7964, + "step": 550 + }, + { + "epoch": 0.29386666666666666, + "grad_norm": 0.5401695976111952, + "learning_rate": 0.00016572319925468892, + "loss": 0.8259, + "step": 551 + }, + { + "epoch": 0.2944, + "grad_norm": 0.5375868692082563, + "learning_rate": 0.000165592860169994, + "loss": 0.829, + "step": 552 + }, + { + "epoch": 0.2949333333333333, + "grad_norm": 0.5811058934400566, + "learning_rate": 0.0001654623252150624, + "loss": 0.7528, + "step": 553 + }, + { + "epoch": 0.29546666666666666, + "grad_norm": 0.5069164637571755, + "learning_rate": 0.00016533159477969122, + "loss": 0.812, + "step": 554 + }, + { + "epoch": 0.296, + "grad_norm": 0.5728008304872453, + "learning_rate": 0.00016520066925426144, + "loss": 0.8032, + "step": 555 + }, + { + "epoch": 0.2965333333333333, + "grad_norm": 0.4091892462358017, + "learning_rate": 0.00016506954902973655, + "loss": 0.7931, + "step": 556 + }, + { + "epoch": 0.29706666666666665, + "grad_norm": 0.4679162706641364, + "learning_rate": 0.00016493823449766136, + "loss": 0.8033, + "step": 557 + }, + { + "epoch": 0.2976, + "grad_norm": 0.5264838841353725, + "learning_rate": 0.0001648067260501611, + "loss": 0.7719, + "step": 558 + }, + { + "epoch": 0.2981333333333333, + "grad_norm": 0.7123429880382754, + "learning_rate": 0.00016467502407993992, + "loss": 0.8529, + "step": 559 + }, + { + "epoch": 0.2986666666666667, + "grad_norm": 0.5137572290704788, + "learning_rate": 0.0001645431289802799, + "loss": 0.8076, + "step": 560 + }, + { + "epoch": 0.2992, + "grad_norm": 0.5455463754821794, + "learning_rate": 0.0001644110411450398, + "loss": 0.8327, + "step": 561 + }, + { + "epoch": 0.29973333333333335, + "grad_norm": 0.5074806693324961, + "learning_rate": 0.00016427876096865394, + "loss": 0.8404, + "step": 562 + }, + { + "epoch": 0.3002666666666667, + "grad_norm": 0.5149714849884053, + "learning_rate": 0.00016414628884613107, + "loss": 0.8242, + "step": 563 + }, + { + "epoch": 0.3008, + "grad_norm": 0.5287866259312569, + "learning_rate": 0.00016401362517305296, + "loss": 0.7885, + "step": 564 + }, + { + "epoch": 0.30133333333333334, + "grad_norm": 0.4810071947579147, + "learning_rate": 0.00016388077034557355, + "loss": 0.7658, + "step": 565 + }, + { + "epoch": 0.30186666666666667, + "grad_norm": 0.5921198116749884, + "learning_rate": 0.00016374772476041748, + "loss": 0.8433, + "step": 566 + }, + { + "epoch": 0.3024, + "grad_norm": 0.48809684178386664, + "learning_rate": 0.00016361448881487914, + "loss": 0.7758, + "step": 567 + }, + { + "epoch": 0.30293333333333333, + "grad_norm": 0.4838177625207053, + "learning_rate": 0.00016348106290682118, + "loss": 0.7975, + "step": 568 + }, + { + "epoch": 0.30346666666666666, + "grad_norm": 0.5130253612052892, + "learning_rate": 0.00016334744743467364, + "loss": 0.7705, + "step": 569 + }, + { + "epoch": 0.304, + "grad_norm": 0.47539445596553065, + "learning_rate": 0.00016321364279743266, + "loss": 0.7383, + "step": 570 + }, + { + "epoch": 0.3045333333333333, + "grad_norm": 0.4787554596238986, + "learning_rate": 0.00016307964939465914, + "loss": 0.7523, + "step": 571 + }, + { + "epoch": 0.30506666666666665, + "grad_norm": 0.5674834308198589, + "learning_rate": 0.00016294546762647775, + "loss": 0.7946, + "step": 572 + }, + { + "epoch": 0.3056, + "grad_norm": 0.45646133783305565, + "learning_rate": 0.0001628110978935756, + "loss": 0.7657, + "step": 573 + }, + { + "epoch": 0.3061333333333333, + "grad_norm": 0.5614694326739641, + "learning_rate": 0.0001626765405972011, + "loss": 0.8507, + "step": 574 + }, + { + "epoch": 0.30666666666666664, + "grad_norm": 0.5014240113047879, + "learning_rate": 0.00016254179613916278, + "loss": 0.7218, + "step": 575 + }, + { + "epoch": 0.3072, + "grad_norm": 0.4472389676560879, + "learning_rate": 0.00016240686492182804, + "loss": 0.804, + "step": 576 + }, + { + "epoch": 0.30773333333333336, + "grad_norm": 0.6064763208584959, + "learning_rate": 0.000162271747348122, + "loss": 0.8987, + "step": 577 + }, + { + "epoch": 0.3082666666666667, + "grad_norm": 0.7119278693007127, + "learning_rate": 0.0001621364438215262, + "loss": 0.9979, + "step": 578 + }, + { + "epoch": 0.3088, + "grad_norm": 0.5026387345978868, + "learning_rate": 0.00016200095474607753, + "loss": 0.7743, + "step": 579 + }, + { + "epoch": 0.30933333333333335, + "grad_norm": 0.5526951799907838, + "learning_rate": 0.00016186528052636692, + "loss": 0.8137, + "step": 580 + }, + { + "epoch": 0.3098666666666667, + "grad_norm": 0.4927055292327108, + "learning_rate": 0.0001617294215675382, + "loss": 0.8203, + "step": 581 + }, + { + "epoch": 0.3104, + "grad_norm": 0.48074662552300734, + "learning_rate": 0.00016159337827528685, + "loss": 0.7488, + "step": 582 + }, + { + "epoch": 0.31093333333333334, + "grad_norm": 0.6877010020255859, + "learning_rate": 0.0001614571510558588, + "loss": 0.9138, + "step": 583 + }, + { + "epoch": 0.31146666666666667, + "grad_norm": 0.4313484746703689, + "learning_rate": 0.00016132074031604917, + "loss": 0.7364, + "step": 584 + }, + { + "epoch": 0.312, + "grad_norm": 0.47636978018532844, + "learning_rate": 0.0001611841464632011, + "loss": 0.7957, + "step": 585 + }, + { + "epoch": 0.31253333333333333, + "grad_norm": 0.5055420311446571, + "learning_rate": 0.00016104736990520468, + "loss": 0.8562, + "step": 586 + }, + { + "epoch": 0.31306666666666666, + "grad_norm": 0.5531988181381383, + "learning_rate": 0.0001609104110504954, + "loss": 0.8923, + "step": 587 + }, + { + "epoch": 0.3136, + "grad_norm": 0.4344709603231299, + "learning_rate": 0.0001607732703080532, + "loss": 0.7367, + "step": 588 + }, + { + "epoch": 0.3141333333333333, + "grad_norm": 0.5813064802188114, + "learning_rate": 0.00016063594808740113, + "loss": 0.9447, + "step": 589 + }, + { + "epoch": 0.31466666666666665, + "grad_norm": 0.5993626462305357, + "learning_rate": 0.00016049844479860422, + "loss": 0.9174, + "step": 590 + }, + { + "epoch": 0.3152, + "grad_norm": 0.5772228866511909, + "learning_rate": 0.00016036076085226814, + "loss": 0.8579, + "step": 591 + }, + { + "epoch": 0.3157333333333333, + "grad_norm": 0.4990727842899779, + "learning_rate": 0.00016022289665953808, + "loss": 0.7831, + "step": 592 + }, + { + "epoch": 0.31626666666666664, + "grad_norm": 0.43021107184507557, + "learning_rate": 0.00016008485263209742, + "loss": 0.6802, + "step": 593 + }, + { + "epoch": 0.3168, + "grad_norm": 0.5985180624025509, + "learning_rate": 0.0001599466291821666, + "loss": 0.819, + "step": 594 + }, + { + "epoch": 0.31733333333333336, + "grad_norm": 0.49020502952037, + "learning_rate": 0.0001598082267225018, + "loss": 0.7572, + "step": 595 + }, + { + "epoch": 0.3178666666666667, + "grad_norm": 0.5184831439018202, + "learning_rate": 0.0001596696456663938, + "loss": 0.833, + "step": 596 + }, + { + "epoch": 0.3184, + "grad_norm": 0.49554259894650116, + "learning_rate": 0.0001595308864276666, + "loss": 0.7604, + "step": 597 + }, + { + "epoch": 0.31893333333333335, + "grad_norm": 0.4433760973357246, + "learning_rate": 0.00015939194942067646, + "loss": 0.7036, + "step": 598 + }, + { + "epoch": 0.3194666666666667, + "grad_norm": 0.526481275834099, + "learning_rate": 0.0001592528350603103, + "loss": 0.8491, + "step": 599 + }, + { + "epoch": 0.32, + "grad_norm": 0.47061940544784364, + "learning_rate": 0.0001591135437619847, + "loss": 0.861, + "step": 600 + }, + { + "epoch": 0.32053333333333334, + "grad_norm": 0.5934063587664357, + "learning_rate": 0.00015897407594164467, + "loss": 0.8761, + "step": 601 + }, + { + "epoch": 0.32106666666666667, + "grad_norm": 0.6163230108439915, + "learning_rate": 0.00015883443201576225, + "loss": 0.852, + "step": 602 + }, + { + "epoch": 0.3216, + "grad_norm": 0.3378322500199042, + "learning_rate": 0.0001586946124013354, + "loss": 0.5935, + "step": 603 + }, + { + "epoch": 0.3221333333333333, + "grad_norm": 0.4232325695110825, + "learning_rate": 0.00015855461751588677, + "loss": 0.7599, + "step": 604 + }, + { + "epoch": 0.32266666666666666, + "grad_norm": 0.4970253850681227, + "learning_rate": 0.0001584144477774623, + "loss": 0.8726, + "step": 605 + }, + { + "epoch": 0.3232, + "grad_norm": 0.45270525084655294, + "learning_rate": 0.0001582741036046301, + "loss": 0.7714, + "step": 606 + }, + { + "epoch": 0.3237333333333333, + "grad_norm": 0.5344002172985072, + "learning_rate": 0.00015813358541647915, + "loss": 0.8408, + "step": 607 + }, + { + "epoch": 0.32426666666666665, + "grad_norm": 0.756368936702217, + "learning_rate": 0.00015799289363261813, + "loss": 1.0281, + "step": 608 + }, + { + "epoch": 0.3248, + "grad_norm": 0.5285174953413889, + "learning_rate": 0.00015785202867317407, + "loss": 0.8283, + "step": 609 + }, + { + "epoch": 0.3253333333333333, + "grad_norm": 0.5411347592986606, + "learning_rate": 0.00015771099095879108, + "loss": 0.8734, + "step": 610 + }, + { + "epoch": 0.3258666666666667, + "grad_norm": 0.4110227441690126, + "learning_rate": 0.0001575697809106292, + "loss": 0.7238, + "step": 611 + }, + { + "epoch": 0.3264, + "grad_norm": 0.6084585821051274, + "learning_rate": 0.00015742839895036305, + "loss": 0.8188, + "step": 612 + }, + { + "epoch": 0.32693333333333335, + "grad_norm": 0.5395012175676103, + "learning_rate": 0.00015728684550018064, + "loss": 0.8572, + "step": 613 + }, + { + "epoch": 0.3274666666666667, + "grad_norm": 0.5992401267533382, + "learning_rate": 0.0001571451209827821, + "loss": 0.8908, + "step": 614 + }, + { + "epoch": 0.328, + "grad_norm": 0.43236326156039784, + "learning_rate": 0.00015700322582137827, + "loss": 0.7862, + "step": 615 + }, + { + "epoch": 0.32853333333333334, + "grad_norm": 0.4946043648550045, + "learning_rate": 0.00015686116043968972, + "loss": 0.7476, + "step": 616 + }, + { + "epoch": 0.3290666666666667, + "grad_norm": 0.5184618203964196, + "learning_rate": 0.00015671892526194516, + "loss": 0.779, + "step": 617 + }, + { + "epoch": 0.3296, + "grad_norm": 0.49856805007399935, + "learning_rate": 0.0001565765207128805, + "loss": 0.7748, + "step": 618 + }, + { + "epoch": 0.33013333333333333, + "grad_norm": 0.4223495185880146, + "learning_rate": 0.0001564339472177373, + "loss": 0.7711, + "step": 619 + }, + { + "epoch": 0.33066666666666666, + "grad_norm": 0.46328799644070096, + "learning_rate": 0.00015629120520226165, + "loss": 0.773, + "step": 620 + }, + { + "epoch": 0.3312, + "grad_norm": 0.596285576103134, + "learning_rate": 0.0001561482950927029, + "loss": 0.87, + "step": 621 + }, + { + "epoch": 0.3317333333333333, + "grad_norm": 0.5081533212758815, + "learning_rate": 0.0001560052173158123, + "loss": 0.8247, + "step": 622 + }, + { + "epoch": 0.33226666666666665, + "grad_norm": 0.5975594987336019, + "learning_rate": 0.00015586197229884184, + "loss": 0.7307, + "step": 623 + }, + { + "epoch": 0.3328, + "grad_norm": 0.49422845682486166, + "learning_rate": 0.00015571856046954285, + "loss": 0.9156, + "step": 624 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.48969307550494445, + "learning_rate": 0.00015557498225616487, + "loss": 0.8016, + "step": 625 + }, + { + "epoch": 0.33386666666666664, + "grad_norm": 0.44535271675758226, + "learning_rate": 0.0001554312380874542, + "loss": 0.7971, + "step": 626 + }, + { + "epoch": 0.3344, + "grad_norm": 0.41569141825306105, + "learning_rate": 0.00015528732839265272, + "loss": 0.7058, + "step": 627 + }, + { + "epoch": 0.33493333333333336, + "grad_norm": 0.4442526091486681, + "learning_rate": 0.00015514325360149668, + "loss": 0.756, + "step": 628 + }, + { + "epoch": 0.3354666666666667, + "grad_norm": 0.43241219268770614, + "learning_rate": 0.0001549990141442153, + "loss": 0.7774, + "step": 629 + }, + { + "epoch": 0.336, + "grad_norm": 0.551195864295374, + "learning_rate": 0.0001548546104515294, + "loss": 0.867, + "step": 630 + }, + { + "epoch": 0.33653333333333335, + "grad_norm": 0.3704138442913533, + "learning_rate": 0.00015471004295465035, + "loss": 0.669, + "step": 631 + }, + { + "epoch": 0.3370666666666667, + "grad_norm": 0.42772476117022945, + "learning_rate": 0.0001545653120852787, + "loss": 0.8204, + "step": 632 + }, + { + "epoch": 0.3376, + "grad_norm": 0.5019012824057236, + "learning_rate": 0.00015442041827560274, + "loss": 0.7893, + "step": 633 + }, + { + "epoch": 0.33813333333333334, + "grad_norm": 0.5462200393424741, + "learning_rate": 0.00015427536195829742, + "loss": 0.8098, + "step": 634 + }, + { + "epoch": 0.33866666666666667, + "grad_norm": 0.4607324445006312, + "learning_rate": 0.00015413014356652286, + "loss": 0.8057, + "step": 635 + }, + { + "epoch": 0.3392, + "grad_norm": 0.8083059211716902, + "learning_rate": 0.00015398476353392323, + "loss": 1.0064, + "step": 636 + }, + { + "epoch": 0.33973333333333333, + "grad_norm": 0.6487825918852744, + "learning_rate": 0.00015383922229462549, + "loss": 0.9977, + "step": 637 + }, + { + "epoch": 0.34026666666666666, + "grad_norm": 0.5888023885057454, + "learning_rate": 0.00015369352028323774, + "loss": 0.8519, + "step": 638 + }, + { + "epoch": 0.3408, + "grad_norm": 0.469175184433065, + "learning_rate": 0.00015354765793484834, + "loss": 0.7438, + "step": 639 + }, + { + "epoch": 0.3413333333333333, + "grad_norm": 0.5278850865079271, + "learning_rate": 0.0001534016356850244, + "loss": 0.8291, + "step": 640 + }, + { + "epoch": 0.34186666666666665, + "grad_norm": 0.51675783823404, + "learning_rate": 0.0001532554539698105, + "loss": 0.7848, + "step": 641 + }, + { + "epoch": 0.3424, + "grad_norm": 0.7178680876928034, + "learning_rate": 0.00015310911322572753, + "loss": 0.9272, + "step": 642 + }, + { + "epoch": 0.3429333333333333, + "grad_norm": 0.517804814528721, + "learning_rate": 0.00015296261388977108, + "loss": 0.7646, + "step": 643 + }, + { + "epoch": 0.34346666666666664, + "grad_norm": 0.47377186429963813, + "learning_rate": 0.0001528159563994104, + "loss": 0.689, + "step": 644 + }, + { + "epoch": 0.344, + "grad_norm": 0.4875184855956417, + "learning_rate": 0.000152669141192587, + "loss": 0.7916, + "step": 645 + }, + { + "epoch": 0.34453333333333336, + "grad_norm": 0.7105916731272629, + "learning_rate": 0.00015252216870771345, + "loss": 0.8881, + "step": 646 + }, + { + "epoch": 0.3450666666666667, + "grad_norm": 0.742048351965138, + "learning_rate": 0.00015237503938367186, + "loss": 0.8945, + "step": 647 + }, + { + "epoch": 0.3456, + "grad_norm": 0.6408990406108908, + "learning_rate": 0.00015222775365981273, + "loss": 0.9067, + "step": 648 + }, + { + "epoch": 0.34613333333333335, + "grad_norm": 0.4579451451060999, + "learning_rate": 0.00015208031197595356, + "loss": 0.761, + "step": 649 + }, + { + "epoch": 0.3466666666666667, + "grad_norm": 0.5375881455685046, + "learning_rate": 0.0001519327147723776, + "loss": 0.774, + "step": 650 + }, + { + "epoch": 0.3472, + "grad_norm": 0.5690127553057223, + "learning_rate": 0.00015178496248983254, + "loss": 0.8267, + "step": 651 + }, + { + "epoch": 0.34773333333333334, + "grad_norm": 0.5525921666015885, + "learning_rate": 0.0001516370555695291, + "loss": 0.8956, + "step": 652 + }, + { + "epoch": 0.34826666666666667, + "grad_norm": 0.43057917172296345, + "learning_rate": 0.00015148899445313981, + "loss": 0.6629, + "step": 653 + }, + { + "epoch": 0.3488, + "grad_norm": 0.5369851823892494, + "learning_rate": 0.00015134077958279765, + "loss": 0.8473, + "step": 654 + }, + { + "epoch": 0.34933333333333333, + "grad_norm": 0.4421763523413578, + "learning_rate": 0.00015119241140109467, + "loss": 0.7547, + "step": 655 + }, + { + "epoch": 0.34986666666666666, + "grad_norm": 0.6683809588214158, + "learning_rate": 0.00015104389035108077, + "loss": 0.8685, + "step": 656 + }, + { + "epoch": 0.3504, + "grad_norm": 0.424323287318125, + "learning_rate": 0.00015089521687626243, + "loss": 0.7535, + "step": 657 + }, + { + "epoch": 0.3509333333333333, + "grad_norm": 0.7285451043404508, + "learning_rate": 0.0001507463914206012, + "loss": 0.9519, + "step": 658 + }, + { + "epoch": 0.35146666666666665, + "grad_norm": 0.48670522315608944, + "learning_rate": 0.0001505974144285124, + "loss": 0.7798, + "step": 659 + }, + { + "epoch": 0.352, + "grad_norm": 0.5148457431572767, + "learning_rate": 0.000150448286344864, + "loss": 0.8448, + "step": 660 + }, + { + "epoch": 0.3525333333333333, + "grad_norm": 0.46087000378455195, + "learning_rate": 0.00015029900761497506, + "loss": 0.8342, + "step": 661 + }, + { + "epoch": 0.35306666666666664, + "grad_norm": 0.5310656086371572, + "learning_rate": 0.00015014957868461458, + "loss": 0.8356, + "step": 662 + }, + { + "epoch": 0.3536, + "grad_norm": 0.5934194459135458, + "learning_rate": 0.00015000000000000001, + "loss": 0.9335, + "step": 663 + }, + { + "epoch": 0.35413333333333336, + "grad_norm": 0.5346786798201316, + "learning_rate": 0.000149850272007796, + "loss": 0.7713, + "step": 664 + }, + { + "epoch": 0.3546666666666667, + "grad_norm": 0.58607560514104, + "learning_rate": 0.00014970039515511304, + "loss": 0.9632, + "step": 665 + }, + { + "epoch": 0.3552, + "grad_norm": 0.609226022433584, + "learning_rate": 0.00014955036988950618, + "loss": 0.7857, + "step": 666 + }, + { + "epoch": 0.35573333333333335, + "grad_norm": 0.5858962119631774, + "learning_rate": 0.0001494001966589736, + "loss": 0.8422, + "step": 667 + }, + { + "epoch": 0.3562666666666667, + "grad_norm": 0.5898803646397386, + "learning_rate": 0.00014924987591195547, + "loss": 0.8418, + "step": 668 + }, + { + "epoch": 0.3568, + "grad_norm": 0.4517854783386305, + "learning_rate": 0.00014909940809733222, + "loss": 0.7254, + "step": 669 + }, + { + "epoch": 0.35733333333333334, + "grad_norm": 0.5345428463035593, + "learning_rate": 0.0001489487936644237, + "loss": 0.8547, + "step": 670 + }, + { + "epoch": 0.35786666666666667, + "grad_norm": 0.5636075683541205, + "learning_rate": 0.00014879803306298736, + "loss": 0.8664, + "step": 671 + }, + { + "epoch": 0.3584, + "grad_norm": 0.530351789134002, + "learning_rate": 0.00014864712674321734, + "loss": 0.68, + "step": 672 + }, + { + "epoch": 0.3589333333333333, + "grad_norm": 0.6160935335239659, + "learning_rate": 0.00014849607515574276, + "loss": 0.9196, + "step": 673 + }, + { + "epoch": 0.35946666666666666, + "grad_norm": 0.5643390099503108, + "learning_rate": 0.00014834487875162657, + "loss": 0.9024, + "step": 674 + }, + { + "epoch": 0.36, + "grad_norm": 0.4807419538502926, + "learning_rate": 0.00014819353798236427, + "loss": 0.7408, + "step": 675 + }, + { + "epoch": 0.3605333333333333, + "grad_norm": 0.5531256861760592, + "learning_rate": 0.00014804205329988225, + "loss": 0.8567, + "step": 676 + }, + { + "epoch": 0.36106666666666665, + "grad_norm": 0.4718076265230407, + "learning_rate": 0.00014789042515653687, + "loss": 0.7591, + "step": 677 + }, + { + "epoch": 0.3616, + "grad_norm": 0.4149949168385742, + "learning_rate": 0.00014773865400511272, + "loss": 0.7399, + "step": 678 + }, + { + "epoch": 0.3621333333333333, + "grad_norm": 0.5035941906366352, + "learning_rate": 0.00014758674029882152, + "loss": 0.8239, + "step": 679 + }, + { + "epoch": 0.3626666666666667, + "grad_norm": 0.5349329534928783, + "learning_rate": 0.00014743468449130063, + "loss": 0.8727, + "step": 680 + }, + { + "epoch": 0.3632, + "grad_norm": 0.5070941608741568, + "learning_rate": 0.00014728248703661182, + "loss": 0.8018, + "step": 681 + }, + { + "epoch": 0.36373333333333335, + "grad_norm": 0.49216670496778525, + "learning_rate": 0.00014713014838923976, + "loss": 0.7589, + "step": 682 + }, + { + "epoch": 0.3642666666666667, + "grad_norm": 0.5761438336408812, + "learning_rate": 0.00014697766900409074, + "loss": 0.9939, + "step": 683 + }, + { + "epoch": 0.3648, + "grad_norm": 0.46972496509699546, + "learning_rate": 0.00014682504933649144, + "loss": 0.7172, + "step": 684 + }, + { + "epoch": 0.36533333333333334, + "grad_norm": 0.4941160902091317, + "learning_rate": 0.0001466722898421873, + "loss": 0.8086, + "step": 685 + }, + { + "epoch": 0.3658666666666667, + "grad_norm": 0.5550066709819924, + "learning_rate": 0.0001465193909773413, + "loss": 0.8026, + "step": 686 + }, + { + "epoch": 0.3664, + "grad_norm": 0.6072781403801273, + "learning_rate": 0.00014636635319853275, + "loss": 0.8696, + "step": 687 + }, + { + "epoch": 0.36693333333333333, + "grad_norm": 0.5615374399243932, + "learning_rate": 0.00014621317696275564, + "loss": 0.8344, + "step": 688 + }, + { + "epoch": 0.36746666666666666, + "grad_norm": 0.5570377881327896, + "learning_rate": 0.00014605986272741748, + "loss": 0.7939, + "step": 689 + }, + { + "epoch": 0.368, + "grad_norm": 0.524379630946479, + "learning_rate": 0.00014590641095033787, + "loss": 0.7864, + "step": 690 + }, + { + "epoch": 0.3685333333333333, + "grad_norm": 0.571881857654052, + "learning_rate": 0.00014575282208974702, + "loss": 0.8803, + "step": 691 + }, + { + "epoch": 0.36906666666666665, + "grad_norm": 0.4326176104183051, + "learning_rate": 0.00014559909660428468, + "loss": 0.7871, + "step": 692 + }, + { + "epoch": 0.3696, + "grad_norm": 0.5186171184185153, + "learning_rate": 0.00014544523495299842, + "loss": 0.7266, + "step": 693 + }, + { + "epoch": 0.3701333333333333, + "grad_norm": 0.596725330052778, + "learning_rate": 0.00014529123759534255, + "loss": 0.8192, + "step": 694 + }, + { + "epoch": 0.37066666666666664, + "grad_norm": 0.5459542972673567, + "learning_rate": 0.00014513710499117647, + "loss": 0.8215, + "step": 695 + }, + { + "epoch": 0.3712, + "grad_norm": 0.42351102934213064, + "learning_rate": 0.0001449828376007636, + "loss": 0.7529, + "step": 696 + }, + { + "epoch": 0.37173333333333336, + "grad_norm": 0.5304552971293983, + "learning_rate": 0.00014482843588476974, + "loss": 0.866, + "step": 697 + }, + { + "epoch": 0.3722666666666667, + "grad_norm": 0.4713690466074876, + "learning_rate": 0.00014467390030426186, + "loss": 0.7434, + "step": 698 + }, + { + "epoch": 0.3728, + "grad_norm": 0.49382378877514266, + "learning_rate": 0.0001445192313207067, + "loss": 0.8755, + "step": 699 + }, + { + "epoch": 0.37333333333333335, + "grad_norm": 0.45454409178905214, + "learning_rate": 0.0001443644293959693, + "loss": 0.784, + "step": 700 + }, + { + "epoch": 0.3738666666666667, + "grad_norm": 0.5601592151502356, + "learning_rate": 0.00014420949499231172, + "loss": 0.8895, + "step": 701 + }, + { + "epoch": 0.3744, + "grad_norm": 0.52555648303496, + "learning_rate": 0.0001440544285723915, + "loss": 0.8281, + "step": 702 + }, + { + "epoch": 0.37493333333333334, + "grad_norm": 0.644756954068664, + "learning_rate": 0.00014389923059926062, + "loss": 0.9863, + "step": 703 + }, + { + "epoch": 0.37546666666666667, + "grad_norm": 0.6340066876753365, + "learning_rate": 0.0001437439015363638, + "loss": 0.8258, + "step": 704 + }, + { + "epoch": 0.376, + "grad_norm": 0.5395727291816608, + "learning_rate": 0.00014358844184753712, + "loss": 0.9003, + "step": 705 + }, + { + "epoch": 0.37653333333333333, + "grad_norm": 0.7074307047526871, + "learning_rate": 0.00014343285199700683, + "loss": 0.9401, + "step": 706 + }, + { + "epoch": 0.37706666666666666, + "grad_norm": 0.5764662285713409, + "learning_rate": 0.0001432771324493879, + "loss": 0.7597, + "step": 707 + }, + { + "epoch": 0.3776, + "grad_norm": 0.46830365827528964, + "learning_rate": 0.00014312128366968243, + "loss": 0.6924, + "step": 708 + }, + { + "epoch": 0.3781333333333333, + "grad_norm": 0.7437748252665622, + "learning_rate": 0.00014296530612327863, + "loss": 0.9863, + "step": 709 + }, + { + "epoch": 0.37866666666666665, + "grad_norm": 0.5646809541546257, + "learning_rate": 0.00014280920027594907, + "loss": 0.8527, + "step": 710 + }, + { + "epoch": 0.3792, + "grad_norm": 0.6377326999288427, + "learning_rate": 0.00014265296659384956, + "loss": 0.9223, + "step": 711 + }, + { + "epoch": 0.3797333333333333, + "grad_norm": 0.4754566065020194, + "learning_rate": 0.00014249660554351752, + "loss": 0.7673, + "step": 712 + }, + { + "epoch": 0.38026666666666664, + "grad_norm": 0.5111905541845883, + "learning_rate": 0.00014234011759187083, + "loss": 0.8718, + "step": 713 + }, + { + "epoch": 0.3808, + "grad_norm": 0.5313472637050075, + "learning_rate": 0.00014218350320620624, + "loss": 0.8734, + "step": 714 + }, + { + "epoch": 0.38133333333333336, + "grad_norm": 0.5894268673196887, + "learning_rate": 0.00014202676285419812, + "loss": 0.9144, + "step": 715 + }, + { + "epoch": 0.3818666666666667, + "grad_norm": 0.5459978241336768, + "learning_rate": 0.00014186989700389687, + "loss": 0.7703, + "step": 716 + }, + { + "epoch": 0.3824, + "grad_norm": 0.5235275768444314, + "learning_rate": 0.0001417129061237278, + "loss": 0.6861, + "step": 717 + }, + { + "epoch": 0.38293333333333335, + "grad_norm": 0.4377967510855603, + "learning_rate": 0.0001415557906824895, + "loss": 0.7768, + "step": 718 + }, + { + "epoch": 0.3834666666666667, + "grad_norm": 0.4968442166288454, + "learning_rate": 0.00014139855114935252, + "loss": 0.786, + "step": 719 + }, + { + "epoch": 0.384, + "grad_norm": 0.5410674523048824, + "learning_rate": 0.00014124118799385796, + "loss": 0.8808, + "step": 720 + }, + { + "epoch": 0.38453333333333334, + "grad_norm": 0.612217909281079, + "learning_rate": 0.0001410837016859161, + "loss": 0.926, + "step": 721 + }, + { + "epoch": 0.38506666666666667, + "grad_norm": 0.49409111490774654, + "learning_rate": 0.00014092609269580496, + "loss": 0.735, + "step": 722 + }, + { + "epoch": 0.3856, + "grad_norm": 0.4153658816647891, + "learning_rate": 0.00014076836149416887, + "loss": 0.7636, + "step": 723 + }, + { + "epoch": 0.38613333333333333, + "grad_norm": 0.5318606721582779, + "learning_rate": 0.00014061050855201723, + "loss": 0.8346, + "step": 724 + }, + { + "epoch": 0.38666666666666666, + "grad_norm": 0.6498570697531973, + "learning_rate": 0.0001404525343407228, + "loss": 0.9235, + "step": 725 + }, + { + "epoch": 0.3872, + "grad_norm": 0.5251596978694136, + "learning_rate": 0.0001402944393320206, + "loss": 0.7657, + "step": 726 + }, + { + "epoch": 0.3877333333333333, + "grad_norm": 0.43521208573080616, + "learning_rate": 0.00014013622399800627, + "loss": 0.8307, + "step": 727 + }, + { + "epoch": 0.38826666666666665, + "grad_norm": 0.5675765480381267, + "learning_rate": 0.00013997788881113489, + "loss": 0.8893, + "step": 728 + }, + { + "epoch": 0.3888, + "grad_norm": 0.4457518470697796, + "learning_rate": 0.00013981943424421932, + "loss": 0.7389, + "step": 729 + }, + { + "epoch": 0.3893333333333333, + "grad_norm": 0.6125756999885809, + "learning_rate": 0.0001396608607704289, + "loss": 0.8579, + "step": 730 + }, + { + "epoch": 0.38986666666666664, + "grad_norm": 0.495065197072205, + "learning_rate": 0.0001395021688632882, + "loss": 0.8082, + "step": 731 + }, + { + "epoch": 0.3904, + "grad_norm": 0.5063770097020417, + "learning_rate": 0.00013934335899667527, + "loss": 0.8032, + "step": 732 + }, + { + "epoch": 0.39093333333333335, + "grad_norm": 0.4226249130395913, + "learning_rate": 0.00013918443164482046, + "loss": 0.7202, + "step": 733 + }, + { + "epoch": 0.3914666666666667, + "grad_norm": 0.44610159161640167, + "learning_rate": 0.000139025387282305, + "loss": 0.6598, + "step": 734 + }, + { + "epoch": 0.392, + "grad_norm": 0.49682259220980884, + "learning_rate": 0.00013886622638405952, + "loss": 0.8139, + "step": 735 + }, + { + "epoch": 0.39253333333333335, + "grad_norm": 0.5013280835711356, + "learning_rate": 0.0001387069494253626, + "loss": 0.8026, + "step": 736 + }, + { + "epoch": 0.3930666666666667, + "grad_norm": 0.5183690065958357, + "learning_rate": 0.0001385475568818394, + "loss": 0.8727, + "step": 737 + }, + { + "epoch": 0.3936, + "grad_norm": 0.4603853052273151, + "learning_rate": 0.00013838804922946027, + "loss": 0.7738, + "step": 738 + }, + { + "epoch": 0.39413333333333334, + "grad_norm": 0.6073729194657201, + "learning_rate": 0.00013822842694453924, + "loss": 0.8352, + "step": 739 + }, + { + "epoch": 0.39466666666666667, + "grad_norm": 0.6068524072143933, + "learning_rate": 0.0001380686905037327, + "loss": 0.8887, + "step": 740 + }, + { + "epoch": 0.3952, + "grad_norm": 0.5191397985291558, + "learning_rate": 0.00013790884038403795, + "loss": 0.8078, + "step": 741 + }, + { + "epoch": 0.3957333333333333, + "grad_norm": 0.5641609132857682, + "learning_rate": 0.00013774887706279165, + "loss": 0.808, + "step": 742 + }, + { + "epoch": 0.39626666666666666, + "grad_norm": 0.5389677760221879, + "learning_rate": 0.0001375888010176686, + "loss": 0.8395, + "step": 743 + }, + { + "epoch": 0.3968, + "grad_norm": 0.5184783334009618, + "learning_rate": 0.00013742861272668012, + "loss": 0.8319, + "step": 744 + }, + { + "epoch": 0.3973333333333333, + "grad_norm": 0.5459629740792883, + "learning_rate": 0.00013726831266817278, + "loss": 0.8212, + "step": 745 + }, + { + "epoch": 0.39786666666666665, + "grad_norm": 0.43376535113344805, + "learning_rate": 0.00013710790132082692, + "loss": 0.8556, + "step": 746 + }, + { + "epoch": 0.3984, + "grad_norm": 0.48247796104572255, + "learning_rate": 0.00013694737916365517, + "loss": 0.8638, + "step": 747 + }, + { + "epoch": 0.3989333333333333, + "grad_norm": 0.476546953886047, + "learning_rate": 0.00013678674667600102, + "loss": 0.7701, + "step": 748 + }, + { + "epoch": 0.3994666666666667, + "grad_norm": 0.5093857000584507, + "learning_rate": 0.00013662600433753745, + "loss": 0.8421, + "step": 749 + }, + { + "epoch": 0.4, + "grad_norm": 0.42181155000414133, + "learning_rate": 0.00013646515262826552, + "loss": 0.6899, + "step": 750 + }, + { + "epoch": 0.40053333333333335, + "grad_norm": 0.4882886499004825, + "learning_rate": 0.00013630419202851284, + "loss": 0.8336, + "step": 751 + }, + { + "epoch": 0.4010666666666667, + "grad_norm": 1.0218606193263766, + "learning_rate": 0.00013614312301893223, + "loss": 0.7448, + "step": 752 + }, + { + "epoch": 0.4016, + "grad_norm": 0.42477641436317826, + "learning_rate": 0.0001359819460805001, + "loss": 0.7224, + "step": 753 + }, + { + "epoch": 0.40213333333333334, + "grad_norm": 0.5353153320960067, + "learning_rate": 0.00013582066169451535, + "loss": 0.8221, + "step": 754 + }, + { + "epoch": 0.4026666666666667, + "grad_norm": 0.5451172345800744, + "learning_rate": 0.0001356592703425976, + "loss": 0.8424, + "step": 755 + }, + { + "epoch": 0.4032, + "grad_norm": 0.5181668693402046, + "learning_rate": 0.0001354977725066859, + "loss": 0.7698, + "step": 756 + }, + { + "epoch": 0.40373333333333333, + "grad_norm": 0.4766116638861282, + "learning_rate": 0.00013533616866903735, + "loss": 0.7126, + "step": 757 + }, + { + "epoch": 0.40426666666666666, + "grad_norm": 0.44770221837631086, + "learning_rate": 0.0001351744593122255, + "loss": 0.6988, + "step": 758 + }, + { + "epoch": 0.4048, + "grad_norm": 0.504750260059945, + "learning_rate": 0.00013501264491913906, + "loss": 0.864, + "step": 759 + }, + { + "epoch": 0.4053333333333333, + "grad_norm": 0.6325774483574017, + "learning_rate": 0.00013485072597298038, + "loss": 0.8822, + "step": 760 + }, + { + "epoch": 0.40586666666666665, + "grad_norm": 0.4569329524560576, + "learning_rate": 0.00013468870295726398, + "loss": 0.6728, + "step": 761 + }, + { + "epoch": 0.4064, + "grad_norm": 0.5355129531529659, + "learning_rate": 0.0001345265763558152, + "loss": 0.8147, + "step": 762 + }, + { + "epoch": 0.4069333333333333, + "grad_norm": 0.4442749326372372, + "learning_rate": 0.00013436434665276865, + "loss": 0.6988, + "step": 763 + }, + { + "epoch": 0.40746666666666664, + "grad_norm": 0.45287896753395207, + "learning_rate": 0.00013420201433256689, + "loss": 0.8024, + "step": 764 + }, + { + "epoch": 0.408, + "grad_norm": 0.49870183332551243, + "learning_rate": 0.00013403957987995882, + "loss": 0.7681, + "step": 765 + }, + { + "epoch": 0.40853333333333336, + "grad_norm": 0.4669554709536968, + "learning_rate": 0.00013387704377999842, + "loss": 0.7716, + "step": 766 + }, + { + "epoch": 0.4090666666666667, + "grad_norm": 0.48479901942900394, + "learning_rate": 0.00013371440651804313, + "loss": 0.7534, + "step": 767 + }, + { + "epoch": 0.4096, + "grad_norm": 0.5730119832200375, + "learning_rate": 0.0001335516685797525, + "loss": 0.8598, + "step": 768 + }, + { + "epoch": 0.41013333333333335, + "grad_norm": 0.5076618420414802, + "learning_rate": 0.00013338883045108674, + "loss": 0.8088, + "step": 769 + }, + { + "epoch": 0.4106666666666667, + "grad_norm": 0.7184939384281536, + "learning_rate": 0.00013322589261830517, + "loss": 0.895, + "step": 770 + }, + { + "epoch": 0.4112, + "grad_norm": 0.4384030304467163, + "learning_rate": 0.00013306285556796495, + "loss": 0.7392, + "step": 771 + }, + { + "epoch": 0.41173333333333334, + "grad_norm": 0.4692553533275476, + "learning_rate": 0.0001328997197869194, + "loss": 0.7433, + "step": 772 + }, + { + "epoch": 0.41226666666666667, + "grad_norm": 0.4522213692729189, + "learning_rate": 0.0001327364857623168, + "loss": 0.7129, + "step": 773 + }, + { + "epoch": 0.4128, + "grad_norm": 0.43422551076221394, + "learning_rate": 0.00013257315398159864, + "loss": 0.7158, + "step": 774 + }, + { + "epoch": 0.41333333333333333, + "grad_norm": 0.5490870203496083, + "learning_rate": 0.00013240972493249847, + "loss": 0.7931, + "step": 775 + }, + { + "epoch": 0.41386666666666666, + "grad_norm": 0.6217571465441969, + "learning_rate": 0.0001322461991030402, + "loss": 0.892, + "step": 776 + }, + { + "epoch": 0.4144, + "grad_norm": 0.633769034792794, + "learning_rate": 0.00013208257698153677, + "loss": 0.86, + "step": 777 + }, + { + "epoch": 0.4149333333333333, + "grad_norm": 0.5025293751957768, + "learning_rate": 0.00013191885905658872, + "loss": 0.7794, + "step": 778 + }, + { + "epoch": 0.41546666666666665, + "grad_norm": 0.46658759424992413, + "learning_rate": 0.0001317550458170826, + "loss": 0.7948, + "step": 779 + }, + { + "epoch": 0.416, + "grad_norm": 0.5181717210611255, + "learning_rate": 0.00013159113775218964, + "loss": 0.8416, + "step": 780 + }, + { + "epoch": 0.4165333333333333, + "grad_norm": 0.49200410156120783, + "learning_rate": 0.00013142713535136414, + "loss": 0.7819, + "step": 781 + }, + { + "epoch": 0.41706666666666664, + "grad_norm": 0.4167994844684718, + "learning_rate": 0.00013126303910434214, + "loss": 0.7104, + "step": 782 + }, + { + "epoch": 0.4176, + "grad_norm": 0.4163699666315266, + "learning_rate": 0.00013109884950114007, + "loss": 0.7148, + "step": 783 + }, + { + "epoch": 0.41813333333333336, + "grad_norm": 0.4238355139305973, + "learning_rate": 0.00013093456703205288, + "loss": 0.6728, + "step": 784 + }, + { + "epoch": 0.4186666666666667, + "grad_norm": 0.511426303580633, + "learning_rate": 0.00013077019218765305, + "loss": 0.839, + "step": 785 + }, + { + "epoch": 0.4192, + "grad_norm": 0.5356515901046077, + "learning_rate": 0.00013060572545878875, + "loss": 0.7557, + "step": 786 + }, + { + "epoch": 0.41973333333333335, + "grad_norm": 0.6882185983313841, + "learning_rate": 0.0001304411673365826, + "loss": 0.8474, + "step": 787 + }, + { + "epoch": 0.4202666666666667, + "grad_norm": 0.6641428887881485, + "learning_rate": 0.0001302765183124302, + "loss": 0.9442, + "step": 788 + }, + { + "epoch": 0.4208, + "grad_norm": 0.5679268441219842, + "learning_rate": 0.00013011177887799845, + "loss": 0.7947, + "step": 789 + }, + { + "epoch": 0.42133333333333334, + "grad_norm": 0.44286158098750944, + "learning_rate": 0.00012994694952522435, + "loss": 0.7316, + "step": 790 + }, + { + "epoch": 0.42186666666666667, + "grad_norm": 0.4508969293267315, + "learning_rate": 0.00012978203074631334, + "loss": 0.7638, + "step": 791 + }, + { + "epoch": 0.4224, + "grad_norm": 0.5308334648457299, + "learning_rate": 0.00012961702303373795, + "loss": 0.7683, + "step": 792 + }, + { + "epoch": 0.42293333333333333, + "grad_norm": 0.5946803376428245, + "learning_rate": 0.00012945192688023624, + "loss": 0.929, + "step": 793 + }, + { + "epoch": 0.42346666666666666, + "grad_norm": 0.7207562732813872, + "learning_rate": 0.0001292867427788104, + "loss": 0.9165, + "step": 794 + }, + { + "epoch": 0.424, + "grad_norm": 0.4779980260077528, + "learning_rate": 0.00012912147122272523, + "loss": 0.7764, + "step": 795 + }, + { + "epoch": 0.4245333333333333, + "grad_norm": 0.4855169806953814, + "learning_rate": 0.00012895611270550666, + "loss": 0.7795, + "step": 796 + }, + { + "epoch": 0.42506666666666665, + "grad_norm": 0.4662692243441799, + "learning_rate": 0.0001287906677209403, + "loss": 0.7901, + "step": 797 + }, + { + "epoch": 0.4256, + "grad_norm": 0.40931315804008817, + "learning_rate": 0.00012862513676307008, + "loss": 0.7117, + "step": 798 + }, + { + "epoch": 0.4261333333333333, + "grad_norm": 0.4854966151390374, + "learning_rate": 0.0001284595203261965, + "loss": 0.7212, + "step": 799 + }, + { + "epoch": 0.4266666666666667, + "grad_norm": 0.5357969560679365, + "learning_rate": 0.00012829381890487536, + "loss": 0.7459, + "step": 800 + }, + { + "epoch": 0.4272, + "grad_norm": 0.519103528674126, + "learning_rate": 0.00012812803299391628, + "loss": 0.7753, + "step": 801 + }, + { + "epoch": 0.42773333333333335, + "grad_norm": 0.5791316915320031, + "learning_rate": 0.00012796216308838117, + "loss": 0.8571, + "step": 802 + }, + { + "epoch": 0.4282666666666667, + "grad_norm": 0.6084051457588875, + "learning_rate": 0.00012779620968358273, + "loss": 0.8512, + "step": 803 + }, + { + "epoch": 0.4288, + "grad_norm": 0.4841458991923718, + "learning_rate": 0.00012763017327508305, + "loss": 0.792, + "step": 804 + }, + { + "epoch": 0.42933333333333334, + "grad_norm": 0.6251617340588154, + "learning_rate": 0.00012746405435869198, + "loss": 0.8055, + "step": 805 + }, + { + "epoch": 0.4298666666666667, + "grad_norm": 0.3840708723496544, + "learning_rate": 0.00012729785343046588, + "loss": 0.7237, + "step": 806 + }, + { + "epoch": 0.4304, + "grad_norm": 0.610610369045068, + "learning_rate": 0.0001271315709867059, + "loss": 0.8078, + "step": 807 + }, + { + "epoch": 0.43093333333333333, + "grad_norm": 0.5101123044764597, + "learning_rate": 0.00012696520752395672, + "loss": 0.7977, + "step": 808 + }, + { + "epoch": 0.43146666666666667, + "grad_norm": 0.43049751229646416, + "learning_rate": 0.00012679876353900482, + "loss": 0.7101, + "step": 809 + }, + { + "epoch": 0.432, + "grad_norm": 0.6156941943170096, + "learning_rate": 0.00012663223952887723, + "loss": 0.876, + "step": 810 + }, + { + "epoch": 0.4325333333333333, + "grad_norm": 0.49964764586889204, + "learning_rate": 0.00012646563599083996, + "loss": 0.8106, + "step": 811 + }, + { + "epoch": 0.43306666666666666, + "grad_norm": 0.6425904840094377, + "learning_rate": 0.00012629895342239643, + "loss": 0.8873, + "step": 812 + }, + { + "epoch": 0.4336, + "grad_norm": 0.4594886918342748, + "learning_rate": 0.00012613219232128608, + "loss": 0.7891, + "step": 813 + }, + { + "epoch": 0.4341333333333333, + "grad_norm": 0.3715193579073943, + "learning_rate": 0.00012596535318548289, + "loss": 0.6497, + "step": 814 + }, + { + "epoch": 0.43466666666666665, + "grad_norm": 0.4969181835624368, + "learning_rate": 0.0001257984365131938, + "loss": 0.8602, + "step": 815 + }, + { + "epoch": 0.4352, + "grad_norm": 0.5112261662756453, + "learning_rate": 0.00012563144280285741, + "loss": 0.7946, + "step": 816 + }, + { + "epoch": 0.4357333333333333, + "grad_norm": 0.4404318603249988, + "learning_rate": 0.00012546437255314222, + "loss": 0.8057, + "step": 817 + }, + { + "epoch": 0.4362666666666667, + "grad_norm": 0.5074910199048389, + "learning_rate": 0.0001252972262629454, + "loss": 0.7261, + "step": 818 + }, + { + "epoch": 0.4368, + "grad_norm": 0.6579442937626416, + "learning_rate": 0.00012513000443139112, + "loss": 0.7994, + "step": 819 + }, + { + "epoch": 0.43733333333333335, + "grad_norm": 0.5269627351371972, + "learning_rate": 0.00012496270755782914, + "loss": 0.8513, + "step": 820 + }, + { + "epoch": 0.4378666666666667, + "grad_norm": 0.6410417420546307, + "learning_rate": 0.00012479533614183334, + "loss": 0.8727, + "step": 821 + }, + { + "epoch": 0.4384, + "grad_norm": 0.509013657441772, + "learning_rate": 0.00012462789068320017, + "loss": 0.7401, + "step": 822 + }, + { + "epoch": 0.43893333333333334, + "grad_norm": 0.605665680146341, + "learning_rate": 0.00012446037168194714, + "loss": 0.871, + "step": 823 + }, + { + "epoch": 0.43946666666666667, + "grad_norm": 0.7658208778704714, + "learning_rate": 0.00012429277963831148, + "loss": 0.878, + "step": 824 + }, + { + "epoch": 0.44, + "grad_norm": 0.5077593808819223, + "learning_rate": 0.00012412511505274844, + "loss": 0.7604, + "step": 825 + }, + { + "epoch": 0.44053333333333333, + "grad_norm": 0.6091180723940991, + "learning_rate": 0.00012395737842592995, + "loss": 0.9141, + "step": 826 + }, + { + "epoch": 0.44106666666666666, + "grad_norm": 0.5647979519746409, + "learning_rate": 0.000123789570258743, + "loss": 0.8112, + "step": 827 + }, + { + "epoch": 0.4416, + "grad_norm": 0.49231303109397256, + "learning_rate": 0.00012362169105228826, + "loss": 0.838, + "step": 828 + }, + { + "epoch": 0.4421333333333333, + "grad_norm": 0.507668649681542, + "learning_rate": 0.00012345374130787854, + "loss": 0.8796, + "step": 829 + }, + { + "epoch": 0.44266666666666665, + "grad_norm": 0.5270163965454614, + "learning_rate": 0.00012328572152703725, + "loss": 0.7963, + "step": 830 + }, + { + "epoch": 0.4432, + "grad_norm": 0.4607488335162496, + "learning_rate": 0.000123117632211497, + "loss": 0.7155, + "step": 831 + }, + { + "epoch": 0.4437333333333333, + "grad_norm": 0.5625901485250786, + "learning_rate": 0.00012294947386319794, + "loss": 0.77, + "step": 832 + }, + { + "epoch": 0.44426666666666664, + "grad_norm": 0.45832785535563436, + "learning_rate": 0.0001227812469842864, + "loss": 0.7626, + "step": 833 + }, + { + "epoch": 0.4448, + "grad_norm": 0.45676392241459746, + "learning_rate": 0.00012261295207711346, + "loss": 0.7891, + "step": 834 + }, + { + "epoch": 0.44533333333333336, + "grad_norm": 0.5331910631819192, + "learning_rate": 0.00012244458964423327, + "loss": 0.8101, + "step": 835 + }, + { + "epoch": 0.4458666666666667, + "grad_norm": 0.5336665803865309, + "learning_rate": 0.00012227616018840154, + "loss": 0.8476, + "step": 836 + }, + { + "epoch": 0.4464, + "grad_norm": 0.4725071119751479, + "learning_rate": 0.0001221076642125742, + "loss": 0.7257, + "step": 837 + }, + { + "epoch": 0.44693333333333335, + "grad_norm": 0.5485778876123043, + "learning_rate": 0.00012193910221990581, + "loss": 0.7574, + "step": 838 + }, + { + "epoch": 0.4474666666666667, + "grad_norm": 0.49722835793082243, + "learning_rate": 0.00012177047471374807, + "loss": 0.8108, + "step": 839 + }, + { + "epoch": 0.448, + "grad_norm": 0.47025306102710457, + "learning_rate": 0.00012160178219764837, + "loss": 0.8576, + "step": 840 + }, + { + "epoch": 0.44853333333333334, + "grad_norm": 0.5428983779819186, + "learning_rate": 0.0001214330251753481, + "loss": 0.8883, + "step": 841 + }, + { + "epoch": 0.44906666666666667, + "grad_norm": 0.4694803567020228, + "learning_rate": 0.00012126420415078132, + "loss": 0.7932, + "step": 842 + }, + { + "epoch": 0.4496, + "grad_norm": 0.48473876763436824, + "learning_rate": 0.00012109531962807332, + "loss": 0.8554, + "step": 843 + }, + { + "epoch": 0.45013333333333333, + "grad_norm": 0.47005570048715595, + "learning_rate": 0.00012092637211153885, + "loss": 0.7806, + "step": 844 + }, + { + "epoch": 0.45066666666666666, + "grad_norm": 0.5006290971264497, + "learning_rate": 0.0001207573621056809, + "loss": 0.7757, + "step": 845 + }, + { + "epoch": 0.4512, + "grad_norm": 0.6262326768214996, + "learning_rate": 0.00012058829011518896, + "loss": 0.8247, + "step": 846 + }, + { + "epoch": 0.4517333333333333, + "grad_norm": 0.6495934091606899, + "learning_rate": 0.00012041915664493761, + "loss": 0.9522, + "step": 847 + }, + { + "epoch": 0.45226666666666665, + "grad_norm": 0.3926168857682602, + "learning_rate": 0.00012024996219998517, + "loss": 0.7462, + "step": 848 + }, + { + "epoch": 0.4528, + "grad_norm": 0.45491320496713733, + "learning_rate": 0.00012008070728557186, + "loss": 0.7765, + "step": 849 + }, + { + "epoch": 0.4533333333333333, + "grad_norm": 0.4434547347861371, + "learning_rate": 0.00011991139240711857, + "loss": 0.766, + "step": 850 + }, + { + "epoch": 0.45386666666666664, + "grad_norm": 0.5861265159507776, + "learning_rate": 0.00011974201807022525, + "loss": 0.8013, + "step": 851 + }, + { + "epoch": 0.4544, + "grad_norm": 0.4550910918588998, + "learning_rate": 0.00011957258478066931, + "loss": 0.7629, + "step": 852 + }, + { + "epoch": 0.45493333333333336, + "grad_norm": 0.4797524978868209, + "learning_rate": 0.00011940309304440433, + "loss": 0.778, + "step": 853 + }, + { + "epoch": 0.4554666666666667, + "grad_norm": 0.5258462973897162, + "learning_rate": 0.00011923354336755835, + "loss": 0.8173, + "step": 854 + }, + { + "epoch": 0.456, + "grad_norm": 0.5479945980446406, + "learning_rate": 0.00011906393625643244, + "loss": 0.8253, + "step": 855 + }, + { + "epoch": 0.45653333333333335, + "grad_norm": 0.4568344411849772, + "learning_rate": 0.00011889427221749916, + "loss": 0.7206, + "step": 856 + }, + { + "epoch": 0.4570666666666667, + "grad_norm": 0.505548947892483, + "learning_rate": 0.00011872455175740112, + "loss": 0.7785, + "step": 857 + }, + { + "epoch": 0.4576, + "grad_norm": 0.48924710640761454, + "learning_rate": 0.00011855477538294935, + "loss": 0.7868, + "step": 858 + }, + { + "epoch": 0.45813333333333334, + "grad_norm": 0.4107100163640565, + "learning_rate": 0.00011838494360112185, + "loss": 0.7318, + "step": 859 + }, + { + "epoch": 0.45866666666666667, + "grad_norm": 0.5565731120612221, + "learning_rate": 0.00011821505691906216, + "loss": 0.8572, + "step": 860 + }, + { + "epoch": 0.4592, + "grad_norm": 0.4982784682798707, + "learning_rate": 0.00011804511584407763, + "loss": 0.769, + "step": 861 + }, + { + "epoch": 0.4597333333333333, + "grad_norm": 0.4995791829692604, + "learning_rate": 0.00011787512088363817, + "loss": 0.7514, + "step": 862 + }, + { + "epoch": 0.46026666666666666, + "grad_norm": 0.45805779859294643, + "learning_rate": 0.00011770507254537453, + "loss": 0.7062, + "step": 863 + }, + { + "epoch": 0.4608, + "grad_norm": 0.509454093532379, + "learning_rate": 0.00011753497133707679, + "loss": 0.8522, + "step": 864 + }, + { + "epoch": 0.4613333333333333, + "grad_norm": 0.42024159280118345, + "learning_rate": 0.00011736481776669306, + "loss": 0.7247, + "step": 865 + }, + { + "epoch": 0.46186666666666665, + "grad_norm": 0.39307172555639547, + "learning_rate": 0.00011719461234232764, + "loss": 0.7005, + "step": 866 + }, + { + "epoch": 0.4624, + "grad_norm": 0.42169313512842804, + "learning_rate": 0.00011702435557223987, + "loss": 0.7528, + "step": 867 + }, + { + "epoch": 0.4629333333333333, + "grad_norm": 0.5039975584791423, + "learning_rate": 0.00011685404796484225, + "loss": 0.7759, + "step": 868 + }, + { + "epoch": 0.4634666666666667, + "grad_norm": 0.40711579520247354, + "learning_rate": 0.00011668369002869912, + "loss": 0.6914, + "step": 869 + }, + { + "epoch": 0.464, + "grad_norm": 0.6028022279161257, + "learning_rate": 0.00011651328227252517, + "loss": 0.9179, + "step": 870 + }, + { + "epoch": 0.46453333333333335, + "grad_norm": 0.526392815479632, + "learning_rate": 0.00011634282520518383, + "loss": 0.8378, + "step": 871 + }, + { + "epoch": 0.4650666666666667, + "grad_norm": 0.5248041781489471, + "learning_rate": 0.00011617231933568578, + "loss": 0.8562, + "step": 872 + }, + { + "epoch": 0.4656, + "grad_norm": 0.42751189208185303, + "learning_rate": 0.00011600176517318741, + "loss": 0.7296, + "step": 873 + }, + { + "epoch": 0.46613333333333334, + "grad_norm": 0.512742744328725, + "learning_rate": 0.00011583116322698935, + "loss": 0.7847, + "step": 874 + }, + { + "epoch": 0.4666666666666667, + "grad_norm": 0.5611307796660479, + "learning_rate": 0.00011566051400653486, + "loss": 0.8123, + "step": 875 + }, + { + "epoch": 0.4672, + "grad_norm": 0.5575651974777656, + "learning_rate": 0.00011548981802140848, + "loss": 0.7401, + "step": 876 + }, + { + "epoch": 0.46773333333333333, + "grad_norm": 0.4723621023001122, + "learning_rate": 0.00011531907578133429, + "loss": 0.7066, + "step": 877 + }, + { + "epoch": 0.46826666666666666, + "grad_norm": 0.4575807168925933, + "learning_rate": 0.00011514828779617459, + "loss": 0.7333, + "step": 878 + }, + { + "epoch": 0.4688, + "grad_norm": 0.4651231651925861, + "learning_rate": 0.00011497745457592816, + "loss": 0.7569, + "step": 879 + }, + { + "epoch": 0.4693333333333333, + "grad_norm": 0.3966971100695475, + "learning_rate": 0.00011480657663072896, + "loss": 0.6847, + "step": 880 + }, + { + "epoch": 0.46986666666666665, + "grad_norm": 0.5161805818763336, + "learning_rate": 0.00011463565447084445, + "loss": 0.7304, + "step": 881 + }, + { + "epoch": 0.4704, + "grad_norm": 0.5663476833765066, + "learning_rate": 0.00011446468860667421, + "loss": 0.876, + "step": 882 + }, + { + "epoch": 0.4709333333333333, + "grad_norm": 0.46355268174699166, + "learning_rate": 0.00011429367954874819, + "loss": 0.7611, + "step": 883 + }, + { + "epoch": 0.47146666666666665, + "grad_norm": 0.5092064946900178, + "learning_rate": 0.0001141226278077254, + "loss": 0.7269, + "step": 884 + }, + { + "epoch": 0.472, + "grad_norm": 0.46970918456239436, + "learning_rate": 0.00011395153389439233, + "loss": 0.7396, + "step": 885 + }, + { + "epoch": 0.47253333333333336, + "grad_norm": 0.4557188143082749, + "learning_rate": 0.00011378039831966134, + "loss": 0.7341, + "step": 886 + }, + { + "epoch": 0.4730666666666667, + "grad_norm": 0.6647036642923956, + "learning_rate": 0.00011360922159456928, + "loss": 0.7488, + "step": 887 + }, + { + "epoch": 0.4736, + "grad_norm": 0.6765332341145955, + "learning_rate": 0.00011343800423027582, + "loss": 0.9396, + "step": 888 + }, + { + "epoch": 0.47413333333333335, + "grad_norm": 0.5175862474749728, + "learning_rate": 0.00011326674673806195, + "loss": 0.7783, + "step": 889 + }, + { + "epoch": 0.4746666666666667, + "grad_norm": 0.436284207863048, + "learning_rate": 0.00011309544962932862, + "loss": 0.7639, + "step": 890 + }, + { + "epoch": 0.4752, + "grad_norm": 0.4686215143619508, + "learning_rate": 0.0001129241134155949, + "loss": 0.7497, + "step": 891 + }, + { + "epoch": 0.47573333333333334, + "grad_norm": 0.44740547705866324, + "learning_rate": 0.00011275273860849684, + "loss": 0.6466, + "step": 892 + }, + { + "epoch": 0.47626666666666667, + "grad_norm": 0.5501474923213852, + "learning_rate": 0.00011258132571978555, + "loss": 0.8357, + "step": 893 + }, + { + "epoch": 0.4768, + "grad_norm": 0.4729206846885701, + "learning_rate": 0.00011240987526132594, + "loss": 0.7393, + "step": 894 + }, + { + "epoch": 0.47733333333333333, + "grad_norm": 0.6172459831714341, + "learning_rate": 0.00011223838774509514, + "loss": 0.8347, + "step": 895 + }, + { + "epoch": 0.47786666666666666, + "grad_norm": 0.47854416285741824, + "learning_rate": 0.00011206686368318086, + "loss": 0.7491, + "step": 896 + }, + { + "epoch": 0.4784, + "grad_norm": 0.4388952320442764, + "learning_rate": 0.00011189530358778005, + "loss": 0.7897, + "step": 897 + }, + { + "epoch": 0.4789333333333333, + "grad_norm": 0.5456853460509431, + "learning_rate": 0.00011172370797119712, + "loss": 0.8387, + "step": 898 + }, + { + "epoch": 0.47946666666666665, + "grad_norm": 0.48268009519768473, + "learning_rate": 0.00011155207734584263, + "loss": 0.8265, + "step": 899 + }, + { + "epoch": 0.48, + "grad_norm": 0.43680690221110663, + "learning_rate": 0.00011138041222423177, + "loss": 0.72, + "step": 900 + }, + { + "epoch": 0.4805333333333333, + "grad_norm": 0.49961382847292923, + "learning_rate": 0.00011120871311898254, + "loss": 0.7646, + "step": 901 + }, + { + "epoch": 0.48106666666666664, + "grad_norm": 0.5450142519081406, + "learning_rate": 0.0001110369805428146, + "loss": 0.8989, + "step": 902 + }, + { + "epoch": 0.4816, + "grad_norm": 0.622389521380628, + "learning_rate": 0.00011086521500854745, + "loss": 0.8413, + "step": 903 + }, + { + "epoch": 0.48213333333333336, + "grad_norm": 0.47369590455789173, + "learning_rate": 0.0001106934170290991, + "loss": 0.813, + "step": 904 + }, + { + "epoch": 0.4826666666666667, + "grad_norm": 0.46356194145737945, + "learning_rate": 0.00011052158711748434, + "loss": 0.7059, + "step": 905 + }, + { + "epoch": 0.4832, + "grad_norm": 0.44236433205679154, + "learning_rate": 0.00011034972578681338, + "loss": 0.8015, + "step": 906 + }, + { + "epoch": 0.48373333333333335, + "grad_norm": 0.8646622375431554, + "learning_rate": 0.00011017783355029026, + "loss": 1.0327, + "step": 907 + }, + { + "epoch": 0.4842666666666667, + "grad_norm": 0.42402134760906085, + "learning_rate": 0.00011000591092121127, + "loss": 0.7366, + "step": 908 + }, + { + "epoch": 0.4848, + "grad_norm": 0.48443938796841723, + "learning_rate": 0.00010983395841296348, + "loss": 0.7556, + "step": 909 + }, + { + "epoch": 0.48533333333333334, + "grad_norm": 0.46558447549359927, + "learning_rate": 0.0001096619765390232, + "loss": 0.783, + "step": 910 + }, + { + "epoch": 0.48586666666666667, + "grad_norm": 0.5132249584302769, + "learning_rate": 0.00010948996581295436, + "loss": 0.6438, + "step": 911 + }, + { + "epoch": 0.4864, + "grad_norm": 0.5177124844667969, + "learning_rate": 0.00010931792674840718, + "loss": 0.8248, + "step": 912 + }, + { + "epoch": 0.48693333333333333, + "grad_norm": 0.43916411940111544, + "learning_rate": 0.00010914585985911632, + "loss": 0.7437, + "step": 913 + }, + { + "epoch": 0.48746666666666666, + "grad_norm": 0.6446654464232441, + "learning_rate": 0.00010897376565889971, + "loss": 0.7377, + "step": 914 + }, + { + "epoch": 0.488, + "grad_norm": 0.5476434595463063, + "learning_rate": 0.00010880164466165674, + "loss": 0.8251, + "step": 915 + }, + { + "epoch": 0.4885333333333333, + "grad_norm": 0.5544217543377928, + "learning_rate": 0.00010862949738136681, + "loss": 0.828, + "step": 916 + }, + { + "epoch": 0.48906666666666665, + "grad_norm": 0.4863232958573473, + "learning_rate": 0.00010845732433208779, + "loss": 0.7145, + "step": 917 + }, + { + "epoch": 0.4896, + "grad_norm": 0.5264052654515747, + "learning_rate": 0.00010828512602795462, + "loss": 0.9068, + "step": 918 + }, + { + "epoch": 0.4901333333333333, + "grad_norm": 0.405147594001005, + "learning_rate": 0.00010811290298317755, + "loss": 0.7008, + "step": 919 + }, + { + "epoch": 0.49066666666666664, + "grad_norm": 0.5847775056895158, + "learning_rate": 0.00010794065571204072, + "loss": 0.8319, + "step": 920 + }, + { + "epoch": 0.4912, + "grad_norm": 0.3871865423396232, + "learning_rate": 0.00010776838472890065, + "loss": 0.7312, + "step": 921 + }, + { + "epoch": 0.49173333333333336, + "grad_norm": 0.6457922270370335, + "learning_rate": 0.00010759609054818458, + "loss": 0.8671, + "step": 922 + }, + { + "epoch": 0.4922666666666667, + "grad_norm": 0.7058579473633974, + "learning_rate": 0.00010742377368438914, + "loss": 0.8271, + "step": 923 + }, + { + "epoch": 0.4928, + "grad_norm": 0.6236517778840219, + "learning_rate": 0.00010725143465207867, + "loss": 0.856, + "step": 924 + }, + { + "epoch": 0.49333333333333335, + "grad_norm": 0.5573009691516873, + "learning_rate": 0.00010707907396588361, + "loss": 0.7543, + "step": 925 + }, + { + "epoch": 0.4938666666666667, + "grad_norm": 0.49780016357808454, + "learning_rate": 0.0001069066921404992, + "loss": 0.7296, + "step": 926 + }, + { + "epoch": 0.4944, + "grad_norm": 0.5077651372406793, + "learning_rate": 0.00010673428969068364, + "loss": 0.7598, + "step": 927 + }, + { + "epoch": 0.49493333333333334, + "grad_norm": 0.5880793316898241, + "learning_rate": 0.00010656186713125689, + "loss": 0.8521, + "step": 928 + }, + { + "epoch": 0.49546666666666667, + "grad_norm": 0.4501858915478432, + "learning_rate": 0.0001063894249770989, + "loss": 0.7065, + "step": 929 + }, + { + "epoch": 0.496, + "grad_norm": 0.4525662342544168, + "learning_rate": 0.00010621696374314807, + "loss": 0.7283, + "step": 930 + }, + { + "epoch": 0.4965333333333333, + "grad_norm": 0.46356536316788016, + "learning_rate": 0.00010604448394439983, + "loss": 0.711, + "step": 931 + }, + { + "epoch": 0.49706666666666666, + "grad_norm": 0.518028225576915, + "learning_rate": 0.00010587198609590505, + "loss": 0.6937, + "step": 932 + }, + { + "epoch": 0.4976, + "grad_norm": 0.49351490979787055, + "learning_rate": 0.00010569947071276847, + "loss": 0.7069, + "step": 933 + }, + { + "epoch": 0.4981333333333333, + "grad_norm": 0.5228500242330347, + "learning_rate": 0.00010552693831014726, + "loss": 0.7455, + "step": 934 + }, + { + "epoch": 0.49866666666666665, + "grad_norm": 0.5719893983924824, + "learning_rate": 0.0001053543894032493, + "loss": 0.9416, + "step": 935 + }, + { + "epoch": 0.4992, + "grad_norm": 0.5860034231810596, + "learning_rate": 0.00010518182450733186, + "loss": 0.8477, + "step": 936 + }, + { + "epoch": 0.4997333333333333, + "grad_norm": 0.4261071701248609, + "learning_rate": 0.00010500924413769988, + "loss": 0.7264, + "step": 937 + }, + { + "epoch": 0.5002666666666666, + "grad_norm": 0.46650246057627515, + "learning_rate": 0.00010483664880970457, + "loss": 0.7839, + "step": 938 + }, + { + "epoch": 0.5008, + "grad_norm": 0.4904731107816936, + "learning_rate": 0.00010466403903874176, + "loss": 0.7921, + "step": 939 + }, + { + "epoch": 0.5013333333333333, + "grad_norm": 0.4106913663496465, + "learning_rate": 0.00010449141534025045, + "loss": 0.7229, + "step": 940 + }, + { + "epoch": 0.5018666666666667, + "grad_norm": 0.4630754228578689, + "learning_rate": 0.00010431877822971117, + "loss": 0.8031, + "step": 941 + }, + { + "epoch": 0.5024, + "grad_norm": 0.669350177875791, + "learning_rate": 0.00010414612822264455, + "loss": 0.8647, + "step": 942 + }, + { + "epoch": 0.5029333333333333, + "grad_norm": 0.550665101899265, + "learning_rate": 0.00010397346583460971, + "loss": 0.7562, + "step": 943 + }, + { + "epoch": 0.5034666666666666, + "grad_norm": 0.5337377164529131, + "learning_rate": 0.0001038007915812028, + "loss": 0.8166, + "step": 944 + }, + { + "epoch": 0.504, + "grad_norm": 0.5369097933800977, + "learning_rate": 0.00010362810597805526, + "loss": 0.8819, + "step": 945 + }, + { + "epoch": 0.5045333333333333, + "grad_norm": 0.5726516016893167, + "learning_rate": 0.0001034554095408326, + "loss": 0.7766, + "step": 946 + }, + { + "epoch": 0.5050666666666667, + "grad_norm": 0.5174007996748117, + "learning_rate": 0.00010328270278523256, + "loss": 0.7886, + "step": 947 + }, + { + "epoch": 0.5056, + "grad_norm": 0.5108075215013611, + "learning_rate": 0.0001031099862269837, + "loss": 0.7897, + "step": 948 + }, + { + "epoch": 0.5061333333333333, + "grad_norm": 0.4952947355198917, + "learning_rate": 0.00010293726038184393, + "loss": 0.7651, + "step": 949 + }, + { + "epoch": 0.5066666666666667, + "grad_norm": 0.3929843911174011, + "learning_rate": 0.00010276452576559879, + "loss": 0.6889, + "step": 950 + }, + { + "epoch": 0.5072, + "grad_norm": 0.5123541093374682, + "learning_rate": 0.00010259178289406011, + "loss": 0.7496, + "step": 951 + }, + { + "epoch": 0.5077333333333334, + "grad_norm": 0.7818822625689277, + "learning_rate": 0.00010241903228306431, + "loss": 0.8296, + "step": 952 + }, + { + "epoch": 0.5082666666666666, + "grad_norm": 0.5186603963496055, + "learning_rate": 0.0001022462744484709, + "loss": 0.7966, + "step": 953 + }, + { + "epoch": 0.5088, + "grad_norm": 0.48888205408805113, + "learning_rate": 0.00010207350990616107, + "loss": 0.7337, + "step": 954 + }, + { + "epoch": 0.5093333333333333, + "grad_norm": 0.4230633658945411, + "learning_rate": 0.00010190073917203589, + "loss": 0.7062, + "step": 955 + }, + { + "epoch": 0.5098666666666667, + "grad_norm": 0.5203464661211605, + "learning_rate": 0.00010172796276201503, + "loss": 0.7566, + "step": 956 + }, + { + "epoch": 0.5104, + "grad_norm": 0.48279774075271586, + "learning_rate": 0.0001015551811920351, + "loss": 0.8452, + "step": 957 + }, + { + "epoch": 0.5109333333333334, + "grad_norm": 0.45710951243634307, + "learning_rate": 0.00010138239497804804, + "loss": 0.6955, + "step": 958 + }, + { + "epoch": 0.5114666666666666, + "grad_norm": 0.5619796545503565, + "learning_rate": 0.00010120960463601976, + "loss": 0.7901, + "step": 959 + }, + { + "epoch": 0.512, + "grad_norm": 0.48951536650972494, + "learning_rate": 0.00010103681068192845, + "loss": 0.7348, + "step": 960 + }, + { + "epoch": 0.5125333333333333, + "grad_norm": 0.47600503644883513, + "learning_rate": 0.00010086401363176305, + "loss": 0.7258, + "step": 961 + }, + { + "epoch": 0.5130666666666667, + "grad_norm": 0.4847148591163568, + "learning_rate": 0.00010069121400152181, + "loss": 0.7551, + "step": 962 + }, + { + "epoch": 0.5136, + "grad_norm": 0.5372752457232837, + "learning_rate": 0.00010051841230721065, + "loss": 0.8417, + "step": 963 + }, + { + "epoch": 0.5141333333333333, + "grad_norm": 0.5872679932756703, + "learning_rate": 0.0001003456090648416, + "loss": 0.8513, + "step": 964 + }, + { + "epoch": 0.5146666666666667, + "grad_norm": 0.49083770690518475, + "learning_rate": 0.00010017280479043147, + "loss": 0.7121, + "step": 965 + }, + { + "epoch": 0.5152, + "grad_norm": 0.4659127202746154, + "learning_rate": 0.0001, + "loss": 0.755, + "step": 966 + }, + { + "epoch": 0.5157333333333334, + "grad_norm": 0.48699175107640585, + "learning_rate": 9.982719520956855e-05, + "loss": 0.8061, + "step": 967 + }, + { + "epoch": 0.5162666666666667, + "grad_norm": 0.45843027668721104, + "learning_rate": 9.965439093515841e-05, + "loss": 0.6362, + "step": 968 + }, + { + "epoch": 0.5168, + "grad_norm": 0.5333429546353413, + "learning_rate": 9.948158769278939e-05, + "loss": 0.7565, + "step": 969 + }, + { + "epoch": 0.5173333333333333, + "grad_norm": 0.43853763581027966, + "learning_rate": 9.930878599847821e-05, + "loss": 0.754, + "step": 970 + }, + { + "epoch": 0.5178666666666667, + "grad_norm": 0.5266600942678731, + "learning_rate": 9.913598636823693e-05, + "loss": 0.7966, + "step": 971 + }, + { + "epoch": 0.5184, + "grad_norm": 0.42561735307159076, + "learning_rate": 9.896318931807155e-05, + "loss": 0.7163, + "step": 972 + }, + { + "epoch": 0.5189333333333334, + "grad_norm": 0.449037557482995, + "learning_rate": 9.879039536398024e-05, + "loss": 0.7291, + "step": 973 + }, + { + "epoch": 0.5194666666666666, + "grad_norm": 0.42693571312985196, + "learning_rate": 9.861760502195197e-05, + "loss": 0.7571, + "step": 974 + }, + { + "epoch": 0.52, + "grad_norm": 0.6057037416060455, + "learning_rate": 9.844481880796491e-05, + "loss": 0.8379, + "step": 975 + }, + { + "epoch": 0.5205333333333333, + "grad_norm": 0.38959454129722154, + "learning_rate": 9.827203723798498e-05, + "loss": 0.7104, + "step": 976 + }, + { + "epoch": 0.5210666666666667, + "grad_norm": 0.4412624315931409, + "learning_rate": 9.809926082796415e-05, + "loss": 0.7196, + "step": 977 + }, + { + "epoch": 0.5216, + "grad_norm": 0.6146574697091995, + "learning_rate": 9.792649009383899e-05, + "loss": 0.8773, + "step": 978 + }, + { + "epoch": 0.5221333333333333, + "grad_norm": 0.4521603647619686, + "learning_rate": 9.775372555152912e-05, + "loss": 0.7239, + "step": 979 + }, + { + "epoch": 0.5226666666666666, + "grad_norm": 0.6000515937257633, + "learning_rate": 9.758096771693573e-05, + "loss": 0.9331, + "step": 980 + }, + { + "epoch": 0.5232, + "grad_norm": 0.48263637707963297, + "learning_rate": 9.740821710593989e-05, + "loss": 0.7464, + "step": 981 + }, + { + "epoch": 0.5237333333333334, + "grad_norm": 0.5369406336442489, + "learning_rate": 9.723547423440122e-05, + "loss": 0.8192, + "step": 982 + }, + { + "epoch": 0.5242666666666667, + "grad_norm": 0.5734133799045289, + "learning_rate": 9.70627396181561e-05, + "loss": 0.8167, + "step": 983 + }, + { + "epoch": 0.5248, + "grad_norm": 0.5157323536339183, + "learning_rate": 9.689001377301633e-05, + "loss": 0.7275, + "step": 984 + }, + { + "epoch": 0.5253333333333333, + "grad_norm": 0.4320454187671807, + "learning_rate": 9.671729721476746e-05, + "loss": 0.6858, + "step": 985 + }, + { + "epoch": 0.5258666666666667, + "grad_norm": 0.4362601796684237, + "learning_rate": 9.654459045916743e-05, + "loss": 0.7336, + "step": 986 + }, + { + "epoch": 0.5264, + "grad_norm": 0.43655010339443623, + "learning_rate": 9.637189402194476e-05, + "loss": 0.7057, + "step": 987 + }, + { + "epoch": 0.5269333333333334, + "grad_norm": 0.565766930269078, + "learning_rate": 9.619920841879725e-05, + "loss": 0.8337, + "step": 988 + }, + { + "epoch": 0.5274666666666666, + "grad_norm": 0.4846368722573889, + "learning_rate": 9.602653416539031e-05, + "loss": 0.7062, + "step": 989 + }, + { + "epoch": 0.528, + "grad_norm": 0.48713923130411746, + "learning_rate": 9.585387177735547e-05, + "loss": 0.7796, + "step": 990 + }, + { + "epoch": 0.5285333333333333, + "grad_norm": 0.6359416804035497, + "learning_rate": 9.568122177028884e-05, + "loss": 0.8497, + "step": 991 + }, + { + "epoch": 0.5290666666666667, + "grad_norm": 0.5866104931830324, + "learning_rate": 9.550858465974958e-05, + "loss": 0.8544, + "step": 992 + }, + { + "epoch": 0.5296, + "grad_norm": 0.6620979616144008, + "learning_rate": 9.533596096125825e-05, + "loss": 0.9016, + "step": 993 + }, + { + "epoch": 0.5301333333333333, + "grad_norm": 0.5382906566763935, + "learning_rate": 9.516335119029546e-05, + "loss": 0.8, + "step": 994 + }, + { + "epoch": 0.5306666666666666, + "grad_norm": 0.5147446979701066, + "learning_rate": 9.499075586230013e-05, + "loss": 0.8645, + "step": 995 + }, + { + "epoch": 0.5312, + "grad_norm": 0.4869675410544784, + "learning_rate": 9.481817549266817e-05, + "loss": 0.7537, + "step": 996 + }, + { + "epoch": 0.5317333333333333, + "grad_norm": 0.5085385693138902, + "learning_rate": 9.464561059675073e-05, + "loss": 0.7488, + "step": 997 + }, + { + "epoch": 0.5322666666666667, + "grad_norm": 0.466320279935278, + "learning_rate": 9.44730616898528e-05, + "loss": 0.7034, + "step": 998 + }, + { + "epoch": 0.5328, + "grad_norm": 0.6619554748109024, + "learning_rate": 9.430052928723153e-05, + "loss": 0.9005, + "step": 999 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 0.42704129186922574, + "learning_rate": 9.412801390409497e-05, + "loss": 0.6908, + "step": 1000 + }, + { + "epoch": 0.5338666666666667, + "grad_norm": 0.6515567080007995, + "learning_rate": 9.395551605560018e-05, + "loss": 0.8798, + "step": 1001 + }, + { + "epoch": 0.5344, + "grad_norm": 0.639067034187619, + "learning_rate": 9.378303625685195e-05, + "loss": 0.8557, + "step": 1002 + }, + { + "epoch": 0.5349333333333334, + "grad_norm": 0.46267469343545414, + "learning_rate": 9.361057502290113e-05, + "loss": 0.7318, + "step": 1003 + }, + { + "epoch": 0.5354666666666666, + "grad_norm": 0.44148492385870847, + "learning_rate": 9.343813286874312e-05, + "loss": 0.7608, + "step": 1004 + }, + { + "epoch": 0.536, + "grad_norm": 0.42488598217866247, + "learning_rate": 9.326571030931637e-05, + "loss": 0.6942, + "step": 1005 + }, + { + "epoch": 0.5365333333333333, + "grad_norm": 0.5400187523020266, + "learning_rate": 9.309330785950086e-05, + "loss": 0.8963, + "step": 1006 + }, + { + "epoch": 0.5370666666666667, + "grad_norm": 0.4033778484114782, + "learning_rate": 9.292092603411641e-05, + "loss": 0.5889, + "step": 1007 + }, + { + "epoch": 0.5376, + "grad_norm": 0.6648280630297925, + "learning_rate": 9.274856534792138e-05, + "loss": 0.8801, + "step": 1008 + }, + { + "epoch": 0.5381333333333334, + "grad_norm": 0.4158160219681407, + "learning_rate": 9.257622631561085e-05, + "loss": 0.7063, + "step": 1009 + }, + { + "epoch": 0.5386666666666666, + "grad_norm": 0.4340034267388822, + "learning_rate": 9.240390945181543e-05, + "loss": 0.6817, + "step": 1010 + }, + { + "epoch": 0.5392, + "grad_norm": 0.6060790801817617, + "learning_rate": 9.223161527109937e-05, + "loss": 0.8711, + "step": 1011 + }, + { + "epoch": 0.5397333333333333, + "grad_norm": 0.49214997795697046, + "learning_rate": 9.205934428795929e-05, + "loss": 0.7479, + "step": 1012 + }, + { + "epoch": 0.5402666666666667, + "grad_norm": 0.4554214767887092, + "learning_rate": 9.188709701682247e-05, + "loss": 0.7654, + "step": 1013 + }, + { + "epoch": 0.5408, + "grad_norm": 0.4834491284888872, + "learning_rate": 9.171487397204539e-05, + "loss": 0.7904, + "step": 1014 + }, + { + "epoch": 0.5413333333333333, + "grad_norm": 0.5120511712619266, + "learning_rate": 9.154267566791223e-05, + "loss": 0.7684, + "step": 1015 + }, + { + "epoch": 0.5418666666666667, + "grad_norm": 0.6515054587877325, + "learning_rate": 9.137050261863324e-05, + "loss": 0.8495, + "step": 1016 + }, + { + "epoch": 0.5424, + "grad_norm": 0.49541283087197946, + "learning_rate": 9.119835533834331e-05, + "loss": 0.7174, + "step": 1017 + }, + { + "epoch": 0.5429333333333334, + "grad_norm": 0.5203766008620797, + "learning_rate": 9.102623434110028e-05, + "loss": 0.7405, + "step": 1018 + }, + { + "epoch": 0.5434666666666667, + "grad_norm": 0.5692575546824659, + "learning_rate": 9.085414014088369e-05, + "loss": 0.6862, + "step": 1019 + }, + { + "epoch": 0.544, + "grad_norm": 0.47628058661436834, + "learning_rate": 9.068207325159284e-05, + "loss": 0.8073, + "step": 1020 + }, + { + "epoch": 0.5445333333333333, + "grad_norm": 0.4742278003831475, + "learning_rate": 9.051003418704565e-05, + "loss": 0.7852, + "step": 1021 + }, + { + "epoch": 0.5450666666666667, + "grad_norm": 0.5541467054786648, + "learning_rate": 9.033802346097682e-05, + "loss": 0.807, + "step": 1022 + }, + { + "epoch": 0.5456, + "grad_norm": 0.48691385545687804, + "learning_rate": 9.016604158703654e-05, + "loss": 0.7268, + "step": 1023 + }, + { + "epoch": 0.5461333333333334, + "grad_norm": 0.46472962020380765, + "learning_rate": 8.999408907878877e-05, + "loss": 0.6764, + "step": 1024 + }, + { + "epoch": 0.5466666666666666, + "grad_norm": 0.5500366067206548, + "learning_rate": 8.982216644970979e-05, + "loss": 0.7323, + "step": 1025 + }, + { + "epoch": 0.5472, + "grad_norm": 0.5108050694391268, + "learning_rate": 8.965027421318665e-05, + "loss": 0.8426, + "step": 1026 + }, + { + "epoch": 0.5477333333333333, + "grad_norm": 0.604186928096418, + "learning_rate": 8.947841288251568e-05, + "loss": 0.8281, + "step": 1027 + }, + { + "epoch": 0.5482666666666667, + "grad_norm": 0.4795065894056041, + "learning_rate": 8.930658297090091e-05, + "loss": 0.769, + "step": 1028 + }, + { + "epoch": 0.5488, + "grad_norm": 0.6189202287040847, + "learning_rate": 8.913478499145254e-05, + "loss": 0.7709, + "step": 1029 + }, + { + "epoch": 0.5493333333333333, + "grad_norm": 0.5380527668118427, + "learning_rate": 8.896301945718541e-05, + "loss": 0.7603, + "step": 1030 + }, + { + "epoch": 0.5498666666666666, + "grad_norm": 0.4115800576976164, + "learning_rate": 8.879128688101749e-05, + "loss": 0.7383, + "step": 1031 + }, + { + "epoch": 0.5504, + "grad_norm": 0.4693879302110707, + "learning_rate": 8.861958777576827e-05, + "loss": 0.7566, + "step": 1032 + }, + { + "epoch": 0.5509333333333334, + "grad_norm": 0.6167483217002174, + "learning_rate": 8.844792265415738e-05, + "loss": 0.8895, + "step": 1033 + }, + { + "epoch": 0.5514666666666667, + "grad_norm": 0.5125593007986087, + "learning_rate": 8.827629202880293e-05, + "loss": 0.748, + "step": 1034 + }, + { + "epoch": 0.552, + "grad_norm": 0.5070721336715346, + "learning_rate": 8.810469641222001e-05, + "loss": 0.6403, + "step": 1035 + }, + { + "epoch": 0.5525333333333333, + "grad_norm": 0.6555424217817568, + "learning_rate": 8.793313631681915e-05, + "loss": 0.8753, + "step": 1036 + }, + { + "epoch": 0.5530666666666667, + "grad_norm": 0.4700363265708347, + "learning_rate": 8.776161225490489e-05, + "loss": 0.7608, + "step": 1037 + }, + { + "epoch": 0.5536, + "grad_norm": 0.45530246840219085, + "learning_rate": 8.759012473867407e-05, + "loss": 0.7679, + "step": 1038 + }, + { + "epoch": 0.5541333333333334, + "grad_norm": 0.5769680891344217, + "learning_rate": 8.741867428021446e-05, + "loss": 0.9385, + "step": 1039 + }, + { + "epoch": 0.5546666666666666, + "grad_norm": 0.5213522658748875, + "learning_rate": 8.724726139150318e-05, + "loss": 0.7074, + "step": 1040 + }, + { + "epoch": 0.5552, + "grad_norm": 0.5430504538099926, + "learning_rate": 8.707588658440511e-05, + "loss": 0.8272, + "step": 1041 + }, + { + "epoch": 0.5557333333333333, + "grad_norm": 0.5711565429390287, + "learning_rate": 8.690455037067141e-05, + "loss": 0.8051, + "step": 1042 + }, + { + "epoch": 0.5562666666666667, + "grad_norm": 0.5134932074468219, + "learning_rate": 8.673325326193806e-05, + "loss": 0.8268, + "step": 1043 + }, + { + "epoch": 0.5568, + "grad_norm": 0.5285646384272197, + "learning_rate": 8.656199576972423e-05, + "loss": 0.8118, + "step": 1044 + }, + { + "epoch": 0.5573333333333333, + "grad_norm": 0.6025455434008704, + "learning_rate": 8.639077840543077e-05, + "loss": 0.7901, + "step": 1045 + }, + { + "epoch": 0.5578666666666666, + "grad_norm": 0.4612698316993332, + "learning_rate": 8.621960168033867e-05, + "loss": 0.7982, + "step": 1046 + }, + { + "epoch": 0.5584, + "grad_norm": 0.5369786917678391, + "learning_rate": 8.604846610560771e-05, + "loss": 0.8198, + "step": 1047 + }, + { + "epoch": 0.5589333333333333, + "grad_norm": 0.47531988042191753, + "learning_rate": 8.587737219227462e-05, + "loss": 0.7494, + "step": 1048 + }, + { + "epoch": 0.5594666666666667, + "grad_norm": 0.4473668127784352, + "learning_rate": 8.570632045125185e-05, + "loss": 0.6847, + "step": 1049 + }, + { + "epoch": 0.56, + "grad_norm": 0.6211559575041504, + "learning_rate": 8.553531139332582e-05, + "loss": 0.7318, + "step": 1050 + }, + { + "epoch": 0.5605333333333333, + "grad_norm": 0.43328372044700475, + "learning_rate": 8.536434552915556e-05, + "loss": 0.6811, + "step": 1051 + }, + { + "epoch": 0.5610666666666667, + "grad_norm": 0.47205587916228653, + "learning_rate": 8.519342336927105e-05, + "loss": 0.8094, + "step": 1052 + }, + { + "epoch": 0.5616, + "grad_norm": 0.4694639479325875, + "learning_rate": 8.502254542407186e-05, + "loss": 0.7448, + "step": 1053 + }, + { + "epoch": 0.5621333333333334, + "grad_norm": 0.47735306311845915, + "learning_rate": 8.485171220382545e-05, + "loss": 0.7255, + "step": 1054 + }, + { + "epoch": 0.5626666666666666, + "grad_norm": 0.4744457136560929, + "learning_rate": 8.468092421866573e-05, + "loss": 0.7868, + "step": 1055 + }, + { + "epoch": 0.5632, + "grad_norm": 0.4500414979056848, + "learning_rate": 8.451018197859153e-05, + "loss": 0.7213, + "step": 1056 + }, + { + "epoch": 0.5637333333333333, + "grad_norm": 0.5390310724367505, + "learning_rate": 8.433948599346516e-05, + "loss": 0.8223, + "step": 1057 + }, + { + "epoch": 0.5642666666666667, + "grad_norm": 0.4778521611194694, + "learning_rate": 8.416883677301069e-05, + "loss": 0.8438, + "step": 1058 + }, + { + "epoch": 0.5648, + "grad_norm": 0.640710731690861, + "learning_rate": 8.399823482681262e-05, + "loss": 0.9087, + "step": 1059 + }, + { + "epoch": 0.5653333333333334, + "grad_norm": 0.5073079590848001, + "learning_rate": 8.382768066431425e-05, + "loss": 0.8257, + "step": 1060 + }, + { + "epoch": 0.5658666666666666, + "grad_norm": 0.5713390277685219, + "learning_rate": 8.36571747948162e-05, + "loss": 0.7989, + "step": 1061 + }, + { + "epoch": 0.5664, + "grad_norm": 0.4996593533508955, + "learning_rate": 8.348671772747487e-05, + "loss": 0.7331, + "step": 1062 + }, + { + "epoch": 0.5669333333333333, + "grad_norm": 0.4621108429117102, + "learning_rate": 8.33163099713009e-05, + "loss": 0.7546, + "step": 1063 + }, + { + "epoch": 0.5674666666666667, + "grad_norm": 0.6679531151631573, + "learning_rate": 8.31459520351578e-05, + "loss": 0.8472, + "step": 1064 + }, + { + "epoch": 0.568, + "grad_norm": 0.6668175327591062, + "learning_rate": 8.297564442776014e-05, + "loss": 0.8815, + "step": 1065 + }, + { + "epoch": 0.5685333333333333, + "grad_norm": 0.5663129656097874, + "learning_rate": 8.280538765767235e-05, + "loss": 0.7657, + "step": 1066 + }, + { + "epoch": 0.5690666666666667, + "grad_norm": 0.5685556021286235, + "learning_rate": 8.263518223330697e-05, + "loss": 0.8798, + "step": 1067 + }, + { + "epoch": 0.5696, + "grad_norm": 0.42441445817147244, + "learning_rate": 8.246502866292324e-05, + "loss": 0.655, + "step": 1068 + }, + { + "epoch": 0.5701333333333334, + "grad_norm": 0.5685652190537315, + "learning_rate": 8.22949274546255e-05, + "loss": 0.8485, + "step": 1069 + }, + { + "epoch": 0.5706666666666667, + "grad_norm": 0.5624917087659866, + "learning_rate": 8.212487911636184e-05, + "loss": 0.8148, + "step": 1070 + }, + { + "epoch": 0.5712, + "grad_norm": 0.3846401553503173, + "learning_rate": 8.195488415592238e-05, + "loss": 0.6862, + "step": 1071 + }, + { + "epoch": 0.5717333333333333, + "grad_norm": 0.5743668213994452, + "learning_rate": 8.178494308093789e-05, + "loss": 0.8873, + "step": 1072 + }, + { + "epoch": 0.5722666666666667, + "grad_norm": 0.6479373917403084, + "learning_rate": 8.161505639887817e-05, + "loss": 0.8719, + "step": 1073 + }, + { + "epoch": 0.5728, + "grad_norm": 0.40451274184862296, + "learning_rate": 8.144522461705067e-05, + "loss": 0.6928, + "step": 1074 + }, + { + "epoch": 0.5733333333333334, + "grad_norm": 0.48464990730039686, + "learning_rate": 8.127544824259889e-05, + "loss": 0.7871, + "step": 1075 + }, + { + "epoch": 0.5738666666666666, + "grad_norm": 0.4265284370295491, + "learning_rate": 8.110572778250085e-05, + "loss": 0.6842, + "step": 1076 + }, + { + "epoch": 0.5744, + "grad_norm": 0.4531712771929814, + "learning_rate": 8.093606374356759e-05, + "loss": 0.8089, + "step": 1077 + }, + { + "epoch": 0.5749333333333333, + "grad_norm": 0.5282833160589813, + "learning_rate": 8.076645663244168e-05, + "loss": 0.8612, + "step": 1078 + }, + { + "epoch": 0.5754666666666667, + "grad_norm": 0.4966712268976588, + "learning_rate": 8.059690695559568e-05, + "loss": 0.8598, + "step": 1079 + }, + { + "epoch": 0.576, + "grad_norm": 0.5036087290902579, + "learning_rate": 8.042741521933071e-05, + "loss": 0.7659, + "step": 1080 + }, + { + "epoch": 0.5765333333333333, + "grad_norm": 0.5469001239551731, + "learning_rate": 8.025798192977481e-05, + "loss": 0.5943, + "step": 1081 + }, + { + "epoch": 0.5770666666666666, + "grad_norm": 0.4463582397100332, + "learning_rate": 8.008860759288147e-05, + "loss": 0.6806, + "step": 1082 + }, + { + "epoch": 0.5776, + "grad_norm": 0.4482926143147082, + "learning_rate": 7.991929271442817e-05, + "loss": 0.7409, + "step": 1083 + }, + { + "epoch": 0.5781333333333334, + "grad_norm": 0.5499106714015792, + "learning_rate": 7.975003780001485e-05, + "loss": 0.818, + "step": 1084 + }, + { + "epoch": 0.5786666666666667, + "grad_norm": 0.4929794102658157, + "learning_rate": 7.958084335506239e-05, + "loss": 0.7785, + "step": 1085 + }, + { + "epoch": 0.5792, + "grad_norm": 0.5349856013157154, + "learning_rate": 7.941170988481108e-05, + "loss": 0.6787, + "step": 1086 + }, + { + "epoch": 0.5797333333333333, + "grad_norm": 0.45238853987498595, + "learning_rate": 7.924263789431912e-05, + "loss": 0.7539, + "step": 1087 + }, + { + "epoch": 0.5802666666666667, + "grad_norm": 0.4237508155111719, + "learning_rate": 7.907362788846116e-05, + "loss": 0.7112, + "step": 1088 + }, + { + "epoch": 0.5808, + "grad_norm": 0.45808964928763096, + "learning_rate": 7.89046803719267e-05, + "loss": 0.7771, + "step": 1089 + }, + { + "epoch": 0.5813333333333334, + "grad_norm": 0.4091449714829364, + "learning_rate": 7.873579584921869e-05, + "loss": 0.6485, + "step": 1090 + }, + { + "epoch": 0.5818666666666666, + "grad_norm": 0.4889806760975611, + "learning_rate": 7.856697482465196e-05, + "loss": 0.6759, + "step": 1091 + }, + { + "epoch": 0.5824, + "grad_norm": 0.5508245245182095, + "learning_rate": 7.839821780235168e-05, + "loss": 0.8022, + "step": 1092 + }, + { + "epoch": 0.5829333333333333, + "grad_norm": 0.5898191229656782, + "learning_rate": 7.822952528625191e-05, + "loss": 0.8398, + "step": 1093 + }, + { + "epoch": 0.5834666666666667, + "grad_norm": 0.47826214094023284, + "learning_rate": 7.806089778009421e-05, + "loss": 0.697, + "step": 1094 + }, + { + "epoch": 0.584, + "grad_norm": 0.5681861178160683, + "learning_rate": 7.789233578742582e-05, + "loss": 0.9488, + "step": 1095 + }, + { + "epoch": 0.5845333333333333, + "grad_norm": 0.5392993969331693, + "learning_rate": 7.772383981159849e-05, + "loss": 0.8002, + "step": 1096 + }, + { + "epoch": 0.5850666666666666, + "grad_norm": 0.4913637580180928, + "learning_rate": 7.755541035576677e-05, + "loss": 0.7783, + "step": 1097 + }, + { + "epoch": 0.5856, + "grad_norm": 0.504611556710071, + "learning_rate": 7.738704792288655e-05, + "loss": 0.7271, + "step": 1098 + }, + { + "epoch": 0.5861333333333333, + "grad_norm": 0.46748778682185405, + "learning_rate": 7.721875301571359e-05, + "loss": 0.7513, + "step": 1099 + }, + { + "epoch": 0.5866666666666667, + "grad_norm": 0.45296519137301017, + "learning_rate": 7.705052613680211e-05, + "loss": 0.7341, + "step": 1100 + }, + { + "epoch": 0.5872, + "grad_norm": 0.49348300837413195, + "learning_rate": 7.688236778850306e-05, + "loss": 0.8698, + "step": 1101 + }, + { + "epoch": 0.5877333333333333, + "grad_norm": 0.5032313813731415, + "learning_rate": 7.671427847296275e-05, + "loss": 0.7695, + "step": 1102 + }, + { + "epoch": 0.5882666666666667, + "grad_norm": 0.46451439857227805, + "learning_rate": 7.654625869212146e-05, + "loss": 0.8341, + "step": 1103 + }, + { + "epoch": 0.5888, + "grad_norm": 0.48209882298040035, + "learning_rate": 7.637830894771175e-05, + "loss": 0.7778, + "step": 1104 + }, + { + "epoch": 0.5893333333333334, + "grad_norm": 0.5516104800786016, + "learning_rate": 7.6210429741257e-05, + "loss": 0.7292, + "step": 1105 + }, + { + "epoch": 0.5898666666666667, + "grad_norm": 0.43801049561849525, + "learning_rate": 7.604262157407007e-05, + "loss": 0.7435, + "step": 1106 + }, + { + "epoch": 0.5904, + "grad_norm": 0.5079927430133366, + "learning_rate": 7.587488494725157e-05, + "loss": 0.7599, + "step": 1107 + }, + { + "epoch": 0.5909333333333333, + "grad_norm": 0.4763049318851867, + "learning_rate": 7.570722036168854e-05, + "loss": 0.6854, + "step": 1108 + }, + { + "epoch": 0.5914666666666667, + "grad_norm": 0.7403263673714452, + "learning_rate": 7.55396283180529e-05, + "loss": 0.8395, + "step": 1109 + }, + { + "epoch": 0.592, + "grad_norm": 0.4813353695503427, + "learning_rate": 7.537210931679987e-05, + "loss": 0.8065, + "step": 1110 + }, + { + "epoch": 0.5925333333333334, + "grad_norm": 0.5446426690578455, + "learning_rate": 7.520466385816671e-05, + "loss": 0.7714, + "step": 1111 + }, + { + "epoch": 0.5930666666666666, + "grad_norm": 0.4326182537228563, + "learning_rate": 7.503729244217086e-05, + "loss": 0.6318, + "step": 1112 + }, + { + "epoch": 0.5936, + "grad_norm": 0.4584204156957829, + "learning_rate": 7.48699955686089e-05, + "loss": 0.7055, + "step": 1113 + }, + { + "epoch": 0.5941333333333333, + "grad_norm": 0.6282796044032658, + "learning_rate": 7.470277373705461e-05, + "loss": 0.8419, + "step": 1114 + }, + { + "epoch": 0.5946666666666667, + "grad_norm": 0.5543211445921447, + "learning_rate": 7.453562744685778e-05, + "loss": 0.7328, + "step": 1115 + }, + { + "epoch": 0.5952, + "grad_norm": 0.655151151291856, + "learning_rate": 7.43685571971426e-05, + "loss": 0.7464, + "step": 1116 + }, + { + "epoch": 0.5957333333333333, + "grad_norm": 0.5380245745320105, + "learning_rate": 7.42015634868062e-05, + "loss": 0.753, + "step": 1117 + }, + { + "epoch": 0.5962666666666666, + "grad_norm": 0.5265961461267854, + "learning_rate": 7.403464681451715e-05, + "loss": 0.823, + "step": 1118 + }, + { + "epoch": 0.5968, + "grad_norm": 0.6381331403841647, + "learning_rate": 7.386780767871397e-05, + "loss": 0.9757, + "step": 1119 + }, + { + "epoch": 0.5973333333333334, + "grad_norm": 0.4447661712160991, + "learning_rate": 7.370104657760361e-05, + "loss": 0.6729, + "step": 1120 + }, + { + "epoch": 0.5978666666666667, + "grad_norm": 0.4045124027471263, + "learning_rate": 7.353436400916004e-05, + "loss": 0.6791, + "step": 1121 + }, + { + "epoch": 0.5984, + "grad_norm": 0.44717160795570676, + "learning_rate": 7.336776047112276e-05, + "loss": 0.7998, + "step": 1122 + }, + { + "epoch": 0.5989333333333333, + "grad_norm": 0.4844227400492327, + "learning_rate": 7.320123646099519e-05, + "loss": 0.784, + "step": 1123 + }, + { + "epoch": 0.5994666666666667, + "grad_norm": 0.5960917780879336, + "learning_rate": 7.303479247604332e-05, + "loss": 0.8309, + "step": 1124 + }, + { + "epoch": 0.6, + "grad_norm": 0.4941955447021566, + "learning_rate": 7.286842901329412e-05, + "loss": 0.777, + "step": 1125 + }, + { + "epoch": 0.6005333333333334, + "grad_norm": 0.4671897428092522, + "learning_rate": 7.270214656953415e-05, + "loss": 0.7454, + "step": 1126 + }, + { + "epoch": 0.6010666666666666, + "grad_norm": 0.5189767174413965, + "learning_rate": 7.253594564130804e-05, + "loss": 0.8536, + "step": 1127 + }, + { + "epoch": 0.6016, + "grad_norm": 0.4704982863123676, + "learning_rate": 7.236982672491698e-05, + "loss": 0.7822, + "step": 1128 + }, + { + "epoch": 0.6021333333333333, + "grad_norm": 0.5406238052780499, + "learning_rate": 7.22037903164173e-05, + "loss": 0.7489, + "step": 1129 + }, + { + "epoch": 0.6026666666666667, + "grad_norm": 0.6020386246541988, + "learning_rate": 7.203783691161883e-05, + "loss": 0.8678, + "step": 1130 + }, + { + "epoch": 0.6032, + "grad_norm": 0.45764114524716276, + "learning_rate": 7.187196700608373e-05, + "loss": 0.7431, + "step": 1131 + }, + { + "epoch": 0.6037333333333333, + "grad_norm": 0.5067908160648289, + "learning_rate": 7.170618109512465e-05, + "loss": 0.7513, + "step": 1132 + }, + { + "epoch": 0.6042666666666666, + "grad_norm": 0.5348274612779799, + "learning_rate": 7.154047967380354e-05, + "loss": 0.8676, + "step": 1133 + }, + { + "epoch": 0.6048, + "grad_norm": 0.47661862978938707, + "learning_rate": 7.137486323692995e-05, + "loss": 0.7565, + "step": 1134 + }, + { + "epoch": 0.6053333333333333, + "grad_norm": 0.43205702010047076, + "learning_rate": 7.12093322790597e-05, + "loss": 0.6713, + "step": 1135 + }, + { + "epoch": 0.6058666666666667, + "grad_norm": 0.5378722416279199, + "learning_rate": 7.104388729449338e-05, + "loss": 0.8349, + "step": 1136 + }, + { + "epoch": 0.6064, + "grad_norm": 0.3895921853008002, + "learning_rate": 7.087852877727481e-05, + "loss": 0.6663, + "step": 1137 + }, + { + "epoch": 0.6069333333333333, + "grad_norm": 0.5082806574142494, + "learning_rate": 7.071325722118963e-05, + "loss": 0.7544, + "step": 1138 + }, + { + "epoch": 0.6074666666666667, + "grad_norm": 0.5650201467097065, + "learning_rate": 7.054807311976379e-05, + "loss": 0.7768, + "step": 1139 + }, + { + "epoch": 0.608, + "grad_norm": 0.4096872443532017, + "learning_rate": 7.038297696626206e-05, + "loss": 0.7259, + "step": 1140 + }, + { + "epoch": 0.6085333333333334, + "grad_norm": 0.5379896796802529, + "learning_rate": 7.021796925368667e-05, + "loss": 0.8902, + "step": 1141 + }, + { + "epoch": 0.6090666666666666, + "grad_norm": 0.46756979598603726, + "learning_rate": 7.005305047477566e-05, + "loss": 0.766, + "step": 1142 + }, + { + "epoch": 0.6096, + "grad_norm": 0.48308377185190693, + "learning_rate": 6.988822112200156e-05, + "loss": 0.6935, + "step": 1143 + }, + { + "epoch": 0.6101333333333333, + "grad_norm": 0.46655363286281903, + "learning_rate": 6.972348168756983e-05, + "loss": 0.7312, + "step": 1144 + }, + { + "epoch": 0.6106666666666667, + "grad_norm": 0.6958980881773419, + "learning_rate": 6.955883266341741e-05, + "loss": 0.8651, + "step": 1145 + }, + { + "epoch": 0.6112, + "grad_norm": 0.5290377094155534, + "learning_rate": 6.939427454121128e-05, + "loss": 0.691, + "step": 1146 + }, + { + "epoch": 0.6117333333333334, + "grad_norm": 0.518133987643529, + "learning_rate": 6.922980781234699e-05, + "loss": 0.7471, + "step": 1147 + }, + { + "epoch": 0.6122666666666666, + "grad_norm": 0.5157380752691214, + "learning_rate": 6.906543296794714e-05, + "loss": 0.8151, + "step": 1148 + }, + { + "epoch": 0.6128, + "grad_norm": 0.6494412370042536, + "learning_rate": 6.890115049885994e-05, + "loss": 0.8192, + "step": 1149 + }, + { + "epoch": 0.6133333333333333, + "grad_norm": 0.43205634335668175, + "learning_rate": 6.873696089565786e-05, + "loss": 0.7397, + "step": 1150 + }, + { + "epoch": 0.6138666666666667, + "grad_norm": 0.4554541305344579, + "learning_rate": 6.85728646486359e-05, + "loss": 0.7428, + "step": 1151 + }, + { + "epoch": 0.6144, + "grad_norm": 0.4493850061507572, + "learning_rate": 6.84088622478104e-05, + "loss": 0.6925, + "step": 1152 + }, + { + "epoch": 0.6149333333333333, + "grad_norm": 0.5101316029528573, + "learning_rate": 6.82449541829174e-05, + "loss": 0.7068, + "step": 1153 + }, + { + "epoch": 0.6154666666666667, + "grad_norm": 0.6696214707884905, + "learning_rate": 6.80811409434113e-05, + "loss": 0.928, + "step": 1154 + }, + { + "epoch": 0.616, + "grad_norm": 0.5144302677407107, + "learning_rate": 6.791742301846326e-05, + "loss": 0.7072, + "step": 1155 + }, + { + "epoch": 0.6165333333333334, + "grad_norm": 0.4802014965966708, + "learning_rate": 6.775380089695986e-05, + "loss": 0.7351, + "step": 1156 + }, + { + "epoch": 0.6170666666666667, + "grad_norm": 0.6233943714464858, + "learning_rate": 6.759027506750158e-05, + "loss": 0.7396, + "step": 1157 + }, + { + "epoch": 0.6176, + "grad_norm": 0.5902832195659008, + "learning_rate": 6.742684601840141e-05, + "loss": 0.7455, + "step": 1158 + }, + { + "epoch": 0.6181333333333333, + "grad_norm": 0.5148558440489647, + "learning_rate": 6.726351423768322e-05, + "loss": 0.8222, + "step": 1159 + }, + { + "epoch": 0.6186666666666667, + "grad_norm": 0.5422651160322576, + "learning_rate": 6.710028021308061e-05, + "loss": 0.7444, + "step": 1160 + }, + { + "epoch": 0.6192, + "grad_norm": 0.5997442019912321, + "learning_rate": 6.693714443203507e-05, + "loss": 0.8776, + "step": 1161 + }, + { + "epoch": 0.6197333333333334, + "grad_norm": 0.47216962959753883, + "learning_rate": 6.677410738169485e-05, + "loss": 0.8149, + "step": 1162 + }, + { + "epoch": 0.6202666666666666, + "grad_norm": 0.48578142733428253, + "learning_rate": 6.661116954891328e-05, + "loss": 0.7723, + "step": 1163 + }, + { + "epoch": 0.6208, + "grad_norm": 0.43694832545933093, + "learning_rate": 6.644833142024751e-05, + "loss": 0.7322, + "step": 1164 + }, + { + "epoch": 0.6213333333333333, + "grad_norm": 0.4387965685892197, + "learning_rate": 6.62855934819569e-05, + "loss": 0.7192, + "step": 1165 + }, + { + "epoch": 0.6218666666666667, + "grad_norm": 0.44221737929282645, + "learning_rate": 6.612295622000162e-05, + "loss": 0.6721, + "step": 1166 + }, + { + "epoch": 0.6224, + "grad_norm": 0.5317043483069767, + "learning_rate": 6.59604201200412e-05, + "loss": 0.7802, + "step": 1167 + }, + { + "epoch": 0.6229333333333333, + "grad_norm": 0.5254488331545611, + "learning_rate": 6.579798566743314e-05, + "loss": 0.854, + "step": 1168 + }, + { + "epoch": 0.6234666666666666, + "grad_norm": 0.4320448488412785, + "learning_rate": 6.563565334723134e-05, + "loss": 0.7555, + "step": 1169 + }, + { + "epoch": 0.624, + "grad_norm": 0.6693722250597276, + "learning_rate": 6.547342364418481e-05, + "loss": 0.7934, + "step": 1170 + }, + { + "epoch": 0.6245333333333334, + "grad_norm": 0.561061711091939, + "learning_rate": 6.531129704273604e-05, + "loss": 0.7534, + "step": 1171 + }, + { + "epoch": 0.6250666666666667, + "grad_norm": 0.42815001849208884, + "learning_rate": 6.514927402701964e-05, + "loss": 0.68, + "step": 1172 + }, + { + "epoch": 0.6256, + "grad_norm": 0.6616699072813104, + "learning_rate": 6.498735508086093e-05, + "loss": 0.7953, + "step": 1173 + }, + { + "epoch": 0.6261333333333333, + "grad_norm": 0.607465993246323, + "learning_rate": 6.48255406877745e-05, + "loss": 0.8411, + "step": 1174 + }, + { + "epoch": 0.6266666666666667, + "grad_norm": 0.4634088038952, + "learning_rate": 6.466383133096267e-05, + "loss": 0.7263, + "step": 1175 + }, + { + "epoch": 0.6272, + "grad_norm": 0.4663127644283584, + "learning_rate": 6.450222749331414e-05, + "loss": 0.7019, + "step": 1176 + }, + { + "epoch": 0.6277333333333334, + "grad_norm": 0.5318269218021549, + "learning_rate": 6.434072965740242e-05, + "loss": 0.8198, + "step": 1177 + }, + { + "epoch": 0.6282666666666666, + "grad_norm": 0.48418239365215526, + "learning_rate": 6.417933830548467e-05, + "loss": 0.788, + "step": 1178 + }, + { + "epoch": 0.6288, + "grad_norm": 0.4934059389962716, + "learning_rate": 6.40180539194999e-05, + "loss": 0.7581, + "step": 1179 + }, + { + "epoch": 0.6293333333333333, + "grad_norm": 0.45641908593579644, + "learning_rate": 6.385687698106781e-05, + "loss": 0.7607, + "step": 1180 + }, + { + "epoch": 0.6298666666666667, + "grad_norm": 0.40804816282301337, + "learning_rate": 6.369580797148718e-05, + "loss": 0.7346, + "step": 1181 + }, + { + "epoch": 0.6304, + "grad_norm": 0.41541357433087245, + "learning_rate": 6.35348473717345e-05, + "loss": 0.6507, + "step": 1182 + }, + { + "epoch": 0.6309333333333333, + "grad_norm": 0.530273367861406, + "learning_rate": 6.337399566246257e-05, + "loss": 0.7793, + "step": 1183 + }, + { + "epoch": 0.6314666666666666, + "grad_norm": 0.45198700938716535, + "learning_rate": 6.321325332399903e-05, + "loss": 0.6777, + "step": 1184 + }, + { + "epoch": 0.632, + "grad_norm": 0.4910311261306767, + "learning_rate": 6.305262083634488e-05, + "loss": 0.7497, + "step": 1185 + }, + { + "epoch": 0.6325333333333333, + "grad_norm": 0.4191032950549475, + "learning_rate": 6.289209867917312e-05, + "loss": 0.6815, + "step": 1186 + }, + { + "epoch": 0.6330666666666667, + "grad_norm": 0.42521602565259287, + "learning_rate": 6.273168733182722e-05, + "loss": 0.6696, + "step": 1187 + }, + { + "epoch": 0.6336, + "grad_norm": 0.45724365836303976, + "learning_rate": 6.25713872733199e-05, + "loss": 0.693, + "step": 1188 + }, + { + "epoch": 0.6341333333333333, + "grad_norm": 0.4681549116148737, + "learning_rate": 6.241119898233144e-05, + "loss": 0.7026, + "step": 1189 + }, + { + "epoch": 0.6346666666666667, + "grad_norm": 0.6789018870198453, + "learning_rate": 6.225112293720836e-05, + "loss": 0.7361, + "step": 1190 + }, + { + "epoch": 0.6352, + "grad_norm": 0.5063196359817714, + "learning_rate": 6.209115961596208e-05, + "loss": 0.759, + "step": 1191 + }, + { + "epoch": 0.6357333333333334, + "grad_norm": 0.770884473818953, + "learning_rate": 6.19313094962673e-05, + "loss": 0.6958, + "step": 1192 + }, + { + "epoch": 0.6362666666666666, + "grad_norm": 0.6074969474432725, + "learning_rate": 6.177157305546078e-05, + "loss": 0.8502, + "step": 1193 + }, + { + "epoch": 0.6368, + "grad_norm": 0.5011061449472839, + "learning_rate": 6.161195077053976e-05, + "loss": 0.7515, + "step": 1194 + }, + { + "epoch": 0.6373333333333333, + "grad_norm": 0.5350931694868268, + "learning_rate": 6.145244311816063e-05, + "loss": 0.7487, + "step": 1195 + }, + { + "epoch": 0.6378666666666667, + "grad_norm": 0.6716992073597984, + "learning_rate": 6.129305057463741e-05, + "loss": 0.8926, + "step": 1196 + }, + { + "epoch": 0.6384, + "grad_norm": 0.524797879941467, + "learning_rate": 6.113377361594049e-05, + "loss": 0.8274, + "step": 1197 + }, + { + "epoch": 0.6389333333333334, + "grad_norm": 0.4809902269491656, + "learning_rate": 6.0974612717695004e-05, + "loss": 0.6717, + "step": 1198 + }, + { + "epoch": 0.6394666666666666, + "grad_norm": 0.4219959336351506, + "learning_rate": 6.0815568355179556e-05, + "loss": 0.7069, + "step": 1199 + }, + { + "epoch": 0.64, + "grad_norm": 0.4657782597005101, + "learning_rate": 6.065664100332478e-05, + "loss": 0.7688, + "step": 1200 + }, + { + "epoch": 0.6405333333333333, + "grad_norm": 0.568815942322247, + "learning_rate": 6.0497831136711836e-05, + "loss": 0.811, + "step": 1201 + }, + { + "epoch": 0.6410666666666667, + "grad_norm": 0.5377443693598044, + "learning_rate": 6.0339139229571116e-05, + "loss": 0.8183, + "step": 1202 + }, + { + "epoch": 0.6416, + "grad_norm": 0.45838809987067625, + "learning_rate": 6.018056575578075e-05, + "loss": 0.7749, + "step": 1203 + }, + { + "epoch": 0.6421333333333333, + "grad_norm": 0.5101782962781369, + "learning_rate": 6.002211118886514e-05, + "loss": 0.8099, + "step": 1204 + }, + { + "epoch": 0.6426666666666667, + "grad_norm": 0.48877370124847747, + "learning_rate": 5.986377600199371e-05, + "loss": 0.643, + "step": 1205 + }, + { + "epoch": 0.6432, + "grad_norm": 0.46387351043563374, + "learning_rate": 5.970556066797941e-05, + "loss": 0.6959, + "step": 1206 + }, + { + "epoch": 0.6437333333333334, + "grad_norm": 0.39213100681003926, + "learning_rate": 5.9547465659277215e-05, + "loss": 0.7196, + "step": 1207 + }, + { + "epoch": 0.6442666666666667, + "grad_norm": 0.42281381656254524, + "learning_rate": 5.938949144798279e-05, + "loss": 0.6999, + "step": 1208 + }, + { + "epoch": 0.6448, + "grad_norm": 0.5186431685086226, + "learning_rate": 5.923163850583113e-05, + "loss": 0.8203, + "step": 1209 + }, + { + "epoch": 0.6453333333333333, + "grad_norm": 0.4485652182183885, + "learning_rate": 5.907390730419507e-05, + "loss": 0.686, + "step": 1210 + }, + { + "epoch": 0.6458666666666667, + "grad_norm": 0.5572375127517957, + "learning_rate": 5.8916298314083915e-05, + "loss": 0.7907, + "step": 1211 + }, + { + "epoch": 0.6464, + "grad_norm": 0.40511949815619563, + "learning_rate": 5.875881200614207e-05, + "loss": 0.664, + "step": 1212 + }, + { + "epoch": 0.6469333333333334, + "grad_norm": 0.482497695987161, + "learning_rate": 5.860144885064751e-05, + "loss": 0.7242, + "step": 1213 + }, + { + "epoch": 0.6474666666666666, + "grad_norm": 0.5357150658223143, + "learning_rate": 5.8444209317510514e-05, + "loss": 0.7433, + "step": 1214 + }, + { + "epoch": 0.648, + "grad_norm": 0.5014769543067805, + "learning_rate": 5.828709387627218e-05, + "loss": 0.7665, + "step": 1215 + }, + { + "epoch": 0.6485333333333333, + "grad_norm": 0.5921082136434273, + "learning_rate": 5.813010299610313e-05, + "loss": 0.7779, + "step": 1216 + }, + { + "epoch": 0.6490666666666667, + "grad_norm": 0.48359932766996533, + "learning_rate": 5.797323714580192e-05, + "loss": 0.746, + "step": 1217 + }, + { + "epoch": 0.6496, + "grad_norm": 0.4953142138271574, + "learning_rate": 5.781649679379378e-05, + "loss": 0.7494, + "step": 1218 + }, + { + "epoch": 0.6501333333333333, + "grad_norm": 0.43792013269202484, + "learning_rate": 5.765988240812921e-05, + "loss": 0.6628, + "step": 1219 + }, + { + "epoch": 0.6506666666666666, + "grad_norm": 0.5439812674482977, + "learning_rate": 5.750339445648252e-05, + "loss": 0.7148, + "step": 1220 + }, + { + "epoch": 0.6512, + "grad_norm": 0.5922963850903797, + "learning_rate": 5.73470334061505e-05, + "loss": 0.8375, + "step": 1221 + }, + { + "epoch": 0.6517333333333334, + "grad_norm": 0.5806771674779112, + "learning_rate": 5.7190799724050924e-05, + "loss": 0.8751, + "step": 1222 + }, + { + "epoch": 0.6522666666666667, + "grad_norm": 0.4701483806067986, + "learning_rate": 5.7034693876721376e-05, + "loss": 0.764, + "step": 1223 + }, + { + "epoch": 0.6528, + "grad_norm": 0.4889909786916612, + "learning_rate": 5.687871633031754e-05, + "loss": 0.7088, + "step": 1224 + }, + { + "epoch": 0.6533333333333333, + "grad_norm": 0.4613240962353329, + "learning_rate": 5.6722867550612116e-05, + "loss": 0.7149, + "step": 1225 + }, + { + "epoch": 0.6538666666666667, + "grad_norm": 0.3984948217464435, + "learning_rate": 5.6567148002993164e-05, + "loss": 0.7382, + "step": 1226 + }, + { + "epoch": 0.6544, + "grad_norm": 0.501250894300205, + "learning_rate": 5.6411558152462894e-05, + "loss": 0.7694, + "step": 1227 + }, + { + "epoch": 0.6549333333333334, + "grad_norm": 0.5822686939948196, + "learning_rate": 5.625609846363622e-05, + "loss": 0.8485, + "step": 1228 + }, + { + "epoch": 0.6554666666666666, + "grad_norm": 0.43881242398728537, + "learning_rate": 5.6100769400739383e-05, + "loss": 0.7095, + "step": 1229 + }, + { + "epoch": 0.656, + "grad_norm": 0.4512436169356323, + "learning_rate": 5.5945571427608526e-05, + "loss": 0.792, + "step": 1230 + }, + { + "epoch": 0.6565333333333333, + "grad_norm": 0.43294726731770017, + "learning_rate": 5.579050500768836e-05, + "loss": 0.6956, + "step": 1231 + }, + { + "epoch": 0.6570666666666667, + "grad_norm": 0.5981157102451093, + "learning_rate": 5.5635570604030705e-05, + "loss": 0.7722, + "step": 1232 + }, + { + "epoch": 0.6576, + "grad_norm": 0.5866986489901131, + "learning_rate": 5.54807686792933e-05, + "loss": 0.8363, + "step": 1233 + }, + { + "epoch": 0.6581333333333333, + "grad_norm": 0.4785783401013728, + "learning_rate": 5.53260996957381e-05, + "loss": 0.7856, + "step": 1234 + }, + { + "epoch": 0.6586666666666666, + "grad_norm": 0.5080859880685213, + "learning_rate": 5.5171564115230254e-05, + "loss": 0.7517, + "step": 1235 + }, + { + "epoch": 0.6592, + "grad_norm": 0.46964094461012745, + "learning_rate": 5.501716239923642e-05, + "loss": 0.6938, + "step": 1236 + }, + { + "epoch": 0.6597333333333333, + "grad_norm": 0.672978787343272, + "learning_rate": 5.486289500882355e-05, + "loss": 0.8052, + "step": 1237 + }, + { + "epoch": 0.6602666666666667, + "grad_norm": 0.5027873812480866, + "learning_rate": 5.47087624046575e-05, + "loss": 0.7143, + "step": 1238 + }, + { + "epoch": 0.6608, + "grad_norm": 0.5810707920425806, + "learning_rate": 5.4554765047001613e-05, + "loss": 0.7721, + "step": 1239 + }, + { + "epoch": 0.6613333333333333, + "grad_norm": 0.44439083231981336, + "learning_rate": 5.4400903395715366e-05, + "loss": 0.6527, + "step": 1240 + }, + { + "epoch": 0.6618666666666667, + "grad_norm": 0.42210824519504037, + "learning_rate": 5.424717791025302e-05, + "loss": 0.7518, + "step": 1241 + }, + { + "epoch": 0.6624, + "grad_norm": 0.4714095795814111, + "learning_rate": 5.4093589049662175e-05, + "loss": 0.7473, + "step": 1242 + }, + { + "epoch": 0.6629333333333334, + "grad_norm": 0.504061296510171, + "learning_rate": 5.394013727258254e-05, + "loss": 0.6969, + "step": 1243 + }, + { + "epoch": 0.6634666666666666, + "grad_norm": 0.46041365032163317, + "learning_rate": 5.378682303724435e-05, + "loss": 0.6413, + "step": 1244 + }, + { + "epoch": 0.664, + "grad_norm": 0.4969594868488173, + "learning_rate": 5.363364680146725e-05, + "loss": 0.7814, + "step": 1245 + }, + { + "epoch": 0.6645333333333333, + "grad_norm": 0.49584945662167007, + "learning_rate": 5.348060902265871e-05, + "loss": 0.7614, + "step": 1246 + }, + { + "epoch": 0.6650666666666667, + "grad_norm": 0.5274520652139624, + "learning_rate": 5.332771015781275e-05, + "loss": 0.771, + "step": 1247 + }, + { + "epoch": 0.6656, + "grad_norm": 0.5332755545886234, + "learning_rate": 5.31749506635086e-05, + "loss": 0.7514, + "step": 1248 + }, + { + "epoch": 0.6661333333333334, + "grad_norm": 0.49100379041992465, + "learning_rate": 5.302233099590928e-05, + "loss": 0.7213, + "step": 1249 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.5938538866312052, + "learning_rate": 5.286985161076029e-05, + "loss": 0.7977, + "step": 1250 + }, + { + "epoch": 0.6672, + "grad_norm": 0.4942488328210268, + "learning_rate": 5.271751296338823e-05, + "loss": 0.6769, + "step": 1251 + }, + { + "epoch": 0.6677333333333333, + "grad_norm": 0.4215994160402016, + "learning_rate": 5.2565315508699376e-05, + "loss": 0.5855, + "step": 1252 + }, + { + "epoch": 0.6682666666666667, + "grad_norm": 0.5086154225201112, + "learning_rate": 5.2413259701178505e-05, + "loss": 0.6753, + "step": 1253 + }, + { + "epoch": 0.6688, + "grad_norm": 0.5248714135836858, + "learning_rate": 5.226134599488728e-05, + "loss": 0.6745, + "step": 1254 + }, + { + "epoch": 0.6693333333333333, + "grad_norm": 0.5620922527146265, + "learning_rate": 5.210957484346314e-05, + "loss": 0.7977, + "step": 1255 + }, + { + "epoch": 0.6698666666666667, + "grad_norm": 0.4182022770681136, + "learning_rate": 5.195794670011776e-05, + "loss": 0.6789, + "step": 1256 + }, + { + "epoch": 0.6704, + "grad_norm": 0.42913433058351197, + "learning_rate": 5.180646201763577e-05, + "loss": 0.6787, + "step": 1257 + }, + { + "epoch": 0.6709333333333334, + "grad_norm": 0.6440569118119236, + "learning_rate": 5.165512124837344e-05, + "loss": 0.8633, + "step": 1258 + }, + { + "epoch": 0.6714666666666667, + "grad_norm": 0.49867787813607417, + "learning_rate": 5.150392484425728e-05, + "loss": 0.6548, + "step": 1259 + }, + { + "epoch": 0.672, + "grad_norm": 0.506007457551456, + "learning_rate": 5.135287325678271e-05, + "loss": 0.7303, + "step": 1260 + }, + { + "epoch": 0.6725333333333333, + "grad_norm": 0.4625411722735148, + "learning_rate": 5.120196693701267e-05, + "loss": 0.6942, + "step": 1261 + }, + { + "epoch": 0.6730666666666667, + "grad_norm": 0.5030217459622003, + "learning_rate": 5.105120633557634e-05, + "loss": 0.6883, + "step": 1262 + }, + { + "epoch": 0.6736, + "grad_norm": 0.46391289917365475, + "learning_rate": 5.090059190266779e-05, + "loss": 0.7552, + "step": 1263 + }, + { + "epoch": 0.6741333333333334, + "grad_norm": 0.4119788599648552, + "learning_rate": 5.075012408804458e-05, + "loss": 0.6306, + "step": 1264 + }, + { + "epoch": 0.6746666666666666, + "grad_norm": 0.4399745670376556, + "learning_rate": 5.059980334102637e-05, + "loss": 0.6398, + "step": 1265 + }, + { + "epoch": 0.6752, + "grad_norm": 0.5260393823344774, + "learning_rate": 5.0449630110493836e-05, + "loss": 0.8704, + "step": 1266 + }, + { + "epoch": 0.6757333333333333, + "grad_norm": 0.5601385184124167, + "learning_rate": 5.0299604844886985e-05, + "loss": 0.8091, + "step": 1267 + }, + { + "epoch": 0.6762666666666667, + "grad_norm": 0.5569550119402903, + "learning_rate": 5.014972799220403e-05, + "loss": 0.783, + "step": 1268 + }, + { + "epoch": 0.6768, + "grad_norm": 0.6694025213672582, + "learning_rate": 5.000000000000002e-05, + "loss": 0.8641, + "step": 1269 + }, + { + "epoch": 0.6773333333333333, + "grad_norm": 0.48258198891676685, + "learning_rate": 4.985042131538545e-05, + "loss": 0.7629, + "step": 1270 + }, + { + "epoch": 0.6778666666666666, + "grad_norm": 0.43424476605807916, + "learning_rate": 4.9700992385024934e-05, + "loss": 0.6529, + "step": 1271 + }, + { + "epoch": 0.6784, + "grad_norm": 0.556758950681185, + "learning_rate": 4.955171365513603e-05, + "loss": 0.8336, + "step": 1272 + }, + { + "epoch": 0.6789333333333334, + "grad_norm": 0.46801204945419045, + "learning_rate": 4.940258557148765e-05, + "loss": 0.7457, + "step": 1273 + }, + { + "epoch": 0.6794666666666667, + "grad_norm": 0.5170130104399749, + "learning_rate": 4.9253608579398855e-05, + "loss": 0.7409, + "step": 1274 + }, + { + "epoch": 0.68, + "grad_norm": 0.4723598024244657, + "learning_rate": 4.9104783123737566e-05, + "loss": 0.6647, + "step": 1275 + }, + { + "epoch": 0.6805333333333333, + "grad_norm": 0.5073743528033355, + "learning_rate": 4.895610964891923e-05, + "loss": 0.731, + "step": 1276 + }, + { + "epoch": 0.6810666666666667, + "grad_norm": 0.5333546065197741, + "learning_rate": 4.880758859890536e-05, + "loss": 0.7705, + "step": 1277 + }, + { + "epoch": 0.6816, + "grad_norm": 0.5430941842482363, + "learning_rate": 4.865922041720239e-05, + "loss": 0.7337, + "step": 1278 + }, + { + "epoch": 0.6821333333333334, + "grad_norm": 0.4521245278658801, + "learning_rate": 4.851100554686021e-05, + "loss": 0.7228, + "step": 1279 + }, + { + "epoch": 0.6826666666666666, + "grad_norm": 0.5579528039858371, + "learning_rate": 4.836294443047088e-05, + "loss": 0.6376, + "step": 1280 + }, + { + "epoch": 0.6832, + "grad_norm": 0.4378522151415932, + "learning_rate": 4.821503751016746e-05, + "loss": 0.6456, + "step": 1281 + }, + { + "epoch": 0.6837333333333333, + "grad_norm": 0.47299190909586863, + "learning_rate": 4.8067285227622404e-05, + "loss": 0.7334, + "step": 1282 + }, + { + "epoch": 0.6842666666666667, + "grad_norm": 0.45553444421775985, + "learning_rate": 4.791968802404648e-05, + "loss": 0.6508, + "step": 1283 + }, + { + "epoch": 0.6848, + "grad_norm": 0.6104515238781205, + "learning_rate": 4.777224634018732e-05, + "loss": 0.7464, + "step": 1284 + }, + { + "epoch": 0.6853333333333333, + "grad_norm": 0.4312168444174294, + "learning_rate": 4.762496061632814e-05, + "loss": 0.7016, + "step": 1285 + }, + { + "epoch": 0.6858666666666666, + "grad_norm": 0.6148834760259394, + "learning_rate": 4.747783129228656e-05, + "loss": 0.769, + "step": 1286 + }, + { + "epoch": 0.6864, + "grad_norm": 0.4715346096559044, + "learning_rate": 4.733085880741301e-05, + "loss": 0.6776, + "step": 1287 + }, + { + "epoch": 0.6869333333333333, + "grad_norm": 0.6178802722197313, + "learning_rate": 4.718404360058966e-05, + "loss": 0.8642, + "step": 1288 + }, + { + "epoch": 0.6874666666666667, + "grad_norm": 0.6860616728128789, + "learning_rate": 4.7037386110228985e-05, + "loss": 0.8978, + "step": 1289 + }, + { + "epoch": 0.688, + "grad_norm": 0.5622536592324762, + "learning_rate": 4.689088677427249e-05, + "loss": 0.8318, + "step": 1290 + }, + { + "epoch": 0.6885333333333333, + "grad_norm": 0.6339672235931774, + "learning_rate": 4.6744546030189486e-05, + "loss": 0.8077, + "step": 1291 + }, + { + "epoch": 0.6890666666666667, + "grad_norm": 0.6337318821383485, + "learning_rate": 4.659836431497563e-05, + "loss": 0.8657, + "step": 1292 + }, + { + "epoch": 0.6896, + "grad_norm": 0.5440546726106485, + "learning_rate": 4.645234206515171e-05, + "loss": 0.8027, + "step": 1293 + }, + { + "epoch": 0.6901333333333334, + "grad_norm": 0.4885364928818766, + "learning_rate": 4.630647971676232e-05, + "loss": 0.7567, + "step": 1294 + }, + { + "epoch": 0.6906666666666667, + "grad_norm": 0.5547202784254522, + "learning_rate": 4.6160777705374524e-05, + "loss": 0.7392, + "step": 1295 + }, + { + "epoch": 0.6912, + "grad_norm": 0.5522379529790205, + "learning_rate": 4.6015236466076747e-05, + "loss": 0.7507, + "step": 1296 + }, + { + "epoch": 0.6917333333333333, + "grad_norm": 0.5014160980658192, + "learning_rate": 4.586985643347717e-05, + "loss": 0.6989, + "step": 1297 + }, + { + "epoch": 0.6922666666666667, + "grad_norm": 0.48194452166869545, + "learning_rate": 4.572463804170263e-05, + "loss": 0.7215, + "step": 1298 + }, + { + "epoch": 0.6928, + "grad_norm": 0.5414145480932072, + "learning_rate": 4.5579581724397255e-05, + "loss": 0.7474, + "step": 1299 + }, + { + "epoch": 0.6933333333333334, + "grad_norm": 0.5363974976619018, + "learning_rate": 4.543468791472131e-05, + "loss": 0.6966, + "step": 1300 + }, + { + "epoch": 0.6938666666666666, + "grad_norm": 0.5315348555098458, + "learning_rate": 4.5289957045349653e-05, + "loss": 0.7057, + "step": 1301 + }, + { + "epoch": 0.6944, + "grad_norm": 0.5496948104991902, + "learning_rate": 4.514538954847064e-05, + "loss": 0.9065, + "step": 1302 + }, + { + "epoch": 0.6949333333333333, + "grad_norm": 0.43605694299155395, + "learning_rate": 4.5000985855784746e-05, + "loss": 0.6293, + "step": 1303 + }, + { + "epoch": 0.6954666666666667, + "grad_norm": 0.46088868134026584, + "learning_rate": 4.485674639850333e-05, + "loss": 0.7132, + "step": 1304 + }, + { + "epoch": 0.696, + "grad_norm": 0.4649796978337266, + "learning_rate": 4.471267160734731e-05, + "loss": 0.7358, + "step": 1305 + }, + { + "epoch": 0.6965333333333333, + "grad_norm": 0.4418888859085772, + "learning_rate": 4.456876191254582e-05, + "loss": 0.7071, + "step": 1306 + }, + { + "epoch": 0.6970666666666666, + "grad_norm": 0.5046766627277882, + "learning_rate": 4.442501774383515e-05, + "loss": 0.7768, + "step": 1307 + }, + { + "epoch": 0.6976, + "grad_norm": 0.4557547083973194, + "learning_rate": 4.428143953045717e-05, + "loss": 0.7131, + "step": 1308 + }, + { + "epoch": 0.6981333333333334, + "grad_norm": 0.39055024125647814, + "learning_rate": 4.413802770115816e-05, + "loss": 0.6099, + "step": 1309 + }, + { + "epoch": 0.6986666666666667, + "grad_norm": 0.45005225206007193, + "learning_rate": 4.399478268418771e-05, + "loss": 0.7233, + "step": 1310 + }, + { + "epoch": 0.6992, + "grad_norm": 0.4326733656972398, + "learning_rate": 4.385170490729712e-05, + "loss": 0.634, + "step": 1311 + }, + { + "epoch": 0.6997333333333333, + "grad_norm": 0.5376495961296939, + "learning_rate": 4.3708794797738375e-05, + "loss": 0.8217, + "step": 1312 + }, + { + "epoch": 0.7002666666666667, + "grad_norm": 0.3553498079363257, + "learning_rate": 4.3566052782262735e-05, + "loss": 0.6513, + "step": 1313 + }, + { + "epoch": 0.7008, + "grad_norm": 0.4831175795186089, + "learning_rate": 4.342347928711953e-05, + "loss": 0.719, + "step": 1314 + }, + { + "epoch": 0.7013333333333334, + "grad_norm": 0.4580642378892101, + "learning_rate": 4.328107473805487e-05, + "loss": 0.6152, + "step": 1315 + }, + { + "epoch": 0.7018666666666666, + "grad_norm": 0.6353832067445115, + "learning_rate": 4.3138839560310303e-05, + "loss": 0.7743, + "step": 1316 + }, + { + "epoch": 0.7024, + "grad_norm": 0.7433318010308233, + "learning_rate": 4.2996774178621736e-05, + "loss": 0.7238, + "step": 1317 + }, + { + "epoch": 0.7029333333333333, + "grad_norm": 0.5120089821147191, + "learning_rate": 4.2854879017217894e-05, + "loss": 0.6953, + "step": 1318 + }, + { + "epoch": 0.7034666666666667, + "grad_norm": 0.5530914964555543, + "learning_rate": 4.271315449981934e-05, + "loss": 0.7672, + "step": 1319 + }, + { + "epoch": 0.704, + "grad_norm": 0.446703431755806, + "learning_rate": 4.257160104963696e-05, + "loss": 0.7543, + "step": 1320 + }, + { + "epoch": 0.7045333333333333, + "grad_norm": 0.4985683155201103, + "learning_rate": 4.2430219089370823e-05, + "loss": 0.763, + "step": 1321 + }, + { + "epoch": 0.7050666666666666, + "grad_norm": 0.4447455806296091, + "learning_rate": 4.228900904120895e-05, + "loss": 0.7282, + "step": 1322 + }, + { + "epoch": 0.7056, + "grad_norm": 0.524732558471761, + "learning_rate": 4.2147971326825966e-05, + "loss": 0.793, + "step": 1323 + }, + { + "epoch": 0.7061333333333333, + "grad_norm": 0.5169358793386757, + "learning_rate": 4.200710636738189e-05, + "loss": 0.723, + "step": 1324 + }, + { + "epoch": 0.7066666666666667, + "grad_norm": 0.5670401797972314, + "learning_rate": 4.1866414583520877e-05, + "loss": 0.699, + "step": 1325 + }, + { + "epoch": 0.7072, + "grad_norm": 0.44486036937631557, + "learning_rate": 4.172589639536991e-05, + "loss": 0.7626, + "step": 1326 + }, + { + "epoch": 0.7077333333333333, + "grad_norm": 0.5124037474686822, + "learning_rate": 4.158555222253771e-05, + "loss": 0.7847, + "step": 1327 + }, + { + "epoch": 0.7082666666666667, + "grad_norm": 0.44326342488274656, + "learning_rate": 4.14453824841132e-05, + "loss": 0.6848, + "step": 1328 + }, + { + "epoch": 0.7088, + "grad_norm": 0.557505037931972, + "learning_rate": 4.130538759866457e-05, + "loss": 0.777, + "step": 1329 + }, + { + "epoch": 0.7093333333333334, + "grad_norm": 0.5183138109894138, + "learning_rate": 4.1165567984237764e-05, + "loss": 0.7005, + "step": 1330 + }, + { + "epoch": 0.7098666666666666, + "grad_norm": 0.477675504552373, + "learning_rate": 4.102592405835536e-05, + "loss": 0.7449, + "step": 1331 + }, + { + "epoch": 0.7104, + "grad_norm": 0.5426554793530708, + "learning_rate": 4.088645623801534e-05, + "loss": 0.6782, + "step": 1332 + }, + { + "epoch": 0.7109333333333333, + "grad_norm": 0.44255186163158894, + "learning_rate": 4.074716493968975e-05, + "loss": 0.6841, + "step": 1333 + }, + { + "epoch": 0.7114666666666667, + "grad_norm": 0.3863902721025659, + "learning_rate": 4.060805057932359e-05, + "loss": 0.6402, + "step": 1334 + }, + { + "epoch": 0.712, + "grad_norm": 0.3884260682205065, + "learning_rate": 4.046911357233343e-05, + "loss": 0.6679, + "step": 1335 + }, + { + "epoch": 0.7125333333333334, + "grad_norm": 0.48546800516790917, + "learning_rate": 4.0330354333606234e-05, + "loss": 0.7065, + "step": 1336 + }, + { + "epoch": 0.7130666666666666, + "grad_norm": 0.45241040521383147, + "learning_rate": 4.019177327749822e-05, + "loss": 0.6521, + "step": 1337 + }, + { + "epoch": 0.7136, + "grad_norm": 0.5625006140730437, + "learning_rate": 4.00533708178334e-05, + "loss": 0.7623, + "step": 1338 + }, + { + "epoch": 0.7141333333333333, + "grad_norm": 0.5578660711528415, + "learning_rate": 3.991514736790258e-05, + "loss": 0.7355, + "step": 1339 + }, + { + "epoch": 0.7146666666666667, + "grad_norm": 0.46600670807627503, + "learning_rate": 3.977710334046193e-05, + "loss": 0.7163, + "step": 1340 + }, + { + "epoch": 0.7152, + "grad_norm": 0.5593721202886864, + "learning_rate": 3.963923914773187e-05, + "loss": 0.8431, + "step": 1341 + }, + { + "epoch": 0.7157333333333333, + "grad_norm": 0.40681967664480323, + "learning_rate": 3.950155520139581e-05, + "loss": 0.6492, + "step": 1342 + }, + { + "epoch": 0.7162666666666667, + "grad_norm": 0.5289946303807045, + "learning_rate": 3.936405191259891e-05, + "loss": 0.7155, + "step": 1343 + }, + { + "epoch": 0.7168, + "grad_norm": 0.48816964803481955, + "learning_rate": 3.922672969194686e-05, + "loss": 0.8317, + "step": 1344 + }, + { + "epoch": 0.7173333333333334, + "grad_norm": 0.5769581385100243, + "learning_rate": 3.9089588949504655e-05, + "loss": 0.8158, + "step": 1345 + }, + { + "epoch": 0.7178666666666667, + "grad_norm": 0.4771966205047752, + "learning_rate": 3.895263009479534e-05, + "loss": 0.6407, + "step": 1346 + }, + { + "epoch": 0.7184, + "grad_norm": 0.4979329202373666, + "learning_rate": 3.8815853536798904e-05, + "loss": 0.7623, + "step": 1347 + }, + { + "epoch": 0.7189333333333333, + "grad_norm": 0.5001062879658064, + "learning_rate": 3.867925968395085e-05, + "loss": 0.719, + "step": 1348 + }, + { + "epoch": 0.7194666666666667, + "grad_norm": 0.506521075551192, + "learning_rate": 3.854284894414122e-05, + "loss": 0.6791, + "step": 1349 + }, + { + "epoch": 0.72, + "grad_norm": 0.5503240252563162, + "learning_rate": 3.840662172471315e-05, + "loss": 0.8363, + "step": 1350 + }, + { + "epoch": 0.7205333333333334, + "grad_norm": 0.5162890993641392, + "learning_rate": 3.82705784324618e-05, + "loss": 0.7164, + "step": 1351 + }, + { + "epoch": 0.7210666666666666, + "grad_norm": 0.5295010839522402, + "learning_rate": 3.8134719473633094e-05, + "loss": 0.6623, + "step": 1352 + }, + { + "epoch": 0.7216, + "grad_norm": 0.5019958403606616, + "learning_rate": 3.79990452539225e-05, + "loss": 0.7614, + "step": 1353 + }, + { + "epoch": 0.7221333333333333, + "grad_norm": 0.49783319519372066, + "learning_rate": 3.786355617847385e-05, + "loss": 0.76, + "step": 1354 + }, + { + "epoch": 0.7226666666666667, + "grad_norm": 0.47246774450334006, + "learning_rate": 3.772825265187802e-05, + "loss": 0.6311, + "step": 1355 + }, + { + "epoch": 0.7232, + "grad_norm": 0.49908252295917693, + "learning_rate": 3.759313507817196e-05, + "loss": 0.7004, + "step": 1356 + }, + { + "epoch": 0.7237333333333333, + "grad_norm": 0.5023872903425932, + "learning_rate": 3.7458203860837234e-05, + "loss": 0.7962, + "step": 1357 + }, + { + "epoch": 0.7242666666666666, + "grad_norm": 0.5296038386612373, + "learning_rate": 3.732345940279893e-05, + "loss": 0.7145, + "step": 1358 + }, + { + "epoch": 0.7248, + "grad_norm": 0.4801478395862959, + "learning_rate": 3.7188902106424416e-05, + "loss": 0.7896, + "step": 1359 + }, + { + "epoch": 0.7253333333333334, + "grad_norm": 0.47532808111907654, + "learning_rate": 3.705453237352227e-05, + "loss": 0.6979, + "step": 1360 + }, + { + "epoch": 0.7258666666666667, + "grad_norm": 0.6487872517751047, + "learning_rate": 3.692035060534088e-05, + "loss": 0.9316, + "step": 1361 + }, + { + "epoch": 0.7264, + "grad_norm": 0.45283259562586164, + "learning_rate": 3.678635720256737e-05, + "loss": 0.7261, + "step": 1362 + }, + { + "epoch": 0.7269333333333333, + "grad_norm": 0.5359049357600549, + "learning_rate": 3.665255256532638e-05, + "loss": 0.7996, + "step": 1363 + }, + { + "epoch": 0.7274666666666667, + "grad_norm": 0.57903559949146, + "learning_rate": 3.651893709317887e-05, + "loss": 0.7025, + "step": 1364 + }, + { + "epoch": 0.728, + "grad_norm": 0.42218551646518954, + "learning_rate": 3.638551118512089e-05, + "loss": 0.6646, + "step": 1365 + }, + { + "epoch": 0.7285333333333334, + "grad_norm": 0.4729458889883054, + "learning_rate": 3.625227523958252e-05, + "loss": 0.6803, + "step": 1366 + }, + { + "epoch": 0.7290666666666666, + "grad_norm": 0.5367830113665476, + "learning_rate": 3.611922965442648e-05, + "loss": 0.7685, + "step": 1367 + }, + { + "epoch": 0.7296, + "grad_norm": 0.3904123291334064, + "learning_rate": 3.5986374826947066e-05, + "loss": 0.6442, + "step": 1368 + }, + { + "epoch": 0.7301333333333333, + "grad_norm": 0.5182504791183655, + "learning_rate": 3.5853711153868965e-05, + "loss": 0.7414, + "step": 1369 + }, + { + "epoch": 0.7306666666666667, + "grad_norm": 0.5489916184826816, + "learning_rate": 3.5721239031346066e-05, + "loss": 0.7705, + "step": 1370 + }, + { + "epoch": 0.7312, + "grad_norm": 0.5345518887498807, + "learning_rate": 3.558895885496023e-05, + "loss": 0.7244, + "step": 1371 + }, + { + "epoch": 0.7317333333333333, + "grad_norm": 0.39855204806143646, + "learning_rate": 3.545687101972013e-05, + "loss": 0.7069, + "step": 1372 + }, + { + "epoch": 0.7322666666666666, + "grad_norm": 0.38343271970491233, + "learning_rate": 3.53249759200601e-05, + "loss": 0.6327, + "step": 1373 + }, + { + "epoch": 0.7328, + "grad_norm": 0.5320127876609486, + "learning_rate": 3.519327394983888e-05, + "loss": 0.8237, + "step": 1374 + }, + { + "epoch": 0.7333333333333333, + "grad_norm": 0.5319164206608027, + "learning_rate": 3.506176550233863e-05, + "loss": 0.7438, + "step": 1375 + }, + { + "epoch": 0.7338666666666667, + "grad_norm": 0.4475564619340168, + "learning_rate": 3.4930450970263485e-05, + "loss": 0.7102, + "step": 1376 + }, + { + "epoch": 0.7344, + "grad_norm": 0.5204112631420996, + "learning_rate": 3.479933074573858e-05, + "loss": 0.7062, + "step": 1377 + }, + { + "epoch": 0.7349333333333333, + "grad_norm": 0.5665374236680606, + "learning_rate": 3.46684052203088e-05, + "loss": 0.725, + "step": 1378 + }, + { + "epoch": 0.7354666666666667, + "grad_norm": 0.49320064737854913, + "learning_rate": 3.4537674784937614e-05, + "loss": 0.8121, + "step": 1379 + }, + { + "epoch": 0.736, + "grad_norm": 0.6305004785407522, + "learning_rate": 3.440713983000601e-05, + "loss": 0.7841, + "step": 1380 + }, + { + "epoch": 0.7365333333333334, + "grad_norm": 0.5103737337145858, + "learning_rate": 3.427680074531113e-05, + "loss": 0.7701, + "step": 1381 + }, + { + "epoch": 0.7370666666666666, + "grad_norm": 0.5587074934448885, + "learning_rate": 3.4146657920065285e-05, + "loss": 0.8189, + "step": 1382 + }, + { + "epoch": 0.7376, + "grad_norm": 0.608655438526152, + "learning_rate": 3.401671174289469e-05, + "loss": 0.7247, + "step": 1383 + }, + { + "epoch": 0.7381333333333333, + "grad_norm": 0.3973307572980837, + "learning_rate": 3.388696260183832e-05, + "loss": 0.6174, + "step": 1384 + }, + { + "epoch": 0.7386666666666667, + "grad_norm": 0.5380683436519683, + "learning_rate": 3.3757410884346894e-05, + "loss": 0.7722, + "step": 1385 + }, + { + "epoch": 0.7392, + "grad_norm": 0.5235736873409991, + "learning_rate": 3.362805697728145e-05, + "loss": 0.7243, + "step": 1386 + }, + { + "epoch": 0.7397333333333334, + "grad_norm": 0.4668120644303702, + "learning_rate": 3.3498901266912396e-05, + "loss": 0.6957, + "step": 1387 + }, + { + "epoch": 0.7402666666666666, + "grad_norm": 0.4843637052915313, + "learning_rate": 3.336994413891828e-05, + "loss": 0.7176, + "step": 1388 + }, + { + "epoch": 0.7408, + "grad_norm": 0.36948340124858087, + "learning_rate": 3.324118597838464e-05, + "loss": 0.6135, + "step": 1389 + }, + { + "epoch": 0.7413333333333333, + "grad_norm": 0.3817406780247521, + "learning_rate": 3.3112627169802946e-05, + "loss": 0.5869, + "step": 1390 + }, + { + "epoch": 0.7418666666666667, + "grad_norm": 0.5036523649989718, + "learning_rate": 3.298426809706928e-05, + "loss": 0.65, + "step": 1391 + }, + { + "epoch": 0.7424, + "grad_norm": 0.6680926557764835, + "learning_rate": 3.285610914348332e-05, + "loss": 0.777, + "step": 1392 + }, + { + "epoch": 0.7429333333333333, + "grad_norm": 0.4547140456205486, + "learning_rate": 3.2728150691747115e-05, + "loss": 0.6212, + "step": 1393 + }, + { + "epoch": 0.7434666666666667, + "grad_norm": 0.4760441911558683, + "learning_rate": 3.2600393123964113e-05, + "loss": 0.6633, + "step": 1394 + }, + { + "epoch": 0.744, + "grad_norm": 0.4472468798232282, + "learning_rate": 3.2472836821637744e-05, + "loss": 0.6992, + "step": 1395 + }, + { + "epoch": 0.7445333333333334, + "grad_norm": 0.541400706662573, + "learning_rate": 3.234548216567049e-05, + "loss": 0.8108, + "step": 1396 + }, + { + "epoch": 0.7450666666666667, + "grad_norm": 0.5837970062841571, + "learning_rate": 3.2218329536362704e-05, + "loss": 0.7361, + "step": 1397 + }, + { + "epoch": 0.7456, + "grad_norm": 0.5825471471342758, + "learning_rate": 3.209137931341143e-05, + "loss": 0.8008, + "step": 1398 + }, + { + "epoch": 0.7461333333333333, + "grad_norm": 0.49119367337757014, + "learning_rate": 3.196463187590929e-05, + "loss": 0.671, + "step": 1399 + }, + { + "epoch": 0.7466666666666667, + "grad_norm": 0.5146132969965356, + "learning_rate": 3.1838087602343344e-05, + "loss": 0.7137, + "step": 1400 + }, + { + "epoch": 0.7472, + "grad_norm": 0.45711650738707627, + "learning_rate": 3.1711746870594086e-05, + "loss": 0.7292, + "step": 1401 + }, + { + "epoch": 0.7477333333333334, + "grad_norm": 0.5450920433168046, + "learning_rate": 3.158561005793402e-05, + "loss": 0.7085, + "step": 1402 + }, + { + "epoch": 0.7482666666666666, + "grad_norm": 0.4880516059939469, + "learning_rate": 3.145967754102691e-05, + "loss": 0.7886, + "step": 1403 + }, + { + "epoch": 0.7488, + "grad_norm": 0.5540779020706869, + "learning_rate": 3.1333949695926324e-05, + "loss": 0.7657, + "step": 1404 + }, + { + "epoch": 0.7493333333333333, + "grad_norm": 0.4985735020980697, + "learning_rate": 3.120842689807468e-05, + "loss": 0.711, + "step": 1405 + }, + { + "epoch": 0.7498666666666667, + "grad_norm": 0.4345770921338324, + "learning_rate": 3.108310952230212e-05, + "loss": 0.6625, + "step": 1406 + }, + { + "epoch": 0.7504, + "grad_norm": 0.39460650068404723, + "learning_rate": 3.0957997942825336e-05, + "loss": 0.6976, + "step": 1407 + }, + { + "epoch": 0.7509333333333333, + "grad_norm": 0.5785645508518101, + "learning_rate": 3.083309253324651e-05, + "loss": 0.8539, + "step": 1408 + }, + { + "epoch": 0.7514666666666666, + "grad_norm": 0.9136536474478617, + "learning_rate": 3.070839366655215e-05, + "loss": 0.7693, + "step": 1409 + }, + { + "epoch": 0.752, + "grad_norm": 0.7312061473447439, + "learning_rate": 3.058390171511196e-05, + "loss": 0.9283, + "step": 1410 + }, + { + "epoch": 0.7525333333333334, + "grad_norm": 0.4833939545743598, + "learning_rate": 3.0459617050677868e-05, + "loss": 0.7393, + "step": 1411 + }, + { + "epoch": 0.7530666666666667, + "grad_norm": 0.5777872168268464, + "learning_rate": 3.0335540044382694e-05, + "loss": 0.6804, + "step": 1412 + }, + { + "epoch": 0.7536, + "grad_norm": 0.7351180136300047, + "learning_rate": 3.021167106673928e-05, + "loss": 0.8204, + "step": 1413 + }, + { + "epoch": 0.7541333333333333, + "grad_norm": 0.5079474619126504, + "learning_rate": 3.008801048763914e-05, + "loss": 0.7901, + "step": 1414 + }, + { + "epoch": 0.7546666666666667, + "grad_norm": 0.45629376236150493, + "learning_rate": 2.996455867635155e-05, + "loss": 0.7184, + "step": 1415 + }, + { + "epoch": 0.7552, + "grad_norm": 0.5133009909462517, + "learning_rate": 2.9841316001522347e-05, + "loss": 0.8571, + "step": 1416 + }, + { + "epoch": 0.7557333333333334, + "grad_norm": 0.4853061726082453, + "learning_rate": 2.9718282831172883e-05, + "loss": 0.6878, + "step": 1417 + }, + { + "epoch": 0.7562666666666666, + "grad_norm": 0.7062162795938509, + "learning_rate": 2.9595459532698854e-05, + "loss": 0.8062, + "step": 1418 + }, + { + "epoch": 0.7568, + "grad_norm": 0.5187628854617226, + "learning_rate": 2.9472846472869298e-05, + "loss": 0.7226, + "step": 1419 + }, + { + "epoch": 0.7573333333333333, + "grad_norm": 0.44021219651503096, + "learning_rate": 2.9350444017825385e-05, + "loss": 0.6203, + "step": 1420 + }, + { + "epoch": 0.7578666666666667, + "grad_norm": 0.5201824428298304, + "learning_rate": 2.922825253307947e-05, + "loss": 0.7495, + "step": 1421 + }, + { + "epoch": 0.7584, + "grad_norm": 0.9047447233905686, + "learning_rate": 2.9106272383513835e-05, + "loss": 0.7796, + "step": 1422 + }, + { + "epoch": 0.7589333333333333, + "grad_norm": 0.38481153766405335, + "learning_rate": 2.898450393337977e-05, + "loss": 0.6882, + "step": 1423 + }, + { + "epoch": 0.7594666666666666, + "grad_norm": 0.44462652136571124, + "learning_rate": 2.8862947546296315e-05, + "loss": 0.7444, + "step": 1424 + }, + { + "epoch": 0.76, + "grad_norm": 0.5055843010425, + "learning_rate": 2.874160358524931e-05, + "loss": 0.6945, + "step": 1425 + }, + { + "epoch": 0.7605333333333333, + "grad_norm": 0.5534967200400872, + "learning_rate": 2.8620472412590228e-05, + "loss": 0.8019, + "step": 1426 + }, + { + "epoch": 0.7610666666666667, + "grad_norm": 0.506842202837792, + "learning_rate": 2.8499554390035143e-05, + "loss": 0.7231, + "step": 1427 + }, + { + "epoch": 0.7616, + "grad_norm": 0.47564544136627496, + "learning_rate": 2.8378849878663628e-05, + "loss": 0.7637, + "step": 1428 + }, + { + "epoch": 0.7621333333333333, + "grad_norm": 0.46953984846854974, + "learning_rate": 2.8258359238917665e-05, + "loss": 0.6723, + "step": 1429 + }, + { + "epoch": 0.7626666666666667, + "grad_norm": 0.46835967280477037, + "learning_rate": 2.8138082830600554e-05, + "loss": 0.6714, + "step": 1430 + }, + { + "epoch": 0.7632, + "grad_norm": 0.5744851355888679, + "learning_rate": 2.8018021012875994e-05, + "loss": 0.7623, + "step": 1431 + }, + { + "epoch": 0.7637333333333334, + "grad_norm": 0.46780492927444983, + "learning_rate": 2.7898174144266732e-05, + "loss": 0.6781, + "step": 1432 + }, + { + "epoch": 0.7642666666666666, + "grad_norm": 0.6108339290589371, + "learning_rate": 2.7778542582653744e-05, + "loss": 0.7367, + "step": 1433 + }, + { + "epoch": 0.7648, + "grad_norm": 0.4200445732358918, + "learning_rate": 2.7659126685275027e-05, + "loss": 0.6772, + "step": 1434 + }, + { + "epoch": 0.7653333333333333, + "grad_norm": 0.44003838748442664, + "learning_rate": 2.753992680872457e-05, + "loss": 0.8008, + "step": 1435 + }, + { + "epoch": 0.7658666666666667, + "grad_norm": 0.5101369359628932, + "learning_rate": 2.7420943308951284e-05, + "loss": 0.7532, + "step": 1436 + }, + { + "epoch": 0.7664, + "grad_norm": 0.49022908128800663, + "learning_rate": 2.7302176541257986e-05, + "loss": 0.6776, + "step": 1437 + }, + { + "epoch": 0.7669333333333334, + "grad_norm": 0.5011160457821283, + "learning_rate": 2.7183626860300247e-05, + "loss": 0.7838, + "step": 1438 + }, + { + "epoch": 0.7674666666666666, + "grad_norm": 0.5549048902973877, + "learning_rate": 2.7065294620085424e-05, + "loss": 0.7788, + "step": 1439 + }, + { + "epoch": 0.768, + "grad_norm": 0.46499151041580006, + "learning_rate": 2.6947180173971508e-05, + "loss": 0.6757, + "step": 1440 + }, + { + "epoch": 0.7685333333333333, + "grad_norm": 0.5227086813836328, + "learning_rate": 2.6829283874666233e-05, + "loss": 0.611, + "step": 1441 + }, + { + "epoch": 0.7690666666666667, + "grad_norm": 0.5269337175198842, + "learning_rate": 2.6711606074225782e-05, + "loss": 0.7644, + "step": 1442 + }, + { + "epoch": 0.7696, + "grad_norm": 0.5937431269608971, + "learning_rate": 2.659414712405398e-05, + "loss": 0.7407, + "step": 1443 + }, + { + "epoch": 0.7701333333333333, + "grad_norm": 0.7069015746962616, + "learning_rate": 2.647690737490106e-05, + "loss": 0.7024, + "step": 1444 + }, + { + "epoch": 0.7706666666666667, + "grad_norm": 0.4820769595037139, + "learning_rate": 2.6359887176862718e-05, + "loss": 0.7244, + "step": 1445 + }, + { + "epoch": 0.7712, + "grad_norm": 0.6025833896320884, + "learning_rate": 2.6243086879379e-05, + "loss": 0.835, + "step": 1446 + }, + { + "epoch": 0.7717333333333334, + "grad_norm": 0.5122563531231694, + "learning_rate": 2.6126506831233344e-05, + "loss": 0.7097, + "step": 1447 + }, + { + "epoch": 0.7722666666666667, + "grad_norm": 0.43897546891764955, + "learning_rate": 2.6010147380551475e-05, + "loss": 0.6709, + "step": 1448 + }, + { + "epoch": 0.7728, + "grad_norm": 0.47938233123497537, + "learning_rate": 2.5894008874800325e-05, + "loss": 0.6765, + "step": 1449 + }, + { + "epoch": 0.7733333333333333, + "grad_norm": 0.4074266138722427, + "learning_rate": 2.577809166078716e-05, + "loss": 0.6932, + "step": 1450 + }, + { + "epoch": 0.7738666666666667, + "grad_norm": 0.5324500134388179, + "learning_rate": 2.566239608465838e-05, + "loss": 0.7736, + "step": 1451 + }, + { + "epoch": 0.7744, + "grad_norm": 0.5665658629227085, + "learning_rate": 2.5546922491898495e-05, + "loss": 0.7511, + "step": 1452 + }, + { + "epoch": 0.7749333333333334, + "grad_norm": 0.5028946018469815, + "learning_rate": 2.543167122732918e-05, + "loss": 0.6953, + "step": 1453 + }, + { + "epoch": 0.7754666666666666, + "grad_norm": 0.49206414908888135, + "learning_rate": 2.5316642635108244e-05, + "loss": 0.6946, + "step": 1454 + }, + { + "epoch": 0.776, + "grad_norm": 0.5304755577081713, + "learning_rate": 2.5201837058728505e-05, + "loss": 0.7475, + "step": 1455 + }, + { + "epoch": 0.7765333333333333, + "grad_norm": 0.6393735675642492, + "learning_rate": 2.508725484101684e-05, + "loss": 0.8167, + "step": 1456 + }, + { + "epoch": 0.7770666666666667, + "grad_norm": 0.513773800806844, + "learning_rate": 2.4972896324133144e-05, + "loss": 0.6585, + "step": 1457 + }, + { + "epoch": 0.7776, + "grad_norm": 0.5662132456840565, + "learning_rate": 2.485876184956928e-05, + "loss": 0.7496, + "step": 1458 + }, + { + "epoch": 0.7781333333333333, + "grad_norm": 0.48634830493495995, + "learning_rate": 2.4744851758148156e-05, + "loss": 0.7295, + "step": 1459 + }, + { + "epoch": 0.7786666666666666, + "grad_norm": 0.43229534390856567, + "learning_rate": 2.4631166390022574e-05, + "loss": 0.7104, + "step": 1460 + }, + { + "epoch": 0.7792, + "grad_norm": 0.3938232209502425, + "learning_rate": 2.451770608467432e-05, + "loss": 0.6811, + "step": 1461 + }, + { + "epoch": 0.7797333333333333, + "grad_norm": 0.5132216213085693, + "learning_rate": 2.4404471180913058e-05, + "loss": 0.737, + "step": 1462 + }, + { + "epoch": 0.7802666666666667, + "grad_norm": 0.4195156568509407, + "learning_rate": 2.429146201687538e-05, + "loss": 0.7066, + "step": 1463 + }, + { + "epoch": 0.7808, + "grad_norm": 0.40033997960486356, + "learning_rate": 2.417867893002387e-05, + "loss": 0.6818, + "step": 1464 + }, + { + "epoch": 0.7813333333333333, + "grad_norm": 0.5373976326564892, + "learning_rate": 2.4066122257145894e-05, + "loss": 0.7328, + "step": 1465 + }, + { + "epoch": 0.7818666666666667, + "grad_norm": 0.45754895410055296, + "learning_rate": 2.3953792334352787e-05, + "loss": 0.6853, + "step": 1466 + }, + { + "epoch": 0.7824, + "grad_norm": 0.4575813475049241, + "learning_rate": 2.3841689497078746e-05, + "loss": 0.5971, + "step": 1467 + }, + { + "epoch": 0.7829333333333334, + "grad_norm": 0.5451849398635704, + "learning_rate": 2.3729814080079816e-05, + "loss": 0.7032, + "step": 1468 + }, + { + "epoch": 0.7834666666666666, + "grad_norm": 0.5441774619324102, + "learning_rate": 2.361816641743303e-05, + "loss": 0.7331, + "step": 1469 + }, + { + "epoch": 0.784, + "grad_norm": 0.39524876304033657, + "learning_rate": 2.3506746842535242e-05, + "loss": 0.6186, + "step": 1470 + }, + { + "epoch": 0.7845333333333333, + "grad_norm": 0.41980029309219075, + "learning_rate": 2.339555568810221e-05, + "loss": 0.6764, + "step": 1471 + }, + { + "epoch": 0.7850666666666667, + "grad_norm": 0.4290035490569862, + "learning_rate": 2.328459328616759e-05, + "loss": 0.7402, + "step": 1472 + }, + { + "epoch": 0.7856, + "grad_norm": 0.4344070777605754, + "learning_rate": 2.3173859968081944e-05, + "loss": 0.6437, + "step": 1473 + }, + { + "epoch": 0.7861333333333334, + "grad_norm": 0.41828191602597053, + "learning_rate": 2.306335606451181e-05, + "loss": 0.6458, + "step": 1474 + }, + { + "epoch": 0.7866666666666666, + "grad_norm": 0.5392793424158163, + "learning_rate": 2.295308190543859e-05, + "loss": 0.7386, + "step": 1475 + }, + { + "epoch": 0.7872, + "grad_norm": 0.4887441003063013, + "learning_rate": 2.2843037820157675e-05, + "loss": 0.6862, + "step": 1476 + }, + { + "epoch": 0.7877333333333333, + "grad_norm": 0.6256757383610962, + "learning_rate": 2.2733224137277366e-05, + "loss": 0.8432, + "step": 1477 + }, + { + "epoch": 0.7882666666666667, + "grad_norm": 0.5464595903115794, + "learning_rate": 2.262364118471805e-05, + "loss": 0.7301, + "step": 1478 + }, + { + "epoch": 0.7888, + "grad_norm": 0.5638335821204267, + "learning_rate": 2.251428928971102e-05, + "loss": 0.7447, + "step": 1479 + }, + { + "epoch": 0.7893333333333333, + "grad_norm": 0.5525261367409167, + "learning_rate": 2.2405168778797646e-05, + "loss": 0.8391, + "step": 1480 + }, + { + "epoch": 0.7898666666666667, + "grad_norm": 0.4953289966782963, + "learning_rate": 2.2296279977828337e-05, + "loss": 0.6948, + "step": 1481 + }, + { + "epoch": 0.7904, + "grad_norm": 0.45134057631510055, + "learning_rate": 2.2187623211961562e-05, + "loss": 0.6883, + "step": 1482 + }, + { + "epoch": 0.7909333333333334, + "grad_norm": 0.42700165725759, + "learning_rate": 2.2079198805662914e-05, + "loss": 0.6537, + "step": 1483 + }, + { + "epoch": 0.7914666666666667, + "grad_norm": 0.49804296009133825, + "learning_rate": 2.1971007082704164e-05, + "loss": 0.7491, + "step": 1484 + }, + { + "epoch": 0.792, + "grad_norm": 0.4871235507358856, + "learning_rate": 2.1863048366162208e-05, + "loss": 0.6385, + "step": 1485 + }, + { + "epoch": 0.7925333333333333, + "grad_norm": 0.5887939524755246, + "learning_rate": 2.1755322978418137e-05, + "loss": 0.8623, + "step": 1486 + }, + { + "epoch": 0.7930666666666667, + "grad_norm": 0.5463547126142305, + "learning_rate": 2.1647831241156302e-05, + "loss": 0.7247, + "step": 1487 + }, + { + "epoch": 0.7936, + "grad_norm": 0.4681078387739679, + "learning_rate": 2.1540573475363402e-05, + "loss": 0.7115, + "step": 1488 + }, + { + "epoch": 0.7941333333333334, + "grad_norm": 0.5399798111944335, + "learning_rate": 2.1433550001327373e-05, + "loss": 0.6925, + "step": 1489 + }, + { + "epoch": 0.7946666666666666, + "grad_norm": 0.5538299714024002, + "learning_rate": 2.1326761138636553e-05, + "loss": 0.7917, + "step": 1490 + }, + { + "epoch": 0.7952, + "grad_norm": 0.4655161305883387, + "learning_rate": 2.1220207206178688e-05, + "loss": 0.7605, + "step": 1491 + }, + { + "epoch": 0.7957333333333333, + "grad_norm": 0.46046084755303923, + "learning_rate": 2.111388852214001e-05, + "loss": 0.7305, + "step": 1492 + }, + { + "epoch": 0.7962666666666667, + "grad_norm": 0.6432362937427276, + "learning_rate": 2.1007805404004242e-05, + "loss": 0.7866, + "step": 1493 + }, + { + "epoch": 0.7968, + "grad_norm": 0.5233846063949636, + "learning_rate": 2.0901958168551638e-05, + "loss": 0.7926, + "step": 1494 + }, + { + "epoch": 0.7973333333333333, + "grad_norm": 0.5790472384084447, + "learning_rate": 2.0796347131858186e-05, + "loss": 0.675, + "step": 1495 + }, + { + "epoch": 0.7978666666666666, + "grad_norm": 0.4714697559149237, + "learning_rate": 2.069097260929439e-05, + "loss": 0.7253, + "step": 1496 + }, + { + "epoch": 0.7984, + "grad_norm": 0.4270576465559357, + "learning_rate": 2.058583491552465e-05, + "loss": 0.6784, + "step": 1497 + }, + { + "epoch": 0.7989333333333334, + "grad_norm": 0.5188902875539768, + "learning_rate": 2.048093436450603e-05, + "loss": 0.7824, + "step": 1498 + }, + { + "epoch": 0.7994666666666667, + "grad_norm": 0.6367857125346924, + "learning_rate": 2.0376271269487514e-05, + "loss": 0.8625, + "step": 1499 + }, + { + "epoch": 0.8, + "grad_norm": 0.4906668326163334, + "learning_rate": 2.027184594300898e-05, + "loss": 0.7062, + "step": 1500 + }, + { + "epoch": 0.8005333333333333, + "grad_norm": 0.5277813993514586, + "learning_rate": 2.0167658696900317e-05, + "loss": 0.8342, + "step": 1501 + }, + { + "epoch": 0.8010666666666667, + "grad_norm": 0.47983067644242744, + "learning_rate": 2.0063709842280432e-05, + "loss": 0.7006, + "step": 1502 + }, + { + "epoch": 0.8016, + "grad_norm": 0.43792461864947296, + "learning_rate": 1.995999968955641e-05, + "loss": 0.7622, + "step": 1503 + }, + { + "epoch": 0.8021333333333334, + "grad_norm": 0.4916156036073879, + "learning_rate": 1.985652854842247e-05, + "loss": 0.7127, + "step": 1504 + }, + { + "epoch": 0.8026666666666666, + "grad_norm": 0.5768343416347183, + "learning_rate": 1.9753296727859195e-05, + "loss": 0.7781, + "step": 1505 + }, + { + "epoch": 0.8032, + "grad_norm": 0.4569625310327317, + "learning_rate": 1.9650304536132426e-05, + "loss": 0.6978, + "step": 1506 + }, + { + "epoch": 0.8037333333333333, + "grad_norm": 0.4554160260672957, + "learning_rate": 1.9547552280792524e-05, + "loss": 0.6804, + "step": 1507 + }, + { + "epoch": 0.8042666666666667, + "grad_norm": 0.39705504892736143, + "learning_rate": 1.9445040268673298e-05, + "loss": 0.6464, + "step": 1508 + }, + { + "epoch": 0.8048, + "grad_norm": 0.5352078742401196, + "learning_rate": 1.9342768805891178e-05, + "loss": 0.6905, + "step": 1509 + }, + { + "epoch": 0.8053333333333333, + "grad_norm": 0.5554651283839609, + "learning_rate": 1.9240738197844278e-05, + "loss": 0.7253, + "step": 1510 + }, + { + "epoch": 0.8058666666666666, + "grad_norm": 0.45444727275032104, + "learning_rate": 1.9138948749211472e-05, + "loss": 0.7549, + "step": 1511 + }, + { + "epoch": 0.8064, + "grad_norm": 0.49208812313330785, + "learning_rate": 1.903740076395151e-05, + "loss": 0.7315, + "step": 1512 + }, + { + "epoch": 0.8069333333333333, + "grad_norm": 0.4950480797316589, + "learning_rate": 1.8936094545302095e-05, + "loss": 0.7494, + "step": 1513 + }, + { + "epoch": 0.8074666666666667, + "grad_norm": 0.5498653769016123, + "learning_rate": 1.883503039577894e-05, + "loss": 0.7609, + "step": 1514 + }, + { + "epoch": 0.808, + "grad_norm": 0.49438895303259683, + "learning_rate": 1.8734208617174988e-05, + "loss": 0.7298, + "step": 1515 + }, + { + "epoch": 0.8085333333333333, + "grad_norm": 0.3536975783106019, + "learning_rate": 1.8633629510559314e-05, + "loss": 0.6478, + "step": 1516 + }, + { + "epoch": 0.8090666666666667, + "grad_norm": 0.603891234009926, + "learning_rate": 1.8533293376276472e-05, + "loss": 0.7822, + "step": 1517 + }, + { + "epoch": 0.8096, + "grad_norm": 0.456745084908359, + "learning_rate": 1.8433200513945337e-05, + "loss": 0.7652, + "step": 1518 + }, + { + "epoch": 0.8101333333333334, + "grad_norm": 0.4775098878163809, + "learning_rate": 1.8333351222458407e-05, + "loss": 0.7414, + "step": 1519 + }, + { + "epoch": 0.8106666666666666, + "grad_norm": 0.5057797701774622, + "learning_rate": 1.8233745799980817e-05, + "loss": 0.7019, + "step": 1520 + }, + { + "epoch": 0.8112, + "grad_norm": 0.4220879754209838, + "learning_rate": 1.8134384543949478e-05, + "loss": 0.6576, + "step": 1521 + }, + { + "epoch": 0.8117333333333333, + "grad_norm": 0.5444032294164403, + "learning_rate": 1.803526775107217e-05, + "loss": 0.8257, + "step": 1522 + }, + { + "epoch": 0.8122666666666667, + "grad_norm": 0.481955756353624, + "learning_rate": 1.7936395717326704e-05, + "loss": 0.7183, + "step": 1523 + }, + { + "epoch": 0.8128, + "grad_norm": 0.4362237396864434, + "learning_rate": 1.783776873795994e-05, + "loss": 0.7494, + "step": 1524 + }, + { + "epoch": 0.8133333333333334, + "grad_norm": 0.5170191679170411, + "learning_rate": 1.773938710748706e-05, + "loss": 0.7928, + "step": 1525 + }, + { + "epoch": 0.8138666666666666, + "grad_norm": 0.5080971785760765, + "learning_rate": 1.7641251119690505e-05, + "loss": 0.6409, + "step": 1526 + }, + { + "epoch": 0.8144, + "grad_norm": 0.5161371330680553, + "learning_rate": 1.754336106761927e-05, + "loss": 0.7773, + "step": 1527 + }, + { + "epoch": 0.8149333333333333, + "grad_norm": 0.5304994174906112, + "learning_rate": 1.744571724358789e-05, + "loss": 0.7557, + "step": 1528 + }, + { + "epoch": 0.8154666666666667, + "grad_norm": 0.4454709693189586, + "learning_rate": 1.7348319939175637e-05, + "loss": 0.7259, + "step": 1529 + }, + { + "epoch": 0.816, + "grad_norm": 0.49547457346470486, + "learning_rate": 1.7251169445225657e-05, + "loss": 0.7042, + "step": 1530 + }, + { + "epoch": 0.8165333333333333, + "grad_norm": 0.3729345692266381, + "learning_rate": 1.715426605184407e-05, + "loss": 0.6745, + "step": 1531 + }, + { + "epoch": 0.8170666666666667, + "grad_norm": 0.50135253803681, + "learning_rate": 1.705761004839911e-05, + "loss": 0.7581, + "step": 1532 + }, + { + "epoch": 0.8176, + "grad_norm": 0.6832387692102796, + "learning_rate": 1.696120172352025e-05, + "loss": 0.7099, + "step": 1533 + }, + { + "epoch": 0.8181333333333334, + "grad_norm": 0.44670629381081794, + "learning_rate": 1.6865041365097435e-05, + "loss": 0.7218, + "step": 1534 + }, + { + "epoch": 0.8186666666666667, + "grad_norm": 0.5226987026394574, + "learning_rate": 1.676912926028007e-05, + "loss": 0.8117, + "step": 1535 + }, + { + "epoch": 0.8192, + "grad_norm": 0.4598291624037757, + "learning_rate": 1.6673465695476232e-05, + "loss": 0.6637, + "step": 1536 + }, + { + "epoch": 0.8197333333333333, + "grad_norm": 0.4829505048181646, + "learning_rate": 1.6578050956351886e-05, + "loss": 0.7263, + "step": 1537 + }, + { + "epoch": 0.8202666666666667, + "grad_norm": 0.48790533521950497, + "learning_rate": 1.6482885327829913e-05, + "loss": 0.7045, + "step": 1538 + }, + { + "epoch": 0.8208, + "grad_norm": 0.413581741770677, + "learning_rate": 1.6387969094089316e-05, + "loss": 0.6852, + "step": 1539 + }, + { + "epoch": 0.8213333333333334, + "grad_norm": 0.4483225785193507, + "learning_rate": 1.6293302538564382e-05, + "loss": 0.6595, + "step": 1540 + }, + { + "epoch": 0.8218666666666666, + "grad_norm": 0.5535810560099256, + "learning_rate": 1.619888594394382e-05, + "loss": 0.7033, + "step": 1541 + }, + { + "epoch": 0.8224, + "grad_norm": 0.6466696989559793, + "learning_rate": 1.6104719592169902e-05, + "loss": 0.832, + "step": 1542 + }, + { + "epoch": 0.8229333333333333, + "grad_norm": 0.46938949706307914, + "learning_rate": 1.601080376443763e-05, + "loss": 0.605, + "step": 1543 + }, + { + "epoch": 0.8234666666666667, + "grad_norm": 0.5142034808023143, + "learning_rate": 1.5917138741193973e-05, + "loss": 0.6822, + "step": 1544 + }, + { + "epoch": 0.824, + "grad_norm": 0.5907942946614096, + "learning_rate": 1.5823724802136865e-05, + "loss": 0.8282, + "step": 1545 + }, + { + "epoch": 0.8245333333333333, + "grad_norm": 0.498830697777462, + "learning_rate": 1.573056222621453e-05, + "loss": 0.7772, + "step": 1546 + }, + { + "epoch": 0.8250666666666666, + "grad_norm": 0.5974015247062966, + "learning_rate": 1.5637651291624523e-05, + "loss": 0.7316, + "step": 1547 + }, + { + "epoch": 0.8256, + "grad_norm": 0.5259485131736402, + "learning_rate": 1.5544992275813053e-05, + "loss": 0.7492, + "step": 1548 + }, + { + "epoch": 0.8261333333333334, + "grad_norm": 0.5107853708990638, + "learning_rate": 1.5452585455473977e-05, + "loss": 0.7255, + "step": 1549 + }, + { + "epoch": 0.8266666666666667, + "grad_norm": 0.5795296746970275, + "learning_rate": 1.536043110654809e-05, + "loss": 0.7327, + "step": 1550 + }, + { + "epoch": 0.8272, + "grad_norm": 0.48555881447046184, + "learning_rate": 1.526852950422226e-05, + "loss": 0.8639, + "step": 1551 + }, + { + "epoch": 0.8277333333333333, + "grad_norm": 0.4531193486352657, + "learning_rate": 1.5176880922928616e-05, + "loss": 0.7119, + "step": 1552 + }, + { + "epoch": 0.8282666666666667, + "grad_norm": 0.513527134320427, + "learning_rate": 1.5085485636343755e-05, + "loss": 0.7414, + "step": 1553 + }, + { + "epoch": 0.8288, + "grad_norm": 0.5954943410632516, + "learning_rate": 1.4994343917387854e-05, + "loss": 0.7679, + "step": 1554 + }, + { + "epoch": 0.8293333333333334, + "grad_norm": 0.5613384242425445, + "learning_rate": 1.4903456038223939e-05, + "loss": 0.7713, + "step": 1555 + }, + { + "epoch": 0.8298666666666666, + "grad_norm": 0.4444495696459105, + "learning_rate": 1.4812822270257009e-05, + "loss": 0.7333, + "step": 1556 + }, + { + "epoch": 0.8304, + "grad_norm": 0.6737660720946029, + "learning_rate": 1.4722442884133214e-05, + "loss": 0.7759, + "step": 1557 + }, + { + "epoch": 0.8309333333333333, + "grad_norm": 0.5986316877783368, + "learning_rate": 1.4632318149739177e-05, + "loss": 0.7094, + "step": 1558 + }, + { + "epoch": 0.8314666666666667, + "grad_norm": 0.3811464696827495, + "learning_rate": 1.454244833620102e-05, + "loss": 0.6575, + "step": 1559 + }, + { + "epoch": 0.832, + "grad_norm": 0.38979442506119516, + "learning_rate": 1.4452833711883628e-05, + "loss": 0.6019, + "step": 1560 + }, + { + "epoch": 0.8325333333333333, + "grad_norm": 0.5112200475446241, + "learning_rate": 1.4363474544389877e-05, + "loss": 0.6747, + "step": 1561 + }, + { + "epoch": 0.8330666666666666, + "grad_norm": 0.4925816723698563, + "learning_rate": 1.4274371100559791e-05, + "loss": 0.7111, + "step": 1562 + }, + { + "epoch": 0.8336, + "grad_norm": 0.5026870895601208, + "learning_rate": 1.4185523646469822e-05, + "loss": 0.7843, + "step": 1563 + }, + { + "epoch": 0.8341333333333333, + "grad_norm": 0.5327465316476543, + "learning_rate": 1.409693244743192e-05, + "loss": 0.794, + "step": 1564 + }, + { + "epoch": 0.8346666666666667, + "grad_norm": 0.5553229160368591, + "learning_rate": 1.4008597767992871e-05, + "loss": 0.702, + "step": 1565 + }, + { + "epoch": 0.8352, + "grad_norm": 0.7168884703670975, + "learning_rate": 1.3920519871933424e-05, + "loss": 0.8406, + "step": 1566 + }, + { + "epoch": 0.8357333333333333, + "grad_norm": 0.609620211368279, + "learning_rate": 1.3832699022267515e-05, + "loss": 0.8081, + "step": 1567 + }, + { + "epoch": 0.8362666666666667, + "grad_norm": 0.5099940552094323, + "learning_rate": 1.37451354812416e-05, + "loss": 0.6834, + "step": 1568 + }, + { + "epoch": 0.8368, + "grad_norm": 0.4236752960093711, + "learning_rate": 1.3657829510333654e-05, + "loss": 0.6256, + "step": 1569 + }, + { + "epoch": 0.8373333333333334, + "grad_norm": 0.6151341652474434, + "learning_rate": 1.3570781370252582e-05, + "loss": 0.8692, + "step": 1570 + }, + { + "epoch": 0.8378666666666666, + "grad_norm": 0.5796419138079855, + "learning_rate": 1.3483991320937306e-05, + "loss": 0.7776, + "step": 1571 + }, + { + "epoch": 0.8384, + "grad_norm": 0.5134552362210623, + "learning_rate": 1.339745962155613e-05, + "loss": 0.712, + "step": 1572 + }, + { + "epoch": 0.8389333333333333, + "grad_norm": 0.49380865507584276, + "learning_rate": 1.3311186530505838e-05, + "loss": 0.6452, + "step": 1573 + }, + { + "epoch": 0.8394666666666667, + "grad_norm": 0.5081933379437217, + "learning_rate": 1.322517230541096e-05, + "loss": 0.8112, + "step": 1574 + }, + { + "epoch": 0.84, + "grad_norm": 0.5121736787964045, + "learning_rate": 1.3139417203123027e-05, + "loss": 0.7272, + "step": 1575 + }, + { + "epoch": 0.8405333333333334, + "grad_norm": 0.48009551155070473, + "learning_rate": 1.30539214797198e-05, + "loss": 0.661, + "step": 1576 + }, + { + "epoch": 0.8410666666666666, + "grad_norm": 0.48465583201319534, + "learning_rate": 1.2968685390504465e-05, + "loss": 0.6784, + "step": 1577 + }, + { + "epoch": 0.8416, + "grad_norm": 0.685254036551879, + "learning_rate": 1.2883709190004955e-05, + "loss": 0.7132, + "step": 1578 + }, + { + "epoch": 0.8421333333333333, + "grad_norm": 0.44102035361865277, + "learning_rate": 1.2798993131973091e-05, + "loss": 0.7113, + "step": 1579 + }, + { + "epoch": 0.8426666666666667, + "grad_norm": 0.4994805408242732, + "learning_rate": 1.2714537469383858e-05, + "loss": 0.711, + "step": 1580 + }, + { + "epoch": 0.8432, + "grad_norm": 0.5602382620403693, + "learning_rate": 1.263034245443473e-05, + "loss": 0.8379, + "step": 1581 + }, + { + "epoch": 0.8437333333333333, + "grad_norm": 0.5671929880193168, + "learning_rate": 1.2546408338544769e-05, + "loss": 0.7182, + "step": 1582 + }, + { + "epoch": 0.8442666666666667, + "grad_norm": 0.4630358554367751, + "learning_rate": 1.2462735372353996e-05, + "loss": 0.6504, + "step": 1583 + }, + { + "epoch": 0.8448, + "grad_norm": 0.5289835274851352, + "learning_rate": 1.2379323805722576e-05, + "loss": 0.7377, + "step": 1584 + }, + { + "epoch": 0.8453333333333334, + "grad_norm": 0.47775023907270436, + "learning_rate": 1.2296173887730123e-05, + "loss": 0.7138, + "step": 1585 + }, + { + "epoch": 0.8458666666666667, + "grad_norm": 0.5515531883262018, + "learning_rate": 1.2213285866674905e-05, + "loss": 0.6868, + "step": 1586 + }, + { + "epoch": 0.8464, + "grad_norm": 0.44824653933517705, + "learning_rate": 1.2130659990073146e-05, + "loss": 0.6341, + "step": 1587 + }, + { + "epoch": 0.8469333333333333, + "grad_norm": 0.41486994233454416, + "learning_rate": 1.2048296504658207e-05, + "loss": 0.634, + "step": 1588 + }, + { + "epoch": 0.8474666666666667, + "grad_norm": 0.4545108148817928, + "learning_rate": 1.1966195656380031e-05, + "loss": 0.6034, + "step": 1589 + }, + { + "epoch": 0.848, + "grad_norm": 0.4489533834839864, + "learning_rate": 1.1884357690404158e-05, + "loss": 0.5554, + "step": 1590 + }, + { + "epoch": 0.8485333333333334, + "grad_norm": 0.5405287066083088, + "learning_rate": 1.1802782851111205e-05, + "loss": 0.7041, + "step": 1591 + }, + { + "epoch": 0.8490666666666666, + "grad_norm": 0.4972187285317307, + "learning_rate": 1.1721471382096027e-05, + "loss": 0.7534, + "step": 1592 + }, + { + "epoch": 0.8496, + "grad_norm": 0.5154262619152171, + "learning_rate": 1.1640423526166988e-05, + "loss": 0.6712, + "step": 1593 + }, + { + "epoch": 0.8501333333333333, + "grad_norm": 0.3663032585790623, + "learning_rate": 1.1559639525345311e-05, + "loss": 0.5763, + "step": 1594 + }, + { + "epoch": 0.8506666666666667, + "grad_norm": 0.7023035553436592, + "learning_rate": 1.1479119620864276e-05, + "loss": 0.7126, + "step": 1595 + }, + { + "epoch": 0.8512, + "grad_norm": 0.457900813666415, + "learning_rate": 1.1398864053168534e-05, + "loss": 0.7207, + "step": 1596 + }, + { + "epoch": 0.8517333333333333, + "grad_norm": 0.5554462953531683, + "learning_rate": 1.1318873061913405e-05, + "loss": 0.8523, + "step": 1597 + }, + { + "epoch": 0.8522666666666666, + "grad_norm": 0.5480430788653078, + "learning_rate": 1.123914688596409e-05, + "loss": 0.7748, + "step": 1598 + }, + { + "epoch": 0.8528, + "grad_norm": 0.5988734876106011, + "learning_rate": 1.1159685763395111e-05, + "loss": 0.7738, + "step": 1599 + }, + { + "epoch": 0.8533333333333334, + "grad_norm": 0.45939067209244255, + "learning_rate": 1.1080489931489391e-05, + "loss": 0.6841, + "step": 1600 + }, + { + "epoch": 0.8538666666666667, + "grad_norm": 0.6290835230461248, + "learning_rate": 1.1001559626737756e-05, + "loss": 0.8871, + "step": 1601 + }, + { + "epoch": 0.8544, + "grad_norm": 0.5737171101704664, + "learning_rate": 1.0922895084838037e-05, + "loss": 0.7974, + "step": 1602 + }, + { + "epoch": 0.8549333333333333, + "grad_norm": 0.5603477018432383, + "learning_rate": 1.0844496540694515e-05, + "loss": 0.6972, + "step": 1603 + }, + { + "epoch": 0.8554666666666667, + "grad_norm": 0.4958360653147047, + "learning_rate": 1.0766364228417148e-05, + "loss": 0.7337, + "step": 1604 + }, + { + "epoch": 0.856, + "grad_norm": 0.4948548812131531, + "learning_rate": 1.0688498381320855e-05, + "loss": 0.6741, + "step": 1605 + }, + { + "epoch": 0.8565333333333334, + "grad_norm": 0.559838660552377, + "learning_rate": 1.0610899231924886e-05, + "loss": 0.7608, + "step": 1606 + }, + { + "epoch": 0.8570666666666666, + "grad_norm": 0.40902355723274353, + "learning_rate": 1.0533567011952094e-05, + "loss": 0.699, + "step": 1607 + }, + { + "epoch": 0.8576, + "grad_norm": 0.504641703128184, + "learning_rate": 1.045650195232819e-05, + "loss": 0.6953, + "step": 1608 + }, + { + "epoch": 0.8581333333333333, + "grad_norm": 0.5149518116555113, + "learning_rate": 1.0379704283181179e-05, + "loss": 0.7222, + "step": 1609 + }, + { + "epoch": 0.8586666666666667, + "grad_norm": 0.46055043846863936, + "learning_rate": 1.0303174233840528e-05, + "loss": 0.7279, + "step": 1610 + }, + { + "epoch": 0.8592, + "grad_norm": 0.6861137346352174, + "learning_rate": 1.0226912032836611e-05, + "loss": 0.9197, + "step": 1611 + }, + { + "epoch": 0.8597333333333333, + "grad_norm": 0.6202906634173873, + "learning_rate": 1.0150917907899926e-05, + "loss": 0.9064, + "step": 1612 + }, + { + "epoch": 0.8602666666666666, + "grad_norm": 0.6052956382369696, + "learning_rate": 1.007519208596045e-05, + "loss": 0.816, + "step": 1613 + }, + { + "epoch": 0.8608, + "grad_norm": 0.5092774321360481, + "learning_rate": 9.999734793146998e-06, + "loss": 0.7326, + "step": 1614 + }, + { + "epoch": 0.8613333333333333, + "grad_norm": 0.55870392357648, + "learning_rate": 9.924546254786493e-06, + "loss": 0.7598, + "step": 1615 + }, + { + "epoch": 0.8618666666666667, + "grad_norm": 1.0762146408116535, + "learning_rate": 9.849626695403324e-06, + "loss": 0.6783, + "step": 1616 + }, + { + "epoch": 0.8624, + "grad_norm": 0.4095981231341868, + "learning_rate": 9.774976338718677e-06, + "loss": 0.6808, + "step": 1617 + }, + { + "epoch": 0.8629333333333333, + "grad_norm": 0.44874326314271656, + "learning_rate": 9.700595407649805e-06, + "loss": 0.6651, + "step": 1618 + }, + { + "epoch": 0.8634666666666667, + "grad_norm": 0.5942177582265716, + "learning_rate": 9.62648412430951e-06, + "loss": 0.6917, + "step": 1619 + }, + { + "epoch": 0.864, + "grad_norm": 0.5073765704372167, + "learning_rate": 9.552642710005299e-06, + "loss": 0.7164, + "step": 1620 + }, + { + "epoch": 0.8645333333333334, + "grad_norm": 0.3834286436527206, + "learning_rate": 9.479071385238892e-06, + "loss": 0.6655, + "step": 1621 + }, + { + "epoch": 0.8650666666666667, + "grad_norm": 0.4142314259895984, + "learning_rate": 9.40577036970538e-06, + "loss": 0.6932, + "step": 1622 + }, + { + "epoch": 0.8656, + "grad_norm": 0.4614074512356123, + "learning_rate": 9.332739882292752e-06, + "loss": 0.7713, + "step": 1623 + }, + { + "epoch": 0.8661333333333333, + "grad_norm": 0.4282735890102852, + "learning_rate": 9.259980141081115e-06, + "loss": 0.6883, + "step": 1624 + }, + { + "epoch": 0.8666666666666667, + "grad_norm": 0.45682689964176815, + "learning_rate": 9.187491363342093e-06, + "loss": 0.7074, + "step": 1625 + }, + { + "epoch": 0.8672, + "grad_norm": 0.49523647877393767, + "learning_rate": 9.115273765538202e-06, + "loss": 0.6899, + "step": 1626 + }, + { + "epoch": 0.8677333333333334, + "grad_norm": 0.4242852673300794, + "learning_rate": 9.043327563322112e-06, + "loss": 0.6641, + "step": 1627 + }, + { + "epoch": 0.8682666666666666, + "grad_norm": 0.4348634746466489, + "learning_rate": 8.971652971536148e-06, + "loss": 0.631, + "step": 1628 + }, + { + "epoch": 0.8688, + "grad_norm": 0.47104771509485044, + "learning_rate": 8.900250204211514e-06, + "loss": 0.6354, + "step": 1629 + }, + { + "epoch": 0.8693333333333333, + "grad_norm": 0.6595089288899116, + "learning_rate": 8.829119474567671e-06, + "loss": 0.7565, + "step": 1630 + }, + { + "epoch": 0.8698666666666667, + "grad_norm": 0.45446723321640703, + "learning_rate": 8.758260995011825e-06, + "loss": 0.7437, + "step": 1631 + }, + { + "epoch": 0.8704, + "grad_norm": 0.39749840642328826, + "learning_rate": 8.687674977138116e-06, + "loss": 0.652, + "step": 1632 + }, + { + "epoch": 0.8709333333333333, + "grad_norm": 0.418676467578698, + "learning_rate": 8.617361631727138e-06, + "loss": 0.6729, + "step": 1633 + }, + { + "epoch": 0.8714666666666666, + "grad_norm": 0.4167017082623031, + "learning_rate": 8.547321168745193e-06, + "loss": 0.6532, + "step": 1634 + }, + { + "epoch": 0.872, + "grad_norm": 0.5261322915065519, + "learning_rate": 8.47755379734373e-06, + "loss": 0.6475, + "step": 1635 + }, + { + "epoch": 0.8725333333333334, + "grad_norm": 0.47291652632472153, + "learning_rate": 8.408059725858719e-06, + "loss": 0.6616, + "step": 1636 + }, + { + "epoch": 0.8730666666666667, + "grad_norm": 0.4231944470460595, + "learning_rate": 8.338839161809997e-06, + "loss": 0.7635, + "step": 1637 + }, + { + "epoch": 0.8736, + "grad_norm": 0.49601636170391883, + "learning_rate": 8.269892311900696e-06, + "loss": 0.652, + "step": 1638 + }, + { + "epoch": 0.8741333333333333, + "grad_norm": 0.4860629505490482, + "learning_rate": 8.201219382016556e-06, + "loss": 0.6797, + "step": 1639 + }, + { + "epoch": 0.8746666666666667, + "grad_norm": 0.537269565059725, + "learning_rate": 8.132820577225387e-06, + "loss": 0.7567, + "step": 1640 + }, + { + "epoch": 0.8752, + "grad_norm": 0.5361527148663102, + "learning_rate": 8.064696101776358e-06, + "loss": 0.7547, + "step": 1641 + }, + { + "epoch": 0.8757333333333334, + "grad_norm": 0.40056536356318667, + "learning_rate": 7.996846159099557e-06, + "loss": 0.6737, + "step": 1642 + }, + { + "epoch": 0.8762666666666666, + "grad_norm": 0.8346711580260499, + "learning_rate": 7.929270951805178e-06, + "loss": 0.7156, + "step": 1643 + }, + { + "epoch": 0.8768, + "grad_norm": 0.6127306118234916, + "learning_rate": 7.861970681683051e-06, + "loss": 0.8728, + "step": 1644 + }, + { + "epoch": 0.8773333333333333, + "grad_norm": 0.5128203754782934, + "learning_rate": 7.794945549701993e-06, + "loss": 0.6896, + "step": 1645 + }, + { + "epoch": 0.8778666666666667, + "grad_norm": 0.4633092483624061, + "learning_rate": 7.728195756009204e-06, + "loss": 0.7922, + "step": 1646 + }, + { + "epoch": 0.8784, + "grad_norm": 0.45462026648594467, + "learning_rate": 7.661721499929753e-06, + "loss": 0.6976, + "step": 1647 + }, + { + "epoch": 0.8789333333333333, + "grad_norm": 0.42808648619396766, + "learning_rate": 7.595522979965819e-06, + "loss": 0.677, + "step": 1648 + }, + { + "epoch": 0.8794666666666666, + "grad_norm": 0.4651930398760335, + "learning_rate": 7.529600393796232e-06, + "loss": 0.6459, + "step": 1649 + }, + { + "epoch": 0.88, + "grad_norm": 0.5187080741739207, + "learning_rate": 7.463953938275858e-06, + "loss": 0.6837, + "step": 1650 + }, + { + "epoch": 0.8805333333333333, + "grad_norm": 0.49920522537938794, + "learning_rate": 7.3985838094349444e-06, + "loss": 0.6918, + "step": 1651 + }, + { + "epoch": 0.8810666666666667, + "grad_norm": 0.4287234489018753, + "learning_rate": 7.333490202478666e-06, + "loss": 0.6631, + "step": 1652 + }, + { + "epoch": 0.8816, + "grad_norm": 0.5796082685254073, + "learning_rate": 7.2686733117863784e-06, + "loss": 0.7036, + "step": 1653 + }, + { + "epoch": 0.8821333333333333, + "grad_norm": 0.41699761236237465, + "learning_rate": 7.204133330911178e-06, + "loss": 0.658, + "step": 1654 + }, + { + "epoch": 0.8826666666666667, + "grad_norm": 0.40963255834753687, + "learning_rate": 7.1398704525792e-06, + "loss": 0.618, + "step": 1655 + }, + { + "epoch": 0.8832, + "grad_norm": 0.4266669253277369, + "learning_rate": 7.07588486868922e-06, + "loss": 0.6551, + "step": 1656 + }, + { + "epoch": 0.8837333333333334, + "grad_norm": 0.500144255977364, + "learning_rate": 7.012176770311862e-06, + "loss": 0.622, + "step": 1657 + }, + { + "epoch": 0.8842666666666666, + "grad_norm": 0.6423284317172347, + "learning_rate": 6.948746347689183e-06, + "loss": 0.8343, + "step": 1658 + }, + { + "epoch": 0.8848, + "grad_norm": 0.4785552023213915, + "learning_rate": 6.8855937902340576e-06, + "loss": 0.6753, + "step": 1659 + }, + { + "epoch": 0.8853333333333333, + "grad_norm": 0.5236688032524741, + "learning_rate": 6.8227192865295995e-06, + "loss": 0.7347, + "step": 1660 + }, + { + "epoch": 0.8858666666666667, + "grad_norm": 0.4491627730242979, + "learning_rate": 6.760123024328624e-06, + "loss": 0.679, + "step": 1661 + }, + { + "epoch": 0.8864, + "grad_norm": 0.5533506587760416, + "learning_rate": 6.6978051905530855e-06, + "loss": 0.8003, + "step": 1662 + }, + { + "epoch": 0.8869333333333334, + "grad_norm": 0.5595371188782717, + "learning_rate": 6.635765971293484e-06, + "loss": 0.7421, + "step": 1663 + }, + { + "epoch": 0.8874666666666666, + "grad_norm": 0.4560730862550497, + "learning_rate": 6.5740055518083375e-06, + "loss": 0.7176, + "step": 1664 + }, + { + "epoch": 0.888, + "grad_norm": 0.44371075720210884, + "learning_rate": 6.512524116523633e-06, + "loss": 0.7023, + "step": 1665 + }, + { + "epoch": 0.8885333333333333, + "grad_norm": 0.5542076134403445, + "learning_rate": 6.451321849032288e-06, + "loss": 0.7239, + "step": 1666 + }, + { + "epoch": 0.8890666666666667, + "grad_norm": 0.48346860118408197, + "learning_rate": 6.390398932093555e-06, + "loss": 0.7135, + "step": 1667 + }, + { + "epoch": 0.8896, + "grad_norm": 0.6524116192763804, + "learning_rate": 6.329755547632499e-06, + "loss": 0.7735, + "step": 1668 + }, + { + "epoch": 0.8901333333333333, + "grad_norm": 0.45036943940470364, + "learning_rate": 6.269391876739495e-06, + "loss": 0.7418, + "step": 1669 + }, + { + "epoch": 0.8906666666666667, + "grad_norm": 0.8386021628329972, + "learning_rate": 6.209308099669597e-06, + "loss": 0.6556, + "step": 1670 + }, + { + "epoch": 0.8912, + "grad_norm": 0.5626439893049194, + "learning_rate": 6.149504395842087e-06, + "loss": 0.748, + "step": 1671 + }, + { + "epoch": 0.8917333333333334, + "grad_norm": 0.5415177761468162, + "learning_rate": 6.089980943839924e-06, + "loss": 0.8099, + "step": 1672 + }, + { + "epoch": 0.8922666666666667, + "grad_norm": 0.5112037404168135, + "learning_rate": 6.030737921409169e-06, + "loss": 0.7068, + "step": 1673 + }, + { + "epoch": 0.8928, + "grad_norm": 0.6180075417601816, + "learning_rate": 5.971775505458444e-06, + "loss": 0.8498, + "step": 1674 + }, + { + "epoch": 0.8933333333333333, + "grad_norm": 0.5179797392869975, + "learning_rate": 5.913093872058528e-06, + "loss": 0.7522, + "step": 1675 + }, + { + "epoch": 0.8938666666666667, + "grad_norm": 0.4459293336057215, + "learning_rate": 5.854693196441641e-06, + "loss": 0.7184, + "step": 1676 + }, + { + "epoch": 0.8944, + "grad_norm": 0.5235429666240141, + "learning_rate": 5.7965736530010916e-06, + "loss": 0.7367, + "step": 1677 + }, + { + "epoch": 0.8949333333333334, + "grad_norm": 0.44471026541607844, + "learning_rate": 5.738735415290642e-06, + "loss": 0.752, + "step": 1678 + }, + { + "epoch": 0.8954666666666666, + "grad_norm": 0.601396377057578, + "learning_rate": 5.681178656024055e-06, + "loss": 0.7916, + "step": 1679 + }, + { + "epoch": 0.896, + "grad_norm": 0.5121445197696045, + "learning_rate": 5.623903547074549e-06, + "loss": 0.6208, + "step": 1680 + }, + { + "epoch": 0.8965333333333333, + "grad_norm": 0.4603280036865616, + "learning_rate": 5.566910259474289e-06, + "loss": 0.6457, + "step": 1681 + }, + { + "epoch": 0.8970666666666667, + "grad_norm": 0.47236374997848657, + "learning_rate": 5.510198963413881e-06, + "loss": 0.7362, + "step": 1682 + }, + { + "epoch": 0.8976, + "grad_norm": 0.42287209768869616, + "learning_rate": 5.453769828241872e-06, + "loss": 0.7348, + "step": 1683 + }, + { + "epoch": 0.8981333333333333, + "grad_norm": 0.42712644073449385, + "learning_rate": 5.397623022464226e-06, + "loss": 0.7161, + "step": 1684 + }, + { + "epoch": 0.8986666666666666, + "grad_norm": 0.48340298121129716, + "learning_rate": 5.341758713743828e-06, + "loss": 0.6387, + "step": 1685 + }, + { + "epoch": 0.8992, + "grad_norm": 0.5658462960948197, + "learning_rate": 5.286177068899989e-06, + "loss": 0.6593, + "step": 1686 + }, + { + "epoch": 0.8997333333333334, + "grad_norm": 0.424084893434337, + "learning_rate": 5.230878253907912e-06, + "loss": 0.6479, + "step": 1687 + }, + { + "epoch": 0.9002666666666667, + "grad_norm": 0.5398925397600337, + "learning_rate": 5.175862433898282e-06, + "loss": 0.5938, + "step": 1688 + }, + { + "epoch": 0.9008, + "grad_norm": 0.606679189727865, + "learning_rate": 5.121129773156663e-06, + "loss": 0.7922, + "step": 1689 + }, + { + "epoch": 0.9013333333333333, + "grad_norm": 0.510985246916621, + "learning_rate": 5.066680435123106e-06, + "loss": 0.7588, + "step": 1690 + }, + { + "epoch": 0.9018666666666667, + "grad_norm": 0.4722608230327989, + "learning_rate": 5.012514582391592e-06, + "loss": 0.6713, + "step": 1691 + }, + { + "epoch": 0.9024, + "grad_norm": 0.42446110331205, + "learning_rate": 4.95863237670956e-06, + "loss": 0.6643, + "step": 1692 + }, + { + "epoch": 0.9029333333333334, + "grad_norm": 0.4163265120072367, + "learning_rate": 4.905033978977491e-06, + "loss": 0.5705, + "step": 1693 + }, + { + "epoch": 0.9034666666666666, + "grad_norm": 0.5475943904187824, + "learning_rate": 4.851719549248301e-06, + "loss": 0.7402, + "step": 1694 + }, + { + "epoch": 0.904, + "grad_norm": 0.5168728639343029, + "learning_rate": 4.798689246727006e-06, + "loss": 0.6617, + "step": 1695 + }, + { + "epoch": 0.9045333333333333, + "grad_norm": 0.5155818274958027, + "learning_rate": 4.745943229770122e-06, + "loss": 0.7394, + "step": 1696 + }, + { + "epoch": 0.9050666666666667, + "grad_norm": 0.6213963608647247, + "learning_rate": 4.693481655885257e-06, + "loss": 0.7629, + "step": 1697 + }, + { + "epoch": 0.9056, + "grad_norm": 0.4450693753993077, + "learning_rate": 4.641304681730641e-06, + "loss": 0.6953, + "step": 1698 + }, + { + "epoch": 0.9061333333333333, + "grad_norm": 0.49150080427279746, + "learning_rate": 4.58941246311464e-06, + "loss": 0.6962, + "step": 1699 + }, + { + "epoch": 0.9066666666666666, + "grad_norm": 0.44378449436598577, + "learning_rate": 4.537805154995278e-06, + "loss": 0.6751, + "step": 1700 + }, + { + "epoch": 0.9072, + "grad_norm": 0.4266567562385669, + "learning_rate": 4.486482911479839e-06, + "loss": 0.7267, + "step": 1701 + }, + { + "epoch": 0.9077333333333333, + "grad_norm": 0.5248025026202432, + "learning_rate": 4.435445885824285e-06, + "loss": 0.813, + "step": 1702 + }, + { + "epoch": 0.9082666666666667, + "grad_norm": 0.5436474839305038, + "learning_rate": 4.384694230432984e-06, + "loss": 0.6394, + "step": 1703 + }, + { + "epoch": 0.9088, + "grad_norm": 0.3907459439703028, + "learning_rate": 4.3342280968580285e-06, + "loss": 0.5963, + "step": 1704 + }, + { + "epoch": 0.9093333333333333, + "grad_norm": 0.47358630291017373, + "learning_rate": 4.2840476357989825e-06, + "loss": 0.7958, + "step": 1705 + }, + { + "epoch": 0.9098666666666667, + "grad_norm": 0.5943288010108304, + "learning_rate": 4.2341529971023255e-06, + "loss": 0.7544, + "step": 1706 + }, + { + "epoch": 0.9104, + "grad_norm": 0.42006006444833216, + "learning_rate": 4.184544329761009e-06, + "loss": 0.6587, + "step": 1707 + }, + { + "epoch": 0.9109333333333334, + "grad_norm": 0.6834227269857183, + "learning_rate": 4.135221781914034e-06, + "loss": 0.7348, + "step": 1708 + }, + { + "epoch": 0.9114666666666666, + "grad_norm": 0.4744275929462346, + "learning_rate": 4.0861855008460405e-06, + "loss": 0.6711, + "step": 1709 + }, + { + "epoch": 0.912, + "grad_norm": 0.514414629181434, + "learning_rate": 4.037435632986786e-06, + "loss": 0.7577, + "step": 1710 + }, + { + "epoch": 0.9125333333333333, + "grad_norm": 0.46627494146408927, + "learning_rate": 3.988972323910778e-06, + "loss": 0.6638, + "step": 1711 + }, + { + "epoch": 0.9130666666666667, + "grad_norm": 0.4953757495199464, + "learning_rate": 3.9407957183368095e-06, + "loss": 0.6732, + "step": 1712 + }, + { + "epoch": 0.9136, + "grad_norm": 0.48769603448556365, + "learning_rate": 3.892905960127546e-06, + "loss": 0.7318, + "step": 1713 + }, + { + "epoch": 0.9141333333333334, + "grad_norm": 0.5257519553325768, + "learning_rate": 3.845303192289074e-06, + "loss": 0.8015, + "step": 1714 + }, + { + "epoch": 0.9146666666666666, + "grad_norm": 0.468044162237833, + "learning_rate": 3.797987556970495e-06, + "loss": 0.7128, + "step": 1715 + }, + { + "epoch": 0.9152, + "grad_norm": 0.5165267345687184, + "learning_rate": 3.750959195463466e-06, + "loss": 0.7722, + "step": 1716 + }, + { + "epoch": 0.9157333333333333, + "grad_norm": 0.5179373494952317, + "learning_rate": 3.7042182482018075e-06, + "loss": 0.7424, + "step": 1717 + }, + { + "epoch": 0.9162666666666667, + "grad_norm": 0.5459932728845321, + "learning_rate": 3.6577648547611033e-06, + "loss": 0.7631, + "step": 1718 + }, + { + "epoch": 0.9168, + "grad_norm": 0.4475477142358694, + "learning_rate": 3.611599153858214e-06, + "loss": 0.6574, + "step": 1719 + }, + { + "epoch": 0.9173333333333333, + "grad_norm": 0.4678213911961665, + "learning_rate": 3.565721283350931e-06, + "loss": 0.6534, + "step": 1720 + }, + { + "epoch": 0.9178666666666667, + "grad_norm": 0.4445865890861479, + "learning_rate": 3.5201313802375456e-06, + "loss": 0.6866, + "step": 1721 + }, + { + "epoch": 0.9184, + "grad_norm": 0.44496802431382726, + "learning_rate": 3.4748295806564356e-06, + "loss": 0.721, + "step": 1722 + }, + { + "epoch": 0.9189333333333334, + "grad_norm": 0.5498354517588652, + "learning_rate": 3.4298160198856568e-06, + "loss": 0.6678, + "step": 1723 + }, + { + "epoch": 0.9194666666666667, + "grad_norm": 0.4781268590749618, + "learning_rate": 3.3850908323424967e-06, + "loss": 0.7688, + "step": 1724 + }, + { + "epoch": 0.92, + "grad_norm": 0.4795911897631534, + "learning_rate": 3.3406541515832003e-06, + "loss": 0.7867, + "step": 1725 + }, + { + "epoch": 0.9205333333333333, + "grad_norm": 0.4157086767533849, + "learning_rate": 3.296506110302422e-06, + "loss": 0.6362, + "step": 1726 + }, + { + "epoch": 0.9210666666666667, + "grad_norm": 0.460482019566879, + "learning_rate": 3.252646840332918e-06, + "loss": 0.8488, + "step": 1727 + }, + { + "epoch": 0.9216, + "grad_norm": 0.4011290702239954, + "learning_rate": 3.209076472645112e-06, + "loss": 0.6229, + "step": 1728 + }, + { + "epoch": 0.9221333333333334, + "grad_norm": 0.5335791859343584, + "learning_rate": 3.1657951373467497e-06, + "loss": 0.6649, + "step": 1729 + }, + { + "epoch": 0.9226666666666666, + "grad_norm": 0.48008138017971996, + "learning_rate": 3.1228029636824475e-06, + "loss": 0.6594, + "step": 1730 + }, + { + "epoch": 0.9232, + "grad_norm": 0.510030532078524, + "learning_rate": 3.0801000800333877e-06, + "loss": 0.7212, + "step": 1731 + }, + { + "epoch": 0.9237333333333333, + "grad_norm": 0.6116496219261657, + "learning_rate": 3.037686613916857e-06, + "loss": 0.6314, + "step": 1732 + }, + { + "epoch": 0.9242666666666667, + "grad_norm": 0.5191573242107503, + "learning_rate": 2.995562691985898e-06, + "loss": 0.7689, + "step": 1733 + }, + { + "epoch": 0.9248, + "grad_norm": 0.48011259367440917, + "learning_rate": 2.9537284400289355e-06, + "loss": 0.6856, + "step": 1734 + }, + { + "epoch": 0.9253333333333333, + "grad_norm": 0.8095906956777874, + "learning_rate": 2.912183982969385e-06, + "loss": 0.946, + "step": 1735 + }, + { + "epoch": 0.9258666666666666, + "grad_norm": 0.6547187796594096, + "learning_rate": 2.8709294448653225e-06, + "loss": 0.7037, + "step": 1736 + }, + { + "epoch": 0.9264, + "grad_norm": 0.4391012158565832, + "learning_rate": 2.8299649489090475e-06, + "loss": 0.6496, + "step": 1737 + }, + { + "epoch": 0.9269333333333334, + "grad_norm": 0.48372715186069265, + "learning_rate": 2.789290617426765e-06, + "loss": 0.7216, + "step": 1738 + }, + { + "epoch": 0.9274666666666667, + "grad_norm": 0.551843828007968, + "learning_rate": 2.748906571878207e-06, + "loss": 0.7497, + "step": 1739 + }, + { + "epoch": 0.928, + "grad_norm": 0.3706985805846255, + "learning_rate": 2.708812932856253e-06, + "loss": 0.6008, + "step": 1740 + }, + { + "epoch": 0.9285333333333333, + "grad_norm": 0.5079646438573806, + "learning_rate": 2.6690098200866098e-06, + "loss": 0.7427, + "step": 1741 + }, + { + "epoch": 0.9290666666666667, + "grad_norm": 0.5202994781101107, + "learning_rate": 2.6294973524274125e-06, + "loss": 0.7175, + "step": 1742 + }, + { + "epoch": 0.9296, + "grad_norm": 0.3951489412205076, + "learning_rate": 2.590275647868867e-06, + "loss": 0.635, + "step": 1743 + }, + { + "epoch": 0.9301333333333334, + "grad_norm": 0.5548245108473696, + "learning_rate": 2.551344823532964e-06, + "loss": 0.7317, + "step": 1744 + }, + { + "epoch": 0.9306666666666666, + "grad_norm": 0.49619550117391037, + "learning_rate": 2.5127049956730207e-06, + "loss": 0.6544, + "step": 1745 + }, + { + "epoch": 0.9312, + "grad_norm": 0.5787425597304093, + "learning_rate": 2.4743562796734622e-06, + "loss": 0.7582, + "step": 1746 + }, + { + "epoch": 0.9317333333333333, + "grad_norm": 0.4821868118943535, + "learning_rate": 2.436298790049363e-06, + "loss": 0.7382, + "step": 1747 + }, + { + "epoch": 0.9322666666666667, + "grad_norm": 0.49726536415452277, + "learning_rate": 2.3985326404461604e-06, + "loss": 0.7179, + "step": 1748 + }, + { + "epoch": 0.9328, + "grad_norm": 0.4916875608972585, + "learning_rate": 2.3610579436393e-06, + "loss": 0.7419, + "step": 1749 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 0.5013111067249194, + "learning_rate": 2.3238748115339324e-06, + "loss": 0.7238, + "step": 1750 + }, + { + "epoch": 0.9338666666666666, + "grad_norm": 0.5111579653157136, + "learning_rate": 2.286983355164529e-06, + "loss": 0.7329, + "step": 1751 + }, + { + "epoch": 0.9344, + "grad_norm": 0.42180275764096986, + "learning_rate": 2.250383684694579e-06, + "loss": 0.5899, + "step": 1752 + }, + { + "epoch": 0.9349333333333333, + "grad_norm": 0.4136580821470877, + "learning_rate": 2.2140759094162467e-06, + "loss": 0.6377, + "step": 1753 + }, + { + "epoch": 0.9354666666666667, + "grad_norm": 0.496226995771142, + "learning_rate": 2.178060137750071e-06, + "loss": 0.7685, + "step": 1754 + }, + { + "epoch": 0.936, + "grad_norm": 0.5285898567821652, + "learning_rate": 2.1423364772445887e-06, + "loss": 0.7959, + "step": 1755 + }, + { + "epoch": 0.9365333333333333, + "grad_norm": 0.4197114796133001, + "learning_rate": 2.106905034576112e-06, + "loss": 0.6642, + "step": 1756 + }, + { + "epoch": 0.9370666666666667, + "grad_norm": 0.617889865561174, + "learning_rate": 2.0717659155482738e-06, + "loss": 0.8222, + "step": 1757 + }, + { + "epoch": 0.9376, + "grad_norm": 0.606292067950159, + "learning_rate": 2.036919225091827e-06, + "loss": 0.7569, + "step": 1758 + }, + { + "epoch": 0.9381333333333334, + "grad_norm": 0.48358230199811053, + "learning_rate": 2.002365067264289e-06, + "loss": 0.6421, + "step": 1759 + }, + { + "epoch": 0.9386666666666666, + "grad_norm": 0.7161695624087144, + "learning_rate": 1.968103545249611e-06, + "loss": 0.8082, + "step": 1760 + }, + { + "epoch": 0.9392, + "grad_norm": 0.4051625738114139, + "learning_rate": 1.9341347613579087e-06, + "loss": 0.6526, + "step": 1761 + }, + { + "epoch": 0.9397333333333333, + "grad_norm": 0.4299181922221351, + "learning_rate": 1.900458817025097e-06, + "loss": 0.6727, + "step": 1762 + }, + { + "epoch": 0.9402666666666667, + "grad_norm": 0.47701691210897, + "learning_rate": 1.8670758128126909e-06, + "loss": 0.7156, + "step": 1763 + }, + { + "epoch": 0.9408, + "grad_norm": 0.4466971408145806, + "learning_rate": 1.8339858484073935e-06, + "loss": 0.6131, + "step": 1764 + }, + { + "epoch": 0.9413333333333334, + "grad_norm": 0.47586344574699047, + "learning_rate": 1.8011890226208527e-06, + "loss": 0.7455, + "step": 1765 + }, + { + "epoch": 0.9418666666666666, + "grad_norm": 0.5333491333238738, + "learning_rate": 1.7686854333893833e-06, + "loss": 0.777, + "step": 1766 + }, + { + "epoch": 0.9424, + "grad_norm": 0.43097036219369156, + "learning_rate": 1.7364751777736332e-06, + "loss": 0.686, + "step": 1767 + }, + { + "epoch": 0.9429333333333333, + "grad_norm": 0.40306995652351624, + "learning_rate": 1.7045583519583074e-06, + "loss": 0.6632, + "step": 1768 + }, + { + "epoch": 0.9434666666666667, + "grad_norm": 0.5054067762609026, + "learning_rate": 1.6729350512519005e-06, + "loss": 0.6999, + "step": 1769 + }, + { + "epoch": 0.944, + "grad_norm": 0.3771454558465983, + "learning_rate": 1.6416053700863964e-06, + "loss": 0.6567, + "step": 1770 + }, + { + "epoch": 0.9445333333333333, + "grad_norm": 0.5509412959915065, + "learning_rate": 1.6105694020169593e-06, + "loss": 0.7371, + "step": 1771 + }, + { + "epoch": 0.9450666666666667, + "grad_norm": 0.4482264117064355, + "learning_rate": 1.5798272397217095e-06, + "loss": 0.7454, + "step": 1772 + }, + { + "epoch": 0.9456, + "grad_norm": 0.5201668268071643, + "learning_rate": 1.5493789750014031e-06, + "loss": 0.7104, + "step": 1773 + }, + { + "epoch": 0.9461333333333334, + "grad_norm": 0.4365736211274258, + "learning_rate": 1.5192246987791981e-06, + "loss": 0.7213, + "step": 1774 + }, + { + "epoch": 0.9466666666666667, + "grad_norm": 0.4606040027591618, + "learning_rate": 1.489364501100332e-06, + "loss": 0.7046, + "step": 1775 + }, + { + "epoch": 0.9472, + "grad_norm": 0.37495205500136525, + "learning_rate": 1.459798471131868e-06, + "loss": 0.5511, + "step": 1776 + }, + { + "epoch": 0.9477333333333333, + "grad_norm": 0.4649202973154337, + "learning_rate": 1.430526697162482e-06, + "loss": 0.6964, + "step": 1777 + }, + { + "epoch": 0.9482666666666667, + "grad_norm": 0.5162600069616848, + "learning_rate": 1.4015492666021312e-06, + "loss": 0.7756, + "step": 1778 + }, + { + "epoch": 0.9488, + "grad_norm": 0.44816010084396685, + "learning_rate": 1.3728662659818204e-06, + "loss": 0.7204, + "step": 1779 + }, + { + "epoch": 0.9493333333333334, + "grad_norm": 0.4857083955103614, + "learning_rate": 1.344477780953346e-06, + "loss": 0.7151, + "step": 1780 + }, + { + "epoch": 0.9498666666666666, + "grad_norm": 0.4868499988858953, + "learning_rate": 1.3163838962890195e-06, + "loss": 0.6853, + "step": 1781 + }, + { + "epoch": 0.9504, + "grad_norm": 0.5366789933159594, + "learning_rate": 1.2885846958814673e-06, + "loss": 0.7534, + "step": 1782 + }, + { + "epoch": 0.9509333333333333, + "grad_norm": 0.5758222815631294, + "learning_rate": 1.261080262743297e-06, + "loss": 0.7595, + "step": 1783 + }, + { + "epoch": 0.9514666666666667, + "grad_norm": 0.46310850721307445, + "learning_rate": 1.2338706790069431e-06, + "loss": 0.7028, + "step": 1784 + }, + { + "epoch": 0.952, + "grad_norm": 0.5561069134921421, + "learning_rate": 1.2069560259243328e-06, + "loss": 0.8377, + "step": 1785 + }, + { + "epoch": 0.9525333333333333, + "grad_norm": 0.6560905308494205, + "learning_rate": 1.1803363838667092e-06, + "loss": 0.6879, + "step": 1786 + }, + { + "epoch": 0.9530666666666666, + "grad_norm": 0.4449699955108516, + "learning_rate": 1.1540118323243865e-06, + "loss": 0.6719, + "step": 1787 + }, + { + "epoch": 0.9536, + "grad_norm": 0.4060169297987321, + "learning_rate": 1.1279824499064396e-06, + "loss": 0.6261, + "step": 1788 + }, + { + "epoch": 0.9541333333333334, + "grad_norm": 0.5230066158044145, + "learning_rate": 1.1022483143405705e-06, + "loss": 0.6871, + "step": 1789 + }, + { + "epoch": 0.9546666666666667, + "grad_norm": 0.46996784274130565, + "learning_rate": 1.076809502472831e-06, + "loss": 0.7292, + "step": 1790 + }, + { + "epoch": 0.9552, + "grad_norm": 0.5050186353073788, + "learning_rate": 1.0516660902673448e-06, + "loss": 0.7157, + "step": 1791 + }, + { + "epoch": 0.9557333333333333, + "grad_norm": 0.5211608433595872, + "learning_rate": 1.0268181528061749e-06, + "loss": 0.6562, + "step": 1792 + }, + { + "epoch": 0.9562666666666667, + "grad_norm": 0.5379631995615644, + "learning_rate": 1.0022657642890231e-06, + "loss": 0.7603, + "step": 1793 + }, + { + "epoch": 0.9568, + "grad_norm": 0.5288416301092489, + "learning_rate": 9.780089980330642e-07, + "loss": 0.7771, + "step": 1794 + }, + { + "epoch": 0.9573333333333334, + "grad_norm": 0.5617692135985728, + "learning_rate": 9.540479264726676e-07, + "loss": 0.8067, + "step": 1795 + }, + { + "epoch": 0.9578666666666666, + "grad_norm": 0.469019831644795, + "learning_rate": 9.303826211592315e-07, + "loss": 0.7219, + "step": 1796 + }, + { + "epoch": 0.9584, + "grad_norm": 0.36186071420054156, + "learning_rate": 9.070131527609604e-07, + "loss": 0.58, + "step": 1797 + }, + { + "epoch": 0.9589333333333333, + "grad_norm": 0.5514952842601946, + "learning_rate": 8.839395910626213e-07, + "loss": 0.7673, + "step": 1798 + }, + { + "epoch": 0.9594666666666667, + "grad_norm": 0.47616261536395255, + "learning_rate": 8.611620049653879e-07, + "loss": 0.7325, + "step": 1799 + }, + { + "epoch": 0.96, + "grad_norm": 0.49771928947304866, + "learning_rate": 8.386804624865851e-07, + "loss": 0.7444, + "step": 1800 + }, + { + "epoch": 0.9605333333333334, + "grad_norm": 0.490805076177822, + "learning_rate": 8.16495030759501e-07, + "loss": 0.6328, + "step": 1801 + }, + { + "epoch": 0.9610666666666666, + "grad_norm": 0.4505767877704554, + "learning_rate": 7.946057760332193e-07, + "loss": 0.6216, + "step": 1802 + }, + { + "epoch": 0.9616, + "grad_norm": 0.546886120714011, + "learning_rate": 7.730127636723539e-07, + "loss": 0.6847, + "step": 1803 + }, + { + "epoch": 0.9621333333333333, + "grad_norm": 0.4652153899264941, + "learning_rate": 7.517160581569372e-07, + "loss": 0.6592, + "step": 1804 + }, + { + "epoch": 0.9626666666666667, + "grad_norm": 0.403288371950824, + "learning_rate": 7.307157230821426e-07, + "loss": 0.6105, + "step": 1805 + }, + { + "epoch": 0.9632, + "grad_norm": 0.5203408586803837, + "learning_rate": 7.100118211581852e-07, + "loss": 0.6982, + "step": 1806 + }, + { + "epoch": 0.9637333333333333, + "grad_norm": 0.5302648639664964, + "learning_rate": 6.896044142100433e-07, + "loss": 0.7863, + "step": 1807 + }, + { + "epoch": 0.9642666666666667, + "grad_norm": 0.5999242011265087, + "learning_rate": 6.694935631773258e-07, + "loss": 0.7431, + "step": 1808 + }, + { + "epoch": 0.9648, + "grad_norm": 0.5006843905869421, + "learning_rate": 6.496793281141056e-07, + "loss": 0.7036, + "step": 1809 + }, + { + "epoch": 0.9653333333333334, + "grad_norm": 0.4965402200645782, + "learning_rate": 6.301617681886863e-07, + "loss": 0.783, + "step": 1810 + }, + { + "epoch": 0.9658666666666667, + "grad_norm": 0.4969460812661328, + "learning_rate": 6.109409416834688e-07, + "loss": 0.784, + "step": 1811 + }, + { + "epoch": 0.9664, + "grad_norm": 0.531648603927093, + "learning_rate": 5.920169059947411e-07, + "loss": 0.8132, + "step": 1812 + }, + { + "epoch": 0.9669333333333333, + "grad_norm": 0.6240854665278419, + "learning_rate": 5.733897176325665e-07, + "loss": 0.8512, + "step": 1813 + }, + { + "epoch": 0.9674666666666667, + "grad_norm": 0.5580182083189841, + "learning_rate": 5.550594322205504e-07, + "loss": 0.7281, + "step": 1814 + }, + { + "epoch": 0.968, + "grad_norm": 0.4859660233476685, + "learning_rate": 5.370261044956971e-07, + "loss": 0.7066, + "step": 1815 + }, + { + "epoch": 0.9685333333333334, + "grad_norm": 0.5125505489699571, + "learning_rate": 5.192897883082747e-07, + "loss": 0.762, + "step": 1816 + }, + { + "epoch": 0.9690666666666666, + "grad_norm": 0.5208541059125794, + "learning_rate": 5.018505366216175e-07, + "loss": 0.6834, + "step": 1817 + }, + { + "epoch": 0.9696, + "grad_norm": 0.6417611341342232, + "learning_rate": 4.847084015119574e-07, + "loss": 0.8377, + "step": 1818 + }, + { + "epoch": 0.9701333333333333, + "grad_norm": 0.39820648268764225, + "learning_rate": 4.678634341683252e-07, + "loss": 0.6743, + "step": 1819 + }, + { + "epoch": 0.9706666666666667, + "grad_norm": 0.41943172803182305, + "learning_rate": 4.5131568489236166e-07, + "loss": 0.6715, + "step": 1820 + }, + { + "epoch": 0.9712, + "grad_norm": 0.4365793004569459, + "learning_rate": 4.3506520309813947e-07, + "loss": 0.6766, + "step": 1821 + }, + { + "epoch": 0.9717333333333333, + "grad_norm": 0.4757877763567356, + "learning_rate": 4.191120373120749e-07, + "loss": 0.5985, + "step": 1822 + }, + { + "epoch": 0.9722666666666666, + "grad_norm": 0.5696292967795435, + "learning_rate": 4.034562351727389e-07, + "loss": 0.7671, + "step": 1823 + }, + { + "epoch": 0.9728, + "grad_norm": 0.46415712059206915, + "learning_rate": 3.8809784343072366e-07, + "loss": 0.6021, + "step": 1824 + }, + { + "epoch": 0.9733333333333334, + "grad_norm": 0.4882964088303243, + "learning_rate": 3.73036907948543e-07, + "loss": 0.7239, + "step": 1825 + }, + { + "epoch": 0.9738666666666667, + "grad_norm": 0.5036691213297696, + "learning_rate": 3.582734737004101e-07, + "loss": 0.6801, + "step": 1826 + }, + { + "epoch": 0.9744, + "grad_norm": 0.44412768989414975, + "learning_rate": 3.4380758477219333e-07, + "loss": 0.6192, + "step": 1827 + }, + { + "epoch": 0.9749333333333333, + "grad_norm": 0.4334056206380038, + "learning_rate": 3.296392843612273e-07, + "loss": 0.7146, + "step": 1828 + }, + { + "epoch": 0.9754666666666667, + "grad_norm": 0.5317515881273797, + "learning_rate": 3.1576861477621287e-07, + "loss": 0.7146, + "step": 1829 + }, + { + "epoch": 0.976, + "grad_norm": 0.44830874612596283, + "learning_rate": 3.0219561743707326e-07, + "loss": 0.7163, + "step": 1830 + }, + { + "epoch": 0.9765333333333334, + "grad_norm": 0.5118343343875779, + "learning_rate": 2.889203328748424e-07, + "loss": 0.6868, + "step": 1831 + }, + { + "epoch": 0.9770666666666666, + "grad_norm": 0.5362455723338327, + "learning_rate": 2.759428007315212e-07, + "loss": 0.6871, + "step": 1832 + }, + { + "epoch": 0.9776, + "grad_norm": 0.4433335474125894, + "learning_rate": 2.6326305976001055e-07, + "loss": 0.6308, + "step": 1833 + }, + { + "epoch": 0.9781333333333333, + "grad_norm": 0.42613712933795245, + "learning_rate": 2.5088114782392257e-07, + "loss": 0.6636, + "step": 1834 + }, + { + "epoch": 0.9786666666666667, + "grad_norm": 0.44864076347431425, + "learning_rate": 2.3879710189753656e-07, + "loss": 0.7606, + "step": 1835 + }, + { + "epoch": 0.9792, + "grad_norm": 0.5355535529859561, + "learning_rate": 2.2701095806565432e-07, + "loss": 0.7468, + "step": 1836 + }, + { + "epoch": 0.9797333333333333, + "grad_norm": 0.5484865023173826, + "learning_rate": 2.15522751523467e-07, + "loss": 0.8353, + "step": 1837 + }, + { + "epoch": 0.9802666666666666, + "grad_norm": 0.5197743045775367, + "learning_rate": 2.0433251657653308e-07, + "loss": 0.7494, + "step": 1838 + }, + { + "epoch": 0.9808, + "grad_norm": 0.4131281147883286, + "learning_rate": 1.9344028664056713e-07, + "loss": 0.5756, + "step": 1839 + }, + { + "epoch": 0.9813333333333333, + "grad_norm": 0.49358540741723306, + "learning_rate": 1.8284609424142895e-07, + "loss": 0.7275, + "step": 1840 + }, + { + "epoch": 0.9818666666666667, + "grad_norm": 0.4289229445657535, + "learning_rate": 1.7254997101500137e-07, + "loss": 0.699, + "step": 1841 + }, + { + "epoch": 0.9824, + "grad_norm": 0.4987867160996175, + "learning_rate": 1.6255194770704586e-07, + "loss": 0.736, + "step": 1842 + }, + { + "epoch": 0.9829333333333333, + "grad_norm": 0.44847119443742095, + "learning_rate": 1.5285205417319149e-07, + "loss": 0.6662, + "step": 1843 + }, + { + "epoch": 0.9834666666666667, + "grad_norm": 0.4404285750263958, + "learning_rate": 1.4345031937879062e-07, + "loss": 0.5925, + "step": 1844 + }, + { + "epoch": 0.984, + "grad_norm": 0.5197926230065087, + "learning_rate": 1.3434677139885222e-07, + "loss": 0.7181, + "step": 1845 + }, + { + "epoch": 0.9845333333333334, + "grad_norm": 0.44517753678196087, + "learning_rate": 1.255414374179531e-07, + "loss": 0.6479, + "step": 1846 + }, + { + "epoch": 0.9850666666666666, + "grad_norm": 0.43248993328356955, + "learning_rate": 1.170343437301491e-07, + "loss": 0.6896, + "step": 1847 + }, + { + "epoch": 0.9856, + "grad_norm": 0.37888857741562054, + "learning_rate": 1.0882551573891953e-07, + "loss": 0.5676, + "step": 1848 + }, + { + "epoch": 0.9861333333333333, + "grad_norm": 0.4288486528383463, + "learning_rate": 1.0091497795706728e-07, + "loss": 0.6826, + "step": 1849 + }, + { + "epoch": 0.9866666666666667, + "grad_norm": 0.48226318453468403, + "learning_rate": 9.330275400666332e-08, + "loss": 0.7637, + "step": 1850 + }, + { + "epoch": 0.9872, + "grad_norm": 0.45033126161507564, + "learning_rate": 8.598886661895788e-08, + "loss": 0.7054, + "step": 1851 + }, + { + "epoch": 0.9877333333333334, + "grad_norm": 0.5224814686511968, + "learning_rate": 7.8973337634336e-08, + "loss": 0.6732, + "step": 1852 + }, + { + "epoch": 0.9882666666666666, + "grad_norm": 0.5040000891510696, + "learning_rate": 7.225618800222877e-08, + "loss": 0.6622, + "step": 1853 + }, + { + "epoch": 0.9888, + "grad_norm": 0.42715592838634286, + "learning_rate": 6.583743778106887e-08, + "loss": 0.6281, + "step": 1854 + }, + { + "epoch": 0.9893333333333333, + "grad_norm": 0.44340509873449113, + "learning_rate": 5.971710613821291e-08, + "loss": 0.7003, + "step": 1855 + }, + { + "epoch": 0.9898666666666667, + "grad_norm": 0.44648750338781834, + "learning_rate": 5.389521134989695e-08, + "loss": 0.6881, + "step": 1856 + }, + { + "epoch": 0.9904, + "grad_norm": 0.42941753278300676, + "learning_rate": 4.837177080119215e-08, + "loss": 0.6005, + "step": 1857 + }, + { + "epoch": 0.9909333333333333, + "grad_norm": 0.5182651819295042, + "learning_rate": 4.314680098592705e-08, + "loss": 0.7509, + "step": 1858 + }, + { + "epoch": 0.9914666666666667, + "grad_norm": 0.5228807722686285, + "learning_rate": 3.8220317506654226e-08, + "loss": 0.7386, + "step": 1859 + }, + { + "epoch": 0.992, + "grad_norm": 0.5136503316973929, + "learning_rate": 3.359233507459481e-08, + "loss": 0.6742, + "step": 1860 + }, + { + "epoch": 0.9925333333333334, + "grad_norm": 0.49047170464661055, + "learning_rate": 2.9262867509605163e-08, + "loss": 0.6734, + "step": 1861 + }, + { + "epoch": 0.9930666666666667, + "grad_norm": 0.6186312205906592, + "learning_rate": 2.5231927740154704e-08, + "loss": 0.81, + "step": 1862 + }, + { + "epoch": 0.9936, + "grad_norm": 0.4526584209336811, + "learning_rate": 2.1499527803214846e-08, + "loss": 0.7051, + "step": 1863 + }, + { + "epoch": 0.9941333333333333, + "grad_norm": 0.45617373499579417, + "learning_rate": 1.8065678844314538e-08, + "loss": 0.6902, + "step": 1864 + }, + { + "epoch": 0.9946666666666667, + "grad_norm": 0.6027724907975999, + "learning_rate": 1.4930391117451426e-08, + "loss": 0.7722, + "step": 1865 + }, + { + "epoch": 0.9952, + "grad_norm": 0.4700052206922473, + "learning_rate": 1.209367398504746e-08, + "loss": 0.5983, + "step": 1866 + }, + { + "epoch": 0.9957333333333334, + "grad_norm": 0.7086846695221508, + "learning_rate": 9.555535917993297e-09, + "loss": 1.006, + "step": 1867 + }, + { + "epoch": 0.9962666666666666, + "grad_norm": 0.6814212714863245, + "learning_rate": 7.315984495548378e-09, + "loss": 0.7931, + "step": 1868 + }, + { + "epoch": 0.9968, + "grad_norm": 0.42297711302573726, + "learning_rate": 5.375026405352035e-09, + "loss": 0.6786, + "step": 1869 + }, + { + "epoch": 0.9973333333333333, + "grad_norm": 0.5850099736471568, + "learning_rate": 3.732667443390181e-09, + "loss": 0.8436, + "step": 1870 + }, + { + "epoch": 0.9978666666666667, + "grad_norm": 0.42956006902121935, + "learning_rate": 2.388912514017516e-09, + "loss": 0.6895, + "step": 1871 + }, + { + "epoch": 0.9984, + "grad_norm": 0.4260271478296352, + "learning_rate": 1.3437656298687097e-09, + "loss": 0.6554, + "step": 1872 + }, + { + "epoch": 0.9989333333333333, + "grad_norm": 0.5290206099979772, + "learning_rate": 5.972299119250125e-10, + "loss": 0.6955, + "step": 1873 + }, + { + "epoch": 0.9994666666666666, + "grad_norm": 0.48785151961800294, + "learning_rate": 1.4930758944764479e-10, + "loss": 0.7176, + "step": 1874 + }, + { + "epoch": 1.0, + "grad_norm": 0.4250442322020013, + "learning_rate": 0.0, + "loss": 0.6854, + "step": 1875 + }, + { + "epoch": 1.0, + "step": 1875, + "total_flos": 1526950804455424.0, + "train_loss": 0.7908816375732421, + "train_runtime": 28102.0472, + "train_samples_per_second": 1.068, + "train_steps_per_second": 0.067 + } + ], + "logging_steps": 1.0, + "max_steps": 1875, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1526950804455424.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_3_epochs_1_GA_2_lora/README.md b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_3_epochs_1_GA_2_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_3_epochs_1_GA_2_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_3_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_3_epochs_1_GA_2_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..054f4932b83048bbd77dedd425621288b6dbcdfd --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_3_epochs_1_GA_2_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "down_proj", + "q_proj", + "up_proj", + "v_proj", + "o_proj", + "gate_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e4f6873f8ffc707bea00278f882935a36aa3c20a --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fba053cf38ab9fd24bf53350693387d55edb1fb204cad8950e80f774cf3b1382 +size 671150064 diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_3_epochs_1_GA_2_lora/config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_3_epochs_1_GA_2_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_3_epochs_1_GA_2_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..28d9d8b407fb624923856103b6cc51eb9cd6be41 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:619757c697845a8c27d4970528301b1a73673034c8ba81f600ecf7ff8403f1e5 +size 918507402 diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_3_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_3_epochs_1_GA_2_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..204aaee9761bb279ae87f756ecbddede99cdad90 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_30000_repeat_3_epochs_1_GA_2_lora/trainer_state.json @@ -0,0 +1,13167 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 1875, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005333333333333334, + "grad_norm": 0.7125623597453534, + "learning_rate": 3.5087719298245615e-06, + "loss": 1.1919, + "step": 1 + }, + { + "epoch": 0.0010666666666666667, + "grad_norm": 1.0674347992870037, + "learning_rate": 7.017543859649123e-06, + "loss": 1.2128, + "step": 2 + }, + { + "epoch": 0.0016, + "grad_norm": 0.8728510850894342, + "learning_rate": 1.0526315789473684e-05, + "loss": 1.1698, + "step": 3 + }, + { + "epoch": 0.0021333333333333334, + "grad_norm": 0.9110198903157677, + "learning_rate": 1.4035087719298246e-05, + "loss": 1.3857, + "step": 4 + }, + { + "epoch": 0.0026666666666666666, + "grad_norm": 0.877247879975666, + "learning_rate": 1.7543859649122806e-05, + "loss": 1.3312, + "step": 5 + }, + { + "epoch": 0.0032, + "grad_norm": 0.8140465172895612, + "learning_rate": 2.105263157894737e-05, + "loss": 1.2262, + "step": 6 + }, + { + "epoch": 0.0037333333333333333, + "grad_norm": 0.895253458979813, + "learning_rate": 2.456140350877193e-05, + "loss": 1.2084, + "step": 7 + }, + { + "epoch": 0.004266666666666667, + "grad_norm": 0.7333061355376127, + "learning_rate": 2.8070175438596492e-05, + "loss": 1.1988, + "step": 8 + }, + { + "epoch": 0.0048, + "grad_norm": 0.6931096936857353, + "learning_rate": 3.157894736842105e-05, + "loss": 1.1417, + "step": 9 + }, + { + "epoch": 0.005333333333333333, + "grad_norm": 0.803854388768664, + "learning_rate": 3.508771929824561e-05, + "loss": 1.2071, + "step": 10 + }, + { + "epoch": 0.005866666666666667, + "grad_norm": 0.6601916733476776, + "learning_rate": 3.859649122807018e-05, + "loss": 0.9522, + "step": 11 + }, + { + "epoch": 0.0064, + "grad_norm": 0.8912905302625813, + "learning_rate": 4.210526315789474e-05, + "loss": 1.1968, + "step": 12 + }, + { + "epoch": 0.006933333333333333, + "grad_norm": 0.8430962018602007, + "learning_rate": 4.56140350877193e-05, + "loss": 1.1235, + "step": 13 + }, + { + "epoch": 0.007466666666666667, + "grad_norm": 0.6550051251096707, + "learning_rate": 4.912280701754386e-05, + "loss": 0.9506, + "step": 14 + }, + { + "epoch": 0.008, + "grad_norm": 1.1453521781894755, + "learning_rate": 5.2631578947368424e-05, + "loss": 1.1559, + "step": 15 + }, + { + "epoch": 0.008533333333333334, + "grad_norm": 0.7815021254168898, + "learning_rate": 5.6140350877192984e-05, + "loss": 1.0631, + "step": 16 + }, + { + "epoch": 0.009066666666666667, + "grad_norm": 0.8095443644525637, + "learning_rate": 5.9649122807017544e-05, + "loss": 1.1157, + "step": 17 + }, + { + "epoch": 0.0096, + "grad_norm": 0.7243647678054499, + "learning_rate": 6.31578947368421e-05, + "loss": 1.0483, + "step": 18 + }, + { + "epoch": 0.010133333333333333, + "grad_norm": 0.8661360484668469, + "learning_rate": 6.666666666666667e-05, + "loss": 1.0757, + "step": 19 + }, + { + "epoch": 0.010666666666666666, + "grad_norm": 0.9197298869430908, + "learning_rate": 7.017543859649122e-05, + "loss": 1.1066, + "step": 20 + }, + { + "epoch": 0.0112, + "grad_norm": 0.7445562450644425, + "learning_rate": 7.368421052631579e-05, + "loss": 1.0975, + "step": 21 + }, + { + "epoch": 0.011733333333333333, + "grad_norm": 0.5301275567251413, + "learning_rate": 7.719298245614036e-05, + "loss": 0.9044, + "step": 22 + }, + { + "epoch": 0.012266666666666667, + "grad_norm": 0.6628615339362863, + "learning_rate": 8.070175438596491e-05, + "loss": 0.9849, + "step": 23 + }, + { + "epoch": 0.0128, + "grad_norm": 0.6189170160418722, + "learning_rate": 8.421052631578948e-05, + "loss": 0.9818, + "step": 24 + }, + { + "epoch": 0.013333333333333334, + "grad_norm": 0.6181847218507321, + "learning_rate": 8.771929824561403e-05, + "loss": 1.0158, + "step": 25 + }, + { + "epoch": 0.013866666666666666, + "grad_norm": 0.6121104085973382, + "learning_rate": 9.12280701754386e-05, + "loss": 0.8488, + "step": 26 + }, + { + "epoch": 0.0144, + "grad_norm": 0.7523164934717913, + "learning_rate": 9.473684210526316e-05, + "loss": 0.9749, + "step": 27 + }, + { + "epoch": 0.014933333333333333, + "grad_norm": 0.7530922285738954, + "learning_rate": 9.824561403508771e-05, + "loss": 1.036, + "step": 28 + }, + { + "epoch": 0.015466666666666667, + "grad_norm": 0.5356182338531568, + "learning_rate": 0.0001017543859649123, + "loss": 0.7856, + "step": 29 + }, + { + "epoch": 0.016, + "grad_norm": 0.5288834988820852, + "learning_rate": 0.00010526315789473685, + "loss": 0.848, + "step": 30 + }, + { + "epoch": 0.016533333333333334, + "grad_norm": 0.6164517754077974, + "learning_rate": 0.00010877192982456141, + "loss": 0.9562, + "step": 31 + }, + { + "epoch": 0.017066666666666667, + "grad_norm": 0.4632419418202054, + "learning_rate": 0.00011228070175438597, + "loss": 0.7896, + "step": 32 + }, + { + "epoch": 0.0176, + "grad_norm": 0.818896020329126, + "learning_rate": 0.00011578947368421053, + "loss": 1.1009, + "step": 33 + }, + { + "epoch": 0.018133333333333335, + "grad_norm": 0.7170363974154533, + "learning_rate": 0.00011929824561403509, + "loss": 1.0739, + "step": 34 + }, + { + "epoch": 0.018666666666666668, + "grad_norm": 0.6836934050794474, + "learning_rate": 0.00012280701754385965, + "loss": 0.9919, + "step": 35 + }, + { + "epoch": 0.0192, + "grad_norm": 0.6643085774792657, + "learning_rate": 0.0001263157894736842, + "loss": 0.9648, + "step": 36 + }, + { + "epoch": 0.019733333333333332, + "grad_norm": 1.3538917299813327, + "learning_rate": 0.0001298245614035088, + "loss": 1.0665, + "step": 37 + }, + { + "epoch": 0.020266666666666665, + "grad_norm": 0.5594299868622598, + "learning_rate": 0.00013333333333333334, + "loss": 0.9335, + "step": 38 + }, + { + "epoch": 0.0208, + "grad_norm": 0.5988358395923179, + "learning_rate": 0.0001368421052631579, + "loss": 0.9207, + "step": 39 + }, + { + "epoch": 0.021333333333333333, + "grad_norm": 0.5071900127171447, + "learning_rate": 0.00014035087719298245, + "loss": 0.8156, + "step": 40 + }, + { + "epoch": 0.021866666666666666, + "grad_norm": 0.5033370206265442, + "learning_rate": 0.00014385964912280703, + "loss": 0.8482, + "step": 41 + }, + { + "epoch": 0.0224, + "grad_norm": 0.5750991048540888, + "learning_rate": 0.00014736842105263158, + "loss": 0.9306, + "step": 42 + }, + { + "epoch": 0.022933333333333333, + "grad_norm": 0.7797439497339441, + "learning_rate": 0.00015087719298245616, + "loss": 1.0038, + "step": 43 + }, + { + "epoch": 0.023466666666666667, + "grad_norm": 0.7397829823836828, + "learning_rate": 0.0001543859649122807, + "loss": 0.9437, + "step": 44 + }, + { + "epoch": 0.024, + "grad_norm": 0.5147050573707562, + "learning_rate": 0.00015789473684210527, + "loss": 0.8392, + "step": 45 + }, + { + "epoch": 0.024533333333333334, + "grad_norm": 0.5508332246098021, + "learning_rate": 0.00016140350877192982, + "loss": 0.8978, + "step": 46 + }, + { + "epoch": 0.025066666666666668, + "grad_norm": 0.6240366974915812, + "learning_rate": 0.0001649122807017544, + "loss": 1.0485, + "step": 47 + }, + { + "epoch": 0.0256, + "grad_norm": 0.6206028252741724, + "learning_rate": 0.00016842105263157895, + "loss": 0.8915, + "step": 48 + }, + { + "epoch": 0.026133333333333335, + "grad_norm": 0.5378699982729577, + "learning_rate": 0.00017192982456140353, + "loss": 0.9918, + "step": 49 + }, + { + "epoch": 0.02666666666666667, + "grad_norm": 0.4615725517131061, + "learning_rate": 0.00017543859649122806, + "loss": 0.8364, + "step": 50 + }, + { + "epoch": 0.0272, + "grad_norm": 0.839143978619063, + "learning_rate": 0.00017894736842105264, + "loss": 1.0965, + "step": 51 + }, + { + "epoch": 0.027733333333333332, + "grad_norm": 0.791627020793675, + "learning_rate": 0.0001824561403508772, + "loss": 1.0167, + "step": 52 + }, + { + "epoch": 0.028266666666666666, + "grad_norm": 0.5353898387675298, + "learning_rate": 0.00018596491228070177, + "loss": 0.8873, + "step": 53 + }, + { + "epoch": 0.0288, + "grad_norm": 0.46155943784236264, + "learning_rate": 0.00018947368421052632, + "loss": 0.8004, + "step": 54 + }, + { + "epoch": 0.029333333333333333, + "grad_norm": 0.6060608354221403, + "learning_rate": 0.00019298245614035088, + "loss": 0.9862, + "step": 55 + }, + { + "epoch": 0.029866666666666666, + "grad_norm": 0.5540773123940826, + "learning_rate": 0.00019649122807017543, + "loss": 0.9106, + "step": 56 + }, + { + "epoch": 0.0304, + "grad_norm": 0.48593847336045554, + "learning_rate": 0.0002, + "loss": 0.7632, + "step": 57 + }, + { + "epoch": 0.030933333333333334, + "grad_norm": 0.5628081160242835, + "learning_rate": 0.00019999985069241055, + "loss": 0.9321, + "step": 58 + }, + { + "epoch": 0.031466666666666664, + "grad_norm": 0.5569538768407111, + "learning_rate": 0.00019999940277008808, + "loss": 0.9621, + "step": 59 + }, + { + "epoch": 0.032, + "grad_norm": 0.4733778178712535, + "learning_rate": 0.00019999865623437013, + "loss": 0.805, + "step": 60 + }, + { + "epoch": 0.03253333333333333, + "grad_norm": 0.5498882038803397, + "learning_rate": 0.00019999761108748597, + "loss": 0.8731, + "step": 61 + }, + { + "epoch": 0.03306666666666667, + "grad_norm": 0.6276025403213903, + "learning_rate": 0.00019999626733255662, + "loss": 0.9597, + "step": 62 + }, + { + "epoch": 0.0336, + "grad_norm": 0.6262714541836627, + "learning_rate": 0.00019999462497359466, + "loss": 0.9889, + "step": 63 + }, + { + "epoch": 0.034133333333333335, + "grad_norm": 0.6116976323121694, + "learning_rate": 0.00019999268401550447, + "loss": 0.9198, + "step": 64 + }, + { + "epoch": 0.034666666666666665, + "grad_norm": 0.6830838441856558, + "learning_rate": 0.000199990444464082, + "loss": 0.876, + "step": 65 + }, + { + "epoch": 0.0352, + "grad_norm": 0.652455481238001, + "learning_rate": 0.00019998790632601496, + "loss": 0.971, + "step": 66 + }, + { + "epoch": 0.03573333333333333, + "grad_norm": 0.7252847928776754, + "learning_rate": 0.00019998506960888256, + "loss": 0.9825, + "step": 67 + }, + { + "epoch": 0.03626666666666667, + "grad_norm": 0.6524244782693203, + "learning_rate": 0.00019998193432115572, + "loss": 0.8155, + "step": 68 + }, + { + "epoch": 0.0368, + "grad_norm": 0.7972076010671612, + "learning_rate": 0.0001999785004721968, + "loss": 0.9859, + "step": 69 + }, + { + "epoch": 0.037333333333333336, + "grad_norm": 0.6557139336610636, + "learning_rate": 0.00019997476807225985, + "loss": 0.9518, + "step": 70 + }, + { + "epoch": 0.037866666666666667, + "grad_norm": 0.5767944519846985, + "learning_rate": 0.0001999707371324904, + "loss": 0.9456, + "step": 71 + }, + { + "epoch": 0.0384, + "grad_norm": 0.7424314830058474, + "learning_rate": 0.00019996640766492543, + "loss": 0.994, + "step": 72 + }, + { + "epoch": 0.038933333333333334, + "grad_norm": 0.5590183604518738, + "learning_rate": 0.00019996177968249334, + "loss": 0.8808, + "step": 73 + }, + { + "epoch": 0.039466666666666664, + "grad_norm": 0.5326291568201051, + "learning_rate": 0.0001999568531990141, + "loss": 0.8654, + "step": 74 + }, + { + "epoch": 0.04, + "grad_norm": 0.6344187236250632, + "learning_rate": 0.00019995162822919883, + "loss": 0.9406, + "step": 75 + }, + { + "epoch": 0.04053333333333333, + "grad_norm": 0.5164870062084322, + "learning_rate": 0.00019994610478865011, + "loss": 0.8699, + "step": 76 + }, + { + "epoch": 0.04106666666666667, + "grad_norm": 0.535560546495017, + "learning_rate": 0.0001999402828938618, + "loss": 0.8904, + "step": 77 + }, + { + "epoch": 0.0416, + "grad_norm": 0.604254569720925, + "learning_rate": 0.00019993416256221895, + "loss": 0.9037, + "step": 78 + }, + { + "epoch": 0.042133333333333335, + "grad_norm": 0.6820823738021821, + "learning_rate": 0.00019992774381199778, + "loss": 0.9401, + "step": 79 + }, + { + "epoch": 0.042666666666666665, + "grad_norm": 0.6680958377488201, + "learning_rate": 0.00019992102666236566, + "loss": 0.9381, + "step": 80 + }, + { + "epoch": 0.0432, + "grad_norm": 0.5953355425943272, + "learning_rate": 0.00019991401113338104, + "loss": 0.9229, + "step": 81 + }, + { + "epoch": 0.04373333333333333, + "grad_norm": 0.6303538915978696, + "learning_rate": 0.00019990669724599336, + "loss": 0.9379, + "step": 82 + }, + { + "epoch": 0.04426666666666667, + "grad_norm": 0.6555513492310497, + "learning_rate": 0.00019989908502204292, + "loss": 1.082, + "step": 83 + }, + { + "epoch": 0.0448, + "grad_norm": 0.5544528105821924, + "learning_rate": 0.00019989117448426108, + "loss": 0.8451, + "step": 84 + }, + { + "epoch": 0.04533333333333334, + "grad_norm": 0.551371645828701, + "learning_rate": 0.00019988296565626987, + "loss": 0.7987, + "step": 85 + }, + { + "epoch": 0.04586666666666667, + "grad_norm": 0.5663512160236398, + "learning_rate": 0.00019987445856258206, + "loss": 0.8809, + "step": 86 + }, + { + "epoch": 0.0464, + "grad_norm": 0.5334254537578645, + "learning_rate": 0.00019986565322860115, + "loss": 0.8859, + "step": 87 + }, + { + "epoch": 0.046933333333333334, + "grad_norm": 0.5864713325791271, + "learning_rate": 0.00019985654968062122, + "loss": 0.8676, + "step": 88 + }, + { + "epoch": 0.047466666666666664, + "grad_norm": 0.4529120336108661, + "learning_rate": 0.00019984714794582683, + "loss": 0.7923, + "step": 89 + }, + { + "epoch": 0.048, + "grad_norm": 0.5629961828154081, + "learning_rate": 0.00019983744805229296, + "loss": 0.8905, + "step": 90 + }, + { + "epoch": 0.04853333333333333, + "grad_norm": 0.5060733125222548, + "learning_rate": 0.000199827450028985, + "loss": 0.8003, + "step": 91 + }, + { + "epoch": 0.04906666666666667, + "grad_norm": 0.6099839802016345, + "learning_rate": 0.00019981715390575858, + "loss": 0.9851, + "step": 92 + }, + { + "epoch": 0.0496, + "grad_norm": 0.7766168781243934, + "learning_rate": 0.00019980655971335945, + "loss": 1.0454, + "step": 93 + }, + { + "epoch": 0.050133333333333335, + "grad_norm": 0.6574911191619291, + "learning_rate": 0.00019979566748342347, + "loss": 0.8434, + "step": 94 + }, + { + "epoch": 0.050666666666666665, + "grad_norm": 0.5412993314160603, + "learning_rate": 0.00019978447724847652, + "loss": 0.8626, + "step": 95 + }, + { + "epoch": 0.0512, + "grad_norm": 0.5318278327058145, + "learning_rate": 0.00019977298904193437, + "loss": 0.9377, + "step": 96 + }, + { + "epoch": 0.05173333333333333, + "grad_norm": 0.6418398169575796, + "learning_rate": 0.00019976120289810247, + "loss": 0.9495, + "step": 97 + }, + { + "epoch": 0.05226666666666667, + "grad_norm": 0.4584177185767165, + "learning_rate": 0.00019974911885217608, + "loss": 0.8138, + "step": 98 + }, + { + "epoch": 0.0528, + "grad_norm": 0.6544310443497915, + "learning_rate": 0.00019973673694024, + "loss": 0.8797, + "step": 99 + }, + { + "epoch": 0.05333333333333334, + "grad_norm": 0.6929605791129708, + "learning_rate": 0.0001997240571992685, + "loss": 0.9384, + "step": 100 + }, + { + "epoch": 0.05386666666666667, + "grad_norm": 0.5426845719604184, + "learning_rate": 0.00019971107966712518, + "loss": 0.868, + "step": 101 + }, + { + "epoch": 0.0544, + "grad_norm": 0.6276077620947658, + "learning_rate": 0.00019969780438256293, + "loss": 0.9038, + "step": 102 + }, + { + "epoch": 0.054933333333333334, + "grad_norm": 0.610444561137079, + "learning_rate": 0.0001996842313852238, + "loss": 0.9425, + "step": 103 + }, + { + "epoch": 0.055466666666666664, + "grad_norm": 0.5085934994219482, + "learning_rate": 0.00019967036071563877, + "loss": 0.8444, + "step": 104 + }, + { + "epoch": 0.056, + "grad_norm": 0.47254420281476195, + "learning_rate": 0.0001996561924152278, + "loss": 0.7566, + "step": 105 + }, + { + "epoch": 0.05653333333333333, + "grad_norm": 0.6359428663755088, + "learning_rate": 0.0001996417265262996, + "loss": 0.8932, + "step": 106 + }, + { + "epoch": 0.05706666666666667, + "grad_norm": 0.5833109167273566, + "learning_rate": 0.00019962696309205148, + "loss": 0.9093, + "step": 107 + }, + { + "epoch": 0.0576, + "grad_norm": 0.7101667537550055, + "learning_rate": 0.0001996119021565693, + "loss": 0.9376, + "step": 108 + }, + { + "epoch": 0.058133333333333335, + "grad_norm": 0.6655308545817167, + "learning_rate": 0.0001995965437648273, + "loss": 0.9556, + "step": 109 + }, + { + "epoch": 0.058666666666666666, + "grad_norm": 0.5239100967263433, + "learning_rate": 0.00019958088796268793, + "loss": 0.8741, + "step": 110 + }, + { + "epoch": 0.0592, + "grad_norm": 0.6492043209126855, + "learning_rate": 0.0001995649347969019, + "loss": 1.0431, + "step": 111 + }, + { + "epoch": 0.05973333333333333, + "grad_norm": 0.6215814129519589, + "learning_rate": 0.00019954868431510764, + "loss": 0.9475, + "step": 112 + }, + { + "epoch": 0.06026666666666667, + "grad_norm": 0.6740097060761598, + "learning_rate": 0.00019953213656583168, + "loss": 1.0179, + "step": 113 + }, + { + "epoch": 0.0608, + "grad_norm": 0.690783804749006, + "learning_rate": 0.00019951529159848805, + "loss": 0.9117, + "step": 114 + }, + { + "epoch": 0.06133333333333333, + "grad_norm": 0.5221887417592358, + "learning_rate": 0.00019949814946337838, + "loss": 0.8273, + "step": 115 + }, + { + "epoch": 0.06186666666666667, + "grad_norm": 0.5304131053513017, + "learning_rate": 0.00019948071021169174, + "loss": 0.8758, + "step": 116 + }, + { + "epoch": 0.0624, + "grad_norm": 0.6580880007697816, + "learning_rate": 0.00019946297389550433, + "loss": 1.0554, + "step": 117 + }, + { + "epoch": 0.06293333333333333, + "grad_norm": 0.5121534197297978, + "learning_rate": 0.00019944494056777946, + "loss": 0.8894, + "step": 118 + }, + { + "epoch": 0.06346666666666667, + "grad_norm": 0.48233888311585627, + "learning_rate": 0.00019942661028236745, + "loss": 0.7437, + "step": 119 + }, + { + "epoch": 0.064, + "grad_norm": 0.6005642494183314, + "learning_rate": 0.00019940798309400526, + "loss": 0.8986, + "step": 120 + }, + { + "epoch": 0.06453333333333333, + "grad_norm": 0.5404763024560123, + "learning_rate": 0.00019938905905831654, + "loss": 0.8837, + "step": 121 + }, + { + "epoch": 0.06506666666666666, + "grad_norm": 0.5935111172167694, + "learning_rate": 0.00019936983823181132, + "loss": 0.878, + "step": 122 + }, + { + "epoch": 0.0656, + "grad_norm": 0.6580202309296764, + "learning_rate": 0.0001993503206718859, + "loss": 0.9591, + "step": 123 + }, + { + "epoch": 0.06613333333333334, + "grad_norm": 0.783516788896961, + "learning_rate": 0.00019933050643682269, + "loss": 0.9493, + "step": 124 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 0.5888802226903442, + "learning_rate": 0.00019931039558578997, + "loss": 0.8927, + "step": 125 + }, + { + "epoch": 0.0672, + "grad_norm": 0.5723859363109641, + "learning_rate": 0.00019928998817884182, + "loss": 0.9258, + "step": 126 + }, + { + "epoch": 0.06773333333333334, + "grad_norm": 0.49145024025740786, + "learning_rate": 0.00019926928427691786, + "loss": 0.8225, + "step": 127 + }, + { + "epoch": 0.06826666666666667, + "grad_norm": 0.48434420326029637, + "learning_rate": 0.00019924828394184306, + "loss": 0.7424, + "step": 128 + }, + { + "epoch": 0.0688, + "grad_norm": 0.6298233258277486, + "learning_rate": 0.00019922698723632767, + "loss": 0.9409, + "step": 129 + }, + { + "epoch": 0.06933333333333333, + "grad_norm": 0.5713270653533685, + "learning_rate": 0.0001992053942239668, + "loss": 0.9058, + "step": 130 + }, + { + "epoch": 0.06986666666666666, + "grad_norm": 0.6221411704388371, + "learning_rate": 0.0001991835049692405, + "loss": 0.8763, + "step": 131 + }, + { + "epoch": 0.0704, + "grad_norm": 0.5626768128370655, + "learning_rate": 0.00019916131953751342, + "loss": 0.9247, + "step": 132 + }, + { + "epoch": 0.07093333333333333, + "grad_norm": 0.5697871650413173, + "learning_rate": 0.0001991388379950346, + "loss": 0.806, + "step": 133 + }, + { + "epoch": 0.07146666666666666, + "grad_norm": 0.6608383412606968, + "learning_rate": 0.0001991160604089374, + "loss": 0.9004, + "step": 134 + }, + { + "epoch": 0.072, + "grad_norm": 0.584120743461091, + "learning_rate": 0.00019909298684723904, + "loss": 0.7749, + "step": 135 + }, + { + "epoch": 0.07253333333333334, + "grad_norm": 0.5221342816426137, + "learning_rate": 0.00019906961737884077, + "loss": 0.8579, + "step": 136 + }, + { + "epoch": 0.07306666666666667, + "grad_norm": 0.5757818080661625, + "learning_rate": 0.00019904595207352737, + "loss": 0.8815, + "step": 137 + }, + { + "epoch": 0.0736, + "grad_norm": 0.6705434878822246, + "learning_rate": 0.00019902199100196697, + "loss": 0.9205, + "step": 138 + }, + { + "epoch": 0.07413333333333333, + "grad_norm": 0.5023728751807374, + "learning_rate": 0.000198997734235711, + "loss": 0.9135, + "step": 139 + }, + { + "epoch": 0.07466666666666667, + "grad_norm": 0.6853077151873902, + "learning_rate": 0.00019897318184719385, + "loss": 0.9295, + "step": 140 + }, + { + "epoch": 0.0752, + "grad_norm": 0.7253071327402918, + "learning_rate": 0.00019894833390973266, + "loss": 1.0147, + "step": 141 + }, + { + "epoch": 0.07573333333333333, + "grad_norm": 0.6436052857411607, + "learning_rate": 0.0001989231904975272, + "loss": 0.9075, + "step": 142 + }, + { + "epoch": 0.07626666666666666, + "grad_norm": 0.6739660650927344, + "learning_rate": 0.00019889775168565943, + "loss": 0.8374, + "step": 143 + }, + { + "epoch": 0.0768, + "grad_norm": 0.691497400214393, + "learning_rate": 0.00019887201755009357, + "loss": 0.9414, + "step": 144 + }, + { + "epoch": 0.07733333333333334, + "grad_norm": 0.5369938083375458, + "learning_rate": 0.00019884598816767563, + "loss": 0.8273, + "step": 145 + }, + { + "epoch": 0.07786666666666667, + "grad_norm": 0.456701747705484, + "learning_rate": 0.0001988196636161333, + "loss": 0.814, + "step": 146 + }, + { + "epoch": 0.0784, + "grad_norm": 0.5059826225091343, + "learning_rate": 0.0001987930439740757, + "loss": 0.8691, + "step": 147 + }, + { + "epoch": 0.07893333333333333, + "grad_norm": 0.617153667680866, + "learning_rate": 0.00019876612932099308, + "loss": 0.9196, + "step": 148 + }, + { + "epoch": 0.07946666666666667, + "grad_norm": 0.5601388439895788, + "learning_rate": 0.0001987389197372567, + "loss": 0.8726, + "step": 149 + }, + { + "epoch": 0.08, + "grad_norm": 0.4531004220034202, + "learning_rate": 0.00019871141530411853, + "loss": 0.8086, + "step": 150 + }, + { + "epoch": 0.08053333333333333, + "grad_norm": 0.5684981456905152, + "learning_rate": 0.00019868361610371097, + "loss": 0.9698, + "step": 151 + }, + { + "epoch": 0.08106666666666666, + "grad_norm": 0.5671637065437304, + "learning_rate": 0.00019865552221904665, + "loss": 0.7799, + "step": 152 + }, + { + "epoch": 0.0816, + "grad_norm": 0.6834264329339218, + "learning_rate": 0.0001986271337340182, + "loss": 1.0069, + "step": 153 + }, + { + "epoch": 0.08213333333333334, + "grad_norm": 0.5939675059955585, + "learning_rate": 0.00019859845073339787, + "loss": 0.8895, + "step": 154 + }, + { + "epoch": 0.08266666666666667, + "grad_norm": 0.5067830181763635, + "learning_rate": 0.00019856947330283752, + "loss": 0.8352, + "step": 155 + }, + { + "epoch": 0.0832, + "grad_norm": 0.5755168088159903, + "learning_rate": 0.00019854020152886814, + "loss": 0.857, + "step": 156 + }, + { + "epoch": 0.08373333333333334, + "grad_norm": 0.5914189367976758, + "learning_rate": 0.0001985106354988997, + "loss": 0.8485, + "step": 157 + }, + { + "epoch": 0.08426666666666667, + "grad_norm": 0.5161594091088438, + "learning_rate": 0.00019848077530122083, + "loss": 0.9185, + "step": 158 + }, + { + "epoch": 0.0848, + "grad_norm": 0.7476903413837354, + "learning_rate": 0.0001984506210249986, + "loss": 0.9803, + "step": 159 + }, + { + "epoch": 0.08533333333333333, + "grad_norm": 0.5232836284681833, + "learning_rate": 0.00019842017276027832, + "loss": 0.8095, + "step": 160 + }, + { + "epoch": 0.08586666666666666, + "grad_norm": 0.5151456889733076, + "learning_rate": 0.00019838943059798304, + "loss": 0.8796, + "step": 161 + }, + { + "epoch": 0.0864, + "grad_norm": 0.46469856502654683, + "learning_rate": 0.00019835839462991361, + "loss": 0.7858, + "step": 162 + }, + { + "epoch": 0.08693333333333333, + "grad_norm": 0.5299216119887092, + "learning_rate": 0.0001983270649487481, + "loss": 0.8212, + "step": 163 + }, + { + "epoch": 0.08746666666666666, + "grad_norm": 0.5053108286419691, + "learning_rate": 0.0001982954416480417, + "loss": 0.7556, + "step": 164 + }, + { + "epoch": 0.088, + "grad_norm": 0.5808418181585019, + "learning_rate": 0.00019826352482222638, + "loss": 0.862, + "step": 165 + }, + { + "epoch": 0.08853333333333334, + "grad_norm": 0.5585096838497986, + "learning_rate": 0.00019823131456661063, + "loss": 0.8968, + "step": 166 + }, + { + "epoch": 0.08906666666666667, + "grad_norm": 0.5119764000821178, + "learning_rate": 0.00019819881097737915, + "loss": 0.8536, + "step": 167 + }, + { + "epoch": 0.0896, + "grad_norm": 0.5870047182817826, + "learning_rate": 0.00019816601415159263, + "loss": 0.8515, + "step": 168 + }, + { + "epoch": 0.09013333333333333, + "grad_norm": 0.7397478155545474, + "learning_rate": 0.00019813292418718732, + "loss": 1.097, + "step": 169 + }, + { + "epoch": 0.09066666666666667, + "grad_norm": 0.4864276836247347, + "learning_rate": 0.0001980995411829749, + "loss": 0.8035, + "step": 170 + }, + { + "epoch": 0.0912, + "grad_norm": 0.5744790894501604, + "learning_rate": 0.0001980658652386421, + "loss": 0.929, + "step": 171 + }, + { + "epoch": 0.09173333333333333, + "grad_norm": 0.5799086997777948, + "learning_rate": 0.0001980318964547504, + "loss": 0.8759, + "step": 172 + }, + { + "epoch": 0.09226666666666666, + "grad_norm": 0.625810635582127, + "learning_rate": 0.0001979976349327357, + "loss": 0.9066, + "step": 173 + }, + { + "epoch": 0.0928, + "grad_norm": 0.6912852183983189, + "learning_rate": 0.00019796308077490817, + "loss": 0.9587, + "step": 174 + }, + { + "epoch": 0.09333333333333334, + "grad_norm": 0.5504811720653686, + "learning_rate": 0.00019792823408445174, + "loss": 0.8924, + "step": 175 + }, + { + "epoch": 0.09386666666666667, + "grad_norm": 0.5509328554394032, + "learning_rate": 0.0001978930949654239, + "loss": 0.8583, + "step": 176 + }, + { + "epoch": 0.0944, + "grad_norm": 0.8655289083781732, + "learning_rate": 0.00019785766352275542, + "loss": 1.0595, + "step": 177 + }, + { + "epoch": 0.09493333333333333, + "grad_norm": 0.7231047016289638, + "learning_rate": 0.00019782193986224995, + "loss": 1.0298, + "step": 178 + }, + { + "epoch": 0.09546666666666667, + "grad_norm": 0.5661017436347039, + "learning_rate": 0.00019778592409058378, + "loss": 0.8955, + "step": 179 + }, + { + "epoch": 0.096, + "grad_norm": 0.7547569870104339, + "learning_rate": 0.00019774961631530545, + "loss": 0.9419, + "step": 180 + }, + { + "epoch": 0.09653333333333333, + "grad_norm": 0.4770486725682064, + "learning_rate": 0.0001977130166448355, + "loss": 0.8131, + "step": 181 + }, + { + "epoch": 0.09706666666666666, + "grad_norm": 0.6187581030776353, + "learning_rate": 0.00019767612518846608, + "loss": 0.8965, + "step": 182 + }, + { + "epoch": 0.0976, + "grad_norm": 0.5163544468105237, + "learning_rate": 0.00019763894205636072, + "loss": 0.8454, + "step": 183 + }, + { + "epoch": 0.09813333333333334, + "grad_norm": 0.5793645359884587, + "learning_rate": 0.00019760146735955388, + "loss": 0.7825, + "step": 184 + }, + { + "epoch": 0.09866666666666667, + "grad_norm": 0.5436027110758196, + "learning_rate": 0.00019756370120995066, + "loss": 0.8983, + "step": 185 + }, + { + "epoch": 0.0992, + "grad_norm": 0.5047471384761516, + "learning_rate": 0.00019752564372032657, + "loss": 0.864, + "step": 186 + }, + { + "epoch": 0.09973333333333333, + "grad_norm": 0.6023418503550535, + "learning_rate": 0.000197487295004327, + "loss": 0.8629, + "step": 187 + }, + { + "epoch": 0.10026666666666667, + "grad_norm": 0.6195352633209115, + "learning_rate": 0.00019744865517646706, + "loss": 0.7857, + "step": 188 + }, + { + "epoch": 0.1008, + "grad_norm": 0.6361948905586187, + "learning_rate": 0.00019740972435213115, + "loss": 0.8491, + "step": 189 + }, + { + "epoch": 0.10133333333333333, + "grad_norm": 0.7444682066816518, + "learning_rate": 0.0001973705026475726, + "loss": 0.9496, + "step": 190 + }, + { + "epoch": 0.10186666666666666, + "grad_norm": 0.686742446118089, + "learning_rate": 0.00019733099017991341, + "loss": 0.9093, + "step": 191 + }, + { + "epoch": 0.1024, + "grad_norm": 0.5113379352699342, + "learning_rate": 0.00019729118706714375, + "loss": 0.7989, + "step": 192 + }, + { + "epoch": 0.10293333333333334, + "grad_norm": 0.5049749919705699, + "learning_rate": 0.0001972510934281218, + "loss": 0.8463, + "step": 193 + }, + { + "epoch": 0.10346666666666667, + "grad_norm": 0.47457260638752874, + "learning_rate": 0.00019721070938257324, + "loss": 0.8252, + "step": 194 + }, + { + "epoch": 0.104, + "grad_norm": 0.548001405270674, + "learning_rate": 0.00019717003505109095, + "loss": 0.8478, + "step": 195 + }, + { + "epoch": 0.10453333333333334, + "grad_norm": 0.5765836534456652, + "learning_rate": 0.0001971290705551347, + "loss": 0.9433, + "step": 196 + }, + { + "epoch": 0.10506666666666667, + "grad_norm": 0.5510096349707276, + "learning_rate": 0.00019708781601703065, + "loss": 0.7919, + "step": 197 + }, + { + "epoch": 0.1056, + "grad_norm": 0.47654226994156895, + "learning_rate": 0.00019704627155997108, + "loss": 0.7935, + "step": 198 + }, + { + "epoch": 0.10613333333333333, + "grad_norm": 0.650131696544161, + "learning_rate": 0.00019700443730801413, + "loss": 0.9422, + "step": 199 + }, + { + "epoch": 0.10666666666666667, + "grad_norm": 0.6350761406004101, + "learning_rate": 0.00019696231338608316, + "loss": 0.9221, + "step": 200 + }, + { + "epoch": 0.1072, + "grad_norm": 0.45138773459013287, + "learning_rate": 0.00019691989991996663, + "loss": 0.7349, + "step": 201 + }, + { + "epoch": 0.10773333333333333, + "grad_norm": 0.4085582171559744, + "learning_rate": 0.00019687719703631755, + "loss": 0.724, + "step": 202 + }, + { + "epoch": 0.10826666666666666, + "grad_norm": 0.5614815222053043, + "learning_rate": 0.00019683420486265327, + "loss": 0.8807, + "step": 203 + }, + { + "epoch": 0.1088, + "grad_norm": 0.6353069654484065, + "learning_rate": 0.0001967909235273549, + "loss": 0.9196, + "step": 204 + }, + { + "epoch": 0.10933333333333334, + "grad_norm": 0.5493836037313905, + "learning_rate": 0.0001967473531596671, + "loss": 0.8658, + "step": 205 + }, + { + "epoch": 0.10986666666666667, + "grad_norm": 0.6096807278667714, + "learning_rate": 0.0001967034938896976, + "loss": 0.8914, + "step": 206 + }, + { + "epoch": 0.1104, + "grad_norm": 0.5877456530312468, + "learning_rate": 0.00019665934584841682, + "loss": 0.9244, + "step": 207 + }, + { + "epoch": 0.11093333333333333, + "grad_norm": 0.7394221123604389, + "learning_rate": 0.0001966149091676575, + "loss": 1.0139, + "step": 208 + }, + { + "epoch": 0.11146666666666667, + "grad_norm": 0.47483691931477484, + "learning_rate": 0.00019657018398011434, + "loss": 0.7767, + "step": 209 + }, + { + "epoch": 0.112, + "grad_norm": 0.5337733477737856, + "learning_rate": 0.00019652517041934356, + "loss": 0.8779, + "step": 210 + }, + { + "epoch": 0.11253333333333333, + "grad_norm": 0.5551043332116106, + "learning_rate": 0.00019647986861976246, + "loss": 0.8515, + "step": 211 + }, + { + "epoch": 0.11306666666666666, + "grad_norm": 0.5753532122700535, + "learning_rate": 0.0001964342787166491, + "loss": 0.8425, + "step": 212 + }, + { + "epoch": 0.1136, + "grad_norm": 0.5515137414835763, + "learning_rate": 0.00019638840084614182, + "loss": 0.8717, + "step": 213 + }, + { + "epoch": 0.11413333333333334, + "grad_norm": 0.5989747854092063, + "learning_rate": 0.0001963422351452389, + "loss": 0.8948, + "step": 214 + }, + { + "epoch": 0.11466666666666667, + "grad_norm": 0.5690350553956068, + "learning_rate": 0.0001962957817517982, + "loss": 0.9121, + "step": 215 + }, + { + "epoch": 0.1152, + "grad_norm": 0.535263534612997, + "learning_rate": 0.00019624904080453655, + "loss": 0.7511, + "step": 216 + }, + { + "epoch": 0.11573333333333333, + "grad_norm": 0.5367622148610552, + "learning_rate": 0.00019620201244302952, + "loss": 0.8927, + "step": 217 + }, + { + "epoch": 0.11626666666666667, + "grad_norm": 0.4765297941650462, + "learning_rate": 0.00019615469680771096, + "loss": 0.8122, + "step": 218 + }, + { + "epoch": 0.1168, + "grad_norm": 0.46220681275412034, + "learning_rate": 0.00019610709403987246, + "loss": 0.7763, + "step": 219 + }, + { + "epoch": 0.11733333333333333, + "grad_norm": 0.5408068249788149, + "learning_rate": 0.00019605920428166323, + "loss": 0.9132, + "step": 220 + }, + { + "epoch": 0.11786666666666666, + "grad_norm": 0.4997243415023694, + "learning_rate": 0.00019601102767608923, + "loss": 0.8844, + "step": 221 + }, + { + "epoch": 0.1184, + "grad_norm": 0.5151314108392654, + "learning_rate": 0.00019596256436701324, + "loss": 0.7692, + "step": 222 + }, + { + "epoch": 0.11893333333333334, + "grad_norm": 0.611846646200774, + "learning_rate": 0.00019591381449915397, + "loss": 0.9588, + "step": 223 + }, + { + "epoch": 0.11946666666666667, + "grad_norm": 0.6200993223544174, + "learning_rate": 0.00019586477821808597, + "loss": 0.9916, + "step": 224 + }, + { + "epoch": 0.12, + "grad_norm": 0.5448278768368114, + "learning_rate": 0.000195815455670239, + "loss": 0.7907, + "step": 225 + }, + { + "epoch": 0.12053333333333334, + "grad_norm": 0.5276761208069791, + "learning_rate": 0.00019576584700289768, + "loss": 0.7751, + "step": 226 + }, + { + "epoch": 0.12106666666666667, + "grad_norm": 0.5080078311818583, + "learning_rate": 0.00019571595236420102, + "loss": 0.8814, + "step": 227 + }, + { + "epoch": 0.1216, + "grad_norm": 0.49117795723543123, + "learning_rate": 0.00019566577190314197, + "loss": 0.8158, + "step": 228 + }, + { + "epoch": 0.12213333333333333, + "grad_norm": 0.562514031292076, + "learning_rate": 0.00019561530576956703, + "loss": 0.8298, + "step": 229 + }, + { + "epoch": 0.12266666666666666, + "grad_norm": 0.5797678566569934, + "learning_rate": 0.00019556455411417573, + "loss": 0.8828, + "step": 230 + }, + { + "epoch": 0.1232, + "grad_norm": 0.6577166703095476, + "learning_rate": 0.0001955135170885202, + "loss": 0.9074, + "step": 231 + }, + { + "epoch": 0.12373333333333333, + "grad_norm": 0.48373898945054816, + "learning_rate": 0.00019546219484500475, + "loss": 0.7984, + "step": 232 + }, + { + "epoch": 0.12426666666666666, + "grad_norm": 0.5307741985891761, + "learning_rate": 0.00019541058753688538, + "loss": 0.8549, + "step": 233 + }, + { + "epoch": 0.1248, + "grad_norm": 0.6142434497799985, + "learning_rate": 0.00019535869531826937, + "loss": 0.9496, + "step": 234 + }, + { + "epoch": 0.12533333333333332, + "grad_norm": 0.5938697329821294, + "learning_rate": 0.00019530651834411474, + "loss": 0.9931, + "step": 235 + }, + { + "epoch": 0.12586666666666665, + "grad_norm": 0.658751929512093, + "learning_rate": 0.00019525405677022989, + "loss": 0.9422, + "step": 236 + }, + { + "epoch": 0.1264, + "grad_norm": 0.5596776517350291, + "learning_rate": 0.00019520131075327298, + "loss": 0.9411, + "step": 237 + }, + { + "epoch": 0.12693333333333334, + "grad_norm": 0.5238208832588955, + "learning_rate": 0.0001951482804507517, + "loss": 0.8706, + "step": 238 + }, + { + "epoch": 0.12746666666666667, + "grad_norm": 0.4895294860974578, + "learning_rate": 0.00019509496602102252, + "loss": 0.7881, + "step": 239 + }, + { + "epoch": 0.128, + "grad_norm": 0.6272015233556667, + "learning_rate": 0.00019504136762329047, + "loss": 0.7681, + "step": 240 + }, + { + "epoch": 0.12853333333333333, + "grad_norm": 0.45760205070554244, + "learning_rate": 0.00019498748541760846, + "loss": 0.8156, + "step": 241 + }, + { + "epoch": 0.12906666666666666, + "grad_norm": 0.4664629500963827, + "learning_rate": 0.0001949333195648769, + "loss": 0.8131, + "step": 242 + }, + { + "epoch": 0.1296, + "grad_norm": 0.5458988273906292, + "learning_rate": 0.00019487887022684336, + "loss": 0.8886, + "step": 243 + }, + { + "epoch": 0.13013333333333332, + "grad_norm": 0.5440558602615054, + "learning_rate": 0.00019482413756610173, + "loss": 0.8039, + "step": 244 + }, + { + "epoch": 0.13066666666666665, + "grad_norm": 0.6619631460221503, + "learning_rate": 0.0001947691217460921, + "loss": 0.9791, + "step": 245 + }, + { + "epoch": 0.1312, + "grad_norm": 0.5381141507925012, + "learning_rate": 0.00019471382293110003, + "loss": 0.8854, + "step": 246 + }, + { + "epoch": 0.13173333333333334, + "grad_norm": 0.5664167012076495, + "learning_rate": 0.00019465824128625617, + "loss": 0.9155, + "step": 247 + }, + { + "epoch": 0.13226666666666667, + "grad_norm": 0.541046429290967, + "learning_rate": 0.00019460237697753577, + "loss": 0.8782, + "step": 248 + }, + { + "epoch": 0.1328, + "grad_norm": 0.5453642655002823, + "learning_rate": 0.00019454623017175812, + "loss": 0.9673, + "step": 249 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 0.5697833022573667, + "learning_rate": 0.00019448980103658613, + "loss": 0.9215, + "step": 250 + }, + { + "epoch": 0.13386666666666666, + "grad_norm": 0.47879164860636253, + "learning_rate": 0.0001944330897405257, + "loss": 0.843, + "step": 251 + }, + { + "epoch": 0.1344, + "grad_norm": 0.6495951621386878, + "learning_rate": 0.00019437609645292546, + "loss": 0.9015, + "step": 252 + }, + { + "epoch": 0.13493333333333332, + "grad_norm": 0.4636656153460495, + "learning_rate": 0.00019431882134397598, + "loss": 0.8059, + "step": 253 + }, + { + "epoch": 0.13546666666666668, + "grad_norm": 0.6725516324126248, + "learning_rate": 0.00019426126458470936, + "loss": 0.8878, + "step": 254 + }, + { + "epoch": 0.136, + "grad_norm": 0.562661589535857, + "learning_rate": 0.0001942034263469989, + "loss": 0.8702, + "step": 255 + }, + { + "epoch": 0.13653333333333334, + "grad_norm": 0.6505106114904452, + "learning_rate": 0.00019414530680355837, + "loss": 1.0138, + "step": 256 + }, + { + "epoch": 0.13706666666666667, + "grad_norm": 0.4574309778395966, + "learning_rate": 0.00019408690612794148, + "loss": 0.7697, + "step": 257 + }, + { + "epoch": 0.1376, + "grad_norm": 0.554929549141918, + "learning_rate": 0.00019402822449454153, + "loss": 0.8963, + "step": 258 + }, + { + "epoch": 0.13813333333333333, + "grad_norm": 0.6040855008818894, + "learning_rate": 0.00019396926207859084, + "loss": 0.9157, + "step": 259 + }, + { + "epoch": 0.13866666666666666, + "grad_norm": 0.5905603233723661, + "learning_rate": 0.0001939100190561601, + "loss": 0.8624, + "step": 260 + }, + { + "epoch": 0.1392, + "grad_norm": 0.6270065120847444, + "learning_rate": 0.00019385049560415794, + "loss": 0.9767, + "step": 261 + }, + { + "epoch": 0.13973333333333332, + "grad_norm": 0.5514306602953485, + "learning_rate": 0.0001937906919003304, + "loss": 0.8357, + "step": 262 + }, + { + "epoch": 0.14026666666666668, + "grad_norm": 0.4374748974270059, + "learning_rate": 0.00019373060812326052, + "loss": 0.7702, + "step": 263 + }, + { + "epoch": 0.1408, + "grad_norm": 0.5204322990650728, + "learning_rate": 0.00019367024445236754, + "loss": 0.8523, + "step": 264 + }, + { + "epoch": 0.14133333333333334, + "grad_norm": 0.5033734052047955, + "learning_rate": 0.00019360960106790643, + "loss": 0.8098, + "step": 265 + }, + { + "epoch": 0.14186666666666667, + "grad_norm": 0.7935532163140239, + "learning_rate": 0.0001935486781509677, + "loss": 0.9774, + "step": 266 + }, + { + "epoch": 0.1424, + "grad_norm": 0.5412100673639471, + "learning_rate": 0.00019348747588347637, + "loss": 0.9287, + "step": 267 + }, + { + "epoch": 0.14293333333333333, + "grad_norm": 0.5219454497503629, + "learning_rate": 0.00019342599444819168, + "loss": 0.7984, + "step": 268 + }, + { + "epoch": 0.14346666666666666, + "grad_norm": 0.6061628378117733, + "learning_rate": 0.00019336423402870653, + "loss": 0.9205, + "step": 269 + }, + { + "epoch": 0.144, + "grad_norm": 0.5161486477334278, + "learning_rate": 0.00019330219480944694, + "loss": 0.7802, + "step": 270 + }, + { + "epoch": 0.14453333333333335, + "grad_norm": 0.9075205901276785, + "learning_rate": 0.0001932398769756714, + "loss": 1.1094, + "step": 271 + }, + { + "epoch": 0.14506666666666668, + "grad_norm": 0.5262421682921276, + "learning_rate": 0.0001931772807134704, + "loss": 0.843, + "step": 272 + }, + { + "epoch": 0.1456, + "grad_norm": 0.44567338043003074, + "learning_rate": 0.00019311440620976597, + "loss": 0.8223, + "step": 273 + }, + { + "epoch": 0.14613333333333334, + "grad_norm": 0.6767324917049274, + "learning_rate": 0.00019305125365231084, + "loss": 0.9738, + "step": 274 + }, + { + "epoch": 0.14666666666666667, + "grad_norm": 0.8764162817484429, + "learning_rate": 0.00019298782322968815, + "loss": 1.1272, + "step": 275 + }, + { + "epoch": 0.1472, + "grad_norm": 0.535796392615648, + "learning_rate": 0.0001929241151313108, + "loss": 0.8814, + "step": 276 + }, + { + "epoch": 0.14773333333333333, + "grad_norm": 0.628205380189567, + "learning_rate": 0.0001928601295474208, + "loss": 0.925, + "step": 277 + }, + { + "epoch": 0.14826666666666666, + "grad_norm": 0.6255289735241575, + "learning_rate": 0.00019279586666908884, + "loss": 0.9198, + "step": 278 + }, + { + "epoch": 0.1488, + "grad_norm": 0.6645268896556138, + "learning_rate": 0.00019273132668821364, + "loss": 0.86, + "step": 279 + }, + { + "epoch": 0.14933333333333335, + "grad_norm": 0.5935261617288775, + "learning_rate": 0.00019266650979752136, + "loss": 0.903, + "step": 280 + }, + { + "epoch": 0.14986666666666668, + "grad_norm": 0.5707801546251843, + "learning_rate": 0.00019260141619056507, + "loss": 0.8798, + "step": 281 + }, + { + "epoch": 0.1504, + "grad_norm": 0.4906465515099141, + "learning_rate": 0.00019253604606172417, + "loss": 0.772, + "step": 282 + }, + { + "epoch": 0.15093333333333334, + "grad_norm": 0.62594135393271, + "learning_rate": 0.0001924703996062038, + "loss": 0.9959, + "step": 283 + }, + { + "epoch": 0.15146666666666667, + "grad_norm": 0.585052245345864, + "learning_rate": 0.0001924044770200342, + "loss": 0.8952, + "step": 284 + }, + { + "epoch": 0.152, + "grad_norm": 0.4802588033010636, + "learning_rate": 0.00019233827850007027, + "loss": 0.7456, + "step": 285 + }, + { + "epoch": 0.15253333333333333, + "grad_norm": 0.6392170407300541, + "learning_rate": 0.0001922718042439908, + "loss": 0.9384, + "step": 286 + }, + { + "epoch": 0.15306666666666666, + "grad_norm": 0.4963931621308698, + "learning_rate": 0.000192205054450298, + "loss": 0.834, + "step": 287 + }, + { + "epoch": 0.1536, + "grad_norm": 0.5826452587701353, + "learning_rate": 0.00019213802931831696, + "loss": 0.8792, + "step": 288 + }, + { + "epoch": 0.15413333333333334, + "grad_norm": 0.5612821833468113, + "learning_rate": 0.00019207072904819486, + "loss": 0.8895, + "step": 289 + }, + { + "epoch": 0.15466666666666667, + "grad_norm": 0.4528717380241006, + "learning_rate": 0.00019200315384090044, + "loss": 0.8122, + "step": 290 + }, + { + "epoch": 0.1552, + "grad_norm": 0.5375315189417282, + "learning_rate": 0.00019193530389822363, + "loss": 0.9046, + "step": 291 + }, + { + "epoch": 0.15573333333333333, + "grad_norm": 0.5305721749962367, + "learning_rate": 0.00019186717942277462, + "loss": 0.8452, + "step": 292 + }, + { + "epoch": 0.15626666666666666, + "grad_norm": 0.5154352397321587, + "learning_rate": 0.00019179878061798347, + "loss": 0.8022, + "step": 293 + }, + { + "epoch": 0.1568, + "grad_norm": 0.5288507605466519, + "learning_rate": 0.00019173010768809933, + "loss": 0.9426, + "step": 294 + }, + { + "epoch": 0.15733333333333333, + "grad_norm": 0.5862423688214016, + "learning_rate": 0.00019166116083819002, + "loss": 0.9661, + "step": 295 + }, + { + "epoch": 0.15786666666666666, + "grad_norm": 0.45551657692460773, + "learning_rate": 0.00019159194027414128, + "loss": 0.8545, + "step": 296 + }, + { + "epoch": 0.1584, + "grad_norm": 0.4811193693958405, + "learning_rate": 0.0001915224462026563, + "loss": 0.7884, + "step": 297 + }, + { + "epoch": 0.15893333333333334, + "grad_norm": 0.5258397858079111, + "learning_rate": 0.00019145267883125482, + "loss": 0.867, + "step": 298 + }, + { + "epoch": 0.15946666666666667, + "grad_norm": 0.595162856717673, + "learning_rate": 0.00019138263836827288, + "loss": 0.831, + "step": 299 + }, + { + "epoch": 0.16, + "grad_norm": 0.5659986534420202, + "learning_rate": 0.00019131232502286188, + "loss": 0.7968, + "step": 300 + }, + { + "epoch": 0.16053333333333333, + "grad_norm": 0.5868058207947794, + "learning_rate": 0.00019124173900498818, + "loss": 0.9463, + "step": 301 + }, + { + "epoch": 0.16106666666666666, + "grad_norm": 0.549031007008094, + "learning_rate": 0.00019117088052543233, + "loss": 0.9249, + "step": 302 + }, + { + "epoch": 0.1616, + "grad_norm": 0.4388197544653247, + "learning_rate": 0.0001910997497957885, + "loss": 0.7764, + "step": 303 + }, + { + "epoch": 0.16213333333333332, + "grad_norm": 0.5052690162443814, + "learning_rate": 0.00019102834702846387, + "loss": 0.8578, + "step": 304 + }, + { + "epoch": 0.16266666666666665, + "grad_norm": 0.45750769573278793, + "learning_rate": 0.0001909566724366779, + "loss": 0.8042, + "step": 305 + }, + { + "epoch": 0.1632, + "grad_norm": 0.42333752935950547, + "learning_rate": 0.00019088472623446183, + "loss": 0.6729, + "step": 306 + }, + { + "epoch": 0.16373333333333334, + "grad_norm": 0.6361550342067476, + "learning_rate": 0.00019081250863665794, + "loss": 0.8955, + "step": 307 + }, + { + "epoch": 0.16426666666666667, + "grad_norm": 0.5838815775367465, + "learning_rate": 0.0001907400198589189, + "loss": 0.919, + "step": 308 + }, + { + "epoch": 0.1648, + "grad_norm": 0.47131326237580695, + "learning_rate": 0.00019066726011770726, + "loss": 0.8234, + "step": 309 + }, + { + "epoch": 0.16533333333333333, + "grad_norm": 0.4939472774275297, + "learning_rate": 0.00019059422963029464, + "loss": 0.8204, + "step": 310 + }, + { + "epoch": 0.16586666666666666, + "grad_norm": 0.4661191073013235, + "learning_rate": 0.0001905209286147611, + "loss": 0.8462, + "step": 311 + }, + { + "epoch": 0.1664, + "grad_norm": 0.5161016855471054, + "learning_rate": 0.0001904473572899947, + "loss": 0.8125, + "step": 312 + }, + { + "epoch": 0.16693333333333332, + "grad_norm": 0.6230917896093047, + "learning_rate": 0.0001903735158756905, + "loss": 0.8682, + "step": 313 + }, + { + "epoch": 0.16746666666666668, + "grad_norm": 0.5465885854054142, + "learning_rate": 0.0001902994045923502, + "loss": 0.8712, + "step": 314 + }, + { + "epoch": 0.168, + "grad_norm": 0.5838856241344086, + "learning_rate": 0.00019022502366128135, + "loss": 0.8742, + "step": 315 + }, + { + "epoch": 0.16853333333333334, + "grad_norm": 0.49070003117787664, + "learning_rate": 0.0001901503733045967, + "loss": 0.7377, + "step": 316 + }, + { + "epoch": 0.16906666666666667, + "grad_norm": 0.485262845687119, + "learning_rate": 0.00019007545374521355, + "loss": 0.8659, + "step": 317 + }, + { + "epoch": 0.1696, + "grad_norm": 0.4831813449914069, + "learning_rate": 0.00019000026520685302, + "loss": 0.8319, + "step": 318 + }, + { + "epoch": 0.17013333333333333, + "grad_norm": 0.4581343977513169, + "learning_rate": 0.00018992480791403958, + "loss": 0.7677, + "step": 319 + }, + { + "epoch": 0.17066666666666666, + "grad_norm": 0.5571624248436221, + "learning_rate": 0.0001898490820921001, + "loss": 0.7621, + "step": 320 + }, + { + "epoch": 0.1712, + "grad_norm": 0.5849752225132492, + "learning_rate": 0.0001897730879671634, + "loss": 0.818, + "step": 321 + }, + { + "epoch": 0.17173333333333332, + "grad_norm": 0.5608480928855067, + "learning_rate": 0.0001896968257661595, + "loss": 0.8772, + "step": 322 + }, + { + "epoch": 0.17226666666666668, + "grad_norm": 0.6465737882703233, + "learning_rate": 0.00018962029571681886, + "loss": 0.8579, + "step": 323 + }, + { + "epoch": 0.1728, + "grad_norm": 0.5953323704630802, + "learning_rate": 0.00018954349804767184, + "loss": 0.9377, + "step": 324 + }, + { + "epoch": 0.17333333333333334, + "grad_norm": 0.6159644380271597, + "learning_rate": 0.00018946643298804793, + "loss": 0.9118, + "step": 325 + }, + { + "epoch": 0.17386666666666667, + "grad_norm": 0.4360808267460942, + "learning_rate": 0.00018938910076807513, + "loss": 0.7541, + "step": 326 + }, + { + "epoch": 0.1744, + "grad_norm": 0.5513392870287326, + "learning_rate": 0.00018931150161867916, + "loss": 0.88, + "step": 327 + }, + { + "epoch": 0.17493333333333333, + "grad_norm": 0.5327690536560286, + "learning_rate": 0.0001892336357715829, + "loss": 0.8506, + "step": 328 + }, + { + "epoch": 0.17546666666666666, + "grad_norm": 0.4041086223362272, + "learning_rate": 0.0001891555034593055, + "loss": 0.8096, + "step": 329 + }, + { + "epoch": 0.176, + "grad_norm": 0.6228855606861909, + "learning_rate": 0.00018907710491516199, + "loss": 0.9362, + "step": 330 + }, + { + "epoch": 0.17653333333333332, + "grad_norm": 0.611510846634051, + "learning_rate": 0.00018899844037326225, + "loss": 0.8662, + "step": 331 + }, + { + "epoch": 0.17706666666666668, + "grad_norm": 0.48009499579003484, + "learning_rate": 0.0001889195100685106, + "loss": 0.7633, + "step": 332 + }, + { + "epoch": 0.1776, + "grad_norm": 0.5541681054693345, + "learning_rate": 0.0001888403142366049, + "loss": 0.8379, + "step": 333 + }, + { + "epoch": 0.17813333333333334, + "grad_norm": 0.5982894164030271, + "learning_rate": 0.00018876085311403593, + "loss": 0.9079, + "step": 334 + }, + { + "epoch": 0.17866666666666667, + "grad_norm": 0.5548055035975625, + "learning_rate": 0.00018868112693808665, + "loss": 0.7873, + "step": 335 + }, + { + "epoch": 0.1792, + "grad_norm": 0.4727454212792718, + "learning_rate": 0.00018860113594683148, + "loss": 0.7872, + "step": 336 + }, + { + "epoch": 0.17973333333333333, + "grad_norm": 0.5257309643502509, + "learning_rate": 0.00018852088037913577, + "loss": 0.8199, + "step": 337 + }, + { + "epoch": 0.18026666666666666, + "grad_norm": 0.5360988450390519, + "learning_rate": 0.0001884403604746547, + "loss": 0.8156, + "step": 338 + }, + { + "epoch": 0.1808, + "grad_norm": 0.4892954466148729, + "learning_rate": 0.00018835957647383303, + "loss": 0.811, + "step": 339 + }, + { + "epoch": 0.18133333333333335, + "grad_norm": 0.5316589543705368, + "learning_rate": 0.00018827852861790398, + "loss": 0.7961, + "step": 340 + }, + { + "epoch": 0.18186666666666668, + "grad_norm": 0.4370835791050171, + "learning_rate": 0.00018819721714888877, + "loss": 0.8066, + "step": 341 + }, + { + "epoch": 0.1824, + "grad_norm": 0.5555697668670965, + "learning_rate": 0.00018811564230959588, + "loss": 0.8289, + "step": 342 + }, + { + "epoch": 0.18293333333333334, + "grad_norm": 0.4958795650773353, + "learning_rate": 0.00018803380434362, + "loss": 0.8921, + "step": 343 + }, + { + "epoch": 0.18346666666666667, + "grad_norm": 0.4561120934623149, + "learning_rate": 0.0001879517034953418, + "loss": 0.7719, + "step": 344 + }, + { + "epoch": 0.184, + "grad_norm": 0.5831076390016625, + "learning_rate": 0.00018786934000992688, + "loss": 1.049, + "step": 345 + }, + { + "epoch": 0.18453333333333333, + "grad_norm": 0.5067384623030481, + "learning_rate": 0.00018778671413332513, + "loss": 0.7914, + "step": 346 + }, + { + "epoch": 0.18506666666666666, + "grad_norm": 0.564946409052129, + "learning_rate": 0.00018770382611226987, + "loss": 0.8551, + "step": 347 + }, + { + "epoch": 0.1856, + "grad_norm": 0.491576477038357, + "learning_rate": 0.00018762067619427746, + "loss": 0.79, + "step": 348 + }, + { + "epoch": 0.18613333333333335, + "grad_norm": 0.5572261155862809, + "learning_rate": 0.000187537264627646, + "loss": 0.8445, + "step": 349 + }, + { + "epoch": 0.18666666666666668, + "grad_norm": 0.5212090244254038, + "learning_rate": 0.00018745359166145523, + "loss": 0.9127, + "step": 350 + }, + { + "epoch": 0.1872, + "grad_norm": 0.4913901261761234, + "learning_rate": 0.00018736965754556528, + "loss": 0.7871, + "step": 351 + }, + { + "epoch": 0.18773333333333334, + "grad_norm": 0.5241268418892111, + "learning_rate": 0.00018728546253061614, + "loss": 0.7065, + "step": 352 + }, + { + "epoch": 0.18826666666666667, + "grad_norm": 0.42515391209663606, + "learning_rate": 0.00018720100686802694, + "loss": 0.7261, + "step": 353 + }, + { + "epoch": 0.1888, + "grad_norm": 0.6581467658741232, + "learning_rate": 0.00018711629080999504, + "loss": 0.9722, + "step": 354 + }, + { + "epoch": 0.18933333333333333, + "grad_norm": 0.5298306438086058, + "learning_rate": 0.00018703131460949554, + "loss": 0.9069, + "step": 355 + }, + { + "epoch": 0.18986666666666666, + "grad_norm": 0.46546361098263916, + "learning_rate": 0.0001869460785202802, + "loss": 0.7564, + "step": 356 + }, + { + "epoch": 0.1904, + "grad_norm": 0.485893984659004, + "learning_rate": 0.00018686058279687698, + "loss": 0.8271, + "step": 357 + }, + { + "epoch": 0.19093333333333334, + "grad_norm": 0.5144090274852071, + "learning_rate": 0.00018677482769458904, + "loss": 0.8359, + "step": 358 + }, + { + "epoch": 0.19146666666666667, + "grad_norm": 0.6949783760386742, + "learning_rate": 0.00018668881346949417, + "loss": 0.8699, + "step": 359 + }, + { + "epoch": 0.192, + "grad_norm": 0.6198190399280962, + "learning_rate": 0.00018660254037844388, + "loss": 0.8451, + "step": 360 + }, + { + "epoch": 0.19253333333333333, + "grad_norm": 0.4401101529728473, + "learning_rate": 0.00018651600867906272, + "loss": 0.7306, + "step": 361 + }, + { + "epoch": 0.19306666666666666, + "grad_norm": 0.6101096311868891, + "learning_rate": 0.00018642921862974742, + "loss": 0.8547, + "step": 362 + }, + { + "epoch": 0.1936, + "grad_norm": 0.5377151040001709, + "learning_rate": 0.00018634217048966637, + "loss": 0.817, + "step": 363 + }, + { + "epoch": 0.19413333333333332, + "grad_norm": 0.47658728416602897, + "learning_rate": 0.00018625486451875843, + "loss": 0.8621, + "step": 364 + }, + { + "epoch": 0.19466666666666665, + "grad_norm": 0.579309769307992, + "learning_rate": 0.0001861673009777325, + "loss": 0.8925, + "step": 365 + }, + { + "epoch": 0.1952, + "grad_norm": 0.5376705048519224, + "learning_rate": 0.0001860794801280666, + "loss": 0.867, + "step": 366 + }, + { + "epoch": 0.19573333333333334, + "grad_norm": 0.512744799308176, + "learning_rate": 0.00018599140223200716, + "loss": 0.7927, + "step": 367 + }, + { + "epoch": 0.19626666666666667, + "grad_norm": 0.5994810647777205, + "learning_rate": 0.0001859030675525681, + "loss": 0.8483, + "step": 368 + }, + { + "epoch": 0.1968, + "grad_norm": 0.44422235051039305, + "learning_rate": 0.0001858144763535302, + "loss": 0.8125, + "step": 369 + }, + { + "epoch": 0.19733333333333333, + "grad_norm": 0.5180666707705607, + "learning_rate": 0.0001857256288994402, + "loss": 0.8957, + "step": 370 + }, + { + "epoch": 0.19786666666666666, + "grad_norm": 0.5524856689864757, + "learning_rate": 0.00018563652545561013, + "loss": 0.924, + "step": 371 + }, + { + "epoch": 0.1984, + "grad_norm": 0.4702109494579363, + "learning_rate": 0.0001855471662881164, + "loss": 0.7929, + "step": 372 + }, + { + "epoch": 0.19893333333333332, + "grad_norm": 0.5994746178925932, + "learning_rate": 0.000185457551663799, + "loss": 0.9002, + "step": 373 + }, + { + "epoch": 0.19946666666666665, + "grad_norm": 0.47361831549663996, + "learning_rate": 0.00018536768185026083, + "loss": 0.8271, + "step": 374 + }, + { + "epoch": 0.2, + "grad_norm": 0.5930536559375674, + "learning_rate": 0.00018527755711586678, + "loss": 0.8387, + "step": 375 + }, + { + "epoch": 0.20053333333333334, + "grad_norm": 0.5238430141281338, + "learning_rate": 0.00018518717772974302, + "loss": 0.8138, + "step": 376 + }, + { + "epoch": 0.20106666666666667, + "grad_norm": 0.506429838380469, + "learning_rate": 0.00018509654396177609, + "loss": 0.8263, + "step": 377 + }, + { + "epoch": 0.2016, + "grad_norm": 0.5794579155651455, + "learning_rate": 0.00018500565608261214, + "loss": 0.8536, + "step": 378 + }, + { + "epoch": 0.20213333333333333, + "grad_norm": 0.47506482339905026, + "learning_rate": 0.00018491451436365627, + "loss": 0.7853, + "step": 379 + }, + { + "epoch": 0.20266666666666666, + "grad_norm": 0.450027524333986, + "learning_rate": 0.0001848231190770714, + "loss": 0.7821, + "step": 380 + }, + { + "epoch": 0.2032, + "grad_norm": 0.557993144873831, + "learning_rate": 0.00018473147049577774, + "loss": 0.769, + "step": 381 + }, + { + "epoch": 0.20373333333333332, + "grad_norm": 0.4578599686187814, + "learning_rate": 0.00018463956889345194, + "loss": 0.8607, + "step": 382 + }, + { + "epoch": 0.20426666666666668, + "grad_norm": 0.5844386845197508, + "learning_rate": 0.00018454741454452603, + "loss": 0.8208, + "step": 383 + }, + { + "epoch": 0.2048, + "grad_norm": 0.5074206945564635, + "learning_rate": 0.00018445500772418697, + "loss": 0.78, + "step": 384 + }, + { + "epoch": 0.20533333333333334, + "grad_norm": 0.44395521429628143, + "learning_rate": 0.00018436234870837547, + "loss": 0.7179, + "step": 385 + }, + { + "epoch": 0.20586666666666667, + "grad_norm": 0.5100997833574008, + "learning_rate": 0.00018426943777378552, + "loss": 0.7766, + "step": 386 + }, + { + "epoch": 0.2064, + "grad_norm": 0.454280185635574, + "learning_rate": 0.00018417627519786315, + "loss": 0.8336, + "step": 387 + }, + { + "epoch": 0.20693333333333333, + "grad_norm": 0.5285292757673242, + "learning_rate": 0.00018408286125880604, + "loss": 0.8874, + "step": 388 + }, + { + "epoch": 0.20746666666666666, + "grad_norm": 0.6344098140860316, + "learning_rate": 0.00018398919623556238, + "loss": 0.9041, + "step": 389 + }, + { + "epoch": 0.208, + "grad_norm": 0.5149194288974933, + "learning_rate": 0.00018389528040783012, + "loss": 0.857, + "step": 390 + }, + { + "epoch": 0.20853333333333332, + "grad_norm": 0.5109486020277634, + "learning_rate": 0.0001838011140560562, + "loss": 0.8142, + "step": 391 + }, + { + "epoch": 0.20906666666666668, + "grad_norm": 0.5235105155371217, + "learning_rate": 0.00018370669746143564, + "loss": 0.7875, + "step": 392 + }, + { + "epoch": 0.2096, + "grad_norm": 0.5503602375551576, + "learning_rate": 0.00018361203090591071, + "loss": 0.8122, + "step": 393 + }, + { + "epoch": 0.21013333333333334, + "grad_norm": 0.5674118651197855, + "learning_rate": 0.0001835171146721701, + "loss": 0.7899, + "step": 394 + }, + { + "epoch": 0.21066666666666667, + "grad_norm": 0.530104587343037, + "learning_rate": 0.00018342194904364813, + "loss": 0.7918, + "step": 395 + }, + { + "epoch": 0.2112, + "grad_norm": 0.5115579238706, + "learning_rate": 0.00018332653430452376, + "loss": 0.9503, + "step": 396 + }, + { + "epoch": 0.21173333333333333, + "grad_norm": 0.6329129854824773, + "learning_rate": 0.00018323087073971993, + "loss": 0.8994, + "step": 397 + }, + { + "epoch": 0.21226666666666666, + "grad_norm": 0.5377736255259186, + "learning_rate": 0.00018313495863490258, + "loss": 0.8565, + "step": 398 + }, + { + "epoch": 0.2128, + "grad_norm": 0.5079375630365733, + "learning_rate": 0.00018303879827647975, + "loss": 0.9063, + "step": 399 + }, + { + "epoch": 0.21333333333333335, + "grad_norm": 0.4573476129721287, + "learning_rate": 0.00018294238995160094, + "loss": 0.8598, + "step": 400 + }, + { + "epoch": 0.21386666666666668, + "grad_norm": 0.49418943063174736, + "learning_rate": 0.00018284573394815597, + "loss": 0.7805, + "step": 401 + }, + { + "epoch": 0.2144, + "grad_norm": 0.47243497761678455, + "learning_rate": 0.00018274883055477436, + "loss": 0.8249, + "step": 402 + }, + { + "epoch": 0.21493333333333334, + "grad_norm": 0.4112186563303933, + "learning_rate": 0.00018265168006082437, + "loss": 0.7063, + "step": 403 + }, + { + "epoch": 0.21546666666666667, + "grad_norm": 0.4414313706621415, + "learning_rate": 0.00018255428275641214, + "loss": 0.7382, + "step": 404 + }, + { + "epoch": 0.216, + "grad_norm": 0.5761644030030776, + "learning_rate": 0.00018245663893238075, + "loss": 0.8861, + "step": 405 + }, + { + "epoch": 0.21653333333333333, + "grad_norm": 0.7557863909743763, + "learning_rate": 0.0001823587488803095, + "loss": 0.8796, + "step": 406 + }, + { + "epoch": 0.21706666666666666, + "grad_norm": 0.5402777514357807, + "learning_rate": 0.00018226061289251298, + "loss": 0.7842, + "step": 407 + }, + { + "epoch": 0.2176, + "grad_norm": 0.48727149899937433, + "learning_rate": 0.00018216223126204007, + "loss": 0.791, + "step": 408 + }, + { + "epoch": 0.21813333333333335, + "grad_norm": 0.44541974930624056, + "learning_rate": 0.00018206360428267332, + "loss": 0.7707, + "step": 409 + }, + { + "epoch": 0.21866666666666668, + "grad_norm": 0.580246214808751, + "learning_rate": 0.00018196473224892784, + "loss": 0.8541, + "step": 410 + }, + { + "epoch": 0.2192, + "grad_norm": 0.581307078232911, + "learning_rate": 0.00018186561545605054, + "loss": 0.8594, + "step": 411 + }, + { + "epoch": 0.21973333333333334, + "grad_norm": 0.5708358247918823, + "learning_rate": 0.0001817662542000192, + "loss": 0.9647, + "step": 412 + }, + { + "epoch": 0.22026666666666667, + "grad_norm": 0.4649649314795202, + "learning_rate": 0.0001816666487775416, + "loss": 0.7483, + "step": 413 + }, + { + "epoch": 0.2208, + "grad_norm": 0.774756832425865, + "learning_rate": 0.00018156679948605467, + "loss": 1.0104, + "step": 414 + }, + { + "epoch": 0.22133333333333333, + "grad_norm": 0.605587182068161, + "learning_rate": 0.00018146670662372354, + "loss": 0.8086, + "step": 415 + }, + { + "epoch": 0.22186666666666666, + "grad_norm": 0.5472845405240865, + "learning_rate": 0.0001813663704894407, + "loss": 0.8232, + "step": 416 + }, + { + "epoch": 0.2224, + "grad_norm": 0.5017489912343042, + "learning_rate": 0.00018126579138282503, + "loss": 0.8404, + "step": 417 + }, + { + "epoch": 0.22293333333333334, + "grad_norm": 0.49818526027683374, + "learning_rate": 0.00018116496960422107, + "loss": 0.8592, + "step": 418 + }, + { + "epoch": 0.22346666666666667, + "grad_norm": 0.689125839132402, + "learning_rate": 0.00018106390545469795, + "loss": 0.8359, + "step": 419 + }, + { + "epoch": 0.224, + "grad_norm": 0.6821308162692863, + "learning_rate": 0.0001809625992360485, + "loss": 0.8958, + "step": 420 + }, + { + "epoch": 0.22453333333333333, + "grad_norm": 0.5804857693615908, + "learning_rate": 0.00018086105125078857, + "loss": 0.9093, + "step": 421 + }, + { + "epoch": 0.22506666666666666, + "grad_norm": 0.46982462769170197, + "learning_rate": 0.00018075926180215576, + "loss": 0.792, + "step": 422 + }, + { + "epoch": 0.2256, + "grad_norm": 0.541140037074842, + "learning_rate": 0.00018065723119410884, + "loss": 0.7495, + "step": 423 + }, + { + "epoch": 0.22613333333333333, + "grad_norm": 0.622171168911818, + "learning_rate": 0.0001805549597313267, + "loss": 0.93, + "step": 424 + }, + { + "epoch": 0.22666666666666666, + "grad_norm": 0.5569050502367818, + "learning_rate": 0.0001804524477192075, + "loss": 0.928, + "step": 425 + }, + { + "epoch": 0.2272, + "grad_norm": 0.48875970515751554, + "learning_rate": 0.00018034969546386757, + "loss": 0.8371, + "step": 426 + }, + { + "epoch": 0.22773333333333334, + "grad_norm": 0.5755493350394935, + "learning_rate": 0.00018024670327214084, + "loss": 0.8447, + "step": 427 + }, + { + "epoch": 0.22826666666666667, + "grad_norm": 0.5583798722074151, + "learning_rate": 0.00018014347145157755, + "loss": 0.8703, + "step": 428 + }, + { + "epoch": 0.2288, + "grad_norm": 0.5206717758140702, + "learning_rate": 0.0001800400003104436, + "loss": 0.7646, + "step": 429 + }, + { + "epoch": 0.22933333333333333, + "grad_norm": 0.4885017057637796, + "learning_rate": 0.0001799362901577196, + "loss": 0.8477, + "step": 430 + }, + { + "epoch": 0.22986666666666666, + "grad_norm": 0.5604629624899274, + "learning_rate": 0.00017983234130309968, + "loss": 0.9046, + "step": 431 + }, + { + "epoch": 0.2304, + "grad_norm": 0.464216914383139, + "learning_rate": 0.00017972815405699103, + "loss": 0.7891, + "step": 432 + }, + { + "epoch": 0.23093333333333332, + "grad_norm": 0.5746567610258846, + "learning_rate": 0.00017962372873051252, + "loss": 0.9133, + "step": 433 + }, + { + "epoch": 0.23146666666666665, + "grad_norm": 0.42200089908841276, + "learning_rate": 0.00017951906563549397, + "loss": 0.7272, + "step": 434 + }, + { + "epoch": 0.232, + "grad_norm": 0.6165414978826635, + "learning_rate": 0.00017941416508447536, + "loss": 0.938, + "step": 435 + }, + { + "epoch": 0.23253333333333334, + "grad_norm": 0.4702346524928013, + "learning_rate": 0.00017930902739070562, + "loss": 0.7541, + "step": 436 + }, + { + "epoch": 0.23306666666666667, + "grad_norm": 0.4727315201073824, + "learning_rate": 0.00017920365286814183, + "loss": 0.6898, + "step": 437 + }, + { + "epoch": 0.2336, + "grad_norm": 0.4299838281148631, + "learning_rate": 0.0001790980418314484, + "loss": 0.758, + "step": 438 + }, + { + "epoch": 0.23413333333333333, + "grad_norm": 0.44337457223936744, + "learning_rate": 0.0001789921945959958, + "loss": 0.7556, + "step": 439 + }, + { + "epoch": 0.23466666666666666, + "grad_norm": 0.5329729018475009, + "learning_rate": 0.00017888611147786002, + "loss": 0.864, + "step": 440 + }, + { + "epoch": 0.2352, + "grad_norm": 0.5705008239613129, + "learning_rate": 0.00017877979279382135, + "loss": 0.8289, + "step": 441 + }, + { + "epoch": 0.23573333333333332, + "grad_norm": 0.47263183096368583, + "learning_rate": 0.00017867323886136348, + "loss": 0.7253, + "step": 442 + }, + { + "epoch": 0.23626666666666668, + "grad_norm": 0.45304333722800333, + "learning_rate": 0.00017856644999867264, + "loss": 0.7468, + "step": 443 + }, + { + "epoch": 0.2368, + "grad_norm": 0.5060938939140778, + "learning_rate": 0.0001784594265246366, + "loss": 0.7394, + "step": 444 + }, + { + "epoch": 0.23733333333333334, + "grad_norm": 0.6564144054411251, + "learning_rate": 0.00017835216875884368, + "loss": 0.9074, + "step": 445 + }, + { + "epoch": 0.23786666666666667, + "grad_norm": 0.42445989401827156, + "learning_rate": 0.0001782446770215819, + "loss": 0.739, + "step": 446 + }, + { + "epoch": 0.2384, + "grad_norm": 0.5315627798908884, + "learning_rate": 0.0001781369516338378, + "loss": 0.7656, + "step": 447 + }, + { + "epoch": 0.23893333333333333, + "grad_norm": 0.4688436730717214, + "learning_rate": 0.00017802899291729585, + "loss": 0.7674, + "step": 448 + }, + { + "epoch": 0.23946666666666666, + "grad_norm": 0.5810720136539039, + "learning_rate": 0.0001779208011943371, + "loss": 0.8692, + "step": 449 + }, + { + "epoch": 0.24, + "grad_norm": 0.5812458391708225, + "learning_rate": 0.00017781237678803847, + "loss": 0.8987, + "step": 450 + }, + { + "epoch": 0.24053333333333332, + "grad_norm": 0.4799125350467684, + "learning_rate": 0.00017770372002217172, + "loss": 0.7877, + "step": 451 + }, + { + "epoch": 0.24106666666666668, + "grad_norm": 0.6412843712952934, + "learning_rate": 0.00017759483122120238, + "loss": 0.9124, + "step": 452 + }, + { + "epoch": 0.2416, + "grad_norm": 0.4507351816970884, + "learning_rate": 0.000177485710710289, + "loss": 0.7917, + "step": 453 + }, + { + "epoch": 0.24213333333333334, + "grad_norm": 0.5232869881443669, + "learning_rate": 0.00017737635881528196, + "loss": 0.8898, + "step": 454 + }, + { + "epoch": 0.24266666666666667, + "grad_norm": 0.6678830672310265, + "learning_rate": 0.00017726677586272263, + "loss": 0.9137, + "step": 455 + }, + { + "epoch": 0.2432, + "grad_norm": 0.5801502567328287, + "learning_rate": 0.00017715696217984235, + "loss": 0.8537, + "step": 456 + }, + { + "epoch": 0.24373333333333333, + "grad_norm": 0.45841158112039687, + "learning_rate": 0.00017704691809456143, + "loss": 0.8383, + "step": 457 + }, + { + "epoch": 0.24426666666666666, + "grad_norm": 0.6025432050942682, + "learning_rate": 0.0001769366439354882, + "loss": 0.8565, + "step": 458 + }, + { + "epoch": 0.2448, + "grad_norm": 0.4970815494062268, + "learning_rate": 0.00017682614003191807, + "loss": 0.851, + "step": 459 + }, + { + "epoch": 0.24533333333333332, + "grad_norm": 0.4518959864049737, + "learning_rate": 0.00017671540671383243, + "loss": 0.7825, + "step": 460 + }, + { + "epoch": 0.24586666666666668, + "grad_norm": 0.46228300717288223, + "learning_rate": 0.0001766044443118978, + "loss": 0.7594, + "step": 461 + }, + { + "epoch": 0.2464, + "grad_norm": 0.4739273906053965, + "learning_rate": 0.00017649325315746478, + "loss": 0.7488, + "step": 462 + }, + { + "epoch": 0.24693333333333334, + "grad_norm": 0.5092330687540366, + "learning_rate": 0.00017638183358256696, + "loss": 0.9223, + "step": 463 + }, + { + "epoch": 0.24746666666666667, + "grad_norm": 0.5717462977076776, + "learning_rate": 0.00017627018591992018, + "loss": 0.8208, + "step": 464 + }, + { + "epoch": 0.248, + "grad_norm": 0.5863086266796643, + "learning_rate": 0.0001761583105029213, + "loss": 0.9333, + "step": 465 + }, + { + "epoch": 0.24853333333333333, + "grad_norm": 0.41277279746009543, + "learning_rate": 0.00017604620766564723, + "loss": 0.7436, + "step": 466 + }, + { + "epoch": 0.24906666666666666, + "grad_norm": 0.6106807286396574, + "learning_rate": 0.00017593387774285412, + "loss": 0.9381, + "step": 467 + }, + { + "epoch": 0.2496, + "grad_norm": 0.5677050966969265, + "learning_rate": 0.00017582132106997616, + "loss": 0.7547, + "step": 468 + }, + { + "epoch": 0.2501333333333333, + "grad_norm": 0.5833341849301831, + "learning_rate": 0.0001757085379831246, + "loss": 0.8697, + "step": 469 + }, + { + "epoch": 0.25066666666666665, + "grad_norm": 0.5290752412549475, + "learning_rate": 0.00017559552881908695, + "loss": 0.7354, + "step": 470 + }, + { + "epoch": 0.2512, + "grad_norm": 0.5779717687931067, + "learning_rate": 0.00017548229391532572, + "loss": 0.7323, + "step": 471 + }, + { + "epoch": 0.2517333333333333, + "grad_norm": 0.4392699173768945, + "learning_rate": 0.00017536883360997743, + "loss": 0.7517, + "step": 472 + }, + { + "epoch": 0.25226666666666664, + "grad_norm": 0.5508248857407402, + "learning_rate": 0.00017525514824185185, + "loss": 0.8888, + "step": 473 + }, + { + "epoch": 0.2528, + "grad_norm": 0.6781607961525851, + "learning_rate": 0.00017514123815043074, + "loss": 0.9684, + "step": 474 + }, + { + "epoch": 0.25333333333333335, + "grad_norm": 0.4791095538129747, + "learning_rate": 0.00017502710367586687, + "loss": 0.7642, + "step": 475 + }, + { + "epoch": 0.2538666666666667, + "grad_norm": 0.5166694348338413, + "learning_rate": 0.0001749127451589832, + "loss": 0.8259, + "step": 476 + }, + { + "epoch": 0.2544, + "grad_norm": 0.3489532858102896, + "learning_rate": 0.00017479816294127152, + "loss": 0.6568, + "step": 477 + }, + { + "epoch": 0.25493333333333335, + "grad_norm": 0.5245925867164768, + "learning_rate": 0.00017468335736489177, + "loss": 0.8743, + "step": 478 + }, + { + "epoch": 0.2554666666666667, + "grad_norm": 0.44304976380850064, + "learning_rate": 0.00017456832877267084, + "loss": 0.8157, + "step": 479 + }, + { + "epoch": 0.256, + "grad_norm": 0.42422297872920073, + "learning_rate": 0.0001744530775081015, + "loss": 0.7929, + "step": 480 + }, + { + "epoch": 0.25653333333333334, + "grad_norm": 0.7557404944246874, + "learning_rate": 0.00017433760391534167, + "loss": 0.9597, + "step": 481 + }, + { + "epoch": 0.25706666666666667, + "grad_norm": 0.48485914445666506, + "learning_rate": 0.00017422190833921283, + "loss": 0.739, + "step": 482 + }, + { + "epoch": 0.2576, + "grad_norm": 0.5039331423519677, + "learning_rate": 0.0001741059911251997, + "loss": 0.858, + "step": 483 + }, + { + "epoch": 0.2581333333333333, + "grad_norm": 0.7303548271504826, + "learning_rate": 0.00017398985261944856, + "loss": 0.8248, + "step": 484 + }, + { + "epoch": 0.25866666666666666, + "grad_norm": 0.6772369751239833, + "learning_rate": 0.00017387349316876666, + "loss": 0.8311, + "step": 485 + }, + { + "epoch": 0.2592, + "grad_norm": 0.5187610284586263, + "learning_rate": 0.000173756913120621, + "loss": 0.8564, + "step": 486 + }, + { + "epoch": 0.2597333333333333, + "grad_norm": 0.464388798089011, + "learning_rate": 0.0001736401128231373, + "loss": 0.7823, + "step": 487 + }, + { + "epoch": 0.26026666666666665, + "grad_norm": 0.5359802209397111, + "learning_rate": 0.00017352309262509894, + "loss": 0.8423, + "step": 488 + }, + { + "epoch": 0.2608, + "grad_norm": 0.556993993005146, + "learning_rate": 0.00017340585287594604, + "loss": 0.8595, + "step": 489 + }, + { + "epoch": 0.2613333333333333, + "grad_norm": 0.41901610477892903, + "learning_rate": 0.0001732883939257742, + "loss": 0.8083, + "step": 490 + }, + { + "epoch": 0.2618666666666667, + "grad_norm": 0.5102021415665858, + "learning_rate": 0.0001731707161253338, + "loss": 0.8268, + "step": 491 + }, + { + "epoch": 0.2624, + "grad_norm": 0.5754800536381837, + "learning_rate": 0.0001730528198260285, + "loss": 0.821, + "step": 492 + }, + { + "epoch": 0.26293333333333335, + "grad_norm": 0.48205165510631953, + "learning_rate": 0.00017293470537991463, + "loss": 0.8306, + "step": 493 + }, + { + "epoch": 0.2634666666666667, + "grad_norm": 0.5247804469877534, + "learning_rate": 0.00017281637313969978, + "loss": 0.7947, + "step": 494 + }, + { + "epoch": 0.264, + "grad_norm": 0.520546460368178, + "learning_rate": 0.00017269782345874203, + "loss": 0.809, + "step": 495 + }, + { + "epoch": 0.26453333333333334, + "grad_norm": 0.41373617382203987, + "learning_rate": 0.00017257905669104874, + "loss": 0.7432, + "step": 496 + }, + { + "epoch": 0.2650666666666667, + "grad_norm": 0.44134956144894705, + "learning_rate": 0.00017246007319127545, + "loss": 0.7787, + "step": 497 + }, + { + "epoch": 0.2656, + "grad_norm": 0.475468089448575, + "learning_rate": 0.00017234087331472497, + "loss": 0.7971, + "step": 498 + }, + { + "epoch": 0.26613333333333333, + "grad_norm": 0.6027268130260609, + "learning_rate": 0.00017222145741734626, + "loss": 0.8426, + "step": 499 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 0.46290862472857397, + "learning_rate": 0.00017210182585573327, + "loss": 0.7634, + "step": 500 + }, + { + "epoch": 0.2672, + "grad_norm": 0.566108444859895, + "learning_rate": 0.00017198197898712404, + "loss": 0.8173, + "step": 501 + }, + { + "epoch": 0.2677333333333333, + "grad_norm": 0.5507765253343672, + "learning_rate": 0.00017186191716939944, + "loss": 0.8316, + "step": 502 + }, + { + "epoch": 0.26826666666666665, + "grad_norm": 0.40469652703742315, + "learning_rate": 0.0001717416407610824, + "loss": 0.7054, + "step": 503 + }, + { + "epoch": 0.2688, + "grad_norm": 0.48157658854253194, + "learning_rate": 0.00017162115012133643, + "loss": 0.8114, + "step": 504 + }, + { + "epoch": 0.2693333333333333, + "grad_norm": 0.4801546623719645, + "learning_rate": 0.00017150044560996488, + "loss": 0.775, + "step": 505 + }, + { + "epoch": 0.26986666666666664, + "grad_norm": 0.4301332412661433, + "learning_rate": 0.00017137952758740978, + "loss": 0.7568, + "step": 506 + }, + { + "epoch": 0.2704, + "grad_norm": 0.5135811751297393, + "learning_rate": 0.00017125839641475072, + "loss": 0.844, + "step": 507 + }, + { + "epoch": 0.27093333333333336, + "grad_norm": 0.5221443518980844, + "learning_rate": 0.00017113705245370368, + "loss": 0.8096, + "step": 508 + }, + { + "epoch": 0.2714666666666667, + "grad_norm": 0.6576443077268289, + "learning_rate": 0.00017101549606662024, + "loss": 0.9185, + "step": 509 + }, + { + "epoch": 0.272, + "grad_norm": 0.6650147294272072, + "learning_rate": 0.00017089372761648616, + "loss": 0.8357, + "step": 510 + }, + { + "epoch": 0.27253333333333335, + "grad_norm": 0.5463577951199781, + "learning_rate": 0.00017077174746692056, + "loss": 0.8872, + "step": 511 + }, + { + "epoch": 0.2730666666666667, + "grad_norm": 0.4984353756271598, + "learning_rate": 0.00017064955598217462, + "loss": 0.7617, + "step": 512 + }, + { + "epoch": 0.2736, + "grad_norm": 0.4616081485174494, + "learning_rate": 0.00017052715352713075, + "loss": 0.7243, + "step": 513 + }, + { + "epoch": 0.27413333333333334, + "grad_norm": 0.5084453619371878, + "learning_rate": 0.00017040454046730115, + "loss": 0.7767, + "step": 514 + }, + { + "epoch": 0.27466666666666667, + "grad_norm": 0.626500513697084, + "learning_rate": 0.00017028171716882714, + "loss": 0.9369, + "step": 515 + }, + { + "epoch": 0.2752, + "grad_norm": 0.4529032013789942, + "learning_rate": 0.00017015868399847768, + "loss": 0.7285, + "step": 516 + }, + { + "epoch": 0.27573333333333333, + "grad_norm": 0.5515420159752935, + "learning_rate": 0.00017003544132364846, + "loss": 0.8331, + "step": 517 + }, + { + "epoch": 0.27626666666666666, + "grad_norm": 0.49116840088614994, + "learning_rate": 0.00016991198951236088, + "loss": 0.8529, + "step": 518 + }, + { + "epoch": 0.2768, + "grad_norm": 0.6553757063449491, + "learning_rate": 0.00016978832893326074, + "loss": 0.8676, + "step": 519 + }, + { + "epoch": 0.2773333333333333, + "grad_norm": 0.5824943501067116, + "learning_rate": 0.00016966445995561727, + "loss": 0.8433, + "step": 520 + }, + { + "epoch": 0.27786666666666665, + "grad_norm": 0.5497170047766405, + "learning_rate": 0.00016954038294932216, + "loss": 0.8439, + "step": 521 + }, + { + "epoch": 0.2784, + "grad_norm": 0.5140969276726812, + "learning_rate": 0.00016941609828488807, + "loss": 0.8009, + "step": 522 + }, + { + "epoch": 0.2789333333333333, + "grad_norm": 0.48686499946393014, + "learning_rate": 0.0001692916063334479, + "loss": 0.8416, + "step": 523 + }, + { + "epoch": 0.27946666666666664, + "grad_norm": 0.5168470295829324, + "learning_rate": 0.0001691669074667535, + "loss": 0.8766, + "step": 524 + }, + { + "epoch": 0.28, + "grad_norm": 0.6026993798612537, + "learning_rate": 0.0001690420020571747, + "loss": 0.8608, + "step": 525 + }, + { + "epoch": 0.28053333333333336, + "grad_norm": 0.48750661458077205, + "learning_rate": 0.0001689168904776979, + "loss": 0.8569, + "step": 526 + }, + { + "epoch": 0.2810666666666667, + "grad_norm": 0.5205322822608496, + "learning_rate": 0.00016879157310192535, + "loss": 0.8943, + "step": 527 + }, + { + "epoch": 0.2816, + "grad_norm": 0.45029603832584264, + "learning_rate": 0.0001686660503040737, + "loss": 0.7671, + "step": 528 + }, + { + "epoch": 0.28213333333333335, + "grad_norm": 0.7667010384373698, + "learning_rate": 0.00016854032245897308, + "loss": 0.936, + "step": 529 + }, + { + "epoch": 0.2826666666666667, + "grad_norm": 0.6808760932653368, + "learning_rate": 0.00016841438994206595, + "loss": 0.9906, + "step": 530 + }, + { + "epoch": 0.2832, + "grad_norm": 0.5107596500012404, + "learning_rate": 0.00016828825312940592, + "loss": 0.8291, + "step": 531 + }, + { + "epoch": 0.28373333333333334, + "grad_norm": 0.615128336771932, + "learning_rate": 0.00016816191239765667, + "loss": 0.8414, + "step": 532 + }, + { + "epoch": 0.28426666666666667, + "grad_norm": 0.4972186646105337, + "learning_rate": 0.00016803536812409075, + "loss": 0.7722, + "step": 533 + }, + { + "epoch": 0.2848, + "grad_norm": 0.5552812456187501, + "learning_rate": 0.0001679086206865886, + "loss": 0.8224, + "step": 534 + }, + { + "epoch": 0.2853333333333333, + "grad_norm": 0.5412211364871428, + "learning_rate": 0.00016778167046363734, + "loss": 0.864, + "step": 535 + }, + { + "epoch": 0.28586666666666666, + "grad_norm": 0.47978554124338063, + "learning_rate": 0.00016765451783432953, + "loss": 0.8055, + "step": 536 + }, + { + "epoch": 0.2864, + "grad_norm": 0.4962602604272398, + "learning_rate": 0.00016752716317836229, + "loss": 0.8053, + "step": 537 + }, + { + "epoch": 0.2869333333333333, + "grad_norm": 0.48090103519226757, + "learning_rate": 0.0001673996068760359, + "loss": 0.7141, + "step": 538 + }, + { + "epoch": 0.28746666666666665, + "grad_norm": 0.5189967882380148, + "learning_rate": 0.00016727184930825288, + "loss": 0.8366, + "step": 539 + }, + { + "epoch": 0.288, + "grad_norm": 0.46111912936807337, + "learning_rate": 0.0001671438908565167, + "loss": 0.785, + "step": 540 + }, + { + "epoch": 0.2885333333333333, + "grad_norm": 0.5539541504876097, + "learning_rate": 0.00016701573190293077, + "loss": 0.8262, + "step": 541 + }, + { + "epoch": 0.2890666666666667, + "grad_norm": 0.5247755483401154, + "learning_rate": 0.00016688737283019706, + "loss": 0.8989, + "step": 542 + }, + { + "epoch": 0.2896, + "grad_norm": 0.47242081364961513, + "learning_rate": 0.00016675881402161536, + "loss": 0.7879, + "step": 543 + }, + { + "epoch": 0.29013333333333335, + "grad_norm": 0.5696691564826674, + "learning_rate": 0.00016663005586108176, + "loss": 0.9036, + "step": 544 + }, + { + "epoch": 0.2906666666666667, + "grad_norm": 0.5620281936198633, + "learning_rate": 0.00016650109873308765, + "loss": 0.8152, + "step": 545 + }, + { + "epoch": 0.2912, + "grad_norm": 0.4672591331363155, + "learning_rate": 0.0001663719430227186, + "loss": 0.7788, + "step": 546 + }, + { + "epoch": 0.29173333333333334, + "grad_norm": 0.6262792002322631, + "learning_rate": 0.0001662425891156531, + "loss": 0.8908, + "step": 547 + }, + { + "epoch": 0.2922666666666667, + "grad_norm": 0.518269709369013, + "learning_rate": 0.00016611303739816168, + "loss": 0.8044, + "step": 548 + }, + { + "epoch": 0.2928, + "grad_norm": 0.39337241146210455, + "learning_rate": 0.00016598328825710533, + "loss": 0.6769, + "step": 549 + }, + { + "epoch": 0.29333333333333333, + "grad_norm": 0.5417690530616373, + "learning_rate": 0.00016585334207993476, + "loss": 0.848, + "step": 550 + }, + { + "epoch": 0.29386666666666666, + "grad_norm": 1.187369023455772, + "learning_rate": 0.00016572319925468892, + "loss": 0.8774, + "step": 551 + }, + { + "epoch": 0.2944, + "grad_norm": 0.5423071451639491, + "learning_rate": 0.000165592860169994, + "loss": 0.7612, + "step": 552 + }, + { + "epoch": 0.2949333333333333, + "grad_norm": 0.5084989532325404, + "learning_rate": 0.0001654623252150624, + "loss": 0.845, + "step": 553 + }, + { + "epoch": 0.29546666666666666, + "grad_norm": 0.5604399463617918, + "learning_rate": 0.00016533159477969122, + "loss": 0.7828, + "step": 554 + }, + { + "epoch": 0.296, + "grad_norm": 0.5264007894818428, + "learning_rate": 0.00016520066925426144, + "loss": 0.8529, + "step": 555 + }, + { + "epoch": 0.2965333333333333, + "grad_norm": 0.37714368324182157, + "learning_rate": 0.00016506954902973655, + "loss": 0.7372, + "step": 556 + }, + { + "epoch": 0.29706666666666665, + "grad_norm": 0.4490152569575573, + "learning_rate": 0.00016493823449766136, + "loss": 0.7629, + "step": 557 + }, + { + "epoch": 0.2976, + "grad_norm": 0.5242042096888933, + "learning_rate": 0.0001648067260501611, + "loss": 0.8096, + "step": 558 + }, + { + "epoch": 0.2981333333333333, + "grad_norm": 0.7189712158273117, + "learning_rate": 0.00016467502407993992, + "loss": 0.933, + "step": 559 + }, + { + "epoch": 0.2986666666666667, + "grad_norm": 0.5567176210351147, + "learning_rate": 0.0001645431289802799, + "loss": 0.8033, + "step": 560 + }, + { + "epoch": 0.2992, + "grad_norm": 0.6606946387299742, + "learning_rate": 0.0001644110411450398, + "loss": 0.8288, + "step": 561 + }, + { + "epoch": 0.29973333333333335, + "grad_norm": 0.5674771200484571, + "learning_rate": 0.00016427876096865394, + "loss": 0.9467, + "step": 562 + }, + { + "epoch": 0.3002666666666667, + "grad_norm": 0.4934615713701951, + "learning_rate": 0.00016414628884613107, + "loss": 0.8295, + "step": 563 + }, + { + "epoch": 0.3008, + "grad_norm": 0.6417839110950121, + "learning_rate": 0.00016401362517305296, + "loss": 0.818, + "step": 564 + }, + { + "epoch": 0.30133333333333334, + "grad_norm": 0.5258462179062289, + "learning_rate": 0.00016388077034557355, + "loss": 0.8301, + "step": 565 + }, + { + "epoch": 0.30186666666666667, + "grad_norm": 0.6337368767683507, + "learning_rate": 0.00016374772476041748, + "loss": 0.8649, + "step": 566 + }, + { + "epoch": 0.3024, + "grad_norm": 0.4598815827476441, + "learning_rate": 0.00016361448881487914, + "loss": 0.8378, + "step": 567 + }, + { + "epoch": 0.30293333333333333, + "grad_norm": 0.45326439680218955, + "learning_rate": 0.00016348106290682118, + "loss": 0.7619, + "step": 568 + }, + { + "epoch": 0.30346666666666666, + "grad_norm": 0.47063645637459545, + "learning_rate": 0.00016334744743467364, + "loss": 0.7845, + "step": 569 + }, + { + "epoch": 0.304, + "grad_norm": 0.4463096527646349, + "learning_rate": 0.00016321364279743266, + "loss": 0.7873, + "step": 570 + }, + { + "epoch": 0.3045333333333333, + "grad_norm": 0.5566967859146053, + "learning_rate": 0.00016307964939465914, + "loss": 0.8291, + "step": 571 + }, + { + "epoch": 0.30506666666666665, + "grad_norm": 0.5026777227425168, + "learning_rate": 0.00016294546762647775, + "loss": 0.8067, + "step": 572 + }, + { + "epoch": 0.3056, + "grad_norm": 0.45265094263269107, + "learning_rate": 0.0001628110978935756, + "loss": 0.7061, + "step": 573 + }, + { + "epoch": 0.3061333333333333, + "grad_norm": 0.518154942694519, + "learning_rate": 0.0001626765405972011, + "loss": 0.8052, + "step": 574 + }, + { + "epoch": 0.30666666666666664, + "grad_norm": 0.671606637480916, + "learning_rate": 0.00016254179613916278, + "loss": 0.8991, + "step": 575 + }, + { + "epoch": 0.3072, + "grad_norm": 0.4264578310653122, + "learning_rate": 0.00016240686492182804, + "loss": 0.7505, + "step": 576 + }, + { + "epoch": 0.30773333333333336, + "grad_norm": 0.5875387971076562, + "learning_rate": 0.000162271747348122, + "loss": 0.888, + "step": 577 + }, + { + "epoch": 0.3082666666666667, + "grad_norm": 0.6559006975934076, + "learning_rate": 0.0001621364438215262, + "loss": 0.995, + "step": 578 + }, + { + "epoch": 0.3088, + "grad_norm": 0.5510152039312369, + "learning_rate": 0.00016200095474607753, + "loss": 0.8637, + "step": 579 + }, + { + "epoch": 0.30933333333333335, + "grad_norm": 0.6094500810226675, + "learning_rate": 0.00016186528052636692, + "loss": 0.8572, + "step": 580 + }, + { + "epoch": 0.3098666666666667, + "grad_norm": 0.5620392199625313, + "learning_rate": 0.0001617294215675382, + "loss": 0.8427, + "step": 581 + }, + { + "epoch": 0.3104, + "grad_norm": 0.4799019641814468, + "learning_rate": 0.00016159337827528685, + "loss": 0.883, + "step": 582 + }, + { + "epoch": 0.31093333333333334, + "grad_norm": 0.6367837218131216, + "learning_rate": 0.0001614571510558588, + "loss": 0.8408, + "step": 583 + }, + { + "epoch": 0.31146666666666667, + "grad_norm": 0.46555585537157745, + "learning_rate": 0.00016132074031604917, + "loss": 0.7537, + "step": 584 + }, + { + "epoch": 0.312, + "grad_norm": 0.5013403235334765, + "learning_rate": 0.0001611841464632011, + "loss": 0.8318, + "step": 585 + }, + { + "epoch": 0.31253333333333333, + "grad_norm": 0.5632794940134113, + "learning_rate": 0.00016104736990520468, + "loss": 0.9249, + "step": 586 + }, + { + "epoch": 0.31306666666666666, + "grad_norm": 0.6051919782229207, + "learning_rate": 0.0001609104110504954, + "loss": 0.9439, + "step": 587 + }, + { + "epoch": 0.3136, + "grad_norm": 0.5289845126344039, + "learning_rate": 0.0001607732703080532, + "loss": 0.7709, + "step": 588 + }, + { + "epoch": 0.3141333333333333, + "grad_norm": 0.6319831806991704, + "learning_rate": 0.00016063594808740113, + "loss": 0.8988, + "step": 589 + }, + { + "epoch": 0.31466666666666665, + "grad_norm": 0.5270325735548156, + "learning_rate": 0.00016049844479860422, + "loss": 0.8349, + "step": 590 + }, + { + "epoch": 0.3152, + "grad_norm": 0.610773726540534, + "learning_rate": 0.00016036076085226814, + "loss": 0.9001, + "step": 591 + }, + { + "epoch": 0.3157333333333333, + "grad_norm": 0.5316834587247324, + "learning_rate": 0.00016022289665953808, + "loss": 0.7978, + "step": 592 + }, + { + "epoch": 0.31626666666666664, + "grad_norm": 0.5118701114233776, + "learning_rate": 0.00016008485263209742, + "loss": 0.7495, + "step": 593 + }, + { + "epoch": 0.3168, + "grad_norm": 0.6208567717250233, + "learning_rate": 0.0001599466291821666, + "loss": 0.9326, + "step": 594 + }, + { + "epoch": 0.31733333333333336, + "grad_norm": 0.5471864762151855, + "learning_rate": 0.0001598082267225018, + "loss": 0.8352, + "step": 595 + }, + { + "epoch": 0.3178666666666667, + "grad_norm": 0.5276315900469803, + "learning_rate": 0.0001596696456663938, + "loss": 0.7815, + "step": 596 + }, + { + "epoch": 0.3184, + "grad_norm": 0.4721223348277774, + "learning_rate": 0.0001595308864276666, + "loss": 0.7736, + "step": 597 + }, + { + "epoch": 0.31893333333333335, + "grad_norm": 0.46873833958214894, + "learning_rate": 0.00015939194942067646, + "loss": 0.7499, + "step": 598 + }, + { + "epoch": 0.3194666666666667, + "grad_norm": 0.529810506600649, + "learning_rate": 0.0001592528350603103, + "loss": 0.8143, + "step": 599 + }, + { + "epoch": 0.32, + "grad_norm": 0.44327375649757317, + "learning_rate": 0.0001591135437619847, + "loss": 0.8177, + "step": 600 + }, + { + "epoch": 0.32053333333333334, + "grad_norm": 0.46649303987907914, + "learning_rate": 0.00015897407594164467, + "loss": 0.737, + "step": 601 + }, + { + "epoch": 0.32106666666666667, + "grad_norm": 0.5574604609860693, + "learning_rate": 0.00015883443201576225, + "loss": 0.8278, + "step": 602 + }, + { + "epoch": 0.3216, + "grad_norm": 0.42663942892965007, + "learning_rate": 0.0001586946124013354, + "loss": 0.675, + "step": 603 + }, + { + "epoch": 0.3221333333333333, + "grad_norm": 0.45701736799284803, + "learning_rate": 0.00015855461751588677, + "loss": 0.7096, + "step": 604 + }, + { + "epoch": 0.32266666666666666, + "grad_norm": 0.4720496840763521, + "learning_rate": 0.0001584144477774623, + "loss": 0.7855, + "step": 605 + }, + { + "epoch": 0.3232, + "grad_norm": 0.5063091619828909, + "learning_rate": 0.0001582741036046301, + "loss": 0.8439, + "step": 606 + }, + { + "epoch": 0.3237333333333333, + "grad_norm": 0.49331665539251424, + "learning_rate": 0.00015813358541647915, + "loss": 0.8041, + "step": 607 + }, + { + "epoch": 0.32426666666666665, + "grad_norm": 0.6465649336061632, + "learning_rate": 0.00015799289363261813, + "loss": 0.9504, + "step": 608 + }, + { + "epoch": 0.3248, + "grad_norm": 0.49017445195332465, + "learning_rate": 0.00015785202867317407, + "loss": 0.8274, + "step": 609 + }, + { + "epoch": 0.3253333333333333, + "grad_norm": 0.5478699688037733, + "learning_rate": 0.00015771099095879108, + "loss": 0.8629, + "step": 610 + }, + { + "epoch": 0.3258666666666667, + "grad_norm": 0.532454452552899, + "learning_rate": 0.0001575697809106292, + "loss": 0.8421, + "step": 611 + }, + { + "epoch": 0.3264, + "grad_norm": 0.536805449004451, + "learning_rate": 0.00015742839895036305, + "loss": 0.7848, + "step": 612 + }, + { + "epoch": 0.32693333333333335, + "grad_norm": 0.5215712291897809, + "learning_rate": 0.00015728684550018064, + "loss": 0.8874, + "step": 613 + }, + { + "epoch": 0.3274666666666667, + "grad_norm": 0.5487771288459847, + "learning_rate": 0.0001571451209827821, + "loss": 0.8961, + "step": 614 + }, + { + "epoch": 0.328, + "grad_norm": 0.5116161086365358, + "learning_rate": 0.00015700322582137827, + "loss": 0.8396, + "step": 615 + }, + { + "epoch": 0.32853333333333334, + "grad_norm": 0.48924499411970224, + "learning_rate": 0.00015686116043968972, + "loss": 0.7564, + "step": 616 + }, + { + "epoch": 0.3290666666666667, + "grad_norm": 0.5364015158108658, + "learning_rate": 0.00015671892526194516, + "loss": 0.7626, + "step": 617 + }, + { + "epoch": 0.3296, + "grad_norm": 0.4663661313576595, + "learning_rate": 0.0001565765207128805, + "loss": 0.6755, + "step": 618 + }, + { + "epoch": 0.33013333333333333, + "grad_norm": 0.4761060519561345, + "learning_rate": 0.0001564339472177373, + "loss": 0.7709, + "step": 619 + }, + { + "epoch": 0.33066666666666666, + "grad_norm": 0.46693843751676056, + "learning_rate": 0.00015629120520226165, + "loss": 0.7818, + "step": 620 + }, + { + "epoch": 0.3312, + "grad_norm": 0.5503320372672844, + "learning_rate": 0.0001561482950927029, + "loss": 0.8936, + "step": 621 + }, + { + "epoch": 0.3317333333333333, + "grad_norm": 0.609742122470184, + "learning_rate": 0.0001560052173158123, + "loss": 0.8283, + "step": 622 + }, + { + "epoch": 0.33226666666666665, + "grad_norm": 0.4786888058526543, + "learning_rate": 0.00015586197229884184, + "loss": 0.7054, + "step": 623 + }, + { + "epoch": 0.3328, + "grad_norm": 0.5468961137967208, + "learning_rate": 0.00015571856046954285, + "loss": 0.8858, + "step": 624 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.5079561568537743, + "learning_rate": 0.00015557498225616487, + "loss": 0.7851, + "step": 625 + }, + { + "epoch": 0.33386666666666664, + "grad_norm": 0.4686408474857009, + "learning_rate": 0.0001554312380874542, + "loss": 0.7707, + "step": 626 + }, + { + "epoch": 0.3344, + "grad_norm": 0.5996669398126965, + "learning_rate": 0.00015528732839265272, + "loss": 0.7502, + "step": 627 + }, + { + "epoch": 0.33493333333333336, + "grad_norm": 0.4449647185867954, + "learning_rate": 0.00015514325360149668, + "loss": 0.7447, + "step": 628 + }, + { + "epoch": 0.3354666666666667, + "grad_norm": 0.4834378503665272, + "learning_rate": 0.0001549990141442153, + "loss": 0.8161, + "step": 629 + }, + { + "epoch": 0.336, + "grad_norm": 0.5402678542348858, + "learning_rate": 0.0001548546104515294, + "loss": 0.7989, + "step": 630 + }, + { + "epoch": 0.33653333333333335, + "grad_norm": 0.4463779543248393, + "learning_rate": 0.00015471004295465035, + "loss": 0.7468, + "step": 631 + }, + { + "epoch": 0.3370666666666667, + "grad_norm": 0.49253680351793444, + "learning_rate": 0.0001545653120852787, + "loss": 0.859, + "step": 632 + }, + { + "epoch": 0.3376, + "grad_norm": 0.5474150764496007, + "learning_rate": 0.00015442041827560274, + "loss": 0.827, + "step": 633 + }, + { + "epoch": 0.33813333333333334, + "grad_norm": 0.5574422779343221, + "learning_rate": 0.00015427536195829742, + "loss": 0.8604, + "step": 634 + }, + { + "epoch": 0.33866666666666667, + "grad_norm": 0.4531471454712154, + "learning_rate": 0.00015413014356652286, + "loss": 0.8479, + "step": 635 + }, + { + "epoch": 0.3392, + "grad_norm": 0.8027928220061923, + "learning_rate": 0.00015398476353392323, + "loss": 0.8997, + "step": 636 + }, + { + "epoch": 0.33973333333333333, + "grad_norm": 0.5309692551303301, + "learning_rate": 0.00015383922229462549, + "loss": 0.82, + "step": 637 + }, + { + "epoch": 0.34026666666666666, + "grad_norm": 0.5254409038796218, + "learning_rate": 0.00015369352028323774, + "loss": 0.8391, + "step": 638 + }, + { + "epoch": 0.3408, + "grad_norm": 0.48868712676371545, + "learning_rate": 0.00015354765793484834, + "loss": 0.7736, + "step": 639 + }, + { + "epoch": 0.3413333333333333, + "grad_norm": 0.5746536982609687, + "learning_rate": 0.0001534016356850244, + "loss": 0.964, + "step": 640 + }, + { + "epoch": 0.34186666666666665, + "grad_norm": 0.5663542278344792, + "learning_rate": 0.0001532554539698105, + "loss": 0.79, + "step": 641 + }, + { + "epoch": 0.3424, + "grad_norm": 0.7322489622484162, + "learning_rate": 0.00015310911322572753, + "loss": 1.0124, + "step": 642 + }, + { + "epoch": 0.3429333333333333, + "grad_norm": 0.570448953372935, + "learning_rate": 0.00015296261388977108, + "loss": 0.8195, + "step": 643 + }, + { + "epoch": 0.34346666666666664, + "grad_norm": 0.5244050729061402, + "learning_rate": 0.0001528159563994104, + "loss": 0.7873, + "step": 644 + }, + { + "epoch": 0.344, + "grad_norm": 0.563009538878919, + "learning_rate": 0.000152669141192587, + "loss": 0.7979, + "step": 645 + }, + { + "epoch": 0.34453333333333336, + "grad_norm": 0.47992530098772557, + "learning_rate": 0.00015252216870771345, + "loss": 0.7757, + "step": 646 + }, + { + "epoch": 0.3450666666666667, + "grad_norm": 0.5488372438427589, + "learning_rate": 0.00015237503938367186, + "loss": 0.8166, + "step": 647 + }, + { + "epoch": 0.3456, + "grad_norm": 0.8745056416840453, + "learning_rate": 0.00015222775365981273, + "loss": 0.8564, + "step": 648 + }, + { + "epoch": 0.34613333333333335, + "grad_norm": 0.5704675748450467, + "learning_rate": 0.00015208031197595356, + "loss": 0.7455, + "step": 649 + }, + { + "epoch": 0.3466666666666667, + "grad_norm": 0.5079628164978088, + "learning_rate": 0.0001519327147723776, + "loss": 0.7786, + "step": 650 + }, + { + "epoch": 0.3472, + "grad_norm": 0.46045188551189115, + "learning_rate": 0.00015178496248983254, + "loss": 0.7682, + "step": 651 + }, + { + "epoch": 0.34773333333333334, + "grad_norm": 0.5642003124117353, + "learning_rate": 0.0001516370555695291, + "loss": 0.8237, + "step": 652 + }, + { + "epoch": 0.34826666666666667, + "grad_norm": 0.45036166419602947, + "learning_rate": 0.00015148899445313981, + "loss": 0.7517, + "step": 653 + }, + { + "epoch": 0.3488, + "grad_norm": 0.62770242511897, + "learning_rate": 0.00015134077958279765, + "loss": 0.9288, + "step": 654 + }, + { + "epoch": 0.34933333333333333, + "grad_norm": 0.5286171186931936, + "learning_rate": 0.00015119241140109467, + "loss": 0.7778, + "step": 655 + }, + { + "epoch": 0.34986666666666666, + "grad_norm": 0.6078858094826769, + "learning_rate": 0.00015104389035108077, + "loss": 0.8137, + "step": 656 + }, + { + "epoch": 0.3504, + "grad_norm": 0.428744760977173, + "learning_rate": 0.00015089521687626243, + "loss": 0.7799, + "step": 657 + }, + { + "epoch": 0.3509333333333333, + "grad_norm": 0.6999273455997498, + "learning_rate": 0.0001507463914206012, + "loss": 0.8146, + "step": 658 + }, + { + "epoch": 0.35146666666666665, + "grad_norm": 0.5741621314582245, + "learning_rate": 0.0001505974144285124, + "loss": 0.868, + "step": 659 + }, + { + "epoch": 0.352, + "grad_norm": 0.5124125389535803, + "learning_rate": 0.000150448286344864, + "loss": 0.868, + "step": 660 + }, + { + "epoch": 0.3525333333333333, + "grad_norm": 0.4881782705813716, + "learning_rate": 0.00015029900761497506, + "loss": 0.8667, + "step": 661 + }, + { + "epoch": 0.35306666666666664, + "grad_norm": 0.53066060155896, + "learning_rate": 0.00015014957868461458, + "loss": 0.8457, + "step": 662 + }, + { + "epoch": 0.3536, + "grad_norm": 0.5591736195185854, + "learning_rate": 0.00015000000000000001, + "loss": 0.8802, + "step": 663 + }, + { + "epoch": 0.35413333333333336, + "grad_norm": 0.4905697116884893, + "learning_rate": 0.000149850272007796, + "loss": 0.7155, + "step": 664 + }, + { + "epoch": 0.3546666666666667, + "grad_norm": 0.5844276562874432, + "learning_rate": 0.00014970039515511304, + "loss": 0.9422, + "step": 665 + }, + { + "epoch": 0.3552, + "grad_norm": 0.603290743417517, + "learning_rate": 0.00014955036988950618, + "loss": 0.8599, + "step": 666 + }, + { + "epoch": 0.35573333333333335, + "grad_norm": 0.5088283751229241, + "learning_rate": 0.0001494001966589736, + "loss": 0.7846, + "step": 667 + }, + { + "epoch": 0.3562666666666667, + "grad_norm": 0.531314508584258, + "learning_rate": 0.00014924987591195547, + "loss": 0.7477, + "step": 668 + }, + { + "epoch": 0.3568, + "grad_norm": 0.48487488648848864, + "learning_rate": 0.00014909940809733222, + "loss": 0.7756, + "step": 669 + }, + { + "epoch": 0.35733333333333334, + "grad_norm": 0.46300148178031775, + "learning_rate": 0.0001489487936644237, + "loss": 0.8847, + "step": 670 + }, + { + "epoch": 0.35786666666666667, + "grad_norm": 0.4817338737680085, + "learning_rate": 0.00014879803306298736, + "loss": 0.7519, + "step": 671 + }, + { + "epoch": 0.3584, + "grad_norm": 0.5697214418512658, + "learning_rate": 0.00014864712674321734, + "loss": 0.7859, + "step": 672 + }, + { + "epoch": 0.3589333333333333, + "grad_norm": 0.6148209323459753, + "learning_rate": 0.00014849607515574276, + "loss": 0.818, + "step": 673 + }, + { + "epoch": 0.35946666666666666, + "grad_norm": 0.5003433116216678, + "learning_rate": 0.00014834487875162657, + "loss": 0.8501, + "step": 674 + }, + { + "epoch": 0.36, + "grad_norm": 0.5432992354727689, + "learning_rate": 0.00014819353798236427, + "loss": 0.8104, + "step": 675 + }, + { + "epoch": 0.3605333333333333, + "grad_norm": 0.5083729917715238, + "learning_rate": 0.00014804205329988225, + "loss": 0.7957, + "step": 676 + }, + { + "epoch": 0.36106666666666665, + "grad_norm": 0.46631811126979217, + "learning_rate": 0.00014789042515653687, + "loss": 0.748, + "step": 677 + }, + { + "epoch": 0.3616, + "grad_norm": 0.4274038453696366, + "learning_rate": 0.00014773865400511272, + "loss": 0.6714, + "step": 678 + }, + { + "epoch": 0.3621333333333333, + "grad_norm": 0.4642123927377642, + "learning_rate": 0.00014758674029882152, + "loss": 0.7946, + "step": 679 + }, + { + "epoch": 0.3626666666666667, + "grad_norm": 0.5682266655302921, + "learning_rate": 0.00014743468449130063, + "loss": 0.893, + "step": 680 + }, + { + "epoch": 0.3632, + "grad_norm": 0.45219212372742434, + "learning_rate": 0.00014728248703661182, + "loss": 0.7013, + "step": 681 + }, + { + "epoch": 0.36373333333333335, + "grad_norm": 0.46249946521024066, + "learning_rate": 0.00014713014838923976, + "loss": 0.8187, + "step": 682 + }, + { + "epoch": 0.3642666666666667, + "grad_norm": 0.5362530489588935, + "learning_rate": 0.00014697766900409074, + "loss": 0.9251, + "step": 683 + }, + { + "epoch": 0.3648, + "grad_norm": 0.47112151670674485, + "learning_rate": 0.00014682504933649144, + "loss": 0.8204, + "step": 684 + }, + { + "epoch": 0.36533333333333334, + "grad_norm": 0.43562242348644264, + "learning_rate": 0.0001466722898421873, + "loss": 0.817, + "step": 685 + }, + { + "epoch": 0.3658666666666667, + "grad_norm": 0.6017771725031066, + "learning_rate": 0.0001465193909773413, + "loss": 0.798, + "step": 686 + }, + { + "epoch": 0.3664, + "grad_norm": 0.5331535348236007, + "learning_rate": 0.00014636635319853275, + "loss": 0.8401, + "step": 687 + }, + { + "epoch": 0.36693333333333333, + "grad_norm": 0.5717651781716162, + "learning_rate": 0.00014621317696275564, + "loss": 0.9325, + "step": 688 + }, + { + "epoch": 0.36746666666666666, + "grad_norm": 0.526960588209441, + "learning_rate": 0.00014605986272741748, + "loss": 0.7866, + "step": 689 + }, + { + "epoch": 0.368, + "grad_norm": 0.5253453194212678, + "learning_rate": 0.00014590641095033787, + "loss": 0.8615, + "step": 690 + }, + { + "epoch": 0.3685333333333333, + "grad_norm": 0.5285391045903164, + "learning_rate": 0.00014575282208974702, + "loss": 0.8502, + "step": 691 + }, + { + "epoch": 0.36906666666666665, + "grad_norm": 0.4159020717447715, + "learning_rate": 0.00014559909660428468, + "loss": 0.7361, + "step": 692 + }, + { + "epoch": 0.3696, + "grad_norm": 0.47293504331080716, + "learning_rate": 0.00014544523495299842, + "loss": 0.8081, + "step": 693 + }, + { + "epoch": 0.3701333333333333, + "grad_norm": 0.6337490496519959, + "learning_rate": 0.00014529123759534255, + "loss": 0.8865, + "step": 694 + }, + { + "epoch": 0.37066666666666664, + "grad_norm": 0.5802499413114943, + "learning_rate": 0.00014513710499117647, + "loss": 0.8497, + "step": 695 + }, + { + "epoch": 0.3712, + "grad_norm": 0.4176215842349179, + "learning_rate": 0.0001449828376007636, + "loss": 0.6968, + "step": 696 + }, + { + "epoch": 0.37173333333333336, + "grad_norm": 0.4825865010446155, + "learning_rate": 0.00014482843588476974, + "loss": 0.8353, + "step": 697 + }, + { + "epoch": 0.3722666666666667, + "grad_norm": 0.5184616712009764, + "learning_rate": 0.00014467390030426186, + "loss": 0.7453, + "step": 698 + }, + { + "epoch": 0.3728, + "grad_norm": 0.5336813961952697, + "learning_rate": 0.0001445192313207067, + "loss": 0.8843, + "step": 699 + }, + { + "epoch": 0.37333333333333335, + "grad_norm": 0.5713473772631764, + "learning_rate": 0.0001443644293959693, + "loss": 0.8288, + "step": 700 + }, + { + "epoch": 0.3738666666666667, + "grad_norm": 0.6517004215524038, + "learning_rate": 0.00014420949499231172, + "loss": 1.0056, + "step": 701 + }, + { + "epoch": 0.3744, + "grad_norm": 0.564644177450356, + "learning_rate": 0.0001440544285723915, + "loss": 0.8924, + "step": 702 + }, + { + "epoch": 0.37493333333333334, + "grad_norm": 0.6382855123267954, + "learning_rate": 0.00014389923059926062, + "loss": 0.9, + "step": 703 + }, + { + "epoch": 0.37546666666666667, + "grad_norm": 0.52168540914425, + "learning_rate": 0.0001437439015363638, + "loss": 0.7179, + "step": 704 + }, + { + "epoch": 0.376, + "grad_norm": 0.9199605405327886, + "learning_rate": 0.00014358844184753712, + "loss": 0.8602, + "step": 705 + }, + { + "epoch": 0.37653333333333333, + "grad_norm": 0.6630619697723, + "learning_rate": 0.00014343285199700683, + "loss": 0.9434, + "step": 706 + }, + { + "epoch": 0.37706666666666666, + "grad_norm": 0.453200819137961, + "learning_rate": 0.0001432771324493879, + "loss": 0.7443, + "step": 707 + }, + { + "epoch": 0.3776, + "grad_norm": 0.5373021365917932, + "learning_rate": 0.00014312128366968243, + "loss": 0.8585, + "step": 708 + }, + { + "epoch": 0.3781333333333333, + "grad_norm": 0.6638608339620459, + "learning_rate": 0.00014296530612327863, + "loss": 0.8807, + "step": 709 + }, + { + "epoch": 0.37866666666666665, + "grad_norm": 0.5852792040916357, + "learning_rate": 0.00014280920027594907, + "loss": 0.8623, + "step": 710 + }, + { + "epoch": 0.3792, + "grad_norm": 0.6045019203606804, + "learning_rate": 0.00014265296659384956, + "loss": 0.8099, + "step": 711 + }, + { + "epoch": 0.3797333333333333, + "grad_norm": 0.5578165086402765, + "learning_rate": 0.00014249660554351752, + "loss": 0.8561, + "step": 712 + }, + { + "epoch": 0.38026666666666664, + "grad_norm": 0.5135089124962354, + "learning_rate": 0.00014234011759187083, + "loss": 0.822, + "step": 713 + }, + { + "epoch": 0.3808, + "grad_norm": 0.5229531767262776, + "learning_rate": 0.00014218350320620624, + "loss": 0.8289, + "step": 714 + }, + { + "epoch": 0.38133333333333336, + "grad_norm": 0.5760480824148356, + "learning_rate": 0.00014202676285419812, + "loss": 0.9167, + "step": 715 + }, + { + "epoch": 0.3818666666666667, + "grad_norm": 0.530069512579078, + "learning_rate": 0.00014186989700389687, + "loss": 0.7878, + "step": 716 + }, + { + "epoch": 0.3824, + "grad_norm": 0.5483414880778632, + "learning_rate": 0.0001417129061237278, + "loss": 0.7549, + "step": 717 + }, + { + "epoch": 0.38293333333333335, + "grad_norm": 0.5805853791465041, + "learning_rate": 0.0001415557906824895, + "loss": 0.821, + "step": 718 + }, + { + "epoch": 0.3834666666666667, + "grad_norm": 0.4120710990268928, + "learning_rate": 0.00014139855114935252, + "loss": 0.6579, + "step": 719 + }, + { + "epoch": 0.384, + "grad_norm": 0.5476826539300194, + "learning_rate": 0.00014124118799385796, + "loss": 0.8163, + "step": 720 + }, + { + "epoch": 0.38453333333333334, + "grad_norm": 0.6855726624547981, + "learning_rate": 0.0001410837016859161, + "loss": 0.8886, + "step": 721 + }, + { + "epoch": 0.38506666666666667, + "grad_norm": 0.4900184992286336, + "learning_rate": 0.00014092609269580496, + "loss": 0.7316, + "step": 722 + }, + { + "epoch": 0.3856, + "grad_norm": 0.46367309796918, + "learning_rate": 0.00014076836149416887, + "loss": 0.8123, + "step": 723 + }, + { + "epoch": 0.38613333333333333, + "grad_norm": 0.5535492820939049, + "learning_rate": 0.00014061050855201723, + "loss": 0.8137, + "step": 724 + }, + { + "epoch": 0.38666666666666666, + "grad_norm": 0.6416401917887545, + "learning_rate": 0.0001404525343407228, + "loss": 0.8032, + "step": 725 + }, + { + "epoch": 0.3872, + "grad_norm": 0.46814921453479513, + "learning_rate": 0.0001402944393320206, + "loss": 0.719, + "step": 726 + }, + { + "epoch": 0.3877333333333333, + "grad_norm": 0.42856462107175863, + "learning_rate": 0.00014013622399800627, + "loss": 0.7507, + "step": 727 + }, + { + "epoch": 0.38826666666666665, + "grad_norm": 0.5056120515390788, + "learning_rate": 0.00013997788881113489, + "loss": 0.8359, + "step": 728 + }, + { + "epoch": 0.3888, + "grad_norm": 0.4700390756542135, + "learning_rate": 0.00013981943424421932, + "loss": 0.7485, + "step": 729 + }, + { + "epoch": 0.3893333333333333, + "grad_norm": 0.6086509832803118, + "learning_rate": 0.0001396608607704289, + "loss": 0.8363, + "step": 730 + }, + { + "epoch": 0.38986666666666664, + "grad_norm": 0.5078168467062275, + "learning_rate": 0.0001395021688632882, + "loss": 0.8616, + "step": 731 + }, + { + "epoch": 0.3904, + "grad_norm": 0.49783676941608546, + "learning_rate": 0.00013934335899667527, + "loss": 0.7553, + "step": 732 + }, + { + "epoch": 0.39093333333333335, + "grad_norm": 0.49272398424024877, + "learning_rate": 0.00013918443164482046, + "loss": 0.7754, + "step": 733 + }, + { + "epoch": 0.3914666666666667, + "grad_norm": 0.49484045111362585, + "learning_rate": 0.000139025387282305, + "loss": 0.7436, + "step": 734 + }, + { + "epoch": 0.392, + "grad_norm": 0.5297393470406898, + "learning_rate": 0.00013886622638405952, + "loss": 0.8124, + "step": 735 + }, + { + "epoch": 0.39253333333333335, + "grad_norm": 0.6179185544593533, + "learning_rate": 0.0001387069494253626, + "loss": 0.8115, + "step": 736 + }, + { + "epoch": 0.3930666666666667, + "grad_norm": 0.5373045261507816, + "learning_rate": 0.0001385475568818394, + "loss": 0.7461, + "step": 737 + }, + { + "epoch": 0.3936, + "grad_norm": 0.45127402949937356, + "learning_rate": 0.00013838804922946027, + "loss": 0.7444, + "step": 738 + }, + { + "epoch": 0.39413333333333334, + "grad_norm": 0.5181242897858944, + "learning_rate": 0.00013822842694453924, + "loss": 0.8767, + "step": 739 + }, + { + "epoch": 0.39466666666666667, + "grad_norm": 0.5655969062462569, + "learning_rate": 0.0001380686905037327, + "loss": 0.85, + "step": 740 + }, + { + "epoch": 0.3952, + "grad_norm": 0.5168015972212235, + "learning_rate": 0.00013790884038403795, + "loss": 0.7222, + "step": 741 + }, + { + "epoch": 0.3957333333333333, + "grad_norm": 0.5531154877149115, + "learning_rate": 0.00013774887706279165, + "loss": 0.8093, + "step": 742 + }, + { + "epoch": 0.39626666666666666, + "grad_norm": 0.519075203089709, + "learning_rate": 0.0001375888010176686, + "loss": 0.7641, + "step": 743 + }, + { + "epoch": 0.3968, + "grad_norm": 0.5481275811684683, + "learning_rate": 0.00013742861272668012, + "loss": 0.8409, + "step": 744 + }, + { + "epoch": 0.3973333333333333, + "grad_norm": 0.5239988375125255, + "learning_rate": 0.00013726831266817278, + "loss": 0.8484, + "step": 745 + }, + { + "epoch": 0.39786666666666665, + "grad_norm": 0.5181888297872964, + "learning_rate": 0.00013710790132082692, + "loss": 0.8188, + "step": 746 + }, + { + "epoch": 0.3984, + "grad_norm": 0.5057049279499224, + "learning_rate": 0.00013694737916365517, + "loss": 0.8139, + "step": 747 + }, + { + "epoch": 0.3989333333333333, + "grad_norm": 0.5086212837186439, + "learning_rate": 0.00013678674667600102, + "loss": 0.807, + "step": 748 + }, + { + "epoch": 0.3994666666666667, + "grad_norm": 0.5024752340256642, + "learning_rate": 0.00013662600433753745, + "loss": 0.9308, + "step": 749 + }, + { + "epoch": 0.4, + "grad_norm": 0.4089641077527304, + "learning_rate": 0.00013646515262826552, + "loss": 0.6833, + "step": 750 + }, + { + "epoch": 0.40053333333333335, + "grad_norm": 0.4351236747525622, + "learning_rate": 0.00013630419202851284, + "loss": 0.803, + "step": 751 + }, + { + "epoch": 0.4010666666666667, + "grad_norm": 0.49002818471326925, + "learning_rate": 0.00013614312301893223, + "loss": 0.7913, + "step": 752 + }, + { + "epoch": 0.4016, + "grad_norm": 0.4466890684514906, + "learning_rate": 0.0001359819460805001, + "loss": 0.7246, + "step": 753 + }, + { + "epoch": 0.40213333333333334, + "grad_norm": 0.48094890037933113, + "learning_rate": 0.00013582066169451535, + "loss": 0.7866, + "step": 754 + }, + { + "epoch": 0.4026666666666667, + "grad_norm": 0.5754215224270137, + "learning_rate": 0.0001356592703425976, + "loss": 0.8077, + "step": 755 + }, + { + "epoch": 0.4032, + "grad_norm": 0.5090030089273849, + "learning_rate": 0.0001354977725066859, + "loss": 0.7786, + "step": 756 + }, + { + "epoch": 0.40373333333333333, + "grad_norm": 0.4891915865862635, + "learning_rate": 0.00013533616866903735, + "loss": 0.765, + "step": 757 + }, + { + "epoch": 0.40426666666666666, + "grad_norm": 0.46155122526575937, + "learning_rate": 0.0001351744593122255, + "loss": 0.7049, + "step": 758 + }, + { + "epoch": 0.4048, + "grad_norm": 0.4841794153051128, + "learning_rate": 0.00013501264491913906, + "loss": 0.7862, + "step": 759 + }, + { + "epoch": 0.4053333333333333, + "grad_norm": 0.6634877800731043, + "learning_rate": 0.00013485072597298038, + "loss": 0.8723, + "step": 760 + }, + { + "epoch": 0.40586666666666665, + "grad_norm": 0.5370652892356707, + "learning_rate": 0.00013468870295726398, + "loss": 0.7755, + "step": 761 + }, + { + "epoch": 0.4064, + "grad_norm": 0.4801178316957006, + "learning_rate": 0.0001345265763558152, + "loss": 0.7923, + "step": 762 + }, + { + "epoch": 0.4069333333333333, + "grad_norm": 0.5019134348868551, + "learning_rate": 0.00013436434665276865, + "loss": 0.7057, + "step": 763 + }, + { + "epoch": 0.40746666666666664, + "grad_norm": 0.46813690102209793, + "learning_rate": 0.00013420201433256689, + "loss": 0.7459, + "step": 764 + }, + { + "epoch": 0.408, + "grad_norm": 0.5398079540801454, + "learning_rate": 0.00013403957987995882, + "loss": 0.7864, + "step": 765 + }, + { + "epoch": 0.40853333333333336, + "grad_norm": 0.4976426982944783, + "learning_rate": 0.00013387704377999842, + "loss": 0.8075, + "step": 766 + }, + { + "epoch": 0.4090666666666667, + "grad_norm": 0.46870292537268243, + "learning_rate": 0.00013371440651804313, + "loss": 0.6822, + "step": 767 + }, + { + "epoch": 0.4096, + "grad_norm": 0.545783509190825, + "learning_rate": 0.0001335516685797525, + "loss": 0.7946, + "step": 768 + }, + { + "epoch": 0.41013333333333335, + "grad_norm": 0.5344080341027531, + "learning_rate": 0.00013338883045108674, + "loss": 0.7631, + "step": 769 + }, + { + "epoch": 0.4106666666666667, + "grad_norm": 0.6977326518743707, + "learning_rate": 0.00013322589261830517, + "loss": 0.8548, + "step": 770 + }, + { + "epoch": 0.4112, + "grad_norm": 0.3904121194482292, + "learning_rate": 0.00013306285556796495, + "loss": 0.6869, + "step": 771 + }, + { + "epoch": 0.41173333333333334, + "grad_norm": 0.4579030171134873, + "learning_rate": 0.0001328997197869194, + "loss": 0.7054, + "step": 772 + }, + { + "epoch": 0.41226666666666667, + "grad_norm": 0.5432445941796074, + "learning_rate": 0.0001327364857623168, + "loss": 0.8541, + "step": 773 + }, + { + "epoch": 0.4128, + "grad_norm": 0.4507607056080124, + "learning_rate": 0.00013257315398159864, + "loss": 0.7294, + "step": 774 + }, + { + "epoch": 0.41333333333333333, + "grad_norm": 0.5770144319792957, + "learning_rate": 0.00013240972493249847, + "loss": 0.7824, + "step": 775 + }, + { + "epoch": 0.41386666666666666, + "grad_norm": 0.6222383621338347, + "learning_rate": 0.0001322461991030402, + "loss": 0.9508, + "step": 776 + }, + { + "epoch": 0.4144, + "grad_norm": 0.5083213398241904, + "learning_rate": 0.00013208257698153677, + "loss": 0.7967, + "step": 777 + }, + { + "epoch": 0.4149333333333333, + "grad_norm": 0.5769391778626363, + "learning_rate": 0.00013191885905658872, + "loss": 0.8215, + "step": 778 + }, + { + "epoch": 0.41546666666666665, + "grad_norm": 0.4753318829986412, + "learning_rate": 0.0001317550458170826, + "loss": 0.8173, + "step": 779 + }, + { + "epoch": 0.416, + "grad_norm": 0.5848923979207764, + "learning_rate": 0.00013159113775218964, + "loss": 0.8812, + "step": 780 + }, + { + "epoch": 0.4165333333333333, + "grad_norm": 0.5506274044347146, + "learning_rate": 0.00013142713535136414, + "loss": 0.6753, + "step": 781 + }, + { + "epoch": 0.41706666666666664, + "grad_norm": 0.48945492331007395, + "learning_rate": 0.00013126303910434214, + "loss": 0.8095, + "step": 782 + }, + { + "epoch": 0.4176, + "grad_norm": 0.47922500251569633, + "learning_rate": 0.00013109884950114007, + "loss": 0.7426, + "step": 783 + }, + { + "epoch": 0.41813333333333336, + "grad_norm": 0.4474830678952825, + "learning_rate": 0.00013093456703205288, + "loss": 0.71, + "step": 784 + }, + { + "epoch": 0.4186666666666667, + "grad_norm": 0.4901138815648903, + "learning_rate": 0.00013077019218765305, + "loss": 0.7971, + "step": 785 + }, + { + "epoch": 0.4192, + "grad_norm": 0.46553748999705974, + "learning_rate": 0.00013060572545878875, + "loss": 0.7039, + "step": 786 + }, + { + "epoch": 0.41973333333333335, + "grad_norm": 0.6818603988852794, + "learning_rate": 0.0001304411673365826, + "loss": 0.8921, + "step": 787 + }, + { + "epoch": 0.4202666666666667, + "grad_norm": 0.5970769694778394, + "learning_rate": 0.0001302765183124302, + "loss": 0.9815, + "step": 788 + }, + { + "epoch": 0.4208, + "grad_norm": 0.47872069397262745, + "learning_rate": 0.00013011177887799845, + "loss": 0.7985, + "step": 789 + }, + { + "epoch": 0.42133333333333334, + "grad_norm": 0.3861546598029685, + "learning_rate": 0.00012994694952522435, + "loss": 0.6453, + "step": 790 + }, + { + "epoch": 0.42186666666666667, + "grad_norm": 0.4603594590690842, + "learning_rate": 0.00012978203074631334, + "loss": 0.685, + "step": 791 + }, + { + "epoch": 0.4224, + "grad_norm": 0.6089557164729323, + "learning_rate": 0.00012961702303373795, + "loss": 0.8086, + "step": 792 + }, + { + "epoch": 0.42293333333333333, + "grad_norm": 0.603704990267899, + "learning_rate": 0.00012945192688023624, + "loss": 0.9752, + "step": 793 + }, + { + "epoch": 0.42346666666666666, + "grad_norm": 0.7537321811367974, + "learning_rate": 0.0001292867427788104, + "loss": 0.9012, + "step": 794 + }, + { + "epoch": 0.424, + "grad_norm": 0.4930352911616266, + "learning_rate": 0.00012912147122272523, + "loss": 0.8116, + "step": 795 + }, + { + "epoch": 0.4245333333333333, + "grad_norm": 0.4315763050222387, + "learning_rate": 0.00012895611270550666, + "loss": 0.7109, + "step": 796 + }, + { + "epoch": 0.42506666666666665, + "grad_norm": 0.4637858942591107, + "learning_rate": 0.0001287906677209403, + "loss": 0.7524, + "step": 797 + }, + { + "epoch": 0.4256, + "grad_norm": 0.4676651569490796, + "learning_rate": 0.00012862513676307008, + "loss": 0.7661, + "step": 798 + }, + { + "epoch": 0.4261333333333333, + "grad_norm": 0.4897362982883825, + "learning_rate": 0.0001284595203261965, + "loss": 0.7982, + "step": 799 + }, + { + "epoch": 0.4266666666666667, + "grad_norm": 0.6143052681462965, + "learning_rate": 0.00012829381890487536, + "loss": 0.7956, + "step": 800 + }, + { + "epoch": 0.4272, + "grad_norm": 0.5266193218351073, + "learning_rate": 0.00012812803299391628, + "loss": 0.776, + "step": 801 + }, + { + "epoch": 0.42773333333333335, + "grad_norm": 0.5450427556760508, + "learning_rate": 0.00012796216308838117, + "loss": 0.8631, + "step": 802 + }, + { + "epoch": 0.4282666666666667, + "grad_norm": 0.6456045728698774, + "learning_rate": 0.00012779620968358273, + "loss": 0.925, + "step": 803 + }, + { + "epoch": 0.4288, + "grad_norm": 0.46905239069900634, + "learning_rate": 0.00012763017327508305, + "loss": 0.7524, + "step": 804 + }, + { + "epoch": 0.42933333333333334, + "grad_norm": 0.5991620705144164, + "learning_rate": 0.00012746405435869198, + "loss": 0.7948, + "step": 805 + }, + { + "epoch": 0.4298666666666667, + "grad_norm": 0.40463001077070404, + "learning_rate": 0.00012729785343046588, + "loss": 0.7792, + "step": 806 + }, + { + "epoch": 0.4304, + "grad_norm": 0.5473140968009028, + "learning_rate": 0.0001271315709867059, + "loss": 0.7967, + "step": 807 + }, + { + "epoch": 0.43093333333333333, + "grad_norm": 0.4846464786438877, + "learning_rate": 0.00012696520752395672, + "loss": 0.7561, + "step": 808 + }, + { + "epoch": 0.43146666666666667, + "grad_norm": 0.49235137719971495, + "learning_rate": 0.00012679876353900482, + "loss": 0.7865, + "step": 809 + }, + { + "epoch": 0.432, + "grad_norm": 0.48082981780816214, + "learning_rate": 0.00012663223952887723, + "loss": 0.7826, + "step": 810 + }, + { + "epoch": 0.4325333333333333, + "grad_norm": 0.45705357979137046, + "learning_rate": 0.00012646563599083996, + "loss": 0.7237, + "step": 811 + }, + { + "epoch": 0.43306666666666666, + "grad_norm": 0.6079835635983911, + "learning_rate": 0.00012629895342239643, + "loss": 0.8438, + "step": 812 + }, + { + "epoch": 0.4336, + "grad_norm": 0.4697357339508937, + "learning_rate": 0.00012613219232128608, + "loss": 0.7754, + "step": 813 + }, + { + "epoch": 0.4341333333333333, + "grad_norm": 0.4341312917563034, + "learning_rate": 0.00012596535318548289, + "loss": 0.7538, + "step": 814 + }, + { + "epoch": 0.43466666666666665, + "grad_norm": 0.49600772082635314, + "learning_rate": 0.0001257984365131938, + "loss": 0.7376, + "step": 815 + }, + { + "epoch": 0.4352, + "grad_norm": 0.5598019054150565, + "learning_rate": 0.00012563144280285741, + "loss": 0.8522, + "step": 816 + }, + { + "epoch": 0.4357333333333333, + "grad_norm": 0.3921679797595081, + "learning_rate": 0.00012546437255314222, + "loss": 0.7071, + "step": 817 + }, + { + "epoch": 0.4362666666666667, + "grad_norm": 0.586665310853696, + "learning_rate": 0.0001252972262629454, + "loss": 0.8259, + "step": 818 + }, + { + "epoch": 0.4368, + "grad_norm": 0.6436832286186337, + "learning_rate": 0.00012513000443139112, + "loss": 0.8968, + "step": 819 + }, + { + "epoch": 0.43733333333333335, + "grad_norm": 0.4532961689343766, + "learning_rate": 0.00012496270755782914, + "loss": 0.7185, + "step": 820 + }, + { + "epoch": 0.4378666666666667, + "grad_norm": 0.6054960827856545, + "learning_rate": 0.00012479533614183334, + "loss": 0.842, + "step": 821 + }, + { + "epoch": 0.4384, + "grad_norm": 0.5145811623028095, + "learning_rate": 0.00012462789068320017, + "loss": 0.8141, + "step": 822 + }, + { + "epoch": 0.43893333333333334, + "grad_norm": 0.5784054764016591, + "learning_rate": 0.00012446037168194714, + "loss": 0.7729, + "step": 823 + }, + { + "epoch": 0.43946666666666667, + "grad_norm": 0.7220645660369499, + "learning_rate": 0.00012429277963831148, + "loss": 0.8989, + "step": 824 + }, + { + "epoch": 0.44, + "grad_norm": 0.4980071811549346, + "learning_rate": 0.00012412511505274844, + "loss": 0.8106, + "step": 825 + }, + { + "epoch": 0.44053333333333333, + "grad_norm": 0.5348419778871377, + "learning_rate": 0.00012395737842592995, + "loss": 0.8369, + "step": 826 + }, + { + "epoch": 0.44106666666666666, + "grad_norm": 0.5353879738979862, + "learning_rate": 0.000123789570258743, + "loss": 0.7932, + "step": 827 + }, + { + "epoch": 0.4416, + "grad_norm": 0.6651689873960853, + "learning_rate": 0.00012362169105228826, + "loss": 0.7473, + "step": 828 + }, + { + "epoch": 0.4421333333333333, + "grad_norm": 0.5447327142112047, + "learning_rate": 0.00012345374130787854, + "loss": 0.8328, + "step": 829 + }, + { + "epoch": 0.44266666666666665, + "grad_norm": 0.5413161115653148, + "learning_rate": 0.00012328572152703725, + "loss": 0.7804, + "step": 830 + }, + { + "epoch": 0.4432, + "grad_norm": 0.5663918612560117, + "learning_rate": 0.000123117632211497, + "loss": 0.8009, + "step": 831 + }, + { + "epoch": 0.4437333333333333, + "grad_norm": 0.5940970783318418, + "learning_rate": 0.00012294947386319794, + "loss": 0.9107, + "step": 832 + }, + { + "epoch": 0.44426666666666664, + "grad_norm": 0.4615580956786893, + "learning_rate": 0.0001227812469842864, + "loss": 0.7274, + "step": 833 + }, + { + "epoch": 0.4448, + "grad_norm": 0.47289010215546234, + "learning_rate": 0.00012261295207711346, + "loss": 0.7662, + "step": 834 + }, + { + "epoch": 0.44533333333333336, + "grad_norm": 0.5282870098169392, + "learning_rate": 0.00012244458964423327, + "loss": 0.791, + "step": 835 + }, + { + "epoch": 0.4458666666666667, + "grad_norm": 0.7165095897507946, + "learning_rate": 0.00012227616018840154, + "loss": 0.9146, + "step": 836 + }, + { + "epoch": 0.4464, + "grad_norm": 0.51113905343607, + "learning_rate": 0.0001221076642125742, + "loss": 0.8546, + "step": 837 + }, + { + "epoch": 0.44693333333333335, + "grad_norm": 0.5025443898392142, + "learning_rate": 0.00012193910221990581, + "loss": 0.747, + "step": 838 + }, + { + "epoch": 0.4474666666666667, + "grad_norm": 0.48667988939210144, + "learning_rate": 0.00012177047471374807, + "loss": 0.827, + "step": 839 + }, + { + "epoch": 0.448, + "grad_norm": 0.547760882436182, + "learning_rate": 0.00012160178219764837, + "loss": 0.8357, + "step": 840 + }, + { + "epoch": 0.44853333333333334, + "grad_norm": 0.5514487469145651, + "learning_rate": 0.0001214330251753481, + "loss": 0.8204, + "step": 841 + }, + { + "epoch": 0.44906666666666667, + "grad_norm": 0.5454077540745722, + "learning_rate": 0.00012126420415078132, + "loss": 0.81, + "step": 842 + }, + { + "epoch": 0.4496, + "grad_norm": 0.5471013858109142, + "learning_rate": 0.00012109531962807332, + "loss": 0.8675, + "step": 843 + }, + { + "epoch": 0.45013333333333333, + "grad_norm": 0.4681508026598542, + "learning_rate": 0.00012092637211153885, + "loss": 0.7791, + "step": 844 + }, + { + "epoch": 0.45066666666666666, + "grad_norm": 0.5056140317477327, + "learning_rate": 0.0001207573621056809, + "loss": 0.7927, + "step": 845 + }, + { + "epoch": 0.4512, + "grad_norm": 0.6032147508403578, + "learning_rate": 0.00012058829011518896, + "loss": 0.8263, + "step": 846 + }, + { + "epoch": 0.4517333333333333, + "grad_norm": 0.5495111420104595, + "learning_rate": 0.00012041915664493761, + "loss": 0.8267, + "step": 847 + }, + { + "epoch": 0.45226666666666665, + "grad_norm": 0.40949344338288285, + "learning_rate": 0.00012024996219998517, + "loss": 0.7962, + "step": 848 + }, + { + "epoch": 0.4528, + "grad_norm": 0.4485925074166394, + "learning_rate": 0.00012008070728557186, + "loss": 0.7756, + "step": 849 + }, + { + "epoch": 0.4533333333333333, + "grad_norm": 0.4345634842782298, + "learning_rate": 0.00011991139240711857, + "loss": 0.7192, + "step": 850 + }, + { + "epoch": 0.45386666666666664, + "grad_norm": 0.5809157009620177, + "learning_rate": 0.00011974201807022525, + "loss": 0.6909, + "step": 851 + }, + { + "epoch": 0.4544, + "grad_norm": 0.5158782652435949, + "learning_rate": 0.00011957258478066931, + "loss": 0.7563, + "step": 852 + }, + { + "epoch": 0.45493333333333336, + "grad_norm": 0.5279338422076013, + "learning_rate": 0.00011940309304440433, + "loss": 0.8242, + "step": 853 + }, + { + "epoch": 0.4554666666666667, + "grad_norm": 0.570446810953975, + "learning_rate": 0.00011923354336755835, + "loss": 0.8387, + "step": 854 + }, + { + "epoch": 0.456, + "grad_norm": 0.5039590872605328, + "learning_rate": 0.00011906393625643244, + "loss": 0.7736, + "step": 855 + }, + { + "epoch": 0.45653333333333335, + "grad_norm": 0.46931152350354255, + "learning_rate": 0.00011889427221749916, + "loss": 0.7721, + "step": 856 + }, + { + "epoch": 0.4570666666666667, + "grad_norm": 0.5370225410341043, + "learning_rate": 0.00011872455175740112, + "loss": 0.8398, + "step": 857 + }, + { + "epoch": 0.4576, + "grad_norm": 0.48478742078050574, + "learning_rate": 0.00011855477538294935, + "loss": 0.8141, + "step": 858 + }, + { + "epoch": 0.45813333333333334, + "grad_norm": 0.36934981231353725, + "learning_rate": 0.00011838494360112185, + "loss": 0.6759, + "step": 859 + }, + { + "epoch": 0.45866666666666667, + "grad_norm": 0.6624303879159779, + "learning_rate": 0.00011821505691906216, + "loss": 0.8269, + "step": 860 + }, + { + "epoch": 0.4592, + "grad_norm": 0.4479024680557409, + "learning_rate": 0.00011804511584407763, + "loss": 0.7301, + "step": 861 + }, + { + "epoch": 0.4597333333333333, + "grad_norm": 0.5909808951009032, + "learning_rate": 0.00011787512088363817, + "loss": 0.772, + "step": 862 + }, + { + "epoch": 0.46026666666666666, + "grad_norm": 0.5209290099731972, + "learning_rate": 0.00011770507254537453, + "loss": 0.7593, + "step": 863 + }, + { + "epoch": 0.4608, + "grad_norm": 0.4770750875340812, + "learning_rate": 0.00011753497133707679, + "loss": 0.8468, + "step": 864 + }, + { + "epoch": 0.4613333333333333, + "grad_norm": 0.4438156717825776, + "learning_rate": 0.00011736481776669306, + "loss": 0.8036, + "step": 865 + }, + { + "epoch": 0.46186666666666665, + "grad_norm": 0.40077651122930297, + "learning_rate": 0.00011719461234232764, + "loss": 0.6876, + "step": 866 + }, + { + "epoch": 0.4624, + "grad_norm": 0.382397124928795, + "learning_rate": 0.00011702435557223987, + "loss": 0.6042, + "step": 867 + }, + { + "epoch": 0.4629333333333333, + "grad_norm": 0.5474722912103955, + "learning_rate": 0.00011685404796484225, + "loss": 0.7456, + "step": 868 + }, + { + "epoch": 0.4634666666666667, + "grad_norm": 0.5240517045008879, + "learning_rate": 0.00011668369002869912, + "loss": 0.74, + "step": 869 + }, + { + "epoch": 0.464, + "grad_norm": 0.5834307542847271, + "learning_rate": 0.00011651328227252517, + "loss": 0.7741, + "step": 870 + }, + { + "epoch": 0.46453333333333335, + "grad_norm": 0.5382694432201459, + "learning_rate": 0.00011634282520518383, + "loss": 0.87, + "step": 871 + }, + { + "epoch": 0.4650666666666667, + "grad_norm": 0.567001530938856, + "learning_rate": 0.00011617231933568578, + "loss": 0.8846, + "step": 872 + }, + { + "epoch": 0.4656, + "grad_norm": 0.44139727689976666, + "learning_rate": 0.00011600176517318741, + "loss": 0.757, + "step": 873 + }, + { + "epoch": 0.46613333333333334, + "grad_norm": 0.586655799449206, + "learning_rate": 0.00011583116322698935, + "loss": 0.7793, + "step": 874 + }, + { + "epoch": 0.4666666666666667, + "grad_norm": 0.6959512702435237, + "learning_rate": 0.00011566051400653486, + "loss": 0.9044, + "step": 875 + }, + { + "epoch": 0.4672, + "grad_norm": 0.592929579373112, + "learning_rate": 0.00011548981802140848, + "loss": 0.7487, + "step": 876 + }, + { + "epoch": 0.46773333333333333, + "grad_norm": 0.48794267966208865, + "learning_rate": 0.00011531907578133429, + "loss": 0.7603, + "step": 877 + }, + { + "epoch": 0.46826666666666666, + "grad_norm": 0.4877169301389832, + "learning_rate": 0.00011514828779617459, + "loss": 0.6525, + "step": 878 + }, + { + "epoch": 0.4688, + "grad_norm": 0.46896945750895147, + "learning_rate": 0.00011497745457592816, + "loss": 0.743, + "step": 879 + }, + { + "epoch": 0.4693333333333333, + "grad_norm": 0.3997482721363189, + "learning_rate": 0.00011480657663072896, + "loss": 0.7099, + "step": 880 + }, + { + "epoch": 0.46986666666666665, + "grad_norm": 0.5253024025415763, + "learning_rate": 0.00011463565447084445, + "loss": 0.7998, + "step": 881 + }, + { + "epoch": 0.4704, + "grad_norm": 0.5408869645494739, + "learning_rate": 0.00011446468860667421, + "loss": 0.8468, + "step": 882 + }, + { + "epoch": 0.4709333333333333, + "grad_norm": 0.500146352240882, + "learning_rate": 0.00011429367954874819, + "loss": 0.8356, + "step": 883 + }, + { + "epoch": 0.47146666666666665, + "grad_norm": 0.5093589387038205, + "learning_rate": 0.0001141226278077254, + "loss": 0.761, + "step": 884 + }, + { + "epoch": 0.472, + "grad_norm": 0.5382482323711277, + "learning_rate": 0.00011395153389439233, + "loss": 0.7361, + "step": 885 + }, + { + "epoch": 0.47253333333333336, + "grad_norm": 0.44435322450815196, + "learning_rate": 0.00011378039831966134, + "loss": 0.7391, + "step": 886 + }, + { + "epoch": 0.4730666666666667, + "grad_norm": 0.5007857334530672, + "learning_rate": 0.00011360922159456928, + "loss": 0.7404, + "step": 887 + }, + { + "epoch": 0.4736, + "grad_norm": 0.6778260798197766, + "learning_rate": 0.00011343800423027582, + "loss": 0.8802, + "step": 888 + }, + { + "epoch": 0.47413333333333335, + "grad_norm": 0.45643085924947757, + "learning_rate": 0.00011326674673806195, + "loss": 0.7668, + "step": 889 + }, + { + "epoch": 0.4746666666666667, + "grad_norm": 0.42576546347475686, + "learning_rate": 0.00011309544962932862, + "loss": 0.7699, + "step": 890 + }, + { + "epoch": 0.4752, + "grad_norm": 0.4878184993169895, + "learning_rate": 0.0001129241134155949, + "loss": 0.7348, + "step": 891 + }, + { + "epoch": 0.47573333333333334, + "grad_norm": 0.5948727960245362, + "learning_rate": 0.00011275273860849684, + "loss": 0.8081, + "step": 892 + }, + { + "epoch": 0.47626666666666667, + "grad_norm": 0.6086558137800696, + "learning_rate": 0.00011258132571978555, + "loss": 0.7815, + "step": 893 + }, + { + "epoch": 0.4768, + "grad_norm": 0.4513654437774048, + "learning_rate": 0.00011240987526132594, + "loss": 0.7895, + "step": 894 + }, + { + "epoch": 0.47733333333333333, + "grad_norm": 0.7062086064639138, + "learning_rate": 0.00011223838774509514, + "loss": 0.9012, + "step": 895 + }, + { + "epoch": 0.47786666666666666, + "grad_norm": 0.5468225898868876, + "learning_rate": 0.00011206686368318086, + "loss": 0.8039, + "step": 896 + }, + { + "epoch": 0.4784, + "grad_norm": 0.5036838194152392, + "learning_rate": 0.00011189530358778005, + "loss": 0.7766, + "step": 897 + }, + { + "epoch": 0.4789333333333333, + "grad_norm": 0.6261411301740473, + "learning_rate": 0.00011172370797119712, + "loss": 0.8658, + "step": 898 + }, + { + "epoch": 0.47946666666666665, + "grad_norm": 0.4494687514476649, + "learning_rate": 0.00011155207734584263, + "loss": 0.787, + "step": 899 + }, + { + "epoch": 0.48, + "grad_norm": 0.43209719743065006, + "learning_rate": 0.00011138041222423177, + "loss": 0.7668, + "step": 900 + }, + { + "epoch": 0.4805333333333333, + "grad_norm": 0.4770864718919109, + "learning_rate": 0.00011120871311898254, + "loss": 0.7272, + "step": 901 + }, + { + "epoch": 0.48106666666666664, + "grad_norm": 0.4745528891103059, + "learning_rate": 0.0001110369805428146, + "loss": 0.7449, + "step": 902 + }, + { + "epoch": 0.4816, + "grad_norm": 0.7470359928641405, + "learning_rate": 0.00011086521500854745, + "loss": 0.7807, + "step": 903 + }, + { + "epoch": 0.48213333333333336, + "grad_norm": 0.45471146705812854, + "learning_rate": 0.0001106934170290991, + "loss": 0.7722, + "step": 904 + }, + { + "epoch": 0.4826666666666667, + "grad_norm": 0.4253950362738694, + "learning_rate": 0.00011052158711748434, + "loss": 0.7093, + "step": 905 + }, + { + "epoch": 0.4832, + "grad_norm": 0.49454665074672327, + "learning_rate": 0.00011034972578681338, + "loss": 0.8657, + "step": 906 + }, + { + "epoch": 0.48373333333333335, + "grad_norm": 0.6964158869594843, + "learning_rate": 0.00011017783355029026, + "loss": 0.8693, + "step": 907 + }, + { + "epoch": 0.4842666666666667, + "grad_norm": 0.4414620619141758, + "learning_rate": 0.00011000591092121127, + "loss": 0.8054, + "step": 908 + }, + { + "epoch": 0.4848, + "grad_norm": 0.4672882678215716, + "learning_rate": 0.00010983395841296348, + "loss": 0.7854, + "step": 909 + }, + { + "epoch": 0.48533333333333334, + "grad_norm": 0.5087716215112642, + "learning_rate": 0.0001096619765390232, + "loss": 0.7391, + "step": 910 + }, + { + "epoch": 0.48586666666666667, + "grad_norm": 0.4264799455950689, + "learning_rate": 0.00010948996581295436, + "loss": 0.6833, + "step": 911 + }, + { + "epoch": 0.4864, + "grad_norm": 0.5367640341135063, + "learning_rate": 0.00010931792674840718, + "loss": 0.7739, + "step": 912 + }, + { + "epoch": 0.48693333333333333, + "grad_norm": 0.372192351444737, + "learning_rate": 0.00010914585985911632, + "loss": 0.7144, + "step": 913 + }, + { + "epoch": 0.48746666666666666, + "grad_norm": 0.5048598526746167, + "learning_rate": 0.00010897376565889971, + "loss": 0.7033, + "step": 914 + }, + { + "epoch": 0.488, + "grad_norm": 0.5270412702986649, + "learning_rate": 0.00010880164466165674, + "loss": 0.7899, + "step": 915 + }, + { + "epoch": 0.4885333333333333, + "grad_norm": 0.5481561010954568, + "learning_rate": 0.00010862949738136681, + "loss": 0.807, + "step": 916 + }, + { + "epoch": 0.48906666666666665, + "grad_norm": 0.4236700078969928, + "learning_rate": 0.00010845732433208779, + "loss": 0.736, + "step": 917 + }, + { + "epoch": 0.4896, + "grad_norm": 0.5273943593421361, + "learning_rate": 0.00010828512602795462, + "loss": 0.8277, + "step": 918 + }, + { + "epoch": 0.4901333333333333, + "grad_norm": 0.4372769409605662, + "learning_rate": 0.00010811290298317755, + "loss": 0.6718, + "step": 919 + }, + { + "epoch": 0.49066666666666664, + "grad_norm": 0.5905463937231734, + "learning_rate": 0.00010794065571204072, + "loss": 0.7761, + "step": 920 + }, + { + "epoch": 0.4912, + "grad_norm": 0.4339096696432376, + "learning_rate": 0.00010776838472890065, + "loss": 0.7329, + "step": 921 + }, + { + "epoch": 0.49173333333333336, + "grad_norm": 0.6385924503289472, + "learning_rate": 0.00010759609054818458, + "loss": 0.8094, + "step": 922 + }, + { + "epoch": 0.4922666666666667, + "grad_norm": 0.6224472942858802, + "learning_rate": 0.00010742377368438914, + "loss": 0.7984, + "step": 923 + }, + { + "epoch": 0.4928, + "grad_norm": 0.5987915588234599, + "learning_rate": 0.00010725143465207867, + "loss": 0.8015, + "step": 924 + }, + { + "epoch": 0.49333333333333335, + "grad_norm": 0.48684483100299114, + "learning_rate": 0.00010707907396588361, + "loss": 0.6933, + "step": 925 + }, + { + "epoch": 0.4938666666666667, + "grad_norm": 0.47889562425964494, + "learning_rate": 0.0001069066921404992, + "loss": 0.7893, + "step": 926 + }, + { + "epoch": 0.4944, + "grad_norm": 0.4427783280540483, + "learning_rate": 0.00010673428969068364, + "loss": 0.7504, + "step": 927 + }, + { + "epoch": 0.49493333333333334, + "grad_norm": 0.45953862660412464, + "learning_rate": 0.00010656186713125689, + "loss": 0.818, + "step": 928 + }, + { + "epoch": 0.49546666666666667, + "grad_norm": 0.4632980682604805, + "learning_rate": 0.0001063894249770989, + "loss": 0.7654, + "step": 929 + }, + { + "epoch": 0.496, + "grad_norm": 0.4970646438155739, + "learning_rate": 0.00010621696374314807, + "loss": 0.7649, + "step": 930 + }, + { + "epoch": 0.4965333333333333, + "grad_norm": 0.47855367676538474, + "learning_rate": 0.00010604448394439983, + "loss": 0.8348, + "step": 931 + }, + { + "epoch": 0.49706666666666666, + "grad_norm": 0.5150032033217121, + "learning_rate": 0.00010587198609590505, + "loss": 0.7443, + "step": 932 + }, + { + "epoch": 0.4976, + "grad_norm": 0.4827091017526877, + "learning_rate": 0.00010569947071276847, + "loss": 0.7594, + "step": 933 + }, + { + "epoch": 0.4981333333333333, + "grad_norm": 0.5630702112689725, + "learning_rate": 0.00010552693831014726, + "loss": 0.8098, + "step": 934 + }, + { + "epoch": 0.49866666666666665, + "grad_norm": 0.6480528114874299, + "learning_rate": 0.0001053543894032493, + "loss": 0.884, + "step": 935 + }, + { + "epoch": 0.4992, + "grad_norm": 0.5159217909240126, + "learning_rate": 0.00010518182450733186, + "loss": 0.7916, + "step": 936 + }, + { + "epoch": 0.4997333333333333, + "grad_norm": 0.5262820068326394, + "learning_rate": 0.00010500924413769988, + "loss": 0.7374, + "step": 937 + }, + { + "epoch": 0.5002666666666666, + "grad_norm": 0.4993473580240705, + "learning_rate": 0.00010483664880970457, + "loss": 0.7151, + "step": 938 + }, + { + "epoch": 0.5008, + "grad_norm": 0.4657089901590285, + "learning_rate": 0.00010466403903874176, + "loss": 0.724, + "step": 939 + }, + { + "epoch": 0.5013333333333333, + "grad_norm": 0.5228105787943458, + "learning_rate": 0.00010449141534025045, + "loss": 0.7466, + "step": 940 + }, + { + "epoch": 0.5018666666666667, + "grad_norm": 0.4133632297831274, + "learning_rate": 0.00010431877822971117, + "loss": 0.7487, + "step": 941 + }, + { + "epoch": 0.5024, + "grad_norm": 0.5962099027010531, + "learning_rate": 0.00010414612822264455, + "loss": 0.7504, + "step": 942 + }, + { + "epoch": 0.5029333333333333, + "grad_norm": 0.5156682261075197, + "learning_rate": 0.00010397346583460971, + "loss": 0.7657, + "step": 943 + }, + { + "epoch": 0.5034666666666666, + "grad_norm": 0.55049333424343, + "learning_rate": 0.0001038007915812028, + "loss": 0.8423, + "step": 944 + }, + { + "epoch": 0.504, + "grad_norm": 0.5652328546558053, + "learning_rate": 0.00010362810597805526, + "loss": 0.8679, + "step": 945 + }, + { + "epoch": 0.5045333333333333, + "grad_norm": 0.5851425172299666, + "learning_rate": 0.0001034554095408326, + "loss": 0.914, + "step": 946 + }, + { + "epoch": 0.5050666666666667, + "grad_norm": 0.5358604988343294, + "learning_rate": 0.00010328270278523256, + "loss": 0.8724, + "step": 947 + }, + { + "epoch": 0.5056, + "grad_norm": 0.4765769291896708, + "learning_rate": 0.0001031099862269837, + "loss": 0.7257, + "step": 948 + }, + { + "epoch": 0.5061333333333333, + "grad_norm": 0.5555226218834401, + "learning_rate": 0.00010293726038184393, + "loss": 0.7924, + "step": 949 + }, + { + "epoch": 0.5066666666666667, + "grad_norm": 0.48312215529746294, + "learning_rate": 0.00010276452576559879, + "loss": 0.7212, + "step": 950 + }, + { + "epoch": 0.5072, + "grad_norm": 0.4778138090491278, + "learning_rate": 0.00010259178289406011, + "loss": 0.7238, + "step": 951 + }, + { + "epoch": 0.5077333333333334, + "grad_norm": 0.6689189049077776, + "learning_rate": 0.00010241903228306431, + "loss": 0.9201, + "step": 952 + }, + { + "epoch": 0.5082666666666666, + "grad_norm": 0.4532393405967173, + "learning_rate": 0.0001022462744484709, + "loss": 0.7076, + "step": 953 + }, + { + "epoch": 0.5088, + "grad_norm": 0.506970275115917, + "learning_rate": 0.00010207350990616107, + "loss": 0.79, + "step": 954 + }, + { + "epoch": 0.5093333333333333, + "grad_norm": 0.46879915621380663, + "learning_rate": 0.00010190073917203589, + "loss": 0.7753, + "step": 955 + }, + { + "epoch": 0.5098666666666667, + "grad_norm": 0.5004248052216412, + "learning_rate": 0.00010172796276201503, + "loss": 0.7596, + "step": 956 + }, + { + "epoch": 0.5104, + "grad_norm": 0.48676595236330417, + "learning_rate": 0.0001015551811920351, + "loss": 0.8244, + "step": 957 + }, + { + "epoch": 0.5109333333333334, + "grad_norm": 0.4012451103630835, + "learning_rate": 0.00010138239497804804, + "loss": 0.6793, + "step": 958 + }, + { + "epoch": 0.5114666666666666, + "grad_norm": 0.5113472466549227, + "learning_rate": 0.00010120960463601976, + "loss": 0.7168, + "step": 959 + }, + { + "epoch": 0.512, + "grad_norm": 0.5585082337357712, + "learning_rate": 0.00010103681068192845, + "loss": 0.786, + "step": 960 + }, + { + "epoch": 0.5125333333333333, + "grad_norm": 0.5132952281443672, + "learning_rate": 0.00010086401363176305, + "loss": 0.7239, + "step": 961 + }, + { + "epoch": 0.5130666666666667, + "grad_norm": 0.5627208817373884, + "learning_rate": 0.00010069121400152181, + "loss": 0.8331, + "step": 962 + }, + { + "epoch": 0.5136, + "grad_norm": 0.5185703694893471, + "learning_rate": 0.00010051841230721065, + "loss": 0.7443, + "step": 963 + }, + { + "epoch": 0.5141333333333333, + "grad_norm": 0.5778757800808362, + "learning_rate": 0.0001003456090648416, + "loss": 0.7399, + "step": 964 + }, + { + "epoch": 0.5146666666666667, + "grad_norm": 0.5194758506650604, + "learning_rate": 0.00010017280479043147, + "loss": 0.7753, + "step": 965 + }, + { + "epoch": 0.5152, + "grad_norm": 0.49661070959338893, + "learning_rate": 0.0001, + "loss": 0.7752, + "step": 966 + }, + { + "epoch": 0.5157333333333334, + "grad_norm": 0.48506621784549836, + "learning_rate": 9.982719520956855e-05, + "loss": 0.8398, + "step": 967 + }, + { + "epoch": 0.5162666666666667, + "grad_norm": 0.4875209967744212, + "learning_rate": 9.965439093515841e-05, + "loss": 0.6914, + "step": 968 + }, + { + "epoch": 0.5168, + "grad_norm": 0.5712704204753644, + "learning_rate": 9.948158769278939e-05, + "loss": 0.7663, + "step": 969 + }, + { + "epoch": 0.5173333333333333, + "grad_norm": 0.42577760098407713, + "learning_rate": 9.930878599847821e-05, + "loss": 0.7742, + "step": 970 + }, + { + "epoch": 0.5178666666666667, + "grad_norm": 0.5145933577513895, + "learning_rate": 9.913598636823693e-05, + "loss": 0.8162, + "step": 971 + }, + { + "epoch": 0.5184, + "grad_norm": 0.47544170946764996, + "learning_rate": 9.896318931807155e-05, + "loss": 0.749, + "step": 972 + }, + { + "epoch": 0.5189333333333334, + "grad_norm": 0.47257430999421846, + "learning_rate": 9.879039536398024e-05, + "loss": 0.787, + "step": 973 + }, + { + "epoch": 0.5194666666666666, + "grad_norm": 0.43052217825413647, + "learning_rate": 9.861760502195197e-05, + "loss": 0.7219, + "step": 974 + }, + { + "epoch": 0.52, + "grad_norm": 0.46573574169891335, + "learning_rate": 9.844481880796491e-05, + "loss": 0.7606, + "step": 975 + }, + { + "epoch": 0.5205333333333333, + "grad_norm": 0.4733087979911936, + "learning_rate": 9.827203723798498e-05, + "loss": 0.7155, + "step": 976 + }, + { + "epoch": 0.5210666666666667, + "grad_norm": 0.4930793593232939, + "learning_rate": 9.809926082796415e-05, + "loss": 0.7838, + "step": 977 + }, + { + "epoch": 0.5216, + "grad_norm": 0.5797961248719126, + "learning_rate": 9.792649009383899e-05, + "loss": 0.8436, + "step": 978 + }, + { + "epoch": 0.5221333333333333, + "grad_norm": 0.47705044869808927, + "learning_rate": 9.775372555152912e-05, + "loss": 0.7516, + "step": 979 + }, + { + "epoch": 0.5226666666666666, + "grad_norm": 0.604131424977959, + "learning_rate": 9.758096771693573e-05, + "loss": 0.9429, + "step": 980 + }, + { + "epoch": 0.5232, + "grad_norm": 0.44206954405281973, + "learning_rate": 9.740821710593989e-05, + "loss": 0.7349, + "step": 981 + }, + { + "epoch": 0.5237333333333334, + "grad_norm": 0.534673109019627, + "learning_rate": 9.723547423440122e-05, + "loss": 0.8102, + "step": 982 + }, + { + "epoch": 0.5242666666666667, + "grad_norm": 0.5319890275331417, + "learning_rate": 9.70627396181561e-05, + "loss": 0.7864, + "step": 983 + }, + { + "epoch": 0.5248, + "grad_norm": 0.5041767156879674, + "learning_rate": 9.689001377301633e-05, + "loss": 0.7042, + "step": 984 + }, + { + "epoch": 0.5253333333333333, + "grad_norm": 0.5009543847409859, + "learning_rate": 9.671729721476746e-05, + "loss": 0.7679, + "step": 985 + }, + { + "epoch": 0.5258666666666667, + "grad_norm": 0.46028265141081554, + "learning_rate": 9.654459045916743e-05, + "loss": 0.7723, + "step": 986 + }, + { + "epoch": 0.5264, + "grad_norm": 0.48486788993634405, + "learning_rate": 9.637189402194476e-05, + "loss": 0.7767, + "step": 987 + }, + { + "epoch": 0.5269333333333334, + "grad_norm": 0.5173426088972937, + "learning_rate": 9.619920841879725e-05, + "loss": 0.8181, + "step": 988 + }, + { + "epoch": 0.5274666666666666, + "grad_norm": 0.5401368625618846, + "learning_rate": 9.602653416539031e-05, + "loss": 0.7998, + "step": 989 + }, + { + "epoch": 0.528, + "grad_norm": 0.49433598961855285, + "learning_rate": 9.585387177735547e-05, + "loss": 0.793, + "step": 990 + }, + { + "epoch": 0.5285333333333333, + "grad_norm": 0.5585235720199297, + "learning_rate": 9.568122177028884e-05, + "loss": 0.8139, + "step": 991 + }, + { + "epoch": 0.5290666666666667, + "grad_norm": 0.5752829228742448, + "learning_rate": 9.550858465974958e-05, + "loss": 0.7584, + "step": 992 + }, + { + "epoch": 0.5296, + "grad_norm": 0.57631471280755, + "learning_rate": 9.533596096125825e-05, + "loss": 0.9417, + "step": 993 + }, + { + "epoch": 0.5301333333333333, + "grad_norm": 0.4752286727403488, + "learning_rate": 9.516335119029546e-05, + "loss": 0.8116, + "step": 994 + }, + { + "epoch": 0.5306666666666666, + "grad_norm": 0.5048984776307732, + "learning_rate": 9.499075586230013e-05, + "loss": 0.8326, + "step": 995 + }, + { + "epoch": 0.5312, + "grad_norm": 0.5016459109163391, + "learning_rate": 9.481817549266817e-05, + "loss": 0.7255, + "step": 996 + }, + { + "epoch": 0.5317333333333333, + "grad_norm": 0.48471486192604013, + "learning_rate": 9.464561059675073e-05, + "loss": 0.7633, + "step": 997 + }, + { + "epoch": 0.5322666666666667, + "grad_norm": 0.5048034917358186, + "learning_rate": 9.44730616898528e-05, + "loss": 0.7139, + "step": 998 + }, + { + "epoch": 0.5328, + "grad_norm": 0.5423514630028117, + "learning_rate": 9.430052928723153e-05, + "loss": 0.7734, + "step": 999 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 0.40773479485225583, + "learning_rate": 9.412801390409497e-05, + "loss": 0.7378, + "step": 1000 + }, + { + "epoch": 0.5338666666666667, + "grad_norm": 0.6044628207918135, + "learning_rate": 9.395551605560018e-05, + "loss": 0.9084, + "step": 1001 + }, + { + "epoch": 0.5344, + "grad_norm": 0.5387590843182661, + "learning_rate": 9.378303625685195e-05, + "loss": 0.8517, + "step": 1002 + }, + { + "epoch": 0.5349333333333334, + "grad_norm": 0.49638266994093244, + "learning_rate": 9.361057502290113e-05, + "loss": 0.7362, + "step": 1003 + }, + { + "epoch": 0.5354666666666666, + "grad_norm": 0.43680978163350903, + "learning_rate": 9.343813286874312e-05, + "loss": 0.7057, + "step": 1004 + }, + { + "epoch": 0.536, + "grad_norm": 0.42401098881189553, + "learning_rate": 9.326571030931637e-05, + "loss": 0.6663, + "step": 1005 + }, + { + "epoch": 0.5365333333333333, + "grad_norm": 0.5510951143045089, + "learning_rate": 9.309330785950086e-05, + "loss": 0.8326, + "step": 1006 + }, + { + "epoch": 0.5370666666666667, + "grad_norm": 0.41375524431208105, + "learning_rate": 9.292092603411641e-05, + "loss": 0.656, + "step": 1007 + }, + { + "epoch": 0.5376, + "grad_norm": 0.6699514086298767, + "learning_rate": 9.274856534792138e-05, + "loss": 0.8479, + "step": 1008 + }, + { + "epoch": 0.5381333333333334, + "grad_norm": 0.45454789301126036, + "learning_rate": 9.257622631561085e-05, + "loss": 0.7507, + "step": 1009 + }, + { + "epoch": 0.5386666666666666, + "grad_norm": 0.4518759794712517, + "learning_rate": 9.240390945181543e-05, + "loss": 0.6467, + "step": 1010 + }, + { + "epoch": 0.5392, + "grad_norm": 0.6425706473983626, + "learning_rate": 9.223161527109937e-05, + "loss": 0.9246, + "step": 1011 + }, + { + "epoch": 0.5397333333333333, + "grad_norm": 0.5434987449911853, + "learning_rate": 9.205934428795929e-05, + "loss": 0.8056, + "step": 1012 + }, + { + "epoch": 0.5402666666666667, + "grad_norm": 0.4492697269052785, + "learning_rate": 9.188709701682247e-05, + "loss": 0.771, + "step": 1013 + }, + { + "epoch": 0.5408, + "grad_norm": 0.4606910163821094, + "learning_rate": 9.171487397204539e-05, + "loss": 0.8024, + "step": 1014 + }, + { + "epoch": 0.5413333333333333, + "grad_norm": 0.5384198728605855, + "learning_rate": 9.154267566791223e-05, + "loss": 0.763, + "step": 1015 + }, + { + "epoch": 0.5418666666666667, + "grad_norm": 0.6794293506762635, + "learning_rate": 9.137050261863324e-05, + "loss": 0.8719, + "step": 1016 + }, + { + "epoch": 0.5424, + "grad_norm": 0.6088738162385567, + "learning_rate": 9.119835533834331e-05, + "loss": 0.8365, + "step": 1017 + }, + { + "epoch": 0.5429333333333334, + "grad_norm": 0.5026043729795472, + "learning_rate": 9.102623434110028e-05, + "loss": 0.7689, + "step": 1018 + }, + { + "epoch": 0.5434666666666667, + "grad_norm": 0.4493042833001174, + "learning_rate": 9.085414014088369e-05, + "loss": 0.6673, + "step": 1019 + }, + { + "epoch": 0.544, + "grad_norm": 0.4475146656097217, + "learning_rate": 9.068207325159284e-05, + "loss": 0.6928, + "step": 1020 + }, + { + "epoch": 0.5445333333333333, + "grad_norm": 0.4493959992386878, + "learning_rate": 9.051003418704565e-05, + "loss": 0.7361, + "step": 1021 + }, + { + "epoch": 0.5450666666666667, + "grad_norm": 0.5658826869183532, + "learning_rate": 9.033802346097682e-05, + "loss": 0.8859, + "step": 1022 + }, + { + "epoch": 0.5456, + "grad_norm": 0.49869889355059516, + "learning_rate": 9.016604158703654e-05, + "loss": 0.7865, + "step": 1023 + }, + { + "epoch": 0.5461333333333334, + "grad_norm": 0.4694526316805162, + "learning_rate": 8.999408907878877e-05, + "loss": 0.7074, + "step": 1024 + }, + { + "epoch": 0.5466666666666666, + "grad_norm": 0.601500875267324, + "learning_rate": 8.982216644970979e-05, + "loss": 0.7761, + "step": 1025 + }, + { + "epoch": 0.5472, + "grad_norm": 0.49570321716905996, + "learning_rate": 8.965027421318665e-05, + "loss": 0.7538, + "step": 1026 + }, + { + "epoch": 0.5477333333333333, + "grad_norm": 0.5488827437321244, + "learning_rate": 8.947841288251568e-05, + "loss": 0.7841, + "step": 1027 + }, + { + "epoch": 0.5482666666666667, + "grad_norm": 0.4978453428689801, + "learning_rate": 8.930658297090091e-05, + "loss": 0.7965, + "step": 1028 + }, + { + "epoch": 0.5488, + "grad_norm": 0.5456067995327268, + "learning_rate": 8.913478499145254e-05, + "loss": 0.649, + "step": 1029 + }, + { + "epoch": 0.5493333333333333, + "grad_norm": 0.5890815714771167, + "learning_rate": 8.896301945718541e-05, + "loss": 0.7557, + "step": 1030 + }, + { + "epoch": 0.5498666666666666, + "grad_norm": 0.4134396118443379, + "learning_rate": 8.879128688101749e-05, + "loss": 0.6983, + "step": 1031 + }, + { + "epoch": 0.5504, + "grad_norm": 0.4386094969310609, + "learning_rate": 8.861958777576827e-05, + "loss": 0.7434, + "step": 1032 + }, + { + "epoch": 0.5509333333333334, + "grad_norm": 0.797563002235324, + "learning_rate": 8.844792265415738e-05, + "loss": 0.9317, + "step": 1033 + }, + { + "epoch": 0.5514666666666667, + "grad_norm": 0.4435294090073385, + "learning_rate": 8.827629202880293e-05, + "loss": 0.7477, + "step": 1034 + }, + { + "epoch": 0.552, + "grad_norm": 0.4141554415288854, + "learning_rate": 8.810469641222001e-05, + "loss": 0.579, + "step": 1035 + }, + { + "epoch": 0.5525333333333333, + "grad_norm": 0.6895853306319035, + "learning_rate": 8.793313631681915e-05, + "loss": 0.8415, + "step": 1036 + }, + { + "epoch": 0.5530666666666667, + "grad_norm": 0.465860004697133, + "learning_rate": 8.776161225490489e-05, + "loss": 0.8201, + "step": 1037 + }, + { + "epoch": 0.5536, + "grad_norm": 0.4413236626817581, + "learning_rate": 8.759012473867407e-05, + "loss": 0.765, + "step": 1038 + }, + { + "epoch": 0.5541333333333334, + "grad_norm": 0.5590586045272012, + "learning_rate": 8.741867428021446e-05, + "loss": 0.9927, + "step": 1039 + }, + { + "epoch": 0.5546666666666666, + "grad_norm": 0.5814757610274929, + "learning_rate": 8.724726139150318e-05, + "loss": 0.623, + "step": 1040 + }, + { + "epoch": 0.5552, + "grad_norm": 0.5456403867820663, + "learning_rate": 8.707588658440511e-05, + "loss": 0.8095, + "step": 1041 + }, + { + "epoch": 0.5557333333333333, + "grad_norm": 0.5698682014313946, + "learning_rate": 8.690455037067141e-05, + "loss": 0.8043, + "step": 1042 + }, + { + "epoch": 0.5562666666666667, + "grad_norm": 0.5645368222782049, + "learning_rate": 8.673325326193806e-05, + "loss": 0.7581, + "step": 1043 + }, + { + "epoch": 0.5568, + "grad_norm": 0.5377566999357087, + "learning_rate": 8.656199576972423e-05, + "loss": 0.7506, + "step": 1044 + }, + { + "epoch": 0.5573333333333333, + "grad_norm": 0.6185626048287002, + "learning_rate": 8.639077840543077e-05, + "loss": 0.7798, + "step": 1045 + }, + { + "epoch": 0.5578666666666666, + "grad_norm": 0.4220564255020212, + "learning_rate": 8.621960168033867e-05, + "loss": 0.7496, + "step": 1046 + }, + { + "epoch": 0.5584, + "grad_norm": 0.5271457551016745, + "learning_rate": 8.604846610560771e-05, + "loss": 0.7927, + "step": 1047 + }, + { + "epoch": 0.5589333333333333, + "grad_norm": 0.5147393348972404, + "learning_rate": 8.587737219227462e-05, + "loss": 0.7178, + "step": 1048 + }, + { + "epoch": 0.5594666666666667, + "grad_norm": 0.46443478845199315, + "learning_rate": 8.570632045125185e-05, + "loss": 0.7054, + "step": 1049 + }, + { + "epoch": 0.56, + "grad_norm": 0.7135634445888618, + "learning_rate": 8.553531139332582e-05, + "loss": 0.8207, + "step": 1050 + }, + { + "epoch": 0.5605333333333333, + "grad_norm": 0.42761424683265864, + "learning_rate": 8.536434552915556e-05, + "loss": 0.6581, + "step": 1051 + }, + { + "epoch": 0.5610666666666667, + "grad_norm": 0.42313988160867755, + "learning_rate": 8.519342336927105e-05, + "loss": 0.7775, + "step": 1052 + }, + { + "epoch": 0.5616, + "grad_norm": 0.4538095633072262, + "learning_rate": 8.502254542407186e-05, + "loss": 0.7687, + "step": 1053 + }, + { + "epoch": 0.5621333333333334, + "grad_norm": 0.533422559952103, + "learning_rate": 8.485171220382545e-05, + "loss": 0.7408, + "step": 1054 + }, + { + "epoch": 0.5626666666666666, + "grad_norm": 0.45460544867317326, + "learning_rate": 8.468092421866573e-05, + "loss": 0.7805, + "step": 1055 + }, + { + "epoch": 0.5632, + "grad_norm": 0.4527045555244706, + "learning_rate": 8.451018197859153e-05, + "loss": 0.7938, + "step": 1056 + }, + { + "epoch": 0.5637333333333333, + "grad_norm": 0.5832336658005973, + "learning_rate": 8.433948599346516e-05, + "loss": 0.8422, + "step": 1057 + }, + { + "epoch": 0.5642666666666667, + "grad_norm": 0.48220157807827396, + "learning_rate": 8.416883677301069e-05, + "loss": 0.7968, + "step": 1058 + }, + { + "epoch": 0.5648, + "grad_norm": 0.6651001691852388, + "learning_rate": 8.399823482681262e-05, + "loss": 1.0221, + "step": 1059 + }, + { + "epoch": 0.5653333333333334, + "grad_norm": 0.4737128702852164, + "learning_rate": 8.382768066431425e-05, + "loss": 0.742, + "step": 1060 + }, + { + "epoch": 0.5658666666666666, + "grad_norm": 0.5965628803991697, + "learning_rate": 8.36571747948162e-05, + "loss": 0.7871, + "step": 1061 + }, + { + "epoch": 0.5664, + "grad_norm": 0.4601936919160773, + "learning_rate": 8.348671772747487e-05, + "loss": 0.7287, + "step": 1062 + }, + { + "epoch": 0.5669333333333333, + "grad_norm": 0.5502984364373679, + "learning_rate": 8.33163099713009e-05, + "loss": 0.7854, + "step": 1063 + }, + { + "epoch": 0.5674666666666667, + "grad_norm": 0.6058604776648268, + "learning_rate": 8.31459520351578e-05, + "loss": 0.7834, + "step": 1064 + }, + { + "epoch": 0.568, + "grad_norm": 0.6349157785706616, + "learning_rate": 8.297564442776014e-05, + "loss": 0.9179, + "step": 1065 + }, + { + "epoch": 0.5685333333333333, + "grad_norm": 0.5103853059024941, + "learning_rate": 8.280538765767235e-05, + "loss": 0.8123, + "step": 1066 + }, + { + "epoch": 0.5690666666666667, + "grad_norm": 0.5012969596052058, + "learning_rate": 8.263518223330697e-05, + "loss": 0.8938, + "step": 1067 + }, + { + "epoch": 0.5696, + "grad_norm": 0.49085332449675545, + "learning_rate": 8.246502866292324e-05, + "loss": 0.7317, + "step": 1068 + }, + { + "epoch": 0.5701333333333334, + "grad_norm": 0.46192859143681336, + "learning_rate": 8.22949274546255e-05, + "loss": 0.7479, + "step": 1069 + }, + { + "epoch": 0.5706666666666667, + "grad_norm": 0.7572527685234165, + "learning_rate": 8.212487911636184e-05, + "loss": 0.799, + "step": 1070 + }, + { + "epoch": 0.5712, + "grad_norm": 0.43048805052859346, + "learning_rate": 8.195488415592238e-05, + "loss": 0.6879, + "step": 1071 + }, + { + "epoch": 0.5717333333333333, + "grad_norm": 0.6290092415581088, + "learning_rate": 8.178494308093789e-05, + "loss": 0.8611, + "step": 1072 + }, + { + "epoch": 0.5722666666666667, + "grad_norm": 0.6896349768871635, + "learning_rate": 8.161505639887817e-05, + "loss": 0.8524, + "step": 1073 + }, + { + "epoch": 0.5728, + "grad_norm": 0.45147054762589583, + "learning_rate": 8.144522461705067e-05, + "loss": 0.744, + "step": 1074 + }, + { + "epoch": 0.5733333333333334, + "grad_norm": 0.5046745539072711, + "learning_rate": 8.127544824259889e-05, + "loss": 0.8061, + "step": 1075 + }, + { + "epoch": 0.5738666666666666, + "grad_norm": 0.4450247446607071, + "learning_rate": 8.110572778250085e-05, + "loss": 0.729, + "step": 1076 + }, + { + "epoch": 0.5744, + "grad_norm": 0.4991655316273437, + "learning_rate": 8.093606374356759e-05, + "loss": 0.7505, + "step": 1077 + }, + { + "epoch": 0.5749333333333333, + "grad_norm": 0.6021083483612211, + "learning_rate": 8.076645663244168e-05, + "loss": 0.9119, + "step": 1078 + }, + { + "epoch": 0.5754666666666667, + "grad_norm": 0.4714342234434051, + "learning_rate": 8.059690695559568e-05, + "loss": 0.7631, + "step": 1079 + }, + { + "epoch": 0.576, + "grad_norm": 0.5178330816276597, + "learning_rate": 8.042741521933071e-05, + "loss": 0.7748, + "step": 1080 + }, + { + "epoch": 0.5765333333333333, + "grad_norm": 0.4132243325540413, + "learning_rate": 8.025798192977481e-05, + "loss": 0.613, + "step": 1081 + }, + { + "epoch": 0.5770666666666666, + "grad_norm": 0.44929399257605435, + "learning_rate": 8.008860759288147e-05, + "loss": 0.6669, + "step": 1082 + }, + { + "epoch": 0.5776, + "grad_norm": 0.5344871820450173, + "learning_rate": 7.991929271442817e-05, + "loss": 0.806, + "step": 1083 + }, + { + "epoch": 0.5781333333333334, + "grad_norm": 0.5158510559476444, + "learning_rate": 7.975003780001485e-05, + "loss": 0.704, + "step": 1084 + }, + { + "epoch": 0.5786666666666667, + "grad_norm": 0.5881917075890982, + "learning_rate": 7.958084335506239e-05, + "loss": 0.8393, + "step": 1085 + }, + { + "epoch": 0.5792, + "grad_norm": 0.4448194722905415, + "learning_rate": 7.941170988481108e-05, + "loss": 0.7278, + "step": 1086 + }, + { + "epoch": 0.5797333333333333, + "grad_norm": 0.4415817363764343, + "learning_rate": 7.924263789431912e-05, + "loss": 0.759, + "step": 1087 + }, + { + "epoch": 0.5802666666666667, + "grad_norm": 0.43254297501106265, + "learning_rate": 7.907362788846116e-05, + "loss": 0.7259, + "step": 1088 + }, + { + "epoch": 0.5808, + "grad_norm": 0.4955340970115768, + "learning_rate": 7.89046803719267e-05, + "loss": 0.7804, + "step": 1089 + }, + { + "epoch": 0.5813333333333334, + "grad_norm": 0.4127057054580811, + "learning_rate": 7.873579584921869e-05, + "loss": 0.5882, + "step": 1090 + }, + { + "epoch": 0.5818666666666666, + "grad_norm": 0.4916154955577541, + "learning_rate": 7.856697482465196e-05, + "loss": 0.7531, + "step": 1091 + }, + { + "epoch": 0.5824, + "grad_norm": 0.6032246923125663, + "learning_rate": 7.839821780235168e-05, + "loss": 0.9182, + "step": 1092 + }, + { + "epoch": 0.5829333333333333, + "grad_norm": 0.5482636793168754, + "learning_rate": 7.822952528625191e-05, + "loss": 0.7806, + "step": 1093 + }, + { + "epoch": 0.5834666666666667, + "grad_norm": 0.5902884522150984, + "learning_rate": 7.806089778009421e-05, + "loss": 0.8417, + "step": 1094 + }, + { + "epoch": 0.584, + "grad_norm": 0.6028871218261679, + "learning_rate": 7.789233578742582e-05, + "loss": 0.9686, + "step": 1095 + }, + { + "epoch": 0.5845333333333333, + "grad_norm": 0.5386609305695303, + "learning_rate": 7.772383981159849e-05, + "loss": 0.7806, + "step": 1096 + }, + { + "epoch": 0.5850666666666666, + "grad_norm": 0.5367112850841399, + "learning_rate": 7.755541035576677e-05, + "loss": 0.832, + "step": 1097 + }, + { + "epoch": 0.5856, + "grad_norm": 0.47813459747579856, + "learning_rate": 7.738704792288655e-05, + "loss": 0.7458, + "step": 1098 + }, + { + "epoch": 0.5861333333333333, + "grad_norm": 0.4460385373404532, + "learning_rate": 7.721875301571359e-05, + "loss": 0.7462, + "step": 1099 + }, + { + "epoch": 0.5866666666666667, + "grad_norm": 0.4456182927472116, + "learning_rate": 7.705052613680211e-05, + "loss": 0.6813, + "step": 1100 + }, + { + "epoch": 0.5872, + "grad_norm": 0.46856392814854575, + "learning_rate": 7.688236778850306e-05, + "loss": 0.7422, + "step": 1101 + }, + { + "epoch": 0.5877333333333333, + "grad_norm": 0.4539049855895284, + "learning_rate": 7.671427847296275e-05, + "loss": 0.689, + "step": 1102 + }, + { + "epoch": 0.5882666666666667, + "grad_norm": 0.46288916807437674, + "learning_rate": 7.654625869212146e-05, + "loss": 0.7945, + "step": 1103 + }, + { + "epoch": 0.5888, + "grad_norm": 0.5320408164886675, + "learning_rate": 7.637830894771175e-05, + "loss": 0.7668, + "step": 1104 + }, + { + "epoch": 0.5893333333333334, + "grad_norm": 0.5043155275745976, + "learning_rate": 7.6210429741257e-05, + "loss": 0.7325, + "step": 1105 + }, + { + "epoch": 0.5898666666666667, + "grad_norm": 0.46453023128837523, + "learning_rate": 7.604262157407007e-05, + "loss": 0.8079, + "step": 1106 + }, + { + "epoch": 0.5904, + "grad_norm": 0.4749269451246347, + "learning_rate": 7.587488494725157e-05, + "loss": 0.7462, + "step": 1107 + }, + { + "epoch": 0.5909333333333333, + "grad_norm": 0.514586186902165, + "learning_rate": 7.570722036168854e-05, + "loss": 0.7458, + "step": 1108 + }, + { + "epoch": 0.5914666666666667, + "grad_norm": 0.6714374876984359, + "learning_rate": 7.55396283180529e-05, + "loss": 0.9248, + "step": 1109 + }, + { + "epoch": 0.592, + "grad_norm": 0.4893196173882575, + "learning_rate": 7.537210931679987e-05, + "loss": 0.804, + "step": 1110 + }, + { + "epoch": 0.5925333333333334, + "grad_norm": 0.4027566299206927, + "learning_rate": 7.520466385816671e-05, + "loss": 0.6574, + "step": 1111 + }, + { + "epoch": 0.5930666666666666, + "grad_norm": 0.46239830297712675, + "learning_rate": 7.503729244217086e-05, + "loss": 0.6244, + "step": 1112 + }, + { + "epoch": 0.5936, + "grad_norm": 0.5212034179279587, + "learning_rate": 7.48699955686089e-05, + "loss": 0.7334, + "step": 1113 + }, + { + "epoch": 0.5941333333333333, + "grad_norm": 0.5563132862800897, + "learning_rate": 7.470277373705461e-05, + "loss": 0.795, + "step": 1114 + }, + { + "epoch": 0.5946666666666667, + "grad_norm": 0.485130898303703, + "learning_rate": 7.453562744685778e-05, + "loss": 0.7568, + "step": 1115 + }, + { + "epoch": 0.5952, + "grad_norm": 0.6083852959382066, + "learning_rate": 7.43685571971426e-05, + "loss": 0.7011, + "step": 1116 + }, + { + "epoch": 0.5957333333333333, + "grad_norm": 0.4750973409821206, + "learning_rate": 7.42015634868062e-05, + "loss": 0.6985, + "step": 1117 + }, + { + "epoch": 0.5962666666666666, + "grad_norm": 0.5748068195170376, + "learning_rate": 7.403464681451715e-05, + "loss": 0.7349, + "step": 1118 + }, + { + "epoch": 0.5968, + "grad_norm": 0.5506747179570219, + "learning_rate": 7.386780767871397e-05, + "loss": 0.7912, + "step": 1119 + }, + { + "epoch": 0.5973333333333334, + "grad_norm": 0.41354805481557877, + "learning_rate": 7.370104657760361e-05, + "loss": 0.7004, + "step": 1120 + }, + { + "epoch": 0.5978666666666667, + "grad_norm": 0.4160478551351224, + "learning_rate": 7.353436400916004e-05, + "loss": 0.6634, + "step": 1121 + }, + { + "epoch": 0.5984, + "grad_norm": 0.40059068613912047, + "learning_rate": 7.336776047112276e-05, + "loss": 0.7775, + "step": 1122 + }, + { + "epoch": 0.5989333333333333, + "grad_norm": 0.46017198399877646, + "learning_rate": 7.320123646099519e-05, + "loss": 0.7304, + "step": 1123 + }, + { + "epoch": 0.5994666666666667, + "grad_norm": 0.5902819462109755, + "learning_rate": 7.303479247604332e-05, + "loss": 0.8914, + "step": 1124 + }, + { + "epoch": 0.6, + "grad_norm": 0.5406386880085818, + "learning_rate": 7.286842901329412e-05, + "loss": 0.8171, + "step": 1125 + }, + { + "epoch": 0.6005333333333334, + "grad_norm": 0.41901452797853955, + "learning_rate": 7.270214656953415e-05, + "loss": 0.6702, + "step": 1126 + }, + { + "epoch": 0.6010666666666666, + "grad_norm": 0.553698688570677, + "learning_rate": 7.253594564130804e-05, + "loss": 0.8016, + "step": 1127 + }, + { + "epoch": 0.6016, + "grad_norm": 0.5513780808148544, + "learning_rate": 7.236982672491698e-05, + "loss": 0.7856, + "step": 1128 + }, + { + "epoch": 0.6021333333333333, + "grad_norm": 0.46661180535210856, + "learning_rate": 7.22037903164173e-05, + "loss": 0.6399, + "step": 1129 + }, + { + "epoch": 0.6026666666666667, + "grad_norm": 0.5504463888041178, + "learning_rate": 7.203783691161883e-05, + "loss": 0.8997, + "step": 1130 + }, + { + "epoch": 0.6032, + "grad_norm": 0.5375101790560792, + "learning_rate": 7.187196700608373e-05, + "loss": 0.732, + "step": 1131 + }, + { + "epoch": 0.6037333333333333, + "grad_norm": 0.5098924546783888, + "learning_rate": 7.170618109512465e-05, + "loss": 0.7611, + "step": 1132 + }, + { + "epoch": 0.6042666666666666, + "grad_norm": 0.5384476107750127, + "learning_rate": 7.154047967380354e-05, + "loss": 0.852, + "step": 1133 + }, + { + "epoch": 0.6048, + "grad_norm": 0.5350115576094381, + "learning_rate": 7.137486323692995e-05, + "loss": 0.7675, + "step": 1134 + }, + { + "epoch": 0.6053333333333333, + "grad_norm": 0.40574771665868714, + "learning_rate": 7.12093322790597e-05, + "loss": 0.707, + "step": 1135 + }, + { + "epoch": 0.6058666666666667, + "grad_norm": 0.49779397012835314, + "learning_rate": 7.104388729449338e-05, + "loss": 0.7301, + "step": 1136 + }, + { + "epoch": 0.6064, + "grad_norm": 0.3731798846707171, + "learning_rate": 7.087852877727481e-05, + "loss": 0.6111, + "step": 1137 + }, + { + "epoch": 0.6069333333333333, + "grad_norm": 0.4850756352758892, + "learning_rate": 7.071325722118963e-05, + "loss": 0.7519, + "step": 1138 + }, + { + "epoch": 0.6074666666666667, + "grad_norm": 0.6139684422255738, + "learning_rate": 7.054807311976379e-05, + "loss": 0.7989, + "step": 1139 + }, + { + "epoch": 0.608, + "grad_norm": 0.4434011240395176, + "learning_rate": 7.038297696626206e-05, + "loss": 0.6933, + "step": 1140 + }, + { + "epoch": 0.6085333333333334, + "grad_norm": 0.49808530175436067, + "learning_rate": 7.021796925368667e-05, + "loss": 0.8281, + "step": 1141 + }, + { + "epoch": 0.6090666666666666, + "grad_norm": 0.4681737004455478, + "learning_rate": 7.005305047477566e-05, + "loss": 0.7115, + "step": 1142 + }, + { + "epoch": 0.6096, + "grad_norm": 0.6399339214620106, + "learning_rate": 6.988822112200156e-05, + "loss": 0.7812, + "step": 1143 + }, + { + "epoch": 0.6101333333333333, + "grad_norm": 0.45419802550361577, + "learning_rate": 6.972348168756983e-05, + "loss": 0.7287, + "step": 1144 + }, + { + "epoch": 0.6106666666666667, + "grad_norm": 0.48160977267166555, + "learning_rate": 6.955883266341741e-05, + "loss": 0.694, + "step": 1145 + }, + { + "epoch": 0.6112, + "grad_norm": 0.47089987083089757, + "learning_rate": 6.939427454121128e-05, + "loss": 0.7516, + "step": 1146 + }, + { + "epoch": 0.6117333333333334, + "grad_norm": 0.4769968094972423, + "learning_rate": 6.922980781234699e-05, + "loss": 0.7607, + "step": 1147 + }, + { + "epoch": 0.6122666666666666, + "grad_norm": 0.5113445305578702, + "learning_rate": 6.906543296794714e-05, + "loss": 0.7669, + "step": 1148 + }, + { + "epoch": 0.6128, + "grad_norm": 0.5951036250978381, + "learning_rate": 6.890115049885994e-05, + "loss": 0.8113, + "step": 1149 + }, + { + "epoch": 0.6133333333333333, + "grad_norm": 0.4085341532710229, + "learning_rate": 6.873696089565786e-05, + "loss": 0.6495, + "step": 1150 + }, + { + "epoch": 0.6138666666666667, + "grad_norm": 0.45731725676548396, + "learning_rate": 6.85728646486359e-05, + "loss": 0.7541, + "step": 1151 + }, + { + "epoch": 0.6144, + "grad_norm": 0.47515240745984405, + "learning_rate": 6.84088622478104e-05, + "loss": 0.7613, + "step": 1152 + }, + { + "epoch": 0.6149333333333333, + "grad_norm": 0.45536664362280865, + "learning_rate": 6.82449541829174e-05, + "loss": 0.6845, + "step": 1153 + }, + { + "epoch": 0.6154666666666667, + "grad_norm": 0.5941606028102293, + "learning_rate": 6.80811409434113e-05, + "loss": 0.9209, + "step": 1154 + }, + { + "epoch": 0.616, + "grad_norm": 0.4863295015502449, + "learning_rate": 6.791742301846326e-05, + "loss": 0.799, + "step": 1155 + }, + { + "epoch": 0.6165333333333334, + "grad_norm": 0.45670743525230284, + "learning_rate": 6.775380089695986e-05, + "loss": 0.6918, + "step": 1156 + }, + { + "epoch": 0.6170666666666667, + "grad_norm": 0.4927898296403875, + "learning_rate": 6.759027506750158e-05, + "loss": 0.7508, + "step": 1157 + }, + { + "epoch": 0.6176, + "grad_norm": 0.6233914888731245, + "learning_rate": 6.742684601840141e-05, + "loss": 0.8231, + "step": 1158 + }, + { + "epoch": 0.6181333333333333, + "grad_norm": 0.536198709515224, + "learning_rate": 6.726351423768322e-05, + "loss": 0.793, + "step": 1159 + }, + { + "epoch": 0.6186666666666667, + "grad_norm": 0.48756188330078964, + "learning_rate": 6.710028021308061e-05, + "loss": 0.6774, + "step": 1160 + }, + { + "epoch": 0.6192, + "grad_norm": 0.5712139069480937, + "learning_rate": 6.693714443203507e-05, + "loss": 0.8973, + "step": 1161 + }, + { + "epoch": 0.6197333333333334, + "grad_norm": 0.5492260719671762, + "learning_rate": 6.677410738169485e-05, + "loss": 0.7927, + "step": 1162 + }, + { + "epoch": 0.6202666666666666, + "grad_norm": 0.43602755004802324, + "learning_rate": 6.661116954891328e-05, + "loss": 0.7323, + "step": 1163 + }, + { + "epoch": 0.6208, + "grad_norm": 0.42252395128406706, + "learning_rate": 6.644833142024751e-05, + "loss": 0.7631, + "step": 1164 + }, + { + "epoch": 0.6213333333333333, + "grad_norm": 0.4062666723274316, + "learning_rate": 6.62855934819569e-05, + "loss": 0.6829, + "step": 1165 + }, + { + "epoch": 0.6218666666666667, + "grad_norm": 0.4744672048807581, + "learning_rate": 6.612295622000162e-05, + "loss": 0.776, + "step": 1166 + }, + { + "epoch": 0.6224, + "grad_norm": 0.49002783081861784, + "learning_rate": 6.59604201200412e-05, + "loss": 0.7178, + "step": 1167 + }, + { + "epoch": 0.6229333333333333, + "grad_norm": 0.5824978436248245, + "learning_rate": 6.579798566743314e-05, + "loss": 0.75, + "step": 1168 + }, + { + "epoch": 0.6234666666666666, + "grad_norm": 0.46847652704699216, + "learning_rate": 6.563565334723134e-05, + "loss": 0.7302, + "step": 1169 + }, + { + "epoch": 0.624, + "grad_norm": 0.6688715442609833, + "learning_rate": 6.547342364418481e-05, + "loss": 0.8042, + "step": 1170 + }, + { + "epoch": 0.6245333333333334, + "grad_norm": 0.6125074641738367, + "learning_rate": 6.531129704273604e-05, + "loss": 0.7949, + "step": 1171 + }, + { + "epoch": 0.6250666666666667, + "grad_norm": 0.5060782840915192, + "learning_rate": 6.514927402701964e-05, + "loss": 0.8076, + "step": 1172 + }, + { + "epoch": 0.6256, + "grad_norm": 0.6730189494287294, + "learning_rate": 6.498735508086093e-05, + "loss": 0.7791, + "step": 1173 + }, + { + "epoch": 0.6261333333333333, + "grad_norm": 0.645212680644595, + "learning_rate": 6.48255406877745e-05, + "loss": 0.7827, + "step": 1174 + }, + { + "epoch": 0.6266666666666667, + "grad_norm": 0.45367478391855637, + "learning_rate": 6.466383133096267e-05, + "loss": 0.7507, + "step": 1175 + }, + { + "epoch": 0.6272, + "grad_norm": 0.45461517927777706, + "learning_rate": 6.450222749331414e-05, + "loss": 0.6742, + "step": 1176 + }, + { + "epoch": 0.6277333333333334, + "grad_norm": 0.5659063461270785, + "learning_rate": 6.434072965740242e-05, + "loss": 0.833, + "step": 1177 + }, + { + "epoch": 0.6282666666666666, + "grad_norm": 0.5025047825967893, + "learning_rate": 6.417933830548467e-05, + "loss": 0.7629, + "step": 1178 + }, + { + "epoch": 0.6288, + "grad_norm": 0.5556115753964004, + "learning_rate": 6.40180539194999e-05, + "loss": 0.7558, + "step": 1179 + }, + { + "epoch": 0.6293333333333333, + "grad_norm": 0.49676283537854166, + "learning_rate": 6.385687698106781e-05, + "loss": 0.8382, + "step": 1180 + }, + { + "epoch": 0.6298666666666667, + "grad_norm": 0.47517253374761986, + "learning_rate": 6.369580797148718e-05, + "loss": 0.8045, + "step": 1181 + }, + { + "epoch": 0.6304, + "grad_norm": 0.4261562907967871, + "learning_rate": 6.35348473717345e-05, + "loss": 0.6711, + "step": 1182 + }, + { + "epoch": 0.6309333333333333, + "grad_norm": 0.5090365893278739, + "learning_rate": 6.337399566246257e-05, + "loss": 0.7895, + "step": 1183 + }, + { + "epoch": 0.6314666666666666, + "grad_norm": 0.5179103729664175, + "learning_rate": 6.321325332399903e-05, + "loss": 0.6724, + "step": 1184 + }, + { + "epoch": 0.632, + "grad_norm": 0.4885247064850998, + "learning_rate": 6.305262083634488e-05, + "loss": 0.7453, + "step": 1185 + }, + { + "epoch": 0.6325333333333333, + "grad_norm": 0.40492027858978047, + "learning_rate": 6.289209867917312e-05, + "loss": 0.648, + "step": 1186 + }, + { + "epoch": 0.6330666666666667, + "grad_norm": 0.4311728644577337, + "learning_rate": 6.273168733182722e-05, + "loss": 0.7179, + "step": 1187 + }, + { + "epoch": 0.6336, + "grad_norm": 0.4371474337856531, + "learning_rate": 6.25713872733199e-05, + "loss": 0.6316, + "step": 1188 + }, + { + "epoch": 0.6341333333333333, + "grad_norm": 0.4676358611503632, + "learning_rate": 6.241119898233144e-05, + "loss": 0.777, + "step": 1189 + }, + { + "epoch": 0.6346666666666667, + "grad_norm": 0.6847560977935614, + "learning_rate": 6.225112293720836e-05, + "loss": 0.7608, + "step": 1190 + }, + { + "epoch": 0.6352, + "grad_norm": 0.5544948643545228, + "learning_rate": 6.209115961596208e-05, + "loss": 0.7362, + "step": 1191 + }, + { + "epoch": 0.6357333333333334, + "grad_norm": 0.9224829073702737, + "learning_rate": 6.19313094962673e-05, + "loss": 0.9211, + "step": 1192 + }, + { + "epoch": 0.6362666666666666, + "grad_norm": 0.48080644250290394, + "learning_rate": 6.177157305546078e-05, + "loss": 0.7111, + "step": 1193 + }, + { + "epoch": 0.6368, + "grad_norm": 0.4739551772230298, + "learning_rate": 6.161195077053976e-05, + "loss": 0.778, + "step": 1194 + }, + { + "epoch": 0.6373333333333333, + "grad_norm": 0.5312235651493865, + "learning_rate": 6.145244311816063e-05, + "loss": 0.7499, + "step": 1195 + }, + { + "epoch": 0.6378666666666667, + "grad_norm": 0.6559582873219216, + "learning_rate": 6.129305057463741e-05, + "loss": 0.8333, + "step": 1196 + }, + { + "epoch": 0.6384, + "grad_norm": 0.5586187196621849, + "learning_rate": 6.113377361594049e-05, + "loss": 0.7403, + "step": 1197 + }, + { + "epoch": 0.6389333333333334, + "grad_norm": 0.4835618153284476, + "learning_rate": 6.0974612717695004e-05, + "loss": 0.6013, + "step": 1198 + }, + { + "epoch": 0.6394666666666666, + "grad_norm": 0.44762718065527035, + "learning_rate": 6.0815568355179556e-05, + "loss": 0.7476, + "step": 1199 + }, + { + "epoch": 0.64, + "grad_norm": 0.4565399260657701, + "learning_rate": 6.065664100332478e-05, + "loss": 0.8032, + "step": 1200 + }, + { + "epoch": 0.6405333333333333, + "grad_norm": 0.5357186140583011, + "learning_rate": 6.0497831136711836e-05, + "loss": 0.793, + "step": 1201 + }, + { + "epoch": 0.6410666666666667, + "grad_norm": 0.5570464751871326, + "learning_rate": 6.0339139229571116e-05, + "loss": 0.8484, + "step": 1202 + }, + { + "epoch": 0.6416, + "grad_norm": 0.4569021890750186, + "learning_rate": 6.018056575578075e-05, + "loss": 0.7876, + "step": 1203 + }, + { + "epoch": 0.6421333333333333, + "grad_norm": 0.4235006099986388, + "learning_rate": 6.002211118886514e-05, + "loss": 0.7685, + "step": 1204 + }, + { + "epoch": 0.6426666666666667, + "grad_norm": 0.4407742074198759, + "learning_rate": 5.986377600199371e-05, + "loss": 0.7324, + "step": 1205 + }, + { + "epoch": 0.6432, + "grad_norm": 0.5090580952169202, + "learning_rate": 5.970556066797941e-05, + "loss": 0.8099, + "step": 1206 + }, + { + "epoch": 0.6437333333333334, + "grad_norm": 0.4501523120592187, + "learning_rate": 5.9547465659277215e-05, + "loss": 0.8108, + "step": 1207 + }, + { + "epoch": 0.6442666666666667, + "grad_norm": 0.4390198907367184, + "learning_rate": 5.938949144798279e-05, + "loss": 0.6698, + "step": 1208 + }, + { + "epoch": 0.6448, + "grad_norm": 0.4872262737377709, + "learning_rate": 5.923163850583113e-05, + "loss": 0.7206, + "step": 1209 + }, + { + "epoch": 0.6453333333333333, + "grad_norm": 0.4078671554593861, + "learning_rate": 5.907390730419507e-05, + "loss": 0.6716, + "step": 1210 + }, + { + "epoch": 0.6458666666666667, + "grad_norm": 0.5328962011820283, + "learning_rate": 5.8916298314083915e-05, + "loss": 0.8441, + "step": 1211 + }, + { + "epoch": 0.6464, + "grad_norm": 0.4505548197220081, + "learning_rate": 5.875881200614207e-05, + "loss": 0.7188, + "step": 1212 + }, + { + "epoch": 0.6469333333333334, + "grad_norm": 0.5028372157738586, + "learning_rate": 5.860144885064751e-05, + "loss": 0.7702, + "step": 1213 + }, + { + "epoch": 0.6474666666666666, + "grad_norm": 0.5881035368653048, + "learning_rate": 5.8444209317510514e-05, + "loss": 0.8102, + "step": 1214 + }, + { + "epoch": 0.648, + "grad_norm": 0.4644208673093524, + "learning_rate": 5.828709387627218e-05, + "loss": 0.7863, + "step": 1215 + }, + { + "epoch": 0.6485333333333333, + "grad_norm": 0.9184436029738751, + "learning_rate": 5.813010299610313e-05, + "loss": 0.7369, + "step": 1216 + }, + { + "epoch": 0.6490666666666667, + "grad_norm": 0.5238806668810477, + "learning_rate": 5.797323714580192e-05, + "loss": 0.8379, + "step": 1217 + }, + { + "epoch": 0.6496, + "grad_norm": 0.573507459169721, + "learning_rate": 5.781649679379378e-05, + "loss": 0.7635, + "step": 1218 + }, + { + "epoch": 0.6501333333333333, + "grad_norm": 0.43712869771798696, + "learning_rate": 5.765988240812921e-05, + "loss": 0.6759, + "step": 1219 + }, + { + "epoch": 0.6506666666666666, + "grad_norm": 0.51492141619251, + "learning_rate": 5.750339445648252e-05, + "loss": 0.7879, + "step": 1220 + }, + { + "epoch": 0.6512, + "grad_norm": 0.5589420565939197, + "learning_rate": 5.73470334061505e-05, + "loss": 0.7274, + "step": 1221 + }, + { + "epoch": 0.6517333333333334, + "grad_norm": 0.6130439212218944, + "learning_rate": 5.7190799724050924e-05, + "loss": 0.8891, + "step": 1222 + }, + { + "epoch": 0.6522666666666667, + "grad_norm": 0.4423169657489094, + "learning_rate": 5.7034693876721376e-05, + "loss": 0.7818, + "step": 1223 + }, + { + "epoch": 0.6528, + "grad_norm": 0.44932253972913216, + "learning_rate": 5.687871633031754e-05, + "loss": 0.7049, + "step": 1224 + }, + { + "epoch": 0.6533333333333333, + "grad_norm": 0.427777477401564, + "learning_rate": 5.6722867550612116e-05, + "loss": 0.7368, + "step": 1225 + }, + { + "epoch": 0.6538666666666667, + "grad_norm": 0.4174624721940442, + "learning_rate": 5.6567148002993164e-05, + "loss": 0.7708, + "step": 1226 + }, + { + "epoch": 0.6544, + "grad_norm": 0.5562260719661347, + "learning_rate": 5.6411558152462894e-05, + "loss": 0.7334, + "step": 1227 + }, + { + "epoch": 0.6549333333333334, + "grad_norm": 0.632717622904955, + "learning_rate": 5.625609846363622e-05, + "loss": 0.7849, + "step": 1228 + }, + { + "epoch": 0.6554666666666666, + "grad_norm": 0.4359625588912055, + "learning_rate": 5.6100769400739383e-05, + "loss": 0.6654, + "step": 1229 + }, + { + "epoch": 0.656, + "grad_norm": 0.41891582421355134, + "learning_rate": 5.5945571427608526e-05, + "loss": 0.7579, + "step": 1230 + }, + { + "epoch": 0.6565333333333333, + "grad_norm": 0.42159163914455067, + "learning_rate": 5.579050500768836e-05, + "loss": 0.6646, + "step": 1231 + }, + { + "epoch": 0.6570666666666667, + "grad_norm": 0.5811895061510105, + "learning_rate": 5.5635570604030705e-05, + "loss": 0.7739, + "step": 1232 + }, + { + "epoch": 0.6576, + "grad_norm": 0.582367703020442, + "learning_rate": 5.54807686792933e-05, + "loss": 0.8764, + "step": 1233 + }, + { + "epoch": 0.6581333333333333, + "grad_norm": 0.4975177901929358, + "learning_rate": 5.53260996957381e-05, + "loss": 0.7879, + "step": 1234 + }, + { + "epoch": 0.6586666666666666, + "grad_norm": 0.46707907109971164, + "learning_rate": 5.5171564115230254e-05, + "loss": 0.7714, + "step": 1235 + }, + { + "epoch": 0.6592, + "grad_norm": 0.40234271285614664, + "learning_rate": 5.501716239923642e-05, + "loss": 0.6424, + "step": 1236 + }, + { + "epoch": 0.6597333333333333, + "grad_norm": 0.5354746693833636, + "learning_rate": 5.486289500882355e-05, + "loss": 0.7484, + "step": 1237 + }, + { + "epoch": 0.6602666666666667, + "grad_norm": 0.5277899940401004, + "learning_rate": 5.47087624046575e-05, + "loss": 0.7593, + "step": 1238 + }, + { + "epoch": 0.6608, + "grad_norm": 0.8114151816548788, + "learning_rate": 5.4554765047001613e-05, + "loss": 0.7835, + "step": 1239 + }, + { + "epoch": 0.6613333333333333, + "grad_norm": 0.5238447886409662, + "learning_rate": 5.4400903395715366e-05, + "loss": 0.7476, + "step": 1240 + }, + { + "epoch": 0.6618666666666667, + "grad_norm": 0.4034433634699191, + "learning_rate": 5.424717791025302e-05, + "loss": 0.6916, + "step": 1241 + }, + { + "epoch": 0.6624, + "grad_norm": 0.49701716112196237, + "learning_rate": 5.4093589049662175e-05, + "loss": 0.7838, + "step": 1242 + }, + { + "epoch": 0.6629333333333334, + "grad_norm": 0.62062323297395, + "learning_rate": 5.394013727258254e-05, + "loss": 0.8154, + "step": 1243 + }, + { + "epoch": 0.6634666666666666, + "grad_norm": 0.5068903167924839, + "learning_rate": 5.378682303724435e-05, + "loss": 0.7607, + "step": 1244 + }, + { + "epoch": 0.664, + "grad_norm": 0.460139403860215, + "learning_rate": 5.363364680146725e-05, + "loss": 0.7481, + "step": 1245 + }, + { + "epoch": 0.6645333333333333, + "grad_norm": 0.46487020590647377, + "learning_rate": 5.348060902265871e-05, + "loss": 0.7388, + "step": 1246 + }, + { + "epoch": 0.6650666666666667, + "grad_norm": 0.48953201444434585, + "learning_rate": 5.332771015781275e-05, + "loss": 0.7033, + "step": 1247 + }, + { + "epoch": 0.6656, + "grad_norm": 0.5136215721502551, + "learning_rate": 5.31749506635086e-05, + "loss": 0.7526, + "step": 1248 + }, + { + "epoch": 0.6661333333333334, + "grad_norm": 0.47109621415579506, + "learning_rate": 5.302233099590928e-05, + "loss": 0.741, + "step": 1249 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.5502634222197873, + "learning_rate": 5.286985161076029e-05, + "loss": 0.8411, + "step": 1250 + }, + { + "epoch": 0.6672, + "grad_norm": 0.5134976989258431, + "learning_rate": 5.271751296338823e-05, + "loss": 0.7553, + "step": 1251 + }, + { + "epoch": 0.6677333333333333, + "grad_norm": 0.4898318259638587, + "learning_rate": 5.2565315508699376e-05, + "loss": 0.6702, + "step": 1252 + }, + { + "epoch": 0.6682666666666667, + "grad_norm": 0.5227245331435747, + "learning_rate": 5.2413259701178505e-05, + "loss": 0.6482, + "step": 1253 + }, + { + "epoch": 0.6688, + "grad_norm": 0.5128301572867259, + "learning_rate": 5.226134599488728e-05, + "loss": 0.7159, + "step": 1254 + }, + { + "epoch": 0.6693333333333333, + "grad_norm": 0.576890268074048, + "learning_rate": 5.210957484346314e-05, + "loss": 0.7952, + "step": 1255 + }, + { + "epoch": 0.6698666666666667, + "grad_norm": 0.432253950885244, + "learning_rate": 5.195794670011776e-05, + "loss": 0.7143, + "step": 1256 + }, + { + "epoch": 0.6704, + "grad_norm": 0.41431838996966186, + "learning_rate": 5.180646201763577e-05, + "loss": 0.6468, + "step": 1257 + }, + { + "epoch": 0.6709333333333334, + "grad_norm": 0.6219635281970626, + "learning_rate": 5.165512124837344e-05, + "loss": 0.7979, + "step": 1258 + }, + { + "epoch": 0.6714666666666667, + "grad_norm": 0.47001155233193037, + "learning_rate": 5.150392484425728e-05, + "loss": 0.7523, + "step": 1259 + }, + { + "epoch": 0.672, + "grad_norm": 0.4731698920729862, + "learning_rate": 5.135287325678271e-05, + "loss": 0.6733, + "step": 1260 + }, + { + "epoch": 0.6725333333333333, + "grad_norm": 0.48348627346619377, + "learning_rate": 5.120196693701267e-05, + "loss": 0.6778, + "step": 1261 + }, + { + "epoch": 0.6730666666666667, + "grad_norm": 0.5373809731810197, + "learning_rate": 5.105120633557634e-05, + "loss": 0.7862, + "step": 1262 + }, + { + "epoch": 0.6736, + "grad_norm": 0.4302728310666555, + "learning_rate": 5.090059190266779e-05, + "loss": 0.7408, + "step": 1263 + }, + { + "epoch": 0.6741333333333334, + "grad_norm": 0.4502300529686932, + "learning_rate": 5.075012408804458e-05, + "loss": 0.674, + "step": 1264 + }, + { + "epoch": 0.6746666666666666, + "grad_norm": 0.4749352377536877, + "learning_rate": 5.059980334102637e-05, + "loss": 0.6564, + "step": 1265 + }, + { + "epoch": 0.6752, + "grad_norm": 0.4961234868019811, + "learning_rate": 5.0449630110493836e-05, + "loss": 0.7162, + "step": 1266 + }, + { + "epoch": 0.6757333333333333, + "grad_norm": 0.5992313824361511, + "learning_rate": 5.0299604844886985e-05, + "loss": 0.8486, + "step": 1267 + }, + { + "epoch": 0.6762666666666667, + "grad_norm": 0.5125638267756718, + "learning_rate": 5.014972799220403e-05, + "loss": 0.6822, + "step": 1268 + }, + { + "epoch": 0.6768, + "grad_norm": 0.6175810913004475, + "learning_rate": 5.000000000000002e-05, + "loss": 0.8007, + "step": 1269 + }, + { + "epoch": 0.6773333333333333, + "grad_norm": 0.4591985784213636, + "learning_rate": 4.985042131538545e-05, + "loss": 0.6769, + "step": 1270 + }, + { + "epoch": 0.6778666666666666, + "grad_norm": 0.39331101563430665, + "learning_rate": 4.9700992385024934e-05, + "loss": 0.7082, + "step": 1271 + }, + { + "epoch": 0.6784, + "grad_norm": 0.5101318179916204, + "learning_rate": 4.955171365513603e-05, + "loss": 0.7578, + "step": 1272 + }, + { + "epoch": 0.6789333333333334, + "grad_norm": 0.7519008983138528, + "learning_rate": 4.940258557148765e-05, + "loss": 0.7503, + "step": 1273 + }, + { + "epoch": 0.6794666666666667, + "grad_norm": 0.5437703016808688, + "learning_rate": 4.9253608579398855e-05, + "loss": 0.675, + "step": 1274 + }, + { + "epoch": 0.68, + "grad_norm": 0.5086423764723533, + "learning_rate": 4.9104783123737566e-05, + "loss": 0.6744, + "step": 1275 + }, + { + "epoch": 0.6805333333333333, + "grad_norm": 0.43966489761898947, + "learning_rate": 4.895610964891923e-05, + "loss": 0.6865, + "step": 1276 + }, + { + "epoch": 0.6810666666666667, + "grad_norm": 0.6480433410332094, + "learning_rate": 4.880758859890536e-05, + "loss": 0.8626, + "step": 1277 + }, + { + "epoch": 0.6816, + "grad_norm": 0.5436825086049654, + "learning_rate": 4.865922041720239e-05, + "loss": 0.7899, + "step": 1278 + }, + { + "epoch": 0.6821333333333334, + "grad_norm": 0.44657371639546795, + "learning_rate": 4.851100554686021e-05, + "loss": 0.7399, + "step": 1279 + }, + { + "epoch": 0.6826666666666666, + "grad_norm": 0.5676457737282187, + "learning_rate": 4.836294443047088e-05, + "loss": 0.842, + "step": 1280 + }, + { + "epoch": 0.6832, + "grad_norm": 0.41702442978796533, + "learning_rate": 4.821503751016746e-05, + "loss": 0.67, + "step": 1281 + }, + { + "epoch": 0.6837333333333333, + "grad_norm": 0.4708337288708888, + "learning_rate": 4.8067285227622404e-05, + "loss": 0.7049, + "step": 1282 + }, + { + "epoch": 0.6842666666666667, + "grad_norm": 0.5326747190581289, + "learning_rate": 4.791968802404648e-05, + "loss": 0.7529, + "step": 1283 + }, + { + "epoch": 0.6848, + "grad_norm": 0.478308999115387, + "learning_rate": 4.777224634018732e-05, + "loss": 0.6871, + "step": 1284 + }, + { + "epoch": 0.6853333333333333, + "grad_norm": 0.39650223263914064, + "learning_rate": 4.762496061632814e-05, + "loss": 0.6837, + "step": 1285 + }, + { + "epoch": 0.6858666666666666, + "grad_norm": 0.5370217535675965, + "learning_rate": 4.747783129228656e-05, + "loss": 0.7394, + "step": 1286 + }, + { + "epoch": 0.6864, + "grad_norm": 0.5541533985113412, + "learning_rate": 4.733085880741301e-05, + "loss": 0.8083, + "step": 1287 + }, + { + "epoch": 0.6869333333333333, + "grad_norm": 0.612301493210298, + "learning_rate": 4.718404360058966e-05, + "loss": 0.841, + "step": 1288 + }, + { + "epoch": 0.6874666666666667, + "grad_norm": 0.5565547179646171, + "learning_rate": 4.7037386110228985e-05, + "loss": 0.8259, + "step": 1289 + }, + { + "epoch": 0.688, + "grad_norm": 0.560703868530753, + "learning_rate": 4.689088677427249e-05, + "loss": 0.8143, + "step": 1290 + }, + { + "epoch": 0.6885333333333333, + "grad_norm": 0.5452429098540207, + "learning_rate": 4.6744546030189486e-05, + "loss": 0.657, + "step": 1291 + }, + { + "epoch": 0.6890666666666667, + "grad_norm": 0.7076748620068148, + "learning_rate": 4.659836431497563e-05, + "loss": 0.8688, + "step": 1292 + }, + { + "epoch": 0.6896, + "grad_norm": 0.4723531210225006, + "learning_rate": 4.645234206515171e-05, + "loss": 0.7277, + "step": 1293 + }, + { + "epoch": 0.6901333333333334, + "grad_norm": 0.38012171936952244, + "learning_rate": 4.630647971676232e-05, + "loss": 0.6425, + "step": 1294 + }, + { + "epoch": 0.6906666666666667, + "grad_norm": 0.603421292079894, + "learning_rate": 4.6160777705374524e-05, + "loss": 0.8161, + "step": 1295 + }, + { + "epoch": 0.6912, + "grad_norm": 0.5015500231918112, + "learning_rate": 4.6015236466076747e-05, + "loss": 0.7039, + "step": 1296 + }, + { + "epoch": 0.6917333333333333, + "grad_norm": 0.4976613648134939, + "learning_rate": 4.586985643347717e-05, + "loss": 0.6392, + "step": 1297 + }, + { + "epoch": 0.6922666666666667, + "grad_norm": 0.42231826227758135, + "learning_rate": 4.572463804170263e-05, + "loss": 0.597, + "step": 1298 + }, + { + "epoch": 0.6928, + "grad_norm": 0.5261772097736712, + "learning_rate": 4.5579581724397255e-05, + "loss": 0.6993, + "step": 1299 + }, + { + "epoch": 0.6933333333333334, + "grad_norm": 0.5720749507167654, + "learning_rate": 4.543468791472131e-05, + "loss": 0.7378, + "step": 1300 + }, + { + "epoch": 0.6938666666666666, + "grad_norm": 0.4854951914620495, + "learning_rate": 4.5289957045349653e-05, + "loss": 0.7636, + "step": 1301 + }, + { + "epoch": 0.6944, + "grad_norm": 0.6185097610107433, + "learning_rate": 4.514538954847064e-05, + "loss": 0.8268, + "step": 1302 + }, + { + "epoch": 0.6949333333333333, + "grad_norm": 0.4211057093206592, + "learning_rate": 4.5000985855784746e-05, + "loss": 0.6306, + "step": 1303 + }, + { + "epoch": 0.6954666666666667, + "grad_norm": 0.438023596550362, + "learning_rate": 4.485674639850333e-05, + "loss": 0.7062, + "step": 1304 + }, + { + "epoch": 0.696, + "grad_norm": 0.5537641063198323, + "learning_rate": 4.471267160734731e-05, + "loss": 0.7415, + "step": 1305 + }, + { + "epoch": 0.6965333333333333, + "grad_norm": 0.49633656204916216, + "learning_rate": 4.456876191254582e-05, + "loss": 0.7946, + "step": 1306 + }, + { + "epoch": 0.6970666666666666, + "grad_norm": 0.5343680050628211, + "learning_rate": 4.442501774383515e-05, + "loss": 0.774, + "step": 1307 + }, + { + "epoch": 0.6976, + "grad_norm": 0.6698593891009573, + "learning_rate": 4.428143953045717e-05, + "loss": 0.7576, + "step": 1308 + }, + { + "epoch": 0.6981333333333334, + "grad_norm": 0.40126281760654753, + "learning_rate": 4.413802770115816e-05, + "loss": 0.6637, + "step": 1309 + }, + { + "epoch": 0.6986666666666667, + "grad_norm": 0.4262446862725171, + "learning_rate": 4.399478268418771e-05, + "loss": 0.72, + "step": 1310 + }, + { + "epoch": 0.6992, + "grad_norm": 0.4350305002054411, + "learning_rate": 4.385170490729712e-05, + "loss": 0.6839, + "step": 1311 + }, + { + "epoch": 0.6997333333333333, + "grad_norm": 0.4811962513455146, + "learning_rate": 4.3708794797738375e-05, + "loss": 0.7616, + "step": 1312 + }, + { + "epoch": 0.7002666666666667, + "grad_norm": 0.3827499220134474, + "learning_rate": 4.3566052782262735e-05, + "loss": 0.6725, + "step": 1313 + }, + { + "epoch": 0.7008, + "grad_norm": 0.49317883787553995, + "learning_rate": 4.342347928711953e-05, + "loss": 0.7569, + "step": 1314 + }, + { + "epoch": 0.7013333333333334, + "grad_norm": 0.434883042566961, + "learning_rate": 4.328107473805487e-05, + "loss": 0.6309, + "step": 1315 + }, + { + "epoch": 0.7018666666666666, + "grad_norm": 0.5796238083418784, + "learning_rate": 4.3138839560310303e-05, + "loss": 0.8339, + "step": 1316 + }, + { + "epoch": 0.7024, + "grad_norm": 0.45368962247559386, + "learning_rate": 4.2996774178621736e-05, + "loss": 0.6974, + "step": 1317 + }, + { + "epoch": 0.7029333333333333, + "grad_norm": 0.4715066554294283, + "learning_rate": 4.2854879017217894e-05, + "loss": 0.6997, + "step": 1318 + }, + { + "epoch": 0.7034666666666667, + "grad_norm": 0.5473163389609469, + "learning_rate": 4.271315449981934e-05, + "loss": 0.7799, + "step": 1319 + }, + { + "epoch": 0.704, + "grad_norm": 0.50186115330255, + "learning_rate": 4.257160104963696e-05, + "loss": 0.7963, + "step": 1320 + }, + { + "epoch": 0.7045333333333333, + "grad_norm": 0.4587261665217103, + "learning_rate": 4.2430219089370823e-05, + "loss": 0.7275, + "step": 1321 + }, + { + "epoch": 0.7050666666666666, + "grad_norm": 0.4527605528114259, + "learning_rate": 4.228900904120895e-05, + "loss": 0.7458, + "step": 1322 + }, + { + "epoch": 0.7056, + "grad_norm": 0.5838291596429387, + "learning_rate": 4.2147971326825966e-05, + "loss": 0.8659, + "step": 1323 + }, + { + "epoch": 0.7061333333333333, + "grad_norm": 0.512717733298456, + "learning_rate": 4.200710636738189e-05, + "loss": 0.6596, + "step": 1324 + }, + { + "epoch": 0.7066666666666667, + "grad_norm": 0.5987342949739224, + "learning_rate": 4.1866414583520877e-05, + "loss": 0.6745, + "step": 1325 + }, + { + "epoch": 0.7072, + "grad_norm": 0.5129825298493389, + "learning_rate": 4.172589639536991e-05, + "loss": 0.7741, + "step": 1326 + }, + { + "epoch": 0.7077333333333333, + "grad_norm": 0.46113293033011576, + "learning_rate": 4.158555222253771e-05, + "loss": 0.7784, + "step": 1327 + }, + { + "epoch": 0.7082666666666667, + "grad_norm": 0.48071864996884056, + "learning_rate": 4.14453824841132e-05, + "loss": 0.6686, + "step": 1328 + }, + { + "epoch": 0.7088, + "grad_norm": 0.4857320789554157, + "learning_rate": 4.130538759866457e-05, + "loss": 0.7695, + "step": 1329 + }, + { + "epoch": 0.7093333333333334, + "grad_norm": 0.5172260493693579, + "learning_rate": 4.1165567984237764e-05, + "loss": 0.7837, + "step": 1330 + }, + { + "epoch": 0.7098666666666666, + "grad_norm": 0.4839080726160306, + "learning_rate": 4.102592405835536e-05, + "loss": 0.7814, + "step": 1331 + }, + { + "epoch": 0.7104, + "grad_norm": 0.5369781118832275, + "learning_rate": 4.088645623801534e-05, + "loss": 0.7994, + "step": 1332 + }, + { + "epoch": 0.7109333333333333, + "grad_norm": 0.40285291682654806, + "learning_rate": 4.074716493968975e-05, + "loss": 0.6685, + "step": 1333 + }, + { + "epoch": 0.7114666666666667, + "grad_norm": 0.43913336406413467, + "learning_rate": 4.060805057932359e-05, + "loss": 0.6789, + "step": 1334 + }, + { + "epoch": 0.712, + "grad_norm": 0.36850949745938216, + "learning_rate": 4.046911357233343e-05, + "loss": 0.5985, + "step": 1335 + }, + { + "epoch": 0.7125333333333334, + "grad_norm": 0.45256461892901884, + "learning_rate": 4.0330354333606234e-05, + "loss": 0.6901, + "step": 1336 + }, + { + "epoch": 0.7130666666666666, + "grad_norm": 0.502801265442768, + "learning_rate": 4.019177327749822e-05, + "loss": 0.7156, + "step": 1337 + }, + { + "epoch": 0.7136, + "grad_norm": 0.5754365140036534, + "learning_rate": 4.00533708178334e-05, + "loss": 0.8076, + "step": 1338 + }, + { + "epoch": 0.7141333333333333, + "grad_norm": 0.5397331045353989, + "learning_rate": 3.991514736790258e-05, + "loss": 0.7374, + "step": 1339 + }, + { + "epoch": 0.7146666666666667, + "grad_norm": 0.5325726924140836, + "learning_rate": 3.977710334046193e-05, + "loss": 0.7451, + "step": 1340 + }, + { + "epoch": 0.7152, + "grad_norm": 0.8225720471674686, + "learning_rate": 3.963923914773187e-05, + "loss": 0.9019, + "step": 1341 + }, + { + "epoch": 0.7157333333333333, + "grad_norm": 0.3863498805878031, + "learning_rate": 3.950155520139581e-05, + "loss": 0.6633, + "step": 1342 + }, + { + "epoch": 0.7162666666666667, + "grad_norm": 0.47018195009456326, + "learning_rate": 3.936405191259891e-05, + "loss": 0.7058, + "step": 1343 + }, + { + "epoch": 0.7168, + "grad_norm": 0.4848003787682862, + "learning_rate": 3.922672969194686e-05, + "loss": 0.8032, + "step": 1344 + }, + { + "epoch": 0.7173333333333334, + "grad_norm": 0.5708191505695828, + "learning_rate": 3.9089588949504655e-05, + "loss": 0.6495, + "step": 1345 + }, + { + "epoch": 0.7178666666666667, + "grad_norm": 0.4856249306646381, + "learning_rate": 3.895263009479534e-05, + "loss": 0.7603, + "step": 1346 + }, + { + "epoch": 0.7184, + "grad_norm": 0.5102703988217077, + "learning_rate": 3.8815853536798904e-05, + "loss": 0.7876, + "step": 1347 + }, + { + "epoch": 0.7189333333333333, + "grad_norm": 0.47570898672635514, + "learning_rate": 3.867925968395085e-05, + "loss": 0.717, + "step": 1348 + }, + { + "epoch": 0.7194666666666667, + "grad_norm": 0.43657326247985034, + "learning_rate": 3.854284894414122e-05, + "loss": 0.6536, + "step": 1349 + }, + { + "epoch": 0.72, + "grad_norm": 0.540699085510768, + "learning_rate": 3.840662172471315e-05, + "loss": 0.8676, + "step": 1350 + }, + { + "epoch": 0.7205333333333334, + "grad_norm": 0.45587842900608117, + "learning_rate": 3.82705784324618e-05, + "loss": 0.6982, + "step": 1351 + }, + { + "epoch": 0.7210666666666666, + "grad_norm": 0.5504378504505844, + "learning_rate": 3.8134719473633094e-05, + "loss": 0.7597, + "step": 1352 + }, + { + "epoch": 0.7216, + "grad_norm": 0.5536472829823509, + "learning_rate": 3.79990452539225e-05, + "loss": 0.6834, + "step": 1353 + }, + { + "epoch": 0.7221333333333333, + "grad_norm": 0.5188680535148331, + "learning_rate": 3.786355617847385e-05, + "loss": 0.6965, + "step": 1354 + }, + { + "epoch": 0.7226666666666667, + "grad_norm": 0.4464326809238666, + "learning_rate": 3.772825265187802e-05, + "loss": 0.7012, + "step": 1355 + }, + { + "epoch": 0.7232, + "grad_norm": 0.4429020145145274, + "learning_rate": 3.759313507817196e-05, + "loss": 0.715, + "step": 1356 + }, + { + "epoch": 0.7237333333333333, + "grad_norm": 0.49953986575210324, + "learning_rate": 3.7458203860837234e-05, + "loss": 0.719, + "step": 1357 + }, + { + "epoch": 0.7242666666666666, + "grad_norm": 0.5489530610657923, + "learning_rate": 3.732345940279893e-05, + "loss": 0.8231, + "step": 1358 + }, + { + "epoch": 0.7248, + "grad_norm": 0.5161147727463107, + "learning_rate": 3.7188902106424416e-05, + "loss": 0.7433, + "step": 1359 + }, + { + "epoch": 0.7253333333333334, + "grad_norm": 0.5372609302833079, + "learning_rate": 3.705453237352227e-05, + "loss": 0.7723, + "step": 1360 + }, + { + "epoch": 0.7258666666666667, + "grad_norm": 0.6341247855319071, + "learning_rate": 3.692035060534088e-05, + "loss": 0.7647, + "step": 1361 + }, + { + "epoch": 0.7264, + "grad_norm": 0.4109938086219669, + "learning_rate": 3.678635720256737e-05, + "loss": 0.6744, + "step": 1362 + }, + { + "epoch": 0.7269333333333333, + "grad_norm": 0.505211435260148, + "learning_rate": 3.665255256532638e-05, + "loss": 0.6792, + "step": 1363 + }, + { + "epoch": 0.7274666666666667, + "grad_norm": 0.5648569797645943, + "learning_rate": 3.651893709317887e-05, + "loss": 0.7383, + "step": 1364 + }, + { + "epoch": 0.728, + "grad_norm": 0.4893528457125704, + "learning_rate": 3.638551118512089e-05, + "loss": 0.6933, + "step": 1365 + }, + { + "epoch": 0.7285333333333334, + "grad_norm": 0.4045184960271441, + "learning_rate": 3.625227523958252e-05, + "loss": 0.6603, + "step": 1366 + }, + { + "epoch": 0.7290666666666666, + "grad_norm": 0.5861357493096865, + "learning_rate": 3.611922965442648e-05, + "loss": 0.822, + "step": 1367 + }, + { + "epoch": 0.7296, + "grad_norm": 0.38795550705368015, + "learning_rate": 3.5986374826947066e-05, + "loss": 0.645, + "step": 1368 + }, + { + "epoch": 0.7301333333333333, + "grad_norm": 0.522397763584867, + "learning_rate": 3.5853711153868965e-05, + "loss": 0.7986, + "step": 1369 + }, + { + "epoch": 0.7306666666666667, + "grad_norm": 0.45390817918484644, + "learning_rate": 3.5721239031346066e-05, + "loss": 0.7065, + "step": 1370 + }, + { + "epoch": 0.7312, + "grad_norm": 0.5231333092724972, + "learning_rate": 3.558895885496023e-05, + "loss": 0.8067, + "step": 1371 + }, + { + "epoch": 0.7317333333333333, + "grad_norm": 0.43445809413391945, + "learning_rate": 3.545687101972013e-05, + "loss": 0.744, + "step": 1372 + }, + { + "epoch": 0.7322666666666666, + "grad_norm": 0.3844381213204554, + "learning_rate": 3.53249759200601e-05, + "loss": 0.6569, + "step": 1373 + }, + { + "epoch": 0.7328, + "grad_norm": 0.4677279618543648, + "learning_rate": 3.519327394983888e-05, + "loss": 0.7083, + "step": 1374 + }, + { + "epoch": 0.7333333333333333, + "grad_norm": 0.4924843715111023, + "learning_rate": 3.506176550233863e-05, + "loss": 0.7512, + "step": 1375 + }, + { + "epoch": 0.7338666666666667, + "grad_norm": 0.4877946483819527, + "learning_rate": 3.4930450970263485e-05, + "loss": 0.6858, + "step": 1376 + }, + { + "epoch": 0.7344, + "grad_norm": 0.4464224687908001, + "learning_rate": 3.479933074573858e-05, + "loss": 0.6764, + "step": 1377 + }, + { + "epoch": 0.7349333333333333, + "grad_norm": 0.563939766074518, + "learning_rate": 3.46684052203088e-05, + "loss": 0.8946, + "step": 1378 + }, + { + "epoch": 0.7354666666666667, + "grad_norm": 0.4595855410253218, + "learning_rate": 3.4537674784937614e-05, + "loss": 0.7412, + "step": 1379 + }, + { + "epoch": 0.736, + "grad_norm": 0.5629112802578629, + "learning_rate": 3.440713983000601e-05, + "loss": 0.7561, + "step": 1380 + }, + { + "epoch": 0.7365333333333334, + "grad_norm": 0.4232588748038259, + "learning_rate": 3.427680074531113e-05, + "loss": 0.6763, + "step": 1381 + }, + { + "epoch": 0.7370666666666666, + "grad_norm": 0.5377103251563394, + "learning_rate": 3.4146657920065285e-05, + "loss": 0.7588, + "step": 1382 + }, + { + "epoch": 0.7376, + "grad_norm": 0.49726972943186487, + "learning_rate": 3.401671174289469e-05, + "loss": 0.6818, + "step": 1383 + }, + { + "epoch": 0.7381333333333333, + "grad_norm": 0.39578621570415834, + "learning_rate": 3.388696260183832e-05, + "loss": 0.6593, + "step": 1384 + }, + { + "epoch": 0.7386666666666667, + "grad_norm": 0.6296991797938082, + "learning_rate": 3.3757410884346894e-05, + "loss": 0.8284, + "step": 1385 + }, + { + "epoch": 0.7392, + "grad_norm": 0.4689559507210876, + "learning_rate": 3.362805697728145e-05, + "loss": 0.6648, + "step": 1386 + }, + { + "epoch": 0.7397333333333334, + "grad_norm": 0.6139376355920769, + "learning_rate": 3.3498901266912396e-05, + "loss": 0.6431, + "step": 1387 + }, + { + "epoch": 0.7402666666666666, + "grad_norm": 0.502667099627591, + "learning_rate": 3.336994413891828e-05, + "loss": 0.7902, + "step": 1388 + }, + { + "epoch": 0.7408, + "grad_norm": 0.4718534691657566, + "learning_rate": 3.324118597838464e-05, + "loss": 0.6981, + "step": 1389 + }, + { + "epoch": 0.7413333333333333, + "grad_norm": 0.42320038156633955, + "learning_rate": 3.3112627169802946e-05, + "loss": 0.7106, + "step": 1390 + }, + { + "epoch": 0.7418666666666667, + "grad_norm": 0.527489408575544, + "learning_rate": 3.298426809706928e-05, + "loss": 0.7137, + "step": 1391 + }, + { + "epoch": 0.7424, + "grad_norm": 0.642591509139478, + "learning_rate": 3.285610914348332e-05, + "loss": 0.8138, + "step": 1392 + }, + { + "epoch": 0.7429333333333333, + "grad_norm": 0.5022455342624368, + "learning_rate": 3.2728150691747115e-05, + "loss": 0.7695, + "step": 1393 + }, + { + "epoch": 0.7434666666666667, + "grad_norm": 0.528433410560178, + "learning_rate": 3.2600393123964113e-05, + "loss": 0.7774, + "step": 1394 + }, + { + "epoch": 0.744, + "grad_norm": 0.40181853778869725, + "learning_rate": 3.2472836821637744e-05, + "loss": 0.6488, + "step": 1395 + }, + { + "epoch": 0.7445333333333334, + "grad_norm": 0.5703147394371794, + "learning_rate": 3.234548216567049e-05, + "loss": 0.8427, + "step": 1396 + }, + { + "epoch": 0.7450666666666667, + "grad_norm": 0.6066274810399689, + "learning_rate": 3.2218329536362704e-05, + "loss": 0.7415, + "step": 1397 + }, + { + "epoch": 0.7456, + "grad_norm": 0.5318681701221755, + "learning_rate": 3.209137931341143e-05, + "loss": 0.7071, + "step": 1398 + }, + { + "epoch": 0.7461333333333333, + "grad_norm": 0.43522551039961643, + "learning_rate": 3.196463187590929e-05, + "loss": 0.6577, + "step": 1399 + }, + { + "epoch": 0.7466666666666667, + "grad_norm": 0.48382212436414673, + "learning_rate": 3.1838087602343344e-05, + "loss": 0.7464, + "step": 1400 + }, + { + "epoch": 0.7472, + "grad_norm": 0.47912591586128217, + "learning_rate": 3.1711746870594086e-05, + "loss": 0.7578, + "step": 1401 + }, + { + "epoch": 0.7477333333333334, + "grad_norm": 0.5554221788061645, + "learning_rate": 3.158561005793402e-05, + "loss": 0.7379, + "step": 1402 + }, + { + "epoch": 0.7482666666666666, + "grad_norm": 0.46179248308158116, + "learning_rate": 3.145967754102691e-05, + "loss": 0.6997, + "step": 1403 + }, + { + "epoch": 0.7488, + "grad_norm": 0.5520896260902555, + "learning_rate": 3.1333949695926324e-05, + "loss": 0.6774, + "step": 1404 + }, + { + "epoch": 0.7493333333333333, + "grad_norm": 0.716871579835956, + "learning_rate": 3.120842689807468e-05, + "loss": 0.7016, + "step": 1405 + }, + { + "epoch": 0.7498666666666667, + "grad_norm": 0.4688654344543536, + "learning_rate": 3.108310952230212e-05, + "loss": 0.6842, + "step": 1406 + }, + { + "epoch": 0.7504, + "grad_norm": 0.45952842117017356, + "learning_rate": 3.0957997942825336e-05, + "loss": 0.6954, + "step": 1407 + }, + { + "epoch": 0.7509333333333333, + "grad_norm": 0.6037020858318494, + "learning_rate": 3.083309253324651e-05, + "loss": 0.9216, + "step": 1408 + }, + { + "epoch": 0.7514666666666666, + "grad_norm": 0.5112284522906911, + "learning_rate": 3.070839366655215e-05, + "loss": 0.7414, + "step": 1409 + }, + { + "epoch": 0.752, + "grad_norm": 0.6778229380656393, + "learning_rate": 3.058390171511196e-05, + "loss": 0.96, + "step": 1410 + }, + { + "epoch": 0.7525333333333334, + "grad_norm": 0.5318024704287667, + "learning_rate": 3.0459617050677868e-05, + "loss": 0.8142, + "step": 1411 + }, + { + "epoch": 0.7530666666666667, + "grad_norm": 0.5504999061969572, + "learning_rate": 3.0335540044382694e-05, + "loss": 0.6299, + "step": 1412 + }, + { + "epoch": 0.7536, + "grad_norm": 0.7168605084150267, + "learning_rate": 3.021167106673928e-05, + "loss": 0.8434, + "step": 1413 + }, + { + "epoch": 0.7541333333333333, + "grad_norm": 0.5326176287349795, + "learning_rate": 3.008801048763914e-05, + "loss": 0.7535, + "step": 1414 + }, + { + "epoch": 0.7546666666666667, + "grad_norm": 0.5198798992229317, + "learning_rate": 2.996455867635155e-05, + "loss": 0.7693, + "step": 1415 + }, + { + "epoch": 0.7552, + "grad_norm": 0.5062151710695797, + "learning_rate": 2.9841316001522347e-05, + "loss": 0.8153, + "step": 1416 + }, + { + "epoch": 0.7557333333333334, + "grad_norm": 0.5339691423415097, + "learning_rate": 2.9718282831172883e-05, + "loss": 0.8945, + "step": 1417 + }, + { + "epoch": 0.7562666666666666, + "grad_norm": 0.7436470699997376, + "learning_rate": 2.9595459532698854e-05, + "loss": 0.8291, + "step": 1418 + }, + { + "epoch": 0.7568, + "grad_norm": 0.5341457861939325, + "learning_rate": 2.9472846472869298e-05, + "loss": 0.7916, + "step": 1419 + }, + { + "epoch": 0.7573333333333333, + "grad_norm": 0.49275760119798284, + "learning_rate": 2.9350444017825385e-05, + "loss": 0.7193, + "step": 1420 + }, + { + "epoch": 0.7578666666666667, + "grad_norm": 0.5435704259557418, + "learning_rate": 2.922825253307947e-05, + "loss": 0.7764, + "step": 1421 + }, + { + "epoch": 0.7584, + "grad_norm": 0.5818631696995898, + "learning_rate": 2.9106272383513835e-05, + "loss": 0.8297, + "step": 1422 + }, + { + "epoch": 0.7589333333333333, + "grad_norm": 0.383556959342635, + "learning_rate": 2.898450393337977e-05, + "loss": 0.737, + "step": 1423 + }, + { + "epoch": 0.7594666666666666, + "grad_norm": 0.453342843949634, + "learning_rate": 2.8862947546296315e-05, + "loss": 0.7155, + "step": 1424 + }, + { + "epoch": 0.76, + "grad_norm": 0.4352936908218544, + "learning_rate": 2.874160358524931e-05, + "loss": 0.6515, + "step": 1425 + }, + { + "epoch": 0.7605333333333333, + "grad_norm": 0.5836509030705093, + "learning_rate": 2.8620472412590228e-05, + "loss": 0.8344, + "step": 1426 + }, + { + "epoch": 0.7610666666666667, + "grad_norm": 0.49040324968759913, + "learning_rate": 2.8499554390035143e-05, + "loss": 0.6792, + "step": 1427 + }, + { + "epoch": 0.7616, + "grad_norm": 0.4928716736478312, + "learning_rate": 2.8378849878663628e-05, + "loss": 0.74, + "step": 1428 + }, + { + "epoch": 0.7621333333333333, + "grad_norm": 0.41012802144608346, + "learning_rate": 2.8258359238917665e-05, + "loss": 0.578, + "step": 1429 + }, + { + "epoch": 0.7626666666666667, + "grad_norm": 0.5183382299695679, + "learning_rate": 2.8138082830600554e-05, + "loss": 0.8857, + "step": 1430 + }, + { + "epoch": 0.7632, + "grad_norm": 0.5061749397926986, + "learning_rate": 2.8018021012875994e-05, + "loss": 0.7264, + "step": 1431 + }, + { + "epoch": 0.7637333333333334, + "grad_norm": 0.5191869515210203, + "learning_rate": 2.7898174144266732e-05, + "loss": 0.706, + "step": 1432 + }, + { + "epoch": 0.7642666666666666, + "grad_norm": 0.7585326022201629, + "learning_rate": 2.7778542582653744e-05, + "loss": 0.7991, + "step": 1433 + }, + { + "epoch": 0.7648, + "grad_norm": 0.42278581011950195, + "learning_rate": 2.7659126685275027e-05, + "loss": 0.6932, + "step": 1434 + }, + { + "epoch": 0.7653333333333333, + "grad_norm": 0.4679848278643742, + "learning_rate": 2.753992680872457e-05, + "loss": 0.687, + "step": 1435 + }, + { + "epoch": 0.7658666666666667, + "grad_norm": 0.48323632128045796, + "learning_rate": 2.7420943308951284e-05, + "loss": 0.7556, + "step": 1436 + }, + { + "epoch": 0.7664, + "grad_norm": 0.4557423553247412, + "learning_rate": 2.7302176541257986e-05, + "loss": 0.7141, + "step": 1437 + }, + { + "epoch": 0.7669333333333334, + "grad_norm": 0.5294898419459036, + "learning_rate": 2.7183626860300247e-05, + "loss": 0.8618, + "step": 1438 + }, + { + "epoch": 0.7674666666666666, + "grad_norm": 0.5274095672286221, + "learning_rate": 2.7065294620085424e-05, + "loss": 0.7719, + "step": 1439 + }, + { + "epoch": 0.768, + "grad_norm": 0.502821859324942, + "learning_rate": 2.6947180173971508e-05, + "loss": 0.7703, + "step": 1440 + }, + { + "epoch": 0.7685333333333333, + "grad_norm": 0.5882461866882848, + "learning_rate": 2.6829283874666233e-05, + "loss": 0.7777, + "step": 1441 + }, + { + "epoch": 0.7690666666666667, + "grad_norm": 0.5100547868843539, + "learning_rate": 2.6711606074225782e-05, + "loss": 0.7625, + "step": 1442 + }, + { + "epoch": 0.7696, + "grad_norm": 0.6289157409884821, + "learning_rate": 2.659414712405398e-05, + "loss": 0.8166, + "step": 1443 + }, + { + "epoch": 0.7701333333333333, + "grad_norm": 0.5246731496364857, + "learning_rate": 2.647690737490106e-05, + "loss": 0.7315, + "step": 1444 + }, + { + "epoch": 0.7706666666666667, + "grad_norm": 0.5517644110475223, + "learning_rate": 2.6359887176862718e-05, + "loss": 0.7715, + "step": 1445 + }, + { + "epoch": 0.7712, + "grad_norm": 0.5750468069036211, + "learning_rate": 2.6243086879379e-05, + "loss": 0.7753, + "step": 1446 + }, + { + "epoch": 0.7717333333333334, + "grad_norm": 0.571760864960472, + "learning_rate": 2.6126506831233344e-05, + "loss": 0.7922, + "step": 1447 + }, + { + "epoch": 0.7722666666666667, + "grad_norm": 0.3844181462637176, + "learning_rate": 2.6010147380551475e-05, + "loss": 0.7023, + "step": 1448 + }, + { + "epoch": 0.7728, + "grad_norm": 0.43877818166129945, + "learning_rate": 2.5894008874800325e-05, + "loss": 0.6706, + "step": 1449 + }, + { + "epoch": 0.7733333333333333, + "grad_norm": 0.3868466487357469, + "learning_rate": 2.577809166078716e-05, + "loss": 0.6642, + "step": 1450 + }, + { + "epoch": 0.7738666666666667, + "grad_norm": 0.492322352337929, + "learning_rate": 2.566239608465838e-05, + "loss": 0.6917, + "step": 1451 + }, + { + "epoch": 0.7744, + "grad_norm": 0.5446397964981483, + "learning_rate": 2.5546922491898495e-05, + "loss": 0.7429, + "step": 1452 + }, + { + "epoch": 0.7749333333333334, + "grad_norm": 0.5543790229843352, + "learning_rate": 2.543167122732918e-05, + "loss": 0.7267, + "step": 1453 + }, + { + "epoch": 0.7754666666666666, + "grad_norm": 0.5247544238266016, + "learning_rate": 2.5316642635108244e-05, + "loss": 0.7613, + "step": 1454 + }, + { + "epoch": 0.776, + "grad_norm": 0.5039706234528507, + "learning_rate": 2.5201837058728505e-05, + "loss": 0.7619, + "step": 1455 + }, + { + "epoch": 0.7765333333333333, + "grad_norm": 0.5713153815073273, + "learning_rate": 2.508725484101684e-05, + "loss": 0.7059, + "step": 1456 + }, + { + "epoch": 0.7770666666666667, + "grad_norm": 0.44923373773430025, + "learning_rate": 2.4972896324133144e-05, + "loss": 0.7205, + "step": 1457 + }, + { + "epoch": 0.7776, + "grad_norm": 0.5263321786867668, + "learning_rate": 2.485876184956928e-05, + "loss": 0.7533, + "step": 1458 + }, + { + "epoch": 0.7781333333333333, + "grad_norm": 0.4240566695575976, + "learning_rate": 2.4744851758148156e-05, + "loss": 0.7257, + "step": 1459 + }, + { + "epoch": 0.7786666666666666, + "grad_norm": 0.4737882576220856, + "learning_rate": 2.4631166390022574e-05, + "loss": 0.7747, + "step": 1460 + }, + { + "epoch": 0.7792, + "grad_norm": 0.4316557468939239, + "learning_rate": 2.451770608467432e-05, + "loss": 0.6621, + "step": 1461 + }, + { + "epoch": 0.7797333333333333, + "grad_norm": 0.5249481145085421, + "learning_rate": 2.4404471180913058e-05, + "loss": 0.7565, + "step": 1462 + }, + { + "epoch": 0.7802666666666667, + "grad_norm": 0.42462653959991414, + "learning_rate": 2.429146201687538e-05, + "loss": 0.72, + "step": 1463 + }, + { + "epoch": 0.7808, + "grad_norm": 0.4443353759436208, + "learning_rate": 2.417867893002387e-05, + "loss": 0.7001, + "step": 1464 + }, + { + "epoch": 0.7813333333333333, + "grad_norm": 0.573134473668386, + "learning_rate": 2.4066122257145894e-05, + "loss": 0.6942, + "step": 1465 + }, + { + "epoch": 0.7818666666666667, + "grad_norm": 0.4585178063664748, + "learning_rate": 2.3953792334352787e-05, + "loss": 0.7028, + "step": 1466 + }, + { + "epoch": 0.7824, + "grad_norm": 0.4948736403806755, + "learning_rate": 2.3841689497078746e-05, + "loss": 0.6499, + "step": 1467 + }, + { + "epoch": 0.7829333333333334, + "grad_norm": 0.49334388896702935, + "learning_rate": 2.3729814080079816e-05, + "loss": 0.7077, + "step": 1468 + }, + { + "epoch": 0.7834666666666666, + "grad_norm": 0.6176119339584434, + "learning_rate": 2.361816641743303e-05, + "loss": 0.7847, + "step": 1469 + }, + { + "epoch": 0.784, + "grad_norm": 0.43537482237087494, + "learning_rate": 2.3506746842535242e-05, + "loss": 0.6734, + "step": 1470 + }, + { + "epoch": 0.7845333333333333, + "grad_norm": 0.4313016572978186, + "learning_rate": 2.339555568810221e-05, + "loss": 0.7047, + "step": 1471 + }, + { + "epoch": 0.7850666666666667, + "grad_norm": 0.440581929957751, + "learning_rate": 2.328459328616759e-05, + "loss": 0.7409, + "step": 1472 + }, + { + "epoch": 0.7856, + "grad_norm": 0.47207732516459877, + "learning_rate": 2.3173859968081944e-05, + "loss": 0.7346, + "step": 1473 + }, + { + "epoch": 0.7861333333333334, + "grad_norm": 0.44963953263791284, + "learning_rate": 2.306335606451181e-05, + "loss": 0.6737, + "step": 1474 + }, + { + "epoch": 0.7866666666666666, + "grad_norm": 0.5650758578094732, + "learning_rate": 2.295308190543859e-05, + "loss": 0.7682, + "step": 1475 + }, + { + "epoch": 0.7872, + "grad_norm": 0.50788328603849, + "learning_rate": 2.2843037820157675e-05, + "loss": 0.7383, + "step": 1476 + }, + { + "epoch": 0.7877333333333333, + "grad_norm": 0.5799854089956136, + "learning_rate": 2.2733224137277366e-05, + "loss": 0.8206, + "step": 1477 + }, + { + "epoch": 0.7882666666666667, + "grad_norm": 0.5175275043881233, + "learning_rate": 2.262364118471805e-05, + "loss": 0.6883, + "step": 1478 + }, + { + "epoch": 0.7888, + "grad_norm": 0.6954639855386038, + "learning_rate": 2.251428928971102e-05, + "loss": 0.802, + "step": 1479 + }, + { + "epoch": 0.7893333333333333, + "grad_norm": 0.5270117727580154, + "learning_rate": 2.2405168778797646e-05, + "loss": 0.7876, + "step": 1480 + }, + { + "epoch": 0.7898666666666667, + "grad_norm": 0.44282138308302416, + "learning_rate": 2.2296279977828337e-05, + "loss": 0.6937, + "step": 1481 + }, + { + "epoch": 0.7904, + "grad_norm": 0.44350813061965494, + "learning_rate": 2.2187623211961562e-05, + "loss": 0.7533, + "step": 1482 + }, + { + "epoch": 0.7909333333333334, + "grad_norm": 0.4290269336099863, + "learning_rate": 2.2079198805662914e-05, + "loss": 0.6326, + "step": 1483 + }, + { + "epoch": 0.7914666666666667, + "grad_norm": 0.47516800700789324, + "learning_rate": 2.1971007082704164e-05, + "loss": 0.6657, + "step": 1484 + }, + { + "epoch": 0.792, + "grad_norm": 0.44226535940405953, + "learning_rate": 2.1863048366162208e-05, + "loss": 0.6882, + "step": 1485 + }, + { + "epoch": 0.7925333333333333, + "grad_norm": 0.6018934743437193, + "learning_rate": 2.1755322978418137e-05, + "loss": 0.8168, + "step": 1486 + }, + { + "epoch": 0.7930666666666667, + "grad_norm": 0.49401317098463216, + "learning_rate": 2.1647831241156302e-05, + "loss": 0.7495, + "step": 1487 + }, + { + "epoch": 0.7936, + "grad_norm": 0.46866523180432645, + "learning_rate": 2.1540573475363402e-05, + "loss": 0.6789, + "step": 1488 + }, + { + "epoch": 0.7941333333333334, + "grad_norm": 0.5246029946716454, + "learning_rate": 2.1433550001327373e-05, + "loss": 0.6717, + "step": 1489 + }, + { + "epoch": 0.7946666666666666, + "grad_norm": 0.5835799171355602, + "learning_rate": 2.1326761138636553e-05, + "loss": 0.7297, + "step": 1490 + }, + { + "epoch": 0.7952, + "grad_norm": 0.5048177896739099, + "learning_rate": 2.1220207206178688e-05, + "loss": 0.6995, + "step": 1491 + }, + { + "epoch": 0.7957333333333333, + "grad_norm": 0.4361443087723303, + "learning_rate": 2.111388852214001e-05, + "loss": 0.6656, + "step": 1492 + }, + { + "epoch": 0.7962666666666667, + "grad_norm": 0.6058728372984273, + "learning_rate": 2.1007805404004242e-05, + "loss": 0.7824, + "step": 1493 + }, + { + "epoch": 0.7968, + "grad_norm": 0.6374772059823162, + "learning_rate": 2.0901958168551638e-05, + "loss": 0.8744, + "step": 1494 + }, + { + "epoch": 0.7973333333333333, + "grad_norm": 0.4282587409275975, + "learning_rate": 2.0796347131858186e-05, + "loss": 0.6471, + "step": 1495 + }, + { + "epoch": 0.7978666666666666, + "grad_norm": 0.429099311758806, + "learning_rate": 2.069097260929439e-05, + "loss": 0.6038, + "step": 1496 + }, + { + "epoch": 0.7984, + "grad_norm": 0.46101280173127135, + "learning_rate": 2.058583491552465e-05, + "loss": 0.6908, + "step": 1497 + }, + { + "epoch": 0.7989333333333334, + "grad_norm": 0.4437725012180013, + "learning_rate": 2.048093436450603e-05, + "loss": 0.6629, + "step": 1498 + }, + { + "epoch": 0.7994666666666667, + "grad_norm": 0.7348344852509376, + "learning_rate": 2.0376271269487514e-05, + "loss": 0.9117, + "step": 1499 + }, + { + "epoch": 0.8, + "grad_norm": 0.5289879732485965, + "learning_rate": 2.027184594300898e-05, + "loss": 0.7368, + "step": 1500 + }, + { + "epoch": 0.8005333333333333, + "grad_norm": 0.4774681909052075, + "learning_rate": 2.0167658696900317e-05, + "loss": 0.7869, + "step": 1501 + }, + { + "epoch": 0.8010666666666667, + "grad_norm": 0.4391310441341285, + "learning_rate": 2.0063709842280432e-05, + "loss": 0.6039, + "step": 1502 + }, + { + "epoch": 0.8016, + "grad_norm": 0.6015495262618114, + "learning_rate": 1.995999968955641e-05, + "loss": 0.8394, + "step": 1503 + }, + { + "epoch": 0.8021333333333334, + "grad_norm": 0.5681477525141515, + "learning_rate": 1.985652854842247e-05, + "loss": 0.7568, + "step": 1504 + }, + { + "epoch": 0.8026666666666666, + "grad_norm": 0.5289834794894012, + "learning_rate": 1.9753296727859195e-05, + "loss": 0.7474, + "step": 1505 + }, + { + "epoch": 0.8032, + "grad_norm": 0.4754287143901146, + "learning_rate": 1.9650304536132426e-05, + "loss": 0.7412, + "step": 1506 + }, + { + "epoch": 0.8037333333333333, + "grad_norm": 0.49668724912942624, + "learning_rate": 1.9547552280792524e-05, + "loss": 0.7087, + "step": 1507 + }, + { + "epoch": 0.8042666666666667, + "grad_norm": 0.47088805597233857, + "learning_rate": 1.9445040268673298e-05, + "loss": 0.6956, + "step": 1508 + }, + { + "epoch": 0.8048, + "grad_norm": 0.5627325429138672, + "learning_rate": 1.9342768805891178e-05, + "loss": 0.7543, + "step": 1509 + }, + { + "epoch": 0.8053333333333333, + "grad_norm": 0.4611693226301201, + "learning_rate": 1.9240738197844278e-05, + "loss": 0.6894, + "step": 1510 + }, + { + "epoch": 0.8058666666666666, + "grad_norm": 0.45336038424072905, + "learning_rate": 1.9138948749211472e-05, + "loss": 0.7619, + "step": 1511 + }, + { + "epoch": 0.8064, + "grad_norm": 0.48155579631399054, + "learning_rate": 1.903740076395151e-05, + "loss": 0.799, + "step": 1512 + }, + { + "epoch": 0.8069333333333333, + "grad_norm": 0.5930487351489279, + "learning_rate": 1.8936094545302095e-05, + "loss": 0.75, + "step": 1513 + }, + { + "epoch": 0.8074666666666667, + "grad_norm": 0.45997113728666444, + "learning_rate": 1.883503039577894e-05, + "loss": 0.7387, + "step": 1514 + }, + { + "epoch": 0.808, + "grad_norm": 0.4772986674601558, + "learning_rate": 1.8734208617174988e-05, + "loss": 0.7531, + "step": 1515 + }, + { + "epoch": 0.8085333333333333, + "grad_norm": 0.342855727151531, + "learning_rate": 1.8633629510559314e-05, + "loss": 0.5577, + "step": 1516 + }, + { + "epoch": 0.8090666666666667, + "grad_norm": 0.5207852106715284, + "learning_rate": 1.8533293376276472e-05, + "loss": 0.7428, + "step": 1517 + }, + { + "epoch": 0.8096, + "grad_norm": 0.4480357947246842, + "learning_rate": 1.8433200513945337e-05, + "loss": 0.6931, + "step": 1518 + }, + { + "epoch": 0.8101333333333334, + "grad_norm": 0.5086770454566198, + "learning_rate": 1.8333351222458407e-05, + "loss": 0.7593, + "step": 1519 + }, + { + "epoch": 0.8106666666666666, + "grad_norm": 0.4999233008690756, + "learning_rate": 1.8233745799980817e-05, + "loss": 0.7642, + "step": 1520 + }, + { + "epoch": 0.8112, + "grad_norm": 0.43508693288352246, + "learning_rate": 1.8134384543949478e-05, + "loss": 0.7282, + "step": 1521 + }, + { + "epoch": 0.8117333333333333, + "grad_norm": 0.5929791884787343, + "learning_rate": 1.803526775107217e-05, + "loss": 0.8132, + "step": 1522 + }, + { + "epoch": 0.8122666666666667, + "grad_norm": 0.4793522536878096, + "learning_rate": 1.7936395717326704e-05, + "loss": 0.6612, + "step": 1523 + }, + { + "epoch": 0.8128, + "grad_norm": 0.4082634546055162, + "learning_rate": 1.783776873795994e-05, + "loss": 0.7086, + "step": 1524 + }, + { + "epoch": 0.8133333333333334, + "grad_norm": 0.5522124987414765, + "learning_rate": 1.773938710748706e-05, + "loss": 0.7897, + "step": 1525 + }, + { + "epoch": 0.8138666666666666, + "grad_norm": 0.49094203430688615, + "learning_rate": 1.7641251119690505e-05, + "loss": 0.7265, + "step": 1526 + }, + { + "epoch": 0.8144, + "grad_norm": 0.5446964417982316, + "learning_rate": 1.754336106761927e-05, + "loss": 0.7531, + "step": 1527 + }, + { + "epoch": 0.8149333333333333, + "grad_norm": 0.47350783687561393, + "learning_rate": 1.744571724358789e-05, + "loss": 0.7659, + "step": 1528 + }, + { + "epoch": 0.8154666666666667, + "grad_norm": 0.47482720663461636, + "learning_rate": 1.7348319939175637e-05, + "loss": 0.7414, + "step": 1529 + }, + { + "epoch": 0.816, + "grad_norm": 0.4908705406803905, + "learning_rate": 1.7251169445225657e-05, + "loss": 0.7528, + "step": 1530 + }, + { + "epoch": 0.8165333333333333, + "grad_norm": 0.4250681206948574, + "learning_rate": 1.715426605184407e-05, + "loss": 0.6601, + "step": 1531 + }, + { + "epoch": 0.8170666666666667, + "grad_norm": 0.5343277827022553, + "learning_rate": 1.705761004839911e-05, + "loss": 0.7508, + "step": 1532 + }, + { + "epoch": 0.8176, + "grad_norm": 0.5335001480302303, + "learning_rate": 1.696120172352025e-05, + "loss": 0.6447, + "step": 1533 + }, + { + "epoch": 0.8181333333333334, + "grad_norm": 0.5026620625081853, + "learning_rate": 1.6865041365097435e-05, + "loss": 0.6882, + "step": 1534 + }, + { + "epoch": 0.8186666666666667, + "grad_norm": 0.5794331730821927, + "learning_rate": 1.676912926028007e-05, + "loss": 0.8791, + "step": 1535 + }, + { + "epoch": 0.8192, + "grad_norm": 0.44335294230259914, + "learning_rate": 1.6673465695476232e-05, + "loss": 0.646, + "step": 1536 + }, + { + "epoch": 0.8197333333333333, + "grad_norm": 0.5489816260124318, + "learning_rate": 1.6578050956351886e-05, + "loss": 0.8286, + "step": 1537 + }, + { + "epoch": 0.8202666666666667, + "grad_norm": 0.5339060440220494, + "learning_rate": 1.6482885327829913e-05, + "loss": 0.776, + "step": 1538 + }, + { + "epoch": 0.8208, + "grad_norm": 0.3975566328563563, + "learning_rate": 1.6387969094089316e-05, + "loss": 0.7191, + "step": 1539 + }, + { + "epoch": 0.8213333333333334, + "grad_norm": 0.7170117974700864, + "learning_rate": 1.6293302538564382e-05, + "loss": 0.6505, + "step": 1540 + }, + { + "epoch": 0.8218666666666666, + "grad_norm": 0.5443132706187244, + "learning_rate": 1.619888594394382e-05, + "loss": 0.7023, + "step": 1541 + }, + { + "epoch": 0.8224, + "grad_norm": 0.6813717234168647, + "learning_rate": 1.6104719592169902e-05, + "loss": 0.8853, + "step": 1542 + }, + { + "epoch": 0.8229333333333333, + "grad_norm": 0.42647268339192634, + "learning_rate": 1.601080376443763e-05, + "loss": 0.6592, + "step": 1543 + }, + { + "epoch": 0.8234666666666667, + "grad_norm": 0.5076398265848955, + "learning_rate": 1.5917138741193973e-05, + "loss": 0.6896, + "step": 1544 + }, + { + "epoch": 0.824, + "grad_norm": 0.5395449562699478, + "learning_rate": 1.5823724802136865e-05, + "loss": 0.7446, + "step": 1545 + }, + { + "epoch": 0.8245333333333333, + "grad_norm": 0.4397096635647227, + "learning_rate": 1.573056222621453e-05, + "loss": 0.7337, + "step": 1546 + }, + { + "epoch": 0.8250666666666666, + "grad_norm": 0.4661381882198113, + "learning_rate": 1.5637651291624523e-05, + "loss": 0.7243, + "step": 1547 + }, + { + "epoch": 0.8256, + "grad_norm": 0.5304568695304118, + "learning_rate": 1.5544992275813053e-05, + "loss": 0.686, + "step": 1548 + }, + { + "epoch": 0.8261333333333334, + "grad_norm": 0.5139764135159359, + "learning_rate": 1.5452585455473977e-05, + "loss": 0.7702, + "step": 1549 + }, + { + "epoch": 0.8266666666666667, + "grad_norm": 0.4789455046031435, + "learning_rate": 1.536043110654809e-05, + "loss": 0.6182, + "step": 1550 + }, + { + "epoch": 0.8272, + "grad_norm": 0.6320559508800366, + "learning_rate": 1.526852950422226e-05, + "loss": 0.7692, + "step": 1551 + }, + { + "epoch": 0.8277333333333333, + "grad_norm": 0.4280059561481728, + "learning_rate": 1.5176880922928616e-05, + "loss": 0.6595, + "step": 1552 + }, + { + "epoch": 0.8282666666666667, + "grad_norm": 0.5164363899718144, + "learning_rate": 1.5085485636343755e-05, + "loss": 0.7251, + "step": 1553 + }, + { + "epoch": 0.8288, + "grad_norm": 0.5049914807371246, + "learning_rate": 1.4994343917387854e-05, + "loss": 0.684, + "step": 1554 + }, + { + "epoch": 0.8293333333333334, + "grad_norm": 0.5739698211966615, + "learning_rate": 1.4903456038223939e-05, + "loss": 0.7271, + "step": 1555 + }, + { + "epoch": 0.8298666666666666, + "grad_norm": 0.5201445604164033, + "learning_rate": 1.4812822270257009e-05, + "loss": 0.7586, + "step": 1556 + }, + { + "epoch": 0.8304, + "grad_norm": 0.5842073507004636, + "learning_rate": 1.4722442884133214e-05, + "loss": 0.7876, + "step": 1557 + }, + { + "epoch": 0.8309333333333333, + "grad_norm": 0.5006665125106674, + "learning_rate": 1.4632318149739177e-05, + "loss": 0.75, + "step": 1558 + }, + { + "epoch": 0.8314666666666667, + "grad_norm": 0.44481322280762703, + "learning_rate": 1.454244833620102e-05, + "loss": 0.6589, + "step": 1559 + }, + { + "epoch": 0.832, + "grad_norm": 0.4780565108382196, + "learning_rate": 1.4452833711883628e-05, + "loss": 0.7133, + "step": 1560 + }, + { + "epoch": 0.8325333333333333, + "grad_norm": 0.6376601193425918, + "learning_rate": 1.4363474544389877e-05, + "loss": 0.7181, + "step": 1561 + }, + { + "epoch": 0.8330666666666666, + "grad_norm": 0.4881973656946014, + "learning_rate": 1.4274371100559791e-05, + "loss": 0.6112, + "step": 1562 + }, + { + "epoch": 0.8336, + "grad_norm": 0.4847015050679535, + "learning_rate": 1.4185523646469822e-05, + "loss": 0.7145, + "step": 1563 + }, + { + "epoch": 0.8341333333333333, + "grad_norm": 0.42890771448161547, + "learning_rate": 1.409693244743192e-05, + "loss": 0.6892, + "step": 1564 + }, + { + "epoch": 0.8346666666666667, + "grad_norm": 0.609872097613963, + "learning_rate": 1.4008597767992871e-05, + "loss": 0.802, + "step": 1565 + }, + { + "epoch": 0.8352, + "grad_norm": 0.7133164250244552, + "learning_rate": 1.3920519871933424e-05, + "loss": 0.8849, + "step": 1566 + }, + { + "epoch": 0.8357333333333333, + "grad_norm": 0.6242493206708636, + "learning_rate": 1.3832699022267515e-05, + "loss": 0.6744, + "step": 1567 + }, + { + "epoch": 0.8362666666666667, + "grad_norm": 0.5168845067044241, + "learning_rate": 1.37451354812416e-05, + "loss": 0.666, + "step": 1568 + }, + { + "epoch": 0.8368, + "grad_norm": 0.5621679580990633, + "learning_rate": 1.3657829510333654e-05, + "loss": 0.7005, + "step": 1569 + }, + { + "epoch": 0.8373333333333334, + "grad_norm": 0.6475097082467258, + "learning_rate": 1.3570781370252582e-05, + "loss": 0.699, + "step": 1570 + }, + { + "epoch": 0.8378666666666666, + "grad_norm": 0.4777742852510045, + "learning_rate": 1.3483991320937306e-05, + "loss": 0.7145, + "step": 1571 + }, + { + "epoch": 0.8384, + "grad_norm": 0.5394317307286133, + "learning_rate": 1.339745962155613e-05, + "loss": 0.7658, + "step": 1572 + }, + { + "epoch": 0.8389333333333333, + "grad_norm": 0.507781052328838, + "learning_rate": 1.3311186530505838e-05, + "loss": 0.6443, + "step": 1573 + }, + { + "epoch": 0.8394666666666667, + "grad_norm": 0.4927595899122509, + "learning_rate": 1.322517230541096e-05, + "loss": 0.744, + "step": 1574 + }, + { + "epoch": 0.84, + "grad_norm": 0.5229718780304551, + "learning_rate": 1.3139417203123027e-05, + "loss": 0.7315, + "step": 1575 + }, + { + "epoch": 0.8405333333333334, + "grad_norm": 0.44984755894237033, + "learning_rate": 1.30539214797198e-05, + "loss": 0.6701, + "step": 1576 + }, + { + "epoch": 0.8410666666666666, + "grad_norm": 0.5030223372803909, + "learning_rate": 1.2968685390504465e-05, + "loss": 0.754, + "step": 1577 + }, + { + "epoch": 0.8416, + "grad_norm": 0.6429144293745496, + "learning_rate": 1.2883709190004955e-05, + "loss": 0.7469, + "step": 1578 + }, + { + "epoch": 0.8421333333333333, + "grad_norm": 0.5339353774693559, + "learning_rate": 1.2798993131973091e-05, + "loss": 0.7461, + "step": 1579 + }, + { + "epoch": 0.8426666666666667, + "grad_norm": 0.5074697215392797, + "learning_rate": 1.2714537469383858e-05, + "loss": 0.7243, + "step": 1580 + }, + { + "epoch": 0.8432, + "grad_norm": 0.5787063907047957, + "learning_rate": 1.263034245443473e-05, + "loss": 0.8122, + "step": 1581 + }, + { + "epoch": 0.8437333333333333, + "grad_norm": 0.46474831994188626, + "learning_rate": 1.2546408338544769e-05, + "loss": 0.6343, + "step": 1582 + }, + { + "epoch": 0.8442666666666667, + "grad_norm": 0.4647701639052436, + "learning_rate": 1.2462735372353996e-05, + "loss": 0.5977, + "step": 1583 + }, + { + "epoch": 0.8448, + "grad_norm": 0.45392179343711336, + "learning_rate": 1.2379323805722576e-05, + "loss": 0.6932, + "step": 1584 + }, + { + "epoch": 0.8453333333333334, + "grad_norm": 0.541610814787558, + "learning_rate": 1.2296173887730123e-05, + "loss": 0.6834, + "step": 1585 + }, + { + "epoch": 0.8458666666666667, + "grad_norm": 0.46960670020822476, + "learning_rate": 1.2213285866674905e-05, + "loss": 0.6951, + "step": 1586 + }, + { + "epoch": 0.8464, + "grad_norm": 0.5288161163972965, + "learning_rate": 1.2130659990073146e-05, + "loss": 0.734, + "step": 1587 + }, + { + "epoch": 0.8469333333333333, + "grad_norm": 0.44749151366752427, + "learning_rate": 1.2048296504658207e-05, + "loss": 0.7222, + "step": 1588 + }, + { + "epoch": 0.8474666666666667, + "grad_norm": 0.44108969518091085, + "learning_rate": 1.1966195656380031e-05, + "loss": 0.6188, + "step": 1589 + }, + { + "epoch": 0.848, + "grad_norm": 0.4740794448001084, + "learning_rate": 1.1884357690404158e-05, + "loss": 0.5919, + "step": 1590 + }, + { + "epoch": 0.8485333333333334, + "grad_norm": 0.5574353313596002, + "learning_rate": 1.1802782851111205e-05, + "loss": 0.7609, + "step": 1591 + }, + { + "epoch": 0.8490666666666666, + "grad_norm": 0.46755971845860866, + "learning_rate": 1.1721471382096027e-05, + "loss": 0.6999, + "step": 1592 + }, + { + "epoch": 0.8496, + "grad_norm": 0.482119103447549, + "learning_rate": 1.1640423526166988e-05, + "loss": 0.7095, + "step": 1593 + }, + { + "epoch": 0.8501333333333333, + "grad_norm": 0.3913894476048914, + "learning_rate": 1.1559639525345311e-05, + "loss": 0.6112, + "step": 1594 + }, + { + "epoch": 0.8506666666666667, + "grad_norm": 0.4993711521302121, + "learning_rate": 1.1479119620864276e-05, + "loss": 0.7427, + "step": 1595 + }, + { + "epoch": 0.8512, + "grad_norm": 0.47052895381251125, + "learning_rate": 1.1398864053168534e-05, + "loss": 0.6776, + "step": 1596 + }, + { + "epoch": 0.8517333333333333, + "grad_norm": 0.5618426674787269, + "learning_rate": 1.1318873061913405e-05, + "loss": 0.7462, + "step": 1597 + }, + { + "epoch": 0.8522666666666666, + "grad_norm": 0.5218744908763494, + "learning_rate": 1.123914688596409e-05, + "loss": 0.7113, + "step": 1598 + }, + { + "epoch": 0.8528, + "grad_norm": 0.6189167119642783, + "learning_rate": 1.1159685763395111e-05, + "loss": 0.7796, + "step": 1599 + }, + { + "epoch": 0.8533333333333334, + "grad_norm": 0.5042245301115625, + "learning_rate": 1.1080489931489391e-05, + "loss": 0.772, + "step": 1600 + }, + { + "epoch": 0.8538666666666667, + "grad_norm": 0.6297885943248943, + "learning_rate": 1.1001559626737756e-05, + "loss": 0.9115, + "step": 1601 + }, + { + "epoch": 0.8544, + "grad_norm": 0.5568572099403468, + "learning_rate": 1.0922895084838037e-05, + "loss": 0.7492, + "step": 1602 + }, + { + "epoch": 0.8549333333333333, + "grad_norm": 0.6810557755957365, + "learning_rate": 1.0844496540694515e-05, + "loss": 0.8225, + "step": 1603 + }, + { + "epoch": 0.8554666666666667, + "grad_norm": 0.5607435337013624, + "learning_rate": 1.0766364228417148e-05, + "loss": 0.7527, + "step": 1604 + }, + { + "epoch": 0.856, + "grad_norm": 0.4515840060236342, + "learning_rate": 1.0688498381320855e-05, + "loss": 0.6808, + "step": 1605 + }, + { + "epoch": 0.8565333333333334, + "grad_norm": 0.6801113082103843, + "learning_rate": 1.0610899231924886e-05, + "loss": 0.8237, + "step": 1606 + }, + { + "epoch": 0.8570666666666666, + "grad_norm": 0.4562347313207618, + "learning_rate": 1.0533567011952094e-05, + "loss": 0.7752, + "step": 1607 + }, + { + "epoch": 0.8576, + "grad_norm": 0.6472491506694351, + "learning_rate": 1.045650195232819e-05, + "loss": 0.7549, + "step": 1608 + }, + { + "epoch": 0.8581333333333333, + "grad_norm": 0.5321472350862856, + "learning_rate": 1.0379704283181179e-05, + "loss": 0.7525, + "step": 1609 + }, + { + "epoch": 0.8586666666666667, + "grad_norm": 0.4816956212107591, + "learning_rate": 1.0303174233840528e-05, + "loss": 0.6833, + "step": 1610 + }, + { + "epoch": 0.8592, + "grad_norm": 0.6643577154994802, + "learning_rate": 1.0226912032836611e-05, + "loss": 0.8251, + "step": 1611 + }, + { + "epoch": 0.8597333333333333, + "grad_norm": 0.5978505249505609, + "learning_rate": 1.0150917907899926e-05, + "loss": 0.8071, + "step": 1612 + }, + { + "epoch": 0.8602666666666666, + "grad_norm": 0.5947337349744622, + "learning_rate": 1.007519208596045e-05, + "loss": 0.813, + "step": 1613 + }, + { + "epoch": 0.8608, + "grad_norm": 0.49178024775865825, + "learning_rate": 9.999734793146998e-06, + "loss": 0.7759, + "step": 1614 + }, + { + "epoch": 0.8613333333333333, + "grad_norm": 0.4975472909610797, + "learning_rate": 9.924546254786493e-06, + "loss": 0.7224, + "step": 1615 + }, + { + "epoch": 0.8618666666666667, + "grad_norm": 0.6241219647197276, + "learning_rate": 9.849626695403324e-06, + "loss": 0.7566, + "step": 1616 + }, + { + "epoch": 0.8624, + "grad_norm": 0.42948075558014465, + "learning_rate": 9.774976338718677e-06, + "loss": 0.6733, + "step": 1617 + }, + { + "epoch": 0.8629333333333333, + "grad_norm": 0.3953019485702067, + "learning_rate": 9.700595407649805e-06, + "loss": 0.7235, + "step": 1618 + }, + { + "epoch": 0.8634666666666667, + "grad_norm": 0.43381927608250337, + "learning_rate": 9.62648412430951e-06, + "loss": 0.6811, + "step": 1619 + }, + { + "epoch": 0.864, + "grad_norm": 0.4883812440087532, + "learning_rate": 9.552642710005299e-06, + "loss": 0.7771, + "step": 1620 + }, + { + "epoch": 0.8645333333333334, + "grad_norm": 0.4012524999491689, + "learning_rate": 9.479071385238892e-06, + "loss": 0.564, + "step": 1621 + }, + { + "epoch": 0.8650666666666667, + "grad_norm": 0.45407652559344924, + "learning_rate": 9.40577036970538e-06, + "loss": 0.6878, + "step": 1622 + }, + { + "epoch": 0.8656, + "grad_norm": 0.4360734707364681, + "learning_rate": 9.332739882292752e-06, + "loss": 0.7452, + "step": 1623 + }, + { + "epoch": 0.8661333333333333, + "grad_norm": 0.4338415317459866, + "learning_rate": 9.259980141081115e-06, + "loss": 0.6691, + "step": 1624 + }, + { + "epoch": 0.8666666666666667, + "grad_norm": 0.46791105312666936, + "learning_rate": 9.187491363342093e-06, + "loss": 0.7133, + "step": 1625 + }, + { + "epoch": 0.8672, + "grad_norm": 0.6443437997468915, + "learning_rate": 9.115273765538202e-06, + "loss": 0.7591, + "step": 1626 + }, + { + "epoch": 0.8677333333333334, + "grad_norm": 0.5840070559987589, + "learning_rate": 9.043327563322112e-06, + "loss": 0.6346, + "step": 1627 + }, + { + "epoch": 0.8682666666666666, + "grad_norm": 0.5121836872551004, + "learning_rate": 8.971652971536148e-06, + "loss": 0.8103, + "step": 1628 + }, + { + "epoch": 0.8688, + "grad_norm": 0.42595043794524284, + "learning_rate": 8.900250204211514e-06, + "loss": 0.5986, + "step": 1629 + }, + { + "epoch": 0.8693333333333333, + "grad_norm": 0.6219160007133622, + "learning_rate": 8.829119474567671e-06, + "loss": 0.7577, + "step": 1630 + }, + { + "epoch": 0.8698666666666667, + "grad_norm": 0.44806459453518815, + "learning_rate": 8.758260995011825e-06, + "loss": 0.725, + "step": 1631 + }, + { + "epoch": 0.8704, + "grad_norm": 0.42680013936557165, + "learning_rate": 8.687674977138116e-06, + "loss": 0.6499, + "step": 1632 + }, + { + "epoch": 0.8709333333333333, + "grad_norm": 0.4873858389555336, + "learning_rate": 8.617361631727138e-06, + "loss": 0.7275, + "step": 1633 + }, + { + "epoch": 0.8714666666666666, + "grad_norm": 0.90636018002638, + "learning_rate": 8.547321168745193e-06, + "loss": 0.6766, + "step": 1634 + }, + { + "epoch": 0.872, + "grad_norm": 0.5276246709013431, + "learning_rate": 8.47755379734373e-06, + "loss": 0.6333, + "step": 1635 + }, + { + "epoch": 0.8725333333333334, + "grad_norm": 0.4695454884594552, + "learning_rate": 8.408059725858719e-06, + "loss": 0.6974, + "step": 1636 + }, + { + "epoch": 0.8730666666666667, + "grad_norm": 0.48589883346143603, + "learning_rate": 8.338839161809997e-06, + "loss": 0.6775, + "step": 1637 + }, + { + "epoch": 0.8736, + "grad_norm": 0.5344726949607503, + "learning_rate": 8.269892311900696e-06, + "loss": 0.7416, + "step": 1638 + }, + { + "epoch": 0.8741333333333333, + "grad_norm": 0.4755854308811973, + "learning_rate": 8.201219382016556e-06, + "loss": 0.7098, + "step": 1639 + }, + { + "epoch": 0.8746666666666667, + "grad_norm": 0.561545222591497, + "learning_rate": 8.132820577225387e-06, + "loss": 0.7162, + "step": 1640 + }, + { + "epoch": 0.8752, + "grad_norm": 0.5080344993691123, + "learning_rate": 8.064696101776358e-06, + "loss": 0.7329, + "step": 1641 + }, + { + "epoch": 0.8757333333333334, + "grad_norm": 0.4208799586754482, + "learning_rate": 7.996846159099557e-06, + "loss": 0.6437, + "step": 1642 + }, + { + "epoch": 0.8762666666666666, + "grad_norm": 0.4594465001381088, + "learning_rate": 7.929270951805178e-06, + "loss": 0.6687, + "step": 1643 + }, + { + "epoch": 0.8768, + "grad_norm": 0.5416270281848486, + "learning_rate": 7.861970681683051e-06, + "loss": 0.7968, + "step": 1644 + }, + { + "epoch": 0.8773333333333333, + "grad_norm": 0.5311902983698358, + "learning_rate": 7.794945549701993e-06, + "loss": 0.763, + "step": 1645 + }, + { + "epoch": 0.8778666666666667, + "grad_norm": 0.518468997652413, + "learning_rate": 7.728195756009204e-06, + "loss": 0.7842, + "step": 1646 + }, + { + "epoch": 0.8784, + "grad_norm": 0.47316950851905404, + "learning_rate": 7.661721499929753e-06, + "loss": 0.7328, + "step": 1647 + }, + { + "epoch": 0.8789333333333333, + "grad_norm": 0.4072261792424843, + "learning_rate": 7.595522979965819e-06, + "loss": 0.7069, + "step": 1648 + }, + { + "epoch": 0.8794666666666666, + "grad_norm": 0.4818977614864567, + "learning_rate": 7.529600393796232e-06, + "loss": 0.7396, + "step": 1649 + }, + { + "epoch": 0.88, + "grad_norm": 0.43408037155400875, + "learning_rate": 7.463953938275858e-06, + "loss": 0.648, + "step": 1650 + }, + { + "epoch": 0.8805333333333333, + "grad_norm": 0.4554128660458298, + "learning_rate": 7.3985838094349444e-06, + "loss": 0.6514, + "step": 1651 + }, + { + "epoch": 0.8810666666666667, + "grad_norm": 0.40047115756802987, + "learning_rate": 7.333490202478666e-06, + "loss": 0.7047, + "step": 1652 + }, + { + "epoch": 0.8816, + "grad_norm": 0.45744072060633134, + "learning_rate": 7.2686733117863784e-06, + "loss": 0.6084, + "step": 1653 + }, + { + "epoch": 0.8821333333333333, + "grad_norm": 0.4125268252314731, + "learning_rate": 7.204133330911178e-06, + "loss": 0.6247, + "step": 1654 + }, + { + "epoch": 0.8826666666666667, + "grad_norm": 0.3556923600367426, + "learning_rate": 7.1398704525792e-06, + "loss": 0.6169, + "step": 1655 + }, + { + "epoch": 0.8832, + "grad_norm": 0.4399392218149216, + "learning_rate": 7.07588486868922e-06, + "loss": 0.6586, + "step": 1656 + }, + { + "epoch": 0.8837333333333334, + "grad_norm": 0.4537009793185571, + "learning_rate": 7.012176770311862e-06, + "loss": 0.718, + "step": 1657 + }, + { + "epoch": 0.8842666666666666, + "grad_norm": 0.7025833802101566, + "learning_rate": 6.948746347689183e-06, + "loss": 0.7511, + "step": 1658 + }, + { + "epoch": 0.8848, + "grad_norm": 0.4361650334499131, + "learning_rate": 6.8855937902340576e-06, + "loss": 0.7057, + "step": 1659 + }, + { + "epoch": 0.8853333333333333, + "grad_norm": 0.47627661834461577, + "learning_rate": 6.8227192865295995e-06, + "loss": 0.7142, + "step": 1660 + }, + { + "epoch": 0.8858666666666667, + "grad_norm": 0.39433561933398675, + "learning_rate": 6.760123024328624e-06, + "loss": 0.7282, + "step": 1661 + }, + { + "epoch": 0.8864, + "grad_norm": 0.4600824531990041, + "learning_rate": 6.6978051905530855e-06, + "loss": 0.6502, + "step": 1662 + }, + { + "epoch": 0.8869333333333334, + "grad_norm": 0.8783694858925974, + "learning_rate": 6.635765971293484e-06, + "loss": 0.7062, + "step": 1663 + }, + { + "epoch": 0.8874666666666666, + "grad_norm": 0.6199721366104255, + "learning_rate": 6.5740055518083375e-06, + "loss": 0.7433, + "step": 1664 + }, + { + "epoch": 0.888, + "grad_norm": 0.4291551148985123, + "learning_rate": 6.512524116523633e-06, + "loss": 0.6791, + "step": 1665 + }, + { + "epoch": 0.8885333333333333, + "grad_norm": 0.5532761091198096, + "learning_rate": 6.451321849032288e-06, + "loss": 0.7314, + "step": 1666 + }, + { + "epoch": 0.8890666666666667, + "grad_norm": 0.4680846478382772, + "learning_rate": 6.390398932093555e-06, + "loss": 0.6969, + "step": 1667 + }, + { + "epoch": 0.8896, + "grad_norm": 0.6169871804556015, + "learning_rate": 6.329755547632499e-06, + "loss": 0.8083, + "step": 1668 + }, + { + "epoch": 0.8901333333333333, + "grad_norm": 0.4785911706625698, + "learning_rate": 6.269391876739495e-06, + "loss": 0.7172, + "step": 1669 + }, + { + "epoch": 0.8906666666666667, + "grad_norm": 0.44216287726421416, + "learning_rate": 6.209308099669597e-06, + "loss": 0.6858, + "step": 1670 + }, + { + "epoch": 0.8912, + "grad_norm": 0.621762468829309, + "learning_rate": 6.149504395842087e-06, + "loss": 0.8043, + "step": 1671 + }, + { + "epoch": 0.8917333333333334, + "grad_norm": 0.4971053975514178, + "learning_rate": 6.089980943839924e-06, + "loss": 0.7076, + "step": 1672 + }, + { + "epoch": 0.8922666666666667, + "grad_norm": 0.499530457165973, + "learning_rate": 6.030737921409169e-06, + "loss": 0.6667, + "step": 1673 + }, + { + "epoch": 0.8928, + "grad_norm": 0.5551557815670419, + "learning_rate": 5.971775505458444e-06, + "loss": 0.6791, + "step": 1674 + }, + { + "epoch": 0.8933333333333333, + "grad_norm": 0.5502462406809632, + "learning_rate": 5.913093872058528e-06, + "loss": 0.6928, + "step": 1675 + }, + { + "epoch": 0.8938666666666667, + "grad_norm": 0.4185991301854179, + "learning_rate": 5.854693196441641e-06, + "loss": 0.681, + "step": 1676 + }, + { + "epoch": 0.8944, + "grad_norm": 0.6102472819045014, + "learning_rate": 5.7965736530010916e-06, + "loss": 0.7735, + "step": 1677 + }, + { + "epoch": 0.8949333333333334, + "grad_norm": 0.4038728596972177, + "learning_rate": 5.738735415290642e-06, + "loss": 0.6997, + "step": 1678 + }, + { + "epoch": 0.8954666666666666, + "grad_norm": 0.6112272850635387, + "learning_rate": 5.681178656024055e-06, + "loss": 0.7906, + "step": 1679 + }, + { + "epoch": 0.896, + "grad_norm": 0.6823050571999114, + "learning_rate": 5.623903547074549e-06, + "loss": 0.7393, + "step": 1680 + }, + { + "epoch": 0.8965333333333333, + "grad_norm": 0.48044910575690514, + "learning_rate": 5.566910259474289e-06, + "loss": 0.6921, + "step": 1681 + }, + { + "epoch": 0.8970666666666667, + "grad_norm": 0.5099273325096785, + "learning_rate": 5.510198963413881e-06, + "loss": 0.8236, + "step": 1682 + }, + { + "epoch": 0.8976, + "grad_norm": 0.4322407642986071, + "learning_rate": 5.453769828241872e-06, + "loss": 0.6794, + "step": 1683 + }, + { + "epoch": 0.8981333333333333, + "grad_norm": 0.44129741138352807, + "learning_rate": 5.397623022464226e-06, + "loss": 0.7084, + "step": 1684 + }, + { + "epoch": 0.8986666666666666, + "grad_norm": 0.4753167614877353, + "learning_rate": 5.341758713743828e-06, + "loss": 0.6671, + "step": 1685 + }, + { + "epoch": 0.8992, + "grad_norm": 0.5739284289831437, + "learning_rate": 5.286177068899989e-06, + "loss": 0.6935, + "step": 1686 + }, + { + "epoch": 0.8997333333333334, + "grad_norm": 0.4229774415123507, + "learning_rate": 5.230878253907912e-06, + "loss": 0.6782, + "step": 1687 + }, + { + "epoch": 0.9002666666666667, + "grad_norm": 0.5193351176220204, + "learning_rate": 5.175862433898282e-06, + "loss": 0.6292, + "step": 1688 + }, + { + "epoch": 0.9008, + "grad_norm": 0.8117702174014183, + "learning_rate": 5.121129773156663e-06, + "loss": 0.9323, + "step": 1689 + }, + { + "epoch": 0.9013333333333333, + "grad_norm": 0.47062059840601245, + "learning_rate": 5.066680435123106e-06, + "loss": 0.7921, + "step": 1690 + }, + { + "epoch": 0.9018666666666667, + "grad_norm": 0.5121954255095074, + "learning_rate": 5.012514582391592e-06, + "loss": 0.7148, + "step": 1691 + }, + { + "epoch": 0.9024, + "grad_norm": 0.4282829992510614, + "learning_rate": 4.95863237670956e-06, + "loss": 0.6855, + "step": 1692 + }, + { + "epoch": 0.9029333333333334, + "grad_norm": 0.41795780800866367, + "learning_rate": 4.905033978977491e-06, + "loss": 0.6131, + "step": 1693 + }, + { + "epoch": 0.9034666666666666, + "grad_norm": 0.6207093528235793, + "learning_rate": 4.851719549248301e-06, + "loss": 0.8102, + "step": 1694 + }, + { + "epoch": 0.904, + "grad_norm": 0.5090173373571176, + "learning_rate": 4.798689246727006e-06, + "loss": 0.7484, + "step": 1695 + }, + { + "epoch": 0.9045333333333333, + "grad_norm": 0.4538782522384936, + "learning_rate": 4.745943229770122e-06, + "loss": 0.6717, + "step": 1696 + }, + { + "epoch": 0.9050666666666667, + "grad_norm": 0.60000436823127, + "learning_rate": 4.693481655885257e-06, + "loss": 0.8122, + "step": 1697 + }, + { + "epoch": 0.9056, + "grad_norm": 0.4250546805274872, + "learning_rate": 4.641304681730641e-06, + "loss": 0.6265, + "step": 1698 + }, + { + "epoch": 0.9061333333333333, + "grad_norm": 0.5040520980980161, + "learning_rate": 4.58941246311464e-06, + "loss": 0.6753, + "step": 1699 + }, + { + "epoch": 0.9066666666666666, + "grad_norm": 0.49996306159161275, + "learning_rate": 4.537805154995278e-06, + "loss": 0.6779, + "step": 1700 + }, + { + "epoch": 0.9072, + "grad_norm": 0.4712278746479505, + "learning_rate": 4.486482911479839e-06, + "loss": 0.68, + "step": 1701 + }, + { + "epoch": 0.9077333333333333, + "grad_norm": 0.46074573044129796, + "learning_rate": 4.435445885824285e-06, + "loss": 0.7227, + "step": 1702 + }, + { + "epoch": 0.9082666666666667, + "grad_norm": 0.493900195322433, + "learning_rate": 4.384694230432984e-06, + "loss": 0.6689, + "step": 1703 + }, + { + "epoch": 0.9088, + "grad_norm": 0.4344625130464455, + "learning_rate": 4.3342280968580285e-06, + "loss": 0.6555, + "step": 1704 + }, + { + "epoch": 0.9093333333333333, + "grad_norm": 0.5206790882786125, + "learning_rate": 4.2840476357989825e-06, + "loss": 0.7867, + "step": 1705 + }, + { + "epoch": 0.9098666666666667, + "grad_norm": 0.5289091037967014, + "learning_rate": 4.2341529971023255e-06, + "loss": 0.7056, + "step": 1706 + }, + { + "epoch": 0.9104, + "grad_norm": 0.4329363837209095, + "learning_rate": 4.184544329761009e-06, + "loss": 0.6419, + "step": 1707 + }, + { + "epoch": 0.9109333333333334, + "grad_norm": 0.7364925592792445, + "learning_rate": 4.135221781914034e-06, + "loss": 0.7942, + "step": 1708 + }, + { + "epoch": 0.9114666666666666, + "grad_norm": 0.4287919792677331, + "learning_rate": 4.0861855008460405e-06, + "loss": 0.6089, + "step": 1709 + }, + { + "epoch": 0.912, + "grad_norm": 0.5987593143476608, + "learning_rate": 4.037435632986786e-06, + "loss": 0.8248, + "step": 1710 + }, + { + "epoch": 0.9125333333333333, + "grad_norm": 0.47424660935512686, + "learning_rate": 3.988972323910778e-06, + "loss": 0.6991, + "step": 1711 + }, + { + "epoch": 0.9130666666666667, + "grad_norm": 0.4663440261234151, + "learning_rate": 3.9407957183368095e-06, + "loss": 0.6776, + "step": 1712 + }, + { + "epoch": 0.9136, + "grad_norm": 0.4624155823554257, + "learning_rate": 3.892905960127546e-06, + "loss": 0.7055, + "step": 1713 + }, + { + "epoch": 0.9141333333333334, + "grad_norm": 0.48475807611492766, + "learning_rate": 3.845303192289074e-06, + "loss": 0.8187, + "step": 1714 + }, + { + "epoch": 0.9146666666666666, + "grad_norm": 0.4494717541871752, + "learning_rate": 3.797987556970495e-06, + "loss": 0.7191, + "step": 1715 + }, + { + "epoch": 0.9152, + "grad_norm": 0.4835044715176101, + "learning_rate": 3.750959195463466e-06, + "loss": 0.6616, + "step": 1716 + }, + { + "epoch": 0.9157333333333333, + "grad_norm": 0.4838683531014212, + "learning_rate": 3.7042182482018075e-06, + "loss": 0.8107, + "step": 1717 + }, + { + "epoch": 0.9162666666666667, + "grad_norm": 0.4725694544089572, + "learning_rate": 3.6577648547611033e-06, + "loss": 0.746, + "step": 1718 + }, + { + "epoch": 0.9168, + "grad_norm": 0.3979184087752747, + "learning_rate": 3.611599153858214e-06, + "loss": 0.6655, + "step": 1719 + }, + { + "epoch": 0.9173333333333333, + "grad_norm": 0.3868242300055193, + "learning_rate": 3.565721283350931e-06, + "loss": 0.6417, + "step": 1720 + }, + { + "epoch": 0.9178666666666667, + "grad_norm": 0.46778212411889863, + "learning_rate": 3.5201313802375456e-06, + "loss": 0.7105, + "step": 1721 + }, + { + "epoch": 0.9184, + "grad_norm": 0.4598984573387436, + "learning_rate": 3.4748295806564356e-06, + "loss": 0.7256, + "step": 1722 + }, + { + "epoch": 0.9189333333333334, + "grad_norm": 0.5866239669235088, + "learning_rate": 3.4298160198856568e-06, + "loss": 0.6933, + "step": 1723 + }, + { + "epoch": 0.9194666666666667, + "grad_norm": 0.4543808142362355, + "learning_rate": 3.3850908323424967e-06, + "loss": 0.706, + "step": 1724 + }, + { + "epoch": 0.92, + "grad_norm": 0.44260354086610093, + "learning_rate": 3.3406541515832003e-06, + "loss": 0.6791, + "step": 1725 + }, + { + "epoch": 0.9205333333333333, + "grad_norm": 0.4170564242554412, + "learning_rate": 3.296506110302422e-06, + "loss": 0.639, + "step": 1726 + }, + { + "epoch": 0.9210666666666667, + "grad_norm": 0.4597202296846262, + "learning_rate": 3.252646840332918e-06, + "loss": 0.8017, + "step": 1727 + }, + { + "epoch": 0.9216, + "grad_norm": 0.4529920738735563, + "learning_rate": 3.209076472645112e-06, + "loss": 0.6999, + "step": 1728 + }, + { + "epoch": 0.9221333333333334, + "grad_norm": 0.657461638690797, + "learning_rate": 3.1657951373467497e-06, + "loss": 0.6636, + "step": 1729 + }, + { + "epoch": 0.9226666666666666, + "grad_norm": 0.41654680911541275, + "learning_rate": 3.1228029636824475e-06, + "loss": 0.7, + "step": 1730 + }, + { + "epoch": 0.9232, + "grad_norm": 0.45624516917161206, + "learning_rate": 3.0801000800333877e-06, + "loss": 0.6794, + "step": 1731 + }, + { + "epoch": 0.9237333333333333, + "grad_norm": 0.5615950013332655, + "learning_rate": 3.037686613916857e-06, + "loss": 0.742, + "step": 1732 + }, + { + "epoch": 0.9242666666666667, + "grad_norm": 0.575824823799937, + "learning_rate": 2.995562691985898e-06, + "loss": 0.8335, + "step": 1733 + }, + { + "epoch": 0.9248, + "grad_norm": 0.5046522891338086, + "learning_rate": 2.9537284400289355e-06, + "loss": 0.7059, + "step": 1734 + }, + { + "epoch": 0.9253333333333333, + "grad_norm": 0.6991670890182904, + "learning_rate": 2.912183982969385e-06, + "loss": 0.907, + "step": 1735 + }, + { + "epoch": 0.9258666666666666, + "grad_norm": 0.43206580877235, + "learning_rate": 2.8709294448653225e-06, + "loss": 0.6466, + "step": 1736 + }, + { + "epoch": 0.9264, + "grad_norm": 0.44256457653611103, + "learning_rate": 2.8299649489090475e-06, + "loss": 0.6343, + "step": 1737 + }, + { + "epoch": 0.9269333333333334, + "grad_norm": 0.5287835942194266, + "learning_rate": 2.789290617426765e-06, + "loss": 0.7463, + "step": 1738 + }, + { + "epoch": 0.9274666666666667, + "grad_norm": 0.4843669519470727, + "learning_rate": 2.748906571878207e-06, + "loss": 0.6667, + "step": 1739 + }, + { + "epoch": 0.928, + "grad_norm": 0.4103340157544824, + "learning_rate": 2.708812932856253e-06, + "loss": 0.6974, + "step": 1740 + }, + { + "epoch": 0.9285333333333333, + "grad_norm": 0.5184298403298317, + "learning_rate": 2.6690098200866098e-06, + "loss": 0.7487, + "step": 1741 + }, + { + "epoch": 0.9290666666666667, + "grad_norm": 0.6082050912128165, + "learning_rate": 2.6294973524274125e-06, + "loss": 0.7298, + "step": 1742 + }, + { + "epoch": 0.9296, + "grad_norm": 0.47946874007511087, + "learning_rate": 2.590275647868867e-06, + "loss": 0.6172, + "step": 1743 + }, + { + "epoch": 0.9301333333333334, + "grad_norm": 0.6112987394155805, + "learning_rate": 2.551344823532964e-06, + "loss": 0.7834, + "step": 1744 + }, + { + "epoch": 0.9306666666666666, + "grad_norm": 0.47366441421557115, + "learning_rate": 2.5127049956730207e-06, + "loss": 0.6871, + "step": 1745 + }, + { + "epoch": 0.9312, + "grad_norm": 0.5156334997582206, + "learning_rate": 2.4743562796734622e-06, + "loss": 0.6881, + "step": 1746 + }, + { + "epoch": 0.9317333333333333, + "grad_norm": 0.5007794686429051, + "learning_rate": 2.436298790049363e-06, + "loss": 0.6776, + "step": 1747 + }, + { + "epoch": 0.9322666666666667, + "grad_norm": 0.41915625038203275, + "learning_rate": 2.3985326404461604e-06, + "loss": 0.665, + "step": 1748 + }, + { + "epoch": 0.9328, + "grad_norm": 0.6067230351998149, + "learning_rate": 2.3610579436393e-06, + "loss": 0.8348, + "step": 1749 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 0.6362295947710719, + "learning_rate": 2.3238748115339324e-06, + "loss": 0.8087, + "step": 1750 + }, + { + "epoch": 0.9338666666666666, + "grad_norm": 0.47269885739172784, + "learning_rate": 2.286983355164529e-06, + "loss": 0.6545, + "step": 1751 + }, + { + "epoch": 0.9344, + "grad_norm": 0.4550854258944117, + "learning_rate": 2.250383684694579e-06, + "loss": 0.5825, + "step": 1752 + }, + { + "epoch": 0.9349333333333333, + "grad_norm": 0.43939832729846506, + "learning_rate": 2.2140759094162467e-06, + "loss": 0.6521, + "step": 1753 + }, + { + "epoch": 0.9354666666666667, + "grad_norm": 0.4969689643834163, + "learning_rate": 2.178060137750071e-06, + "loss": 0.6967, + "step": 1754 + }, + { + "epoch": 0.936, + "grad_norm": 0.5319636056438496, + "learning_rate": 2.1423364772445887e-06, + "loss": 0.7467, + "step": 1755 + }, + { + "epoch": 0.9365333333333333, + "grad_norm": 0.3596476957935459, + "learning_rate": 2.106905034576112e-06, + "loss": 0.612, + "step": 1756 + }, + { + "epoch": 0.9370666666666667, + "grad_norm": 0.5949674960873456, + "learning_rate": 2.0717659155482738e-06, + "loss": 0.7606, + "step": 1757 + }, + { + "epoch": 0.9376, + "grad_norm": 0.4993480554368963, + "learning_rate": 2.036919225091827e-06, + "loss": 0.7188, + "step": 1758 + }, + { + "epoch": 0.9381333333333334, + "grad_norm": 0.4356732798225739, + "learning_rate": 2.002365067264289e-06, + "loss": 0.6958, + "step": 1759 + }, + { + "epoch": 0.9386666666666666, + "grad_norm": 0.6594548762626951, + "learning_rate": 1.968103545249611e-06, + "loss": 0.7899, + "step": 1760 + }, + { + "epoch": 0.9392, + "grad_norm": 0.42673152905831363, + "learning_rate": 1.9341347613579087e-06, + "loss": 0.6451, + "step": 1761 + }, + { + "epoch": 0.9397333333333333, + "grad_norm": 0.4180416578744134, + "learning_rate": 1.900458817025097e-06, + "loss": 0.6719, + "step": 1762 + }, + { + "epoch": 0.9402666666666667, + "grad_norm": 0.4319759353989045, + "learning_rate": 1.8670758128126909e-06, + "loss": 0.6994, + "step": 1763 + }, + { + "epoch": 0.9408, + "grad_norm": 0.511914801055679, + "learning_rate": 1.8339858484073935e-06, + "loss": 0.7403, + "step": 1764 + }, + { + "epoch": 0.9413333333333334, + "grad_norm": 0.45594120324085713, + "learning_rate": 1.8011890226208527e-06, + "loss": 0.6978, + "step": 1765 + }, + { + "epoch": 0.9418666666666666, + "grad_norm": 0.5587672330196816, + "learning_rate": 1.7686854333893833e-06, + "loss": 0.8151, + "step": 1766 + }, + { + "epoch": 0.9424, + "grad_norm": 0.4672484923497262, + "learning_rate": 1.7364751777736332e-06, + "loss": 0.6433, + "step": 1767 + }, + { + "epoch": 0.9429333333333333, + "grad_norm": 0.4034606284395722, + "learning_rate": 1.7045583519583074e-06, + "loss": 0.599, + "step": 1768 + }, + { + "epoch": 0.9434666666666667, + "grad_norm": 0.4841435384502967, + "learning_rate": 1.6729350512519005e-06, + "loss": 0.7121, + "step": 1769 + }, + { + "epoch": 0.944, + "grad_norm": 0.4086861829838414, + "learning_rate": 1.6416053700863964e-06, + "loss": 0.6631, + "step": 1770 + }, + { + "epoch": 0.9445333333333333, + "grad_norm": 0.5084992483479394, + "learning_rate": 1.6105694020169593e-06, + "loss": 0.7367, + "step": 1771 + }, + { + "epoch": 0.9450666666666667, + "grad_norm": 0.4870556258038371, + "learning_rate": 1.5798272397217095e-06, + "loss": 0.7428, + "step": 1772 + }, + { + "epoch": 0.9456, + "grad_norm": 0.4242105121906902, + "learning_rate": 1.5493789750014031e-06, + "loss": 0.6565, + "step": 1773 + }, + { + "epoch": 0.9461333333333334, + "grad_norm": 0.43070180276909636, + "learning_rate": 1.5192246987791981e-06, + "loss": 0.6833, + "step": 1774 + }, + { + "epoch": 0.9466666666666667, + "grad_norm": 0.4769604410420578, + "learning_rate": 1.489364501100332e-06, + "loss": 0.7616, + "step": 1775 + }, + { + "epoch": 0.9472, + "grad_norm": 0.42477402577716145, + "learning_rate": 1.459798471131868e-06, + "loss": 0.6216, + "step": 1776 + }, + { + "epoch": 0.9477333333333333, + "grad_norm": 0.4605481962020174, + "learning_rate": 1.430526697162482e-06, + "loss": 0.6322, + "step": 1777 + }, + { + "epoch": 0.9482666666666667, + "grad_norm": 0.4425138437152816, + "learning_rate": 1.4015492666021312e-06, + "loss": 0.6946, + "step": 1778 + }, + { + "epoch": 0.9488, + "grad_norm": 0.5096055721561739, + "learning_rate": 1.3728662659818204e-06, + "loss": 0.7539, + "step": 1779 + }, + { + "epoch": 0.9493333333333334, + "grad_norm": 0.5304704102228746, + "learning_rate": 1.344477780953346e-06, + "loss": 0.6634, + "step": 1780 + }, + { + "epoch": 0.9498666666666666, + "grad_norm": 0.5496107480681083, + "learning_rate": 1.3163838962890195e-06, + "loss": 0.6355, + "step": 1781 + }, + { + "epoch": 0.9504, + "grad_norm": 0.5425962969344147, + "learning_rate": 1.2885846958814673e-06, + "loss": 0.7227, + "step": 1782 + }, + { + "epoch": 0.9509333333333333, + "grad_norm": 0.5243601914848159, + "learning_rate": 1.261080262743297e-06, + "loss": 0.6934, + "step": 1783 + }, + { + "epoch": 0.9514666666666667, + "grad_norm": 0.48066922628094294, + "learning_rate": 1.2338706790069431e-06, + "loss": 0.7211, + "step": 1784 + }, + { + "epoch": 0.952, + "grad_norm": 0.5655455844295989, + "learning_rate": 1.2069560259243328e-06, + "loss": 0.7969, + "step": 1785 + }, + { + "epoch": 0.9525333333333333, + "grad_norm": 0.4283099962354034, + "learning_rate": 1.1803363838667092e-06, + "loss": 0.6584, + "step": 1786 + }, + { + "epoch": 0.9530666666666666, + "grad_norm": 0.4043136244440144, + "learning_rate": 1.1540118323243865e-06, + "loss": 0.6298, + "step": 1787 + }, + { + "epoch": 0.9536, + "grad_norm": 0.44838038555331095, + "learning_rate": 1.1279824499064396e-06, + "loss": 0.6608, + "step": 1788 + }, + { + "epoch": 0.9541333333333334, + "grad_norm": 0.7564798392992993, + "learning_rate": 1.1022483143405705e-06, + "loss": 0.6701, + "step": 1789 + }, + { + "epoch": 0.9546666666666667, + "grad_norm": 0.40676313098529043, + "learning_rate": 1.076809502472831e-06, + "loss": 0.6538, + "step": 1790 + }, + { + "epoch": 0.9552, + "grad_norm": 0.4795392201498022, + "learning_rate": 1.0516660902673448e-06, + "loss": 0.7172, + "step": 1791 + }, + { + "epoch": 0.9557333333333333, + "grad_norm": 0.40742487024026375, + "learning_rate": 1.0268181528061749e-06, + "loss": 0.6306, + "step": 1792 + }, + { + "epoch": 0.9562666666666667, + "grad_norm": 0.45877808976714857, + "learning_rate": 1.0022657642890231e-06, + "loss": 0.734, + "step": 1793 + }, + { + "epoch": 0.9568, + "grad_norm": 0.5743267675772135, + "learning_rate": 9.780089980330642e-07, + "loss": 0.8098, + "step": 1794 + }, + { + "epoch": 0.9573333333333334, + "grad_norm": 0.5722405526518496, + "learning_rate": 9.540479264726676e-07, + "loss": 0.8646, + "step": 1795 + }, + { + "epoch": 0.9578666666666666, + "grad_norm": 0.4610864305157812, + "learning_rate": 9.303826211592315e-07, + "loss": 0.6968, + "step": 1796 + }, + { + "epoch": 0.9584, + "grad_norm": 0.3874492764828429, + "learning_rate": 9.070131527609604e-07, + "loss": 0.6075, + "step": 1797 + }, + { + "epoch": 0.9589333333333333, + "grad_norm": 0.622146723828427, + "learning_rate": 8.839395910626213e-07, + "loss": 0.7744, + "step": 1798 + }, + { + "epoch": 0.9594666666666667, + "grad_norm": 0.5160608743139206, + "learning_rate": 8.611620049653879e-07, + "loss": 0.7703, + "step": 1799 + }, + { + "epoch": 0.96, + "grad_norm": 0.6041189830556892, + "learning_rate": 8.386804624865851e-07, + "loss": 0.6936, + "step": 1800 + }, + { + "epoch": 0.9605333333333334, + "grad_norm": 0.5118544620614286, + "learning_rate": 8.16495030759501e-07, + "loss": 0.6941, + "step": 1801 + }, + { + "epoch": 0.9610666666666666, + "grad_norm": 0.5343848880071994, + "learning_rate": 7.946057760332193e-07, + "loss": 0.6392, + "step": 1802 + }, + { + "epoch": 0.9616, + "grad_norm": 0.49079751936485655, + "learning_rate": 7.730127636723539e-07, + "loss": 0.7257, + "step": 1803 + }, + { + "epoch": 0.9621333333333333, + "grad_norm": 0.4637486450647678, + "learning_rate": 7.517160581569372e-07, + "loss": 0.6565, + "step": 1804 + }, + { + "epoch": 0.9626666666666667, + "grad_norm": 0.35597581447012905, + "learning_rate": 7.307157230821426e-07, + "loss": 0.6244, + "step": 1805 + }, + { + "epoch": 0.9632, + "grad_norm": 0.5065092596901237, + "learning_rate": 7.100118211581852e-07, + "loss": 0.683, + "step": 1806 + }, + { + "epoch": 0.9637333333333333, + "grad_norm": 0.4674308576590083, + "learning_rate": 6.896044142100433e-07, + "loss": 0.6693, + "step": 1807 + }, + { + "epoch": 0.9642666666666667, + "grad_norm": 0.5667653011508343, + "learning_rate": 6.694935631773258e-07, + "loss": 0.7204, + "step": 1808 + }, + { + "epoch": 0.9648, + "grad_norm": 0.48981590277493864, + "learning_rate": 6.496793281141056e-07, + "loss": 0.756, + "step": 1809 + }, + { + "epoch": 0.9653333333333334, + "grad_norm": 0.462165485587369, + "learning_rate": 6.301617681886863e-07, + "loss": 0.6905, + "step": 1810 + }, + { + "epoch": 0.9658666666666667, + "grad_norm": 0.4940514688545709, + "learning_rate": 6.109409416834688e-07, + "loss": 0.7038, + "step": 1811 + }, + { + "epoch": 0.9664, + "grad_norm": 0.5048306343312814, + "learning_rate": 5.920169059947411e-07, + "loss": 0.776, + "step": 1812 + }, + { + "epoch": 0.9669333333333333, + "grad_norm": 0.5637897209501818, + "learning_rate": 5.733897176325665e-07, + "loss": 0.8129, + "step": 1813 + }, + { + "epoch": 0.9674666666666667, + "grad_norm": 0.5150800518400652, + "learning_rate": 5.550594322205504e-07, + "loss": 0.7644, + "step": 1814 + }, + { + "epoch": 0.968, + "grad_norm": 0.4793702040563132, + "learning_rate": 5.370261044956971e-07, + "loss": 0.7541, + "step": 1815 + }, + { + "epoch": 0.9685333333333334, + "grad_norm": 0.4473462128597777, + "learning_rate": 5.192897883082747e-07, + "loss": 0.7954, + "step": 1816 + }, + { + "epoch": 0.9690666666666666, + "grad_norm": 0.4984617422783079, + "learning_rate": 5.018505366216175e-07, + "loss": 0.7314, + "step": 1817 + }, + { + "epoch": 0.9696, + "grad_norm": 0.5275269532360491, + "learning_rate": 4.847084015119574e-07, + "loss": 0.7556, + "step": 1818 + }, + { + "epoch": 0.9701333333333333, + "grad_norm": 0.4168873057000073, + "learning_rate": 4.678634341683252e-07, + "loss": 0.6481, + "step": 1819 + }, + { + "epoch": 0.9706666666666667, + "grad_norm": 0.5696623926762724, + "learning_rate": 4.5131568489236166e-07, + "loss": 0.6942, + "step": 1820 + }, + { + "epoch": 0.9712, + "grad_norm": 0.4851847663409578, + "learning_rate": 4.3506520309813947e-07, + "loss": 0.6983, + "step": 1821 + }, + { + "epoch": 0.9717333333333333, + "grad_norm": 0.47119487192422155, + "learning_rate": 4.191120373120749e-07, + "loss": 0.6302, + "step": 1822 + }, + { + "epoch": 0.9722666666666666, + "grad_norm": 0.5790827839290962, + "learning_rate": 4.034562351727389e-07, + "loss": 0.7274, + "step": 1823 + }, + { + "epoch": 0.9728, + "grad_norm": 0.398730184091536, + "learning_rate": 3.8809784343072366e-07, + "loss": 0.5725, + "step": 1824 + }, + { + "epoch": 0.9733333333333334, + "grad_norm": 0.47337038159620887, + "learning_rate": 3.73036907948543e-07, + "loss": 0.6636, + "step": 1825 + }, + { + "epoch": 0.9738666666666667, + "grad_norm": 0.49541788249273355, + "learning_rate": 3.582734737004101e-07, + "loss": 0.7282, + "step": 1826 + }, + { + "epoch": 0.9744, + "grad_norm": 0.46621975936895355, + "learning_rate": 3.4380758477219333e-07, + "loss": 0.6791, + "step": 1827 + }, + { + "epoch": 0.9749333333333333, + "grad_norm": 0.4817558813921772, + "learning_rate": 3.296392843612273e-07, + "loss": 0.7099, + "step": 1828 + }, + { + "epoch": 0.9754666666666667, + "grad_norm": 0.40107644088369515, + "learning_rate": 3.1576861477621287e-07, + "loss": 0.6479, + "step": 1829 + }, + { + "epoch": 0.976, + "grad_norm": 0.4679791613158336, + "learning_rate": 3.0219561743707326e-07, + "loss": 0.7141, + "step": 1830 + }, + { + "epoch": 0.9765333333333334, + "grad_norm": 0.5619103417673081, + "learning_rate": 2.889203328748424e-07, + "loss": 0.7529, + "step": 1831 + }, + { + "epoch": 0.9770666666666666, + "grad_norm": 0.6447959335340104, + "learning_rate": 2.759428007315212e-07, + "loss": 0.8111, + "step": 1832 + }, + { + "epoch": 0.9776, + "grad_norm": 0.43316735390269656, + "learning_rate": 2.6326305976001055e-07, + "loss": 0.6778, + "step": 1833 + }, + { + "epoch": 0.9781333333333333, + "grad_norm": 0.44721925797070633, + "learning_rate": 2.5088114782392257e-07, + "loss": 0.7709, + "step": 1834 + }, + { + "epoch": 0.9786666666666667, + "grad_norm": 0.44745049139874443, + "learning_rate": 2.3879710189753656e-07, + "loss": 0.7194, + "step": 1835 + }, + { + "epoch": 0.9792, + "grad_norm": 0.5097152855164827, + "learning_rate": 2.2701095806565432e-07, + "loss": 0.6915, + "step": 1836 + }, + { + "epoch": 0.9797333333333333, + "grad_norm": 0.48789010654704096, + "learning_rate": 2.15522751523467e-07, + "loss": 0.6994, + "step": 1837 + }, + { + "epoch": 0.9802666666666666, + "grad_norm": 0.5431939629329696, + "learning_rate": 2.0433251657653308e-07, + "loss": 0.6987, + "step": 1838 + }, + { + "epoch": 0.9808, + "grad_norm": 0.4845379187086868, + "learning_rate": 1.9344028664056713e-07, + "loss": 0.6198, + "step": 1839 + }, + { + "epoch": 0.9813333333333333, + "grad_norm": 0.4397358122971218, + "learning_rate": 1.8284609424142895e-07, + "loss": 0.6953, + "step": 1840 + }, + { + "epoch": 0.9818666666666667, + "grad_norm": 0.5341035258238761, + "learning_rate": 1.7254997101500137e-07, + "loss": 0.7675, + "step": 1841 + }, + { + "epoch": 0.9824, + "grad_norm": 0.5041287666272612, + "learning_rate": 1.6255194770704586e-07, + "loss": 0.6628, + "step": 1842 + }, + { + "epoch": 0.9829333333333333, + "grad_norm": 0.5034487183315479, + "learning_rate": 1.5285205417319149e-07, + "loss": 0.7357, + "step": 1843 + }, + { + "epoch": 0.9834666666666667, + "grad_norm": 0.41166319860577366, + "learning_rate": 1.4345031937879062e-07, + "loss": 0.6495, + "step": 1844 + }, + { + "epoch": 0.984, + "grad_norm": 0.482859318866857, + "learning_rate": 1.3434677139885222e-07, + "loss": 0.7002, + "step": 1845 + }, + { + "epoch": 0.9845333333333334, + "grad_norm": 0.511971705284019, + "learning_rate": 1.255414374179531e-07, + "loss": 0.7694, + "step": 1846 + }, + { + "epoch": 0.9850666666666666, + "grad_norm": 0.4246436530163462, + "learning_rate": 1.170343437301491e-07, + "loss": 0.6638, + "step": 1847 + }, + { + "epoch": 0.9856, + "grad_norm": 0.39750627270940414, + "learning_rate": 1.0882551573891953e-07, + "loss": 0.6434, + "step": 1848 + }, + { + "epoch": 0.9861333333333333, + "grad_norm": 0.4285443608648601, + "learning_rate": 1.0091497795706728e-07, + "loss": 0.7199, + "step": 1849 + }, + { + "epoch": 0.9866666666666667, + "grad_norm": 0.47386557717487543, + "learning_rate": 9.330275400666332e-08, + "loss": 0.7379, + "step": 1850 + }, + { + "epoch": 0.9872, + "grad_norm": 0.4751932346525158, + "learning_rate": 8.598886661895788e-08, + "loss": 0.6993, + "step": 1851 + }, + { + "epoch": 0.9877333333333334, + "grad_norm": 0.5829683665406566, + "learning_rate": 7.8973337634336e-08, + "loss": 0.7968, + "step": 1852 + }, + { + "epoch": 0.9882666666666666, + "grad_norm": 0.488540059171816, + "learning_rate": 7.225618800222877e-08, + "loss": 0.7717, + "step": 1853 + }, + { + "epoch": 0.9888, + "grad_norm": 0.4039688287508916, + "learning_rate": 6.583743778106887e-08, + "loss": 0.6184, + "step": 1854 + }, + { + "epoch": 0.9893333333333333, + "grad_norm": 0.4376999076705189, + "learning_rate": 5.971710613821291e-08, + "loss": 0.7082, + "step": 1855 + }, + { + "epoch": 0.9898666666666667, + "grad_norm": 0.4754260162700392, + "learning_rate": 5.389521134989695e-08, + "loss": 0.6569, + "step": 1856 + }, + { + "epoch": 0.9904, + "grad_norm": 0.49060827160061726, + "learning_rate": 4.837177080119215e-08, + "loss": 0.6498, + "step": 1857 + }, + { + "epoch": 0.9909333333333333, + "grad_norm": 0.5345195921909358, + "learning_rate": 4.314680098592705e-08, + "loss": 0.7227, + "step": 1858 + }, + { + "epoch": 0.9914666666666667, + "grad_norm": 0.5193383074244806, + "learning_rate": 3.8220317506654226e-08, + "loss": 0.7667, + "step": 1859 + }, + { + "epoch": 0.992, + "grad_norm": 0.565638644612035, + "learning_rate": 3.359233507459481e-08, + "loss": 0.7899, + "step": 1860 + }, + { + "epoch": 0.9925333333333334, + "grad_norm": 0.5015026943257141, + "learning_rate": 2.9262867509605163e-08, + "loss": 0.7851, + "step": 1861 + }, + { + "epoch": 0.9930666666666667, + "grad_norm": 0.6270339345436331, + "learning_rate": 2.5231927740154704e-08, + "loss": 0.8221, + "step": 1862 + }, + { + "epoch": 0.9936, + "grad_norm": 0.4788575345151283, + "learning_rate": 2.1499527803214846e-08, + "loss": 0.7483, + "step": 1863 + }, + { + "epoch": 0.9941333333333333, + "grad_norm": 0.4989941715945976, + "learning_rate": 1.8065678844314538e-08, + "loss": 0.7683, + "step": 1864 + }, + { + "epoch": 0.9946666666666667, + "grad_norm": 0.5400799400718792, + "learning_rate": 1.4930391117451426e-08, + "loss": 0.8127, + "step": 1865 + }, + { + "epoch": 0.9952, + "grad_norm": 0.47439414857581497, + "learning_rate": 1.209367398504746e-08, + "loss": 0.7164, + "step": 1866 + }, + { + "epoch": 0.9957333333333334, + "grad_norm": 0.6841191889191034, + "learning_rate": 9.555535917993297e-09, + "loss": 0.8663, + "step": 1867 + }, + { + "epoch": 0.9962666666666666, + "grad_norm": 0.6359287469277909, + "learning_rate": 7.315984495548378e-09, + "loss": 0.94, + "step": 1868 + }, + { + "epoch": 0.9968, + "grad_norm": 0.4492621197075347, + "learning_rate": 5.375026405352035e-09, + "loss": 0.6509, + "step": 1869 + }, + { + "epoch": 0.9973333333333333, + "grad_norm": 0.5456428181444235, + "learning_rate": 3.732667443390181e-09, + "loss": 0.7309, + "step": 1870 + }, + { + "epoch": 0.9978666666666667, + "grad_norm": 0.4118915710082582, + "learning_rate": 2.388912514017516e-09, + "loss": 0.7186, + "step": 1871 + }, + { + "epoch": 0.9984, + "grad_norm": 0.46232257898409884, + "learning_rate": 1.3437656298687097e-09, + "loss": 0.6662, + "step": 1872 + }, + { + "epoch": 0.9989333333333333, + "grad_norm": 0.44837771621131517, + "learning_rate": 5.972299119250125e-10, + "loss": 0.5715, + "step": 1873 + }, + { + "epoch": 0.9994666666666666, + "grad_norm": 0.5570450286226029, + "learning_rate": 1.4930758944764479e-10, + "loss": 0.8057, + "step": 1874 + }, + { + "epoch": 1.0, + "grad_norm": 0.3937926368941584, + "learning_rate": 0.0, + "loss": 0.6504, + "step": 1875 + }, + { + "epoch": 1.0, + "step": 1875, + "total_flos": 1530216586477568.0, + "train_loss": 0.7911407696406046, + "train_runtime": 27947.7114, + "train_samples_per_second": 1.073, + "train_steps_per_second": 0.067 + } + ], + "logging_steps": 1.0, + "max_steps": 1875, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1530216586477568.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_1_epochs_1_GA_2_lora/README.md b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_1_epochs_1_GA_2_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_1_epochs_1_GA_2_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_1_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_1_epochs_1_GA_2_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..849701cf3500261616829a852c4694138814644e --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_1_epochs_1_GA_2_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "k_proj", + "down_proj", + "q_proj", + "up_proj", + "o_proj", + "gate_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..53a241e7eed6d1d4e53657fea6fce2bac122187a --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20f5fbfaf78f2660b5a7e389bd0cabaed01836ceef82cec08ccf83f292c1b145 +size 671150064 diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_1_epochs_1_GA_2_lora/config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_1_epochs_1_GA_2_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_1_epochs_1_GA_2_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..ca4541f44ba0763c9545308c17ed43f708e082c4 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d27eec83422eb89843f47c1c7465c54c9b0a49881f7bde3aa8eedd2417713e2 +size 918507402 diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_1_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_1_epochs_1_GA_2_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..82e389d2975fa50bfab5506c7cc45bf0f9e3d15b --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_1_epochs_1_GA_2_lora/trainer_state.json @@ -0,0 +1,2226 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9984, + "eval_steps": 500, + "global_step": 312, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0032, + "grad_norm": 0.9338962732953701, + "learning_rate": 2e-05, + "loss": 1.3626, + "step": 1 + }, + { + "epoch": 0.0064, + "grad_norm": 0.7826216851674797, + "learning_rate": 4e-05, + "loss": 1.1873, + "step": 2 + }, + { + "epoch": 0.0096, + "grad_norm": 0.8026087339200398, + "learning_rate": 6e-05, + "loss": 1.2351, + "step": 3 + }, + { + "epoch": 0.0128, + "grad_norm": 0.8953557913806978, + "learning_rate": 8e-05, + "loss": 1.4091, + "step": 4 + }, + { + "epoch": 0.016, + "grad_norm": 0.6344660427218007, + "learning_rate": 0.0001, + "loss": 1.0512, + "step": 5 + }, + { + "epoch": 0.0192, + "grad_norm": 0.7817986163856506, + "learning_rate": 0.00012, + "loss": 1.0443, + "step": 6 + }, + { + "epoch": 0.0224, + "grad_norm": 0.9563762281220346, + "learning_rate": 0.00014, + "loss": 1.1946, + "step": 7 + }, + { + "epoch": 0.0256, + "grad_norm": 0.7861431922120908, + "learning_rate": 0.00016, + "loss": 1.1313, + "step": 8 + }, + { + "epoch": 0.0288, + "grad_norm": 0.8557088838865845, + "learning_rate": 0.00018, + "loss": 1.0663, + "step": 9 + }, + { + "epoch": 0.032, + "grad_norm": 0.554668877411433, + "learning_rate": 0.0002, + "loss": 0.9227, + "step": 10 + }, + { + "epoch": 0.0352, + "grad_norm": 0.5924202455623732, + "learning_rate": 0.00019999458931878073, + "loss": 0.8726, + "step": 11 + }, + { + "epoch": 0.0384, + "grad_norm": 0.5998948332967988, + "learning_rate": 0.0001999783578606323, + "loss": 0.9337, + "step": 12 + }, + { + "epoch": 0.0416, + "grad_norm": 0.6523668340417098, + "learning_rate": 0.00019995130738201966, + "loss": 0.9656, + "step": 13 + }, + { + "epoch": 0.0448, + "grad_norm": 0.5911926471630946, + "learning_rate": 0.0001999134408101731, + "loss": 0.894, + "step": 14 + }, + { + "epoch": 0.048, + "grad_norm": 0.6985684411115631, + "learning_rate": 0.00019986476224277165, + "loss": 1.0268, + "step": 15 + }, + { + "epoch": 0.0512, + "grad_norm": 0.5709438499133971, + "learning_rate": 0.00019980527694749952, + "loss": 0.9594, + "step": 16 + }, + { + "epoch": 0.0544, + "grad_norm": 0.7521237360681552, + "learning_rate": 0.00019973499136147606, + "loss": 0.9985, + "step": 17 + }, + { + "epoch": 0.0576, + "grad_norm": 0.5597775297927315, + "learning_rate": 0.0001996539130905593, + "loss": 0.8846, + "step": 18 + }, + { + "epoch": 0.0608, + "grad_norm": 0.6123226618928314, + "learning_rate": 0.0001995620509085228, + "loss": 0.9999, + "step": 19 + }, + { + "epoch": 0.064, + "grad_norm": 0.6191572504342515, + "learning_rate": 0.00019945941475610623, + "loss": 0.9126, + "step": 20 + }, + { + "epoch": 0.0672, + "grad_norm": 0.8294983017105749, + "learning_rate": 0.0001993460157399396, + "loss": 0.9755, + "step": 21 + }, + { + "epoch": 0.0704, + "grad_norm": 0.6354126575114571, + "learning_rate": 0.0001992218661313415, + "loss": 1.0064, + "step": 22 + }, + { + "epoch": 0.0736, + "grad_norm": 0.5424883492269076, + "learning_rate": 0.00019908697936499103, + "loss": 0.9076, + "step": 23 + }, + { + "epoch": 0.0768, + "grad_norm": 0.5081500779540952, + "learning_rate": 0.00019894137003747403, + "loss": 0.8983, + "step": 24 + }, + { + "epoch": 0.08, + "grad_norm": 0.692718915554635, + "learning_rate": 0.00019878505390570362, + "loss": 0.9351, + "step": 25 + }, + { + "epoch": 0.0832, + "grad_norm": 0.52678134497159, + "learning_rate": 0.00019861804788521493, + "loss": 0.9046, + "step": 26 + }, + { + "epoch": 0.0864, + "grad_norm": 0.547165568628004, + "learning_rate": 0.00019844037004833473, + "loss": 0.795, + "step": 27 + }, + { + "epoch": 0.0896, + "grad_norm": 0.5576260347576817, + "learning_rate": 0.00019825203962222572, + "loss": 0.9572, + "step": 28 + }, + { + "epoch": 0.0928, + "grad_norm": 0.6056494284984847, + "learning_rate": 0.0001980530769868059, + "loss": 1.0077, + "step": 29 + }, + { + "epoch": 0.096, + "grad_norm": 0.5656232329769246, + "learning_rate": 0.00019784350367254322, + "loss": 0.9427, + "step": 30 + }, + { + "epoch": 0.0992, + "grad_norm": 0.5604826934003616, + "learning_rate": 0.0001976233423581255, + "loss": 0.9405, + "step": 31 + }, + { + "epoch": 0.1024, + "grad_norm": 0.5051155342791347, + "learning_rate": 0.0001973926168680066, + "loss": 0.911, + "step": 32 + }, + { + "epoch": 0.1056, + "grad_norm": 0.5019309977427193, + "learning_rate": 0.00019715135216982798, + "loss": 0.9022, + "step": 33 + }, + { + "epoch": 0.1088, + "grad_norm": 0.5184966962250892, + "learning_rate": 0.0001968995743717171, + "loss": 0.9031, + "step": 34 + }, + { + "epoch": 0.112, + "grad_norm": 0.4651416151244891, + "learning_rate": 0.00019663731071946206, + "loss": 0.8528, + "step": 35 + }, + { + "epoch": 0.1152, + "grad_norm": 0.5841766928098853, + "learning_rate": 0.00019636458959356316, + "loss": 0.9857, + "step": 36 + }, + { + "epoch": 0.1184, + "grad_norm": 0.6030595623246762, + "learning_rate": 0.0001960814405061619, + "loss": 0.9255, + "step": 37 + }, + { + "epoch": 0.1216, + "grad_norm": 0.5559191895895258, + "learning_rate": 0.00019578789409784727, + "loss": 0.8589, + "step": 38 + }, + { + "epoch": 0.1248, + "grad_norm": 0.4967165228070396, + "learning_rate": 0.00019548398213434007, + "loss": 0.8613, + "step": 39 + }, + { + "epoch": 0.128, + "grad_norm": 0.5653765926413602, + "learning_rate": 0.00019516973750305532, + "loss": 0.8859, + "step": 40 + }, + { + "epoch": 0.1312, + "grad_norm": 0.5422013600282081, + "learning_rate": 0.00019484519420954354, + "loss": 0.8828, + "step": 41 + }, + { + "epoch": 0.1344, + "grad_norm": 0.5826459751875703, + "learning_rate": 0.00019451038737381077, + "loss": 0.9142, + "step": 42 + }, + { + "epoch": 0.1376, + "grad_norm": 0.5535975218457786, + "learning_rate": 0.00019416535322651818, + "loss": 0.7845, + "step": 43 + }, + { + "epoch": 0.1408, + "grad_norm": 0.5295297855307182, + "learning_rate": 0.00019381012910506146, + "loss": 0.8461, + "step": 44 + }, + { + "epoch": 0.144, + "grad_norm": 0.5026378000178681, + "learning_rate": 0.00019344475344953012, + "loss": 0.8962, + "step": 45 + }, + { + "epoch": 0.1472, + "grad_norm": 0.5685172681516242, + "learning_rate": 0.00019306926579854821, + "loss": 0.9051, + "step": 46 + }, + { + "epoch": 0.1504, + "grad_norm": 0.5175059491207624, + "learning_rate": 0.00019268370678499533, + "loss": 0.9218, + "step": 47 + }, + { + "epoch": 0.1536, + "grad_norm": 0.599725245514314, + "learning_rate": 0.0001922881181316097, + "loss": 1.0399, + "step": 48 + }, + { + "epoch": 0.1568, + "grad_norm": 0.6362525861936286, + "learning_rate": 0.00019188254264647337, + "loss": 0.8076, + "step": 49 + }, + { + "epoch": 0.16, + "grad_norm": 0.9030266676654153, + "learning_rate": 0.0001914670242183795, + "loss": 0.9246, + "step": 50 + }, + { + "epoch": 0.1632, + "grad_norm": 0.5056758352509887, + "learning_rate": 0.0001910416078120832, + "loss": 0.8316, + "step": 51 + }, + { + "epoch": 0.1664, + "grad_norm": 0.4744199957100715, + "learning_rate": 0.0001906063394634356, + "loss": 0.8464, + "step": 52 + }, + { + "epoch": 0.1696, + "grad_norm": 0.5331700094171545, + "learning_rate": 0.00019016126627440237, + "loss": 0.9717, + "step": 53 + }, + { + "epoch": 0.1728, + "grad_norm": 0.5265592779579443, + "learning_rate": 0.00018970643640796642, + "loss": 0.9252, + "step": 54 + }, + { + "epoch": 0.176, + "grad_norm": 0.47205781532010516, + "learning_rate": 0.000189241899082916, + "loss": 0.873, + "step": 55 + }, + { + "epoch": 0.1792, + "grad_norm": 0.5407857333191397, + "learning_rate": 0.00018876770456851877, + "loss": 0.9118, + "step": 56 + }, + { + "epoch": 0.1824, + "grad_norm": 0.5818653548795125, + "learning_rate": 0.0001882839041790818, + "loss": 0.9831, + "step": 57 + }, + { + "epoch": 0.1856, + "grad_norm": 0.5015172076464853, + "learning_rate": 0.00018779055026839868, + "loss": 0.8565, + "step": 58 + }, + { + "epoch": 0.1888, + "grad_norm": 0.5694650083854068, + "learning_rate": 0.00018728769622408423, + "loss": 0.8903, + "step": 59 + }, + { + "epoch": 0.192, + "grad_norm": 0.5972673674616251, + "learning_rate": 0.00018677539646179707, + "loss": 0.8937, + "step": 60 + }, + { + "epoch": 0.1952, + "grad_norm": 0.47173773575170286, + "learning_rate": 0.00018625370641935129, + "loss": 0.8544, + "step": 61 + }, + { + "epoch": 0.1984, + "grad_norm": 0.577412887943692, + "learning_rate": 0.00018572268255071718, + "loss": 0.966, + "step": 62 + }, + { + "epoch": 0.2016, + "grad_norm": 0.6044123923844257, + "learning_rate": 0.00018518238231991218, + "loss": 0.8515, + "step": 63 + }, + { + "epoch": 0.2048, + "grad_norm": 0.5600973637310778, + "learning_rate": 0.00018463286419478255, + "loss": 0.8436, + "step": 64 + }, + { + "epoch": 0.208, + "grad_norm": 0.4361774278276434, + "learning_rate": 0.00018407418764067627, + "loss": 0.7677, + "step": 65 + }, + { + "epoch": 0.2112, + "grad_norm": 0.6968930247271232, + "learning_rate": 0.00018350641311400812, + "loss": 0.9854, + "step": 66 + }, + { + "epoch": 0.2144, + "grad_norm": 0.6660101095375524, + "learning_rate": 0.0001829296020557174, + "loss": 1.0091, + "step": 67 + }, + { + "epoch": 0.2176, + "grad_norm": 0.557034072428792, + "learning_rate": 0.00018234381688461942, + "loss": 0.929, + "step": 68 + }, + { + "epoch": 0.2208, + "grad_norm": 0.5203200972002412, + "learning_rate": 0.0001817491209906506, + "loss": 0.8478, + "step": 69 + }, + { + "epoch": 0.224, + "grad_norm": 0.5104131949732492, + "learning_rate": 0.00018114557872800905, + "loss": 0.865, + "step": 70 + }, + { + "epoch": 0.2272, + "grad_norm": 0.5166062785055189, + "learning_rate": 0.00018053325540819045, + "loss": 0.909, + "step": 71 + }, + { + "epoch": 0.2304, + "grad_norm": 0.49552636377658477, + "learning_rate": 0.0001799122172929206, + "loss": 0.7994, + "step": 72 + }, + { + "epoch": 0.2336, + "grad_norm": 0.539523454982455, + "learning_rate": 0.00017928253158698473, + "loss": 0.9213, + "step": 73 + }, + { + "epoch": 0.2368, + "grad_norm": 0.5627036368958536, + "learning_rate": 0.0001786442664309554, + "loss": 0.9762, + "step": 74 + }, + { + "epoch": 0.24, + "grad_norm": 0.426658380905887, + "learning_rate": 0.0001779974908938184, + "loss": 0.778, + "step": 75 + }, + { + "epoch": 0.2432, + "grad_norm": 0.5399147516953222, + "learning_rate": 0.0001773422749654988, + "loss": 0.9, + "step": 76 + }, + { + "epoch": 0.2464, + "grad_norm": 0.5937823156558912, + "learning_rate": 0.00017667868954928694, + "loss": 0.9843, + "step": 77 + }, + { + "epoch": 0.2496, + "grad_norm": 0.5798363712102641, + "learning_rate": 0.00017600680645416583, + "loss": 0.9274, + "step": 78 + }, + { + "epoch": 0.2528, + "grad_norm": 0.6073609011014686, + "learning_rate": 0.00017532669838704035, + "loss": 0.9492, + "step": 79 + }, + { + "epoch": 0.256, + "grad_norm": 0.5511641207054866, + "learning_rate": 0.00017463843894486937, + "loss": 0.8819, + "step": 80 + }, + { + "epoch": 0.2592, + "grad_norm": 0.48773260003605995, + "learning_rate": 0.0001739421026067017, + "loss": 0.8691, + "step": 81 + }, + { + "epoch": 0.2624, + "grad_norm": 0.5859319345899571, + "learning_rate": 0.00017323776472561627, + "loss": 0.9468, + "step": 82 + }, + { + "epoch": 0.2656, + "grad_norm": 0.5041094523292482, + "learning_rate": 0.00017252550152056795, + "loss": 0.9376, + "step": 83 + }, + { + "epoch": 0.2688, + "grad_norm": 0.44240497780643767, + "learning_rate": 0.0001718053900681397, + "loss": 0.8236, + "step": 84 + }, + { + "epoch": 0.272, + "grad_norm": 0.5075138902278976, + "learning_rate": 0.00017107750829420176, + "loss": 0.8903, + "step": 85 + }, + { + "epoch": 0.2752, + "grad_norm": 0.4319744942169732, + "learning_rate": 0.00017034193496547902, + "loss": 0.7573, + "step": 86 + }, + { + "epoch": 0.2784, + "grad_norm": 0.47439141142404107, + "learning_rate": 0.00016959874968102735, + "loss": 0.8451, + "step": 87 + }, + { + "epoch": 0.2816, + "grad_norm": 0.5636368435856511, + "learning_rate": 0.00016884803286362, + "loss": 0.9061, + "step": 88 + }, + { + "epoch": 0.2848, + "grad_norm": 0.507050223866283, + "learning_rate": 0.00016808986575104465, + "loss": 0.8482, + "step": 89 + }, + { + "epoch": 0.288, + "grad_norm": 0.466920542665784, + "learning_rate": 0.00016732433038731242, + "loss": 0.8553, + "step": 90 + }, + { + "epoch": 0.2912, + "grad_norm": 0.4871840557622525, + "learning_rate": 0.0001665515096137797, + "loss": 0.7571, + "step": 91 + }, + { + "epoch": 0.2944, + "grad_norm": 0.4864553660708385, + "learning_rate": 0.00016577148706018328, + "loss": 0.7767, + "step": 92 + }, + { + "epoch": 0.2976, + "grad_norm": 0.5351561565292604, + "learning_rate": 0.00016498434713559088, + "loss": 0.9148, + "step": 93 + }, + { + "epoch": 0.3008, + "grad_norm": 0.5530476011968923, + "learning_rate": 0.00016419017501926656, + "loss": 0.9433, + "step": 94 + }, + { + "epoch": 0.304, + "grad_norm": 0.5867933810167691, + "learning_rate": 0.0001633890566514535, + "loss": 0.9014, + "step": 95 + }, + { + "epoch": 0.3072, + "grad_norm": 0.43778812317460986, + "learning_rate": 0.00016258107872407375, + "loss": 0.7746, + "step": 96 + }, + { + "epoch": 0.3104, + "grad_norm": 0.5528980307406878, + "learning_rate": 0.0001617663286713474, + "loss": 0.7819, + "step": 97 + }, + { + "epoch": 0.3136, + "grad_norm": 0.5440716658453119, + "learning_rate": 0.00016094489466033043, + "loss": 0.8859, + "step": 98 + }, + { + "epoch": 0.3168, + "grad_norm": 0.5995595635001515, + "learning_rate": 0.00016011686558137448, + "loss": 0.9349, + "step": 99 + }, + { + "epoch": 0.32, + "grad_norm": 0.4999901123514004, + "learning_rate": 0.0001592823310385073, + "loss": 0.8627, + "step": 100 + }, + { + "epoch": 0.3232, + "grad_norm": 0.5823620592442721, + "learning_rate": 0.0001584413813397364, + "loss": 0.8166, + "step": 101 + }, + { + "epoch": 0.3264, + "grad_norm": 0.5364205602794443, + "learning_rate": 0.00015759410748727662, + "loss": 0.9191, + "step": 102 + }, + { + "epoch": 0.3296, + "grad_norm": 0.5082812495075835, + "learning_rate": 0.00015674060116770236, + "loss": 0.9031, + "step": 103 + }, + { + "epoch": 0.3328, + "grad_norm": 0.5758084411393675, + "learning_rate": 0.00015588095474202595, + "loss": 0.8901, + "step": 104 + }, + { + "epoch": 0.336, + "grad_norm": 0.9777702250295537, + "learning_rate": 0.00015501526123570277, + "loss": 0.9929, + "step": 105 + }, + { + "epoch": 0.3392, + "grad_norm": 0.6783283770702636, + "learning_rate": 0.00015414361432856475, + "loss": 1.0695, + "step": 106 + }, + { + "epoch": 0.3424, + "grad_norm": 0.5271622038577951, + "learning_rate": 0.0001532661083446829, + "loss": 0.8938, + "step": 107 + }, + { + "epoch": 0.3456, + "grad_norm": 0.47357167917490556, + "learning_rate": 0.00015238283824216015, + "loss": 0.8447, + "step": 108 + }, + { + "epoch": 0.3488, + "grad_norm": 0.5171733446789377, + "learning_rate": 0.00015149389960285558, + "loss": 0.8556, + "step": 109 + }, + { + "epoch": 0.352, + "grad_norm": 0.5193440460225393, + "learning_rate": 0.00015059938862204127, + "loss": 0.9351, + "step": 110 + }, + { + "epoch": 0.3552, + "grad_norm": 0.49741905461575303, + "learning_rate": 0.00014969940209799248, + "loss": 0.8259, + "step": 111 + }, + { + "epoch": 0.3584, + "grad_norm": 0.4607711393942658, + "learning_rate": 0.00014879403742151283, + "loss": 0.8085, + "step": 112 + }, + { + "epoch": 0.3616, + "grad_norm": 0.5051630516846702, + "learning_rate": 0.00014788339256539544, + "loss": 0.9029, + "step": 113 + }, + { + "epoch": 0.3648, + "grad_norm": 0.47151404685858905, + "learning_rate": 0.0001469675660738206, + "loss": 0.8433, + "step": 114 + }, + { + "epoch": 0.368, + "grad_norm": 0.4444990889893979, + "learning_rate": 0.00014604665705169237, + "loss": 0.8156, + "step": 115 + }, + { + "epoch": 0.3712, + "grad_norm": 0.4812296562563869, + "learning_rate": 0.00014512076515391375, + "loss": 0.8422, + "step": 116 + }, + { + "epoch": 0.3744, + "grad_norm": 0.6292710246476524, + "learning_rate": 0.00014418999057460276, + "loss": 1.0499, + "step": 117 + }, + { + "epoch": 0.3776, + "grad_norm": 0.5310747533883055, + "learning_rate": 0.0001432544340362501, + "loss": 0.8492, + "step": 118 + }, + { + "epoch": 0.3808, + "grad_norm": 0.6780462054568485, + "learning_rate": 0.00014231419677881966, + "loss": 1.0863, + "step": 119 + }, + { + "epoch": 0.384, + "grad_norm": 0.4102062485457246, + "learning_rate": 0.00014136938054879283, + "loss": 0.7337, + "step": 120 + }, + { + "epoch": 0.3872, + "grad_norm": 0.5575101890539665, + "learning_rate": 0.00014042008758815818, + "loss": 0.9218, + "step": 121 + }, + { + "epoch": 0.3904, + "grad_norm": 0.5170278022858484, + "learning_rate": 0.00013946642062334766, + "loss": 0.8053, + "step": 122 + }, + { + "epoch": 0.3936, + "grad_norm": 0.4748794796067836, + "learning_rate": 0.00013850848285411994, + "loss": 0.7251, + "step": 123 + }, + { + "epoch": 0.3968, + "grad_norm": 0.6963277572293579, + "learning_rate": 0.000137546377942393, + "loss": 0.9772, + "step": 124 + }, + { + "epoch": 0.4, + "grad_norm": 0.538275454501302, + "learning_rate": 0.00013658021000102636, + "loss": 0.921, + "step": 125 + }, + { + "epoch": 0.4032, + "grad_norm": 0.6309655795674255, + "learning_rate": 0.00013561008358255468, + "loss": 0.9811, + "step": 126 + }, + { + "epoch": 0.4064, + "grad_norm": 0.5345156432452945, + "learning_rate": 0.00013463610366787392, + "loss": 0.8148, + "step": 127 + }, + { + "epoch": 0.4096, + "grad_norm": 0.5686942164334279, + "learning_rate": 0.00013365837565488064, + "loss": 0.9561, + "step": 128 + }, + { + "epoch": 0.4128, + "grad_norm": 0.578774730515947, + "learning_rate": 0.0001326770053470668, + "loss": 0.9446, + "step": 129 + }, + { + "epoch": 0.416, + "grad_norm": 0.519069595704834, + "learning_rate": 0.0001316920989420703, + "loss": 0.9614, + "step": 130 + }, + { + "epoch": 0.4192, + "grad_norm": 0.5799529222159874, + "learning_rate": 0.00013070376302018287, + "loss": 0.9756, + "step": 131 + }, + { + "epoch": 0.4224, + "grad_norm": 0.5654645995189487, + "learning_rate": 0.00012971210453281674, + "loss": 0.8901, + "step": 132 + }, + { + "epoch": 0.4256, + "grad_norm": 0.44235999598674136, + "learning_rate": 0.000128717230790931, + "loss": 0.6974, + "step": 133 + }, + { + "epoch": 0.4288, + "grad_norm": 0.5103462719597298, + "learning_rate": 0.00012771924945341906, + "loss": 0.8175, + "step": 134 + }, + { + "epoch": 0.432, + "grad_norm": 0.44160518330831844, + "learning_rate": 0.00012671826851545851, + "loss": 0.7435, + "step": 135 + }, + { + "epoch": 0.4352, + "grad_norm": 0.5304405430077167, + "learning_rate": 0.0001257143962968246, + "loss": 0.8126, + "step": 136 + }, + { + "epoch": 0.4384, + "grad_norm": 0.5066854976154387, + "learning_rate": 0.00012470774143016853, + "loss": 0.878, + "step": 137 + }, + { + "epoch": 0.4416, + "grad_norm": 0.5771635476196476, + "learning_rate": 0.00012369841284926188, + "loss": 0.8844, + "step": 138 + }, + { + "epoch": 0.4448, + "grad_norm": 0.4815512394978152, + "learning_rate": 0.00012268651977720866, + "loss": 0.8733, + "step": 139 + }, + { + "epoch": 0.448, + "grad_norm": 0.4746860027433025, + "learning_rate": 0.00012167217171462566, + "loss": 0.8286, + "step": 140 + }, + { + "epoch": 0.4512, + "grad_norm": 0.4757866033805745, + "learning_rate": 0.0001206554784277931, + "loss": 0.8025, + "step": 141 + }, + { + "epoch": 0.4544, + "grad_norm": 0.5967798401225021, + "learning_rate": 0.00011963654993677645, + "loss": 0.9161, + "step": 142 + }, + { + "epoch": 0.4576, + "grad_norm": 0.4709504374032868, + "learning_rate": 0.00011861549650352069, + "loss": 0.7988, + "step": 143 + }, + { + "epoch": 0.4608, + "grad_norm": 0.4952465381629514, + "learning_rate": 0.00011759242861991855, + "loss": 0.8296, + "step": 144 + }, + { + "epoch": 0.464, + "grad_norm": 0.48452681690806687, + "learning_rate": 0.00011656745699585371, + "loss": 0.8093, + "step": 145 + }, + { + "epoch": 0.4672, + "grad_norm": 0.40245520027582504, + "learning_rate": 0.00011554069254722051, + "loss": 0.747, + "step": 146 + }, + { + "epoch": 0.4704, + "grad_norm": 0.5140200022581295, + "learning_rate": 0.00011451224638392129, + "loss": 0.9039, + "step": 147 + }, + { + "epoch": 0.4736, + "grad_norm": 0.4499258564105942, + "learning_rate": 0.00011348222979784289, + "loss": 0.8334, + "step": 148 + }, + { + "epoch": 0.4768, + "grad_norm": 0.511561509908632, + "learning_rate": 0.00011245075425081328, + "loss": 0.9037, + "step": 149 + }, + { + "epoch": 0.48, + "grad_norm": 0.485439072348685, + "learning_rate": 0.00011141793136253986, + "loss": 0.7892, + "step": 150 + }, + { + "epoch": 0.4832, + "grad_norm": 0.4506842974593977, + "learning_rate": 0.0001103838728985307, + "loss": 0.8416, + "step": 151 + }, + { + "epoch": 0.4864, + "grad_norm": 0.4448082741644203, + "learning_rate": 0.000109348690758, + "loss": 0.7594, + "step": 152 + }, + { + "epoch": 0.4896, + "grad_norm": 0.5620993724610065, + "learning_rate": 0.00010831249696175918, + "loss": 0.8232, + "step": 153 + }, + { + "epoch": 0.4928, + "grad_norm": 0.4547643790391853, + "learning_rate": 0.0001072754036400944, + "loss": 0.8064, + "step": 154 + }, + { + "epoch": 0.496, + "grad_norm": 0.4272377788377005, + "learning_rate": 0.00010623752302063283, + "loss": 0.7958, + "step": 155 + }, + { + "epoch": 0.4992, + "grad_norm": 0.39687457921679736, + "learning_rate": 0.00010519896741619803, + "loss": 0.7649, + "step": 156 + }, + { + "epoch": 0.5024, + "grad_norm": 0.572994650249389, + "learning_rate": 0.00010415984921265609, + "loss": 0.9101, + "step": 157 + }, + { + "epoch": 0.5056, + "grad_norm": 0.4377518611761136, + "learning_rate": 0.00010312028085675391, + "loss": 0.7508, + "step": 158 + }, + { + "epoch": 0.5088, + "grad_norm": 0.556986243572138, + "learning_rate": 0.00010208037484395114, + "loss": 0.8583, + "step": 159 + }, + { + "epoch": 0.512, + "grad_norm": 0.5101680561816191, + "learning_rate": 0.00010104024370624644, + "loss": 0.8799, + "step": 160 + }, + { + "epoch": 0.5152, + "grad_norm": 0.48972383440758366, + "learning_rate": 0.0001, + "loss": 0.7847, + "step": 161 + }, + { + "epoch": 0.5184, + "grad_norm": 0.444655021704483, + "learning_rate": 9.895975629375359e-05, + "loss": 0.8255, + "step": 162 + }, + { + "epoch": 0.5216, + "grad_norm": 0.5474795246146188, + "learning_rate": 9.791962515604887e-05, + "loss": 0.7948, + "step": 163 + }, + { + "epoch": 0.5248, + "grad_norm": 0.4943924330481842, + "learning_rate": 9.687971914324607e-05, + "loss": 0.7877, + "step": 164 + }, + { + "epoch": 0.528, + "grad_norm": 0.46711979787555513, + "learning_rate": 9.584015078734395e-05, + "loss": 0.8394, + "step": 165 + }, + { + "epoch": 0.5312, + "grad_norm": 0.5170746572254186, + "learning_rate": 9.480103258380198e-05, + "loss": 0.8582, + "step": 166 + }, + { + "epoch": 0.5344, + "grad_norm": 0.5029829766987036, + "learning_rate": 9.376247697936719e-05, + "loss": 0.823, + "step": 167 + }, + { + "epoch": 0.5376, + "grad_norm": 0.5138577160309612, + "learning_rate": 9.272459635990562e-05, + "loss": 0.8544, + "step": 168 + }, + { + "epoch": 0.5408, + "grad_norm": 0.5702185113451829, + "learning_rate": 9.168750303824084e-05, + "loss": 0.8641, + "step": 169 + }, + { + "epoch": 0.544, + "grad_norm": 0.5424895654513993, + "learning_rate": 9.065130924199998e-05, + "loss": 0.8742, + "step": 170 + }, + { + "epoch": 0.5472, + "grad_norm": 0.5012647906278516, + "learning_rate": 8.961612710146934e-05, + "loss": 0.8502, + "step": 171 + }, + { + "epoch": 0.5504, + "grad_norm": 0.5609989577409452, + "learning_rate": 8.858206863746018e-05, + "loss": 0.8781, + "step": 172 + }, + { + "epoch": 0.5536, + "grad_norm": 0.5198376762185971, + "learning_rate": 8.754924574918675e-05, + "loss": 0.9241, + "step": 173 + }, + { + "epoch": 0.5568, + "grad_norm": 0.4794985714870261, + "learning_rate": 8.651777020215712e-05, + "loss": 0.8599, + "step": 174 + }, + { + "epoch": 0.56, + "grad_norm": 0.5814781940803141, + "learning_rate": 8.548775361607872e-05, + "loss": 0.9525, + "step": 175 + }, + { + "epoch": 0.5632, + "grad_norm": 0.4775512630920124, + "learning_rate": 8.445930745277953e-05, + "loss": 0.8309, + "step": 176 + }, + { + "epoch": 0.5664, + "grad_norm": 0.5308946896737243, + "learning_rate": 8.343254300414628e-05, + "loss": 0.8173, + "step": 177 + }, + { + "epoch": 0.5696, + "grad_norm": 0.6156962692255327, + "learning_rate": 8.240757138008149e-05, + "loss": 0.9557, + "step": 178 + }, + { + "epoch": 0.5728, + "grad_norm": 0.4608465291703767, + "learning_rate": 8.138450349647936e-05, + "loss": 0.7859, + "step": 179 + }, + { + "epoch": 0.576, + "grad_norm": 0.46533608765721235, + "learning_rate": 8.036345006322359e-05, + "loss": 0.8186, + "step": 180 + }, + { + "epoch": 0.5792, + "grad_norm": 0.41599063489185295, + "learning_rate": 7.934452157220694e-05, + "loss": 0.7726, + "step": 181 + }, + { + "epoch": 0.5824, + "grad_norm": 0.503097645984635, + "learning_rate": 7.832782828537437e-05, + "loss": 0.8818, + "step": 182 + }, + { + "epoch": 0.5856, + "grad_norm": 0.44746089254838045, + "learning_rate": 7.731348022279134e-05, + "loss": 0.8175, + "step": 183 + }, + { + "epoch": 0.5888, + "grad_norm": 0.4726203165481472, + "learning_rate": 7.630158715073813e-05, + "loss": 0.8099, + "step": 184 + }, + { + "epoch": 0.592, + "grad_norm": 0.6634031625087123, + "learning_rate": 7.52922585698315e-05, + "loss": 0.9609, + "step": 185 + }, + { + "epoch": 0.5952, + "grad_norm": 0.4572154396747571, + "learning_rate": 7.428560370317542e-05, + "loss": 0.8348, + "step": 186 + }, + { + "epoch": 0.5984, + "grad_norm": 0.3567945592222823, + "learning_rate": 7.328173148454151e-05, + "loss": 0.6537, + "step": 187 + }, + { + "epoch": 0.6016, + "grad_norm": 0.4585283897877737, + "learning_rate": 7.228075054658096e-05, + "loss": 0.8717, + "step": 188 + }, + { + "epoch": 0.6048, + "grad_norm": 0.6248603786599073, + "learning_rate": 7.1282769209069e-05, + "loss": 0.9682, + "step": 189 + }, + { + "epoch": 0.608, + "grad_norm": 0.7495566278542247, + "learning_rate": 7.028789546718326e-05, + "loss": 0.8608, + "step": 190 + }, + { + "epoch": 0.6112, + "grad_norm": 0.4420718156712989, + "learning_rate": 6.929623697981718e-05, + "loss": 0.787, + "step": 191 + }, + { + "epoch": 0.6144, + "grad_norm": 0.5111917235368519, + "learning_rate": 6.830790105792973e-05, + "loss": 0.9175, + "step": 192 + }, + { + "epoch": 0.6176, + "grad_norm": 0.672438508212136, + "learning_rate": 6.732299465293322e-05, + "loss": 1.0719, + "step": 193 + }, + { + "epoch": 0.6208, + "grad_norm": 0.5116568664709935, + "learning_rate": 6.63416243451194e-05, + "loss": 0.8358, + "step": 194 + }, + { + "epoch": 0.624, + "grad_norm": 0.5203643567272366, + "learning_rate": 6.536389633212609e-05, + "loss": 0.9163, + "step": 195 + }, + { + "epoch": 0.6272, + "grad_norm": 0.38870695570150277, + "learning_rate": 6.43899164174453e-05, + "loss": 0.7233, + "step": 196 + }, + { + "epoch": 0.6304, + "grad_norm": 0.48407545472042224, + "learning_rate": 6.341978999897365e-05, + "loss": 0.7864, + "step": 197 + }, + { + "epoch": 0.6336, + "grad_norm": 0.4947500456056011, + "learning_rate": 6.245362205760704e-05, + "loss": 0.7789, + "step": 198 + }, + { + "epoch": 0.6368, + "grad_norm": 0.45109916588292803, + "learning_rate": 6.149151714588009e-05, + "loss": 0.7759, + "step": 199 + }, + { + "epoch": 0.64, + "grad_norm": 0.40198365763235006, + "learning_rate": 6.053357937665237e-05, + "loss": 0.7593, + "step": 200 + }, + { + "epoch": 0.6432, + "grad_norm": 0.6390697363623546, + "learning_rate": 5.957991241184184e-05, + "loss": 0.9247, + "step": 201 + }, + { + "epoch": 0.6464, + "grad_norm": 0.41646129282270006, + "learning_rate": 5.863061945120719e-05, + "loss": 0.765, + "step": 202 + }, + { + "epoch": 0.6496, + "grad_norm": 0.5981197840829817, + "learning_rate": 5.768580322118034e-05, + "loss": 0.9303, + "step": 203 + }, + { + "epoch": 0.6528, + "grad_norm": 0.48170462062771713, + "learning_rate": 5.6745565963749925e-05, + "loss": 0.9285, + "step": 204 + }, + { + "epoch": 0.656, + "grad_norm": 0.600508673442443, + "learning_rate": 5.5810009425397294e-05, + "loss": 0.9781, + "step": 205 + }, + { + "epoch": 0.6592, + "grad_norm": 0.716824076932732, + "learning_rate": 5.487923484608629e-05, + "loss": 0.8484, + "step": 206 + }, + { + "epoch": 0.6624, + "grad_norm": 0.4414738931347693, + "learning_rate": 5.395334294830765e-05, + "loss": 0.8276, + "step": 207 + }, + { + "epoch": 0.6656, + "grad_norm": 0.48978516953737894, + "learning_rate": 5.3032433926179395e-05, + "loss": 0.8365, + "step": 208 + }, + { + "epoch": 0.6688, + "grad_norm": 0.471248437485331, + "learning_rate": 5.211660743460458e-05, + "loss": 0.7754, + "step": 209 + }, + { + "epoch": 0.672, + "grad_norm": 0.47196979483999146, + "learning_rate": 5.1205962578487155e-05, + "loss": 0.7898, + "step": 210 + }, + { + "epoch": 0.6752, + "grad_norm": 0.5777611166728946, + "learning_rate": 5.030059790200756e-05, + "loss": 0.8558, + "step": 211 + }, + { + "epoch": 0.6784, + "grad_norm": 0.5703589955979588, + "learning_rate": 4.940061137795876e-05, + "loss": 0.9313, + "step": 212 + }, + { + "epoch": 0.6816, + "grad_norm": 0.47223463059194537, + "learning_rate": 4.850610039714444e-05, + "loss": 0.8815, + "step": 213 + }, + { + "epoch": 0.6848, + "grad_norm": 0.5333634484913804, + "learning_rate": 4.761716175783989e-05, + "loss": 0.8196, + "step": 214 + }, + { + "epoch": 0.688, + "grad_norm": 0.4788802228974212, + "learning_rate": 4.673389165531714e-05, + "loss": 0.7387, + "step": 215 + }, + { + "epoch": 0.6912, + "grad_norm": 0.43697629339971344, + "learning_rate": 4.585638567143529e-05, + "loss": 0.8289, + "step": 216 + }, + { + "epoch": 0.6944, + "grad_norm": 0.4259464304601749, + "learning_rate": 4.498473876429726e-05, + "loss": 0.7418, + "step": 217 + }, + { + "epoch": 0.6976, + "grad_norm": 0.7166274928047626, + "learning_rate": 4.411904525797408e-05, + "loss": 0.7819, + "step": 218 + }, + { + "epoch": 0.7008, + "grad_norm": 0.458133618567669, + "learning_rate": 4.325939883229766e-05, + "loss": 0.8053, + "step": 219 + }, + { + "epoch": 0.704, + "grad_norm": 0.46702827888913045, + "learning_rate": 4.240589251272342e-05, + "loss": 0.8292, + "step": 220 + }, + { + "epoch": 0.7072, + "grad_norm": 0.48946019781873046, + "learning_rate": 4.155861866026364e-05, + "loss": 0.864, + "step": 221 + }, + { + "epoch": 0.7104, + "grad_norm": 0.42289219273319145, + "learning_rate": 4.071766896149273e-05, + "loss": 0.7392, + "step": 222 + }, + { + "epoch": 0.7136, + "grad_norm": 0.5862973098247786, + "learning_rate": 3.988313441862553e-05, + "loss": 0.8668, + "step": 223 + }, + { + "epoch": 0.7168, + "grad_norm": 0.4081715394623093, + "learning_rate": 3.9055105339669595e-05, + "loss": 0.7641, + "step": 224 + }, + { + "epoch": 0.72, + "grad_norm": 0.5725023324849362, + "learning_rate": 3.823367132865265e-05, + "loss": 0.9315, + "step": 225 + }, + { + "epoch": 0.7232, + "grad_norm": 0.4608176923491287, + "learning_rate": 3.741892127592625e-05, + "loss": 0.7366, + "step": 226 + }, + { + "epoch": 0.7264, + "grad_norm": 0.8938209745291172, + "learning_rate": 3.6610943348546526e-05, + "loss": 0.7719, + "step": 227 + }, + { + "epoch": 0.7296, + "grad_norm": 0.4999641933480804, + "learning_rate": 3.580982498073344e-05, + "loss": 0.7636, + "step": 228 + }, + { + "epoch": 0.7328, + "grad_norm": 0.4888845540498063, + "learning_rate": 3.501565286440914e-05, + "loss": 0.7827, + "step": 229 + }, + { + "epoch": 0.736, + "grad_norm": 0.6261751303771892, + "learning_rate": 3.422851293981676e-05, + "loss": 0.8178, + "step": 230 + }, + { + "epoch": 0.7392, + "grad_norm": 0.534356559184315, + "learning_rate": 3.3448490386220355e-05, + "loss": 0.9203, + "step": 231 + }, + { + "epoch": 0.7424, + "grad_norm": 0.48992129201536144, + "learning_rate": 3.2675669612687565e-05, + "loss": 0.7421, + "step": 232 + }, + { + "epoch": 0.7456, + "grad_norm": 1.9767206209015142, + "learning_rate": 3.191013424895536e-05, + "loss": 0.8938, + "step": 233 + }, + { + "epoch": 0.7488, + "grad_norm": 0.5072459343961812, + "learning_rate": 3.115196713638e-05, + "loss": 0.8005, + "step": 234 + }, + { + "epoch": 0.752, + "grad_norm": 0.5304781205118109, + "learning_rate": 3.040125031897264e-05, + "loss": 0.7819, + "step": 235 + }, + { + "epoch": 0.7552, + "grad_norm": 0.4715227659427893, + "learning_rate": 2.9658065034520978e-05, + "loss": 0.7934, + "step": 236 + }, + { + "epoch": 0.7584, + "grad_norm": 0.5006365528434477, + "learning_rate": 2.892249170579826e-05, + "loss": 0.8368, + "step": 237 + }, + { + "epoch": 0.7616, + "grad_norm": 0.4752713961405422, + "learning_rate": 2.8194609931860316e-05, + "loss": 0.7956, + "step": 238 + }, + { + "epoch": 0.7648, + "grad_norm": 0.4642589867124423, + "learning_rate": 2.7474498479432087e-05, + "loss": 0.7403, + "step": 239 + }, + { + "epoch": 0.768, + "grad_norm": 0.4992929327151965, + "learning_rate": 2.6762235274383772e-05, + "loss": 0.8374, + "step": 240 + }, + { + "epoch": 0.7712, + "grad_norm": 0.5094978394096521, + "learning_rate": 2.6057897393298324e-05, + "loss": 0.7637, + "step": 241 + }, + { + "epoch": 0.7744, + "grad_norm": 0.4381104288206429, + "learning_rate": 2.536156105513062e-05, + "loss": 0.7848, + "step": 242 + }, + { + "epoch": 0.7776, + "grad_norm": 0.5460531829956119, + "learning_rate": 2.4673301612959654e-05, + "loss": 0.9106, + "step": 243 + }, + { + "epoch": 0.7808, + "grad_norm": 0.49516879173004186, + "learning_rate": 2.399319354583418e-05, + "loss": 0.8651, + "step": 244 + }, + { + "epoch": 0.784, + "grad_norm": 0.5256022998581887, + "learning_rate": 2.3321310450713062e-05, + "loss": 0.8802, + "step": 245 + }, + { + "epoch": 0.7872, + "grad_norm": 0.5380446125266448, + "learning_rate": 2.265772503450122e-05, + "loss": 0.8876, + "step": 246 + }, + { + "epoch": 0.7904, + "grad_norm": 0.5540982076688425, + "learning_rate": 2.2002509106181624e-05, + "loss": 0.8525, + "step": 247 + }, + { + "epoch": 0.7936, + "grad_norm": 0.45215703073694374, + "learning_rate": 2.1355733569044635e-05, + "loss": 0.8408, + "step": 248 + }, + { + "epoch": 0.7968, + "grad_norm": 0.5018383080597864, + "learning_rate": 2.0717468413015283e-05, + "loss": 0.8539, + "step": 249 + }, + { + "epoch": 0.8, + "grad_norm": 0.38150102790816764, + "learning_rate": 2.008778270707944e-05, + "loss": 0.6628, + "step": 250 + }, + { + "epoch": 0.8032, + "grad_norm": 0.5080972505410488, + "learning_rate": 1.946674459180955e-05, + "loss": 0.8006, + "step": 251 + }, + { + "epoch": 0.8064, + "grad_norm": 0.5340085192497882, + "learning_rate": 1.8854421271990964e-05, + "loss": 0.8429, + "step": 252 + }, + { + "epoch": 0.8096, + "grad_norm": 0.49340574996400083, + "learning_rate": 1.8250879009349398e-05, + "loss": 0.8355, + "step": 253 + }, + { + "epoch": 0.8128, + "grad_norm": 0.45743507211166096, + "learning_rate": 1.7656183115380577e-05, + "loss": 0.7672, + "step": 254 + }, + { + "epoch": 0.816, + "grad_norm": 0.4443719898917557, + "learning_rate": 1.707039794428259e-05, + "loss": 0.801, + "step": 255 + }, + { + "epoch": 0.8192, + "grad_norm": 0.48772267575996897, + "learning_rate": 1.649358688599191e-05, + "loss": 0.841, + "step": 256 + }, + { + "epoch": 0.8224, + "grad_norm": 0.4701443478298068, + "learning_rate": 1.5925812359323745e-05, + "loss": 0.8341, + "step": 257 + }, + { + "epoch": 0.8256, + "grad_norm": 0.49557956937738107, + "learning_rate": 1.5367135805217458e-05, + "loss": 0.798, + "step": 258 + }, + { + "epoch": 0.8288, + "grad_norm": 0.4841522849880639, + "learning_rate": 1.4817617680087825e-05, + "loss": 0.7349, + "step": 259 + }, + { + "epoch": 0.832, + "grad_norm": 0.4338429266316266, + "learning_rate": 1.4277317449282834e-05, + "loss": 0.7883, + "step": 260 + }, + { + "epoch": 0.8352, + "grad_norm": 0.5805364761275487, + "learning_rate": 1.3746293580648717e-05, + "loss": 0.7449, + "step": 261 + }, + { + "epoch": 0.8384, + "grad_norm": 0.42227066125837287, + "learning_rate": 1.3224603538202929e-05, + "loss": 0.7576, + "step": 262 + }, + { + "epoch": 0.8416, + "grad_norm": 0.3803044681673021, + "learning_rate": 1.2712303775915802e-05, + "loss": 0.7592, + "step": 263 + }, + { + "epoch": 0.8448, + "grad_norm": 0.4688417390454173, + "learning_rate": 1.220944973160133e-05, + "loss": 0.7686, + "step": 264 + }, + { + "epoch": 0.848, + "grad_norm": 0.5236357934238897, + "learning_rate": 1.1716095820918216e-05, + "loss": 0.8839, + "step": 265 + }, + { + "epoch": 0.8512, + "grad_norm": 0.4956689763739436, + "learning_rate": 1.1232295431481222e-05, + "loss": 0.7857, + "step": 266 + }, + { + "epoch": 0.8544, + "grad_norm": 0.4804928599152176, + "learning_rate": 1.0758100917083991e-05, + "loss": 0.8408, + "step": 267 + }, + { + "epoch": 0.8576, + "grad_norm": 0.4728394490027128, + "learning_rate": 1.0293563592033595e-05, + "loss": 0.8468, + "step": 268 + }, + { + "epoch": 0.8608, + "grad_norm": 0.5210750785762072, + "learning_rate": 9.838733725597615e-06, + "loss": 0.7312, + "step": 269 + }, + { + "epoch": 0.864, + "grad_norm": 0.5491370018515945, + "learning_rate": 9.393660536564408e-06, + "loss": 0.8933, + "step": 270 + }, + { + "epoch": 0.8672, + "grad_norm": 0.5730445786288376, + "learning_rate": 8.958392187916841e-06, + "loss": 0.9124, + "step": 271 + }, + { + "epoch": 0.8704, + "grad_norm": 0.3985859115466495, + "learning_rate": 8.532975781620512e-06, + "loss": 0.5921, + "step": 272 + }, + { + "epoch": 0.8736, + "grad_norm": 0.4964854620626818, + "learning_rate": 8.117457353526625e-06, + "loss": 0.8046, + "step": 273 + }, + { + "epoch": 0.8768, + "grad_norm": 0.4720944108820016, + "learning_rate": 7.711881868390291e-06, + "loss": 0.7467, + "step": 274 + }, + { + "epoch": 0.88, + "grad_norm": 0.4789031946168028, + "learning_rate": 7.3162932150046885e-06, + "loss": 0.8, + "step": 275 + }, + { + "epoch": 0.8832, + "grad_norm": 0.5470683104324127, + "learning_rate": 6.930734201451816e-06, + "loss": 0.7722, + "step": 276 + }, + { + "epoch": 0.8864, + "grad_norm": 0.5130628373845342, + "learning_rate": 6.555246550469907e-06, + "loss": 0.7779, + "step": 277 + }, + { + "epoch": 0.8896, + "grad_norm": 0.5066102428875189, + "learning_rate": 6.189870894938587e-06, + "loss": 0.8747, + "step": 278 + }, + { + "epoch": 0.8928, + "grad_norm": 0.5507520276541336, + "learning_rate": 5.834646773481811e-06, + "loss": 0.8001, + "step": 279 + }, + { + "epoch": 0.896, + "grad_norm": 0.4833001988157889, + "learning_rate": 5.489612626189245e-06, + "loss": 0.8805, + "step": 280 + }, + { + "epoch": 0.8992, + "grad_norm": 0.5068186552958318, + "learning_rate": 5.154805790456485e-06, + "loss": 0.7485, + "step": 281 + }, + { + "epoch": 0.9024, + "grad_norm": 0.36968280601492154, + "learning_rate": 4.830262496944693e-06, + "loss": 0.7415, + "step": 282 + }, + { + "epoch": 0.9056, + "grad_norm": 0.37324077086556595, + "learning_rate": 4.516017865659949e-06, + "loss": 0.6866, + "step": 283 + }, + { + "epoch": 0.9088, + "grad_norm": 0.5088856898732558, + "learning_rate": 4.21210590215273e-06, + "loss": 0.8358, + "step": 284 + }, + { + "epoch": 0.912, + "grad_norm": 0.47327719164581333, + "learning_rate": 3.918559493838114e-06, + "loss": 0.8017, + "step": 285 + }, + { + "epoch": 0.9152, + "grad_norm": 0.48267264651946334, + "learning_rate": 3.6354104064368566e-06, + "loss": 0.7555, + "step": 286 + }, + { + "epoch": 0.9184, + "grad_norm": 0.48571022308371015, + "learning_rate": 3.3626892805379562e-06, + "loss": 0.7771, + "step": 287 + }, + { + "epoch": 0.9216, + "grad_norm": 0.4463965041878119, + "learning_rate": 3.100425628282899e-06, + "loss": 0.8369, + "step": 288 + }, + { + "epoch": 0.9248, + "grad_norm": 0.5569870129168122, + "learning_rate": 2.848647830172024e-06, + "loss": 0.8017, + "step": 289 + }, + { + "epoch": 0.928, + "grad_norm": 0.44467112798772657, + "learning_rate": 2.607383131993424e-06, + "loss": 0.8143, + "step": 290 + }, + { + "epoch": 0.9312, + "grad_norm": 0.5085062546038296, + "learning_rate": 2.3766576418745022e-06, + "loss": 0.7453, + "step": 291 + }, + { + "epoch": 0.9344, + "grad_norm": 0.3821873068095259, + "learning_rate": 2.1564963274568027e-06, + "loss": 0.7297, + "step": 292 + }, + { + "epoch": 0.9376, + "grad_norm": 0.48984798074690705, + "learning_rate": 1.9469230131940907e-06, + "loss": 0.7694, + "step": 293 + }, + { + "epoch": 0.9408, + "grad_norm": 0.6502396395599714, + "learning_rate": 1.7479603777742938e-06, + "loss": 0.9617, + "step": 294 + }, + { + "epoch": 0.944, + "grad_norm": 0.4375290928151512, + "learning_rate": 1.559629951665298e-06, + "loss": 0.7478, + "step": 295 + }, + { + "epoch": 0.9472, + "grad_norm": 0.5455126380715353, + "learning_rate": 1.3819521147851123e-06, + "loss": 0.7853, + "step": 296 + }, + { + "epoch": 0.9504, + "grad_norm": 0.45340283471262555, + "learning_rate": 1.2149460942964098e-06, + "loss": 0.7779, + "step": 297 + }, + { + "epoch": 0.9536, + "grad_norm": 0.5245684522683273, + "learning_rate": 1.05862996252597e-06, + "loss": 0.7451, + "step": 298 + }, + { + "epoch": 0.9568, + "grad_norm": 0.45518937318494923, + "learning_rate": 9.130206350089765e-07, + "loss": 0.7914, + "step": 299 + }, + { + "epoch": 0.96, + "grad_norm": 0.5603573831198544, + "learning_rate": 7.781338686584927e-07, + "loss": 0.9203, + "step": 300 + }, + { + "epoch": 0.9632, + "grad_norm": 0.6514500429942826, + "learning_rate": 6.539842600603918e-07, + "loss": 0.8584, + "step": 301 + }, + { + "epoch": 0.9664, + "grad_norm": 0.4977090458616739, + "learning_rate": 5.405852438937764e-07, + "loss": 0.8083, + "step": 302 + }, + { + "epoch": 0.9696, + "grad_norm": 0.5298204260092336, + "learning_rate": 4.3794909147720773e-07, + "loss": 0.8459, + "step": 303 + }, + { + "epoch": 0.9728, + "grad_norm": 0.46824487234539475, + "learning_rate": 3.4608690944071263e-07, + "loss": 0.7343, + "step": 304 + }, + { + "epoch": 0.976, + "grad_norm": 0.4361963857398833, + "learning_rate": 2.6500863852395584e-07, + "loss": 0.7546, + "step": 305 + }, + { + "epoch": 0.9792, + "grad_norm": 0.5574599716334647, + "learning_rate": 1.947230525005006e-07, + "loss": 0.8743, + "step": 306 + }, + { + "epoch": 0.9824, + "grad_norm": 0.53845101996606, + "learning_rate": 1.3523775722834587e-07, + "loss": 0.9178, + "step": 307 + }, + { + "epoch": 0.9856, + "grad_norm": 0.4454088008265816, + "learning_rate": 8.655918982689581e-08, + "loss": 0.7039, + "step": 308 + }, + { + "epoch": 0.9888, + "grad_norm": 0.467478344259279, + "learning_rate": 4.8692617980350406e-08, + "loss": 0.6734, + "step": 309 + }, + { + "epoch": 0.992, + "grad_norm": 0.4817562231095009, + "learning_rate": 2.164213936770576e-08, + "loss": 0.778, + "step": 310 + }, + { + "epoch": 0.9952, + "grad_norm": 0.5080354741119012, + "learning_rate": 5.410681219286673e-09, + "loss": 0.8394, + "step": 311 + }, + { + "epoch": 0.9984, + "grad_norm": 0.49317028344655706, + "learning_rate": 0.0, + "loss": 0.8243, + "step": 312 + }, + { + "epoch": 0.9984, + "step": 312, + "total_flos": 255672274780160.0, + "train_loss": 0.8613840270882998, + "train_runtime": 4649.5942, + "train_samples_per_second": 1.075, + "train_steps_per_second": 0.067 + } + ], + "logging_steps": 1.0, + "max_steps": 312, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 255672274780160.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_1_epochs_1_GA_4_lora/README.md b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_1_epochs_1_GA_4_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_1_epochs_1_GA_4_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_1_epochs_1_GA_4_lora/adapter_config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_1_epochs_1_GA_4_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dbf8c3c3e43325d0a92ae30695d75e92666a357a --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_1_epochs_1_GA_4_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "up_proj", + "o_proj", + "q_proj", + "k_proj", + "v_proj", + "down_proj", + "gate_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_1_epochs_1_GA_4_lora/adapter_model.safetensors b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_1_epochs_1_GA_4_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ec3affd8fee0cf8124510977cd932079f312c022 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_1_epochs_1_GA_4_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f33d930eef25650a7f2de7b5a5ed4a089a1ef9ea2546996393d02cede2fe9434 +size 671150064 diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_1_epochs_1_GA_4_lora/config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_1_epochs_1_GA_4_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_1_epochs_1_GA_4_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_1_epochs_1_GA_4_lora/non_lora_trainables.bin b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_1_epochs_1_GA_4_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..be5855c6294be41aee73f584693169fac188ed25 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_1_epochs_1_GA_4_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a7e9bfa0bcd8b06c4047a796844f01f5e1fb8e3bb3b0e071a6e0016f2957def +size 918507402 diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_1_epochs_1_GA_4_lora/trainer_state.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_1_epochs_1_GA_4_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..080e42279da6136bad733ba671a2b6e5690ab3bd --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_1_epochs_1_GA_4_lora/trainer_state.json @@ -0,0 +1,1134 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9984, + "eval_steps": 500, + "global_step": 156, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0064, + "grad_norm": 0.7189928192176996, + "learning_rate": 4e-05, + "loss": 1.2749, + "step": 1 + }, + { + "epoch": 0.0128, + "grad_norm": 0.8319955833461842, + "learning_rate": 8e-05, + "loss": 1.3789, + "step": 2 + }, + { + "epoch": 0.0192, + "grad_norm": 0.5809754662869122, + "learning_rate": 0.00012, + "loss": 1.1343, + "step": 3 + }, + { + "epoch": 0.0256, + "grad_norm": 0.6245866824469333, + "learning_rate": 0.00016, + "loss": 1.2149, + "step": 4 + }, + { + "epoch": 0.032, + "grad_norm": 0.9578086622724321, + "learning_rate": 0.0002, + "loss": 1.1344, + "step": 5 + }, + { + "epoch": 0.0384, + "grad_norm": 0.7566112530213828, + "learning_rate": 0.0001999783578606323, + "loss": 0.9641, + "step": 6 + }, + { + "epoch": 0.0448, + "grad_norm": 0.7522029468406204, + "learning_rate": 0.0001999134408101731, + "loss": 0.957, + "step": 7 + }, + { + "epoch": 0.0512, + "grad_norm": 0.4688807143738283, + "learning_rate": 0.00019980527694749952, + "loss": 1.0045, + "step": 8 + }, + { + "epoch": 0.0576, + "grad_norm": 0.4630914459649483, + "learning_rate": 0.0001996539130905593, + "loss": 0.9609, + "step": 9 + }, + { + "epoch": 0.064, + "grad_norm": 0.4803070483118325, + "learning_rate": 0.00019945941475610623, + "loss": 0.9719, + "step": 10 + }, + { + "epoch": 0.0704, + "grad_norm": 1.5275185100654902, + "learning_rate": 0.0001992218661313415, + "loss": 1.0156, + "step": 11 + }, + { + "epoch": 0.0768, + "grad_norm": 0.5104871457960992, + "learning_rate": 0.00019894137003747403, + "loss": 0.9208, + "step": 12 + }, + { + "epoch": 0.0832, + "grad_norm": 0.5223297671798076, + "learning_rate": 0.00019861804788521493, + "loss": 0.939, + "step": 13 + }, + { + "epoch": 0.0896, + "grad_norm": 0.410020402911693, + "learning_rate": 0.00019825203962222572, + "loss": 0.881, + "step": 14 + }, + { + "epoch": 0.096, + "grad_norm": 0.46121606847779834, + "learning_rate": 0.00019784350367254322, + "loss": 0.9898, + "step": 15 + }, + { + "epoch": 0.1024, + "grad_norm": 0.43133386304573057, + "learning_rate": 0.0001973926168680066, + "loss": 0.9396, + "step": 16 + }, + { + "epoch": 0.1088, + "grad_norm": 0.4242170622362895, + "learning_rate": 0.0001968995743717171, + "loss": 0.9156, + "step": 17 + }, + { + "epoch": 0.1152, + "grad_norm": 0.39417215473442285, + "learning_rate": 0.00019636458959356316, + "loss": 0.9156, + "step": 18 + }, + { + "epoch": 0.1216, + "grad_norm": 0.39962940402425023, + "learning_rate": 0.00019578789409784727, + "loss": 0.8878, + "step": 19 + }, + { + "epoch": 0.128, + "grad_norm": 0.39626912636543593, + "learning_rate": 0.00019516973750305532, + "loss": 0.8733, + "step": 20 + }, + { + "epoch": 0.1344, + "grad_norm": 0.39424977571732966, + "learning_rate": 0.00019451038737381077, + "loss": 0.9044, + "step": 21 + }, + { + "epoch": 0.1408, + "grad_norm": 0.4121165256946469, + "learning_rate": 0.00019381012910506146, + "loss": 0.818, + "step": 22 + }, + { + "epoch": 0.1472, + "grad_norm": 0.3964086386880975, + "learning_rate": 0.00019306926579854821, + "loss": 0.9001, + "step": 23 + }, + { + "epoch": 0.1536, + "grad_norm": 0.47469443270579836, + "learning_rate": 0.0001922881181316097, + "loss": 0.9823, + "step": 24 + }, + { + "epoch": 0.16, + "grad_norm": 0.5560818463742494, + "learning_rate": 0.0001914670242183795, + "loss": 0.8658, + "step": 25 + }, + { + "epoch": 0.1664, + "grad_norm": 0.39498178221575936, + "learning_rate": 0.0001906063394634356, + "loss": 0.8386, + "step": 26 + }, + { + "epoch": 0.1728, + "grad_norm": 0.3986620736028704, + "learning_rate": 0.00018970643640796642, + "loss": 0.9344, + "step": 27 + }, + { + "epoch": 0.1792, + "grad_norm": 0.37157498518297744, + "learning_rate": 0.00018876770456851877, + "loss": 0.8847, + "step": 28 + }, + { + "epoch": 0.1856, + "grad_norm": 0.42098958243287943, + "learning_rate": 0.00018779055026839868, + "loss": 0.9231, + "step": 29 + }, + { + "epoch": 0.192, + "grad_norm": 0.3831312726488654, + "learning_rate": 0.00018677539646179707, + "loss": 0.8833, + "step": 30 + }, + { + "epoch": 0.1984, + "grad_norm": 0.5491328097502954, + "learning_rate": 0.00018572268255071718, + "loss": 0.905, + "step": 31 + }, + { + "epoch": 0.2048, + "grad_norm": 0.4532999921906355, + "learning_rate": 0.00018463286419478255, + "loss": 0.8356, + "step": 32 + }, + { + "epoch": 0.2112, + "grad_norm": 0.4442471402562814, + "learning_rate": 0.00018350641311400812, + "loss": 0.8659, + "step": 33 + }, + { + "epoch": 0.2176, + "grad_norm": 0.4626002552002207, + "learning_rate": 0.00018234381688461942, + "loss": 0.9668, + "step": 34 + }, + { + "epoch": 0.224, + "grad_norm": 0.3658469189500881, + "learning_rate": 0.00018114557872800905, + "loss": 0.8536, + "step": 35 + }, + { + "epoch": 0.2304, + "grad_norm": 0.3770688330178575, + "learning_rate": 0.0001799122172929206, + "loss": 0.8451, + "step": 36 + }, + { + "epoch": 0.2368, + "grad_norm": 0.4180091117995455, + "learning_rate": 0.0001786442664309554, + "loss": 0.9449, + "step": 37 + }, + { + "epoch": 0.2432, + "grad_norm": 0.35845299468910635, + "learning_rate": 0.0001773422749654988, + "loss": 0.8327, + "step": 38 + }, + { + "epoch": 0.2496, + "grad_norm": 0.44128783289492535, + "learning_rate": 0.00017600680645416583, + "loss": 0.9457, + "step": 39 + }, + { + "epoch": 0.256, + "grad_norm": 0.4449729478342089, + "learning_rate": 0.00017463843894486937, + "loss": 0.9068, + "step": 40 + }, + { + "epoch": 0.2624, + "grad_norm": 0.36216797916888466, + "learning_rate": 0.00017323776472561627, + "loss": 0.8932, + "step": 41 + }, + { + "epoch": 0.2688, + "grad_norm": 0.34442983699908675, + "learning_rate": 0.0001718053900681397, + "loss": 0.8737, + "step": 42 + }, + { + "epoch": 0.2752, + "grad_norm": 0.33788923995084713, + "learning_rate": 0.00017034193496547902, + "loss": 0.8122, + "step": 43 + }, + { + "epoch": 0.2816, + "grad_norm": 0.36266886627144307, + "learning_rate": 0.00016884803286362, + "loss": 0.8654, + "step": 44 + }, + { + "epoch": 0.288, + "grad_norm": 0.35251625422571603, + "learning_rate": 0.00016732433038731242, + "loss": 0.8452, + "step": 45 + }, + { + "epoch": 0.2944, + "grad_norm": 0.35230681149405163, + "learning_rate": 0.00016577148706018328, + "loss": 0.7604, + "step": 46 + }, + { + "epoch": 0.3008, + "grad_norm": 0.3975760127284591, + "learning_rate": 0.00016419017501926656, + "loss": 0.9214, + "step": 47 + }, + { + "epoch": 0.3072, + "grad_norm": 0.3925040831760101, + "learning_rate": 0.00016258107872407375, + "loss": 0.823, + "step": 48 + }, + { + "epoch": 0.3136, + "grad_norm": 0.38400758684073305, + "learning_rate": 0.00016094489466033043, + "loss": 0.8198, + "step": 49 + }, + { + "epoch": 0.32, + "grad_norm": 0.38939087705052644, + "learning_rate": 0.0001592823310385073, + "loss": 0.8895, + "step": 50 + }, + { + "epoch": 0.3264, + "grad_norm": 0.39585944742768486, + "learning_rate": 0.00015759410748727662, + "loss": 0.8584, + "step": 51 + }, + { + "epoch": 0.3328, + "grad_norm": 0.3987988094773504, + "learning_rate": 0.00015588095474202595, + "loss": 0.8851, + "step": 52 + }, + { + "epoch": 0.3392, + "grad_norm": 0.4572091526450788, + "learning_rate": 0.00015414361432856475, + "loss": 1.0262, + "step": 53 + }, + { + "epoch": 0.3456, + "grad_norm": 0.44204532283729686, + "learning_rate": 0.00015238283824216015, + "loss": 0.858, + "step": 54 + }, + { + "epoch": 0.352, + "grad_norm": 0.3763897025577905, + "learning_rate": 0.00015059938862204127, + "loss": 0.8853, + "step": 55 + }, + { + "epoch": 0.3584, + "grad_norm": 0.3336798916730397, + "learning_rate": 0.00014879403742151283, + "loss": 0.8065, + "step": 56 + }, + { + "epoch": 0.3648, + "grad_norm": 0.3457997656711169, + "learning_rate": 0.0001469675660738206, + "loss": 0.8587, + "step": 57 + }, + { + "epoch": 0.3712, + "grad_norm": 0.3428806539910002, + "learning_rate": 0.00014512076515391375, + "loss": 0.8251, + "step": 58 + }, + { + "epoch": 0.3776, + "grad_norm": 0.4194432756457435, + "learning_rate": 0.0001432544340362501, + "loss": 0.942, + "step": 59 + }, + { + "epoch": 0.384, + "grad_norm": 0.3966009547130233, + "learning_rate": 0.00014136938054879283, + "loss": 0.9017, + "step": 60 + }, + { + "epoch": 0.3904, + "grad_norm": 0.38883996037217816, + "learning_rate": 0.00013946642062334766, + "loss": 0.8532, + "step": 61 + }, + { + "epoch": 0.3968, + "grad_norm": 0.38206215602078847, + "learning_rate": 0.000137546377942393, + "loss": 0.8401, + "step": 62 + }, + { + "epoch": 0.4032, + "grad_norm": 0.3991871520979649, + "learning_rate": 0.00013561008358255468, + "loss": 0.9448, + "step": 63 + }, + { + "epoch": 0.4096, + "grad_norm": 0.37035665888756364, + "learning_rate": 0.00013365837565488064, + "loss": 0.8757, + "step": 64 + }, + { + "epoch": 0.416, + "grad_norm": 0.3758459959811213, + "learning_rate": 0.0001316920989420703, + "loss": 0.9447, + "step": 65 + }, + { + "epoch": 0.4224, + "grad_norm": 0.4454172241386201, + "learning_rate": 0.00012971210453281674, + "loss": 0.9232, + "step": 66 + }, + { + "epoch": 0.4288, + "grad_norm": 0.3888728546220646, + "learning_rate": 0.00012771924945341906, + "loss": 0.7479, + "step": 67 + }, + { + "epoch": 0.4352, + "grad_norm": 0.36157253364464487, + "learning_rate": 0.0001257143962968246, + "loss": 0.7748, + "step": 68 + }, + { + "epoch": 0.4416, + "grad_norm": 0.36980532953207046, + "learning_rate": 0.00012369841284926188, + "loss": 0.8703, + "step": 69 + }, + { + "epoch": 0.448, + "grad_norm": 0.3626868146678862, + "learning_rate": 0.00012167217171462566, + "loss": 0.8467, + "step": 70 + }, + { + "epoch": 0.4544, + "grad_norm": 0.38649257959123245, + "learning_rate": 0.00011963654993677645, + "loss": 0.8564, + "step": 71 + }, + { + "epoch": 0.4608, + "grad_norm": 0.35396765028989813, + "learning_rate": 0.00011759242861991855, + "loss": 0.8104, + "step": 72 + }, + { + "epoch": 0.4672, + "grad_norm": 0.3286780911483914, + "learning_rate": 0.00011554069254722051, + "loss": 0.7684, + "step": 73 + }, + { + "epoch": 0.4736, + "grad_norm": 0.35107142828820026, + "learning_rate": 0.00011348222979784289, + "loss": 0.8605, + "step": 74 + }, + { + "epoch": 0.48, + "grad_norm": 0.7889957756330359, + "learning_rate": 0.00011141793136253986, + "loss": 0.841, + "step": 75 + }, + { + "epoch": 0.4864, + "grad_norm": 0.3187277828580531, + "learning_rate": 0.000109348690758, + "loss": 0.7969, + "step": 76 + }, + { + "epoch": 0.4928, + "grad_norm": 0.38348046342297426, + "learning_rate": 0.0001072754036400944, + "loss": 0.813, + "step": 77 + }, + { + "epoch": 0.4992, + "grad_norm": 0.32316080082896587, + "learning_rate": 0.00010519896741619803, + "loss": 0.7801, + "step": 78 + }, + { + "epoch": 0.5056, + "grad_norm": 0.3913372773736484, + "learning_rate": 0.00010312028085675391, + "loss": 0.8262, + "step": 79 + }, + { + "epoch": 0.512, + "grad_norm": 0.3935015318770364, + "learning_rate": 0.00010104024370624644, + "loss": 0.8697, + "step": 80 + }, + { + "epoch": 0.5184, + "grad_norm": 0.3614006773996832, + "learning_rate": 9.895975629375359e-05, + "loss": 0.8044, + "step": 81 + }, + { + "epoch": 0.5248, + "grad_norm": 0.41466269228270225, + "learning_rate": 9.687971914324607e-05, + "loss": 0.7944, + "step": 82 + }, + { + "epoch": 0.5312, + "grad_norm": 0.35402314029575577, + "learning_rate": 9.480103258380198e-05, + "loss": 0.8434, + "step": 83 + }, + { + "epoch": 0.5376, + "grad_norm": 0.3643060985822875, + "learning_rate": 9.272459635990562e-05, + "loss": 0.8359, + "step": 84 + }, + { + "epoch": 0.544, + "grad_norm": 0.4063956161169547, + "learning_rate": 9.065130924199998e-05, + "loss": 0.8702, + "step": 85 + }, + { + "epoch": 0.5504, + "grad_norm": 0.38678963629589486, + "learning_rate": 8.858206863746018e-05, + "loss": 0.8657, + "step": 86 + }, + { + "epoch": 0.5568, + "grad_norm": 0.39619346949074186, + "learning_rate": 8.651777020215712e-05, + "loss": 0.8951, + "step": 87 + }, + { + "epoch": 0.5632, + "grad_norm": 0.39717583396608747, + "learning_rate": 8.445930745277953e-05, + "loss": 0.8918, + "step": 88 + }, + { + "epoch": 0.5696, + "grad_norm": 0.4461455486472923, + "learning_rate": 8.240757138008149e-05, + "loss": 0.8901, + "step": 89 + }, + { + "epoch": 0.576, + "grad_norm": 0.34602688483693744, + "learning_rate": 8.036345006322359e-05, + "loss": 0.8002, + "step": 90 + }, + { + "epoch": 0.5824, + "grad_norm": 0.34393240692217825, + "learning_rate": 7.832782828537437e-05, + "loss": 0.8235, + "step": 91 + }, + { + "epoch": 0.5888, + "grad_norm": 0.3359041764783406, + "learning_rate": 7.630158715073813e-05, + "loss": 0.8082, + "step": 92 + }, + { + "epoch": 0.5952, + "grad_norm": 0.4279331859091739, + "learning_rate": 7.428560370317542e-05, + "loss": 0.8953, + "step": 93 + }, + { + "epoch": 0.6016, + "grad_norm": 0.30882762174007244, + "learning_rate": 7.228075054658096e-05, + "loss": 0.7623, + "step": 94 + }, + { + "epoch": 0.608, + "grad_norm": 0.4584337001666346, + "learning_rate": 7.028789546718326e-05, + "loss": 0.9175, + "step": 95 + }, + { + "epoch": 0.6144, + "grad_norm": 0.3639832701678242, + "learning_rate": 6.830790105792973e-05, + "loss": 0.8502, + "step": 96 + }, + { + "epoch": 0.6208, + "grad_norm": 0.5282506037927764, + "learning_rate": 6.63416243451194e-05, + "loss": 0.9513, + "step": 97 + }, + { + "epoch": 0.6272, + "grad_norm": 0.3383739993204207, + "learning_rate": 6.43899164174453e-05, + "loss": 0.8183, + "step": 98 + }, + { + "epoch": 0.6336, + "grad_norm": 0.36540908539257766, + "learning_rate": 6.245362205760704e-05, + "loss": 0.7855, + "step": 99 + }, + { + "epoch": 0.64, + "grad_norm": 0.31612659924309955, + "learning_rate": 6.053357937665237e-05, + "loss": 0.768, + "step": 100 + }, + { + "epoch": 0.6464, + "grad_norm": 0.4469664272821099, + "learning_rate": 5.863061945120719e-05, + "loss": 0.8475, + "step": 101 + }, + { + "epoch": 0.6528, + "grad_norm": 0.3945021235718483, + "learning_rate": 5.6745565963749925e-05, + "loss": 0.9279, + "step": 102 + }, + { + "epoch": 0.6592, + "grad_norm": 0.44505629821008874, + "learning_rate": 5.487923484608629e-05, + "loss": 0.9159, + "step": 103 + }, + { + "epoch": 0.6656, + "grad_norm": 0.34264801888493884, + "learning_rate": 5.3032433926179395e-05, + "loss": 0.8375, + "step": 104 + }, + { + "epoch": 0.672, + "grad_norm": 0.3475400842702629, + "learning_rate": 5.1205962578487155e-05, + "loss": 0.7859, + "step": 105 + }, + { + "epoch": 0.6784, + "grad_norm": 0.42311657763812843, + "learning_rate": 4.940061137795876e-05, + "loss": 0.8983, + "step": 106 + }, + { + "epoch": 0.6848, + "grad_norm": 0.39636955055988876, + "learning_rate": 4.761716175783989e-05, + "loss": 0.8531, + "step": 107 + }, + { + "epoch": 0.6912, + "grad_norm": 0.3448189962427086, + "learning_rate": 4.585638567143529e-05, + "loss": 0.7882, + "step": 108 + }, + { + "epoch": 0.6976, + "grad_norm": 0.32058411092475675, + "learning_rate": 4.411904525797408e-05, + "loss": 0.7672, + "step": 109 + }, + { + "epoch": 0.704, + "grad_norm": 0.3395674575435856, + "learning_rate": 4.240589251272342e-05, + "loss": 0.8186, + "step": 110 + }, + { + "epoch": 0.7104, + "grad_norm": 0.3891736218202886, + "learning_rate": 4.071766896149273e-05, + "loss": 0.8075, + "step": 111 + }, + { + "epoch": 0.7168, + "grad_norm": 0.3542055992556126, + "learning_rate": 3.9055105339669595e-05, + "loss": 0.816, + "step": 112 + }, + { + "epoch": 0.7232, + "grad_norm": 0.38471493444035987, + "learning_rate": 3.741892127592625e-05, + "loss": 0.839, + "step": 113 + }, + { + "epoch": 0.7296, + "grad_norm": 0.3386165881784285, + "learning_rate": 3.580982498073344e-05, + "loss": 0.7718, + "step": 114 + }, + { + "epoch": 0.736, + "grad_norm": 0.4143057178990273, + "learning_rate": 3.422851293981676e-05, + "loss": 0.8071, + "step": 115 + }, + { + "epoch": 0.7424, + "grad_norm": 0.38135232392364804, + "learning_rate": 3.2675669612687565e-05, + "loss": 0.8378, + "step": 116 + }, + { + "epoch": 0.7488, + "grad_norm": 0.40085764890401165, + "learning_rate": 3.115196713638e-05, + "loss": 0.8594, + "step": 117 + }, + { + "epoch": 0.7552, + "grad_norm": 0.34240983952898807, + "learning_rate": 2.9658065034520978e-05, + "loss": 0.7909, + "step": 118 + }, + { + "epoch": 0.7616, + "grad_norm": 0.34965884048536067, + "learning_rate": 2.8194609931860316e-05, + "loss": 0.8224, + "step": 119 + }, + { + "epoch": 0.768, + "grad_norm": 0.34677921296903913, + "learning_rate": 2.6762235274383772e-05, + "loss": 0.7951, + "step": 120 + }, + { + "epoch": 0.7744, + "grad_norm": 0.33923871553660206, + "learning_rate": 2.536156105513062e-05, + "loss": 0.7814, + "step": 121 + }, + { + "epoch": 0.7808, + "grad_norm": 0.3638025165502641, + "learning_rate": 2.399319354583418e-05, + "loss": 0.8955, + "step": 122 + }, + { + "epoch": 0.7872, + "grad_norm": 0.3819803215786024, + "learning_rate": 2.265772503450122e-05, + "loss": 0.8912, + "step": 123 + }, + { + "epoch": 0.7936, + "grad_norm": 0.37494607520697026, + "learning_rate": 2.1355733569044635e-05, + "loss": 0.8524, + "step": 124 + }, + { + "epoch": 0.8, + "grad_norm": 0.3378930714935849, + "learning_rate": 2.008778270707944e-05, + "loss": 0.7661, + "step": 125 + }, + { + "epoch": 0.8064, + "grad_norm": 0.3613187167368175, + "learning_rate": 1.8854421271990964e-05, + "loss": 0.8269, + "step": 126 + }, + { + "epoch": 0.8128, + "grad_norm": 0.3494977186160372, + "learning_rate": 1.7656183115380577e-05, + "loss": 0.8133, + "step": 127 + }, + { + "epoch": 0.8192, + "grad_norm": 0.3415589265976714, + "learning_rate": 1.649358688599191e-05, + "loss": 0.8294, + "step": 128 + }, + { + "epoch": 0.8256, + "grad_norm": 0.35122133620906343, + "learning_rate": 1.5367135805217458e-05, + "loss": 0.8206, + "step": 129 + }, + { + "epoch": 0.832, + "grad_norm": 0.3304259243827347, + "learning_rate": 1.4277317449282834e-05, + "loss": 0.766, + "step": 130 + }, + { + "epoch": 0.8384, + "grad_norm": 0.383219017788973, + "learning_rate": 1.3224603538202929e-05, + "loss": 0.7598, + "step": 131 + }, + { + "epoch": 0.8448, + "grad_norm": 0.31758087504525645, + "learning_rate": 1.220944973160133e-05, + "loss": 0.7718, + "step": 132 + }, + { + "epoch": 0.8512, + "grad_norm": 0.3662500111889786, + "learning_rate": 1.1232295431481222e-05, + "loss": 0.8434, + "step": 133 + }, + { + "epoch": 0.8576, + "grad_norm": 0.3466527871535633, + "learning_rate": 1.0293563592033595e-05, + "loss": 0.8492, + "step": 134 + }, + { + "epoch": 0.864, + "grad_norm": 0.39455671394350805, + "learning_rate": 9.393660536564408e-06, + "loss": 0.8239, + "step": 135 + }, + { + "epoch": 0.8704, + "grad_norm": 0.360736850626209, + "learning_rate": 8.532975781620512e-06, + "loss": 0.7559, + "step": 136 + }, + { + "epoch": 0.8768, + "grad_norm": 0.4631240715144929, + "learning_rate": 7.711881868390291e-06, + "loss": 0.7805, + "step": 137 + }, + { + "epoch": 0.8832, + "grad_norm": 0.3980967242407756, + "learning_rate": 6.930734201451816e-06, + "loss": 0.7975, + "step": 138 + }, + { + "epoch": 0.8896, + "grad_norm": 0.36993486192618463, + "learning_rate": 6.189870894938587e-06, + "loss": 0.8351, + "step": 139 + }, + { + "epoch": 0.896, + "grad_norm": 0.3912263833359602, + "learning_rate": 5.489612626189245e-06, + "loss": 0.8489, + "step": 140 + }, + { + "epoch": 0.9024, + "grad_norm": 0.3150082668830467, + "learning_rate": 4.830262496944693e-06, + "loss": 0.7527, + "step": 141 + }, + { + "epoch": 0.9088, + "grad_norm": 0.32796401994936486, + "learning_rate": 4.21210590215273e-06, + "loss": 0.7648, + "step": 142 + }, + { + "epoch": 0.9152, + "grad_norm": 0.343024029147899, + "learning_rate": 3.6354104064368566e-06, + "loss": 0.7889, + "step": 143 + }, + { + "epoch": 0.9216, + "grad_norm": 0.3355843506617599, + "learning_rate": 3.100425628282899e-06, + "loss": 0.8135, + "step": 144 + }, + { + "epoch": 0.928, + "grad_norm": 0.36927653189183046, + "learning_rate": 2.607383131993424e-06, + "loss": 0.8176, + "step": 145 + }, + { + "epoch": 0.9344, + "grad_norm": 0.3325175287450438, + "learning_rate": 2.1564963274568027e-06, + "loss": 0.745, + "step": 146 + }, + { + "epoch": 0.9408, + "grad_norm": 0.43019678727654465, + "learning_rate": 1.7479603777742938e-06, + "loss": 0.872, + "step": 147 + }, + { + "epoch": 0.9472, + "grad_norm": 0.3576768522844223, + "learning_rate": 1.3819521147851123e-06, + "loss": 0.7748, + "step": 148 + }, + { + "epoch": 0.9536, + "grad_norm": 0.3463866549168474, + "learning_rate": 1.05862996252597e-06, + "loss": 0.767, + "step": 149 + }, + { + "epoch": 0.96, + "grad_norm": 0.3724385710741351, + "learning_rate": 7.781338686584927e-07, + "loss": 0.8617, + "step": 150 + }, + { + "epoch": 0.9664, + "grad_norm": 0.4027111260210624, + "learning_rate": 5.405852438937764e-07, + "loss": 0.8413, + "step": 151 + }, + { + "epoch": 0.9728, + "grad_norm": 0.34338867544992413, + "learning_rate": 3.4608690944071263e-07, + "loss": 0.7988, + "step": 152 + }, + { + "epoch": 0.9792, + "grad_norm": 0.35661896137582483, + "learning_rate": 1.947230525005006e-07, + "loss": 0.8237, + "step": 153 + }, + { + "epoch": 0.9856, + "grad_norm": 0.3572073678993763, + "learning_rate": 8.655918982689581e-08, + "loss": 0.8177, + "step": 154 + }, + { + "epoch": 0.992, + "grad_norm": 0.35003409191038015, + "learning_rate": 2.164213936770576e-08, + "loss": 0.7333, + "step": 155 + }, + { + "epoch": 0.9984, + "grad_norm": 0.3632903263929621, + "learning_rate": 0.0, + "loss": 0.8419, + "step": 156 + }, + { + "epoch": 0.9984, + "step": 156, + "total_flos": 369874054676480.0, + "train_loss": 0.8646604896355898, + "train_runtime": 4609.8811, + "train_samples_per_second": 1.085, + "train_steps_per_second": 0.034 + } + ], + "logging_steps": 1.0, + "max_steps": 156, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 369874054676480.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_2_epochs_1_GA_2_lora/README.md b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_2_epochs_1_GA_2_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_2_epochs_1_GA_2_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_2_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_2_epochs_1_GA_2_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4a3249eb3c9cfb22b7b20ea510c3eef425094ff9 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_2_epochs_1_GA_2_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "down_proj", + "up_proj", + "o_proj", + "gate_proj", + "k_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..529a874bcaea7ebc9ccf5de40bd31cbc029a6c9c --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:303c2be5e4b7c4c2740bc8fe21a16489e7dc9ad5fd21459fc665aff86b5faf5d +size 671150064 diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_2_epochs_1_GA_2_lora/config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_2_epochs_1_GA_2_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_2_epochs_1_GA_2_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..8013d9d088f7945a8538d7a44bfa35198708b62c --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5389b9cc76ead40e1b7bc8b626a9b166a9dbb13ed161fd6c34d3e8d88509c73 +size 918507402 diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_2_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_2_epochs_1_GA_2_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..7418f65869cf29f8d6504adbb47bcf1aa5fabae8 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_2_epochs_1_GA_2_lora/trainer_state.json @@ -0,0 +1,2226 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9984, + "eval_steps": 500, + "global_step": 312, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0032, + "grad_norm": 0.856571687971295, + "learning_rate": 2e-05, + "loss": 1.2847, + "step": 1 + }, + { + "epoch": 0.0064, + "grad_norm": 0.7055923454015887, + "learning_rate": 4e-05, + "loss": 1.112, + "step": 2 + }, + { + "epoch": 0.0096, + "grad_norm": 0.7337135243739601, + "learning_rate": 6e-05, + "loss": 1.2254, + "step": 3 + }, + { + "epoch": 0.0128, + "grad_norm": 0.837585495614336, + "learning_rate": 8e-05, + "loss": 1.3822, + "step": 4 + }, + { + "epoch": 0.016, + "grad_norm": 0.6594271990911063, + "learning_rate": 0.0001, + "loss": 1.1207, + "step": 5 + }, + { + "epoch": 0.0192, + "grad_norm": 1.1223985314428306, + "learning_rate": 0.00012, + "loss": 0.9881, + "step": 6 + }, + { + "epoch": 0.0224, + "grad_norm": 0.9227094007590665, + "learning_rate": 0.00014, + "loss": 1.1223, + "step": 7 + }, + { + "epoch": 0.0256, + "grad_norm": 0.8477658360206024, + "learning_rate": 0.00016, + "loss": 1.1024, + "step": 8 + }, + { + "epoch": 0.0288, + "grad_norm": 1.0168041710695876, + "learning_rate": 0.00018, + "loss": 0.9368, + "step": 9 + }, + { + "epoch": 0.032, + "grad_norm": 0.7268498717360209, + "learning_rate": 0.0002, + "loss": 1.1298, + "step": 10 + }, + { + "epoch": 0.0352, + "grad_norm": 0.5215962705907371, + "learning_rate": 0.00019999458931878073, + "loss": 0.9259, + "step": 11 + }, + { + "epoch": 0.0384, + "grad_norm": 0.7553183446752619, + "learning_rate": 0.0001999783578606323, + "loss": 1.0669, + "step": 12 + }, + { + "epoch": 0.0416, + "grad_norm": 0.5641728211000794, + "learning_rate": 0.00019995130738201966, + "loss": 0.9391, + "step": 13 + }, + { + "epoch": 0.0448, + "grad_norm": 0.8498327143480147, + "learning_rate": 0.0001999134408101731, + "loss": 1.0197, + "step": 14 + }, + { + "epoch": 0.048, + "grad_norm": 0.7717724273877493, + "learning_rate": 0.00019986476224277165, + "loss": 0.9622, + "step": 15 + }, + { + "epoch": 0.0512, + "grad_norm": 0.5153040789801788, + "learning_rate": 0.00019980527694749952, + "loss": 0.8716, + "step": 16 + }, + { + "epoch": 0.0544, + "grad_norm": 0.6215554177155925, + "learning_rate": 0.00019973499136147606, + "loss": 0.9396, + "step": 17 + }, + { + "epoch": 0.0576, + "grad_norm": 0.6368311748731851, + "learning_rate": 0.0001996539130905593, + "loss": 0.9085, + "step": 18 + }, + { + "epoch": 0.0608, + "grad_norm": 0.5991974216920687, + "learning_rate": 0.0001995620509085228, + "loss": 0.9664, + "step": 19 + }, + { + "epoch": 0.064, + "grad_norm": 0.5880944284346221, + "learning_rate": 0.00019945941475610623, + "loss": 0.8624, + "step": 20 + }, + { + "epoch": 0.0672, + "grad_norm": 0.7255831004379246, + "learning_rate": 0.0001993460157399396, + "loss": 0.9939, + "step": 21 + }, + { + "epoch": 0.0704, + "grad_norm": 0.6374991872522399, + "learning_rate": 0.0001992218661313415, + "loss": 1.0072, + "step": 22 + }, + { + "epoch": 0.0736, + "grad_norm": 0.5546079581109932, + "learning_rate": 0.00019908697936499103, + "loss": 0.9612, + "step": 23 + }, + { + "epoch": 0.0768, + "grad_norm": 0.5916519032202122, + "learning_rate": 0.00019894137003747403, + "loss": 0.9761, + "step": 24 + }, + { + "epoch": 0.08, + "grad_norm": 0.6356011070115721, + "learning_rate": 0.00019878505390570362, + "loss": 0.9587, + "step": 25 + }, + { + "epoch": 0.0832, + "grad_norm": 0.6024071271891809, + "learning_rate": 0.00019861804788521493, + "loss": 0.8811, + "step": 26 + }, + { + "epoch": 0.0864, + "grad_norm": 0.49232008604791955, + "learning_rate": 0.00019844037004833473, + "loss": 0.8569, + "step": 27 + }, + { + "epoch": 0.0896, + "grad_norm": 0.6053773289823773, + "learning_rate": 0.00019825203962222572, + "loss": 0.997, + "step": 28 + }, + { + "epoch": 0.0928, + "grad_norm": 0.6642190058400091, + "learning_rate": 0.0001980530769868059, + "loss": 0.964, + "step": 29 + }, + { + "epoch": 0.096, + "grad_norm": 0.4782405704173689, + "learning_rate": 0.00019784350367254322, + "loss": 0.8375, + "step": 30 + }, + { + "epoch": 0.0992, + "grad_norm": 0.6334905713423813, + "learning_rate": 0.0001976233423581255, + "loss": 0.9273, + "step": 31 + }, + { + "epoch": 0.1024, + "grad_norm": 0.3998669898594021, + "learning_rate": 0.0001973926168680066, + "loss": 0.8155, + "step": 32 + }, + { + "epoch": 0.1056, + "grad_norm": 0.5351402876869878, + "learning_rate": 0.00019715135216982798, + "loss": 0.845, + "step": 33 + }, + { + "epoch": 0.1088, + "grad_norm": 0.5473592406773546, + "learning_rate": 0.0001968995743717171, + "loss": 0.876, + "step": 34 + }, + { + "epoch": 0.112, + "grad_norm": 0.4714152405970396, + "learning_rate": 0.00019663731071946206, + "loss": 0.8893, + "step": 35 + }, + { + "epoch": 0.1152, + "grad_norm": 0.6233030845726812, + "learning_rate": 0.00019636458959356316, + "loss": 0.9926, + "step": 36 + }, + { + "epoch": 0.1184, + "grad_norm": 0.5851125542623964, + "learning_rate": 0.0001960814405061619, + "loss": 0.9213, + "step": 37 + }, + { + "epoch": 0.1216, + "grad_norm": 0.48818739539340444, + "learning_rate": 0.00019578789409784727, + "loss": 0.8393, + "step": 38 + }, + { + "epoch": 0.1248, + "grad_norm": 0.5942600495624568, + "learning_rate": 0.00019548398213434007, + "loss": 0.9138, + "step": 39 + }, + { + "epoch": 0.128, + "grad_norm": 0.6155460135079603, + "learning_rate": 0.00019516973750305532, + "loss": 1.0341, + "step": 40 + }, + { + "epoch": 0.1312, + "grad_norm": 0.5354548010801756, + "learning_rate": 0.00019484519420954354, + "loss": 0.851, + "step": 41 + }, + { + "epoch": 0.1344, + "grad_norm": 0.5029186103592496, + "learning_rate": 0.00019451038737381077, + "loss": 0.9062, + "step": 42 + }, + { + "epoch": 0.1376, + "grad_norm": 0.5856045747194191, + "learning_rate": 0.00019416535322651818, + "loss": 0.8925, + "step": 43 + }, + { + "epoch": 0.1408, + "grad_norm": 0.5165745934706759, + "learning_rate": 0.00019381012910506146, + "loss": 0.9191, + "step": 44 + }, + { + "epoch": 0.144, + "grad_norm": 0.5104093507497887, + "learning_rate": 0.00019344475344953012, + "loss": 0.9431, + "step": 45 + }, + { + "epoch": 0.1472, + "grad_norm": 0.5324667462814716, + "learning_rate": 0.00019306926579854821, + "loss": 0.8927, + "step": 46 + }, + { + "epoch": 0.1504, + "grad_norm": 0.5980004948399099, + "learning_rate": 0.00019268370678499533, + "loss": 0.9729, + "step": 47 + }, + { + "epoch": 0.1536, + "grad_norm": 0.5812454848443018, + "learning_rate": 0.0001922881181316097, + "loss": 0.9417, + "step": 48 + }, + { + "epoch": 0.1568, + "grad_norm": 0.6434345018371275, + "learning_rate": 0.00019188254264647337, + "loss": 0.9758, + "step": 49 + }, + { + "epoch": 0.16, + "grad_norm": 0.8670925963606051, + "learning_rate": 0.0001914670242183795, + "loss": 0.9639, + "step": 50 + }, + { + "epoch": 0.1632, + "grad_norm": 0.5142581304786665, + "learning_rate": 0.0001910416078120832, + "loss": 0.8442, + "step": 51 + }, + { + "epoch": 0.1664, + "grad_norm": 0.4784472560862407, + "learning_rate": 0.0001906063394634356, + "loss": 0.817, + "step": 52 + }, + { + "epoch": 0.1696, + "grad_norm": 0.5192157296603549, + "learning_rate": 0.00019016126627440237, + "loss": 0.9377, + "step": 53 + }, + { + "epoch": 0.1728, + "grad_norm": 0.5345814539065935, + "learning_rate": 0.00018970643640796642, + "loss": 0.9973, + "step": 54 + }, + { + "epoch": 0.176, + "grad_norm": 0.5212648979550981, + "learning_rate": 0.000189241899082916, + "loss": 0.944, + "step": 55 + }, + { + "epoch": 0.1792, + "grad_norm": 0.5847828497713395, + "learning_rate": 0.00018876770456851877, + "loss": 0.8486, + "step": 56 + }, + { + "epoch": 0.1824, + "grad_norm": 0.6036505287588615, + "learning_rate": 0.0001882839041790818, + "loss": 0.976, + "step": 57 + }, + { + "epoch": 0.1856, + "grad_norm": 0.5185079411903789, + "learning_rate": 0.00018779055026839868, + "loss": 0.8535, + "step": 58 + }, + { + "epoch": 0.1888, + "grad_norm": 0.5087496634629276, + "learning_rate": 0.00018728769622408423, + "loss": 0.8656, + "step": 59 + }, + { + "epoch": 0.192, + "grad_norm": 0.6620924291595747, + "learning_rate": 0.00018677539646179707, + "loss": 1.0019, + "step": 60 + }, + { + "epoch": 0.1952, + "grad_norm": 0.4165962007102062, + "learning_rate": 0.00018625370641935129, + "loss": 0.8478, + "step": 61 + }, + { + "epoch": 0.1984, + "grad_norm": 0.6000561388824549, + "learning_rate": 0.00018572268255071718, + "loss": 0.9312, + "step": 62 + }, + { + "epoch": 0.2016, + "grad_norm": 0.5645804945586722, + "learning_rate": 0.00018518238231991218, + "loss": 0.8775, + "step": 63 + }, + { + "epoch": 0.2048, + "grad_norm": 0.5457958238077547, + "learning_rate": 0.00018463286419478255, + "loss": 0.932, + "step": 64 + }, + { + "epoch": 0.208, + "grad_norm": 0.4563578229166194, + "learning_rate": 0.00018407418764067627, + "loss": 0.7811, + "step": 65 + }, + { + "epoch": 0.2112, + "grad_norm": 0.6237927980286077, + "learning_rate": 0.00018350641311400812, + "loss": 0.9482, + "step": 66 + }, + { + "epoch": 0.2144, + "grad_norm": 0.6096397902515523, + "learning_rate": 0.0001829296020557174, + "loss": 0.9591, + "step": 67 + }, + { + "epoch": 0.2176, + "grad_norm": 0.61532565982814, + "learning_rate": 0.00018234381688461942, + "loss": 0.9338, + "step": 68 + }, + { + "epoch": 0.2208, + "grad_norm": 0.5366533603653786, + "learning_rate": 0.0001817491209906506, + "loss": 0.9319, + "step": 69 + }, + { + "epoch": 0.224, + "grad_norm": 0.5165264282327887, + "learning_rate": 0.00018114557872800905, + "loss": 0.935, + "step": 70 + }, + { + "epoch": 0.2272, + "grad_norm": 0.5020374316473182, + "learning_rate": 0.00018053325540819045, + "loss": 0.889, + "step": 71 + }, + { + "epoch": 0.2304, + "grad_norm": 0.5429511056967373, + "learning_rate": 0.0001799122172929206, + "loss": 0.8124, + "step": 72 + }, + { + "epoch": 0.2336, + "grad_norm": 0.5372888520436341, + "learning_rate": 0.00017928253158698473, + "loss": 0.8855, + "step": 73 + }, + { + "epoch": 0.2368, + "grad_norm": 0.5477919990650789, + "learning_rate": 0.0001786442664309554, + "loss": 0.9003, + "step": 74 + }, + { + "epoch": 0.24, + "grad_norm": 0.4919770372900607, + "learning_rate": 0.0001779974908938184, + "loss": 0.8582, + "step": 75 + }, + { + "epoch": 0.2432, + "grad_norm": 0.49579325741634583, + "learning_rate": 0.0001773422749654988, + "loss": 0.8353, + "step": 76 + }, + { + "epoch": 0.2464, + "grad_norm": 0.5782246538890948, + "learning_rate": 0.00017667868954928694, + "loss": 0.9904, + "step": 77 + }, + { + "epoch": 0.2496, + "grad_norm": 0.6235756903654298, + "learning_rate": 0.00017600680645416583, + "loss": 0.9499, + "step": 78 + }, + { + "epoch": 0.2528, + "grad_norm": 0.5559031253986597, + "learning_rate": 0.00017532669838704035, + "loss": 0.9486, + "step": 79 + }, + { + "epoch": 0.256, + "grad_norm": 0.5161196463465147, + "learning_rate": 0.00017463843894486937, + "loss": 0.7823, + "step": 80 + }, + { + "epoch": 0.2592, + "grad_norm": 0.48046584574072526, + "learning_rate": 0.0001739421026067017, + "loss": 0.8563, + "step": 81 + }, + { + "epoch": 0.2624, + "grad_norm": 0.5644927020207363, + "learning_rate": 0.00017323776472561627, + "loss": 0.939, + "step": 82 + }, + { + "epoch": 0.2656, + "grad_norm": 0.5262516802613253, + "learning_rate": 0.00017252550152056795, + "loss": 0.9095, + "step": 83 + }, + { + "epoch": 0.2688, + "grad_norm": 0.5162214092865526, + "learning_rate": 0.0001718053900681397, + "loss": 0.9218, + "step": 84 + }, + { + "epoch": 0.272, + "grad_norm": 0.48066263602690296, + "learning_rate": 0.00017107750829420176, + "loss": 0.8351, + "step": 85 + }, + { + "epoch": 0.2752, + "grad_norm": 0.46511514585809155, + "learning_rate": 0.00017034193496547902, + "loss": 0.8347, + "step": 86 + }, + { + "epoch": 0.2784, + "grad_norm": 0.47764732381060815, + "learning_rate": 0.00016959874968102735, + "loss": 0.8668, + "step": 87 + }, + { + "epoch": 0.2816, + "grad_norm": 0.5391492918269589, + "learning_rate": 0.00016884803286362, + "loss": 0.9166, + "step": 88 + }, + { + "epoch": 0.2848, + "grad_norm": 0.5452368893224666, + "learning_rate": 0.00016808986575104465, + "loss": 0.9434, + "step": 89 + }, + { + "epoch": 0.288, + "grad_norm": 0.5068755875108313, + "learning_rate": 0.00016732433038731242, + "loss": 0.9381, + "step": 90 + }, + { + "epoch": 0.2912, + "grad_norm": 0.4561249918064821, + "learning_rate": 0.0001665515096137797, + "loss": 0.7897, + "step": 91 + }, + { + "epoch": 0.2944, + "grad_norm": 0.47303478518505127, + "learning_rate": 0.00016577148706018328, + "loss": 0.7576, + "step": 92 + }, + { + "epoch": 0.2976, + "grad_norm": 0.5364559677774835, + "learning_rate": 0.00016498434713559088, + "loss": 0.9363, + "step": 93 + }, + { + "epoch": 0.3008, + "grad_norm": 0.552194133620672, + "learning_rate": 0.00016419017501926656, + "loss": 0.8602, + "step": 94 + }, + { + "epoch": 0.304, + "grad_norm": 0.6092151390114966, + "learning_rate": 0.0001633890566514535, + "loss": 0.9392, + "step": 95 + }, + { + "epoch": 0.3072, + "grad_norm": 0.4469527891085572, + "learning_rate": 0.00016258107872407375, + "loss": 0.8284, + "step": 96 + }, + { + "epoch": 0.3104, + "grad_norm": 0.5466970963402339, + "learning_rate": 0.0001617663286713474, + "loss": 0.8422, + "step": 97 + }, + { + "epoch": 0.3136, + "grad_norm": 0.4793293027604878, + "learning_rate": 0.00016094489466033043, + "loss": 0.8016, + "step": 98 + }, + { + "epoch": 0.3168, + "grad_norm": 0.5574466292258964, + "learning_rate": 0.00016011686558137448, + "loss": 0.8667, + "step": 99 + }, + { + "epoch": 0.32, + "grad_norm": 0.4607255432613611, + "learning_rate": 0.0001592823310385073, + "loss": 0.8439, + "step": 100 + }, + { + "epoch": 0.3232, + "grad_norm": 0.5861238704023553, + "learning_rate": 0.0001584413813397364, + "loss": 0.8349, + "step": 101 + }, + { + "epoch": 0.3264, + "grad_norm": 0.5067045689470819, + "learning_rate": 0.00015759410748727662, + "loss": 0.8722, + "step": 102 + }, + { + "epoch": 0.3296, + "grad_norm": 0.5745265335709175, + "learning_rate": 0.00015674060116770236, + "loss": 0.9768, + "step": 103 + }, + { + "epoch": 0.3328, + "grad_norm": 0.6483302411754751, + "learning_rate": 0.00015588095474202595, + "loss": 0.8501, + "step": 104 + }, + { + "epoch": 0.336, + "grad_norm": 0.5692191325540262, + "learning_rate": 0.00015501526123570277, + "loss": 0.9932, + "step": 105 + }, + { + "epoch": 0.3392, + "grad_norm": 0.7082524782346505, + "learning_rate": 0.00015414361432856475, + "loss": 0.9979, + "step": 106 + }, + { + "epoch": 0.3424, + "grad_norm": 0.4700388644962814, + "learning_rate": 0.0001532661083446829, + "loss": 0.8249, + "step": 107 + }, + { + "epoch": 0.3456, + "grad_norm": 0.4565116683642803, + "learning_rate": 0.00015238283824216015, + "loss": 0.7952, + "step": 108 + }, + { + "epoch": 0.3488, + "grad_norm": 0.5252540630460191, + "learning_rate": 0.00015149389960285558, + "loss": 0.8311, + "step": 109 + }, + { + "epoch": 0.352, + "grad_norm": 0.5200739744170301, + "learning_rate": 0.00015059938862204127, + "loss": 0.8811, + "step": 110 + }, + { + "epoch": 0.3552, + "grad_norm": 0.44622703844460526, + "learning_rate": 0.00014969940209799248, + "loss": 0.8323, + "step": 111 + }, + { + "epoch": 0.3584, + "grad_norm": 0.500576250224737, + "learning_rate": 0.00014879403742151283, + "loss": 0.883, + "step": 112 + }, + { + "epoch": 0.3616, + "grad_norm": 0.46685072068441685, + "learning_rate": 0.00014788339256539544, + "loss": 0.8068, + "step": 113 + }, + { + "epoch": 0.3648, + "grad_norm": 0.49083712897749426, + "learning_rate": 0.0001469675660738206, + "loss": 0.9192, + "step": 114 + }, + { + "epoch": 0.368, + "grad_norm": 0.41453739036231607, + "learning_rate": 0.00014604665705169237, + "loss": 0.7524, + "step": 115 + }, + { + "epoch": 0.3712, + "grad_norm": 0.4723495222814731, + "learning_rate": 0.00014512076515391375, + "loss": 0.8452, + "step": 116 + }, + { + "epoch": 0.3744, + "grad_norm": 0.6478344448359308, + "learning_rate": 0.00014418999057460276, + "loss": 1.0142, + "step": 117 + }, + { + "epoch": 0.3776, + "grad_norm": 0.5144296129212168, + "learning_rate": 0.0001432544340362501, + "loss": 0.8394, + "step": 118 + }, + { + "epoch": 0.3808, + "grad_norm": 0.6328551875817262, + "learning_rate": 0.00014231419677881966, + "loss": 1.0413, + "step": 119 + }, + { + "epoch": 0.384, + "grad_norm": 0.42502713429252176, + "learning_rate": 0.00014136938054879283, + "loss": 0.7571, + "step": 120 + }, + { + "epoch": 0.3872, + "grad_norm": 0.544857543401765, + "learning_rate": 0.00014042008758815818, + "loss": 0.9066, + "step": 121 + }, + { + "epoch": 0.3904, + "grad_norm": 0.5182206067034031, + "learning_rate": 0.00013946642062334766, + "loss": 0.8623, + "step": 122 + }, + { + "epoch": 0.3936, + "grad_norm": 0.5418868636953043, + "learning_rate": 0.00013850848285411994, + "loss": 0.8537, + "step": 123 + }, + { + "epoch": 0.3968, + "grad_norm": 0.5521729198502818, + "learning_rate": 0.000137546377942393, + "loss": 0.944, + "step": 124 + }, + { + "epoch": 0.4, + "grad_norm": 0.4584591055005934, + "learning_rate": 0.00013658021000102636, + "loss": 0.8743, + "step": 125 + }, + { + "epoch": 0.4032, + "grad_norm": 0.5864975864579389, + "learning_rate": 0.00013561008358255468, + "loss": 0.9845, + "step": 126 + }, + { + "epoch": 0.4064, + "grad_norm": 0.5078042178534576, + "learning_rate": 0.00013463610366787392, + "loss": 0.8262, + "step": 127 + }, + { + "epoch": 0.4096, + "grad_norm": 0.4581746905213, + "learning_rate": 0.00013365837565488064, + "loss": 0.827, + "step": 128 + }, + { + "epoch": 0.4128, + "grad_norm": 0.5188671706161625, + "learning_rate": 0.0001326770053470668, + "loss": 0.8164, + "step": 129 + }, + { + "epoch": 0.416, + "grad_norm": 0.5207471584962147, + "learning_rate": 0.0001316920989420703, + "loss": 0.9056, + "step": 130 + }, + { + "epoch": 0.4192, + "grad_norm": 0.480373287574047, + "learning_rate": 0.00013070376302018287, + "loss": 0.8821, + "step": 131 + }, + { + "epoch": 0.4224, + "grad_norm": 0.5873723875897096, + "learning_rate": 0.00012971210453281674, + "loss": 0.9182, + "step": 132 + }, + { + "epoch": 0.4256, + "grad_norm": 0.46572587728199677, + "learning_rate": 0.000128717230790931, + "loss": 0.7844, + "step": 133 + }, + { + "epoch": 0.4288, + "grad_norm": 0.5329092227933953, + "learning_rate": 0.00012771924945341906, + "loss": 0.9006, + "step": 134 + }, + { + "epoch": 0.432, + "grad_norm": 0.47219789926354094, + "learning_rate": 0.00012671826851545851, + "loss": 0.8032, + "step": 135 + }, + { + "epoch": 0.4352, + "grad_norm": 0.6436238273337977, + "learning_rate": 0.0001257143962968246, + "loss": 0.8897, + "step": 136 + }, + { + "epoch": 0.4384, + "grad_norm": 0.44969251141355987, + "learning_rate": 0.00012470774143016853, + "loss": 0.8098, + "step": 137 + }, + { + "epoch": 0.4416, + "grad_norm": 0.7065052879539846, + "learning_rate": 0.00012369841284926188, + "loss": 0.9539, + "step": 138 + }, + { + "epoch": 0.4448, + "grad_norm": 0.49430107652669947, + "learning_rate": 0.00012268651977720866, + "loss": 0.8396, + "step": 139 + }, + { + "epoch": 0.448, + "grad_norm": 0.44278337903710574, + "learning_rate": 0.00012167217171462566, + "loss": 0.7493, + "step": 140 + }, + { + "epoch": 0.4512, + "grad_norm": 0.45657585903503006, + "learning_rate": 0.0001206554784277931, + "loss": 0.8399, + "step": 141 + }, + { + "epoch": 0.4544, + "grad_norm": 0.5594305788233783, + "learning_rate": 0.00011963654993677645, + "loss": 0.7511, + "step": 142 + }, + { + "epoch": 0.4576, + "grad_norm": 0.4731245915223696, + "learning_rate": 0.00011861549650352069, + "loss": 0.8638, + "step": 143 + }, + { + "epoch": 0.4608, + "grad_norm": 0.44866720451901915, + "learning_rate": 0.00011759242861991855, + "loss": 0.7519, + "step": 144 + }, + { + "epoch": 0.464, + "grad_norm": 0.5049157207422073, + "learning_rate": 0.00011656745699585371, + "loss": 0.8531, + "step": 145 + }, + { + "epoch": 0.4672, + "grad_norm": 0.4622961459788511, + "learning_rate": 0.00011554069254722051, + "loss": 0.7997, + "step": 146 + }, + { + "epoch": 0.4704, + "grad_norm": 0.5011867957087826, + "learning_rate": 0.00011451224638392129, + "loss": 0.8336, + "step": 147 + }, + { + "epoch": 0.4736, + "grad_norm": 0.44473343514751135, + "learning_rate": 0.00011348222979784289, + "loss": 0.8141, + "step": 148 + }, + { + "epoch": 0.4768, + "grad_norm": 0.49174176435012285, + "learning_rate": 0.00011245075425081328, + "loss": 0.9179, + "step": 149 + }, + { + "epoch": 0.48, + "grad_norm": 0.4212233224278149, + "learning_rate": 0.00011141793136253986, + "loss": 0.756, + "step": 150 + }, + { + "epoch": 0.4832, + "grad_norm": 0.42131704733259884, + "learning_rate": 0.0001103838728985307, + "loss": 0.774, + "step": 151 + }, + { + "epoch": 0.4864, + "grad_norm": 0.43434312084011045, + "learning_rate": 0.000109348690758, + "loss": 0.7475, + "step": 152 + }, + { + "epoch": 0.4896, + "grad_norm": 0.5378430206230584, + "learning_rate": 0.00010831249696175918, + "loss": 0.8769, + "step": 153 + }, + { + "epoch": 0.4928, + "grad_norm": 0.4227908343847277, + "learning_rate": 0.0001072754036400944, + "loss": 0.7987, + "step": 154 + }, + { + "epoch": 0.496, + "grad_norm": 0.4613617814492636, + "learning_rate": 0.00010623752302063283, + "loss": 0.8833, + "step": 155 + }, + { + "epoch": 0.4992, + "grad_norm": 0.40191007335151063, + "learning_rate": 0.00010519896741619803, + "loss": 0.7724, + "step": 156 + }, + { + "epoch": 0.5024, + "grad_norm": 0.5874932967179956, + "learning_rate": 0.00010415984921265609, + "loss": 0.9077, + "step": 157 + }, + { + "epoch": 0.5056, + "grad_norm": 0.47018012728633896, + "learning_rate": 0.00010312028085675391, + "loss": 0.8459, + "step": 158 + }, + { + "epoch": 0.5088, + "grad_norm": 0.5135569343532309, + "learning_rate": 0.00010208037484395114, + "loss": 0.8724, + "step": 159 + }, + { + "epoch": 0.512, + "grad_norm": 0.4711547922176534, + "learning_rate": 0.00010104024370624644, + "loss": 0.8732, + "step": 160 + }, + { + "epoch": 0.5152, + "grad_norm": 0.4605391208636547, + "learning_rate": 0.0001, + "loss": 0.7958, + "step": 161 + }, + { + "epoch": 0.5184, + "grad_norm": 0.5162693845942066, + "learning_rate": 9.895975629375359e-05, + "loss": 0.8359, + "step": 162 + }, + { + "epoch": 0.5216, + "grad_norm": 0.6790590303772195, + "learning_rate": 9.791962515604887e-05, + "loss": 0.9214, + "step": 163 + }, + { + "epoch": 0.5248, + "grad_norm": 0.4154627258730887, + "learning_rate": 9.687971914324607e-05, + "loss": 0.7592, + "step": 164 + }, + { + "epoch": 0.528, + "grad_norm": 0.49204958681426475, + "learning_rate": 9.584015078734395e-05, + "loss": 0.8947, + "step": 165 + }, + { + "epoch": 0.5312, + "grad_norm": 0.48299009755135697, + "learning_rate": 9.480103258380198e-05, + "loss": 0.8093, + "step": 166 + }, + { + "epoch": 0.5344, + "grad_norm": 1.3967726735458745, + "learning_rate": 9.376247697936719e-05, + "loss": 0.7723, + "step": 167 + }, + { + "epoch": 0.5376, + "grad_norm": 0.47547860623397964, + "learning_rate": 9.272459635990562e-05, + "loss": 0.8072, + "step": 168 + }, + { + "epoch": 0.5408, + "grad_norm": 0.5521765898351926, + "learning_rate": 9.168750303824084e-05, + "loss": 0.8858, + "step": 169 + }, + { + "epoch": 0.544, + "grad_norm": 0.5403612499302933, + "learning_rate": 9.065130924199998e-05, + "loss": 0.7971, + "step": 170 + }, + { + "epoch": 0.5472, + "grad_norm": 0.5678871132109066, + "learning_rate": 8.961612710146934e-05, + "loss": 0.8678, + "step": 171 + }, + { + "epoch": 0.5504, + "grad_norm": 0.5085771385434323, + "learning_rate": 8.858206863746018e-05, + "loss": 0.8054, + "step": 172 + }, + { + "epoch": 0.5536, + "grad_norm": 0.5265619363777059, + "learning_rate": 8.754924574918675e-05, + "loss": 0.9647, + "step": 173 + }, + { + "epoch": 0.5568, + "grad_norm": 0.4905195302788644, + "learning_rate": 8.651777020215712e-05, + "loss": 0.868, + "step": 174 + }, + { + "epoch": 0.56, + "grad_norm": 0.5282205277029001, + "learning_rate": 8.548775361607872e-05, + "loss": 0.8057, + "step": 175 + }, + { + "epoch": 0.5632, + "grad_norm": 0.5404677713988845, + "learning_rate": 8.445930745277953e-05, + "loss": 0.919, + "step": 176 + }, + { + "epoch": 0.5664, + "grad_norm": 0.5081939196669241, + "learning_rate": 8.343254300414628e-05, + "loss": 0.8112, + "step": 177 + }, + { + "epoch": 0.5696, + "grad_norm": 0.701368188891065, + "learning_rate": 8.240757138008149e-05, + "loss": 0.9355, + "step": 178 + }, + { + "epoch": 0.5728, + "grad_norm": 0.47644014020348635, + "learning_rate": 8.138450349647936e-05, + "loss": 0.8545, + "step": 179 + }, + { + "epoch": 0.576, + "grad_norm": 0.4479800438378306, + "learning_rate": 8.036345006322359e-05, + "loss": 0.803, + "step": 180 + }, + { + "epoch": 0.5792, + "grad_norm": 0.4568365659077264, + "learning_rate": 7.934452157220694e-05, + "loss": 0.721, + "step": 181 + }, + { + "epoch": 0.5824, + "grad_norm": 0.5628625002310027, + "learning_rate": 7.832782828537437e-05, + "loss": 0.8664, + "step": 182 + }, + { + "epoch": 0.5856, + "grad_norm": 0.39193950097624564, + "learning_rate": 7.731348022279134e-05, + "loss": 0.7784, + "step": 183 + }, + { + "epoch": 0.5888, + "grad_norm": 0.44621968664447376, + "learning_rate": 7.630158715073813e-05, + "loss": 0.8036, + "step": 184 + }, + { + "epoch": 0.592, + "grad_norm": 0.6552546382007541, + "learning_rate": 7.52922585698315e-05, + "loss": 0.8681, + "step": 185 + }, + { + "epoch": 0.5952, + "grad_norm": 0.4444323128940648, + "learning_rate": 7.428560370317542e-05, + "loss": 0.8249, + "step": 186 + }, + { + "epoch": 0.5984, + "grad_norm": 0.4096211039326677, + "learning_rate": 7.328173148454151e-05, + "loss": 0.8002, + "step": 187 + }, + { + "epoch": 0.6016, + "grad_norm": 0.41374321969091965, + "learning_rate": 7.228075054658096e-05, + "loss": 0.8164, + "step": 188 + }, + { + "epoch": 0.6048, + "grad_norm": 0.6486742789837998, + "learning_rate": 7.1282769209069e-05, + "loss": 1.0459, + "step": 189 + }, + { + "epoch": 0.608, + "grad_norm": 0.5897416604730881, + "learning_rate": 7.028789546718326e-05, + "loss": 0.8359, + "step": 190 + }, + { + "epoch": 0.6112, + "grad_norm": 0.46776013489921014, + "learning_rate": 6.929623697981718e-05, + "loss": 0.7546, + "step": 191 + }, + { + "epoch": 0.6144, + "grad_norm": 0.5175426743395212, + "learning_rate": 6.830790105792973e-05, + "loss": 0.8506, + "step": 192 + }, + { + "epoch": 0.6176, + "grad_norm": 0.7039149380745073, + "learning_rate": 6.732299465293322e-05, + "loss": 0.9374, + "step": 193 + }, + { + "epoch": 0.6208, + "grad_norm": 0.45067267248124315, + "learning_rate": 6.63416243451194e-05, + "loss": 0.8746, + "step": 194 + }, + { + "epoch": 0.624, + "grad_norm": 0.549831797921143, + "learning_rate": 6.536389633212609e-05, + "loss": 0.9139, + "step": 195 + }, + { + "epoch": 0.6272, + "grad_norm": 0.3967114644899662, + "learning_rate": 6.43899164174453e-05, + "loss": 0.7546, + "step": 196 + }, + { + "epoch": 0.6304, + "grad_norm": 0.46586065767273077, + "learning_rate": 6.341978999897365e-05, + "loss": 0.7676, + "step": 197 + }, + { + "epoch": 0.6336, + "grad_norm": 0.5225256866262378, + "learning_rate": 6.245362205760704e-05, + "loss": 0.7611, + "step": 198 + }, + { + "epoch": 0.6368, + "grad_norm": 0.463024339644472, + "learning_rate": 6.149151714588009e-05, + "loss": 0.7903, + "step": 199 + }, + { + "epoch": 0.64, + "grad_norm": 0.41252826026591827, + "learning_rate": 6.053357937665237e-05, + "loss": 0.7522, + "step": 200 + }, + { + "epoch": 0.6432, + "grad_norm": 0.5835285744859474, + "learning_rate": 5.957991241184184e-05, + "loss": 0.8858, + "step": 201 + }, + { + "epoch": 0.6464, + "grad_norm": 0.449279529912864, + "learning_rate": 5.863061945120719e-05, + "loss": 0.7245, + "step": 202 + }, + { + "epoch": 0.6496, + "grad_norm": 0.6482288766459807, + "learning_rate": 5.768580322118034e-05, + "loss": 0.9016, + "step": 203 + }, + { + "epoch": 0.6528, + "grad_norm": 0.4594998006796061, + "learning_rate": 5.6745565963749925e-05, + "loss": 0.782, + "step": 204 + }, + { + "epoch": 0.656, + "grad_norm": 0.8646261954648369, + "learning_rate": 5.5810009425397294e-05, + "loss": 0.8832, + "step": 205 + }, + { + "epoch": 0.6592, + "grad_norm": 0.6800798821111771, + "learning_rate": 5.487923484608629e-05, + "loss": 0.9053, + "step": 206 + }, + { + "epoch": 0.6624, + "grad_norm": 0.4255077691036116, + "learning_rate": 5.395334294830765e-05, + "loss": 0.7375, + "step": 207 + }, + { + "epoch": 0.6656, + "grad_norm": 0.41644886027642136, + "learning_rate": 5.3032433926179395e-05, + "loss": 0.8249, + "step": 208 + }, + { + "epoch": 0.6688, + "grad_norm": 0.5209064217397174, + "learning_rate": 5.211660743460458e-05, + "loss": 0.7697, + "step": 209 + }, + { + "epoch": 0.672, + "grad_norm": 0.5498352812442217, + "learning_rate": 5.1205962578487155e-05, + "loss": 0.7928, + "step": 210 + }, + { + "epoch": 0.6752, + "grad_norm": 0.5413265947767266, + "learning_rate": 5.030059790200756e-05, + "loss": 0.8645, + "step": 211 + }, + { + "epoch": 0.6784, + "grad_norm": 0.5288103110126229, + "learning_rate": 4.940061137795876e-05, + "loss": 0.8829, + "step": 212 + }, + { + "epoch": 0.6816, + "grad_norm": 0.5011933130582199, + "learning_rate": 4.850610039714444e-05, + "loss": 0.842, + "step": 213 + }, + { + "epoch": 0.6848, + "grad_norm": 0.45515517378700693, + "learning_rate": 4.761716175783989e-05, + "loss": 0.8259, + "step": 214 + }, + { + "epoch": 0.688, + "grad_norm": 0.6101065320239799, + "learning_rate": 4.673389165531714e-05, + "loss": 0.9956, + "step": 215 + }, + { + "epoch": 0.6912, + "grad_norm": 0.46840904584275894, + "learning_rate": 4.585638567143529e-05, + "loss": 0.8131, + "step": 216 + }, + { + "epoch": 0.6944, + "grad_norm": 0.3663759497337274, + "learning_rate": 4.498473876429726e-05, + "loss": 0.6838, + "step": 217 + }, + { + "epoch": 0.6976, + "grad_norm": 0.4937025705477826, + "learning_rate": 4.411904525797408e-05, + "loss": 0.834, + "step": 218 + }, + { + "epoch": 0.7008, + "grad_norm": 0.4595324612373292, + "learning_rate": 4.325939883229766e-05, + "loss": 0.8017, + "step": 219 + }, + { + "epoch": 0.704, + "grad_norm": 0.47738605752675983, + "learning_rate": 4.240589251272342e-05, + "loss": 0.8044, + "step": 220 + }, + { + "epoch": 0.7072, + "grad_norm": 0.5332094621243066, + "learning_rate": 4.155861866026364e-05, + "loss": 0.8264, + "step": 221 + }, + { + "epoch": 0.7104, + "grad_norm": 0.41762127444184166, + "learning_rate": 4.071766896149273e-05, + "loss": 0.7573, + "step": 222 + }, + { + "epoch": 0.7136, + "grad_norm": 0.5026812226060394, + "learning_rate": 3.988313441862553e-05, + "loss": 0.8486, + "step": 223 + }, + { + "epoch": 0.7168, + "grad_norm": 0.4665822142084424, + "learning_rate": 3.9055105339669595e-05, + "loss": 0.7489, + "step": 224 + }, + { + "epoch": 0.72, + "grad_norm": 0.577246877272033, + "learning_rate": 3.823367132865265e-05, + "loss": 0.8626, + "step": 225 + }, + { + "epoch": 0.7232, + "grad_norm": 0.5765141180700714, + "learning_rate": 3.741892127592625e-05, + "loss": 0.7646, + "step": 226 + }, + { + "epoch": 0.7264, + "grad_norm": 0.4777580846188267, + "learning_rate": 3.6610943348546526e-05, + "loss": 0.7975, + "step": 227 + }, + { + "epoch": 0.7296, + "grad_norm": 0.561248592643386, + "learning_rate": 3.580982498073344e-05, + "loss": 0.8662, + "step": 228 + }, + { + "epoch": 0.7328, + "grad_norm": 0.5297059368674798, + "learning_rate": 3.501565286440914e-05, + "loss": 0.9775, + "step": 229 + }, + { + "epoch": 0.736, + "grad_norm": 0.7058474858470783, + "learning_rate": 3.422851293981676e-05, + "loss": 0.8818, + "step": 230 + }, + { + "epoch": 0.7392, + "grad_norm": 0.5339336266613487, + "learning_rate": 3.3448490386220355e-05, + "loss": 0.8618, + "step": 231 + }, + { + "epoch": 0.7424, + "grad_norm": 0.4834312926665495, + "learning_rate": 3.2675669612687565e-05, + "loss": 0.7587, + "step": 232 + }, + { + "epoch": 0.7456, + "grad_norm": 0.581475851985833, + "learning_rate": 3.191013424895536e-05, + "loss": 0.8332, + "step": 233 + }, + { + "epoch": 0.7488, + "grad_norm": 0.5109554184489444, + "learning_rate": 3.115196713638e-05, + "loss": 0.7822, + "step": 234 + }, + { + "epoch": 0.752, + "grad_norm": 0.4843338647071073, + "learning_rate": 3.040125031897264e-05, + "loss": 0.8022, + "step": 235 + }, + { + "epoch": 0.7552, + "grad_norm": 0.44693796537617286, + "learning_rate": 2.9658065034520978e-05, + "loss": 0.8174, + "step": 236 + }, + { + "epoch": 0.7584, + "grad_norm": 0.43703907272285714, + "learning_rate": 2.892249170579826e-05, + "loss": 0.8016, + "step": 237 + }, + { + "epoch": 0.7616, + "grad_norm": 0.49899162998005303, + "learning_rate": 2.8194609931860316e-05, + "loss": 0.7687, + "step": 238 + }, + { + "epoch": 0.7648, + "grad_norm": 0.42034349795266884, + "learning_rate": 2.7474498479432087e-05, + "loss": 0.7104, + "step": 239 + }, + { + "epoch": 0.768, + "grad_norm": 0.4479587754924112, + "learning_rate": 2.6762235274383772e-05, + "loss": 0.7968, + "step": 240 + }, + { + "epoch": 0.7712, + "grad_norm": 0.4244693351073528, + "learning_rate": 2.6057897393298324e-05, + "loss": 0.6858, + "step": 241 + }, + { + "epoch": 0.7744, + "grad_norm": 0.4589141075325882, + "learning_rate": 2.536156105513062e-05, + "loss": 0.7318, + "step": 242 + }, + { + "epoch": 0.7776, + "grad_norm": 0.47161139886206915, + "learning_rate": 2.4673301612959654e-05, + "loss": 0.8238, + "step": 243 + }, + { + "epoch": 0.7808, + "grad_norm": 0.47576380142421015, + "learning_rate": 2.399319354583418e-05, + "loss": 0.7973, + "step": 244 + }, + { + "epoch": 0.784, + "grad_norm": 0.493338423791926, + "learning_rate": 2.3321310450713062e-05, + "loss": 0.9004, + "step": 245 + }, + { + "epoch": 0.7872, + "grad_norm": 0.5100402517048367, + "learning_rate": 2.265772503450122e-05, + "loss": 0.7534, + "step": 246 + }, + { + "epoch": 0.7904, + "grad_norm": 0.5112008725560001, + "learning_rate": 2.2002509106181624e-05, + "loss": 0.8408, + "step": 247 + }, + { + "epoch": 0.7936, + "grad_norm": 0.41032245286755115, + "learning_rate": 2.1355733569044635e-05, + "loss": 0.7524, + "step": 248 + }, + { + "epoch": 0.7968, + "grad_norm": 0.43838327663853965, + "learning_rate": 2.0717468413015283e-05, + "loss": 0.7647, + "step": 249 + }, + { + "epoch": 0.8, + "grad_norm": 0.44398343084308467, + "learning_rate": 2.008778270707944e-05, + "loss": 0.7664, + "step": 250 + }, + { + "epoch": 0.8032, + "grad_norm": 0.4533979485061938, + "learning_rate": 1.946674459180955e-05, + "loss": 0.7551, + "step": 251 + }, + { + "epoch": 0.8064, + "grad_norm": 0.40177835310822313, + "learning_rate": 1.8854421271990964e-05, + "loss": 0.7808, + "step": 252 + }, + { + "epoch": 0.8096, + "grad_norm": 0.47981196343744154, + "learning_rate": 1.8250879009349398e-05, + "loss": 0.7776, + "step": 253 + }, + { + "epoch": 0.8128, + "grad_norm": 0.4616612061088841, + "learning_rate": 1.7656183115380577e-05, + "loss": 0.7976, + "step": 254 + }, + { + "epoch": 0.816, + "grad_norm": 0.5310570995829195, + "learning_rate": 1.707039794428259e-05, + "loss": 0.7568, + "step": 255 + }, + { + "epoch": 0.8192, + "grad_norm": 0.4361654306179982, + "learning_rate": 1.649358688599191e-05, + "loss": 0.8239, + "step": 256 + }, + { + "epoch": 0.8224, + "grad_norm": 0.4640496078272689, + "learning_rate": 1.5925812359323745e-05, + "loss": 0.8167, + "step": 257 + }, + { + "epoch": 0.8256, + "grad_norm": 0.443990966442962, + "learning_rate": 1.5367135805217458e-05, + "loss": 0.7608, + "step": 258 + }, + { + "epoch": 0.8288, + "grad_norm": 0.43147696653912865, + "learning_rate": 1.4817617680087825e-05, + "loss": 0.7399, + "step": 259 + }, + { + "epoch": 0.832, + "grad_norm": 0.4187729075670258, + "learning_rate": 1.4277317449282834e-05, + "loss": 0.7695, + "step": 260 + }, + { + "epoch": 0.8352, + "grad_norm": 0.5066517649172814, + "learning_rate": 1.3746293580648717e-05, + "loss": 0.7925, + "step": 261 + }, + { + "epoch": 0.8384, + "grad_norm": 0.40619158877482436, + "learning_rate": 1.3224603538202929e-05, + "loss": 0.7577, + "step": 262 + }, + { + "epoch": 0.8416, + "grad_norm": 0.366965454218498, + "learning_rate": 1.2712303775915802e-05, + "loss": 0.7599, + "step": 263 + }, + { + "epoch": 0.8448, + "grad_norm": 0.39262286158044624, + "learning_rate": 1.220944973160133e-05, + "loss": 0.7374, + "step": 264 + }, + { + "epoch": 0.848, + "grad_norm": 0.4832698354941956, + "learning_rate": 1.1716095820918216e-05, + "loss": 0.8533, + "step": 265 + }, + { + "epoch": 0.8512, + "grad_norm": 0.5287128800831273, + "learning_rate": 1.1232295431481222e-05, + "loss": 0.8266, + "step": 266 + }, + { + "epoch": 0.8544, + "grad_norm": 0.43576896063036646, + "learning_rate": 1.0758100917083991e-05, + "loss": 0.8129, + "step": 267 + }, + { + "epoch": 0.8576, + "grad_norm": 0.48231800241057715, + "learning_rate": 1.0293563592033595e-05, + "loss": 0.7813, + "step": 268 + }, + { + "epoch": 0.8608, + "grad_norm": 0.597075273021654, + "learning_rate": 9.838733725597615e-06, + "loss": 0.7735, + "step": 269 + }, + { + "epoch": 0.864, + "grad_norm": 0.6134868404814026, + "learning_rate": 9.393660536564408e-06, + "loss": 0.9541, + "step": 270 + }, + { + "epoch": 0.8672, + "grad_norm": 0.5919719755939148, + "learning_rate": 8.958392187916841e-06, + "loss": 0.94, + "step": 271 + }, + { + "epoch": 0.8704, + "grad_norm": 0.4588676200704114, + "learning_rate": 8.532975781620512e-06, + "loss": 0.7604, + "step": 272 + }, + { + "epoch": 0.8736, + "grad_norm": 0.4545424822405747, + "learning_rate": 8.117457353526625e-06, + "loss": 0.8459, + "step": 273 + }, + { + "epoch": 0.8768, + "grad_norm": 0.5154234986338997, + "learning_rate": 7.711881868390291e-06, + "loss": 0.7062, + "step": 274 + }, + { + "epoch": 0.88, + "grad_norm": 0.6061436738637195, + "learning_rate": 7.3162932150046885e-06, + "loss": 0.7213, + "step": 275 + }, + { + "epoch": 0.8832, + "grad_norm": 0.5690397032723647, + "learning_rate": 6.930734201451816e-06, + "loss": 0.806, + "step": 276 + }, + { + "epoch": 0.8864, + "grad_norm": 0.6929430015219535, + "learning_rate": 6.555246550469907e-06, + "loss": 0.9031, + "step": 277 + }, + { + "epoch": 0.8896, + "grad_norm": 0.4636024177548296, + "learning_rate": 6.189870894938587e-06, + "loss": 0.7045, + "step": 278 + }, + { + "epoch": 0.8928, + "grad_norm": 0.5590861235770141, + "learning_rate": 5.834646773481811e-06, + "loss": 0.8568, + "step": 279 + }, + { + "epoch": 0.896, + "grad_norm": 0.49918272298276084, + "learning_rate": 5.489612626189245e-06, + "loss": 0.8063, + "step": 280 + }, + { + "epoch": 0.8992, + "grad_norm": 0.5159744366222171, + "learning_rate": 5.154805790456485e-06, + "loss": 0.7546, + "step": 281 + }, + { + "epoch": 0.9024, + "grad_norm": 0.4454372767204154, + "learning_rate": 4.830262496944693e-06, + "loss": 0.7733, + "step": 282 + }, + { + "epoch": 0.9056, + "grad_norm": 0.40366650002590854, + "learning_rate": 4.516017865659949e-06, + "loss": 0.7058, + "step": 283 + }, + { + "epoch": 0.9088, + "grad_norm": 0.5773230798700426, + "learning_rate": 4.21210590215273e-06, + "loss": 0.8508, + "step": 284 + }, + { + "epoch": 0.912, + "grad_norm": 0.48932838121103617, + "learning_rate": 3.918559493838114e-06, + "loss": 0.7155, + "step": 285 + }, + { + "epoch": 0.9152, + "grad_norm": 0.49731206361843744, + "learning_rate": 3.6354104064368566e-06, + "loss": 0.8277, + "step": 286 + }, + { + "epoch": 0.9184, + "grad_norm": 0.37277364446149414, + "learning_rate": 3.3626892805379562e-06, + "loss": 0.7524, + "step": 287 + }, + { + "epoch": 0.9216, + "grad_norm": 0.4810886401899049, + "learning_rate": 3.100425628282899e-06, + "loss": 0.8317, + "step": 288 + }, + { + "epoch": 0.9248, + "grad_norm": 0.45269090426882497, + "learning_rate": 2.848647830172024e-06, + "loss": 0.8463, + "step": 289 + }, + { + "epoch": 0.928, + "grad_norm": 0.4737738227998511, + "learning_rate": 2.607383131993424e-06, + "loss": 0.7841, + "step": 290 + }, + { + "epoch": 0.9312, + "grad_norm": 0.6310130183124479, + "learning_rate": 2.3766576418745022e-06, + "loss": 0.9008, + "step": 291 + }, + { + "epoch": 0.9344, + "grad_norm": 0.42158071300460925, + "learning_rate": 2.1564963274568027e-06, + "loss": 0.6928, + "step": 292 + }, + { + "epoch": 0.9376, + "grad_norm": 0.5367004719693059, + "learning_rate": 1.9469230131940907e-06, + "loss": 0.7719, + "step": 293 + }, + { + "epoch": 0.9408, + "grad_norm": 0.6060188160837613, + "learning_rate": 1.7479603777742938e-06, + "loss": 0.9157, + "step": 294 + }, + { + "epoch": 0.944, + "grad_norm": 0.4869523495961573, + "learning_rate": 1.559629951665298e-06, + "loss": 0.75, + "step": 295 + }, + { + "epoch": 0.9472, + "grad_norm": 0.7323792302024977, + "learning_rate": 1.3819521147851123e-06, + "loss": 0.8785, + "step": 296 + }, + { + "epoch": 0.9504, + "grad_norm": 0.43686635101321636, + "learning_rate": 1.2149460942964098e-06, + "loss": 0.8171, + "step": 297 + }, + { + "epoch": 0.9536, + "grad_norm": 0.44734722608116156, + "learning_rate": 1.05862996252597e-06, + "loss": 0.8478, + "step": 298 + }, + { + "epoch": 0.9568, + "grad_norm": 0.4646935536379244, + "learning_rate": 9.130206350089765e-07, + "loss": 0.7162, + "step": 299 + }, + { + "epoch": 0.96, + "grad_norm": 0.5157929065786432, + "learning_rate": 7.781338686584927e-07, + "loss": 0.8251, + "step": 300 + }, + { + "epoch": 0.9632, + "grad_norm": 0.7275381901396758, + "learning_rate": 6.539842600603918e-07, + "loss": 0.9507, + "step": 301 + }, + { + "epoch": 0.9664, + "grad_norm": 0.4183990280164646, + "learning_rate": 5.405852438937764e-07, + "loss": 0.7522, + "step": 302 + }, + { + "epoch": 0.9696, + "grad_norm": 0.5457365142535572, + "learning_rate": 4.3794909147720773e-07, + "loss": 0.8629, + "step": 303 + }, + { + "epoch": 0.9728, + "grad_norm": 0.49843708727416114, + "learning_rate": 3.4608690944071263e-07, + "loss": 0.7987, + "step": 304 + }, + { + "epoch": 0.976, + "grad_norm": 0.4146234731795237, + "learning_rate": 2.6500863852395584e-07, + "loss": 0.8141, + "step": 305 + }, + { + "epoch": 0.9792, + "grad_norm": 0.5053500587148759, + "learning_rate": 1.947230525005006e-07, + "loss": 0.8026, + "step": 306 + }, + { + "epoch": 0.9824, + "grad_norm": 0.6588712440547313, + "learning_rate": 1.3523775722834587e-07, + "loss": 0.8751, + "step": 307 + }, + { + "epoch": 0.9856, + "grad_norm": 0.45240332151104545, + "learning_rate": 8.655918982689581e-08, + "loss": 0.7212, + "step": 308 + }, + { + "epoch": 0.9888, + "grad_norm": 0.6437603542272565, + "learning_rate": 4.8692617980350406e-08, + "loss": 0.7909, + "step": 309 + }, + { + "epoch": 0.992, + "grad_norm": 0.5013273996649436, + "learning_rate": 2.164213936770576e-08, + "loss": 0.8622, + "step": 310 + }, + { + "epoch": 0.9952, + "grad_norm": 0.47420264885035235, + "learning_rate": 5.410681219286673e-09, + "loss": 0.7697, + "step": 311 + }, + { + "epoch": 0.9984, + "grad_norm": 0.564032619501475, + "learning_rate": 0.0, + "loss": 0.7968, + "step": 312 + }, + { + "epoch": 0.9984, + "step": 312, + "total_flos": 262275390177280.0, + "train_loss": 0.859515962501367, + "train_runtime": 4688.9181, + "train_samples_per_second": 1.066, + "train_steps_per_second": 0.067 + } + ], + "logging_steps": 1.0, + "max_steps": 312, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 262275390177280.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_2_epochs_1_GA_4_lora/README.md b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_2_epochs_1_GA_4_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_2_epochs_1_GA_4_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_2_epochs_1_GA_4_lora/adapter_config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_2_epochs_1_GA_4_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..351e80d947298d787e4461efdc69c87ae5993517 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_2_epochs_1_GA_4_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj", + "down_proj", + "o_proj", + "k_proj", + "up_proj", + "gate_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_2_epochs_1_GA_4_lora/adapter_model.safetensors b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_2_epochs_1_GA_4_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..af30dbbf5d30aef832bdcbae95f7fc31a15967f5 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_2_epochs_1_GA_4_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:808d35cb6452a2953789a7d20aa8a1c96c5c3cedaf37f9a8e8c7ddcfdd38315a +size 671150064 diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_2_epochs_1_GA_4_lora/config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_2_epochs_1_GA_4_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_2_epochs_1_GA_4_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_2_epochs_1_GA_4_lora/non_lora_trainables.bin b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_2_epochs_1_GA_4_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..c5f876cec900beb9a602eca825f0509b847206dd --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_2_epochs_1_GA_4_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c91de22c69677f2d751b762b1108d0eccbd85445f907b041626f609e8672a555 +size 918507402 diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_2_epochs_1_GA_4_lora/trainer_state.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_2_epochs_1_GA_4_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..12eb455df334952bcb878c781a8e4de871bf4cba --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_2_epochs_1_GA_4_lora/trainer_state.json @@ -0,0 +1,1134 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9984, + "eval_steps": 500, + "global_step": 156, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0064, + "grad_norm": 0.6531013943101484, + "learning_rate": 4e-05, + "loss": 1.1983, + "step": 1 + }, + { + "epoch": 0.0128, + "grad_norm": 0.7243027041411867, + "learning_rate": 8e-05, + "loss": 1.352, + "step": 2 + }, + { + "epoch": 0.0192, + "grad_norm": 0.6059556630796344, + "learning_rate": 0.00012, + "loss": 1.1418, + "step": 3 + }, + { + "epoch": 0.0256, + "grad_norm": 0.5897789101722263, + "learning_rate": 0.00016, + "loss": 1.1574, + "step": 4 + }, + { + "epoch": 0.032, + "grad_norm": 0.9907059972693204, + "learning_rate": 0.0002, + "loss": 1.1703, + "step": 5 + }, + { + "epoch": 0.0384, + "grad_norm": 0.7153029742668117, + "learning_rate": 0.0001999783578606323, + "loss": 1.054, + "step": 6 + }, + { + "epoch": 0.0448, + "grad_norm": 0.5162952305618812, + "learning_rate": 0.0001999134408101731, + "loss": 1.0066, + "step": 7 + }, + { + "epoch": 0.0512, + "grad_norm": 0.4560437547903583, + "learning_rate": 0.00019980527694749952, + "loss": 0.935, + "step": 8 + }, + { + "epoch": 0.0576, + "grad_norm": 0.4591730374379789, + "learning_rate": 0.0001996539130905593, + "loss": 0.9422, + "step": 9 + }, + { + "epoch": 0.064, + "grad_norm": 0.4316152901290879, + "learning_rate": 0.00019945941475610623, + "loss": 0.9307, + "step": 10 + }, + { + "epoch": 0.0704, + "grad_norm": 0.5647897788044418, + "learning_rate": 0.0001992218661313415, + "loss": 1.0211, + "step": 11 + }, + { + "epoch": 0.0768, + "grad_norm": 0.48294255234229355, + "learning_rate": 0.00019894137003747403, + "loss": 0.9788, + "step": 12 + }, + { + "epoch": 0.0832, + "grad_norm": 0.5096172315529667, + "learning_rate": 0.00019861804788521493, + "loss": 0.9426, + "step": 13 + }, + { + "epoch": 0.0896, + "grad_norm": 0.44088678017689265, + "learning_rate": 0.00019825203962222572, + "loss": 0.9372, + "step": 14 + }, + { + "epoch": 0.096, + "grad_norm": 0.4682252458499051, + "learning_rate": 0.00019784350367254322, + "loss": 0.9121, + "step": 15 + }, + { + "epoch": 0.1024, + "grad_norm": 0.39797730428554234, + "learning_rate": 0.0001973926168680066, + "loss": 0.8869, + "step": 16 + }, + { + "epoch": 0.1088, + "grad_norm": 0.3864393862644587, + "learning_rate": 0.0001968995743717171, + "loss": 0.8581, + "step": 17 + }, + { + "epoch": 0.1152, + "grad_norm": 0.4188942691273693, + "learning_rate": 0.00019636458959356316, + "loss": 0.9515, + "step": 18 + }, + { + "epoch": 0.1216, + "grad_norm": 0.36548225855019706, + "learning_rate": 0.00019578789409784727, + "loss": 0.8751, + "step": 19 + }, + { + "epoch": 0.128, + "grad_norm": 0.411547543546142, + "learning_rate": 0.00019516973750305532, + "loss": 0.9719, + "step": 20 + }, + { + "epoch": 0.1344, + "grad_norm": 0.39409864447281595, + "learning_rate": 0.00019451038737381077, + "loss": 0.8776, + "step": 21 + }, + { + "epoch": 0.1408, + "grad_norm": 0.4022259169634103, + "learning_rate": 0.00019381012910506146, + "loss": 0.9015, + "step": 22 + }, + { + "epoch": 0.1472, + "grad_norm": 0.3754872706030276, + "learning_rate": 0.00019306926579854821, + "loss": 0.9216, + "step": 23 + }, + { + "epoch": 0.1536, + "grad_norm": 0.4224595812307275, + "learning_rate": 0.0001922881181316097, + "loss": 0.9599, + "step": 24 + }, + { + "epoch": 0.16, + "grad_norm": 0.5223489655078559, + "learning_rate": 0.0001914670242183795, + "loss": 0.9646, + "step": 25 + }, + { + "epoch": 0.1664, + "grad_norm": 0.35308810704754584, + "learning_rate": 0.0001906063394634356, + "loss": 0.8193, + "step": 26 + }, + { + "epoch": 0.1728, + "grad_norm": 0.388007287057223, + "learning_rate": 0.00018970643640796642, + "loss": 0.9603, + "step": 27 + }, + { + "epoch": 0.1792, + "grad_norm": 0.38353615191183243, + "learning_rate": 0.00018876770456851877, + "loss": 0.885, + "step": 28 + }, + { + "epoch": 0.1856, + "grad_norm": 0.3984587603283565, + "learning_rate": 0.00018779055026839868, + "loss": 0.9116, + "step": 29 + }, + { + "epoch": 0.192, + "grad_norm": 0.3895086416416209, + "learning_rate": 0.00018677539646179707, + "loss": 0.9265, + "step": 30 + }, + { + "epoch": 0.1984, + "grad_norm": 0.39407543468034756, + "learning_rate": 0.00018572268255071718, + "loss": 0.8866, + "step": 31 + }, + { + "epoch": 0.2048, + "grad_norm": 0.41353543458783343, + "learning_rate": 0.00018463286419478255, + "loss": 0.8962, + "step": 32 + }, + { + "epoch": 0.2112, + "grad_norm": 0.3898100986337053, + "learning_rate": 0.00018350641311400812, + "loss": 0.8536, + "step": 33 + }, + { + "epoch": 0.2176, + "grad_norm": 0.44138844308168373, + "learning_rate": 0.00018234381688461942, + "loss": 0.9389, + "step": 34 + }, + { + "epoch": 0.224, + "grad_norm": 0.38403391055992697, + "learning_rate": 0.00018114557872800905, + "loss": 0.9249, + "step": 35 + }, + { + "epoch": 0.2304, + "grad_norm": 0.3554715924546394, + "learning_rate": 0.0001799122172929206, + "loss": 0.8369, + "step": 36 + }, + { + "epoch": 0.2368, + "grad_norm": 0.38916688084355255, + "learning_rate": 0.0001786442664309554, + "loss": 0.8834, + "step": 37 + }, + { + "epoch": 0.2432, + "grad_norm": 0.37673094982431743, + "learning_rate": 0.0001773422749654988, + "loss": 0.8339, + "step": 38 + }, + { + "epoch": 0.2496, + "grad_norm": 0.4410145022484197, + "learning_rate": 0.00017600680645416583, + "loss": 0.9644, + "step": 39 + }, + { + "epoch": 0.256, + "grad_norm": 0.4219801789021562, + "learning_rate": 0.00017463843894486937, + "loss": 0.8563, + "step": 40 + }, + { + "epoch": 0.2624, + "grad_norm": 0.3878395192466562, + "learning_rate": 0.00017323776472561627, + "loss": 0.8787, + "step": 41 + }, + { + "epoch": 0.2688, + "grad_norm": 0.380156955704002, + "learning_rate": 0.0001718053900681397, + "loss": 0.8989, + "step": 42 + }, + { + "epoch": 0.2752, + "grad_norm": 0.3337757040266273, + "learning_rate": 0.00017034193496547902, + "loss": 0.8251, + "step": 43 + }, + { + "epoch": 0.2816, + "grad_norm": 0.373924455891821, + "learning_rate": 0.00016884803286362, + "loss": 0.8769, + "step": 44 + }, + { + "epoch": 0.288, + "grad_norm": 0.40584629185302074, + "learning_rate": 0.00016732433038731242, + "loss": 0.9348, + "step": 45 + }, + { + "epoch": 0.2944, + "grad_norm": 0.35282236451512666, + "learning_rate": 0.00016577148706018328, + "loss": 0.7637, + "step": 46 + }, + { + "epoch": 0.3008, + "grad_norm": 0.40393619317262786, + "learning_rate": 0.00016419017501926656, + "loss": 0.8946, + "step": 47 + }, + { + "epoch": 0.3072, + "grad_norm": 0.40971995700029845, + "learning_rate": 0.00016258107872407375, + "loss": 0.8762, + "step": 48 + }, + { + "epoch": 0.3136, + "grad_norm": 0.3627260591138329, + "learning_rate": 0.00016094489466033043, + "loss": 0.8106, + "step": 49 + }, + { + "epoch": 0.32, + "grad_norm": 0.3680533266179677, + "learning_rate": 0.0001592823310385073, + "loss": 0.8472, + "step": 50 + }, + { + "epoch": 0.3264, + "grad_norm": 0.4015581486261684, + "learning_rate": 0.00015759410748727662, + "loss": 0.8441, + "step": 51 + }, + { + "epoch": 0.3328, + "grad_norm": 0.4100193735092169, + "learning_rate": 0.00015588095474202595, + "loss": 0.898, + "step": 52 + }, + { + "epoch": 0.3392, + "grad_norm": 0.47180488656682523, + "learning_rate": 0.00015414361432856475, + "loss": 0.9866, + "step": 53 + }, + { + "epoch": 0.3456, + "grad_norm": 0.3523624395079363, + "learning_rate": 0.00015238283824216015, + "loss": 0.8007, + "step": 54 + }, + { + "epoch": 0.352, + "grad_norm": 0.35726184720092874, + "learning_rate": 0.00015059938862204127, + "loss": 0.8406, + "step": 55 + }, + { + "epoch": 0.3584, + "grad_norm": 0.3530040185899825, + "learning_rate": 0.00014879403742151283, + "loss": 0.8484, + "step": 56 + }, + { + "epoch": 0.3648, + "grad_norm": 0.3528784008170458, + "learning_rate": 0.0001469675660738206, + "loss": 0.8558, + "step": 57 + }, + { + "epoch": 0.3712, + "grad_norm": 0.31937072880784756, + "learning_rate": 0.00014512076515391375, + "loss": 0.7895, + "step": 58 + }, + { + "epoch": 0.3776, + "grad_norm": 0.41956435966393973, + "learning_rate": 0.0001432544340362501, + "loss": 0.9199, + "step": 59 + }, + { + "epoch": 0.384, + "grad_norm": 0.4139326323202461, + "learning_rate": 0.00014136938054879283, + "loss": 0.8905, + "step": 60 + }, + { + "epoch": 0.3904, + "grad_norm": 0.40453670887947585, + "learning_rate": 0.00013946642062334766, + "loss": 0.8775, + "step": 61 + }, + { + "epoch": 0.3968, + "grad_norm": 0.4374110458546487, + "learning_rate": 0.000137546377942393, + "loss": 0.8928, + "step": 62 + }, + { + "epoch": 0.4032, + "grad_norm": 0.39023942568288056, + "learning_rate": 0.00013561008358255468, + "loss": 0.918, + "step": 63 + }, + { + "epoch": 0.4096, + "grad_norm": 0.34857819904427056, + "learning_rate": 0.00013365837565488064, + "loss": 0.8166, + "step": 64 + }, + { + "epoch": 0.416, + "grad_norm": 0.3685005339493162, + "learning_rate": 0.0001316920989420703, + "loss": 0.8532, + "step": 65 + }, + { + "epoch": 0.4224, + "grad_norm": 0.38789784924823806, + "learning_rate": 0.00012971210453281674, + "loss": 0.8865, + "step": 66 + }, + { + "epoch": 0.4288, + "grad_norm": 0.4780610692107015, + "learning_rate": 0.00012771924945341906, + "loss": 0.8379, + "step": 67 + }, + { + "epoch": 0.4352, + "grad_norm": 0.4235659969652539, + "learning_rate": 0.0001257143962968246, + "loss": 0.8419, + "step": 68 + }, + { + "epoch": 0.4416, + "grad_norm": 0.41517088760561804, + "learning_rate": 0.00012369841284926188, + "loss": 0.8682, + "step": 69 + }, + { + "epoch": 0.448, + "grad_norm": 0.34436625989604225, + "learning_rate": 0.00012167217171462566, + "loss": 0.7835, + "step": 70 + }, + { + "epoch": 0.4544, + "grad_norm": 0.3500455919830399, + "learning_rate": 0.00011963654993677645, + "loss": 0.7909, + "step": 71 + }, + { + "epoch": 0.4608, + "grad_norm": 0.34110031360566156, + "learning_rate": 0.00011759242861991855, + "loss": 0.8002, + "step": 72 + }, + { + "epoch": 0.4672, + "grad_norm": 0.34510689610220674, + "learning_rate": 0.00011554069254722051, + "loss": 0.8199, + "step": 73 + }, + { + "epoch": 0.4736, + "grad_norm": 0.36725453965384564, + "learning_rate": 0.00011348222979784289, + "loss": 0.818, + "step": 74 + }, + { + "epoch": 0.48, + "grad_norm": 0.34943402862495193, + "learning_rate": 0.00011141793136253986, + "loss": 0.8318, + "step": 75 + }, + { + "epoch": 0.4864, + "grad_norm": 0.31133790718180526, + "learning_rate": 0.000109348690758, + "loss": 0.7547, + "step": 76 + }, + { + "epoch": 0.4928, + "grad_norm": 0.3658484722990489, + "learning_rate": 0.0001072754036400944, + "loss": 0.8356, + "step": 77 + }, + { + "epoch": 0.4992, + "grad_norm": 0.32129076781797744, + "learning_rate": 0.00010519896741619803, + "loss": 0.8278, + "step": 78 + }, + { + "epoch": 0.5056, + "grad_norm": 0.38942837430839766, + "learning_rate": 0.00010312028085675391, + "loss": 0.8707, + "step": 79 + }, + { + "epoch": 0.512, + "grad_norm": 0.35123170973120066, + "learning_rate": 0.00010104024370624644, + "loss": 0.8644, + "step": 80 + }, + { + "epoch": 0.5184, + "grad_norm": 0.5354527809050915, + "learning_rate": 9.895975629375359e-05, + "loss": 0.8145, + "step": 81 + }, + { + "epoch": 0.5248, + "grad_norm": 0.40673598485008916, + "learning_rate": 9.687971914324607e-05, + "loss": 0.8408, + "step": 82 + }, + { + "epoch": 0.5312, + "grad_norm": 0.3524570688187097, + "learning_rate": 9.480103258380198e-05, + "loss": 0.8466, + "step": 83 + }, + { + "epoch": 0.5376, + "grad_norm": 0.33636392603349363, + "learning_rate": 9.272459635990562e-05, + "loss": 0.7859, + "step": 84 + }, + { + "epoch": 0.544, + "grad_norm": 0.39915672090481497, + "learning_rate": 9.065130924199998e-05, + "loss": 0.8393, + "step": 85 + }, + { + "epoch": 0.5504, + "grad_norm": 0.39886971290229895, + "learning_rate": 8.858206863746018e-05, + "loss": 0.8354, + "step": 86 + }, + { + "epoch": 0.5568, + "grad_norm": 0.37555348433730135, + "learning_rate": 8.651777020215712e-05, + "loss": 0.917, + "step": 87 + }, + { + "epoch": 0.5632, + "grad_norm": 0.37601475500174525, + "learning_rate": 8.445930745277953e-05, + "loss": 0.859, + "step": 88 + }, + { + "epoch": 0.5696, + "grad_norm": 0.38865844776125025, + "learning_rate": 8.240757138008149e-05, + "loss": 0.8678, + "step": 89 + }, + { + "epoch": 0.576, + "grad_norm": 0.3307037617178496, + "learning_rate": 8.036345006322359e-05, + "loss": 0.8275, + "step": 90 + }, + { + "epoch": 0.5824, + "grad_norm": 0.3666715522668213, + "learning_rate": 7.832782828537437e-05, + "loss": 0.7919, + "step": 91 + }, + { + "epoch": 0.5888, + "grad_norm": 0.3084693433451157, + "learning_rate": 7.630158715073813e-05, + "loss": 0.7943, + "step": 92 + }, + { + "epoch": 0.5952, + "grad_norm": 0.37862887312694415, + "learning_rate": 7.428560370317542e-05, + "loss": 0.8477, + "step": 93 + }, + { + "epoch": 0.6016, + "grad_norm": 0.2885506950523641, + "learning_rate": 7.228075054658096e-05, + "loss": 0.8062, + "step": 94 + }, + { + "epoch": 0.608, + "grad_norm": 0.4362567348068058, + "learning_rate": 7.028789546718326e-05, + "loss": 0.9364, + "step": 95 + }, + { + "epoch": 0.6144, + "grad_norm": 0.35793369277964976, + "learning_rate": 6.830790105792973e-05, + "loss": 0.8017, + "step": 96 + }, + { + "epoch": 0.6208, + "grad_norm": 0.41198650847935414, + "learning_rate": 6.63416243451194e-05, + "loss": 0.9099, + "step": 97 + }, + { + "epoch": 0.6272, + "grad_norm": 0.34412292057512806, + "learning_rate": 6.43899164174453e-05, + "loss": 0.834, + "step": 98 + }, + { + "epoch": 0.6336, + "grad_norm": 0.36291105966317105, + "learning_rate": 6.245362205760704e-05, + "loss": 0.7662, + "step": 99 + }, + { + "epoch": 0.64, + "grad_norm": 0.3190014750409097, + "learning_rate": 6.053357937665237e-05, + "loss": 0.7709, + "step": 100 + }, + { + "epoch": 0.6464, + "grad_norm": 0.36727250643356163, + "learning_rate": 5.863061945120719e-05, + "loss": 0.8057, + "step": 101 + }, + { + "epoch": 0.6528, + "grad_norm": 0.4044973217971893, + "learning_rate": 5.6745565963749925e-05, + "loss": 0.8477, + "step": 102 + }, + { + "epoch": 0.6592, + "grad_norm": 0.4458033062134064, + "learning_rate": 5.487923484608629e-05, + "loss": 0.8934, + "step": 103 + }, + { + "epoch": 0.6656, + "grad_norm": 0.3065643632920914, + "learning_rate": 5.3032433926179395e-05, + "loss": 0.7804, + "step": 104 + }, + { + "epoch": 0.672, + "grad_norm": 0.3775122540545165, + "learning_rate": 5.1205962578487155e-05, + "loss": 0.7831, + "step": 105 + }, + { + "epoch": 0.6784, + "grad_norm": 0.4038170954360674, + "learning_rate": 4.940061137795876e-05, + "loss": 0.8703, + "step": 106 + }, + { + "epoch": 0.6848, + "grad_norm": 0.34770328740695106, + "learning_rate": 4.761716175783989e-05, + "loss": 0.8374, + "step": 107 + }, + { + "epoch": 0.6912, + "grad_norm": 0.38937427611703035, + "learning_rate": 4.585638567143529e-05, + "loss": 0.9036, + "step": 108 + }, + { + "epoch": 0.6976, + "grad_norm": 0.31386321764753544, + "learning_rate": 4.411904525797408e-05, + "loss": 0.7616, + "step": 109 + }, + { + "epoch": 0.704, + "grad_norm": 0.34254419298536803, + "learning_rate": 4.240589251272342e-05, + "loss": 0.8045, + "step": 110 + }, + { + "epoch": 0.7104, + "grad_norm": 0.3363058538370427, + "learning_rate": 4.071766896149273e-05, + "loss": 0.7922, + "step": 111 + }, + { + "epoch": 0.7168, + "grad_norm": 0.3389140844461325, + "learning_rate": 3.9055105339669595e-05, + "loss": 0.8007, + "step": 112 + }, + { + "epoch": 0.7232, + "grad_norm": 0.4126704473190358, + "learning_rate": 3.741892127592625e-05, + "loss": 0.8139, + "step": 113 + }, + { + "epoch": 0.7296, + "grad_norm": 0.5197748600439888, + "learning_rate": 3.580982498073344e-05, + "loss": 0.8356, + "step": 114 + }, + { + "epoch": 0.736, + "grad_norm": 0.45172982428856284, + "learning_rate": 3.422851293981676e-05, + "loss": 0.9337, + "step": 115 + }, + { + "epoch": 0.7424, + "grad_norm": 0.3537053348496251, + "learning_rate": 3.2675669612687565e-05, + "loss": 0.8148, + "step": 116 + }, + { + "epoch": 0.7488, + "grad_norm": 0.39124529440222766, + "learning_rate": 3.115196713638e-05, + "loss": 0.8159, + "step": 117 + }, + { + "epoch": 0.7552, + "grad_norm": 0.3407576980401783, + "learning_rate": 2.9658065034520978e-05, + "loss": 0.8123, + "step": 118 + }, + { + "epoch": 0.7616, + "grad_norm": 0.34222429427126, + "learning_rate": 2.8194609931860316e-05, + "loss": 0.7942, + "step": 119 + }, + { + "epoch": 0.768, + "grad_norm": 0.319583365059282, + "learning_rate": 2.6762235274383772e-05, + "loss": 0.7581, + "step": 120 + }, + { + "epoch": 0.7744, + "grad_norm": 0.314408080260358, + "learning_rate": 2.536156105513062e-05, + "loss": 0.7116, + "step": 121 + }, + { + "epoch": 0.7808, + "grad_norm": 0.3224958018920519, + "learning_rate": 2.399319354583418e-05, + "loss": 0.8122, + "step": 122 + }, + { + "epoch": 0.7872, + "grad_norm": 0.37349316101434166, + "learning_rate": 2.265772503450122e-05, + "loss": 0.832, + "step": 123 + }, + { + "epoch": 0.7936, + "grad_norm": 0.3438868250840104, + "learning_rate": 2.1355733569044635e-05, + "loss": 0.8047, + "step": 124 + }, + { + "epoch": 0.8, + "grad_norm": 0.3197518559056604, + "learning_rate": 2.008778270707944e-05, + "loss": 0.7704, + "step": 125 + }, + { + "epoch": 0.8064, + "grad_norm": 0.29119469695454026, + "learning_rate": 1.8854421271990964e-05, + "loss": 0.7735, + "step": 126 + }, + { + "epoch": 0.8128, + "grad_norm": 0.349918604093867, + "learning_rate": 1.7656183115380577e-05, + "loss": 0.792, + "step": 127 + }, + { + "epoch": 0.8192, + "grad_norm": 0.30979893873259173, + "learning_rate": 1.649358688599191e-05, + "loss": 0.7985, + "step": 128 + }, + { + "epoch": 0.8256, + "grad_norm": 0.333052459268458, + "learning_rate": 1.5367135805217458e-05, + "loss": 0.7935, + "step": 129 + }, + { + "epoch": 0.832, + "grad_norm": 0.29898578402757314, + "learning_rate": 1.4277317449282834e-05, + "loss": 0.7592, + "step": 130 + }, + { + "epoch": 0.8384, + "grad_norm": 0.3381948350076578, + "learning_rate": 1.3224603538202929e-05, + "loss": 0.779, + "step": 131 + }, + { + "epoch": 0.8448, + "grad_norm": 0.2753527457754542, + "learning_rate": 1.220944973160133e-05, + "loss": 0.7558, + "step": 132 + }, + { + "epoch": 0.8512, + "grad_norm": 0.366685214156942, + "learning_rate": 1.1232295431481222e-05, + "loss": 0.8436, + "step": 133 + }, + { + "epoch": 0.8576, + "grad_norm": 0.3372299539032351, + "learning_rate": 1.0293563592033595e-05, + "loss": 0.8017, + "step": 134 + }, + { + "epoch": 0.864, + "grad_norm": 0.4175262755242993, + "learning_rate": 9.393660536564408e-06, + "loss": 0.8725, + "step": 135 + }, + { + "epoch": 0.8704, + "grad_norm": 0.37915892849178806, + "learning_rate": 8.532975781620512e-06, + "loss": 0.8524, + "step": 136 + }, + { + "epoch": 0.8768, + "grad_norm": 0.33639199375721807, + "learning_rate": 7.711881868390291e-06, + "loss": 0.7804, + "step": 137 + }, + { + "epoch": 0.8832, + "grad_norm": 0.36256260056872164, + "learning_rate": 6.930734201451816e-06, + "loss": 0.7661, + "step": 138 + }, + { + "epoch": 0.8896, + "grad_norm": 0.40545845722573964, + "learning_rate": 6.189870894938587e-06, + "loss": 0.8072, + "step": 139 + }, + { + "epoch": 0.896, + "grad_norm": 0.37483517096650165, + "learning_rate": 5.489612626189245e-06, + "loss": 0.8385, + "step": 140 + }, + { + "epoch": 0.9024, + "grad_norm": 0.34187683963694776, + "learning_rate": 4.830262496944693e-06, + "loss": 0.7739, + "step": 141 + }, + { + "epoch": 0.9088, + "grad_norm": 0.3604054800265124, + "learning_rate": 4.21210590215273e-06, + "loss": 0.7892, + "step": 142 + }, + { + "epoch": 0.9152, + "grad_norm": 0.3396109345852707, + "learning_rate": 3.6354104064368566e-06, + "loss": 0.7809, + "step": 143 + }, + { + "epoch": 0.9216, + "grad_norm": 0.30021987701543046, + "learning_rate": 3.100425628282899e-06, + "loss": 0.7996, + "step": 144 + }, + { + "epoch": 0.928, + "grad_norm": 0.336807864327852, + "learning_rate": 2.607383131993424e-06, + "loss": 0.8249, + "step": 145 + }, + { + "epoch": 0.9344, + "grad_norm": 0.3840835764416319, + "learning_rate": 2.1564963274568027e-06, + "loss": 0.8061, + "step": 146 + }, + { + "epoch": 0.9408, + "grad_norm": 0.4008265556034786, + "learning_rate": 1.7479603777742938e-06, + "loss": 0.851, + "step": 147 + }, + { + "epoch": 0.9472, + "grad_norm": 0.4656710783388933, + "learning_rate": 1.3819521147851123e-06, + "loss": 0.8236, + "step": 148 + }, + { + "epoch": 0.9536, + "grad_norm": 0.323469647972122, + "learning_rate": 1.05862996252597e-06, + "loss": 0.837, + "step": 149 + }, + { + "epoch": 0.96, + "grad_norm": 0.34449303085338007, + "learning_rate": 7.781338686584927e-07, + "loss": 0.7767, + "step": 150 + }, + { + "epoch": 0.9664, + "grad_norm": 0.6216286447446, + "learning_rate": 5.405852438937764e-07, + "loss": 0.8622, + "step": 151 + }, + { + "epoch": 0.9728, + "grad_norm": 0.38081216442894966, + "learning_rate": 3.4608690944071263e-07, + "loss": 0.8385, + "step": 152 + }, + { + "epoch": 0.9792, + "grad_norm": 0.3392711812270996, + "learning_rate": 1.947230525005006e-07, + "loss": 0.8156, + "step": 153 + }, + { + "epoch": 0.9856, + "grad_norm": 0.433515843854761, + "learning_rate": 8.655918982689581e-08, + "loss": 0.8067, + "step": 154 + }, + { + "epoch": 0.992, + "grad_norm": 0.3707318180470919, + "learning_rate": 2.164213936770576e-08, + "loss": 0.8388, + "step": 155 + }, + { + "epoch": 0.9984, + "grad_norm": 0.3630076689590227, + "learning_rate": 0.0, + "loss": 0.7866, + "step": 156 + }, + { + "epoch": 0.9984, + "step": 156, + "total_flos": 374742464462848.0, + "train_loss": 0.8615743120511373, + "train_runtime": 4679.3893, + "train_samples_per_second": 1.069, + "train_steps_per_second": 0.033 + } + ], + "logging_steps": 1.0, + "max_steps": 156, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 374742464462848.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_3_epochs_1_GA_2_lora/README.md b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_3_epochs_1_GA_2_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_3_epochs_1_GA_2_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_3_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_3_epochs_1_GA_2_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a4de3a79f7b1c90c854454a91043154efefedf88 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_3_epochs_1_GA_2_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "up_proj", + "gate_proj", + "o_proj", + "k_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5806afdc2dc689aeafbfdbea7b4ab9c99efcb0d2 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b505efac2d259a2a47415dbc3f6c07333dda8a83f3d749a271eda8e07564ab4 +size 671150064 diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_3_epochs_1_GA_2_lora/config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_3_epochs_1_GA_2_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_3_epochs_1_GA_2_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..fcf0df93197629b22ecdd0ff452dea76300090c1 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:445dbb65de13638be7e2353171e362ec89018043af53abfb1f47ae272d3a7a39 +size 918507402 diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_3_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_3_epochs_1_GA_2_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2bf1dd59b26fde221c3875db0bce8dfa8f9499c6 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_3_epochs_1_GA_2_lora/trainer_state.json @@ -0,0 +1,2226 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9984, + "eval_steps": 500, + "global_step": 312, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0032, + "grad_norm": 0.9853113996014804, + "learning_rate": 2e-05, + "loss": 1.3897, + "step": 1 + }, + { + "epoch": 0.0064, + "grad_norm": 0.9506426475728363, + "learning_rate": 4e-05, + "loss": 1.2433, + "step": 2 + }, + { + "epoch": 0.0096, + "grad_norm": 0.7596966755410303, + "learning_rate": 6e-05, + "loss": 1.1961, + "step": 3 + }, + { + "epoch": 0.0128, + "grad_norm": 0.861146899634231, + "learning_rate": 8e-05, + "loss": 1.3099, + "step": 4 + }, + { + "epoch": 0.016, + "grad_norm": 0.7573496664517325, + "learning_rate": 0.0001, + "loss": 1.1346, + "step": 5 + }, + { + "epoch": 0.0192, + "grad_norm": 0.6407078764092344, + "learning_rate": 0.00012, + "loss": 0.9372, + "step": 6 + }, + { + "epoch": 0.0224, + "grad_norm": 0.9333053900472983, + "learning_rate": 0.00014, + "loss": 1.1144, + "step": 7 + }, + { + "epoch": 0.0256, + "grad_norm": 0.782816581375092, + "learning_rate": 0.00016, + "loss": 1.137, + "step": 8 + }, + { + "epoch": 0.0288, + "grad_norm": 0.8777223543755927, + "learning_rate": 0.00018, + "loss": 1.0674, + "step": 9 + }, + { + "epoch": 0.032, + "grad_norm": 0.651514415869391, + "learning_rate": 0.0002, + "loss": 1.0372, + "step": 10 + }, + { + "epoch": 0.0352, + "grad_norm": 0.6565878272211279, + "learning_rate": 0.00019999458931878073, + "loss": 0.9806, + "step": 11 + }, + { + "epoch": 0.0384, + "grad_norm": 0.5886485478480157, + "learning_rate": 0.0001999783578606323, + "loss": 0.9688, + "step": 12 + }, + { + "epoch": 0.0416, + "grad_norm": 0.6815061121637854, + "learning_rate": 0.00019995130738201966, + "loss": 0.9581, + "step": 13 + }, + { + "epoch": 0.0448, + "grad_norm": 0.7281105653793167, + "learning_rate": 0.0001999134408101731, + "loss": 1.036, + "step": 14 + }, + { + "epoch": 0.048, + "grad_norm": 0.6914179263697914, + "learning_rate": 0.00019986476224277165, + "loss": 1.0213, + "step": 15 + }, + { + "epoch": 0.0512, + "grad_norm": 0.6208634408736601, + "learning_rate": 0.00019980527694749952, + "loss": 0.9661, + "step": 16 + }, + { + "epoch": 0.0544, + "grad_norm": 0.6139650553863405, + "learning_rate": 0.00019973499136147606, + "loss": 0.9464, + "step": 17 + }, + { + "epoch": 0.0576, + "grad_norm": 0.5516620513298294, + "learning_rate": 0.0001996539130905593, + "loss": 0.9267, + "step": 18 + }, + { + "epoch": 0.0608, + "grad_norm": 0.6442118581390293, + "learning_rate": 0.0001995620509085228, + "loss": 0.8928, + "step": 19 + }, + { + "epoch": 0.064, + "grad_norm": 0.5685803755480052, + "learning_rate": 0.00019945941475610623, + "loss": 0.9578, + "step": 20 + }, + { + "epoch": 0.0672, + "grad_norm": 0.7428802362494477, + "learning_rate": 0.0001993460157399396, + "loss": 0.9129, + "step": 21 + }, + { + "epoch": 0.0704, + "grad_norm": 0.5841251723976262, + "learning_rate": 0.0001992218661313415, + "loss": 0.962, + "step": 22 + }, + { + "epoch": 0.0736, + "grad_norm": 0.5847566191321127, + "learning_rate": 0.00019908697936499103, + "loss": 0.9113, + "step": 23 + }, + { + "epoch": 0.0768, + "grad_norm": 0.602952800315588, + "learning_rate": 0.00019894137003747403, + "loss": 0.9482, + "step": 24 + }, + { + "epoch": 0.08, + "grad_norm": 0.6574185458555385, + "learning_rate": 0.00019878505390570362, + "loss": 0.9703, + "step": 25 + }, + { + "epoch": 0.0832, + "grad_norm": 0.6092484268483226, + "learning_rate": 0.00019861804788521493, + "loss": 1.0078, + "step": 26 + }, + { + "epoch": 0.0864, + "grad_norm": 0.5396328498627039, + "learning_rate": 0.00019844037004833473, + "loss": 0.8152, + "step": 27 + }, + { + "epoch": 0.0896, + "grad_norm": 0.5950775899897824, + "learning_rate": 0.00019825203962222572, + "loss": 1.0315, + "step": 28 + }, + { + "epoch": 0.0928, + "grad_norm": 0.5400467879636469, + "learning_rate": 0.0001980530769868059, + "loss": 0.9399, + "step": 29 + }, + { + "epoch": 0.096, + "grad_norm": 0.5288979794543393, + "learning_rate": 0.00019784350367254322, + "loss": 0.8464, + "step": 30 + }, + { + "epoch": 0.0992, + "grad_norm": 0.5527743229596785, + "learning_rate": 0.0001976233423581255, + "loss": 0.9577, + "step": 31 + }, + { + "epoch": 0.1024, + "grad_norm": 0.5049275012787766, + "learning_rate": 0.0001973926168680066, + "loss": 0.9382, + "step": 32 + }, + { + "epoch": 0.1056, + "grad_norm": 0.5091690367695947, + "learning_rate": 0.00019715135216982798, + "loss": 0.8665, + "step": 33 + }, + { + "epoch": 0.1088, + "grad_norm": 0.5318560761074015, + "learning_rate": 0.0001968995743717171, + "loss": 0.9406, + "step": 34 + }, + { + "epoch": 0.112, + "grad_norm": 0.48159325375001594, + "learning_rate": 0.00019663731071946206, + "loss": 0.9629, + "step": 35 + }, + { + "epoch": 0.1152, + "grad_norm": 0.5552052505514862, + "learning_rate": 0.00019636458959356316, + "loss": 0.931, + "step": 36 + }, + { + "epoch": 0.1184, + "grad_norm": 0.600134726583001, + "learning_rate": 0.0001960814405061619, + "loss": 0.9799, + "step": 37 + }, + { + "epoch": 0.1216, + "grad_norm": 0.4560541668624478, + "learning_rate": 0.00019578789409784727, + "loss": 0.831, + "step": 38 + }, + { + "epoch": 0.1248, + "grad_norm": 0.47044662781439345, + "learning_rate": 0.00019548398213434007, + "loss": 0.8897, + "step": 39 + }, + { + "epoch": 0.128, + "grad_norm": 0.5810552708976553, + "learning_rate": 0.00019516973750305532, + "loss": 0.9074, + "step": 40 + }, + { + "epoch": 0.1312, + "grad_norm": 0.5406383322389806, + "learning_rate": 0.00019484519420954354, + "loss": 0.8683, + "step": 41 + }, + { + "epoch": 0.1344, + "grad_norm": 0.4981624487582689, + "learning_rate": 0.00019451038737381077, + "loss": 0.9491, + "step": 42 + }, + { + "epoch": 0.1376, + "grad_norm": 0.689665279627117, + "learning_rate": 0.00019416535322651818, + "loss": 0.9629, + "step": 43 + }, + { + "epoch": 0.1408, + "grad_norm": 0.5207432914579827, + "learning_rate": 0.00019381012910506146, + "loss": 0.9627, + "step": 44 + }, + { + "epoch": 0.144, + "grad_norm": 0.5036481369902529, + "learning_rate": 0.00019344475344953012, + "loss": 0.8391, + "step": 45 + }, + { + "epoch": 0.1472, + "grad_norm": 0.7302880468144016, + "learning_rate": 0.00019306926579854821, + "loss": 0.9593, + "step": 46 + }, + { + "epoch": 0.1504, + "grad_norm": 0.592163044629266, + "learning_rate": 0.00019268370678499533, + "loss": 1.0679, + "step": 47 + }, + { + "epoch": 0.1536, + "grad_norm": 0.5177981539894086, + "learning_rate": 0.0001922881181316097, + "loss": 0.9884, + "step": 48 + }, + { + "epoch": 0.1568, + "grad_norm": 0.6349866730973226, + "learning_rate": 0.00019188254264647337, + "loss": 0.9047, + "step": 49 + }, + { + "epoch": 0.16, + "grad_norm": 0.7610952787085876, + "learning_rate": 0.0001914670242183795, + "loss": 0.9718, + "step": 50 + }, + { + "epoch": 0.1632, + "grad_norm": 0.48386518707786086, + "learning_rate": 0.0001910416078120832, + "loss": 0.7806, + "step": 51 + }, + { + "epoch": 0.1664, + "grad_norm": 0.5318455453071542, + "learning_rate": 0.0001906063394634356, + "loss": 0.8641, + "step": 52 + }, + { + "epoch": 0.1696, + "grad_norm": 0.5160916396118949, + "learning_rate": 0.00019016126627440237, + "loss": 0.9017, + "step": 53 + }, + { + "epoch": 0.1728, + "grad_norm": 0.6283675091205313, + "learning_rate": 0.00018970643640796642, + "loss": 0.9637, + "step": 54 + }, + { + "epoch": 0.176, + "grad_norm": 0.5174956315565866, + "learning_rate": 0.000189241899082916, + "loss": 0.941, + "step": 55 + }, + { + "epoch": 0.1792, + "grad_norm": 0.5948477232434937, + "learning_rate": 0.00018876770456851877, + "loss": 0.9142, + "step": 56 + }, + { + "epoch": 0.1824, + "grad_norm": 0.6144641888097007, + "learning_rate": 0.0001882839041790818, + "loss": 0.9617, + "step": 57 + }, + { + "epoch": 0.1856, + "grad_norm": 0.5411374031190403, + "learning_rate": 0.00018779055026839868, + "loss": 0.9119, + "step": 58 + }, + { + "epoch": 0.1888, + "grad_norm": 0.4551477650356518, + "learning_rate": 0.00018728769622408423, + "loss": 0.8273, + "step": 59 + }, + { + "epoch": 0.192, + "grad_norm": 0.7539290905710584, + "learning_rate": 0.00018677539646179707, + "loss": 0.9478, + "step": 60 + }, + { + "epoch": 0.1952, + "grad_norm": 0.4455975550076356, + "learning_rate": 0.00018625370641935129, + "loss": 0.8896, + "step": 61 + }, + { + "epoch": 0.1984, + "grad_norm": 0.6358718236473844, + "learning_rate": 0.00018572268255071718, + "loss": 1.0483, + "step": 62 + }, + { + "epoch": 0.2016, + "grad_norm": 0.5049143290842381, + "learning_rate": 0.00018518238231991218, + "loss": 0.8487, + "step": 63 + }, + { + "epoch": 0.2048, + "grad_norm": 0.4702456070831088, + "learning_rate": 0.00018463286419478255, + "loss": 0.7622, + "step": 64 + }, + { + "epoch": 0.208, + "grad_norm": 0.4834735307275798, + "learning_rate": 0.00018407418764067627, + "loss": 0.7963, + "step": 65 + }, + { + "epoch": 0.2112, + "grad_norm": 0.5725717059675735, + "learning_rate": 0.00018350641311400812, + "loss": 0.8986, + "step": 66 + }, + { + "epoch": 0.2144, + "grad_norm": 0.6255809201436507, + "learning_rate": 0.0001829296020557174, + "loss": 0.9931, + "step": 67 + }, + { + "epoch": 0.2176, + "grad_norm": 0.5242574559465072, + "learning_rate": 0.00018234381688461942, + "loss": 0.827, + "step": 68 + }, + { + "epoch": 0.2208, + "grad_norm": 0.6646028707343711, + "learning_rate": 0.0001817491209906506, + "loss": 0.8703, + "step": 69 + }, + { + "epoch": 0.224, + "grad_norm": 0.5197197534999193, + "learning_rate": 0.00018114557872800905, + "loss": 0.9064, + "step": 70 + }, + { + "epoch": 0.2272, + "grad_norm": 0.5344677586822248, + "learning_rate": 0.00018053325540819045, + "loss": 0.9058, + "step": 71 + }, + { + "epoch": 0.2304, + "grad_norm": 0.5109696739058583, + "learning_rate": 0.0001799122172929206, + "loss": 0.8375, + "step": 72 + }, + { + "epoch": 0.2336, + "grad_norm": 0.5431977826414388, + "learning_rate": 0.00017928253158698473, + "loss": 0.8769, + "step": 73 + }, + { + "epoch": 0.2368, + "grad_norm": 0.5261171550626805, + "learning_rate": 0.0001786442664309554, + "loss": 0.873, + "step": 74 + }, + { + "epoch": 0.24, + "grad_norm": 0.5063472698603411, + "learning_rate": 0.0001779974908938184, + "loss": 0.8654, + "step": 75 + }, + { + "epoch": 0.2432, + "grad_norm": 0.5241634121899378, + "learning_rate": 0.0001773422749654988, + "loss": 0.8807, + "step": 76 + }, + { + "epoch": 0.2464, + "grad_norm": 0.5089293613853708, + "learning_rate": 0.00017667868954928694, + "loss": 0.797, + "step": 77 + }, + { + "epoch": 0.2496, + "grad_norm": 0.7184272701984563, + "learning_rate": 0.00017600680645416583, + "loss": 1.046, + "step": 78 + }, + { + "epoch": 0.2528, + "grad_norm": 0.6345040707354779, + "learning_rate": 0.00017532669838704035, + "loss": 1.0161, + "step": 79 + }, + { + "epoch": 0.256, + "grad_norm": 0.5519854710124368, + "learning_rate": 0.00017463843894486937, + "loss": 0.8733, + "step": 80 + }, + { + "epoch": 0.2592, + "grad_norm": 0.7052547552561359, + "learning_rate": 0.0001739421026067017, + "loss": 0.8798, + "step": 81 + }, + { + "epoch": 0.2624, + "grad_norm": 0.569885331187361, + "learning_rate": 0.00017323776472561627, + "loss": 0.9914, + "step": 82 + }, + { + "epoch": 0.2656, + "grad_norm": 0.47943110178092274, + "learning_rate": 0.00017252550152056795, + "loss": 0.8458, + "step": 83 + }, + { + "epoch": 0.2688, + "grad_norm": 0.4777715513897952, + "learning_rate": 0.0001718053900681397, + "loss": 0.9045, + "step": 84 + }, + { + "epoch": 0.272, + "grad_norm": 0.4911601274902315, + "learning_rate": 0.00017107750829420176, + "loss": 0.9083, + "step": 85 + }, + { + "epoch": 0.2752, + "grad_norm": 0.4280825855771144, + "learning_rate": 0.00017034193496547902, + "loss": 0.7424, + "step": 86 + }, + { + "epoch": 0.2784, + "grad_norm": 0.45240547298580386, + "learning_rate": 0.00016959874968102735, + "loss": 0.8539, + "step": 87 + }, + { + "epoch": 0.2816, + "grad_norm": 0.5183528433389661, + "learning_rate": 0.00016884803286362, + "loss": 0.8627, + "step": 88 + }, + { + "epoch": 0.2848, + "grad_norm": 0.4560751891623164, + "learning_rate": 0.00016808986575104465, + "loss": 0.8303, + "step": 89 + }, + { + "epoch": 0.288, + "grad_norm": 0.4977422603317782, + "learning_rate": 0.00016732433038731242, + "loss": 0.8397, + "step": 90 + }, + { + "epoch": 0.2912, + "grad_norm": 0.49015779978900487, + "learning_rate": 0.0001665515096137797, + "loss": 0.8449, + "step": 91 + }, + { + "epoch": 0.2944, + "grad_norm": 0.5016378352074524, + "learning_rate": 0.00016577148706018328, + "loss": 0.7354, + "step": 92 + }, + { + "epoch": 0.2976, + "grad_norm": 0.563044312167859, + "learning_rate": 0.00016498434713559088, + "loss": 0.862, + "step": 93 + }, + { + "epoch": 0.3008, + "grad_norm": 0.5275138404498927, + "learning_rate": 0.00016419017501926656, + "loss": 0.8566, + "step": 94 + }, + { + "epoch": 0.304, + "grad_norm": 0.707942148471608, + "learning_rate": 0.0001633890566514535, + "loss": 1.004, + "step": 95 + }, + { + "epoch": 0.3072, + "grad_norm": 0.4545736299260399, + "learning_rate": 0.00016258107872407375, + "loss": 0.8028, + "step": 96 + }, + { + "epoch": 0.3104, + "grad_norm": 0.5464208281333135, + "learning_rate": 0.0001617663286713474, + "loss": 0.7689, + "step": 97 + }, + { + "epoch": 0.3136, + "grad_norm": 0.4754134333479607, + "learning_rate": 0.00016094489466033043, + "loss": 0.7514, + "step": 98 + }, + { + "epoch": 0.3168, + "grad_norm": 0.6104595469568889, + "learning_rate": 0.00016011686558137448, + "loss": 0.9243, + "step": 99 + }, + { + "epoch": 0.32, + "grad_norm": 0.5211974739858017, + "learning_rate": 0.0001592823310385073, + "loss": 0.9002, + "step": 100 + }, + { + "epoch": 0.3232, + "grad_norm": 0.6364402922521777, + "learning_rate": 0.0001584413813397364, + "loss": 0.8828, + "step": 101 + }, + { + "epoch": 0.3264, + "grad_norm": 0.48631460430578, + "learning_rate": 0.00015759410748727662, + "loss": 0.8656, + "step": 102 + }, + { + "epoch": 0.3296, + "grad_norm": 0.5447146038021392, + "learning_rate": 0.00015674060116770236, + "loss": 0.9, + "step": 103 + }, + { + "epoch": 0.3328, + "grad_norm": 0.6067643448503521, + "learning_rate": 0.00015588095474202595, + "loss": 0.8729, + "step": 104 + }, + { + "epoch": 0.336, + "grad_norm": 0.5425159273352543, + "learning_rate": 0.00015501526123570277, + "loss": 0.9745, + "step": 105 + }, + { + "epoch": 0.3392, + "grad_norm": 0.6479741166126189, + "learning_rate": 0.00015414361432856475, + "loss": 0.9822, + "step": 106 + }, + { + "epoch": 0.3424, + "grad_norm": 0.5558191947311435, + "learning_rate": 0.0001532661083446829, + "loss": 0.7664, + "step": 107 + }, + { + "epoch": 0.3456, + "grad_norm": 0.4586689678505516, + "learning_rate": 0.00015238283824216015, + "loss": 0.6948, + "step": 108 + }, + { + "epoch": 0.3488, + "grad_norm": 0.5896943261591338, + "learning_rate": 0.00015149389960285558, + "loss": 0.8536, + "step": 109 + }, + { + "epoch": 0.352, + "grad_norm": 0.5473809693844722, + "learning_rate": 0.00015059938862204127, + "loss": 0.9329, + "step": 110 + }, + { + "epoch": 0.3552, + "grad_norm": 0.506608463345342, + "learning_rate": 0.00014969940209799248, + "loss": 0.7825, + "step": 111 + }, + { + "epoch": 0.3584, + "grad_norm": 0.4707113693392528, + "learning_rate": 0.00014879403742151283, + "loss": 0.8198, + "step": 112 + }, + { + "epoch": 0.3616, + "grad_norm": 0.45623380739597186, + "learning_rate": 0.00014788339256539544, + "loss": 0.8045, + "step": 113 + }, + { + "epoch": 0.3648, + "grad_norm": 0.530383958977767, + "learning_rate": 0.0001469675660738206, + "loss": 0.9472, + "step": 114 + }, + { + "epoch": 0.368, + "grad_norm": 0.5112192224272184, + "learning_rate": 0.00014604665705169237, + "loss": 0.8063, + "step": 115 + }, + { + "epoch": 0.3712, + "grad_norm": 0.5574429216569449, + "learning_rate": 0.00014512076515391375, + "loss": 0.9055, + "step": 116 + }, + { + "epoch": 0.3744, + "grad_norm": 0.6608037662364803, + "learning_rate": 0.00014418999057460276, + "loss": 0.9565, + "step": 117 + }, + { + "epoch": 0.3776, + "grad_norm": 0.539991861126441, + "learning_rate": 0.0001432544340362501, + "loss": 0.8963, + "step": 118 + }, + { + "epoch": 0.3808, + "grad_norm": 0.7379756808759644, + "learning_rate": 0.00014231419677881966, + "loss": 1.091, + "step": 119 + }, + { + "epoch": 0.384, + "grad_norm": 0.45346975391486566, + "learning_rate": 0.00014136938054879283, + "loss": 0.7861, + "step": 120 + }, + { + "epoch": 0.3872, + "grad_norm": 0.5412071083205857, + "learning_rate": 0.00014042008758815818, + "loss": 0.9098, + "step": 121 + }, + { + "epoch": 0.3904, + "grad_norm": 0.5689976450882609, + "learning_rate": 0.00013946642062334766, + "loss": 0.881, + "step": 122 + }, + { + "epoch": 0.3936, + "grad_norm": 0.8071897773552312, + "learning_rate": 0.00013850848285411994, + "loss": 0.8521, + "step": 123 + }, + { + "epoch": 0.3968, + "grad_norm": 0.6079098990009498, + "learning_rate": 0.000137546377942393, + "loss": 0.9552, + "step": 124 + }, + { + "epoch": 0.4, + "grad_norm": 0.5213294498819157, + "learning_rate": 0.00013658021000102636, + "loss": 0.9346, + "step": 125 + }, + { + "epoch": 0.4032, + "grad_norm": 0.6003728425562892, + "learning_rate": 0.00013561008358255468, + "loss": 0.8904, + "step": 126 + }, + { + "epoch": 0.4064, + "grad_norm": 0.5273050145621091, + "learning_rate": 0.00013463610366787392, + "loss": 0.817, + "step": 127 + }, + { + "epoch": 0.4096, + "grad_norm": 0.48841372848628767, + "learning_rate": 0.00013365837565488064, + "loss": 0.8343, + "step": 128 + }, + { + "epoch": 0.4128, + "grad_norm": 0.6471361649530032, + "learning_rate": 0.0001326770053470668, + "loss": 0.9767, + "step": 129 + }, + { + "epoch": 0.416, + "grad_norm": 0.5408628642258498, + "learning_rate": 0.0001316920989420703, + "loss": 0.9073, + "step": 130 + }, + { + "epoch": 0.4192, + "grad_norm": 0.5114489126327357, + "learning_rate": 0.00013070376302018287, + "loss": 0.8789, + "step": 131 + }, + { + "epoch": 0.4224, + "grad_norm": 0.5470267907068778, + "learning_rate": 0.00012971210453281674, + "loss": 0.9001, + "step": 132 + }, + { + "epoch": 0.4256, + "grad_norm": 0.4802931290730935, + "learning_rate": 0.000128717230790931, + "loss": 0.7543, + "step": 133 + }, + { + "epoch": 0.4288, + "grad_norm": 0.5839884280401538, + "learning_rate": 0.00012771924945341906, + "loss": 0.9524, + "step": 134 + }, + { + "epoch": 0.432, + "grad_norm": 0.4733573558278859, + "learning_rate": 0.00012671826851545851, + "loss": 0.8034, + "step": 135 + }, + { + "epoch": 0.4352, + "grad_norm": 0.6945277000815755, + "learning_rate": 0.0001257143962968246, + "loss": 0.8441, + "step": 136 + }, + { + "epoch": 0.4384, + "grad_norm": 0.47867356502676395, + "learning_rate": 0.00012470774143016853, + "loss": 0.8375, + "step": 137 + }, + { + "epoch": 0.4416, + "grad_norm": 0.5849597006842344, + "learning_rate": 0.00012369841284926188, + "loss": 0.796, + "step": 138 + }, + { + "epoch": 0.4448, + "grad_norm": 0.4918697723988388, + "learning_rate": 0.00012268651977720866, + "loss": 0.8589, + "step": 139 + }, + { + "epoch": 0.448, + "grad_norm": 0.5010605168562106, + "learning_rate": 0.00012167217171462566, + "loss": 0.8102, + "step": 140 + }, + { + "epoch": 0.4512, + "grad_norm": 0.4891016643938588, + "learning_rate": 0.0001206554784277931, + "loss": 0.8983, + "step": 141 + }, + { + "epoch": 0.4544, + "grad_norm": 0.6208987333820601, + "learning_rate": 0.00011963654993677645, + "loss": 0.8718, + "step": 142 + }, + { + "epoch": 0.4576, + "grad_norm": 0.5285129096546756, + "learning_rate": 0.00011861549650352069, + "loss": 0.9588, + "step": 143 + }, + { + "epoch": 0.4608, + "grad_norm": 0.4776186363050293, + "learning_rate": 0.00011759242861991855, + "loss": 0.8365, + "step": 144 + }, + { + "epoch": 0.464, + "grad_norm": 0.49270178475287485, + "learning_rate": 0.00011656745699585371, + "loss": 0.793, + "step": 145 + }, + { + "epoch": 0.4672, + "grad_norm": 0.4875495251241639, + "learning_rate": 0.00011554069254722051, + "loss": 0.9373, + "step": 146 + }, + { + "epoch": 0.4704, + "grad_norm": 0.545069187297972, + "learning_rate": 0.00011451224638392129, + "loss": 0.8501, + "step": 147 + }, + { + "epoch": 0.4736, + "grad_norm": 0.4626572610071826, + "learning_rate": 0.00011348222979784289, + "loss": 0.8283, + "step": 148 + }, + { + "epoch": 0.4768, + "grad_norm": 0.49533257627313665, + "learning_rate": 0.00011245075425081328, + "loss": 0.7813, + "step": 149 + }, + { + "epoch": 0.48, + "grad_norm": 0.510856931550238, + "learning_rate": 0.00011141793136253986, + "loss": 0.8225, + "step": 150 + }, + { + "epoch": 0.4832, + "grad_norm": 0.4392196971414571, + "learning_rate": 0.0001103838728985307, + "loss": 0.8786, + "step": 151 + }, + { + "epoch": 0.4864, + "grad_norm": 0.49520723985099946, + "learning_rate": 0.000109348690758, + "loss": 0.7656, + "step": 152 + }, + { + "epoch": 0.4896, + "grad_norm": 0.5306397017302849, + "learning_rate": 0.00010831249696175918, + "loss": 0.8549, + "step": 153 + }, + { + "epoch": 0.4928, + "grad_norm": 0.45603793666882725, + "learning_rate": 0.0001072754036400944, + "loss": 0.804, + "step": 154 + }, + { + "epoch": 0.496, + "grad_norm": 0.4244887206201728, + "learning_rate": 0.00010623752302063283, + "loss": 0.7726, + "step": 155 + }, + { + "epoch": 0.4992, + "grad_norm": 0.40132261986474643, + "learning_rate": 0.00010519896741619803, + "loss": 0.7778, + "step": 156 + }, + { + "epoch": 0.5024, + "grad_norm": 1.2936543266675542, + "learning_rate": 0.00010415984921265609, + "loss": 0.9148, + "step": 157 + }, + { + "epoch": 0.5056, + "grad_norm": 0.47889573076766523, + "learning_rate": 0.00010312028085675391, + "loss": 0.8071, + "step": 158 + }, + { + "epoch": 0.5088, + "grad_norm": 0.5612261063347095, + "learning_rate": 0.00010208037484395114, + "loss": 0.8822, + "step": 159 + }, + { + "epoch": 0.512, + "grad_norm": 0.5273885488245399, + "learning_rate": 0.00010104024370624644, + "loss": 0.8306, + "step": 160 + }, + { + "epoch": 0.5152, + "grad_norm": 0.4717948350536635, + "learning_rate": 0.0001, + "loss": 0.69, + "step": 161 + }, + { + "epoch": 0.5184, + "grad_norm": 0.4840408762889548, + "learning_rate": 9.895975629375359e-05, + "loss": 0.754, + "step": 162 + }, + { + "epoch": 0.5216, + "grad_norm": 0.6221134254293921, + "learning_rate": 9.791962515604887e-05, + "loss": 0.879, + "step": 163 + }, + { + "epoch": 0.5248, + "grad_norm": 0.45100027191191416, + "learning_rate": 9.687971914324607e-05, + "loss": 0.747, + "step": 164 + }, + { + "epoch": 0.528, + "grad_norm": 0.8339067998577471, + "learning_rate": 9.584015078734395e-05, + "loss": 0.8372, + "step": 165 + }, + { + "epoch": 0.5312, + "grad_norm": 0.5589104243615118, + "learning_rate": 9.480103258380198e-05, + "loss": 0.8438, + "step": 166 + }, + { + "epoch": 0.5344, + "grad_norm": 0.4114548028627618, + "learning_rate": 9.376247697936719e-05, + "loss": 0.7871, + "step": 167 + }, + { + "epoch": 0.5376, + "grad_norm": 0.480650441623415, + "learning_rate": 9.272459635990562e-05, + "loss": 0.8283, + "step": 168 + }, + { + "epoch": 0.5408, + "grad_norm": 0.6432633248760891, + "learning_rate": 9.168750303824084e-05, + "loss": 0.9192, + "step": 169 + }, + { + "epoch": 0.544, + "grad_norm": 0.5534300778342244, + "learning_rate": 9.065130924199998e-05, + "loss": 0.9246, + "step": 170 + }, + { + "epoch": 0.5472, + "grad_norm": 0.5234553987672043, + "learning_rate": 8.961612710146934e-05, + "loss": 0.803, + "step": 171 + }, + { + "epoch": 0.5504, + "grad_norm": 0.5076566664174726, + "learning_rate": 8.858206863746018e-05, + "loss": 0.831, + "step": 172 + }, + { + "epoch": 0.5536, + "grad_norm": 0.5590093677334266, + "learning_rate": 8.754924574918675e-05, + "loss": 0.9798, + "step": 173 + }, + { + "epoch": 0.5568, + "grad_norm": 0.4781234381096328, + "learning_rate": 8.651777020215712e-05, + "loss": 0.8182, + "step": 174 + }, + { + "epoch": 0.56, + "grad_norm": 0.5475350234616201, + "learning_rate": 8.548775361607872e-05, + "loss": 0.9335, + "step": 175 + }, + { + "epoch": 0.5632, + "grad_norm": 0.5220333579306987, + "learning_rate": 8.445930745277953e-05, + "loss": 0.9167, + "step": 176 + }, + { + "epoch": 0.5664, + "grad_norm": 0.5061996867916019, + "learning_rate": 8.343254300414628e-05, + "loss": 0.7998, + "step": 177 + }, + { + "epoch": 0.5696, + "grad_norm": 0.7067608388548705, + "learning_rate": 8.240757138008149e-05, + "loss": 0.9451, + "step": 178 + }, + { + "epoch": 0.5728, + "grad_norm": 0.4748422317049764, + "learning_rate": 8.138450349647936e-05, + "loss": 0.8049, + "step": 179 + }, + { + "epoch": 0.576, + "grad_norm": 0.5141359500013931, + "learning_rate": 8.036345006322359e-05, + "loss": 0.8825, + "step": 180 + }, + { + "epoch": 0.5792, + "grad_norm": 0.4538449077038628, + "learning_rate": 7.934452157220694e-05, + "loss": 0.8246, + "step": 181 + }, + { + "epoch": 0.5824, + "grad_norm": 0.5812532103713188, + "learning_rate": 7.832782828537437e-05, + "loss": 0.93, + "step": 182 + }, + { + "epoch": 0.5856, + "grad_norm": 0.40602200816110934, + "learning_rate": 7.731348022279134e-05, + "loss": 0.7816, + "step": 183 + }, + { + "epoch": 0.5888, + "grad_norm": 0.44425664814082716, + "learning_rate": 7.630158715073813e-05, + "loss": 0.8097, + "step": 184 + }, + { + "epoch": 0.592, + "grad_norm": 0.6300211393176885, + "learning_rate": 7.52922585698315e-05, + "loss": 0.8242, + "step": 185 + }, + { + "epoch": 0.5952, + "grad_norm": 0.4531748578835238, + "learning_rate": 7.428560370317542e-05, + "loss": 0.7426, + "step": 186 + }, + { + "epoch": 0.5984, + "grad_norm": 0.792622749855944, + "learning_rate": 7.328173148454151e-05, + "loss": 0.6975, + "step": 187 + }, + { + "epoch": 0.6016, + "grad_norm": 0.39359733135436836, + "learning_rate": 7.228075054658096e-05, + "loss": 0.8199, + "step": 188 + }, + { + "epoch": 0.6048, + "grad_norm": 0.6852950223585511, + "learning_rate": 7.1282769209069e-05, + "loss": 1.0539, + "step": 189 + }, + { + "epoch": 0.608, + "grad_norm": 0.5384589258458015, + "learning_rate": 7.028789546718326e-05, + "loss": 0.8291, + "step": 190 + }, + { + "epoch": 0.6112, + "grad_norm": 0.4324933046911287, + "learning_rate": 6.929623697981718e-05, + "loss": 0.7547, + "step": 191 + }, + { + "epoch": 0.6144, + "grad_norm": 0.468709727430381, + "learning_rate": 6.830790105792973e-05, + "loss": 0.7981, + "step": 192 + }, + { + "epoch": 0.6176, + "grad_norm": 0.6803853686918507, + "learning_rate": 6.732299465293322e-05, + "loss": 0.9394, + "step": 193 + }, + { + "epoch": 0.6208, + "grad_norm": 0.4825189840086367, + "learning_rate": 6.63416243451194e-05, + "loss": 0.8202, + "step": 194 + }, + { + "epoch": 0.624, + "grad_norm": 0.5144186539202125, + "learning_rate": 6.536389633212609e-05, + "loss": 0.7711, + "step": 195 + }, + { + "epoch": 0.6272, + "grad_norm": 0.4263396250132595, + "learning_rate": 6.43899164174453e-05, + "loss": 0.8088, + "step": 196 + }, + { + "epoch": 0.6304, + "grad_norm": 0.49941845430127096, + "learning_rate": 6.341978999897365e-05, + "loss": 0.8169, + "step": 197 + }, + { + "epoch": 0.6336, + "grad_norm": 0.5682021495903111, + "learning_rate": 6.245362205760704e-05, + "loss": 0.7438, + "step": 198 + }, + { + "epoch": 0.6368, + "grad_norm": 0.537687101429346, + "learning_rate": 6.149151714588009e-05, + "loss": 0.8825, + "step": 199 + }, + { + "epoch": 0.64, + "grad_norm": 0.4635947680984124, + "learning_rate": 6.053357937665237e-05, + "loss": 0.7918, + "step": 200 + }, + { + "epoch": 0.6432, + "grad_norm": 0.558291688101491, + "learning_rate": 5.957991241184184e-05, + "loss": 0.8062, + "step": 201 + }, + { + "epoch": 0.6464, + "grad_norm": 0.4657827288564771, + "learning_rate": 5.863061945120719e-05, + "loss": 0.878, + "step": 202 + }, + { + "epoch": 0.6496, + "grad_norm": 0.6475849514054947, + "learning_rate": 5.768580322118034e-05, + "loss": 0.8601, + "step": 203 + }, + { + "epoch": 0.6528, + "grad_norm": 0.4721925507581555, + "learning_rate": 5.6745565963749925e-05, + "loss": 0.8426, + "step": 204 + }, + { + "epoch": 0.656, + "grad_norm": 0.625925634093718, + "learning_rate": 5.5810009425397294e-05, + "loss": 0.9321, + "step": 205 + }, + { + "epoch": 0.6592, + "grad_norm": 0.7033747672815854, + "learning_rate": 5.487923484608629e-05, + "loss": 0.9268, + "step": 206 + }, + { + "epoch": 0.6624, + "grad_norm": 0.39977159046362765, + "learning_rate": 5.395334294830765e-05, + "loss": 0.6701, + "step": 207 + }, + { + "epoch": 0.6656, + "grad_norm": 0.46359917113099386, + "learning_rate": 5.3032433926179395e-05, + "loss": 0.8442, + "step": 208 + }, + { + "epoch": 0.6688, + "grad_norm": 0.4751301690283989, + "learning_rate": 5.211660743460458e-05, + "loss": 0.8074, + "step": 209 + }, + { + "epoch": 0.672, + "grad_norm": 0.5337082674747815, + "learning_rate": 5.1205962578487155e-05, + "loss": 0.864, + "step": 210 + }, + { + "epoch": 0.6752, + "grad_norm": 0.5196500203640496, + "learning_rate": 5.030059790200756e-05, + "loss": 0.8015, + "step": 211 + }, + { + "epoch": 0.6784, + "grad_norm": 0.5112184951236864, + "learning_rate": 4.940061137795876e-05, + "loss": 0.8287, + "step": 212 + }, + { + "epoch": 0.6816, + "grad_norm": 0.5304281425423548, + "learning_rate": 4.850610039714444e-05, + "loss": 0.8749, + "step": 213 + }, + { + "epoch": 0.6848, + "grad_norm": 0.5528623490299155, + "learning_rate": 4.761716175783989e-05, + "loss": 0.8318, + "step": 214 + }, + { + "epoch": 0.688, + "grad_norm": 0.6111246948996016, + "learning_rate": 4.673389165531714e-05, + "loss": 0.9082, + "step": 215 + }, + { + "epoch": 0.6912, + "grad_norm": 0.4635896756486311, + "learning_rate": 4.585638567143529e-05, + "loss": 0.8146, + "step": 216 + }, + { + "epoch": 0.6944, + "grad_norm": 0.40035242880443134, + "learning_rate": 4.498473876429726e-05, + "loss": 0.7578, + "step": 217 + }, + { + "epoch": 0.6976, + "grad_norm": 0.45841192895687083, + "learning_rate": 4.411904525797408e-05, + "loss": 0.8402, + "step": 218 + }, + { + "epoch": 0.7008, + "grad_norm": 0.4856982628923901, + "learning_rate": 4.325939883229766e-05, + "loss": 0.883, + "step": 219 + }, + { + "epoch": 0.704, + "grad_norm": 0.47289227158961605, + "learning_rate": 4.240589251272342e-05, + "loss": 0.776, + "step": 220 + }, + { + "epoch": 0.7072, + "grad_norm": 0.47530983676727984, + "learning_rate": 4.155861866026364e-05, + "loss": 0.8598, + "step": 221 + }, + { + "epoch": 0.7104, + "grad_norm": 0.4884455177461908, + "learning_rate": 4.071766896149273e-05, + "loss": 0.7445, + "step": 222 + }, + { + "epoch": 0.7136, + "grad_norm": 0.44316312426098886, + "learning_rate": 3.988313441862553e-05, + "loss": 0.734, + "step": 223 + }, + { + "epoch": 0.7168, + "grad_norm": 0.5030904138998532, + "learning_rate": 3.9055105339669595e-05, + "loss": 0.8837, + "step": 224 + }, + { + "epoch": 0.72, + "grad_norm": 0.6038132471996558, + "learning_rate": 3.823367132865265e-05, + "loss": 0.8938, + "step": 225 + }, + { + "epoch": 0.7232, + "grad_norm": 0.5260197404776972, + "learning_rate": 3.741892127592625e-05, + "loss": 0.7718, + "step": 226 + }, + { + "epoch": 0.7264, + "grad_norm": 0.34691547214333723, + "learning_rate": 3.6610943348546526e-05, + "loss": 0.6221, + "step": 227 + }, + { + "epoch": 0.7296, + "grad_norm": 0.5275459072919113, + "learning_rate": 3.580982498073344e-05, + "loss": 0.7895, + "step": 228 + }, + { + "epoch": 0.7328, + "grad_norm": 0.5570304110125733, + "learning_rate": 3.501565286440914e-05, + "loss": 0.8978, + "step": 229 + }, + { + "epoch": 0.736, + "grad_norm": 0.6065567996234003, + "learning_rate": 3.422851293981676e-05, + "loss": 0.8329, + "step": 230 + }, + { + "epoch": 0.7392, + "grad_norm": 0.523924582372723, + "learning_rate": 3.3448490386220355e-05, + "loss": 0.8316, + "step": 231 + }, + { + "epoch": 0.7424, + "grad_norm": 0.5781104737849966, + "learning_rate": 3.2675669612687565e-05, + "loss": 0.872, + "step": 232 + }, + { + "epoch": 0.7456, + "grad_norm": 0.6195051512406304, + "learning_rate": 3.191013424895536e-05, + "loss": 0.8944, + "step": 233 + }, + { + "epoch": 0.7488, + "grad_norm": 0.5125990141611612, + "learning_rate": 3.115196713638e-05, + "loss": 0.8307, + "step": 234 + }, + { + "epoch": 0.752, + "grad_norm": 0.4176635222803518, + "learning_rate": 3.040125031897264e-05, + "loss": 0.7378, + "step": 235 + }, + { + "epoch": 0.7552, + "grad_norm": 0.4996266471649693, + "learning_rate": 2.9658065034520978e-05, + "loss": 0.7762, + "step": 236 + }, + { + "epoch": 0.7584, + "grad_norm": 0.4586927896452429, + "learning_rate": 2.892249170579826e-05, + "loss": 0.8369, + "step": 237 + }, + { + "epoch": 0.7616, + "grad_norm": 0.4747263380295228, + "learning_rate": 2.8194609931860316e-05, + "loss": 0.7604, + "step": 238 + }, + { + "epoch": 0.7648, + "grad_norm": 0.4513328285687335, + "learning_rate": 2.7474498479432087e-05, + "loss": 0.7674, + "step": 239 + }, + { + "epoch": 0.768, + "grad_norm": 0.4824499498411078, + "learning_rate": 2.6762235274383772e-05, + "loss": 0.8047, + "step": 240 + }, + { + "epoch": 0.7712, + "grad_norm": 0.44532563946804304, + "learning_rate": 2.6057897393298324e-05, + "loss": 0.7169, + "step": 241 + }, + { + "epoch": 0.7744, + "grad_norm": 0.4534563140397689, + "learning_rate": 2.536156105513062e-05, + "loss": 0.7372, + "step": 242 + }, + { + "epoch": 0.7776, + "grad_norm": 0.5092400552652228, + "learning_rate": 2.4673301612959654e-05, + "loss": 0.878, + "step": 243 + }, + { + "epoch": 0.7808, + "grad_norm": 0.48484441028887837, + "learning_rate": 2.399319354583418e-05, + "loss": 0.8378, + "step": 244 + }, + { + "epoch": 0.784, + "grad_norm": 0.49988786684234965, + "learning_rate": 2.3321310450713062e-05, + "loss": 0.8989, + "step": 245 + }, + { + "epoch": 0.7872, + "grad_norm": 0.5036027600886066, + "learning_rate": 2.265772503450122e-05, + "loss": 0.8252, + "step": 246 + }, + { + "epoch": 0.7904, + "grad_norm": 0.5663754997840815, + "learning_rate": 2.2002509106181624e-05, + "loss": 0.813, + "step": 247 + }, + { + "epoch": 0.7936, + "grad_norm": 0.48522149898487005, + "learning_rate": 2.1355733569044635e-05, + "loss": 0.7892, + "step": 248 + }, + { + "epoch": 0.7968, + "grad_norm": 0.498470979551735, + "learning_rate": 2.0717468413015283e-05, + "loss": 0.7893, + "step": 249 + }, + { + "epoch": 0.8, + "grad_norm": 0.43696460497769457, + "learning_rate": 2.008778270707944e-05, + "loss": 0.6798, + "step": 250 + }, + { + "epoch": 0.8032, + "grad_norm": 0.5250712861371837, + "learning_rate": 1.946674459180955e-05, + "loss": 0.8043, + "step": 251 + }, + { + "epoch": 0.8064, + "grad_norm": 0.38732002126004367, + "learning_rate": 1.8854421271990964e-05, + "loss": 0.6926, + "step": 252 + }, + { + "epoch": 0.8096, + "grad_norm": 0.5840927286804334, + "learning_rate": 1.8250879009349398e-05, + "loss": 0.7869, + "step": 253 + }, + { + "epoch": 0.8128, + "grad_norm": 0.41480769742238666, + "learning_rate": 1.7656183115380577e-05, + "loss": 0.742, + "step": 254 + }, + { + "epoch": 0.816, + "grad_norm": 0.46402711009604686, + "learning_rate": 1.707039794428259e-05, + "loss": 0.8465, + "step": 255 + }, + { + "epoch": 0.8192, + "grad_norm": 0.4769618114279666, + "learning_rate": 1.649358688599191e-05, + "loss": 0.7962, + "step": 256 + }, + { + "epoch": 0.8224, + "grad_norm": 0.49153701380015713, + "learning_rate": 1.5925812359323745e-05, + "loss": 0.8428, + "step": 257 + }, + { + "epoch": 0.8256, + "grad_norm": 0.4583848547396149, + "learning_rate": 1.5367135805217458e-05, + "loss": 0.7666, + "step": 258 + }, + { + "epoch": 0.8288, + "grad_norm": 0.4537547977690445, + "learning_rate": 1.4817617680087825e-05, + "loss": 0.7135, + "step": 259 + }, + { + "epoch": 0.832, + "grad_norm": 0.4432071389256633, + "learning_rate": 1.4277317449282834e-05, + "loss": 0.804, + "step": 260 + }, + { + "epoch": 0.8352, + "grad_norm": 0.5707491083230274, + "learning_rate": 1.3746293580648717e-05, + "loss": 0.8564, + "step": 261 + }, + { + "epoch": 0.8384, + "grad_norm": 0.3892745505208706, + "learning_rate": 1.3224603538202929e-05, + "loss": 0.6756, + "step": 262 + }, + { + "epoch": 0.8416, + "grad_norm": 0.4258721911969603, + "learning_rate": 1.2712303775915802e-05, + "loss": 0.7472, + "step": 263 + }, + { + "epoch": 0.8448, + "grad_norm": 0.46365446554984235, + "learning_rate": 1.220944973160133e-05, + "loss": 0.8042, + "step": 264 + }, + { + "epoch": 0.848, + "grad_norm": 0.456740543797104, + "learning_rate": 1.1716095820918216e-05, + "loss": 0.8339, + "step": 265 + }, + { + "epoch": 0.8512, + "grad_norm": 0.547086237192044, + "learning_rate": 1.1232295431481222e-05, + "loss": 0.776, + "step": 266 + }, + { + "epoch": 0.8544, + "grad_norm": 0.44636264565543116, + "learning_rate": 1.0758100917083991e-05, + "loss": 0.8197, + "step": 267 + }, + { + "epoch": 0.8576, + "grad_norm": 0.47417519053417406, + "learning_rate": 1.0293563592033595e-05, + "loss": 0.8621, + "step": 268 + }, + { + "epoch": 0.8608, + "grad_norm": 0.5291019588943826, + "learning_rate": 9.838733725597615e-06, + "loss": 0.9103, + "step": 269 + }, + { + "epoch": 0.864, + "grad_norm": 0.6565436691143302, + "learning_rate": 9.393660536564408e-06, + "loss": 1.0123, + "step": 270 + }, + { + "epoch": 0.8672, + "grad_norm": 0.6482087047353086, + "learning_rate": 8.958392187916841e-06, + "loss": 0.9716, + "step": 271 + }, + { + "epoch": 0.8704, + "grad_norm": 0.5350159462679187, + "learning_rate": 8.532975781620512e-06, + "loss": 0.7482, + "step": 272 + }, + { + "epoch": 0.8736, + "grad_norm": 0.5034986091969066, + "learning_rate": 8.117457353526625e-06, + "loss": 0.876, + "step": 273 + }, + { + "epoch": 0.8768, + "grad_norm": 0.46017174925372106, + "learning_rate": 7.711881868390291e-06, + "loss": 0.8034, + "step": 274 + }, + { + "epoch": 0.88, + "grad_norm": 0.5321986712064487, + "learning_rate": 7.3162932150046885e-06, + "loss": 0.8218, + "step": 275 + }, + { + "epoch": 0.8832, + "grad_norm": 0.4811734868233411, + "learning_rate": 6.930734201451816e-06, + "loss": 0.7031, + "step": 276 + }, + { + "epoch": 0.8864, + "grad_norm": 0.6346894353744286, + "learning_rate": 6.555246550469907e-06, + "loss": 0.9176, + "step": 277 + }, + { + "epoch": 0.8896, + "grad_norm": 0.4838896705858339, + "learning_rate": 6.189870894938587e-06, + "loss": 0.7323, + "step": 278 + }, + { + "epoch": 0.8928, + "grad_norm": 0.5820365796986088, + "learning_rate": 5.834646773481811e-06, + "loss": 0.8958, + "step": 279 + }, + { + "epoch": 0.896, + "grad_norm": 0.4362300861578015, + "learning_rate": 5.489612626189245e-06, + "loss": 0.6991, + "step": 280 + }, + { + "epoch": 0.8992, + "grad_norm": 0.5078369607852672, + "learning_rate": 5.154805790456485e-06, + "loss": 0.8024, + "step": 281 + }, + { + "epoch": 0.9024, + "grad_norm": 0.40260744239183127, + "learning_rate": 4.830262496944693e-06, + "loss": 0.7251, + "step": 282 + }, + { + "epoch": 0.9056, + "grad_norm": 0.37593623442077545, + "learning_rate": 4.516017865659949e-06, + "loss": 0.6507, + "step": 283 + }, + { + "epoch": 0.9088, + "grad_norm": 0.5456906879732161, + "learning_rate": 4.21210590215273e-06, + "loss": 0.8119, + "step": 284 + }, + { + "epoch": 0.912, + "grad_norm": 0.47180073161871877, + "learning_rate": 3.918559493838114e-06, + "loss": 0.7475, + "step": 285 + }, + { + "epoch": 0.9152, + "grad_norm": 0.4958689938065469, + "learning_rate": 3.6354104064368566e-06, + "loss": 0.8696, + "step": 286 + }, + { + "epoch": 0.9184, + "grad_norm": 0.4308524491770669, + "learning_rate": 3.3626892805379562e-06, + "loss": 0.7467, + "step": 287 + }, + { + "epoch": 0.9216, + "grad_norm": 0.44350428945291354, + "learning_rate": 3.100425628282899e-06, + "loss": 0.766, + "step": 288 + }, + { + "epoch": 0.9248, + "grad_norm": 0.47896882190111284, + "learning_rate": 2.848647830172024e-06, + "loss": 0.803, + "step": 289 + }, + { + "epoch": 0.928, + "grad_norm": 0.48900554232333243, + "learning_rate": 2.607383131993424e-06, + "loss": 0.7949, + "step": 290 + }, + { + "epoch": 0.9312, + "grad_norm": 0.5539089973209115, + "learning_rate": 2.3766576418745022e-06, + "loss": 0.8044, + "step": 291 + }, + { + "epoch": 0.9344, + "grad_norm": 0.41239761860155166, + "learning_rate": 2.1564963274568027e-06, + "loss": 0.7603, + "step": 292 + }, + { + "epoch": 0.9376, + "grad_norm": 0.4819436482045979, + "learning_rate": 1.9469230131940907e-06, + "loss": 0.7729, + "step": 293 + }, + { + "epoch": 0.9408, + "grad_norm": 0.672490555461374, + "learning_rate": 1.7479603777742938e-06, + "loss": 0.9055, + "step": 294 + }, + { + "epoch": 0.944, + "grad_norm": 0.5497856014564447, + "learning_rate": 1.559629951665298e-06, + "loss": 0.7662, + "step": 295 + }, + { + "epoch": 0.9472, + "grad_norm": 0.7769076374803492, + "learning_rate": 1.3819521147851123e-06, + "loss": 0.935, + "step": 296 + }, + { + "epoch": 0.9504, + "grad_norm": 0.45798290239142614, + "learning_rate": 1.2149460942964098e-06, + "loss": 0.8093, + "step": 297 + }, + { + "epoch": 0.9536, + "grad_norm": 0.45082261847579425, + "learning_rate": 1.05862996252597e-06, + "loss": 0.7599, + "step": 298 + }, + { + "epoch": 0.9568, + "grad_norm": 0.4849731846586711, + "learning_rate": 9.130206350089765e-07, + "loss": 0.8297, + "step": 299 + }, + { + "epoch": 0.96, + "grad_norm": 0.5562776995082223, + "learning_rate": 7.781338686584927e-07, + "loss": 0.8792, + "step": 300 + }, + { + "epoch": 0.9632, + "grad_norm": 0.7246955746191942, + "learning_rate": 6.539842600603918e-07, + "loss": 0.8973, + "step": 301 + }, + { + "epoch": 0.9664, + "grad_norm": 0.4508444268939336, + "learning_rate": 5.405852438937764e-07, + "loss": 0.7073, + "step": 302 + }, + { + "epoch": 0.9696, + "grad_norm": 0.4665854388410085, + "learning_rate": 4.3794909147720773e-07, + "loss": 0.824, + "step": 303 + }, + { + "epoch": 0.9728, + "grad_norm": 0.4544600876679446, + "learning_rate": 3.4608690944071263e-07, + "loss": 0.7617, + "step": 304 + }, + { + "epoch": 0.976, + "grad_norm": 0.49710907207792343, + "learning_rate": 2.6500863852395584e-07, + "loss": 0.7671, + "step": 305 + }, + { + "epoch": 0.9792, + "grad_norm": 0.4276600021670932, + "learning_rate": 1.947230525005006e-07, + "loss": 0.6997, + "step": 306 + }, + { + "epoch": 0.9824, + "grad_norm": 0.556101610395927, + "learning_rate": 1.3523775722834587e-07, + "loss": 0.8193, + "step": 307 + }, + { + "epoch": 0.9856, + "grad_norm": 0.429165560718107, + "learning_rate": 8.655918982689581e-08, + "loss": 0.6873, + "step": 308 + }, + { + "epoch": 0.9888, + "grad_norm": 0.579147630975659, + "learning_rate": 4.8692617980350406e-08, + "loss": 0.8657, + "step": 309 + }, + { + "epoch": 0.992, + "grad_norm": 0.4764623215495487, + "learning_rate": 2.164213936770576e-08, + "loss": 0.8788, + "step": 310 + }, + { + "epoch": 0.9952, + "grad_norm": 0.5039653874346466, + "learning_rate": 5.410681219286673e-09, + "loss": 0.8246, + "step": 311 + }, + { + "epoch": 0.9984, + "grad_norm": 0.471304187050522, + "learning_rate": 0.0, + "loss": 0.734, + "step": 312 + }, + { + "epoch": 0.9984, + "step": 312, + "total_flos": 254243116089344.0, + "train_loss": 0.8617107205283947, + "train_runtime": 4634.4498, + "train_samples_per_second": 1.079, + "train_steps_per_second": 0.067 + } + ], + "logging_steps": 1.0, + "max_steps": 312, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 254243116089344.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_3_epochs_1_GA_4_lora/README.md b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_3_epochs_1_GA_4_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_3_epochs_1_GA_4_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_3_epochs_1_GA_4_lora/adapter_config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_3_epochs_1_GA_4_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a1b26538b2af09e21a3409e03eb76a8752d09a93 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_3_epochs_1_GA_4_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "down_proj", + "k_proj", + "up_proj", + "o_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_3_epochs_1_GA_4_lora/adapter_model.safetensors b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_3_epochs_1_GA_4_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9a54a5c280e1f81567f50eae7b177ac61279635f --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_3_epochs_1_GA_4_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63974bbae1ea49c85621abd1c04a4c1304ee81668d211a5baa370d67b275592c +size 671150064 diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_3_epochs_1_GA_4_lora/config.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_3_epochs_1_GA_4_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_3_epochs_1_GA_4_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_3_epochs_1_GA_4_lora/non_lora_trainables.bin b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_3_epochs_1_GA_4_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..933e8161a20b468cfc065e95580dc2c31e3aba4e --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_3_epochs_1_GA_4_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:014a18c938c8cb914531b9540f22824a872b48e0251fcb4bb5c302cd7fa4ce5b +size 918507402 diff --git a/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_3_epochs_1_GA_4_lora/trainer_state.json b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_3_epochs_1_GA_4_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..88d324766ad908a6cba631af3d3351e763ed5bee --- /dev/null +++ b/mixing_strategies/Equal/bugsBunny-v1_1-Llama-3-8B-V-Equal_dataset_5000_repeat_3_epochs_1_GA_4_lora/trainer_state.json @@ -0,0 +1,1134 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9984, + "eval_steps": 500, + "global_step": 156, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0064, + "grad_norm": 0.805546322586046, + "learning_rate": 4e-05, + "loss": 1.3165, + "step": 1 + }, + { + "epoch": 0.0128, + "grad_norm": 0.7545839075789395, + "learning_rate": 8e-05, + "loss": 1.3077, + "step": 2 + }, + { + "epoch": 0.0192, + "grad_norm": 0.5409861715811655, + "learning_rate": 0.00012, + "loss": 1.1229, + "step": 3 + }, + { + "epoch": 0.0256, + "grad_norm": 0.6197681443183749, + "learning_rate": 0.00016, + "loss": 1.1844, + "step": 4 + }, + { + "epoch": 0.032, + "grad_norm": 0.889292974821854, + "learning_rate": 0.0002, + "loss": 1.1847, + "step": 5 + }, + { + "epoch": 0.0384, + "grad_norm": 0.72954475633058, + "learning_rate": 0.0001999783578606323, + "loss": 1.0351, + "step": 6 + }, + { + "epoch": 0.0448, + "grad_norm": 0.562481463872672, + "learning_rate": 0.0001999134408101731, + "loss": 1.0284, + "step": 7 + }, + { + "epoch": 0.0512, + "grad_norm": 0.4542647162185984, + "learning_rate": 0.00019980527694749952, + "loss": 1.0089, + "step": 8 + }, + { + "epoch": 0.0576, + "grad_norm": 0.44881440081227303, + "learning_rate": 0.0001996539130905593, + "loss": 0.9525, + "step": 9 + }, + { + "epoch": 0.064, + "grad_norm": 0.4573316397402673, + "learning_rate": 0.00019945941475610623, + "loss": 0.934, + "step": 10 + }, + { + "epoch": 0.0704, + "grad_norm": 0.5731092163782907, + "learning_rate": 0.0001992218661313415, + "loss": 0.9563, + "step": 11 + }, + { + "epoch": 0.0768, + "grad_norm": 0.5582550935793705, + "learning_rate": 0.00019894137003747403, + "loss": 0.9484, + "step": 12 + }, + { + "epoch": 0.0832, + "grad_norm": 0.48101884469079026, + "learning_rate": 0.00019861804788521493, + "loss": 1.0022, + "step": 13 + }, + { + "epoch": 0.0896, + "grad_norm": 0.44489440028982064, + "learning_rate": 0.00019825203962222572, + "loss": 0.9294, + "step": 14 + }, + { + "epoch": 0.096, + "grad_norm": 0.4140640546403325, + "learning_rate": 0.00019784350367254322, + "loss": 0.9036, + "step": 15 + }, + { + "epoch": 0.1024, + "grad_norm": 0.4452589302938284, + "learning_rate": 0.0001973926168680066, + "loss": 0.9623, + "step": 16 + }, + { + "epoch": 0.1088, + "grad_norm": 0.39179819506926955, + "learning_rate": 0.0001968995743717171, + "loss": 0.9067, + "step": 17 + }, + { + "epoch": 0.1152, + "grad_norm": 0.39366752200691585, + "learning_rate": 0.00019636458959356316, + "loss": 0.9528, + "step": 18 + }, + { + "epoch": 0.1216, + "grad_norm": 0.39803846092630674, + "learning_rate": 0.00019578789409784727, + "loss": 0.9095, + "step": 19 + }, + { + "epoch": 0.128, + "grad_norm": 0.4125166760813841, + "learning_rate": 0.00019516973750305532, + "loss": 0.9084, + "step": 20 + }, + { + "epoch": 0.1344, + "grad_norm": 0.367045512037434, + "learning_rate": 0.00019451038737381077, + "loss": 0.9115, + "step": 21 + }, + { + "epoch": 0.1408, + "grad_norm": 0.4204643990614447, + "learning_rate": 0.00019381012910506146, + "loss": 0.9652, + "step": 22 + }, + { + "epoch": 0.1472, + "grad_norm": 0.4234540671955711, + "learning_rate": 0.00019306926579854821, + "loss": 0.9077, + "step": 23 + }, + { + "epoch": 0.1536, + "grad_norm": 0.41848674883765574, + "learning_rate": 0.0001922881181316097, + "loss": 1.0333, + "step": 24 + }, + { + "epoch": 0.16, + "grad_norm": 0.5322952744746656, + "learning_rate": 0.0001914670242183795, + "loss": 0.9422, + "step": 25 + }, + { + "epoch": 0.1664, + "grad_norm": 0.37120088149684727, + "learning_rate": 0.0001906063394634356, + "loss": 0.8186, + "step": 26 + }, + { + "epoch": 0.1728, + "grad_norm": 0.6759481035130391, + "learning_rate": 0.00018970643640796642, + "loss": 0.9278, + "step": 27 + }, + { + "epoch": 0.1792, + "grad_norm": 0.38228002308368947, + "learning_rate": 0.00018876770456851877, + "loss": 0.9193, + "step": 28 + }, + { + "epoch": 0.1856, + "grad_norm": 0.3865683631565096, + "learning_rate": 0.00018779055026839868, + "loss": 0.9207, + "step": 29 + }, + { + "epoch": 0.192, + "grad_norm": 0.3965746057935295, + "learning_rate": 0.00018677539646179707, + "loss": 0.8772, + "step": 30 + }, + { + "epoch": 0.1984, + "grad_norm": 0.3858422141117998, + "learning_rate": 0.00018572268255071718, + "loss": 0.9648, + "step": 31 + }, + { + "epoch": 0.2048, + "grad_norm": 0.35659419025313677, + "learning_rate": 0.00018463286419478255, + "loss": 0.8024, + "step": 32 + }, + { + "epoch": 0.2112, + "grad_norm": 0.39768086056016644, + "learning_rate": 0.00018350641311400812, + "loss": 0.8391, + "step": 33 + }, + { + "epoch": 0.2176, + "grad_norm": 0.4335853573049478, + "learning_rate": 0.00018234381688461942, + "loss": 0.9007, + "step": 34 + }, + { + "epoch": 0.224, + "grad_norm": 0.4180259491270757, + "learning_rate": 0.00018114557872800905, + "loss": 0.8795, + "step": 35 + }, + { + "epoch": 0.2304, + "grad_norm": 0.36587131891456703, + "learning_rate": 0.0001799122172929206, + "loss": 0.8591, + "step": 36 + }, + { + "epoch": 0.2368, + "grad_norm": 0.39498933797878183, + "learning_rate": 0.0001786442664309554, + "loss": 0.8652, + "step": 37 + }, + { + "epoch": 0.2432, + "grad_norm": 0.3948791811440726, + "learning_rate": 0.0001773422749654988, + "loss": 0.8694, + "step": 38 + }, + { + "epoch": 0.2496, + "grad_norm": 0.4665878754601238, + "learning_rate": 0.00017600680645416583, + "loss": 0.9099, + "step": 39 + }, + { + "epoch": 0.256, + "grad_norm": 0.4417260562598781, + "learning_rate": 0.00017463843894486937, + "loss": 0.9309, + "step": 40 + }, + { + "epoch": 0.2624, + "grad_norm": 0.41926910617947233, + "learning_rate": 0.00017323776472561627, + "loss": 0.9185, + "step": 41 + }, + { + "epoch": 0.2688, + "grad_norm": 0.33804324283430204, + "learning_rate": 0.0001718053900681397, + "loss": 0.8653, + "step": 42 + }, + { + "epoch": 0.2752, + "grad_norm": 0.3251706874587237, + "learning_rate": 0.00017034193496547902, + "loss": 0.8152, + "step": 43 + }, + { + "epoch": 0.2816, + "grad_norm": 0.34873519250922863, + "learning_rate": 0.00016884803286362, + "loss": 0.8466, + "step": 44 + }, + { + "epoch": 0.288, + "grad_norm": 0.3374558679907544, + "learning_rate": 0.00016732433038731242, + "loss": 0.8324, + "step": 45 + }, + { + "epoch": 0.2944, + "grad_norm": 0.3370814914034644, + "learning_rate": 0.00016577148706018328, + "loss": 0.7825, + "step": 46 + }, + { + "epoch": 0.3008, + "grad_norm": 0.39413321334376433, + "learning_rate": 0.00016419017501926656, + "loss": 0.8575, + "step": 47 + }, + { + "epoch": 0.3072, + "grad_norm": 0.37798799795124327, + "learning_rate": 0.00016258107872407375, + "loss": 0.8946, + "step": 48 + }, + { + "epoch": 0.3136, + "grad_norm": 0.37049011642321633, + "learning_rate": 0.00016094489466033043, + "loss": 0.7573, + "step": 49 + }, + { + "epoch": 0.32, + "grad_norm": 0.38681120648438766, + "learning_rate": 0.0001592823310385073, + "loss": 0.9045, + "step": 50 + }, + { + "epoch": 0.3264, + "grad_norm": 0.40835086499060924, + "learning_rate": 0.00015759410748727662, + "loss": 0.867, + "step": 51 + }, + { + "epoch": 0.3328, + "grad_norm": 0.413703584869729, + "learning_rate": 0.00015588095474202595, + "loss": 0.8734, + "step": 52 + }, + { + "epoch": 0.3392, + "grad_norm": 0.4309517025934206, + "learning_rate": 0.00015414361432856475, + "loss": 0.9699, + "step": 53 + }, + { + "epoch": 0.3456, + "grad_norm": 0.36248918663875634, + "learning_rate": 0.00015238283824216015, + "loss": 0.7183, + "step": 54 + }, + { + "epoch": 0.352, + "grad_norm": 0.4062156464684849, + "learning_rate": 0.00015059938862204127, + "loss": 0.8833, + "step": 55 + }, + { + "epoch": 0.3584, + "grad_norm": 0.3370276280908736, + "learning_rate": 0.00014879403742151283, + "loss": 0.7916, + "step": 56 + }, + { + "epoch": 0.3648, + "grad_norm": 0.35014515823892844, + "learning_rate": 0.0001469675660738206, + "loss": 0.8683, + "step": 57 + }, + { + "epoch": 0.3712, + "grad_norm": 0.3783891730128175, + "learning_rate": 0.00014512076515391375, + "loss": 0.8471, + "step": 58 + }, + { + "epoch": 0.3776, + "grad_norm": 0.4570925963839151, + "learning_rate": 0.0001432544340362501, + "loss": 0.9177, + "step": 59 + }, + { + "epoch": 0.384, + "grad_norm": 0.4163069571860744, + "learning_rate": 0.00014136938054879283, + "loss": 0.9249, + "step": 60 + }, + { + "epoch": 0.3904, + "grad_norm": 0.40422561275720553, + "learning_rate": 0.00013946642062334766, + "loss": 0.8832, + "step": 61 + }, + { + "epoch": 0.3968, + "grad_norm": 0.4052479343056615, + "learning_rate": 0.000137546377942393, + "loss": 0.8903, + "step": 62 + }, + { + "epoch": 0.4032, + "grad_norm": 0.38958133602024597, + "learning_rate": 0.00013561008358255468, + "loss": 0.8973, + "step": 63 + }, + { + "epoch": 0.4096, + "grad_norm": 0.6475031242958649, + "learning_rate": 0.00013365837565488064, + "loss": 0.821, + "step": 64 + }, + { + "epoch": 0.416, + "grad_norm": 0.44626049108693816, + "learning_rate": 0.0001316920989420703, + "loss": 0.9273, + "step": 65 + }, + { + "epoch": 0.4224, + "grad_norm": 0.3812837364086625, + "learning_rate": 0.00012971210453281674, + "loss": 0.8877, + "step": 66 + }, + { + "epoch": 0.4288, + "grad_norm": 0.36872511821962134, + "learning_rate": 0.00012771924945341906, + "loss": 0.8404, + "step": 67 + }, + { + "epoch": 0.4352, + "grad_norm": 0.44849089068023396, + "learning_rate": 0.0001257143962968246, + "loss": 0.8171, + "step": 68 + }, + { + "epoch": 0.4416, + "grad_norm": 0.3835328292811916, + "learning_rate": 0.00012369841284926188, + "loss": 0.8085, + "step": 69 + }, + { + "epoch": 0.448, + "grad_norm": 0.3511263955012711, + "learning_rate": 0.00012167217171462566, + "loss": 0.8205, + "step": 70 + }, + { + "epoch": 0.4544, + "grad_norm": 0.4207355319366397, + "learning_rate": 0.00011963654993677645, + "loss": 0.8753, + "step": 71 + }, + { + "epoch": 0.4608, + "grad_norm": 0.37549463268898003, + "learning_rate": 0.00011759242861991855, + "loss": 0.8874, + "step": 72 + }, + { + "epoch": 0.4672, + "grad_norm": 0.36121802031305045, + "learning_rate": 0.00011554069254722051, + "loss": 0.8625, + "step": 73 + }, + { + "epoch": 0.4736, + "grad_norm": 0.37456200615779306, + "learning_rate": 0.00011348222979784289, + "loss": 0.8337, + "step": 74 + }, + { + "epoch": 0.48, + "grad_norm": 0.3643015422631111, + "learning_rate": 0.00011141793136253986, + "loss": 0.7948, + "step": 75 + }, + { + "epoch": 0.4864, + "grad_norm": 0.3254749605120823, + "learning_rate": 0.000109348690758, + "loss": 0.8123, + "step": 76 + }, + { + "epoch": 0.4928, + "grad_norm": 0.36585814792272303, + "learning_rate": 0.0001072754036400944, + "loss": 0.8231, + "step": 77 + }, + { + "epoch": 0.4992, + "grad_norm": 0.30338389028077356, + "learning_rate": 0.00010519896741619803, + "loss": 0.7762, + "step": 78 + }, + { + "epoch": 0.5056, + "grad_norm": 0.541897142047526, + "learning_rate": 0.00010312028085675391, + "loss": 0.8549, + "step": 79 + }, + { + "epoch": 0.512, + "grad_norm": 0.3943300521754534, + "learning_rate": 0.00010104024370624644, + "loss": 0.8574, + "step": 80 + }, + { + "epoch": 0.5184, + "grad_norm": 0.3297066953221598, + "learning_rate": 9.895975629375359e-05, + "loss": 0.7177, + "step": 81 + }, + { + "epoch": 0.5248, + "grad_norm": 0.3831654106693363, + "learning_rate": 9.687971914324607e-05, + "loss": 0.8078, + "step": 82 + }, + { + "epoch": 0.5312, + "grad_norm": 0.34403521876366816, + "learning_rate": 9.480103258380198e-05, + "loss": 0.8362, + "step": 83 + }, + { + "epoch": 0.5376, + "grad_norm": 0.3218284170360931, + "learning_rate": 9.272459635990562e-05, + "loss": 0.8062, + "step": 84 + }, + { + "epoch": 0.544, + "grad_norm": 0.421195702498038, + "learning_rate": 9.065130924199998e-05, + "loss": 0.9156, + "step": 85 + }, + { + "epoch": 0.5504, + "grad_norm": 0.38098554789749073, + "learning_rate": 8.858206863746018e-05, + "loss": 0.8168, + "step": 86 + }, + { + "epoch": 0.5568, + "grad_norm": 0.40882368036803046, + "learning_rate": 8.651777020215712e-05, + "loss": 0.9008, + "step": 87 + }, + { + "epoch": 0.5632, + "grad_norm": 0.384069393195924, + "learning_rate": 8.445930745277953e-05, + "loss": 0.9222, + "step": 88 + }, + { + "epoch": 0.5696, + "grad_norm": 0.4558671792764743, + "learning_rate": 8.240757138008149e-05, + "loss": 0.872, + "step": 89 + }, + { + "epoch": 0.576, + "grad_norm": 0.35614041648667805, + "learning_rate": 8.036345006322359e-05, + "loss": 0.8417, + "step": 90 + }, + { + "epoch": 0.5824, + "grad_norm": 0.40081088790439307, + "learning_rate": 7.832782828537437e-05, + "loss": 0.8763, + "step": 91 + }, + { + "epoch": 0.5888, + "grad_norm": 0.32197931672413255, + "learning_rate": 7.630158715073813e-05, + "loss": 0.7946, + "step": 92 + }, + { + "epoch": 0.5952, + "grad_norm": 0.39582034739433736, + "learning_rate": 7.428560370317542e-05, + "loss": 0.783, + "step": 93 + }, + { + "epoch": 0.6016, + "grad_norm": 0.2901849264572683, + "learning_rate": 7.228075054658096e-05, + "loss": 0.761, + "step": 94 + }, + { + "epoch": 0.608, + "grad_norm": 0.45644415157439366, + "learning_rate": 7.028789546718326e-05, + "loss": 0.946, + "step": 95 + }, + { + "epoch": 0.6144, + "grad_norm": 0.35184756415709373, + "learning_rate": 6.830790105792973e-05, + "loss": 0.7784, + "step": 96 + }, + { + "epoch": 0.6208, + "grad_norm": 0.41519392152425355, + "learning_rate": 6.63416243451194e-05, + "loss": 0.8843, + "step": 97 + }, + { + "epoch": 0.6272, + "grad_norm": 0.519841037292034, + "learning_rate": 6.43899164174453e-05, + "loss": 0.7931, + "step": 98 + }, + { + "epoch": 0.6336, + "grad_norm": 0.38514886035821516, + "learning_rate": 6.245362205760704e-05, + "loss": 0.7811, + "step": 99 + }, + { + "epoch": 0.64, + "grad_norm": 0.38223327718531364, + "learning_rate": 6.053357937665237e-05, + "loss": 0.8398, + "step": 100 + }, + { + "epoch": 0.6464, + "grad_norm": 0.3620194195609992, + "learning_rate": 5.863061945120719e-05, + "loss": 0.8452, + "step": 101 + }, + { + "epoch": 0.6528, + "grad_norm": 0.40812166227656616, + "learning_rate": 5.6745565963749925e-05, + "loss": 0.8517, + "step": 102 + }, + { + "epoch": 0.6592, + "grad_norm": 0.4850016643582673, + "learning_rate": 5.487923484608629e-05, + "loss": 0.9341, + "step": 103 + }, + { + "epoch": 0.6656, + "grad_norm": 0.31814201642044526, + "learning_rate": 5.3032433926179395e-05, + "loss": 0.7607, + "step": 104 + }, + { + "epoch": 0.672, + "grad_norm": 0.38422423276934436, + "learning_rate": 5.1205962578487155e-05, + "loss": 0.8354, + "step": 105 + }, + { + "epoch": 0.6784, + "grad_norm": 0.3749895963805748, + "learning_rate": 4.940061137795876e-05, + "loss": 0.8145, + "step": 106 + }, + { + "epoch": 0.6848, + "grad_norm": 0.3892782930574169, + "learning_rate": 4.761716175783989e-05, + "loss": 0.8538, + "step": 107 + }, + { + "epoch": 0.6912, + "grad_norm": 0.40519227449724154, + "learning_rate": 4.585638567143529e-05, + "loss": 0.866, + "step": 108 + }, + { + "epoch": 0.6976, + "grad_norm": 0.32488927233195825, + "learning_rate": 4.411904525797408e-05, + "loss": 0.801, + "step": 109 + }, + { + "epoch": 0.704, + "grad_norm": 0.3446736636811925, + "learning_rate": 4.240589251272342e-05, + "loss": 0.8335, + "step": 110 + }, + { + "epoch": 0.7104, + "grad_norm": 0.31596911798818506, + "learning_rate": 4.071766896149273e-05, + "loss": 0.8055, + "step": 111 + }, + { + "epoch": 0.7168, + "grad_norm": 0.3309994721343404, + "learning_rate": 3.9055105339669595e-05, + "loss": 0.8149, + "step": 112 + }, + { + "epoch": 0.7232, + "grad_norm": 0.3987584810803584, + "learning_rate": 3.741892127592625e-05, + "loss": 0.8379, + "step": 113 + }, + { + "epoch": 0.7296, + "grad_norm": 0.3628992489997569, + "learning_rate": 3.580982498073344e-05, + "loss": 0.7083, + "step": 114 + }, + { + "epoch": 0.736, + "grad_norm": 0.41185878963669303, + "learning_rate": 3.422851293981676e-05, + "loss": 0.8743, + "step": 115 + }, + { + "epoch": 0.7424, + "grad_norm": 0.3731843655247591, + "learning_rate": 3.2675669612687565e-05, + "loss": 0.8586, + "step": 116 + }, + { + "epoch": 0.7488, + "grad_norm": 0.38977168134862195, + "learning_rate": 3.115196713638e-05, + "loss": 0.8659, + "step": 117 + }, + { + "epoch": 0.7552, + "grad_norm": 0.33038949792135575, + "learning_rate": 2.9658065034520978e-05, + "loss": 0.7627, + "step": 118 + }, + { + "epoch": 0.7616, + "grad_norm": 0.3300325442346146, + "learning_rate": 2.8194609931860316e-05, + "loss": 0.8006, + "step": 119 + }, + { + "epoch": 0.768, + "grad_norm": 0.3463759373027119, + "learning_rate": 2.6762235274383772e-05, + "loss": 0.7917, + "step": 120 + }, + { + "epoch": 0.7744, + "grad_norm": 0.3311395087967572, + "learning_rate": 2.536156105513062e-05, + "loss": 0.7334, + "step": 121 + }, + { + "epoch": 0.7808, + "grad_norm": 0.35885955818374293, + "learning_rate": 2.399319354583418e-05, + "loss": 0.865, + "step": 122 + }, + { + "epoch": 0.7872, + "grad_norm": 0.3650198397583705, + "learning_rate": 2.265772503450122e-05, + "loss": 0.8657, + "step": 123 + }, + { + "epoch": 0.7936, + "grad_norm": 0.3825762896369597, + "learning_rate": 2.1355733569044635e-05, + "loss": 0.8075, + "step": 124 + }, + { + "epoch": 0.8, + "grad_norm": 0.33881715214348657, + "learning_rate": 2.008778270707944e-05, + "loss": 0.7396, + "step": 125 + }, + { + "epoch": 0.8064, + "grad_norm": 0.3410826447963918, + "learning_rate": 1.8854421271990964e-05, + "loss": 0.7526, + "step": 126 + }, + { + "epoch": 0.8128, + "grad_norm": 0.3430038284191712, + "learning_rate": 1.7656183115380577e-05, + "loss": 0.7662, + "step": 127 + }, + { + "epoch": 0.8192, + "grad_norm": 0.33893345771891265, + "learning_rate": 1.649358688599191e-05, + "loss": 0.8261, + "step": 128 + }, + { + "epoch": 0.8256, + "grad_norm": 0.3325405498591736, + "learning_rate": 1.5367135805217458e-05, + "loss": 0.8134, + "step": 129 + }, + { + "epoch": 0.832, + "grad_norm": 0.40576390349578906, + "learning_rate": 1.4277317449282834e-05, + "loss": 0.7655, + "step": 130 + }, + { + "epoch": 0.8384, + "grad_norm": 0.3716539369347945, + "learning_rate": 1.3224603538202929e-05, + "loss": 0.7717, + "step": 131 + }, + { + "epoch": 0.8448, + "grad_norm": 0.32600043431681647, + "learning_rate": 1.220944973160133e-05, + "loss": 0.7809, + "step": 132 + }, + { + "epoch": 0.8512, + "grad_norm": 0.37421174165734555, + "learning_rate": 1.1232295431481222e-05, + "loss": 0.8124, + "step": 133 + }, + { + "epoch": 0.8576, + "grad_norm": 0.3400461814317215, + "learning_rate": 1.0293563592033595e-05, + "loss": 0.8469, + "step": 134 + }, + { + "epoch": 0.864, + "grad_norm": 0.4235036346319738, + "learning_rate": 9.393660536564408e-06, + "loss": 0.9632, + "step": 135 + }, + { + "epoch": 0.8704, + "grad_norm": 0.4244963028171556, + "learning_rate": 8.532975781620512e-06, + "loss": 0.8652, + "step": 136 + }, + { + "epoch": 0.8768, + "grad_norm": 0.3594067107402808, + "learning_rate": 7.711881868390291e-06, + "loss": 0.8433, + "step": 137 + }, + { + "epoch": 0.8832, + "grad_norm": 0.39434917822992555, + "learning_rate": 6.930734201451816e-06, + "loss": 0.7691, + "step": 138 + }, + { + "epoch": 0.8896, + "grad_norm": 0.38861442712413796, + "learning_rate": 6.189870894938587e-06, + "loss": 0.8303, + "step": 139 + }, + { + "epoch": 0.896, + "grad_norm": 0.38875669196078927, + "learning_rate": 5.489612626189245e-06, + "loss": 0.8052, + "step": 140 + }, + { + "epoch": 0.9024, + "grad_norm": 0.3397586653857691, + "learning_rate": 4.830262496944693e-06, + "loss": 0.772, + "step": 141 + }, + { + "epoch": 0.9088, + "grad_norm": 0.3341366296530402, + "learning_rate": 4.21210590215273e-06, + "loss": 0.7425, + "step": 142 + }, + { + "epoch": 0.9152, + "grad_norm": 0.3713009002027508, + "learning_rate": 3.6354104064368566e-06, + "loss": 0.8139, + "step": 143 + }, + { + "epoch": 0.9216, + "grad_norm": 0.30749923385219086, + "learning_rate": 3.100425628282899e-06, + "loss": 0.7641, + "step": 144 + }, + { + "epoch": 0.928, + "grad_norm": 0.34419207913198757, + "learning_rate": 2.607383131993424e-06, + "loss": 0.8069, + "step": 145 + }, + { + "epoch": 0.9344, + "grad_norm": 0.35337499951942763, + "learning_rate": 2.1564963274568027e-06, + "loss": 0.7877, + "step": 146 + }, + { + "epoch": 0.9408, + "grad_norm": 0.44648853414876255, + "learning_rate": 1.7479603777742938e-06, + "loss": 0.8463, + "step": 147 + }, + { + "epoch": 0.9472, + "grad_norm": 0.47502886516564236, + "learning_rate": 1.3819521147851123e-06, + "loss": 0.8545, + "step": 148 + }, + { + "epoch": 0.9536, + "grad_norm": 0.33272757670347386, + "learning_rate": 1.05862996252597e-06, + "loss": 0.7896, + "step": 149 + }, + { + "epoch": 0.96, + "grad_norm": 0.37941906422833727, + "learning_rate": 7.781338686584927e-07, + "loss": 0.863, + "step": 150 + }, + { + "epoch": 0.9664, + "grad_norm": 0.4209815068037428, + "learning_rate": 5.405852438937764e-07, + "loss": 0.8091, + "step": 151 + }, + { + "epoch": 0.9728, + "grad_norm": 0.33933261841476176, + "learning_rate": 3.4608690944071263e-07, + "loss": 0.7988, + "step": 152 + }, + { + "epoch": 0.9792, + "grad_norm": 0.4686490374433693, + "learning_rate": 1.947230525005006e-07, + "loss": 0.7405, + "step": 153 + }, + { + "epoch": 0.9856, + "grad_norm": 0.35585369588278914, + "learning_rate": 8.655918982689581e-08, + "loss": 0.7565, + "step": 154 + }, + { + "epoch": 0.992, + "grad_norm": 0.3957559739532033, + "learning_rate": 2.164213936770576e-08, + "loss": 0.8821, + "step": 155 + }, + { + "epoch": 0.9984, + "grad_norm": 0.34527612989533474, + "learning_rate": 0.0, + "loss": 0.7851, + "step": 156 + }, + { + "epoch": 0.9984, + "step": 156, + "total_flos": 367856468492288.0, + "train_loss": 0.8643133678497412, + "train_runtime": 4620.4934, + "train_samples_per_second": 1.082, + "train_steps_per_second": 0.034 + } + ], + "logging_steps": 1.0, + "max_steps": 156, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 367856468492288.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_1_epochs_1_GA_2_lora/README.md b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_1_epochs_1_GA_2_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_1_epochs_1_GA_2_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_1_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_1_epochs_1_GA_2_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9c08a7236f5b6ee67240b8aaf4dec60e97c614d1 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_1_epochs_1_GA_2_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "down_proj", + "k_proj", + "v_proj", + "up_proj", + "gate_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9445e9b30cf8ce239c5df99919ce9826406e6938 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c4451942d3694fefe2770f4aa99d5a6f31c34de779fb84aa2e034f838cd4f98 +size 671150064 diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_1_epochs_1_GA_2_lora/config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_1_epochs_1_GA_2_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_1_epochs_1_GA_2_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..ac33173b3f4ccc1a4cc9e05dc4e953e6c99119b9 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:853d5c6dbc5cf587534c0e5def614db82b0e2b8e2ad193b90e147e0a8f33e495 +size 918507402 diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_1_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_1_epochs_1_GA_2_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..cc459e2c1ad6d4ac59b40adbbbd0e9f52df04d34 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_1_epochs_1_GA_2_lora/trainer_state.json @@ -0,0 +1,4417 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 625, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0016, + "grad_norm": 0.9469230770785747, + "learning_rate": 1.0526315789473684e-05, + "loss": 1.4071, + "step": 1 + }, + { + "epoch": 0.0032, + "grad_norm": 0.9282190567648156, + "learning_rate": 2.105263157894737e-05, + "loss": 1.4281, + "step": 2 + }, + { + "epoch": 0.0048, + "grad_norm": 0.9139111931263875, + "learning_rate": 3.157894736842105e-05, + "loss": 1.3374, + "step": 3 + }, + { + "epoch": 0.0064, + "grad_norm": 0.930828366559463, + "learning_rate": 4.210526315789474e-05, + "loss": 1.4627, + "step": 4 + }, + { + "epoch": 0.008, + "grad_norm": 0.8611553397462317, + "learning_rate": 5.2631578947368424e-05, + "loss": 1.3902, + "step": 5 + }, + { + "epoch": 0.0096, + "grad_norm": 1.0897129436057411, + "learning_rate": 6.31578947368421e-05, + "loss": 1.2479, + "step": 6 + }, + { + "epoch": 0.0112, + "grad_norm": 1.1752358157877894, + "learning_rate": 7.368421052631579e-05, + "loss": 1.0697, + "step": 7 + }, + { + "epoch": 0.0128, + "grad_norm": 1.6085116704511242, + "learning_rate": 8.421052631578948e-05, + "loss": 1.1268, + "step": 8 + }, + { + "epoch": 0.0144, + "grad_norm": 0.9596327924478002, + "learning_rate": 9.473684210526316e-05, + "loss": 1.0964, + "step": 9 + }, + { + "epoch": 0.016, + "grad_norm": 0.692665038925857, + "learning_rate": 0.00010526315789473685, + "loss": 0.9623, + "step": 10 + }, + { + "epoch": 0.0176, + "grad_norm": 0.7441815967151375, + "learning_rate": 0.00011578947368421053, + "loss": 1.0412, + "step": 11 + }, + { + "epoch": 0.0192, + "grad_norm": 0.6316351307258408, + "learning_rate": 0.0001263157894736842, + "loss": 0.9233, + "step": 12 + }, + { + "epoch": 0.0208, + "grad_norm": 0.54827818826474, + "learning_rate": 0.0001368421052631579, + "loss": 0.9314, + "step": 13 + }, + { + "epoch": 0.0224, + "grad_norm": 0.5545383500276659, + "learning_rate": 0.00014736842105263158, + "loss": 0.9406, + "step": 14 + }, + { + "epoch": 0.024, + "grad_norm": 0.5872883551061585, + "learning_rate": 0.00015789473684210527, + "loss": 0.9104, + "step": 15 + }, + { + "epoch": 0.0256, + "grad_norm": 0.508832380911569, + "learning_rate": 0.00016842105263157895, + "loss": 0.9962, + "step": 16 + }, + { + "epoch": 0.0272, + "grad_norm": 0.5630296535547067, + "learning_rate": 0.00017894736842105264, + "loss": 0.9143, + "step": 17 + }, + { + "epoch": 0.0288, + "grad_norm": 0.5461364557349855, + "learning_rate": 0.00018947368421052632, + "loss": 0.8706, + "step": 18 + }, + { + "epoch": 0.0304, + "grad_norm": 0.540240829170405, + "learning_rate": 0.0002, + "loss": 0.9249, + "step": 19 + }, + { + "epoch": 0.032, + "grad_norm": 0.6434196307881772, + "learning_rate": 0.00019999865623437013, + "loss": 0.9895, + "step": 20 + }, + { + "epoch": 0.0336, + "grad_norm": 0.5154473813851905, + "learning_rate": 0.00019999462497359466, + "loss": 0.9361, + "step": 21 + }, + { + "epoch": 0.0352, + "grad_norm": 0.4621933314943631, + "learning_rate": 0.00019998790632601496, + "loss": 0.8441, + "step": 22 + }, + { + "epoch": 0.0368, + "grad_norm": 0.4913720833594254, + "learning_rate": 0.0001999785004721968, + "loss": 0.8488, + "step": 23 + }, + { + "epoch": 0.0384, + "grad_norm": 0.5071218302308363, + "learning_rate": 0.00019996640766492543, + "loss": 0.8088, + "step": 24 + }, + { + "epoch": 0.04, + "grad_norm": 0.557695660574659, + "learning_rate": 0.00019995162822919883, + "loss": 0.8954, + "step": 25 + }, + { + "epoch": 0.0416, + "grad_norm": 0.4843281772606149, + "learning_rate": 0.00019993416256221895, + "loss": 0.8793, + "step": 26 + }, + { + "epoch": 0.0432, + "grad_norm": 0.5008949491258939, + "learning_rate": 0.00019991401113338104, + "loss": 0.8348, + "step": 27 + }, + { + "epoch": 0.0448, + "grad_norm": 0.44404190337560834, + "learning_rate": 0.00019989117448426108, + "loss": 0.8792, + "step": 28 + }, + { + "epoch": 0.0464, + "grad_norm": 0.44257702525298914, + "learning_rate": 0.00019986565322860115, + "loss": 0.8304, + "step": 29 + }, + { + "epoch": 0.048, + "grad_norm": 0.5651194264346859, + "learning_rate": 0.00019983744805229296, + "loss": 0.8417, + "step": 30 + }, + { + "epoch": 0.0496, + "grad_norm": 0.5898459615130699, + "learning_rate": 0.00019980655971335945, + "loss": 0.8225, + "step": 31 + }, + { + "epoch": 0.0512, + "grad_norm": 0.46680207549711317, + "learning_rate": 0.00019977298904193437, + "loss": 0.8872, + "step": 32 + }, + { + "epoch": 0.0528, + "grad_norm": 0.5143799869364328, + "learning_rate": 0.00019973673694024, + "loss": 0.9118, + "step": 33 + }, + { + "epoch": 0.0544, + "grad_norm": 0.49200438756318415, + "learning_rate": 0.00019969780438256293, + "loss": 0.8039, + "step": 34 + }, + { + "epoch": 0.056, + "grad_norm": 0.45713307022560185, + "learning_rate": 0.0001996561924152278, + "loss": 0.8551, + "step": 35 + }, + { + "epoch": 0.0576, + "grad_norm": 0.5201454100406916, + "learning_rate": 0.0001996119021565693, + "loss": 0.851, + "step": 36 + }, + { + "epoch": 0.0592, + "grad_norm": 0.4731751405020751, + "learning_rate": 0.0001995649347969019, + "loss": 0.8353, + "step": 37 + }, + { + "epoch": 0.0608, + "grad_norm": 0.4295612250683847, + "learning_rate": 0.00019951529159848805, + "loss": 0.8234, + "step": 38 + }, + { + "epoch": 0.0624, + "grad_norm": 0.435223588864598, + "learning_rate": 0.00019946297389550433, + "loss": 0.8207, + "step": 39 + }, + { + "epoch": 0.064, + "grad_norm": 0.46549327734889817, + "learning_rate": 0.00019940798309400526, + "loss": 0.8709, + "step": 40 + }, + { + "epoch": 0.0656, + "grad_norm": 0.4846646680861656, + "learning_rate": 0.0001993503206718859, + "loss": 0.8483, + "step": 41 + }, + { + "epoch": 0.0672, + "grad_norm": 0.5367439131274088, + "learning_rate": 0.00019928998817884182, + "loss": 0.96, + "step": 42 + }, + { + "epoch": 0.0688, + "grad_norm": 0.44352326173213347, + "learning_rate": 0.00019922698723632767, + "loss": 0.8464, + "step": 43 + }, + { + "epoch": 0.0704, + "grad_norm": 0.43119635750337504, + "learning_rate": 0.00019916131953751342, + "loss": 0.8601, + "step": 44 + }, + { + "epoch": 0.072, + "grad_norm": 0.4146044852574158, + "learning_rate": 0.00019909298684723904, + "loss": 0.7923, + "step": 45 + }, + { + "epoch": 0.0736, + "grad_norm": 0.5312928654759015, + "learning_rate": 0.00019902199100196697, + "loss": 0.8561, + "step": 46 + }, + { + "epoch": 0.0752, + "grad_norm": 0.4335906008977014, + "learning_rate": 0.00019894833390973266, + "loss": 0.7712, + "step": 47 + }, + { + "epoch": 0.0768, + "grad_norm": 0.48655259735726974, + "learning_rate": 0.00019887201755009357, + "loss": 0.837, + "step": 48 + }, + { + "epoch": 0.0784, + "grad_norm": 0.3994093879455125, + "learning_rate": 0.0001987930439740757, + "loss": 0.7707, + "step": 49 + }, + { + "epoch": 0.08, + "grad_norm": 0.4631361950212455, + "learning_rate": 0.00019871141530411853, + "loss": 0.8898, + "step": 50 + }, + { + "epoch": 0.0816, + "grad_norm": 0.4435082887651816, + "learning_rate": 0.0001986271337340182, + "loss": 0.7789, + "step": 51 + }, + { + "epoch": 0.0832, + "grad_norm": 0.438265699705375, + "learning_rate": 0.00019854020152886814, + "loss": 0.8627, + "step": 52 + }, + { + "epoch": 0.0848, + "grad_norm": 0.430215665242078, + "learning_rate": 0.0001984506210249986, + "loss": 0.7563, + "step": 53 + }, + { + "epoch": 0.0864, + "grad_norm": 0.5301972397142374, + "learning_rate": 0.00019835839462991361, + "loss": 0.9061, + "step": 54 + }, + { + "epoch": 0.088, + "grad_norm": 0.5131899059813476, + "learning_rate": 0.00019826352482222638, + "loss": 0.8849, + "step": 55 + }, + { + "epoch": 0.0896, + "grad_norm": 0.41536687740839834, + "learning_rate": 0.00019816601415159263, + "loss": 0.7832, + "step": 56 + }, + { + "epoch": 0.0912, + "grad_norm": 0.47223814289391924, + "learning_rate": 0.0001980658652386421, + "loss": 0.846, + "step": 57 + }, + { + "epoch": 0.0928, + "grad_norm": 0.4256995099898317, + "learning_rate": 0.00019796308077490817, + "loss": 0.7631, + "step": 58 + }, + { + "epoch": 0.0944, + "grad_norm": 0.559990879732315, + "learning_rate": 0.00019785766352275542, + "loss": 0.8867, + "step": 59 + }, + { + "epoch": 0.096, + "grad_norm": 0.5043932569869316, + "learning_rate": 0.00019774961631530545, + "loss": 0.7905, + "step": 60 + }, + { + "epoch": 0.0976, + "grad_norm": 0.45011008673649056, + "learning_rate": 0.00019763894205636072, + "loss": 0.8417, + "step": 61 + }, + { + "epoch": 0.0992, + "grad_norm": 0.4812789673709414, + "learning_rate": 0.00019752564372032657, + "loss": 0.7674, + "step": 62 + }, + { + "epoch": 0.1008, + "grad_norm": 0.6103731160662064, + "learning_rate": 0.00019740972435213115, + "loss": 0.7577, + "step": 63 + }, + { + "epoch": 0.1024, + "grad_norm": 0.45866189675326735, + "learning_rate": 0.00019729118706714375, + "loss": 0.8128, + "step": 64 + }, + { + "epoch": 0.104, + "grad_norm": 0.44798586420124414, + "learning_rate": 0.00019717003505109095, + "loss": 0.7607, + "step": 65 + }, + { + "epoch": 0.1056, + "grad_norm": 0.42473200076232115, + "learning_rate": 0.00019704627155997108, + "loss": 0.8232, + "step": 66 + }, + { + "epoch": 0.1072, + "grad_norm": 0.5440976562417423, + "learning_rate": 0.00019691989991996663, + "loss": 0.7857, + "step": 67 + }, + { + "epoch": 0.1088, + "grad_norm": 0.43250714012844876, + "learning_rate": 0.0001967909235273549, + "loss": 0.8152, + "step": 68 + }, + { + "epoch": 0.1104, + "grad_norm": 0.4318040586865442, + "learning_rate": 0.00019665934584841682, + "loss": 0.7289, + "step": 69 + }, + { + "epoch": 0.112, + "grad_norm": 0.4991345592151983, + "learning_rate": 0.00019652517041934356, + "loss": 0.841, + "step": 70 + }, + { + "epoch": 0.1136, + "grad_norm": 0.4307284725126805, + "learning_rate": 0.00019638840084614182, + "loss": 0.7983, + "step": 71 + }, + { + "epoch": 0.1152, + "grad_norm": 0.44595098079392037, + "learning_rate": 0.00019624904080453655, + "loss": 0.729, + "step": 72 + }, + { + "epoch": 0.1168, + "grad_norm": 0.444848102324931, + "learning_rate": 0.00019610709403987246, + "loss": 0.7942, + "step": 73 + }, + { + "epoch": 0.1184, + "grad_norm": 0.5384661547454465, + "learning_rate": 0.00019596256436701324, + "loss": 0.8777, + "step": 74 + }, + { + "epoch": 0.12, + "grad_norm": 0.44493622975189756, + "learning_rate": 0.000195815455670239, + "loss": 0.8105, + "step": 75 + }, + { + "epoch": 0.1216, + "grad_norm": 0.5191835461601148, + "learning_rate": 0.00019566577190314197, + "loss": 0.7933, + "step": 76 + }, + { + "epoch": 0.1232, + "grad_norm": 0.7669217869392135, + "learning_rate": 0.0001955135170885202, + "loss": 0.8539, + "step": 77 + }, + { + "epoch": 0.1248, + "grad_norm": 0.4596224083145755, + "learning_rate": 0.00019535869531826937, + "loss": 0.7398, + "step": 78 + }, + { + "epoch": 0.1264, + "grad_norm": 0.5209105604040403, + "learning_rate": 0.00019520131075327298, + "loss": 0.788, + "step": 79 + }, + { + "epoch": 0.128, + "grad_norm": 0.4603712507829723, + "learning_rate": 0.00019504136762329047, + "loss": 0.8453, + "step": 80 + }, + { + "epoch": 0.1296, + "grad_norm": 0.49223156237577387, + "learning_rate": 0.00019487887022684336, + "loss": 0.7523, + "step": 81 + }, + { + "epoch": 0.1312, + "grad_norm": 0.4432794323191188, + "learning_rate": 0.00019471382293110003, + "loss": 0.8132, + "step": 82 + }, + { + "epoch": 0.1328, + "grad_norm": 0.4525001881259806, + "learning_rate": 0.00019454623017175812, + "loss": 0.8685, + "step": 83 + }, + { + "epoch": 0.1344, + "grad_norm": 0.6363290906273378, + "learning_rate": 0.00019437609645292546, + "loss": 0.917, + "step": 84 + }, + { + "epoch": 0.136, + "grad_norm": 0.4221075035092066, + "learning_rate": 0.0001942034263469989, + "loss": 0.7898, + "step": 85 + }, + { + "epoch": 0.1376, + "grad_norm": 0.45030339100473527, + "learning_rate": 0.00019402822449454153, + "loss": 0.8956, + "step": 86 + }, + { + "epoch": 0.1392, + "grad_norm": 0.5026018722226013, + "learning_rate": 0.00019385049560415794, + "loss": 0.8095, + "step": 87 + }, + { + "epoch": 0.1408, + "grad_norm": 0.4773694774912286, + "learning_rate": 0.00019367024445236754, + "loss": 0.8173, + "step": 88 + }, + { + "epoch": 0.1424, + "grad_norm": 0.4132220827826047, + "learning_rate": 0.00019348747588347637, + "loss": 0.8141, + "step": 89 + }, + { + "epoch": 0.144, + "grad_norm": 0.508349523133824, + "learning_rate": 0.00019330219480944694, + "loss": 0.8434, + "step": 90 + }, + { + "epoch": 0.1456, + "grad_norm": 0.45444478321210935, + "learning_rate": 0.00019311440620976597, + "loss": 0.7478, + "step": 91 + }, + { + "epoch": 0.1472, + "grad_norm": 0.4055452465399414, + "learning_rate": 0.0001929241151313108, + "loss": 0.7595, + "step": 92 + }, + { + "epoch": 0.1488, + "grad_norm": 0.4802390536211738, + "learning_rate": 0.00019273132668821364, + "loss": 0.7917, + "step": 93 + }, + { + "epoch": 0.1504, + "grad_norm": 0.4611572439999133, + "learning_rate": 0.00019253604606172417, + "loss": 0.804, + "step": 94 + }, + { + "epoch": 0.152, + "grad_norm": 0.5458792676014842, + "learning_rate": 0.00019233827850007027, + "loss": 0.8774, + "step": 95 + }, + { + "epoch": 0.1536, + "grad_norm": 0.41183351102035237, + "learning_rate": 0.00019213802931831696, + "loss": 0.7337, + "step": 96 + }, + { + "epoch": 0.1552, + "grad_norm": 0.5010290726400706, + "learning_rate": 0.00019193530389822363, + "loss": 0.8712, + "step": 97 + }, + { + "epoch": 0.1568, + "grad_norm": 0.5350843332707702, + "learning_rate": 0.00019173010768809933, + "loss": 0.882, + "step": 98 + }, + { + "epoch": 0.1584, + "grad_norm": 0.4810964147793547, + "learning_rate": 0.0001915224462026563, + "loss": 0.8291, + "step": 99 + }, + { + "epoch": 0.16, + "grad_norm": 0.42749800071113353, + "learning_rate": 0.00019131232502286188, + "loss": 0.77, + "step": 100 + }, + { + "epoch": 0.1616, + "grad_norm": 0.48480999886839776, + "learning_rate": 0.0001910997497957885, + "loss": 0.8878, + "step": 101 + }, + { + "epoch": 0.1632, + "grad_norm": 0.4279853264104187, + "learning_rate": 0.00019088472623446183, + "loss": 0.811, + "step": 102 + }, + { + "epoch": 0.1648, + "grad_norm": 0.4649343861345453, + "learning_rate": 0.00019066726011770726, + "loss": 0.8168, + "step": 103 + }, + { + "epoch": 0.1664, + "grad_norm": 0.4587482438792974, + "learning_rate": 0.0001904473572899947, + "loss": 0.8322, + "step": 104 + }, + { + "epoch": 0.168, + "grad_norm": 0.45474669553836844, + "learning_rate": 0.00019022502366128135, + "loss": 0.7818, + "step": 105 + }, + { + "epoch": 0.1696, + "grad_norm": 0.4970454376768652, + "learning_rate": 0.00019000026520685302, + "loss": 0.7589, + "step": 106 + }, + { + "epoch": 0.1712, + "grad_norm": 0.5520040221462187, + "learning_rate": 0.0001897730879671634, + "loss": 0.9409, + "step": 107 + }, + { + "epoch": 0.1728, + "grad_norm": 0.4346714649503268, + "learning_rate": 0.00018954349804767184, + "loss": 0.7059, + "step": 108 + }, + { + "epoch": 0.1744, + "grad_norm": 0.40193691976371565, + "learning_rate": 0.00018931150161867916, + "loss": 0.7715, + "step": 109 + }, + { + "epoch": 0.176, + "grad_norm": 0.46150868183579813, + "learning_rate": 0.00018907710491516199, + "loss": 0.8476, + "step": 110 + }, + { + "epoch": 0.1776, + "grad_norm": 0.5399890321178172, + "learning_rate": 0.0001888403142366049, + "loss": 0.8239, + "step": 111 + }, + { + "epoch": 0.1792, + "grad_norm": 0.43594284248019927, + "learning_rate": 0.00018860113594683148, + "loss": 0.7405, + "step": 112 + }, + { + "epoch": 0.1808, + "grad_norm": 0.42116124999808713, + "learning_rate": 0.00018835957647383303, + "loss": 0.7814, + "step": 113 + }, + { + "epoch": 0.1824, + "grad_norm": 0.42252061052504514, + "learning_rate": 0.00018811564230959588, + "loss": 0.8218, + "step": 114 + }, + { + "epoch": 0.184, + "grad_norm": 0.50537647594222, + "learning_rate": 0.00018786934000992688, + "loss": 0.7371, + "step": 115 + }, + { + "epoch": 0.1856, + "grad_norm": 0.4369670225061863, + "learning_rate": 0.00018762067619427746, + "loss": 0.7448, + "step": 116 + }, + { + "epoch": 0.1872, + "grad_norm": 0.43656741472487437, + "learning_rate": 0.00018736965754556528, + "loss": 0.7608, + "step": 117 + }, + { + "epoch": 0.1888, + "grad_norm": 0.4860084003029073, + "learning_rate": 0.00018711629080999504, + "loss": 0.7349, + "step": 118 + }, + { + "epoch": 0.1904, + "grad_norm": 0.4580469630955301, + "learning_rate": 0.00018686058279687698, + "loss": 0.7933, + "step": 119 + }, + { + "epoch": 0.192, + "grad_norm": 0.48687340458779005, + "learning_rate": 0.00018660254037844388, + "loss": 0.777, + "step": 120 + }, + { + "epoch": 0.1936, + "grad_norm": 0.4045817052852119, + "learning_rate": 0.00018634217048966637, + "loss": 0.7157, + "step": 121 + }, + { + "epoch": 0.1952, + "grad_norm": 0.5247889494958011, + "learning_rate": 0.0001860794801280666, + "loss": 0.7962, + "step": 122 + }, + { + "epoch": 0.1968, + "grad_norm": 0.4530743148650035, + "learning_rate": 0.0001858144763535302, + "loss": 0.7513, + "step": 123 + }, + { + "epoch": 0.1984, + "grad_norm": 0.43938458385833445, + "learning_rate": 0.0001855471662881164, + "loss": 0.8049, + "step": 124 + }, + { + "epoch": 0.2, + "grad_norm": 0.5675799632624358, + "learning_rate": 0.00018527755711586678, + "loss": 0.878, + "step": 125 + }, + { + "epoch": 0.2016, + "grad_norm": 0.41877422216064497, + "learning_rate": 0.00018500565608261214, + "loss": 0.7406, + "step": 126 + }, + { + "epoch": 0.2032, + "grad_norm": 0.444603109052764, + "learning_rate": 0.00018473147049577774, + "loss": 0.7418, + "step": 127 + }, + { + "epoch": 0.2048, + "grad_norm": 0.46843700148456774, + "learning_rate": 0.00018445500772418697, + "loss": 0.836, + "step": 128 + }, + { + "epoch": 0.2064, + "grad_norm": 0.46480591805152116, + "learning_rate": 0.00018417627519786315, + "loss": 0.8868, + "step": 129 + }, + { + "epoch": 0.208, + "grad_norm": 0.4519091508052084, + "learning_rate": 0.00018389528040783012, + "loss": 0.8586, + "step": 130 + }, + { + "epoch": 0.2096, + "grad_norm": 0.4805941763013618, + "learning_rate": 0.00018361203090591071, + "loss": 0.7041, + "step": 131 + }, + { + "epoch": 0.2112, + "grad_norm": 0.46794476760137654, + "learning_rate": 0.00018332653430452376, + "loss": 0.8415, + "step": 132 + }, + { + "epoch": 0.2128, + "grad_norm": 0.4542489214214423, + "learning_rate": 0.00018303879827647975, + "loss": 0.8097, + "step": 133 + }, + { + "epoch": 0.2144, + "grad_norm": 0.41426350430862596, + "learning_rate": 0.00018274883055477436, + "loss": 0.7327, + "step": 134 + }, + { + "epoch": 0.216, + "grad_norm": 0.5170816909021524, + "learning_rate": 0.00018245663893238075, + "loss": 0.7929, + "step": 135 + }, + { + "epoch": 0.2176, + "grad_norm": 0.45509561524075265, + "learning_rate": 0.00018216223126204007, + "loss": 0.7645, + "step": 136 + }, + { + "epoch": 0.2192, + "grad_norm": 0.6019848514623966, + "learning_rate": 0.00018186561545605054, + "loss": 0.923, + "step": 137 + }, + { + "epoch": 0.2208, + "grad_norm": 0.4195549964082855, + "learning_rate": 0.00018156679948605467, + "loss": 0.7057, + "step": 138 + }, + { + "epoch": 0.2224, + "grad_norm": 0.39764833596753507, + "learning_rate": 0.00018126579138282503, + "loss": 0.7603, + "step": 139 + }, + { + "epoch": 0.224, + "grad_norm": 0.44713421891574273, + "learning_rate": 0.0001809625992360485, + "loss": 0.7406, + "step": 140 + }, + { + "epoch": 0.2256, + "grad_norm": 0.4521681645533832, + "learning_rate": 0.00018065723119410884, + "loss": 0.8014, + "step": 141 + }, + { + "epoch": 0.2272, + "grad_norm": 0.407900068696158, + "learning_rate": 0.00018034969546386757, + "loss": 0.7613, + "step": 142 + }, + { + "epoch": 0.2288, + "grad_norm": 0.39477291753373855, + "learning_rate": 0.0001800400003104436, + "loss": 0.7705, + "step": 143 + }, + { + "epoch": 0.2304, + "grad_norm": 0.43973864277854025, + "learning_rate": 0.00017972815405699103, + "loss": 0.8151, + "step": 144 + }, + { + "epoch": 0.232, + "grad_norm": 0.4410872672283872, + "learning_rate": 0.00017941416508447536, + "loss": 0.7594, + "step": 145 + }, + { + "epoch": 0.2336, + "grad_norm": 0.4513826551120051, + "learning_rate": 0.0001790980418314484, + "loss": 0.7791, + "step": 146 + }, + { + "epoch": 0.2352, + "grad_norm": 0.3860701722288845, + "learning_rate": 0.00017877979279382135, + "loss": 0.7573, + "step": 147 + }, + { + "epoch": 0.2368, + "grad_norm": 0.39793214731682597, + "learning_rate": 0.0001784594265246366, + "loss": 0.7646, + "step": 148 + }, + { + "epoch": 0.2384, + "grad_norm": 0.4841101955135524, + "learning_rate": 0.0001781369516338378, + "loss": 0.7944, + "step": 149 + }, + { + "epoch": 0.24, + "grad_norm": 0.45124038050913845, + "learning_rate": 0.00017781237678803847, + "loss": 0.7447, + "step": 150 + }, + { + "epoch": 0.2416, + "grad_norm": 0.4891956717263449, + "learning_rate": 0.000177485710710289, + "loss": 0.8382, + "step": 151 + }, + { + "epoch": 0.2432, + "grad_norm": 0.5936939379837328, + "learning_rate": 0.00017715696217984235, + "loss": 0.8837, + "step": 152 + }, + { + "epoch": 0.2448, + "grad_norm": 0.4342447571796585, + "learning_rate": 0.00017682614003191807, + "loss": 0.8167, + "step": 153 + }, + { + "epoch": 0.2464, + "grad_norm": 0.4422936157743359, + "learning_rate": 0.00017649325315746478, + "loss": 0.8069, + "step": 154 + }, + { + "epoch": 0.248, + "grad_norm": 0.4886946451030065, + "learning_rate": 0.0001761583105029213, + "loss": 0.7844, + "step": 155 + }, + { + "epoch": 0.2496, + "grad_norm": 0.4878581557072929, + "learning_rate": 0.00017582132106997616, + "loss": 0.8221, + "step": 156 + }, + { + "epoch": 0.2512, + "grad_norm": 0.4571092467722902, + "learning_rate": 0.00017548229391532572, + "loss": 0.7374, + "step": 157 + }, + { + "epoch": 0.2528, + "grad_norm": 0.4640542504506806, + "learning_rate": 0.00017514123815043074, + "loss": 0.7281, + "step": 158 + }, + { + "epoch": 0.2544, + "grad_norm": 0.45082713250627315, + "learning_rate": 0.00017479816294127152, + "loss": 0.81, + "step": 159 + }, + { + "epoch": 0.256, + "grad_norm": 0.44702434061911905, + "learning_rate": 0.0001744530775081015, + "loss": 0.7366, + "step": 160 + }, + { + "epoch": 0.2576, + "grad_norm": 0.42471669925252264, + "learning_rate": 0.0001741059911251997, + "loss": 0.7359, + "step": 161 + }, + { + "epoch": 0.2592, + "grad_norm": 0.38147624028811195, + "learning_rate": 0.000173756913120621, + "loss": 0.7447, + "step": 162 + }, + { + "epoch": 0.2608, + "grad_norm": 0.46598752304955565, + "learning_rate": 0.00017340585287594604, + "loss": 0.7732, + "step": 163 + }, + { + "epoch": 0.2624, + "grad_norm": 0.6092979174633653, + "learning_rate": 0.0001730528198260285, + "loss": 0.7664, + "step": 164 + }, + { + "epoch": 0.264, + "grad_norm": 0.3820648925930006, + "learning_rate": 0.00017269782345874203, + "loss": 0.7802, + "step": 165 + }, + { + "epoch": 0.2656, + "grad_norm": 0.42073776928733125, + "learning_rate": 0.00017234087331472497, + "loss": 0.7872, + "step": 166 + }, + { + "epoch": 0.2672, + "grad_norm": 0.4407185993406487, + "learning_rate": 0.00017198197898712404, + "loss": 0.7418, + "step": 167 + }, + { + "epoch": 0.2688, + "grad_norm": 0.45039700269020816, + "learning_rate": 0.00017162115012133643, + "loss": 0.7387, + "step": 168 + }, + { + "epoch": 0.2704, + "grad_norm": 0.5215052876963912, + "learning_rate": 0.00017125839641475072, + "loss": 0.8291, + "step": 169 + }, + { + "epoch": 0.272, + "grad_norm": 0.4654364806533688, + "learning_rate": 0.00017089372761648616, + "loss": 0.784, + "step": 170 + }, + { + "epoch": 0.2736, + "grad_norm": 0.4429785841820516, + "learning_rate": 0.00017052715352713075, + "loss": 0.7175, + "step": 171 + }, + { + "epoch": 0.2752, + "grad_norm": 0.40214111772244654, + "learning_rate": 0.00017015868399847768, + "loss": 0.709, + "step": 172 + }, + { + "epoch": 0.2768, + "grad_norm": 0.4399252399588669, + "learning_rate": 0.00016978832893326074, + "loss": 0.7974, + "step": 173 + }, + { + "epoch": 0.2784, + "grad_norm": 0.45046986898972635, + "learning_rate": 0.00016941609828488807, + "loss": 0.7889, + "step": 174 + }, + { + "epoch": 0.28, + "grad_norm": 0.41692348879052327, + "learning_rate": 0.0001690420020571747, + "loss": 0.726, + "step": 175 + }, + { + "epoch": 0.2816, + "grad_norm": 0.4141308420160933, + "learning_rate": 0.0001686660503040737, + "loss": 0.7045, + "step": 176 + }, + { + "epoch": 0.2832, + "grad_norm": 0.43243963865867097, + "learning_rate": 0.00016828825312940592, + "loss": 0.7903, + "step": 177 + }, + { + "epoch": 0.2848, + "grad_norm": 0.44779441827969435, + "learning_rate": 0.0001679086206865886, + "loss": 0.7839, + "step": 178 + }, + { + "epoch": 0.2864, + "grad_norm": 0.4408349153103571, + "learning_rate": 0.00016752716317836229, + "loss": 0.778, + "step": 179 + }, + { + "epoch": 0.288, + "grad_norm": 0.4601615754810862, + "learning_rate": 0.0001671438908565167, + "loss": 0.7473, + "step": 180 + }, + { + "epoch": 0.2896, + "grad_norm": 0.4743084256987827, + "learning_rate": 0.00016675881402161536, + "loss": 0.813, + "step": 181 + }, + { + "epoch": 0.2912, + "grad_norm": 0.5095671649899296, + "learning_rate": 0.0001663719430227186, + "loss": 0.7942, + "step": 182 + }, + { + "epoch": 0.2928, + "grad_norm": 0.43686089823258445, + "learning_rate": 0.00016598328825710533, + "loss": 0.7383, + "step": 183 + }, + { + "epoch": 0.2944, + "grad_norm": 0.4496164103862414, + "learning_rate": 0.000165592860169994, + "loss": 0.7573, + "step": 184 + }, + { + "epoch": 0.296, + "grad_norm": 0.4173129820413503, + "learning_rate": 0.00016520066925426144, + "loss": 0.7601, + "step": 185 + }, + { + "epoch": 0.2976, + "grad_norm": 0.5030615513929414, + "learning_rate": 0.0001648067260501611, + "loss": 0.8161, + "step": 186 + }, + { + "epoch": 0.2992, + "grad_norm": 0.4373938776604511, + "learning_rate": 0.0001644110411450398, + "loss": 0.832, + "step": 187 + }, + { + "epoch": 0.3008, + "grad_norm": 0.44732892527419604, + "learning_rate": 0.00016401362517305296, + "loss": 0.7348, + "step": 188 + }, + { + "epoch": 0.3024, + "grad_norm": 0.4441106926184888, + "learning_rate": 0.00016361448881487914, + "loss": 0.7854, + "step": 189 + }, + { + "epoch": 0.304, + "grad_norm": 0.39418950871040026, + "learning_rate": 0.00016321364279743266, + "loss": 0.7245, + "step": 190 + }, + { + "epoch": 0.3056, + "grad_norm": 0.4174735790493811, + "learning_rate": 0.0001628110978935756, + "loss": 0.7297, + "step": 191 + }, + { + "epoch": 0.3072, + "grad_norm": 0.4376586104045421, + "learning_rate": 0.00016240686492182804, + "loss": 0.7926, + "step": 192 + }, + { + "epoch": 0.3088, + "grad_norm": 0.3934075321711763, + "learning_rate": 0.00016200095474607753, + "loss": 0.7813, + "step": 193 + }, + { + "epoch": 0.3104, + "grad_norm": 0.3872906434772159, + "learning_rate": 0.00016159337827528685, + "loss": 0.7568, + "step": 194 + }, + { + "epoch": 0.312, + "grad_norm": 0.5199897600647463, + "learning_rate": 0.0001611841464632011, + "loss": 0.7671, + "step": 195 + }, + { + "epoch": 0.3136, + "grad_norm": 0.4333691356437578, + "learning_rate": 0.0001607732703080532, + "loss": 0.7723, + "step": 196 + }, + { + "epoch": 0.3152, + "grad_norm": 0.4309426916565077, + "learning_rate": 0.00016036076085226814, + "loss": 0.7338, + "step": 197 + }, + { + "epoch": 0.3168, + "grad_norm": 0.4685066334457469, + "learning_rate": 0.0001599466291821666, + "loss": 0.7522, + "step": 198 + }, + { + "epoch": 0.3184, + "grad_norm": 0.42189623934554843, + "learning_rate": 0.0001595308864276666, + "loss": 0.7371, + "step": 199 + }, + { + "epoch": 0.32, + "grad_norm": 0.4001192071128831, + "learning_rate": 0.0001591135437619847, + "loss": 0.7481, + "step": 200 + }, + { + "epoch": 0.3216, + "grad_norm": 0.4202041426268543, + "learning_rate": 0.0001586946124013354, + "loss": 0.771, + "step": 201 + }, + { + "epoch": 0.3232, + "grad_norm": 0.3901188956330055, + "learning_rate": 0.0001582741036046301, + "loss": 0.7511, + "step": 202 + }, + { + "epoch": 0.3248, + "grad_norm": 0.41370302982075036, + "learning_rate": 0.00015785202867317407, + "loss": 0.7514, + "step": 203 + }, + { + "epoch": 0.3264, + "grad_norm": 0.4756093791617333, + "learning_rate": 0.00015742839895036305, + "loss": 0.7016, + "step": 204 + }, + { + "epoch": 0.328, + "grad_norm": 0.4551131144843958, + "learning_rate": 0.00015700322582137827, + "loss": 0.7769, + "step": 205 + }, + { + "epoch": 0.3296, + "grad_norm": 0.3996874527215859, + "learning_rate": 0.0001565765207128805, + "loss": 0.7532, + "step": 206 + }, + { + "epoch": 0.3312, + "grad_norm": 0.43418596236815815, + "learning_rate": 0.0001561482950927029, + "loss": 0.7841, + "step": 207 + }, + { + "epoch": 0.3328, + "grad_norm": 0.4435909439680395, + "learning_rate": 0.00015571856046954285, + "loss": 0.7421, + "step": 208 + }, + { + "epoch": 0.3344, + "grad_norm": 0.4108923948917151, + "learning_rate": 0.00015528732839265272, + "loss": 0.7479, + "step": 209 + }, + { + "epoch": 0.336, + "grad_norm": 0.41523543356619474, + "learning_rate": 0.0001548546104515294, + "loss": 0.7494, + "step": 210 + }, + { + "epoch": 0.3376, + "grad_norm": 0.42501622202702494, + "learning_rate": 0.00015442041827560274, + "loss": 0.7517, + "step": 211 + }, + { + "epoch": 0.3392, + "grad_norm": 0.38516257244121765, + "learning_rate": 0.00015398476353392323, + "loss": 0.7106, + "step": 212 + }, + { + "epoch": 0.3408, + "grad_norm": 0.3799701139245965, + "learning_rate": 0.00015354765793484834, + "loss": 0.7617, + "step": 213 + }, + { + "epoch": 0.3424, + "grad_norm": 0.448632288852654, + "learning_rate": 0.00015310911322572753, + "loss": 0.8401, + "step": 214 + }, + { + "epoch": 0.344, + "grad_norm": 0.45846707413489796, + "learning_rate": 0.000152669141192587, + "loss": 0.7637, + "step": 215 + }, + { + "epoch": 0.3456, + "grad_norm": 0.44923508813154894, + "learning_rate": 0.00015222775365981273, + "loss": 0.7353, + "step": 216 + }, + { + "epoch": 0.3472, + "grad_norm": 0.40497548577982245, + "learning_rate": 0.00015178496248983254, + "loss": 0.8084, + "step": 217 + }, + { + "epoch": 0.3488, + "grad_norm": 0.45047625376539696, + "learning_rate": 0.00015134077958279765, + "loss": 0.7884, + "step": 218 + }, + { + "epoch": 0.3504, + "grad_norm": 0.4339580624173808, + "learning_rate": 0.00015089521687626243, + "loss": 0.7548, + "step": 219 + }, + { + "epoch": 0.352, + "grad_norm": 0.3828440757921989, + "learning_rate": 0.000150448286344864, + "loss": 0.8069, + "step": 220 + }, + { + "epoch": 0.3536, + "grad_norm": 0.4369514957845695, + "learning_rate": 0.00015000000000000001, + "loss": 0.8094, + "step": 221 + }, + { + "epoch": 0.3552, + "grad_norm": 0.4575090185201534, + "learning_rate": 0.00014955036988950618, + "loss": 0.776, + "step": 222 + }, + { + "epoch": 0.3568, + "grad_norm": 0.4007444086963087, + "learning_rate": 0.00014909940809733222, + "loss": 0.7568, + "step": 223 + }, + { + "epoch": 0.3584, + "grad_norm": 0.4223989380201151, + "learning_rate": 0.00014864712674321734, + "loss": 0.742, + "step": 224 + }, + { + "epoch": 0.36, + "grad_norm": 0.4622068135988167, + "learning_rate": 0.00014819353798236427, + "loss": 0.7765, + "step": 225 + }, + { + "epoch": 0.3616, + "grad_norm": 0.4417443583939549, + "learning_rate": 0.00014773865400511272, + "loss": 0.7712, + "step": 226 + }, + { + "epoch": 0.3632, + "grad_norm": 0.45797968462019545, + "learning_rate": 0.00014728248703661182, + "loss": 0.6998, + "step": 227 + }, + { + "epoch": 0.3648, + "grad_norm": 0.45427742761700235, + "learning_rate": 0.00014682504933649144, + "loss": 0.7705, + "step": 228 + }, + { + "epoch": 0.3664, + "grad_norm": 0.43344392615771654, + "learning_rate": 0.00014636635319853275, + "loss": 0.7197, + "step": 229 + }, + { + "epoch": 0.368, + "grad_norm": 0.43766179724002063, + "learning_rate": 0.00014590641095033787, + "loss": 0.7795, + "step": 230 + }, + { + "epoch": 0.3696, + "grad_norm": 0.4063464894928757, + "learning_rate": 0.00014544523495299842, + "loss": 0.7191, + "step": 231 + }, + { + "epoch": 0.3712, + "grad_norm": 0.48729880434325035, + "learning_rate": 0.0001449828376007636, + "loss": 0.8138, + "step": 232 + }, + { + "epoch": 0.3728, + "grad_norm": 0.43289463621923896, + "learning_rate": 0.0001445192313207067, + "loss": 0.801, + "step": 233 + }, + { + "epoch": 0.3744, + "grad_norm": 0.4578743285606194, + "learning_rate": 0.0001440544285723915, + "loss": 0.8091, + "step": 234 + }, + { + "epoch": 0.376, + "grad_norm": 0.4401419700620134, + "learning_rate": 0.00014358844184753712, + "loss": 0.7844, + "step": 235 + }, + { + "epoch": 0.3776, + "grad_norm": 0.38232239438419785, + "learning_rate": 0.00014312128366968243, + "loss": 0.7182, + "step": 236 + }, + { + "epoch": 0.3792, + "grad_norm": 0.4780141596008564, + "learning_rate": 0.00014265296659384956, + "loss": 0.754, + "step": 237 + }, + { + "epoch": 0.3808, + "grad_norm": 0.5261301180906734, + "learning_rate": 0.00014218350320620624, + "loss": 0.7783, + "step": 238 + }, + { + "epoch": 0.3824, + "grad_norm": 0.3953958826213999, + "learning_rate": 0.0001417129061237278, + "loss": 0.7113, + "step": 239 + }, + { + "epoch": 0.384, + "grad_norm": 0.4261184843469951, + "learning_rate": 0.00014124118799385796, + "loss": 0.7738, + "step": 240 + }, + { + "epoch": 0.3856, + "grad_norm": 0.3876048475761003, + "learning_rate": 0.00014076836149416887, + "loss": 0.7446, + "step": 241 + }, + { + "epoch": 0.3872, + "grad_norm": 0.563293848910372, + "learning_rate": 0.0001402944393320206, + "loss": 0.7773, + "step": 242 + }, + { + "epoch": 0.3888, + "grad_norm": 0.49597633949832215, + "learning_rate": 0.00013981943424421932, + "loss": 0.8199, + "step": 243 + }, + { + "epoch": 0.3904, + "grad_norm": 0.40309914385407053, + "learning_rate": 0.00013934335899667527, + "loss": 0.7076, + "step": 244 + }, + { + "epoch": 0.392, + "grad_norm": 0.43205819932287276, + "learning_rate": 0.00013886622638405952, + "loss": 0.7428, + "step": 245 + }, + { + "epoch": 0.3936, + "grad_norm": 0.4546813831054762, + "learning_rate": 0.00013838804922946027, + "loss": 0.7618, + "step": 246 + }, + { + "epoch": 0.3952, + "grad_norm": 0.43547603025942927, + "learning_rate": 0.00013790884038403795, + "loss": 0.7604, + "step": 247 + }, + { + "epoch": 0.3968, + "grad_norm": 0.4393298509391637, + "learning_rate": 0.00013742861272668012, + "loss": 0.7502, + "step": 248 + }, + { + "epoch": 0.3984, + "grad_norm": 0.3922378692984408, + "learning_rate": 0.00013694737916365517, + "loss": 0.7169, + "step": 249 + }, + { + "epoch": 0.4, + "grad_norm": 0.4467339195309958, + "learning_rate": 0.00013646515262826552, + "loss": 0.7653, + "step": 250 + }, + { + "epoch": 0.4016, + "grad_norm": 0.46472235365699377, + "learning_rate": 0.0001359819460805001, + "loss": 0.7928, + "step": 251 + }, + { + "epoch": 0.4032, + "grad_norm": 0.4158360910832021, + "learning_rate": 0.0001354977725066859, + "loss": 0.7727, + "step": 252 + }, + { + "epoch": 0.4048, + "grad_norm": 0.4241191437999942, + "learning_rate": 0.00013501264491913906, + "loss": 0.7233, + "step": 253 + }, + { + "epoch": 0.4064, + "grad_norm": 0.45320813593008313, + "learning_rate": 0.0001345265763558152, + "loss": 0.7647, + "step": 254 + }, + { + "epoch": 0.408, + "grad_norm": 0.5029825382504564, + "learning_rate": 0.00013403957987995882, + "loss": 0.7287, + "step": 255 + }, + { + "epoch": 0.4096, + "grad_norm": 0.4030933346105237, + "learning_rate": 0.0001335516685797525, + "loss": 0.7515, + "step": 256 + }, + { + "epoch": 0.4112, + "grad_norm": 0.40665107430457303, + "learning_rate": 0.00013306285556796495, + "loss": 0.6766, + "step": 257 + }, + { + "epoch": 0.4128, + "grad_norm": 0.4285404381640957, + "learning_rate": 0.00013257315398159864, + "loss": 0.6983, + "step": 258 + }, + { + "epoch": 0.4144, + "grad_norm": 0.49332096614964, + "learning_rate": 0.00013208257698153677, + "loss": 0.7526, + "step": 259 + }, + { + "epoch": 0.416, + "grad_norm": 0.4404654409534716, + "learning_rate": 0.00013159113775218964, + "loss": 0.7036, + "step": 260 + }, + { + "epoch": 0.4176, + "grad_norm": 0.468121984035523, + "learning_rate": 0.00013109884950114007, + "loss": 0.7263, + "step": 261 + }, + { + "epoch": 0.4192, + "grad_norm": 0.3844639852471911, + "learning_rate": 0.00013060572545878875, + "loss": 0.7058, + "step": 262 + }, + { + "epoch": 0.4208, + "grad_norm": 0.44008454765882693, + "learning_rate": 0.00013011177887799845, + "loss": 0.7427, + "step": 263 + }, + { + "epoch": 0.4224, + "grad_norm": 0.41257510588456725, + "learning_rate": 0.00012961702303373795, + "loss": 0.7404, + "step": 264 + }, + { + "epoch": 0.424, + "grad_norm": 0.4173626365969376, + "learning_rate": 0.00012912147122272523, + "loss": 0.7513, + "step": 265 + }, + { + "epoch": 0.4256, + "grad_norm": 0.4203051905670372, + "learning_rate": 0.00012862513676307008, + "loss": 0.7553, + "step": 266 + }, + { + "epoch": 0.4272, + "grad_norm": 0.4113015838169403, + "learning_rate": 0.00012812803299391628, + "loss": 0.7265, + "step": 267 + }, + { + "epoch": 0.4288, + "grad_norm": 0.46800861215426764, + "learning_rate": 0.00012763017327508305, + "loss": 0.7643, + "step": 268 + }, + { + "epoch": 0.4304, + "grad_norm": 0.4319840048370039, + "learning_rate": 0.0001271315709867059, + "loss": 0.7687, + "step": 269 + }, + { + "epoch": 0.432, + "grad_norm": 0.4124014086022483, + "learning_rate": 0.00012663223952887723, + "loss": 0.709, + "step": 270 + }, + { + "epoch": 0.4336, + "grad_norm": 0.48343787736347105, + "learning_rate": 0.00012613219232128608, + "loss": 0.7625, + "step": 271 + }, + { + "epoch": 0.4352, + "grad_norm": 0.48913043881970025, + "learning_rate": 0.00012563144280285741, + "loss": 0.7852, + "step": 272 + }, + { + "epoch": 0.4368, + "grad_norm": 0.4253409113694964, + "learning_rate": 0.00012513000443139112, + "loss": 0.7754, + "step": 273 + }, + { + "epoch": 0.4384, + "grad_norm": 0.3938290515647137, + "learning_rate": 0.00012462789068320017, + "loss": 0.7756, + "step": 274 + }, + { + "epoch": 0.44, + "grad_norm": 0.43889257025048206, + "learning_rate": 0.00012412511505274844, + "loss": 0.7633, + "step": 275 + }, + { + "epoch": 0.4416, + "grad_norm": 0.48717407281038577, + "learning_rate": 0.00012362169105228826, + "loss": 0.7194, + "step": 276 + }, + { + "epoch": 0.4432, + "grad_norm": 0.41399581850806794, + "learning_rate": 0.000123117632211497, + "loss": 0.7448, + "step": 277 + }, + { + "epoch": 0.4448, + "grad_norm": 0.4897078239482613, + "learning_rate": 0.00012261295207711346, + "loss": 0.8479, + "step": 278 + }, + { + "epoch": 0.4464, + "grad_norm": 0.3964396201971349, + "learning_rate": 0.0001221076642125742, + "loss": 0.6876, + "step": 279 + }, + { + "epoch": 0.448, + "grad_norm": 0.5306089779892281, + "learning_rate": 0.00012160178219764837, + "loss": 0.7462, + "step": 280 + }, + { + "epoch": 0.4496, + "grad_norm": 0.41523976943358804, + "learning_rate": 0.00012109531962807332, + "loss": 0.6936, + "step": 281 + }, + { + "epoch": 0.4512, + "grad_norm": 0.4443907381859673, + "learning_rate": 0.00012058829011518896, + "loss": 0.75, + "step": 282 + }, + { + "epoch": 0.4528, + "grad_norm": 0.4260433683841681, + "learning_rate": 0.00012008070728557186, + "loss": 0.7724, + "step": 283 + }, + { + "epoch": 0.4544, + "grad_norm": 0.39736758898373775, + "learning_rate": 0.00011957258478066931, + "loss": 0.7437, + "step": 284 + }, + { + "epoch": 0.456, + "grad_norm": 0.4184822449319086, + "learning_rate": 0.00011906393625643244, + "loss": 0.6769, + "step": 285 + }, + { + "epoch": 0.4576, + "grad_norm": 0.4062379305029076, + "learning_rate": 0.00011855477538294935, + "loss": 0.7204, + "step": 286 + }, + { + "epoch": 0.4592, + "grad_norm": 0.3856949432268323, + "learning_rate": 0.00011804511584407763, + "loss": 0.7199, + "step": 287 + }, + { + "epoch": 0.4608, + "grad_norm": 0.4232113472632291, + "learning_rate": 0.00011753497133707679, + "loss": 0.7487, + "step": 288 + }, + { + "epoch": 0.4624, + "grad_norm": 0.4200357018800635, + "learning_rate": 0.00011702435557223987, + "loss": 0.7619, + "step": 289 + }, + { + "epoch": 0.464, + "grad_norm": 0.41314894755488163, + "learning_rate": 0.00011651328227252517, + "loss": 0.7396, + "step": 290 + }, + { + "epoch": 0.4656, + "grad_norm": 0.46084270174612124, + "learning_rate": 0.00011600176517318741, + "loss": 0.7939, + "step": 291 + }, + { + "epoch": 0.4672, + "grad_norm": 0.44597163842810134, + "learning_rate": 0.00011548981802140848, + "loss": 0.7792, + "step": 292 + }, + { + "epoch": 0.4688, + "grad_norm": 0.4202879770262735, + "learning_rate": 0.00011497745457592816, + "loss": 0.7937, + "step": 293 + }, + { + "epoch": 0.4704, + "grad_norm": 0.3956833731748927, + "learning_rate": 0.00011446468860667421, + "loss": 0.7087, + "step": 294 + }, + { + "epoch": 0.472, + "grad_norm": 0.5094714223654991, + "learning_rate": 0.00011395153389439233, + "loss": 0.8162, + "step": 295 + }, + { + "epoch": 0.4736, + "grad_norm": 0.4136715538531261, + "learning_rate": 0.00011343800423027582, + "loss": 0.7478, + "step": 296 + }, + { + "epoch": 0.4752, + "grad_norm": 0.3834657609673381, + "learning_rate": 0.0001129241134155949, + "loss": 0.7467, + "step": 297 + }, + { + "epoch": 0.4768, + "grad_norm": 0.3875899812227674, + "learning_rate": 0.00011240987526132594, + "loss": 0.7558, + "step": 298 + }, + { + "epoch": 0.4784, + "grad_norm": 0.5135466082811067, + "learning_rate": 0.00011189530358778005, + "loss": 0.7627, + "step": 299 + }, + { + "epoch": 0.48, + "grad_norm": 0.473434708183077, + "learning_rate": 0.00011138041222423177, + "loss": 0.7802, + "step": 300 + }, + { + "epoch": 0.4816, + "grad_norm": 0.494591052056208, + "learning_rate": 0.00011086521500854745, + "loss": 0.7858, + "step": 301 + }, + { + "epoch": 0.4832, + "grad_norm": 0.38863767319710224, + "learning_rate": 0.00011034972578681338, + "loss": 0.6249, + "step": 302 + }, + { + "epoch": 0.4848, + "grad_norm": 0.5301734291936457, + "learning_rate": 0.00010983395841296348, + "loss": 0.7486, + "step": 303 + }, + { + "epoch": 0.4864, + "grad_norm": 0.3877927979879035, + "learning_rate": 0.00010931792674840718, + "loss": 0.7305, + "step": 304 + }, + { + "epoch": 0.488, + "grad_norm": 0.4758897119272412, + "learning_rate": 0.00010880164466165674, + "loss": 0.7346, + "step": 305 + }, + { + "epoch": 0.4896, + "grad_norm": 0.3919966698163836, + "learning_rate": 0.00010828512602795462, + "loss": 0.7181, + "step": 306 + }, + { + "epoch": 0.4912, + "grad_norm": 0.5234209788796143, + "learning_rate": 0.00010776838472890065, + "loss": 0.831, + "step": 307 + }, + { + "epoch": 0.4928, + "grad_norm": 0.4156827024701615, + "learning_rate": 0.00010725143465207867, + "loss": 0.762, + "step": 308 + }, + { + "epoch": 0.4944, + "grad_norm": 0.4257502618224722, + "learning_rate": 0.00010673428969068364, + "loss": 0.6873, + "step": 309 + }, + { + "epoch": 0.496, + "grad_norm": 0.3978561500511223, + "learning_rate": 0.00010621696374314807, + "loss": 0.7542, + "step": 310 + }, + { + "epoch": 0.4976, + "grad_norm": 0.3641376062984227, + "learning_rate": 0.00010569947071276847, + "loss": 0.6929, + "step": 311 + }, + { + "epoch": 0.4992, + "grad_norm": 0.40315525921519396, + "learning_rate": 0.00010518182450733186, + "loss": 0.7669, + "step": 312 + }, + { + "epoch": 0.5008, + "grad_norm": 0.457976722456177, + "learning_rate": 0.00010466403903874176, + "loss": 0.8074, + "step": 313 + }, + { + "epoch": 0.5024, + "grad_norm": 0.3801884543600071, + "learning_rate": 0.00010414612822264455, + "loss": 0.7231, + "step": 314 + }, + { + "epoch": 0.504, + "grad_norm": 0.3619662965224284, + "learning_rate": 0.00010362810597805526, + "loss": 0.6541, + "step": 315 + }, + { + "epoch": 0.5056, + "grad_norm": 0.46525565418240006, + "learning_rate": 0.0001031099862269837, + "loss": 0.7576, + "step": 316 + }, + { + "epoch": 0.5072, + "grad_norm": 0.39408509550423587, + "learning_rate": 0.00010259178289406011, + "loss": 0.6595, + "step": 317 + }, + { + "epoch": 0.5088, + "grad_norm": 0.46135368683694594, + "learning_rate": 0.00010207350990616107, + "loss": 0.6884, + "step": 318 + }, + { + "epoch": 0.5104, + "grad_norm": 0.4455811109440872, + "learning_rate": 0.0001015551811920351, + "loss": 0.7517, + "step": 319 + }, + { + "epoch": 0.512, + "grad_norm": 0.3791900124265542, + "learning_rate": 0.00010103681068192845, + "loss": 0.6718, + "step": 320 + }, + { + "epoch": 0.5136, + "grad_norm": 0.37039564009804155, + "learning_rate": 0.00010051841230721065, + "loss": 0.6653, + "step": 321 + }, + { + "epoch": 0.5152, + "grad_norm": 0.4378410753625376, + "learning_rate": 0.0001, + "loss": 0.7251, + "step": 322 + }, + { + "epoch": 0.5168, + "grad_norm": 0.3904870365245172, + "learning_rate": 9.948158769278939e-05, + "loss": 0.6892, + "step": 323 + }, + { + "epoch": 0.5184, + "grad_norm": 0.41294804069804675, + "learning_rate": 9.896318931807155e-05, + "loss": 0.7245, + "step": 324 + }, + { + "epoch": 0.52, + "grad_norm": 0.39447807767931187, + "learning_rate": 9.844481880796491e-05, + "loss": 0.6216, + "step": 325 + }, + { + "epoch": 0.5216, + "grad_norm": 0.3970473272293371, + "learning_rate": 9.792649009383899e-05, + "loss": 0.6458, + "step": 326 + }, + { + "epoch": 0.5232, + "grad_norm": 0.43504285900316786, + "learning_rate": 9.740821710593989e-05, + "loss": 0.7541, + "step": 327 + }, + { + "epoch": 0.5248, + "grad_norm": 0.4312285737778755, + "learning_rate": 9.689001377301633e-05, + "loss": 0.7436, + "step": 328 + }, + { + "epoch": 0.5264, + "grad_norm": 0.48158541308310493, + "learning_rate": 9.637189402194476e-05, + "loss": 0.7686, + "step": 329 + }, + { + "epoch": 0.528, + "grad_norm": 0.393596011504804, + "learning_rate": 9.585387177735547e-05, + "loss": 0.7185, + "step": 330 + }, + { + "epoch": 0.5296, + "grad_norm": 0.45026029078386076, + "learning_rate": 9.533596096125825e-05, + "loss": 0.7472, + "step": 331 + }, + { + "epoch": 0.5312, + "grad_norm": 0.5049887414853992, + "learning_rate": 9.481817549266817e-05, + "loss": 0.7649, + "step": 332 + }, + { + "epoch": 0.5328, + "grad_norm": 0.41758997906643314, + "learning_rate": 9.430052928723153e-05, + "loss": 0.7075, + "step": 333 + }, + { + "epoch": 0.5344, + "grad_norm": 0.5536933675395049, + "learning_rate": 9.378303625685195e-05, + "loss": 0.711, + "step": 334 + }, + { + "epoch": 0.536, + "grad_norm": 0.3803886284428118, + "learning_rate": 9.326571030931637e-05, + "loss": 0.6448, + "step": 335 + }, + { + "epoch": 0.5376, + "grad_norm": 0.5064730268247098, + "learning_rate": 9.274856534792138e-05, + "loss": 0.8373, + "step": 336 + }, + { + "epoch": 0.5392, + "grad_norm": 0.49463878600172456, + "learning_rate": 9.223161527109937e-05, + "loss": 0.7634, + "step": 337 + }, + { + "epoch": 0.5408, + "grad_norm": 0.43325806075114826, + "learning_rate": 9.171487397204539e-05, + "loss": 0.6797, + "step": 338 + }, + { + "epoch": 0.5424, + "grad_norm": 0.3825765694737797, + "learning_rate": 9.119835533834331e-05, + "loss": 0.6893, + "step": 339 + }, + { + "epoch": 0.544, + "grad_norm": 0.3904857528958253, + "learning_rate": 9.068207325159284e-05, + "loss": 0.7152, + "step": 340 + }, + { + "epoch": 0.5456, + "grad_norm": 0.3694232971992164, + "learning_rate": 9.016604158703654e-05, + "loss": 0.7213, + "step": 341 + }, + { + "epoch": 0.5472, + "grad_norm": 0.4774550504808462, + "learning_rate": 8.965027421318665e-05, + "loss": 0.6972, + "step": 342 + }, + { + "epoch": 0.5488, + "grad_norm": 0.4308484251364807, + "learning_rate": 8.913478499145254e-05, + "loss": 0.7624, + "step": 343 + }, + { + "epoch": 0.5504, + "grad_norm": 0.4311846684950199, + "learning_rate": 8.861958777576827e-05, + "loss": 0.7753, + "step": 344 + }, + { + "epoch": 0.552, + "grad_norm": 0.38112940625024455, + "learning_rate": 8.810469641222001e-05, + "loss": 0.7379, + "step": 345 + }, + { + "epoch": 0.5536, + "grad_norm": 0.41182473120367014, + "learning_rate": 8.759012473867407e-05, + "loss": 0.6582, + "step": 346 + }, + { + "epoch": 0.5552, + "grad_norm": 0.4177078038129574, + "learning_rate": 8.707588658440511e-05, + "loss": 0.6951, + "step": 347 + }, + { + "epoch": 0.5568, + "grad_norm": 0.42872519101710654, + "learning_rate": 8.656199576972423e-05, + "loss": 0.7062, + "step": 348 + }, + { + "epoch": 0.5584, + "grad_norm": 0.3851660668178274, + "learning_rate": 8.604846610560771e-05, + "loss": 0.7341, + "step": 349 + }, + { + "epoch": 0.56, + "grad_norm": 0.39399030180484224, + "learning_rate": 8.553531139332582e-05, + "loss": 0.7355, + "step": 350 + }, + { + "epoch": 0.5616, + "grad_norm": 0.41334658070908564, + "learning_rate": 8.502254542407186e-05, + "loss": 0.7165, + "step": 351 + }, + { + "epoch": 0.5632, + "grad_norm": 0.40102653640641306, + "learning_rate": 8.451018197859153e-05, + "loss": 0.6956, + "step": 352 + }, + { + "epoch": 0.5648, + "grad_norm": 0.35528223087853755, + "learning_rate": 8.399823482681262e-05, + "loss": 0.6615, + "step": 353 + }, + { + "epoch": 0.5664, + "grad_norm": 0.44321819856925604, + "learning_rate": 8.348671772747487e-05, + "loss": 0.8013, + "step": 354 + }, + { + "epoch": 0.568, + "grad_norm": 0.49562511557650124, + "learning_rate": 8.297564442776014e-05, + "loss": 0.6924, + "step": 355 + }, + { + "epoch": 0.5696, + "grad_norm": 0.4549211444589244, + "learning_rate": 8.246502866292324e-05, + "loss": 0.6976, + "step": 356 + }, + { + "epoch": 0.5712, + "grad_norm": 0.4017316409897397, + "learning_rate": 8.195488415592238e-05, + "loss": 0.741, + "step": 357 + }, + { + "epoch": 0.5728, + "grad_norm": 0.4255975709811976, + "learning_rate": 8.144522461705067e-05, + "loss": 0.7585, + "step": 358 + }, + { + "epoch": 0.5744, + "grad_norm": 0.4326808155140898, + "learning_rate": 8.093606374356759e-05, + "loss": 0.647, + "step": 359 + }, + { + "epoch": 0.576, + "grad_norm": 0.3730941650680891, + "learning_rate": 8.042741521933071e-05, + "loss": 0.6648, + "step": 360 + }, + { + "epoch": 0.5776, + "grad_norm": 0.35951044826380923, + "learning_rate": 7.991929271442817e-05, + "loss": 0.6726, + "step": 361 + }, + { + "epoch": 0.5792, + "grad_norm": 0.4387350992288795, + "learning_rate": 7.941170988481108e-05, + "loss": 0.7258, + "step": 362 + }, + { + "epoch": 0.5808, + "grad_norm": 0.5194060478073095, + "learning_rate": 7.89046803719267e-05, + "loss": 0.723, + "step": 363 + }, + { + "epoch": 0.5824, + "grad_norm": 0.527440204002076, + "learning_rate": 7.839821780235168e-05, + "loss": 0.7425, + "step": 364 + }, + { + "epoch": 0.584, + "grad_norm": 0.41191859370959466, + "learning_rate": 7.789233578742582e-05, + "loss": 0.702, + "step": 365 + }, + { + "epoch": 0.5856, + "grad_norm": 0.3956344631480436, + "learning_rate": 7.738704792288655e-05, + "loss": 0.7361, + "step": 366 + }, + { + "epoch": 0.5872, + "grad_norm": 0.4110903358914057, + "learning_rate": 7.688236778850306e-05, + "loss": 0.6925, + "step": 367 + }, + { + "epoch": 0.5888, + "grad_norm": 0.4082118542755697, + "learning_rate": 7.637830894771175e-05, + "loss": 0.6949, + "step": 368 + }, + { + "epoch": 0.5904, + "grad_norm": 0.42478088922281826, + "learning_rate": 7.587488494725157e-05, + "loss": 0.7249, + "step": 369 + }, + { + "epoch": 0.592, + "grad_norm": 0.4105158859597807, + "learning_rate": 7.537210931679987e-05, + "loss": 0.7342, + "step": 370 + }, + { + "epoch": 0.5936, + "grad_norm": 0.37409171209139597, + "learning_rate": 7.48699955686089e-05, + "loss": 0.6593, + "step": 371 + }, + { + "epoch": 0.5952, + "grad_norm": 0.3549294808565276, + "learning_rate": 7.43685571971426e-05, + "loss": 0.6655, + "step": 372 + }, + { + "epoch": 0.5968, + "grad_norm": 0.3800051561278298, + "learning_rate": 7.386780767871397e-05, + "loss": 0.6862, + "step": 373 + }, + { + "epoch": 0.5984, + "grad_norm": 0.469817845398903, + "learning_rate": 7.336776047112276e-05, + "loss": 0.6136, + "step": 374 + }, + { + "epoch": 0.6, + "grad_norm": 0.39381855810199085, + "learning_rate": 7.286842901329412e-05, + "loss": 0.782, + "step": 375 + }, + { + "epoch": 0.6016, + "grad_norm": 0.37178563261704844, + "learning_rate": 7.236982672491698e-05, + "loss": 0.6623, + "step": 376 + }, + { + "epoch": 0.6032, + "grad_norm": 0.38961453445864674, + "learning_rate": 7.187196700608373e-05, + "loss": 0.6454, + "step": 377 + }, + { + "epoch": 0.6048, + "grad_norm": 0.3965513908576212, + "learning_rate": 7.137486323692995e-05, + "loss": 0.7514, + "step": 378 + }, + { + "epoch": 0.6064, + "grad_norm": 0.45064363977375316, + "learning_rate": 7.087852877727481e-05, + "loss": 0.7614, + "step": 379 + }, + { + "epoch": 0.608, + "grad_norm": 0.4060315144808456, + "learning_rate": 7.038297696626206e-05, + "loss": 0.7224, + "step": 380 + }, + { + "epoch": 0.6096, + "grad_norm": 0.42948802633016125, + "learning_rate": 6.988822112200156e-05, + "loss": 0.7519, + "step": 381 + }, + { + "epoch": 0.6112, + "grad_norm": 0.4148025815142497, + "learning_rate": 6.939427454121128e-05, + "loss": 0.6638, + "step": 382 + }, + { + "epoch": 0.6128, + "grad_norm": 0.42792810627652117, + "learning_rate": 6.890115049885994e-05, + "loss": 0.6934, + "step": 383 + }, + { + "epoch": 0.6144, + "grad_norm": 0.542604491432513, + "learning_rate": 6.84088622478104e-05, + "loss": 0.7424, + "step": 384 + }, + { + "epoch": 0.616, + "grad_norm": 0.38767158306259125, + "learning_rate": 6.791742301846326e-05, + "loss": 0.7282, + "step": 385 + }, + { + "epoch": 0.6176, + "grad_norm": 0.3760890979170099, + "learning_rate": 6.742684601840141e-05, + "loss": 0.7117, + "step": 386 + }, + { + "epoch": 0.6192, + "grad_norm": 0.35373027523959183, + "learning_rate": 6.693714443203507e-05, + "loss": 0.6741, + "step": 387 + }, + { + "epoch": 0.6208, + "grad_norm": 0.3988415205885269, + "learning_rate": 6.644833142024751e-05, + "loss": 0.6828, + "step": 388 + }, + { + "epoch": 0.6224, + "grad_norm": 0.36702684550040354, + "learning_rate": 6.59604201200412e-05, + "loss": 0.6673, + "step": 389 + }, + { + "epoch": 0.624, + "grad_norm": 0.40084524155492024, + "learning_rate": 6.547342364418481e-05, + "loss": 0.712, + "step": 390 + }, + { + "epoch": 0.6256, + "grad_norm": 0.4455992821837806, + "learning_rate": 6.498735508086093e-05, + "loss": 0.8008, + "step": 391 + }, + { + "epoch": 0.6272, + "grad_norm": 0.36874876966879794, + "learning_rate": 6.450222749331414e-05, + "loss": 0.6666, + "step": 392 + }, + { + "epoch": 0.6288, + "grad_norm": 0.4072947971147104, + "learning_rate": 6.40180539194999e-05, + "loss": 0.6708, + "step": 393 + }, + { + "epoch": 0.6304, + "grad_norm": 0.44355072289605285, + "learning_rate": 6.35348473717345e-05, + "loss": 0.7068, + "step": 394 + }, + { + "epoch": 0.632, + "grad_norm": 0.4018794218851429, + "learning_rate": 6.305262083634488e-05, + "loss": 0.7042, + "step": 395 + }, + { + "epoch": 0.6336, + "grad_norm": 0.43150510748828014, + "learning_rate": 6.25713872733199e-05, + "loss": 0.7087, + "step": 396 + }, + { + "epoch": 0.6352, + "grad_norm": 0.4109261083980075, + "learning_rate": 6.209115961596208e-05, + "loss": 0.6695, + "step": 397 + }, + { + "epoch": 0.6368, + "grad_norm": 0.38117951752542334, + "learning_rate": 6.161195077053976e-05, + "loss": 0.6838, + "step": 398 + }, + { + "epoch": 0.6384, + "grad_norm": 0.4078045343855914, + "learning_rate": 6.113377361594049e-05, + "loss": 0.8116, + "step": 399 + }, + { + "epoch": 0.64, + "grad_norm": 0.3658698560410895, + "learning_rate": 6.065664100332478e-05, + "loss": 0.6641, + "step": 400 + }, + { + "epoch": 0.6416, + "grad_norm": 0.42265321140968654, + "learning_rate": 6.018056575578075e-05, + "loss": 0.7147, + "step": 401 + }, + { + "epoch": 0.6432, + "grad_norm": 0.4207842507069505, + "learning_rate": 5.970556066797941e-05, + "loss": 0.7368, + "step": 402 + }, + { + "epoch": 0.6448, + "grad_norm": 0.44143686362239726, + "learning_rate": 5.923163850583113e-05, + "loss": 0.7775, + "step": 403 + }, + { + "epoch": 0.6464, + "grad_norm": 0.4232521231647057, + "learning_rate": 5.875881200614207e-05, + "loss": 0.6821, + "step": 404 + }, + { + "epoch": 0.648, + "grad_norm": 0.4082101853695771, + "learning_rate": 5.828709387627218e-05, + "loss": 0.7055, + "step": 405 + }, + { + "epoch": 0.6496, + "grad_norm": 0.37104748761470746, + "learning_rate": 5.781649679379378e-05, + "loss": 0.7218, + "step": 406 + }, + { + "epoch": 0.6512, + "grad_norm": 0.4465202504946128, + "learning_rate": 5.73470334061505e-05, + "loss": 0.641, + "step": 407 + }, + { + "epoch": 0.6528, + "grad_norm": 0.3548116300074221, + "learning_rate": 5.687871633031754e-05, + "loss": 0.6334, + "step": 408 + }, + { + "epoch": 0.6544, + "grad_norm": 0.3829669115824023, + "learning_rate": 5.6411558152462894e-05, + "loss": 0.637, + "step": 409 + }, + { + "epoch": 0.656, + "grad_norm": 0.3945929599299863, + "learning_rate": 5.5945571427608526e-05, + "loss": 0.6556, + "step": 410 + }, + { + "epoch": 0.6576, + "grad_norm": 0.47611041407388976, + "learning_rate": 5.54807686792933e-05, + "loss": 0.7331, + "step": 411 + }, + { + "epoch": 0.6592, + "grad_norm": 0.5093088875165239, + "learning_rate": 5.501716239923642e-05, + "loss": 0.7021, + "step": 412 + }, + { + "epoch": 0.6608, + "grad_norm": 0.41670883842744694, + "learning_rate": 5.4554765047001613e-05, + "loss": 0.755, + "step": 413 + }, + { + "epoch": 0.6624, + "grad_norm": 0.38272799339439023, + "learning_rate": 5.4093589049662175e-05, + "loss": 0.6569, + "step": 414 + }, + { + "epoch": 0.664, + "grad_norm": 0.39320048501785004, + "learning_rate": 5.363364680146725e-05, + "loss": 0.687, + "step": 415 + }, + { + "epoch": 0.6656, + "grad_norm": 0.4897907430360386, + "learning_rate": 5.31749506635086e-05, + "loss": 0.6742, + "step": 416 + }, + { + "epoch": 0.6672, + "grad_norm": 0.4211981946672176, + "learning_rate": 5.271751296338823e-05, + "loss": 0.7364, + "step": 417 + }, + { + "epoch": 0.6688, + "grad_norm": 0.42346315382486294, + "learning_rate": 5.226134599488728e-05, + "loss": 0.7589, + "step": 418 + }, + { + "epoch": 0.6704, + "grad_norm": 0.4201031380746793, + "learning_rate": 5.180646201763577e-05, + "loss": 0.742, + "step": 419 + }, + { + "epoch": 0.672, + "grad_norm": 0.4367166279947478, + "learning_rate": 5.135287325678271e-05, + "loss": 0.6513, + "step": 420 + }, + { + "epoch": 0.6736, + "grad_norm": 0.533525415958094, + "learning_rate": 5.090059190266779e-05, + "loss": 0.7052, + "step": 421 + }, + { + "epoch": 0.6752, + "grad_norm": 0.4331409525050753, + "learning_rate": 5.0449630110493836e-05, + "loss": 0.7169, + "step": 422 + }, + { + "epoch": 0.6768, + "grad_norm": 0.41743183154937163, + "learning_rate": 5.000000000000002e-05, + "loss": 0.7007, + "step": 423 + }, + { + "epoch": 0.6784, + "grad_norm": 0.46284723769156155, + "learning_rate": 4.955171365513603e-05, + "loss": 0.6717, + "step": 424 + }, + { + "epoch": 0.68, + "grad_norm": 0.4271272261398075, + "learning_rate": 4.9104783123737566e-05, + "loss": 0.6943, + "step": 425 + }, + { + "epoch": 0.6816, + "grad_norm": 0.44590071641931517, + "learning_rate": 4.865922041720239e-05, + "loss": 0.6587, + "step": 426 + }, + { + "epoch": 0.6832, + "grad_norm": 0.4080036879402065, + "learning_rate": 4.821503751016746e-05, + "loss": 0.691, + "step": 427 + }, + { + "epoch": 0.6848, + "grad_norm": 0.4304736034704596, + "learning_rate": 4.777224634018732e-05, + "loss": 0.6721, + "step": 428 + }, + { + "epoch": 0.6864, + "grad_norm": 0.4363828605874691, + "learning_rate": 4.733085880741301e-05, + "loss": 0.7489, + "step": 429 + }, + { + "epoch": 0.688, + "grad_norm": 0.3724797976524618, + "learning_rate": 4.689088677427249e-05, + "loss": 0.7403, + "step": 430 + }, + { + "epoch": 0.6896, + "grad_norm": 0.4202245709216763, + "learning_rate": 4.645234206515171e-05, + "loss": 0.6965, + "step": 431 + }, + { + "epoch": 0.6912, + "grad_norm": 0.5034098519742084, + "learning_rate": 4.6015236466076747e-05, + "loss": 0.7503, + "step": 432 + }, + { + "epoch": 0.6928, + "grad_norm": 0.41678967174539944, + "learning_rate": 4.5579581724397255e-05, + "loss": 0.704, + "step": 433 + }, + { + "epoch": 0.6944, + "grad_norm": 0.40363765184253564, + "learning_rate": 4.514538954847064e-05, + "loss": 0.7043, + "step": 434 + }, + { + "epoch": 0.696, + "grad_norm": 0.4330321553411917, + "learning_rate": 4.471267160734731e-05, + "loss": 0.7131, + "step": 435 + }, + { + "epoch": 0.6976, + "grad_norm": 0.47084542795952505, + "learning_rate": 4.428143953045717e-05, + "loss": 0.7202, + "step": 436 + }, + { + "epoch": 0.6992, + "grad_norm": 0.48829209103839877, + "learning_rate": 4.385170490729712e-05, + "loss": 0.7266, + "step": 437 + }, + { + "epoch": 0.7008, + "grad_norm": 0.3697866460660551, + "learning_rate": 4.342347928711953e-05, + "loss": 0.6773, + "step": 438 + }, + { + "epoch": 0.7024, + "grad_norm": 0.35051950407660176, + "learning_rate": 4.2996774178621736e-05, + "loss": 0.656, + "step": 439 + }, + { + "epoch": 0.704, + "grad_norm": 0.39675532074823067, + "learning_rate": 4.257160104963696e-05, + "loss": 0.6889, + "step": 440 + }, + { + "epoch": 0.7056, + "grad_norm": 0.3695985738810603, + "learning_rate": 4.2147971326825966e-05, + "loss": 0.6935, + "step": 441 + }, + { + "epoch": 0.7072, + "grad_norm": 0.3928421642818454, + "learning_rate": 4.172589639536991e-05, + "loss": 0.6362, + "step": 442 + }, + { + "epoch": 0.7088, + "grad_norm": 0.4103553180916162, + "learning_rate": 4.130538759866457e-05, + "loss": 0.7086, + "step": 443 + }, + { + "epoch": 0.7104, + "grad_norm": 0.4635742963022557, + "learning_rate": 4.088645623801534e-05, + "loss": 0.7242, + "step": 444 + }, + { + "epoch": 0.712, + "grad_norm": 0.4018941798692042, + "learning_rate": 4.046911357233343e-05, + "loss": 0.6778, + "step": 445 + }, + { + "epoch": 0.7136, + "grad_norm": 0.3861659712887641, + "learning_rate": 4.00533708178334e-05, + "loss": 0.7279, + "step": 446 + }, + { + "epoch": 0.7152, + "grad_norm": 0.3698439057070995, + "learning_rate": 3.963923914773187e-05, + "loss": 0.6098, + "step": 447 + }, + { + "epoch": 0.7168, + "grad_norm": 0.5075796063308272, + "learning_rate": 3.922672969194686e-05, + "loss": 0.7652, + "step": 448 + }, + { + "epoch": 0.7184, + "grad_norm": 0.42888392159795086, + "learning_rate": 3.8815853536798904e-05, + "loss": 0.6949, + "step": 449 + }, + { + "epoch": 0.72, + "grad_norm": 0.4170296196556884, + "learning_rate": 3.840662172471315e-05, + "loss": 0.7358, + "step": 450 + }, + { + "epoch": 0.7216, + "grad_norm": 0.40511114793948655, + "learning_rate": 3.79990452539225e-05, + "loss": 0.6859, + "step": 451 + }, + { + "epoch": 0.7232, + "grad_norm": 0.4332015020841273, + "learning_rate": 3.759313507817196e-05, + "loss": 0.7006, + "step": 452 + }, + { + "epoch": 0.7248, + "grad_norm": 0.37324263563892507, + "learning_rate": 3.7188902106424416e-05, + "loss": 0.6578, + "step": 453 + }, + { + "epoch": 0.7264, + "grad_norm": 0.5006566374888571, + "learning_rate": 3.678635720256737e-05, + "loss": 0.7386, + "step": 454 + }, + { + "epoch": 0.728, + "grad_norm": 0.4097760825770306, + "learning_rate": 3.638551118512089e-05, + "loss": 0.6551, + "step": 455 + }, + { + "epoch": 0.7296, + "grad_norm": 0.3874633060404428, + "learning_rate": 3.5986374826947066e-05, + "loss": 0.6617, + "step": 456 + }, + { + "epoch": 0.7312, + "grad_norm": 0.41740833056447024, + "learning_rate": 3.558895885496023e-05, + "loss": 0.6585, + "step": 457 + }, + { + "epoch": 0.7328, + "grad_norm": 0.4059373537897428, + "learning_rate": 3.519327394983888e-05, + "loss": 0.7758, + "step": 458 + }, + { + "epoch": 0.7344, + "grad_norm": 0.4230459093313156, + "learning_rate": 3.479933074573858e-05, + "loss": 0.7109, + "step": 459 + }, + { + "epoch": 0.736, + "grad_norm": 0.35983083399968596, + "learning_rate": 3.440713983000601e-05, + "loss": 0.6781, + "step": 460 + }, + { + "epoch": 0.7376, + "grad_norm": 0.3862347845991195, + "learning_rate": 3.401671174289469e-05, + "loss": 0.6866, + "step": 461 + }, + { + "epoch": 0.7392, + "grad_norm": 0.4056112846174515, + "learning_rate": 3.362805697728145e-05, + "loss": 0.7236, + "step": 462 + }, + { + "epoch": 0.7408, + "grad_norm": 0.45792789391523614, + "learning_rate": 3.324118597838464e-05, + "loss": 0.666, + "step": 463 + }, + { + "epoch": 0.7424, + "grad_norm": 0.48354667047008587, + "learning_rate": 3.285610914348332e-05, + "loss": 0.7148, + "step": 464 + }, + { + "epoch": 0.744, + "grad_norm": 0.3804556674907602, + "learning_rate": 3.2472836821637744e-05, + "loss": 0.6496, + "step": 465 + }, + { + "epoch": 0.7456, + "grad_norm": 0.4395721364200357, + "learning_rate": 3.209137931341143e-05, + "loss": 0.7119, + "step": 466 + }, + { + "epoch": 0.7472, + "grad_norm": 0.4762425500374446, + "learning_rate": 3.1711746870594086e-05, + "loss": 0.6955, + "step": 467 + }, + { + "epoch": 0.7488, + "grad_norm": 0.6394644509549837, + "learning_rate": 3.1333949695926324e-05, + "loss": 0.7075, + "step": 468 + }, + { + "epoch": 0.7504, + "grad_norm": 0.40462114697933416, + "learning_rate": 3.0957997942825336e-05, + "loss": 0.6967, + "step": 469 + }, + { + "epoch": 0.752, + "grad_norm": 0.4150022302268185, + "learning_rate": 3.058390171511196e-05, + "loss": 0.6575, + "step": 470 + }, + { + "epoch": 0.7536, + "grad_norm": 0.40195532472344037, + "learning_rate": 3.021167106673928e-05, + "loss": 0.6472, + "step": 471 + }, + { + "epoch": 0.7552, + "grad_norm": 0.41834785778762196, + "learning_rate": 2.9841316001522347e-05, + "loss": 0.7084, + "step": 472 + }, + { + "epoch": 0.7568, + "grad_norm": 0.3777055438883092, + "learning_rate": 2.9472846472869298e-05, + "loss": 0.6986, + "step": 473 + }, + { + "epoch": 0.7584, + "grad_norm": 0.5231949258816744, + "learning_rate": 2.9106272383513835e-05, + "loss": 0.6752, + "step": 474 + }, + { + "epoch": 0.76, + "grad_norm": 0.4048637448045943, + "learning_rate": 2.874160358524931e-05, + "loss": 0.6512, + "step": 475 + }, + { + "epoch": 0.7616, + "grad_norm": 0.43304092226186003, + "learning_rate": 2.8378849878663628e-05, + "loss": 0.7268, + "step": 476 + }, + { + "epoch": 0.7632, + "grad_norm": 0.3830447693951589, + "learning_rate": 2.8018021012875994e-05, + "loss": 0.6747, + "step": 477 + }, + { + "epoch": 0.7648, + "grad_norm": 0.39990984281664904, + "learning_rate": 2.7659126685275027e-05, + "loss": 0.6838, + "step": 478 + }, + { + "epoch": 0.7664, + "grad_norm": 0.4551108198746976, + "learning_rate": 2.7302176541257986e-05, + "loss": 0.64, + "step": 479 + }, + { + "epoch": 0.768, + "grad_norm": 0.40599131447436326, + "learning_rate": 2.6947180173971508e-05, + "loss": 0.6628, + "step": 480 + }, + { + "epoch": 0.7696, + "grad_norm": 0.41692510773925323, + "learning_rate": 2.659414712405398e-05, + "loss": 0.6515, + "step": 481 + }, + { + "epoch": 0.7712, + "grad_norm": 0.3896403328440383, + "learning_rate": 2.6243086879379e-05, + "loss": 0.6815, + "step": 482 + }, + { + "epoch": 0.7728, + "grad_norm": 0.4291388697518044, + "learning_rate": 2.5894008874800325e-05, + "loss": 0.7245, + "step": 483 + }, + { + "epoch": 0.7744, + "grad_norm": 0.4208608690176431, + "learning_rate": 2.5546922491898495e-05, + "loss": 0.7182, + "step": 484 + }, + { + "epoch": 0.776, + "grad_norm": 0.4323144941925956, + "learning_rate": 2.5201837058728505e-05, + "loss": 0.6819, + "step": 485 + }, + { + "epoch": 0.7776, + "grad_norm": 0.44938435426259044, + "learning_rate": 2.485876184956928e-05, + "loss": 0.7289, + "step": 486 + }, + { + "epoch": 0.7792, + "grad_norm": 0.5054956090695101, + "learning_rate": 2.451770608467432e-05, + "loss": 0.7507, + "step": 487 + }, + { + "epoch": 0.7808, + "grad_norm": 0.4634209912186953, + "learning_rate": 2.417867893002387e-05, + "loss": 0.7356, + "step": 488 + }, + { + "epoch": 0.7824, + "grad_norm": 0.4323898363207525, + "learning_rate": 2.3841689497078746e-05, + "loss": 0.6807, + "step": 489 + }, + { + "epoch": 0.784, + "grad_norm": 0.42919666902122455, + "learning_rate": 2.3506746842535242e-05, + "loss": 0.6731, + "step": 490 + }, + { + "epoch": 0.7856, + "grad_norm": 0.351617997178594, + "learning_rate": 2.3173859968081944e-05, + "loss": 0.6651, + "step": 491 + }, + { + "epoch": 0.7872, + "grad_norm": 0.4722377856228152, + "learning_rate": 2.2843037820157675e-05, + "loss": 0.6616, + "step": 492 + }, + { + "epoch": 0.7888, + "grad_norm": 0.4252233519419007, + "learning_rate": 2.251428928971102e-05, + "loss": 0.6734, + "step": 493 + }, + { + "epoch": 0.7904, + "grad_norm": 0.4154506646637047, + "learning_rate": 2.2187623211961562e-05, + "loss": 0.7253, + "step": 494 + }, + { + "epoch": 0.792, + "grad_norm": 0.40310174664215564, + "learning_rate": 2.1863048366162208e-05, + "loss": 0.6727, + "step": 495 + }, + { + "epoch": 0.7936, + "grad_norm": 0.4495369483718136, + "learning_rate": 2.1540573475363402e-05, + "loss": 0.7257, + "step": 496 + }, + { + "epoch": 0.7952, + "grad_norm": 0.42026969436548406, + "learning_rate": 2.1220207206178688e-05, + "loss": 0.6851, + "step": 497 + }, + { + "epoch": 0.7968, + "grad_norm": 0.40616232046271444, + "learning_rate": 2.0901958168551638e-05, + "loss": 0.6279, + "step": 498 + }, + { + "epoch": 0.7984, + "grad_norm": 0.3827209722482836, + "learning_rate": 2.058583491552465e-05, + "loss": 0.6716, + "step": 499 + }, + { + "epoch": 0.8, + "grad_norm": 0.4118601937748562, + "learning_rate": 2.027184594300898e-05, + "loss": 0.69, + "step": 500 + }, + { + "epoch": 0.8016, + "grad_norm": 0.4397139733675419, + "learning_rate": 1.995999968955641e-05, + "loss": 0.6872, + "step": 501 + }, + { + "epoch": 0.8032, + "grad_norm": 0.37243775954787023, + "learning_rate": 1.9650304536132426e-05, + "loss": 0.709, + "step": 502 + }, + { + "epoch": 0.8048, + "grad_norm": 0.4114136079364362, + "learning_rate": 1.9342768805891178e-05, + "loss": 0.6889, + "step": 503 + }, + { + "epoch": 0.8064, + "grad_norm": 0.508150224886995, + "learning_rate": 1.903740076395151e-05, + "loss": 0.7401, + "step": 504 + }, + { + "epoch": 0.808, + "grad_norm": 0.4181625831105891, + "learning_rate": 1.8734208617174988e-05, + "loss": 0.6769, + "step": 505 + }, + { + "epoch": 0.8096, + "grad_norm": 0.4349990879895087, + "learning_rate": 1.8433200513945337e-05, + "loss": 0.8027, + "step": 506 + }, + { + "epoch": 0.8112, + "grad_norm": 0.3862195368376329, + "learning_rate": 1.8134384543949478e-05, + "loss": 0.6735, + "step": 507 + }, + { + "epoch": 0.8128, + "grad_norm": 0.38251050049217683, + "learning_rate": 1.783776873795994e-05, + "loss": 0.6535, + "step": 508 + }, + { + "epoch": 0.8144, + "grad_norm": 0.4802151160705585, + "learning_rate": 1.754336106761927e-05, + "loss": 0.7028, + "step": 509 + }, + { + "epoch": 0.816, + "grad_norm": 0.43690518070600337, + "learning_rate": 1.7251169445225657e-05, + "loss": 0.709, + "step": 510 + }, + { + "epoch": 0.8176, + "grad_norm": 0.4638498877700915, + "learning_rate": 1.696120172352025e-05, + "loss": 0.7093, + "step": 511 + }, + { + "epoch": 0.8192, + "grad_norm": 0.41135108464616504, + "learning_rate": 1.6673465695476232e-05, + "loss": 0.6963, + "step": 512 + }, + { + "epoch": 0.8208, + "grad_norm": 0.38004016685640524, + "learning_rate": 1.6387969094089316e-05, + "loss": 0.6639, + "step": 513 + }, + { + "epoch": 0.8224, + "grad_norm": 0.49320068092481323, + "learning_rate": 1.6104719592169902e-05, + "loss": 0.7185, + "step": 514 + }, + { + "epoch": 0.824, + "grad_norm": 0.4808864838448925, + "learning_rate": 1.5823724802136865e-05, + "loss": 0.66, + "step": 515 + }, + { + "epoch": 0.8256, + "grad_norm": 0.4326999840300077, + "learning_rate": 1.5544992275813053e-05, + "loss": 0.6572, + "step": 516 + }, + { + "epoch": 0.8272, + "grad_norm": 0.4459055186196011, + "learning_rate": 1.526852950422226e-05, + "loss": 0.7321, + "step": 517 + }, + { + "epoch": 0.8288, + "grad_norm": 0.367808932618989, + "learning_rate": 1.4994343917387854e-05, + "loss": 0.6917, + "step": 518 + }, + { + "epoch": 0.8304, + "grad_norm": 0.36685490738498533, + "learning_rate": 1.4722442884133214e-05, + "loss": 0.6452, + "step": 519 + }, + { + "epoch": 0.832, + "grad_norm": 0.3612154568659507, + "learning_rate": 1.4452833711883628e-05, + "loss": 0.6537, + "step": 520 + }, + { + "epoch": 0.8336, + "grad_norm": 0.4086733796279474, + "learning_rate": 1.4185523646469822e-05, + "loss": 0.6707, + "step": 521 + }, + { + "epoch": 0.8352, + "grad_norm": 0.3939845914942694, + "learning_rate": 1.3920519871933424e-05, + "loss": 0.6971, + "step": 522 + }, + { + "epoch": 0.8368, + "grad_norm": 0.5303334440248394, + "learning_rate": 1.3657829510333654e-05, + "loss": 0.7362, + "step": 523 + }, + { + "epoch": 0.8384, + "grad_norm": 0.3859726418970462, + "learning_rate": 1.339745962155613e-05, + "loss": 0.7093, + "step": 524 + }, + { + "epoch": 0.84, + "grad_norm": 0.35427488135764695, + "learning_rate": 1.3139417203123027e-05, + "loss": 0.6892, + "step": 525 + }, + { + "epoch": 0.8416, + "grad_norm": 0.4369389709783054, + "learning_rate": 1.2883709190004955e-05, + "loss": 0.6948, + "step": 526 + }, + { + "epoch": 0.8432, + "grad_norm": 0.4117089123943465, + "learning_rate": 1.263034245443473e-05, + "loss": 0.7062, + "step": 527 + }, + { + "epoch": 0.8448, + "grad_norm": 0.3532018953005476, + "learning_rate": 1.2379323805722576e-05, + "loss": 0.6542, + "step": 528 + }, + { + "epoch": 0.8464, + "grad_norm": 0.47543943329516386, + "learning_rate": 1.2130659990073146e-05, + "loss": 0.7187, + "step": 529 + }, + { + "epoch": 0.848, + "grad_norm": 0.37992988360707813, + "learning_rate": 1.1884357690404158e-05, + "loss": 0.6526, + "step": 530 + }, + { + "epoch": 0.8496, + "grad_norm": 0.3922241337332064, + "learning_rate": 1.1640423526166988e-05, + "loss": 0.6767, + "step": 531 + }, + { + "epoch": 0.8512, + "grad_norm": 0.37884468931415927, + "learning_rate": 1.1398864053168534e-05, + "loss": 0.704, + "step": 532 + }, + { + "epoch": 0.8528, + "grad_norm": 0.3806859376653051, + "learning_rate": 1.1159685763395111e-05, + "loss": 0.6992, + "step": 533 + }, + { + "epoch": 0.8544, + "grad_norm": 0.5610152838461234, + "learning_rate": 1.0922895084838037e-05, + "loss": 0.7056, + "step": 534 + }, + { + "epoch": 0.856, + "grad_norm": 0.38028207171332495, + "learning_rate": 1.0688498381320855e-05, + "loss": 0.5972, + "step": 535 + }, + { + "epoch": 0.8576, + "grad_norm": 0.43887978891017, + "learning_rate": 1.045650195232819e-05, + "loss": 0.7067, + "step": 536 + }, + { + "epoch": 0.8592, + "grad_norm": 0.439136811373697, + "learning_rate": 1.0226912032836611e-05, + "loss": 0.7162, + "step": 537 + }, + { + "epoch": 0.8608, + "grad_norm": 0.39613847015380077, + "learning_rate": 9.999734793146998e-06, + "loss": 0.7303, + "step": 538 + }, + { + "epoch": 0.8624, + "grad_norm": 0.42107938942900913, + "learning_rate": 9.774976338718677e-06, + "loss": 0.6763, + "step": 539 + }, + { + "epoch": 0.864, + "grad_norm": 0.4530792767638597, + "learning_rate": 9.552642710005299e-06, + "loss": 0.6976, + "step": 540 + }, + { + "epoch": 0.8656, + "grad_norm": 0.41667850895366476, + "learning_rate": 9.332739882292752e-06, + "loss": 0.7106, + "step": 541 + }, + { + "epoch": 0.8672, + "grad_norm": 0.391418747510892, + "learning_rate": 9.115273765538202e-06, + "loss": 0.6086, + "step": 542 + }, + { + "epoch": 0.8688, + "grad_norm": 0.4360923992795412, + "learning_rate": 8.900250204211514e-06, + "loss": 0.722, + "step": 543 + }, + { + "epoch": 0.8704, + "grad_norm": 0.5008453135264018, + "learning_rate": 8.687674977138116e-06, + "loss": 0.693, + "step": 544 + }, + { + "epoch": 0.872, + "grad_norm": 0.427954695595485, + "learning_rate": 8.47755379734373e-06, + "loss": 0.7155, + "step": 545 + }, + { + "epoch": 0.8736, + "grad_norm": 0.42627186878443935, + "learning_rate": 8.269892311900696e-06, + "loss": 0.6476, + "step": 546 + }, + { + "epoch": 0.8752, + "grad_norm": 0.45305074552722496, + "learning_rate": 8.064696101776358e-06, + "loss": 0.7455, + "step": 547 + }, + { + "epoch": 0.8768, + "grad_norm": 0.43952552027728614, + "learning_rate": 7.861970681683051e-06, + "loss": 0.6864, + "step": 548 + }, + { + "epoch": 0.8784, + "grad_norm": 0.41860905053696146, + "learning_rate": 7.661721499929753e-06, + "loss": 0.7371, + "step": 549 + }, + { + "epoch": 0.88, + "grad_norm": 0.4429300783648926, + "learning_rate": 7.463953938275858e-06, + "loss": 0.7027, + "step": 550 + }, + { + "epoch": 0.8816, + "grad_norm": 0.37197811285497284, + "learning_rate": 7.2686733117863784e-06, + "loss": 0.6302, + "step": 551 + }, + { + "epoch": 0.8832, + "grad_norm": 0.41765937724539, + "learning_rate": 7.07588486868922e-06, + "loss": 0.6232, + "step": 552 + }, + { + "epoch": 0.8848, + "grad_norm": 0.3800434334049614, + "learning_rate": 6.8855937902340576e-06, + "loss": 0.6631, + "step": 553 + }, + { + "epoch": 0.8864, + "grad_norm": 0.4492829376654766, + "learning_rate": 6.6978051905530855e-06, + "loss": 0.7038, + "step": 554 + }, + { + "epoch": 0.888, + "grad_norm": 0.37246083957263676, + "learning_rate": 6.512524116523633e-06, + "loss": 0.6394, + "step": 555 + }, + { + "epoch": 0.8896, + "grad_norm": 0.42616944527222084, + "learning_rate": 6.329755547632499e-06, + "loss": 0.6429, + "step": 556 + }, + { + "epoch": 0.8912, + "grad_norm": 0.3943748861470032, + "learning_rate": 6.149504395842087e-06, + "loss": 0.6851, + "step": 557 + }, + { + "epoch": 0.8928, + "grad_norm": 0.4100631296136774, + "learning_rate": 5.971775505458444e-06, + "loss": 0.6418, + "step": 558 + }, + { + "epoch": 0.8944, + "grad_norm": 0.393456098253909, + "learning_rate": 5.7965736530010916e-06, + "loss": 0.6699, + "step": 559 + }, + { + "epoch": 0.896, + "grad_norm": 0.4874209025333082, + "learning_rate": 5.623903547074549e-06, + "loss": 0.6806, + "step": 560 + }, + { + "epoch": 0.8976, + "grad_norm": 0.5599644755055251, + "learning_rate": 5.453769828241872e-06, + "loss": 0.7563, + "step": 561 + }, + { + "epoch": 0.8992, + "grad_norm": 0.36657647567910884, + "learning_rate": 5.286177068899989e-06, + "loss": 0.6408, + "step": 562 + }, + { + "epoch": 0.9008, + "grad_norm": 0.4017511875623015, + "learning_rate": 5.121129773156663e-06, + "loss": 0.6812, + "step": 563 + }, + { + "epoch": 0.9024, + "grad_norm": 0.5042786295890377, + "learning_rate": 4.95863237670956e-06, + "loss": 0.7726, + "step": 564 + }, + { + "epoch": 0.904, + "grad_norm": 0.4337272599736757, + "learning_rate": 4.798689246727006e-06, + "loss": 0.6798, + "step": 565 + }, + { + "epoch": 0.9056, + "grad_norm": 0.38384618798879977, + "learning_rate": 4.641304681730641e-06, + "loss": 0.6932, + "step": 566 + }, + { + "epoch": 0.9072, + "grad_norm": 0.4064222200337202, + "learning_rate": 4.486482911479839e-06, + "loss": 0.6825, + "step": 567 + }, + { + "epoch": 0.9088, + "grad_norm": 0.43759684106046515, + "learning_rate": 4.3342280968580285e-06, + "loss": 0.74, + "step": 568 + }, + { + "epoch": 0.9104, + "grad_norm": 0.5555321453003748, + "learning_rate": 4.184544329761009e-06, + "loss": 0.6836, + "step": 569 + }, + { + "epoch": 0.912, + "grad_norm": 0.34867305100422014, + "learning_rate": 4.037435632986786e-06, + "loss": 0.683, + "step": 570 + }, + { + "epoch": 0.9136, + "grad_norm": 0.402036471779896, + "learning_rate": 3.892905960127546e-06, + "loss": 0.6272, + "step": 571 + }, + { + "epoch": 0.9152, + "grad_norm": 0.42220162411989065, + "learning_rate": 3.750959195463466e-06, + "loss": 0.5967, + "step": 572 + }, + { + "epoch": 0.9168, + "grad_norm": 0.37826036175258104, + "learning_rate": 3.611599153858214e-06, + "loss": 0.6512, + "step": 573 + }, + { + "epoch": 0.9184, + "grad_norm": 0.5766733350158354, + "learning_rate": 3.4748295806564356e-06, + "loss": 0.73, + "step": 574 + }, + { + "epoch": 0.92, + "grad_norm": 0.4183193873217483, + "learning_rate": 3.3406541515832003e-06, + "loss": 0.7148, + "step": 575 + }, + { + "epoch": 0.9216, + "grad_norm": 0.3390489055854859, + "learning_rate": 3.209076472645112e-06, + "loss": 0.657, + "step": 576 + }, + { + "epoch": 0.9232, + "grad_norm": 0.35672282263007266, + "learning_rate": 3.0801000800333877e-06, + "loss": 0.6687, + "step": 577 + }, + { + "epoch": 0.9248, + "grad_norm": 0.4094248270745932, + "learning_rate": 2.9537284400289355e-06, + "loss": 0.6289, + "step": 578 + }, + { + "epoch": 0.9264, + "grad_norm": 0.33325937099727404, + "learning_rate": 2.8299649489090475e-06, + "loss": 0.6354, + "step": 579 + }, + { + "epoch": 0.928, + "grad_norm": 0.38970910705502143, + "learning_rate": 2.708812932856253e-06, + "loss": 0.6519, + "step": 580 + }, + { + "epoch": 0.9296, + "grad_norm": 0.39123053488286713, + "learning_rate": 2.590275647868867e-06, + "loss": 0.6528, + "step": 581 + }, + { + "epoch": 0.9312, + "grad_norm": 0.37504990575037767, + "learning_rate": 2.4743562796734622e-06, + "loss": 0.6587, + "step": 582 + }, + { + "epoch": 0.9328, + "grad_norm": 0.38612516322755125, + "learning_rate": 2.3610579436393e-06, + "loss": 0.7059, + "step": 583 + }, + { + "epoch": 0.9344, + "grad_norm": 0.7522975311950446, + "learning_rate": 2.250383684694579e-06, + "loss": 0.7632, + "step": 584 + }, + { + "epoch": 0.936, + "grad_norm": 0.35756864831313834, + "learning_rate": 2.1423364772445887e-06, + "loss": 0.6543, + "step": 585 + }, + { + "epoch": 0.9376, + "grad_norm": 0.395755270390985, + "learning_rate": 2.036919225091827e-06, + "loss": 0.6614, + "step": 586 + }, + { + "epoch": 0.9392, + "grad_norm": 0.4119893007021874, + "learning_rate": 1.9341347613579087e-06, + "loss": 0.645, + "step": 587 + }, + { + "epoch": 0.9408, + "grad_norm": 0.35649575503231873, + "learning_rate": 1.8339858484073935e-06, + "loss": 0.6647, + "step": 588 + }, + { + "epoch": 0.9424, + "grad_norm": 0.3893059272331949, + "learning_rate": 1.7364751777736332e-06, + "loss": 0.6468, + "step": 589 + }, + { + "epoch": 0.944, + "grad_norm": 0.3649968056430924, + "learning_rate": 1.6416053700863964e-06, + "loss": 0.5978, + "step": 590 + }, + { + "epoch": 0.9456, + "grad_norm": 0.41255239145320716, + "learning_rate": 1.5493789750014031e-06, + "loss": 0.6514, + "step": 591 + }, + { + "epoch": 0.9472, + "grad_norm": 0.40189145448990304, + "learning_rate": 1.459798471131868e-06, + "loss": 0.6615, + "step": 592 + }, + { + "epoch": 0.9488, + "grad_norm": 0.3775003757047657, + "learning_rate": 1.3728662659818204e-06, + "loss": 0.7187, + "step": 593 + }, + { + "epoch": 0.9504, + "grad_norm": 0.376306862874467, + "learning_rate": 1.2885846958814673e-06, + "loss": 0.663, + "step": 594 + }, + { + "epoch": 0.952, + "grad_norm": 0.49003531568940967, + "learning_rate": 1.2069560259243328e-06, + "loss": 0.712, + "step": 595 + }, + { + "epoch": 0.9536, + "grad_norm": 0.4148530340083136, + "learning_rate": 1.1279824499064396e-06, + "loss": 0.6954, + "step": 596 + }, + { + "epoch": 0.9552, + "grad_norm": 0.3630434995850313, + "learning_rate": 1.0516660902673448e-06, + "loss": 0.6404, + "step": 597 + }, + { + "epoch": 0.9568, + "grad_norm": 0.4403499211672423, + "learning_rate": 9.780089980330642e-07, + "loss": 0.7059, + "step": 598 + }, + { + "epoch": 0.9584, + "grad_norm": 0.48847312558980904, + "learning_rate": 9.070131527609604e-07, + "loss": 0.6619, + "step": 599 + }, + { + "epoch": 0.96, + "grad_norm": 0.49161581022992146, + "learning_rate": 8.386804624865851e-07, + "loss": 0.7541, + "step": 600 + }, + { + "epoch": 0.9616, + "grad_norm": 0.3516916918805193, + "learning_rate": 7.730127636723539e-07, + "loss": 0.6357, + "step": 601 + }, + { + "epoch": 0.9632, + "grad_norm": 0.5427759861095616, + "learning_rate": 7.100118211581852e-07, + "loss": 0.6259, + "step": 602 + }, + { + "epoch": 0.9648, + "grad_norm": 0.3685168817488535, + "learning_rate": 6.496793281141056e-07, + "loss": 0.6587, + "step": 603 + }, + { + "epoch": 0.9664, + "grad_norm": 0.4160758218903834, + "learning_rate": 5.920169059947411e-07, + "loss": 0.7067, + "step": 604 + }, + { + "epoch": 0.968, + "grad_norm": 0.4053343246014304, + "learning_rate": 5.370261044956971e-07, + "loss": 0.6882, + "step": 605 + }, + { + "epoch": 0.9696, + "grad_norm": 0.35443403133857604, + "learning_rate": 4.847084015119574e-07, + "loss": 0.7107, + "step": 606 + }, + { + "epoch": 0.9712, + "grad_norm": 0.37317721270640875, + "learning_rate": 4.3506520309813947e-07, + "loss": 0.7032, + "step": 607 + }, + { + "epoch": 0.9728, + "grad_norm": 0.3668062564893062, + "learning_rate": 3.8809784343072366e-07, + "loss": 0.6722, + "step": 608 + }, + { + "epoch": 0.9744, + "grad_norm": 0.3784881471364597, + "learning_rate": 3.4380758477219333e-07, + "loss": 0.675, + "step": 609 + }, + { + "epoch": 0.976, + "grad_norm": 0.39109606865640284, + "learning_rate": 3.0219561743707326e-07, + "loss": 0.7203, + "step": 610 + }, + { + "epoch": 0.9776, + "grad_norm": 0.39833714198682935, + "learning_rate": 2.6326305976001055e-07, + "loss": 0.6752, + "step": 611 + }, + { + "epoch": 0.9792, + "grad_norm": 0.3939019094573661, + "learning_rate": 2.2701095806565432e-07, + "loss": 0.64, + "step": 612 + }, + { + "epoch": 0.9808, + "grad_norm": 0.43245303370695626, + "learning_rate": 1.9344028664056713e-07, + "loss": 0.6979, + "step": 613 + }, + { + "epoch": 0.9824, + "grad_norm": 0.3949825211480894, + "learning_rate": 1.6255194770704586e-07, + "loss": 0.6621, + "step": 614 + }, + { + "epoch": 0.984, + "grad_norm": 0.42296506088190255, + "learning_rate": 1.3434677139885222e-07, + "loss": 0.6012, + "step": 615 + }, + { + "epoch": 0.9856, + "grad_norm": 0.44937484805390177, + "learning_rate": 1.0882551573891953e-07, + "loss": 0.7867, + "step": 616 + }, + { + "epoch": 0.9872, + "grad_norm": 0.4312758913114681, + "learning_rate": 8.598886661895788e-08, + "loss": 0.6844, + "step": 617 + }, + { + "epoch": 0.9888, + "grad_norm": 0.4110098544270641, + "learning_rate": 6.583743778106887e-08, + "loss": 0.7399, + "step": 618 + }, + { + "epoch": 0.9904, + "grad_norm": 0.41300146967284723, + "learning_rate": 4.837177080119215e-08, + "loss": 0.6621, + "step": 619 + }, + { + "epoch": 0.992, + "grad_norm": 0.4152309631283235, + "learning_rate": 3.359233507459481e-08, + "loss": 0.6686, + "step": 620 + }, + { + "epoch": 0.9936, + "grad_norm": 0.3682227316320983, + "learning_rate": 2.1499527803214846e-08, + "loss": 0.6659, + "step": 621 + }, + { + "epoch": 0.9952, + "grad_norm": 0.4004379938007532, + "learning_rate": 1.209367398504746e-08, + "loss": 0.6414, + "step": 622 + }, + { + "epoch": 0.9968, + "grad_norm": 0.4114899285709478, + "learning_rate": 5.375026405352035e-09, + "loss": 0.593, + "step": 623 + }, + { + "epoch": 0.9984, + "grad_norm": 0.4332671285477825, + "learning_rate": 1.3437656298687097e-09, + "loss": 0.7279, + "step": 624 + }, + { + "epoch": 1.0, + "grad_norm": 0.37424393127606453, + "learning_rate": 0.0, + "loss": 0.6289, + "step": 625 + }, + { + "epoch": 1.0, + "step": 625, + "total_flos": 547629306118144.0, + "train_loss": 0.7479768854141235, + "train_runtime": 9738.2912, + "train_samples_per_second": 1.027, + "train_steps_per_second": 0.064 + } + ], + "logging_steps": 1.0, + "max_steps": 625, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 547629306118144.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_1_epochs_1_GA_4_lora/README.md b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_1_epochs_1_GA_4_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_1_epochs_1_GA_4_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_1_epochs_1_GA_4_lora/adapter_config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_1_epochs_1_GA_4_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..229b618b1930d985a82c33272130f026ad31b7a3 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_1_epochs_1_GA_4_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "k_proj", + "gate_proj", + "down_proj", + "up_proj", + "q_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_1_epochs_1_GA_4_lora/adapter_model.safetensors b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_1_epochs_1_GA_4_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d08664656326d9091f068152cd49de2b53865195 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_1_epochs_1_GA_4_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7bd07edd55e8f5e15997a3d00f4965a83f16b364373ccecf2cf89692973e5047 +size 671150064 diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_1_epochs_1_GA_4_lora/config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_1_epochs_1_GA_4_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_1_epochs_1_GA_4_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_1_epochs_1_GA_4_lora/non_lora_trainables.bin b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_1_epochs_1_GA_4_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..1684f6be61b650f482eade8930fa015247cbbe3e --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_1_epochs_1_GA_4_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6400c7299a9addf5511fe535658efe82ea180b7f21b1da938b71c4109248476a +size 918507402 diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_1_epochs_1_GA_4_lora/trainer_state.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_1_epochs_1_GA_4_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d10bb89d76f26d90a18659b2ccc4097883586cee --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_1_epochs_1_GA_4_lora/trainer_state.json @@ -0,0 +1,2226 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9984, + "eval_steps": 500, + "global_step": 312, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0032, + "grad_norm": 0.8665415082895096, + "learning_rate": 2e-05, + "loss": 1.4176, + "step": 1 + }, + { + "epoch": 0.0064, + "grad_norm": 0.9361044587519807, + "learning_rate": 4e-05, + "loss": 1.4488, + "step": 2 + }, + { + "epoch": 0.0096, + "grad_norm": 0.8998720040196685, + "learning_rate": 6e-05, + "loss": 1.4943, + "step": 3 + }, + { + "epoch": 0.0128, + "grad_norm": 0.8384372392831825, + "learning_rate": 8e-05, + "loss": 1.3387, + "step": 4 + }, + { + "epoch": 0.016, + "grad_norm": 0.8772044641808151, + "learning_rate": 0.0001, + "loss": 1.1949, + "step": 5 + }, + { + "epoch": 0.0192, + "grad_norm": 1.4529967231099963, + "learning_rate": 0.00012, + "loss": 1.108, + "step": 6 + }, + { + "epoch": 0.0224, + "grad_norm": 0.713539918838152, + "learning_rate": 0.00014, + "loss": 1.0233, + "step": 7 + }, + { + "epoch": 0.0256, + "grad_norm": 0.6172981911132707, + "learning_rate": 0.00016, + "loss": 1.0161, + "step": 8 + }, + { + "epoch": 0.0288, + "grad_norm": 0.48289881998891115, + "learning_rate": 0.00018, + "loss": 0.9326, + "step": 9 + }, + { + "epoch": 0.032, + "grad_norm": 0.4543512003770902, + "learning_rate": 0.0002, + "loss": 0.9921, + "step": 10 + }, + { + "epoch": 0.0352, + "grad_norm": 0.3828461222080745, + "learning_rate": 0.00019999458931878073, + "loss": 0.9254, + "step": 11 + }, + { + "epoch": 0.0384, + "grad_norm": 0.39901807190795885, + "learning_rate": 0.0001999783578606323, + "loss": 0.8614, + "step": 12 + }, + { + "epoch": 0.0416, + "grad_norm": 0.4345341826900339, + "learning_rate": 0.00019995130738201966, + "loss": 0.9218, + "step": 13 + }, + { + "epoch": 0.0448, + "grad_norm": 0.413160806313582, + "learning_rate": 0.0001999134408101731, + "loss": 0.8823, + "step": 14 + }, + { + "epoch": 0.048, + "grad_norm": 0.45495385381233294, + "learning_rate": 0.00019986476224277165, + "loss": 0.8646, + "step": 15 + }, + { + "epoch": 0.0512, + "grad_norm": 0.4281753953778973, + "learning_rate": 0.00019980527694749952, + "loss": 0.8808, + "step": 16 + }, + { + "epoch": 0.0544, + "grad_norm": 0.3994236722592952, + "learning_rate": 0.00019973499136147606, + "loss": 0.8752, + "step": 17 + }, + { + "epoch": 0.0576, + "grad_norm": 0.39117226882021455, + "learning_rate": 0.0001996539130905593, + "loss": 0.8724, + "step": 18 + }, + { + "epoch": 0.0608, + "grad_norm": 0.3833908827431505, + "learning_rate": 0.0001995620509085228, + "loss": 0.8491, + "step": 19 + }, + { + "epoch": 0.064, + "grad_norm": 0.34870671989081414, + "learning_rate": 0.00019945941475610623, + "loss": 0.8571, + "step": 20 + }, + { + "epoch": 0.0672, + "grad_norm": 0.41481670018385486, + "learning_rate": 0.0001993460157399396, + "loss": 0.9211, + "step": 21 + }, + { + "epoch": 0.0704, + "grad_norm": 0.34126965482355054, + "learning_rate": 0.0001992218661313415, + "loss": 0.8642, + "step": 22 + }, + { + "epoch": 0.0736, + "grad_norm": 0.3940371915698765, + "learning_rate": 0.00019908697936499103, + "loss": 0.8366, + "step": 23 + }, + { + "epoch": 0.0768, + "grad_norm": 0.3415653902831056, + "learning_rate": 0.00019894137003747403, + "loss": 0.8105, + "step": 24 + }, + { + "epoch": 0.08, + "grad_norm": 0.3381889283103069, + "learning_rate": 0.00019878505390570362, + "loss": 0.8403, + "step": 25 + }, + { + "epoch": 0.0832, + "grad_norm": 0.32834606662461213, + "learning_rate": 0.00019861804788521493, + "loss": 0.8287, + "step": 26 + }, + { + "epoch": 0.0864, + "grad_norm": 0.37945938489812575, + "learning_rate": 0.00019844037004833473, + "loss": 0.8416, + "step": 27 + }, + { + "epoch": 0.0896, + "grad_norm": 0.3318956662864662, + "learning_rate": 0.00019825203962222572, + "loss": 0.8376, + "step": 28 + }, + { + "epoch": 0.0928, + "grad_norm": 0.39869959883085637, + "learning_rate": 0.0001980530769868059, + "loss": 0.8111, + "step": 29 + }, + { + "epoch": 0.096, + "grad_norm": 0.3698233300253536, + "learning_rate": 0.00019784350367254322, + "loss": 0.8455, + "step": 30 + }, + { + "epoch": 0.0992, + "grad_norm": 0.33474133660285676, + "learning_rate": 0.0001976233423581255, + "loss": 0.8069, + "step": 31 + }, + { + "epoch": 0.1024, + "grad_norm": 0.3217236781724644, + "learning_rate": 0.0001973926168680066, + "loss": 0.7918, + "step": 32 + }, + { + "epoch": 0.1056, + "grad_norm": 0.3684982859817603, + "learning_rate": 0.00019715135216982798, + "loss": 0.8004, + "step": 33 + }, + { + "epoch": 0.1088, + "grad_norm": 0.5220395178503056, + "learning_rate": 0.0001968995743717171, + "loss": 0.8073, + "step": 34 + }, + { + "epoch": 0.112, + "grad_norm": 0.32558380289166644, + "learning_rate": 0.00019663731071946206, + "loss": 0.7774, + "step": 35 + }, + { + "epoch": 0.1152, + "grad_norm": 0.31661713173183603, + "learning_rate": 0.00019636458959356316, + "loss": 0.7671, + "step": 36 + }, + { + "epoch": 0.1184, + "grad_norm": 0.3298183068426966, + "learning_rate": 0.0001960814405061619, + "loss": 0.8325, + "step": 37 + }, + { + "epoch": 0.1216, + "grad_norm": 0.31976726870616595, + "learning_rate": 0.00019578789409784727, + "loss": 0.7998, + "step": 38 + }, + { + "epoch": 0.1248, + "grad_norm": 0.3125134823296456, + "learning_rate": 0.00019548398213434007, + "loss": 0.7922, + "step": 39 + }, + { + "epoch": 0.128, + "grad_norm": 0.32774997145846696, + "learning_rate": 0.00019516973750305532, + "loss": 0.814, + "step": 40 + }, + { + "epoch": 0.1312, + "grad_norm": 0.3402682312947837, + "learning_rate": 0.00019484519420954354, + "loss": 0.781, + "step": 41 + }, + { + "epoch": 0.1344, + "grad_norm": 0.3964859103107029, + "learning_rate": 0.00019451038737381077, + "loss": 0.8899, + "step": 42 + }, + { + "epoch": 0.1376, + "grad_norm": 0.323683473038952, + "learning_rate": 0.00019416535322651818, + "loss": 0.8364, + "step": 43 + }, + { + "epoch": 0.1408, + "grad_norm": 0.3564126507612505, + "learning_rate": 0.00019381012910506146, + "loss": 0.805, + "step": 44 + }, + { + "epoch": 0.144, + "grad_norm": 0.32723558996618596, + "learning_rate": 0.00019344475344953012, + "loss": 0.821, + "step": 45 + }, + { + "epoch": 0.1472, + "grad_norm": 0.30144686701598333, + "learning_rate": 0.00019306926579854821, + "loss": 0.7483, + "step": 46 + }, + { + "epoch": 0.1504, + "grad_norm": 0.3597561681974569, + "learning_rate": 0.00019268370678499533, + "loss": 0.7905, + "step": 47 + }, + { + "epoch": 0.1536, + "grad_norm": 0.32634667281340346, + "learning_rate": 0.0001922881181316097, + "loss": 0.8048, + "step": 48 + }, + { + "epoch": 0.1568, + "grad_norm": 0.36244033086415745, + "learning_rate": 0.00019188254264647337, + "loss": 0.8672, + "step": 49 + }, + { + "epoch": 0.16, + "grad_norm": 0.3311512438280912, + "learning_rate": 0.0001914670242183795, + "loss": 0.7923, + "step": 50 + }, + { + "epoch": 0.1632, + "grad_norm": 0.35330775909171264, + "learning_rate": 0.0001910416078120832, + "loss": 0.844, + "step": 51 + }, + { + "epoch": 0.1664, + "grad_norm": 0.3289965565274748, + "learning_rate": 0.0001906063394634356, + "loss": 0.8188, + "step": 52 + }, + { + "epoch": 0.1696, + "grad_norm": 0.3427039009285118, + "learning_rate": 0.00019016126627440237, + "loss": 0.7692, + "step": 53 + }, + { + "epoch": 0.1728, + "grad_norm": 0.3430056329984371, + "learning_rate": 0.00018970643640796642, + "loss": 0.8109, + "step": 54 + }, + { + "epoch": 0.176, + "grad_norm": 0.37118704716300105, + "learning_rate": 0.000189241899082916, + "loss": 0.8017, + "step": 55 + }, + { + "epoch": 0.1792, + "grad_norm": 0.3725139392488711, + "learning_rate": 0.00018876770456851877, + "loss": 0.7789, + "step": 56 + }, + { + "epoch": 0.1824, + "grad_norm": 0.30844351687662164, + "learning_rate": 0.0001882839041790818, + "loss": 0.7923, + "step": 57 + }, + { + "epoch": 0.1856, + "grad_norm": 0.35305601337396303, + "learning_rate": 0.00018779055026839868, + "loss": 0.7358, + "step": 58 + }, + { + "epoch": 0.1888, + "grad_norm": 0.30221905832667595, + "learning_rate": 0.00018728769622408423, + "loss": 0.7397, + "step": 59 + }, + { + "epoch": 0.192, + "grad_norm": 0.31222145864981415, + "learning_rate": 0.00018677539646179707, + "loss": 0.7755, + "step": 60 + }, + { + "epoch": 0.1952, + "grad_norm": 0.30463210235187144, + "learning_rate": 0.00018625370641935129, + "loss": 0.7508, + "step": 61 + }, + { + "epoch": 0.1984, + "grad_norm": 0.432146979637179, + "learning_rate": 0.00018572268255071718, + "loss": 0.7729, + "step": 62 + }, + { + "epoch": 0.2016, + "grad_norm": 0.38203631210937444, + "learning_rate": 0.00018518238231991218, + "loss": 0.8021, + "step": 63 + }, + { + "epoch": 0.2048, + "grad_norm": 0.322834544631692, + "learning_rate": 0.00018463286419478255, + "loss": 0.7791, + "step": 64 + }, + { + "epoch": 0.208, + "grad_norm": 0.34536867964345297, + "learning_rate": 0.00018407418764067627, + "loss": 0.8702, + "step": 65 + }, + { + "epoch": 0.2112, + "grad_norm": 0.3558562140413582, + "learning_rate": 0.00018350641311400812, + "loss": 0.7676, + "step": 66 + }, + { + "epoch": 0.2144, + "grad_norm": 0.32140353205176714, + "learning_rate": 0.0001829296020557174, + "loss": 0.7624, + "step": 67 + }, + { + "epoch": 0.2176, + "grad_norm": 0.3496655157337084, + "learning_rate": 0.00018234381688461942, + "loss": 0.7679, + "step": 68 + }, + { + "epoch": 0.2208, + "grad_norm": 0.3514609843872738, + "learning_rate": 0.0001817491209906506, + "loss": 0.7999, + "step": 69 + }, + { + "epoch": 0.224, + "grad_norm": 0.324248969730902, + "learning_rate": 0.00018114557872800905, + "loss": 0.7395, + "step": 70 + }, + { + "epoch": 0.2272, + "grad_norm": 0.3167679143411775, + "learning_rate": 0.00018053325540819045, + "loss": 0.7693, + "step": 71 + }, + { + "epoch": 0.2304, + "grad_norm": 0.297240390835269, + "learning_rate": 0.0001799122172929206, + "loss": 0.7796, + "step": 72 + }, + { + "epoch": 0.2336, + "grad_norm": 0.31563529561124276, + "learning_rate": 0.00017928253158698473, + "loss": 0.759, + "step": 73 + }, + { + "epoch": 0.2368, + "grad_norm": 0.2859278361998818, + "learning_rate": 0.0001786442664309554, + "loss": 0.7517, + "step": 74 + }, + { + "epoch": 0.24, + "grad_norm": 0.3320519979645269, + "learning_rate": 0.0001779974908938184, + "loss": 0.7611, + "step": 75 + }, + { + "epoch": 0.2432, + "grad_norm": 0.40254238852087254, + "learning_rate": 0.0001773422749654988, + "loss": 0.8519, + "step": 76 + }, + { + "epoch": 0.2464, + "grad_norm": 0.32251277688662094, + "learning_rate": 0.00017667868954928694, + "loss": 0.8, + "step": 77 + }, + { + "epoch": 0.2496, + "grad_norm": 0.34644761845854694, + "learning_rate": 0.00017600680645416583, + "loss": 0.7932, + "step": 78 + }, + { + "epoch": 0.2528, + "grad_norm": 0.34182589813646663, + "learning_rate": 0.00017532669838704035, + "loss": 0.7233, + "step": 79 + }, + { + "epoch": 0.256, + "grad_norm": 0.320009189583074, + "learning_rate": 0.00017463843894486937, + "loss": 0.7667, + "step": 80 + }, + { + "epoch": 0.2592, + "grad_norm": 0.2848719281876875, + "learning_rate": 0.0001739421026067017, + "loss": 0.7308, + "step": 81 + }, + { + "epoch": 0.2624, + "grad_norm": 0.31940884011749177, + "learning_rate": 0.00017323776472561627, + "loss": 0.7565, + "step": 82 + }, + { + "epoch": 0.2656, + "grad_norm": 0.3060784615024359, + "learning_rate": 0.00017252550152056795, + "loss": 0.7751, + "step": 83 + }, + { + "epoch": 0.2688, + "grad_norm": 0.32952625214413267, + "learning_rate": 0.0001718053900681397, + "loss": 0.7266, + "step": 84 + }, + { + "epoch": 0.272, + "grad_norm": 0.34413779971624914, + "learning_rate": 0.00017107750829420176, + "loss": 0.7894, + "step": 85 + }, + { + "epoch": 0.2752, + "grad_norm": 0.3110744615798355, + "learning_rate": 0.00017034193496547902, + "loss": 0.7093, + "step": 86 + }, + { + "epoch": 0.2784, + "grad_norm": 0.30408532243174596, + "learning_rate": 0.00016959874968102735, + "loss": 0.7801, + "step": 87 + }, + { + "epoch": 0.2816, + "grad_norm": 0.3211103718095818, + "learning_rate": 0.00016884803286362, + "loss": 0.7112, + "step": 88 + }, + { + "epoch": 0.2848, + "grad_norm": 0.3172095159212251, + "learning_rate": 0.00016808986575104465, + "loss": 0.7792, + "step": 89 + }, + { + "epoch": 0.288, + "grad_norm": 0.3362603280371758, + "learning_rate": 0.00016732433038731242, + "loss": 0.7528, + "step": 90 + }, + { + "epoch": 0.2912, + "grad_norm": 0.34112146178712244, + "learning_rate": 0.0001665515096137797, + "loss": 0.7945, + "step": 91 + }, + { + "epoch": 0.2944, + "grad_norm": 0.32228872212014087, + "learning_rate": 0.00016577148706018328, + "loss": 0.7355, + "step": 92 + }, + { + "epoch": 0.2976, + "grad_norm": 0.34482854948240044, + "learning_rate": 0.00016498434713559088, + "loss": 0.7762, + "step": 93 + }, + { + "epoch": 0.3008, + "grad_norm": 0.3268373450822887, + "learning_rate": 0.00016419017501926656, + "loss": 0.7742, + "step": 94 + }, + { + "epoch": 0.304, + "grad_norm": 0.3124781713850874, + "learning_rate": 0.0001633890566514535, + "loss": 0.7461, + "step": 95 + }, + { + "epoch": 0.3072, + "grad_norm": 0.3160555408217458, + "learning_rate": 0.00016258107872407375, + "loss": 0.7547, + "step": 96 + }, + { + "epoch": 0.3104, + "grad_norm": 0.28533410896916955, + "learning_rate": 0.0001617663286713474, + "loss": 0.761, + "step": 97 + }, + { + "epoch": 0.3136, + "grad_norm": 0.3466820081651341, + "learning_rate": 0.00016094489466033043, + "loss": 0.7622, + "step": 98 + }, + { + "epoch": 0.3168, + "grad_norm": 0.3170141736712857, + "learning_rate": 0.00016011686558137448, + "loss": 0.7368, + "step": 99 + }, + { + "epoch": 0.32, + "grad_norm": 0.32190854194201374, + "learning_rate": 0.0001592823310385073, + "loss": 0.7318, + "step": 100 + }, + { + "epoch": 0.3232, + "grad_norm": 0.3131113945513269, + "learning_rate": 0.0001584413813397364, + "loss": 0.7466, + "step": 101 + }, + { + "epoch": 0.3264, + "grad_norm": 0.30220107219934655, + "learning_rate": 0.00015759410748727662, + "loss": 0.7164, + "step": 102 + }, + { + "epoch": 0.3296, + "grad_norm": 0.30952899615755614, + "learning_rate": 0.00015674060116770236, + "loss": 0.7578, + "step": 103 + }, + { + "epoch": 0.3328, + "grad_norm": 0.3327699148221942, + "learning_rate": 0.00015588095474202595, + "loss": 0.7523, + "step": 104 + }, + { + "epoch": 0.336, + "grad_norm": 0.2989396100887833, + "learning_rate": 0.00015501526123570277, + "loss": 0.7417, + "step": 105 + }, + { + "epoch": 0.3392, + "grad_norm": 0.30436033938363616, + "learning_rate": 0.00015414361432856475, + "loss": 0.7162, + "step": 106 + }, + { + "epoch": 0.3424, + "grad_norm": 0.31857262780407486, + "learning_rate": 0.0001532661083446829, + "loss": 0.7948, + "step": 107 + }, + { + "epoch": 0.3456, + "grad_norm": 0.3446000281880277, + "learning_rate": 0.00015238283824216015, + "loss": 0.7482, + "step": 108 + }, + { + "epoch": 0.3488, + "grad_norm": 0.3318185244789637, + "learning_rate": 0.00015149389960285558, + "loss": 0.7877, + "step": 109 + }, + { + "epoch": 0.352, + "grad_norm": 0.3124578801605435, + "learning_rate": 0.00015059938862204127, + "loss": 0.776, + "step": 110 + }, + { + "epoch": 0.3552, + "grad_norm": 0.3222820758875199, + "learning_rate": 0.00014969940209799248, + "loss": 0.7861, + "step": 111 + }, + { + "epoch": 0.3584, + "grad_norm": 0.3357041321844993, + "learning_rate": 0.00014879403742151283, + "loss": 0.7441, + "step": 112 + }, + { + "epoch": 0.3616, + "grad_norm": 0.4364427407564158, + "learning_rate": 0.00014788339256539544, + "loss": 0.7673, + "step": 113 + }, + { + "epoch": 0.3648, + "grad_norm": 0.33772379371505923, + "learning_rate": 0.0001469675660738206, + "loss": 0.7361, + "step": 114 + }, + { + "epoch": 0.368, + "grad_norm": 0.31324402461981987, + "learning_rate": 0.00014604665705169237, + "loss": 0.7438, + "step": 115 + }, + { + "epoch": 0.3712, + "grad_norm": 0.3284183728090651, + "learning_rate": 0.00014512076515391375, + "loss": 0.7591, + "step": 116 + }, + { + "epoch": 0.3744, + "grad_norm": 0.3437955854964839, + "learning_rate": 0.00014418999057460276, + "loss": 0.8042, + "step": 117 + }, + { + "epoch": 0.3776, + "grad_norm": 0.29966031920462866, + "learning_rate": 0.0001432544340362501, + "loss": 0.7445, + "step": 118 + }, + { + "epoch": 0.3808, + "grad_norm": 0.372993751987034, + "learning_rate": 0.00014231419677881966, + "loss": 0.7565, + "step": 119 + }, + { + "epoch": 0.384, + "grad_norm": 0.2909779283567334, + "learning_rate": 0.00014136938054879283, + "loss": 0.7319, + "step": 120 + }, + { + "epoch": 0.3872, + "grad_norm": 0.3400025202175588, + "learning_rate": 0.00014042008758815818, + "loss": 0.7561, + "step": 121 + }, + { + "epoch": 0.3904, + "grad_norm": 0.32075724784767795, + "learning_rate": 0.00013946642062334766, + "loss": 0.7587, + "step": 122 + }, + { + "epoch": 0.3936, + "grad_norm": 0.31341798591594283, + "learning_rate": 0.00013850848285411994, + "loss": 0.7454, + "step": 123 + }, + { + "epoch": 0.3968, + "grad_norm": 0.30536177734067066, + "learning_rate": 0.000137546377942393, + "loss": 0.7477, + "step": 124 + }, + { + "epoch": 0.4, + "grad_norm": 0.30946114337713626, + "learning_rate": 0.00013658021000102636, + "loss": 0.7363, + "step": 125 + }, + { + "epoch": 0.4032, + "grad_norm": 0.35863090762013095, + "learning_rate": 0.00013561008358255468, + "loss": 0.7739, + "step": 126 + }, + { + "epoch": 0.4064, + "grad_norm": 0.3177347916404916, + "learning_rate": 0.00013463610366787392, + "loss": 0.7341, + "step": 127 + }, + { + "epoch": 0.4096, + "grad_norm": 0.3080407358241847, + "learning_rate": 0.00013365837565488064, + "loss": 0.7344, + "step": 128 + }, + { + "epoch": 0.4128, + "grad_norm": 0.3091175540692471, + "learning_rate": 0.0001326770053470668, + "loss": 0.6835, + "step": 129 + }, + { + "epoch": 0.416, + "grad_norm": 0.3216071074288792, + "learning_rate": 0.0001316920989420703, + "loss": 0.7278, + "step": 130 + }, + { + "epoch": 0.4192, + "grad_norm": 0.30907097439090114, + "learning_rate": 0.00013070376302018287, + "loss": 0.7087, + "step": 131 + }, + { + "epoch": 0.4224, + "grad_norm": 0.29533175707516046, + "learning_rate": 0.00012971210453281674, + "loss": 0.7338, + "step": 132 + }, + { + "epoch": 0.4256, + "grad_norm": 0.29859223953122266, + "learning_rate": 0.000128717230790931, + "loss": 0.7488, + "step": 133 + }, + { + "epoch": 0.4288, + "grad_norm": 0.3279670824045337, + "learning_rate": 0.00012771924945341906, + "loss": 0.7461, + "step": 134 + }, + { + "epoch": 0.432, + "grad_norm": 0.31258437498493835, + "learning_rate": 0.00012671826851545851, + "loss": 0.7352, + "step": 135 + }, + { + "epoch": 0.4352, + "grad_norm": 0.35164392189882304, + "learning_rate": 0.0001257143962968246, + "loss": 0.7742, + "step": 136 + }, + { + "epoch": 0.4384, + "grad_norm": 0.3149668288035313, + "learning_rate": 0.00012470774143016853, + "loss": 0.768, + "step": 137 + }, + { + "epoch": 0.4416, + "grad_norm": 0.3599061987303891, + "learning_rate": 0.00012369841284926188, + "loss": 0.7325, + "step": 138 + }, + { + "epoch": 0.4448, + "grad_norm": 0.3306079740957748, + "learning_rate": 0.00012268651977720866, + "loss": 0.7917, + "step": 139 + }, + { + "epoch": 0.448, + "grad_norm": 0.34636813413125606, + "learning_rate": 0.00012167217171462566, + "loss": 0.7107, + "step": 140 + }, + { + "epoch": 0.4512, + "grad_norm": 0.3346405520896027, + "learning_rate": 0.0001206554784277931, + "loss": 0.7139, + "step": 141 + }, + { + "epoch": 0.4544, + "grad_norm": 0.2928458590251085, + "learning_rate": 0.00011963654993677645, + "loss": 0.7521, + "step": 142 + }, + { + "epoch": 0.4576, + "grad_norm": 0.3109268449994832, + "learning_rate": 0.00011861549650352069, + "loss": 0.692, + "step": 143 + }, + { + "epoch": 0.4608, + "grad_norm": 0.29609544220706363, + "learning_rate": 0.00011759242861991855, + "loss": 0.7294, + "step": 144 + }, + { + "epoch": 0.464, + "grad_norm": 0.30305951144341053, + "learning_rate": 0.00011656745699585371, + "loss": 0.7481, + "step": 145 + }, + { + "epoch": 0.4672, + "grad_norm": 0.33434639252903553, + "learning_rate": 0.00011554069254722051, + "loss": 0.7815, + "step": 146 + }, + { + "epoch": 0.4704, + "grad_norm": 0.3004114971651247, + "learning_rate": 0.00011451224638392129, + "loss": 0.7467, + "step": 147 + }, + { + "epoch": 0.4736, + "grad_norm": 0.3429547108807699, + "learning_rate": 0.00011348222979784289, + "loss": 0.7806, + "step": 148 + }, + { + "epoch": 0.4768, + "grad_norm": 0.282764607021962, + "learning_rate": 0.00011245075425081328, + "loss": 0.7453, + "step": 149 + }, + { + "epoch": 0.48, + "grad_norm": 0.3683700182658041, + "learning_rate": 0.00011141793136253986, + "loss": 0.7675, + "step": 150 + }, + { + "epoch": 0.4832, + "grad_norm": 0.3231498352348244, + "learning_rate": 0.0001103838728985307, + "loss": 0.7052, + "step": 151 + }, + { + "epoch": 0.4864, + "grad_norm": 0.316548397742817, + "learning_rate": 0.000109348690758, + "loss": 0.7333, + "step": 152 + }, + { + "epoch": 0.4896, + "grad_norm": 0.3251703264413508, + "learning_rate": 0.00010831249696175918, + "loss": 0.7192, + "step": 153 + }, + { + "epoch": 0.4928, + "grad_norm": 0.3271354998875943, + "learning_rate": 0.0001072754036400944, + "loss": 0.7945, + "step": 154 + }, + { + "epoch": 0.496, + "grad_norm": 0.2969502110605835, + "learning_rate": 0.00010623752302063283, + "loss": 0.7196, + "step": 155 + }, + { + "epoch": 0.4992, + "grad_norm": 0.2706588923709536, + "learning_rate": 0.00010519896741619803, + "loss": 0.7248, + "step": 156 + }, + { + "epoch": 0.5024, + "grad_norm": 0.312629753367858, + "learning_rate": 0.00010415984921265609, + "loss": 0.7611, + "step": 157 + }, + { + "epoch": 0.5056, + "grad_norm": 0.29415145442436164, + "learning_rate": 0.00010312028085675391, + "loss": 0.7013, + "step": 158 + }, + { + "epoch": 0.5088, + "grad_norm": 0.29003725066831476, + "learning_rate": 0.00010208037484395114, + "loss": 0.6681, + "step": 159 + }, + { + "epoch": 0.512, + "grad_norm": 0.29780012282274543, + "learning_rate": 0.00010104024370624644, + "loss": 0.707, + "step": 160 + }, + { + "epoch": 0.5152, + "grad_norm": 0.2930838441084976, + "learning_rate": 0.0001, + "loss": 0.692, + "step": 161 + }, + { + "epoch": 0.5184, + "grad_norm": 0.2830263152992799, + "learning_rate": 9.895975629375359e-05, + "loss": 0.7016, + "step": 162 + }, + { + "epoch": 0.5216, + "grad_norm": 0.2920987174195825, + "learning_rate": 9.791962515604887e-05, + "loss": 0.6349, + "step": 163 + }, + { + "epoch": 0.5248, + "grad_norm": 0.3383941646660978, + "learning_rate": 9.687971914324607e-05, + "loss": 0.7447, + "step": 164 + }, + { + "epoch": 0.528, + "grad_norm": 0.3394665075780364, + "learning_rate": 9.584015078734395e-05, + "loss": 0.7448, + "step": 165 + }, + { + "epoch": 0.5312, + "grad_norm": 0.36044537057037745, + "learning_rate": 9.480103258380198e-05, + "loss": 0.7565, + "step": 166 + }, + { + "epoch": 0.5344, + "grad_norm": 0.34190965060795325, + "learning_rate": 9.376247697936719e-05, + "loss": 0.7049, + "step": 167 + }, + { + "epoch": 0.5376, + "grad_norm": 0.3369093723997976, + "learning_rate": 9.272459635990562e-05, + "loss": 0.7361, + "step": 168 + }, + { + "epoch": 0.5408, + "grad_norm": 0.33384262876410487, + "learning_rate": 9.168750303824084e-05, + "loss": 0.7205, + "step": 169 + }, + { + "epoch": 0.544, + "grad_norm": 0.27998672705210387, + "learning_rate": 9.065130924199998e-05, + "loss": 0.6973, + "step": 170 + }, + { + "epoch": 0.5472, + "grad_norm": 0.30621372941067515, + "learning_rate": 8.961612710146934e-05, + "loss": 0.7052, + "step": 171 + }, + { + "epoch": 0.5504, + "grad_norm": 0.3216249063829598, + "learning_rate": 8.858206863746018e-05, + "loss": 0.7637, + "step": 172 + }, + { + "epoch": 0.5536, + "grad_norm": 0.2932825307667918, + "learning_rate": 8.754924574918675e-05, + "loss": 0.6974, + "step": 173 + }, + { + "epoch": 0.5568, + "grad_norm": 0.31375859729551603, + "learning_rate": 8.651777020215712e-05, + "loss": 0.7034, + "step": 174 + }, + { + "epoch": 0.56, + "grad_norm": 0.29066252870835996, + "learning_rate": 8.548775361607872e-05, + "loss": 0.7268, + "step": 175 + }, + { + "epoch": 0.5632, + "grad_norm": 0.29867126390130283, + "learning_rate": 8.445930745277953e-05, + "loss": 0.7029, + "step": 176 + }, + { + "epoch": 0.5664, + "grad_norm": 0.2959064734553686, + "learning_rate": 8.343254300414628e-05, + "loss": 0.7321, + "step": 177 + }, + { + "epoch": 0.5696, + "grad_norm": 0.3367281621157235, + "learning_rate": 8.240757138008149e-05, + "loss": 0.6896, + "step": 178 + }, + { + "epoch": 0.5728, + "grad_norm": 0.30749267277288383, + "learning_rate": 8.138450349647936e-05, + "loss": 0.7423, + "step": 179 + }, + { + "epoch": 0.576, + "grad_norm": 0.6004758823211203, + "learning_rate": 8.036345006322359e-05, + "loss": 0.656, + "step": 180 + }, + { + "epoch": 0.5792, + "grad_norm": 0.3164898094208934, + "learning_rate": 7.934452157220694e-05, + "loss": 0.6995, + "step": 181 + }, + { + "epoch": 0.5824, + "grad_norm": 0.3683208690733922, + "learning_rate": 7.832782828537437e-05, + "loss": 0.73, + "step": 182 + }, + { + "epoch": 0.5856, + "grad_norm": 0.3003501429522181, + "learning_rate": 7.731348022279134e-05, + "loss": 0.7171, + "step": 183 + }, + { + "epoch": 0.5888, + "grad_norm": 0.3118763215594028, + "learning_rate": 7.630158715073813e-05, + "loss": 0.6925, + "step": 184 + }, + { + "epoch": 0.592, + "grad_norm": 0.3176693090329536, + "learning_rate": 7.52922585698315e-05, + "loss": 0.7274, + "step": 185 + }, + { + "epoch": 0.5952, + "grad_norm": 0.27688207991554936, + "learning_rate": 7.428560370317542e-05, + "loss": 0.6609, + "step": 186 + }, + { + "epoch": 0.5984, + "grad_norm": 0.3245637087268195, + "learning_rate": 7.328173148454151e-05, + "loss": 0.6508, + "step": 187 + }, + { + "epoch": 0.6016, + "grad_norm": 0.2803661278936095, + "learning_rate": 7.228075054658096e-05, + "loss": 0.725, + "step": 188 + }, + { + "epoch": 0.6048, + "grad_norm": 0.2963398491469185, + "learning_rate": 7.1282769209069e-05, + "loss": 0.6988, + "step": 189 + }, + { + "epoch": 0.608, + "grad_norm": 0.34156377497721035, + "learning_rate": 7.028789546718326e-05, + "loss": 0.7411, + "step": 190 + }, + { + "epoch": 0.6112, + "grad_norm": 0.3047256704802507, + "learning_rate": 6.929623697981718e-05, + "loss": 0.7101, + "step": 191 + }, + { + "epoch": 0.6144, + "grad_norm": 0.36331590536488, + "learning_rate": 6.830790105792973e-05, + "loss": 0.7214, + "step": 192 + }, + { + "epoch": 0.6176, + "grad_norm": 0.3412957404622521, + "learning_rate": 6.732299465293322e-05, + "loss": 0.72, + "step": 193 + }, + { + "epoch": 0.6208, + "grad_norm": 0.2836530578324241, + "learning_rate": 6.63416243451194e-05, + "loss": 0.6792, + "step": 194 + }, + { + "epoch": 0.624, + "grad_norm": 0.2862139506723695, + "learning_rate": 6.536389633212609e-05, + "loss": 0.6922, + "step": 195 + }, + { + "epoch": 0.6272, + "grad_norm": 0.29564303690434485, + "learning_rate": 6.43899164174453e-05, + "loss": 0.7342, + "step": 196 + }, + { + "epoch": 0.6304, + "grad_norm": 0.3306938327020271, + "learning_rate": 6.341978999897365e-05, + "loss": 0.6893, + "step": 197 + }, + { + "epoch": 0.6336, + "grad_norm": 0.3019761746574809, + "learning_rate": 6.245362205760704e-05, + "loss": 0.7106, + "step": 198 + }, + { + "epoch": 0.6368, + "grad_norm": 0.27634579198568876, + "learning_rate": 6.149151714588009e-05, + "loss": 0.6771, + "step": 199 + }, + { + "epoch": 0.64, + "grad_norm": 0.28356734604700906, + "learning_rate": 6.053357937665237e-05, + "loss": 0.7382, + "step": 200 + }, + { + "epoch": 0.6432, + "grad_norm": 0.31036724528842696, + "learning_rate": 5.957991241184184e-05, + "loss": 0.7287, + "step": 201 + }, + { + "epoch": 0.6464, + "grad_norm": 0.32971015681074933, + "learning_rate": 5.863061945120719e-05, + "loss": 0.7307, + "step": 202 + }, + { + "epoch": 0.6496, + "grad_norm": 0.32640446318454774, + "learning_rate": 5.768580322118034e-05, + "loss": 0.7161, + "step": 203 + }, + { + "epoch": 0.6528, + "grad_norm": 0.2943147473660576, + "learning_rate": 5.6745565963749925e-05, + "loss": 0.6397, + "step": 204 + }, + { + "epoch": 0.656, + "grad_norm": 0.2868426565380326, + "learning_rate": 5.5810009425397294e-05, + "loss": 0.6502, + "step": 205 + }, + { + "epoch": 0.6592, + "grad_norm": 0.36398789995494085, + "learning_rate": 5.487923484608629e-05, + "loss": 0.7213, + "step": 206 + }, + { + "epoch": 0.6624, + "grad_norm": 0.30021806944587576, + "learning_rate": 5.395334294830765e-05, + "loss": 0.7079, + "step": 207 + }, + { + "epoch": 0.6656, + "grad_norm": 0.3142832482478888, + "learning_rate": 5.3032433926179395e-05, + "loss": 0.6839, + "step": 208 + }, + { + "epoch": 0.6688, + "grad_norm": 0.3321794459631511, + "learning_rate": 5.211660743460458e-05, + "loss": 0.7524, + "step": 209 + }, + { + "epoch": 0.672, + "grad_norm": 0.30203696871415725, + "learning_rate": 5.1205962578487155e-05, + "loss": 0.6999, + "step": 210 + }, + { + "epoch": 0.6752, + "grad_norm": 0.31914058118188965, + "learning_rate": 5.030059790200756e-05, + "loss": 0.7103, + "step": 211 + }, + { + "epoch": 0.6784, + "grad_norm": 0.30496302518566537, + "learning_rate": 4.940061137795876e-05, + "loss": 0.6886, + "step": 212 + }, + { + "epoch": 0.6816, + "grad_norm": 0.32059109058279794, + "learning_rate": 4.850610039714444e-05, + "loss": 0.6794, + "step": 213 + }, + { + "epoch": 0.6848, + "grad_norm": 0.30870403266829066, + "learning_rate": 4.761716175783989e-05, + "loss": 0.6887, + "step": 214 + }, + { + "epoch": 0.688, + "grad_norm": 0.2940897712048129, + "learning_rate": 4.673389165531714e-05, + "loss": 0.7463, + "step": 215 + }, + { + "epoch": 0.6912, + "grad_norm": 0.33241866905107526, + "learning_rate": 4.585638567143529e-05, + "loss": 0.7255, + "step": 216 + }, + { + "epoch": 0.6944, + "grad_norm": 0.31263803463835743, + "learning_rate": 4.498473876429726e-05, + "loss": 0.7073, + "step": 217 + }, + { + "epoch": 0.6976, + "grad_norm": 0.32520913061756423, + "learning_rate": 4.411904525797408e-05, + "loss": 0.7225, + "step": 218 + }, + { + "epoch": 0.7008, + "grad_norm": 0.32569839011013324, + "learning_rate": 4.325939883229766e-05, + "loss": 0.7089, + "step": 219 + }, + { + "epoch": 0.704, + "grad_norm": 0.27974184726308615, + "learning_rate": 4.240589251272342e-05, + "loss": 0.6762, + "step": 220 + }, + { + "epoch": 0.7072, + "grad_norm": 0.3004702382270151, + "learning_rate": 4.155861866026364e-05, + "loss": 0.6745, + "step": 221 + }, + { + "epoch": 0.7104, + "grad_norm": 0.31632043613545113, + "learning_rate": 4.071766896149273e-05, + "loss": 0.7191, + "step": 222 + }, + { + "epoch": 0.7136, + "grad_norm": 0.29698328654448874, + "learning_rate": 3.988313441862553e-05, + "loss": 0.709, + "step": 223 + }, + { + "epoch": 0.7168, + "grad_norm": 0.31943276650095204, + "learning_rate": 3.9055105339669595e-05, + "loss": 0.6898, + "step": 224 + }, + { + "epoch": 0.72, + "grad_norm": 0.3190564551595284, + "learning_rate": 3.823367132865265e-05, + "loss": 0.7239, + "step": 225 + }, + { + "epoch": 0.7232, + "grad_norm": 0.31518897840817817, + "learning_rate": 3.741892127592625e-05, + "loss": 0.7002, + "step": 226 + }, + { + "epoch": 0.7264, + "grad_norm": 0.3360925029469203, + "learning_rate": 3.6610943348546526e-05, + "loss": 0.7055, + "step": 227 + }, + { + "epoch": 0.7296, + "grad_norm": 0.2995768437981988, + "learning_rate": 3.580982498073344e-05, + "loss": 0.6661, + "step": 228 + }, + { + "epoch": 0.7328, + "grad_norm": 0.2946731621228957, + "learning_rate": 3.501565286440914e-05, + "loss": 0.721, + "step": 229 + }, + { + "epoch": 0.736, + "grad_norm": 0.2911787551058751, + "learning_rate": 3.422851293981676e-05, + "loss": 0.7028, + "step": 230 + }, + { + "epoch": 0.7392, + "grad_norm": 0.2984102430978739, + "learning_rate": 3.3448490386220355e-05, + "loss": 0.7129, + "step": 231 + }, + { + "epoch": 0.7424, + "grad_norm": 0.3799106909399135, + "learning_rate": 3.2675669612687565e-05, + "loss": 0.6967, + "step": 232 + }, + { + "epoch": 0.7456, + "grad_norm": 0.31291620109632406, + "learning_rate": 3.191013424895536e-05, + "loss": 0.6905, + "step": 233 + }, + { + "epoch": 0.7488, + "grad_norm": 0.4372365695739739, + "learning_rate": 3.115196713638e-05, + "loss": 0.7094, + "step": 234 + }, + { + "epoch": 0.752, + "grad_norm": 0.2921039485526144, + "learning_rate": 3.040125031897264e-05, + "loss": 0.6839, + "step": 235 + }, + { + "epoch": 0.7552, + "grad_norm": 0.3069506592386268, + "learning_rate": 2.9658065034520978e-05, + "loss": 0.6824, + "step": 236 + }, + { + "epoch": 0.7584, + "grad_norm": 0.2755230014315186, + "learning_rate": 2.892249170579826e-05, + "loss": 0.6905, + "step": 237 + }, + { + "epoch": 0.7616, + "grad_norm": 0.3075196306711869, + "learning_rate": 2.8194609931860316e-05, + "loss": 0.6977, + "step": 238 + }, + { + "epoch": 0.7648, + "grad_norm": 0.28606807442592747, + "learning_rate": 2.7474498479432087e-05, + "loss": 0.6891, + "step": 239 + }, + { + "epoch": 0.768, + "grad_norm": 0.29895449980208705, + "learning_rate": 2.6762235274383772e-05, + "loss": 0.658, + "step": 240 + }, + { + "epoch": 0.7712, + "grad_norm": 0.30074188335195584, + "learning_rate": 2.6057897393298324e-05, + "loss": 0.6707, + "step": 241 + }, + { + "epoch": 0.7744, + "grad_norm": 0.3112086183650853, + "learning_rate": 2.536156105513062e-05, + "loss": 0.7298, + "step": 242 + }, + { + "epoch": 0.7776, + "grad_norm": 0.3406386820425041, + "learning_rate": 2.4673301612959654e-05, + "loss": 0.7152, + "step": 243 + }, + { + "epoch": 0.7808, + "grad_norm": 0.3565063375536763, + "learning_rate": 2.399319354583418e-05, + "loss": 0.7518, + "step": 244 + }, + { + "epoch": 0.784, + "grad_norm": 0.33774418398181755, + "learning_rate": 2.3321310450713062e-05, + "loss": 0.6895, + "step": 245 + }, + { + "epoch": 0.7872, + "grad_norm": 0.3012525960263884, + "learning_rate": 2.265772503450122e-05, + "loss": 0.6709, + "step": 246 + }, + { + "epoch": 0.7904, + "grad_norm": 0.3349355781125803, + "learning_rate": 2.2002509106181624e-05, + "loss": 0.7059, + "step": 247 + }, + { + "epoch": 0.7936, + "grad_norm": 0.3183580630182977, + "learning_rate": 2.1355733569044635e-05, + "loss": 0.7041, + "step": 248 + }, + { + "epoch": 0.7968, + "grad_norm": 0.27245920061131146, + "learning_rate": 2.0717468413015283e-05, + "loss": 0.665, + "step": 249 + }, + { + "epoch": 0.8, + "grad_norm": 0.3081707374073957, + "learning_rate": 2.008778270707944e-05, + "loss": 0.6863, + "step": 250 + }, + { + "epoch": 0.8032, + "grad_norm": 0.3009755101021308, + "learning_rate": 1.946674459180955e-05, + "loss": 0.7033, + "step": 251 + }, + { + "epoch": 0.8064, + "grad_norm": 0.3524705284560279, + "learning_rate": 1.8854421271990964e-05, + "loss": 0.7222, + "step": 252 + }, + { + "epoch": 0.8096, + "grad_norm": 0.3252227822817956, + "learning_rate": 1.8250879009349398e-05, + "loss": 0.7449, + "step": 253 + }, + { + "epoch": 0.8128, + "grad_norm": 0.3182706619088614, + "learning_rate": 1.7656183115380577e-05, + "loss": 0.6708, + "step": 254 + }, + { + "epoch": 0.816, + "grad_norm": 0.3378575366682444, + "learning_rate": 1.707039794428259e-05, + "loss": 0.7109, + "step": 255 + }, + { + "epoch": 0.8192, + "grad_norm": 0.34979777520246824, + "learning_rate": 1.649358688599191e-05, + "loss": 0.7123, + "step": 256 + }, + { + "epoch": 0.8224, + "grad_norm": 0.3107408989330393, + "learning_rate": 1.5925812359323745e-05, + "loss": 0.6951, + "step": 257 + }, + { + "epoch": 0.8256, + "grad_norm": 0.35083458885711105, + "learning_rate": 1.5367135805217458e-05, + "loss": 0.669, + "step": 258 + }, + { + "epoch": 0.8288, + "grad_norm": 0.30289873995071404, + "learning_rate": 1.4817617680087825e-05, + "loss": 0.719, + "step": 259 + }, + { + "epoch": 0.832, + "grad_norm": 0.27703663598852074, + "learning_rate": 1.4277317449282834e-05, + "loss": 0.6579, + "step": 260 + }, + { + "epoch": 0.8352, + "grad_norm": 0.2997192438597723, + "learning_rate": 1.3746293580648717e-05, + "loss": 0.6932, + "step": 261 + }, + { + "epoch": 0.8384, + "grad_norm": 0.37058705280304277, + "learning_rate": 1.3224603538202929e-05, + "loss": 0.7306, + "step": 262 + }, + { + "epoch": 0.8416, + "grad_norm": 0.30789069679483144, + "learning_rate": 1.2712303775915802e-05, + "loss": 0.6972, + "step": 263 + }, + { + "epoch": 0.8448, + "grad_norm": 0.2839052945678128, + "learning_rate": 1.220944973160133e-05, + "loss": 0.6897, + "step": 264 + }, + { + "epoch": 0.848, + "grad_norm": 0.3060387262874097, + "learning_rate": 1.1716095820918216e-05, + "loss": 0.6945, + "step": 265 + }, + { + "epoch": 0.8512, + "grad_norm": 0.28373072999622234, + "learning_rate": 1.1232295431481222e-05, + "loss": 0.6981, + "step": 266 + }, + { + "epoch": 0.8544, + "grad_norm": 0.3470563517363565, + "learning_rate": 1.0758100917083991e-05, + "loss": 0.7103, + "step": 267 + }, + { + "epoch": 0.8576, + "grad_norm": 0.2974310593458591, + "learning_rate": 1.0293563592033595e-05, + "loss": 0.6625, + "step": 268 + }, + { + "epoch": 0.8608, + "grad_norm": 0.3113942036487974, + "learning_rate": 9.838733725597615e-06, + "loss": 0.732, + "step": 269 + }, + { + "epoch": 0.864, + "grad_norm": 0.31112002998100463, + "learning_rate": 9.393660536564408e-06, + "loss": 0.6953, + "step": 270 + }, + { + "epoch": 0.8672, + "grad_norm": 0.2885181475137938, + "learning_rate": 8.958392187916841e-06, + "loss": 0.6669, + "step": 271 + }, + { + "epoch": 0.8704, + "grad_norm": 0.35733844424630434, + "learning_rate": 8.532975781620512e-06, + "loss": 0.7176, + "step": 272 + }, + { + "epoch": 0.8736, + "grad_norm": 0.3064149217986143, + "learning_rate": 8.117457353526625e-06, + "loss": 0.6889, + "step": 273 + }, + { + "epoch": 0.8768, + "grad_norm": 0.3333949354787685, + "learning_rate": 7.711881868390291e-06, + "loss": 0.7275, + "step": 274 + }, + { + "epoch": 0.88, + "grad_norm": 0.31134604025405793, + "learning_rate": 7.3162932150046885e-06, + "loss": 0.7276, + "step": 275 + }, + { + "epoch": 0.8832, + "grad_norm": 0.2734743339726324, + "learning_rate": 6.930734201451816e-06, + "loss": 0.6372, + "step": 276 + }, + { + "epoch": 0.8864, + "grad_norm": 0.31277695488379526, + "learning_rate": 6.555246550469907e-06, + "loss": 0.6953, + "step": 277 + }, + { + "epoch": 0.8896, + "grad_norm": 0.30762019412564784, + "learning_rate": 6.189870894938587e-06, + "loss": 0.6513, + "step": 278 + }, + { + "epoch": 0.8928, + "grad_norm": 0.2902772414260849, + "learning_rate": 5.834646773481811e-06, + "loss": 0.6699, + "step": 279 + }, + { + "epoch": 0.896, + "grad_norm": 0.30134874847806303, + "learning_rate": 5.489612626189245e-06, + "loss": 0.6849, + "step": 280 + }, + { + "epoch": 0.8992, + "grad_norm": 0.3390718969814782, + "learning_rate": 5.154805790456485e-06, + "loss": 0.7058, + "step": 281 + }, + { + "epoch": 0.9024, + "grad_norm": 0.3199046583330719, + "learning_rate": 4.830262496944693e-06, + "loss": 0.7397, + "step": 282 + }, + { + "epoch": 0.9056, + "grad_norm": 0.29107381202022176, + "learning_rate": 4.516017865659949e-06, + "loss": 0.6921, + "step": 283 + }, + { + "epoch": 0.9088, + "grad_norm": 0.321685007765904, + "learning_rate": 4.21210590215273e-06, + "loss": 0.7163, + "step": 284 + }, + { + "epoch": 0.912, + "grad_norm": 0.29299845007697267, + "learning_rate": 3.918559493838114e-06, + "loss": 0.6925, + "step": 285 + }, + { + "epoch": 0.9152, + "grad_norm": 0.2836179458349143, + "learning_rate": 3.6354104064368566e-06, + "loss": 0.6154, + "step": 286 + }, + { + "epoch": 0.9184, + "grad_norm": 0.33320743402639, + "learning_rate": 3.3626892805379562e-06, + "loss": 0.6994, + "step": 287 + }, + { + "epoch": 0.9216, + "grad_norm": 0.282111649725243, + "learning_rate": 3.100425628282899e-06, + "loss": 0.6906, + "step": 288 + }, + { + "epoch": 0.9248, + "grad_norm": 0.2917477526657873, + "learning_rate": 2.848647830172024e-06, + "loss": 0.658, + "step": 289 + }, + { + "epoch": 0.928, + "grad_norm": 0.26813610476565086, + "learning_rate": 2.607383131993424e-06, + "loss": 0.6543, + "step": 290 + }, + { + "epoch": 0.9312, + "grad_norm": 0.2893710331120229, + "learning_rate": 2.3766576418745022e-06, + "loss": 0.6655, + "step": 291 + }, + { + "epoch": 0.9344, + "grad_norm": 0.3282785399000293, + "learning_rate": 2.1564963274568027e-06, + "loss": 0.7415, + "step": 292 + }, + { + "epoch": 0.9376, + "grad_norm": 0.3914289158651545, + "learning_rate": 1.9469230131940907e-06, + "loss": 0.664, + "step": 293 + }, + { + "epoch": 0.9408, + "grad_norm": 0.2862620965554168, + "learning_rate": 1.7479603777742938e-06, + "loss": 0.6638, + "step": 294 + }, + { + "epoch": 0.944, + "grad_norm": 0.27936560912003844, + "learning_rate": 1.559629951665298e-06, + "loss": 0.6317, + "step": 295 + }, + { + "epoch": 0.9472, + "grad_norm": 0.29097538702075687, + "learning_rate": 1.3819521147851123e-06, + "loss": 0.6657, + "step": 296 + }, + { + "epoch": 0.9504, + "grad_norm": 0.28207822803976385, + "learning_rate": 1.2149460942964098e-06, + "loss": 0.6987, + "step": 297 + }, + { + "epoch": 0.9536, + "grad_norm": 0.3410965732242547, + "learning_rate": 1.05862996252597e-06, + "loss": 0.7136, + "step": 298 + }, + { + "epoch": 0.9568, + "grad_norm": 0.3086319422171524, + "learning_rate": 9.130206350089765e-07, + "loss": 0.68, + "step": 299 + }, + { + "epoch": 0.96, + "grad_norm": 0.3496626009727568, + "learning_rate": 7.781338686584927e-07, + "loss": 0.721, + "step": 300 + }, + { + "epoch": 0.9632, + "grad_norm": 0.36261055877706444, + "learning_rate": 6.539842600603918e-07, + "loss": 0.6409, + "step": 301 + }, + { + "epoch": 0.9664, + "grad_norm": 0.2903284042216192, + "learning_rate": 5.405852438937764e-07, + "loss": 0.6874, + "step": 302 + }, + { + "epoch": 0.9696, + "grad_norm": 0.2948685039371921, + "learning_rate": 4.3794909147720773e-07, + "loss": 0.7115, + "step": 303 + }, + { + "epoch": 0.9728, + "grad_norm": 0.27829102739569256, + "learning_rate": 3.4608690944071263e-07, + "loss": 0.6984, + "step": 304 + }, + { + "epoch": 0.976, + "grad_norm": 0.28770393639606623, + "learning_rate": 2.6500863852395584e-07, + "loss": 0.7037, + "step": 305 + }, + { + "epoch": 0.9792, + "grad_norm": 0.2873587388835803, + "learning_rate": 1.947230525005006e-07, + "loss": 0.6658, + "step": 306 + }, + { + "epoch": 0.9824, + "grad_norm": 0.3231525112021064, + "learning_rate": 1.3523775722834587e-07, + "loss": 0.6904, + "step": 307 + }, + { + "epoch": 0.9856, + "grad_norm": 0.32926237386900375, + "learning_rate": 8.655918982689581e-08, + "loss": 0.7014, + "step": 308 + }, + { + "epoch": 0.9888, + "grad_norm": 0.3137732749319311, + "learning_rate": 4.8692617980350406e-08, + "loss": 0.7192, + "step": 309 + }, + { + "epoch": 0.992, + "grad_norm": 0.3129977563017675, + "learning_rate": 2.164213936770576e-08, + "loss": 0.6747, + "step": 310 + }, + { + "epoch": 0.9952, + "grad_norm": 0.3062371164589154, + "learning_rate": 5.410681219286673e-09, + "loss": 0.662, + "step": 311 + }, + { + "epoch": 0.9984, + "grad_norm": 0.29590600103168474, + "learning_rate": 0.0, + "loss": 0.6665, + "step": 312 + }, + { + "epoch": 0.9984, + "step": 312, + "total_flos": 796663880482816.0, + "train_loss": 0.7521186127112462, + "train_runtime": 9643.3372, + "train_samples_per_second": 1.037, + "train_steps_per_second": 0.032 + } + ], + "logging_steps": 1.0, + "max_steps": 312, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 796663880482816.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_2_epochs_1_GA_2_lora/README.md b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_2_epochs_1_GA_2_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_2_epochs_1_GA_2_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_2_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_2_epochs_1_GA_2_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4d391007f29d780ba267573f8c39cc3ce5d1ab35 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_2_epochs_1_GA_2_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "k_proj", + "o_proj", + "up_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..40fbf5934c32a8304eecc40f24d599c549e0a4f6 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80f39ec3b5906fd72d240a40dbf2aa29de89e8405d5be1eb3dd7fa3e93e096a1 +size 671150064 diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_2_epochs_1_GA_2_lora/config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_2_epochs_1_GA_2_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_2_epochs_1_GA_2_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..a9e986407ba6a6e1b186fad3dd2e6b41959985be --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e653d5dcf7750dab6d448a93903f641d9915f3f808c87456068c49c58c954653 +size 918507402 diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_2_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_2_epochs_1_GA_2_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..7e2302618b3a5966b1ac501634b46ef880010661 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_2_epochs_1_GA_2_lora/trainer_state.json @@ -0,0 +1,4417 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 625, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0016, + "grad_norm": 1.1517286163583604, + "learning_rate": 1.0526315789473684e-05, + "loss": 1.6687, + "step": 1 + }, + { + "epoch": 0.0032, + "grad_norm": 0.8784673231706809, + "learning_rate": 2.105263157894737e-05, + "loss": 1.3376, + "step": 2 + }, + { + "epoch": 0.0048, + "grad_norm": 0.855709195397984, + "learning_rate": 3.157894736842105e-05, + "loss": 1.3421, + "step": 3 + }, + { + "epoch": 0.0064, + "grad_norm": 0.8545594441581159, + "learning_rate": 4.210526315789474e-05, + "loss": 1.3895, + "step": 4 + }, + { + "epoch": 0.008, + "grad_norm": 0.9596426719614186, + "learning_rate": 5.2631578947368424e-05, + "loss": 1.3588, + "step": 5 + }, + { + "epoch": 0.0096, + "grad_norm": 0.7660632146845626, + "learning_rate": 6.31578947368421e-05, + "loss": 1.1282, + "step": 6 + }, + { + "epoch": 0.0112, + "grad_norm": 0.9171820232586981, + "learning_rate": 7.368421052631579e-05, + "loss": 1.1249, + "step": 7 + }, + { + "epoch": 0.0128, + "grad_norm": 0.9058274429801229, + "learning_rate": 8.421052631578948e-05, + "loss": 1.0576, + "step": 8 + }, + { + "epoch": 0.0144, + "grad_norm": 0.9248654267702412, + "learning_rate": 9.473684210526316e-05, + "loss": 0.9737, + "step": 9 + }, + { + "epoch": 0.016, + "grad_norm": 0.717052598253239, + "learning_rate": 0.00010526315789473685, + "loss": 0.8998, + "step": 10 + }, + { + "epoch": 0.0176, + "grad_norm": 0.8750729713384804, + "learning_rate": 0.00011578947368421053, + "loss": 0.9871, + "step": 11 + }, + { + "epoch": 0.0192, + "grad_norm": 0.7595872834902654, + "learning_rate": 0.0001263157894736842, + "loss": 0.9514, + "step": 12 + }, + { + "epoch": 0.0208, + "grad_norm": 0.6110120148789119, + "learning_rate": 0.0001368421052631579, + "loss": 0.9019, + "step": 13 + }, + { + "epoch": 0.0224, + "grad_norm": 0.6765927184994187, + "learning_rate": 0.00014736842105263158, + "loss": 0.9269, + "step": 14 + }, + { + "epoch": 0.024, + "grad_norm": 0.5259268527470042, + "learning_rate": 0.00015789473684210527, + "loss": 0.8948, + "step": 15 + }, + { + "epoch": 0.0256, + "grad_norm": 0.5181980571589642, + "learning_rate": 0.00016842105263157895, + "loss": 0.7939, + "step": 16 + }, + { + "epoch": 0.0272, + "grad_norm": 0.5405157764502099, + "learning_rate": 0.00017894736842105264, + "loss": 0.8628, + "step": 17 + }, + { + "epoch": 0.0288, + "grad_norm": 0.5241165122992703, + "learning_rate": 0.00018947368421052632, + "loss": 0.8811, + "step": 18 + }, + { + "epoch": 0.0304, + "grad_norm": 0.6112460052726951, + "learning_rate": 0.0002, + "loss": 0.8613, + "step": 19 + }, + { + "epoch": 0.032, + "grad_norm": 0.4723168313134422, + "learning_rate": 0.00019999865623437013, + "loss": 0.7921, + "step": 20 + }, + { + "epoch": 0.0336, + "grad_norm": 0.5454638138288851, + "learning_rate": 0.00019999462497359466, + "loss": 0.8534, + "step": 21 + }, + { + "epoch": 0.0352, + "grad_norm": 0.5756397377455599, + "learning_rate": 0.00019998790632601496, + "loss": 0.9456, + "step": 22 + }, + { + "epoch": 0.0368, + "grad_norm": 0.5768150789347223, + "learning_rate": 0.0001999785004721968, + "loss": 0.838, + "step": 23 + }, + { + "epoch": 0.0384, + "grad_norm": 0.5889828445999925, + "learning_rate": 0.00019996640766492543, + "loss": 0.8393, + "step": 24 + }, + { + "epoch": 0.04, + "grad_norm": 0.5731511487094392, + "learning_rate": 0.00019995162822919883, + "loss": 0.9322, + "step": 25 + }, + { + "epoch": 0.0416, + "grad_norm": 0.4824620621492906, + "learning_rate": 0.00019993416256221895, + "loss": 0.8379, + "step": 26 + }, + { + "epoch": 0.0432, + "grad_norm": 0.4662326949499265, + "learning_rate": 0.00019991401113338104, + "loss": 0.8727, + "step": 27 + }, + { + "epoch": 0.0448, + "grad_norm": 0.45039686785700206, + "learning_rate": 0.00019989117448426108, + "loss": 0.7945, + "step": 28 + }, + { + "epoch": 0.0464, + "grad_norm": 0.4843308416362078, + "learning_rate": 0.00019986565322860115, + "loss": 0.9525, + "step": 29 + }, + { + "epoch": 0.048, + "grad_norm": 0.47706475655670366, + "learning_rate": 0.00019983744805229296, + "loss": 0.8715, + "step": 30 + }, + { + "epoch": 0.0496, + "grad_norm": 0.5573219523232408, + "learning_rate": 0.00019980655971335945, + "loss": 0.8525, + "step": 31 + }, + { + "epoch": 0.0512, + "grad_norm": 0.43732275764080164, + "learning_rate": 0.00019977298904193437, + "loss": 0.8131, + "step": 32 + }, + { + "epoch": 0.0528, + "grad_norm": 0.5557734558864179, + "learning_rate": 0.00019973673694024, + "loss": 0.8559, + "step": 33 + }, + { + "epoch": 0.0544, + "grad_norm": 0.4752817029198449, + "learning_rate": 0.00019969780438256293, + "loss": 0.7842, + "step": 34 + }, + { + "epoch": 0.056, + "grad_norm": 0.48562282417593106, + "learning_rate": 0.0001996561924152278, + "loss": 0.7702, + "step": 35 + }, + { + "epoch": 0.0576, + "grad_norm": 0.4838085619463917, + "learning_rate": 0.0001996119021565693, + "loss": 0.8311, + "step": 36 + }, + { + "epoch": 0.0592, + "grad_norm": 0.5665340239580058, + "learning_rate": 0.0001995649347969019, + "loss": 0.8535, + "step": 37 + }, + { + "epoch": 0.0608, + "grad_norm": 0.45309016580075406, + "learning_rate": 0.00019951529159848805, + "loss": 0.8583, + "step": 38 + }, + { + "epoch": 0.0624, + "grad_norm": 0.46852754945064595, + "learning_rate": 0.00019946297389550433, + "loss": 0.8407, + "step": 39 + }, + { + "epoch": 0.064, + "grad_norm": 0.4728208757643744, + "learning_rate": 0.00019940798309400526, + "loss": 0.8251, + "step": 40 + }, + { + "epoch": 0.0656, + "grad_norm": 0.48544863508651903, + "learning_rate": 0.0001993503206718859, + "loss": 0.8656, + "step": 41 + }, + { + "epoch": 0.0672, + "grad_norm": 0.4518664862237965, + "learning_rate": 0.00019928998817884182, + "loss": 0.8666, + "step": 42 + }, + { + "epoch": 0.0688, + "grad_norm": 0.44540747326702773, + "learning_rate": 0.00019922698723632767, + "loss": 0.8326, + "step": 43 + }, + { + "epoch": 0.0704, + "grad_norm": 0.45806596449842496, + "learning_rate": 0.00019916131953751342, + "loss": 0.8779, + "step": 44 + }, + { + "epoch": 0.072, + "grad_norm": 0.5313631148317027, + "learning_rate": 0.00019909298684723904, + "loss": 0.891, + "step": 45 + }, + { + "epoch": 0.0736, + "grad_norm": 0.4518362124365464, + "learning_rate": 0.00019902199100196697, + "loss": 0.8882, + "step": 46 + }, + { + "epoch": 0.0752, + "grad_norm": 0.3729338492969433, + "learning_rate": 0.00019894833390973266, + "loss": 0.6883, + "step": 47 + }, + { + "epoch": 0.0768, + "grad_norm": 0.4809884109220938, + "learning_rate": 0.00019887201755009357, + "loss": 0.8261, + "step": 48 + }, + { + "epoch": 0.0784, + "grad_norm": 0.5463583180814995, + "learning_rate": 0.0001987930439740757, + "loss": 0.8297, + "step": 49 + }, + { + "epoch": 0.08, + "grad_norm": 0.5172391859428617, + "learning_rate": 0.00019871141530411853, + "loss": 0.9155, + "step": 50 + }, + { + "epoch": 0.0816, + "grad_norm": 0.4003614858527851, + "learning_rate": 0.0001986271337340182, + "loss": 0.8124, + "step": 51 + }, + { + "epoch": 0.0832, + "grad_norm": 0.40791569838786457, + "learning_rate": 0.00019854020152886814, + "loss": 0.8116, + "step": 52 + }, + { + "epoch": 0.0848, + "grad_norm": 0.4405159119511458, + "learning_rate": 0.0001984506210249986, + "loss": 0.8345, + "step": 53 + }, + { + "epoch": 0.0864, + "grad_norm": 0.42517174461685453, + "learning_rate": 0.00019835839462991361, + "loss": 0.7659, + "step": 54 + }, + { + "epoch": 0.088, + "grad_norm": 0.4460029687562156, + "learning_rate": 0.00019826352482222638, + "loss": 0.7865, + "step": 55 + }, + { + "epoch": 0.0896, + "grad_norm": 0.5226197779801623, + "learning_rate": 0.00019816601415159263, + "loss": 0.8699, + "step": 56 + }, + { + "epoch": 0.0912, + "grad_norm": 0.4937701360268512, + "learning_rate": 0.0001980658652386421, + "loss": 0.8438, + "step": 57 + }, + { + "epoch": 0.0928, + "grad_norm": 0.38499805535185944, + "learning_rate": 0.00019796308077490817, + "loss": 0.705, + "step": 58 + }, + { + "epoch": 0.0944, + "grad_norm": 0.3957225137739923, + "learning_rate": 0.00019785766352275542, + "loss": 0.7261, + "step": 59 + }, + { + "epoch": 0.096, + "grad_norm": 0.4436071355924252, + "learning_rate": 0.00019774961631530545, + "loss": 0.8062, + "step": 60 + }, + { + "epoch": 0.0976, + "grad_norm": 0.5060143537952035, + "learning_rate": 0.00019763894205636072, + "loss": 0.8937, + "step": 61 + }, + { + "epoch": 0.0992, + "grad_norm": 0.4666310340240579, + "learning_rate": 0.00019752564372032657, + "loss": 0.8011, + "step": 62 + }, + { + "epoch": 0.1008, + "grad_norm": 0.47516294685470856, + "learning_rate": 0.00019740972435213115, + "loss": 0.7643, + "step": 63 + }, + { + "epoch": 0.1024, + "grad_norm": 0.41948296178257, + "learning_rate": 0.00019729118706714375, + "loss": 0.8029, + "step": 64 + }, + { + "epoch": 0.104, + "grad_norm": 0.9356137828629628, + "learning_rate": 0.00019717003505109095, + "loss": 0.8717, + "step": 65 + }, + { + "epoch": 0.1056, + "grad_norm": 0.5003178590315507, + "learning_rate": 0.00019704627155997108, + "loss": 0.8529, + "step": 66 + }, + { + "epoch": 0.1072, + "grad_norm": 0.5574950858380237, + "learning_rate": 0.00019691989991996663, + "loss": 0.7995, + "step": 67 + }, + { + "epoch": 0.1088, + "grad_norm": 0.46550720319245187, + "learning_rate": 0.0001967909235273549, + "loss": 0.8016, + "step": 68 + }, + { + "epoch": 0.1104, + "grad_norm": 0.47864652266516605, + "learning_rate": 0.00019665934584841682, + "loss": 0.7453, + "step": 69 + }, + { + "epoch": 0.112, + "grad_norm": 0.5942420710638296, + "learning_rate": 0.00019652517041934356, + "loss": 0.8745, + "step": 70 + }, + { + "epoch": 0.1136, + "grad_norm": 0.42090932279804627, + "learning_rate": 0.00019638840084614182, + "loss": 0.7448, + "step": 71 + }, + { + "epoch": 0.1152, + "grad_norm": 0.5049145369376222, + "learning_rate": 0.00019624904080453655, + "loss": 0.8252, + "step": 72 + }, + { + "epoch": 0.1168, + "grad_norm": 0.5792342825330761, + "learning_rate": 0.00019610709403987246, + "loss": 0.8019, + "step": 73 + }, + { + "epoch": 0.1184, + "grad_norm": 0.44998541600410774, + "learning_rate": 0.00019596256436701324, + "loss": 0.7326, + "step": 74 + }, + { + "epoch": 0.12, + "grad_norm": 0.5365343085617726, + "learning_rate": 0.000195815455670239, + "loss": 0.7877, + "step": 75 + }, + { + "epoch": 0.1216, + "grad_norm": 0.5795955132721454, + "learning_rate": 0.00019566577190314197, + "loss": 0.8862, + "step": 76 + }, + { + "epoch": 0.1232, + "grad_norm": 0.4675448423250248, + "learning_rate": 0.0001955135170885202, + "loss": 0.8409, + "step": 77 + }, + { + "epoch": 0.1248, + "grad_norm": 0.4555860592918079, + "learning_rate": 0.00019535869531826937, + "loss": 0.7593, + "step": 78 + }, + { + "epoch": 0.1264, + "grad_norm": 0.3889970617977604, + "learning_rate": 0.00019520131075327298, + "loss": 0.7132, + "step": 79 + }, + { + "epoch": 0.128, + "grad_norm": 0.49458897281250364, + "learning_rate": 0.00019504136762329047, + "loss": 0.823, + "step": 80 + }, + { + "epoch": 0.1296, + "grad_norm": 0.4715921148144629, + "learning_rate": 0.00019487887022684336, + "loss": 0.803, + "step": 81 + }, + { + "epoch": 0.1312, + "grad_norm": 0.5908116456128671, + "learning_rate": 0.00019471382293110003, + "loss": 0.9233, + "step": 82 + }, + { + "epoch": 0.1328, + "grad_norm": 0.4207991491358518, + "learning_rate": 0.00019454623017175812, + "loss": 0.8111, + "step": 83 + }, + { + "epoch": 0.1344, + "grad_norm": 0.4299783137076167, + "learning_rate": 0.00019437609645292546, + "loss": 0.7235, + "step": 84 + }, + { + "epoch": 0.136, + "grad_norm": 0.5059751236281711, + "learning_rate": 0.0001942034263469989, + "loss": 0.8008, + "step": 85 + }, + { + "epoch": 0.1376, + "grad_norm": 0.4622847840305097, + "learning_rate": 0.00019402822449454153, + "loss": 0.8108, + "step": 86 + }, + { + "epoch": 0.1392, + "grad_norm": 0.5036533790171484, + "learning_rate": 0.00019385049560415794, + "loss": 0.8419, + "step": 87 + }, + { + "epoch": 0.1408, + "grad_norm": 0.5170320322443288, + "learning_rate": 0.00019367024445236754, + "loss": 0.7704, + "step": 88 + }, + { + "epoch": 0.1424, + "grad_norm": 0.45757400492822115, + "learning_rate": 0.00019348747588347637, + "loss": 0.7597, + "step": 89 + }, + { + "epoch": 0.144, + "grad_norm": 0.44937124185934485, + "learning_rate": 0.00019330219480944694, + "loss": 0.8607, + "step": 90 + }, + { + "epoch": 0.1456, + "grad_norm": 0.42185510062315396, + "learning_rate": 0.00019311440620976597, + "loss": 0.7624, + "step": 91 + }, + { + "epoch": 0.1472, + "grad_norm": 0.4855170008842487, + "learning_rate": 0.0001929241151313108, + "loss": 0.8145, + "step": 92 + }, + { + "epoch": 0.1488, + "grad_norm": 0.4009385360390872, + "learning_rate": 0.00019273132668821364, + "loss": 0.8596, + "step": 93 + }, + { + "epoch": 0.1504, + "grad_norm": 0.4509916168171082, + "learning_rate": 0.00019253604606172417, + "loss": 0.7805, + "step": 94 + }, + { + "epoch": 0.152, + "grad_norm": 0.4136792288096338, + "learning_rate": 0.00019233827850007027, + "loss": 0.75, + "step": 95 + }, + { + "epoch": 0.1536, + "grad_norm": 0.4085754322271624, + "learning_rate": 0.00019213802931831696, + "loss": 0.7948, + "step": 96 + }, + { + "epoch": 0.1552, + "grad_norm": 0.5408603055037793, + "learning_rate": 0.00019193530389822363, + "loss": 0.8711, + "step": 97 + }, + { + "epoch": 0.1568, + "grad_norm": 0.4645924737906978, + "learning_rate": 0.00019173010768809933, + "loss": 0.8197, + "step": 98 + }, + { + "epoch": 0.1584, + "grad_norm": 0.46107409721762693, + "learning_rate": 0.0001915224462026563, + "loss": 0.8253, + "step": 99 + }, + { + "epoch": 0.16, + "grad_norm": 0.42789001390976716, + "learning_rate": 0.00019131232502286188, + "loss": 0.781, + "step": 100 + }, + { + "epoch": 0.1616, + "grad_norm": 0.4273494191543858, + "learning_rate": 0.0001910997497957885, + "loss": 0.7498, + "step": 101 + }, + { + "epoch": 0.1632, + "grad_norm": 0.5073261389731233, + "learning_rate": 0.00019088472623446183, + "loss": 0.7952, + "step": 102 + }, + { + "epoch": 0.1648, + "grad_norm": 0.499046365441512, + "learning_rate": 0.00019066726011770726, + "loss": 0.7693, + "step": 103 + }, + { + "epoch": 0.1664, + "grad_norm": 0.5124867039670303, + "learning_rate": 0.0001904473572899947, + "loss": 0.8373, + "step": 104 + }, + { + "epoch": 0.168, + "grad_norm": 0.48089266147860954, + "learning_rate": 0.00019022502366128135, + "loss": 0.8139, + "step": 105 + }, + { + "epoch": 0.1696, + "grad_norm": 0.42551932506628354, + "learning_rate": 0.00019000026520685302, + "loss": 0.7936, + "step": 106 + }, + { + "epoch": 0.1712, + "grad_norm": 0.44217323707415573, + "learning_rate": 0.0001897730879671634, + "loss": 0.8486, + "step": 107 + }, + { + "epoch": 0.1728, + "grad_norm": 0.4117639798788838, + "learning_rate": 0.00018954349804767184, + "loss": 0.8302, + "step": 108 + }, + { + "epoch": 0.1744, + "grad_norm": 0.4542704177111429, + "learning_rate": 0.00018931150161867916, + "loss": 0.7589, + "step": 109 + }, + { + "epoch": 0.176, + "grad_norm": 0.414461489037142, + "learning_rate": 0.00018907710491516199, + "loss": 0.8374, + "step": 110 + }, + { + "epoch": 0.1776, + "grad_norm": 0.4153843596764285, + "learning_rate": 0.0001888403142366049, + "loss": 0.7532, + "step": 111 + }, + { + "epoch": 0.1792, + "grad_norm": 0.4529042454732251, + "learning_rate": 0.00018860113594683148, + "loss": 0.8298, + "step": 112 + }, + { + "epoch": 0.1808, + "grad_norm": 0.496025521210255, + "learning_rate": 0.00018835957647383303, + "loss": 0.8064, + "step": 113 + }, + { + "epoch": 0.1824, + "grad_norm": 0.603587742296673, + "learning_rate": 0.00018811564230959588, + "loss": 0.7193, + "step": 114 + }, + { + "epoch": 0.184, + "grad_norm": 0.3972364915263813, + "learning_rate": 0.00018786934000992688, + "loss": 0.7279, + "step": 115 + }, + { + "epoch": 0.1856, + "grad_norm": 0.5480722321418081, + "learning_rate": 0.00018762067619427746, + "loss": 0.7431, + "step": 116 + }, + { + "epoch": 0.1872, + "grad_norm": 0.5253385197530391, + "learning_rate": 0.00018736965754556528, + "loss": 0.8351, + "step": 117 + }, + { + "epoch": 0.1888, + "grad_norm": 0.4463762105084938, + "learning_rate": 0.00018711629080999504, + "loss": 0.7938, + "step": 118 + }, + { + "epoch": 0.1904, + "grad_norm": 0.5148792249433735, + "learning_rate": 0.00018686058279687698, + "loss": 0.7272, + "step": 119 + }, + { + "epoch": 0.192, + "grad_norm": 0.4338337418999126, + "learning_rate": 0.00018660254037844388, + "loss": 0.7846, + "step": 120 + }, + { + "epoch": 0.1936, + "grad_norm": 0.4448550964821197, + "learning_rate": 0.00018634217048966637, + "loss": 0.7662, + "step": 121 + }, + { + "epoch": 0.1952, + "grad_norm": 0.4249486343036595, + "learning_rate": 0.0001860794801280666, + "loss": 0.774, + "step": 122 + }, + { + "epoch": 0.1968, + "grad_norm": 0.45294012807830697, + "learning_rate": 0.0001858144763535302, + "loss": 0.7175, + "step": 123 + }, + { + "epoch": 0.1984, + "grad_norm": 0.45204011098160635, + "learning_rate": 0.0001855471662881164, + "loss": 0.8116, + "step": 124 + }, + { + "epoch": 0.2, + "grad_norm": 0.5161306527497446, + "learning_rate": 0.00018527755711586678, + "loss": 0.8453, + "step": 125 + }, + { + "epoch": 0.2016, + "grad_norm": 0.47025758199126994, + "learning_rate": 0.00018500565608261214, + "loss": 0.8441, + "step": 126 + }, + { + "epoch": 0.2032, + "grad_norm": 0.4283962123541972, + "learning_rate": 0.00018473147049577774, + "loss": 0.7591, + "step": 127 + }, + { + "epoch": 0.2048, + "grad_norm": 0.4955025052770196, + "learning_rate": 0.00018445500772418697, + "loss": 0.8125, + "step": 128 + }, + { + "epoch": 0.2064, + "grad_norm": 0.4227675300350439, + "learning_rate": 0.00018417627519786315, + "loss": 0.7694, + "step": 129 + }, + { + "epoch": 0.208, + "grad_norm": 0.48702055259309274, + "learning_rate": 0.00018389528040783012, + "loss": 0.7718, + "step": 130 + }, + { + "epoch": 0.2096, + "grad_norm": 0.5549231795337493, + "learning_rate": 0.00018361203090591071, + "loss": 0.8284, + "step": 131 + }, + { + "epoch": 0.2112, + "grad_norm": 0.401594485185252, + "learning_rate": 0.00018332653430452376, + "loss": 0.7444, + "step": 132 + }, + { + "epoch": 0.2128, + "grad_norm": 0.48059717657336937, + "learning_rate": 0.00018303879827647975, + "loss": 0.773, + "step": 133 + }, + { + "epoch": 0.2144, + "grad_norm": 0.5327126518985816, + "learning_rate": 0.00018274883055477436, + "loss": 0.857, + "step": 134 + }, + { + "epoch": 0.216, + "grad_norm": 0.5284520114850126, + "learning_rate": 0.00018245663893238075, + "loss": 0.7846, + "step": 135 + }, + { + "epoch": 0.2176, + "grad_norm": 0.41462380594660525, + "learning_rate": 0.00018216223126204007, + "loss": 0.7582, + "step": 136 + }, + { + "epoch": 0.2192, + "grad_norm": 0.6397890378293987, + "learning_rate": 0.00018186561545605054, + "loss": 0.856, + "step": 137 + }, + { + "epoch": 0.2208, + "grad_norm": 0.44298566983966703, + "learning_rate": 0.00018156679948605467, + "loss": 0.8, + "step": 138 + }, + { + "epoch": 0.2224, + "grad_norm": 0.4410871408286474, + "learning_rate": 0.00018126579138282503, + "loss": 0.8091, + "step": 139 + }, + { + "epoch": 0.224, + "grad_norm": 0.4386374969077178, + "learning_rate": 0.0001809625992360485, + "loss": 0.892, + "step": 140 + }, + { + "epoch": 0.2256, + "grad_norm": 0.37786738170094414, + "learning_rate": 0.00018065723119410884, + "loss": 0.6855, + "step": 141 + }, + { + "epoch": 0.2272, + "grad_norm": 0.48666482876630224, + "learning_rate": 0.00018034969546386757, + "loss": 0.811, + "step": 142 + }, + { + "epoch": 0.2288, + "grad_norm": 0.4580786753482874, + "learning_rate": 0.0001800400003104436, + "loss": 0.7638, + "step": 143 + }, + { + "epoch": 0.2304, + "grad_norm": 0.4573964941518131, + "learning_rate": 0.00017972815405699103, + "loss": 0.8771, + "step": 144 + }, + { + "epoch": 0.232, + "grad_norm": 0.5037931767744466, + "learning_rate": 0.00017941416508447536, + "loss": 0.8465, + "step": 145 + }, + { + "epoch": 0.2336, + "grad_norm": 0.47089241182534414, + "learning_rate": 0.0001790980418314484, + "loss": 0.7532, + "step": 146 + }, + { + "epoch": 0.2352, + "grad_norm": 0.4461847906516847, + "learning_rate": 0.00017877979279382135, + "loss": 0.7476, + "step": 147 + }, + { + "epoch": 0.2368, + "grad_norm": 0.5353422933484618, + "learning_rate": 0.0001784594265246366, + "loss": 0.863, + "step": 148 + }, + { + "epoch": 0.2384, + "grad_norm": 0.447860371365661, + "learning_rate": 0.0001781369516338378, + "loss": 0.7757, + "step": 149 + }, + { + "epoch": 0.24, + "grad_norm": 0.40415595292378664, + "learning_rate": 0.00017781237678803847, + "loss": 0.8061, + "step": 150 + }, + { + "epoch": 0.2416, + "grad_norm": 0.4591679956087192, + "learning_rate": 0.000177485710710289, + "loss": 0.8716, + "step": 151 + }, + { + "epoch": 0.2432, + "grad_norm": 0.470229114598814, + "learning_rate": 0.00017715696217984235, + "loss": 0.7366, + "step": 152 + }, + { + "epoch": 0.2448, + "grad_norm": 0.3916378327664192, + "learning_rate": 0.00017682614003191807, + "loss": 0.8048, + "step": 153 + }, + { + "epoch": 0.2464, + "grad_norm": 0.4713115111294236, + "learning_rate": 0.00017649325315746478, + "loss": 0.7729, + "step": 154 + }, + { + "epoch": 0.248, + "grad_norm": 0.4041785964985653, + "learning_rate": 0.0001761583105029213, + "loss": 0.7474, + "step": 155 + }, + { + "epoch": 0.2496, + "grad_norm": 0.48309315840712175, + "learning_rate": 0.00017582132106997616, + "loss": 0.7717, + "step": 156 + }, + { + "epoch": 0.2512, + "grad_norm": 0.41339871995879307, + "learning_rate": 0.00017548229391532572, + "loss": 0.7527, + "step": 157 + }, + { + "epoch": 0.2528, + "grad_norm": 0.45712444390966833, + "learning_rate": 0.00017514123815043074, + "loss": 0.7328, + "step": 158 + }, + { + "epoch": 0.2544, + "grad_norm": 0.44446203830573594, + "learning_rate": 0.00017479816294127152, + "loss": 0.7934, + "step": 159 + }, + { + "epoch": 0.256, + "grad_norm": 0.3905715206203686, + "learning_rate": 0.0001744530775081015, + "loss": 0.7619, + "step": 160 + }, + { + "epoch": 0.2576, + "grad_norm": 0.5228376785239668, + "learning_rate": 0.0001741059911251997, + "loss": 0.8442, + "step": 161 + }, + { + "epoch": 0.2592, + "grad_norm": 0.43727578992327143, + "learning_rate": 0.000173756913120621, + "loss": 0.8153, + "step": 162 + }, + { + "epoch": 0.2608, + "grad_norm": 0.46439048899549623, + "learning_rate": 0.00017340585287594604, + "loss": 0.7802, + "step": 163 + }, + { + "epoch": 0.2624, + "grad_norm": 0.4268493145485142, + "learning_rate": 0.0001730528198260285, + "loss": 0.7251, + "step": 164 + }, + { + "epoch": 0.264, + "grad_norm": 0.4202729498940307, + "learning_rate": 0.00017269782345874203, + "loss": 0.7419, + "step": 165 + }, + { + "epoch": 0.2656, + "grad_norm": 0.40547040665996986, + "learning_rate": 0.00017234087331472497, + "loss": 0.8219, + "step": 166 + }, + { + "epoch": 0.2672, + "grad_norm": 0.40398281132481145, + "learning_rate": 0.00017198197898712404, + "loss": 0.7293, + "step": 167 + }, + { + "epoch": 0.2688, + "grad_norm": 0.4271714328860988, + "learning_rate": 0.00017162115012133643, + "loss": 0.8337, + "step": 168 + }, + { + "epoch": 0.2704, + "grad_norm": 0.47281130970675284, + "learning_rate": 0.00017125839641475072, + "loss": 0.6725, + "step": 169 + }, + { + "epoch": 0.272, + "grad_norm": 0.5586025504356148, + "learning_rate": 0.00017089372761648616, + "loss": 0.7702, + "step": 170 + }, + { + "epoch": 0.2736, + "grad_norm": 0.46412984129645973, + "learning_rate": 0.00017052715352713075, + "loss": 0.7665, + "step": 171 + }, + { + "epoch": 0.2752, + "grad_norm": 0.5354794076427467, + "learning_rate": 0.00017015868399847768, + "loss": 0.8185, + "step": 172 + }, + { + "epoch": 0.2768, + "grad_norm": 0.47004583783231435, + "learning_rate": 0.00016978832893326074, + "loss": 0.756, + "step": 173 + }, + { + "epoch": 0.2784, + "grad_norm": 0.45787096718164827, + "learning_rate": 0.00016941609828488807, + "loss": 0.7976, + "step": 174 + }, + { + "epoch": 0.28, + "grad_norm": 0.4170042103828585, + "learning_rate": 0.0001690420020571747, + "loss": 0.8544, + "step": 175 + }, + { + "epoch": 0.2816, + "grad_norm": 0.4236827748110516, + "learning_rate": 0.0001686660503040737, + "loss": 0.7225, + "step": 176 + }, + { + "epoch": 0.2832, + "grad_norm": 0.48914571749208785, + "learning_rate": 0.00016828825312940592, + "loss": 0.8141, + "step": 177 + }, + { + "epoch": 0.2848, + "grad_norm": 0.450665610890202, + "learning_rate": 0.0001679086206865886, + "loss": 0.8068, + "step": 178 + }, + { + "epoch": 0.2864, + "grad_norm": 0.42246892048462525, + "learning_rate": 0.00016752716317836229, + "loss": 0.7663, + "step": 179 + }, + { + "epoch": 0.288, + "grad_norm": 0.4148639789300373, + "learning_rate": 0.0001671438908565167, + "loss": 0.7749, + "step": 180 + }, + { + "epoch": 0.2896, + "grad_norm": 0.3979616117659437, + "learning_rate": 0.00016675881402161536, + "loss": 0.7507, + "step": 181 + }, + { + "epoch": 0.2912, + "grad_norm": 0.40135218170704223, + "learning_rate": 0.0001663719430227186, + "loss": 0.7388, + "step": 182 + }, + { + "epoch": 0.2928, + "grad_norm": 0.40606436029294757, + "learning_rate": 0.00016598328825710533, + "loss": 0.8059, + "step": 183 + }, + { + "epoch": 0.2944, + "grad_norm": 0.43380889159568203, + "learning_rate": 0.000165592860169994, + "loss": 0.8396, + "step": 184 + }, + { + "epoch": 0.296, + "grad_norm": 0.4456063405476975, + "learning_rate": 0.00016520066925426144, + "loss": 0.752, + "step": 185 + }, + { + "epoch": 0.2976, + "grad_norm": 0.5226308379446547, + "learning_rate": 0.0001648067260501611, + "loss": 0.7995, + "step": 186 + }, + { + "epoch": 0.2992, + "grad_norm": 0.4125504316119645, + "learning_rate": 0.0001644110411450398, + "loss": 0.7514, + "step": 187 + }, + { + "epoch": 0.3008, + "grad_norm": 0.48192708168537784, + "learning_rate": 0.00016401362517305296, + "loss": 0.7362, + "step": 188 + }, + { + "epoch": 0.3024, + "grad_norm": 0.46832378682779585, + "learning_rate": 0.00016361448881487914, + "loss": 0.7209, + "step": 189 + }, + { + "epoch": 0.304, + "grad_norm": 0.4853704734176963, + "learning_rate": 0.00016321364279743266, + "loss": 0.7761, + "step": 190 + }, + { + "epoch": 0.3056, + "grad_norm": 0.4835689387307517, + "learning_rate": 0.0001628110978935756, + "loss": 0.7957, + "step": 191 + }, + { + "epoch": 0.3072, + "grad_norm": 0.42821584240297866, + "learning_rate": 0.00016240686492182804, + "loss": 0.769, + "step": 192 + }, + { + "epoch": 0.3088, + "grad_norm": 0.5275649955045318, + "learning_rate": 0.00016200095474607753, + "loss": 0.7488, + "step": 193 + }, + { + "epoch": 0.3104, + "grad_norm": 0.37896939611242514, + "learning_rate": 0.00016159337827528685, + "loss": 0.7908, + "step": 194 + }, + { + "epoch": 0.312, + "grad_norm": 0.5014224816022986, + "learning_rate": 0.0001611841464632011, + "loss": 0.8664, + "step": 195 + }, + { + "epoch": 0.3136, + "grad_norm": 0.4302596939232914, + "learning_rate": 0.0001607732703080532, + "loss": 0.7176, + "step": 196 + }, + { + "epoch": 0.3152, + "grad_norm": 0.37601956090573124, + "learning_rate": 0.00016036076085226814, + "loss": 0.749, + "step": 197 + }, + { + "epoch": 0.3168, + "grad_norm": 0.4470864095317222, + "learning_rate": 0.0001599466291821666, + "loss": 0.7675, + "step": 198 + }, + { + "epoch": 0.3184, + "grad_norm": 0.38373759587270195, + "learning_rate": 0.0001595308864276666, + "loss": 0.7436, + "step": 199 + }, + { + "epoch": 0.32, + "grad_norm": 0.40096002153413757, + "learning_rate": 0.0001591135437619847, + "loss": 0.7605, + "step": 200 + }, + { + "epoch": 0.3216, + "grad_norm": 0.41497898478455786, + "learning_rate": 0.0001586946124013354, + "loss": 0.7357, + "step": 201 + }, + { + "epoch": 0.3232, + "grad_norm": 0.4627369083577583, + "learning_rate": 0.0001582741036046301, + "loss": 0.7713, + "step": 202 + }, + { + "epoch": 0.3248, + "grad_norm": 0.43861119494110645, + "learning_rate": 0.00015785202867317407, + "loss": 0.7748, + "step": 203 + }, + { + "epoch": 0.3264, + "grad_norm": 0.4949192731053403, + "learning_rate": 0.00015742839895036305, + "loss": 0.8108, + "step": 204 + }, + { + "epoch": 0.328, + "grad_norm": 0.3892530496626969, + "learning_rate": 0.00015700322582137827, + "loss": 0.768, + "step": 205 + }, + { + "epoch": 0.3296, + "grad_norm": 0.3678042621978389, + "learning_rate": 0.0001565765207128805, + "loss": 0.7824, + "step": 206 + }, + { + "epoch": 0.3312, + "grad_norm": 0.4436263250027655, + "learning_rate": 0.0001561482950927029, + "loss": 0.7988, + "step": 207 + }, + { + "epoch": 0.3328, + "grad_norm": 0.46979121158729337, + "learning_rate": 0.00015571856046954285, + "loss": 0.7931, + "step": 208 + }, + { + "epoch": 0.3344, + "grad_norm": 0.42801092121142625, + "learning_rate": 0.00015528732839265272, + "loss": 0.79, + "step": 209 + }, + { + "epoch": 0.336, + "grad_norm": 0.39917120835121755, + "learning_rate": 0.0001548546104515294, + "loss": 0.7071, + "step": 210 + }, + { + "epoch": 0.3376, + "grad_norm": 0.5166072531894059, + "learning_rate": 0.00015442041827560274, + "loss": 0.829, + "step": 211 + }, + { + "epoch": 0.3392, + "grad_norm": 0.5034819696946465, + "learning_rate": 0.00015398476353392323, + "loss": 0.7299, + "step": 212 + }, + { + "epoch": 0.3408, + "grad_norm": 0.394779804153212, + "learning_rate": 0.00015354765793484834, + "loss": 0.73, + "step": 213 + }, + { + "epoch": 0.3424, + "grad_norm": 0.40026759449463856, + "learning_rate": 0.00015310911322572753, + "loss": 0.7356, + "step": 214 + }, + { + "epoch": 0.344, + "grad_norm": 0.38774116439929845, + "learning_rate": 0.000152669141192587, + "loss": 0.695, + "step": 215 + }, + { + "epoch": 0.3456, + "grad_norm": 0.4270754809977913, + "learning_rate": 0.00015222775365981273, + "loss": 0.7348, + "step": 216 + }, + { + "epoch": 0.3472, + "grad_norm": 0.4531749309334632, + "learning_rate": 0.00015178496248983254, + "loss": 0.7713, + "step": 217 + }, + { + "epoch": 0.3488, + "grad_norm": 0.4220126930063332, + "learning_rate": 0.00015134077958279765, + "loss": 0.8162, + "step": 218 + }, + { + "epoch": 0.3504, + "grad_norm": 0.41189207346382134, + "learning_rate": 0.00015089521687626243, + "loss": 0.772, + "step": 219 + }, + { + "epoch": 0.352, + "grad_norm": 0.5698244118323691, + "learning_rate": 0.000150448286344864, + "loss": 0.7883, + "step": 220 + }, + { + "epoch": 0.3536, + "grad_norm": 0.4197411432424747, + "learning_rate": 0.00015000000000000001, + "loss": 0.7005, + "step": 221 + }, + { + "epoch": 0.3552, + "grad_norm": 0.3813200333236524, + "learning_rate": 0.00014955036988950618, + "loss": 0.736, + "step": 222 + }, + { + "epoch": 0.3568, + "grad_norm": 0.42176921762452485, + "learning_rate": 0.00014909940809733222, + "loss": 0.7886, + "step": 223 + }, + { + "epoch": 0.3584, + "grad_norm": 0.4148893867152742, + "learning_rate": 0.00014864712674321734, + "loss": 0.7268, + "step": 224 + }, + { + "epoch": 0.36, + "grad_norm": 0.4420158919478724, + "learning_rate": 0.00014819353798236427, + "loss": 0.7338, + "step": 225 + }, + { + "epoch": 0.3616, + "grad_norm": 0.42334138064913807, + "learning_rate": 0.00014773865400511272, + "loss": 0.7971, + "step": 226 + }, + { + "epoch": 0.3632, + "grad_norm": 0.446143852873466, + "learning_rate": 0.00014728248703661182, + "loss": 0.7752, + "step": 227 + }, + { + "epoch": 0.3648, + "grad_norm": 0.8084509813454605, + "learning_rate": 0.00014682504933649144, + "loss": 0.7829, + "step": 228 + }, + { + "epoch": 0.3664, + "grad_norm": 0.39565668696801404, + "learning_rate": 0.00014636635319853275, + "loss": 0.733, + "step": 229 + }, + { + "epoch": 0.368, + "grad_norm": 0.4336069889808771, + "learning_rate": 0.00014590641095033787, + "loss": 0.7274, + "step": 230 + }, + { + "epoch": 0.3696, + "grad_norm": 0.41970839219759815, + "learning_rate": 0.00014544523495299842, + "loss": 0.7383, + "step": 231 + }, + { + "epoch": 0.3712, + "grad_norm": 0.5212755834570003, + "learning_rate": 0.0001449828376007636, + "loss": 0.7914, + "step": 232 + }, + { + "epoch": 0.3728, + "grad_norm": 0.4267944087662684, + "learning_rate": 0.0001445192313207067, + "loss": 0.7299, + "step": 233 + }, + { + "epoch": 0.3744, + "grad_norm": 0.37292182944345237, + "learning_rate": 0.0001440544285723915, + "loss": 0.7079, + "step": 234 + }, + { + "epoch": 0.376, + "grad_norm": 0.4219115953204863, + "learning_rate": 0.00014358844184753712, + "loss": 0.7655, + "step": 235 + }, + { + "epoch": 0.3776, + "grad_norm": 0.5386473654464474, + "learning_rate": 0.00014312128366968243, + "loss": 0.8038, + "step": 236 + }, + { + "epoch": 0.3792, + "grad_norm": 0.4967703925251337, + "learning_rate": 0.00014265296659384956, + "loss": 0.6674, + "step": 237 + }, + { + "epoch": 0.3808, + "grad_norm": 0.47150459624335544, + "learning_rate": 0.00014218350320620624, + "loss": 0.717, + "step": 238 + }, + { + "epoch": 0.3824, + "grad_norm": 0.5304487190547554, + "learning_rate": 0.0001417129061237278, + "loss": 0.8314, + "step": 239 + }, + { + "epoch": 0.384, + "grad_norm": 0.4605647276190697, + "learning_rate": 0.00014124118799385796, + "loss": 0.7901, + "step": 240 + }, + { + "epoch": 0.3856, + "grad_norm": 0.41255225424760517, + "learning_rate": 0.00014076836149416887, + "loss": 0.7681, + "step": 241 + }, + { + "epoch": 0.3872, + "grad_norm": 0.4649401369258046, + "learning_rate": 0.0001402944393320206, + "loss": 0.7607, + "step": 242 + }, + { + "epoch": 0.3888, + "grad_norm": 0.4110359298438914, + "learning_rate": 0.00013981943424421932, + "loss": 0.7164, + "step": 243 + }, + { + "epoch": 0.3904, + "grad_norm": 0.3691954624781045, + "learning_rate": 0.00013934335899667527, + "loss": 0.7297, + "step": 244 + }, + { + "epoch": 0.392, + "grad_norm": 0.4302804306045596, + "learning_rate": 0.00013886622638405952, + "loss": 0.7509, + "step": 245 + }, + { + "epoch": 0.3936, + "grad_norm": 0.3849755894161268, + "learning_rate": 0.00013838804922946027, + "loss": 0.7579, + "step": 246 + }, + { + "epoch": 0.3952, + "grad_norm": 0.4639946277185081, + "learning_rate": 0.00013790884038403795, + "loss": 0.7576, + "step": 247 + }, + { + "epoch": 0.3968, + "grad_norm": 0.38537982413559235, + "learning_rate": 0.00013742861272668012, + "loss": 0.7242, + "step": 248 + }, + { + "epoch": 0.3984, + "grad_norm": 0.4697285691125311, + "learning_rate": 0.00013694737916365517, + "loss": 0.7474, + "step": 249 + }, + { + "epoch": 0.4, + "grad_norm": 0.4255353982284123, + "learning_rate": 0.00013646515262826552, + "loss": 0.6864, + "step": 250 + }, + { + "epoch": 0.4016, + "grad_norm": 0.5310855141742439, + "learning_rate": 0.0001359819460805001, + "loss": 0.7427, + "step": 251 + }, + { + "epoch": 0.4032, + "grad_norm": 0.48918813587024496, + "learning_rate": 0.0001354977725066859, + "loss": 0.7555, + "step": 252 + }, + { + "epoch": 0.4048, + "grad_norm": 0.5202997980251455, + "learning_rate": 0.00013501264491913906, + "loss": 0.7463, + "step": 253 + }, + { + "epoch": 0.4064, + "grad_norm": 0.3944315462290684, + "learning_rate": 0.0001345265763558152, + "loss": 0.7215, + "step": 254 + }, + { + "epoch": 0.408, + "grad_norm": 0.39552338258643244, + "learning_rate": 0.00013403957987995882, + "loss": 0.7065, + "step": 255 + }, + { + "epoch": 0.4096, + "grad_norm": 0.4582281893475035, + "learning_rate": 0.0001335516685797525, + "loss": 0.7638, + "step": 256 + }, + { + "epoch": 0.4112, + "grad_norm": 0.4073482695204598, + "learning_rate": 0.00013306285556796495, + "loss": 0.7283, + "step": 257 + }, + { + "epoch": 0.4128, + "grad_norm": 0.4064157378706338, + "learning_rate": 0.00013257315398159864, + "loss": 0.7208, + "step": 258 + }, + { + "epoch": 0.4144, + "grad_norm": 0.4210844965228265, + "learning_rate": 0.00013208257698153677, + "loss": 0.7104, + "step": 259 + }, + { + "epoch": 0.416, + "grad_norm": 0.5699449595260611, + "learning_rate": 0.00013159113775218964, + "loss": 0.8177, + "step": 260 + }, + { + "epoch": 0.4176, + "grad_norm": 0.4795970472350375, + "learning_rate": 0.00013109884950114007, + "loss": 0.7426, + "step": 261 + }, + { + "epoch": 0.4192, + "grad_norm": 0.44336877516816464, + "learning_rate": 0.00013060572545878875, + "loss": 0.7849, + "step": 262 + }, + { + "epoch": 0.4208, + "grad_norm": 0.4105266805460912, + "learning_rate": 0.00013011177887799845, + "loss": 0.7601, + "step": 263 + }, + { + "epoch": 0.4224, + "grad_norm": 0.45130024637132543, + "learning_rate": 0.00012961702303373795, + "loss": 0.7319, + "step": 264 + }, + { + "epoch": 0.424, + "grad_norm": 0.4050160926419668, + "learning_rate": 0.00012912147122272523, + "loss": 0.8104, + "step": 265 + }, + { + "epoch": 0.4256, + "grad_norm": 0.5343629673832331, + "learning_rate": 0.00012862513676307008, + "loss": 0.8162, + "step": 266 + }, + { + "epoch": 0.4272, + "grad_norm": 0.3656076310630895, + "learning_rate": 0.00012812803299391628, + "loss": 0.6918, + "step": 267 + }, + { + "epoch": 0.4288, + "grad_norm": 0.4494914220087584, + "learning_rate": 0.00012763017327508305, + "loss": 0.865, + "step": 268 + }, + { + "epoch": 0.4304, + "grad_norm": 0.3700847529319172, + "learning_rate": 0.0001271315709867059, + "loss": 0.6773, + "step": 269 + }, + { + "epoch": 0.432, + "grad_norm": 0.43925354670515865, + "learning_rate": 0.00012663223952887723, + "loss": 0.7394, + "step": 270 + }, + { + "epoch": 0.4336, + "grad_norm": 0.3716027483258037, + "learning_rate": 0.00012613219232128608, + "loss": 0.7134, + "step": 271 + }, + { + "epoch": 0.4352, + "grad_norm": 0.387749521659969, + "learning_rate": 0.00012563144280285741, + "loss": 0.6993, + "step": 272 + }, + { + "epoch": 0.4368, + "grad_norm": 0.4396050205960909, + "learning_rate": 0.00012513000443139112, + "loss": 0.6753, + "step": 273 + }, + { + "epoch": 0.4384, + "grad_norm": 0.3859151616960672, + "learning_rate": 0.00012462789068320017, + "loss": 0.6958, + "step": 274 + }, + { + "epoch": 0.44, + "grad_norm": 0.38685983405108443, + "learning_rate": 0.00012412511505274844, + "loss": 0.7733, + "step": 275 + }, + { + "epoch": 0.4416, + "grad_norm": 0.477317920235237, + "learning_rate": 0.00012362169105228826, + "loss": 0.7834, + "step": 276 + }, + { + "epoch": 0.4432, + "grad_norm": 0.4718679694581572, + "learning_rate": 0.000123117632211497, + "loss": 0.7171, + "step": 277 + }, + { + "epoch": 0.4448, + "grad_norm": 0.4443648359335215, + "learning_rate": 0.00012261295207711346, + "loss": 0.7157, + "step": 278 + }, + { + "epoch": 0.4464, + "grad_norm": 0.46705387836789647, + "learning_rate": 0.0001221076642125742, + "loss": 0.7559, + "step": 279 + }, + { + "epoch": 0.448, + "grad_norm": 0.4324381846505133, + "learning_rate": 0.00012160178219764837, + "loss": 0.7417, + "step": 280 + }, + { + "epoch": 0.4496, + "grad_norm": 0.3907974055741124, + "learning_rate": 0.00012109531962807332, + "loss": 0.6932, + "step": 281 + }, + { + "epoch": 0.4512, + "grad_norm": 0.41516503126378956, + "learning_rate": 0.00012058829011518896, + "loss": 0.7408, + "step": 282 + }, + { + "epoch": 0.4528, + "grad_norm": 0.456137617001889, + "learning_rate": 0.00012008070728557186, + "loss": 0.7605, + "step": 283 + }, + { + "epoch": 0.4544, + "grad_norm": 0.37970633126428244, + "learning_rate": 0.00011957258478066931, + "loss": 0.688, + "step": 284 + }, + { + "epoch": 0.456, + "grad_norm": 0.47216579659845764, + "learning_rate": 0.00011906393625643244, + "loss": 0.8465, + "step": 285 + }, + { + "epoch": 0.4576, + "grad_norm": 0.46410904973970607, + "learning_rate": 0.00011855477538294935, + "loss": 0.7495, + "step": 286 + }, + { + "epoch": 0.4592, + "grad_norm": 0.36707748031150655, + "learning_rate": 0.00011804511584407763, + "loss": 0.6868, + "step": 287 + }, + { + "epoch": 0.4608, + "grad_norm": 0.40764819091821336, + "learning_rate": 0.00011753497133707679, + "loss": 0.7504, + "step": 288 + }, + { + "epoch": 0.4624, + "grad_norm": 0.4904339723714431, + "learning_rate": 0.00011702435557223987, + "loss": 0.8228, + "step": 289 + }, + { + "epoch": 0.464, + "grad_norm": 0.5845436331459538, + "learning_rate": 0.00011651328227252517, + "loss": 0.6908, + "step": 290 + }, + { + "epoch": 0.4656, + "grad_norm": 0.41557578398633754, + "learning_rate": 0.00011600176517318741, + "loss": 0.7081, + "step": 291 + }, + { + "epoch": 0.4672, + "grad_norm": 0.46555867308061016, + "learning_rate": 0.00011548981802140848, + "loss": 0.754, + "step": 292 + }, + { + "epoch": 0.4688, + "grad_norm": 0.4218961421632011, + "learning_rate": 0.00011497745457592816, + "loss": 0.7925, + "step": 293 + }, + { + "epoch": 0.4704, + "grad_norm": 0.4484719712824621, + "learning_rate": 0.00011446468860667421, + "loss": 0.764, + "step": 294 + }, + { + "epoch": 0.472, + "grad_norm": 0.4229767112102817, + "learning_rate": 0.00011395153389439233, + "loss": 0.716, + "step": 295 + }, + { + "epoch": 0.4736, + "grad_norm": 0.39536664986377285, + "learning_rate": 0.00011343800423027582, + "loss": 0.7021, + "step": 296 + }, + { + "epoch": 0.4752, + "grad_norm": 0.4120722600837275, + "learning_rate": 0.0001129241134155949, + "loss": 0.7373, + "step": 297 + }, + { + "epoch": 0.4768, + "grad_norm": 0.4195788289720658, + "learning_rate": 0.00011240987526132594, + "loss": 0.7554, + "step": 298 + }, + { + "epoch": 0.4784, + "grad_norm": 0.4223949634083843, + "learning_rate": 0.00011189530358778005, + "loss": 0.7553, + "step": 299 + }, + { + "epoch": 0.48, + "grad_norm": 0.3558898854471773, + "learning_rate": 0.00011138041222423177, + "loss": 0.6695, + "step": 300 + }, + { + "epoch": 0.4816, + "grad_norm": 0.4118916695983032, + "learning_rate": 0.00011086521500854745, + "loss": 0.7136, + "step": 301 + }, + { + "epoch": 0.4832, + "grad_norm": 0.4129608908896074, + "learning_rate": 0.00011034972578681338, + "loss": 0.7135, + "step": 302 + }, + { + "epoch": 0.4848, + "grad_norm": 0.4002321698132183, + "learning_rate": 0.00010983395841296348, + "loss": 0.7304, + "step": 303 + }, + { + "epoch": 0.4864, + "grad_norm": 0.43198375109351433, + "learning_rate": 0.00010931792674840718, + "loss": 0.814, + "step": 304 + }, + { + "epoch": 0.488, + "grad_norm": 0.3784203007384405, + "learning_rate": 0.00010880164466165674, + "loss": 0.6853, + "step": 305 + }, + { + "epoch": 0.4896, + "grad_norm": 0.3960736830144517, + "learning_rate": 0.00010828512602795462, + "loss": 0.6968, + "step": 306 + }, + { + "epoch": 0.4912, + "grad_norm": 0.40835552170155603, + "learning_rate": 0.00010776838472890065, + "loss": 0.7495, + "step": 307 + }, + { + "epoch": 0.4928, + "grad_norm": 0.439404146351219, + "learning_rate": 0.00010725143465207867, + "loss": 0.8095, + "step": 308 + }, + { + "epoch": 0.4944, + "grad_norm": 0.420994128897334, + "learning_rate": 0.00010673428969068364, + "loss": 0.6639, + "step": 309 + }, + { + "epoch": 0.496, + "grad_norm": 0.39225139081963223, + "learning_rate": 0.00010621696374314807, + "loss": 0.7461, + "step": 310 + }, + { + "epoch": 0.4976, + "grad_norm": 0.4485862323178119, + "learning_rate": 0.00010569947071276847, + "loss": 0.7733, + "step": 311 + }, + { + "epoch": 0.4992, + "grad_norm": 0.4077803252456207, + "learning_rate": 0.00010518182450733186, + "loss": 0.6944, + "step": 312 + }, + { + "epoch": 0.5008, + "grad_norm": 0.4432056065485272, + "learning_rate": 0.00010466403903874176, + "loss": 0.7279, + "step": 313 + }, + { + "epoch": 0.5024, + "grad_norm": 0.5329557517777089, + "learning_rate": 0.00010414612822264455, + "loss": 0.7954, + "step": 314 + }, + { + "epoch": 0.504, + "grad_norm": 0.4091730598792052, + "learning_rate": 0.00010362810597805526, + "loss": 0.7623, + "step": 315 + }, + { + "epoch": 0.5056, + "grad_norm": 0.4810112612191591, + "learning_rate": 0.0001031099862269837, + "loss": 0.7825, + "step": 316 + }, + { + "epoch": 0.5072, + "grad_norm": 0.6365890894725469, + "learning_rate": 0.00010259178289406011, + "loss": 0.7943, + "step": 317 + }, + { + "epoch": 0.5088, + "grad_norm": 0.4111901388670336, + "learning_rate": 0.00010207350990616107, + "loss": 0.7424, + "step": 318 + }, + { + "epoch": 0.5104, + "grad_norm": 0.38625652502904867, + "learning_rate": 0.0001015551811920351, + "loss": 0.727, + "step": 319 + }, + { + "epoch": 0.512, + "grad_norm": 0.39117761965345, + "learning_rate": 0.00010103681068192845, + "loss": 0.7207, + "step": 320 + }, + { + "epoch": 0.5136, + "grad_norm": 0.44567761261716654, + "learning_rate": 0.00010051841230721065, + "loss": 0.706, + "step": 321 + }, + { + "epoch": 0.5152, + "grad_norm": 0.3842032489963624, + "learning_rate": 0.0001, + "loss": 0.7937, + "step": 322 + }, + { + "epoch": 0.5168, + "grad_norm": 0.4005564577154571, + "learning_rate": 9.948158769278939e-05, + "loss": 0.7397, + "step": 323 + }, + { + "epoch": 0.5184, + "grad_norm": 0.3686604462401117, + "learning_rate": 9.896318931807155e-05, + "loss": 0.6783, + "step": 324 + }, + { + "epoch": 0.52, + "grad_norm": 0.4209904332384653, + "learning_rate": 9.844481880796491e-05, + "loss": 0.6673, + "step": 325 + }, + { + "epoch": 0.5216, + "grad_norm": 0.6986202466742053, + "learning_rate": 9.792649009383899e-05, + "loss": 0.8607, + "step": 326 + }, + { + "epoch": 0.5232, + "grad_norm": 0.41210849915293474, + "learning_rate": 9.740821710593989e-05, + "loss": 0.7255, + "step": 327 + }, + { + "epoch": 0.5248, + "grad_norm": 0.3924116009259164, + "learning_rate": 9.689001377301633e-05, + "loss": 0.7682, + "step": 328 + }, + { + "epoch": 0.5264, + "grad_norm": 0.41531418086818506, + "learning_rate": 9.637189402194476e-05, + "loss": 0.7064, + "step": 329 + }, + { + "epoch": 0.528, + "grad_norm": 0.5419711040812547, + "learning_rate": 9.585387177735547e-05, + "loss": 0.761, + "step": 330 + }, + { + "epoch": 0.5296, + "grad_norm": 0.43484669445975627, + "learning_rate": 9.533596096125825e-05, + "loss": 0.7297, + "step": 331 + }, + { + "epoch": 0.5312, + "grad_norm": 0.37971662203710665, + "learning_rate": 9.481817549266817e-05, + "loss": 0.7158, + "step": 332 + }, + { + "epoch": 0.5328, + "grad_norm": 0.40222616165885955, + "learning_rate": 9.430052928723153e-05, + "loss": 0.7072, + "step": 333 + }, + { + "epoch": 0.5344, + "grad_norm": 0.3908896147908082, + "learning_rate": 9.378303625685195e-05, + "loss": 0.7098, + "step": 334 + }, + { + "epoch": 0.536, + "grad_norm": 0.4114180268402791, + "learning_rate": 9.326571030931637e-05, + "loss": 0.7265, + "step": 335 + }, + { + "epoch": 0.5376, + "grad_norm": 0.4236879210610389, + "learning_rate": 9.274856534792138e-05, + "loss": 0.6556, + "step": 336 + }, + { + "epoch": 0.5392, + "grad_norm": 0.4133354131488104, + "learning_rate": 9.223161527109937e-05, + "loss": 0.6925, + "step": 337 + }, + { + "epoch": 0.5408, + "grad_norm": 0.44023366180001644, + "learning_rate": 9.171487397204539e-05, + "loss": 0.707, + "step": 338 + }, + { + "epoch": 0.5424, + "grad_norm": 0.39383222477658913, + "learning_rate": 9.119835533834331e-05, + "loss": 0.7393, + "step": 339 + }, + { + "epoch": 0.544, + "grad_norm": 0.425900591805592, + "learning_rate": 9.068207325159284e-05, + "loss": 0.6727, + "step": 340 + }, + { + "epoch": 0.5456, + "grad_norm": 0.4410310504516789, + "learning_rate": 9.016604158703654e-05, + "loss": 0.786, + "step": 341 + }, + { + "epoch": 0.5472, + "grad_norm": 0.4089530094326561, + "learning_rate": 8.965027421318665e-05, + "loss": 0.7154, + "step": 342 + }, + { + "epoch": 0.5488, + "grad_norm": 0.39906684367445927, + "learning_rate": 8.913478499145254e-05, + "loss": 0.6902, + "step": 343 + }, + { + "epoch": 0.5504, + "grad_norm": 0.3712457341250182, + "learning_rate": 8.861958777576827e-05, + "loss": 0.6959, + "step": 344 + }, + { + "epoch": 0.552, + "grad_norm": 0.42932068292116693, + "learning_rate": 8.810469641222001e-05, + "loss": 0.7888, + "step": 345 + }, + { + "epoch": 0.5536, + "grad_norm": 0.44063700408114603, + "learning_rate": 8.759012473867407e-05, + "loss": 0.757, + "step": 346 + }, + { + "epoch": 0.5552, + "grad_norm": 0.3694752662614534, + "learning_rate": 8.707588658440511e-05, + "loss": 0.6822, + "step": 347 + }, + { + "epoch": 0.5568, + "grad_norm": 0.4244994037695542, + "learning_rate": 8.656199576972423e-05, + "loss": 0.7136, + "step": 348 + }, + { + "epoch": 0.5584, + "grad_norm": 0.44566512429217003, + "learning_rate": 8.604846610560771e-05, + "loss": 0.7368, + "step": 349 + }, + { + "epoch": 0.56, + "grad_norm": 0.4101734276384276, + "learning_rate": 8.553531139332582e-05, + "loss": 0.7224, + "step": 350 + }, + { + "epoch": 0.5616, + "grad_norm": 0.3635602210799667, + "learning_rate": 8.502254542407186e-05, + "loss": 0.6298, + "step": 351 + }, + { + "epoch": 0.5632, + "grad_norm": 0.42048420552631677, + "learning_rate": 8.451018197859153e-05, + "loss": 0.7032, + "step": 352 + }, + { + "epoch": 0.5648, + "grad_norm": 0.3908678730309028, + "learning_rate": 8.399823482681262e-05, + "loss": 0.6514, + "step": 353 + }, + { + "epoch": 0.5664, + "grad_norm": 0.3779444987018777, + "learning_rate": 8.348671772747487e-05, + "loss": 0.667, + "step": 354 + }, + { + "epoch": 0.568, + "grad_norm": 0.3637402307722022, + "learning_rate": 8.297564442776014e-05, + "loss": 0.708, + "step": 355 + }, + { + "epoch": 0.5696, + "grad_norm": 0.3959290525281248, + "learning_rate": 8.246502866292324e-05, + "loss": 0.6877, + "step": 356 + }, + { + "epoch": 0.5712, + "grad_norm": 0.37007921797782506, + "learning_rate": 8.195488415592238e-05, + "loss": 0.7668, + "step": 357 + }, + { + "epoch": 0.5728, + "grad_norm": 0.3743741676728308, + "learning_rate": 8.144522461705067e-05, + "loss": 0.749, + "step": 358 + }, + { + "epoch": 0.5744, + "grad_norm": 0.3965351397620762, + "learning_rate": 8.093606374356759e-05, + "loss": 0.7888, + "step": 359 + }, + { + "epoch": 0.576, + "grad_norm": 0.4101892634153247, + "learning_rate": 8.042741521933071e-05, + "loss": 0.7952, + "step": 360 + }, + { + "epoch": 0.5776, + "grad_norm": 0.4251085569177034, + "learning_rate": 7.991929271442817e-05, + "loss": 0.7083, + "step": 361 + }, + { + "epoch": 0.5792, + "grad_norm": 0.5056503331618741, + "learning_rate": 7.941170988481108e-05, + "loss": 0.8109, + "step": 362 + }, + { + "epoch": 0.5808, + "grad_norm": 0.3982125186699654, + "learning_rate": 7.89046803719267e-05, + "loss": 0.7216, + "step": 363 + }, + { + "epoch": 0.5824, + "grad_norm": 0.39706312840546, + "learning_rate": 7.839821780235168e-05, + "loss": 0.6526, + "step": 364 + }, + { + "epoch": 0.584, + "grad_norm": 0.3799045657114476, + "learning_rate": 7.789233578742582e-05, + "loss": 0.667, + "step": 365 + }, + { + "epoch": 0.5856, + "grad_norm": 0.4006507083721613, + "learning_rate": 7.738704792288655e-05, + "loss": 0.6576, + "step": 366 + }, + { + "epoch": 0.5872, + "grad_norm": 0.44500518273668843, + "learning_rate": 7.688236778850306e-05, + "loss": 0.7827, + "step": 367 + }, + { + "epoch": 0.5888, + "grad_norm": 0.38343395517520396, + "learning_rate": 7.637830894771175e-05, + "loss": 0.7076, + "step": 368 + }, + { + "epoch": 0.5904, + "grad_norm": 0.4071088842442927, + "learning_rate": 7.587488494725157e-05, + "loss": 0.7681, + "step": 369 + }, + { + "epoch": 0.592, + "grad_norm": 0.3527984613730762, + "learning_rate": 7.537210931679987e-05, + "loss": 0.6879, + "step": 370 + }, + { + "epoch": 0.5936, + "grad_norm": 0.4853538119951193, + "learning_rate": 7.48699955686089e-05, + "loss": 0.7359, + "step": 371 + }, + { + "epoch": 0.5952, + "grad_norm": 0.4309492168388696, + "learning_rate": 7.43685571971426e-05, + "loss": 0.8066, + "step": 372 + }, + { + "epoch": 0.5968, + "grad_norm": 0.4391508861892157, + "learning_rate": 7.386780767871397e-05, + "loss": 0.7025, + "step": 373 + }, + { + "epoch": 0.5984, + "grad_norm": 0.42193316628985833, + "learning_rate": 7.336776047112276e-05, + "loss": 0.7399, + "step": 374 + }, + { + "epoch": 0.6, + "grad_norm": 0.4234358749707796, + "learning_rate": 7.286842901329412e-05, + "loss": 0.7547, + "step": 375 + }, + { + "epoch": 0.6016, + "grad_norm": 0.38876566927042244, + "learning_rate": 7.236982672491698e-05, + "loss": 0.7054, + "step": 376 + }, + { + "epoch": 0.6032, + "grad_norm": 0.4021044454948182, + "learning_rate": 7.187196700608373e-05, + "loss": 0.7651, + "step": 377 + }, + { + "epoch": 0.6048, + "grad_norm": 0.48667600007586975, + "learning_rate": 7.137486323692995e-05, + "loss": 0.749, + "step": 378 + }, + { + "epoch": 0.6064, + "grad_norm": 0.38254117015863065, + "learning_rate": 7.087852877727481e-05, + "loss": 0.6961, + "step": 379 + }, + { + "epoch": 0.608, + "grad_norm": 0.5928784195218275, + "learning_rate": 7.038297696626206e-05, + "loss": 0.6982, + "step": 380 + }, + { + "epoch": 0.6096, + "grad_norm": 0.4688124096954091, + "learning_rate": 6.988822112200156e-05, + "loss": 0.6658, + "step": 381 + }, + { + "epoch": 0.6112, + "grad_norm": 0.45186423242788126, + "learning_rate": 6.939427454121128e-05, + "loss": 0.7428, + "step": 382 + }, + { + "epoch": 0.6128, + "grad_norm": 0.4462372246076824, + "learning_rate": 6.890115049885994e-05, + "loss": 0.7112, + "step": 383 + }, + { + "epoch": 0.6144, + "grad_norm": 0.4111894574633264, + "learning_rate": 6.84088622478104e-05, + "loss": 0.6578, + "step": 384 + }, + { + "epoch": 0.616, + "grad_norm": 0.39301804162906895, + "learning_rate": 6.791742301846326e-05, + "loss": 0.6932, + "step": 385 + }, + { + "epoch": 0.6176, + "grad_norm": 0.45009492097394654, + "learning_rate": 6.742684601840141e-05, + "loss": 0.677, + "step": 386 + }, + { + "epoch": 0.6192, + "grad_norm": 0.457100909318662, + "learning_rate": 6.693714443203507e-05, + "loss": 0.759, + "step": 387 + }, + { + "epoch": 0.6208, + "grad_norm": 0.37777549149897655, + "learning_rate": 6.644833142024751e-05, + "loss": 0.7813, + "step": 388 + }, + { + "epoch": 0.6224, + "grad_norm": 0.3641882091953124, + "learning_rate": 6.59604201200412e-05, + "loss": 0.6425, + "step": 389 + }, + { + "epoch": 0.624, + "grad_norm": 0.3944714771721567, + "learning_rate": 6.547342364418481e-05, + "loss": 0.7111, + "step": 390 + }, + { + "epoch": 0.6256, + "grad_norm": 0.44347824782901235, + "learning_rate": 6.498735508086093e-05, + "loss": 0.6993, + "step": 391 + }, + { + "epoch": 0.6272, + "grad_norm": 0.4252854974071172, + "learning_rate": 6.450222749331414e-05, + "loss": 0.719, + "step": 392 + }, + { + "epoch": 0.6288, + "grad_norm": 0.37910642726706467, + "learning_rate": 6.40180539194999e-05, + "loss": 0.7513, + "step": 393 + }, + { + "epoch": 0.6304, + "grad_norm": 0.40388732183288534, + "learning_rate": 6.35348473717345e-05, + "loss": 0.6867, + "step": 394 + }, + { + "epoch": 0.632, + "grad_norm": 0.40774502665216267, + "learning_rate": 6.305262083634488e-05, + "loss": 0.6722, + "step": 395 + }, + { + "epoch": 0.6336, + "grad_norm": 0.3565949655526324, + "learning_rate": 6.25713872733199e-05, + "loss": 0.6753, + "step": 396 + }, + { + "epoch": 0.6352, + "grad_norm": 0.40349754342358957, + "learning_rate": 6.209115961596208e-05, + "loss": 0.6782, + "step": 397 + }, + { + "epoch": 0.6368, + "grad_norm": 0.39411785907761565, + "learning_rate": 6.161195077053976e-05, + "loss": 0.7194, + "step": 398 + }, + { + "epoch": 0.6384, + "grad_norm": 0.3422061634288728, + "learning_rate": 6.113377361594049e-05, + "loss": 0.6673, + "step": 399 + }, + { + "epoch": 0.64, + "grad_norm": 0.3806686347878848, + "learning_rate": 6.065664100332478e-05, + "loss": 0.6397, + "step": 400 + }, + { + "epoch": 0.6416, + "grad_norm": 0.4265471840720824, + "learning_rate": 6.018056575578075e-05, + "loss": 0.7629, + "step": 401 + }, + { + "epoch": 0.6432, + "grad_norm": 0.42451095650718557, + "learning_rate": 5.970556066797941e-05, + "loss": 0.7231, + "step": 402 + }, + { + "epoch": 0.6448, + "grad_norm": 0.3763223634525672, + "learning_rate": 5.923163850583113e-05, + "loss": 0.6872, + "step": 403 + }, + { + "epoch": 0.6464, + "grad_norm": 0.43155190822128586, + "learning_rate": 5.875881200614207e-05, + "loss": 0.7582, + "step": 404 + }, + { + "epoch": 0.648, + "grad_norm": 0.43223385365559036, + "learning_rate": 5.828709387627218e-05, + "loss": 0.743, + "step": 405 + }, + { + "epoch": 0.6496, + "grad_norm": 0.41243673230459976, + "learning_rate": 5.781649679379378e-05, + "loss": 0.6332, + "step": 406 + }, + { + "epoch": 0.6512, + "grad_norm": 0.5021015889587451, + "learning_rate": 5.73470334061505e-05, + "loss": 0.704, + "step": 407 + }, + { + "epoch": 0.6528, + "grad_norm": 0.4223139997737187, + "learning_rate": 5.687871633031754e-05, + "loss": 0.7268, + "step": 408 + }, + { + "epoch": 0.6544, + "grad_norm": 0.5082179430384259, + "learning_rate": 5.6411558152462894e-05, + "loss": 0.7041, + "step": 409 + }, + { + "epoch": 0.656, + "grad_norm": 0.3647414345693038, + "learning_rate": 5.5945571427608526e-05, + "loss": 0.6506, + "step": 410 + }, + { + "epoch": 0.6576, + "grad_norm": 0.3619076769703431, + "learning_rate": 5.54807686792933e-05, + "loss": 0.6791, + "step": 411 + }, + { + "epoch": 0.6592, + "grad_norm": 0.41988404271514806, + "learning_rate": 5.501716239923642e-05, + "loss": 0.7256, + "step": 412 + }, + { + "epoch": 0.6608, + "grad_norm": 0.42540111458791796, + "learning_rate": 5.4554765047001613e-05, + "loss": 0.7357, + "step": 413 + }, + { + "epoch": 0.6624, + "grad_norm": 0.3883567454520023, + "learning_rate": 5.4093589049662175e-05, + "loss": 0.6199, + "step": 414 + }, + { + "epoch": 0.664, + "grad_norm": 0.5739204044719651, + "learning_rate": 5.363364680146725e-05, + "loss": 0.7103, + "step": 415 + }, + { + "epoch": 0.6656, + "grad_norm": 0.3814798108411745, + "learning_rate": 5.31749506635086e-05, + "loss": 0.655, + "step": 416 + }, + { + "epoch": 0.6672, + "grad_norm": 0.4179213286850888, + "learning_rate": 5.271751296338823e-05, + "loss": 0.6828, + "step": 417 + }, + { + "epoch": 0.6688, + "grad_norm": 0.40706861671193456, + "learning_rate": 5.226134599488728e-05, + "loss": 0.7264, + "step": 418 + }, + { + "epoch": 0.6704, + "grad_norm": 0.4135502510054031, + "learning_rate": 5.180646201763577e-05, + "loss": 0.7394, + "step": 419 + }, + { + "epoch": 0.672, + "grad_norm": 0.37420008958601003, + "learning_rate": 5.135287325678271e-05, + "loss": 0.7216, + "step": 420 + }, + { + "epoch": 0.6736, + "grad_norm": 0.4236759742172755, + "learning_rate": 5.090059190266779e-05, + "loss": 0.693, + "step": 421 + }, + { + "epoch": 0.6752, + "grad_norm": 0.36869906654334067, + "learning_rate": 5.0449630110493836e-05, + "loss": 0.6768, + "step": 422 + }, + { + "epoch": 0.6768, + "grad_norm": 0.43738746207366114, + "learning_rate": 5.000000000000002e-05, + "loss": 0.7576, + "step": 423 + }, + { + "epoch": 0.6784, + "grad_norm": 0.37688900548958404, + "learning_rate": 4.955171365513603e-05, + "loss": 0.7024, + "step": 424 + }, + { + "epoch": 0.68, + "grad_norm": 0.4817127626688613, + "learning_rate": 4.9104783123737566e-05, + "loss": 0.6916, + "step": 425 + }, + { + "epoch": 0.6816, + "grad_norm": 0.463120010250976, + "learning_rate": 4.865922041720239e-05, + "loss": 0.7071, + "step": 426 + }, + { + "epoch": 0.6832, + "grad_norm": 0.40705314884824917, + "learning_rate": 4.821503751016746e-05, + "loss": 0.647, + "step": 427 + }, + { + "epoch": 0.6848, + "grad_norm": 0.41894699513477546, + "learning_rate": 4.777224634018732e-05, + "loss": 0.6749, + "step": 428 + }, + { + "epoch": 0.6864, + "grad_norm": 0.41486614034921926, + "learning_rate": 4.733085880741301e-05, + "loss": 0.6849, + "step": 429 + }, + { + "epoch": 0.688, + "grad_norm": 0.3987288082147527, + "learning_rate": 4.689088677427249e-05, + "loss": 0.727, + "step": 430 + }, + { + "epoch": 0.6896, + "grad_norm": 0.40693878439217374, + "learning_rate": 4.645234206515171e-05, + "loss": 0.7404, + "step": 431 + }, + { + "epoch": 0.6912, + "grad_norm": 0.48122356069610334, + "learning_rate": 4.6015236466076747e-05, + "loss": 0.7152, + "step": 432 + }, + { + "epoch": 0.6928, + "grad_norm": 0.4387112264116535, + "learning_rate": 4.5579581724397255e-05, + "loss": 0.6811, + "step": 433 + }, + { + "epoch": 0.6944, + "grad_norm": 0.4329919657974128, + "learning_rate": 4.514538954847064e-05, + "loss": 0.7011, + "step": 434 + }, + { + "epoch": 0.696, + "grad_norm": 0.39117666474117113, + "learning_rate": 4.471267160734731e-05, + "loss": 0.7468, + "step": 435 + }, + { + "epoch": 0.6976, + "grad_norm": 0.37539949340381906, + "learning_rate": 4.428143953045717e-05, + "loss": 0.6666, + "step": 436 + }, + { + "epoch": 0.6992, + "grad_norm": 0.4141843669969637, + "learning_rate": 4.385170490729712e-05, + "loss": 0.7111, + "step": 437 + }, + { + "epoch": 0.7008, + "grad_norm": 0.465097596409026, + "learning_rate": 4.342347928711953e-05, + "loss": 0.6923, + "step": 438 + }, + { + "epoch": 0.7024, + "grad_norm": 0.42128752267401115, + "learning_rate": 4.2996774178621736e-05, + "loss": 0.6771, + "step": 439 + }, + { + "epoch": 0.704, + "grad_norm": 0.44301026272872385, + "learning_rate": 4.257160104963696e-05, + "loss": 0.6614, + "step": 440 + }, + { + "epoch": 0.7056, + "grad_norm": 0.393343927430952, + "learning_rate": 4.2147971326825966e-05, + "loss": 0.7273, + "step": 441 + }, + { + "epoch": 0.7072, + "grad_norm": 0.356091850538638, + "learning_rate": 4.172589639536991e-05, + "loss": 0.6423, + "step": 442 + }, + { + "epoch": 0.7088, + "grad_norm": 0.38892981580001185, + "learning_rate": 4.130538759866457e-05, + "loss": 0.7036, + "step": 443 + }, + { + "epoch": 0.7104, + "grad_norm": 0.4270517879334578, + "learning_rate": 4.088645623801534e-05, + "loss": 0.7552, + "step": 444 + }, + { + "epoch": 0.712, + "grad_norm": 0.3487738626170391, + "learning_rate": 4.046911357233343e-05, + "loss": 0.6323, + "step": 445 + }, + { + "epoch": 0.7136, + "grad_norm": 0.3582878418210738, + "learning_rate": 4.00533708178334e-05, + "loss": 0.6802, + "step": 446 + }, + { + "epoch": 0.7152, + "grad_norm": 0.3927117232498071, + "learning_rate": 3.963923914773187e-05, + "loss": 0.7313, + "step": 447 + }, + { + "epoch": 0.7168, + "grad_norm": 0.3771596287275359, + "learning_rate": 3.922672969194686e-05, + "loss": 0.6708, + "step": 448 + }, + { + "epoch": 0.7184, + "grad_norm": 0.3887304108114345, + "learning_rate": 3.8815853536798904e-05, + "loss": 0.7294, + "step": 449 + }, + { + "epoch": 0.72, + "grad_norm": 0.41132308825366887, + "learning_rate": 3.840662172471315e-05, + "loss": 0.6993, + "step": 450 + }, + { + "epoch": 0.7216, + "grad_norm": 0.5604341510454696, + "learning_rate": 3.79990452539225e-05, + "loss": 0.7326, + "step": 451 + }, + { + "epoch": 0.7232, + "grad_norm": 0.36800808851318567, + "learning_rate": 3.759313507817196e-05, + "loss": 0.6945, + "step": 452 + }, + { + "epoch": 0.7248, + "grad_norm": 0.4551362270717662, + "learning_rate": 3.7188902106424416e-05, + "loss": 0.7116, + "step": 453 + }, + { + "epoch": 0.7264, + "grad_norm": 0.3967014989182716, + "learning_rate": 3.678635720256737e-05, + "loss": 0.6713, + "step": 454 + }, + { + "epoch": 0.728, + "grad_norm": 0.41810976389240995, + "learning_rate": 3.638551118512089e-05, + "loss": 0.6636, + "step": 455 + }, + { + "epoch": 0.7296, + "grad_norm": 0.39831162237046414, + "learning_rate": 3.5986374826947066e-05, + "loss": 0.7352, + "step": 456 + }, + { + "epoch": 0.7312, + "grad_norm": 0.39152686726740915, + "learning_rate": 3.558895885496023e-05, + "loss": 0.6797, + "step": 457 + }, + { + "epoch": 0.7328, + "grad_norm": 0.3702035892320069, + "learning_rate": 3.519327394983888e-05, + "loss": 0.642, + "step": 458 + }, + { + "epoch": 0.7344, + "grad_norm": 0.4284658698643331, + "learning_rate": 3.479933074573858e-05, + "loss": 0.7011, + "step": 459 + }, + { + "epoch": 0.736, + "grad_norm": 0.4508346320756152, + "learning_rate": 3.440713983000601e-05, + "loss": 0.7802, + "step": 460 + }, + { + "epoch": 0.7376, + "grad_norm": 0.4061254031044715, + "learning_rate": 3.401671174289469e-05, + "loss": 0.74, + "step": 461 + }, + { + "epoch": 0.7392, + "grad_norm": 0.5494262708940839, + "learning_rate": 3.362805697728145e-05, + "loss": 0.7405, + "step": 462 + }, + { + "epoch": 0.7408, + "grad_norm": 0.39341045162109545, + "learning_rate": 3.324118597838464e-05, + "loss": 0.7094, + "step": 463 + }, + { + "epoch": 0.7424, + "grad_norm": 0.3959131207736413, + "learning_rate": 3.285610914348332e-05, + "loss": 0.6638, + "step": 464 + }, + { + "epoch": 0.744, + "grad_norm": 0.3694834374891536, + "learning_rate": 3.2472836821637744e-05, + "loss": 0.6627, + "step": 465 + }, + { + "epoch": 0.7456, + "grad_norm": 0.4243882003836683, + "learning_rate": 3.209137931341143e-05, + "loss": 0.7158, + "step": 466 + }, + { + "epoch": 0.7472, + "grad_norm": 0.3714137914163032, + "learning_rate": 3.1711746870594086e-05, + "loss": 0.7098, + "step": 467 + }, + { + "epoch": 0.7488, + "grad_norm": 0.42340250033689464, + "learning_rate": 3.1333949695926324e-05, + "loss": 0.6826, + "step": 468 + }, + { + "epoch": 0.7504, + "grad_norm": 0.40476712971856527, + "learning_rate": 3.0957997942825336e-05, + "loss": 0.6807, + "step": 469 + }, + { + "epoch": 0.752, + "grad_norm": 0.4776919565263879, + "learning_rate": 3.058390171511196e-05, + "loss": 0.7328, + "step": 470 + }, + { + "epoch": 0.7536, + "grad_norm": 0.36582983940998925, + "learning_rate": 3.021167106673928e-05, + "loss": 0.6658, + "step": 471 + }, + { + "epoch": 0.7552, + "grad_norm": 0.44933879235635416, + "learning_rate": 2.9841316001522347e-05, + "loss": 0.7881, + "step": 472 + }, + { + "epoch": 0.7568, + "grad_norm": 0.38568764690834195, + "learning_rate": 2.9472846472869298e-05, + "loss": 0.6641, + "step": 473 + }, + { + "epoch": 0.7584, + "grad_norm": 0.4245513722777998, + "learning_rate": 2.9106272383513835e-05, + "loss": 0.6464, + "step": 474 + }, + { + "epoch": 0.76, + "grad_norm": 0.3599442476497355, + "learning_rate": 2.874160358524931e-05, + "loss": 0.7028, + "step": 475 + }, + { + "epoch": 0.7616, + "grad_norm": 0.3974167086239789, + "learning_rate": 2.8378849878663628e-05, + "loss": 0.7233, + "step": 476 + }, + { + "epoch": 0.7632, + "grad_norm": 0.4024691565205865, + "learning_rate": 2.8018021012875994e-05, + "loss": 0.68, + "step": 477 + }, + { + "epoch": 0.7648, + "grad_norm": 0.45639176013861077, + "learning_rate": 2.7659126685275027e-05, + "loss": 0.7159, + "step": 478 + }, + { + "epoch": 0.7664, + "grad_norm": 0.5678650813149774, + "learning_rate": 2.7302176541257986e-05, + "loss": 0.8147, + "step": 479 + }, + { + "epoch": 0.768, + "grad_norm": 0.3655234186443611, + "learning_rate": 2.6947180173971508e-05, + "loss": 0.6329, + "step": 480 + }, + { + "epoch": 0.7696, + "grad_norm": 0.3898767857363938, + "learning_rate": 2.659414712405398e-05, + "loss": 0.6813, + "step": 481 + }, + { + "epoch": 0.7712, + "grad_norm": 0.38366851029825955, + "learning_rate": 2.6243086879379e-05, + "loss": 0.6019, + "step": 482 + }, + { + "epoch": 0.7728, + "grad_norm": 0.41818273573782716, + "learning_rate": 2.5894008874800325e-05, + "loss": 0.6936, + "step": 483 + }, + { + "epoch": 0.7744, + "grad_norm": 0.3844323639396766, + "learning_rate": 2.5546922491898495e-05, + "loss": 0.6761, + "step": 484 + }, + { + "epoch": 0.776, + "grad_norm": 0.3782640038884151, + "learning_rate": 2.5201837058728505e-05, + "loss": 0.6773, + "step": 485 + }, + { + "epoch": 0.7776, + "grad_norm": 0.43045786152928817, + "learning_rate": 2.485876184956928e-05, + "loss": 0.6985, + "step": 486 + }, + { + "epoch": 0.7792, + "grad_norm": 0.38243215655916374, + "learning_rate": 2.451770608467432e-05, + "loss": 0.6819, + "step": 487 + }, + { + "epoch": 0.7808, + "grad_norm": 0.4222589671661771, + "learning_rate": 2.417867893002387e-05, + "loss": 0.7213, + "step": 488 + }, + { + "epoch": 0.7824, + "grad_norm": 0.4139986346359861, + "learning_rate": 2.3841689497078746e-05, + "loss": 0.7128, + "step": 489 + }, + { + "epoch": 0.784, + "grad_norm": 0.35693680243173304, + "learning_rate": 2.3506746842535242e-05, + "loss": 0.6891, + "step": 490 + }, + { + "epoch": 0.7856, + "grad_norm": 0.42730598393904784, + "learning_rate": 2.3173859968081944e-05, + "loss": 0.744, + "step": 491 + }, + { + "epoch": 0.7872, + "grad_norm": 0.35254768958907134, + "learning_rate": 2.2843037820157675e-05, + "loss": 0.6475, + "step": 492 + }, + { + "epoch": 0.7888, + "grad_norm": 0.41770151723486093, + "learning_rate": 2.251428928971102e-05, + "loss": 0.7098, + "step": 493 + }, + { + "epoch": 0.7904, + "grad_norm": 0.45886649927793793, + "learning_rate": 2.2187623211961562e-05, + "loss": 0.6776, + "step": 494 + }, + { + "epoch": 0.792, + "grad_norm": 0.5696876266783285, + "learning_rate": 2.1863048366162208e-05, + "loss": 0.773, + "step": 495 + }, + { + "epoch": 0.7936, + "grad_norm": 0.4323839065907935, + "learning_rate": 2.1540573475363402e-05, + "loss": 0.772, + "step": 496 + }, + { + "epoch": 0.7952, + "grad_norm": 0.3566597351324305, + "learning_rate": 2.1220207206178688e-05, + "loss": 0.6078, + "step": 497 + }, + { + "epoch": 0.7968, + "grad_norm": 0.40544828268521993, + "learning_rate": 2.0901958168551638e-05, + "loss": 0.7531, + "step": 498 + }, + { + "epoch": 0.7984, + "grad_norm": 0.38565418463244233, + "learning_rate": 2.058583491552465e-05, + "loss": 0.6449, + "step": 499 + }, + { + "epoch": 0.8, + "grad_norm": 0.3442537755701917, + "learning_rate": 2.027184594300898e-05, + "loss": 0.6478, + "step": 500 + }, + { + "epoch": 0.8016, + "grad_norm": 0.3874669780042446, + "learning_rate": 1.995999968955641e-05, + "loss": 0.7473, + "step": 501 + }, + { + "epoch": 0.8032, + "grad_norm": 0.4234105363998634, + "learning_rate": 1.9650304536132426e-05, + "loss": 0.6769, + "step": 502 + }, + { + "epoch": 0.8048, + "grad_norm": 0.38179326044497314, + "learning_rate": 1.9342768805891178e-05, + "loss": 0.695, + "step": 503 + }, + { + "epoch": 0.8064, + "grad_norm": 0.38276100717171035, + "learning_rate": 1.903740076395151e-05, + "loss": 0.7053, + "step": 504 + }, + { + "epoch": 0.808, + "grad_norm": 0.4696229659642793, + "learning_rate": 1.8734208617174988e-05, + "loss": 0.8268, + "step": 505 + }, + { + "epoch": 0.8096, + "grad_norm": 0.8555868502851657, + "learning_rate": 1.8433200513945337e-05, + "loss": 0.6492, + "step": 506 + }, + { + "epoch": 0.8112, + "grad_norm": 0.3947201336864822, + "learning_rate": 1.8134384543949478e-05, + "loss": 0.7087, + "step": 507 + }, + { + "epoch": 0.8128, + "grad_norm": 0.43168055797308075, + "learning_rate": 1.783776873795994e-05, + "loss": 0.6682, + "step": 508 + }, + { + "epoch": 0.8144, + "grad_norm": 0.3581726150858908, + "learning_rate": 1.754336106761927e-05, + "loss": 0.6454, + "step": 509 + }, + { + "epoch": 0.816, + "grad_norm": 0.3762481743735936, + "learning_rate": 1.7251169445225657e-05, + "loss": 0.686, + "step": 510 + }, + { + "epoch": 0.8176, + "grad_norm": 0.40301944000895196, + "learning_rate": 1.696120172352025e-05, + "loss": 0.7253, + "step": 511 + }, + { + "epoch": 0.8192, + "grad_norm": 0.45011718247532684, + "learning_rate": 1.6673465695476232e-05, + "loss": 0.6537, + "step": 512 + }, + { + "epoch": 0.8208, + "grad_norm": 0.5877291229030772, + "learning_rate": 1.6387969094089316e-05, + "loss": 0.6793, + "step": 513 + }, + { + "epoch": 0.8224, + "grad_norm": 0.38161037625625915, + "learning_rate": 1.6104719592169902e-05, + "loss": 0.6606, + "step": 514 + }, + { + "epoch": 0.824, + "grad_norm": 0.39844438785050906, + "learning_rate": 1.5823724802136865e-05, + "loss": 0.6411, + "step": 515 + }, + { + "epoch": 0.8256, + "grad_norm": 0.40349370889355574, + "learning_rate": 1.5544992275813053e-05, + "loss": 0.6796, + "step": 516 + }, + { + "epoch": 0.8272, + "grad_norm": 0.42373407540587016, + "learning_rate": 1.526852950422226e-05, + "loss": 0.6447, + "step": 517 + }, + { + "epoch": 0.8288, + "grad_norm": 0.5181792587601858, + "learning_rate": 1.4994343917387854e-05, + "loss": 0.7684, + "step": 518 + }, + { + "epoch": 0.8304, + "grad_norm": 0.40229006497217784, + "learning_rate": 1.4722442884133214e-05, + "loss": 0.6519, + "step": 519 + }, + { + "epoch": 0.832, + "grad_norm": 0.5298666495639766, + "learning_rate": 1.4452833711883628e-05, + "loss": 0.762, + "step": 520 + }, + { + "epoch": 0.8336, + "grad_norm": 0.3611511542028321, + "learning_rate": 1.4185523646469822e-05, + "loss": 0.661, + "step": 521 + }, + { + "epoch": 0.8352, + "grad_norm": 0.42073350377014856, + "learning_rate": 1.3920519871933424e-05, + "loss": 0.7973, + "step": 522 + }, + { + "epoch": 0.8368, + "grad_norm": 0.33934604125514983, + "learning_rate": 1.3657829510333654e-05, + "loss": 0.6651, + "step": 523 + }, + { + "epoch": 0.8384, + "grad_norm": 0.3954898300065919, + "learning_rate": 1.339745962155613e-05, + "loss": 0.6813, + "step": 524 + }, + { + "epoch": 0.84, + "grad_norm": 0.3859046411055772, + "learning_rate": 1.3139417203123027e-05, + "loss": 0.6729, + "step": 525 + }, + { + "epoch": 0.8416, + "grad_norm": 0.41168273314796267, + "learning_rate": 1.2883709190004955e-05, + "loss": 0.662, + "step": 526 + }, + { + "epoch": 0.8432, + "grad_norm": 0.5329268906641638, + "learning_rate": 1.263034245443473e-05, + "loss": 0.677, + "step": 527 + }, + { + "epoch": 0.8448, + "grad_norm": 0.3751302125049051, + "learning_rate": 1.2379323805722576e-05, + "loss": 0.6476, + "step": 528 + }, + { + "epoch": 0.8464, + "grad_norm": 0.3414502913086143, + "learning_rate": 1.2130659990073146e-05, + "loss": 0.6855, + "step": 529 + }, + { + "epoch": 0.848, + "grad_norm": 0.39370008735547357, + "learning_rate": 1.1884357690404158e-05, + "loss": 0.6613, + "step": 530 + }, + { + "epoch": 0.8496, + "grad_norm": 0.4706464990815659, + "learning_rate": 1.1640423526166988e-05, + "loss": 0.7725, + "step": 531 + }, + { + "epoch": 0.8512, + "grad_norm": 0.43440588684790993, + "learning_rate": 1.1398864053168534e-05, + "loss": 0.6818, + "step": 532 + }, + { + "epoch": 0.8528, + "grad_norm": 0.4641564370485342, + "learning_rate": 1.1159685763395111e-05, + "loss": 0.6561, + "step": 533 + }, + { + "epoch": 0.8544, + "grad_norm": 0.3547950187068299, + "learning_rate": 1.0922895084838037e-05, + "loss": 0.6595, + "step": 534 + }, + { + "epoch": 0.856, + "grad_norm": 0.4017536231519369, + "learning_rate": 1.0688498381320855e-05, + "loss": 0.6846, + "step": 535 + }, + { + "epoch": 0.8576, + "grad_norm": 0.36796335900360183, + "learning_rate": 1.045650195232819e-05, + "loss": 0.7318, + "step": 536 + }, + { + "epoch": 0.8592, + "grad_norm": 0.3835028765880275, + "learning_rate": 1.0226912032836611e-05, + "loss": 0.6391, + "step": 537 + }, + { + "epoch": 0.8608, + "grad_norm": 0.4431301777922845, + "learning_rate": 9.999734793146998e-06, + "loss": 0.6863, + "step": 538 + }, + { + "epoch": 0.8624, + "grad_norm": 0.39326649560923305, + "learning_rate": 9.774976338718677e-06, + "loss": 0.6428, + "step": 539 + }, + { + "epoch": 0.864, + "grad_norm": 0.3960017804398548, + "learning_rate": 9.552642710005299e-06, + "loss": 0.6963, + "step": 540 + }, + { + "epoch": 0.8656, + "grad_norm": 0.3473501536168104, + "learning_rate": 9.332739882292752e-06, + "loss": 0.6705, + "step": 541 + }, + { + "epoch": 0.8672, + "grad_norm": 0.43796894089751437, + "learning_rate": 9.115273765538202e-06, + "loss": 0.8052, + "step": 542 + }, + { + "epoch": 0.8688, + "grad_norm": 0.41778489241576106, + "learning_rate": 8.900250204211514e-06, + "loss": 0.7315, + "step": 543 + }, + { + "epoch": 0.8704, + "grad_norm": 0.4509501130995678, + "learning_rate": 8.687674977138116e-06, + "loss": 0.7519, + "step": 544 + }, + { + "epoch": 0.872, + "grad_norm": 0.4218560005017982, + "learning_rate": 8.47755379734373e-06, + "loss": 0.7389, + "step": 545 + }, + { + "epoch": 0.8736, + "grad_norm": 0.5009005588292391, + "learning_rate": 8.269892311900696e-06, + "loss": 0.6953, + "step": 546 + }, + { + "epoch": 0.8752, + "grad_norm": 0.40176857746830935, + "learning_rate": 8.064696101776358e-06, + "loss": 0.6098, + "step": 547 + }, + { + "epoch": 0.8768, + "grad_norm": 0.36381722317326654, + "learning_rate": 7.861970681683051e-06, + "loss": 0.6311, + "step": 548 + }, + { + "epoch": 0.8784, + "grad_norm": 0.387649987787867, + "learning_rate": 7.661721499929753e-06, + "loss": 0.6955, + "step": 549 + }, + { + "epoch": 0.88, + "grad_norm": 0.4089753780142505, + "learning_rate": 7.463953938275858e-06, + "loss": 0.7396, + "step": 550 + }, + { + "epoch": 0.8816, + "grad_norm": 0.39354628384238033, + "learning_rate": 7.2686733117863784e-06, + "loss": 0.6935, + "step": 551 + }, + { + "epoch": 0.8832, + "grad_norm": 0.3883473028832529, + "learning_rate": 7.07588486868922e-06, + "loss": 0.7029, + "step": 552 + }, + { + "epoch": 0.8848, + "grad_norm": 0.42996333687152455, + "learning_rate": 6.8855937902340576e-06, + "loss": 0.6968, + "step": 553 + }, + { + "epoch": 0.8864, + "grad_norm": 0.4498999480247996, + "learning_rate": 6.6978051905530855e-06, + "loss": 0.6774, + "step": 554 + }, + { + "epoch": 0.888, + "grad_norm": 0.3914723199388177, + "learning_rate": 6.512524116523633e-06, + "loss": 0.7194, + "step": 555 + }, + { + "epoch": 0.8896, + "grad_norm": 0.3692776963674782, + "learning_rate": 6.329755547632499e-06, + "loss": 0.6458, + "step": 556 + }, + { + "epoch": 0.8912, + "grad_norm": 0.36240500344216386, + "learning_rate": 6.149504395842087e-06, + "loss": 0.7188, + "step": 557 + }, + { + "epoch": 0.8928, + "grad_norm": 0.4345620651272169, + "learning_rate": 5.971775505458444e-06, + "loss": 0.6915, + "step": 558 + }, + { + "epoch": 0.8944, + "grad_norm": 0.3949763813975571, + "learning_rate": 5.7965736530010916e-06, + "loss": 0.6907, + "step": 559 + }, + { + "epoch": 0.896, + "grad_norm": 0.48200077738797686, + "learning_rate": 5.623903547074549e-06, + "loss": 0.6957, + "step": 560 + }, + { + "epoch": 0.8976, + "grad_norm": 0.40812750082411603, + "learning_rate": 5.453769828241872e-06, + "loss": 0.6722, + "step": 561 + }, + { + "epoch": 0.8992, + "grad_norm": 0.36757540821350676, + "learning_rate": 5.286177068899989e-06, + "loss": 0.7202, + "step": 562 + }, + { + "epoch": 0.9008, + "grad_norm": 0.4209376972986613, + "learning_rate": 5.121129773156663e-06, + "loss": 0.7454, + "step": 563 + }, + { + "epoch": 0.9024, + "grad_norm": 0.49240819082330106, + "learning_rate": 4.95863237670956e-06, + "loss": 0.6703, + "step": 564 + }, + { + "epoch": 0.904, + "grad_norm": 0.42052473011611863, + "learning_rate": 4.798689246727006e-06, + "loss": 0.6981, + "step": 565 + }, + { + "epoch": 0.9056, + "grad_norm": 0.38666076352459755, + "learning_rate": 4.641304681730641e-06, + "loss": 0.6339, + "step": 566 + }, + { + "epoch": 0.9072, + "grad_norm": 0.5103278801828982, + "learning_rate": 4.486482911479839e-06, + "loss": 0.6758, + "step": 567 + }, + { + "epoch": 0.9088, + "grad_norm": 0.3950790980482145, + "learning_rate": 4.3342280968580285e-06, + "loss": 0.658, + "step": 568 + }, + { + "epoch": 0.9104, + "grad_norm": 0.37367772953751666, + "learning_rate": 4.184544329761009e-06, + "loss": 0.681, + "step": 569 + }, + { + "epoch": 0.912, + "grad_norm": 0.4722906048352251, + "learning_rate": 4.037435632986786e-06, + "loss": 0.7039, + "step": 570 + }, + { + "epoch": 0.9136, + "grad_norm": 0.3952571039628059, + "learning_rate": 3.892905960127546e-06, + "loss": 0.6592, + "step": 571 + }, + { + "epoch": 0.9152, + "grad_norm": 0.4054839394569938, + "learning_rate": 3.750959195463466e-06, + "loss": 0.7219, + "step": 572 + }, + { + "epoch": 0.9168, + "grad_norm": 0.37853122272326006, + "learning_rate": 3.611599153858214e-06, + "loss": 0.6838, + "step": 573 + }, + { + "epoch": 0.9184, + "grad_norm": 0.4388537566773106, + "learning_rate": 3.4748295806564356e-06, + "loss": 0.6672, + "step": 574 + }, + { + "epoch": 0.92, + "grad_norm": 0.4075987757543486, + "learning_rate": 3.3406541515832003e-06, + "loss": 0.697, + "step": 575 + }, + { + "epoch": 0.9216, + "grad_norm": 0.3728087912376961, + "learning_rate": 3.209076472645112e-06, + "loss": 0.6644, + "step": 576 + }, + { + "epoch": 0.9232, + "grad_norm": 0.40000303295602, + "learning_rate": 3.0801000800333877e-06, + "loss": 0.6699, + "step": 577 + }, + { + "epoch": 0.9248, + "grad_norm": 0.3970615658991015, + "learning_rate": 2.9537284400289355e-06, + "loss": 0.6135, + "step": 578 + }, + { + "epoch": 0.9264, + "grad_norm": 0.4781533003822949, + "learning_rate": 2.8299649489090475e-06, + "loss": 0.6396, + "step": 579 + }, + { + "epoch": 0.928, + "grad_norm": 0.43818192697582575, + "learning_rate": 2.708812932856253e-06, + "loss": 0.7082, + "step": 580 + }, + { + "epoch": 0.9296, + "grad_norm": 0.6072326147781507, + "learning_rate": 2.590275647868867e-06, + "loss": 0.7814, + "step": 581 + }, + { + "epoch": 0.9312, + "grad_norm": 0.3991212490595163, + "learning_rate": 2.4743562796734622e-06, + "loss": 0.6503, + "step": 582 + }, + { + "epoch": 0.9328, + "grad_norm": 0.3763315614668962, + "learning_rate": 2.3610579436393e-06, + "loss": 0.6345, + "step": 583 + }, + { + "epoch": 0.9344, + "grad_norm": 0.43224625562244884, + "learning_rate": 2.250383684694579e-06, + "loss": 0.6897, + "step": 584 + }, + { + "epoch": 0.936, + "grad_norm": 0.37483903559422577, + "learning_rate": 2.1423364772445887e-06, + "loss": 0.5833, + "step": 585 + }, + { + "epoch": 0.9376, + "grad_norm": 0.4294379674389417, + "learning_rate": 2.036919225091827e-06, + "loss": 0.7418, + "step": 586 + }, + { + "epoch": 0.9392, + "grad_norm": 0.3844121390612277, + "learning_rate": 1.9341347613579087e-06, + "loss": 0.6844, + "step": 587 + }, + { + "epoch": 0.9408, + "grad_norm": 0.49491590543852876, + "learning_rate": 1.8339858484073935e-06, + "loss": 0.6699, + "step": 588 + }, + { + "epoch": 0.9424, + "grad_norm": 0.3817599359358462, + "learning_rate": 1.7364751777736332e-06, + "loss": 0.6983, + "step": 589 + }, + { + "epoch": 0.944, + "grad_norm": 0.388831097799436, + "learning_rate": 1.6416053700863964e-06, + "loss": 0.6938, + "step": 590 + }, + { + "epoch": 0.9456, + "grad_norm": 0.4253964073682958, + "learning_rate": 1.5493789750014031e-06, + "loss": 0.7043, + "step": 591 + }, + { + "epoch": 0.9472, + "grad_norm": 0.4019786521611652, + "learning_rate": 1.459798471131868e-06, + "loss": 0.7003, + "step": 592 + }, + { + "epoch": 0.9488, + "grad_norm": 0.41029257226138827, + "learning_rate": 1.3728662659818204e-06, + "loss": 0.6478, + "step": 593 + }, + { + "epoch": 0.9504, + "grad_norm": 0.40006702059796145, + "learning_rate": 1.2885846958814673e-06, + "loss": 0.6721, + "step": 594 + }, + { + "epoch": 0.952, + "grad_norm": 0.35776307548767483, + "learning_rate": 1.2069560259243328e-06, + "loss": 0.6266, + "step": 595 + }, + { + "epoch": 0.9536, + "grad_norm": 0.47105249718282655, + "learning_rate": 1.1279824499064396e-06, + "loss": 0.701, + "step": 596 + }, + { + "epoch": 0.9552, + "grad_norm": 0.4317895828275148, + "learning_rate": 1.0516660902673448e-06, + "loss": 0.6946, + "step": 597 + }, + { + "epoch": 0.9568, + "grad_norm": 0.4054666687694706, + "learning_rate": 9.780089980330642e-07, + "loss": 0.7091, + "step": 598 + }, + { + "epoch": 0.9584, + "grad_norm": 0.36373593809758037, + "learning_rate": 9.070131527609604e-07, + "loss": 0.6332, + "step": 599 + }, + { + "epoch": 0.96, + "grad_norm": 0.39891340503921363, + "learning_rate": 8.386804624865851e-07, + "loss": 0.6173, + "step": 600 + }, + { + "epoch": 0.9616, + "grad_norm": 0.3989415289853632, + "learning_rate": 7.730127636723539e-07, + "loss": 0.7076, + "step": 601 + }, + { + "epoch": 0.9632, + "grad_norm": 0.37187945556387686, + "learning_rate": 7.100118211581852e-07, + "loss": 0.7114, + "step": 602 + }, + { + "epoch": 0.9648, + "grad_norm": 0.39326199413024704, + "learning_rate": 6.496793281141056e-07, + "loss": 0.6956, + "step": 603 + }, + { + "epoch": 0.9664, + "grad_norm": 0.40517977262446664, + "learning_rate": 5.920169059947411e-07, + "loss": 0.7022, + "step": 604 + }, + { + "epoch": 0.968, + "grad_norm": 0.3530929889532916, + "learning_rate": 5.370261044956971e-07, + "loss": 0.6503, + "step": 605 + }, + { + "epoch": 0.9696, + "grad_norm": 0.4051342459696674, + "learning_rate": 4.847084015119574e-07, + "loss": 0.6113, + "step": 606 + }, + { + "epoch": 0.9712, + "grad_norm": 0.4031360752836336, + "learning_rate": 4.3506520309813947e-07, + "loss": 0.6781, + "step": 607 + }, + { + "epoch": 0.9728, + "grad_norm": 0.372718805792164, + "learning_rate": 3.8809784343072366e-07, + "loss": 0.6969, + "step": 608 + }, + { + "epoch": 0.9744, + "grad_norm": 0.40443611518221334, + "learning_rate": 3.4380758477219333e-07, + "loss": 0.6658, + "step": 609 + }, + { + "epoch": 0.976, + "grad_norm": 0.3957897106920901, + "learning_rate": 3.0219561743707326e-07, + "loss": 0.6951, + "step": 610 + }, + { + "epoch": 0.9776, + "grad_norm": 0.3675080072405774, + "learning_rate": 2.6326305976001055e-07, + "loss": 0.6447, + "step": 611 + }, + { + "epoch": 0.9792, + "grad_norm": 0.4363103927205831, + "learning_rate": 2.2701095806565432e-07, + "loss": 0.6462, + "step": 612 + }, + { + "epoch": 0.9808, + "grad_norm": 0.4000734569971733, + "learning_rate": 1.9344028664056713e-07, + "loss": 0.7189, + "step": 613 + }, + { + "epoch": 0.9824, + "grad_norm": 0.37295914146931103, + "learning_rate": 1.6255194770704586e-07, + "loss": 0.6138, + "step": 614 + }, + { + "epoch": 0.984, + "grad_norm": 0.4334530927924268, + "learning_rate": 1.3434677139885222e-07, + "loss": 0.71, + "step": 615 + }, + { + "epoch": 0.9856, + "grad_norm": 0.43964206335543465, + "learning_rate": 1.0882551573891953e-07, + "loss": 0.7398, + "step": 616 + }, + { + "epoch": 0.9872, + "grad_norm": 0.3556225139605389, + "learning_rate": 8.598886661895788e-08, + "loss": 0.681, + "step": 617 + }, + { + "epoch": 0.9888, + "grad_norm": 0.35110072559816907, + "learning_rate": 6.583743778106887e-08, + "loss": 0.7005, + "step": 618 + }, + { + "epoch": 0.9904, + "grad_norm": 0.39265638565024086, + "learning_rate": 4.837177080119215e-08, + "loss": 0.6924, + "step": 619 + }, + { + "epoch": 0.992, + "grad_norm": 0.3693180236955692, + "learning_rate": 3.359233507459481e-08, + "loss": 0.6265, + "step": 620 + }, + { + "epoch": 0.9936, + "grad_norm": 0.39527655043634513, + "learning_rate": 2.1499527803214846e-08, + "loss": 0.6872, + "step": 621 + }, + { + "epoch": 0.9952, + "grad_norm": 0.35127237654272503, + "learning_rate": 1.209367398504746e-08, + "loss": 0.6414, + "step": 622 + }, + { + "epoch": 0.9968, + "grad_norm": 0.4139802895909404, + "learning_rate": 5.375026405352035e-09, + "loss": 0.6779, + "step": 623 + }, + { + "epoch": 0.9984, + "grad_norm": 0.46134657671608, + "learning_rate": 1.3437656298687097e-09, + "loss": 0.6679, + "step": 624 + }, + { + "epoch": 1.0, + "grad_norm": 0.4182580697871817, + "learning_rate": 0.0, + "loss": 0.6586, + "step": 625 + }, + { + "epoch": 1.0, + "step": 625, + "total_flos": 544733249077248.0, + "train_loss": 0.7485533044815064, + "train_runtime": 9694.5152, + "train_samples_per_second": 1.032, + "train_steps_per_second": 0.064 + } + ], + "logging_steps": 1.0, + "max_steps": 625, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 544733249077248.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_2_epochs_1_GA_4_lora/README.md b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_2_epochs_1_GA_4_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_2_epochs_1_GA_4_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_2_epochs_1_GA_4_lora/adapter_config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_2_epochs_1_GA_4_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f2e2de7d050474c95de7e20903e415b441132d0e --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_2_epochs_1_GA_4_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "k_proj", + "down_proj", + "o_proj", + "q_proj", + "gate_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_2_epochs_1_GA_4_lora/adapter_model.safetensors b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_2_epochs_1_GA_4_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1e34f68cc8d9f40f9e783441361461dfd6c1f6fc --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_2_epochs_1_GA_4_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06435b9e6264f2bb52e7961096b6f7286ca1a123ba7ae9d2c7eecf42946d69c6 +size 671150064 diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_2_epochs_1_GA_4_lora/config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_2_epochs_1_GA_4_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_2_epochs_1_GA_4_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_2_epochs_1_GA_4_lora/non_lora_trainables.bin b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_2_epochs_1_GA_4_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..15e0cd93d2ff97671fd55d157ca92d2eed94f708 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_2_epochs_1_GA_4_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36794d35ab6631bbdad14bb7c69ee506c4c65a344907995f0d57be1cb24555dd +size 918507402 diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_2_epochs_1_GA_4_lora/trainer_state.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_2_epochs_1_GA_4_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..581725a51e41a51a184c556483c2007943a24d75 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_2_epochs_1_GA_4_lora/trainer_state.json @@ -0,0 +1,2226 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9984, + "eval_steps": 500, + "global_step": 312, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0032, + "grad_norm": 0.9313031558184078, + "learning_rate": 2e-05, + "loss": 1.5031, + "step": 1 + }, + { + "epoch": 0.0064, + "grad_norm": 0.8263843841694538, + "learning_rate": 4e-05, + "loss": 1.4033, + "step": 2 + }, + { + "epoch": 0.0096, + "grad_norm": 0.8911585947754022, + "learning_rate": 6e-05, + "loss": 1.4189, + "step": 3 + }, + { + "epoch": 0.0128, + "grad_norm": 0.7762495657811989, + "learning_rate": 8e-05, + "loss": 1.3086, + "step": 4 + }, + { + "epoch": 0.016, + "grad_norm": 0.9567510474105789, + "learning_rate": 0.0001, + "loss": 1.1303, + "step": 5 + }, + { + "epoch": 0.0192, + "grad_norm": 1.081815042749168, + "learning_rate": 0.00012, + "loss": 1.0796, + "step": 6 + }, + { + "epoch": 0.0224, + "grad_norm": 0.8285018211345848, + "learning_rate": 0.00014, + "loss": 1.0065, + "step": 7 + }, + { + "epoch": 0.0256, + "grad_norm": 0.6531381930650225, + "learning_rate": 0.00016, + "loss": 0.91, + "step": 8 + }, + { + "epoch": 0.0288, + "grad_norm": 0.48711720892258237, + "learning_rate": 0.00018, + "loss": 0.9225, + "step": 9 + }, + { + "epoch": 0.032, + "grad_norm": 0.4034594199818848, + "learning_rate": 0.0002, + "loss": 0.8634, + "step": 10 + }, + { + "epoch": 0.0352, + "grad_norm": 0.42072047599714224, + "learning_rate": 0.00019999458931878073, + "loss": 0.931, + "step": 11 + }, + { + "epoch": 0.0384, + "grad_norm": 0.5072449995615028, + "learning_rate": 0.0001999783578606323, + "loss": 0.8797, + "step": 12 + }, + { + "epoch": 0.0416, + "grad_norm": 0.46380549861086345, + "learning_rate": 0.00019995130738201966, + "loss": 0.9179, + "step": 13 + }, + { + "epoch": 0.0448, + "grad_norm": 0.385315677598143, + "learning_rate": 0.0001999134408101731, + "loss": 0.8593, + "step": 14 + }, + { + "epoch": 0.048, + "grad_norm": 0.46137284387864597, + "learning_rate": 0.00019986476224277165, + "loss": 0.9418, + "step": 15 + }, + { + "epoch": 0.0512, + "grad_norm": 0.45126315868831723, + "learning_rate": 0.00019980527694749952, + "loss": 0.862, + "step": 16 + }, + { + "epoch": 0.0544, + "grad_norm": 0.42373725126240985, + "learning_rate": 0.00019973499136147606, + "loss": 0.8423, + "step": 17 + }, + { + "epoch": 0.0576, + "grad_norm": 0.36598981027567196, + "learning_rate": 0.0001996539130905593, + "loss": 0.8193, + "step": 18 + }, + { + "epoch": 0.0608, + "grad_norm": 0.44213039096300505, + "learning_rate": 0.0001995620509085228, + "loss": 0.8726, + "step": 19 + }, + { + "epoch": 0.064, + "grad_norm": 0.39789444791501166, + "learning_rate": 0.00019945941475610623, + "loss": 0.8407, + "step": 20 + }, + { + "epoch": 0.0672, + "grad_norm": 0.4092142360498181, + "learning_rate": 0.0001993460157399396, + "loss": 0.881, + "step": 21 + }, + { + "epoch": 0.0704, + "grad_norm": 0.3468050800538684, + "learning_rate": 0.0001992218661313415, + "loss": 0.8674, + "step": 22 + }, + { + "epoch": 0.0736, + "grad_norm": 0.4233807018872381, + "learning_rate": 0.00019908697936499103, + "loss": 0.9065, + "step": 23 + }, + { + "epoch": 0.0768, + "grad_norm": 0.33971286367406983, + "learning_rate": 0.00019894137003747403, + "loss": 0.7654, + "step": 24 + }, + { + "epoch": 0.08, + "grad_norm": 0.4046955760630767, + "learning_rate": 0.00019878505390570362, + "loss": 0.8813, + "step": 25 + }, + { + "epoch": 0.0832, + "grad_norm": 0.34005001112275796, + "learning_rate": 0.00019861804788521493, + "loss": 0.8236, + "step": 26 + }, + { + "epoch": 0.0864, + "grad_norm": 0.4025294677541818, + "learning_rate": 0.00019844037004833473, + "loss": 0.8051, + "step": 27 + }, + { + "epoch": 0.0896, + "grad_norm": 0.3686358863169206, + "learning_rate": 0.00019825203962222572, + "loss": 0.836, + "step": 28 + }, + { + "epoch": 0.0928, + "grad_norm": 0.32331230707111247, + "learning_rate": 0.0001980530769868059, + "loss": 0.7774, + "step": 29 + }, + { + "epoch": 0.096, + "grad_norm": 0.31282451699321023, + "learning_rate": 0.00019784350367254322, + "loss": 0.7723, + "step": 30 + }, + { + "epoch": 0.0992, + "grad_norm": 0.37439767878753266, + "learning_rate": 0.0001976233423581255, + "loss": 0.8478, + "step": 31 + }, + { + "epoch": 0.1024, + "grad_norm": 0.32333681034525386, + "learning_rate": 0.0001973926168680066, + "loss": 0.7865, + "step": 32 + }, + { + "epoch": 0.1056, + "grad_norm": 0.4432036670007486, + "learning_rate": 0.00019715135216982798, + "loss": 0.8627, + "step": 33 + }, + { + "epoch": 0.1088, + "grad_norm": 0.37900974331118864, + "learning_rate": 0.0001968995743717171, + "loss": 0.8027, + "step": 34 + }, + { + "epoch": 0.112, + "grad_norm": 0.36170264976324784, + "learning_rate": 0.00019663731071946206, + "loss": 0.7984, + "step": 35 + }, + { + "epoch": 0.1152, + "grad_norm": 0.3283349748815554, + "learning_rate": 0.00019636458959356316, + "loss": 0.7789, + "step": 36 + }, + { + "epoch": 0.1184, + "grad_norm": 0.38010110559542276, + "learning_rate": 0.0001960814405061619, + "loss": 0.7616, + "step": 37 + }, + { + "epoch": 0.1216, + "grad_norm": 0.35779184963693483, + "learning_rate": 0.00019578789409784727, + "loss": 0.8295, + "step": 38 + }, + { + "epoch": 0.1248, + "grad_norm": 0.3188972855562048, + "learning_rate": 0.00019548398213434007, + "loss": 0.7941, + "step": 39 + }, + { + "epoch": 0.128, + "grad_norm": 0.33885595331420143, + "learning_rate": 0.00019516973750305532, + "loss": 0.761, + "step": 40 + }, + { + "epoch": 0.1312, + "grad_norm": 0.6599626462124875, + "learning_rate": 0.00019484519420954354, + "loss": 0.8581, + "step": 41 + }, + { + "epoch": 0.1344, + "grad_norm": 0.30590437571330525, + "learning_rate": 0.00019451038737381077, + "loss": 0.7641, + "step": 42 + }, + { + "epoch": 0.1376, + "grad_norm": 0.32451038077710237, + "learning_rate": 0.00019416535322651818, + "loss": 0.7923, + "step": 43 + }, + { + "epoch": 0.1408, + "grad_norm": 0.31066761640154694, + "learning_rate": 0.00019381012910506146, + "loss": 0.7954, + "step": 44 + }, + { + "epoch": 0.144, + "grad_norm": 0.3138858814462281, + "learning_rate": 0.00019344475344953012, + "loss": 0.802, + "step": 45 + }, + { + "epoch": 0.1472, + "grad_norm": 0.32614645591571356, + "learning_rate": 0.00019306926579854821, + "loss": 0.7911, + "step": 46 + }, + { + "epoch": 0.1504, + "grad_norm": 0.3024007790436311, + "learning_rate": 0.00019268370678499533, + "loss": 0.8083, + "step": 47 + }, + { + "epoch": 0.1536, + "grad_norm": 0.30440235198263627, + "learning_rate": 0.0001922881181316097, + "loss": 0.7668, + "step": 48 + }, + { + "epoch": 0.1568, + "grad_norm": 0.352744802698165, + "learning_rate": 0.00019188254264647337, + "loss": 0.8415, + "step": 49 + }, + { + "epoch": 0.16, + "grad_norm": 0.31658794467190116, + "learning_rate": 0.0001914670242183795, + "loss": 0.8017, + "step": 50 + }, + { + "epoch": 0.1632, + "grad_norm": 0.31492856970059707, + "learning_rate": 0.0001910416078120832, + "loss": 0.7642, + "step": 51 + }, + { + "epoch": 0.1664, + "grad_norm": 0.3537817488504811, + "learning_rate": 0.0001906063394634356, + "loss": 0.8039, + "step": 52 + }, + { + "epoch": 0.1696, + "grad_norm": 0.3148253491813399, + "learning_rate": 0.00019016126627440237, + "loss": 0.7959, + "step": 53 + }, + { + "epoch": 0.1728, + "grad_norm": 0.31850399118450406, + "learning_rate": 0.00018970643640796642, + "loss": 0.8316, + "step": 54 + }, + { + "epoch": 0.176, + "grad_norm": 0.3056092085262738, + "learning_rate": 0.000189241899082916, + "loss": 0.7915, + "step": 55 + }, + { + "epoch": 0.1792, + "grad_norm": 0.30536451054961206, + "learning_rate": 0.00018876770456851877, + "loss": 0.7883, + "step": 56 + }, + { + "epoch": 0.1824, + "grad_norm": 0.34941693898355236, + "learning_rate": 0.0001882839041790818, + "loss": 0.7571, + "step": 57 + }, + { + "epoch": 0.1856, + "grad_norm": 0.296880968864248, + "learning_rate": 0.00018779055026839868, + "loss": 0.7288, + "step": 58 + }, + { + "epoch": 0.1888, + "grad_norm": 0.3322679426381215, + "learning_rate": 0.00018728769622408423, + "loss": 0.8012, + "step": 59 + }, + { + "epoch": 0.192, + "grad_norm": 0.33260727442082016, + "learning_rate": 0.00018677539646179707, + "loss": 0.7452, + "step": 60 + }, + { + "epoch": 0.1952, + "grad_norm": 0.3081599535574942, + "learning_rate": 0.00018625370641935129, + "loss": 0.7559, + "step": 61 + }, + { + "epoch": 0.1984, + "grad_norm": 0.3244889807411165, + "learning_rate": 0.00018572268255071718, + "loss": 0.7584, + "step": 62 + }, + { + "epoch": 0.2016, + "grad_norm": 0.348431414936007, + "learning_rate": 0.00018518238231991218, + "loss": 0.8334, + "step": 63 + }, + { + "epoch": 0.2048, + "grad_norm": 0.33040662982295277, + "learning_rate": 0.00018463286419478255, + "loss": 0.7763, + "step": 64 + }, + { + "epoch": 0.208, + "grad_norm": 0.304768597360976, + "learning_rate": 0.00018407418764067627, + "loss": 0.7595, + "step": 65 + }, + { + "epoch": 0.2112, + "grad_norm": 0.32114990202854404, + "learning_rate": 0.00018350641311400812, + "loss": 0.7826, + "step": 66 + }, + { + "epoch": 0.2144, + "grad_norm": 0.34670888584990867, + "learning_rate": 0.0001829296020557174, + "loss": 0.8071, + "step": 67 + }, + { + "epoch": 0.2176, + "grad_norm": 0.3135050010328246, + "learning_rate": 0.00018234381688461942, + "loss": 0.7591, + "step": 68 + }, + { + "epoch": 0.2208, + "grad_norm": 0.4007165410841229, + "learning_rate": 0.0001817491209906506, + "loss": 0.8222, + "step": 69 + }, + { + "epoch": 0.224, + "grad_norm": 0.3098165798346559, + "learning_rate": 0.00018114557872800905, + "loss": 0.8276, + "step": 70 + }, + { + "epoch": 0.2272, + "grad_norm": 0.3366496343563119, + "learning_rate": 0.00018053325540819045, + "loss": 0.74, + "step": 71 + }, + { + "epoch": 0.2304, + "grad_norm": 0.3446479624854017, + "learning_rate": 0.0001799122172929206, + "loss": 0.8105, + "step": 72 + }, + { + "epoch": 0.2336, + "grad_norm": 0.32538947912217897, + "learning_rate": 0.00017928253158698473, + "loss": 0.7902, + "step": 73 + }, + { + "epoch": 0.2368, + "grad_norm": 0.3382513419761104, + "learning_rate": 0.0001786442664309554, + "loss": 0.7953, + "step": 74 + }, + { + "epoch": 0.24, + "grad_norm": 0.2902528244962728, + "learning_rate": 0.0001779974908938184, + "loss": 0.7831, + "step": 75 + }, + { + "epoch": 0.2432, + "grad_norm": 0.33295380789582807, + "learning_rate": 0.0001773422749654988, + "loss": 0.7932, + "step": 76 + }, + { + "epoch": 0.2464, + "grad_norm": 0.32491079311016224, + "learning_rate": 0.00017667868954928694, + "loss": 0.7834, + "step": 77 + }, + { + "epoch": 0.2496, + "grad_norm": 0.3219914116664492, + "learning_rate": 0.00017600680645416583, + "loss": 0.751, + "step": 78 + }, + { + "epoch": 0.2528, + "grad_norm": 0.30558099102554004, + "learning_rate": 0.00017532669838704035, + "loss": 0.7302, + "step": 79 + }, + { + "epoch": 0.256, + "grad_norm": 0.30736544484761913, + "learning_rate": 0.00017463843894486937, + "loss": 0.7667, + "step": 80 + }, + { + "epoch": 0.2592, + "grad_norm": 0.34356528061341995, + "learning_rate": 0.0001739421026067017, + "loss": 0.8141, + "step": 81 + }, + { + "epoch": 0.2624, + "grad_norm": 0.32407625571977955, + "learning_rate": 0.00017323776472561627, + "loss": 0.739, + "step": 82 + }, + { + "epoch": 0.2656, + "grad_norm": 0.31730106115240775, + "learning_rate": 0.00017252550152056795, + "loss": 0.7717, + "step": 83 + }, + { + "epoch": 0.2688, + "grad_norm": 0.3242457387120635, + "learning_rate": 0.0001718053900681397, + "loss": 0.776, + "step": 84 + }, + { + "epoch": 0.272, + "grad_norm": 0.37387503794119153, + "learning_rate": 0.00017107750829420176, + "loss": 0.7173, + "step": 85 + }, + { + "epoch": 0.2752, + "grad_norm": 0.36544910247697354, + "learning_rate": 0.00017034193496547902, + "loss": 0.7854, + "step": 86 + }, + { + "epoch": 0.2784, + "grad_norm": 0.3280297159655621, + "learning_rate": 0.00016959874968102735, + "loss": 0.7668, + "step": 87 + }, + { + "epoch": 0.2816, + "grad_norm": 0.3060306803673194, + "learning_rate": 0.00016884803286362, + "loss": 0.7784, + "step": 88 + }, + { + "epoch": 0.2848, + "grad_norm": 0.340343694183801, + "learning_rate": 0.00016808986575104465, + "loss": 0.8023, + "step": 89 + }, + { + "epoch": 0.288, + "grad_norm": 0.33147507262260134, + "learning_rate": 0.00016732433038731242, + "loss": 0.7603, + "step": 90 + }, + { + "epoch": 0.2912, + "grad_norm": 0.29967307880987004, + "learning_rate": 0.0001665515096137797, + "loss": 0.7351, + "step": 91 + }, + { + "epoch": 0.2944, + "grad_norm": 0.3070447135297137, + "learning_rate": 0.00016577148706018328, + "loss": 0.8144, + "step": 92 + }, + { + "epoch": 0.2976, + "grad_norm": 0.3371468656483107, + "learning_rate": 0.00016498434713559088, + "loss": 0.7643, + "step": 93 + }, + { + "epoch": 0.3008, + "grad_norm": 0.3301544042290975, + "learning_rate": 0.00016419017501926656, + "loss": 0.7455, + "step": 94 + }, + { + "epoch": 0.304, + "grad_norm": 0.35732461105198227, + "learning_rate": 0.0001633890566514535, + "loss": 0.7399, + "step": 95 + }, + { + "epoch": 0.3072, + "grad_norm": 0.3372671710177106, + "learning_rate": 0.00016258107872407375, + "loss": 0.7744, + "step": 96 + }, + { + "epoch": 0.3104, + "grad_norm": 0.32368170886745823, + "learning_rate": 0.0001617663286713474, + "loss": 0.7558, + "step": 97 + }, + { + "epoch": 0.3136, + "grad_norm": 0.347088503534774, + "learning_rate": 0.00016094489466033043, + "loss": 0.7792, + "step": 98 + }, + { + "epoch": 0.3168, + "grad_norm": 0.3098281533746313, + "learning_rate": 0.00016011686558137448, + "loss": 0.7418, + "step": 99 + }, + { + "epoch": 0.32, + "grad_norm": 0.29763542841261936, + "learning_rate": 0.0001592823310385073, + "loss": 0.7411, + "step": 100 + }, + { + "epoch": 0.3232, + "grad_norm": 0.3131951129561186, + "learning_rate": 0.0001584413813397364, + "loss": 0.7384, + "step": 101 + }, + { + "epoch": 0.3264, + "grad_norm": 0.33576606144185733, + "learning_rate": 0.00015759410748727662, + "loss": 0.7822, + "step": 102 + }, + { + "epoch": 0.3296, + "grad_norm": 0.29032650311854463, + "learning_rate": 0.00015674060116770236, + "loss": 0.7639, + "step": 103 + }, + { + "epoch": 0.3328, + "grad_norm": 0.33954362645047376, + "learning_rate": 0.00015588095474202595, + "loss": 0.7833, + "step": 104 + }, + { + "epoch": 0.336, + "grad_norm": 0.302213671546099, + "learning_rate": 0.00015501526123570277, + "loss": 0.7412, + "step": 105 + }, + { + "epoch": 0.3392, + "grad_norm": 0.3472468128228165, + "learning_rate": 0.00015414361432856475, + "loss": 0.7713, + "step": 106 + }, + { + "epoch": 0.3424, + "grad_norm": 0.3009835196525144, + "learning_rate": 0.0001532661083446829, + "loss": 0.7241, + "step": 107 + }, + { + "epoch": 0.3456, + "grad_norm": 0.31496230188332786, + "learning_rate": 0.00015238283824216015, + "loss": 0.7087, + "step": 108 + }, + { + "epoch": 0.3488, + "grad_norm": 0.32468960647814604, + "learning_rate": 0.00015149389960285558, + "loss": 0.7866, + "step": 109 + }, + { + "epoch": 0.352, + "grad_norm": 0.35142485947695556, + "learning_rate": 0.00015059938862204127, + "loss": 0.7743, + "step": 110 + }, + { + "epoch": 0.3552, + "grad_norm": 0.2984111047966567, + "learning_rate": 0.00014969940209799248, + "loss": 0.7115, + "step": 111 + }, + { + "epoch": 0.3584, + "grad_norm": 0.3122067236813943, + "learning_rate": 0.00014879403742151283, + "loss": 0.7541, + "step": 112 + }, + { + "epoch": 0.3616, + "grad_norm": 0.3061309555680417, + "learning_rate": 0.00014788339256539544, + "loss": 0.7588, + "step": 113 + }, + { + "epoch": 0.3648, + "grad_norm": 0.3330261141466801, + "learning_rate": 0.0001469675660738206, + "loss": 0.7725, + "step": 114 + }, + { + "epoch": 0.368, + "grad_norm": 0.2878265748509493, + "learning_rate": 0.00014604665705169237, + "loss": 0.7239, + "step": 115 + }, + { + "epoch": 0.3712, + "grad_norm": 0.32402311127004874, + "learning_rate": 0.00014512076515391375, + "loss": 0.7572, + "step": 116 + }, + { + "epoch": 0.3744, + "grad_norm": 0.2885443877226311, + "learning_rate": 0.00014418999057460276, + "loss": 0.7109, + "step": 117 + }, + { + "epoch": 0.3776, + "grad_norm": 0.3325984376260274, + "learning_rate": 0.0001432544340362501, + "loss": 0.7745, + "step": 118 + }, + { + "epoch": 0.3808, + "grad_norm": 0.35276302988307434, + "learning_rate": 0.00014231419677881966, + "loss": 0.6814, + "step": 119 + }, + { + "epoch": 0.384, + "grad_norm": 0.3743223954364047, + "learning_rate": 0.00014136938054879283, + "loss": 0.8025, + "step": 120 + }, + { + "epoch": 0.3872, + "grad_norm": 0.31250095434003256, + "learning_rate": 0.00014042008758815818, + "loss": 0.7543, + "step": 121 + }, + { + "epoch": 0.3904, + "grad_norm": 0.27751291074535644, + "learning_rate": 0.00013946642062334766, + "loss": 0.7146, + "step": 122 + }, + { + "epoch": 0.3936, + "grad_norm": 0.2916739695563485, + "learning_rate": 0.00013850848285411994, + "loss": 0.7413, + "step": 123 + }, + { + "epoch": 0.3968, + "grad_norm": 0.30375483290863786, + "learning_rate": 0.000137546377942393, + "loss": 0.7346, + "step": 124 + }, + { + "epoch": 0.4, + "grad_norm": 0.33201388009152266, + "learning_rate": 0.00013658021000102636, + "loss": 0.7116, + "step": 125 + }, + { + "epoch": 0.4032, + "grad_norm": 0.3372896307941712, + "learning_rate": 0.00013561008358255468, + "loss": 0.7415, + "step": 126 + }, + { + "epoch": 0.4064, + "grad_norm": 0.3061689590079631, + "learning_rate": 0.00013463610366787392, + "loss": 0.7273, + "step": 127 + }, + { + "epoch": 0.4096, + "grad_norm": 0.31379168320638867, + "learning_rate": 0.00013365837565488064, + "loss": 0.7291, + "step": 128 + }, + { + "epoch": 0.4128, + "grad_norm": 0.2934848153355263, + "learning_rate": 0.0001326770053470668, + "loss": 0.7222, + "step": 129 + }, + { + "epoch": 0.416, + "grad_norm": 0.3441301895087707, + "learning_rate": 0.0001316920989420703, + "loss": 0.7521, + "step": 130 + }, + { + "epoch": 0.4192, + "grad_norm": 0.33242144922282324, + "learning_rate": 0.00013070376302018287, + "loss": 0.758, + "step": 131 + }, + { + "epoch": 0.4224, + "grad_norm": 0.2998764306620887, + "learning_rate": 0.00012971210453281674, + "loss": 0.7362, + "step": 132 + }, + { + "epoch": 0.4256, + "grad_norm": 0.35453134249708435, + "learning_rate": 0.000128717230790931, + "loss": 0.8094, + "step": 133 + }, + { + "epoch": 0.4288, + "grad_norm": 0.318190568854741, + "learning_rate": 0.00012771924945341906, + "loss": 0.7685, + "step": 134 + }, + { + "epoch": 0.432, + "grad_norm": 0.2924483481496768, + "learning_rate": 0.00012671826851545851, + "loss": 0.701, + "step": 135 + }, + { + "epoch": 0.4352, + "grad_norm": 0.28860962608238744, + "learning_rate": 0.0001257143962968246, + "loss": 0.7048, + "step": 136 + }, + { + "epoch": 0.4384, + "grad_norm": 0.3223093593911342, + "learning_rate": 0.00012470774143016853, + "loss": 0.6787, + "step": 137 + }, + { + "epoch": 0.4416, + "grad_norm": 0.32652124911022334, + "learning_rate": 0.00012369841284926188, + "loss": 0.7751, + "step": 138 + }, + { + "epoch": 0.4448, + "grad_norm": 0.3378567546264251, + "learning_rate": 0.00012268651977720866, + "loss": 0.7057, + "step": 139 + }, + { + "epoch": 0.448, + "grad_norm": 0.3347874323081615, + "learning_rate": 0.00012167217171462566, + "loss": 0.7427, + "step": 140 + }, + { + "epoch": 0.4512, + "grad_norm": 0.2950258859285516, + "learning_rate": 0.0001206554784277931, + "loss": 0.7103, + "step": 141 + }, + { + "epoch": 0.4544, + "grad_norm": 0.29929474750451557, + "learning_rate": 0.00011963654993677645, + "loss": 0.7159, + "step": 142 + }, + { + "epoch": 0.4576, + "grad_norm": 0.3499940407779636, + "learning_rate": 0.00011861549650352069, + "loss": 0.7881, + "step": 143 + }, + { + "epoch": 0.4608, + "grad_norm": 0.30145031584516635, + "learning_rate": 0.00011759242861991855, + "loss": 0.716, + "step": 144 + }, + { + "epoch": 0.464, + "grad_norm": 0.38401430285844806, + "learning_rate": 0.00011656745699585371, + "loss": 0.7446, + "step": 145 + }, + { + "epoch": 0.4672, + "grad_norm": 0.33361204379628423, + "learning_rate": 0.00011554069254722051, + "loss": 0.7265, + "step": 146 + }, + { + "epoch": 0.4704, + "grad_norm": 0.322019046159023, + "learning_rate": 0.00011451224638392129, + "loss": 0.7728, + "step": 147 + }, + { + "epoch": 0.4736, + "grad_norm": 0.30675719855675915, + "learning_rate": 0.00011348222979784289, + "loss": 0.7094, + "step": 148 + }, + { + "epoch": 0.4768, + "grad_norm": 0.31580729976842725, + "learning_rate": 0.00011245075425081328, + "loss": 0.7369, + "step": 149 + }, + { + "epoch": 0.48, + "grad_norm": 0.28023263956373423, + "learning_rate": 0.00011141793136253986, + "loss": 0.7057, + "step": 150 + }, + { + "epoch": 0.4832, + "grad_norm": 0.3095266279847274, + "learning_rate": 0.0001103838728985307, + "loss": 0.7128, + "step": 151 + }, + { + "epoch": 0.4864, + "grad_norm": 0.30435117677738777, + "learning_rate": 0.000109348690758, + "loss": 0.7652, + "step": 152 + }, + { + "epoch": 0.4896, + "grad_norm": 0.2750215878076213, + "learning_rate": 0.00010831249696175918, + "loss": 0.683, + "step": 153 + }, + { + "epoch": 0.4928, + "grad_norm": 0.29938683531824195, + "learning_rate": 0.0001072754036400944, + "loss": 0.7781, + "step": 154 + }, + { + "epoch": 0.496, + "grad_norm": 0.2739445261182683, + "learning_rate": 0.00010623752302063283, + "loss": 0.7034, + "step": 155 + }, + { + "epoch": 0.4992, + "grad_norm": 0.3032647494750609, + "learning_rate": 0.00010519896741619803, + "loss": 0.733, + "step": 156 + }, + { + "epoch": 0.5024, + "grad_norm": 0.3588377553282501, + "learning_rate": 0.00010415984921265609, + "loss": 0.7565, + "step": 157 + }, + { + "epoch": 0.5056, + "grad_norm": 0.3252058405483004, + "learning_rate": 0.00010312028085675391, + "loss": 0.7689, + "step": 158 + }, + { + "epoch": 0.5088, + "grad_norm": 0.33937386250897783, + "learning_rate": 0.00010208037484395114, + "loss": 0.7685, + "step": 159 + }, + { + "epoch": 0.512, + "grad_norm": 0.2748694284336673, + "learning_rate": 0.00010104024370624644, + "loss": 0.7181, + "step": 160 + }, + { + "epoch": 0.5152, + "grad_norm": 0.30939208531233414, + "learning_rate": 0.0001, + "loss": 0.7431, + "step": 161 + }, + { + "epoch": 0.5184, + "grad_norm": 0.2738534135072514, + "learning_rate": 9.895975629375359e-05, + "loss": 0.7042, + "step": 162 + }, + { + "epoch": 0.5216, + "grad_norm": 0.3974092001806277, + "learning_rate": 9.791962515604887e-05, + "loss": 0.7634, + "step": 163 + }, + { + "epoch": 0.5248, + "grad_norm": 0.298415119173645, + "learning_rate": 9.687971914324607e-05, + "loss": 0.7466, + "step": 164 + }, + { + "epoch": 0.528, + "grad_norm": 0.30699336831051954, + "learning_rate": 9.584015078734395e-05, + "loss": 0.7284, + "step": 165 + }, + { + "epoch": 0.5312, + "grad_norm": 0.2901068669142659, + "learning_rate": 9.480103258380198e-05, + "loss": 0.719, + "step": 166 + }, + { + "epoch": 0.5344, + "grad_norm": 0.289122044479363, + "learning_rate": 9.376247697936719e-05, + "loss": 0.7059, + "step": 167 + }, + { + "epoch": 0.5376, + "grad_norm": 0.3030799207029953, + "learning_rate": 9.272459635990562e-05, + "loss": 0.6867, + "step": 168 + }, + { + "epoch": 0.5408, + "grad_norm": 0.3027641169951838, + "learning_rate": 9.168750303824084e-05, + "loss": 0.6983, + "step": 169 + }, + { + "epoch": 0.544, + "grad_norm": 0.29915201634856275, + "learning_rate": 9.065130924199998e-05, + "loss": 0.7066, + "step": 170 + }, + { + "epoch": 0.5472, + "grad_norm": 0.33877171450167437, + "learning_rate": 8.961612710146934e-05, + "loss": 0.7477, + "step": 171 + }, + { + "epoch": 0.5504, + "grad_norm": 0.3019004859637966, + "learning_rate": 8.858206863746018e-05, + "loss": 0.6896, + "step": 172 + }, + { + "epoch": 0.5536, + "grad_norm": 0.32101987397698073, + "learning_rate": 8.754924574918675e-05, + "loss": 0.7732, + "step": 173 + }, + { + "epoch": 0.5568, + "grad_norm": 0.29608955613782456, + "learning_rate": 8.651777020215712e-05, + "loss": 0.6975, + "step": 174 + }, + { + "epoch": 0.56, + "grad_norm": 0.3903341635561156, + "learning_rate": 8.548775361607872e-05, + "loss": 0.7325, + "step": 175 + }, + { + "epoch": 0.5632, + "grad_norm": 0.2846065516794577, + "learning_rate": 8.445930745277953e-05, + "loss": 0.6659, + "step": 176 + }, + { + "epoch": 0.5664, + "grad_norm": 0.27564468788525615, + "learning_rate": 8.343254300414628e-05, + "loss": 0.6608, + "step": 177 + }, + { + "epoch": 0.5696, + "grad_norm": 0.27705456068017625, + "learning_rate": 8.240757138008149e-05, + "loss": 0.7026, + "step": 178 + }, + { + "epoch": 0.5728, + "grad_norm": 0.27581471128823365, + "learning_rate": 8.138450349647936e-05, + "loss": 0.7589, + "step": 179 + }, + { + "epoch": 0.576, + "grad_norm": 0.297802511066448, + "learning_rate": 8.036345006322359e-05, + "loss": 0.7905, + "step": 180 + }, + { + "epoch": 0.5792, + "grad_norm": 0.3452322304584979, + "learning_rate": 7.934452157220694e-05, + "loss": 0.7608, + "step": 181 + }, + { + "epoch": 0.5824, + "grad_norm": 0.2928432329233493, + "learning_rate": 7.832782828537437e-05, + "loss": 0.6883, + "step": 182 + }, + { + "epoch": 0.5856, + "grad_norm": 0.29460004736129036, + "learning_rate": 7.731348022279134e-05, + "loss": 0.6631, + "step": 183 + }, + { + "epoch": 0.5888, + "grad_norm": 0.28123407758344715, + "learning_rate": 7.630158715073813e-05, + "loss": 0.7508, + "step": 184 + }, + { + "epoch": 0.592, + "grad_norm": 0.2843635254634615, + "learning_rate": 7.52922585698315e-05, + "loss": 0.7282, + "step": 185 + }, + { + "epoch": 0.5952, + "grad_norm": 0.34889895491799827, + "learning_rate": 7.428560370317542e-05, + "loss": 0.7743, + "step": 186 + }, + { + "epoch": 0.5984, + "grad_norm": 0.3425113999228756, + "learning_rate": 7.328173148454151e-05, + "loss": 0.7221, + "step": 187 + }, + { + "epoch": 0.6016, + "grad_norm": 0.30595814993670367, + "learning_rate": 7.228075054658096e-05, + "loss": 0.733, + "step": 188 + }, + { + "epoch": 0.6048, + "grad_norm": 0.35840367615756424, + "learning_rate": 7.1282769209069e-05, + "loss": 0.7561, + "step": 189 + }, + { + "epoch": 0.608, + "grad_norm": 0.33104462323668715, + "learning_rate": 7.028789546718326e-05, + "loss": 0.6991, + "step": 190 + }, + { + "epoch": 0.6112, + "grad_norm": 0.3394753446738843, + "learning_rate": 6.929623697981718e-05, + "loss": 0.7099, + "step": 191 + }, + { + "epoch": 0.6144, + "grad_norm": 0.32283853853312183, + "learning_rate": 6.830790105792973e-05, + "loss": 0.687, + "step": 192 + }, + { + "epoch": 0.6176, + "grad_norm": 0.3049519469814839, + "learning_rate": 6.732299465293322e-05, + "loss": 0.6855, + "step": 193 + }, + { + "epoch": 0.6208, + "grad_norm": 0.3095509339891271, + "learning_rate": 6.63416243451194e-05, + "loss": 0.7694, + "step": 194 + }, + { + "epoch": 0.624, + "grad_norm": 0.2854844000449528, + "learning_rate": 6.536389633212609e-05, + "loss": 0.6779, + "step": 195 + }, + { + "epoch": 0.6272, + "grad_norm": 0.30525546650522767, + "learning_rate": 6.43899164174453e-05, + "loss": 0.7081, + "step": 196 + }, + { + "epoch": 0.6304, + "grad_norm": 0.2833182748533542, + "learning_rate": 6.341978999897365e-05, + "loss": 0.7224, + "step": 197 + }, + { + "epoch": 0.6336, + "grad_norm": 0.2629180405414573, + "learning_rate": 6.245362205760704e-05, + "loss": 0.6747, + "step": 198 + }, + { + "epoch": 0.6368, + "grad_norm": 0.3037186612742775, + "learning_rate": 6.149151714588009e-05, + "loss": 0.7015, + "step": 199 + }, + { + "epoch": 0.64, + "grad_norm": 0.2579515333812123, + "learning_rate": 6.053357937665237e-05, + "loss": 0.6525, + "step": 200 + }, + { + "epoch": 0.6432, + "grad_norm": 0.3015359549294114, + "learning_rate": 5.957991241184184e-05, + "loss": 0.7437, + "step": 201 + }, + { + "epoch": 0.6464, + "grad_norm": 0.30328608937618484, + "learning_rate": 5.863061945120719e-05, + "loss": 0.7262, + "step": 202 + }, + { + "epoch": 0.6496, + "grad_norm": 0.3131312484861202, + "learning_rate": 5.768580322118034e-05, + "loss": 0.6913, + "step": 203 + }, + { + "epoch": 0.6528, + "grad_norm": 0.3428352859498749, + "learning_rate": 5.6745565963749925e-05, + "loss": 0.7192, + "step": 204 + }, + { + "epoch": 0.656, + "grad_norm": 0.32744225245031866, + "learning_rate": 5.5810009425397294e-05, + "loss": 0.681, + "step": 205 + }, + { + "epoch": 0.6592, + "grad_norm": 0.2789381341996287, + "learning_rate": 5.487923484608629e-05, + "loss": 0.7045, + "step": 206 + }, + { + "epoch": 0.6624, + "grad_norm": 0.29037036499383584, + "learning_rate": 5.395334294830765e-05, + "loss": 0.6812, + "step": 207 + }, + { + "epoch": 0.6656, + "grad_norm": 0.2985271031320008, + "learning_rate": 5.3032433926179395e-05, + "loss": 0.6829, + "step": 208 + }, + { + "epoch": 0.6688, + "grad_norm": 0.3011009096610186, + "learning_rate": 5.211660743460458e-05, + "loss": 0.7078, + "step": 209 + }, + { + "epoch": 0.672, + "grad_norm": 0.303730403749863, + "learning_rate": 5.1205962578487155e-05, + "loss": 0.7333, + "step": 210 + }, + { + "epoch": 0.6752, + "grad_norm": 0.2700747120155797, + "learning_rate": 5.030059790200756e-05, + "loss": 0.6845, + "step": 211 + }, + { + "epoch": 0.6784, + "grad_norm": 0.2945402305786069, + "learning_rate": 4.940061137795876e-05, + "loss": 0.7313, + "step": 212 + }, + { + "epoch": 0.6816, + "grad_norm": 0.3291065136259577, + "learning_rate": 4.850610039714444e-05, + "loss": 0.706, + "step": 213 + }, + { + "epoch": 0.6848, + "grad_norm": 0.3089232604005034, + "learning_rate": 4.761716175783989e-05, + "loss": 0.6608, + "step": 214 + }, + { + "epoch": 0.688, + "grad_norm": 0.29043173518454235, + "learning_rate": 4.673389165531714e-05, + "loss": 0.7111, + "step": 215 + }, + { + "epoch": 0.6912, + "grad_norm": 0.3072972304823298, + "learning_rate": 4.585638567143529e-05, + "loss": 0.7309, + "step": 216 + }, + { + "epoch": 0.6944, + "grad_norm": 0.29575293212282944, + "learning_rate": 4.498473876429726e-05, + "loss": 0.6941, + "step": 217 + }, + { + "epoch": 0.6976, + "grad_norm": 0.272191115063121, + "learning_rate": 4.411904525797408e-05, + "loss": 0.7105, + "step": 218 + }, + { + "epoch": 0.7008, + "grad_norm": 0.29872578885296647, + "learning_rate": 4.325939883229766e-05, + "loss": 0.7073, + "step": 219 + }, + { + "epoch": 0.704, + "grad_norm": 0.30797164640532304, + "learning_rate": 4.240589251272342e-05, + "loss": 0.6731, + "step": 220 + }, + { + "epoch": 0.7072, + "grad_norm": 0.30888955330178547, + "learning_rate": 4.155861866026364e-05, + "loss": 0.6875, + "step": 221 + }, + { + "epoch": 0.7104, + "grad_norm": 0.3011025807384547, + "learning_rate": 4.071766896149273e-05, + "loss": 0.7361, + "step": 222 + }, + { + "epoch": 0.7136, + "grad_norm": 0.2634763705716138, + "learning_rate": 3.988313441862553e-05, + "loss": 0.6584, + "step": 223 + }, + { + "epoch": 0.7168, + "grad_norm": 0.27702377174703136, + "learning_rate": 3.9055105339669595e-05, + "loss": 0.7055, + "step": 224 + }, + { + "epoch": 0.72, + "grad_norm": 0.4738064858879691, + "learning_rate": 3.823367132865265e-05, + "loss": 0.7186, + "step": 225 + }, + { + "epoch": 0.7232, + "grad_norm": 0.3216294244730204, + "learning_rate": 3.741892127592625e-05, + "loss": 0.7234, + "step": 226 + }, + { + "epoch": 0.7264, + "grad_norm": 0.31551523504983137, + "learning_rate": 3.6610943348546526e-05, + "loss": 0.7019, + "step": 227 + }, + { + "epoch": 0.7296, + "grad_norm": 0.29166142886120305, + "learning_rate": 3.580982498073344e-05, + "loss": 0.7063, + "step": 228 + }, + { + "epoch": 0.7328, + "grad_norm": 0.27317702545893363, + "learning_rate": 3.501565286440914e-05, + "loss": 0.668, + "step": 229 + }, + { + "epoch": 0.736, + "grad_norm": 0.3192281891362774, + "learning_rate": 3.422851293981676e-05, + "loss": 0.7469, + "step": 230 + }, + { + "epoch": 0.7392, + "grad_norm": 0.35440201327111076, + "learning_rate": 3.3448490386220355e-05, + "loss": 0.7452, + "step": 231 + }, + { + "epoch": 0.7424, + "grad_norm": 0.28502204185604074, + "learning_rate": 3.2675669612687565e-05, + "loss": 0.6923, + "step": 232 + }, + { + "epoch": 0.7456, + "grad_norm": 0.29945490920230516, + "learning_rate": 3.191013424895536e-05, + "loss": 0.6925, + "step": 233 + }, + { + "epoch": 0.7488, + "grad_norm": 0.29575834955378427, + "learning_rate": 3.115196713638e-05, + "loss": 0.7027, + "step": 234 + }, + { + "epoch": 0.752, + "grad_norm": 0.3291841843584088, + "learning_rate": 3.040125031897264e-05, + "loss": 0.7127, + "step": 235 + }, + { + "epoch": 0.7552, + "grad_norm": 0.28583705498573, + "learning_rate": 2.9658065034520978e-05, + "loss": 0.7333, + "step": 236 + }, + { + "epoch": 0.7584, + "grad_norm": 0.3181846960491521, + "learning_rate": 2.892249170579826e-05, + "loss": 0.658, + "step": 237 + }, + { + "epoch": 0.7616, + "grad_norm": 0.2819914163211931, + "learning_rate": 2.8194609931860316e-05, + "loss": 0.7208, + "step": 238 + }, + { + "epoch": 0.7648, + "grad_norm": 0.31755915879086516, + "learning_rate": 2.7474498479432087e-05, + "loss": 0.7026, + "step": 239 + }, + { + "epoch": 0.768, + "grad_norm": 0.36200326820967205, + "learning_rate": 2.6762235274383772e-05, + "loss": 0.7343, + "step": 240 + }, + { + "epoch": 0.7712, + "grad_norm": 0.27246946326238036, + "learning_rate": 2.6057897393298324e-05, + "loss": 0.6493, + "step": 241 + }, + { + "epoch": 0.7744, + "grad_norm": 0.30218563568514123, + "learning_rate": 2.536156105513062e-05, + "loss": 0.6929, + "step": 242 + }, + { + "epoch": 0.7776, + "grad_norm": 0.28757076182540703, + "learning_rate": 2.4673301612959654e-05, + "loss": 0.6935, + "step": 243 + }, + { + "epoch": 0.7808, + "grad_norm": 0.27698386751868853, + "learning_rate": 2.399319354583418e-05, + "loss": 0.7087, + "step": 244 + }, + { + "epoch": 0.784, + "grad_norm": 0.28280472449769867, + "learning_rate": 2.3321310450713062e-05, + "loss": 0.7087, + "step": 245 + }, + { + "epoch": 0.7872, + "grad_norm": 0.2651273458522423, + "learning_rate": 2.265772503450122e-05, + "loss": 0.6997, + "step": 246 + }, + { + "epoch": 0.7904, + "grad_norm": 0.33121071922944717, + "learning_rate": 2.2002509106181624e-05, + "loss": 0.7048, + "step": 247 + }, + { + "epoch": 0.7936, + "grad_norm": 0.38190444874022617, + "learning_rate": 2.1355733569044635e-05, + "loss": 0.7801, + "step": 248 + }, + { + "epoch": 0.7968, + "grad_norm": 0.27849697998142847, + "learning_rate": 2.0717468413015283e-05, + "loss": 0.6905, + "step": 249 + }, + { + "epoch": 0.8, + "grad_norm": 0.26362848291419816, + "learning_rate": 2.008778270707944e-05, + "loss": 0.6528, + "step": 250 + }, + { + "epoch": 0.8032, + "grad_norm": 0.29590890139488923, + "learning_rate": 1.946674459180955e-05, + "loss": 0.7182, + "step": 251 + }, + { + "epoch": 0.8064, + "grad_norm": 0.2842179617272862, + "learning_rate": 1.8854421271990964e-05, + "loss": 0.7079, + "step": 252 + }, + { + "epoch": 0.8096, + "grad_norm": 0.3220612731779135, + "learning_rate": 1.8250879009349398e-05, + "loss": 0.7432, + "step": 253 + }, + { + "epoch": 0.8128, + "grad_norm": 0.31266632092760416, + "learning_rate": 1.7656183115380577e-05, + "loss": 0.6949, + "step": 254 + }, + { + "epoch": 0.816, + "grad_norm": 0.28786957578583217, + "learning_rate": 1.707039794428259e-05, + "loss": 0.6744, + "step": 255 + }, + { + "epoch": 0.8192, + "grad_norm": 0.29790664374884945, + "learning_rate": 1.649358688599191e-05, + "loss": 0.6948, + "step": 256 + }, + { + "epoch": 0.8224, + "grad_norm": 0.3172355195642624, + "learning_rate": 1.5925812359323745e-05, + "loss": 0.6821, + "step": 257 + }, + { + "epoch": 0.8256, + "grad_norm": 0.2908686579144822, + "learning_rate": 1.5367135805217458e-05, + "loss": 0.6693, + "step": 258 + }, + { + "epoch": 0.8288, + "grad_norm": 0.3241058543780178, + "learning_rate": 1.4817617680087825e-05, + "loss": 0.7154, + "step": 259 + }, + { + "epoch": 0.832, + "grad_norm": 0.3341389142195567, + "learning_rate": 1.4277317449282834e-05, + "loss": 0.7114, + "step": 260 + }, + { + "epoch": 0.8352, + "grad_norm": 0.27604065366111885, + "learning_rate": 1.3746293580648717e-05, + "loss": 0.7347, + "step": 261 + }, + { + "epoch": 0.8384, + "grad_norm": 0.26797015474074914, + "learning_rate": 1.3224603538202929e-05, + "loss": 0.6808, + "step": 262 + }, + { + "epoch": 0.8416, + "grad_norm": 0.26746248447865817, + "learning_rate": 1.2712303775915802e-05, + "loss": 0.6762, + "step": 263 + }, + { + "epoch": 0.8448, + "grad_norm": 0.26681525542349926, + "learning_rate": 1.220944973160133e-05, + "loss": 0.6696, + "step": 264 + }, + { + "epoch": 0.848, + "grad_norm": 0.2788077513972515, + "learning_rate": 1.1716095820918216e-05, + "loss": 0.6846, + "step": 265 + }, + { + "epoch": 0.8512, + "grad_norm": 0.32515261075444146, + "learning_rate": 1.1232295431481222e-05, + "loss": 0.7328, + "step": 266 + }, + { + "epoch": 0.8544, + "grad_norm": 0.283302708290937, + "learning_rate": 1.0758100917083991e-05, + "loss": 0.6677, + "step": 267 + }, + { + "epoch": 0.8576, + "grad_norm": 0.27776316297126497, + "learning_rate": 1.0293563592033595e-05, + "loss": 0.7168, + "step": 268 + }, + { + "epoch": 0.8608, + "grad_norm": 0.3150518012713109, + "learning_rate": 9.838733725597615e-06, + "loss": 0.6685, + "step": 269 + }, + { + "epoch": 0.864, + "grad_norm": 0.28243642250546974, + "learning_rate": 9.393660536564408e-06, + "loss": 0.6789, + "step": 270 + }, + { + "epoch": 0.8672, + "grad_norm": 0.31271458346800596, + "learning_rate": 8.958392187916841e-06, + "loss": 0.7471, + "step": 271 + }, + { + "epoch": 0.8704, + "grad_norm": 0.3162378069331692, + "learning_rate": 8.532975781620512e-06, + "loss": 0.7485, + "step": 272 + }, + { + "epoch": 0.8736, + "grad_norm": 0.3326107943100678, + "learning_rate": 8.117457353526625e-06, + "loss": 0.7241, + "step": 273 + }, + { + "epoch": 0.8768, + "grad_norm": 0.2881515697148622, + "learning_rate": 7.711881868390291e-06, + "loss": 0.6297, + "step": 274 + }, + { + "epoch": 0.88, + "grad_norm": 0.29488227027303954, + "learning_rate": 7.3162932150046885e-06, + "loss": 0.7256, + "step": 275 + }, + { + "epoch": 0.8832, + "grad_norm": 0.2820180332448235, + "learning_rate": 6.930734201451816e-06, + "loss": 0.7046, + "step": 276 + }, + { + "epoch": 0.8864, + "grad_norm": 0.30787216006833723, + "learning_rate": 6.555246550469907e-06, + "loss": 0.6933, + "step": 277 + }, + { + "epoch": 0.8896, + "grad_norm": 0.3034242260555174, + "learning_rate": 6.189870894938587e-06, + "loss": 0.6961, + "step": 278 + }, + { + "epoch": 0.8928, + "grad_norm": 0.34308439843199623, + "learning_rate": 5.834646773481811e-06, + "loss": 0.7172, + "step": 279 + }, + { + "epoch": 0.896, + "grad_norm": 0.32591428798684924, + "learning_rate": 5.489612626189245e-06, + "loss": 0.7055, + "step": 280 + }, + { + "epoch": 0.8992, + "grad_norm": 0.2900530208408788, + "learning_rate": 5.154805790456485e-06, + "loss": 0.7069, + "step": 281 + }, + { + "epoch": 0.9024, + "grad_norm": 0.3347468750592578, + "learning_rate": 4.830262496944693e-06, + "loss": 0.7138, + "step": 282 + }, + { + "epoch": 0.9056, + "grad_norm": 0.2977111126537549, + "learning_rate": 4.516017865659949e-06, + "loss": 0.6741, + "step": 283 + }, + { + "epoch": 0.9088, + "grad_norm": 0.2874759354056903, + "learning_rate": 4.21210590215273e-06, + "loss": 0.6745, + "step": 284 + }, + { + "epoch": 0.912, + "grad_norm": 0.32717632776157757, + "learning_rate": 3.918559493838114e-06, + "loss": 0.7036, + "step": 285 + }, + { + "epoch": 0.9152, + "grad_norm": 0.2965440455424347, + "learning_rate": 3.6354104064368566e-06, + "loss": 0.6976, + "step": 286 + }, + { + "epoch": 0.9184, + "grad_norm": 0.3032387027931161, + "learning_rate": 3.3626892805379562e-06, + "loss": 0.6848, + "step": 287 + }, + { + "epoch": 0.9216, + "grad_norm": 0.2757262556517274, + "learning_rate": 3.100425628282899e-06, + "loss": 0.6848, + "step": 288 + }, + { + "epoch": 0.9248, + "grad_norm": 0.28723381852699836, + "learning_rate": 2.848647830172024e-06, + "loss": 0.6497, + "step": 289 + }, + { + "epoch": 0.928, + "grad_norm": 0.3411539336294208, + "learning_rate": 2.607383131993424e-06, + "loss": 0.6842, + "step": 290 + }, + { + "epoch": 0.9312, + "grad_norm": 0.3831063367203094, + "learning_rate": 2.3766576418745022e-06, + "loss": 0.7287, + "step": 291 + }, + { + "epoch": 0.9344, + "grad_norm": 0.3119361218684643, + "learning_rate": 2.1564963274568027e-06, + "loss": 0.6687, + "step": 292 + }, + { + "epoch": 0.9376, + "grad_norm": 0.29122409002191013, + "learning_rate": 1.9469230131940907e-06, + "loss": 0.6698, + "step": 293 + }, + { + "epoch": 0.9408, + "grad_norm": 0.2933627610749106, + "learning_rate": 1.7479603777742938e-06, + "loss": 0.6876, + "step": 294 + }, + { + "epoch": 0.944, + "grad_norm": 0.2781149714045268, + "learning_rate": 1.559629951665298e-06, + "loss": 0.7088, + "step": 295 + }, + { + "epoch": 0.9472, + "grad_norm": 0.3046275387540745, + "learning_rate": 1.3819521147851123e-06, + "loss": 0.7088, + "step": 296 + }, + { + "epoch": 0.9504, + "grad_norm": 0.3000535326506705, + "learning_rate": 1.2149460942964098e-06, + "loss": 0.6684, + "step": 297 + }, + { + "epoch": 0.9536, + "grad_norm": 0.3034992720452355, + "learning_rate": 1.05862996252597e-06, + "loss": 0.6749, + "step": 298 + }, + { + "epoch": 0.9568, + "grad_norm": 0.2978087268151753, + "learning_rate": 9.130206350089765e-07, + "loss": 0.71, + "step": 299 + }, + { + "epoch": 0.96, + "grad_norm": 0.2943213594302957, + "learning_rate": 7.781338686584927e-07, + "loss": 0.631, + "step": 300 + }, + { + "epoch": 0.9632, + "grad_norm": 0.2837772097273212, + "learning_rate": 6.539842600603918e-07, + "loss": 0.7203, + "step": 301 + }, + { + "epoch": 0.9664, + "grad_norm": 0.27747160009113364, + "learning_rate": 5.405852438937764e-07, + "loss": 0.7056, + "step": 302 + }, + { + "epoch": 0.9696, + "grad_norm": 0.3021156038593126, + "learning_rate": 4.3794909147720773e-07, + "loss": 0.6424, + "step": 303 + }, + { + "epoch": 0.9728, + "grad_norm": 0.28447776946219105, + "learning_rate": 3.4608690944071263e-07, + "loss": 0.6965, + "step": 304 + }, + { + "epoch": 0.976, + "grad_norm": 0.2904170570582678, + "learning_rate": 2.6500863852395584e-07, + "loss": 0.688, + "step": 305 + }, + { + "epoch": 0.9792, + "grad_norm": 0.28693854411619896, + "learning_rate": 1.947230525005006e-07, + "loss": 0.6536, + "step": 306 + }, + { + "epoch": 0.9824, + "grad_norm": 0.2830799509701968, + "learning_rate": 1.3523775722834587e-07, + "loss": 0.6752, + "step": 307 + }, + { + "epoch": 0.9856, + "grad_norm": 0.43784320401705396, + "learning_rate": 8.655918982689581e-08, + "loss": 0.7293, + "step": 308 + }, + { + "epoch": 0.9888, + "grad_norm": 0.25592207413800777, + "learning_rate": 4.8692617980350406e-08, + "loss": 0.6987, + "step": 309 + }, + { + "epoch": 0.992, + "grad_norm": 0.2750153014419686, + "learning_rate": 2.164213936770576e-08, + "loss": 0.6692, + "step": 310 + }, + { + "epoch": 0.9952, + "grad_norm": 0.2684110083374372, + "learning_rate": 5.410681219286673e-09, + "loss": 0.6751, + "step": 311 + }, + { + "epoch": 0.9984, + "grad_norm": 0.32660971726192795, + "learning_rate": 0.0, + "loss": 0.6829, + "step": 312 + }, + { + "epoch": 0.9984, + "step": 312, + "total_flos": 795733460516864.0, + "train_loss": 0.7523357088749225, + "train_runtime": 9661.7281, + "train_samples_per_second": 1.035, + "train_steps_per_second": 0.032 + } + ], + "logging_steps": 1.0, + "max_steps": 312, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 795733460516864.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_3_epochs_1_GA_2_lora/README.md b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_3_epochs_1_GA_2_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_3_epochs_1_GA_2_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_3_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_3_epochs_1_GA_2_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e919187fbf913713e69fb440e8110e4d25b1ac30 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_3_epochs_1_GA_2_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "up_proj", + "k_proj", + "q_proj", + "down_proj", + "gate_proj", + "o_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0b62957b9c7840f6cb568c86e2863a0f225c3555 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9d1227e4dd9a0cfde2fe69b24b286992bbae19535fa882a8f96b47d953c9801 +size 671150064 diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_3_epochs_1_GA_2_lora/config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_3_epochs_1_GA_2_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_3_epochs_1_GA_2_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..a8da24431df71599ffeb6b080b8db6bc7a04c380 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fef31742711d8236f4f4c850e0934aec980db88afeb35d572b9a68885e93e357 +size 918507402 diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_3_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_3_epochs_1_GA_2_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..8f69d43c2a1ec9fb65d169acf964409d47d0ae3b --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_3_epochs_1_GA_2_lora/trainer_state.json @@ -0,0 +1,4417 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 625, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0016, + "grad_norm": 0.9307507358110347, + "learning_rate": 1.0526315789473684e-05, + "loss": 1.3678, + "step": 1 + }, + { + "epoch": 0.0032, + "grad_norm": 1.1911510039507764, + "learning_rate": 2.105263157894737e-05, + "loss": 1.5739, + "step": 2 + }, + { + "epoch": 0.0048, + "grad_norm": 1.1329261461162867, + "learning_rate": 3.157894736842105e-05, + "loss": 1.5704, + "step": 3 + }, + { + "epoch": 0.0064, + "grad_norm": 0.7765093235427704, + "learning_rate": 4.210526315789474e-05, + "loss": 1.2855, + "step": 4 + }, + { + "epoch": 0.008, + "grad_norm": 0.8969381600574139, + "learning_rate": 5.2631578947368424e-05, + "loss": 1.2613, + "step": 5 + }, + { + "epoch": 0.0096, + "grad_norm": 0.8953363951786134, + "learning_rate": 6.31578947368421e-05, + "loss": 1.192, + "step": 6 + }, + { + "epoch": 0.0112, + "grad_norm": 0.9418089360353965, + "learning_rate": 7.368421052631579e-05, + "loss": 1.1738, + "step": 7 + }, + { + "epoch": 0.0128, + "grad_norm": 1.3012452150233644, + "learning_rate": 8.421052631578948e-05, + "loss": 1.0476, + "step": 8 + }, + { + "epoch": 0.0144, + "grad_norm": 0.8321289166655941, + "learning_rate": 9.473684210526316e-05, + "loss": 0.9973, + "step": 9 + }, + { + "epoch": 0.016, + "grad_norm": 0.8764791327346797, + "learning_rate": 0.00010526315789473685, + "loss": 1.0245, + "step": 10 + }, + { + "epoch": 0.0176, + "grad_norm": 0.6311716463900636, + "learning_rate": 0.00011578947368421053, + "loss": 0.9404, + "step": 11 + }, + { + "epoch": 0.0192, + "grad_norm": 0.6250088287070071, + "learning_rate": 0.0001263157894736842, + "loss": 0.9513, + "step": 12 + }, + { + "epoch": 0.0208, + "grad_norm": 0.5785281004669424, + "learning_rate": 0.0001368421052631579, + "loss": 0.9685, + "step": 13 + }, + { + "epoch": 0.0224, + "grad_norm": 0.626495141490426, + "learning_rate": 0.00014736842105263158, + "loss": 0.947, + "step": 14 + }, + { + "epoch": 0.024, + "grad_norm": 0.5326607517261951, + "learning_rate": 0.00015789473684210527, + "loss": 0.8805, + "step": 15 + }, + { + "epoch": 0.0256, + "grad_norm": 0.5238320523852994, + "learning_rate": 0.00016842105263157895, + "loss": 0.9079, + "step": 16 + }, + { + "epoch": 0.0272, + "grad_norm": 0.6599520510016996, + "learning_rate": 0.00017894736842105264, + "loss": 0.9736, + "step": 17 + }, + { + "epoch": 0.0288, + "grad_norm": 0.5935929471408434, + "learning_rate": 0.00018947368421052632, + "loss": 0.9538, + "step": 18 + }, + { + "epoch": 0.0304, + "grad_norm": 0.5214400161127966, + "learning_rate": 0.0002, + "loss": 0.9133, + "step": 19 + }, + { + "epoch": 0.032, + "grad_norm": 0.49158434527043327, + "learning_rate": 0.00019999865623437013, + "loss": 0.8529, + "step": 20 + }, + { + "epoch": 0.0336, + "grad_norm": 0.5761943028543544, + "learning_rate": 0.00019999462497359466, + "loss": 0.8389, + "step": 21 + }, + { + "epoch": 0.0352, + "grad_norm": 0.5240754907632433, + "learning_rate": 0.00019998790632601496, + "loss": 0.8684, + "step": 22 + }, + { + "epoch": 0.0368, + "grad_norm": 0.6319870835376165, + "learning_rate": 0.0001999785004721968, + "loss": 0.8933, + "step": 23 + }, + { + "epoch": 0.0384, + "grad_norm": 0.4398463654248781, + "learning_rate": 0.00019996640766492543, + "loss": 0.8404, + "step": 24 + }, + { + "epoch": 0.04, + "grad_norm": 0.5354998313987915, + "learning_rate": 0.00019995162822919883, + "loss": 0.8627, + "step": 25 + }, + { + "epoch": 0.0416, + "grad_norm": 0.5296954730016616, + "learning_rate": 0.00019993416256221895, + "loss": 0.9904, + "step": 26 + }, + { + "epoch": 0.0432, + "grad_norm": 0.5545084227982735, + "learning_rate": 0.00019991401113338104, + "loss": 0.827, + "step": 27 + }, + { + "epoch": 0.0448, + "grad_norm": 0.49894046941791115, + "learning_rate": 0.00019989117448426108, + "loss": 0.8063, + "step": 28 + }, + { + "epoch": 0.0464, + "grad_norm": 0.7330695616422173, + "learning_rate": 0.00019986565322860115, + "loss": 0.8793, + "step": 29 + }, + { + "epoch": 0.048, + "grad_norm": 0.5504392087500234, + "learning_rate": 0.00019983744805229296, + "loss": 0.9445, + "step": 30 + }, + { + "epoch": 0.0496, + "grad_norm": 0.5138771247819872, + "learning_rate": 0.00019980655971335945, + "loss": 0.9532, + "step": 31 + }, + { + "epoch": 0.0512, + "grad_norm": 0.5357310231971444, + "learning_rate": 0.00019977298904193437, + "loss": 0.8373, + "step": 32 + }, + { + "epoch": 0.0528, + "grad_norm": 0.436458649534939, + "learning_rate": 0.00019973673694024, + "loss": 0.8142, + "step": 33 + }, + { + "epoch": 0.0544, + "grad_norm": 0.43637696361672595, + "learning_rate": 0.00019969780438256293, + "loss": 0.8147, + "step": 34 + }, + { + "epoch": 0.056, + "grad_norm": 0.47769382880518035, + "learning_rate": 0.0001996561924152278, + "loss": 0.8219, + "step": 35 + }, + { + "epoch": 0.0576, + "grad_norm": 0.5691186156905623, + "learning_rate": 0.0001996119021565693, + "loss": 0.8832, + "step": 36 + }, + { + "epoch": 0.0592, + "grad_norm": 0.4403064208176803, + "learning_rate": 0.0001995649347969019, + "loss": 0.8122, + "step": 37 + }, + { + "epoch": 0.0608, + "grad_norm": 0.43879736958934057, + "learning_rate": 0.00019951529159848805, + "loss": 0.8792, + "step": 38 + }, + { + "epoch": 0.0624, + "grad_norm": 0.4950759543534755, + "learning_rate": 0.00019946297389550433, + "loss": 0.9137, + "step": 39 + }, + { + "epoch": 0.064, + "grad_norm": 0.5544219940135219, + "learning_rate": 0.00019940798309400526, + "loss": 0.7792, + "step": 40 + }, + { + "epoch": 0.0656, + "grad_norm": 0.5696165767831806, + "learning_rate": 0.0001993503206718859, + "loss": 0.8525, + "step": 41 + }, + { + "epoch": 0.0672, + "grad_norm": 0.8056279621643284, + "learning_rate": 0.00019928998817884182, + "loss": 0.8261, + "step": 42 + }, + { + "epoch": 0.0688, + "grad_norm": 0.645373156488571, + "learning_rate": 0.00019922698723632767, + "loss": 0.8033, + "step": 43 + }, + { + "epoch": 0.0704, + "grad_norm": 0.5809821811726844, + "learning_rate": 0.00019916131953751342, + "loss": 1.0064, + "step": 44 + }, + { + "epoch": 0.072, + "grad_norm": 0.5237957226770983, + "learning_rate": 0.00019909298684723904, + "loss": 0.8589, + "step": 45 + }, + { + "epoch": 0.0736, + "grad_norm": 0.4787740915558479, + "learning_rate": 0.00019902199100196697, + "loss": 0.8778, + "step": 46 + }, + { + "epoch": 0.0752, + "grad_norm": 0.45753284287165796, + "learning_rate": 0.00019894833390973266, + "loss": 0.81, + "step": 47 + }, + { + "epoch": 0.0768, + "grad_norm": 0.9398745375460666, + "learning_rate": 0.00019887201755009357, + "loss": 0.8323, + "step": 48 + }, + { + "epoch": 0.0784, + "grad_norm": 0.4529340832996081, + "learning_rate": 0.0001987930439740757, + "loss": 0.8218, + "step": 49 + }, + { + "epoch": 0.08, + "grad_norm": 0.666389495376452, + "learning_rate": 0.00019871141530411853, + "loss": 0.8682, + "step": 50 + }, + { + "epoch": 0.0816, + "grad_norm": 0.512483360726018, + "learning_rate": 0.0001986271337340182, + "loss": 0.8956, + "step": 51 + }, + { + "epoch": 0.0832, + "grad_norm": 0.4615015180628129, + "learning_rate": 0.00019854020152886814, + "loss": 0.8119, + "step": 52 + }, + { + "epoch": 0.0848, + "grad_norm": 0.41822728845913026, + "learning_rate": 0.0001984506210249986, + "loss": 0.781, + "step": 53 + }, + { + "epoch": 0.0864, + "grad_norm": 0.439843201047384, + "learning_rate": 0.00019835839462991361, + "loss": 0.8696, + "step": 54 + }, + { + "epoch": 0.088, + "grad_norm": 0.4296610243379561, + "learning_rate": 0.00019826352482222638, + "loss": 0.8211, + "step": 55 + }, + { + "epoch": 0.0896, + "grad_norm": 0.5155399902660266, + "learning_rate": 0.00019816601415159263, + "loss": 0.8404, + "step": 56 + }, + { + "epoch": 0.0912, + "grad_norm": 0.4662602814333803, + "learning_rate": 0.0001980658652386421, + "loss": 0.8231, + "step": 57 + }, + { + "epoch": 0.0928, + "grad_norm": 0.4269443465627204, + "learning_rate": 0.00019796308077490817, + "loss": 0.8061, + "step": 58 + }, + { + "epoch": 0.0944, + "grad_norm": 0.5868211754520536, + "learning_rate": 0.00019785766352275542, + "loss": 0.8667, + "step": 59 + }, + { + "epoch": 0.096, + "grad_norm": 0.5401700531380792, + "learning_rate": 0.00019774961631530545, + "loss": 0.8249, + "step": 60 + }, + { + "epoch": 0.0976, + "grad_norm": 0.5275268741042621, + "learning_rate": 0.00019763894205636072, + "loss": 0.8194, + "step": 61 + }, + { + "epoch": 0.0992, + "grad_norm": 0.4950585563274784, + "learning_rate": 0.00019752564372032657, + "loss": 0.7906, + "step": 62 + }, + { + "epoch": 0.1008, + "grad_norm": 0.4594902337967978, + "learning_rate": 0.00019740972435213115, + "loss": 0.845, + "step": 63 + }, + { + "epoch": 0.1024, + "grad_norm": 0.5555461336900863, + "learning_rate": 0.00019729118706714375, + "loss": 0.8507, + "step": 64 + }, + { + "epoch": 0.104, + "grad_norm": 0.46015831447117106, + "learning_rate": 0.00019717003505109095, + "loss": 0.8132, + "step": 65 + }, + { + "epoch": 0.1056, + "grad_norm": 0.43769054593724865, + "learning_rate": 0.00019704627155997108, + "loss": 0.8251, + "step": 66 + }, + { + "epoch": 0.1072, + "grad_norm": 0.4309759425171532, + "learning_rate": 0.00019691989991996663, + "loss": 0.8562, + "step": 67 + }, + { + "epoch": 0.1088, + "grad_norm": 0.4556328773535883, + "learning_rate": 0.0001967909235273549, + "loss": 0.7385, + "step": 68 + }, + { + "epoch": 0.1104, + "grad_norm": 0.564216800136952, + "learning_rate": 0.00019665934584841682, + "loss": 0.8794, + "step": 69 + }, + { + "epoch": 0.112, + "grad_norm": 0.5820634576821899, + "learning_rate": 0.00019652517041934356, + "loss": 0.8319, + "step": 70 + }, + { + "epoch": 0.1136, + "grad_norm": 0.49068764874375215, + "learning_rate": 0.00019638840084614182, + "loss": 0.8562, + "step": 71 + }, + { + "epoch": 0.1152, + "grad_norm": 0.4262934451493423, + "learning_rate": 0.00019624904080453655, + "loss": 0.7794, + "step": 72 + }, + { + "epoch": 0.1168, + "grad_norm": 0.43154854225837036, + "learning_rate": 0.00019610709403987246, + "loss": 0.7582, + "step": 73 + }, + { + "epoch": 0.1184, + "grad_norm": 0.41467307557921185, + "learning_rate": 0.00019596256436701324, + "loss": 0.8181, + "step": 74 + }, + { + "epoch": 0.12, + "grad_norm": 0.43925945137208655, + "learning_rate": 0.000195815455670239, + "loss": 0.7741, + "step": 75 + }, + { + "epoch": 0.1216, + "grad_norm": 0.46227512591367387, + "learning_rate": 0.00019566577190314197, + "loss": 0.7864, + "step": 76 + }, + { + "epoch": 0.1232, + "grad_norm": 0.4419905355200034, + "learning_rate": 0.0001955135170885202, + "loss": 0.7695, + "step": 77 + }, + { + "epoch": 0.1248, + "grad_norm": 0.5165657370156446, + "learning_rate": 0.00019535869531826937, + "loss": 0.7723, + "step": 78 + }, + { + "epoch": 0.1264, + "grad_norm": 0.4403033444346165, + "learning_rate": 0.00019520131075327298, + "loss": 0.8572, + "step": 79 + }, + { + "epoch": 0.128, + "grad_norm": 0.47897736403075963, + "learning_rate": 0.00019504136762329047, + "loss": 0.8189, + "step": 80 + }, + { + "epoch": 0.1296, + "grad_norm": 0.4599115930160895, + "learning_rate": 0.00019487887022684336, + "loss": 0.8252, + "step": 81 + }, + { + "epoch": 0.1312, + "grad_norm": 0.40066554938838556, + "learning_rate": 0.00019471382293110003, + "loss": 0.7582, + "step": 82 + }, + { + "epoch": 0.1328, + "grad_norm": 0.4984653951622605, + "learning_rate": 0.00019454623017175812, + "loss": 0.8255, + "step": 83 + }, + { + "epoch": 0.1344, + "grad_norm": 0.5397071279329464, + "learning_rate": 0.00019437609645292546, + "loss": 0.8245, + "step": 84 + }, + { + "epoch": 0.136, + "grad_norm": 0.42163317045894194, + "learning_rate": 0.0001942034263469989, + "loss": 0.7135, + "step": 85 + }, + { + "epoch": 0.1376, + "grad_norm": 0.4888518851875543, + "learning_rate": 0.00019402822449454153, + "loss": 0.7642, + "step": 86 + }, + { + "epoch": 0.1392, + "grad_norm": 0.531963899827601, + "learning_rate": 0.00019385049560415794, + "loss": 0.9528, + "step": 87 + }, + { + "epoch": 0.1408, + "grad_norm": 0.4354611813055882, + "learning_rate": 0.00019367024445236754, + "loss": 0.784, + "step": 88 + }, + { + "epoch": 0.1424, + "grad_norm": 0.4048432684336353, + "learning_rate": 0.00019348747588347637, + "loss": 0.8194, + "step": 89 + }, + { + "epoch": 0.144, + "grad_norm": 0.4086241078858841, + "learning_rate": 0.00019330219480944694, + "loss": 0.7283, + "step": 90 + }, + { + "epoch": 0.1456, + "grad_norm": 0.4336295842212704, + "learning_rate": 0.00019311440620976597, + "loss": 0.7621, + "step": 91 + }, + { + "epoch": 0.1472, + "grad_norm": 0.5431483909710855, + "learning_rate": 0.0001929241151313108, + "loss": 0.9061, + "step": 92 + }, + { + "epoch": 0.1488, + "grad_norm": 0.42167546766193187, + "learning_rate": 0.00019273132668821364, + "loss": 0.7601, + "step": 93 + }, + { + "epoch": 0.1504, + "grad_norm": 0.49652526503168953, + "learning_rate": 0.00019253604606172417, + "loss": 0.7868, + "step": 94 + }, + { + "epoch": 0.152, + "grad_norm": 0.5042251597152386, + "learning_rate": 0.00019233827850007027, + "loss": 0.8008, + "step": 95 + }, + { + "epoch": 0.1536, + "grad_norm": 0.4660459696307034, + "learning_rate": 0.00019213802931831696, + "loss": 0.8621, + "step": 96 + }, + { + "epoch": 0.1552, + "grad_norm": 0.5278479381348781, + "learning_rate": 0.00019193530389822363, + "loss": 0.8426, + "step": 97 + }, + { + "epoch": 0.1568, + "grad_norm": 0.5136081800388119, + "learning_rate": 0.00019173010768809933, + "loss": 0.7843, + "step": 98 + }, + { + "epoch": 0.1584, + "grad_norm": 0.49722508941229265, + "learning_rate": 0.0001915224462026563, + "loss": 0.7822, + "step": 99 + }, + { + "epoch": 0.16, + "grad_norm": 0.5125088004208799, + "learning_rate": 0.00019131232502286188, + "loss": 0.8668, + "step": 100 + }, + { + "epoch": 0.1616, + "grad_norm": 0.5477111636891939, + "learning_rate": 0.0001910997497957885, + "loss": 0.806, + "step": 101 + }, + { + "epoch": 0.1632, + "grad_norm": 0.5042188986966278, + "learning_rate": 0.00019088472623446183, + "loss": 0.7822, + "step": 102 + }, + { + "epoch": 0.1648, + "grad_norm": 0.4152580378172454, + "learning_rate": 0.00019066726011770726, + "loss": 0.775, + "step": 103 + }, + { + "epoch": 0.1664, + "grad_norm": 0.43906996296378337, + "learning_rate": 0.0001904473572899947, + "loss": 0.7744, + "step": 104 + }, + { + "epoch": 0.168, + "grad_norm": 0.43539466027214696, + "learning_rate": 0.00019022502366128135, + "loss": 0.8257, + "step": 105 + }, + { + "epoch": 0.1696, + "grad_norm": 0.4239378487862455, + "learning_rate": 0.00019000026520685302, + "loss": 0.7692, + "step": 106 + }, + { + "epoch": 0.1712, + "grad_norm": 0.533212030471839, + "learning_rate": 0.0001897730879671634, + "loss": 0.7879, + "step": 107 + }, + { + "epoch": 0.1728, + "grad_norm": 0.41851795066862274, + "learning_rate": 0.00018954349804767184, + "loss": 0.789, + "step": 108 + }, + { + "epoch": 0.1744, + "grad_norm": 0.4936224317687835, + "learning_rate": 0.00018931150161867916, + "loss": 0.7451, + "step": 109 + }, + { + "epoch": 0.176, + "grad_norm": 0.4150792579010742, + "learning_rate": 0.00018907710491516199, + "loss": 0.783, + "step": 110 + }, + { + "epoch": 0.1776, + "grad_norm": 0.49955970550870915, + "learning_rate": 0.0001888403142366049, + "loss": 0.7708, + "step": 111 + }, + { + "epoch": 0.1792, + "grad_norm": 0.46522933096843166, + "learning_rate": 0.00018860113594683148, + "loss": 0.8511, + "step": 112 + }, + { + "epoch": 0.1808, + "grad_norm": 0.5077447088384153, + "learning_rate": 0.00018835957647383303, + "loss": 0.8148, + "step": 113 + }, + { + "epoch": 0.1824, + "grad_norm": 0.461837784752004, + "learning_rate": 0.00018811564230959588, + "loss": 0.8102, + "step": 114 + }, + { + "epoch": 0.184, + "grad_norm": 0.5274874349470041, + "learning_rate": 0.00018786934000992688, + "loss": 0.8419, + "step": 115 + }, + { + "epoch": 0.1856, + "grad_norm": 0.5119441434124525, + "learning_rate": 0.00018762067619427746, + "loss": 0.8212, + "step": 116 + }, + { + "epoch": 0.1872, + "grad_norm": 0.43939980505918763, + "learning_rate": 0.00018736965754556528, + "loss": 0.7677, + "step": 117 + }, + { + "epoch": 0.1888, + "grad_norm": 0.5049785758789185, + "learning_rate": 0.00018711629080999504, + "loss": 0.7267, + "step": 118 + }, + { + "epoch": 0.1904, + "grad_norm": 0.4832395598923605, + "learning_rate": 0.00018686058279687698, + "loss": 0.7773, + "step": 119 + }, + { + "epoch": 0.192, + "grad_norm": 0.5285882253880008, + "learning_rate": 0.00018660254037844388, + "loss": 0.7878, + "step": 120 + }, + { + "epoch": 0.1936, + "grad_norm": 0.44652155281764777, + "learning_rate": 0.00018634217048966637, + "loss": 0.7555, + "step": 121 + }, + { + "epoch": 0.1952, + "grad_norm": 0.4384116771377595, + "learning_rate": 0.0001860794801280666, + "loss": 0.7618, + "step": 122 + }, + { + "epoch": 0.1968, + "grad_norm": 0.4503216438102163, + "learning_rate": 0.0001858144763535302, + "loss": 0.8336, + "step": 123 + }, + { + "epoch": 0.1984, + "grad_norm": 0.45399287100490054, + "learning_rate": 0.0001855471662881164, + "loss": 0.7825, + "step": 124 + }, + { + "epoch": 0.2, + "grad_norm": 0.4911803310924581, + "learning_rate": 0.00018527755711586678, + "loss": 0.7849, + "step": 125 + }, + { + "epoch": 0.2016, + "grad_norm": 0.4475560318620542, + "learning_rate": 0.00018500565608261214, + "loss": 0.7986, + "step": 126 + }, + { + "epoch": 0.2032, + "grad_norm": 0.5410613805002472, + "learning_rate": 0.00018473147049577774, + "loss": 0.7925, + "step": 127 + }, + { + "epoch": 0.2048, + "grad_norm": 0.3728770521983605, + "learning_rate": 0.00018445500772418697, + "loss": 0.6947, + "step": 128 + }, + { + "epoch": 0.2064, + "grad_norm": 0.4041103912264637, + "learning_rate": 0.00018417627519786315, + "loss": 0.7613, + "step": 129 + }, + { + "epoch": 0.208, + "grad_norm": 0.5740518300279095, + "learning_rate": 0.00018389528040783012, + "loss": 0.8826, + "step": 130 + }, + { + "epoch": 0.2096, + "grad_norm": 0.5212422666013237, + "learning_rate": 0.00018361203090591071, + "loss": 0.8067, + "step": 131 + }, + { + "epoch": 0.2112, + "grad_norm": 0.4553858820536246, + "learning_rate": 0.00018332653430452376, + "loss": 0.7779, + "step": 132 + }, + { + "epoch": 0.2128, + "grad_norm": 0.4990540070803385, + "learning_rate": 0.00018303879827647975, + "loss": 0.8145, + "step": 133 + }, + { + "epoch": 0.2144, + "grad_norm": 0.4100061474777309, + "learning_rate": 0.00018274883055477436, + "loss": 0.7635, + "step": 134 + }, + { + "epoch": 0.216, + "grad_norm": 0.45392924863684636, + "learning_rate": 0.00018245663893238075, + "loss": 0.7389, + "step": 135 + }, + { + "epoch": 0.2176, + "grad_norm": 0.43829101817979416, + "learning_rate": 0.00018216223126204007, + "loss": 0.7486, + "step": 136 + }, + { + "epoch": 0.2192, + "grad_norm": 0.4467052785057778, + "learning_rate": 0.00018186561545605054, + "loss": 0.7947, + "step": 137 + }, + { + "epoch": 0.2208, + "grad_norm": 0.4616964853806264, + "learning_rate": 0.00018156679948605467, + "loss": 0.8073, + "step": 138 + }, + { + "epoch": 0.2224, + "grad_norm": 0.4281905319998942, + "learning_rate": 0.00018126579138282503, + "loss": 0.7307, + "step": 139 + }, + { + "epoch": 0.224, + "grad_norm": 0.4473416989554217, + "learning_rate": 0.0001809625992360485, + "loss": 0.7899, + "step": 140 + }, + { + "epoch": 0.2256, + "grad_norm": 0.4217308286391299, + "learning_rate": 0.00018065723119410884, + "loss": 0.7788, + "step": 141 + }, + { + "epoch": 0.2272, + "grad_norm": 0.4546622820724357, + "learning_rate": 0.00018034969546386757, + "loss": 0.8489, + "step": 142 + }, + { + "epoch": 0.2288, + "grad_norm": 0.4291947161668821, + "learning_rate": 0.0001800400003104436, + "loss": 0.8209, + "step": 143 + }, + { + "epoch": 0.2304, + "grad_norm": 0.4421224544920921, + "learning_rate": 0.00017972815405699103, + "loss": 0.7657, + "step": 144 + }, + { + "epoch": 0.232, + "grad_norm": 0.43378881989450496, + "learning_rate": 0.00017941416508447536, + "loss": 0.7967, + "step": 145 + }, + { + "epoch": 0.2336, + "grad_norm": 0.42238741761879933, + "learning_rate": 0.0001790980418314484, + "loss": 0.7537, + "step": 146 + }, + { + "epoch": 0.2352, + "grad_norm": 0.4152582962478556, + "learning_rate": 0.00017877979279382135, + "loss": 0.7753, + "step": 147 + }, + { + "epoch": 0.2368, + "grad_norm": 0.4330144364111337, + "learning_rate": 0.0001784594265246366, + "loss": 0.7962, + "step": 148 + }, + { + "epoch": 0.2384, + "grad_norm": 0.4108809995226594, + "learning_rate": 0.0001781369516338378, + "loss": 0.7966, + "step": 149 + }, + { + "epoch": 0.24, + "grad_norm": 0.40598672641581, + "learning_rate": 0.00017781237678803847, + "loss": 0.7633, + "step": 150 + }, + { + "epoch": 0.2416, + "grad_norm": 0.4167982043506597, + "learning_rate": 0.000177485710710289, + "loss": 0.7041, + "step": 151 + }, + { + "epoch": 0.2432, + "grad_norm": 0.42014200520014094, + "learning_rate": 0.00017715696217984235, + "loss": 0.7193, + "step": 152 + }, + { + "epoch": 0.2448, + "grad_norm": 0.4573788195799367, + "learning_rate": 0.00017682614003191807, + "loss": 0.7512, + "step": 153 + }, + { + "epoch": 0.2464, + "grad_norm": 0.4803142053827793, + "learning_rate": 0.00017649325315746478, + "loss": 0.8012, + "step": 154 + }, + { + "epoch": 0.248, + "grad_norm": 0.3632582425441092, + "learning_rate": 0.0001761583105029213, + "loss": 0.7589, + "step": 155 + }, + { + "epoch": 0.2496, + "grad_norm": 0.4271821317661616, + "learning_rate": 0.00017582132106997616, + "loss": 0.6982, + "step": 156 + }, + { + "epoch": 0.2512, + "grad_norm": 0.5370692376133122, + "learning_rate": 0.00017548229391532572, + "loss": 0.8071, + "step": 157 + }, + { + "epoch": 0.2528, + "grad_norm": 0.4204907018652622, + "learning_rate": 0.00017514123815043074, + "loss": 0.785, + "step": 158 + }, + { + "epoch": 0.2544, + "grad_norm": 0.4511978833789513, + "learning_rate": 0.00017479816294127152, + "loss": 0.7735, + "step": 159 + }, + { + "epoch": 0.256, + "grad_norm": 0.38997839389330263, + "learning_rate": 0.0001744530775081015, + "loss": 0.7175, + "step": 160 + }, + { + "epoch": 0.2576, + "grad_norm": 0.45712195717321497, + "learning_rate": 0.0001741059911251997, + "loss": 0.7999, + "step": 161 + }, + { + "epoch": 0.2592, + "grad_norm": 0.409363952347818, + "learning_rate": 0.000173756913120621, + "loss": 0.7441, + "step": 162 + }, + { + "epoch": 0.2608, + "grad_norm": 0.4216699239112223, + "learning_rate": 0.00017340585287594604, + "loss": 0.7585, + "step": 163 + }, + { + "epoch": 0.2624, + "grad_norm": 0.4203526658961756, + "learning_rate": 0.0001730528198260285, + "loss": 0.6942, + "step": 164 + }, + { + "epoch": 0.264, + "grad_norm": 0.4316640766886136, + "learning_rate": 0.00017269782345874203, + "loss": 0.7799, + "step": 165 + }, + { + "epoch": 0.2656, + "grad_norm": 0.3918992414000058, + "learning_rate": 0.00017234087331472497, + "loss": 0.7611, + "step": 166 + }, + { + "epoch": 0.2672, + "grad_norm": 0.480952025030275, + "learning_rate": 0.00017198197898712404, + "loss": 0.807, + "step": 167 + }, + { + "epoch": 0.2688, + "grad_norm": 0.48029330507987417, + "learning_rate": 0.00017162115012133643, + "loss": 0.7873, + "step": 168 + }, + { + "epoch": 0.2704, + "grad_norm": 0.5007752530950041, + "learning_rate": 0.00017125839641475072, + "loss": 0.8153, + "step": 169 + }, + { + "epoch": 0.272, + "grad_norm": 0.41193134414047006, + "learning_rate": 0.00017089372761648616, + "loss": 0.7823, + "step": 170 + }, + { + "epoch": 0.2736, + "grad_norm": 0.3705783893043457, + "learning_rate": 0.00017052715352713075, + "loss": 0.6882, + "step": 171 + }, + { + "epoch": 0.2752, + "grad_norm": 0.5364714578308167, + "learning_rate": 0.00017015868399847768, + "loss": 0.8123, + "step": 172 + }, + { + "epoch": 0.2768, + "grad_norm": 0.4304840396947207, + "learning_rate": 0.00016978832893326074, + "loss": 0.7226, + "step": 173 + }, + { + "epoch": 0.2784, + "grad_norm": 0.39176723381890255, + "learning_rate": 0.00016941609828488807, + "loss": 0.6553, + "step": 174 + }, + { + "epoch": 0.28, + "grad_norm": 0.473933800845934, + "learning_rate": 0.0001690420020571747, + "loss": 0.8249, + "step": 175 + }, + { + "epoch": 0.2816, + "grad_norm": 0.4783955117626432, + "learning_rate": 0.0001686660503040737, + "loss": 0.7417, + "step": 176 + }, + { + "epoch": 0.2832, + "grad_norm": 0.43812649136651893, + "learning_rate": 0.00016828825312940592, + "loss": 0.7362, + "step": 177 + }, + { + "epoch": 0.2848, + "grad_norm": 0.42093229013970423, + "learning_rate": 0.0001679086206865886, + "loss": 0.7559, + "step": 178 + }, + { + "epoch": 0.2864, + "grad_norm": 0.43412761063692157, + "learning_rate": 0.00016752716317836229, + "loss": 0.7668, + "step": 179 + }, + { + "epoch": 0.288, + "grad_norm": 0.4114487717141203, + "learning_rate": 0.0001671438908565167, + "loss": 0.8001, + "step": 180 + }, + { + "epoch": 0.2896, + "grad_norm": 0.5174273889532307, + "learning_rate": 0.00016675881402161536, + "loss": 0.737, + "step": 181 + }, + { + "epoch": 0.2912, + "grad_norm": 0.42200383733853786, + "learning_rate": 0.0001663719430227186, + "loss": 0.7676, + "step": 182 + }, + { + "epoch": 0.2928, + "grad_norm": 0.37972005206251447, + "learning_rate": 0.00016598328825710533, + "loss": 0.6844, + "step": 183 + }, + { + "epoch": 0.2944, + "grad_norm": 0.4187265512900597, + "learning_rate": 0.000165592860169994, + "loss": 0.7754, + "step": 184 + }, + { + "epoch": 0.296, + "grad_norm": 0.39674951422645327, + "learning_rate": 0.00016520066925426144, + "loss": 0.7476, + "step": 185 + }, + { + "epoch": 0.2976, + "grad_norm": 0.43085954716816965, + "learning_rate": 0.0001648067260501611, + "loss": 0.787, + "step": 186 + }, + { + "epoch": 0.2992, + "grad_norm": 0.5261687847463365, + "learning_rate": 0.0001644110411450398, + "loss": 0.8032, + "step": 187 + }, + { + "epoch": 0.3008, + "grad_norm": 0.38618780370611927, + "learning_rate": 0.00016401362517305296, + "loss": 0.738, + "step": 188 + }, + { + "epoch": 0.3024, + "grad_norm": 0.5396863057893749, + "learning_rate": 0.00016361448881487914, + "loss": 0.843, + "step": 189 + }, + { + "epoch": 0.304, + "grad_norm": 0.4704033989607021, + "learning_rate": 0.00016321364279743266, + "loss": 0.7423, + "step": 190 + }, + { + "epoch": 0.3056, + "grad_norm": 0.47406714484336154, + "learning_rate": 0.0001628110978935756, + "loss": 0.8133, + "step": 191 + }, + { + "epoch": 0.3072, + "grad_norm": 0.4427659708747952, + "learning_rate": 0.00016240686492182804, + "loss": 0.7881, + "step": 192 + }, + { + "epoch": 0.3088, + "grad_norm": 0.42786510474475103, + "learning_rate": 0.00016200095474607753, + "loss": 0.8183, + "step": 193 + }, + { + "epoch": 0.3104, + "grad_norm": 0.44815258181100043, + "learning_rate": 0.00016159337827528685, + "loss": 0.7499, + "step": 194 + }, + { + "epoch": 0.312, + "grad_norm": 0.4253238194388484, + "learning_rate": 0.0001611841464632011, + "loss": 0.7833, + "step": 195 + }, + { + "epoch": 0.3136, + "grad_norm": 0.4141363592408863, + "learning_rate": 0.0001607732703080532, + "loss": 0.7751, + "step": 196 + }, + { + "epoch": 0.3152, + "grad_norm": 0.46951635497507826, + "learning_rate": 0.00016036076085226814, + "loss": 0.772, + "step": 197 + }, + { + "epoch": 0.3168, + "grad_norm": 0.4398646647277728, + "learning_rate": 0.0001599466291821666, + "loss": 0.7551, + "step": 198 + }, + { + "epoch": 0.3184, + "grad_norm": 0.527082865550504, + "learning_rate": 0.0001595308864276666, + "loss": 0.7694, + "step": 199 + }, + { + "epoch": 0.32, + "grad_norm": 0.39510138889240665, + "learning_rate": 0.0001591135437619847, + "loss": 0.7693, + "step": 200 + }, + { + "epoch": 0.3216, + "grad_norm": 0.48632553909689946, + "learning_rate": 0.0001586946124013354, + "loss": 0.7257, + "step": 201 + }, + { + "epoch": 0.3232, + "grad_norm": 0.43376164298659, + "learning_rate": 0.0001582741036046301, + "loss": 0.7286, + "step": 202 + }, + { + "epoch": 0.3248, + "grad_norm": 0.4212103228080253, + "learning_rate": 0.00015785202867317407, + "loss": 0.7627, + "step": 203 + }, + { + "epoch": 0.3264, + "grad_norm": 0.45252904440902125, + "learning_rate": 0.00015742839895036305, + "loss": 0.7974, + "step": 204 + }, + { + "epoch": 0.328, + "grad_norm": 0.3962314240292195, + "learning_rate": 0.00015700322582137827, + "loss": 0.7366, + "step": 205 + }, + { + "epoch": 0.3296, + "grad_norm": 0.4327265506871692, + "learning_rate": 0.0001565765207128805, + "loss": 0.74, + "step": 206 + }, + { + "epoch": 0.3312, + "grad_norm": 0.514933329734334, + "learning_rate": 0.0001561482950927029, + "loss": 0.7318, + "step": 207 + }, + { + "epoch": 0.3328, + "grad_norm": 0.41587989020386307, + "learning_rate": 0.00015571856046954285, + "loss": 0.7962, + "step": 208 + }, + { + "epoch": 0.3344, + "grad_norm": 0.4403701107153674, + "learning_rate": 0.00015528732839265272, + "loss": 0.7474, + "step": 209 + }, + { + "epoch": 0.336, + "grad_norm": 0.4202181716022083, + "learning_rate": 0.0001548546104515294, + "loss": 0.7706, + "step": 210 + }, + { + "epoch": 0.3376, + "grad_norm": 0.4120642937438657, + "learning_rate": 0.00015442041827560274, + "loss": 0.7493, + "step": 211 + }, + { + "epoch": 0.3392, + "grad_norm": 0.4480337234157597, + "learning_rate": 0.00015398476353392323, + "loss": 0.725, + "step": 212 + }, + { + "epoch": 0.3408, + "grad_norm": 0.5633757107881812, + "learning_rate": 0.00015354765793484834, + "loss": 0.8487, + "step": 213 + }, + { + "epoch": 0.3424, + "grad_norm": 0.46406095606304976, + "learning_rate": 0.00015310911322572753, + "loss": 0.7553, + "step": 214 + }, + { + "epoch": 0.344, + "grad_norm": 0.5049068598510368, + "learning_rate": 0.000152669141192587, + "loss": 0.8129, + "step": 215 + }, + { + "epoch": 0.3456, + "grad_norm": 0.40075745127741647, + "learning_rate": 0.00015222775365981273, + "loss": 0.7375, + "step": 216 + }, + { + "epoch": 0.3472, + "grad_norm": 0.5024374954187978, + "learning_rate": 0.00015178496248983254, + "loss": 0.8115, + "step": 217 + }, + { + "epoch": 0.3488, + "grad_norm": 0.4633484981898426, + "learning_rate": 0.00015134077958279765, + "loss": 0.7774, + "step": 218 + }, + { + "epoch": 0.3504, + "grad_norm": 0.4162899199373428, + "learning_rate": 0.00015089521687626243, + "loss": 0.7338, + "step": 219 + }, + { + "epoch": 0.352, + "grad_norm": 0.41669489939623083, + "learning_rate": 0.000150448286344864, + "loss": 0.7608, + "step": 220 + }, + { + "epoch": 0.3536, + "grad_norm": 0.4326687946907688, + "learning_rate": 0.00015000000000000001, + "loss": 0.7278, + "step": 221 + }, + { + "epoch": 0.3552, + "grad_norm": 0.40643779999339563, + "learning_rate": 0.00014955036988950618, + "loss": 0.7128, + "step": 222 + }, + { + "epoch": 0.3568, + "grad_norm": 0.40288496941550733, + "learning_rate": 0.00014909940809733222, + "loss": 0.7538, + "step": 223 + }, + { + "epoch": 0.3584, + "grad_norm": 0.42603406164838237, + "learning_rate": 0.00014864712674321734, + "loss": 0.7597, + "step": 224 + }, + { + "epoch": 0.36, + "grad_norm": 0.4688532645240747, + "learning_rate": 0.00014819353798236427, + "loss": 0.8368, + "step": 225 + }, + { + "epoch": 0.3616, + "grad_norm": 0.4332807404697522, + "learning_rate": 0.00014773865400511272, + "loss": 0.7325, + "step": 226 + }, + { + "epoch": 0.3632, + "grad_norm": 0.476697765412092, + "learning_rate": 0.00014728248703661182, + "loss": 0.8241, + "step": 227 + }, + { + "epoch": 0.3648, + "grad_norm": 0.4436651962043295, + "learning_rate": 0.00014682504933649144, + "loss": 0.7151, + "step": 228 + }, + { + "epoch": 0.3664, + "grad_norm": 0.45433089014881944, + "learning_rate": 0.00014636635319853275, + "loss": 0.6574, + "step": 229 + }, + { + "epoch": 0.368, + "grad_norm": 0.43375495977111883, + "learning_rate": 0.00014590641095033787, + "loss": 0.734, + "step": 230 + }, + { + "epoch": 0.3696, + "grad_norm": 0.42738324793054794, + "learning_rate": 0.00014544523495299842, + "loss": 0.7318, + "step": 231 + }, + { + "epoch": 0.3712, + "grad_norm": 0.40877520369426673, + "learning_rate": 0.0001449828376007636, + "loss": 0.7612, + "step": 232 + }, + { + "epoch": 0.3728, + "grad_norm": 0.408704133205752, + "learning_rate": 0.0001445192313207067, + "loss": 0.7062, + "step": 233 + }, + { + "epoch": 0.3744, + "grad_norm": 0.5337340604069295, + "learning_rate": 0.0001440544285723915, + "loss": 0.8101, + "step": 234 + }, + { + "epoch": 0.376, + "grad_norm": 0.4144611426632878, + "learning_rate": 0.00014358844184753712, + "loss": 0.7162, + "step": 235 + }, + { + "epoch": 0.3776, + "grad_norm": 0.45012679041480336, + "learning_rate": 0.00014312128366968243, + "loss": 0.7141, + "step": 236 + }, + { + "epoch": 0.3792, + "grad_norm": 0.8796246533658749, + "learning_rate": 0.00014265296659384956, + "loss": 0.743, + "step": 237 + }, + { + "epoch": 0.3808, + "grad_norm": 0.45709338800740185, + "learning_rate": 0.00014218350320620624, + "loss": 0.7904, + "step": 238 + }, + { + "epoch": 0.3824, + "grad_norm": 0.3579803058272898, + "learning_rate": 0.0001417129061237278, + "loss": 0.7025, + "step": 239 + }, + { + "epoch": 0.384, + "grad_norm": 0.4345013477153899, + "learning_rate": 0.00014124118799385796, + "loss": 0.6787, + "step": 240 + }, + { + "epoch": 0.3856, + "grad_norm": 0.48881781965391935, + "learning_rate": 0.00014076836149416887, + "loss": 0.799, + "step": 241 + }, + { + "epoch": 0.3872, + "grad_norm": 0.4280144158388546, + "learning_rate": 0.0001402944393320206, + "loss": 0.8191, + "step": 242 + }, + { + "epoch": 0.3888, + "grad_norm": 0.39269162859411183, + "learning_rate": 0.00013981943424421932, + "loss": 0.7422, + "step": 243 + }, + { + "epoch": 0.3904, + "grad_norm": 0.44400142267856024, + "learning_rate": 0.00013934335899667527, + "loss": 0.7515, + "step": 244 + }, + { + "epoch": 0.392, + "grad_norm": 0.3771035900564488, + "learning_rate": 0.00013886622638405952, + "loss": 0.7656, + "step": 245 + }, + { + "epoch": 0.3936, + "grad_norm": 0.41880705442305655, + "learning_rate": 0.00013838804922946027, + "loss": 0.7471, + "step": 246 + }, + { + "epoch": 0.3952, + "grad_norm": 0.46797066447683655, + "learning_rate": 0.00013790884038403795, + "loss": 0.8872, + "step": 247 + }, + { + "epoch": 0.3968, + "grad_norm": 0.47955664305877593, + "learning_rate": 0.00013742861272668012, + "loss": 0.7527, + "step": 248 + }, + { + "epoch": 0.3984, + "grad_norm": 0.4275337022637046, + "learning_rate": 0.00013694737916365517, + "loss": 0.752, + "step": 249 + }, + { + "epoch": 0.4, + "grad_norm": 0.447119769729004, + "learning_rate": 0.00013646515262826552, + "loss": 0.7691, + "step": 250 + }, + { + "epoch": 0.4016, + "grad_norm": 0.4089115544742617, + "learning_rate": 0.0001359819460805001, + "loss": 0.6738, + "step": 251 + }, + { + "epoch": 0.4032, + "grad_norm": 0.44071585635928706, + "learning_rate": 0.0001354977725066859, + "loss": 0.7376, + "step": 252 + }, + { + "epoch": 0.4048, + "grad_norm": 0.5365017095624967, + "learning_rate": 0.00013501264491913906, + "loss": 0.8418, + "step": 253 + }, + { + "epoch": 0.4064, + "grad_norm": 0.4151943479460226, + "learning_rate": 0.0001345265763558152, + "loss": 0.7133, + "step": 254 + }, + { + "epoch": 0.408, + "grad_norm": 0.42272229703750513, + "learning_rate": 0.00013403957987995882, + "loss": 0.7689, + "step": 255 + }, + { + "epoch": 0.4096, + "grad_norm": 0.41818895231074066, + "learning_rate": 0.0001335516685797525, + "loss": 0.7552, + "step": 256 + }, + { + "epoch": 0.4112, + "grad_norm": 0.3745276342388048, + "learning_rate": 0.00013306285556796495, + "loss": 0.7207, + "step": 257 + }, + { + "epoch": 0.4128, + "grad_norm": 0.450655111288209, + "learning_rate": 0.00013257315398159864, + "loss": 0.7831, + "step": 258 + }, + { + "epoch": 0.4144, + "grad_norm": 0.4494743647552362, + "learning_rate": 0.00013208257698153677, + "loss": 0.7478, + "step": 259 + }, + { + "epoch": 0.416, + "grad_norm": 0.46038560262905603, + "learning_rate": 0.00013159113775218964, + "loss": 0.7465, + "step": 260 + }, + { + "epoch": 0.4176, + "grad_norm": 0.3911465389838148, + "learning_rate": 0.00013109884950114007, + "loss": 0.766, + "step": 261 + }, + { + "epoch": 0.4192, + "grad_norm": 0.44758425742925684, + "learning_rate": 0.00013060572545878875, + "loss": 0.7222, + "step": 262 + }, + { + "epoch": 0.4208, + "grad_norm": 0.433744583645182, + "learning_rate": 0.00013011177887799845, + "loss": 0.7889, + "step": 263 + }, + { + "epoch": 0.4224, + "grad_norm": 0.44729644296451876, + "learning_rate": 0.00012961702303373795, + "loss": 0.7617, + "step": 264 + }, + { + "epoch": 0.424, + "grad_norm": 0.405956052730903, + "learning_rate": 0.00012912147122272523, + "loss": 0.7481, + "step": 265 + }, + { + "epoch": 0.4256, + "grad_norm": 0.42197502885627625, + "learning_rate": 0.00012862513676307008, + "loss": 0.7914, + "step": 266 + }, + { + "epoch": 0.4272, + "grad_norm": 0.4471034776086442, + "learning_rate": 0.00012812803299391628, + "loss": 0.7765, + "step": 267 + }, + { + "epoch": 0.4288, + "grad_norm": 0.4223092383131499, + "learning_rate": 0.00012763017327508305, + "loss": 0.7371, + "step": 268 + }, + { + "epoch": 0.4304, + "grad_norm": 0.47622888335383845, + "learning_rate": 0.0001271315709867059, + "loss": 0.7389, + "step": 269 + }, + { + "epoch": 0.432, + "grad_norm": 0.44051030500926475, + "learning_rate": 0.00012663223952887723, + "loss": 0.7339, + "step": 270 + }, + { + "epoch": 0.4336, + "grad_norm": 0.513880230702324, + "learning_rate": 0.00012613219232128608, + "loss": 0.8257, + "step": 271 + }, + { + "epoch": 0.4352, + "grad_norm": 0.4168046060852622, + "learning_rate": 0.00012563144280285741, + "loss": 0.6652, + "step": 272 + }, + { + "epoch": 0.4368, + "grad_norm": 0.3802093771260786, + "learning_rate": 0.00012513000443139112, + "loss": 0.7385, + "step": 273 + }, + { + "epoch": 0.4384, + "grad_norm": 0.5108787524389692, + "learning_rate": 0.00012462789068320017, + "loss": 0.759, + "step": 274 + }, + { + "epoch": 0.44, + "grad_norm": 0.3876960955299403, + "learning_rate": 0.00012412511505274844, + "loss": 0.7061, + "step": 275 + }, + { + "epoch": 0.4416, + "grad_norm": 0.40129654726182434, + "learning_rate": 0.00012362169105228826, + "loss": 0.6962, + "step": 276 + }, + { + "epoch": 0.4432, + "grad_norm": 0.420496259550952, + "learning_rate": 0.000123117632211497, + "loss": 0.7369, + "step": 277 + }, + { + "epoch": 0.4448, + "grad_norm": 0.37858645511244804, + "learning_rate": 0.00012261295207711346, + "loss": 0.6701, + "step": 278 + }, + { + "epoch": 0.4464, + "grad_norm": 0.4699367294388282, + "learning_rate": 0.0001221076642125742, + "loss": 0.7445, + "step": 279 + }, + { + "epoch": 0.448, + "grad_norm": 0.3798508288727862, + "learning_rate": 0.00012160178219764837, + "loss": 0.6693, + "step": 280 + }, + { + "epoch": 0.4496, + "grad_norm": 0.4602324367778785, + "learning_rate": 0.00012109531962807332, + "loss": 0.7183, + "step": 281 + }, + { + "epoch": 0.4512, + "grad_norm": 0.4012131095306747, + "learning_rate": 0.00012058829011518896, + "loss": 0.6667, + "step": 282 + }, + { + "epoch": 0.4528, + "grad_norm": 0.4726607075165429, + "learning_rate": 0.00012008070728557186, + "loss": 0.7627, + "step": 283 + }, + { + "epoch": 0.4544, + "grad_norm": 0.40340330901336047, + "learning_rate": 0.00011957258478066931, + "loss": 0.7308, + "step": 284 + }, + { + "epoch": 0.456, + "grad_norm": 0.412177457263275, + "learning_rate": 0.00011906393625643244, + "loss": 0.74, + "step": 285 + }, + { + "epoch": 0.4576, + "grad_norm": 0.42473218036945015, + "learning_rate": 0.00011855477538294935, + "loss": 0.7744, + "step": 286 + }, + { + "epoch": 0.4592, + "grad_norm": 0.4275229972291327, + "learning_rate": 0.00011804511584407763, + "loss": 0.7734, + "step": 287 + }, + { + "epoch": 0.4608, + "grad_norm": 0.44905637380247715, + "learning_rate": 0.00011753497133707679, + "loss": 0.7404, + "step": 288 + }, + { + "epoch": 0.4624, + "grad_norm": 0.4291634918113848, + "learning_rate": 0.00011702435557223987, + "loss": 0.708, + "step": 289 + }, + { + "epoch": 0.464, + "grad_norm": 0.4091422186714648, + "learning_rate": 0.00011651328227252517, + "loss": 0.7464, + "step": 290 + }, + { + "epoch": 0.4656, + "grad_norm": 0.43783368283656954, + "learning_rate": 0.00011600176517318741, + "loss": 0.7346, + "step": 291 + }, + { + "epoch": 0.4672, + "grad_norm": 0.4608607130020425, + "learning_rate": 0.00011548981802140848, + "loss": 0.6936, + "step": 292 + }, + { + "epoch": 0.4688, + "grad_norm": 0.3768622769412308, + "learning_rate": 0.00011497745457592816, + "loss": 0.7528, + "step": 293 + }, + { + "epoch": 0.4704, + "grad_norm": 0.4209394222759887, + "learning_rate": 0.00011446468860667421, + "loss": 0.7921, + "step": 294 + }, + { + "epoch": 0.472, + "grad_norm": 0.4232745141703031, + "learning_rate": 0.00011395153389439233, + "loss": 0.8449, + "step": 295 + }, + { + "epoch": 0.4736, + "grad_norm": 0.6815490088866514, + "learning_rate": 0.00011343800423027582, + "loss": 0.8462, + "step": 296 + }, + { + "epoch": 0.4752, + "grad_norm": 0.4499466203511656, + "learning_rate": 0.0001129241134155949, + "loss": 0.8488, + "step": 297 + }, + { + "epoch": 0.4768, + "grad_norm": 0.3680896796550512, + "learning_rate": 0.00011240987526132594, + "loss": 0.6303, + "step": 298 + }, + { + "epoch": 0.4784, + "grad_norm": 0.3775579981479662, + "learning_rate": 0.00011189530358778005, + "loss": 0.7493, + "step": 299 + }, + { + "epoch": 0.48, + "grad_norm": 0.387847585549817, + "learning_rate": 0.00011138041222423177, + "loss": 0.6977, + "step": 300 + }, + { + "epoch": 0.4816, + "grad_norm": 0.40613518614587685, + "learning_rate": 0.00011086521500854745, + "loss": 0.6898, + "step": 301 + }, + { + "epoch": 0.4832, + "grad_norm": 0.43863627708276465, + "learning_rate": 0.00011034972578681338, + "loss": 0.7185, + "step": 302 + }, + { + "epoch": 0.4848, + "grad_norm": 0.3889560057701365, + "learning_rate": 0.00010983395841296348, + "loss": 0.6758, + "step": 303 + }, + { + "epoch": 0.4864, + "grad_norm": 0.4246062002636041, + "learning_rate": 0.00010931792674840718, + "loss": 0.7409, + "step": 304 + }, + { + "epoch": 0.488, + "grad_norm": 0.4391395598034235, + "learning_rate": 0.00010880164466165674, + "loss": 0.6846, + "step": 305 + }, + { + "epoch": 0.4896, + "grad_norm": 0.45434575132168414, + "learning_rate": 0.00010828512602795462, + "loss": 0.7148, + "step": 306 + }, + { + "epoch": 0.4912, + "grad_norm": 0.47366659509597836, + "learning_rate": 0.00010776838472890065, + "loss": 0.7921, + "step": 307 + }, + { + "epoch": 0.4928, + "grad_norm": 0.38152089971111897, + "learning_rate": 0.00010725143465207867, + "loss": 0.7133, + "step": 308 + }, + { + "epoch": 0.4944, + "grad_norm": 0.38382088281979165, + "learning_rate": 0.00010673428969068364, + "loss": 0.7264, + "step": 309 + }, + { + "epoch": 0.496, + "grad_norm": 0.4395521307482716, + "learning_rate": 0.00010621696374314807, + "loss": 0.6851, + "step": 310 + }, + { + "epoch": 0.4976, + "grad_norm": 0.4206238065270516, + "learning_rate": 0.00010569947071276847, + "loss": 0.7188, + "step": 311 + }, + { + "epoch": 0.4992, + "grad_norm": 0.40098244852313136, + "learning_rate": 0.00010518182450733186, + "loss": 0.7042, + "step": 312 + }, + { + "epoch": 0.5008, + "grad_norm": 0.4018951442478255, + "learning_rate": 0.00010466403903874176, + "loss": 0.7381, + "step": 313 + }, + { + "epoch": 0.5024, + "grad_norm": 0.4033163134478128, + "learning_rate": 0.00010414612822264455, + "loss": 0.7689, + "step": 314 + }, + { + "epoch": 0.504, + "grad_norm": 0.417080407377218, + "learning_rate": 0.00010362810597805526, + "loss": 0.7526, + "step": 315 + }, + { + "epoch": 0.5056, + "grad_norm": 0.3655296102005045, + "learning_rate": 0.0001031099862269837, + "loss": 0.7822, + "step": 316 + }, + { + "epoch": 0.5072, + "grad_norm": 0.3582719524840942, + "learning_rate": 0.00010259178289406011, + "loss": 0.6148, + "step": 317 + }, + { + "epoch": 0.5088, + "grad_norm": 0.6193022683975565, + "learning_rate": 0.00010207350990616107, + "loss": 0.8375, + "step": 318 + }, + { + "epoch": 0.5104, + "grad_norm": 0.5796649389110442, + "learning_rate": 0.0001015551811920351, + "loss": 0.7782, + "step": 319 + }, + { + "epoch": 0.512, + "grad_norm": 0.3660794304482504, + "learning_rate": 0.00010103681068192845, + "loss": 0.6737, + "step": 320 + }, + { + "epoch": 0.5136, + "grad_norm": 0.4020716164916512, + "learning_rate": 0.00010051841230721065, + "loss": 0.7459, + "step": 321 + }, + { + "epoch": 0.5152, + "grad_norm": 0.4084027884383054, + "learning_rate": 0.0001, + "loss": 0.7405, + "step": 322 + }, + { + "epoch": 0.5168, + "grad_norm": 0.36290419424211623, + "learning_rate": 9.948158769278939e-05, + "loss": 0.685, + "step": 323 + }, + { + "epoch": 0.5184, + "grad_norm": 0.39503822991480864, + "learning_rate": 9.896318931807155e-05, + "loss": 0.6608, + "step": 324 + }, + { + "epoch": 0.52, + "grad_norm": 0.3579770266047511, + "learning_rate": 9.844481880796491e-05, + "loss": 0.7083, + "step": 325 + }, + { + "epoch": 0.5216, + "grad_norm": 0.4087287034567777, + "learning_rate": 9.792649009383899e-05, + "loss": 0.7289, + "step": 326 + }, + { + "epoch": 0.5232, + "grad_norm": 0.4839466948497212, + "learning_rate": 9.740821710593989e-05, + "loss": 0.7328, + "step": 327 + }, + { + "epoch": 0.5248, + "grad_norm": 0.3744868255230105, + "learning_rate": 9.689001377301633e-05, + "loss": 0.6718, + "step": 328 + }, + { + "epoch": 0.5264, + "grad_norm": 0.40775717632462694, + "learning_rate": 9.637189402194476e-05, + "loss": 0.762, + "step": 329 + }, + { + "epoch": 0.528, + "grad_norm": 0.4586551782279262, + "learning_rate": 9.585387177735547e-05, + "loss": 0.6693, + "step": 330 + }, + { + "epoch": 0.5296, + "grad_norm": 0.4281910470889782, + "learning_rate": 9.533596096125825e-05, + "loss": 0.7011, + "step": 331 + }, + { + "epoch": 0.5312, + "grad_norm": 0.43014491789502923, + "learning_rate": 9.481817549266817e-05, + "loss": 0.7538, + "step": 332 + }, + { + "epoch": 0.5328, + "grad_norm": 0.4723560588807367, + "learning_rate": 9.430052928723153e-05, + "loss": 0.8245, + "step": 333 + }, + { + "epoch": 0.5344, + "grad_norm": 0.4051222703401866, + "learning_rate": 9.378303625685195e-05, + "loss": 0.6607, + "step": 334 + }, + { + "epoch": 0.536, + "grad_norm": 0.4114137413271673, + "learning_rate": 9.326571030931637e-05, + "loss": 0.7228, + "step": 335 + }, + { + "epoch": 0.5376, + "grad_norm": 0.4281954851576404, + "learning_rate": 9.274856534792138e-05, + "loss": 0.7829, + "step": 336 + }, + { + "epoch": 0.5392, + "grad_norm": 0.48289740774465234, + "learning_rate": 9.223161527109937e-05, + "loss": 0.7031, + "step": 337 + }, + { + "epoch": 0.5408, + "grad_norm": 0.4028132723028769, + "learning_rate": 9.171487397204539e-05, + "loss": 0.6898, + "step": 338 + }, + { + "epoch": 0.5424, + "grad_norm": 0.3980925287529609, + "learning_rate": 9.119835533834331e-05, + "loss": 0.7098, + "step": 339 + }, + { + "epoch": 0.544, + "grad_norm": 0.3909572327359679, + "learning_rate": 9.068207325159284e-05, + "loss": 0.7263, + "step": 340 + }, + { + "epoch": 0.5456, + "grad_norm": 0.41392789312871603, + "learning_rate": 9.016604158703654e-05, + "loss": 0.7391, + "step": 341 + }, + { + "epoch": 0.5472, + "grad_norm": 0.42630666106608667, + "learning_rate": 8.965027421318665e-05, + "loss": 0.6744, + "step": 342 + }, + { + "epoch": 0.5488, + "grad_norm": 0.498629422281744, + "learning_rate": 8.913478499145254e-05, + "loss": 0.6997, + "step": 343 + }, + { + "epoch": 0.5504, + "grad_norm": 0.4122424664938719, + "learning_rate": 8.861958777576827e-05, + "loss": 0.7162, + "step": 344 + }, + { + "epoch": 0.552, + "grad_norm": 0.4913748005852001, + "learning_rate": 8.810469641222001e-05, + "loss": 0.7293, + "step": 345 + }, + { + "epoch": 0.5536, + "grad_norm": 0.3536713211110254, + "learning_rate": 8.759012473867407e-05, + "loss": 0.6023, + "step": 346 + }, + { + "epoch": 0.5552, + "grad_norm": 0.420690620076411, + "learning_rate": 8.707588658440511e-05, + "loss": 0.7566, + "step": 347 + }, + { + "epoch": 0.5568, + "grad_norm": 0.38799097318034, + "learning_rate": 8.656199576972423e-05, + "loss": 0.7254, + "step": 348 + }, + { + "epoch": 0.5584, + "grad_norm": 0.46951331944016117, + "learning_rate": 8.604846610560771e-05, + "loss": 0.7676, + "step": 349 + }, + { + "epoch": 0.56, + "grad_norm": 0.5125519464479356, + "learning_rate": 8.553531139332582e-05, + "loss": 0.8276, + "step": 350 + }, + { + "epoch": 0.5616, + "grad_norm": 0.4610250816054034, + "learning_rate": 8.502254542407186e-05, + "loss": 0.7358, + "step": 351 + }, + { + "epoch": 0.5632, + "grad_norm": 0.34069998754998165, + "learning_rate": 8.451018197859153e-05, + "loss": 0.6724, + "step": 352 + }, + { + "epoch": 0.5648, + "grad_norm": 0.42034988016896346, + "learning_rate": 8.399823482681262e-05, + "loss": 0.743, + "step": 353 + }, + { + "epoch": 0.5664, + "grad_norm": 0.4525953625962461, + "learning_rate": 8.348671772747487e-05, + "loss": 0.7148, + "step": 354 + }, + { + "epoch": 0.568, + "grad_norm": 0.3722953782287262, + "learning_rate": 8.297564442776014e-05, + "loss": 0.7316, + "step": 355 + }, + { + "epoch": 0.5696, + "grad_norm": 0.4600152810330939, + "learning_rate": 8.246502866292324e-05, + "loss": 0.7374, + "step": 356 + }, + { + "epoch": 0.5712, + "grad_norm": 0.386769626559561, + "learning_rate": 8.195488415592238e-05, + "loss": 0.6661, + "step": 357 + }, + { + "epoch": 0.5728, + "grad_norm": 0.4047703539716422, + "learning_rate": 8.144522461705067e-05, + "loss": 0.7078, + "step": 358 + }, + { + "epoch": 0.5744, + "grad_norm": 0.528647696893897, + "learning_rate": 8.093606374356759e-05, + "loss": 0.7029, + "step": 359 + }, + { + "epoch": 0.576, + "grad_norm": 0.4660467946759398, + "learning_rate": 8.042741521933071e-05, + "loss": 0.7477, + "step": 360 + }, + { + "epoch": 0.5776, + "grad_norm": 0.39172340459603056, + "learning_rate": 7.991929271442817e-05, + "loss": 0.6979, + "step": 361 + }, + { + "epoch": 0.5792, + "grad_norm": 0.41515909712934274, + "learning_rate": 7.941170988481108e-05, + "loss": 0.6949, + "step": 362 + }, + { + "epoch": 0.5808, + "grad_norm": 0.4338402676367551, + "learning_rate": 7.89046803719267e-05, + "loss": 0.7308, + "step": 363 + }, + { + "epoch": 0.5824, + "grad_norm": 0.5016467346485571, + "learning_rate": 7.839821780235168e-05, + "loss": 0.7742, + "step": 364 + }, + { + "epoch": 0.584, + "grad_norm": 0.3655294327494492, + "learning_rate": 7.789233578742582e-05, + "loss": 0.6789, + "step": 365 + }, + { + "epoch": 0.5856, + "grad_norm": 0.3992235175217145, + "learning_rate": 7.738704792288655e-05, + "loss": 0.6587, + "step": 366 + }, + { + "epoch": 0.5872, + "grad_norm": 0.37379208794413066, + "learning_rate": 7.688236778850306e-05, + "loss": 0.7055, + "step": 367 + }, + { + "epoch": 0.5888, + "grad_norm": 0.46885847173208745, + "learning_rate": 7.637830894771175e-05, + "loss": 0.7399, + "step": 368 + }, + { + "epoch": 0.5904, + "grad_norm": 0.5101176189041192, + "learning_rate": 7.587488494725157e-05, + "loss": 0.7649, + "step": 369 + }, + { + "epoch": 0.592, + "grad_norm": 0.3813931147515439, + "learning_rate": 7.537210931679987e-05, + "loss": 0.6728, + "step": 370 + }, + { + "epoch": 0.5936, + "grad_norm": 0.4015163654121684, + "learning_rate": 7.48699955686089e-05, + "loss": 0.6798, + "step": 371 + }, + { + "epoch": 0.5952, + "grad_norm": 0.43697057995490984, + "learning_rate": 7.43685571971426e-05, + "loss": 0.7125, + "step": 372 + }, + { + "epoch": 0.5968, + "grad_norm": 0.39582239698428895, + "learning_rate": 7.386780767871397e-05, + "loss": 0.6943, + "step": 373 + }, + { + "epoch": 0.5984, + "grad_norm": 0.43455119502189843, + "learning_rate": 7.336776047112276e-05, + "loss": 0.7419, + "step": 374 + }, + { + "epoch": 0.6, + "grad_norm": 0.4743973053815229, + "learning_rate": 7.286842901329412e-05, + "loss": 0.754, + "step": 375 + }, + { + "epoch": 0.6016, + "grad_norm": 0.4981920198315926, + "learning_rate": 7.236982672491698e-05, + "loss": 0.7306, + "step": 376 + }, + { + "epoch": 0.6032, + "grad_norm": 0.3435811784703452, + "learning_rate": 7.187196700608373e-05, + "loss": 0.7041, + "step": 377 + }, + { + "epoch": 0.6048, + "grad_norm": 0.4676722596596705, + "learning_rate": 7.137486323692995e-05, + "loss": 0.7233, + "step": 378 + }, + { + "epoch": 0.6064, + "grad_norm": 0.4163922149187722, + "learning_rate": 7.087852877727481e-05, + "loss": 0.7852, + "step": 379 + }, + { + "epoch": 0.608, + "grad_norm": 0.4386868522058961, + "learning_rate": 7.038297696626206e-05, + "loss": 0.6972, + "step": 380 + }, + { + "epoch": 0.6096, + "grad_norm": 0.3580208548686132, + "learning_rate": 6.988822112200156e-05, + "loss": 0.72, + "step": 381 + }, + { + "epoch": 0.6112, + "grad_norm": 0.45139710327502214, + "learning_rate": 6.939427454121128e-05, + "loss": 0.6861, + "step": 382 + }, + { + "epoch": 0.6128, + "grad_norm": 0.4009493880827884, + "learning_rate": 6.890115049885994e-05, + "loss": 0.7333, + "step": 383 + }, + { + "epoch": 0.6144, + "grad_norm": 0.41241573495889516, + "learning_rate": 6.84088622478104e-05, + "loss": 0.6938, + "step": 384 + }, + { + "epoch": 0.616, + "grad_norm": 0.37551723461841824, + "learning_rate": 6.791742301846326e-05, + "loss": 0.687, + "step": 385 + }, + { + "epoch": 0.6176, + "grad_norm": 0.417073555060805, + "learning_rate": 6.742684601840141e-05, + "loss": 0.7767, + "step": 386 + }, + { + "epoch": 0.6192, + "grad_norm": 0.34454543212225475, + "learning_rate": 6.693714443203507e-05, + "loss": 0.6765, + "step": 387 + }, + { + "epoch": 0.6208, + "grad_norm": 0.4176076168000247, + "learning_rate": 6.644833142024751e-05, + "loss": 0.7414, + "step": 388 + }, + { + "epoch": 0.6224, + "grad_norm": 0.41253313320444646, + "learning_rate": 6.59604201200412e-05, + "loss": 0.7218, + "step": 389 + }, + { + "epoch": 0.624, + "grad_norm": 0.36254702485396967, + "learning_rate": 6.547342364418481e-05, + "loss": 0.6998, + "step": 390 + }, + { + "epoch": 0.6256, + "grad_norm": 0.4532720414077359, + "learning_rate": 6.498735508086093e-05, + "loss": 0.7028, + "step": 391 + }, + { + "epoch": 0.6272, + "grad_norm": 0.38065089978604044, + "learning_rate": 6.450222749331414e-05, + "loss": 0.6944, + "step": 392 + }, + { + "epoch": 0.6288, + "grad_norm": 0.4576595436457372, + "learning_rate": 6.40180539194999e-05, + "loss": 0.7512, + "step": 393 + }, + { + "epoch": 0.6304, + "grad_norm": 0.42047403152676116, + "learning_rate": 6.35348473717345e-05, + "loss": 0.7768, + "step": 394 + }, + { + "epoch": 0.632, + "grad_norm": 0.3953060092081604, + "learning_rate": 6.305262083634488e-05, + "loss": 0.7042, + "step": 395 + }, + { + "epoch": 0.6336, + "grad_norm": 0.37263074777798094, + "learning_rate": 6.25713872733199e-05, + "loss": 0.6913, + "step": 396 + }, + { + "epoch": 0.6352, + "grad_norm": 0.4110666177393186, + "learning_rate": 6.209115961596208e-05, + "loss": 0.6652, + "step": 397 + }, + { + "epoch": 0.6368, + "grad_norm": 0.4001093881378836, + "learning_rate": 6.161195077053976e-05, + "loss": 0.7271, + "step": 398 + }, + { + "epoch": 0.6384, + "grad_norm": 0.39674680837721515, + "learning_rate": 6.113377361594049e-05, + "loss": 0.6991, + "step": 399 + }, + { + "epoch": 0.64, + "grad_norm": 0.36559532012893964, + "learning_rate": 6.065664100332478e-05, + "loss": 0.6907, + "step": 400 + }, + { + "epoch": 0.6416, + "grad_norm": 0.41818675806839695, + "learning_rate": 6.018056575578075e-05, + "loss": 0.6753, + "step": 401 + }, + { + "epoch": 0.6432, + "grad_norm": 0.46023800073649845, + "learning_rate": 5.970556066797941e-05, + "loss": 0.7153, + "step": 402 + }, + { + "epoch": 0.6448, + "grad_norm": 0.41876830239649976, + "learning_rate": 5.923163850583113e-05, + "loss": 0.6728, + "step": 403 + }, + { + "epoch": 0.6464, + "grad_norm": 0.37212098102703917, + "learning_rate": 5.875881200614207e-05, + "loss": 0.6154, + "step": 404 + }, + { + "epoch": 0.648, + "grad_norm": 0.3886496475370002, + "learning_rate": 5.828709387627218e-05, + "loss": 0.7295, + "step": 405 + }, + { + "epoch": 0.6496, + "grad_norm": 0.37728613039090586, + "learning_rate": 5.781649679379378e-05, + "loss": 0.6951, + "step": 406 + }, + { + "epoch": 0.6512, + "grad_norm": 0.3943481381878099, + "learning_rate": 5.73470334061505e-05, + "loss": 0.6747, + "step": 407 + }, + { + "epoch": 0.6528, + "grad_norm": 0.4231106370387455, + "learning_rate": 5.687871633031754e-05, + "loss": 0.6679, + "step": 408 + }, + { + "epoch": 0.6544, + "grad_norm": 0.4293879013145125, + "learning_rate": 5.6411558152462894e-05, + "loss": 0.7042, + "step": 409 + }, + { + "epoch": 0.656, + "grad_norm": 0.4127396745960478, + "learning_rate": 5.5945571427608526e-05, + "loss": 0.6951, + "step": 410 + }, + { + "epoch": 0.6576, + "grad_norm": 0.43868355880858784, + "learning_rate": 5.54807686792933e-05, + "loss": 0.6889, + "step": 411 + }, + { + "epoch": 0.6592, + "grad_norm": 0.4203498442648587, + "learning_rate": 5.501716239923642e-05, + "loss": 0.7767, + "step": 412 + }, + { + "epoch": 0.6608, + "grad_norm": 0.35312458805384117, + "learning_rate": 5.4554765047001613e-05, + "loss": 0.7057, + "step": 413 + }, + { + "epoch": 0.6624, + "grad_norm": 0.4972641758757012, + "learning_rate": 5.4093589049662175e-05, + "loss": 0.78, + "step": 414 + }, + { + "epoch": 0.664, + "grad_norm": 0.39503459426723275, + "learning_rate": 5.363364680146725e-05, + "loss": 0.6531, + "step": 415 + }, + { + "epoch": 0.6656, + "grad_norm": 0.39455726810213704, + "learning_rate": 5.31749506635086e-05, + "loss": 0.718, + "step": 416 + }, + { + "epoch": 0.6672, + "grad_norm": 0.5168695038424747, + "learning_rate": 5.271751296338823e-05, + "loss": 0.7102, + "step": 417 + }, + { + "epoch": 0.6688, + "grad_norm": 0.3841881282938994, + "learning_rate": 5.226134599488728e-05, + "loss": 0.6487, + "step": 418 + }, + { + "epoch": 0.6704, + "grad_norm": 0.4540275111280096, + "learning_rate": 5.180646201763577e-05, + "loss": 0.7444, + "step": 419 + }, + { + "epoch": 0.672, + "grad_norm": 0.43833529799612947, + "learning_rate": 5.135287325678271e-05, + "loss": 0.7671, + "step": 420 + }, + { + "epoch": 0.6736, + "grad_norm": 0.3945980102733958, + "learning_rate": 5.090059190266779e-05, + "loss": 0.6655, + "step": 421 + }, + { + "epoch": 0.6752, + "grad_norm": 0.4149063985363043, + "learning_rate": 5.0449630110493836e-05, + "loss": 0.7909, + "step": 422 + }, + { + "epoch": 0.6768, + "grad_norm": 0.49822598691621023, + "learning_rate": 5.000000000000002e-05, + "loss": 0.7377, + "step": 423 + }, + { + "epoch": 0.6784, + "grad_norm": 0.5097136232515275, + "learning_rate": 4.955171365513603e-05, + "loss": 0.6614, + "step": 424 + }, + { + "epoch": 0.68, + "grad_norm": 0.4177782773828526, + "learning_rate": 4.9104783123737566e-05, + "loss": 0.7057, + "step": 425 + }, + { + "epoch": 0.6816, + "grad_norm": 0.37177402594667786, + "learning_rate": 4.865922041720239e-05, + "loss": 0.7117, + "step": 426 + }, + { + "epoch": 0.6832, + "grad_norm": 0.3685581881197926, + "learning_rate": 4.821503751016746e-05, + "loss": 0.6873, + "step": 427 + }, + { + "epoch": 0.6848, + "grad_norm": 0.46556776071888967, + "learning_rate": 4.777224634018732e-05, + "loss": 0.7756, + "step": 428 + }, + { + "epoch": 0.6864, + "grad_norm": 0.35266770886849363, + "learning_rate": 4.733085880741301e-05, + "loss": 0.6912, + "step": 429 + }, + { + "epoch": 0.688, + "grad_norm": 0.36369085697142967, + "learning_rate": 4.689088677427249e-05, + "loss": 0.6978, + "step": 430 + }, + { + "epoch": 0.6896, + "grad_norm": 0.43709851863195315, + "learning_rate": 4.645234206515171e-05, + "loss": 0.6757, + "step": 431 + }, + { + "epoch": 0.6912, + "grad_norm": 0.3829946382761662, + "learning_rate": 4.6015236466076747e-05, + "loss": 0.692, + "step": 432 + }, + { + "epoch": 0.6928, + "grad_norm": 0.3497755554871654, + "learning_rate": 4.5579581724397255e-05, + "loss": 0.643, + "step": 433 + }, + { + "epoch": 0.6944, + "grad_norm": 0.3574329631618023, + "learning_rate": 4.514538954847064e-05, + "loss": 0.7318, + "step": 434 + }, + { + "epoch": 0.696, + "grad_norm": 0.3985924935783683, + "learning_rate": 4.471267160734731e-05, + "loss": 0.673, + "step": 435 + }, + { + "epoch": 0.6976, + "grad_norm": 0.39976928781715837, + "learning_rate": 4.428143953045717e-05, + "loss": 0.6978, + "step": 436 + }, + { + "epoch": 0.6992, + "grad_norm": 0.36214527971925986, + "learning_rate": 4.385170490729712e-05, + "loss": 0.7079, + "step": 437 + }, + { + "epoch": 0.7008, + "grad_norm": 0.38255440557898723, + "learning_rate": 4.342347928711953e-05, + "loss": 0.7506, + "step": 438 + }, + { + "epoch": 0.7024, + "grad_norm": 0.4503004187468137, + "learning_rate": 4.2996774178621736e-05, + "loss": 0.6702, + "step": 439 + }, + { + "epoch": 0.704, + "grad_norm": 0.3841330427952996, + "learning_rate": 4.257160104963696e-05, + "loss": 0.7128, + "step": 440 + }, + { + "epoch": 0.7056, + "grad_norm": 0.38175740334910613, + "learning_rate": 4.2147971326825966e-05, + "loss": 0.6739, + "step": 441 + }, + { + "epoch": 0.7072, + "grad_norm": 0.6598999413896037, + "learning_rate": 4.172589639536991e-05, + "loss": 0.7369, + "step": 442 + }, + { + "epoch": 0.7088, + "grad_norm": 0.39589704447950774, + "learning_rate": 4.130538759866457e-05, + "loss": 0.7133, + "step": 443 + }, + { + "epoch": 0.7104, + "grad_norm": 0.4373451771733771, + "learning_rate": 4.088645623801534e-05, + "loss": 0.7405, + "step": 444 + }, + { + "epoch": 0.712, + "grad_norm": 0.41202339742356464, + "learning_rate": 4.046911357233343e-05, + "loss": 0.7241, + "step": 445 + }, + { + "epoch": 0.7136, + "grad_norm": 0.4147292403848051, + "learning_rate": 4.00533708178334e-05, + "loss": 0.7396, + "step": 446 + }, + { + "epoch": 0.7152, + "grad_norm": 0.39756824380394074, + "learning_rate": 3.963923914773187e-05, + "loss": 0.7464, + "step": 447 + }, + { + "epoch": 0.7168, + "grad_norm": 0.39536317275028776, + "learning_rate": 3.922672969194686e-05, + "loss": 0.6789, + "step": 448 + }, + { + "epoch": 0.7184, + "grad_norm": 0.4020658688068476, + "learning_rate": 3.8815853536798904e-05, + "loss": 0.6476, + "step": 449 + }, + { + "epoch": 0.72, + "grad_norm": 0.3852940469110496, + "learning_rate": 3.840662172471315e-05, + "loss": 0.6535, + "step": 450 + }, + { + "epoch": 0.7216, + "grad_norm": 0.37806788144113224, + "learning_rate": 3.79990452539225e-05, + "loss": 0.6732, + "step": 451 + }, + { + "epoch": 0.7232, + "grad_norm": 0.4771061846246495, + "learning_rate": 3.759313507817196e-05, + "loss": 0.7475, + "step": 452 + }, + { + "epoch": 0.7248, + "grad_norm": 0.4137143209238499, + "learning_rate": 3.7188902106424416e-05, + "loss": 0.6498, + "step": 453 + }, + { + "epoch": 0.7264, + "grad_norm": 0.40453503233945426, + "learning_rate": 3.678635720256737e-05, + "loss": 0.6685, + "step": 454 + }, + { + "epoch": 0.728, + "grad_norm": 0.3286796782395469, + "learning_rate": 3.638551118512089e-05, + "loss": 0.635, + "step": 455 + }, + { + "epoch": 0.7296, + "grad_norm": 0.45348754416461634, + "learning_rate": 3.5986374826947066e-05, + "loss": 0.7635, + "step": 456 + }, + { + "epoch": 0.7312, + "grad_norm": 0.34864467592078296, + "learning_rate": 3.558895885496023e-05, + "loss": 0.6743, + "step": 457 + }, + { + "epoch": 0.7328, + "grad_norm": 0.3548118500505629, + "learning_rate": 3.519327394983888e-05, + "loss": 0.6693, + "step": 458 + }, + { + "epoch": 0.7344, + "grad_norm": 0.35704873920277636, + "learning_rate": 3.479933074573858e-05, + "loss": 0.718, + "step": 459 + }, + { + "epoch": 0.736, + "grad_norm": 0.3420733715452814, + "learning_rate": 3.440713983000601e-05, + "loss": 0.6425, + "step": 460 + }, + { + "epoch": 0.7376, + "grad_norm": 0.4488889706342674, + "learning_rate": 3.401671174289469e-05, + "loss": 0.7249, + "step": 461 + }, + { + "epoch": 0.7392, + "grad_norm": 0.44498258061855306, + "learning_rate": 3.362805697728145e-05, + "loss": 0.6507, + "step": 462 + }, + { + "epoch": 0.7408, + "grad_norm": 0.38288433771548464, + "learning_rate": 3.324118597838464e-05, + "loss": 0.7291, + "step": 463 + }, + { + "epoch": 0.7424, + "grad_norm": 0.3925957323413787, + "learning_rate": 3.285610914348332e-05, + "loss": 0.6717, + "step": 464 + }, + { + "epoch": 0.744, + "grad_norm": 0.3856360933870024, + "learning_rate": 3.2472836821637744e-05, + "loss": 0.7038, + "step": 465 + }, + { + "epoch": 0.7456, + "grad_norm": 0.4765791148040302, + "learning_rate": 3.209137931341143e-05, + "loss": 0.6525, + "step": 466 + }, + { + "epoch": 0.7472, + "grad_norm": 0.3910871609045509, + "learning_rate": 3.1711746870594086e-05, + "loss": 0.6281, + "step": 467 + }, + { + "epoch": 0.7488, + "grad_norm": 0.33589947019240074, + "learning_rate": 3.1333949695926324e-05, + "loss": 0.6655, + "step": 468 + }, + { + "epoch": 0.7504, + "grad_norm": 0.3542736538025536, + "learning_rate": 3.0957997942825336e-05, + "loss": 0.6567, + "step": 469 + }, + { + "epoch": 0.752, + "grad_norm": 0.3897033488704761, + "learning_rate": 3.058390171511196e-05, + "loss": 0.6661, + "step": 470 + }, + { + "epoch": 0.7536, + "grad_norm": 0.4349004326943216, + "learning_rate": 3.021167106673928e-05, + "loss": 0.6375, + "step": 471 + }, + { + "epoch": 0.7552, + "grad_norm": 0.36368302331411456, + "learning_rate": 2.9841316001522347e-05, + "loss": 0.6337, + "step": 472 + }, + { + "epoch": 0.7568, + "grad_norm": 0.4864399520526362, + "learning_rate": 2.9472846472869298e-05, + "loss": 0.7209, + "step": 473 + }, + { + "epoch": 0.7584, + "grad_norm": 0.392233826310996, + "learning_rate": 2.9106272383513835e-05, + "loss": 0.6715, + "step": 474 + }, + { + "epoch": 0.76, + "grad_norm": 0.41925245267701783, + "learning_rate": 2.874160358524931e-05, + "loss": 0.7298, + "step": 475 + }, + { + "epoch": 0.7616, + "grad_norm": 0.37658896500636835, + "learning_rate": 2.8378849878663628e-05, + "loss": 0.6181, + "step": 476 + }, + { + "epoch": 0.7632, + "grad_norm": 0.3882127187742822, + "learning_rate": 2.8018021012875994e-05, + "loss": 0.7136, + "step": 477 + }, + { + "epoch": 0.7648, + "grad_norm": 0.4896363428181879, + "learning_rate": 2.7659126685275027e-05, + "loss": 0.7591, + "step": 478 + }, + { + "epoch": 0.7664, + "grad_norm": 0.39284102468656, + "learning_rate": 2.7302176541257986e-05, + "loss": 0.691, + "step": 479 + }, + { + "epoch": 0.768, + "grad_norm": 0.40202454679797983, + "learning_rate": 2.6947180173971508e-05, + "loss": 0.6698, + "step": 480 + }, + { + "epoch": 0.7696, + "grad_norm": 0.41308583418409484, + "learning_rate": 2.659414712405398e-05, + "loss": 0.6387, + "step": 481 + }, + { + "epoch": 0.7712, + "grad_norm": 0.4498832761119042, + "learning_rate": 2.6243086879379e-05, + "loss": 0.6923, + "step": 482 + }, + { + "epoch": 0.7728, + "grad_norm": 0.42557242477883767, + "learning_rate": 2.5894008874800325e-05, + "loss": 0.6601, + "step": 483 + }, + { + "epoch": 0.7744, + "grad_norm": 0.37082914710964937, + "learning_rate": 2.5546922491898495e-05, + "loss": 0.7148, + "step": 484 + }, + { + "epoch": 0.776, + "grad_norm": 0.46399745598658604, + "learning_rate": 2.5201837058728505e-05, + "loss": 0.6933, + "step": 485 + }, + { + "epoch": 0.7776, + "grad_norm": 0.408244883944075, + "learning_rate": 2.485876184956928e-05, + "loss": 0.7124, + "step": 486 + }, + { + "epoch": 0.7792, + "grad_norm": 0.46790313701561737, + "learning_rate": 2.451770608467432e-05, + "loss": 0.7316, + "step": 487 + }, + { + "epoch": 0.7808, + "grad_norm": 0.3954420659794482, + "learning_rate": 2.417867893002387e-05, + "loss": 0.6415, + "step": 488 + }, + { + "epoch": 0.7824, + "grad_norm": 0.5160446694784617, + "learning_rate": 2.3841689497078746e-05, + "loss": 0.7972, + "step": 489 + }, + { + "epoch": 0.784, + "grad_norm": 0.407372719888538, + "learning_rate": 2.3506746842535242e-05, + "loss": 0.6934, + "step": 490 + }, + { + "epoch": 0.7856, + "grad_norm": 0.43382835064241637, + "learning_rate": 2.3173859968081944e-05, + "loss": 0.7157, + "step": 491 + }, + { + "epoch": 0.7872, + "grad_norm": 0.3667955374742887, + "learning_rate": 2.2843037820157675e-05, + "loss": 0.6437, + "step": 492 + }, + { + "epoch": 0.7888, + "grad_norm": 0.36914259464831484, + "learning_rate": 2.251428928971102e-05, + "loss": 0.6657, + "step": 493 + }, + { + "epoch": 0.7904, + "grad_norm": 0.3894458809663143, + "learning_rate": 2.2187623211961562e-05, + "loss": 0.6928, + "step": 494 + }, + { + "epoch": 0.792, + "grad_norm": 0.3899127756187008, + "learning_rate": 2.1863048366162208e-05, + "loss": 0.6363, + "step": 495 + }, + { + "epoch": 0.7936, + "grad_norm": 0.40349842143611453, + "learning_rate": 2.1540573475363402e-05, + "loss": 0.7303, + "step": 496 + }, + { + "epoch": 0.7952, + "grad_norm": 0.5535080070125004, + "learning_rate": 2.1220207206178688e-05, + "loss": 0.785, + "step": 497 + }, + { + "epoch": 0.7968, + "grad_norm": 0.4492622366179551, + "learning_rate": 2.0901958168551638e-05, + "loss": 0.6326, + "step": 498 + }, + { + "epoch": 0.7984, + "grad_norm": 0.4241277805722848, + "learning_rate": 2.058583491552465e-05, + "loss": 0.6273, + "step": 499 + }, + { + "epoch": 0.8, + "grad_norm": 0.36507025077689886, + "learning_rate": 2.027184594300898e-05, + "loss": 0.6909, + "step": 500 + }, + { + "epoch": 0.8016, + "grad_norm": 0.37298271092376206, + "learning_rate": 1.995999968955641e-05, + "loss": 0.6932, + "step": 501 + }, + { + "epoch": 0.8032, + "grad_norm": 0.4567341972329579, + "learning_rate": 1.9650304536132426e-05, + "loss": 0.7119, + "step": 502 + }, + { + "epoch": 0.8048, + "grad_norm": 0.3626668007110665, + "learning_rate": 1.9342768805891178e-05, + "loss": 0.6939, + "step": 503 + }, + { + "epoch": 0.8064, + "grad_norm": 0.39980272585394583, + "learning_rate": 1.903740076395151e-05, + "loss": 0.6755, + "step": 504 + }, + { + "epoch": 0.808, + "grad_norm": 0.47378634820576804, + "learning_rate": 1.8734208617174988e-05, + "loss": 0.6768, + "step": 505 + }, + { + "epoch": 0.8096, + "grad_norm": 0.38966727992559264, + "learning_rate": 1.8433200513945337e-05, + "loss": 0.6876, + "step": 506 + }, + { + "epoch": 0.8112, + "grad_norm": 0.406480465815636, + "learning_rate": 1.8134384543949478e-05, + "loss": 0.708, + "step": 507 + }, + { + "epoch": 0.8128, + "grad_norm": 0.383192054205226, + "learning_rate": 1.783776873795994e-05, + "loss": 0.6526, + "step": 508 + }, + { + "epoch": 0.8144, + "grad_norm": 0.4357434266593055, + "learning_rate": 1.754336106761927e-05, + "loss": 0.7447, + "step": 509 + }, + { + "epoch": 0.816, + "grad_norm": 0.3716881348555911, + "learning_rate": 1.7251169445225657e-05, + "loss": 0.6277, + "step": 510 + }, + { + "epoch": 0.8176, + "grad_norm": 0.431139945516892, + "learning_rate": 1.696120172352025e-05, + "loss": 0.7149, + "step": 511 + }, + { + "epoch": 0.8192, + "grad_norm": 0.34663841079830454, + "learning_rate": 1.6673465695476232e-05, + "loss": 0.617, + "step": 512 + }, + { + "epoch": 0.8208, + "grad_norm": 0.4462568432993696, + "learning_rate": 1.6387969094089316e-05, + "loss": 0.6918, + "step": 513 + }, + { + "epoch": 0.8224, + "grad_norm": 0.3713079668658502, + "learning_rate": 1.6104719592169902e-05, + "loss": 0.6789, + "step": 514 + }, + { + "epoch": 0.824, + "grad_norm": 0.5257320001228583, + "learning_rate": 1.5823724802136865e-05, + "loss": 0.7435, + "step": 515 + }, + { + "epoch": 0.8256, + "grad_norm": 0.42031223753723, + "learning_rate": 1.5544992275813053e-05, + "loss": 0.7457, + "step": 516 + }, + { + "epoch": 0.8272, + "grad_norm": 0.4131206466935399, + "learning_rate": 1.526852950422226e-05, + "loss": 0.6651, + "step": 517 + }, + { + "epoch": 0.8288, + "grad_norm": 0.3632211990047449, + "learning_rate": 1.4994343917387854e-05, + "loss": 0.6186, + "step": 518 + }, + { + "epoch": 0.8304, + "grad_norm": 0.3554901112331569, + "learning_rate": 1.4722442884133214e-05, + "loss": 0.6931, + "step": 519 + }, + { + "epoch": 0.832, + "grad_norm": 0.3863062310695504, + "learning_rate": 1.4452833711883628e-05, + "loss": 0.6086, + "step": 520 + }, + { + "epoch": 0.8336, + "grad_norm": 0.5859591179126187, + "learning_rate": 1.4185523646469822e-05, + "loss": 0.6622, + "step": 521 + }, + { + "epoch": 0.8352, + "grad_norm": 0.34970699829024526, + "learning_rate": 1.3920519871933424e-05, + "loss": 0.6294, + "step": 522 + }, + { + "epoch": 0.8368, + "grad_norm": 0.42096407697177807, + "learning_rate": 1.3657829510333654e-05, + "loss": 0.7137, + "step": 523 + }, + { + "epoch": 0.8384, + "grad_norm": 0.38622205030468887, + "learning_rate": 1.339745962155613e-05, + "loss": 0.7311, + "step": 524 + }, + { + "epoch": 0.84, + "grad_norm": 0.4129989963301976, + "learning_rate": 1.3139417203123027e-05, + "loss": 0.6875, + "step": 525 + }, + { + "epoch": 0.8416, + "grad_norm": 0.48636896826486653, + "learning_rate": 1.2883709190004955e-05, + "loss": 0.7578, + "step": 526 + }, + { + "epoch": 0.8432, + "grad_norm": 0.505134167661199, + "learning_rate": 1.263034245443473e-05, + "loss": 0.7281, + "step": 527 + }, + { + "epoch": 0.8448, + "grad_norm": 0.3736993059804803, + "learning_rate": 1.2379323805722576e-05, + "loss": 0.6995, + "step": 528 + }, + { + "epoch": 0.8464, + "grad_norm": 0.3940804048202612, + "learning_rate": 1.2130659990073146e-05, + "loss": 0.6943, + "step": 529 + }, + { + "epoch": 0.848, + "grad_norm": 0.3990774475655975, + "learning_rate": 1.1884357690404158e-05, + "loss": 0.6637, + "step": 530 + }, + { + "epoch": 0.8496, + "grad_norm": 0.37042381505898464, + "learning_rate": 1.1640423526166988e-05, + "loss": 0.6543, + "step": 531 + }, + { + "epoch": 0.8512, + "grad_norm": 0.40837938280234165, + "learning_rate": 1.1398864053168534e-05, + "loss": 0.6777, + "step": 532 + }, + { + "epoch": 0.8528, + "grad_norm": 0.4043181426273858, + "learning_rate": 1.1159685763395111e-05, + "loss": 0.6471, + "step": 533 + }, + { + "epoch": 0.8544, + "grad_norm": 0.39529515976684354, + "learning_rate": 1.0922895084838037e-05, + "loss": 0.7021, + "step": 534 + }, + { + "epoch": 0.856, + "grad_norm": 0.44449771703885715, + "learning_rate": 1.0688498381320855e-05, + "loss": 0.6829, + "step": 535 + }, + { + "epoch": 0.8576, + "grad_norm": 0.3931107651957264, + "learning_rate": 1.045650195232819e-05, + "loss": 0.7135, + "step": 536 + }, + { + "epoch": 0.8592, + "grad_norm": 0.39318479177043725, + "learning_rate": 1.0226912032836611e-05, + "loss": 0.6756, + "step": 537 + }, + { + "epoch": 0.8608, + "grad_norm": 0.3559067139522861, + "learning_rate": 9.999734793146998e-06, + "loss": 0.6402, + "step": 538 + }, + { + "epoch": 0.8624, + "grad_norm": 0.38875326655337555, + "learning_rate": 9.774976338718677e-06, + "loss": 0.702, + "step": 539 + }, + { + "epoch": 0.864, + "grad_norm": 0.4815111798863259, + "learning_rate": 9.552642710005299e-06, + "loss": 0.6987, + "step": 540 + }, + { + "epoch": 0.8656, + "grad_norm": 0.41201940639359147, + "learning_rate": 9.332739882292752e-06, + "loss": 0.7208, + "step": 541 + }, + { + "epoch": 0.8672, + "grad_norm": 0.3629621830209102, + "learning_rate": 9.115273765538202e-06, + "loss": 0.6705, + "step": 542 + }, + { + "epoch": 0.8688, + "grad_norm": 0.4038965497768723, + "learning_rate": 8.900250204211514e-06, + "loss": 0.6447, + "step": 543 + }, + { + "epoch": 0.8704, + "grad_norm": 0.3762118491410858, + "learning_rate": 8.687674977138116e-06, + "loss": 0.7129, + "step": 544 + }, + { + "epoch": 0.872, + "grad_norm": 0.3562134753536249, + "learning_rate": 8.47755379734373e-06, + "loss": 0.6769, + "step": 545 + }, + { + "epoch": 0.8736, + "grad_norm": 0.4391952855141163, + "learning_rate": 8.269892311900696e-06, + "loss": 0.676, + "step": 546 + }, + { + "epoch": 0.8752, + "grad_norm": 0.43833627573033557, + "learning_rate": 8.064696101776358e-06, + "loss": 0.6823, + "step": 547 + }, + { + "epoch": 0.8768, + "grad_norm": 0.39781394659560937, + "learning_rate": 7.861970681683051e-06, + "loss": 0.6684, + "step": 548 + }, + { + "epoch": 0.8784, + "grad_norm": 0.4078308661940925, + "learning_rate": 7.661721499929753e-06, + "loss": 0.6746, + "step": 549 + }, + { + "epoch": 0.88, + "grad_norm": 0.4049144720844558, + "learning_rate": 7.463953938275858e-06, + "loss": 0.6779, + "step": 550 + }, + { + "epoch": 0.8816, + "grad_norm": 0.3536177694216831, + "learning_rate": 7.2686733117863784e-06, + "loss": 0.6594, + "step": 551 + }, + { + "epoch": 0.8832, + "grad_norm": 0.4077623211082068, + "learning_rate": 7.07588486868922e-06, + "loss": 0.7325, + "step": 552 + }, + { + "epoch": 0.8848, + "grad_norm": 0.4131997194370555, + "learning_rate": 6.8855937902340576e-06, + "loss": 0.7321, + "step": 553 + }, + { + "epoch": 0.8864, + "grad_norm": 0.578312625959774, + "learning_rate": 6.6978051905530855e-06, + "loss": 0.7399, + "step": 554 + }, + { + "epoch": 0.888, + "grad_norm": 0.4083341879476406, + "learning_rate": 6.512524116523633e-06, + "loss": 0.6443, + "step": 555 + }, + { + "epoch": 0.8896, + "grad_norm": 0.43898568804268984, + "learning_rate": 6.329755547632499e-06, + "loss": 0.6969, + "step": 556 + }, + { + "epoch": 0.8912, + "grad_norm": 0.462696237901638, + "learning_rate": 6.149504395842087e-06, + "loss": 0.6729, + "step": 557 + }, + { + "epoch": 0.8928, + "grad_norm": 0.3910910585414048, + "learning_rate": 5.971775505458444e-06, + "loss": 0.6521, + "step": 558 + }, + { + "epoch": 0.8944, + "grad_norm": 0.38550254011439905, + "learning_rate": 5.7965736530010916e-06, + "loss": 0.6647, + "step": 559 + }, + { + "epoch": 0.896, + "grad_norm": 0.38144526706104076, + "learning_rate": 5.623903547074549e-06, + "loss": 0.6762, + "step": 560 + }, + { + "epoch": 0.8976, + "grad_norm": 0.4118249960759194, + "learning_rate": 5.453769828241872e-06, + "loss": 0.6232, + "step": 561 + }, + { + "epoch": 0.8992, + "grad_norm": 0.3615210025976185, + "learning_rate": 5.286177068899989e-06, + "loss": 0.6161, + "step": 562 + }, + { + "epoch": 0.9008, + "grad_norm": 0.4127086872608901, + "learning_rate": 5.121129773156663e-06, + "loss": 0.6476, + "step": 563 + }, + { + "epoch": 0.9024, + "grad_norm": 0.43217465575323877, + "learning_rate": 4.95863237670956e-06, + "loss": 0.6634, + "step": 564 + }, + { + "epoch": 0.904, + "grad_norm": 0.4523386106539392, + "learning_rate": 4.798689246727006e-06, + "loss": 0.7196, + "step": 565 + }, + { + "epoch": 0.9056, + "grad_norm": 0.4025003178681048, + "learning_rate": 4.641304681730641e-06, + "loss": 0.6766, + "step": 566 + }, + { + "epoch": 0.9072, + "grad_norm": 0.39382307042669457, + "learning_rate": 4.486482911479839e-06, + "loss": 0.6541, + "step": 567 + }, + { + "epoch": 0.9088, + "grad_norm": 0.4364109328096091, + "learning_rate": 4.3342280968580285e-06, + "loss": 0.6831, + "step": 568 + }, + { + "epoch": 0.9104, + "grad_norm": 0.424692957408124, + "learning_rate": 4.184544329761009e-06, + "loss": 0.7126, + "step": 569 + }, + { + "epoch": 0.912, + "grad_norm": 0.46133510175309317, + "learning_rate": 4.037435632986786e-06, + "loss": 0.7628, + "step": 570 + }, + { + "epoch": 0.9136, + "grad_norm": 0.4145843808424795, + "learning_rate": 3.892905960127546e-06, + "loss": 0.6789, + "step": 571 + }, + { + "epoch": 0.9152, + "grad_norm": 0.3634617890226472, + "learning_rate": 3.750959195463466e-06, + "loss": 0.6781, + "step": 572 + }, + { + "epoch": 0.9168, + "grad_norm": 0.389987881967966, + "learning_rate": 3.611599153858214e-06, + "loss": 0.717, + "step": 573 + }, + { + "epoch": 0.9184, + "grad_norm": 0.46779116163036566, + "learning_rate": 3.4748295806564356e-06, + "loss": 0.6959, + "step": 574 + }, + { + "epoch": 0.92, + "grad_norm": 0.4602503447808005, + "learning_rate": 3.3406541515832003e-06, + "loss": 0.6954, + "step": 575 + }, + { + "epoch": 0.9216, + "grad_norm": 0.40139265943375085, + "learning_rate": 3.209076472645112e-06, + "loss": 0.7356, + "step": 576 + }, + { + "epoch": 0.9232, + "grad_norm": 0.4251079728922497, + "learning_rate": 3.0801000800333877e-06, + "loss": 0.6926, + "step": 577 + }, + { + "epoch": 0.9248, + "grad_norm": 0.3378013421472393, + "learning_rate": 2.9537284400289355e-06, + "loss": 0.6677, + "step": 578 + }, + { + "epoch": 0.9264, + "grad_norm": 0.5635726805244304, + "learning_rate": 2.8299649489090475e-06, + "loss": 0.7779, + "step": 579 + }, + { + "epoch": 0.928, + "grad_norm": 0.400951001521151, + "learning_rate": 2.708812932856253e-06, + "loss": 0.6798, + "step": 580 + }, + { + "epoch": 0.9296, + "grad_norm": 0.3753513048860841, + "learning_rate": 2.590275647868867e-06, + "loss": 0.6961, + "step": 581 + }, + { + "epoch": 0.9312, + "grad_norm": 0.3576763689201268, + "learning_rate": 2.4743562796734622e-06, + "loss": 0.6694, + "step": 582 + }, + { + "epoch": 0.9328, + "grad_norm": 0.3947477942729083, + "learning_rate": 2.3610579436393e-06, + "loss": 0.6445, + "step": 583 + }, + { + "epoch": 0.9344, + "grad_norm": 0.37178522809350467, + "learning_rate": 2.250383684694579e-06, + "loss": 0.6269, + "step": 584 + }, + { + "epoch": 0.936, + "grad_norm": 0.3468678976188837, + "learning_rate": 2.1423364772445887e-06, + "loss": 0.6385, + "step": 585 + }, + { + "epoch": 0.9376, + "grad_norm": 0.5028190777600934, + "learning_rate": 2.036919225091827e-06, + "loss": 0.637, + "step": 586 + }, + { + "epoch": 0.9392, + "grad_norm": 0.40916482529412307, + "learning_rate": 1.9341347613579087e-06, + "loss": 0.7019, + "step": 587 + }, + { + "epoch": 0.9408, + "grad_norm": 0.4308003991535694, + "learning_rate": 1.8339858484073935e-06, + "loss": 0.6857, + "step": 588 + }, + { + "epoch": 0.9424, + "grad_norm": 0.3962208433985528, + "learning_rate": 1.7364751777736332e-06, + "loss": 0.6895, + "step": 589 + }, + { + "epoch": 0.944, + "grad_norm": 0.4191855317242016, + "learning_rate": 1.6416053700863964e-06, + "loss": 0.6972, + "step": 590 + }, + { + "epoch": 0.9456, + "grad_norm": 0.3605037686158801, + "learning_rate": 1.5493789750014031e-06, + "loss": 0.6957, + "step": 591 + }, + { + "epoch": 0.9472, + "grad_norm": 0.37104812216124583, + "learning_rate": 1.459798471131868e-06, + "loss": 0.6586, + "step": 592 + }, + { + "epoch": 0.9488, + "grad_norm": 0.4917061399707436, + "learning_rate": 1.3728662659818204e-06, + "loss": 0.6588, + "step": 593 + }, + { + "epoch": 0.9504, + "grad_norm": 0.4208216791149571, + "learning_rate": 1.2885846958814673e-06, + "loss": 0.6511, + "step": 594 + }, + { + "epoch": 0.952, + "grad_norm": 0.3498415188467899, + "learning_rate": 1.2069560259243328e-06, + "loss": 0.575, + "step": 595 + }, + { + "epoch": 0.9536, + "grad_norm": 0.42668404532742993, + "learning_rate": 1.1279824499064396e-06, + "loss": 0.723, + "step": 596 + }, + { + "epoch": 0.9552, + "grad_norm": 0.3530768189661101, + "learning_rate": 1.0516660902673448e-06, + "loss": 0.6135, + "step": 597 + }, + { + "epoch": 0.9568, + "grad_norm": 0.37232146648601894, + "learning_rate": 9.780089980330642e-07, + "loss": 0.6602, + "step": 598 + }, + { + "epoch": 0.9584, + "grad_norm": 0.3458193930003985, + "learning_rate": 9.070131527609604e-07, + "loss": 0.7323, + "step": 599 + }, + { + "epoch": 0.96, + "grad_norm": 0.5055609079845153, + "learning_rate": 8.386804624865851e-07, + "loss": 0.7539, + "step": 600 + }, + { + "epoch": 0.9616, + "grad_norm": 0.3779179889601207, + "learning_rate": 7.730127636723539e-07, + "loss": 0.706, + "step": 601 + }, + { + "epoch": 0.9632, + "grad_norm": 0.4853083100791647, + "learning_rate": 7.100118211581852e-07, + "loss": 0.6754, + "step": 602 + }, + { + "epoch": 0.9648, + "grad_norm": 0.43823452893396114, + "learning_rate": 6.496793281141056e-07, + "loss": 0.6982, + "step": 603 + }, + { + "epoch": 0.9664, + "grad_norm": 0.37408287883821567, + "learning_rate": 5.920169059947411e-07, + "loss": 0.6649, + "step": 604 + }, + { + "epoch": 0.968, + "grad_norm": 0.48208652361982396, + "learning_rate": 5.370261044956971e-07, + "loss": 0.7195, + "step": 605 + }, + { + "epoch": 0.9696, + "grad_norm": 0.38707122975730224, + "learning_rate": 4.847084015119574e-07, + "loss": 0.6615, + "step": 606 + }, + { + "epoch": 0.9712, + "grad_norm": 0.3549767879530745, + "learning_rate": 4.3506520309813947e-07, + "loss": 0.6401, + "step": 607 + }, + { + "epoch": 0.9728, + "grad_norm": 0.4459824380757412, + "learning_rate": 3.8809784343072366e-07, + "loss": 0.6886, + "step": 608 + }, + { + "epoch": 0.9744, + "grad_norm": 0.3751741377717446, + "learning_rate": 3.4380758477219333e-07, + "loss": 0.6353, + "step": 609 + }, + { + "epoch": 0.976, + "grad_norm": 0.4888587128899068, + "learning_rate": 3.0219561743707326e-07, + "loss": 0.6977, + "step": 610 + }, + { + "epoch": 0.9776, + "grad_norm": 0.553546441262832, + "learning_rate": 2.6326305976001055e-07, + "loss": 0.7254, + "step": 611 + }, + { + "epoch": 0.9792, + "grad_norm": 0.4598376545696746, + "learning_rate": 2.2701095806565432e-07, + "loss": 0.6825, + "step": 612 + }, + { + "epoch": 0.9808, + "grad_norm": 0.390502964804027, + "learning_rate": 1.9344028664056713e-07, + "loss": 0.6519, + "step": 613 + }, + { + "epoch": 0.9824, + "grad_norm": 0.4077541528941456, + "learning_rate": 1.6255194770704586e-07, + "loss": 0.6666, + "step": 614 + }, + { + "epoch": 0.984, + "grad_norm": 0.41081326066795504, + "learning_rate": 1.3434677139885222e-07, + "loss": 0.6526, + "step": 615 + }, + { + "epoch": 0.9856, + "grad_norm": 0.34632048003078875, + "learning_rate": 1.0882551573891953e-07, + "loss": 0.6367, + "step": 616 + }, + { + "epoch": 0.9872, + "grad_norm": 0.39446972308909783, + "learning_rate": 8.598886661895788e-08, + "loss": 0.6707, + "step": 617 + }, + { + "epoch": 0.9888, + "grad_norm": 0.4391317041949358, + "learning_rate": 6.583743778106887e-08, + "loss": 0.6925, + "step": 618 + }, + { + "epoch": 0.9904, + "grad_norm": 0.42626044736103313, + "learning_rate": 4.837177080119215e-08, + "loss": 0.726, + "step": 619 + }, + { + "epoch": 0.992, + "grad_norm": 0.3745101807288713, + "learning_rate": 3.359233507459481e-08, + "loss": 0.6656, + "step": 620 + }, + { + "epoch": 0.9936, + "grad_norm": 0.36825235507652904, + "learning_rate": 2.1499527803214846e-08, + "loss": 0.6548, + "step": 621 + }, + { + "epoch": 0.9952, + "grad_norm": 0.35790882557698545, + "learning_rate": 1.209367398504746e-08, + "loss": 0.639, + "step": 622 + }, + { + "epoch": 0.9968, + "grad_norm": 0.5101147263030844, + "learning_rate": 5.375026405352035e-09, + "loss": 0.6959, + "step": 623 + }, + { + "epoch": 0.9984, + "grad_norm": 0.3990696683134663, + "learning_rate": 1.3437656298687097e-09, + "loss": 0.7036, + "step": 624 + }, + { + "epoch": 1.0, + "grad_norm": 0.4254641164398092, + "learning_rate": 0.0, + "loss": 0.7022, + "step": 625 + }, + { + "epoch": 1.0, + "step": 625, + "total_flos": 548779134353408.0, + "train_loss": 0.747810267162323, + "train_runtime": 9756.0827, + "train_samples_per_second": 1.025, + "train_steps_per_second": 0.064 + } + ], + "logging_steps": 1.0, + "max_steps": 625, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 548779134353408.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_3_epochs_1_GA_4_lora/README.md b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_3_epochs_1_GA_4_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_3_epochs_1_GA_4_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_3_epochs_1_GA_4_lora/adapter_config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_3_epochs_1_GA_4_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0ab0f7ae56d5d43cc5716a8a91b0d300d3d8791d --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_3_epochs_1_GA_4_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "down_proj", + "q_proj", + "gate_proj", + "o_proj", + "v_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_3_epochs_1_GA_4_lora/adapter_model.safetensors b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_3_epochs_1_GA_4_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..af2aa63a7186a9fcf53846fe5731e09c69510983 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_3_epochs_1_GA_4_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a272106dc68e4b6a96d4784e04d8ff9aeee98af4db7f27b1410290cfa9c1a443 +size 671150064 diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_3_epochs_1_GA_4_lora/config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_3_epochs_1_GA_4_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_3_epochs_1_GA_4_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_3_epochs_1_GA_4_lora/non_lora_trainables.bin b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_3_epochs_1_GA_4_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..2af294c7a63c17bc6403781935c056c754d66c73 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_3_epochs_1_GA_4_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f954291e8362b47f02bdf8c359893689eb2d775e62b6ec84a176fdb86474e51 +size 918507402 diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_3_epochs_1_GA_4_lora/trainer_state.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_3_epochs_1_GA_4_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..231de36c6134f32fb0b237aad0a06d81d492e689 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_10000_repeat_3_epochs_1_GA_4_lora/trainer_state.json @@ -0,0 +1,2226 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9984, + "eval_steps": 500, + "global_step": 312, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0032, + "grad_norm": 0.9799427462773762, + "learning_rate": 2e-05, + "loss": 1.4708, + "step": 1 + }, + { + "epoch": 0.0064, + "grad_norm": 0.9651716724144156, + "learning_rate": 4e-05, + "loss": 1.4713, + "step": 2 + }, + { + "epoch": 0.0096, + "grad_norm": 0.8214609444640415, + "learning_rate": 6e-05, + "loss": 1.3885, + "step": 3 + }, + { + "epoch": 0.0128, + "grad_norm": 0.7555698733537519, + "learning_rate": 8e-05, + "loss": 1.3231, + "step": 4 + }, + { + "epoch": 0.016, + "grad_norm": 0.9791891234053539, + "learning_rate": 0.0001, + "loss": 1.192, + "step": 5 + }, + { + "epoch": 0.0192, + "grad_norm": 1.2500279666276863, + "learning_rate": 0.00012, + "loss": 1.0631, + "step": 6 + }, + { + "epoch": 0.0224, + "grad_norm": 0.8496873478151169, + "learning_rate": 0.00014, + "loss": 1.0561, + "step": 7 + }, + { + "epoch": 0.0256, + "grad_norm": 0.5582296792359828, + "learning_rate": 0.00016, + "loss": 0.9542, + "step": 8 + }, + { + "epoch": 0.0288, + "grad_norm": 0.6141848460525696, + "learning_rate": 0.00018, + "loss": 1.0162, + "step": 9 + }, + { + "epoch": 0.032, + "grad_norm": 0.40565836326113536, + "learning_rate": 0.0002, + "loss": 0.9166, + "step": 10 + }, + { + "epoch": 0.0352, + "grad_norm": 0.43961376645695954, + "learning_rate": 0.00019999458931878073, + "loss": 0.8846, + "step": 11 + }, + { + "epoch": 0.0384, + "grad_norm": 0.42249345100721597, + "learning_rate": 0.0001999783578606323, + "loss": 0.901, + "step": 12 + }, + { + "epoch": 0.0416, + "grad_norm": 0.4997460360846223, + "learning_rate": 0.00019995130738201966, + "loss": 0.9676, + "step": 13 + }, + { + "epoch": 0.0448, + "grad_norm": 0.4378341620615057, + "learning_rate": 0.0001999134408101731, + "loss": 0.8422, + "step": 14 + }, + { + "epoch": 0.048, + "grad_norm": 0.4484500519813879, + "learning_rate": 0.00019986476224277165, + "loss": 0.9325, + "step": 15 + }, + { + "epoch": 0.0512, + "grad_norm": 0.4624548605603959, + "learning_rate": 0.00019980527694749952, + "loss": 0.9258, + "step": 16 + }, + { + "epoch": 0.0544, + "grad_norm": 0.35728208889300955, + "learning_rate": 0.00019973499136147606, + "loss": 0.8349, + "step": 17 + }, + { + "epoch": 0.0576, + "grad_norm": 0.411935507304102, + "learning_rate": 0.0001996539130905593, + "loss": 0.8717, + "step": 18 + }, + { + "epoch": 0.0608, + "grad_norm": 0.3650781718408625, + "learning_rate": 0.0001995620509085228, + "loss": 0.8646, + "step": 19 + }, + { + "epoch": 0.064, + "grad_norm": 0.39246560512723355, + "learning_rate": 0.00019945941475610623, + "loss": 0.8595, + "step": 20 + }, + { + "epoch": 0.0672, + "grad_norm": 0.4030410211829131, + "learning_rate": 0.0001993460157399396, + "loss": 0.8502, + "step": 21 + }, + { + "epoch": 0.0704, + "grad_norm": 0.3837493639747341, + "learning_rate": 0.0001992218661313415, + "loss": 0.9152, + "step": 22 + }, + { + "epoch": 0.0736, + "grad_norm": 0.38342018894888275, + "learning_rate": 0.00019908697936499103, + "loss": 0.8757, + "step": 23 + }, + { + "epoch": 0.0768, + "grad_norm": 0.33016580945906, + "learning_rate": 0.00019894137003747403, + "loss": 0.8253, + "step": 24 + }, + { + "epoch": 0.08, + "grad_norm": 0.3529876724009927, + "learning_rate": 0.00019878505390570362, + "loss": 0.8466, + "step": 25 + }, + { + "epoch": 0.0832, + "grad_norm": 0.4185663760260027, + "learning_rate": 0.00019861804788521493, + "loss": 0.8654, + "step": 26 + }, + { + "epoch": 0.0864, + "grad_norm": 0.33589268180561294, + "learning_rate": 0.00019844037004833473, + "loss": 0.8265, + "step": 27 + }, + { + "epoch": 0.0896, + "grad_norm": 0.3790810727528171, + "learning_rate": 0.00019825203962222572, + "loss": 0.8394, + "step": 28 + }, + { + "epoch": 0.0928, + "grad_norm": 0.33991371443148344, + "learning_rate": 0.0001980530769868059, + "loss": 0.8227, + "step": 29 + }, + { + "epoch": 0.096, + "grad_norm": 0.36628591292131574, + "learning_rate": 0.00019784350367254322, + "loss": 0.849, + "step": 30 + }, + { + "epoch": 0.0992, + "grad_norm": 0.34554827689022766, + "learning_rate": 0.0001976233423581255, + "loss": 0.8071, + "step": 31 + }, + { + "epoch": 0.1024, + "grad_norm": 0.3604273129042498, + "learning_rate": 0.0001973926168680066, + "loss": 0.845, + "step": 32 + }, + { + "epoch": 0.1056, + "grad_norm": 0.33081663785399623, + "learning_rate": 0.00019715135216982798, + "loss": 0.8196, + "step": 33 + }, + { + "epoch": 0.1088, + "grad_norm": 0.3343038694634959, + "learning_rate": 0.0001968995743717171, + "loss": 0.7966, + "step": 34 + }, + { + "epoch": 0.112, + "grad_norm": 0.37088659377199995, + "learning_rate": 0.00019663731071946206, + "loss": 0.8567, + "step": 35 + }, + { + "epoch": 0.1152, + "grad_norm": 0.32460476013111056, + "learning_rate": 0.00019636458959356316, + "loss": 0.8176, + "step": 36 + }, + { + "epoch": 0.1184, + "grad_norm": 0.3165303118335729, + "learning_rate": 0.0001960814405061619, + "loss": 0.7878, + "step": 37 + }, + { + "epoch": 0.1216, + "grad_norm": 0.30565580680683524, + "learning_rate": 0.00019578789409784727, + "loss": 0.778, + "step": 38 + }, + { + "epoch": 0.1248, + "grad_norm": 0.30616328404606236, + "learning_rate": 0.00019548398213434007, + "loss": 0.7669, + "step": 39 + }, + { + "epoch": 0.128, + "grad_norm": 0.3372819522478746, + "learning_rate": 0.00019516973750305532, + "loss": 0.8253, + "step": 40 + }, + { + "epoch": 0.1312, + "grad_norm": 0.3381157784774621, + "learning_rate": 0.00019484519420954354, + "loss": 0.7837, + "step": 41 + }, + { + "epoch": 0.1344, + "grad_norm": 0.37360600798509136, + "learning_rate": 0.00019451038737381077, + "loss": 0.8234, + "step": 42 + }, + { + "epoch": 0.1376, + "grad_norm": 0.33012256207534557, + "learning_rate": 0.00019416535322651818, + "loss": 0.736, + "step": 43 + }, + { + "epoch": 0.1408, + "grad_norm": 0.34389624056846985, + "learning_rate": 0.00019381012910506146, + "loss": 0.8622, + "step": 44 + }, + { + "epoch": 0.144, + "grad_norm": 0.288060189131907, + "learning_rate": 0.00019344475344953012, + "loss": 0.7693, + "step": 45 + }, + { + "epoch": 0.1472, + "grad_norm": 0.31397585630745445, + "learning_rate": 0.00019306926579854821, + "loss": 0.8236, + "step": 46 + }, + { + "epoch": 0.1504, + "grad_norm": 0.3225054745417322, + "learning_rate": 0.00019268370678499533, + "loss": 0.7671, + "step": 47 + }, + { + "epoch": 0.1536, + "grad_norm": 0.32794264489492814, + "learning_rate": 0.0001922881181316097, + "loss": 0.8245, + "step": 48 + }, + { + "epoch": 0.1568, + "grad_norm": 0.33655424519373633, + "learning_rate": 0.00019188254264647337, + "loss": 0.8104, + "step": 49 + }, + { + "epoch": 0.16, + "grad_norm": 0.33508411660895715, + "learning_rate": 0.0001914670242183795, + "loss": 0.8186, + "step": 50 + }, + { + "epoch": 0.1632, + "grad_norm": 0.39563760749889826, + "learning_rate": 0.0001910416078120832, + "loss": 0.7915, + "step": 51 + }, + { + "epoch": 0.1664, + "grad_norm": 0.3265114128373369, + "learning_rate": 0.0001906063394634356, + "loss": 0.7707, + "step": 52 + }, + { + "epoch": 0.1696, + "grad_norm": 0.3157403257540551, + "learning_rate": 0.00019016126627440237, + "loss": 0.7905, + "step": 53 + }, + { + "epoch": 0.1728, + "grad_norm": 0.3431574816357997, + "learning_rate": 0.00018970643640796642, + "loss": 0.7795, + "step": 54 + }, + { + "epoch": 0.176, + "grad_norm": 0.31241247865104216, + "learning_rate": 0.000189241899082916, + "loss": 0.7573, + "step": 55 + }, + { + "epoch": 0.1792, + "grad_norm": 0.31433971480001943, + "learning_rate": 0.00018876770456851877, + "loss": 0.8022, + "step": 56 + }, + { + "epoch": 0.1824, + "grad_norm": 0.3422687306764917, + "learning_rate": 0.0001882839041790818, + "loss": 0.8084, + "step": 57 + }, + { + "epoch": 0.1856, + "grad_norm": 0.3333545008622585, + "learning_rate": 0.00018779055026839868, + "loss": 0.8171, + "step": 58 + }, + { + "epoch": 0.1888, + "grad_norm": 0.31741446992978994, + "learning_rate": 0.00018728769622408423, + "loss": 0.7369, + "step": 59 + }, + { + "epoch": 0.192, + "grad_norm": 0.3542194582461601, + "learning_rate": 0.00018677539646179707, + "loss": 0.7742, + "step": 60 + }, + { + "epoch": 0.1952, + "grad_norm": 0.32353442263225596, + "learning_rate": 0.00018625370641935129, + "loss": 0.7485, + "step": 61 + }, + { + "epoch": 0.1984, + "grad_norm": 0.3414106252962905, + "learning_rate": 0.00018572268255071718, + "loss": 0.8001, + "step": 62 + }, + { + "epoch": 0.2016, + "grad_norm": 0.3267608148988314, + "learning_rate": 0.00018518238231991218, + "loss": 0.7764, + "step": 63 + }, + { + "epoch": 0.2048, + "grad_norm": 0.3367374059916079, + "learning_rate": 0.00018463286419478255, + "loss": 0.7331, + "step": 64 + }, + { + "epoch": 0.208, + "grad_norm": 0.3213805771349721, + "learning_rate": 0.00018407418764067627, + "loss": 0.8088, + "step": 65 + }, + { + "epoch": 0.2112, + "grad_norm": 0.37163104509465206, + "learning_rate": 0.00018350641311400812, + "loss": 0.7812, + "step": 66 + }, + { + "epoch": 0.2144, + "grad_norm": 0.32955187643987766, + "learning_rate": 0.0001829296020557174, + "loss": 0.7802, + "step": 67 + }, + { + "epoch": 0.2176, + "grad_norm": 0.29860414347032455, + "learning_rate": 0.00018234381688461942, + "loss": 0.7319, + "step": 68 + }, + { + "epoch": 0.2208, + "grad_norm": 0.31375066497346393, + "learning_rate": 0.0001817491209906506, + "loss": 0.7866, + "step": 69 + }, + { + "epoch": 0.224, + "grad_norm": 0.3188775915788622, + "learning_rate": 0.00018114557872800905, + "loss": 0.746, + "step": 70 + }, + { + "epoch": 0.2272, + "grad_norm": 0.3260432484403654, + "learning_rate": 0.00018053325540819045, + "loss": 0.8036, + "step": 71 + }, + { + "epoch": 0.2304, + "grad_norm": 0.3155014669529756, + "learning_rate": 0.0001799122172929206, + "loss": 0.7851, + "step": 72 + }, + { + "epoch": 0.2336, + "grad_norm": 0.30508645394256523, + "learning_rate": 0.00017928253158698473, + "loss": 0.7718, + "step": 73 + }, + { + "epoch": 0.2368, + "grad_norm": 0.2923922402971162, + "learning_rate": 0.0001786442664309554, + "loss": 0.7763, + "step": 74 + }, + { + "epoch": 0.24, + "grad_norm": 0.29774460327571245, + "learning_rate": 0.0001779974908938184, + "loss": 0.7716, + "step": 75 + }, + { + "epoch": 0.2432, + "grad_norm": 0.37219652290163485, + "learning_rate": 0.0001773422749654988, + "loss": 0.6968, + "step": 76 + }, + { + "epoch": 0.2464, + "grad_norm": 0.327765713563282, + "learning_rate": 0.00017667868954928694, + "loss": 0.768, + "step": 77 + }, + { + "epoch": 0.2496, + "grad_norm": 0.30579480239653173, + "learning_rate": 0.00017600680645416583, + "loss": 0.7224, + "step": 78 + }, + { + "epoch": 0.2528, + "grad_norm": 0.35549030323617814, + "learning_rate": 0.00017532669838704035, + "loss": 0.7889, + "step": 79 + }, + { + "epoch": 0.256, + "grad_norm": 0.30604310949701513, + "learning_rate": 0.00017463843894486937, + "loss": 0.7344, + "step": 80 + }, + { + "epoch": 0.2592, + "grad_norm": 0.3199842246271541, + "learning_rate": 0.0001739421026067017, + "loss": 0.7592, + "step": 81 + }, + { + "epoch": 0.2624, + "grad_norm": 0.3188264146047309, + "learning_rate": 0.00017323776472561627, + "loss": 0.7187, + "step": 82 + }, + { + "epoch": 0.2656, + "grad_norm": 0.30584368371132103, + "learning_rate": 0.00017252550152056795, + "loss": 0.7595, + "step": 83 + }, + { + "epoch": 0.2688, + "grad_norm": 0.34947093817612324, + "learning_rate": 0.0001718053900681397, + "loss": 0.7917, + "step": 84 + }, + { + "epoch": 0.272, + "grad_norm": 0.3129331216116529, + "learning_rate": 0.00017107750829420176, + "loss": 0.7874, + "step": 85 + }, + { + "epoch": 0.2752, + "grad_norm": 0.32186276541072933, + "learning_rate": 0.00017034193496547902, + "loss": 0.737, + "step": 86 + }, + { + "epoch": 0.2784, + "grad_norm": 0.3206007968349657, + "learning_rate": 0.00016959874968102735, + "loss": 0.6785, + "step": 87 + }, + { + "epoch": 0.2816, + "grad_norm": 0.3440231116205369, + "learning_rate": 0.00016884803286362, + "loss": 0.7705, + "step": 88 + }, + { + "epoch": 0.2848, + "grad_norm": 0.3038874492251255, + "learning_rate": 0.00016808986575104465, + "loss": 0.7337, + "step": 89 + }, + { + "epoch": 0.288, + "grad_norm": 0.3130447119031358, + "learning_rate": 0.00016732433038731242, + "loss": 0.769, + "step": 90 + }, + { + "epoch": 0.2912, + "grad_norm": 0.34626456768156516, + "learning_rate": 0.0001665515096137797, + "loss": 0.7398, + "step": 91 + }, + { + "epoch": 0.2944, + "grad_norm": 0.2930574912319025, + "learning_rate": 0.00016577148706018328, + "loss": 0.7161, + "step": 92 + }, + { + "epoch": 0.2976, + "grad_norm": 0.3077872285049125, + "learning_rate": 0.00016498434713559088, + "loss": 0.7603, + "step": 93 + }, + { + "epoch": 0.3008, + "grad_norm": 0.3276012866868929, + "learning_rate": 0.00016419017501926656, + "loss": 0.7585, + "step": 94 + }, + { + "epoch": 0.304, + "grad_norm": 0.3576543436577549, + "learning_rate": 0.0001633890566514535, + "loss": 0.7851, + "step": 95 + }, + { + "epoch": 0.3072, + "grad_norm": 0.33090749089696053, + "learning_rate": 0.00016258107872407375, + "loss": 0.7873, + "step": 96 + }, + { + "epoch": 0.3104, + "grad_norm": 0.3125775160031755, + "learning_rate": 0.0001617663286713474, + "loss": 0.7721, + "step": 97 + }, + { + "epoch": 0.3136, + "grad_norm": 0.312867542143936, + "learning_rate": 0.00016094489466033043, + "loss": 0.7688, + "step": 98 + }, + { + "epoch": 0.3168, + "grad_norm": 0.33590251445511526, + "learning_rate": 0.00016011686558137448, + "loss": 0.7539, + "step": 99 + }, + { + "epoch": 0.32, + "grad_norm": 0.2973307464223306, + "learning_rate": 0.0001592823310385073, + "loss": 0.7521, + "step": 100 + }, + { + "epoch": 0.3232, + "grad_norm": 0.3152236348770068, + "learning_rate": 0.0001584413813397364, + "loss": 0.7154, + "step": 101 + }, + { + "epoch": 0.3264, + "grad_norm": 0.3113881739746379, + "learning_rate": 0.00015759410748727662, + "loss": 0.7717, + "step": 102 + }, + { + "epoch": 0.3296, + "grad_norm": 0.31332601927708326, + "learning_rate": 0.00015674060116770236, + "loss": 0.7288, + "step": 103 + }, + { + "epoch": 0.3328, + "grad_norm": 0.3397860539559529, + "learning_rate": 0.00015588095474202595, + "loss": 0.7561, + "step": 104 + }, + { + "epoch": 0.336, + "grad_norm": 0.323198798835529, + "learning_rate": 0.00015501526123570277, + "loss": 0.7466, + "step": 105 + }, + { + "epoch": 0.3392, + "grad_norm": 0.30623823493923746, + "learning_rate": 0.00015414361432856475, + "loss": 0.727, + "step": 106 + }, + { + "epoch": 0.3424, + "grad_norm": 0.37285660237595886, + "learning_rate": 0.0001532661083446829, + "loss": 0.7894, + "step": 107 + }, + { + "epoch": 0.3456, + "grad_norm": 0.3262848271857289, + "learning_rate": 0.00015238283824216015, + "loss": 0.7651, + "step": 108 + }, + { + "epoch": 0.3488, + "grad_norm": 0.3620365495484864, + "learning_rate": 0.00015149389960285558, + "loss": 0.7745, + "step": 109 + }, + { + "epoch": 0.352, + "grad_norm": 0.3057218497192761, + "learning_rate": 0.00015059938862204127, + "loss": 0.7385, + "step": 110 + }, + { + "epoch": 0.3552, + "grad_norm": 0.29327000158218175, + "learning_rate": 0.00014969940209799248, + "loss": 0.7058, + "step": 111 + }, + { + "epoch": 0.3584, + "grad_norm": 0.3054427550394877, + "learning_rate": 0.00014879403742151283, + "loss": 0.7432, + "step": 112 + }, + { + "epoch": 0.3616, + "grad_norm": 0.3408694429782927, + "learning_rate": 0.00014788339256539544, + "loss": 0.7762, + "step": 113 + }, + { + "epoch": 0.3648, + "grad_norm": 0.3392121321032056, + "learning_rate": 0.0001469675660738206, + "loss": 0.7593, + "step": 114 + }, + { + "epoch": 0.368, + "grad_norm": 0.3360555533470809, + "learning_rate": 0.00014604665705169237, + "loss": 0.6859, + "step": 115 + }, + { + "epoch": 0.3712, + "grad_norm": 0.3031735858575293, + "learning_rate": 0.00014512076515391375, + "loss": 0.7343, + "step": 116 + }, + { + "epoch": 0.3744, + "grad_norm": 0.3571389466995919, + "learning_rate": 0.00014418999057460276, + "loss": 0.7475, + "step": 117 + }, + { + "epoch": 0.3776, + "grad_norm": 0.31507833917832595, + "learning_rate": 0.0001432544340362501, + "loss": 0.7129, + "step": 118 + }, + { + "epoch": 0.3808, + "grad_norm": 0.349220769192541, + "learning_rate": 0.00014231419677881966, + "loss": 0.7594, + "step": 119 + }, + { + "epoch": 0.384, + "grad_norm": 0.2992587393857802, + "learning_rate": 0.00014136938054879283, + "loss": 0.6826, + "step": 120 + }, + { + "epoch": 0.3872, + "grad_norm": 0.3354232604768008, + "learning_rate": 0.00014042008758815818, + "loss": 0.8029, + "step": 121 + }, + { + "epoch": 0.3904, + "grad_norm": 0.30436303254090713, + "learning_rate": 0.00013946642062334766, + "loss": 0.7381, + "step": 122 + }, + { + "epoch": 0.3936, + "grad_norm": 0.29220299250608445, + "learning_rate": 0.00013850848285411994, + "loss": 0.7514, + "step": 123 + }, + { + "epoch": 0.3968, + "grad_norm": 0.3445834090727366, + "learning_rate": 0.000137546377942393, + "loss": 0.8141, + "step": 124 + }, + { + "epoch": 0.4, + "grad_norm": 0.31361830449707406, + "learning_rate": 0.00013658021000102636, + "loss": 0.7502, + "step": 125 + }, + { + "epoch": 0.4032, + "grad_norm": 0.3012805337810813, + "learning_rate": 0.00013561008358255468, + "loss": 0.7005, + "step": 126 + }, + { + "epoch": 0.4064, + "grad_norm": 0.33491038256274097, + "learning_rate": 0.00013463610366787392, + "loss": 0.7707, + "step": 127 + }, + { + "epoch": 0.4096, + "grad_norm": 0.2928412264150717, + "learning_rate": 0.00013365837565488064, + "loss": 0.7496, + "step": 128 + }, + { + "epoch": 0.4128, + "grad_norm": 0.2973820426225998, + "learning_rate": 0.0001326770053470668, + "loss": 0.7435, + "step": 129 + }, + { + "epoch": 0.416, + "grad_norm": 0.33198600910882076, + "learning_rate": 0.0001316920989420703, + "loss": 0.7374, + "step": 130 + }, + { + "epoch": 0.4192, + "grad_norm": 0.30397289758816454, + "learning_rate": 0.00013070376302018287, + "loss": 0.731, + "step": 131 + }, + { + "epoch": 0.4224, + "grad_norm": 0.3275161096163156, + "learning_rate": 0.00012971210453281674, + "loss": 0.7701, + "step": 132 + }, + { + "epoch": 0.4256, + "grad_norm": 0.30942657238708327, + "learning_rate": 0.000128717230790931, + "loss": 0.7617, + "step": 133 + }, + { + "epoch": 0.4288, + "grad_norm": 0.33949532944889904, + "learning_rate": 0.00012771924945341906, + "loss": 0.7467, + "step": 134 + }, + { + "epoch": 0.432, + "grad_norm": 0.3326053632110846, + "learning_rate": 0.00012671826851545851, + "loss": 0.7319, + "step": 135 + }, + { + "epoch": 0.4352, + "grad_norm": 0.33610166939644226, + "learning_rate": 0.0001257143962968246, + "loss": 0.7343, + "step": 136 + }, + { + "epoch": 0.4384, + "grad_norm": 0.3341282015985498, + "learning_rate": 0.00012470774143016853, + "loss": 0.7447, + "step": 137 + }, + { + "epoch": 0.4416, + "grad_norm": 0.30529532490111727, + "learning_rate": 0.00012369841284926188, + "loss": 0.7017, + "step": 138 + }, + { + "epoch": 0.4448, + "grad_norm": 0.30194530957503607, + "learning_rate": 0.00012268651977720866, + "loss": 0.6978, + "step": 139 + }, + { + "epoch": 0.448, + "grad_norm": 0.30784297599221105, + "learning_rate": 0.00012167217171462566, + "loss": 0.7004, + "step": 140 + }, + { + "epoch": 0.4512, + "grad_norm": 0.30632096068901793, + "learning_rate": 0.0001206554784277931, + "loss": 0.6827, + "step": 141 + }, + { + "epoch": 0.4544, + "grad_norm": 0.33150651532281467, + "learning_rate": 0.00011963654993677645, + "loss": 0.7397, + "step": 142 + }, + { + "epoch": 0.4576, + "grad_norm": 0.3053637242286552, + "learning_rate": 0.00011861549650352069, + "loss": 0.7533, + "step": 143 + }, + { + "epoch": 0.4608, + "grad_norm": 0.3442857177860226, + "learning_rate": 0.00011759242861991855, + "loss": 0.755, + "step": 144 + }, + { + "epoch": 0.464, + "grad_norm": 0.3012900869853622, + "learning_rate": 0.00011656745699585371, + "loss": 0.7232, + "step": 145 + }, + { + "epoch": 0.4672, + "grad_norm": 0.3432431419991961, + "learning_rate": 0.00011554069254722051, + "loss": 0.7128, + "step": 146 + }, + { + "epoch": 0.4704, + "grad_norm": 0.29882942062533513, + "learning_rate": 0.00011451224638392129, + "loss": 0.7693, + "step": 147 + }, + { + "epoch": 0.4736, + "grad_norm": 0.41375178194456913, + "learning_rate": 0.00011348222979784289, + "loss": 0.8417, + "step": 148 + }, + { + "epoch": 0.4768, + "grad_norm": 0.308129486439542, + "learning_rate": 0.00011245075425081328, + "loss": 0.7332, + "step": 149 + }, + { + "epoch": 0.48, + "grad_norm": 0.2807674738390264, + "learning_rate": 0.00011141793136253986, + "loss": 0.7218, + "step": 150 + }, + { + "epoch": 0.4832, + "grad_norm": 0.3078248328225141, + "learning_rate": 0.0001103838728985307, + "loss": 0.6961, + "step": 151 + }, + { + "epoch": 0.4864, + "grad_norm": 0.290270799196629, + "learning_rate": 0.000109348690758, + "loss": 0.7034, + "step": 152 + }, + { + "epoch": 0.4896, + "grad_norm": 0.33081309037600143, + "learning_rate": 0.00010831249696175918, + "loss": 0.6991, + "step": 153 + }, + { + "epoch": 0.4928, + "grad_norm": 0.31433887184833464, + "learning_rate": 0.0001072754036400944, + "loss": 0.7441, + "step": 154 + }, + { + "epoch": 0.496, + "grad_norm": 0.3286954369396406, + "learning_rate": 0.00010623752302063283, + "loss": 0.7037, + "step": 155 + }, + { + "epoch": 0.4992, + "grad_norm": 0.31183061999915007, + "learning_rate": 0.00010519896741619803, + "loss": 0.7096, + "step": 156 + }, + { + "epoch": 0.5024, + "grad_norm": 0.30613772263369227, + "learning_rate": 0.00010415984921265609, + "loss": 0.75, + "step": 157 + }, + { + "epoch": 0.5056, + "grad_norm": 0.29338217061262856, + "learning_rate": 0.00010312028085675391, + "loss": 0.7616, + "step": 158 + }, + { + "epoch": 0.5088, + "grad_norm": 0.38267250553954885, + "learning_rate": 0.00010208037484395114, + "loss": 0.7247, + "step": 159 + }, + { + "epoch": 0.512, + "grad_norm": 0.29705960921210556, + "learning_rate": 0.00010104024370624644, + "loss": 0.7251, + "step": 160 + }, + { + "epoch": 0.5152, + "grad_norm": 0.30418653265151313, + "learning_rate": 0.0001, + "loss": 0.7422, + "step": 161 + }, + { + "epoch": 0.5184, + "grad_norm": 0.2766402451030531, + "learning_rate": 9.895975629375359e-05, + "loss": 0.6737, + "step": 162 + }, + { + "epoch": 0.5216, + "grad_norm": 0.27765442045055555, + "learning_rate": 9.791962515604887e-05, + "loss": 0.7201, + "step": 163 + }, + { + "epoch": 0.5248, + "grad_norm": 0.33469021959761414, + "learning_rate": 9.687971914324607e-05, + "loss": 0.702, + "step": 164 + }, + { + "epoch": 0.528, + "grad_norm": 0.3007870516067576, + "learning_rate": 9.584015078734395e-05, + "loss": 0.716, + "step": 165 + }, + { + "epoch": 0.5312, + "grad_norm": 0.3086653479924654, + "learning_rate": 9.480103258380198e-05, + "loss": 0.7244, + "step": 166 + }, + { + "epoch": 0.5344, + "grad_norm": 0.31949221536573363, + "learning_rate": 9.376247697936719e-05, + "loss": 0.7437, + "step": 167 + }, + { + "epoch": 0.5376, + "grad_norm": 0.31639220416180613, + "learning_rate": 9.272459635990562e-05, + "loss": 0.7519, + "step": 168 + }, + { + "epoch": 0.5408, + "grad_norm": 0.34356161115139805, + "learning_rate": 9.168750303824084e-05, + "loss": 0.697, + "step": 169 + }, + { + "epoch": 0.544, + "grad_norm": 0.29377139329903523, + "learning_rate": 9.065130924199998e-05, + "loss": 0.7182, + "step": 170 + }, + { + "epoch": 0.5472, + "grad_norm": 0.3061399495541095, + "learning_rate": 8.961612710146934e-05, + "loss": 0.698, + "step": 171 + }, + { + "epoch": 0.5504, + "grad_norm": 0.3350824147892866, + "learning_rate": 8.858206863746018e-05, + "loss": 0.7043, + "step": 172 + }, + { + "epoch": 0.5536, + "grad_norm": 0.3162601083552609, + "learning_rate": 8.754924574918675e-05, + "loss": 0.6629, + "step": 173 + }, + { + "epoch": 0.5568, + "grad_norm": 0.29888480684890745, + "learning_rate": 8.651777020215712e-05, + "loss": 0.7368, + "step": 174 + }, + { + "epoch": 0.56, + "grad_norm": 0.36898201481823084, + "learning_rate": 8.548775361607872e-05, + "loss": 0.7991, + "step": 175 + }, + { + "epoch": 0.5632, + "grad_norm": 0.32071719760353296, + "learning_rate": 8.445930745277953e-05, + "loss": 0.7005, + "step": 176 + }, + { + "epoch": 0.5664, + "grad_norm": 0.34119737881585444, + "learning_rate": 8.343254300414628e-05, + "loss": 0.7253, + "step": 177 + }, + { + "epoch": 0.5696, + "grad_norm": 0.3097059116211661, + "learning_rate": 8.240757138008149e-05, + "loss": 0.7352, + "step": 178 + }, + { + "epoch": 0.5728, + "grad_norm": 0.2985655349964369, + "learning_rate": 8.138450349647936e-05, + "loss": 0.6863, + "step": 179 + }, + { + "epoch": 0.576, + "grad_norm": 0.3099795875381943, + "learning_rate": 8.036345006322359e-05, + "loss": 0.7238, + "step": 180 + }, + { + "epoch": 0.5792, + "grad_norm": 0.2960209581200614, + "learning_rate": 7.934452157220694e-05, + "loss": 0.6965, + "step": 181 + }, + { + "epoch": 0.5824, + "grad_norm": 0.3447576819640971, + "learning_rate": 7.832782828537437e-05, + "loss": 0.7548, + "step": 182 + }, + { + "epoch": 0.5856, + "grad_norm": 0.29867157227468866, + "learning_rate": 7.731348022279134e-05, + "loss": 0.672, + "step": 183 + }, + { + "epoch": 0.5888, + "grad_norm": 0.2956036416108182, + "learning_rate": 7.630158715073813e-05, + "loss": 0.7172, + "step": 184 + }, + { + "epoch": 0.592, + "grad_norm": 0.3177288874362381, + "learning_rate": 7.52922585698315e-05, + "loss": 0.7173, + "step": 185 + }, + { + "epoch": 0.5952, + "grad_norm": 0.30018945978996536, + "learning_rate": 7.428560370317542e-05, + "loss": 0.6953, + "step": 186 + }, + { + "epoch": 0.5984, + "grad_norm": 0.30053953471137596, + "learning_rate": 7.328173148454151e-05, + "loss": 0.7147, + "step": 187 + }, + { + "epoch": 0.6016, + "grad_norm": 0.35338506088450466, + "learning_rate": 7.228075054658096e-05, + "loss": 0.7443, + "step": 188 + }, + { + "epoch": 0.6048, + "grad_norm": 0.3080546389515865, + "learning_rate": 7.1282769209069e-05, + "loss": 0.7107, + "step": 189 + }, + { + "epoch": 0.608, + "grad_norm": 0.33340506284983606, + "learning_rate": 7.028789546718326e-05, + "loss": 0.7401, + "step": 190 + }, + { + "epoch": 0.6112, + "grad_norm": 0.30467308149483496, + "learning_rate": 6.929623697981718e-05, + "loss": 0.6992, + "step": 191 + }, + { + "epoch": 0.6144, + "grad_norm": 0.302039988507991, + "learning_rate": 6.830790105792973e-05, + "loss": 0.7113, + "step": 192 + }, + { + "epoch": 0.6176, + "grad_norm": 0.29675341284227075, + "learning_rate": 6.732299465293322e-05, + "loss": 0.7337, + "step": 193 + }, + { + "epoch": 0.6208, + "grad_norm": 0.28326891243017777, + "learning_rate": 6.63416243451194e-05, + "loss": 0.7095, + "step": 194 + }, + { + "epoch": 0.624, + "grad_norm": 0.2841228427758416, + "learning_rate": 6.536389633212609e-05, + "loss": 0.712, + "step": 195 + }, + { + "epoch": 0.6272, + "grad_norm": 0.2791429263057307, + "learning_rate": 6.43899164174453e-05, + "loss": 0.6982, + "step": 196 + }, + { + "epoch": 0.6304, + "grad_norm": 0.33520026689446214, + "learning_rate": 6.341978999897365e-05, + "loss": 0.7683, + "step": 197 + }, + { + "epoch": 0.6336, + "grad_norm": 0.2902002710909738, + "learning_rate": 6.245362205760704e-05, + "loss": 0.7006, + "step": 198 + }, + { + "epoch": 0.6368, + "grad_norm": 0.293451022306754, + "learning_rate": 6.149151714588009e-05, + "loss": 0.6983, + "step": 199 + }, + { + "epoch": 0.64, + "grad_norm": 0.30046635314909986, + "learning_rate": 6.053357937665237e-05, + "loss": 0.6992, + "step": 200 + }, + { + "epoch": 0.6432, + "grad_norm": 0.3233258402376756, + "learning_rate": 5.957991241184184e-05, + "loss": 0.7009, + "step": 201 + }, + { + "epoch": 0.6464, + "grad_norm": 0.2980725039304043, + "learning_rate": 5.863061945120719e-05, + "loss": 0.6507, + "step": 202 + }, + { + "epoch": 0.6496, + "grad_norm": 0.2954458399003553, + "learning_rate": 5.768580322118034e-05, + "loss": 0.7199, + "step": 203 + }, + { + "epoch": 0.6528, + "grad_norm": 0.3570840084902058, + "learning_rate": 5.6745565963749925e-05, + "loss": 0.674, + "step": 204 + }, + { + "epoch": 0.656, + "grad_norm": 0.3101295016619717, + "learning_rate": 5.5810009425397294e-05, + "loss": 0.703, + "step": 205 + }, + { + "epoch": 0.6592, + "grad_norm": 0.30749681984973637, + "learning_rate": 5.487923484608629e-05, + "loss": 0.7338, + "step": 206 + }, + { + "epoch": 0.6624, + "grad_norm": 0.3291093600999409, + "learning_rate": 5.395334294830765e-05, + "loss": 0.746, + "step": 207 + }, + { + "epoch": 0.6656, + "grad_norm": 0.2720958629997114, + "learning_rate": 5.3032433926179395e-05, + "loss": 0.6911, + "step": 208 + }, + { + "epoch": 0.6688, + "grad_norm": 0.30753918488924453, + "learning_rate": 5.211660743460458e-05, + "loss": 0.6842, + "step": 209 + }, + { + "epoch": 0.672, + "grad_norm": 0.3216825680175012, + "learning_rate": 5.1205962578487155e-05, + "loss": 0.7591, + "step": 210 + }, + { + "epoch": 0.6752, + "grad_norm": 0.29659000572707517, + "learning_rate": 5.030059790200756e-05, + "loss": 0.7325, + "step": 211 + }, + { + "epoch": 0.6784, + "grad_norm": 0.3962913645918868, + "learning_rate": 4.940061137795876e-05, + "loss": 0.7051, + "step": 212 + }, + { + "epoch": 0.6816, + "grad_norm": 0.3026672237063192, + "learning_rate": 4.850610039714444e-05, + "loss": 0.7115, + "step": 213 + }, + { + "epoch": 0.6848, + "grad_norm": 0.31177627875961095, + "learning_rate": 4.761716175783989e-05, + "loss": 0.7362, + "step": 214 + }, + { + "epoch": 0.688, + "grad_norm": 0.27031323704065807, + "learning_rate": 4.673389165531714e-05, + "loss": 0.6999, + "step": 215 + }, + { + "epoch": 0.6912, + "grad_norm": 0.3080115048245897, + "learning_rate": 4.585638567143529e-05, + "loss": 0.6881, + "step": 216 + }, + { + "epoch": 0.6944, + "grad_norm": 0.26336675222056166, + "learning_rate": 4.498473876429726e-05, + "loss": 0.6917, + "step": 217 + }, + { + "epoch": 0.6976, + "grad_norm": 0.2977353273084597, + "learning_rate": 4.411904525797408e-05, + "loss": 0.6918, + "step": 218 + }, + { + "epoch": 0.7008, + "grad_norm": 0.2812550180221224, + "learning_rate": 4.325939883229766e-05, + "loss": 0.7328, + "step": 219 + }, + { + "epoch": 0.704, + "grad_norm": 0.2906984244863733, + "learning_rate": 4.240589251272342e-05, + "loss": 0.6946, + "step": 220 + }, + { + "epoch": 0.7072, + "grad_norm": 0.30052517518374744, + "learning_rate": 4.155861866026364e-05, + "loss": 0.7149, + "step": 221 + }, + { + "epoch": 0.7104, + "grad_norm": 0.30688289620855186, + "learning_rate": 4.071766896149273e-05, + "loss": 0.7324, + "step": 222 + }, + { + "epoch": 0.7136, + "grad_norm": 0.3054582615261841, + "learning_rate": 3.988313441862553e-05, + "loss": 0.7339, + "step": 223 + }, + { + "epoch": 0.7168, + "grad_norm": 0.2944328823100095, + "learning_rate": 3.9055105339669595e-05, + "loss": 0.7185, + "step": 224 + }, + { + "epoch": 0.72, + "grad_norm": 0.3079056820382457, + "learning_rate": 3.823367132865265e-05, + "loss": 0.6519, + "step": 225 + }, + { + "epoch": 0.7232, + "grad_norm": 0.30674447305985186, + "learning_rate": 3.741892127592625e-05, + "loss": 0.7145, + "step": 226 + }, + { + "epoch": 0.7264, + "grad_norm": 0.3001577932422797, + "learning_rate": 3.6610943348546526e-05, + "loss": 0.6646, + "step": 227 + }, + { + "epoch": 0.7296, + "grad_norm": 0.27810720745161477, + "learning_rate": 3.580982498073344e-05, + "loss": 0.7027, + "step": 228 + }, + { + "epoch": 0.7328, + "grad_norm": 0.2670136114510044, + "learning_rate": 3.501565286440914e-05, + "loss": 0.6764, + "step": 229 + }, + { + "epoch": 0.736, + "grad_norm": 0.26833863371496064, + "learning_rate": 3.422851293981676e-05, + "loss": 0.6835, + "step": 230 + }, + { + "epoch": 0.7392, + "grad_norm": 0.3232697533624734, + "learning_rate": 3.3448490386220355e-05, + "loss": 0.6975, + "step": 231 + }, + { + "epoch": 0.7424, + "grad_norm": 0.28561088539631885, + "learning_rate": 3.2675669612687565e-05, + "loss": 0.7052, + "step": 232 + }, + { + "epoch": 0.7456, + "grad_norm": 0.2811361506677406, + "learning_rate": 3.191013424895536e-05, + "loss": 0.6856, + "step": 233 + }, + { + "epoch": 0.7488, + "grad_norm": 0.27587824227529845, + "learning_rate": 3.115196713638e-05, + "loss": 0.653, + "step": 234 + }, + { + "epoch": 0.752, + "grad_norm": 0.2727589576041019, + "learning_rate": 3.040125031897264e-05, + "loss": 0.6707, + "step": 235 + }, + { + "epoch": 0.7552, + "grad_norm": 0.2790043641300064, + "learning_rate": 2.9658065034520978e-05, + "loss": 0.6442, + "step": 236 + }, + { + "epoch": 0.7584, + "grad_norm": 0.3114250313351382, + "learning_rate": 2.892249170579826e-05, + "loss": 0.7008, + "step": 237 + }, + { + "epoch": 0.7616, + "grad_norm": 0.28659185546993066, + "learning_rate": 2.8194609931860316e-05, + "loss": 0.6823, + "step": 238 + }, + { + "epoch": 0.7648, + "grad_norm": 0.333270423139455, + "learning_rate": 2.7474498479432087e-05, + "loss": 0.7449, + "step": 239 + }, + { + "epoch": 0.768, + "grad_norm": 0.3015137026076761, + "learning_rate": 2.6762235274383772e-05, + "loss": 0.6864, + "step": 240 + }, + { + "epoch": 0.7712, + "grad_norm": 0.2967636116610985, + "learning_rate": 2.6057897393298324e-05, + "loss": 0.6717, + "step": 241 + }, + { + "epoch": 0.7744, + "grad_norm": 0.29290983244483276, + "learning_rate": 2.536156105513062e-05, + "loss": 0.6942, + "step": 242 + }, + { + "epoch": 0.7776, + "grad_norm": 0.33090797220012547, + "learning_rate": 2.4673301612959654e-05, + "loss": 0.707, + "step": 243 + }, + { + "epoch": 0.7808, + "grad_norm": 0.31240549536269197, + "learning_rate": 2.399319354583418e-05, + "loss": 0.6916, + "step": 244 + }, + { + "epoch": 0.784, + "grad_norm": 0.33318737481985666, + "learning_rate": 2.3321310450713062e-05, + "loss": 0.7472, + "step": 245 + }, + { + "epoch": 0.7872, + "grad_norm": 0.2991242050339852, + "learning_rate": 2.265772503450122e-05, + "loss": 0.6865, + "step": 246 + }, + { + "epoch": 0.7904, + "grad_norm": 0.2869364055447715, + "learning_rate": 2.2002509106181624e-05, + "loss": 0.6853, + "step": 247 + }, + { + "epoch": 0.7936, + "grad_norm": 0.28846836401603, + "learning_rate": 2.1355733569044635e-05, + "loss": 0.6878, + "step": 248 + }, + { + "epoch": 0.7968, + "grad_norm": 0.3751675371753753, + "learning_rate": 2.0717468413015283e-05, + "loss": 0.7167, + "step": 249 + }, + { + "epoch": 0.8, + "grad_norm": 0.2946460631091525, + "learning_rate": 2.008778270707944e-05, + "loss": 0.6625, + "step": 250 + }, + { + "epoch": 0.8032, + "grad_norm": 0.3191164954661919, + "learning_rate": 1.946674459180955e-05, + "loss": 0.7096, + "step": 251 + }, + { + "epoch": 0.8064, + "grad_norm": 0.2795783534552219, + "learning_rate": 1.8854421271990964e-05, + "loss": 0.69, + "step": 252 + }, + { + "epoch": 0.8096, + "grad_norm": 0.29593212497721605, + "learning_rate": 1.8250879009349398e-05, + "loss": 0.6903, + "step": 253 + }, + { + "epoch": 0.8128, + "grad_norm": 0.31815489083177395, + "learning_rate": 1.7656183115380577e-05, + "loss": 0.6864, + "step": 254 + }, + { + "epoch": 0.816, + "grad_norm": 0.2966930688795833, + "learning_rate": 1.707039794428259e-05, + "loss": 0.6927, + "step": 255 + }, + { + "epoch": 0.8192, + "grad_norm": 0.298021896337788, + "learning_rate": 1.649358688599191e-05, + "loss": 0.6759, + "step": 256 + }, + { + "epoch": 0.8224, + "grad_norm": 0.3025964272635615, + "learning_rate": 1.5925812359323745e-05, + "loss": 0.6929, + "step": 257 + }, + { + "epoch": 0.8256, + "grad_norm": 0.3548545029837512, + "learning_rate": 1.5367135805217458e-05, + "loss": 0.7519, + "step": 258 + }, + { + "epoch": 0.8288, + "grad_norm": 0.2849756804123775, + "learning_rate": 1.4817617680087825e-05, + "loss": 0.652, + "step": 259 + }, + { + "epoch": 0.832, + "grad_norm": 0.2737573510609918, + "learning_rate": 1.4277317449282834e-05, + "loss": 0.6571, + "step": 260 + }, + { + "epoch": 0.8352, + "grad_norm": 0.29796463169165993, + "learning_rate": 1.3746293580648717e-05, + "loss": 0.6516, + "step": 261 + }, + { + "epoch": 0.8384, + "grad_norm": 0.3038486218868143, + "learning_rate": 1.3224603538202929e-05, + "loss": 0.7304, + "step": 262 + }, + { + "epoch": 0.8416, + "grad_norm": 0.3283761528592076, + "learning_rate": 1.2712303775915802e-05, + "loss": 0.7299, + "step": 263 + }, + { + "epoch": 0.8448, + "grad_norm": 0.3228101439752998, + "learning_rate": 1.220944973160133e-05, + "loss": 0.7221, + "step": 264 + }, + { + "epoch": 0.848, + "grad_norm": 0.28615791551135733, + "learning_rate": 1.1716095820918216e-05, + "loss": 0.6857, + "step": 265 + }, + { + "epoch": 0.8512, + "grad_norm": 0.2798346418152692, + "learning_rate": 1.1232295431481222e-05, + "loss": 0.67, + "step": 266 + }, + { + "epoch": 0.8544, + "grad_norm": 0.2803680727826389, + "learning_rate": 1.0758100917083991e-05, + "loss": 0.6805, + "step": 267 + }, + { + "epoch": 0.8576, + "grad_norm": 0.300954219778753, + "learning_rate": 1.0293563592033595e-05, + "loss": 0.7037, + "step": 268 + }, + { + "epoch": 0.8608, + "grad_norm": 0.277934902739733, + "learning_rate": 9.838733725597615e-06, + "loss": 0.6681, + "step": 269 + }, + { + "epoch": 0.864, + "grad_norm": 0.31512482967960964, + "learning_rate": 9.393660536564408e-06, + "loss": 0.7092, + "step": 270 + }, + { + "epoch": 0.8672, + "grad_norm": 0.28599544129060733, + "learning_rate": 8.958392187916841e-06, + "loss": 0.7065, + "step": 271 + }, + { + "epoch": 0.8704, + "grad_norm": 0.27602944028769505, + "learning_rate": 8.532975781620512e-06, + "loss": 0.6853, + "step": 272 + }, + { + "epoch": 0.8736, + "grad_norm": 0.3267933089852728, + "learning_rate": 8.117457353526625e-06, + "loss": 0.6858, + "step": 273 + }, + { + "epoch": 0.8768, + "grad_norm": 0.313459679429886, + "learning_rate": 7.711881868390291e-06, + "loss": 0.6839, + "step": 274 + }, + { + "epoch": 0.88, + "grad_norm": 0.3001825550345055, + "learning_rate": 7.3162932150046885e-06, + "loss": 0.6879, + "step": 275 + }, + { + "epoch": 0.8832, + "grad_norm": 0.31637271766164904, + "learning_rate": 6.930734201451816e-06, + "loss": 0.7064, + "step": 276 + }, + { + "epoch": 0.8864, + "grad_norm": 0.31838626264793163, + "learning_rate": 6.555246550469907e-06, + "loss": 0.7455, + "step": 277 + }, + { + "epoch": 0.8896, + "grad_norm": 0.3074064029178275, + "learning_rate": 6.189870894938587e-06, + "loss": 0.6752, + "step": 278 + }, + { + "epoch": 0.8928, + "grad_norm": 0.304961915963322, + "learning_rate": 5.834646773481811e-06, + "loss": 0.6734, + "step": 279 + }, + { + "epoch": 0.896, + "grad_norm": 0.27696421295588336, + "learning_rate": 5.489612626189245e-06, + "loss": 0.682, + "step": 280 + }, + { + "epoch": 0.8992, + "grad_norm": 0.29539939286952366, + "learning_rate": 5.154805790456485e-06, + "loss": 0.6274, + "step": 281 + }, + { + "epoch": 0.9024, + "grad_norm": 0.30824123193852787, + "learning_rate": 4.830262496944693e-06, + "loss": 0.6613, + "step": 282 + }, + { + "epoch": 0.9056, + "grad_norm": 0.30298284813410736, + "learning_rate": 4.516017865659949e-06, + "loss": 0.7081, + "step": 283 + }, + { + "epoch": 0.9088, + "grad_norm": 0.38531225841242817, + "learning_rate": 4.21210590215273e-06, + "loss": 0.6766, + "step": 284 + }, + { + "epoch": 0.912, + "grad_norm": 0.33880034089518074, + "learning_rate": 3.918559493838114e-06, + "loss": 0.7444, + "step": 285 + }, + { + "epoch": 0.9152, + "grad_norm": 0.2866062477084964, + "learning_rate": 3.6354104064368566e-06, + "loss": 0.6876, + "step": 286 + }, + { + "epoch": 0.9184, + "grad_norm": 0.31917192539126965, + "learning_rate": 3.3626892805379562e-06, + "loss": 0.7148, + "step": 287 + }, + { + "epoch": 0.9216, + "grad_norm": 0.33612170964556043, + "learning_rate": 3.100425628282899e-06, + "loss": 0.7244, + "step": 288 + }, + { + "epoch": 0.9248, + "grad_norm": 0.2795209986859653, + "learning_rate": 2.848647830172024e-06, + "loss": 0.6897, + "step": 289 + }, + { + "epoch": 0.928, + "grad_norm": 0.34555833343813835, + "learning_rate": 2.607383131993424e-06, + "loss": 0.7329, + "step": 290 + }, + { + "epoch": 0.9312, + "grad_norm": 0.273768301964365, + "learning_rate": 2.3766576418745022e-06, + "loss": 0.6903, + "step": 291 + }, + { + "epoch": 0.9344, + "grad_norm": 0.3008109477758345, + "learning_rate": 2.1564963274568027e-06, + "loss": 0.6472, + "step": 292 + }, + { + "epoch": 0.9376, + "grad_norm": 0.272451739640845, + "learning_rate": 1.9469230131940907e-06, + "loss": 0.6455, + "step": 293 + }, + { + "epoch": 0.9408, + "grad_norm": 0.30995766008558534, + "learning_rate": 1.7479603777742938e-06, + "loss": 0.7038, + "step": 294 + }, + { + "epoch": 0.944, + "grad_norm": 0.31021019826393165, + "learning_rate": 1.559629951665298e-06, + "loss": 0.7032, + "step": 295 + }, + { + "epoch": 0.9472, + "grad_norm": 0.279226548216852, + "learning_rate": 1.3819521147851123e-06, + "loss": 0.6872, + "step": 296 + }, + { + "epoch": 0.9504, + "grad_norm": 0.2997198731268311, + "learning_rate": 1.2149460942964098e-06, + "loss": 0.6625, + "step": 297 + }, + { + "epoch": 0.9536, + "grad_norm": 0.2895908073868522, + "learning_rate": 1.05862996252597e-06, + "loss": 0.6554, + "step": 298 + }, + { + "epoch": 0.9568, + "grad_norm": 0.26714971443515456, + "learning_rate": 9.130206350089765e-07, + "loss": 0.6454, + "step": 299 + }, + { + "epoch": 0.96, + "grad_norm": 0.33520987293435356, + "learning_rate": 7.781338686584927e-07, + "loss": 0.7539, + "step": 300 + }, + { + "epoch": 0.9632, + "grad_norm": 0.31608583930661005, + "learning_rate": 6.539842600603918e-07, + "loss": 0.7005, + "step": 301 + }, + { + "epoch": 0.9664, + "grad_norm": 0.3033775341477071, + "learning_rate": 5.405852438937764e-07, + "loss": 0.6894, + "step": 302 + }, + { + "epoch": 0.9696, + "grad_norm": 0.32634303773898465, + "learning_rate": 4.3794909147720773e-07, + "loss": 0.7045, + "step": 303 + }, + { + "epoch": 0.9728, + "grad_norm": 0.28748004629627927, + "learning_rate": 3.4608690944071263e-07, + "loss": 0.6712, + "step": 304 + }, + { + "epoch": 0.976, + "grad_norm": 0.31678645523713805, + "learning_rate": 2.6500863852395584e-07, + "loss": 0.6722, + "step": 305 + }, + { + "epoch": 0.9792, + "grad_norm": 0.31000107831499757, + "learning_rate": 1.947230525005006e-07, + "loss": 0.7094, + "step": 306 + }, + { + "epoch": 0.9824, + "grad_norm": 0.2781674506548415, + "learning_rate": 1.3523775722834587e-07, + "loss": 0.6657, + "step": 307 + }, + { + "epoch": 0.9856, + "grad_norm": 0.26616307698787445, + "learning_rate": 8.655918982689581e-08, + "loss": 0.6561, + "step": 308 + }, + { + "epoch": 0.9888, + "grad_norm": 0.3176419427961581, + "learning_rate": 4.8692617980350406e-08, + "loss": 0.6898, + "step": 309 + }, + { + "epoch": 0.992, + "grad_norm": 0.2831916446551315, + "learning_rate": 2.164213936770576e-08, + "loss": 0.7033, + "step": 310 + }, + { + "epoch": 0.9952, + "grad_norm": 0.256439033754437, + "learning_rate": 5.410681219286673e-09, + "loss": 0.656, + "step": 311 + }, + { + "epoch": 0.9984, + "grad_norm": 0.3055519184450505, + "learning_rate": 0.0, + "loss": 0.7088, + "step": 312 + }, + { + "epoch": 0.9984, + "step": 312, + "total_flos": 802271864946688.0, + "train_loss": 0.7511108540571653, + "train_runtime": 9734.5246, + "train_samples_per_second": 1.027, + "train_steps_per_second": 0.032 + } + ], + "logging_steps": 1.0, + "max_steps": 312, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 802271864946688.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_20000_repeat_1_epochs_1_GA_2_lora/README.md b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_20000_repeat_1_epochs_1_GA_2_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_20000_repeat_1_epochs_1_GA_2_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_20000_repeat_1_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_20000_repeat_1_epochs_1_GA_2_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e273ae28008be5e0a49b27d582dd67ebc2779991 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_20000_repeat_1_epochs_1_GA_2_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "o_proj", + "up_proj", + "gate_proj", + "q_proj", + "k_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_20000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_20000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a4a9f958052539812c70a1b002544ce6ffd4f4f7 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_20000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5c6b66af35ec9a9f7103ad1187b72969b06a01aac9ff7e94da2272f6a0f877b +size 671150064 diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_20000_repeat_1_epochs_1_GA_2_lora/config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_20000_repeat_1_epochs_1_GA_2_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_20000_repeat_1_epochs_1_GA_2_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_20000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_20000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..2dcef3fc36b8721ca0fb05c44e38f373b7fbb95b --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_20000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2cf8cc0b1167f652ce9945a63057d9d7de52436c1f5aa0a53e58ca12f88d7d1e +size 918507402 diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_20000_repeat_1_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_20000_repeat_1_epochs_1_GA_2_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..3c6d99a977c1bd0ca7f89bd3f2b82c39adc2e8c5 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_20000_repeat_1_epochs_1_GA_2_lora/trainer_state.json @@ -0,0 +1,8792 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 1250, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0008, + "grad_norm": 1.0374368174554673, + "learning_rate": 5.263157894736842e-06, + "loss": 1.4765, + "step": 1 + }, + { + "epoch": 0.0016, + "grad_norm": 0.9647907368918112, + "learning_rate": 1.0526315789473684e-05, + "loss": 1.5275, + "step": 2 + }, + { + "epoch": 0.0024, + "grad_norm": 1.0687221015355828, + "learning_rate": 1.5789473684210526e-05, + "loss": 1.4583, + "step": 3 + }, + { + "epoch": 0.0032, + "grad_norm": 1.1073894659931591, + "learning_rate": 2.105263157894737e-05, + "loss": 1.5343, + "step": 4 + }, + { + "epoch": 0.004, + "grad_norm": 0.872939750982351, + "learning_rate": 2.6315789473684212e-05, + "loss": 1.4482, + "step": 5 + }, + { + "epoch": 0.0048, + "grad_norm": 0.7827606006770655, + "learning_rate": 3.157894736842105e-05, + "loss": 1.2609, + "step": 6 + }, + { + "epoch": 0.0056, + "grad_norm": 0.8767643959762481, + "learning_rate": 3.6842105263157895e-05, + "loss": 1.1594, + "step": 7 + }, + { + "epoch": 0.0064, + "grad_norm": 0.9400756856208291, + "learning_rate": 4.210526315789474e-05, + "loss": 1.1574, + "step": 8 + }, + { + "epoch": 0.0072, + "grad_norm": 0.8173025011927325, + "learning_rate": 4.736842105263158e-05, + "loss": 1.1197, + "step": 9 + }, + { + "epoch": 0.008, + "grad_norm": 1.024212238400255, + "learning_rate": 5.2631578947368424e-05, + "loss": 0.9813, + "step": 10 + }, + { + "epoch": 0.0088, + "grad_norm": 0.8810021393418611, + "learning_rate": 5.789473684210527e-05, + "loss": 1.0469, + "step": 11 + }, + { + "epoch": 0.0096, + "grad_norm": 0.8402997948170003, + "learning_rate": 6.31578947368421e-05, + "loss": 0.9713, + "step": 12 + }, + { + "epoch": 0.0104, + "grad_norm": 0.8377451249568147, + "learning_rate": 6.842105263157895e-05, + "loss": 0.9829, + "step": 13 + }, + { + "epoch": 0.0112, + "grad_norm": 0.7458242310989486, + "learning_rate": 7.368421052631579e-05, + "loss": 0.9791, + "step": 14 + }, + { + "epoch": 0.012, + "grad_norm": 0.7818629204315781, + "learning_rate": 7.894736842105263e-05, + "loss": 1.0254, + "step": 15 + }, + { + "epoch": 0.0128, + "grad_norm": 0.577254793980875, + "learning_rate": 8.421052631578948e-05, + "loss": 0.9164, + "step": 16 + }, + { + "epoch": 0.0136, + "grad_norm": 0.5196810929009288, + "learning_rate": 8.947368421052632e-05, + "loss": 0.8945, + "step": 17 + }, + { + "epoch": 0.0144, + "grad_norm": 0.5833759720354792, + "learning_rate": 9.473684210526316e-05, + "loss": 0.8972, + "step": 18 + }, + { + "epoch": 0.0152, + "grad_norm": 0.5250700596142646, + "learning_rate": 0.0001, + "loss": 0.8949, + "step": 19 + }, + { + "epoch": 0.016, + "grad_norm": 0.5768087471670845, + "learning_rate": 0.00010526315789473685, + "loss": 0.9445, + "step": 20 + }, + { + "epoch": 0.0168, + "grad_norm": 0.6046200078061305, + "learning_rate": 0.0001105263157894737, + "loss": 0.8924, + "step": 21 + }, + { + "epoch": 0.0176, + "grad_norm": 0.5472188660850493, + "learning_rate": 0.00011578947368421053, + "loss": 0.8678, + "step": 22 + }, + { + "epoch": 0.0184, + "grad_norm": 0.5140068334871463, + "learning_rate": 0.00012105263157894738, + "loss": 0.9087, + "step": 23 + }, + { + "epoch": 0.0192, + "grad_norm": 0.5778034013843734, + "learning_rate": 0.0001263157894736842, + "loss": 0.8774, + "step": 24 + }, + { + "epoch": 0.02, + "grad_norm": 0.5291394827107463, + "learning_rate": 0.00013157894736842108, + "loss": 0.8625, + "step": 25 + }, + { + "epoch": 0.0208, + "grad_norm": 0.5334723147989654, + "learning_rate": 0.0001368421052631579, + "loss": 0.8784, + "step": 26 + }, + { + "epoch": 0.0216, + "grad_norm": 0.6504839292539988, + "learning_rate": 0.00014210526315789474, + "loss": 0.8903, + "step": 27 + }, + { + "epoch": 0.0224, + "grad_norm": 0.4872484533928141, + "learning_rate": 0.00014736842105263158, + "loss": 0.8226, + "step": 28 + }, + { + "epoch": 0.0232, + "grad_norm": 0.5109899744706795, + "learning_rate": 0.00015263157894736845, + "loss": 0.8469, + "step": 29 + }, + { + "epoch": 0.024, + "grad_norm": 0.5177464368126972, + "learning_rate": 0.00015789473684210527, + "loss": 0.8804, + "step": 30 + }, + { + "epoch": 0.0248, + "grad_norm": 0.43544748073661604, + "learning_rate": 0.0001631578947368421, + "loss": 0.8103, + "step": 31 + }, + { + "epoch": 0.0256, + "grad_norm": 0.47195580204436993, + "learning_rate": 0.00016842105263157895, + "loss": 0.8169, + "step": 32 + }, + { + "epoch": 0.0264, + "grad_norm": 0.5172764639850093, + "learning_rate": 0.0001736842105263158, + "loss": 0.8867, + "step": 33 + }, + { + "epoch": 0.0272, + "grad_norm": 0.5374199599281689, + "learning_rate": 0.00017894736842105264, + "loss": 0.8723, + "step": 34 + }, + { + "epoch": 0.028, + "grad_norm": 0.47035134318751337, + "learning_rate": 0.00018421052631578948, + "loss": 0.8921, + "step": 35 + }, + { + "epoch": 0.0288, + "grad_norm": 0.5020079958011039, + "learning_rate": 0.00018947368421052632, + "loss": 0.9284, + "step": 36 + }, + { + "epoch": 0.0296, + "grad_norm": 0.4757829581513172, + "learning_rate": 0.00019473684210526317, + "loss": 0.8465, + "step": 37 + }, + { + "epoch": 0.0304, + "grad_norm": 0.48388800543752086, + "learning_rate": 0.0002, + "loss": 0.8578, + "step": 38 + }, + { + "epoch": 0.0312, + "grad_norm": 0.49506459150484233, + "learning_rate": 0.00019999966405802826, + "loss": 0.8702, + "step": 39 + }, + { + "epoch": 0.032, + "grad_norm": 0.5956315840315756, + "learning_rate": 0.00019999865623437013, + "loss": 0.8123, + "step": 40 + }, + { + "epoch": 0.0328, + "grad_norm": 0.467793612449344, + "learning_rate": 0.00019999697653579705, + "loss": 0.8314, + "step": 41 + }, + { + "epoch": 0.0336, + "grad_norm": 0.5048076691281702, + "learning_rate": 0.00019999462497359466, + "loss": 0.8696, + "step": 42 + }, + { + "epoch": 0.0344, + "grad_norm": 0.40294387287403305, + "learning_rate": 0.0001999916015635627, + "loss": 0.798, + "step": 43 + }, + { + "epoch": 0.0352, + "grad_norm": 0.46854643898417353, + "learning_rate": 0.00019998790632601496, + "loss": 0.8103, + "step": 44 + }, + { + "epoch": 0.036, + "grad_norm": 0.49336650140129507, + "learning_rate": 0.00019998353928577919, + "loss": 0.8102, + "step": 45 + }, + { + "epoch": 0.0368, + "grad_norm": 0.47200531080017466, + "learning_rate": 0.0001999785004721968, + "loss": 0.8095, + "step": 46 + }, + { + "epoch": 0.0376, + "grad_norm": 0.5224134927367048, + "learning_rate": 0.0001999727899191228, + "loss": 0.9128, + "step": 47 + }, + { + "epoch": 0.0384, + "grad_norm": 0.500553057639183, + "learning_rate": 0.00019996640766492543, + "loss": 0.8842, + "step": 48 + }, + { + "epoch": 0.0392, + "grad_norm": 0.5027943918318422, + "learning_rate": 0.00019995935375248606, + "loss": 0.8701, + "step": 49 + }, + { + "epoch": 0.04, + "grad_norm": 0.45907904496011853, + "learning_rate": 0.00019995162822919883, + "loss": 0.7932, + "step": 50 + }, + { + "epoch": 0.0408, + "grad_norm": 0.5887996842150158, + "learning_rate": 0.00019994323114697022, + "loss": 0.8548, + "step": 51 + }, + { + "epoch": 0.0416, + "grad_norm": 0.4401221656214865, + "learning_rate": 0.00019993416256221895, + "loss": 0.7761, + "step": 52 + }, + { + "epoch": 0.0424, + "grad_norm": 0.45742857334650894, + "learning_rate": 0.0001999244225358753, + "loss": 0.8287, + "step": 53 + }, + { + "epoch": 0.0432, + "grad_norm": 0.4594651521945782, + "learning_rate": 0.00019991401113338104, + "loss": 0.8803, + "step": 54 + }, + { + "epoch": 0.044, + "grad_norm": 0.4751273716695581, + "learning_rate": 0.00019990292842468868, + "loss": 0.8337, + "step": 55 + }, + { + "epoch": 0.0448, + "grad_norm": 0.5194707687606167, + "learning_rate": 0.00019989117448426108, + "loss": 0.8136, + "step": 56 + }, + { + "epoch": 0.0456, + "grad_norm": 0.46889516421407623, + "learning_rate": 0.0001998787493910712, + "loss": 0.8388, + "step": 57 + }, + { + "epoch": 0.0464, + "grad_norm": 0.49447907478543895, + "learning_rate": 0.00019986565322860115, + "loss": 0.8274, + "step": 58 + }, + { + "epoch": 0.0472, + "grad_norm": 0.4982392434002533, + "learning_rate": 0.000199851886084842, + "loss": 0.7147, + "step": 59 + }, + { + "epoch": 0.048, + "grad_norm": 0.49653869839949516, + "learning_rate": 0.00019983744805229296, + "loss": 0.813, + "step": 60 + }, + { + "epoch": 0.0488, + "grad_norm": 0.46503031036415227, + "learning_rate": 0.00019982233922796085, + "loss": 0.8348, + "step": 61 + }, + { + "epoch": 0.0496, + "grad_norm": 0.4553314478017029, + "learning_rate": 0.00019980655971335945, + "loss": 0.8459, + "step": 62 + }, + { + "epoch": 0.0504, + "grad_norm": 0.5099452510535164, + "learning_rate": 0.00019979010961450878, + "loss": 0.8437, + "step": 63 + }, + { + "epoch": 0.0512, + "grad_norm": 0.4450900508201653, + "learning_rate": 0.00019977298904193437, + "loss": 0.7594, + "step": 64 + }, + { + "epoch": 0.052, + "grad_norm": 0.5139840949808181, + "learning_rate": 0.00019975519811066663, + "loss": 0.8131, + "step": 65 + }, + { + "epoch": 0.0528, + "grad_norm": 0.4537897303649624, + "learning_rate": 0.00019973673694024, + "loss": 0.8247, + "step": 66 + }, + { + "epoch": 0.0536, + "grad_norm": 0.4793891004260735, + "learning_rate": 0.0001997176056546921, + "loss": 0.7717, + "step": 67 + }, + { + "epoch": 0.0544, + "grad_norm": 0.4601117089424223, + "learning_rate": 0.00019969780438256293, + "loss": 0.8088, + "step": 68 + }, + { + "epoch": 0.0552, + "grad_norm": 0.5067495430657267, + "learning_rate": 0.0001996773332568941, + "loss": 0.8491, + "step": 69 + }, + { + "epoch": 0.056, + "grad_norm": 0.42039937044715653, + "learning_rate": 0.0001996561924152278, + "loss": 0.7833, + "step": 70 + }, + { + "epoch": 0.0568, + "grad_norm": 0.50716358892272, + "learning_rate": 0.00019963438199960599, + "loss": 0.8387, + "step": 71 + }, + { + "epoch": 0.0576, + "grad_norm": 0.4437233808887912, + "learning_rate": 0.0001996119021565693, + "loss": 0.7922, + "step": 72 + }, + { + "epoch": 0.0584, + "grad_norm": 0.49349683185308396, + "learning_rate": 0.00019958875303715615, + "loss": 0.7851, + "step": 73 + }, + { + "epoch": 0.0592, + "grad_norm": 0.4989796856683146, + "learning_rate": 0.0001995649347969019, + "loss": 0.7571, + "step": 74 + }, + { + "epoch": 0.06, + "grad_norm": 0.5592873172818614, + "learning_rate": 0.0001995404475958373, + "loss": 0.8056, + "step": 75 + }, + { + "epoch": 0.0608, + "grad_norm": 0.48178379099074353, + "learning_rate": 0.00019951529159848805, + "loss": 0.8195, + "step": 76 + }, + { + "epoch": 0.0616, + "grad_norm": 0.46034118570648497, + "learning_rate": 0.0001994894669738732, + "loss": 0.7944, + "step": 77 + }, + { + "epoch": 0.0624, + "grad_norm": 0.4792276771943578, + "learning_rate": 0.00019946297389550433, + "loss": 0.7781, + "step": 78 + }, + { + "epoch": 0.0632, + "grad_norm": 0.4905112294899911, + "learning_rate": 0.0001994358125413841, + "loss": 0.8404, + "step": 79 + }, + { + "epoch": 0.064, + "grad_norm": 0.511027253538957, + "learning_rate": 0.00019940798309400526, + "loss": 0.8649, + "step": 80 + }, + { + "epoch": 0.0648, + "grad_norm": 0.4502231765028985, + "learning_rate": 0.0001993794857403495, + "loss": 0.8047, + "step": 81 + }, + { + "epoch": 0.0656, + "grad_norm": 0.4729558428131548, + "learning_rate": 0.0001993503206718859, + "loss": 0.7284, + "step": 82 + }, + { + "epoch": 0.0664, + "grad_norm": 0.551203178138253, + "learning_rate": 0.0001993204880845699, + "loss": 0.9007, + "step": 83 + }, + { + "epoch": 0.0672, + "grad_norm": 0.4955467240801091, + "learning_rate": 0.00019928998817884182, + "loss": 0.8179, + "step": 84 + }, + { + "epoch": 0.068, + "grad_norm": 0.4973043930768317, + "learning_rate": 0.00019925882115962568, + "loss": 0.7517, + "step": 85 + }, + { + "epoch": 0.0688, + "grad_norm": 0.5749059417258057, + "learning_rate": 0.00019922698723632767, + "loss": 0.917, + "step": 86 + }, + { + "epoch": 0.0696, + "grad_norm": 0.5071055341489035, + "learning_rate": 0.00019919448662283478, + "loss": 0.8924, + "step": 87 + }, + { + "epoch": 0.0704, + "grad_norm": 0.46257737471037086, + "learning_rate": 0.00019916131953751342, + "loss": 0.7609, + "step": 88 + }, + { + "epoch": 0.0712, + "grad_norm": 0.44168783280802665, + "learning_rate": 0.00019912748620320794, + "loss": 0.814, + "step": 89 + }, + { + "epoch": 0.072, + "grad_norm": 0.5454741570250863, + "learning_rate": 0.00019909298684723904, + "loss": 0.8656, + "step": 90 + }, + { + "epoch": 0.0728, + "grad_norm": 0.45261852211312303, + "learning_rate": 0.00019905782170140238, + "loss": 0.8285, + "step": 91 + }, + { + "epoch": 0.0736, + "grad_norm": 0.49841285496121795, + "learning_rate": 0.00019902199100196697, + "loss": 0.8088, + "step": 92 + }, + { + "epoch": 0.0744, + "grad_norm": 0.48386523753794336, + "learning_rate": 0.00019898549498967343, + "loss": 0.8061, + "step": 93 + }, + { + "epoch": 0.0752, + "grad_norm": 0.48053831208676606, + "learning_rate": 0.00019894833390973266, + "loss": 0.7484, + "step": 94 + }, + { + "epoch": 0.076, + "grad_norm": 0.4771364223196281, + "learning_rate": 0.000198910508011824, + "loss": 0.811, + "step": 95 + }, + { + "epoch": 0.0768, + "grad_norm": 0.4970171280528449, + "learning_rate": 0.00019887201755009357, + "loss": 0.7632, + "step": 96 + }, + { + "epoch": 0.0776, + "grad_norm": 0.4687893128093611, + "learning_rate": 0.00019883286278315262, + "loss": 0.8255, + "step": 97 + }, + { + "epoch": 0.0784, + "grad_norm": 0.45499599756252357, + "learning_rate": 0.0001987930439740757, + "loss": 0.8242, + "step": 98 + }, + { + "epoch": 0.0792, + "grad_norm": 0.45447226273451646, + "learning_rate": 0.00019875256139039902, + "loss": 0.8082, + "step": 99 + }, + { + "epoch": 0.08, + "grad_norm": 0.47297411650227233, + "learning_rate": 0.00019871141530411853, + "loss": 0.8291, + "step": 100 + }, + { + "epoch": 0.0808, + "grad_norm": 0.40269649053074463, + "learning_rate": 0.00019866960599168826, + "loss": 0.7669, + "step": 101 + }, + { + "epoch": 0.0816, + "grad_norm": 0.5113495244968219, + "learning_rate": 0.0001986271337340182, + "loss": 0.8032, + "step": 102 + }, + { + "epoch": 0.0824, + "grad_norm": 0.4327241663791887, + "learning_rate": 0.0001985839988164726, + "loss": 0.7878, + "step": 103 + }, + { + "epoch": 0.0832, + "grad_norm": 0.5038311447644328, + "learning_rate": 0.00019854020152886814, + "loss": 0.8663, + "step": 104 + }, + { + "epoch": 0.084, + "grad_norm": 0.4849232659450358, + "learning_rate": 0.00019849574216547171, + "loss": 0.801, + "step": 105 + }, + { + "epoch": 0.0848, + "grad_norm": 0.4681459892369315, + "learning_rate": 0.0001984506210249986, + "loss": 0.8459, + "step": 106 + }, + { + "epoch": 0.0856, + "grad_norm": 0.4360285161996867, + "learning_rate": 0.00019840483841061058, + "loss": 0.7529, + "step": 107 + }, + { + "epoch": 0.0864, + "grad_norm": 0.5390379077321039, + "learning_rate": 0.00019835839462991361, + "loss": 0.8735, + "step": 108 + }, + { + "epoch": 0.0872, + "grad_norm": 0.48479802014409695, + "learning_rate": 0.00019831128999495606, + "loss": 0.8477, + "step": 109 + }, + { + "epoch": 0.088, + "grad_norm": 0.4057026106248144, + "learning_rate": 0.00019826352482222638, + "loss": 0.7647, + "step": 110 + }, + { + "epoch": 0.0888, + "grad_norm": 0.44929090068815086, + "learning_rate": 0.0001982150994326511, + "loss": 0.8172, + "step": 111 + }, + { + "epoch": 0.0896, + "grad_norm": 0.5354767704232606, + "learning_rate": 0.00019816601415159263, + "loss": 0.7891, + "step": 112 + }, + { + "epoch": 0.0904, + "grad_norm": 0.48156568074510114, + "learning_rate": 0.0001981162693088471, + "loss": 0.7942, + "step": 113 + }, + { + "epoch": 0.0912, + "grad_norm": 0.4928990146899524, + "learning_rate": 0.0001980658652386421, + "loss": 0.7533, + "step": 114 + }, + { + "epoch": 0.092, + "grad_norm": 0.5409413287368926, + "learning_rate": 0.0001980148022796345, + "loss": 0.7595, + "step": 115 + }, + { + "epoch": 0.0928, + "grad_norm": 0.5430691232147374, + "learning_rate": 0.00019796308077490817, + "loss": 0.7757, + "step": 116 + }, + { + "epoch": 0.0936, + "grad_norm": 0.45766800539177555, + "learning_rate": 0.00019791070107197153, + "loss": 0.7872, + "step": 117 + }, + { + "epoch": 0.0944, + "grad_norm": 0.48224015934935577, + "learning_rate": 0.00019785766352275542, + "loss": 0.7849, + "step": 118 + }, + { + "epoch": 0.0952, + "grad_norm": 0.5128445310359344, + "learning_rate": 0.0001978039684836106, + "loss": 0.8622, + "step": 119 + }, + { + "epoch": 0.096, + "grad_norm": 0.4791435694937396, + "learning_rate": 0.00019774961631530545, + "loss": 0.8156, + "step": 120 + }, + { + "epoch": 0.0968, + "grad_norm": 0.48277627373747767, + "learning_rate": 0.0001976946073830234, + "loss": 0.841, + "step": 121 + }, + { + "epoch": 0.0976, + "grad_norm": 0.435256417777546, + "learning_rate": 0.00019763894205636072, + "loss": 0.833, + "step": 122 + }, + { + "epoch": 0.0984, + "grad_norm": 0.423589901414426, + "learning_rate": 0.00019758262070932375, + "loss": 0.7381, + "step": 123 + }, + { + "epoch": 0.0992, + "grad_norm": 0.5259314780181126, + "learning_rate": 0.00019752564372032657, + "loss": 0.7611, + "step": 124 + }, + { + "epoch": 0.1, + "grad_norm": 0.5738789466383943, + "learning_rate": 0.00019746801147218842, + "loss": 0.84, + "step": 125 + }, + { + "epoch": 0.1008, + "grad_norm": 0.4461538377556106, + "learning_rate": 0.00019740972435213115, + "loss": 0.7562, + "step": 126 + }, + { + "epoch": 0.1016, + "grad_norm": 1.1937291424727432, + "learning_rate": 0.00019735078275177654, + "loss": 0.8755, + "step": 127 + }, + { + "epoch": 0.1024, + "grad_norm": 0.6228342628411248, + "learning_rate": 0.00019729118706714375, + "loss": 0.884, + "step": 128 + }, + { + "epoch": 0.1032, + "grad_norm": 0.6214472402814468, + "learning_rate": 0.00019723093769864663, + "loss": 0.8182, + "step": 129 + }, + { + "epoch": 0.104, + "grad_norm": 0.4557486063355505, + "learning_rate": 0.00019717003505109095, + "loss": 0.7706, + "step": 130 + }, + { + "epoch": 0.1048, + "grad_norm": 0.4812066863375617, + "learning_rate": 0.0001971084795336719, + "loss": 0.8316, + "step": 131 + }, + { + "epoch": 0.1056, + "grad_norm": 0.447456871434426, + "learning_rate": 0.00019704627155997108, + "loss": 0.7411, + "step": 132 + }, + { + "epoch": 0.1064, + "grad_norm": 0.39977666970703024, + "learning_rate": 0.00019698341154795389, + "loss": 0.7433, + "step": 133 + }, + { + "epoch": 0.1072, + "grad_norm": 0.43667315089753594, + "learning_rate": 0.00019691989991996663, + "loss": 0.7684, + "step": 134 + }, + { + "epoch": 0.108, + "grad_norm": 0.4494756970444253, + "learning_rate": 0.00019685573710273376, + "loss": 0.7688, + "step": 135 + }, + { + "epoch": 0.1088, + "grad_norm": 0.49026027382847054, + "learning_rate": 0.0001967909235273549, + "loss": 0.9187, + "step": 136 + }, + { + "epoch": 0.1096, + "grad_norm": 0.47684851425475017, + "learning_rate": 0.00019672545962930215, + "loss": 0.8942, + "step": 137 + }, + { + "epoch": 0.1104, + "grad_norm": 0.45422995643590125, + "learning_rate": 0.00019665934584841682, + "loss": 0.83, + "step": 138 + }, + { + "epoch": 0.1112, + "grad_norm": 0.4020057030148452, + "learning_rate": 0.00019659258262890683, + "loss": 0.7248, + "step": 139 + }, + { + "epoch": 0.112, + "grad_norm": 0.4503058589261113, + "learning_rate": 0.00019652517041934356, + "loss": 0.7878, + "step": 140 + }, + { + "epoch": 0.1128, + "grad_norm": 0.7534723512494698, + "learning_rate": 0.00019645710967265882, + "loss": 0.8465, + "step": 141 + }, + { + "epoch": 0.1136, + "grad_norm": 0.5399174271022059, + "learning_rate": 0.00019638840084614182, + "loss": 0.7768, + "step": 142 + }, + { + "epoch": 0.1144, + "grad_norm": 0.48574380251997074, + "learning_rate": 0.00019631904440143612, + "loss": 0.7802, + "step": 143 + }, + { + "epoch": 0.1152, + "grad_norm": 0.5910788511465598, + "learning_rate": 0.00019624904080453655, + "loss": 0.799, + "step": 144 + }, + { + "epoch": 0.116, + "grad_norm": 0.42498860003951533, + "learning_rate": 0.00019617839052578603, + "loss": 0.7758, + "step": 145 + }, + { + "epoch": 0.1168, + "grad_norm": 0.4348223664639345, + "learning_rate": 0.00019610709403987246, + "loss": 0.7638, + "step": 146 + }, + { + "epoch": 0.1176, + "grad_norm": 0.4422740800075286, + "learning_rate": 0.0001960351518258255, + "loss": 0.7493, + "step": 147 + }, + { + "epoch": 0.1184, + "grad_norm": 0.4358249700796869, + "learning_rate": 0.00019596256436701324, + "loss": 0.7581, + "step": 148 + }, + { + "epoch": 0.1192, + "grad_norm": 0.5218921014346616, + "learning_rate": 0.00019588933215113926, + "loss": 0.7822, + "step": 149 + }, + { + "epoch": 0.12, + "grad_norm": 0.5028643589840232, + "learning_rate": 0.000195815455670239, + "loss": 0.7344, + "step": 150 + }, + { + "epoch": 0.1208, + "grad_norm": 0.49755928745931766, + "learning_rate": 0.00019574093542067673, + "loss": 0.8326, + "step": 151 + }, + { + "epoch": 0.1216, + "grad_norm": 0.568496585050559, + "learning_rate": 0.00019566577190314197, + "loss": 0.8215, + "step": 152 + }, + { + "epoch": 0.1224, + "grad_norm": 0.429493869454417, + "learning_rate": 0.0001955899656226464, + "loss": 0.8248, + "step": 153 + }, + { + "epoch": 0.1232, + "grad_norm": 0.9825550664511603, + "learning_rate": 0.0001955135170885202, + "loss": 0.8457, + "step": 154 + }, + { + "epoch": 0.124, + "grad_norm": 0.5113421864776261, + "learning_rate": 0.0001954364268144088, + "loss": 0.8057, + "step": 155 + }, + { + "epoch": 0.1248, + "grad_norm": 0.5357210696781818, + "learning_rate": 0.00019535869531826937, + "loss": 0.8161, + "step": 156 + }, + { + "epoch": 0.1256, + "grad_norm": 0.5257837365454168, + "learning_rate": 0.00019528032312236736, + "loss": 0.8191, + "step": 157 + }, + { + "epoch": 0.1264, + "grad_norm": 0.6213616122058353, + "learning_rate": 0.00019520131075327298, + "loss": 0.7425, + "step": 158 + }, + { + "epoch": 0.1272, + "grad_norm": 0.444592236051405, + "learning_rate": 0.00019512165874185767, + "loss": 0.7491, + "step": 159 + }, + { + "epoch": 0.128, + "grad_norm": 0.49639411717399873, + "learning_rate": 0.00019504136762329047, + "loss": 0.8507, + "step": 160 + }, + { + "epoch": 0.1288, + "grad_norm": 0.39605949311819466, + "learning_rate": 0.0001949604379370345, + "loss": 0.7163, + "step": 161 + }, + { + "epoch": 0.1296, + "grad_norm": 0.47945044600591147, + "learning_rate": 0.00019487887022684336, + "loss": 0.773, + "step": 162 + }, + { + "epoch": 0.1304, + "grad_norm": 0.4711373930990777, + "learning_rate": 0.00019479666504075736, + "loss": 0.7228, + "step": 163 + }, + { + "epoch": 0.1312, + "grad_norm": 0.4691074164513893, + "learning_rate": 0.00019471382293110003, + "loss": 0.7575, + "step": 164 + }, + { + "epoch": 0.132, + "grad_norm": 0.6643970248014088, + "learning_rate": 0.0001946303444544741, + "loss": 0.8381, + "step": 165 + }, + { + "epoch": 0.1328, + "grad_norm": 0.5492820446210721, + "learning_rate": 0.00019454623017175812, + "loss": 0.8513, + "step": 166 + }, + { + "epoch": 0.1336, + "grad_norm": 0.45992542125359726, + "learning_rate": 0.00019446148064810242, + "loss": 0.8132, + "step": 167 + }, + { + "epoch": 0.1344, + "grad_norm": 0.48259927780873124, + "learning_rate": 0.00019437609645292546, + "loss": 0.7405, + "step": 168 + }, + { + "epoch": 0.1352, + "grad_norm": 0.4157849474201508, + "learning_rate": 0.00019429007815990993, + "loss": 0.7764, + "step": 169 + }, + { + "epoch": 0.136, + "grad_norm": 0.4630117970569246, + "learning_rate": 0.0001942034263469989, + "loss": 0.8203, + "step": 170 + }, + { + "epoch": 0.1368, + "grad_norm": 0.4767300639859073, + "learning_rate": 0.00019411614159639204, + "loss": 0.7438, + "step": 171 + }, + { + "epoch": 0.1376, + "grad_norm": 0.5643052333813844, + "learning_rate": 0.00019402822449454153, + "loss": 0.8728, + "step": 172 + }, + { + "epoch": 0.1384, + "grad_norm": 0.5075287841776309, + "learning_rate": 0.00019393967563214833, + "loss": 0.8214, + "step": 173 + }, + { + "epoch": 0.1392, + "grad_norm": 0.4520406164867501, + "learning_rate": 0.00019385049560415794, + "loss": 0.7665, + "step": 174 + }, + { + "epoch": 0.14, + "grad_norm": 0.4210038006148116, + "learning_rate": 0.00019376068500975667, + "loss": 0.7426, + "step": 175 + }, + { + "epoch": 0.1408, + "grad_norm": 0.6248446036294675, + "learning_rate": 0.00019367024445236754, + "loss": 0.845, + "step": 176 + }, + { + "epoch": 0.1416, + "grad_norm": 0.4643824586786272, + "learning_rate": 0.000193579174539646, + "loss": 0.7389, + "step": 177 + }, + { + "epoch": 0.1424, + "grad_norm": 0.4526319139574308, + "learning_rate": 0.00019348747588347637, + "loss": 0.7876, + "step": 178 + }, + { + "epoch": 0.1432, + "grad_norm": 0.42928338585788617, + "learning_rate": 0.00019339514909996706, + "loss": 0.7615, + "step": 179 + }, + { + "epoch": 0.144, + "grad_norm": 0.55960583171419, + "learning_rate": 0.00019330219480944694, + "loss": 0.8021, + "step": 180 + }, + { + "epoch": 0.1448, + "grad_norm": 0.40208685338228656, + "learning_rate": 0.00019320861363646095, + "loss": 0.7784, + "step": 181 + }, + { + "epoch": 0.1456, + "grad_norm": 0.4726464051961062, + "learning_rate": 0.00019311440620976597, + "loss": 0.7474, + "step": 182 + }, + { + "epoch": 0.1464, + "grad_norm": 0.4847755436931592, + "learning_rate": 0.00019301957316232658, + "loss": 0.7826, + "step": 183 + }, + { + "epoch": 0.1472, + "grad_norm": 0.4591137881167212, + "learning_rate": 0.0001929241151313108, + "loss": 0.8059, + "step": 184 + }, + { + "epoch": 0.148, + "grad_norm": 0.5062833652786924, + "learning_rate": 0.0001928280327580858, + "loss": 0.8268, + "step": 185 + }, + { + "epoch": 0.1488, + "grad_norm": 0.49955099266247305, + "learning_rate": 0.00019273132668821364, + "loss": 0.8585, + "step": 186 + }, + { + "epoch": 0.1496, + "grad_norm": 0.4999065709478267, + "learning_rate": 0.00019263399757144683, + "loss": 0.7069, + "step": 187 + }, + { + "epoch": 0.1504, + "grad_norm": 0.48090141281052196, + "learning_rate": 0.00019253604606172417, + "loss": 0.7943, + "step": 188 + }, + { + "epoch": 0.1512, + "grad_norm": 0.5120636266933695, + "learning_rate": 0.000192437472817166, + "loss": 0.8641, + "step": 189 + }, + { + "epoch": 0.152, + "grad_norm": 0.46396040035960373, + "learning_rate": 0.00019233827850007027, + "loss": 0.7538, + "step": 190 + }, + { + "epoch": 0.1528, + "grad_norm": 0.48099853739089377, + "learning_rate": 0.00019223846377690754, + "loss": 0.8104, + "step": 191 + }, + { + "epoch": 0.1536, + "grad_norm": 0.45113176495385104, + "learning_rate": 0.00019213802931831696, + "loss": 0.7747, + "step": 192 + }, + { + "epoch": 0.1544, + "grad_norm": 0.472301307522119, + "learning_rate": 0.00019203697579910154, + "loss": 0.7552, + "step": 193 + }, + { + "epoch": 0.1552, + "grad_norm": 0.4057783708879821, + "learning_rate": 0.00019193530389822363, + "loss": 0.7048, + "step": 194 + }, + { + "epoch": 0.156, + "grad_norm": 0.467258387306045, + "learning_rate": 0.00019183301429880043, + "loss": 0.7941, + "step": 195 + }, + { + "epoch": 0.1568, + "grad_norm": 0.516487481035125, + "learning_rate": 0.00019173010768809933, + "loss": 0.7853, + "step": 196 + }, + { + "epoch": 0.1576, + "grad_norm": 0.42347807802259363, + "learning_rate": 0.00019162658475753327, + "loss": 0.7649, + "step": 197 + }, + { + "epoch": 0.1584, + "grad_norm": 0.8367495246429115, + "learning_rate": 0.0001915224462026563, + "loss": 0.7083, + "step": 198 + }, + { + "epoch": 0.1592, + "grad_norm": 0.4525969776176922, + "learning_rate": 0.00019141769272315858, + "loss": 0.7624, + "step": 199 + }, + { + "epoch": 0.16, + "grad_norm": 0.4982254354963437, + "learning_rate": 0.00019131232502286188, + "loss": 0.8169, + "step": 200 + }, + { + "epoch": 0.1608, + "grad_norm": 0.4897982009512447, + "learning_rate": 0.00019120634380971496, + "loss": 0.7375, + "step": 201 + }, + { + "epoch": 0.1616, + "grad_norm": 0.4721918519110925, + "learning_rate": 0.0001910997497957885, + "loss": 0.8343, + "step": 202 + }, + { + "epoch": 0.1624, + "grad_norm": 0.4802560936686167, + "learning_rate": 0.0001909925436972706, + "loss": 0.7365, + "step": 203 + }, + { + "epoch": 0.1632, + "grad_norm": 0.532981189533498, + "learning_rate": 0.00019088472623446183, + "loss": 0.8503, + "step": 204 + }, + { + "epoch": 0.164, + "grad_norm": 0.42372376234282055, + "learning_rate": 0.00019077629813177036, + "loss": 0.7674, + "step": 205 + }, + { + "epoch": 0.1648, + "grad_norm": 0.45321557602235424, + "learning_rate": 0.00019066726011770726, + "loss": 0.8055, + "step": 206 + }, + { + "epoch": 0.1656, + "grad_norm": 0.4312975492954434, + "learning_rate": 0.00019055761292488142, + "loss": 0.7738, + "step": 207 + }, + { + "epoch": 0.1664, + "grad_norm": 0.43024914345350995, + "learning_rate": 0.0001904473572899947, + "loss": 0.8008, + "step": 208 + }, + { + "epoch": 0.1672, + "grad_norm": 0.46450573652223054, + "learning_rate": 0.00019033649395383702, + "loss": 0.8204, + "step": 209 + }, + { + "epoch": 0.168, + "grad_norm": 0.4802445996979556, + "learning_rate": 0.00019022502366128135, + "loss": 0.7825, + "step": 210 + }, + { + "epoch": 0.1688, + "grad_norm": 0.5280220525726201, + "learning_rate": 0.00019011294716127867, + "loss": 0.8355, + "step": 211 + }, + { + "epoch": 0.1696, + "grad_norm": 0.4655367627849839, + "learning_rate": 0.00019000026520685302, + "loss": 0.7514, + "step": 212 + }, + { + "epoch": 0.1704, + "grad_norm": 0.4159172606320722, + "learning_rate": 0.0001898869785550963, + "loss": 0.7176, + "step": 213 + }, + { + "epoch": 0.1712, + "grad_norm": 0.4548273843508264, + "learning_rate": 0.0001897730879671634, + "loss": 0.8194, + "step": 214 + }, + { + "epoch": 0.172, + "grad_norm": 0.4942757859153581, + "learning_rate": 0.00018965859420826684, + "loss": 0.7833, + "step": 215 + }, + { + "epoch": 0.1728, + "grad_norm": 0.504896542459169, + "learning_rate": 0.00018954349804767184, + "loss": 0.7522, + "step": 216 + }, + { + "epoch": 0.1736, + "grad_norm": 0.4447487563249888, + "learning_rate": 0.00018942780025869098, + "loss": 0.7616, + "step": 217 + }, + { + "epoch": 0.1744, + "grad_norm": 0.49632307924477864, + "learning_rate": 0.00018931150161867916, + "loss": 0.7939, + "step": 218 + }, + { + "epoch": 0.1752, + "grad_norm": 0.40869032768866065, + "learning_rate": 0.00018919460290902826, + "loss": 0.8287, + "step": 219 + }, + { + "epoch": 0.176, + "grad_norm": 0.4332429743929596, + "learning_rate": 0.00018907710491516199, + "loss": 0.7404, + "step": 220 + }, + { + "epoch": 0.1768, + "grad_norm": 0.43036598063291176, + "learning_rate": 0.0001889590084265304, + "loss": 0.7933, + "step": 221 + }, + { + "epoch": 0.1776, + "grad_norm": 0.41630272039412397, + "learning_rate": 0.0001888403142366049, + "loss": 0.7512, + "step": 222 + }, + { + "epoch": 0.1784, + "grad_norm": 0.5123312220359458, + "learning_rate": 0.0001887210231428727, + "loss": 0.8504, + "step": 223 + }, + { + "epoch": 0.1792, + "grad_norm": 0.5116812124880064, + "learning_rate": 0.00018860113594683148, + "loss": 0.7846, + "step": 224 + }, + { + "epoch": 0.18, + "grad_norm": 0.42195692450016364, + "learning_rate": 0.0001884806534539841, + "loss": 0.7404, + "step": 225 + }, + { + "epoch": 0.1808, + "grad_norm": 0.3700439187134879, + "learning_rate": 0.00018835957647383303, + "loss": 0.6876, + "step": 226 + }, + { + "epoch": 0.1816, + "grad_norm": 0.3834528350465055, + "learning_rate": 0.0001882379058198751, + "loss": 0.7907, + "step": 227 + }, + { + "epoch": 0.1824, + "grad_norm": 0.4616055414126435, + "learning_rate": 0.00018811564230959588, + "loss": 0.8315, + "step": 228 + }, + { + "epoch": 0.1832, + "grad_norm": 0.4983638436350939, + "learning_rate": 0.00018799278676446423, + "loss": 0.7365, + "step": 229 + }, + { + "epoch": 0.184, + "grad_norm": 0.44216589035323284, + "learning_rate": 0.00018786934000992688, + "loss": 0.7756, + "step": 230 + }, + { + "epoch": 0.1848, + "grad_norm": 0.4742933096255244, + "learning_rate": 0.00018774530287540278, + "loss": 0.8325, + "step": 231 + }, + { + "epoch": 0.1856, + "grad_norm": 0.49045810829845043, + "learning_rate": 0.00018762067619427746, + "loss": 0.749, + "step": 232 + }, + { + "epoch": 0.1864, + "grad_norm": 0.5184488152849216, + "learning_rate": 0.00018749546080389757, + "loss": 0.7795, + "step": 233 + }, + { + "epoch": 0.1872, + "grad_norm": 0.500204812288988, + "learning_rate": 0.00018736965754556528, + "loss": 0.7897, + "step": 234 + }, + { + "epoch": 0.188, + "grad_norm": 0.4738561000941789, + "learning_rate": 0.00018724326726453244, + "loss": 0.8309, + "step": 235 + }, + { + "epoch": 0.1888, + "grad_norm": 0.3916407430344318, + "learning_rate": 0.00018711629080999504, + "loss": 0.7133, + "step": 236 + }, + { + "epoch": 0.1896, + "grad_norm": 0.4518339075048928, + "learning_rate": 0.00018698872903508755, + "loss": 0.808, + "step": 237 + }, + { + "epoch": 0.1904, + "grad_norm": 0.3856933180025405, + "learning_rate": 0.00018686058279687698, + "loss": 0.7658, + "step": 238 + }, + { + "epoch": 0.1912, + "grad_norm": 0.4940554727867292, + "learning_rate": 0.0001867318529563574, + "loss": 0.7636, + "step": 239 + }, + { + "epoch": 0.192, + "grad_norm": 0.46136071241676646, + "learning_rate": 0.00018660254037844388, + "loss": 0.7221, + "step": 240 + }, + { + "epoch": 0.1928, + "grad_norm": 0.4321513860869631, + "learning_rate": 0.00018647264593196688, + "loss": 0.7615, + "step": 241 + }, + { + "epoch": 0.1936, + "grad_norm": 0.4206839910688456, + "learning_rate": 0.00018634217048966637, + "loss": 0.7459, + "step": 242 + }, + { + "epoch": 0.1944, + "grad_norm": 0.43391516279938624, + "learning_rate": 0.00018621111492818585, + "loss": 0.7941, + "step": 243 + }, + { + "epoch": 0.1952, + "grad_norm": 0.40858977353454384, + "learning_rate": 0.0001860794801280666, + "loss": 0.769, + "step": 244 + }, + { + "epoch": 0.196, + "grad_norm": 0.4349779920862305, + "learning_rate": 0.00018594726697374175, + "loss": 0.6979, + "step": 245 + }, + { + "epoch": 0.1968, + "grad_norm": 0.45598470923578976, + "learning_rate": 0.0001858144763535302, + "loss": 0.7796, + "step": 246 + }, + { + "epoch": 0.1976, + "grad_norm": 0.4665716149736225, + "learning_rate": 0.0001856811091596308, + "loss": 0.7878, + "step": 247 + }, + { + "epoch": 0.1984, + "grad_norm": 0.4463411278050276, + "learning_rate": 0.0001855471662881164, + "loss": 0.8049, + "step": 248 + }, + { + "epoch": 0.1992, + "grad_norm": 0.5077164997106266, + "learning_rate": 0.00018541264863892754, + "loss": 0.7728, + "step": 249 + }, + { + "epoch": 0.2, + "grad_norm": 0.498872148590993, + "learning_rate": 0.00018527755711586678, + "loss": 0.7527, + "step": 250 + }, + { + "epoch": 0.2008, + "grad_norm": 0.4826197747486357, + "learning_rate": 0.00018514189262659235, + "loss": 0.7856, + "step": 251 + }, + { + "epoch": 0.2016, + "grad_norm": 0.42919062355595305, + "learning_rate": 0.00018500565608261214, + "loss": 0.8277, + "step": 252 + }, + { + "epoch": 0.2024, + "grad_norm": 0.41782538449347645, + "learning_rate": 0.00018486884839927768, + "loss": 0.6717, + "step": 253 + }, + { + "epoch": 0.2032, + "grad_norm": 0.42784641824086067, + "learning_rate": 0.00018473147049577774, + "loss": 0.7196, + "step": 254 + }, + { + "epoch": 0.204, + "grad_norm": 0.4512419784378731, + "learning_rate": 0.0001845935232951325, + "loss": 0.7424, + "step": 255 + }, + { + "epoch": 0.2048, + "grad_norm": 0.4154593646623652, + "learning_rate": 0.00018445500772418697, + "loss": 0.7489, + "step": 256 + }, + { + "epoch": 0.2056, + "grad_norm": 0.4384153798043128, + "learning_rate": 0.00018431592471360503, + "loss": 0.7495, + "step": 257 + }, + { + "epoch": 0.2064, + "grad_norm": 0.408933173821226, + "learning_rate": 0.00018417627519786315, + "loss": 0.7471, + "step": 258 + }, + { + "epoch": 0.2072, + "grad_norm": 0.4466848371228825, + "learning_rate": 0.000184036060115244, + "loss": 0.7023, + "step": 259 + }, + { + "epoch": 0.208, + "grad_norm": 0.5676238878157918, + "learning_rate": 0.00018389528040783012, + "loss": 0.8444, + "step": 260 + }, + { + "epoch": 0.2088, + "grad_norm": 0.49940950060271694, + "learning_rate": 0.00018375393702149787, + "loss": 0.8093, + "step": 261 + }, + { + "epoch": 0.2096, + "grad_norm": 0.411486098422595, + "learning_rate": 0.00018361203090591071, + "loss": 0.7161, + "step": 262 + }, + { + "epoch": 0.2104, + "grad_norm": 0.37889999469972857, + "learning_rate": 0.00018346956301451304, + "loss": 0.7245, + "step": 263 + }, + { + "epoch": 0.2112, + "grad_norm": 0.4288283798698304, + "learning_rate": 0.00018332653430452376, + "loss": 0.7069, + "step": 264 + }, + { + "epoch": 0.212, + "grad_norm": 0.44383544195252433, + "learning_rate": 0.00018318294573692985, + "loss": 0.7502, + "step": 265 + }, + { + "epoch": 0.2128, + "grad_norm": 0.48086052555298, + "learning_rate": 0.00018303879827647975, + "loss": 0.7936, + "step": 266 + }, + { + "epoch": 0.2136, + "grad_norm": 0.3957322973294756, + "learning_rate": 0.0001828940928916772, + "loss": 0.7021, + "step": 267 + }, + { + "epoch": 0.2144, + "grad_norm": 0.4400708983479737, + "learning_rate": 0.00018274883055477436, + "loss": 0.7493, + "step": 268 + }, + { + "epoch": 0.2152, + "grad_norm": 0.5131851425434888, + "learning_rate": 0.00018260301224176558, + "loss": 0.7989, + "step": 269 + }, + { + "epoch": 0.216, + "grad_norm": 0.4445777972230032, + "learning_rate": 0.00018245663893238075, + "loss": 0.8149, + "step": 270 + }, + { + "epoch": 0.2168, + "grad_norm": 0.4081132952903988, + "learning_rate": 0.00018230971161007853, + "loss": 0.7559, + "step": 271 + }, + { + "epoch": 0.2176, + "grad_norm": 0.40312914828746943, + "learning_rate": 0.00018216223126204007, + "loss": 0.7255, + "step": 272 + }, + { + "epoch": 0.2184, + "grad_norm": 0.4993878111709934, + "learning_rate": 0.00018201419887916214, + "loss": 0.8224, + "step": 273 + }, + { + "epoch": 0.2192, + "grad_norm": 0.4383548287016405, + "learning_rate": 0.00018186561545605054, + "loss": 0.7286, + "step": 274 + }, + { + "epoch": 0.22, + "grad_norm": 0.44567548374122995, + "learning_rate": 0.00018171648199101346, + "loss": 0.7281, + "step": 275 + }, + { + "epoch": 0.2208, + "grad_norm": 0.4292248672579036, + "learning_rate": 0.00018156679948605467, + "loss": 0.7653, + "step": 276 + }, + { + "epoch": 0.2216, + "grad_norm": 0.4955153306282761, + "learning_rate": 0.00018141656894686689, + "loss": 0.8314, + "step": 277 + }, + { + "epoch": 0.2224, + "grad_norm": 0.5094574028346628, + "learning_rate": 0.00018126579138282503, + "loss": 0.8197, + "step": 278 + }, + { + "epoch": 0.2232, + "grad_norm": 0.4502767303537129, + "learning_rate": 0.00018111446780697929, + "loss": 0.771, + "step": 279 + }, + { + "epoch": 0.224, + "grad_norm": 0.4316637178021228, + "learning_rate": 0.0001809625992360485, + "loss": 0.728, + "step": 280 + }, + { + "epoch": 0.2248, + "grad_norm": 0.5303243108120476, + "learning_rate": 0.00018081018669041324, + "loss": 0.7863, + "step": 281 + }, + { + "epoch": 0.2256, + "grad_norm": 0.3893145252905154, + "learning_rate": 0.00018065723119410884, + "loss": 0.7195, + "step": 282 + }, + { + "epoch": 0.2264, + "grad_norm": 0.4790203255564661, + "learning_rate": 0.00018050373377481878, + "loss": 0.7858, + "step": 283 + }, + { + "epoch": 0.2272, + "grad_norm": 0.44061721980349183, + "learning_rate": 0.00018034969546386757, + "loss": 0.7219, + "step": 284 + }, + { + "epoch": 0.228, + "grad_norm": 0.49032305188970193, + "learning_rate": 0.0001801951172962139, + "loss": 0.7501, + "step": 285 + }, + { + "epoch": 0.2288, + "grad_norm": 0.46994437658774807, + "learning_rate": 0.0001800400003104436, + "loss": 0.7514, + "step": 286 + }, + { + "epoch": 0.2296, + "grad_norm": 0.43599573839129335, + "learning_rate": 0.0001798843455487629, + "loss": 0.7232, + "step": 287 + }, + { + "epoch": 0.2304, + "grad_norm": 0.4968621506667461, + "learning_rate": 0.00017972815405699103, + "loss": 0.7856, + "step": 288 + }, + { + "epoch": 0.2312, + "grad_norm": 0.5334967293115999, + "learning_rate": 0.00017957142688455362, + "loss": 0.8355, + "step": 289 + }, + { + "epoch": 0.232, + "grad_norm": 0.5049633275191162, + "learning_rate": 0.00017941416508447536, + "loss": 0.7993, + "step": 290 + }, + { + "epoch": 0.2328, + "grad_norm": 0.5226946501993106, + "learning_rate": 0.00017925636971337304, + "loss": 0.7614, + "step": 291 + }, + { + "epoch": 0.2336, + "grad_norm": 0.46031635923303504, + "learning_rate": 0.0001790980418314484, + "loss": 0.8235, + "step": 292 + }, + { + "epoch": 0.2344, + "grad_norm": 0.47470104995479906, + "learning_rate": 0.00017893918250248104, + "loss": 0.7683, + "step": 293 + }, + { + "epoch": 0.2352, + "grad_norm": 0.420096335266906, + "learning_rate": 0.00017877979279382135, + "loss": 0.7614, + "step": 294 + }, + { + "epoch": 0.236, + "grad_norm": 0.5307529328489271, + "learning_rate": 0.00017861987377638312, + "loss": 0.7344, + "step": 295 + }, + { + "epoch": 0.2368, + "grad_norm": 0.5531361713545803, + "learning_rate": 0.0001784594265246366, + "loss": 0.7952, + "step": 296 + }, + { + "epoch": 0.2376, + "grad_norm": 0.4379379434223826, + "learning_rate": 0.0001782984521166011, + "loss": 0.7881, + "step": 297 + }, + { + "epoch": 0.2384, + "grad_norm": 0.47042976585887974, + "learning_rate": 0.0001781369516338378, + "loss": 0.7343, + "step": 298 + }, + { + "epoch": 0.2392, + "grad_norm": 0.3992671800588768, + "learning_rate": 0.00017797492616144256, + "loss": 0.6851, + "step": 299 + }, + { + "epoch": 0.24, + "grad_norm": 0.4177986577565525, + "learning_rate": 0.00017781237678803847, + "loss": 0.7668, + "step": 300 + }, + { + "epoch": 0.2408, + "grad_norm": 0.4354010399914639, + "learning_rate": 0.00017764930460576866, + "loss": 0.7041, + "step": 301 + }, + { + "epoch": 0.2416, + "grad_norm": 0.4462884492349542, + "learning_rate": 0.000177485710710289, + "loss": 0.7676, + "step": 302 + }, + { + "epoch": 0.2424, + "grad_norm": 0.4191032529227301, + "learning_rate": 0.00017732159620076053, + "loss": 0.7693, + "step": 303 + }, + { + "epoch": 0.2432, + "grad_norm": 0.45465716446320836, + "learning_rate": 0.00017715696217984235, + "loss": 0.7267, + "step": 304 + }, + { + "epoch": 0.244, + "grad_norm": 0.4457794134577356, + "learning_rate": 0.00017699180975368396, + "loss": 0.7928, + "step": 305 + }, + { + "epoch": 0.2448, + "grad_norm": 0.5296231776916995, + "learning_rate": 0.00017682614003191807, + "loss": 0.8466, + "step": 306 + }, + { + "epoch": 0.2456, + "grad_norm": 0.38882200427077135, + "learning_rate": 0.00017665995412765285, + "loss": 0.6935, + "step": 307 + }, + { + "epoch": 0.2464, + "grad_norm": 0.43860765207791874, + "learning_rate": 0.00017649325315746478, + "loss": 0.8161, + "step": 308 + }, + { + "epoch": 0.2472, + "grad_norm": 0.42531906914129314, + "learning_rate": 0.00017632603824139085, + "loss": 0.7676, + "step": 309 + }, + { + "epoch": 0.248, + "grad_norm": 0.4091747089690747, + "learning_rate": 0.0001761583105029213, + "loss": 0.7865, + "step": 310 + }, + { + "epoch": 0.2488, + "grad_norm": 0.42929783579785047, + "learning_rate": 0.0001759900710689918, + "loss": 0.7917, + "step": 311 + }, + { + "epoch": 0.2496, + "grad_norm": 0.4278215467335946, + "learning_rate": 0.00017582132106997616, + "loss": 0.7195, + "step": 312 + }, + { + "epoch": 0.2504, + "grad_norm": 0.3666058590710916, + "learning_rate": 0.00017565206163967846, + "loss": 0.7403, + "step": 313 + }, + { + "epoch": 0.2512, + "grad_norm": 0.476708669261199, + "learning_rate": 0.00017548229391532572, + "loss": 0.7797, + "step": 314 + }, + { + "epoch": 0.252, + "grad_norm": 0.451915357445945, + "learning_rate": 0.00017531201903755994, + "loss": 0.8248, + "step": 315 + }, + { + "epoch": 0.2528, + "grad_norm": 0.4025080301017516, + "learning_rate": 0.00017514123815043074, + "loss": 0.7972, + "step": 316 + }, + { + "epoch": 0.2536, + "grad_norm": 0.3706391455912095, + "learning_rate": 0.00017496995240138744, + "loss": 0.6941, + "step": 317 + }, + { + "epoch": 0.2544, + "grad_norm": 0.4217817302490733, + "learning_rate": 0.00017479816294127152, + "loss": 0.737, + "step": 318 + }, + { + "epoch": 0.2552, + "grad_norm": 0.435246810546811, + "learning_rate": 0.00017462587092430875, + "loss": 0.7746, + "step": 319 + }, + { + "epoch": 0.256, + "grad_norm": 0.42681516551875237, + "learning_rate": 0.0001744530775081015, + "loss": 0.7491, + "step": 320 + }, + { + "epoch": 0.2568, + "grad_norm": 0.48731269462670573, + "learning_rate": 0.00017427978385362112, + "loss": 0.7313, + "step": 321 + }, + { + "epoch": 0.2576, + "grad_norm": 0.4280493549246099, + "learning_rate": 0.0001741059911251997, + "loss": 0.7517, + "step": 322 + }, + { + "epoch": 0.2584, + "grad_norm": 0.4106403555591145, + "learning_rate": 0.0001739317004905227, + "loss": 0.6978, + "step": 323 + }, + { + "epoch": 0.2592, + "grad_norm": 0.47468879663103564, + "learning_rate": 0.000173756913120621, + "loss": 0.8465, + "step": 324 + }, + { + "epoch": 0.26, + "grad_norm": 0.45988680617556915, + "learning_rate": 0.00017358163018986282, + "loss": 0.819, + "step": 325 + }, + { + "epoch": 0.2608, + "grad_norm": 0.42180705245581873, + "learning_rate": 0.00017340585287594604, + "loss": 0.7365, + "step": 326 + }, + { + "epoch": 0.2616, + "grad_norm": 0.4376924090800282, + "learning_rate": 0.00017322958235989016, + "loss": 0.6623, + "step": 327 + }, + { + "epoch": 0.2624, + "grad_norm": 0.48363351015083733, + "learning_rate": 0.0001730528198260285, + "loss": 0.7363, + "step": 328 + }, + { + "epoch": 0.2632, + "grad_norm": 0.3942595478028239, + "learning_rate": 0.00017287556646200018, + "loss": 0.7566, + "step": 329 + }, + { + "epoch": 0.264, + "grad_norm": 0.3933448029539557, + "learning_rate": 0.00017269782345874203, + "loss": 0.6969, + "step": 330 + }, + { + "epoch": 0.2648, + "grad_norm": 0.434616179162153, + "learning_rate": 0.00017251959201048083, + "loss": 0.7609, + "step": 331 + }, + { + "epoch": 0.2656, + "grad_norm": 0.39288081687478416, + "learning_rate": 0.00017234087331472497, + "loss": 0.728, + "step": 332 + }, + { + "epoch": 0.2664, + "grad_norm": 0.46607832581942166, + "learning_rate": 0.00017216166857225674, + "loss": 0.8335, + "step": 333 + }, + { + "epoch": 0.2672, + "grad_norm": 0.43282334279022616, + "learning_rate": 0.00017198197898712404, + "loss": 0.7501, + "step": 334 + }, + { + "epoch": 0.268, + "grad_norm": 0.5417948957602462, + "learning_rate": 0.00017180180576663228, + "loss": 0.7462, + "step": 335 + }, + { + "epoch": 0.2688, + "grad_norm": 0.467210018753779, + "learning_rate": 0.00017162115012133643, + "loss": 0.7991, + "step": 336 + }, + { + "epoch": 0.2696, + "grad_norm": 0.4543489707663741, + "learning_rate": 0.00017144001326503273, + "loss": 0.733, + "step": 337 + }, + { + "epoch": 0.2704, + "grad_norm": 0.4932319707043495, + "learning_rate": 0.00017125839641475072, + "loss": 0.7418, + "step": 338 + }, + { + "epoch": 0.2712, + "grad_norm": 0.4537239614953879, + "learning_rate": 0.00017107630079074478, + "loss": 0.7782, + "step": 339 + }, + { + "epoch": 0.272, + "grad_norm": 0.5698675499674541, + "learning_rate": 0.00017089372761648616, + "loss": 0.7718, + "step": 340 + }, + { + "epoch": 0.2728, + "grad_norm": 0.4011892818050699, + "learning_rate": 0.00017071067811865476, + "loss": 0.7275, + "step": 341 + }, + { + "epoch": 0.2736, + "grad_norm": 0.43859794585929307, + "learning_rate": 0.00017052715352713075, + "loss": 0.7169, + "step": 342 + }, + { + "epoch": 0.2744, + "grad_norm": 0.4202743554700086, + "learning_rate": 0.00017034315507498635, + "loss": 0.7884, + "step": 343 + }, + { + "epoch": 0.2752, + "grad_norm": 0.45863226891592085, + "learning_rate": 0.00017015868399847768, + "loss": 0.7542, + "step": 344 + }, + { + "epoch": 0.276, + "grad_norm": 0.42463800114594674, + "learning_rate": 0.00016997374153703625, + "loss": 0.777, + "step": 345 + }, + { + "epoch": 0.2768, + "grad_norm": 0.46805220582677076, + "learning_rate": 0.00016978832893326074, + "loss": 0.655, + "step": 346 + }, + { + "epoch": 0.2776, + "grad_norm": 0.4856095496588775, + "learning_rate": 0.00016960244743290868, + "loss": 0.8294, + "step": 347 + }, + { + "epoch": 0.2784, + "grad_norm": 0.3937245968763575, + "learning_rate": 0.00016941609828488807, + "loss": 0.7316, + "step": 348 + }, + { + "epoch": 0.2792, + "grad_norm": 0.49058614828004504, + "learning_rate": 0.00016922928274124886, + "loss": 0.7843, + "step": 349 + }, + { + "epoch": 0.28, + "grad_norm": 0.47806594064348984, + "learning_rate": 0.0001690420020571747, + "loss": 0.7152, + "step": 350 + }, + { + "epoch": 0.2808, + "grad_norm": 0.4731835025299986, + "learning_rate": 0.00016885425749097444, + "loss": 0.7094, + "step": 351 + }, + { + "epoch": 0.2816, + "grad_norm": 0.6731480218012125, + "learning_rate": 0.0001686660503040737, + "loss": 0.7495, + "step": 352 + }, + { + "epoch": 0.2824, + "grad_norm": 0.4297964437043898, + "learning_rate": 0.00016847738176100632, + "loss": 0.7177, + "step": 353 + }, + { + "epoch": 0.2832, + "grad_norm": 0.4227575564035185, + "learning_rate": 0.00016828825312940592, + "loss": 0.7613, + "step": 354 + }, + { + "epoch": 0.284, + "grad_norm": 0.48544793119103596, + "learning_rate": 0.0001680986656799975, + "loss": 0.7532, + "step": 355 + }, + { + "epoch": 0.2848, + "grad_norm": 0.5077766955230878, + "learning_rate": 0.0001679086206865886, + "loss": 0.7322, + "step": 356 + }, + { + "epoch": 0.2856, + "grad_norm": 0.4809980875485111, + "learning_rate": 0.00016771811942606108, + "loss": 0.749, + "step": 357 + }, + { + "epoch": 0.2864, + "grad_norm": 0.45732029859816226, + "learning_rate": 0.00016752716317836229, + "loss": 0.7143, + "step": 358 + }, + { + "epoch": 0.2872, + "grad_norm": 0.4662300851432532, + "learning_rate": 0.00016733575322649657, + "loss": 0.7359, + "step": 359 + }, + { + "epoch": 0.288, + "grad_norm": 0.3967132540424373, + "learning_rate": 0.0001671438908565167, + "loss": 0.7129, + "step": 360 + }, + { + "epoch": 0.2888, + "grad_norm": 0.44430664357222777, + "learning_rate": 0.00016695157735751513, + "loss": 0.7227, + "step": 361 + }, + { + "epoch": 0.2896, + "grad_norm": 0.44023955662980985, + "learning_rate": 0.00016675881402161536, + "loss": 0.7675, + "step": 362 + }, + { + "epoch": 0.2904, + "grad_norm": 0.4169092049722635, + "learning_rate": 0.0001665656021439633, + "loss": 0.7546, + "step": 363 + }, + { + "epoch": 0.2912, + "grad_norm": 0.5277552917044552, + "learning_rate": 0.0001663719430227186, + "loss": 0.8091, + "step": 364 + }, + { + "epoch": 0.292, + "grad_norm": 1.2019996023887272, + "learning_rate": 0.00016617783795904565, + "loss": 0.7482, + "step": 365 + }, + { + "epoch": 0.2928, + "grad_norm": 0.903668879284044, + "learning_rate": 0.00016598328825710533, + "loss": 0.7268, + "step": 366 + }, + { + "epoch": 0.2936, + "grad_norm": 0.44840199228934713, + "learning_rate": 0.00016578829522404583, + "loss": 0.782, + "step": 367 + }, + { + "epoch": 0.2944, + "grad_norm": 0.5013335808521202, + "learning_rate": 0.000165592860169994, + "loss": 0.7607, + "step": 368 + }, + { + "epoch": 0.2952, + "grad_norm": 0.4213974027032967, + "learning_rate": 0.00016539698440804661, + "loss": 0.7424, + "step": 369 + }, + { + "epoch": 0.296, + "grad_norm": 0.41993773983541244, + "learning_rate": 0.00016520066925426144, + "loss": 0.771, + "step": 370 + }, + { + "epoch": 0.2968, + "grad_norm": 0.40557131036398736, + "learning_rate": 0.0001650039160276485, + "loss": 0.7058, + "step": 371 + }, + { + "epoch": 0.2976, + "grad_norm": 0.4523507100232458, + "learning_rate": 0.0001648067260501611, + "loss": 0.7137, + "step": 372 + }, + { + "epoch": 0.2984, + "grad_norm": 0.4731710302193421, + "learning_rate": 0.0001646091006466871, + "loss": 0.7295, + "step": 373 + }, + { + "epoch": 0.2992, + "grad_norm": 0.42432724880364664, + "learning_rate": 0.0001644110411450398, + "loss": 0.7482, + "step": 374 + }, + { + "epoch": 0.3, + "grad_norm": 0.4831867061115927, + "learning_rate": 0.00016421254887594917, + "loss": 0.7652, + "step": 375 + }, + { + "epoch": 0.3008, + "grad_norm": 0.41640311207575037, + "learning_rate": 0.00016401362517305296, + "loss": 0.6579, + "step": 376 + }, + { + "epoch": 0.3016, + "grad_norm": 0.4586313986798324, + "learning_rate": 0.00016381427137288754, + "loss": 0.8476, + "step": 377 + }, + { + "epoch": 0.3024, + "grad_norm": 0.41162605214953046, + "learning_rate": 0.00016361448881487914, + "loss": 0.7042, + "step": 378 + }, + { + "epoch": 0.3032, + "grad_norm": 0.4038408611576945, + "learning_rate": 0.0001634142788413346, + "loss": 0.7604, + "step": 379 + }, + { + "epoch": 0.304, + "grad_norm": 0.3915146244892308, + "learning_rate": 0.00016321364279743266, + "loss": 0.7389, + "step": 380 + }, + { + "epoch": 0.3048, + "grad_norm": 0.4308339515168911, + "learning_rate": 0.00016301258203121462, + "loss": 0.7486, + "step": 381 + }, + { + "epoch": 0.3056, + "grad_norm": 0.37800420601980367, + "learning_rate": 0.0001628110978935756, + "loss": 0.6813, + "step": 382 + }, + { + "epoch": 0.3064, + "grad_norm": 0.39802369432443996, + "learning_rate": 0.00016260919173825508, + "loss": 0.7324, + "step": 383 + }, + { + "epoch": 0.3072, + "grad_norm": 0.38100120954656946, + "learning_rate": 0.00016240686492182804, + "loss": 0.6791, + "step": 384 + }, + { + "epoch": 0.308, + "grad_norm": 0.4336361465865306, + "learning_rate": 0.00016220411880369601, + "loss": 0.7019, + "step": 385 + }, + { + "epoch": 0.3088, + "grad_norm": 0.46580240140204016, + "learning_rate": 0.00016200095474607753, + "loss": 0.8084, + "step": 386 + }, + { + "epoch": 0.3096, + "grad_norm": 0.44591142184346877, + "learning_rate": 0.00016179737411399926, + "loss": 0.8015, + "step": 387 + }, + { + "epoch": 0.3104, + "grad_norm": 0.4126781708157891, + "learning_rate": 0.00016159337827528685, + "loss": 0.7149, + "step": 388 + }, + { + "epoch": 0.3112, + "grad_norm": 0.40599820091049194, + "learning_rate": 0.00016138896860055555, + "loss": 0.7162, + "step": 389 + }, + { + "epoch": 0.312, + "grad_norm": 0.47857559032252306, + "learning_rate": 0.0001611841464632011, + "loss": 0.7205, + "step": 390 + }, + { + "epoch": 0.3128, + "grad_norm": 0.38736860683906243, + "learning_rate": 0.00016097891323939062, + "loss": 0.7345, + "step": 391 + }, + { + "epoch": 0.3136, + "grad_norm": 0.3911052800141332, + "learning_rate": 0.0001607732703080532, + "loss": 0.6961, + "step": 392 + }, + { + "epoch": 0.3144, + "grad_norm": 0.4099751708672859, + "learning_rate": 0.00016056721905087056, + "loss": 0.7876, + "step": 393 + }, + { + "epoch": 0.3152, + "grad_norm": 0.4789589912306157, + "learning_rate": 0.00016036076085226814, + "loss": 0.7504, + "step": 394 + }, + { + "epoch": 0.316, + "grad_norm": 0.5146265447705376, + "learning_rate": 0.00016015389709940538, + "loss": 0.845, + "step": 395 + }, + { + "epoch": 0.3168, + "grad_norm": 0.634963848674221, + "learning_rate": 0.0001599466291821666, + "loss": 0.788, + "step": 396 + }, + { + "epoch": 0.3176, + "grad_norm": 0.42427227666675715, + "learning_rate": 0.0001597389584931517, + "loss": 0.7113, + "step": 397 + }, + { + "epoch": 0.3184, + "grad_norm": 0.4442940256282359, + "learning_rate": 0.0001595308864276666, + "loss": 0.7814, + "step": 398 + }, + { + "epoch": 0.3192, + "grad_norm": 0.4426652054664872, + "learning_rate": 0.0001593224143837142, + "loss": 0.7697, + "step": 399 + }, + { + "epoch": 0.32, + "grad_norm": 0.4317837128486373, + "learning_rate": 0.0001591135437619847, + "loss": 0.6493, + "step": 400 + }, + { + "epoch": 0.3208, + "grad_norm": 0.4482570698551614, + "learning_rate": 0.00015890427596584617, + "loss": 0.7345, + "step": 401 + }, + { + "epoch": 0.3216, + "grad_norm": 0.44129565114368685, + "learning_rate": 0.0001586946124013354, + "loss": 0.692, + "step": 402 + }, + { + "epoch": 0.3224, + "grad_norm": 0.4361340943441822, + "learning_rate": 0.00015848455447714822, + "loss": 0.7153, + "step": 403 + }, + { + "epoch": 0.3232, + "grad_norm": 0.48161014364941834, + "learning_rate": 0.0001582741036046301, + "loss": 0.7397, + "step": 404 + }, + { + "epoch": 0.324, + "grad_norm": 0.40021146044944383, + "learning_rate": 0.00015806326119776663, + "loss": 0.7628, + "step": 405 + }, + { + "epoch": 0.3248, + "grad_norm": 0.4391173187278975, + "learning_rate": 0.00015785202867317407, + "loss": 0.7228, + "step": 406 + }, + { + "epoch": 0.3256, + "grad_norm": 0.39810258693029316, + "learning_rate": 0.00015764040745008988, + "loss": 0.7685, + "step": 407 + }, + { + "epoch": 0.3264, + "grad_norm": 0.4328481242077568, + "learning_rate": 0.00015742839895036305, + "loss": 0.7482, + "step": 408 + }, + { + "epoch": 0.3272, + "grad_norm": 0.4028096379053827, + "learning_rate": 0.00015721600459844468, + "loss": 0.6994, + "step": 409 + }, + { + "epoch": 0.328, + "grad_norm": 0.46181503625336007, + "learning_rate": 0.00015700322582137827, + "loss": 0.808, + "step": 410 + }, + { + "epoch": 0.3288, + "grad_norm": 0.5113755956738689, + "learning_rate": 0.00015679006404879033, + "loss": 0.7563, + "step": 411 + }, + { + "epoch": 0.3296, + "grad_norm": 0.44512900415147594, + "learning_rate": 0.0001565765207128805, + "loss": 0.7207, + "step": 412 + }, + { + "epoch": 0.3304, + "grad_norm": 0.4611652590698016, + "learning_rate": 0.00015636259724841222, + "loss": 0.7583, + "step": 413 + }, + { + "epoch": 0.3312, + "grad_norm": 0.4591918606931989, + "learning_rate": 0.0001561482950927029, + "loss": 0.6874, + "step": 414 + }, + { + "epoch": 0.332, + "grad_norm": 0.45267916132053, + "learning_rate": 0.00015593361568561428, + "loss": 0.7577, + "step": 415 + }, + { + "epoch": 0.3328, + "grad_norm": 0.48815860688357016, + "learning_rate": 0.00015571856046954285, + "loss": 0.7242, + "step": 416 + }, + { + "epoch": 0.3336, + "grad_norm": 0.4539180685668656, + "learning_rate": 0.0001555031308894101, + "loss": 0.7508, + "step": 417 + }, + { + "epoch": 0.3344, + "grad_norm": 0.5388177877102638, + "learning_rate": 0.00015528732839265272, + "loss": 0.7545, + "step": 418 + }, + { + "epoch": 0.3352, + "grad_norm": 0.3950822482754754, + "learning_rate": 0.0001550711544292131, + "loss": 0.7153, + "step": 419 + }, + { + "epoch": 0.336, + "grad_norm": 0.4450384687439473, + "learning_rate": 0.0001548546104515294, + "loss": 0.7622, + "step": 420 + }, + { + "epoch": 0.3368, + "grad_norm": 0.4177962669164234, + "learning_rate": 0.00015463769791452574, + "loss": 0.7639, + "step": 421 + }, + { + "epoch": 0.3376, + "grad_norm": 0.40009315582507743, + "learning_rate": 0.00015442041827560274, + "loss": 0.7264, + "step": 422 + }, + { + "epoch": 0.3384, + "grad_norm": 0.5054917593028728, + "learning_rate": 0.00015420277299462736, + "loss": 0.7599, + "step": 423 + }, + { + "epoch": 0.3392, + "grad_norm": 0.548084106630373, + "learning_rate": 0.00015398476353392323, + "loss": 0.7236, + "step": 424 + }, + { + "epoch": 0.34, + "grad_norm": 0.44702097951848013, + "learning_rate": 0.00015376639135826107, + "loss": 0.7897, + "step": 425 + }, + { + "epoch": 0.3408, + "grad_norm": 0.4562795227267379, + "learning_rate": 0.00015354765793484834, + "loss": 0.7376, + "step": 426 + }, + { + "epoch": 0.3416, + "grad_norm": 0.43127424064408904, + "learning_rate": 0.00015332856473331978, + "loss": 0.7718, + "step": 427 + }, + { + "epoch": 0.3424, + "grad_norm": 0.45822416474456973, + "learning_rate": 0.00015310911322572753, + "loss": 0.7834, + "step": 428 + }, + { + "epoch": 0.3432, + "grad_norm": 0.39743796981470814, + "learning_rate": 0.00015288930488653094, + "loss": 0.6835, + "step": 429 + }, + { + "epoch": 0.344, + "grad_norm": 0.4899454842148273, + "learning_rate": 0.000152669141192587, + "loss": 0.8564, + "step": 430 + }, + { + "epoch": 0.3448, + "grad_norm": 0.42129591015531603, + "learning_rate": 0.0001524486236231402, + "loss": 0.715, + "step": 431 + }, + { + "epoch": 0.3456, + "grad_norm": 0.3975991085405616, + "learning_rate": 0.00015222775365981273, + "loss": 0.7243, + "step": 432 + }, + { + "epoch": 0.3464, + "grad_norm": 0.42462308414974403, + "learning_rate": 0.00015200653278659432, + "loss": 0.8114, + "step": 433 + }, + { + "epoch": 0.3472, + "grad_norm": 0.47903098108233355, + "learning_rate": 0.00015178496248983254, + "loss": 0.7138, + "step": 434 + }, + { + "epoch": 0.348, + "grad_norm": 0.4577855193694479, + "learning_rate": 0.00015156304425822267, + "loss": 0.8058, + "step": 435 + }, + { + "epoch": 0.3488, + "grad_norm": 0.43580528832715576, + "learning_rate": 0.00015134077958279765, + "loss": 0.783, + "step": 436 + }, + { + "epoch": 0.3496, + "grad_norm": 0.48304346147280797, + "learning_rate": 0.00015111816995691809, + "loss": 0.7576, + "step": 437 + }, + { + "epoch": 0.3504, + "grad_norm": 0.40679595500463156, + "learning_rate": 0.00015089521687626243, + "loss": 0.659, + "step": 438 + }, + { + "epoch": 0.3512, + "grad_norm": 0.4437871417383588, + "learning_rate": 0.00015067192183881658, + "loss": 0.743, + "step": 439 + }, + { + "epoch": 0.352, + "grad_norm": 0.48954633510827106, + "learning_rate": 0.000150448286344864, + "loss": 0.8154, + "step": 440 + }, + { + "epoch": 0.3528, + "grad_norm": 0.44973543847141406, + "learning_rate": 0.00015022431189697568, + "loss": 0.7579, + "step": 441 + }, + { + "epoch": 0.3536, + "grad_norm": 0.5524658691503546, + "learning_rate": 0.00015000000000000001, + "loss": 0.8149, + "step": 442 + }, + { + "epoch": 0.3544, + "grad_norm": 0.4396765832399089, + "learning_rate": 0.0001497753521610526, + "loss": 0.6877, + "step": 443 + }, + { + "epoch": 0.3552, + "grad_norm": 0.4406786552471726, + "learning_rate": 0.00014955036988950618, + "loss": 0.715, + "step": 444 + }, + { + "epoch": 0.356, + "grad_norm": 0.41322964282223196, + "learning_rate": 0.00014932505469698052, + "loss": 0.7443, + "step": 445 + }, + { + "epoch": 0.3568, + "grad_norm": 0.4604077012727492, + "learning_rate": 0.00014909940809733222, + "loss": 0.7457, + "step": 446 + }, + { + "epoch": 0.3576, + "grad_norm": 0.43228587119062767, + "learning_rate": 0.0001488734316066446, + "loss": 0.7069, + "step": 447 + }, + { + "epoch": 0.3584, + "grad_norm": 0.3866639819270244, + "learning_rate": 0.00014864712674321734, + "loss": 0.6804, + "step": 448 + }, + { + "epoch": 0.3592, + "grad_norm": 0.4072491204192593, + "learning_rate": 0.0001484204950275565, + "loss": 0.7182, + "step": 449 + }, + { + "epoch": 0.36, + "grad_norm": 0.4158115899003882, + "learning_rate": 0.00014819353798236427, + "loss": 0.6316, + "step": 450 + }, + { + "epoch": 0.3608, + "grad_norm": 0.41856795470122343, + "learning_rate": 0.00014796625713252848, + "loss": 0.6877, + "step": 451 + }, + { + "epoch": 0.3616, + "grad_norm": 0.44032268508903893, + "learning_rate": 0.00014773865400511272, + "loss": 0.7754, + "step": 452 + }, + { + "epoch": 0.3624, + "grad_norm": 0.3793269364037703, + "learning_rate": 0.00014751073012934587, + "loss": 0.6997, + "step": 453 + }, + { + "epoch": 0.3632, + "grad_norm": 0.4073378870722764, + "learning_rate": 0.00014728248703661182, + "loss": 0.6769, + "step": 454 + }, + { + "epoch": 0.364, + "grad_norm": 0.42363471968666755, + "learning_rate": 0.0001470539262604393, + "loss": 0.7889, + "step": 455 + }, + { + "epoch": 0.3648, + "grad_norm": 0.4520135230580361, + "learning_rate": 0.00014682504933649144, + "loss": 0.7619, + "step": 456 + }, + { + "epoch": 0.3656, + "grad_norm": 0.4280710487309365, + "learning_rate": 0.00014659585780255556, + "loss": 0.7166, + "step": 457 + }, + { + "epoch": 0.3664, + "grad_norm": 0.40250832106697515, + "learning_rate": 0.00014636635319853275, + "loss": 0.7183, + "step": 458 + }, + { + "epoch": 0.3672, + "grad_norm": 0.42287049275064076, + "learning_rate": 0.0001461365370664276, + "loss": 0.7637, + "step": 459 + }, + { + "epoch": 0.368, + "grad_norm": 0.3961706966946667, + "learning_rate": 0.00014590641095033787, + "loss": 0.7116, + "step": 460 + }, + { + "epoch": 0.3688, + "grad_norm": 0.47852621016581787, + "learning_rate": 0.00014567597639644387, + "loss": 0.789, + "step": 461 + }, + { + "epoch": 0.3696, + "grad_norm": 0.42921164444493115, + "learning_rate": 0.00014544523495299842, + "loss": 0.7491, + "step": 462 + }, + { + "epoch": 0.3704, + "grad_norm": 0.4231796426849485, + "learning_rate": 0.00014521418817031628, + "loss": 0.7023, + "step": 463 + }, + { + "epoch": 0.3712, + "grad_norm": 0.5219949806476437, + "learning_rate": 0.0001449828376007636, + "loss": 0.8286, + "step": 464 + }, + { + "epoch": 0.372, + "grad_norm": 0.4417614083672534, + "learning_rate": 0.00014475118479874774, + "loss": 0.6982, + "step": 465 + }, + { + "epoch": 0.3728, + "grad_norm": 0.398529604058241, + "learning_rate": 0.0001445192313207067, + "loss": 0.7249, + "step": 466 + }, + { + "epoch": 0.3736, + "grad_norm": 0.4063119021543565, + "learning_rate": 0.0001442869787250987, + "loss": 0.7563, + "step": 467 + }, + { + "epoch": 0.3744, + "grad_norm": 0.4150229671695859, + "learning_rate": 0.0001440544285723915, + "loss": 0.6996, + "step": 468 + }, + { + "epoch": 0.3752, + "grad_norm": 0.41294976755644697, + "learning_rate": 0.00014382158242505234, + "loss": 0.7335, + "step": 469 + }, + { + "epoch": 0.376, + "grad_norm": 0.46171998521483076, + "learning_rate": 0.00014358844184753712, + "loss": 0.7272, + "step": 470 + }, + { + "epoch": 0.3768, + "grad_norm": 0.43386019673425774, + "learning_rate": 0.00014335500840627986, + "loss": 0.7348, + "step": 471 + }, + { + "epoch": 0.3776, + "grad_norm": 0.4748047451711399, + "learning_rate": 0.00014312128366968243, + "loss": 0.762, + "step": 472 + }, + { + "epoch": 0.3784, + "grad_norm": 0.4530528969151596, + "learning_rate": 0.0001428872692081038, + "loss": 0.705, + "step": 473 + }, + { + "epoch": 0.3792, + "grad_norm": 0.7154024051831739, + "learning_rate": 0.00014265296659384956, + "loss": 0.7291, + "step": 474 + }, + { + "epoch": 0.38, + "grad_norm": 0.38238003488071715, + "learning_rate": 0.00014241837740116132, + "loss": 0.6948, + "step": 475 + }, + { + "epoch": 0.3808, + "grad_norm": 0.42954402140844056, + "learning_rate": 0.00014218350320620624, + "loss": 0.7365, + "step": 476 + }, + { + "epoch": 0.3816, + "grad_norm": 0.43920465231655637, + "learning_rate": 0.00014194834558706632, + "loss": 0.6759, + "step": 477 + }, + { + "epoch": 0.3824, + "grad_norm": 0.50806928834242, + "learning_rate": 0.0001417129061237278, + "loss": 0.7163, + "step": 478 + }, + { + "epoch": 0.3832, + "grad_norm": 0.49100412658177245, + "learning_rate": 0.0001414771863980707, + "loss": 0.7757, + "step": 479 + }, + { + "epoch": 0.384, + "grad_norm": 0.4265044923408589, + "learning_rate": 0.00014124118799385796, + "loss": 0.6973, + "step": 480 + }, + { + "epoch": 0.3848, + "grad_norm": 0.4485524583028315, + "learning_rate": 0.00014100491249672498, + "loss": 0.6689, + "step": 481 + }, + { + "epoch": 0.3856, + "grad_norm": 0.43226811757877065, + "learning_rate": 0.00014076836149416887, + "loss": 0.7459, + "step": 482 + }, + { + "epoch": 0.3864, + "grad_norm": 0.416201547102863, + "learning_rate": 0.0001405315365755379, + "loss": 0.7209, + "step": 483 + }, + { + "epoch": 0.3872, + "grad_norm": 0.39664734173834565, + "learning_rate": 0.0001402944393320206, + "loss": 0.6513, + "step": 484 + }, + { + "epoch": 0.388, + "grad_norm": 0.5873619252994348, + "learning_rate": 0.00014005707135663527, + "loss": 0.6968, + "step": 485 + }, + { + "epoch": 0.3888, + "grad_norm": 0.40793179787572403, + "learning_rate": 0.00013981943424421932, + "loss": 0.7358, + "step": 486 + }, + { + "epoch": 0.3896, + "grad_norm": 0.4228884403871606, + "learning_rate": 0.00013958152959141825, + "loss": 0.7642, + "step": 487 + }, + { + "epoch": 0.3904, + "grad_norm": 0.41060499981163073, + "learning_rate": 0.00013934335899667527, + "loss": 0.7101, + "step": 488 + }, + { + "epoch": 0.3912, + "grad_norm": 0.435862591273703, + "learning_rate": 0.00013910492406022033, + "loss": 0.6965, + "step": 489 + }, + { + "epoch": 0.392, + "grad_norm": 0.4869556356346243, + "learning_rate": 0.00013886622638405952, + "loss": 0.7007, + "step": 490 + }, + { + "epoch": 0.3928, + "grad_norm": 0.36488581075679316, + "learning_rate": 0.0001386272675719642, + "loss": 0.6422, + "step": 491 + }, + { + "epoch": 0.3936, + "grad_norm": 0.43592851398129917, + "learning_rate": 0.00013838804922946027, + "loss": 0.7195, + "step": 492 + }, + { + "epoch": 0.3944, + "grad_norm": 0.4296963476221147, + "learning_rate": 0.00013814857296381728, + "loss": 0.7473, + "step": 493 + }, + { + "epoch": 0.3952, + "grad_norm": 0.37645455777538245, + "learning_rate": 0.00013790884038403795, + "loss": 0.6946, + "step": 494 + }, + { + "epoch": 0.396, + "grad_norm": 0.3967480623621568, + "learning_rate": 0.00013766885310084688, + "loss": 0.7451, + "step": 495 + }, + { + "epoch": 0.3968, + "grad_norm": 0.39790923257593114, + "learning_rate": 0.00013742861272668012, + "loss": 0.7127, + "step": 496 + }, + { + "epoch": 0.3976, + "grad_norm": 0.44216137091031993, + "learning_rate": 0.00013718812087567414, + "loss": 0.7593, + "step": 497 + }, + { + "epoch": 0.3984, + "grad_norm": 0.39409587111420574, + "learning_rate": 0.00013694737916365517, + "loss": 0.6994, + "step": 498 + }, + { + "epoch": 0.3992, + "grad_norm": 0.37628399571003923, + "learning_rate": 0.000136706389208128, + "loss": 0.6659, + "step": 499 + }, + { + "epoch": 0.4, + "grad_norm": 0.40137145922137873, + "learning_rate": 0.00013646515262826552, + "loss": 0.7574, + "step": 500 + }, + { + "epoch": 0.4008, + "grad_norm": 0.40575702287687193, + "learning_rate": 0.00013622367104489756, + "loss": 0.7531, + "step": 501 + }, + { + "epoch": 0.4016, + "grad_norm": 0.40095939343185877, + "learning_rate": 0.0001359819460805001, + "loss": 0.7255, + "step": 502 + }, + { + "epoch": 0.4024, + "grad_norm": 0.5328369689927243, + "learning_rate": 0.0001357399793591844, + "loss": 0.7035, + "step": 503 + }, + { + "epoch": 0.4032, + "grad_norm": 0.41558442501880183, + "learning_rate": 0.0001354977725066859, + "loss": 0.5965, + "step": 504 + }, + { + "epoch": 0.404, + "grad_norm": 0.45450727782192485, + "learning_rate": 0.00013525532715035366, + "loss": 0.7154, + "step": 505 + }, + { + "epoch": 0.4048, + "grad_norm": 0.4429452206189218, + "learning_rate": 0.00013501264491913906, + "loss": 0.7424, + "step": 506 + }, + { + "epoch": 0.4056, + "grad_norm": 0.5286021883772923, + "learning_rate": 0.00013476972744358507, + "loss": 0.6966, + "step": 507 + }, + { + "epoch": 0.4064, + "grad_norm": 0.4210548211045082, + "learning_rate": 0.0001345265763558152, + "loss": 0.7629, + "step": 508 + }, + { + "epoch": 0.4072, + "grad_norm": 0.4328737270139645, + "learning_rate": 0.00013428319328952253, + "loss": 0.7113, + "step": 509 + }, + { + "epoch": 0.408, + "grad_norm": 0.47783936685408646, + "learning_rate": 0.00013403957987995882, + "loss": 0.7542, + "step": 510 + }, + { + "epoch": 0.4088, + "grad_norm": 0.4276078646841453, + "learning_rate": 0.0001337957377639235, + "loss": 0.7391, + "step": 511 + }, + { + "epoch": 0.4096, + "grad_norm": 0.43223865506668013, + "learning_rate": 0.0001335516685797525, + "loss": 0.7755, + "step": 512 + }, + { + "epoch": 0.4104, + "grad_norm": 0.4071862625631177, + "learning_rate": 0.0001333073739673076, + "loss": 0.7043, + "step": 513 + }, + { + "epoch": 0.4112, + "grad_norm": 0.5165019174887554, + "learning_rate": 0.00013306285556796495, + "loss": 0.6902, + "step": 514 + }, + { + "epoch": 0.412, + "grad_norm": 0.3892717255463915, + "learning_rate": 0.0001328181150246045, + "loss": 0.7137, + "step": 515 + }, + { + "epoch": 0.4128, + "grad_norm": 0.42836054777343024, + "learning_rate": 0.00013257315398159864, + "loss": 0.7395, + "step": 516 + }, + { + "epoch": 0.4136, + "grad_norm": 0.4173728809888404, + "learning_rate": 0.00013232797408480127, + "loss": 0.742, + "step": 517 + }, + { + "epoch": 0.4144, + "grad_norm": 0.3535504159455629, + "learning_rate": 0.00013208257698153677, + "loss": 0.6703, + "step": 518 + }, + { + "epoch": 0.4152, + "grad_norm": 0.3676936927081411, + "learning_rate": 0.00013183696432058888, + "loss": 0.6899, + "step": 519 + }, + { + "epoch": 0.416, + "grad_norm": 0.42111910180024936, + "learning_rate": 0.00013159113775218964, + "loss": 0.7787, + "step": 520 + }, + { + "epoch": 0.4168, + "grad_norm": 0.4116016278981681, + "learning_rate": 0.00013134509892800822, + "loss": 0.7291, + "step": 521 + }, + { + "epoch": 0.4176, + "grad_norm": 0.4031029250802671, + "learning_rate": 0.00013109884950114007, + "loss": 0.7056, + "step": 522 + }, + { + "epoch": 0.4184, + "grad_norm": 0.3835513414134938, + "learning_rate": 0.00013085239112609547, + "loss": 0.7112, + "step": 523 + }, + { + "epoch": 0.4192, + "grad_norm": 0.46509263748679647, + "learning_rate": 0.00013060572545878875, + "loss": 0.7185, + "step": 524 + }, + { + "epoch": 0.42, + "grad_norm": 0.5746143007898875, + "learning_rate": 0.00013035885415652685, + "loss": 0.7493, + "step": 525 + }, + { + "epoch": 0.4208, + "grad_norm": 0.40227934573081, + "learning_rate": 0.00013011177887799845, + "loss": 0.7322, + "step": 526 + }, + { + "epoch": 0.4216, + "grad_norm": 0.4230454693600124, + "learning_rate": 0.00012986450128326266, + "loss": 0.7309, + "step": 527 + }, + { + "epoch": 0.4224, + "grad_norm": 0.40336316080918005, + "learning_rate": 0.00012961702303373795, + "loss": 0.6341, + "step": 528 + }, + { + "epoch": 0.4232, + "grad_norm": 0.4351510587979726, + "learning_rate": 0.00012936934579219094, + "loss": 0.7105, + "step": 529 + }, + { + "epoch": 0.424, + "grad_norm": 0.39495593892561437, + "learning_rate": 0.00012912147122272523, + "loss": 0.7168, + "step": 530 + }, + { + "epoch": 0.4248, + "grad_norm": 0.3967426580793475, + "learning_rate": 0.00012887340099077024, + "loss": 0.7194, + "step": 531 + }, + { + "epoch": 0.4256, + "grad_norm": 0.4147208134179952, + "learning_rate": 0.00012862513676307008, + "loss": 0.7238, + "step": 532 + }, + { + "epoch": 0.4264, + "grad_norm": 0.36967969461923617, + "learning_rate": 0.0001283766802076722, + "loss": 0.6632, + "step": 533 + }, + { + "epoch": 0.4272, + "grad_norm": 0.4287011586351211, + "learning_rate": 0.00012812803299391628, + "loss": 0.7401, + "step": 534 + }, + { + "epoch": 0.428, + "grad_norm": 0.399649606969939, + "learning_rate": 0.00012787919679242306, + "loss": 0.7814, + "step": 535 + }, + { + "epoch": 0.4288, + "grad_norm": 0.3733433687843168, + "learning_rate": 0.00012763017327508305, + "loss": 0.6759, + "step": 536 + }, + { + "epoch": 0.4296, + "grad_norm": 0.4567797291797252, + "learning_rate": 0.00012738096411504522, + "loss": 0.7411, + "step": 537 + }, + { + "epoch": 0.4304, + "grad_norm": 0.39111249426608224, + "learning_rate": 0.0001271315709867059, + "loss": 0.6682, + "step": 538 + }, + { + "epoch": 0.4312, + "grad_norm": 0.399184072015993, + "learning_rate": 0.00012688199556569753, + "loss": 0.7405, + "step": 539 + }, + { + "epoch": 0.432, + "grad_norm": 0.3623284143286997, + "learning_rate": 0.00012663223952887723, + "loss": 0.7323, + "step": 540 + }, + { + "epoch": 0.4328, + "grad_norm": 0.4369971125336579, + "learning_rate": 0.0001263823045543158, + "loss": 0.7521, + "step": 541 + }, + { + "epoch": 0.4336, + "grad_norm": 0.3939866311585749, + "learning_rate": 0.00012613219232128608, + "loss": 0.7191, + "step": 542 + }, + { + "epoch": 0.4344, + "grad_norm": 0.44007330550179097, + "learning_rate": 0.00012588190451025207, + "loss": 0.7526, + "step": 543 + }, + { + "epoch": 0.4352, + "grad_norm": 0.4623133533657372, + "learning_rate": 0.00012563144280285741, + "loss": 0.7398, + "step": 544 + }, + { + "epoch": 0.436, + "grad_norm": 0.4356767891279233, + "learning_rate": 0.00012538080888191408, + "loss": 0.6771, + "step": 545 + }, + { + "epoch": 0.4368, + "grad_norm": 0.44781292119353033, + "learning_rate": 0.00012513000443139112, + "loss": 0.6905, + "step": 546 + }, + { + "epoch": 0.4376, + "grad_norm": 0.4350358948741196, + "learning_rate": 0.00012487903113640337, + "loss": 0.7757, + "step": 547 + }, + { + "epoch": 0.4384, + "grad_norm": 0.4263688147824068, + "learning_rate": 0.00012462789068320017, + "loss": 0.7146, + "step": 548 + }, + { + "epoch": 0.4392, + "grad_norm": 0.3918373554164718, + "learning_rate": 0.00012437658475915377, + "loss": 0.6814, + "step": 549 + }, + { + "epoch": 0.44, + "grad_norm": 0.42554161469132323, + "learning_rate": 0.00012412511505274844, + "loss": 0.7097, + "step": 550 + }, + { + "epoch": 0.4408, + "grad_norm": 0.4510043717815143, + "learning_rate": 0.00012387348325356874, + "loss": 0.7397, + "step": 551 + }, + { + "epoch": 0.4416, + "grad_norm": 0.40548676560106917, + "learning_rate": 0.00012362169105228826, + "loss": 0.7188, + "step": 552 + }, + { + "epoch": 0.4424, + "grad_norm": 0.3930449013275765, + "learning_rate": 0.00012336974014065844, + "loss": 0.6938, + "step": 553 + }, + { + "epoch": 0.4432, + "grad_norm": 0.37686291795705645, + "learning_rate": 0.000123117632211497, + "loss": 0.6823, + "step": 554 + }, + { + "epoch": 0.444, + "grad_norm": 0.34403095650051896, + "learning_rate": 0.00012286536895867654, + "loss": 0.7023, + "step": 555 + }, + { + "epoch": 0.4448, + "grad_norm": 0.4224804903041466, + "learning_rate": 0.00012261295207711346, + "loss": 0.7154, + "step": 556 + }, + { + "epoch": 0.4456, + "grad_norm": 0.464741312247205, + "learning_rate": 0.00012236038326275626, + "loss": 0.7756, + "step": 557 + }, + { + "epoch": 0.4464, + "grad_norm": 0.41450530264771623, + "learning_rate": 0.0001221076642125742, + "loss": 0.7133, + "step": 558 + }, + { + "epoch": 0.4472, + "grad_norm": 0.42338494134964905, + "learning_rate": 0.00012185479662454595, + "loss": 0.7451, + "step": 559 + }, + { + "epoch": 0.448, + "grad_norm": 0.39664522174179934, + "learning_rate": 0.00012160178219764837, + "loss": 0.6868, + "step": 560 + }, + { + "epoch": 0.4488, + "grad_norm": 0.4214350598528371, + "learning_rate": 0.00012134862263184467, + "loss": 0.7087, + "step": 561 + }, + { + "epoch": 0.4496, + "grad_norm": 0.4460593375778333, + "learning_rate": 0.00012109531962807332, + "loss": 0.698, + "step": 562 + }, + { + "epoch": 0.4504, + "grad_norm": 0.41866076775674965, + "learning_rate": 0.00012084187488823657, + "loss": 0.7361, + "step": 563 + }, + { + "epoch": 0.4512, + "grad_norm": 0.38057473365898153, + "learning_rate": 0.00012058829011518896, + "loss": 0.751, + "step": 564 + }, + { + "epoch": 0.452, + "grad_norm": 0.38737097939391985, + "learning_rate": 0.00012033456701272576, + "loss": 0.7236, + "step": 565 + }, + { + "epoch": 0.4528, + "grad_norm": 0.5171385787171731, + "learning_rate": 0.00012008070728557186, + "loss": 0.7091, + "step": 566 + }, + { + "epoch": 0.4536, + "grad_norm": 0.39932206928896563, + "learning_rate": 0.00011982671263936995, + "loss": 0.7045, + "step": 567 + }, + { + "epoch": 0.4544, + "grad_norm": 0.45276083810766304, + "learning_rate": 0.00011957258478066931, + "loss": 0.665, + "step": 568 + }, + { + "epoch": 0.4552, + "grad_norm": 0.4546548462423871, + "learning_rate": 0.00011931832541691418, + "loss": 0.7201, + "step": 569 + }, + { + "epoch": 0.456, + "grad_norm": 0.4317938044917659, + "learning_rate": 0.00011906393625643244, + "loss": 0.7137, + "step": 570 + }, + { + "epoch": 0.4568, + "grad_norm": 0.45750459611787675, + "learning_rate": 0.00011880941900842397, + "loss": 0.7061, + "step": 571 + }, + { + "epoch": 0.4576, + "grad_norm": 0.4054347668668675, + "learning_rate": 0.00011855477538294935, + "loss": 0.6743, + "step": 572 + }, + { + "epoch": 0.4584, + "grad_norm": 0.3820550665195405, + "learning_rate": 0.00011830000709091815, + "loss": 0.7234, + "step": 573 + }, + { + "epoch": 0.4592, + "grad_norm": 0.5419350535886606, + "learning_rate": 0.00011804511584407763, + "loss": 0.8053, + "step": 574 + }, + { + "epoch": 0.46, + "grad_norm": 0.42611264604135285, + "learning_rate": 0.0001177901033550012, + "loss": 0.7728, + "step": 575 + }, + { + "epoch": 0.4608, + "grad_norm": 0.3519728305830471, + "learning_rate": 0.00011753497133707679, + "loss": 0.6742, + "step": 576 + }, + { + "epoch": 0.4616, + "grad_norm": 0.4089474340694498, + "learning_rate": 0.00011727972150449544, + "loss": 0.6596, + "step": 577 + }, + { + "epoch": 0.4624, + "grad_norm": 0.41015990254032864, + "learning_rate": 0.00011702435557223987, + "loss": 0.6952, + "step": 578 + }, + { + "epoch": 0.4632, + "grad_norm": 0.38680365195050437, + "learning_rate": 0.00011676887525607271, + "loss": 0.7113, + "step": 579 + }, + { + "epoch": 0.464, + "grad_norm": 0.4212260550526898, + "learning_rate": 0.00011651328227252517, + "loss": 0.7106, + "step": 580 + }, + { + "epoch": 0.4648, + "grad_norm": 0.48743070631599167, + "learning_rate": 0.00011625757833888551, + "loss": 0.7265, + "step": 581 + }, + { + "epoch": 0.4656, + "grad_norm": 0.36740607737976266, + "learning_rate": 0.00011600176517318741, + "loss": 0.719, + "step": 582 + }, + { + "epoch": 0.4664, + "grad_norm": 0.43253030252457, + "learning_rate": 0.0001157458444941984, + "loss": 0.7987, + "step": 583 + }, + { + "epoch": 0.4672, + "grad_norm": 0.430381763981451, + "learning_rate": 0.00011548981802140848, + "loss": 0.7449, + "step": 584 + }, + { + "epoch": 0.468, + "grad_norm": 0.39252269540792745, + "learning_rate": 0.00011523368747501839, + "loss": 0.6505, + "step": 585 + }, + { + "epoch": 0.4688, + "grad_norm": 0.43452553027659485, + "learning_rate": 0.00011497745457592816, + "loss": 0.7067, + "step": 586 + }, + { + "epoch": 0.4696, + "grad_norm": 0.4112649946570928, + "learning_rate": 0.00011472112104572547, + "loss": 0.669, + "step": 587 + }, + { + "epoch": 0.4704, + "grad_norm": 0.43254489628109855, + "learning_rate": 0.00011446468860667421, + "loss": 0.7559, + "step": 588 + }, + { + "epoch": 0.4712, + "grad_norm": 0.3854223725240552, + "learning_rate": 0.0001142081589817027, + "loss": 0.7326, + "step": 589 + }, + { + "epoch": 0.472, + "grad_norm": 0.39943296691131486, + "learning_rate": 0.00011395153389439233, + "loss": 0.7384, + "step": 590 + }, + { + "epoch": 0.4728, + "grad_norm": 0.3999457589988329, + "learning_rate": 0.00011369481506896582, + "loss": 0.7388, + "step": 591 + }, + { + "epoch": 0.4736, + "grad_norm": 0.35882038109855535, + "learning_rate": 0.00011343800423027582, + "loss": 0.6333, + "step": 592 + }, + { + "epoch": 0.4744, + "grad_norm": 0.4140687920652767, + "learning_rate": 0.00011318110310379301, + "loss": 0.7279, + "step": 593 + }, + { + "epoch": 0.4752, + "grad_norm": 0.3740753880813183, + "learning_rate": 0.0001129241134155949, + "loss": 0.7056, + "step": 594 + }, + { + "epoch": 0.476, + "grad_norm": 0.46208895798030497, + "learning_rate": 0.00011266703689235394, + "loss": 0.7335, + "step": 595 + }, + { + "epoch": 0.4768, + "grad_norm": 0.46377081662772324, + "learning_rate": 0.00011240987526132594, + "loss": 0.716, + "step": 596 + }, + { + "epoch": 0.4776, + "grad_norm": 0.38305067732640785, + "learning_rate": 0.00011215263025033869, + "loss": 0.6512, + "step": 597 + }, + { + "epoch": 0.4784, + "grad_norm": 0.350180861720446, + "learning_rate": 0.00011189530358778005, + "loss": 0.6376, + "step": 598 + }, + { + "epoch": 0.4792, + "grad_norm": 0.4480134312250272, + "learning_rate": 0.00011163789700258655, + "loss": 0.771, + "step": 599 + }, + { + "epoch": 0.48, + "grad_norm": 0.3995371197659842, + "learning_rate": 0.00011138041222423177, + "loss": 0.657, + "step": 600 + }, + { + "epoch": 0.4808, + "grad_norm": 0.3841862716369333, + "learning_rate": 0.00011112285098271451, + "loss": 0.6695, + "step": 601 + }, + { + "epoch": 0.4816, + "grad_norm": 0.33530177000934397, + "learning_rate": 0.00011086521500854745, + "loss": 0.6374, + "step": 602 + }, + { + "epoch": 0.4824, + "grad_norm": 0.37125518410251107, + "learning_rate": 0.00011060750603274535, + "loss": 0.6261, + "step": 603 + }, + { + "epoch": 0.4832, + "grad_norm": 0.4267792415551032, + "learning_rate": 0.00011034972578681338, + "loss": 0.6829, + "step": 604 + }, + { + "epoch": 0.484, + "grad_norm": 0.42046426161249545, + "learning_rate": 0.00011009187600273566, + "loss": 0.7089, + "step": 605 + }, + { + "epoch": 0.4848, + "grad_norm": 0.461124919714734, + "learning_rate": 0.00010983395841296348, + "loss": 0.7683, + "step": 606 + }, + { + "epoch": 0.4856, + "grad_norm": 0.3835175921110268, + "learning_rate": 0.00010957597475040373, + "loss": 0.7167, + "step": 607 + }, + { + "epoch": 0.4864, + "grad_norm": 0.4566024997195662, + "learning_rate": 0.00010931792674840718, + "loss": 0.787, + "step": 608 + }, + { + "epoch": 0.4872, + "grad_norm": 0.40318473698590684, + "learning_rate": 0.00010905981614075693, + "loss": 0.6743, + "step": 609 + }, + { + "epoch": 0.488, + "grad_norm": 0.4285776183346902, + "learning_rate": 0.00010880164466165674, + "loss": 0.7106, + "step": 610 + }, + { + "epoch": 0.4888, + "grad_norm": 0.43612420650390027, + "learning_rate": 0.00010854341404571928, + "loss": 0.6738, + "step": 611 + }, + { + "epoch": 0.4896, + "grad_norm": 0.3378784964593356, + "learning_rate": 0.00010828512602795462, + "loss": 0.6831, + "step": 612 + }, + { + "epoch": 0.4904, + "grad_norm": 0.4049813905815682, + "learning_rate": 0.00010802678234375851, + "loss": 0.7191, + "step": 613 + }, + { + "epoch": 0.4912, + "grad_norm": 0.37071274223937906, + "learning_rate": 0.00010776838472890065, + "loss": 0.6722, + "step": 614 + }, + { + "epoch": 0.492, + "grad_norm": 0.4123650756025684, + "learning_rate": 0.0001075099349195131, + "loss": 0.6982, + "step": 615 + }, + { + "epoch": 0.4928, + "grad_norm": 0.4007101740530657, + "learning_rate": 0.00010725143465207867, + "loss": 0.6597, + "step": 616 + }, + { + "epoch": 0.4936, + "grad_norm": 0.40314128961867496, + "learning_rate": 0.00010699288566341914, + "loss": 0.7548, + "step": 617 + }, + { + "epoch": 0.4944, + "grad_norm": 0.3827117028203895, + "learning_rate": 0.00010673428969068364, + "loss": 0.6954, + "step": 618 + }, + { + "epoch": 0.4952, + "grad_norm": 0.3743203935131224, + "learning_rate": 0.000106475648471337, + "loss": 0.7035, + "step": 619 + }, + { + "epoch": 0.496, + "grad_norm": 0.4068759593814105, + "learning_rate": 0.00010621696374314807, + "loss": 0.7002, + "step": 620 + }, + { + "epoch": 0.4968, + "grad_norm": 0.4125840511413991, + "learning_rate": 0.00010595823724417795, + "loss": 0.7444, + "step": 621 + }, + { + "epoch": 0.4976, + "grad_norm": 0.4430040213051985, + "learning_rate": 0.00010569947071276847, + "loss": 0.7316, + "step": 622 + }, + { + "epoch": 0.4984, + "grad_norm": 0.3972766518463086, + "learning_rate": 0.00010544066588753044, + "loss": 0.732, + "step": 623 + }, + { + "epoch": 0.4992, + "grad_norm": 0.3710596730970979, + "learning_rate": 0.00010518182450733186, + "loss": 0.7097, + "step": 624 + }, + { + "epoch": 0.5, + "grad_norm": 0.4166041804263977, + "learning_rate": 0.00010492294831128641, + "loss": 0.7319, + "step": 625 + }, + { + "epoch": 0.5008, + "grad_norm": 0.42407249404557373, + "learning_rate": 0.00010466403903874176, + "loss": 0.7102, + "step": 626 + }, + { + "epoch": 0.5016, + "grad_norm": 0.40428242410174164, + "learning_rate": 0.00010440509842926767, + "loss": 0.6893, + "step": 627 + }, + { + "epoch": 0.5024, + "grad_norm": 0.4090237750909439, + "learning_rate": 0.00010414612822264455, + "loss": 0.6569, + "step": 628 + }, + { + "epoch": 0.5032, + "grad_norm": 0.41322585430378184, + "learning_rate": 0.00010388713015885161, + "loss": 0.6852, + "step": 629 + }, + { + "epoch": 0.504, + "grad_norm": 0.4034583736535131, + "learning_rate": 0.00010362810597805526, + "loss": 0.7315, + "step": 630 + }, + { + "epoch": 0.5048, + "grad_norm": 0.38020879026638044, + "learning_rate": 0.00010336905742059742, + "loss": 0.6595, + "step": 631 + }, + { + "epoch": 0.5056, + "grad_norm": 0.4534286548708508, + "learning_rate": 0.0001031099862269837, + "loss": 0.6826, + "step": 632 + }, + { + "epoch": 0.5064, + "grad_norm": 0.47032200968307747, + "learning_rate": 0.0001028508941378719, + "loss": 0.672, + "step": 633 + }, + { + "epoch": 0.5072, + "grad_norm": 0.49084443662279653, + "learning_rate": 0.00010259178289406011, + "loss": 0.7733, + "step": 634 + }, + { + "epoch": 0.508, + "grad_norm": 0.44710860334300656, + "learning_rate": 0.00010233265423647523, + "loss": 0.7091, + "step": 635 + }, + { + "epoch": 0.5088, + "grad_norm": 0.4431207850279007, + "learning_rate": 0.00010207350990616107, + "loss": 0.7457, + "step": 636 + }, + { + "epoch": 0.5096, + "grad_norm": 0.42285002870095373, + "learning_rate": 0.00010181435164426676, + "loss": 0.6834, + "step": 637 + }, + { + "epoch": 0.5104, + "grad_norm": 0.45191770643994017, + "learning_rate": 0.0001015551811920351, + "loss": 0.6795, + "step": 638 + }, + { + "epoch": 0.5112, + "grad_norm": 0.4195601822442567, + "learning_rate": 0.00010129600029079072, + "loss": 0.6417, + "step": 639 + }, + { + "epoch": 0.512, + "grad_norm": 0.39460985762925294, + "learning_rate": 0.00010103681068192845, + "loss": 0.7023, + "step": 640 + }, + { + "epoch": 0.5128, + "grad_norm": 0.44789571155478425, + "learning_rate": 0.00010077761410690172, + "loss": 0.7759, + "step": 641 + }, + { + "epoch": 0.5136, + "grad_norm": 0.4300703454189236, + "learning_rate": 0.00010051841230721065, + "loss": 0.7448, + "step": 642 + }, + { + "epoch": 0.5144, + "grad_norm": 0.40179064279153015, + "learning_rate": 0.00010025920702439051, + "loss": 0.7139, + "step": 643 + }, + { + "epoch": 0.5152, + "grad_norm": 0.4251526854618008, + "learning_rate": 0.0001, + "loss": 0.624, + "step": 644 + }, + { + "epoch": 0.516, + "grad_norm": 0.39057488078696945, + "learning_rate": 9.97407929756095e-05, + "loss": 0.6603, + "step": 645 + }, + { + "epoch": 0.5168, + "grad_norm": 0.42630707149847313, + "learning_rate": 9.948158769278939e-05, + "loss": 0.6358, + "step": 646 + }, + { + "epoch": 0.5176, + "grad_norm": 0.5327749683651356, + "learning_rate": 9.92223858930983e-05, + "loss": 0.6579, + "step": 647 + }, + { + "epoch": 0.5184, + "grad_norm": 0.40059686695101615, + "learning_rate": 9.896318931807155e-05, + "loss": 0.6655, + "step": 648 + }, + { + "epoch": 0.5192, + "grad_norm": 0.4818593418381336, + "learning_rate": 9.870399970920932e-05, + "loss": 0.6803, + "step": 649 + }, + { + "epoch": 0.52, + "grad_norm": 0.3831421544104826, + "learning_rate": 9.844481880796491e-05, + "loss": 0.6959, + "step": 650 + }, + { + "epoch": 0.5208, + "grad_norm": 0.4385493137510518, + "learning_rate": 9.818564835573323e-05, + "loss": 0.6315, + "step": 651 + }, + { + "epoch": 0.5216, + "grad_norm": 0.4187935330881887, + "learning_rate": 9.792649009383899e-05, + "loss": 0.7037, + "step": 652 + }, + { + "epoch": 0.5224, + "grad_norm": 0.417803657805014, + "learning_rate": 9.766734576352478e-05, + "loss": 0.7078, + "step": 653 + }, + { + "epoch": 0.5232, + "grad_norm": 0.4762132158687772, + "learning_rate": 9.740821710593989e-05, + "loss": 0.7378, + "step": 654 + }, + { + "epoch": 0.524, + "grad_norm": 0.4119388003608324, + "learning_rate": 9.714910586212816e-05, + "loss": 0.7254, + "step": 655 + }, + { + "epoch": 0.5248, + "grad_norm": 0.41486154550669097, + "learning_rate": 9.689001377301633e-05, + "loss": 0.6415, + "step": 656 + }, + { + "epoch": 0.5256, + "grad_norm": 0.3712347933088719, + "learning_rate": 9.663094257940258e-05, + "loss": 0.6451, + "step": 657 + }, + { + "epoch": 0.5264, + "grad_norm": 0.40303060884995057, + "learning_rate": 9.637189402194476e-05, + "loss": 0.6969, + "step": 658 + }, + { + "epoch": 0.5272, + "grad_norm": 0.4619075618967914, + "learning_rate": 9.611286984114841e-05, + "loss": 0.7211, + "step": 659 + }, + { + "epoch": 0.528, + "grad_norm": 0.3546479520494259, + "learning_rate": 9.585387177735547e-05, + "loss": 0.6806, + "step": 660 + }, + { + "epoch": 0.5288, + "grad_norm": 0.4383990518530939, + "learning_rate": 9.559490157073236e-05, + "loss": 0.6902, + "step": 661 + }, + { + "epoch": 0.5296, + "grad_norm": 0.4662434092306648, + "learning_rate": 9.533596096125825e-05, + "loss": 0.7011, + "step": 662 + }, + { + "epoch": 0.5304, + "grad_norm": 0.4028329669021629, + "learning_rate": 9.507705168871358e-05, + "loss": 0.6728, + "step": 663 + }, + { + "epoch": 0.5312, + "grad_norm": 0.42368747763122155, + "learning_rate": 9.481817549266817e-05, + "loss": 0.6635, + "step": 664 + }, + { + "epoch": 0.532, + "grad_norm": 0.4726867637244179, + "learning_rate": 9.455933411246958e-05, + "loss": 0.7119, + "step": 665 + }, + { + "epoch": 0.5328, + "grad_norm": 0.4215263629263457, + "learning_rate": 9.430052928723153e-05, + "loss": 0.7225, + "step": 666 + }, + { + "epoch": 0.5336, + "grad_norm": 0.43626293441672603, + "learning_rate": 9.404176275582208e-05, + "loss": 0.7973, + "step": 667 + }, + { + "epoch": 0.5344, + "grad_norm": 0.39943765901908274, + "learning_rate": 9.378303625685195e-05, + "loss": 0.7022, + "step": 668 + }, + { + "epoch": 0.5352, + "grad_norm": 0.4112569173604557, + "learning_rate": 9.352435152866298e-05, + "loss": 0.6989, + "step": 669 + }, + { + "epoch": 0.536, + "grad_norm": 0.3858144331806179, + "learning_rate": 9.326571030931637e-05, + "loss": 0.6243, + "step": 670 + }, + { + "epoch": 0.5368, + "grad_norm": 0.6098326779465145, + "learning_rate": 9.300711433658087e-05, + "loss": 0.7532, + "step": 671 + }, + { + "epoch": 0.5376, + "grad_norm": 0.37361671169949173, + "learning_rate": 9.274856534792138e-05, + "loss": 0.6195, + "step": 672 + }, + { + "epoch": 0.5384, + "grad_norm": 0.4370848116862876, + "learning_rate": 9.249006508048694e-05, + "loss": 0.7093, + "step": 673 + }, + { + "epoch": 0.5392, + "grad_norm": 0.39664569567571945, + "learning_rate": 9.223161527109937e-05, + "loss": 0.7014, + "step": 674 + }, + { + "epoch": 0.54, + "grad_norm": 0.40670887161198743, + "learning_rate": 9.197321765624152e-05, + "loss": 0.693, + "step": 675 + }, + { + "epoch": 0.5408, + "grad_norm": 0.40726690010248706, + "learning_rate": 9.171487397204539e-05, + "loss": 0.6217, + "step": 676 + }, + { + "epoch": 0.5416, + "grad_norm": 0.44412816216991685, + "learning_rate": 9.145658595428074e-05, + "loss": 0.6866, + "step": 677 + }, + { + "epoch": 0.5424, + "grad_norm": 0.426728437560957, + "learning_rate": 9.119835533834331e-05, + "loss": 0.7291, + "step": 678 + }, + { + "epoch": 0.5432, + "grad_norm": 0.4164401946897966, + "learning_rate": 9.09401838592431e-05, + "loss": 0.7369, + "step": 679 + }, + { + "epoch": 0.544, + "grad_norm": 0.4321081763133884, + "learning_rate": 9.068207325159284e-05, + "loss": 0.6376, + "step": 680 + }, + { + "epoch": 0.5448, + "grad_norm": 0.4731745394937619, + "learning_rate": 9.04240252495963e-05, + "loss": 0.7111, + "step": 681 + }, + { + "epoch": 0.5456, + "grad_norm": 0.39995757145877464, + "learning_rate": 9.016604158703654e-05, + "loss": 0.694, + "step": 682 + }, + { + "epoch": 0.5464, + "grad_norm": 0.4364384119760376, + "learning_rate": 8.990812399726435e-05, + "loss": 0.6365, + "step": 683 + }, + { + "epoch": 0.5472, + "grad_norm": 0.4114799763475482, + "learning_rate": 8.965027421318665e-05, + "loss": 0.7632, + "step": 684 + }, + { + "epoch": 0.548, + "grad_norm": 0.42800537132030453, + "learning_rate": 8.939249396725467e-05, + "loss": 0.7324, + "step": 685 + }, + { + "epoch": 0.5488, + "grad_norm": 0.43782200893842077, + "learning_rate": 8.913478499145254e-05, + "loss": 0.704, + "step": 686 + }, + { + "epoch": 0.5496, + "grad_norm": 0.4301697913751564, + "learning_rate": 8.887714901728551e-05, + "loss": 0.6989, + "step": 687 + }, + { + "epoch": 0.5504, + "grad_norm": 0.3923326776816395, + "learning_rate": 8.861958777576827e-05, + "loss": 0.7393, + "step": 688 + }, + { + "epoch": 0.5512, + "grad_norm": 0.4355980113522696, + "learning_rate": 8.836210299741346e-05, + "loss": 0.6924, + "step": 689 + }, + { + "epoch": 0.552, + "grad_norm": 0.4404265439104104, + "learning_rate": 8.810469641222001e-05, + "loss": 0.6794, + "step": 690 + }, + { + "epoch": 0.5528, + "grad_norm": 0.41387428034521984, + "learning_rate": 8.784736974966135e-05, + "loss": 0.7322, + "step": 691 + }, + { + "epoch": 0.5536, + "grad_norm": 0.390878116934256, + "learning_rate": 8.759012473867407e-05, + "loss": 0.6451, + "step": 692 + }, + { + "epoch": 0.5544, + "grad_norm": 0.4365440266919981, + "learning_rate": 8.733296310764611e-05, + "loss": 0.6906, + "step": 693 + }, + { + "epoch": 0.5552, + "grad_norm": 0.42202109340957455, + "learning_rate": 8.707588658440511e-05, + "loss": 0.6962, + "step": 694 + }, + { + "epoch": 0.556, + "grad_norm": 0.4681452856192567, + "learning_rate": 8.6818896896207e-05, + "loss": 0.7254, + "step": 695 + }, + { + "epoch": 0.5568, + "grad_norm": 0.4647895188575558, + "learning_rate": 8.656199576972423e-05, + "loss": 0.7099, + "step": 696 + }, + { + "epoch": 0.5576, + "grad_norm": 0.47224116271446176, + "learning_rate": 8.63051849310342e-05, + "loss": 0.6874, + "step": 697 + }, + { + "epoch": 0.5584, + "grad_norm": 0.3767664595638354, + "learning_rate": 8.604846610560771e-05, + "loss": 0.6404, + "step": 698 + }, + { + "epoch": 0.5592, + "grad_norm": 0.4312374747931622, + "learning_rate": 8.579184101829734e-05, + "loss": 0.717, + "step": 699 + }, + { + "epoch": 0.56, + "grad_norm": 0.4754052770334311, + "learning_rate": 8.553531139332582e-05, + "loss": 0.7162, + "step": 700 + }, + { + "epoch": 0.5608, + "grad_norm": 0.5014028514766248, + "learning_rate": 8.527887895427454e-05, + "loss": 0.6555, + "step": 701 + }, + { + "epoch": 0.5616, + "grad_norm": 0.3734741582608827, + "learning_rate": 8.502254542407186e-05, + "loss": 0.6584, + "step": 702 + }, + { + "epoch": 0.5624, + "grad_norm": 0.44542244524799046, + "learning_rate": 8.476631252498162e-05, + "loss": 0.7316, + "step": 703 + }, + { + "epoch": 0.5632, + "grad_norm": 0.4206856768160065, + "learning_rate": 8.451018197859153e-05, + "loss": 0.6753, + "step": 704 + }, + { + "epoch": 0.564, + "grad_norm": 0.42678188555779784, + "learning_rate": 8.425415550580162e-05, + "loss": 0.6847, + "step": 705 + }, + { + "epoch": 0.5648, + "grad_norm": 0.3635002403845702, + "learning_rate": 8.399823482681262e-05, + "loss": 0.627, + "step": 706 + }, + { + "epoch": 0.5656, + "grad_norm": 0.4065817189128296, + "learning_rate": 8.374242166111448e-05, + "loss": 0.6938, + "step": 707 + }, + { + "epoch": 0.5664, + "grad_norm": 0.4316955926145281, + "learning_rate": 8.348671772747487e-05, + "loss": 0.701, + "step": 708 + }, + { + "epoch": 0.5672, + "grad_norm": 0.34775313117517553, + "learning_rate": 8.323112474392731e-05, + "loss": 0.6375, + "step": 709 + }, + { + "epoch": 0.568, + "grad_norm": 0.4347971567231205, + "learning_rate": 8.297564442776014e-05, + "loss": 0.7339, + "step": 710 + }, + { + "epoch": 0.5688, + "grad_norm": 0.3870375456359733, + "learning_rate": 8.272027849550457e-05, + "loss": 0.7262, + "step": 711 + }, + { + "epoch": 0.5696, + "grad_norm": 0.4507678296290438, + "learning_rate": 8.246502866292324e-05, + "loss": 0.6907, + "step": 712 + }, + { + "epoch": 0.5704, + "grad_norm": 0.42712214159726214, + "learning_rate": 8.220989664499878e-05, + "loss": 0.7668, + "step": 713 + }, + { + "epoch": 0.5712, + "grad_norm": 0.41015505420405474, + "learning_rate": 8.195488415592238e-05, + "loss": 0.6888, + "step": 714 + }, + { + "epoch": 0.572, + "grad_norm": 0.39763102750325163, + "learning_rate": 8.169999290908188e-05, + "loss": 0.6419, + "step": 715 + }, + { + "epoch": 0.5728, + "grad_norm": 0.4191530147392954, + "learning_rate": 8.144522461705067e-05, + "loss": 0.7052, + "step": 716 + }, + { + "epoch": 0.5736, + "grad_norm": 0.46946824442040413, + "learning_rate": 8.119058099157604e-05, + "loss": 0.5974, + "step": 717 + }, + { + "epoch": 0.5744, + "grad_norm": 0.3728556157947248, + "learning_rate": 8.093606374356759e-05, + "loss": 0.6967, + "step": 718 + }, + { + "epoch": 0.5752, + "grad_norm": 0.4043781316642281, + "learning_rate": 8.068167458308582e-05, + "loss": 0.6936, + "step": 719 + }, + { + "epoch": 0.576, + "grad_norm": 0.42578695870834243, + "learning_rate": 8.042741521933071e-05, + "loss": 0.7122, + "step": 720 + }, + { + "epoch": 0.5768, + "grad_norm": 0.3696512046075325, + "learning_rate": 8.017328736063006e-05, + "loss": 0.6606, + "step": 721 + }, + { + "epoch": 0.5776, + "grad_norm": 0.4591620718922479, + "learning_rate": 7.991929271442817e-05, + "loss": 0.7154, + "step": 722 + }, + { + "epoch": 0.5784, + "grad_norm": 0.43031506983236373, + "learning_rate": 7.966543298727425e-05, + "loss": 0.7474, + "step": 723 + }, + { + "epoch": 0.5792, + "grad_norm": 0.37611435319488984, + "learning_rate": 7.941170988481108e-05, + "loss": 0.6613, + "step": 724 + }, + { + "epoch": 0.58, + "grad_norm": 0.4398367622705673, + "learning_rate": 7.915812511176347e-05, + "loss": 0.6985, + "step": 725 + }, + { + "epoch": 0.5808, + "grad_norm": 0.4606668684235599, + "learning_rate": 7.89046803719267e-05, + "loss": 0.7546, + "step": 726 + }, + { + "epoch": 0.5816, + "grad_norm": 0.4454089848272011, + "learning_rate": 7.865137736815535e-05, + "loss": 0.7565, + "step": 727 + }, + { + "epoch": 0.5824, + "grad_norm": 0.3796944207616218, + "learning_rate": 7.839821780235168e-05, + "loss": 0.6453, + "step": 728 + }, + { + "epoch": 0.5832, + "grad_norm": 0.4870938883020908, + "learning_rate": 7.814520337545406e-05, + "loss": 0.6231, + "step": 729 + }, + { + "epoch": 0.584, + "grad_norm": 0.3919047888701881, + "learning_rate": 7.789233578742582e-05, + "loss": 0.6756, + "step": 730 + }, + { + "epoch": 0.5848, + "grad_norm": 0.351449737980625, + "learning_rate": 7.763961673724379e-05, + "loss": 0.6984, + "step": 731 + }, + { + "epoch": 0.5856, + "grad_norm": 0.42252464066428147, + "learning_rate": 7.738704792288655e-05, + "loss": 0.6686, + "step": 732 + }, + { + "epoch": 0.5864, + "grad_norm": 0.3914560028501668, + "learning_rate": 7.713463104132345e-05, + "loss": 0.6908, + "step": 733 + }, + { + "epoch": 0.5872, + "grad_norm": 0.40295174723623234, + "learning_rate": 7.688236778850306e-05, + "loss": 0.6445, + "step": 734 + }, + { + "epoch": 0.588, + "grad_norm": 0.39073172302904613, + "learning_rate": 7.663025985934158e-05, + "loss": 0.6937, + "step": 735 + }, + { + "epoch": 0.5888, + "grad_norm": 0.3709963125321195, + "learning_rate": 7.637830894771175e-05, + "loss": 0.668, + "step": 736 + }, + { + "epoch": 0.5896, + "grad_norm": 0.3933866238712117, + "learning_rate": 7.61265167464313e-05, + "loss": 0.6484, + "step": 737 + }, + { + "epoch": 0.5904, + "grad_norm": 0.4063031184543873, + "learning_rate": 7.587488494725157e-05, + "loss": 0.6566, + "step": 738 + }, + { + "epoch": 0.5912, + "grad_norm": 0.40379378551884454, + "learning_rate": 7.562341524084623e-05, + "loss": 0.6959, + "step": 739 + }, + { + "epoch": 0.592, + "grad_norm": 0.46800810242234814, + "learning_rate": 7.537210931679987e-05, + "loss": 0.7401, + "step": 740 + }, + { + "epoch": 0.5928, + "grad_norm": 0.41783370423759914, + "learning_rate": 7.512096886359664e-05, + "loss": 0.6845, + "step": 741 + }, + { + "epoch": 0.5936, + "grad_norm": 0.3445165919380514, + "learning_rate": 7.48699955686089e-05, + "loss": 0.6381, + "step": 742 + }, + { + "epoch": 0.5944, + "grad_norm": 0.34281709881514283, + "learning_rate": 7.461919111808595e-05, + "loss": 0.6033, + "step": 743 + }, + { + "epoch": 0.5952, + "grad_norm": 0.4911202166509181, + "learning_rate": 7.43685571971426e-05, + "loss": 0.7597, + "step": 744 + }, + { + "epoch": 0.596, + "grad_norm": 0.4453953009786171, + "learning_rate": 7.411809548974792e-05, + "loss": 0.6656, + "step": 745 + }, + { + "epoch": 0.5968, + "grad_norm": 0.41829674492066976, + "learning_rate": 7.386780767871397e-05, + "loss": 0.7278, + "step": 746 + }, + { + "epoch": 0.5976, + "grad_norm": 0.4627807550930201, + "learning_rate": 7.361769544568425e-05, + "loss": 0.7188, + "step": 747 + }, + { + "epoch": 0.5984, + "grad_norm": 0.46143136545483737, + "learning_rate": 7.336776047112276e-05, + "loss": 0.678, + "step": 748 + }, + { + "epoch": 0.5992, + "grad_norm": 0.42262247738767705, + "learning_rate": 7.311800443430251e-05, + "loss": 0.6266, + "step": 749 + }, + { + "epoch": 0.6, + "grad_norm": 0.36706273204871864, + "learning_rate": 7.286842901329412e-05, + "loss": 0.6833, + "step": 750 + }, + { + "epoch": 0.6008, + "grad_norm": 0.36955889205145553, + "learning_rate": 7.26190358849548e-05, + "loss": 0.6326, + "step": 751 + }, + { + "epoch": 0.6016, + "grad_norm": 0.4051583767582194, + "learning_rate": 7.236982672491698e-05, + "loss": 0.641, + "step": 752 + }, + { + "epoch": 0.6024, + "grad_norm": 0.40047164298288446, + "learning_rate": 7.212080320757695e-05, + "loss": 0.6597, + "step": 753 + }, + { + "epoch": 0.6032, + "grad_norm": 0.39501334821182466, + "learning_rate": 7.187196700608373e-05, + "loss": 0.6666, + "step": 754 + }, + { + "epoch": 0.604, + "grad_norm": 0.382263551767098, + "learning_rate": 7.162331979232783e-05, + "loss": 0.6689, + "step": 755 + }, + { + "epoch": 0.6048, + "grad_norm": 0.44546686537191926, + "learning_rate": 7.137486323692995e-05, + "loss": 0.6527, + "step": 756 + }, + { + "epoch": 0.6056, + "grad_norm": 0.47193164450770997, + "learning_rate": 7.112659900922976e-05, + "loss": 0.6341, + "step": 757 + }, + { + "epoch": 0.6064, + "grad_norm": 0.46796251950243534, + "learning_rate": 7.087852877727481e-05, + "loss": 0.7434, + "step": 758 + }, + { + "epoch": 0.6072, + "grad_norm": 0.41135014400587067, + "learning_rate": 7.06306542078091e-05, + "loss": 0.6632, + "step": 759 + }, + { + "epoch": 0.608, + "grad_norm": 0.35174694214103835, + "learning_rate": 7.038297696626206e-05, + "loss": 0.5599, + "step": 760 + }, + { + "epoch": 0.6088, + "grad_norm": 0.4261893917296931, + "learning_rate": 7.013549871673736e-05, + "loss": 0.7009, + "step": 761 + }, + { + "epoch": 0.6096, + "grad_norm": 0.4113412739863776, + "learning_rate": 6.988822112200156e-05, + "loss": 0.6665, + "step": 762 + }, + { + "epoch": 0.6104, + "grad_norm": 0.40679542718728934, + "learning_rate": 6.964114584347316e-05, + "loss": 0.6701, + "step": 763 + }, + { + "epoch": 0.6112, + "grad_norm": 0.3926481491746906, + "learning_rate": 6.939427454121128e-05, + "loss": 0.6119, + "step": 764 + }, + { + "epoch": 0.612, + "grad_norm": 0.39778211443742756, + "learning_rate": 6.914760887390452e-05, + "loss": 0.6747, + "step": 765 + }, + { + "epoch": 0.6128, + "grad_norm": 0.44917308211135637, + "learning_rate": 6.890115049885994e-05, + "loss": 0.747, + "step": 766 + }, + { + "epoch": 0.6136, + "grad_norm": 0.4236705214830975, + "learning_rate": 6.865490107199181e-05, + "loss": 0.6911, + "step": 767 + }, + { + "epoch": 0.6144, + "grad_norm": 0.4014067169197921, + "learning_rate": 6.84088622478104e-05, + "loss": 0.6786, + "step": 768 + }, + { + "epoch": 0.6152, + "grad_norm": 0.37858996457960126, + "learning_rate": 6.816303567941112e-05, + "loss": 0.7139, + "step": 769 + }, + { + "epoch": 0.616, + "grad_norm": 0.3926606726180132, + "learning_rate": 6.791742301846326e-05, + "loss": 0.6702, + "step": 770 + }, + { + "epoch": 0.6168, + "grad_norm": 0.4258896736184436, + "learning_rate": 6.767202591519875e-05, + "loss": 0.7064, + "step": 771 + }, + { + "epoch": 0.6176, + "grad_norm": 0.4023392887251313, + "learning_rate": 6.742684601840141e-05, + "loss": 0.668, + "step": 772 + }, + { + "epoch": 0.6184, + "grad_norm": 0.5210062335588069, + "learning_rate": 6.718188497539554e-05, + "loss": 0.7616, + "step": 773 + }, + { + "epoch": 0.6192, + "grad_norm": 0.40328490054705385, + "learning_rate": 6.693714443203507e-05, + "loss": 0.6838, + "step": 774 + }, + { + "epoch": 0.62, + "grad_norm": 0.3889947109465521, + "learning_rate": 6.669262603269246e-05, + "loss": 0.6857, + "step": 775 + }, + { + "epoch": 0.6208, + "grad_norm": 0.378072964715034, + "learning_rate": 6.644833142024751e-05, + "loss": 0.6649, + "step": 776 + }, + { + "epoch": 0.6216, + "grad_norm": 0.3841233525812373, + "learning_rate": 6.620426223607654e-05, + "loss": 0.6999, + "step": 777 + }, + { + "epoch": 0.6224, + "grad_norm": 0.5040506704424114, + "learning_rate": 6.59604201200412e-05, + "loss": 0.6357, + "step": 778 + }, + { + "epoch": 0.6232, + "grad_norm": 0.4034614415798281, + "learning_rate": 6.571680671047749e-05, + "loss": 0.7319, + "step": 779 + }, + { + "epoch": 0.624, + "grad_norm": 0.4226046627171998, + "learning_rate": 6.547342364418481e-05, + "loss": 0.7206, + "step": 780 + }, + { + "epoch": 0.6248, + "grad_norm": 0.3863173873353244, + "learning_rate": 6.523027255641493e-05, + "loss": 0.6538, + "step": 781 + }, + { + "epoch": 0.6256, + "grad_norm": 0.4509624943489986, + "learning_rate": 6.498735508086093e-05, + "loss": 0.6813, + "step": 782 + }, + { + "epoch": 0.6264, + "grad_norm": 0.4325182683212749, + "learning_rate": 6.474467284964634e-05, + "loss": 0.6811, + "step": 783 + }, + { + "epoch": 0.6272, + "grad_norm": 0.4304866664044165, + "learning_rate": 6.450222749331414e-05, + "loss": 0.6954, + "step": 784 + }, + { + "epoch": 0.628, + "grad_norm": 0.40887192147637236, + "learning_rate": 6.426002064081565e-05, + "loss": 0.6923, + "step": 785 + }, + { + "epoch": 0.6288, + "grad_norm": 0.47669886743863876, + "learning_rate": 6.40180539194999e-05, + "loss": 0.7068, + "step": 786 + }, + { + "epoch": 0.6296, + "grad_norm": 0.3654067929691481, + "learning_rate": 6.377632895510248e-05, + "loss": 0.6683, + "step": 787 + }, + { + "epoch": 0.6304, + "grad_norm": 0.383565537193584, + "learning_rate": 6.35348473717345e-05, + "loss": 0.6437, + "step": 788 + }, + { + "epoch": 0.6312, + "grad_norm": 0.3893638313547271, + "learning_rate": 6.329361079187199e-05, + "loss": 0.6859, + "step": 789 + }, + { + "epoch": 0.632, + "grad_norm": 0.3927667218699901, + "learning_rate": 6.305262083634488e-05, + "loss": 0.6595, + "step": 790 + }, + { + "epoch": 0.6328, + "grad_norm": 0.38977229894335297, + "learning_rate": 6.281187912432587e-05, + "loss": 0.6497, + "step": 791 + }, + { + "epoch": 0.6336, + "grad_norm": 0.4012048981102772, + "learning_rate": 6.25713872733199e-05, + "loss": 0.6744, + "step": 792 + }, + { + "epoch": 0.6344, + "grad_norm": 0.4205859614124761, + "learning_rate": 6.233114689915316e-05, + "loss": 0.7039, + "step": 793 + }, + { + "epoch": 0.6352, + "grad_norm": 0.4238390039930889, + "learning_rate": 6.209115961596208e-05, + "loss": 0.5847, + "step": 794 + }, + { + "epoch": 0.636, + "grad_norm": 0.5431135837848629, + "learning_rate": 6.18514270361827e-05, + "loss": 0.7442, + "step": 795 + }, + { + "epoch": 0.6368, + "grad_norm": 0.3841051928298376, + "learning_rate": 6.161195077053976e-05, + "loss": 0.6183, + "step": 796 + }, + { + "epoch": 0.6376, + "grad_norm": 0.3697223969819957, + "learning_rate": 6.13727324280358e-05, + "loss": 0.6375, + "step": 797 + }, + { + "epoch": 0.6384, + "grad_norm": 0.3590098236656919, + "learning_rate": 6.113377361594049e-05, + "loss": 0.6404, + "step": 798 + }, + { + "epoch": 0.6392, + "grad_norm": 0.4125776500113079, + "learning_rate": 6.08950759397797e-05, + "loss": 0.6878, + "step": 799 + }, + { + "epoch": 0.64, + "grad_norm": 0.3947146565222585, + "learning_rate": 6.065664100332478e-05, + "loss": 0.7141, + "step": 800 + }, + { + "epoch": 0.6408, + "grad_norm": 0.3694402423492765, + "learning_rate": 6.0418470408581774e-05, + "loss": 0.7004, + "step": 801 + }, + { + "epoch": 0.6416, + "grad_norm": 0.3835766426472468, + "learning_rate": 6.018056575578075e-05, + "loss": 0.6935, + "step": 802 + }, + { + "epoch": 0.6424, + "grad_norm": 0.37080887534840773, + "learning_rate": 5.9942928643364724e-05, + "loss": 0.5795, + "step": 803 + }, + { + "epoch": 0.6432, + "grad_norm": 0.43723240190470886, + "learning_rate": 5.970556066797941e-05, + "loss": 0.6628, + "step": 804 + }, + { + "epoch": 0.644, + "grad_norm": 0.39385458611995466, + "learning_rate": 5.946846342446214e-05, + "loss": 0.7022, + "step": 805 + }, + { + "epoch": 0.6448, + "grad_norm": 0.3901449855591286, + "learning_rate": 5.923163850583113e-05, + "loss": 0.647, + "step": 806 + }, + { + "epoch": 0.6456, + "grad_norm": 0.42770788167662127, + "learning_rate": 5.899508750327501e-05, + "loss": 0.7048, + "step": 807 + }, + { + "epoch": 0.6464, + "grad_norm": 0.4896025780748295, + "learning_rate": 5.875881200614207e-05, + "loss": 0.6736, + "step": 808 + }, + { + "epoch": 0.6472, + "grad_norm": 0.3939777926516438, + "learning_rate": 5.8522813601929324e-05, + "loss": 0.722, + "step": 809 + }, + { + "epoch": 0.648, + "grad_norm": 0.41252256589683123, + "learning_rate": 5.828709387627218e-05, + "loss": 0.6749, + "step": 810 + }, + { + "epoch": 0.6488, + "grad_norm": 0.4230612348858875, + "learning_rate": 5.80516544129337e-05, + "loss": 0.6704, + "step": 811 + }, + { + "epoch": 0.6496, + "grad_norm": 0.4217687932057089, + "learning_rate": 5.781649679379378e-05, + "loss": 0.6463, + "step": 812 + }, + { + "epoch": 0.6504, + "grad_norm": 0.36426708106640004, + "learning_rate": 5.758162259883867e-05, + "loss": 0.6484, + "step": 813 + }, + { + "epoch": 0.6512, + "grad_norm": 0.3953184886570678, + "learning_rate": 5.73470334061505e-05, + "loss": 0.6941, + "step": 814 + }, + { + "epoch": 0.652, + "grad_norm": 0.38223814139919626, + "learning_rate": 5.7112730791896207e-05, + "loss": 0.6297, + "step": 815 + }, + { + "epoch": 0.6528, + "grad_norm": 0.416064650999233, + "learning_rate": 5.687871633031754e-05, + "loss": 0.6965, + "step": 816 + }, + { + "epoch": 0.6536, + "grad_norm": 0.4004307146179374, + "learning_rate": 5.664499159372017e-05, + "loss": 0.6475, + "step": 817 + }, + { + "epoch": 0.6544, + "grad_norm": 0.4708079297383675, + "learning_rate": 5.6411558152462894e-05, + "loss": 0.6851, + "step": 818 + }, + { + "epoch": 0.6552, + "grad_norm": 0.4290088114427895, + "learning_rate": 5.617841757494762e-05, + "loss": 0.6676, + "step": 819 + }, + { + "epoch": 0.656, + "grad_norm": 0.39728379907446343, + "learning_rate": 5.5945571427608526e-05, + "loss": 0.7086, + "step": 820 + }, + { + "epoch": 0.6568, + "grad_norm": 0.3539627879192858, + "learning_rate": 5.5713021274901335e-05, + "loss": 0.6368, + "step": 821 + }, + { + "epoch": 0.6576, + "grad_norm": 0.4516247553177675, + "learning_rate": 5.54807686792933e-05, + "loss": 0.7271, + "step": 822 + }, + { + "epoch": 0.6584, + "grad_norm": 0.3719122344570917, + "learning_rate": 5.524881520125229e-05, + "loss": 0.6593, + "step": 823 + }, + { + "epoch": 0.6592, + "grad_norm": 0.4201357728075419, + "learning_rate": 5.501716239923642e-05, + "loss": 0.7337, + "step": 824 + }, + { + "epoch": 0.66, + "grad_norm": 0.3856777395195659, + "learning_rate": 5.4785811829683764e-05, + "loss": 0.6698, + "step": 825 + }, + { + "epoch": 0.6608, + "grad_norm": 0.4191793263602028, + "learning_rate": 5.4554765047001613e-05, + "loss": 0.6846, + "step": 826 + }, + { + "epoch": 0.6616, + "grad_norm": 0.5023677642761707, + "learning_rate": 5.432402360355615e-05, + "loss": 0.7193, + "step": 827 + }, + { + "epoch": 0.6624, + "grad_norm": 0.40152491674816604, + "learning_rate": 5.4093589049662175e-05, + "loss": 0.6647, + "step": 828 + }, + { + "epoch": 0.6632, + "grad_norm": 0.5021325234550964, + "learning_rate": 5.386346293357242e-05, + "loss": 0.7424, + "step": 829 + }, + { + "epoch": 0.664, + "grad_norm": 0.3663233920022643, + "learning_rate": 5.363364680146725e-05, + "loss": 0.6455, + "step": 830 + }, + { + "epoch": 0.6648, + "grad_norm": 0.3766987285711635, + "learning_rate": 5.3404142197444506e-05, + "loss": 0.6492, + "step": 831 + }, + { + "epoch": 0.6656, + "grad_norm": 0.39615195638083806, + "learning_rate": 5.31749506635086e-05, + "loss": 0.6335, + "step": 832 + }, + { + "epoch": 0.6664, + "grad_norm": 0.3727499688053087, + "learning_rate": 5.2946073739560706e-05, + "loss": 0.6462, + "step": 833 + }, + { + "epoch": 0.6672, + "grad_norm": 0.4314468517020133, + "learning_rate": 5.271751296338823e-05, + "loss": 0.6355, + "step": 834 + }, + { + "epoch": 0.668, + "grad_norm": 0.39132260159381754, + "learning_rate": 5.248926987065417e-05, + "loss": 0.6415, + "step": 835 + }, + { + "epoch": 0.6688, + "grad_norm": 0.41696845922044734, + "learning_rate": 5.226134599488728e-05, + "loss": 0.6324, + "step": 836 + }, + { + "epoch": 0.6696, + "grad_norm": 0.43720442904996637, + "learning_rate": 5.203374286747158e-05, + "loss": 0.683, + "step": 837 + }, + { + "epoch": 0.6704, + "grad_norm": 0.4182924705742696, + "learning_rate": 5.180646201763577e-05, + "loss": 0.6227, + "step": 838 + }, + { + "epoch": 0.6712, + "grad_norm": 0.35803751020012203, + "learning_rate": 5.15795049724435e-05, + "loss": 0.6363, + "step": 839 + }, + { + "epoch": 0.672, + "grad_norm": 0.35708142177875307, + "learning_rate": 5.135287325678271e-05, + "loss": 0.646, + "step": 840 + }, + { + "epoch": 0.6728, + "grad_norm": 0.4616648828250431, + "learning_rate": 5.112656839335543e-05, + "loss": 0.6798, + "step": 841 + }, + { + "epoch": 0.6736, + "grad_norm": 0.4188551672236205, + "learning_rate": 5.090059190266779e-05, + "loss": 0.6459, + "step": 842 + }, + { + "epoch": 0.6744, + "grad_norm": 0.46822151383276933, + "learning_rate": 5.0674945303019526e-05, + "loss": 0.6995, + "step": 843 + }, + { + "epoch": 0.6752, + "grad_norm": 0.44004281585292426, + "learning_rate": 5.0449630110493836e-05, + "loss": 0.6202, + "step": 844 + }, + { + "epoch": 0.676, + "grad_norm": 0.3573156653645513, + "learning_rate": 5.022464783894744e-05, + "loss": 0.5897, + "step": 845 + }, + { + "epoch": 0.6768, + "grad_norm": 0.40237252162483383, + "learning_rate": 5.000000000000002e-05, + "loss": 0.6221, + "step": 846 + }, + { + "epoch": 0.6776, + "grad_norm": 0.3975645475240448, + "learning_rate": 4.977568810302432e-05, + "loss": 0.6496, + "step": 847 + }, + { + "epoch": 0.6784, + "grad_norm": 0.38499345755069325, + "learning_rate": 4.955171365513603e-05, + "loss": 0.682, + "step": 848 + }, + { + "epoch": 0.6792, + "grad_norm": 0.5140050698466838, + "learning_rate": 4.9328078161183464e-05, + "loss": 0.6829, + "step": 849 + }, + { + "epoch": 0.68, + "grad_norm": 0.3725159875858386, + "learning_rate": 4.9104783123737566e-05, + "loss": 0.6639, + "step": 850 + }, + { + "epoch": 0.6808, + "grad_norm": 0.40950955909005243, + "learning_rate": 4.88818300430819e-05, + "loss": 0.6989, + "step": 851 + }, + { + "epoch": 0.6816, + "grad_norm": 0.46031007485696074, + "learning_rate": 4.865922041720239e-05, + "loss": 0.6312, + "step": 852 + }, + { + "epoch": 0.6824, + "grad_norm": 0.3963141432384529, + "learning_rate": 4.843695574177737e-05, + "loss": 0.6873, + "step": 853 + }, + { + "epoch": 0.6832, + "grad_norm": 0.43177629728606337, + "learning_rate": 4.821503751016746e-05, + "loss": 0.6716, + "step": 854 + }, + { + "epoch": 0.684, + "grad_norm": 0.36534440801635726, + "learning_rate": 4.7993467213405706e-05, + "loss": 0.7054, + "step": 855 + }, + { + "epoch": 0.6848, + "grad_norm": 0.36222210330217647, + "learning_rate": 4.777224634018732e-05, + "loss": 0.6385, + "step": 856 + }, + { + "epoch": 0.6856, + "grad_norm": 0.39893542029413387, + "learning_rate": 4.755137637685979e-05, + "loss": 0.7219, + "step": 857 + }, + { + "epoch": 0.6864, + "grad_norm": 0.4118250270518911, + "learning_rate": 4.733085880741301e-05, + "loss": 0.7154, + "step": 858 + }, + { + "epoch": 0.6872, + "grad_norm": 0.3568852499449988, + "learning_rate": 4.7110695113469085e-05, + "loss": 0.6567, + "step": 859 + }, + { + "epoch": 0.688, + "grad_norm": 0.43549006346138774, + "learning_rate": 4.689088677427249e-05, + "loss": 0.7329, + "step": 860 + }, + { + "epoch": 0.6888, + "grad_norm": 0.35674849903182165, + "learning_rate": 4.6671435266680216e-05, + "loss": 0.6475, + "step": 861 + }, + { + "epoch": 0.6896, + "grad_norm": 0.38232166331613066, + "learning_rate": 4.645234206515171e-05, + "loss": 0.6737, + "step": 862 + }, + { + "epoch": 0.6904, + "grad_norm": 0.3909706882327398, + "learning_rate": 4.623360864173893e-05, + "loss": 0.6181, + "step": 863 + }, + { + "epoch": 0.6912, + "grad_norm": 0.4524146819591539, + "learning_rate": 4.6015236466076747e-05, + "loss": 0.7207, + "step": 864 + }, + { + "epoch": 0.692, + "grad_norm": 0.48696787150203236, + "learning_rate": 4.579722700537268e-05, + "loss": 0.6191, + "step": 865 + }, + { + "epoch": 0.6928, + "grad_norm": 0.4794679869123501, + "learning_rate": 4.5579581724397255e-05, + "loss": 0.6811, + "step": 866 + }, + { + "epoch": 0.6936, + "grad_norm": 0.35425548179888433, + "learning_rate": 4.5362302085474254e-05, + "loss": 0.6335, + "step": 867 + }, + { + "epoch": 0.6944, + "grad_norm": 0.4067978349950799, + "learning_rate": 4.514538954847064e-05, + "loss": 0.6612, + "step": 868 + }, + { + "epoch": 0.6952, + "grad_norm": 0.40303274236473946, + "learning_rate": 4.492884557078688e-05, + "loss": 0.6574, + "step": 869 + }, + { + "epoch": 0.696, + "grad_norm": 0.4237551677595413, + "learning_rate": 4.471267160734731e-05, + "loss": 0.5851, + "step": 870 + }, + { + "epoch": 0.6968, + "grad_norm": 0.42875369891186, + "learning_rate": 4.449686911058992e-05, + "loss": 0.676, + "step": 871 + }, + { + "epoch": 0.6976, + "grad_norm": 0.42706425272560744, + "learning_rate": 4.428143953045717e-05, + "loss": 0.7297, + "step": 872 + }, + { + "epoch": 0.6984, + "grad_norm": 0.41576864434001926, + "learning_rate": 4.406638431438576e-05, + "loss": 0.6471, + "step": 873 + }, + { + "epoch": 0.6992, + "grad_norm": 0.418117804387114, + "learning_rate": 4.385170490729712e-05, + "loss": 0.6621, + "step": 874 + }, + { + "epoch": 0.7, + "grad_norm": 0.4150020946129471, + "learning_rate": 4.36374027515878e-05, + "loss": 0.6467, + "step": 875 + }, + { + "epoch": 0.7008, + "grad_norm": 0.3928687191971617, + "learning_rate": 4.342347928711953e-05, + "loss": 0.6527, + "step": 876 + }, + { + "epoch": 0.7016, + "grad_norm": 0.4160100982513934, + "learning_rate": 4.320993595120969e-05, + "loss": 0.6808, + "step": 877 + }, + { + "epoch": 0.7024, + "grad_norm": 0.4794207042809734, + "learning_rate": 4.2996774178621736e-05, + "loss": 0.6599, + "step": 878 + }, + { + "epoch": 0.7032, + "grad_norm": 0.46730184137061487, + "learning_rate": 4.278399540155536e-05, + "loss": 0.6519, + "step": 879 + }, + { + "epoch": 0.704, + "grad_norm": 0.37505905651722327, + "learning_rate": 4.257160104963696e-05, + "loss": 0.6537, + "step": 880 + }, + { + "epoch": 0.7048, + "grad_norm": 0.4232559113614526, + "learning_rate": 4.2359592549910145e-05, + "loss": 0.6731, + "step": 881 + }, + { + "epoch": 0.7056, + "grad_norm": 0.40992758465781476, + "learning_rate": 4.2147971326825966e-05, + "loss": 0.6794, + "step": 882 + }, + { + "epoch": 0.7064, + "grad_norm": 0.4058249457233674, + "learning_rate": 4.193673880223339e-05, + "loss": 0.6786, + "step": 883 + }, + { + "epoch": 0.7072, + "grad_norm": 0.4035278599984284, + "learning_rate": 4.172589639536991e-05, + "loss": 0.6481, + "step": 884 + }, + { + "epoch": 0.708, + "grad_norm": 0.39784788196463317, + "learning_rate": 4.1515445522851784e-05, + "loss": 0.5858, + "step": 885 + }, + { + "epoch": 0.7088, + "grad_norm": 0.4318721099836124, + "learning_rate": 4.130538759866457e-05, + "loss": 0.6577, + "step": 886 + }, + { + "epoch": 0.7096, + "grad_norm": 0.3706105798394763, + "learning_rate": 4.109572403415386e-05, + "loss": 0.6809, + "step": 887 + }, + { + "epoch": 0.7104, + "grad_norm": 0.49204290897437447, + "learning_rate": 4.088645623801534e-05, + "loss": 0.6702, + "step": 888 + }, + { + "epoch": 0.7112, + "grad_norm": 0.5342172203437557, + "learning_rate": 4.0677585616285774e-05, + "loss": 0.6571, + "step": 889 + }, + { + "epoch": 0.712, + "grad_norm": 0.40420590231024983, + "learning_rate": 4.046911357233343e-05, + "loss": 0.6931, + "step": 890 + }, + { + "epoch": 0.7128, + "grad_norm": 0.39025351336075903, + "learning_rate": 4.026104150684835e-05, + "loss": 0.6894, + "step": 891 + }, + { + "epoch": 0.7136, + "grad_norm": 0.42860663505358243, + "learning_rate": 4.00533708178334e-05, + "loss": 0.66, + "step": 892 + }, + { + "epoch": 0.7144, + "grad_norm": 0.40188415325475024, + "learning_rate": 3.984610290059467e-05, + "loss": 0.6811, + "step": 893 + }, + { + "epoch": 0.7152, + "grad_norm": 0.5099159876951098, + "learning_rate": 3.963923914773187e-05, + "loss": 0.6812, + "step": 894 + }, + { + "epoch": 0.716, + "grad_norm": 0.40195803826116805, + "learning_rate": 3.943278094912946e-05, + "loss": 0.6455, + "step": 895 + }, + { + "epoch": 0.7168, + "grad_norm": 0.3770629912663238, + "learning_rate": 3.922672969194686e-05, + "loss": 0.5853, + "step": 896 + }, + { + "epoch": 0.7176, + "grad_norm": 0.3849565435590295, + "learning_rate": 3.902108676060937e-05, + "loss": 0.6395, + "step": 897 + }, + { + "epoch": 0.7184, + "grad_norm": 0.3636221080946348, + "learning_rate": 3.8815853536798904e-05, + "loss": 0.5883, + "step": 898 + }, + { + "epoch": 0.7192, + "grad_norm": 0.4324813351929036, + "learning_rate": 3.861103139944449e-05, + "loss": 0.627, + "step": 899 + }, + { + "epoch": 0.72, + "grad_norm": 0.36972044268012877, + "learning_rate": 3.840662172471315e-05, + "loss": 0.6807, + "step": 900 + }, + { + "epoch": 0.7208, + "grad_norm": 0.4246459396787344, + "learning_rate": 3.820262588600074e-05, + "loss": 0.611, + "step": 901 + }, + { + "epoch": 0.7216, + "grad_norm": 0.4095368430242777, + "learning_rate": 3.79990452539225e-05, + "loss": 0.669, + "step": 902 + }, + { + "epoch": 0.7224, + "grad_norm": 0.450062496563086, + "learning_rate": 3.7795881196303995e-05, + "loss": 0.6818, + "step": 903 + }, + { + "epoch": 0.7232, + "grad_norm": 0.45122798759617516, + "learning_rate": 3.759313507817196e-05, + "loss": 0.661, + "step": 904 + }, + { + "epoch": 0.724, + "grad_norm": 0.4172593406435854, + "learning_rate": 3.739080826174498e-05, + "loss": 0.6722, + "step": 905 + }, + { + "epoch": 0.7248, + "grad_norm": 0.41231268375543334, + "learning_rate": 3.7188902106424416e-05, + "loss": 0.6579, + "step": 906 + }, + { + "epoch": 0.7256, + "grad_norm": 0.36609118253065914, + "learning_rate": 3.6987417968785366e-05, + "loss": 0.6195, + "step": 907 + }, + { + "epoch": 0.7264, + "grad_norm": 0.3944213557612597, + "learning_rate": 3.678635720256737e-05, + "loss": 0.6892, + "step": 908 + }, + { + "epoch": 0.7272, + "grad_norm": 0.5130378278043198, + "learning_rate": 3.658572115866541e-05, + "loss": 0.6606, + "step": 909 + }, + { + "epoch": 0.728, + "grad_norm": 0.41001351868405705, + "learning_rate": 3.638551118512089e-05, + "loss": 0.6359, + "step": 910 + }, + { + "epoch": 0.7288, + "grad_norm": 0.36033257918341494, + "learning_rate": 3.618572862711247e-05, + "loss": 0.6272, + "step": 911 + }, + { + "epoch": 0.7296, + "grad_norm": 0.43236393064685835, + "learning_rate": 3.5986374826947066e-05, + "loss": 0.6306, + "step": 912 + }, + { + "epoch": 0.7304, + "grad_norm": 0.4308461646312396, + "learning_rate": 3.578745112405083e-05, + "loss": 0.7005, + "step": 913 + }, + { + "epoch": 0.7312, + "grad_norm": 0.4317565433068937, + "learning_rate": 3.558895885496023e-05, + "loss": 0.6891, + "step": 914 + }, + { + "epoch": 0.732, + "grad_norm": 0.4522227837448108, + "learning_rate": 3.539089935331294e-05, + "loss": 0.6608, + "step": 915 + }, + { + "epoch": 0.7328, + "grad_norm": 0.44695081016231847, + "learning_rate": 3.519327394983888e-05, + "loss": 0.7083, + "step": 916 + }, + { + "epoch": 0.7336, + "grad_norm": 0.3542022042021266, + "learning_rate": 3.4996083972351515e-05, + "loss": 0.5939, + "step": 917 + }, + { + "epoch": 0.7344, + "grad_norm": 0.3683736484173129, + "learning_rate": 3.479933074573858e-05, + "loss": 0.7423, + "step": 918 + }, + { + "epoch": 0.7352, + "grad_norm": 0.39741628676824414, + "learning_rate": 3.4603015591953395e-05, + "loss": 0.6767, + "step": 919 + }, + { + "epoch": 0.736, + "grad_norm": 0.4034667413845989, + "learning_rate": 3.440713983000601e-05, + "loss": 0.7175, + "step": 920 + }, + { + "epoch": 0.7368, + "grad_norm": 0.38925888471791487, + "learning_rate": 3.421170477595419e-05, + "loss": 0.6653, + "step": 921 + }, + { + "epoch": 0.7376, + "grad_norm": 0.4160330119149825, + "learning_rate": 3.401671174289469e-05, + "loss": 0.6609, + "step": 922 + }, + { + "epoch": 0.7384, + "grad_norm": 0.40576759925288564, + "learning_rate": 3.3822162040954354e-05, + "loss": 0.6533, + "step": 923 + }, + { + "epoch": 0.7392, + "grad_norm": 0.4035520416846824, + "learning_rate": 3.362805697728145e-05, + "loss": 0.6722, + "step": 924 + }, + { + "epoch": 0.74, + "grad_norm": 0.4009366996579774, + "learning_rate": 3.34343978560367e-05, + "loss": 0.6656, + "step": 925 + }, + { + "epoch": 0.7408, + "grad_norm": 0.3973017016693891, + "learning_rate": 3.324118597838464e-05, + "loss": 0.7151, + "step": 926 + }, + { + "epoch": 0.7416, + "grad_norm": 0.4016695475436383, + "learning_rate": 3.3048422642484886e-05, + "loss": 0.645, + "step": 927 + }, + { + "epoch": 0.7424, + "grad_norm": 0.375683111180452, + "learning_rate": 3.285610914348332e-05, + "loss": 0.6926, + "step": 928 + }, + { + "epoch": 0.7432, + "grad_norm": 0.3737568291563691, + "learning_rate": 3.266424677350346e-05, + "loss": 0.6383, + "step": 929 + }, + { + "epoch": 0.744, + "grad_norm": 0.4321402724189271, + "learning_rate": 3.2472836821637744e-05, + "loss": 0.6269, + "step": 930 + }, + { + "epoch": 0.7448, + "grad_norm": 0.654352094348001, + "learning_rate": 3.228188057393895e-05, + "loss": 0.6318, + "step": 931 + }, + { + "epoch": 0.7456, + "grad_norm": 0.3697878748099031, + "learning_rate": 3.209137931341143e-05, + "loss": 0.6839, + "step": 932 + }, + { + "epoch": 0.7464, + "grad_norm": 0.4100180548031645, + "learning_rate": 3.190133432000252e-05, + "loss": 0.5577, + "step": 933 + }, + { + "epoch": 0.7472, + "grad_norm": 0.423162638686225, + "learning_rate": 3.1711746870594086e-05, + "loss": 0.6166, + "step": 934 + }, + { + "epoch": 0.748, + "grad_norm": 0.3626995861072231, + "learning_rate": 3.1522618238993725e-05, + "loss": 0.5919, + "step": 935 + }, + { + "epoch": 0.7488, + "grad_norm": 0.487656427977271, + "learning_rate": 3.1333949695926324e-05, + "loss": 0.6516, + "step": 936 + }, + { + "epoch": 0.7496, + "grad_norm": 0.4626190649805298, + "learning_rate": 3.114574250902558e-05, + "loss": 0.6741, + "step": 937 + }, + { + "epoch": 0.7504, + "grad_norm": 0.4507445934656397, + "learning_rate": 3.0957997942825336e-05, + "loss": 0.6436, + "step": 938 + }, + { + "epoch": 0.7512, + "grad_norm": 0.44973384212634937, + "learning_rate": 3.077071725875116e-05, + "loss": 0.6943, + "step": 939 + }, + { + "epoch": 0.752, + "grad_norm": 0.3542486429343615, + "learning_rate": 3.058390171511196e-05, + "loss": 0.6172, + "step": 940 + }, + { + "epoch": 0.7528, + "grad_norm": 0.483158545603061, + "learning_rate": 3.0397552567091337e-05, + "loss": 0.7324, + "step": 941 + }, + { + "epoch": 0.7536, + "grad_norm": 0.4443601420717475, + "learning_rate": 3.021167106673928e-05, + "loss": 0.6322, + "step": 942 + }, + { + "epoch": 0.7544, + "grad_norm": 0.39815518075615547, + "learning_rate": 3.0026258462963787e-05, + "loss": 0.6544, + "step": 943 + }, + { + "epoch": 0.7552, + "grad_norm": 0.43538466982005025, + "learning_rate": 2.9841316001522347e-05, + "loss": 0.6692, + "step": 944 + }, + { + "epoch": 0.756, + "grad_norm": 0.3987085128336364, + "learning_rate": 2.9656844925013637e-05, + "loss": 0.6337, + "step": 945 + }, + { + "epoch": 0.7568, + "grad_norm": 0.4867568989774928, + "learning_rate": 2.9472846472869298e-05, + "loss": 0.6384, + "step": 946 + }, + { + "epoch": 0.7576, + "grad_norm": 0.435120991086208, + "learning_rate": 2.9289321881345254e-05, + "loss": 0.6758, + "step": 947 + }, + { + "epoch": 0.7584, + "grad_norm": 0.44316051394587086, + "learning_rate": 2.9106272383513835e-05, + "loss": 0.6631, + "step": 948 + }, + { + "epoch": 0.7592, + "grad_norm": 0.45090996634509384, + "learning_rate": 2.8923699209255284e-05, + "loss": 0.6656, + "step": 949 + }, + { + "epoch": 0.76, + "grad_norm": 0.3738596205343436, + "learning_rate": 2.874160358524931e-05, + "loss": 0.6349, + "step": 950 + }, + { + "epoch": 0.7608, + "grad_norm": 0.43872253842178166, + "learning_rate": 2.8559986734967282e-05, + "loss": 0.6305, + "step": 951 + }, + { + "epoch": 0.7616, + "grad_norm": 0.3890988139091703, + "learning_rate": 2.8378849878663628e-05, + "loss": 0.6524, + "step": 952 + }, + { + "epoch": 0.7624, + "grad_norm": 0.43589107655422343, + "learning_rate": 2.819819423336775e-05, + "loss": 0.6641, + "step": 953 + }, + { + "epoch": 0.7632, + "grad_norm": 0.48641179767435216, + "learning_rate": 2.8018021012875994e-05, + "loss": 0.6973, + "step": 954 + }, + { + "epoch": 0.764, + "grad_norm": 0.4701137681040988, + "learning_rate": 2.7838331427743282e-05, + "loss": 0.6249, + "step": 955 + }, + { + "epoch": 0.7648, + "grad_norm": 0.41946748101231746, + "learning_rate": 2.7659126685275027e-05, + "loss": 0.6626, + "step": 956 + }, + { + "epoch": 0.7656, + "grad_norm": 0.4308555349400613, + "learning_rate": 2.7480407989519198e-05, + "loss": 0.6501, + "step": 957 + }, + { + "epoch": 0.7664, + "grad_norm": 0.3868020833047252, + "learning_rate": 2.7302176541257986e-05, + "loss": 0.6524, + "step": 958 + }, + { + "epoch": 0.7672, + "grad_norm": 0.4180293930043457, + "learning_rate": 2.712443353799984e-05, + "loss": 0.6858, + "step": 959 + }, + { + "epoch": 0.768, + "grad_norm": 0.3989146936164297, + "learning_rate": 2.6947180173971508e-05, + "loss": 0.6585, + "step": 960 + }, + { + "epoch": 0.7688, + "grad_norm": 0.409696477744784, + "learning_rate": 2.677041764010988e-05, + "loss": 0.6544, + "step": 961 + }, + { + "epoch": 0.7696, + "grad_norm": 0.39017189335418073, + "learning_rate": 2.659414712405398e-05, + "loss": 0.6374, + "step": 962 + }, + { + "epoch": 0.7704, + "grad_norm": 0.40924851230332643, + "learning_rate": 2.6418369810137188e-05, + "loss": 0.6235, + "step": 963 + }, + { + "epoch": 0.7712, + "grad_norm": 0.41369897553900026, + "learning_rate": 2.6243086879379e-05, + "loss": 0.6297, + "step": 964 + }, + { + "epoch": 0.772, + "grad_norm": 0.35413877056139925, + "learning_rate": 2.6068299509477266e-05, + "loss": 0.7408, + "step": 965 + }, + { + "epoch": 0.7728, + "grad_norm": 0.4200042956105686, + "learning_rate": 2.5894008874800325e-05, + "loss": 0.7043, + "step": 966 + }, + { + "epoch": 0.7736, + "grad_norm": 0.37583423368129587, + "learning_rate": 2.5720216146378917e-05, + "loss": 0.575, + "step": 967 + }, + { + "epoch": 0.7744, + "grad_norm": 0.44458425127920537, + "learning_rate": 2.5546922491898495e-05, + "loss": 0.6365, + "step": 968 + }, + { + "epoch": 0.7752, + "grad_norm": 0.36979499734209953, + "learning_rate": 2.5374129075691265e-05, + "loss": 0.6411, + "step": 969 + }, + { + "epoch": 0.776, + "grad_norm": 0.36321911943415197, + "learning_rate": 2.5201837058728505e-05, + "loss": 0.5964, + "step": 970 + }, + { + "epoch": 0.7768, + "grad_norm": 0.4190613289113207, + "learning_rate": 2.503004759861258e-05, + "loss": 0.6236, + "step": 971 + }, + { + "epoch": 0.7776, + "grad_norm": 0.41415344820995503, + "learning_rate": 2.485876184956928e-05, + "loss": 0.6581, + "step": 972 + }, + { + "epoch": 0.7784, + "grad_norm": 0.4235017481418864, + "learning_rate": 2.4687980962440072e-05, + "loss": 0.6457, + "step": 973 + }, + { + "epoch": 0.7792, + "grad_norm": 0.5429098905663646, + "learning_rate": 2.451770608467432e-05, + "loss": 0.6682, + "step": 974 + }, + { + "epoch": 0.78, + "grad_norm": 0.37140274130759376, + "learning_rate": 2.4347938360321566e-05, + "loss": 0.6323, + "step": 975 + }, + { + "epoch": 0.7808, + "grad_norm": 0.3995966583247825, + "learning_rate": 2.417867893002387e-05, + "loss": 0.6536, + "step": 976 + }, + { + "epoch": 0.7816, + "grad_norm": 0.3845667540435273, + "learning_rate": 2.400992893100822e-05, + "loss": 0.6479, + "step": 977 + }, + { + "epoch": 0.7824, + "grad_norm": 0.503563493811046, + "learning_rate": 2.3841689497078746e-05, + "loss": 0.7002, + "step": 978 + }, + { + "epoch": 0.7832, + "grad_norm": 0.48271921958958586, + "learning_rate": 2.3673961758609152e-05, + "loss": 0.6357, + "step": 979 + }, + { + "epoch": 0.784, + "grad_norm": 0.4063399200880828, + "learning_rate": 2.3506746842535242e-05, + "loss": 0.6684, + "step": 980 + }, + { + "epoch": 0.7848, + "grad_norm": 0.4059102196974217, + "learning_rate": 2.334004587234717e-05, + "loss": 0.6512, + "step": 981 + }, + { + "epoch": 0.7856, + "grad_norm": 0.37060607282625513, + "learning_rate": 2.3173859968081944e-05, + "loss": 0.6346, + "step": 982 + }, + { + "epoch": 0.7864, + "grad_norm": 0.3571634773263615, + "learning_rate": 2.300819024631603e-05, + "loss": 0.6243, + "step": 983 + }, + { + "epoch": 0.7872, + "grad_norm": 0.4632222917295501, + "learning_rate": 2.2843037820157675e-05, + "loss": 0.6473, + "step": 984 + }, + { + "epoch": 0.788, + "grad_norm": 0.38257054846317295, + "learning_rate": 2.26784037992395e-05, + "loss": 0.677, + "step": 985 + }, + { + "epoch": 0.7888, + "grad_norm": 0.4856941237215944, + "learning_rate": 2.251428928971102e-05, + "loss": 0.6743, + "step": 986 + }, + { + "epoch": 0.7896, + "grad_norm": 0.38315673385655835, + "learning_rate": 2.2350695394231345e-05, + "loss": 0.5963, + "step": 987 + }, + { + "epoch": 0.7904, + "grad_norm": 0.3710499452465796, + "learning_rate": 2.2187623211961562e-05, + "loss": 0.616, + "step": 988 + }, + { + "epoch": 0.7912, + "grad_norm": 0.40594033576649546, + "learning_rate": 2.2025073838557454e-05, + "loss": 0.6515, + "step": 989 + }, + { + "epoch": 0.792, + "grad_norm": 0.39039736546857506, + "learning_rate": 2.1863048366162208e-05, + "loss": 0.6275, + "step": 990 + }, + { + "epoch": 0.7928, + "grad_norm": 0.3598234016005934, + "learning_rate": 2.1701547883398922e-05, + "loss": 0.5968, + "step": 991 + }, + { + "epoch": 0.7936, + "grad_norm": 0.3397120750449489, + "learning_rate": 2.1540573475363402e-05, + "loss": 0.6329, + "step": 992 + }, + { + "epoch": 0.7944, + "grad_norm": 0.3940077109393743, + "learning_rate": 2.138012622361689e-05, + "loss": 0.6749, + "step": 993 + }, + { + "epoch": 0.7952, + "grad_norm": 0.3887730184910833, + "learning_rate": 2.1220207206178688e-05, + "loss": 0.6352, + "step": 994 + }, + { + "epoch": 0.796, + "grad_norm": 0.4228058902082928, + "learning_rate": 2.106081749751897e-05, + "loss": 0.6594, + "step": 995 + }, + { + "epoch": 0.7968, + "grad_norm": 0.3505931888428044, + "learning_rate": 2.0901958168551638e-05, + "loss": 0.6448, + "step": 996 + }, + { + "epoch": 0.7976, + "grad_norm": 0.46031606701165484, + "learning_rate": 2.0743630286627002e-05, + "loss": 0.7063, + "step": 997 + }, + { + "epoch": 0.7984, + "grad_norm": 0.4807742352290222, + "learning_rate": 2.058583491552465e-05, + "loss": 0.6369, + "step": 998 + }, + { + "epoch": 0.7992, + "grad_norm": 0.4122791741476168, + "learning_rate": 2.0428573115446392e-05, + "loss": 0.6459, + "step": 999 + }, + { + "epoch": 0.8, + "grad_norm": 0.3689588926242668, + "learning_rate": 2.027184594300898e-05, + "loss": 0.6087, + "step": 1000 + }, + { + "epoch": 0.8008, + "grad_norm": 0.3883132360963373, + "learning_rate": 2.011565445123711e-05, + "loss": 0.636, + "step": 1001 + }, + { + "epoch": 0.8016, + "grad_norm": 0.42706209168848325, + "learning_rate": 1.995999968955641e-05, + "loss": 0.6331, + "step": 1002 + }, + { + "epoch": 0.8024, + "grad_norm": 0.3781175481002125, + "learning_rate": 1.980488270378612e-05, + "loss": 0.6368, + "step": 1003 + }, + { + "epoch": 0.8032, + "grad_norm": 0.5456382242501266, + "learning_rate": 1.9650304536132426e-05, + "loss": 0.6248, + "step": 1004 + }, + { + "epoch": 0.804, + "grad_norm": 0.4037859656966116, + "learning_rate": 1.9496266225181248e-05, + "loss": 0.6351, + "step": 1005 + }, + { + "epoch": 0.8048, + "grad_norm": 0.6203428892266306, + "learning_rate": 1.9342768805891178e-05, + "loss": 0.8087, + "step": 1006 + }, + { + "epoch": 0.8056, + "grad_norm": 0.42159889631433856, + "learning_rate": 1.918981330958678e-05, + "loss": 0.6825, + "step": 1007 + }, + { + "epoch": 0.8064, + "grad_norm": 0.4586925573580974, + "learning_rate": 1.903740076395151e-05, + "loss": 0.6819, + "step": 1008 + }, + { + "epoch": 0.8072, + "grad_norm": 0.3816846708998703, + "learning_rate": 1.8885532193020704e-05, + "loss": 0.6365, + "step": 1009 + }, + { + "epoch": 0.808, + "grad_norm": 0.41334639135666074, + "learning_rate": 1.8734208617174988e-05, + "loss": 0.7254, + "step": 1010 + }, + { + "epoch": 0.8088, + "grad_norm": 0.4045931207610921, + "learning_rate": 1.8583431053133127e-05, + "loss": 0.6933, + "step": 1011 + }, + { + "epoch": 0.8096, + "grad_norm": 0.7938660736245234, + "learning_rate": 1.8433200513945337e-05, + "loss": 0.6161, + "step": 1012 + }, + { + "epoch": 0.8104, + "grad_norm": 0.3798388450210105, + "learning_rate": 1.8283518008986567e-05, + "loss": 0.6537, + "step": 1013 + }, + { + "epoch": 0.8112, + "grad_norm": 0.398486259721519, + "learning_rate": 1.8134384543949478e-05, + "loss": 0.6353, + "step": 1014 + }, + { + "epoch": 0.812, + "grad_norm": 0.3924844708387776, + "learning_rate": 1.7985801120837865e-05, + "loss": 0.6127, + "step": 1015 + }, + { + "epoch": 0.8128, + "grad_norm": 0.3704378021683162, + "learning_rate": 1.783776873795994e-05, + "loss": 0.641, + "step": 1016 + }, + { + "epoch": 0.8136, + "grad_norm": 0.43102170150852875, + "learning_rate": 1.7690288389921493e-05, + "loss": 0.6408, + "step": 1017 + }, + { + "epoch": 0.8144, + "grad_norm": 0.3974103484818911, + "learning_rate": 1.754336106761927e-05, + "loss": 0.6454, + "step": 1018 + }, + { + "epoch": 0.8152, + "grad_norm": 0.4046789441586394, + "learning_rate": 1.739698775823442e-05, + "loss": 0.6832, + "step": 1019 + }, + { + "epoch": 0.816, + "grad_norm": 0.5029288780877051, + "learning_rate": 1.7251169445225657e-05, + "loss": 0.6854, + "step": 1020 + }, + { + "epoch": 0.8168, + "grad_norm": 0.4306683847026681, + "learning_rate": 1.7105907108322816e-05, + "loss": 0.6454, + "step": 1021 + }, + { + "epoch": 0.8176, + "grad_norm": 0.36919468007704176, + "learning_rate": 1.696120172352025e-05, + "loss": 0.6881, + "step": 1022 + }, + { + "epoch": 0.8184, + "grad_norm": 0.4696102188764594, + "learning_rate": 1.6817054263070174e-05, + "loss": 0.6854, + "step": 1023 + }, + { + "epoch": 0.8192, + "grad_norm": 0.43331871809502737, + "learning_rate": 1.6673465695476232e-05, + "loss": 0.7095, + "step": 1024 + }, + { + "epoch": 0.82, + "grad_norm": 0.415549264948765, + "learning_rate": 1.6530436985486996e-05, + "loss": 0.6591, + "step": 1025 + }, + { + "epoch": 0.8208, + "grad_norm": 0.3259761160426885, + "learning_rate": 1.6387969094089316e-05, + "loss": 0.5486, + "step": 1026 + }, + { + "epoch": 0.8216, + "grad_norm": 0.4730303904496809, + "learning_rate": 1.6246062978502164e-05, + "loss": 0.7071, + "step": 1027 + }, + { + "epoch": 0.8224, + "grad_norm": 0.39164646897654865, + "learning_rate": 1.6104719592169902e-05, + "loss": 0.6343, + "step": 1028 + }, + { + "epoch": 0.8232, + "grad_norm": 0.401940831835664, + "learning_rate": 1.5963939884756042e-05, + "loss": 0.6552, + "step": 1029 + }, + { + "epoch": 0.824, + "grad_norm": 0.4131804273551023, + "learning_rate": 1.5823724802136865e-05, + "loss": 0.7025, + "step": 1030 + }, + { + "epoch": 0.8248, + "grad_norm": 0.36578940848087904, + "learning_rate": 1.5684075286394985e-05, + "loss": 0.6048, + "step": 1031 + }, + { + "epoch": 0.8256, + "grad_norm": 0.43494772273468696, + "learning_rate": 1.5544992275813053e-05, + "loss": 0.6574, + "step": 1032 + }, + { + "epoch": 0.8264, + "grad_norm": 0.3413079186978449, + "learning_rate": 1.5406476704867524e-05, + "loss": 0.5834, + "step": 1033 + }, + { + "epoch": 0.8272, + "grad_norm": 0.3702228926158319, + "learning_rate": 1.526852950422226e-05, + "loss": 0.6724, + "step": 1034 + }, + { + "epoch": 0.828, + "grad_norm": 0.35894623325969405, + "learning_rate": 1.5131151600722337e-05, + "loss": 0.6094, + "step": 1035 + }, + { + "epoch": 0.8288, + "grad_norm": 0.3832707585161335, + "learning_rate": 1.4994343917387854e-05, + "loss": 0.6574, + "step": 1036 + }, + { + "epoch": 0.8296, + "grad_norm": 0.4848049922814122, + "learning_rate": 1.485810737340767e-05, + "loss": 0.6712, + "step": 1037 + }, + { + "epoch": 0.8304, + "grad_norm": 0.4371107376549497, + "learning_rate": 1.4722442884133214e-05, + "loss": 0.6892, + "step": 1038 + }, + { + "epoch": 0.8312, + "grad_norm": 0.4247444084248974, + "learning_rate": 1.4587351361072454e-05, + "loss": 0.6255, + "step": 1039 + }, + { + "epoch": 0.832, + "grad_norm": 0.41743798972209106, + "learning_rate": 1.4452833711883628e-05, + "loss": 0.6532, + "step": 1040 + }, + { + "epoch": 0.8328, + "grad_norm": 0.43356762665383475, + "learning_rate": 1.4318890840369182e-05, + "loss": 0.6294, + "step": 1041 + }, + { + "epoch": 0.8336, + "grad_norm": 0.3745596848118526, + "learning_rate": 1.4185523646469822e-05, + "loss": 0.609, + "step": 1042 + }, + { + "epoch": 0.8344, + "grad_norm": 0.5161669674957426, + "learning_rate": 1.4052733026258281e-05, + "loss": 0.6717, + "step": 1043 + }, + { + "epoch": 0.8352, + "grad_norm": 0.40106947707116897, + "learning_rate": 1.3920519871933424e-05, + "loss": 0.6496, + "step": 1044 + }, + { + "epoch": 0.836, + "grad_norm": 0.37874436034022985, + "learning_rate": 1.3788885071814172e-05, + "loss": 0.6408, + "step": 1045 + }, + { + "epoch": 0.8368, + "grad_norm": 0.40967781910183637, + "learning_rate": 1.3657829510333654e-05, + "loss": 0.6272, + "step": 1046 + }, + { + "epoch": 0.8376, + "grad_norm": 0.4502051746466981, + "learning_rate": 1.3527354068033139e-05, + "loss": 0.6752, + "step": 1047 + }, + { + "epoch": 0.8384, + "grad_norm": 0.41409654078712216, + "learning_rate": 1.339745962155613e-05, + "loss": 0.6559, + "step": 1048 + }, + { + "epoch": 0.8392, + "grad_norm": 0.4024007134875258, + "learning_rate": 1.326814704364262e-05, + "loss": 0.6149, + "step": 1049 + }, + { + "epoch": 0.84, + "grad_norm": 0.42202386062247166, + "learning_rate": 1.3139417203123027e-05, + "loss": 0.6933, + "step": 1050 + }, + { + "epoch": 0.8408, + "grad_norm": 0.34140954461432244, + "learning_rate": 1.3011270964912459e-05, + "loss": 0.6667, + "step": 1051 + }, + { + "epoch": 0.8416, + "grad_norm": 0.35743381773303684, + "learning_rate": 1.2883709190004955e-05, + "loss": 0.621, + "step": 1052 + }, + { + "epoch": 0.8424, + "grad_norm": 0.38721575694378707, + "learning_rate": 1.275673273546758e-05, + "loss": 0.603, + "step": 1053 + }, + { + "epoch": 0.8432, + "grad_norm": 0.4745584619884366, + "learning_rate": 1.263034245443473e-05, + "loss": 0.6752, + "step": 1054 + }, + { + "epoch": 0.844, + "grad_norm": 0.39580118702239475, + "learning_rate": 1.2504539196102439e-05, + "loss": 0.6712, + "step": 1055 + }, + { + "epoch": 0.8448, + "grad_norm": 0.41660875623574883, + "learning_rate": 1.2379323805722576e-05, + "loss": 0.6059, + "step": 1056 + }, + { + "epoch": 0.8456, + "grad_norm": 0.5109707410202864, + "learning_rate": 1.2254697124597237e-05, + "loss": 0.7445, + "step": 1057 + }, + { + "epoch": 0.8464, + "grad_norm": 0.4272723584555947, + "learning_rate": 1.2130659990073146e-05, + "loss": 0.6213, + "step": 1058 + }, + { + "epoch": 0.8472, + "grad_norm": 0.39866401576252947, + "learning_rate": 1.2007213235535786e-05, + "loss": 0.6749, + "step": 1059 + }, + { + "epoch": 0.848, + "grad_norm": 0.4140133323335469, + "learning_rate": 1.1884357690404158e-05, + "loss": 0.6834, + "step": 1060 + }, + { + "epoch": 0.8488, + "grad_norm": 0.4310592151251458, + "learning_rate": 1.176209418012495e-05, + "loss": 0.6687, + "step": 1061 + }, + { + "epoch": 0.8496, + "grad_norm": 0.39700840989987757, + "learning_rate": 1.1640423526166988e-05, + "loss": 0.6731, + "step": 1062 + }, + { + "epoch": 0.8504, + "grad_norm": 0.41815502073714694, + "learning_rate": 1.1519346546015907e-05, + "loss": 0.6177, + "step": 1063 + }, + { + "epoch": 0.8512, + "grad_norm": 0.46605250253452, + "learning_rate": 1.1398864053168534e-05, + "loss": 0.5856, + "step": 1064 + }, + { + "epoch": 0.852, + "grad_norm": 0.43347330875978707, + "learning_rate": 1.1278976857127311e-05, + "loss": 0.6567, + "step": 1065 + }, + { + "epoch": 0.8528, + "grad_norm": 0.4088890928128398, + "learning_rate": 1.1159685763395111e-05, + "loss": 0.7008, + "step": 1066 + }, + { + "epoch": 0.8536, + "grad_norm": 0.3982232717283411, + "learning_rate": 1.1040991573469629e-05, + "loss": 0.6431, + "step": 1067 + }, + { + "epoch": 0.8544, + "grad_norm": 0.3379945503748024, + "learning_rate": 1.0922895084838037e-05, + "loss": 0.5687, + "step": 1068 + }, + { + "epoch": 0.8552, + "grad_norm": 0.6047215426337988, + "learning_rate": 1.0805397090971737e-05, + "loss": 0.6368, + "step": 1069 + }, + { + "epoch": 0.856, + "grad_norm": 0.38961219633009386, + "learning_rate": 1.0688498381320855e-05, + "loss": 0.6473, + "step": 1070 + }, + { + "epoch": 0.8568, + "grad_norm": 0.45536044577956303, + "learning_rate": 1.057219974130903e-05, + "loss": 0.6487, + "step": 1071 + }, + { + "epoch": 0.8576, + "grad_norm": 0.39258615219014364, + "learning_rate": 1.045650195232819e-05, + "loss": 0.6526, + "step": 1072 + }, + { + "epoch": 0.8584, + "grad_norm": 0.42327691151854674, + "learning_rate": 1.0341405791733183e-05, + "loss": 0.6143, + "step": 1073 + }, + { + "epoch": 0.8592, + "grad_norm": 0.4330882506423897, + "learning_rate": 1.0226912032836611e-05, + "loss": 0.6514, + "step": 1074 + }, + { + "epoch": 0.86, + "grad_norm": 0.42339403515666846, + "learning_rate": 1.0113021444903726e-05, + "loss": 0.6148, + "step": 1075 + }, + { + "epoch": 0.8608, + "grad_norm": 0.3982875661057225, + "learning_rate": 9.999734793146998e-06, + "loss": 0.6447, + "step": 1076 + }, + { + "epoch": 0.8616, + "grad_norm": 0.4259757936497943, + "learning_rate": 9.887052838721322e-06, + "loss": 0.611, + "step": 1077 + }, + { + "epoch": 0.8624, + "grad_norm": 0.37383392180131153, + "learning_rate": 9.774976338718677e-06, + "loss": 0.6141, + "step": 1078 + }, + { + "epoch": 0.8632, + "grad_norm": 0.4245333502093859, + "learning_rate": 9.663506046162985e-06, + "loss": 0.6965, + "step": 1079 + }, + { + "epoch": 0.864, + "grad_norm": 0.5487012845068449, + "learning_rate": 9.552642710005299e-06, + "loss": 0.6283, + "step": 1080 + }, + { + "epoch": 0.8648, + "grad_norm": 0.5532168865592939, + "learning_rate": 9.44238707511862e-06, + "loss": 0.6066, + "step": 1081 + }, + { + "epoch": 0.8656, + "grad_norm": 0.39103516028749713, + "learning_rate": 9.332739882292752e-06, + "loss": 0.6187, + "step": 1082 + }, + { + "epoch": 0.8664, + "grad_norm": 0.44356388628846, + "learning_rate": 9.22370186822965e-06, + "loss": 0.6503, + "step": 1083 + }, + { + "epoch": 0.8672, + "grad_norm": 0.48139483872715877, + "learning_rate": 9.115273765538202e-06, + "loss": 0.5831, + "step": 1084 + }, + { + "epoch": 0.868, + "grad_norm": 0.38181658336917085, + "learning_rate": 9.0074563027294e-06, + "loss": 0.658, + "step": 1085 + }, + { + "epoch": 0.8688, + "grad_norm": 0.4386022855137955, + "learning_rate": 8.900250204211514e-06, + "loss": 0.6995, + "step": 1086 + }, + { + "epoch": 0.8696, + "grad_norm": 0.3901008769152281, + "learning_rate": 8.79365619028507e-06, + "loss": 0.6704, + "step": 1087 + }, + { + "epoch": 0.8704, + "grad_norm": 0.4572589949870747, + "learning_rate": 8.687674977138116e-06, + "loss": 0.6376, + "step": 1088 + }, + { + "epoch": 0.8712, + "grad_norm": 0.4531364663828187, + "learning_rate": 8.582307276841462e-06, + "loss": 0.6841, + "step": 1089 + }, + { + "epoch": 0.872, + "grad_norm": 0.6513861861854793, + "learning_rate": 8.47755379734373e-06, + "loss": 0.6916, + "step": 1090 + }, + { + "epoch": 0.8728, + "grad_norm": 0.4595477594709598, + "learning_rate": 8.37341524246672e-06, + "loss": 0.6351, + "step": 1091 + }, + { + "epoch": 0.8736, + "grad_norm": 0.3872826193208302, + "learning_rate": 8.269892311900696e-06, + "loss": 0.66, + "step": 1092 + }, + { + "epoch": 0.8744, + "grad_norm": 0.3634488772721573, + "learning_rate": 8.166985701199582e-06, + "loss": 0.6141, + "step": 1093 + }, + { + "epoch": 0.8752, + "grad_norm": 0.4031911965288131, + "learning_rate": 8.064696101776358e-06, + "loss": 0.6401, + "step": 1094 + }, + { + "epoch": 0.876, + "grad_norm": 0.4016344899493216, + "learning_rate": 7.963024200898462e-06, + "loss": 0.6671, + "step": 1095 + }, + { + "epoch": 0.8768, + "grad_norm": 0.4475939413873218, + "learning_rate": 7.861970681683051e-06, + "loss": 0.6833, + "step": 1096 + }, + { + "epoch": 0.8776, + "grad_norm": 0.37003724677834593, + "learning_rate": 7.761536223092458e-06, + "loss": 0.5828, + "step": 1097 + }, + { + "epoch": 0.8784, + "grad_norm": 0.42351050004449026, + "learning_rate": 7.661721499929753e-06, + "loss": 0.6609, + "step": 1098 + }, + { + "epoch": 0.8792, + "grad_norm": 0.41176807685505945, + "learning_rate": 7.562527182833978e-06, + "loss": 0.6423, + "step": 1099 + }, + { + "epoch": 0.88, + "grad_norm": 0.44785618534091903, + "learning_rate": 7.463953938275858e-06, + "loss": 0.7489, + "step": 1100 + }, + { + "epoch": 0.8808, + "grad_norm": 0.36977653560254525, + "learning_rate": 7.366002428553153e-06, + "loss": 0.5788, + "step": 1101 + }, + { + "epoch": 0.8816, + "grad_norm": 0.37308227584401626, + "learning_rate": 7.2686733117863784e-06, + "loss": 0.6358, + "step": 1102 + }, + { + "epoch": 0.8824, + "grad_norm": 0.3550084968253862, + "learning_rate": 7.171967241914224e-06, + "loss": 0.6017, + "step": 1103 + }, + { + "epoch": 0.8832, + "grad_norm": 0.41228005753397967, + "learning_rate": 7.07588486868922e-06, + "loss": 0.6555, + "step": 1104 + }, + { + "epoch": 0.884, + "grad_norm": 0.4114956714467532, + "learning_rate": 6.980426837673437e-06, + "loss": 0.6754, + "step": 1105 + }, + { + "epoch": 0.8848, + "grad_norm": 0.3619490877691349, + "learning_rate": 6.8855937902340576e-06, + "loss": 0.5776, + "step": 1106 + }, + { + "epoch": 0.8856, + "grad_norm": 0.5093977449239254, + "learning_rate": 6.791386363539065e-06, + "loss": 0.6357, + "step": 1107 + }, + { + "epoch": 0.8864, + "grad_norm": 0.41458795501113116, + "learning_rate": 6.6978051905530855e-06, + "loss": 0.6342, + "step": 1108 + }, + { + "epoch": 0.8872, + "grad_norm": 0.44181071205391537, + "learning_rate": 6.604850900032955e-06, + "loss": 0.7112, + "step": 1109 + }, + { + "epoch": 0.888, + "grad_norm": 0.44258110359726704, + "learning_rate": 6.512524116523633e-06, + "loss": 0.6001, + "step": 1110 + }, + { + "epoch": 0.8888, + "grad_norm": 0.4262379828710342, + "learning_rate": 6.420825460353974e-06, + "loss": 0.6599, + "step": 1111 + }, + { + "epoch": 0.8896, + "grad_norm": 0.3615813743871014, + "learning_rate": 6.329755547632499e-06, + "loss": 0.6202, + "step": 1112 + }, + { + "epoch": 0.8904, + "grad_norm": 0.4118062024955257, + "learning_rate": 6.239314990243339e-06, + "loss": 0.641, + "step": 1113 + }, + { + "epoch": 0.8912, + "grad_norm": 0.3724427948084534, + "learning_rate": 6.149504395842087e-06, + "loss": 0.6596, + "step": 1114 + }, + { + "epoch": 0.892, + "grad_norm": 0.4357527971934326, + "learning_rate": 6.0603243678516995e-06, + "loss": 0.658, + "step": 1115 + }, + { + "epoch": 0.8928, + "grad_norm": 0.38569246848606425, + "learning_rate": 5.971775505458444e-06, + "loss": 0.5852, + "step": 1116 + }, + { + "epoch": 0.8936, + "grad_norm": 0.42228534505779775, + "learning_rate": 5.883858403607967e-06, + "loss": 0.6388, + "step": 1117 + }, + { + "epoch": 0.8944, + "grad_norm": 0.36227443102893414, + "learning_rate": 5.7965736530010916e-06, + "loss": 0.6094, + "step": 1118 + }, + { + "epoch": 0.8952, + "grad_norm": 0.4331382548215056, + "learning_rate": 5.7099218400900716e-06, + "loss": 0.6084, + "step": 1119 + }, + { + "epoch": 0.896, + "grad_norm": 0.7345493278756747, + "learning_rate": 5.623903547074549e-06, + "loss": 0.6157, + "step": 1120 + }, + { + "epoch": 0.8968, + "grad_norm": 0.38239616006000005, + "learning_rate": 5.538519351897575e-06, + "loss": 0.6536, + "step": 1121 + }, + { + "epoch": 0.8976, + "grad_norm": 0.417734215760906, + "learning_rate": 5.453769828241872e-06, + "loss": 0.6337, + "step": 1122 + }, + { + "epoch": 0.8984, + "grad_norm": 0.38428949678475666, + "learning_rate": 5.369655545525909e-06, + "loss": 0.6168, + "step": 1123 + }, + { + "epoch": 0.8992, + "grad_norm": 0.4017311026883857, + "learning_rate": 5.286177068899989e-06, + "loss": 0.6786, + "step": 1124 + }, + { + "epoch": 0.9, + "grad_norm": 0.39124216968664244, + "learning_rate": 5.2033349592426335e-06, + "loss": 0.6254, + "step": 1125 + }, + { + "epoch": 0.9008, + "grad_norm": 0.44134178764982895, + "learning_rate": 5.121129773156663e-06, + "loss": 0.7357, + "step": 1126 + }, + { + "epoch": 0.9016, + "grad_norm": 0.44013606406112815, + "learning_rate": 5.039562062965508e-06, + "loss": 0.6511, + "step": 1127 + }, + { + "epoch": 0.9024, + "grad_norm": 0.4143382812512047, + "learning_rate": 4.95863237670956e-06, + "loss": 0.6132, + "step": 1128 + }, + { + "epoch": 0.9032, + "grad_norm": 0.37524037136107635, + "learning_rate": 4.87834125814235e-06, + "loss": 0.6601, + "step": 1129 + }, + { + "epoch": 0.904, + "grad_norm": 0.3676822009982106, + "learning_rate": 4.798689246727006e-06, + "loss": 0.5757, + "step": 1130 + }, + { + "epoch": 0.9048, + "grad_norm": 0.4578191014129421, + "learning_rate": 4.719676877632639e-06, + "loss": 0.6736, + "step": 1131 + }, + { + "epoch": 0.9056, + "grad_norm": 0.4277201145139041, + "learning_rate": 4.641304681730641e-06, + "loss": 0.6259, + "step": 1132 + }, + { + "epoch": 0.9064, + "grad_norm": 0.3953872212086165, + "learning_rate": 4.563573185591219e-06, + "loss": 0.6443, + "step": 1133 + }, + { + "epoch": 0.9072, + "grad_norm": 0.35997835055017297, + "learning_rate": 4.486482911479839e-06, + "loss": 0.6642, + "step": 1134 + }, + { + "epoch": 0.908, + "grad_norm": 0.4444962679689668, + "learning_rate": 4.4100343773536225e-06, + "loss": 0.6286, + "step": 1135 + }, + { + "epoch": 0.9088, + "grad_norm": 0.4615731175022121, + "learning_rate": 4.3342280968580285e-06, + "loss": 0.6951, + "step": 1136 + }, + { + "epoch": 0.9096, + "grad_norm": 0.4298474182913664, + "learning_rate": 4.259064579323302e-06, + "loss": 0.697, + "step": 1137 + }, + { + "epoch": 0.9104, + "grad_norm": 0.4581268475559863, + "learning_rate": 4.184544329761009e-06, + "loss": 0.6712, + "step": 1138 + }, + { + "epoch": 0.9112, + "grad_norm": 0.39428803622226477, + "learning_rate": 4.1106678488607495e-06, + "loss": 0.6479, + "step": 1139 + }, + { + "epoch": 0.912, + "grad_norm": 0.4222497647461776, + "learning_rate": 4.037435632986786e-06, + "loss": 0.6695, + "step": 1140 + }, + { + "epoch": 0.9128, + "grad_norm": 0.36614219046640323, + "learning_rate": 3.964848174174541e-06, + "loss": 0.6076, + "step": 1141 + }, + { + "epoch": 0.9136, + "grad_norm": 0.41210763713885834, + "learning_rate": 3.892905960127546e-06, + "loss": 0.6477, + "step": 1142 + }, + { + "epoch": 0.9144, + "grad_norm": 0.4537911864094117, + "learning_rate": 3.821609474213983e-06, + "loss": 0.6246, + "step": 1143 + }, + { + "epoch": 0.9152, + "grad_norm": 0.39606692867099963, + "learning_rate": 3.750959195463466e-06, + "loss": 0.6843, + "step": 1144 + }, + { + "epoch": 0.916, + "grad_norm": 0.39611050158265976, + "learning_rate": 3.6809555985639068e-06, + "loss": 0.6274, + "step": 1145 + }, + { + "epoch": 0.9168, + "grad_norm": 0.43727607037447086, + "learning_rate": 3.611599153858214e-06, + "loss": 0.6375, + "step": 1146 + }, + { + "epoch": 0.9176, + "grad_norm": 0.4101542675685551, + "learning_rate": 3.5428903273411863e-06, + "loss": 0.5848, + "step": 1147 + }, + { + "epoch": 0.9184, + "grad_norm": 0.4193003486528778, + "learning_rate": 3.4748295806564356e-06, + "loss": 0.6676, + "step": 1148 + }, + { + "epoch": 0.9192, + "grad_norm": 0.4134121549238326, + "learning_rate": 3.40741737109318e-06, + "loss": 0.7118, + "step": 1149 + }, + { + "epoch": 0.92, + "grad_norm": 0.4212007528436988, + "learning_rate": 3.3406541515832003e-06, + "loss": 0.6537, + "step": 1150 + }, + { + "epoch": 0.9208, + "grad_norm": 0.42379361189297626, + "learning_rate": 3.2745403706978872e-06, + "loss": 0.6256, + "step": 1151 + }, + { + "epoch": 0.9216, + "grad_norm": 0.5180894594898717, + "learning_rate": 3.209076472645112e-06, + "loss": 0.7122, + "step": 1152 + }, + { + "epoch": 0.9224, + "grad_norm": 0.47307211912432073, + "learning_rate": 3.1442628972662704e-06, + "loss": 0.6683, + "step": 1153 + }, + { + "epoch": 0.9232, + "grad_norm": 0.43413757223825766, + "learning_rate": 3.0801000800333877e-06, + "loss": 0.6657, + "step": 1154 + }, + { + "epoch": 0.924, + "grad_norm": 0.3895483494241512, + "learning_rate": 3.0165884520461316e-06, + "loss": 0.6266, + "step": 1155 + }, + { + "epoch": 0.9248, + "grad_norm": 0.3829929019680482, + "learning_rate": 2.9537284400289355e-06, + "loss": 0.6368, + "step": 1156 + }, + { + "epoch": 0.9256, + "grad_norm": 0.36524539885154184, + "learning_rate": 2.8915204663281013e-06, + "loss": 0.6319, + "step": 1157 + }, + { + "epoch": 0.9264, + "grad_norm": 0.40304824129876693, + "learning_rate": 2.8299649489090475e-06, + "loss": 0.7264, + "step": 1158 + }, + { + "epoch": 0.9272, + "grad_norm": 0.3875386822649642, + "learning_rate": 2.7690623013533976e-06, + "loss": 0.6186, + "step": 1159 + }, + { + "epoch": 0.928, + "grad_norm": 0.43477554408887387, + "learning_rate": 2.708812932856253e-06, + "loss": 0.6688, + "step": 1160 + }, + { + "epoch": 0.9288, + "grad_norm": 0.401224756273508, + "learning_rate": 2.649217248223468e-06, + "loss": 0.6272, + "step": 1161 + }, + { + "epoch": 0.9296, + "grad_norm": 0.40867393736243735, + "learning_rate": 2.590275647868867e-06, + "loss": 0.606, + "step": 1162 + }, + { + "epoch": 0.9304, + "grad_norm": 0.4026384489112317, + "learning_rate": 2.5319885278115906e-06, + "loss": 0.6251, + "step": 1163 + }, + { + "epoch": 0.9312, + "grad_norm": 0.38630138834839706, + "learning_rate": 2.4743562796734622e-06, + "loss": 0.6321, + "step": 1164 + }, + { + "epoch": 0.932, + "grad_norm": 0.4304460037135496, + "learning_rate": 2.4173792906762804e-06, + "loss": 0.5856, + "step": 1165 + }, + { + "epoch": 0.9328, + "grad_norm": 0.437848733905509, + "learning_rate": 2.3610579436393e-06, + "loss": 0.6794, + "step": 1166 + }, + { + "epoch": 0.9336, + "grad_norm": 0.4005311104954899, + "learning_rate": 2.3053926169765984e-06, + "loss": 0.6497, + "step": 1167 + }, + { + "epoch": 0.9344, + "grad_norm": 0.4199134531108724, + "learning_rate": 2.250383684694579e-06, + "loss": 0.6099, + "step": 1168 + }, + { + "epoch": 0.9352, + "grad_norm": 0.3979361042617356, + "learning_rate": 2.1960315163894075e-06, + "loss": 0.6798, + "step": 1169 + }, + { + "epoch": 0.936, + "grad_norm": 0.386354953580877, + "learning_rate": 2.1423364772445887e-06, + "loss": 0.5612, + "step": 1170 + }, + { + "epoch": 0.9368, + "grad_norm": 0.404254564584727, + "learning_rate": 2.0892989280284823e-06, + "loss": 0.6745, + "step": 1171 + }, + { + "epoch": 0.9376, + "grad_norm": 0.42490876854613235, + "learning_rate": 2.036919225091827e-06, + "loss": 0.5576, + "step": 1172 + }, + { + "epoch": 0.9384, + "grad_norm": 0.47076547735719737, + "learning_rate": 1.9851977203654835e-06, + "loss": 0.6711, + "step": 1173 + }, + { + "epoch": 0.9392, + "grad_norm": 0.38845126740707286, + "learning_rate": 1.9341347613579087e-06, + "loss": 0.6826, + "step": 1174 + }, + { + "epoch": 0.94, + "grad_norm": 0.4269692732958586, + "learning_rate": 1.8837306911529184e-06, + "loss": 0.6626, + "step": 1175 + }, + { + "epoch": 0.9408, + "grad_norm": 0.3392421835390835, + "learning_rate": 1.8339858484073935e-06, + "loss": 0.5744, + "step": 1176 + }, + { + "epoch": 0.9416, + "grad_norm": 0.42102143136980824, + "learning_rate": 1.7849005673489127e-06, + "loss": 0.6317, + "step": 1177 + }, + { + "epoch": 0.9424, + "grad_norm": 0.3860964066759228, + "learning_rate": 1.7364751777736332e-06, + "loss": 0.6548, + "step": 1178 + }, + { + "epoch": 0.9432, + "grad_norm": 0.43600753519657165, + "learning_rate": 1.6887100050439587e-06, + "loss": 0.6195, + "step": 1179 + }, + { + "epoch": 0.944, + "grad_norm": 0.42991034950979723, + "learning_rate": 1.6416053700863964e-06, + "loss": 0.6245, + "step": 1180 + }, + { + "epoch": 0.9448, + "grad_norm": 0.5136795950746355, + "learning_rate": 1.595161589389449e-06, + "loss": 0.6429, + "step": 1181 + }, + { + "epoch": 0.9456, + "grad_norm": 0.3936575876555075, + "learning_rate": 1.5493789750014031e-06, + "loss": 0.6021, + "step": 1182 + }, + { + "epoch": 0.9464, + "grad_norm": 0.42876218271277017, + "learning_rate": 1.5042578345283108e-06, + "loss": 0.6478, + "step": 1183 + }, + { + "epoch": 0.9472, + "grad_norm": 0.3946883037955344, + "learning_rate": 1.459798471131868e-06, + "loss": 0.6055, + "step": 1184 + }, + { + "epoch": 0.948, + "grad_norm": 0.35540431096006864, + "learning_rate": 1.4160011835273934e-06, + "loss": 0.6307, + "step": 1185 + }, + { + "epoch": 0.9488, + "grad_norm": 0.4403763082610581, + "learning_rate": 1.3728662659818204e-06, + "loss": 0.6056, + "step": 1186 + }, + { + "epoch": 0.9496, + "grad_norm": 0.49814568194496855, + "learning_rate": 1.3303940083117527e-06, + "loss": 0.734, + "step": 1187 + }, + { + "epoch": 0.9504, + "grad_norm": 0.41978407244414206, + "learning_rate": 1.2885846958814673e-06, + "loss": 0.6491, + "step": 1188 + }, + { + "epoch": 0.9512, + "grad_norm": 0.41463671705550975, + "learning_rate": 1.2474386096010039e-06, + "loss": 0.6125, + "step": 1189 + }, + { + "epoch": 0.952, + "grad_norm": 0.4117919435091052, + "learning_rate": 1.2069560259243328e-06, + "loss": 0.6596, + "step": 1190 + }, + { + "epoch": 0.9528, + "grad_norm": 0.4339789880833592, + "learning_rate": 1.1671372168474138e-06, + "loss": 0.629, + "step": 1191 + }, + { + "epoch": 0.9536, + "grad_norm": 0.5259979050287945, + "learning_rate": 1.1279824499064396e-06, + "loss": 0.6626, + "step": 1192 + }, + { + "epoch": 0.9544, + "grad_norm": 0.37344303123801154, + "learning_rate": 1.089491988176017e-06, + "loss": 0.629, + "step": 1193 + }, + { + "epoch": 0.9552, + "grad_norm": 0.3790064804687914, + "learning_rate": 1.0516660902673448e-06, + "loss": 0.5889, + "step": 1194 + }, + { + "epoch": 0.956, + "grad_norm": 0.45761397831864875, + "learning_rate": 1.014505010326583e-06, + "loss": 0.6793, + "step": 1195 + }, + { + "epoch": 0.9568, + "grad_norm": 0.4289257270302121, + "learning_rate": 9.780089980330642e-07, + "loss": 0.6444, + "step": 1196 + }, + { + "epoch": 0.9576, + "grad_norm": 0.38046977543975785, + "learning_rate": 9.421782985976068e-07, + "loss": 0.6396, + "step": 1197 + }, + { + "epoch": 0.9584, + "grad_norm": 0.3951377232346646, + "learning_rate": 9.070131527609604e-07, + "loss": 0.68, + "step": 1198 + }, + { + "epoch": 0.9592, + "grad_norm": 0.4212853470737934, + "learning_rate": 8.725137967920738e-07, + "loss": 0.6226, + "step": 1199 + }, + { + "epoch": 0.96, + "grad_norm": 0.48858763273052036, + "learning_rate": 8.386804624865851e-07, + "loss": 0.6551, + "step": 1200 + }, + { + "epoch": 0.9608, + "grad_norm": 0.4135828243426833, + "learning_rate": 8.055133771652345e-07, + "loss": 0.6525, + "step": 1201 + }, + { + "epoch": 0.9616, + "grad_norm": 0.3847938876096351, + "learning_rate": 7.730127636723539e-07, + "loss": 0.5939, + "step": 1202 + }, + { + "epoch": 0.9624, + "grad_norm": 0.38890861500471724, + "learning_rate": 7.411788403743237e-07, + "loss": 0.5691, + "step": 1203 + }, + { + "epoch": 0.9632, + "grad_norm": 0.4136033176045734, + "learning_rate": 7.100118211581852e-07, + "loss": 0.6644, + "step": 1204 + }, + { + "epoch": 0.964, + "grad_norm": 0.6868547482667599, + "learning_rate": 6.7951191543012e-07, + "loss": 0.8096, + "step": 1205 + }, + { + "epoch": 0.9648, + "grad_norm": 0.459762749444286, + "learning_rate": 6.496793281141056e-07, + "loss": 0.6286, + "step": 1206 + }, + { + "epoch": 0.9656, + "grad_norm": 0.42440389461679645, + "learning_rate": 6.205142596505176e-07, + "loss": 0.6463, + "step": 1207 + }, + { + "epoch": 0.9664, + "grad_norm": 0.39160122224974353, + "learning_rate": 5.920169059947411e-07, + "loss": 0.6527, + "step": 1208 + }, + { + "epoch": 0.9672, + "grad_norm": 0.39897606690271464, + "learning_rate": 5.64187458615939e-07, + "loss": 0.5795, + "step": 1209 + }, + { + "epoch": 0.968, + "grad_norm": 0.37557030555380766, + "learning_rate": 5.370261044956971e-07, + "loss": 0.5773, + "step": 1210 + }, + { + "epoch": 0.9688, + "grad_norm": 0.4339982295309075, + "learning_rate": 5.105330261267916e-07, + "loss": 0.6233, + "step": 1211 + }, + { + "epoch": 0.9696, + "grad_norm": 0.39570577798208106, + "learning_rate": 4.847084015119574e-07, + "loss": 0.6276, + "step": 1212 + }, + { + "epoch": 0.9704, + "grad_norm": 0.3693545088566975, + "learning_rate": 4.5955240416271084e-07, + "loss": 0.6218, + "step": 1213 + }, + { + "epoch": 0.9712, + "grad_norm": 0.4255677202256316, + "learning_rate": 4.3506520309813947e-07, + "loss": 0.655, + "step": 1214 + }, + { + "epoch": 0.972, + "grad_norm": 0.3815337333068184, + "learning_rate": 4.112469628438365e-07, + "loss": 0.6685, + "step": 1215 + }, + { + "epoch": 0.9728, + "grad_norm": 0.40631620283837727, + "learning_rate": 3.8809784343072366e-07, + "loss": 0.6269, + "step": 1216 + }, + { + "epoch": 0.9736, + "grad_norm": 0.5140202578119795, + "learning_rate": 3.6561800039403016e-07, + "loss": 0.705, + "step": 1217 + }, + { + "epoch": 0.9744, + "grad_norm": 0.4097602726401862, + "learning_rate": 3.4380758477219333e-07, + "loss": 0.7011, + "step": 1218 + }, + { + "epoch": 0.9752, + "grad_norm": 0.356559194773096, + "learning_rate": 3.2266674310589273e-07, + "loss": 0.5875, + "step": 1219 + }, + { + "epoch": 0.976, + "grad_norm": 0.47372925471991206, + "learning_rate": 3.0219561743707326e-07, + "loss": 0.6191, + "step": 1220 + }, + { + "epoch": 0.9768, + "grad_norm": 0.40884250486595014, + "learning_rate": 2.8239434530792365e-07, + "loss": 0.6747, + "step": 1221 + }, + { + "epoch": 0.9776, + "grad_norm": 0.35824242354732977, + "learning_rate": 2.6326305976001055e-07, + "loss": 0.5815, + "step": 1222 + }, + { + "epoch": 0.9784, + "grad_norm": 0.4188952895360446, + "learning_rate": 2.448018893333681e-07, + "loss": 0.6485, + "step": 1223 + }, + { + "epoch": 0.9792, + "grad_norm": 0.4248652087397519, + "learning_rate": 2.2701095806565432e-07, + "loss": 0.685, + "step": 1224 + }, + { + "epoch": 0.98, + "grad_norm": 0.4474075111691391, + "learning_rate": 2.098903854912515e-07, + "loss": 0.7218, + "step": 1225 + }, + { + "epoch": 0.9808, + "grad_norm": 0.3928266375173183, + "learning_rate": 1.9344028664056713e-07, + "loss": 0.6051, + "step": 1226 + }, + { + "epoch": 0.9816, + "grad_norm": 0.456892325397415, + "learning_rate": 1.7766077203915655e-07, + "loss": 0.607, + "step": 1227 + }, + { + "epoch": 0.9824, + "grad_norm": 0.394065013835043, + "learning_rate": 1.6255194770704586e-07, + "loss": 0.6122, + "step": 1228 + }, + { + "epoch": 0.9832, + "grad_norm": 0.38099445169011364, + "learning_rate": 1.481139151579991e-07, + "loss": 0.6408, + "step": 1229 + }, + { + "epoch": 0.984, + "grad_norm": 0.3616810806947919, + "learning_rate": 1.3434677139885222e-07, + "loss": 0.588, + "step": 1230 + }, + { + "epoch": 0.9848, + "grad_norm": 0.3464251650455377, + "learning_rate": 1.2125060892881346e-07, + "loss": 0.6282, + "step": 1231 + }, + { + "epoch": 0.9856, + "grad_norm": 0.4305910204395361, + "learning_rate": 1.0882551573891953e-07, + "loss": 0.6426, + "step": 1232 + }, + { + "epoch": 0.9864, + "grad_norm": 0.4601820483165333, + "learning_rate": 9.707157531134713e-08, + "loss": 0.6547, + "step": 1233 + }, + { + "epoch": 0.9872, + "grad_norm": 0.36916695811479766, + "learning_rate": 8.598886661895788e-08, + "loss": 0.6605, + "step": 1234 + }, + { + "epoch": 0.988, + "grad_norm": 0.38115261332943323, + "learning_rate": 7.557746412468758e-08, + "loss": 0.6157, + "step": 1235 + }, + { + "epoch": 0.9888, + "grad_norm": 0.3988189730753151, + "learning_rate": 6.583743778106887e-08, + "loss": 0.6277, + "step": 1236 + }, + { + "epoch": 0.9896, + "grad_norm": 0.38448671069725276, + "learning_rate": 5.6768853029787184e-08, + "loss": 0.6408, + "step": 1237 + }, + { + "epoch": 0.9904, + "grad_norm": 0.44966188238655397, + "learning_rate": 4.837177080119215e-08, + "loss": 0.6595, + "step": 1238 + }, + { + "epoch": 0.9912, + "grad_norm": 0.3983863858624136, + "learning_rate": 4.064624751394242e-08, + "loss": 0.6776, + "step": 1239 + }, + { + "epoch": 0.992, + "grad_norm": 0.38956811671641306, + "learning_rate": 3.359233507459481e-08, + "loss": 0.6073, + "step": 1240 + }, + { + "epoch": 0.9928, + "grad_norm": 0.3740991924737241, + "learning_rate": 2.7210080877237976e-08, + "loss": 0.5863, + "step": 1241 + }, + { + "epoch": 0.9936, + "grad_norm": 0.4382104532354976, + "learning_rate": 2.1499527803214846e-08, + "loss": 0.6068, + "step": 1242 + }, + { + "epoch": 0.9944, + "grad_norm": 0.44523308215970514, + "learning_rate": 1.646071422083395e-08, + "loss": 0.626, + "step": 1243 + }, + { + "epoch": 0.9952, + "grad_norm": 0.4050514605095301, + "learning_rate": 1.209367398504746e-08, + "loss": 0.566, + "step": 1244 + }, + { + "epoch": 0.996, + "grad_norm": 0.44418607390339454, + "learning_rate": 8.398436437317969e-09, + "loss": 0.7074, + "step": 1245 + }, + { + "epoch": 0.9968, + "grad_norm": 0.4078146301957918, + "learning_rate": 5.375026405352035e-09, + "loss": 0.6424, + "step": 1246 + }, + { + "epoch": 0.9976, + "grad_norm": 0.5548149284659238, + "learning_rate": 3.023464202944748e-09, + "loss": 0.7001, + "step": 1247 + }, + { + "epoch": 0.9984, + "grad_norm": 0.46736439462345836, + "learning_rate": 1.3437656298687097e-09, + "loss": 0.7094, + "step": 1248 + }, + { + "epoch": 0.9992, + "grad_norm": 0.4518059608864925, + "learning_rate": 3.3594197175190745e-10, + "loss": 0.6831, + "step": 1249 + }, + { + "epoch": 1.0, + "grad_norm": 0.4425256887616959, + "learning_rate": 0.0, + "loss": 0.6108, + "step": 1250 + }, + { + "epoch": 1.0, + "step": 1250, + "total_flos": 1092507233615872.0, + "train_loss": 0.7165939631938935, + "train_runtime": 19654.796, + "train_samples_per_second": 1.018, + "train_steps_per_second": 0.064 + } + ], + "logging_steps": 1.0, + "max_steps": 1250, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1092507233615872.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_20000_repeat_2_epochs_1_GA_2_lora/README.md b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_20000_repeat_2_epochs_1_GA_2_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_20000_repeat_2_epochs_1_GA_2_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_20000_repeat_2_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_20000_repeat_2_epochs_1_GA_2_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..7aacd364b6a2e06745417f97963fbf5ef1bd654f --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_20000_repeat_2_epochs_1_GA_2_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "up_proj", + "v_proj", + "q_proj", + "down_proj", + "o_proj", + "gate_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_20000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_20000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..621bceadcc0722ba6a274d73b215dc7505e0fb0b --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_20000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4e77648b06a232b0f637629669895ef414878adeb2c1e242ee78c4c1e8c4384 +size 671150064 diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_20000_repeat_2_epochs_1_GA_2_lora/config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_20000_repeat_2_epochs_1_GA_2_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_20000_repeat_2_epochs_1_GA_2_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_20000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_20000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..f558ae91a282fc45f2aa3944bed7ed032a3a0530 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_20000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dcd0d75dc06578ee9f1adc8d554a420f8d979fea00ed1483f42202a5a4597b6f +size 918507402 diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_20000_repeat_2_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_20000_repeat_2_epochs_1_GA_2_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..3e2db56473d5b1d17f4975876e3d6e95c36d3fc8 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_20000_repeat_2_epochs_1_GA_2_lora/trainer_state.json @@ -0,0 +1,8792 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 1250, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0008, + "grad_norm": 0.9040188214118176, + "learning_rate": 5.263157894736842e-06, + "loss": 1.4158, + "step": 1 + }, + { + "epoch": 0.0016, + "grad_norm": 0.8074281761096858, + "learning_rate": 1.0526315789473684e-05, + "loss": 1.276, + "step": 2 + }, + { + "epoch": 0.0024, + "grad_norm": 0.9202532471159965, + "learning_rate": 1.5789473684210526e-05, + "loss": 1.3454, + "step": 3 + }, + { + "epoch": 0.0032, + "grad_norm": 0.9621776164972817, + "learning_rate": 2.105263157894737e-05, + "loss": 1.4377, + "step": 4 + }, + { + "epoch": 0.004, + "grad_norm": 0.8873872672858544, + "learning_rate": 2.6315789473684212e-05, + "loss": 1.4082, + "step": 5 + }, + { + "epoch": 0.0048, + "grad_norm": 0.8203039349084842, + "learning_rate": 3.157894736842105e-05, + "loss": 1.3406, + "step": 6 + }, + { + "epoch": 0.0056, + "grad_norm": 0.808691922577677, + "learning_rate": 3.6842105263157895e-05, + "loss": 1.2893, + "step": 7 + }, + { + "epoch": 0.0064, + "grad_norm": 1.1192000416783763, + "learning_rate": 4.210526315789474e-05, + "loss": 1.2639, + "step": 8 + }, + { + "epoch": 0.0072, + "grad_norm": 0.9533747850668197, + "learning_rate": 4.736842105263158e-05, + "loss": 1.0958, + "step": 9 + }, + { + "epoch": 0.008, + "grad_norm": 1.5229297009363443, + "learning_rate": 5.2631578947368424e-05, + "loss": 1.1333, + "step": 10 + }, + { + "epoch": 0.0088, + "grad_norm": 0.8190964064118581, + "learning_rate": 5.789473684210527e-05, + "loss": 0.9881, + "step": 11 + }, + { + "epoch": 0.0096, + "grad_norm": 0.6678239522256918, + "learning_rate": 6.31578947368421e-05, + "loss": 0.9268, + "step": 12 + }, + { + "epoch": 0.0104, + "grad_norm": 0.7765669885509979, + "learning_rate": 6.842105263157895e-05, + "loss": 1.0071, + "step": 13 + }, + { + "epoch": 0.0112, + "grad_norm": 0.7902060939230818, + "learning_rate": 7.368421052631579e-05, + "loss": 0.9962, + "step": 14 + }, + { + "epoch": 0.012, + "grad_norm": 0.6222902380931187, + "learning_rate": 7.894736842105263e-05, + "loss": 0.8933, + "step": 15 + }, + { + "epoch": 0.0128, + "grad_norm": 0.6276138188304536, + "learning_rate": 8.421052631578948e-05, + "loss": 0.9499, + "step": 16 + }, + { + "epoch": 0.0136, + "grad_norm": 0.504029875044396, + "learning_rate": 8.947368421052632e-05, + "loss": 0.8542, + "step": 17 + }, + { + "epoch": 0.0144, + "grad_norm": 0.5882045078003736, + "learning_rate": 9.473684210526316e-05, + "loss": 0.9945, + "step": 18 + }, + { + "epoch": 0.0152, + "grad_norm": 0.6067680318757338, + "learning_rate": 0.0001, + "loss": 0.9622, + "step": 19 + }, + { + "epoch": 0.016, + "grad_norm": 0.518887948143142, + "learning_rate": 0.00010526315789473685, + "loss": 0.8837, + "step": 20 + }, + { + "epoch": 0.0168, + "grad_norm": 0.5203264428776911, + "learning_rate": 0.0001105263157894737, + "loss": 0.8789, + "step": 21 + }, + { + "epoch": 0.0176, + "grad_norm": 0.47424481517796524, + "learning_rate": 0.00011578947368421053, + "loss": 0.8534, + "step": 22 + }, + { + "epoch": 0.0184, + "grad_norm": 0.4941053704203212, + "learning_rate": 0.00012105263157894738, + "loss": 0.8618, + "step": 23 + }, + { + "epoch": 0.0192, + "grad_norm": 0.5818901801269433, + "learning_rate": 0.0001263157894736842, + "loss": 0.8146, + "step": 24 + }, + { + "epoch": 0.02, + "grad_norm": 0.5911189948900281, + "learning_rate": 0.00013157894736842108, + "loss": 0.9155, + "step": 25 + }, + { + "epoch": 0.0208, + "grad_norm": 0.48524617173345874, + "learning_rate": 0.0001368421052631579, + "loss": 0.8631, + "step": 26 + }, + { + "epoch": 0.0216, + "grad_norm": 0.5138017758711481, + "learning_rate": 0.00014210526315789474, + "loss": 0.8625, + "step": 27 + }, + { + "epoch": 0.0224, + "grad_norm": 0.5764785783785729, + "learning_rate": 0.00014736842105263158, + "loss": 0.8924, + "step": 28 + }, + { + "epoch": 0.0232, + "grad_norm": 0.5617563618701408, + "learning_rate": 0.00015263157894736845, + "loss": 0.9154, + "step": 29 + }, + { + "epoch": 0.024, + "grad_norm": 0.48801519614793604, + "learning_rate": 0.00015789473684210527, + "loss": 0.9098, + "step": 30 + }, + { + "epoch": 0.0248, + "grad_norm": 0.5886665769080431, + "learning_rate": 0.0001631578947368421, + "loss": 0.8731, + "step": 31 + }, + { + "epoch": 0.0256, + "grad_norm": 0.5038223806917298, + "learning_rate": 0.00016842105263157895, + "loss": 0.8437, + "step": 32 + }, + { + "epoch": 0.0264, + "grad_norm": 0.4910317511980981, + "learning_rate": 0.0001736842105263158, + "loss": 0.8726, + "step": 33 + }, + { + "epoch": 0.0272, + "grad_norm": 0.4970925346086658, + "learning_rate": 0.00017894736842105264, + "loss": 0.8247, + "step": 34 + }, + { + "epoch": 0.028, + "grad_norm": 0.4783118687921105, + "learning_rate": 0.00018421052631578948, + "loss": 0.8158, + "step": 35 + }, + { + "epoch": 0.0288, + "grad_norm": 0.57283183166534, + "learning_rate": 0.00018947368421052632, + "loss": 0.8997, + "step": 36 + }, + { + "epoch": 0.0296, + "grad_norm": 0.4869656964368314, + "learning_rate": 0.00019473684210526317, + "loss": 0.8427, + "step": 37 + }, + { + "epoch": 0.0304, + "grad_norm": 0.51038300513493, + "learning_rate": 0.0002, + "loss": 0.9406, + "step": 38 + }, + { + "epoch": 0.0312, + "grad_norm": 0.4738312757391544, + "learning_rate": 0.00019999966405802826, + "loss": 0.7848, + "step": 39 + }, + { + "epoch": 0.032, + "grad_norm": 0.48785244711182263, + "learning_rate": 0.00019999865623437013, + "loss": 0.8485, + "step": 40 + }, + { + "epoch": 0.0328, + "grad_norm": 0.48335867509192165, + "learning_rate": 0.00019999697653579705, + "loss": 0.8593, + "step": 41 + }, + { + "epoch": 0.0336, + "grad_norm": 0.4371580297482496, + "learning_rate": 0.00019999462497359466, + "loss": 0.7799, + "step": 42 + }, + { + "epoch": 0.0344, + "grad_norm": 0.4848757194195308, + "learning_rate": 0.0001999916015635627, + "loss": 0.8057, + "step": 43 + }, + { + "epoch": 0.0352, + "grad_norm": 0.5953297448372896, + "learning_rate": 0.00019998790632601496, + "loss": 0.9029, + "step": 44 + }, + { + "epoch": 0.036, + "grad_norm": 0.4832215495190979, + "learning_rate": 0.00019998353928577919, + "loss": 0.9057, + "step": 45 + }, + { + "epoch": 0.0368, + "grad_norm": 0.472982312319888, + "learning_rate": 0.0001999785004721968, + "loss": 0.8817, + "step": 46 + }, + { + "epoch": 0.0376, + "grad_norm": 0.5515206751669125, + "learning_rate": 0.0001999727899191228, + "loss": 0.8644, + "step": 47 + }, + { + "epoch": 0.0384, + "grad_norm": 0.504088694158904, + "learning_rate": 0.00019996640766492543, + "loss": 0.914, + "step": 48 + }, + { + "epoch": 0.0392, + "grad_norm": 0.4855141657192389, + "learning_rate": 0.00019995935375248606, + "loss": 0.8794, + "step": 49 + }, + { + "epoch": 0.04, + "grad_norm": 0.45174233023081406, + "learning_rate": 0.00019995162822919883, + "loss": 0.8209, + "step": 50 + }, + { + "epoch": 0.0408, + "grad_norm": 0.5366171085546535, + "learning_rate": 0.00019994323114697022, + "loss": 0.8341, + "step": 51 + }, + { + "epoch": 0.0416, + "grad_norm": 0.416867725937757, + "learning_rate": 0.00019993416256221895, + "loss": 0.8277, + "step": 52 + }, + { + "epoch": 0.0424, + "grad_norm": 0.493119369029618, + "learning_rate": 0.0001999244225358753, + "loss": 0.8414, + "step": 53 + }, + { + "epoch": 0.0432, + "grad_norm": 0.45936334657120104, + "learning_rate": 0.00019991401113338104, + "loss": 0.8536, + "step": 54 + }, + { + "epoch": 0.044, + "grad_norm": 0.4925848121650304, + "learning_rate": 0.00019990292842468868, + "loss": 0.8257, + "step": 55 + }, + { + "epoch": 0.0448, + "grad_norm": 0.5090191041405892, + "learning_rate": 0.00019989117448426108, + "loss": 0.8029, + "step": 56 + }, + { + "epoch": 0.0456, + "grad_norm": 0.4610077140549898, + "learning_rate": 0.0001998787493910712, + "loss": 0.802, + "step": 57 + }, + { + "epoch": 0.0464, + "grad_norm": 0.45304790763425, + "learning_rate": 0.00019986565322860115, + "loss": 0.7792, + "step": 58 + }, + { + "epoch": 0.0472, + "grad_norm": 0.4851067386832999, + "learning_rate": 0.000199851886084842, + "loss": 0.8257, + "step": 59 + }, + { + "epoch": 0.048, + "grad_norm": 0.49386766283262384, + "learning_rate": 0.00019983744805229296, + "loss": 0.8654, + "step": 60 + }, + { + "epoch": 0.0488, + "grad_norm": 0.5778003128858153, + "learning_rate": 0.00019982233922796085, + "loss": 0.827, + "step": 61 + }, + { + "epoch": 0.0496, + "grad_norm": 0.5982362485307835, + "learning_rate": 0.00019980655971335945, + "loss": 0.8657, + "step": 62 + }, + { + "epoch": 0.0504, + "grad_norm": 0.4533742372264301, + "learning_rate": 0.00019979010961450878, + "loss": 0.7645, + "step": 63 + }, + { + "epoch": 0.0512, + "grad_norm": 0.43785735755869126, + "learning_rate": 0.00019977298904193437, + "loss": 0.7817, + "step": 64 + }, + { + "epoch": 0.052, + "grad_norm": 0.5031107349688599, + "learning_rate": 0.00019975519811066663, + "loss": 0.8511, + "step": 65 + }, + { + "epoch": 0.0528, + "grad_norm": 0.4266434125759772, + "learning_rate": 0.00019973673694024, + "loss": 0.8479, + "step": 66 + }, + { + "epoch": 0.0536, + "grad_norm": 0.7435323722682388, + "learning_rate": 0.0001997176056546921, + "loss": 0.8925, + "step": 67 + }, + { + "epoch": 0.0544, + "grad_norm": 0.5172299270586722, + "learning_rate": 0.00019969780438256293, + "loss": 0.7584, + "step": 68 + }, + { + "epoch": 0.0552, + "grad_norm": 0.4887792570599819, + "learning_rate": 0.0001996773332568941, + "loss": 0.8147, + "step": 69 + }, + { + "epoch": 0.056, + "grad_norm": 0.47558116813678436, + "learning_rate": 0.0001996561924152278, + "loss": 0.928, + "step": 70 + }, + { + "epoch": 0.0568, + "grad_norm": 0.4755402402219778, + "learning_rate": 0.00019963438199960599, + "loss": 0.8153, + "step": 71 + }, + { + "epoch": 0.0576, + "grad_norm": 0.43608674343070436, + "learning_rate": 0.0001996119021565693, + "loss": 0.7996, + "step": 72 + }, + { + "epoch": 0.0584, + "grad_norm": 0.4560152584275628, + "learning_rate": 0.00019958875303715615, + "loss": 0.7683, + "step": 73 + }, + { + "epoch": 0.0592, + "grad_norm": 0.552602123722438, + "learning_rate": 0.0001995649347969019, + "loss": 0.7999, + "step": 74 + }, + { + "epoch": 0.06, + "grad_norm": 0.5021024404811664, + "learning_rate": 0.0001995404475958373, + "loss": 0.8828, + "step": 75 + }, + { + "epoch": 0.0608, + "grad_norm": 0.45265727595403926, + "learning_rate": 0.00019951529159848805, + "loss": 0.8004, + "step": 76 + }, + { + "epoch": 0.0616, + "grad_norm": 0.4703802428678396, + "learning_rate": 0.0001994894669738732, + "loss": 0.8478, + "step": 77 + }, + { + "epoch": 0.0624, + "grad_norm": 0.4674731530607759, + "learning_rate": 0.00019946297389550433, + "loss": 0.7384, + "step": 78 + }, + { + "epoch": 0.0632, + "grad_norm": 0.4410723711791221, + "learning_rate": 0.0001994358125413841, + "loss": 0.7545, + "step": 79 + }, + { + "epoch": 0.064, + "grad_norm": 0.41824527148519003, + "learning_rate": 0.00019940798309400526, + "loss": 0.7648, + "step": 80 + }, + { + "epoch": 0.0648, + "grad_norm": 0.45545971341724595, + "learning_rate": 0.0001993794857403495, + "loss": 0.7813, + "step": 81 + }, + { + "epoch": 0.0656, + "grad_norm": 0.5004966730084717, + "learning_rate": 0.0001993503206718859, + "loss": 0.8505, + "step": 82 + }, + { + "epoch": 0.0664, + "grad_norm": 0.4760663850299163, + "learning_rate": 0.0001993204880845699, + "loss": 0.8244, + "step": 83 + }, + { + "epoch": 0.0672, + "grad_norm": 0.4882789811829889, + "learning_rate": 0.00019928998817884182, + "loss": 0.7859, + "step": 84 + }, + { + "epoch": 0.068, + "grad_norm": 0.45758092600536937, + "learning_rate": 0.00019925882115962568, + "loss": 0.7701, + "step": 85 + }, + { + "epoch": 0.0688, + "grad_norm": 0.45489206820776945, + "learning_rate": 0.00019922698723632767, + "loss": 0.7703, + "step": 86 + }, + { + "epoch": 0.0696, + "grad_norm": 0.4953973875429509, + "learning_rate": 0.00019919448662283478, + "loss": 0.7997, + "step": 87 + }, + { + "epoch": 0.0704, + "grad_norm": 0.4706752206269874, + "learning_rate": 0.00019916131953751342, + "loss": 0.7238, + "step": 88 + }, + { + "epoch": 0.0712, + "grad_norm": 0.5307930125586854, + "learning_rate": 0.00019912748620320794, + "loss": 0.8249, + "step": 89 + }, + { + "epoch": 0.072, + "grad_norm": 0.49981665350066634, + "learning_rate": 0.00019909298684723904, + "loss": 0.8521, + "step": 90 + }, + { + "epoch": 0.0728, + "grad_norm": 0.5564656093618034, + "learning_rate": 0.00019905782170140238, + "loss": 0.7512, + "step": 91 + }, + { + "epoch": 0.0736, + "grad_norm": 0.44834150164979303, + "learning_rate": 0.00019902199100196697, + "loss": 0.7718, + "step": 92 + }, + { + "epoch": 0.0744, + "grad_norm": 0.5159727888050307, + "learning_rate": 0.00019898549498967343, + "loss": 0.8076, + "step": 93 + }, + { + "epoch": 0.0752, + "grad_norm": 0.4787236389531373, + "learning_rate": 0.00019894833390973266, + "loss": 0.7917, + "step": 94 + }, + { + "epoch": 0.076, + "grad_norm": 0.492894818998774, + "learning_rate": 0.000198910508011824, + "loss": 0.7781, + "step": 95 + }, + { + "epoch": 0.0768, + "grad_norm": 0.4292074077063288, + "learning_rate": 0.00019887201755009357, + "loss": 0.7411, + "step": 96 + }, + { + "epoch": 0.0776, + "grad_norm": 0.47839784486091336, + "learning_rate": 0.00019883286278315262, + "loss": 0.8444, + "step": 97 + }, + { + "epoch": 0.0784, + "grad_norm": 0.5133482397538831, + "learning_rate": 0.0001987930439740757, + "loss": 0.8673, + "step": 98 + }, + { + "epoch": 0.0792, + "grad_norm": 0.42033885728008547, + "learning_rate": 0.00019875256139039902, + "loss": 0.7766, + "step": 99 + }, + { + "epoch": 0.08, + "grad_norm": 0.46739787930042637, + "learning_rate": 0.00019871141530411853, + "loss": 0.8283, + "step": 100 + }, + { + "epoch": 0.0808, + "grad_norm": 0.4307240442868828, + "learning_rate": 0.00019866960599168826, + "loss": 0.7975, + "step": 101 + }, + { + "epoch": 0.0816, + "grad_norm": 0.49899824851338265, + "learning_rate": 0.0001986271337340182, + "loss": 0.8347, + "step": 102 + }, + { + "epoch": 0.0824, + "grad_norm": 0.44772959357290676, + "learning_rate": 0.0001985839988164726, + "loss": 0.7913, + "step": 103 + }, + { + "epoch": 0.0832, + "grad_norm": 0.3971504789375786, + "learning_rate": 0.00019854020152886814, + "loss": 0.7069, + "step": 104 + }, + { + "epoch": 0.084, + "grad_norm": 0.43937587550035584, + "learning_rate": 0.00019849574216547171, + "loss": 0.7717, + "step": 105 + }, + { + "epoch": 0.0848, + "grad_norm": 0.522585248895453, + "learning_rate": 0.0001984506210249986, + "loss": 0.788, + "step": 106 + }, + { + "epoch": 0.0856, + "grad_norm": 0.4876458104618733, + "learning_rate": 0.00019840483841061058, + "loss": 0.7161, + "step": 107 + }, + { + "epoch": 0.0864, + "grad_norm": 0.44230394173071313, + "learning_rate": 0.00019835839462991361, + "loss": 0.8522, + "step": 108 + }, + { + "epoch": 0.0872, + "grad_norm": 0.5118965668326318, + "learning_rate": 0.00019831128999495606, + "loss": 0.8358, + "step": 109 + }, + { + "epoch": 0.088, + "grad_norm": 0.4307903664861674, + "learning_rate": 0.00019826352482222638, + "loss": 0.7655, + "step": 110 + }, + { + "epoch": 0.0888, + "grad_norm": 0.40542527217686575, + "learning_rate": 0.0001982150994326511, + "loss": 0.7809, + "step": 111 + }, + { + "epoch": 0.0896, + "grad_norm": 0.4575152055233773, + "learning_rate": 0.00019816601415159263, + "loss": 0.7818, + "step": 112 + }, + { + "epoch": 0.0904, + "grad_norm": 0.4492476210335051, + "learning_rate": 0.0001981162693088471, + "loss": 0.7938, + "step": 113 + }, + { + "epoch": 0.0912, + "grad_norm": 0.42168461140796104, + "learning_rate": 0.0001980658652386421, + "loss": 0.7822, + "step": 114 + }, + { + "epoch": 0.092, + "grad_norm": 0.48686512745299765, + "learning_rate": 0.0001980148022796345, + "loss": 0.7824, + "step": 115 + }, + { + "epoch": 0.0928, + "grad_norm": 0.40701186895737285, + "learning_rate": 0.00019796308077490817, + "loss": 0.7672, + "step": 116 + }, + { + "epoch": 0.0936, + "grad_norm": 0.5405954671628467, + "learning_rate": 0.00019791070107197153, + "loss": 0.8782, + "step": 117 + }, + { + "epoch": 0.0944, + "grad_norm": 0.4648181416591156, + "learning_rate": 0.00019785766352275542, + "loss": 0.7651, + "step": 118 + }, + { + "epoch": 0.0952, + "grad_norm": 0.47812112860109685, + "learning_rate": 0.0001978039684836106, + "loss": 0.7482, + "step": 119 + }, + { + "epoch": 0.096, + "grad_norm": 0.4649791025079638, + "learning_rate": 0.00019774961631530545, + "loss": 0.8236, + "step": 120 + }, + { + "epoch": 0.0968, + "grad_norm": 0.41964331638159746, + "learning_rate": 0.0001976946073830234, + "loss": 0.7619, + "step": 121 + }, + { + "epoch": 0.0976, + "grad_norm": 0.47113555893383857, + "learning_rate": 0.00019763894205636072, + "loss": 0.8301, + "step": 122 + }, + { + "epoch": 0.0984, + "grad_norm": 0.39594357398635965, + "learning_rate": 0.00019758262070932375, + "loss": 0.7286, + "step": 123 + }, + { + "epoch": 0.0992, + "grad_norm": 0.5197786058531375, + "learning_rate": 0.00019752564372032657, + "loss": 0.871, + "step": 124 + }, + { + "epoch": 0.1, + "grad_norm": 0.40914468391436765, + "learning_rate": 0.00019746801147218842, + "loss": 0.7935, + "step": 125 + }, + { + "epoch": 0.1008, + "grad_norm": 0.48421063496832645, + "learning_rate": 0.00019740972435213115, + "loss": 0.7757, + "step": 126 + }, + { + "epoch": 0.1016, + "grad_norm": 0.40375094393415706, + "learning_rate": 0.00019735078275177654, + "loss": 0.7551, + "step": 127 + }, + { + "epoch": 0.1024, + "grad_norm": 0.40718004312825856, + "learning_rate": 0.00019729118706714375, + "loss": 0.8125, + "step": 128 + }, + { + "epoch": 0.1032, + "grad_norm": 0.4812014892417162, + "learning_rate": 0.00019723093769864663, + "loss": 0.7058, + "step": 129 + }, + { + "epoch": 0.104, + "grad_norm": 0.4953728745419649, + "learning_rate": 0.00019717003505109095, + "loss": 0.8233, + "step": 130 + }, + { + "epoch": 0.1048, + "grad_norm": 0.4700253036597113, + "learning_rate": 0.0001971084795336719, + "loss": 0.7956, + "step": 131 + }, + { + "epoch": 0.1056, + "grad_norm": 0.5215964588468351, + "learning_rate": 0.00019704627155997108, + "loss": 0.844, + "step": 132 + }, + { + "epoch": 0.1064, + "grad_norm": 0.4724685471889322, + "learning_rate": 0.00019698341154795389, + "loss": 0.7369, + "step": 133 + }, + { + "epoch": 0.1072, + "grad_norm": 0.4834071598595188, + "learning_rate": 0.00019691989991996663, + "loss": 0.8177, + "step": 134 + }, + { + "epoch": 0.108, + "grad_norm": 0.507433079199277, + "learning_rate": 0.00019685573710273376, + "loss": 0.8305, + "step": 135 + }, + { + "epoch": 0.1088, + "grad_norm": 0.4728754801754339, + "learning_rate": 0.0001967909235273549, + "loss": 0.7487, + "step": 136 + }, + { + "epoch": 0.1096, + "grad_norm": 0.48745482688410646, + "learning_rate": 0.00019672545962930215, + "loss": 0.8188, + "step": 137 + }, + { + "epoch": 0.1104, + "grad_norm": 0.4642048721871913, + "learning_rate": 0.00019665934584841682, + "loss": 0.7658, + "step": 138 + }, + { + "epoch": 0.1112, + "grad_norm": 0.5378006281898445, + "learning_rate": 0.00019659258262890683, + "loss": 0.8066, + "step": 139 + }, + { + "epoch": 0.112, + "grad_norm": 0.41028452787634606, + "learning_rate": 0.00019652517041934356, + "loss": 0.7391, + "step": 140 + }, + { + "epoch": 0.1128, + "grad_norm": 0.5577951692374868, + "learning_rate": 0.00019645710967265882, + "loss": 0.7467, + "step": 141 + }, + { + "epoch": 0.1136, + "grad_norm": 0.44260415171980666, + "learning_rate": 0.00019638840084614182, + "loss": 0.7558, + "step": 142 + }, + { + "epoch": 0.1144, + "grad_norm": 0.543371139901953, + "learning_rate": 0.00019631904440143612, + "loss": 0.8466, + "step": 143 + }, + { + "epoch": 0.1152, + "grad_norm": 0.458721343297423, + "learning_rate": 0.00019624904080453655, + "loss": 0.8494, + "step": 144 + }, + { + "epoch": 0.116, + "grad_norm": 0.5390806782860094, + "learning_rate": 0.00019617839052578603, + "loss": 0.8402, + "step": 145 + }, + { + "epoch": 0.1168, + "grad_norm": 0.48744932087130155, + "learning_rate": 0.00019610709403987246, + "loss": 0.7678, + "step": 146 + }, + { + "epoch": 0.1176, + "grad_norm": 0.504789130409822, + "learning_rate": 0.0001960351518258255, + "loss": 0.7891, + "step": 147 + }, + { + "epoch": 0.1184, + "grad_norm": 0.4465558048903934, + "learning_rate": 0.00019596256436701324, + "loss": 0.8409, + "step": 148 + }, + { + "epoch": 0.1192, + "grad_norm": 0.3842798692320988, + "learning_rate": 0.00019588933215113926, + "loss": 0.6798, + "step": 149 + }, + { + "epoch": 0.12, + "grad_norm": 0.4945421272518994, + "learning_rate": 0.000195815455670239, + "loss": 0.7774, + "step": 150 + }, + { + "epoch": 0.1208, + "grad_norm": 0.43464997145903267, + "learning_rate": 0.00019574093542067673, + "loss": 0.7287, + "step": 151 + }, + { + "epoch": 0.1216, + "grad_norm": 0.48187487570821935, + "learning_rate": 0.00019566577190314197, + "loss": 0.783, + "step": 152 + }, + { + "epoch": 0.1224, + "grad_norm": 0.42590267799494497, + "learning_rate": 0.0001955899656226464, + "loss": 0.7811, + "step": 153 + }, + { + "epoch": 0.1232, + "grad_norm": 0.4476569948112045, + "learning_rate": 0.0001955135170885202, + "loss": 0.7811, + "step": 154 + }, + { + "epoch": 0.124, + "grad_norm": 0.4746596130009304, + "learning_rate": 0.0001954364268144088, + "loss": 0.809, + "step": 155 + }, + { + "epoch": 0.1248, + "grad_norm": 0.439012305507357, + "learning_rate": 0.00019535869531826937, + "loss": 0.7954, + "step": 156 + }, + { + "epoch": 0.1256, + "grad_norm": 0.4849607584466126, + "learning_rate": 0.00019528032312236736, + "loss": 0.8013, + "step": 157 + }, + { + "epoch": 0.1264, + "grad_norm": 0.43950558594870676, + "learning_rate": 0.00019520131075327298, + "loss": 0.7969, + "step": 158 + }, + { + "epoch": 0.1272, + "grad_norm": 0.49193291403363903, + "learning_rate": 0.00019512165874185767, + "loss": 0.8599, + "step": 159 + }, + { + "epoch": 0.128, + "grad_norm": 0.47752096536945043, + "learning_rate": 0.00019504136762329047, + "loss": 0.7436, + "step": 160 + }, + { + "epoch": 0.1288, + "grad_norm": 0.5085683059812202, + "learning_rate": 0.0001949604379370345, + "loss": 0.8096, + "step": 161 + }, + { + "epoch": 0.1296, + "grad_norm": 0.48482351023606757, + "learning_rate": 0.00019487887022684336, + "loss": 0.7786, + "step": 162 + }, + { + "epoch": 0.1304, + "grad_norm": 0.45731700507184797, + "learning_rate": 0.00019479666504075736, + "loss": 0.7232, + "step": 163 + }, + { + "epoch": 0.1312, + "grad_norm": 0.4881198659574083, + "learning_rate": 0.00019471382293110003, + "loss": 0.8234, + "step": 164 + }, + { + "epoch": 0.132, + "grad_norm": 0.53339133035444, + "learning_rate": 0.0001946303444544741, + "loss": 0.8353, + "step": 165 + }, + { + "epoch": 0.1328, + "grad_norm": 0.5017086808903384, + "learning_rate": 0.00019454623017175812, + "loss": 0.7523, + "step": 166 + }, + { + "epoch": 0.1336, + "grad_norm": 0.4519444173975852, + "learning_rate": 0.00019446148064810242, + "loss": 0.8024, + "step": 167 + }, + { + "epoch": 0.1344, + "grad_norm": 0.4368475174874208, + "learning_rate": 0.00019437609645292546, + "loss": 0.7571, + "step": 168 + }, + { + "epoch": 0.1352, + "grad_norm": 0.46015022149254187, + "learning_rate": 0.00019429007815990993, + "loss": 0.7922, + "step": 169 + }, + { + "epoch": 0.136, + "grad_norm": 0.4526030924546475, + "learning_rate": 0.0001942034263469989, + "loss": 0.7819, + "step": 170 + }, + { + "epoch": 0.1368, + "grad_norm": 0.43878196577602335, + "learning_rate": 0.00019411614159639204, + "loss": 0.7911, + "step": 171 + }, + { + "epoch": 0.1376, + "grad_norm": 0.45829512673345807, + "learning_rate": 0.00019402822449454153, + "loss": 0.7991, + "step": 172 + }, + { + "epoch": 0.1384, + "grad_norm": 0.4673957138089711, + "learning_rate": 0.00019393967563214833, + "loss": 0.8296, + "step": 173 + }, + { + "epoch": 0.1392, + "grad_norm": 0.4570484436862653, + "learning_rate": 0.00019385049560415794, + "loss": 0.8398, + "step": 174 + }, + { + "epoch": 0.14, + "grad_norm": 0.44614625036883643, + "learning_rate": 0.00019376068500975667, + "loss": 0.7416, + "step": 175 + }, + { + "epoch": 0.1408, + "grad_norm": 0.4301659125357434, + "learning_rate": 0.00019367024445236754, + "loss": 0.7532, + "step": 176 + }, + { + "epoch": 0.1416, + "grad_norm": 0.4915260373013018, + "learning_rate": 0.000193579174539646, + "loss": 0.8246, + "step": 177 + }, + { + "epoch": 0.1424, + "grad_norm": 0.4588067567139292, + "learning_rate": 0.00019348747588347637, + "loss": 0.7283, + "step": 178 + }, + { + "epoch": 0.1432, + "grad_norm": 0.47831915684525106, + "learning_rate": 0.00019339514909996706, + "loss": 0.7658, + "step": 179 + }, + { + "epoch": 0.144, + "grad_norm": 0.44874842746039945, + "learning_rate": 0.00019330219480944694, + "loss": 0.862, + "step": 180 + }, + { + "epoch": 0.1448, + "grad_norm": 0.5679768293469264, + "learning_rate": 0.00019320861363646095, + "loss": 0.7803, + "step": 181 + }, + { + "epoch": 0.1456, + "grad_norm": 0.5120399489784894, + "learning_rate": 0.00019311440620976597, + "loss": 0.8634, + "step": 182 + }, + { + "epoch": 0.1464, + "grad_norm": 0.4484584521277922, + "learning_rate": 0.00019301957316232658, + "loss": 0.7415, + "step": 183 + }, + { + "epoch": 0.1472, + "grad_norm": 0.4613681670976354, + "learning_rate": 0.0001929241151313108, + "loss": 0.8105, + "step": 184 + }, + { + "epoch": 0.148, + "grad_norm": 0.4423279781155262, + "learning_rate": 0.0001928280327580858, + "loss": 0.7747, + "step": 185 + }, + { + "epoch": 0.1488, + "grad_norm": 0.45490201656129764, + "learning_rate": 0.00019273132668821364, + "loss": 0.7403, + "step": 186 + }, + { + "epoch": 0.1496, + "grad_norm": 0.4220401525170389, + "learning_rate": 0.00019263399757144683, + "loss": 0.815, + "step": 187 + }, + { + "epoch": 0.1504, + "grad_norm": 0.5194451614177903, + "learning_rate": 0.00019253604606172417, + "loss": 0.8446, + "step": 188 + }, + { + "epoch": 0.1512, + "grad_norm": 0.45210322284078563, + "learning_rate": 0.000192437472817166, + "loss": 0.7791, + "step": 189 + }, + { + "epoch": 0.152, + "grad_norm": 0.46050866955640335, + "learning_rate": 0.00019233827850007027, + "loss": 0.7157, + "step": 190 + }, + { + "epoch": 0.1528, + "grad_norm": 0.4235505136533195, + "learning_rate": 0.00019223846377690754, + "loss": 0.7635, + "step": 191 + }, + { + "epoch": 0.1536, + "grad_norm": 0.4703681211642257, + "learning_rate": 0.00019213802931831696, + "loss": 0.8228, + "step": 192 + }, + { + "epoch": 0.1544, + "grad_norm": 0.44136920565020055, + "learning_rate": 0.00019203697579910154, + "loss": 0.7433, + "step": 193 + }, + { + "epoch": 0.1552, + "grad_norm": 0.45519261099119485, + "learning_rate": 0.00019193530389822363, + "loss": 0.7425, + "step": 194 + }, + { + "epoch": 0.156, + "grad_norm": 0.5383156346157177, + "learning_rate": 0.00019183301429880043, + "loss": 0.8363, + "step": 195 + }, + { + "epoch": 0.1568, + "grad_norm": 0.4649654247908177, + "learning_rate": 0.00019173010768809933, + "loss": 0.7671, + "step": 196 + }, + { + "epoch": 0.1576, + "grad_norm": 0.4028443817499883, + "learning_rate": 0.00019162658475753327, + "loss": 0.7348, + "step": 197 + }, + { + "epoch": 0.1584, + "grad_norm": 0.4189554064231662, + "learning_rate": 0.0001915224462026563, + "loss": 0.7221, + "step": 198 + }, + { + "epoch": 0.1592, + "grad_norm": 0.43893145761291824, + "learning_rate": 0.00019141769272315858, + "loss": 0.7863, + "step": 199 + }, + { + "epoch": 0.16, + "grad_norm": 0.4383750996085504, + "learning_rate": 0.00019131232502286188, + "loss": 0.8282, + "step": 200 + }, + { + "epoch": 0.1608, + "grad_norm": 0.35778735409711204, + "learning_rate": 0.00019120634380971496, + "loss": 0.6993, + "step": 201 + }, + { + "epoch": 0.1616, + "grad_norm": 0.4927470353487211, + "learning_rate": 0.0001910997497957885, + "loss": 0.7192, + "step": 202 + }, + { + "epoch": 0.1624, + "grad_norm": 0.4608492380594332, + "learning_rate": 0.0001909925436972706, + "loss": 0.7428, + "step": 203 + }, + { + "epoch": 0.1632, + "grad_norm": 0.48397589855498346, + "learning_rate": 0.00019088472623446183, + "loss": 0.8228, + "step": 204 + }, + { + "epoch": 0.164, + "grad_norm": 0.45123776740033, + "learning_rate": 0.00019077629813177036, + "loss": 0.7836, + "step": 205 + }, + { + "epoch": 0.1648, + "grad_norm": 0.48630214364118873, + "learning_rate": 0.00019066726011770726, + "loss": 0.7871, + "step": 206 + }, + { + "epoch": 0.1656, + "grad_norm": 0.48414963185872245, + "learning_rate": 0.00019055761292488142, + "loss": 0.8581, + "step": 207 + }, + { + "epoch": 0.1664, + "grad_norm": 0.43828562684155925, + "learning_rate": 0.0001904473572899947, + "loss": 0.7769, + "step": 208 + }, + { + "epoch": 0.1672, + "grad_norm": 0.47294900525907246, + "learning_rate": 0.00019033649395383702, + "loss": 0.7628, + "step": 209 + }, + { + "epoch": 0.168, + "grad_norm": 0.3897177476488877, + "learning_rate": 0.00019022502366128135, + "loss": 0.766, + "step": 210 + }, + { + "epoch": 0.1688, + "grad_norm": 0.434877741565264, + "learning_rate": 0.00019011294716127867, + "loss": 0.7797, + "step": 211 + }, + { + "epoch": 0.1696, + "grad_norm": 0.4168764136828289, + "learning_rate": 0.00019000026520685302, + "loss": 0.7935, + "step": 212 + }, + { + "epoch": 0.1704, + "grad_norm": 0.5733400243587182, + "learning_rate": 0.0001898869785550963, + "loss": 0.8599, + "step": 213 + }, + { + "epoch": 0.1712, + "grad_norm": 0.4670533609795901, + "learning_rate": 0.0001897730879671634, + "loss": 0.7234, + "step": 214 + }, + { + "epoch": 0.172, + "grad_norm": 0.3910677038914766, + "learning_rate": 0.00018965859420826684, + "loss": 0.6943, + "step": 215 + }, + { + "epoch": 0.1728, + "grad_norm": 0.4696248418112437, + "learning_rate": 0.00018954349804767184, + "loss": 0.8311, + "step": 216 + }, + { + "epoch": 0.1736, + "grad_norm": 0.4685919639322512, + "learning_rate": 0.00018942780025869098, + "loss": 0.8238, + "step": 217 + }, + { + "epoch": 0.1744, + "grad_norm": 0.4068952644051147, + "learning_rate": 0.00018931150161867916, + "loss": 0.7628, + "step": 218 + }, + { + "epoch": 0.1752, + "grad_norm": 0.5001897403160873, + "learning_rate": 0.00018919460290902826, + "loss": 0.7819, + "step": 219 + }, + { + "epoch": 0.176, + "grad_norm": 0.49498086830282373, + "learning_rate": 0.00018907710491516199, + "loss": 0.7927, + "step": 220 + }, + { + "epoch": 0.1768, + "grad_norm": 0.46919273381855464, + "learning_rate": 0.0001889590084265304, + "loss": 0.7854, + "step": 221 + }, + { + "epoch": 0.1776, + "grad_norm": 0.46719410797559685, + "learning_rate": 0.0001888403142366049, + "loss": 0.8124, + "step": 222 + }, + { + "epoch": 0.1784, + "grad_norm": 0.41358190476326695, + "learning_rate": 0.0001887210231428727, + "loss": 0.8068, + "step": 223 + }, + { + "epoch": 0.1792, + "grad_norm": 0.4048646256150849, + "learning_rate": 0.00018860113594683148, + "loss": 0.7136, + "step": 224 + }, + { + "epoch": 0.18, + "grad_norm": 0.4645755589249061, + "learning_rate": 0.0001884806534539841, + "loss": 0.7403, + "step": 225 + }, + { + "epoch": 0.1808, + "grad_norm": 0.45204161778965424, + "learning_rate": 0.00018835957647383303, + "loss": 0.7742, + "step": 226 + }, + { + "epoch": 0.1816, + "grad_norm": 0.4296617628050709, + "learning_rate": 0.0001882379058198751, + "loss": 0.72, + "step": 227 + }, + { + "epoch": 0.1824, + "grad_norm": 0.43401567939298147, + "learning_rate": 0.00018811564230959588, + "loss": 0.7885, + "step": 228 + }, + { + "epoch": 0.1832, + "grad_norm": 0.5041044277014657, + "learning_rate": 0.00018799278676446423, + "loss": 0.8149, + "step": 229 + }, + { + "epoch": 0.184, + "grad_norm": 0.5188659570705662, + "learning_rate": 0.00018786934000992688, + "loss": 0.8609, + "step": 230 + }, + { + "epoch": 0.1848, + "grad_norm": 0.42446039511917316, + "learning_rate": 0.00018774530287540278, + "loss": 0.7654, + "step": 231 + }, + { + "epoch": 0.1856, + "grad_norm": 0.389139862357896, + "learning_rate": 0.00018762067619427746, + "loss": 0.6807, + "step": 232 + }, + { + "epoch": 0.1864, + "grad_norm": 0.4231105336792517, + "learning_rate": 0.00018749546080389757, + "loss": 0.8009, + "step": 233 + }, + { + "epoch": 0.1872, + "grad_norm": 0.44988946913128747, + "learning_rate": 0.00018736965754556528, + "loss": 0.8206, + "step": 234 + }, + { + "epoch": 0.188, + "grad_norm": 0.43170221354481614, + "learning_rate": 0.00018724326726453244, + "loss": 0.7268, + "step": 235 + }, + { + "epoch": 0.1888, + "grad_norm": 0.5343955357192294, + "learning_rate": 0.00018711629080999504, + "loss": 0.8537, + "step": 236 + }, + { + "epoch": 0.1896, + "grad_norm": 0.45997831665191913, + "learning_rate": 0.00018698872903508755, + "loss": 0.8326, + "step": 237 + }, + { + "epoch": 0.1904, + "grad_norm": 0.4130366400377277, + "learning_rate": 0.00018686058279687698, + "loss": 0.7269, + "step": 238 + }, + { + "epoch": 0.1912, + "grad_norm": 0.4392305512227261, + "learning_rate": 0.0001867318529563574, + "loss": 0.79, + "step": 239 + }, + { + "epoch": 0.192, + "grad_norm": 0.4082950983629924, + "learning_rate": 0.00018660254037844388, + "loss": 0.712, + "step": 240 + }, + { + "epoch": 0.1928, + "grad_norm": 0.4291084127655194, + "learning_rate": 0.00018647264593196688, + "loss": 0.7103, + "step": 241 + }, + { + "epoch": 0.1936, + "grad_norm": 0.4589004338736792, + "learning_rate": 0.00018634217048966637, + "loss": 0.7601, + "step": 242 + }, + { + "epoch": 0.1944, + "grad_norm": 0.4591406240237622, + "learning_rate": 0.00018621111492818585, + "loss": 0.7697, + "step": 243 + }, + { + "epoch": 0.1952, + "grad_norm": 0.4962568753940962, + "learning_rate": 0.0001860794801280666, + "loss": 0.78, + "step": 244 + }, + { + "epoch": 0.196, + "grad_norm": 0.4134638367406118, + "learning_rate": 0.00018594726697374175, + "loss": 0.7439, + "step": 245 + }, + { + "epoch": 0.1968, + "grad_norm": 0.42922739161294365, + "learning_rate": 0.0001858144763535302, + "loss": 0.7037, + "step": 246 + }, + { + "epoch": 0.1976, + "grad_norm": 0.4258601748130776, + "learning_rate": 0.0001856811091596308, + "loss": 0.8155, + "step": 247 + }, + { + "epoch": 0.1984, + "grad_norm": 0.4343439953650062, + "learning_rate": 0.0001855471662881164, + "loss": 0.8443, + "step": 248 + }, + { + "epoch": 0.1992, + "grad_norm": 0.39816417780442004, + "learning_rate": 0.00018541264863892754, + "loss": 0.7369, + "step": 249 + }, + { + "epoch": 0.2, + "grad_norm": 0.48743224261890594, + "learning_rate": 0.00018527755711586678, + "loss": 0.862, + "step": 250 + }, + { + "epoch": 0.2008, + "grad_norm": 0.4472661555055735, + "learning_rate": 0.00018514189262659235, + "loss": 0.8504, + "step": 251 + }, + { + "epoch": 0.2016, + "grad_norm": 0.5496124715565721, + "learning_rate": 0.00018500565608261214, + "loss": 0.785, + "step": 252 + }, + { + "epoch": 0.2024, + "grad_norm": 0.39415142341636117, + "learning_rate": 0.00018486884839927768, + "loss": 0.7165, + "step": 253 + }, + { + "epoch": 0.2032, + "grad_norm": 0.42039212706623386, + "learning_rate": 0.00018473147049577774, + "loss": 0.7382, + "step": 254 + }, + { + "epoch": 0.204, + "grad_norm": 0.48614478447830717, + "learning_rate": 0.0001845935232951325, + "loss": 0.7797, + "step": 255 + }, + { + "epoch": 0.2048, + "grad_norm": 0.46549521278374123, + "learning_rate": 0.00018445500772418697, + "loss": 0.7051, + "step": 256 + }, + { + "epoch": 0.2056, + "grad_norm": 0.5232250422287944, + "learning_rate": 0.00018431592471360503, + "loss": 0.7975, + "step": 257 + }, + { + "epoch": 0.2064, + "grad_norm": 0.5523566081412623, + "learning_rate": 0.00018417627519786315, + "loss": 0.8275, + "step": 258 + }, + { + "epoch": 0.2072, + "grad_norm": 0.4472619801194902, + "learning_rate": 0.000184036060115244, + "loss": 0.8145, + "step": 259 + }, + { + "epoch": 0.208, + "grad_norm": 0.4136886310159193, + "learning_rate": 0.00018389528040783012, + "loss": 0.7447, + "step": 260 + }, + { + "epoch": 0.2088, + "grad_norm": 0.4612846975781372, + "learning_rate": 0.00018375393702149787, + "loss": 0.7825, + "step": 261 + }, + { + "epoch": 0.2096, + "grad_norm": 0.4455585318867921, + "learning_rate": 0.00018361203090591071, + "loss": 0.788, + "step": 262 + }, + { + "epoch": 0.2104, + "grad_norm": 0.5064153760953172, + "learning_rate": 0.00018346956301451304, + "loss": 0.807, + "step": 263 + }, + { + "epoch": 0.2112, + "grad_norm": 0.4661251981742707, + "learning_rate": 0.00018332653430452376, + "loss": 0.7484, + "step": 264 + }, + { + "epoch": 0.212, + "grad_norm": 0.3821580778603391, + "learning_rate": 0.00018318294573692985, + "loss": 0.746, + "step": 265 + }, + { + "epoch": 0.2128, + "grad_norm": 0.3831976970751616, + "learning_rate": 0.00018303879827647975, + "loss": 0.7057, + "step": 266 + }, + { + "epoch": 0.2136, + "grad_norm": 0.4022448064622375, + "learning_rate": 0.0001828940928916772, + "loss": 0.7458, + "step": 267 + }, + { + "epoch": 0.2144, + "grad_norm": 0.4934838492023005, + "learning_rate": 0.00018274883055477436, + "loss": 0.8417, + "step": 268 + }, + { + "epoch": 0.2152, + "grad_norm": 0.476188751217707, + "learning_rate": 0.00018260301224176558, + "loss": 0.743, + "step": 269 + }, + { + "epoch": 0.216, + "grad_norm": 0.4567472664299517, + "learning_rate": 0.00018245663893238075, + "loss": 0.7725, + "step": 270 + }, + { + "epoch": 0.2168, + "grad_norm": 0.504907834045924, + "learning_rate": 0.00018230971161007853, + "loss": 0.7827, + "step": 271 + }, + { + "epoch": 0.2176, + "grad_norm": 0.4097416674665615, + "learning_rate": 0.00018216223126204007, + "loss": 0.7228, + "step": 272 + }, + { + "epoch": 0.2184, + "grad_norm": 0.4045760812288052, + "learning_rate": 0.00018201419887916214, + "loss": 0.6967, + "step": 273 + }, + { + "epoch": 0.2192, + "grad_norm": 0.4221629374769461, + "learning_rate": 0.00018186561545605054, + "loss": 0.8001, + "step": 274 + }, + { + "epoch": 0.22, + "grad_norm": 0.44254128881116356, + "learning_rate": 0.00018171648199101346, + "loss": 0.7954, + "step": 275 + }, + { + "epoch": 0.2208, + "grad_norm": 0.44157543567294516, + "learning_rate": 0.00018156679948605467, + "loss": 0.7273, + "step": 276 + }, + { + "epoch": 0.2216, + "grad_norm": 0.3835702807213834, + "learning_rate": 0.00018141656894686689, + "loss": 0.7686, + "step": 277 + }, + { + "epoch": 0.2224, + "grad_norm": 0.4289662272715087, + "learning_rate": 0.00018126579138282503, + "loss": 0.6706, + "step": 278 + }, + { + "epoch": 0.2232, + "grad_norm": 0.43635961138793755, + "learning_rate": 0.00018111446780697929, + "loss": 0.747, + "step": 279 + }, + { + "epoch": 0.224, + "grad_norm": 0.41113903850753764, + "learning_rate": 0.0001809625992360485, + "loss": 0.7771, + "step": 280 + }, + { + "epoch": 0.2248, + "grad_norm": 0.5076809297662583, + "learning_rate": 0.00018081018669041324, + "loss": 0.7254, + "step": 281 + }, + { + "epoch": 0.2256, + "grad_norm": 0.4621183740096553, + "learning_rate": 0.00018065723119410884, + "loss": 0.7807, + "step": 282 + }, + { + "epoch": 0.2264, + "grad_norm": 0.4448200360900046, + "learning_rate": 0.00018050373377481878, + "loss": 0.798, + "step": 283 + }, + { + "epoch": 0.2272, + "grad_norm": 0.42688334948529383, + "learning_rate": 0.00018034969546386757, + "loss": 0.7191, + "step": 284 + }, + { + "epoch": 0.228, + "grad_norm": 0.4427575286048552, + "learning_rate": 0.0001801951172962139, + "loss": 0.7336, + "step": 285 + }, + { + "epoch": 0.2288, + "grad_norm": 0.45215607236247485, + "learning_rate": 0.0001800400003104436, + "loss": 0.7761, + "step": 286 + }, + { + "epoch": 0.2296, + "grad_norm": 0.4216117939443323, + "learning_rate": 0.0001798843455487629, + "loss": 0.7111, + "step": 287 + }, + { + "epoch": 0.2304, + "grad_norm": 0.41841797640236994, + "learning_rate": 0.00017972815405699103, + "loss": 0.7131, + "step": 288 + }, + { + "epoch": 0.2312, + "grad_norm": 0.42424125348843356, + "learning_rate": 0.00017957142688455362, + "loss": 0.7085, + "step": 289 + }, + { + "epoch": 0.232, + "grad_norm": 0.43822645489543594, + "learning_rate": 0.00017941416508447536, + "loss": 0.7565, + "step": 290 + }, + { + "epoch": 0.2328, + "grad_norm": 0.5014000995540826, + "learning_rate": 0.00017925636971337304, + "loss": 0.7768, + "step": 291 + }, + { + "epoch": 0.2336, + "grad_norm": 0.4533902744400488, + "learning_rate": 0.0001790980418314484, + "loss": 0.7577, + "step": 292 + }, + { + "epoch": 0.2344, + "grad_norm": 0.4760129113128871, + "learning_rate": 0.00017893918250248104, + "loss": 0.7964, + "step": 293 + }, + { + "epoch": 0.2352, + "grad_norm": 0.46475141271015147, + "learning_rate": 0.00017877979279382135, + "loss": 0.776, + "step": 294 + }, + { + "epoch": 0.236, + "grad_norm": 0.44577724078727743, + "learning_rate": 0.00017861987377638312, + "loss": 0.6926, + "step": 295 + }, + { + "epoch": 0.2368, + "grad_norm": 0.46018865828232647, + "learning_rate": 0.0001784594265246366, + "loss": 0.7761, + "step": 296 + }, + { + "epoch": 0.2376, + "grad_norm": 0.4273900429531465, + "learning_rate": 0.0001782984521166011, + "loss": 0.7104, + "step": 297 + }, + { + "epoch": 0.2384, + "grad_norm": 0.48720115137203984, + "learning_rate": 0.0001781369516338378, + "loss": 0.6669, + "step": 298 + }, + { + "epoch": 0.2392, + "grad_norm": 0.43724150243758453, + "learning_rate": 0.00017797492616144256, + "loss": 0.7849, + "step": 299 + }, + { + "epoch": 0.24, + "grad_norm": 0.4536456547647474, + "learning_rate": 0.00017781237678803847, + "loss": 0.7314, + "step": 300 + }, + { + "epoch": 0.2408, + "grad_norm": 0.41949648012326524, + "learning_rate": 0.00017764930460576866, + "loss": 0.7841, + "step": 301 + }, + { + "epoch": 0.2416, + "grad_norm": 0.5530323965617054, + "learning_rate": 0.000177485710710289, + "loss": 0.7513, + "step": 302 + }, + { + "epoch": 0.2424, + "grad_norm": 0.41496103490168035, + "learning_rate": 0.00017732159620076053, + "loss": 0.7043, + "step": 303 + }, + { + "epoch": 0.2432, + "grad_norm": 0.4342150998833833, + "learning_rate": 0.00017715696217984235, + "loss": 0.7713, + "step": 304 + }, + { + "epoch": 0.244, + "grad_norm": 0.3836574507427057, + "learning_rate": 0.00017699180975368396, + "loss": 0.695, + "step": 305 + }, + { + "epoch": 0.2448, + "grad_norm": 0.4833306604596649, + "learning_rate": 0.00017682614003191807, + "loss": 0.8012, + "step": 306 + }, + { + "epoch": 0.2456, + "grad_norm": 0.48941071903525935, + "learning_rate": 0.00017665995412765285, + "loss": 0.83, + "step": 307 + }, + { + "epoch": 0.2464, + "grad_norm": 0.45546309598972384, + "learning_rate": 0.00017649325315746478, + "loss": 0.677, + "step": 308 + }, + { + "epoch": 0.2472, + "grad_norm": 0.4273286577487821, + "learning_rate": 0.00017632603824139085, + "loss": 0.7741, + "step": 309 + }, + { + "epoch": 0.248, + "grad_norm": 0.4363221942447534, + "learning_rate": 0.0001761583105029213, + "loss": 0.752, + "step": 310 + }, + { + "epoch": 0.2488, + "grad_norm": 0.4994010117040842, + "learning_rate": 0.0001759900710689918, + "loss": 0.8144, + "step": 311 + }, + { + "epoch": 0.2496, + "grad_norm": 0.4341683399826866, + "learning_rate": 0.00017582132106997616, + "loss": 0.6678, + "step": 312 + }, + { + "epoch": 0.2504, + "grad_norm": 0.401530866349554, + "learning_rate": 0.00017565206163967846, + "loss": 0.7257, + "step": 313 + }, + { + "epoch": 0.2512, + "grad_norm": 0.529625527241754, + "learning_rate": 0.00017548229391532572, + "loss": 0.8088, + "step": 314 + }, + { + "epoch": 0.252, + "grad_norm": 0.39026455230444157, + "learning_rate": 0.00017531201903755994, + "loss": 0.7194, + "step": 315 + }, + { + "epoch": 0.2528, + "grad_norm": 0.42800283073832723, + "learning_rate": 0.00017514123815043074, + "loss": 0.7903, + "step": 316 + }, + { + "epoch": 0.2536, + "grad_norm": 0.3972390258183331, + "learning_rate": 0.00017496995240138744, + "loss": 0.7185, + "step": 317 + }, + { + "epoch": 0.2544, + "grad_norm": 0.3834178092651497, + "learning_rate": 0.00017479816294127152, + "loss": 0.7082, + "step": 318 + }, + { + "epoch": 0.2552, + "grad_norm": 0.4518687861232824, + "learning_rate": 0.00017462587092430875, + "loss": 0.7919, + "step": 319 + }, + { + "epoch": 0.256, + "grad_norm": 0.5346360451967396, + "learning_rate": 0.0001744530775081015, + "loss": 0.8324, + "step": 320 + }, + { + "epoch": 0.2568, + "grad_norm": 0.46979803699729494, + "learning_rate": 0.00017427978385362112, + "loss": 0.8236, + "step": 321 + }, + { + "epoch": 0.2576, + "grad_norm": 0.43694407342820374, + "learning_rate": 0.0001741059911251997, + "loss": 0.7477, + "step": 322 + }, + { + "epoch": 0.2584, + "grad_norm": 0.5095323025630049, + "learning_rate": 0.0001739317004905227, + "loss": 0.8413, + "step": 323 + }, + { + "epoch": 0.2592, + "grad_norm": 0.4968128564994145, + "learning_rate": 0.000173756913120621, + "loss": 0.7885, + "step": 324 + }, + { + "epoch": 0.26, + "grad_norm": 0.5078776182199096, + "learning_rate": 0.00017358163018986282, + "loss": 0.7603, + "step": 325 + }, + { + "epoch": 0.2608, + "grad_norm": 0.4744412438088314, + "learning_rate": 0.00017340585287594604, + "loss": 0.7661, + "step": 326 + }, + { + "epoch": 0.2616, + "grad_norm": 0.4614229691130732, + "learning_rate": 0.00017322958235989016, + "loss": 0.7322, + "step": 327 + }, + { + "epoch": 0.2624, + "grad_norm": 0.46843691236917434, + "learning_rate": 0.0001730528198260285, + "loss": 0.7172, + "step": 328 + }, + { + "epoch": 0.2632, + "grad_norm": 0.4684751645469075, + "learning_rate": 0.00017287556646200018, + "loss": 0.7047, + "step": 329 + }, + { + "epoch": 0.264, + "grad_norm": 0.46630543381550216, + "learning_rate": 0.00017269782345874203, + "loss": 0.73, + "step": 330 + }, + { + "epoch": 0.2648, + "grad_norm": 0.43745535560275245, + "learning_rate": 0.00017251959201048083, + "loss": 0.7629, + "step": 331 + }, + { + "epoch": 0.2656, + "grad_norm": 0.4762020250493979, + "learning_rate": 0.00017234087331472497, + "loss": 0.7868, + "step": 332 + }, + { + "epoch": 0.2664, + "grad_norm": 0.4357948760020524, + "learning_rate": 0.00017216166857225674, + "loss": 0.783, + "step": 333 + }, + { + "epoch": 0.2672, + "grad_norm": 0.436943677523882, + "learning_rate": 0.00017198197898712404, + "loss": 0.7373, + "step": 334 + }, + { + "epoch": 0.268, + "grad_norm": 0.40932891382179964, + "learning_rate": 0.00017180180576663228, + "loss": 0.7202, + "step": 335 + }, + { + "epoch": 0.2688, + "grad_norm": 0.5883861574712437, + "learning_rate": 0.00017162115012133643, + "loss": 0.8278, + "step": 336 + }, + { + "epoch": 0.2696, + "grad_norm": 0.40132402370081177, + "learning_rate": 0.00017144001326503273, + "loss": 0.7263, + "step": 337 + }, + { + "epoch": 0.2704, + "grad_norm": 0.4115217805673836, + "learning_rate": 0.00017125839641475072, + "loss": 0.6755, + "step": 338 + }, + { + "epoch": 0.2712, + "grad_norm": 0.4268351110332894, + "learning_rate": 0.00017107630079074478, + "loss": 0.7103, + "step": 339 + }, + { + "epoch": 0.272, + "grad_norm": 0.4923655416141933, + "learning_rate": 0.00017089372761648616, + "loss": 0.8044, + "step": 340 + }, + { + "epoch": 0.2728, + "grad_norm": 0.4997493249774998, + "learning_rate": 0.00017071067811865476, + "loss": 0.8539, + "step": 341 + }, + { + "epoch": 0.2736, + "grad_norm": 0.48643623176041373, + "learning_rate": 0.00017052715352713075, + "loss": 0.8026, + "step": 342 + }, + { + "epoch": 0.2744, + "grad_norm": 0.4773576785484544, + "learning_rate": 0.00017034315507498635, + "loss": 0.7214, + "step": 343 + }, + { + "epoch": 0.2752, + "grad_norm": 0.39494352193372345, + "learning_rate": 0.00017015868399847768, + "loss": 0.6902, + "step": 344 + }, + { + "epoch": 0.276, + "grad_norm": 0.46984891638247384, + "learning_rate": 0.00016997374153703625, + "loss": 0.7609, + "step": 345 + }, + { + "epoch": 0.2768, + "grad_norm": 0.4333884424966963, + "learning_rate": 0.00016978832893326074, + "loss": 0.6992, + "step": 346 + }, + { + "epoch": 0.2776, + "grad_norm": 0.4040061587158778, + "learning_rate": 0.00016960244743290868, + "loss": 0.6719, + "step": 347 + }, + { + "epoch": 0.2784, + "grad_norm": 0.44559369491003564, + "learning_rate": 0.00016941609828488807, + "loss": 0.6996, + "step": 348 + }, + { + "epoch": 0.2792, + "grad_norm": 0.4549801746358309, + "learning_rate": 0.00016922928274124886, + "loss": 0.7783, + "step": 349 + }, + { + "epoch": 0.28, + "grad_norm": 0.44001321625949863, + "learning_rate": 0.0001690420020571747, + "loss": 0.7176, + "step": 350 + }, + { + "epoch": 0.2808, + "grad_norm": 0.4639773269310298, + "learning_rate": 0.00016885425749097444, + "loss": 0.8004, + "step": 351 + }, + { + "epoch": 0.2816, + "grad_norm": 0.4303955636727868, + "learning_rate": 0.0001686660503040737, + "loss": 0.755, + "step": 352 + }, + { + "epoch": 0.2824, + "grad_norm": 0.4141004937252427, + "learning_rate": 0.00016847738176100632, + "loss": 0.7598, + "step": 353 + }, + { + "epoch": 0.2832, + "grad_norm": 0.43962722102707485, + "learning_rate": 0.00016828825312940592, + "loss": 0.81, + "step": 354 + }, + { + "epoch": 0.284, + "grad_norm": 0.4162715488430212, + "learning_rate": 0.0001680986656799975, + "loss": 0.6627, + "step": 355 + }, + { + "epoch": 0.2848, + "grad_norm": 0.4529540083988486, + "learning_rate": 0.0001679086206865886, + "loss": 0.789, + "step": 356 + }, + { + "epoch": 0.2856, + "grad_norm": 0.42182531250811367, + "learning_rate": 0.00016771811942606108, + "loss": 0.72, + "step": 357 + }, + { + "epoch": 0.2864, + "grad_norm": 0.40127151670189865, + "learning_rate": 0.00016752716317836229, + "loss": 0.734, + "step": 358 + }, + { + "epoch": 0.2872, + "grad_norm": 0.4671780008586571, + "learning_rate": 0.00016733575322649657, + "loss": 0.7381, + "step": 359 + }, + { + "epoch": 0.288, + "grad_norm": 0.5032627114867887, + "learning_rate": 0.0001671438908565167, + "loss": 0.807, + "step": 360 + }, + { + "epoch": 0.2888, + "grad_norm": 0.44473524675660386, + "learning_rate": 0.00016695157735751513, + "loss": 0.7303, + "step": 361 + }, + { + "epoch": 0.2896, + "grad_norm": 0.4162124864646265, + "learning_rate": 0.00016675881402161536, + "loss": 0.7336, + "step": 362 + }, + { + "epoch": 0.2904, + "grad_norm": 0.4449890529639507, + "learning_rate": 0.0001665656021439633, + "loss": 0.7486, + "step": 363 + }, + { + "epoch": 0.2912, + "grad_norm": 0.5237231064323881, + "learning_rate": 0.0001663719430227186, + "loss": 0.7471, + "step": 364 + }, + { + "epoch": 0.292, + "grad_norm": 0.4207160499922226, + "learning_rate": 0.00016617783795904565, + "loss": 0.7261, + "step": 365 + }, + { + "epoch": 0.2928, + "grad_norm": 0.4704045263890853, + "learning_rate": 0.00016598328825710533, + "loss": 0.8208, + "step": 366 + }, + { + "epoch": 0.2936, + "grad_norm": 0.43304544107045895, + "learning_rate": 0.00016578829522404583, + "loss": 0.7635, + "step": 367 + }, + { + "epoch": 0.2944, + "grad_norm": 0.39131061001722545, + "learning_rate": 0.000165592860169994, + "loss": 0.7196, + "step": 368 + }, + { + "epoch": 0.2952, + "grad_norm": 0.37808206725966376, + "learning_rate": 0.00016539698440804661, + "loss": 0.7097, + "step": 369 + }, + { + "epoch": 0.296, + "grad_norm": 0.3743360142983906, + "learning_rate": 0.00016520066925426144, + "loss": 0.7536, + "step": 370 + }, + { + "epoch": 0.2968, + "grad_norm": 0.49000270338630736, + "learning_rate": 0.0001650039160276485, + "loss": 0.7793, + "step": 371 + }, + { + "epoch": 0.2976, + "grad_norm": 0.412541252941152, + "learning_rate": 0.0001648067260501611, + "loss": 0.7385, + "step": 372 + }, + { + "epoch": 0.2984, + "grad_norm": 0.4876990343890208, + "learning_rate": 0.0001646091006466871, + "loss": 0.7508, + "step": 373 + }, + { + "epoch": 0.2992, + "grad_norm": 0.45483122387541086, + "learning_rate": 0.0001644110411450398, + "loss": 0.6547, + "step": 374 + }, + { + "epoch": 0.3, + "grad_norm": 0.43292404787859134, + "learning_rate": 0.00016421254887594917, + "loss": 0.7429, + "step": 375 + }, + { + "epoch": 0.3008, + "grad_norm": 0.5240154554847967, + "learning_rate": 0.00016401362517305296, + "loss": 0.7897, + "step": 376 + }, + { + "epoch": 0.3016, + "grad_norm": 0.39038384511217455, + "learning_rate": 0.00016381427137288754, + "loss": 0.6865, + "step": 377 + }, + { + "epoch": 0.3024, + "grad_norm": 0.38623020339333136, + "learning_rate": 0.00016361448881487914, + "loss": 0.7633, + "step": 378 + }, + { + "epoch": 0.3032, + "grad_norm": 0.429645339971841, + "learning_rate": 0.0001634142788413346, + "loss": 0.7662, + "step": 379 + }, + { + "epoch": 0.304, + "grad_norm": 0.43721601515086034, + "learning_rate": 0.00016321364279743266, + "loss": 0.8308, + "step": 380 + }, + { + "epoch": 0.3048, + "grad_norm": 0.43647395057503274, + "learning_rate": 0.00016301258203121462, + "loss": 0.7686, + "step": 381 + }, + { + "epoch": 0.3056, + "grad_norm": 0.40950006672868094, + "learning_rate": 0.0001628110978935756, + "loss": 0.7398, + "step": 382 + }, + { + "epoch": 0.3064, + "grad_norm": 0.4912920895700108, + "learning_rate": 0.00016260919173825508, + "loss": 0.7458, + "step": 383 + }, + { + "epoch": 0.3072, + "grad_norm": 0.6168180200741212, + "learning_rate": 0.00016240686492182804, + "loss": 0.7912, + "step": 384 + }, + { + "epoch": 0.308, + "grad_norm": 0.41411027495970254, + "learning_rate": 0.00016220411880369601, + "loss": 0.7168, + "step": 385 + }, + { + "epoch": 0.3088, + "grad_norm": 0.4497642149521469, + "learning_rate": 0.00016200095474607753, + "loss": 0.6723, + "step": 386 + }, + { + "epoch": 0.3096, + "grad_norm": 0.4418045102455252, + "learning_rate": 0.00016179737411399926, + "loss": 0.7442, + "step": 387 + }, + { + "epoch": 0.3104, + "grad_norm": 0.4373246281997312, + "learning_rate": 0.00016159337827528685, + "loss": 0.7293, + "step": 388 + }, + { + "epoch": 0.3112, + "grad_norm": 0.47325885066264217, + "learning_rate": 0.00016138896860055555, + "loss": 0.7655, + "step": 389 + }, + { + "epoch": 0.312, + "grad_norm": 0.3807983166517147, + "learning_rate": 0.0001611841464632011, + "loss": 0.6831, + "step": 390 + }, + { + "epoch": 0.3128, + "grad_norm": 0.6715867627560135, + "learning_rate": 0.00016097891323939062, + "loss": 0.7487, + "step": 391 + }, + { + "epoch": 0.3136, + "grad_norm": 0.48371564351939556, + "learning_rate": 0.0001607732703080532, + "loss": 0.7976, + "step": 392 + }, + { + "epoch": 0.3144, + "grad_norm": 0.48130480681229615, + "learning_rate": 0.00016056721905087056, + "loss": 0.7378, + "step": 393 + }, + { + "epoch": 0.3152, + "grad_norm": 0.4617302480330167, + "learning_rate": 0.00016036076085226814, + "loss": 0.7098, + "step": 394 + }, + { + "epoch": 0.316, + "grad_norm": 0.5267134983219407, + "learning_rate": 0.00016015389709940538, + "loss": 0.7405, + "step": 395 + }, + { + "epoch": 0.3168, + "grad_norm": 0.4650556793621377, + "learning_rate": 0.0001599466291821666, + "loss": 0.7808, + "step": 396 + }, + { + "epoch": 0.3176, + "grad_norm": 0.5265641825715518, + "learning_rate": 0.0001597389584931517, + "loss": 0.8145, + "step": 397 + }, + { + "epoch": 0.3184, + "grad_norm": 0.3931228325769163, + "learning_rate": 0.0001595308864276666, + "loss": 0.7726, + "step": 398 + }, + { + "epoch": 0.3192, + "grad_norm": 0.4837066960426388, + "learning_rate": 0.0001593224143837142, + "loss": 0.8093, + "step": 399 + }, + { + "epoch": 0.32, + "grad_norm": 0.4184883413355477, + "learning_rate": 0.0001591135437619847, + "loss": 0.7748, + "step": 400 + }, + { + "epoch": 0.3208, + "grad_norm": 0.39714913018174774, + "learning_rate": 0.00015890427596584617, + "loss": 0.6945, + "step": 401 + }, + { + "epoch": 0.3216, + "grad_norm": 0.4486406663071972, + "learning_rate": 0.0001586946124013354, + "loss": 0.7621, + "step": 402 + }, + { + "epoch": 0.3224, + "grad_norm": 0.4802101093193108, + "learning_rate": 0.00015848455447714822, + "loss": 0.7852, + "step": 403 + }, + { + "epoch": 0.3232, + "grad_norm": 0.5632519786978436, + "learning_rate": 0.0001582741036046301, + "loss": 0.8368, + "step": 404 + }, + { + "epoch": 0.324, + "grad_norm": 0.40690770552546535, + "learning_rate": 0.00015806326119776663, + "loss": 0.7121, + "step": 405 + }, + { + "epoch": 0.3248, + "grad_norm": 0.4167844799568287, + "learning_rate": 0.00015785202867317407, + "loss": 0.7582, + "step": 406 + }, + { + "epoch": 0.3256, + "grad_norm": 0.5272911783056048, + "learning_rate": 0.00015764040745008988, + "loss": 0.7908, + "step": 407 + }, + { + "epoch": 0.3264, + "grad_norm": 0.4056347008128688, + "learning_rate": 0.00015742839895036305, + "loss": 0.6806, + "step": 408 + }, + { + "epoch": 0.3272, + "grad_norm": 0.4653365044749056, + "learning_rate": 0.00015721600459844468, + "loss": 0.7713, + "step": 409 + }, + { + "epoch": 0.328, + "grad_norm": 0.5202471557781018, + "learning_rate": 0.00015700322582137827, + "loss": 0.7944, + "step": 410 + }, + { + "epoch": 0.3288, + "grad_norm": 0.41787271318453006, + "learning_rate": 0.00015679006404879033, + "loss": 0.762, + "step": 411 + }, + { + "epoch": 0.3296, + "grad_norm": 0.3761029147324706, + "learning_rate": 0.0001565765207128805, + "loss": 0.6588, + "step": 412 + }, + { + "epoch": 0.3304, + "grad_norm": 0.4072236756118215, + "learning_rate": 0.00015636259724841222, + "loss": 0.6799, + "step": 413 + }, + { + "epoch": 0.3312, + "grad_norm": 0.43890168022863474, + "learning_rate": 0.0001561482950927029, + "loss": 0.7598, + "step": 414 + }, + { + "epoch": 0.332, + "grad_norm": 0.5269865532585807, + "learning_rate": 0.00015593361568561428, + "loss": 0.7183, + "step": 415 + }, + { + "epoch": 0.3328, + "grad_norm": 0.5142377830763563, + "learning_rate": 0.00015571856046954285, + "loss": 0.7427, + "step": 416 + }, + { + "epoch": 0.3336, + "grad_norm": 0.39680223133582515, + "learning_rate": 0.0001555031308894101, + "loss": 0.6888, + "step": 417 + }, + { + "epoch": 0.3344, + "grad_norm": 0.38605801338535556, + "learning_rate": 0.00015528732839265272, + "loss": 0.7538, + "step": 418 + }, + { + "epoch": 0.3352, + "grad_norm": 0.4312235067932873, + "learning_rate": 0.0001550711544292131, + "loss": 0.7073, + "step": 419 + }, + { + "epoch": 0.336, + "grad_norm": 0.47661865931411446, + "learning_rate": 0.0001548546104515294, + "loss": 0.7361, + "step": 420 + }, + { + "epoch": 0.3368, + "grad_norm": 0.378208127701607, + "learning_rate": 0.00015463769791452574, + "loss": 0.7094, + "step": 421 + }, + { + "epoch": 0.3376, + "grad_norm": 0.40054410067166246, + "learning_rate": 0.00015442041827560274, + "loss": 0.7333, + "step": 422 + }, + { + "epoch": 0.3384, + "grad_norm": 0.4465448155069957, + "learning_rate": 0.00015420277299462736, + "loss": 0.7734, + "step": 423 + }, + { + "epoch": 0.3392, + "grad_norm": 0.3895583553778457, + "learning_rate": 0.00015398476353392323, + "loss": 0.713, + "step": 424 + }, + { + "epoch": 0.34, + "grad_norm": 0.4456224885093965, + "learning_rate": 0.00015376639135826107, + "loss": 0.7199, + "step": 425 + }, + { + "epoch": 0.3408, + "grad_norm": 0.3883694852647505, + "learning_rate": 0.00015354765793484834, + "loss": 0.7291, + "step": 426 + }, + { + "epoch": 0.3416, + "grad_norm": 0.39948802584076715, + "learning_rate": 0.00015332856473331978, + "loss": 0.695, + "step": 427 + }, + { + "epoch": 0.3424, + "grad_norm": 0.38137650151076147, + "learning_rate": 0.00015310911322572753, + "loss": 0.7001, + "step": 428 + }, + { + "epoch": 0.3432, + "grad_norm": 0.4668339041021015, + "learning_rate": 0.00015288930488653094, + "loss": 0.7842, + "step": 429 + }, + { + "epoch": 0.344, + "grad_norm": 0.4535718625573633, + "learning_rate": 0.000152669141192587, + "loss": 0.6747, + "step": 430 + }, + { + "epoch": 0.3448, + "grad_norm": 0.4648477500228524, + "learning_rate": 0.0001524486236231402, + "loss": 0.6815, + "step": 431 + }, + { + "epoch": 0.3456, + "grad_norm": 0.47199304890528454, + "learning_rate": 0.00015222775365981273, + "loss": 0.7611, + "step": 432 + }, + { + "epoch": 0.3464, + "grad_norm": 0.4099428591875097, + "learning_rate": 0.00015200653278659432, + "loss": 0.6938, + "step": 433 + }, + { + "epoch": 0.3472, + "grad_norm": 0.4535611989804537, + "learning_rate": 0.00015178496248983254, + "loss": 0.7348, + "step": 434 + }, + { + "epoch": 0.348, + "grad_norm": 0.5035535828466178, + "learning_rate": 0.00015156304425822267, + "loss": 0.7362, + "step": 435 + }, + { + "epoch": 0.3488, + "grad_norm": 0.42283019750423534, + "learning_rate": 0.00015134077958279765, + "loss": 0.7185, + "step": 436 + }, + { + "epoch": 0.3496, + "grad_norm": 0.4838752327509362, + "learning_rate": 0.00015111816995691809, + "loss": 0.7388, + "step": 437 + }, + { + "epoch": 0.3504, + "grad_norm": 0.42024802280700807, + "learning_rate": 0.00015089521687626243, + "loss": 0.7356, + "step": 438 + }, + { + "epoch": 0.3512, + "grad_norm": 0.4160541380585408, + "learning_rate": 0.00015067192183881658, + "loss": 0.7701, + "step": 439 + }, + { + "epoch": 0.352, + "grad_norm": 0.3788347031877241, + "learning_rate": 0.000150448286344864, + "loss": 0.7325, + "step": 440 + }, + { + "epoch": 0.3528, + "grad_norm": 0.38735818692185703, + "learning_rate": 0.00015022431189697568, + "loss": 0.6893, + "step": 441 + }, + { + "epoch": 0.3536, + "grad_norm": 0.3597178476469672, + "learning_rate": 0.00015000000000000001, + "loss": 0.6976, + "step": 442 + }, + { + "epoch": 0.3544, + "grad_norm": 0.4178694077843292, + "learning_rate": 0.0001497753521610526, + "loss": 0.7257, + "step": 443 + }, + { + "epoch": 0.3552, + "grad_norm": 0.41761446690467124, + "learning_rate": 0.00014955036988950618, + "loss": 0.7874, + "step": 444 + }, + { + "epoch": 0.356, + "grad_norm": 0.42679300828782424, + "learning_rate": 0.00014932505469698052, + "loss": 0.7151, + "step": 445 + }, + { + "epoch": 0.3568, + "grad_norm": 0.39175218883611723, + "learning_rate": 0.00014909940809733222, + "loss": 0.686, + "step": 446 + }, + { + "epoch": 0.3576, + "grad_norm": 0.4563357236112232, + "learning_rate": 0.0001488734316066446, + "loss": 0.8581, + "step": 447 + }, + { + "epoch": 0.3584, + "grad_norm": 0.38812383097666997, + "learning_rate": 0.00014864712674321734, + "loss": 0.6779, + "step": 448 + }, + { + "epoch": 0.3592, + "grad_norm": 0.38277783873384447, + "learning_rate": 0.0001484204950275565, + "loss": 0.7104, + "step": 449 + }, + { + "epoch": 0.36, + "grad_norm": 0.42620266377459964, + "learning_rate": 0.00014819353798236427, + "loss": 0.77, + "step": 450 + }, + { + "epoch": 0.3608, + "grad_norm": 0.43257156599828467, + "learning_rate": 0.00014796625713252848, + "loss": 0.7269, + "step": 451 + }, + { + "epoch": 0.3616, + "grad_norm": 0.4049971816820278, + "learning_rate": 0.00014773865400511272, + "loss": 0.6721, + "step": 452 + }, + { + "epoch": 0.3624, + "grad_norm": 0.41157857479213467, + "learning_rate": 0.00014751073012934587, + "loss": 0.7696, + "step": 453 + }, + { + "epoch": 0.3632, + "grad_norm": 0.4704422303871149, + "learning_rate": 0.00014728248703661182, + "loss": 0.7603, + "step": 454 + }, + { + "epoch": 0.364, + "grad_norm": 0.48366105528864356, + "learning_rate": 0.0001470539262604393, + "loss": 0.7488, + "step": 455 + }, + { + "epoch": 0.3648, + "grad_norm": 0.3943341335087824, + "learning_rate": 0.00014682504933649144, + "loss": 0.7683, + "step": 456 + }, + { + "epoch": 0.3656, + "grad_norm": 0.4132128850022526, + "learning_rate": 0.00014659585780255556, + "loss": 0.7668, + "step": 457 + }, + { + "epoch": 0.3664, + "grad_norm": 0.541500045682358, + "learning_rate": 0.00014636635319853275, + "loss": 0.8034, + "step": 458 + }, + { + "epoch": 0.3672, + "grad_norm": 0.40760376296989886, + "learning_rate": 0.0001461365370664276, + "loss": 0.7051, + "step": 459 + }, + { + "epoch": 0.368, + "grad_norm": 0.46584302551593115, + "learning_rate": 0.00014590641095033787, + "loss": 0.7822, + "step": 460 + }, + { + "epoch": 0.3688, + "grad_norm": 0.4537814273231918, + "learning_rate": 0.00014567597639644387, + "loss": 0.7745, + "step": 461 + }, + { + "epoch": 0.3696, + "grad_norm": 0.3929636636729761, + "learning_rate": 0.00014544523495299842, + "loss": 0.7179, + "step": 462 + }, + { + "epoch": 0.3704, + "grad_norm": 0.39104933951566373, + "learning_rate": 0.00014521418817031628, + "loss": 0.7321, + "step": 463 + }, + { + "epoch": 0.3712, + "grad_norm": 0.9724579030680056, + "learning_rate": 0.0001449828376007636, + "loss": 0.6767, + "step": 464 + }, + { + "epoch": 0.372, + "grad_norm": 0.3937525902011488, + "learning_rate": 0.00014475118479874774, + "loss": 0.7209, + "step": 465 + }, + { + "epoch": 0.3728, + "grad_norm": 0.48600652817492457, + "learning_rate": 0.0001445192313207067, + "loss": 0.7702, + "step": 466 + }, + { + "epoch": 0.3736, + "grad_norm": 0.4732081299865097, + "learning_rate": 0.0001442869787250987, + "loss": 0.7576, + "step": 467 + }, + { + "epoch": 0.3744, + "grad_norm": 0.37407767109013385, + "learning_rate": 0.0001440544285723915, + "loss": 0.6783, + "step": 468 + }, + { + "epoch": 0.3752, + "grad_norm": 0.45168072106832896, + "learning_rate": 0.00014382158242505234, + "loss": 0.7194, + "step": 469 + }, + { + "epoch": 0.376, + "grad_norm": 0.4584354719992922, + "learning_rate": 0.00014358844184753712, + "loss": 0.7765, + "step": 470 + }, + { + "epoch": 0.3768, + "grad_norm": 0.39354936124716017, + "learning_rate": 0.00014335500840627986, + "loss": 0.7156, + "step": 471 + }, + { + "epoch": 0.3776, + "grad_norm": 0.42255891436963455, + "learning_rate": 0.00014312128366968243, + "loss": 0.7169, + "step": 472 + }, + { + "epoch": 0.3784, + "grad_norm": 0.37187942916048466, + "learning_rate": 0.0001428872692081038, + "loss": 0.6497, + "step": 473 + }, + { + "epoch": 0.3792, + "grad_norm": 0.45687994821533223, + "learning_rate": 0.00014265296659384956, + "loss": 0.7141, + "step": 474 + }, + { + "epoch": 0.38, + "grad_norm": 0.36445282527599093, + "learning_rate": 0.00014241837740116132, + "loss": 0.7537, + "step": 475 + }, + { + "epoch": 0.3808, + "grad_norm": 0.4005479130992262, + "learning_rate": 0.00014218350320620624, + "loss": 0.7476, + "step": 476 + }, + { + "epoch": 0.3816, + "grad_norm": 0.46674310582409684, + "learning_rate": 0.00014194834558706632, + "loss": 0.6526, + "step": 477 + }, + { + "epoch": 0.3824, + "grad_norm": 0.4260748722860748, + "learning_rate": 0.0001417129061237278, + "loss": 0.69, + "step": 478 + }, + { + "epoch": 0.3832, + "grad_norm": 0.46437117291237945, + "learning_rate": 0.0001414771863980707, + "loss": 0.6467, + "step": 479 + }, + { + "epoch": 0.384, + "grad_norm": 0.4415056988395311, + "learning_rate": 0.00014124118799385796, + "loss": 0.736, + "step": 480 + }, + { + "epoch": 0.3848, + "grad_norm": 0.4908767661708374, + "learning_rate": 0.00014100491249672498, + "loss": 0.7134, + "step": 481 + }, + { + "epoch": 0.3856, + "grad_norm": 0.47859834363322507, + "learning_rate": 0.00014076836149416887, + "loss": 0.7785, + "step": 482 + }, + { + "epoch": 0.3864, + "grad_norm": 0.47439178111884583, + "learning_rate": 0.0001405315365755379, + "loss": 0.7899, + "step": 483 + }, + { + "epoch": 0.3872, + "grad_norm": 0.4412734441330705, + "learning_rate": 0.0001402944393320206, + "loss": 0.7184, + "step": 484 + }, + { + "epoch": 0.388, + "grad_norm": 0.431771660931234, + "learning_rate": 0.00014005707135663527, + "loss": 0.7624, + "step": 485 + }, + { + "epoch": 0.3888, + "grad_norm": 0.43896564564751067, + "learning_rate": 0.00013981943424421932, + "loss": 0.7308, + "step": 486 + }, + { + "epoch": 0.3896, + "grad_norm": 0.40217102245700914, + "learning_rate": 0.00013958152959141825, + "loss": 0.7277, + "step": 487 + }, + { + "epoch": 0.3904, + "grad_norm": 0.4389039155421663, + "learning_rate": 0.00013934335899667527, + "loss": 0.7001, + "step": 488 + }, + { + "epoch": 0.3912, + "grad_norm": 0.38581682640274634, + "learning_rate": 0.00013910492406022033, + "loss": 0.6617, + "step": 489 + }, + { + "epoch": 0.392, + "grad_norm": 0.4079475204598973, + "learning_rate": 0.00013886622638405952, + "loss": 0.6675, + "step": 490 + }, + { + "epoch": 0.3928, + "grad_norm": 0.5585268315795652, + "learning_rate": 0.0001386272675719642, + "loss": 0.7166, + "step": 491 + }, + { + "epoch": 0.3936, + "grad_norm": 0.5022464531472589, + "learning_rate": 0.00013838804922946027, + "loss": 0.8057, + "step": 492 + }, + { + "epoch": 0.3944, + "grad_norm": 0.394046028658977, + "learning_rate": 0.00013814857296381728, + "loss": 0.7622, + "step": 493 + }, + { + "epoch": 0.3952, + "grad_norm": 0.44257697534003737, + "learning_rate": 0.00013790884038403795, + "loss": 0.6819, + "step": 494 + }, + { + "epoch": 0.396, + "grad_norm": 0.38307503044539076, + "learning_rate": 0.00013766885310084688, + "loss": 0.7019, + "step": 495 + }, + { + "epoch": 0.3968, + "grad_norm": 0.41807116882005585, + "learning_rate": 0.00013742861272668012, + "loss": 0.7152, + "step": 496 + }, + { + "epoch": 0.3976, + "grad_norm": 0.45290570075320474, + "learning_rate": 0.00013718812087567414, + "loss": 0.7348, + "step": 497 + }, + { + "epoch": 0.3984, + "grad_norm": 0.3925236397704885, + "learning_rate": 0.00013694737916365517, + "loss": 0.6861, + "step": 498 + }, + { + "epoch": 0.3992, + "grad_norm": 0.45149071314815764, + "learning_rate": 0.000136706389208128, + "loss": 0.6878, + "step": 499 + }, + { + "epoch": 0.4, + "grad_norm": 0.4071222732548133, + "learning_rate": 0.00013646515262826552, + "loss": 0.7811, + "step": 500 + }, + { + "epoch": 0.4008, + "grad_norm": 0.4421342869463711, + "learning_rate": 0.00013622367104489756, + "loss": 0.7167, + "step": 501 + }, + { + "epoch": 0.4016, + "grad_norm": 0.5760793900115654, + "learning_rate": 0.0001359819460805001, + "loss": 0.7143, + "step": 502 + }, + { + "epoch": 0.4024, + "grad_norm": 0.3872733363864618, + "learning_rate": 0.0001357399793591844, + "loss": 0.7523, + "step": 503 + }, + { + "epoch": 0.4032, + "grad_norm": 0.43872689780567015, + "learning_rate": 0.0001354977725066859, + "loss": 0.7063, + "step": 504 + }, + { + "epoch": 0.404, + "grad_norm": 0.38939207805743736, + "learning_rate": 0.00013525532715035366, + "loss": 0.7535, + "step": 505 + }, + { + "epoch": 0.4048, + "grad_norm": 0.44352116285817117, + "learning_rate": 0.00013501264491913906, + "loss": 0.7692, + "step": 506 + }, + { + "epoch": 0.4056, + "grad_norm": 0.49678791584464294, + "learning_rate": 0.00013476972744358507, + "loss": 0.7666, + "step": 507 + }, + { + "epoch": 0.4064, + "grad_norm": 0.4085049271929804, + "learning_rate": 0.0001345265763558152, + "loss": 0.749, + "step": 508 + }, + { + "epoch": 0.4072, + "grad_norm": 0.40304066590545856, + "learning_rate": 0.00013428319328952253, + "loss": 0.7113, + "step": 509 + }, + { + "epoch": 0.408, + "grad_norm": 0.43042250935552345, + "learning_rate": 0.00013403957987995882, + "loss": 0.756, + "step": 510 + }, + { + "epoch": 0.4088, + "grad_norm": 0.4147908128865509, + "learning_rate": 0.0001337957377639235, + "loss": 0.725, + "step": 511 + }, + { + "epoch": 0.4096, + "grad_norm": 0.37609010502624507, + "learning_rate": 0.0001335516685797525, + "loss": 0.6754, + "step": 512 + }, + { + "epoch": 0.4104, + "grad_norm": 0.40850172243478794, + "learning_rate": 0.0001333073739673076, + "loss": 0.7368, + "step": 513 + }, + { + "epoch": 0.4112, + "grad_norm": 0.44390614168079706, + "learning_rate": 0.00013306285556796495, + "loss": 0.7222, + "step": 514 + }, + { + "epoch": 0.412, + "grad_norm": 0.4120377036632598, + "learning_rate": 0.0001328181150246045, + "loss": 0.7087, + "step": 515 + }, + { + "epoch": 0.4128, + "grad_norm": 0.5573398188129507, + "learning_rate": 0.00013257315398159864, + "loss": 0.758, + "step": 516 + }, + { + "epoch": 0.4136, + "grad_norm": 0.4200468433710076, + "learning_rate": 0.00013232797408480127, + "loss": 0.7227, + "step": 517 + }, + { + "epoch": 0.4144, + "grad_norm": 0.38446339270571694, + "learning_rate": 0.00013208257698153677, + "loss": 0.6964, + "step": 518 + }, + { + "epoch": 0.4152, + "grad_norm": 0.4306117384300308, + "learning_rate": 0.00013183696432058888, + "loss": 0.6973, + "step": 519 + }, + { + "epoch": 0.416, + "grad_norm": 0.40291091663964096, + "learning_rate": 0.00013159113775218964, + "loss": 0.6936, + "step": 520 + }, + { + "epoch": 0.4168, + "grad_norm": 0.4488480953601367, + "learning_rate": 0.00013134509892800822, + "loss": 0.7487, + "step": 521 + }, + { + "epoch": 0.4176, + "grad_norm": 0.5513524541003362, + "learning_rate": 0.00013109884950114007, + "loss": 0.6943, + "step": 522 + }, + { + "epoch": 0.4184, + "grad_norm": 0.40768522673540947, + "learning_rate": 0.00013085239112609547, + "loss": 0.7154, + "step": 523 + }, + { + "epoch": 0.4192, + "grad_norm": 0.4077524614993159, + "learning_rate": 0.00013060572545878875, + "loss": 0.707, + "step": 524 + }, + { + "epoch": 0.42, + "grad_norm": 0.4159916451775266, + "learning_rate": 0.00013035885415652685, + "loss": 0.7144, + "step": 525 + }, + { + "epoch": 0.4208, + "grad_norm": 0.3956708039094935, + "learning_rate": 0.00013011177887799845, + "loss": 0.6597, + "step": 526 + }, + { + "epoch": 0.4216, + "grad_norm": 0.4483404590623498, + "learning_rate": 0.00012986450128326266, + "loss": 0.7989, + "step": 527 + }, + { + "epoch": 0.4224, + "grad_norm": 0.39627321979143537, + "learning_rate": 0.00012961702303373795, + "loss": 0.697, + "step": 528 + }, + { + "epoch": 0.4232, + "grad_norm": 0.4331520131916862, + "learning_rate": 0.00012936934579219094, + "loss": 0.7778, + "step": 529 + }, + { + "epoch": 0.424, + "grad_norm": 0.4148175928410438, + "learning_rate": 0.00012912147122272523, + "loss": 0.7011, + "step": 530 + }, + { + "epoch": 0.4248, + "grad_norm": 0.4003042927395414, + "learning_rate": 0.00012887340099077024, + "loss": 0.7136, + "step": 531 + }, + { + "epoch": 0.4256, + "grad_norm": 0.3720842311567061, + "learning_rate": 0.00012862513676307008, + "loss": 0.689, + "step": 532 + }, + { + "epoch": 0.4264, + "grad_norm": 0.40789792375893974, + "learning_rate": 0.0001283766802076722, + "loss": 0.7181, + "step": 533 + }, + { + "epoch": 0.4272, + "grad_norm": 0.4844489312487811, + "learning_rate": 0.00012812803299391628, + "loss": 0.7413, + "step": 534 + }, + { + "epoch": 0.428, + "grad_norm": 0.3701294279514779, + "learning_rate": 0.00012787919679242306, + "loss": 0.6729, + "step": 535 + }, + { + "epoch": 0.4288, + "grad_norm": 0.4000085713509856, + "learning_rate": 0.00012763017327508305, + "loss": 0.6832, + "step": 536 + }, + { + "epoch": 0.4296, + "grad_norm": 0.4672720327002756, + "learning_rate": 0.00012738096411504522, + "loss": 0.6796, + "step": 537 + }, + { + "epoch": 0.4304, + "grad_norm": 0.44961852455501133, + "learning_rate": 0.0001271315709867059, + "loss": 0.7298, + "step": 538 + }, + { + "epoch": 0.4312, + "grad_norm": 0.41438508218575254, + "learning_rate": 0.00012688199556569753, + "loss": 0.7158, + "step": 539 + }, + { + "epoch": 0.432, + "grad_norm": 0.4325399033022962, + "learning_rate": 0.00012663223952887723, + "loss": 0.7777, + "step": 540 + }, + { + "epoch": 0.4328, + "grad_norm": 0.5053188784919949, + "learning_rate": 0.0001263823045543158, + "loss": 0.7933, + "step": 541 + }, + { + "epoch": 0.4336, + "grad_norm": 0.38659410950521067, + "learning_rate": 0.00012613219232128608, + "loss": 0.7245, + "step": 542 + }, + { + "epoch": 0.4344, + "grad_norm": 0.38715476585865544, + "learning_rate": 0.00012588190451025207, + "loss": 0.6754, + "step": 543 + }, + { + "epoch": 0.4352, + "grad_norm": 0.4747342319133257, + "learning_rate": 0.00012563144280285741, + "loss": 0.7865, + "step": 544 + }, + { + "epoch": 0.436, + "grad_norm": 0.35438085675639874, + "learning_rate": 0.00012538080888191408, + "loss": 0.681, + "step": 545 + }, + { + "epoch": 0.4368, + "grad_norm": 0.44508203027244037, + "learning_rate": 0.00012513000443139112, + "loss": 0.7308, + "step": 546 + }, + { + "epoch": 0.4376, + "grad_norm": 0.38735676030249233, + "learning_rate": 0.00012487903113640337, + "loss": 0.6808, + "step": 547 + }, + { + "epoch": 0.4384, + "grad_norm": 0.3872391895733674, + "learning_rate": 0.00012462789068320017, + "loss": 0.6845, + "step": 548 + }, + { + "epoch": 0.4392, + "grad_norm": 0.3620989909229493, + "learning_rate": 0.00012437658475915377, + "loss": 0.6751, + "step": 549 + }, + { + "epoch": 0.44, + "grad_norm": 0.43790438606046156, + "learning_rate": 0.00012412511505274844, + "loss": 0.6791, + "step": 550 + }, + { + "epoch": 0.4408, + "grad_norm": 0.48366803986218093, + "learning_rate": 0.00012387348325356874, + "loss": 0.7287, + "step": 551 + }, + { + "epoch": 0.4416, + "grad_norm": 0.42143361862716416, + "learning_rate": 0.00012362169105228826, + "loss": 0.6682, + "step": 552 + }, + { + "epoch": 0.4424, + "grad_norm": 0.4585085971230449, + "learning_rate": 0.00012336974014065844, + "loss": 0.7668, + "step": 553 + }, + { + "epoch": 0.4432, + "grad_norm": 0.3789995384101326, + "learning_rate": 0.000123117632211497, + "loss": 0.6827, + "step": 554 + }, + { + "epoch": 0.444, + "grad_norm": 0.4429889133762628, + "learning_rate": 0.00012286536895867654, + "loss": 0.6507, + "step": 555 + }, + { + "epoch": 0.4448, + "grad_norm": 0.357270318009767, + "learning_rate": 0.00012261295207711346, + "loss": 0.6593, + "step": 556 + }, + { + "epoch": 0.4456, + "grad_norm": 0.3634325493084814, + "learning_rate": 0.00012236038326275626, + "loss": 0.7329, + "step": 557 + }, + { + "epoch": 0.4464, + "grad_norm": 0.4275663523962742, + "learning_rate": 0.0001221076642125742, + "loss": 0.6958, + "step": 558 + }, + { + "epoch": 0.4472, + "grad_norm": 0.4778306138347968, + "learning_rate": 0.00012185479662454595, + "loss": 0.7725, + "step": 559 + }, + { + "epoch": 0.448, + "grad_norm": 0.41254801675174013, + "learning_rate": 0.00012160178219764837, + "loss": 0.7217, + "step": 560 + }, + { + "epoch": 0.4488, + "grad_norm": 0.44175419694261003, + "learning_rate": 0.00012134862263184467, + "loss": 0.7179, + "step": 561 + }, + { + "epoch": 0.4496, + "grad_norm": 0.4516574575775512, + "learning_rate": 0.00012109531962807332, + "loss": 0.652, + "step": 562 + }, + { + "epoch": 0.4504, + "grad_norm": 0.44152357206659537, + "learning_rate": 0.00012084187488823657, + "loss": 0.7973, + "step": 563 + }, + { + "epoch": 0.4512, + "grad_norm": 0.44010711894728904, + "learning_rate": 0.00012058829011518896, + "loss": 0.7573, + "step": 564 + }, + { + "epoch": 0.452, + "grad_norm": 0.4030181690076236, + "learning_rate": 0.00012033456701272576, + "loss": 0.6636, + "step": 565 + }, + { + "epoch": 0.4528, + "grad_norm": 0.3713858272541694, + "learning_rate": 0.00012008070728557186, + "loss": 0.7753, + "step": 566 + }, + { + "epoch": 0.4536, + "grad_norm": 0.3852877383621456, + "learning_rate": 0.00011982671263936995, + "loss": 0.7468, + "step": 567 + }, + { + "epoch": 0.4544, + "grad_norm": 0.3971656730292044, + "learning_rate": 0.00011957258478066931, + "loss": 0.6555, + "step": 568 + }, + { + "epoch": 0.4552, + "grad_norm": 0.412677326432461, + "learning_rate": 0.00011931832541691418, + "loss": 0.7235, + "step": 569 + }, + { + "epoch": 0.456, + "grad_norm": 0.43671968280578716, + "learning_rate": 0.00011906393625643244, + "loss": 0.7513, + "step": 570 + }, + { + "epoch": 0.4568, + "grad_norm": 0.5079628329497556, + "learning_rate": 0.00011880941900842397, + "loss": 0.7377, + "step": 571 + }, + { + "epoch": 0.4576, + "grad_norm": 0.3922201059679018, + "learning_rate": 0.00011855477538294935, + "loss": 0.6935, + "step": 572 + }, + { + "epoch": 0.4584, + "grad_norm": 0.40716418915542074, + "learning_rate": 0.00011830000709091815, + "loss": 0.7205, + "step": 573 + }, + { + "epoch": 0.4592, + "grad_norm": 0.4664659743807478, + "learning_rate": 0.00011804511584407763, + "loss": 0.7451, + "step": 574 + }, + { + "epoch": 0.46, + "grad_norm": 0.44727220145677477, + "learning_rate": 0.0001177901033550012, + "loss": 0.7352, + "step": 575 + }, + { + "epoch": 0.4608, + "grad_norm": 0.40755267128384315, + "learning_rate": 0.00011753497133707679, + "loss": 0.6837, + "step": 576 + }, + { + "epoch": 0.4616, + "grad_norm": 0.46850145166036933, + "learning_rate": 0.00011727972150449544, + "loss": 0.7688, + "step": 577 + }, + { + "epoch": 0.4624, + "grad_norm": 0.3981627024681664, + "learning_rate": 0.00011702435557223987, + "loss": 0.6919, + "step": 578 + }, + { + "epoch": 0.4632, + "grad_norm": 0.45156301666489845, + "learning_rate": 0.00011676887525607271, + "loss": 0.6625, + "step": 579 + }, + { + "epoch": 0.464, + "grad_norm": 0.35755911566580956, + "learning_rate": 0.00011651328227252517, + "loss": 0.6978, + "step": 580 + }, + { + "epoch": 0.4648, + "grad_norm": 0.3835079024265838, + "learning_rate": 0.00011625757833888551, + "loss": 0.6633, + "step": 581 + }, + { + "epoch": 0.4656, + "grad_norm": 0.47477190931220487, + "learning_rate": 0.00011600176517318741, + "loss": 0.7835, + "step": 582 + }, + { + "epoch": 0.4664, + "grad_norm": 0.3891714455141602, + "learning_rate": 0.0001157458444941984, + "loss": 0.672, + "step": 583 + }, + { + "epoch": 0.4672, + "grad_norm": 0.3781377080874938, + "learning_rate": 0.00011548981802140848, + "loss": 0.7253, + "step": 584 + }, + { + "epoch": 0.468, + "grad_norm": 0.4411229793123389, + "learning_rate": 0.00011523368747501839, + "loss": 0.7146, + "step": 585 + }, + { + "epoch": 0.4688, + "grad_norm": 0.37640745820873606, + "learning_rate": 0.00011497745457592816, + "loss": 0.7109, + "step": 586 + }, + { + "epoch": 0.4696, + "grad_norm": 0.41310883543295873, + "learning_rate": 0.00011472112104572547, + "loss": 0.6555, + "step": 587 + }, + { + "epoch": 0.4704, + "grad_norm": 0.3918784302374364, + "learning_rate": 0.00011446468860667421, + "loss": 0.6954, + "step": 588 + }, + { + "epoch": 0.4712, + "grad_norm": 0.37616228208789243, + "learning_rate": 0.0001142081589817027, + "loss": 0.7034, + "step": 589 + }, + { + "epoch": 0.472, + "grad_norm": 0.44071304271586675, + "learning_rate": 0.00011395153389439233, + "loss": 0.6857, + "step": 590 + }, + { + "epoch": 0.4728, + "grad_norm": 0.4202177258854724, + "learning_rate": 0.00011369481506896582, + "loss": 0.628, + "step": 591 + }, + { + "epoch": 0.4736, + "grad_norm": 0.4041060891580057, + "learning_rate": 0.00011343800423027582, + "loss": 0.6533, + "step": 592 + }, + { + "epoch": 0.4744, + "grad_norm": 0.4756951270911055, + "learning_rate": 0.00011318110310379301, + "loss": 0.7524, + "step": 593 + }, + { + "epoch": 0.4752, + "grad_norm": 0.45831114124482203, + "learning_rate": 0.0001129241134155949, + "loss": 0.7382, + "step": 594 + }, + { + "epoch": 0.476, + "grad_norm": 0.3932523154677725, + "learning_rate": 0.00011266703689235394, + "loss": 0.6842, + "step": 595 + }, + { + "epoch": 0.4768, + "grad_norm": 0.4358429001327487, + "learning_rate": 0.00011240987526132594, + "loss": 0.6159, + "step": 596 + }, + { + "epoch": 0.4776, + "grad_norm": 0.3655943811613015, + "learning_rate": 0.00011215263025033869, + "loss": 0.6671, + "step": 597 + }, + { + "epoch": 0.4784, + "grad_norm": 0.4399673893957657, + "learning_rate": 0.00011189530358778005, + "loss": 0.7069, + "step": 598 + }, + { + "epoch": 0.4792, + "grad_norm": 0.43456410694718184, + "learning_rate": 0.00011163789700258655, + "loss": 0.7314, + "step": 599 + }, + { + "epoch": 0.48, + "grad_norm": 0.4633362962833651, + "learning_rate": 0.00011138041222423177, + "loss": 0.693, + "step": 600 + }, + { + "epoch": 0.4808, + "grad_norm": 0.46861443447088086, + "learning_rate": 0.00011112285098271451, + "loss": 0.7057, + "step": 601 + }, + { + "epoch": 0.4816, + "grad_norm": 0.42640314638531535, + "learning_rate": 0.00011086521500854745, + "loss": 0.6459, + "step": 602 + }, + { + "epoch": 0.4824, + "grad_norm": 0.448308170960733, + "learning_rate": 0.00011060750603274535, + "loss": 0.7176, + "step": 603 + }, + { + "epoch": 0.4832, + "grad_norm": 0.39875915954725155, + "learning_rate": 0.00011034972578681338, + "loss": 0.7211, + "step": 604 + }, + { + "epoch": 0.484, + "grad_norm": 0.37935878850667626, + "learning_rate": 0.00011009187600273566, + "loss": 0.6682, + "step": 605 + }, + { + "epoch": 0.4848, + "grad_norm": 0.4263246375252377, + "learning_rate": 0.00010983395841296348, + "loss": 0.6852, + "step": 606 + }, + { + "epoch": 0.4856, + "grad_norm": 0.4102525106845525, + "learning_rate": 0.00010957597475040373, + "loss": 0.6988, + "step": 607 + }, + { + "epoch": 0.4864, + "grad_norm": 0.38935815027258963, + "learning_rate": 0.00010931792674840718, + "loss": 0.678, + "step": 608 + }, + { + "epoch": 0.4872, + "grad_norm": 0.4754577297673937, + "learning_rate": 0.00010905981614075693, + "loss": 0.7183, + "step": 609 + }, + { + "epoch": 0.488, + "grad_norm": 0.34456311695968816, + "learning_rate": 0.00010880164466165674, + "loss": 0.6265, + "step": 610 + }, + { + "epoch": 0.4888, + "grad_norm": 0.4376235627193579, + "learning_rate": 0.00010854341404571928, + "loss": 0.7115, + "step": 611 + }, + { + "epoch": 0.4896, + "grad_norm": 0.5207658620128204, + "learning_rate": 0.00010828512602795462, + "loss": 0.7223, + "step": 612 + }, + { + "epoch": 0.4904, + "grad_norm": 0.8435111329515005, + "learning_rate": 0.00010802678234375851, + "loss": 0.647, + "step": 613 + }, + { + "epoch": 0.4912, + "grad_norm": 0.39473901931373223, + "learning_rate": 0.00010776838472890065, + "loss": 0.6482, + "step": 614 + }, + { + "epoch": 0.492, + "grad_norm": 0.41239851384583864, + "learning_rate": 0.0001075099349195131, + "loss": 0.7124, + "step": 615 + }, + { + "epoch": 0.4928, + "grad_norm": 0.3667110865340065, + "learning_rate": 0.00010725143465207867, + "loss": 0.6974, + "step": 616 + }, + { + "epoch": 0.4936, + "grad_norm": 0.36671074295549133, + "learning_rate": 0.00010699288566341914, + "loss": 0.6495, + "step": 617 + }, + { + "epoch": 0.4944, + "grad_norm": 0.3682874881376714, + "learning_rate": 0.00010673428969068364, + "loss": 0.6686, + "step": 618 + }, + { + "epoch": 0.4952, + "grad_norm": 0.3849121179416742, + "learning_rate": 0.000106475648471337, + "loss": 0.6678, + "step": 619 + }, + { + "epoch": 0.496, + "grad_norm": 0.36527401278127675, + "learning_rate": 0.00010621696374314807, + "loss": 0.7052, + "step": 620 + }, + { + "epoch": 0.4968, + "grad_norm": 0.4028325536586198, + "learning_rate": 0.00010595823724417795, + "loss": 0.7029, + "step": 621 + }, + { + "epoch": 0.4976, + "grad_norm": 0.81310474354593, + "learning_rate": 0.00010569947071276847, + "loss": 0.7142, + "step": 622 + }, + { + "epoch": 0.4984, + "grad_norm": 0.4460387883908326, + "learning_rate": 0.00010544066588753044, + "loss": 0.6854, + "step": 623 + }, + { + "epoch": 0.4992, + "grad_norm": 0.4225285883984137, + "learning_rate": 0.00010518182450733186, + "loss": 0.6885, + "step": 624 + }, + { + "epoch": 0.5, + "grad_norm": 0.43775957404339033, + "learning_rate": 0.00010492294831128641, + "loss": 0.7115, + "step": 625 + }, + { + "epoch": 0.5008, + "grad_norm": 0.4071503468095915, + "learning_rate": 0.00010466403903874176, + "loss": 0.7081, + "step": 626 + }, + { + "epoch": 0.5016, + "grad_norm": 0.4072514865834203, + "learning_rate": 0.00010440509842926767, + "loss": 0.7329, + "step": 627 + }, + { + "epoch": 0.5024, + "grad_norm": 0.5130755118707295, + "learning_rate": 0.00010414612822264455, + "loss": 0.711, + "step": 628 + }, + { + "epoch": 0.5032, + "grad_norm": 0.3923604141588235, + "learning_rate": 0.00010388713015885161, + "loss": 0.7086, + "step": 629 + }, + { + "epoch": 0.504, + "grad_norm": 0.41955181246507084, + "learning_rate": 0.00010362810597805526, + "loss": 0.6916, + "step": 630 + }, + { + "epoch": 0.5048, + "grad_norm": 0.41686539342300905, + "learning_rate": 0.00010336905742059742, + "loss": 0.6599, + "step": 631 + }, + { + "epoch": 0.5056, + "grad_norm": 0.3687345210472633, + "learning_rate": 0.0001031099862269837, + "loss": 0.66, + "step": 632 + }, + { + "epoch": 0.5064, + "grad_norm": 0.38787817928589796, + "learning_rate": 0.0001028508941378719, + "loss": 0.6646, + "step": 633 + }, + { + "epoch": 0.5072, + "grad_norm": 0.4206728964596731, + "learning_rate": 0.00010259178289406011, + "loss": 0.6985, + "step": 634 + }, + { + "epoch": 0.508, + "grad_norm": 0.4492924362995365, + "learning_rate": 0.00010233265423647523, + "loss": 0.7977, + "step": 635 + }, + { + "epoch": 0.5088, + "grad_norm": 0.4173882948390458, + "learning_rate": 0.00010207350990616107, + "loss": 0.661, + "step": 636 + }, + { + "epoch": 0.5096, + "grad_norm": 0.4124795178345076, + "learning_rate": 0.00010181435164426676, + "loss": 0.6913, + "step": 637 + }, + { + "epoch": 0.5104, + "grad_norm": 0.5040626473799648, + "learning_rate": 0.0001015551811920351, + "loss": 0.7675, + "step": 638 + }, + { + "epoch": 0.5112, + "grad_norm": 0.40943133951872795, + "learning_rate": 0.00010129600029079072, + "loss": 0.6529, + "step": 639 + }, + { + "epoch": 0.512, + "grad_norm": 0.4261878138325544, + "learning_rate": 0.00010103681068192845, + "loss": 0.731, + "step": 640 + }, + { + "epoch": 0.5128, + "grad_norm": 0.4423565629647469, + "learning_rate": 0.00010077761410690172, + "loss": 0.7667, + "step": 641 + }, + { + "epoch": 0.5136, + "grad_norm": 0.481863110573246, + "learning_rate": 0.00010051841230721065, + "loss": 0.739, + "step": 642 + }, + { + "epoch": 0.5144, + "grad_norm": 0.39023517953601977, + "learning_rate": 0.00010025920702439051, + "loss": 0.7213, + "step": 643 + }, + { + "epoch": 0.5152, + "grad_norm": 0.41072257265011736, + "learning_rate": 0.0001, + "loss": 0.6929, + "step": 644 + }, + { + "epoch": 0.516, + "grad_norm": 0.5312348751411199, + "learning_rate": 9.97407929756095e-05, + "loss": 0.6789, + "step": 645 + }, + { + "epoch": 0.5168, + "grad_norm": 0.40087709239929087, + "learning_rate": 9.948158769278939e-05, + "loss": 0.6247, + "step": 646 + }, + { + "epoch": 0.5176, + "grad_norm": 0.44410417507762034, + "learning_rate": 9.92223858930983e-05, + "loss": 0.6877, + "step": 647 + }, + { + "epoch": 0.5184, + "grad_norm": 0.4245476136267957, + "learning_rate": 9.896318931807155e-05, + "loss": 0.7603, + "step": 648 + }, + { + "epoch": 0.5192, + "grad_norm": 0.3780854932238046, + "learning_rate": 9.870399970920932e-05, + "loss": 0.6528, + "step": 649 + }, + { + "epoch": 0.52, + "grad_norm": 0.3792354421894993, + "learning_rate": 9.844481880796491e-05, + "loss": 0.6637, + "step": 650 + }, + { + "epoch": 0.5208, + "grad_norm": 0.3936634698121772, + "learning_rate": 9.818564835573323e-05, + "loss": 0.723, + "step": 651 + }, + { + "epoch": 0.5216, + "grad_norm": 0.39393154842887423, + "learning_rate": 9.792649009383899e-05, + "loss": 0.6381, + "step": 652 + }, + { + "epoch": 0.5224, + "grad_norm": 0.3927738883541016, + "learning_rate": 9.766734576352478e-05, + "loss": 0.6986, + "step": 653 + }, + { + "epoch": 0.5232, + "grad_norm": 0.5034317083917805, + "learning_rate": 9.740821710593989e-05, + "loss": 0.6412, + "step": 654 + }, + { + "epoch": 0.524, + "grad_norm": 0.43113404265467736, + "learning_rate": 9.714910586212816e-05, + "loss": 0.711, + "step": 655 + }, + { + "epoch": 0.5248, + "grad_norm": 0.4705743937849481, + "learning_rate": 9.689001377301633e-05, + "loss": 0.6725, + "step": 656 + }, + { + "epoch": 0.5256, + "grad_norm": 0.4792403381754814, + "learning_rate": 9.663094257940258e-05, + "loss": 0.6443, + "step": 657 + }, + { + "epoch": 0.5264, + "grad_norm": 0.43819077188745703, + "learning_rate": 9.637189402194476e-05, + "loss": 0.6952, + "step": 658 + }, + { + "epoch": 0.5272, + "grad_norm": 0.39775188295886227, + "learning_rate": 9.611286984114841e-05, + "loss": 0.6848, + "step": 659 + }, + { + "epoch": 0.528, + "grad_norm": 0.4046047048309528, + "learning_rate": 9.585387177735547e-05, + "loss": 0.6982, + "step": 660 + }, + { + "epoch": 0.5288, + "grad_norm": 0.4375084759429687, + "learning_rate": 9.559490157073236e-05, + "loss": 0.7474, + "step": 661 + }, + { + "epoch": 0.5296, + "grad_norm": 0.44147482075562255, + "learning_rate": 9.533596096125825e-05, + "loss": 0.7411, + "step": 662 + }, + { + "epoch": 0.5304, + "grad_norm": 0.5323479852419991, + "learning_rate": 9.507705168871358e-05, + "loss": 0.7737, + "step": 663 + }, + { + "epoch": 0.5312, + "grad_norm": 0.5508071254334945, + "learning_rate": 9.481817549266817e-05, + "loss": 0.7913, + "step": 664 + }, + { + "epoch": 0.532, + "grad_norm": 0.3696957269342562, + "learning_rate": 9.455933411246958e-05, + "loss": 0.657, + "step": 665 + }, + { + "epoch": 0.5328, + "grad_norm": 0.5281517742483623, + "learning_rate": 9.430052928723153e-05, + "loss": 0.7186, + "step": 666 + }, + { + "epoch": 0.5336, + "grad_norm": 0.3869413765965512, + "learning_rate": 9.404176275582208e-05, + "loss": 0.6461, + "step": 667 + }, + { + "epoch": 0.5344, + "grad_norm": 0.3934247776585974, + "learning_rate": 9.378303625685195e-05, + "loss": 0.7359, + "step": 668 + }, + { + "epoch": 0.5352, + "grad_norm": 0.37114395834723085, + "learning_rate": 9.352435152866298e-05, + "loss": 0.6752, + "step": 669 + }, + { + "epoch": 0.536, + "grad_norm": 0.42642359171154925, + "learning_rate": 9.326571030931637e-05, + "loss": 0.6942, + "step": 670 + }, + { + "epoch": 0.5368, + "grad_norm": 0.3921630135688927, + "learning_rate": 9.300711433658087e-05, + "loss": 0.7221, + "step": 671 + }, + { + "epoch": 0.5376, + "grad_norm": 0.3853689735301196, + "learning_rate": 9.274856534792138e-05, + "loss": 0.6923, + "step": 672 + }, + { + "epoch": 0.5384, + "grad_norm": 0.4731525770960137, + "learning_rate": 9.249006508048694e-05, + "loss": 0.7174, + "step": 673 + }, + { + "epoch": 0.5392, + "grad_norm": 0.38663972282007136, + "learning_rate": 9.223161527109937e-05, + "loss": 0.6767, + "step": 674 + }, + { + "epoch": 0.54, + "grad_norm": 0.39610131190645576, + "learning_rate": 9.197321765624152e-05, + "loss": 0.6656, + "step": 675 + }, + { + "epoch": 0.5408, + "grad_norm": 0.3754135509778824, + "learning_rate": 9.171487397204539e-05, + "loss": 0.6134, + "step": 676 + }, + { + "epoch": 0.5416, + "grad_norm": 0.3801330212153967, + "learning_rate": 9.145658595428074e-05, + "loss": 0.6722, + "step": 677 + }, + { + "epoch": 0.5424, + "grad_norm": 0.41028895427027573, + "learning_rate": 9.119835533834331e-05, + "loss": 0.7008, + "step": 678 + }, + { + "epoch": 0.5432, + "grad_norm": 0.39141695430587464, + "learning_rate": 9.09401838592431e-05, + "loss": 0.7151, + "step": 679 + }, + { + "epoch": 0.544, + "grad_norm": 0.44112737661889, + "learning_rate": 9.068207325159284e-05, + "loss": 0.7079, + "step": 680 + }, + { + "epoch": 0.5448, + "grad_norm": 0.3718632770115604, + "learning_rate": 9.04240252495963e-05, + "loss": 0.6287, + "step": 681 + }, + { + "epoch": 0.5456, + "grad_norm": 0.39364432961357876, + "learning_rate": 9.016604158703654e-05, + "loss": 0.7042, + "step": 682 + }, + { + "epoch": 0.5464, + "grad_norm": 0.4589740532198, + "learning_rate": 8.990812399726435e-05, + "loss": 0.6797, + "step": 683 + }, + { + "epoch": 0.5472, + "grad_norm": 0.3693628499573939, + "learning_rate": 8.965027421318665e-05, + "loss": 0.6285, + "step": 684 + }, + { + "epoch": 0.548, + "grad_norm": 0.3876729785604324, + "learning_rate": 8.939249396725467e-05, + "loss": 0.6568, + "step": 685 + }, + { + "epoch": 0.5488, + "grad_norm": 0.4985616156524874, + "learning_rate": 8.913478499145254e-05, + "loss": 0.6603, + "step": 686 + }, + { + "epoch": 0.5496, + "grad_norm": 0.37122591368859864, + "learning_rate": 8.887714901728551e-05, + "loss": 0.6536, + "step": 687 + }, + { + "epoch": 0.5504, + "grad_norm": 0.40407452323589693, + "learning_rate": 8.861958777576827e-05, + "loss": 0.6841, + "step": 688 + }, + { + "epoch": 0.5512, + "grad_norm": 0.41469700299247075, + "learning_rate": 8.836210299741346e-05, + "loss": 0.667, + "step": 689 + }, + { + "epoch": 0.552, + "grad_norm": 0.4450566866828126, + "learning_rate": 8.810469641222001e-05, + "loss": 0.7197, + "step": 690 + }, + { + "epoch": 0.5528, + "grad_norm": 0.4512946184916618, + "learning_rate": 8.784736974966135e-05, + "loss": 0.7381, + "step": 691 + }, + { + "epoch": 0.5536, + "grad_norm": 0.3834099121261059, + "learning_rate": 8.759012473867407e-05, + "loss": 0.6461, + "step": 692 + }, + { + "epoch": 0.5544, + "grad_norm": 0.5142206798245783, + "learning_rate": 8.733296310764611e-05, + "loss": 0.6494, + "step": 693 + }, + { + "epoch": 0.5552, + "grad_norm": 0.4613832855034234, + "learning_rate": 8.707588658440511e-05, + "loss": 0.7748, + "step": 694 + }, + { + "epoch": 0.556, + "grad_norm": 0.3947092325863364, + "learning_rate": 8.6818896896207e-05, + "loss": 0.7202, + "step": 695 + }, + { + "epoch": 0.5568, + "grad_norm": 0.4049850438144679, + "learning_rate": 8.656199576972423e-05, + "loss": 0.6778, + "step": 696 + }, + { + "epoch": 0.5576, + "grad_norm": 0.41145705863803866, + "learning_rate": 8.63051849310342e-05, + "loss": 0.6406, + "step": 697 + }, + { + "epoch": 0.5584, + "grad_norm": 0.47048080897016403, + "learning_rate": 8.604846610560771e-05, + "loss": 0.6565, + "step": 698 + }, + { + "epoch": 0.5592, + "grad_norm": 0.48566114930219795, + "learning_rate": 8.579184101829734e-05, + "loss": 0.7316, + "step": 699 + }, + { + "epoch": 0.56, + "grad_norm": 0.5316059554529852, + "learning_rate": 8.553531139332582e-05, + "loss": 0.7287, + "step": 700 + }, + { + "epoch": 0.5608, + "grad_norm": 0.3865101011840362, + "learning_rate": 8.527887895427454e-05, + "loss": 0.6617, + "step": 701 + }, + { + "epoch": 0.5616, + "grad_norm": 0.4096011733838115, + "learning_rate": 8.502254542407186e-05, + "loss": 0.6473, + "step": 702 + }, + { + "epoch": 0.5624, + "grad_norm": 0.5431142279086918, + "learning_rate": 8.476631252498162e-05, + "loss": 0.6405, + "step": 703 + }, + { + "epoch": 0.5632, + "grad_norm": 0.47035277608730325, + "learning_rate": 8.451018197859153e-05, + "loss": 0.7469, + "step": 704 + }, + { + "epoch": 0.564, + "grad_norm": 0.3946973147334313, + "learning_rate": 8.425415550580162e-05, + "loss": 0.6885, + "step": 705 + }, + { + "epoch": 0.5648, + "grad_norm": 0.3960346996238459, + "learning_rate": 8.399823482681262e-05, + "loss": 0.6673, + "step": 706 + }, + { + "epoch": 0.5656, + "grad_norm": 0.46568227487979896, + "learning_rate": 8.374242166111448e-05, + "loss": 0.6794, + "step": 707 + }, + { + "epoch": 0.5664, + "grad_norm": 0.4229367779207583, + "learning_rate": 8.348671772747487e-05, + "loss": 0.6814, + "step": 708 + }, + { + "epoch": 0.5672, + "grad_norm": 0.44380573773074394, + "learning_rate": 8.323112474392731e-05, + "loss": 0.6549, + "step": 709 + }, + { + "epoch": 0.568, + "grad_norm": 0.4104428725647028, + "learning_rate": 8.297564442776014e-05, + "loss": 0.7053, + "step": 710 + }, + { + "epoch": 0.5688, + "grad_norm": 0.4608466261939257, + "learning_rate": 8.272027849550457e-05, + "loss": 0.6131, + "step": 711 + }, + { + "epoch": 0.5696, + "grad_norm": 0.3776890700015985, + "learning_rate": 8.246502866292324e-05, + "loss": 0.6685, + "step": 712 + }, + { + "epoch": 0.5704, + "grad_norm": 0.4599150879515932, + "learning_rate": 8.220989664499878e-05, + "loss": 0.7842, + "step": 713 + }, + { + "epoch": 0.5712, + "grad_norm": 0.33902074110600416, + "learning_rate": 8.195488415592238e-05, + "loss": 0.6193, + "step": 714 + }, + { + "epoch": 0.572, + "grad_norm": 0.4411052370972699, + "learning_rate": 8.169999290908188e-05, + "loss": 0.7052, + "step": 715 + }, + { + "epoch": 0.5728, + "grad_norm": 0.3747971405873802, + "learning_rate": 8.144522461705067e-05, + "loss": 0.7022, + "step": 716 + }, + { + "epoch": 0.5736, + "grad_norm": 0.46448375655372215, + "learning_rate": 8.119058099157604e-05, + "loss": 0.7483, + "step": 717 + }, + { + "epoch": 0.5744, + "grad_norm": 0.3856855398064522, + "learning_rate": 8.093606374356759e-05, + "loss": 0.6887, + "step": 718 + }, + { + "epoch": 0.5752, + "grad_norm": 0.4998333150402489, + "learning_rate": 8.068167458308582e-05, + "loss": 0.7242, + "step": 719 + }, + { + "epoch": 0.576, + "grad_norm": 0.3834524748835133, + "learning_rate": 8.042741521933071e-05, + "loss": 0.7165, + "step": 720 + }, + { + "epoch": 0.5768, + "grad_norm": 0.48115621176536166, + "learning_rate": 8.017328736063006e-05, + "loss": 0.6942, + "step": 721 + }, + { + "epoch": 0.5776, + "grad_norm": 0.4922713984270049, + "learning_rate": 7.991929271442817e-05, + "loss": 0.6318, + "step": 722 + }, + { + "epoch": 0.5784, + "grad_norm": 0.3986544050796855, + "learning_rate": 7.966543298727425e-05, + "loss": 0.7141, + "step": 723 + }, + { + "epoch": 0.5792, + "grad_norm": 0.40855269821962936, + "learning_rate": 7.941170988481108e-05, + "loss": 0.6577, + "step": 724 + }, + { + "epoch": 0.58, + "grad_norm": 0.40501432698401973, + "learning_rate": 7.915812511176347e-05, + "loss": 0.602, + "step": 725 + }, + { + "epoch": 0.5808, + "grad_norm": 0.39738148232900583, + "learning_rate": 7.89046803719267e-05, + "loss": 0.7335, + "step": 726 + }, + { + "epoch": 0.5816, + "grad_norm": 0.38766163926276814, + "learning_rate": 7.865137736815535e-05, + "loss": 0.6813, + "step": 727 + }, + { + "epoch": 0.5824, + "grad_norm": 0.44383473258838163, + "learning_rate": 7.839821780235168e-05, + "loss": 0.6852, + "step": 728 + }, + { + "epoch": 0.5832, + "grad_norm": 0.42206468977725525, + "learning_rate": 7.814520337545406e-05, + "loss": 0.7053, + "step": 729 + }, + { + "epoch": 0.584, + "grad_norm": 0.4153572045211229, + "learning_rate": 7.789233578742582e-05, + "loss": 0.719, + "step": 730 + }, + { + "epoch": 0.5848, + "grad_norm": 1.0923746965653356, + "learning_rate": 7.763961673724379e-05, + "loss": 0.6559, + "step": 731 + }, + { + "epoch": 0.5856, + "grad_norm": 0.44930053107994644, + "learning_rate": 7.738704792288655e-05, + "loss": 0.7261, + "step": 732 + }, + { + "epoch": 0.5864, + "grad_norm": 0.40637606745788074, + "learning_rate": 7.713463104132345e-05, + "loss": 0.6804, + "step": 733 + }, + { + "epoch": 0.5872, + "grad_norm": 0.3837904442426325, + "learning_rate": 7.688236778850306e-05, + "loss": 0.6959, + "step": 734 + }, + { + "epoch": 0.588, + "grad_norm": 0.4284283375699517, + "learning_rate": 7.663025985934158e-05, + "loss": 0.7494, + "step": 735 + }, + { + "epoch": 0.5888, + "grad_norm": 0.4178362115797804, + "learning_rate": 7.637830894771175e-05, + "loss": 0.7129, + "step": 736 + }, + { + "epoch": 0.5896, + "grad_norm": 0.7761490225234231, + "learning_rate": 7.61265167464313e-05, + "loss": 0.6737, + "step": 737 + }, + { + "epoch": 0.5904, + "grad_norm": 0.38018739961984366, + "learning_rate": 7.587488494725157e-05, + "loss": 0.6219, + "step": 738 + }, + { + "epoch": 0.5912, + "grad_norm": 0.3972394773066105, + "learning_rate": 7.562341524084623e-05, + "loss": 0.6465, + "step": 739 + }, + { + "epoch": 0.592, + "grad_norm": 0.45967221724548674, + "learning_rate": 7.537210931679987e-05, + "loss": 0.6859, + "step": 740 + }, + { + "epoch": 0.5928, + "grad_norm": 0.38380842446436886, + "learning_rate": 7.512096886359664e-05, + "loss": 0.6675, + "step": 741 + }, + { + "epoch": 0.5936, + "grad_norm": 0.4046122518219008, + "learning_rate": 7.48699955686089e-05, + "loss": 0.6781, + "step": 742 + }, + { + "epoch": 0.5944, + "grad_norm": 0.45503231168566083, + "learning_rate": 7.461919111808595e-05, + "loss": 0.7489, + "step": 743 + }, + { + "epoch": 0.5952, + "grad_norm": 0.39453696611094263, + "learning_rate": 7.43685571971426e-05, + "loss": 0.6758, + "step": 744 + }, + { + "epoch": 0.596, + "grad_norm": 0.39531586464235496, + "learning_rate": 7.411809548974792e-05, + "loss": 0.653, + "step": 745 + }, + { + "epoch": 0.5968, + "grad_norm": 0.37970796664435597, + "learning_rate": 7.386780767871397e-05, + "loss": 0.6398, + "step": 746 + }, + { + "epoch": 0.5976, + "grad_norm": 0.3943564238156354, + "learning_rate": 7.361769544568425e-05, + "loss": 0.6248, + "step": 747 + }, + { + "epoch": 0.5984, + "grad_norm": 0.405054766800856, + "learning_rate": 7.336776047112276e-05, + "loss": 0.665, + "step": 748 + }, + { + "epoch": 0.5992, + "grad_norm": 0.41680869276437926, + "learning_rate": 7.311800443430251e-05, + "loss": 0.6365, + "step": 749 + }, + { + "epoch": 0.6, + "grad_norm": 0.37894174988526586, + "learning_rate": 7.286842901329412e-05, + "loss": 0.7404, + "step": 750 + }, + { + "epoch": 0.6008, + "grad_norm": 0.39464461456803773, + "learning_rate": 7.26190358849548e-05, + "loss": 0.6704, + "step": 751 + }, + { + "epoch": 0.6016, + "grad_norm": 0.34508384122431485, + "learning_rate": 7.236982672491698e-05, + "loss": 0.6499, + "step": 752 + }, + { + "epoch": 0.6024, + "grad_norm": 0.41096659975697875, + "learning_rate": 7.212080320757695e-05, + "loss": 0.6943, + "step": 753 + }, + { + "epoch": 0.6032, + "grad_norm": 0.4852612878782842, + "learning_rate": 7.187196700608373e-05, + "loss": 0.7435, + "step": 754 + }, + { + "epoch": 0.604, + "grad_norm": 0.431449953624555, + "learning_rate": 7.162331979232783e-05, + "loss": 0.6313, + "step": 755 + }, + { + "epoch": 0.6048, + "grad_norm": 0.34161717551810117, + "learning_rate": 7.137486323692995e-05, + "loss": 0.5986, + "step": 756 + }, + { + "epoch": 0.6056, + "grad_norm": 0.4624244394195268, + "learning_rate": 7.112659900922976e-05, + "loss": 0.6592, + "step": 757 + }, + { + "epoch": 0.6064, + "grad_norm": 0.41256403278587855, + "learning_rate": 7.087852877727481e-05, + "loss": 0.6655, + "step": 758 + }, + { + "epoch": 0.6072, + "grad_norm": 0.41735435891969386, + "learning_rate": 7.06306542078091e-05, + "loss": 0.6969, + "step": 759 + }, + { + "epoch": 0.608, + "grad_norm": 0.38794315783957806, + "learning_rate": 7.038297696626206e-05, + "loss": 0.6903, + "step": 760 + }, + { + "epoch": 0.6088, + "grad_norm": 0.5452037323592183, + "learning_rate": 7.013549871673736e-05, + "loss": 0.6451, + "step": 761 + }, + { + "epoch": 0.6096, + "grad_norm": 0.3980413708111724, + "learning_rate": 6.988822112200156e-05, + "loss": 0.6393, + "step": 762 + }, + { + "epoch": 0.6104, + "grad_norm": 0.40636128158099566, + "learning_rate": 6.964114584347316e-05, + "loss": 0.6718, + "step": 763 + }, + { + "epoch": 0.6112, + "grad_norm": 0.4437989080933422, + "learning_rate": 6.939427454121128e-05, + "loss": 0.7368, + "step": 764 + }, + { + "epoch": 0.612, + "grad_norm": 0.3793297983580521, + "learning_rate": 6.914760887390452e-05, + "loss": 0.6655, + "step": 765 + }, + { + "epoch": 0.6128, + "grad_norm": 0.4062062166979556, + "learning_rate": 6.890115049885994e-05, + "loss": 0.7844, + "step": 766 + }, + { + "epoch": 0.6136, + "grad_norm": 0.4262858478328053, + "learning_rate": 6.865490107199181e-05, + "loss": 0.6992, + "step": 767 + }, + { + "epoch": 0.6144, + "grad_norm": 0.3807916736194714, + "learning_rate": 6.84088622478104e-05, + "loss": 0.6445, + "step": 768 + }, + { + "epoch": 0.6152, + "grad_norm": 0.3845725135795709, + "learning_rate": 6.816303567941112e-05, + "loss": 0.6203, + "step": 769 + }, + { + "epoch": 0.616, + "grad_norm": 0.418809086744098, + "learning_rate": 6.791742301846326e-05, + "loss": 0.6432, + "step": 770 + }, + { + "epoch": 0.6168, + "grad_norm": 0.383022219002213, + "learning_rate": 6.767202591519875e-05, + "loss": 0.6512, + "step": 771 + }, + { + "epoch": 0.6176, + "grad_norm": 0.47668664566503743, + "learning_rate": 6.742684601840141e-05, + "loss": 0.7295, + "step": 772 + }, + { + "epoch": 0.6184, + "grad_norm": 0.4106101853039467, + "learning_rate": 6.718188497539554e-05, + "loss": 0.7128, + "step": 773 + }, + { + "epoch": 0.6192, + "grad_norm": 0.38449380526734445, + "learning_rate": 6.693714443203507e-05, + "loss": 0.6213, + "step": 774 + }, + { + "epoch": 0.62, + "grad_norm": 0.3680674432824615, + "learning_rate": 6.669262603269246e-05, + "loss": 0.6254, + "step": 775 + }, + { + "epoch": 0.6208, + "grad_norm": 0.40957792884479693, + "learning_rate": 6.644833142024751e-05, + "loss": 0.7017, + "step": 776 + }, + { + "epoch": 0.6216, + "grad_norm": 0.43382556405890244, + "learning_rate": 6.620426223607654e-05, + "loss": 0.6854, + "step": 777 + }, + { + "epoch": 0.6224, + "grad_norm": 0.41409917037082994, + "learning_rate": 6.59604201200412e-05, + "loss": 0.6815, + "step": 778 + }, + { + "epoch": 0.6232, + "grad_norm": 0.42918642809051144, + "learning_rate": 6.571680671047749e-05, + "loss": 0.7014, + "step": 779 + }, + { + "epoch": 0.624, + "grad_norm": 0.44385630524030634, + "learning_rate": 6.547342364418481e-05, + "loss": 0.7144, + "step": 780 + }, + { + "epoch": 0.6248, + "grad_norm": 0.43617944813056336, + "learning_rate": 6.523027255641493e-05, + "loss": 0.7076, + "step": 781 + }, + { + "epoch": 0.6256, + "grad_norm": 0.35464296416961627, + "learning_rate": 6.498735508086093e-05, + "loss": 0.639, + "step": 782 + }, + { + "epoch": 0.6264, + "grad_norm": 0.48189683832526153, + "learning_rate": 6.474467284964634e-05, + "loss": 0.7592, + "step": 783 + }, + { + "epoch": 0.6272, + "grad_norm": 0.4501052381719782, + "learning_rate": 6.450222749331414e-05, + "loss": 0.6808, + "step": 784 + }, + { + "epoch": 0.628, + "grad_norm": 0.4159299664040812, + "learning_rate": 6.426002064081565e-05, + "loss": 0.705, + "step": 785 + }, + { + "epoch": 0.6288, + "grad_norm": 0.445291134682033, + "learning_rate": 6.40180539194999e-05, + "loss": 0.6126, + "step": 786 + }, + { + "epoch": 0.6296, + "grad_norm": 0.39663608464677486, + "learning_rate": 6.377632895510248e-05, + "loss": 0.6834, + "step": 787 + }, + { + "epoch": 0.6304, + "grad_norm": 0.38177297587194176, + "learning_rate": 6.35348473717345e-05, + "loss": 0.6623, + "step": 788 + }, + { + "epoch": 0.6312, + "grad_norm": 0.4124401789745345, + "learning_rate": 6.329361079187199e-05, + "loss": 0.7397, + "step": 789 + }, + { + "epoch": 0.632, + "grad_norm": 0.37116529704099843, + "learning_rate": 6.305262083634488e-05, + "loss": 0.6629, + "step": 790 + }, + { + "epoch": 0.6328, + "grad_norm": 0.3469758455320982, + "learning_rate": 6.281187912432587e-05, + "loss": 0.6509, + "step": 791 + }, + { + "epoch": 0.6336, + "grad_norm": 0.3767146131837542, + "learning_rate": 6.25713872733199e-05, + "loss": 0.6508, + "step": 792 + }, + { + "epoch": 0.6344, + "grad_norm": 0.4274285346901684, + "learning_rate": 6.233114689915316e-05, + "loss": 0.6655, + "step": 793 + }, + { + "epoch": 0.6352, + "grad_norm": 0.5099042034462896, + "learning_rate": 6.209115961596208e-05, + "loss": 0.7014, + "step": 794 + }, + { + "epoch": 0.636, + "grad_norm": 0.3889311658102342, + "learning_rate": 6.18514270361827e-05, + "loss": 0.6186, + "step": 795 + }, + { + "epoch": 0.6368, + "grad_norm": 0.44392756556456253, + "learning_rate": 6.161195077053976e-05, + "loss": 0.7286, + "step": 796 + }, + { + "epoch": 0.6376, + "grad_norm": 0.3765472685597652, + "learning_rate": 6.13727324280358e-05, + "loss": 0.6504, + "step": 797 + }, + { + "epoch": 0.6384, + "grad_norm": 0.39876856119905346, + "learning_rate": 6.113377361594049e-05, + "loss": 0.7048, + "step": 798 + }, + { + "epoch": 0.6392, + "grad_norm": 0.3815269716249916, + "learning_rate": 6.08950759397797e-05, + "loss": 0.6685, + "step": 799 + }, + { + "epoch": 0.64, + "grad_norm": 0.5600926327379202, + "learning_rate": 6.065664100332478e-05, + "loss": 0.6752, + "step": 800 + }, + { + "epoch": 0.6408, + "grad_norm": 0.3536445681248279, + "learning_rate": 6.0418470408581774e-05, + "loss": 0.6389, + "step": 801 + }, + { + "epoch": 0.6416, + "grad_norm": 0.37019990622790294, + "learning_rate": 6.018056575578075e-05, + "loss": 0.6817, + "step": 802 + }, + { + "epoch": 0.6424, + "grad_norm": 0.4188872710433534, + "learning_rate": 5.9942928643364724e-05, + "loss": 0.681, + "step": 803 + }, + { + "epoch": 0.6432, + "grad_norm": 0.3777204049485853, + "learning_rate": 5.970556066797941e-05, + "loss": 0.613, + "step": 804 + }, + { + "epoch": 0.644, + "grad_norm": 0.3880905404693292, + "learning_rate": 5.946846342446214e-05, + "loss": 0.6818, + "step": 805 + }, + { + "epoch": 0.6448, + "grad_norm": 0.44790623765449117, + "learning_rate": 5.923163850583113e-05, + "loss": 0.6785, + "step": 806 + }, + { + "epoch": 0.6456, + "grad_norm": 0.44201618408608107, + "learning_rate": 5.899508750327501e-05, + "loss": 0.6975, + "step": 807 + }, + { + "epoch": 0.6464, + "grad_norm": 0.4880898295884519, + "learning_rate": 5.875881200614207e-05, + "loss": 0.6668, + "step": 808 + }, + { + "epoch": 0.6472, + "grad_norm": 0.381133231330232, + "learning_rate": 5.8522813601929324e-05, + "loss": 0.6754, + "step": 809 + }, + { + "epoch": 0.648, + "grad_norm": 0.363082248641178, + "learning_rate": 5.828709387627218e-05, + "loss": 0.6392, + "step": 810 + }, + { + "epoch": 0.6488, + "grad_norm": 0.41303316154592884, + "learning_rate": 5.80516544129337e-05, + "loss": 0.6765, + "step": 811 + }, + { + "epoch": 0.6496, + "grad_norm": 0.41012381027069006, + "learning_rate": 5.781649679379378e-05, + "loss": 0.7106, + "step": 812 + }, + { + "epoch": 0.6504, + "grad_norm": 0.3805134062461334, + "learning_rate": 5.758162259883867e-05, + "loss": 0.6033, + "step": 813 + }, + { + "epoch": 0.6512, + "grad_norm": 0.3669597657684683, + "learning_rate": 5.73470334061505e-05, + "loss": 0.6659, + "step": 814 + }, + { + "epoch": 0.652, + "grad_norm": 0.4329871339377622, + "learning_rate": 5.7112730791896207e-05, + "loss": 0.6515, + "step": 815 + }, + { + "epoch": 0.6528, + "grad_norm": 0.40056243679097336, + "learning_rate": 5.687871633031754e-05, + "loss": 0.6766, + "step": 816 + }, + { + "epoch": 0.6536, + "grad_norm": 0.403521276952749, + "learning_rate": 5.664499159372017e-05, + "loss": 0.6841, + "step": 817 + }, + { + "epoch": 0.6544, + "grad_norm": 0.4160653655260832, + "learning_rate": 5.6411558152462894e-05, + "loss": 0.6757, + "step": 818 + }, + { + "epoch": 0.6552, + "grad_norm": 0.38991222836224504, + "learning_rate": 5.617841757494762e-05, + "loss": 0.5927, + "step": 819 + }, + { + "epoch": 0.656, + "grad_norm": 0.39066570019338576, + "learning_rate": 5.5945571427608526e-05, + "loss": 0.6704, + "step": 820 + }, + { + "epoch": 0.6568, + "grad_norm": 0.3828468475809412, + "learning_rate": 5.5713021274901335e-05, + "loss": 0.6492, + "step": 821 + }, + { + "epoch": 0.6576, + "grad_norm": 0.37868487565289066, + "learning_rate": 5.54807686792933e-05, + "loss": 0.6778, + "step": 822 + }, + { + "epoch": 0.6584, + "grad_norm": 0.44244296442123804, + "learning_rate": 5.524881520125229e-05, + "loss": 0.5999, + "step": 823 + }, + { + "epoch": 0.6592, + "grad_norm": 0.38987052538579636, + "learning_rate": 5.501716239923642e-05, + "loss": 0.6351, + "step": 824 + }, + { + "epoch": 0.66, + "grad_norm": 0.38199122254894297, + "learning_rate": 5.4785811829683764e-05, + "loss": 0.6326, + "step": 825 + }, + { + "epoch": 0.6608, + "grad_norm": 0.3639229880437657, + "learning_rate": 5.4554765047001613e-05, + "loss": 0.6247, + "step": 826 + }, + { + "epoch": 0.6616, + "grad_norm": 0.5315947852451907, + "learning_rate": 5.432402360355615e-05, + "loss": 0.6495, + "step": 827 + }, + { + "epoch": 0.6624, + "grad_norm": 0.35014658605865756, + "learning_rate": 5.4093589049662175e-05, + "loss": 0.6392, + "step": 828 + }, + { + "epoch": 0.6632, + "grad_norm": 0.47874843630853453, + "learning_rate": 5.386346293357242e-05, + "loss": 0.7146, + "step": 829 + }, + { + "epoch": 0.664, + "grad_norm": 0.4629085694326274, + "learning_rate": 5.363364680146725e-05, + "loss": 0.7138, + "step": 830 + }, + { + "epoch": 0.6648, + "grad_norm": 0.3933960103130106, + "learning_rate": 5.3404142197444506e-05, + "loss": 0.7079, + "step": 831 + }, + { + "epoch": 0.6656, + "grad_norm": 0.3852771061375175, + "learning_rate": 5.31749506635086e-05, + "loss": 0.6126, + "step": 832 + }, + { + "epoch": 0.6664, + "grad_norm": 0.434071766724914, + "learning_rate": 5.2946073739560706e-05, + "loss": 0.6609, + "step": 833 + }, + { + "epoch": 0.6672, + "grad_norm": 0.41621675469846436, + "learning_rate": 5.271751296338823e-05, + "loss": 0.6563, + "step": 834 + }, + { + "epoch": 0.668, + "grad_norm": 0.4393344799203817, + "learning_rate": 5.248926987065417e-05, + "loss": 0.6674, + "step": 835 + }, + { + "epoch": 0.6688, + "grad_norm": 0.39587381501685853, + "learning_rate": 5.226134599488728e-05, + "loss": 0.6809, + "step": 836 + }, + { + "epoch": 0.6696, + "grad_norm": 0.4231237232657075, + "learning_rate": 5.203374286747158e-05, + "loss": 0.7091, + "step": 837 + }, + { + "epoch": 0.6704, + "grad_norm": 0.4103037220562414, + "learning_rate": 5.180646201763577e-05, + "loss": 0.6678, + "step": 838 + }, + { + "epoch": 0.6712, + "grad_norm": 0.40847021747295903, + "learning_rate": 5.15795049724435e-05, + "loss": 0.6726, + "step": 839 + }, + { + "epoch": 0.672, + "grad_norm": 0.4048507165934699, + "learning_rate": 5.135287325678271e-05, + "loss": 0.5838, + "step": 840 + }, + { + "epoch": 0.6728, + "grad_norm": 0.4267656205061373, + "learning_rate": 5.112656839335543e-05, + "loss": 0.6898, + "step": 841 + }, + { + "epoch": 0.6736, + "grad_norm": 0.40412044905045197, + "learning_rate": 5.090059190266779e-05, + "loss": 0.6411, + "step": 842 + }, + { + "epoch": 0.6744, + "grad_norm": 0.39907703234527525, + "learning_rate": 5.0674945303019526e-05, + "loss": 0.645, + "step": 843 + }, + { + "epoch": 0.6752, + "grad_norm": 0.4313521298063378, + "learning_rate": 5.0449630110493836e-05, + "loss": 0.6954, + "step": 844 + }, + { + "epoch": 0.676, + "grad_norm": 0.45219077808300984, + "learning_rate": 5.022464783894744e-05, + "loss": 0.7019, + "step": 845 + }, + { + "epoch": 0.6768, + "grad_norm": 0.34522413336359226, + "learning_rate": 5.000000000000002e-05, + "loss": 0.615, + "step": 846 + }, + { + "epoch": 0.6776, + "grad_norm": 0.42276722714015846, + "learning_rate": 4.977568810302432e-05, + "loss": 0.6134, + "step": 847 + }, + { + "epoch": 0.6784, + "grad_norm": 0.4450484853316365, + "learning_rate": 4.955171365513603e-05, + "loss": 0.7124, + "step": 848 + }, + { + "epoch": 0.6792, + "grad_norm": 0.4176859898117107, + "learning_rate": 4.9328078161183464e-05, + "loss": 0.6494, + "step": 849 + }, + { + "epoch": 0.68, + "grad_norm": 0.3918207673921806, + "learning_rate": 4.9104783123737566e-05, + "loss": 0.6935, + "step": 850 + }, + { + "epoch": 0.6808, + "grad_norm": 0.4687812943620728, + "learning_rate": 4.88818300430819e-05, + "loss": 0.708, + "step": 851 + }, + { + "epoch": 0.6816, + "grad_norm": 0.42328745629807996, + "learning_rate": 4.865922041720239e-05, + "loss": 0.6395, + "step": 852 + }, + { + "epoch": 0.6824, + "grad_norm": 0.39422881345065175, + "learning_rate": 4.843695574177737e-05, + "loss": 0.6222, + "step": 853 + }, + { + "epoch": 0.6832, + "grad_norm": 0.42531681271013405, + "learning_rate": 4.821503751016746e-05, + "loss": 0.7189, + "step": 854 + }, + { + "epoch": 0.684, + "grad_norm": 0.39062939224071663, + "learning_rate": 4.7993467213405706e-05, + "loss": 0.7097, + "step": 855 + }, + { + "epoch": 0.6848, + "grad_norm": 0.5039033039514914, + "learning_rate": 4.777224634018732e-05, + "loss": 0.6743, + "step": 856 + }, + { + "epoch": 0.6856, + "grad_norm": 0.44359403445712453, + "learning_rate": 4.755137637685979e-05, + "loss": 0.6597, + "step": 857 + }, + { + "epoch": 0.6864, + "grad_norm": 0.45656153532577914, + "learning_rate": 4.733085880741301e-05, + "loss": 0.7161, + "step": 858 + }, + { + "epoch": 0.6872, + "grad_norm": 0.3821800534278985, + "learning_rate": 4.7110695113469085e-05, + "loss": 0.6403, + "step": 859 + }, + { + "epoch": 0.688, + "grad_norm": 0.5137085229706376, + "learning_rate": 4.689088677427249e-05, + "loss": 0.6678, + "step": 860 + }, + { + "epoch": 0.6888, + "grad_norm": 0.4267622522554168, + "learning_rate": 4.6671435266680216e-05, + "loss": 0.669, + "step": 861 + }, + { + "epoch": 0.6896, + "grad_norm": 0.3927474593428916, + "learning_rate": 4.645234206515171e-05, + "loss": 0.5818, + "step": 862 + }, + { + "epoch": 0.6904, + "grad_norm": 0.4214596604287487, + "learning_rate": 4.623360864173893e-05, + "loss": 0.5903, + "step": 863 + }, + { + "epoch": 0.6912, + "grad_norm": 0.4435106768069405, + "learning_rate": 4.6015236466076747e-05, + "loss": 0.654, + "step": 864 + }, + { + "epoch": 0.692, + "grad_norm": 0.4635596392892368, + "learning_rate": 4.579722700537268e-05, + "loss": 0.6736, + "step": 865 + }, + { + "epoch": 0.6928, + "grad_norm": 0.3919107712030242, + "learning_rate": 4.5579581724397255e-05, + "loss": 0.5857, + "step": 866 + }, + { + "epoch": 0.6936, + "grad_norm": 0.3903892283671009, + "learning_rate": 4.5362302085474254e-05, + "loss": 0.6375, + "step": 867 + }, + { + "epoch": 0.6944, + "grad_norm": 0.5529360557826836, + "learning_rate": 4.514538954847064e-05, + "loss": 0.7635, + "step": 868 + }, + { + "epoch": 0.6952, + "grad_norm": 0.4937511215541514, + "learning_rate": 4.492884557078688e-05, + "loss": 0.7357, + "step": 869 + }, + { + "epoch": 0.696, + "grad_norm": 0.5695510559614142, + "learning_rate": 4.471267160734731e-05, + "loss": 0.6876, + "step": 870 + }, + { + "epoch": 0.6968, + "grad_norm": 0.42617656739881216, + "learning_rate": 4.449686911058992e-05, + "loss": 0.6663, + "step": 871 + }, + { + "epoch": 0.6976, + "grad_norm": 0.4357966592095588, + "learning_rate": 4.428143953045717e-05, + "loss": 0.6121, + "step": 872 + }, + { + "epoch": 0.6984, + "grad_norm": 0.4171737223706917, + "learning_rate": 4.406638431438576e-05, + "loss": 0.6765, + "step": 873 + }, + { + "epoch": 0.6992, + "grad_norm": 0.42444773335585284, + "learning_rate": 4.385170490729712e-05, + "loss": 0.689, + "step": 874 + }, + { + "epoch": 0.7, + "grad_norm": 0.38148418948543755, + "learning_rate": 4.36374027515878e-05, + "loss": 0.6771, + "step": 875 + }, + { + "epoch": 0.7008, + "grad_norm": 0.4222079247626194, + "learning_rate": 4.342347928711953e-05, + "loss": 0.7185, + "step": 876 + }, + { + "epoch": 0.7016, + "grad_norm": 0.36136220572730077, + "learning_rate": 4.320993595120969e-05, + "loss": 0.6137, + "step": 877 + }, + { + "epoch": 0.7024, + "grad_norm": 0.4429167676676073, + "learning_rate": 4.2996774178621736e-05, + "loss": 0.6396, + "step": 878 + }, + { + "epoch": 0.7032, + "grad_norm": 0.40876086503481196, + "learning_rate": 4.278399540155536e-05, + "loss": 0.7096, + "step": 879 + }, + { + "epoch": 0.704, + "grad_norm": 0.4343712585264002, + "learning_rate": 4.257160104963696e-05, + "loss": 0.6385, + "step": 880 + }, + { + "epoch": 0.7048, + "grad_norm": 0.41837706104168265, + "learning_rate": 4.2359592549910145e-05, + "loss": 0.6652, + "step": 881 + }, + { + "epoch": 0.7056, + "grad_norm": 0.41121496227160353, + "learning_rate": 4.2147971326825966e-05, + "loss": 0.6452, + "step": 882 + }, + { + "epoch": 0.7064, + "grad_norm": 0.46487731139413196, + "learning_rate": 4.193673880223339e-05, + "loss": 0.6356, + "step": 883 + }, + { + "epoch": 0.7072, + "grad_norm": 0.48347206478487204, + "learning_rate": 4.172589639536991e-05, + "loss": 0.6898, + "step": 884 + }, + { + "epoch": 0.708, + "grad_norm": 0.4712607622214813, + "learning_rate": 4.1515445522851784e-05, + "loss": 0.6859, + "step": 885 + }, + { + "epoch": 0.7088, + "grad_norm": 0.3746852972884601, + "learning_rate": 4.130538759866457e-05, + "loss": 0.6656, + "step": 886 + }, + { + "epoch": 0.7096, + "grad_norm": 0.3978423947002656, + "learning_rate": 4.109572403415386e-05, + "loss": 0.5327, + "step": 887 + }, + { + "epoch": 0.7104, + "grad_norm": 0.38911813310438403, + "learning_rate": 4.088645623801534e-05, + "loss": 0.7227, + "step": 888 + }, + { + "epoch": 0.7112, + "grad_norm": 0.3797046115124046, + "learning_rate": 4.0677585616285774e-05, + "loss": 0.6511, + "step": 889 + }, + { + "epoch": 0.712, + "grad_norm": 0.4430462860485526, + "learning_rate": 4.046911357233343e-05, + "loss": 0.6417, + "step": 890 + }, + { + "epoch": 0.7128, + "grad_norm": 0.45331754482416553, + "learning_rate": 4.026104150684835e-05, + "loss": 0.6687, + "step": 891 + }, + { + "epoch": 0.7136, + "grad_norm": 0.39912659038055504, + "learning_rate": 4.00533708178334e-05, + "loss": 0.6594, + "step": 892 + }, + { + "epoch": 0.7144, + "grad_norm": 0.5155553428149129, + "learning_rate": 3.984610290059467e-05, + "loss": 0.7029, + "step": 893 + }, + { + "epoch": 0.7152, + "grad_norm": 0.37902154478411765, + "learning_rate": 3.963923914773187e-05, + "loss": 0.6179, + "step": 894 + }, + { + "epoch": 0.716, + "grad_norm": 0.47051796061468243, + "learning_rate": 3.943278094912946e-05, + "loss": 0.6677, + "step": 895 + }, + { + "epoch": 0.7168, + "grad_norm": 0.4022406821297786, + "learning_rate": 3.922672969194686e-05, + "loss": 0.6697, + "step": 896 + }, + { + "epoch": 0.7176, + "grad_norm": 0.4257374265228528, + "learning_rate": 3.902108676060937e-05, + "loss": 0.6392, + "step": 897 + }, + { + "epoch": 0.7184, + "grad_norm": 0.47383434981133715, + "learning_rate": 3.8815853536798904e-05, + "loss": 0.5611, + "step": 898 + }, + { + "epoch": 0.7192, + "grad_norm": 0.47514786323404323, + "learning_rate": 3.861103139944449e-05, + "loss": 0.7033, + "step": 899 + }, + { + "epoch": 0.72, + "grad_norm": 0.4359384293749101, + "learning_rate": 3.840662172471315e-05, + "loss": 0.6611, + "step": 900 + }, + { + "epoch": 0.7208, + "grad_norm": 0.5329107588299355, + "learning_rate": 3.820262588600074e-05, + "loss": 0.6733, + "step": 901 + }, + { + "epoch": 0.7216, + "grad_norm": 0.3304655053635868, + "learning_rate": 3.79990452539225e-05, + "loss": 0.6083, + "step": 902 + }, + { + "epoch": 0.7224, + "grad_norm": 0.378930117484927, + "learning_rate": 3.7795881196303995e-05, + "loss": 0.6078, + "step": 903 + }, + { + "epoch": 0.7232, + "grad_norm": 0.3642262972712831, + "learning_rate": 3.759313507817196e-05, + "loss": 0.6621, + "step": 904 + }, + { + "epoch": 0.724, + "grad_norm": 0.4426647290306224, + "learning_rate": 3.739080826174498e-05, + "loss": 0.6038, + "step": 905 + }, + { + "epoch": 0.7248, + "grad_norm": 0.4059507975046885, + "learning_rate": 3.7188902106424416e-05, + "loss": 0.5983, + "step": 906 + }, + { + "epoch": 0.7256, + "grad_norm": 0.4200147388174036, + "learning_rate": 3.6987417968785366e-05, + "loss": 0.6695, + "step": 907 + }, + { + "epoch": 0.7264, + "grad_norm": 0.4191820120110178, + "learning_rate": 3.678635720256737e-05, + "loss": 0.6848, + "step": 908 + }, + { + "epoch": 0.7272, + "grad_norm": 0.3727689478218868, + "learning_rate": 3.658572115866541e-05, + "loss": 0.5903, + "step": 909 + }, + { + "epoch": 0.728, + "grad_norm": 0.34761139526376134, + "learning_rate": 3.638551118512089e-05, + "loss": 0.6076, + "step": 910 + }, + { + "epoch": 0.7288, + "grad_norm": 0.5014658551714443, + "learning_rate": 3.618572862711247e-05, + "loss": 0.649, + "step": 911 + }, + { + "epoch": 0.7296, + "grad_norm": 0.40160927235903393, + "learning_rate": 3.5986374826947066e-05, + "loss": 0.6503, + "step": 912 + }, + { + "epoch": 0.7304, + "grad_norm": 0.4308949226680645, + "learning_rate": 3.578745112405083e-05, + "loss": 0.6698, + "step": 913 + }, + { + "epoch": 0.7312, + "grad_norm": 0.40597972271838906, + "learning_rate": 3.558895885496023e-05, + "loss": 0.6436, + "step": 914 + }, + { + "epoch": 0.732, + "grad_norm": 0.3912854444629693, + "learning_rate": 3.539089935331294e-05, + "loss": 0.6138, + "step": 915 + }, + { + "epoch": 0.7328, + "grad_norm": 0.39805403278461843, + "learning_rate": 3.519327394983888e-05, + "loss": 0.6474, + "step": 916 + }, + { + "epoch": 0.7336, + "grad_norm": 0.35918475556838036, + "learning_rate": 3.4996083972351515e-05, + "loss": 0.6705, + "step": 917 + }, + { + "epoch": 0.7344, + "grad_norm": 0.3639389173156511, + "learning_rate": 3.479933074573858e-05, + "loss": 0.6059, + "step": 918 + }, + { + "epoch": 0.7352, + "grad_norm": 0.4551968784336033, + "learning_rate": 3.4603015591953395e-05, + "loss": 0.6514, + "step": 919 + }, + { + "epoch": 0.736, + "grad_norm": 0.5451495288183975, + "learning_rate": 3.440713983000601e-05, + "loss": 0.7396, + "step": 920 + }, + { + "epoch": 0.7368, + "grad_norm": 0.3608943874649447, + "learning_rate": 3.421170477595419e-05, + "loss": 0.6159, + "step": 921 + }, + { + "epoch": 0.7376, + "grad_norm": 0.41365460998134396, + "learning_rate": 3.401671174289469e-05, + "loss": 0.6644, + "step": 922 + }, + { + "epoch": 0.7384, + "grad_norm": 0.3965052298702251, + "learning_rate": 3.3822162040954354e-05, + "loss": 0.6753, + "step": 923 + }, + { + "epoch": 0.7392, + "grad_norm": 0.43263803632988235, + "learning_rate": 3.362805697728145e-05, + "loss": 0.6215, + "step": 924 + }, + { + "epoch": 0.74, + "grad_norm": 0.39280403682379533, + "learning_rate": 3.34343978560367e-05, + "loss": 0.5863, + "step": 925 + }, + { + "epoch": 0.7408, + "grad_norm": 0.4007479070398563, + "learning_rate": 3.324118597838464e-05, + "loss": 0.6805, + "step": 926 + }, + { + "epoch": 0.7416, + "grad_norm": 0.3912267539224064, + "learning_rate": 3.3048422642484886e-05, + "loss": 0.6425, + "step": 927 + }, + { + "epoch": 0.7424, + "grad_norm": 0.43134889815808386, + "learning_rate": 3.285610914348332e-05, + "loss": 0.6662, + "step": 928 + }, + { + "epoch": 0.7432, + "grad_norm": 0.39112026109024267, + "learning_rate": 3.266424677350346e-05, + "loss": 0.6069, + "step": 929 + }, + { + "epoch": 0.744, + "grad_norm": 0.40911415988552713, + "learning_rate": 3.2472836821637744e-05, + "loss": 0.6476, + "step": 930 + }, + { + "epoch": 0.7448, + "grad_norm": 0.3943492444774961, + "learning_rate": 3.228188057393895e-05, + "loss": 0.6208, + "step": 931 + }, + { + "epoch": 0.7456, + "grad_norm": 0.4628673671073314, + "learning_rate": 3.209137931341143e-05, + "loss": 0.6966, + "step": 932 + }, + { + "epoch": 0.7464, + "grad_norm": 0.4632930924390139, + "learning_rate": 3.190133432000252e-05, + "loss": 0.6682, + "step": 933 + }, + { + "epoch": 0.7472, + "grad_norm": 0.441107627731924, + "learning_rate": 3.1711746870594086e-05, + "loss": 0.6452, + "step": 934 + }, + { + "epoch": 0.748, + "grad_norm": 0.4149189127802021, + "learning_rate": 3.1522618238993725e-05, + "loss": 0.6694, + "step": 935 + }, + { + "epoch": 0.7488, + "grad_norm": 0.43893968996144517, + "learning_rate": 3.1333949695926324e-05, + "loss": 0.645, + "step": 936 + }, + { + "epoch": 0.7496, + "grad_norm": 0.4077552936838339, + "learning_rate": 3.114574250902558e-05, + "loss": 0.6077, + "step": 937 + }, + { + "epoch": 0.7504, + "grad_norm": 0.40902726640833204, + "learning_rate": 3.0957997942825336e-05, + "loss": 0.7078, + "step": 938 + }, + { + "epoch": 0.7512, + "grad_norm": 0.40357820245744364, + "learning_rate": 3.077071725875116e-05, + "loss": 0.6171, + "step": 939 + }, + { + "epoch": 0.752, + "grad_norm": 0.4303800840793108, + "learning_rate": 3.058390171511196e-05, + "loss": 0.6688, + "step": 940 + }, + { + "epoch": 0.7528, + "grad_norm": 0.41147230904968735, + "learning_rate": 3.0397552567091337e-05, + "loss": 0.6778, + "step": 941 + }, + { + "epoch": 0.7536, + "grad_norm": 0.4314031095444482, + "learning_rate": 3.021167106673928e-05, + "loss": 0.7021, + "step": 942 + }, + { + "epoch": 0.7544, + "grad_norm": 0.4336949835297048, + "learning_rate": 3.0026258462963787e-05, + "loss": 0.7098, + "step": 943 + }, + { + "epoch": 0.7552, + "grad_norm": 0.3764657579232399, + "learning_rate": 2.9841316001522347e-05, + "loss": 0.5987, + "step": 944 + }, + { + "epoch": 0.756, + "grad_norm": 0.38838123922216483, + "learning_rate": 2.9656844925013637e-05, + "loss": 0.659, + "step": 945 + }, + { + "epoch": 0.7568, + "grad_norm": 0.37617887630060026, + "learning_rate": 2.9472846472869298e-05, + "loss": 0.6288, + "step": 946 + }, + { + "epoch": 0.7576, + "grad_norm": 0.45464241972856073, + "learning_rate": 2.9289321881345254e-05, + "loss": 0.6545, + "step": 947 + }, + { + "epoch": 0.7584, + "grad_norm": 0.4588920969504832, + "learning_rate": 2.9106272383513835e-05, + "loss": 0.7424, + "step": 948 + }, + { + "epoch": 0.7592, + "grad_norm": 0.3791854005804689, + "learning_rate": 2.8923699209255284e-05, + "loss": 0.6615, + "step": 949 + }, + { + "epoch": 0.76, + "grad_norm": 0.45851458768557096, + "learning_rate": 2.874160358524931e-05, + "loss": 0.7496, + "step": 950 + }, + { + "epoch": 0.7608, + "grad_norm": 0.44052936313969776, + "learning_rate": 2.8559986734967282e-05, + "loss": 0.6866, + "step": 951 + }, + { + "epoch": 0.7616, + "grad_norm": 0.5055435530031537, + "learning_rate": 2.8378849878663628e-05, + "loss": 0.6968, + "step": 952 + }, + { + "epoch": 0.7624, + "grad_norm": 0.6099090576682689, + "learning_rate": 2.819819423336775e-05, + "loss": 0.6626, + "step": 953 + }, + { + "epoch": 0.7632, + "grad_norm": 0.510700705992291, + "learning_rate": 2.8018021012875994e-05, + "loss": 0.644, + "step": 954 + }, + { + "epoch": 0.764, + "grad_norm": 0.3867445958735247, + "learning_rate": 2.7838331427743282e-05, + "loss": 0.6508, + "step": 955 + }, + { + "epoch": 0.7648, + "grad_norm": 0.41489503739413236, + "learning_rate": 2.7659126685275027e-05, + "loss": 0.6463, + "step": 956 + }, + { + "epoch": 0.7656, + "grad_norm": 0.4590763119487639, + "learning_rate": 2.7480407989519198e-05, + "loss": 0.7661, + "step": 957 + }, + { + "epoch": 0.7664, + "grad_norm": 0.46812897926817154, + "learning_rate": 2.7302176541257986e-05, + "loss": 0.7117, + "step": 958 + }, + { + "epoch": 0.7672, + "grad_norm": 0.36936210084673, + "learning_rate": 2.712443353799984e-05, + "loss": 0.6405, + "step": 959 + }, + { + "epoch": 0.768, + "grad_norm": 0.39761159107484295, + "learning_rate": 2.6947180173971508e-05, + "loss": 0.6744, + "step": 960 + }, + { + "epoch": 0.7688, + "grad_norm": 0.34329473092381946, + "learning_rate": 2.677041764010988e-05, + "loss": 0.6376, + "step": 961 + }, + { + "epoch": 0.7696, + "grad_norm": 0.4007428706691422, + "learning_rate": 2.659414712405398e-05, + "loss": 0.6415, + "step": 962 + }, + { + "epoch": 0.7704, + "grad_norm": 0.44642802669889675, + "learning_rate": 2.6418369810137188e-05, + "loss": 0.6554, + "step": 963 + }, + { + "epoch": 0.7712, + "grad_norm": 0.4575153088661458, + "learning_rate": 2.6243086879379e-05, + "loss": 0.6602, + "step": 964 + }, + { + "epoch": 0.772, + "grad_norm": 0.4069088341499625, + "learning_rate": 2.6068299509477266e-05, + "loss": 0.6455, + "step": 965 + }, + { + "epoch": 0.7728, + "grad_norm": 0.4456047523904863, + "learning_rate": 2.5894008874800325e-05, + "loss": 0.6204, + "step": 966 + }, + { + "epoch": 0.7736, + "grad_norm": 0.3676766196918783, + "learning_rate": 2.5720216146378917e-05, + "loss": 0.6873, + "step": 967 + }, + { + "epoch": 0.7744, + "grad_norm": 0.433664929507155, + "learning_rate": 2.5546922491898495e-05, + "loss": 0.69, + "step": 968 + }, + { + "epoch": 0.7752, + "grad_norm": 0.373162863636098, + "learning_rate": 2.5374129075691265e-05, + "loss": 0.6281, + "step": 969 + }, + { + "epoch": 0.776, + "grad_norm": 0.4117490437737598, + "learning_rate": 2.5201837058728505e-05, + "loss": 0.653, + "step": 970 + }, + { + "epoch": 0.7768, + "grad_norm": 0.428626925312621, + "learning_rate": 2.503004759861258e-05, + "loss": 0.6534, + "step": 971 + }, + { + "epoch": 0.7776, + "grad_norm": 0.7975980075263391, + "learning_rate": 2.485876184956928e-05, + "loss": 0.6506, + "step": 972 + }, + { + "epoch": 0.7784, + "grad_norm": 0.4338313154865431, + "learning_rate": 2.4687980962440072e-05, + "loss": 0.6192, + "step": 973 + }, + { + "epoch": 0.7792, + "grad_norm": 0.47683533218375246, + "learning_rate": 2.451770608467432e-05, + "loss": 0.6969, + "step": 974 + }, + { + "epoch": 0.78, + "grad_norm": 0.3709437292440277, + "learning_rate": 2.4347938360321566e-05, + "loss": 0.5725, + "step": 975 + }, + { + "epoch": 0.7808, + "grad_norm": 0.42017519667588804, + "learning_rate": 2.417867893002387e-05, + "loss": 0.671, + "step": 976 + }, + { + "epoch": 0.7816, + "grad_norm": 0.4221200609873446, + "learning_rate": 2.400992893100822e-05, + "loss": 0.6252, + "step": 977 + }, + { + "epoch": 0.7824, + "grad_norm": 0.37923947147275283, + "learning_rate": 2.3841689497078746e-05, + "loss": 0.6631, + "step": 978 + }, + { + "epoch": 0.7832, + "grad_norm": 0.42255167433776425, + "learning_rate": 2.3673961758609152e-05, + "loss": 0.6233, + "step": 979 + }, + { + "epoch": 0.784, + "grad_norm": 0.44733555174892403, + "learning_rate": 2.3506746842535242e-05, + "loss": 0.6334, + "step": 980 + }, + { + "epoch": 0.7848, + "grad_norm": 0.3676281287462771, + "learning_rate": 2.334004587234717e-05, + "loss": 0.6719, + "step": 981 + }, + { + "epoch": 0.7856, + "grad_norm": 0.46024687896834504, + "learning_rate": 2.3173859968081944e-05, + "loss": 0.6495, + "step": 982 + }, + { + "epoch": 0.7864, + "grad_norm": 0.4360284336010514, + "learning_rate": 2.300819024631603e-05, + "loss": 0.6484, + "step": 983 + }, + { + "epoch": 0.7872, + "grad_norm": 0.35607065705282054, + "learning_rate": 2.2843037820157675e-05, + "loss": 0.585, + "step": 984 + }, + { + "epoch": 0.788, + "grad_norm": 0.4318624799063715, + "learning_rate": 2.26784037992395e-05, + "loss": 0.6451, + "step": 985 + }, + { + "epoch": 0.7888, + "grad_norm": 0.42463272975768457, + "learning_rate": 2.251428928971102e-05, + "loss": 0.6471, + "step": 986 + }, + { + "epoch": 0.7896, + "grad_norm": 0.42297250293547317, + "learning_rate": 2.2350695394231345e-05, + "loss": 0.6703, + "step": 987 + }, + { + "epoch": 0.7904, + "grad_norm": 0.3743925777371385, + "learning_rate": 2.2187623211961562e-05, + "loss": 0.6716, + "step": 988 + }, + { + "epoch": 0.7912, + "grad_norm": 0.4067119571591152, + "learning_rate": 2.2025073838557454e-05, + "loss": 0.7036, + "step": 989 + }, + { + "epoch": 0.792, + "grad_norm": 0.41406160097681877, + "learning_rate": 2.1863048366162208e-05, + "loss": 0.6514, + "step": 990 + }, + { + "epoch": 0.7928, + "grad_norm": 0.4183605374845178, + "learning_rate": 2.1701547883398922e-05, + "loss": 0.6575, + "step": 991 + }, + { + "epoch": 0.7936, + "grad_norm": 0.6337103314644563, + "learning_rate": 2.1540573475363402e-05, + "loss": 0.6252, + "step": 992 + }, + { + "epoch": 0.7944, + "grad_norm": 0.4906999686648005, + "learning_rate": 2.138012622361689e-05, + "loss": 0.695, + "step": 993 + }, + { + "epoch": 0.7952, + "grad_norm": 0.5225891237739163, + "learning_rate": 2.1220207206178688e-05, + "loss": 0.6964, + "step": 994 + }, + { + "epoch": 0.796, + "grad_norm": 0.4930718308475257, + "learning_rate": 2.106081749751897e-05, + "loss": 0.7037, + "step": 995 + }, + { + "epoch": 0.7968, + "grad_norm": 0.5276861081614141, + "learning_rate": 2.0901958168551638e-05, + "loss": 0.622, + "step": 996 + }, + { + "epoch": 0.7976, + "grad_norm": 0.42832404920430694, + "learning_rate": 2.0743630286627002e-05, + "loss": 0.6069, + "step": 997 + }, + { + "epoch": 0.7984, + "grad_norm": 0.466458746148192, + "learning_rate": 2.058583491552465e-05, + "loss": 0.6756, + "step": 998 + }, + { + "epoch": 0.7992, + "grad_norm": 0.4544678392027321, + "learning_rate": 2.0428573115446392e-05, + "loss": 0.6864, + "step": 999 + }, + { + "epoch": 0.8, + "grad_norm": 0.3826745785572623, + "learning_rate": 2.027184594300898e-05, + "loss": 0.6347, + "step": 1000 + }, + { + "epoch": 0.8008, + "grad_norm": 0.43155318180539814, + "learning_rate": 2.011565445123711e-05, + "loss": 0.651, + "step": 1001 + }, + { + "epoch": 0.8016, + "grad_norm": 0.4994300693272774, + "learning_rate": 1.995999968955641e-05, + "loss": 0.7049, + "step": 1002 + }, + { + "epoch": 0.8024, + "grad_norm": 0.5943748518338728, + "learning_rate": 1.980488270378612e-05, + "loss": 0.64, + "step": 1003 + }, + { + "epoch": 0.8032, + "grad_norm": 0.47217079552895913, + "learning_rate": 1.9650304536132426e-05, + "loss": 0.6817, + "step": 1004 + }, + { + "epoch": 0.804, + "grad_norm": 0.49365401456324437, + "learning_rate": 1.9496266225181248e-05, + "loss": 0.6657, + "step": 1005 + }, + { + "epoch": 0.8048, + "grad_norm": 0.43436189418749294, + "learning_rate": 1.9342768805891178e-05, + "loss": 0.6456, + "step": 1006 + }, + { + "epoch": 0.8056, + "grad_norm": 0.6684493187722501, + "learning_rate": 1.918981330958678e-05, + "loss": 0.636, + "step": 1007 + }, + { + "epoch": 0.8064, + "grad_norm": 0.4405913490891985, + "learning_rate": 1.903740076395151e-05, + "loss": 0.6225, + "step": 1008 + }, + { + "epoch": 0.8072, + "grad_norm": 0.410785975404595, + "learning_rate": 1.8885532193020704e-05, + "loss": 0.6639, + "step": 1009 + }, + { + "epoch": 0.808, + "grad_norm": 0.36547794828352415, + "learning_rate": 1.8734208617174988e-05, + "loss": 0.5788, + "step": 1010 + }, + { + "epoch": 0.8088, + "grad_norm": 0.4372170956601188, + "learning_rate": 1.8583431053133127e-05, + "loss": 0.6536, + "step": 1011 + }, + { + "epoch": 0.8096, + "grad_norm": 0.5009233718100731, + "learning_rate": 1.8433200513945337e-05, + "loss": 0.757, + "step": 1012 + }, + { + "epoch": 0.8104, + "grad_norm": 0.4056217110684542, + "learning_rate": 1.8283518008986567e-05, + "loss": 0.6467, + "step": 1013 + }, + { + "epoch": 0.8112, + "grad_norm": 0.46129553983661364, + "learning_rate": 1.8134384543949478e-05, + "loss": 0.6756, + "step": 1014 + }, + { + "epoch": 0.812, + "grad_norm": 0.46185062607524163, + "learning_rate": 1.7985801120837865e-05, + "loss": 0.7277, + "step": 1015 + }, + { + "epoch": 0.8128, + "grad_norm": 0.41935766814476017, + "learning_rate": 1.783776873795994e-05, + "loss": 0.5969, + "step": 1016 + }, + { + "epoch": 0.8136, + "grad_norm": 0.4443093688967267, + "learning_rate": 1.7690288389921493e-05, + "loss": 0.6993, + "step": 1017 + }, + { + "epoch": 0.8144, + "grad_norm": 0.3942385246217329, + "learning_rate": 1.754336106761927e-05, + "loss": 0.6688, + "step": 1018 + }, + { + "epoch": 0.8152, + "grad_norm": 0.42821894152387097, + "learning_rate": 1.739698775823442e-05, + "loss": 0.7111, + "step": 1019 + }, + { + "epoch": 0.816, + "grad_norm": 0.3662724786286339, + "learning_rate": 1.7251169445225657e-05, + "loss": 0.6564, + "step": 1020 + }, + { + "epoch": 0.8168, + "grad_norm": 0.36878600508252474, + "learning_rate": 1.7105907108322816e-05, + "loss": 0.6006, + "step": 1021 + }, + { + "epoch": 0.8176, + "grad_norm": 0.4065974482647705, + "learning_rate": 1.696120172352025e-05, + "loss": 0.6344, + "step": 1022 + }, + { + "epoch": 0.8184, + "grad_norm": 0.39792442091365, + "learning_rate": 1.6817054263070174e-05, + "loss": 0.6029, + "step": 1023 + }, + { + "epoch": 0.8192, + "grad_norm": 0.37137896118893476, + "learning_rate": 1.6673465695476232e-05, + "loss": 0.6465, + "step": 1024 + }, + { + "epoch": 0.82, + "grad_norm": 0.42008803683308704, + "learning_rate": 1.6530436985486996e-05, + "loss": 0.6509, + "step": 1025 + }, + { + "epoch": 0.8208, + "grad_norm": 0.4386831052560911, + "learning_rate": 1.6387969094089316e-05, + "loss": 0.6753, + "step": 1026 + }, + { + "epoch": 0.8216, + "grad_norm": 0.4633449001805289, + "learning_rate": 1.6246062978502164e-05, + "loss": 0.6253, + "step": 1027 + }, + { + "epoch": 0.8224, + "grad_norm": 0.44299630226182, + "learning_rate": 1.6104719592169902e-05, + "loss": 0.6711, + "step": 1028 + }, + { + "epoch": 0.8232, + "grad_norm": 0.41085416199217467, + "learning_rate": 1.5963939884756042e-05, + "loss": 0.6538, + "step": 1029 + }, + { + "epoch": 0.824, + "grad_norm": 0.4484359919195243, + "learning_rate": 1.5823724802136865e-05, + "loss": 0.6143, + "step": 1030 + }, + { + "epoch": 0.8248, + "grad_norm": 0.4282550300484898, + "learning_rate": 1.5684075286394985e-05, + "loss": 0.6673, + "step": 1031 + }, + { + "epoch": 0.8256, + "grad_norm": 0.4191224307491595, + "learning_rate": 1.5544992275813053e-05, + "loss": 0.6968, + "step": 1032 + }, + { + "epoch": 0.8264, + "grad_norm": 0.4396271764513739, + "learning_rate": 1.5406476704867524e-05, + "loss": 0.6818, + "step": 1033 + }, + { + "epoch": 0.8272, + "grad_norm": 0.37387472496239993, + "learning_rate": 1.526852950422226e-05, + "loss": 0.6773, + "step": 1034 + }, + { + "epoch": 0.828, + "grad_norm": 0.48374131896293865, + "learning_rate": 1.5131151600722337e-05, + "loss": 0.6646, + "step": 1035 + }, + { + "epoch": 0.8288, + "grad_norm": 0.43633236192352065, + "learning_rate": 1.4994343917387854e-05, + "loss": 0.7082, + "step": 1036 + }, + { + "epoch": 0.8296, + "grad_norm": 0.4239669218101152, + "learning_rate": 1.485810737340767e-05, + "loss": 0.6653, + "step": 1037 + }, + { + "epoch": 0.8304, + "grad_norm": 0.3864223719763971, + "learning_rate": 1.4722442884133214e-05, + "loss": 0.6042, + "step": 1038 + }, + { + "epoch": 0.8312, + "grad_norm": 0.42555231885020994, + "learning_rate": 1.4587351361072454e-05, + "loss": 0.6615, + "step": 1039 + }, + { + "epoch": 0.832, + "grad_norm": 0.3722682023776691, + "learning_rate": 1.4452833711883628e-05, + "loss": 0.5767, + "step": 1040 + }, + { + "epoch": 0.8328, + "grad_norm": 0.4337623849323105, + "learning_rate": 1.4318890840369182e-05, + "loss": 0.7483, + "step": 1041 + }, + { + "epoch": 0.8336, + "grad_norm": 0.39865898952737927, + "learning_rate": 1.4185523646469822e-05, + "loss": 0.7382, + "step": 1042 + }, + { + "epoch": 0.8344, + "grad_norm": 0.4437071931347057, + "learning_rate": 1.4052733026258281e-05, + "loss": 0.6463, + "step": 1043 + }, + { + "epoch": 0.8352, + "grad_norm": 0.4008299282027897, + "learning_rate": 1.3920519871933424e-05, + "loss": 0.6396, + "step": 1044 + }, + { + "epoch": 0.836, + "grad_norm": 0.4009493165215926, + "learning_rate": 1.3788885071814172e-05, + "loss": 0.6523, + "step": 1045 + }, + { + "epoch": 0.8368, + "grad_norm": 0.5062076346003773, + "learning_rate": 1.3657829510333654e-05, + "loss": 0.5861, + "step": 1046 + }, + { + "epoch": 0.8376, + "grad_norm": 0.4516942937441003, + "learning_rate": 1.3527354068033139e-05, + "loss": 0.691, + "step": 1047 + }, + { + "epoch": 0.8384, + "grad_norm": 0.4853529279383495, + "learning_rate": 1.339745962155613e-05, + "loss": 0.6001, + "step": 1048 + }, + { + "epoch": 0.8392, + "grad_norm": 0.37905471747272673, + "learning_rate": 1.326814704364262e-05, + "loss": 0.6328, + "step": 1049 + }, + { + "epoch": 0.84, + "grad_norm": 0.5284608677347281, + "learning_rate": 1.3139417203123027e-05, + "loss": 0.6908, + "step": 1050 + }, + { + "epoch": 0.8408, + "grad_norm": 0.38608717956848915, + "learning_rate": 1.3011270964912459e-05, + "loss": 0.6388, + "step": 1051 + }, + { + "epoch": 0.8416, + "grad_norm": 0.44267069516551083, + "learning_rate": 1.2883709190004955e-05, + "loss": 0.6231, + "step": 1052 + }, + { + "epoch": 0.8424, + "grad_norm": 0.3786824586900781, + "learning_rate": 1.275673273546758e-05, + "loss": 0.6136, + "step": 1053 + }, + { + "epoch": 0.8432, + "grad_norm": 0.4270504804937909, + "learning_rate": 1.263034245443473e-05, + "loss": 0.6791, + "step": 1054 + }, + { + "epoch": 0.844, + "grad_norm": 0.5038946234523334, + "learning_rate": 1.2504539196102439e-05, + "loss": 0.7804, + "step": 1055 + }, + { + "epoch": 0.8448, + "grad_norm": 0.4623293214277537, + "learning_rate": 1.2379323805722576e-05, + "loss": 0.6766, + "step": 1056 + }, + { + "epoch": 0.8456, + "grad_norm": 0.44301346960741594, + "learning_rate": 1.2254697124597237e-05, + "loss": 0.6477, + "step": 1057 + }, + { + "epoch": 0.8464, + "grad_norm": 0.6954121952109877, + "learning_rate": 1.2130659990073146e-05, + "loss": 0.6503, + "step": 1058 + }, + { + "epoch": 0.8472, + "grad_norm": 0.4044191973360963, + "learning_rate": 1.2007213235535786e-05, + "loss": 0.6569, + "step": 1059 + }, + { + "epoch": 0.848, + "grad_norm": 0.4660644586496118, + "learning_rate": 1.1884357690404158e-05, + "loss": 0.686, + "step": 1060 + }, + { + "epoch": 0.8488, + "grad_norm": 0.5634031960415944, + "learning_rate": 1.176209418012495e-05, + "loss": 0.6161, + "step": 1061 + }, + { + "epoch": 0.8496, + "grad_norm": 0.40473907603181797, + "learning_rate": 1.1640423526166988e-05, + "loss": 0.6308, + "step": 1062 + }, + { + "epoch": 0.8504, + "grad_norm": 0.440199941429629, + "learning_rate": 1.1519346546015907e-05, + "loss": 0.6437, + "step": 1063 + }, + { + "epoch": 0.8512, + "grad_norm": 0.42154096060283036, + "learning_rate": 1.1398864053168534e-05, + "loss": 0.6591, + "step": 1064 + }, + { + "epoch": 0.852, + "grad_norm": 0.3875040322959066, + "learning_rate": 1.1278976857127311e-05, + "loss": 0.6605, + "step": 1065 + }, + { + "epoch": 0.8528, + "grad_norm": 0.4303409905159609, + "learning_rate": 1.1159685763395111e-05, + "loss": 0.659, + "step": 1066 + }, + { + "epoch": 0.8536, + "grad_norm": 0.3674634232741053, + "learning_rate": 1.1040991573469629e-05, + "loss": 0.6518, + "step": 1067 + }, + { + "epoch": 0.8544, + "grad_norm": 0.7316007602226419, + "learning_rate": 1.0922895084838037e-05, + "loss": 0.7053, + "step": 1068 + }, + { + "epoch": 0.8552, + "grad_norm": 0.49302790603798097, + "learning_rate": 1.0805397090971737e-05, + "loss": 0.6461, + "step": 1069 + }, + { + "epoch": 0.856, + "grad_norm": 0.3723886365425195, + "learning_rate": 1.0688498381320855e-05, + "loss": 0.6411, + "step": 1070 + }, + { + "epoch": 0.8568, + "grad_norm": 0.37854206745230956, + "learning_rate": 1.057219974130903e-05, + "loss": 0.644, + "step": 1071 + }, + { + "epoch": 0.8576, + "grad_norm": 0.4193425252301274, + "learning_rate": 1.045650195232819e-05, + "loss": 0.6764, + "step": 1072 + }, + { + "epoch": 0.8584, + "grad_norm": 0.35650860727341804, + "learning_rate": 1.0341405791733183e-05, + "loss": 0.5994, + "step": 1073 + }, + { + "epoch": 0.8592, + "grad_norm": 0.44021936147114227, + "learning_rate": 1.0226912032836611e-05, + "loss": 0.6186, + "step": 1074 + }, + { + "epoch": 0.86, + "grad_norm": 0.4162720336568276, + "learning_rate": 1.0113021444903726e-05, + "loss": 0.6584, + "step": 1075 + }, + { + "epoch": 0.8608, + "grad_norm": 0.44300557421485726, + "learning_rate": 9.999734793146998e-06, + "loss": 0.6626, + "step": 1076 + }, + { + "epoch": 0.8616, + "grad_norm": 0.47213299117838864, + "learning_rate": 9.887052838721322e-06, + "loss": 0.6772, + "step": 1077 + }, + { + "epoch": 0.8624, + "grad_norm": 0.38534027867094606, + "learning_rate": 9.774976338718677e-06, + "loss": 0.6308, + "step": 1078 + }, + { + "epoch": 0.8632, + "grad_norm": 0.4952808746046796, + "learning_rate": 9.663506046162985e-06, + "loss": 0.6861, + "step": 1079 + }, + { + "epoch": 0.864, + "grad_norm": 0.4208322108941599, + "learning_rate": 9.552642710005299e-06, + "loss": 0.6718, + "step": 1080 + }, + { + "epoch": 0.8648, + "grad_norm": 0.4704840462474498, + "learning_rate": 9.44238707511862e-06, + "loss": 0.7059, + "step": 1081 + }, + { + "epoch": 0.8656, + "grad_norm": 0.40080120473996755, + "learning_rate": 9.332739882292752e-06, + "loss": 0.6246, + "step": 1082 + }, + { + "epoch": 0.8664, + "grad_norm": 0.44759857005995657, + "learning_rate": 9.22370186822965e-06, + "loss": 0.6728, + "step": 1083 + }, + { + "epoch": 0.8672, + "grad_norm": 0.3960185199853362, + "learning_rate": 9.115273765538202e-06, + "loss": 0.5488, + "step": 1084 + }, + { + "epoch": 0.868, + "grad_norm": 0.36610015351743425, + "learning_rate": 9.0074563027294e-06, + "loss": 0.5978, + "step": 1085 + }, + { + "epoch": 0.8688, + "grad_norm": 0.4472618735917445, + "learning_rate": 8.900250204211514e-06, + "loss": 0.6497, + "step": 1086 + }, + { + "epoch": 0.8696, + "grad_norm": 0.464807452611619, + "learning_rate": 8.79365619028507e-06, + "loss": 0.7345, + "step": 1087 + }, + { + "epoch": 0.8704, + "grad_norm": 0.4709506037877623, + "learning_rate": 8.687674977138116e-06, + "loss": 0.7311, + "step": 1088 + }, + { + "epoch": 0.8712, + "grad_norm": 0.3358466208808786, + "learning_rate": 8.582307276841462e-06, + "loss": 0.564, + "step": 1089 + }, + { + "epoch": 0.872, + "grad_norm": 0.42600342697067267, + "learning_rate": 8.47755379734373e-06, + "loss": 0.6216, + "step": 1090 + }, + { + "epoch": 0.8728, + "grad_norm": 0.45028467405895545, + "learning_rate": 8.37341524246672e-06, + "loss": 0.6409, + "step": 1091 + }, + { + "epoch": 0.8736, + "grad_norm": 0.557329627370974, + "learning_rate": 8.269892311900696e-06, + "loss": 0.7207, + "step": 1092 + }, + { + "epoch": 0.8744, + "grad_norm": 0.4136112769160197, + "learning_rate": 8.166985701199582e-06, + "loss": 0.5935, + "step": 1093 + }, + { + "epoch": 0.8752, + "grad_norm": 0.37329400859832346, + "learning_rate": 8.064696101776358e-06, + "loss": 0.622, + "step": 1094 + }, + { + "epoch": 0.876, + "grad_norm": 0.43682334669658196, + "learning_rate": 7.963024200898462e-06, + "loss": 0.6404, + "step": 1095 + }, + { + "epoch": 0.8768, + "grad_norm": 0.3748636097341092, + "learning_rate": 7.861970681683051e-06, + "loss": 0.6223, + "step": 1096 + }, + { + "epoch": 0.8776, + "grad_norm": 0.42722010259960563, + "learning_rate": 7.761536223092458e-06, + "loss": 0.6276, + "step": 1097 + }, + { + "epoch": 0.8784, + "grad_norm": 0.4518428694340991, + "learning_rate": 7.661721499929753e-06, + "loss": 0.6902, + "step": 1098 + }, + { + "epoch": 0.8792, + "grad_norm": 0.3623217419092838, + "learning_rate": 7.562527182833978e-06, + "loss": 0.6293, + "step": 1099 + }, + { + "epoch": 0.88, + "grad_norm": 0.5573806212574868, + "learning_rate": 7.463953938275858e-06, + "loss": 0.6891, + "step": 1100 + }, + { + "epoch": 0.8808, + "grad_norm": 0.3991199845067119, + "learning_rate": 7.366002428553153e-06, + "loss": 0.7093, + "step": 1101 + }, + { + "epoch": 0.8816, + "grad_norm": 0.4170610132690793, + "learning_rate": 7.2686733117863784e-06, + "loss": 0.6367, + "step": 1102 + }, + { + "epoch": 0.8824, + "grad_norm": 0.3596897692127439, + "learning_rate": 7.171967241914224e-06, + "loss": 0.612, + "step": 1103 + }, + { + "epoch": 0.8832, + "grad_norm": 0.41697308922376014, + "learning_rate": 7.07588486868922e-06, + "loss": 0.688, + "step": 1104 + }, + { + "epoch": 0.884, + "grad_norm": 0.4112266335214538, + "learning_rate": 6.980426837673437e-06, + "loss": 0.596, + "step": 1105 + }, + { + "epoch": 0.8848, + "grad_norm": 0.484535098122851, + "learning_rate": 6.8855937902340576e-06, + "loss": 0.6357, + "step": 1106 + }, + { + "epoch": 0.8856, + "grad_norm": 0.4145119576764233, + "learning_rate": 6.791386363539065e-06, + "loss": 0.6603, + "step": 1107 + }, + { + "epoch": 0.8864, + "grad_norm": 0.3845276967918898, + "learning_rate": 6.6978051905530855e-06, + "loss": 0.629, + "step": 1108 + }, + { + "epoch": 0.8872, + "grad_norm": 0.478000040837266, + "learning_rate": 6.604850900032955e-06, + "loss": 0.6603, + "step": 1109 + }, + { + "epoch": 0.888, + "grad_norm": 0.4637204727474049, + "learning_rate": 6.512524116523633e-06, + "loss": 0.642, + "step": 1110 + }, + { + "epoch": 0.8888, + "grad_norm": 0.3928239925222747, + "learning_rate": 6.420825460353974e-06, + "loss": 0.6576, + "step": 1111 + }, + { + "epoch": 0.8896, + "grad_norm": 0.4231514273592192, + "learning_rate": 6.329755547632499e-06, + "loss": 0.6574, + "step": 1112 + }, + { + "epoch": 0.8904, + "grad_norm": 0.48355232084777794, + "learning_rate": 6.239314990243339e-06, + "loss": 0.6217, + "step": 1113 + }, + { + "epoch": 0.8912, + "grad_norm": 0.5947856046562446, + "learning_rate": 6.149504395842087e-06, + "loss": 0.7306, + "step": 1114 + }, + { + "epoch": 0.892, + "grad_norm": 0.36458883310346435, + "learning_rate": 6.0603243678516995e-06, + "loss": 0.6211, + "step": 1115 + }, + { + "epoch": 0.8928, + "grad_norm": 0.3960739949887535, + "learning_rate": 5.971775505458444e-06, + "loss": 0.6589, + "step": 1116 + }, + { + "epoch": 0.8936, + "grad_norm": 0.36023352854279544, + "learning_rate": 5.883858403607967e-06, + "loss": 0.6245, + "step": 1117 + }, + { + "epoch": 0.8944, + "grad_norm": 0.40780844491338397, + "learning_rate": 5.7965736530010916e-06, + "loss": 0.6174, + "step": 1118 + }, + { + "epoch": 0.8952, + "grad_norm": 0.582970005377322, + "learning_rate": 5.7099218400900716e-06, + "loss": 0.7372, + "step": 1119 + }, + { + "epoch": 0.896, + "grad_norm": 0.38500948239471194, + "learning_rate": 5.623903547074549e-06, + "loss": 0.5915, + "step": 1120 + }, + { + "epoch": 0.8968, + "grad_norm": 0.4287912176074291, + "learning_rate": 5.538519351897575e-06, + "loss": 0.6193, + "step": 1121 + }, + { + "epoch": 0.8976, + "grad_norm": 0.4198902015159911, + "learning_rate": 5.453769828241872e-06, + "loss": 0.6236, + "step": 1122 + }, + { + "epoch": 0.8984, + "grad_norm": 0.38233551950051503, + "learning_rate": 5.369655545525909e-06, + "loss": 0.6016, + "step": 1123 + }, + { + "epoch": 0.8992, + "grad_norm": 0.5395772738169392, + "learning_rate": 5.286177068899989e-06, + "loss": 0.6322, + "step": 1124 + }, + { + "epoch": 0.9, + "grad_norm": 0.44702077463576917, + "learning_rate": 5.2033349592426335e-06, + "loss": 0.6535, + "step": 1125 + }, + { + "epoch": 0.9008, + "grad_norm": 0.41079916482077417, + "learning_rate": 5.121129773156663e-06, + "loss": 0.6167, + "step": 1126 + }, + { + "epoch": 0.9016, + "grad_norm": 0.37665556305187514, + "learning_rate": 5.039562062965508e-06, + "loss": 0.6007, + "step": 1127 + }, + { + "epoch": 0.9024, + "grad_norm": 0.602870889721456, + "learning_rate": 4.95863237670956e-06, + "loss": 0.5824, + "step": 1128 + }, + { + "epoch": 0.9032, + "grad_norm": 0.4234135876004459, + "learning_rate": 4.87834125814235e-06, + "loss": 0.6431, + "step": 1129 + }, + { + "epoch": 0.904, + "grad_norm": 0.36822255437976564, + "learning_rate": 4.798689246727006e-06, + "loss": 0.652, + "step": 1130 + }, + { + "epoch": 0.9048, + "grad_norm": 0.36510134079543993, + "learning_rate": 4.719676877632639e-06, + "loss": 0.5875, + "step": 1131 + }, + { + "epoch": 0.9056, + "grad_norm": 0.3729073618035483, + "learning_rate": 4.641304681730641e-06, + "loss": 0.6507, + "step": 1132 + }, + { + "epoch": 0.9064, + "grad_norm": 0.3523972331305135, + "learning_rate": 4.563573185591219e-06, + "loss": 0.6131, + "step": 1133 + }, + { + "epoch": 0.9072, + "grad_norm": 0.4072440360190982, + "learning_rate": 4.486482911479839e-06, + "loss": 0.6777, + "step": 1134 + }, + { + "epoch": 0.908, + "grad_norm": 0.34919183194492326, + "learning_rate": 4.4100343773536225e-06, + "loss": 0.6133, + "step": 1135 + }, + { + "epoch": 0.9088, + "grad_norm": 0.42513790124721773, + "learning_rate": 4.3342280968580285e-06, + "loss": 0.6218, + "step": 1136 + }, + { + "epoch": 0.9096, + "grad_norm": 0.46900697721960166, + "learning_rate": 4.259064579323302e-06, + "loss": 0.6291, + "step": 1137 + }, + { + "epoch": 0.9104, + "grad_norm": 0.49638656881266513, + "learning_rate": 4.184544329761009e-06, + "loss": 0.654, + "step": 1138 + }, + { + "epoch": 0.9112, + "grad_norm": 0.4080519919725839, + "learning_rate": 4.1106678488607495e-06, + "loss": 0.7047, + "step": 1139 + }, + { + "epoch": 0.912, + "grad_norm": 0.4208253803362493, + "learning_rate": 4.037435632986786e-06, + "loss": 0.5866, + "step": 1140 + }, + { + "epoch": 0.9128, + "grad_norm": 0.4925975823169105, + "learning_rate": 3.964848174174541e-06, + "loss": 0.7029, + "step": 1141 + }, + { + "epoch": 0.9136, + "grad_norm": 0.47362666344716114, + "learning_rate": 3.892905960127546e-06, + "loss": 0.7168, + "step": 1142 + }, + { + "epoch": 0.9144, + "grad_norm": 0.4397762197275533, + "learning_rate": 3.821609474213983e-06, + "loss": 0.5982, + "step": 1143 + }, + { + "epoch": 0.9152, + "grad_norm": 0.4773330206512832, + "learning_rate": 3.750959195463466e-06, + "loss": 0.6696, + "step": 1144 + }, + { + "epoch": 0.916, + "grad_norm": 0.41715927620580495, + "learning_rate": 3.6809555985639068e-06, + "loss": 0.6236, + "step": 1145 + }, + { + "epoch": 0.9168, + "grad_norm": 0.4872669921994879, + "learning_rate": 3.611599153858214e-06, + "loss": 0.6887, + "step": 1146 + }, + { + "epoch": 0.9176, + "grad_norm": 0.3739624404078296, + "learning_rate": 3.5428903273411863e-06, + "loss": 0.5864, + "step": 1147 + }, + { + "epoch": 0.9184, + "grad_norm": 0.4644012923456862, + "learning_rate": 3.4748295806564356e-06, + "loss": 0.5956, + "step": 1148 + }, + { + "epoch": 0.9192, + "grad_norm": 0.5171644364848799, + "learning_rate": 3.40741737109318e-06, + "loss": 0.6794, + "step": 1149 + }, + { + "epoch": 0.92, + "grad_norm": 0.5036272549775639, + "learning_rate": 3.3406541515832003e-06, + "loss": 0.6631, + "step": 1150 + }, + { + "epoch": 0.9208, + "grad_norm": 0.4084323550438053, + "learning_rate": 3.2745403706978872e-06, + "loss": 0.6714, + "step": 1151 + }, + { + "epoch": 0.9216, + "grad_norm": 0.4582317574681205, + "learning_rate": 3.209076472645112e-06, + "loss": 0.6538, + "step": 1152 + }, + { + "epoch": 0.9224, + "grad_norm": 0.39260460346022474, + "learning_rate": 3.1442628972662704e-06, + "loss": 0.6378, + "step": 1153 + }, + { + "epoch": 0.9232, + "grad_norm": 0.40838108087739916, + "learning_rate": 3.0801000800333877e-06, + "loss": 0.6872, + "step": 1154 + }, + { + "epoch": 0.924, + "grad_norm": 0.38946336528002273, + "learning_rate": 3.0165884520461316e-06, + "loss": 0.6622, + "step": 1155 + }, + { + "epoch": 0.9248, + "grad_norm": 0.37207487789605137, + "learning_rate": 2.9537284400289355e-06, + "loss": 0.6444, + "step": 1156 + }, + { + "epoch": 0.9256, + "grad_norm": 0.49153673402231457, + "learning_rate": 2.8915204663281013e-06, + "loss": 0.7307, + "step": 1157 + }, + { + "epoch": 0.9264, + "grad_norm": 0.40195076339811375, + "learning_rate": 2.8299649489090475e-06, + "loss": 0.5939, + "step": 1158 + }, + { + "epoch": 0.9272, + "grad_norm": 0.41042476999125416, + "learning_rate": 2.7690623013533976e-06, + "loss": 0.5692, + "step": 1159 + }, + { + "epoch": 0.928, + "grad_norm": 0.4685435198499234, + "learning_rate": 2.708812932856253e-06, + "loss": 0.5941, + "step": 1160 + }, + { + "epoch": 0.9288, + "grad_norm": 0.3563779620133779, + "learning_rate": 2.649217248223468e-06, + "loss": 0.6011, + "step": 1161 + }, + { + "epoch": 0.9296, + "grad_norm": 0.3983783041419285, + "learning_rate": 2.590275647868867e-06, + "loss": 0.6511, + "step": 1162 + }, + { + "epoch": 0.9304, + "grad_norm": 0.3517579195645366, + "learning_rate": 2.5319885278115906e-06, + "loss": 0.6642, + "step": 1163 + }, + { + "epoch": 0.9312, + "grad_norm": 0.370928425327566, + "learning_rate": 2.4743562796734622e-06, + "loss": 0.6292, + "step": 1164 + }, + { + "epoch": 0.932, + "grad_norm": 0.4060622147021784, + "learning_rate": 2.4173792906762804e-06, + "loss": 0.6731, + "step": 1165 + }, + { + "epoch": 0.9328, + "grad_norm": 0.5931334860219275, + "learning_rate": 2.3610579436393e-06, + "loss": 0.5995, + "step": 1166 + }, + { + "epoch": 0.9336, + "grad_norm": 0.5331085365721472, + "learning_rate": 2.3053926169765984e-06, + "loss": 0.6593, + "step": 1167 + }, + { + "epoch": 0.9344, + "grad_norm": 0.3598383122326138, + "learning_rate": 2.250383684694579e-06, + "loss": 0.6384, + "step": 1168 + }, + { + "epoch": 0.9352, + "grad_norm": 0.3915947721446365, + "learning_rate": 2.1960315163894075e-06, + "loss": 0.5701, + "step": 1169 + }, + { + "epoch": 0.936, + "grad_norm": 0.3821055121736776, + "learning_rate": 2.1423364772445887e-06, + "loss": 0.6553, + "step": 1170 + }, + { + "epoch": 0.9368, + "grad_norm": 0.3602041145233819, + "learning_rate": 2.0892989280284823e-06, + "loss": 0.7049, + "step": 1171 + }, + { + "epoch": 0.9376, + "grad_norm": 0.44548436852189566, + "learning_rate": 2.036919225091827e-06, + "loss": 0.668, + "step": 1172 + }, + { + "epoch": 0.9384, + "grad_norm": 0.4373904941373431, + "learning_rate": 1.9851977203654835e-06, + "loss": 0.6402, + "step": 1173 + }, + { + "epoch": 0.9392, + "grad_norm": 0.438827373751763, + "learning_rate": 1.9341347613579087e-06, + "loss": 0.6561, + "step": 1174 + }, + { + "epoch": 0.94, + "grad_norm": 0.4362264956140032, + "learning_rate": 1.8837306911529184e-06, + "loss": 0.643, + "step": 1175 + }, + { + "epoch": 0.9408, + "grad_norm": 0.4044806923870212, + "learning_rate": 1.8339858484073935e-06, + "loss": 0.6624, + "step": 1176 + }, + { + "epoch": 0.9416, + "grad_norm": 0.4000772256196655, + "learning_rate": 1.7849005673489127e-06, + "loss": 0.5853, + "step": 1177 + }, + { + "epoch": 0.9424, + "grad_norm": 0.4083796539848895, + "learning_rate": 1.7364751777736332e-06, + "loss": 0.5917, + "step": 1178 + }, + { + "epoch": 0.9432, + "grad_norm": 0.3894230156325759, + "learning_rate": 1.6887100050439587e-06, + "loss": 0.6288, + "step": 1179 + }, + { + "epoch": 0.944, + "grad_norm": 0.36107007695240323, + "learning_rate": 1.6416053700863964e-06, + "loss": 0.5881, + "step": 1180 + }, + { + "epoch": 0.9448, + "grad_norm": 0.45402894888204026, + "learning_rate": 1.595161589389449e-06, + "loss": 0.717, + "step": 1181 + }, + { + "epoch": 0.9456, + "grad_norm": 0.34806835664596164, + "learning_rate": 1.5493789750014031e-06, + "loss": 0.6233, + "step": 1182 + }, + { + "epoch": 0.9464, + "grad_norm": 0.39793870698491896, + "learning_rate": 1.5042578345283108e-06, + "loss": 0.6874, + "step": 1183 + }, + { + "epoch": 0.9472, + "grad_norm": 0.39401129084286485, + "learning_rate": 1.459798471131868e-06, + "loss": 0.6662, + "step": 1184 + }, + { + "epoch": 0.948, + "grad_norm": 0.37772600752742874, + "learning_rate": 1.4160011835273934e-06, + "loss": 0.6862, + "step": 1185 + }, + { + "epoch": 0.9488, + "grad_norm": 0.37450273581911875, + "learning_rate": 1.3728662659818204e-06, + "loss": 0.6451, + "step": 1186 + }, + { + "epoch": 0.9496, + "grad_norm": 0.38176089301787547, + "learning_rate": 1.3303940083117527e-06, + "loss": 0.6373, + "step": 1187 + }, + { + "epoch": 0.9504, + "grad_norm": 0.35358647926829284, + "learning_rate": 1.2885846958814673e-06, + "loss": 0.663, + "step": 1188 + }, + { + "epoch": 0.9512, + "grad_norm": 0.4602139471488554, + "learning_rate": 1.2474386096010039e-06, + "loss": 0.6226, + "step": 1189 + }, + { + "epoch": 0.952, + "grad_norm": 0.5416889606730806, + "learning_rate": 1.2069560259243328e-06, + "loss": 0.6362, + "step": 1190 + }, + { + "epoch": 0.9528, + "grad_norm": 0.4036750223093132, + "learning_rate": 1.1671372168474138e-06, + "loss": 0.6532, + "step": 1191 + }, + { + "epoch": 0.9536, + "grad_norm": 0.38404678255442, + "learning_rate": 1.1279824499064396e-06, + "loss": 0.6097, + "step": 1192 + }, + { + "epoch": 0.9544, + "grad_norm": 0.4267377090747644, + "learning_rate": 1.089491988176017e-06, + "loss": 0.6923, + "step": 1193 + }, + { + "epoch": 0.9552, + "grad_norm": 0.41348061051676394, + "learning_rate": 1.0516660902673448e-06, + "loss": 0.688, + "step": 1194 + }, + { + "epoch": 0.956, + "grad_norm": 0.4185678843563295, + "learning_rate": 1.014505010326583e-06, + "loss": 0.6636, + "step": 1195 + }, + { + "epoch": 0.9568, + "grad_norm": 0.3344176284141208, + "learning_rate": 9.780089980330642e-07, + "loss": 0.5905, + "step": 1196 + }, + { + "epoch": 0.9576, + "grad_norm": 0.43078748964380953, + "learning_rate": 9.421782985976068e-07, + "loss": 0.6599, + "step": 1197 + }, + { + "epoch": 0.9584, + "grad_norm": 0.3935543093104591, + "learning_rate": 9.070131527609604e-07, + "loss": 0.6527, + "step": 1198 + }, + { + "epoch": 0.9592, + "grad_norm": 0.3332478313177662, + "learning_rate": 8.725137967920738e-07, + "loss": 0.5986, + "step": 1199 + }, + { + "epoch": 0.96, + "grad_norm": 0.43121211559573536, + "learning_rate": 8.386804624865851e-07, + "loss": 0.6474, + "step": 1200 + }, + { + "epoch": 0.9608, + "grad_norm": 0.38363861306199587, + "learning_rate": 8.055133771652345e-07, + "loss": 0.6647, + "step": 1201 + }, + { + "epoch": 0.9616, + "grad_norm": 0.36619989098104866, + "learning_rate": 7.730127636723539e-07, + "loss": 0.6225, + "step": 1202 + }, + { + "epoch": 0.9624, + "grad_norm": 0.4718860301100266, + "learning_rate": 7.411788403743237e-07, + "loss": 0.7245, + "step": 1203 + }, + { + "epoch": 0.9632, + "grad_norm": 0.40222763581499665, + "learning_rate": 7.100118211581852e-07, + "loss": 0.642, + "step": 1204 + }, + { + "epoch": 0.964, + "grad_norm": 0.4880004363496894, + "learning_rate": 6.7951191543012e-07, + "loss": 0.7039, + "step": 1205 + }, + { + "epoch": 0.9648, + "grad_norm": 0.3837309331013992, + "learning_rate": 6.496793281141056e-07, + "loss": 0.6673, + "step": 1206 + }, + { + "epoch": 0.9656, + "grad_norm": 0.4009821811006916, + "learning_rate": 6.205142596505176e-07, + "loss": 0.6035, + "step": 1207 + }, + { + "epoch": 0.9664, + "grad_norm": 0.404170233361831, + "learning_rate": 5.920169059947411e-07, + "loss": 0.6529, + "step": 1208 + }, + { + "epoch": 0.9672, + "grad_norm": 0.44007382139111645, + "learning_rate": 5.64187458615939e-07, + "loss": 0.6554, + "step": 1209 + }, + { + "epoch": 0.968, + "grad_norm": 0.40124653569122576, + "learning_rate": 5.370261044956971e-07, + "loss": 0.6451, + "step": 1210 + }, + { + "epoch": 0.9688, + "grad_norm": 0.4067421755106845, + "learning_rate": 5.105330261267916e-07, + "loss": 0.6618, + "step": 1211 + }, + { + "epoch": 0.9696, + "grad_norm": 0.42871209202919514, + "learning_rate": 4.847084015119574e-07, + "loss": 0.6348, + "step": 1212 + }, + { + "epoch": 0.9704, + "grad_norm": 0.40897502568707533, + "learning_rate": 4.5955240416271084e-07, + "loss": 0.6503, + "step": 1213 + }, + { + "epoch": 0.9712, + "grad_norm": 0.4115778403597554, + "learning_rate": 4.3506520309813947e-07, + "loss": 0.6875, + "step": 1214 + }, + { + "epoch": 0.972, + "grad_norm": 0.3888626628329367, + "learning_rate": 4.112469628438365e-07, + "loss": 0.6878, + "step": 1215 + }, + { + "epoch": 0.9728, + "grad_norm": 0.3898464069550168, + "learning_rate": 3.8809784343072366e-07, + "loss": 0.6892, + "step": 1216 + }, + { + "epoch": 0.9736, + "grad_norm": 0.4240977211745303, + "learning_rate": 3.6561800039403016e-07, + "loss": 0.6978, + "step": 1217 + }, + { + "epoch": 0.9744, + "grad_norm": 0.4083436974329831, + "learning_rate": 3.4380758477219333e-07, + "loss": 0.648, + "step": 1218 + }, + { + "epoch": 0.9752, + "grad_norm": 0.43683232074338657, + "learning_rate": 3.2266674310589273e-07, + "loss": 0.6371, + "step": 1219 + }, + { + "epoch": 0.976, + "grad_norm": 0.44405962563885487, + "learning_rate": 3.0219561743707326e-07, + "loss": 0.6408, + "step": 1220 + }, + { + "epoch": 0.9768, + "grad_norm": 0.40189618267215416, + "learning_rate": 2.8239434530792365e-07, + "loss": 0.6206, + "step": 1221 + }, + { + "epoch": 0.9776, + "grad_norm": 0.38143520659589464, + "learning_rate": 2.6326305976001055e-07, + "loss": 0.65, + "step": 1222 + }, + { + "epoch": 0.9784, + "grad_norm": 0.553594963156896, + "learning_rate": 2.448018893333681e-07, + "loss": 0.6835, + "step": 1223 + }, + { + "epoch": 0.9792, + "grad_norm": 0.4606935828464384, + "learning_rate": 2.2701095806565432e-07, + "loss": 0.7302, + "step": 1224 + }, + { + "epoch": 0.98, + "grad_norm": 0.5377134495942296, + "learning_rate": 2.098903854912515e-07, + "loss": 0.7018, + "step": 1225 + }, + { + "epoch": 0.9808, + "grad_norm": 0.4447776733368485, + "learning_rate": 1.9344028664056713e-07, + "loss": 0.6602, + "step": 1226 + }, + { + "epoch": 0.9816, + "grad_norm": 0.34772105247308416, + "learning_rate": 1.7766077203915655e-07, + "loss": 0.6197, + "step": 1227 + }, + { + "epoch": 0.9824, + "grad_norm": 0.4392583568148265, + "learning_rate": 1.6255194770704586e-07, + "loss": 0.7154, + "step": 1228 + }, + { + "epoch": 0.9832, + "grad_norm": 0.4253963088334093, + "learning_rate": 1.481139151579991e-07, + "loss": 0.6482, + "step": 1229 + }, + { + "epoch": 0.984, + "grad_norm": 0.39185547598418924, + "learning_rate": 1.3434677139885222e-07, + "loss": 0.6828, + "step": 1230 + }, + { + "epoch": 0.9848, + "grad_norm": 0.4071068410359769, + "learning_rate": 1.2125060892881346e-07, + "loss": 0.6466, + "step": 1231 + }, + { + "epoch": 0.9856, + "grad_norm": 0.579665557785551, + "learning_rate": 1.0882551573891953e-07, + "loss": 0.6259, + "step": 1232 + }, + { + "epoch": 0.9864, + "grad_norm": 0.4045183788046934, + "learning_rate": 9.707157531134713e-08, + "loss": 0.6416, + "step": 1233 + }, + { + "epoch": 0.9872, + "grad_norm": 0.40827472515450947, + "learning_rate": 8.598886661895788e-08, + "loss": 0.6263, + "step": 1234 + }, + { + "epoch": 0.988, + "grad_norm": 0.4048930082559139, + "learning_rate": 7.557746412468758e-08, + "loss": 0.643, + "step": 1235 + }, + { + "epoch": 0.9888, + "grad_norm": 0.3981307050442245, + "learning_rate": 6.583743778106887e-08, + "loss": 0.5864, + "step": 1236 + }, + { + "epoch": 0.9896, + "grad_norm": 0.4202381520920859, + "learning_rate": 5.6768853029787184e-08, + "loss": 0.6426, + "step": 1237 + }, + { + "epoch": 0.9904, + "grad_norm": 0.4132844897416911, + "learning_rate": 4.837177080119215e-08, + "loss": 0.6767, + "step": 1238 + }, + { + "epoch": 0.9912, + "grad_norm": 0.42446177971468574, + "learning_rate": 4.064624751394242e-08, + "loss": 0.6668, + "step": 1239 + }, + { + "epoch": 0.992, + "grad_norm": 0.4365754108889452, + "learning_rate": 3.359233507459481e-08, + "loss": 0.7009, + "step": 1240 + }, + { + "epoch": 0.9928, + "grad_norm": 0.4208860054391876, + "learning_rate": 2.7210080877237976e-08, + "loss": 0.6772, + "step": 1241 + }, + { + "epoch": 0.9936, + "grad_norm": 0.37814548759802874, + "learning_rate": 2.1499527803214846e-08, + "loss": 0.6669, + "step": 1242 + }, + { + "epoch": 0.9944, + "grad_norm": 0.42343131577798393, + "learning_rate": 1.646071422083395e-08, + "loss": 0.6502, + "step": 1243 + }, + { + "epoch": 0.9952, + "grad_norm": 0.4327753082312952, + "learning_rate": 1.209367398504746e-08, + "loss": 0.6694, + "step": 1244 + }, + { + "epoch": 0.996, + "grad_norm": 0.3943042623046033, + "learning_rate": 8.398436437317969e-09, + "loss": 0.6288, + "step": 1245 + }, + { + "epoch": 0.9968, + "grad_norm": 0.41688071618548456, + "learning_rate": 5.375026405352035e-09, + "loss": 0.6374, + "step": 1246 + }, + { + "epoch": 0.9976, + "grad_norm": 0.5003834831982291, + "learning_rate": 3.023464202944748e-09, + "loss": 0.6499, + "step": 1247 + }, + { + "epoch": 0.9984, + "grad_norm": 0.45866807006109467, + "learning_rate": 1.3437656298687097e-09, + "loss": 0.6706, + "step": 1248 + }, + { + "epoch": 0.9992, + "grad_norm": 0.3916081291545642, + "learning_rate": 3.3594197175190745e-10, + "loss": 0.6381, + "step": 1249 + }, + { + "epoch": 1.0, + "grad_norm": 0.4770064931608081, + "learning_rate": 0.0, + "loss": 0.7777, + "step": 1250 + }, + { + "epoch": 1.0, + "step": 1250, + "total_flos": 1096393255911424.0, + "train_loss": 0.7163409863471984, + "train_runtime": 19465.3314, + "train_samples_per_second": 1.027, + "train_steps_per_second": 0.064 + } + ], + "logging_steps": 1.0, + "max_steps": 1250, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1096393255911424.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_1_epochs_1_GA_2_lora/README.md b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_1_epochs_1_GA_2_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_1_epochs_1_GA_2_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_1_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_1_epochs_1_GA_2_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c30a80f55782215f57694327607360c6ad690188 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_1_epochs_1_GA_2_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "q_proj", + "up_proj", + "gate_proj", + "k_proj", + "v_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5e0187cc3747e5cfed2719b67b0e3550e2bba270 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a78cb0ee8becea056080a5e85bebb05c98c6596c42b552ea117abb9b85680339 +size 671150064 diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_1_epochs_1_GA_2_lora/config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_1_epochs_1_GA_2_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_1_epochs_1_GA_2_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..c30922df6c124f2d37be9677cc351d74fd23c500 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a43ff62a24b2b8a809607b50c5de7f0ee2bdb37db617f9fd96e81da20c929c5 +size 918507402 diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_1_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_1_epochs_1_GA_2_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..6483ef95e0c7a1741c80563288ee6861c26cad53 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_1_epochs_1_GA_2_lora/trainer_state.json @@ -0,0 +1,917 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 125, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.008, + "grad_norm": 0.9564362205502492, + "learning_rate": 5e-05, + "loss": 1.3891, + "step": 1 + }, + { + "epoch": 0.016, + "grad_norm": 1.0670570078098212, + "learning_rate": 0.0001, + "loss": 1.5647, + "step": 2 + }, + { + "epoch": 0.024, + "grad_norm": 0.6832004913511157, + "learning_rate": 0.00015000000000000001, + "loss": 1.1882, + "step": 3 + }, + { + "epoch": 0.032, + "grad_norm": 1.4040934204807518, + "learning_rate": 0.0002, + "loss": 1.2526, + "step": 4 + }, + { + "epoch": 0.04, + "grad_norm": 1.0594878077292809, + "learning_rate": 0.00019996629653035126, + "loss": 1.0504, + "step": 5 + }, + { + "epoch": 0.048, + "grad_norm": 0.7880067192842751, + "learning_rate": 0.00019986520883988232, + "loss": 1.0003, + "step": 6 + }, + { + "epoch": 0.056, + "grad_norm": 0.6095190630196066, + "learning_rate": 0.00019969680506871137, + "loss": 1.0152, + "step": 7 + }, + { + "epoch": 0.064, + "grad_norm": 0.5250870981951875, + "learning_rate": 0.00019946119873266613, + "loss": 0.961, + "step": 8 + }, + { + "epoch": 0.072, + "grad_norm": 0.5100945930231707, + "learning_rate": 0.00019915854864676664, + "loss": 0.9172, + "step": 9 + }, + { + "epoch": 0.08, + "grad_norm": 0.6205753925466002, + "learning_rate": 0.00019878905881817252, + "loss": 0.9564, + "step": 10 + }, + { + "epoch": 0.088, + "grad_norm": 0.5508857901203891, + "learning_rate": 0.00019835297830866826, + "loss": 0.8852, + "step": 11 + }, + { + "epoch": 0.096, + "grad_norm": 0.6483010342507907, + "learning_rate": 0.00019785060106677818, + "loss": 1.0694, + "step": 12 + }, + { + "epoch": 0.104, + "grad_norm": 0.5668252221073122, + "learning_rate": 0.00019728226572962473, + "loss": 0.9364, + "step": 13 + }, + { + "epoch": 0.112, + "grad_norm": 0.5813967382806945, + "learning_rate": 0.0001966483553946637, + "loss": 0.8932, + "step": 14 + }, + { + "epoch": 0.12, + "grad_norm": 0.49591366297717443, + "learning_rate": 0.00019594929736144976, + "loss": 0.831, + "step": 15 + }, + { + "epoch": 0.128, + "grad_norm": 0.4672597855174128, + "learning_rate": 0.00019518556284360696, + "loss": 0.9122, + "step": 16 + }, + { + "epoch": 0.136, + "grad_norm": 0.4744190657329229, + "learning_rate": 0.0001943576666511982, + "loss": 0.889, + "step": 17 + }, + { + "epoch": 0.144, + "grad_norm": 0.4733278011435993, + "learning_rate": 0.0001934661668437073, + "loss": 0.78, + "step": 18 + }, + { + "epoch": 0.152, + "grad_norm": 0.5074702125395911, + "learning_rate": 0.0001925116643538684, + "loss": 0.929, + "step": 19 + }, + { + "epoch": 0.16, + "grad_norm": 0.46463115723052933, + "learning_rate": 0.00019149480258259533, + "loss": 0.8786, + "step": 20 + }, + { + "epoch": 0.168, + "grad_norm": 1.1053535634091058, + "learning_rate": 0.00019041626696528503, + "loss": 0.9157, + "step": 21 + }, + { + "epoch": 0.176, + "grad_norm": 0.5765958674318559, + "learning_rate": 0.0001892767845097864, + "loss": 0.8675, + "step": 22 + }, + { + "epoch": 0.184, + "grad_norm": 0.5301032501211107, + "learning_rate": 0.00018807712330634642, + "loss": 0.8631, + "step": 23 + }, + { + "epoch": 0.192, + "grad_norm": 0.48065832362239796, + "learning_rate": 0.0001868180920098644, + "loss": 0.8291, + "step": 24 + }, + { + "epoch": 0.2, + "grad_norm": 0.40329194748717834, + "learning_rate": 0.00018550053929480202, + "loss": 0.8386, + "step": 25 + }, + { + "epoch": 0.208, + "grad_norm": 0.4518990740306019, + "learning_rate": 0.00018412535328311814, + "loss": 0.8682, + "step": 26 + }, + { + "epoch": 0.216, + "grad_norm": 0.4849664224399919, + "learning_rate": 0.0001826934609456129, + "loss": 0.9325, + "step": 27 + }, + { + "epoch": 0.224, + "grad_norm": 0.4892145977372225, + "learning_rate": 0.00018120582747708502, + "loss": 0.8931, + "step": 28 + }, + { + "epoch": 0.232, + "grad_norm": 0.4700856635995607, + "learning_rate": 0.0001796634556457236, + "loss": 0.8751, + "step": 29 + }, + { + "epoch": 0.24, + "grad_norm": 0.5337272569459788, + "learning_rate": 0.0001780673851171728, + "loss": 0.8884, + "step": 30 + }, + { + "epoch": 0.248, + "grad_norm": 0.43616755029040166, + "learning_rate": 0.00017641869175372493, + "loss": 0.8273, + "step": 31 + }, + { + "epoch": 0.256, + "grad_norm": 0.4802536284656567, + "learning_rate": 0.00017471848688911464, + "loss": 0.8929, + "step": 32 + }, + { + "epoch": 0.264, + "grad_norm": 0.48669517596505296, + "learning_rate": 0.000172967916579403, + "loss": 0.9583, + "step": 33 + }, + { + "epoch": 0.272, + "grad_norm": 0.4006984433764588, + "learning_rate": 0.00017116816083045602, + "loss": 0.8102, + "step": 34 + }, + { + "epoch": 0.28, + "grad_norm": 0.3804581210152902, + "learning_rate": 0.0001693204328025389, + "loss": 0.8244, + "step": 35 + }, + { + "epoch": 0.288, + "grad_norm": 0.4018169810718193, + "learning_rate": 0.00016742597799256182, + "loss": 0.8337, + "step": 36 + }, + { + "epoch": 0.296, + "grad_norm": 0.38601705911275547, + "learning_rate": 0.00016548607339452853, + "loss": 0.8522, + "step": 37 + }, + { + "epoch": 0.304, + "grad_norm": 0.4770897944661433, + "learning_rate": 0.00016350202663875386, + "loss": 0.8619, + "step": 38 + }, + { + "epoch": 0.312, + "grad_norm": 0.4436139687419418, + "learning_rate": 0.0001614751751104301, + "loss": 0.7643, + "step": 39 + }, + { + "epoch": 0.32, + "grad_norm": 0.4510651437608395, + "learning_rate": 0.00015940688504813662, + "loss": 0.8198, + "step": 40 + }, + { + "epoch": 0.328, + "grad_norm": 0.5204234563209176, + "learning_rate": 0.00015729855062290022, + "loss": 0.8797, + "step": 41 + }, + { + "epoch": 0.336, + "grad_norm": 0.4553293071795067, + "learning_rate": 0.00015515159299842707, + "loss": 0.7873, + "step": 42 + }, + { + "epoch": 0.344, + "grad_norm": 0.451874650212316, + "learning_rate": 0.00015296745937313987, + "loss": 0.8664, + "step": 43 + }, + { + "epoch": 0.352, + "grad_norm": 0.45660075722936766, + "learning_rate": 0.00015074762200466556, + "loss": 0.8353, + "step": 44 + }, + { + "epoch": 0.36, + "grad_norm": 0.4578324100950842, + "learning_rate": 0.00014849357721743168, + "loss": 0.8545, + "step": 45 + }, + { + "epoch": 0.368, + "grad_norm": 0.3901315778595456, + "learning_rate": 0.00014620684439403962, + "loss": 0.7646, + "step": 46 + }, + { + "epoch": 0.376, + "grad_norm": 0.4709543810589498, + "learning_rate": 0.0001438889649510956, + "loss": 0.8438, + "step": 47 + }, + { + "epoch": 0.384, + "grad_norm": 0.45352980772919727, + "learning_rate": 0.00014154150130018866, + "loss": 0.7773, + "step": 48 + }, + { + "epoch": 0.392, + "grad_norm": 0.5315738799272542, + "learning_rate": 0.00013916603579471705, + "loss": 0.8384, + "step": 49 + }, + { + "epoch": 0.4, + "grad_norm": 0.44764161698075966, + "learning_rate": 0.000136764169663272, + "loss": 0.8097, + "step": 50 + }, + { + "epoch": 0.408, + "grad_norm": 0.451036919752691, + "learning_rate": 0.00013433752193029886, + "loss": 0.805, + "step": 51 + }, + { + "epoch": 0.416, + "grad_norm": 0.3772027967638337, + "learning_rate": 0.00013188772832476188, + "loss": 0.7472, + "step": 52 + }, + { + "epoch": 0.424, + "grad_norm": 0.40519776999433677, + "learning_rate": 0.00012941644017754964, + "loss": 0.7577, + "step": 53 + }, + { + "epoch": 0.432, + "grad_norm": 0.40847094673729234, + "learning_rate": 0.00012692532330836346, + "loss": 0.7304, + "step": 54 + }, + { + "epoch": 0.44, + "grad_norm": 0.4061703470017864, + "learning_rate": 0.00012441605690283915, + "loss": 0.8478, + "step": 55 + }, + { + "epoch": 0.448, + "grad_norm": 0.5284646726291284, + "learning_rate": 0.0001218903323806595, + "loss": 0.87, + "step": 56 + }, + { + "epoch": 0.456, + "grad_norm": 0.4072381711235819, + "learning_rate": 0.00011934985225541998, + "loss": 0.7865, + "step": 57 + }, + { + "epoch": 0.464, + "grad_norm": 0.4638388427170942, + "learning_rate": 0.00011679632898701649, + "loss": 0.8448, + "step": 58 + }, + { + "epoch": 0.472, + "grad_norm": 0.4255657720832117, + "learning_rate": 0.00011423148382732853, + "loss": 0.8351, + "step": 59 + }, + { + "epoch": 0.48, + "grad_norm": 0.41307171477445137, + "learning_rate": 0.00011165704565997593, + "loss": 0.7264, + "step": 60 + }, + { + "epoch": 0.488, + "grad_norm": 0.39579024422674575, + "learning_rate": 0.00010907474983493144, + "loss": 0.7356, + "step": 61 + }, + { + "epoch": 0.496, + "grad_norm": 0.4694881217262419, + "learning_rate": 0.0001064863369987743, + "loss": 0.8502, + "step": 62 + }, + { + "epoch": 0.504, + "grad_norm": 0.4001519238520662, + "learning_rate": 0.00010389355192137377, + "loss": 0.7873, + "step": 63 + }, + { + "epoch": 0.512, + "grad_norm": 0.42210737351272143, + "learning_rate": 0.0001012981423197931, + "loss": 0.8427, + "step": 64 + }, + { + "epoch": 0.52, + "grad_norm": 0.40525818022776766, + "learning_rate": 9.870185768020693e-05, + "loss": 0.8052, + "step": 65 + }, + { + "epoch": 0.528, + "grad_norm": 0.6122927350978119, + "learning_rate": 9.610644807862625e-05, + "loss": 0.8228, + "step": 66 + }, + { + "epoch": 0.536, + "grad_norm": 0.42711811000314354, + "learning_rate": 9.35136630012257e-05, + "loss": 0.7886, + "step": 67 + }, + { + "epoch": 0.544, + "grad_norm": 0.4495790965625436, + "learning_rate": 9.092525016506858e-05, + "loss": 0.7685, + "step": 68 + }, + { + "epoch": 0.552, + "grad_norm": 0.4413267802962418, + "learning_rate": 8.83429543400241e-05, + "loss": 0.7577, + "step": 69 + }, + { + "epoch": 0.56, + "grad_norm": 0.497785940917474, + "learning_rate": 8.57685161726715e-05, + "loss": 0.7098, + "step": 70 + }, + { + "epoch": 0.568, + "grad_norm": 0.39572697753004543, + "learning_rate": 8.320367101298351e-05, + "loss": 0.7946, + "step": 71 + }, + { + "epoch": 0.576, + "grad_norm": 0.4324613617825247, + "learning_rate": 8.065014774458003e-05, + "loss": 0.8368, + "step": 72 + }, + { + "epoch": 0.584, + "grad_norm": 0.5311744381229567, + "learning_rate": 7.810966761934053e-05, + "loss": 0.8749, + "step": 73 + }, + { + "epoch": 0.592, + "grad_norm": 0.42782020928719666, + "learning_rate": 7.558394309716088e-05, + "loss": 0.8097, + "step": 74 + }, + { + "epoch": 0.6, + "grad_norm": 0.4268388603862346, + "learning_rate": 7.307467669163655e-05, + "loss": 0.7788, + "step": 75 + }, + { + "epoch": 0.608, + "grad_norm": 0.4418608750620497, + "learning_rate": 7.058355982245037e-05, + "loss": 0.8151, + "step": 76 + }, + { + "epoch": 0.616, + "grad_norm": 0.4513775352416818, + "learning_rate": 6.811227167523815e-05, + "loss": 0.8862, + "step": 77 + }, + { + "epoch": 0.624, + "grad_norm": 0.4288780904259751, + "learning_rate": 6.566247806970119e-05, + "loss": 0.8145, + "step": 78 + }, + { + "epoch": 0.632, + "grad_norm": 0.4090576841293381, + "learning_rate": 6.323583033672799e-05, + "loss": 0.8277, + "step": 79 + }, + { + "epoch": 0.64, + "grad_norm": 0.45189215953496836, + "learning_rate": 6.083396420528298e-05, + "loss": 0.7855, + "step": 80 + }, + { + "epoch": 0.648, + "grad_norm": 0.45422260123346536, + "learning_rate": 5.845849869981137e-05, + "loss": 0.7714, + "step": 81 + }, + { + "epoch": 0.656, + "grad_norm": 0.91202440455519, + "learning_rate": 5.611103504890444e-05, + "loss": 0.8794, + "step": 82 + }, + { + "epoch": 0.664, + "grad_norm": 0.9149529602442982, + "learning_rate": 5.379315560596038e-05, + "loss": 0.7361, + "step": 83 + }, + { + "epoch": 0.672, + "grad_norm": 0.4546391256597834, + "learning_rate": 5.1506422782568345e-05, + "loss": 0.7817, + "step": 84 + }, + { + "epoch": 0.68, + "grad_norm": 0.42238075335008324, + "learning_rate": 4.9252377995334444e-05, + "loss": 0.7695, + "step": 85 + }, + { + "epoch": 0.688, + "grad_norm": 0.4424948789637984, + "learning_rate": 4.703254062686017e-05, + "loss": 0.7948, + "step": 86 + }, + { + "epoch": 0.696, + "grad_norm": 0.3982729241150964, + "learning_rate": 4.484840700157295e-05, + "loss": 0.7599, + "step": 87 + }, + { + "epoch": 0.704, + "grad_norm": 0.44348156245108955, + "learning_rate": 4.270144937709981e-05, + "loss": 0.7968, + "step": 88 + }, + { + "epoch": 0.712, + "grad_norm": 0.40263627071712976, + "learning_rate": 4.059311495186338e-05, + "loss": 0.7015, + "step": 89 + }, + { + "epoch": 0.72, + "grad_norm": 0.36656617219628396, + "learning_rate": 3.852482488956992e-05, + "loss": 0.7617, + "step": 90 + }, + { + "epoch": 0.728, + "grad_norm": 0.434413757991748, + "learning_rate": 3.649797336124615e-05, + "loss": 0.7809, + "step": 91 + }, + { + "epoch": 0.736, + "grad_norm": 0.5181032894210132, + "learning_rate": 3.45139266054715e-05, + "loss": 0.7603, + "step": 92 + }, + { + "epoch": 0.744, + "grad_norm": 0.4095041782820746, + "learning_rate": 3.257402200743821e-05, + "loss": 0.7155, + "step": 93 + }, + { + "epoch": 0.752, + "grad_norm": 0.42694499417908965, + "learning_rate": 3.0679567197461134e-05, + "loss": 0.734, + "step": 94 + }, + { + "epoch": 0.76, + "grad_norm": 0.33798217911043893, + "learning_rate": 2.8831839169543996e-05, + "loss": 0.7886, + "step": 95 + }, + { + "epoch": 0.768, + "grad_norm": 0.41589903056727734, + "learning_rate": 2.7032083420597e-05, + "loss": 0.7421, + "step": 96 + }, + { + "epoch": 0.776, + "grad_norm": 0.445056779586077, + "learning_rate": 2.528151311088537e-05, + "loss": 0.7443, + "step": 97 + }, + { + "epoch": 0.784, + "grad_norm": 0.40780163543891135, + "learning_rate": 2.3581308246275103e-05, + "loss": 0.789, + "step": 98 + }, + { + "epoch": 0.792, + "grad_norm": 0.4936441498703536, + "learning_rate": 2.1932614882827197e-05, + "loss": 0.843, + "step": 99 + }, + { + "epoch": 0.8, + "grad_norm": 0.40784407447552307, + "learning_rate": 2.03365443542764e-05, + "loss": 0.7378, + "step": 100 + }, + { + "epoch": 0.808, + "grad_norm": 0.49759805586308553, + "learning_rate": 1.879417252291502e-05, + "loss": 0.7839, + "step": 101 + }, + { + "epoch": 0.816, + "grad_norm": 0.40226259358803806, + "learning_rate": 1.730653905438714e-05, + "loss": 0.7403, + "step": 102 + }, + { + "epoch": 0.824, + "grad_norm": 0.4404451039655544, + "learning_rate": 1.587464671688187e-05, + "loss": 0.7991, + "step": 103 + }, + { + "epoch": 0.832, + "grad_norm": 0.5215311872478593, + "learning_rate": 1.4499460705197998e-05, + "loss": 0.767, + "step": 104 + }, + { + "epoch": 0.84, + "grad_norm": 0.8015353117608185, + "learning_rate": 1.3181907990135622e-05, + "loss": 0.8428, + "step": 105 + }, + { + "epoch": 0.848, + "grad_norm": 0.4033768748067044, + "learning_rate": 1.1922876693653585e-05, + "loss": 0.7629, + "step": 106 + }, + { + "epoch": 0.856, + "grad_norm": 0.43468570005173046, + "learning_rate": 1.0723215490213634e-05, + "loss": 0.8203, + "step": 107 + }, + { + "epoch": 0.864, + "grad_norm": 0.4314271380610339, + "learning_rate": 9.583733034714981e-06, + "loss": 0.7522, + "step": 108 + }, + { + "epoch": 0.872, + "grad_norm": 0.4099555584962568, + "learning_rate": 8.505197417404687e-06, + "loss": 0.774, + "step": 109 + }, + { + "epoch": 0.88, + "grad_norm": 0.39192345338489143, + "learning_rate": 7.488335646131628e-06, + "loss": 0.7854, + "step": 110 + }, + { + "epoch": 0.888, + "grad_norm": 0.5606142634633692, + "learning_rate": 6.533833156292679e-06, + "loss": 0.8025, + "step": 111 + }, + { + "epoch": 0.896, + "grad_norm": 0.4125228048773097, + "learning_rate": 5.6423333488018095e-06, + "loss": 0.7515, + "step": 112 + }, + { + "epoch": 0.904, + "grad_norm": 0.4551086684708683, + "learning_rate": 4.8144371563930476e-06, + "loss": 0.8832, + "step": 113 + }, + { + "epoch": 0.912, + "grad_norm": 0.4700100607241755, + "learning_rate": 4.050702638550275e-06, + "loss": 0.7774, + "step": 114 + }, + { + "epoch": 0.92, + "grad_norm": 0.4022309925214962, + "learning_rate": 3.3516446053363015e-06, + "loss": 0.7639, + "step": 115 + }, + { + "epoch": 0.928, + "grad_norm": 0.35237768138172054, + "learning_rate": 2.717734270375272e-06, + "loss": 0.7439, + "step": 116 + }, + { + "epoch": 0.936, + "grad_norm": 0.5271039606272312, + "learning_rate": 2.1493989332218468e-06, + "loss": 0.7181, + "step": 117 + }, + { + "epoch": 0.944, + "grad_norm": 0.4071465539898394, + "learning_rate": 1.6470216913317626e-06, + "loss": 0.7526, + "step": 118 + }, + { + "epoch": 0.952, + "grad_norm": 0.3971023247167748, + "learning_rate": 1.2109411818274852e-06, + "loss": 0.6728, + "step": 119 + }, + { + "epoch": 0.96, + "grad_norm": 0.40853435166821706, + "learning_rate": 8.41451353233369e-07, + "loss": 0.7731, + "step": 120 + }, + { + "epoch": 0.968, + "grad_norm": 0.4279732050533312, + "learning_rate": 5.388012673338661e-07, + "loss": 0.7563, + "step": 121 + }, + { + "epoch": 0.976, + "grad_norm": 0.5012557378562927, + "learning_rate": 3.0319493128866396e-07, + "loss": 0.7848, + "step": 122 + }, + { + "epoch": 0.984, + "grad_norm": 0.4519002693851681, + "learning_rate": 1.3479116011769767e-07, + "loss": 0.8422, + "step": 123 + }, + { + "epoch": 0.992, + "grad_norm": 0.4089364350515701, + "learning_rate": 3.370346964876036e-08, + "loss": 0.7779, + "step": 124 + }, + { + "epoch": 1.0, + "grad_norm": 0.40046147854066166, + "learning_rate": 0.0, + "loss": 0.7812, + "step": 125 + }, + { + "epoch": 1.0, + "step": 125, + "total_flos": 108391249608704.0, + "train_loss": 0.8369061512947082, + "train_runtime": 1950.8409, + "train_samples_per_second": 1.025, + "train_steps_per_second": 0.064 + } + ], + "logging_steps": 1.0, + "max_steps": 125, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 108391249608704.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_1_epochs_1_GA_4_lora/README.md b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_1_epochs_1_GA_4_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_1_epochs_1_GA_4_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_1_epochs_1_GA_4_lora/adapter_config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_1_epochs_1_GA_4_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..11cf9228d509db53df520199293cf90dab9e7f42 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_1_epochs_1_GA_4_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "up_proj", + "q_proj", + "o_proj", + "gate_proj", + "down_proj", + "v_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_1_epochs_1_GA_4_lora/adapter_model.safetensors b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_1_epochs_1_GA_4_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5121f81caa5aeeb781cc49c4d5d4e9681a90a495 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_1_epochs_1_GA_4_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66d8cbbfc9b287ff28dba1e24911d0eb16199d893f93599beda9d29e924e1a15 +size 671150064 diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_1_epochs_1_GA_4_lora/config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_1_epochs_1_GA_4_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_1_epochs_1_GA_4_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_1_epochs_1_GA_4_lora/non_lora_trainables.bin b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_1_epochs_1_GA_4_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..06bbc4c079ad9a668618f5aa287ecb8da126ce37 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_1_epochs_1_GA_4_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:442b2c77a52040812daa27c790b34f713f5ac98306fae36fb1e83f0dccda2c9d +size 918507402 diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_1_epochs_1_GA_4_lora/trainer_state.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_1_epochs_1_GA_4_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..37addc6520be74117afa60ba40d4637e6fb8fdf0 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_1_epochs_1_GA_4_lora/trainer_state.json @@ -0,0 +1,476 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.992, + "eval_steps": 500, + "global_step": 62, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.016, + "grad_norm": 0.9378038948956198, + "learning_rate": 0.0001, + "loss": 1.4769, + "step": 1 + }, + { + "epoch": 0.032, + "grad_norm": 0.8982165698182389, + "learning_rate": 0.0002, + "loss": 1.4233, + "step": 2 + }, + { + "epoch": 0.048, + "grad_norm": 1.287847751518021, + "learning_rate": 0.0001998629534754574, + "loss": 1.26, + "step": 3 + }, + { + "epoch": 0.064, + "grad_norm": 1.0001952815738493, + "learning_rate": 0.00019945218953682734, + "loss": 1.1422, + "step": 4 + }, + { + "epoch": 0.08, + "grad_norm": 0.6807775171811575, + "learning_rate": 0.00019876883405951377, + "loss": 0.985, + "step": 5 + }, + { + "epoch": 0.096, + "grad_norm": 0.5355248224421841, + "learning_rate": 0.00019781476007338058, + "loss": 1.0363, + "step": 6 + }, + { + "epoch": 0.112, + "grad_norm": 0.7570224306176186, + "learning_rate": 0.00019659258262890683, + "loss": 0.9551, + "step": 7 + }, + { + "epoch": 0.128, + "grad_norm": 0.4201298524092389, + "learning_rate": 0.00019510565162951537, + "loss": 0.9048, + "step": 8 + }, + { + "epoch": 0.144, + "grad_norm": 0.4980661254746478, + "learning_rate": 0.00019335804264972018, + "loss": 0.8679, + "step": 9 + }, + { + "epoch": 0.16, + "grad_norm": 0.47119688466611204, + "learning_rate": 0.0001913545457642601, + "loss": 0.9412, + "step": 10 + }, + { + "epoch": 0.176, + "grad_norm": 0.4957137205221804, + "learning_rate": 0.0001891006524188368, + "loss": 0.9211, + "step": 11 + }, + { + "epoch": 0.192, + "grad_norm": 0.4336212420660128, + "learning_rate": 0.00018660254037844388, + "loss": 0.8708, + "step": 12 + }, + { + "epoch": 0.208, + "grad_norm": 0.39597291719308214, + "learning_rate": 0.00018386705679454242, + "loss": 0.8782, + "step": 13 + }, + { + "epoch": 0.224, + "grad_norm": 0.38390249363179807, + "learning_rate": 0.00018090169943749476, + "loss": 0.9323, + "step": 14 + }, + { + "epoch": 0.24, + "grad_norm": 0.3990386813171272, + "learning_rate": 0.0001777145961456971, + "loss": 0.9005, + "step": 15 + }, + { + "epoch": 0.256, + "grad_norm": 0.36808856210178786, + "learning_rate": 0.00017431448254773944, + "loss": 0.8754, + "step": 16 + }, + { + "epoch": 0.272, + "grad_norm": 0.37805906793374583, + "learning_rate": 0.00017071067811865476, + "loss": 0.8996, + "step": 17 + }, + { + "epoch": 0.288, + "grad_norm": 0.325235795491243, + "learning_rate": 0.00016691306063588583, + "loss": 0.8416, + "step": 18 + }, + { + "epoch": 0.304, + "grad_norm": 0.3625820412095005, + "learning_rate": 0.00016293203910498376, + "loss": 0.8683, + "step": 19 + }, + { + "epoch": 0.32, + "grad_norm": 0.34192579542551826, + "learning_rate": 0.00015877852522924732, + "loss": 0.8072, + "step": 20 + }, + { + "epoch": 0.336, + "grad_norm": 0.37270480043786225, + "learning_rate": 0.00015446390350150273, + "loss": 0.8442, + "step": 21 + }, + { + "epoch": 0.352, + "grad_norm": 0.35160707209512304, + "learning_rate": 0.00015000000000000001, + "loss": 0.8611, + "step": 22 + }, + { + "epoch": 0.368, + "grad_norm": 0.3334062896076315, + "learning_rate": 0.00014539904997395468, + "loss": 0.8201, + "step": 23 + }, + { + "epoch": 0.384, + "grad_norm": 0.3906535203730245, + "learning_rate": 0.00014067366430758004, + "loss": 0.8188, + "step": 24 + }, + { + "epoch": 0.4, + "grad_norm": 0.3577973945208406, + "learning_rate": 0.00013583679495453, + "loss": 0.8321, + "step": 25 + }, + { + "epoch": 0.416, + "grad_norm": 0.30630218053753877, + "learning_rate": 0.00013090169943749476, + "loss": 0.7872, + "step": 26 + }, + { + "epoch": 0.432, + "grad_norm": 0.3148202096857398, + "learning_rate": 0.00012588190451025207, + "loss": 0.7564, + "step": 27 + }, + { + "epoch": 0.448, + "grad_norm": 0.3563247781912407, + "learning_rate": 0.00012079116908177593, + "loss": 0.8753, + "step": 28 + }, + { + "epoch": 0.464, + "grad_norm": 0.3187343500667551, + "learning_rate": 0.0001156434465040231, + "loss": 0.8261, + "step": 29 + }, + { + "epoch": 0.48, + "grad_norm": 0.32028276858301985, + "learning_rate": 0.00011045284632676536, + "loss": 0.7956, + "step": 30 + }, + { + "epoch": 0.496, + "grad_norm": 0.3283584125341734, + "learning_rate": 0.0001052335956242944, + "loss": 0.8078, + "step": 31 + }, + { + "epoch": 0.512, + "grad_norm": 0.3116804238757513, + "learning_rate": 0.0001, + "loss": 0.8297, + "step": 32 + }, + { + "epoch": 0.528, + "grad_norm": 0.34278266317249856, + "learning_rate": 9.476640437570562e-05, + "loss": 0.8239, + "step": 33 + }, + { + "epoch": 0.544, + "grad_norm": 0.30595761713026426, + "learning_rate": 8.954715367323468e-05, + "loss": 0.7906, + "step": 34 + }, + { + "epoch": 0.56, + "grad_norm": 0.3265249485330855, + "learning_rate": 8.435655349597689e-05, + "loss": 0.747, + "step": 35 + }, + { + "epoch": 0.576, + "grad_norm": 0.31382488863003205, + "learning_rate": 7.920883091822408e-05, + "loss": 0.8304, + "step": 36 + }, + { + "epoch": 0.592, + "grad_norm": 0.3619161183936626, + "learning_rate": 7.411809548974792e-05, + "loss": 0.8553, + "step": 37 + }, + { + "epoch": 0.608, + "grad_norm": 0.32665860366084254, + "learning_rate": 6.909830056250527e-05, + "loss": 0.8052, + "step": 38 + }, + { + "epoch": 0.624, + "grad_norm": 0.3253192271402816, + "learning_rate": 6.416320504546997e-05, + "loss": 0.864, + "step": 39 + }, + { + "epoch": 0.64, + "grad_norm": 0.32960969117819583, + "learning_rate": 5.9326335692419995e-05, + "loss": 0.8163, + "step": 40 + }, + { + "epoch": 0.656, + "grad_norm": 0.3232580501894908, + "learning_rate": 5.4600950026045326e-05, + "loss": 0.8427, + "step": 41 + }, + { + "epoch": 0.672, + "grad_norm": 0.3129500054631124, + "learning_rate": 5.000000000000002e-05, + "loss": 0.7694, + "step": 42 + }, + { + "epoch": 0.688, + "grad_norm": 0.3126921363286475, + "learning_rate": 4.5536096498497295e-05, + "loss": 0.8014, + "step": 43 + }, + { + "epoch": 0.704, + "grad_norm": 0.2869463355408737, + "learning_rate": 4.12214747707527e-05, + "loss": 0.7895, + "step": 44 + }, + { + "epoch": 0.72, + "grad_norm": 0.29201630872249207, + "learning_rate": 3.7067960895016275e-05, + "loss": 0.7443, + "step": 45 + }, + { + "epoch": 0.736, + "grad_norm": 0.33987577027712634, + "learning_rate": 3.308693936411421e-05, + "loss": 0.7886, + "step": 46 + }, + { + "epoch": 0.752, + "grad_norm": 0.3066583607284319, + "learning_rate": 2.9289321881345254e-05, + "loss": 0.7406, + "step": 47 + }, + { + "epoch": 0.768, + "grad_norm": 0.28152834918909564, + "learning_rate": 2.5685517452260567e-05, + "loss": 0.7806, + "step": 48 + }, + { + "epoch": 0.784, + "grad_norm": 0.314371197533863, + "learning_rate": 2.2285403854302912e-05, + "loss": 0.779, + "step": 49 + }, + { + "epoch": 0.8, + "grad_norm": 0.3236829247628598, + "learning_rate": 1.9098300562505266e-05, + "loss": 0.8083, + "step": 50 + }, + { + "epoch": 0.816, + "grad_norm": 0.3402350100519127, + "learning_rate": 1.6132943205457606e-05, + "loss": 0.7785, + "step": 51 + }, + { + "epoch": 0.832, + "grad_norm": 0.34165645874652545, + "learning_rate": 1.339745962155613e-05, + "loss": 0.8051, + "step": 52 + }, + { + "epoch": 0.848, + "grad_norm": 0.36808713946400523, + "learning_rate": 1.0899347581163221e-05, + "loss": 0.8232, + "step": 53 + }, + { + "epoch": 0.864, + "grad_norm": 0.3154155924180855, + "learning_rate": 8.645454235739903e-06, + "loss": 0.8058, + "step": 54 + }, + { + "epoch": 0.88, + "grad_norm": 0.3014060919782915, + "learning_rate": 6.6419573502798374e-06, + "loss": 0.7967, + "step": 55 + }, + { + "epoch": 0.896, + "grad_norm": 0.3309114005539564, + "learning_rate": 4.8943483704846475e-06, + "loss": 0.7909, + "step": 56 + }, + { + "epoch": 0.912, + "grad_norm": 0.3849480424914164, + "learning_rate": 3.40741737109318e-06, + "loss": 0.8441, + "step": 57 + }, + { + "epoch": 0.928, + "grad_norm": 0.2793016248216318, + "learning_rate": 2.1852399266194314e-06, + "loss": 0.7713, + "step": 58 + }, + { + "epoch": 0.944, + "grad_norm": 0.31012031023291514, + "learning_rate": 1.231165940486234e-06, + "loss": 0.7533, + "step": 59 + }, + { + "epoch": 0.96, + "grad_norm": 0.2877388900526754, + "learning_rate": 5.478104631726711e-07, + "loss": 0.7386, + "step": 60 + }, + { + "epoch": 0.976, + "grad_norm": 0.3159739171413289, + "learning_rate": 1.3704652454261668e-07, + "loss": 0.7869, + "step": 61 + }, + { + "epoch": 0.992, + "grad_norm": 0.30635308472402023, + "learning_rate": 0.0, + "loss": 0.8289, + "step": 62 + }, + { + "epoch": 0.992, + "step": 62, + "total_flos": 156003815260160.0, + "train_loss": 0.8635893848634535, + "train_runtime": 1927.9959, + "train_samples_per_second": 1.037, + "train_steps_per_second": 0.032 + } + ], + "logging_steps": 1.0, + "max_steps": 62, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 156003815260160.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_2_epochs_1_GA_2_lora/README.md b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_2_epochs_1_GA_2_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_2_epochs_1_GA_2_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_2_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_2_epochs_1_GA_2_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..08f415cfff38d20d76e34e89b5083acd15830dd4 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_2_epochs_1_GA_2_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "q_proj", + "gate_proj", + "v_proj", + "down_proj", + "o_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a40787995328bf0352247bf7aae228eb6b2d5e92 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08db6ba76879f06b7866cf329aab4a2cf1bd74608b416e2835593b1261051b2b +size 671150064 diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_2_epochs_1_GA_2_lora/config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_2_epochs_1_GA_2_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_2_epochs_1_GA_2_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..ff6bffffb7792d517cd7ee9a65c7000e64a2e964 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f56b47db80cd8a3edb0691506eaf739f85f61245f28df904d99f6fdcec3138de +size 918507402 diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_2_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_2_epochs_1_GA_2_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f270cc7ca048104e2ee2f028352535399104b428 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_2_epochs_1_GA_2_lora/trainer_state.json @@ -0,0 +1,917 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 125, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.008, + "grad_norm": 0.9300447735857797, + "learning_rate": 5e-05, + "loss": 1.4212, + "step": 1 + }, + { + "epoch": 0.016, + "grad_norm": 0.8941623954623172, + "learning_rate": 0.0001, + "loss": 1.4027, + "step": 2 + }, + { + "epoch": 0.024, + "grad_norm": 0.7866260657348403, + "learning_rate": 0.00015000000000000001, + "loss": 1.3411, + "step": 3 + }, + { + "epoch": 0.032, + "grad_norm": 1.2147238549075172, + "learning_rate": 0.0002, + "loss": 1.1727, + "step": 4 + }, + { + "epoch": 0.04, + "grad_norm": 1.1094493212868144, + "learning_rate": 0.00019996629653035126, + "loss": 0.9972, + "step": 5 + }, + { + "epoch": 0.048, + "grad_norm": 0.8828202767363533, + "learning_rate": 0.00019986520883988232, + "loss": 1.04, + "step": 6 + }, + { + "epoch": 0.056, + "grad_norm": 0.6522330755796834, + "learning_rate": 0.00019969680506871137, + "loss": 0.9753, + "step": 7 + }, + { + "epoch": 0.064, + "grad_norm": 0.4661735661736063, + "learning_rate": 0.00019946119873266613, + "loss": 0.8634, + "step": 8 + }, + { + "epoch": 0.072, + "grad_norm": 0.5330532854787262, + "learning_rate": 0.00019915854864676664, + "loss": 0.9423, + "step": 9 + }, + { + "epoch": 0.08, + "grad_norm": 0.5732343687041058, + "learning_rate": 0.00019878905881817252, + "loss": 0.9613, + "step": 10 + }, + { + "epoch": 0.088, + "grad_norm": 0.531922029530724, + "learning_rate": 0.00019835297830866826, + "loss": 0.9133, + "step": 11 + }, + { + "epoch": 0.096, + "grad_norm": 0.5695373512799271, + "learning_rate": 0.00019785060106677818, + "loss": 0.9241, + "step": 12 + }, + { + "epoch": 0.104, + "grad_norm": 0.48913628922300956, + "learning_rate": 0.00019728226572962473, + "loss": 0.8069, + "step": 13 + }, + { + "epoch": 0.112, + "grad_norm": 0.600731959286891, + "learning_rate": 0.0001966483553946637, + "loss": 0.913, + "step": 14 + }, + { + "epoch": 0.12, + "grad_norm": 0.4888121753484574, + "learning_rate": 0.00019594929736144976, + "loss": 0.8766, + "step": 15 + }, + { + "epoch": 0.128, + "grad_norm": 0.45838473336493163, + "learning_rate": 0.00019518556284360696, + "loss": 0.8702, + "step": 16 + }, + { + "epoch": 0.136, + "grad_norm": 0.5523016279235923, + "learning_rate": 0.0001943576666511982, + "loss": 0.9263, + "step": 17 + }, + { + "epoch": 0.144, + "grad_norm": 1.1236493815342523, + "learning_rate": 0.0001934661668437073, + "loss": 0.9017, + "step": 18 + }, + { + "epoch": 0.152, + "grad_norm": 0.9322800469491774, + "learning_rate": 0.0001925116643538684, + "loss": 0.896, + "step": 19 + }, + { + "epoch": 0.16, + "grad_norm": 0.47070914424960786, + "learning_rate": 0.00019149480258259533, + "loss": 0.8231, + "step": 20 + }, + { + "epoch": 0.168, + "grad_norm": 0.4502420347383483, + "learning_rate": 0.00019041626696528503, + "loss": 0.7832, + "step": 21 + }, + { + "epoch": 0.176, + "grad_norm": 0.42688173145056074, + "learning_rate": 0.0001892767845097864, + "loss": 0.8332, + "step": 22 + }, + { + "epoch": 0.184, + "grad_norm": 0.4416543143186891, + "learning_rate": 0.00018807712330634642, + "loss": 0.8217, + "step": 23 + }, + { + "epoch": 0.192, + "grad_norm": 0.42523897687726514, + "learning_rate": 0.0001868180920098644, + "loss": 0.8877, + "step": 24 + }, + { + "epoch": 0.2, + "grad_norm": 0.44128672624582566, + "learning_rate": 0.00018550053929480202, + "loss": 0.8929, + "step": 25 + }, + { + "epoch": 0.208, + "grad_norm": 0.4761215857258359, + "learning_rate": 0.00018412535328311814, + "loss": 0.7895, + "step": 26 + }, + { + "epoch": 0.216, + "grad_norm": 0.49618405427508133, + "learning_rate": 0.0001826934609456129, + "loss": 0.9193, + "step": 27 + }, + { + "epoch": 0.224, + "grad_norm": 0.5019616118160227, + "learning_rate": 0.00018120582747708502, + "loss": 0.8678, + "step": 28 + }, + { + "epoch": 0.232, + "grad_norm": 0.45325569465538845, + "learning_rate": 0.0001796634556457236, + "loss": 0.881, + "step": 29 + }, + { + "epoch": 0.24, + "grad_norm": 0.4328543000849896, + "learning_rate": 0.0001780673851171728, + "loss": 0.7984, + "step": 30 + }, + { + "epoch": 0.248, + "grad_norm": 0.5226044226129308, + "learning_rate": 0.00017641869175372493, + "loss": 0.8251, + "step": 31 + }, + { + "epoch": 0.256, + "grad_norm": 0.495514036412055, + "learning_rate": 0.00017471848688911464, + "loss": 0.879, + "step": 32 + }, + { + "epoch": 0.264, + "grad_norm": 0.42627767818455664, + "learning_rate": 0.000172967916579403, + "loss": 0.8623, + "step": 33 + }, + { + "epoch": 0.272, + "grad_norm": 0.447506945125031, + "learning_rate": 0.00017116816083045602, + "loss": 0.8433, + "step": 34 + }, + { + "epoch": 0.28, + "grad_norm": 0.4649905236967444, + "learning_rate": 0.0001693204328025389, + "loss": 0.8377, + "step": 35 + }, + { + "epoch": 0.288, + "grad_norm": 0.47770566393392916, + "learning_rate": 0.00016742597799256182, + "loss": 0.8305, + "step": 36 + }, + { + "epoch": 0.296, + "grad_norm": 0.3799474520513085, + "learning_rate": 0.00016548607339452853, + "loss": 0.7657, + "step": 37 + }, + { + "epoch": 0.304, + "grad_norm": 0.4616004668862163, + "learning_rate": 0.00016350202663875386, + "loss": 0.8183, + "step": 38 + }, + { + "epoch": 0.312, + "grad_norm": 0.46510246561157265, + "learning_rate": 0.0001614751751104301, + "loss": 0.9086, + "step": 39 + }, + { + "epoch": 0.32, + "grad_norm": 0.45510655180869813, + "learning_rate": 0.00015940688504813662, + "loss": 0.7559, + "step": 40 + }, + { + "epoch": 0.328, + "grad_norm": 0.4686610313953666, + "learning_rate": 0.00015729855062290022, + "loss": 0.8561, + "step": 41 + }, + { + "epoch": 0.336, + "grad_norm": 0.4985603334063983, + "learning_rate": 0.00015515159299842707, + "loss": 0.775, + "step": 42 + }, + { + "epoch": 0.344, + "grad_norm": 0.4763246817199902, + "learning_rate": 0.00015296745937313987, + "loss": 0.8489, + "step": 43 + }, + { + "epoch": 0.352, + "grad_norm": 0.47932086524083634, + "learning_rate": 0.00015074762200466556, + "loss": 0.8266, + "step": 44 + }, + { + "epoch": 0.36, + "grad_norm": 0.4140084452320215, + "learning_rate": 0.00014849357721743168, + "loss": 0.7732, + "step": 45 + }, + { + "epoch": 0.368, + "grad_norm": 0.5037644545683102, + "learning_rate": 0.00014620684439403962, + "loss": 0.8021, + "step": 46 + }, + { + "epoch": 0.376, + "grad_norm": 0.511721323509284, + "learning_rate": 0.0001438889649510956, + "loss": 0.7965, + "step": 47 + }, + { + "epoch": 0.384, + "grad_norm": 0.4453968832294159, + "learning_rate": 0.00014154150130018866, + "loss": 0.8072, + "step": 48 + }, + { + "epoch": 0.392, + "grad_norm": 0.45760166610633574, + "learning_rate": 0.00013916603579471705, + "loss": 0.7846, + "step": 49 + }, + { + "epoch": 0.4, + "grad_norm": 0.5687538636897856, + "learning_rate": 0.000136764169663272, + "loss": 0.8548, + "step": 50 + }, + { + "epoch": 0.408, + "grad_norm": 0.41556308383635204, + "learning_rate": 0.00013433752193029886, + "loss": 0.7858, + "step": 51 + }, + { + "epoch": 0.416, + "grad_norm": 0.4438322151444504, + "learning_rate": 0.00013188772832476188, + "loss": 0.8102, + "step": 52 + }, + { + "epoch": 0.424, + "grad_norm": 0.44920875101367225, + "learning_rate": 0.00012941644017754964, + "loss": 0.8663, + "step": 53 + }, + { + "epoch": 0.432, + "grad_norm": 0.4562243620080555, + "learning_rate": 0.00012692532330836346, + "loss": 0.7225, + "step": 54 + }, + { + "epoch": 0.44, + "grad_norm": 0.4670306606254095, + "learning_rate": 0.00012441605690283915, + "loss": 0.8674, + "step": 55 + }, + { + "epoch": 0.448, + "grad_norm": 0.38224700176824034, + "learning_rate": 0.0001218903323806595, + "loss": 0.7718, + "step": 56 + }, + { + "epoch": 0.456, + "grad_norm": 0.3958254451198586, + "learning_rate": 0.00011934985225541998, + "loss": 0.7967, + "step": 57 + }, + { + "epoch": 0.464, + "grad_norm": 0.43967961784262105, + "learning_rate": 0.00011679632898701649, + "loss": 0.8683, + "step": 58 + }, + { + "epoch": 0.472, + "grad_norm": 0.3960108199866462, + "learning_rate": 0.00011423148382732853, + "loss": 0.764, + "step": 59 + }, + { + "epoch": 0.48, + "grad_norm": 0.4290206261752591, + "learning_rate": 0.00011165704565997593, + "loss": 0.8361, + "step": 60 + }, + { + "epoch": 0.488, + "grad_norm": 0.37343933619751885, + "learning_rate": 0.00010907474983493144, + "loss": 0.6922, + "step": 61 + }, + { + "epoch": 0.496, + "grad_norm": 0.39076622437562314, + "learning_rate": 0.0001064863369987743, + "loss": 0.8454, + "step": 62 + }, + { + "epoch": 0.504, + "grad_norm": 0.45557849407523243, + "learning_rate": 0.00010389355192137377, + "loss": 0.8379, + "step": 63 + }, + { + "epoch": 0.512, + "grad_norm": 0.40510790499453986, + "learning_rate": 0.0001012981423197931, + "loss": 0.8088, + "step": 64 + }, + { + "epoch": 0.52, + "grad_norm": 0.42655439355809865, + "learning_rate": 9.870185768020693e-05, + "loss": 0.7868, + "step": 65 + }, + { + "epoch": 0.528, + "grad_norm": 0.4274086029648102, + "learning_rate": 9.610644807862625e-05, + "loss": 0.7844, + "step": 66 + }, + { + "epoch": 0.536, + "grad_norm": 0.46709466970320407, + "learning_rate": 9.35136630012257e-05, + "loss": 0.8366, + "step": 67 + }, + { + "epoch": 0.544, + "grad_norm": 0.4251199525895881, + "learning_rate": 9.092525016506858e-05, + "loss": 0.8851, + "step": 68 + }, + { + "epoch": 0.552, + "grad_norm": 0.39032317585957393, + "learning_rate": 8.83429543400241e-05, + "loss": 0.7611, + "step": 69 + }, + { + "epoch": 0.56, + "grad_norm": 0.4032588998536183, + "learning_rate": 8.57685161726715e-05, + "loss": 0.8281, + "step": 70 + }, + { + "epoch": 0.568, + "grad_norm": 0.37713458352496687, + "learning_rate": 8.320367101298351e-05, + "loss": 0.7498, + "step": 71 + }, + { + "epoch": 0.576, + "grad_norm": 0.4672308735351815, + "learning_rate": 8.065014774458003e-05, + "loss": 0.8496, + "step": 72 + }, + { + "epoch": 0.584, + "grad_norm": 0.45563903888910406, + "learning_rate": 7.810966761934053e-05, + "loss": 0.8047, + "step": 73 + }, + { + "epoch": 0.592, + "grad_norm": 0.37397857860843065, + "learning_rate": 7.558394309716088e-05, + "loss": 0.7846, + "step": 74 + }, + { + "epoch": 0.6, + "grad_norm": 0.3962217079098133, + "learning_rate": 7.307467669163655e-05, + "loss": 0.8319, + "step": 75 + }, + { + "epoch": 0.608, + "grad_norm": 0.4017100232137172, + "learning_rate": 7.058355982245037e-05, + "loss": 0.7513, + "step": 76 + }, + { + "epoch": 0.616, + "grad_norm": 0.4268989423918388, + "learning_rate": 6.811227167523815e-05, + "loss": 0.7839, + "step": 77 + }, + { + "epoch": 0.624, + "grad_norm": 0.4079357701727051, + "learning_rate": 6.566247806970119e-05, + "loss": 0.7715, + "step": 78 + }, + { + "epoch": 0.632, + "grad_norm": 0.4107276956696718, + "learning_rate": 6.323583033672799e-05, + "loss": 0.776, + "step": 79 + }, + { + "epoch": 0.64, + "grad_norm": 0.46619861200542645, + "learning_rate": 6.083396420528298e-05, + "loss": 0.8987, + "step": 80 + }, + { + "epoch": 0.648, + "grad_norm": 0.44429947024548255, + "learning_rate": 5.845849869981137e-05, + "loss": 0.8965, + "step": 81 + }, + { + "epoch": 0.656, + "grad_norm": 0.4434324812385255, + "learning_rate": 5.611103504890444e-05, + "loss": 0.8088, + "step": 82 + }, + { + "epoch": 0.664, + "grad_norm": 0.3898226333525628, + "learning_rate": 5.379315560596038e-05, + "loss": 0.7228, + "step": 83 + }, + { + "epoch": 0.672, + "grad_norm": 0.40085891019440134, + "learning_rate": 5.1506422782568345e-05, + "loss": 0.7796, + "step": 84 + }, + { + "epoch": 0.68, + "grad_norm": 0.3681891073144277, + "learning_rate": 4.9252377995334444e-05, + "loss": 0.7464, + "step": 85 + }, + { + "epoch": 0.688, + "grad_norm": 0.3807853713661697, + "learning_rate": 4.703254062686017e-05, + "loss": 0.7585, + "step": 86 + }, + { + "epoch": 0.696, + "grad_norm": 0.4287329211744062, + "learning_rate": 4.484840700157295e-05, + "loss": 0.8006, + "step": 87 + }, + { + "epoch": 0.704, + "grad_norm": 0.4879386796896542, + "learning_rate": 4.270144937709981e-05, + "loss": 0.7965, + "step": 88 + }, + { + "epoch": 0.712, + "grad_norm": 0.5071758983580438, + "learning_rate": 4.059311495186338e-05, + "loss": 0.8691, + "step": 89 + }, + { + "epoch": 0.72, + "grad_norm": 0.3766842574391901, + "learning_rate": 3.852482488956992e-05, + "loss": 0.7865, + "step": 90 + }, + { + "epoch": 0.728, + "grad_norm": 0.377539197544411, + "learning_rate": 3.649797336124615e-05, + "loss": 0.7921, + "step": 91 + }, + { + "epoch": 0.736, + "grad_norm": 0.44402436974537496, + "learning_rate": 3.45139266054715e-05, + "loss": 0.745, + "step": 92 + }, + { + "epoch": 0.744, + "grad_norm": 0.44449641877820356, + "learning_rate": 3.257402200743821e-05, + "loss": 0.7177, + "step": 93 + }, + { + "epoch": 0.752, + "grad_norm": 0.4382952546761995, + "learning_rate": 3.0679567197461134e-05, + "loss": 0.7585, + "step": 94 + }, + { + "epoch": 0.76, + "grad_norm": 0.3999725274248031, + "learning_rate": 2.8831839169543996e-05, + "loss": 0.8022, + "step": 95 + }, + { + "epoch": 0.768, + "grad_norm": 0.5635995549114391, + "learning_rate": 2.7032083420597e-05, + "loss": 0.869, + "step": 96 + }, + { + "epoch": 0.776, + "grad_norm": 0.45441045951837805, + "learning_rate": 2.528151311088537e-05, + "loss": 0.7141, + "step": 97 + }, + { + "epoch": 0.784, + "grad_norm": 0.45283175276984133, + "learning_rate": 2.3581308246275103e-05, + "loss": 0.8029, + "step": 98 + }, + { + "epoch": 0.792, + "grad_norm": 0.41260626672347417, + "learning_rate": 2.1932614882827197e-05, + "loss": 0.8141, + "step": 99 + }, + { + "epoch": 0.8, + "grad_norm": 0.46538759918111977, + "learning_rate": 2.03365443542764e-05, + "loss": 0.8154, + "step": 100 + }, + { + "epoch": 0.808, + "grad_norm": 0.3789292294833141, + "learning_rate": 1.879417252291502e-05, + "loss": 0.7809, + "step": 101 + }, + { + "epoch": 0.816, + "grad_norm": 0.41506839343478824, + "learning_rate": 1.730653905438714e-05, + "loss": 0.7874, + "step": 102 + }, + { + "epoch": 0.824, + "grad_norm": 0.3648788353955197, + "learning_rate": 1.587464671688187e-05, + "loss": 0.7499, + "step": 103 + }, + { + "epoch": 0.832, + "grad_norm": 0.4587909838246228, + "learning_rate": 1.4499460705197998e-05, + "loss": 0.775, + "step": 104 + }, + { + "epoch": 0.84, + "grad_norm": 0.41463387337526747, + "learning_rate": 1.3181907990135622e-05, + "loss": 0.82, + "step": 105 + }, + { + "epoch": 0.848, + "grad_norm": 0.394693008883327, + "learning_rate": 1.1922876693653585e-05, + "loss": 0.7566, + "step": 106 + }, + { + "epoch": 0.856, + "grad_norm": 0.43539993591753107, + "learning_rate": 1.0723215490213634e-05, + "loss": 0.8281, + "step": 107 + }, + { + "epoch": 0.864, + "grad_norm": 0.40760104117287915, + "learning_rate": 9.583733034714981e-06, + "loss": 0.7678, + "step": 108 + }, + { + "epoch": 0.872, + "grad_norm": 0.44161157683534574, + "learning_rate": 8.505197417404687e-06, + "loss": 0.8346, + "step": 109 + }, + { + "epoch": 0.88, + "grad_norm": 0.39973155716896946, + "learning_rate": 7.488335646131628e-06, + "loss": 0.8143, + "step": 110 + }, + { + "epoch": 0.888, + "grad_norm": 0.4248556182588792, + "learning_rate": 6.533833156292679e-06, + "loss": 0.7733, + "step": 111 + }, + { + "epoch": 0.896, + "grad_norm": 0.4040332779709177, + "learning_rate": 5.6423333488018095e-06, + "loss": 0.7275, + "step": 112 + }, + { + "epoch": 0.904, + "grad_norm": 0.4130206933527327, + "learning_rate": 4.8144371563930476e-06, + "loss": 0.7827, + "step": 113 + }, + { + "epoch": 0.912, + "grad_norm": 0.4331445858777044, + "learning_rate": 4.050702638550275e-06, + "loss": 0.769, + "step": 114 + }, + { + "epoch": 0.92, + "grad_norm": 0.3985474826850022, + "learning_rate": 3.3516446053363015e-06, + "loss": 0.7983, + "step": 115 + }, + { + "epoch": 0.928, + "grad_norm": 0.4201917197177302, + "learning_rate": 2.717734270375272e-06, + "loss": 0.7817, + "step": 116 + }, + { + "epoch": 0.936, + "grad_norm": 0.3816079162312025, + "learning_rate": 2.1493989332218468e-06, + "loss": 0.697, + "step": 117 + }, + { + "epoch": 0.944, + "grad_norm": 0.4266755473386918, + "learning_rate": 1.6470216913317626e-06, + "loss": 0.8057, + "step": 118 + }, + { + "epoch": 0.952, + "grad_norm": 0.3622774541651628, + "learning_rate": 1.2109411818274852e-06, + "loss": 0.744, + "step": 119 + }, + { + "epoch": 0.96, + "grad_norm": 0.41675537181777855, + "learning_rate": 8.41451353233369e-07, + "loss": 0.7613, + "step": 120 + }, + { + "epoch": 0.968, + "grad_norm": 0.6487814429829765, + "learning_rate": 5.388012673338661e-07, + "loss": 0.7765, + "step": 121 + }, + { + "epoch": 0.976, + "grad_norm": 0.4230934030543345, + "learning_rate": 3.0319493128866396e-07, + "loss": 0.7546, + "step": 122 + }, + { + "epoch": 0.984, + "grad_norm": 0.4167473753566679, + "learning_rate": 1.3479116011769767e-07, + "loss": 0.7033, + "step": 123 + }, + { + "epoch": 0.992, + "grad_norm": 0.37194711508688, + "learning_rate": 3.370346964876036e-08, + "loss": 0.7665, + "step": 124 + }, + { + "epoch": 1.0, + "grad_norm": 0.4061978675006465, + "learning_rate": 0.0, + "loss": 0.7446, + "step": 125 + }, + { + "epoch": 1.0, + "step": 125, + "total_flos": 112068802379776.0, + "train_loss": 0.8331891593933105, + "train_runtime": 1965.4906, + "train_samples_per_second": 1.018, + "train_steps_per_second": 0.064 + } + ], + "logging_steps": 1.0, + "max_steps": 125, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 112068802379776.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_2_epochs_1_GA_4_lora/README.md b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_2_epochs_1_GA_4_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_2_epochs_1_GA_4_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_2_epochs_1_GA_4_lora/adapter_config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_2_epochs_1_GA_4_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6ca5bf4b3175f756c017116c7f640cbe129e656d --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_2_epochs_1_GA_4_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "gate_proj", + "v_proj", + "up_proj", + "down_proj", + "k_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_2_epochs_1_GA_4_lora/adapter_model.safetensors b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_2_epochs_1_GA_4_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..792402cae76d3c1972b4f468e13ce7fb0237a240 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_2_epochs_1_GA_4_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:acedc725eb44a6d7255443bc027118a61e62213a28a736a085681fef8bc721bd +size 671150064 diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_2_epochs_1_GA_4_lora/config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_2_epochs_1_GA_4_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_2_epochs_1_GA_4_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_2_epochs_1_GA_4_lora/non_lora_trainables.bin b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_2_epochs_1_GA_4_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..430e31bcd64c5ca3b20a5814af380429af9e51a3 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_2_epochs_1_GA_4_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb14c93195a8ed1845b7ebb0917ba61adb75db6e4388d846a3863c7fdabe57c6 +size 918507402 diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_2_epochs_1_GA_4_lora/trainer_state.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_2_epochs_1_GA_4_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..3f69163d4d2b8e2f75597575e73f23ac0d525acc --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_2_epochs_1_GA_4_lora/trainer_state.json @@ -0,0 +1,476 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.992, + "eval_steps": 500, + "global_step": 62, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.016, + "grad_norm": 0.843145838243195, + "learning_rate": 0.0001, + "loss": 1.4119, + "step": 1 + }, + { + "epoch": 0.032, + "grad_norm": 0.9380096803116589, + "learning_rate": 0.0002, + "loss": 1.4634, + "step": 2 + }, + { + "epoch": 0.048, + "grad_norm": 0.9463053624227347, + "learning_rate": 0.0001998629534754574, + "loss": 1.2292, + "step": 3 + }, + { + "epoch": 0.064, + "grad_norm": 0.8489288880806201, + "learning_rate": 0.00019945218953682734, + "loss": 1.0505, + "step": 4 + }, + { + "epoch": 0.08, + "grad_norm": 0.5599243053149323, + "learning_rate": 0.00019876883405951377, + "loss": 1.0059, + "step": 5 + }, + { + "epoch": 0.096, + "grad_norm": 0.5443751929373011, + "learning_rate": 0.00019781476007338058, + "loss": 0.9792, + "step": 6 + }, + { + "epoch": 0.112, + "grad_norm": 0.42752971478789437, + "learning_rate": 0.00019659258262890683, + "loss": 0.8965, + "step": 7 + }, + { + "epoch": 0.128, + "grad_norm": 0.4032131719894276, + "learning_rate": 0.00019510565162951537, + "loss": 0.9045, + "step": 8 + }, + { + "epoch": 0.144, + "grad_norm": 0.6875225939389418, + "learning_rate": 0.00019335804264972018, + "loss": 0.9446, + "step": 9 + }, + { + "epoch": 0.16, + "grad_norm": 0.45195238831600093, + "learning_rate": 0.0001913545457642601, + "loss": 0.8833, + "step": 10 + }, + { + "epoch": 0.176, + "grad_norm": 0.36373689043217144, + "learning_rate": 0.0001891006524188368, + "loss": 0.8311, + "step": 11 + }, + { + "epoch": 0.192, + "grad_norm": 0.3723458886036847, + "learning_rate": 0.00018660254037844388, + "loss": 0.8731, + "step": 12 + }, + { + "epoch": 0.208, + "grad_norm": 0.3404123418563292, + "learning_rate": 0.00018386705679454242, + "loss": 0.8563, + "step": 13 + }, + { + "epoch": 0.224, + "grad_norm": 0.39141927900118156, + "learning_rate": 0.00018090169943749476, + "loss": 0.9065, + "step": 14 + }, + { + "epoch": 0.24, + "grad_norm": 0.3377913900856818, + "learning_rate": 0.0001777145961456971, + "loss": 0.8531, + "step": 15 + }, + { + "epoch": 0.256, + "grad_norm": 0.3516799806634102, + "learning_rate": 0.00017431448254773944, + "loss": 0.8616, + "step": 16 + }, + { + "epoch": 0.272, + "grad_norm": 0.34391919997614717, + "learning_rate": 0.00017071067811865476, + "loss": 0.8651, + "step": 17 + }, + { + "epoch": 0.288, + "grad_norm": 0.333954567934155, + "learning_rate": 0.00016691306063588583, + "loss": 0.8458, + "step": 18 + }, + { + "epoch": 0.304, + "grad_norm": 0.32630409233069796, + "learning_rate": 0.00016293203910498376, + "loss": 0.7994, + "step": 19 + }, + { + "epoch": 0.32, + "grad_norm": 0.3627179301532884, + "learning_rate": 0.00015877852522924732, + "loss": 0.842, + "step": 20 + }, + { + "epoch": 0.336, + "grad_norm": 0.3664591708206515, + "learning_rate": 0.00015446390350150273, + "loss": 0.8224, + "step": 21 + }, + { + "epoch": 0.352, + "grad_norm": 0.40521270822120725, + "learning_rate": 0.00015000000000000001, + "loss": 0.8444, + "step": 22 + }, + { + "epoch": 0.368, + "grad_norm": 0.37903876457944136, + "learning_rate": 0.00014539904997395468, + "loss": 0.8006, + "step": 23 + }, + { + "epoch": 0.384, + "grad_norm": 0.34049120165740693, + "learning_rate": 0.00014067366430758004, + "loss": 0.8079, + "step": 24 + }, + { + "epoch": 0.4, + "grad_norm": 0.3877286031540173, + "learning_rate": 0.00013583679495453, + "loss": 0.8292, + "step": 25 + }, + { + "epoch": 0.416, + "grad_norm": 0.31924225776411685, + "learning_rate": 0.00013090169943749476, + "loss": 0.8034, + "step": 26 + }, + { + "epoch": 0.432, + "grad_norm": 0.32758604465985947, + "learning_rate": 0.00012588190451025207, + "loss": 0.7981, + "step": 27 + }, + { + "epoch": 0.448, + "grad_norm": 0.3347640325790976, + "learning_rate": 0.00012079116908177593, + "loss": 0.8259, + "step": 28 + }, + { + "epoch": 0.464, + "grad_norm": 0.3296926636369933, + "learning_rate": 0.0001156434465040231, + "loss": 0.8455, + "step": 29 + }, + { + "epoch": 0.48, + "grad_norm": 0.367558043246359, + "learning_rate": 0.00011045284632676536, + "loss": 0.8063, + "step": 30 + }, + { + "epoch": 0.496, + "grad_norm": 0.32660814025542856, + "learning_rate": 0.0001052335956242944, + "loss": 0.7778, + "step": 31 + }, + { + "epoch": 0.512, + "grad_norm": 0.33057989221956413, + "learning_rate": 0.0001, + "loss": 0.8322, + "step": 32 + }, + { + "epoch": 0.528, + "grad_norm": 0.35495975739470687, + "learning_rate": 9.476640437570562e-05, + "loss": 0.7939, + "step": 33 + }, + { + "epoch": 0.544, + "grad_norm": 0.333883052997245, + "learning_rate": 8.954715367323468e-05, + "loss": 0.8724, + "step": 34 + }, + { + "epoch": 0.56, + "grad_norm": 0.5342512191259564, + "learning_rate": 8.435655349597689e-05, + "loss": 0.8032, + "step": 35 + }, + { + "epoch": 0.576, + "grad_norm": 0.30404137682103216, + "learning_rate": 7.920883091822408e-05, + "loss": 0.805, + "step": 36 + }, + { + "epoch": 0.592, + "grad_norm": 0.30505193619981874, + "learning_rate": 7.411809548974792e-05, + "loss": 0.8082, + "step": 37 + }, + { + "epoch": 0.608, + "grad_norm": 0.2979225234091672, + "learning_rate": 6.909830056250527e-05, + "loss": 0.7991, + "step": 38 + }, + { + "epoch": 0.624, + "grad_norm": 0.3370269390268292, + "learning_rate": 6.416320504546997e-05, + "loss": 0.7908, + "step": 39 + }, + { + "epoch": 0.64, + "grad_norm": 0.33162910921880767, + "learning_rate": 5.9326335692419995e-05, + "loss": 0.8453, + "step": 40 + }, + { + "epoch": 0.656, + "grad_norm": 0.34380203708081264, + "learning_rate": 5.4600950026045326e-05, + "loss": 0.868, + "step": 41 + }, + { + "epoch": 0.672, + "grad_norm": 0.2915663459031845, + "learning_rate": 5.000000000000002e-05, + "loss": 0.7608, + "step": 42 + }, + { + "epoch": 0.688, + "grad_norm": 0.28523801049541647, + "learning_rate": 4.5536096498497295e-05, + "loss": 0.7654, + "step": 43 + }, + { + "epoch": 0.704, + "grad_norm": 0.3427716619184912, + "learning_rate": 4.12214747707527e-05, + "loss": 0.8163, + "step": 44 + }, + { + "epoch": 0.72, + "grad_norm": 0.3389826002732601, + "learning_rate": 3.7067960895016275e-05, + "loss": 0.8405, + "step": 45 + }, + { + "epoch": 0.736, + "grad_norm": 0.283733099028701, + "learning_rate": 3.308693936411421e-05, + "loss": 0.7817, + "step": 46 + }, + { + "epoch": 0.752, + "grad_norm": 0.3172649852684197, + "learning_rate": 2.9289321881345254e-05, + "loss": 0.7519, + "step": 47 + }, + { + "epoch": 0.768, + "grad_norm": 0.360330762850194, + "learning_rate": 2.5685517452260567e-05, + "loss": 0.851, + "step": 48 + }, + { + "epoch": 0.784, + "grad_norm": 0.3084947842667855, + "learning_rate": 2.2285403854302912e-05, + "loss": 0.7728, + "step": 49 + }, + { + "epoch": 0.8, + "grad_norm": 0.3205822742446387, + "learning_rate": 1.9098300562505266e-05, + "loss": 0.8271, + "step": 50 + }, + { + "epoch": 0.816, + "grad_norm": 0.29060856532813023, + "learning_rate": 1.6132943205457606e-05, + "loss": 0.7972, + "step": 51 + }, + { + "epoch": 0.832, + "grad_norm": 0.30648749167538564, + "learning_rate": 1.339745962155613e-05, + "loss": 0.7784, + "step": 52 + }, + { + "epoch": 0.848, + "grad_norm": 0.28288086095491943, + "learning_rate": 1.0899347581163221e-05, + "loss": 0.8033, + "step": 53 + }, + { + "epoch": 0.864, + "grad_norm": 0.2989179635932655, + "learning_rate": 8.645454235739903e-06, + "loss": 0.8131, + "step": 54 + }, + { + "epoch": 0.88, + "grad_norm": 0.3315017220426919, + "learning_rate": 6.6419573502798374e-06, + "loss": 0.84, + "step": 55 + }, + { + "epoch": 0.896, + "grad_norm": 0.32214020332980725, + "learning_rate": 4.8943483704846475e-06, + "loss": 0.7667, + "step": 56 + }, + { + "epoch": 0.912, + "grad_norm": 0.3259571865204803, + "learning_rate": 3.40741737109318e-06, + "loss": 0.7922, + "step": 57 + }, + { + "epoch": 0.928, + "grad_norm": 0.3067289342641416, + "learning_rate": 2.1852399266194314e-06, + "loss": 0.8022, + "step": 58 + }, + { + "epoch": 0.944, + "grad_norm": 0.3059666813346514, + "learning_rate": 1.231165940486234e-06, + "loss": 0.7679, + "step": 59 + }, + { + "epoch": 0.96, + "grad_norm": 0.3321644769509772, + "learning_rate": 5.478104631726711e-07, + "loss": 0.7719, + "step": 60 + }, + { + "epoch": 0.976, + "grad_norm": 0.3182538954435247, + "learning_rate": 1.3704652454261668e-07, + "loss": 0.7828, + "step": 61 + }, + { + "epoch": 0.992, + "grad_norm": 0.29629475427484525, + "learning_rate": 0.0, + "loss": 0.7501, + "step": 62 + }, + { + "epoch": 0.992, + "step": 62, + "total_flos": 162629892046848.0, + "train_loss": 0.8567078267374346, + "train_runtime": 1951.7305, + "train_samples_per_second": 1.025, + "train_steps_per_second": 0.032 + } + ], + "logging_steps": 1.0, + "max_steps": 62, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 162629892046848.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_3_epochs_1_GA_2_lora/README.md b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_3_epochs_1_GA_2_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_3_epochs_1_GA_2_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_3_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_3_epochs_1_GA_2_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..14dc4c5478c9fd3f4f0916a025c8fb02eba5d078 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_3_epochs_1_GA_2_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "gate_proj", + "up_proj", + "v_proj", + "down_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d0024979e8f6c705d6696a7e2a7ecd43c7556bbc --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7238743f849ee06b63a05a6609fc45e28096664df38b5c595e6c064ed952f54 +size 671150064 diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_3_epochs_1_GA_2_lora/config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_3_epochs_1_GA_2_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_3_epochs_1_GA_2_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..2ba09ffb6d0c6495764b3a54d99ef74a347d8603 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:482270ab678dd84cbecf4cb5dacecbb4bbe2ca6f150cb50aefd6bdfedd2ec3c9 +size 918507402 diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_3_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_3_epochs_1_GA_2_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..66cf301ba336f5e6a03d1954b8fbe660ba20af65 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_3_epochs_1_GA_2_lora/trainer_state.json @@ -0,0 +1,917 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 125, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.008, + "grad_norm": 1.2054836032909777, + "learning_rate": 5e-05, + "loss": 1.5017, + "step": 1 + }, + { + "epoch": 0.016, + "grad_norm": 0.954238989110196, + "learning_rate": 0.0001, + "loss": 1.4967, + "step": 2 + }, + { + "epoch": 0.024, + "grad_norm": 0.7678466151736065, + "learning_rate": 0.00015000000000000001, + "loss": 1.283, + "step": 3 + }, + { + "epoch": 0.032, + "grad_norm": 1.6082417908446505, + "learning_rate": 0.0002, + "loss": 1.1555, + "step": 4 + }, + { + "epoch": 0.04, + "grad_norm": 1.0562286263106848, + "learning_rate": 0.00019996629653035126, + "loss": 1.0669, + "step": 5 + }, + { + "epoch": 0.048, + "grad_norm": 0.8265656805740538, + "learning_rate": 0.00019986520883988232, + "loss": 1.0068, + "step": 6 + }, + { + "epoch": 0.056, + "grad_norm": 0.7655842282381125, + "learning_rate": 0.00019969680506871137, + "loss": 0.9599, + "step": 7 + }, + { + "epoch": 0.064, + "grad_norm": 0.5777687757328043, + "learning_rate": 0.00019946119873266613, + "loss": 1.0017, + "step": 8 + }, + { + "epoch": 0.072, + "grad_norm": 0.721581419395825, + "learning_rate": 0.00019915854864676664, + "loss": 0.9135, + "step": 9 + }, + { + "epoch": 0.08, + "grad_norm": 0.542168928482532, + "learning_rate": 0.00019878905881817252, + "loss": 0.9624, + "step": 10 + }, + { + "epoch": 0.088, + "grad_norm": 0.6108376268031162, + "learning_rate": 0.00019835297830866826, + "loss": 0.9251, + "step": 11 + }, + { + "epoch": 0.096, + "grad_norm": 0.5244504646150466, + "learning_rate": 0.00019785060106677818, + "loss": 0.8855, + "step": 12 + }, + { + "epoch": 0.104, + "grad_norm": 0.5837393841633354, + "learning_rate": 0.00019728226572962473, + "loss": 0.9486, + "step": 13 + }, + { + "epoch": 0.112, + "grad_norm": 0.6146534952260426, + "learning_rate": 0.0001966483553946637, + "loss": 0.8341, + "step": 14 + }, + { + "epoch": 0.12, + "grad_norm": 0.5077809580036337, + "learning_rate": 0.00019594929736144976, + "loss": 0.8972, + "step": 15 + }, + { + "epoch": 0.128, + "grad_norm": 0.5351319800406077, + "learning_rate": 0.00019518556284360696, + "loss": 0.898, + "step": 16 + }, + { + "epoch": 0.136, + "grad_norm": 0.4815175525442992, + "learning_rate": 0.0001943576666511982, + "loss": 0.8511, + "step": 17 + }, + { + "epoch": 0.144, + "grad_norm": 0.45271754447994694, + "learning_rate": 0.0001934661668437073, + "loss": 0.8504, + "step": 18 + }, + { + "epoch": 0.152, + "grad_norm": 0.4781494508790699, + "learning_rate": 0.0001925116643538684, + "loss": 0.8543, + "step": 19 + }, + { + "epoch": 0.16, + "grad_norm": 0.48122762032392724, + "learning_rate": 0.00019149480258259533, + "loss": 0.823, + "step": 20 + }, + { + "epoch": 0.168, + "grad_norm": 0.4914747468410434, + "learning_rate": 0.00019041626696528503, + "loss": 0.8671, + "step": 21 + }, + { + "epoch": 0.176, + "grad_norm": 0.4733738820147795, + "learning_rate": 0.0001892767845097864, + "loss": 0.8808, + "step": 22 + }, + { + "epoch": 0.184, + "grad_norm": 0.47069335051320876, + "learning_rate": 0.00018807712330634642, + "loss": 0.8112, + "step": 23 + }, + { + "epoch": 0.192, + "grad_norm": 0.4738223832737001, + "learning_rate": 0.0001868180920098644, + "loss": 0.835, + "step": 24 + }, + { + "epoch": 0.2, + "grad_norm": 0.5064877458997864, + "learning_rate": 0.00018550053929480202, + "loss": 0.8642, + "step": 25 + }, + { + "epoch": 0.208, + "grad_norm": 0.44858291604670125, + "learning_rate": 0.00018412535328311814, + "loss": 0.854, + "step": 26 + }, + { + "epoch": 0.216, + "grad_norm": 0.5177837399916985, + "learning_rate": 0.0001826934609456129, + "loss": 0.8771, + "step": 27 + }, + { + "epoch": 0.224, + "grad_norm": 0.5501566931582197, + "learning_rate": 0.00018120582747708502, + "loss": 0.8715, + "step": 28 + }, + { + "epoch": 0.232, + "grad_norm": 0.4316046486398667, + "learning_rate": 0.0001796634556457236, + "loss": 0.8968, + "step": 29 + }, + { + "epoch": 0.24, + "grad_norm": 0.42763874529306595, + "learning_rate": 0.0001780673851171728, + "loss": 0.8705, + "step": 30 + }, + { + "epoch": 0.248, + "grad_norm": 0.4450128699392376, + "learning_rate": 0.00017641869175372493, + "loss": 0.8039, + "step": 31 + }, + { + "epoch": 0.256, + "grad_norm": 0.46132404658319076, + "learning_rate": 0.00017471848688911464, + "loss": 0.8485, + "step": 32 + }, + { + "epoch": 0.264, + "grad_norm": 0.4784554434315401, + "learning_rate": 0.000172967916579403, + "loss": 0.8429, + "step": 33 + }, + { + "epoch": 0.272, + "grad_norm": 0.42929968246494943, + "learning_rate": 0.00017116816083045602, + "loss": 0.8193, + "step": 34 + }, + { + "epoch": 0.28, + "grad_norm": 0.39407240093282986, + "learning_rate": 0.0001693204328025389, + "loss": 0.7664, + "step": 35 + }, + { + "epoch": 0.288, + "grad_norm": 0.5640849704230063, + "learning_rate": 0.00016742597799256182, + "loss": 0.86, + "step": 36 + }, + { + "epoch": 0.296, + "grad_norm": 0.5892482999779673, + "learning_rate": 0.00016548607339452853, + "loss": 0.9928, + "step": 37 + }, + { + "epoch": 0.304, + "grad_norm": 0.4078559973229823, + "learning_rate": 0.00016350202663875386, + "loss": 0.711, + "step": 38 + }, + { + "epoch": 0.312, + "grad_norm": 0.43250872905374294, + "learning_rate": 0.0001614751751104301, + "loss": 0.7459, + "step": 39 + }, + { + "epoch": 0.32, + "grad_norm": 0.44988474520884375, + "learning_rate": 0.00015940688504813662, + "loss": 0.7682, + "step": 40 + }, + { + "epoch": 0.328, + "grad_norm": 0.4666212178261856, + "learning_rate": 0.00015729855062290022, + "loss": 0.7974, + "step": 41 + }, + { + "epoch": 0.336, + "grad_norm": 0.4474201623249179, + "learning_rate": 0.00015515159299842707, + "loss": 0.8009, + "step": 42 + }, + { + "epoch": 0.344, + "grad_norm": 0.6773915402946965, + "learning_rate": 0.00015296745937313987, + "loss": 0.9602, + "step": 43 + }, + { + "epoch": 0.352, + "grad_norm": 0.5060905150524829, + "learning_rate": 0.00015074762200466556, + "loss": 0.8342, + "step": 44 + }, + { + "epoch": 0.36, + "grad_norm": 0.4772552860058863, + "learning_rate": 0.00014849357721743168, + "loss": 0.8891, + "step": 45 + }, + { + "epoch": 0.368, + "grad_norm": 0.46843358870389834, + "learning_rate": 0.00014620684439403962, + "loss": 0.8455, + "step": 46 + }, + { + "epoch": 0.376, + "grad_norm": 0.5439841894696938, + "learning_rate": 0.0001438889649510956, + "loss": 0.7722, + "step": 47 + }, + { + "epoch": 0.384, + "grad_norm": 0.43909813467352327, + "learning_rate": 0.00014154150130018866, + "loss": 0.7935, + "step": 48 + }, + { + "epoch": 0.392, + "grad_norm": 0.4994615028953572, + "learning_rate": 0.00013916603579471705, + "loss": 0.8643, + "step": 49 + }, + { + "epoch": 0.4, + "grad_norm": 0.42557838435232637, + "learning_rate": 0.000136764169663272, + "loss": 0.8597, + "step": 50 + }, + { + "epoch": 0.408, + "grad_norm": 0.4464800320333645, + "learning_rate": 0.00013433752193029886, + "loss": 0.8157, + "step": 51 + }, + { + "epoch": 0.416, + "grad_norm": 0.4154749453225568, + "learning_rate": 0.00013188772832476188, + "loss": 0.7622, + "step": 52 + }, + { + "epoch": 0.424, + "grad_norm": 0.4423818247673819, + "learning_rate": 0.00012941644017754964, + "loss": 0.8785, + "step": 53 + }, + { + "epoch": 0.432, + "grad_norm": 0.48505813932469505, + "learning_rate": 0.00012692532330836346, + "loss": 0.8218, + "step": 54 + }, + { + "epoch": 0.44, + "grad_norm": 0.5297224593394644, + "learning_rate": 0.00012441605690283915, + "loss": 0.9302, + "step": 55 + }, + { + "epoch": 0.448, + "grad_norm": 0.4172622442182296, + "learning_rate": 0.0001218903323806595, + "loss": 0.8525, + "step": 56 + }, + { + "epoch": 0.456, + "grad_norm": 0.6155680038093249, + "learning_rate": 0.00011934985225541998, + "loss": 0.7678, + "step": 57 + }, + { + "epoch": 0.464, + "grad_norm": 0.46909148631310976, + "learning_rate": 0.00011679632898701649, + "loss": 0.8107, + "step": 58 + }, + { + "epoch": 0.472, + "grad_norm": 0.4879035231063422, + "learning_rate": 0.00011423148382732853, + "loss": 0.8199, + "step": 59 + }, + { + "epoch": 0.48, + "grad_norm": 0.43485709638018905, + "learning_rate": 0.00011165704565997593, + "loss": 0.7881, + "step": 60 + }, + { + "epoch": 0.488, + "grad_norm": 0.4448453191087327, + "learning_rate": 0.00010907474983493144, + "loss": 0.7868, + "step": 61 + }, + { + "epoch": 0.496, + "grad_norm": 0.5306331930728724, + "learning_rate": 0.0001064863369987743, + "loss": 0.8523, + "step": 62 + }, + { + "epoch": 0.504, + "grad_norm": 0.4992968847325637, + "learning_rate": 0.00010389355192137377, + "loss": 0.7766, + "step": 63 + }, + { + "epoch": 0.512, + "grad_norm": 0.38789752705861585, + "learning_rate": 0.0001012981423197931, + "loss": 0.741, + "step": 64 + }, + { + "epoch": 0.52, + "grad_norm": 0.41232002134377704, + "learning_rate": 9.870185768020693e-05, + "loss": 0.7716, + "step": 65 + }, + { + "epoch": 0.528, + "grad_norm": 0.4104927091301637, + "learning_rate": 9.610644807862625e-05, + "loss": 0.791, + "step": 66 + }, + { + "epoch": 0.536, + "grad_norm": 0.4536062964116601, + "learning_rate": 9.35136630012257e-05, + "loss": 0.8224, + "step": 67 + }, + { + "epoch": 0.544, + "grad_norm": 0.4209000797113228, + "learning_rate": 9.092525016506858e-05, + "loss": 0.826, + "step": 68 + }, + { + "epoch": 0.552, + "grad_norm": 0.43435410119915097, + "learning_rate": 8.83429543400241e-05, + "loss": 0.8052, + "step": 69 + }, + { + "epoch": 0.56, + "grad_norm": 0.3718755656365949, + "learning_rate": 8.57685161726715e-05, + "loss": 0.7323, + "step": 70 + }, + { + "epoch": 0.568, + "grad_norm": 0.3996307000914964, + "learning_rate": 8.320367101298351e-05, + "loss": 0.7496, + "step": 71 + }, + { + "epoch": 0.576, + "grad_norm": 0.40822290274043077, + "learning_rate": 8.065014774458003e-05, + "loss": 0.7065, + "step": 72 + }, + { + "epoch": 0.584, + "grad_norm": 0.45131242199847604, + "learning_rate": 7.810966761934053e-05, + "loss": 0.768, + "step": 73 + }, + { + "epoch": 0.592, + "grad_norm": 0.45472522741851235, + "learning_rate": 7.558394309716088e-05, + "loss": 0.7355, + "step": 74 + }, + { + "epoch": 0.6, + "grad_norm": 0.4284582880321103, + "learning_rate": 7.307467669163655e-05, + "loss": 0.8072, + "step": 75 + }, + { + "epoch": 0.608, + "grad_norm": 0.4077950430987271, + "learning_rate": 7.058355982245037e-05, + "loss": 0.7883, + "step": 76 + }, + { + "epoch": 0.616, + "grad_norm": 0.443123826971581, + "learning_rate": 6.811227167523815e-05, + "loss": 0.7785, + "step": 77 + }, + { + "epoch": 0.624, + "grad_norm": 0.42191143033368195, + "learning_rate": 6.566247806970119e-05, + "loss": 0.8033, + "step": 78 + }, + { + "epoch": 0.632, + "grad_norm": 0.544556846459226, + "learning_rate": 6.323583033672799e-05, + "loss": 0.837, + "step": 79 + }, + { + "epoch": 0.64, + "grad_norm": 0.3897980689268285, + "learning_rate": 6.083396420528298e-05, + "loss": 0.7923, + "step": 80 + }, + { + "epoch": 0.648, + "grad_norm": 0.4340545855141479, + "learning_rate": 5.845849869981137e-05, + "loss": 0.7744, + "step": 81 + }, + { + "epoch": 0.656, + "grad_norm": 0.49582655199280046, + "learning_rate": 5.611103504890444e-05, + "loss": 0.7324, + "step": 82 + }, + { + "epoch": 0.664, + "grad_norm": 0.4238385179654696, + "learning_rate": 5.379315560596038e-05, + "loss": 0.7767, + "step": 83 + }, + { + "epoch": 0.672, + "grad_norm": 0.5308351549134397, + "learning_rate": 5.1506422782568345e-05, + "loss": 0.8437, + "step": 84 + }, + { + "epoch": 0.68, + "grad_norm": 0.4110401224429797, + "learning_rate": 4.9252377995334444e-05, + "loss": 0.7636, + "step": 85 + }, + { + "epoch": 0.688, + "grad_norm": 0.4873911227349872, + "learning_rate": 4.703254062686017e-05, + "loss": 0.8241, + "step": 86 + }, + { + "epoch": 0.696, + "grad_norm": 0.42240756526435774, + "learning_rate": 4.484840700157295e-05, + "loss": 0.7083, + "step": 87 + }, + { + "epoch": 0.704, + "grad_norm": 0.42036965303141166, + "learning_rate": 4.270144937709981e-05, + "loss": 0.8498, + "step": 88 + }, + { + "epoch": 0.712, + "grad_norm": 0.3878007767118561, + "learning_rate": 4.059311495186338e-05, + "loss": 0.7921, + "step": 89 + }, + { + "epoch": 0.72, + "grad_norm": 0.5086889199469316, + "learning_rate": 3.852482488956992e-05, + "loss": 0.8485, + "step": 90 + }, + { + "epoch": 0.728, + "grad_norm": 0.38477687023795526, + "learning_rate": 3.649797336124615e-05, + "loss": 0.7613, + "step": 91 + }, + { + "epoch": 0.736, + "grad_norm": 0.46085964244404704, + "learning_rate": 3.45139266054715e-05, + "loss": 0.7689, + "step": 92 + }, + { + "epoch": 0.744, + "grad_norm": 0.4071525230768036, + "learning_rate": 3.257402200743821e-05, + "loss": 0.8064, + "step": 93 + }, + { + "epoch": 0.752, + "grad_norm": 0.6214994476872179, + "learning_rate": 3.0679567197461134e-05, + "loss": 0.7933, + "step": 94 + }, + { + "epoch": 0.76, + "grad_norm": 0.48118329615371713, + "learning_rate": 2.8831839169543996e-05, + "loss": 0.7912, + "step": 95 + }, + { + "epoch": 0.768, + "grad_norm": 0.39560748056930567, + "learning_rate": 2.7032083420597e-05, + "loss": 0.755, + "step": 96 + }, + { + "epoch": 0.776, + "grad_norm": 0.3600598201768951, + "learning_rate": 2.528151311088537e-05, + "loss": 0.7395, + "step": 97 + }, + { + "epoch": 0.784, + "grad_norm": 0.4652092977802121, + "learning_rate": 2.3581308246275103e-05, + "loss": 0.7723, + "step": 98 + }, + { + "epoch": 0.792, + "grad_norm": 0.48845591366553076, + "learning_rate": 2.1932614882827197e-05, + "loss": 0.7768, + "step": 99 + }, + { + "epoch": 0.8, + "grad_norm": 0.4114611715018722, + "learning_rate": 2.03365443542764e-05, + "loss": 0.8183, + "step": 100 + }, + { + "epoch": 0.808, + "grad_norm": 0.37871588744645535, + "learning_rate": 1.879417252291502e-05, + "loss": 0.8128, + "step": 101 + }, + { + "epoch": 0.816, + "grad_norm": 0.3919736583377089, + "learning_rate": 1.730653905438714e-05, + "loss": 0.7329, + "step": 102 + }, + { + "epoch": 0.824, + "grad_norm": 0.4469471117249365, + "learning_rate": 1.587464671688187e-05, + "loss": 0.834, + "step": 103 + }, + { + "epoch": 0.832, + "grad_norm": 0.468934964665669, + "learning_rate": 1.4499460705197998e-05, + "loss": 0.7651, + "step": 104 + }, + { + "epoch": 0.84, + "grad_norm": 0.48844250247818427, + "learning_rate": 1.3181907990135622e-05, + "loss": 0.7297, + "step": 105 + }, + { + "epoch": 0.848, + "grad_norm": 0.4524364309414198, + "learning_rate": 1.1922876693653585e-05, + "loss": 0.8363, + "step": 106 + }, + { + "epoch": 0.856, + "grad_norm": 0.45213211484185767, + "learning_rate": 1.0723215490213634e-05, + "loss": 0.763, + "step": 107 + }, + { + "epoch": 0.864, + "grad_norm": 0.46723958862108017, + "learning_rate": 9.583733034714981e-06, + "loss": 0.7722, + "step": 108 + }, + { + "epoch": 0.872, + "grad_norm": 0.43863412802839247, + "learning_rate": 8.505197417404687e-06, + "loss": 0.7547, + "step": 109 + }, + { + "epoch": 0.88, + "grad_norm": 0.3953813072931627, + "learning_rate": 7.488335646131628e-06, + "loss": 0.795, + "step": 110 + }, + { + "epoch": 0.888, + "grad_norm": 0.4415874074419842, + "learning_rate": 6.533833156292679e-06, + "loss": 0.7514, + "step": 111 + }, + { + "epoch": 0.896, + "grad_norm": 0.42128134880801454, + "learning_rate": 5.6423333488018095e-06, + "loss": 0.7878, + "step": 112 + }, + { + "epoch": 0.904, + "grad_norm": 0.3788662324527675, + "learning_rate": 4.8144371563930476e-06, + "loss": 0.7441, + "step": 113 + }, + { + "epoch": 0.912, + "grad_norm": 0.4286441840661642, + "learning_rate": 4.050702638550275e-06, + "loss": 0.7478, + "step": 114 + }, + { + "epoch": 0.92, + "grad_norm": 0.45115458519962515, + "learning_rate": 3.3516446053363015e-06, + "loss": 0.818, + "step": 115 + }, + { + "epoch": 0.928, + "grad_norm": 0.4211270204030477, + "learning_rate": 2.717734270375272e-06, + "loss": 0.811, + "step": 116 + }, + { + "epoch": 0.936, + "grad_norm": 0.3847476912513861, + "learning_rate": 2.1493989332218468e-06, + "loss": 0.8067, + "step": 117 + }, + { + "epoch": 0.944, + "grad_norm": 0.41440864739044353, + "learning_rate": 1.6470216913317626e-06, + "loss": 0.805, + "step": 118 + }, + { + "epoch": 0.952, + "grad_norm": 0.4303158521887713, + "learning_rate": 1.2109411818274852e-06, + "loss": 0.8265, + "step": 119 + }, + { + "epoch": 0.96, + "grad_norm": 0.38697710842550026, + "learning_rate": 8.41451353233369e-07, + "loss": 0.7795, + "step": 120 + }, + { + "epoch": 0.968, + "grad_norm": 0.4391608267010154, + "learning_rate": 5.388012673338661e-07, + "loss": 0.8082, + "step": 121 + }, + { + "epoch": 0.976, + "grad_norm": 0.5265387648782383, + "learning_rate": 3.0319493128866396e-07, + "loss": 0.7479, + "step": 122 + }, + { + "epoch": 0.984, + "grad_norm": 0.41646004486406024, + "learning_rate": 1.3479116011769767e-07, + "loss": 0.7377, + "step": 123 + }, + { + "epoch": 0.992, + "grad_norm": 0.4877047827862541, + "learning_rate": 3.370346964876036e-08, + "loss": 0.7319, + "step": 124 + }, + { + "epoch": 1.0, + "grad_norm": 0.45270307907667634, + "learning_rate": 0.0, + "loss": 0.7682, + "step": 125 + }, + { + "epoch": 1.0, + "step": 125, + "total_flos": 110004521598976.0, + "train_loss": 0.8352149829864502, + "train_runtime": 1951.6222, + "train_samples_per_second": 1.025, + "train_steps_per_second": 0.064 + } + ], + "logging_steps": 1.0, + "max_steps": 125, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 110004521598976.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_3_epochs_1_GA_4_lora/README.md b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_3_epochs_1_GA_4_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_3_epochs_1_GA_4_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_3_epochs_1_GA_4_lora/adapter_config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_3_epochs_1_GA_4_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f7b4f2b72c113b4dbf491f048e0cdfca81e6b93d --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_3_epochs_1_GA_4_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "up_proj", + "down_proj", + "q_proj", + "o_proj", + "gate_proj", + "k_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_3_epochs_1_GA_4_lora/adapter_model.safetensors b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_3_epochs_1_GA_4_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..13bf1da3339716dade0983b19e397dd49390d28c --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_3_epochs_1_GA_4_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2aafd6948ca148902b2730fe8e1d95c1571e4c2c55ae20b09385ef8b1051908 +size 671150064 diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_3_epochs_1_GA_4_lora/config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_3_epochs_1_GA_4_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_3_epochs_1_GA_4_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_3_epochs_1_GA_4_lora/non_lora_trainables.bin b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_3_epochs_1_GA_4_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..40f2a2fca64ed7d3c4d0ec15bf4004c483212968 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_3_epochs_1_GA_4_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84dc84f62ee656a692e7ad364457c586b7cb19189958b0beefc8b064215fa3eb +size 918507402 diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_3_epochs_1_GA_4_lora/trainer_state.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_3_epochs_1_GA_4_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..09912d963c5c8b138e517f69057dcd75e5239775 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_2000_repeat_3_epochs_1_GA_4_lora/trainer_state.json @@ -0,0 +1,476 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.992, + "eval_steps": 500, + "global_step": 62, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.016, + "grad_norm": 0.9925947841990691, + "learning_rate": 0.0001, + "loss": 1.4992, + "step": 1 + }, + { + "epoch": 0.032, + "grad_norm": 0.9063043848088819, + "learning_rate": 0.0002, + "loss": 1.413, + "step": 2 + }, + { + "epoch": 0.048, + "grad_norm": 1.0488931165337085, + "learning_rate": 0.0001998629534754574, + "loss": 1.2539, + "step": 3 + }, + { + "epoch": 0.064, + "grad_norm": 0.9333388718058, + "learning_rate": 0.00019945218953682734, + "loss": 1.116, + "step": 4 + }, + { + "epoch": 0.08, + "grad_norm": 0.7972030303434287, + "learning_rate": 0.00019876883405951377, + "loss": 1.0013, + "step": 5 + }, + { + "epoch": 0.096, + "grad_norm": 0.48274056578620395, + "learning_rate": 0.00019781476007338058, + "loss": 0.9533, + "step": 6 + }, + { + "epoch": 0.112, + "grad_norm": 0.5434712430562691, + "learning_rate": 0.00019659258262890683, + "loss": 0.9351, + "step": 7 + }, + { + "epoch": 0.128, + "grad_norm": 0.41458095983795856, + "learning_rate": 0.00019510565162951537, + "loss": 0.9267, + "step": 8 + }, + { + "epoch": 0.144, + "grad_norm": 0.38026363031320687, + "learning_rate": 0.00019335804264972018, + "loss": 0.8753, + "step": 9 + }, + { + "epoch": 0.16, + "grad_norm": 0.3959045312773757, + "learning_rate": 0.0001913545457642601, + "loss": 0.8596, + "step": 10 + }, + { + "epoch": 0.176, + "grad_norm": 0.4235300919941064, + "learning_rate": 0.0001891006524188368, + "loss": 0.8921, + "step": 11 + }, + { + "epoch": 0.192, + "grad_norm": 0.38628932587744996, + "learning_rate": 0.00018660254037844388, + "loss": 0.8455, + "step": 12 + }, + { + "epoch": 0.208, + "grad_norm": 0.44369543041418, + "learning_rate": 0.00018386705679454242, + "loss": 0.8781, + "step": 13 + }, + { + "epoch": 0.224, + "grad_norm": 0.4109843283491267, + "learning_rate": 0.00018090169943749476, + "loss": 0.8934, + "step": 14 + }, + { + "epoch": 0.24, + "grad_norm": 0.3749483097846325, + "learning_rate": 0.0001777145961456971, + "loss": 0.8951, + "step": 15 + }, + { + "epoch": 0.256, + "grad_norm": 0.37971805656126606, + "learning_rate": 0.00017431448254773944, + "loss": 0.8409, + "step": 16 + }, + { + "epoch": 0.272, + "grad_norm": 0.3754156243492605, + "learning_rate": 0.00017071067811865476, + "loss": 0.843, + "step": 17 + }, + { + "epoch": 0.288, + "grad_norm": 0.38235046938512196, + "learning_rate": 0.00016691306063588583, + "loss": 0.8271, + "step": 18 + }, + { + "epoch": 0.304, + "grad_norm": 0.35465995333344363, + "learning_rate": 0.00016293203910498376, + "loss": 0.8661, + "step": 19 + }, + { + "epoch": 0.32, + "grad_norm": 0.32753934791861467, + "learning_rate": 0.00015877852522924732, + "loss": 0.7687, + "step": 20 + }, + { + "epoch": 0.336, + "grad_norm": 0.33113997504958703, + "learning_rate": 0.00015446390350150273, + "loss": 0.8097, + "step": 21 + }, + { + "epoch": 0.352, + "grad_norm": 0.4136216149035267, + "learning_rate": 0.00015000000000000001, + "loss": 0.91, + "step": 22 + }, + { + "epoch": 0.368, + "grad_norm": 0.35190712822057785, + "learning_rate": 0.00014539904997395468, + "loss": 0.8756, + "step": 23 + }, + { + "epoch": 0.384, + "grad_norm": 0.34968365505535903, + "learning_rate": 0.00014067366430758004, + "loss": 0.7906, + "step": 24 + }, + { + "epoch": 0.4, + "grad_norm": 0.39304907579519915, + "learning_rate": 0.00013583679495453, + "loss": 0.8758, + "step": 25 + }, + { + "epoch": 0.416, + "grad_norm": 0.31310585822396536, + "learning_rate": 0.00013090169943749476, + "loss": 0.7944, + "step": 26 + }, + { + "epoch": 0.432, + "grad_norm": 0.355537005877262, + "learning_rate": 0.00012588190451025207, + "loss": 0.8596, + "step": 27 + }, + { + "epoch": 0.448, + "grad_norm": 0.3756637193077202, + "learning_rate": 0.00012079116908177593, + "loss": 0.9009, + "step": 28 + }, + { + "epoch": 0.464, + "grad_norm": 0.32107783868074957, + "learning_rate": 0.0001156434465040231, + "loss": 0.7966, + "step": 29 + }, + { + "epoch": 0.48, + "grad_norm": 0.3259594093549215, + "learning_rate": 0.00011045284632676536, + "loss": 0.8079, + "step": 30 + }, + { + "epoch": 0.496, + "grad_norm": 0.3311002448589628, + "learning_rate": 0.0001052335956242944, + "loss": 0.8255, + "step": 31 + }, + { + "epoch": 0.512, + "grad_norm": 0.3052727717674524, + "learning_rate": 0.0001, + "loss": 0.7639, + "step": 32 + }, + { + "epoch": 0.528, + "grad_norm": 0.3337694649290884, + "learning_rate": 9.476640437570562e-05, + "loss": 0.7875, + "step": 33 + }, + { + "epoch": 0.544, + "grad_norm": 0.3263404508721736, + "learning_rate": 8.954715367323468e-05, + "loss": 0.8333, + "step": 34 + }, + { + "epoch": 0.56, + "grad_norm": 0.3029279849900034, + "learning_rate": 8.435655349597689e-05, + "loss": 0.7776, + "step": 35 + }, + { + "epoch": 0.576, + "grad_norm": 0.31611457076476746, + "learning_rate": 7.920883091822408e-05, + "loss": 0.7446, + "step": 36 + }, + { + "epoch": 0.592, + "grad_norm": 0.323006284197396, + "learning_rate": 7.411809548974792e-05, + "loss": 0.7605, + "step": 37 + }, + { + "epoch": 0.608, + "grad_norm": 0.3148016363568966, + "learning_rate": 6.909830056250527e-05, + "loss": 0.8099, + "step": 38 + }, + { + "epoch": 0.624, + "grad_norm": 0.3473242390582786, + "learning_rate": 6.416320504546997e-05, + "loss": 0.8058, + "step": 39 + }, + { + "epoch": 0.64, + "grad_norm": 0.3550223861193946, + "learning_rate": 5.9326335692419995e-05, + "loss": 0.8288, + "step": 40 + }, + { + "epoch": 0.656, + "grad_norm": 0.2994454014403082, + "learning_rate": 5.4600950026045326e-05, + "loss": 0.7598, + "step": 41 + }, + { + "epoch": 0.672, + "grad_norm": 0.33866595315124687, + "learning_rate": 5.000000000000002e-05, + "loss": 0.824, + "step": 42 + }, + { + "epoch": 0.688, + "grad_norm": 0.3259479229606084, + "learning_rate": 4.5536096498497295e-05, + "loss": 0.8082, + "step": 43 + }, + { + "epoch": 0.704, + "grad_norm": 0.3150729309963534, + "learning_rate": 4.12214747707527e-05, + "loss": 0.792, + "step": 44 + }, + { + "epoch": 0.72, + "grad_norm": 0.32722828375141205, + "learning_rate": 3.7067960895016275e-05, + "loss": 0.8406, + "step": 45 + }, + { + "epoch": 0.736, + "grad_norm": 0.3159616589658377, + "learning_rate": 3.308693936411421e-05, + "loss": 0.7793, + "step": 46 + }, + { + "epoch": 0.752, + "grad_norm": 0.32912979548147964, + "learning_rate": 2.9289321881345254e-05, + "loss": 0.8127, + "step": 47 + }, + { + "epoch": 0.768, + "grad_norm": 0.28949891628996977, + "learning_rate": 2.5685517452260567e-05, + "loss": 0.7895, + "step": 48 + }, + { + "epoch": 0.784, + "grad_norm": 0.3166894220074553, + "learning_rate": 2.2285403854302912e-05, + "loss": 0.772, + "step": 49 + }, + { + "epoch": 0.8, + "grad_norm": 0.33068916261585946, + "learning_rate": 1.9098300562505266e-05, + "loss": 0.8098, + "step": 50 + }, + { + "epoch": 0.816, + "grad_norm": 0.29806269029528093, + "learning_rate": 1.6132943205457606e-05, + "loss": 0.7874, + "step": 51 + }, + { + "epoch": 0.832, + "grad_norm": 0.34900463428322714, + "learning_rate": 1.339745962155613e-05, + "loss": 0.8179, + "step": 52 + }, + { + "epoch": 0.848, + "grad_norm": 0.3414454707988493, + "learning_rate": 1.0899347581163221e-05, + "loss": 0.7967, + "step": 53 + }, + { + "epoch": 0.864, + "grad_norm": 0.33107800165050943, + "learning_rate": 8.645454235739903e-06, + "loss": 0.7805, + "step": 54 + }, + { + "epoch": 0.88, + "grad_norm": 0.3141353833466967, + "learning_rate": 6.6419573502798374e-06, + "loss": 0.7911, + "step": 55 + }, + { + "epoch": 0.896, + "grad_norm": 0.3116391507758903, + "learning_rate": 4.8943483704846475e-06, + "loss": 0.7838, + "step": 56 + }, + { + "epoch": 0.912, + "grad_norm": 0.3166462645242051, + "learning_rate": 3.40741737109318e-06, + "loss": 0.762, + "step": 57 + }, + { + "epoch": 0.928, + "grad_norm": 0.32702334706441605, + "learning_rate": 2.1852399266194314e-06, + "loss": 0.8318, + "step": 58 + }, + { + "epoch": 0.944, + "grad_norm": 0.3110840740582141, + "learning_rate": 1.231165940486234e-06, + "loss": 0.8228, + "step": 59 + }, + { + "epoch": 0.96, + "grad_norm": 0.2997913312758848, + "learning_rate": 5.478104631726711e-07, + "loss": 0.8228, + "step": 60 + }, + { + "epoch": 0.976, + "grad_norm": 0.33605354873969534, + "learning_rate": 1.3704652454261668e-07, + "loss": 0.7955, + "step": 61 + }, + { + "epoch": 0.992, + "grad_norm": 0.3031227746472349, + "learning_rate": 0.0, + "loss": 0.7533, + "step": 62 + }, + { + "epoch": 0.992, + "step": 62, + "total_flos": 157753171902464.0, + "train_loss": 0.8592852305981421, + "train_runtime": 1934.1804, + "train_samples_per_second": 1.034, + "train_steps_per_second": 0.032 + } + ], + "logging_steps": 1.0, + "max_steps": 62, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 157753171902464.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_1_epochs_1_GA_2_lora/README.md b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_1_epochs_1_GA_2_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_1_epochs_1_GA_2_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_1_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_1_epochs_1_GA_2_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..854add7ac5db7e3e908ed889d540ad1779eff2a4 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_1_epochs_1_GA_2_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "v_proj", + "gate_proj", + "o_proj", + "down_proj", + "up_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e6b6d6636eae72163b2f1ef83595a0e963fad996 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6988dc6825c7f45d2b7f3c9c8570387c493c88e64abe1ddf9f219e63b235e85c +size 671150064 diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_1_epochs_1_GA_2_lora/config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_1_epochs_1_GA_2_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_1_epochs_1_GA_2_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..f98d2916fcb473e77ae7f374fe59b7ef47db98d7 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65b1c8ec25756d095485d794acaf39c499fd58ee505eeeeabcd5bae8d1614f01 +size 918507402 diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_1_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_1_epochs_1_GA_2_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..45739e4f509123aa2a4bd28da39bdcd56e091204 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_1_epochs_1_GA_2_lora/trainer_state.json @@ -0,0 +1,13167 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 1875, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005333333333333334, + "grad_norm": 0.9048587630560018, + "learning_rate": 3.5087719298245615e-06, + "loss": 1.3101, + "step": 1 + }, + { + "epoch": 0.0010666666666666667, + "grad_norm": 1.0519343911287062, + "learning_rate": 7.017543859649123e-06, + "loss": 1.5194, + "step": 2 + }, + { + "epoch": 0.0016, + "grad_norm": 1.0224268066373816, + "learning_rate": 1.0526315789473684e-05, + "loss": 1.4268, + "step": 3 + }, + { + "epoch": 0.0021333333333333334, + "grad_norm": 0.8656445885930367, + "learning_rate": 1.4035087719298246e-05, + "loss": 1.3686, + "step": 4 + }, + { + "epoch": 0.0026666666666666666, + "grad_norm": 0.8733966115001257, + "learning_rate": 1.7543859649122806e-05, + "loss": 1.39, + "step": 5 + }, + { + "epoch": 0.0032, + "grad_norm": 0.7783883541710935, + "learning_rate": 2.105263157894737e-05, + "loss": 1.2797, + "step": 6 + }, + { + "epoch": 0.0037333333333333333, + "grad_norm": 0.8561524970209158, + "learning_rate": 2.456140350877193e-05, + "loss": 1.3017, + "step": 7 + }, + { + "epoch": 0.004266666666666667, + "grad_norm": 1.0927395440249963, + "learning_rate": 2.8070175438596492e-05, + "loss": 1.321, + "step": 8 + }, + { + "epoch": 0.0048, + "grad_norm": 0.8281037473311745, + "learning_rate": 3.157894736842105e-05, + "loss": 1.2032, + "step": 9 + }, + { + "epoch": 0.005333333333333333, + "grad_norm": 0.807714070924002, + "learning_rate": 3.508771929824561e-05, + "loss": 1.0587, + "step": 10 + }, + { + "epoch": 0.005866666666666667, + "grad_norm": 1.0483239118173178, + "learning_rate": 3.859649122807018e-05, + "loss": 1.0284, + "step": 11 + }, + { + "epoch": 0.0064, + "grad_norm": 0.8507389995888006, + "learning_rate": 4.210526315789474e-05, + "loss": 1.0391, + "step": 12 + }, + { + "epoch": 0.006933333333333333, + "grad_norm": 0.8208532938213147, + "learning_rate": 4.56140350877193e-05, + "loss": 0.9867, + "step": 13 + }, + { + "epoch": 0.007466666666666667, + "grad_norm": 0.8334380171945804, + "learning_rate": 4.912280701754386e-05, + "loss": 0.9704, + "step": 14 + }, + { + "epoch": 0.008, + "grad_norm": 0.7429403910377161, + "learning_rate": 5.2631578947368424e-05, + "loss": 0.9036, + "step": 15 + }, + { + "epoch": 0.008533333333333334, + "grad_norm": 0.7967026196736772, + "learning_rate": 5.6140350877192984e-05, + "loss": 1.0404, + "step": 16 + }, + { + "epoch": 0.009066666666666667, + "grad_norm": 0.6848930255684692, + "learning_rate": 5.9649122807017544e-05, + "loss": 0.9056, + "step": 17 + }, + { + "epoch": 0.0096, + "grad_norm": 0.6346302299463533, + "learning_rate": 6.31578947368421e-05, + "loss": 0.8942, + "step": 18 + }, + { + "epoch": 0.010133333333333333, + "grad_norm": 0.6535131512797249, + "learning_rate": 6.666666666666667e-05, + "loss": 0.8365, + "step": 19 + }, + { + "epoch": 0.010666666666666666, + "grad_norm": 0.6509491421625535, + "learning_rate": 7.017543859649122e-05, + "loss": 0.9277, + "step": 20 + }, + { + "epoch": 0.0112, + "grad_norm": 0.4955595210807453, + "learning_rate": 7.368421052631579e-05, + "loss": 0.8423, + "step": 21 + }, + { + "epoch": 0.011733333333333333, + "grad_norm": 0.5361172069164036, + "learning_rate": 7.719298245614036e-05, + "loss": 0.9436, + "step": 22 + }, + { + "epoch": 0.012266666666666667, + "grad_norm": 0.5973579229429707, + "learning_rate": 8.070175438596491e-05, + "loss": 0.8421, + "step": 23 + }, + { + "epoch": 0.0128, + "grad_norm": 0.5849298010283289, + "learning_rate": 8.421052631578948e-05, + "loss": 0.8671, + "step": 24 + }, + { + "epoch": 0.013333333333333334, + "grad_norm": 0.5759168499300029, + "learning_rate": 8.771929824561403e-05, + "loss": 0.8499, + "step": 25 + }, + { + "epoch": 0.013866666666666666, + "grad_norm": 0.515738534877068, + "learning_rate": 9.12280701754386e-05, + "loss": 0.9084, + "step": 26 + }, + { + "epoch": 0.0144, + "grad_norm": 0.6110132431887146, + "learning_rate": 9.473684210526316e-05, + "loss": 0.9516, + "step": 27 + }, + { + "epoch": 0.014933333333333333, + "grad_norm": 0.5725801143409102, + "learning_rate": 9.824561403508771e-05, + "loss": 0.8457, + "step": 28 + }, + { + "epoch": 0.015466666666666667, + "grad_norm": 0.5012871976844285, + "learning_rate": 0.0001017543859649123, + "loss": 0.8426, + "step": 29 + }, + { + "epoch": 0.016, + "grad_norm": 0.5340718478626321, + "learning_rate": 0.00010526315789473685, + "loss": 0.8921, + "step": 30 + }, + { + "epoch": 0.016533333333333334, + "grad_norm": 0.6221479080631429, + "learning_rate": 0.00010877192982456141, + "loss": 0.8718, + "step": 31 + }, + { + "epoch": 0.017066666666666667, + "grad_norm": 0.47340491615336217, + "learning_rate": 0.00011228070175438597, + "loss": 0.8138, + "step": 32 + }, + { + "epoch": 0.0176, + "grad_norm": 0.5906496115820806, + "learning_rate": 0.00011578947368421053, + "loss": 0.8141, + "step": 33 + }, + { + "epoch": 0.018133333333333335, + "grad_norm": 0.5911771567770389, + "learning_rate": 0.00011929824561403509, + "loss": 0.8854, + "step": 34 + }, + { + "epoch": 0.018666666666666668, + "grad_norm": 0.8272434744116125, + "learning_rate": 0.00012280701754385965, + "loss": 0.9541, + "step": 35 + }, + { + "epoch": 0.0192, + "grad_norm": 0.5348620169937606, + "learning_rate": 0.0001263157894736842, + "loss": 0.8495, + "step": 36 + }, + { + "epoch": 0.019733333333333332, + "grad_norm": 0.4862616162975578, + "learning_rate": 0.0001298245614035088, + "loss": 0.837, + "step": 37 + }, + { + "epoch": 0.020266666666666665, + "grad_norm": 0.559516838035242, + "learning_rate": 0.00013333333333333334, + "loss": 0.8702, + "step": 38 + }, + { + "epoch": 0.0208, + "grad_norm": 0.5831454319541884, + "learning_rate": 0.0001368421052631579, + "loss": 0.8287, + "step": 39 + }, + { + "epoch": 0.021333333333333333, + "grad_norm": 0.527006776442556, + "learning_rate": 0.00014035087719298245, + "loss": 0.7676, + "step": 40 + }, + { + "epoch": 0.021866666666666666, + "grad_norm": 0.5946342001260494, + "learning_rate": 0.00014385964912280703, + "loss": 0.8255, + "step": 41 + }, + { + "epoch": 0.0224, + "grad_norm": 0.5359987813729276, + "learning_rate": 0.00014736842105263158, + "loss": 0.8617, + "step": 42 + }, + { + "epoch": 0.022933333333333333, + "grad_norm": 0.532072440256019, + "learning_rate": 0.00015087719298245616, + "loss": 0.8779, + "step": 43 + }, + { + "epoch": 0.023466666666666667, + "grad_norm": 0.5430218364120416, + "learning_rate": 0.0001543859649122807, + "loss": 0.8794, + "step": 44 + }, + { + "epoch": 0.024, + "grad_norm": 0.4524285119863887, + "learning_rate": 0.00015789473684210527, + "loss": 0.7943, + "step": 45 + }, + { + "epoch": 0.024533333333333334, + "grad_norm": 0.4634163070353862, + "learning_rate": 0.00016140350877192982, + "loss": 0.7873, + "step": 46 + }, + { + "epoch": 0.025066666666666668, + "grad_norm": 0.49667886103870035, + "learning_rate": 0.0001649122807017544, + "loss": 0.7983, + "step": 47 + }, + { + "epoch": 0.0256, + "grad_norm": 0.5049931634295179, + "learning_rate": 0.00016842105263157895, + "loss": 0.7981, + "step": 48 + }, + { + "epoch": 0.026133333333333335, + "grad_norm": 0.4568261725717611, + "learning_rate": 0.00017192982456140353, + "loss": 0.8456, + "step": 49 + }, + { + "epoch": 0.02666666666666667, + "grad_norm": 1.045715299468113, + "learning_rate": 0.00017543859649122806, + "loss": 0.8974, + "step": 50 + }, + { + "epoch": 0.0272, + "grad_norm": 0.48309628455544334, + "learning_rate": 0.00017894736842105264, + "loss": 0.8137, + "step": 51 + }, + { + "epoch": 0.027733333333333332, + "grad_norm": 0.5679687118692228, + "learning_rate": 0.0001824561403508772, + "loss": 0.8756, + "step": 52 + }, + { + "epoch": 0.028266666666666666, + "grad_norm": 0.520824672017796, + "learning_rate": 0.00018596491228070177, + "loss": 0.8069, + "step": 53 + }, + { + "epoch": 0.0288, + "grad_norm": 0.5430786799958935, + "learning_rate": 0.00018947368421052632, + "loss": 0.8539, + "step": 54 + }, + { + "epoch": 0.029333333333333333, + "grad_norm": 0.5525254109241546, + "learning_rate": 0.00019298245614035088, + "loss": 0.8129, + "step": 55 + }, + { + "epoch": 0.029866666666666666, + "grad_norm": 0.5377953868559492, + "learning_rate": 0.00019649122807017543, + "loss": 0.8489, + "step": 56 + }, + { + "epoch": 0.0304, + "grad_norm": 0.40336792195406596, + "learning_rate": 0.0002, + "loss": 0.7662, + "step": 57 + }, + { + "epoch": 0.030933333333333334, + "grad_norm": 0.5044050444585633, + "learning_rate": 0.00019999985069241055, + "loss": 0.8726, + "step": 58 + }, + { + "epoch": 0.031466666666666664, + "grad_norm": 0.517946310570053, + "learning_rate": 0.00019999940277008808, + "loss": 0.8351, + "step": 59 + }, + { + "epoch": 0.032, + "grad_norm": 0.45762285553642407, + "learning_rate": 0.00019999865623437013, + "loss": 0.7832, + "step": 60 + }, + { + "epoch": 0.03253333333333333, + "grad_norm": 0.49520896027565536, + "learning_rate": 0.00019999761108748597, + "loss": 0.7762, + "step": 61 + }, + { + "epoch": 0.03306666666666667, + "grad_norm": 0.4841734921934868, + "learning_rate": 0.00019999626733255662, + "loss": 0.7769, + "step": 62 + }, + { + "epoch": 0.0336, + "grad_norm": 0.5308062158451023, + "learning_rate": 0.00019999462497359466, + "loss": 0.9068, + "step": 63 + }, + { + "epoch": 0.034133333333333335, + "grad_norm": 0.491633130500264, + "learning_rate": 0.00019999268401550447, + "loss": 0.8502, + "step": 64 + }, + { + "epoch": 0.034666666666666665, + "grad_norm": 0.44318703361045014, + "learning_rate": 0.000199990444464082, + "loss": 0.8117, + "step": 65 + }, + { + "epoch": 0.0352, + "grad_norm": 0.39215294250553073, + "learning_rate": 0.00019998790632601496, + "loss": 0.7808, + "step": 66 + }, + { + "epoch": 0.03573333333333333, + "grad_norm": 0.4675337591215834, + "learning_rate": 0.00019998506960888256, + "loss": 0.8424, + "step": 67 + }, + { + "epoch": 0.03626666666666667, + "grad_norm": 0.5095405074681937, + "learning_rate": 0.00019998193432115572, + "loss": 0.7829, + "step": 68 + }, + { + "epoch": 0.0368, + "grad_norm": 0.48681538869656305, + "learning_rate": 0.0001999785004721968, + "loss": 0.8225, + "step": 69 + }, + { + "epoch": 0.037333333333333336, + "grad_norm": 0.4204760302318927, + "learning_rate": 0.00019997476807225985, + "loss": 0.7648, + "step": 70 + }, + { + "epoch": 0.037866666666666667, + "grad_norm": 0.5229961269669894, + "learning_rate": 0.0001999707371324904, + "loss": 0.8768, + "step": 71 + }, + { + "epoch": 0.0384, + "grad_norm": 0.5443752475655127, + "learning_rate": 0.00019996640766492543, + "loss": 0.7805, + "step": 72 + }, + { + "epoch": 0.038933333333333334, + "grad_norm": 0.5127777436976269, + "learning_rate": 0.00019996177968249334, + "loss": 0.8312, + "step": 73 + }, + { + "epoch": 0.039466666666666664, + "grad_norm": 0.4867549804722123, + "learning_rate": 0.0001999568531990141, + "loss": 0.8295, + "step": 74 + }, + { + "epoch": 0.04, + "grad_norm": 0.48800025181975715, + "learning_rate": 0.00019995162822919883, + "loss": 0.9258, + "step": 75 + }, + { + "epoch": 0.04053333333333333, + "grad_norm": 0.49668936103675393, + "learning_rate": 0.00019994610478865011, + "loss": 0.8858, + "step": 76 + }, + { + "epoch": 0.04106666666666667, + "grad_norm": 0.46659150620401435, + "learning_rate": 0.0001999402828938618, + "loss": 0.8052, + "step": 77 + }, + { + "epoch": 0.0416, + "grad_norm": 0.4743344063668828, + "learning_rate": 0.00019993416256221895, + "loss": 0.8172, + "step": 78 + }, + { + "epoch": 0.042133333333333335, + "grad_norm": 0.4869439291640609, + "learning_rate": 0.00019992774381199778, + "loss": 0.7756, + "step": 79 + }, + { + "epoch": 0.042666666666666665, + "grad_norm": 0.45117175186769337, + "learning_rate": 0.00019992102666236566, + "loss": 0.841, + "step": 80 + }, + { + "epoch": 0.0432, + "grad_norm": 0.5395601612912219, + "learning_rate": 0.00019991401113338104, + "loss": 0.8084, + "step": 81 + }, + { + "epoch": 0.04373333333333333, + "grad_norm": 0.44014531925221906, + "learning_rate": 0.00019990669724599336, + "loss": 0.7761, + "step": 82 + }, + { + "epoch": 0.04426666666666667, + "grad_norm": 0.45721365845663203, + "learning_rate": 0.00019989908502204292, + "loss": 0.8421, + "step": 83 + }, + { + "epoch": 0.0448, + "grad_norm": 0.43168167968243937, + "learning_rate": 0.00019989117448426108, + "loss": 0.8152, + "step": 84 + }, + { + "epoch": 0.04533333333333334, + "grad_norm": 0.48439026237325494, + "learning_rate": 0.00019988296565626987, + "loss": 0.8061, + "step": 85 + }, + { + "epoch": 0.04586666666666667, + "grad_norm": 0.4314723559695366, + "learning_rate": 0.00019987445856258206, + "loss": 0.791, + "step": 86 + }, + { + "epoch": 0.0464, + "grad_norm": 0.45774727472954585, + "learning_rate": 0.00019986565322860115, + "loss": 0.7741, + "step": 87 + }, + { + "epoch": 0.046933333333333334, + "grad_norm": 0.5013043190046269, + "learning_rate": 0.00019985654968062122, + "loss": 0.8525, + "step": 88 + }, + { + "epoch": 0.047466666666666664, + "grad_norm": 0.4160355669759571, + "learning_rate": 0.00019984714794582683, + "loss": 0.8043, + "step": 89 + }, + { + "epoch": 0.048, + "grad_norm": 0.5359859927158864, + "learning_rate": 0.00019983744805229296, + "loss": 0.8082, + "step": 90 + }, + { + "epoch": 0.04853333333333333, + "grad_norm": 0.4991768151741999, + "learning_rate": 0.000199827450028985, + "loss": 0.7806, + "step": 91 + }, + { + "epoch": 0.04906666666666667, + "grad_norm": 0.4480616596528081, + "learning_rate": 0.00019981715390575858, + "loss": 0.7346, + "step": 92 + }, + { + "epoch": 0.0496, + "grad_norm": 0.5734125655859981, + "learning_rate": 0.00019980655971335945, + "loss": 0.807, + "step": 93 + }, + { + "epoch": 0.050133333333333335, + "grad_norm": 0.4390162171856093, + "learning_rate": 0.00019979566748342347, + "loss": 0.7387, + "step": 94 + }, + { + "epoch": 0.050666666666666665, + "grad_norm": 0.47565640795131414, + "learning_rate": 0.00019978447724847652, + "loss": 0.8039, + "step": 95 + }, + { + "epoch": 0.0512, + "grad_norm": 0.5507870634162346, + "learning_rate": 0.00019977298904193437, + "loss": 0.8111, + "step": 96 + }, + { + "epoch": 0.05173333333333333, + "grad_norm": 0.537742020914287, + "learning_rate": 0.00019976120289810247, + "loss": 0.8338, + "step": 97 + }, + { + "epoch": 0.05226666666666667, + "grad_norm": 0.5145928182957982, + "learning_rate": 0.00019974911885217608, + "loss": 0.7749, + "step": 98 + }, + { + "epoch": 0.0528, + "grad_norm": 0.4570558282445582, + "learning_rate": 0.00019973673694024, + "loss": 0.838, + "step": 99 + }, + { + "epoch": 0.05333333333333334, + "grad_norm": 0.4799901830500369, + "learning_rate": 0.0001997240571992685, + "loss": 0.786, + "step": 100 + }, + { + "epoch": 0.05386666666666667, + "grad_norm": 0.5251908959822083, + "learning_rate": 0.00019971107966712518, + "loss": 0.7709, + "step": 101 + }, + { + "epoch": 0.0544, + "grad_norm": 0.5531788343340106, + "learning_rate": 0.00019969780438256293, + "loss": 0.8997, + "step": 102 + }, + { + "epoch": 0.054933333333333334, + "grad_norm": 0.4966213467509509, + "learning_rate": 0.0001996842313852238, + "loss": 0.7913, + "step": 103 + }, + { + "epoch": 0.055466666666666664, + "grad_norm": 0.4745035073937513, + "learning_rate": 0.00019967036071563877, + "loss": 0.7176, + "step": 104 + }, + { + "epoch": 0.056, + "grad_norm": 0.5021042926736523, + "learning_rate": 0.0001996561924152278, + "loss": 0.7945, + "step": 105 + }, + { + "epoch": 0.05653333333333333, + "grad_norm": 0.47409043153267333, + "learning_rate": 0.0001996417265262996, + "loss": 0.7728, + "step": 106 + }, + { + "epoch": 0.05706666666666667, + "grad_norm": 0.47479390037974667, + "learning_rate": 0.00019962696309205148, + "loss": 0.8747, + "step": 107 + }, + { + "epoch": 0.0576, + "grad_norm": 0.465138144925051, + "learning_rate": 0.0001996119021565693, + "loss": 0.8798, + "step": 108 + }, + { + "epoch": 0.058133333333333335, + "grad_norm": 0.49635684069772074, + "learning_rate": 0.0001995965437648273, + "loss": 0.8658, + "step": 109 + }, + { + "epoch": 0.058666666666666666, + "grad_norm": 0.4346501860458164, + "learning_rate": 0.00019958088796268793, + "loss": 0.7793, + "step": 110 + }, + { + "epoch": 0.0592, + "grad_norm": 0.45969112593289096, + "learning_rate": 0.0001995649347969019, + "loss": 0.821, + "step": 111 + }, + { + "epoch": 0.05973333333333333, + "grad_norm": 0.44301514811842374, + "learning_rate": 0.00019954868431510764, + "loss": 0.7998, + "step": 112 + }, + { + "epoch": 0.06026666666666667, + "grad_norm": 0.5218249227729164, + "learning_rate": 0.00019953213656583168, + "loss": 0.8215, + "step": 113 + }, + { + "epoch": 0.0608, + "grad_norm": 0.4531048118939405, + "learning_rate": 0.00019951529159848805, + "loss": 0.789, + "step": 114 + }, + { + "epoch": 0.06133333333333333, + "grad_norm": 0.48172721813636576, + "learning_rate": 0.00019949814946337838, + "loss": 0.8278, + "step": 115 + }, + { + "epoch": 0.06186666666666667, + "grad_norm": 0.5548702921280615, + "learning_rate": 0.00019948071021169174, + "loss": 0.8377, + "step": 116 + }, + { + "epoch": 0.0624, + "grad_norm": 0.4782174625773169, + "learning_rate": 0.00019946297389550433, + "loss": 0.8071, + "step": 117 + }, + { + "epoch": 0.06293333333333333, + "grad_norm": 0.48008735393392793, + "learning_rate": 0.00019944494056777946, + "loss": 0.7957, + "step": 118 + }, + { + "epoch": 0.06346666666666667, + "grad_norm": 0.4846278088312286, + "learning_rate": 0.00019942661028236745, + "loss": 0.8309, + "step": 119 + }, + { + "epoch": 0.064, + "grad_norm": 0.43094627314402284, + "learning_rate": 0.00019940798309400526, + "loss": 0.7827, + "step": 120 + }, + { + "epoch": 0.06453333333333333, + "grad_norm": 0.49785114222657906, + "learning_rate": 0.00019938905905831654, + "loss": 0.9121, + "step": 121 + }, + { + "epoch": 0.06506666666666666, + "grad_norm": 0.4529648152248877, + "learning_rate": 0.00019936983823181132, + "loss": 0.8266, + "step": 122 + }, + { + "epoch": 0.0656, + "grad_norm": 0.44916565811300524, + "learning_rate": 0.0001993503206718859, + "loss": 0.8329, + "step": 123 + }, + { + "epoch": 0.06613333333333334, + "grad_norm": 0.4058584416064084, + "learning_rate": 0.00019933050643682269, + "loss": 0.7247, + "step": 124 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 0.46997145543886404, + "learning_rate": 0.00019931039558578997, + "loss": 0.8424, + "step": 125 + }, + { + "epoch": 0.0672, + "grad_norm": 0.4293882737620628, + "learning_rate": 0.00019928998817884182, + "loss": 0.842, + "step": 126 + }, + { + "epoch": 0.06773333333333334, + "grad_norm": 0.5413296014362735, + "learning_rate": 0.00019926928427691786, + "loss": 0.8283, + "step": 127 + }, + { + "epoch": 0.06826666666666667, + "grad_norm": 0.5018749189961162, + "learning_rate": 0.00019924828394184306, + "loss": 0.7647, + "step": 128 + }, + { + "epoch": 0.0688, + "grad_norm": 1.1300842004771448, + "learning_rate": 0.00019922698723632767, + "loss": 0.8378, + "step": 129 + }, + { + "epoch": 0.06933333333333333, + "grad_norm": 0.4669099932081055, + "learning_rate": 0.0001992053942239668, + "loss": 0.7556, + "step": 130 + }, + { + "epoch": 0.06986666666666666, + "grad_norm": 0.5034657684944928, + "learning_rate": 0.0001991835049692405, + "loss": 0.8188, + "step": 131 + }, + { + "epoch": 0.0704, + "grad_norm": 0.6057387263076913, + "learning_rate": 0.00019916131953751342, + "loss": 0.8791, + "step": 132 + }, + { + "epoch": 0.07093333333333333, + "grad_norm": 0.4239059530916577, + "learning_rate": 0.0001991388379950346, + "loss": 0.6973, + "step": 133 + }, + { + "epoch": 0.07146666666666666, + "grad_norm": 0.7760985161052981, + "learning_rate": 0.0001991160604089374, + "loss": 0.8118, + "step": 134 + }, + { + "epoch": 0.072, + "grad_norm": 0.477661473194432, + "learning_rate": 0.00019909298684723904, + "loss": 0.7303, + "step": 135 + }, + { + "epoch": 0.07253333333333334, + "grad_norm": 0.4897224336939851, + "learning_rate": 0.00019906961737884077, + "loss": 0.7916, + "step": 136 + }, + { + "epoch": 0.07306666666666667, + "grad_norm": 0.4943394136702606, + "learning_rate": 0.00019904595207352737, + "loss": 0.8159, + "step": 137 + }, + { + "epoch": 0.0736, + "grad_norm": 0.47864484082798453, + "learning_rate": 0.00019902199100196697, + "loss": 0.7898, + "step": 138 + }, + { + "epoch": 0.07413333333333333, + "grad_norm": 0.44712867207442375, + "learning_rate": 0.000198997734235711, + "loss": 0.8483, + "step": 139 + }, + { + "epoch": 0.07466666666666667, + "grad_norm": 0.5026585442490596, + "learning_rate": 0.00019897318184719385, + "loss": 0.8696, + "step": 140 + }, + { + "epoch": 0.0752, + "grad_norm": 0.44196458116495235, + "learning_rate": 0.00019894833390973266, + "loss": 0.7583, + "step": 141 + }, + { + "epoch": 0.07573333333333333, + "grad_norm": 0.4236005733025835, + "learning_rate": 0.0001989231904975272, + "loss": 0.736, + "step": 142 + }, + { + "epoch": 0.07626666666666666, + "grad_norm": 0.46361836681455065, + "learning_rate": 0.00019889775168565943, + "loss": 0.7891, + "step": 143 + }, + { + "epoch": 0.0768, + "grad_norm": 0.5408348194034053, + "learning_rate": 0.00019887201755009357, + "loss": 0.7486, + "step": 144 + }, + { + "epoch": 0.07733333333333334, + "grad_norm": 0.47465928218004905, + "learning_rate": 0.00019884598816767563, + "loss": 0.83, + "step": 145 + }, + { + "epoch": 0.07786666666666667, + "grad_norm": 0.4484263671568565, + "learning_rate": 0.0001988196636161333, + "loss": 0.7349, + "step": 146 + }, + { + "epoch": 0.0784, + "grad_norm": 0.43818164281104116, + "learning_rate": 0.0001987930439740757, + "loss": 0.7336, + "step": 147 + }, + { + "epoch": 0.07893333333333333, + "grad_norm": 0.4813353284056389, + "learning_rate": 0.00019876612932099308, + "loss": 0.828, + "step": 148 + }, + { + "epoch": 0.07946666666666667, + "grad_norm": 0.511981521156402, + "learning_rate": 0.0001987389197372567, + "loss": 0.8613, + "step": 149 + }, + { + "epoch": 0.08, + "grad_norm": 0.5647154584679596, + "learning_rate": 0.00019871141530411853, + "loss": 0.7679, + "step": 150 + }, + { + "epoch": 0.08053333333333333, + "grad_norm": 0.44305957564766546, + "learning_rate": 0.00019868361610371097, + "loss": 0.7845, + "step": 151 + }, + { + "epoch": 0.08106666666666666, + "grad_norm": 0.4562664206946426, + "learning_rate": 0.00019865552221904665, + "loss": 0.794, + "step": 152 + }, + { + "epoch": 0.0816, + "grad_norm": 0.4543083736456288, + "learning_rate": 0.0001986271337340182, + "loss": 0.7812, + "step": 153 + }, + { + "epoch": 0.08213333333333334, + "grad_norm": 0.460607580393084, + "learning_rate": 0.00019859845073339787, + "loss": 0.7517, + "step": 154 + }, + { + "epoch": 0.08266666666666667, + "grad_norm": 0.4805994336463568, + "learning_rate": 0.00019856947330283752, + "loss": 0.8167, + "step": 155 + }, + { + "epoch": 0.0832, + "grad_norm": 0.5223439324811999, + "learning_rate": 0.00019854020152886814, + "loss": 0.819, + "step": 156 + }, + { + "epoch": 0.08373333333333334, + "grad_norm": 0.4707824952310109, + "learning_rate": 0.0001985106354988997, + "loss": 0.8005, + "step": 157 + }, + { + "epoch": 0.08426666666666667, + "grad_norm": 0.4542483658151442, + "learning_rate": 0.00019848077530122083, + "loss": 0.7088, + "step": 158 + }, + { + "epoch": 0.0848, + "grad_norm": 0.4757409168949553, + "learning_rate": 0.0001984506210249986, + "loss": 0.7295, + "step": 159 + }, + { + "epoch": 0.08533333333333333, + "grad_norm": 0.5502377382211688, + "learning_rate": 0.00019842017276027832, + "loss": 0.8607, + "step": 160 + }, + { + "epoch": 0.08586666666666666, + "grad_norm": 0.4735340368833073, + "learning_rate": 0.00019838943059798304, + "loss": 0.7139, + "step": 161 + }, + { + "epoch": 0.0864, + "grad_norm": 0.4148398649639404, + "learning_rate": 0.00019835839462991361, + "loss": 0.7629, + "step": 162 + }, + { + "epoch": 0.08693333333333333, + "grad_norm": 0.4821367727124406, + "learning_rate": 0.0001983270649487481, + "loss": 0.8125, + "step": 163 + }, + { + "epoch": 0.08746666666666666, + "grad_norm": 0.44594815157373763, + "learning_rate": 0.0001982954416480417, + "loss": 0.7765, + "step": 164 + }, + { + "epoch": 0.088, + "grad_norm": 0.4730271181666054, + "learning_rate": 0.00019826352482222638, + "loss": 0.8532, + "step": 165 + }, + { + "epoch": 0.08853333333333334, + "grad_norm": 0.421433524862573, + "learning_rate": 0.00019823131456661063, + "loss": 0.7891, + "step": 166 + }, + { + "epoch": 0.08906666666666667, + "grad_norm": 0.4816306325674117, + "learning_rate": 0.00019819881097737915, + "loss": 0.8018, + "step": 167 + }, + { + "epoch": 0.0896, + "grad_norm": 0.5062506020880575, + "learning_rate": 0.00019816601415159263, + "loss": 0.799, + "step": 168 + }, + { + "epoch": 0.09013333333333333, + "grad_norm": 0.4890455686153939, + "learning_rate": 0.00019813292418718732, + "loss": 0.8148, + "step": 169 + }, + { + "epoch": 0.09066666666666667, + "grad_norm": 0.508208932915509, + "learning_rate": 0.0001980995411829749, + "loss": 0.847, + "step": 170 + }, + { + "epoch": 0.0912, + "grad_norm": 0.4751378476016209, + "learning_rate": 0.0001980658652386421, + "loss": 0.8417, + "step": 171 + }, + { + "epoch": 0.09173333333333333, + "grad_norm": 0.4308596783807121, + "learning_rate": 0.0001980318964547504, + "loss": 0.7949, + "step": 172 + }, + { + "epoch": 0.09226666666666666, + "grad_norm": 0.44378724952748994, + "learning_rate": 0.0001979976349327357, + "loss": 0.7418, + "step": 173 + }, + { + "epoch": 0.0928, + "grad_norm": 0.47251508899567907, + "learning_rate": 0.00019796308077490817, + "loss": 0.776, + "step": 174 + }, + { + "epoch": 0.09333333333333334, + "grad_norm": 0.4719056631507855, + "learning_rate": 0.00019792823408445174, + "loss": 0.8101, + "step": 175 + }, + { + "epoch": 0.09386666666666667, + "grad_norm": 0.5154067580332798, + "learning_rate": 0.0001978930949654239, + "loss": 0.7865, + "step": 176 + }, + { + "epoch": 0.0944, + "grad_norm": 0.4774614610268319, + "learning_rate": 0.00019785766352275542, + "loss": 0.7611, + "step": 177 + }, + { + "epoch": 0.09493333333333333, + "grad_norm": 0.6279916872210253, + "learning_rate": 0.00019782193986224995, + "loss": 0.7965, + "step": 178 + }, + { + "epoch": 0.09546666666666667, + "grad_norm": 0.5368628256054744, + "learning_rate": 0.00019778592409058378, + "loss": 0.8515, + "step": 179 + }, + { + "epoch": 0.096, + "grad_norm": 0.5116000985794409, + "learning_rate": 0.00019774961631530545, + "loss": 0.8993, + "step": 180 + }, + { + "epoch": 0.09653333333333333, + "grad_norm": 0.45360288544505306, + "learning_rate": 0.0001977130166448355, + "loss": 0.7928, + "step": 181 + }, + { + "epoch": 0.09706666666666666, + "grad_norm": 0.43260462670796207, + "learning_rate": 0.00019767612518846608, + "loss": 0.7726, + "step": 182 + }, + { + "epoch": 0.0976, + "grad_norm": 0.4708179747920768, + "learning_rate": 0.00019763894205636072, + "loss": 0.8254, + "step": 183 + }, + { + "epoch": 0.09813333333333334, + "grad_norm": 0.5668599401104251, + "learning_rate": 0.00019760146735955388, + "loss": 0.845, + "step": 184 + }, + { + "epoch": 0.09866666666666667, + "grad_norm": 0.46800930216492326, + "learning_rate": 0.00019756370120995066, + "loss": 0.7767, + "step": 185 + }, + { + "epoch": 0.0992, + "grad_norm": 0.5142748820218108, + "learning_rate": 0.00019752564372032657, + "loss": 0.7425, + "step": 186 + }, + { + "epoch": 0.09973333333333333, + "grad_norm": 0.4495781555254168, + "learning_rate": 0.000197487295004327, + "loss": 0.7721, + "step": 187 + }, + { + "epoch": 0.10026666666666667, + "grad_norm": 0.4398194024113899, + "learning_rate": 0.00019744865517646706, + "loss": 0.7554, + "step": 188 + }, + { + "epoch": 0.1008, + "grad_norm": 0.46769385969747856, + "learning_rate": 0.00019740972435213115, + "loss": 0.7729, + "step": 189 + }, + { + "epoch": 0.10133333333333333, + "grad_norm": 0.4499968242364586, + "learning_rate": 0.0001973705026475726, + "loss": 0.835, + "step": 190 + }, + { + "epoch": 0.10186666666666666, + "grad_norm": 0.4943486759509741, + "learning_rate": 0.00019733099017991341, + "loss": 0.8232, + "step": 191 + }, + { + "epoch": 0.1024, + "grad_norm": 0.5371693975622313, + "learning_rate": 0.00019729118706714375, + "loss": 0.7985, + "step": 192 + }, + { + "epoch": 0.10293333333333334, + "grad_norm": 0.4479662076166779, + "learning_rate": 0.0001972510934281218, + "loss": 0.7876, + "step": 193 + }, + { + "epoch": 0.10346666666666667, + "grad_norm": 0.4774784162682743, + "learning_rate": 0.00019721070938257324, + "loss": 0.7794, + "step": 194 + }, + { + "epoch": 0.104, + "grad_norm": 0.47837892223465484, + "learning_rate": 0.00019717003505109095, + "loss": 0.7131, + "step": 195 + }, + { + "epoch": 0.10453333333333334, + "grad_norm": 0.5047249526760721, + "learning_rate": 0.0001971290705551347, + "loss": 0.7822, + "step": 196 + }, + { + "epoch": 0.10506666666666667, + "grad_norm": 0.49697405282551316, + "learning_rate": 0.00019708781601703065, + "loss": 0.7668, + "step": 197 + }, + { + "epoch": 0.1056, + "grad_norm": 0.486842951541938, + "learning_rate": 0.00019704627155997108, + "loss": 0.8311, + "step": 198 + }, + { + "epoch": 0.10613333333333333, + "grad_norm": 0.5117164630412789, + "learning_rate": 0.00019700443730801413, + "loss": 0.8394, + "step": 199 + }, + { + "epoch": 0.10666666666666667, + "grad_norm": 0.4430229505434961, + "learning_rate": 0.00019696231338608316, + "loss": 0.8238, + "step": 200 + }, + { + "epoch": 0.1072, + "grad_norm": 0.49889594388044983, + "learning_rate": 0.00019691989991996663, + "loss": 0.8521, + "step": 201 + }, + { + "epoch": 0.10773333333333333, + "grad_norm": 0.4265568951015607, + "learning_rate": 0.00019687719703631755, + "loss": 0.7592, + "step": 202 + }, + { + "epoch": 0.10826666666666666, + "grad_norm": 0.4719272197684277, + "learning_rate": 0.00019683420486265327, + "loss": 0.7673, + "step": 203 + }, + { + "epoch": 0.1088, + "grad_norm": 0.4625187503128492, + "learning_rate": 0.0001967909235273549, + "loss": 0.6884, + "step": 204 + }, + { + "epoch": 0.10933333333333334, + "grad_norm": 0.5439453074539425, + "learning_rate": 0.0001967473531596671, + "loss": 0.7456, + "step": 205 + }, + { + "epoch": 0.10986666666666667, + "grad_norm": 0.5091163294240805, + "learning_rate": 0.0001967034938896976, + "loss": 0.7669, + "step": 206 + }, + { + "epoch": 0.1104, + "grad_norm": 0.48946230838595484, + "learning_rate": 0.00019665934584841682, + "loss": 0.7849, + "step": 207 + }, + { + "epoch": 0.11093333333333333, + "grad_norm": 0.4553021564198512, + "learning_rate": 0.0001966149091676575, + "loss": 0.7738, + "step": 208 + }, + { + "epoch": 0.11146666666666667, + "grad_norm": 0.4795543452934293, + "learning_rate": 0.00019657018398011434, + "loss": 0.7983, + "step": 209 + }, + { + "epoch": 0.112, + "grad_norm": 0.454044266104084, + "learning_rate": 0.00019652517041934356, + "loss": 0.7982, + "step": 210 + }, + { + "epoch": 0.11253333333333333, + "grad_norm": 0.4347171236331447, + "learning_rate": 0.00019647986861976246, + "loss": 0.8419, + "step": 211 + }, + { + "epoch": 0.11306666666666666, + "grad_norm": 0.4052379979410376, + "learning_rate": 0.0001964342787166491, + "loss": 0.7631, + "step": 212 + }, + { + "epoch": 0.1136, + "grad_norm": 0.43569229168908397, + "learning_rate": 0.00019638840084614182, + "loss": 0.8285, + "step": 213 + }, + { + "epoch": 0.11413333333333334, + "grad_norm": 0.4398135539279044, + "learning_rate": 0.0001963422351452389, + "loss": 0.7413, + "step": 214 + }, + { + "epoch": 0.11466666666666667, + "grad_norm": 0.4954383627255014, + "learning_rate": 0.0001962957817517982, + "loss": 0.8204, + "step": 215 + }, + { + "epoch": 0.1152, + "grad_norm": 0.5196432413224092, + "learning_rate": 0.00019624904080453655, + "loss": 0.813, + "step": 216 + }, + { + "epoch": 0.11573333333333333, + "grad_norm": 0.4998146296917399, + "learning_rate": 0.00019620201244302952, + "loss": 0.7468, + "step": 217 + }, + { + "epoch": 0.11626666666666667, + "grad_norm": 0.44380017905542196, + "learning_rate": 0.00019615469680771096, + "loss": 0.759, + "step": 218 + }, + { + "epoch": 0.1168, + "grad_norm": 0.4735241465117807, + "learning_rate": 0.00019610709403987246, + "loss": 0.7539, + "step": 219 + }, + { + "epoch": 0.11733333333333333, + "grad_norm": 0.42614767575223617, + "learning_rate": 0.00019605920428166323, + "loss": 0.7549, + "step": 220 + }, + { + "epoch": 0.11786666666666666, + "grad_norm": 0.4912794884557202, + "learning_rate": 0.00019601102767608923, + "loss": 0.8212, + "step": 221 + }, + { + "epoch": 0.1184, + "grad_norm": 0.40804040740465725, + "learning_rate": 0.00019596256436701324, + "loss": 0.7729, + "step": 222 + }, + { + "epoch": 0.11893333333333334, + "grad_norm": 0.5057435393007068, + "learning_rate": 0.00019591381449915397, + "loss": 0.8356, + "step": 223 + }, + { + "epoch": 0.11946666666666667, + "grad_norm": 0.4450044987136846, + "learning_rate": 0.00019586477821808597, + "loss": 0.722, + "step": 224 + }, + { + "epoch": 0.12, + "grad_norm": 0.40539864012144133, + "learning_rate": 0.000195815455670239, + "loss": 0.7341, + "step": 225 + }, + { + "epoch": 0.12053333333333334, + "grad_norm": 0.42945105211040513, + "learning_rate": 0.00019576584700289768, + "loss": 0.7402, + "step": 226 + }, + { + "epoch": 0.12106666666666667, + "grad_norm": 0.4400399253283012, + "learning_rate": 0.00019571595236420102, + "loss": 0.766, + "step": 227 + }, + { + "epoch": 0.1216, + "grad_norm": 0.47480775379120294, + "learning_rate": 0.00019566577190314197, + "loss": 0.7735, + "step": 228 + }, + { + "epoch": 0.12213333333333333, + "grad_norm": 0.48083624247350093, + "learning_rate": 0.00019561530576956703, + "loss": 0.7437, + "step": 229 + }, + { + "epoch": 0.12266666666666666, + "grad_norm": 0.4527960167439757, + "learning_rate": 0.00019556455411417573, + "loss": 0.7084, + "step": 230 + }, + { + "epoch": 0.1232, + "grad_norm": 0.46587383560167145, + "learning_rate": 0.0001955135170885202, + "loss": 0.781, + "step": 231 + }, + { + "epoch": 0.12373333333333333, + "grad_norm": 0.4359575852677296, + "learning_rate": 0.00019546219484500475, + "loss": 0.7918, + "step": 232 + }, + { + "epoch": 0.12426666666666666, + "grad_norm": 0.4285546310251104, + "learning_rate": 0.00019541058753688538, + "loss": 0.8369, + "step": 233 + }, + { + "epoch": 0.1248, + "grad_norm": 0.5248001655775915, + "learning_rate": 0.00019535869531826937, + "loss": 0.8085, + "step": 234 + }, + { + "epoch": 0.12533333333333332, + "grad_norm": 0.4036128329793708, + "learning_rate": 0.00019530651834411474, + "loss": 0.7202, + "step": 235 + }, + { + "epoch": 0.12586666666666665, + "grad_norm": 0.4388416392979749, + "learning_rate": 0.00019525405677022989, + "loss": 0.7236, + "step": 236 + }, + { + "epoch": 0.1264, + "grad_norm": 0.4057369764587263, + "learning_rate": 0.00019520131075327298, + "loss": 0.7492, + "step": 237 + }, + { + "epoch": 0.12693333333333334, + "grad_norm": 0.43058107831103193, + "learning_rate": 0.0001951482804507517, + "loss": 0.6945, + "step": 238 + }, + { + "epoch": 0.12746666666666667, + "grad_norm": 0.4740706565815538, + "learning_rate": 0.00019509496602102252, + "loss": 0.6996, + "step": 239 + }, + { + "epoch": 0.128, + "grad_norm": 0.48517105441975683, + "learning_rate": 0.00019504136762329047, + "loss": 0.7643, + "step": 240 + }, + { + "epoch": 0.12853333333333333, + "grad_norm": 0.4475993762689496, + "learning_rate": 0.00019498748541760846, + "loss": 0.8288, + "step": 241 + }, + { + "epoch": 0.12906666666666666, + "grad_norm": 0.4660880690247606, + "learning_rate": 0.0001949333195648769, + "loss": 0.81, + "step": 242 + }, + { + "epoch": 0.1296, + "grad_norm": 0.506717111232454, + "learning_rate": 0.00019487887022684336, + "loss": 0.8701, + "step": 243 + }, + { + "epoch": 0.13013333333333332, + "grad_norm": 0.4676798932942207, + "learning_rate": 0.00019482413756610173, + "loss": 0.7892, + "step": 244 + }, + { + "epoch": 0.13066666666666665, + "grad_norm": 0.39802011134696863, + "learning_rate": 0.0001947691217460921, + "loss": 0.7463, + "step": 245 + }, + { + "epoch": 0.1312, + "grad_norm": 0.4025814525496451, + "learning_rate": 0.00019471382293110003, + "loss": 0.7596, + "step": 246 + }, + { + "epoch": 0.13173333333333334, + "grad_norm": 0.4293380673152308, + "learning_rate": 0.00019465824128625617, + "loss": 0.7349, + "step": 247 + }, + { + "epoch": 0.13226666666666667, + "grad_norm": 0.4484093869125715, + "learning_rate": 0.00019460237697753577, + "loss": 0.7896, + "step": 248 + }, + { + "epoch": 0.1328, + "grad_norm": 0.5745472609444757, + "learning_rate": 0.00019454623017175812, + "loss": 0.7758, + "step": 249 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 0.4511690947612965, + "learning_rate": 0.00019448980103658613, + "loss": 0.785, + "step": 250 + }, + { + "epoch": 0.13386666666666666, + "grad_norm": 0.4301389133636024, + "learning_rate": 0.0001944330897405257, + "loss": 0.7118, + "step": 251 + }, + { + "epoch": 0.1344, + "grad_norm": 0.5628558109665334, + "learning_rate": 0.00019437609645292546, + "loss": 0.8757, + "step": 252 + }, + { + "epoch": 0.13493333333333332, + "grad_norm": 0.44825195379467003, + "learning_rate": 0.00019431882134397598, + "loss": 0.7549, + "step": 253 + }, + { + "epoch": 0.13546666666666668, + "grad_norm": 0.4139688294442162, + "learning_rate": 0.00019426126458470936, + "loss": 0.7216, + "step": 254 + }, + { + "epoch": 0.136, + "grad_norm": 0.4914999085759857, + "learning_rate": 0.0001942034263469989, + "loss": 0.7857, + "step": 255 + }, + { + "epoch": 0.13653333333333334, + "grad_norm": 0.45045861846658547, + "learning_rate": 0.00019414530680355837, + "loss": 0.7594, + "step": 256 + }, + { + "epoch": 0.13706666666666667, + "grad_norm": 0.54415976239134, + "learning_rate": 0.00019408690612794148, + "loss": 0.8186, + "step": 257 + }, + { + "epoch": 0.1376, + "grad_norm": 0.44770674423486395, + "learning_rate": 0.00019402822449454153, + "loss": 0.7563, + "step": 258 + }, + { + "epoch": 0.13813333333333333, + "grad_norm": 0.4332439167961714, + "learning_rate": 0.00019396926207859084, + "loss": 0.7765, + "step": 259 + }, + { + "epoch": 0.13866666666666666, + "grad_norm": 0.513080777310441, + "learning_rate": 0.0001939100190561601, + "loss": 0.7722, + "step": 260 + }, + { + "epoch": 0.1392, + "grad_norm": 0.46048250333472324, + "learning_rate": 0.00019385049560415794, + "loss": 0.8457, + "step": 261 + }, + { + "epoch": 0.13973333333333332, + "grad_norm": 0.5409763038472102, + "learning_rate": 0.0001937906919003304, + "loss": 0.7467, + "step": 262 + }, + { + "epoch": 0.14026666666666668, + "grad_norm": 0.5048634432898285, + "learning_rate": 0.00019373060812326052, + "loss": 0.8176, + "step": 263 + }, + { + "epoch": 0.1408, + "grad_norm": 0.40677339313445815, + "learning_rate": 0.00019367024445236754, + "loss": 0.7599, + "step": 264 + }, + { + "epoch": 0.14133333333333334, + "grad_norm": 0.5137234723336886, + "learning_rate": 0.00019360960106790643, + "loss": 0.8256, + "step": 265 + }, + { + "epoch": 0.14186666666666667, + "grad_norm": 0.4955039646766089, + "learning_rate": 0.0001935486781509677, + "loss": 0.7996, + "step": 266 + }, + { + "epoch": 0.1424, + "grad_norm": 0.4549543044910791, + "learning_rate": 0.00019348747588347637, + "loss": 0.7659, + "step": 267 + }, + { + "epoch": 0.14293333333333333, + "grad_norm": 0.514233378986321, + "learning_rate": 0.00019342599444819168, + "loss": 0.8473, + "step": 268 + }, + { + "epoch": 0.14346666666666666, + "grad_norm": 0.4197860898310825, + "learning_rate": 0.00019336423402870653, + "loss": 0.7577, + "step": 269 + }, + { + "epoch": 0.144, + "grad_norm": 0.4655606876532891, + "learning_rate": 0.00019330219480944694, + "loss": 0.7232, + "step": 270 + }, + { + "epoch": 0.14453333333333335, + "grad_norm": 0.5219658661940341, + "learning_rate": 0.0001932398769756714, + "loss": 0.7775, + "step": 271 + }, + { + "epoch": 0.14506666666666668, + "grad_norm": 0.45920050910419263, + "learning_rate": 0.0001931772807134704, + "loss": 0.7591, + "step": 272 + }, + { + "epoch": 0.1456, + "grad_norm": 0.4949167375260932, + "learning_rate": 0.00019311440620976597, + "loss": 0.7895, + "step": 273 + }, + { + "epoch": 0.14613333333333334, + "grad_norm": 0.36550310613011944, + "learning_rate": 0.00019305125365231084, + "loss": 0.7346, + "step": 274 + }, + { + "epoch": 0.14666666666666667, + "grad_norm": 0.40501845811731474, + "learning_rate": 0.00019298782322968815, + "loss": 0.7659, + "step": 275 + }, + { + "epoch": 0.1472, + "grad_norm": 0.4372750459349511, + "learning_rate": 0.0001929241151313108, + "loss": 0.8472, + "step": 276 + }, + { + "epoch": 0.14773333333333333, + "grad_norm": 0.4554561174672684, + "learning_rate": 0.0001928601295474208, + "loss": 0.7362, + "step": 277 + }, + { + "epoch": 0.14826666666666666, + "grad_norm": 0.49697228865217935, + "learning_rate": 0.00019279586666908884, + "loss": 0.7209, + "step": 278 + }, + { + "epoch": 0.1488, + "grad_norm": 0.4043197887249852, + "learning_rate": 0.00019273132668821364, + "loss": 0.7523, + "step": 279 + }, + { + "epoch": 0.14933333333333335, + "grad_norm": 0.48191173115223146, + "learning_rate": 0.00019266650979752136, + "loss": 0.7505, + "step": 280 + }, + { + "epoch": 0.14986666666666668, + "grad_norm": 0.47714079201647, + "learning_rate": 0.00019260141619056507, + "loss": 0.6959, + "step": 281 + }, + { + "epoch": 0.1504, + "grad_norm": 0.47070677791037496, + "learning_rate": 0.00019253604606172417, + "loss": 0.7352, + "step": 282 + }, + { + "epoch": 0.15093333333333334, + "grad_norm": 0.45554044653552067, + "learning_rate": 0.0001924703996062038, + "loss": 0.7379, + "step": 283 + }, + { + "epoch": 0.15146666666666667, + "grad_norm": 0.46584105355548255, + "learning_rate": 0.0001924044770200342, + "loss": 0.7564, + "step": 284 + }, + { + "epoch": 0.152, + "grad_norm": 0.4314749264263858, + "learning_rate": 0.00019233827850007027, + "loss": 0.7059, + "step": 285 + }, + { + "epoch": 0.15253333333333333, + "grad_norm": 0.4695735068859168, + "learning_rate": 0.0001922718042439908, + "loss": 0.7385, + "step": 286 + }, + { + "epoch": 0.15306666666666666, + "grad_norm": 0.5126796493845308, + "learning_rate": 0.000192205054450298, + "loss": 0.7944, + "step": 287 + }, + { + "epoch": 0.1536, + "grad_norm": 0.44631848104777794, + "learning_rate": 0.00019213802931831696, + "loss": 0.8395, + "step": 288 + }, + { + "epoch": 0.15413333333333334, + "grad_norm": 0.41647207251345697, + "learning_rate": 0.00019207072904819486, + "loss": 0.7578, + "step": 289 + }, + { + "epoch": 0.15466666666666667, + "grad_norm": 0.43351509197473287, + "learning_rate": 0.00019200315384090044, + "loss": 0.7547, + "step": 290 + }, + { + "epoch": 0.1552, + "grad_norm": 0.4360669876087587, + "learning_rate": 0.00019193530389822363, + "loss": 0.7135, + "step": 291 + }, + { + "epoch": 0.15573333333333333, + "grad_norm": 0.47969797535202524, + "learning_rate": 0.00019186717942277462, + "loss": 0.7334, + "step": 292 + }, + { + "epoch": 0.15626666666666666, + "grad_norm": 0.5321303633183078, + "learning_rate": 0.00019179878061798347, + "loss": 0.8588, + "step": 293 + }, + { + "epoch": 0.1568, + "grad_norm": 0.4226381310659623, + "learning_rate": 0.00019173010768809933, + "loss": 0.7046, + "step": 294 + }, + { + "epoch": 0.15733333333333333, + "grad_norm": 0.4321522307672889, + "learning_rate": 0.00019166116083819002, + "loss": 0.74, + "step": 295 + }, + { + "epoch": 0.15786666666666666, + "grad_norm": 0.6109275128340287, + "learning_rate": 0.00019159194027414128, + "loss": 0.8307, + "step": 296 + }, + { + "epoch": 0.1584, + "grad_norm": 0.561826732824592, + "learning_rate": 0.0001915224462026563, + "loss": 0.8377, + "step": 297 + }, + { + "epoch": 0.15893333333333334, + "grad_norm": 0.46881849298116435, + "learning_rate": 0.00019145267883125482, + "loss": 0.6666, + "step": 298 + }, + { + "epoch": 0.15946666666666667, + "grad_norm": 0.37386184177399606, + "learning_rate": 0.00019138263836827288, + "loss": 0.7083, + "step": 299 + }, + { + "epoch": 0.16, + "grad_norm": 0.4873371584300598, + "learning_rate": 0.00019131232502286188, + "loss": 0.7806, + "step": 300 + }, + { + "epoch": 0.16053333333333333, + "grad_norm": 0.49176854968781913, + "learning_rate": 0.00019124173900498818, + "loss": 0.7992, + "step": 301 + }, + { + "epoch": 0.16106666666666666, + "grad_norm": 0.4245751108118662, + "learning_rate": 0.00019117088052543233, + "loss": 0.7795, + "step": 302 + }, + { + "epoch": 0.1616, + "grad_norm": 0.4913231560990115, + "learning_rate": 0.0001910997497957885, + "loss": 0.8206, + "step": 303 + }, + { + "epoch": 0.16213333333333332, + "grad_norm": 0.404512219030091, + "learning_rate": 0.00019102834702846387, + "loss": 0.7035, + "step": 304 + }, + { + "epoch": 0.16266666666666665, + "grad_norm": 0.48913247597842824, + "learning_rate": 0.0001909566724366779, + "loss": 0.7747, + "step": 305 + }, + { + "epoch": 0.1632, + "grad_norm": 0.43271391067608156, + "learning_rate": 0.00019088472623446183, + "loss": 0.7488, + "step": 306 + }, + { + "epoch": 0.16373333333333334, + "grad_norm": 0.4969527465185882, + "learning_rate": 0.00019081250863665794, + "loss": 0.762, + "step": 307 + }, + { + "epoch": 0.16426666666666667, + "grad_norm": 0.42931030111391627, + "learning_rate": 0.0001907400198589189, + "loss": 0.7261, + "step": 308 + }, + { + "epoch": 0.1648, + "grad_norm": 0.5069524485557306, + "learning_rate": 0.00019066726011770726, + "loss": 0.7948, + "step": 309 + }, + { + "epoch": 0.16533333333333333, + "grad_norm": 0.4465579286557172, + "learning_rate": 0.00019059422963029464, + "loss": 0.7229, + "step": 310 + }, + { + "epoch": 0.16586666666666666, + "grad_norm": 0.40904039416327287, + "learning_rate": 0.0001905209286147611, + "loss": 0.7192, + "step": 311 + }, + { + "epoch": 0.1664, + "grad_norm": 0.4808812498972212, + "learning_rate": 0.0001904473572899947, + "loss": 0.7615, + "step": 312 + }, + { + "epoch": 0.16693333333333332, + "grad_norm": 0.49575520872474643, + "learning_rate": 0.0001903735158756905, + "loss": 0.7769, + "step": 313 + }, + { + "epoch": 0.16746666666666668, + "grad_norm": 0.4463337518123487, + "learning_rate": 0.0001902994045923502, + "loss": 0.7544, + "step": 314 + }, + { + "epoch": 0.168, + "grad_norm": 0.44638970362973823, + "learning_rate": 0.00019022502366128135, + "loss": 0.7316, + "step": 315 + }, + { + "epoch": 0.16853333333333334, + "grad_norm": 0.43747382893287573, + "learning_rate": 0.0001901503733045967, + "loss": 0.7618, + "step": 316 + }, + { + "epoch": 0.16906666666666667, + "grad_norm": 0.47841798593223983, + "learning_rate": 0.00019007545374521355, + "loss": 0.8292, + "step": 317 + }, + { + "epoch": 0.1696, + "grad_norm": 0.43636901681178347, + "learning_rate": 0.00019000026520685302, + "loss": 0.7256, + "step": 318 + }, + { + "epoch": 0.17013333333333333, + "grad_norm": 0.5180081581480641, + "learning_rate": 0.00018992480791403958, + "loss": 0.7742, + "step": 319 + }, + { + "epoch": 0.17066666666666666, + "grad_norm": 0.5253220855220916, + "learning_rate": 0.0001898490820921001, + "loss": 0.7741, + "step": 320 + }, + { + "epoch": 0.1712, + "grad_norm": 0.4256667849850581, + "learning_rate": 0.0001897730879671634, + "loss": 0.6888, + "step": 321 + }, + { + "epoch": 0.17173333333333332, + "grad_norm": 0.4278391289850968, + "learning_rate": 0.0001896968257661595, + "loss": 0.7634, + "step": 322 + }, + { + "epoch": 0.17226666666666668, + "grad_norm": 0.4436933202303663, + "learning_rate": 0.00018962029571681886, + "loss": 0.7468, + "step": 323 + }, + { + "epoch": 0.1728, + "grad_norm": 0.43321787428379244, + "learning_rate": 0.00018954349804767184, + "loss": 0.8549, + "step": 324 + }, + { + "epoch": 0.17333333333333334, + "grad_norm": 0.4377508413236027, + "learning_rate": 0.00018946643298804793, + "loss": 0.74, + "step": 325 + }, + { + "epoch": 0.17386666666666667, + "grad_norm": 0.3713006997854288, + "learning_rate": 0.00018938910076807513, + "loss": 0.6814, + "step": 326 + }, + { + "epoch": 0.1744, + "grad_norm": 0.4443506208855899, + "learning_rate": 0.00018931150161867916, + "loss": 0.7231, + "step": 327 + }, + { + "epoch": 0.17493333333333333, + "grad_norm": 0.43526163578696675, + "learning_rate": 0.0001892336357715829, + "loss": 0.7713, + "step": 328 + }, + { + "epoch": 0.17546666666666666, + "grad_norm": 0.38246761070996405, + "learning_rate": 0.0001891555034593055, + "loss": 0.7667, + "step": 329 + }, + { + "epoch": 0.176, + "grad_norm": 0.4314246271817471, + "learning_rate": 0.00018907710491516199, + "loss": 0.7029, + "step": 330 + }, + { + "epoch": 0.17653333333333332, + "grad_norm": 0.39923494746519433, + "learning_rate": 0.00018899844037326225, + "loss": 0.7813, + "step": 331 + }, + { + "epoch": 0.17706666666666668, + "grad_norm": 0.4186847398364211, + "learning_rate": 0.0001889195100685106, + "loss": 0.7105, + "step": 332 + }, + { + "epoch": 0.1776, + "grad_norm": 0.47557197889034186, + "learning_rate": 0.0001888403142366049, + "loss": 0.7047, + "step": 333 + }, + { + "epoch": 0.17813333333333334, + "grad_norm": 0.4132150885926361, + "learning_rate": 0.00018876085311403593, + "loss": 0.716, + "step": 334 + }, + { + "epoch": 0.17866666666666667, + "grad_norm": 0.4372379020786047, + "learning_rate": 0.00018868112693808665, + "loss": 0.7624, + "step": 335 + }, + { + "epoch": 0.1792, + "grad_norm": 0.4344170860589622, + "learning_rate": 0.00018860113594683148, + "loss": 0.7308, + "step": 336 + }, + { + "epoch": 0.17973333333333333, + "grad_norm": 0.4963163084048665, + "learning_rate": 0.00018852088037913577, + "loss": 0.7996, + "step": 337 + }, + { + "epoch": 0.18026666666666666, + "grad_norm": 0.4319581176838081, + "learning_rate": 0.0001884403604746547, + "loss": 0.7587, + "step": 338 + }, + { + "epoch": 0.1808, + "grad_norm": 0.40989408741441974, + "learning_rate": 0.00018835957647383303, + "loss": 0.6997, + "step": 339 + }, + { + "epoch": 0.18133333333333335, + "grad_norm": 0.41529156262352523, + "learning_rate": 0.00018827852861790398, + "loss": 0.7342, + "step": 340 + }, + { + "epoch": 0.18186666666666668, + "grad_norm": 0.43645340390430715, + "learning_rate": 0.00018819721714888877, + "loss": 0.7523, + "step": 341 + }, + { + "epoch": 0.1824, + "grad_norm": 0.5428773112788919, + "learning_rate": 0.00018811564230959588, + "loss": 0.8411, + "step": 342 + }, + { + "epoch": 0.18293333333333334, + "grad_norm": 0.4257541414345063, + "learning_rate": 0.00018803380434362, + "loss": 0.6975, + "step": 343 + }, + { + "epoch": 0.18346666666666667, + "grad_norm": 0.4826240533571775, + "learning_rate": 0.0001879517034953418, + "loss": 0.756, + "step": 344 + }, + { + "epoch": 0.184, + "grad_norm": 0.48733773081361126, + "learning_rate": 0.00018786934000992688, + "loss": 0.7816, + "step": 345 + }, + { + "epoch": 0.18453333333333333, + "grad_norm": 0.4532479494685654, + "learning_rate": 0.00018778671413332513, + "loss": 0.778, + "step": 346 + }, + { + "epoch": 0.18506666666666666, + "grad_norm": 0.4262543331458245, + "learning_rate": 0.00018770382611226987, + "loss": 0.7006, + "step": 347 + }, + { + "epoch": 0.1856, + "grad_norm": 0.4563004023727352, + "learning_rate": 0.00018762067619427746, + "loss": 0.7583, + "step": 348 + }, + { + "epoch": 0.18613333333333335, + "grad_norm": 0.4625188794998478, + "learning_rate": 0.000187537264627646, + "loss": 0.8411, + "step": 349 + }, + { + "epoch": 0.18666666666666668, + "grad_norm": 0.5536645615515583, + "learning_rate": 0.00018745359166145523, + "loss": 0.7795, + "step": 350 + }, + { + "epoch": 0.1872, + "grad_norm": 0.43196130501488406, + "learning_rate": 0.00018736965754556528, + "loss": 0.8198, + "step": 351 + }, + { + "epoch": 0.18773333333333334, + "grad_norm": 0.42642820835218254, + "learning_rate": 0.00018728546253061614, + "loss": 0.7555, + "step": 352 + }, + { + "epoch": 0.18826666666666667, + "grad_norm": 0.4647214261040618, + "learning_rate": 0.00018720100686802694, + "loss": 0.7525, + "step": 353 + }, + { + "epoch": 0.1888, + "grad_norm": 0.5125122683502902, + "learning_rate": 0.00018711629080999504, + "loss": 0.8201, + "step": 354 + }, + { + "epoch": 0.18933333333333333, + "grad_norm": 0.4330139678839322, + "learning_rate": 0.00018703131460949554, + "loss": 0.7307, + "step": 355 + }, + { + "epoch": 0.18986666666666666, + "grad_norm": 0.4824467437038003, + "learning_rate": 0.0001869460785202802, + "loss": 0.8324, + "step": 356 + }, + { + "epoch": 0.1904, + "grad_norm": 0.533861956777296, + "learning_rate": 0.00018686058279687698, + "loss": 0.8308, + "step": 357 + }, + { + "epoch": 0.19093333333333334, + "grad_norm": 0.4060688080818584, + "learning_rate": 0.00018677482769458904, + "loss": 0.8042, + "step": 358 + }, + { + "epoch": 0.19146666666666667, + "grad_norm": 0.4216719398946027, + "learning_rate": 0.00018668881346949417, + "loss": 0.7504, + "step": 359 + }, + { + "epoch": 0.192, + "grad_norm": 0.4027708900568809, + "learning_rate": 0.00018660254037844388, + "loss": 0.7287, + "step": 360 + }, + { + "epoch": 0.19253333333333333, + "grad_norm": 0.444151307821528, + "learning_rate": 0.00018651600867906272, + "loss": 0.7606, + "step": 361 + }, + { + "epoch": 0.19306666666666666, + "grad_norm": 0.471332668323808, + "learning_rate": 0.00018642921862974742, + "loss": 0.782, + "step": 362 + }, + { + "epoch": 0.1936, + "grad_norm": 0.3791440308583429, + "learning_rate": 0.00018634217048966637, + "loss": 0.72, + "step": 363 + }, + { + "epoch": 0.19413333333333332, + "grad_norm": 0.49731759646196794, + "learning_rate": 0.00018625486451875843, + "loss": 0.7959, + "step": 364 + }, + { + "epoch": 0.19466666666666665, + "grad_norm": 0.46620678123962395, + "learning_rate": 0.0001861673009777325, + "loss": 0.7923, + "step": 365 + }, + { + "epoch": 0.1952, + "grad_norm": 0.41068869026604143, + "learning_rate": 0.0001860794801280666, + "loss": 0.7252, + "step": 366 + }, + { + "epoch": 0.19573333333333334, + "grad_norm": 0.3776789165535236, + "learning_rate": 0.00018599140223200716, + "loss": 0.7547, + "step": 367 + }, + { + "epoch": 0.19626666666666667, + "grad_norm": 0.5736993325001151, + "learning_rate": 0.0001859030675525681, + "loss": 0.8784, + "step": 368 + }, + { + "epoch": 0.1968, + "grad_norm": 0.41020244284924917, + "learning_rate": 0.0001858144763535302, + "loss": 0.7154, + "step": 369 + }, + { + "epoch": 0.19733333333333333, + "grad_norm": 0.5054142932121306, + "learning_rate": 0.0001857256288994402, + "loss": 0.7587, + "step": 370 + }, + { + "epoch": 0.19786666666666666, + "grad_norm": 0.39883694234628286, + "learning_rate": 0.00018563652545561013, + "loss": 0.7256, + "step": 371 + }, + { + "epoch": 0.1984, + "grad_norm": 0.43049400761772333, + "learning_rate": 0.0001855471662881164, + "loss": 0.7944, + "step": 372 + }, + { + "epoch": 0.19893333333333332, + "grad_norm": 0.4720332451741232, + "learning_rate": 0.000185457551663799, + "loss": 0.8115, + "step": 373 + }, + { + "epoch": 0.19946666666666665, + "grad_norm": 0.5270018673238716, + "learning_rate": 0.00018536768185026083, + "loss": 0.7891, + "step": 374 + }, + { + "epoch": 0.2, + "grad_norm": 0.47994775911430304, + "learning_rate": 0.00018527755711586678, + "loss": 0.771, + "step": 375 + }, + { + "epoch": 0.20053333333333334, + "grad_norm": 0.4445426182391953, + "learning_rate": 0.00018518717772974302, + "loss": 0.7426, + "step": 376 + }, + { + "epoch": 0.20106666666666667, + "grad_norm": 0.4810788695239798, + "learning_rate": 0.00018509654396177609, + "loss": 0.7136, + "step": 377 + }, + { + "epoch": 0.2016, + "grad_norm": 0.468624310902255, + "learning_rate": 0.00018500565608261214, + "loss": 0.7427, + "step": 378 + }, + { + "epoch": 0.20213333333333333, + "grad_norm": 0.4051085347747549, + "learning_rate": 0.00018491451436365627, + "loss": 0.7665, + "step": 379 + }, + { + "epoch": 0.20266666666666666, + "grad_norm": 0.4429745586090362, + "learning_rate": 0.0001848231190770714, + "loss": 0.7632, + "step": 380 + }, + { + "epoch": 0.2032, + "grad_norm": 0.4243678167327291, + "learning_rate": 0.00018473147049577774, + "loss": 0.7301, + "step": 381 + }, + { + "epoch": 0.20373333333333332, + "grad_norm": 0.4802588045036027, + "learning_rate": 0.00018463956889345194, + "loss": 0.7024, + "step": 382 + }, + { + "epoch": 0.20426666666666668, + "grad_norm": 0.44080613228848897, + "learning_rate": 0.00018454741454452603, + "loss": 0.6883, + "step": 383 + }, + { + "epoch": 0.2048, + "grad_norm": 0.6211221715459578, + "learning_rate": 0.00018445500772418697, + "loss": 0.7807, + "step": 384 + }, + { + "epoch": 0.20533333333333334, + "grad_norm": 0.42586430306449, + "learning_rate": 0.00018436234870837547, + "loss": 0.75, + "step": 385 + }, + { + "epoch": 0.20586666666666667, + "grad_norm": 0.40107312109244403, + "learning_rate": 0.00018426943777378552, + "loss": 0.695, + "step": 386 + }, + { + "epoch": 0.2064, + "grad_norm": 0.442626202174717, + "learning_rate": 0.00018417627519786315, + "loss": 0.7851, + "step": 387 + }, + { + "epoch": 0.20693333333333333, + "grad_norm": 0.4184883405956326, + "learning_rate": 0.00018408286125880604, + "loss": 0.7674, + "step": 388 + }, + { + "epoch": 0.20746666666666666, + "grad_norm": 0.366713571829327, + "learning_rate": 0.00018398919623556238, + "loss": 0.689, + "step": 389 + }, + { + "epoch": 0.208, + "grad_norm": 0.4414073292564949, + "learning_rate": 0.00018389528040783012, + "loss": 0.7457, + "step": 390 + }, + { + "epoch": 0.20853333333333332, + "grad_norm": 0.3942549677792474, + "learning_rate": 0.0001838011140560562, + "loss": 0.6964, + "step": 391 + }, + { + "epoch": 0.20906666666666668, + "grad_norm": 0.41503269097137613, + "learning_rate": 0.00018370669746143564, + "loss": 0.7135, + "step": 392 + }, + { + "epoch": 0.2096, + "grad_norm": 0.5245371258298765, + "learning_rate": 0.00018361203090591071, + "loss": 0.7034, + "step": 393 + }, + { + "epoch": 0.21013333333333334, + "grad_norm": 0.4594749181909736, + "learning_rate": 0.0001835171146721701, + "loss": 0.7498, + "step": 394 + }, + { + "epoch": 0.21066666666666667, + "grad_norm": 0.424235144677964, + "learning_rate": 0.00018342194904364813, + "loss": 0.7235, + "step": 395 + }, + { + "epoch": 0.2112, + "grad_norm": 0.44602704702768614, + "learning_rate": 0.00018332653430452376, + "loss": 0.7556, + "step": 396 + }, + { + "epoch": 0.21173333333333333, + "grad_norm": 0.5232426706389673, + "learning_rate": 0.00018323087073971993, + "loss": 0.8369, + "step": 397 + }, + { + "epoch": 0.21226666666666666, + "grad_norm": 0.41560340485086095, + "learning_rate": 0.00018313495863490258, + "loss": 0.711, + "step": 398 + }, + { + "epoch": 0.2128, + "grad_norm": 0.3944540115776522, + "learning_rate": 0.00018303879827647975, + "loss": 0.6959, + "step": 399 + }, + { + "epoch": 0.21333333333333335, + "grad_norm": 0.4491732511440907, + "learning_rate": 0.00018294238995160094, + "loss": 0.7476, + "step": 400 + }, + { + "epoch": 0.21386666666666668, + "grad_norm": 0.3921587418990816, + "learning_rate": 0.00018284573394815597, + "loss": 0.7307, + "step": 401 + }, + { + "epoch": 0.2144, + "grad_norm": 0.486831333616804, + "learning_rate": 0.00018274883055477436, + "loss": 0.7517, + "step": 402 + }, + { + "epoch": 0.21493333333333334, + "grad_norm": 0.4010907138133327, + "learning_rate": 0.00018265168006082437, + "loss": 0.6992, + "step": 403 + }, + { + "epoch": 0.21546666666666667, + "grad_norm": 0.411977049690829, + "learning_rate": 0.00018255428275641214, + "loss": 0.7785, + "step": 404 + }, + { + "epoch": 0.216, + "grad_norm": 0.4241612115568198, + "learning_rate": 0.00018245663893238075, + "loss": 0.7824, + "step": 405 + }, + { + "epoch": 0.21653333333333333, + "grad_norm": 0.4386435931679396, + "learning_rate": 0.0001823587488803095, + "loss": 0.7548, + "step": 406 + }, + { + "epoch": 0.21706666666666666, + "grad_norm": 0.5516298765589959, + "learning_rate": 0.00018226061289251298, + "loss": 0.7903, + "step": 407 + }, + { + "epoch": 0.2176, + "grad_norm": 0.5237223995446475, + "learning_rate": 0.00018216223126204007, + "loss": 0.6774, + "step": 408 + }, + { + "epoch": 0.21813333333333335, + "grad_norm": 0.408081916675791, + "learning_rate": 0.00018206360428267332, + "loss": 0.8071, + "step": 409 + }, + { + "epoch": 0.21866666666666668, + "grad_norm": 0.4140323106783898, + "learning_rate": 0.00018196473224892784, + "loss": 0.7333, + "step": 410 + }, + { + "epoch": 0.2192, + "grad_norm": 0.4109352116539475, + "learning_rate": 0.00018186561545605054, + "loss": 0.7517, + "step": 411 + }, + { + "epoch": 0.21973333333333334, + "grad_norm": 0.45203015421066733, + "learning_rate": 0.0001817662542000192, + "loss": 0.75, + "step": 412 + }, + { + "epoch": 0.22026666666666667, + "grad_norm": 0.47986137265508316, + "learning_rate": 0.0001816666487775416, + "loss": 0.8174, + "step": 413 + }, + { + "epoch": 0.2208, + "grad_norm": 0.47361437236813453, + "learning_rate": 0.00018156679948605467, + "loss": 0.8038, + "step": 414 + }, + { + "epoch": 0.22133333333333333, + "grad_norm": 0.38095316941155105, + "learning_rate": 0.00018146670662372354, + "loss": 0.7278, + "step": 415 + }, + { + "epoch": 0.22186666666666666, + "grad_norm": 0.42604931965231435, + "learning_rate": 0.0001813663704894407, + "loss": 0.7002, + "step": 416 + }, + { + "epoch": 0.2224, + "grad_norm": 0.548002801499899, + "learning_rate": 0.00018126579138282503, + "loss": 0.7803, + "step": 417 + }, + { + "epoch": 0.22293333333333334, + "grad_norm": 0.5019579540403647, + "learning_rate": 0.00018116496960422107, + "loss": 0.8137, + "step": 418 + }, + { + "epoch": 0.22346666666666667, + "grad_norm": 0.46495451500772306, + "learning_rate": 0.00018106390545469795, + "loss": 0.7639, + "step": 419 + }, + { + "epoch": 0.224, + "grad_norm": 0.42686783185227567, + "learning_rate": 0.0001809625992360485, + "loss": 0.764, + "step": 420 + }, + { + "epoch": 0.22453333333333333, + "grad_norm": 0.4779166686139942, + "learning_rate": 0.00018086105125078857, + "loss": 0.7343, + "step": 421 + }, + { + "epoch": 0.22506666666666666, + "grad_norm": 0.4977038804757904, + "learning_rate": 0.00018075926180215576, + "loss": 0.7508, + "step": 422 + }, + { + "epoch": 0.2256, + "grad_norm": 0.4172342695672868, + "learning_rate": 0.00018065723119410884, + "loss": 0.7518, + "step": 423 + }, + { + "epoch": 0.22613333333333333, + "grad_norm": 0.4395016138892947, + "learning_rate": 0.0001805549597313267, + "loss": 0.7764, + "step": 424 + }, + { + "epoch": 0.22666666666666666, + "grad_norm": 0.45266053765333597, + "learning_rate": 0.0001804524477192075, + "loss": 0.799, + "step": 425 + }, + { + "epoch": 0.2272, + "grad_norm": 0.481917880967969, + "learning_rate": 0.00018034969546386757, + "loss": 0.7135, + "step": 426 + }, + { + "epoch": 0.22773333333333334, + "grad_norm": 0.44519020465293857, + "learning_rate": 0.00018024670327214084, + "loss": 0.8084, + "step": 427 + }, + { + "epoch": 0.22826666666666667, + "grad_norm": 0.5670567897416601, + "learning_rate": 0.00018014347145157755, + "loss": 0.7555, + "step": 428 + }, + { + "epoch": 0.2288, + "grad_norm": 0.4362383554539751, + "learning_rate": 0.0001800400003104436, + "loss": 0.7766, + "step": 429 + }, + { + "epoch": 0.22933333333333333, + "grad_norm": 0.46855866315783895, + "learning_rate": 0.0001799362901577196, + "loss": 0.7916, + "step": 430 + }, + { + "epoch": 0.22986666666666666, + "grad_norm": 0.3999572615788725, + "learning_rate": 0.00017983234130309968, + "loss": 0.7275, + "step": 431 + }, + { + "epoch": 0.2304, + "grad_norm": 0.4201746754962815, + "learning_rate": 0.00017972815405699103, + "loss": 0.6857, + "step": 432 + }, + { + "epoch": 0.23093333333333332, + "grad_norm": 0.4306577908340154, + "learning_rate": 0.00017962372873051252, + "loss": 0.8254, + "step": 433 + }, + { + "epoch": 0.23146666666666665, + "grad_norm": 0.454467704188289, + "learning_rate": 0.00017951906563549397, + "loss": 0.6947, + "step": 434 + }, + { + "epoch": 0.232, + "grad_norm": 0.43617015792846464, + "learning_rate": 0.00017941416508447536, + "loss": 0.6798, + "step": 435 + }, + { + "epoch": 0.23253333333333334, + "grad_norm": 0.46513566341592816, + "learning_rate": 0.00017930902739070562, + "loss": 0.7325, + "step": 436 + }, + { + "epoch": 0.23306666666666667, + "grad_norm": 0.5209007191479196, + "learning_rate": 0.00017920365286814183, + "loss": 0.7075, + "step": 437 + }, + { + "epoch": 0.2336, + "grad_norm": 0.46186549657234066, + "learning_rate": 0.0001790980418314484, + "loss": 0.7843, + "step": 438 + }, + { + "epoch": 0.23413333333333333, + "grad_norm": 0.4225623013491972, + "learning_rate": 0.0001789921945959958, + "loss": 0.7213, + "step": 439 + }, + { + "epoch": 0.23466666666666666, + "grad_norm": 0.5545250438765869, + "learning_rate": 0.00017888611147786002, + "loss": 0.7348, + "step": 440 + }, + { + "epoch": 0.2352, + "grad_norm": 0.46159909414157596, + "learning_rate": 0.00017877979279382135, + "loss": 0.7769, + "step": 441 + }, + { + "epoch": 0.23573333333333332, + "grad_norm": 0.4608690961702417, + "learning_rate": 0.00017867323886136348, + "loss": 0.7723, + "step": 442 + }, + { + "epoch": 0.23626666666666668, + "grad_norm": 0.4836076943876157, + "learning_rate": 0.00017856644999867264, + "loss": 0.7675, + "step": 443 + }, + { + "epoch": 0.2368, + "grad_norm": 0.4822012598687289, + "learning_rate": 0.0001784594265246366, + "loss": 0.7657, + "step": 444 + }, + { + "epoch": 0.23733333333333334, + "grad_norm": 0.39066832115403344, + "learning_rate": 0.00017835216875884368, + "loss": 0.7076, + "step": 445 + }, + { + "epoch": 0.23786666666666667, + "grad_norm": 0.43554059934563555, + "learning_rate": 0.0001782446770215819, + "loss": 0.7195, + "step": 446 + }, + { + "epoch": 0.2384, + "grad_norm": 0.5721257573663748, + "learning_rate": 0.0001781369516338378, + "loss": 0.7613, + "step": 447 + }, + { + "epoch": 0.23893333333333333, + "grad_norm": 0.4723085206785278, + "learning_rate": 0.00017802899291729585, + "loss": 0.7581, + "step": 448 + }, + { + "epoch": 0.23946666666666666, + "grad_norm": 0.4497789103319603, + "learning_rate": 0.0001779208011943371, + "loss": 0.7162, + "step": 449 + }, + { + "epoch": 0.24, + "grad_norm": 0.4089031649487846, + "learning_rate": 0.00017781237678803847, + "loss": 0.7948, + "step": 450 + }, + { + "epoch": 0.24053333333333332, + "grad_norm": 0.48799499911480854, + "learning_rate": 0.00017770372002217172, + "loss": 0.7798, + "step": 451 + }, + { + "epoch": 0.24106666666666668, + "grad_norm": 0.4518393471897176, + "learning_rate": 0.00017759483122120238, + "loss": 0.7332, + "step": 452 + }, + { + "epoch": 0.2416, + "grad_norm": 0.4527696046131407, + "learning_rate": 0.000177485710710289, + "loss": 0.7543, + "step": 453 + }, + { + "epoch": 0.24213333333333334, + "grad_norm": 0.5013859458368709, + "learning_rate": 0.00017737635881528196, + "loss": 0.7268, + "step": 454 + }, + { + "epoch": 0.24266666666666667, + "grad_norm": 0.46679378591767084, + "learning_rate": 0.00017726677586272263, + "loss": 0.7851, + "step": 455 + }, + { + "epoch": 0.2432, + "grad_norm": 0.5336235970260841, + "learning_rate": 0.00017715696217984235, + "loss": 0.7894, + "step": 456 + }, + { + "epoch": 0.24373333333333333, + "grad_norm": 0.4883559010169363, + "learning_rate": 0.00017704691809456143, + "loss": 0.7159, + "step": 457 + }, + { + "epoch": 0.24426666666666666, + "grad_norm": 0.4820215129604109, + "learning_rate": 0.0001769366439354882, + "loss": 0.7079, + "step": 458 + }, + { + "epoch": 0.2448, + "grad_norm": 0.39368616328155626, + "learning_rate": 0.00017682614003191807, + "loss": 0.7186, + "step": 459 + }, + { + "epoch": 0.24533333333333332, + "grad_norm": 0.4364515183793448, + "learning_rate": 0.00017671540671383243, + "loss": 0.7017, + "step": 460 + }, + { + "epoch": 0.24586666666666668, + "grad_norm": 0.43274092085542504, + "learning_rate": 0.0001766044443118978, + "loss": 0.7559, + "step": 461 + }, + { + "epoch": 0.2464, + "grad_norm": 0.48647218982442203, + "learning_rate": 0.00017649325315746478, + "loss": 0.7824, + "step": 462 + }, + { + "epoch": 0.24693333333333334, + "grad_norm": 0.41861598085108487, + "learning_rate": 0.00017638183358256696, + "loss": 0.7443, + "step": 463 + }, + { + "epoch": 0.24746666666666667, + "grad_norm": 0.4062758300109946, + "learning_rate": 0.00017627018591992018, + "loss": 0.7695, + "step": 464 + }, + { + "epoch": 0.248, + "grad_norm": 0.43903042467251013, + "learning_rate": 0.0001761583105029213, + "loss": 0.7342, + "step": 465 + }, + { + "epoch": 0.24853333333333333, + "grad_norm": 0.43820479738922974, + "learning_rate": 0.00017604620766564723, + "loss": 0.7055, + "step": 466 + }, + { + "epoch": 0.24906666666666666, + "grad_norm": 0.4307447215835221, + "learning_rate": 0.00017593387774285412, + "loss": 0.7517, + "step": 467 + }, + { + "epoch": 0.2496, + "grad_norm": 0.4839227716076366, + "learning_rate": 0.00017582132106997616, + "loss": 0.7339, + "step": 468 + }, + { + "epoch": 0.2501333333333333, + "grad_norm": 0.41984870190651, + "learning_rate": 0.0001757085379831246, + "loss": 0.7486, + "step": 469 + }, + { + "epoch": 0.25066666666666665, + "grad_norm": 0.4801629623764257, + "learning_rate": 0.00017559552881908695, + "loss": 0.7392, + "step": 470 + }, + { + "epoch": 0.2512, + "grad_norm": 0.48309462607886405, + "learning_rate": 0.00017548229391532572, + "loss": 0.7754, + "step": 471 + }, + { + "epoch": 0.2517333333333333, + "grad_norm": 0.46557741123122626, + "learning_rate": 0.00017536883360997743, + "loss": 0.733, + "step": 472 + }, + { + "epoch": 0.25226666666666664, + "grad_norm": 0.45685923269477885, + "learning_rate": 0.00017525514824185185, + "loss": 0.7812, + "step": 473 + }, + { + "epoch": 0.2528, + "grad_norm": 0.4430033811591412, + "learning_rate": 0.00017514123815043074, + "loss": 0.7801, + "step": 474 + }, + { + "epoch": 0.25333333333333335, + "grad_norm": 0.44962046020192703, + "learning_rate": 0.00017502710367586687, + "loss": 0.769, + "step": 475 + }, + { + "epoch": 0.2538666666666667, + "grad_norm": 0.47108421202389944, + "learning_rate": 0.0001749127451589832, + "loss": 0.8501, + "step": 476 + }, + { + "epoch": 0.2544, + "grad_norm": 0.41875727406366, + "learning_rate": 0.00017479816294127152, + "loss": 0.7784, + "step": 477 + }, + { + "epoch": 0.25493333333333335, + "grad_norm": 0.42190855514124825, + "learning_rate": 0.00017468335736489177, + "loss": 0.7209, + "step": 478 + }, + { + "epoch": 0.2554666666666667, + "grad_norm": 0.38505377328740736, + "learning_rate": 0.00017456832877267084, + "loss": 0.7375, + "step": 479 + }, + { + "epoch": 0.256, + "grad_norm": 0.4317908545722597, + "learning_rate": 0.0001744530775081015, + "loss": 0.7465, + "step": 480 + }, + { + "epoch": 0.25653333333333334, + "grad_norm": 0.4275197743103051, + "learning_rate": 0.00017433760391534167, + "loss": 0.717, + "step": 481 + }, + { + "epoch": 0.25706666666666667, + "grad_norm": 0.42162843207096956, + "learning_rate": 0.00017422190833921283, + "loss": 0.7574, + "step": 482 + }, + { + "epoch": 0.2576, + "grad_norm": 0.42833408855858984, + "learning_rate": 0.0001741059911251997, + "loss": 0.7336, + "step": 483 + }, + { + "epoch": 0.2581333333333333, + "grad_norm": 0.45819142099768806, + "learning_rate": 0.00017398985261944856, + "loss": 0.7822, + "step": 484 + }, + { + "epoch": 0.25866666666666666, + "grad_norm": 0.3873481884796177, + "learning_rate": 0.00017387349316876666, + "loss": 0.7176, + "step": 485 + }, + { + "epoch": 0.2592, + "grad_norm": 0.40982893189153785, + "learning_rate": 0.000173756913120621, + "loss": 0.7623, + "step": 486 + }, + { + "epoch": 0.2597333333333333, + "grad_norm": 0.44379926116672774, + "learning_rate": 0.0001736401128231373, + "loss": 0.7311, + "step": 487 + }, + { + "epoch": 0.26026666666666665, + "grad_norm": 0.40282092066670894, + "learning_rate": 0.00017352309262509894, + "loss": 0.7126, + "step": 488 + }, + { + "epoch": 0.2608, + "grad_norm": 0.47431162991111714, + "learning_rate": 0.00017340585287594604, + "loss": 0.6946, + "step": 489 + }, + { + "epoch": 0.2613333333333333, + "grad_norm": 0.376782750625978, + "learning_rate": 0.0001732883939257742, + "loss": 0.7209, + "step": 490 + }, + { + "epoch": 0.2618666666666667, + "grad_norm": 0.47210312905548446, + "learning_rate": 0.0001731707161253338, + "loss": 0.707, + "step": 491 + }, + { + "epoch": 0.2624, + "grad_norm": 0.46458293227031405, + "learning_rate": 0.0001730528198260285, + "loss": 0.7672, + "step": 492 + }, + { + "epoch": 0.26293333333333335, + "grad_norm": 0.4724367145422656, + "learning_rate": 0.00017293470537991463, + "loss": 0.6756, + "step": 493 + }, + { + "epoch": 0.2634666666666667, + "grad_norm": 0.4654749494501859, + "learning_rate": 0.00017281637313969978, + "loss": 0.8111, + "step": 494 + }, + { + "epoch": 0.264, + "grad_norm": 0.3983425392960808, + "learning_rate": 0.00017269782345874203, + "loss": 0.7044, + "step": 495 + }, + { + "epoch": 0.26453333333333334, + "grad_norm": 0.4365895782245567, + "learning_rate": 0.00017257905669104874, + "loss": 0.7704, + "step": 496 + }, + { + "epoch": 0.2650666666666667, + "grad_norm": 0.3947808943169786, + "learning_rate": 0.00017246007319127545, + "loss": 0.7059, + "step": 497 + }, + { + "epoch": 0.2656, + "grad_norm": 0.4128950602547397, + "learning_rate": 0.00017234087331472497, + "loss": 0.7443, + "step": 498 + }, + { + "epoch": 0.26613333333333333, + "grad_norm": 0.44306591083777735, + "learning_rate": 0.00017222145741734626, + "loss": 0.7221, + "step": 499 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 0.4007250119444036, + "learning_rate": 0.00017210182585573327, + "loss": 0.6837, + "step": 500 + }, + { + "epoch": 0.2672, + "grad_norm": 0.39105298298237556, + "learning_rate": 0.00017198197898712404, + "loss": 0.7669, + "step": 501 + }, + { + "epoch": 0.2677333333333333, + "grad_norm": 0.41542512964154166, + "learning_rate": 0.00017186191716939944, + "loss": 0.7377, + "step": 502 + }, + { + "epoch": 0.26826666666666665, + "grad_norm": 0.44365767746746254, + "learning_rate": 0.0001717416407610824, + "loss": 0.7283, + "step": 503 + }, + { + "epoch": 0.2688, + "grad_norm": 0.4359421690092708, + "learning_rate": 0.00017162115012133643, + "loss": 0.731, + "step": 504 + }, + { + "epoch": 0.2693333333333333, + "grad_norm": 0.3659002525973654, + "learning_rate": 0.00017150044560996488, + "loss": 0.6292, + "step": 505 + }, + { + "epoch": 0.26986666666666664, + "grad_norm": 0.38055418483876924, + "learning_rate": 0.00017137952758740978, + "loss": 0.6775, + "step": 506 + }, + { + "epoch": 0.2704, + "grad_norm": 0.4944794673497881, + "learning_rate": 0.00017125839641475072, + "loss": 0.7685, + "step": 507 + }, + { + "epoch": 0.27093333333333336, + "grad_norm": 0.4472600645121646, + "learning_rate": 0.00017113705245370368, + "loss": 0.7023, + "step": 508 + }, + { + "epoch": 0.2714666666666667, + "grad_norm": 0.42356889474137294, + "learning_rate": 0.00017101549606662024, + "loss": 0.6953, + "step": 509 + }, + { + "epoch": 0.272, + "grad_norm": 0.4167344884222982, + "learning_rate": 0.00017089372761648616, + "loss": 0.7547, + "step": 510 + }, + { + "epoch": 0.27253333333333335, + "grad_norm": 0.44603009027545537, + "learning_rate": 0.00017077174746692056, + "loss": 0.7221, + "step": 511 + }, + { + "epoch": 0.2730666666666667, + "grad_norm": 0.3768434984379742, + "learning_rate": 0.00017064955598217462, + "loss": 0.6761, + "step": 512 + }, + { + "epoch": 0.2736, + "grad_norm": 0.47799008750919264, + "learning_rate": 0.00017052715352713075, + "loss": 0.7552, + "step": 513 + }, + { + "epoch": 0.27413333333333334, + "grad_norm": 0.3875051521127407, + "learning_rate": 0.00017040454046730115, + "loss": 0.6764, + "step": 514 + }, + { + "epoch": 0.27466666666666667, + "grad_norm": 0.41168483856721627, + "learning_rate": 0.00017028171716882714, + "loss": 0.7037, + "step": 515 + }, + { + "epoch": 0.2752, + "grad_norm": 0.45857767924392456, + "learning_rate": 0.00017015868399847768, + "loss": 0.6801, + "step": 516 + }, + { + "epoch": 0.27573333333333333, + "grad_norm": 0.45600457677670214, + "learning_rate": 0.00017003544132364846, + "loss": 0.7448, + "step": 517 + }, + { + "epoch": 0.27626666666666666, + "grad_norm": 0.40262795749401514, + "learning_rate": 0.00016991198951236088, + "loss": 0.7197, + "step": 518 + }, + { + "epoch": 0.2768, + "grad_norm": 0.4756476115326436, + "learning_rate": 0.00016978832893326074, + "loss": 0.7534, + "step": 519 + }, + { + "epoch": 0.2773333333333333, + "grad_norm": 0.40655573524205474, + "learning_rate": 0.00016966445995561727, + "loss": 0.7545, + "step": 520 + }, + { + "epoch": 0.27786666666666665, + "grad_norm": 0.5962571163332399, + "learning_rate": 0.00016954038294932216, + "loss": 0.882, + "step": 521 + }, + { + "epoch": 0.2784, + "grad_norm": 0.4236164890146461, + "learning_rate": 0.00016941609828488807, + "loss": 0.7514, + "step": 522 + }, + { + "epoch": 0.2789333333333333, + "grad_norm": 0.4191905394956668, + "learning_rate": 0.0001692916063334479, + "loss": 0.6694, + "step": 523 + }, + { + "epoch": 0.27946666666666664, + "grad_norm": 0.40519689301537903, + "learning_rate": 0.0001691669074667535, + "loss": 0.7249, + "step": 524 + }, + { + "epoch": 0.28, + "grad_norm": 0.3992172252324664, + "learning_rate": 0.0001690420020571747, + "loss": 0.7125, + "step": 525 + }, + { + "epoch": 0.28053333333333336, + "grad_norm": 0.45261090349463706, + "learning_rate": 0.0001689168904776979, + "loss": 0.7734, + "step": 526 + }, + { + "epoch": 0.2810666666666667, + "grad_norm": 0.5631326151894799, + "learning_rate": 0.00016879157310192535, + "loss": 0.7948, + "step": 527 + }, + { + "epoch": 0.2816, + "grad_norm": 0.4004930942854674, + "learning_rate": 0.0001686660503040737, + "loss": 0.6992, + "step": 528 + }, + { + "epoch": 0.28213333333333335, + "grad_norm": 0.43682380042413677, + "learning_rate": 0.00016854032245897308, + "loss": 0.7585, + "step": 529 + }, + { + "epoch": 0.2826666666666667, + "grad_norm": 0.46233555406996935, + "learning_rate": 0.00016841438994206595, + "loss": 0.6966, + "step": 530 + }, + { + "epoch": 0.2832, + "grad_norm": 0.4054285615706168, + "learning_rate": 0.00016828825312940592, + "loss": 0.7185, + "step": 531 + }, + { + "epoch": 0.28373333333333334, + "grad_norm": 0.3673169325312881, + "learning_rate": 0.00016816191239765667, + "loss": 0.6838, + "step": 532 + }, + { + "epoch": 0.28426666666666667, + "grad_norm": 0.5248117352974394, + "learning_rate": 0.00016803536812409075, + "loss": 0.7785, + "step": 533 + }, + { + "epoch": 0.2848, + "grad_norm": 0.39602694225476154, + "learning_rate": 0.0001679086206865886, + "loss": 0.6082, + "step": 534 + }, + { + "epoch": 0.2853333333333333, + "grad_norm": 0.5133725626357958, + "learning_rate": 0.00016778167046363734, + "loss": 0.7176, + "step": 535 + }, + { + "epoch": 0.28586666666666666, + "grad_norm": 0.44499807570944844, + "learning_rate": 0.00016765451783432953, + "loss": 0.7449, + "step": 536 + }, + { + "epoch": 0.2864, + "grad_norm": 0.4630238503557464, + "learning_rate": 0.00016752716317836229, + "loss": 0.7421, + "step": 537 + }, + { + "epoch": 0.2869333333333333, + "grad_norm": 0.4854533678419448, + "learning_rate": 0.0001673996068760359, + "loss": 0.7793, + "step": 538 + }, + { + "epoch": 0.28746666666666665, + "grad_norm": 0.44544379557468916, + "learning_rate": 0.00016727184930825288, + "loss": 0.7266, + "step": 539 + }, + { + "epoch": 0.288, + "grad_norm": 0.40447421537567746, + "learning_rate": 0.0001671438908565167, + "loss": 0.7453, + "step": 540 + }, + { + "epoch": 0.2885333333333333, + "grad_norm": 0.5173650514310973, + "learning_rate": 0.00016701573190293077, + "loss": 0.6999, + "step": 541 + }, + { + "epoch": 0.2890666666666667, + "grad_norm": 0.39231299253018714, + "learning_rate": 0.00016688737283019706, + "loss": 0.6665, + "step": 542 + }, + { + "epoch": 0.2896, + "grad_norm": 0.4628975315178123, + "learning_rate": 0.00016675881402161536, + "loss": 0.7475, + "step": 543 + }, + { + "epoch": 0.29013333333333335, + "grad_norm": 0.45529339357311305, + "learning_rate": 0.00016663005586108176, + "loss": 0.742, + "step": 544 + }, + { + "epoch": 0.2906666666666667, + "grad_norm": 0.42257598124521273, + "learning_rate": 0.00016650109873308765, + "loss": 0.6837, + "step": 545 + }, + { + "epoch": 0.2912, + "grad_norm": 0.4717488331588699, + "learning_rate": 0.0001663719430227186, + "loss": 0.6761, + "step": 546 + }, + { + "epoch": 0.29173333333333334, + "grad_norm": 0.406874476439242, + "learning_rate": 0.0001662425891156531, + "loss": 0.7217, + "step": 547 + }, + { + "epoch": 0.2922666666666667, + "grad_norm": 0.44644268439113843, + "learning_rate": 0.00016611303739816168, + "loss": 0.698, + "step": 548 + }, + { + "epoch": 0.2928, + "grad_norm": 0.4885011165806191, + "learning_rate": 0.00016598328825710533, + "loss": 0.7378, + "step": 549 + }, + { + "epoch": 0.29333333333333333, + "grad_norm": 0.46157866114898766, + "learning_rate": 0.00016585334207993476, + "loss": 0.8314, + "step": 550 + }, + { + "epoch": 0.29386666666666666, + "grad_norm": 0.4806213326080623, + "learning_rate": 0.00016572319925468892, + "loss": 0.7766, + "step": 551 + }, + { + "epoch": 0.2944, + "grad_norm": 0.4313784252569985, + "learning_rate": 0.000165592860169994, + "loss": 0.7742, + "step": 552 + }, + { + "epoch": 0.2949333333333333, + "grad_norm": 0.4186941403333472, + "learning_rate": 0.0001654623252150624, + "loss": 0.7221, + "step": 553 + }, + { + "epoch": 0.29546666666666666, + "grad_norm": 0.4153597720533655, + "learning_rate": 0.00016533159477969122, + "loss": 0.7135, + "step": 554 + }, + { + "epoch": 0.296, + "grad_norm": 0.4301708135822861, + "learning_rate": 0.00016520066925426144, + "loss": 0.7219, + "step": 555 + }, + { + "epoch": 0.2965333333333333, + "grad_norm": 0.38497717673667126, + "learning_rate": 0.00016506954902973655, + "loss": 0.7413, + "step": 556 + }, + { + "epoch": 0.29706666666666665, + "grad_norm": 0.4214776209400845, + "learning_rate": 0.00016493823449766136, + "loss": 0.7778, + "step": 557 + }, + { + "epoch": 0.2976, + "grad_norm": 0.4485603257497472, + "learning_rate": 0.0001648067260501611, + "loss": 0.7231, + "step": 558 + }, + { + "epoch": 0.2981333333333333, + "grad_norm": 0.4467013277600801, + "learning_rate": 0.00016467502407993992, + "loss": 0.6717, + "step": 559 + }, + { + "epoch": 0.2986666666666667, + "grad_norm": 0.7248352418893502, + "learning_rate": 0.0001645431289802799, + "loss": 0.8396, + "step": 560 + }, + { + "epoch": 0.2992, + "grad_norm": 0.5316655928965951, + "learning_rate": 0.0001644110411450398, + "loss": 0.7603, + "step": 561 + }, + { + "epoch": 0.29973333333333335, + "grad_norm": 0.4191163861171123, + "learning_rate": 0.00016427876096865394, + "loss": 0.7147, + "step": 562 + }, + { + "epoch": 0.3002666666666667, + "grad_norm": 0.3901779280880945, + "learning_rate": 0.00016414628884613107, + "loss": 0.6702, + "step": 563 + }, + { + "epoch": 0.3008, + "grad_norm": 0.42692082886530397, + "learning_rate": 0.00016401362517305296, + "loss": 0.7633, + "step": 564 + }, + { + "epoch": 0.30133333333333334, + "grad_norm": 0.4657478707478943, + "learning_rate": 0.00016388077034557355, + "loss": 0.7392, + "step": 565 + }, + { + "epoch": 0.30186666666666667, + "grad_norm": 0.43968295433399096, + "learning_rate": 0.00016374772476041748, + "loss": 0.7536, + "step": 566 + }, + { + "epoch": 0.3024, + "grad_norm": 0.4879853331438654, + "learning_rate": 0.00016361448881487914, + "loss": 0.8114, + "step": 567 + }, + { + "epoch": 0.30293333333333333, + "grad_norm": 0.524883168299829, + "learning_rate": 0.00016348106290682118, + "loss": 0.7918, + "step": 568 + }, + { + "epoch": 0.30346666666666666, + "grad_norm": 0.4090914095307245, + "learning_rate": 0.00016334744743467364, + "loss": 0.7193, + "step": 569 + }, + { + "epoch": 0.304, + "grad_norm": 0.43022883747007334, + "learning_rate": 0.00016321364279743266, + "loss": 0.5877, + "step": 570 + }, + { + "epoch": 0.3045333333333333, + "grad_norm": 0.4273854651825723, + "learning_rate": 0.00016307964939465914, + "loss": 0.7073, + "step": 571 + }, + { + "epoch": 0.30506666666666665, + "grad_norm": 0.49306924109483663, + "learning_rate": 0.00016294546762647775, + "loss": 0.7834, + "step": 572 + }, + { + "epoch": 0.3056, + "grad_norm": 0.44958407187654065, + "learning_rate": 0.0001628110978935756, + "loss": 0.7516, + "step": 573 + }, + { + "epoch": 0.3061333333333333, + "grad_norm": 0.41517495755784234, + "learning_rate": 0.0001626765405972011, + "loss": 0.6902, + "step": 574 + }, + { + "epoch": 0.30666666666666664, + "grad_norm": 0.3853060345524645, + "learning_rate": 0.00016254179613916278, + "loss": 0.7572, + "step": 575 + }, + { + "epoch": 0.3072, + "grad_norm": 0.40441247254812485, + "learning_rate": 0.00016240686492182804, + "loss": 0.7149, + "step": 576 + }, + { + "epoch": 0.30773333333333336, + "grad_norm": 0.5088494997903442, + "learning_rate": 0.000162271747348122, + "loss": 0.7483, + "step": 577 + }, + { + "epoch": 0.3082666666666667, + "grad_norm": 0.5522199125021355, + "learning_rate": 0.0001621364438215262, + "loss": 0.846, + "step": 578 + }, + { + "epoch": 0.3088, + "grad_norm": 0.37330826151446855, + "learning_rate": 0.00016200095474607753, + "loss": 0.7074, + "step": 579 + }, + { + "epoch": 0.30933333333333335, + "grad_norm": 0.48956139290248746, + "learning_rate": 0.00016186528052636692, + "loss": 0.6854, + "step": 580 + }, + { + "epoch": 0.3098666666666667, + "grad_norm": 0.41750812621008515, + "learning_rate": 0.0001617294215675382, + "loss": 0.7007, + "step": 581 + }, + { + "epoch": 0.3104, + "grad_norm": 0.5056192033810395, + "learning_rate": 0.00016159337827528685, + "loss": 0.7471, + "step": 582 + }, + { + "epoch": 0.31093333333333334, + "grad_norm": 0.403218209604543, + "learning_rate": 0.0001614571510558588, + "loss": 0.7342, + "step": 583 + }, + { + "epoch": 0.31146666666666667, + "grad_norm": 0.439841734412921, + "learning_rate": 0.00016132074031604917, + "loss": 0.7172, + "step": 584 + }, + { + "epoch": 0.312, + "grad_norm": 0.513975233435866, + "learning_rate": 0.0001611841464632011, + "loss": 0.7734, + "step": 585 + }, + { + "epoch": 0.31253333333333333, + "grad_norm": 0.5124133462407117, + "learning_rate": 0.00016104736990520468, + "loss": 0.7933, + "step": 586 + }, + { + "epoch": 0.31306666666666666, + "grad_norm": 0.45997147328786614, + "learning_rate": 0.0001609104110504954, + "loss": 0.7437, + "step": 587 + }, + { + "epoch": 0.3136, + "grad_norm": 0.47849333008086586, + "learning_rate": 0.0001607732703080532, + "loss": 0.8166, + "step": 588 + }, + { + "epoch": 0.3141333333333333, + "grad_norm": 0.44707270851351555, + "learning_rate": 0.00016063594808740113, + "loss": 0.7327, + "step": 589 + }, + { + "epoch": 0.31466666666666665, + "grad_norm": 0.37707704242799883, + "learning_rate": 0.00016049844479860422, + "loss": 0.6844, + "step": 590 + }, + { + "epoch": 0.3152, + "grad_norm": 0.5556467813011955, + "learning_rate": 0.00016036076085226814, + "loss": 0.7432, + "step": 591 + }, + { + "epoch": 0.3157333333333333, + "grad_norm": 0.4491670034732749, + "learning_rate": 0.00016022289665953808, + "loss": 0.7496, + "step": 592 + }, + { + "epoch": 0.31626666666666664, + "grad_norm": 0.45034954515495185, + "learning_rate": 0.00016008485263209742, + "loss": 0.7181, + "step": 593 + }, + { + "epoch": 0.3168, + "grad_norm": 0.4245467770127856, + "learning_rate": 0.0001599466291821666, + "loss": 0.7253, + "step": 594 + }, + { + "epoch": 0.31733333333333336, + "grad_norm": 0.39504189902573533, + "learning_rate": 0.0001598082267225018, + "loss": 0.7526, + "step": 595 + }, + { + "epoch": 0.3178666666666667, + "grad_norm": 0.4750528432017365, + "learning_rate": 0.0001596696456663938, + "loss": 0.8005, + "step": 596 + }, + { + "epoch": 0.3184, + "grad_norm": 0.46842200890521113, + "learning_rate": 0.0001595308864276666, + "loss": 0.7005, + "step": 597 + }, + { + "epoch": 0.31893333333333335, + "grad_norm": 0.35903952707599596, + "learning_rate": 0.00015939194942067646, + "loss": 0.6735, + "step": 598 + }, + { + "epoch": 0.3194666666666667, + "grad_norm": 0.37995752480766354, + "learning_rate": 0.0001592528350603103, + "loss": 0.6625, + "step": 599 + }, + { + "epoch": 0.32, + "grad_norm": 0.4476073132273197, + "learning_rate": 0.0001591135437619847, + "loss": 0.7285, + "step": 600 + }, + { + "epoch": 0.32053333333333334, + "grad_norm": 0.43193310514746797, + "learning_rate": 0.00015897407594164467, + "loss": 0.7437, + "step": 601 + }, + { + "epoch": 0.32106666666666667, + "grad_norm": 0.6244911316558347, + "learning_rate": 0.00015883443201576225, + "loss": 0.7943, + "step": 602 + }, + { + "epoch": 0.3216, + "grad_norm": 0.42134930350758226, + "learning_rate": 0.0001586946124013354, + "loss": 0.7531, + "step": 603 + }, + { + "epoch": 0.3221333333333333, + "grad_norm": 0.4813548073126474, + "learning_rate": 0.00015855461751588677, + "loss": 0.7492, + "step": 604 + }, + { + "epoch": 0.32266666666666666, + "grad_norm": 0.45462078206354734, + "learning_rate": 0.0001584144477774623, + "loss": 0.7924, + "step": 605 + }, + { + "epoch": 0.3232, + "grad_norm": 0.43853925193295296, + "learning_rate": 0.0001582741036046301, + "loss": 0.7238, + "step": 606 + }, + { + "epoch": 0.3237333333333333, + "grad_norm": 0.4369785313615201, + "learning_rate": 0.00015813358541647915, + "loss": 0.7512, + "step": 607 + }, + { + "epoch": 0.32426666666666665, + "grad_norm": 0.4294524565445153, + "learning_rate": 0.00015799289363261813, + "loss": 0.7708, + "step": 608 + }, + { + "epoch": 0.3248, + "grad_norm": 0.4119283273720428, + "learning_rate": 0.00015785202867317407, + "loss": 0.759, + "step": 609 + }, + { + "epoch": 0.3253333333333333, + "grad_norm": 0.4239654817744758, + "learning_rate": 0.00015771099095879108, + "loss": 0.6857, + "step": 610 + }, + { + "epoch": 0.3258666666666667, + "grad_norm": 0.41958723907270873, + "learning_rate": 0.0001575697809106292, + "loss": 0.7528, + "step": 611 + }, + { + "epoch": 0.3264, + "grad_norm": 0.42760597413040674, + "learning_rate": 0.00015742839895036305, + "loss": 0.7488, + "step": 612 + }, + { + "epoch": 0.32693333333333335, + "grad_norm": 0.41189489453568245, + "learning_rate": 0.00015728684550018064, + "loss": 0.71, + "step": 613 + }, + { + "epoch": 0.3274666666666667, + "grad_norm": 0.4325046558582601, + "learning_rate": 0.0001571451209827821, + "loss": 0.6984, + "step": 614 + }, + { + "epoch": 0.328, + "grad_norm": 0.4689755340313718, + "learning_rate": 0.00015700322582137827, + "loss": 0.7053, + "step": 615 + }, + { + "epoch": 0.32853333333333334, + "grad_norm": 0.5179610692091388, + "learning_rate": 0.00015686116043968972, + "loss": 0.743, + "step": 616 + }, + { + "epoch": 0.3290666666666667, + "grad_norm": 0.4371161166067628, + "learning_rate": 0.00015671892526194516, + "loss": 0.6869, + "step": 617 + }, + { + "epoch": 0.3296, + "grad_norm": 0.412067856511975, + "learning_rate": 0.0001565765207128805, + "loss": 0.7089, + "step": 618 + }, + { + "epoch": 0.33013333333333333, + "grad_norm": 0.48352704083048004, + "learning_rate": 0.0001564339472177373, + "loss": 0.7538, + "step": 619 + }, + { + "epoch": 0.33066666666666666, + "grad_norm": 0.4114561121520049, + "learning_rate": 0.00015629120520226165, + "loss": 0.7283, + "step": 620 + }, + { + "epoch": 0.3312, + "grad_norm": 0.4129768740986683, + "learning_rate": 0.0001561482950927029, + "loss": 0.7181, + "step": 621 + }, + { + "epoch": 0.3317333333333333, + "grad_norm": 0.4058308521722823, + "learning_rate": 0.0001560052173158123, + "loss": 0.7241, + "step": 622 + }, + { + "epoch": 0.33226666666666665, + "grad_norm": 0.5313828099613424, + "learning_rate": 0.00015586197229884184, + "loss": 0.7964, + "step": 623 + }, + { + "epoch": 0.3328, + "grad_norm": 0.44339994785709935, + "learning_rate": 0.00015571856046954285, + "loss": 0.7237, + "step": 624 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.36296656767708446, + "learning_rate": 0.00015557498225616487, + "loss": 0.6328, + "step": 625 + }, + { + "epoch": 0.33386666666666664, + "grad_norm": 0.4200778471709336, + "learning_rate": 0.0001554312380874542, + "loss": 0.662, + "step": 626 + }, + { + "epoch": 0.3344, + "grad_norm": 0.3714091612897101, + "learning_rate": 0.00015528732839265272, + "loss": 0.6658, + "step": 627 + }, + { + "epoch": 0.33493333333333336, + "grad_norm": 0.40298315807868135, + "learning_rate": 0.00015514325360149668, + "loss": 0.6665, + "step": 628 + }, + { + "epoch": 0.3354666666666667, + "grad_norm": 0.4312775541358898, + "learning_rate": 0.0001549990141442153, + "loss": 0.7136, + "step": 629 + }, + { + "epoch": 0.336, + "grad_norm": 0.434751665612994, + "learning_rate": 0.0001548546104515294, + "loss": 0.6922, + "step": 630 + }, + { + "epoch": 0.33653333333333335, + "grad_norm": 0.4304044437917915, + "learning_rate": 0.00015471004295465035, + "loss": 0.766, + "step": 631 + }, + { + "epoch": 0.3370666666666667, + "grad_norm": 0.435059891602515, + "learning_rate": 0.0001545653120852787, + "loss": 0.684, + "step": 632 + }, + { + "epoch": 0.3376, + "grad_norm": 0.4716447319029521, + "learning_rate": 0.00015442041827560274, + "loss": 0.7709, + "step": 633 + }, + { + "epoch": 0.33813333333333334, + "grad_norm": 0.46798986066853526, + "learning_rate": 0.00015427536195829742, + "loss": 0.6937, + "step": 634 + }, + { + "epoch": 0.33866666666666667, + "grad_norm": 0.398112591291493, + "learning_rate": 0.00015413014356652286, + "loss": 0.6595, + "step": 635 + }, + { + "epoch": 0.3392, + "grad_norm": 0.4144261054191747, + "learning_rate": 0.00015398476353392323, + "loss": 0.6714, + "step": 636 + }, + { + "epoch": 0.33973333333333333, + "grad_norm": 0.40097132071333075, + "learning_rate": 0.00015383922229462549, + "loss": 0.7164, + "step": 637 + }, + { + "epoch": 0.34026666666666666, + "grad_norm": 0.4072626688281069, + "learning_rate": 0.00015369352028323774, + "loss": 0.7137, + "step": 638 + }, + { + "epoch": 0.3408, + "grad_norm": 0.402793601796227, + "learning_rate": 0.00015354765793484834, + "loss": 0.669, + "step": 639 + }, + { + "epoch": 0.3413333333333333, + "grad_norm": 0.434833795749613, + "learning_rate": 0.0001534016356850244, + "loss": 0.6841, + "step": 640 + }, + { + "epoch": 0.34186666666666665, + "grad_norm": 0.4052645108469939, + "learning_rate": 0.0001532554539698105, + "loss": 0.7611, + "step": 641 + }, + { + "epoch": 0.3424, + "grad_norm": 0.3805800622358866, + "learning_rate": 0.00015310911322572753, + "loss": 0.7112, + "step": 642 + }, + { + "epoch": 0.3429333333333333, + "grad_norm": 0.41487367876380865, + "learning_rate": 0.00015296261388977108, + "loss": 0.7288, + "step": 643 + }, + { + "epoch": 0.34346666666666664, + "grad_norm": 0.4646233771044, + "learning_rate": 0.0001528159563994104, + "loss": 0.6995, + "step": 644 + }, + { + "epoch": 0.344, + "grad_norm": 0.3963892013269609, + "learning_rate": 0.000152669141192587, + "loss": 0.6791, + "step": 645 + }, + { + "epoch": 0.34453333333333336, + "grad_norm": 0.4184099001892143, + "learning_rate": 0.00015252216870771345, + "loss": 0.6969, + "step": 646 + }, + { + "epoch": 0.3450666666666667, + "grad_norm": 0.42542898585695293, + "learning_rate": 0.00015237503938367186, + "loss": 0.7191, + "step": 647 + }, + { + "epoch": 0.3456, + "grad_norm": 0.5448829291630773, + "learning_rate": 0.00015222775365981273, + "loss": 0.7122, + "step": 648 + }, + { + "epoch": 0.34613333333333335, + "grad_norm": 0.47125879567517487, + "learning_rate": 0.00015208031197595356, + "loss": 0.7616, + "step": 649 + }, + { + "epoch": 0.3466666666666667, + "grad_norm": 0.44026883256482013, + "learning_rate": 0.0001519327147723776, + "loss": 0.7366, + "step": 650 + }, + { + "epoch": 0.3472, + "grad_norm": 0.44467255998295074, + "learning_rate": 0.00015178496248983254, + "loss": 0.7329, + "step": 651 + }, + { + "epoch": 0.34773333333333334, + "grad_norm": 0.47803310845693225, + "learning_rate": 0.0001516370555695291, + "loss": 0.7534, + "step": 652 + }, + { + "epoch": 0.34826666666666667, + "grad_norm": 0.43203894941566023, + "learning_rate": 0.00015148899445313981, + "loss": 0.656, + "step": 653 + }, + { + "epoch": 0.3488, + "grad_norm": 0.43900267263581866, + "learning_rate": 0.00015134077958279765, + "loss": 0.7244, + "step": 654 + }, + { + "epoch": 0.34933333333333333, + "grad_norm": 0.454298973382149, + "learning_rate": 0.00015119241140109467, + "loss": 0.7562, + "step": 655 + }, + { + "epoch": 0.34986666666666666, + "grad_norm": 0.3802978945649559, + "learning_rate": 0.00015104389035108077, + "loss": 0.7344, + "step": 656 + }, + { + "epoch": 0.3504, + "grad_norm": 0.3945099100516662, + "learning_rate": 0.00015089521687626243, + "loss": 0.7441, + "step": 657 + }, + { + "epoch": 0.3509333333333333, + "grad_norm": 0.4475131716099979, + "learning_rate": 0.0001507463914206012, + "loss": 0.7483, + "step": 658 + }, + { + "epoch": 0.35146666666666665, + "grad_norm": 0.4642469578161393, + "learning_rate": 0.0001505974144285124, + "loss": 0.6875, + "step": 659 + }, + { + "epoch": 0.352, + "grad_norm": 0.5176302593343155, + "learning_rate": 0.000150448286344864, + "loss": 0.8067, + "step": 660 + }, + { + "epoch": 0.3525333333333333, + "grad_norm": 0.41986391465678924, + "learning_rate": 0.00015029900761497506, + "loss": 0.7475, + "step": 661 + }, + { + "epoch": 0.35306666666666664, + "grad_norm": 0.46136378205709, + "learning_rate": 0.00015014957868461458, + "loss": 0.6895, + "step": 662 + }, + { + "epoch": 0.3536, + "grad_norm": 0.5259786387214987, + "learning_rate": 0.00015000000000000001, + "loss": 0.784, + "step": 663 + }, + { + "epoch": 0.35413333333333336, + "grad_norm": 0.43018111896826905, + "learning_rate": 0.000149850272007796, + "loss": 0.7328, + "step": 664 + }, + { + "epoch": 0.3546666666666667, + "grad_norm": 0.44617404140289296, + "learning_rate": 0.00014970039515511304, + "loss": 0.7792, + "step": 665 + }, + { + "epoch": 0.3552, + "grad_norm": 0.37068315607963076, + "learning_rate": 0.00014955036988950618, + "loss": 0.6809, + "step": 666 + }, + { + "epoch": 0.35573333333333335, + "grad_norm": 0.4268403834428016, + "learning_rate": 0.0001494001966589736, + "loss": 0.7044, + "step": 667 + }, + { + "epoch": 0.3562666666666667, + "grad_norm": 0.49275362102142706, + "learning_rate": 0.00014924987591195547, + "loss": 0.7278, + "step": 668 + }, + { + "epoch": 0.3568, + "grad_norm": 0.4119689333350128, + "learning_rate": 0.00014909940809733222, + "loss": 0.6608, + "step": 669 + }, + { + "epoch": 0.35733333333333334, + "grad_norm": 0.48239976045422317, + "learning_rate": 0.0001489487936644237, + "loss": 0.7022, + "step": 670 + }, + { + "epoch": 0.35786666666666667, + "grad_norm": 0.444957256161989, + "learning_rate": 0.00014879803306298736, + "loss": 0.6812, + "step": 671 + }, + { + "epoch": 0.3584, + "grad_norm": 0.41397582982901027, + "learning_rate": 0.00014864712674321734, + "loss": 0.7159, + "step": 672 + }, + { + "epoch": 0.3589333333333333, + "grad_norm": 0.41256809240312126, + "learning_rate": 0.00014849607515574276, + "loss": 0.6932, + "step": 673 + }, + { + "epoch": 0.35946666666666666, + "grad_norm": 0.45527592581249554, + "learning_rate": 0.00014834487875162657, + "loss": 0.7266, + "step": 674 + }, + { + "epoch": 0.36, + "grad_norm": 0.44174574312687004, + "learning_rate": 0.00014819353798236427, + "loss": 0.7564, + "step": 675 + }, + { + "epoch": 0.3605333333333333, + "grad_norm": 0.45677830394976854, + "learning_rate": 0.00014804205329988225, + "loss": 0.7419, + "step": 676 + }, + { + "epoch": 0.36106666666666665, + "grad_norm": 0.39187838181442464, + "learning_rate": 0.00014789042515653687, + "loss": 0.7346, + "step": 677 + }, + { + "epoch": 0.3616, + "grad_norm": 0.5075723274296824, + "learning_rate": 0.00014773865400511272, + "loss": 0.8275, + "step": 678 + }, + { + "epoch": 0.3621333333333333, + "grad_norm": 0.4658019119180805, + "learning_rate": 0.00014758674029882152, + "loss": 0.6864, + "step": 679 + }, + { + "epoch": 0.3626666666666667, + "grad_norm": 0.38307767363207407, + "learning_rate": 0.00014743468449130063, + "loss": 0.7388, + "step": 680 + }, + { + "epoch": 0.3632, + "grad_norm": 0.43750010476245005, + "learning_rate": 0.00014728248703661182, + "loss": 0.7457, + "step": 681 + }, + { + "epoch": 0.36373333333333335, + "grad_norm": 0.43833249022891085, + "learning_rate": 0.00014713014838923976, + "loss": 0.7074, + "step": 682 + }, + { + "epoch": 0.3642666666666667, + "grad_norm": 0.4314878527261997, + "learning_rate": 0.00014697766900409074, + "loss": 0.7794, + "step": 683 + }, + { + "epoch": 0.3648, + "grad_norm": 0.48154869168406245, + "learning_rate": 0.00014682504933649144, + "loss": 0.7915, + "step": 684 + }, + { + "epoch": 0.36533333333333334, + "grad_norm": 0.4327408829170178, + "learning_rate": 0.0001466722898421873, + "loss": 0.7173, + "step": 685 + }, + { + "epoch": 0.3658666666666667, + "grad_norm": 0.47887799882065063, + "learning_rate": 0.0001465193909773413, + "loss": 0.8572, + "step": 686 + }, + { + "epoch": 0.3664, + "grad_norm": 0.45107242319571567, + "learning_rate": 0.00014636635319853275, + "loss": 0.7086, + "step": 687 + }, + { + "epoch": 0.36693333333333333, + "grad_norm": 0.4629615233997194, + "learning_rate": 0.00014621317696275564, + "loss": 0.6792, + "step": 688 + }, + { + "epoch": 0.36746666666666666, + "grad_norm": 0.4239680655581829, + "learning_rate": 0.00014605986272741748, + "loss": 0.7308, + "step": 689 + }, + { + "epoch": 0.368, + "grad_norm": 0.43004699919410794, + "learning_rate": 0.00014590641095033787, + "loss": 0.7725, + "step": 690 + }, + { + "epoch": 0.3685333333333333, + "grad_norm": 0.4434971876446726, + "learning_rate": 0.00014575282208974702, + "loss": 0.7257, + "step": 691 + }, + { + "epoch": 0.36906666666666665, + "grad_norm": 0.43702022329417833, + "learning_rate": 0.00014559909660428468, + "loss": 0.7078, + "step": 692 + }, + { + "epoch": 0.3696, + "grad_norm": 0.5165487230674608, + "learning_rate": 0.00014544523495299842, + "loss": 0.753, + "step": 693 + }, + { + "epoch": 0.3701333333333333, + "grad_norm": 0.43730629587059877, + "learning_rate": 0.00014529123759534255, + "loss": 0.7266, + "step": 694 + }, + { + "epoch": 0.37066666666666664, + "grad_norm": 0.4353705569266859, + "learning_rate": 0.00014513710499117647, + "loss": 0.6965, + "step": 695 + }, + { + "epoch": 0.3712, + "grad_norm": 0.42291324813123504, + "learning_rate": 0.0001449828376007636, + "loss": 0.7302, + "step": 696 + }, + { + "epoch": 0.37173333333333336, + "grad_norm": 0.43608599164904627, + "learning_rate": 0.00014482843588476974, + "loss": 0.6879, + "step": 697 + }, + { + "epoch": 0.3722666666666667, + "grad_norm": 0.44640764356716217, + "learning_rate": 0.00014467390030426186, + "loss": 0.6458, + "step": 698 + }, + { + "epoch": 0.3728, + "grad_norm": 0.48226250089051803, + "learning_rate": 0.0001445192313207067, + "loss": 0.6977, + "step": 699 + }, + { + "epoch": 0.37333333333333335, + "grad_norm": 0.49380259764442747, + "learning_rate": 0.0001443644293959693, + "loss": 0.7087, + "step": 700 + }, + { + "epoch": 0.3738666666666667, + "grad_norm": 0.436743758192173, + "learning_rate": 0.00014420949499231172, + "loss": 0.6975, + "step": 701 + }, + { + "epoch": 0.3744, + "grad_norm": 0.4334747817473767, + "learning_rate": 0.0001440544285723915, + "loss": 0.7104, + "step": 702 + }, + { + "epoch": 0.37493333333333334, + "grad_norm": 0.43374268546338396, + "learning_rate": 0.00014389923059926062, + "loss": 0.7169, + "step": 703 + }, + { + "epoch": 0.37546666666666667, + "grad_norm": 0.3925376896201814, + "learning_rate": 0.0001437439015363638, + "loss": 0.6734, + "step": 704 + }, + { + "epoch": 0.376, + "grad_norm": 0.39449819892584087, + "learning_rate": 0.00014358844184753712, + "loss": 0.6673, + "step": 705 + }, + { + "epoch": 0.37653333333333333, + "grad_norm": 0.4509103518457081, + "learning_rate": 0.00014343285199700683, + "loss": 0.6778, + "step": 706 + }, + { + "epoch": 0.37706666666666666, + "grad_norm": 0.5082219387763629, + "learning_rate": 0.0001432771324493879, + "loss": 0.7513, + "step": 707 + }, + { + "epoch": 0.3776, + "grad_norm": 0.40216967606345727, + "learning_rate": 0.00014312128366968243, + "loss": 0.7431, + "step": 708 + }, + { + "epoch": 0.3781333333333333, + "grad_norm": 0.40969266203255283, + "learning_rate": 0.00014296530612327863, + "loss": 0.7277, + "step": 709 + }, + { + "epoch": 0.37866666666666665, + "grad_norm": 0.41888905431993123, + "learning_rate": 0.00014280920027594907, + "loss": 0.6775, + "step": 710 + }, + { + "epoch": 0.3792, + "grad_norm": 0.4093731187807328, + "learning_rate": 0.00014265296659384956, + "loss": 0.739, + "step": 711 + }, + { + "epoch": 0.3797333333333333, + "grad_norm": 0.4178473476692447, + "learning_rate": 0.00014249660554351752, + "loss": 0.5942, + "step": 712 + }, + { + "epoch": 0.38026666666666664, + "grad_norm": 0.44171104100805797, + "learning_rate": 0.00014234011759187083, + "loss": 0.739, + "step": 713 + }, + { + "epoch": 0.3808, + "grad_norm": 0.4252349083840047, + "learning_rate": 0.00014218350320620624, + "loss": 0.6972, + "step": 714 + }, + { + "epoch": 0.38133333333333336, + "grad_norm": 0.377200795606724, + "learning_rate": 0.00014202676285419812, + "loss": 0.6597, + "step": 715 + }, + { + "epoch": 0.3818666666666667, + "grad_norm": 0.4818691640691401, + "learning_rate": 0.00014186989700389687, + "loss": 0.7594, + "step": 716 + }, + { + "epoch": 0.3824, + "grad_norm": 0.46462827650401956, + "learning_rate": 0.0001417129061237278, + "loss": 0.7223, + "step": 717 + }, + { + "epoch": 0.38293333333333335, + "grad_norm": 0.45987811453490407, + "learning_rate": 0.0001415557906824895, + "loss": 0.7314, + "step": 718 + }, + { + "epoch": 0.3834666666666667, + "grad_norm": 0.4369841928085185, + "learning_rate": 0.00014139855114935252, + "loss": 0.6808, + "step": 719 + }, + { + "epoch": 0.384, + "grad_norm": 0.38233064870972155, + "learning_rate": 0.00014124118799385796, + "loss": 0.683, + "step": 720 + }, + { + "epoch": 0.38453333333333334, + "grad_norm": 0.46603230782706917, + "learning_rate": 0.0001410837016859161, + "loss": 0.7672, + "step": 721 + }, + { + "epoch": 0.38506666666666667, + "grad_norm": 0.41753317232563836, + "learning_rate": 0.00014092609269580496, + "loss": 0.7436, + "step": 722 + }, + { + "epoch": 0.3856, + "grad_norm": 0.3799296803428843, + "learning_rate": 0.00014076836149416887, + "loss": 0.7286, + "step": 723 + }, + { + "epoch": 0.38613333333333333, + "grad_norm": 0.47292296820852897, + "learning_rate": 0.00014061050855201723, + "loss": 0.7185, + "step": 724 + }, + { + "epoch": 0.38666666666666666, + "grad_norm": 0.3895957480245421, + "learning_rate": 0.0001404525343407228, + "loss": 0.7048, + "step": 725 + }, + { + "epoch": 0.3872, + "grad_norm": 0.4066293338823343, + "learning_rate": 0.0001402944393320206, + "loss": 0.7521, + "step": 726 + }, + { + "epoch": 0.3877333333333333, + "grad_norm": 0.4387329715532074, + "learning_rate": 0.00014013622399800627, + "loss": 0.7655, + "step": 727 + }, + { + "epoch": 0.38826666666666665, + "grad_norm": 0.3931358838546744, + "learning_rate": 0.00013997788881113489, + "loss": 0.6756, + "step": 728 + }, + { + "epoch": 0.3888, + "grad_norm": 0.45033280252042396, + "learning_rate": 0.00013981943424421932, + "loss": 0.662, + "step": 729 + }, + { + "epoch": 0.3893333333333333, + "grad_norm": 0.4523685749264847, + "learning_rate": 0.0001396608607704289, + "loss": 0.7158, + "step": 730 + }, + { + "epoch": 0.38986666666666664, + "grad_norm": 0.38155286027533103, + "learning_rate": 0.0001395021688632882, + "loss": 0.6829, + "step": 731 + }, + { + "epoch": 0.3904, + "grad_norm": 0.40167710016071684, + "learning_rate": 0.00013934335899667527, + "loss": 0.7151, + "step": 732 + }, + { + "epoch": 0.39093333333333335, + "grad_norm": 0.4597203398268425, + "learning_rate": 0.00013918443164482046, + "loss": 0.6922, + "step": 733 + }, + { + "epoch": 0.3914666666666667, + "grad_norm": 0.42083485098131285, + "learning_rate": 0.000139025387282305, + "loss": 0.6411, + "step": 734 + }, + { + "epoch": 0.392, + "grad_norm": 0.3778948458076018, + "learning_rate": 0.00013886622638405952, + "loss": 0.6016, + "step": 735 + }, + { + "epoch": 0.39253333333333335, + "grad_norm": 0.4087111865952575, + "learning_rate": 0.0001387069494253626, + "loss": 0.7388, + "step": 736 + }, + { + "epoch": 0.3930666666666667, + "grad_norm": 0.4049422496340538, + "learning_rate": 0.0001385475568818394, + "loss": 0.6783, + "step": 737 + }, + { + "epoch": 0.3936, + "grad_norm": 0.3992656661882664, + "learning_rate": 0.00013838804922946027, + "loss": 0.7054, + "step": 738 + }, + { + "epoch": 0.39413333333333334, + "grad_norm": 0.36378149445487457, + "learning_rate": 0.00013822842694453924, + "loss": 0.6654, + "step": 739 + }, + { + "epoch": 0.39466666666666667, + "grad_norm": 0.4882319040471909, + "learning_rate": 0.0001380686905037327, + "loss": 0.7128, + "step": 740 + }, + { + "epoch": 0.3952, + "grad_norm": 0.4269943520410961, + "learning_rate": 0.00013790884038403795, + "loss": 0.7005, + "step": 741 + }, + { + "epoch": 0.3957333333333333, + "grad_norm": 0.36804409928283105, + "learning_rate": 0.00013774887706279165, + "loss": 0.6559, + "step": 742 + }, + { + "epoch": 0.39626666666666666, + "grad_norm": 0.4195207951945363, + "learning_rate": 0.0001375888010176686, + "loss": 0.7012, + "step": 743 + }, + { + "epoch": 0.3968, + "grad_norm": 0.44710522148863835, + "learning_rate": 0.00013742861272668012, + "loss": 0.6932, + "step": 744 + }, + { + "epoch": 0.3973333333333333, + "grad_norm": 0.4204747676370217, + "learning_rate": 0.00013726831266817278, + "loss": 0.7129, + "step": 745 + }, + { + "epoch": 0.39786666666666665, + "grad_norm": 0.42163735336974956, + "learning_rate": 0.00013710790132082692, + "loss": 0.717, + "step": 746 + }, + { + "epoch": 0.3984, + "grad_norm": 0.4757406531134025, + "learning_rate": 0.00013694737916365517, + "loss": 0.6804, + "step": 747 + }, + { + "epoch": 0.3989333333333333, + "grad_norm": 0.4315771333847926, + "learning_rate": 0.00013678674667600102, + "loss": 0.6173, + "step": 748 + }, + { + "epoch": 0.3994666666666667, + "grad_norm": 0.43237685072831544, + "learning_rate": 0.00013662600433753745, + "loss": 0.6933, + "step": 749 + }, + { + "epoch": 0.4, + "grad_norm": 0.4158118707747596, + "learning_rate": 0.00013646515262826552, + "loss": 0.7036, + "step": 750 + }, + { + "epoch": 0.40053333333333335, + "grad_norm": 0.4307855486428975, + "learning_rate": 0.00013630419202851284, + "loss": 0.7496, + "step": 751 + }, + { + "epoch": 0.4010666666666667, + "grad_norm": 0.429025745211691, + "learning_rate": 0.00013614312301893223, + "loss": 0.7247, + "step": 752 + }, + { + "epoch": 0.4016, + "grad_norm": 0.3576468631216106, + "learning_rate": 0.0001359819460805001, + "loss": 0.6742, + "step": 753 + }, + { + "epoch": 0.40213333333333334, + "grad_norm": 0.4966019064934107, + "learning_rate": 0.00013582066169451535, + "loss": 0.6822, + "step": 754 + }, + { + "epoch": 0.4026666666666667, + "grad_norm": 0.45456433226106024, + "learning_rate": 0.0001356592703425976, + "loss": 0.7619, + "step": 755 + }, + { + "epoch": 0.4032, + "grad_norm": 0.36235700010315813, + "learning_rate": 0.0001354977725066859, + "loss": 0.697, + "step": 756 + }, + { + "epoch": 0.40373333333333333, + "grad_norm": 0.40115755508883477, + "learning_rate": 0.00013533616866903735, + "loss": 0.6888, + "step": 757 + }, + { + "epoch": 0.40426666666666666, + "grad_norm": 0.39301021367659733, + "learning_rate": 0.0001351744593122255, + "loss": 0.6737, + "step": 758 + }, + { + "epoch": 0.4048, + "grad_norm": 0.37689418356760185, + "learning_rate": 0.00013501264491913906, + "loss": 0.67, + "step": 759 + }, + { + "epoch": 0.4053333333333333, + "grad_norm": 0.454763983645677, + "learning_rate": 0.00013485072597298038, + "loss": 0.6682, + "step": 760 + }, + { + "epoch": 0.40586666666666665, + "grad_norm": 0.4043439631314288, + "learning_rate": 0.00013468870295726398, + "loss": 0.673, + "step": 761 + }, + { + "epoch": 0.4064, + "grad_norm": 0.44132088073018694, + "learning_rate": 0.0001345265763558152, + "loss": 0.6663, + "step": 762 + }, + { + "epoch": 0.4069333333333333, + "grad_norm": 0.4664887460490766, + "learning_rate": 0.00013436434665276865, + "loss": 0.6323, + "step": 763 + }, + { + "epoch": 0.40746666666666664, + "grad_norm": 0.5079194071561257, + "learning_rate": 0.00013420201433256689, + "loss": 0.708, + "step": 764 + }, + { + "epoch": 0.408, + "grad_norm": 0.5136320484094405, + "learning_rate": 0.00013403957987995882, + "loss": 0.7072, + "step": 765 + }, + { + "epoch": 0.40853333333333336, + "grad_norm": 0.5002124512143936, + "learning_rate": 0.00013387704377999842, + "loss": 0.7624, + "step": 766 + }, + { + "epoch": 0.4090666666666667, + "grad_norm": 0.41959518702885695, + "learning_rate": 0.00013371440651804313, + "loss": 0.6944, + "step": 767 + }, + { + "epoch": 0.4096, + "grad_norm": 0.4161606124666796, + "learning_rate": 0.0001335516685797525, + "loss": 0.6911, + "step": 768 + }, + { + "epoch": 0.41013333333333335, + "grad_norm": 0.5116685264471817, + "learning_rate": 0.00013338883045108674, + "loss": 0.7093, + "step": 769 + }, + { + "epoch": 0.4106666666666667, + "grad_norm": 0.384814734377533, + "learning_rate": 0.00013322589261830517, + "loss": 0.7271, + "step": 770 + }, + { + "epoch": 0.4112, + "grad_norm": 0.3820311467510209, + "learning_rate": 0.00013306285556796495, + "loss": 0.6721, + "step": 771 + }, + { + "epoch": 0.41173333333333334, + "grad_norm": 0.43601952633834995, + "learning_rate": 0.0001328997197869194, + "loss": 0.7368, + "step": 772 + }, + { + "epoch": 0.41226666666666667, + "grad_norm": 0.4331297679878163, + "learning_rate": 0.0001327364857623168, + "loss": 0.7048, + "step": 773 + }, + { + "epoch": 0.4128, + "grad_norm": 0.3776924995843434, + "learning_rate": 0.00013257315398159864, + "loss": 0.637, + "step": 774 + }, + { + "epoch": 0.41333333333333333, + "grad_norm": 0.3659212952711077, + "learning_rate": 0.00013240972493249847, + "loss": 0.6467, + "step": 775 + }, + { + "epoch": 0.41386666666666666, + "grad_norm": 0.4670658459953559, + "learning_rate": 0.0001322461991030402, + "loss": 0.6784, + "step": 776 + }, + { + "epoch": 0.4144, + "grad_norm": 0.46150008468924536, + "learning_rate": 0.00013208257698153677, + "loss": 0.6896, + "step": 777 + }, + { + "epoch": 0.4149333333333333, + "grad_norm": 0.3960956573102029, + "learning_rate": 0.00013191885905658872, + "loss": 0.6774, + "step": 778 + }, + { + "epoch": 0.41546666666666665, + "grad_norm": 0.4856950320911442, + "learning_rate": 0.0001317550458170826, + "loss": 0.7489, + "step": 779 + }, + { + "epoch": 0.416, + "grad_norm": 0.41523717872542065, + "learning_rate": 0.00013159113775218964, + "loss": 0.691, + "step": 780 + }, + { + "epoch": 0.4165333333333333, + "grad_norm": 0.43311712891722615, + "learning_rate": 0.00013142713535136414, + "loss": 0.7437, + "step": 781 + }, + { + "epoch": 0.41706666666666664, + "grad_norm": 0.417547226064576, + "learning_rate": 0.00013126303910434214, + "loss": 0.671, + "step": 782 + }, + { + "epoch": 0.4176, + "grad_norm": 0.4302231301735469, + "learning_rate": 0.00013109884950114007, + "loss": 0.7273, + "step": 783 + }, + { + "epoch": 0.41813333333333336, + "grad_norm": 0.40642630783593625, + "learning_rate": 0.00013093456703205288, + "loss": 0.798, + "step": 784 + }, + { + "epoch": 0.4186666666666667, + "grad_norm": 0.5149084196264678, + "learning_rate": 0.00013077019218765305, + "loss": 0.7864, + "step": 785 + }, + { + "epoch": 0.4192, + "grad_norm": 0.401441193039279, + "learning_rate": 0.00013060572545878875, + "loss": 0.6718, + "step": 786 + }, + { + "epoch": 0.41973333333333335, + "grad_norm": 0.39953693235491955, + "learning_rate": 0.0001304411673365826, + "loss": 0.6589, + "step": 787 + }, + { + "epoch": 0.4202666666666667, + "grad_norm": 0.4883869855102983, + "learning_rate": 0.0001302765183124302, + "loss": 0.7537, + "step": 788 + }, + { + "epoch": 0.4208, + "grad_norm": 0.43684597712350054, + "learning_rate": 0.00013011177887799845, + "loss": 0.7148, + "step": 789 + }, + { + "epoch": 0.42133333333333334, + "grad_norm": 0.37729354638193896, + "learning_rate": 0.00012994694952522435, + "loss": 0.6764, + "step": 790 + }, + { + "epoch": 0.42186666666666667, + "grad_norm": 0.4147481636668755, + "learning_rate": 0.00012978203074631334, + "loss": 0.6456, + "step": 791 + }, + { + "epoch": 0.4224, + "grad_norm": 0.47940626886163656, + "learning_rate": 0.00012961702303373795, + "loss": 0.7393, + "step": 792 + }, + { + "epoch": 0.42293333333333333, + "grad_norm": 0.39817914406276367, + "learning_rate": 0.00012945192688023624, + "loss": 0.6675, + "step": 793 + }, + { + "epoch": 0.42346666666666666, + "grad_norm": 0.5584313468882833, + "learning_rate": 0.0001292867427788104, + "loss": 0.7667, + "step": 794 + }, + { + "epoch": 0.424, + "grad_norm": 0.4367114712993521, + "learning_rate": 0.00012912147122272523, + "loss": 0.7107, + "step": 795 + }, + { + "epoch": 0.4245333333333333, + "grad_norm": 0.36940731022190015, + "learning_rate": 0.00012895611270550666, + "loss": 0.7088, + "step": 796 + }, + { + "epoch": 0.42506666666666665, + "grad_norm": 0.4887139556790538, + "learning_rate": 0.0001287906677209403, + "loss": 0.7472, + "step": 797 + }, + { + "epoch": 0.4256, + "grad_norm": 0.5148049545306919, + "learning_rate": 0.00012862513676307008, + "loss": 0.75, + "step": 798 + }, + { + "epoch": 0.4261333333333333, + "grad_norm": 0.39045927155655474, + "learning_rate": 0.0001284595203261965, + "loss": 0.6184, + "step": 799 + }, + { + "epoch": 0.4266666666666667, + "grad_norm": 0.41338525824422334, + "learning_rate": 0.00012829381890487536, + "loss": 0.6686, + "step": 800 + }, + { + "epoch": 0.4272, + "grad_norm": 0.4580040152660515, + "learning_rate": 0.00012812803299391628, + "loss": 0.7759, + "step": 801 + }, + { + "epoch": 0.42773333333333335, + "grad_norm": 0.39197016661405115, + "learning_rate": 0.00012796216308838117, + "loss": 0.6802, + "step": 802 + }, + { + "epoch": 0.4282666666666667, + "grad_norm": 0.4060787668598037, + "learning_rate": 0.00012779620968358273, + "loss": 0.6979, + "step": 803 + }, + { + "epoch": 0.4288, + "grad_norm": 0.38749826680060884, + "learning_rate": 0.00012763017327508305, + "loss": 0.6528, + "step": 804 + }, + { + "epoch": 0.42933333333333334, + "grad_norm": 0.44255595065983705, + "learning_rate": 0.00012746405435869198, + "loss": 0.7141, + "step": 805 + }, + { + "epoch": 0.4298666666666667, + "grad_norm": 0.37420480713147775, + "learning_rate": 0.00012729785343046588, + "loss": 0.6887, + "step": 806 + }, + { + "epoch": 0.4304, + "grad_norm": 0.4178451644372548, + "learning_rate": 0.0001271315709867059, + "loss": 0.7626, + "step": 807 + }, + { + "epoch": 0.43093333333333333, + "grad_norm": 0.40716545195879217, + "learning_rate": 0.00012696520752395672, + "loss": 0.7046, + "step": 808 + }, + { + "epoch": 0.43146666666666667, + "grad_norm": 0.37260467690368004, + "learning_rate": 0.00012679876353900482, + "loss": 0.6957, + "step": 809 + }, + { + "epoch": 0.432, + "grad_norm": 0.392410196841189, + "learning_rate": 0.00012663223952887723, + "loss": 0.7572, + "step": 810 + }, + { + "epoch": 0.4325333333333333, + "grad_norm": 0.540225482513355, + "learning_rate": 0.00012646563599083996, + "loss": 0.7078, + "step": 811 + }, + { + "epoch": 0.43306666666666666, + "grad_norm": 0.4290983314062885, + "learning_rate": 0.00012629895342239643, + "loss": 0.6923, + "step": 812 + }, + { + "epoch": 0.4336, + "grad_norm": 0.45463577400665817, + "learning_rate": 0.00012613219232128608, + "loss": 0.6582, + "step": 813 + }, + { + "epoch": 0.4341333333333333, + "grad_norm": 0.42506359020091006, + "learning_rate": 0.00012596535318548289, + "loss": 0.7068, + "step": 814 + }, + { + "epoch": 0.43466666666666665, + "grad_norm": 0.4052264833834573, + "learning_rate": 0.0001257984365131938, + "loss": 0.6754, + "step": 815 + }, + { + "epoch": 0.4352, + "grad_norm": 0.42067284781427855, + "learning_rate": 0.00012563144280285741, + "loss": 0.6647, + "step": 816 + }, + { + "epoch": 0.4357333333333333, + "grad_norm": 0.3893801590118544, + "learning_rate": 0.00012546437255314222, + "loss": 0.6829, + "step": 817 + }, + { + "epoch": 0.4362666666666667, + "grad_norm": 0.40396908137241916, + "learning_rate": 0.0001252972262629454, + "loss": 0.6611, + "step": 818 + }, + { + "epoch": 0.4368, + "grad_norm": 0.41467895442675545, + "learning_rate": 0.00012513000443139112, + "loss": 0.6611, + "step": 819 + }, + { + "epoch": 0.43733333333333335, + "grad_norm": 0.4092569145701477, + "learning_rate": 0.00012496270755782914, + "loss": 0.6829, + "step": 820 + }, + { + "epoch": 0.4378666666666667, + "grad_norm": 0.3735402941611452, + "learning_rate": 0.00012479533614183334, + "loss": 0.6455, + "step": 821 + }, + { + "epoch": 0.4384, + "grad_norm": 0.49159857340071456, + "learning_rate": 0.00012462789068320017, + "loss": 0.752, + "step": 822 + }, + { + "epoch": 0.43893333333333334, + "grad_norm": 0.46477841449404667, + "learning_rate": 0.00012446037168194714, + "loss": 0.691, + "step": 823 + }, + { + "epoch": 0.43946666666666667, + "grad_norm": 0.5256131810881728, + "learning_rate": 0.00012429277963831148, + "loss": 0.7473, + "step": 824 + }, + { + "epoch": 0.44, + "grad_norm": 0.3994308410818447, + "learning_rate": 0.00012412511505274844, + "loss": 0.689, + "step": 825 + }, + { + "epoch": 0.44053333333333333, + "grad_norm": 0.38590268814176476, + "learning_rate": 0.00012395737842592995, + "loss": 0.7055, + "step": 826 + }, + { + "epoch": 0.44106666666666666, + "grad_norm": 0.39839738685156506, + "learning_rate": 0.000123789570258743, + "loss": 0.6476, + "step": 827 + }, + { + "epoch": 0.4416, + "grad_norm": 0.4560486773170303, + "learning_rate": 0.00012362169105228826, + "loss": 0.7591, + "step": 828 + }, + { + "epoch": 0.4421333333333333, + "grad_norm": 0.41140782608567117, + "learning_rate": 0.00012345374130787854, + "loss": 0.6936, + "step": 829 + }, + { + "epoch": 0.44266666666666665, + "grad_norm": 0.43694387286818914, + "learning_rate": 0.00012328572152703725, + "loss": 0.7166, + "step": 830 + }, + { + "epoch": 0.4432, + "grad_norm": 0.43371584027130167, + "learning_rate": 0.000123117632211497, + "loss": 0.7039, + "step": 831 + }, + { + "epoch": 0.4437333333333333, + "grad_norm": 0.42122043137786364, + "learning_rate": 0.00012294947386319794, + "loss": 0.6919, + "step": 832 + }, + { + "epoch": 0.44426666666666664, + "grad_norm": 0.43458846282659935, + "learning_rate": 0.0001227812469842864, + "loss": 0.6906, + "step": 833 + }, + { + "epoch": 0.4448, + "grad_norm": 0.3964420694030289, + "learning_rate": 0.00012261295207711346, + "loss": 0.6506, + "step": 834 + }, + { + "epoch": 0.44533333333333336, + "grad_norm": 0.4522492130358685, + "learning_rate": 0.00012244458964423327, + "loss": 0.6737, + "step": 835 + }, + { + "epoch": 0.4458666666666667, + "grad_norm": 0.4414585602518746, + "learning_rate": 0.00012227616018840154, + "loss": 0.8038, + "step": 836 + }, + { + "epoch": 0.4464, + "grad_norm": 0.39697720943345677, + "learning_rate": 0.0001221076642125742, + "loss": 0.7218, + "step": 837 + }, + { + "epoch": 0.44693333333333335, + "grad_norm": 0.4639089157109346, + "learning_rate": 0.00012193910221990581, + "loss": 0.7024, + "step": 838 + }, + { + "epoch": 0.4474666666666667, + "grad_norm": 0.4252895100417365, + "learning_rate": 0.00012177047471374807, + "loss": 0.7765, + "step": 839 + }, + { + "epoch": 0.448, + "grad_norm": 0.428348675853124, + "learning_rate": 0.00012160178219764837, + "loss": 0.6838, + "step": 840 + }, + { + "epoch": 0.44853333333333334, + "grad_norm": 0.42427234741851616, + "learning_rate": 0.0001214330251753481, + "loss": 0.6729, + "step": 841 + }, + { + "epoch": 0.44906666666666667, + "grad_norm": 0.44690930235755943, + "learning_rate": 0.00012126420415078132, + "loss": 0.6534, + "step": 842 + }, + { + "epoch": 0.4496, + "grad_norm": 0.43840045498549246, + "learning_rate": 0.00012109531962807332, + "loss": 0.7117, + "step": 843 + }, + { + "epoch": 0.45013333333333333, + "grad_norm": 0.44494519520031595, + "learning_rate": 0.00012092637211153885, + "loss": 0.6616, + "step": 844 + }, + { + "epoch": 0.45066666666666666, + "grad_norm": 0.37584479532435616, + "learning_rate": 0.0001207573621056809, + "loss": 0.6595, + "step": 845 + }, + { + "epoch": 0.4512, + "grad_norm": 0.416988876074108, + "learning_rate": 0.00012058829011518896, + "loss": 0.7148, + "step": 846 + }, + { + "epoch": 0.4517333333333333, + "grad_norm": 0.448417536573754, + "learning_rate": 0.00012041915664493761, + "loss": 0.6679, + "step": 847 + }, + { + "epoch": 0.45226666666666665, + "grad_norm": 0.4112498483095486, + "learning_rate": 0.00012024996219998517, + "loss": 0.7651, + "step": 848 + }, + { + "epoch": 0.4528, + "grad_norm": 0.3811467925411851, + "learning_rate": 0.00012008070728557186, + "loss": 0.6415, + "step": 849 + }, + { + "epoch": 0.4533333333333333, + "grad_norm": 0.40735345211114443, + "learning_rate": 0.00011991139240711857, + "loss": 0.7183, + "step": 850 + }, + { + "epoch": 0.45386666666666664, + "grad_norm": 0.365712004883673, + "learning_rate": 0.00011974201807022525, + "loss": 0.6531, + "step": 851 + }, + { + "epoch": 0.4544, + "grad_norm": 0.3399382117782691, + "learning_rate": 0.00011957258478066931, + "loss": 0.6058, + "step": 852 + }, + { + "epoch": 0.45493333333333336, + "grad_norm": 0.4379389015780378, + "learning_rate": 0.00011940309304440433, + "loss": 0.7172, + "step": 853 + }, + { + "epoch": 0.4554666666666667, + "grad_norm": 0.4369172442435986, + "learning_rate": 0.00011923354336755835, + "loss": 0.7079, + "step": 854 + }, + { + "epoch": 0.456, + "grad_norm": 0.45164358615309574, + "learning_rate": 0.00011906393625643244, + "loss": 0.7473, + "step": 855 + }, + { + "epoch": 0.45653333333333335, + "grad_norm": 0.3659976138113478, + "learning_rate": 0.00011889427221749916, + "loss": 0.722, + "step": 856 + }, + { + "epoch": 0.4570666666666667, + "grad_norm": 0.3777699951256542, + "learning_rate": 0.00011872455175740112, + "loss": 0.6876, + "step": 857 + }, + { + "epoch": 0.4576, + "grad_norm": 0.5690609248953112, + "learning_rate": 0.00011855477538294935, + "loss": 0.8172, + "step": 858 + }, + { + "epoch": 0.45813333333333334, + "grad_norm": 0.5600529431049095, + "learning_rate": 0.00011838494360112185, + "loss": 0.7175, + "step": 859 + }, + { + "epoch": 0.45866666666666667, + "grad_norm": 0.38616115760346603, + "learning_rate": 0.00011821505691906216, + "loss": 0.6684, + "step": 860 + }, + { + "epoch": 0.4592, + "grad_norm": 0.37757205127412863, + "learning_rate": 0.00011804511584407763, + "loss": 0.6449, + "step": 861 + }, + { + "epoch": 0.4597333333333333, + "grad_norm": 0.4227552376754661, + "learning_rate": 0.00011787512088363817, + "loss": 0.6942, + "step": 862 + }, + { + "epoch": 0.46026666666666666, + "grad_norm": 0.37636659984609294, + "learning_rate": 0.00011770507254537453, + "loss": 0.6686, + "step": 863 + }, + { + "epoch": 0.4608, + "grad_norm": 0.42958155859538183, + "learning_rate": 0.00011753497133707679, + "loss": 0.6565, + "step": 864 + }, + { + "epoch": 0.4613333333333333, + "grad_norm": 0.4676737883015861, + "learning_rate": 0.00011736481776669306, + "loss": 0.6748, + "step": 865 + }, + { + "epoch": 0.46186666666666665, + "grad_norm": 0.4298123565151547, + "learning_rate": 0.00011719461234232764, + "loss": 0.689, + "step": 866 + }, + { + "epoch": 0.4624, + "grad_norm": 0.38712485850856665, + "learning_rate": 0.00011702435557223987, + "loss": 0.6774, + "step": 867 + }, + { + "epoch": 0.4629333333333333, + "grad_norm": 0.4538316685132117, + "learning_rate": 0.00011685404796484225, + "loss": 0.7661, + "step": 868 + }, + { + "epoch": 0.4634666666666667, + "grad_norm": 0.3902900523149195, + "learning_rate": 0.00011668369002869912, + "loss": 0.7253, + "step": 869 + }, + { + "epoch": 0.464, + "grad_norm": 0.4549690943672534, + "learning_rate": 0.00011651328227252517, + "loss": 0.7087, + "step": 870 + }, + { + "epoch": 0.46453333333333335, + "grad_norm": 0.37214768787751934, + "learning_rate": 0.00011634282520518383, + "loss": 0.6301, + "step": 871 + }, + { + "epoch": 0.4650666666666667, + "grad_norm": 0.4197220669283357, + "learning_rate": 0.00011617231933568578, + "loss": 0.6499, + "step": 872 + }, + { + "epoch": 0.4656, + "grad_norm": 0.4573007121363443, + "learning_rate": 0.00011600176517318741, + "loss": 0.709, + "step": 873 + }, + { + "epoch": 0.46613333333333334, + "grad_norm": 0.3750780611770427, + "learning_rate": 0.00011583116322698935, + "loss": 0.6437, + "step": 874 + }, + { + "epoch": 0.4666666666666667, + "grad_norm": 0.42598393273853435, + "learning_rate": 0.00011566051400653486, + "loss": 0.6453, + "step": 875 + }, + { + "epoch": 0.4672, + "grad_norm": 0.4043634490015629, + "learning_rate": 0.00011548981802140848, + "loss": 0.7175, + "step": 876 + }, + { + "epoch": 0.46773333333333333, + "grad_norm": 0.4100748462912414, + "learning_rate": 0.00011531907578133429, + "loss": 0.7281, + "step": 877 + }, + { + "epoch": 0.46826666666666666, + "grad_norm": 0.4083345582958824, + "learning_rate": 0.00011514828779617459, + "loss": 0.6993, + "step": 878 + }, + { + "epoch": 0.4688, + "grad_norm": 0.5386085667416355, + "learning_rate": 0.00011497745457592816, + "loss": 0.7671, + "step": 879 + }, + { + "epoch": 0.4693333333333333, + "grad_norm": 0.5047367466998479, + "learning_rate": 0.00011480657663072896, + "loss": 0.7901, + "step": 880 + }, + { + "epoch": 0.46986666666666665, + "grad_norm": 0.4082332890498897, + "learning_rate": 0.00011463565447084445, + "loss": 0.7327, + "step": 881 + }, + { + "epoch": 0.4704, + "grad_norm": 0.3957954699780059, + "learning_rate": 0.00011446468860667421, + "loss": 0.6939, + "step": 882 + }, + { + "epoch": 0.4709333333333333, + "grad_norm": 0.4196723976406624, + "learning_rate": 0.00011429367954874819, + "loss": 0.7432, + "step": 883 + }, + { + "epoch": 0.47146666666666665, + "grad_norm": 0.39789366030622275, + "learning_rate": 0.0001141226278077254, + "loss": 0.6599, + "step": 884 + }, + { + "epoch": 0.472, + "grad_norm": 0.4066334113966666, + "learning_rate": 0.00011395153389439233, + "loss": 0.6858, + "step": 885 + }, + { + "epoch": 0.47253333333333336, + "grad_norm": 0.426114928047437, + "learning_rate": 0.00011378039831966134, + "loss": 0.695, + "step": 886 + }, + { + "epoch": 0.4730666666666667, + "grad_norm": 0.3657732022256318, + "learning_rate": 0.00011360922159456928, + "loss": 0.6794, + "step": 887 + }, + { + "epoch": 0.4736, + "grad_norm": 0.44820333186437933, + "learning_rate": 0.00011343800423027582, + "loss": 0.6628, + "step": 888 + }, + { + "epoch": 0.47413333333333335, + "grad_norm": 0.4191481015202587, + "learning_rate": 0.00011326674673806195, + "loss": 0.7187, + "step": 889 + }, + { + "epoch": 0.4746666666666667, + "grad_norm": 0.4211627256751037, + "learning_rate": 0.00011309544962932862, + "loss": 0.7215, + "step": 890 + }, + { + "epoch": 0.4752, + "grad_norm": 0.45212699420089375, + "learning_rate": 0.0001129241134155949, + "loss": 0.7439, + "step": 891 + }, + { + "epoch": 0.47573333333333334, + "grad_norm": 0.3838447013917968, + "learning_rate": 0.00011275273860849684, + "loss": 0.6623, + "step": 892 + }, + { + "epoch": 0.47626666666666667, + "grad_norm": 0.4618605125531973, + "learning_rate": 0.00011258132571978555, + "loss": 0.699, + "step": 893 + }, + { + "epoch": 0.4768, + "grad_norm": 0.36043551834248494, + "learning_rate": 0.00011240987526132594, + "loss": 0.6604, + "step": 894 + }, + { + "epoch": 0.47733333333333333, + "grad_norm": 0.4366509150795268, + "learning_rate": 0.00011223838774509514, + "loss": 0.675, + "step": 895 + }, + { + "epoch": 0.47786666666666666, + "grad_norm": 0.37824659917516085, + "learning_rate": 0.00011206686368318086, + "loss": 0.6837, + "step": 896 + }, + { + "epoch": 0.4784, + "grad_norm": 0.38607101421346063, + "learning_rate": 0.00011189530358778005, + "loss": 0.6727, + "step": 897 + }, + { + "epoch": 0.4789333333333333, + "grad_norm": 0.4170767756378652, + "learning_rate": 0.00011172370797119712, + "loss": 0.6756, + "step": 898 + }, + { + "epoch": 0.47946666666666665, + "grad_norm": 0.4192517271303245, + "learning_rate": 0.00011155207734584263, + "loss": 0.6877, + "step": 899 + }, + { + "epoch": 0.48, + "grad_norm": 0.3883207568038762, + "learning_rate": 0.00011138041222423177, + "loss": 0.6879, + "step": 900 + }, + { + "epoch": 0.4805333333333333, + "grad_norm": 0.43154513623271423, + "learning_rate": 0.00011120871311898254, + "loss": 0.6631, + "step": 901 + }, + { + "epoch": 0.48106666666666664, + "grad_norm": 0.41906673225970287, + "learning_rate": 0.0001110369805428146, + "loss": 0.6692, + "step": 902 + }, + { + "epoch": 0.4816, + "grad_norm": 0.42328968585937776, + "learning_rate": 0.00011086521500854745, + "loss": 0.6827, + "step": 903 + }, + { + "epoch": 0.48213333333333336, + "grad_norm": 0.4059000813630314, + "learning_rate": 0.0001106934170290991, + "loss": 0.6998, + "step": 904 + }, + { + "epoch": 0.4826666666666667, + "grad_norm": 0.4246838021330245, + "learning_rate": 0.00011052158711748434, + "loss": 0.6727, + "step": 905 + }, + { + "epoch": 0.4832, + "grad_norm": 0.41161990655189423, + "learning_rate": 0.00011034972578681338, + "loss": 0.7129, + "step": 906 + }, + { + "epoch": 0.48373333333333335, + "grad_norm": 0.3830816169496887, + "learning_rate": 0.00011017783355029026, + "loss": 0.6859, + "step": 907 + }, + { + "epoch": 0.4842666666666667, + "grad_norm": 0.3474729385730149, + "learning_rate": 0.00011000591092121127, + "loss": 0.6747, + "step": 908 + }, + { + "epoch": 0.4848, + "grad_norm": 0.4336246830572765, + "learning_rate": 0.00010983395841296348, + "loss": 0.7288, + "step": 909 + }, + { + "epoch": 0.48533333333333334, + "grad_norm": 0.4665382299598991, + "learning_rate": 0.0001096619765390232, + "loss": 0.7506, + "step": 910 + }, + { + "epoch": 0.48586666666666667, + "grad_norm": 0.368425197903905, + "learning_rate": 0.00010948996581295436, + "loss": 0.6389, + "step": 911 + }, + { + "epoch": 0.4864, + "grad_norm": 0.6781212965291331, + "learning_rate": 0.00010931792674840718, + "loss": 0.835, + "step": 912 + }, + { + "epoch": 0.48693333333333333, + "grad_norm": 0.4054103481988646, + "learning_rate": 0.00010914585985911632, + "loss": 0.6339, + "step": 913 + }, + { + "epoch": 0.48746666666666666, + "grad_norm": 0.4205111294204396, + "learning_rate": 0.00010897376565889971, + "loss": 0.7192, + "step": 914 + }, + { + "epoch": 0.488, + "grad_norm": 0.39019261778083597, + "learning_rate": 0.00010880164466165674, + "loss": 0.6808, + "step": 915 + }, + { + "epoch": 0.4885333333333333, + "grad_norm": 0.4834038492568575, + "learning_rate": 0.00010862949738136681, + "loss": 0.7897, + "step": 916 + }, + { + "epoch": 0.48906666666666665, + "grad_norm": 0.3999334184659683, + "learning_rate": 0.00010845732433208779, + "loss": 0.6131, + "step": 917 + }, + { + "epoch": 0.4896, + "grad_norm": 0.3907108326637154, + "learning_rate": 0.00010828512602795462, + "loss": 0.7158, + "step": 918 + }, + { + "epoch": 0.4901333333333333, + "grad_norm": 0.36783800739417705, + "learning_rate": 0.00010811290298317755, + "loss": 0.6646, + "step": 919 + }, + { + "epoch": 0.49066666666666664, + "grad_norm": 0.38766088148844946, + "learning_rate": 0.00010794065571204072, + "loss": 0.6523, + "step": 920 + }, + { + "epoch": 0.4912, + "grad_norm": 0.4088791170143303, + "learning_rate": 0.00010776838472890065, + "loss": 0.6498, + "step": 921 + }, + { + "epoch": 0.49173333333333336, + "grad_norm": 0.46091962482195553, + "learning_rate": 0.00010759609054818458, + "loss": 0.7326, + "step": 922 + }, + { + "epoch": 0.4922666666666667, + "grad_norm": 0.44779596810688416, + "learning_rate": 0.00010742377368438914, + "loss": 0.6948, + "step": 923 + }, + { + "epoch": 0.4928, + "grad_norm": 0.4832443428848463, + "learning_rate": 0.00010725143465207867, + "loss": 0.7584, + "step": 924 + }, + { + "epoch": 0.49333333333333335, + "grad_norm": 0.5520058904988282, + "learning_rate": 0.00010707907396588361, + "loss": 0.7798, + "step": 925 + }, + { + "epoch": 0.4938666666666667, + "grad_norm": 0.3983116907161026, + "learning_rate": 0.0001069066921404992, + "loss": 0.6778, + "step": 926 + }, + { + "epoch": 0.4944, + "grad_norm": 0.44715659040357375, + "learning_rate": 0.00010673428969068364, + "loss": 0.7322, + "step": 927 + }, + { + "epoch": 0.49493333333333334, + "grad_norm": 0.3797034213767869, + "learning_rate": 0.00010656186713125689, + "loss": 0.7212, + "step": 928 + }, + { + "epoch": 0.49546666666666667, + "grad_norm": 0.4820125235481935, + "learning_rate": 0.0001063894249770989, + "loss": 0.7215, + "step": 929 + }, + { + "epoch": 0.496, + "grad_norm": 0.4606848863956204, + "learning_rate": 0.00010621696374314807, + "loss": 0.7216, + "step": 930 + }, + { + "epoch": 0.4965333333333333, + "grad_norm": 0.35579528211098854, + "learning_rate": 0.00010604448394439983, + "loss": 0.624, + "step": 931 + }, + { + "epoch": 0.49706666666666666, + "grad_norm": 0.41696230084871505, + "learning_rate": 0.00010587198609590505, + "loss": 0.7461, + "step": 932 + }, + { + "epoch": 0.4976, + "grad_norm": 0.41051001450779595, + "learning_rate": 0.00010569947071276847, + "loss": 0.6704, + "step": 933 + }, + { + "epoch": 0.4981333333333333, + "grad_norm": 0.4704820800628924, + "learning_rate": 0.00010552693831014726, + "loss": 0.68, + "step": 934 + }, + { + "epoch": 0.49866666666666665, + "grad_norm": 0.3643874726929824, + "learning_rate": 0.0001053543894032493, + "loss": 0.6765, + "step": 935 + }, + { + "epoch": 0.4992, + "grad_norm": 0.389512071010762, + "learning_rate": 0.00010518182450733186, + "loss": 0.6597, + "step": 936 + }, + { + "epoch": 0.4997333333333333, + "grad_norm": 0.41278035073042324, + "learning_rate": 0.00010500924413769988, + "loss": 0.6714, + "step": 937 + }, + { + "epoch": 0.5002666666666666, + "grad_norm": 0.4436583315702858, + "learning_rate": 0.00010483664880970457, + "loss": 0.7382, + "step": 938 + }, + { + "epoch": 0.5008, + "grad_norm": 0.4109632758758607, + "learning_rate": 0.00010466403903874176, + "loss": 0.6692, + "step": 939 + }, + { + "epoch": 0.5013333333333333, + "grad_norm": 0.430103663623662, + "learning_rate": 0.00010449141534025045, + "loss": 0.6749, + "step": 940 + }, + { + "epoch": 0.5018666666666667, + "grad_norm": 0.4466133002880243, + "learning_rate": 0.00010431877822971117, + "loss": 0.6864, + "step": 941 + }, + { + "epoch": 0.5024, + "grad_norm": 0.5412950980017701, + "learning_rate": 0.00010414612822264455, + "loss": 0.7182, + "step": 942 + }, + { + "epoch": 0.5029333333333333, + "grad_norm": 0.4406866731902667, + "learning_rate": 0.00010397346583460971, + "loss": 0.7105, + "step": 943 + }, + { + "epoch": 0.5034666666666666, + "grad_norm": 0.40633350588685896, + "learning_rate": 0.0001038007915812028, + "loss": 0.6471, + "step": 944 + }, + { + "epoch": 0.504, + "grad_norm": 0.42005885337753446, + "learning_rate": 0.00010362810597805526, + "loss": 0.6788, + "step": 945 + }, + { + "epoch": 0.5045333333333333, + "grad_norm": 0.41525024156942714, + "learning_rate": 0.0001034554095408326, + "loss": 0.6532, + "step": 946 + }, + { + "epoch": 0.5050666666666667, + "grad_norm": 0.3599678570866186, + "learning_rate": 0.00010328270278523256, + "loss": 0.6282, + "step": 947 + }, + { + "epoch": 0.5056, + "grad_norm": 0.4219733547454359, + "learning_rate": 0.0001031099862269837, + "loss": 0.6445, + "step": 948 + }, + { + "epoch": 0.5061333333333333, + "grad_norm": 0.3693070442514145, + "learning_rate": 0.00010293726038184393, + "loss": 0.6684, + "step": 949 + }, + { + "epoch": 0.5066666666666667, + "grad_norm": 0.40317218230251783, + "learning_rate": 0.00010276452576559879, + "loss": 0.6847, + "step": 950 + }, + { + "epoch": 0.5072, + "grad_norm": 0.3667600449620981, + "learning_rate": 0.00010259178289406011, + "loss": 0.6463, + "step": 951 + }, + { + "epoch": 0.5077333333333334, + "grad_norm": 0.4575459676412633, + "learning_rate": 0.00010241903228306431, + "loss": 0.7436, + "step": 952 + }, + { + "epoch": 0.5082666666666666, + "grad_norm": 0.4143052409106989, + "learning_rate": 0.0001022462744484709, + "loss": 0.6948, + "step": 953 + }, + { + "epoch": 0.5088, + "grad_norm": 0.5551592758193366, + "learning_rate": 0.00010207350990616107, + "loss": 0.8077, + "step": 954 + }, + { + "epoch": 0.5093333333333333, + "grad_norm": 0.36552580644611515, + "learning_rate": 0.00010190073917203589, + "loss": 0.6584, + "step": 955 + }, + { + "epoch": 0.5098666666666667, + "grad_norm": 0.44283388291390147, + "learning_rate": 0.00010172796276201503, + "loss": 0.6412, + "step": 956 + }, + { + "epoch": 0.5104, + "grad_norm": 0.3989293668128008, + "learning_rate": 0.0001015551811920351, + "loss": 0.6487, + "step": 957 + }, + { + "epoch": 0.5109333333333334, + "grad_norm": 0.3776659622179366, + "learning_rate": 0.00010138239497804804, + "loss": 0.6177, + "step": 958 + }, + { + "epoch": 0.5114666666666666, + "grad_norm": 0.4196106291201286, + "learning_rate": 0.00010120960463601976, + "loss": 0.7107, + "step": 959 + }, + { + "epoch": 0.512, + "grad_norm": 0.4356598894725207, + "learning_rate": 0.00010103681068192845, + "loss": 0.6155, + "step": 960 + }, + { + "epoch": 0.5125333333333333, + "grad_norm": 0.48373090615016834, + "learning_rate": 0.00010086401363176305, + "loss": 0.6167, + "step": 961 + }, + { + "epoch": 0.5130666666666667, + "grad_norm": 0.3961007416470711, + "learning_rate": 0.00010069121400152181, + "loss": 0.6412, + "step": 962 + }, + { + "epoch": 0.5136, + "grad_norm": 0.38454925146692004, + "learning_rate": 0.00010051841230721065, + "loss": 0.6678, + "step": 963 + }, + { + "epoch": 0.5141333333333333, + "grad_norm": 0.4929824910516295, + "learning_rate": 0.0001003456090648416, + "loss": 0.724, + "step": 964 + }, + { + "epoch": 0.5146666666666667, + "grad_norm": 0.4124728750458972, + "learning_rate": 0.00010017280479043147, + "loss": 0.644, + "step": 965 + }, + { + "epoch": 0.5152, + "grad_norm": 0.4081739368224594, + "learning_rate": 0.0001, + "loss": 0.6731, + "step": 966 + }, + { + "epoch": 0.5157333333333334, + "grad_norm": 0.41193118141534857, + "learning_rate": 9.982719520956855e-05, + "loss": 0.6871, + "step": 967 + }, + { + "epoch": 0.5162666666666667, + "grad_norm": 0.40752089452997164, + "learning_rate": 9.965439093515841e-05, + "loss": 0.6802, + "step": 968 + }, + { + "epoch": 0.5168, + "grad_norm": 0.4527288918603455, + "learning_rate": 9.948158769278939e-05, + "loss": 0.6869, + "step": 969 + }, + { + "epoch": 0.5173333333333333, + "grad_norm": 0.40886209920259825, + "learning_rate": 9.930878599847821e-05, + "loss": 0.6658, + "step": 970 + }, + { + "epoch": 0.5178666666666667, + "grad_norm": 0.3767957878760789, + "learning_rate": 9.913598636823693e-05, + "loss": 0.6953, + "step": 971 + }, + { + "epoch": 0.5184, + "grad_norm": 0.3913033172556406, + "learning_rate": 9.896318931807155e-05, + "loss": 0.6215, + "step": 972 + }, + { + "epoch": 0.5189333333333334, + "grad_norm": 0.3811126425119513, + "learning_rate": 9.879039536398024e-05, + "loss": 0.6931, + "step": 973 + }, + { + "epoch": 0.5194666666666666, + "grad_norm": 0.4046478317760797, + "learning_rate": 9.861760502195197e-05, + "loss": 0.6852, + "step": 974 + }, + { + "epoch": 0.52, + "grad_norm": 0.3855872010526326, + "learning_rate": 9.844481880796491e-05, + "loss": 0.6597, + "step": 975 + }, + { + "epoch": 0.5205333333333333, + "grad_norm": 0.433739458764419, + "learning_rate": 9.827203723798498e-05, + "loss": 0.6829, + "step": 976 + }, + { + "epoch": 0.5210666666666667, + "grad_norm": 0.40624916566457947, + "learning_rate": 9.809926082796415e-05, + "loss": 0.7005, + "step": 977 + }, + { + "epoch": 0.5216, + "grad_norm": 0.45049264698191427, + "learning_rate": 9.792649009383899e-05, + "loss": 0.7653, + "step": 978 + }, + { + "epoch": 0.5221333333333333, + "grad_norm": 0.3537365765479179, + "learning_rate": 9.775372555152912e-05, + "loss": 0.6309, + "step": 979 + }, + { + "epoch": 0.5226666666666666, + "grad_norm": 0.34679338832774076, + "learning_rate": 9.758096771693573e-05, + "loss": 0.5918, + "step": 980 + }, + { + "epoch": 0.5232, + "grad_norm": 0.4130211561321415, + "learning_rate": 9.740821710593989e-05, + "loss": 0.671, + "step": 981 + }, + { + "epoch": 0.5237333333333334, + "grad_norm": 0.39958354895664644, + "learning_rate": 9.723547423440122e-05, + "loss": 0.6542, + "step": 982 + }, + { + "epoch": 0.5242666666666667, + "grad_norm": 0.3933026033310428, + "learning_rate": 9.70627396181561e-05, + "loss": 0.719, + "step": 983 + }, + { + "epoch": 0.5248, + "grad_norm": 0.4187629179794375, + "learning_rate": 9.689001377301633e-05, + "loss": 0.665, + "step": 984 + }, + { + "epoch": 0.5253333333333333, + "grad_norm": 0.3512871037451522, + "learning_rate": 9.671729721476746e-05, + "loss": 0.5699, + "step": 985 + }, + { + "epoch": 0.5258666666666667, + "grad_norm": 0.43032520290837295, + "learning_rate": 9.654459045916743e-05, + "loss": 0.7252, + "step": 986 + }, + { + "epoch": 0.5264, + "grad_norm": 0.42372236008862924, + "learning_rate": 9.637189402194476e-05, + "loss": 0.7716, + "step": 987 + }, + { + "epoch": 0.5269333333333334, + "grad_norm": 0.4500511015150906, + "learning_rate": 9.619920841879725e-05, + "loss": 0.702, + "step": 988 + }, + { + "epoch": 0.5274666666666666, + "grad_norm": 0.4369864297812008, + "learning_rate": 9.602653416539031e-05, + "loss": 0.7133, + "step": 989 + }, + { + "epoch": 0.528, + "grad_norm": 0.42738110216480996, + "learning_rate": 9.585387177735547e-05, + "loss": 0.7108, + "step": 990 + }, + { + "epoch": 0.5285333333333333, + "grad_norm": 0.4546254298926973, + "learning_rate": 9.568122177028884e-05, + "loss": 0.7021, + "step": 991 + }, + { + "epoch": 0.5290666666666667, + "grad_norm": 0.47534555261597616, + "learning_rate": 9.550858465974958e-05, + "loss": 0.7792, + "step": 992 + }, + { + "epoch": 0.5296, + "grad_norm": 0.35009175970476775, + "learning_rate": 9.533596096125825e-05, + "loss": 0.6577, + "step": 993 + }, + { + "epoch": 0.5301333333333333, + "grad_norm": 0.4146495775983063, + "learning_rate": 9.516335119029546e-05, + "loss": 0.6847, + "step": 994 + }, + { + "epoch": 0.5306666666666666, + "grad_norm": 0.35274495325837385, + "learning_rate": 9.499075586230013e-05, + "loss": 0.626, + "step": 995 + }, + { + "epoch": 0.5312, + "grad_norm": 0.3946345924588137, + "learning_rate": 9.481817549266817e-05, + "loss": 0.7274, + "step": 996 + }, + { + "epoch": 0.5317333333333333, + "grad_norm": 0.40445474619161476, + "learning_rate": 9.464561059675073e-05, + "loss": 0.6666, + "step": 997 + }, + { + "epoch": 0.5322666666666667, + "grad_norm": 0.4420847337796016, + "learning_rate": 9.44730616898528e-05, + "loss": 0.6106, + "step": 998 + }, + { + "epoch": 0.5328, + "grad_norm": 0.5086383552008388, + "learning_rate": 9.430052928723153e-05, + "loss": 0.7165, + "step": 999 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 0.37050778943519386, + "learning_rate": 9.412801390409497e-05, + "loss": 0.6512, + "step": 1000 + }, + { + "epoch": 0.5338666666666667, + "grad_norm": 0.43013196569499396, + "learning_rate": 9.395551605560018e-05, + "loss": 0.7179, + "step": 1001 + }, + { + "epoch": 0.5344, + "grad_norm": 0.4578202856490542, + "learning_rate": 9.378303625685195e-05, + "loss": 0.7517, + "step": 1002 + }, + { + "epoch": 0.5349333333333334, + "grad_norm": 0.4495694920653057, + "learning_rate": 9.361057502290113e-05, + "loss": 0.7634, + "step": 1003 + }, + { + "epoch": 0.5354666666666666, + "grad_norm": 0.5851523217246388, + "learning_rate": 9.343813286874312e-05, + "loss": 0.735, + "step": 1004 + }, + { + "epoch": 0.536, + "grad_norm": 0.4621421543442429, + "learning_rate": 9.326571030931637e-05, + "loss": 0.6984, + "step": 1005 + }, + { + "epoch": 0.5365333333333333, + "grad_norm": 0.4168786941520585, + "learning_rate": 9.309330785950086e-05, + "loss": 0.7452, + "step": 1006 + }, + { + "epoch": 0.5370666666666667, + "grad_norm": 0.40325108999711745, + "learning_rate": 9.292092603411641e-05, + "loss": 0.6807, + "step": 1007 + }, + { + "epoch": 0.5376, + "grad_norm": 0.3727693370035713, + "learning_rate": 9.274856534792138e-05, + "loss": 0.651, + "step": 1008 + }, + { + "epoch": 0.5381333333333334, + "grad_norm": 0.44769379418292204, + "learning_rate": 9.257622631561085e-05, + "loss": 0.699, + "step": 1009 + }, + { + "epoch": 0.5386666666666666, + "grad_norm": 0.5111125142072811, + "learning_rate": 9.240390945181543e-05, + "loss": 0.6924, + "step": 1010 + }, + { + "epoch": 0.5392, + "grad_norm": 0.4251550782857545, + "learning_rate": 9.223161527109937e-05, + "loss": 0.7157, + "step": 1011 + }, + { + "epoch": 0.5397333333333333, + "grad_norm": 0.3769181059561603, + "learning_rate": 9.205934428795929e-05, + "loss": 0.6644, + "step": 1012 + }, + { + "epoch": 0.5402666666666667, + "grad_norm": 0.45443832424297664, + "learning_rate": 9.188709701682247e-05, + "loss": 0.7169, + "step": 1013 + }, + { + "epoch": 0.5408, + "grad_norm": 0.4511348245112982, + "learning_rate": 9.171487397204539e-05, + "loss": 0.7126, + "step": 1014 + }, + { + "epoch": 0.5413333333333333, + "grad_norm": 0.4141737647376624, + "learning_rate": 9.154267566791223e-05, + "loss": 0.633, + "step": 1015 + }, + { + "epoch": 0.5418666666666667, + "grad_norm": 0.44494223441210634, + "learning_rate": 9.137050261863324e-05, + "loss": 0.6231, + "step": 1016 + }, + { + "epoch": 0.5424, + "grad_norm": 0.3946276691986154, + "learning_rate": 9.119835533834331e-05, + "loss": 0.6472, + "step": 1017 + }, + { + "epoch": 0.5429333333333334, + "grad_norm": 0.4421635577070011, + "learning_rate": 9.102623434110028e-05, + "loss": 0.6987, + "step": 1018 + }, + { + "epoch": 0.5434666666666667, + "grad_norm": 0.4266538788112514, + "learning_rate": 9.085414014088369e-05, + "loss": 0.7181, + "step": 1019 + }, + { + "epoch": 0.544, + "grad_norm": 0.39834575331728844, + "learning_rate": 9.068207325159284e-05, + "loss": 0.6221, + "step": 1020 + }, + { + "epoch": 0.5445333333333333, + "grad_norm": 0.4717264032877369, + "learning_rate": 9.051003418704565e-05, + "loss": 0.7339, + "step": 1021 + }, + { + "epoch": 0.5450666666666667, + "grad_norm": 0.43136951491275893, + "learning_rate": 9.033802346097682e-05, + "loss": 0.6747, + "step": 1022 + }, + { + "epoch": 0.5456, + "grad_norm": 0.401431272925709, + "learning_rate": 9.016604158703654e-05, + "loss": 0.7016, + "step": 1023 + }, + { + "epoch": 0.5461333333333334, + "grad_norm": 0.4865990810439014, + "learning_rate": 8.999408907878877e-05, + "loss": 0.6874, + "step": 1024 + }, + { + "epoch": 0.5466666666666666, + "grad_norm": 0.447148097712422, + "learning_rate": 8.982216644970979e-05, + "loss": 0.6572, + "step": 1025 + }, + { + "epoch": 0.5472, + "grad_norm": 0.3543191237955773, + "learning_rate": 8.965027421318665e-05, + "loss": 0.6392, + "step": 1026 + }, + { + "epoch": 0.5477333333333333, + "grad_norm": 0.3984007462717862, + "learning_rate": 8.947841288251568e-05, + "loss": 0.6716, + "step": 1027 + }, + { + "epoch": 0.5482666666666667, + "grad_norm": 0.4730393750162832, + "learning_rate": 8.930658297090091e-05, + "loss": 0.6661, + "step": 1028 + }, + { + "epoch": 0.5488, + "grad_norm": 0.42607149707485553, + "learning_rate": 8.913478499145254e-05, + "loss": 0.6913, + "step": 1029 + }, + { + "epoch": 0.5493333333333333, + "grad_norm": 0.4075380632043871, + "learning_rate": 8.896301945718541e-05, + "loss": 0.6374, + "step": 1030 + }, + { + "epoch": 0.5498666666666666, + "grad_norm": 0.4386141817128692, + "learning_rate": 8.879128688101749e-05, + "loss": 0.6773, + "step": 1031 + }, + { + "epoch": 0.5504, + "grad_norm": 0.3528487154856506, + "learning_rate": 8.861958777576827e-05, + "loss": 0.6702, + "step": 1032 + }, + { + "epoch": 0.5509333333333334, + "grad_norm": 0.37987693605543144, + "learning_rate": 8.844792265415738e-05, + "loss": 0.6798, + "step": 1033 + }, + { + "epoch": 0.5514666666666667, + "grad_norm": 0.434132619873488, + "learning_rate": 8.827629202880293e-05, + "loss": 0.6544, + "step": 1034 + }, + { + "epoch": 0.552, + "grad_norm": 0.4271382707165829, + "learning_rate": 8.810469641222001e-05, + "loss": 0.7142, + "step": 1035 + }, + { + "epoch": 0.5525333333333333, + "grad_norm": 0.3605931608885955, + "learning_rate": 8.793313631681915e-05, + "loss": 0.6657, + "step": 1036 + }, + { + "epoch": 0.5530666666666667, + "grad_norm": 0.39329044283106057, + "learning_rate": 8.776161225490489e-05, + "loss": 0.6878, + "step": 1037 + }, + { + "epoch": 0.5536, + "grad_norm": 0.3995556394988917, + "learning_rate": 8.759012473867407e-05, + "loss": 0.7098, + "step": 1038 + }, + { + "epoch": 0.5541333333333334, + "grad_norm": 0.4255688945635287, + "learning_rate": 8.741867428021446e-05, + "loss": 0.6487, + "step": 1039 + }, + { + "epoch": 0.5546666666666666, + "grad_norm": 0.43986765751893153, + "learning_rate": 8.724726139150318e-05, + "loss": 0.6796, + "step": 1040 + }, + { + "epoch": 0.5552, + "grad_norm": 0.39980228004220597, + "learning_rate": 8.707588658440511e-05, + "loss": 0.6828, + "step": 1041 + }, + { + "epoch": 0.5557333333333333, + "grad_norm": 0.3816100710105836, + "learning_rate": 8.690455037067141e-05, + "loss": 0.6751, + "step": 1042 + }, + { + "epoch": 0.5562666666666667, + "grad_norm": 0.4180911252006521, + "learning_rate": 8.673325326193806e-05, + "loss": 0.6411, + "step": 1043 + }, + { + "epoch": 0.5568, + "grad_norm": 0.396115105457432, + "learning_rate": 8.656199576972423e-05, + "loss": 0.6321, + "step": 1044 + }, + { + "epoch": 0.5573333333333333, + "grad_norm": 0.396360727759017, + "learning_rate": 8.639077840543077e-05, + "loss": 0.7039, + "step": 1045 + }, + { + "epoch": 0.5578666666666666, + "grad_norm": 0.37612404333459937, + "learning_rate": 8.621960168033867e-05, + "loss": 0.6406, + "step": 1046 + }, + { + "epoch": 0.5584, + "grad_norm": 0.3749842980949236, + "learning_rate": 8.604846610560771e-05, + "loss": 0.6159, + "step": 1047 + }, + { + "epoch": 0.5589333333333333, + "grad_norm": 0.385694310353475, + "learning_rate": 8.587737219227462e-05, + "loss": 0.6532, + "step": 1048 + }, + { + "epoch": 0.5594666666666667, + "grad_norm": 0.3933784309864641, + "learning_rate": 8.570632045125185e-05, + "loss": 0.6225, + "step": 1049 + }, + { + "epoch": 0.56, + "grad_norm": 0.4003557685321842, + "learning_rate": 8.553531139332582e-05, + "loss": 0.703, + "step": 1050 + }, + { + "epoch": 0.5605333333333333, + "grad_norm": 0.44846692986705655, + "learning_rate": 8.536434552915556e-05, + "loss": 0.6939, + "step": 1051 + }, + { + "epoch": 0.5610666666666667, + "grad_norm": 0.351065732506569, + "learning_rate": 8.519342336927105e-05, + "loss": 0.6578, + "step": 1052 + }, + { + "epoch": 0.5616, + "grad_norm": 0.3840655001604711, + "learning_rate": 8.502254542407186e-05, + "loss": 0.6534, + "step": 1053 + }, + { + "epoch": 0.5621333333333334, + "grad_norm": 0.4057819927263224, + "learning_rate": 8.485171220382545e-05, + "loss": 0.6888, + "step": 1054 + }, + { + "epoch": 0.5626666666666666, + "grad_norm": 0.4539176876201716, + "learning_rate": 8.468092421866573e-05, + "loss": 0.6938, + "step": 1055 + }, + { + "epoch": 0.5632, + "grad_norm": 0.39433805090457724, + "learning_rate": 8.451018197859153e-05, + "loss": 0.6768, + "step": 1056 + }, + { + "epoch": 0.5637333333333333, + "grad_norm": 0.36916749602491866, + "learning_rate": 8.433948599346516e-05, + "loss": 0.6298, + "step": 1057 + }, + { + "epoch": 0.5642666666666667, + "grad_norm": 0.5354961974864721, + "learning_rate": 8.416883677301069e-05, + "loss": 0.7371, + "step": 1058 + }, + { + "epoch": 0.5648, + "grad_norm": 0.3856750948532204, + "learning_rate": 8.399823482681262e-05, + "loss": 0.621, + "step": 1059 + }, + { + "epoch": 0.5653333333333334, + "grad_norm": 0.44416444073599765, + "learning_rate": 8.382768066431425e-05, + "loss": 0.6204, + "step": 1060 + }, + { + "epoch": 0.5658666666666666, + "grad_norm": 0.47608622183479576, + "learning_rate": 8.36571747948162e-05, + "loss": 0.7543, + "step": 1061 + }, + { + "epoch": 0.5664, + "grad_norm": 0.47805091710578973, + "learning_rate": 8.348671772747487e-05, + "loss": 0.6586, + "step": 1062 + }, + { + "epoch": 0.5669333333333333, + "grad_norm": 0.5255071484114089, + "learning_rate": 8.33163099713009e-05, + "loss": 0.7729, + "step": 1063 + }, + { + "epoch": 0.5674666666666667, + "grad_norm": 0.4375427571144395, + "learning_rate": 8.31459520351578e-05, + "loss": 0.6628, + "step": 1064 + }, + { + "epoch": 0.568, + "grad_norm": 0.4052738746215365, + "learning_rate": 8.297564442776014e-05, + "loss": 0.7356, + "step": 1065 + }, + { + "epoch": 0.5685333333333333, + "grad_norm": 0.40261689141324786, + "learning_rate": 8.280538765767235e-05, + "loss": 0.6551, + "step": 1066 + }, + { + "epoch": 0.5690666666666667, + "grad_norm": 0.39752088590860307, + "learning_rate": 8.263518223330697e-05, + "loss": 0.6715, + "step": 1067 + }, + { + "epoch": 0.5696, + "grad_norm": 0.3692470280595934, + "learning_rate": 8.246502866292324e-05, + "loss": 0.6715, + "step": 1068 + }, + { + "epoch": 0.5701333333333334, + "grad_norm": 0.4286308729777144, + "learning_rate": 8.22949274546255e-05, + "loss": 0.6297, + "step": 1069 + }, + { + "epoch": 0.5706666666666667, + "grad_norm": 0.5670954690384703, + "learning_rate": 8.212487911636184e-05, + "loss": 0.7251, + "step": 1070 + }, + { + "epoch": 0.5712, + "grad_norm": 0.4267321456844729, + "learning_rate": 8.195488415592238e-05, + "loss": 0.6814, + "step": 1071 + }, + { + "epoch": 0.5717333333333333, + "grad_norm": 0.38861456786538684, + "learning_rate": 8.178494308093789e-05, + "loss": 0.6271, + "step": 1072 + }, + { + "epoch": 0.5722666666666667, + "grad_norm": 0.38957989693957745, + "learning_rate": 8.161505639887817e-05, + "loss": 0.6861, + "step": 1073 + }, + { + "epoch": 0.5728, + "grad_norm": 0.4398872258989492, + "learning_rate": 8.144522461705067e-05, + "loss": 0.6615, + "step": 1074 + }, + { + "epoch": 0.5733333333333334, + "grad_norm": 0.3598987307930638, + "learning_rate": 8.127544824259889e-05, + "loss": 0.6296, + "step": 1075 + }, + { + "epoch": 0.5738666666666666, + "grad_norm": 0.44545085996388406, + "learning_rate": 8.110572778250085e-05, + "loss": 0.7323, + "step": 1076 + }, + { + "epoch": 0.5744, + "grad_norm": 0.4521438135371745, + "learning_rate": 8.093606374356759e-05, + "loss": 0.6646, + "step": 1077 + }, + { + "epoch": 0.5749333333333333, + "grad_norm": 0.4133136448568273, + "learning_rate": 8.076645663244168e-05, + "loss": 0.7028, + "step": 1078 + }, + { + "epoch": 0.5754666666666667, + "grad_norm": 0.5112361936765261, + "learning_rate": 8.059690695559568e-05, + "loss": 0.7229, + "step": 1079 + }, + { + "epoch": 0.576, + "grad_norm": 0.3608855051180709, + "learning_rate": 8.042741521933071e-05, + "loss": 0.6265, + "step": 1080 + }, + { + "epoch": 0.5765333333333333, + "grad_norm": 0.37581199774305873, + "learning_rate": 8.025798192977481e-05, + "loss": 0.641, + "step": 1081 + }, + { + "epoch": 0.5770666666666666, + "grad_norm": 0.36645220592842015, + "learning_rate": 8.008860759288147e-05, + "loss": 0.6683, + "step": 1082 + }, + { + "epoch": 0.5776, + "grad_norm": 0.327679199766857, + "learning_rate": 7.991929271442817e-05, + "loss": 0.6201, + "step": 1083 + }, + { + "epoch": 0.5781333333333334, + "grad_norm": 0.41461517959529814, + "learning_rate": 7.975003780001485e-05, + "loss": 0.6508, + "step": 1084 + }, + { + "epoch": 0.5786666666666667, + "grad_norm": 0.4596071811646076, + "learning_rate": 7.958084335506239e-05, + "loss": 0.6781, + "step": 1085 + }, + { + "epoch": 0.5792, + "grad_norm": 0.38161433704655506, + "learning_rate": 7.941170988481108e-05, + "loss": 0.6177, + "step": 1086 + }, + { + "epoch": 0.5797333333333333, + "grad_norm": 0.41597134919896617, + "learning_rate": 7.924263789431912e-05, + "loss": 0.669, + "step": 1087 + }, + { + "epoch": 0.5802666666666667, + "grad_norm": 0.36479135825180353, + "learning_rate": 7.907362788846116e-05, + "loss": 0.6433, + "step": 1088 + }, + { + "epoch": 0.5808, + "grad_norm": 0.40702327606263927, + "learning_rate": 7.89046803719267e-05, + "loss": 0.6458, + "step": 1089 + }, + { + "epoch": 0.5813333333333334, + "grad_norm": 0.34467219913279784, + "learning_rate": 7.873579584921869e-05, + "loss": 0.6305, + "step": 1090 + }, + { + "epoch": 0.5818666666666666, + "grad_norm": 0.4073720433870432, + "learning_rate": 7.856697482465196e-05, + "loss": 0.6297, + "step": 1091 + }, + { + "epoch": 0.5824, + "grad_norm": 0.5012422499598385, + "learning_rate": 7.839821780235168e-05, + "loss": 0.6812, + "step": 1092 + }, + { + "epoch": 0.5829333333333333, + "grad_norm": 0.4581412119281737, + "learning_rate": 7.822952528625191e-05, + "loss": 0.6534, + "step": 1093 + }, + { + "epoch": 0.5834666666666667, + "grad_norm": 0.3929723623011824, + "learning_rate": 7.806089778009421e-05, + "loss": 0.692, + "step": 1094 + }, + { + "epoch": 0.584, + "grad_norm": 0.3847513426944313, + "learning_rate": 7.789233578742582e-05, + "loss": 0.651, + "step": 1095 + }, + { + "epoch": 0.5845333333333333, + "grad_norm": 0.37002841059123026, + "learning_rate": 7.772383981159849e-05, + "loss": 0.6352, + "step": 1096 + }, + { + "epoch": 0.5850666666666666, + "grad_norm": 0.4035406759186194, + "learning_rate": 7.755541035576677e-05, + "loss": 0.6741, + "step": 1097 + }, + { + "epoch": 0.5856, + "grad_norm": 0.47409102297656225, + "learning_rate": 7.738704792288655e-05, + "loss": 0.6994, + "step": 1098 + }, + { + "epoch": 0.5861333333333333, + "grad_norm": 0.3905904036526821, + "learning_rate": 7.721875301571359e-05, + "loss": 0.6307, + "step": 1099 + }, + { + "epoch": 0.5866666666666667, + "grad_norm": 0.40843841427179084, + "learning_rate": 7.705052613680211e-05, + "loss": 0.6517, + "step": 1100 + }, + { + "epoch": 0.5872, + "grad_norm": 0.5688501524807306, + "learning_rate": 7.688236778850306e-05, + "loss": 0.7094, + "step": 1101 + }, + { + "epoch": 0.5877333333333333, + "grad_norm": 0.43581669103870135, + "learning_rate": 7.671427847296275e-05, + "loss": 0.7384, + "step": 1102 + }, + { + "epoch": 0.5882666666666667, + "grad_norm": 0.41014916555280057, + "learning_rate": 7.654625869212146e-05, + "loss": 0.6538, + "step": 1103 + }, + { + "epoch": 0.5888, + "grad_norm": 0.3725949082075007, + "learning_rate": 7.637830894771175e-05, + "loss": 0.6651, + "step": 1104 + }, + { + "epoch": 0.5893333333333334, + "grad_norm": 0.36963459874364757, + "learning_rate": 7.6210429741257e-05, + "loss": 0.6866, + "step": 1105 + }, + { + "epoch": 0.5898666666666667, + "grad_norm": 0.3849175928397143, + "learning_rate": 7.604262157407007e-05, + "loss": 0.6724, + "step": 1106 + }, + { + "epoch": 0.5904, + "grad_norm": 0.4581545019985593, + "learning_rate": 7.587488494725157e-05, + "loss": 0.6591, + "step": 1107 + }, + { + "epoch": 0.5909333333333333, + "grad_norm": 0.41913596172880435, + "learning_rate": 7.570722036168854e-05, + "loss": 0.6591, + "step": 1108 + }, + { + "epoch": 0.5914666666666667, + "grad_norm": 0.4737884387010747, + "learning_rate": 7.55396283180529e-05, + "loss": 0.6145, + "step": 1109 + }, + { + "epoch": 0.592, + "grad_norm": 0.370518586779289, + "learning_rate": 7.537210931679987e-05, + "loss": 0.6183, + "step": 1110 + }, + { + "epoch": 0.5925333333333334, + "grad_norm": 0.40280941037817003, + "learning_rate": 7.520466385816671e-05, + "loss": 0.6555, + "step": 1111 + }, + { + "epoch": 0.5930666666666666, + "grad_norm": 0.3646693721554753, + "learning_rate": 7.503729244217086e-05, + "loss": 0.6597, + "step": 1112 + }, + { + "epoch": 0.5936, + "grad_norm": 0.3663775479204596, + "learning_rate": 7.48699955686089e-05, + "loss": 0.6379, + "step": 1113 + }, + { + "epoch": 0.5941333333333333, + "grad_norm": 0.4488435817649355, + "learning_rate": 7.470277373705461e-05, + "loss": 0.6758, + "step": 1114 + }, + { + "epoch": 0.5946666666666667, + "grad_norm": 0.4469665095587836, + "learning_rate": 7.453562744685778e-05, + "loss": 0.6415, + "step": 1115 + }, + { + "epoch": 0.5952, + "grad_norm": 0.38464340027405997, + "learning_rate": 7.43685571971426e-05, + "loss": 0.6508, + "step": 1116 + }, + { + "epoch": 0.5957333333333333, + "grad_norm": 0.45714227358367127, + "learning_rate": 7.42015634868062e-05, + "loss": 0.6618, + "step": 1117 + }, + { + "epoch": 0.5962666666666666, + "grad_norm": 0.38550320196660004, + "learning_rate": 7.403464681451715e-05, + "loss": 0.6569, + "step": 1118 + }, + { + "epoch": 0.5968, + "grad_norm": 0.4515411273502641, + "learning_rate": 7.386780767871397e-05, + "loss": 0.6018, + "step": 1119 + }, + { + "epoch": 0.5973333333333334, + "grad_norm": 0.44006921412224026, + "learning_rate": 7.370104657760361e-05, + "loss": 0.6698, + "step": 1120 + }, + { + "epoch": 0.5978666666666667, + "grad_norm": 0.3812764903976601, + "learning_rate": 7.353436400916004e-05, + "loss": 0.6439, + "step": 1121 + }, + { + "epoch": 0.5984, + "grad_norm": 0.4447058234378918, + "learning_rate": 7.336776047112276e-05, + "loss": 0.6575, + "step": 1122 + }, + { + "epoch": 0.5989333333333333, + "grad_norm": 0.392188011838177, + "learning_rate": 7.320123646099519e-05, + "loss": 0.6589, + "step": 1123 + }, + { + "epoch": 0.5994666666666667, + "grad_norm": 0.38304190214514855, + "learning_rate": 7.303479247604332e-05, + "loss": 0.6556, + "step": 1124 + }, + { + "epoch": 0.6, + "grad_norm": 0.5405090465021215, + "learning_rate": 7.286842901329412e-05, + "loss": 0.712, + "step": 1125 + }, + { + "epoch": 0.6005333333333334, + "grad_norm": 0.3543253711580587, + "learning_rate": 7.270214656953415e-05, + "loss": 0.5963, + "step": 1126 + }, + { + "epoch": 0.6010666666666666, + "grad_norm": 0.5013105388465979, + "learning_rate": 7.253594564130804e-05, + "loss": 0.6463, + "step": 1127 + }, + { + "epoch": 0.6016, + "grad_norm": 0.45059783404888487, + "learning_rate": 7.236982672491698e-05, + "loss": 0.6656, + "step": 1128 + }, + { + "epoch": 0.6021333333333333, + "grad_norm": 0.4330921252061515, + "learning_rate": 7.22037903164173e-05, + "loss": 0.6927, + "step": 1129 + }, + { + "epoch": 0.6026666666666667, + "grad_norm": 0.6987513883069537, + "learning_rate": 7.203783691161883e-05, + "loss": 0.6964, + "step": 1130 + }, + { + "epoch": 0.6032, + "grad_norm": 0.3816638781520124, + "learning_rate": 7.187196700608373e-05, + "loss": 0.6299, + "step": 1131 + }, + { + "epoch": 0.6037333333333333, + "grad_norm": 0.3751370995542225, + "learning_rate": 7.170618109512465e-05, + "loss": 0.6594, + "step": 1132 + }, + { + "epoch": 0.6042666666666666, + "grad_norm": 0.4299281410382147, + "learning_rate": 7.154047967380354e-05, + "loss": 0.7386, + "step": 1133 + }, + { + "epoch": 0.6048, + "grad_norm": 0.34164488022808387, + "learning_rate": 7.137486323692995e-05, + "loss": 0.6669, + "step": 1134 + }, + { + "epoch": 0.6053333333333333, + "grad_norm": 0.39787968534787277, + "learning_rate": 7.12093322790597e-05, + "loss": 0.6836, + "step": 1135 + }, + { + "epoch": 0.6058666666666667, + "grad_norm": 0.39106954459068427, + "learning_rate": 7.104388729449338e-05, + "loss": 0.6351, + "step": 1136 + }, + { + "epoch": 0.6064, + "grad_norm": 0.3947541795328532, + "learning_rate": 7.087852877727481e-05, + "loss": 0.6774, + "step": 1137 + }, + { + "epoch": 0.6069333333333333, + "grad_norm": 0.4213082829415851, + "learning_rate": 7.071325722118963e-05, + "loss": 0.642, + "step": 1138 + }, + { + "epoch": 0.6074666666666667, + "grad_norm": 0.3800845021173878, + "learning_rate": 7.054807311976379e-05, + "loss": 0.6721, + "step": 1139 + }, + { + "epoch": 0.608, + "grad_norm": 0.4071479381839004, + "learning_rate": 7.038297696626206e-05, + "loss": 0.631, + "step": 1140 + }, + { + "epoch": 0.6085333333333334, + "grad_norm": 0.40252912914646516, + "learning_rate": 7.021796925368667e-05, + "loss": 0.616, + "step": 1141 + }, + { + "epoch": 0.6090666666666666, + "grad_norm": 0.40052306331925214, + "learning_rate": 7.005305047477566e-05, + "loss": 0.5895, + "step": 1142 + }, + { + "epoch": 0.6096, + "grad_norm": 0.467091632151271, + "learning_rate": 6.988822112200156e-05, + "loss": 0.7255, + "step": 1143 + }, + { + "epoch": 0.6101333333333333, + "grad_norm": 0.4630037079137472, + "learning_rate": 6.972348168756983e-05, + "loss": 0.6549, + "step": 1144 + }, + { + "epoch": 0.6106666666666667, + "grad_norm": 0.46378430219557376, + "learning_rate": 6.955883266341741e-05, + "loss": 0.7068, + "step": 1145 + }, + { + "epoch": 0.6112, + "grad_norm": 0.4052886342275947, + "learning_rate": 6.939427454121128e-05, + "loss": 0.6561, + "step": 1146 + }, + { + "epoch": 0.6117333333333334, + "grad_norm": 0.3999075024423376, + "learning_rate": 6.922980781234699e-05, + "loss": 0.7029, + "step": 1147 + }, + { + "epoch": 0.6122666666666666, + "grad_norm": 0.5325865529164777, + "learning_rate": 6.906543296794714e-05, + "loss": 0.7503, + "step": 1148 + }, + { + "epoch": 0.6128, + "grad_norm": 0.5905952008527307, + "learning_rate": 6.890115049885994e-05, + "loss": 0.674, + "step": 1149 + }, + { + "epoch": 0.6133333333333333, + "grad_norm": 0.42646157498059994, + "learning_rate": 6.873696089565786e-05, + "loss": 0.626, + "step": 1150 + }, + { + "epoch": 0.6138666666666667, + "grad_norm": 0.4255902318426631, + "learning_rate": 6.85728646486359e-05, + "loss": 0.6575, + "step": 1151 + }, + { + "epoch": 0.6144, + "grad_norm": 0.4297025470038566, + "learning_rate": 6.84088622478104e-05, + "loss": 0.6237, + "step": 1152 + }, + { + "epoch": 0.6149333333333333, + "grad_norm": 0.40347903351671394, + "learning_rate": 6.82449541829174e-05, + "loss": 0.6588, + "step": 1153 + }, + { + "epoch": 0.6154666666666667, + "grad_norm": 0.3827082474076491, + "learning_rate": 6.80811409434113e-05, + "loss": 0.6632, + "step": 1154 + }, + { + "epoch": 0.616, + "grad_norm": 0.37832045458505154, + "learning_rate": 6.791742301846326e-05, + "loss": 0.6616, + "step": 1155 + }, + { + "epoch": 0.6165333333333334, + "grad_norm": 0.4111492464118394, + "learning_rate": 6.775380089695986e-05, + "loss": 0.7275, + "step": 1156 + }, + { + "epoch": 0.6170666666666667, + "grad_norm": 0.37288782144974, + "learning_rate": 6.759027506750158e-05, + "loss": 0.597, + "step": 1157 + }, + { + "epoch": 0.6176, + "grad_norm": 0.40002496138954113, + "learning_rate": 6.742684601840141e-05, + "loss": 0.69, + "step": 1158 + }, + { + "epoch": 0.6181333333333333, + "grad_norm": 0.36710877035634026, + "learning_rate": 6.726351423768322e-05, + "loss": 0.5799, + "step": 1159 + }, + { + "epoch": 0.6186666666666667, + "grad_norm": 0.39759880488318194, + "learning_rate": 6.710028021308061e-05, + "loss": 0.6172, + "step": 1160 + }, + { + "epoch": 0.6192, + "grad_norm": 0.4254248194110287, + "learning_rate": 6.693714443203507e-05, + "loss": 0.6473, + "step": 1161 + }, + { + "epoch": 0.6197333333333334, + "grad_norm": 0.4216400213794304, + "learning_rate": 6.677410738169485e-05, + "loss": 0.6838, + "step": 1162 + }, + { + "epoch": 0.6202666666666666, + "grad_norm": 0.4060220364568944, + "learning_rate": 6.661116954891328e-05, + "loss": 0.6731, + "step": 1163 + }, + { + "epoch": 0.6208, + "grad_norm": 0.42450269698261156, + "learning_rate": 6.644833142024751e-05, + "loss": 0.6783, + "step": 1164 + }, + { + "epoch": 0.6213333333333333, + "grad_norm": 0.4406882807056827, + "learning_rate": 6.62855934819569e-05, + "loss": 0.6606, + "step": 1165 + }, + { + "epoch": 0.6218666666666667, + "grad_norm": 0.38146261694436473, + "learning_rate": 6.612295622000162e-05, + "loss": 0.6835, + "step": 1166 + }, + { + "epoch": 0.6224, + "grad_norm": 0.3960931033382964, + "learning_rate": 6.59604201200412e-05, + "loss": 0.6869, + "step": 1167 + }, + { + "epoch": 0.6229333333333333, + "grad_norm": 0.3920942872948423, + "learning_rate": 6.579798566743314e-05, + "loss": 0.65, + "step": 1168 + }, + { + "epoch": 0.6234666666666666, + "grad_norm": 0.502974696853216, + "learning_rate": 6.563565334723134e-05, + "loss": 0.8013, + "step": 1169 + }, + { + "epoch": 0.624, + "grad_norm": 0.3809516734862799, + "learning_rate": 6.547342364418481e-05, + "loss": 0.6536, + "step": 1170 + }, + { + "epoch": 0.6245333333333334, + "grad_norm": 0.37018149655501825, + "learning_rate": 6.531129704273604e-05, + "loss": 0.6641, + "step": 1171 + }, + { + "epoch": 0.6250666666666667, + "grad_norm": 0.4034959080754938, + "learning_rate": 6.514927402701964e-05, + "loss": 0.6341, + "step": 1172 + }, + { + "epoch": 0.6256, + "grad_norm": 0.4134862147996729, + "learning_rate": 6.498735508086093e-05, + "loss": 0.65, + "step": 1173 + }, + { + "epoch": 0.6261333333333333, + "grad_norm": 0.3414312601353073, + "learning_rate": 6.48255406877745e-05, + "loss": 0.604, + "step": 1174 + }, + { + "epoch": 0.6266666666666667, + "grad_norm": 0.3959407841702227, + "learning_rate": 6.466383133096267e-05, + "loss": 0.6536, + "step": 1175 + }, + { + "epoch": 0.6272, + "grad_norm": 0.4133756851660163, + "learning_rate": 6.450222749331414e-05, + "loss": 0.6885, + "step": 1176 + }, + { + "epoch": 0.6277333333333334, + "grad_norm": 0.37222803311698316, + "learning_rate": 6.434072965740242e-05, + "loss": 0.6272, + "step": 1177 + }, + { + "epoch": 0.6282666666666666, + "grad_norm": 0.369139299145551, + "learning_rate": 6.417933830548467e-05, + "loss": 0.5868, + "step": 1178 + }, + { + "epoch": 0.6288, + "grad_norm": 0.4815835051436008, + "learning_rate": 6.40180539194999e-05, + "loss": 0.7594, + "step": 1179 + }, + { + "epoch": 0.6293333333333333, + "grad_norm": 0.37513170722998457, + "learning_rate": 6.385687698106781e-05, + "loss": 0.6671, + "step": 1180 + }, + { + "epoch": 0.6298666666666667, + "grad_norm": 0.44751068701941626, + "learning_rate": 6.369580797148718e-05, + "loss": 0.6568, + "step": 1181 + }, + { + "epoch": 0.6304, + "grad_norm": 0.3853514665076848, + "learning_rate": 6.35348473717345e-05, + "loss": 0.6967, + "step": 1182 + }, + { + "epoch": 0.6309333333333333, + "grad_norm": 0.41773830193071393, + "learning_rate": 6.337399566246257e-05, + "loss": 0.7183, + "step": 1183 + }, + { + "epoch": 0.6314666666666666, + "grad_norm": 0.45915612060049255, + "learning_rate": 6.321325332399903e-05, + "loss": 0.6561, + "step": 1184 + }, + { + "epoch": 0.632, + "grad_norm": 0.41944476778817447, + "learning_rate": 6.305262083634488e-05, + "loss": 0.6754, + "step": 1185 + }, + { + "epoch": 0.6325333333333333, + "grad_norm": 0.35693564797134475, + "learning_rate": 6.289209867917312e-05, + "loss": 0.6096, + "step": 1186 + }, + { + "epoch": 0.6330666666666667, + "grad_norm": 0.3560566522051134, + "learning_rate": 6.273168733182722e-05, + "loss": 0.6437, + "step": 1187 + }, + { + "epoch": 0.6336, + "grad_norm": 0.5149668677272893, + "learning_rate": 6.25713872733199e-05, + "loss": 0.6776, + "step": 1188 + }, + { + "epoch": 0.6341333333333333, + "grad_norm": 0.4248207701770222, + "learning_rate": 6.241119898233144e-05, + "loss": 0.6916, + "step": 1189 + }, + { + "epoch": 0.6346666666666667, + "grad_norm": 0.4732728561650195, + "learning_rate": 6.225112293720836e-05, + "loss": 0.7072, + "step": 1190 + }, + { + "epoch": 0.6352, + "grad_norm": 0.3574158497559156, + "learning_rate": 6.209115961596208e-05, + "loss": 0.6159, + "step": 1191 + }, + { + "epoch": 0.6357333333333334, + "grad_norm": 0.3482225731913979, + "learning_rate": 6.19313094962673e-05, + "loss": 0.5944, + "step": 1192 + }, + { + "epoch": 0.6362666666666666, + "grad_norm": 0.4001535338430282, + "learning_rate": 6.177157305546078e-05, + "loss": 0.6678, + "step": 1193 + }, + { + "epoch": 0.6368, + "grad_norm": 0.4162071393998801, + "learning_rate": 6.161195077053976e-05, + "loss": 0.5732, + "step": 1194 + }, + { + "epoch": 0.6373333333333333, + "grad_norm": 0.4703542404400119, + "learning_rate": 6.145244311816063e-05, + "loss": 0.6813, + "step": 1195 + }, + { + "epoch": 0.6378666666666667, + "grad_norm": 0.5039572272619236, + "learning_rate": 6.129305057463741e-05, + "loss": 0.8289, + "step": 1196 + }, + { + "epoch": 0.6384, + "grad_norm": 0.39333888188660837, + "learning_rate": 6.113377361594049e-05, + "loss": 0.6746, + "step": 1197 + }, + { + "epoch": 0.6389333333333334, + "grad_norm": 0.3737531333883778, + "learning_rate": 6.0974612717695004e-05, + "loss": 0.6262, + "step": 1198 + }, + { + "epoch": 0.6394666666666666, + "grad_norm": 0.3626593145896875, + "learning_rate": 6.0815568355179556e-05, + "loss": 0.6566, + "step": 1199 + }, + { + "epoch": 0.64, + "grad_norm": 0.4379935971854093, + "learning_rate": 6.065664100332478e-05, + "loss": 0.6222, + "step": 1200 + }, + { + "epoch": 0.6405333333333333, + "grad_norm": 0.37867647437644275, + "learning_rate": 6.0497831136711836e-05, + "loss": 0.5945, + "step": 1201 + }, + { + "epoch": 0.6410666666666667, + "grad_norm": 0.3730232202108895, + "learning_rate": 6.0339139229571116e-05, + "loss": 0.6218, + "step": 1202 + }, + { + "epoch": 0.6416, + "grad_norm": 0.36030794230806007, + "learning_rate": 6.018056575578075e-05, + "loss": 0.6305, + "step": 1203 + }, + { + "epoch": 0.6421333333333333, + "grad_norm": 0.4183560884240375, + "learning_rate": 6.002211118886514e-05, + "loss": 0.6397, + "step": 1204 + }, + { + "epoch": 0.6426666666666667, + "grad_norm": 0.444780817450603, + "learning_rate": 5.986377600199371e-05, + "loss": 0.6384, + "step": 1205 + }, + { + "epoch": 0.6432, + "grad_norm": 0.45802748310803115, + "learning_rate": 5.970556066797941e-05, + "loss": 0.7336, + "step": 1206 + }, + { + "epoch": 0.6437333333333334, + "grad_norm": 0.3647774377980463, + "learning_rate": 5.9547465659277215e-05, + "loss": 0.6562, + "step": 1207 + }, + { + "epoch": 0.6442666666666667, + "grad_norm": 0.36960262362588187, + "learning_rate": 5.938949144798279e-05, + "loss": 0.6779, + "step": 1208 + }, + { + "epoch": 0.6448, + "grad_norm": 0.40527322757344364, + "learning_rate": 5.923163850583113e-05, + "loss": 0.63, + "step": 1209 + }, + { + "epoch": 0.6453333333333333, + "grad_norm": 0.3939893103860662, + "learning_rate": 5.907390730419507e-05, + "loss": 0.6928, + "step": 1210 + }, + { + "epoch": 0.6458666666666667, + "grad_norm": 0.3826699602637233, + "learning_rate": 5.8916298314083915e-05, + "loss": 0.6269, + "step": 1211 + }, + { + "epoch": 0.6464, + "grad_norm": 0.3501685236675613, + "learning_rate": 5.875881200614207e-05, + "loss": 0.6623, + "step": 1212 + }, + { + "epoch": 0.6469333333333334, + "grad_norm": 0.43616076580584456, + "learning_rate": 5.860144885064751e-05, + "loss": 0.7103, + "step": 1213 + }, + { + "epoch": 0.6474666666666666, + "grad_norm": 0.3726378790672664, + "learning_rate": 5.8444209317510514e-05, + "loss": 0.6285, + "step": 1214 + }, + { + "epoch": 0.648, + "grad_norm": 0.41488579979296436, + "learning_rate": 5.828709387627218e-05, + "loss": 0.6485, + "step": 1215 + }, + { + "epoch": 0.6485333333333333, + "grad_norm": 0.38689674009819797, + "learning_rate": 5.813010299610313e-05, + "loss": 0.6335, + "step": 1216 + }, + { + "epoch": 0.6490666666666667, + "grad_norm": 0.437200128890619, + "learning_rate": 5.797323714580192e-05, + "loss": 0.6312, + "step": 1217 + }, + { + "epoch": 0.6496, + "grad_norm": 0.3919662080702626, + "learning_rate": 5.781649679379378e-05, + "loss": 0.6619, + "step": 1218 + }, + { + "epoch": 0.6501333333333333, + "grad_norm": 0.3724718876940257, + "learning_rate": 5.765988240812921e-05, + "loss": 0.6356, + "step": 1219 + }, + { + "epoch": 0.6506666666666666, + "grad_norm": 1.7639155606560177, + "learning_rate": 5.750339445648252e-05, + "loss": 0.6326, + "step": 1220 + }, + { + "epoch": 0.6512, + "grad_norm": 0.40587213435186836, + "learning_rate": 5.73470334061505e-05, + "loss": 0.6528, + "step": 1221 + }, + { + "epoch": 0.6517333333333334, + "grad_norm": 0.3922059812193428, + "learning_rate": 5.7190799724050924e-05, + "loss": 0.6448, + "step": 1222 + }, + { + "epoch": 0.6522666666666667, + "grad_norm": 0.4343148971683679, + "learning_rate": 5.7034693876721376e-05, + "loss": 0.7322, + "step": 1223 + }, + { + "epoch": 0.6528, + "grad_norm": 0.46419027714746003, + "learning_rate": 5.687871633031754e-05, + "loss": 0.744, + "step": 1224 + }, + { + "epoch": 0.6533333333333333, + "grad_norm": 0.4321260594814531, + "learning_rate": 5.6722867550612116e-05, + "loss": 0.6241, + "step": 1225 + }, + { + "epoch": 0.6538666666666667, + "grad_norm": 0.5739554542647625, + "learning_rate": 5.6567148002993164e-05, + "loss": 0.7038, + "step": 1226 + }, + { + "epoch": 0.6544, + "grad_norm": 0.43183095298941515, + "learning_rate": 5.6411558152462894e-05, + "loss": 0.6493, + "step": 1227 + }, + { + "epoch": 0.6549333333333334, + "grad_norm": 0.395184329350968, + "learning_rate": 5.625609846363622e-05, + "loss": 0.6287, + "step": 1228 + }, + { + "epoch": 0.6554666666666666, + "grad_norm": 0.3732252380750418, + "learning_rate": 5.6100769400739383e-05, + "loss": 0.6378, + "step": 1229 + }, + { + "epoch": 0.656, + "grad_norm": 0.39210240392021667, + "learning_rate": 5.5945571427608526e-05, + "loss": 0.5881, + "step": 1230 + }, + { + "epoch": 0.6565333333333333, + "grad_norm": 0.3779633989621703, + "learning_rate": 5.579050500768836e-05, + "loss": 0.63, + "step": 1231 + }, + { + "epoch": 0.6570666666666667, + "grad_norm": 0.42191238495334316, + "learning_rate": 5.5635570604030705e-05, + "loss": 0.6875, + "step": 1232 + }, + { + "epoch": 0.6576, + "grad_norm": 0.4026515419401466, + "learning_rate": 5.54807686792933e-05, + "loss": 0.6618, + "step": 1233 + }, + { + "epoch": 0.6581333333333333, + "grad_norm": 0.4192148150372642, + "learning_rate": 5.53260996957381e-05, + "loss": 0.6588, + "step": 1234 + }, + { + "epoch": 0.6586666666666666, + "grad_norm": 0.4149332064145483, + "learning_rate": 5.5171564115230254e-05, + "loss": 0.646, + "step": 1235 + }, + { + "epoch": 0.6592, + "grad_norm": 0.4034071104473392, + "learning_rate": 5.501716239923642e-05, + "loss": 0.6219, + "step": 1236 + }, + { + "epoch": 0.6597333333333333, + "grad_norm": 0.4511181451729584, + "learning_rate": 5.486289500882355e-05, + "loss": 0.6585, + "step": 1237 + }, + { + "epoch": 0.6602666666666667, + "grad_norm": 0.47643267698354524, + "learning_rate": 5.47087624046575e-05, + "loss": 0.6782, + "step": 1238 + }, + { + "epoch": 0.6608, + "grad_norm": 0.4086591914425235, + "learning_rate": 5.4554765047001613e-05, + "loss": 0.6718, + "step": 1239 + }, + { + "epoch": 0.6613333333333333, + "grad_norm": 0.4010432208685754, + "learning_rate": 5.4400903395715366e-05, + "loss": 0.6339, + "step": 1240 + }, + { + "epoch": 0.6618666666666667, + "grad_norm": 0.4626664532030345, + "learning_rate": 5.424717791025302e-05, + "loss": 0.6472, + "step": 1241 + }, + { + "epoch": 0.6624, + "grad_norm": 0.4308470746273194, + "learning_rate": 5.4093589049662175e-05, + "loss": 0.6923, + "step": 1242 + }, + { + "epoch": 0.6629333333333334, + "grad_norm": 0.42841013030392777, + "learning_rate": 5.394013727258254e-05, + "loss": 0.6747, + "step": 1243 + }, + { + "epoch": 0.6634666666666666, + "grad_norm": 0.41529556235870374, + "learning_rate": 5.378682303724435e-05, + "loss": 0.6941, + "step": 1244 + }, + { + "epoch": 0.664, + "grad_norm": 0.45007324582289304, + "learning_rate": 5.363364680146725e-05, + "loss": 0.6367, + "step": 1245 + }, + { + "epoch": 0.6645333333333333, + "grad_norm": 0.44953898222936955, + "learning_rate": 5.348060902265871e-05, + "loss": 0.6463, + "step": 1246 + }, + { + "epoch": 0.6650666666666667, + "grad_norm": 0.35092620368532895, + "learning_rate": 5.332771015781275e-05, + "loss": 0.6415, + "step": 1247 + }, + { + "epoch": 0.6656, + "grad_norm": 0.4339567113204943, + "learning_rate": 5.31749506635086e-05, + "loss": 0.7325, + "step": 1248 + }, + { + "epoch": 0.6661333333333334, + "grad_norm": 0.5796903423136838, + "learning_rate": 5.302233099590928e-05, + "loss": 0.6139, + "step": 1249 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.5001667011104508, + "learning_rate": 5.286985161076029e-05, + "loss": 0.6418, + "step": 1250 + }, + { + "epoch": 0.6672, + "grad_norm": 0.3820274792926042, + "learning_rate": 5.271751296338823e-05, + "loss": 0.6333, + "step": 1251 + }, + { + "epoch": 0.6677333333333333, + "grad_norm": 0.3930062869164529, + "learning_rate": 5.2565315508699376e-05, + "loss": 0.7064, + "step": 1252 + }, + { + "epoch": 0.6682666666666667, + "grad_norm": 0.45246692735783917, + "learning_rate": 5.2413259701178505e-05, + "loss": 0.6997, + "step": 1253 + }, + { + "epoch": 0.6688, + "grad_norm": 0.40434583944586, + "learning_rate": 5.226134599488728e-05, + "loss": 0.6441, + "step": 1254 + }, + { + "epoch": 0.6693333333333333, + "grad_norm": 0.38217373263717996, + "learning_rate": 5.210957484346314e-05, + "loss": 0.6356, + "step": 1255 + }, + { + "epoch": 0.6698666666666667, + "grad_norm": 0.4063080671701538, + "learning_rate": 5.195794670011776e-05, + "loss": 0.6134, + "step": 1256 + }, + { + "epoch": 0.6704, + "grad_norm": 0.4566062530039861, + "learning_rate": 5.180646201763577e-05, + "loss": 0.7148, + "step": 1257 + }, + { + "epoch": 0.6709333333333334, + "grad_norm": 0.41542567815953363, + "learning_rate": 5.165512124837344e-05, + "loss": 0.6601, + "step": 1258 + }, + { + "epoch": 0.6714666666666667, + "grad_norm": 0.3802028666786385, + "learning_rate": 5.150392484425728e-05, + "loss": 0.6613, + "step": 1259 + }, + { + "epoch": 0.672, + "grad_norm": 0.39668663463620063, + "learning_rate": 5.135287325678271e-05, + "loss": 0.6248, + "step": 1260 + }, + { + "epoch": 0.6725333333333333, + "grad_norm": 0.38452156701095597, + "learning_rate": 5.120196693701267e-05, + "loss": 0.7134, + "step": 1261 + }, + { + "epoch": 0.6730666666666667, + "grad_norm": 0.3980848074267266, + "learning_rate": 5.105120633557634e-05, + "loss": 0.6716, + "step": 1262 + }, + { + "epoch": 0.6736, + "grad_norm": 0.3780272107106022, + "learning_rate": 5.090059190266779e-05, + "loss": 0.6562, + "step": 1263 + }, + { + "epoch": 0.6741333333333334, + "grad_norm": 0.476362292617112, + "learning_rate": 5.075012408804458e-05, + "loss": 0.7302, + "step": 1264 + }, + { + "epoch": 0.6746666666666666, + "grad_norm": 0.4234756636372608, + "learning_rate": 5.059980334102637e-05, + "loss": 0.634, + "step": 1265 + }, + { + "epoch": 0.6752, + "grad_norm": 0.42949549729642317, + "learning_rate": 5.0449630110493836e-05, + "loss": 0.68, + "step": 1266 + }, + { + "epoch": 0.6757333333333333, + "grad_norm": 0.3920835689592617, + "learning_rate": 5.0299604844886985e-05, + "loss": 0.6153, + "step": 1267 + }, + { + "epoch": 0.6762666666666667, + "grad_norm": 0.4368257248483272, + "learning_rate": 5.014972799220403e-05, + "loss": 0.5868, + "step": 1268 + }, + { + "epoch": 0.6768, + "grad_norm": 0.4748109414344694, + "learning_rate": 5.000000000000002e-05, + "loss": 0.7149, + "step": 1269 + }, + { + "epoch": 0.6773333333333333, + "grad_norm": 0.41796503413553826, + "learning_rate": 4.985042131538545e-05, + "loss": 0.6976, + "step": 1270 + }, + { + "epoch": 0.6778666666666666, + "grad_norm": 0.3895235977869893, + "learning_rate": 4.9700992385024934e-05, + "loss": 0.6573, + "step": 1271 + }, + { + "epoch": 0.6784, + "grad_norm": 0.5289606490625995, + "learning_rate": 4.955171365513603e-05, + "loss": 0.6889, + "step": 1272 + }, + { + "epoch": 0.6789333333333334, + "grad_norm": 0.37743217428546827, + "learning_rate": 4.940258557148765e-05, + "loss": 0.6709, + "step": 1273 + }, + { + "epoch": 0.6794666666666667, + "grad_norm": 0.3947410873715308, + "learning_rate": 4.9253608579398855e-05, + "loss": 0.7213, + "step": 1274 + }, + { + "epoch": 0.68, + "grad_norm": 0.37167156176277555, + "learning_rate": 4.9104783123737566e-05, + "loss": 0.6571, + "step": 1275 + }, + { + "epoch": 0.6805333333333333, + "grad_norm": 0.47132134783591934, + "learning_rate": 4.895610964891923e-05, + "loss": 0.6725, + "step": 1276 + }, + { + "epoch": 0.6810666666666667, + "grad_norm": 0.44473248869883525, + "learning_rate": 4.880758859890536e-05, + "loss": 0.6765, + "step": 1277 + }, + { + "epoch": 0.6816, + "grad_norm": 0.4375571908622867, + "learning_rate": 4.865922041720239e-05, + "loss": 0.7074, + "step": 1278 + }, + { + "epoch": 0.6821333333333334, + "grad_norm": 0.4399434207020205, + "learning_rate": 4.851100554686021e-05, + "loss": 0.6768, + "step": 1279 + }, + { + "epoch": 0.6826666666666666, + "grad_norm": 0.4906658379653056, + "learning_rate": 4.836294443047088e-05, + "loss": 0.7046, + "step": 1280 + }, + { + "epoch": 0.6832, + "grad_norm": 0.41867234663521896, + "learning_rate": 4.821503751016746e-05, + "loss": 0.6557, + "step": 1281 + }, + { + "epoch": 0.6837333333333333, + "grad_norm": 0.41536161568550684, + "learning_rate": 4.8067285227622404e-05, + "loss": 0.6557, + "step": 1282 + }, + { + "epoch": 0.6842666666666667, + "grad_norm": 0.36418655673146955, + "learning_rate": 4.791968802404648e-05, + "loss": 0.6456, + "step": 1283 + }, + { + "epoch": 0.6848, + "grad_norm": 0.38810328652348886, + "learning_rate": 4.777224634018732e-05, + "loss": 0.6383, + "step": 1284 + }, + { + "epoch": 0.6853333333333333, + "grad_norm": 0.37927031129219313, + "learning_rate": 4.762496061632814e-05, + "loss": 0.6243, + "step": 1285 + }, + { + "epoch": 0.6858666666666666, + "grad_norm": 0.41696650741641733, + "learning_rate": 4.747783129228656e-05, + "loss": 0.6666, + "step": 1286 + }, + { + "epoch": 0.6864, + "grad_norm": 0.43900653427477354, + "learning_rate": 4.733085880741301e-05, + "loss": 0.6663, + "step": 1287 + }, + { + "epoch": 0.6869333333333333, + "grad_norm": 0.387172857897453, + "learning_rate": 4.718404360058966e-05, + "loss": 0.5998, + "step": 1288 + }, + { + "epoch": 0.6874666666666667, + "grad_norm": 0.35978959618990397, + "learning_rate": 4.7037386110228985e-05, + "loss": 0.6276, + "step": 1289 + }, + { + "epoch": 0.688, + "grad_norm": 0.3630333517324512, + "learning_rate": 4.689088677427249e-05, + "loss": 0.5525, + "step": 1290 + }, + { + "epoch": 0.6885333333333333, + "grad_norm": 0.3765369610533995, + "learning_rate": 4.6744546030189486e-05, + "loss": 0.629, + "step": 1291 + }, + { + "epoch": 0.6890666666666667, + "grad_norm": 0.40095821539063065, + "learning_rate": 4.659836431497563e-05, + "loss": 0.6931, + "step": 1292 + }, + { + "epoch": 0.6896, + "grad_norm": 0.44801140332437855, + "learning_rate": 4.645234206515171e-05, + "loss": 0.6566, + "step": 1293 + }, + { + "epoch": 0.6901333333333334, + "grad_norm": 0.4125511746187467, + "learning_rate": 4.630647971676232e-05, + "loss": 0.697, + "step": 1294 + }, + { + "epoch": 0.6906666666666667, + "grad_norm": 0.37231414111132, + "learning_rate": 4.6160777705374524e-05, + "loss": 0.5672, + "step": 1295 + }, + { + "epoch": 0.6912, + "grad_norm": 0.42029924495620385, + "learning_rate": 4.6015236466076747e-05, + "loss": 0.633, + "step": 1296 + }, + { + "epoch": 0.6917333333333333, + "grad_norm": 0.3883556397443518, + "learning_rate": 4.586985643347717e-05, + "loss": 0.6617, + "step": 1297 + }, + { + "epoch": 0.6922666666666667, + "grad_norm": 0.436223536266935, + "learning_rate": 4.572463804170263e-05, + "loss": 0.7111, + "step": 1298 + }, + { + "epoch": 0.6928, + "grad_norm": 0.4080282150670288, + "learning_rate": 4.5579581724397255e-05, + "loss": 0.6967, + "step": 1299 + }, + { + "epoch": 0.6933333333333334, + "grad_norm": 0.41073688554435384, + "learning_rate": 4.543468791472131e-05, + "loss": 0.6456, + "step": 1300 + }, + { + "epoch": 0.6938666666666666, + "grad_norm": 0.49566855251674646, + "learning_rate": 4.5289957045349653e-05, + "loss": 0.7341, + "step": 1301 + }, + { + "epoch": 0.6944, + "grad_norm": 0.4047726805413657, + "learning_rate": 4.514538954847064e-05, + "loss": 0.6305, + "step": 1302 + }, + { + "epoch": 0.6949333333333333, + "grad_norm": 0.37403704331724497, + "learning_rate": 4.5000985855784746e-05, + "loss": 0.5819, + "step": 1303 + }, + { + "epoch": 0.6954666666666667, + "grad_norm": 0.3849853682834576, + "learning_rate": 4.485674639850333e-05, + "loss": 0.6712, + "step": 1304 + }, + { + "epoch": 0.696, + "grad_norm": 0.43032730410963155, + "learning_rate": 4.471267160734731e-05, + "loss": 0.6693, + "step": 1305 + }, + { + "epoch": 0.6965333333333333, + "grad_norm": 0.42595857625470585, + "learning_rate": 4.456876191254582e-05, + "loss": 0.6872, + "step": 1306 + }, + { + "epoch": 0.6970666666666666, + "grad_norm": 0.4261292946457587, + "learning_rate": 4.442501774383515e-05, + "loss": 0.6343, + "step": 1307 + }, + { + "epoch": 0.6976, + "grad_norm": 0.42577749406423065, + "learning_rate": 4.428143953045717e-05, + "loss": 0.654, + "step": 1308 + }, + { + "epoch": 0.6981333333333334, + "grad_norm": 0.37662528704168124, + "learning_rate": 4.413802770115816e-05, + "loss": 0.6579, + "step": 1309 + }, + { + "epoch": 0.6986666666666667, + "grad_norm": 0.412943020979185, + "learning_rate": 4.399478268418771e-05, + "loss": 0.5987, + "step": 1310 + }, + { + "epoch": 0.6992, + "grad_norm": 0.4792157042628448, + "learning_rate": 4.385170490729712e-05, + "loss": 0.6855, + "step": 1311 + }, + { + "epoch": 0.6997333333333333, + "grad_norm": 0.49828789883725827, + "learning_rate": 4.3708794797738375e-05, + "loss": 0.6583, + "step": 1312 + }, + { + "epoch": 0.7002666666666667, + "grad_norm": 0.455543554624898, + "learning_rate": 4.3566052782262735e-05, + "loss": 0.6929, + "step": 1313 + }, + { + "epoch": 0.7008, + "grad_norm": 0.37273936906484545, + "learning_rate": 4.342347928711953e-05, + "loss": 0.5976, + "step": 1314 + }, + { + "epoch": 0.7013333333333334, + "grad_norm": 0.4221561766618173, + "learning_rate": 4.328107473805487e-05, + "loss": 0.7151, + "step": 1315 + }, + { + "epoch": 0.7018666666666666, + "grad_norm": 0.4837192353409229, + "learning_rate": 4.3138839560310303e-05, + "loss": 0.6432, + "step": 1316 + }, + { + "epoch": 0.7024, + "grad_norm": 0.44031896444546276, + "learning_rate": 4.2996774178621736e-05, + "loss": 0.6473, + "step": 1317 + }, + { + "epoch": 0.7029333333333333, + "grad_norm": 0.46081639364772325, + "learning_rate": 4.2854879017217894e-05, + "loss": 0.7147, + "step": 1318 + }, + { + "epoch": 0.7034666666666667, + "grad_norm": 0.4450285751670438, + "learning_rate": 4.271315449981934e-05, + "loss": 0.5895, + "step": 1319 + }, + { + "epoch": 0.704, + "grad_norm": 0.45999585667362314, + "learning_rate": 4.257160104963696e-05, + "loss": 0.6837, + "step": 1320 + }, + { + "epoch": 0.7045333333333333, + "grad_norm": 0.5129723821905416, + "learning_rate": 4.2430219089370823e-05, + "loss": 0.6623, + "step": 1321 + }, + { + "epoch": 0.7050666666666666, + "grad_norm": 0.4809087567118606, + "learning_rate": 4.228900904120895e-05, + "loss": 0.7028, + "step": 1322 + }, + { + "epoch": 0.7056, + "grad_norm": 0.3743897890725053, + "learning_rate": 4.2147971326825966e-05, + "loss": 0.6569, + "step": 1323 + }, + { + "epoch": 0.7061333333333333, + "grad_norm": 0.42802344056660824, + "learning_rate": 4.200710636738189e-05, + "loss": 0.6528, + "step": 1324 + }, + { + "epoch": 0.7066666666666667, + "grad_norm": 0.35795356050626215, + "learning_rate": 4.1866414583520877e-05, + "loss": 0.5789, + "step": 1325 + }, + { + "epoch": 0.7072, + "grad_norm": 0.3545967010302099, + "learning_rate": 4.172589639536991e-05, + "loss": 0.6488, + "step": 1326 + }, + { + "epoch": 0.7077333333333333, + "grad_norm": 0.4122734551525557, + "learning_rate": 4.158555222253771e-05, + "loss": 0.6833, + "step": 1327 + }, + { + "epoch": 0.7082666666666667, + "grad_norm": 0.4470107486644953, + "learning_rate": 4.14453824841132e-05, + "loss": 0.5883, + "step": 1328 + }, + { + "epoch": 0.7088, + "grad_norm": 0.3843709654838672, + "learning_rate": 4.130538759866457e-05, + "loss": 0.5941, + "step": 1329 + }, + { + "epoch": 0.7093333333333334, + "grad_norm": 0.3976943559572271, + "learning_rate": 4.1165567984237764e-05, + "loss": 0.6148, + "step": 1330 + }, + { + "epoch": 0.7098666666666666, + "grad_norm": 0.35580217621603544, + "learning_rate": 4.102592405835536e-05, + "loss": 0.6146, + "step": 1331 + }, + { + "epoch": 0.7104, + "grad_norm": 0.4082499877029761, + "learning_rate": 4.088645623801534e-05, + "loss": 0.6558, + "step": 1332 + }, + { + "epoch": 0.7109333333333333, + "grad_norm": 0.3724418218833736, + "learning_rate": 4.074716493968975e-05, + "loss": 0.6596, + "step": 1333 + }, + { + "epoch": 0.7114666666666667, + "grad_norm": 0.44126564209790003, + "learning_rate": 4.060805057932359e-05, + "loss": 0.6869, + "step": 1334 + }, + { + "epoch": 0.712, + "grad_norm": 0.4731138311593641, + "learning_rate": 4.046911357233343e-05, + "loss": 0.6276, + "step": 1335 + }, + { + "epoch": 0.7125333333333334, + "grad_norm": 0.3805420640266841, + "learning_rate": 4.0330354333606234e-05, + "loss": 0.6331, + "step": 1336 + }, + { + "epoch": 0.7130666666666666, + "grad_norm": 0.4195470170035172, + "learning_rate": 4.019177327749822e-05, + "loss": 0.677, + "step": 1337 + }, + { + "epoch": 0.7136, + "grad_norm": 0.4505011447716041, + "learning_rate": 4.00533708178334e-05, + "loss": 0.5932, + "step": 1338 + }, + { + "epoch": 0.7141333333333333, + "grad_norm": 0.39866514042034257, + "learning_rate": 3.991514736790258e-05, + "loss": 0.6426, + "step": 1339 + }, + { + "epoch": 0.7146666666666667, + "grad_norm": 0.3854196652956052, + "learning_rate": 3.977710334046193e-05, + "loss": 0.6336, + "step": 1340 + }, + { + "epoch": 0.7152, + "grad_norm": 0.45061179240724464, + "learning_rate": 3.963923914773187e-05, + "loss": 0.7372, + "step": 1341 + }, + { + "epoch": 0.7157333333333333, + "grad_norm": 0.4727340269071424, + "learning_rate": 3.950155520139581e-05, + "loss": 0.6015, + "step": 1342 + }, + { + "epoch": 0.7162666666666667, + "grad_norm": 0.453650302483887, + "learning_rate": 3.936405191259891e-05, + "loss": 0.6064, + "step": 1343 + }, + { + "epoch": 0.7168, + "grad_norm": 0.4625455283465757, + "learning_rate": 3.922672969194686e-05, + "loss": 0.6724, + "step": 1344 + }, + { + "epoch": 0.7173333333333334, + "grad_norm": 0.4411413514045393, + "learning_rate": 3.9089588949504655e-05, + "loss": 0.6293, + "step": 1345 + }, + { + "epoch": 0.7178666666666667, + "grad_norm": 0.4339975054849528, + "learning_rate": 3.895263009479534e-05, + "loss": 0.6984, + "step": 1346 + }, + { + "epoch": 0.7184, + "grad_norm": 0.4772981118073982, + "learning_rate": 3.8815853536798904e-05, + "loss": 0.6929, + "step": 1347 + }, + { + "epoch": 0.7189333333333333, + "grad_norm": 0.507646836621118, + "learning_rate": 3.867925968395085e-05, + "loss": 0.6822, + "step": 1348 + }, + { + "epoch": 0.7194666666666667, + "grad_norm": 0.4153766085596804, + "learning_rate": 3.854284894414122e-05, + "loss": 0.6188, + "step": 1349 + }, + { + "epoch": 0.72, + "grad_norm": 0.3866991784899309, + "learning_rate": 3.840662172471315e-05, + "loss": 0.6159, + "step": 1350 + }, + { + "epoch": 0.7205333333333334, + "grad_norm": 0.5239299312000721, + "learning_rate": 3.82705784324618e-05, + "loss": 0.647, + "step": 1351 + }, + { + "epoch": 0.7210666666666666, + "grad_norm": 0.42454726330467013, + "learning_rate": 3.8134719473633094e-05, + "loss": 0.6194, + "step": 1352 + }, + { + "epoch": 0.7216, + "grad_norm": 0.41768396922163836, + "learning_rate": 3.79990452539225e-05, + "loss": 0.6889, + "step": 1353 + }, + { + "epoch": 0.7221333333333333, + "grad_norm": 0.39602687125476754, + "learning_rate": 3.786355617847385e-05, + "loss": 0.6154, + "step": 1354 + }, + { + "epoch": 0.7226666666666667, + "grad_norm": 0.3749979309905685, + "learning_rate": 3.772825265187802e-05, + "loss": 0.663, + "step": 1355 + }, + { + "epoch": 0.7232, + "grad_norm": 0.3956799422689981, + "learning_rate": 3.759313507817196e-05, + "loss": 0.6809, + "step": 1356 + }, + { + "epoch": 0.7237333333333333, + "grad_norm": 0.40275062142513046, + "learning_rate": 3.7458203860837234e-05, + "loss": 0.625, + "step": 1357 + }, + { + "epoch": 0.7242666666666666, + "grad_norm": 0.40665924811794546, + "learning_rate": 3.732345940279893e-05, + "loss": 0.5881, + "step": 1358 + }, + { + "epoch": 0.7248, + "grad_norm": 0.3683174549367088, + "learning_rate": 3.7188902106424416e-05, + "loss": 0.6293, + "step": 1359 + }, + { + "epoch": 0.7253333333333334, + "grad_norm": 0.38720613592766406, + "learning_rate": 3.705453237352227e-05, + "loss": 0.6677, + "step": 1360 + }, + { + "epoch": 0.7258666666666667, + "grad_norm": 0.5036749622874261, + "learning_rate": 3.692035060534088e-05, + "loss": 0.6574, + "step": 1361 + }, + { + "epoch": 0.7264, + "grad_norm": 0.4493115634851116, + "learning_rate": 3.678635720256737e-05, + "loss": 0.6573, + "step": 1362 + }, + { + "epoch": 0.7269333333333333, + "grad_norm": 0.4535192687115761, + "learning_rate": 3.665255256532638e-05, + "loss": 0.5975, + "step": 1363 + }, + { + "epoch": 0.7274666666666667, + "grad_norm": 0.40060918718377536, + "learning_rate": 3.651893709317887e-05, + "loss": 0.6855, + "step": 1364 + }, + { + "epoch": 0.728, + "grad_norm": 0.3944709192374314, + "learning_rate": 3.638551118512089e-05, + "loss": 0.566, + "step": 1365 + }, + { + "epoch": 0.7285333333333334, + "grad_norm": 0.3726353302478949, + "learning_rate": 3.625227523958252e-05, + "loss": 0.6728, + "step": 1366 + }, + { + "epoch": 0.7290666666666666, + "grad_norm": 0.3949773732359607, + "learning_rate": 3.611922965442648e-05, + "loss": 0.6496, + "step": 1367 + }, + { + "epoch": 0.7296, + "grad_norm": 0.3609421259884968, + "learning_rate": 3.5986374826947066e-05, + "loss": 0.6039, + "step": 1368 + }, + { + "epoch": 0.7301333333333333, + "grad_norm": 0.4445430731854319, + "learning_rate": 3.5853711153868965e-05, + "loss": 0.6484, + "step": 1369 + }, + { + "epoch": 0.7306666666666667, + "grad_norm": 0.37093530483032444, + "learning_rate": 3.5721239031346066e-05, + "loss": 0.5676, + "step": 1370 + }, + { + "epoch": 0.7312, + "grad_norm": 0.3667309750893148, + "learning_rate": 3.558895885496023e-05, + "loss": 0.5689, + "step": 1371 + }, + { + "epoch": 0.7317333333333333, + "grad_norm": 0.4066934128932922, + "learning_rate": 3.545687101972013e-05, + "loss": 0.6381, + "step": 1372 + }, + { + "epoch": 0.7322666666666666, + "grad_norm": 0.436885499719995, + "learning_rate": 3.53249759200601e-05, + "loss": 0.6583, + "step": 1373 + }, + { + "epoch": 0.7328, + "grad_norm": 0.40790686873105114, + "learning_rate": 3.519327394983888e-05, + "loss": 0.6026, + "step": 1374 + }, + { + "epoch": 0.7333333333333333, + "grad_norm": 0.4355691333133161, + "learning_rate": 3.506176550233863e-05, + "loss": 0.5614, + "step": 1375 + }, + { + "epoch": 0.7338666666666667, + "grad_norm": 0.38422557716272554, + "learning_rate": 3.4930450970263485e-05, + "loss": 0.6302, + "step": 1376 + }, + { + "epoch": 0.7344, + "grad_norm": 0.4461172465345776, + "learning_rate": 3.479933074573858e-05, + "loss": 0.6151, + "step": 1377 + }, + { + "epoch": 0.7349333333333333, + "grad_norm": 0.47843635473682655, + "learning_rate": 3.46684052203088e-05, + "loss": 0.6479, + "step": 1378 + }, + { + "epoch": 0.7354666666666667, + "grad_norm": 0.42331070288726547, + "learning_rate": 3.4537674784937614e-05, + "loss": 0.636, + "step": 1379 + }, + { + "epoch": 0.736, + "grad_norm": 0.41940131364136446, + "learning_rate": 3.440713983000601e-05, + "loss": 0.6257, + "step": 1380 + }, + { + "epoch": 0.7365333333333334, + "grad_norm": 0.36547908260345435, + "learning_rate": 3.427680074531113e-05, + "loss": 0.5842, + "step": 1381 + }, + { + "epoch": 0.7370666666666666, + "grad_norm": 0.47445564541342283, + "learning_rate": 3.4146657920065285e-05, + "loss": 0.6197, + "step": 1382 + }, + { + "epoch": 0.7376, + "grad_norm": 0.3724715574385032, + "learning_rate": 3.401671174289469e-05, + "loss": 0.6063, + "step": 1383 + }, + { + "epoch": 0.7381333333333333, + "grad_norm": 0.37114586593610205, + "learning_rate": 3.388696260183832e-05, + "loss": 0.6413, + "step": 1384 + }, + { + "epoch": 0.7386666666666667, + "grad_norm": 0.4376502146324676, + "learning_rate": 3.3757410884346894e-05, + "loss": 0.6883, + "step": 1385 + }, + { + "epoch": 0.7392, + "grad_norm": 0.4416327899525981, + "learning_rate": 3.362805697728145e-05, + "loss": 0.6672, + "step": 1386 + }, + { + "epoch": 0.7397333333333334, + "grad_norm": 0.3946237440320865, + "learning_rate": 3.3498901266912396e-05, + "loss": 0.651, + "step": 1387 + }, + { + "epoch": 0.7402666666666666, + "grad_norm": 0.40310508460486283, + "learning_rate": 3.336994413891828e-05, + "loss": 0.6521, + "step": 1388 + }, + { + "epoch": 0.7408, + "grad_norm": 0.39745506515246737, + "learning_rate": 3.324118597838464e-05, + "loss": 0.6669, + "step": 1389 + }, + { + "epoch": 0.7413333333333333, + "grad_norm": 0.3829176817076023, + "learning_rate": 3.3112627169802946e-05, + "loss": 0.6258, + "step": 1390 + }, + { + "epoch": 0.7418666666666667, + "grad_norm": 0.39879227345399526, + "learning_rate": 3.298426809706928e-05, + "loss": 0.5979, + "step": 1391 + }, + { + "epoch": 0.7424, + "grad_norm": 0.37884755432785133, + "learning_rate": 3.285610914348332e-05, + "loss": 0.6177, + "step": 1392 + }, + { + "epoch": 0.7429333333333333, + "grad_norm": 0.3788256924171279, + "learning_rate": 3.2728150691747115e-05, + "loss": 0.6381, + "step": 1393 + }, + { + "epoch": 0.7434666666666667, + "grad_norm": 0.4748820117521675, + "learning_rate": 3.2600393123964113e-05, + "loss": 0.63, + "step": 1394 + }, + { + "epoch": 0.744, + "grad_norm": 0.3607655066738298, + "learning_rate": 3.2472836821637744e-05, + "loss": 0.6382, + "step": 1395 + }, + { + "epoch": 0.7445333333333334, + "grad_norm": 0.4079651593929963, + "learning_rate": 3.234548216567049e-05, + "loss": 0.6357, + "step": 1396 + }, + { + "epoch": 0.7450666666666667, + "grad_norm": 0.3825655035108801, + "learning_rate": 3.2218329536362704e-05, + "loss": 0.6376, + "step": 1397 + }, + { + "epoch": 0.7456, + "grad_norm": 0.3617531326612794, + "learning_rate": 3.209137931341143e-05, + "loss": 0.6141, + "step": 1398 + }, + { + "epoch": 0.7461333333333333, + "grad_norm": 0.4382082698441567, + "learning_rate": 3.196463187590929e-05, + "loss": 0.6647, + "step": 1399 + }, + { + "epoch": 0.7466666666666667, + "grad_norm": 0.3632824704187875, + "learning_rate": 3.1838087602343344e-05, + "loss": 0.6035, + "step": 1400 + }, + { + "epoch": 0.7472, + "grad_norm": 0.3560365471363562, + "learning_rate": 3.1711746870594086e-05, + "loss": 0.6091, + "step": 1401 + }, + { + "epoch": 0.7477333333333334, + "grad_norm": 0.3838294817984326, + "learning_rate": 3.158561005793402e-05, + "loss": 0.6705, + "step": 1402 + }, + { + "epoch": 0.7482666666666666, + "grad_norm": 0.3733398564081259, + "learning_rate": 3.145967754102691e-05, + "loss": 0.6039, + "step": 1403 + }, + { + "epoch": 0.7488, + "grad_norm": 0.48904916348930316, + "learning_rate": 3.1333949695926324e-05, + "loss": 0.6889, + "step": 1404 + }, + { + "epoch": 0.7493333333333333, + "grad_norm": 0.4073197262263337, + "learning_rate": 3.120842689807468e-05, + "loss": 0.5791, + "step": 1405 + }, + { + "epoch": 0.7498666666666667, + "grad_norm": 0.3869296702934028, + "learning_rate": 3.108310952230212e-05, + "loss": 0.6186, + "step": 1406 + }, + { + "epoch": 0.7504, + "grad_norm": 0.426240402154218, + "learning_rate": 3.0957997942825336e-05, + "loss": 0.6561, + "step": 1407 + }, + { + "epoch": 0.7509333333333333, + "grad_norm": 0.36780682032114076, + "learning_rate": 3.083309253324651e-05, + "loss": 0.6031, + "step": 1408 + }, + { + "epoch": 0.7514666666666666, + "grad_norm": 0.4272964553458606, + "learning_rate": 3.070839366655215e-05, + "loss": 0.6594, + "step": 1409 + }, + { + "epoch": 0.752, + "grad_norm": 0.5311925819299563, + "learning_rate": 3.058390171511196e-05, + "loss": 0.6507, + "step": 1410 + }, + { + "epoch": 0.7525333333333334, + "grad_norm": 0.42988680170294935, + "learning_rate": 3.0459617050677868e-05, + "loss": 0.6737, + "step": 1411 + }, + { + "epoch": 0.7530666666666667, + "grad_norm": 0.3495168859717038, + "learning_rate": 3.0335540044382694e-05, + "loss": 0.5725, + "step": 1412 + }, + { + "epoch": 0.7536, + "grad_norm": 0.3753366695725862, + "learning_rate": 3.021167106673928e-05, + "loss": 0.6434, + "step": 1413 + }, + { + "epoch": 0.7541333333333333, + "grad_norm": 0.5231402347632671, + "learning_rate": 3.008801048763914e-05, + "loss": 0.6249, + "step": 1414 + }, + { + "epoch": 0.7546666666666667, + "grad_norm": 0.3955684288042401, + "learning_rate": 2.996455867635155e-05, + "loss": 0.6417, + "step": 1415 + }, + { + "epoch": 0.7552, + "grad_norm": 0.45152547314678537, + "learning_rate": 2.9841316001522347e-05, + "loss": 0.6455, + "step": 1416 + }, + { + "epoch": 0.7557333333333334, + "grad_norm": 0.4215951935807362, + "learning_rate": 2.9718282831172883e-05, + "loss": 0.6326, + "step": 1417 + }, + { + "epoch": 0.7562666666666666, + "grad_norm": 0.43153923594719235, + "learning_rate": 2.9595459532698854e-05, + "loss": 0.5353, + "step": 1418 + }, + { + "epoch": 0.7568, + "grad_norm": 0.4259659351664963, + "learning_rate": 2.9472846472869298e-05, + "loss": 0.6094, + "step": 1419 + }, + { + "epoch": 0.7573333333333333, + "grad_norm": 0.4360547293724885, + "learning_rate": 2.9350444017825385e-05, + "loss": 0.6261, + "step": 1420 + }, + { + "epoch": 0.7578666666666667, + "grad_norm": 0.4638142563286862, + "learning_rate": 2.922825253307947e-05, + "loss": 0.6728, + "step": 1421 + }, + { + "epoch": 0.7584, + "grad_norm": 0.39738276131172434, + "learning_rate": 2.9106272383513835e-05, + "loss": 0.6649, + "step": 1422 + }, + { + "epoch": 0.7589333333333333, + "grad_norm": 0.37987971995978564, + "learning_rate": 2.898450393337977e-05, + "loss": 0.5753, + "step": 1423 + }, + { + "epoch": 0.7594666666666666, + "grad_norm": 0.4125297244986955, + "learning_rate": 2.8862947546296315e-05, + "loss": 0.604, + "step": 1424 + }, + { + "epoch": 0.76, + "grad_norm": 0.3981575505947327, + "learning_rate": 2.874160358524931e-05, + "loss": 0.6008, + "step": 1425 + }, + { + "epoch": 0.7605333333333333, + "grad_norm": 0.3698169667168355, + "learning_rate": 2.8620472412590228e-05, + "loss": 0.5976, + "step": 1426 + }, + { + "epoch": 0.7610666666666667, + "grad_norm": 0.42819852022057525, + "learning_rate": 2.8499554390035143e-05, + "loss": 0.6912, + "step": 1427 + }, + { + "epoch": 0.7616, + "grad_norm": 0.4026300079112191, + "learning_rate": 2.8378849878663628e-05, + "loss": 0.5829, + "step": 1428 + }, + { + "epoch": 0.7621333333333333, + "grad_norm": 0.46967039539446204, + "learning_rate": 2.8258359238917665e-05, + "loss": 0.6183, + "step": 1429 + }, + { + "epoch": 0.7626666666666667, + "grad_norm": 0.4272125240673037, + "learning_rate": 2.8138082830600554e-05, + "loss": 0.7072, + "step": 1430 + }, + { + "epoch": 0.7632, + "grad_norm": 0.5238563483964447, + "learning_rate": 2.8018021012875994e-05, + "loss": 0.7119, + "step": 1431 + }, + { + "epoch": 0.7637333333333334, + "grad_norm": 0.39904585175727725, + "learning_rate": 2.7898174144266732e-05, + "loss": 0.625, + "step": 1432 + }, + { + "epoch": 0.7642666666666666, + "grad_norm": 0.35931920466909617, + "learning_rate": 2.7778542582653744e-05, + "loss": 0.6495, + "step": 1433 + }, + { + "epoch": 0.7648, + "grad_norm": 0.48450376861339645, + "learning_rate": 2.7659126685275027e-05, + "loss": 0.6576, + "step": 1434 + }, + { + "epoch": 0.7653333333333333, + "grad_norm": 0.4441082592529733, + "learning_rate": 2.753992680872457e-05, + "loss": 0.6697, + "step": 1435 + }, + { + "epoch": 0.7658666666666667, + "grad_norm": 0.4135004461788684, + "learning_rate": 2.7420943308951284e-05, + "loss": 0.6178, + "step": 1436 + }, + { + "epoch": 0.7664, + "grad_norm": 0.4427996136818655, + "learning_rate": 2.7302176541257986e-05, + "loss": 0.6312, + "step": 1437 + }, + { + "epoch": 0.7669333333333334, + "grad_norm": 0.3784058040312355, + "learning_rate": 2.7183626860300247e-05, + "loss": 0.6212, + "step": 1438 + }, + { + "epoch": 0.7674666666666666, + "grad_norm": 0.3865991684709847, + "learning_rate": 2.7065294620085424e-05, + "loss": 0.6506, + "step": 1439 + }, + { + "epoch": 0.768, + "grad_norm": 0.43057174229672973, + "learning_rate": 2.6947180173971508e-05, + "loss": 0.6286, + "step": 1440 + }, + { + "epoch": 0.7685333333333333, + "grad_norm": 0.36209032917868483, + "learning_rate": 2.6829283874666233e-05, + "loss": 0.5779, + "step": 1441 + }, + { + "epoch": 0.7690666666666667, + "grad_norm": 0.46226915122683315, + "learning_rate": 2.6711606074225782e-05, + "loss": 0.6852, + "step": 1442 + }, + { + "epoch": 0.7696, + "grad_norm": 0.40311867497576176, + "learning_rate": 2.659414712405398e-05, + "loss": 0.6374, + "step": 1443 + }, + { + "epoch": 0.7701333333333333, + "grad_norm": 0.3896750404013572, + "learning_rate": 2.647690737490106e-05, + "loss": 0.6076, + "step": 1444 + }, + { + "epoch": 0.7706666666666667, + "grad_norm": 0.43408224275159285, + "learning_rate": 2.6359887176862718e-05, + "loss": 0.7034, + "step": 1445 + }, + { + "epoch": 0.7712, + "grad_norm": 0.4034619127151879, + "learning_rate": 2.6243086879379e-05, + "loss": 0.6662, + "step": 1446 + }, + { + "epoch": 0.7717333333333334, + "grad_norm": 0.40323793654487333, + "learning_rate": 2.6126506831233344e-05, + "loss": 0.6089, + "step": 1447 + }, + { + "epoch": 0.7722666666666667, + "grad_norm": 0.4443026691449622, + "learning_rate": 2.6010147380551475e-05, + "loss": 0.6625, + "step": 1448 + }, + { + "epoch": 0.7728, + "grad_norm": 0.40824863764176456, + "learning_rate": 2.5894008874800325e-05, + "loss": 0.5817, + "step": 1449 + }, + { + "epoch": 0.7733333333333333, + "grad_norm": 0.3985195292815999, + "learning_rate": 2.577809166078716e-05, + "loss": 0.6344, + "step": 1450 + }, + { + "epoch": 0.7738666666666667, + "grad_norm": 0.3976172596684671, + "learning_rate": 2.566239608465838e-05, + "loss": 0.6812, + "step": 1451 + }, + { + "epoch": 0.7744, + "grad_norm": 0.4536497404759498, + "learning_rate": 2.5546922491898495e-05, + "loss": 0.7157, + "step": 1452 + }, + { + "epoch": 0.7749333333333334, + "grad_norm": 0.37512175064821757, + "learning_rate": 2.543167122732918e-05, + "loss": 0.6133, + "step": 1453 + }, + { + "epoch": 0.7754666666666666, + "grad_norm": 0.45226940070095484, + "learning_rate": 2.5316642635108244e-05, + "loss": 0.6507, + "step": 1454 + }, + { + "epoch": 0.776, + "grad_norm": 0.4047218154179542, + "learning_rate": 2.5201837058728505e-05, + "loss": 0.6417, + "step": 1455 + }, + { + "epoch": 0.7765333333333333, + "grad_norm": 0.4043716939480558, + "learning_rate": 2.508725484101684e-05, + "loss": 0.6298, + "step": 1456 + }, + { + "epoch": 0.7770666666666667, + "grad_norm": 0.4811202338654842, + "learning_rate": 2.4972896324133144e-05, + "loss": 0.7504, + "step": 1457 + }, + { + "epoch": 0.7776, + "grad_norm": 0.40514939933246713, + "learning_rate": 2.485876184956928e-05, + "loss": 0.6279, + "step": 1458 + }, + { + "epoch": 0.7781333333333333, + "grad_norm": 0.39546233769838046, + "learning_rate": 2.4744851758148156e-05, + "loss": 0.6299, + "step": 1459 + }, + { + "epoch": 0.7786666666666666, + "grad_norm": 0.41192552360644447, + "learning_rate": 2.4631166390022574e-05, + "loss": 0.6328, + "step": 1460 + }, + { + "epoch": 0.7792, + "grad_norm": 0.386669832921117, + "learning_rate": 2.451770608467432e-05, + "loss": 0.6478, + "step": 1461 + }, + { + "epoch": 0.7797333333333333, + "grad_norm": 0.46866776580758834, + "learning_rate": 2.4404471180913058e-05, + "loss": 0.64, + "step": 1462 + }, + { + "epoch": 0.7802666666666667, + "grad_norm": 0.37375433442866596, + "learning_rate": 2.429146201687538e-05, + "loss": 0.5574, + "step": 1463 + }, + { + "epoch": 0.7808, + "grad_norm": 0.535446435298506, + "learning_rate": 2.417867893002387e-05, + "loss": 0.6927, + "step": 1464 + }, + { + "epoch": 0.7813333333333333, + "grad_norm": 0.4270250115123898, + "learning_rate": 2.4066122257145894e-05, + "loss": 0.6721, + "step": 1465 + }, + { + "epoch": 0.7818666666666667, + "grad_norm": 0.39523168119792357, + "learning_rate": 2.3953792334352787e-05, + "loss": 0.667, + "step": 1466 + }, + { + "epoch": 0.7824, + "grad_norm": 0.43788555689573, + "learning_rate": 2.3841689497078746e-05, + "loss": 0.6069, + "step": 1467 + }, + { + "epoch": 0.7829333333333334, + "grad_norm": 0.3539516179999365, + "learning_rate": 2.3729814080079816e-05, + "loss": 0.6349, + "step": 1468 + }, + { + "epoch": 0.7834666666666666, + "grad_norm": 0.3928005959926205, + "learning_rate": 2.361816641743303e-05, + "loss": 0.6294, + "step": 1469 + }, + { + "epoch": 0.784, + "grad_norm": 0.5619320827830976, + "learning_rate": 2.3506746842535242e-05, + "loss": 0.625, + "step": 1470 + }, + { + "epoch": 0.7845333333333333, + "grad_norm": 0.41719664105653065, + "learning_rate": 2.339555568810221e-05, + "loss": 0.6822, + "step": 1471 + }, + { + "epoch": 0.7850666666666667, + "grad_norm": 0.3721896133840847, + "learning_rate": 2.328459328616759e-05, + "loss": 0.61, + "step": 1472 + }, + { + "epoch": 0.7856, + "grad_norm": 0.3982359588916754, + "learning_rate": 2.3173859968081944e-05, + "loss": 0.6404, + "step": 1473 + }, + { + "epoch": 0.7861333333333334, + "grad_norm": 0.37681429170405933, + "learning_rate": 2.306335606451181e-05, + "loss": 0.6142, + "step": 1474 + }, + { + "epoch": 0.7866666666666666, + "grad_norm": 0.41225871232645167, + "learning_rate": 2.295308190543859e-05, + "loss": 0.6141, + "step": 1475 + }, + { + "epoch": 0.7872, + "grad_norm": 0.41223663323819554, + "learning_rate": 2.2843037820157675e-05, + "loss": 0.6203, + "step": 1476 + }, + { + "epoch": 0.7877333333333333, + "grad_norm": 0.41047135281712777, + "learning_rate": 2.2733224137277366e-05, + "loss": 0.6554, + "step": 1477 + }, + { + "epoch": 0.7882666666666667, + "grad_norm": 0.4509436252773787, + "learning_rate": 2.262364118471805e-05, + "loss": 0.6593, + "step": 1478 + }, + { + "epoch": 0.7888, + "grad_norm": 0.39428136142012704, + "learning_rate": 2.251428928971102e-05, + "loss": 0.5994, + "step": 1479 + }, + { + "epoch": 0.7893333333333333, + "grad_norm": 0.4179779554309077, + "learning_rate": 2.2405168778797646e-05, + "loss": 0.601, + "step": 1480 + }, + { + "epoch": 0.7898666666666667, + "grad_norm": 0.409533310574979, + "learning_rate": 2.2296279977828337e-05, + "loss": 0.6129, + "step": 1481 + }, + { + "epoch": 0.7904, + "grad_norm": 0.41836000821068886, + "learning_rate": 2.2187623211961562e-05, + "loss": 0.5881, + "step": 1482 + }, + { + "epoch": 0.7909333333333334, + "grad_norm": 0.3879860591336309, + "learning_rate": 2.2079198805662914e-05, + "loss": 0.6536, + "step": 1483 + }, + { + "epoch": 0.7914666666666667, + "grad_norm": 0.38425377493679347, + "learning_rate": 2.1971007082704164e-05, + "loss": 0.6057, + "step": 1484 + }, + { + "epoch": 0.792, + "grad_norm": 0.3759245494228626, + "learning_rate": 2.1863048366162208e-05, + "loss": 0.5599, + "step": 1485 + }, + { + "epoch": 0.7925333333333333, + "grad_norm": 0.39671834848468107, + "learning_rate": 2.1755322978418137e-05, + "loss": 0.6314, + "step": 1486 + }, + { + "epoch": 0.7930666666666667, + "grad_norm": 0.48380012072574685, + "learning_rate": 2.1647831241156302e-05, + "loss": 0.6187, + "step": 1487 + }, + { + "epoch": 0.7936, + "grad_norm": 0.40956690105938526, + "learning_rate": 2.1540573475363402e-05, + "loss": 0.6592, + "step": 1488 + }, + { + "epoch": 0.7941333333333334, + "grad_norm": 0.4448834724932321, + "learning_rate": 2.1433550001327373e-05, + "loss": 0.649, + "step": 1489 + }, + { + "epoch": 0.7946666666666666, + "grad_norm": 0.43295392506847596, + "learning_rate": 2.1326761138636553e-05, + "loss": 0.651, + "step": 1490 + }, + { + "epoch": 0.7952, + "grad_norm": 0.41656167554386153, + "learning_rate": 2.1220207206178688e-05, + "loss": 0.6, + "step": 1491 + }, + { + "epoch": 0.7957333333333333, + "grad_norm": 0.3846976133129346, + "learning_rate": 2.111388852214001e-05, + "loss": 0.5965, + "step": 1492 + }, + { + "epoch": 0.7962666666666667, + "grad_norm": 0.35629754003871555, + "learning_rate": 2.1007805404004242e-05, + "loss": 0.6437, + "step": 1493 + }, + { + "epoch": 0.7968, + "grad_norm": 0.4081664586908809, + "learning_rate": 2.0901958168551638e-05, + "loss": 0.6338, + "step": 1494 + }, + { + "epoch": 0.7973333333333333, + "grad_norm": 0.451342848418995, + "learning_rate": 2.0796347131858186e-05, + "loss": 0.6113, + "step": 1495 + }, + { + "epoch": 0.7978666666666666, + "grad_norm": 0.3597857961428527, + "learning_rate": 2.069097260929439e-05, + "loss": 0.611, + "step": 1496 + }, + { + "epoch": 0.7984, + "grad_norm": 0.4459748765452952, + "learning_rate": 2.058583491552465e-05, + "loss": 0.6767, + "step": 1497 + }, + { + "epoch": 0.7989333333333334, + "grad_norm": 0.37846205035587793, + "learning_rate": 2.048093436450603e-05, + "loss": 0.5782, + "step": 1498 + }, + { + "epoch": 0.7994666666666667, + "grad_norm": 0.39859055595750215, + "learning_rate": 2.0376271269487514e-05, + "loss": 0.6733, + "step": 1499 + }, + { + "epoch": 0.8, + "grad_norm": 0.4447503145214553, + "learning_rate": 2.027184594300898e-05, + "loss": 0.6725, + "step": 1500 + }, + { + "epoch": 0.8005333333333333, + "grad_norm": 0.555091204170937, + "learning_rate": 2.0167658696900317e-05, + "loss": 0.8254, + "step": 1501 + }, + { + "epoch": 0.8010666666666667, + "grad_norm": 0.3952512325592042, + "learning_rate": 2.0063709842280432e-05, + "loss": 0.6195, + "step": 1502 + }, + { + "epoch": 0.8016, + "grad_norm": 0.4041425848211964, + "learning_rate": 1.995999968955641e-05, + "loss": 0.6251, + "step": 1503 + }, + { + "epoch": 0.8021333333333334, + "grad_norm": 0.4182819245877915, + "learning_rate": 1.985652854842247e-05, + "loss": 0.5874, + "step": 1504 + }, + { + "epoch": 0.8026666666666666, + "grad_norm": 0.42370316417755666, + "learning_rate": 1.9753296727859195e-05, + "loss": 0.6149, + "step": 1505 + }, + { + "epoch": 0.8032, + "grad_norm": 0.4324233810250658, + "learning_rate": 1.9650304536132426e-05, + "loss": 0.6546, + "step": 1506 + }, + { + "epoch": 0.8037333333333333, + "grad_norm": 0.48220088050045584, + "learning_rate": 1.9547552280792524e-05, + "loss": 0.7157, + "step": 1507 + }, + { + "epoch": 0.8042666666666667, + "grad_norm": 0.4148371009594092, + "learning_rate": 1.9445040268673298e-05, + "loss": 0.5829, + "step": 1508 + }, + { + "epoch": 0.8048, + "grad_norm": 0.3853866518903104, + "learning_rate": 1.9342768805891178e-05, + "loss": 0.6083, + "step": 1509 + }, + { + "epoch": 0.8053333333333333, + "grad_norm": 0.37623562198874927, + "learning_rate": 1.9240738197844278e-05, + "loss": 0.612, + "step": 1510 + }, + { + "epoch": 0.8058666666666666, + "grad_norm": 0.37972097881100897, + "learning_rate": 1.9138948749211472e-05, + "loss": 0.6373, + "step": 1511 + }, + { + "epoch": 0.8064, + "grad_norm": 0.4554611346192583, + "learning_rate": 1.903740076395151e-05, + "loss": 0.6807, + "step": 1512 + }, + { + "epoch": 0.8069333333333333, + "grad_norm": 0.40726502755025723, + "learning_rate": 1.8936094545302095e-05, + "loss": 0.619, + "step": 1513 + }, + { + "epoch": 0.8074666666666667, + "grad_norm": 0.4200742311622154, + "learning_rate": 1.883503039577894e-05, + "loss": 0.6284, + "step": 1514 + }, + { + "epoch": 0.808, + "grad_norm": 0.43408031966201255, + "learning_rate": 1.8734208617174988e-05, + "loss": 0.6363, + "step": 1515 + }, + { + "epoch": 0.8085333333333333, + "grad_norm": 0.3844388790122424, + "learning_rate": 1.8633629510559314e-05, + "loss": 0.5677, + "step": 1516 + }, + { + "epoch": 0.8090666666666667, + "grad_norm": 0.4253501033138691, + "learning_rate": 1.8533293376276472e-05, + "loss": 0.667, + "step": 1517 + }, + { + "epoch": 0.8096, + "grad_norm": 0.7020869956627367, + "learning_rate": 1.8433200513945337e-05, + "loss": 0.6088, + "step": 1518 + }, + { + "epoch": 0.8101333333333334, + "grad_norm": 0.39396732741035073, + "learning_rate": 1.8333351222458407e-05, + "loss": 0.6442, + "step": 1519 + }, + { + "epoch": 0.8106666666666666, + "grad_norm": 0.3493693979867729, + "learning_rate": 1.8233745799980817e-05, + "loss": 0.548, + "step": 1520 + }, + { + "epoch": 0.8112, + "grad_norm": 0.35117778593054183, + "learning_rate": 1.8134384543949478e-05, + "loss": 0.5234, + "step": 1521 + }, + { + "epoch": 0.8117333333333333, + "grad_norm": 0.36619486228400655, + "learning_rate": 1.803526775107217e-05, + "loss": 0.6023, + "step": 1522 + }, + { + "epoch": 0.8122666666666667, + "grad_norm": 0.5099186718206056, + "learning_rate": 1.7936395717326704e-05, + "loss": 0.731, + "step": 1523 + }, + { + "epoch": 0.8128, + "grad_norm": 0.38852863824832645, + "learning_rate": 1.783776873795994e-05, + "loss": 0.6515, + "step": 1524 + }, + { + "epoch": 0.8133333333333334, + "grad_norm": 0.8558317095638946, + "learning_rate": 1.773938710748706e-05, + "loss": 0.5826, + "step": 1525 + }, + { + "epoch": 0.8138666666666666, + "grad_norm": 0.4568966509730956, + "learning_rate": 1.7641251119690505e-05, + "loss": 0.6245, + "step": 1526 + }, + { + "epoch": 0.8144, + "grad_norm": 0.3905492382856017, + "learning_rate": 1.754336106761927e-05, + "loss": 0.623, + "step": 1527 + }, + { + "epoch": 0.8149333333333333, + "grad_norm": 0.4484323000541753, + "learning_rate": 1.744571724358789e-05, + "loss": 0.6571, + "step": 1528 + }, + { + "epoch": 0.8154666666666667, + "grad_norm": 0.4143297902340561, + "learning_rate": 1.7348319939175637e-05, + "loss": 0.6794, + "step": 1529 + }, + { + "epoch": 0.816, + "grad_norm": 0.5571960141540871, + "learning_rate": 1.7251169445225657e-05, + "loss": 0.6572, + "step": 1530 + }, + { + "epoch": 0.8165333333333333, + "grad_norm": 0.4434340121566493, + "learning_rate": 1.715426605184407e-05, + "loss": 0.601, + "step": 1531 + }, + { + "epoch": 0.8170666666666667, + "grad_norm": 0.4575437450532476, + "learning_rate": 1.705761004839911e-05, + "loss": 0.5841, + "step": 1532 + }, + { + "epoch": 0.8176, + "grad_norm": 0.4155530391409412, + "learning_rate": 1.696120172352025e-05, + "loss": 0.5829, + "step": 1533 + }, + { + "epoch": 0.8181333333333334, + "grad_norm": 0.4867803274492459, + "learning_rate": 1.6865041365097435e-05, + "loss": 0.6954, + "step": 1534 + }, + { + "epoch": 0.8186666666666667, + "grad_norm": 0.4007212557744502, + "learning_rate": 1.676912926028007e-05, + "loss": 0.6377, + "step": 1535 + }, + { + "epoch": 0.8192, + "grad_norm": 0.47953864644031985, + "learning_rate": 1.6673465695476232e-05, + "loss": 0.6398, + "step": 1536 + }, + { + "epoch": 0.8197333333333333, + "grad_norm": 0.36216117236352097, + "learning_rate": 1.6578050956351886e-05, + "loss": 0.5955, + "step": 1537 + }, + { + "epoch": 0.8202666666666667, + "grad_norm": 0.4215038024581817, + "learning_rate": 1.6482885327829913e-05, + "loss": 0.5442, + "step": 1538 + }, + { + "epoch": 0.8208, + "grad_norm": 0.3888286315492147, + "learning_rate": 1.6387969094089316e-05, + "loss": 0.6086, + "step": 1539 + }, + { + "epoch": 0.8213333333333334, + "grad_norm": 0.36291020187514095, + "learning_rate": 1.6293302538564382e-05, + "loss": 0.669, + "step": 1540 + }, + { + "epoch": 0.8218666666666666, + "grad_norm": 0.4127933693889317, + "learning_rate": 1.619888594394382e-05, + "loss": 0.5936, + "step": 1541 + }, + { + "epoch": 0.8224, + "grad_norm": 0.4612420205644517, + "learning_rate": 1.6104719592169902e-05, + "loss": 0.6399, + "step": 1542 + }, + { + "epoch": 0.8229333333333333, + "grad_norm": 0.37059553028725717, + "learning_rate": 1.601080376443763e-05, + "loss": 0.6532, + "step": 1543 + }, + { + "epoch": 0.8234666666666667, + "grad_norm": 0.37171843223128626, + "learning_rate": 1.5917138741193973e-05, + "loss": 0.5681, + "step": 1544 + }, + { + "epoch": 0.824, + "grad_norm": 0.38126739055664743, + "learning_rate": 1.5823724802136865e-05, + "loss": 0.6638, + "step": 1545 + }, + { + "epoch": 0.8245333333333333, + "grad_norm": 0.501560182270239, + "learning_rate": 1.573056222621453e-05, + "loss": 0.654, + "step": 1546 + }, + { + "epoch": 0.8250666666666666, + "grad_norm": 0.35005270978531305, + "learning_rate": 1.5637651291624523e-05, + "loss": 0.6031, + "step": 1547 + }, + { + "epoch": 0.8256, + "grad_norm": 0.4579964934854775, + "learning_rate": 1.5544992275813053e-05, + "loss": 0.6843, + "step": 1548 + }, + { + "epoch": 0.8261333333333334, + "grad_norm": 0.4607942867233716, + "learning_rate": 1.5452585455473977e-05, + "loss": 0.7195, + "step": 1549 + }, + { + "epoch": 0.8266666666666667, + "grad_norm": 0.4311888167516736, + "learning_rate": 1.536043110654809e-05, + "loss": 0.6781, + "step": 1550 + }, + { + "epoch": 0.8272, + "grad_norm": 0.39518314956332207, + "learning_rate": 1.526852950422226e-05, + "loss": 0.6296, + "step": 1551 + }, + { + "epoch": 0.8277333333333333, + "grad_norm": 0.42462918863938204, + "learning_rate": 1.5176880922928616e-05, + "loss": 0.6714, + "step": 1552 + }, + { + "epoch": 0.8282666666666667, + "grad_norm": 0.5099449458583174, + "learning_rate": 1.5085485636343755e-05, + "loss": 0.6888, + "step": 1553 + }, + { + "epoch": 0.8288, + "grad_norm": 0.4305167865608094, + "learning_rate": 1.4994343917387854e-05, + "loss": 0.6933, + "step": 1554 + }, + { + "epoch": 0.8293333333333334, + "grad_norm": 0.3906628278194183, + "learning_rate": 1.4903456038223939e-05, + "loss": 0.6238, + "step": 1555 + }, + { + "epoch": 0.8298666666666666, + "grad_norm": 0.3973741892339613, + "learning_rate": 1.4812822270257009e-05, + "loss": 0.6051, + "step": 1556 + }, + { + "epoch": 0.8304, + "grad_norm": 0.4251466738972076, + "learning_rate": 1.4722442884133214e-05, + "loss": 0.6643, + "step": 1557 + }, + { + "epoch": 0.8309333333333333, + "grad_norm": 0.43910898280815147, + "learning_rate": 1.4632318149739177e-05, + "loss": 0.5949, + "step": 1558 + }, + { + "epoch": 0.8314666666666667, + "grad_norm": 0.34915883478379534, + "learning_rate": 1.454244833620102e-05, + "loss": 0.6349, + "step": 1559 + }, + { + "epoch": 0.832, + "grad_norm": 0.5213617644505394, + "learning_rate": 1.4452833711883628e-05, + "loss": 0.6377, + "step": 1560 + }, + { + "epoch": 0.8325333333333333, + "grad_norm": 0.3545430675105045, + "learning_rate": 1.4363474544389877e-05, + "loss": 0.5441, + "step": 1561 + }, + { + "epoch": 0.8330666666666666, + "grad_norm": 0.40508741535303194, + "learning_rate": 1.4274371100559791e-05, + "loss": 0.624, + "step": 1562 + }, + { + "epoch": 0.8336, + "grad_norm": 0.3592324997219386, + "learning_rate": 1.4185523646469822e-05, + "loss": 0.6798, + "step": 1563 + }, + { + "epoch": 0.8341333333333333, + "grad_norm": 0.42576466795036494, + "learning_rate": 1.409693244743192e-05, + "loss": 0.6369, + "step": 1564 + }, + { + "epoch": 0.8346666666666667, + "grad_norm": 0.42515842466851367, + "learning_rate": 1.4008597767992871e-05, + "loss": 0.6619, + "step": 1565 + }, + { + "epoch": 0.8352, + "grad_norm": 0.3684056355727429, + "learning_rate": 1.3920519871933424e-05, + "loss": 0.6439, + "step": 1566 + }, + { + "epoch": 0.8357333333333333, + "grad_norm": 0.40750521721877575, + "learning_rate": 1.3832699022267515e-05, + "loss": 0.6697, + "step": 1567 + }, + { + "epoch": 0.8362666666666667, + "grad_norm": 0.4212557915225309, + "learning_rate": 1.37451354812416e-05, + "loss": 0.6729, + "step": 1568 + }, + { + "epoch": 0.8368, + "grad_norm": 0.37503390518120405, + "learning_rate": 1.3657829510333654e-05, + "loss": 0.5766, + "step": 1569 + }, + { + "epoch": 0.8373333333333334, + "grad_norm": 0.6496010140480745, + "learning_rate": 1.3570781370252582e-05, + "loss": 0.5991, + "step": 1570 + }, + { + "epoch": 0.8378666666666666, + "grad_norm": 0.37552527204325087, + "learning_rate": 1.3483991320937306e-05, + "loss": 0.6557, + "step": 1571 + }, + { + "epoch": 0.8384, + "grad_norm": 0.3739385674179931, + "learning_rate": 1.339745962155613e-05, + "loss": 0.5871, + "step": 1572 + }, + { + "epoch": 0.8389333333333333, + "grad_norm": 0.5293550137074311, + "learning_rate": 1.3311186530505838e-05, + "loss": 0.6839, + "step": 1573 + }, + { + "epoch": 0.8394666666666667, + "grad_norm": 0.3677175062078189, + "learning_rate": 1.322517230541096e-05, + "loss": 0.6414, + "step": 1574 + }, + { + "epoch": 0.84, + "grad_norm": 0.548355690374935, + "learning_rate": 1.3139417203123027e-05, + "loss": 0.6905, + "step": 1575 + }, + { + "epoch": 0.8405333333333334, + "grad_norm": 0.46196465082695615, + "learning_rate": 1.30539214797198e-05, + "loss": 0.6747, + "step": 1576 + }, + { + "epoch": 0.8410666666666666, + "grad_norm": 0.41851828465891927, + "learning_rate": 1.2968685390504465e-05, + "loss": 0.6216, + "step": 1577 + }, + { + "epoch": 0.8416, + "grad_norm": 0.40981786976518636, + "learning_rate": 1.2883709190004955e-05, + "loss": 0.6417, + "step": 1578 + }, + { + "epoch": 0.8421333333333333, + "grad_norm": 0.3839440413324502, + "learning_rate": 1.2798993131973091e-05, + "loss": 0.6436, + "step": 1579 + }, + { + "epoch": 0.8426666666666667, + "grad_norm": 0.385038786223522, + "learning_rate": 1.2714537469383858e-05, + "loss": 0.5943, + "step": 1580 + }, + { + "epoch": 0.8432, + "grad_norm": 0.4053712383349885, + "learning_rate": 1.263034245443473e-05, + "loss": 0.6126, + "step": 1581 + }, + { + "epoch": 0.8437333333333333, + "grad_norm": 0.5242860113328237, + "learning_rate": 1.2546408338544769e-05, + "loss": 0.6922, + "step": 1582 + }, + { + "epoch": 0.8442666666666667, + "grad_norm": 0.4126345251310983, + "learning_rate": 1.2462735372353996e-05, + "loss": 0.6342, + "step": 1583 + }, + { + "epoch": 0.8448, + "grad_norm": 0.3822975004295326, + "learning_rate": 1.2379323805722576e-05, + "loss": 0.6097, + "step": 1584 + }, + { + "epoch": 0.8453333333333334, + "grad_norm": 0.426063098512969, + "learning_rate": 1.2296173887730123e-05, + "loss": 0.665, + "step": 1585 + }, + { + "epoch": 0.8458666666666667, + "grad_norm": 0.3834280671572431, + "learning_rate": 1.2213285866674905e-05, + "loss": 0.6156, + "step": 1586 + }, + { + "epoch": 0.8464, + "grad_norm": 0.5218622425452143, + "learning_rate": 1.2130659990073146e-05, + "loss": 0.6951, + "step": 1587 + }, + { + "epoch": 0.8469333333333333, + "grad_norm": 0.4311073635345907, + "learning_rate": 1.2048296504658207e-05, + "loss": 0.6073, + "step": 1588 + }, + { + "epoch": 0.8474666666666667, + "grad_norm": 0.41770082292991967, + "learning_rate": 1.1966195656380031e-05, + "loss": 0.623, + "step": 1589 + }, + { + "epoch": 0.848, + "grad_norm": 0.4788822598582512, + "learning_rate": 1.1884357690404158e-05, + "loss": 0.6017, + "step": 1590 + }, + { + "epoch": 0.8485333333333334, + "grad_norm": 0.45318611123423785, + "learning_rate": 1.1802782851111205e-05, + "loss": 0.5844, + "step": 1591 + }, + { + "epoch": 0.8490666666666666, + "grad_norm": 0.4666932424200912, + "learning_rate": 1.1721471382096027e-05, + "loss": 0.6738, + "step": 1592 + }, + { + "epoch": 0.8496, + "grad_norm": 0.32966958448507333, + "learning_rate": 1.1640423526166988e-05, + "loss": 0.5449, + "step": 1593 + }, + { + "epoch": 0.8501333333333333, + "grad_norm": 0.42879322785279267, + "learning_rate": 1.1559639525345311e-05, + "loss": 0.6513, + "step": 1594 + }, + { + "epoch": 0.8506666666666667, + "grad_norm": 0.4321907403217351, + "learning_rate": 1.1479119620864276e-05, + "loss": 0.5743, + "step": 1595 + }, + { + "epoch": 0.8512, + "grad_norm": 0.4805601721813143, + "learning_rate": 1.1398864053168534e-05, + "loss": 0.6635, + "step": 1596 + }, + { + "epoch": 0.8517333333333333, + "grad_norm": 0.3804008808713323, + "learning_rate": 1.1318873061913405e-05, + "loss": 0.5685, + "step": 1597 + }, + { + "epoch": 0.8522666666666666, + "grad_norm": 0.4520068873796711, + "learning_rate": 1.123914688596409e-05, + "loss": 0.6906, + "step": 1598 + }, + { + "epoch": 0.8528, + "grad_norm": 0.3858318101878712, + "learning_rate": 1.1159685763395111e-05, + "loss": 0.5936, + "step": 1599 + }, + { + "epoch": 0.8533333333333334, + "grad_norm": 0.41950896359501727, + "learning_rate": 1.1080489931489391e-05, + "loss": 0.5963, + "step": 1600 + }, + { + "epoch": 0.8538666666666667, + "grad_norm": 0.3617807491751094, + "learning_rate": 1.1001559626737756e-05, + "loss": 0.6464, + "step": 1601 + }, + { + "epoch": 0.8544, + "grad_norm": 0.3338632658928289, + "learning_rate": 1.0922895084838037e-05, + "loss": 0.5682, + "step": 1602 + }, + { + "epoch": 0.8549333333333333, + "grad_norm": 0.44834261317165985, + "learning_rate": 1.0844496540694515e-05, + "loss": 0.6806, + "step": 1603 + }, + { + "epoch": 0.8554666666666667, + "grad_norm": 0.38605134729581864, + "learning_rate": 1.0766364228417148e-05, + "loss": 0.639, + "step": 1604 + }, + { + "epoch": 0.856, + "grad_norm": 0.474858212815285, + "learning_rate": 1.0688498381320855e-05, + "loss": 0.6611, + "step": 1605 + }, + { + "epoch": 0.8565333333333334, + "grad_norm": 0.36827023902002454, + "learning_rate": 1.0610899231924886e-05, + "loss": 0.5788, + "step": 1606 + }, + { + "epoch": 0.8570666666666666, + "grad_norm": 0.3681513757299373, + "learning_rate": 1.0533567011952094e-05, + "loss": 0.6167, + "step": 1607 + }, + { + "epoch": 0.8576, + "grad_norm": 0.5328726228914815, + "learning_rate": 1.045650195232819e-05, + "loss": 0.5887, + "step": 1608 + }, + { + "epoch": 0.8581333333333333, + "grad_norm": 0.40482039710821704, + "learning_rate": 1.0379704283181179e-05, + "loss": 0.5998, + "step": 1609 + }, + { + "epoch": 0.8586666666666667, + "grad_norm": 0.3471325558361177, + "learning_rate": 1.0303174233840528e-05, + "loss": 0.6261, + "step": 1610 + }, + { + "epoch": 0.8592, + "grad_norm": 0.4708026052451067, + "learning_rate": 1.0226912032836611e-05, + "loss": 0.656, + "step": 1611 + }, + { + "epoch": 0.8597333333333333, + "grad_norm": 0.48267555668015905, + "learning_rate": 1.0150917907899926e-05, + "loss": 0.6586, + "step": 1612 + }, + { + "epoch": 0.8602666666666666, + "grad_norm": 0.39935894457820786, + "learning_rate": 1.007519208596045e-05, + "loss": 0.6007, + "step": 1613 + }, + { + "epoch": 0.8608, + "grad_norm": 0.3707866724890578, + "learning_rate": 9.999734793146998e-06, + "loss": 0.6012, + "step": 1614 + }, + { + "epoch": 0.8613333333333333, + "grad_norm": 0.4220376448592463, + "learning_rate": 9.924546254786493e-06, + "loss": 0.6754, + "step": 1615 + }, + { + "epoch": 0.8618666666666667, + "grad_norm": 0.41589827662354534, + "learning_rate": 9.849626695403324e-06, + "loss": 0.6374, + "step": 1616 + }, + { + "epoch": 0.8624, + "grad_norm": 0.4144330890081341, + "learning_rate": 9.774976338718677e-06, + "loss": 0.6389, + "step": 1617 + }, + { + "epoch": 0.8629333333333333, + "grad_norm": 0.44259278148951, + "learning_rate": 9.700595407649805e-06, + "loss": 0.6401, + "step": 1618 + }, + { + "epoch": 0.8634666666666667, + "grad_norm": 0.3668074154530959, + "learning_rate": 9.62648412430951e-06, + "loss": 0.6189, + "step": 1619 + }, + { + "epoch": 0.864, + "grad_norm": 0.42051676368538643, + "learning_rate": 9.552642710005299e-06, + "loss": 0.6059, + "step": 1620 + }, + { + "epoch": 0.8645333333333334, + "grad_norm": 0.4198663098402038, + "learning_rate": 9.479071385238892e-06, + "loss": 0.6102, + "step": 1621 + }, + { + "epoch": 0.8650666666666667, + "grad_norm": 0.3887042775608677, + "learning_rate": 9.40577036970538e-06, + "loss": 0.6298, + "step": 1622 + }, + { + "epoch": 0.8656, + "grad_norm": 0.41671468252084226, + "learning_rate": 9.332739882292752e-06, + "loss": 0.6175, + "step": 1623 + }, + { + "epoch": 0.8661333333333333, + "grad_norm": 0.5725914639247383, + "learning_rate": 9.259980141081115e-06, + "loss": 0.7004, + "step": 1624 + }, + { + "epoch": 0.8666666666666667, + "grad_norm": 0.41570310691335594, + "learning_rate": 9.187491363342093e-06, + "loss": 0.6219, + "step": 1625 + }, + { + "epoch": 0.8672, + "grad_norm": 0.37678986724126673, + "learning_rate": 9.115273765538202e-06, + "loss": 0.6325, + "step": 1626 + }, + { + "epoch": 0.8677333333333334, + "grad_norm": 0.5087701518502271, + "learning_rate": 9.043327563322112e-06, + "loss": 0.6035, + "step": 1627 + }, + { + "epoch": 0.8682666666666666, + "grad_norm": 0.5968366253849593, + "learning_rate": 8.971652971536148e-06, + "loss": 0.6302, + "step": 1628 + }, + { + "epoch": 0.8688, + "grad_norm": 0.41294872983364506, + "learning_rate": 8.900250204211514e-06, + "loss": 0.618, + "step": 1629 + }, + { + "epoch": 0.8693333333333333, + "grad_norm": 0.40689227115744747, + "learning_rate": 8.829119474567671e-06, + "loss": 0.6187, + "step": 1630 + }, + { + "epoch": 0.8698666666666667, + "grad_norm": 0.41868893419471087, + "learning_rate": 8.758260995011825e-06, + "loss": 0.58, + "step": 1631 + }, + { + "epoch": 0.8704, + "grad_norm": 0.37781440875663735, + "learning_rate": 8.687674977138116e-06, + "loss": 0.6063, + "step": 1632 + }, + { + "epoch": 0.8709333333333333, + "grad_norm": 0.4231537435489686, + "learning_rate": 8.617361631727138e-06, + "loss": 0.6347, + "step": 1633 + }, + { + "epoch": 0.8714666666666666, + "grad_norm": 0.4813253066367813, + "learning_rate": 8.547321168745193e-06, + "loss": 0.7026, + "step": 1634 + }, + { + "epoch": 0.872, + "grad_norm": 0.37495130821317624, + "learning_rate": 8.47755379734373e-06, + "loss": 0.5624, + "step": 1635 + }, + { + "epoch": 0.8725333333333334, + "grad_norm": 0.39158733120979616, + "learning_rate": 8.408059725858719e-06, + "loss": 0.638, + "step": 1636 + }, + { + "epoch": 0.8730666666666667, + "grad_norm": 0.4240090134696765, + "learning_rate": 8.338839161809997e-06, + "loss": 0.6899, + "step": 1637 + }, + { + "epoch": 0.8736, + "grad_norm": 0.4494047917779341, + "learning_rate": 8.269892311900696e-06, + "loss": 0.6973, + "step": 1638 + }, + { + "epoch": 0.8741333333333333, + "grad_norm": 0.38717966988032976, + "learning_rate": 8.201219382016556e-06, + "loss": 0.6946, + "step": 1639 + }, + { + "epoch": 0.8746666666666667, + "grad_norm": 0.40085836155970156, + "learning_rate": 8.132820577225387e-06, + "loss": 0.5721, + "step": 1640 + }, + { + "epoch": 0.8752, + "grad_norm": 0.46835619449527544, + "learning_rate": 8.064696101776358e-06, + "loss": 0.6198, + "step": 1641 + }, + { + "epoch": 0.8757333333333334, + "grad_norm": 0.44389585227029554, + "learning_rate": 7.996846159099557e-06, + "loss": 0.6503, + "step": 1642 + }, + { + "epoch": 0.8762666666666666, + "grad_norm": 0.41901331171884865, + "learning_rate": 7.929270951805178e-06, + "loss": 0.6522, + "step": 1643 + }, + { + "epoch": 0.8768, + "grad_norm": 0.45930967963327746, + "learning_rate": 7.861970681683051e-06, + "loss": 0.5742, + "step": 1644 + }, + { + "epoch": 0.8773333333333333, + "grad_norm": 0.4149968259909745, + "learning_rate": 7.794945549701993e-06, + "loss": 0.6598, + "step": 1645 + }, + { + "epoch": 0.8778666666666667, + "grad_norm": 0.4684133133426712, + "learning_rate": 7.728195756009204e-06, + "loss": 0.6798, + "step": 1646 + }, + { + "epoch": 0.8784, + "grad_norm": 0.40471835448358673, + "learning_rate": 7.661721499929753e-06, + "loss": 0.5783, + "step": 1647 + }, + { + "epoch": 0.8789333333333333, + "grad_norm": 0.39560149646464277, + "learning_rate": 7.595522979965819e-06, + "loss": 0.6177, + "step": 1648 + }, + { + "epoch": 0.8794666666666666, + "grad_norm": 0.40033366704773343, + "learning_rate": 7.529600393796232e-06, + "loss": 0.6688, + "step": 1649 + }, + { + "epoch": 0.88, + "grad_norm": 0.409847818506166, + "learning_rate": 7.463953938275858e-06, + "loss": 0.6238, + "step": 1650 + }, + { + "epoch": 0.8805333333333333, + "grad_norm": 0.4150779849093329, + "learning_rate": 7.3985838094349444e-06, + "loss": 0.6336, + "step": 1651 + }, + { + "epoch": 0.8810666666666667, + "grad_norm": 0.864398242995123, + "learning_rate": 7.333490202478666e-06, + "loss": 0.6764, + "step": 1652 + }, + { + "epoch": 0.8816, + "grad_norm": 0.5700955932577777, + "learning_rate": 7.2686733117863784e-06, + "loss": 0.6777, + "step": 1653 + }, + { + "epoch": 0.8821333333333333, + "grad_norm": 0.4018859847440789, + "learning_rate": 7.204133330911178e-06, + "loss": 0.6372, + "step": 1654 + }, + { + "epoch": 0.8826666666666667, + "grad_norm": 0.5595919704007116, + "learning_rate": 7.1398704525792e-06, + "loss": 0.6671, + "step": 1655 + }, + { + "epoch": 0.8832, + "grad_norm": 0.44744266480441175, + "learning_rate": 7.07588486868922e-06, + "loss": 0.6112, + "step": 1656 + }, + { + "epoch": 0.8837333333333334, + "grad_norm": 0.3344518507688403, + "learning_rate": 7.012176770311862e-06, + "loss": 0.558, + "step": 1657 + }, + { + "epoch": 0.8842666666666666, + "grad_norm": 0.4113097200970075, + "learning_rate": 6.948746347689183e-06, + "loss": 0.6259, + "step": 1658 + }, + { + "epoch": 0.8848, + "grad_norm": 0.37460528555372463, + "learning_rate": 6.8855937902340576e-06, + "loss": 0.5764, + "step": 1659 + }, + { + "epoch": 0.8853333333333333, + "grad_norm": 0.5173789381383239, + "learning_rate": 6.8227192865295995e-06, + "loss": 0.6869, + "step": 1660 + }, + { + "epoch": 0.8858666666666667, + "grad_norm": 0.38227613922471, + "learning_rate": 6.760123024328624e-06, + "loss": 0.6505, + "step": 1661 + }, + { + "epoch": 0.8864, + "grad_norm": 0.38493026301224964, + "learning_rate": 6.6978051905530855e-06, + "loss": 0.5685, + "step": 1662 + }, + { + "epoch": 0.8869333333333334, + "grad_norm": 0.4490857230447103, + "learning_rate": 6.635765971293484e-06, + "loss": 0.6946, + "step": 1663 + }, + { + "epoch": 0.8874666666666666, + "grad_norm": 0.41304345370614004, + "learning_rate": 6.5740055518083375e-06, + "loss": 0.6859, + "step": 1664 + }, + { + "epoch": 0.888, + "grad_norm": 0.40744245874632135, + "learning_rate": 6.512524116523633e-06, + "loss": 0.588, + "step": 1665 + }, + { + "epoch": 0.8885333333333333, + "grad_norm": 0.44239687927494337, + "learning_rate": 6.451321849032288e-06, + "loss": 0.5929, + "step": 1666 + }, + { + "epoch": 0.8890666666666667, + "grad_norm": 0.43987486402033016, + "learning_rate": 6.390398932093555e-06, + "loss": 0.6222, + "step": 1667 + }, + { + "epoch": 0.8896, + "grad_norm": 0.3721311434397407, + "learning_rate": 6.329755547632499e-06, + "loss": 0.579, + "step": 1668 + }, + { + "epoch": 0.8901333333333333, + "grad_norm": 0.3904356335284676, + "learning_rate": 6.269391876739495e-06, + "loss": 0.6187, + "step": 1669 + }, + { + "epoch": 0.8906666666666667, + "grad_norm": 0.4871806474824309, + "learning_rate": 6.209308099669597e-06, + "loss": 0.6944, + "step": 1670 + }, + { + "epoch": 0.8912, + "grad_norm": 0.4021699333747135, + "learning_rate": 6.149504395842087e-06, + "loss": 0.607, + "step": 1671 + }, + { + "epoch": 0.8917333333333334, + "grad_norm": 0.4246269131950553, + "learning_rate": 6.089980943839924e-06, + "loss": 0.6393, + "step": 1672 + }, + { + "epoch": 0.8922666666666667, + "grad_norm": 0.45995076739188906, + "learning_rate": 6.030737921409169e-06, + "loss": 0.5958, + "step": 1673 + }, + { + "epoch": 0.8928, + "grad_norm": 0.49114507115566153, + "learning_rate": 5.971775505458444e-06, + "loss": 0.6515, + "step": 1674 + }, + { + "epoch": 0.8933333333333333, + "grad_norm": 0.43786087916568106, + "learning_rate": 5.913093872058528e-06, + "loss": 0.683, + "step": 1675 + }, + { + "epoch": 0.8938666666666667, + "grad_norm": 0.45967511384777227, + "learning_rate": 5.854693196441641e-06, + "loss": 0.638, + "step": 1676 + }, + { + "epoch": 0.8944, + "grad_norm": 0.44382110536748526, + "learning_rate": 5.7965736530010916e-06, + "loss": 0.6734, + "step": 1677 + }, + { + "epoch": 0.8949333333333334, + "grad_norm": 0.36098842399148445, + "learning_rate": 5.738735415290642e-06, + "loss": 0.5527, + "step": 1678 + }, + { + "epoch": 0.8954666666666666, + "grad_norm": 0.4384488298020481, + "learning_rate": 5.681178656024055e-06, + "loss": 0.708, + "step": 1679 + }, + { + "epoch": 0.896, + "grad_norm": 0.4605140379415962, + "learning_rate": 5.623903547074549e-06, + "loss": 0.648, + "step": 1680 + }, + { + "epoch": 0.8965333333333333, + "grad_norm": 0.45270005595938945, + "learning_rate": 5.566910259474289e-06, + "loss": 0.6898, + "step": 1681 + }, + { + "epoch": 0.8970666666666667, + "grad_norm": 0.4106034614437745, + "learning_rate": 5.510198963413881e-06, + "loss": 0.6174, + "step": 1682 + }, + { + "epoch": 0.8976, + "grad_norm": 0.38222181067866917, + "learning_rate": 5.453769828241872e-06, + "loss": 0.5907, + "step": 1683 + }, + { + "epoch": 0.8981333333333333, + "grad_norm": 0.37655596401410873, + "learning_rate": 5.397623022464226e-06, + "loss": 0.635, + "step": 1684 + }, + { + "epoch": 0.8986666666666666, + "grad_norm": 0.4439893541682315, + "learning_rate": 5.341758713743828e-06, + "loss": 0.6316, + "step": 1685 + }, + { + "epoch": 0.8992, + "grad_norm": 0.4650227804386738, + "learning_rate": 5.286177068899989e-06, + "loss": 0.6323, + "step": 1686 + }, + { + "epoch": 0.8997333333333334, + "grad_norm": 0.38012305655100215, + "learning_rate": 5.230878253907912e-06, + "loss": 0.6356, + "step": 1687 + }, + { + "epoch": 0.9002666666666667, + "grad_norm": 0.43153803190878853, + "learning_rate": 5.175862433898282e-06, + "loss": 0.612, + "step": 1688 + }, + { + "epoch": 0.9008, + "grad_norm": 0.46046853955822276, + "learning_rate": 5.121129773156663e-06, + "loss": 0.5875, + "step": 1689 + }, + { + "epoch": 0.9013333333333333, + "grad_norm": 0.42983289954702175, + "learning_rate": 5.066680435123106e-06, + "loss": 0.6309, + "step": 1690 + }, + { + "epoch": 0.9018666666666667, + "grad_norm": 0.35262408600859013, + "learning_rate": 5.012514582391592e-06, + "loss": 0.5742, + "step": 1691 + }, + { + "epoch": 0.9024, + "grad_norm": 0.43184209643963073, + "learning_rate": 4.95863237670956e-06, + "loss": 0.6638, + "step": 1692 + }, + { + "epoch": 0.9029333333333334, + "grad_norm": 0.4889204934449891, + "learning_rate": 4.905033978977491e-06, + "loss": 0.6544, + "step": 1693 + }, + { + "epoch": 0.9034666666666666, + "grad_norm": 0.42236336939209945, + "learning_rate": 4.851719549248301e-06, + "loss": 0.6422, + "step": 1694 + }, + { + "epoch": 0.904, + "grad_norm": 0.4567245542144497, + "learning_rate": 4.798689246727006e-06, + "loss": 0.6735, + "step": 1695 + }, + { + "epoch": 0.9045333333333333, + "grad_norm": 0.4357239280931403, + "learning_rate": 4.745943229770122e-06, + "loss": 0.6482, + "step": 1696 + }, + { + "epoch": 0.9050666666666667, + "grad_norm": 0.34538968516681984, + "learning_rate": 4.693481655885257e-06, + "loss": 0.5838, + "step": 1697 + }, + { + "epoch": 0.9056, + "grad_norm": 0.3746841707569123, + "learning_rate": 4.641304681730641e-06, + "loss": 0.5988, + "step": 1698 + }, + { + "epoch": 0.9061333333333333, + "grad_norm": 0.4022329001631021, + "learning_rate": 4.58941246311464e-06, + "loss": 0.6291, + "step": 1699 + }, + { + "epoch": 0.9066666666666666, + "grad_norm": 0.40013953435417127, + "learning_rate": 4.537805154995278e-06, + "loss": 0.5814, + "step": 1700 + }, + { + "epoch": 0.9072, + "grad_norm": 0.40005018623772687, + "learning_rate": 4.486482911479839e-06, + "loss": 0.5788, + "step": 1701 + }, + { + "epoch": 0.9077333333333333, + "grad_norm": 0.35388380651555296, + "learning_rate": 4.435445885824285e-06, + "loss": 0.5536, + "step": 1702 + }, + { + "epoch": 0.9082666666666667, + "grad_norm": 0.3703089037011794, + "learning_rate": 4.384694230432984e-06, + "loss": 0.5869, + "step": 1703 + }, + { + "epoch": 0.9088, + "grad_norm": 0.3622268087447349, + "learning_rate": 4.3342280968580285e-06, + "loss": 0.5807, + "step": 1704 + }, + { + "epoch": 0.9093333333333333, + "grad_norm": 0.396655726300398, + "learning_rate": 4.2840476357989825e-06, + "loss": 0.6126, + "step": 1705 + }, + { + "epoch": 0.9098666666666667, + "grad_norm": 0.35351844522496834, + "learning_rate": 4.2341529971023255e-06, + "loss": 0.5858, + "step": 1706 + }, + { + "epoch": 0.9104, + "grad_norm": 0.37817326714801963, + "learning_rate": 4.184544329761009e-06, + "loss": 0.6063, + "step": 1707 + }, + { + "epoch": 0.9109333333333334, + "grad_norm": 0.4276057921437013, + "learning_rate": 4.135221781914034e-06, + "loss": 0.6533, + "step": 1708 + }, + { + "epoch": 0.9114666666666666, + "grad_norm": 0.4328477312558595, + "learning_rate": 4.0861855008460405e-06, + "loss": 0.6599, + "step": 1709 + }, + { + "epoch": 0.912, + "grad_norm": 0.3958361891193586, + "learning_rate": 4.037435632986786e-06, + "loss": 0.6459, + "step": 1710 + }, + { + "epoch": 0.9125333333333333, + "grad_norm": 0.41189001066763725, + "learning_rate": 3.988972323910778e-06, + "loss": 0.6646, + "step": 1711 + }, + { + "epoch": 0.9130666666666667, + "grad_norm": 0.3675769278662478, + "learning_rate": 3.9407957183368095e-06, + "loss": 0.5578, + "step": 1712 + }, + { + "epoch": 0.9136, + "grad_norm": 0.4985512195488047, + "learning_rate": 3.892905960127546e-06, + "loss": 0.6103, + "step": 1713 + }, + { + "epoch": 0.9141333333333334, + "grad_norm": 0.4078016854637492, + "learning_rate": 3.845303192289074e-06, + "loss": 0.6205, + "step": 1714 + }, + { + "epoch": 0.9146666666666666, + "grad_norm": 0.4789924835755939, + "learning_rate": 3.797987556970495e-06, + "loss": 0.6365, + "step": 1715 + }, + { + "epoch": 0.9152, + "grad_norm": 0.42676372270114, + "learning_rate": 3.750959195463466e-06, + "loss": 0.6447, + "step": 1716 + }, + { + "epoch": 0.9157333333333333, + "grad_norm": 0.37019111540882965, + "learning_rate": 3.7042182482018075e-06, + "loss": 0.6, + "step": 1717 + }, + { + "epoch": 0.9162666666666667, + "grad_norm": 0.4344255348252073, + "learning_rate": 3.6577648547611033e-06, + "loss": 0.6233, + "step": 1718 + }, + { + "epoch": 0.9168, + "grad_norm": 0.5098592399994804, + "learning_rate": 3.611599153858214e-06, + "loss": 0.7041, + "step": 1719 + }, + { + "epoch": 0.9173333333333333, + "grad_norm": 0.40412614594107715, + "learning_rate": 3.565721283350931e-06, + "loss": 0.6506, + "step": 1720 + }, + { + "epoch": 0.9178666666666667, + "grad_norm": 0.4017842528798377, + "learning_rate": 3.5201313802375456e-06, + "loss": 0.6342, + "step": 1721 + }, + { + "epoch": 0.9184, + "grad_norm": 0.4186885329532945, + "learning_rate": 3.4748295806564356e-06, + "loss": 0.6644, + "step": 1722 + }, + { + "epoch": 0.9189333333333334, + "grad_norm": 0.39614510568843886, + "learning_rate": 3.4298160198856568e-06, + "loss": 0.6043, + "step": 1723 + }, + { + "epoch": 0.9194666666666667, + "grad_norm": 0.4407424852778707, + "learning_rate": 3.3850908323424967e-06, + "loss": 0.661, + "step": 1724 + }, + { + "epoch": 0.92, + "grad_norm": 0.3947691132853167, + "learning_rate": 3.3406541515832003e-06, + "loss": 0.5734, + "step": 1725 + }, + { + "epoch": 0.9205333333333333, + "grad_norm": 0.4102618532437786, + "learning_rate": 3.296506110302422e-06, + "loss": 0.5752, + "step": 1726 + }, + { + "epoch": 0.9210666666666667, + "grad_norm": 0.39763954077320046, + "learning_rate": 3.252646840332918e-06, + "loss": 0.6331, + "step": 1727 + }, + { + "epoch": 0.9216, + "grad_norm": 0.38859094017171963, + "learning_rate": 3.209076472645112e-06, + "loss": 0.554, + "step": 1728 + }, + { + "epoch": 0.9221333333333334, + "grad_norm": 0.4588451672372624, + "learning_rate": 3.1657951373467497e-06, + "loss": 0.6015, + "step": 1729 + }, + { + "epoch": 0.9226666666666666, + "grad_norm": 0.394617664738612, + "learning_rate": 3.1228029636824475e-06, + "loss": 0.6332, + "step": 1730 + }, + { + "epoch": 0.9232, + "grad_norm": 0.40402859773708183, + "learning_rate": 3.0801000800333877e-06, + "loss": 0.6502, + "step": 1731 + }, + { + "epoch": 0.9237333333333333, + "grad_norm": 0.3620964096863013, + "learning_rate": 3.037686613916857e-06, + "loss": 0.5847, + "step": 1732 + }, + { + "epoch": 0.9242666666666667, + "grad_norm": 0.39653047718654283, + "learning_rate": 2.995562691985898e-06, + "loss": 0.5759, + "step": 1733 + }, + { + "epoch": 0.9248, + "grad_norm": 0.37820773086511067, + "learning_rate": 2.9537284400289355e-06, + "loss": 0.5757, + "step": 1734 + }, + { + "epoch": 0.9253333333333333, + "grad_norm": 0.48354202592923184, + "learning_rate": 2.912183982969385e-06, + "loss": 0.633, + "step": 1735 + }, + { + "epoch": 0.9258666666666666, + "grad_norm": 0.4088213449252369, + "learning_rate": 2.8709294448653225e-06, + "loss": 0.6342, + "step": 1736 + }, + { + "epoch": 0.9264, + "grad_norm": 0.3785698115850805, + "learning_rate": 2.8299649489090475e-06, + "loss": 0.6574, + "step": 1737 + }, + { + "epoch": 0.9269333333333334, + "grad_norm": 0.3841581688458352, + "learning_rate": 2.789290617426765e-06, + "loss": 0.5831, + "step": 1738 + }, + { + "epoch": 0.9274666666666667, + "grad_norm": 0.4175965446852087, + "learning_rate": 2.748906571878207e-06, + "loss": 0.6021, + "step": 1739 + }, + { + "epoch": 0.928, + "grad_norm": 0.40051758545421356, + "learning_rate": 2.708812932856253e-06, + "loss": 0.6028, + "step": 1740 + }, + { + "epoch": 0.9285333333333333, + "grad_norm": 0.37273183672886745, + "learning_rate": 2.6690098200866098e-06, + "loss": 0.577, + "step": 1741 + }, + { + "epoch": 0.9290666666666667, + "grad_norm": 0.3578309465440957, + "learning_rate": 2.6294973524274125e-06, + "loss": 0.564, + "step": 1742 + }, + { + "epoch": 0.9296, + "grad_norm": 0.4164954369827976, + "learning_rate": 2.590275647868867e-06, + "loss": 0.6235, + "step": 1743 + }, + { + "epoch": 0.9301333333333334, + "grad_norm": 0.40703600699695486, + "learning_rate": 2.551344823532964e-06, + "loss": 0.627, + "step": 1744 + }, + { + "epoch": 0.9306666666666666, + "grad_norm": 0.4308613221237642, + "learning_rate": 2.5127049956730207e-06, + "loss": 0.6092, + "step": 1745 + }, + { + "epoch": 0.9312, + "grad_norm": 0.40309993952556944, + "learning_rate": 2.4743562796734622e-06, + "loss": 0.6486, + "step": 1746 + }, + { + "epoch": 0.9317333333333333, + "grad_norm": 0.39045448266587895, + "learning_rate": 2.436298790049363e-06, + "loss": 0.6294, + "step": 1747 + }, + { + "epoch": 0.9322666666666667, + "grad_norm": 0.3871387157818062, + "learning_rate": 2.3985326404461604e-06, + "loss": 0.6227, + "step": 1748 + }, + { + "epoch": 0.9328, + "grad_norm": 0.4121871503169258, + "learning_rate": 2.3610579436393e-06, + "loss": 0.6177, + "step": 1749 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 0.5563305241966505, + "learning_rate": 2.3238748115339324e-06, + "loss": 0.676, + "step": 1750 + }, + { + "epoch": 0.9338666666666666, + "grad_norm": 0.44233128189965715, + "learning_rate": 2.286983355164529e-06, + "loss": 0.6635, + "step": 1751 + }, + { + "epoch": 0.9344, + "grad_norm": 0.43228959200325306, + "learning_rate": 2.250383684694579e-06, + "loss": 0.6595, + "step": 1752 + }, + { + "epoch": 0.9349333333333333, + "grad_norm": 0.3882914765123684, + "learning_rate": 2.2140759094162467e-06, + "loss": 0.5887, + "step": 1753 + }, + { + "epoch": 0.9354666666666667, + "grad_norm": 0.3604084122871091, + "learning_rate": 2.178060137750071e-06, + "loss": 0.5783, + "step": 1754 + }, + { + "epoch": 0.936, + "grad_norm": 0.4923309735087353, + "learning_rate": 2.1423364772445887e-06, + "loss": 0.6342, + "step": 1755 + }, + { + "epoch": 0.9365333333333333, + "grad_norm": 0.38226580095034884, + "learning_rate": 2.106905034576112e-06, + "loss": 0.5738, + "step": 1756 + }, + { + "epoch": 0.9370666666666667, + "grad_norm": 0.4377425839387685, + "learning_rate": 2.0717659155482738e-06, + "loss": 0.6496, + "step": 1757 + }, + { + "epoch": 0.9376, + "grad_norm": 0.3527473642611892, + "learning_rate": 2.036919225091827e-06, + "loss": 0.5983, + "step": 1758 + }, + { + "epoch": 0.9381333333333334, + "grad_norm": 0.39912448491239905, + "learning_rate": 2.002365067264289e-06, + "loss": 0.5969, + "step": 1759 + }, + { + "epoch": 0.9386666666666666, + "grad_norm": 0.4226452613966473, + "learning_rate": 1.968103545249611e-06, + "loss": 0.6234, + "step": 1760 + }, + { + "epoch": 0.9392, + "grad_norm": 0.42902322612300664, + "learning_rate": 1.9341347613579087e-06, + "loss": 0.6761, + "step": 1761 + }, + { + "epoch": 0.9397333333333333, + "grad_norm": 0.4469457004162467, + "learning_rate": 1.900458817025097e-06, + "loss": 0.572, + "step": 1762 + }, + { + "epoch": 0.9402666666666667, + "grad_norm": 0.41968848850301177, + "learning_rate": 1.8670758128126909e-06, + "loss": 0.5958, + "step": 1763 + }, + { + "epoch": 0.9408, + "grad_norm": 0.4088909356738927, + "learning_rate": 1.8339858484073935e-06, + "loss": 0.6198, + "step": 1764 + }, + { + "epoch": 0.9413333333333334, + "grad_norm": 0.3856136564613469, + "learning_rate": 1.8011890226208527e-06, + "loss": 0.5778, + "step": 1765 + }, + { + "epoch": 0.9418666666666666, + "grad_norm": 0.6538311846681734, + "learning_rate": 1.7686854333893833e-06, + "loss": 0.6943, + "step": 1766 + }, + { + "epoch": 0.9424, + "grad_norm": 0.38972142264775966, + "learning_rate": 1.7364751777736332e-06, + "loss": 0.6159, + "step": 1767 + }, + { + "epoch": 0.9429333333333333, + "grad_norm": 0.3913667900168577, + "learning_rate": 1.7045583519583074e-06, + "loss": 0.6314, + "step": 1768 + }, + { + "epoch": 0.9434666666666667, + "grad_norm": 0.39282375591012886, + "learning_rate": 1.6729350512519005e-06, + "loss": 0.6334, + "step": 1769 + }, + { + "epoch": 0.944, + "grad_norm": 0.4060893225762606, + "learning_rate": 1.6416053700863964e-06, + "loss": 0.6276, + "step": 1770 + }, + { + "epoch": 0.9445333333333333, + "grad_norm": 0.39725728594060705, + "learning_rate": 1.6105694020169593e-06, + "loss": 0.6707, + "step": 1771 + }, + { + "epoch": 0.9450666666666667, + "grad_norm": 0.45614412437773155, + "learning_rate": 1.5798272397217095e-06, + "loss": 0.636, + "step": 1772 + }, + { + "epoch": 0.9456, + "grad_norm": 0.3787903211402147, + "learning_rate": 1.5493789750014031e-06, + "loss": 0.6121, + "step": 1773 + }, + { + "epoch": 0.9461333333333334, + "grad_norm": 0.46964438676140063, + "learning_rate": 1.5192246987791981e-06, + "loss": 0.7232, + "step": 1774 + }, + { + "epoch": 0.9466666666666667, + "grad_norm": 0.40521348986377664, + "learning_rate": 1.489364501100332e-06, + "loss": 0.6322, + "step": 1775 + }, + { + "epoch": 0.9472, + "grad_norm": 0.39599913318282937, + "learning_rate": 1.459798471131868e-06, + "loss": 0.6513, + "step": 1776 + }, + { + "epoch": 0.9477333333333333, + "grad_norm": 0.3866365521413763, + "learning_rate": 1.430526697162482e-06, + "loss": 0.6067, + "step": 1777 + }, + { + "epoch": 0.9482666666666667, + "grad_norm": 0.37923608237865436, + "learning_rate": 1.4015492666021312e-06, + "loss": 0.6434, + "step": 1778 + }, + { + "epoch": 0.9488, + "grad_norm": 0.37607585850317243, + "learning_rate": 1.3728662659818204e-06, + "loss": 0.6123, + "step": 1779 + }, + { + "epoch": 0.9493333333333334, + "grad_norm": 0.37654613400330933, + "learning_rate": 1.344477780953346e-06, + "loss": 0.613, + "step": 1780 + }, + { + "epoch": 0.9498666666666666, + "grad_norm": 0.4085526259935621, + "learning_rate": 1.3163838962890195e-06, + "loss": 0.5823, + "step": 1781 + }, + { + "epoch": 0.9504, + "grad_norm": 0.37706342506067253, + "learning_rate": 1.2885846958814673e-06, + "loss": 0.6105, + "step": 1782 + }, + { + "epoch": 0.9509333333333333, + "grad_norm": 0.4321633352133741, + "learning_rate": 1.261080262743297e-06, + "loss": 0.64, + "step": 1783 + }, + { + "epoch": 0.9514666666666667, + "grad_norm": 0.40979503291913427, + "learning_rate": 1.2338706790069431e-06, + "loss": 0.6258, + "step": 1784 + }, + { + "epoch": 0.952, + "grad_norm": 0.35312484737595107, + "learning_rate": 1.2069560259243328e-06, + "loss": 0.618, + "step": 1785 + }, + { + "epoch": 0.9525333333333333, + "grad_norm": 0.3648822216766174, + "learning_rate": 1.1803363838667092e-06, + "loss": 0.6074, + "step": 1786 + }, + { + "epoch": 0.9530666666666666, + "grad_norm": 0.3876797565517906, + "learning_rate": 1.1540118323243865e-06, + "loss": 0.6507, + "step": 1787 + }, + { + "epoch": 0.9536, + "grad_norm": 0.4596479724815644, + "learning_rate": 1.1279824499064396e-06, + "loss": 0.6629, + "step": 1788 + }, + { + "epoch": 0.9541333333333334, + "grad_norm": 0.41914541205421035, + "learning_rate": 1.1022483143405705e-06, + "loss": 0.6365, + "step": 1789 + }, + { + "epoch": 0.9546666666666667, + "grad_norm": 0.4226538276338166, + "learning_rate": 1.076809502472831e-06, + "loss": 0.6159, + "step": 1790 + }, + { + "epoch": 0.9552, + "grad_norm": 0.453590112930756, + "learning_rate": 1.0516660902673448e-06, + "loss": 0.6396, + "step": 1791 + }, + { + "epoch": 0.9557333333333333, + "grad_norm": 0.8227867613319757, + "learning_rate": 1.0268181528061749e-06, + "loss": 0.6052, + "step": 1792 + }, + { + "epoch": 0.9562666666666667, + "grad_norm": 0.4032076507160707, + "learning_rate": 1.0022657642890231e-06, + "loss": 0.6231, + "step": 1793 + }, + { + "epoch": 0.9568, + "grad_norm": 0.36310835104679606, + "learning_rate": 9.780089980330642e-07, + "loss": 0.5699, + "step": 1794 + }, + { + "epoch": 0.9573333333333334, + "grad_norm": 0.37503508076643055, + "learning_rate": 9.540479264726676e-07, + "loss": 0.6047, + "step": 1795 + }, + { + "epoch": 0.9578666666666666, + "grad_norm": 0.3558234391460902, + "learning_rate": 9.303826211592315e-07, + "loss": 0.6084, + "step": 1796 + }, + { + "epoch": 0.9584, + "grad_norm": 0.4579645184505321, + "learning_rate": 9.070131527609604e-07, + "loss": 0.6149, + "step": 1797 + }, + { + "epoch": 0.9589333333333333, + "grad_norm": 0.37666047242840756, + "learning_rate": 8.839395910626213e-07, + "loss": 0.6185, + "step": 1798 + }, + { + "epoch": 0.9594666666666667, + "grad_norm": 0.36736878091906583, + "learning_rate": 8.611620049653879e-07, + "loss": 0.633, + "step": 1799 + }, + { + "epoch": 0.96, + "grad_norm": 0.3969085332861335, + "learning_rate": 8.386804624865851e-07, + "loss": 0.5631, + "step": 1800 + }, + { + "epoch": 0.9605333333333334, + "grad_norm": 0.4769510880191858, + "learning_rate": 8.16495030759501e-07, + "loss": 0.6434, + "step": 1801 + }, + { + "epoch": 0.9610666666666666, + "grad_norm": 0.41106459360183417, + "learning_rate": 7.946057760332193e-07, + "loss": 0.6728, + "step": 1802 + }, + { + "epoch": 0.9616, + "grad_norm": 0.49660782368097145, + "learning_rate": 7.730127636723539e-07, + "loss": 0.6455, + "step": 1803 + }, + { + "epoch": 0.9621333333333333, + "grad_norm": 0.4628750642052684, + "learning_rate": 7.517160581569372e-07, + "loss": 0.6628, + "step": 1804 + }, + { + "epoch": 0.9626666666666667, + "grad_norm": 0.39404379341324314, + "learning_rate": 7.307157230821426e-07, + "loss": 0.5824, + "step": 1805 + }, + { + "epoch": 0.9632, + "grad_norm": 0.4383186870087429, + "learning_rate": 7.100118211581852e-07, + "loss": 0.6194, + "step": 1806 + }, + { + "epoch": 0.9637333333333333, + "grad_norm": 0.3653631542996674, + "learning_rate": 6.896044142100433e-07, + "loss": 0.6077, + "step": 1807 + }, + { + "epoch": 0.9642666666666667, + "grad_norm": 0.3957605584777071, + "learning_rate": 6.694935631773258e-07, + "loss": 0.5874, + "step": 1808 + }, + { + "epoch": 0.9648, + "grad_norm": 0.4037165979888819, + "learning_rate": 6.496793281141056e-07, + "loss": 0.6066, + "step": 1809 + }, + { + "epoch": 0.9653333333333334, + "grad_norm": 0.428723502501131, + "learning_rate": 6.301617681886863e-07, + "loss": 0.6354, + "step": 1810 + }, + { + "epoch": 0.9658666666666667, + "grad_norm": 0.4378736093813807, + "learning_rate": 6.109409416834688e-07, + "loss": 0.5974, + "step": 1811 + }, + { + "epoch": 0.9664, + "grad_norm": 0.44772881761449956, + "learning_rate": 5.920169059947411e-07, + "loss": 0.6243, + "step": 1812 + }, + { + "epoch": 0.9669333333333333, + "grad_norm": 0.3928920983318415, + "learning_rate": 5.733897176325665e-07, + "loss": 0.6209, + "step": 1813 + }, + { + "epoch": 0.9674666666666667, + "grad_norm": 0.4336639143470741, + "learning_rate": 5.550594322205504e-07, + "loss": 0.6367, + "step": 1814 + }, + { + "epoch": 0.968, + "grad_norm": 0.3277240984708229, + "learning_rate": 5.370261044956971e-07, + "loss": 0.5473, + "step": 1815 + }, + { + "epoch": 0.9685333333333334, + "grad_norm": 0.3828685362181596, + "learning_rate": 5.192897883082747e-07, + "loss": 0.5927, + "step": 1816 + }, + { + "epoch": 0.9690666666666666, + "grad_norm": 0.37935820321777786, + "learning_rate": 5.018505366216175e-07, + "loss": 0.6213, + "step": 1817 + }, + { + "epoch": 0.9696, + "grad_norm": 0.37713433584045297, + "learning_rate": 4.847084015119574e-07, + "loss": 0.6042, + "step": 1818 + }, + { + "epoch": 0.9701333333333333, + "grad_norm": 0.39006771178981314, + "learning_rate": 4.678634341683252e-07, + "loss": 0.5797, + "step": 1819 + }, + { + "epoch": 0.9706666666666667, + "grad_norm": 0.4549279176101698, + "learning_rate": 4.5131568489236166e-07, + "loss": 0.64, + "step": 1820 + }, + { + "epoch": 0.9712, + "grad_norm": 0.3759867099326376, + "learning_rate": 4.3506520309813947e-07, + "loss": 0.6041, + "step": 1821 + }, + { + "epoch": 0.9717333333333333, + "grad_norm": 0.44358842570648876, + "learning_rate": 4.191120373120749e-07, + "loss": 0.6691, + "step": 1822 + }, + { + "epoch": 0.9722666666666666, + "grad_norm": 0.41177907181643036, + "learning_rate": 4.034562351727389e-07, + "loss": 0.6485, + "step": 1823 + }, + { + "epoch": 0.9728, + "grad_norm": 0.3995106320112122, + "learning_rate": 3.8809784343072366e-07, + "loss": 0.6499, + "step": 1824 + }, + { + "epoch": 0.9733333333333334, + "grad_norm": 0.3675420880857662, + "learning_rate": 3.73036907948543e-07, + "loss": 0.6341, + "step": 1825 + }, + { + "epoch": 0.9738666666666667, + "grad_norm": 0.621797000673849, + "learning_rate": 3.582734737004101e-07, + "loss": 0.6395, + "step": 1826 + }, + { + "epoch": 0.9744, + "grad_norm": 0.4097072469260797, + "learning_rate": 3.4380758477219333e-07, + "loss": 0.6102, + "step": 1827 + }, + { + "epoch": 0.9749333333333333, + "grad_norm": 0.5407225785782159, + "learning_rate": 3.296392843612273e-07, + "loss": 0.5909, + "step": 1828 + }, + { + "epoch": 0.9754666666666667, + "grad_norm": 0.4122705520910875, + "learning_rate": 3.1576861477621287e-07, + "loss": 0.6089, + "step": 1829 + }, + { + "epoch": 0.976, + "grad_norm": 0.38042277285394877, + "learning_rate": 3.0219561743707326e-07, + "loss": 0.6294, + "step": 1830 + }, + { + "epoch": 0.9765333333333334, + "grad_norm": 0.4465751557766768, + "learning_rate": 2.889203328748424e-07, + "loss": 0.5854, + "step": 1831 + }, + { + "epoch": 0.9770666666666666, + "grad_norm": 0.40270091386558127, + "learning_rate": 2.759428007315212e-07, + "loss": 0.6226, + "step": 1832 + }, + { + "epoch": 0.9776, + "grad_norm": 0.4270860466442265, + "learning_rate": 2.6326305976001055e-07, + "loss": 0.5886, + "step": 1833 + }, + { + "epoch": 0.9781333333333333, + "grad_norm": 0.3791287171667277, + "learning_rate": 2.5088114782392257e-07, + "loss": 0.5955, + "step": 1834 + }, + { + "epoch": 0.9786666666666667, + "grad_norm": 0.41745866531373826, + "learning_rate": 2.3879710189753656e-07, + "loss": 0.6487, + "step": 1835 + }, + { + "epoch": 0.9792, + "grad_norm": 0.4400123322302061, + "learning_rate": 2.2701095806565432e-07, + "loss": 0.6683, + "step": 1836 + }, + { + "epoch": 0.9797333333333333, + "grad_norm": 0.5849302348906806, + "learning_rate": 2.15522751523467e-07, + "loss": 0.7339, + "step": 1837 + }, + { + "epoch": 0.9802666666666666, + "grad_norm": 0.40153978522197675, + "learning_rate": 2.0433251657653308e-07, + "loss": 0.5936, + "step": 1838 + }, + { + "epoch": 0.9808, + "grad_norm": 0.3854755809147383, + "learning_rate": 1.9344028664056713e-07, + "loss": 0.6172, + "step": 1839 + }, + { + "epoch": 0.9813333333333333, + "grad_norm": 0.42008015852413283, + "learning_rate": 1.8284609424142895e-07, + "loss": 0.6483, + "step": 1840 + }, + { + "epoch": 0.9818666666666667, + "grad_norm": 0.4212138721741976, + "learning_rate": 1.7254997101500137e-07, + "loss": 0.6017, + "step": 1841 + }, + { + "epoch": 0.9824, + "grad_norm": 0.3832577024291346, + "learning_rate": 1.6255194770704586e-07, + "loss": 0.6145, + "step": 1842 + }, + { + "epoch": 0.9829333333333333, + "grad_norm": 0.3377575889029809, + "learning_rate": 1.5285205417319149e-07, + "loss": 0.5836, + "step": 1843 + }, + { + "epoch": 0.9834666666666667, + "grad_norm": 0.38250977122866126, + "learning_rate": 1.4345031937879062e-07, + "loss": 0.6037, + "step": 1844 + }, + { + "epoch": 0.984, + "grad_norm": 0.42094045082935005, + "learning_rate": 1.3434677139885222e-07, + "loss": 0.6433, + "step": 1845 + }, + { + "epoch": 0.9845333333333334, + "grad_norm": 0.3820832037132924, + "learning_rate": 1.255414374179531e-07, + "loss": 0.5992, + "step": 1846 + }, + { + "epoch": 0.9850666666666666, + "grad_norm": 0.500001482850949, + "learning_rate": 1.170343437301491e-07, + "loss": 0.6988, + "step": 1847 + }, + { + "epoch": 0.9856, + "grad_norm": 0.5465847295478029, + "learning_rate": 1.0882551573891953e-07, + "loss": 0.694, + "step": 1848 + }, + { + "epoch": 0.9861333333333333, + "grad_norm": 0.4163535477641629, + "learning_rate": 1.0091497795706728e-07, + "loss": 0.6691, + "step": 1849 + }, + { + "epoch": 0.9866666666666667, + "grad_norm": 0.37274214909560893, + "learning_rate": 9.330275400666332e-08, + "loss": 0.5738, + "step": 1850 + }, + { + "epoch": 0.9872, + "grad_norm": 0.3865735624911239, + "learning_rate": 8.598886661895788e-08, + "loss": 0.5859, + "step": 1851 + }, + { + "epoch": 0.9877333333333334, + "grad_norm": 0.4870914856922961, + "learning_rate": 7.8973337634336e-08, + "loss": 0.6312, + "step": 1852 + }, + { + "epoch": 0.9882666666666666, + "grad_norm": 0.42538631972017427, + "learning_rate": 7.225618800222877e-08, + "loss": 0.6686, + "step": 1853 + }, + { + "epoch": 0.9888, + "grad_norm": 0.3860801731383532, + "learning_rate": 6.583743778106887e-08, + "loss": 0.5808, + "step": 1854 + }, + { + "epoch": 0.9893333333333333, + "grad_norm": 0.4987207956749768, + "learning_rate": 5.971710613821291e-08, + "loss": 0.6563, + "step": 1855 + }, + { + "epoch": 0.9898666666666667, + "grad_norm": 0.4078087284343223, + "learning_rate": 5.389521134989695e-08, + "loss": 0.6138, + "step": 1856 + }, + { + "epoch": 0.9904, + "grad_norm": 0.3739938231097275, + "learning_rate": 4.837177080119215e-08, + "loss": 0.6631, + "step": 1857 + }, + { + "epoch": 0.9909333333333333, + "grad_norm": 0.4399246742415485, + "learning_rate": 4.314680098592705e-08, + "loss": 0.6501, + "step": 1858 + }, + { + "epoch": 0.9914666666666667, + "grad_norm": 0.40071478863608095, + "learning_rate": 3.8220317506654226e-08, + "loss": 0.6179, + "step": 1859 + }, + { + "epoch": 0.992, + "grad_norm": 0.37147629428382106, + "learning_rate": 3.359233507459481e-08, + "loss": 0.6027, + "step": 1860 + }, + { + "epoch": 0.9925333333333334, + "grad_norm": 0.3879847582609873, + "learning_rate": 2.9262867509605163e-08, + "loss": 0.6125, + "step": 1861 + }, + { + "epoch": 0.9930666666666667, + "grad_norm": 0.38679631395041086, + "learning_rate": 2.5231927740154704e-08, + "loss": 0.6057, + "step": 1862 + }, + { + "epoch": 0.9936, + "grad_norm": 0.6241109452118239, + "learning_rate": 2.1499527803214846e-08, + "loss": 0.7422, + "step": 1863 + }, + { + "epoch": 0.9941333333333333, + "grad_norm": 0.4586768882561857, + "learning_rate": 1.8065678844314538e-08, + "loss": 0.6315, + "step": 1864 + }, + { + "epoch": 0.9946666666666667, + "grad_norm": 0.37723836829454943, + "learning_rate": 1.4930391117451426e-08, + "loss": 0.5763, + "step": 1865 + }, + { + "epoch": 0.9952, + "grad_norm": 0.4306229237441863, + "learning_rate": 1.209367398504746e-08, + "loss": 0.6055, + "step": 1866 + }, + { + "epoch": 0.9957333333333334, + "grad_norm": 0.3864565906226324, + "learning_rate": 9.555535917993297e-09, + "loss": 0.6145, + "step": 1867 + }, + { + "epoch": 0.9962666666666666, + "grad_norm": 0.45180935071199785, + "learning_rate": 7.315984495548378e-09, + "loss": 0.6469, + "step": 1868 + }, + { + "epoch": 0.9968, + "grad_norm": 0.46471876807148316, + "learning_rate": 5.375026405352035e-09, + "loss": 0.6876, + "step": 1869 + }, + { + "epoch": 0.9973333333333333, + "grad_norm": 0.39990539409713355, + "learning_rate": 3.732667443390181e-09, + "loss": 0.5792, + "step": 1870 + }, + { + "epoch": 0.9978666666666667, + "grad_norm": 0.42942239779003955, + "learning_rate": 2.388912514017516e-09, + "loss": 0.6408, + "step": 1871 + }, + { + "epoch": 0.9984, + "grad_norm": 0.423469642517262, + "learning_rate": 1.3437656298687097e-09, + "loss": 0.6451, + "step": 1872 + }, + { + "epoch": 0.9989333333333333, + "grad_norm": 0.4689776508975757, + "learning_rate": 5.972299119250125e-10, + "loss": 0.5901, + "step": 1873 + }, + { + "epoch": 0.9994666666666666, + "grad_norm": 0.3714419107868351, + "learning_rate": 1.4930758944764479e-10, + "loss": 0.5707, + "step": 1874 + }, + { + "epoch": 1.0, + "grad_norm": 0.39583113714780643, + "learning_rate": 0.0, + "loss": 0.6271, + "step": 1875 + }, + { + "epoch": 1.0, + "step": 1875, + "total_flos": 1639115726553088.0, + "train_loss": 0.700018780930837, + "train_runtime": 29285.1958, + "train_samples_per_second": 1.024, + "train_steps_per_second": 0.064 + } + ], + "logging_steps": 1.0, + "max_steps": 1875, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1639115726553088.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_2_epochs_1_GA_2_lora/README.md b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_2_epochs_1_GA_2_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_2_epochs_1_GA_2_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_2_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_2_epochs_1_GA_2_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..af3b994f74c2f13aeddfcfcfc7298a070899de7b --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_2_epochs_1_GA_2_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "up_proj", + "q_proj", + "o_proj", + "gate_proj", + "down_proj", + "k_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..61752e8915242cdabe035ad7700cb6f958c6a90b --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:095b0c066a786d9fa7b9743048c71cd2b7ed342cc692815b23c04f41c3e5c31e +size 671150064 diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_2_epochs_1_GA_2_lora/config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_2_epochs_1_GA_2_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_2_epochs_1_GA_2_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..286e2c2674b240eba973c8041bd4bfe1d8a712f5 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9bce4c81f04f2ebaee51d502e8c15728a3a6e06201db509f429bc66a58a87b2 +size 918507402 diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_2_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_2_epochs_1_GA_2_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..885d58faa403cd028a7ada1286d812793e23835f --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_2_epochs_1_GA_2_lora/trainer_state.json @@ -0,0 +1,13167 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 1875, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005333333333333334, + "grad_norm": 1.0782078108678725, + "learning_rate": 3.5087719298245615e-06, + "loss": 1.5314, + "step": 1 + }, + { + "epoch": 0.0010666666666666667, + "grad_norm": 1.0037748871538414, + "learning_rate": 7.017543859649123e-06, + "loss": 1.4457, + "step": 2 + }, + { + "epoch": 0.0016, + "grad_norm": 1.1053463923600908, + "learning_rate": 1.0526315789473684e-05, + "loss": 1.5674, + "step": 3 + }, + { + "epoch": 0.0021333333333333334, + "grad_norm": 1.1243960613506545, + "learning_rate": 1.4035087719298246e-05, + "loss": 1.514, + "step": 4 + }, + { + "epoch": 0.0026666666666666666, + "grad_norm": 1.00841681692077, + "learning_rate": 1.7543859649122806e-05, + "loss": 1.5181, + "step": 5 + }, + { + "epoch": 0.0032, + "grad_norm": 0.9851049857398083, + "learning_rate": 2.105263157894737e-05, + "loss": 1.4734, + "step": 6 + }, + { + "epoch": 0.0037333333333333333, + "grad_norm": 0.9346731112899275, + "learning_rate": 2.456140350877193e-05, + "loss": 1.41, + "step": 7 + }, + { + "epoch": 0.004266666666666667, + "grad_norm": 1.0388157530251243, + "learning_rate": 2.8070175438596492e-05, + "loss": 1.3507, + "step": 8 + }, + { + "epoch": 0.0048, + "grad_norm": 0.9032230002036801, + "learning_rate": 3.157894736842105e-05, + "loss": 1.2218, + "step": 9 + }, + { + "epoch": 0.005333333333333333, + "grad_norm": 0.7832609647729597, + "learning_rate": 3.508771929824561e-05, + "loss": 1.0542, + "step": 10 + }, + { + "epoch": 0.005866666666666667, + "grad_norm": 0.9730372629946202, + "learning_rate": 3.859649122807018e-05, + "loss": 0.9675, + "step": 11 + }, + { + "epoch": 0.0064, + "grad_norm": 0.7716758925878686, + "learning_rate": 4.210526315789474e-05, + "loss": 1.0446, + "step": 12 + }, + { + "epoch": 0.006933333333333333, + "grad_norm": 0.7351105141336544, + "learning_rate": 4.56140350877193e-05, + "loss": 0.97, + "step": 13 + }, + { + "epoch": 0.007466666666666667, + "grad_norm": 0.7590877823875037, + "learning_rate": 4.912280701754386e-05, + "loss": 0.968, + "step": 14 + }, + { + "epoch": 0.008, + "grad_norm": 0.862001055859692, + "learning_rate": 5.2631578947368424e-05, + "loss": 1.0147, + "step": 15 + }, + { + "epoch": 0.008533333333333334, + "grad_norm": 0.7401084648326463, + "learning_rate": 5.6140350877192984e-05, + "loss": 0.9842, + "step": 16 + }, + { + "epoch": 0.009066666666666667, + "grad_norm": 0.7280729375008436, + "learning_rate": 5.9649122807017544e-05, + "loss": 1.0106, + "step": 17 + }, + { + "epoch": 0.0096, + "grad_norm": 0.6427810107091592, + "learning_rate": 6.31578947368421e-05, + "loss": 0.8929, + "step": 18 + }, + { + "epoch": 0.010133333333333333, + "grad_norm": 0.5577149809059004, + "learning_rate": 6.666666666666667e-05, + "loss": 0.8335, + "step": 19 + }, + { + "epoch": 0.010666666666666666, + "grad_norm": 0.5597236992649832, + "learning_rate": 7.017543859649122e-05, + "loss": 0.917, + "step": 20 + }, + { + "epoch": 0.0112, + "grad_norm": 0.5856837125576331, + "learning_rate": 7.368421052631579e-05, + "loss": 0.8769, + "step": 21 + }, + { + "epoch": 0.011733333333333333, + "grad_norm": 0.5415079697797732, + "learning_rate": 7.719298245614036e-05, + "loss": 0.906, + "step": 22 + }, + { + "epoch": 0.012266666666666667, + "grad_norm": 0.5390421862493808, + "learning_rate": 8.070175438596491e-05, + "loss": 0.862, + "step": 23 + }, + { + "epoch": 0.0128, + "grad_norm": 0.5936834847094479, + "learning_rate": 8.421052631578948e-05, + "loss": 0.929, + "step": 24 + }, + { + "epoch": 0.013333333333333334, + "grad_norm": 0.517546577551544, + "learning_rate": 8.771929824561403e-05, + "loss": 0.8734, + "step": 25 + }, + { + "epoch": 0.013866666666666666, + "grad_norm": 0.4692370942023055, + "learning_rate": 9.12280701754386e-05, + "loss": 0.9176, + "step": 26 + }, + { + "epoch": 0.0144, + "grad_norm": 0.6064203180946028, + "learning_rate": 9.473684210526316e-05, + "loss": 0.9189, + "step": 27 + }, + { + "epoch": 0.014933333333333333, + "grad_norm": 0.5355115074693406, + "learning_rate": 9.824561403508771e-05, + "loss": 0.9226, + "step": 28 + }, + { + "epoch": 0.015466666666666667, + "grad_norm": 0.5526680033802652, + "learning_rate": 0.0001017543859649123, + "loss": 0.8585, + "step": 29 + }, + { + "epoch": 0.016, + "grad_norm": 0.6250427254190083, + "learning_rate": 0.00010526315789473685, + "loss": 0.8898, + "step": 30 + }, + { + "epoch": 0.016533333333333334, + "grad_norm": 0.5751008013736765, + "learning_rate": 0.00010877192982456141, + "loss": 0.8642, + "step": 31 + }, + { + "epoch": 0.017066666666666667, + "grad_norm": 0.5997096010242022, + "learning_rate": 0.00011228070175438597, + "loss": 0.8783, + "step": 32 + }, + { + "epoch": 0.0176, + "grad_norm": 0.5153207577859897, + "learning_rate": 0.00011578947368421053, + "loss": 0.9764, + "step": 33 + }, + { + "epoch": 0.018133333333333335, + "grad_norm": 0.4940168304549387, + "learning_rate": 0.00011929824561403509, + "loss": 0.8537, + "step": 34 + }, + { + "epoch": 0.018666666666666668, + "grad_norm": 0.6340293239222504, + "learning_rate": 0.00012280701754385965, + "loss": 0.9192, + "step": 35 + }, + { + "epoch": 0.0192, + "grad_norm": 0.4654101629859658, + "learning_rate": 0.0001263157894736842, + "loss": 0.8282, + "step": 36 + }, + { + "epoch": 0.019733333333333332, + "grad_norm": 0.47534715474059197, + "learning_rate": 0.0001298245614035088, + "loss": 0.8343, + "step": 37 + }, + { + "epoch": 0.020266666666666665, + "grad_norm": 0.5592703237545299, + "learning_rate": 0.00013333333333333334, + "loss": 0.8589, + "step": 38 + }, + { + "epoch": 0.0208, + "grad_norm": 0.4617217902728695, + "learning_rate": 0.0001368421052631579, + "loss": 0.8329, + "step": 39 + }, + { + "epoch": 0.021333333333333333, + "grad_norm": 0.46116022384837163, + "learning_rate": 0.00014035087719298245, + "loss": 0.8, + "step": 40 + }, + { + "epoch": 0.021866666666666666, + "grad_norm": 0.5590053178933165, + "learning_rate": 0.00014385964912280703, + "loss": 0.9246, + "step": 41 + }, + { + "epoch": 0.0224, + "grad_norm": 0.5327015276069621, + "learning_rate": 0.00014736842105263158, + "loss": 0.8644, + "step": 42 + }, + { + "epoch": 0.022933333333333333, + "grad_norm": 0.5055057672438585, + "learning_rate": 0.00015087719298245616, + "loss": 0.8465, + "step": 43 + }, + { + "epoch": 0.023466666666666667, + "grad_norm": 0.5160236926727955, + "learning_rate": 0.0001543859649122807, + "loss": 0.8897, + "step": 44 + }, + { + "epoch": 0.024, + "grad_norm": 0.6456337117548154, + "learning_rate": 0.00015789473684210527, + "loss": 0.9126, + "step": 45 + }, + { + "epoch": 0.024533333333333334, + "grad_norm": 0.4430961642687092, + "learning_rate": 0.00016140350877192982, + "loss": 0.8395, + "step": 46 + }, + { + "epoch": 0.025066666666666668, + "grad_norm": 0.5072626923883533, + "learning_rate": 0.0001649122807017544, + "loss": 0.8827, + "step": 47 + }, + { + "epoch": 0.0256, + "grad_norm": 0.5375023711847232, + "learning_rate": 0.00016842105263157895, + "loss": 0.8321, + "step": 48 + }, + { + "epoch": 0.026133333333333335, + "grad_norm": 0.47761260989260346, + "learning_rate": 0.00017192982456140353, + "loss": 0.8416, + "step": 49 + }, + { + "epoch": 0.02666666666666667, + "grad_norm": 0.4714384959069264, + "learning_rate": 0.00017543859649122806, + "loss": 0.7586, + "step": 50 + }, + { + "epoch": 0.0272, + "grad_norm": 0.4494309002584157, + "learning_rate": 0.00017894736842105264, + "loss": 0.8051, + "step": 51 + }, + { + "epoch": 0.027733333333333332, + "grad_norm": 0.448268319282034, + "learning_rate": 0.0001824561403508772, + "loss": 0.903, + "step": 52 + }, + { + "epoch": 0.028266666666666666, + "grad_norm": 0.482599187517368, + "learning_rate": 0.00018596491228070177, + "loss": 0.7503, + "step": 53 + }, + { + "epoch": 0.0288, + "grad_norm": 0.5341483553733253, + "learning_rate": 0.00018947368421052632, + "loss": 0.8975, + "step": 54 + }, + { + "epoch": 0.029333333333333333, + "grad_norm": 0.5902234488941541, + "learning_rate": 0.00019298245614035088, + "loss": 0.8803, + "step": 55 + }, + { + "epoch": 0.029866666666666666, + "grad_norm": 0.47711627331975215, + "learning_rate": 0.00019649122807017543, + "loss": 0.8352, + "step": 56 + }, + { + "epoch": 0.0304, + "grad_norm": 0.6494019156360745, + "learning_rate": 0.0002, + "loss": 0.8666, + "step": 57 + }, + { + "epoch": 0.030933333333333334, + "grad_norm": 0.4657753841679875, + "learning_rate": 0.00019999985069241055, + "loss": 0.7725, + "step": 58 + }, + { + "epoch": 0.031466666666666664, + "grad_norm": 0.5012960618867901, + "learning_rate": 0.00019999940277008808, + "loss": 0.8394, + "step": 59 + }, + { + "epoch": 0.032, + "grad_norm": 0.4698325753034376, + "learning_rate": 0.00019999865623437013, + "loss": 0.8371, + "step": 60 + }, + { + "epoch": 0.03253333333333333, + "grad_norm": 0.5131939985752545, + "learning_rate": 0.00019999761108748597, + "loss": 0.7883, + "step": 61 + }, + { + "epoch": 0.03306666666666667, + "grad_norm": 0.4475679183229205, + "learning_rate": 0.00019999626733255662, + "loss": 0.8087, + "step": 62 + }, + { + "epoch": 0.0336, + "grad_norm": 0.43235736858508816, + "learning_rate": 0.00019999462497359466, + "loss": 0.7295, + "step": 63 + }, + { + "epoch": 0.034133333333333335, + "grad_norm": 0.5090252509945237, + "learning_rate": 0.00019999268401550447, + "loss": 0.8488, + "step": 64 + }, + { + "epoch": 0.034666666666666665, + "grad_norm": 0.49769313808738236, + "learning_rate": 0.000199990444464082, + "loss": 0.8864, + "step": 65 + }, + { + "epoch": 0.0352, + "grad_norm": 0.430549326517399, + "learning_rate": 0.00019998790632601496, + "loss": 0.7422, + "step": 66 + }, + { + "epoch": 0.03573333333333333, + "grad_norm": 0.5335405696936507, + "learning_rate": 0.00019998506960888256, + "loss": 0.8491, + "step": 67 + }, + { + "epoch": 0.03626666666666667, + "grad_norm": 0.5616102390859028, + "learning_rate": 0.00019998193432115572, + "loss": 0.8632, + "step": 68 + }, + { + "epoch": 0.0368, + "grad_norm": 0.4898557321457483, + "learning_rate": 0.0001999785004721968, + "loss": 0.8596, + "step": 69 + }, + { + "epoch": 0.037333333333333336, + "grad_norm": 0.48687077498181897, + "learning_rate": 0.00019997476807225985, + "loss": 0.7753, + "step": 70 + }, + { + "epoch": 0.037866666666666667, + "grad_norm": 0.573843156384488, + "learning_rate": 0.0001999707371324904, + "loss": 0.8917, + "step": 71 + }, + { + "epoch": 0.0384, + "grad_norm": 0.44160542816380943, + "learning_rate": 0.00019996640766492543, + "loss": 0.8187, + "step": 72 + }, + { + "epoch": 0.038933333333333334, + "grad_norm": 0.4590305279469336, + "learning_rate": 0.00019996177968249334, + "loss": 0.7603, + "step": 73 + }, + { + "epoch": 0.039466666666666664, + "grad_norm": 0.44667044744960277, + "learning_rate": 0.0001999568531990141, + "loss": 0.7877, + "step": 74 + }, + { + "epoch": 0.04, + "grad_norm": 0.49118475163538916, + "learning_rate": 0.00019995162822919883, + "loss": 0.8661, + "step": 75 + }, + { + "epoch": 0.04053333333333333, + "grad_norm": 0.5045332172686827, + "learning_rate": 0.00019994610478865011, + "loss": 0.8415, + "step": 76 + }, + { + "epoch": 0.04106666666666667, + "grad_norm": 0.4529834789311489, + "learning_rate": 0.0001999402828938618, + "loss": 0.8218, + "step": 77 + }, + { + "epoch": 0.0416, + "grad_norm": 0.4420842289599489, + "learning_rate": 0.00019993416256221895, + "loss": 0.7662, + "step": 78 + }, + { + "epoch": 0.042133333333333335, + "grad_norm": 0.4793570083240121, + "learning_rate": 0.00019992774381199778, + "loss": 0.8215, + "step": 79 + }, + { + "epoch": 0.042666666666666665, + "grad_norm": 0.5547977557934145, + "learning_rate": 0.00019992102666236566, + "loss": 0.8068, + "step": 80 + }, + { + "epoch": 0.0432, + "grad_norm": 0.49307798171651535, + "learning_rate": 0.00019991401113338104, + "loss": 0.7447, + "step": 81 + }, + { + "epoch": 0.04373333333333333, + "grad_norm": 0.468758023957036, + "learning_rate": 0.00019990669724599336, + "loss": 0.8129, + "step": 82 + }, + { + "epoch": 0.04426666666666667, + "grad_norm": 0.44259774727013146, + "learning_rate": 0.00019989908502204292, + "loss": 0.7862, + "step": 83 + }, + { + "epoch": 0.0448, + "grad_norm": 0.5106391511608349, + "learning_rate": 0.00019989117448426108, + "loss": 0.8318, + "step": 84 + }, + { + "epoch": 0.04533333333333334, + "grad_norm": 0.6202394187290609, + "learning_rate": 0.00019988296565626987, + "loss": 0.8637, + "step": 85 + }, + { + "epoch": 0.04586666666666667, + "grad_norm": 0.5122146608102414, + "learning_rate": 0.00019987445856258206, + "loss": 0.8237, + "step": 86 + }, + { + "epoch": 0.0464, + "grad_norm": 0.5148195184994458, + "learning_rate": 0.00019986565322860115, + "loss": 0.9645, + "step": 87 + }, + { + "epoch": 0.046933333333333334, + "grad_norm": 0.500459052702555, + "learning_rate": 0.00019985654968062122, + "loss": 0.8149, + "step": 88 + }, + { + "epoch": 0.047466666666666664, + "grad_norm": 0.5012533252716662, + "learning_rate": 0.00019984714794582683, + "loss": 0.7844, + "step": 89 + }, + { + "epoch": 0.048, + "grad_norm": 0.5082868021269187, + "learning_rate": 0.00019983744805229296, + "loss": 0.7612, + "step": 90 + }, + { + "epoch": 0.04853333333333333, + "grad_norm": 0.5002359115984162, + "learning_rate": 0.000199827450028985, + "loss": 0.7667, + "step": 91 + }, + { + "epoch": 0.04906666666666667, + "grad_norm": 0.46717372027730364, + "learning_rate": 0.00019981715390575858, + "loss": 0.8329, + "step": 92 + }, + { + "epoch": 0.0496, + "grad_norm": 0.5464740170191148, + "learning_rate": 0.00019980655971335945, + "loss": 0.8945, + "step": 93 + }, + { + "epoch": 0.050133333333333335, + "grad_norm": 0.4216179776785064, + "learning_rate": 0.00019979566748342347, + "loss": 0.7417, + "step": 94 + }, + { + "epoch": 0.050666666666666665, + "grad_norm": 0.5242674567300103, + "learning_rate": 0.00019978447724847652, + "loss": 0.8359, + "step": 95 + }, + { + "epoch": 0.0512, + "grad_norm": 0.4413129016219234, + "learning_rate": 0.00019977298904193437, + "loss": 0.7834, + "step": 96 + }, + { + "epoch": 0.05173333333333333, + "grad_norm": 0.5259889283709757, + "learning_rate": 0.00019976120289810247, + "loss": 0.846, + "step": 97 + }, + { + "epoch": 0.05226666666666667, + "grad_norm": 0.4488863126086829, + "learning_rate": 0.00019974911885217608, + "loss": 0.7817, + "step": 98 + }, + { + "epoch": 0.0528, + "grad_norm": 0.4919358738924874, + "learning_rate": 0.00019973673694024, + "loss": 0.7426, + "step": 99 + }, + { + "epoch": 0.05333333333333334, + "grad_norm": 0.4798760091057027, + "learning_rate": 0.0001997240571992685, + "loss": 0.7894, + "step": 100 + }, + { + "epoch": 0.05386666666666667, + "grad_norm": 0.538485971097003, + "learning_rate": 0.00019971107966712518, + "loss": 0.8093, + "step": 101 + }, + { + "epoch": 0.0544, + "grad_norm": 0.513366273404903, + "learning_rate": 0.00019969780438256293, + "loss": 0.7792, + "step": 102 + }, + { + "epoch": 0.054933333333333334, + "grad_norm": 0.5030926322111889, + "learning_rate": 0.0001996842313852238, + "loss": 0.8661, + "step": 103 + }, + { + "epoch": 0.055466666666666664, + "grad_norm": 0.43326580616893945, + "learning_rate": 0.00019967036071563877, + "loss": 0.7177, + "step": 104 + }, + { + "epoch": 0.056, + "grad_norm": 0.46267987063497157, + "learning_rate": 0.0001996561924152278, + "loss": 0.774, + "step": 105 + }, + { + "epoch": 0.05653333333333333, + "grad_norm": 0.4647459226751621, + "learning_rate": 0.0001996417265262996, + "loss": 0.7847, + "step": 106 + }, + { + "epoch": 0.05706666666666667, + "grad_norm": 0.4671650840215938, + "learning_rate": 0.00019962696309205148, + "loss": 0.8229, + "step": 107 + }, + { + "epoch": 0.0576, + "grad_norm": 0.5010409060742875, + "learning_rate": 0.0001996119021565693, + "loss": 0.8012, + "step": 108 + }, + { + "epoch": 0.058133333333333335, + "grad_norm": 0.43678067885716126, + "learning_rate": 0.0001995965437648273, + "loss": 0.7526, + "step": 109 + }, + { + "epoch": 0.058666666666666666, + "grad_norm": 0.4414276904744747, + "learning_rate": 0.00019958088796268793, + "loss": 0.7306, + "step": 110 + }, + { + "epoch": 0.0592, + "grad_norm": 0.4703195728087392, + "learning_rate": 0.0001995649347969019, + "loss": 0.8314, + "step": 111 + }, + { + "epoch": 0.05973333333333333, + "grad_norm": 0.4140033042661725, + "learning_rate": 0.00019954868431510764, + "loss": 0.7279, + "step": 112 + }, + { + "epoch": 0.06026666666666667, + "grad_norm": 0.4992922576120073, + "learning_rate": 0.00019953213656583168, + "loss": 0.8055, + "step": 113 + }, + { + "epoch": 0.0608, + "grad_norm": 0.4564238539052949, + "learning_rate": 0.00019951529159848805, + "loss": 0.7914, + "step": 114 + }, + { + "epoch": 0.06133333333333333, + "grad_norm": 0.3983832475948493, + "learning_rate": 0.00019949814946337838, + "loss": 0.7176, + "step": 115 + }, + { + "epoch": 0.06186666666666667, + "grad_norm": 0.5358145339753235, + "learning_rate": 0.00019948071021169174, + "loss": 0.8084, + "step": 116 + }, + { + "epoch": 0.0624, + "grad_norm": 0.5580701237430883, + "learning_rate": 0.00019946297389550433, + "loss": 0.8086, + "step": 117 + }, + { + "epoch": 0.06293333333333333, + "grad_norm": 0.45014351621514576, + "learning_rate": 0.00019944494056777946, + "loss": 0.8038, + "step": 118 + }, + { + "epoch": 0.06346666666666667, + "grad_norm": 0.49482662547256834, + "learning_rate": 0.00019942661028236745, + "loss": 0.7771, + "step": 119 + }, + { + "epoch": 0.064, + "grad_norm": 0.4785941402346086, + "learning_rate": 0.00019940798309400526, + "loss": 0.8812, + "step": 120 + }, + { + "epoch": 0.06453333333333333, + "grad_norm": 0.413038663138189, + "learning_rate": 0.00019938905905831654, + "loss": 0.705, + "step": 121 + }, + { + "epoch": 0.06506666666666666, + "grad_norm": 0.5068091812336115, + "learning_rate": 0.00019936983823181132, + "loss": 0.8118, + "step": 122 + }, + { + "epoch": 0.0656, + "grad_norm": 0.4044615729153809, + "learning_rate": 0.0001993503206718859, + "loss": 0.754, + "step": 123 + }, + { + "epoch": 0.06613333333333334, + "grad_norm": 0.5131866881828823, + "learning_rate": 0.00019933050643682269, + "loss": 0.8209, + "step": 124 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 0.4937709196916679, + "learning_rate": 0.00019931039558578997, + "loss": 0.755, + "step": 125 + }, + { + "epoch": 0.0672, + "grad_norm": 0.4905409456203536, + "learning_rate": 0.00019928998817884182, + "loss": 0.7933, + "step": 126 + }, + { + "epoch": 0.06773333333333334, + "grad_norm": 0.5575318881989391, + "learning_rate": 0.00019926928427691786, + "loss": 0.7949, + "step": 127 + }, + { + "epoch": 0.06826666666666667, + "grad_norm": 0.46528504729449455, + "learning_rate": 0.00019924828394184306, + "loss": 0.8185, + "step": 128 + }, + { + "epoch": 0.0688, + "grad_norm": 0.4670060725187256, + "learning_rate": 0.00019922698723632767, + "loss": 0.8065, + "step": 129 + }, + { + "epoch": 0.06933333333333333, + "grad_norm": 0.43269193803693456, + "learning_rate": 0.0001992053942239668, + "loss": 0.7703, + "step": 130 + }, + { + "epoch": 0.06986666666666666, + "grad_norm": 0.44158426698385295, + "learning_rate": 0.0001991835049692405, + "loss": 0.7997, + "step": 131 + }, + { + "epoch": 0.0704, + "grad_norm": 0.458958112862049, + "learning_rate": 0.00019916131953751342, + "loss": 0.835, + "step": 132 + }, + { + "epoch": 0.07093333333333333, + "grad_norm": 0.509152891666129, + "learning_rate": 0.0001991388379950346, + "loss": 0.7686, + "step": 133 + }, + { + "epoch": 0.07146666666666666, + "grad_norm": 0.4713020765860328, + "learning_rate": 0.0001991160604089374, + "loss": 0.7525, + "step": 134 + }, + { + "epoch": 0.072, + "grad_norm": 0.46303360049740766, + "learning_rate": 0.00019909298684723904, + "loss": 0.8002, + "step": 135 + }, + { + "epoch": 0.07253333333333334, + "grad_norm": 0.45910893450839446, + "learning_rate": 0.00019906961737884077, + "loss": 0.7494, + "step": 136 + }, + { + "epoch": 0.07306666666666667, + "grad_norm": 0.43134847723171826, + "learning_rate": 0.00019904595207352737, + "loss": 0.7798, + "step": 137 + }, + { + "epoch": 0.0736, + "grad_norm": 0.49014505922486473, + "learning_rate": 0.00019902199100196697, + "loss": 0.8493, + "step": 138 + }, + { + "epoch": 0.07413333333333333, + "grad_norm": 0.4733554654602289, + "learning_rate": 0.000198997734235711, + "loss": 0.7839, + "step": 139 + }, + { + "epoch": 0.07466666666666667, + "grad_norm": 0.47030819786934236, + "learning_rate": 0.00019897318184719385, + "loss": 0.8733, + "step": 140 + }, + { + "epoch": 0.0752, + "grad_norm": 0.4629123966641584, + "learning_rate": 0.00019894833390973266, + "loss": 0.7857, + "step": 141 + }, + { + "epoch": 0.07573333333333333, + "grad_norm": 0.4314385753341011, + "learning_rate": 0.0001989231904975272, + "loss": 0.7568, + "step": 142 + }, + { + "epoch": 0.07626666666666666, + "grad_norm": 0.4401343563472461, + "learning_rate": 0.00019889775168565943, + "loss": 0.7818, + "step": 143 + }, + { + "epoch": 0.0768, + "grad_norm": 0.39948816182307095, + "learning_rate": 0.00019887201755009357, + "loss": 0.739, + "step": 144 + }, + { + "epoch": 0.07733333333333334, + "grad_norm": 0.41953302504580503, + "learning_rate": 0.00019884598816767563, + "loss": 0.7946, + "step": 145 + }, + { + "epoch": 0.07786666666666667, + "grad_norm": 0.44653089559058606, + "learning_rate": 0.0001988196636161333, + "loss": 0.7732, + "step": 146 + }, + { + "epoch": 0.0784, + "grad_norm": 0.37848478996909823, + "learning_rate": 0.0001987930439740757, + "loss": 0.7567, + "step": 147 + }, + { + "epoch": 0.07893333333333333, + "grad_norm": 0.4302699400835671, + "learning_rate": 0.00019876612932099308, + "loss": 0.7567, + "step": 148 + }, + { + "epoch": 0.07946666666666667, + "grad_norm": 0.44513898799239343, + "learning_rate": 0.0001987389197372567, + "loss": 0.8458, + "step": 149 + }, + { + "epoch": 0.08, + "grad_norm": 0.5026683457789477, + "learning_rate": 0.00019871141530411853, + "loss": 0.8723, + "step": 150 + }, + { + "epoch": 0.08053333333333333, + "grad_norm": 0.4544036121912563, + "learning_rate": 0.00019868361610371097, + "loss": 0.7982, + "step": 151 + }, + { + "epoch": 0.08106666666666666, + "grad_norm": 0.5001248954779373, + "learning_rate": 0.00019865552221904665, + "loss": 0.7937, + "step": 152 + }, + { + "epoch": 0.0816, + "grad_norm": 0.5047715395387671, + "learning_rate": 0.0001986271337340182, + "loss": 0.8426, + "step": 153 + }, + { + "epoch": 0.08213333333333334, + "grad_norm": 0.4286009962772737, + "learning_rate": 0.00019859845073339787, + "loss": 0.6964, + "step": 154 + }, + { + "epoch": 0.08266666666666667, + "grad_norm": 0.5769832815862597, + "learning_rate": 0.00019856947330283752, + "loss": 0.8705, + "step": 155 + }, + { + "epoch": 0.0832, + "grad_norm": 0.5174392524864813, + "learning_rate": 0.00019854020152886814, + "loss": 0.8313, + "step": 156 + }, + { + "epoch": 0.08373333333333334, + "grad_norm": 0.46360358996447526, + "learning_rate": 0.0001985106354988997, + "loss": 0.7806, + "step": 157 + }, + { + "epoch": 0.08426666666666667, + "grad_norm": 0.5041118693355422, + "learning_rate": 0.00019848077530122083, + "loss": 0.8115, + "step": 158 + }, + { + "epoch": 0.0848, + "grad_norm": 0.4263069922493244, + "learning_rate": 0.0001984506210249986, + "loss": 0.8379, + "step": 159 + }, + { + "epoch": 0.08533333333333333, + "grad_norm": 0.4196993092812524, + "learning_rate": 0.00019842017276027832, + "loss": 0.7028, + "step": 160 + }, + { + "epoch": 0.08586666666666666, + "grad_norm": 0.43315861533948996, + "learning_rate": 0.00019838943059798304, + "loss": 0.7821, + "step": 161 + }, + { + "epoch": 0.0864, + "grad_norm": 0.4183118319005412, + "learning_rate": 0.00019835839462991361, + "loss": 0.6924, + "step": 162 + }, + { + "epoch": 0.08693333333333333, + "grad_norm": 0.4363924646924392, + "learning_rate": 0.0001983270649487481, + "loss": 0.7162, + "step": 163 + }, + { + "epoch": 0.08746666666666666, + "grad_norm": 0.5026222267940708, + "learning_rate": 0.0001982954416480417, + "loss": 0.8423, + "step": 164 + }, + { + "epoch": 0.088, + "grad_norm": 0.46954716345122977, + "learning_rate": 0.00019826352482222638, + "loss": 0.7307, + "step": 165 + }, + { + "epoch": 0.08853333333333334, + "grad_norm": 0.4334353060963593, + "learning_rate": 0.00019823131456661063, + "loss": 0.8122, + "step": 166 + }, + { + "epoch": 0.08906666666666667, + "grad_norm": 0.5343974725806075, + "learning_rate": 0.00019819881097737915, + "loss": 0.8015, + "step": 167 + }, + { + "epoch": 0.0896, + "grad_norm": 0.5152336777453058, + "learning_rate": 0.00019816601415159263, + "loss": 0.7392, + "step": 168 + }, + { + "epoch": 0.09013333333333333, + "grad_norm": 0.44125363525393313, + "learning_rate": 0.00019813292418718732, + "loss": 0.761, + "step": 169 + }, + { + "epoch": 0.09066666666666667, + "grad_norm": 0.4352274281590717, + "learning_rate": 0.0001980995411829749, + "loss": 0.7527, + "step": 170 + }, + { + "epoch": 0.0912, + "grad_norm": 0.4478477213882576, + "learning_rate": 0.0001980658652386421, + "loss": 0.7522, + "step": 171 + }, + { + "epoch": 0.09173333333333333, + "grad_norm": 0.46558101443158806, + "learning_rate": 0.0001980318964547504, + "loss": 0.8325, + "step": 172 + }, + { + "epoch": 0.09226666666666666, + "grad_norm": 0.4499716576712317, + "learning_rate": 0.0001979976349327357, + "loss": 0.737, + "step": 173 + }, + { + "epoch": 0.0928, + "grad_norm": 0.4556175554147233, + "learning_rate": 0.00019796308077490817, + "loss": 0.7528, + "step": 174 + }, + { + "epoch": 0.09333333333333334, + "grad_norm": 0.47783394689592473, + "learning_rate": 0.00019792823408445174, + "loss": 0.8193, + "step": 175 + }, + { + "epoch": 0.09386666666666667, + "grad_norm": 0.47807561494885875, + "learning_rate": 0.0001978930949654239, + "loss": 0.8089, + "step": 176 + }, + { + "epoch": 0.0944, + "grad_norm": 0.41756990234426405, + "learning_rate": 0.00019785766352275542, + "loss": 0.7716, + "step": 177 + }, + { + "epoch": 0.09493333333333333, + "grad_norm": 0.42731198203876414, + "learning_rate": 0.00019782193986224995, + "loss": 0.8142, + "step": 178 + }, + { + "epoch": 0.09546666666666667, + "grad_norm": 0.4847930882369442, + "learning_rate": 0.00019778592409058378, + "loss": 0.8385, + "step": 179 + }, + { + "epoch": 0.096, + "grad_norm": 0.464915969639284, + "learning_rate": 0.00019774961631530545, + "loss": 0.7324, + "step": 180 + }, + { + "epoch": 0.09653333333333333, + "grad_norm": 0.5002738971807956, + "learning_rate": 0.0001977130166448355, + "loss": 0.7015, + "step": 181 + }, + { + "epoch": 0.09706666666666666, + "grad_norm": 0.5023550965237535, + "learning_rate": 0.00019767612518846608, + "loss": 0.7307, + "step": 182 + }, + { + "epoch": 0.0976, + "grad_norm": 0.49150226338969616, + "learning_rate": 0.00019763894205636072, + "loss": 0.6957, + "step": 183 + }, + { + "epoch": 0.09813333333333334, + "grad_norm": 0.5018907968640427, + "learning_rate": 0.00019760146735955388, + "loss": 0.7841, + "step": 184 + }, + { + "epoch": 0.09866666666666667, + "grad_norm": 0.39214524669350853, + "learning_rate": 0.00019756370120995066, + "loss": 0.688, + "step": 185 + }, + { + "epoch": 0.0992, + "grad_norm": 0.42851833998123146, + "learning_rate": 0.00019752564372032657, + "loss": 0.744, + "step": 186 + }, + { + "epoch": 0.09973333333333333, + "grad_norm": 0.5115573107106643, + "learning_rate": 0.000197487295004327, + "loss": 0.7902, + "step": 187 + }, + { + "epoch": 0.10026666666666667, + "grad_norm": 0.5031210824511215, + "learning_rate": 0.00019744865517646706, + "loss": 0.8121, + "step": 188 + }, + { + "epoch": 0.1008, + "grad_norm": 0.4422611027768926, + "learning_rate": 0.00019740972435213115, + "loss": 0.7694, + "step": 189 + }, + { + "epoch": 0.10133333333333333, + "grad_norm": 0.40095867948126285, + "learning_rate": 0.0001973705026475726, + "loss": 0.7285, + "step": 190 + }, + { + "epoch": 0.10186666666666666, + "grad_norm": 0.42020494177988665, + "learning_rate": 0.00019733099017991341, + "loss": 0.717, + "step": 191 + }, + { + "epoch": 0.1024, + "grad_norm": 0.43899302381157285, + "learning_rate": 0.00019729118706714375, + "loss": 0.8096, + "step": 192 + }, + { + "epoch": 0.10293333333333334, + "grad_norm": 0.4055049264781474, + "learning_rate": 0.0001972510934281218, + "loss": 0.7496, + "step": 193 + }, + { + "epoch": 0.10346666666666667, + "grad_norm": 0.4409046256864588, + "learning_rate": 0.00019721070938257324, + "loss": 0.8141, + "step": 194 + }, + { + "epoch": 0.104, + "grad_norm": 0.46769922457624513, + "learning_rate": 0.00019717003505109095, + "loss": 0.775, + "step": 195 + }, + { + "epoch": 0.10453333333333334, + "grad_norm": 0.4693161824007968, + "learning_rate": 0.0001971290705551347, + "loss": 0.7576, + "step": 196 + }, + { + "epoch": 0.10506666666666667, + "grad_norm": 0.4791426220542533, + "learning_rate": 0.00019708781601703065, + "loss": 0.7763, + "step": 197 + }, + { + "epoch": 0.1056, + "grad_norm": 0.422635449428891, + "learning_rate": 0.00019704627155997108, + "loss": 0.8065, + "step": 198 + }, + { + "epoch": 0.10613333333333333, + "grad_norm": 0.43519477212457386, + "learning_rate": 0.00019700443730801413, + "loss": 0.731, + "step": 199 + }, + { + "epoch": 0.10666666666666667, + "grad_norm": 0.5021449909182034, + "learning_rate": 0.00019696231338608316, + "loss": 0.8571, + "step": 200 + }, + { + "epoch": 0.1072, + "grad_norm": 0.49111009786941157, + "learning_rate": 0.00019691989991996663, + "loss": 0.8424, + "step": 201 + }, + { + "epoch": 0.10773333333333333, + "grad_norm": 0.5004754574494609, + "learning_rate": 0.00019687719703631755, + "loss": 0.7619, + "step": 202 + }, + { + "epoch": 0.10826666666666666, + "grad_norm": 0.47862670141886915, + "learning_rate": 0.00019683420486265327, + "loss": 0.7789, + "step": 203 + }, + { + "epoch": 0.1088, + "grad_norm": 0.400535834468971, + "learning_rate": 0.0001967909235273549, + "loss": 0.8097, + "step": 204 + }, + { + "epoch": 0.10933333333333334, + "grad_norm": 0.3965503578402775, + "learning_rate": 0.0001967473531596671, + "loss": 0.7492, + "step": 205 + }, + { + "epoch": 0.10986666666666667, + "grad_norm": 0.46438594527621285, + "learning_rate": 0.0001967034938896976, + "loss": 0.7531, + "step": 206 + }, + { + "epoch": 0.1104, + "grad_norm": 0.49645821863670947, + "learning_rate": 0.00019665934584841682, + "loss": 0.7476, + "step": 207 + }, + { + "epoch": 0.11093333333333333, + "grad_norm": 0.4797602743957526, + "learning_rate": 0.0001966149091676575, + "loss": 0.8003, + "step": 208 + }, + { + "epoch": 0.11146666666666667, + "grad_norm": 0.48620645881858454, + "learning_rate": 0.00019657018398011434, + "loss": 0.7763, + "step": 209 + }, + { + "epoch": 0.112, + "grad_norm": 0.48710455717037493, + "learning_rate": 0.00019652517041934356, + "loss": 0.7308, + "step": 210 + }, + { + "epoch": 0.11253333333333333, + "grad_norm": 0.488739719891276, + "learning_rate": 0.00019647986861976246, + "loss": 0.8456, + "step": 211 + }, + { + "epoch": 0.11306666666666666, + "grad_norm": 0.4151643748880333, + "learning_rate": 0.0001964342787166491, + "loss": 0.6726, + "step": 212 + }, + { + "epoch": 0.1136, + "grad_norm": 0.5615694070241564, + "learning_rate": 0.00019638840084614182, + "loss": 0.7971, + "step": 213 + }, + { + "epoch": 0.11413333333333334, + "grad_norm": 0.4365307775047827, + "learning_rate": 0.0001963422351452389, + "loss": 0.7632, + "step": 214 + }, + { + "epoch": 0.11466666666666667, + "grad_norm": 0.4822219117865235, + "learning_rate": 0.0001962957817517982, + "loss": 0.7515, + "step": 215 + }, + { + "epoch": 0.1152, + "grad_norm": 0.5058365326021625, + "learning_rate": 0.00019624904080453655, + "loss": 0.7915, + "step": 216 + }, + { + "epoch": 0.11573333333333333, + "grad_norm": 0.44524691870655586, + "learning_rate": 0.00019620201244302952, + "loss": 0.7229, + "step": 217 + }, + { + "epoch": 0.11626666666666667, + "grad_norm": 0.47184407725185074, + "learning_rate": 0.00019615469680771096, + "loss": 0.7908, + "step": 218 + }, + { + "epoch": 0.1168, + "grad_norm": 0.52321985602367, + "learning_rate": 0.00019610709403987246, + "loss": 0.7475, + "step": 219 + }, + { + "epoch": 0.11733333333333333, + "grad_norm": 0.4259709526108592, + "learning_rate": 0.00019605920428166323, + "loss": 0.7393, + "step": 220 + }, + { + "epoch": 0.11786666666666666, + "grad_norm": 0.42008032226899766, + "learning_rate": 0.00019601102767608923, + "loss": 0.767, + "step": 221 + }, + { + "epoch": 0.1184, + "grad_norm": 0.5159087518954155, + "learning_rate": 0.00019596256436701324, + "loss": 0.8773, + "step": 222 + }, + { + "epoch": 0.11893333333333334, + "grad_norm": 0.4026844295780718, + "learning_rate": 0.00019591381449915397, + "loss": 0.7382, + "step": 223 + }, + { + "epoch": 0.11946666666666667, + "grad_norm": 0.457361121265748, + "learning_rate": 0.00019586477821808597, + "loss": 0.8101, + "step": 224 + }, + { + "epoch": 0.12, + "grad_norm": 0.39846496644139395, + "learning_rate": 0.000195815455670239, + "loss": 0.7036, + "step": 225 + }, + { + "epoch": 0.12053333333333334, + "grad_norm": 0.4633248517816551, + "learning_rate": 0.00019576584700289768, + "loss": 0.8288, + "step": 226 + }, + { + "epoch": 0.12106666666666667, + "grad_norm": 0.46349626775271696, + "learning_rate": 0.00019571595236420102, + "loss": 0.8263, + "step": 227 + }, + { + "epoch": 0.1216, + "grad_norm": 0.46354979074211466, + "learning_rate": 0.00019566577190314197, + "loss": 0.8229, + "step": 228 + }, + { + "epoch": 0.12213333333333333, + "grad_norm": 0.4622245498790009, + "learning_rate": 0.00019561530576956703, + "loss": 0.782, + "step": 229 + }, + { + "epoch": 0.12266666666666666, + "grad_norm": 0.4201225638647903, + "learning_rate": 0.00019556455411417573, + "loss": 0.7595, + "step": 230 + }, + { + "epoch": 0.1232, + "grad_norm": 0.45240939219601756, + "learning_rate": 0.0001955135170885202, + "loss": 0.7586, + "step": 231 + }, + { + "epoch": 0.12373333333333333, + "grad_norm": 0.4229103664112331, + "learning_rate": 0.00019546219484500475, + "loss": 0.7522, + "step": 232 + }, + { + "epoch": 0.12426666666666666, + "grad_norm": 0.47614492773008826, + "learning_rate": 0.00019541058753688538, + "loss": 0.8181, + "step": 233 + }, + { + "epoch": 0.1248, + "grad_norm": 0.5594293747726962, + "learning_rate": 0.00019535869531826937, + "loss": 0.863, + "step": 234 + }, + { + "epoch": 0.12533333333333332, + "grad_norm": 0.47959040417664484, + "learning_rate": 0.00019530651834411474, + "loss": 0.8326, + "step": 235 + }, + { + "epoch": 0.12586666666666665, + "grad_norm": 0.44611393404366323, + "learning_rate": 0.00019525405677022989, + "loss": 0.7456, + "step": 236 + }, + { + "epoch": 0.1264, + "grad_norm": 0.41646709788210534, + "learning_rate": 0.00019520131075327298, + "loss": 0.7606, + "step": 237 + }, + { + "epoch": 0.12693333333333334, + "grad_norm": 0.49413052721639045, + "learning_rate": 0.0001951482804507517, + "loss": 0.8225, + "step": 238 + }, + { + "epoch": 0.12746666666666667, + "grad_norm": 0.43207494172455535, + "learning_rate": 0.00019509496602102252, + "loss": 0.6985, + "step": 239 + }, + { + "epoch": 0.128, + "grad_norm": 0.4685530071491613, + "learning_rate": 0.00019504136762329047, + "loss": 0.8728, + "step": 240 + }, + { + "epoch": 0.12853333333333333, + "grad_norm": 0.4748529746658996, + "learning_rate": 0.00019498748541760846, + "loss": 0.765, + "step": 241 + }, + { + "epoch": 0.12906666666666666, + "grad_norm": 0.5153684120119952, + "learning_rate": 0.0001949333195648769, + "loss": 0.7984, + "step": 242 + }, + { + "epoch": 0.1296, + "grad_norm": 0.4687234921012601, + "learning_rate": 0.00019487887022684336, + "loss": 0.7725, + "step": 243 + }, + { + "epoch": 0.13013333333333332, + "grad_norm": 0.49591514964277283, + "learning_rate": 0.00019482413756610173, + "loss": 0.7857, + "step": 244 + }, + { + "epoch": 0.13066666666666665, + "grad_norm": 0.46511631376724744, + "learning_rate": 0.0001947691217460921, + "loss": 0.7754, + "step": 245 + }, + { + "epoch": 0.1312, + "grad_norm": 0.41817559631604106, + "learning_rate": 0.00019471382293110003, + "loss": 0.7059, + "step": 246 + }, + { + "epoch": 0.13173333333333334, + "grad_norm": 0.42292853515667883, + "learning_rate": 0.00019465824128625617, + "loss": 0.7317, + "step": 247 + }, + { + "epoch": 0.13226666666666667, + "grad_norm": 0.4555647309671736, + "learning_rate": 0.00019460237697753577, + "loss": 0.7881, + "step": 248 + }, + { + "epoch": 0.1328, + "grad_norm": 0.4106971317302793, + "learning_rate": 0.00019454623017175812, + "loss": 0.748, + "step": 249 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 0.4151948013402095, + "learning_rate": 0.00019448980103658613, + "loss": 0.7256, + "step": 250 + }, + { + "epoch": 0.13386666666666666, + "grad_norm": 0.461900745415348, + "learning_rate": 0.0001944330897405257, + "loss": 0.7443, + "step": 251 + }, + { + "epoch": 0.1344, + "grad_norm": 0.5106190880012906, + "learning_rate": 0.00019437609645292546, + "loss": 0.7918, + "step": 252 + }, + { + "epoch": 0.13493333333333332, + "grad_norm": 0.4449232128613463, + "learning_rate": 0.00019431882134397598, + "loss": 0.7788, + "step": 253 + }, + { + "epoch": 0.13546666666666668, + "grad_norm": 0.46317995146668955, + "learning_rate": 0.00019426126458470936, + "loss": 0.8401, + "step": 254 + }, + { + "epoch": 0.136, + "grad_norm": 0.5709331162036041, + "learning_rate": 0.0001942034263469989, + "loss": 0.7444, + "step": 255 + }, + { + "epoch": 0.13653333333333334, + "grad_norm": 0.4111423556450833, + "learning_rate": 0.00019414530680355837, + "loss": 0.7836, + "step": 256 + }, + { + "epoch": 0.13706666666666667, + "grad_norm": 0.416597649490344, + "learning_rate": 0.00019408690612794148, + "loss": 0.7613, + "step": 257 + }, + { + "epoch": 0.1376, + "grad_norm": 0.4437037245893358, + "learning_rate": 0.00019402822449454153, + "loss": 0.7686, + "step": 258 + }, + { + "epoch": 0.13813333333333333, + "grad_norm": 0.4804498085842282, + "learning_rate": 0.00019396926207859084, + "loss": 0.7955, + "step": 259 + }, + { + "epoch": 0.13866666666666666, + "grad_norm": 0.43955022809984884, + "learning_rate": 0.0001939100190561601, + "loss": 0.7277, + "step": 260 + }, + { + "epoch": 0.1392, + "grad_norm": 0.4730184206969309, + "learning_rate": 0.00019385049560415794, + "loss": 0.798, + "step": 261 + }, + { + "epoch": 0.13973333333333332, + "grad_norm": 0.47261200218264654, + "learning_rate": 0.0001937906919003304, + "loss": 0.825, + "step": 262 + }, + { + "epoch": 0.14026666666666668, + "grad_norm": 0.4958987350547001, + "learning_rate": 0.00019373060812326052, + "loss": 0.7624, + "step": 263 + }, + { + "epoch": 0.1408, + "grad_norm": 0.45183616075090804, + "learning_rate": 0.00019367024445236754, + "loss": 0.8222, + "step": 264 + }, + { + "epoch": 0.14133333333333334, + "grad_norm": 0.44803560898773614, + "learning_rate": 0.00019360960106790643, + "loss": 0.7226, + "step": 265 + }, + { + "epoch": 0.14186666666666667, + "grad_norm": 0.42718829995419133, + "learning_rate": 0.0001935486781509677, + "loss": 0.7481, + "step": 266 + }, + { + "epoch": 0.1424, + "grad_norm": 0.4353186573981329, + "learning_rate": 0.00019348747588347637, + "loss": 0.7603, + "step": 267 + }, + { + "epoch": 0.14293333333333333, + "grad_norm": 0.4202904819567657, + "learning_rate": 0.00019342599444819168, + "loss": 0.7449, + "step": 268 + }, + { + "epoch": 0.14346666666666666, + "grad_norm": 0.4794847518043432, + "learning_rate": 0.00019336423402870653, + "loss": 0.773, + "step": 269 + }, + { + "epoch": 0.144, + "grad_norm": 0.45356141483247475, + "learning_rate": 0.00019330219480944694, + "loss": 0.7635, + "step": 270 + }, + { + "epoch": 0.14453333333333335, + "grad_norm": 0.48737870973855363, + "learning_rate": 0.0001932398769756714, + "loss": 0.8234, + "step": 271 + }, + { + "epoch": 0.14506666666666668, + "grad_norm": 0.4555062112337216, + "learning_rate": 0.0001931772807134704, + "loss": 0.7405, + "step": 272 + }, + { + "epoch": 0.1456, + "grad_norm": 0.4328351212745179, + "learning_rate": 0.00019311440620976597, + "loss": 0.6942, + "step": 273 + }, + { + "epoch": 0.14613333333333334, + "grad_norm": 0.5617653746446652, + "learning_rate": 0.00019305125365231084, + "loss": 0.7743, + "step": 274 + }, + { + "epoch": 0.14666666666666667, + "grad_norm": 0.42251896089010094, + "learning_rate": 0.00019298782322968815, + "loss": 0.7327, + "step": 275 + }, + { + "epoch": 0.1472, + "grad_norm": 0.5347497686840524, + "learning_rate": 0.0001929241151313108, + "loss": 0.8222, + "step": 276 + }, + { + "epoch": 0.14773333333333333, + "grad_norm": 0.46392759769161884, + "learning_rate": 0.0001928601295474208, + "loss": 0.7476, + "step": 277 + }, + { + "epoch": 0.14826666666666666, + "grad_norm": 0.46963090477790553, + "learning_rate": 0.00019279586666908884, + "loss": 0.7022, + "step": 278 + }, + { + "epoch": 0.1488, + "grad_norm": 0.4266844164826448, + "learning_rate": 0.00019273132668821364, + "loss": 0.7656, + "step": 279 + }, + { + "epoch": 0.14933333333333335, + "grad_norm": 0.41242368746109326, + "learning_rate": 0.00019266650979752136, + "loss": 0.8116, + "step": 280 + }, + { + "epoch": 0.14986666666666668, + "grad_norm": 0.47546443364095325, + "learning_rate": 0.00019260141619056507, + "loss": 0.7875, + "step": 281 + }, + { + "epoch": 0.1504, + "grad_norm": 0.5292143795351046, + "learning_rate": 0.00019253604606172417, + "loss": 0.7666, + "step": 282 + }, + { + "epoch": 0.15093333333333334, + "grad_norm": 0.47803327081719627, + "learning_rate": 0.0001924703996062038, + "loss": 0.784, + "step": 283 + }, + { + "epoch": 0.15146666666666667, + "grad_norm": 0.4018308323084294, + "learning_rate": 0.0001924044770200342, + "loss": 0.7479, + "step": 284 + }, + { + "epoch": 0.152, + "grad_norm": 0.43518027414862276, + "learning_rate": 0.00019233827850007027, + "loss": 0.7893, + "step": 285 + }, + { + "epoch": 0.15253333333333333, + "grad_norm": 0.45918856850372475, + "learning_rate": 0.0001922718042439908, + "loss": 0.7766, + "step": 286 + }, + { + "epoch": 0.15306666666666666, + "grad_norm": 0.44310990709580944, + "learning_rate": 0.000192205054450298, + "loss": 0.7575, + "step": 287 + }, + { + "epoch": 0.1536, + "grad_norm": 0.4824186440304817, + "learning_rate": 0.00019213802931831696, + "loss": 0.8228, + "step": 288 + }, + { + "epoch": 0.15413333333333334, + "grad_norm": 0.4164815411310583, + "learning_rate": 0.00019207072904819486, + "loss": 0.7584, + "step": 289 + }, + { + "epoch": 0.15466666666666667, + "grad_norm": 0.4492882028447551, + "learning_rate": 0.00019200315384090044, + "loss": 0.7653, + "step": 290 + }, + { + "epoch": 0.1552, + "grad_norm": 0.4583365189022047, + "learning_rate": 0.00019193530389822363, + "loss": 0.7365, + "step": 291 + }, + { + "epoch": 0.15573333333333333, + "grad_norm": 0.40883526606640835, + "learning_rate": 0.00019186717942277462, + "loss": 0.7414, + "step": 292 + }, + { + "epoch": 0.15626666666666666, + "grad_norm": 0.4571007026103209, + "learning_rate": 0.00019179878061798347, + "loss": 0.7315, + "step": 293 + }, + { + "epoch": 0.1568, + "grad_norm": 0.43915788183077686, + "learning_rate": 0.00019173010768809933, + "loss": 0.7649, + "step": 294 + }, + { + "epoch": 0.15733333333333333, + "grad_norm": 0.42781757478259014, + "learning_rate": 0.00019166116083819002, + "loss": 0.7031, + "step": 295 + }, + { + "epoch": 0.15786666666666666, + "grad_norm": 0.4194122860641237, + "learning_rate": 0.00019159194027414128, + "loss": 0.7347, + "step": 296 + }, + { + "epoch": 0.1584, + "grad_norm": 0.3946453275568492, + "learning_rate": 0.0001915224462026563, + "loss": 0.7288, + "step": 297 + }, + { + "epoch": 0.15893333333333334, + "grad_norm": 0.3716516600450698, + "learning_rate": 0.00019145267883125482, + "loss": 0.677, + "step": 298 + }, + { + "epoch": 0.15946666666666667, + "grad_norm": 0.569373216702423, + "learning_rate": 0.00019138263836827288, + "loss": 0.789, + "step": 299 + }, + { + "epoch": 0.16, + "grad_norm": 0.3925415031456995, + "learning_rate": 0.00019131232502286188, + "loss": 0.723, + "step": 300 + }, + { + "epoch": 0.16053333333333333, + "grad_norm": 0.4295146910368237, + "learning_rate": 0.00019124173900498818, + "loss": 0.7327, + "step": 301 + }, + { + "epoch": 0.16106666666666666, + "grad_norm": 0.4391000814211482, + "learning_rate": 0.00019117088052543233, + "loss": 0.7953, + "step": 302 + }, + { + "epoch": 0.1616, + "grad_norm": 0.4974671790719608, + "learning_rate": 0.0001910997497957885, + "loss": 0.7985, + "step": 303 + }, + { + "epoch": 0.16213333333333332, + "grad_norm": 0.4793462957844173, + "learning_rate": 0.00019102834702846387, + "loss": 0.7564, + "step": 304 + }, + { + "epoch": 0.16266666666666665, + "grad_norm": 0.4205080963822408, + "learning_rate": 0.0001909566724366779, + "loss": 0.7161, + "step": 305 + }, + { + "epoch": 0.1632, + "grad_norm": 0.4442006612150125, + "learning_rate": 0.00019088472623446183, + "loss": 0.7848, + "step": 306 + }, + { + "epoch": 0.16373333333333334, + "grad_norm": 0.4677563528998587, + "learning_rate": 0.00019081250863665794, + "loss": 0.8152, + "step": 307 + }, + { + "epoch": 0.16426666666666667, + "grad_norm": 0.4463099795826012, + "learning_rate": 0.0001907400198589189, + "loss": 0.7082, + "step": 308 + }, + { + "epoch": 0.1648, + "grad_norm": 0.41450881976512516, + "learning_rate": 0.00019066726011770726, + "loss": 0.8286, + "step": 309 + }, + { + "epoch": 0.16533333333333333, + "grad_norm": 0.49032251343742833, + "learning_rate": 0.00019059422963029464, + "loss": 0.8278, + "step": 310 + }, + { + "epoch": 0.16586666666666666, + "grad_norm": 0.5551963533783116, + "learning_rate": 0.0001905209286147611, + "loss": 0.7858, + "step": 311 + }, + { + "epoch": 0.1664, + "grad_norm": 0.4425974836672353, + "learning_rate": 0.0001904473572899947, + "loss": 0.7261, + "step": 312 + }, + { + "epoch": 0.16693333333333332, + "grad_norm": 0.4308581660899142, + "learning_rate": 0.0001903735158756905, + "loss": 0.7759, + "step": 313 + }, + { + "epoch": 0.16746666666666668, + "grad_norm": 0.4089398086125329, + "learning_rate": 0.0001902994045923502, + "loss": 0.7032, + "step": 314 + }, + { + "epoch": 0.168, + "grad_norm": 0.4478445524514376, + "learning_rate": 0.00019022502366128135, + "loss": 0.7864, + "step": 315 + }, + { + "epoch": 0.16853333333333334, + "grad_norm": 0.4349562448937956, + "learning_rate": 0.0001901503733045967, + "loss": 0.6596, + "step": 316 + }, + { + "epoch": 0.16906666666666667, + "grad_norm": 0.47145823286121696, + "learning_rate": 0.00019007545374521355, + "loss": 0.7355, + "step": 317 + }, + { + "epoch": 0.1696, + "grad_norm": 0.4527311758640081, + "learning_rate": 0.00019000026520685302, + "loss": 0.7405, + "step": 318 + }, + { + "epoch": 0.17013333333333333, + "grad_norm": 0.5662376082754184, + "learning_rate": 0.00018992480791403958, + "loss": 0.7689, + "step": 319 + }, + { + "epoch": 0.17066666666666666, + "grad_norm": 0.5303863637398115, + "learning_rate": 0.0001898490820921001, + "loss": 0.8052, + "step": 320 + }, + { + "epoch": 0.1712, + "grad_norm": 0.4457136180185171, + "learning_rate": 0.0001897730879671634, + "loss": 0.761, + "step": 321 + }, + { + "epoch": 0.17173333333333332, + "grad_norm": 0.521176367976079, + "learning_rate": 0.0001896968257661595, + "loss": 0.7912, + "step": 322 + }, + { + "epoch": 0.17226666666666668, + "grad_norm": 0.3993581009496419, + "learning_rate": 0.00018962029571681886, + "loss": 0.7552, + "step": 323 + }, + { + "epoch": 0.1728, + "grad_norm": 0.482634638047166, + "learning_rate": 0.00018954349804767184, + "loss": 0.7378, + "step": 324 + }, + { + "epoch": 0.17333333333333334, + "grad_norm": 0.47156012587256724, + "learning_rate": 0.00018946643298804793, + "loss": 0.7993, + "step": 325 + }, + { + "epoch": 0.17386666666666667, + "grad_norm": 0.4529242253730277, + "learning_rate": 0.00018938910076807513, + "loss": 0.8097, + "step": 326 + }, + { + "epoch": 0.1744, + "grad_norm": 0.43198723603603484, + "learning_rate": 0.00018931150161867916, + "loss": 0.7645, + "step": 327 + }, + { + "epoch": 0.17493333333333333, + "grad_norm": 0.49769246836747966, + "learning_rate": 0.0001892336357715829, + "loss": 0.7583, + "step": 328 + }, + { + "epoch": 0.17546666666666666, + "grad_norm": 0.44381661099224823, + "learning_rate": 0.0001891555034593055, + "loss": 0.8192, + "step": 329 + }, + { + "epoch": 0.176, + "grad_norm": 0.41365517886042213, + "learning_rate": 0.00018907710491516199, + "loss": 0.7281, + "step": 330 + }, + { + "epoch": 0.17653333333333332, + "grad_norm": 0.39253340117724317, + "learning_rate": 0.00018899844037326225, + "loss": 0.7751, + "step": 331 + }, + { + "epoch": 0.17706666666666668, + "grad_norm": 0.4221356965204189, + "learning_rate": 0.0001889195100685106, + "loss": 0.7335, + "step": 332 + }, + { + "epoch": 0.1776, + "grad_norm": 0.465812608519683, + "learning_rate": 0.0001888403142366049, + "loss": 0.8006, + "step": 333 + }, + { + "epoch": 0.17813333333333334, + "grad_norm": 0.39955412682780783, + "learning_rate": 0.00018876085311403593, + "loss": 0.7791, + "step": 334 + }, + { + "epoch": 0.17866666666666667, + "grad_norm": 0.4198511360966401, + "learning_rate": 0.00018868112693808665, + "loss": 0.7713, + "step": 335 + }, + { + "epoch": 0.1792, + "grad_norm": 0.4683459551741011, + "learning_rate": 0.00018860113594683148, + "loss": 0.7975, + "step": 336 + }, + { + "epoch": 0.17973333333333333, + "grad_norm": 0.524369646706343, + "learning_rate": 0.00018852088037913577, + "loss": 0.7755, + "step": 337 + }, + { + "epoch": 0.18026666666666666, + "grad_norm": 0.4475292419332502, + "learning_rate": 0.0001884403604746547, + "loss": 0.7293, + "step": 338 + }, + { + "epoch": 0.1808, + "grad_norm": 0.47005833665256663, + "learning_rate": 0.00018835957647383303, + "loss": 0.734, + "step": 339 + }, + { + "epoch": 0.18133333333333335, + "grad_norm": 0.5459009790421003, + "learning_rate": 0.00018827852861790398, + "loss": 0.8221, + "step": 340 + }, + { + "epoch": 0.18186666666666668, + "grad_norm": 0.4717398269488305, + "learning_rate": 0.00018819721714888877, + "loss": 0.7679, + "step": 341 + }, + { + "epoch": 0.1824, + "grad_norm": 0.4614656366933708, + "learning_rate": 0.00018811564230959588, + "loss": 0.6819, + "step": 342 + }, + { + "epoch": 0.18293333333333334, + "grad_norm": 0.4426786261456807, + "learning_rate": 0.00018803380434362, + "loss": 0.7383, + "step": 343 + }, + { + "epoch": 0.18346666666666667, + "grad_norm": 0.5046949310507889, + "learning_rate": 0.0001879517034953418, + "loss": 0.7637, + "step": 344 + }, + { + "epoch": 0.184, + "grad_norm": 0.5151554930335568, + "learning_rate": 0.00018786934000992688, + "loss": 0.8356, + "step": 345 + }, + { + "epoch": 0.18453333333333333, + "grad_norm": 0.48056516871808136, + "learning_rate": 0.00018778671413332513, + "loss": 0.802, + "step": 346 + }, + { + "epoch": 0.18506666666666666, + "grad_norm": 0.3669928978344368, + "learning_rate": 0.00018770382611226987, + "loss": 0.6344, + "step": 347 + }, + { + "epoch": 0.1856, + "grad_norm": 0.38839417685216243, + "learning_rate": 0.00018762067619427746, + "loss": 0.6665, + "step": 348 + }, + { + "epoch": 0.18613333333333335, + "grad_norm": 0.444571466544383, + "learning_rate": 0.000187537264627646, + "loss": 0.8017, + "step": 349 + }, + { + "epoch": 0.18666666666666668, + "grad_norm": 0.48649755629417796, + "learning_rate": 0.00018745359166145523, + "loss": 0.7976, + "step": 350 + }, + { + "epoch": 0.1872, + "grad_norm": 0.4674034942105575, + "learning_rate": 0.00018736965754556528, + "loss": 0.7909, + "step": 351 + }, + { + "epoch": 0.18773333333333334, + "grad_norm": 0.40054121875355303, + "learning_rate": 0.00018728546253061614, + "loss": 0.7854, + "step": 352 + }, + { + "epoch": 0.18826666666666667, + "grad_norm": 0.45543333246211487, + "learning_rate": 0.00018720100686802694, + "loss": 0.8092, + "step": 353 + }, + { + "epoch": 0.1888, + "grad_norm": 0.43356723573270656, + "learning_rate": 0.00018711629080999504, + "loss": 0.7513, + "step": 354 + }, + { + "epoch": 0.18933333333333333, + "grad_norm": 0.42987940452717943, + "learning_rate": 0.00018703131460949554, + "loss": 0.75, + "step": 355 + }, + { + "epoch": 0.18986666666666666, + "grad_norm": 0.43154615134467134, + "learning_rate": 0.0001869460785202802, + "loss": 0.7715, + "step": 356 + }, + { + "epoch": 0.1904, + "grad_norm": 0.4577432897967549, + "learning_rate": 0.00018686058279687698, + "loss": 0.7883, + "step": 357 + }, + { + "epoch": 0.19093333333333334, + "grad_norm": 0.4038076486111166, + "learning_rate": 0.00018677482769458904, + "loss": 0.7199, + "step": 358 + }, + { + "epoch": 0.19146666666666667, + "grad_norm": 0.5876808158493642, + "learning_rate": 0.00018668881346949417, + "loss": 0.6808, + "step": 359 + }, + { + "epoch": 0.192, + "grad_norm": 0.4428224313753587, + "learning_rate": 0.00018660254037844388, + "loss": 0.7276, + "step": 360 + }, + { + "epoch": 0.19253333333333333, + "grad_norm": 0.5834742907517645, + "learning_rate": 0.00018651600867906272, + "loss": 0.8507, + "step": 361 + }, + { + "epoch": 0.19306666666666666, + "grad_norm": 0.4530072437109187, + "learning_rate": 0.00018642921862974742, + "loss": 0.7194, + "step": 362 + }, + { + "epoch": 0.1936, + "grad_norm": 0.45015304766820907, + "learning_rate": 0.00018634217048966637, + "loss": 0.732, + "step": 363 + }, + { + "epoch": 0.19413333333333332, + "grad_norm": 0.5145809525792678, + "learning_rate": 0.00018625486451875843, + "loss": 0.8305, + "step": 364 + }, + { + "epoch": 0.19466666666666665, + "grad_norm": 0.4470293039575751, + "learning_rate": 0.0001861673009777325, + "loss": 0.7451, + "step": 365 + }, + { + "epoch": 0.1952, + "grad_norm": 0.4187471245853827, + "learning_rate": 0.0001860794801280666, + "loss": 0.7431, + "step": 366 + }, + { + "epoch": 0.19573333333333334, + "grad_norm": 0.4542301308072446, + "learning_rate": 0.00018599140223200716, + "loss": 0.7999, + "step": 367 + }, + { + "epoch": 0.19626666666666667, + "grad_norm": 0.4139522048552885, + "learning_rate": 0.0001859030675525681, + "loss": 0.7829, + "step": 368 + }, + { + "epoch": 0.1968, + "grad_norm": 0.39572804667237355, + "learning_rate": 0.0001858144763535302, + "loss": 0.7474, + "step": 369 + }, + { + "epoch": 0.19733333333333333, + "grad_norm": 0.42426366372189334, + "learning_rate": 0.0001857256288994402, + "loss": 0.7804, + "step": 370 + }, + { + "epoch": 0.19786666666666666, + "grad_norm": 0.4998095801741916, + "learning_rate": 0.00018563652545561013, + "loss": 0.824, + "step": 371 + }, + { + "epoch": 0.1984, + "grad_norm": 0.41234129174941464, + "learning_rate": 0.0001855471662881164, + "loss": 0.7441, + "step": 372 + }, + { + "epoch": 0.19893333333333332, + "grad_norm": 0.38419386021960866, + "learning_rate": 0.000185457551663799, + "loss": 0.7612, + "step": 373 + }, + { + "epoch": 0.19946666666666665, + "grad_norm": 0.5637900248272991, + "learning_rate": 0.00018536768185026083, + "loss": 0.7429, + "step": 374 + }, + { + "epoch": 0.2, + "grad_norm": 0.41953989413339743, + "learning_rate": 0.00018527755711586678, + "loss": 0.7709, + "step": 375 + }, + { + "epoch": 0.20053333333333334, + "grad_norm": 0.45785135429480267, + "learning_rate": 0.00018518717772974302, + "loss": 0.7743, + "step": 376 + }, + { + "epoch": 0.20106666666666667, + "grad_norm": 0.40332005903710416, + "learning_rate": 0.00018509654396177609, + "loss": 0.7278, + "step": 377 + }, + { + "epoch": 0.2016, + "grad_norm": 0.4304370314944406, + "learning_rate": 0.00018500565608261214, + "loss": 0.786, + "step": 378 + }, + { + "epoch": 0.20213333333333333, + "grad_norm": 0.4134176865184048, + "learning_rate": 0.00018491451436365627, + "loss": 0.7411, + "step": 379 + }, + { + "epoch": 0.20266666666666666, + "grad_norm": 0.44883302145505194, + "learning_rate": 0.0001848231190770714, + "loss": 0.7239, + "step": 380 + }, + { + "epoch": 0.2032, + "grad_norm": 0.4446190605144744, + "learning_rate": 0.00018473147049577774, + "loss": 0.6804, + "step": 381 + }, + { + "epoch": 0.20373333333333332, + "grad_norm": 0.4559578816460269, + "learning_rate": 0.00018463956889345194, + "loss": 0.7216, + "step": 382 + }, + { + "epoch": 0.20426666666666668, + "grad_norm": 0.46692582047382547, + "learning_rate": 0.00018454741454452603, + "loss": 0.778, + "step": 383 + }, + { + "epoch": 0.2048, + "grad_norm": 0.4670038870928643, + "learning_rate": 0.00018445500772418697, + "loss": 0.7928, + "step": 384 + }, + { + "epoch": 0.20533333333333334, + "grad_norm": 0.42271310945924956, + "learning_rate": 0.00018436234870837547, + "loss": 0.716, + "step": 385 + }, + { + "epoch": 0.20586666666666667, + "grad_norm": 0.47097578907718746, + "learning_rate": 0.00018426943777378552, + "loss": 0.777, + "step": 386 + }, + { + "epoch": 0.2064, + "grad_norm": 0.42074461461628854, + "learning_rate": 0.00018417627519786315, + "loss": 0.7355, + "step": 387 + }, + { + "epoch": 0.20693333333333333, + "grad_norm": 0.3771891409228768, + "learning_rate": 0.00018408286125880604, + "loss": 0.705, + "step": 388 + }, + { + "epoch": 0.20746666666666666, + "grad_norm": 0.4143114769704897, + "learning_rate": 0.00018398919623556238, + "loss": 0.7558, + "step": 389 + }, + { + "epoch": 0.208, + "grad_norm": 0.41762981116418973, + "learning_rate": 0.00018389528040783012, + "loss": 0.7619, + "step": 390 + }, + { + "epoch": 0.20853333333333332, + "grad_norm": 0.4833105403522849, + "learning_rate": 0.0001838011140560562, + "loss": 0.7258, + "step": 391 + }, + { + "epoch": 0.20906666666666668, + "grad_norm": 0.5263680970417046, + "learning_rate": 0.00018370669746143564, + "loss": 0.7977, + "step": 392 + }, + { + "epoch": 0.2096, + "grad_norm": 0.5252260664462001, + "learning_rate": 0.00018361203090591071, + "loss": 0.8086, + "step": 393 + }, + { + "epoch": 0.21013333333333334, + "grad_norm": 0.4060831593752984, + "learning_rate": 0.0001835171146721701, + "loss": 0.6517, + "step": 394 + }, + { + "epoch": 0.21066666666666667, + "grad_norm": 0.5183938396928125, + "learning_rate": 0.00018342194904364813, + "loss": 0.7967, + "step": 395 + }, + { + "epoch": 0.2112, + "grad_norm": 0.5223281194557258, + "learning_rate": 0.00018332653430452376, + "loss": 0.8407, + "step": 396 + }, + { + "epoch": 0.21173333333333333, + "grad_norm": 0.539061397830276, + "learning_rate": 0.00018323087073971993, + "loss": 0.8545, + "step": 397 + }, + { + "epoch": 0.21226666666666666, + "grad_norm": 0.4547422637203524, + "learning_rate": 0.00018313495863490258, + "loss": 0.7115, + "step": 398 + }, + { + "epoch": 0.2128, + "grad_norm": 0.3878963454195641, + "learning_rate": 0.00018303879827647975, + "loss": 0.738, + "step": 399 + }, + { + "epoch": 0.21333333333333335, + "grad_norm": 0.42732703361178465, + "learning_rate": 0.00018294238995160094, + "loss": 0.6755, + "step": 400 + }, + { + "epoch": 0.21386666666666668, + "grad_norm": 0.43086787308731883, + "learning_rate": 0.00018284573394815597, + "loss": 0.7616, + "step": 401 + }, + { + "epoch": 0.2144, + "grad_norm": 0.5263563910058618, + "learning_rate": 0.00018274883055477436, + "loss": 0.7579, + "step": 402 + }, + { + "epoch": 0.21493333333333334, + "grad_norm": 0.46390603341389813, + "learning_rate": 0.00018265168006082437, + "loss": 0.7246, + "step": 403 + }, + { + "epoch": 0.21546666666666667, + "grad_norm": 0.4384442085538908, + "learning_rate": 0.00018255428275641214, + "loss": 0.6936, + "step": 404 + }, + { + "epoch": 0.216, + "grad_norm": 0.5084653458182219, + "learning_rate": 0.00018245663893238075, + "loss": 0.7694, + "step": 405 + }, + { + "epoch": 0.21653333333333333, + "grad_norm": 0.42629750863324306, + "learning_rate": 0.0001823587488803095, + "loss": 0.7096, + "step": 406 + }, + { + "epoch": 0.21706666666666666, + "grad_norm": 0.4174225185106244, + "learning_rate": 0.00018226061289251298, + "loss": 0.7668, + "step": 407 + }, + { + "epoch": 0.2176, + "grad_norm": 0.4848832911881881, + "learning_rate": 0.00018216223126204007, + "loss": 0.7749, + "step": 408 + }, + { + "epoch": 0.21813333333333335, + "grad_norm": 0.5256712576294765, + "learning_rate": 0.00018206360428267332, + "loss": 0.8654, + "step": 409 + }, + { + "epoch": 0.21866666666666668, + "grad_norm": 0.45369406635948906, + "learning_rate": 0.00018196473224892784, + "loss": 0.7768, + "step": 410 + }, + { + "epoch": 0.2192, + "grad_norm": 0.4580442622619772, + "learning_rate": 0.00018186561545605054, + "loss": 0.7038, + "step": 411 + }, + { + "epoch": 0.21973333333333334, + "grad_norm": 0.4121359019799272, + "learning_rate": 0.0001817662542000192, + "loss": 0.7529, + "step": 412 + }, + { + "epoch": 0.22026666666666667, + "grad_norm": 0.4745036119867769, + "learning_rate": 0.0001816666487775416, + "loss": 0.7799, + "step": 413 + }, + { + "epoch": 0.2208, + "grad_norm": 0.4653160110390173, + "learning_rate": 0.00018156679948605467, + "loss": 0.7328, + "step": 414 + }, + { + "epoch": 0.22133333333333333, + "grad_norm": 0.3900147216295794, + "learning_rate": 0.00018146670662372354, + "loss": 0.7004, + "step": 415 + }, + { + "epoch": 0.22186666666666666, + "grad_norm": 0.5052791180641548, + "learning_rate": 0.0001813663704894407, + "loss": 0.8386, + "step": 416 + }, + { + "epoch": 0.2224, + "grad_norm": 0.44377730164941653, + "learning_rate": 0.00018126579138282503, + "loss": 0.8182, + "step": 417 + }, + { + "epoch": 0.22293333333333334, + "grad_norm": 0.4279979054066684, + "learning_rate": 0.00018116496960422107, + "loss": 0.7957, + "step": 418 + }, + { + "epoch": 0.22346666666666667, + "grad_norm": 0.4178082284065879, + "learning_rate": 0.00018106390545469795, + "loss": 0.7324, + "step": 419 + }, + { + "epoch": 0.224, + "grad_norm": 0.48231159640949595, + "learning_rate": 0.0001809625992360485, + "loss": 0.7425, + "step": 420 + }, + { + "epoch": 0.22453333333333333, + "grad_norm": 0.49797186124876386, + "learning_rate": 0.00018086105125078857, + "loss": 0.7728, + "step": 421 + }, + { + "epoch": 0.22506666666666666, + "grad_norm": 0.4210454989697462, + "learning_rate": 0.00018075926180215576, + "loss": 0.708, + "step": 422 + }, + { + "epoch": 0.2256, + "grad_norm": 0.43773426687044753, + "learning_rate": 0.00018065723119410884, + "loss": 0.8104, + "step": 423 + }, + { + "epoch": 0.22613333333333333, + "grad_norm": 0.47375967549476183, + "learning_rate": 0.0001805549597313267, + "loss": 0.7881, + "step": 424 + }, + { + "epoch": 0.22666666666666666, + "grad_norm": 0.4508248166569347, + "learning_rate": 0.0001804524477192075, + "loss": 0.7566, + "step": 425 + }, + { + "epoch": 0.2272, + "grad_norm": 0.5109080365948736, + "learning_rate": 0.00018034969546386757, + "loss": 0.8352, + "step": 426 + }, + { + "epoch": 0.22773333333333334, + "grad_norm": 0.4105244821037551, + "learning_rate": 0.00018024670327214084, + "loss": 0.6978, + "step": 427 + }, + { + "epoch": 0.22826666666666667, + "grad_norm": 0.4770073011942472, + "learning_rate": 0.00018014347145157755, + "loss": 0.7503, + "step": 428 + }, + { + "epoch": 0.2288, + "grad_norm": 0.4476019882960167, + "learning_rate": 0.0001800400003104436, + "loss": 0.729, + "step": 429 + }, + { + "epoch": 0.22933333333333333, + "grad_norm": 0.4337305503040713, + "learning_rate": 0.0001799362901577196, + "loss": 0.6877, + "step": 430 + }, + { + "epoch": 0.22986666666666666, + "grad_norm": 0.4395490030745343, + "learning_rate": 0.00017983234130309968, + "loss": 0.7754, + "step": 431 + }, + { + "epoch": 0.2304, + "grad_norm": 0.4478885632204688, + "learning_rate": 0.00017972815405699103, + "loss": 0.7173, + "step": 432 + }, + { + "epoch": 0.23093333333333332, + "grad_norm": 0.45970180329057325, + "learning_rate": 0.00017962372873051252, + "loss": 0.7813, + "step": 433 + }, + { + "epoch": 0.23146666666666665, + "grad_norm": 0.46335197040398113, + "learning_rate": 0.00017951906563549397, + "loss": 0.7375, + "step": 434 + }, + { + "epoch": 0.232, + "grad_norm": 0.4778764846294453, + "learning_rate": 0.00017941416508447536, + "loss": 0.7646, + "step": 435 + }, + { + "epoch": 0.23253333333333334, + "grad_norm": 0.46623072488367895, + "learning_rate": 0.00017930902739070562, + "loss": 0.7116, + "step": 436 + }, + { + "epoch": 0.23306666666666667, + "grad_norm": 0.45062956425618433, + "learning_rate": 0.00017920365286814183, + "loss": 0.7567, + "step": 437 + }, + { + "epoch": 0.2336, + "grad_norm": 0.3901412272035564, + "learning_rate": 0.0001790980418314484, + "loss": 0.67, + "step": 438 + }, + { + "epoch": 0.23413333333333333, + "grad_norm": 0.4481168814065678, + "learning_rate": 0.0001789921945959958, + "loss": 0.7487, + "step": 439 + }, + { + "epoch": 0.23466666666666666, + "grad_norm": 0.45255804332698363, + "learning_rate": 0.00017888611147786002, + "loss": 0.7566, + "step": 440 + }, + { + "epoch": 0.2352, + "grad_norm": 0.3858371436267006, + "learning_rate": 0.00017877979279382135, + "loss": 0.7214, + "step": 441 + }, + { + "epoch": 0.23573333333333332, + "grad_norm": 0.3838906928309417, + "learning_rate": 0.00017867323886136348, + "loss": 0.7824, + "step": 442 + }, + { + "epoch": 0.23626666666666668, + "grad_norm": 0.6921743256899109, + "learning_rate": 0.00017856644999867264, + "loss": 0.8048, + "step": 443 + }, + { + "epoch": 0.2368, + "grad_norm": 0.4723095825934299, + "learning_rate": 0.0001784594265246366, + "loss": 0.8044, + "step": 444 + }, + { + "epoch": 0.23733333333333334, + "grad_norm": 0.4735570320108188, + "learning_rate": 0.00017835216875884368, + "loss": 0.6807, + "step": 445 + }, + { + "epoch": 0.23786666666666667, + "grad_norm": 0.5826638776213384, + "learning_rate": 0.0001782446770215819, + "loss": 0.7849, + "step": 446 + }, + { + "epoch": 0.2384, + "grad_norm": 0.45486726183484355, + "learning_rate": 0.0001781369516338378, + "loss": 0.8277, + "step": 447 + }, + { + "epoch": 0.23893333333333333, + "grad_norm": 0.4997180204666084, + "learning_rate": 0.00017802899291729585, + "loss": 0.8167, + "step": 448 + }, + { + "epoch": 0.23946666666666666, + "grad_norm": 0.4469017072685085, + "learning_rate": 0.0001779208011943371, + "loss": 0.734, + "step": 449 + }, + { + "epoch": 0.24, + "grad_norm": 0.4265899873892352, + "learning_rate": 0.00017781237678803847, + "loss": 0.7077, + "step": 450 + }, + { + "epoch": 0.24053333333333332, + "grad_norm": 0.4665814125248521, + "learning_rate": 0.00017770372002217172, + "loss": 0.7479, + "step": 451 + }, + { + "epoch": 0.24106666666666668, + "grad_norm": 0.5392805201410067, + "learning_rate": 0.00017759483122120238, + "loss": 0.7737, + "step": 452 + }, + { + "epoch": 0.2416, + "grad_norm": 0.5366952879705709, + "learning_rate": 0.000177485710710289, + "loss": 0.8441, + "step": 453 + }, + { + "epoch": 0.24213333333333334, + "grad_norm": 0.44525037077182855, + "learning_rate": 0.00017737635881528196, + "loss": 0.7602, + "step": 454 + }, + { + "epoch": 0.24266666666666667, + "grad_norm": 0.42518163895662253, + "learning_rate": 0.00017726677586272263, + "loss": 0.7703, + "step": 455 + }, + { + "epoch": 0.2432, + "grad_norm": 0.4977376483405448, + "learning_rate": 0.00017715696217984235, + "loss": 0.7516, + "step": 456 + }, + { + "epoch": 0.24373333333333333, + "grad_norm": 0.482970794527716, + "learning_rate": 0.00017704691809456143, + "loss": 0.7232, + "step": 457 + }, + { + "epoch": 0.24426666666666666, + "grad_norm": 0.4457733986004114, + "learning_rate": 0.0001769366439354882, + "loss": 0.7644, + "step": 458 + }, + { + "epoch": 0.2448, + "grad_norm": 0.44855878261627097, + "learning_rate": 0.00017682614003191807, + "loss": 0.8107, + "step": 459 + }, + { + "epoch": 0.24533333333333332, + "grad_norm": 0.43853317843352935, + "learning_rate": 0.00017671540671383243, + "loss": 0.7253, + "step": 460 + }, + { + "epoch": 0.24586666666666668, + "grad_norm": 0.5211453375036034, + "learning_rate": 0.0001766044443118978, + "loss": 0.8156, + "step": 461 + }, + { + "epoch": 0.2464, + "grad_norm": 0.5456521312038222, + "learning_rate": 0.00017649325315746478, + "loss": 0.7169, + "step": 462 + }, + { + "epoch": 0.24693333333333334, + "grad_norm": 0.44472418234899885, + "learning_rate": 0.00017638183358256696, + "loss": 0.7461, + "step": 463 + }, + { + "epoch": 0.24746666666666667, + "grad_norm": 0.44575325016140327, + "learning_rate": 0.00017627018591992018, + "loss": 0.7025, + "step": 464 + }, + { + "epoch": 0.248, + "grad_norm": 0.4327582162947951, + "learning_rate": 0.0001761583105029213, + "loss": 0.7489, + "step": 465 + }, + { + "epoch": 0.24853333333333333, + "grad_norm": 0.4141928177631284, + "learning_rate": 0.00017604620766564723, + "loss": 0.7049, + "step": 466 + }, + { + "epoch": 0.24906666666666666, + "grad_norm": 0.6626481468166168, + "learning_rate": 0.00017593387774285412, + "loss": 0.8694, + "step": 467 + }, + { + "epoch": 0.2496, + "grad_norm": 0.548728387711451, + "learning_rate": 0.00017582132106997616, + "loss": 0.8548, + "step": 468 + }, + { + "epoch": 0.2501333333333333, + "grad_norm": 0.42774074789324845, + "learning_rate": 0.0001757085379831246, + "loss": 0.7204, + "step": 469 + }, + { + "epoch": 0.25066666666666665, + "grad_norm": 0.4199250541649349, + "learning_rate": 0.00017559552881908695, + "loss": 0.7111, + "step": 470 + }, + { + "epoch": 0.2512, + "grad_norm": 0.5061021079058381, + "learning_rate": 0.00017548229391532572, + "loss": 0.8057, + "step": 471 + }, + { + "epoch": 0.2517333333333333, + "grad_norm": 0.4495334182361452, + "learning_rate": 0.00017536883360997743, + "loss": 0.7956, + "step": 472 + }, + { + "epoch": 0.25226666666666664, + "grad_norm": 0.48165073581798634, + "learning_rate": 0.00017525514824185185, + "loss": 0.8259, + "step": 473 + }, + { + "epoch": 0.2528, + "grad_norm": 0.41335385257603885, + "learning_rate": 0.00017514123815043074, + "loss": 0.7342, + "step": 474 + }, + { + "epoch": 0.25333333333333335, + "grad_norm": 0.4145266461877525, + "learning_rate": 0.00017502710367586687, + "loss": 0.7595, + "step": 475 + }, + { + "epoch": 0.2538666666666667, + "grad_norm": 0.44201502102589235, + "learning_rate": 0.0001749127451589832, + "loss": 0.7829, + "step": 476 + }, + { + "epoch": 0.2544, + "grad_norm": 0.5203049947428746, + "learning_rate": 0.00017479816294127152, + "loss": 0.8107, + "step": 477 + }, + { + "epoch": 0.25493333333333335, + "grad_norm": 0.3863436364378562, + "learning_rate": 0.00017468335736489177, + "loss": 0.6327, + "step": 478 + }, + { + "epoch": 0.2554666666666667, + "grad_norm": 0.4551135421256658, + "learning_rate": 0.00017456832877267084, + "loss": 0.7279, + "step": 479 + }, + { + "epoch": 0.256, + "grad_norm": 0.46564114422191183, + "learning_rate": 0.0001744530775081015, + "loss": 0.778, + "step": 480 + }, + { + "epoch": 0.25653333333333334, + "grad_norm": 0.48069249451736823, + "learning_rate": 0.00017433760391534167, + "loss": 0.735, + "step": 481 + }, + { + "epoch": 0.25706666666666667, + "grad_norm": 0.477377028194487, + "learning_rate": 0.00017422190833921283, + "loss": 0.7508, + "step": 482 + }, + { + "epoch": 0.2576, + "grad_norm": 0.4939324041520078, + "learning_rate": 0.0001741059911251997, + "loss": 0.7438, + "step": 483 + }, + { + "epoch": 0.2581333333333333, + "grad_norm": 0.40333028899327916, + "learning_rate": 0.00017398985261944856, + "loss": 0.6965, + "step": 484 + }, + { + "epoch": 0.25866666666666666, + "grad_norm": 0.4391616856195648, + "learning_rate": 0.00017387349316876666, + "loss": 0.7382, + "step": 485 + }, + { + "epoch": 0.2592, + "grad_norm": 0.4158524659490832, + "learning_rate": 0.000173756913120621, + "loss": 0.71, + "step": 486 + }, + { + "epoch": 0.2597333333333333, + "grad_norm": 0.4651585154255733, + "learning_rate": 0.0001736401128231373, + "loss": 0.7553, + "step": 487 + }, + { + "epoch": 0.26026666666666665, + "grad_norm": 0.4304389835842047, + "learning_rate": 0.00017352309262509894, + "loss": 0.7894, + "step": 488 + }, + { + "epoch": 0.2608, + "grad_norm": 0.40308975255561746, + "learning_rate": 0.00017340585287594604, + "loss": 0.6878, + "step": 489 + }, + { + "epoch": 0.2613333333333333, + "grad_norm": 0.4258435882075208, + "learning_rate": 0.0001732883939257742, + "loss": 0.7406, + "step": 490 + }, + { + "epoch": 0.2618666666666667, + "grad_norm": 0.419156478110173, + "learning_rate": 0.0001731707161253338, + "loss": 0.7157, + "step": 491 + }, + { + "epoch": 0.2624, + "grad_norm": 0.4050502111153129, + "learning_rate": 0.0001730528198260285, + "loss": 0.7601, + "step": 492 + }, + { + "epoch": 0.26293333333333335, + "grad_norm": 0.4757091960255574, + "learning_rate": 0.00017293470537991463, + "loss": 0.7088, + "step": 493 + }, + { + "epoch": 0.2634666666666667, + "grad_norm": 0.4805129071729885, + "learning_rate": 0.00017281637313969978, + "loss": 0.6709, + "step": 494 + }, + { + "epoch": 0.264, + "grad_norm": 0.4655840407847795, + "learning_rate": 0.00017269782345874203, + "loss": 0.7099, + "step": 495 + }, + { + "epoch": 0.26453333333333334, + "grad_norm": 0.6808307954945988, + "learning_rate": 0.00017257905669104874, + "loss": 0.8317, + "step": 496 + }, + { + "epoch": 0.2650666666666667, + "grad_norm": 0.4651069645499294, + "learning_rate": 0.00017246007319127545, + "loss": 0.7326, + "step": 497 + }, + { + "epoch": 0.2656, + "grad_norm": 0.42048664296755117, + "learning_rate": 0.00017234087331472497, + "loss": 0.7201, + "step": 498 + }, + { + "epoch": 0.26613333333333333, + "grad_norm": 0.4336395380929332, + "learning_rate": 0.00017222145741734626, + "loss": 0.7521, + "step": 499 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 0.3820731757706012, + "learning_rate": 0.00017210182585573327, + "loss": 0.7068, + "step": 500 + }, + { + "epoch": 0.2672, + "grad_norm": 0.43070761285149495, + "learning_rate": 0.00017198197898712404, + "loss": 0.7258, + "step": 501 + }, + { + "epoch": 0.2677333333333333, + "grad_norm": 0.4793623382558248, + "learning_rate": 0.00017186191716939944, + "loss": 0.7424, + "step": 502 + }, + { + "epoch": 0.26826666666666665, + "grad_norm": 0.4525946541267634, + "learning_rate": 0.0001717416407610824, + "loss": 0.7221, + "step": 503 + }, + { + "epoch": 0.2688, + "grad_norm": 0.4081739057196793, + "learning_rate": 0.00017162115012133643, + "loss": 0.7204, + "step": 504 + }, + { + "epoch": 0.2693333333333333, + "grad_norm": 0.39358430542956696, + "learning_rate": 0.00017150044560996488, + "loss": 0.7097, + "step": 505 + }, + { + "epoch": 0.26986666666666664, + "grad_norm": 0.39278965176432057, + "learning_rate": 0.00017137952758740978, + "loss": 0.664, + "step": 506 + }, + { + "epoch": 0.2704, + "grad_norm": 0.5334725427950925, + "learning_rate": 0.00017125839641475072, + "loss": 0.7787, + "step": 507 + }, + { + "epoch": 0.27093333333333336, + "grad_norm": 0.4674629512590317, + "learning_rate": 0.00017113705245370368, + "loss": 0.7136, + "step": 508 + }, + { + "epoch": 0.2714666666666667, + "grad_norm": 0.4687141573028758, + "learning_rate": 0.00017101549606662024, + "loss": 0.7125, + "step": 509 + }, + { + "epoch": 0.272, + "grad_norm": 0.48416110239095644, + "learning_rate": 0.00017089372761648616, + "loss": 0.7883, + "step": 510 + }, + { + "epoch": 0.27253333333333335, + "grad_norm": 0.7565257686655121, + "learning_rate": 0.00017077174746692056, + "loss": 0.7748, + "step": 511 + }, + { + "epoch": 0.2730666666666667, + "grad_norm": 0.383224478590731, + "learning_rate": 0.00017064955598217462, + "loss": 0.6952, + "step": 512 + }, + { + "epoch": 0.2736, + "grad_norm": 0.4036380929840031, + "learning_rate": 0.00017052715352713075, + "loss": 0.7542, + "step": 513 + }, + { + "epoch": 0.27413333333333334, + "grad_norm": 0.46326149095647695, + "learning_rate": 0.00017040454046730115, + "loss": 0.7347, + "step": 514 + }, + { + "epoch": 0.27466666666666667, + "grad_norm": 0.5846257957630842, + "learning_rate": 0.00017028171716882714, + "loss": 0.8552, + "step": 515 + }, + { + "epoch": 0.2752, + "grad_norm": 0.4465688747242848, + "learning_rate": 0.00017015868399847768, + "loss": 0.7976, + "step": 516 + }, + { + "epoch": 0.27573333333333333, + "grad_norm": 0.3917549578289485, + "learning_rate": 0.00017003544132364846, + "loss": 0.6694, + "step": 517 + }, + { + "epoch": 0.27626666666666666, + "grad_norm": 0.4048606026638533, + "learning_rate": 0.00016991198951236088, + "loss": 0.7049, + "step": 518 + }, + { + "epoch": 0.2768, + "grad_norm": 0.44460583808185367, + "learning_rate": 0.00016978832893326074, + "loss": 0.7478, + "step": 519 + }, + { + "epoch": 0.2773333333333333, + "grad_norm": 0.39128767229480316, + "learning_rate": 0.00016966445995561727, + "loss": 0.7546, + "step": 520 + }, + { + "epoch": 0.27786666666666665, + "grad_norm": 0.37647276925663165, + "learning_rate": 0.00016954038294932216, + "loss": 0.6637, + "step": 521 + }, + { + "epoch": 0.2784, + "grad_norm": 0.4351759016292475, + "learning_rate": 0.00016941609828488807, + "loss": 0.7691, + "step": 522 + }, + { + "epoch": 0.2789333333333333, + "grad_norm": 0.4182015452967113, + "learning_rate": 0.0001692916063334479, + "loss": 0.7539, + "step": 523 + }, + { + "epoch": 0.27946666666666664, + "grad_norm": 0.42134974744242665, + "learning_rate": 0.0001691669074667535, + "loss": 0.6873, + "step": 524 + }, + { + "epoch": 0.28, + "grad_norm": 0.45725498098243716, + "learning_rate": 0.0001690420020571747, + "loss": 0.8192, + "step": 525 + }, + { + "epoch": 0.28053333333333336, + "grad_norm": 0.44261361809918787, + "learning_rate": 0.0001689168904776979, + "loss": 0.7056, + "step": 526 + }, + { + "epoch": 0.2810666666666667, + "grad_norm": 0.5130096571268379, + "learning_rate": 0.00016879157310192535, + "loss": 0.7339, + "step": 527 + }, + { + "epoch": 0.2816, + "grad_norm": 0.4008058826449141, + "learning_rate": 0.0001686660503040737, + "loss": 0.7322, + "step": 528 + }, + { + "epoch": 0.28213333333333335, + "grad_norm": 0.45631894274308066, + "learning_rate": 0.00016854032245897308, + "loss": 0.7956, + "step": 529 + }, + { + "epoch": 0.2826666666666667, + "grad_norm": 0.43336270108874525, + "learning_rate": 0.00016841438994206595, + "loss": 0.7363, + "step": 530 + }, + { + "epoch": 0.2832, + "grad_norm": 0.47032489917062453, + "learning_rate": 0.00016828825312940592, + "loss": 0.7352, + "step": 531 + }, + { + "epoch": 0.28373333333333334, + "grad_norm": 0.4125145135017204, + "learning_rate": 0.00016816191239765667, + "loss": 0.6969, + "step": 532 + }, + { + "epoch": 0.28426666666666667, + "grad_norm": 0.4662370497753131, + "learning_rate": 0.00016803536812409075, + "loss": 0.7986, + "step": 533 + }, + { + "epoch": 0.2848, + "grad_norm": 0.3824746192067442, + "learning_rate": 0.0001679086206865886, + "loss": 0.7539, + "step": 534 + }, + { + "epoch": 0.2853333333333333, + "grad_norm": 0.4394673484421157, + "learning_rate": 0.00016778167046363734, + "loss": 0.7932, + "step": 535 + }, + { + "epoch": 0.28586666666666666, + "grad_norm": 0.3928956694847816, + "learning_rate": 0.00016765451783432953, + "loss": 0.7564, + "step": 536 + }, + { + "epoch": 0.2864, + "grad_norm": 0.3794180412480094, + "learning_rate": 0.00016752716317836229, + "loss": 0.7352, + "step": 537 + }, + { + "epoch": 0.2869333333333333, + "grad_norm": 0.4321572584394699, + "learning_rate": 0.0001673996068760359, + "loss": 0.7277, + "step": 538 + }, + { + "epoch": 0.28746666666666665, + "grad_norm": 0.3886733474800513, + "learning_rate": 0.00016727184930825288, + "loss": 0.6527, + "step": 539 + }, + { + "epoch": 0.288, + "grad_norm": 0.49422080811805685, + "learning_rate": 0.0001671438908565167, + "loss": 0.8295, + "step": 540 + }, + { + "epoch": 0.2885333333333333, + "grad_norm": 0.49166063763300444, + "learning_rate": 0.00016701573190293077, + "loss": 0.6551, + "step": 541 + }, + { + "epoch": 0.2890666666666667, + "grad_norm": 0.47865172461051464, + "learning_rate": 0.00016688737283019706, + "loss": 0.6979, + "step": 542 + }, + { + "epoch": 0.2896, + "grad_norm": 0.40959610643890904, + "learning_rate": 0.00016675881402161536, + "loss": 0.7106, + "step": 543 + }, + { + "epoch": 0.29013333333333335, + "grad_norm": 0.4503720919421993, + "learning_rate": 0.00016663005586108176, + "loss": 0.8032, + "step": 544 + }, + { + "epoch": 0.2906666666666667, + "grad_norm": 0.4498320174874359, + "learning_rate": 0.00016650109873308765, + "loss": 0.7532, + "step": 545 + }, + { + "epoch": 0.2912, + "grad_norm": 0.47826486319096867, + "learning_rate": 0.0001663719430227186, + "loss": 0.7374, + "step": 546 + }, + { + "epoch": 0.29173333333333334, + "grad_norm": 0.3978639353039526, + "learning_rate": 0.0001662425891156531, + "loss": 0.7523, + "step": 547 + }, + { + "epoch": 0.2922666666666667, + "grad_norm": 0.48483046835985555, + "learning_rate": 0.00016611303739816168, + "loss": 0.7692, + "step": 548 + }, + { + "epoch": 0.2928, + "grad_norm": 0.45089043354921626, + "learning_rate": 0.00016598328825710533, + "loss": 0.7443, + "step": 549 + }, + { + "epoch": 0.29333333333333333, + "grad_norm": 0.4665950656664754, + "learning_rate": 0.00016585334207993476, + "loss": 0.7904, + "step": 550 + }, + { + "epoch": 0.29386666666666666, + "grad_norm": 0.3883149125160683, + "learning_rate": 0.00016572319925468892, + "loss": 0.7179, + "step": 551 + }, + { + "epoch": 0.2944, + "grad_norm": 0.41632379179992457, + "learning_rate": 0.000165592860169994, + "loss": 0.7281, + "step": 552 + }, + { + "epoch": 0.2949333333333333, + "grad_norm": 0.4977495720997442, + "learning_rate": 0.0001654623252150624, + "loss": 0.7299, + "step": 553 + }, + { + "epoch": 0.29546666666666666, + "grad_norm": 0.47903261526417296, + "learning_rate": 0.00016533159477969122, + "loss": 0.7304, + "step": 554 + }, + { + "epoch": 0.296, + "grad_norm": 0.37701238814284743, + "learning_rate": 0.00016520066925426144, + "loss": 0.7557, + "step": 555 + }, + { + "epoch": 0.2965333333333333, + "grad_norm": 0.41930029095265514, + "learning_rate": 0.00016506954902973655, + "loss": 0.742, + "step": 556 + }, + { + "epoch": 0.29706666666666665, + "grad_norm": 0.4188454431920848, + "learning_rate": 0.00016493823449766136, + "loss": 0.6864, + "step": 557 + }, + { + "epoch": 0.2976, + "grad_norm": 0.4468757996322642, + "learning_rate": 0.0001648067260501611, + "loss": 0.712, + "step": 558 + }, + { + "epoch": 0.2981333333333333, + "grad_norm": 0.4343228600884407, + "learning_rate": 0.00016467502407993992, + "loss": 0.7297, + "step": 559 + }, + { + "epoch": 0.2986666666666667, + "grad_norm": 0.40922799139319394, + "learning_rate": 0.0001645431289802799, + "loss": 0.6923, + "step": 560 + }, + { + "epoch": 0.2992, + "grad_norm": 0.4203326579304541, + "learning_rate": 0.0001644110411450398, + "loss": 0.7145, + "step": 561 + }, + { + "epoch": 0.29973333333333335, + "grad_norm": 0.47335490523849333, + "learning_rate": 0.00016427876096865394, + "loss": 0.7525, + "step": 562 + }, + { + "epoch": 0.3002666666666667, + "grad_norm": 0.41381219985814566, + "learning_rate": 0.00016414628884613107, + "loss": 0.7399, + "step": 563 + }, + { + "epoch": 0.3008, + "grad_norm": 0.4409350610415614, + "learning_rate": 0.00016401362517305296, + "loss": 0.681, + "step": 564 + }, + { + "epoch": 0.30133333333333334, + "grad_norm": 0.3538655075484582, + "learning_rate": 0.00016388077034557355, + "loss": 0.7164, + "step": 565 + }, + { + "epoch": 0.30186666666666667, + "grad_norm": 0.43471299509023836, + "learning_rate": 0.00016374772476041748, + "loss": 0.761, + "step": 566 + }, + { + "epoch": 0.3024, + "grad_norm": 0.4410425533885929, + "learning_rate": 0.00016361448881487914, + "loss": 0.7548, + "step": 567 + }, + { + "epoch": 0.30293333333333333, + "grad_norm": 0.4511426783652151, + "learning_rate": 0.00016348106290682118, + "loss": 0.8016, + "step": 568 + }, + { + "epoch": 0.30346666666666666, + "grad_norm": 0.4009363166019702, + "learning_rate": 0.00016334744743467364, + "loss": 0.7287, + "step": 569 + }, + { + "epoch": 0.304, + "grad_norm": 0.4775594919679725, + "learning_rate": 0.00016321364279743266, + "loss": 0.8506, + "step": 570 + }, + { + "epoch": 0.3045333333333333, + "grad_norm": 0.45067055657521304, + "learning_rate": 0.00016307964939465914, + "loss": 0.7547, + "step": 571 + }, + { + "epoch": 0.30506666666666665, + "grad_norm": 0.5112675671620446, + "learning_rate": 0.00016294546762647775, + "loss": 0.7682, + "step": 572 + }, + { + "epoch": 0.3056, + "grad_norm": 0.4307918320940786, + "learning_rate": 0.0001628110978935756, + "loss": 0.7041, + "step": 573 + }, + { + "epoch": 0.3061333333333333, + "grad_norm": 0.43442178567013523, + "learning_rate": 0.0001626765405972011, + "loss": 0.7524, + "step": 574 + }, + { + "epoch": 0.30666666666666664, + "grad_norm": 0.4668392119887552, + "learning_rate": 0.00016254179613916278, + "loss": 0.6954, + "step": 575 + }, + { + "epoch": 0.3072, + "grad_norm": 0.5133283395950824, + "learning_rate": 0.00016240686492182804, + "loss": 0.8325, + "step": 576 + }, + { + "epoch": 0.30773333333333336, + "grad_norm": 0.4417181247890331, + "learning_rate": 0.000162271747348122, + "loss": 0.7256, + "step": 577 + }, + { + "epoch": 0.3082666666666667, + "grad_norm": 0.45562250143164257, + "learning_rate": 0.0001621364438215262, + "loss": 0.7251, + "step": 578 + }, + { + "epoch": 0.3088, + "grad_norm": 0.44188741570717166, + "learning_rate": 0.00016200095474607753, + "loss": 0.7584, + "step": 579 + }, + { + "epoch": 0.30933333333333335, + "grad_norm": 0.41305164806040257, + "learning_rate": 0.00016186528052636692, + "loss": 0.7725, + "step": 580 + }, + { + "epoch": 0.3098666666666667, + "grad_norm": 0.4448580030745107, + "learning_rate": 0.0001617294215675382, + "loss": 0.7517, + "step": 581 + }, + { + "epoch": 0.3104, + "grad_norm": 0.41272246176475724, + "learning_rate": 0.00016159337827528685, + "loss": 0.6428, + "step": 582 + }, + { + "epoch": 0.31093333333333334, + "grad_norm": 0.427761714352196, + "learning_rate": 0.0001614571510558588, + "loss": 0.7489, + "step": 583 + }, + { + "epoch": 0.31146666666666667, + "grad_norm": 0.41542254625356395, + "learning_rate": 0.00016132074031604917, + "loss": 0.6996, + "step": 584 + }, + { + "epoch": 0.312, + "grad_norm": 0.42220688807675394, + "learning_rate": 0.0001611841464632011, + "loss": 0.7579, + "step": 585 + }, + { + "epoch": 0.31253333333333333, + "grad_norm": 0.3687717725151293, + "learning_rate": 0.00016104736990520468, + "loss": 0.7211, + "step": 586 + }, + { + "epoch": 0.31306666666666666, + "grad_norm": 0.3881030069969, + "learning_rate": 0.0001609104110504954, + "loss": 0.7115, + "step": 587 + }, + { + "epoch": 0.3136, + "grad_norm": 0.3657955407253785, + "learning_rate": 0.0001607732703080532, + "loss": 0.7149, + "step": 588 + }, + { + "epoch": 0.3141333333333333, + "grad_norm": 0.40800587674458866, + "learning_rate": 0.00016063594808740113, + "loss": 0.7142, + "step": 589 + }, + { + "epoch": 0.31466666666666665, + "grad_norm": 0.42721068026872167, + "learning_rate": 0.00016049844479860422, + "loss": 0.7204, + "step": 590 + }, + { + "epoch": 0.3152, + "grad_norm": 0.5371856046024449, + "learning_rate": 0.00016036076085226814, + "loss": 0.7918, + "step": 591 + }, + { + "epoch": 0.3157333333333333, + "grad_norm": 0.362776360031712, + "learning_rate": 0.00016022289665953808, + "loss": 0.672, + "step": 592 + }, + { + "epoch": 0.31626666666666664, + "grad_norm": 0.3889297840285753, + "learning_rate": 0.00016008485263209742, + "loss": 0.7069, + "step": 593 + }, + { + "epoch": 0.3168, + "grad_norm": 0.5222808118875192, + "learning_rate": 0.0001599466291821666, + "loss": 0.6683, + "step": 594 + }, + { + "epoch": 0.31733333333333336, + "grad_norm": 0.47788481609241606, + "learning_rate": 0.0001598082267225018, + "loss": 0.8138, + "step": 595 + }, + { + "epoch": 0.3178666666666667, + "grad_norm": 0.43098527220345256, + "learning_rate": 0.0001596696456663938, + "loss": 0.7268, + "step": 596 + }, + { + "epoch": 0.3184, + "grad_norm": 0.4110374263224395, + "learning_rate": 0.0001595308864276666, + "loss": 0.6886, + "step": 597 + }, + { + "epoch": 0.31893333333333335, + "grad_norm": 0.4260683340007289, + "learning_rate": 0.00015939194942067646, + "loss": 0.7213, + "step": 598 + }, + { + "epoch": 0.3194666666666667, + "grad_norm": 0.38562882015100414, + "learning_rate": 0.0001592528350603103, + "loss": 0.6659, + "step": 599 + }, + { + "epoch": 0.32, + "grad_norm": 0.4545905517487075, + "learning_rate": 0.0001591135437619847, + "loss": 0.7252, + "step": 600 + }, + { + "epoch": 0.32053333333333334, + "grad_norm": 0.42394997305272447, + "learning_rate": 0.00015897407594164467, + "loss": 0.7273, + "step": 601 + }, + { + "epoch": 0.32106666666666667, + "grad_norm": 0.41157563157389504, + "learning_rate": 0.00015883443201576225, + "loss": 0.6836, + "step": 602 + }, + { + "epoch": 0.3216, + "grad_norm": 0.44840001190614737, + "learning_rate": 0.0001586946124013354, + "loss": 0.7583, + "step": 603 + }, + { + "epoch": 0.3221333333333333, + "grad_norm": 0.4182736826915793, + "learning_rate": 0.00015855461751588677, + "loss": 0.7195, + "step": 604 + }, + { + "epoch": 0.32266666666666666, + "grad_norm": 0.4544480209267658, + "learning_rate": 0.0001584144477774623, + "loss": 0.7668, + "step": 605 + }, + { + "epoch": 0.3232, + "grad_norm": 0.5152676059003954, + "learning_rate": 0.0001582741036046301, + "loss": 0.7463, + "step": 606 + }, + { + "epoch": 0.3237333333333333, + "grad_norm": 0.40967721945566854, + "learning_rate": 0.00015813358541647915, + "loss": 0.7725, + "step": 607 + }, + { + "epoch": 0.32426666666666665, + "grad_norm": 0.4036255773867977, + "learning_rate": 0.00015799289363261813, + "loss": 0.6744, + "step": 608 + }, + { + "epoch": 0.3248, + "grad_norm": 0.38950914089952277, + "learning_rate": 0.00015785202867317407, + "loss": 0.7673, + "step": 609 + }, + { + "epoch": 0.3253333333333333, + "grad_norm": 0.4286416680955874, + "learning_rate": 0.00015771099095879108, + "loss": 0.7499, + "step": 610 + }, + { + "epoch": 0.3258666666666667, + "grad_norm": 0.43590141467819987, + "learning_rate": 0.0001575697809106292, + "loss": 0.7338, + "step": 611 + }, + { + "epoch": 0.3264, + "grad_norm": 0.4910032323410539, + "learning_rate": 0.00015742839895036305, + "loss": 0.6965, + "step": 612 + }, + { + "epoch": 0.32693333333333335, + "grad_norm": 0.3693106294847623, + "learning_rate": 0.00015728684550018064, + "loss": 0.6599, + "step": 613 + }, + { + "epoch": 0.3274666666666667, + "grad_norm": 0.49096247961929523, + "learning_rate": 0.0001571451209827821, + "loss": 0.7749, + "step": 614 + }, + { + "epoch": 0.328, + "grad_norm": 0.39355592492079217, + "learning_rate": 0.00015700322582137827, + "loss": 0.6922, + "step": 615 + }, + { + "epoch": 0.32853333333333334, + "grad_norm": 0.47682273617619825, + "learning_rate": 0.00015686116043968972, + "loss": 0.7402, + "step": 616 + }, + { + "epoch": 0.3290666666666667, + "grad_norm": 0.4858149349113941, + "learning_rate": 0.00015671892526194516, + "loss": 0.7702, + "step": 617 + }, + { + "epoch": 0.3296, + "grad_norm": 0.37638690329692875, + "learning_rate": 0.0001565765207128805, + "loss": 0.7093, + "step": 618 + }, + { + "epoch": 0.33013333333333333, + "grad_norm": 0.4496997952146683, + "learning_rate": 0.0001564339472177373, + "loss": 0.6926, + "step": 619 + }, + { + "epoch": 0.33066666666666666, + "grad_norm": 0.4156548552072922, + "learning_rate": 0.00015629120520226165, + "loss": 0.7482, + "step": 620 + }, + { + "epoch": 0.3312, + "grad_norm": 0.4139701893635094, + "learning_rate": 0.0001561482950927029, + "loss": 0.7038, + "step": 621 + }, + { + "epoch": 0.3317333333333333, + "grad_norm": 0.42070868574423165, + "learning_rate": 0.0001560052173158123, + "loss": 0.7312, + "step": 622 + }, + { + "epoch": 0.33226666666666665, + "grad_norm": 0.3996824303934374, + "learning_rate": 0.00015586197229884184, + "loss": 0.6693, + "step": 623 + }, + { + "epoch": 0.3328, + "grad_norm": 0.5498724293289201, + "learning_rate": 0.00015571856046954285, + "loss": 0.7898, + "step": 624 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.39087687283609013, + "learning_rate": 0.00015557498225616487, + "loss": 0.6658, + "step": 625 + }, + { + "epoch": 0.33386666666666664, + "grad_norm": 0.49763265105140403, + "learning_rate": 0.0001554312380874542, + "loss": 0.782, + "step": 626 + }, + { + "epoch": 0.3344, + "grad_norm": 0.49971072384955406, + "learning_rate": 0.00015528732839265272, + "loss": 0.7626, + "step": 627 + }, + { + "epoch": 0.33493333333333336, + "grad_norm": 0.471518962075809, + "learning_rate": 0.00015514325360149668, + "loss": 0.7155, + "step": 628 + }, + { + "epoch": 0.3354666666666667, + "grad_norm": 0.46901778025938734, + "learning_rate": 0.0001549990141442153, + "loss": 0.7799, + "step": 629 + }, + { + "epoch": 0.336, + "grad_norm": 0.3915997285214422, + "learning_rate": 0.0001548546104515294, + "loss": 0.6841, + "step": 630 + }, + { + "epoch": 0.33653333333333335, + "grad_norm": 0.3809616396925241, + "learning_rate": 0.00015471004295465035, + "loss": 0.6585, + "step": 631 + }, + { + "epoch": 0.3370666666666667, + "grad_norm": 0.45379184123717903, + "learning_rate": 0.0001545653120852787, + "loss": 0.6928, + "step": 632 + }, + { + "epoch": 0.3376, + "grad_norm": 0.39650350795032846, + "learning_rate": 0.00015442041827560274, + "loss": 0.7238, + "step": 633 + }, + { + "epoch": 0.33813333333333334, + "grad_norm": 0.3948371161329786, + "learning_rate": 0.00015427536195829742, + "loss": 0.6788, + "step": 634 + }, + { + "epoch": 0.33866666666666667, + "grad_norm": 0.4032691640595932, + "learning_rate": 0.00015413014356652286, + "loss": 0.7184, + "step": 635 + }, + { + "epoch": 0.3392, + "grad_norm": 0.43571883502203596, + "learning_rate": 0.00015398476353392323, + "loss": 0.7005, + "step": 636 + }, + { + "epoch": 0.33973333333333333, + "grad_norm": 0.4328528400250972, + "learning_rate": 0.00015383922229462549, + "loss": 0.7296, + "step": 637 + }, + { + "epoch": 0.34026666666666666, + "grad_norm": 0.6203793014054366, + "learning_rate": 0.00015369352028323774, + "loss": 0.7943, + "step": 638 + }, + { + "epoch": 0.3408, + "grad_norm": 0.42032184862200134, + "learning_rate": 0.00015354765793484834, + "loss": 0.7077, + "step": 639 + }, + { + "epoch": 0.3413333333333333, + "grad_norm": 0.42287690028119707, + "learning_rate": 0.0001534016356850244, + "loss": 0.7419, + "step": 640 + }, + { + "epoch": 0.34186666666666665, + "grad_norm": 0.3953099910639756, + "learning_rate": 0.0001532554539698105, + "loss": 0.7189, + "step": 641 + }, + { + "epoch": 0.3424, + "grad_norm": 0.365946781409118, + "learning_rate": 0.00015310911322572753, + "loss": 0.6947, + "step": 642 + }, + { + "epoch": 0.3429333333333333, + "grad_norm": 0.4121012123710881, + "learning_rate": 0.00015296261388977108, + "loss": 0.7287, + "step": 643 + }, + { + "epoch": 0.34346666666666664, + "grad_norm": 0.40568950884016236, + "learning_rate": 0.0001528159563994104, + "loss": 0.7091, + "step": 644 + }, + { + "epoch": 0.344, + "grad_norm": 0.42927205307186717, + "learning_rate": 0.000152669141192587, + "loss": 0.7421, + "step": 645 + }, + { + "epoch": 0.34453333333333336, + "grad_norm": 0.42045019660612665, + "learning_rate": 0.00015252216870771345, + "loss": 0.7248, + "step": 646 + }, + { + "epoch": 0.3450666666666667, + "grad_norm": 0.4688332920501223, + "learning_rate": 0.00015237503938367186, + "loss": 0.7538, + "step": 647 + }, + { + "epoch": 0.3456, + "grad_norm": 0.4078824082008241, + "learning_rate": 0.00015222775365981273, + "loss": 0.6997, + "step": 648 + }, + { + "epoch": 0.34613333333333335, + "grad_norm": 0.40740133303695125, + "learning_rate": 0.00015208031197595356, + "loss": 0.7137, + "step": 649 + }, + { + "epoch": 0.3466666666666667, + "grad_norm": 0.45547854576707125, + "learning_rate": 0.0001519327147723776, + "loss": 0.7737, + "step": 650 + }, + { + "epoch": 0.3472, + "grad_norm": 0.3594657384543693, + "learning_rate": 0.00015178496248983254, + "loss": 0.6625, + "step": 651 + }, + { + "epoch": 0.34773333333333334, + "grad_norm": 0.4190519097986457, + "learning_rate": 0.0001516370555695291, + "loss": 0.6854, + "step": 652 + }, + { + "epoch": 0.34826666666666667, + "grad_norm": 0.4113728495190875, + "learning_rate": 0.00015148899445313981, + "loss": 0.7386, + "step": 653 + }, + { + "epoch": 0.3488, + "grad_norm": 0.4265455138939629, + "learning_rate": 0.00015134077958279765, + "loss": 0.7259, + "step": 654 + }, + { + "epoch": 0.34933333333333333, + "grad_norm": 0.4021569493622652, + "learning_rate": 0.00015119241140109467, + "loss": 0.6953, + "step": 655 + }, + { + "epoch": 0.34986666666666666, + "grad_norm": 0.6083975824782651, + "learning_rate": 0.00015104389035108077, + "loss": 0.7177, + "step": 656 + }, + { + "epoch": 0.3504, + "grad_norm": 0.4249433086732558, + "learning_rate": 0.00015089521687626243, + "loss": 0.6917, + "step": 657 + }, + { + "epoch": 0.3509333333333333, + "grad_norm": 0.44797184733914347, + "learning_rate": 0.0001507463914206012, + "loss": 0.7084, + "step": 658 + }, + { + "epoch": 0.35146666666666665, + "grad_norm": 0.4488995057651193, + "learning_rate": 0.0001505974144285124, + "loss": 0.7712, + "step": 659 + }, + { + "epoch": 0.352, + "grad_norm": 0.5119712023325992, + "learning_rate": 0.000150448286344864, + "loss": 0.7138, + "step": 660 + }, + { + "epoch": 0.3525333333333333, + "grad_norm": 0.4294619539709814, + "learning_rate": 0.00015029900761497506, + "loss": 0.7378, + "step": 661 + }, + { + "epoch": 0.35306666666666664, + "grad_norm": 0.40191573399988767, + "learning_rate": 0.00015014957868461458, + "loss": 0.7069, + "step": 662 + }, + { + "epoch": 0.3536, + "grad_norm": 0.39045686144937136, + "learning_rate": 0.00015000000000000001, + "loss": 0.6839, + "step": 663 + }, + { + "epoch": 0.35413333333333336, + "grad_norm": 0.40775124902429194, + "learning_rate": 0.000149850272007796, + "loss": 0.7536, + "step": 664 + }, + { + "epoch": 0.3546666666666667, + "grad_norm": 0.43690198174776207, + "learning_rate": 0.00014970039515511304, + "loss": 0.7217, + "step": 665 + }, + { + "epoch": 0.3552, + "grad_norm": 0.497685578274488, + "learning_rate": 0.00014955036988950618, + "loss": 0.7582, + "step": 666 + }, + { + "epoch": 0.35573333333333335, + "grad_norm": 0.3927489114501421, + "learning_rate": 0.0001494001966589736, + "loss": 0.648, + "step": 667 + }, + { + "epoch": 0.3562666666666667, + "grad_norm": 0.3841325696674503, + "learning_rate": 0.00014924987591195547, + "loss": 0.6946, + "step": 668 + }, + { + "epoch": 0.3568, + "grad_norm": 0.3968719116520625, + "learning_rate": 0.00014909940809733222, + "loss": 0.7557, + "step": 669 + }, + { + "epoch": 0.35733333333333334, + "grad_norm": 0.4644707180066225, + "learning_rate": 0.0001489487936644237, + "loss": 0.7221, + "step": 670 + }, + { + "epoch": 0.35786666666666667, + "grad_norm": 0.47182237431179613, + "learning_rate": 0.00014879803306298736, + "loss": 0.6708, + "step": 671 + }, + { + "epoch": 0.3584, + "grad_norm": 0.37405376397766577, + "learning_rate": 0.00014864712674321734, + "loss": 0.7303, + "step": 672 + }, + { + "epoch": 0.3589333333333333, + "grad_norm": 0.38693014739459, + "learning_rate": 0.00014849607515574276, + "loss": 0.7205, + "step": 673 + }, + { + "epoch": 0.35946666666666666, + "grad_norm": 0.4141446224776969, + "learning_rate": 0.00014834487875162657, + "loss": 0.6831, + "step": 674 + }, + { + "epoch": 0.36, + "grad_norm": 0.44085268407984707, + "learning_rate": 0.00014819353798236427, + "loss": 0.6771, + "step": 675 + }, + { + "epoch": 0.3605333333333333, + "grad_norm": 0.5008038670322603, + "learning_rate": 0.00014804205329988225, + "loss": 0.7293, + "step": 676 + }, + { + "epoch": 0.36106666666666665, + "grad_norm": 0.38289060985917467, + "learning_rate": 0.00014789042515653687, + "loss": 0.6895, + "step": 677 + }, + { + "epoch": 0.3616, + "grad_norm": 0.4750523039680865, + "learning_rate": 0.00014773865400511272, + "loss": 0.7094, + "step": 678 + }, + { + "epoch": 0.3621333333333333, + "grad_norm": 0.43369605197655986, + "learning_rate": 0.00014758674029882152, + "loss": 0.7762, + "step": 679 + }, + { + "epoch": 0.3626666666666667, + "grad_norm": 0.4384513524762407, + "learning_rate": 0.00014743468449130063, + "loss": 0.7413, + "step": 680 + }, + { + "epoch": 0.3632, + "grad_norm": 0.41556593141549514, + "learning_rate": 0.00014728248703661182, + "loss": 0.7559, + "step": 681 + }, + { + "epoch": 0.36373333333333335, + "grad_norm": 0.4313667616873304, + "learning_rate": 0.00014713014838923976, + "loss": 0.6445, + "step": 682 + }, + { + "epoch": 0.3642666666666667, + "grad_norm": 0.37484474718641714, + "learning_rate": 0.00014697766900409074, + "loss": 0.6437, + "step": 683 + }, + { + "epoch": 0.3648, + "grad_norm": 0.41599837666242073, + "learning_rate": 0.00014682504933649144, + "loss": 0.7088, + "step": 684 + }, + { + "epoch": 0.36533333333333334, + "grad_norm": 0.40276103245754896, + "learning_rate": 0.0001466722898421873, + "loss": 0.6754, + "step": 685 + }, + { + "epoch": 0.3658666666666667, + "grad_norm": 0.4168388924555747, + "learning_rate": 0.0001465193909773413, + "loss": 0.7518, + "step": 686 + }, + { + "epoch": 0.3664, + "grad_norm": 0.42526591621099963, + "learning_rate": 0.00014636635319853275, + "loss": 0.6827, + "step": 687 + }, + { + "epoch": 0.36693333333333333, + "grad_norm": 0.40288589146266274, + "learning_rate": 0.00014621317696275564, + "loss": 0.7652, + "step": 688 + }, + { + "epoch": 0.36746666666666666, + "grad_norm": 0.467329947585081, + "learning_rate": 0.00014605986272741748, + "loss": 0.7806, + "step": 689 + }, + { + "epoch": 0.368, + "grad_norm": 0.3876255767858793, + "learning_rate": 0.00014590641095033787, + "loss": 0.6763, + "step": 690 + }, + { + "epoch": 0.3685333333333333, + "grad_norm": 0.4146466363686245, + "learning_rate": 0.00014575282208974702, + "loss": 0.7033, + "step": 691 + }, + { + "epoch": 0.36906666666666665, + "grad_norm": 0.4441170977868678, + "learning_rate": 0.00014559909660428468, + "loss": 0.6681, + "step": 692 + }, + { + "epoch": 0.3696, + "grad_norm": 0.3558480903224922, + "learning_rate": 0.00014544523495299842, + "loss": 0.6505, + "step": 693 + }, + { + "epoch": 0.3701333333333333, + "grad_norm": 0.41602598410389735, + "learning_rate": 0.00014529123759534255, + "loss": 0.7015, + "step": 694 + }, + { + "epoch": 0.37066666666666664, + "grad_norm": 0.3703330729862911, + "learning_rate": 0.00014513710499117647, + "loss": 0.6848, + "step": 695 + }, + { + "epoch": 0.3712, + "grad_norm": 0.4263287984363347, + "learning_rate": 0.0001449828376007636, + "loss": 0.7294, + "step": 696 + }, + { + "epoch": 0.37173333333333336, + "grad_norm": 0.44347359831237204, + "learning_rate": 0.00014482843588476974, + "loss": 0.698, + "step": 697 + }, + { + "epoch": 0.3722666666666667, + "grad_norm": 0.48781838992265036, + "learning_rate": 0.00014467390030426186, + "loss": 0.7447, + "step": 698 + }, + { + "epoch": 0.3728, + "grad_norm": 0.4609996397724964, + "learning_rate": 0.0001445192313207067, + "loss": 0.6821, + "step": 699 + }, + { + "epoch": 0.37333333333333335, + "grad_norm": 0.4713648588980301, + "learning_rate": 0.0001443644293959693, + "loss": 0.7602, + "step": 700 + }, + { + "epoch": 0.3738666666666667, + "grad_norm": 0.41196452629535585, + "learning_rate": 0.00014420949499231172, + "loss": 0.7182, + "step": 701 + }, + { + "epoch": 0.3744, + "grad_norm": 0.5632700705840292, + "learning_rate": 0.0001440544285723915, + "loss": 0.8152, + "step": 702 + }, + { + "epoch": 0.37493333333333334, + "grad_norm": 0.3998184746916645, + "learning_rate": 0.00014389923059926062, + "loss": 0.6491, + "step": 703 + }, + { + "epoch": 0.37546666666666667, + "grad_norm": 0.41509955532967097, + "learning_rate": 0.0001437439015363638, + "loss": 0.7134, + "step": 704 + }, + { + "epoch": 0.376, + "grad_norm": 0.4868921880399975, + "learning_rate": 0.00014358844184753712, + "loss": 0.731, + "step": 705 + }, + { + "epoch": 0.37653333333333333, + "grad_norm": 0.46637564860401476, + "learning_rate": 0.00014343285199700683, + "loss": 0.7037, + "step": 706 + }, + { + "epoch": 0.37706666666666666, + "grad_norm": 0.4516473152763635, + "learning_rate": 0.0001432771324493879, + "loss": 0.7053, + "step": 707 + }, + { + "epoch": 0.3776, + "grad_norm": 0.3964393979907646, + "learning_rate": 0.00014312128366968243, + "loss": 0.6835, + "step": 708 + }, + { + "epoch": 0.3781333333333333, + "grad_norm": 0.3997508055133865, + "learning_rate": 0.00014296530612327863, + "loss": 0.7097, + "step": 709 + }, + { + "epoch": 0.37866666666666665, + "grad_norm": 0.3983774071109117, + "learning_rate": 0.00014280920027594907, + "loss": 0.7137, + "step": 710 + }, + { + "epoch": 0.3792, + "grad_norm": 0.45146135429403017, + "learning_rate": 0.00014265296659384956, + "loss": 0.7395, + "step": 711 + }, + { + "epoch": 0.3797333333333333, + "grad_norm": 0.46216153520539915, + "learning_rate": 0.00014249660554351752, + "loss": 0.7127, + "step": 712 + }, + { + "epoch": 0.38026666666666664, + "grad_norm": 0.37503357112005953, + "learning_rate": 0.00014234011759187083, + "loss": 0.6553, + "step": 713 + }, + { + "epoch": 0.3808, + "grad_norm": 0.5407589874243327, + "learning_rate": 0.00014218350320620624, + "loss": 0.6839, + "step": 714 + }, + { + "epoch": 0.38133333333333336, + "grad_norm": 0.4426383635037013, + "learning_rate": 0.00014202676285419812, + "loss": 0.7556, + "step": 715 + }, + { + "epoch": 0.3818666666666667, + "grad_norm": 0.4351030781652491, + "learning_rate": 0.00014186989700389687, + "loss": 0.7125, + "step": 716 + }, + { + "epoch": 0.3824, + "grad_norm": 0.42703783138354634, + "learning_rate": 0.0001417129061237278, + "loss": 0.7657, + "step": 717 + }, + { + "epoch": 0.38293333333333335, + "grad_norm": 0.37859228984153764, + "learning_rate": 0.0001415557906824895, + "loss": 0.6771, + "step": 718 + }, + { + "epoch": 0.3834666666666667, + "grad_norm": 0.47642453052703476, + "learning_rate": 0.00014139855114935252, + "loss": 0.7171, + "step": 719 + }, + { + "epoch": 0.384, + "grad_norm": 0.4300388925618423, + "learning_rate": 0.00014124118799385796, + "loss": 0.6921, + "step": 720 + }, + { + "epoch": 0.38453333333333334, + "grad_norm": 0.4860275460776051, + "learning_rate": 0.0001410837016859161, + "loss": 0.7622, + "step": 721 + }, + { + "epoch": 0.38506666666666667, + "grad_norm": 0.37535010226201987, + "learning_rate": 0.00014092609269580496, + "loss": 0.7053, + "step": 722 + }, + { + "epoch": 0.3856, + "grad_norm": 0.3976404676748777, + "learning_rate": 0.00014076836149416887, + "loss": 0.694, + "step": 723 + }, + { + "epoch": 0.38613333333333333, + "grad_norm": 0.41977233221720256, + "learning_rate": 0.00014061050855201723, + "loss": 0.7382, + "step": 724 + }, + { + "epoch": 0.38666666666666666, + "grad_norm": 0.44837616499019745, + "learning_rate": 0.0001404525343407228, + "loss": 0.7186, + "step": 725 + }, + { + "epoch": 0.3872, + "grad_norm": 0.46263973386021645, + "learning_rate": 0.0001402944393320206, + "loss": 0.6689, + "step": 726 + }, + { + "epoch": 0.3877333333333333, + "grad_norm": 0.45182531508138, + "learning_rate": 0.00014013622399800627, + "loss": 0.7417, + "step": 727 + }, + { + "epoch": 0.38826666666666665, + "grad_norm": 0.39492668083723853, + "learning_rate": 0.00013997788881113489, + "loss": 0.7215, + "step": 728 + }, + { + "epoch": 0.3888, + "grad_norm": 0.4523013919145976, + "learning_rate": 0.00013981943424421932, + "loss": 0.7684, + "step": 729 + }, + { + "epoch": 0.3893333333333333, + "grad_norm": 0.4381382220440621, + "learning_rate": 0.0001396608607704289, + "loss": 0.7339, + "step": 730 + }, + { + "epoch": 0.38986666666666664, + "grad_norm": 0.3970523596266796, + "learning_rate": 0.0001395021688632882, + "loss": 0.7418, + "step": 731 + }, + { + "epoch": 0.3904, + "grad_norm": 0.4347272847239781, + "learning_rate": 0.00013934335899667527, + "loss": 0.7787, + "step": 732 + }, + { + "epoch": 0.39093333333333335, + "grad_norm": 0.4130918424648687, + "learning_rate": 0.00013918443164482046, + "loss": 0.7127, + "step": 733 + }, + { + "epoch": 0.3914666666666667, + "grad_norm": 0.44002182055166045, + "learning_rate": 0.000139025387282305, + "loss": 0.7232, + "step": 734 + }, + { + "epoch": 0.392, + "grad_norm": 0.49877895452345106, + "learning_rate": 0.00013886622638405952, + "loss": 0.7238, + "step": 735 + }, + { + "epoch": 0.39253333333333335, + "grad_norm": 0.39406534474061694, + "learning_rate": 0.0001387069494253626, + "loss": 0.6675, + "step": 736 + }, + { + "epoch": 0.3930666666666667, + "grad_norm": 0.45754995204049975, + "learning_rate": 0.0001385475568818394, + "loss": 0.6866, + "step": 737 + }, + { + "epoch": 0.3936, + "grad_norm": 0.40723032246884827, + "learning_rate": 0.00013838804922946027, + "loss": 0.727, + "step": 738 + }, + { + "epoch": 0.39413333333333334, + "grad_norm": 0.4959940655631805, + "learning_rate": 0.00013822842694453924, + "loss": 0.7722, + "step": 739 + }, + { + "epoch": 0.39466666666666667, + "grad_norm": 0.5060320422283979, + "learning_rate": 0.0001380686905037327, + "loss": 0.7185, + "step": 740 + }, + { + "epoch": 0.3952, + "grad_norm": 0.4382737101204542, + "learning_rate": 0.00013790884038403795, + "loss": 0.7016, + "step": 741 + }, + { + "epoch": 0.3957333333333333, + "grad_norm": 0.4632557011709129, + "learning_rate": 0.00013774887706279165, + "loss": 0.717, + "step": 742 + }, + { + "epoch": 0.39626666666666666, + "grad_norm": 0.37928916344341357, + "learning_rate": 0.0001375888010176686, + "loss": 0.7077, + "step": 743 + }, + { + "epoch": 0.3968, + "grad_norm": 0.4988609159170412, + "learning_rate": 0.00013742861272668012, + "loss": 0.713, + "step": 744 + }, + { + "epoch": 0.3973333333333333, + "grad_norm": 0.5035465539807298, + "learning_rate": 0.00013726831266817278, + "loss": 0.7755, + "step": 745 + }, + { + "epoch": 0.39786666666666665, + "grad_norm": 0.4243815881999628, + "learning_rate": 0.00013710790132082692, + "loss": 0.7035, + "step": 746 + }, + { + "epoch": 0.3984, + "grad_norm": 0.40511097729163625, + "learning_rate": 0.00013694737916365517, + "loss": 0.6997, + "step": 747 + }, + { + "epoch": 0.3989333333333333, + "grad_norm": 0.34732451466396097, + "learning_rate": 0.00013678674667600102, + "loss": 0.5713, + "step": 748 + }, + { + "epoch": 0.3994666666666667, + "grad_norm": 0.4239987844039931, + "learning_rate": 0.00013662600433753745, + "loss": 0.6592, + "step": 749 + }, + { + "epoch": 0.4, + "grad_norm": 0.43841975631729396, + "learning_rate": 0.00013646515262826552, + "loss": 0.7232, + "step": 750 + }, + { + "epoch": 0.40053333333333335, + "grad_norm": 0.42246102198299307, + "learning_rate": 0.00013630419202851284, + "loss": 0.6888, + "step": 751 + }, + { + "epoch": 0.4010666666666667, + "grad_norm": 0.5032841321184606, + "learning_rate": 0.00013614312301893223, + "loss": 0.7561, + "step": 752 + }, + { + "epoch": 0.4016, + "grad_norm": 0.4905327312132084, + "learning_rate": 0.0001359819460805001, + "loss": 0.7223, + "step": 753 + }, + { + "epoch": 0.40213333333333334, + "grad_norm": 0.43946961949901897, + "learning_rate": 0.00013582066169451535, + "loss": 0.7665, + "step": 754 + }, + { + "epoch": 0.4026666666666667, + "grad_norm": 0.4402067407496789, + "learning_rate": 0.0001356592703425976, + "loss": 0.7159, + "step": 755 + }, + { + "epoch": 0.4032, + "grad_norm": 0.47176887532052597, + "learning_rate": 0.0001354977725066859, + "loss": 0.7249, + "step": 756 + }, + { + "epoch": 0.40373333333333333, + "grad_norm": 0.4476564253342013, + "learning_rate": 0.00013533616866903735, + "loss": 0.7531, + "step": 757 + }, + { + "epoch": 0.40426666666666666, + "grad_norm": 0.46488437306735336, + "learning_rate": 0.0001351744593122255, + "loss": 0.7542, + "step": 758 + }, + { + "epoch": 0.4048, + "grad_norm": 0.44826828994452866, + "learning_rate": 0.00013501264491913906, + "loss": 0.7475, + "step": 759 + }, + { + "epoch": 0.4053333333333333, + "grad_norm": 0.4392302673067083, + "learning_rate": 0.00013485072597298038, + "loss": 0.7496, + "step": 760 + }, + { + "epoch": 0.40586666666666665, + "grad_norm": 0.4284828258125813, + "learning_rate": 0.00013468870295726398, + "loss": 0.7599, + "step": 761 + }, + { + "epoch": 0.4064, + "grad_norm": 0.39250654426022963, + "learning_rate": 0.0001345265763558152, + "loss": 0.7007, + "step": 762 + }, + { + "epoch": 0.4069333333333333, + "grad_norm": 0.3734484227669392, + "learning_rate": 0.00013436434665276865, + "loss": 0.631, + "step": 763 + }, + { + "epoch": 0.40746666666666664, + "grad_norm": 0.4854947563926426, + "learning_rate": 0.00013420201433256689, + "loss": 0.7066, + "step": 764 + }, + { + "epoch": 0.408, + "grad_norm": 0.4055127583755472, + "learning_rate": 0.00013403957987995882, + "loss": 0.6928, + "step": 765 + }, + { + "epoch": 0.40853333333333336, + "grad_norm": 0.45396074749692356, + "learning_rate": 0.00013387704377999842, + "loss": 0.7343, + "step": 766 + }, + { + "epoch": 0.4090666666666667, + "grad_norm": 0.4296512549915837, + "learning_rate": 0.00013371440651804313, + "loss": 0.6739, + "step": 767 + }, + { + "epoch": 0.4096, + "grad_norm": 0.4711535732387848, + "learning_rate": 0.0001335516685797525, + "loss": 0.693, + "step": 768 + }, + { + "epoch": 0.41013333333333335, + "grad_norm": 0.4267546928980294, + "learning_rate": 0.00013338883045108674, + "loss": 0.6887, + "step": 769 + }, + { + "epoch": 0.4106666666666667, + "grad_norm": 0.4739593317052299, + "learning_rate": 0.00013322589261830517, + "loss": 0.7671, + "step": 770 + }, + { + "epoch": 0.4112, + "grad_norm": 0.429430590107906, + "learning_rate": 0.00013306285556796495, + "loss": 0.7383, + "step": 771 + }, + { + "epoch": 0.41173333333333334, + "grad_norm": 0.42881367720701874, + "learning_rate": 0.0001328997197869194, + "loss": 0.7119, + "step": 772 + }, + { + "epoch": 0.41226666666666667, + "grad_norm": 0.4615298357246681, + "learning_rate": 0.0001327364857623168, + "loss": 0.7062, + "step": 773 + }, + { + "epoch": 0.4128, + "grad_norm": 0.467141491830004, + "learning_rate": 0.00013257315398159864, + "loss": 0.7164, + "step": 774 + }, + { + "epoch": 0.41333333333333333, + "grad_norm": 0.5202135770227491, + "learning_rate": 0.00013240972493249847, + "loss": 0.7071, + "step": 775 + }, + { + "epoch": 0.41386666666666666, + "grad_norm": 0.4475869027765437, + "learning_rate": 0.0001322461991030402, + "loss": 0.6947, + "step": 776 + }, + { + "epoch": 0.4144, + "grad_norm": 0.44295597600268727, + "learning_rate": 0.00013208257698153677, + "loss": 0.7058, + "step": 777 + }, + { + "epoch": 0.4149333333333333, + "grad_norm": 0.47176094451362777, + "learning_rate": 0.00013191885905658872, + "loss": 0.7165, + "step": 778 + }, + { + "epoch": 0.41546666666666665, + "grad_norm": 0.43358146949114096, + "learning_rate": 0.0001317550458170826, + "loss": 0.7151, + "step": 779 + }, + { + "epoch": 0.416, + "grad_norm": 0.4412211134754077, + "learning_rate": 0.00013159113775218964, + "loss": 0.7551, + "step": 780 + }, + { + "epoch": 0.4165333333333333, + "grad_norm": 0.4410286514374678, + "learning_rate": 0.00013142713535136414, + "loss": 0.6723, + "step": 781 + }, + { + "epoch": 0.41706666666666664, + "grad_norm": 0.4216559262181823, + "learning_rate": 0.00013126303910434214, + "loss": 0.7483, + "step": 782 + }, + { + "epoch": 0.4176, + "grad_norm": 0.3940472095500932, + "learning_rate": 0.00013109884950114007, + "loss": 0.7078, + "step": 783 + }, + { + "epoch": 0.41813333333333336, + "grad_norm": 0.4027435998274283, + "learning_rate": 0.00013093456703205288, + "loss": 0.661, + "step": 784 + }, + { + "epoch": 0.4186666666666667, + "grad_norm": 0.3969104111607289, + "learning_rate": 0.00013077019218765305, + "loss": 0.6883, + "step": 785 + }, + { + "epoch": 0.4192, + "grad_norm": 0.4824453554696596, + "learning_rate": 0.00013060572545878875, + "loss": 0.7817, + "step": 786 + }, + { + "epoch": 0.41973333333333335, + "grad_norm": 0.44449554922837264, + "learning_rate": 0.0001304411673365826, + "loss": 0.7433, + "step": 787 + }, + { + "epoch": 0.4202666666666667, + "grad_norm": 0.43347524170875906, + "learning_rate": 0.0001302765183124302, + "loss": 0.7109, + "step": 788 + }, + { + "epoch": 0.4208, + "grad_norm": 0.39878441525322905, + "learning_rate": 0.00013011177887799845, + "loss": 0.6622, + "step": 789 + }, + { + "epoch": 0.42133333333333334, + "grad_norm": 0.4463980260611668, + "learning_rate": 0.00012994694952522435, + "loss": 0.745, + "step": 790 + }, + { + "epoch": 0.42186666666666667, + "grad_norm": 0.47312945243201604, + "learning_rate": 0.00012978203074631334, + "loss": 0.7728, + "step": 791 + }, + { + "epoch": 0.4224, + "grad_norm": 0.41318675822793455, + "learning_rate": 0.00012961702303373795, + "loss": 0.6108, + "step": 792 + }, + { + "epoch": 0.42293333333333333, + "grad_norm": 0.46171742829872736, + "learning_rate": 0.00012945192688023624, + "loss": 0.7379, + "step": 793 + }, + { + "epoch": 0.42346666666666666, + "grad_norm": 0.44500590778977384, + "learning_rate": 0.0001292867427788104, + "loss": 0.6901, + "step": 794 + }, + { + "epoch": 0.424, + "grad_norm": 0.37258176457460235, + "learning_rate": 0.00012912147122272523, + "loss": 0.6797, + "step": 795 + }, + { + "epoch": 0.4245333333333333, + "grad_norm": 0.4277090342743537, + "learning_rate": 0.00012895611270550666, + "loss": 0.7537, + "step": 796 + }, + { + "epoch": 0.42506666666666665, + "grad_norm": 0.41455736640289276, + "learning_rate": 0.0001287906677209403, + "loss": 0.7556, + "step": 797 + }, + { + "epoch": 0.4256, + "grad_norm": 0.3878051589700941, + "learning_rate": 0.00012862513676307008, + "loss": 0.6958, + "step": 798 + }, + { + "epoch": 0.4261333333333333, + "grad_norm": 0.4371718185766127, + "learning_rate": 0.0001284595203261965, + "loss": 0.7673, + "step": 799 + }, + { + "epoch": 0.4266666666666667, + "grad_norm": 0.4416283994462147, + "learning_rate": 0.00012829381890487536, + "loss": 0.6506, + "step": 800 + }, + { + "epoch": 0.4272, + "grad_norm": 0.3904046349843101, + "learning_rate": 0.00012812803299391628, + "loss": 0.7024, + "step": 801 + }, + { + "epoch": 0.42773333333333335, + "grad_norm": 0.3627789630617062, + "learning_rate": 0.00012796216308838117, + "loss": 0.6675, + "step": 802 + }, + { + "epoch": 0.4282666666666667, + "grad_norm": 0.46654878719546106, + "learning_rate": 0.00012779620968358273, + "loss": 0.7709, + "step": 803 + }, + { + "epoch": 0.4288, + "grad_norm": 0.38495662223781735, + "learning_rate": 0.00012763017327508305, + "loss": 0.6598, + "step": 804 + }, + { + "epoch": 0.42933333333333334, + "grad_norm": 0.38161629022718274, + "learning_rate": 0.00012746405435869198, + "loss": 0.7313, + "step": 805 + }, + { + "epoch": 0.4298666666666667, + "grad_norm": 0.39429952551911845, + "learning_rate": 0.00012729785343046588, + "loss": 0.6631, + "step": 806 + }, + { + "epoch": 0.4304, + "grad_norm": 0.443184825535912, + "learning_rate": 0.0001271315709867059, + "loss": 0.7093, + "step": 807 + }, + { + "epoch": 0.43093333333333333, + "grad_norm": 0.4839129942053279, + "learning_rate": 0.00012696520752395672, + "loss": 0.7042, + "step": 808 + }, + { + "epoch": 0.43146666666666667, + "grad_norm": 0.41334895955463824, + "learning_rate": 0.00012679876353900482, + "loss": 0.7246, + "step": 809 + }, + { + "epoch": 0.432, + "grad_norm": 0.4863180689827426, + "learning_rate": 0.00012663223952887723, + "loss": 0.7449, + "step": 810 + }, + { + "epoch": 0.4325333333333333, + "grad_norm": 0.3820454923774122, + "learning_rate": 0.00012646563599083996, + "loss": 0.6725, + "step": 811 + }, + { + "epoch": 0.43306666666666666, + "grad_norm": 0.4220214371529139, + "learning_rate": 0.00012629895342239643, + "loss": 0.7403, + "step": 812 + }, + { + "epoch": 0.4336, + "grad_norm": 0.41454752364842296, + "learning_rate": 0.00012613219232128608, + "loss": 0.7249, + "step": 813 + }, + { + "epoch": 0.4341333333333333, + "grad_norm": 0.42604154192156635, + "learning_rate": 0.00012596535318548289, + "loss": 0.7378, + "step": 814 + }, + { + "epoch": 0.43466666666666665, + "grad_norm": 0.43228897217996826, + "learning_rate": 0.0001257984365131938, + "loss": 0.7029, + "step": 815 + }, + { + "epoch": 0.4352, + "grad_norm": 0.40739908117620516, + "learning_rate": 0.00012563144280285741, + "loss": 0.6814, + "step": 816 + }, + { + "epoch": 0.4357333333333333, + "grad_norm": 0.4634356318463634, + "learning_rate": 0.00012546437255314222, + "loss": 0.7377, + "step": 817 + }, + { + "epoch": 0.4362666666666667, + "grad_norm": 0.5959063980403425, + "learning_rate": 0.0001252972262629454, + "loss": 0.8282, + "step": 818 + }, + { + "epoch": 0.4368, + "grad_norm": 0.3897472418811074, + "learning_rate": 0.00012513000443139112, + "loss": 0.6303, + "step": 819 + }, + { + "epoch": 0.43733333333333335, + "grad_norm": 0.3612063612791084, + "learning_rate": 0.00012496270755782914, + "loss": 0.6497, + "step": 820 + }, + { + "epoch": 0.4378666666666667, + "grad_norm": 0.36916970218865225, + "learning_rate": 0.00012479533614183334, + "loss": 0.7053, + "step": 821 + }, + { + "epoch": 0.4384, + "grad_norm": 0.40208731910393475, + "learning_rate": 0.00012462789068320017, + "loss": 0.6698, + "step": 822 + }, + { + "epoch": 0.43893333333333334, + "grad_norm": 0.42344557694377094, + "learning_rate": 0.00012446037168194714, + "loss": 0.6714, + "step": 823 + }, + { + "epoch": 0.43946666666666667, + "grad_norm": 0.41985601136707557, + "learning_rate": 0.00012429277963831148, + "loss": 0.6884, + "step": 824 + }, + { + "epoch": 0.44, + "grad_norm": 0.43303368899400574, + "learning_rate": 0.00012412511505274844, + "loss": 0.6791, + "step": 825 + }, + { + "epoch": 0.44053333333333333, + "grad_norm": 0.4120029133921381, + "learning_rate": 0.00012395737842592995, + "loss": 0.7271, + "step": 826 + }, + { + "epoch": 0.44106666666666666, + "grad_norm": 0.3746594000082755, + "learning_rate": 0.000123789570258743, + "loss": 0.6436, + "step": 827 + }, + { + "epoch": 0.4416, + "grad_norm": 0.4039068035463203, + "learning_rate": 0.00012362169105228826, + "loss": 0.7029, + "step": 828 + }, + { + "epoch": 0.4421333333333333, + "grad_norm": 0.41184612698087747, + "learning_rate": 0.00012345374130787854, + "loss": 0.7111, + "step": 829 + }, + { + "epoch": 0.44266666666666665, + "grad_norm": 0.38301489631977276, + "learning_rate": 0.00012328572152703725, + "loss": 0.6534, + "step": 830 + }, + { + "epoch": 0.4432, + "grad_norm": 0.37525032785125834, + "learning_rate": 0.000123117632211497, + "loss": 0.6434, + "step": 831 + }, + { + "epoch": 0.4437333333333333, + "grad_norm": 0.456722632978626, + "learning_rate": 0.00012294947386319794, + "loss": 0.7518, + "step": 832 + }, + { + "epoch": 0.44426666666666664, + "grad_norm": 0.47633067924491773, + "learning_rate": 0.0001227812469842864, + "loss": 0.7149, + "step": 833 + }, + { + "epoch": 0.4448, + "grad_norm": 0.465979164027581, + "learning_rate": 0.00012261295207711346, + "loss": 0.7662, + "step": 834 + }, + { + "epoch": 0.44533333333333336, + "grad_norm": 0.4091999980397747, + "learning_rate": 0.00012244458964423327, + "loss": 0.6956, + "step": 835 + }, + { + "epoch": 0.4458666666666667, + "grad_norm": 0.4635916658556116, + "learning_rate": 0.00012227616018840154, + "loss": 0.7667, + "step": 836 + }, + { + "epoch": 0.4464, + "grad_norm": 0.3658341568913632, + "learning_rate": 0.0001221076642125742, + "loss": 0.6953, + "step": 837 + }, + { + "epoch": 0.44693333333333335, + "grad_norm": 0.37921643901310315, + "learning_rate": 0.00012193910221990581, + "loss": 0.6237, + "step": 838 + }, + { + "epoch": 0.4474666666666667, + "grad_norm": 0.4131134541607325, + "learning_rate": 0.00012177047471374807, + "loss": 0.6657, + "step": 839 + }, + { + "epoch": 0.448, + "grad_norm": 0.4239805190357618, + "learning_rate": 0.00012160178219764837, + "loss": 0.7111, + "step": 840 + }, + { + "epoch": 0.44853333333333334, + "grad_norm": 0.462761873466164, + "learning_rate": 0.0001214330251753481, + "loss": 0.707, + "step": 841 + }, + { + "epoch": 0.44906666666666667, + "grad_norm": 0.4022517047204756, + "learning_rate": 0.00012126420415078132, + "loss": 0.6648, + "step": 842 + }, + { + "epoch": 0.4496, + "grad_norm": 0.43799279337032937, + "learning_rate": 0.00012109531962807332, + "loss": 0.6855, + "step": 843 + }, + { + "epoch": 0.45013333333333333, + "grad_norm": 0.4251834274788315, + "learning_rate": 0.00012092637211153885, + "loss": 0.7075, + "step": 844 + }, + { + "epoch": 0.45066666666666666, + "grad_norm": 0.41225338694188035, + "learning_rate": 0.0001207573621056809, + "loss": 0.6909, + "step": 845 + }, + { + "epoch": 0.4512, + "grad_norm": 0.4070184019232702, + "learning_rate": 0.00012058829011518896, + "loss": 0.6525, + "step": 846 + }, + { + "epoch": 0.4517333333333333, + "grad_norm": 0.41649416449245563, + "learning_rate": 0.00012041915664493761, + "loss": 0.7291, + "step": 847 + }, + { + "epoch": 0.45226666666666665, + "grad_norm": 0.4546324691909841, + "learning_rate": 0.00012024996219998517, + "loss": 0.7653, + "step": 848 + }, + { + "epoch": 0.4528, + "grad_norm": 0.45141669122155437, + "learning_rate": 0.00012008070728557186, + "loss": 0.7051, + "step": 849 + }, + { + "epoch": 0.4533333333333333, + "grad_norm": 0.5444018336338412, + "learning_rate": 0.00011991139240711857, + "loss": 0.7419, + "step": 850 + }, + { + "epoch": 0.45386666666666664, + "grad_norm": 0.43065778637280483, + "learning_rate": 0.00011974201807022525, + "loss": 0.7176, + "step": 851 + }, + { + "epoch": 0.4544, + "grad_norm": 0.38167947206238456, + "learning_rate": 0.00011957258478066931, + "loss": 0.6412, + "step": 852 + }, + { + "epoch": 0.45493333333333336, + "grad_norm": 0.4505920393626082, + "learning_rate": 0.00011940309304440433, + "loss": 0.685, + "step": 853 + }, + { + "epoch": 0.4554666666666667, + "grad_norm": 0.43960240840849185, + "learning_rate": 0.00011923354336755835, + "loss": 0.7047, + "step": 854 + }, + { + "epoch": 0.456, + "grad_norm": 0.39407494617000993, + "learning_rate": 0.00011906393625643244, + "loss": 0.6705, + "step": 855 + }, + { + "epoch": 0.45653333333333335, + "grad_norm": 0.4114040227268409, + "learning_rate": 0.00011889427221749916, + "loss": 0.6595, + "step": 856 + }, + { + "epoch": 0.4570666666666667, + "grad_norm": 0.4612087998564698, + "learning_rate": 0.00011872455175740112, + "loss": 0.7243, + "step": 857 + }, + { + "epoch": 0.4576, + "grad_norm": 0.43932608382970695, + "learning_rate": 0.00011855477538294935, + "loss": 0.7436, + "step": 858 + }, + { + "epoch": 0.45813333333333334, + "grad_norm": 0.37415790553810785, + "learning_rate": 0.00011838494360112185, + "loss": 0.6556, + "step": 859 + }, + { + "epoch": 0.45866666666666667, + "grad_norm": 0.4504569090557854, + "learning_rate": 0.00011821505691906216, + "loss": 0.6982, + "step": 860 + }, + { + "epoch": 0.4592, + "grad_norm": 0.44034490188774617, + "learning_rate": 0.00011804511584407763, + "loss": 0.6777, + "step": 861 + }, + { + "epoch": 0.4597333333333333, + "grad_norm": 0.4736985341131689, + "learning_rate": 0.00011787512088363817, + "loss": 0.6765, + "step": 862 + }, + { + "epoch": 0.46026666666666666, + "grad_norm": 0.4677624856624705, + "learning_rate": 0.00011770507254537453, + "loss": 0.7287, + "step": 863 + }, + { + "epoch": 0.4608, + "grad_norm": 0.4395533085154452, + "learning_rate": 0.00011753497133707679, + "loss": 0.696, + "step": 864 + }, + { + "epoch": 0.4613333333333333, + "grad_norm": 0.39550018227943234, + "learning_rate": 0.00011736481776669306, + "loss": 0.6881, + "step": 865 + }, + { + "epoch": 0.46186666666666665, + "grad_norm": 0.47395930050205876, + "learning_rate": 0.00011719461234232764, + "loss": 0.7316, + "step": 866 + }, + { + "epoch": 0.4624, + "grad_norm": 0.39765873650882605, + "learning_rate": 0.00011702435557223987, + "loss": 0.6956, + "step": 867 + }, + { + "epoch": 0.4629333333333333, + "grad_norm": 0.4891652376554147, + "learning_rate": 0.00011685404796484225, + "loss": 0.7982, + "step": 868 + }, + { + "epoch": 0.4634666666666667, + "grad_norm": 0.4232612644035031, + "learning_rate": 0.00011668369002869912, + "loss": 0.6723, + "step": 869 + }, + { + "epoch": 0.464, + "grad_norm": 0.38762156442078516, + "learning_rate": 0.00011651328227252517, + "loss": 0.6667, + "step": 870 + }, + { + "epoch": 0.46453333333333335, + "grad_norm": 0.4909127349481194, + "learning_rate": 0.00011634282520518383, + "loss": 0.742, + "step": 871 + }, + { + "epoch": 0.4650666666666667, + "grad_norm": 0.41671132795123844, + "learning_rate": 0.00011617231933568578, + "loss": 0.7225, + "step": 872 + }, + { + "epoch": 0.4656, + "grad_norm": 0.4856652200088908, + "learning_rate": 0.00011600176517318741, + "loss": 0.7869, + "step": 873 + }, + { + "epoch": 0.46613333333333334, + "grad_norm": 0.5121480118222634, + "learning_rate": 0.00011583116322698935, + "loss": 0.7202, + "step": 874 + }, + { + "epoch": 0.4666666666666667, + "grad_norm": 0.40668954918811323, + "learning_rate": 0.00011566051400653486, + "loss": 0.7713, + "step": 875 + }, + { + "epoch": 0.4672, + "grad_norm": 0.37095807153206234, + "learning_rate": 0.00011548981802140848, + "loss": 0.6935, + "step": 876 + }, + { + "epoch": 0.46773333333333333, + "grad_norm": 0.45446232774625944, + "learning_rate": 0.00011531907578133429, + "loss": 0.7621, + "step": 877 + }, + { + "epoch": 0.46826666666666666, + "grad_norm": 0.3887007782251961, + "learning_rate": 0.00011514828779617459, + "loss": 0.7416, + "step": 878 + }, + { + "epoch": 0.4688, + "grad_norm": 0.45157365228556273, + "learning_rate": 0.00011497745457592816, + "loss": 0.6708, + "step": 879 + }, + { + "epoch": 0.4693333333333333, + "grad_norm": 0.4032785843368036, + "learning_rate": 0.00011480657663072896, + "loss": 0.6955, + "step": 880 + }, + { + "epoch": 0.46986666666666665, + "grad_norm": 0.40866998088501677, + "learning_rate": 0.00011463565447084445, + "loss": 0.6839, + "step": 881 + }, + { + "epoch": 0.4704, + "grad_norm": 0.3827212503725845, + "learning_rate": 0.00011446468860667421, + "loss": 0.6638, + "step": 882 + }, + { + "epoch": 0.4709333333333333, + "grad_norm": 0.423243115774533, + "learning_rate": 0.00011429367954874819, + "loss": 0.657, + "step": 883 + }, + { + "epoch": 0.47146666666666665, + "grad_norm": 0.40384524585114895, + "learning_rate": 0.0001141226278077254, + "loss": 0.6638, + "step": 884 + }, + { + "epoch": 0.472, + "grad_norm": 0.39767400585791324, + "learning_rate": 0.00011395153389439233, + "loss": 0.7069, + "step": 885 + }, + { + "epoch": 0.47253333333333336, + "grad_norm": 0.4573851491431221, + "learning_rate": 0.00011378039831966134, + "loss": 0.6972, + "step": 886 + }, + { + "epoch": 0.4730666666666667, + "grad_norm": 0.46646520145860276, + "learning_rate": 0.00011360922159456928, + "loss": 0.7123, + "step": 887 + }, + { + "epoch": 0.4736, + "grad_norm": 0.3730865894803738, + "learning_rate": 0.00011343800423027582, + "loss": 0.7056, + "step": 888 + }, + { + "epoch": 0.47413333333333335, + "grad_norm": 0.3910630492654369, + "learning_rate": 0.00011326674673806195, + "loss": 0.6066, + "step": 889 + }, + { + "epoch": 0.4746666666666667, + "grad_norm": 0.4278308966101939, + "learning_rate": 0.00011309544962932862, + "loss": 0.7611, + "step": 890 + }, + { + "epoch": 0.4752, + "grad_norm": 0.42457588753017367, + "learning_rate": 0.0001129241134155949, + "loss": 0.6762, + "step": 891 + }, + { + "epoch": 0.47573333333333334, + "grad_norm": 0.42832922115693706, + "learning_rate": 0.00011275273860849684, + "loss": 0.6467, + "step": 892 + }, + { + "epoch": 0.47626666666666667, + "grad_norm": 0.464530242798856, + "learning_rate": 0.00011258132571978555, + "loss": 0.6688, + "step": 893 + }, + { + "epoch": 0.4768, + "grad_norm": 0.38472469459638076, + "learning_rate": 0.00011240987526132594, + "loss": 0.6841, + "step": 894 + }, + { + "epoch": 0.47733333333333333, + "grad_norm": 0.38849071920916217, + "learning_rate": 0.00011223838774509514, + "loss": 0.6736, + "step": 895 + }, + { + "epoch": 0.47786666666666666, + "grad_norm": 0.44907458299897185, + "learning_rate": 0.00011206686368318086, + "loss": 0.6982, + "step": 896 + }, + { + "epoch": 0.4784, + "grad_norm": 0.4565938291447505, + "learning_rate": 0.00011189530358778005, + "loss": 0.6869, + "step": 897 + }, + { + "epoch": 0.4789333333333333, + "grad_norm": 0.3793537012311367, + "learning_rate": 0.00011172370797119712, + "loss": 0.7202, + "step": 898 + }, + { + "epoch": 0.47946666666666665, + "grad_norm": 0.42887604556878534, + "learning_rate": 0.00011155207734584263, + "loss": 0.7523, + "step": 899 + }, + { + "epoch": 0.48, + "grad_norm": 0.46717937772712853, + "learning_rate": 0.00011138041222423177, + "loss": 0.7172, + "step": 900 + }, + { + "epoch": 0.4805333333333333, + "grad_norm": 0.3764414926422453, + "learning_rate": 0.00011120871311898254, + "loss": 0.6331, + "step": 901 + }, + { + "epoch": 0.48106666666666664, + "grad_norm": 0.4429214812780887, + "learning_rate": 0.0001110369805428146, + "loss": 0.6603, + "step": 902 + }, + { + "epoch": 0.4816, + "grad_norm": 0.41987417589997855, + "learning_rate": 0.00011086521500854745, + "loss": 0.7133, + "step": 903 + }, + { + "epoch": 0.48213333333333336, + "grad_norm": 0.5210846787709746, + "learning_rate": 0.0001106934170290991, + "loss": 0.7689, + "step": 904 + }, + { + "epoch": 0.4826666666666667, + "grad_norm": 0.4141037873790039, + "learning_rate": 0.00011052158711748434, + "loss": 0.6461, + "step": 905 + }, + { + "epoch": 0.4832, + "grad_norm": 0.4782885729084308, + "learning_rate": 0.00011034972578681338, + "loss": 0.7443, + "step": 906 + }, + { + "epoch": 0.48373333333333335, + "grad_norm": 0.40144968575883505, + "learning_rate": 0.00011017783355029026, + "loss": 0.7076, + "step": 907 + }, + { + "epoch": 0.4842666666666667, + "grad_norm": 0.4024462129348643, + "learning_rate": 0.00011000591092121127, + "loss": 0.6517, + "step": 908 + }, + { + "epoch": 0.4848, + "grad_norm": 0.46684997838951175, + "learning_rate": 0.00010983395841296348, + "loss": 0.6774, + "step": 909 + }, + { + "epoch": 0.48533333333333334, + "grad_norm": 0.4319143202542929, + "learning_rate": 0.0001096619765390232, + "loss": 0.7165, + "step": 910 + }, + { + "epoch": 0.48586666666666667, + "grad_norm": 0.3914237765407882, + "learning_rate": 0.00010948996581295436, + "loss": 0.705, + "step": 911 + }, + { + "epoch": 0.4864, + "grad_norm": 0.3916173259933699, + "learning_rate": 0.00010931792674840718, + "loss": 0.6757, + "step": 912 + }, + { + "epoch": 0.48693333333333333, + "grad_norm": 0.39956961393530177, + "learning_rate": 0.00010914585985911632, + "loss": 0.7288, + "step": 913 + }, + { + "epoch": 0.48746666666666666, + "grad_norm": 0.3880763668296263, + "learning_rate": 0.00010897376565889971, + "loss": 0.7006, + "step": 914 + }, + { + "epoch": 0.488, + "grad_norm": 0.38121867310237195, + "learning_rate": 0.00010880164466165674, + "loss": 0.6636, + "step": 915 + }, + { + "epoch": 0.4885333333333333, + "grad_norm": 0.4311920380139067, + "learning_rate": 0.00010862949738136681, + "loss": 0.6683, + "step": 916 + }, + { + "epoch": 0.48906666666666665, + "grad_norm": 0.3846473660057849, + "learning_rate": 0.00010845732433208779, + "loss": 0.6299, + "step": 917 + }, + { + "epoch": 0.4896, + "grad_norm": 0.3843852749197285, + "learning_rate": 0.00010828512602795462, + "loss": 0.5507, + "step": 918 + }, + { + "epoch": 0.4901333333333333, + "grad_norm": 0.4099112208585509, + "learning_rate": 0.00010811290298317755, + "loss": 0.6898, + "step": 919 + }, + { + "epoch": 0.49066666666666664, + "grad_norm": 0.4239536693345123, + "learning_rate": 0.00010794065571204072, + "loss": 0.6707, + "step": 920 + }, + { + "epoch": 0.4912, + "grad_norm": 0.39020478363358896, + "learning_rate": 0.00010776838472890065, + "loss": 0.6282, + "step": 921 + }, + { + "epoch": 0.49173333333333336, + "grad_norm": 0.4138631017383004, + "learning_rate": 0.00010759609054818458, + "loss": 0.6705, + "step": 922 + }, + { + "epoch": 0.4922666666666667, + "grad_norm": 0.39382180684827073, + "learning_rate": 0.00010742377368438914, + "loss": 0.6753, + "step": 923 + }, + { + "epoch": 0.4928, + "grad_norm": 0.41063973948385085, + "learning_rate": 0.00010725143465207867, + "loss": 0.7252, + "step": 924 + }, + { + "epoch": 0.49333333333333335, + "grad_norm": 0.45590420113087754, + "learning_rate": 0.00010707907396588361, + "loss": 0.7075, + "step": 925 + }, + { + "epoch": 0.4938666666666667, + "grad_norm": 0.479092633471132, + "learning_rate": 0.0001069066921404992, + "loss": 0.756, + "step": 926 + }, + { + "epoch": 0.4944, + "grad_norm": 0.40947527248349513, + "learning_rate": 0.00010673428969068364, + "loss": 0.736, + "step": 927 + }, + { + "epoch": 0.49493333333333334, + "grad_norm": 0.41755992091951427, + "learning_rate": 0.00010656186713125689, + "loss": 0.6764, + "step": 928 + }, + { + "epoch": 0.49546666666666667, + "grad_norm": 0.5008756508436008, + "learning_rate": 0.0001063894249770989, + "loss": 0.7461, + "step": 929 + }, + { + "epoch": 0.496, + "grad_norm": 0.4016638669500053, + "learning_rate": 0.00010621696374314807, + "loss": 0.6632, + "step": 930 + }, + { + "epoch": 0.4965333333333333, + "grad_norm": 0.4324061289766408, + "learning_rate": 0.00010604448394439983, + "loss": 0.6477, + "step": 931 + }, + { + "epoch": 0.49706666666666666, + "grad_norm": 0.4412168422297335, + "learning_rate": 0.00010587198609590505, + "loss": 0.741, + "step": 932 + }, + { + "epoch": 0.4976, + "grad_norm": 0.3984762587813605, + "learning_rate": 0.00010569947071276847, + "loss": 0.6679, + "step": 933 + }, + { + "epoch": 0.4981333333333333, + "grad_norm": 0.449656501907126, + "learning_rate": 0.00010552693831014726, + "loss": 0.6845, + "step": 934 + }, + { + "epoch": 0.49866666666666665, + "grad_norm": 0.3764081526800237, + "learning_rate": 0.0001053543894032493, + "loss": 0.6395, + "step": 935 + }, + { + "epoch": 0.4992, + "grad_norm": 0.4255182047758748, + "learning_rate": 0.00010518182450733186, + "loss": 0.7093, + "step": 936 + }, + { + "epoch": 0.4997333333333333, + "grad_norm": 0.3999003305188967, + "learning_rate": 0.00010500924413769988, + "loss": 0.6587, + "step": 937 + }, + { + "epoch": 0.5002666666666666, + "grad_norm": 0.4807963299194944, + "learning_rate": 0.00010483664880970457, + "loss": 0.7051, + "step": 938 + }, + { + "epoch": 0.5008, + "grad_norm": 0.3845495646724184, + "learning_rate": 0.00010466403903874176, + "loss": 0.6897, + "step": 939 + }, + { + "epoch": 0.5013333333333333, + "grad_norm": 0.4228148908480343, + "learning_rate": 0.00010449141534025045, + "loss": 0.6939, + "step": 940 + }, + { + "epoch": 0.5018666666666667, + "grad_norm": 0.41707744163602367, + "learning_rate": 0.00010431877822971117, + "loss": 0.6927, + "step": 941 + }, + { + "epoch": 0.5024, + "grad_norm": 0.4075836226546671, + "learning_rate": 0.00010414612822264455, + "loss": 0.7014, + "step": 942 + }, + { + "epoch": 0.5029333333333333, + "grad_norm": 0.3979750044490217, + "learning_rate": 0.00010397346583460971, + "loss": 0.6854, + "step": 943 + }, + { + "epoch": 0.5034666666666666, + "grad_norm": 0.405131010054494, + "learning_rate": 0.0001038007915812028, + "loss": 0.6202, + "step": 944 + }, + { + "epoch": 0.504, + "grad_norm": 0.4208582594190111, + "learning_rate": 0.00010362810597805526, + "loss": 0.6775, + "step": 945 + }, + { + "epoch": 0.5045333333333333, + "grad_norm": 0.4422173215338135, + "learning_rate": 0.0001034554095408326, + "loss": 0.6463, + "step": 946 + }, + { + "epoch": 0.5050666666666667, + "grad_norm": 0.4583800416518211, + "learning_rate": 0.00010328270278523256, + "loss": 0.7189, + "step": 947 + }, + { + "epoch": 0.5056, + "grad_norm": 0.4663487212335978, + "learning_rate": 0.0001031099862269837, + "loss": 0.7068, + "step": 948 + }, + { + "epoch": 0.5061333333333333, + "grad_norm": 0.6720076843435544, + "learning_rate": 0.00010293726038184393, + "loss": 0.8099, + "step": 949 + }, + { + "epoch": 0.5066666666666667, + "grad_norm": 0.41453475629692693, + "learning_rate": 0.00010276452576559879, + "loss": 0.7004, + "step": 950 + }, + { + "epoch": 0.5072, + "grad_norm": 0.4838049864154515, + "learning_rate": 0.00010259178289406011, + "loss": 0.7132, + "step": 951 + }, + { + "epoch": 0.5077333333333334, + "grad_norm": 0.38956248992999926, + "learning_rate": 0.00010241903228306431, + "loss": 0.6566, + "step": 952 + }, + { + "epoch": 0.5082666666666666, + "grad_norm": 0.4788107091066432, + "learning_rate": 0.0001022462744484709, + "loss": 0.7609, + "step": 953 + }, + { + "epoch": 0.5088, + "grad_norm": 0.42449048921034255, + "learning_rate": 0.00010207350990616107, + "loss": 0.6608, + "step": 954 + }, + { + "epoch": 0.5093333333333333, + "grad_norm": 0.38315009506219316, + "learning_rate": 0.00010190073917203589, + "loss": 0.6547, + "step": 955 + }, + { + "epoch": 0.5098666666666667, + "grad_norm": 0.444624385828526, + "learning_rate": 0.00010172796276201503, + "loss": 0.6875, + "step": 956 + }, + { + "epoch": 0.5104, + "grad_norm": 0.38204646337438464, + "learning_rate": 0.0001015551811920351, + "loss": 0.6206, + "step": 957 + }, + { + "epoch": 0.5109333333333334, + "grad_norm": 0.3892107373391419, + "learning_rate": 0.00010138239497804804, + "loss": 0.6646, + "step": 958 + }, + { + "epoch": 0.5114666666666666, + "grad_norm": 0.4612981285596248, + "learning_rate": 0.00010120960463601976, + "loss": 0.6786, + "step": 959 + }, + { + "epoch": 0.512, + "grad_norm": 0.37446944446069946, + "learning_rate": 0.00010103681068192845, + "loss": 0.6884, + "step": 960 + }, + { + "epoch": 0.5125333333333333, + "grad_norm": 0.4777690939015667, + "learning_rate": 0.00010086401363176305, + "loss": 0.7443, + "step": 961 + }, + { + "epoch": 0.5130666666666667, + "grad_norm": 0.37431633724955565, + "learning_rate": 0.00010069121400152181, + "loss": 0.6579, + "step": 962 + }, + { + "epoch": 0.5136, + "grad_norm": 0.48271017790023146, + "learning_rate": 0.00010051841230721065, + "loss": 0.7562, + "step": 963 + }, + { + "epoch": 0.5141333333333333, + "grad_norm": 0.3137843926939316, + "learning_rate": 0.0001003456090648416, + "loss": 0.6008, + "step": 964 + }, + { + "epoch": 0.5146666666666667, + "grad_norm": 0.48926191422711196, + "learning_rate": 0.00010017280479043147, + "loss": 0.7124, + "step": 965 + }, + { + "epoch": 0.5152, + "grad_norm": 0.4840106358402935, + "learning_rate": 0.0001, + "loss": 0.6094, + "step": 966 + }, + { + "epoch": 0.5157333333333334, + "grad_norm": 0.4109365942571878, + "learning_rate": 9.982719520956855e-05, + "loss": 0.6893, + "step": 967 + }, + { + "epoch": 0.5162666666666667, + "grad_norm": 0.459250743064764, + "learning_rate": 9.965439093515841e-05, + "loss": 0.7158, + "step": 968 + }, + { + "epoch": 0.5168, + "grad_norm": 0.37539218065461527, + "learning_rate": 9.948158769278939e-05, + "loss": 0.7344, + "step": 969 + }, + { + "epoch": 0.5173333333333333, + "grad_norm": 0.45438928402880036, + "learning_rate": 9.930878599847821e-05, + "loss": 0.7003, + "step": 970 + }, + { + "epoch": 0.5178666666666667, + "grad_norm": 0.3850373296029251, + "learning_rate": 9.913598636823693e-05, + "loss": 0.6461, + "step": 971 + }, + { + "epoch": 0.5184, + "grad_norm": 0.44240440247711077, + "learning_rate": 9.896318931807155e-05, + "loss": 0.7101, + "step": 972 + }, + { + "epoch": 0.5189333333333334, + "grad_norm": 0.42555419166065234, + "learning_rate": 9.879039536398024e-05, + "loss": 0.7101, + "step": 973 + }, + { + "epoch": 0.5194666666666666, + "grad_norm": 0.40798177588241413, + "learning_rate": 9.861760502195197e-05, + "loss": 0.6487, + "step": 974 + }, + { + "epoch": 0.52, + "grad_norm": 0.4794810474997049, + "learning_rate": 9.844481880796491e-05, + "loss": 0.624, + "step": 975 + }, + { + "epoch": 0.5205333333333333, + "grad_norm": 0.33123398004892485, + "learning_rate": 9.827203723798498e-05, + "loss": 0.6161, + "step": 976 + }, + { + "epoch": 0.5210666666666667, + "grad_norm": 0.43091338801889584, + "learning_rate": 9.809926082796415e-05, + "loss": 0.6245, + "step": 977 + }, + { + "epoch": 0.5216, + "grad_norm": 0.3982056048141141, + "learning_rate": 9.792649009383899e-05, + "loss": 0.6311, + "step": 978 + }, + { + "epoch": 0.5221333333333333, + "grad_norm": 0.4264119518275276, + "learning_rate": 9.775372555152912e-05, + "loss": 0.7126, + "step": 979 + }, + { + "epoch": 0.5226666666666666, + "grad_norm": 0.4650117198486698, + "learning_rate": 9.758096771693573e-05, + "loss": 0.6909, + "step": 980 + }, + { + "epoch": 0.5232, + "grad_norm": 0.3855651504866148, + "learning_rate": 9.740821710593989e-05, + "loss": 0.6563, + "step": 981 + }, + { + "epoch": 0.5237333333333334, + "grad_norm": 0.4004299561285694, + "learning_rate": 9.723547423440122e-05, + "loss": 0.6418, + "step": 982 + }, + { + "epoch": 0.5242666666666667, + "grad_norm": 0.436475294722715, + "learning_rate": 9.70627396181561e-05, + "loss": 0.68, + "step": 983 + }, + { + "epoch": 0.5248, + "grad_norm": 0.4201228255812106, + "learning_rate": 9.689001377301633e-05, + "loss": 0.7161, + "step": 984 + }, + { + "epoch": 0.5253333333333333, + "grad_norm": 0.44028062963521497, + "learning_rate": 9.671729721476746e-05, + "loss": 0.6964, + "step": 985 + }, + { + "epoch": 0.5258666666666667, + "grad_norm": 0.39430579219492956, + "learning_rate": 9.654459045916743e-05, + "loss": 0.6884, + "step": 986 + }, + { + "epoch": 0.5264, + "grad_norm": 0.41841383153582246, + "learning_rate": 9.637189402194476e-05, + "loss": 0.7072, + "step": 987 + }, + { + "epoch": 0.5269333333333334, + "grad_norm": 0.39579772140427333, + "learning_rate": 9.619920841879725e-05, + "loss": 0.7173, + "step": 988 + }, + { + "epoch": 0.5274666666666666, + "grad_norm": 0.37978904842785305, + "learning_rate": 9.602653416539031e-05, + "loss": 0.6913, + "step": 989 + }, + { + "epoch": 0.528, + "grad_norm": 0.44857958673888443, + "learning_rate": 9.585387177735547e-05, + "loss": 0.6634, + "step": 990 + }, + { + "epoch": 0.5285333333333333, + "grad_norm": 0.4136062585434959, + "learning_rate": 9.568122177028884e-05, + "loss": 0.6516, + "step": 991 + }, + { + "epoch": 0.5290666666666667, + "grad_norm": 0.40079332433727344, + "learning_rate": 9.550858465974958e-05, + "loss": 0.6692, + "step": 992 + }, + { + "epoch": 0.5296, + "grad_norm": 0.5191574423285716, + "learning_rate": 9.533596096125825e-05, + "loss": 0.7063, + "step": 993 + }, + { + "epoch": 0.5301333333333333, + "grad_norm": 0.4489649211382724, + "learning_rate": 9.516335119029546e-05, + "loss": 0.6717, + "step": 994 + }, + { + "epoch": 0.5306666666666666, + "grad_norm": 0.359869051326043, + "learning_rate": 9.499075586230013e-05, + "loss": 0.6032, + "step": 995 + }, + { + "epoch": 0.5312, + "grad_norm": 0.3655288390710915, + "learning_rate": 9.481817549266817e-05, + "loss": 0.6612, + "step": 996 + }, + { + "epoch": 0.5317333333333333, + "grad_norm": 0.36150180799514897, + "learning_rate": 9.464561059675073e-05, + "loss": 0.6246, + "step": 997 + }, + { + "epoch": 0.5322666666666667, + "grad_norm": 0.39936177766924974, + "learning_rate": 9.44730616898528e-05, + "loss": 0.7136, + "step": 998 + }, + { + "epoch": 0.5328, + "grad_norm": 0.4561306271934777, + "learning_rate": 9.430052928723153e-05, + "loss": 0.66, + "step": 999 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 0.4339140449482403, + "learning_rate": 9.412801390409497e-05, + "loss": 0.6832, + "step": 1000 + }, + { + "epoch": 0.5338666666666667, + "grad_norm": 0.49580888230308284, + "learning_rate": 9.395551605560018e-05, + "loss": 0.6612, + "step": 1001 + }, + { + "epoch": 0.5344, + "grad_norm": 0.449576174096107, + "learning_rate": 9.378303625685195e-05, + "loss": 0.6667, + "step": 1002 + }, + { + "epoch": 0.5349333333333334, + "grad_norm": 0.47223661195255384, + "learning_rate": 9.361057502290113e-05, + "loss": 0.6377, + "step": 1003 + }, + { + "epoch": 0.5354666666666666, + "grad_norm": 0.38485172720289945, + "learning_rate": 9.343813286874312e-05, + "loss": 0.6492, + "step": 1004 + }, + { + "epoch": 0.536, + "grad_norm": 0.38502627740041523, + "learning_rate": 9.326571030931637e-05, + "loss": 0.6893, + "step": 1005 + }, + { + "epoch": 0.5365333333333333, + "grad_norm": 0.37223090200132836, + "learning_rate": 9.309330785950086e-05, + "loss": 0.6582, + "step": 1006 + }, + { + "epoch": 0.5370666666666667, + "grad_norm": 0.5748748140440352, + "learning_rate": 9.292092603411641e-05, + "loss": 0.7143, + "step": 1007 + }, + { + "epoch": 0.5376, + "grad_norm": 0.4305085245494093, + "learning_rate": 9.274856534792138e-05, + "loss": 0.6323, + "step": 1008 + }, + { + "epoch": 0.5381333333333334, + "grad_norm": 0.4024815591372681, + "learning_rate": 9.257622631561085e-05, + "loss": 0.643, + "step": 1009 + }, + { + "epoch": 0.5386666666666666, + "grad_norm": 0.39677262680295655, + "learning_rate": 9.240390945181543e-05, + "loss": 0.6761, + "step": 1010 + }, + { + "epoch": 0.5392, + "grad_norm": 0.5059510797763671, + "learning_rate": 9.223161527109937e-05, + "loss": 0.731, + "step": 1011 + }, + { + "epoch": 0.5397333333333333, + "grad_norm": 0.46477522381453795, + "learning_rate": 9.205934428795929e-05, + "loss": 0.6085, + "step": 1012 + }, + { + "epoch": 0.5402666666666667, + "grad_norm": 0.40850542130652917, + "learning_rate": 9.188709701682247e-05, + "loss": 0.7086, + "step": 1013 + }, + { + "epoch": 0.5408, + "grad_norm": 0.45607674638907475, + "learning_rate": 9.171487397204539e-05, + "loss": 0.688, + "step": 1014 + }, + { + "epoch": 0.5413333333333333, + "grad_norm": 0.3768920463507832, + "learning_rate": 9.154267566791223e-05, + "loss": 0.6476, + "step": 1015 + }, + { + "epoch": 0.5418666666666667, + "grad_norm": 0.42699732311359867, + "learning_rate": 9.137050261863324e-05, + "loss": 0.6976, + "step": 1016 + }, + { + "epoch": 0.5424, + "grad_norm": 0.36609008423564304, + "learning_rate": 9.119835533834331e-05, + "loss": 0.6408, + "step": 1017 + }, + { + "epoch": 0.5429333333333334, + "grad_norm": 0.3512204967360172, + "learning_rate": 9.102623434110028e-05, + "loss": 0.6572, + "step": 1018 + }, + { + "epoch": 0.5434666666666667, + "grad_norm": 0.4058479868686318, + "learning_rate": 9.085414014088369e-05, + "loss": 0.6939, + "step": 1019 + }, + { + "epoch": 0.544, + "grad_norm": 0.3748032216832234, + "learning_rate": 9.068207325159284e-05, + "loss": 0.664, + "step": 1020 + }, + { + "epoch": 0.5445333333333333, + "grad_norm": 0.39182865176607806, + "learning_rate": 9.051003418704565e-05, + "loss": 0.6828, + "step": 1021 + }, + { + "epoch": 0.5450666666666667, + "grad_norm": 0.38776893265216555, + "learning_rate": 9.033802346097682e-05, + "loss": 0.6336, + "step": 1022 + }, + { + "epoch": 0.5456, + "grad_norm": 0.35586013216170675, + "learning_rate": 9.016604158703654e-05, + "loss": 0.6448, + "step": 1023 + }, + { + "epoch": 0.5461333333333334, + "grad_norm": 0.5289535489633533, + "learning_rate": 8.999408907878877e-05, + "loss": 0.7025, + "step": 1024 + }, + { + "epoch": 0.5466666666666666, + "grad_norm": 0.37004350986403134, + "learning_rate": 8.982216644970979e-05, + "loss": 0.6477, + "step": 1025 + }, + { + "epoch": 0.5472, + "grad_norm": 0.4022167263992691, + "learning_rate": 8.965027421318665e-05, + "loss": 0.6628, + "step": 1026 + }, + { + "epoch": 0.5477333333333333, + "grad_norm": 0.36263873606627256, + "learning_rate": 8.947841288251568e-05, + "loss": 0.6255, + "step": 1027 + }, + { + "epoch": 0.5482666666666667, + "grad_norm": 0.5169443403628885, + "learning_rate": 8.930658297090091e-05, + "loss": 0.7272, + "step": 1028 + }, + { + "epoch": 0.5488, + "grad_norm": 0.5077474735865758, + "learning_rate": 8.913478499145254e-05, + "loss": 0.7211, + "step": 1029 + }, + { + "epoch": 0.5493333333333333, + "grad_norm": 0.36978250143246966, + "learning_rate": 8.896301945718541e-05, + "loss": 0.6732, + "step": 1030 + }, + { + "epoch": 0.5498666666666666, + "grad_norm": 0.4332494356774574, + "learning_rate": 8.879128688101749e-05, + "loss": 0.6573, + "step": 1031 + }, + { + "epoch": 0.5504, + "grad_norm": 0.41306769252647924, + "learning_rate": 8.861958777576827e-05, + "loss": 0.6417, + "step": 1032 + }, + { + "epoch": 0.5509333333333334, + "grad_norm": 0.39160267542461225, + "learning_rate": 8.844792265415738e-05, + "loss": 0.6577, + "step": 1033 + }, + { + "epoch": 0.5514666666666667, + "grad_norm": 0.3823998381292748, + "learning_rate": 8.827629202880293e-05, + "loss": 0.6811, + "step": 1034 + }, + { + "epoch": 0.552, + "grad_norm": 0.3631664497827257, + "learning_rate": 8.810469641222001e-05, + "loss": 0.7202, + "step": 1035 + }, + { + "epoch": 0.5525333333333333, + "grad_norm": 0.49425879174920706, + "learning_rate": 8.793313631681915e-05, + "loss": 0.6488, + "step": 1036 + }, + { + "epoch": 0.5530666666666667, + "grad_norm": 0.3697474194740872, + "learning_rate": 8.776161225490489e-05, + "loss": 0.713, + "step": 1037 + }, + { + "epoch": 0.5536, + "grad_norm": 0.4310295688825336, + "learning_rate": 8.759012473867407e-05, + "loss": 0.6957, + "step": 1038 + }, + { + "epoch": 0.5541333333333334, + "grad_norm": 0.37156423458479143, + "learning_rate": 8.741867428021446e-05, + "loss": 0.714, + "step": 1039 + }, + { + "epoch": 0.5546666666666666, + "grad_norm": 0.41859740768728004, + "learning_rate": 8.724726139150318e-05, + "loss": 0.6635, + "step": 1040 + }, + { + "epoch": 0.5552, + "grad_norm": 0.39560270417941085, + "learning_rate": 8.707588658440511e-05, + "loss": 0.6528, + "step": 1041 + }, + { + "epoch": 0.5557333333333333, + "grad_norm": 0.4370306562106364, + "learning_rate": 8.690455037067141e-05, + "loss": 0.6906, + "step": 1042 + }, + { + "epoch": 0.5562666666666667, + "grad_norm": 0.3922761785579907, + "learning_rate": 8.673325326193806e-05, + "loss": 0.6699, + "step": 1043 + }, + { + "epoch": 0.5568, + "grad_norm": 0.4442771972537085, + "learning_rate": 8.656199576972423e-05, + "loss": 0.6697, + "step": 1044 + }, + { + "epoch": 0.5573333333333333, + "grad_norm": 0.3928923028223549, + "learning_rate": 8.639077840543077e-05, + "loss": 0.705, + "step": 1045 + }, + { + "epoch": 0.5578666666666666, + "grad_norm": 0.4084189852974764, + "learning_rate": 8.621960168033867e-05, + "loss": 0.7075, + "step": 1046 + }, + { + "epoch": 0.5584, + "grad_norm": 0.37486857783456, + "learning_rate": 8.604846610560771e-05, + "loss": 0.6281, + "step": 1047 + }, + { + "epoch": 0.5589333333333333, + "grad_norm": 0.3834285003923, + "learning_rate": 8.587737219227462e-05, + "loss": 0.6921, + "step": 1048 + }, + { + "epoch": 0.5594666666666667, + "grad_norm": 0.41008495293543173, + "learning_rate": 8.570632045125185e-05, + "loss": 0.679, + "step": 1049 + }, + { + "epoch": 0.56, + "grad_norm": 0.4293866733746269, + "learning_rate": 8.553531139332582e-05, + "loss": 0.6939, + "step": 1050 + }, + { + "epoch": 0.5605333333333333, + "grad_norm": 0.3722777267141457, + "learning_rate": 8.536434552915556e-05, + "loss": 0.6893, + "step": 1051 + }, + { + "epoch": 0.5610666666666667, + "grad_norm": 0.37247546020014505, + "learning_rate": 8.519342336927105e-05, + "loss": 0.6587, + "step": 1052 + }, + { + "epoch": 0.5616, + "grad_norm": 0.49495381015713114, + "learning_rate": 8.502254542407186e-05, + "loss": 0.6979, + "step": 1053 + }, + { + "epoch": 0.5621333333333334, + "grad_norm": 0.42177932672113927, + "learning_rate": 8.485171220382545e-05, + "loss": 0.6318, + "step": 1054 + }, + { + "epoch": 0.5626666666666666, + "grad_norm": 0.4659911613298317, + "learning_rate": 8.468092421866573e-05, + "loss": 0.717, + "step": 1055 + }, + { + "epoch": 0.5632, + "grad_norm": 0.38000172119463005, + "learning_rate": 8.451018197859153e-05, + "loss": 0.6755, + "step": 1056 + }, + { + "epoch": 0.5637333333333333, + "grad_norm": 0.38658253768633705, + "learning_rate": 8.433948599346516e-05, + "loss": 0.6553, + "step": 1057 + }, + { + "epoch": 0.5642666666666667, + "grad_norm": 0.32209471560397346, + "learning_rate": 8.416883677301069e-05, + "loss": 0.5967, + "step": 1058 + }, + { + "epoch": 0.5648, + "grad_norm": 0.37915306478585464, + "learning_rate": 8.399823482681262e-05, + "loss": 0.5964, + "step": 1059 + }, + { + "epoch": 0.5653333333333334, + "grad_norm": 0.4144123664508527, + "learning_rate": 8.382768066431425e-05, + "loss": 0.722, + "step": 1060 + }, + { + "epoch": 0.5658666666666666, + "grad_norm": 0.44310885866058436, + "learning_rate": 8.36571747948162e-05, + "loss": 0.663, + "step": 1061 + }, + { + "epoch": 0.5664, + "grad_norm": 0.4473815955317685, + "learning_rate": 8.348671772747487e-05, + "loss": 0.7348, + "step": 1062 + }, + { + "epoch": 0.5669333333333333, + "grad_norm": 0.4394755934053004, + "learning_rate": 8.33163099713009e-05, + "loss": 0.7117, + "step": 1063 + }, + { + "epoch": 0.5674666666666667, + "grad_norm": 0.41061642198296605, + "learning_rate": 8.31459520351578e-05, + "loss": 0.6948, + "step": 1064 + }, + { + "epoch": 0.568, + "grad_norm": 0.40904245927736166, + "learning_rate": 8.297564442776014e-05, + "loss": 0.7221, + "step": 1065 + }, + { + "epoch": 0.5685333333333333, + "grad_norm": 0.46075497966600326, + "learning_rate": 8.280538765767235e-05, + "loss": 0.6659, + "step": 1066 + }, + { + "epoch": 0.5690666666666667, + "grad_norm": 0.40916738706197076, + "learning_rate": 8.263518223330697e-05, + "loss": 0.6964, + "step": 1067 + }, + { + "epoch": 0.5696, + "grad_norm": 0.43726657778993133, + "learning_rate": 8.246502866292324e-05, + "loss": 0.7611, + "step": 1068 + }, + { + "epoch": 0.5701333333333334, + "grad_norm": 0.4257493496301308, + "learning_rate": 8.22949274546255e-05, + "loss": 0.644, + "step": 1069 + }, + { + "epoch": 0.5706666666666667, + "grad_norm": 0.42810911513519423, + "learning_rate": 8.212487911636184e-05, + "loss": 0.6709, + "step": 1070 + }, + { + "epoch": 0.5712, + "grad_norm": 0.39822224660334266, + "learning_rate": 8.195488415592238e-05, + "loss": 0.6854, + "step": 1071 + }, + { + "epoch": 0.5717333333333333, + "grad_norm": 0.4119235536615006, + "learning_rate": 8.178494308093789e-05, + "loss": 0.6241, + "step": 1072 + }, + { + "epoch": 0.5722666666666667, + "grad_norm": 0.38188802844343217, + "learning_rate": 8.161505639887817e-05, + "loss": 0.6893, + "step": 1073 + }, + { + "epoch": 0.5728, + "grad_norm": 0.3748132749109451, + "learning_rate": 8.144522461705067e-05, + "loss": 0.6572, + "step": 1074 + }, + { + "epoch": 0.5733333333333334, + "grad_norm": 0.4070627537552577, + "learning_rate": 8.127544824259889e-05, + "loss": 0.6247, + "step": 1075 + }, + { + "epoch": 0.5738666666666666, + "grad_norm": 0.4228495901369822, + "learning_rate": 8.110572778250085e-05, + "loss": 0.665, + "step": 1076 + }, + { + "epoch": 0.5744, + "grad_norm": 0.5125789789195911, + "learning_rate": 8.093606374356759e-05, + "loss": 0.757, + "step": 1077 + }, + { + "epoch": 0.5749333333333333, + "grad_norm": 0.37762127950207147, + "learning_rate": 8.076645663244168e-05, + "loss": 0.7095, + "step": 1078 + }, + { + "epoch": 0.5754666666666667, + "grad_norm": 0.45047309222896786, + "learning_rate": 8.059690695559568e-05, + "loss": 0.6102, + "step": 1079 + }, + { + "epoch": 0.576, + "grad_norm": 0.37069355456738307, + "learning_rate": 8.042741521933071e-05, + "loss": 0.6253, + "step": 1080 + }, + { + "epoch": 0.5765333333333333, + "grad_norm": 0.3832310801872979, + "learning_rate": 8.025798192977481e-05, + "loss": 0.6533, + "step": 1081 + }, + { + "epoch": 0.5770666666666666, + "grad_norm": 0.3907820180259579, + "learning_rate": 8.008860759288147e-05, + "loss": 0.6768, + "step": 1082 + }, + { + "epoch": 0.5776, + "grad_norm": 0.5219383239878169, + "learning_rate": 7.991929271442817e-05, + "loss": 0.6795, + "step": 1083 + }, + { + "epoch": 0.5781333333333334, + "grad_norm": 0.4223255750226302, + "learning_rate": 7.975003780001485e-05, + "loss": 0.6748, + "step": 1084 + }, + { + "epoch": 0.5786666666666667, + "grad_norm": 0.34552505395735383, + "learning_rate": 7.958084335506239e-05, + "loss": 0.5693, + "step": 1085 + }, + { + "epoch": 0.5792, + "grad_norm": 0.38227932490961664, + "learning_rate": 7.941170988481108e-05, + "loss": 0.6606, + "step": 1086 + }, + { + "epoch": 0.5797333333333333, + "grad_norm": 0.35869376565014843, + "learning_rate": 7.924263789431912e-05, + "loss": 0.6179, + "step": 1087 + }, + { + "epoch": 0.5802666666666667, + "grad_norm": 0.4402356169989974, + "learning_rate": 7.907362788846116e-05, + "loss": 0.6694, + "step": 1088 + }, + { + "epoch": 0.5808, + "grad_norm": 0.40006983810082125, + "learning_rate": 7.89046803719267e-05, + "loss": 0.6883, + "step": 1089 + }, + { + "epoch": 0.5813333333333334, + "grad_norm": 0.36870685385884555, + "learning_rate": 7.873579584921869e-05, + "loss": 0.6445, + "step": 1090 + }, + { + "epoch": 0.5818666666666666, + "grad_norm": 0.4106821402930094, + "learning_rate": 7.856697482465196e-05, + "loss": 0.6718, + "step": 1091 + }, + { + "epoch": 0.5824, + "grad_norm": 0.463580435466865, + "learning_rate": 7.839821780235168e-05, + "loss": 0.6595, + "step": 1092 + }, + { + "epoch": 0.5829333333333333, + "grad_norm": 0.35493664870979025, + "learning_rate": 7.822952528625191e-05, + "loss": 0.5964, + "step": 1093 + }, + { + "epoch": 0.5834666666666667, + "grad_norm": 0.45086792660255615, + "learning_rate": 7.806089778009421e-05, + "loss": 0.6241, + "step": 1094 + }, + { + "epoch": 0.584, + "grad_norm": 0.3931350665416031, + "learning_rate": 7.789233578742582e-05, + "loss": 0.6916, + "step": 1095 + }, + { + "epoch": 0.5845333333333333, + "grad_norm": 0.4173050433418133, + "learning_rate": 7.772383981159849e-05, + "loss": 0.6907, + "step": 1096 + }, + { + "epoch": 0.5850666666666666, + "grad_norm": 0.4500438797061833, + "learning_rate": 7.755541035576677e-05, + "loss": 0.723, + "step": 1097 + }, + { + "epoch": 0.5856, + "grad_norm": 0.3849060026134618, + "learning_rate": 7.738704792288655e-05, + "loss": 0.6538, + "step": 1098 + }, + { + "epoch": 0.5861333333333333, + "grad_norm": 0.4572211266887904, + "learning_rate": 7.721875301571359e-05, + "loss": 0.6687, + "step": 1099 + }, + { + "epoch": 0.5866666666666667, + "grad_norm": 0.43618815957297913, + "learning_rate": 7.705052613680211e-05, + "loss": 0.6553, + "step": 1100 + }, + { + "epoch": 0.5872, + "grad_norm": 0.46688255334776935, + "learning_rate": 7.688236778850306e-05, + "loss": 0.6435, + "step": 1101 + }, + { + "epoch": 0.5877333333333333, + "grad_norm": 0.3769347514358419, + "learning_rate": 7.671427847296275e-05, + "loss": 0.5999, + "step": 1102 + }, + { + "epoch": 0.5882666666666667, + "grad_norm": 0.5186139577531543, + "learning_rate": 7.654625869212146e-05, + "loss": 0.7119, + "step": 1103 + }, + { + "epoch": 0.5888, + "grad_norm": 0.37935317504485616, + "learning_rate": 7.637830894771175e-05, + "loss": 0.6107, + "step": 1104 + }, + { + "epoch": 0.5893333333333334, + "grad_norm": 0.3638310753856444, + "learning_rate": 7.6210429741257e-05, + "loss": 0.6534, + "step": 1105 + }, + { + "epoch": 0.5898666666666667, + "grad_norm": 0.3352649331129673, + "learning_rate": 7.604262157407007e-05, + "loss": 0.6271, + "step": 1106 + }, + { + "epoch": 0.5904, + "grad_norm": 0.5109608294076012, + "learning_rate": 7.587488494725157e-05, + "loss": 0.7438, + "step": 1107 + }, + { + "epoch": 0.5909333333333333, + "grad_norm": 0.3952899095104239, + "learning_rate": 7.570722036168854e-05, + "loss": 0.6307, + "step": 1108 + }, + { + "epoch": 0.5914666666666667, + "grad_norm": 0.42262482407209273, + "learning_rate": 7.55396283180529e-05, + "loss": 0.6796, + "step": 1109 + }, + { + "epoch": 0.592, + "grad_norm": 0.447862424578651, + "learning_rate": 7.537210931679987e-05, + "loss": 0.6812, + "step": 1110 + }, + { + "epoch": 0.5925333333333334, + "grad_norm": 0.4486983192896598, + "learning_rate": 7.520466385816671e-05, + "loss": 0.7068, + "step": 1111 + }, + { + "epoch": 0.5930666666666666, + "grad_norm": 0.3510166579154817, + "learning_rate": 7.503729244217086e-05, + "loss": 0.6795, + "step": 1112 + }, + { + "epoch": 0.5936, + "grad_norm": 0.4132820453615415, + "learning_rate": 7.48699955686089e-05, + "loss": 0.6577, + "step": 1113 + }, + { + "epoch": 0.5941333333333333, + "grad_norm": 0.47102117904417895, + "learning_rate": 7.470277373705461e-05, + "loss": 0.7211, + "step": 1114 + }, + { + "epoch": 0.5946666666666667, + "grad_norm": 0.5029222166630157, + "learning_rate": 7.453562744685778e-05, + "loss": 0.6415, + "step": 1115 + }, + { + "epoch": 0.5952, + "grad_norm": 0.37199366289640917, + "learning_rate": 7.43685571971426e-05, + "loss": 0.6991, + "step": 1116 + }, + { + "epoch": 0.5957333333333333, + "grad_norm": 0.36974073757686976, + "learning_rate": 7.42015634868062e-05, + "loss": 0.7037, + "step": 1117 + }, + { + "epoch": 0.5962666666666666, + "grad_norm": 0.5839954782570124, + "learning_rate": 7.403464681451715e-05, + "loss": 0.6726, + "step": 1118 + }, + { + "epoch": 0.5968, + "grad_norm": 0.43525495380757456, + "learning_rate": 7.386780767871397e-05, + "loss": 0.6936, + "step": 1119 + }, + { + "epoch": 0.5973333333333334, + "grad_norm": 0.42660667566749433, + "learning_rate": 7.370104657760361e-05, + "loss": 0.6512, + "step": 1120 + }, + { + "epoch": 0.5978666666666667, + "grad_norm": 0.48853798260934156, + "learning_rate": 7.353436400916004e-05, + "loss": 0.7014, + "step": 1121 + }, + { + "epoch": 0.5984, + "grad_norm": 0.40782743015049067, + "learning_rate": 7.336776047112276e-05, + "loss": 0.6324, + "step": 1122 + }, + { + "epoch": 0.5989333333333333, + "grad_norm": 0.39730932447158884, + "learning_rate": 7.320123646099519e-05, + "loss": 0.6312, + "step": 1123 + }, + { + "epoch": 0.5994666666666667, + "grad_norm": 0.43644781264951427, + "learning_rate": 7.303479247604332e-05, + "loss": 0.6762, + "step": 1124 + }, + { + "epoch": 0.6, + "grad_norm": 0.4357832284765578, + "learning_rate": 7.286842901329412e-05, + "loss": 0.7199, + "step": 1125 + }, + { + "epoch": 0.6005333333333334, + "grad_norm": 0.390387285013816, + "learning_rate": 7.270214656953415e-05, + "loss": 0.6117, + "step": 1126 + }, + { + "epoch": 0.6010666666666666, + "grad_norm": 0.4554194962981507, + "learning_rate": 7.253594564130804e-05, + "loss": 0.7535, + "step": 1127 + }, + { + "epoch": 0.6016, + "grad_norm": 0.4437873015953556, + "learning_rate": 7.236982672491698e-05, + "loss": 0.6579, + "step": 1128 + }, + { + "epoch": 0.6021333333333333, + "grad_norm": 0.4099373197208037, + "learning_rate": 7.22037903164173e-05, + "loss": 0.6504, + "step": 1129 + }, + { + "epoch": 0.6026666666666667, + "grad_norm": 0.5887119870168502, + "learning_rate": 7.203783691161883e-05, + "loss": 0.6521, + "step": 1130 + }, + { + "epoch": 0.6032, + "grad_norm": 0.406654003215759, + "learning_rate": 7.187196700608373e-05, + "loss": 0.6003, + "step": 1131 + }, + { + "epoch": 0.6037333333333333, + "grad_norm": 0.46502921756379767, + "learning_rate": 7.170618109512465e-05, + "loss": 0.7211, + "step": 1132 + }, + { + "epoch": 0.6042666666666666, + "grad_norm": 0.4815546468614992, + "learning_rate": 7.154047967380354e-05, + "loss": 0.7307, + "step": 1133 + }, + { + "epoch": 0.6048, + "grad_norm": 0.45506880792711507, + "learning_rate": 7.137486323692995e-05, + "loss": 0.6761, + "step": 1134 + }, + { + "epoch": 0.6053333333333333, + "grad_norm": 0.4070671663392784, + "learning_rate": 7.12093322790597e-05, + "loss": 0.6572, + "step": 1135 + }, + { + "epoch": 0.6058666666666667, + "grad_norm": 0.38718530466773343, + "learning_rate": 7.104388729449338e-05, + "loss": 0.6572, + "step": 1136 + }, + { + "epoch": 0.6064, + "grad_norm": 0.3867510783801677, + "learning_rate": 7.087852877727481e-05, + "loss": 0.6178, + "step": 1137 + }, + { + "epoch": 0.6069333333333333, + "grad_norm": 0.3935342723371812, + "learning_rate": 7.071325722118963e-05, + "loss": 0.6705, + "step": 1138 + }, + { + "epoch": 0.6074666666666667, + "grad_norm": 0.3726697083680594, + "learning_rate": 7.054807311976379e-05, + "loss": 0.6068, + "step": 1139 + }, + { + "epoch": 0.608, + "grad_norm": 1.0347332813609864, + "learning_rate": 7.038297696626206e-05, + "loss": 0.6223, + "step": 1140 + }, + { + "epoch": 0.6085333333333334, + "grad_norm": 0.44285319936485756, + "learning_rate": 7.021796925368667e-05, + "loss": 0.7099, + "step": 1141 + }, + { + "epoch": 0.6090666666666666, + "grad_norm": 0.41594810747635413, + "learning_rate": 7.005305047477566e-05, + "loss": 0.7445, + "step": 1142 + }, + { + "epoch": 0.6096, + "grad_norm": 0.34338966810019267, + "learning_rate": 6.988822112200156e-05, + "loss": 0.6373, + "step": 1143 + }, + { + "epoch": 0.6101333333333333, + "grad_norm": 0.3756166201718267, + "learning_rate": 6.972348168756983e-05, + "loss": 0.6856, + "step": 1144 + }, + { + "epoch": 0.6106666666666667, + "grad_norm": 0.4588254867492807, + "learning_rate": 6.955883266341741e-05, + "loss": 0.6636, + "step": 1145 + }, + { + "epoch": 0.6112, + "grad_norm": 0.4021662128404931, + "learning_rate": 6.939427454121128e-05, + "loss": 0.5913, + "step": 1146 + }, + { + "epoch": 0.6117333333333334, + "grad_norm": 0.45494546001161157, + "learning_rate": 6.922980781234699e-05, + "loss": 0.7814, + "step": 1147 + }, + { + "epoch": 0.6122666666666666, + "grad_norm": 0.39549127149064955, + "learning_rate": 6.906543296794714e-05, + "loss": 0.6147, + "step": 1148 + }, + { + "epoch": 0.6128, + "grad_norm": 0.5363666722798816, + "learning_rate": 6.890115049885994e-05, + "loss": 0.6595, + "step": 1149 + }, + { + "epoch": 0.6133333333333333, + "grad_norm": 0.4401617730306245, + "learning_rate": 6.873696089565786e-05, + "loss": 0.7577, + "step": 1150 + }, + { + "epoch": 0.6138666666666667, + "grad_norm": 0.3863593737160001, + "learning_rate": 6.85728646486359e-05, + "loss": 0.6513, + "step": 1151 + }, + { + "epoch": 0.6144, + "grad_norm": 0.44121101952562514, + "learning_rate": 6.84088622478104e-05, + "loss": 0.7003, + "step": 1152 + }, + { + "epoch": 0.6149333333333333, + "grad_norm": 0.38262277713125686, + "learning_rate": 6.82449541829174e-05, + "loss": 0.6925, + "step": 1153 + }, + { + "epoch": 0.6154666666666667, + "grad_norm": 0.3983866461238884, + "learning_rate": 6.80811409434113e-05, + "loss": 0.6783, + "step": 1154 + }, + { + "epoch": 0.616, + "grad_norm": 0.3804255660605288, + "learning_rate": 6.791742301846326e-05, + "loss": 0.6602, + "step": 1155 + }, + { + "epoch": 0.6165333333333334, + "grad_norm": 0.41078728515619706, + "learning_rate": 6.775380089695986e-05, + "loss": 0.6374, + "step": 1156 + }, + { + "epoch": 0.6170666666666667, + "grad_norm": 0.4598502622667575, + "learning_rate": 6.759027506750158e-05, + "loss": 0.7116, + "step": 1157 + }, + { + "epoch": 0.6176, + "grad_norm": 0.4094866119975501, + "learning_rate": 6.742684601840141e-05, + "loss": 0.6947, + "step": 1158 + }, + { + "epoch": 0.6181333333333333, + "grad_norm": 0.4113188707736787, + "learning_rate": 6.726351423768322e-05, + "loss": 0.72, + "step": 1159 + }, + { + "epoch": 0.6186666666666667, + "grad_norm": 0.46464533338121966, + "learning_rate": 6.710028021308061e-05, + "loss": 0.7239, + "step": 1160 + }, + { + "epoch": 0.6192, + "grad_norm": 0.39169990421551154, + "learning_rate": 6.693714443203507e-05, + "loss": 0.6028, + "step": 1161 + }, + { + "epoch": 0.6197333333333334, + "grad_norm": 0.378749358223809, + "learning_rate": 6.677410738169485e-05, + "loss": 0.6523, + "step": 1162 + }, + { + "epoch": 0.6202666666666666, + "grad_norm": 0.4694356959374317, + "learning_rate": 6.661116954891328e-05, + "loss": 0.6835, + "step": 1163 + }, + { + "epoch": 0.6208, + "grad_norm": 0.39386387986657706, + "learning_rate": 6.644833142024751e-05, + "loss": 0.5547, + "step": 1164 + }, + { + "epoch": 0.6213333333333333, + "grad_norm": 0.41022887333091956, + "learning_rate": 6.62855934819569e-05, + "loss": 0.6717, + "step": 1165 + }, + { + "epoch": 0.6218666666666667, + "grad_norm": 0.4869445304036569, + "learning_rate": 6.612295622000162e-05, + "loss": 0.6458, + "step": 1166 + }, + { + "epoch": 0.6224, + "grad_norm": 0.39617360324862894, + "learning_rate": 6.59604201200412e-05, + "loss": 0.6666, + "step": 1167 + }, + { + "epoch": 0.6229333333333333, + "grad_norm": 0.4461612095821093, + "learning_rate": 6.579798566743314e-05, + "loss": 0.6914, + "step": 1168 + }, + { + "epoch": 0.6234666666666666, + "grad_norm": 0.360307198191942, + "learning_rate": 6.563565334723134e-05, + "loss": 0.6993, + "step": 1169 + }, + { + "epoch": 0.624, + "grad_norm": 0.41306360228437694, + "learning_rate": 6.547342364418481e-05, + "loss": 0.6044, + "step": 1170 + }, + { + "epoch": 0.6245333333333334, + "grad_norm": 0.3920470324410973, + "learning_rate": 6.531129704273604e-05, + "loss": 0.6394, + "step": 1171 + }, + { + "epoch": 0.6250666666666667, + "grad_norm": 0.39942722731479635, + "learning_rate": 6.514927402701964e-05, + "loss": 0.6395, + "step": 1172 + }, + { + "epoch": 0.6256, + "grad_norm": 0.36525434652313277, + "learning_rate": 6.498735508086093e-05, + "loss": 0.5635, + "step": 1173 + }, + { + "epoch": 0.6261333333333333, + "grad_norm": 0.3697333028186257, + "learning_rate": 6.48255406877745e-05, + "loss": 0.6489, + "step": 1174 + }, + { + "epoch": 0.6266666666666667, + "grad_norm": 0.5034693761597592, + "learning_rate": 6.466383133096267e-05, + "loss": 0.6638, + "step": 1175 + }, + { + "epoch": 0.6272, + "grad_norm": 0.38198759421158524, + "learning_rate": 6.450222749331414e-05, + "loss": 0.6039, + "step": 1176 + }, + { + "epoch": 0.6277333333333334, + "grad_norm": 0.3946903523364556, + "learning_rate": 6.434072965740242e-05, + "loss": 0.6744, + "step": 1177 + }, + { + "epoch": 0.6282666666666666, + "grad_norm": 0.40744526178631907, + "learning_rate": 6.417933830548467e-05, + "loss": 0.6556, + "step": 1178 + }, + { + "epoch": 0.6288, + "grad_norm": 0.3730874608664777, + "learning_rate": 6.40180539194999e-05, + "loss": 0.591, + "step": 1179 + }, + { + "epoch": 0.6293333333333333, + "grad_norm": 0.3707336373470534, + "learning_rate": 6.385687698106781e-05, + "loss": 0.6409, + "step": 1180 + }, + { + "epoch": 0.6298666666666667, + "grad_norm": 0.41760664397135294, + "learning_rate": 6.369580797148718e-05, + "loss": 0.603, + "step": 1181 + }, + { + "epoch": 0.6304, + "grad_norm": 0.417252907963043, + "learning_rate": 6.35348473717345e-05, + "loss": 0.6289, + "step": 1182 + }, + { + "epoch": 0.6309333333333333, + "grad_norm": 0.4192323860449515, + "learning_rate": 6.337399566246257e-05, + "loss": 0.6418, + "step": 1183 + }, + { + "epoch": 0.6314666666666666, + "grad_norm": 0.4427327328342678, + "learning_rate": 6.321325332399903e-05, + "loss": 0.7087, + "step": 1184 + }, + { + "epoch": 0.632, + "grad_norm": 0.5116801317498956, + "learning_rate": 6.305262083634488e-05, + "loss": 0.7527, + "step": 1185 + }, + { + "epoch": 0.6325333333333333, + "grad_norm": 0.4260104138905186, + "learning_rate": 6.289209867917312e-05, + "loss": 0.6794, + "step": 1186 + }, + { + "epoch": 0.6330666666666667, + "grad_norm": 0.4665746999686809, + "learning_rate": 6.273168733182722e-05, + "loss": 0.6862, + "step": 1187 + }, + { + "epoch": 0.6336, + "grad_norm": 0.4279013793908042, + "learning_rate": 6.25713872733199e-05, + "loss": 0.7239, + "step": 1188 + }, + { + "epoch": 0.6341333333333333, + "grad_norm": 0.3541190181930973, + "learning_rate": 6.241119898233144e-05, + "loss": 0.6048, + "step": 1189 + }, + { + "epoch": 0.6346666666666667, + "grad_norm": 0.4741478976025896, + "learning_rate": 6.225112293720836e-05, + "loss": 0.7045, + "step": 1190 + }, + { + "epoch": 0.6352, + "grad_norm": 0.3998080163269796, + "learning_rate": 6.209115961596208e-05, + "loss": 0.6662, + "step": 1191 + }, + { + "epoch": 0.6357333333333334, + "grad_norm": 0.44602537035538536, + "learning_rate": 6.19313094962673e-05, + "loss": 0.6517, + "step": 1192 + }, + { + "epoch": 0.6362666666666666, + "grad_norm": 0.37199940613723986, + "learning_rate": 6.177157305546078e-05, + "loss": 0.6568, + "step": 1193 + }, + { + "epoch": 0.6368, + "grad_norm": 0.39541468515323636, + "learning_rate": 6.161195077053976e-05, + "loss": 0.7558, + "step": 1194 + }, + { + "epoch": 0.6373333333333333, + "grad_norm": 0.4990262998205763, + "learning_rate": 6.145244311816063e-05, + "loss": 0.709, + "step": 1195 + }, + { + "epoch": 0.6378666666666667, + "grad_norm": 0.4283663258149843, + "learning_rate": 6.129305057463741e-05, + "loss": 0.6866, + "step": 1196 + }, + { + "epoch": 0.6384, + "grad_norm": 0.4000143487971441, + "learning_rate": 6.113377361594049e-05, + "loss": 0.6564, + "step": 1197 + }, + { + "epoch": 0.6389333333333334, + "grad_norm": 0.42369255064292277, + "learning_rate": 6.0974612717695004e-05, + "loss": 0.6963, + "step": 1198 + }, + { + "epoch": 0.6394666666666666, + "grad_norm": 0.39753999292591025, + "learning_rate": 6.0815568355179556e-05, + "loss": 0.6189, + "step": 1199 + }, + { + "epoch": 0.64, + "grad_norm": 0.37759579166297613, + "learning_rate": 6.065664100332478e-05, + "loss": 0.6918, + "step": 1200 + }, + { + "epoch": 0.6405333333333333, + "grad_norm": 0.439168186056477, + "learning_rate": 6.0497831136711836e-05, + "loss": 0.6296, + "step": 1201 + }, + { + "epoch": 0.6410666666666667, + "grad_norm": 0.3384595598430012, + "learning_rate": 6.0339139229571116e-05, + "loss": 0.6401, + "step": 1202 + }, + { + "epoch": 0.6416, + "grad_norm": 0.4397252068067179, + "learning_rate": 6.018056575578075e-05, + "loss": 0.6843, + "step": 1203 + }, + { + "epoch": 0.6421333333333333, + "grad_norm": 0.4495587434394545, + "learning_rate": 6.002211118886514e-05, + "loss": 0.686, + "step": 1204 + }, + { + "epoch": 0.6426666666666667, + "grad_norm": 0.4033178131180434, + "learning_rate": 5.986377600199371e-05, + "loss": 0.6749, + "step": 1205 + }, + { + "epoch": 0.6432, + "grad_norm": 0.41966382476424724, + "learning_rate": 5.970556066797941e-05, + "loss": 0.6455, + "step": 1206 + }, + { + "epoch": 0.6437333333333334, + "grad_norm": 0.5730029679016254, + "learning_rate": 5.9547465659277215e-05, + "loss": 0.682, + "step": 1207 + }, + { + "epoch": 0.6442666666666667, + "grad_norm": 0.4780240044650562, + "learning_rate": 5.938949144798279e-05, + "loss": 0.6754, + "step": 1208 + }, + { + "epoch": 0.6448, + "grad_norm": 0.35299829115160963, + "learning_rate": 5.923163850583113e-05, + "loss": 0.6331, + "step": 1209 + }, + { + "epoch": 0.6453333333333333, + "grad_norm": 0.43510483087266877, + "learning_rate": 5.907390730419507e-05, + "loss": 0.6175, + "step": 1210 + }, + { + "epoch": 0.6458666666666667, + "grad_norm": 0.37019265249504896, + "learning_rate": 5.8916298314083915e-05, + "loss": 0.6513, + "step": 1211 + }, + { + "epoch": 0.6464, + "grad_norm": 0.4344431213608693, + "learning_rate": 5.875881200614207e-05, + "loss": 0.6875, + "step": 1212 + }, + { + "epoch": 0.6469333333333334, + "grad_norm": 0.3726653750133173, + "learning_rate": 5.860144885064751e-05, + "loss": 0.6514, + "step": 1213 + }, + { + "epoch": 0.6474666666666666, + "grad_norm": 0.4270131560555474, + "learning_rate": 5.8444209317510514e-05, + "loss": 0.6934, + "step": 1214 + }, + { + "epoch": 0.648, + "grad_norm": 0.4344173959790919, + "learning_rate": 5.828709387627218e-05, + "loss": 0.6117, + "step": 1215 + }, + { + "epoch": 0.6485333333333333, + "grad_norm": 0.3817477178203144, + "learning_rate": 5.813010299610313e-05, + "loss": 0.6606, + "step": 1216 + }, + { + "epoch": 0.6490666666666667, + "grad_norm": 0.3946044258169478, + "learning_rate": 5.797323714580192e-05, + "loss": 0.6195, + "step": 1217 + }, + { + "epoch": 0.6496, + "grad_norm": 0.5770606292806082, + "learning_rate": 5.781649679379378e-05, + "loss": 0.6672, + "step": 1218 + }, + { + "epoch": 0.6501333333333333, + "grad_norm": 0.5431755903263947, + "learning_rate": 5.765988240812921e-05, + "loss": 0.7173, + "step": 1219 + }, + { + "epoch": 0.6506666666666666, + "grad_norm": 0.3757472330097249, + "learning_rate": 5.750339445648252e-05, + "loss": 0.6221, + "step": 1220 + }, + { + "epoch": 0.6512, + "grad_norm": 0.40837193866788307, + "learning_rate": 5.73470334061505e-05, + "loss": 0.635, + "step": 1221 + }, + { + "epoch": 0.6517333333333334, + "grad_norm": 0.49505128852289804, + "learning_rate": 5.7190799724050924e-05, + "loss": 0.6539, + "step": 1222 + }, + { + "epoch": 0.6522666666666667, + "grad_norm": 0.41076517945799657, + "learning_rate": 5.7034693876721376e-05, + "loss": 0.6393, + "step": 1223 + }, + { + "epoch": 0.6528, + "grad_norm": 0.39442404738518705, + "learning_rate": 5.687871633031754e-05, + "loss": 0.6199, + "step": 1224 + }, + { + "epoch": 0.6533333333333333, + "grad_norm": 0.4155139537955921, + "learning_rate": 5.6722867550612116e-05, + "loss": 0.6536, + "step": 1225 + }, + { + "epoch": 0.6538666666666667, + "grad_norm": 0.4099148371054838, + "learning_rate": 5.6567148002993164e-05, + "loss": 0.6662, + "step": 1226 + }, + { + "epoch": 0.6544, + "grad_norm": 0.36153713831866474, + "learning_rate": 5.6411558152462894e-05, + "loss": 0.6329, + "step": 1227 + }, + { + "epoch": 0.6549333333333334, + "grad_norm": 0.43267639800342794, + "learning_rate": 5.625609846363622e-05, + "loss": 0.6379, + "step": 1228 + }, + { + "epoch": 0.6554666666666666, + "grad_norm": 0.5193961832474688, + "learning_rate": 5.6100769400739383e-05, + "loss": 0.643, + "step": 1229 + }, + { + "epoch": 0.656, + "grad_norm": 0.3961001135726747, + "learning_rate": 5.5945571427608526e-05, + "loss": 0.633, + "step": 1230 + }, + { + "epoch": 0.6565333333333333, + "grad_norm": 0.3776635113091816, + "learning_rate": 5.579050500768836e-05, + "loss": 0.5916, + "step": 1231 + }, + { + "epoch": 0.6570666666666667, + "grad_norm": 0.40026739610815426, + "learning_rate": 5.5635570604030705e-05, + "loss": 0.6106, + "step": 1232 + }, + { + "epoch": 0.6576, + "grad_norm": 0.42506452958981844, + "learning_rate": 5.54807686792933e-05, + "loss": 0.6091, + "step": 1233 + }, + { + "epoch": 0.6581333333333333, + "grad_norm": 0.4767583065398524, + "learning_rate": 5.53260996957381e-05, + "loss": 0.6733, + "step": 1234 + }, + { + "epoch": 0.6586666666666666, + "grad_norm": 0.3825141222402969, + "learning_rate": 5.5171564115230254e-05, + "loss": 0.6489, + "step": 1235 + }, + { + "epoch": 0.6592, + "grad_norm": 0.40932401078675557, + "learning_rate": 5.501716239923642e-05, + "loss": 0.6656, + "step": 1236 + }, + { + "epoch": 0.6597333333333333, + "grad_norm": 0.4683859535287396, + "learning_rate": 5.486289500882355e-05, + "loss": 0.6306, + "step": 1237 + }, + { + "epoch": 0.6602666666666667, + "grad_norm": 0.49713749196368245, + "learning_rate": 5.47087624046575e-05, + "loss": 0.6261, + "step": 1238 + }, + { + "epoch": 0.6608, + "grad_norm": 0.4046846776425327, + "learning_rate": 5.4554765047001613e-05, + "loss": 0.6369, + "step": 1239 + }, + { + "epoch": 0.6613333333333333, + "grad_norm": 0.4226828114463185, + "learning_rate": 5.4400903395715366e-05, + "loss": 0.6508, + "step": 1240 + }, + { + "epoch": 0.6618666666666667, + "grad_norm": 0.3926616257205702, + "learning_rate": 5.424717791025302e-05, + "loss": 0.6502, + "step": 1241 + }, + { + "epoch": 0.6624, + "grad_norm": 0.423404611081691, + "learning_rate": 5.4093589049662175e-05, + "loss": 0.6449, + "step": 1242 + }, + { + "epoch": 0.6629333333333334, + "grad_norm": 0.4535086604738742, + "learning_rate": 5.394013727258254e-05, + "loss": 0.6527, + "step": 1243 + }, + { + "epoch": 0.6634666666666666, + "grad_norm": 0.40480446195046826, + "learning_rate": 5.378682303724435e-05, + "loss": 0.5772, + "step": 1244 + }, + { + "epoch": 0.664, + "grad_norm": 0.4499097332556954, + "learning_rate": 5.363364680146725e-05, + "loss": 0.661, + "step": 1245 + }, + { + "epoch": 0.6645333333333333, + "grad_norm": 0.39306632797949126, + "learning_rate": 5.348060902265871e-05, + "loss": 0.6068, + "step": 1246 + }, + { + "epoch": 0.6650666666666667, + "grad_norm": 0.44711370463148087, + "learning_rate": 5.332771015781275e-05, + "loss": 0.6795, + "step": 1247 + }, + { + "epoch": 0.6656, + "grad_norm": 0.3948332953812635, + "learning_rate": 5.31749506635086e-05, + "loss": 0.6274, + "step": 1248 + }, + { + "epoch": 0.6661333333333334, + "grad_norm": 0.4146152458688994, + "learning_rate": 5.302233099590928e-05, + "loss": 0.6675, + "step": 1249 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.37633458731901637, + "learning_rate": 5.286985161076029e-05, + "loss": 0.6474, + "step": 1250 + }, + { + "epoch": 0.6672, + "grad_norm": 0.38096296648560274, + "learning_rate": 5.271751296338823e-05, + "loss": 0.6363, + "step": 1251 + }, + { + "epoch": 0.6677333333333333, + "grad_norm": 0.451814194484857, + "learning_rate": 5.2565315508699376e-05, + "loss": 0.609, + "step": 1252 + }, + { + "epoch": 0.6682666666666667, + "grad_norm": 0.4083807036318623, + "learning_rate": 5.2413259701178505e-05, + "loss": 0.6431, + "step": 1253 + }, + { + "epoch": 0.6688, + "grad_norm": 0.4284353582157567, + "learning_rate": 5.226134599488728e-05, + "loss": 0.605, + "step": 1254 + }, + { + "epoch": 0.6693333333333333, + "grad_norm": 0.4395428406670857, + "learning_rate": 5.210957484346314e-05, + "loss": 0.6975, + "step": 1255 + }, + { + "epoch": 0.6698666666666667, + "grad_norm": 0.3854301445851761, + "learning_rate": 5.195794670011776e-05, + "loss": 0.6316, + "step": 1256 + }, + { + "epoch": 0.6704, + "grad_norm": 0.4636660907598242, + "learning_rate": 5.180646201763577e-05, + "loss": 0.6691, + "step": 1257 + }, + { + "epoch": 0.6709333333333334, + "grad_norm": 0.439135771165176, + "learning_rate": 5.165512124837344e-05, + "loss": 0.6261, + "step": 1258 + }, + { + "epoch": 0.6714666666666667, + "grad_norm": 0.40163744257019357, + "learning_rate": 5.150392484425728e-05, + "loss": 0.6819, + "step": 1259 + }, + { + "epoch": 0.672, + "grad_norm": 1.0646340907060863, + "learning_rate": 5.135287325678271e-05, + "loss": 0.7305, + "step": 1260 + }, + { + "epoch": 0.6725333333333333, + "grad_norm": 0.4462724535627756, + "learning_rate": 5.120196693701267e-05, + "loss": 0.6636, + "step": 1261 + }, + { + "epoch": 0.6730666666666667, + "grad_norm": 0.3836630584129171, + "learning_rate": 5.105120633557634e-05, + "loss": 0.6037, + "step": 1262 + }, + { + "epoch": 0.6736, + "grad_norm": 0.407531748252144, + "learning_rate": 5.090059190266779e-05, + "loss": 0.6044, + "step": 1263 + }, + { + "epoch": 0.6741333333333334, + "grad_norm": 0.38400199882654357, + "learning_rate": 5.075012408804458e-05, + "loss": 0.6967, + "step": 1264 + }, + { + "epoch": 0.6746666666666666, + "grad_norm": 0.45061107663107125, + "learning_rate": 5.059980334102637e-05, + "loss": 0.6063, + "step": 1265 + }, + { + "epoch": 0.6752, + "grad_norm": 0.4455163000859113, + "learning_rate": 5.0449630110493836e-05, + "loss": 0.6501, + "step": 1266 + }, + { + "epoch": 0.6757333333333333, + "grad_norm": 0.39898912495896044, + "learning_rate": 5.0299604844886985e-05, + "loss": 0.6516, + "step": 1267 + }, + { + "epoch": 0.6762666666666667, + "grad_norm": 0.37570427299861775, + "learning_rate": 5.014972799220403e-05, + "loss": 0.6154, + "step": 1268 + }, + { + "epoch": 0.6768, + "grad_norm": 0.40027078748491896, + "learning_rate": 5.000000000000002e-05, + "loss": 0.624, + "step": 1269 + }, + { + "epoch": 0.6773333333333333, + "grad_norm": 0.5043252218944329, + "learning_rate": 4.985042131538545e-05, + "loss": 0.7256, + "step": 1270 + }, + { + "epoch": 0.6778666666666666, + "grad_norm": 0.4508746476798115, + "learning_rate": 4.9700992385024934e-05, + "loss": 0.7035, + "step": 1271 + }, + { + "epoch": 0.6784, + "grad_norm": 0.38034510318763615, + "learning_rate": 4.955171365513603e-05, + "loss": 0.5853, + "step": 1272 + }, + { + "epoch": 0.6789333333333334, + "grad_norm": 0.41062616229759885, + "learning_rate": 4.940258557148765e-05, + "loss": 0.6216, + "step": 1273 + }, + { + "epoch": 0.6794666666666667, + "grad_norm": 0.389224894970331, + "learning_rate": 4.9253608579398855e-05, + "loss": 0.6734, + "step": 1274 + }, + { + "epoch": 0.68, + "grad_norm": 0.5504549150709207, + "learning_rate": 4.9104783123737566e-05, + "loss": 0.6572, + "step": 1275 + }, + { + "epoch": 0.6805333333333333, + "grad_norm": 0.4841059494145302, + "learning_rate": 4.895610964891923e-05, + "loss": 0.6196, + "step": 1276 + }, + { + "epoch": 0.6810666666666667, + "grad_norm": 0.3959549009580358, + "learning_rate": 4.880758859890536e-05, + "loss": 0.5784, + "step": 1277 + }, + { + "epoch": 0.6816, + "grad_norm": 0.38260545640032856, + "learning_rate": 4.865922041720239e-05, + "loss": 0.6067, + "step": 1278 + }, + { + "epoch": 0.6821333333333334, + "grad_norm": 0.33628993948453895, + "learning_rate": 4.851100554686021e-05, + "loss": 0.5741, + "step": 1279 + }, + { + "epoch": 0.6826666666666666, + "grad_norm": 0.46023486039434475, + "learning_rate": 4.836294443047088e-05, + "loss": 0.6257, + "step": 1280 + }, + { + "epoch": 0.6832, + "grad_norm": 0.4434097007127942, + "learning_rate": 4.821503751016746e-05, + "loss": 0.6798, + "step": 1281 + }, + { + "epoch": 0.6837333333333333, + "grad_norm": 0.3910731004224021, + "learning_rate": 4.8067285227622404e-05, + "loss": 0.6605, + "step": 1282 + }, + { + "epoch": 0.6842666666666667, + "grad_norm": 0.4144363824122161, + "learning_rate": 4.791968802404648e-05, + "loss": 0.6141, + "step": 1283 + }, + { + "epoch": 0.6848, + "grad_norm": 0.3688557031791854, + "learning_rate": 4.777224634018732e-05, + "loss": 0.643, + "step": 1284 + }, + { + "epoch": 0.6853333333333333, + "grad_norm": 0.4454768883445016, + "learning_rate": 4.762496061632814e-05, + "loss": 0.6393, + "step": 1285 + }, + { + "epoch": 0.6858666666666666, + "grad_norm": 0.4182386561668562, + "learning_rate": 4.747783129228656e-05, + "loss": 0.6309, + "step": 1286 + }, + { + "epoch": 0.6864, + "grad_norm": 0.46168326168634666, + "learning_rate": 4.733085880741301e-05, + "loss": 0.6586, + "step": 1287 + }, + { + "epoch": 0.6869333333333333, + "grad_norm": 0.4425894286074168, + "learning_rate": 4.718404360058966e-05, + "loss": 0.6884, + "step": 1288 + }, + { + "epoch": 0.6874666666666667, + "grad_norm": 0.39908913622004444, + "learning_rate": 4.7037386110228985e-05, + "loss": 0.7055, + "step": 1289 + }, + { + "epoch": 0.688, + "grad_norm": 0.3792121179497391, + "learning_rate": 4.689088677427249e-05, + "loss": 0.6024, + "step": 1290 + }, + { + "epoch": 0.6885333333333333, + "grad_norm": 0.5066824688175311, + "learning_rate": 4.6744546030189486e-05, + "loss": 0.6424, + "step": 1291 + }, + { + "epoch": 0.6890666666666667, + "grad_norm": 0.3692295300726596, + "learning_rate": 4.659836431497563e-05, + "loss": 0.6723, + "step": 1292 + }, + { + "epoch": 0.6896, + "grad_norm": 0.39902602821920813, + "learning_rate": 4.645234206515171e-05, + "loss": 0.6735, + "step": 1293 + }, + { + "epoch": 0.6901333333333334, + "grad_norm": 0.40162107956838344, + "learning_rate": 4.630647971676232e-05, + "loss": 0.6347, + "step": 1294 + }, + { + "epoch": 0.6906666666666667, + "grad_norm": 0.3675015493410282, + "learning_rate": 4.6160777705374524e-05, + "loss": 0.6275, + "step": 1295 + }, + { + "epoch": 0.6912, + "grad_norm": 0.46216279662287024, + "learning_rate": 4.6015236466076747e-05, + "loss": 0.6528, + "step": 1296 + }, + { + "epoch": 0.6917333333333333, + "grad_norm": 0.4298759543085148, + "learning_rate": 4.586985643347717e-05, + "loss": 0.7145, + "step": 1297 + }, + { + "epoch": 0.6922666666666667, + "grad_norm": 0.3898675773824648, + "learning_rate": 4.572463804170263e-05, + "loss": 0.6606, + "step": 1298 + }, + { + "epoch": 0.6928, + "grad_norm": 0.3856297054663061, + "learning_rate": 4.5579581724397255e-05, + "loss": 0.6189, + "step": 1299 + }, + { + "epoch": 0.6933333333333334, + "grad_norm": 0.39208089448633, + "learning_rate": 4.543468791472131e-05, + "loss": 0.6709, + "step": 1300 + }, + { + "epoch": 0.6938666666666666, + "grad_norm": 0.38246061383981467, + "learning_rate": 4.5289957045349653e-05, + "loss": 0.6356, + "step": 1301 + }, + { + "epoch": 0.6944, + "grad_norm": 0.369162140041079, + "learning_rate": 4.514538954847064e-05, + "loss": 0.6162, + "step": 1302 + }, + { + "epoch": 0.6949333333333333, + "grad_norm": 0.4032891411955045, + "learning_rate": 4.5000985855784746e-05, + "loss": 0.6242, + "step": 1303 + }, + { + "epoch": 0.6954666666666667, + "grad_norm": 0.42453013494560543, + "learning_rate": 4.485674639850333e-05, + "loss": 0.6939, + "step": 1304 + }, + { + "epoch": 0.696, + "grad_norm": 0.37344003789130603, + "learning_rate": 4.471267160734731e-05, + "loss": 0.5724, + "step": 1305 + }, + { + "epoch": 0.6965333333333333, + "grad_norm": 0.38728418385000496, + "learning_rate": 4.456876191254582e-05, + "loss": 0.6005, + "step": 1306 + }, + { + "epoch": 0.6970666666666666, + "grad_norm": 0.41380676225434393, + "learning_rate": 4.442501774383515e-05, + "loss": 0.6631, + "step": 1307 + }, + { + "epoch": 0.6976, + "grad_norm": 0.4531001232710246, + "learning_rate": 4.428143953045717e-05, + "loss": 0.6574, + "step": 1308 + }, + { + "epoch": 0.6981333333333334, + "grad_norm": 0.42079095655839355, + "learning_rate": 4.413802770115816e-05, + "loss": 0.6314, + "step": 1309 + }, + { + "epoch": 0.6986666666666667, + "grad_norm": 0.3954924432799224, + "learning_rate": 4.399478268418771e-05, + "loss": 0.6395, + "step": 1310 + }, + { + "epoch": 0.6992, + "grad_norm": 0.4647611116985209, + "learning_rate": 4.385170490729712e-05, + "loss": 0.6011, + "step": 1311 + }, + { + "epoch": 0.6997333333333333, + "grad_norm": 0.4062107993968112, + "learning_rate": 4.3708794797738375e-05, + "loss": 0.6177, + "step": 1312 + }, + { + "epoch": 0.7002666666666667, + "grad_norm": 0.39643487227860147, + "learning_rate": 4.3566052782262735e-05, + "loss": 0.6665, + "step": 1313 + }, + { + "epoch": 0.7008, + "grad_norm": 0.3740905841064871, + "learning_rate": 4.342347928711953e-05, + "loss": 0.6259, + "step": 1314 + }, + { + "epoch": 0.7013333333333334, + "grad_norm": 0.4109066238005628, + "learning_rate": 4.328107473805487e-05, + "loss": 0.5959, + "step": 1315 + }, + { + "epoch": 0.7018666666666666, + "grad_norm": 0.4002381936076298, + "learning_rate": 4.3138839560310303e-05, + "loss": 0.6319, + "step": 1316 + }, + { + "epoch": 0.7024, + "grad_norm": 0.3645989491641406, + "learning_rate": 4.2996774178621736e-05, + "loss": 0.6716, + "step": 1317 + }, + { + "epoch": 0.7029333333333333, + "grad_norm": 0.3584276941610215, + "learning_rate": 4.2854879017217894e-05, + "loss": 0.6122, + "step": 1318 + }, + { + "epoch": 0.7034666666666667, + "grad_norm": 0.4567974782395917, + "learning_rate": 4.271315449981934e-05, + "loss": 0.6032, + "step": 1319 + }, + { + "epoch": 0.704, + "grad_norm": 0.3730897467914509, + "learning_rate": 4.257160104963696e-05, + "loss": 0.6037, + "step": 1320 + }, + { + "epoch": 0.7045333333333333, + "grad_norm": 0.3578999826745592, + "learning_rate": 4.2430219089370823e-05, + "loss": 0.5955, + "step": 1321 + }, + { + "epoch": 0.7050666666666666, + "grad_norm": 0.5023846237437409, + "learning_rate": 4.228900904120895e-05, + "loss": 0.6897, + "step": 1322 + }, + { + "epoch": 0.7056, + "grad_norm": 0.43293579540195665, + "learning_rate": 4.2147971326825966e-05, + "loss": 0.6451, + "step": 1323 + }, + { + "epoch": 0.7061333333333333, + "grad_norm": 0.35556202301195655, + "learning_rate": 4.200710636738189e-05, + "loss": 0.5978, + "step": 1324 + }, + { + "epoch": 0.7066666666666667, + "grad_norm": 0.3992386181720559, + "learning_rate": 4.1866414583520877e-05, + "loss": 0.5945, + "step": 1325 + }, + { + "epoch": 0.7072, + "grad_norm": 0.40295555105868464, + "learning_rate": 4.172589639536991e-05, + "loss": 0.6423, + "step": 1326 + }, + { + "epoch": 0.7077333333333333, + "grad_norm": 0.4117021021613468, + "learning_rate": 4.158555222253771e-05, + "loss": 0.6592, + "step": 1327 + }, + { + "epoch": 0.7082666666666667, + "grad_norm": 0.3599343489939831, + "learning_rate": 4.14453824841132e-05, + "loss": 0.6306, + "step": 1328 + }, + { + "epoch": 0.7088, + "grad_norm": 0.3660990801052241, + "learning_rate": 4.130538759866457e-05, + "loss": 0.6033, + "step": 1329 + }, + { + "epoch": 0.7093333333333334, + "grad_norm": 0.3679590577933425, + "learning_rate": 4.1165567984237764e-05, + "loss": 0.5818, + "step": 1330 + }, + { + "epoch": 0.7098666666666666, + "grad_norm": 0.429818252922498, + "learning_rate": 4.102592405835536e-05, + "loss": 0.6942, + "step": 1331 + }, + { + "epoch": 0.7104, + "grad_norm": 0.3818104692821199, + "learning_rate": 4.088645623801534e-05, + "loss": 0.6015, + "step": 1332 + }, + { + "epoch": 0.7109333333333333, + "grad_norm": 0.3736973177830105, + "learning_rate": 4.074716493968975e-05, + "loss": 0.6408, + "step": 1333 + }, + { + "epoch": 0.7114666666666667, + "grad_norm": 0.4112187825282222, + "learning_rate": 4.060805057932359e-05, + "loss": 0.6375, + "step": 1334 + }, + { + "epoch": 0.712, + "grad_norm": 0.3651825655448954, + "learning_rate": 4.046911357233343e-05, + "loss": 0.6108, + "step": 1335 + }, + { + "epoch": 0.7125333333333334, + "grad_norm": 0.3597691430568277, + "learning_rate": 4.0330354333606234e-05, + "loss": 0.5925, + "step": 1336 + }, + { + "epoch": 0.7130666666666666, + "grad_norm": 0.40261787330850607, + "learning_rate": 4.019177327749822e-05, + "loss": 0.6734, + "step": 1337 + }, + { + "epoch": 0.7136, + "grad_norm": 0.42290744504416355, + "learning_rate": 4.00533708178334e-05, + "loss": 0.6564, + "step": 1338 + }, + { + "epoch": 0.7141333333333333, + "grad_norm": 0.37878361366296864, + "learning_rate": 3.991514736790258e-05, + "loss": 0.6349, + "step": 1339 + }, + { + "epoch": 0.7146666666666667, + "grad_norm": 0.3743736013061357, + "learning_rate": 3.977710334046193e-05, + "loss": 0.596, + "step": 1340 + }, + { + "epoch": 0.7152, + "grad_norm": 0.4795105204161491, + "learning_rate": 3.963923914773187e-05, + "loss": 0.7251, + "step": 1341 + }, + { + "epoch": 0.7157333333333333, + "grad_norm": 0.422585936491689, + "learning_rate": 3.950155520139581e-05, + "loss": 0.639, + "step": 1342 + }, + { + "epoch": 0.7162666666666667, + "grad_norm": 0.39752662522983573, + "learning_rate": 3.936405191259891e-05, + "loss": 0.6051, + "step": 1343 + }, + { + "epoch": 0.7168, + "grad_norm": 0.39881541592941316, + "learning_rate": 3.922672969194686e-05, + "loss": 0.6247, + "step": 1344 + }, + { + "epoch": 0.7173333333333334, + "grad_norm": 0.3922418850731424, + "learning_rate": 3.9089588949504655e-05, + "loss": 0.6036, + "step": 1345 + }, + { + "epoch": 0.7178666666666667, + "grad_norm": 0.3634862394883397, + "learning_rate": 3.895263009479534e-05, + "loss": 0.6165, + "step": 1346 + }, + { + "epoch": 0.7184, + "grad_norm": 0.35977693427692065, + "learning_rate": 3.8815853536798904e-05, + "loss": 0.6347, + "step": 1347 + }, + { + "epoch": 0.7189333333333333, + "grad_norm": 0.43728907962622665, + "learning_rate": 3.867925968395085e-05, + "loss": 0.6513, + "step": 1348 + }, + { + "epoch": 0.7194666666666667, + "grad_norm": 0.45508225007313574, + "learning_rate": 3.854284894414122e-05, + "loss": 0.6269, + "step": 1349 + }, + { + "epoch": 0.72, + "grad_norm": 0.4892633215660128, + "learning_rate": 3.840662172471315e-05, + "loss": 0.7104, + "step": 1350 + }, + { + "epoch": 0.7205333333333334, + "grad_norm": 0.35236445947495665, + "learning_rate": 3.82705784324618e-05, + "loss": 0.63, + "step": 1351 + }, + { + "epoch": 0.7210666666666666, + "grad_norm": 0.380777365620293, + "learning_rate": 3.8134719473633094e-05, + "loss": 0.6042, + "step": 1352 + }, + { + "epoch": 0.7216, + "grad_norm": 0.36454466275990527, + "learning_rate": 3.79990452539225e-05, + "loss": 0.6202, + "step": 1353 + }, + { + "epoch": 0.7221333333333333, + "grad_norm": 0.4098549804530554, + "learning_rate": 3.786355617847385e-05, + "loss": 0.6482, + "step": 1354 + }, + { + "epoch": 0.7226666666666667, + "grad_norm": 0.4588665532339407, + "learning_rate": 3.772825265187802e-05, + "loss": 0.6486, + "step": 1355 + }, + { + "epoch": 0.7232, + "grad_norm": 0.43242597005494954, + "learning_rate": 3.759313507817196e-05, + "loss": 0.6449, + "step": 1356 + }, + { + "epoch": 0.7237333333333333, + "grad_norm": 0.3678373825512149, + "learning_rate": 3.7458203860837234e-05, + "loss": 0.6247, + "step": 1357 + }, + { + "epoch": 0.7242666666666666, + "grad_norm": 0.33858758539495293, + "learning_rate": 3.732345940279893e-05, + "loss": 0.6243, + "step": 1358 + }, + { + "epoch": 0.7248, + "grad_norm": 0.4283505606342545, + "learning_rate": 3.7188902106424416e-05, + "loss": 0.6022, + "step": 1359 + }, + { + "epoch": 0.7253333333333334, + "grad_norm": 0.46383355745425225, + "learning_rate": 3.705453237352227e-05, + "loss": 0.6582, + "step": 1360 + }, + { + "epoch": 0.7258666666666667, + "grad_norm": 0.3864518214986883, + "learning_rate": 3.692035060534088e-05, + "loss": 0.6434, + "step": 1361 + }, + { + "epoch": 0.7264, + "grad_norm": 0.39866820912672296, + "learning_rate": 3.678635720256737e-05, + "loss": 0.6447, + "step": 1362 + }, + { + "epoch": 0.7269333333333333, + "grad_norm": 0.40650008293660495, + "learning_rate": 3.665255256532638e-05, + "loss": 0.6116, + "step": 1363 + }, + { + "epoch": 0.7274666666666667, + "grad_norm": 0.37924622645477385, + "learning_rate": 3.651893709317887e-05, + "loss": 0.6261, + "step": 1364 + }, + { + "epoch": 0.728, + "grad_norm": 0.4342175377696183, + "learning_rate": 3.638551118512089e-05, + "loss": 0.6537, + "step": 1365 + }, + { + "epoch": 0.7285333333333334, + "grad_norm": 0.4005682284224154, + "learning_rate": 3.625227523958252e-05, + "loss": 0.6126, + "step": 1366 + }, + { + "epoch": 0.7290666666666666, + "grad_norm": 0.3888776684269537, + "learning_rate": 3.611922965442648e-05, + "loss": 0.6268, + "step": 1367 + }, + { + "epoch": 0.7296, + "grad_norm": 0.41947214703791696, + "learning_rate": 3.5986374826947066e-05, + "loss": 0.6733, + "step": 1368 + }, + { + "epoch": 0.7301333333333333, + "grad_norm": 0.4617847420212906, + "learning_rate": 3.5853711153868965e-05, + "loss": 0.6783, + "step": 1369 + }, + { + "epoch": 0.7306666666666667, + "grad_norm": 0.37886265475351183, + "learning_rate": 3.5721239031346066e-05, + "loss": 0.66, + "step": 1370 + }, + { + "epoch": 0.7312, + "grad_norm": 0.3562473191738787, + "learning_rate": 3.558895885496023e-05, + "loss": 0.6021, + "step": 1371 + }, + { + "epoch": 0.7317333333333333, + "grad_norm": 0.483350601891624, + "learning_rate": 3.545687101972013e-05, + "loss": 0.7105, + "step": 1372 + }, + { + "epoch": 0.7322666666666666, + "grad_norm": 0.36425377392262603, + "learning_rate": 3.53249759200601e-05, + "loss": 0.627, + "step": 1373 + }, + { + "epoch": 0.7328, + "grad_norm": 0.42862663832817294, + "learning_rate": 3.519327394983888e-05, + "loss": 0.638, + "step": 1374 + }, + { + "epoch": 0.7333333333333333, + "grad_norm": 0.41118852381669363, + "learning_rate": 3.506176550233863e-05, + "loss": 0.6834, + "step": 1375 + }, + { + "epoch": 0.7338666666666667, + "grad_norm": 0.3805524696767454, + "learning_rate": 3.4930450970263485e-05, + "loss": 0.6736, + "step": 1376 + }, + { + "epoch": 0.7344, + "grad_norm": 0.5077091862940181, + "learning_rate": 3.479933074573858e-05, + "loss": 0.6506, + "step": 1377 + }, + { + "epoch": 0.7349333333333333, + "grad_norm": 0.4306152496489916, + "learning_rate": 3.46684052203088e-05, + "loss": 0.6736, + "step": 1378 + }, + { + "epoch": 0.7354666666666667, + "grad_norm": 0.3678045610641948, + "learning_rate": 3.4537674784937614e-05, + "loss": 0.6492, + "step": 1379 + }, + { + "epoch": 0.736, + "grad_norm": 0.39191865227661193, + "learning_rate": 3.440713983000601e-05, + "loss": 0.6401, + "step": 1380 + }, + { + "epoch": 0.7365333333333334, + "grad_norm": 0.3670367608752555, + "learning_rate": 3.427680074531113e-05, + "loss": 0.632, + "step": 1381 + }, + { + "epoch": 0.7370666666666666, + "grad_norm": 0.4252473651270437, + "learning_rate": 3.4146657920065285e-05, + "loss": 0.6437, + "step": 1382 + }, + { + "epoch": 0.7376, + "grad_norm": 0.45318004841032933, + "learning_rate": 3.401671174289469e-05, + "loss": 0.5808, + "step": 1383 + }, + { + "epoch": 0.7381333333333333, + "grad_norm": 0.5098946265869971, + "learning_rate": 3.388696260183832e-05, + "loss": 0.676, + "step": 1384 + }, + { + "epoch": 0.7386666666666667, + "grad_norm": 0.37726528836211715, + "learning_rate": 3.3757410884346894e-05, + "loss": 0.6112, + "step": 1385 + }, + { + "epoch": 0.7392, + "grad_norm": 0.36393333240005293, + "learning_rate": 3.362805697728145e-05, + "loss": 0.5759, + "step": 1386 + }, + { + "epoch": 0.7397333333333334, + "grad_norm": 0.35515819390154585, + "learning_rate": 3.3498901266912396e-05, + "loss": 0.6279, + "step": 1387 + }, + { + "epoch": 0.7402666666666666, + "grad_norm": 0.6485146562541209, + "learning_rate": 3.336994413891828e-05, + "loss": 0.5874, + "step": 1388 + }, + { + "epoch": 0.7408, + "grad_norm": 0.5288235736362663, + "learning_rate": 3.324118597838464e-05, + "loss": 0.722, + "step": 1389 + }, + { + "epoch": 0.7413333333333333, + "grad_norm": 0.477940679451871, + "learning_rate": 3.3112627169802946e-05, + "loss": 0.6631, + "step": 1390 + }, + { + "epoch": 0.7418666666666667, + "grad_norm": 0.42816101605739654, + "learning_rate": 3.298426809706928e-05, + "loss": 0.6371, + "step": 1391 + }, + { + "epoch": 0.7424, + "grad_norm": 0.385614118024758, + "learning_rate": 3.285610914348332e-05, + "loss": 0.6454, + "step": 1392 + }, + { + "epoch": 0.7429333333333333, + "grad_norm": 0.4216743611785189, + "learning_rate": 3.2728150691747115e-05, + "loss": 0.6432, + "step": 1393 + }, + { + "epoch": 0.7434666666666667, + "grad_norm": 0.3854726314225563, + "learning_rate": 3.2600393123964113e-05, + "loss": 0.6107, + "step": 1394 + }, + { + "epoch": 0.744, + "grad_norm": 0.36890568603266516, + "learning_rate": 3.2472836821637744e-05, + "loss": 0.6186, + "step": 1395 + }, + { + "epoch": 0.7445333333333334, + "grad_norm": 0.41984084864865084, + "learning_rate": 3.234548216567049e-05, + "loss": 0.6838, + "step": 1396 + }, + { + "epoch": 0.7450666666666667, + "grad_norm": 0.42672354262943574, + "learning_rate": 3.2218329536362704e-05, + "loss": 0.6472, + "step": 1397 + }, + { + "epoch": 0.7456, + "grad_norm": 0.43571153242149063, + "learning_rate": 3.209137931341143e-05, + "loss": 0.6476, + "step": 1398 + }, + { + "epoch": 0.7461333333333333, + "grad_norm": 0.47946532001055614, + "learning_rate": 3.196463187590929e-05, + "loss": 0.6546, + "step": 1399 + }, + { + "epoch": 0.7466666666666667, + "grad_norm": 0.44619839471848427, + "learning_rate": 3.1838087602343344e-05, + "loss": 0.6947, + "step": 1400 + }, + { + "epoch": 0.7472, + "grad_norm": 0.35869339050145127, + "learning_rate": 3.1711746870594086e-05, + "loss": 0.6402, + "step": 1401 + }, + { + "epoch": 0.7477333333333334, + "grad_norm": 0.3846343282936006, + "learning_rate": 3.158561005793402e-05, + "loss": 0.635, + "step": 1402 + }, + { + "epoch": 0.7482666666666666, + "grad_norm": 0.3822661388073339, + "learning_rate": 3.145967754102691e-05, + "loss": 0.6331, + "step": 1403 + }, + { + "epoch": 0.7488, + "grad_norm": 0.4087421275219027, + "learning_rate": 3.1333949695926324e-05, + "loss": 0.63, + "step": 1404 + }, + { + "epoch": 0.7493333333333333, + "grad_norm": 0.38026134208371226, + "learning_rate": 3.120842689807468e-05, + "loss": 0.6017, + "step": 1405 + }, + { + "epoch": 0.7498666666666667, + "grad_norm": 0.44220657991710305, + "learning_rate": 3.108310952230212e-05, + "loss": 0.6339, + "step": 1406 + }, + { + "epoch": 0.7504, + "grad_norm": 0.39305850071696574, + "learning_rate": 3.0957997942825336e-05, + "loss": 0.6463, + "step": 1407 + }, + { + "epoch": 0.7509333333333333, + "grad_norm": 0.4115319755731726, + "learning_rate": 3.083309253324651e-05, + "loss": 0.6387, + "step": 1408 + }, + { + "epoch": 0.7514666666666666, + "grad_norm": 0.4434644719318613, + "learning_rate": 3.070839366655215e-05, + "loss": 0.6322, + "step": 1409 + }, + { + "epoch": 0.752, + "grad_norm": 0.39059993149387545, + "learning_rate": 3.058390171511196e-05, + "loss": 0.6084, + "step": 1410 + }, + { + "epoch": 0.7525333333333334, + "grad_norm": 0.45522559328114276, + "learning_rate": 3.0459617050677868e-05, + "loss": 0.675, + "step": 1411 + }, + { + "epoch": 0.7530666666666667, + "grad_norm": 0.40345951732394286, + "learning_rate": 3.0335540044382694e-05, + "loss": 0.6051, + "step": 1412 + }, + { + "epoch": 0.7536, + "grad_norm": 0.38291898115365075, + "learning_rate": 3.021167106673928e-05, + "loss": 0.575, + "step": 1413 + }, + { + "epoch": 0.7541333333333333, + "grad_norm": 0.4009439451613568, + "learning_rate": 3.008801048763914e-05, + "loss": 0.6205, + "step": 1414 + }, + { + "epoch": 0.7546666666666667, + "grad_norm": 0.42740356313299444, + "learning_rate": 2.996455867635155e-05, + "loss": 0.5966, + "step": 1415 + }, + { + "epoch": 0.7552, + "grad_norm": 0.42203414312961285, + "learning_rate": 2.9841316001522347e-05, + "loss": 0.615, + "step": 1416 + }, + { + "epoch": 0.7557333333333334, + "grad_norm": 0.37462441142482394, + "learning_rate": 2.9718282831172883e-05, + "loss": 0.6232, + "step": 1417 + }, + { + "epoch": 0.7562666666666666, + "grad_norm": 0.4453257424581649, + "learning_rate": 2.9595459532698854e-05, + "loss": 0.5901, + "step": 1418 + }, + { + "epoch": 0.7568, + "grad_norm": 0.4497703092124468, + "learning_rate": 2.9472846472869298e-05, + "loss": 0.6283, + "step": 1419 + }, + { + "epoch": 0.7573333333333333, + "grad_norm": 0.4506692794303581, + "learning_rate": 2.9350444017825385e-05, + "loss": 0.6499, + "step": 1420 + }, + { + "epoch": 0.7578666666666667, + "grad_norm": 0.4163203546144899, + "learning_rate": 2.922825253307947e-05, + "loss": 0.6459, + "step": 1421 + }, + { + "epoch": 0.7584, + "grad_norm": 0.399255167598255, + "learning_rate": 2.9106272383513835e-05, + "loss": 0.6229, + "step": 1422 + }, + { + "epoch": 0.7589333333333333, + "grad_norm": 0.455231939617007, + "learning_rate": 2.898450393337977e-05, + "loss": 0.6895, + "step": 1423 + }, + { + "epoch": 0.7594666666666666, + "grad_norm": 0.4308466172348704, + "learning_rate": 2.8862947546296315e-05, + "loss": 0.6152, + "step": 1424 + }, + { + "epoch": 0.76, + "grad_norm": 0.42234717675112027, + "learning_rate": 2.874160358524931e-05, + "loss": 0.613, + "step": 1425 + }, + { + "epoch": 0.7605333333333333, + "grad_norm": 0.40401344881413, + "learning_rate": 2.8620472412590228e-05, + "loss": 0.5976, + "step": 1426 + }, + { + "epoch": 0.7610666666666667, + "grad_norm": 0.453079034843873, + "learning_rate": 2.8499554390035143e-05, + "loss": 0.6798, + "step": 1427 + }, + { + "epoch": 0.7616, + "grad_norm": 0.39168209780175084, + "learning_rate": 2.8378849878663628e-05, + "loss": 0.5983, + "step": 1428 + }, + { + "epoch": 0.7621333333333333, + "grad_norm": 0.4641371617302695, + "learning_rate": 2.8258359238917665e-05, + "loss": 0.6513, + "step": 1429 + }, + { + "epoch": 0.7626666666666667, + "grad_norm": 0.4854567373155207, + "learning_rate": 2.8138082830600554e-05, + "loss": 0.6578, + "step": 1430 + }, + { + "epoch": 0.7632, + "grad_norm": 0.3946896301102641, + "learning_rate": 2.8018021012875994e-05, + "loss": 0.6225, + "step": 1431 + }, + { + "epoch": 0.7637333333333334, + "grad_norm": 0.3871040364117289, + "learning_rate": 2.7898174144266732e-05, + "loss": 0.7264, + "step": 1432 + }, + { + "epoch": 0.7642666666666666, + "grad_norm": 0.5499109389426425, + "learning_rate": 2.7778542582653744e-05, + "loss": 0.7151, + "step": 1433 + }, + { + "epoch": 0.7648, + "grad_norm": 0.352017631785061, + "learning_rate": 2.7659126685275027e-05, + "loss": 0.5777, + "step": 1434 + }, + { + "epoch": 0.7653333333333333, + "grad_norm": 0.39393949674624945, + "learning_rate": 2.753992680872457e-05, + "loss": 0.6065, + "step": 1435 + }, + { + "epoch": 0.7658666666666667, + "grad_norm": 0.4445275221573401, + "learning_rate": 2.7420943308951284e-05, + "loss": 0.6938, + "step": 1436 + }, + { + "epoch": 0.7664, + "grad_norm": 0.4912464243057575, + "learning_rate": 2.7302176541257986e-05, + "loss": 0.6495, + "step": 1437 + }, + { + "epoch": 0.7669333333333334, + "grad_norm": 0.40632202592724853, + "learning_rate": 2.7183626860300247e-05, + "loss": 0.5698, + "step": 1438 + }, + { + "epoch": 0.7674666666666666, + "grad_norm": 0.4169507683934783, + "learning_rate": 2.7065294620085424e-05, + "loss": 0.6843, + "step": 1439 + }, + { + "epoch": 0.768, + "grad_norm": 0.4887464175157345, + "learning_rate": 2.6947180173971508e-05, + "loss": 0.5975, + "step": 1440 + }, + { + "epoch": 0.7685333333333333, + "grad_norm": 0.38067928789356154, + "learning_rate": 2.6829283874666233e-05, + "loss": 0.6141, + "step": 1441 + }, + { + "epoch": 0.7690666666666667, + "grad_norm": 0.38185216981431297, + "learning_rate": 2.6711606074225782e-05, + "loss": 0.5865, + "step": 1442 + }, + { + "epoch": 0.7696, + "grad_norm": 0.3978552826513095, + "learning_rate": 2.659414712405398e-05, + "loss": 0.6347, + "step": 1443 + }, + { + "epoch": 0.7701333333333333, + "grad_norm": 0.41171428006598065, + "learning_rate": 2.647690737490106e-05, + "loss": 0.6522, + "step": 1444 + }, + { + "epoch": 0.7706666666666667, + "grad_norm": 0.44946607999593247, + "learning_rate": 2.6359887176862718e-05, + "loss": 0.6652, + "step": 1445 + }, + { + "epoch": 0.7712, + "grad_norm": 0.4154439285961479, + "learning_rate": 2.6243086879379e-05, + "loss": 0.6861, + "step": 1446 + }, + { + "epoch": 0.7717333333333334, + "grad_norm": 0.41163727461473826, + "learning_rate": 2.6126506831233344e-05, + "loss": 0.6541, + "step": 1447 + }, + { + "epoch": 0.7722666666666667, + "grad_norm": 0.3964474748802533, + "learning_rate": 2.6010147380551475e-05, + "loss": 0.6496, + "step": 1448 + }, + { + "epoch": 0.7728, + "grad_norm": 0.44965777952677877, + "learning_rate": 2.5894008874800325e-05, + "loss": 0.551, + "step": 1449 + }, + { + "epoch": 0.7733333333333333, + "grad_norm": 0.4218007844873207, + "learning_rate": 2.577809166078716e-05, + "loss": 0.6031, + "step": 1450 + }, + { + "epoch": 0.7738666666666667, + "grad_norm": 0.3798275175729113, + "learning_rate": 2.566239608465838e-05, + "loss": 0.6004, + "step": 1451 + }, + { + "epoch": 0.7744, + "grad_norm": 0.4987480211561112, + "learning_rate": 2.5546922491898495e-05, + "loss": 0.6531, + "step": 1452 + }, + { + "epoch": 0.7749333333333334, + "grad_norm": 0.44030873475463034, + "learning_rate": 2.543167122732918e-05, + "loss": 0.6322, + "step": 1453 + }, + { + "epoch": 0.7754666666666666, + "grad_norm": 0.39592177686674895, + "learning_rate": 2.5316642635108244e-05, + "loss": 0.5608, + "step": 1454 + }, + { + "epoch": 0.776, + "grad_norm": 0.4296304690593407, + "learning_rate": 2.5201837058728505e-05, + "loss": 0.6666, + "step": 1455 + }, + { + "epoch": 0.7765333333333333, + "grad_norm": 0.3440283545512249, + "learning_rate": 2.508725484101684e-05, + "loss": 0.6247, + "step": 1456 + }, + { + "epoch": 0.7770666666666667, + "grad_norm": 0.39678078164363756, + "learning_rate": 2.4972896324133144e-05, + "loss": 0.6011, + "step": 1457 + }, + { + "epoch": 0.7776, + "grad_norm": 0.41787884337990133, + "learning_rate": 2.485876184956928e-05, + "loss": 0.6511, + "step": 1458 + }, + { + "epoch": 0.7781333333333333, + "grad_norm": 0.4373438589162138, + "learning_rate": 2.4744851758148156e-05, + "loss": 0.62, + "step": 1459 + }, + { + "epoch": 0.7786666666666666, + "grad_norm": 0.4285871794267623, + "learning_rate": 2.4631166390022574e-05, + "loss": 0.6149, + "step": 1460 + }, + { + "epoch": 0.7792, + "grad_norm": 0.39194465346106083, + "learning_rate": 2.451770608467432e-05, + "loss": 0.6221, + "step": 1461 + }, + { + "epoch": 0.7797333333333333, + "grad_norm": 0.49852232175056294, + "learning_rate": 2.4404471180913058e-05, + "loss": 0.6959, + "step": 1462 + }, + { + "epoch": 0.7802666666666667, + "grad_norm": 0.49015137819027743, + "learning_rate": 2.429146201687538e-05, + "loss": 0.6071, + "step": 1463 + }, + { + "epoch": 0.7808, + "grad_norm": 0.4561759156549268, + "learning_rate": 2.417867893002387e-05, + "loss": 0.6924, + "step": 1464 + }, + { + "epoch": 0.7813333333333333, + "grad_norm": 0.40457984813967485, + "learning_rate": 2.4066122257145894e-05, + "loss": 0.6405, + "step": 1465 + }, + { + "epoch": 0.7818666666666667, + "grad_norm": 0.4552863080098185, + "learning_rate": 2.3953792334352787e-05, + "loss": 0.6249, + "step": 1466 + }, + { + "epoch": 0.7824, + "grad_norm": 0.4045385459595967, + "learning_rate": 2.3841689497078746e-05, + "loss": 0.6344, + "step": 1467 + }, + { + "epoch": 0.7829333333333334, + "grad_norm": 0.4872630042726612, + "learning_rate": 2.3729814080079816e-05, + "loss": 0.6896, + "step": 1468 + }, + { + "epoch": 0.7834666666666666, + "grad_norm": 0.39672074889756576, + "learning_rate": 2.361816641743303e-05, + "loss": 0.5996, + "step": 1469 + }, + { + "epoch": 0.784, + "grad_norm": 0.450988094347731, + "learning_rate": 2.3506746842535242e-05, + "loss": 0.6091, + "step": 1470 + }, + { + "epoch": 0.7845333333333333, + "grad_norm": 0.44983794037069097, + "learning_rate": 2.339555568810221e-05, + "loss": 0.6867, + "step": 1471 + }, + { + "epoch": 0.7850666666666667, + "grad_norm": 0.41541069931669183, + "learning_rate": 2.328459328616759e-05, + "loss": 0.6059, + "step": 1472 + }, + { + "epoch": 0.7856, + "grad_norm": 0.42946789211648667, + "learning_rate": 2.3173859968081944e-05, + "loss": 0.6451, + "step": 1473 + }, + { + "epoch": 0.7861333333333334, + "grad_norm": 0.43525600732114966, + "learning_rate": 2.306335606451181e-05, + "loss": 0.5966, + "step": 1474 + }, + { + "epoch": 0.7866666666666666, + "grad_norm": 0.437918783066002, + "learning_rate": 2.295308190543859e-05, + "loss": 0.6068, + "step": 1475 + }, + { + "epoch": 0.7872, + "grad_norm": 0.4314902293258785, + "learning_rate": 2.2843037820157675e-05, + "loss": 0.677, + "step": 1476 + }, + { + "epoch": 0.7877333333333333, + "grad_norm": 1.1126329635651677, + "learning_rate": 2.2733224137277366e-05, + "loss": 0.6036, + "step": 1477 + }, + { + "epoch": 0.7882666666666667, + "grad_norm": 0.45498482196176615, + "learning_rate": 2.262364118471805e-05, + "loss": 0.6571, + "step": 1478 + }, + { + "epoch": 0.7888, + "grad_norm": 0.4298552259283547, + "learning_rate": 2.251428928971102e-05, + "loss": 0.6105, + "step": 1479 + }, + { + "epoch": 0.7893333333333333, + "grad_norm": 0.4730774388482903, + "learning_rate": 2.2405168778797646e-05, + "loss": 0.7216, + "step": 1480 + }, + { + "epoch": 0.7898666666666667, + "grad_norm": 0.3903834882469152, + "learning_rate": 2.2296279977828337e-05, + "loss": 0.5768, + "step": 1481 + }, + { + "epoch": 0.7904, + "grad_norm": 0.4339246885401077, + "learning_rate": 2.2187623211961562e-05, + "loss": 0.6352, + "step": 1482 + }, + { + "epoch": 0.7909333333333334, + "grad_norm": 0.3881044724903212, + "learning_rate": 2.2079198805662914e-05, + "loss": 0.6078, + "step": 1483 + }, + { + "epoch": 0.7914666666666667, + "grad_norm": 0.3557673839680774, + "learning_rate": 2.1971007082704164e-05, + "loss": 0.5902, + "step": 1484 + }, + { + "epoch": 0.792, + "grad_norm": 0.4352593345765754, + "learning_rate": 2.1863048366162208e-05, + "loss": 0.5597, + "step": 1485 + }, + { + "epoch": 0.7925333333333333, + "grad_norm": 0.43357196154806343, + "learning_rate": 2.1755322978418137e-05, + "loss": 0.6785, + "step": 1486 + }, + { + "epoch": 0.7930666666666667, + "grad_norm": 0.3669378366091731, + "learning_rate": 2.1647831241156302e-05, + "loss": 0.6085, + "step": 1487 + }, + { + "epoch": 0.7936, + "grad_norm": 0.4412473289705872, + "learning_rate": 2.1540573475363402e-05, + "loss": 0.5943, + "step": 1488 + }, + { + "epoch": 0.7941333333333334, + "grad_norm": 0.5208888595319536, + "learning_rate": 2.1433550001327373e-05, + "loss": 0.6504, + "step": 1489 + }, + { + "epoch": 0.7946666666666666, + "grad_norm": 0.4323005556458796, + "learning_rate": 2.1326761138636553e-05, + "loss": 0.6607, + "step": 1490 + }, + { + "epoch": 0.7952, + "grad_norm": 0.4108862936123872, + "learning_rate": 2.1220207206178688e-05, + "loss": 0.6142, + "step": 1491 + }, + { + "epoch": 0.7957333333333333, + "grad_norm": 0.4346260375930054, + "learning_rate": 2.111388852214001e-05, + "loss": 0.64, + "step": 1492 + }, + { + "epoch": 0.7962666666666667, + "grad_norm": 0.40308813177940356, + "learning_rate": 2.1007805404004242e-05, + "loss": 0.5921, + "step": 1493 + }, + { + "epoch": 0.7968, + "grad_norm": 0.589901272143973, + "learning_rate": 2.0901958168551638e-05, + "loss": 0.7345, + "step": 1494 + }, + { + "epoch": 0.7973333333333333, + "grad_norm": 0.5024783982715955, + "learning_rate": 2.0796347131858186e-05, + "loss": 0.6399, + "step": 1495 + }, + { + "epoch": 0.7978666666666666, + "grad_norm": 0.39008869591246187, + "learning_rate": 2.069097260929439e-05, + "loss": 0.5918, + "step": 1496 + }, + { + "epoch": 0.7984, + "grad_norm": 0.38403357760120854, + "learning_rate": 2.058583491552465e-05, + "loss": 0.6283, + "step": 1497 + }, + { + "epoch": 0.7989333333333334, + "grad_norm": 0.3703153151685138, + "learning_rate": 2.048093436450603e-05, + "loss": 0.5995, + "step": 1498 + }, + { + "epoch": 0.7994666666666667, + "grad_norm": 0.43898842684286316, + "learning_rate": 2.0376271269487514e-05, + "loss": 0.6952, + "step": 1499 + }, + { + "epoch": 0.8, + "grad_norm": 0.3947244506684329, + "learning_rate": 2.027184594300898e-05, + "loss": 0.6212, + "step": 1500 + }, + { + "epoch": 0.8005333333333333, + "grad_norm": 0.4670396325093309, + "learning_rate": 2.0167658696900317e-05, + "loss": 0.6289, + "step": 1501 + }, + { + "epoch": 0.8010666666666667, + "grad_norm": 0.4422436470698676, + "learning_rate": 2.0063709842280432e-05, + "loss": 0.6243, + "step": 1502 + }, + { + "epoch": 0.8016, + "grad_norm": 0.40660022590354994, + "learning_rate": 1.995999968955641e-05, + "loss": 0.6212, + "step": 1503 + }, + { + "epoch": 0.8021333333333334, + "grad_norm": 0.4123894521046528, + "learning_rate": 1.985652854842247e-05, + "loss": 0.6577, + "step": 1504 + }, + { + "epoch": 0.8026666666666666, + "grad_norm": 0.3771549277141085, + "learning_rate": 1.9753296727859195e-05, + "loss": 0.6173, + "step": 1505 + }, + { + "epoch": 0.8032, + "grad_norm": 0.3849243912963897, + "learning_rate": 1.9650304536132426e-05, + "loss": 0.6123, + "step": 1506 + }, + { + "epoch": 0.8037333333333333, + "grad_norm": 0.4457368375092948, + "learning_rate": 1.9547552280792524e-05, + "loss": 0.6922, + "step": 1507 + }, + { + "epoch": 0.8042666666666667, + "grad_norm": 0.4284937802894128, + "learning_rate": 1.9445040268673298e-05, + "loss": 0.5886, + "step": 1508 + }, + { + "epoch": 0.8048, + "grad_norm": 0.43580633967681354, + "learning_rate": 1.9342768805891178e-05, + "loss": 0.6229, + "step": 1509 + }, + { + "epoch": 0.8053333333333333, + "grad_norm": 0.3753516553746708, + "learning_rate": 1.9240738197844278e-05, + "loss": 0.6282, + "step": 1510 + }, + { + "epoch": 0.8058666666666666, + "grad_norm": 0.3901069216414992, + "learning_rate": 1.9138948749211472e-05, + "loss": 0.6321, + "step": 1511 + }, + { + "epoch": 0.8064, + "grad_norm": 0.3808570126535211, + "learning_rate": 1.903740076395151e-05, + "loss": 0.5877, + "step": 1512 + }, + { + "epoch": 0.8069333333333333, + "grad_norm": 0.3889992683662418, + "learning_rate": 1.8936094545302095e-05, + "loss": 0.6821, + "step": 1513 + }, + { + "epoch": 0.8074666666666667, + "grad_norm": 0.5578844893113173, + "learning_rate": 1.883503039577894e-05, + "loss": 0.74, + "step": 1514 + }, + { + "epoch": 0.808, + "grad_norm": 0.39448407277793424, + "learning_rate": 1.8734208617174988e-05, + "loss": 0.6496, + "step": 1515 + }, + { + "epoch": 0.8085333333333333, + "grad_norm": 0.43531248342604767, + "learning_rate": 1.8633629510559314e-05, + "loss": 0.6313, + "step": 1516 + }, + { + "epoch": 0.8090666666666667, + "grad_norm": 0.3958844767624935, + "learning_rate": 1.8533293376276472e-05, + "loss": 0.6081, + "step": 1517 + }, + { + "epoch": 0.8096, + "grad_norm": 0.41290102969058634, + "learning_rate": 1.8433200513945337e-05, + "loss": 0.6707, + "step": 1518 + }, + { + "epoch": 0.8101333333333334, + "grad_norm": 0.49966471827528597, + "learning_rate": 1.8333351222458407e-05, + "loss": 0.6603, + "step": 1519 + }, + { + "epoch": 0.8106666666666666, + "grad_norm": 0.41459538303684473, + "learning_rate": 1.8233745799980817e-05, + "loss": 0.6364, + "step": 1520 + }, + { + "epoch": 0.8112, + "grad_norm": 0.46718220105261027, + "learning_rate": 1.8134384543949478e-05, + "loss": 0.653, + "step": 1521 + }, + { + "epoch": 0.8117333333333333, + "grad_norm": 0.4228244529956391, + "learning_rate": 1.803526775107217e-05, + "loss": 0.6597, + "step": 1522 + }, + { + "epoch": 0.8122666666666667, + "grad_norm": 0.40558898963210477, + "learning_rate": 1.7936395717326704e-05, + "loss": 0.6829, + "step": 1523 + }, + { + "epoch": 0.8128, + "grad_norm": 0.4137737033146253, + "learning_rate": 1.783776873795994e-05, + "loss": 0.6164, + "step": 1524 + }, + { + "epoch": 0.8133333333333334, + "grad_norm": 0.41323312383960475, + "learning_rate": 1.773938710748706e-05, + "loss": 0.6567, + "step": 1525 + }, + { + "epoch": 0.8138666666666666, + "grad_norm": 0.38724609494835577, + "learning_rate": 1.7641251119690505e-05, + "loss": 0.5872, + "step": 1526 + }, + { + "epoch": 0.8144, + "grad_norm": 0.4023325194626976, + "learning_rate": 1.754336106761927e-05, + "loss": 0.6137, + "step": 1527 + }, + { + "epoch": 0.8149333333333333, + "grad_norm": 0.4319165152042328, + "learning_rate": 1.744571724358789e-05, + "loss": 0.6258, + "step": 1528 + }, + { + "epoch": 0.8154666666666667, + "grad_norm": 0.38427160535412597, + "learning_rate": 1.7348319939175637e-05, + "loss": 0.6338, + "step": 1529 + }, + { + "epoch": 0.816, + "grad_norm": 0.41824609399104956, + "learning_rate": 1.7251169445225657e-05, + "loss": 0.6524, + "step": 1530 + }, + { + "epoch": 0.8165333333333333, + "grad_norm": 0.3643343854443445, + "learning_rate": 1.715426605184407e-05, + "loss": 0.6074, + "step": 1531 + }, + { + "epoch": 0.8170666666666667, + "grad_norm": 0.37863486863529766, + "learning_rate": 1.705761004839911e-05, + "loss": 0.6208, + "step": 1532 + }, + { + "epoch": 0.8176, + "grad_norm": 0.40498774926576103, + "learning_rate": 1.696120172352025e-05, + "loss": 0.6275, + "step": 1533 + }, + { + "epoch": 0.8181333333333334, + "grad_norm": 0.4024760095065096, + "learning_rate": 1.6865041365097435e-05, + "loss": 0.6499, + "step": 1534 + }, + { + "epoch": 0.8186666666666667, + "grad_norm": 0.4931732634039112, + "learning_rate": 1.676912926028007e-05, + "loss": 0.5824, + "step": 1535 + }, + { + "epoch": 0.8192, + "grad_norm": 0.3812008164178614, + "learning_rate": 1.6673465695476232e-05, + "loss": 0.6607, + "step": 1536 + }, + { + "epoch": 0.8197333333333333, + "grad_norm": 0.47719901456654096, + "learning_rate": 1.6578050956351886e-05, + "loss": 0.6999, + "step": 1537 + }, + { + "epoch": 0.8202666666666667, + "grad_norm": 0.45637998849936456, + "learning_rate": 1.6482885327829913e-05, + "loss": 0.703, + "step": 1538 + }, + { + "epoch": 0.8208, + "grad_norm": 0.39587011423510193, + "learning_rate": 1.6387969094089316e-05, + "loss": 0.6047, + "step": 1539 + }, + { + "epoch": 0.8213333333333334, + "grad_norm": 0.3739121533561166, + "learning_rate": 1.6293302538564382e-05, + "loss": 0.6546, + "step": 1540 + }, + { + "epoch": 0.8218666666666666, + "grad_norm": 0.44386622402899223, + "learning_rate": 1.619888594394382e-05, + "loss": 0.6553, + "step": 1541 + }, + { + "epoch": 0.8224, + "grad_norm": 0.5140460706419457, + "learning_rate": 1.6104719592169902e-05, + "loss": 0.6769, + "step": 1542 + }, + { + "epoch": 0.8229333333333333, + "grad_norm": 0.39203232423336304, + "learning_rate": 1.601080376443763e-05, + "loss": 0.6486, + "step": 1543 + }, + { + "epoch": 0.8234666666666667, + "grad_norm": 0.4117622174758577, + "learning_rate": 1.5917138741193973e-05, + "loss": 0.5995, + "step": 1544 + }, + { + "epoch": 0.824, + "grad_norm": 0.40390414816218734, + "learning_rate": 1.5823724802136865e-05, + "loss": 0.6332, + "step": 1545 + }, + { + "epoch": 0.8245333333333333, + "grad_norm": 0.41077549636484323, + "learning_rate": 1.573056222621453e-05, + "loss": 0.6146, + "step": 1546 + }, + { + "epoch": 0.8250666666666666, + "grad_norm": 0.4676051434023212, + "learning_rate": 1.5637651291624523e-05, + "loss": 0.6488, + "step": 1547 + }, + { + "epoch": 0.8256, + "grad_norm": 0.3337707882907331, + "learning_rate": 1.5544992275813053e-05, + "loss": 0.5979, + "step": 1548 + }, + { + "epoch": 0.8261333333333334, + "grad_norm": 0.38180335470696575, + "learning_rate": 1.5452585455473977e-05, + "loss": 0.5706, + "step": 1549 + }, + { + "epoch": 0.8266666666666667, + "grad_norm": 0.37409126572075707, + "learning_rate": 1.536043110654809e-05, + "loss": 0.6201, + "step": 1550 + }, + { + "epoch": 0.8272, + "grad_norm": 0.41119546624395015, + "learning_rate": 1.526852950422226e-05, + "loss": 0.6301, + "step": 1551 + }, + { + "epoch": 0.8277333333333333, + "grad_norm": 0.47312142410018665, + "learning_rate": 1.5176880922928616e-05, + "loss": 0.6679, + "step": 1552 + }, + { + "epoch": 0.8282666666666667, + "grad_norm": 0.3972626708375532, + "learning_rate": 1.5085485636343755e-05, + "loss": 0.5996, + "step": 1553 + }, + { + "epoch": 0.8288, + "grad_norm": 0.39227793537484773, + "learning_rate": 1.4994343917387854e-05, + "loss": 0.6321, + "step": 1554 + }, + { + "epoch": 0.8293333333333334, + "grad_norm": 0.4076798390203009, + "learning_rate": 1.4903456038223939e-05, + "loss": 0.624, + "step": 1555 + }, + { + "epoch": 0.8298666666666666, + "grad_norm": 0.41269879748749927, + "learning_rate": 1.4812822270257009e-05, + "loss": 0.6039, + "step": 1556 + }, + { + "epoch": 0.8304, + "grad_norm": 0.7960448041645097, + "learning_rate": 1.4722442884133214e-05, + "loss": 0.6188, + "step": 1557 + }, + { + "epoch": 0.8309333333333333, + "grad_norm": 0.4101254671403745, + "learning_rate": 1.4632318149739177e-05, + "loss": 0.5887, + "step": 1558 + }, + { + "epoch": 0.8314666666666667, + "grad_norm": 0.438443003520308, + "learning_rate": 1.454244833620102e-05, + "loss": 0.6455, + "step": 1559 + }, + { + "epoch": 0.832, + "grad_norm": 0.39712667151133474, + "learning_rate": 1.4452833711883628e-05, + "loss": 0.6158, + "step": 1560 + }, + { + "epoch": 0.8325333333333333, + "grad_norm": 0.36115816596213624, + "learning_rate": 1.4363474544389877e-05, + "loss": 0.5585, + "step": 1561 + }, + { + "epoch": 0.8330666666666666, + "grad_norm": 0.4329200804299213, + "learning_rate": 1.4274371100559791e-05, + "loss": 0.6209, + "step": 1562 + }, + { + "epoch": 0.8336, + "grad_norm": 0.4193382300021258, + "learning_rate": 1.4185523646469822e-05, + "loss": 0.6392, + "step": 1563 + }, + { + "epoch": 0.8341333333333333, + "grad_norm": 0.33435332736478696, + "learning_rate": 1.409693244743192e-05, + "loss": 0.608, + "step": 1564 + }, + { + "epoch": 0.8346666666666667, + "grad_norm": 0.42534928789770504, + "learning_rate": 1.4008597767992871e-05, + "loss": 0.5901, + "step": 1565 + }, + { + "epoch": 0.8352, + "grad_norm": 0.4139743745692021, + "learning_rate": 1.3920519871933424e-05, + "loss": 0.6503, + "step": 1566 + }, + { + "epoch": 0.8357333333333333, + "grad_norm": 0.40463453872964994, + "learning_rate": 1.3832699022267515e-05, + "loss": 0.6574, + "step": 1567 + }, + { + "epoch": 0.8362666666666667, + "grad_norm": 0.43593014759934173, + "learning_rate": 1.37451354812416e-05, + "loss": 0.6661, + "step": 1568 + }, + { + "epoch": 0.8368, + "grad_norm": 0.42209842220072213, + "learning_rate": 1.3657829510333654e-05, + "loss": 0.5941, + "step": 1569 + }, + { + "epoch": 0.8373333333333334, + "grad_norm": 0.49208295342548763, + "learning_rate": 1.3570781370252582e-05, + "loss": 0.6361, + "step": 1570 + }, + { + "epoch": 0.8378666666666666, + "grad_norm": 0.3927021791281994, + "learning_rate": 1.3483991320937306e-05, + "loss": 0.6627, + "step": 1571 + }, + { + "epoch": 0.8384, + "grad_norm": 0.47280659071619807, + "learning_rate": 1.339745962155613e-05, + "loss": 0.7039, + "step": 1572 + }, + { + "epoch": 0.8389333333333333, + "grad_norm": 0.6259957800318892, + "learning_rate": 1.3311186530505838e-05, + "loss": 0.6268, + "step": 1573 + }, + { + "epoch": 0.8394666666666667, + "grad_norm": 0.39543579754561403, + "learning_rate": 1.322517230541096e-05, + "loss": 0.6407, + "step": 1574 + }, + { + "epoch": 0.84, + "grad_norm": 0.3922214144058078, + "learning_rate": 1.3139417203123027e-05, + "loss": 0.5788, + "step": 1575 + }, + { + "epoch": 0.8405333333333334, + "grad_norm": 0.38711273483843417, + "learning_rate": 1.30539214797198e-05, + "loss": 0.5876, + "step": 1576 + }, + { + "epoch": 0.8410666666666666, + "grad_norm": 0.4025097370460652, + "learning_rate": 1.2968685390504465e-05, + "loss": 0.6288, + "step": 1577 + }, + { + "epoch": 0.8416, + "grad_norm": 0.35716229127592924, + "learning_rate": 1.2883709190004955e-05, + "loss": 0.5842, + "step": 1578 + }, + { + "epoch": 0.8421333333333333, + "grad_norm": 0.3991265311076169, + "learning_rate": 1.2798993131973091e-05, + "loss": 0.6152, + "step": 1579 + }, + { + "epoch": 0.8426666666666667, + "grad_norm": 0.3891308287737353, + "learning_rate": 1.2714537469383858e-05, + "loss": 0.579, + "step": 1580 + }, + { + "epoch": 0.8432, + "grad_norm": 0.3845123738441986, + "learning_rate": 1.263034245443473e-05, + "loss": 0.5741, + "step": 1581 + }, + { + "epoch": 0.8437333333333333, + "grad_norm": 0.4231506210111426, + "learning_rate": 1.2546408338544769e-05, + "loss": 0.6283, + "step": 1582 + }, + { + "epoch": 0.8442666666666667, + "grad_norm": 0.4593327149707268, + "learning_rate": 1.2462735372353996e-05, + "loss": 0.6965, + "step": 1583 + }, + { + "epoch": 0.8448, + "grad_norm": 0.4881232112672962, + "learning_rate": 1.2379323805722576e-05, + "loss": 0.6438, + "step": 1584 + }, + { + "epoch": 0.8453333333333334, + "grad_norm": 0.44925678677877706, + "learning_rate": 1.2296173887730123e-05, + "loss": 0.6114, + "step": 1585 + }, + { + "epoch": 0.8458666666666667, + "grad_norm": 0.42025807837123813, + "learning_rate": 1.2213285866674905e-05, + "loss": 0.6227, + "step": 1586 + }, + { + "epoch": 0.8464, + "grad_norm": 0.42849134787650295, + "learning_rate": 1.2130659990073146e-05, + "loss": 0.601, + "step": 1587 + }, + { + "epoch": 0.8469333333333333, + "grad_norm": 0.3895228109770154, + "learning_rate": 1.2048296504658207e-05, + "loss": 0.654, + "step": 1588 + }, + { + "epoch": 0.8474666666666667, + "grad_norm": 0.4835696916394425, + "learning_rate": 1.1966195656380031e-05, + "loss": 0.7137, + "step": 1589 + }, + { + "epoch": 0.848, + "grad_norm": 0.3656139578822936, + "learning_rate": 1.1884357690404158e-05, + "loss": 0.5439, + "step": 1590 + }, + { + "epoch": 0.8485333333333334, + "grad_norm": 0.42596194837532975, + "learning_rate": 1.1802782851111205e-05, + "loss": 0.6934, + "step": 1591 + }, + { + "epoch": 0.8490666666666666, + "grad_norm": 0.3999166729778335, + "learning_rate": 1.1721471382096027e-05, + "loss": 0.62, + "step": 1592 + }, + { + "epoch": 0.8496, + "grad_norm": 0.42934630139262464, + "learning_rate": 1.1640423526166988e-05, + "loss": 0.645, + "step": 1593 + }, + { + "epoch": 0.8501333333333333, + "grad_norm": 0.43080016509115493, + "learning_rate": 1.1559639525345311e-05, + "loss": 0.61, + "step": 1594 + }, + { + "epoch": 0.8506666666666667, + "grad_norm": 0.4732621945870718, + "learning_rate": 1.1479119620864276e-05, + "loss": 0.6043, + "step": 1595 + }, + { + "epoch": 0.8512, + "grad_norm": 0.46741353727957474, + "learning_rate": 1.1398864053168534e-05, + "loss": 0.6325, + "step": 1596 + }, + { + "epoch": 0.8517333333333333, + "grad_norm": 0.34396909556193217, + "learning_rate": 1.1318873061913405e-05, + "loss": 0.6357, + "step": 1597 + }, + { + "epoch": 0.8522666666666666, + "grad_norm": 0.3824092719516941, + "learning_rate": 1.123914688596409e-05, + "loss": 0.6324, + "step": 1598 + }, + { + "epoch": 0.8528, + "grad_norm": 0.4018558685092868, + "learning_rate": 1.1159685763395111e-05, + "loss": 0.6341, + "step": 1599 + }, + { + "epoch": 0.8533333333333334, + "grad_norm": 0.3720701038342301, + "learning_rate": 1.1080489931489391e-05, + "loss": 0.5664, + "step": 1600 + }, + { + "epoch": 0.8538666666666667, + "grad_norm": 0.3973775262116455, + "learning_rate": 1.1001559626737756e-05, + "loss": 0.5774, + "step": 1601 + }, + { + "epoch": 0.8544, + "grad_norm": 0.4458687962544333, + "learning_rate": 1.0922895084838037e-05, + "loss": 0.6339, + "step": 1602 + }, + { + "epoch": 0.8549333333333333, + "grad_norm": 0.3754996203130371, + "learning_rate": 1.0844496540694515e-05, + "loss": 0.6895, + "step": 1603 + }, + { + "epoch": 0.8554666666666667, + "grad_norm": 0.4445078911948587, + "learning_rate": 1.0766364228417148e-05, + "loss": 0.5656, + "step": 1604 + }, + { + "epoch": 0.856, + "grad_norm": 0.4844531254962157, + "learning_rate": 1.0688498381320855e-05, + "loss": 0.6585, + "step": 1605 + }, + { + "epoch": 0.8565333333333334, + "grad_norm": 0.4631582566632992, + "learning_rate": 1.0610899231924886e-05, + "loss": 0.7185, + "step": 1606 + }, + { + "epoch": 0.8570666666666666, + "grad_norm": 0.41355293965505663, + "learning_rate": 1.0533567011952094e-05, + "loss": 0.5511, + "step": 1607 + }, + { + "epoch": 0.8576, + "grad_norm": 0.37894823409900164, + "learning_rate": 1.045650195232819e-05, + "loss": 0.6239, + "step": 1608 + }, + { + "epoch": 0.8581333333333333, + "grad_norm": 0.3894635070241822, + "learning_rate": 1.0379704283181179e-05, + "loss": 0.6214, + "step": 1609 + }, + { + "epoch": 0.8586666666666667, + "grad_norm": 0.49077750080918164, + "learning_rate": 1.0303174233840528e-05, + "loss": 0.7006, + "step": 1610 + }, + { + "epoch": 0.8592, + "grad_norm": 0.45879660209545653, + "learning_rate": 1.0226912032836611e-05, + "loss": 0.6914, + "step": 1611 + }, + { + "epoch": 0.8597333333333333, + "grad_norm": 0.39586721561275995, + "learning_rate": 1.0150917907899926e-05, + "loss": 0.6312, + "step": 1612 + }, + { + "epoch": 0.8602666666666666, + "grad_norm": 0.40507398909220027, + "learning_rate": 1.007519208596045e-05, + "loss": 0.6476, + "step": 1613 + }, + { + "epoch": 0.8608, + "grad_norm": 0.43893696970881396, + "learning_rate": 9.999734793146998e-06, + "loss": 0.6376, + "step": 1614 + }, + { + "epoch": 0.8613333333333333, + "grad_norm": 0.4092043057017648, + "learning_rate": 9.924546254786493e-06, + "loss": 0.5815, + "step": 1615 + }, + { + "epoch": 0.8618666666666667, + "grad_norm": 0.41025146911962923, + "learning_rate": 9.849626695403324e-06, + "loss": 0.6329, + "step": 1616 + }, + { + "epoch": 0.8624, + "grad_norm": 0.37822235338995613, + "learning_rate": 9.774976338718677e-06, + "loss": 0.5991, + "step": 1617 + }, + { + "epoch": 0.8629333333333333, + "grad_norm": 0.548954869584662, + "learning_rate": 9.700595407649805e-06, + "loss": 0.7331, + "step": 1618 + }, + { + "epoch": 0.8634666666666667, + "grad_norm": 0.41246247504461564, + "learning_rate": 9.62648412430951e-06, + "loss": 0.6008, + "step": 1619 + }, + { + "epoch": 0.864, + "grad_norm": 0.3776720322716301, + "learning_rate": 9.552642710005299e-06, + "loss": 0.6276, + "step": 1620 + }, + { + "epoch": 0.8645333333333334, + "grad_norm": 0.412600168791054, + "learning_rate": 9.479071385238892e-06, + "loss": 0.581, + "step": 1621 + }, + { + "epoch": 0.8650666666666667, + "grad_norm": 0.39743555097964806, + "learning_rate": 9.40577036970538e-06, + "loss": 0.5672, + "step": 1622 + }, + { + "epoch": 0.8656, + "grad_norm": 0.42723272556656094, + "learning_rate": 9.332739882292752e-06, + "loss": 0.6693, + "step": 1623 + }, + { + "epoch": 0.8661333333333333, + "grad_norm": 0.3856500310258893, + "learning_rate": 9.259980141081115e-06, + "loss": 0.6425, + "step": 1624 + }, + { + "epoch": 0.8666666666666667, + "grad_norm": 0.3778877156204836, + "learning_rate": 9.187491363342093e-06, + "loss": 0.5925, + "step": 1625 + }, + { + "epoch": 0.8672, + "grad_norm": 0.3732515087037962, + "learning_rate": 9.115273765538202e-06, + "loss": 0.6723, + "step": 1626 + }, + { + "epoch": 0.8677333333333334, + "grad_norm": 0.3783984343604273, + "learning_rate": 9.043327563322112e-06, + "loss": 0.6461, + "step": 1627 + }, + { + "epoch": 0.8682666666666666, + "grad_norm": 0.5938742034895879, + "learning_rate": 8.971652971536148e-06, + "loss": 0.6332, + "step": 1628 + }, + { + "epoch": 0.8688, + "grad_norm": 0.40222798676157995, + "learning_rate": 8.900250204211514e-06, + "loss": 0.6326, + "step": 1629 + }, + { + "epoch": 0.8693333333333333, + "grad_norm": 0.48395039233064074, + "learning_rate": 8.829119474567671e-06, + "loss": 0.6593, + "step": 1630 + }, + { + "epoch": 0.8698666666666667, + "grad_norm": 0.42651977915224326, + "learning_rate": 8.758260995011825e-06, + "loss": 0.5677, + "step": 1631 + }, + { + "epoch": 0.8704, + "grad_norm": 0.4882721676027432, + "learning_rate": 8.687674977138116e-06, + "loss": 0.656, + "step": 1632 + }, + { + "epoch": 0.8709333333333333, + "grad_norm": 0.3986508067953702, + "learning_rate": 8.617361631727138e-06, + "loss": 0.5809, + "step": 1633 + }, + { + "epoch": 0.8714666666666666, + "grad_norm": 0.40311993641305327, + "learning_rate": 8.547321168745193e-06, + "loss": 0.571, + "step": 1634 + }, + { + "epoch": 0.872, + "grad_norm": 0.4242736221579273, + "learning_rate": 8.47755379734373e-06, + "loss": 0.6385, + "step": 1635 + }, + { + "epoch": 0.8725333333333334, + "grad_norm": 0.44687484952940953, + "learning_rate": 8.408059725858719e-06, + "loss": 0.6346, + "step": 1636 + }, + { + "epoch": 0.8730666666666667, + "grad_norm": 0.4693488072014337, + "learning_rate": 8.338839161809997e-06, + "loss": 0.6168, + "step": 1637 + }, + { + "epoch": 0.8736, + "grad_norm": 0.39243857044150077, + "learning_rate": 8.269892311900696e-06, + "loss": 0.5902, + "step": 1638 + }, + { + "epoch": 0.8741333333333333, + "grad_norm": 0.42414896118706413, + "learning_rate": 8.201219382016556e-06, + "loss": 0.6685, + "step": 1639 + }, + { + "epoch": 0.8746666666666667, + "grad_norm": 0.4169527469706719, + "learning_rate": 8.132820577225387e-06, + "loss": 0.7182, + "step": 1640 + }, + { + "epoch": 0.8752, + "grad_norm": 0.42925948132129943, + "learning_rate": 8.064696101776358e-06, + "loss": 0.6058, + "step": 1641 + }, + { + "epoch": 0.8757333333333334, + "grad_norm": 0.37048627327545736, + "learning_rate": 7.996846159099557e-06, + "loss": 0.6276, + "step": 1642 + }, + { + "epoch": 0.8762666666666666, + "grad_norm": 0.4124669750341732, + "learning_rate": 7.929270951805178e-06, + "loss": 0.614, + "step": 1643 + }, + { + "epoch": 0.8768, + "grad_norm": 0.4103369678456755, + "learning_rate": 7.861970681683051e-06, + "loss": 0.6566, + "step": 1644 + }, + { + "epoch": 0.8773333333333333, + "grad_norm": 0.3719918884790809, + "learning_rate": 7.794945549701993e-06, + "loss": 0.641, + "step": 1645 + }, + { + "epoch": 0.8778666666666667, + "grad_norm": 0.36160408673716565, + "learning_rate": 7.728195756009204e-06, + "loss": 0.6149, + "step": 1646 + }, + { + "epoch": 0.8784, + "grad_norm": 0.371123351701688, + "learning_rate": 7.661721499929753e-06, + "loss": 0.5774, + "step": 1647 + }, + { + "epoch": 0.8789333333333333, + "grad_norm": 0.37577075425861084, + "learning_rate": 7.595522979965819e-06, + "loss": 0.5897, + "step": 1648 + }, + { + "epoch": 0.8794666666666666, + "grad_norm": 0.4220524922249047, + "learning_rate": 7.529600393796232e-06, + "loss": 0.5942, + "step": 1649 + }, + { + "epoch": 0.88, + "grad_norm": 0.4171383655929499, + "learning_rate": 7.463953938275858e-06, + "loss": 0.6207, + "step": 1650 + }, + { + "epoch": 0.8805333333333333, + "grad_norm": 0.3944743568418283, + "learning_rate": 7.3985838094349444e-06, + "loss": 0.6455, + "step": 1651 + }, + { + "epoch": 0.8810666666666667, + "grad_norm": 0.519956292062287, + "learning_rate": 7.333490202478666e-06, + "loss": 0.6385, + "step": 1652 + }, + { + "epoch": 0.8816, + "grad_norm": 0.44435779218312194, + "learning_rate": 7.2686733117863784e-06, + "loss": 0.6475, + "step": 1653 + }, + { + "epoch": 0.8821333333333333, + "grad_norm": 0.6111002333666475, + "learning_rate": 7.204133330911178e-06, + "loss": 0.6577, + "step": 1654 + }, + { + "epoch": 0.8826666666666667, + "grad_norm": 0.4053432632644683, + "learning_rate": 7.1398704525792e-06, + "loss": 0.592, + "step": 1655 + }, + { + "epoch": 0.8832, + "grad_norm": 0.39684452419339145, + "learning_rate": 7.07588486868922e-06, + "loss": 0.5735, + "step": 1656 + }, + { + "epoch": 0.8837333333333334, + "grad_norm": 0.4694868727655645, + "learning_rate": 7.012176770311862e-06, + "loss": 0.7099, + "step": 1657 + }, + { + "epoch": 0.8842666666666666, + "grad_norm": 0.48108357618208775, + "learning_rate": 6.948746347689183e-06, + "loss": 0.6422, + "step": 1658 + }, + { + "epoch": 0.8848, + "grad_norm": 0.38441229981223335, + "learning_rate": 6.8855937902340576e-06, + "loss": 0.5825, + "step": 1659 + }, + { + "epoch": 0.8853333333333333, + "grad_norm": 0.4556440972296517, + "learning_rate": 6.8227192865295995e-06, + "loss": 0.642, + "step": 1660 + }, + { + "epoch": 0.8858666666666667, + "grad_norm": 0.40616369952408726, + "learning_rate": 6.760123024328624e-06, + "loss": 0.6632, + "step": 1661 + }, + { + "epoch": 0.8864, + "grad_norm": 0.3793897321641102, + "learning_rate": 6.6978051905530855e-06, + "loss": 0.6461, + "step": 1662 + }, + { + "epoch": 0.8869333333333334, + "grad_norm": 0.4203113911725351, + "learning_rate": 6.635765971293484e-06, + "loss": 0.633, + "step": 1663 + }, + { + "epoch": 0.8874666666666666, + "grad_norm": 0.5910693329538524, + "learning_rate": 6.5740055518083375e-06, + "loss": 0.6138, + "step": 1664 + }, + { + "epoch": 0.888, + "grad_norm": 0.4832743885903782, + "learning_rate": 6.512524116523633e-06, + "loss": 0.6647, + "step": 1665 + }, + { + "epoch": 0.8885333333333333, + "grad_norm": 0.4005174298622957, + "learning_rate": 6.451321849032288e-06, + "loss": 0.643, + "step": 1666 + }, + { + "epoch": 0.8890666666666667, + "grad_norm": 0.3873921802193626, + "learning_rate": 6.390398932093555e-06, + "loss": 0.676, + "step": 1667 + }, + { + "epoch": 0.8896, + "grad_norm": 0.48315319790454375, + "learning_rate": 6.329755547632499e-06, + "loss": 0.6335, + "step": 1668 + }, + { + "epoch": 0.8901333333333333, + "grad_norm": 0.3886811729463303, + "learning_rate": 6.269391876739495e-06, + "loss": 0.6104, + "step": 1669 + }, + { + "epoch": 0.8906666666666667, + "grad_norm": 0.4614013846175197, + "learning_rate": 6.209308099669597e-06, + "loss": 0.6364, + "step": 1670 + }, + { + "epoch": 0.8912, + "grad_norm": 0.40131057041768275, + "learning_rate": 6.149504395842087e-06, + "loss": 0.6214, + "step": 1671 + }, + { + "epoch": 0.8917333333333334, + "grad_norm": 0.4580884350765841, + "learning_rate": 6.089980943839924e-06, + "loss": 0.5913, + "step": 1672 + }, + { + "epoch": 0.8922666666666667, + "grad_norm": 0.4242718974878402, + "learning_rate": 6.030737921409169e-06, + "loss": 0.5749, + "step": 1673 + }, + { + "epoch": 0.8928, + "grad_norm": 0.40212616092166964, + "learning_rate": 5.971775505458444e-06, + "loss": 0.6156, + "step": 1674 + }, + { + "epoch": 0.8933333333333333, + "grad_norm": 0.4243176189821529, + "learning_rate": 5.913093872058528e-06, + "loss": 0.6396, + "step": 1675 + }, + { + "epoch": 0.8938666666666667, + "grad_norm": 0.403021958023578, + "learning_rate": 5.854693196441641e-06, + "loss": 0.5815, + "step": 1676 + }, + { + "epoch": 0.8944, + "grad_norm": 0.41790627192937685, + "learning_rate": 5.7965736530010916e-06, + "loss": 0.6764, + "step": 1677 + }, + { + "epoch": 0.8949333333333334, + "grad_norm": 0.44462662217109117, + "learning_rate": 5.738735415290642e-06, + "loss": 0.651, + "step": 1678 + }, + { + "epoch": 0.8954666666666666, + "grad_norm": 0.4498091596916975, + "learning_rate": 5.681178656024055e-06, + "loss": 0.6066, + "step": 1679 + }, + { + "epoch": 0.896, + "grad_norm": 0.5160909094338417, + "learning_rate": 5.623903547074549e-06, + "loss": 0.6899, + "step": 1680 + }, + { + "epoch": 0.8965333333333333, + "grad_norm": 0.4418308692888828, + "learning_rate": 5.566910259474289e-06, + "loss": 0.6064, + "step": 1681 + }, + { + "epoch": 0.8970666666666667, + "grad_norm": 0.4296839923775005, + "learning_rate": 5.510198963413881e-06, + "loss": 0.6417, + "step": 1682 + }, + { + "epoch": 0.8976, + "grad_norm": 0.3855594591230224, + "learning_rate": 5.453769828241872e-06, + "loss": 0.6506, + "step": 1683 + }, + { + "epoch": 0.8981333333333333, + "grad_norm": 0.3634407750565033, + "learning_rate": 5.397623022464226e-06, + "loss": 0.5948, + "step": 1684 + }, + { + "epoch": 0.8986666666666666, + "grad_norm": 0.40767612541599146, + "learning_rate": 5.341758713743828e-06, + "loss": 0.6902, + "step": 1685 + }, + { + "epoch": 0.8992, + "grad_norm": 0.48155348442982904, + "learning_rate": 5.286177068899989e-06, + "loss": 0.6122, + "step": 1686 + }, + { + "epoch": 0.8997333333333334, + "grad_norm": 0.4499236148193907, + "learning_rate": 5.230878253907912e-06, + "loss": 0.6439, + "step": 1687 + }, + { + "epoch": 0.9002666666666667, + "grad_norm": 0.3735644137361036, + "learning_rate": 5.175862433898282e-06, + "loss": 0.6033, + "step": 1688 + }, + { + "epoch": 0.9008, + "grad_norm": 0.474407271254516, + "learning_rate": 5.121129773156663e-06, + "loss": 0.6532, + "step": 1689 + }, + { + "epoch": 0.9013333333333333, + "grad_norm": 0.3771963682513783, + "learning_rate": 5.066680435123106e-06, + "loss": 0.6001, + "step": 1690 + }, + { + "epoch": 0.9018666666666667, + "grad_norm": 0.438966461969124, + "learning_rate": 5.012514582391592e-06, + "loss": 0.6232, + "step": 1691 + }, + { + "epoch": 0.9024, + "grad_norm": 0.4540021611590887, + "learning_rate": 4.95863237670956e-06, + "loss": 0.6328, + "step": 1692 + }, + { + "epoch": 0.9029333333333334, + "grad_norm": 0.3905382736624076, + "learning_rate": 4.905033978977491e-06, + "loss": 0.6274, + "step": 1693 + }, + { + "epoch": 0.9034666666666666, + "grad_norm": 0.38257765103487784, + "learning_rate": 4.851719549248301e-06, + "loss": 0.6249, + "step": 1694 + }, + { + "epoch": 0.904, + "grad_norm": 0.3859713396947559, + "learning_rate": 4.798689246727006e-06, + "loss": 0.5994, + "step": 1695 + }, + { + "epoch": 0.9045333333333333, + "grad_norm": 0.5426179257393545, + "learning_rate": 4.745943229770122e-06, + "loss": 0.6514, + "step": 1696 + }, + { + "epoch": 0.9050666666666667, + "grad_norm": 0.43028387031700205, + "learning_rate": 4.693481655885257e-06, + "loss": 0.5861, + "step": 1697 + }, + { + "epoch": 0.9056, + "grad_norm": 0.4246457047582852, + "learning_rate": 4.641304681730641e-06, + "loss": 0.6456, + "step": 1698 + }, + { + "epoch": 0.9061333333333333, + "grad_norm": 0.5092107153404754, + "learning_rate": 4.58941246311464e-06, + "loss": 0.5961, + "step": 1699 + }, + { + "epoch": 0.9066666666666666, + "grad_norm": 0.39858739998911535, + "learning_rate": 4.537805154995278e-06, + "loss": 0.6359, + "step": 1700 + }, + { + "epoch": 0.9072, + "grad_norm": 0.46706381998750435, + "learning_rate": 4.486482911479839e-06, + "loss": 0.6519, + "step": 1701 + }, + { + "epoch": 0.9077333333333333, + "grad_norm": 0.43640298644156794, + "learning_rate": 4.435445885824285e-06, + "loss": 0.6849, + "step": 1702 + }, + { + "epoch": 0.9082666666666667, + "grad_norm": 0.4163289069229634, + "learning_rate": 4.384694230432984e-06, + "loss": 0.647, + "step": 1703 + }, + { + "epoch": 0.9088, + "grad_norm": 0.3772054625966044, + "learning_rate": 4.3342280968580285e-06, + "loss": 0.6088, + "step": 1704 + }, + { + "epoch": 0.9093333333333333, + "grad_norm": 0.4836480960152808, + "learning_rate": 4.2840476357989825e-06, + "loss": 0.6964, + "step": 1705 + }, + { + "epoch": 0.9098666666666667, + "grad_norm": 0.3822465242646973, + "learning_rate": 4.2341529971023255e-06, + "loss": 0.597, + "step": 1706 + }, + { + "epoch": 0.9104, + "grad_norm": 0.417599387034245, + "learning_rate": 4.184544329761009e-06, + "loss": 0.6582, + "step": 1707 + }, + { + "epoch": 0.9109333333333334, + "grad_norm": 0.4057234115610521, + "learning_rate": 4.135221781914034e-06, + "loss": 0.7205, + "step": 1708 + }, + { + "epoch": 0.9114666666666666, + "grad_norm": 0.46278320430493924, + "learning_rate": 4.0861855008460405e-06, + "loss": 0.6431, + "step": 1709 + }, + { + "epoch": 0.912, + "grad_norm": 0.5983770454803956, + "learning_rate": 4.037435632986786e-06, + "loss": 0.7032, + "step": 1710 + }, + { + "epoch": 0.9125333333333333, + "grad_norm": 0.4542534348324114, + "learning_rate": 3.988972323910778e-06, + "loss": 0.6547, + "step": 1711 + }, + { + "epoch": 0.9130666666666667, + "grad_norm": 0.38144264321719323, + "learning_rate": 3.9407957183368095e-06, + "loss": 0.6794, + "step": 1712 + }, + { + "epoch": 0.9136, + "grad_norm": 0.4448017300662135, + "learning_rate": 3.892905960127546e-06, + "loss": 0.5934, + "step": 1713 + }, + { + "epoch": 0.9141333333333334, + "grad_norm": 0.48633585978396293, + "learning_rate": 3.845303192289074e-06, + "loss": 0.7325, + "step": 1714 + }, + { + "epoch": 0.9146666666666666, + "grad_norm": 0.41169580199804856, + "learning_rate": 3.797987556970495e-06, + "loss": 0.5883, + "step": 1715 + }, + { + "epoch": 0.9152, + "grad_norm": 0.4690029675675022, + "learning_rate": 3.750959195463466e-06, + "loss": 0.6954, + "step": 1716 + }, + { + "epoch": 0.9157333333333333, + "grad_norm": 0.4186555967581732, + "learning_rate": 3.7042182482018075e-06, + "loss": 0.5946, + "step": 1717 + }, + { + "epoch": 0.9162666666666667, + "grad_norm": 0.4114249847423077, + "learning_rate": 3.6577648547611033e-06, + "loss": 0.6068, + "step": 1718 + }, + { + "epoch": 0.9168, + "grad_norm": 0.446459810055769, + "learning_rate": 3.611599153858214e-06, + "loss": 0.6508, + "step": 1719 + }, + { + "epoch": 0.9173333333333333, + "grad_norm": 0.40735661424570757, + "learning_rate": 3.565721283350931e-06, + "loss": 0.5591, + "step": 1720 + }, + { + "epoch": 0.9178666666666667, + "grad_norm": 0.6078548328155869, + "learning_rate": 3.5201313802375456e-06, + "loss": 0.7193, + "step": 1721 + }, + { + "epoch": 0.9184, + "grad_norm": 0.3991174488262027, + "learning_rate": 3.4748295806564356e-06, + "loss": 0.5677, + "step": 1722 + }, + { + "epoch": 0.9189333333333334, + "grad_norm": 0.40029407084167784, + "learning_rate": 3.4298160198856568e-06, + "loss": 0.6153, + "step": 1723 + }, + { + "epoch": 0.9194666666666667, + "grad_norm": 0.3859536278935306, + "learning_rate": 3.3850908323424967e-06, + "loss": 0.565, + "step": 1724 + }, + { + "epoch": 0.92, + "grad_norm": 0.4306086879247975, + "learning_rate": 3.3406541515832003e-06, + "loss": 0.6185, + "step": 1725 + }, + { + "epoch": 0.9205333333333333, + "grad_norm": 0.4681143432489656, + "learning_rate": 3.296506110302422e-06, + "loss": 0.6531, + "step": 1726 + }, + { + "epoch": 0.9210666666666667, + "grad_norm": 0.4494640559550906, + "learning_rate": 3.252646840332918e-06, + "loss": 0.635, + "step": 1727 + }, + { + "epoch": 0.9216, + "grad_norm": 0.4913072174432582, + "learning_rate": 3.209076472645112e-06, + "loss": 0.6709, + "step": 1728 + }, + { + "epoch": 0.9221333333333334, + "grad_norm": 0.4117660804615872, + "learning_rate": 3.1657951373467497e-06, + "loss": 0.6076, + "step": 1729 + }, + { + "epoch": 0.9226666666666666, + "grad_norm": 0.3909460342982364, + "learning_rate": 3.1228029636824475e-06, + "loss": 0.6417, + "step": 1730 + }, + { + "epoch": 0.9232, + "grad_norm": 0.38730337425406364, + "learning_rate": 3.0801000800333877e-06, + "loss": 0.6187, + "step": 1731 + }, + { + "epoch": 0.9237333333333333, + "grad_norm": 0.45208377709908015, + "learning_rate": 3.037686613916857e-06, + "loss": 0.6333, + "step": 1732 + }, + { + "epoch": 0.9242666666666667, + "grad_norm": 0.3817534811560179, + "learning_rate": 2.995562691985898e-06, + "loss": 0.6242, + "step": 1733 + }, + { + "epoch": 0.9248, + "grad_norm": 0.37087697886372456, + "learning_rate": 2.9537284400289355e-06, + "loss": 0.605, + "step": 1734 + }, + { + "epoch": 0.9253333333333333, + "grad_norm": 0.39915063942181117, + "learning_rate": 2.912183982969385e-06, + "loss": 0.6565, + "step": 1735 + }, + { + "epoch": 0.9258666666666666, + "grad_norm": 0.40708734210415587, + "learning_rate": 2.8709294448653225e-06, + "loss": 0.6002, + "step": 1736 + }, + { + "epoch": 0.9264, + "grad_norm": 0.4986110062360719, + "learning_rate": 2.8299649489090475e-06, + "loss": 0.6355, + "step": 1737 + }, + { + "epoch": 0.9269333333333334, + "grad_norm": 0.3686135121919218, + "learning_rate": 2.789290617426765e-06, + "loss": 0.5633, + "step": 1738 + }, + { + "epoch": 0.9274666666666667, + "grad_norm": 0.3721249749726393, + "learning_rate": 2.748906571878207e-06, + "loss": 0.6423, + "step": 1739 + }, + { + "epoch": 0.928, + "grad_norm": 0.41943545492108947, + "learning_rate": 2.708812932856253e-06, + "loss": 0.6054, + "step": 1740 + }, + { + "epoch": 0.9285333333333333, + "grad_norm": 0.43284070259335555, + "learning_rate": 2.6690098200866098e-06, + "loss": 0.6079, + "step": 1741 + }, + { + "epoch": 0.9290666666666667, + "grad_norm": 0.44396734327680126, + "learning_rate": 2.6294973524274125e-06, + "loss": 0.6188, + "step": 1742 + }, + { + "epoch": 0.9296, + "grad_norm": 0.41322427472792067, + "learning_rate": 2.590275647868867e-06, + "loss": 0.6049, + "step": 1743 + }, + { + "epoch": 0.9301333333333334, + "grad_norm": 0.42906402552795747, + "learning_rate": 2.551344823532964e-06, + "loss": 0.5813, + "step": 1744 + }, + { + "epoch": 0.9306666666666666, + "grad_norm": 0.4178783262910209, + "learning_rate": 2.5127049956730207e-06, + "loss": 0.5671, + "step": 1745 + }, + { + "epoch": 0.9312, + "grad_norm": 0.43260523351026015, + "learning_rate": 2.4743562796734622e-06, + "loss": 0.6239, + "step": 1746 + }, + { + "epoch": 0.9317333333333333, + "grad_norm": 0.3651195153793656, + "learning_rate": 2.436298790049363e-06, + "loss": 0.617, + "step": 1747 + }, + { + "epoch": 0.9322666666666667, + "grad_norm": 0.44024604737122075, + "learning_rate": 2.3985326404461604e-06, + "loss": 0.6431, + "step": 1748 + }, + { + "epoch": 0.9328, + "grad_norm": 0.4575895735344115, + "learning_rate": 2.3610579436393e-06, + "loss": 0.6819, + "step": 1749 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 0.44772975142293386, + "learning_rate": 2.3238748115339324e-06, + "loss": 0.6679, + "step": 1750 + }, + { + "epoch": 0.9338666666666666, + "grad_norm": 0.5346957922210491, + "learning_rate": 2.286983355164529e-06, + "loss": 0.6276, + "step": 1751 + }, + { + "epoch": 0.9344, + "grad_norm": 0.4020209615295135, + "learning_rate": 2.250383684694579e-06, + "loss": 0.5827, + "step": 1752 + }, + { + "epoch": 0.9349333333333333, + "grad_norm": 0.5126635789939333, + "learning_rate": 2.2140759094162467e-06, + "loss": 0.6318, + "step": 1753 + }, + { + "epoch": 0.9354666666666667, + "grad_norm": 0.4510740259510137, + "learning_rate": 2.178060137750071e-06, + "loss": 0.6483, + "step": 1754 + }, + { + "epoch": 0.936, + "grad_norm": 0.4026713047765242, + "learning_rate": 2.1423364772445887e-06, + "loss": 0.6363, + "step": 1755 + }, + { + "epoch": 0.9365333333333333, + "grad_norm": 0.4474478889729829, + "learning_rate": 2.106905034576112e-06, + "loss": 0.6043, + "step": 1756 + }, + { + "epoch": 0.9370666666666667, + "grad_norm": 0.4646913173150557, + "learning_rate": 2.0717659155482738e-06, + "loss": 0.6526, + "step": 1757 + }, + { + "epoch": 0.9376, + "grad_norm": 0.42690876968217645, + "learning_rate": 2.036919225091827e-06, + "loss": 0.6381, + "step": 1758 + }, + { + "epoch": 0.9381333333333334, + "grad_norm": 0.42916131016030773, + "learning_rate": 2.002365067264289e-06, + "loss": 0.6438, + "step": 1759 + }, + { + "epoch": 0.9386666666666666, + "grad_norm": 0.40725518641858555, + "learning_rate": 1.968103545249611e-06, + "loss": 0.6125, + "step": 1760 + }, + { + "epoch": 0.9392, + "grad_norm": 0.3457561401505633, + "learning_rate": 1.9341347613579087e-06, + "loss": 0.6245, + "step": 1761 + }, + { + "epoch": 0.9397333333333333, + "grad_norm": 0.3695374114392267, + "learning_rate": 1.900458817025097e-06, + "loss": 0.5687, + "step": 1762 + }, + { + "epoch": 0.9402666666666667, + "grad_norm": 0.4447834422895891, + "learning_rate": 1.8670758128126909e-06, + "loss": 0.6677, + "step": 1763 + }, + { + "epoch": 0.9408, + "grad_norm": 0.460077593372982, + "learning_rate": 1.8339858484073935e-06, + "loss": 0.592, + "step": 1764 + }, + { + "epoch": 0.9413333333333334, + "grad_norm": 0.4517788956280518, + "learning_rate": 1.8011890226208527e-06, + "loss": 0.6451, + "step": 1765 + }, + { + "epoch": 0.9418666666666666, + "grad_norm": 0.394925199205411, + "learning_rate": 1.7686854333893833e-06, + "loss": 0.5707, + "step": 1766 + }, + { + "epoch": 0.9424, + "grad_norm": 0.459032469937519, + "learning_rate": 1.7364751777736332e-06, + "loss": 0.6741, + "step": 1767 + }, + { + "epoch": 0.9429333333333333, + "grad_norm": 0.3912866116764875, + "learning_rate": 1.7045583519583074e-06, + "loss": 0.5979, + "step": 1768 + }, + { + "epoch": 0.9434666666666667, + "grad_norm": 0.47860367523518543, + "learning_rate": 1.6729350512519005e-06, + "loss": 0.6502, + "step": 1769 + }, + { + "epoch": 0.944, + "grad_norm": 0.3975155074917245, + "learning_rate": 1.6416053700863964e-06, + "loss": 0.583, + "step": 1770 + }, + { + "epoch": 0.9445333333333333, + "grad_norm": 0.3716372371034665, + "learning_rate": 1.6105694020169593e-06, + "loss": 0.5901, + "step": 1771 + }, + { + "epoch": 0.9450666666666667, + "grad_norm": 0.4859361284510846, + "learning_rate": 1.5798272397217095e-06, + "loss": 0.6439, + "step": 1772 + }, + { + "epoch": 0.9456, + "grad_norm": 0.41541400854188787, + "learning_rate": 1.5493789750014031e-06, + "loss": 0.588, + "step": 1773 + }, + { + "epoch": 0.9461333333333334, + "grad_norm": 0.37724181861323997, + "learning_rate": 1.5192246987791981e-06, + "loss": 0.5866, + "step": 1774 + }, + { + "epoch": 0.9466666666666667, + "grad_norm": 0.3888776447864694, + "learning_rate": 1.489364501100332e-06, + "loss": 0.5849, + "step": 1775 + }, + { + "epoch": 0.9472, + "grad_norm": 0.36143809968939544, + "learning_rate": 1.459798471131868e-06, + "loss": 0.6142, + "step": 1776 + }, + { + "epoch": 0.9477333333333333, + "grad_norm": 0.41988646763204995, + "learning_rate": 1.430526697162482e-06, + "loss": 0.6423, + "step": 1777 + }, + { + "epoch": 0.9482666666666667, + "grad_norm": 0.4302947634147954, + "learning_rate": 1.4015492666021312e-06, + "loss": 0.615, + "step": 1778 + }, + { + "epoch": 0.9488, + "grad_norm": 0.40408026549902326, + "learning_rate": 1.3728662659818204e-06, + "loss": 0.6154, + "step": 1779 + }, + { + "epoch": 0.9493333333333334, + "grad_norm": 0.3702788572172086, + "learning_rate": 1.344477780953346e-06, + "loss": 0.5793, + "step": 1780 + }, + { + "epoch": 0.9498666666666666, + "grad_norm": 0.4091136909985783, + "learning_rate": 1.3163838962890195e-06, + "loss": 0.6359, + "step": 1781 + }, + { + "epoch": 0.9504, + "grad_norm": 0.43247437071136985, + "learning_rate": 1.2885846958814673e-06, + "loss": 0.6648, + "step": 1782 + }, + { + "epoch": 0.9509333333333333, + "grad_norm": 0.40990498701517686, + "learning_rate": 1.261080262743297e-06, + "loss": 0.6579, + "step": 1783 + }, + { + "epoch": 0.9514666666666667, + "grad_norm": 0.43275080338022903, + "learning_rate": 1.2338706790069431e-06, + "loss": 0.711, + "step": 1784 + }, + { + "epoch": 0.952, + "grad_norm": 0.4010719626124903, + "learning_rate": 1.2069560259243328e-06, + "loss": 0.6459, + "step": 1785 + }, + { + "epoch": 0.9525333333333333, + "grad_norm": 0.4267113725935911, + "learning_rate": 1.1803363838667092e-06, + "loss": 0.631, + "step": 1786 + }, + { + "epoch": 0.9530666666666666, + "grad_norm": 0.4035417244970245, + "learning_rate": 1.1540118323243865e-06, + "loss": 0.6164, + "step": 1787 + }, + { + "epoch": 0.9536, + "grad_norm": 0.4940228329440211, + "learning_rate": 1.1279824499064396e-06, + "loss": 0.6548, + "step": 1788 + }, + { + "epoch": 0.9541333333333334, + "grad_norm": 0.5050931635987523, + "learning_rate": 1.1022483143405705e-06, + "loss": 0.6136, + "step": 1789 + }, + { + "epoch": 0.9546666666666667, + "grad_norm": 0.496454817575372, + "learning_rate": 1.076809502472831e-06, + "loss": 0.6281, + "step": 1790 + }, + { + "epoch": 0.9552, + "grad_norm": 0.36726201288264415, + "learning_rate": 1.0516660902673448e-06, + "loss": 0.6117, + "step": 1791 + }, + { + "epoch": 0.9557333333333333, + "grad_norm": 0.45333278650410636, + "learning_rate": 1.0268181528061749e-06, + "loss": 0.6164, + "step": 1792 + }, + { + "epoch": 0.9562666666666667, + "grad_norm": 0.4458064552754602, + "learning_rate": 1.0022657642890231e-06, + "loss": 0.6111, + "step": 1793 + }, + { + "epoch": 0.9568, + "grad_norm": 0.42360344085642504, + "learning_rate": 9.780089980330642e-07, + "loss": 0.608, + "step": 1794 + }, + { + "epoch": 0.9573333333333334, + "grad_norm": 0.3697115317534701, + "learning_rate": 9.540479264726676e-07, + "loss": 0.6007, + "step": 1795 + }, + { + "epoch": 0.9578666666666666, + "grad_norm": 0.44121091021401987, + "learning_rate": 9.303826211592315e-07, + "loss": 0.6, + "step": 1796 + }, + { + "epoch": 0.9584, + "grad_norm": 0.38813603165578076, + "learning_rate": 9.070131527609604e-07, + "loss": 0.607, + "step": 1797 + }, + { + "epoch": 0.9589333333333333, + "grad_norm": 0.37885506588245343, + "learning_rate": 8.839395910626213e-07, + "loss": 0.6216, + "step": 1798 + }, + { + "epoch": 0.9594666666666667, + "grad_norm": 0.3861215708237205, + "learning_rate": 8.611620049653879e-07, + "loss": 0.5602, + "step": 1799 + }, + { + "epoch": 0.96, + "grad_norm": 0.45455262060354645, + "learning_rate": 8.386804624865851e-07, + "loss": 0.6146, + "step": 1800 + }, + { + "epoch": 0.9605333333333334, + "grad_norm": 0.40691806764142746, + "learning_rate": 8.16495030759501e-07, + "loss": 0.6232, + "step": 1801 + }, + { + "epoch": 0.9610666666666666, + "grad_norm": 0.41159366816015297, + "learning_rate": 7.946057760332193e-07, + "loss": 0.5863, + "step": 1802 + }, + { + "epoch": 0.9616, + "grad_norm": 0.4694407527138338, + "learning_rate": 7.730127636723539e-07, + "loss": 0.622, + "step": 1803 + }, + { + "epoch": 0.9621333333333333, + "grad_norm": 0.41801063287095436, + "learning_rate": 7.517160581569372e-07, + "loss": 0.6284, + "step": 1804 + }, + { + "epoch": 0.9626666666666667, + "grad_norm": 0.42719497482572355, + "learning_rate": 7.307157230821426e-07, + "loss": 0.6371, + "step": 1805 + }, + { + "epoch": 0.9632, + "grad_norm": 0.41285117718354075, + "learning_rate": 7.100118211581852e-07, + "loss": 0.663, + "step": 1806 + }, + { + "epoch": 0.9637333333333333, + "grad_norm": 0.42562719337945415, + "learning_rate": 6.896044142100433e-07, + "loss": 0.6214, + "step": 1807 + }, + { + "epoch": 0.9642666666666667, + "grad_norm": 0.5395690970992174, + "learning_rate": 6.694935631773258e-07, + "loss": 0.7304, + "step": 1808 + }, + { + "epoch": 0.9648, + "grad_norm": 0.41032328674571755, + "learning_rate": 6.496793281141056e-07, + "loss": 0.6573, + "step": 1809 + }, + { + "epoch": 0.9653333333333334, + "grad_norm": 0.38578810785439155, + "learning_rate": 6.301617681886863e-07, + "loss": 0.625, + "step": 1810 + }, + { + "epoch": 0.9658666666666667, + "grad_norm": 0.4491101885346572, + "learning_rate": 6.109409416834688e-07, + "loss": 0.6477, + "step": 1811 + }, + { + "epoch": 0.9664, + "grad_norm": 0.45385553898315706, + "learning_rate": 5.920169059947411e-07, + "loss": 0.7031, + "step": 1812 + }, + { + "epoch": 0.9669333333333333, + "grad_norm": 0.4018436943756649, + "learning_rate": 5.733897176325665e-07, + "loss": 0.5933, + "step": 1813 + }, + { + "epoch": 0.9674666666666667, + "grad_norm": 0.41608012446601444, + "learning_rate": 5.550594322205504e-07, + "loss": 0.6371, + "step": 1814 + }, + { + "epoch": 0.968, + "grad_norm": 0.4294878835451814, + "learning_rate": 5.370261044956971e-07, + "loss": 0.663, + "step": 1815 + }, + { + "epoch": 0.9685333333333334, + "grad_norm": 0.43563463276347536, + "learning_rate": 5.192897883082747e-07, + "loss": 0.6652, + "step": 1816 + }, + { + "epoch": 0.9690666666666666, + "grad_norm": 0.6004249075539235, + "learning_rate": 5.018505366216175e-07, + "loss": 0.6469, + "step": 1817 + }, + { + "epoch": 0.9696, + "grad_norm": 0.4095224316449897, + "learning_rate": 4.847084015119574e-07, + "loss": 0.6075, + "step": 1818 + }, + { + "epoch": 0.9701333333333333, + "grad_norm": 0.45997622670248234, + "learning_rate": 4.678634341683252e-07, + "loss": 0.6584, + "step": 1819 + }, + { + "epoch": 0.9706666666666667, + "grad_norm": 0.45250759953032205, + "learning_rate": 4.5131568489236166e-07, + "loss": 0.6387, + "step": 1820 + }, + { + "epoch": 0.9712, + "grad_norm": 0.44090708330324574, + "learning_rate": 4.3506520309813947e-07, + "loss": 0.6431, + "step": 1821 + }, + { + "epoch": 0.9717333333333333, + "grad_norm": 0.4062126991791442, + "learning_rate": 4.191120373120749e-07, + "loss": 0.621, + "step": 1822 + }, + { + "epoch": 0.9722666666666666, + "grad_norm": 0.4009031195905101, + "learning_rate": 4.034562351727389e-07, + "loss": 0.6211, + "step": 1823 + }, + { + "epoch": 0.9728, + "grad_norm": 0.4712789131210934, + "learning_rate": 3.8809784343072366e-07, + "loss": 0.6996, + "step": 1824 + }, + { + "epoch": 0.9733333333333334, + "grad_norm": 0.5029551258999793, + "learning_rate": 3.73036907948543e-07, + "loss": 0.7219, + "step": 1825 + }, + { + "epoch": 0.9738666666666667, + "grad_norm": 0.4654582549113369, + "learning_rate": 3.582734737004101e-07, + "loss": 0.6843, + "step": 1826 + }, + { + "epoch": 0.9744, + "grad_norm": 0.4156968107183663, + "learning_rate": 3.4380758477219333e-07, + "loss": 0.5907, + "step": 1827 + }, + { + "epoch": 0.9749333333333333, + "grad_norm": 0.5084323948101466, + "learning_rate": 3.296392843612273e-07, + "loss": 0.6472, + "step": 1828 + }, + { + "epoch": 0.9754666666666667, + "grad_norm": 0.4685193906875141, + "learning_rate": 3.1576861477621287e-07, + "loss": 0.6377, + "step": 1829 + }, + { + "epoch": 0.976, + "grad_norm": 0.41999282975670604, + "learning_rate": 3.0219561743707326e-07, + "loss": 0.6202, + "step": 1830 + }, + { + "epoch": 0.9765333333333334, + "grad_norm": 0.4746059190621131, + "learning_rate": 2.889203328748424e-07, + "loss": 0.6289, + "step": 1831 + }, + { + "epoch": 0.9770666666666666, + "grad_norm": 0.3894929861098244, + "learning_rate": 2.759428007315212e-07, + "loss": 0.6419, + "step": 1832 + }, + { + "epoch": 0.9776, + "grad_norm": 0.4248066340308863, + "learning_rate": 2.6326305976001055e-07, + "loss": 0.6349, + "step": 1833 + }, + { + "epoch": 0.9781333333333333, + "grad_norm": 0.4379302229320316, + "learning_rate": 2.5088114782392257e-07, + "loss": 0.6643, + "step": 1834 + }, + { + "epoch": 0.9786666666666667, + "grad_norm": 0.40591424881704585, + "learning_rate": 2.3879710189753656e-07, + "loss": 0.6127, + "step": 1835 + }, + { + "epoch": 0.9792, + "grad_norm": 0.4795653299529607, + "learning_rate": 2.2701095806565432e-07, + "loss": 0.6369, + "step": 1836 + }, + { + "epoch": 0.9797333333333333, + "grad_norm": 0.40234598685828443, + "learning_rate": 2.15522751523467e-07, + "loss": 0.6332, + "step": 1837 + }, + { + "epoch": 0.9802666666666666, + "grad_norm": 0.4775141956768784, + "learning_rate": 2.0433251657653308e-07, + "loss": 0.6368, + "step": 1838 + }, + { + "epoch": 0.9808, + "grad_norm": 0.41833903955171153, + "learning_rate": 1.9344028664056713e-07, + "loss": 0.6183, + "step": 1839 + }, + { + "epoch": 0.9813333333333333, + "grad_norm": 0.44110044016760425, + "learning_rate": 1.8284609424142895e-07, + "loss": 0.6753, + "step": 1840 + }, + { + "epoch": 0.9818666666666667, + "grad_norm": 0.4284480343917125, + "learning_rate": 1.7254997101500137e-07, + "loss": 0.6903, + "step": 1841 + }, + { + "epoch": 0.9824, + "grad_norm": 0.4078046651085013, + "learning_rate": 1.6255194770704586e-07, + "loss": 0.672, + "step": 1842 + }, + { + "epoch": 0.9829333333333333, + "grad_norm": 0.4991980338518636, + "learning_rate": 1.5285205417319149e-07, + "loss": 0.6064, + "step": 1843 + }, + { + "epoch": 0.9834666666666667, + "grad_norm": 0.48846284151389696, + "learning_rate": 1.4345031937879062e-07, + "loss": 0.65, + "step": 1844 + }, + { + "epoch": 0.984, + "grad_norm": 0.37930792575343436, + "learning_rate": 1.3434677139885222e-07, + "loss": 0.6736, + "step": 1845 + }, + { + "epoch": 0.9845333333333334, + "grad_norm": 0.3641625378901866, + "learning_rate": 1.255414374179531e-07, + "loss": 0.6127, + "step": 1846 + }, + { + "epoch": 0.9850666666666666, + "grad_norm": 0.35150803268088227, + "learning_rate": 1.170343437301491e-07, + "loss": 0.5941, + "step": 1847 + }, + { + "epoch": 0.9856, + "grad_norm": 0.3867345224975431, + "learning_rate": 1.0882551573891953e-07, + "loss": 0.6007, + "step": 1848 + }, + { + "epoch": 0.9861333333333333, + "grad_norm": 0.43264281734236804, + "learning_rate": 1.0091497795706728e-07, + "loss": 0.6525, + "step": 1849 + }, + { + "epoch": 0.9866666666666667, + "grad_norm": 0.41498583015649615, + "learning_rate": 9.330275400666332e-08, + "loss": 0.6632, + "step": 1850 + }, + { + "epoch": 0.9872, + "grad_norm": 0.3836090777226854, + "learning_rate": 8.598886661895788e-08, + "loss": 0.6285, + "step": 1851 + }, + { + "epoch": 0.9877333333333334, + "grad_norm": 0.3700712115206776, + "learning_rate": 7.8973337634336e-08, + "loss": 0.5941, + "step": 1852 + }, + { + "epoch": 0.9882666666666666, + "grad_norm": 0.41995580213863215, + "learning_rate": 7.225618800222877e-08, + "loss": 0.6505, + "step": 1853 + }, + { + "epoch": 0.9888, + "grad_norm": 0.3676612351752429, + "learning_rate": 6.583743778106887e-08, + "loss": 0.5938, + "step": 1854 + }, + { + "epoch": 0.9893333333333333, + "grad_norm": 0.3788444588538138, + "learning_rate": 5.971710613821291e-08, + "loss": 0.6074, + "step": 1855 + }, + { + "epoch": 0.9898666666666667, + "grad_norm": 0.4059418675155657, + "learning_rate": 5.389521134989695e-08, + "loss": 0.584, + "step": 1856 + }, + { + "epoch": 0.9904, + "grad_norm": 0.39567068196168576, + "learning_rate": 4.837177080119215e-08, + "loss": 0.6367, + "step": 1857 + }, + { + "epoch": 0.9909333333333333, + "grad_norm": 0.38232772509276153, + "learning_rate": 4.314680098592705e-08, + "loss": 0.6167, + "step": 1858 + }, + { + "epoch": 0.9914666666666667, + "grad_norm": 0.4651180985905164, + "learning_rate": 3.8220317506654226e-08, + "loss": 0.6676, + "step": 1859 + }, + { + "epoch": 0.992, + "grad_norm": 0.5197233380793199, + "learning_rate": 3.359233507459481e-08, + "loss": 0.6607, + "step": 1860 + }, + { + "epoch": 0.9925333333333334, + "grad_norm": 0.42387500086006535, + "learning_rate": 2.9262867509605163e-08, + "loss": 0.5764, + "step": 1861 + }, + { + "epoch": 0.9930666666666667, + "grad_norm": 0.3687634794920366, + "learning_rate": 2.5231927740154704e-08, + "loss": 0.5823, + "step": 1862 + }, + { + "epoch": 0.9936, + "grad_norm": 0.4116728872012044, + "learning_rate": 2.1499527803214846e-08, + "loss": 0.5885, + "step": 1863 + }, + { + "epoch": 0.9941333333333333, + "grad_norm": 0.4006186632561352, + "learning_rate": 1.8065678844314538e-08, + "loss": 0.6065, + "step": 1864 + }, + { + "epoch": 0.9946666666666667, + "grad_norm": 0.38454378122537636, + "learning_rate": 1.4930391117451426e-08, + "loss": 0.6451, + "step": 1865 + }, + { + "epoch": 0.9952, + "grad_norm": 0.3479609493816106, + "learning_rate": 1.209367398504746e-08, + "loss": 0.5613, + "step": 1866 + }, + { + "epoch": 0.9957333333333334, + "grad_norm": 0.5098758914604183, + "learning_rate": 9.555535917993297e-09, + "loss": 0.6181, + "step": 1867 + }, + { + "epoch": 0.9962666666666666, + "grad_norm": 0.47578953791996026, + "learning_rate": 7.315984495548378e-09, + "loss": 0.6215, + "step": 1868 + }, + { + "epoch": 0.9968, + "grad_norm": 0.3989169212785679, + "learning_rate": 5.375026405352035e-09, + "loss": 0.6668, + "step": 1869 + }, + { + "epoch": 0.9973333333333333, + "grad_norm": 0.41939163612008407, + "learning_rate": 3.732667443390181e-09, + "loss": 0.6006, + "step": 1870 + }, + { + "epoch": 0.9978666666666667, + "grad_norm": 0.4019676815392459, + "learning_rate": 2.388912514017516e-09, + "loss": 0.5955, + "step": 1871 + }, + { + "epoch": 0.9984, + "grad_norm": 0.411973876920795, + "learning_rate": 1.3437656298687097e-09, + "loss": 0.5836, + "step": 1872 + }, + { + "epoch": 0.9989333333333333, + "grad_norm": 0.40402033416292676, + "learning_rate": 5.972299119250125e-10, + "loss": 0.6264, + "step": 1873 + }, + { + "epoch": 0.9994666666666666, + "grad_norm": 0.4217962070916454, + "learning_rate": 1.4930758944764479e-10, + "loss": 0.6045, + "step": 1874 + }, + { + "epoch": 1.0, + "grad_norm": 0.43905748046918114, + "learning_rate": 0.0, + "loss": 0.5911, + "step": 1875 + }, + { + "epoch": 1.0, + "step": 1875, + "total_flos": 1648395321212928.0, + "train_loss": 0.7008718222618103, + "train_runtime": 29232.1086, + "train_samples_per_second": 1.026, + "train_steps_per_second": 0.064 + } + ], + "logging_steps": 1.0, + "max_steps": 1875, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1648395321212928.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_3_epochs_1_GA_2_lora/README.md b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_3_epochs_1_GA_2_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_3_epochs_1_GA_2_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_3_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_3_epochs_1_GA_2_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2bb36c4f7e4b4040a327d36e4f85b8303dd65dc1 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_3_epochs_1_GA_2_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "up_proj", + "down_proj", + "q_proj", + "k_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8e2d30fcb7c279e1629e668c1054f6aa396dabe0 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd6c4e8627b9252cbe4113cddd5eba428b94fffef00805e5dd38c407823cddfe +size 671150064 diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_3_epochs_1_GA_2_lora/config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_3_epochs_1_GA_2_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_3_epochs_1_GA_2_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..a3adfe91a9dee1d52ca4ba9b0463fe85459763ca --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2589d852b36e90288eade3695331cd6aa472d4777b16091fd0f467268df7c71d +size 918507402 diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_3_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_3_epochs_1_GA_2_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..b60daf4758d2dabe7517094cbd315a70baa9f0fa --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_30000_repeat_3_epochs_1_GA_2_lora/trainer_state.json @@ -0,0 +1,13167 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 1875, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005333333333333334, + "grad_norm": 0.9530075362191299, + "learning_rate": 3.5087719298245615e-06, + "loss": 1.3589, + "step": 1 + }, + { + "epoch": 0.0010666666666666667, + "grad_norm": 1.1209233254545106, + "learning_rate": 7.017543859649123e-06, + "loss": 1.5026, + "step": 2 + }, + { + "epoch": 0.0016, + "grad_norm": 1.1647674798314256, + "learning_rate": 1.0526315789473684e-05, + "loss": 1.5517, + "step": 3 + }, + { + "epoch": 0.0021333333333333334, + "grad_norm": 1.2984126685547293, + "learning_rate": 1.4035087719298246e-05, + "loss": 1.4062, + "step": 4 + }, + { + "epoch": 0.0026666666666666666, + "grad_norm": 1.0496306752362106, + "learning_rate": 1.7543859649122806e-05, + "loss": 1.5315, + "step": 5 + }, + { + "epoch": 0.0032, + "grad_norm": 0.9148416499381024, + "learning_rate": 2.105263157894737e-05, + "loss": 1.399, + "step": 6 + }, + { + "epoch": 0.0037333333333333333, + "grad_norm": 0.8129126710040641, + "learning_rate": 2.456140350877193e-05, + "loss": 1.2746, + "step": 7 + }, + { + "epoch": 0.004266666666666667, + "grad_norm": 0.9055627148585419, + "learning_rate": 2.8070175438596492e-05, + "loss": 1.31, + "step": 8 + }, + { + "epoch": 0.0048, + "grad_norm": 0.8675338084750814, + "learning_rate": 3.157894736842105e-05, + "loss": 1.1451, + "step": 9 + }, + { + "epoch": 0.005333333333333333, + "grad_norm": 0.9104967299301532, + "learning_rate": 3.508771929824561e-05, + "loss": 1.1524, + "step": 10 + }, + { + "epoch": 0.005866666666666667, + "grad_norm": 0.8735803073190872, + "learning_rate": 3.859649122807018e-05, + "loss": 1.0964, + "step": 11 + }, + { + "epoch": 0.0064, + "grad_norm": 0.8812665889972949, + "learning_rate": 4.210526315789474e-05, + "loss": 1.0851, + "step": 12 + }, + { + "epoch": 0.006933333333333333, + "grad_norm": 0.9365546710795096, + "learning_rate": 4.56140350877193e-05, + "loss": 1.0147, + "step": 13 + }, + { + "epoch": 0.007466666666666667, + "grad_norm": 0.8118625054570385, + "learning_rate": 4.912280701754386e-05, + "loss": 0.965, + "step": 14 + }, + { + "epoch": 0.008, + "grad_norm": 0.9224027448538653, + "learning_rate": 5.2631578947368424e-05, + "loss": 1.0044, + "step": 15 + }, + { + "epoch": 0.008533333333333334, + "grad_norm": 0.8240090470074574, + "learning_rate": 5.6140350877192984e-05, + "loss": 1.0274, + "step": 16 + }, + { + "epoch": 0.009066666666666667, + "grad_norm": 0.6776694584147641, + "learning_rate": 5.9649122807017544e-05, + "loss": 0.9909, + "step": 17 + }, + { + "epoch": 0.0096, + "grad_norm": 0.5622462135521928, + "learning_rate": 6.31578947368421e-05, + "loss": 0.8621, + "step": 18 + }, + { + "epoch": 0.010133333333333333, + "grad_norm": 0.6494376384802008, + "learning_rate": 6.666666666666667e-05, + "loss": 0.9268, + "step": 19 + }, + { + "epoch": 0.010666666666666666, + "grad_norm": 0.4473607453727907, + "learning_rate": 7.017543859649122e-05, + "loss": 0.7989, + "step": 20 + }, + { + "epoch": 0.0112, + "grad_norm": 0.5970208228748919, + "learning_rate": 7.368421052631579e-05, + "loss": 0.8776, + "step": 21 + }, + { + "epoch": 0.011733333333333333, + "grad_norm": 0.5330213047566539, + "learning_rate": 7.719298245614036e-05, + "loss": 0.8361, + "step": 22 + }, + { + "epoch": 0.012266666666666667, + "grad_norm": 0.5647289108548379, + "learning_rate": 8.070175438596491e-05, + "loss": 0.8336, + "step": 23 + }, + { + "epoch": 0.0128, + "grad_norm": 0.5154584725991561, + "learning_rate": 8.421052631578948e-05, + "loss": 0.9474, + "step": 24 + }, + { + "epoch": 0.013333333333333334, + "grad_norm": 0.6485272082665403, + "learning_rate": 8.771929824561403e-05, + "loss": 0.9765, + "step": 25 + }, + { + "epoch": 0.013866666666666666, + "grad_norm": 0.5381159594051211, + "learning_rate": 9.12280701754386e-05, + "loss": 0.8683, + "step": 26 + }, + { + "epoch": 0.0144, + "grad_norm": 0.5204362272884198, + "learning_rate": 9.473684210526316e-05, + "loss": 0.8516, + "step": 27 + }, + { + "epoch": 0.014933333333333333, + "grad_norm": 0.640201554627158, + "learning_rate": 9.824561403508771e-05, + "loss": 0.9099, + "step": 28 + }, + { + "epoch": 0.015466666666666667, + "grad_norm": 0.5651120390420362, + "learning_rate": 0.0001017543859649123, + "loss": 0.8992, + "step": 29 + }, + { + "epoch": 0.016, + "grad_norm": 0.5202204909426368, + "learning_rate": 0.00010526315789473685, + "loss": 0.8774, + "step": 30 + }, + { + "epoch": 0.016533333333333334, + "grad_norm": 0.5210698306695373, + "learning_rate": 0.00010877192982456141, + "loss": 0.8541, + "step": 31 + }, + { + "epoch": 0.017066666666666667, + "grad_norm": 0.618852912966987, + "learning_rate": 0.00011228070175438597, + "loss": 0.8841, + "step": 32 + }, + { + "epoch": 0.0176, + "grad_norm": 0.5421728477298867, + "learning_rate": 0.00011578947368421053, + "loss": 0.8934, + "step": 33 + }, + { + "epoch": 0.018133333333333335, + "grad_norm": 0.5290161885606602, + "learning_rate": 0.00011929824561403509, + "loss": 0.9035, + "step": 34 + }, + { + "epoch": 0.018666666666666668, + "grad_norm": 0.8211442273789911, + "learning_rate": 0.00012280701754385965, + "loss": 0.9033, + "step": 35 + }, + { + "epoch": 0.0192, + "grad_norm": 0.5324475940959741, + "learning_rate": 0.0001263157894736842, + "loss": 0.8117, + "step": 36 + }, + { + "epoch": 0.019733333333333332, + "grad_norm": 0.46451991320560043, + "learning_rate": 0.0001298245614035088, + "loss": 0.8473, + "step": 37 + }, + { + "epoch": 0.020266666666666665, + "grad_norm": 0.4599769992852389, + "learning_rate": 0.00013333333333333334, + "loss": 0.8296, + "step": 38 + }, + { + "epoch": 0.0208, + "grad_norm": 0.5242429491517732, + "learning_rate": 0.0001368421052631579, + "loss": 0.8684, + "step": 39 + }, + { + "epoch": 0.021333333333333333, + "grad_norm": 0.5175210915839688, + "learning_rate": 0.00014035087719298245, + "loss": 0.9087, + "step": 40 + }, + { + "epoch": 0.021866666666666666, + "grad_norm": 0.4500331694359257, + "learning_rate": 0.00014385964912280703, + "loss": 0.8495, + "step": 41 + }, + { + "epoch": 0.0224, + "grad_norm": 0.5223537308179806, + "learning_rate": 0.00014736842105263158, + "loss": 0.8686, + "step": 42 + }, + { + "epoch": 0.022933333333333333, + "grad_norm": 0.48999253255749625, + "learning_rate": 0.00015087719298245616, + "loss": 0.7974, + "step": 43 + }, + { + "epoch": 0.023466666666666667, + "grad_norm": 0.5282601459712972, + "learning_rate": 0.0001543859649122807, + "loss": 0.851, + "step": 44 + }, + { + "epoch": 0.024, + "grad_norm": 0.4901323293268392, + "learning_rate": 0.00015789473684210527, + "loss": 0.8303, + "step": 45 + }, + { + "epoch": 0.024533333333333334, + "grad_norm": 0.5134194007789946, + "learning_rate": 0.00016140350877192982, + "loss": 0.9225, + "step": 46 + }, + { + "epoch": 0.025066666666666668, + "grad_norm": 0.5514875605483572, + "learning_rate": 0.0001649122807017544, + "loss": 0.862, + "step": 47 + }, + { + "epoch": 0.0256, + "grad_norm": 0.5473890815611594, + "learning_rate": 0.00016842105263157895, + "loss": 0.7422, + "step": 48 + }, + { + "epoch": 0.026133333333333335, + "grad_norm": 0.5145757379137041, + "learning_rate": 0.00017192982456140353, + "loss": 0.8397, + "step": 49 + }, + { + "epoch": 0.02666666666666667, + "grad_norm": 0.5882197077979645, + "learning_rate": 0.00017543859649122806, + "loss": 0.8466, + "step": 50 + }, + { + "epoch": 0.0272, + "grad_norm": 0.48284574637139754, + "learning_rate": 0.00017894736842105264, + "loss": 0.7566, + "step": 51 + }, + { + "epoch": 0.027733333333333332, + "grad_norm": 0.49211571789559033, + "learning_rate": 0.0001824561403508772, + "loss": 0.7974, + "step": 52 + }, + { + "epoch": 0.028266666666666666, + "grad_norm": 0.52322891431274, + "learning_rate": 0.00018596491228070177, + "loss": 0.8635, + "step": 53 + }, + { + "epoch": 0.0288, + "grad_norm": 0.4501147985707925, + "learning_rate": 0.00018947368421052632, + "loss": 0.8766, + "step": 54 + }, + { + "epoch": 0.029333333333333333, + "grad_norm": 0.48922557158979185, + "learning_rate": 0.00019298245614035088, + "loss": 0.8648, + "step": 55 + }, + { + "epoch": 0.029866666666666666, + "grad_norm": 0.546341643940991, + "learning_rate": 0.00019649122807017543, + "loss": 0.8634, + "step": 56 + }, + { + "epoch": 0.0304, + "grad_norm": 0.47936360056551997, + "learning_rate": 0.0002, + "loss": 0.8317, + "step": 57 + }, + { + "epoch": 0.030933333333333334, + "grad_norm": 0.45765672456621115, + "learning_rate": 0.00019999985069241055, + "loss": 0.8067, + "step": 58 + }, + { + "epoch": 0.031466666666666664, + "grad_norm": 0.5118061525014858, + "learning_rate": 0.00019999940277008808, + "loss": 0.7969, + "step": 59 + }, + { + "epoch": 0.032, + "grad_norm": 0.5145855756469134, + "learning_rate": 0.00019999865623437013, + "loss": 0.8368, + "step": 60 + }, + { + "epoch": 0.03253333333333333, + "grad_norm": 0.4784625296076965, + "learning_rate": 0.00019999761108748597, + "loss": 0.8651, + "step": 61 + }, + { + "epoch": 0.03306666666666667, + "grad_norm": 0.46151113581587216, + "learning_rate": 0.00019999626733255662, + "loss": 0.8007, + "step": 62 + }, + { + "epoch": 0.0336, + "grad_norm": 0.49175484535289365, + "learning_rate": 0.00019999462497359466, + "loss": 0.8093, + "step": 63 + }, + { + "epoch": 0.034133333333333335, + "grad_norm": 0.4074528589017721, + "learning_rate": 0.00019999268401550447, + "loss": 0.7676, + "step": 64 + }, + { + "epoch": 0.034666666666666665, + "grad_norm": 0.43776886250252567, + "learning_rate": 0.000199990444464082, + "loss": 0.8463, + "step": 65 + }, + { + "epoch": 0.0352, + "grad_norm": 0.5009560087554507, + "learning_rate": 0.00019998790632601496, + "loss": 0.8322, + "step": 66 + }, + { + "epoch": 0.03573333333333333, + "grad_norm": 0.4846580960169461, + "learning_rate": 0.00019998506960888256, + "loss": 0.7734, + "step": 67 + }, + { + "epoch": 0.03626666666666667, + "grad_norm": 0.5340418592386688, + "learning_rate": 0.00019998193432115572, + "loss": 0.746, + "step": 68 + }, + { + "epoch": 0.0368, + "grad_norm": 0.5775043682417678, + "learning_rate": 0.0001999785004721968, + "loss": 0.8457, + "step": 69 + }, + { + "epoch": 0.037333333333333336, + "grad_norm": 0.5016677566081483, + "learning_rate": 0.00019997476807225985, + "loss": 0.8604, + "step": 70 + }, + { + "epoch": 0.037866666666666667, + "grad_norm": 0.4397189251517687, + "learning_rate": 0.0001999707371324904, + "loss": 0.8256, + "step": 71 + }, + { + "epoch": 0.0384, + "grad_norm": 0.4900884380603229, + "learning_rate": 0.00019996640766492543, + "loss": 0.8567, + "step": 72 + }, + { + "epoch": 0.038933333333333334, + "grad_norm": 0.4580109565797004, + "learning_rate": 0.00019996177968249334, + "loss": 0.7163, + "step": 73 + }, + { + "epoch": 0.039466666666666664, + "grad_norm": 0.4586077587006553, + "learning_rate": 0.0001999568531990141, + "loss": 0.7916, + "step": 74 + }, + { + "epoch": 0.04, + "grad_norm": 0.4627256679086634, + "learning_rate": 0.00019995162822919883, + "loss": 0.7894, + "step": 75 + }, + { + "epoch": 0.04053333333333333, + "grad_norm": 0.4988403345065922, + "learning_rate": 0.00019994610478865011, + "loss": 0.8367, + "step": 76 + }, + { + "epoch": 0.04106666666666667, + "grad_norm": 0.5236286527623206, + "learning_rate": 0.0001999402828938618, + "loss": 0.8507, + "step": 77 + }, + { + "epoch": 0.0416, + "grad_norm": 0.6922765686729557, + "learning_rate": 0.00019993416256221895, + "loss": 0.8996, + "step": 78 + }, + { + "epoch": 0.042133333333333335, + "grad_norm": 0.4783234228874865, + "learning_rate": 0.00019992774381199778, + "loss": 0.7708, + "step": 79 + }, + { + "epoch": 0.042666666666666665, + "grad_norm": 0.612578763748735, + "learning_rate": 0.00019992102666236566, + "loss": 0.7545, + "step": 80 + }, + { + "epoch": 0.0432, + "grad_norm": 0.46203764584243073, + "learning_rate": 0.00019991401113338104, + "loss": 0.7887, + "step": 81 + }, + { + "epoch": 0.04373333333333333, + "grad_norm": 0.45244911162443047, + "learning_rate": 0.00019990669724599336, + "loss": 0.7744, + "step": 82 + }, + { + "epoch": 0.04426666666666667, + "grad_norm": 0.6566375324709214, + "learning_rate": 0.00019989908502204292, + "loss": 0.833, + "step": 83 + }, + { + "epoch": 0.0448, + "grad_norm": 0.5194199924365458, + "learning_rate": 0.00019989117448426108, + "loss": 0.8446, + "step": 84 + }, + { + "epoch": 0.04533333333333334, + "grad_norm": 0.45945184735452566, + "learning_rate": 0.00019988296565626987, + "loss": 0.8348, + "step": 85 + }, + { + "epoch": 0.04586666666666667, + "grad_norm": 0.46089092617175464, + "learning_rate": 0.00019987445856258206, + "loss": 0.7633, + "step": 86 + }, + { + "epoch": 0.0464, + "grad_norm": 0.47514977024471644, + "learning_rate": 0.00019986565322860115, + "loss": 0.8203, + "step": 87 + }, + { + "epoch": 0.046933333333333334, + "grad_norm": 0.6291425538885136, + "learning_rate": 0.00019985654968062122, + "loss": 0.7998, + "step": 88 + }, + { + "epoch": 0.047466666666666664, + "grad_norm": 0.549901839877944, + "learning_rate": 0.00019984714794582683, + "loss": 0.8657, + "step": 89 + }, + { + "epoch": 0.048, + "grad_norm": 0.46661448203093603, + "learning_rate": 0.00019983744805229296, + "loss": 0.8133, + "step": 90 + }, + { + "epoch": 0.04853333333333333, + "grad_norm": 0.459206129392748, + "learning_rate": 0.000199827450028985, + "loss": 0.8299, + "step": 91 + }, + { + "epoch": 0.04906666666666667, + "grad_norm": 0.5623178864601807, + "learning_rate": 0.00019981715390575858, + "loss": 0.8385, + "step": 92 + }, + { + "epoch": 0.0496, + "grad_norm": 0.45348731897178707, + "learning_rate": 0.00019980655971335945, + "loss": 0.7574, + "step": 93 + }, + { + "epoch": 0.050133333333333335, + "grad_norm": 0.535542349820639, + "learning_rate": 0.00019979566748342347, + "loss": 0.8661, + "step": 94 + }, + { + "epoch": 0.050666666666666665, + "grad_norm": 0.5022061293461977, + "learning_rate": 0.00019978447724847652, + "loss": 0.7923, + "step": 95 + }, + { + "epoch": 0.0512, + "grad_norm": 0.5202146438079589, + "learning_rate": 0.00019977298904193437, + "loss": 0.8313, + "step": 96 + }, + { + "epoch": 0.05173333333333333, + "grad_norm": 0.4201263667221269, + "learning_rate": 0.00019976120289810247, + "loss": 0.8469, + "step": 97 + }, + { + "epoch": 0.05226666666666667, + "grad_norm": 0.48233205307936633, + "learning_rate": 0.00019974911885217608, + "loss": 0.8151, + "step": 98 + }, + { + "epoch": 0.0528, + "grad_norm": 0.4671449281242293, + "learning_rate": 0.00019973673694024, + "loss": 0.8306, + "step": 99 + }, + { + "epoch": 0.05333333333333334, + "grad_norm": 0.48299374492785263, + "learning_rate": 0.0001997240571992685, + "loss": 0.8501, + "step": 100 + }, + { + "epoch": 0.05386666666666667, + "grad_norm": 0.48753971529645357, + "learning_rate": 0.00019971107966712518, + "loss": 0.7638, + "step": 101 + }, + { + "epoch": 0.0544, + "grad_norm": 0.4674860574247839, + "learning_rate": 0.00019969780438256293, + "loss": 0.7976, + "step": 102 + }, + { + "epoch": 0.054933333333333334, + "grad_norm": 0.6491059212719197, + "learning_rate": 0.0001996842313852238, + "loss": 0.8005, + "step": 103 + }, + { + "epoch": 0.055466666666666664, + "grad_norm": 0.4415398829830088, + "learning_rate": 0.00019967036071563877, + "loss": 0.7475, + "step": 104 + }, + { + "epoch": 0.056, + "grad_norm": 0.5024903122111678, + "learning_rate": 0.0001996561924152278, + "loss": 0.7964, + "step": 105 + }, + { + "epoch": 0.05653333333333333, + "grad_norm": 0.5094836435189203, + "learning_rate": 0.0001996417265262996, + "loss": 0.7924, + "step": 106 + }, + { + "epoch": 0.05706666666666667, + "grad_norm": 0.5046458098684509, + "learning_rate": 0.00019962696309205148, + "loss": 0.8139, + "step": 107 + }, + { + "epoch": 0.0576, + "grad_norm": 0.5319455233865822, + "learning_rate": 0.0001996119021565693, + "loss": 0.895, + "step": 108 + }, + { + "epoch": 0.058133333333333335, + "grad_norm": 0.5553777319473407, + "learning_rate": 0.0001995965437648273, + "loss": 0.8232, + "step": 109 + }, + { + "epoch": 0.058666666666666666, + "grad_norm": 0.43313878035714637, + "learning_rate": 0.00019958088796268793, + "loss": 0.7928, + "step": 110 + }, + { + "epoch": 0.0592, + "grad_norm": 0.4831267278735094, + "learning_rate": 0.0001995649347969019, + "loss": 0.7515, + "step": 111 + }, + { + "epoch": 0.05973333333333333, + "grad_norm": 0.5429097406906558, + "learning_rate": 0.00019954868431510764, + "loss": 0.8682, + "step": 112 + }, + { + "epoch": 0.06026666666666667, + "grad_norm": 0.44352321291184216, + "learning_rate": 0.00019953213656583168, + "loss": 0.8025, + "step": 113 + }, + { + "epoch": 0.0608, + "grad_norm": 0.4945646959168046, + "learning_rate": 0.00019951529159848805, + "loss": 0.8142, + "step": 114 + }, + { + "epoch": 0.06133333333333333, + "grad_norm": 0.48152743089150607, + "learning_rate": 0.00019949814946337838, + "loss": 0.8533, + "step": 115 + }, + { + "epoch": 0.06186666666666667, + "grad_norm": 0.5239290869484868, + "learning_rate": 0.00019948071021169174, + "loss": 0.8501, + "step": 116 + }, + { + "epoch": 0.0624, + "grad_norm": 0.5108716008294788, + "learning_rate": 0.00019946297389550433, + "loss": 0.8221, + "step": 117 + }, + { + "epoch": 0.06293333333333333, + "grad_norm": 0.5882597720261666, + "learning_rate": 0.00019944494056777946, + "loss": 0.7737, + "step": 118 + }, + { + "epoch": 0.06346666666666667, + "grad_norm": 0.5275886823336764, + "learning_rate": 0.00019942661028236745, + "loss": 0.7448, + "step": 119 + }, + { + "epoch": 0.064, + "grad_norm": 0.4512469637764529, + "learning_rate": 0.00019940798309400526, + "loss": 0.752, + "step": 120 + }, + { + "epoch": 0.06453333333333333, + "grad_norm": 0.6026573460803222, + "learning_rate": 0.00019938905905831654, + "loss": 0.8495, + "step": 121 + }, + { + "epoch": 0.06506666666666666, + "grad_norm": 0.4621277101738424, + "learning_rate": 0.00019936983823181132, + "loss": 0.7959, + "step": 122 + }, + { + "epoch": 0.0656, + "grad_norm": 0.5248664374789274, + "learning_rate": 0.0001993503206718859, + "loss": 0.8501, + "step": 123 + }, + { + "epoch": 0.06613333333333334, + "grad_norm": 0.5040839994674604, + "learning_rate": 0.00019933050643682269, + "loss": 0.7992, + "step": 124 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 0.4824258987952713, + "learning_rate": 0.00019931039558578997, + "loss": 0.7573, + "step": 125 + }, + { + "epoch": 0.0672, + "grad_norm": 0.44725470330345835, + "learning_rate": 0.00019928998817884182, + "loss": 0.7947, + "step": 126 + }, + { + "epoch": 0.06773333333333334, + "grad_norm": 0.47105156541524995, + "learning_rate": 0.00019926928427691786, + "loss": 0.8229, + "step": 127 + }, + { + "epoch": 0.06826666666666667, + "grad_norm": 0.4933021592132359, + "learning_rate": 0.00019924828394184306, + "loss": 0.8778, + "step": 128 + }, + { + "epoch": 0.0688, + "grad_norm": 0.45689750144583013, + "learning_rate": 0.00019922698723632767, + "loss": 0.7978, + "step": 129 + }, + { + "epoch": 0.06933333333333333, + "grad_norm": 0.4567165648731524, + "learning_rate": 0.0001992053942239668, + "loss": 0.8177, + "step": 130 + }, + { + "epoch": 0.06986666666666666, + "grad_norm": 0.5683715743251585, + "learning_rate": 0.0001991835049692405, + "loss": 0.8763, + "step": 131 + }, + { + "epoch": 0.0704, + "grad_norm": 0.44138942802531556, + "learning_rate": 0.00019916131953751342, + "loss": 0.737, + "step": 132 + }, + { + "epoch": 0.07093333333333333, + "grad_norm": 0.5293349882470612, + "learning_rate": 0.0001991388379950346, + "loss": 0.7573, + "step": 133 + }, + { + "epoch": 0.07146666666666666, + "grad_norm": 0.55464443039697, + "learning_rate": 0.0001991160604089374, + "loss": 0.879, + "step": 134 + }, + { + "epoch": 0.072, + "grad_norm": 0.5069743884549222, + "learning_rate": 0.00019909298684723904, + "loss": 0.7757, + "step": 135 + }, + { + "epoch": 0.07253333333333334, + "grad_norm": 0.4737485373103175, + "learning_rate": 0.00019906961737884077, + "loss": 0.7917, + "step": 136 + }, + { + "epoch": 0.07306666666666667, + "grad_norm": 0.4426742568055296, + "learning_rate": 0.00019904595207352737, + "loss": 0.7931, + "step": 137 + }, + { + "epoch": 0.0736, + "grad_norm": 0.4836372167456052, + "learning_rate": 0.00019902199100196697, + "loss": 0.7804, + "step": 138 + }, + { + "epoch": 0.07413333333333333, + "grad_norm": 0.649942284751794, + "learning_rate": 0.000198997734235711, + "loss": 0.7834, + "step": 139 + }, + { + "epoch": 0.07466666666666667, + "grad_norm": 0.4783968078909489, + "learning_rate": 0.00019897318184719385, + "loss": 0.7234, + "step": 140 + }, + { + "epoch": 0.0752, + "grad_norm": 0.5941916056960217, + "learning_rate": 0.00019894833390973266, + "loss": 0.8439, + "step": 141 + }, + { + "epoch": 0.07573333333333333, + "grad_norm": 0.5213384394980335, + "learning_rate": 0.0001989231904975272, + "loss": 0.7826, + "step": 142 + }, + { + "epoch": 0.07626666666666666, + "grad_norm": 0.4231337229376453, + "learning_rate": 0.00019889775168565943, + "loss": 0.7664, + "step": 143 + }, + { + "epoch": 0.0768, + "grad_norm": 0.497995175665305, + "learning_rate": 0.00019887201755009357, + "loss": 0.8113, + "step": 144 + }, + { + "epoch": 0.07733333333333334, + "grad_norm": 0.40749924047351943, + "learning_rate": 0.00019884598816767563, + "loss": 0.7408, + "step": 145 + }, + { + "epoch": 0.07786666666666667, + "grad_norm": 0.4534240521727463, + "learning_rate": 0.0001988196636161333, + "loss": 0.7769, + "step": 146 + }, + { + "epoch": 0.0784, + "grad_norm": 0.4344782410376436, + "learning_rate": 0.0001987930439740757, + "loss": 0.7083, + "step": 147 + }, + { + "epoch": 0.07893333333333333, + "grad_norm": 0.4854700699850418, + "learning_rate": 0.00019876612932099308, + "loss": 0.8423, + "step": 148 + }, + { + "epoch": 0.07946666666666667, + "grad_norm": 0.48673791357084556, + "learning_rate": 0.0001987389197372567, + "loss": 0.824, + "step": 149 + }, + { + "epoch": 0.08, + "grad_norm": 0.40601082970046964, + "learning_rate": 0.00019871141530411853, + "loss": 0.7531, + "step": 150 + }, + { + "epoch": 0.08053333333333333, + "grad_norm": 0.45113518119875684, + "learning_rate": 0.00019868361610371097, + "loss": 0.7536, + "step": 151 + }, + { + "epoch": 0.08106666666666666, + "grad_norm": 0.4554053402986199, + "learning_rate": 0.00019865552221904665, + "loss": 0.731, + "step": 152 + }, + { + "epoch": 0.0816, + "grad_norm": 0.4662146968116339, + "learning_rate": 0.0001986271337340182, + "loss": 0.7935, + "step": 153 + }, + { + "epoch": 0.08213333333333334, + "grad_norm": 0.4695700873307507, + "learning_rate": 0.00019859845073339787, + "loss": 0.8181, + "step": 154 + }, + { + "epoch": 0.08266666666666667, + "grad_norm": 0.5550879597141647, + "learning_rate": 0.00019856947330283752, + "loss": 0.8821, + "step": 155 + }, + { + "epoch": 0.0832, + "grad_norm": 0.47521271819956, + "learning_rate": 0.00019854020152886814, + "loss": 0.7992, + "step": 156 + }, + { + "epoch": 0.08373333333333334, + "grad_norm": 0.47136209486211345, + "learning_rate": 0.0001985106354988997, + "loss": 0.8273, + "step": 157 + }, + { + "epoch": 0.08426666666666667, + "grad_norm": 0.43887650969681197, + "learning_rate": 0.00019848077530122083, + "loss": 0.779, + "step": 158 + }, + { + "epoch": 0.0848, + "grad_norm": 0.5143492554378482, + "learning_rate": 0.0001984506210249986, + "loss": 0.7945, + "step": 159 + }, + { + "epoch": 0.08533333333333333, + "grad_norm": 0.44170754373898863, + "learning_rate": 0.00019842017276027832, + "loss": 0.8181, + "step": 160 + }, + { + "epoch": 0.08586666666666666, + "grad_norm": 0.45954856452138815, + "learning_rate": 0.00019838943059798304, + "loss": 0.7921, + "step": 161 + }, + { + "epoch": 0.0864, + "grad_norm": 0.48147848641435625, + "learning_rate": 0.00019835839462991361, + "loss": 0.8148, + "step": 162 + }, + { + "epoch": 0.08693333333333333, + "grad_norm": 0.44842730592611174, + "learning_rate": 0.0001983270649487481, + "loss": 0.7334, + "step": 163 + }, + { + "epoch": 0.08746666666666666, + "grad_norm": 0.4508095645687276, + "learning_rate": 0.0001982954416480417, + "loss": 0.7911, + "step": 164 + }, + { + "epoch": 0.088, + "grad_norm": 0.46608885480827533, + "learning_rate": 0.00019826352482222638, + "loss": 0.8108, + "step": 165 + }, + { + "epoch": 0.08853333333333334, + "grad_norm": 0.45877070958739413, + "learning_rate": 0.00019823131456661063, + "loss": 0.7783, + "step": 166 + }, + { + "epoch": 0.08906666666666667, + "grad_norm": 0.4304921622882453, + "learning_rate": 0.00019819881097737915, + "loss": 0.7841, + "step": 167 + }, + { + "epoch": 0.0896, + "grad_norm": 0.4366462173180608, + "learning_rate": 0.00019816601415159263, + "loss": 0.7464, + "step": 168 + }, + { + "epoch": 0.09013333333333333, + "grad_norm": 0.43983667199720133, + "learning_rate": 0.00019813292418718732, + "loss": 0.7576, + "step": 169 + }, + { + "epoch": 0.09066666666666667, + "grad_norm": 0.41920982657832667, + "learning_rate": 0.0001980995411829749, + "loss": 0.7612, + "step": 170 + }, + { + "epoch": 0.0912, + "grad_norm": 0.5046837642313663, + "learning_rate": 0.0001980658652386421, + "loss": 0.6914, + "step": 171 + }, + { + "epoch": 0.09173333333333333, + "grad_norm": 0.5634990573175862, + "learning_rate": 0.0001980318964547504, + "loss": 0.798, + "step": 172 + }, + { + "epoch": 0.09226666666666666, + "grad_norm": 0.4873656252594583, + "learning_rate": 0.0001979976349327357, + "loss": 0.8175, + "step": 173 + }, + { + "epoch": 0.0928, + "grad_norm": 0.47219333626555077, + "learning_rate": 0.00019796308077490817, + "loss": 0.8465, + "step": 174 + }, + { + "epoch": 0.09333333333333334, + "grad_norm": 0.46148219457852263, + "learning_rate": 0.00019792823408445174, + "loss": 0.792, + "step": 175 + }, + { + "epoch": 0.09386666666666667, + "grad_norm": 0.41285793437442386, + "learning_rate": 0.0001978930949654239, + "loss": 0.6803, + "step": 176 + }, + { + "epoch": 0.0944, + "grad_norm": 0.5131710262594903, + "learning_rate": 0.00019785766352275542, + "loss": 0.8314, + "step": 177 + }, + { + "epoch": 0.09493333333333333, + "grad_norm": 0.4061201541559843, + "learning_rate": 0.00019782193986224995, + "loss": 0.795, + "step": 178 + }, + { + "epoch": 0.09546666666666667, + "grad_norm": 0.6144398063126736, + "learning_rate": 0.00019778592409058378, + "loss": 0.8947, + "step": 179 + }, + { + "epoch": 0.096, + "grad_norm": 0.48165864832622596, + "learning_rate": 0.00019774961631530545, + "loss": 0.7788, + "step": 180 + }, + { + "epoch": 0.09653333333333333, + "grad_norm": 0.5924083535442879, + "learning_rate": 0.0001977130166448355, + "loss": 0.7752, + "step": 181 + }, + { + "epoch": 0.09706666666666666, + "grad_norm": 0.4606648845819176, + "learning_rate": 0.00019767612518846608, + "loss": 0.8138, + "step": 182 + }, + { + "epoch": 0.0976, + "grad_norm": 0.42228257086642856, + "learning_rate": 0.00019763894205636072, + "loss": 0.797, + "step": 183 + }, + { + "epoch": 0.09813333333333334, + "grad_norm": 0.4761085936785347, + "learning_rate": 0.00019760146735955388, + "loss": 0.7721, + "step": 184 + }, + { + "epoch": 0.09866666666666667, + "grad_norm": 0.6890945548578655, + "learning_rate": 0.00019756370120995066, + "loss": 0.9284, + "step": 185 + }, + { + "epoch": 0.0992, + "grad_norm": 0.4251932131230442, + "learning_rate": 0.00019752564372032657, + "loss": 0.7526, + "step": 186 + }, + { + "epoch": 0.09973333333333333, + "grad_norm": 0.47175939719829446, + "learning_rate": 0.000197487295004327, + "loss": 0.8705, + "step": 187 + }, + { + "epoch": 0.10026666666666667, + "grad_norm": 0.40835561208500687, + "learning_rate": 0.00019744865517646706, + "loss": 0.7245, + "step": 188 + }, + { + "epoch": 0.1008, + "grad_norm": 0.46055583539677014, + "learning_rate": 0.00019740972435213115, + "loss": 0.8185, + "step": 189 + }, + { + "epoch": 0.10133333333333333, + "grad_norm": 0.3937627480015215, + "learning_rate": 0.0001973705026475726, + "loss": 0.7648, + "step": 190 + }, + { + "epoch": 0.10186666666666666, + "grad_norm": 0.5247079534589798, + "learning_rate": 0.00019733099017991341, + "loss": 0.8725, + "step": 191 + }, + { + "epoch": 0.1024, + "grad_norm": 0.39572320717463216, + "learning_rate": 0.00019729118706714375, + "loss": 0.8015, + "step": 192 + }, + { + "epoch": 0.10293333333333334, + "grad_norm": 0.38594413201314715, + "learning_rate": 0.0001972510934281218, + "loss": 0.7124, + "step": 193 + }, + { + "epoch": 0.10346666666666667, + "grad_norm": 0.47713898115659037, + "learning_rate": 0.00019721070938257324, + "loss": 0.7909, + "step": 194 + }, + { + "epoch": 0.104, + "grad_norm": 0.4402783065445849, + "learning_rate": 0.00019717003505109095, + "loss": 0.7051, + "step": 195 + }, + { + "epoch": 0.10453333333333334, + "grad_norm": 0.4810832587373293, + "learning_rate": 0.0001971290705551347, + "loss": 0.7395, + "step": 196 + }, + { + "epoch": 0.10506666666666667, + "grad_norm": 0.49793847135263325, + "learning_rate": 0.00019708781601703065, + "loss": 0.8246, + "step": 197 + }, + { + "epoch": 0.1056, + "grad_norm": 0.5163620957941052, + "learning_rate": 0.00019704627155997108, + "loss": 0.7877, + "step": 198 + }, + { + "epoch": 0.10613333333333333, + "grad_norm": 0.47559073746323993, + "learning_rate": 0.00019700443730801413, + "loss": 0.7746, + "step": 199 + }, + { + "epoch": 0.10666666666666667, + "grad_norm": 0.5135088027783049, + "learning_rate": 0.00019696231338608316, + "loss": 0.784, + "step": 200 + }, + { + "epoch": 0.1072, + "grad_norm": 0.50758928100271, + "learning_rate": 0.00019691989991996663, + "loss": 0.7726, + "step": 201 + }, + { + "epoch": 0.10773333333333333, + "grad_norm": 0.49973486643233367, + "learning_rate": 0.00019687719703631755, + "loss": 0.7804, + "step": 202 + }, + { + "epoch": 0.10826666666666666, + "grad_norm": 0.49498188569913276, + "learning_rate": 0.00019683420486265327, + "loss": 0.7357, + "step": 203 + }, + { + "epoch": 0.1088, + "grad_norm": 0.4848853436284693, + "learning_rate": 0.0001967909235273549, + "loss": 0.7357, + "step": 204 + }, + { + "epoch": 0.10933333333333334, + "grad_norm": 0.6515538005832042, + "learning_rate": 0.0001967473531596671, + "loss": 0.7494, + "step": 205 + }, + { + "epoch": 0.10986666666666667, + "grad_norm": 0.4230006960565803, + "learning_rate": 0.0001967034938896976, + "loss": 0.7791, + "step": 206 + }, + { + "epoch": 0.1104, + "grad_norm": 0.5197799537090696, + "learning_rate": 0.00019665934584841682, + "loss": 0.8335, + "step": 207 + }, + { + "epoch": 0.11093333333333333, + "grad_norm": 0.5693892249171181, + "learning_rate": 0.0001966149091676575, + "loss": 0.7894, + "step": 208 + }, + { + "epoch": 0.11146666666666667, + "grad_norm": 0.5143451510978614, + "learning_rate": 0.00019657018398011434, + "loss": 0.7676, + "step": 209 + }, + { + "epoch": 0.112, + "grad_norm": 0.563024349286294, + "learning_rate": 0.00019652517041934356, + "loss": 0.8176, + "step": 210 + }, + { + "epoch": 0.11253333333333333, + "grad_norm": 0.4834689700808916, + "learning_rate": 0.00019647986861976246, + "loss": 0.8119, + "step": 211 + }, + { + "epoch": 0.11306666666666666, + "grad_norm": 0.47788521778441795, + "learning_rate": 0.0001964342787166491, + "loss": 0.7896, + "step": 212 + }, + { + "epoch": 0.1136, + "grad_norm": 0.4739362949248575, + "learning_rate": 0.00019638840084614182, + "loss": 0.8031, + "step": 213 + }, + { + "epoch": 0.11413333333333334, + "grad_norm": 0.4557070068611579, + "learning_rate": 0.0001963422351452389, + "loss": 0.7562, + "step": 214 + }, + { + "epoch": 0.11466666666666667, + "grad_norm": 0.5123175930425836, + "learning_rate": 0.0001962957817517982, + "loss": 0.7483, + "step": 215 + }, + { + "epoch": 0.1152, + "grad_norm": 0.5098661037143992, + "learning_rate": 0.00019624904080453655, + "loss": 0.8108, + "step": 216 + }, + { + "epoch": 0.11573333333333333, + "grad_norm": 0.5360911309354575, + "learning_rate": 0.00019620201244302952, + "loss": 0.7999, + "step": 217 + }, + { + "epoch": 0.11626666666666667, + "grad_norm": 0.4940892054415995, + "learning_rate": 0.00019615469680771096, + "loss": 0.7901, + "step": 218 + }, + { + "epoch": 0.1168, + "grad_norm": 0.5224086792151261, + "learning_rate": 0.00019610709403987246, + "loss": 0.8482, + "step": 219 + }, + { + "epoch": 0.11733333333333333, + "grad_norm": 0.4732769215032251, + "learning_rate": 0.00019605920428166323, + "loss": 0.7775, + "step": 220 + }, + { + "epoch": 0.11786666666666666, + "grad_norm": 0.4808299578109734, + "learning_rate": 0.00019601102767608923, + "loss": 0.8253, + "step": 221 + }, + { + "epoch": 0.1184, + "grad_norm": 0.4802019444834149, + "learning_rate": 0.00019596256436701324, + "loss": 0.8379, + "step": 222 + }, + { + "epoch": 0.11893333333333334, + "grad_norm": 0.5681970582081078, + "learning_rate": 0.00019591381449915397, + "loss": 0.8801, + "step": 223 + }, + { + "epoch": 0.11946666666666667, + "grad_norm": 0.5489033879490961, + "learning_rate": 0.00019586477821808597, + "loss": 0.7569, + "step": 224 + }, + { + "epoch": 0.12, + "grad_norm": 0.5026972332593719, + "learning_rate": 0.000195815455670239, + "loss": 0.6953, + "step": 225 + }, + { + "epoch": 0.12053333333333334, + "grad_norm": 0.518880954784583, + "learning_rate": 0.00019576584700289768, + "loss": 0.8101, + "step": 226 + }, + { + "epoch": 0.12106666666666667, + "grad_norm": 0.45054964365051964, + "learning_rate": 0.00019571595236420102, + "loss": 0.7742, + "step": 227 + }, + { + "epoch": 0.1216, + "grad_norm": 0.459553611277924, + "learning_rate": 0.00019566577190314197, + "loss": 0.741, + "step": 228 + }, + { + "epoch": 0.12213333333333333, + "grad_norm": 0.5392204029433525, + "learning_rate": 0.00019561530576956703, + "loss": 0.7943, + "step": 229 + }, + { + "epoch": 0.12266666666666666, + "grad_norm": 0.46763358522659854, + "learning_rate": 0.00019556455411417573, + "loss": 0.7506, + "step": 230 + }, + { + "epoch": 0.1232, + "grad_norm": 0.48796001912296344, + "learning_rate": 0.0001955135170885202, + "loss": 0.7885, + "step": 231 + }, + { + "epoch": 0.12373333333333333, + "grad_norm": 0.41612431290378915, + "learning_rate": 0.00019546219484500475, + "loss": 0.7818, + "step": 232 + }, + { + "epoch": 0.12426666666666666, + "grad_norm": 0.4170485922941037, + "learning_rate": 0.00019541058753688538, + "loss": 0.8096, + "step": 233 + }, + { + "epoch": 0.1248, + "grad_norm": 0.40575526010535407, + "learning_rate": 0.00019535869531826937, + "loss": 0.7689, + "step": 234 + }, + { + "epoch": 0.12533333333333332, + "grad_norm": 0.43631370693190186, + "learning_rate": 0.00019530651834411474, + "loss": 0.7512, + "step": 235 + }, + { + "epoch": 0.12586666666666665, + "grad_norm": 0.4549159681585235, + "learning_rate": 0.00019525405677022989, + "loss": 0.8294, + "step": 236 + }, + { + "epoch": 0.1264, + "grad_norm": 0.47339024015309783, + "learning_rate": 0.00019520131075327298, + "loss": 0.846, + "step": 237 + }, + { + "epoch": 0.12693333333333334, + "grad_norm": 0.43902206036413266, + "learning_rate": 0.0001951482804507517, + "loss": 0.7381, + "step": 238 + }, + { + "epoch": 0.12746666666666667, + "grad_norm": 0.46220739725013227, + "learning_rate": 0.00019509496602102252, + "loss": 0.7356, + "step": 239 + }, + { + "epoch": 0.128, + "grad_norm": 0.5516968267415471, + "learning_rate": 0.00019504136762329047, + "loss": 0.8354, + "step": 240 + }, + { + "epoch": 0.12853333333333333, + "grad_norm": 0.48050633352830363, + "learning_rate": 0.00019498748541760846, + "loss": 0.8231, + "step": 241 + }, + { + "epoch": 0.12906666666666666, + "grad_norm": 0.4853375372865061, + "learning_rate": 0.0001949333195648769, + "loss": 0.8271, + "step": 242 + }, + { + "epoch": 0.1296, + "grad_norm": 0.44556603279849505, + "learning_rate": 0.00019487887022684336, + "loss": 0.8374, + "step": 243 + }, + { + "epoch": 0.13013333333333332, + "grad_norm": 0.4719602233654268, + "learning_rate": 0.00019482413756610173, + "loss": 0.7445, + "step": 244 + }, + { + "epoch": 0.13066666666666665, + "grad_norm": 0.4061586873518868, + "learning_rate": 0.0001947691217460921, + "loss": 0.7623, + "step": 245 + }, + { + "epoch": 0.1312, + "grad_norm": 0.4364473795948971, + "learning_rate": 0.00019471382293110003, + "loss": 0.7568, + "step": 246 + }, + { + "epoch": 0.13173333333333334, + "grad_norm": 0.5127348804567465, + "learning_rate": 0.00019465824128625617, + "loss": 0.7951, + "step": 247 + }, + { + "epoch": 0.13226666666666667, + "grad_norm": 0.4370678397395421, + "learning_rate": 0.00019460237697753577, + "loss": 0.7307, + "step": 248 + }, + { + "epoch": 0.1328, + "grad_norm": 0.4802652726484155, + "learning_rate": 0.00019454623017175812, + "loss": 0.7772, + "step": 249 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 0.4716996473593707, + "learning_rate": 0.00019448980103658613, + "loss": 0.8224, + "step": 250 + }, + { + "epoch": 0.13386666666666666, + "grad_norm": 0.45179297458376827, + "learning_rate": 0.0001944330897405257, + "loss": 0.7197, + "step": 251 + }, + { + "epoch": 0.1344, + "grad_norm": 0.6256760091274871, + "learning_rate": 0.00019437609645292546, + "loss": 0.8568, + "step": 252 + }, + { + "epoch": 0.13493333333333332, + "grad_norm": 0.40652398308119203, + "learning_rate": 0.00019431882134397598, + "loss": 0.7612, + "step": 253 + }, + { + "epoch": 0.13546666666666668, + "grad_norm": 0.39826248207703735, + "learning_rate": 0.00019426126458470936, + "loss": 0.697, + "step": 254 + }, + { + "epoch": 0.136, + "grad_norm": 0.5388984002348581, + "learning_rate": 0.0001942034263469989, + "loss": 0.7699, + "step": 255 + }, + { + "epoch": 0.13653333333333334, + "grad_norm": 0.4894846768153503, + "learning_rate": 0.00019414530680355837, + "loss": 0.8185, + "step": 256 + }, + { + "epoch": 0.13706666666666667, + "grad_norm": 0.47452035013179233, + "learning_rate": 0.00019408690612794148, + "loss": 0.7656, + "step": 257 + }, + { + "epoch": 0.1376, + "grad_norm": 0.4638104459253293, + "learning_rate": 0.00019402822449454153, + "loss": 0.7817, + "step": 258 + }, + { + "epoch": 0.13813333333333333, + "grad_norm": 0.48208730761737223, + "learning_rate": 0.00019396926207859084, + "loss": 0.764, + "step": 259 + }, + { + "epoch": 0.13866666666666666, + "grad_norm": 0.404761226193334, + "learning_rate": 0.0001939100190561601, + "loss": 0.6982, + "step": 260 + }, + { + "epoch": 0.1392, + "grad_norm": 0.4453259656128742, + "learning_rate": 0.00019385049560415794, + "loss": 0.8025, + "step": 261 + }, + { + "epoch": 0.13973333333333332, + "grad_norm": 0.5874041451444244, + "learning_rate": 0.0001937906919003304, + "loss": 0.7778, + "step": 262 + }, + { + "epoch": 0.14026666666666668, + "grad_norm": 0.48081228484114513, + "learning_rate": 0.00019373060812326052, + "loss": 0.834, + "step": 263 + }, + { + "epoch": 0.1408, + "grad_norm": 0.4489761686206763, + "learning_rate": 0.00019367024445236754, + "loss": 0.7639, + "step": 264 + }, + { + "epoch": 0.14133333333333334, + "grad_norm": 0.44121347639841846, + "learning_rate": 0.00019360960106790643, + "loss": 0.7254, + "step": 265 + }, + { + "epoch": 0.14186666666666667, + "grad_norm": 0.5458503844312422, + "learning_rate": 0.0001935486781509677, + "loss": 0.796, + "step": 266 + }, + { + "epoch": 0.1424, + "grad_norm": 0.4292348510426145, + "learning_rate": 0.00019348747588347637, + "loss": 0.7528, + "step": 267 + }, + { + "epoch": 0.14293333333333333, + "grad_norm": 0.5601307983573015, + "learning_rate": 0.00019342599444819168, + "loss": 0.7965, + "step": 268 + }, + { + "epoch": 0.14346666666666666, + "grad_norm": 0.4661341927779982, + "learning_rate": 0.00019336423402870653, + "loss": 0.8204, + "step": 269 + }, + { + "epoch": 0.144, + "grad_norm": 0.44290954581930936, + "learning_rate": 0.00019330219480944694, + "loss": 0.7235, + "step": 270 + }, + { + "epoch": 0.14453333333333335, + "grad_norm": 0.5217928965079981, + "learning_rate": 0.0001932398769756714, + "loss": 0.7719, + "step": 271 + }, + { + "epoch": 0.14506666666666668, + "grad_norm": 0.515952008050984, + "learning_rate": 0.0001931772807134704, + "loss": 0.8093, + "step": 272 + }, + { + "epoch": 0.1456, + "grad_norm": 0.4707600737870565, + "learning_rate": 0.00019311440620976597, + "loss": 0.7332, + "step": 273 + }, + { + "epoch": 0.14613333333333334, + "grad_norm": 0.45512111329432015, + "learning_rate": 0.00019305125365231084, + "loss": 0.7657, + "step": 274 + }, + { + "epoch": 0.14666666666666667, + "grad_norm": 0.49386683069536635, + "learning_rate": 0.00019298782322968815, + "loss": 0.7748, + "step": 275 + }, + { + "epoch": 0.1472, + "grad_norm": 0.5297188316292964, + "learning_rate": 0.0001929241151313108, + "loss": 0.8383, + "step": 276 + }, + { + "epoch": 0.14773333333333333, + "grad_norm": 0.4755481145022749, + "learning_rate": 0.0001928601295474208, + "loss": 0.8238, + "step": 277 + }, + { + "epoch": 0.14826666666666666, + "grad_norm": 0.49557700110676095, + "learning_rate": 0.00019279586666908884, + "loss": 0.7817, + "step": 278 + }, + { + "epoch": 0.1488, + "grad_norm": 0.46635936642688935, + "learning_rate": 0.00019273132668821364, + "loss": 0.7978, + "step": 279 + }, + { + "epoch": 0.14933333333333335, + "grad_norm": 0.4289128821422162, + "learning_rate": 0.00019266650979752136, + "loss": 0.7138, + "step": 280 + }, + { + "epoch": 0.14986666666666668, + "grad_norm": 0.48738314609520644, + "learning_rate": 0.00019260141619056507, + "loss": 0.7596, + "step": 281 + }, + { + "epoch": 0.1504, + "grad_norm": 0.4494836479899092, + "learning_rate": 0.00019253604606172417, + "loss": 0.7568, + "step": 282 + }, + { + "epoch": 0.15093333333333334, + "grad_norm": 0.45960464062943973, + "learning_rate": 0.0001924703996062038, + "loss": 0.7752, + "step": 283 + }, + { + "epoch": 0.15146666666666667, + "grad_norm": 0.43800030980808136, + "learning_rate": 0.0001924044770200342, + "loss": 0.7668, + "step": 284 + }, + { + "epoch": 0.152, + "grad_norm": 0.5008451762030025, + "learning_rate": 0.00019233827850007027, + "loss": 0.8096, + "step": 285 + }, + { + "epoch": 0.15253333333333333, + "grad_norm": 0.4448951923072958, + "learning_rate": 0.0001922718042439908, + "loss": 0.7807, + "step": 286 + }, + { + "epoch": 0.15306666666666666, + "grad_norm": 0.5348016616275901, + "learning_rate": 0.000192205054450298, + "loss": 0.8275, + "step": 287 + }, + { + "epoch": 0.1536, + "grad_norm": 0.4475826890866102, + "learning_rate": 0.00019213802931831696, + "loss": 0.8464, + "step": 288 + }, + { + "epoch": 0.15413333333333334, + "grad_norm": 0.4516093386718875, + "learning_rate": 0.00019207072904819486, + "loss": 0.7338, + "step": 289 + }, + { + "epoch": 0.15466666666666667, + "grad_norm": 0.423419919326598, + "learning_rate": 0.00019200315384090044, + "loss": 0.7429, + "step": 290 + }, + { + "epoch": 0.1552, + "grad_norm": 0.40929044163186284, + "learning_rate": 0.00019193530389822363, + "loss": 0.7629, + "step": 291 + }, + { + "epoch": 0.15573333333333333, + "grad_norm": 0.3915640933466481, + "learning_rate": 0.00019186717942277462, + "loss": 0.6816, + "step": 292 + }, + { + "epoch": 0.15626666666666666, + "grad_norm": 0.45028124322231367, + "learning_rate": 0.00019179878061798347, + "loss": 0.7466, + "step": 293 + }, + { + "epoch": 0.1568, + "grad_norm": 0.44664051619568207, + "learning_rate": 0.00019173010768809933, + "loss": 0.7082, + "step": 294 + }, + { + "epoch": 0.15733333333333333, + "grad_norm": 0.4318842557767651, + "learning_rate": 0.00019166116083819002, + "loss": 0.7901, + "step": 295 + }, + { + "epoch": 0.15786666666666666, + "grad_norm": 0.419089019477868, + "learning_rate": 0.00019159194027414128, + "loss": 0.7535, + "step": 296 + }, + { + "epoch": 0.1584, + "grad_norm": 0.4746626817050296, + "learning_rate": 0.0001915224462026563, + "loss": 0.8016, + "step": 297 + }, + { + "epoch": 0.15893333333333334, + "grad_norm": 0.5561412121845948, + "learning_rate": 0.00019145267883125482, + "loss": 0.7921, + "step": 298 + }, + { + "epoch": 0.15946666666666667, + "grad_norm": 0.48779717445322235, + "learning_rate": 0.00019138263836827288, + "loss": 0.8115, + "step": 299 + }, + { + "epoch": 0.16, + "grad_norm": 0.5179134634333581, + "learning_rate": 0.00019131232502286188, + "loss": 0.8573, + "step": 300 + }, + { + "epoch": 0.16053333333333333, + "grad_norm": 0.5164662637840521, + "learning_rate": 0.00019124173900498818, + "loss": 0.7585, + "step": 301 + }, + { + "epoch": 0.16106666666666666, + "grad_norm": 0.5557415194491732, + "learning_rate": 0.00019117088052543233, + "loss": 0.7639, + "step": 302 + }, + { + "epoch": 0.1616, + "grad_norm": 0.45507954671227385, + "learning_rate": 0.0001910997497957885, + "loss": 0.6824, + "step": 303 + }, + { + "epoch": 0.16213333333333332, + "grad_norm": 0.5328486021774697, + "learning_rate": 0.00019102834702846387, + "loss": 0.7466, + "step": 304 + }, + { + "epoch": 0.16266666666666665, + "grad_norm": 0.466952483557593, + "learning_rate": 0.0001909566724366779, + "loss": 0.7895, + "step": 305 + }, + { + "epoch": 0.1632, + "grad_norm": 0.5050910069933272, + "learning_rate": 0.00019088472623446183, + "loss": 0.8058, + "step": 306 + }, + { + "epoch": 0.16373333333333334, + "grad_norm": 0.4437035269566679, + "learning_rate": 0.00019081250863665794, + "loss": 0.8091, + "step": 307 + }, + { + "epoch": 0.16426666666666667, + "grad_norm": 0.4149488344882996, + "learning_rate": 0.0001907400198589189, + "loss": 0.7001, + "step": 308 + }, + { + "epoch": 0.1648, + "grad_norm": 0.41420753794845466, + "learning_rate": 0.00019066726011770726, + "loss": 0.7069, + "step": 309 + }, + { + "epoch": 0.16533333333333333, + "grad_norm": 0.4224940665853059, + "learning_rate": 0.00019059422963029464, + "loss": 0.7945, + "step": 310 + }, + { + "epoch": 0.16586666666666666, + "grad_norm": 0.40145509406901553, + "learning_rate": 0.0001905209286147611, + "loss": 0.746, + "step": 311 + }, + { + "epoch": 0.1664, + "grad_norm": 0.4732434289419564, + "learning_rate": 0.0001904473572899947, + "loss": 0.8353, + "step": 312 + }, + { + "epoch": 0.16693333333333332, + "grad_norm": 0.4229366424040278, + "learning_rate": 0.0001903735158756905, + "loss": 0.8353, + "step": 313 + }, + { + "epoch": 0.16746666666666668, + "grad_norm": 0.447666319979107, + "learning_rate": 0.0001902994045923502, + "loss": 0.7846, + "step": 314 + }, + { + "epoch": 0.168, + "grad_norm": 0.4076824839696985, + "learning_rate": 0.00019022502366128135, + "loss": 0.7914, + "step": 315 + }, + { + "epoch": 0.16853333333333334, + "grad_norm": 0.5306119733066779, + "learning_rate": 0.0001901503733045967, + "loss": 0.8472, + "step": 316 + }, + { + "epoch": 0.16906666666666667, + "grad_norm": 0.43930612065109, + "learning_rate": 0.00019007545374521355, + "loss": 0.7688, + "step": 317 + }, + { + "epoch": 0.1696, + "grad_norm": 0.48951247411124604, + "learning_rate": 0.00019000026520685302, + "loss": 0.7647, + "step": 318 + }, + { + "epoch": 0.17013333333333333, + "grad_norm": 0.39437844060981536, + "learning_rate": 0.00018992480791403958, + "loss": 0.7146, + "step": 319 + }, + { + "epoch": 0.17066666666666666, + "grad_norm": 0.5106490024266829, + "learning_rate": 0.0001898490820921001, + "loss": 0.7673, + "step": 320 + }, + { + "epoch": 0.1712, + "grad_norm": 0.4554149817154902, + "learning_rate": 0.0001897730879671634, + "loss": 0.7742, + "step": 321 + }, + { + "epoch": 0.17173333333333332, + "grad_norm": 0.4128434656129718, + "learning_rate": 0.0001896968257661595, + "loss": 0.6839, + "step": 322 + }, + { + "epoch": 0.17226666666666668, + "grad_norm": 0.4365532500068166, + "learning_rate": 0.00018962029571681886, + "loss": 0.7491, + "step": 323 + }, + { + "epoch": 0.1728, + "grad_norm": 0.5404820208818948, + "learning_rate": 0.00018954349804767184, + "loss": 0.8756, + "step": 324 + }, + { + "epoch": 0.17333333333333334, + "grad_norm": 0.4026796553720846, + "learning_rate": 0.00018946643298804793, + "loss": 0.6692, + "step": 325 + }, + { + "epoch": 0.17386666666666667, + "grad_norm": 0.43593661521496024, + "learning_rate": 0.00018938910076807513, + "loss": 0.6858, + "step": 326 + }, + { + "epoch": 0.1744, + "grad_norm": 0.46833989079793203, + "learning_rate": 0.00018931150161867916, + "loss": 0.7437, + "step": 327 + }, + { + "epoch": 0.17493333333333333, + "grad_norm": 0.4638802297033959, + "learning_rate": 0.0001892336357715829, + "loss": 0.7714, + "step": 328 + }, + { + "epoch": 0.17546666666666666, + "grad_norm": 0.44901973665971945, + "learning_rate": 0.0001891555034593055, + "loss": 0.767, + "step": 329 + }, + { + "epoch": 0.176, + "grad_norm": 0.40021188967521376, + "learning_rate": 0.00018907710491516199, + "loss": 0.6805, + "step": 330 + }, + { + "epoch": 0.17653333333333332, + "grad_norm": 0.47974999968855614, + "learning_rate": 0.00018899844037326225, + "loss": 0.7806, + "step": 331 + }, + { + "epoch": 0.17706666666666668, + "grad_norm": 0.4905673523982542, + "learning_rate": 0.0001889195100685106, + "loss": 0.8136, + "step": 332 + }, + { + "epoch": 0.1776, + "grad_norm": 0.40865028043523793, + "learning_rate": 0.0001888403142366049, + "loss": 0.7654, + "step": 333 + }, + { + "epoch": 0.17813333333333334, + "grad_norm": 0.6085964488367325, + "learning_rate": 0.00018876085311403593, + "loss": 0.8238, + "step": 334 + }, + { + "epoch": 0.17866666666666667, + "grad_norm": 0.49235639869921055, + "learning_rate": 0.00018868112693808665, + "loss": 0.7707, + "step": 335 + }, + { + "epoch": 0.1792, + "grad_norm": 0.42159691986144743, + "learning_rate": 0.00018860113594683148, + "loss": 0.756, + "step": 336 + }, + { + "epoch": 0.17973333333333333, + "grad_norm": 0.4395918565248622, + "learning_rate": 0.00018852088037913577, + "loss": 0.7879, + "step": 337 + }, + { + "epoch": 0.18026666666666666, + "grad_norm": 0.4415135411457727, + "learning_rate": 0.0001884403604746547, + "loss": 0.7335, + "step": 338 + }, + { + "epoch": 0.1808, + "grad_norm": 0.5298497796094817, + "learning_rate": 0.00018835957647383303, + "loss": 0.7892, + "step": 339 + }, + { + "epoch": 0.18133333333333335, + "grad_norm": 0.4095062084473622, + "learning_rate": 0.00018827852861790398, + "loss": 0.7013, + "step": 340 + }, + { + "epoch": 0.18186666666666668, + "grad_norm": 0.4166526105460935, + "learning_rate": 0.00018819721714888877, + "loss": 0.7216, + "step": 341 + }, + { + "epoch": 0.1824, + "grad_norm": 0.4380297982805859, + "learning_rate": 0.00018811564230959588, + "loss": 0.6844, + "step": 342 + }, + { + "epoch": 0.18293333333333334, + "grad_norm": 0.5279780310929358, + "learning_rate": 0.00018803380434362, + "loss": 0.7763, + "step": 343 + }, + { + "epoch": 0.18346666666666667, + "grad_norm": 0.490170387095156, + "learning_rate": 0.0001879517034953418, + "loss": 0.6931, + "step": 344 + }, + { + "epoch": 0.184, + "grad_norm": 0.49544220794163213, + "learning_rate": 0.00018786934000992688, + "loss": 0.7736, + "step": 345 + }, + { + "epoch": 0.18453333333333333, + "grad_norm": 0.4131953727130121, + "learning_rate": 0.00018778671413332513, + "loss": 0.761, + "step": 346 + }, + { + "epoch": 0.18506666666666666, + "grad_norm": 0.4388218560264117, + "learning_rate": 0.00018770382611226987, + "loss": 0.7594, + "step": 347 + }, + { + "epoch": 0.1856, + "grad_norm": 0.45875231207455275, + "learning_rate": 0.00018762067619427746, + "loss": 0.7495, + "step": 348 + }, + { + "epoch": 0.18613333333333335, + "grad_norm": 0.4835149475399592, + "learning_rate": 0.000187537264627646, + "loss": 0.7636, + "step": 349 + }, + { + "epoch": 0.18666666666666668, + "grad_norm": 0.4374529769827338, + "learning_rate": 0.00018745359166145523, + "loss": 0.7329, + "step": 350 + }, + { + "epoch": 0.1872, + "grad_norm": 0.4494590328996156, + "learning_rate": 0.00018736965754556528, + "loss": 0.8096, + "step": 351 + }, + { + "epoch": 0.18773333333333334, + "grad_norm": 0.4642424475543717, + "learning_rate": 0.00018728546253061614, + "loss": 0.7812, + "step": 352 + }, + { + "epoch": 0.18826666666666667, + "grad_norm": 0.47429688727853647, + "learning_rate": 0.00018720100686802694, + "loss": 0.7811, + "step": 353 + }, + { + "epoch": 0.1888, + "grad_norm": 0.448926289660477, + "learning_rate": 0.00018711629080999504, + "loss": 0.821, + "step": 354 + }, + { + "epoch": 0.18933333333333333, + "grad_norm": 0.4932646325656744, + "learning_rate": 0.00018703131460949554, + "loss": 0.7139, + "step": 355 + }, + { + "epoch": 0.18986666666666666, + "grad_norm": 0.4507269223674137, + "learning_rate": 0.0001869460785202802, + "loss": 0.7674, + "step": 356 + }, + { + "epoch": 0.1904, + "grad_norm": 0.48975523983555974, + "learning_rate": 0.00018686058279687698, + "loss": 0.7927, + "step": 357 + }, + { + "epoch": 0.19093333333333334, + "grad_norm": 0.43803365488688517, + "learning_rate": 0.00018677482769458904, + "loss": 0.7569, + "step": 358 + }, + { + "epoch": 0.19146666666666667, + "grad_norm": 0.4282559136963825, + "learning_rate": 0.00018668881346949417, + "loss": 0.7743, + "step": 359 + }, + { + "epoch": 0.192, + "grad_norm": 0.42038876392386426, + "learning_rate": 0.00018660254037844388, + "loss": 0.7075, + "step": 360 + }, + { + "epoch": 0.19253333333333333, + "grad_norm": 0.38610534697116794, + "learning_rate": 0.00018651600867906272, + "loss": 0.6382, + "step": 361 + }, + { + "epoch": 0.19306666666666666, + "grad_norm": 0.3790510813289947, + "learning_rate": 0.00018642921862974742, + "loss": 0.7069, + "step": 362 + }, + { + "epoch": 0.1936, + "grad_norm": 0.46441742712105416, + "learning_rate": 0.00018634217048966637, + "loss": 0.7565, + "step": 363 + }, + { + "epoch": 0.19413333333333332, + "grad_norm": 0.3788528482979111, + "learning_rate": 0.00018625486451875843, + "loss": 0.6764, + "step": 364 + }, + { + "epoch": 0.19466666666666665, + "grad_norm": 0.5086491781505815, + "learning_rate": 0.0001861673009777325, + "loss": 0.7939, + "step": 365 + }, + { + "epoch": 0.1952, + "grad_norm": 0.4254679186082148, + "learning_rate": 0.0001860794801280666, + "loss": 0.7762, + "step": 366 + }, + { + "epoch": 0.19573333333333334, + "grad_norm": 0.4420755098500798, + "learning_rate": 0.00018599140223200716, + "loss": 0.7413, + "step": 367 + }, + { + "epoch": 0.19626666666666667, + "grad_norm": 0.39978322577235464, + "learning_rate": 0.0001859030675525681, + "loss": 0.7203, + "step": 368 + }, + { + "epoch": 0.1968, + "grad_norm": 0.44588865008638007, + "learning_rate": 0.0001858144763535302, + "loss": 0.7545, + "step": 369 + }, + { + "epoch": 0.19733333333333333, + "grad_norm": 0.4557262705357927, + "learning_rate": 0.0001857256288994402, + "loss": 0.7797, + "step": 370 + }, + { + "epoch": 0.19786666666666666, + "grad_norm": 0.4415351173164515, + "learning_rate": 0.00018563652545561013, + "loss": 0.7454, + "step": 371 + }, + { + "epoch": 0.1984, + "grad_norm": 0.4478151950710153, + "learning_rate": 0.0001855471662881164, + "loss": 0.741, + "step": 372 + }, + { + "epoch": 0.19893333333333332, + "grad_norm": 0.4178673683561562, + "learning_rate": 0.000185457551663799, + "loss": 0.7403, + "step": 373 + }, + { + "epoch": 0.19946666666666665, + "grad_norm": 0.39178804569712133, + "learning_rate": 0.00018536768185026083, + "loss": 0.6889, + "step": 374 + }, + { + "epoch": 0.2, + "grad_norm": 0.49862204004927757, + "learning_rate": 0.00018527755711586678, + "loss": 0.8009, + "step": 375 + }, + { + "epoch": 0.20053333333333334, + "grad_norm": 0.46341674560578006, + "learning_rate": 0.00018518717772974302, + "loss": 0.7867, + "step": 376 + }, + { + "epoch": 0.20106666666666667, + "grad_norm": 0.4462377795277957, + "learning_rate": 0.00018509654396177609, + "loss": 0.7778, + "step": 377 + }, + { + "epoch": 0.2016, + "grad_norm": 0.4168182898396169, + "learning_rate": 0.00018500565608261214, + "loss": 0.7058, + "step": 378 + }, + { + "epoch": 0.20213333333333333, + "grad_norm": 0.4108982624761322, + "learning_rate": 0.00018491451436365627, + "loss": 0.7243, + "step": 379 + }, + { + "epoch": 0.20266666666666666, + "grad_norm": 0.4121430838074208, + "learning_rate": 0.0001848231190770714, + "loss": 0.7884, + "step": 380 + }, + { + "epoch": 0.2032, + "grad_norm": 0.5040950105093712, + "learning_rate": 0.00018473147049577774, + "loss": 0.74, + "step": 381 + }, + { + "epoch": 0.20373333333333332, + "grad_norm": 0.6185090953573744, + "learning_rate": 0.00018463956889345194, + "loss": 0.6944, + "step": 382 + }, + { + "epoch": 0.20426666666666668, + "grad_norm": 0.4562410017066912, + "learning_rate": 0.00018454741454452603, + "loss": 0.7888, + "step": 383 + }, + { + "epoch": 0.2048, + "grad_norm": 0.4557627500595636, + "learning_rate": 0.00018445500772418697, + "loss": 0.7516, + "step": 384 + }, + { + "epoch": 0.20533333333333334, + "grad_norm": 0.4493133053651138, + "learning_rate": 0.00018436234870837547, + "loss": 0.7748, + "step": 385 + }, + { + "epoch": 0.20586666666666667, + "grad_norm": 0.5046213484524635, + "learning_rate": 0.00018426943777378552, + "loss": 0.8101, + "step": 386 + }, + { + "epoch": 0.2064, + "grad_norm": 0.5725051732278246, + "learning_rate": 0.00018417627519786315, + "loss": 0.7522, + "step": 387 + }, + { + "epoch": 0.20693333333333333, + "grad_norm": 0.4502329509184404, + "learning_rate": 0.00018408286125880604, + "loss": 0.7394, + "step": 388 + }, + { + "epoch": 0.20746666666666666, + "grad_norm": 0.5010140763094688, + "learning_rate": 0.00018398919623556238, + "loss": 0.7558, + "step": 389 + }, + { + "epoch": 0.208, + "grad_norm": 0.4541734409806498, + "learning_rate": 0.00018389528040783012, + "loss": 0.7319, + "step": 390 + }, + { + "epoch": 0.20853333333333332, + "grad_norm": 0.4692263403190528, + "learning_rate": 0.0001838011140560562, + "loss": 0.7743, + "step": 391 + }, + { + "epoch": 0.20906666666666668, + "grad_norm": 0.4375963223108687, + "learning_rate": 0.00018370669746143564, + "loss": 0.7417, + "step": 392 + }, + { + "epoch": 0.2096, + "grad_norm": 0.39942230274315993, + "learning_rate": 0.00018361203090591071, + "loss": 0.6796, + "step": 393 + }, + { + "epoch": 0.21013333333333334, + "grad_norm": 0.4249713353545521, + "learning_rate": 0.0001835171146721701, + "loss": 0.7647, + "step": 394 + }, + { + "epoch": 0.21066666666666667, + "grad_norm": 0.44763547188312713, + "learning_rate": 0.00018342194904364813, + "loss": 0.7153, + "step": 395 + }, + { + "epoch": 0.2112, + "grad_norm": 0.4272917764036961, + "learning_rate": 0.00018332653430452376, + "loss": 0.7357, + "step": 396 + }, + { + "epoch": 0.21173333333333333, + "grad_norm": 0.44891642615274496, + "learning_rate": 0.00018323087073971993, + "loss": 0.7529, + "step": 397 + }, + { + "epoch": 0.21226666666666666, + "grad_norm": 0.45061301245464935, + "learning_rate": 0.00018313495863490258, + "loss": 0.7357, + "step": 398 + }, + { + "epoch": 0.2128, + "grad_norm": 0.5162333764394879, + "learning_rate": 0.00018303879827647975, + "loss": 0.7415, + "step": 399 + }, + { + "epoch": 0.21333333333333335, + "grad_norm": 0.43200165971218957, + "learning_rate": 0.00018294238995160094, + "loss": 0.7658, + "step": 400 + }, + { + "epoch": 0.21386666666666668, + "grad_norm": 0.49127325690048357, + "learning_rate": 0.00018284573394815597, + "loss": 0.8219, + "step": 401 + }, + { + "epoch": 0.2144, + "grad_norm": 0.4322687546291749, + "learning_rate": 0.00018274883055477436, + "loss": 0.7127, + "step": 402 + }, + { + "epoch": 0.21493333333333334, + "grad_norm": 0.4352965224137496, + "learning_rate": 0.00018265168006082437, + "loss": 0.7437, + "step": 403 + }, + { + "epoch": 0.21546666666666667, + "grad_norm": 0.4038322942096568, + "learning_rate": 0.00018255428275641214, + "loss": 0.681, + "step": 404 + }, + { + "epoch": 0.216, + "grad_norm": 0.44329753535341127, + "learning_rate": 0.00018245663893238075, + "loss": 0.7773, + "step": 405 + }, + { + "epoch": 0.21653333333333333, + "grad_norm": 0.41070677887586216, + "learning_rate": 0.0001823587488803095, + "loss": 0.7477, + "step": 406 + }, + { + "epoch": 0.21706666666666666, + "grad_norm": 0.4717483311142247, + "learning_rate": 0.00018226061289251298, + "loss": 0.7002, + "step": 407 + }, + { + "epoch": 0.2176, + "grad_norm": 0.4131348832568817, + "learning_rate": 0.00018216223126204007, + "loss": 0.7224, + "step": 408 + }, + { + "epoch": 0.21813333333333335, + "grad_norm": 0.38310709778811125, + "learning_rate": 0.00018206360428267332, + "loss": 0.7388, + "step": 409 + }, + { + "epoch": 0.21866666666666668, + "grad_norm": 0.5339055401075089, + "learning_rate": 0.00018196473224892784, + "loss": 0.8489, + "step": 410 + }, + { + "epoch": 0.2192, + "grad_norm": 0.5428931097118624, + "learning_rate": 0.00018186561545605054, + "loss": 0.7689, + "step": 411 + }, + { + "epoch": 0.21973333333333334, + "grad_norm": 0.506089095574403, + "learning_rate": 0.0001817662542000192, + "loss": 0.6999, + "step": 412 + }, + { + "epoch": 0.22026666666666667, + "grad_norm": 0.5000636143770371, + "learning_rate": 0.0001816666487775416, + "loss": 0.7746, + "step": 413 + }, + { + "epoch": 0.2208, + "grad_norm": 0.439603252476578, + "learning_rate": 0.00018156679948605467, + "loss": 0.7477, + "step": 414 + }, + { + "epoch": 0.22133333333333333, + "grad_norm": 0.41888613720812434, + "learning_rate": 0.00018146670662372354, + "loss": 0.7187, + "step": 415 + }, + { + "epoch": 0.22186666666666666, + "grad_norm": 0.5425059948079629, + "learning_rate": 0.0001813663704894407, + "loss": 0.7816, + "step": 416 + }, + { + "epoch": 0.2224, + "grad_norm": 0.4452587992918538, + "learning_rate": 0.00018126579138282503, + "loss": 0.8067, + "step": 417 + }, + { + "epoch": 0.22293333333333334, + "grad_norm": 0.44327636724949143, + "learning_rate": 0.00018116496960422107, + "loss": 0.6989, + "step": 418 + }, + { + "epoch": 0.22346666666666667, + "grad_norm": 0.4522294368472925, + "learning_rate": 0.00018106390545469795, + "loss": 0.7711, + "step": 419 + }, + { + "epoch": 0.224, + "grad_norm": 0.4361127825366765, + "learning_rate": 0.0001809625992360485, + "loss": 0.7249, + "step": 420 + }, + { + "epoch": 0.22453333333333333, + "grad_norm": 0.42057147948553086, + "learning_rate": 0.00018086105125078857, + "loss": 0.7108, + "step": 421 + }, + { + "epoch": 0.22506666666666666, + "grad_norm": 0.4180384592963417, + "learning_rate": 0.00018075926180215576, + "loss": 0.7167, + "step": 422 + }, + { + "epoch": 0.2256, + "grad_norm": 0.4039847626846038, + "learning_rate": 0.00018065723119410884, + "loss": 0.7019, + "step": 423 + }, + { + "epoch": 0.22613333333333333, + "grad_norm": 0.4932029576011835, + "learning_rate": 0.0001805549597313267, + "loss": 0.8274, + "step": 424 + }, + { + "epoch": 0.22666666666666666, + "grad_norm": 0.40645071321038856, + "learning_rate": 0.0001804524477192075, + "loss": 0.7336, + "step": 425 + }, + { + "epoch": 0.2272, + "grad_norm": 0.4418467538632973, + "learning_rate": 0.00018034969546386757, + "loss": 0.7251, + "step": 426 + }, + { + "epoch": 0.22773333333333334, + "grad_norm": 0.4593383852135761, + "learning_rate": 0.00018024670327214084, + "loss": 0.816, + "step": 427 + }, + { + "epoch": 0.22826666666666667, + "grad_norm": 0.5360074555853559, + "learning_rate": 0.00018014347145157755, + "loss": 0.7608, + "step": 428 + }, + { + "epoch": 0.2288, + "grad_norm": 0.7932283592094297, + "learning_rate": 0.0001800400003104436, + "loss": 0.7652, + "step": 429 + }, + { + "epoch": 0.22933333333333333, + "grad_norm": 0.4871171097026091, + "learning_rate": 0.0001799362901577196, + "loss": 0.7527, + "step": 430 + }, + { + "epoch": 0.22986666666666666, + "grad_norm": 0.43079017734380354, + "learning_rate": 0.00017983234130309968, + "loss": 0.767, + "step": 431 + }, + { + "epoch": 0.2304, + "grad_norm": 0.43719420402094894, + "learning_rate": 0.00017972815405699103, + "loss": 0.7999, + "step": 432 + }, + { + "epoch": 0.23093333333333332, + "grad_norm": 0.5559181175868851, + "learning_rate": 0.00017962372873051252, + "loss": 0.8166, + "step": 433 + }, + { + "epoch": 0.23146666666666665, + "grad_norm": 0.47293088861257343, + "learning_rate": 0.00017951906563549397, + "loss": 0.7727, + "step": 434 + }, + { + "epoch": 0.232, + "grad_norm": 0.4684788662952495, + "learning_rate": 0.00017941416508447536, + "loss": 0.6992, + "step": 435 + }, + { + "epoch": 0.23253333333333334, + "grad_norm": 0.4795723386291384, + "learning_rate": 0.00017930902739070562, + "loss": 0.7253, + "step": 436 + }, + { + "epoch": 0.23306666666666667, + "grad_norm": 0.5028741627697558, + "learning_rate": 0.00017920365286814183, + "loss": 0.77, + "step": 437 + }, + { + "epoch": 0.2336, + "grad_norm": 0.4241329169362859, + "learning_rate": 0.0001790980418314484, + "loss": 0.7144, + "step": 438 + }, + { + "epoch": 0.23413333333333333, + "grad_norm": 0.4475491212762779, + "learning_rate": 0.0001789921945959958, + "loss": 0.6521, + "step": 439 + }, + { + "epoch": 0.23466666666666666, + "grad_norm": 0.4626738540177879, + "learning_rate": 0.00017888611147786002, + "loss": 0.7443, + "step": 440 + }, + { + "epoch": 0.2352, + "grad_norm": 0.41634357797444965, + "learning_rate": 0.00017877979279382135, + "loss": 0.773, + "step": 441 + }, + { + "epoch": 0.23573333333333332, + "grad_norm": 0.5873436781277276, + "learning_rate": 0.00017867323886136348, + "loss": 0.7895, + "step": 442 + }, + { + "epoch": 0.23626666666666668, + "grad_norm": 0.4196180679340285, + "learning_rate": 0.00017856644999867264, + "loss": 0.707, + "step": 443 + }, + { + "epoch": 0.2368, + "grad_norm": 0.5129971633116558, + "learning_rate": 0.0001784594265246366, + "loss": 0.8006, + "step": 444 + }, + { + "epoch": 0.23733333333333334, + "grad_norm": 0.4552676154927619, + "learning_rate": 0.00017835216875884368, + "loss": 0.7615, + "step": 445 + }, + { + "epoch": 0.23786666666666667, + "grad_norm": 0.4011866327018788, + "learning_rate": 0.0001782446770215819, + "loss": 0.7422, + "step": 446 + }, + { + "epoch": 0.2384, + "grad_norm": 0.39768724784431175, + "learning_rate": 0.0001781369516338378, + "loss": 0.7593, + "step": 447 + }, + { + "epoch": 0.23893333333333333, + "grad_norm": 0.46512417794849265, + "learning_rate": 0.00017802899291729585, + "loss": 0.784, + "step": 448 + }, + { + "epoch": 0.23946666666666666, + "grad_norm": 0.410495091377265, + "learning_rate": 0.0001779208011943371, + "loss": 0.7217, + "step": 449 + }, + { + "epoch": 0.24, + "grad_norm": 0.4481590241534154, + "learning_rate": 0.00017781237678803847, + "loss": 0.7459, + "step": 450 + }, + { + "epoch": 0.24053333333333332, + "grad_norm": 0.4110190895695786, + "learning_rate": 0.00017770372002217172, + "loss": 0.7447, + "step": 451 + }, + { + "epoch": 0.24106666666666668, + "grad_norm": 0.4399110104984658, + "learning_rate": 0.00017759483122120238, + "loss": 0.7624, + "step": 452 + }, + { + "epoch": 0.2416, + "grad_norm": 0.46463546930000155, + "learning_rate": 0.000177485710710289, + "loss": 0.7351, + "step": 453 + }, + { + "epoch": 0.24213333333333334, + "grad_norm": 0.4335397389346068, + "learning_rate": 0.00017737635881528196, + "loss": 0.8026, + "step": 454 + }, + { + "epoch": 0.24266666666666667, + "grad_norm": 0.47598294181782197, + "learning_rate": 0.00017726677586272263, + "loss": 0.7617, + "step": 455 + }, + { + "epoch": 0.2432, + "grad_norm": 0.451875150204458, + "learning_rate": 0.00017715696217984235, + "loss": 0.7775, + "step": 456 + }, + { + "epoch": 0.24373333333333333, + "grad_norm": 1.0696155565442207, + "learning_rate": 0.00017704691809456143, + "loss": 0.7043, + "step": 457 + }, + { + "epoch": 0.24426666666666666, + "grad_norm": 0.4155974995971046, + "learning_rate": 0.0001769366439354882, + "loss": 0.7324, + "step": 458 + }, + { + "epoch": 0.2448, + "grad_norm": 0.4597044836427236, + "learning_rate": 0.00017682614003191807, + "loss": 0.7551, + "step": 459 + }, + { + "epoch": 0.24533333333333332, + "grad_norm": 0.45732067571731677, + "learning_rate": 0.00017671540671383243, + "loss": 0.7749, + "step": 460 + }, + { + "epoch": 0.24586666666666668, + "grad_norm": 0.509382063027756, + "learning_rate": 0.0001766044443118978, + "loss": 0.8071, + "step": 461 + }, + { + "epoch": 0.2464, + "grad_norm": 0.4627551430394898, + "learning_rate": 0.00017649325315746478, + "loss": 0.7433, + "step": 462 + }, + { + "epoch": 0.24693333333333334, + "grad_norm": 0.46220165634985766, + "learning_rate": 0.00017638183358256696, + "loss": 0.7225, + "step": 463 + }, + { + "epoch": 0.24746666666666667, + "grad_norm": 0.4392746142774141, + "learning_rate": 0.00017627018591992018, + "loss": 0.6936, + "step": 464 + }, + { + "epoch": 0.248, + "grad_norm": 0.4417835047957786, + "learning_rate": 0.0001761583105029213, + "loss": 0.6757, + "step": 465 + }, + { + "epoch": 0.24853333333333333, + "grad_norm": 0.44007228791509273, + "learning_rate": 0.00017604620766564723, + "loss": 0.7657, + "step": 466 + }, + { + "epoch": 0.24906666666666666, + "grad_norm": 0.4574407043844708, + "learning_rate": 0.00017593387774285412, + "loss": 0.7277, + "step": 467 + }, + { + "epoch": 0.2496, + "grad_norm": 0.4142482944991072, + "learning_rate": 0.00017582132106997616, + "loss": 0.7399, + "step": 468 + }, + { + "epoch": 0.2501333333333333, + "grad_norm": 0.43754975132798457, + "learning_rate": 0.0001757085379831246, + "loss": 0.7258, + "step": 469 + }, + { + "epoch": 0.25066666666666665, + "grad_norm": 0.47015803950012963, + "learning_rate": 0.00017559552881908695, + "loss": 0.7298, + "step": 470 + }, + { + "epoch": 0.2512, + "grad_norm": 0.39654418276783754, + "learning_rate": 0.00017548229391532572, + "loss": 0.742, + "step": 471 + }, + { + "epoch": 0.2517333333333333, + "grad_norm": 0.5012584870602004, + "learning_rate": 0.00017536883360997743, + "loss": 0.8511, + "step": 472 + }, + { + "epoch": 0.25226666666666664, + "grad_norm": 0.5218265939980339, + "learning_rate": 0.00017525514824185185, + "loss": 0.7946, + "step": 473 + }, + { + "epoch": 0.2528, + "grad_norm": 0.4535168055954105, + "learning_rate": 0.00017514123815043074, + "loss": 0.6896, + "step": 474 + }, + { + "epoch": 0.25333333333333335, + "grad_norm": 0.4355154631246511, + "learning_rate": 0.00017502710367586687, + "loss": 0.7456, + "step": 475 + }, + { + "epoch": 0.2538666666666667, + "grad_norm": 0.5032416045066815, + "learning_rate": 0.0001749127451589832, + "loss": 0.8194, + "step": 476 + }, + { + "epoch": 0.2544, + "grad_norm": 0.42058160115791615, + "learning_rate": 0.00017479816294127152, + "loss": 0.6809, + "step": 477 + }, + { + "epoch": 0.25493333333333335, + "grad_norm": 0.47682544144964806, + "learning_rate": 0.00017468335736489177, + "loss": 0.7657, + "step": 478 + }, + { + "epoch": 0.2554666666666667, + "grad_norm": 0.4478517958175801, + "learning_rate": 0.00017456832877267084, + "loss": 0.7665, + "step": 479 + }, + { + "epoch": 0.256, + "grad_norm": 0.43045307970830915, + "learning_rate": 0.0001744530775081015, + "loss": 0.725, + "step": 480 + }, + { + "epoch": 0.25653333333333334, + "grad_norm": 0.40541676428183027, + "learning_rate": 0.00017433760391534167, + "loss": 0.706, + "step": 481 + }, + { + "epoch": 0.25706666666666667, + "grad_norm": 0.43198955167917824, + "learning_rate": 0.00017422190833921283, + "loss": 0.7422, + "step": 482 + }, + { + "epoch": 0.2576, + "grad_norm": 0.42700566025514264, + "learning_rate": 0.0001741059911251997, + "loss": 0.8277, + "step": 483 + }, + { + "epoch": 0.2581333333333333, + "grad_norm": 0.43231994835750615, + "learning_rate": 0.00017398985261944856, + "loss": 0.7047, + "step": 484 + }, + { + "epoch": 0.25866666666666666, + "grad_norm": 0.43160023093573086, + "learning_rate": 0.00017387349316876666, + "loss": 0.7397, + "step": 485 + }, + { + "epoch": 0.2592, + "grad_norm": 0.4226887446666897, + "learning_rate": 0.000173756913120621, + "loss": 0.6831, + "step": 486 + }, + { + "epoch": 0.2597333333333333, + "grad_norm": 0.40323279067236023, + "learning_rate": 0.0001736401128231373, + "loss": 0.7551, + "step": 487 + }, + { + "epoch": 0.26026666666666665, + "grad_norm": 0.45430304297247726, + "learning_rate": 0.00017352309262509894, + "loss": 0.7621, + "step": 488 + }, + { + "epoch": 0.2608, + "grad_norm": 0.4430349947086925, + "learning_rate": 0.00017340585287594604, + "loss": 0.7392, + "step": 489 + }, + { + "epoch": 0.2613333333333333, + "grad_norm": 0.3888799232459701, + "learning_rate": 0.0001732883939257742, + "loss": 0.7134, + "step": 490 + }, + { + "epoch": 0.2618666666666667, + "grad_norm": 0.40725016029724603, + "learning_rate": 0.0001731707161253338, + "loss": 0.7194, + "step": 491 + }, + { + "epoch": 0.2624, + "grad_norm": 0.43932329112331703, + "learning_rate": 0.0001730528198260285, + "loss": 0.7664, + "step": 492 + }, + { + "epoch": 0.26293333333333335, + "grad_norm": 0.41473461673561246, + "learning_rate": 0.00017293470537991463, + "loss": 0.7426, + "step": 493 + }, + { + "epoch": 0.2634666666666667, + "grad_norm": 0.46272892040791574, + "learning_rate": 0.00017281637313969978, + "loss": 0.7554, + "step": 494 + }, + { + "epoch": 0.264, + "grad_norm": 0.4427706963649241, + "learning_rate": 0.00017269782345874203, + "loss": 0.7821, + "step": 495 + }, + { + "epoch": 0.26453333333333334, + "grad_norm": 0.47145971292056843, + "learning_rate": 0.00017257905669104874, + "loss": 0.7646, + "step": 496 + }, + { + "epoch": 0.2650666666666667, + "grad_norm": 0.49741608905805534, + "learning_rate": 0.00017246007319127545, + "loss": 0.796, + "step": 497 + }, + { + "epoch": 0.2656, + "grad_norm": 0.4413466129054198, + "learning_rate": 0.00017234087331472497, + "loss": 0.6825, + "step": 498 + }, + { + "epoch": 0.26613333333333333, + "grad_norm": 0.42218546639118704, + "learning_rate": 0.00017222145741734626, + "loss": 0.6896, + "step": 499 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 0.40492674247425203, + "learning_rate": 0.00017210182585573327, + "loss": 0.7313, + "step": 500 + }, + { + "epoch": 0.2672, + "grad_norm": 0.3965105966499714, + "learning_rate": 0.00017198197898712404, + "loss": 0.7127, + "step": 501 + }, + { + "epoch": 0.2677333333333333, + "grad_norm": 0.43131350442620386, + "learning_rate": 0.00017186191716939944, + "loss": 0.6961, + "step": 502 + }, + { + "epoch": 0.26826666666666665, + "grad_norm": 0.49782761355130234, + "learning_rate": 0.0001717416407610824, + "loss": 0.7276, + "step": 503 + }, + { + "epoch": 0.2688, + "grad_norm": 0.4492292504112796, + "learning_rate": 0.00017162115012133643, + "loss": 0.8129, + "step": 504 + }, + { + "epoch": 0.2693333333333333, + "grad_norm": 0.4927718040432481, + "learning_rate": 0.00017150044560996488, + "loss": 0.7421, + "step": 505 + }, + { + "epoch": 0.26986666666666664, + "grad_norm": 0.46205345397489017, + "learning_rate": 0.00017137952758740978, + "loss": 0.7244, + "step": 506 + }, + { + "epoch": 0.2704, + "grad_norm": 0.4820921906889282, + "learning_rate": 0.00017125839641475072, + "loss": 0.7976, + "step": 507 + }, + { + "epoch": 0.27093333333333336, + "grad_norm": 0.4025172261059256, + "learning_rate": 0.00017113705245370368, + "loss": 0.7616, + "step": 508 + }, + { + "epoch": 0.2714666666666667, + "grad_norm": 0.44780584646886484, + "learning_rate": 0.00017101549606662024, + "loss": 0.7582, + "step": 509 + }, + { + "epoch": 0.272, + "grad_norm": 0.48758655722779376, + "learning_rate": 0.00017089372761648616, + "loss": 0.781, + "step": 510 + }, + { + "epoch": 0.27253333333333335, + "grad_norm": 1.7151408551401242, + "learning_rate": 0.00017077174746692056, + "loss": 0.6497, + "step": 511 + }, + { + "epoch": 0.2730666666666667, + "grad_norm": 0.4051295030789714, + "learning_rate": 0.00017064955598217462, + "loss": 0.7334, + "step": 512 + }, + { + "epoch": 0.2736, + "grad_norm": 0.5067476687002975, + "learning_rate": 0.00017052715352713075, + "loss": 0.8223, + "step": 513 + }, + { + "epoch": 0.27413333333333334, + "grad_norm": 0.8260851852010912, + "learning_rate": 0.00017040454046730115, + "loss": 0.6899, + "step": 514 + }, + { + "epoch": 0.27466666666666667, + "grad_norm": 0.4462804427340422, + "learning_rate": 0.00017028171716882714, + "loss": 0.774, + "step": 515 + }, + { + "epoch": 0.2752, + "grad_norm": 0.4838041061473331, + "learning_rate": 0.00017015868399847768, + "loss": 0.7862, + "step": 516 + }, + { + "epoch": 0.27573333333333333, + "grad_norm": 0.4234453539963326, + "learning_rate": 0.00017003544132364846, + "loss": 0.6884, + "step": 517 + }, + { + "epoch": 0.27626666666666666, + "grad_norm": 0.4579586030373765, + "learning_rate": 0.00016991198951236088, + "loss": 0.7979, + "step": 518 + }, + { + "epoch": 0.2768, + "grad_norm": 0.49001578568876686, + "learning_rate": 0.00016978832893326074, + "loss": 0.7449, + "step": 519 + }, + { + "epoch": 0.2773333333333333, + "grad_norm": 0.40425098791151753, + "learning_rate": 0.00016966445995561727, + "loss": 0.6956, + "step": 520 + }, + { + "epoch": 0.27786666666666665, + "grad_norm": 0.4057010560932579, + "learning_rate": 0.00016954038294932216, + "loss": 0.6925, + "step": 521 + }, + { + "epoch": 0.2784, + "grad_norm": 0.3636636017347738, + "learning_rate": 0.00016941609828488807, + "loss": 0.6853, + "step": 522 + }, + { + "epoch": 0.2789333333333333, + "grad_norm": 0.4185024135524768, + "learning_rate": 0.0001692916063334479, + "loss": 0.7345, + "step": 523 + }, + { + "epoch": 0.27946666666666664, + "grad_norm": 0.5271803112585407, + "learning_rate": 0.0001691669074667535, + "loss": 0.7938, + "step": 524 + }, + { + "epoch": 0.28, + "grad_norm": 0.41509775167313745, + "learning_rate": 0.0001690420020571747, + "loss": 0.6856, + "step": 525 + }, + { + "epoch": 0.28053333333333336, + "grad_norm": 0.48121328285184867, + "learning_rate": 0.0001689168904776979, + "loss": 0.7351, + "step": 526 + }, + { + "epoch": 0.2810666666666667, + "grad_norm": 0.3365621861142254, + "learning_rate": 0.00016879157310192535, + "loss": 0.6568, + "step": 527 + }, + { + "epoch": 0.2816, + "grad_norm": 0.5003697225702642, + "learning_rate": 0.0001686660503040737, + "loss": 0.7644, + "step": 528 + }, + { + "epoch": 0.28213333333333335, + "grad_norm": 0.43276308167152966, + "learning_rate": 0.00016854032245897308, + "loss": 0.7071, + "step": 529 + }, + { + "epoch": 0.2826666666666667, + "grad_norm": 0.4616294263731802, + "learning_rate": 0.00016841438994206595, + "loss": 0.7714, + "step": 530 + }, + { + "epoch": 0.2832, + "grad_norm": 0.4064785119701553, + "learning_rate": 0.00016828825312940592, + "loss": 0.7206, + "step": 531 + }, + { + "epoch": 0.28373333333333334, + "grad_norm": 0.48378727060758436, + "learning_rate": 0.00016816191239765667, + "loss": 0.717, + "step": 532 + }, + { + "epoch": 0.28426666666666667, + "grad_norm": 0.4290622585368261, + "learning_rate": 0.00016803536812409075, + "loss": 0.7888, + "step": 533 + }, + { + "epoch": 0.2848, + "grad_norm": 0.4961258123079247, + "learning_rate": 0.0001679086206865886, + "loss": 0.8515, + "step": 534 + }, + { + "epoch": 0.2853333333333333, + "grad_norm": 0.4570765932540526, + "learning_rate": 0.00016778167046363734, + "loss": 0.6933, + "step": 535 + }, + { + "epoch": 0.28586666666666666, + "grad_norm": 0.4374985572224133, + "learning_rate": 0.00016765451783432953, + "loss": 0.7807, + "step": 536 + }, + { + "epoch": 0.2864, + "grad_norm": 0.37000039379065575, + "learning_rate": 0.00016752716317836229, + "loss": 0.6932, + "step": 537 + }, + { + "epoch": 0.2869333333333333, + "grad_norm": 0.41495774150290576, + "learning_rate": 0.0001673996068760359, + "loss": 0.731, + "step": 538 + }, + { + "epoch": 0.28746666666666665, + "grad_norm": 0.3918581040740028, + "learning_rate": 0.00016727184930825288, + "loss": 0.7141, + "step": 539 + }, + { + "epoch": 0.288, + "grad_norm": 0.41713163300704226, + "learning_rate": 0.0001671438908565167, + "loss": 0.7605, + "step": 540 + }, + { + "epoch": 0.2885333333333333, + "grad_norm": 0.43461384226091976, + "learning_rate": 0.00016701573190293077, + "loss": 0.7818, + "step": 541 + }, + { + "epoch": 0.2890666666666667, + "grad_norm": 0.5213258996049039, + "learning_rate": 0.00016688737283019706, + "loss": 0.6994, + "step": 542 + }, + { + "epoch": 0.2896, + "grad_norm": 0.43775019357143186, + "learning_rate": 0.00016675881402161536, + "loss": 0.7801, + "step": 543 + }, + { + "epoch": 0.29013333333333335, + "grad_norm": 0.40810234181034066, + "learning_rate": 0.00016663005586108176, + "loss": 0.7297, + "step": 544 + }, + { + "epoch": 0.2906666666666667, + "grad_norm": 0.451393732520972, + "learning_rate": 0.00016650109873308765, + "loss": 0.6761, + "step": 545 + }, + { + "epoch": 0.2912, + "grad_norm": 0.41830630947376274, + "learning_rate": 0.0001663719430227186, + "loss": 0.7271, + "step": 546 + }, + { + "epoch": 0.29173333333333334, + "grad_norm": 0.45765934665251606, + "learning_rate": 0.0001662425891156531, + "loss": 0.7133, + "step": 547 + }, + { + "epoch": 0.2922666666666667, + "grad_norm": 0.39266794743367506, + "learning_rate": 0.00016611303739816168, + "loss": 0.6791, + "step": 548 + }, + { + "epoch": 0.2928, + "grad_norm": 0.4647808500784693, + "learning_rate": 0.00016598328825710533, + "loss": 0.7461, + "step": 549 + }, + { + "epoch": 0.29333333333333333, + "grad_norm": 0.42211551580666573, + "learning_rate": 0.00016585334207993476, + "loss": 0.6743, + "step": 550 + }, + { + "epoch": 0.29386666666666666, + "grad_norm": 0.4274305253989335, + "learning_rate": 0.00016572319925468892, + "loss": 0.7091, + "step": 551 + }, + { + "epoch": 0.2944, + "grad_norm": 0.46075449646671, + "learning_rate": 0.000165592860169994, + "loss": 0.7158, + "step": 552 + }, + { + "epoch": 0.2949333333333333, + "grad_norm": 0.3829706690823211, + "learning_rate": 0.0001654623252150624, + "loss": 0.6864, + "step": 553 + }, + { + "epoch": 0.29546666666666666, + "grad_norm": 0.43166617637463833, + "learning_rate": 0.00016533159477969122, + "loss": 0.7107, + "step": 554 + }, + { + "epoch": 0.296, + "grad_norm": 0.41566587878888706, + "learning_rate": 0.00016520066925426144, + "loss": 0.7463, + "step": 555 + }, + { + "epoch": 0.2965333333333333, + "grad_norm": 0.4861445190929394, + "learning_rate": 0.00016506954902973655, + "loss": 0.7505, + "step": 556 + }, + { + "epoch": 0.29706666666666665, + "grad_norm": 0.42899606914335303, + "learning_rate": 0.00016493823449766136, + "loss": 0.7696, + "step": 557 + }, + { + "epoch": 0.2976, + "grad_norm": 0.43587057898117887, + "learning_rate": 0.0001648067260501611, + "loss": 0.7346, + "step": 558 + }, + { + "epoch": 0.2981333333333333, + "grad_norm": 0.3758043179957542, + "learning_rate": 0.00016467502407993992, + "loss": 0.6867, + "step": 559 + }, + { + "epoch": 0.2986666666666667, + "grad_norm": 0.607467088182954, + "learning_rate": 0.0001645431289802799, + "loss": 0.7239, + "step": 560 + }, + { + "epoch": 0.2992, + "grad_norm": 0.4167855700816348, + "learning_rate": 0.0001644110411450398, + "loss": 0.7495, + "step": 561 + }, + { + "epoch": 0.29973333333333335, + "grad_norm": 0.4236235965130521, + "learning_rate": 0.00016427876096865394, + "loss": 0.7287, + "step": 562 + }, + { + "epoch": 0.3002666666666667, + "grad_norm": 0.4517003758084696, + "learning_rate": 0.00016414628884613107, + "loss": 0.7785, + "step": 563 + }, + { + "epoch": 0.3008, + "grad_norm": 0.4092945089652458, + "learning_rate": 0.00016401362517305296, + "loss": 0.739, + "step": 564 + }, + { + "epoch": 0.30133333333333334, + "grad_norm": 0.3956011254706419, + "learning_rate": 0.00016388077034557355, + "loss": 0.6638, + "step": 565 + }, + { + "epoch": 0.30186666666666667, + "grad_norm": 0.4885937490577909, + "learning_rate": 0.00016374772476041748, + "loss": 0.7528, + "step": 566 + }, + { + "epoch": 0.3024, + "grad_norm": 0.5059711043497862, + "learning_rate": 0.00016361448881487914, + "loss": 0.8453, + "step": 567 + }, + { + "epoch": 0.30293333333333333, + "grad_norm": 0.40573829573580594, + "learning_rate": 0.00016348106290682118, + "loss": 0.7088, + "step": 568 + }, + { + "epoch": 0.30346666666666666, + "grad_norm": 0.465500685621695, + "learning_rate": 0.00016334744743467364, + "loss": 0.7224, + "step": 569 + }, + { + "epoch": 0.304, + "grad_norm": 0.46349954850323677, + "learning_rate": 0.00016321364279743266, + "loss": 0.7806, + "step": 570 + }, + { + "epoch": 0.3045333333333333, + "grad_norm": 0.4084502476742288, + "learning_rate": 0.00016307964939465914, + "loss": 0.748, + "step": 571 + }, + { + "epoch": 0.30506666666666665, + "grad_norm": 0.46618026160937986, + "learning_rate": 0.00016294546762647775, + "loss": 0.744, + "step": 572 + }, + { + "epoch": 0.3056, + "grad_norm": 0.46057581494492433, + "learning_rate": 0.0001628110978935756, + "loss": 0.7116, + "step": 573 + }, + { + "epoch": 0.3061333333333333, + "grad_norm": 0.4186403126544077, + "learning_rate": 0.0001626765405972011, + "loss": 0.6986, + "step": 574 + }, + { + "epoch": 0.30666666666666664, + "grad_norm": 0.4887227639710563, + "learning_rate": 0.00016254179613916278, + "loss": 0.7652, + "step": 575 + }, + { + "epoch": 0.3072, + "grad_norm": 0.4232905490912348, + "learning_rate": 0.00016240686492182804, + "loss": 0.7727, + "step": 576 + }, + { + "epoch": 0.30773333333333336, + "grad_norm": 0.5453311204995098, + "learning_rate": 0.000162271747348122, + "loss": 0.7893, + "step": 577 + }, + { + "epoch": 0.3082666666666667, + "grad_norm": 0.7462875293508915, + "learning_rate": 0.0001621364438215262, + "loss": 0.6562, + "step": 578 + }, + { + "epoch": 0.3088, + "grad_norm": 0.4371718880458593, + "learning_rate": 0.00016200095474607753, + "loss": 0.6955, + "step": 579 + }, + { + "epoch": 0.30933333333333335, + "grad_norm": 0.47518833463505966, + "learning_rate": 0.00016186528052636692, + "loss": 0.7473, + "step": 580 + }, + { + "epoch": 0.3098666666666667, + "grad_norm": 0.432898725708365, + "learning_rate": 0.0001617294215675382, + "loss": 0.7271, + "step": 581 + }, + { + "epoch": 0.3104, + "grad_norm": 0.4257339192815698, + "learning_rate": 0.00016159337827528685, + "loss": 0.721, + "step": 582 + }, + { + "epoch": 0.31093333333333334, + "grad_norm": 0.4725311812328365, + "learning_rate": 0.0001614571510558588, + "loss": 0.7695, + "step": 583 + }, + { + "epoch": 0.31146666666666667, + "grad_norm": 0.4114555349920045, + "learning_rate": 0.00016132074031604917, + "loss": 0.7029, + "step": 584 + }, + { + "epoch": 0.312, + "grad_norm": 0.4477687628891849, + "learning_rate": 0.0001611841464632011, + "loss": 0.7415, + "step": 585 + }, + { + "epoch": 0.31253333333333333, + "grad_norm": 0.6002534398271313, + "learning_rate": 0.00016104736990520468, + "loss": 0.7381, + "step": 586 + }, + { + "epoch": 0.31306666666666666, + "grad_norm": 0.42608905810913983, + "learning_rate": 0.0001609104110504954, + "loss": 0.703, + "step": 587 + }, + { + "epoch": 0.3136, + "grad_norm": 0.40846037614765474, + "learning_rate": 0.0001607732703080532, + "loss": 0.7207, + "step": 588 + }, + { + "epoch": 0.3141333333333333, + "grad_norm": 0.4058439104134784, + "learning_rate": 0.00016063594808740113, + "loss": 0.6249, + "step": 589 + }, + { + "epoch": 0.31466666666666665, + "grad_norm": 0.41719280094549344, + "learning_rate": 0.00016049844479860422, + "loss": 0.6666, + "step": 590 + }, + { + "epoch": 0.3152, + "grad_norm": 0.4658406262264673, + "learning_rate": 0.00016036076085226814, + "loss": 0.7272, + "step": 591 + }, + { + "epoch": 0.3157333333333333, + "grad_norm": 0.42862341920626473, + "learning_rate": 0.00016022289665953808, + "loss": 0.72, + "step": 592 + }, + { + "epoch": 0.31626666666666664, + "grad_norm": 0.41527747442009044, + "learning_rate": 0.00016008485263209742, + "loss": 0.7642, + "step": 593 + }, + { + "epoch": 0.3168, + "grad_norm": 0.4828736577182294, + "learning_rate": 0.0001599466291821666, + "loss": 0.8004, + "step": 594 + }, + { + "epoch": 0.31733333333333336, + "grad_norm": 0.4274931663452689, + "learning_rate": 0.0001598082267225018, + "loss": 0.6735, + "step": 595 + }, + { + "epoch": 0.3178666666666667, + "grad_norm": 0.3742559630665149, + "learning_rate": 0.0001596696456663938, + "loss": 0.6951, + "step": 596 + }, + { + "epoch": 0.3184, + "grad_norm": 0.40464248241590833, + "learning_rate": 0.0001595308864276666, + "loss": 0.7478, + "step": 597 + }, + { + "epoch": 0.31893333333333335, + "grad_norm": 0.44675532932647755, + "learning_rate": 0.00015939194942067646, + "loss": 0.6929, + "step": 598 + }, + { + "epoch": 0.3194666666666667, + "grad_norm": 0.46074773992887674, + "learning_rate": 0.0001592528350603103, + "loss": 0.7217, + "step": 599 + }, + { + "epoch": 0.32, + "grad_norm": 0.38647957927545307, + "learning_rate": 0.0001591135437619847, + "loss": 0.7062, + "step": 600 + }, + { + "epoch": 0.32053333333333334, + "grad_norm": 0.4246676513017054, + "learning_rate": 0.00015897407594164467, + "loss": 0.7475, + "step": 601 + }, + { + "epoch": 0.32106666666666667, + "grad_norm": 0.40868851802811557, + "learning_rate": 0.00015883443201576225, + "loss": 0.7239, + "step": 602 + }, + { + "epoch": 0.3216, + "grad_norm": 0.43669275845257827, + "learning_rate": 0.0001586946124013354, + "loss": 0.7675, + "step": 603 + }, + { + "epoch": 0.3221333333333333, + "grad_norm": 0.4700495251104396, + "learning_rate": 0.00015855461751588677, + "loss": 0.7668, + "step": 604 + }, + { + "epoch": 0.32266666666666666, + "grad_norm": 0.4245985896912604, + "learning_rate": 0.0001584144477774623, + "loss": 0.7146, + "step": 605 + }, + { + "epoch": 0.3232, + "grad_norm": 0.39626767330894036, + "learning_rate": 0.0001582741036046301, + "loss": 0.7086, + "step": 606 + }, + { + "epoch": 0.3237333333333333, + "grad_norm": 0.4420705016331542, + "learning_rate": 0.00015813358541647915, + "loss": 0.7244, + "step": 607 + }, + { + "epoch": 0.32426666666666665, + "grad_norm": 0.38883265984573706, + "learning_rate": 0.00015799289363261813, + "loss": 0.6957, + "step": 608 + }, + { + "epoch": 0.3248, + "grad_norm": 0.39593162571543766, + "learning_rate": 0.00015785202867317407, + "loss": 0.6991, + "step": 609 + }, + { + "epoch": 0.3253333333333333, + "grad_norm": 0.4282959021196411, + "learning_rate": 0.00015771099095879108, + "loss": 0.7266, + "step": 610 + }, + { + "epoch": 0.3258666666666667, + "grad_norm": 0.45935899293397536, + "learning_rate": 0.0001575697809106292, + "loss": 0.7648, + "step": 611 + }, + { + "epoch": 0.3264, + "grad_norm": 0.41579751479898225, + "learning_rate": 0.00015742839895036305, + "loss": 0.681, + "step": 612 + }, + { + "epoch": 0.32693333333333335, + "grad_norm": 0.45019992044675045, + "learning_rate": 0.00015728684550018064, + "loss": 0.7892, + "step": 613 + }, + { + "epoch": 0.3274666666666667, + "grad_norm": 0.4012938728365543, + "learning_rate": 0.0001571451209827821, + "loss": 0.7167, + "step": 614 + }, + { + "epoch": 0.328, + "grad_norm": 0.44542554540353824, + "learning_rate": 0.00015700322582137827, + "loss": 0.7242, + "step": 615 + }, + { + "epoch": 0.32853333333333334, + "grad_norm": 0.5166874109785967, + "learning_rate": 0.00015686116043968972, + "loss": 0.7425, + "step": 616 + }, + { + "epoch": 0.3290666666666667, + "grad_norm": 0.3861045162617359, + "learning_rate": 0.00015671892526194516, + "loss": 0.6238, + "step": 617 + }, + { + "epoch": 0.3296, + "grad_norm": 0.38189145539763464, + "learning_rate": 0.0001565765207128805, + "loss": 0.6525, + "step": 618 + }, + { + "epoch": 0.33013333333333333, + "grad_norm": 0.45093214082403266, + "learning_rate": 0.0001564339472177373, + "loss": 0.7241, + "step": 619 + }, + { + "epoch": 0.33066666666666666, + "grad_norm": 0.3998854029693829, + "learning_rate": 0.00015629120520226165, + "loss": 0.662, + "step": 620 + }, + { + "epoch": 0.3312, + "grad_norm": 0.4761640842735284, + "learning_rate": 0.0001561482950927029, + "loss": 0.7143, + "step": 621 + }, + { + "epoch": 0.3317333333333333, + "grad_norm": 0.42470657304255977, + "learning_rate": 0.0001560052173158123, + "loss": 0.6674, + "step": 622 + }, + { + "epoch": 0.33226666666666665, + "grad_norm": 0.49652677264204886, + "learning_rate": 0.00015586197229884184, + "loss": 0.7792, + "step": 623 + }, + { + "epoch": 0.3328, + "grad_norm": 0.4501076437599239, + "learning_rate": 0.00015571856046954285, + "loss": 0.6878, + "step": 624 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.42228597830501624, + "learning_rate": 0.00015557498225616487, + "loss": 0.7351, + "step": 625 + }, + { + "epoch": 0.33386666666666664, + "grad_norm": 0.437376548686482, + "learning_rate": 0.0001554312380874542, + "loss": 0.7486, + "step": 626 + }, + { + "epoch": 0.3344, + "grad_norm": 0.5178983174543825, + "learning_rate": 0.00015528732839265272, + "loss": 0.7648, + "step": 627 + }, + { + "epoch": 0.33493333333333336, + "grad_norm": 0.45741607403630036, + "learning_rate": 0.00015514325360149668, + "loss": 0.7689, + "step": 628 + }, + { + "epoch": 0.3354666666666667, + "grad_norm": 0.4354725296031309, + "learning_rate": 0.0001549990141442153, + "loss": 0.6399, + "step": 629 + }, + { + "epoch": 0.336, + "grad_norm": 0.46844788057159364, + "learning_rate": 0.0001548546104515294, + "loss": 0.7892, + "step": 630 + }, + { + "epoch": 0.33653333333333335, + "grad_norm": 0.48302739183458254, + "learning_rate": 0.00015471004295465035, + "loss": 0.8101, + "step": 631 + }, + { + "epoch": 0.3370666666666667, + "grad_norm": 0.48983003466422564, + "learning_rate": 0.0001545653120852787, + "loss": 0.7603, + "step": 632 + }, + { + "epoch": 0.3376, + "grad_norm": 0.48213837589840736, + "learning_rate": 0.00015442041827560274, + "loss": 0.7638, + "step": 633 + }, + { + "epoch": 0.33813333333333334, + "grad_norm": 0.4342424993196076, + "learning_rate": 0.00015427536195829742, + "loss": 0.7447, + "step": 634 + }, + { + "epoch": 0.33866666666666667, + "grad_norm": 0.4117746056539576, + "learning_rate": 0.00015413014356652286, + "loss": 0.6763, + "step": 635 + }, + { + "epoch": 0.3392, + "grad_norm": 0.486834356129473, + "learning_rate": 0.00015398476353392323, + "loss": 0.7339, + "step": 636 + }, + { + "epoch": 0.33973333333333333, + "grad_norm": 0.4033184482161715, + "learning_rate": 0.00015383922229462549, + "loss": 0.685, + "step": 637 + }, + { + "epoch": 0.34026666666666666, + "grad_norm": 0.4425030536429688, + "learning_rate": 0.00015369352028323774, + "loss": 0.7379, + "step": 638 + }, + { + "epoch": 0.3408, + "grad_norm": 0.5293214087093348, + "learning_rate": 0.00015354765793484834, + "loss": 0.7537, + "step": 639 + }, + { + "epoch": 0.3413333333333333, + "grad_norm": 0.5095684563003335, + "learning_rate": 0.0001534016356850244, + "loss": 0.7049, + "step": 640 + }, + { + "epoch": 0.34186666666666665, + "grad_norm": 0.4659713562959859, + "learning_rate": 0.0001532554539698105, + "loss": 0.7359, + "step": 641 + }, + { + "epoch": 0.3424, + "grad_norm": 0.6119798190509462, + "learning_rate": 0.00015310911322572753, + "loss": 0.737, + "step": 642 + }, + { + "epoch": 0.3429333333333333, + "grad_norm": 0.4708897469837848, + "learning_rate": 0.00015296261388977108, + "loss": 0.729, + "step": 643 + }, + { + "epoch": 0.34346666666666664, + "grad_norm": 0.44068333484512756, + "learning_rate": 0.0001528159563994104, + "loss": 0.7278, + "step": 644 + }, + { + "epoch": 0.344, + "grad_norm": 0.4134928678532226, + "learning_rate": 0.000152669141192587, + "loss": 0.7084, + "step": 645 + }, + { + "epoch": 0.34453333333333336, + "grad_norm": 0.47449717510451944, + "learning_rate": 0.00015252216870771345, + "loss": 0.7386, + "step": 646 + }, + { + "epoch": 0.3450666666666667, + "grad_norm": 0.4427366038885486, + "learning_rate": 0.00015237503938367186, + "loss": 0.7096, + "step": 647 + }, + { + "epoch": 0.3456, + "grad_norm": 0.5097252721080822, + "learning_rate": 0.00015222775365981273, + "loss": 0.7793, + "step": 648 + }, + { + "epoch": 0.34613333333333335, + "grad_norm": 0.4473496568576032, + "learning_rate": 0.00015208031197595356, + "loss": 0.7518, + "step": 649 + }, + { + "epoch": 0.3466666666666667, + "grad_norm": 0.3866823362517065, + "learning_rate": 0.0001519327147723776, + "loss": 0.6404, + "step": 650 + }, + { + "epoch": 0.3472, + "grad_norm": 0.42438262541061145, + "learning_rate": 0.00015178496248983254, + "loss": 0.7081, + "step": 651 + }, + { + "epoch": 0.34773333333333334, + "grad_norm": 0.49120729380307165, + "learning_rate": 0.0001516370555695291, + "loss": 0.6842, + "step": 652 + }, + { + "epoch": 0.34826666666666667, + "grad_norm": 0.3898907761516741, + "learning_rate": 0.00015148899445313981, + "loss": 0.6654, + "step": 653 + }, + { + "epoch": 0.3488, + "grad_norm": 0.43020618806679295, + "learning_rate": 0.00015134077958279765, + "loss": 0.718, + "step": 654 + }, + { + "epoch": 0.34933333333333333, + "grad_norm": 0.43215457447900185, + "learning_rate": 0.00015119241140109467, + "loss": 0.6917, + "step": 655 + }, + { + "epoch": 0.34986666666666666, + "grad_norm": 0.42322867622099, + "learning_rate": 0.00015104389035108077, + "loss": 0.7603, + "step": 656 + }, + { + "epoch": 0.3504, + "grad_norm": 0.4036510101547229, + "learning_rate": 0.00015089521687626243, + "loss": 0.6841, + "step": 657 + }, + { + "epoch": 0.3509333333333333, + "grad_norm": 0.4862966488933275, + "learning_rate": 0.0001507463914206012, + "loss": 0.7459, + "step": 658 + }, + { + "epoch": 0.35146666666666665, + "grad_norm": 0.4179069068821482, + "learning_rate": 0.0001505974144285124, + "loss": 0.7176, + "step": 659 + }, + { + "epoch": 0.352, + "grad_norm": 0.47772376010986584, + "learning_rate": 0.000150448286344864, + "loss": 0.7915, + "step": 660 + }, + { + "epoch": 0.3525333333333333, + "grad_norm": 0.4649768813334875, + "learning_rate": 0.00015029900761497506, + "loss": 0.716, + "step": 661 + }, + { + "epoch": 0.35306666666666664, + "grad_norm": 0.6083591803435475, + "learning_rate": 0.00015014957868461458, + "loss": 0.805, + "step": 662 + }, + { + "epoch": 0.3536, + "grad_norm": 0.4672825249220747, + "learning_rate": 0.00015000000000000001, + "loss": 0.7404, + "step": 663 + }, + { + "epoch": 0.35413333333333336, + "grad_norm": 0.4716621448203455, + "learning_rate": 0.000149850272007796, + "loss": 0.8138, + "step": 664 + }, + { + "epoch": 0.3546666666666667, + "grad_norm": 0.466442215550732, + "learning_rate": 0.00014970039515511304, + "loss": 0.7081, + "step": 665 + }, + { + "epoch": 0.3552, + "grad_norm": 0.5398723187413823, + "learning_rate": 0.00014955036988950618, + "loss": 0.7096, + "step": 666 + }, + { + "epoch": 0.35573333333333335, + "grad_norm": 0.38001693981880735, + "learning_rate": 0.0001494001966589736, + "loss": 0.6827, + "step": 667 + }, + { + "epoch": 0.3562666666666667, + "grad_norm": 0.3791680356368391, + "learning_rate": 0.00014924987591195547, + "loss": 0.7309, + "step": 668 + }, + { + "epoch": 0.3568, + "grad_norm": 0.4307431262259101, + "learning_rate": 0.00014909940809733222, + "loss": 0.6981, + "step": 669 + }, + { + "epoch": 0.35733333333333334, + "grad_norm": 0.4065214548484782, + "learning_rate": 0.0001489487936644237, + "loss": 0.7182, + "step": 670 + }, + { + "epoch": 0.35786666666666667, + "grad_norm": 0.4093230703093243, + "learning_rate": 0.00014879803306298736, + "loss": 0.6789, + "step": 671 + }, + { + "epoch": 0.3584, + "grad_norm": 0.41057555483980834, + "learning_rate": 0.00014864712674321734, + "loss": 0.7525, + "step": 672 + }, + { + "epoch": 0.3589333333333333, + "grad_norm": 0.4914727714968542, + "learning_rate": 0.00014849607515574276, + "loss": 0.772, + "step": 673 + }, + { + "epoch": 0.35946666666666666, + "grad_norm": 0.40548534436984224, + "learning_rate": 0.00014834487875162657, + "loss": 0.6628, + "step": 674 + }, + { + "epoch": 0.36, + "grad_norm": 0.4729355852851271, + "learning_rate": 0.00014819353798236427, + "loss": 0.7216, + "step": 675 + }, + { + "epoch": 0.3605333333333333, + "grad_norm": 0.5009609876468544, + "learning_rate": 0.00014804205329988225, + "loss": 0.7836, + "step": 676 + }, + { + "epoch": 0.36106666666666665, + "grad_norm": 0.3834101745585515, + "learning_rate": 0.00014789042515653687, + "loss": 0.669, + "step": 677 + }, + { + "epoch": 0.3616, + "grad_norm": 0.4800530337707906, + "learning_rate": 0.00014773865400511272, + "loss": 0.7649, + "step": 678 + }, + { + "epoch": 0.3621333333333333, + "grad_norm": 0.4884464679203419, + "learning_rate": 0.00014758674029882152, + "loss": 0.7542, + "step": 679 + }, + { + "epoch": 0.3626666666666667, + "grad_norm": 0.4380897346481966, + "learning_rate": 0.00014743468449130063, + "loss": 0.6753, + "step": 680 + }, + { + "epoch": 0.3632, + "grad_norm": 0.4135500392219363, + "learning_rate": 0.00014728248703661182, + "loss": 0.6421, + "step": 681 + }, + { + "epoch": 0.36373333333333335, + "grad_norm": 0.427629710378204, + "learning_rate": 0.00014713014838923976, + "loss": 0.7122, + "step": 682 + }, + { + "epoch": 0.3642666666666667, + "grad_norm": 0.4228166917204371, + "learning_rate": 0.00014697766900409074, + "loss": 0.6429, + "step": 683 + }, + { + "epoch": 0.3648, + "grad_norm": 0.45349718531469274, + "learning_rate": 0.00014682504933649144, + "loss": 0.7213, + "step": 684 + }, + { + "epoch": 0.36533333333333334, + "grad_norm": 0.41425188990800704, + "learning_rate": 0.0001466722898421873, + "loss": 0.7262, + "step": 685 + }, + { + "epoch": 0.3658666666666667, + "grad_norm": 0.4175380869497653, + "learning_rate": 0.0001465193909773413, + "loss": 0.6778, + "step": 686 + }, + { + "epoch": 0.3664, + "grad_norm": 0.4794562484958292, + "learning_rate": 0.00014636635319853275, + "loss": 0.7359, + "step": 687 + }, + { + "epoch": 0.36693333333333333, + "grad_norm": 0.4403153698666154, + "learning_rate": 0.00014621317696275564, + "loss": 0.7098, + "step": 688 + }, + { + "epoch": 0.36746666666666666, + "grad_norm": 0.48244562188451673, + "learning_rate": 0.00014605986272741748, + "loss": 0.7391, + "step": 689 + }, + { + "epoch": 0.368, + "grad_norm": 0.3828289265791597, + "learning_rate": 0.00014590641095033787, + "loss": 0.6977, + "step": 690 + }, + { + "epoch": 0.3685333333333333, + "grad_norm": 0.443166307466767, + "learning_rate": 0.00014575282208974702, + "loss": 0.7117, + "step": 691 + }, + { + "epoch": 0.36906666666666665, + "grad_norm": 0.4219277645777576, + "learning_rate": 0.00014559909660428468, + "loss": 0.6872, + "step": 692 + }, + { + "epoch": 0.3696, + "grad_norm": 0.4543729245200786, + "learning_rate": 0.00014544523495299842, + "loss": 0.7291, + "step": 693 + }, + { + "epoch": 0.3701333333333333, + "grad_norm": 0.4336134648315488, + "learning_rate": 0.00014529123759534255, + "loss": 0.7074, + "step": 694 + }, + { + "epoch": 0.37066666666666664, + "grad_norm": 0.5216204706317024, + "learning_rate": 0.00014513710499117647, + "loss": 0.7158, + "step": 695 + }, + { + "epoch": 0.3712, + "grad_norm": 0.4323115758318995, + "learning_rate": 0.0001449828376007636, + "loss": 0.6855, + "step": 696 + }, + { + "epoch": 0.37173333333333336, + "grad_norm": 0.38010089550321274, + "learning_rate": 0.00014482843588476974, + "loss": 0.6704, + "step": 697 + }, + { + "epoch": 0.3722666666666667, + "grad_norm": 0.42628614027943795, + "learning_rate": 0.00014467390030426186, + "loss": 0.6376, + "step": 698 + }, + { + "epoch": 0.3728, + "grad_norm": 0.4708167301566809, + "learning_rate": 0.0001445192313207067, + "loss": 0.6968, + "step": 699 + }, + { + "epoch": 0.37333333333333335, + "grad_norm": 0.38995110538793865, + "learning_rate": 0.0001443644293959693, + "loss": 0.74, + "step": 700 + }, + { + "epoch": 0.3738666666666667, + "grad_norm": 0.36705363589139167, + "learning_rate": 0.00014420949499231172, + "loss": 0.6154, + "step": 701 + }, + { + "epoch": 0.3744, + "grad_norm": 0.4028965104030593, + "learning_rate": 0.0001440544285723915, + "loss": 0.6974, + "step": 702 + }, + { + "epoch": 0.37493333333333334, + "grad_norm": 0.46764764133097736, + "learning_rate": 0.00014389923059926062, + "loss": 0.7056, + "step": 703 + }, + { + "epoch": 0.37546666666666667, + "grad_norm": 0.44110688137694515, + "learning_rate": 0.0001437439015363638, + "loss": 0.7191, + "step": 704 + }, + { + "epoch": 0.376, + "grad_norm": 0.4061256130298787, + "learning_rate": 0.00014358844184753712, + "loss": 0.6817, + "step": 705 + }, + { + "epoch": 0.37653333333333333, + "grad_norm": 0.4491104837769522, + "learning_rate": 0.00014343285199700683, + "loss": 0.7538, + "step": 706 + }, + { + "epoch": 0.37706666666666666, + "grad_norm": 0.40016104536571145, + "learning_rate": 0.0001432771324493879, + "loss": 0.6359, + "step": 707 + }, + { + "epoch": 0.3776, + "grad_norm": 0.39878375968038315, + "learning_rate": 0.00014312128366968243, + "loss": 0.6714, + "step": 708 + }, + { + "epoch": 0.3781333333333333, + "grad_norm": 0.4724281238370389, + "learning_rate": 0.00014296530612327863, + "loss": 0.7213, + "step": 709 + }, + { + "epoch": 0.37866666666666665, + "grad_norm": 0.45664756846229454, + "learning_rate": 0.00014280920027594907, + "loss": 0.7172, + "step": 710 + }, + { + "epoch": 0.3792, + "grad_norm": 0.45508257643312405, + "learning_rate": 0.00014265296659384956, + "loss": 0.8259, + "step": 711 + }, + { + "epoch": 0.3797333333333333, + "grad_norm": 0.4194073068520734, + "learning_rate": 0.00014249660554351752, + "loss": 0.6845, + "step": 712 + }, + { + "epoch": 0.38026666666666664, + "grad_norm": 0.4229920455577015, + "learning_rate": 0.00014234011759187083, + "loss": 0.7134, + "step": 713 + }, + { + "epoch": 0.3808, + "grad_norm": 0.40939403836641886, + "learning_rate": 0.00014218350320620624, + "loss": 0.7229, + "step": 714 + }, + { + "epoch": 0.38133333333333336, + "grad_norm": 0.38061804235872376, + "learning_rate": 0.00014202676285419812, + "loss": 0.6661, + "step": 715 + }, + { + "epoch": 0.3818666666666667, + "grad_norm": 0.3673298641903888, + "learning_rate": 0.00014186989700389687, + "loss": 0.6698, + "step": 716 + }, + { + "epoch": 0.3824, + "grad_norm": 0.33974242079986683, + "learning_rate": 0.0001417129061237278, + "loss": 0.6866, + "step": 717 + }, + { + "epoch": 0.38293333333333335, + "grad_norm": 0.4676215441430991, + "learning_rate": 0.0001415557906824895, + "loss": 0.7178, + "step": 718 + }, + { + "epoch": 0.3834666666666667, + "grad_norm": 0.4156798745638868, + "learning_rate": 0.00014139855114935252, + "loss": 0.6987, + "step": 719 + }, + { + "epoch": 0.384, + "grad_norm": 0.5414531691031648, + "learning_rate": 0.00014124118799385796, + "loss": 0.6823, + "step": 720 + }, + { + "epoch": 0.38453333333333334, + "grad_norm": 0.41556486731924086, + "learning_rate": 0.0001410837016859161, + "loss": 0.7134, + "step": 721 + }, + { + "epoch": 0.38506666666666667, + "grad_norm": 0.4421144449207165, + "learning_rate": 0.00014092609269580496, + "loss": 0.7082, + "step": 722 + }, + { + "epoch": 0.3856, + "grad_norm": 0.48901637476282334, + "learning_rate": 0.00014076836149416887, + "loss": 0.7534, + "step": 723 + }, + { + "epoch": 0.38613333333333333, + "grad_norm": 0.44142637208023006, + "learning_rate": 0.00014061050855201723, + "loss": 0.725, + "step": 724 + }, + { + "epoch": 0.38666666666666666, + "grad_norm": 0.4240057251397034, + "learning_rate": 0.0001404525343407228, + "loss": 0.6957, + "step": 725 + }, + { + "epoch": 0.3872, + "grad_norm": 0.439688615426819, + "learning_rate": 0.0001402944393320206, + "loss": 0.6955, + "step": 726 + }, + { + "epoch": 0.3877333333333333, + "grad_norm": 0.49073270188074036, + "learning_rate": 0.00014013622399800627, + "loss": 0.7933, + "step": 727 + }, + { + "epoch": 0.38826666666666665, + "grad_norm": 0.4063339004499898, + "learning_rate": 0.00013997788881113489, + "loss": 0.7032, + "step": 728 + }, + { + "epoch": 0.3888, + "grad_norm": 0.45779049125750276, + "learning_rate": 0.00013981943424421932, + "loss": 0.7243, + "step": 729 + }, + { + "epoch": 0.3893333333333333, + "grad_norm": 0.404914939178033, + "learning_rate": 0.0001396608607704289, + "loss": 0.6788, + "step": 730 + }, + { + "epoch": 0.38986666666666664, + "grad_norm": 0.3972068026390955, + "learning_rate": 0.0001395021688632882, + "loss": 0.6703, + "step": 731 + }, + { + "epoch": 0.3904, + "grad_norm": 0.4879619968648427, + "learning_rate": 0.00013934335899667527, + "loss": 0.7201, + "step": 732 + }, + { + "epoch": 0.39093333333333335, + "grad_norm": 0.4679964681011993, + "learning_rate": 0.00013918443164482046, + "loss": 0.7735, + "step": 733 + }, + { + "epoch": 0.3914666666666667, + "grad_norm": 0.42072753597519685, + "learning_rate": 0.000139025387282305, + "loss": 0.6898, + "step": 734 + }, + { + "epoch": 0.392, + "grad_norm": 0.4726482957516662, + "learning_rate": 0.00013886622638405952, + "loss": 0.6847, + "step": 735 + }, + { + "epoch": 0.39253333333333335, + "grad_norm": 0.4887903834493641, + "learning_rate": 0.0001387069494253626, + "loss": 0.7416, + "step": 736 + }, + { + "epoch": 0.3930666666666667, + "grad_norm": 0.40143329891058066, + "learning_rate": 0.0001385475568818394, + "loss": 0.7716, + "step": 737 + }, + { + "epoch": 0.3936, + "grad_norm": 0.41547353873035464, + "learning_rate": 0.00013838804922946027, + "loss": 0.7366, + "step": 738 + }, + { + "epoch": 0.39413333333333334, + "grad_norm": 0.40206206668994704, + "learning_rate": 0.00013822842694453924, + "loss": 0.6793, + "step": 739 + }, + { + "epoch": 0.39466666666666667, + "grad_norm": 0.41347048078549353, + "learning_rate": 0.0001380686905037327, + "loss": 0.7255, + "step": 740 + }, + { + "epoch": 0.3952, + "grad_norm": 0.46793924880980436, + "learning_rate": 0.00013790884038403795, + "loss": 0.7162, + "step": 741 + }, + { + "epoch": 0.3957333333333333, + "grad_norm": 0.38303959136396093, + "learning_rate": 0.00013774887706279165, + "loss": 0.7203, + "step": 742 + }, + { + "epoch": 0.39626666666666666, + "grad_norm": 0.4579613008032338, + "learning_rate": 0.0001375888010176686, + "loss": 0.7542, + "step": 743 + }, + { + "epoch": 0.3968, + "grad_norm": 0.4553518771062613, + "learning_rate": 0.00013742861272668012, + "loss": 0.7761, + "step": 744 + }, + { + "epoch": 0.3973333333333333, + "grad_norm": 0.3753163192522391, + "learning_rate": 0.00013726831266817278, + "loss": 0.7468, + "step": 745 + }, + { + "epoch": 0.39786666666666665, + "grad_norm": 0.4037705646029616, + "learning_rate": 0.00013710790132082692, + "loss": 0.7258, + "step": 746 + }, + { + "epoch": 0.3984, + "grad_norm": 0.5474063355670041, + "learning_rate": 0.00013694737916365517, + "loss": 0.7031, + "step": 747 + }, + { + "epoch": 0.3989333333333333, + "grad_norm": 0.40706363110628097, + "learning_rate": 0.00013678674667600102, + "loss": 0.6722, + "step": 748 + }, + { + "epoch": 0.3994666666666667, + "grad_norm": 0.4504615444140748, + "learning_rate": 0.00013662600433753745, + "loss": 0.6885, + "step": 749 + }, + { + "epoch": 0.4, + "grad_norm": 0.47968476769032353, + "learning_rate": 0.00013646515262826552, + "loss": 0.7307, + "step": 750 + }, + { + "epoch": 0.40053333333333335, + "grad_norm": 0.44061680012937754, + "learning_rate": 0.00013630419202851284, + "loss": 0.6696, + "step": 751 + }, + { + "epoch": 0.4010666666666667, + "grad_norm": 0.4305693762750893, + "learning_rate": 0.00013614312301893223, + "loss": 0.7481, + "step": 752 + }, + { + "epoch": 0.4016, + "grad_norm": 0.37222997154847426, + "learning_rate": 0.0001359819460805001, + "loss": 0.6699, + "step": 753 + }, + { + "epoch": 0.40213333333333334, + "grad_norm": 0.41079752183961143, + "learning_rate": 0.00013582066169451535, + "loss": 0.7348, + "step": 754 + }, + { + "epoch": 0.4026666666666667, + "grad_norm": 0.40773869451985867, + "learning_rate": 0.0001356592703425976, + "loss": 0.6857, + "step": 755 + }, + { + "epoch": 0.4032, + "grad_norm": 0.47169748823654517, + "learning_rate": 0.0001354977725066859, + "loss": 0.7367, + "step": 756 + }, + { + "epoch": 0.40373333333333333, + "grad_norm": 0.45001025674489187, + "learning_rate": 0.00013533616866903735, + "loss": 0.6541, + "step": 757 + }, + { + "epoch": 0.40426666666666666, + "grad_norm": 0.4345333849845057, + "learning_rate": 0.0001351744593122255, + "loss": 0.7513, + "step": 758 + }, + { + "epoch": 0.4048, + "grad_norm": 0.4438907915023372, + "learning_rate": 0.00013501264491913906, + "loss": 0.7784, + "step": 759 + }, + { + "epoch": 0.4053333333333333, + "grad_norm": 0.4133629646551236, + "learning_rate": 0.00013485072597298038, + "loss": 0.763, + "step": 760 + }, + { + "epoch": 0.40586666666666665, + "grad_norm": 0.4214493595283753, + "learning_rate": 0.00013468870295726398, + "loss": 0.7029, + "step": 761 + }, + { + "epoch": 0.4064, + "grad_norm": 0.402566253476059, + "learning_rate": 0.0001345265763558152, + "loss": 0.7763, + "step": 762 + }, + { + "epoch": 0.4069333333333333, + "grad_norm": 0.4395179020145512, + "learning_rate": 0.00013436434665276865, + "loss": 0.7388, + "step": 763 + }, + { + "epoch": 0.40746666666666664, + "grad_norm": 0.4019633578648377, + "learning_rate": 0.00013420201433256689, + "loss": 0.7098, + "step": 764 + }, + { + "epoch": 0.408, + "grad_norm": 0.4221451934470314, + "learning_rate": 0.00013403957987995882, + "loss": 0.7585, + "step": 765 + }, + { + "epoch": 0.40853333333333336, + "grad_norm": 0.49205607465247114, + "learning_rate": 0.00013387704377999842, + "loss": 0.6805, + "step": 766 + }, + { + "epoch": 0.4090666666666667, + "grad_norm": 0.4687478768923137, + "learning_rate": 0.00013371440651804313, + "loss": 0.6652, + "step": 767 + }, + { + "epoch": 0.4096, + "grad_norm": 0.4420764881796587, + "learning_rate": 0.0001335516685797525, + "loss": 0.7045, + "step": 768 + }, + { + "epoch": 0.41013333333333335, + "grad_norm": 0.48578145246982046, + "learning_rate": 0.00013338883045108674, + "loss": 0.7397, + "step": 769 + }, + { + "epoch": 0.4106666666666667, + "grad_norm": 0.39700424142863916, + "learning_rate": 0.00013322589261830517, + "loss": 0.72, + "step": 770 + }, + { + "epoch": 0.4112, + "grad_norm": 0.45825900340675857, + "learning_rate": 0.00013306285556796495, + "loss": 0.7296, + "step": 771 + }, + { + "epoch": 0.41173333333333334, + "grad_norm": 0.45527861986215046, + "learning_rate": 0.0001328997197869194, + "loss": 0.6884, + "step": 772 + }, + { + "epoch": 0.41226666666666667, + "grad_norm": 0.4481830986741094, + "learning_rate": 0.0001327364857623168, + "loss": 0.7575, + "step": 773 + }, + { + "epoch": 0.4128, + "grad_norm": 0.4987312925439467, + "learning_rate": 0.00013257315398159864, + "loss": 0.6603, + "step": 774 + }, + { + "epoch": 0.41333333333333333, + "grad_norm": 0.39534240050858144, + "learning_rate": 0.00013240972493249847, + "loss": 0.6948, + "step": 775 + }, + { + "epoch": 0.41386666666666666, + "grad_norm": 0.3968442688352337, + "learning_rate": 0.0001322461991030402, + "loss": 0.7356, + "step": 776 + }, + { + "epoch": 0.4144, + "grad_norm": 0.39866634478629737, + "learning_rate": 0.00013208257698153677, + "loss": 0.7222, + "step": 777 + }, + { + "epoch": 0.4149333333333333, + "grad_norm": 0.4371120490452384, + "learning_rate": 0.00013191885905658872, + "loss": 0.7211, + "step": 778 + }, + { + "epoch": 0.41546666666666665, + "grad_norm": 0.43599166673531314, + "learning_rate": 0.0001317550458170826, + "loss": 0.7457, + "step": 779 + }, + { + "epoch": 0.416, + "grad_norm": 0.4492614272388021, + "learning_rate": 0.00013159113775218964, + "loss": 0.7184, + "step": 780 + }, + { + "epoch": 0.4165333333333333, + "grad_norm": 0.4129896919942516, + "learning_rate": 0.00013142713535136414, + "loss": 0.7208, + "step": 781 + }, + { + "epoch": 0.41706666666666664, + "grad_norm": 0.40376552851306863, + "learning_rate": 0.00013126303910434214, + "loss": 0.7316, + "step": 782 + }, + { + "epoch": 0.4176, + "grad_norm": 0.4185572212312969, + "learning_rate": 0.00013109884950114007, + "loss": 0.7052, + "step": 783 + }, + { + "epoch": 0.41813333333333336, + "grad_norm": 0.4881424014088192, + "learning_rate": 0.00013093456703205288, + "loss": 0.7074, + "step": 784 + }, + { + "epoch": 0.4186666666666667, + "grad_norm": 0.4610877464624192, + "learning_rate": 0.00013077019218765305, + "loss": 0.7387, + "step": 785 + }, + { + "epoch": 0.4192, + "grad_norm": 0.44572746138602026, + "learning_rate": 0.00013060572545878875, + "loss": 0.663, + "step": 786 + }, + { + "epoch": 0.41973333333333335, + "grad_norm": 0.36716569675658006, + "learning_rate": 0.0001304411673365826, + "loss": 0.6498, + "step": 787 + }, + { + "epoch": 0.4202666666666667, + "grad_norm": 0.36917178328838557, + "learning_rate": 0.0001302765183124302, + "loss": 0.6883, + "step": 788 + }, + { + "epoch": 0.4208, + "grad_norm": 0.4047570582334391, + "learning_rate": 0.00013011177887799845, + "loss": 0.7388, + "step": 789 + }, + { + "epoch": 0.42133333333333334, + "grad_norm": 0.45436952725535107, + "learning_rate": 0.00012994694952522435, + "loss": 0.8112, + "step": 790 + }, + { + "epoch": 0.42186666666666667, + "grad_norm": 0.42679066990560155, + "learning_rate": 0.00012978203074631334, + "loss": 0.684, + "step": 791 + }, + { + "epoch": 0.4224, + "grad_norm": 0.4382352768580495, + "learning_rate": 0.00012961702303373795, + "loss": 0.7128, + "step": 792 + }, + { + "epoch": 0.42293333333333333, + "grad_norm": 0.4145094988517289, + "learning_rate": 0.00012945192688023624, + "loss": 0.6848, + "step": 793 + }, + { + "epoch": 0.42346666666666666, + "grad_norm": 0.36075755397066733, + "learning_rate": 0.0001292867427788104, + "loss": 0.6619, + "step": 794 + }, + { + "epoch": 0.424, + "grad_norm": 0.3712743508124415, + "learning_rate": 0.00012912147122272523, + "loss": 0.6506, + "step": 795 + }, + { + "epoch": 0.4245333333333333, + "grad_norm": 0.37546797432265555, + "learning_rate": 0.00012895611270550666, + "loss": 0.6462, + "step": 796 + }, + { + "epoch": 0.42506666666666665, + "grad_norm": 0.3767351358646228, + "learning_rate": 0.0001287906677209403, + "loss": 0.6773, + "step": 797 + }, + { + "epoch": 0.4256, + "grad_norm": 0.4710095609101957, + "learning_rate": 0.00012862513676307008, + "loss": 0.7257, + "step": 798 + }, + { + "epoch": 0.4261333333333333, + "grad_norm": 0.36281296561704746, + "learning_rate": 0.0001284595203261965, + "loss": 0.5821, + "step": 799 + }, + { + "epoch": 0.4266666666666667, + "grad_norm": 0.42079112992195267, + "learning_rate": 0.00012829381890487536, + "loss": 0.7005, + "step": 800 + }, + { + "epoch": 0.4272, + "grad_norm": 0.43713247867130345, + "learning_rate": 0.00012812803299391628, + "loss": 0.6935, + "step": 801 + }, + { + "epoch": 0.42773333333333335, + "grad_norm": 0.40721407933818515, + "learning_rate": 0.00012796216308838117, + "loss": 0.6983, + "step": 802 + }, + { + "epoch": 0.4282666666666667, + "grad_norm": 0.416091539276782, + "learning_rate": 0.00012779620968358273, + "loss": 0.7329, + "step": 803 + }, + { + "epoch": 0.4288, + "grad_norm": 0.3551946822819585, + "learning_rate": 0.00012763017327508305, + "loss": 0.6448, + "step": 804 + }, + { + "epoch": 0.42933333333333334, + "grad_norm": 0.40351741659392415, + "learning_rate": 0.00012746405435869198, + "loss": 0.7009, + "step": 805 + }, + { + "epoch": 0.4298666666666667, + "grad_norm": 0.4787621556554398, + "learning_rate": 0.00012729785343046588, + "loss": 0.7258, + "step": 806 + }, + { + "epoch": 0.4304, + "grad_norm": 0.3892528430573125, + "learning_rate": 0.0001271315709867059, + "loss": 0.6523, + "step": 807 + }, + { + "epoch": 0.43093333333333333, + "grad_norm": 0.37727840676156577, + "learning_rate": 0.00012696520752395672, + "loss": 0.5908, + "step": 808 + }, + { + "epoch": 0.43146666666666667, + "grad_norm": 0.43095274549454, + "learning_rate": 0.00012679876353900482, + "loss": 0.7181, + "step": 809 + }, + { + "epoch": 0.432, + "grad_norm": 0.38310526976068127, + "learning_rate": 0.00012663223952887723, + "loss": 0.6361, + "step": 810 + }, + { + "epoch": 0.4325333333333333, + "grad_norm": 0.5635091893555705, + "learning_rate": 0.00012646563599083996, + "loss": 0.704, + "step": 811 + }, + { + "epoch": 0.43306666666666666, + "grad_norm": 0.41744990618571426, + "learning_rate": 0.00012629895342239643, + "loss": 0.7536, + "step": 812 + }, + { + "epoch": 0.4336, + "grad_norm": 0.40509174607945353, + "learning_rate": 0.00012613219232128608, + "loss": 0.6173, + "step": 813 + }, + { + "epoch": 0.4341333333333333, + "grad_norm": 0.3940937013071912, + "learning_rate": 0.00012596535318548289, + "loss": 0.6644, + "step": 814 + }, + { + "epoch": 0.43466666666666665, + "grad_norm": 0.4578480614007167, + "learning_rate": 0.0001257984365131938, + "loss": 0.7495, + "step": 815 + }, + { + "epoch": 0.4352, + "grad_norm": 0.39450558796923313, + "learning_rate": 0.00012563144280285741, + "loss": 0.6689, + "step": 816 + }, + { + "epoch": 0.4357333333333333, + "grad_norm": 0.4470652101566401, + "learning_rate": 0.00012546437255314222, + "loss": 0.6808, + "step": 817 + }, + { + "epoch": 0.4362666666666667, + "grad_norm": 0.39295591566329835, + "learning_rate": 0.0001252972262629454, + "loss": 0.7356, + "step": 818 + }, + { + "epoch": 0.4368, + "grad_norm": 0.42222344198393386, + "learning_rate": 0.00012513000443139112, + "loss": 0.6428, + "step": 819 + }, + { + "epoch": 0.43733333333333335, + "grad_norm": 0.4625162506790576, + "learning_rate": 0.00012496270755782914, + "loss": 0.8178, + "step": 820 + }, + { + "epoch": 0.4378666666666667, + "grad_norm": 0.4022265574948015, + "learning_rate": 0.00012479533614183334, + "loss": 0.692, + "step": 821 + }, + { + "epoch": 0.4384, + "grad_norm": 0.6056096144027203, + "learning_rate": 0.00012462789068320017, + "loss": 0.6708, + "step": 822 + }, + { + "epoch": 0.43893333333333334, + "grad_norm": 0.46250737240915346, + "learning_rate": 0.00012446037168194714, + "loss": 0.7153, + "step": 823 + }, + { + "epoch": 0.43946666666666667, + "grad_norm": 0.4117622696079509, + "learning_rate": 0.00012429277963831148, + "loss": 0.6545, + "step": 824 + }, + { + "epoch": 0.44, + "grad_norm": 0.41224353888067033, + "learning_rate": 0.00012412511505274844, + "loss": 0.6996, + "step": 825 + }, + { + "epoch": 0.44053333333333333, + "grad_norm": 0.3624384260730754, + "learning_rate": 0.00012395737842592995, + "loss": 0.6831, + "step": 826 + }, + { + "epoch": 0.44106666666666666, + "grad_norm": 0.4377808159656789, + "learning_rate": 0.000123789570258743, + "loss": 0.7198, + "step": 827 + }, + { + "epoch": 0.4416, + "grad_norm": 0.36137625136521534, + "learning_rate": 0.00012362169105228826, + "loss": 0.6056, + "step": 828 + }, + { + "epoch": 0.4421333333333333, + "grad_norm": 0.37493121821106284, + "learning_rate": 0.00012345374130787854, + "loss": 0.6888, + "step": 829 + }, + { + "epoch": 0.44266666666666665, + "grad_norm": 0.3718927110258117, + "learning_rate": 0.00012328572152703725, + "loss": 0.657, + "step": 830 + }, + { + "epoch": 0.4432, + "grad_norm": 0.5031044687304947, + "learning_rate": 0.000123117632211497, + "loss": 0.744, + "step": 831 + }, + { + "epoch": 0.4437333333333333, + "grad_norm": 0.43078111700694655, + "learning_rate": 0.00012294947386319794, + "loss": 0.6694, + "step": 832 + }, + { + "epoch": 0.44426666666666664, + "grad_norm": 0.4639228000745811, + "learning_rate": 0.0001227812469842864, + "loss": 0.7747, + "step": 833 + }, + { + "epoch": 0.4448, + "grad_norm": 0.39758629630564035, + "learning_rate": 0.00012261295207711346, + "loss": 0.6426, + "step": 834 + }, + { + "epoch": 0.44533333333333336, + "grad_norm": 0.4063459316169701, + "learning_rate": 0.00012244458964423327, + "loss": 0.7086, + "step": 835 + }, + { + "epoch": 0.4458666666666667, + "grad_norm": 0.4748592102336087, + "learning_rate": 0.00012227616018840154, + "loss": 0.7329, + "step": 836 + }, + { + "epoch": 0.4464, + "grad_norm": 0.4205979075975615, + "learning_rate": 0.0001221076642125742, + "loss": 0.6909, + "step": 837 + }, + { + "epoch": 0.44693333333333335, + "grad_norm": 0.44685321510698384, + "learning_rate": 0.00012193910221990581, + "loss": 0.6986, + "step": 838 + }, + { + "epoch": 0.4474666666666667, + "grad_norm": 0.42504187867710974, + "learning_rate": 0.00012177047471374807, + "loss": 0.6404, + "step": 839 + }, + { + "epoch": 0.448, + "grad_norm": 0.49042418899558937, + "learning_rate": 0.00012160178219764837, + "loss": 0.7014, + "step": 840 + }, + { + "epoch": 0.44853333333333334, + "grad_norm": 0.41850019432821417, + "learning_rate": 0.0001214330251753481, + "loss": 0.7016, + "step": 841 + }, + { + "epoch": 0.44906666666666667, + "grad_norm": 0.45347906795750986, + "learning_rate": 0.00012126420415078132, + "loss": 0.7231, + "step": 842 + }, + { + "epoch": 0.4496, + "grad_norm": 0.387587711988338, + "learning_rate": 0.00012109531962807332, + "loss": 0.715, + "step": 843 + }, + { + "epoch": 0.45013333333333333, + "grad_norm": 0.41162680743585645, + "learning_rate": 0.00012092637211153885, + "loss": 0.6825, + "step": 844 + }, + { + "epoch": 0.45066666666666666, + "grad_norm": 0.4142208409837441, + "learning_rate": 0.0001207573621056809, + "loss": 0.676, + "step": 845 + }, + { + "epoch": 0.4512, + "grad_norm": 0.42920067190846045, + "learning_rate": 0.00012058829011518896, + "loss": 0.8196, + "step": 846 + }, + { + "epoch": 0.4517333333333333, + "grad_norm": 0.4638661761075869, + "learning_rate": 0.00012041915664493761, + "loss": 0.7171, + "step": 847 + }, + { + "epoch": 0.45226666666666665, + "grad_norm": 0.4742798926425448, + "learning_rate": 0.00012024996219998517, + "loss": 0.6643, + "step": 848 + }, + { + "epoch": 0.4528, + "grad_norm": 0.37690347692164017, + "learning_rate": 0.00012008070728557186, + "loss": 0.6683, + "step": 849 + }, + { + "epoch": 0.4533333333333333, + "grad_norm": 0.4042102819655462, + "learning_rate": 0.00011991139240711857, + "loss": 0.669, + "step": 850 + }, + { + "epoch": 0.45386666666666664, + "grad_norm": 0.3633750799310292, + "learning_rate": 0.00011974201807022525, + "loss": 0.6655, + "step": 851 + }, + { + "epoch": 0.4544, + "grad_norm": 0.4772890784984497, + "learning_rate": 0.00011957258478066931, + "loss": 0.6983, + "step": 852 + }, + { + "epoch": 0.45493333333333336, + "grad_norm": 0.4067014806459648, + "learning_rate": 0.00011940309304440433, + "loss": 0.7198, + "step": 853 + }, + { + "epoch": 0.4554666666666667, + "grad_norm": 0.4342151246933155, + "learning_rate": 0.00011923354336755835, + "loss": 0.7167, + "step": 854 + }, + { + "epoch": 0.456, + "grad_norm": 0.4834339455954333, + "learning_rate": 0.00011906393625643244, + "loss": 0.6081, + "step": 855 + }, + { + "epoch": 0.45653333333333335, + "grad_norm": 0.4679919863240487, + "learning_rate": 0.00011889427221749916, + "loss": 0.7148, + "step": 856 + }, + { + "epoch": 0.4570666666666667, + "grad_norm": 0.3728667535780497, + "learning_rate": 0.00011872455175740112, + "loss": 0.7109, + "step": 857 + }, + { + "epoch": 0.4576, + "grad_norm": 0.4180859609410972, + "learning_rate": 0.00011855477538294935, + "loss": 0.6249, + "step": 858 + }, + { + "epoch": 0.45813333333333334, + "grad_norm": 0.4440989914199599, + "learning_rate": 0.00011838494360112185, + "loss": 0.7309, + "step": 859 + }, + { + "epoch": 0.45866666666666667, + "grad_norm": 0.6043139218010019, + "learning_rate": 0.00011821505691906216, + "loss": 0.8382, + "step": 860 + }, + { + "epoch": 0.4592, + "grad_norm": 0.48735406987908186, + "learning_rate": 0.00011804511584407763, + "loss": 0.707, + "step": 861 + }, + { + "epoch": 0.4597333333333333, + "grad_norm": 0.4125045692505512, + "learning_rate": 0.00011787512088363817, + "loss": 0.6703, + "step": 862 + }, + { + "epoch": 0.46026666666666666, + "grad_norm": 0.5310983266217417, + "learning_rate": 0.00011770507254537453, + "loss": 0.7686, + "step": 863 + }, + { + "epoch": 0.4608, + "grad_norm": 0.5097574837010186, + "learning_rate": 0.00011753497133707679, + "loss": 0.7524, + "step": 864 + }, + { + "epoch": 0.4613333333333333, + "grad_norm": 0.47956622154740414, + "learning_rate": 0.00011736481776669306, + "loss": 0.685, + "step": 865 + }, + { + "epoch": 0.46186666666666665, + "grad_norm": 0.4561679545204111, + "learning_rate": 0.00011719461234232764, + "loss": 0.7041, + "step": 866 + }, + { + "epoch": 0.4624, + "grad_norm": 0.4154388504558048, + "learning_rate": 0.00011702435557223987, + "loss": 0.6099, + "step": 867 + }, + { + "epoch": 0.4629333333333333, + "grad_norm": 0.4543256122908966, + "learning_rate": 0.00011685404796484225, + "loss": 0.6958, + "step": 868 + }, + { + "epoch": 0.4634666666666667, + "grad_norm": 0.41312564730628, + "learning_rate": 0.00011668369002869912, + "loss": 0.6957, + "step": 869 + }, + { + "epoch": 0.464, + "grad_norm": 0.44743686800004545, + "learning_rate": 0.00011651328227252517, + "loss": 0.7224, + "step": 870 + }, + { + "epoch": 0.46453333333333335, + "grad_norm": 0.46432629274912773, + "learning_rate": 0.00011634282520518383, + "loss": 0.6909, + "step": 871 + }, + { + "epoch": 0.4650666666666667, + "grad_norm": 0.4492010051884188, + "learning_rate": 0.00011617231933568578, + "loss": 0.7444, + "step": 872 + }, + { + "epoch": 0.4656, + "grad_norm": 0.4040632376780653, + "learning_rate": 0.00011600176517318741, + "loss": 0.6699, + "step": 873 + }, + { + "epoch": 0.46613333333333334, + "grad_norm": 0.3778954928686016, + "learning_rate": 0.00011583116322698935, + "loss": 0.6573, + "step": 874 + }, + { + "epoch": 0.4666666666666667, + "grad_norm": 0.42941828850566793, + "learning_rate": 0.00011566051400653486, + "loss": 0.6319, + "step": 875 + }, + { + "epoch": 0.4672, + "grad_norm": 0.4268571606261546, + "learning_rate": 0.00011548981802140848, + "loss": 0.7451, + "step": 876 + }, + { + "epoch": 0.46773333333333333, + "grad_norm": 0.39453614407684384, + "learning_rate": 0.00011531907578133429, + "loss": 0.7385, + "step": 877 + }, + { + "epoch": 0.46826666666666666, + "grad_norm": 0.4036651489847049, + "learning_rate": 0.00011514828779617459, + "loss": 0.6389, + "step": 878 + }, + { + "epoch": 0.4688, + "grad_norm": 0.3973231219955467, + "learning_rate": 0.00011497745457592816, + "loss": 0.713, + "step": 879 + }, + { + "epoch": 0.4693333333333333, + "grad_norm": 0.3938388324647291, + "learning_rate": 0.00011480657663072896, + "loss": 0.6975, + "step": 880 + }, + { + "epoch": 0.46986666666666665, + "grad_norm": 0.38572634041719334, + "learning_rate": 0.00011463565447084445, + "loss": 0.6308, + "step": 881 + }, + { + "epoch": 0.4704, + "grad_norm": 0.46729208523626325, + "learning_rate": 0.00011446468860667421, + "loss": 0.718, + "step": 882 + }, + { + "epoch": 0.4709333333333333, + "grad_norm": 0.4124496636141174, + "learning_rate": 0.00011429367954874819, + "loss": 0.6791, + "step": 883 + }, + { + "epoch": 0.47146666666666665, + "grad_norm": 0.42916527827309303, + "learning_rate": 0.0001141226278077254, + "loss": 0.664, + "step": 884 + }, + { + "epoch": 0.472, + "grad_norm": 0.3858954342354503, + "learning_rate": 0.00011395153389439233, + "loss": 0.6626, + "step": 885 + }, + { + "epoch": 0.47253333333333336, + "grad_norm": 0.394754016424999, + "learning_rate": 0.00011378039831966134, + "loss": 0.7088, + "step": 886 + }, + { + "epoch": 0.4730666666666667, + "grad_norm": 0.4383828229399499, + "learning_rate": 0.00011360922159456928, + "loss": 0.6659, + "step": 887 + }, + { + "epoch": 0.4736, + "grad_norm": 0.4018923458310357, + "learning_rate": 0.00011343800423027582, + "loss": 0.716, + "step": 888 + }, + { + "epoch": 0.47413333333333335, + "grad_norm": 0.40402159693430423, + "learning_rate": 0.00011326674673806195, + "loss": 0.6761, + "step": 889 + }, + { + "epoch": 0.4746666666666667, + "grad_norm": 0.39471185256601704, + "learning_rate": 0.00011309544962932862, + "loss": 0.6687, + "step": 890 + }, + { + "epoch": 0.4752, + "grad_norm": 0.37248706946436677, + "learning_rate": 0.0001129241134155949, + "loss": 0.6876, + "step": 891 + }, + { + "epoch": 0.47573333333333334, + "grad_norm": 0.43095559414585216, + "learning_rate": 0.00011275273860849684, + "loss": 0.7514, + "step": 892 + }, + { + "epoch": 0.47626666666666667, + "grad_norm": 0.37468690060253285, + "learning_rate": 0.00011258132571978555, + "loss": 0.6628, + "step": 893 + }, + { + "epoch": 0.4768, + "grad_norm": 0.39739859446740033, + "learning_rate": 0.00011240987526132594, + "loss": 0.6563, + "step": 894 + }, + { + "epoch": 0.47733333333333333, + "grad_norm": 0.4069890132906869, + "learning_rate": 0.00011223838774509514, + "loss": 0.7077, + "step": 895 + }, + { + "epoch": 0.47786666666666666, + "grad_norm": 0.47143700583514053, + "learning_rate": 0.00011206686368318086, + "loss": 0.7081, + "step": 896 + }, + { + "epoch": 0.4784, + "grad_norm": 0.4738869800644348, + "learning_rate": 0.00011189530358778005, + "loss": 0.6958, + "step": 897 + }, + { + "epoch": 0.4789333333333333, + "grad_norm": 0.38031191606305387, + "learning_rate": 0.00011172370797119712, + "loss": 0.6723, + "step": 898 + }, + { + "epoch": 0.47946666666666665, + "grad_norm": 0.46867394581105737, + "learning_rate": 0.00011155207734584263, + "loss": 0.7427, + "step": 899 + }, + { + "epoch": 0.48, + "grad_norm": 0.40511505624824656, + "learning_rate": 0.00011138041222423177, + "loss": 0.6834, + "step": 900 + }, + { + "epoch": 0.4805333333333333, + "grad_norm": 0.37366106956900996, + "learning_rate": 0.00011120871311898254, + "loss": 0.679, + "step": 901 + }, + { + "epoch": 0.48106666666666664, + "grad_norm": 0.4043939330555839, + "learning_rate": 0.0001110369805428146, + "loss": 0.6883, + "step": 902 + }, + { + "epoch": 0.4816, + "grad_norm": 0.4260025289606062, + "learning_rate": 0.00011086521500854745, + "loss": 0.7284, + "step": 903 + }, + { + "epoch": 0.48213333333333336, + "grad_norm": 0.39658217703840415, + "learning_rate": 0.0001106934170290991, + "loss": 0.717, + "step": 904 + }, + { + "epoch": 0.4826666666666667, + "grad_norm": 0.4631914880344376, + "learning_rate": 0.00011052158711748434, + "loss": 0.7042, + "step": 905 + }, + { + "epoch": 0.4832, + "grad_norm": 0.4275072332001429, + "learning_rate": 0.00011034972578681338, + "loss": 0.6456, + "step": 906 + }, + { + "epoch": 0.48373333333333335, + "grad_norm": 0.496698488911602, + "learning_rate": 0.00011017783355029026, + "loss": 0.7391, + "step": 907 + }, + { + "epoch": 0.4842666666666667, + "grad_norm": 0.4266729730283349, + "learning_rate": 0.00011000591092121127, + "loss": 0.6477, + "step": 908 + }, + { + "epoch": 0.4848, + "grad_norm": 0.40492842730773204, + "learning_rate": 0.00010983395841296348, + "loss": 0.7057, + "step": 909 + }, + { + "epoch": 0.48533333333333334, + "grad_norm": 0.417203016214931, + "learning_rate": 0.0001096619765390232, + "loss": 0.7141, + "step": 910 + }, + { + "epoch": 0.48586666666666667, + "grad_norm": 0.3596592617100103, + "learning_rate": 0.00010948996581295436, + "loss": 0.6502, + "step": 911 + }, + { + "epoch": 0.4864, + "grad_norm": 0.38557498729928935, + "learning_rate": 0.00010931792674840718, + "loss": 0.6876, + "step": 912 + }, + { + "epoch": 0.48693333333333333, + "grad_norm": 0.40103013114864566, + "learning_rate": 0.00010914585985911632, + "loss": 0.6916, + "step": 913 + }, + { + "epoch": 0.48746666666666666, + "grad_norm": 0.38436379613282606, + "learning_rate": 0.00010897376565889971, + "loss": 0.5922, + "step": 914 + }, + { + "epoch": 0.488, + "grad_norm": 0.47627052391103164, + "learning_rate": 0.00010880164466165674, + "loss": 0.7174, + "step": 915 + }, + { + "epoch": 0.4885333333333333, + "grad_norm": 0.4073969434937531, + "learning_rate": 0.00010862949738136681, + "loss": 0.6773, + "step": 916 + }, + { + "epoch": 0.48906666666666665, + "grad_norm": 0.3773290918934115, + "learning_rate": 0.00010845732433208779, + "loss": 0.6248, + "step": 917 + }, + { + "epoch": 0.4896, + "grad_norm": 0.43903890663176887, + "learning_rate": 0.00010828512602795462, + "loss": 0.6832, + "step": 918 + }, + { + "epoch": 0.4901333333333333, + "grad_norm": 0.39105490582606944, + "learning_rate": 0.00010811290298317755, + "loss": 0.6208, + "step": 919 + }, + { + "epoch": 0.49066666666666664, + "grad_norm": 0.36021733418000385, + "learning_rate": 0.00010794065571204072, + "loss": 0.5952, + "step": 920 + }, + { + "epoch": 0.4912, + "grad_norm": 0.3646626277686097, + "learning_rate": 0.00010776838472890065, + "loss": 0.6183, + "step": 921 + }, + { + "epoch": 0.49173333333333336, + "grad_norm": 0.40163882230877435, + "learning_rate": 0.00010759609054818458, + "loss": 0.6859, + "step": 922 + }, + { + "epoch": 0.4922666666666667, + "grad_norm": 0.4406773319947825, + "learning_rate": 0.00010742377368438914, + "loss": 0.7309, + "step": 923 + }, + { + "epoch": 0.4928, + "grad_norm": 0.400770912566185, + "learning_rate": 0.00010725143465207867, + "loss": 0.6978, + "step": 924 + }, + { + "epoch": 0.49333333333333335, + "grad_norm": 0.4090972088523414, + "learning_rate": 0.00010707907396588361, + "loss": 0.6985, + "step": 925 + }, + { + "epoch": 0.4938666666666667, + "grad_norm": 0.4430850034506034, + "learning_rate": 0.0001069066921404992, + "loss": 0.7171, + "step": 926 + }, + { + "epoch": 0.4944, + "grad_norm": 0.5104233164266083, + "learning_rate": 0.00010673428969068364, + "loss": 0.7206, + "step": 927 + }, + { + "epoch": 0.49493333333333334, + "grad_norm": 0.5337145411704607, + "learning_rate": 0.00010656186713125689, + "loss": 0.7028, + "step": 928 + }, + { + "epoch": 0.49546666666666667, + "grad_norm": 0.4654800068840646, + "learning_rate": 0.0001063894249770989, + "loss": 0.7164, + "step": 929 + }, + { + "epoch": 0.496, + "grad_norm": 0.45156944572502017, + "learning_rate": 0.00010621696374314807, + "loss": 0.7286, + "step": 930 + }, + { + "epoch": 0.4965333333333333, + "grad_norm": 0.4488343394437967, + "learning_rate": 0.00010604448394439983, + "loss": 0.6555, + "step": 931 + }, + { + "epoch": 0.49706666666666666, + "grad_norm": 0.4176500164666214, + "learning_rate": 0.00010587198609590505, + "loss": 0.7031, + "step": 932 + }, + { + "epoch": 0.4976, + "grad_norm": 0.4568540494066028, + "learning_rate": 0.00010569947071276847, + "loss": 0.6915, + "step": 933 + }, + { + "epoch": 0.4981333333333333, + "grad_norm": 0.36636546293185185, + "learning_rate": 0.00010552693831014726, + "loss": 0.6404, + "step": 934 + }, + { + "epoch": 0.49866666666666665, + "grad_norm": 0.43114833573416217, + "learning_rate": 0.0001053543894032493, + "loss": 0.6521, + "step": 935 + }, + { + "epoch": 0.4992, + "grad_norm": 0.3939861121073195, + "learning_rate": 0.00010518182450733186, + "loss": 0.6629, + "step": 936 + }, + { + "epoch": 0.4997333333333333, + "grad_norm": 0.4260211410168558, + "learning_rate": 0.00010500924413769988, + "loss": 0.7409, + "step": 937 + }, + { + "epoch": 0.5002666666666666, + "grad_norm": 0.3940632056381582, + "learning_rate": 0.00010483664880970457, + "loss": 0.6455, + "step": 938 + }, + { + "epoch": 0.5008, + "grad_norm": 0.47128360899565275, + "learning_rate": 0.00010466403903874176, + "loss": 0.7293, + "step": 939 + }, + { + "epoch": 0.5013333333333333, + "grad_norm": 0.4570639352777145, + "learning_rate": 0.00010449141534025045, + "loss": 0.7703, + "step": 940 + }, + { + "epoch": 0.5018666666666667, + "grad_norm": 0.44166486237645275, + "learning_rate": 0.00010431877822971117, + "loss": 0.6754, + "step": 941 + }, + { + "epoch": 0.5024, + "grad_norm": 0.4291141079011354, + "learning_rate": 0.00010414612822264455, + "loss": 0.7155, + "step": 942 + }, + { + "epoch": 0.5029333333333333, + "grad_norm": 0.43701127729116224, + "learning_rate": 0.00010397346583460971, + "loss": 0.7297, + "step": 943 + }, + { + "epoch": 0.5034666666666666, + "grad_norm": 0.41251108166319733, + "learning_rate": 0.0001038007915812028, + "loss": 0.7382, + "step": 944 + }, + { + "epoch": 0.504, + "grad_norm": 0.44539839009574256, + "learning_rate": 0.00010362810597805526, + "loss": 0.7178, + "step": 945 + }, + { + "epoch": 0.5045333333333333, + "grad_norm": 0.35839327545970345, + "learning_rate": 0.0001034554095408326, + "loss": 0.6291, + "step": 946 + }, + { + "epoch": 0.5050666666666667, + "grad_norm": 0.4066840426301633, + "learning_rate": 0.00010328270278523256, + "loss": 0.6569, + "step": 947 + }, + { + "epoch": 0.5056, + "grad_norm": 0.4381921529525435, + "learning_rate": 0.0001031099862269837, + "loss": 0.7229, + "step": 948 + }, + { + "epoch": 0.5061333333333333, + "grad_norm": 0.4095186601877654, + "learning_rate": 0.00010293726038184393, + "loss": 0.7245, + "step": 949 + }, + { + "epoch": 0.5066666666666667, + "grad_norm": 0.38636903150512125, + "learning_rate": 0.00010276452576559879, + "loss": 0.6828, + "step": 950 + }, + { + "epoch": 0.5072, + "grad_norm": 0.6260488460130194, + "learning_rate": 0.00010259178289406011, + "loss": 0.816, + "step": 951 + }, + { + "epoch": 0.5077333333333334, + "grad_norm": 0.37943573207293746, + "learning_rate": 0.00010241903228306431, + "loss": 0.6282, + "step": 952 + }, + { + "epoch": 0.5082666666666666, + "grad_norm": 0.45185514579403613, + "learning_rate": 0.0001022462744484709, + "loss": 0.6902, + "step": 953 + }, + { + "epoch": 0.5088, + "grad_norm": 0.44337785555605214, + "learning_rate": 0.00010207350990616107, + "loss": 0.681, + "step": 954 + }, + { + "epoch": 0.5093333333333333, + "grad_norm": 0.47565324340343806, + "learning_rate": 0.00010190073917203589, + "loss": 0.7147, + "step": 955 + }, + { + "epoch": 0.5098666666666667, + "grad_norm": 0.43158006078313926, + "learning_rate": 0.00010172796276201503, + "loss": 0.7381, + "step": 956 + }, + { + "epoch": 0.5104, + "grad_norm": 0.4187257063640549, + "learning_rate": 0.0001015551811920351, + "loss": 0.7025, + "step": 957 + }, + { + "epoch": 0.5109333333333334, + "grad_norm": 0.37507567267923686, + "learning_rate": 0.00010138239497804804, + "loss": 0.7284, + "step": 958 + }, + { + "epoch": 0.5114666666666666, + "grad_norm": 0.4362336355553638, + "learning_rate": 0.00010120960463601976, + "loss": 0.6565, + "step": 959 + }, + { + "epoch": 0.512, + "grad_norm": 0.41957720951009236, + "learning_rate": 0.00010103681068192845, + "loss": 0.6875, + "step": 960 + }, + { + "epoch": 0.5125333333333333, + "grad_norm": 0.42848222659383123, + "learning_rate": 0.00010086401363176305, + "loss": 0.6724, + "step": 961 + }, + { + "epoch": 0.5130666666666667, + "grad_norm": 0.38569769294882406, + "learning_rate": 0.00010069121400152181, + "loss": 0.659, + "step": 962 + }, + { + "epoch": 0.5136, + "grad_norm": 0.43222419573723375, + "learning_rate": 0.00010051841230721065, + "loss": 0.7376, + "step": 963 + }, + { + "epoch": 0.5141333333333333, + "grad_norm": 0.41762869586994045, + "learning_rate": 0.0001003456090648416, + "loss": 0.6672, + "step": 964 + }, + { + "epoch": 0.5146666666666667, + "grad_norm": 0.4151854212676284, + "learning_rate": 0.00010017280479043147, + "loss": 0.681, + "step": 965 + }, + { + "epoch": 0.5152, + "grad_norm": 0.4013939727263469, + "learning_rate": 0.0001, + "loss": 0.6539, + "step": 966 + }, + { + "epoch": 0.5157333333333334, + "grad_norm": 0.44025614029414273, + "learning_rate": 9.982719520956855e-05, + "loss": 0.627, + "step": 967 + }, + { + "epoch": 0.5162666666666667, + "grad_norm": 0.4851943989181806, + "learning_rate": 9.965439093515841e-05, + "loss": 0.6842, + "step": 968 + }, + { + "epoch": 0.5168, + "grad_norm": 0.4806947511141547, + "learning_rate": 9.948158769278939e-05, + "loss": 0.6682, + "step": 969 + }, + { + "epoch": 0.5173333333333333, + "grad_norm": 0.4117238232036571, + "learning_rate": 9.930878599847821e-05, + "loss": 0.6487, + "step": 970 + }, + { + "epoch": 0.5178666666666667, + "grad_norm": 0.47733044825034754, + "learning_rate": 9.913598636823693e-05, + "loss": 0.7449, + "step": 971 + }, + { + "epoch": 0.5184, + "grad_norm": 0.48317342547621384, + "learning_rate": 9.896318931807155e-05, + "loss": 0.7296, + "step": 972 + }, + { + "epoch": 0.5189333333333334, + "grad_norm": 0.3268073323556687, + "learning_rate": 9.879039536398024e-05, + "loss": 0.6212, + "step": 973 + }, + { + "epoch": 0.5194666666666666, + "grad_norm": 0.3915524465052016, + "learning_rate": 9.861760502195197e-05, + "loss": 0.7185, + "step": 974 + }, + { + "epoch": 0.52, + "grad_norm": 0.3817787587426184, + "learning_rate": 9.844481880796491e-05, + "loss": 0.7066, + "step": 975 + }, + { + "epoch": 0.5205333333333333, + "grad_norm": 0.41660734367905067, + "learning_rate": 9.827203723798498e-05, + "loss": 0.6731, + "step": 976 + }, + { + "epoch": 0.5210666666666667, + "grad_norm": 0.46479312837310793, + "learning_rate": 9.809926082796415e-05, + "loss": 0.684, + "step": 977 + }, + { + "epoch": 0.5216, + "grad_norm": 0.4008729136774808, + "learning_rate": 9.792649009383899e-05, + "loss": 0.6768, + "step": 978 + }, + { + "epoch": 0.5221333333333333, + "grad_norm": 0.4242250122226439, + "learning_rate": 9.775372555152912e-05, + "loss": 0.6652, + "step": 979 + }, + { + "epoch": 0.5226666666666666, + "grad_norm": 0.43686101699854446, + "learning_rate": 9.758096771693573e-05, + "loss": 0.7035, + "step": 980 + }, + { + "epoch": 0.5232, + "grad_norm": 0.41471733807270394, + "learning_rate": 9.740821710593989e-05, + "loss": 0.6341, + "step": 981 + }, + { + "epoch": 0.5237333333333334, + "grad_norm": 0.4216582084990744, + "learning_rate": 9.723547423440122e-05, + "loss": 0.6415, + "step": 982 + }, + { + "epoch": 0.5242666666666667, + "grad_norm": 0.3600462255058733, + "learning_rate": 9.70627396181561e-05, + "loss": 0.6844, + "step": 983 + }, + { + "epoch": 0.5248, + "grad_norm": 0.4057188519578744, + "learning_rate": 9.689001377301633e-05, + "loss": 0.6586, + "step": 984 + }, + { + "epoch": 0.5253333333333333, + "grad_norm": 0.39840093105740715, + "learning_rate": 9.671729721476746e-05, + "loss": 0.6503, + "step": 985 + }, + { + "epoch": 0.5258666666666667, + "grad_norm": 0.4256026109442371, + "learning_rate": 9.654459045916743e-05, + "loss": 0.7083, + "step": 986 + }, + { + "epoch": 0.5264, + "grad_norm": 0.5092089395428362, + "learning_rate": 9.637189402194476e-05, + "loss": 0.6475, + "step": 987 + }, + { + "epoch": 0.5269333333333334, + "grad_norm": 0.38207338014783776, + "learning_rate": 9.619920841879725e-05, + "loss": 0.6826, + "step": 988 + }, + { + "epoch": 0.5274666666666666, + "grad_norm": 0.4082864767051089, + "learning_rate": 9.602653416539031e-05, + "loss": 0.6569, + "step": 989 + }, + { + "epoch": 0.528, + "grad_norm": 0.458709563299764, + "learning_rate": 9.585387177735547e-05, + "loss": 0.6756, + "step": 990 + }, + { + "epoch": 0.5285333333333333, + "grad_norm": 0.40946611755676315, + "learning_rate": 9.568122177028884e-05, + "loss": 0.6861, + "step": 991 + }, + { + "epoch": 0.5290666666666667, + "grad_norm": 0.3739907081716999, + "learning_rate": 9.550858465974958e-05, + "loss": 0.6836, + "step": 992 + }, + { + "epoch": 0.5296, + "grad_norm": 0.4856089213522823, + "learning_rate": 9.533596096125825e-05, + "loss": 0.6764, + "step": 993 + }, + { + "epoch": 0.5301333333333333, + "grad_norm": 0.3766340947339158, + "learning_rate": 9.516335119029546e-05, + "loss": 0.6089, + "step": 994 + }, + { + "epoch": 0.5306666666666666, + "grad_norm": 0.3819603841632572, + "learning_rate": 9.499075586230013e-05, + "loss": 0.6088, + "step": 995 + }, + { + "epoch": 0.5312, + "grad_norm": 0.4723446206807123, + "learning_rate": 9.481817549266817e-05, + "loss": 0.7164, + "step": 996 + }, + { + "epoch": 0.5317333333333333, + "grad_norm": 0.4358190408613106, + "learning_rate": 9.464561059675073e-05, + "loss": 0.7187, + "step": 997 + }, + { + "epoch": 0.5322666666666667, + "grad_norm": 0.42383727681651934, + "learning_rate": 9.44730616898528e-05, + "loss": 0.6573, + "step": 998 + }, + { + "epoch": 0.5328, + "grad_norm": 0.36709869944338097, + "learning_rate": 9.430052928723153e-05, + "loss": 0.6417, + "step": 999 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 0.4062515478356949, + "learning_rate": 9.412801390409497e-05, + "loss": 0.6724, + "step": 1000 + }, + { + "epoch": 0.5338666666666667, + "grad_norm": 0.4354605385703807, + "learning_rate": 9.395551605560018e-05, + "loss": 0.7404, + "step": 1001 + }, + { + "epoch": 0.5344, + "grad_norm": 0.4763139923022005, + "learning_rate": 9.378303625685195e-05, + "loss": 0.7233, + "step": 1002 + }, + { + "epoch": 0.5349333333333334, + "grad_norm": 0.3750523405124464, + "learning_rate": 9.361057502290113e-05, + "loss": 0.6729, + "step": 1003 + }, + { + "epoch": 0.5354666666666666, + "grad_norm": 0.42329434731731186, + "learning_rate": 9.343813286874312e-05, + "loss": 0.6796, + "step": 1004 + }, + { + "epoch": 0.536, + "grad_norm": 0.3859846464386311, + "learning_rate": 9.326571030931637e-05, + "loss": 0.6804, + "step": 1005 + }, + { + "epoch": 0.5365333333333333, + "grad_norm": 0.41894947258061355, + "learning_rate": 9.309330785950086e-05, + "loss": 0.6697, + "step": 1006 + }, + { + "epoch": 0.5370666666666667, + "grad_norm": 0.4314825350716821, + "learning_rate": 9.292092603411641e-05, + "loss": 0.6496, + "step": 1007 + }, + { + "epoch": 0.5376, + "grad_norm": 0.4714068042823263, + "learning_rate": 9.274856534792138e-05, + "loss": 0.6685, + "step": 1008 + }, + { + "epoch": 0.5381333333333334, + "grad_norm": 0.4283526186401107, + "learning_rate": 9.257622631561085e-05, + "loss": 0.7105, + "step": 1009 + }, + { + "epoch": 0.5386666666666666, + "grad_norm": 0.47671975373092673, + "learning_rate": 9.240390945181543e-05, + "loss": 0.7895, + "step": 1010 + }, + { + "epoch": 0.5392, + "grad_norm": 0.42490613681725325, + "learning_rate": 9.223161527109937e-05, + "loss": 0.755, + "step": 1011 + }, + { + "epoch": 0.5397333333333333, + "grad_norm": 0.43598164411052964, + "learning_rate": 9.205934428795929e-05, + "loss": 0.5963, + "step": 1012 + }, + { + "epoch": 0.5402666666666667, + "grad_norm": 0.5044110827187931, + "learning_rate": 9.188709701682247e-05, + "loss": 0.6823, + "step": 1013 + }, + { + "epoch": 0.5408, + "grad_norm": 0.4163631762938455, + "learning_rate": 9.171487397204539e-05, + "loss": 0.6874, + "step": 1014 + }, + { + "epoch": 0.5413333333333333, + "grad_norm": 0.6574527142102977, + "learning_rate": 9.154267566791223e-05, + "loss": 0.6819, + "step": 1015 + }, + { + "epoch": 0.5418666666666667, + "grad_norm": 0.41503089259838405, + "learning_rate": 9.137050261863324e-05, + "loss": 0.6267, + "step": 1016 + }, + { + "epoch": 0.5424, + "grad_norm": 0.4371772499619709, + "learning_rate": 9.119835533834331e-05, + "loss": 0.7382, + "step": 1017 + }, + { + "epoch": 0.5429333333333334, + "grad_norm": 0.42666590640185137, + "learning_rate": 9.102623434110028e-05, + "loss": 0.7149, + "step": 1018 + }, + { + "epoch": 0.5434666666666667, + "grad_norm": 0.36601998126699087, + "learning_rate": 9.085414014088369e-05, + "loss": 0.6834, + "step": 1019 + }, + { + "epoch": 0.544, + "grad_norm": 0.5437344236236936, + "learning_rate": 9.068207325159284e-05, + "loss": 0.7006, + "step": 1020 + }, + { + "epoch": 0.5445333333333333, + "grad_norm": 0.38132873616786067, + "learning_rate": 9.051003418704565e-05, + "loss": 0.7248, + "step": 1021 + }, + { + "epoch": 0.5450666666666667, + "grad_norm": 0.47486074810590434, + "learning_rate": 9.033802346097682e-05, + "loss": 0.7776, + "step": 1022 + }, + { + "epoch": 0.5456, + "grad_norm": 0.45698397555151876, + "learning_rate": 9.016604158703654e-05, + "loss": 0.6543, + "step": 1023 + }, + { + "epoch": 0.5461333333333334, + "grad_norm": 0.4222745211512193, + "learning_rate": 8.999408907878877e-05, + "loss": 0.6682, + "step": 1024 + }, + { + "epoch": 0.5466666666666666, + "grad_norm": 0.3975252007203213, + "learning_rate": 8.982216644970979e-05, + "loss": 0.7073, + "step": 1025 + }, + { + "epoch": 0.5472, + "grad_norm": 0.36462621894528313, + "learning_rate": 8.965027421318665e-05, + "loss": 0.6548, + "step": 1026 + }, + { + "epoch": 0.5477333333333333, + "grad_norm": 0.46299205272186583, + "learning_rate": 8.947841288251568e-05, + "loss": 0.7228, + "step": 1027 + }, + { + "epoch": 0.5482666666666667, + "grad_norm": 0.4099166364848886, + "learning_rate": 8.930658297090091e-05, + "loss": 0.6679, + "step": 1028 + }, + { + "epoch": 0.5488, + "grad_norm": 0.3563235630214903, + "learning_rate": 8.913478499145254e-05, + "loss": 0.6399, + "step": 1029 + }, + { + "epoch": 0.5493333333333333, + "grad_norm": 0.42017677018775823, + "learning_rate": 8.896301945718541e-05, + "loss": 0.6557, + "step": 1030 + }, + { + "epoch": 0.5498666666666666, + "grad_norm": 0.37422163801109426, + "learning_rate": 8.879128688101749e-05, + "loss": 0.6781, + "step": 1031 + }, + { + "epoch": 0.5504, + "grad_norm": 0.4244094126169354, + "learning_rate": 8.861958777576827e-05, + "loss": 0.7105, + "step": 1032 + }, + { + "epoch": 0.5509333333333334, + "grad_norm": 0.4273457906424879, + "learning_rate": 8.844792265415738e-05, + "loss": 0.7027, + "step": 1033 + }, + { + "epoch": 0.5514666666666667, + "grad_norm": 0.5429630506062204, + "learning_rate": 8.827629202880293e-05, + "loss": 0.7273, + "step": 1034 + }, + { + "epoch": 0.552, + "grad_norm": 0.3945842034493147, + "learning_rate": 8.810469641222001e-05, + "loss": 0.6233, + "step": 1035 + }, + { + "epoch": 0.5525333333333333, + "grad_norm": 0.4276874529823611, + "learning_rate": 8.793313631681915e-05, + "loss": 0.6882, + "step": 1036 + }, + { + "epoch": 0.5530666666666667, + "grad_norm": 0.4383449056836858, + "learning_rate": 8.776161225490489e-05, + "loss": 0.7074, + "step": 1037 + }, + { + "epoch": 0.5536, + "grad_norm": 0.4664996080845354, + "learning_rate": 8.759012473867407e-05, + "loss": 0.6848, + "step": 1038 + }, + { + "epoch": 0.5541333333333334, + "grad_norm": 0.3718922182224511, + "learning_rate": 8.741867428021446e-05, + "loss": 0.6321, + "step": 1039 + }, + { + "epoch": 0.5546666666666666, + "grad_norm": 0.4078353276598548, + "learning_rate": 8.724726139150318e-05, + "loss": 0.7739, + "step": 1040 + }, + { + "epoch": 0.5552, + "grad_norm": 0.3824518450621971, + "learning_rate": 8.707588658440511e-05, + "loss": 0.6549, + "step": 1041 + }, + { + "epoch": 0.5557333333333333, + "grad_norm": 0.38449336937285716, + "learning_rate": 8.690455037067141e-05, + "loss": 0.6315, + "step": 1042 + }, + { + "epoch": 0.5562666666666667, + "grad_norm": 0.44187691432038917, + "learning_rate": 8.673325326193806e-05, + "loss": 0.6751, + "step": 1043 + }, + { + "epoch": 0.5568, + "grad_norm": 0.43181725311216423, + "learning_rate": 8.656199576972423e-05, + "loss": 0.6939, + "step": 1044 + }, + { + "epoch": 0.5573333333333333, + "grad_norm": 0.38920706971716057, + "learning_rate": 8.639077840543077e-05, + "loss": 0.6679, + "step": 1045 + }, + { + "epoch": 0.5578666666666666, + "grad_norm": 0.45150061599409774, + "learning_rate": 8.621960168033867e-05, + "loss": 0.6919, + "step": 1046 + }, + { + "epoch": 0.5584, + "grad_norm": 0.3585658057427239, + "learning_rate": 8.604846610560771e-05, + "loss": 0.6491, + "step": 1047 + }, + { + "epoch": 0.5589333333333333, + "grad_norm": 0.4336551483648102, + "learning_rate": 8.587737219227462e-05, + "loss": 0.6221, + "step": 1048 + }, + { + "epoch": 0.5594666666666667, + "grad_norm": 0.415951014350006, + "learning_rate": 8.570632045125185e-05, + "loss": 0.6321, + "step": 1049 + }, + { + "epoch": 0.56, + "grad_norm": 0.422624518486723, + "learning_rate": 8.553531139332582e-05, + "loss": 0.6913, + "step": 1050 + }, + { + "epoch": 0.5605333333333333, + "grad_norm": 0.5028905039257692, + "learning_rate": 8.536434552915556e-05, + "loss": 0.7145, + "step": 1051 + }, + { + "epoch": 0.5610666666666667, + "grad_norm": 0.38879350823351877, + "learning_rate": 8.519342336927105e-05, + "loss": 0.6286, + "step": 1052 + }, + { + "epoch": 0.5616, + "grad_norm": 0.4064224765126606, + "learning_rate": 8.502254542407186e-05, + "loss": 0.6805, + "step": 1053 + }, + { + "epoch": 0.5621333333333334, + "grad_norm": 0.44986365222772284, + "learning_rate": 8.485171220382545e-05, + "loss": 0.6571, + "step": 1054 + }, + { + "epoch": 0.5626666666666666, + "grad_norm": 0.44248101210506857, + "learning_rate": 8.468092421866573e-05, + "loss": 0.6665, + "step": 1055 + }, + { + "epoch": 0.5632, + "grad_norm": 0.44571400690033497, + "learning_rate": 8.451018197859153e-05, + "loss": 0.7224, + "step": 1056 + }, + { + "epoch": 0.5637333333333333, + "grad_norm": 0.40878613792979684, + "learning_rate": 8.433948599346516e-05, + "loss": 0.6553, + "step": 1057 + }, + { + "epoch": 0.5642666666666667, + "grad_norm": 0.3776861200286177, + "learning_rate": 8.416883677301069e-05, + "loss": 0.6324, + "step": 1058 + }, + { + "epoch": 0.5648, + "grad_norm": 0.5449941507254645, + "learning_rate": 8.399823482681262e-05, + "loss": 0.6746, + "step": 1059 + }, + { + "epoch": 0.5653333333333334, + "grad_norm": 0.42283035357198445, + "learning_rate": 8.382768066431425e-05, + "loss": 0.7079, + "step": 1060 + }, + { + "epoch": 0.5658666666666666, + "grad_norm": 0.39700699285483754, + "learning_rate": 8.36571747948162e-05, + "loss": 0.7007, + "step": 1061 + }, + { + "epoch": 0.5664, + "grad_norm": 0.43262492021103666, + "learning_rate": 8.348671772747487e-05, + "loss": 0.6409, + "step": 1062 + }, + { + "epoch": 0.5669333333333333, + "grad_norm": 0.4472394720336021, + "learning_rate": 8.33163099713009e-05, + "loss": 0.7294, + "step": 1063 + }, + { + "epoch": 0.5674666666666667, + "grad_norm": 0.35841758822880526, + "learning_rate": 8.31459520351578e-05, + "loss": 0.5765, + "step": 1064 + }, + { + "epoch": 0.568, + "grad_norm": 0.47823220967303554, + "learning_rate": 8.297564442776014e-05, + "loss": 0.6573, + "step": 1065 + }, + { + "epoch": 0.5685333333333333, + "grad_norm": 0.40298371612589157, + "learning_rate": 8.280538765767235e-05, + "loss": 0.6166, + "step": 1066 + }, + { + "epoch": 0.5690666666666667, + "grad_norm": 0.44584424650919, + "learning_rate": 8.263518223330697e-05, + "loss": 0.629, + "step": 1067 + }, + { + "epoch": 0.5696, + "grad_norm": 0.4208691909512677, + "learning_rate": 8.246502866292324e-05, + "loss": 0.7249, + "step": 1068 + }, + { + "epoch": 0.5701333333333334, + "grad_norm": 0.3491792959128797, + "learning_rate": 8.22949274546255e-05, + "loss": 0.6691, + "step": 1069 + }, + { + "epoch": 0.5706666666666667, + "grad_norm": 0.36491087398456773, + "learning_rate": 8.212487911636184e-05, + "loss": 0.5796, + "step": 1070 + }, + { + "epoch": 0.5712, + "grad_norm": 0.40813117896238155, + "learning_rate": 8.195488415592238e-05, + "loss": 0.6867, + "step": 1071 + }, + { + "epoch": 0.5717333333333333, + "grad_norm": 0.44940589005817705, + "learning_rate": 8.178494308093789e-05, + "loss": 0.6958, + "step": 1072 + }, + { + "epoch": 0.5722666666666667, + "grad_norm": 0.4144835924770522, + "learning_rate": 8.161505639887817e-05, + "loss": 0.6491, + "step": 1073 + }, + { + "epoch": 0.5728, + "grad_norm": 0.39707376034497954, + "learning_rate": 8.144522461705067e-05, + "loss": 0.5771, + "step": 1074 + }, + { + "epoch": 0.5733333333333334, + "grad_norm": 0.42123790275817014, + "learning_rate": 8.127544824259889e-05, + "loss": 0.689, + "step": 1075 + }, + { + "epoch": 0.5738666666666666, + "grad_norm": 0.37312709892533785, + "learning_rate": 8.110572778250085e-05, + "loss": 0.6464, + "step": 1076 + }, + { + "epoch": 0.5744, + "grad_norm": 0.45093709079037364, + "learning_rate": 8.093606374356759e-05, + "loss": 0.6174, + "step": 1077 + }, + { + "epoch": 0.5749333333333333, + "grad_norm": 0.4031053081919436, + "learning_rate": 8.076645663244168e-05, + "loss": 0.6632, + "step": 1078 + }, + { + "epoch": 0.5754666666666667, + "grad_norm": 0.42902271009367576, + "learning_rate": 8.059690695559568e-05, + "loss": 0.6407, + "step": 1079 + }, + { + "epoch": 0.576, + "grad_norm": 0.43050650172847615, + "learning_rate": 8.042741521933071e-05, + "loss": 0.6925, + "step": 1080 + }, + { + "epoch": 0.5765333333333333, + "grad_norm": 0.42447102508079837, + "learning_rate": 8.025798192977481e-05, + "loss": 0.6767, + "step": 1081 + }, + { + "epoch": 0.5770666666666666, + "grad_norm": 0.4209436167654291, + "learning_rate": 8.008860759288147e-05, + "loss": 0.7065, + "step": 1082 + }, + { + "epoch": 0.5776, + "grad_norm": 0.3854169051220057, + "learning_rate": 7.991929271442817e-05, + "loss": 0.7035, + "step": 1083 + }, + { + "epoch": 0.5781333333333334, + "grad_norm": 0.36149464508948714, + "learning_rate": 7.975003780001485e-05, + "loss": 0.6117, + "step": 1084 + }, + { + "epoch": 0.5786666666666667, + "grad_norm": 0.5357696910183349, + "learning_rate": 7.958084335506239e-05, + "loss": 0.7273, + "step": 1085 + }, + { + "epoch": 0.5792, + "grad_norm": 0.42903143003003225, + "learning_rate": 7.941170988481108e-05, + "loss": 0.6957, + "step": 1086 + }, + { + "epoch": 0.5797333333333333, + "grad_norm": 0.3888063195583562, + "learning_rate": 7.924263789431912e-05, + "loss": 0.6655, + "step": 1087 + }, + { + "epoch": 0.5802666666666667, + "grad_norm": 0.3395464244689389, + "learning_rate": 7.907362788846116e-05, + "loss": 0.5678, + "step": 1088 + }, + { + "epoch": 0.5808, + "grad_norm": 0.36372538496696344, + "learning_rate": 7.89046803719267e-05, + "loss": 0.5905, + "step": 1089 + }, + { + "epoch": 0.5813333333333334, + "grad_norm": 0.4435374035943677, + "learning_rate": 7.873579584921869e-05, + "loss": 0.7141, + "step": 1090 + }, + { + "epoch": 0.5818666666666666, + "grad_norm": 0.3980093384067443, + "learning_rate": 7.856697482465196e-05, + "loss": 0.6604, + "step": 1091 + }, + { + "epoch": 0.5824, + "grad_norm": 0.33984862224305995, + "learning_rate": 7.839821780235168e-05, + "loss": 0.6245, + "step": 1092 + }, + { + "epoch": 0.5829333333333333, + "grad_norm": 0.38295421732248536, + "learning_rate": 7.822952528625191e-05, + "loss": 0.7128, + "step": 1093 + }, + { + "epoch": 0.5834666666666667, + "grad_norm": 0.4491590184147579, + "learning_rate": 7.806089778009421e-05, + "loss": 0.6315, + "step": 1094 + }, + { + "epoch": 0.584, + "grad_norm": 0.3767130186401547, + "learning_rate": 7.789233578742582e-05, + "loss": 0.572, + "step": 1095 + }, + { + "epoch": 0.5845333333333333, + "grad_norm": 0.3972548195229604, + "learning_rate": 7.772383981159849e-05, + "loss": 0.6308, + "step": 1096 + }, + { + "epoch": 0.5850666666666666, + "grad_norm": 0.44055381077950156, + "learning_rate": 7.755541035576677e-05, + "loss": 0.6866, + "step": 1097 + }, + { + "epoch": 0.5856, + "grad_norm": 0.4104075619845114, + "learning_rate": 7.738704792288655e-05, + "loss": 0.6666, + "step": 1098 + }, + { + "epoch": 0.5861333333333333, + "grad_norm": 0.3907331405509662, + "learning_rate": 7.721875301571359e-05, + "loss": 0.6599, + "step": 1099 + }, + { + "epoch": 0.5866666666666667, + "grad_norm": 0.3847663055317739, + "learning_rate": 7.705052613680211e-05, + "loss": 0.6741, + "step": 1100 + }, + { + "epoch": 0.5872, + "grad_norm": 0.4628313913008596, + "learning_rate": 7.688236778850306e-05, + "loss": 0.6593, + "step": 1101 + }, + { + "epoch": 0.5877333333333333, + "grad_norm": 0.4647925650314762, + "learning_rate": 7.671427847296275e-05, + "loss": 0.7455, + "step": 1102 + }, + { + "epoch": 0.5882666666666667, + "grad_norm": 0.41261519664681573, + "learning_rate": 7.654625869212146e-05, + "loss": 0.6954, + "step": 1103 + }, + { + "epoch": 0.5888, + "grad_norm": 0.45444836790689513, + "learning_rate": 7.637830894771175e-05, + "loss": 0.6606, + "step": 1104 + }, + { + "epoch": 0.5893333333333334, + "grad_norm": 0.5358891340024584, + "learning_rate": 7.6210429741257e-05, + "loss": 0.7436, + "step": 1105 + }, + { + "epoch": 0.5898666666666667, + "grad_norm": 0.45521731751388783, + "learning_rate": 7.604262157407007e-05, + "loss": 0.6796, + "step": 1106 + }, + { + "epoch": 0.5904, + "grad_norm": 0.394011079694462, + "learning_rate": 7.587488494725157e-05, + "loss": 0.6235, + "step": 1107 + }, + { + "epoch": 0.5909333333333333, + "grad_norm": 0.5079189076277912, + "learning_rate": 7.570722036168854e-05, + "loss": 0.7749, + "step": 1108 + }, + { + "epoch": 0.5914666666666667, + "grad_norm": 0.3928134231684419, + "learning_rate": 7.55396283180529e-05, + "loss": 0.6817, + "step": 1109 + }, + { + "epoch": 0.592, + "grad_norm": 0.5111921296335108, + "learning_rate": 7.537210931679987e-05, + "loss": 0.6678, + "step": 1110 + }, + { + "epoch": 0.5925333333333334, + "grad_norm": 0.4081978794658278, + "learning_rate": 7.520466385816671e-05, + "loss": 0.6628, + "step": 1111 + }, + { + "epoch": 0.5930666666666666, + "grad_norm": 0.4053211813220869, + "learning_rate": 7.503729244217086e-05, + "loss": 0.6032, + "step": 1112 + }, + { + "epoch": 0.5936, + "grad_norm": 0.42983168496764484, + "learning_rate": 7.48699955686089e-05, + "loss": 0.6636, + "step": 1113 + }, + { + "epoch": 0.5941333333333333, + "grad_norm": 0.4644008516765664, + "learning_rate": 7.470277373705461e-05, + "loss": 0.6703, + "step": 1114 + }, + { + "epoch": 0.5946666666666667, + "grad_norm": 0.4574504774505039, + "learning_rate": 7.453562744685778e-05, + "loss": 0.7518, + "step": 1115 + }, + { + "epoch": 0.5952, + "grad_norm": 0.4772708644960783, + "learning_rate": 7.43685571971426e-05, + "loss": 0.6889, + "step": 1116 + }, + { + "epoch": 0.5957333333333333, + "grad_norm": 0.4435426325155918, + "learning_rate": 7.42015634868062e-05, + "loss": 0.7131, + "step": 1117 + }, + { + "epoch": 0.5962666666666666, + "grad_norm": 0.4366700926912915, + "learning_rate": 7.403464681451715e-05, + "loss": 0.6809, + "step": 1118 + }, + { + "epoch": 0.5968, + "grad_norm": 0.3963571015044681, + "learning_rate": 7.386780767871397e-05, + "loss": 0.6618, + "step": 1119 + }, + { + "epoch": 0.5973333333333334, + "grad_norm": 0.48419721275297006, + "learning_rate": 7.370104657760361e-05, + "loss": 0.6748, + "step": 1120 + }, + { + "epoch": 0.5978666666666667, + "grad_norm": 0.38905688201468447, + "learning_rate": 7.353436400916004e-05, + "loss": 0.6205, + "step": 1121 + }, + { + "epoch": 0.5984, + "grad_norm": 0.433394986559628, + "learning_rate": 7.336776047112276e-05, + "loss": 0.678, + "step": 1122 + }, + { + "epoch": 0.5989333333333333, + "grad_norm": 0.496769545403158, + "learning_rate": 7.320123646099519e-05, + "loss": 0.7682, + "step": 1123 + }, + { + "epoch": 0.5994666666666667, + "grad_norm": 0.36142198179226703, + "learning_rate": 7.303479247604332e-05, + "loss": 0.6383, + "step": 1124 + }, + { + "epoch": 0.6, + "grad_norm": 0.4636565826974786, + "learning_rate": 7.286842901329412e-05, + "loss": 0.6458, + "step": 1125 + }, + { + "epoch": 0.6005333333333334, + "grad_norm": 0.417421627598741, + "learning_rate": 7.270214656953415e-05, + "loss": 0.6481, + "step": 1126 + }, + { + "epoch": 0.6010666666666666, + "grad_norm": 0.4472370301922988, + "learning_rate": 7.253594564130804e-05, + "loss": 0.6952, + "step": 1127 + }, + { + "epoch": 0.6016, + "grad_norm": 0.4088784521699277, + "learning_rate": 7.236982672491698e-05, + "loss": 0.6465, + "step": 1128 + }, + { + "epoch": 0.6021333333333333, + "grad_norm": 0.46647558279832896, + "learning_rate": 7.22037903164173e-05, + "loss": 0.7393, + "step": 1129 + }, + { + "epoch": 0.6026666666666667, + "grad_norm": 0.4053385750067436, + "learning_rate": 7.203783691161883e-05, + "loss": 0.6574, + "step": 1130 + }, + { + "epoch": 0.6032, + "grad_norm": 0.42757964213529087, + "learning_rate": 7.187196700608373e-05, + "loss": 0.6633, + "step": 1131 + }, + { + "epoch": 0.6037333333333333, + "grad_norm": 0.4470110173526119, + "learning_rate": 7.170618109512465e-05, + "loss": 0.6793, + "step": 1132 + }, + { + "epoch": 0.6042666666666666, + "grad_norm": 0.4193340190231634, + "learning_rate": 7.154047967380354e-05, + "loss": 0.7022, + "step": 1133 + }, + { + "epoch": 0.6048, + "grad_norm": 0.46156971270395714, + "learning_rate": 7.137486323692995e-05, + "loss": 0.6946, + "step": 1134 + }, + { + "epoch": 0.6053333333333333, + "grad_norm": 0.6119050871362105, + "learning_rate": 7.12093322790597e-05, + "loss": 0.713, + "step": 1135 + }, + { + "epoch": 0.6058666666666667, + "grad_norm": 0.3992646868223229, + "learning_rate": 7.104388729449338e-05, + "loss": 0.6214, + "step": 1136 + }, + { + "epoch": 0.6064, + "grad_norm": 0.5217558447619373, + "learning_rate": 7.087852877727481e-05, + "loss": 0.7425, + "step": 1137 + }, + { + "epoch": 0.6069333333333333, + "grad_norm": 0.41823835650592406, + "learning_rate": 7.071325722118963e-05, + "loss": 0.6679, + "step": 1138 + }, + { + "epoch": 0.6074666666666667, + "grad_norm": 0.46847038553389453, + "learning_rate": 7.054807311976379e-05, + "loss": 0.7256, + "step": 1139 + }, + { + "epoch": 0.608, + "grad_norm": 0.4600176726426822, + "learning_rate": 7.038297696626206e-05, + "loss": 0.7218, + "step": 1140 + }, + { + "epoch": 0.6085333333333334, + "grad_norm": 0.5067414577175797, + "learning_rate": 7.021796925368667e-05, + "loss": 0.7144, + "step": 1141 + }, + { + "epoch": 0.6090666666666666, + "grad_norm": 0.4557479530568632, + "learning_rate": 7.005305047477566e-05, + "loss": 0.6882, + "step": 1142 + }, + { + "epoch": 0.6096, + "grad_norm": 0.3920277763332182, + "learning_rate": 6.988822112200156e-05, + "loss": 0.6492, + "step": 1143 + }, + { + "epoch": 0.6101333333333333, + "grad_norm": 0.4845316811457574, + "learning_rate": 6.972348168756983e-05, + "loss": 0.637, + "step": 1144 + }, + { + "epoch": 0.6106666666666667, + "grad_norm": 0.4421634183836985, + "learning_rate": 6.955883266341741e-05, + "loss": 0.6482, + "step": 1145 + }, + { + "epoch": 0.6112, + "grad_norm": 0.5306364811458931, + "learning_rate": 6.939427454121128e-05, + "loss": 0.6295, + "step": 1146 + }, + { + "epoch": 0.6117333333333334, + "grad_norm": 0.6189704780148215, + "learning_rate": 6.922980781234699e-05, + "loss": 0.7222, + "step": 1147 + }, + { + "epoch": 0.6122666666666666, + "grad_norm": 0.438502098448414, + "learning_rate": 6.906543296794714e-05, + "loss": 0.6524, + "step": 1148 + }, + { + "epoch": 0.6128, + "grad_norm": 0.4339981933957319, + "learning_rate": 6.890115049885994e-05, + "loss": 0.6601, + "step": 1149 + }, + { + "epoch": 0.6133333333333333, + "grad_norm": 0.38820940513874697, + "learning_rate": 6.873696089565786e-05, + "loss": 0.6566, + "step": 1150 + }, + { + "epoch": 0.6138666666666667, + "grad_norm": 0.4463376889999683, + "learning_rate": 6.85728646486359e-05, + "loss": 0.6638, + "step": 1151 + }, + { + "epoch": 0.6144, + "grad_norm": 0.41195369045521074, + "learning_rate": 6.84088622478104e-05, + "loss": 0.6777, + "step": 1152 + }, + { + "epoch": 0.6149333333333333, + "grad_norm": 0.3841239347667893, + "learning_rate": 6.82449541829174e-05, + "loss": 0.6372, + "step": 1153 + }, + { + "epoch": 0.6154666666666667, + "grad_norm": 0.3868369894075657, + "learning_rate": 6.80811409434113e-05, + "loss": 0.6393, + "step": 1154 + }, + { + "epoch": 0.616, + "grad_norm": 0.5146796448296774, + "learning_rate": 6.791742301846326e-05, + "loss": 0.7299, + "step": 1155 + }, + { + "epoch": 0.6165333333333334, + "grad_norm": 0.43790325034252436, + "learning_rate": 6.775380089695986e-05, + "loss": 0.7157, + "step": 1156 + }, + { + "epoch": 0.6170666666666667, + "grad_norm": 0.4398587464943831, + "learning_rate": 6.759027506750158e-05, + "loss": 0.6846, + "step": 1157 + }, + { + "epoch": 0.6176, + "grad_norm": 0.3595461758946452, + "learning_rate": 6.742684601840141e-05, + "loss": 0.5901, + "step": 1158 + }, + { + "epoch": 0.6181333333333333, + "grad_norm": 0.38109497064771947, + "learning_rate": 6.726351423768322e-05, + "loss": 0.683, + "step": 1159 + }, + { + "epoch": 0.6186666666666667, + "grad_norm": 0.36004089054337374, + "learning_rate": 6.710028021308061e-05, + "loss": 0.6178, + "step": 1160 + }, + { + "epoch": 0.6192, + "grad_norm": 0.42637815638404397, + "learning_rate": 6.693714443203507e-05, + "loss": 0.7173, + "step": 1161 + }, + { + "epoch": 0.6197333333333334, + "grad_norm": 0.42425891002201527, + "learning_rate": 6.677410738169485e-05, + "loss": 0.6631, + "step": 1162 + }, + { + "epoch": 0.6202666666666666, + "grad_norm": 0.450524267432063, + "learning_rate": 6.661116954891328e-05, + "loss": 0.6258, + "step": 1163 + }, + { + "epoch": 0.6208, + "grad_norm": 0.48785012034753317, + "learning_rate": 6.644833142024751e-05, + "loss": 0.7325, + "step": 1164 + }, + { + "epoch": 0.6213333333333333, + "grad_norm": 0.3811145492840182, + "learning_rate": 6.62855934819569e-05, + "loss": 0.6453, + "step": 1165 + }, + { + "epoch": 0.6218666666666667, + "grad_norm": 0.37249313313450727, + "learning_rate": 6.612295622000162e-05, + "loss": 0.6186, + "step": 1166 + }, + { + "epoch": 0.6224, + "grad_norm": 0.4432375348141401, + "learning_rate": 6.59604201200412e-05, + "loss": 0.7176, + "step": 1167 + }, + { + "epoch": 0.6229333333333333, + "grad_norm": 0.4919086266265921, + "learning_rate": 6.579798566743314e-05, + "loss": 0.7397, + "step": 1168 + }, + { + "epoch": 0.6234666666666666, + "grad_norm": 0.40437823487566327, + "learning_rate": 6.563565334723134e-05, + "loss": 0.5864, + "step": 1169 + }, + { + "epoch": 0.624, + "grad_norm": 0.4684502401638013, + "learning_rate": 6.547342364418481e-05, + "loss": 0.6577, + "step": 1170 + }, + { + "epoch": 0.6245333333333334, + "grad_norm": 0.36718558673611307, + "learning_rate": 6.531129704273604e-05, + "loss": 0.6323, + "step": 1171 + }, + { + "epoch": 0.6250666666666667, + "grad_norm": 0.3646893225239305, + "learning_rate": 6.514927402701964e-05, + "loss": 0.6039, + "step": 1172 + }, + { + "epoch": 0.6256, + "grad_norm": 0.4330441706528994, + "learning_rate": 6.498735508086093e-05, + "loss": 0.6565, + "step": 1173 + }, + { + "epoch": 0.6261333333333333, + "grad_norm": 0.4141825846758376, + "learning_rate": 6.48255406877745e-05, + "loss": 0.6368, + "step": 1174 + }, + { + "epoch": 0.6266666666666667, + "grad_norm": 0.4131440350198122, + "learning_rate": 6.466383133096267e-05, + "loss": 0.6301, + "step": 1175 + }, + { + "epoch": 0.6272, + "grad_norm": 0.34632135053706137, + "learning_rate": 6.450222749331414e-05, + "loss": 0.5804, + "step": 1176 + }, + { + "epoch": 0.6277333333333334, + "grad_norm": 0.3492834275916797, + "learning_rate": 6.434072965740242e-05, + "loss": 0.5651, + "step": 1177 + }, + { + "epoch": 0.6282666666666666, + "grad_norm": 0.454126021627074, + "learning_rate": 6.417933830548467e-05, + "loss": 0.7312, + "step": 1178 + }, + { + "epoch": 0.6288, + "grad_norm": 0.43030683923975094, + "learning_rate": 6.40180539194999e-05, + "loss": 0.6441, + "step": 1179 + }, + { + "epoch": 0.6293333333333333, + "grad_norm": 0.4579726985783972, + "learning_rate": 6.385687698106781e-05, + "loss": 0.6526, + "step": 1180 + }, + { + "epoch": 0.6298666666666667, + "grad_norm": 0.4667751571777085, + "learning_rate": 6.369580797148718e-05, + "loss": 0.7083, + "step": 1181 + }, + { + "epoch": 0.6304, + "grad_norm": 0.3716727407951127, + "learning_rate": 6.35348473717345e-05, + "loss": 0.6583, + "step": 1182 + }, + { + "epoch": 0.6309333333333333, + "grad_norm": 0.4886366499037081, + "learning_rate": 6.337399566246257e-05, + "loss": 0.6991, + "step": 1183 + }, + { + "epoch": 0.6314666666666666, + "grad_norm": 0.4810398000322877, + "learning_rate": 6.321325332399903e-05, + "loss": 0.6088, + "step": 1184 + }, + { + "epoch": 0.632, + "grad_norm": 0.40501659908197385, + "learning_rate": 6.305262083634488e-05, + "loss": 0.6343, + "step": 1185 + }, + { + "epoch": 0.6325333333333333, + "grad_norm": 0.3872785713763552, + "learning_rate": 6.289209867917312e-05, + "loss": 0.5938, + "step": 1186 + }, + { + "epoch": 0.6330666666666667, + "grad_norm": 0.4262127961256045, + "learning_rate": 6.273168733182722e-05, + "loss": 0.6683, + "step": 1187 + }, + { + "epoch": 0.6336, + "grad_norm": 0.36594726725387117, + "learning_rate": 6.25713872733199e-05, + "loss": 0.6589, + "step": 1188 + }, + { + "epoch": 0.6341333333333333, + "grad_norm": 0.5296772394839944, + "learning_rate": 6.241119898233144e-05, + "loss": 0.7472, + "step": 1189 + }, + { + "epoch": 0.6346666666666667, + "grad_norm": 0.40591808570497256, + "learning_rate": 6.225112293720836e-05, + "loss": 0.6199, + "step": 1190 + }, + { + "epoch": 0.6352, + "grad_norm": 0.33499027276495974, + "learning_rate": 6.209115961596208e-05, + "loss": 0.6213, + "step": 1191 + }, + { + "epoch": 0.6357333333333334, + "grad_norm": 0.4490471112002442, + "learning_rate": 6.19313094962673e-05, + "loss": 0.745, + "step": 1192 + }, + { + "epoch": 0.6362666666666666, + "grad_norm": 0.4004004209960038, + "learning_rate": 6.177157305546078e-05, + "loss": 0.6736, + "step": 1193 + }, + { + "epoch": 0.6368, + "grad_norm": 0.4081418069053937, + "learning_rate": 6.161195077053976e-05, + "loss": 0.6877, + "step": 1194 + }, + { + "epoch": 0.6373333333333333, + "grad_norm": 0.5510970610876371, + "learning_rate": 6.145244311816063e-05, + "loss": 0.6336, + "step": 1195 + }, + { + "epoch": 0.6378666666666667, + "grad_norm": 0.577670413375373, + "learning_rate": 6.129305057463741e-05, + "loss": 0.6311, + "step": 1196 + }, + { + "epoch": 0.6384, + "grad_norm": 0.3905336200852035, + "learning_rate": 6.113377361594049e-05, + "loss": 0.6562, + "step": 1197 + }, + { + "epoch": 0.6389333333333334, + "grad_norm": 0.46209659837967093, + "learning_rate": 6.0974612717695004e-05, + "loss": 0.6951, + "step": 1198 + }, + { + "epoch": 0.6394666666666666, + "grad_norm": 0.36284719198992044, + "learning_rate": 6.0815568355179556e-05, + "loss": 0.65, + "step": 1199 + }, + { + "epoch": 0.64, + "grad_norm": 0.3963866904526784, + "learning_rate": 6.065664100332478e-05, + "loss": 0.6036, + "step": 1200 + }, + { + "epoch": 0.6405333333333333, + "grad_norm": 0.42869465690151415, + "learning_rate": 6.0497831136711836e-05, + "loss": 0.6931, + "step": 1201 + }, + { + "epoch": 0.6410666666666667, + "grad_norm": 0.4030905528391357, + "learning_rate": 6.0339139229571116e-05, + "loss": 0.6693, + "step": 1202 + }, + { + "epoch": 0.6416, + "grad_norm": 0.4299121498011599, + "learning_rate": 6.018056575578075e-05, + "loss": 0.6789, + "step": 1203 + }, + { + "epoch": 0.6421333333333333, + "grad_norm": 0.3869971495167029, + "learning_rate": 6.002211118886514e-05, + "loss": 0.6083, + "step": 1204 + }, + { + "epoch": 0.6426666666666667, + "grad_norm": 0.35145059252234884, + "learning_rate": 5.986377600199371e-05, + "loss": 0.6021, + "step": 1205 + }, + { + "epoch": 0.6432, + "grad_norm": 0.431472130038551, + "learning_rate": 5.970556066797941e-05, + "loss": 0.6127, + "step": 1206 + }, + { + "epoch": 0.6437333333333334, + "grad_norm": 0.5211453824777301, + "learning_rate": 5.9547465659277215e-05, + "loss": 0.8389, + "step": 1207 + }, + { + "epoch": 0.6442666666666667, + "grad_norm": 0.43216368219278184, + "learning_rate": 5.938949144798279e-05, + "loss": 0.664, + "step": 1208 + }, + { + "epoch": 0.6448, + "grad_norm": 0.3652179821869812, + "learning_rate": 5.923163850583113e-05, + "loss": 0.6094, + "step": 1209 + }, + { + "epoch": 0.6453333333333333, + "grad_norm": 0.4280942594177932, + "learning_rate": 5.907390730419507e-05, + "loss": 0.6872, + "step": 1210 + }, + { + "epoch": 0.6458666666666667, + "grad_norm": 0.39141921847889477, + "learning_rate": 5.8916298314083915e-05, + "loss": 0.5987, + "step": 1211 + }, + { + "epoch": 0.6464, + "grad_norm": 0.4557631876206928, + "learning_rate": 5.875881200614207e-05, + "loss": 0.6478, + "step": 1212 + }, + { + "epoch": 0.6469333333333334, + "grad_norm": 0.39869783455438407, + "learning_rate": 5.860144885064751e-05, + "loss": 0.6588, + "step": 1213 + }, + { + "epoch": 0.6474666666666666, + "grad_norm": 0.4479245163416833, + "learning_rate": 5.8444209317510514e-05, + "loss": 0.6612, + "step": 1214 + }, + { + "epoch": 0.648, + "grad_norm": 0.46517723517096476, + "learning_rate": 5.828709387627218e-05, + "loss": 0.6641, + "step": 1215 + }, + { + "epoch": 0.6485333333333333, + "grad_norm": 0.38834700022906427, + "learning_rate": 5.813010299610313e-05, + "loss": 0.6497, + "step": 1216 + }, + { + "epoch": 0.6490666666666667, + "grad_norm": 0.45946022382496476, + "learning_rate": 5.797323714580192e-05, + "loss": 0.6648, + "step": 1217 + }, + { + "epoch": 0.6496, + "grad_norm": 0.42073745894658116, + "learning_rate": 5.781649679379378e-05, + "loss": 0.643, + "step": 1218 + }, + { + "epoch": 0.6501333333333333, + "grad_norm": 0.466784594465401, + "learning_rate": 5.765988240812921e-05, + "loss": 0.6769, + "step": 1219 + }, + { + "epoch": 0.6506666666666666, + "grad_norm": 0.42517084430908664, + "learning_rate": 5.750339445648252e-05, + "loss": 0.6687, + "step": 1220 + }, + { + "epoch": 0.6512, + "grad_norm": 0.38895462758587773, + "learning_rate": 5.73470334061505e-05, + "loss": 0.6537, + "step": 1221 + }, + { + "epoch": 0.6517333333333334, + "grad_norm": 0.48604488584972266, + "learning_rate": 5.7190799724050924e-05, + "loss": 0.5972, + "step": 1222 + }, + { + "epoch": 0.6522666666666667, + "grad_norm": 0.46578198589028647, + "learning_rate": 5.7034693876721376e-05, + "loss": 0.6648, + "step": 1223 + }, + { + "epoch": 0.6528, + "grad_norm": 0.3968236236964227, + "learning_rate": 5.687871633031754e-05, + "loss": 0.6607, + "step": 1224 + }, + { + "epoch": 0.6533333333333333, + "grad_norm": 0.4798199203451227, + "learning_rate": 5.6722867550612116e-05, + "loss": 0.7428, + "step": 1225 + }, + { + "epoch": 0.6538666666666667, + "grad_norm": 0.4531935558414575, + "learning_rate": 5.6567148002993164e-05, + "loss": 0.6634, + "step": 1226 + }, + { + "epoch": 0.6544, + "grad_norm": 0.34558900265033327, + "learning_rate": 5.6411558152462894e-05, + "loss": 0.632, + "step": 1227 + }, + { + "epoch": 0.6549333333333334, + "grad_norm": 0.43787546322679616, + "learning_rate": 5.625609846363622e-05, + "loss": 0.656, + "step": 1228 + }, + { + "epoch": 0.6554666666666666, + "grad_norm": 0.43250325432433856, + "learning_rate": 5.6100769400739383e-05, + "loss": 0.6779, + "step": 1229 + }, + { + "epoch": 0.656, + "grad_norm": 0.38833312220396976, + "learning_rate": 5.5945571427608526e-05, + "loss": 0.6292, + "step": 1230 + }, + { + "epoch": 0.6565333333333333, + "grad_norm": 0.44540024272174433, + "learning_rate": 5.579050500768836e-05, + "loss": 0.608, + "step": 1231 + }, + { + "epoch": 0.6570666666666667, + "grad_norm": 0.3579828149017168, + "learning_rate": 5.5635570604030705e-05, + "loss": 0.6255, + "step": 1232 + }, + { + "epoch": 0.6576, + "grad_norm": 0.4053175333424152, + "learning_rate": 5.54807686792933e-05, + "loss": 0.6258, + "step": 1233 + }, + { + "epoch": 0.6581333333333333, + "grad_norm": 0.35858098645794223, + "learning_rate": 5.53260996957381e-05, + "loss": 0.6704, + "step": 1234 + }, + { + "epoch": 0.6586666666666666, + "grad_norm": 0.3987451827915316, + "learning_rate": 5.5171564115230254e-05, + "loss": 0.6937, + "step": 1235 + }, + { + "epoch": 0.6592, + "grad_norm": 0.42997637983274856, + "learning_rate": 5.501716239923642e-05, + "loss": 0.6585, + "step": 1236 + }, + { + "epoch": 0.6597333333333333, + "grad_norm": 0.3932526654130728, + "learning_rate": 5.486289500882355e-05, + "loss": 0.6462, + "step": 1237 + }, + { + "epoch": 0.6602666666666667, + "grad_norm": 0.3870233656284184, + "learning_rate": 5.47087624046575e-05, + "loss": 0.6615, + "step": 1238 + }, + { + "epoch": 0.6608, + "grad_norm": 0.49712452919958033, + "learning_rate": 5.4554765047001613e-05, + "loss": 0.6623, + "step": 1239 + }, + { + "epoch": 0.6613333333333333, + "grad_norm": 0.35625906326919565, + "learning_rate": 5.4400903395715366e-05, + "loss": 0.6282, + "step": 1240 + }, + { + "epoch": 0.6618666666666667, + "grad_norm": 0.42291877206495576, + "learning_rate": 5.424717791025302e-05, + "loss": 0.7098, + "step": 1241 + }, + { + "epoch": 0.6624, + "grad_norm": 0.4253620364990812, + "learning_rate": 5.4093589049662175e-05, + "loss": 0.6386, + "step": 1242 + }, + { + "epoch": 0.6629333333333334, + "grad_norm": 0.3912739689870102, + "learning_rate": 5.394013727258254e-05, + "loss": 0.6086, + "step": 1243 + }, + { + "epoch": 0.6634666666666666, + "grad_norm": 0.41318139248268376, + "learning_rate": 5.378682303724435e-05, + "loss": 0.7141, + "step": 1244 + }, + { + "epoch": 0.664, + "grad_norm": 0.43295533205841463, + "learning_rate": 5.363364680146725e-05, + "loss": 0.7043, + "step": 1245 + }, + { + "epoch": 0.6645333333333333, + "grad_norm": 1.1398184199401138, + "learning_rate": 5.348060902265871e-05, + "loss": 0.7222, + "step": 1246 + }, + { + "epoch": 0.6650666666666667, + "grad_norm": 0.3862703219307575, + "learning_rate": 5.332771015781275e-05, + "loss": 0.6426, + "step": 1247 + }, + { + "epoch": 0.6656, + "grad_norm": 0.4677947520049491, + "learning_rate": 5.31749506635086e-05, + "loss": 0.6662, + "step": 1248 + }, + { + "epoch": 0.6661333333333334, + "grad_norm": 0.401846080147303, + "learning_rate": 5.302233099590928e-05, + "loss": 0.6521, + "step": 1249 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.37286357672527154, + "learning_rate": 5.286985161076029e-05, + "loss": 0.6062, + "step": 1250 + }, + { + "epoch": 0.6672, + "grad_norm": 0.40748923628071837, + "learning_rate": 5.271751296338823e-05, + "loss": 0.6926, + "step": 1251 + }, + { + "epoch": 0.6677333333333333, + "grad_norm": 0.4479423524142015, + "learning_rate": 5.2565315508699376e-05, + "loss": 0.6756, + "step": 1252 + }, + { + "epoch": 0.6682666666666667, + "grad_norm": 0.42234114843813786, + "learning_rate": 5.2413259701178505e-05, + "loss": 0.6476, + "step": 1253 + }, + { + "epoch": 0.6688, + "grad_norm": 0.3896182410264214, + "learning_rate": 5.226134599488728e-05, + "loss": 0.6772, + "step": 1254 + }, + { + "epoch": 0.6693333333333333, + "grad_norm": 0.4411435829201509, + "learning_rate": 5.210957484346314e-05, + "loss": 0.6234, + "step": 1255 + }, + { + "epoch": 0.6698666666666667, + "grad_norm": 0.37933568532674966, + "learning_rate": 5.195794670011776e-05, + "loss": 0.6204, + "step": 1256 + }, + { + "epoch": 0.6704, + "grad_norm": 0.46265736442828825, + "learning_rate": 5.180646201763577e-05, + "loss": 0.6668, + "step": 1257 + }, + { + "epoch": 0.6709333333333334, + "grad_norm": 0.4393018454365821, + "learning_rate": 5.165512124837344e-05, + "loss": 0.6645, + "step": 1258 + }, + { + "epoch": 0.6714666666666667, + "grad_norm": 0.5177754712691343, + "learning_rate": 5.150392484425728e-05, + "loss": 0.6997, + "step": 1259 + }, + { + "epoch": 0.672, + "grad_norm": 0.42607221478118706, + "learning_rate": 5.135287325678271e-05, + "loss": 0.666, + "step": 1260 + }, + { + "epoch": 0.6725333333333333, + "grad_norm": 0.39882355924879737, + "learning_rate": 5.120196693701267e-05, + "loss": 0.6213, + "step": 1261 + }, + { + "epoch": 0.6730666666666667, + "grad_norm": 0.38805641941283864, + "learning_rate": 5.105120633557634e-05, + "loss": 0.5997, + "step": 1262 + }, + { + "epoch": 0.6736, + "grad_norm": 0.38762433495801163, + "learning_rate": 5.090059190266779e-05, + "loss": 0.6421, + "step": 1263 + }, + { + "epoch": 0.6741333333333334, + "grad_norm": 0.40016323813150256, + "learning_rate": 5.075012408804458e-05, + "loss": 0.6085, + "step": 1264 + }, + { + "epoch": 0.6746666666666666, + "grad_norm": 0.37616401930803023, + "learning_rate": 5.059980334102637e-05, + "loss": 0.6258, + "step": 1265 + }, + { + "epoch": 0.6752, + "grad_norm": 0.41028978668860133, + "learning_rate": 5.0449630110493836e-05, + "loss": 0.7122, + "step": 1266 + }, + { + "epoch": 0.6757333333333333, + "grad_norm": 0.4042626959520759, + "learning_rate": 5.0299604844886985e-05, + "loss": 0.6362, + "step": 1267 + }, + { + "epoch": 0.6762666666666667, + "grad_norm": 0.45220879835180783, + "learning_rate": 5.014972799220403e-05, + "loss": 0.709, + "step": 1268 + }, + { + "epoch": 0.6768, + "grad_norm": 0.4293227756763883, + "learning_rate": 5.000000000000002e-05, + "loss": 0.6058, + "step": 1269 + }, + { + "epoch": 0.6773333333333333, + "grad_norm": 0.39955554118477965, + "learning_rate": 4.985042131538545e-05, + "loss": 0.6197, + "step": 1270 + }, + { + "epoch": 0.6778666666666666, + "grad_norm": 0.42040530700673934, + "learning_rate": 4.9700992385024934e-05, + "loss": 0.6397, + "step": 1271 + }, + { + "epoch": 0.6784, + "grad_norm": 0.4265123061001254, + "learning_rate": 4.955171365513603e-05, + "loss": 0.649, + "step": 1272 + }, + { + "epoch": 0.6789333333333334, + "grad_norm": 0.431492915890074, + "learning_rate": 4.940258557148765e-05, + "loss": 0.6597, + "step": 1273 + }, + { + "epoch": 0.6794666666666667, + "grad_norm": 0.3893191806148624, + "learning_rate": 4.9253608579398855e-05, + "loss": 0.6364, + "step": 1274 + }, + { + "epoch": 0.68, + "grad_norm": 0.4593682382286822, + "learning_rate": 4.9104783123737566e-05, + "loss": 0.6714, + "step": 1275 + }, + { + "epoch": 0.6805333333333333, + "grad_norm": 0.4513437746047561, + "learning_rate": 4.895610964891923e-05, + "loss": 0.679, + "step": 1276 + }, + { + "epoch": 0.6810666666666667, + "grad_norm": 0.37487776255065614, + "learning_rate": 4.880758859890536e-05, + "loss": 0.6301, + "step": 1277 + }, + { + "epoch": 0.6816, + "grad_norm": 0.5125427502539084, + "learning_rate": 4.865922041720239e-05, + "loss": 0.6741, + "step": 1278 + }, + { + "epoch": 0.6821333333333334, + "grad_norm": 0.4875542190779091, + "learning_rate": 4.851100554686021e-05, + "loss": 0.705, + "step": 1279 + }, + { + "epoch": 0.6826666666666666, + "grad_norm": 0.3913131109343328, + "learning_rate": 4.836294443047088e-05, + "loss": 0.647, + "step": 1280 + }, + { + "epoch": 0.6832, + "grad_norm": 0.39692001539739485, + "learning_rate": 4.821503751016746e-05, + "loss": 0.6315, + "step": 1281 + }, + { + "epoch": 0.6837333333333333, + "grad_norm": 0.3661830086625643, + "learning_rate": 4.8067285227622404e-05, + "loss": 0.6102, + "step": 1282 + }, + { + "epoch": 0.6842666666666667, + "grad_norm": 0.40088562778368225, + "learning_rate": 4.791968802404648e-05, + "loss": 0.6512, + "step": 1283 + }, + { + "epoch": 0.6848, + "grad_norm": 0.3968740135921082, + "learning_rate": 4.777224634018732e-05, + "loss": 0.6251, + "step": 1284 + }, + { + "epoch": 0.6853333333333333, + "grad_norm": 0.4561766053407961, + "learning_rate": 4.762496061632814e-05, + "loss": 0.6348, + "step": 1285 + }, + { + "epoch": 0.6858666666666666, + "grad_norm": 0.42854281632100244, + "learning_rate": 4.747783129228656e-05, + "loss": 0.5912, + "step": 1286 + }, + { + "epoch": 0.6864, + "grad_norm": 0.5283300083820703, + "learning_rate": 4.733085880741301e-05, + "loss": 0.731, + "step": 1287 + }, + { + "epoch": 0.6869333333333333, + "grad_norm": 0.3956666776345902, + "learning_rate": 4.718404360058966e-05, + "loss": 0.5672, + "step": 1288 + }, + { + "epoch": 0.6874666666666667, + "grad_norm": 0.3811259172079882, + "learning_rate": 4.7037386110228985e-05, + "loss": 0.6196, + "step": 1289 + }, + { + "epoch": 0.688, + "grad_norm": 0.4769350025419239, + "learning_rate": 4.689088677427249e-05, + "loss": 0.7024, + "step": 1290 + }, + { + "epoch": 0.6885333333333333, + "grad_norm": 0.41630517232447295, + "learning_rate": 4.6744546030189486e-05, + "loss": 0.6686, + "step": 1291 + }, + { + "epoch": 0.6890666666666667, + "grad_norm": 0.5956279258644027, + "learning_rate": 4.659836431497563e-05, + "loss": 0.6151, + "step": 1292 + }, + { + "epoch": 0.6896, + "grad_norm": 0.40433857542379514, + "learning_rate": 4.645234206515171e-05, + "loss": 0.6602, + "step": 1293 + }, + { + "epoch": 0.6901333333333334, + "grad_norm": 0.45198972435300755, + "learning_rate": 4.630647971676232e-05, + "loss": 0.6614, + "step": 1294 + }, + { + "epoch": 0.6906666666666667, + "grad_norm": 0.4242156941762448, + "learning_rate": 4.6160777705374524e-05, + "loss": 0.7417, + "step": 1295 + }, + { + "epoch": 0.6912, + "grad_norm": 0.37644881883817727, + "learning_rate": 4.6015236466076747e-05, + "loss": 0.6413, + "step": 1296 + }, + { + "epoch": 0.6917333333333333, + "grad_norm": 0.3922208031071116, + "learning_rate": 4.586985643347717e-05, + "loss": 0.6515, + "step": 1297 + }, + { + "epoch": 0.6922666666666667, + "grad_norm": 0.48489005861581225, + "learning_rate": 4.572463804170263e-05, + "loss": 0.5778, + "step": 1298 + }, + { + "epoch": 0.6928, + "grad_norm": 0.3956409256924293, + "learning_rate": 4.5579581724397255e-05, + "loss": 0.643, + "step": 1299 + }, + { + "epoch": 0.6933333333333334, + "grad_norm": 0.427651978645937, + "learning_rate": 4.543468791472131e-05, + "loss": 0.6556, + "step": 1300 + }, + { + "epoch": 0.6938666666666666, + "grad_norm": 0.3547420794780449, + "learning_rate": 4.5289957045349653e-05, + "loss": 0.6127, + "step": 1301 + }, + { + "epoch": 0.6944, + "grad_norm": 0.41186232406623996, + "learning_rate": 4.514538954847064e-05, + "loss": 0.6259, + "step": 1302 + }, + { + "epoch": 0.6949333333333333, + "grad_norm": 0.4607083618840504, + "learning_rate": 4.5000985855784746e-05, + "loss": 0.649, + "step": 1303 + }, + { + "epoch": 0.6954666666666667, + "grad_norm": 0.4424568065373181, + "learning_rate": 4.485674639850333e-05, + "loss": 0.7219, + "step": 1304 + }, + { + "epoch": 0.696, + "grad_norm": 0.5217758891399114, + "learning_rate": 4.471267160734731e-05, + "loss": 0.7046, + "step": 1305 + }, + { + "epoch": 0.6965333333333333, + "grad_norm": 0.36482890737078805, + "learning_rate": 4.456876191254582e-05, + "loss": 0.605, + "step": 1306 + }, + { + "epoch": 0.6970666666666666, + "grad_norm": 0.41144418918267145, + "learning_rate": 4.442501774383515e-05, + "loss": 0.6534, + "step": 1307 + }, + { + "epoch": 0.6976, + "grad_norm": 0.42698741671175594, + "learning_rate": 4.428143953045717e-05, + "loss": 0.704, + "step": 1308 + }, + { + "epoch": 0.6981333333333334, + "grad_norm": 0.3781262769116132, + "learning_rate": 4.413802770115816e-05, + "loss": 0.6249, + "step": 1309 + }, + { + "epoch": 0.6986666666666667, + "grad_norm": 0.3811219502776769, + "learning_rate": 4.399478268418771e-05, + "loss": 0.6517, + "step": 1310 + }, + { + "epoch": 0.6992, + "grad_norm": 0.3962703830271651, + "learning_rate": 4.385170490729712e-05, + "loss": 0.647, + "step": 1311 + }, + { + "epoch": 0.6997333333333333, + "grad_norm": 0.47512105748597355, + "learning_rate": 4.3708794797738375e-05, + "loss": 0.6546, + "step": 1312 + }, + { + "epoch": 0.7002666666666667, + "grad_norm": 0.3864120022213843, + "learning_rate": 4.3566052782262735e-05, + "loss": 0.6675, + "step": 1313 + }, + { + "epoch": 0.7008, + "grad_norm": 0.40235347840914154, + "learning_rate": 4.342347928711953e-05, + "loss": 0.6411, + "step": 1314 + }, + { + "epoch": 0.7013333333333334, + "grad_norm": 0.5395692983578562, + "learning_rate": 4.328107473805487e-05, + "loss": 0.668, + "step": 1315 + }, + { + "epoch": 0.7018666666666666, + "grad_norm": 0.4633234751653702, + "learning_rate": 4.3138839560310303e-05, + "loss": 0.6619, + "step": 1316 + }, + { + "epoch": 0.7024, + "grad_norm": 0.4252508666674081, + "learning_rate": 4.2996774178621736e-05, + "loss": 0.6064, + "step": 1317 + }, + { + "epoch": 0.7029333333333333, + "grad_norm": 0.40105004829371027, + "learning_rate": 4.2854879017217894e-05, + "loss": 0.5626, + "step": 1318 + }, + { + "epoch": 0.7034666666666667, + "grad_norm": 0.42489712099458443, + "learning_rate": 4.271315449981934e-05, + "loss": 0.647, + "step": 1319 + }, + { + "epoch": 0.704, + "grad_norm": 0.43580292129353576, + "learning_rate": 4.257160104963696e-05, + "loss": 0.6945, + "step": 1320 + }, + { + "epoch": 0.7045333333333333, + "grad_norm": 0.39032402412353395, + "learning_rate": 4.2430219089370823e-05, + "loss": 0.6747, + "step": 1321 + }, + { + "epoch": 0.7050666666666666, + "grad_norm": 0.4538435511477062, + "learning_rate": 4.228900904120895e-05, + "loss": 0.6278, + "step": 1322 + }, + { + "epoch": 0.7056, + "grad_norm": 0.40414914792484313, + "learning_rate": 4.2147971326825966e-05, + "loss": 0.6237, + "step": 1323 + }, + { + "epoch": 0.7061333333333333, + "grad_norm": 0.3715303103103586, + "learning_rate": 4.200710636738189e-05, + "loss": 0.6401, + "step": 1324 + }, + { + "epoch": 0.7066666666666667, + "grad_norm": 0.3616164453355704, + "learning_rate": 4.1866414583520877e-05, + "loss": 0.6366, + "step": 1325 + }, + { + "epoch": 0.7072, + "grad_norm": 0.40727321244216635, + "learning_rate": 4.172589639536991e-05, + "loss": 0.6349, + "step": 1326 + }, + { + "epoch": 0.7077333333333333, + "grad_norm": 0.3995228220202781, + "learning_rate": 4.158555222253771e-05, + "loss": 0.5915, + "step": 1327 + }, + { + "epoch": 0.7082666666666667, + "grad_norm": 0.3796897322527262, + "learning_rate": 4.14453824841132e-05, + "loss": 0.623, + "step": 1328 + }, + { + "epoch": 0.7088, + "grad_norm": 0.4191499271698253, + "learning_rate": 4.130538759866457e-05, + "loss": 0.7008, + "step": 1329 + }, + { + "epoch": 0.7093333333333334, + "grad_norm": 0.4151839265104562, + "learning_rate": 4.1165567984237764e-05, + "loss": 0.6186, + "step": 1330 + }, + { + "epoch": 0.7098666666666666, + "grad_norm": 0.4765363310999923, + "learning_rate": 4.102592405835536e-05, + "loss": 0.6983, + "step": 1331 + }, + { + "epoch": 0.7104, + "grad_norm": 0.45251210327244407, + "learning_rate": 4.088645623801534e-05, + "loss": 0.7252, + "step": 1332 + }, + { + "epoch": 0.7109333333333333, + "grad_norm": 0.46979099440068206, + "learning_rate": 4.074716493968975e-05, + "loss": 0.7144, + "step": 1333 + }, + { + "epoch": 0.7114666666666667, + "grad_norm": 0.400590774372865, + "learning_rate": 4.060805057932359e-05, + "loss": 0.6376, + "step": 1334 + }, + { + "epoch": 0.712, + "grad_norm": 0.3643801004323646, + "learning_rate": 4.046911357233343e-05, + "loss": 0.6127, + "step": 1335 + }, + { + "epoch": 0.7125333333333334, + "grad_norm": 0.411676426115019, + "learning_rate": 4.0330354333606234e-05, + "loss": 0.6431, + "step": 1336 + }, + { + "epoch": 0.7130666666666666, + "grad_norm": 0.46124395883928804, + "learning_rate": 4.019177327749822e-05, + "loss": 0.7374, + "step": 1337 + }, + { + "epoch": 0.7136, + "grad_norm": 0.435690599029188, + "learning_rate": 4.00533708178334e-05, + "loss": 0.6413, + "step": 1338 + }, + { + "epoch": 0.7141333333333333, + "grad_norm": 0.4811840917063531, + "learning_rate": 3.991514736790258e-05, + "loss": 0.7198, + "step": 1339 + }, + { + "epoch": 0.7146666666666667, + "grad_norm": 0.3792243676588303, + "learning_rate": 3.977710334046193e-05, + "loss": 0.6197, + "step": 1340 + }, + { + "epoch": 0.7152, + "grad_norm": 0.4062410023533493, + "learning_rate": 3.963923914773187e-05, + "loss": 0.6669, + "step": 1341 + }, + { + "epoch": 0.7157333333333333, + "grad_norm": 0.5052405679987543, + "learning_rate": 3.950155520139581e-05, + "loss": 0.773, + "step": 1342 + }, + { + "epoch": 0.7162666666666667, + "grad_norm": 0.414816182297928, + "learning_rate": 3.936405191259891e-05, + "loss": 0.6194, + "step": 1343 + }, + { + "epoch": 0.7168, + "grad_norm": 0.46485351218905546, + "learning_rate": 3.922672969194686e-05, + "loss": 0.5999, + "step": 1344 + }, + { + "epoch": 0.7173333333333334, + "grad_norm": 0.40217650610978745, + "learning_rate": 3.9089588949504655e-05, + "loss": 0.6472, + "step": 1345 + }, + { + "epoch": 0.7178666666666667, + "grad_norm": 0.41126670572783974, + "learning_rate": 3.895263009479534e-05, + "loss": 0.6621, + "step": 1346 + }, + { + "epoch": 0.7184, + "grad_norm": 0.42123602460746146, + "learning_rate": 3.8815853536798904e-05, + "loss": 0.676, + "step": 1347 + }, + { + "epoch": 0.7189333333333333, + "grad_norm": 0.4253579515095016, + "learning_rate": 3.867925968395085e-05, + "loss": 0.6946, + "step": 1348 + }, + { + "epoch": 0.7194666666666667, + "grad_norm": 0.3879289872092901, + "learning_rate": 3.854284894414122e-05, + "loss": 0.651, + "step": 1349 + }, + { + "epoch": 0.72, + "grad_norm": 0.47074809141769725, + "learning_rate": 3.840662172471315e-05, + "loss": 0.6865, + "step": 1350 + }, + { + "epoch": 0.7205333333333334, + "grad_norm": 0.4525618442462892, + "learning_rate": 3.82705784324618e-05, + "loss": 0.6473, + "step": 1351 + }, + { + "epoch": 0.7210666666666666, + "grad_norm": 0.4211196142878507, + "learning_rate": 3.8134719473633094e-05, + "loss": 0.673, + "step": 1352 + }, + { + "epoch": 0.7216, + "grad_norm": 0.3932298817755076, + "learning_rate": 3.79990452539225e-05, + "loss": 0.6081, + "step": 1353 + }, + { + "epoch": 0.7221333333333333, + "grad_norm": 0.4484407693371872, + "learning_rate": 3.786355617847385e-05, + "loss": 0.6591, + "step": 1354 + }, + { + "epoch": 0.7226666666666667, + "grad_norm": 0.4157957671793489, + "learning_rate": 3.772825265187802e-05, + "loss": 0.7103, + "step": 1355 + }, + { + "epoch": 0.7232, + "grad_norm": 0.40025302364842813, + "learning_rate": 3.759313507817196e-05, + "loss": 0.7238, + "step": 1356 + }, + { + "epoch": 0.7237333333333333, + "grad_norm": 0.3735438460830809, + "learning_rate": 3.7458203860837234e-05, + "loss": 0.6057, + "step": 1357 + }, + { + "epoch": 0.7242666666666666, + "grad_norm": 0.4258025624491941, + "learning_rate": 3.732345940279893e-05, + "loss": 0.6287, + "step": 1358 + }, + { + "epoch": 0.7248, + "grad_norm": 0.41435165265860574, + "learning_rate": 3.7188902106424416e-05, + "loss": 0.7342, + "step": 1359 + }, + { + "epoch": 0.7253333333333334, + "grad_norm": 0.43510629641125437, + "learning_rate": 3.705453237352227e-05, + "loss": 0.6102, + "step": 1360 + }, + { + "epoch": 0.7258666666666667, + "grad_norm": 0.4155326388136457, + "learning_rate": 3.692035060534088e-05, + "loss": 0.6477, + "step": 1361 + }, + { + "epoch": 0.7264, + "grad_norm": 0.3920465694406834, + "learning_rate": 3.678635720256737e-05, + "loss": 0.6337, + "step": 1362 + }, + { + "epoch": 0.7269333333333333, + "grad_norm": 0.3280776536946831, + "learning_rate": 3.665255256532638e-05, + "loss": 0.5867, + "step": 1363 + }, + { + "epoch": 0.7274666666666667, + "grad_norm": 0.4020150710561621, + "learning_rate": 3.651893709317887e-05, + "loss": 0.6606, + "step": 1364 + }, + { + "epoch": 0.728, + "grad_norm": 0.4321202747337267, + "learning_rate": 3.638551118512089e-05, + "loss": 0.653, + "step": 1365 + }, + { + "epoch": 0.7285333333333334, + "grad_norm": 0.4145771553289532, + "learning_rate": 3.625227523958252e-05, + "loss": 0.6449, + "step": 1366 + }, + { + "epoch": 0.7290666666666666, + "grad_norm": 0.4000997935495122, + "learning_rate": 3.611922965442648e-05, + "loss": 0.6388, + "step": 1367 + }, + { + "epoch": 0.7296, + "grad_norm": 0.37776776801238715, + "learning_rate": 3.5986374826947066e-05, + "loss": 0.5843, + "step": 1368 + }, + { + "epoch": 0.7301333333333333, + "grad_norm": 0.38639775734571263, + "learning_rate": 3.5853711153868965e-05, + "loss": 0.6342, + "step": 1369 + }, + { + "epoch": 0.7306666666666667, + "grad_norm": 0.38802333600420025, + "learning_rate": 3.5721239031346066e-05, + "loss": 0.5957, + "step": 1370 + }, + { + "epoch": 0.7312, + "grad_norm": 0.4380942056655734, + "learning_rate": 3.558895885496023e-05, + "loss": 0.6544, + "step": 1371 + }, + { + "epoch": 0.7317333333333333, + "grad_norm": 0.4043243800838281, + "learning_rate": 3.545687101972013e-05, + "loss": 0.6314, + "step": 1372 + }, + { + "epoch": 0.7322666666666666, + "grad_norm": 0.37685966819642175, + "learning_rate": 3.53249759200601e-05, + "loss": 0.5692, + "step": 1373 + }, + { + "epoch": 0.7328, + "grad_norm": 0.5074379766120739, + "learning_rate": 3.519327394983888e-05, + "loss": 0.6473, + "step": 1374 + }, + { + "epoch": 0.7333333333333333, + "grad_norm": 0.4747154935129227, + "learning_rate": 3.506176550233863e-05, + "loss": 0.6918, + "step": 1375 + }, + { + "epoch": 0.7338666666666667, + "grad_norm": 0.3809069679768159, + "learning_rate": 3.4930450970263485e-05, + "loss": 0.6786, + "step": 1376 + }, + { + "epoch": 0.7344, + "grad_norm": 0.3936443972311563, + "learning_rate": 3.479933074573858e-05, + "loss": 0.6358, + "step": 1377 + }, + { + "epoch": 0.7349333333333333, + "grad_norm": 0.4460387695423164, + "learning_rate": 3.46684052203088e-05, + "loss": 0.6629, + "step": 1378 + }, + { + "epoch": 0.7354666666666667, + "grad_norm": 0.47158508880496003, + "learning_rate": 3.4537674784937614e-05, + "loss": 0.6718, + "step": 1379 + }, + { + "epoch": 0.736, + "grad_norm": 0.45039878447397613, + "learning_rate": 3.440713983000601e-05, + "loss": 0.594, + "step": 1380 + }, + { + "epoch": 0.7365333333333334, + "grad_norm": 0.3760296162927113, + "learning_rate": 3.427680074531113e-05, + "loss": 0.6529, + "step": 1381 + }, + { + "epoch": 0.7370666666666666, + "grad_norm": 0.3887184512734403, + "learning_rate": 3.4146657920065285e-05, + "loss": 0.6598, + "step": 1382 + }, + { + "epoch": 0.7376, + "grad_norm": 0.4221929479111637, + "learning_rate": 3.401671174289469e-05, + "loss": 0.6554, + "step": 1383 + }, + { + "epoch": 0.7381333333333333, + "grad_norm": 0.6988186268204654, + "learning_rate": 3.388696260183832e-05, + "loss": 0.6674, + "step": 1384 + }, + { + "epoch": 0.7386666666666667, + "grad_norm": 0.36250401220062856, + "learning_rate": 3.3757410884346894e-05, + "loss": 0.5585, + "step": 1385 + }, + { + "epoch": 0.7392, + "grad_norm": 0.36621982144568793, + "learning_rate": 3.362805697728145e-05, + "loss": 0.6274, + "step": 1386 + }, + { + "epoch": 0.7397333333333334, + "grad_norm": 0.38601818339233507, + "learning_rate": 3.3498901266912396e-05, + "loss": 0.5878, + "step": 1387 + }, + { + "epoch": 0.7402666666666666, + "grad_norm": 0.4316403933335295, + "learning_rate": 3.336994413891828e-05, + "loss": 0.6424, + "step": 1388 + }, + { + "epoch": 0.7408, + "grad_norm": 0.3907795425417193, + "learning_rate": 3.324118597838464e-05, + "loss": 0.6275, + "step": 1389 + }, + { + "epoch": 0.7413333333333333, + "grad_norm": 0.45314061352335727, + "learning_rate": 3.3112627169802946e-05, + "loss": 0.6697, + "step": 1390 + }, + { + "epoch": 0.7418666666666667, + "grad_norm": 0.4256923628283321, + "learning_rate": 3.298426809706928e-05, + "loss": 0.668, + "step": 1391 + }, + { + "epoch": 0.7424, + "grad_norm": 0.5072675399710331, + "learning_rate": 3.285610914348332e-05, + "loss": 0.7495, + "step": 1392 + }, + { + "epoch": 0.7429333333333333, + "grad_norm": 0.3819982521836467, + "learning_rate": 3.2728150691747115e-05, + "loss": 0.6652, + "step": 1393 + }, + { + "epoch": 0.7434666666666667, + "grad_norm": 0.4377503488023667, + "learning_rate": 3.2600393123964113e-05, + "loss": 0.6411, + "step": 1394 + }, + { + "epoch": 0.744, + "grad_norm": 0.36867785125647967, + "learning_rate": 3.2472836821637744e-05, + "loss": 0.5995, + "step": 1395 + }, + { + "epoch": 0.7445333333333334, + "grad_norm": 0.40642499892036954, + "learning_rate": 3.234548216567049e-05, + "loss": 0.6682, + "step": 1396 + }, + { + "epoch": 0.7450666666666667, + "grad_norm": 0.385843008891069, + "learning_rate": 3.2218329536362704e-05, + "loss": 0.6242, + "step": 1397 + }, + { + "epoch": 0.7456, + "grad_norm": 0.4325274716602865, + "learning_rate": 3.209137931341143e-05, + "loss": 0.6582, + "step": 1398 + }, + { + "epoch": 0.7461333333333333, + "grad_norm": 0.46100598318068164, + "learning_rate": 3.196463187590929e-05, + "loss": 0.6108, + "step": 1399 + }, + { + "epoch": 0.7466666666666667, + "grad_norm": 0.42482416416988833, + "learning_rate": 3.1838087602343344e-05, + "loss": 0.6546, + "step": 1400 + }, + { + "epoch": 0.7472, + "grad_norm": 0.39647749049588554, + "learning_rate": 3.1711746870594086e-05, + "loss": 0.6523, + "step": 1401 + }, + { + "epoch": 0.7477333333333334, + "grad_norm": 0.441865136236515, + "learning_rate": 3.158561005793402e-05, + "loss": 0.6661, + "step": 1402 + }, + { + "epoch": 0.7482666666666666, + "grad_norm": 0.4417314914160828, + "learning_rate": 3.145967754102691e-05, + "loss": 0.6063, + "step": 1403 + }, + { + "epoch": 0.7488, + "grad_norm": 0.3908224130249587, + "learning_rate": 3.1333949695926324e-05, + "loss": 0.6564, + "step": 1404 + }, + { + "epoch": 0.7493333333333333, + "grad_norm": 0.41361323719504467, + "learning_rate": 3.120842689807468e-05, + "loss": 0.5932, + "step": 1405 + }, + { + "epoch": 0.7498666666666667, + "grad_norm": 0.45220246474049364, + "learning_rate": 3.108310952230212e-05, + "loss": 0.6369, + "step": 1406 + }, + { + "epoch": 0.7504, + "grad_norm": 0.37290705932486357, + "learning_rate": 3.0957997942825336e-05, + "loss": 0.5997, + "step": 1407 + }, + { + "epoch": 0.7509333333333333, + "grad_norm": 0.3952853062774028, + "learning_rate": 3.083309253324651e-05, + "loss": 0.6158, + "step": 1408 + }, + { + "epoch": 0.7514666666666666, + "grad_norm": 0.411211223469002, + "learning_rate": 3.070839366655215e-05, + "loss": 0.6074, + "step": 1409 + }, + { + "epoch": 0.752, + "grad_norm": 0.41492456552185275, + "learning_rate": 3.058390171511196e-05, + "loss": 0.6444, + "step": 1410 + }, + { + "epoch": 0.7525333333333334, + "grad_norm": 0.3567417740050895, + "learning_rate": 3.0459617050677868e-05, + "loss": 0.62, + "step": 1411 + }, + { + "epoch": 0.7530666666666667, + "grad_norm": 0.41309505589507733, + "learning_rate": 3.0335540044382694e-05, + "loss": 0.6294, + "step": 1412 + }, + { + "epoch": 0.7536, + "grad_norm": 0.4773359945231895, + "learning_rate": 3.021167106673928e-05, + "loss": 0.6836, + "step": 1413 + }, + { + "epoch": 0.7541333333333333, + "grad_norm": 0.42567710901459754, + "learning_rate": 3.008801048763914e-05, + "loss": 0.7079, + "step": 1414 + }, + { + "epoch": 0.7546666666666667, + "grad_norm": 0.40052062915954073, + "learning_rate": 2.996455867635155e-05, + "loss": 0.6326, + "step": 1415 + }, + { + "epoch": 0.7552, + "grad_norm": 0.43124179647071165, + "learning_rate": 2.9841316001522347e-05, + "loss": 0.5604, + "step": 1416 + }, + { + "epoch": 0.7557333333333334, + "grad_norm": 0.42862447927787145, + "learning_rate": 2.9718282831172883e-05, + "loss": 0.6885, + "step": 1417 + }, + { + "epoch": 0.7562666666666666, + "grad_norm": 0.45500083220282006, + "learning_rate": 2.9595459532698854e-05, + "loss": 0.6823, + "step": 1418 + }, + { + "epoch": 0.7568, + "grad_norm": 0.37262762163934443, + "learning_rate": 2.9472846472869298e-05, + "loss": 0.63, + "step": 1419 + }, + { + "epoch": 0.7573333333333333, + "grad_norm": 0.39930643408061656, + "learning_rate": 2.9350444017825385e-05, + "loss": 0.6416, + "step": 1420 + }, + { + "epoch": 0.7578666666666667, + "grad_norm": 0.4450527614039218, + "learning_rate": 2.922825253307947e-05, + "loss": 0.6659, + "step": 1421 + }, + { + "epoch": 0.7584, + "grad_norm": 0.3509670630771238, + "learning_rate": 2.9106272383513835e-05, + "loss": 0.5801, + "step": 1422 + }, + { + "epoch": 0.7589333333333333, + "grad_norm": 0.39275889558823696, + "learning_rate": 2.898450393337977e-05, + "loss": 0.6156, + "step": 1423 + }, + { + "epoch": 0.7594666666666666, + "grad_norm": 0.4365065813793774, + "learning_rate": 2.8862947546296315e-05, + "loss": 0.6213, + "step": 1424 + }, + { + "epoch": 0.76, + "grad_norm": 0.3129303170662575, + "learning_rate": 2.874160358524931e-05, + "loss": 0.5871, + "step": 1425 + }, + { + "epoch": 0.7605333333333333, + "grad_norm": 0.39102267898629967, + "learning_rate": 2.8620472412590228e-05, + "loss": 0.5783, + "step": 1426 + }, + { + "epoch": 0.7610666666666667, + "grad_norm": 0.39074303616585854, + "learning_rate": 2.8499554390035143e-05, + "loss": 0.6334, + "step": 1427 + }, + { + "epoch": 0.7616, + "grad_norm": 0.47323989776245445, + "learning_rate": 2.8378849878663628e-05, + "loss": 0.6175, + "step": 1428 + }, + { + "epoch": 0.7621333333333333, + "grad_norm": 0.46029568340569943, + "learning_rate": 2.8258359238917665e-05, + "loss": 0.6693, + "step": 1429 + }, + { + "epoch": 0.7626666666666667, + "grad_norm": 0.4273119427517804, + "learning_rate": 2.8138082830600554e-05, + "loss": 0.632, + "step": 1430 + }, + { + "epoch": 0.7632, + "grad_norm": 0.36461203285948796, + "learning_rate": 2.8018021012875994e-05, + "loss": 0.5611, + "step": 1431 + }, + { + "epoch": 0.7637333333333334, + "grad_norm": 0.4301826055749961, + "learning_rate": 2.7898174144266732e-05, + "loss": 0.6514, + "step": 1432 + }, + { + "epoch": 0.7642666666666666, + "grad_norm": 0.37385563161907515, + "learning_rate": 2.7778542582653744e-05, + "loss": 0.6295, + "step": 1433 + }, + { + "epoch": 0.7648, + "grad_norm": 0.40609506955915836, + "learning_rate": 2.7659126685275027e-05, + "loss": 0.6105, + "step": 1434 + }, + { + "epoch": 0.7653333333333333, + "grad_norm": 0.5230191893009891, + "learning_rate": 2.753992680872457e-05, + "loss": 0.6659, + "step": 1435 + }, + { + "epoch": 0.7658666666666667, + "grad_norm": 0.3839015364109024, + "learning_rate": 2.7420943308951284e-05, + "loss": 0.6077, + "step": 1436 + }, + { + "epoch": 0.7664, + "grad_norm": 0.40991190312692427, + "learning_rate": 2.7302176541257986e-05, + "loss": 0.6648, + "step": 1437 + }, + { + "epoch": 0.7669333333333334, + "grad_norm": 0.4183908345604154, + "learning_rate": 2.7183626860300247e-05, + "loss": 0.591, + "step": 1438 + }, + { + "epoch": 0.7674666666666666, + "grad_norm": 0.3839402985392285, + "learning_rate": 2.7065294620085424e-05, + "loss": 0.6354, + "step": 1439 + }, + { + "epoch": 0.768, + "grad_norm": 0.39316191117423616, + "learning_rate": 2.6947180173971508e-05, + "loss": 0.5511, + "step": 1440 + }, + { + "epoch": 0.7685333333333333, + "grad_norm": 0.3757351480584375, + "learning_rate": 2.6829283874666233e-05, + "loss": 0.6634, + "step": 1441 + }, + { + "epoch": 0.7690666666666667, + "grad_norm": 0.38586563028159926, + "learning_rate": 2.6711606074225782e-05, + "loss": 0.6076, + "step": 1442 + }, + { + "epoch": 0.7696, + "grad_norm": 0.40401411814743754, + "learning_rate": 2.659414712405398e-05, + "loss": 0.6151, + "step": 1443 + }, + { + "epoch": 0.7701333333333333, + "grad_norm": 0.4848653050261294, + "learning_rate": 2.647690737490106e-05, + "loss": 0.6956, + "step": 1444 + }, + { + "epoch": 0.7706666666666667, + "grad_norm": 0.5332637343551315, + "learning_rate": 2.6359887176862718e-05, + "loss": 0.6617, + "step": 1445 + }, + { + "epoch": 0.7712, + "grad_norm": 0.3740767346843177, + "learning_rate": 2.6243086879379e-05, + "loss": 0.6384, + "step": 1446 + }, + { + "epoch": 0.7717333333333334, + "grad_norm": 0.3720633228138245, + "learning_rate": 2.6126506831233344e-05, + "loss": 0.609, + "step": 1447 + }, + { + "epoch": 0.7722666666666667, + "grad_norm": 0.47431548855743244, + "learning_rate": 2.6010147380551475e-05, + "loss": 0.5865, + "step": 1448 + }, + { + "epoch": 0.7728, + "grad_norm": 0.4379463383797498, + "learning_rate": 2.5894008874800325e-05, + "loss": 0.6096, + "step": 1449 + }, + { + "epoch": 0.7733333333333333, + "grad_norm": 0.39255985136755134, + "learning_rate": 2.577809166078716e-05, + "loss": 0.6216, + "step": 1450 + }, + { + "epoch": 0.7738666666666667, + "grad_norm": 0.39246122531624794, + "learning_rate": 2.566239608465838e-05, + "loss": 0.6354, + "step": 1451 + }, + { + "epoch": 0.7744, + "grad_norm": 0.41380264443785086, + "learning_rate": 2.5546922491898495e-05, + "loss": 0.6603, + "step": 1452 + }, + { + "epoch": 0.7749333333333334, + "grad_norm": 0.3363711579115939, + "learning_rate": 2.543167122732918e-05, + "loss": 0.6206, + "step": 1453 + }, + { + "epoch": 0.7754666666666666, + "grad_norm": 0.410785747452271, + "learning_rate": 2.5316642635108244e-05, + "loss": 0.6419, + "step": 1454 + }, + { + "epoch": 0.776, + "grad_norm": 0.38915997907587807, + "learning_rate": 2.5201837058728505e-05, + "loss": 0.6565, + "step": 1455 + }, + { + "epoch": 0.7765333333333333, + "grad_norm": 0.468336153616573, + "learning_rate": 2.508725484101684e-05, + "loss": 0.6887, + "step": 1456 + }, + { + "epoch": 0.7770666666666667, + "grad_norm": 0.38816798537475805, + "learning_rate": 2.4972896324133144e-05, + "loss": 0.669, + "step": 1457 + }, + { + "epoch": 0.7776, + "grad_norm": 0.4407475349265452, + "learning_rate": 2.485876184956928e-05, + "loss": 0.6904, + "step": 1458 + }, + { + "epoch": 0.7781333333333333, + "grad_norm": 0.6455077887809194, + "learning_rate": 2.4744851758148156e-05, + "loss": 0.6925, + "step": 1459 + }, + { + "epoch": 0.7786666666666666, + "grad_norm": 0.4047821122693816, + "learning_rate": 2.4631166390022574e-05, + "loss": 0.6348, + "step": 1460 + }, + { + "epoch": 0.7792, + "grad_norm": 0.3904817170860773, + "learning_rate": 2.451770608467432e-05, + "loss": 0.5996, + "step": 1461 + }, + { + "epoch": 0.7797333333333333, + "grad_norm": 0.4256817343847479, + "learning_rate": 2.4404471180913058e-05, + "loss": 0.6196, + "step": 1462 + }, + { + "epoch": 0.7802666666666667, + "grad_norm": 0.3564141647024927, + "learning_rate": 2.429146201687538e-05, + "loss": 0.5845, + "step": 1463 + }, + { + "epoch": 0.7808, + "grad_norm": 0.43895240241864214, + "learning_rate": 2.417867893002387e-05, + "loss": 0.6411, + "step": 1464 + }, + { + "epoch": 0.7813333333333333, + "grad_norm": 0.41344241738837445, + "learning_rate": 2.4066122257145894e-05, + "loss": 0.6282, + "step": 1465 + }, + { + "epoch": 0.7818666666666667, + "grad_norm": 0.47764251794889384, + "learning_rate": 2.3953792334352787e-05, + "loss": 0.6491, + "step": 1466 + }, + { + "epoch": 0.7824, + "grad_norm": 0.40405844877676395, + "learning_rate": 2.3841689497078746e-05, + "loss": 0.6292, + "step": 1467 + }, + { + "epoch": 0.7829333333333334, + "grad_norm": 0.40747542751942456, + "learning_rate": 2.3729814080079816e-05, + "loss": 0.5761, + "step": 1468 + }, + { + "epoch": 0.7834666666666666, + "grad_norm": 0.4172261378329325, + "learning_rate": 2.361816641743303e-05, + "loss": 0.5867, + "step": 1469 + }, + { + "epoch": 0.784, + "grad_norm": 0.3880975350407753, + "learning_rate": 2.3506746842535242e-05, + "loss": 0.6298, + "step": 1470 + }, + { + "epoch": 0.7845333333333333, + "grad_norm": 0.454827666158906, + "learning_rate": 2.339555568810221e-05, + "loss": 0.6357, + "step": 1471 + }, + { + "epoch": 0.7850666666666667, + "grad_norm": 0.4135524865170144, + "learning_rate": 2.328459328616759e-05, + "loss": 0.6125, + "step": 1472 + }, + { + "epoch": 0.7856, + "grad_norm": 0.3627291761852448, + "learning_rate": 2.3173859968081944e-05, + "loss": 0.5833, + "step": 1473 + }, + { + "epoch": 0.7861333333333334, + "grad_norm": 0.41676502372058544, + "learning_rate": 2.306335606451181e-05, + "loss": 0.6458, + "step": 1474 + }, + { + "epoch": 0.7866666666666666, + "grad_norm": 0.3658004704309575, + "learning_rate": 2.295308190543859e-05, + "loss": 0.5856, + "step": 1475 + }, + { + "epoch": 0.7872, + "grad_norm": 0.4634414266057261, + "learning_rate": 2.2843037820157675e-05, + "loss": 0.6891, + "step": 1476 + }, + { + "epoch": 0.7877333333333333, + "grad_norm": 0.56074419797481, + "learning_rate": 2.2733224137277366e-05, + "loss": 0.6194, + "step": 1477 + }, + { + "epoch": 0.7882666666666667, + "grad_norm": 0.4319324344035407, + "learning_rate": 2.262364118471805e-05, + "loss": 0.6244, + "step": 1478 + }, + { + "epoch": 0.7888, + "grad_norm": 0.38178275769314285, + "learning_rate": 2.251428928971102e-05, + "loss": 0.6477, + "step": 1479 + }, + { + "epoch": 0.7893333333333333, + "grad_norm": 0.3917852782025771, + "learning_rate": 2.2405168778797646e-05, + "loss": 0.604, + "step": 1480 + }, + { + "epoch": 0.7898666666666667, + "grad_norm": 0.43663593848716403, + "learning_rate": 2.2296279977828337e-05, + "loss": 0.6185, + "step": 1481 + }, + { + "epoch": 0.7904, + "grad_norm": 0.4232853699490676, + "learning_rate": 2.2187623211961562e-05, + "loss": 0.6224, + "step": 1482 + }, + { + "epoch": 0.7909333333333334, + "grad_norm": 0.3658584722646184, + "learning_rate": 2.2079198805662914e-05, + "loss": 0.6368, + "step": 1483 + }, + { + "epoch": 0.7914666666666667, + "grad_norm": 0.44565137919498404, + "learning_rate": 2.1971007082704164e-05, + "loss": 0.6221, + "step": 1484 + }, + { + "epoch": 0.792, + "grad_norm": 0.44278550694929475, + "learning_rate": 2.1863048366162208e-05, + "loss": 0.6462, + "step": 1485 + }, + { + "epoch": 0.7925333333333333, + "grad_norm": 0.4123636761689713, + "learning_rate": 2.1755322978418137e-05, + "loss": 0.7347, + "step": 1486 + }, + { + "epoch": 0.7930666666666667, + "grad_norm": 0.4079679364330374, + "learning_rate": 2.1647831241156302e-05, + "loss": 0.6451, + "step": 1487 + }, + { + "epoch": 0.7936, + "grad_norm": 0.5168076566291907, + "learning_rate": 2.1540573475363402e-05, + "loss": 0.6402, + "step": 1488 + }, + { + "epoch": 0.7941333333333334, + "grad_norm": 0.38173849341906546, + "learning_rate": 2.1433550001327373e-05, + "loss": 0.6282, + "step": 1489 + }, + { + "epoch": 0.7946666666666666, + "grad_norm": 0.4524165538190746, + "learning_rate": 2.1326761138636553e-05, + "loss": 0.6428, + "step": 1490 + }, + { + "epoch": 0.7952, + "grad_norm": 0.4143510686204648, + "learning_rate": 2.1220207206178688e-05, + "loss": 0.6359, + "step": 1491 + }, + { + "epoch": 0.7957333333333333, + "grad_norm": 0.4433278715654798, + "learning_rate": 2.111388852214001e-05, + "loss": 0.6412, + "step": 1492 + }, + { + "epoch": 0.7962666666666667, + "grad_norm": 0.44529038500856755, + "learning_rate": 2.1007805404004242e-05, + "loss": 0.573, + "step": 1493 + }, + { + "epoch": 0.7968, + "grad_norm": 0.3726690994770168, + "learning_rate": 2.0901958168551638e-05, + "loss": 0.6233, + "step": 1494 + }, + { + "epoch": 0.7973333333333333, + "grad_norm": 0.36030377090053944, + "learning_rate": 2.0796347131858186e-05, + "loss": 0.6211, + "step": 1495 + }, + { + "epoch": 0.7978666666666666, + "grad_norm": 0.40693922341498406, + "learning_rate": 2.069097260929439e-05, + "loss": 0.5771, + "step": 1496 + }, + { + "epoch": 0.7984, + "grad_norm": 0.401872353212445, + "learning_rate": 2.058583491552465e-05, + "loss": 0.6278, + "step": 1497 + }, + { + "epoch": 0.7989333333333334, + "grad_norm": 0.4802253049924863, + "learning_rate": 2.048093436450603e-05, + "loss": 0.7407, + "step": 1498 + }, + { + "epoch": 0.7994666666666667, + "grad_norm": 0.3927311504866121, + "learning_rate": 2.0376271269487514e-05, + "loss": 0.6169, + "step": 1499 + }, + { + "epoch": 0.8, + "grad_norm": 0.3980797671464647, + "learning_rate": 2.027184594300898e-05, + "loss": 0.6038, + "step": 1500 + }, + { + "epoch": 0.8005333333333333, + "grad_norm": 0.396463500581168, + "learning_rate": 2.0167658696900317e-05, + "loss": 0.6191, + "step": 1501 + }, + { + "epoch": 0.8010666666666667, + "grad_norm": 0.4246288215052101, + "learning_rate": 2.0063709842280432e-05, + "loss": 0.6639, + "step": 1502 + }, + { + "epoch": 0.8016, + "grad_norm": 0.36942287784701217, + "learning_rate": 1.995999968955641e-05, + "loss": 0.5835, + "step": 1503 + }, + { + "epoch": 0.8021333333333334, + "grad_norm": 0.39238846065412564, + "learning_rate": 1.985652854842247e-05, + "loss": 0.6085, + "step": 1504 + }, + { + "epoch": 0.8026666666666666, + "grad_norm": 0.39605568898365406, + "learning_rate": 1.9753296727859195e-05, + "loss": 0.5944, + "step": 1505 + }, + { + "epoch": 0.8032, + "grad_norm": 0.47417463303431173, + "learning_rate": 1.9650304536132426e-05, + "loss": 0.6406, + "step": 1506 + }, + { + "epoch": 0.8037333333333333, + "grad_norm": 0.41033444761054455, + "learning_rate": 1.9547552280792524e-05, + "loss": 0.5882, + "step": 1507 + }, + { + "epoch": 0.8042666666666667, + "grad_norm": 0.3506013690792776, + "learning_rate": 1.9445040268673298e-05, + "loss": 0.6245, + "step": 1508 + }, + { + "epoch": 0.8048, + "grad_norm": 0.3842673745708574, + "learning_rate": 1.9342768805891178e-05, + "loss": 0.6776, + "step": 1509 + }, + { + "epoch": 0.8053333333333333, + "grad_norm": 0.37624110756408163, + "learning_rate": 1.9240738197844278e-05, + "loss": 0.5891, + "step": 1510 + }, + { + "epoch": 0.8058666666666666, + "grad_norm": 0.44301843398022756, + "learning_rate": 1.9138948749211472e-05, + "loss": 0.6831, + "step": 1511 + }, + { + "epoch": 0.8064, + "grad_norm": 0.43605392399193227, + "learning_rate": 1.903740076395151e-05, + "loss": 0.6384, + "step": 1512 + }, + { + "epoch": 0.8069333333333333, + "grad_norm": 0.4598273335227648, + "learning_rate": 1.8936094545302095e-05, + "loss": 0.5941, + "step": 1513 + }, + { + "epoch": 0.8074666666666667, + "grad_norm": 0.4141225681986433, + "learning_rate": 1.883503039577894e-05, + "loss": 0.6381, + "step": 1514 + }, + { + "epoch": 0.808, + "grad_norm": 0.42394156093608293, + "learning_rate": 1.8734208617174988e-05, + "loss": 0.6768, + "step": 1515 + }, + { + "epoch": 0.8085333333333333, + "grad_norm": 0.3548570188787065, + "learning_rate": 1.8633629510559314e-05, + "loss": 0.5555, + "step": 1516 + }, + { + "epoch": 0.8090666666666667, + "grad_norm": 0.39023735618158406, + "learning_rate": 1.8533293376276472e-05, + "loss": 0.6145, + "step": 1517 + }, + { + "epoch": 0.8096, + "grad_norm": 0.3359493120677617, + "learning_rate": 1.8433200513945337e-05, + "loss": 0.5751, + "step": 1518 + }, + { + "epoch": 0.8101333333333334, + "grad_norm": 0.4226374951462766, + "learning_rate": 1.8333351222458407e-05, + "loss": 0.6533, + "step": 1519 + }, + { + "epoch": 0.8106666666666666, + "grad_norm": 0.47239366046471304, + "learning_rate": 1.8233745799980817e-05, + "loss": 0.7022, + "step": 1520 + }, + { + "epoch": 0.8112, + "grad_norm": 0.37894693036953175, + "learning_rate": 1.8134384543949478e-05, + "loss": 0.601, + "step": 1521 + }, + { + "epoch": 0.8117333333333333, + "grad_norm": 0.4026214658454624, + "learning_rate": 1.803526775107217e-05, + "loss": 0.6149, + "step": 1522 + }, + { + "epoch": 0.8122666666666667, + "grad_norm": 0.3897012031544754, + "learning_rate": 1.7936395717326704e-05, + "loss": 0.601, + "step": 1523 + }, + { + "epoch": 0.8128, + "grad_norm": 0.4867669436897636, + "learning_rate": 1.783776873795994e-05, + "loss": 0.6589, + "step": 1524 + }, + { + "epoch": 0.8133333333333334, + "grad_norm": 0.40006855874177877, + "learning_rate": 1.773938710748706e-05, + "loss": 0.6324, + "step": 1525 + }, + { + "epoch": 0.8138666666666666, + "grad_norm": 0.36040475737051053, + "learning_rate": 1.7641251119690505e-05, + "loss": 0.5511, + "step": 1526 + }, + { + "epoch": 0.8144, + "grad_norm": 0.39138607786669155, + "learning_rate": 1.754336106761927e-05, + "loss": 0.582, + "step": 1527 + }, + { + "epoch": 0.8149333333333333, + "grad_norm": 0.38396535899467, + "learning_rate": 1.744571724358789e-05, + "loss": 0.5841, + "step": 1528 + }, + { + "epoch": 0.8154666666666667, + "grad_norm": 0.3776215499822429, + "learning_rate": 1.7348319939175637e-05, + "loss": 0.6831, + "step": 1529 + }, + { + "epoch": 0.816, + "grad_norm": 0.43813368686973886, + "learning_rate": 1.7251169445225657e-05, + "loss": 0.6174, + "step": 1530 + }, + { + "epoch": 0.8165333333333333, + "grad_norm": 0.3841077501914417, + "learning_rate": 1.715426605184407e-05, + "loss": 0.6296, + "step": 1531 + }, + { + "epoch": 0.8170666666666667, + "grad_norm": 0.3892898735979422, + "learning_rate": 1.705761004839911e-05, + "loss": 0.6327, + "step": 1532 + }, + { + "epoch": 0.8176, + "grad_norm": 0.40353192605898125, + "learning_rate": 1.696120172352025e-05, + "loss": 0.7106, + "step": 1533 + }, + { + "epoch": 0.8181333333333334, + "grad_norm": 0.39140572566359955, + "learning_rate": 1.6865041365097435e-05, + "loss": 0.6234, + "step": 1534 + }, + { + "epoch": 0.8186666666666667, + "grad_norm": 0.4384632881438894, + "learning_rate": 1.676912926028007e-05, + "loss": 0.6934, + "step": 1535 + }, + { + "epoch": 0.8192, + "grad_norm": 0.5225492263587118, + "learning_rate": 1.6673465695476232e-05, + "loss": 0.6741, + "step": 1536 + }, + { + "epoch": 0.8197333333333333, + "grad_norm": 0.5722418748382089, + "learning_rate": 1.6578050956351886e-05, + "loss": 0.6537, + "step": 1537 + }, + { + "epoch": 0.8202666666666667, + "grad_norm": 0.40576271575043527, + "learning_rate": 1.6482885327829913e-05, + "loss": 0.6053, + "step": 1538 + }, + { + "epoch": 0.8208, + "grad_norm": 0.419476986095844, + "learning_rate": 1.6387969094089316e-05, + "loss": 0.5853, + "step": 1539 + }, + { + "epoch": 0.8213333333333334, + "grad_norm": 0.36410852383771564, + "learning_rate": 1.6293302538564382e-05, + "loss": 0.5529, + "step": 1540 + }, + { + "epoch": 0.8218666666666666, + "grad_norm": 0.4096711146858712, + "learning_rate": 1.619888594394382e-05, + "loss": 0.608, + "step": 1541 + }, + { + "epoch": 0.8224, + "grad_norm": 0.4069274461277458, + "learning_rate": 1.6104719592169902e-05, + "loss": 0.6141, + "step": 1542 + }, + { + "epoch": 0.8229333333333333, + "grad_norm": 0.36476842192995557, + "learning_rate": 1.601080376443763e-05, + "loss": 0.5893, + "step": 1543 + }, + { + "epoch": 0.8234666666666667, + "grad_norm": 0.42563347102734733, + "learning_rate": 1.5917138741193973e-05, + "loss": 0.6448, + "step": 1544 + }, + { + "epoch": 0.824, + "grad_norm": 0.4692462806578368, + "learning_rate": 1.5823724802136865e-05, + "loss": 0.592, + "step": 1545 + }, + { + "epoch": 0.8245333333333333, + "grad_norm": 0.43930859709973186, + "learning_rate": 1.573056222621453e-05, + "loss": 0.6242, + "step": 1546 + }, + { + "epoch": 0.8250666666666666, + "grad_norm": 0.45687506190316407, + "learning_rate": 1.5637651291624523e-05, + "loss": 0.6265, + "step": 1547 + }, + { + "epoch": 0.8256, + "grad_norm": 0.399945416164093, + "learning_rate": 1.5544992275813053e-05, + "loss": 0.6369, + "step": 1548 + }, + { + "epoch": 0.8261333333333334, + "grad_norm": 0.3837425889924183, + "learning_rate": 1.5452585455473977e-05, + "loss": 0.649, + "step": 1549 + }, + { + "epoch": 0.8266666666666667, + "grad_norm": 0.429907781056903, + "learning_rate": 1.536043110654809e-05, + "loss": 0.6715, + "step": 1550 + }, + { + "epoch": 0.8272, + "grad_norm": 0.39455565201868215, + "learning_rate": 1.526852950422226e-05, + "loss": 0.6036, + "step": 1551 + }, + { + "epoch": 0.8277333333333333, + "grad_norm": 0.4080082886430872, + "learning_rate": 1.5176880922928616e-05, + "loss": 0.6283, + "step": 1552 + }, + { + "epoch": 0.8282666666666667, + "grad_norm": 0.3630930563014412, + "learning_rate": 1.5085485636343755e-05, + "loss": 0.5932, + "step": 1553 + }, + { + "epoch": 0.8288, + "grad_norm": 0.37576258976580323, + "learning_rate": 1.4994343917387854e-05, + "loss": 0.5762, + "step": 1554 + }, + { + "epoch": 0.8293333333333334, + "grad_norm": 0.35183356342237204, + "learning_rate": 1.4903456038223939e-05, + "loss": 0.6029, + "step": 1555 + }, + { + "epoch": 0.8298666666666666, + "grad_norm": 0.4491934616307518, + "learning_rate": 1.4812822270257009e-05, + "loss": 0.6398, + "step": 1556 + }, + { + "epoch": 0.8304, + "grad_norm": 0.3726502213271665, + "learning_rate": 1.4722442884133214e-05, + "loss": 0.5828, + "step": 1557 + }, + { + "epoch": 0.8309333333333333, + "grad_norm": 0.40384795307175253, + "learning_rate": 1.4632318149739177e-05, + "loss": 0.5927, + "step": 1558 + }, + { + "epoch": 0.8314666666666667, + "grad_norm": 0.4366373927791251, + "learning_rate": 1.454244833620102e-05, + "loss": 0.6991, + "step": 1559 + }, + { + "epoch": 0.832, + "grad_norm": 0.39394958900157206, + "learning_rate": 1.4452833711883628e-05, + "loss": 0.6119, + "step": 1560 + }, + { + "epoch": 0.8325333333333333, + "grad_norm": 0.38678026946844857, + "learning_rate": 1.4363474544389877e-05, + "loss": 0.6334, + "step": 1561 + }, + { + "epoch": 0.8330666666666666, + "grad_norm": 0.39462507397666935, + "learning_rate": 1.4274371100559791e-05, + "loss": 0.5922, + "step": 1562 + }, + { + "epoch": 0.8336, + "grad_norm": 0.3859662830695169, + "learning_rate": 1.4185523646469822e-05, + "loss": 0.5692, + "step": 1563 + }, + { + "epoch": 0.8341333333333333, + "grad_norm": 0.38515250216530106, + "learning_rate": 1.409693244743192e-05, + "loss": 0.668, + "step": 1564 + }, + { + "epoch": 0.8346666666666667, + "grad_norm": 0.456437290699982, + "learning_rate": 1.4008597767992871e-05, + "loss": 0.6638, + "step": 1565 + }, + { + "epoch": 0.8352, + "grad_norm": 0.4168506986866765, + "learning_rate": 1.3920519871933424e-05, + "loss": 0.5815, + "step": 1566 + }, + { + "epoch": 0.8357333333333333, + "grad_norm": 0.39807632655886105, + "learning_rate": 1.3832699022267515e-05, + "loss": 0.6329, + "step": 1567 + }, + { + "epoch": 0.8362666666666667, + "grad_norm": 0.38254096155961537, + "learning_rate": 1.37451354812416e-05, + "loss": 0.661, + "step": 1568 + }, + { + "epoch": 0.8368, + "grad_norm": 0.4458411190257632, + "learning_rate": 1.3657829510333654e-05, + "loss": 0.6396, + "step": 1569 + }, + { + "epoch": 0.8373333333333334, + "grad_norm": 0.4154278363491857, + "learning_rate": 1.3570781370252582e-05, + "loss": 0.6967, + "step": 1570 + }, + { + "epoch": 0.8378666666666666, + "grad_norm": 0.40290083595644605, + "learning_rate": 1.3483991320937306e-05, + "loss": 0.651, + "step": 1571 + }, + { + "epoch": 0.8384, + "grad_norm": 0.4108782186035999, + "learning_rate": 1.339745962155613e-05, + "loss": 0.6025, + "step": 1572 + }, + { + "epoch": 0.8389333333333333, + "grad_norm": 0.367731559943917, + "learning_rate": 1.3311186530505838e-05, + "loss": 0.6344, + "step": 1573 + }, + { + "epoch": 0.8394666666666667, + "grad_norm": 0.40629503604824857, + "learning_rate": 1.322517230541096e-05, + "loss": 0.6146, + "step": 1574 + }, + { + "epoch": 0.84, + "grad_norm": 0.3645058833020978, + "learning_rate": 1.3139417203123027e-05, + "loss": 0.6137, + "step": 1575 + }, + { + "epoch": 0.8405333333333334, + "grad_norm": 0.4015338086627751, + "learning_rate": 1.30539214797198e-05, + "loss": 0.6482, + "step": 1576 + }, + { + "epoch": 0.8410666666666666, + "grad_norm": 0.4148085919365177, + "learning_rate": 1.2968685390504465e-05, + "loss": 0.6587, + "step": 1577 + }, + { + "epoch": 0.8416, + "grad_norm": 0.40470168724297817, + "learning_rate": 1.2883709190004955e-05, + "loss": 0.6203, + "step": 1578 + }, + { + "epoch": 0.8421333333333333, + "grad_norm": 0.4109070896313481, + "learning_rate": 1.2798993131973091e-05, + "loss": 0.6397, + "step": 1579 + }, + { + "epoch": 0.8426666666666667, + "grad_norm": 0.45667782401349, + "learning_rate": 1.2714537469383858e-05, + "loss": 0.6691, + "step": 1580 + }, + { + "epoch": 0.8432, + "grad_norm": 0.41273803679512366, + "learning_rate": 1.263034245443473e-05, + "loss": 0.6089, + "step": 1581 + }, + { + "epoch": 0.8437333333333333, + "grad_norm": 0.39738829331285314, + "learning_rate": 1.2546408338544769e-05, + "loss": 0.5557, + "step": 1582 + }, + { + "epoch": 0.8442666666666667, + "grad_norm": 0.4035938123952541, + "learning_rate": 1.2462735372353996e-05, + "loss": 0.6179, + "step": 1583 + }, + { + "epoch": 0.8448, + "grad_norm": 0.3592535699415328, + "learning_rate": 1.2379323805722576e-05, + "loss": 0.59, + "step": 1584 + }, + { + "epoch": 0.8453333333333334, + "grad_norm": 0.34395768642729996, + "learning_rate": 1.2296173887730123e-05, + "loss": 0.624, + "step": 1585 + }, + { + "epoch": 0.8458666666666667, + "grad_norm": 0.37888093887754987, + "learning_rate": 1.2213285866674905e-05, + "loss": 0.6166, + "step": 1586 + }, + { + "epoch": 0.8464, + "grad_norm": 0.42673984972854934, + "learning_rate": 1.2130659990073146e-05, + "loss": 0.5848, + "step": 1587 + }, + { + "epoch": 0.8469333333333333, + "grad_norm": 0.4348623379382092, + "learning_rate": 1.2048296504658207e-05, + "loss": 0.5819, + "step": 1588 + }, + { + "epoch": 0.8474666666666667, + "grad_norm": 0.4217765180561866, + "learning_rate": 1.1966195656380031e-05, + "loss": 0.6488, + "step": 1589 + }, + { + "epoch": 0.848, + "grad_norm": 0.3926592127503482, + "learning_rate": 1.1884357690404158e-05, + "loss": 0.6102, + "step": 1590 + }, + { + "epoch": 0.8485333333333334, + "grad_norm": 0.444535421100806, + "learning_rate": 1.1802782851111205e-05, + "loss": 0.5667, + "step": 1591 + }, + { + "epoch": 0.8490666666666666, + "grad_norm": 0.37800782166550384, + "learning_rate": 1.1721471382096027e-05, + "loss": 0.6631, + "step": 1592 + }, + { + "epoch": 0.8496, + "grad_norm": 0.484669796405072, + "learning_rate": 1.1640423526166988e-05, + "loss": 0.6771, + "step": 1593 + }, + { + "epoch": 0.8501333333333333, + "grad_norm": 0.34471682796148445, + "learning_rate": 1.1559639525345311e-05, + "loss": 0.6357, + "step": 1594 + }, + { + "epoch": 0.8506666666666667, + "grad_norm": 0.40101464844953705, + "learning_rate": 1.1479119620864276e-05, + "loss": 0.6635, + "step": 1595 + }, + { + "epoch": 0.8512, + "grad_norm": 0.4584298369094714, + "learning_rate": 1.1398864053168534e-05, + "loss": 0.6146, + "step": 1596 + }, + { + "epoch": 0.8517333333333333, + "grad_norm": 0.4030602211795025, + "learning_rate": 1.1318873061913405e-05, + "loss": 0.5997, + "step": 1597 + }, + { + "epoch": 0.8522666666666666, + "grad_norm": 0.4529669138975379, + "learning_rate": 1.123914688596409e-05, + "loss": 0.6184, + "step": 1598 + }, + { + "epoch": 0.8528, + "grad_norm": 0.39090019517918023, + "learning_rate": 1.1159685763395111e-05, + "loss": 0.6119, + "step": 1599 + }, + { + "epoch": 0.8533333333333334, + "grad_norm": 0.416318764923169, + "learning_rate": 1.1080489931489391e-05, + "loss": 0.6376, + "step": 1600 + }, + { + "epoch": 0.8538666666666667, + "grad_norm": 0.5807650457041063, + "learning_rate": 1.1001559626737756e-05, + "loss": 0.7556, + "step": 1601 + }, + { + "epoch": 0.8544, + "grad_norm": 0.47773024387151364, + "learning_rate": 1.0922895084838037e-05, + "loss": 0.6892, + "step": 1602 + }, + { + "epoch": 0.8549333333333333, + "grad_norm": 0.4268904491824666, + "learning_rate": 1.0844496540694515e-05, + "loss": 0.6724, + "step": 1603 + }, + { + "epoch": 0.8554666666666667, + "grad_norm": 0.4339520738304605, + "learning_rate": 1.0766364228417148e-05, + "loss": 0.6838, + "step": 1604 + }, + { + "epoch": 0.856, + "grad_norm": 0.42365839653832277, + "learning_rate": 1.0688498381320855e-05, + "loss": 0.645, + "step": 1605 + }, + { + "epoch": 0.8565333333333334, + "grad_norm": 0.45171866803310495, + "learning_rate": 1.0610899231924886e-05, + "loss": 0.6252, + "step": 1606 + }, + { + "epoch": 0.8570666666666666, + "grad_norm": 0.41195121058189305, + "learning_rate": 1.0533567011952094e-05, + "loss": 0.5671, + "step": 1607 + }, + { + "epoch": 0.8576, + "grad_norm": 0.40865072858341894, + "learning_rate": 1.045650195232819e-05, + "loss": 0.5746, + "step": 1608 + }, + { + "epoch": 0.8581333333333333, + "grad_norm": 0.36578120703551986, + "learning_rate": 1.0379704283181179e-05, + "loss": 0.5613, + "step": 1609 + }, + { + "epoch": 0.8586666666666667, + "grad_norm": 0.36830492227380884, + "learning_rate": 1.0303174233840528e-05, + "loss": 0.6237, + "step": 1610 + }, + { + "epoch": 0.8592, + "grad_norm": 0.4927935605607071, + "learning_rate": 1.0226912032836611e-05, + "loss": 0.6696, + "step": 1611 + }, + { + "epoch": 0.8597333333333333, + "grad_norm": 0.42478793778815627, + "learning_rate": 1.0150917907899926e-05, + "loss": 0.6476, + "step": 1612 + }, + { + "epoch": 0.8602666666666666, + "grad_norm": 0.39578411104719047, + "learning_rate": 1.007519208596045e-05, + "loss": 0.6248, + "step": 1613 + }, + { + "epoch": 0.8608, + "grad_norm": 0.4106414831054956, + "learning_rate": 9.999734793146998e-06, + "loss": 0.6353, + "step": 1614 + }, + { + "epoch": 0.8613333333333333, + "grad_norm": 0.3985521483275362, + "learning_rate": 9.924546254786493e-06, + "loss": 0.6148, + "step": 1615 + }, + { + "epoch": 0.8618666666666667, + "grad_norm": 0.444840586650571, + "learning_rate": 9.849626695403324e-06, + "loss": 0.6618, + "step": 1616 + }, + { + "epoch": 0.8624, + "grad_norm": 0.575592785908353, + "learning_rate": 9.774976338718677e-06, + "loss": 0.7099, + "step": 1617 + }, + { + "epoch": 0.8629333333333333, + "grad_norm": 0.3683201876374113, + "learning_rate": 9.700595407649805e-06, + "loss": 0.603, + "step": 1618 + }, + { + "epoch": 0.8634666666666667, + "grad_norm": 0.4579511056092031, + "learning_rate": 9.62648412430951e-06, + "loss": 0.628, + "step": 1619 + }, + { + "epoch": 0.864, + "grad_norm": 0.39842390606428885, + "learning_rate": 9.552642710005299e-06, + "loss": 0.6429, + "step": 1620 + }, + { + "epoch": 0.8645333333333334, + "grad_norm": 0.3891734424738252, + "learning_rate": 9.479071385238892e-06, + "loss": 0.6085, + "step": 1621 + }, + { + "epoch": 0.8650666666666667, + "grad_norm": 0.39349054248607734, + "learning_rate": 9.40577036970538e-06, + "loss": 0.6449, + "step": 1622 + }, + { + "epoch": 0.8656, + "grad_norm": 0.42839801800170924, + "learning_rate": 9.332739882292752e-06, + "loss": 0.6763, + "step": 1623 + }, + { + "epoch": 0.8661333333333333, + "grad_norm": 0.39235277618039877, + "learning_rate": 9.259980141081115e-06, + "loss": 0.639, + "step": 1624 + }, + { + "epoch": 0.8666666666666667, + "grad_norm": 0.3993459662079003, + "learning_rate": 9.187491363342093e-06, + "loss": 0.6433, + "step": 1625 + }, + { + "epoch": 0.8672, + "grad_norm": 0.41856979205822786, + "learning_rate": 9.115273765538202e-06, + "loss": 0.6068, + "step": 1626 + }, + { + "epoch": 0.8677333333333334, + "grad_norm": 0.4034791766233207, + "learning_rate": 9.043327563322112e-06, + "loss": 0.6116, + "step": 1627 + }, + { + "epoch": 0.8682666666666666, + "grad_norm": 0.40221452484703857, + "learning_rate": 8.971652971536148e-06, + "loss": 0.5847, + "step": 1628 + }, + { + "epoch": 0.8688, + "grad_norm": 0.3967911364122253, + "learning_rate": 8.900250204211514e-06, + "loss": 0.5719, + "step": 1629 + }, + { + "epoch": 0.8693333333333333, + "grad_norm": 0.47801316808208466, + "learning_rate": 8.829119474567671e-06, + "loss": 0.6542, + "step": 1630 + }, + { + "epoch": 0.8698666666666667, + "grad_norm": 0.3986920049012869, + "learning_rate": 8.758260995011825e-06, + "loss": 0.6256, + "step": 1631 + }, + { + "epoch": 0.8704, + "grad_norm": 0.38939715834170985, + "learning_rate": 8.687674977138116e-06, + "loss": 0.5897, + "step": 1632 + }, + { + "epoch": 0.8709333333333333, + "grad_norm": 0.41777207997562515, + "learning_rate": 8.617361631727138e-06, + "loss": 0.6776, + "step": 1633 + }, + { + "epoch": 0.8714666666666666, + "grad_norm": 0.37259748507667356, + "learning_rate": 8.547321168745193e-06, + "loss": 0.6007, + "step": 1634 + }, + { + "epoch": 0.872, + "grad_norm": 0.4736891450729395, + "learning_rate": 8.47755379734373e-06, + "loss": 0.647, + "step": 1635 + }, + { + "epoch": 0.8725333333333334, + "grad_norm": 0.43992339582496437, + "learning_rate": 8.408059725858719e-06, + "loss": 0.5661, + "step": 1636 + }, + { + "epoch": 0.8730666666666667, + "grad_norm": 0.44125070212055256, + "learning_rate": 8.338839161809997e-06, + "loss": 0.6368, + "step": 1637 + }, + { + "epoch": 0.8736, + "grad_norm": 0.3545513928119534, + "learning_rate": 8.269892311900696e-06, + "loss": 0.5164, + "step": 1638 + }, + { + "epoch": 0.8741333333333333, + "grad_norm": 0.5070293760527375, + "learning_rate": 8.201219382016556e-06, + "loss": 0.6184, + "step": 1639 + }, + { + "epoch": 0.8746666666666667, + "grad_norm": 0.4904896591151178, + "learning_rate": 8.132820577225387e-06, + "loss": 0.6146, + "step": 1640 + }, + { + "epoch": 0.8752, + "grad_norm": 0.3967245313182166, + "learning_rate": 8.064696101776358e-06, + "loss": 0.6031, + "step": 1641 + }, + { + "epoch": 0.8757333333333334, + "grad_norm": 0.4736125983986299, + "learning_rate": 7.996846159099557e-06, + "loss": 0.7248, + "step": 1642 + }, + { + "epoch": 0.8762666666666666, + "grad_norm": 0.35345499941635394, + "learning_rate": 7.929270951805178e-06, + "loss": 0.5785, + "step": 1643 + }, + { + "epoch": 0.8768, + "grad_norm": 0.44186475929953245, + "learning_rate": 7.861970681683051e-06, + "loss": 0.5418, + "step": 1644 + }, + { + "epoch": 0.8773333333333333, + "grad_norm": 0.3808805874759711, + "learning_rate": 7.794945549701993e-06, + "loss": 0.6139, + "step": 1645 + }, + { + "epoch": 0.8778666666666667, + "grad_norm": 0.38242530597868213, + "learning_rate": 7.728195756009204e-06, + "loss": 0.5911, + "step": 1646 + }, + { + "epoch": 0.8784, + "grad_norm": 0.5134288879788514, + "learning_rate": 7.661721499929753e-06, + "loss": 0.6435, + "step": 1647 + }, + { + "epoch": 0.8789333333333333, + "grad_norm": 0.6337127497375044, + "learning_rate": 7.595522979965819e-06, + "loss": 0.6515, + "step": 1648 + }, + { + "epoch": 0.8794666666666666, + "grad_norm": 0.42488961332511715, + "learning_rate": 7.529600393796232e-06, + "loss": 0.5878, + "step": 1649 + }, + { + "epoch": 0.88, + "grad_norm": 0.5240689106181592, + "learning_rate": 7.463953938275858e-06, + "loss": 0.7358, + "step": 1650 + }, + { + "epoch": 0.8805333333333333, + "grad_norm": 0.4718658727732926, + "learning_rate": 7.3985838094349444e-06, + "loss": 0.5768, + "step": 1651 + }, + { + "epoch": 0.8810666666666667, + "grad_norm": 0.4401430668645281, + "learning_rate": 7.333490202478666e-06, + "loss": 0.6079, + "step": 1652 + }, + { + "epoch": 0.8816, + "grad_norm": 0.47047426174415424, + "learning_rate": 7.2686733117863784e-06, + "loss": 0.6162, + "step": 1653 + }, + { + "epoch": 0.8821333333333333, + "grad_norm": 0.4339177242243759, + "learning_rate": 7.204133330911178e-06, + "loss": 0.6586, + "step": 1654 + }, + { + "epoch": 0.8826666666666667, + "grad_norm": 0.3599831842810963, + "learning_rate": 7.1398704525792e-06, + "loss": 0.5841, + "step": 1655 + }, + { + "epoch": 0.8832, + "grad_norm": 0.4099964056909604, + "learning_rate": 7.07588486868922e-06, + "loss": 0.5877, + "step": 1656 + }, + { + "epoch": 0.8837333333333334, + "grad_norm": 0.36751292737528646, + "learning_rate": 7.012176770311862e-06, + "loss": 0.6466, + "step": 1657 + }, + { + "epoch": 0.8842666666666666, + "grad_norm": 0.4165507690117416, + "learning_rate": 6.948746347689183e-06, + "loss": 0.6184, + "step": 1658 + }, + { + "epoch": 0.8848, + "grad_norm": 0.39162379094696625, + "learning_rate": 6.8855937902340576e-06, + "loss": 0.6243, + "step": 1659 + }, + { + "epoch": 0.8853333333333333, + "grad_norm": 0.4116873277237225, + "learning_rate": 6.8227192865295995e-06, + "loss": 0.623, + "step": 1660 + }, + { + "epoch": 0.8858666666666667, + "grad_norm": 0.4210521732498824, + "learning_rate": 6.760123024328624e-06, + "loss": 0.6184, + "step": 1661 + }, + { + "epoch": 0.8864, + "grad_norm": 0.4491235615447384, + "learning_rate": 6.6978051905530855e-06, + "loss": 0.6641, + "step": 1662 + }, + { + "epoch": 0.8869333333333334, + "grad_norm": 0.5129827043466911, + "learning_rate": 6.635765971293484e-06, + "loss": 0.7014, + "step": 1663 + }, + { + "epoch": 0.8874666666666666, + "grad_norm": 0.41375745180416124, + "learning_rate": 6.5740055518083375e-06, + "loss": 0.5795, + "step": 1664 + }, + { + "epoch": 0.888, + "grad_norm": 0.3943462641779985, + "learning_rate": 6.512524116523633e-06, + "loss": 0.6885, + "step": 1665 + }, + { + "epoch": 0.8885333333333333, + "grad_norm": 0.41858767027493393, + "learning_rate": 6.451321849032288e-06, + "loss": 0.6441, + "step": 1666 + }, + { + "epoch": 0.8890666666666667, + "grad_norm": 0.42409503107067126, + "learning_rate": 6.390398932093555e-06, + "loss": 0.6457, + "step": 1667 + }, + { + "epoch": 0.8896, + "grad_norm": 0.4231058612450969, + "learning_rate": 6.329755547632499e-06, + "loss": 0.6785, + "step": 1668 + }, + { + "epoch": 0.8901333333333333, + "grad_norm": 0.3960780998009343, + "learning_rate": 6.269391876739495e-06, + "loss": 0.6123, + "step": 1669 + }, + { + "epoch": 0.8906666666666667, + "grad_norm": 0.3742043328992552, + "learning_rate": 6.209308099669597e-06, + "loss": 0.6058, + "step": 1670 + }, + { + "epoch": 0.8912, + "grad_norm": 0.4736101999046393, + "learning_rate": 6.149504395842087e-06, + "loss": 0.6489, + "step": 1671 + }, + { + "epoch": 0.8917333333333334, + "grad_norm": 0.46955758979360435, + "learning_rate": 6.089980943839924e-06, + "loss": 0.6478, + "step": 1672 + }, + { + "epoch": 0.8922666666666667, + "grad_norm": 0.7634695841735873, + "learning_rate": 6.030737921409169e-06, + "loss": 0.618, + "step": 1673 + }, + { + "epoch": 0.8928, + "grad_norm": 0.4345938737690243, + "learning_rate": 5.971775505458444e-06, + "loss": 0.7012, + "step": 1674 + }, + { + "epoch": 0.8933333333333333, + "grad_norm": 0.40090791844884754, + "learning_rate": 5.913093872058528e-06, + "loss": 0.5792, + "step": 1675 + }, + { + "epoch": 0.8938666666666667, + "grad_norm": 0.4950215877037177, + "learning_rate": 5.854693196441641e-06, + "loss": 0.6744, + "step": 1676 + }, + { + "epoch": 0.8944, + "grad_norm": 0.3709300557238186, + "learning_rate": 5.7965736530010916e-06, + "loss": 0.6294, + "step": 1677 + }, + { + "epoch": 0.8949333333333334, + "grad_norm": 0.40671767753834, + "learning_rate": 5.738735415290642e-06, + "loss": 0.6353, + "step": 1678 + }, + { + "epoch": 0.8954666666666666, + "grad_norm": 0.4308083112903099, + "learning_rate": 5.681178656024055e-06, + "loss": 0.6227, + "step": 1679 + }, + { + "epoch": 0.896, + "grad_norm": 0.4291744547709792, + "learning_rate": 5.623903547074549e-06, + "loss": 0.5853, + "step": 1680 + }, + { + "epoch": 0.8965333333333333, + "grad_norm": 0.4476219517321725, + "learning_rate": 5.566910259474289e-06, + "loss": 0.6051, + "step": 1681 + }, + { + "epoch": 0.8970666666666667, + "grad_norm": 0.45810225605925653, + "learning_rate": 5.510198963413881e-06, + "loss": 0.6894, + "step": 1682 + }, + { + "epoch": 0.8976, + "grad_norm": 0.36983347197203204, + "learning_rate": 5.453769828241872e-06, + "loss": 0.6412, + "step": 1683 + }, + { + "epoch": 0.8981333333333333, + "grad_norm": 0.4293497945626058, + "learning_rate": 5.397623022464226e-06, + "loss": 0.6402, + "step": 1684 + }, + { + "epoch": 0.8986666666666666, + "grad_norm": 0.5074833976497126, + "learning_rate": 5.341758713743828e-06, + "loss": 0.6519, + "step": 1685 + }, + { + "epoch": 0.8992, + "grad_norm": 0.38837993512590113, + "learning_rate": 5.286177068899989e-06, + "loss": 0.5602, + "step": 1686 + }, + { + "epoch": 0.8997333333333334, + "grad_norm": 0.4493312016772392, + "learning_rate": 5.230878253907912e-06, + "loss": 0.6596, + "step": 1687 + }, + { + "epoch": 0.9002666666666667, + "grad_norm": 0.3745228625348613, + "learning_rate": 5.175862433898282e-06, + "loss": 0.6226, + "step": 1688 + }, + { + "epoch": 0.9008, + "grad_norm": 0.3926946003367579, + "learning_rate": 5.121129773156663e-06, + "loss": 0.6605, + "step": 1689 + }, + { + "epoch": 0.9013333333333333, + "grad_norm": 0.4031266176315358, + "learning_rate": 5.066680435123106e-06, + "loss": 0.6182, + "step": 1690 + }, + { + "epoch": 0.9018666666666667, + "grad_norm": 0.46302782865461184, + "learning_rate": 5.012514582391592e-06, + "loss": 0.6093, + "step": 1691 + }, + { + "epoch": 0.9024, + "grad_norm": 0.4023029515757742, + "learning_rate": 4.95863237670956e-06, + "loss": 0.5404, + "step": 1692 + }, + { + "epoch": 0.9029333333333334, + "grad_norm": 0.4345522125756673, + "learning_rate": 4.905033978977491e-06, + "loss": 0.5981, + "step": 1693 + }, + { + "epoch": 0.9034666666666666, + "grad_norm": 0.4018408074306652, + "learning_rate": 4.851719549248301e-06, + "loss": 0.6337, + "step": 1694 + }, + { + "epoch": 0.904, + "grad_norm": 0.43128450719572176, + "learning_rate": 4.798689246727006e-06, + "loss": 0.6143, + "step": 1695 + }, + { + "epoch": 0.9045333333333333, + "grad_norm": 0.4276601320324969, + "learning_rate": 4.745943229770122e-06, + "loss": 0.6236, + "step": 1696 + }, + { + "epoch": 0.9050666666666667, + "grad_norm": 0.4078760862656175, + "learning_rate": 4.693481655885257e-06, + "loss": 0.6019, + "step": 1697 + }, + { + "epoch": 0.9056, + "grad_norm": 0.4146378365591098, + "learning_rate": 4.641304681730641e-06, + "loss": 0.6047, + "step": 1698 + }, + { + "epoch": 0.9061333333333333, + "grad_norm": 0.38866854543260576, + "learning_rate": 4.58941246311464e-06, + "loss": 0.6236, + "step": 1699 + }, + { + "epoch": 0.9066666666666666, + "grad_norm": 0.41247922047810026, + "learning_rate": 4.537805154995278e-06, + "loss": 0.6462, + "step": 1700 + }, + { + "epoch": 0.9072, + "grad_norm": 0.32855235101787567, + "learning_rate": 4.486482911479839e-06, + "loss": 0.5745, + "step": 1701 + }, + { + "epoch": 0.9077333333333333, + "grad_norm": 0.3532860254011047, + "learning_rate": 4.435445885824285e-06, + "loss": 0.5772, + "step": 1702 + }, + { + "epoch": 0.9082666666666667, + "grad_norm": 0.3960725048956789, + "learning_rate": 4.384694230432984e-06, + "loss": 0.5992, + "step": 1703 + }, + { + "epoch": 0.9088, + "grad_norm": 0.3577972714817226, + "learning_rate": 4.3342280968580285e-06, + "loss": 0.6381, + "step": 1704 + }, + { + "epoch": 0.9093333333333333, + "grad_norm": 0.4057899771618968, + "learning_rate": 4.2840476357989825e-06, + "loss": 0.631, + "step": 1705 + }, + { + "epoch": 0.9098666666666667, + "grad_norm": 0.4259913592867645, + "learning_rate": 4.2341529971023255e-06, + "loss": 0.623, + "step": 1706 + }, + { + "epoch": 0.9104, + "grad_norm": 0.46727949277813496, + "learning_rate": 4.184544329761009e-06, + "loss": 0.711, + "step": 1707 + }, + { + "epoch": 0.9109333333333334, + "grad_norm": 0.43382358933471343, + "learning_rate": 4.135221781914034e-06, + "loss": 0.6746, + "step": 1708 + }, + { + "epoch": 0.9114666666666666, + "grad_norm": 0.4050494645563292, + "learning_rate": 4.0861855008460405e-06, + "loss": 0.5776, + "step": 1709 + }, + { + "epoch": 0.912, + "grad_norm": 0.39697118504461587, + "learning_rate": 4.037435632986786e-06, + "loss": 0.6343, + "step": 1710 + }, + { + "epoch": 0.9125333333333333, + "grad_norm": 0.41634897817309924, + "learning_rate": 3.988972323910778e-06, + "loss": 0.6555, + "step": 1711 + }, + { + "epoch": 0.9130666666666667, + "grad_norm": 0.3592250048605172, + "learning_rate": 3.9407957183368095e-06, + "loss": 0.5408, + "step": 1712 + }, + { + "epoch": 0.9136, + "grad_norm": 0.3943800527241845, + "learning_rate": 3.892905960127546e-06, + "loss": 0.5769, + "step": 1713 + }, + { + "epoch": 0.9141333333333334, + "grad_norm": 0.3899844201519674, + "learning_rate": 3.845303192289074e-06, + "loss": 0.574, + "step": 1714 + }, + { + "epoch": 0.9146666666666666, + "grad_norm": 0.42674209778020844, + "learning_rate": 3.797987556970495e-06, + "loss": 0.5894, + "step": 1715 + }, + { + "epoch": 0.9152, + "grad_norm": 0.44357777850984076, + "learning_rate": 3.750959195463466e-06, + "loss": 0.6398, + "step": 1716 + }, + { + "epoch": 0.9157333333333333, + "grad_norm": 0.4264535946350911, + "learning_rate": 3.7042182482018075e-06, + "loss": 0.6269, + "step": 1717 + }, + { + "epoch": 0.9162666666666667, + "grad_norm": 0.4433193047328518, + "learning_rate": 3.6577648547611033e-06, + "loss": 0.6676, + "step": 1718 + }, + { + "epoch": 0.9168, + "grad_norm": 0.4659300803923899, + "learning_rate": 3.611599153858214e-06, + "loss": 0.6006, + "step": 1719 + }, + { + "epoch": 0.9173333333333333, + "grad_norm": 0.4280070596656568, + "learning_rate": 3.565721283350931e-06, + "loss": 0.6374, + "step": 1720 + }, + { + "epoch": 0.9178666666666667, + "grad_norm": 0.43230543719081344, + "learning_rate": 3.5201313802375456e-06, + "loss": 0.6566, + "step": 1721 + }, + { + "epoch": 0.9184, + "grad_norm": 0.4659420349151, + "learning_rate": 3.4748295806564356e-06, + "loss": 0.6639, + "step": 1722 + }, + { + "epoch": 0.9189333333333334, + "grad_norm": 0.4456625626122343, + "learning_rate": 3.4298160198856568e-06, + "loss": 0.6279, + "step": 1723 + }, + { + "epoch": 0.9194666666666667, + "grad_norm": 0.3314495580489305, + "learning_rate": 3.3850908323424967e-06, + "loss": 0.5381, + "step": 1724 + }, + { + "epoch": 0.92, + "grad_norm": 0.41115079767153645, + "learning_rate": 3.3406541515832003e-06, + "loss": 0.6671, + "step": 1725 + }, + { + "epoch": 0.9205333333333333, + "grad_norm": 0.37481290351521623, + "learning_rate": 3.296506110302422e-06, + "loss": 0.6031, + "step": 1726 + }, + { + "epoch": 0.9210666666666667, + "grad_norm": 0.34037181057619237, + "learning_rate": 3.252646840332918e-06, + "loss": 0.5686, + "step": 1727 + }, + { + "epoch": 0.9216, + "grad_norm": 0.46170967564430004, + "learning_rate": 3.209076472645112e-06, + "loss": 0.669, + "step": 1728 + }, + { + "epoch": 0.9221333333333334, + "grad_norm": 0.3790339373028516, + "learning_rate": 3.1657951373467497e-06, + "loss": 0.6123, + "step": 1729 + }, + { + "epoch": 0.9226666666666666, + "grad_norm": 0.3954633192139074, + "learning_rate": 3.1228029636824475e-06, + "loss": 0.6229, + "step": 1730 + }, + { + "epoch": 0.9232, + "grad_norm": 0.433350783415259, + "learning_rate": 3.0801000800333877e-06, + "loss": 0.6782, + "step": 1731 + }, + { + "epoch": 0.9237333333333333, + "grad_norm": 0.4321933831959839, + "learning_rate": 3.037686613916857e-06, + "loss": 0.7233, + "step": 1732 + }, + { + "epoch": 0.9242666666666667, + "grad_norm": 0.4337168503950566, + "learning_rate": 2.995562691985898e-06, + "loss": 0.6596, + "step": 1733 + }, + { + "epoch": 0.9248, + "grad_norm": 0.4626165639483129, + "learning_rate": 2.9537284400289355e-06, + "loss": 0.628, + "step": 1734 + }, + { + "epoch": 0.9253333333333333, + "grad_norm": 0.39743699028389945, + "learning_rate": 2.912183982969385e-06, + "loss": 0.6195, + "step": 1735 + }, + { + "epoch": 0.9258666666666666, + "grad_norm": 0.3519174310870393, + "learning_rate": 2.8709294448653225e-06, + "loss": 0.5934, + "step": 1736 + }, + { + "epoch": 0.9264, + "grad_norm": 0.40373434760817883, + "learning_rate": 2.8299649489090475e-06, + "loss": 0.6688, + "step": 1737 + }, + { + "epoch": 0.9269333333333334, + "grad_norm": 0.4004436444904916, + "learning_rate": 2.789290617426765e-06, + "loss": 0.6117, + "step": 1738 + }, + { + "epoch": 0.9274666666666667, + "grad_norm": 0.45114807122599826, + "learning_rate": 2.748906571878207e-06, + "loss": 0.6581, + "step": 1739 + }, + { + "epoch": 0.928, + "grad_norm": 0.42634126146566326, + "learning_rate": 2.708812932856253e-06, + "loss": 0.6241, + "step": 1740 + }, + { + "epoch": 0.9285333333333333, + "grad_norm": 0.4408191682479045, + "learning_rate": 2.6690098200866098e-06, + "loss": 0.7201, + "step": 1741 + }, + { + "epoch": 0.9290666666666667, + "grad_norm": 0.37813244503353843, + "learning_rate": 2.6294973524274125e-06, + "loss": 0.5779, + "step": 1742 + }, + { + "epoch": 0.9296, + "grad_norm": 0.43341126939368974, + "learning_rate": 2.590275647868867e-06, + "loss": 0.6321, + "step": 1743 + }, + { + "epoch": 0.9301333333333334, + "grad_norm": 0.4182754317940364, + "learning_rate": 2.551344823532964e-06, + "loss": 0.5721, + "step": 1744 + }, + { + "epoch": 0.9306666666666666, + "grad_norm": 0.38225025649677735, + "learning_rate": 2.5127049956730207e-06, + "loss": 0.5936, + "step": 1745 + }, + { + "epoch": 0.9312, + "grad_norm": 0.40240998851253346, + "learning_rate": 2.4743562796734622e-06, + "loss": 0.5944, + "step": 1746 + }, + { + "epoch": 0.9317333333333333, + "grad_norm": 0.499859620034534, + "learning_rate": 2.436298790049363e-06, + "loss": 0.6874, + "step": 1747 + }, + { + "epoch": 0.9322666666666667, + "grad_norm": 0.3739439660247912, + "learning_rate": 2.3985326404461604e-06, + "loss": 0.5971, + "step": 1748 + }, + { + "epoch": 0.9328, + "grad_norm": 0.37083645714936647, + "learning_rate": 2.3610579436393e-06, + "loss": 0.6019, + "step": 1749 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 0.457070008074725, + "learning_rate": 2.3238748115339324e-06, + "loss": 0.6075, + "step": 1750 + }, + { + "epoch": 0.9338666666666666, + "grad_norm": 0.3940632112896435, + "learning_rate": 2.286983355164529e-06, + "loss": 0.608, + "step": 1751 + }, + { + "epoch": 0.9344, + "grad_norm": 0.45402641693876467, + "learning_rate": 2.250383684694579e-06, + "loss": 0.7006, + "step": 1752 + }, + { + "epoch": 0.9349333333333333, + "grad_norm": 0.4380311216268415, + "learning_rate": 2.2140759094162467e-06, + "loss": 0.6746, + "step": 1753 + }, + { + "epoch": 0.9354666666666667, + "grad_norm": 0.4253395986872956, + "learning_rate": 2.178060137750071e-06, + "loss": 0.5907, + "step": 1754 + }, + { + "epoch": 0.936, + "grad_norm": 0.534865339392156, + "learning_rate": 2.1423364772445887e-06, + "loss": 0.6861, + "step": 1755 + }, + { + "epoch": 0.9365333333333333, + "grad_norm": 0.3965521221806843, + "learning_rate": 2.106905034576112e-06, + "loss": 0.5446, + "step": 1756 + }, + { + "epoch": 0.9370666666666667, + "grad_norm": 0.3638765529336139, + "learning_rate": 2.0717659155482738e-06, + "loss": 0.6358, + "step": 1757 + }, + { + "epoch": 0.9376, + "grad_norm": 0.41302575114306894, + "learning_rate": 2.036919225091827e-06, + "loss": 0.6443, + "step": 1758 + }, + { + "epoch": 0.9381333333333334, + "grad_norm": 0.3856587072635981, + "learning_rate": 2.002365067264289e-06, + "loss": 0.6731, + "step": 1759 + }, + { + "epoch": 0.9386666666666666, + "grad_norm": 0.4184150231637184, + "learning_rate": 1.968103545249611e-06, + "loss": 0.5624, + "step": 1760 + }, + { + "epoch": 0.9392, + "grad_norm": 0.4129135611595348, + "learning_rate": 1.9341347613579087e-06, + "loss": 0.652, + "step": 1761 + }, + { + "epoch": 0.9397333333333333, + "grad_norm": 0.41559419100228573, + "learning_rate": 1.900458817025097e-06, + "loss": 0.6524, + "step": 1762 + }, + { + "epoch": 0.9402666666666667, + "grad_norm": 0.43875770711272727, + "learning_rate": 1.8670758128126909e-06, + "loss": 0.6379, + "step": 1763 + }, + { + "epoch": 0.9408, + "grad_norm": 0.4235053002596906, + "learning_rate": 1.8339858484073935e-06, + "loss": 0.6376, + "step": 1764 + }, + { + "epoch": 0.9413333333333334, + "grad_norm": 0.43980229694413764, + "learning_rate": 1.8011890226208527e-06, + "loss": 0.6442, + "step": 1765 + }, + { + "epoch": 0.9418666666666666, + "grad_norm": 0.44154118079189997, + "learning_rate": 1.7686854333893833e-06, + "loss": 0.5812, + "step": 1766 + }, + { + "epoch": 0.9424, + "grad_norm": 0.4212359075612435, + "learning_rate": 1.7364751777736332e-06, + "loss": 0.5912, + "step": 1767 + }, + { + "epoch": 0.9429333333333333, + "grad_norm": 0.3833256754274344, + "learning_rate": 1.7045583519583074e-06, + "loss": 0.6238, + "step": 1768 + }, + { + "epoch": 0.9434666666666667, + "grad_norm": 0.3868226520224876, + "learning_rate": 1.6729350512519005e-06, + "loss": 0.6138, + "step": 1769 + }, + { + "epoch": 0.944, + "grad_norm": 0.4562045231270473, + "learning_rate": 1.6416053700863964e-06, + "loss": 0.5786, + "step": 1770 + }, + { + "epoch": 0.9445333333333333, + "grad_norm": 0.4313624999644069, + "learning_rate": 1.6105694020169593e-06, + "loss": 0.6279, + "step": 1771 + }, + { + "epoch": 0.9450666666666667, + "grad_norm": 0.3779659342751667, + "learning_rate": 1.5798272397217095e-06, + "loss": 0.6687, + "step": 1772 + }, + { + "epoch": 0.9456, + "grad_norm": 0.43331408451293585, + "learning_rate": 1.5493789750014031e-06, + "loss": 0.6461, + "step": 1773 + }, + { + "epoch": 0.9461333333333334, + "grad_norm": 0.4561877150242593, + "learning_rate": 1.5192246987791981e-06, + "loss": 0.6213, + "step": 1774 + }, + { + "epoch": 0.9466666666666667, + "grad_norm": 0.41568795408595843, + "learning_rate": 1.489364501100332e-06, + "loss": 0.5681, + "step": 1775 + }, + { + "epoch": 0.9472, + "grad_norm": 0.41666064890528937, + "learning_rate": 1.459798471131868e-06, + "loss": 0.608, + "step": 1776 + }, + { + "epoch": 0.9477333333333333, + "grad_norm": 0.35051443835931156, + "learning_rate": 1.430526697162482e-06, + "loss": 0.5398, + "step": 1777 + }, + { + "epoch": 0.9482666666666667, + "grad_norm": 0.43540457235463903, + "learning_rate": 1.4015492666021312e-06, + "loss": 0.7033, + "step": 1778 + }, + { + "epoch": 0.9488, + "grad_norm": 0.4239455387270707, + "learning_rate": 1.3728662659818204e-06, + "loss": 0.647, + "step": 1779 + }, + { + "epoch": 0.9493333333333334, + "grad_norm": 0.42480969329749607, + "learning_rate": 1.344477780953346e-06, + "loss": 0.619, + "step": 1780 + }, + { + "epoch": 0.9498666666666666, + "grad_norm": 0.4548466775430133, + "learning_rate": 1.3163838962890195e-06, + "loss": 0.6261, + "step": 1781 + }, + { + "epoch": 0.9504, + "grad_norm": 0.4474854245936277, + "learning_rate": 1.2885846958814673e-06, + "loss": 0.6213, + "step": 1782 + }, + { + "epoch": 0.9509333333333333, + "grad_norm": 0.4400101674067497, + "learning_rate": 1.261080262743297e-06, + "loss": 0.6145, + "step": 1783 + }, + { + "epoch": 0.9514666666666667, + "grad_norm": 0.3875863845126434, + "learning_rate": 1.2338706790069431e-06, + "loss": 0.5775, + "step": 1784 + }, + { + "epoch": 0.952, + "grad_norm": 0.4777366702556033, + "learning_rate": 1.2069560259243328e-06, + "loss": 0.6843, + "step": 1785 + }, + { + "epoch": 0.9525333333333333, + "grad_norm": 0.4866754091612041, + "learning_rate": 1.1803363838667092e-06, + "loss": 0.6528, + "step": 1786 + }, + { + "epoch": 0.9530666666666666, + "grad_norm": 0.613569179778805, + "learning_rate": 1.1540118323243865e-06, + "loss": 0.5573, + "step": 1787 + }, + { + "epoch": 0.9536, + "grad_norm": 0.40498745819952764, + "learning_rate": 1.1279824499064396e-06, + "loss": 0.6056, + "step": 1788 + }, + { + "epoch": 0.9541333333333334, + "grad_norm": 0.413474162763506, + "learning_rate": 1.1022483143405705e-06, + "loss": 0.6025, + "step": 1789 + }, + { + "epoch": 0.9546666666666667, + "grad_norm": 0.44597027165955117, + "learning_rate": 1.076809502472831e-06, + "loss": 0.6104, + "step": 1790 + }, + { + "epoch": 0.9552, + "grad_norm": 0.4064801311758751, + "learning_rate": 1.0516660902673448e-06, + "loss": 0.6437, + "step": 1791 + }, + { + "epoch": 0.9557333333333333, + "grad_norm": 0.41690486845907654, + "learning_rate": 1.0268181528061749e-06, + "loss": 0.6525, + "step": 1792 + }, + { + "epoch": 0.9562666666666667, + "grad_norm": 0.4516547281715239, + "learning_rate": 1.0022657642890231e-06, + "loss": 0.6222, + "step": 1793 + }, + { + "epoch": 0.9568, + "grad_norm": 0.43088246059304064, + "learning_rate": 9.780089980330642e-07, + "loss": 0.6326, + "step": 1794 + }, + { + "epoch": 0.9573333333333334, + "grad_norm": 0.3937326241318572, + "learning_rate": 9.540479264726676e-07, + "loss": 0.6102, + "step": 1795 + }, + { + "epoch": 0.9578666666666666, + "grad_norm": 0.4179881256944634, + "learning_rate": 9.303826211592315e-07, + "loss": 0.6256, + "step": 1796 + }, + { + "epoch": 0.9584, + "grad_norm": 0.3752653805864915, + "learning_rate": 9.070131527609604e-07, + "loss": 0.5775, + "step": 1797 + }, + { + "epoch": 0.9589333333333333, + "grad_norm": 0.3374245664346441, + "learning_rate": 8.839395910626213e-07, + "loss": 0.6107, + "step": 1798 + }, + { + "epoch": 0.9594666666666667, + "grad_norm": 0.39522870952333566, + "learning_rate": 8.611620049653879e-07, + "loss": 0.5733, + "step": 1799 + }, + { + "epoch": 0.96, + "grad_norm": 0.4943357520654654, + "learning_rate": 8.386804624865851e-07, + "loss": 0.6241, + "step": 1800 + }, + { + "epoch": 0.9605333333333334, + "grad_norm": 0.4213764358629093, + "learning_rate": 8.16495030759501e-07, + "loss": 0.5992, + "step": 1801 + }, + { + "epoch": 0.9610666666666666, + "grad_norm": 0.5586452322644974, + "learning_rate": 7.946057760332193e-07, + "loss": 0.5766, + "step": 1802 + }, + { + "epoch": 0.9616, + "grad_norm": 0.3824768095593952, + "learning_rate": 7.730127636723539e-07, + "loss": 0.6512, + "step": 1803 + }, + { + "epoch": 0.9621333333333333, + "grad_norm": 0.43902398729396896, + "learning_rate": 7.517160581569372e-07, + "loss": 0.6067, + "step": 1804 + }, + { + "epoch": 0.9626666666666667, + "grad_norm": 0.39133704391079804, + "learning_rate": 7.307157230821426e-07, + "loss": 0.6296, + "step": 1805 + }, + { + "epoch": 0.9632, + "grad_norm": 0.39219399405215594, + "learning_rate": 7.100118211581852e-07, + "loss": 0.6471, + "step": 1806 + }, + { + "epoch": 0.9637333333333333, + "grad_norm": 0.3877358568441335, + "learning_rate": 6.896044142100433e-07, + "loss": 0.6099, + "step": 1807 + }, + { + "epoch": 0.9642666666666667, + "grad_norm": 0.39164164231563814, + "learning_rate": 6.694935631773258e-07, + "loss": 0.5876, + "step": 1808 + }, + { + "epoch": 0.9648, + "grad_norm": 0.403644328513689, + "learning_rate": 6.496793281141056e-07, + "loss": 0.6149, + "step": 1809 + }, + { + "epoch": 0.9653333333333334, + "grad_norm": 0.5572673393766915, + "learning_rate": 6.301617681886863e-07, + "loss": 0.62, + "step": 1810 + }, + { + "epoch": 0.9658666666666667, + "grad_norm": 0.48844084356238204, + "learning_rate": 6.109409416834688e-07, + "loss": 0.6791, + "step": 1811 + }, + { + "epoch": 0.9664, + "grad_norm": 0.4106253067003294, + "learning_rate": 5.920169059947411e-07, + "loss": 0.5834, + "step": 1812 + }, + { + "epoch": 0.9669333333333333, + "grad_norm": 0.5232318447915651, + "learning_rate": 5.733897176325665e-07, + "loss": 0.6561, + "step": 1813 + }, + { + "epoch": 0.9674666666666667, + "grad_norm": 0.4081165885957195, + "learning_rate": 5.550594322205504e-07, + "loss": 0.604, + "step": 1814 + }, + { + "epoch": 0.968, + "grad_norm": 0.4384446679985943, + "learning_rate": 5.370261044956971e-07, + "loss": 0.6374, + "step": 1815 + }, + { + "epoch": 0.9685333333333334, + "grad_norm": 0.3793257951017932, + "learning_rate": 5.192897883082747e-07, + "loss": 0.6427, + "step": 1816 + }, + { + "epoch": 0.9690666666666666, + "grad_norm": 0.37436951514651023, + "learning_rate": 5.018505366216175e-07, + "loss": 0.5791, + "step": 1817 + }, + { + "epoch": 0.9696, + "grad_norm": 0.6507421966514167, + "learning_rate": 4.847084015119574e-07, + "loss": 0.6979, + "step": 1818 + }, + { + "epoch": 0.9701333333333333, + "grad_norm": 0.414392189643734, + "learning_rate": 4.678634341683252e-07, + "loss": 0.6493, + "step": 1819 + }, + { + "epoch": 0.9706666666666667, + "grad_norm": 0.3898163698700003, + "learning_rate": 4.5131568489236166e-07, + "loss": 0.6263, + "step": 1820 + }, + { + "epoch": 0.9712, + "grad_norm": 0.43892173194381723, + "learning_rate": 4.3506520309813947e-07, + "loss": 0.6251, + "step": 1821 + }, + { + "epoch": 0.9717333333333333, + "grad_norm": 0.4061868178940558, + "learning_rate": 4.191120373120749e-07, + "loss": 0.5776, + "step": 1822 + }, + { + "epoch": 0.9722666666666666, + "grad_norm": 0.561940286841165, + "learning_rate": 4.034562351727389e-07, + "loss": 0.5919, + "step": 1823 + }, + { + "epoch": 0.9728, + "grad_norm": 0.4444289938582935, + "learning_rate": 3.8809784343072366e-07, + "loss": 0.5448, + "step": 1824 + }, + { + "epoch": 0.9733333333333334, + "grad_norm": 0.43147072910826345, + "learning_rate": 3.73036907948543e-07, + "loss": 0.6355, + "step": 1825 + }, + { + "epoch": 0.9738666666666667, + "grad_norm": 0.37023411773954457, + "learning_rate": 3.582734737004101e-07, + "loss": 0.5794, + "step": 1826 + }, + { + "epoch": 0.9744, + "grad_norm": 0.4362952651794109, + "learning_rate": 3.4380758477219333e-07, + "loss": 0.669, + "step": 1827 + }, + { + "epoch": 0.9749333333333333, + "grad_norm": 0.3515469215393279, + "learning_rate": 3.296392843612273e-07, + "loss": 0.5957, + "step": 1828 + }, + { + "epoch": 0.9754666666666667, + "grad_norm": 0.4974092478566155, + "learning_rate": 3.1576861477621287e-07, + "loss": 0.5985, + "step": 1829 + }, + { + "epoch": 0.976, + "grad_norm": 0.3808216459432855, + "learning_rate": 3.0219561743707326e-07, + "loss": 0.6183, + "step": 1830 + }, + { + "epoch": 0.9765333333333334, + "grad_norm": 0.38782983278814614, + "learning_rate": 2.889203328748424e-07, + "loss": 0.645, + "step": 1831 + }, + { + "epoch": 0.9770666666666666, + "grad_norm": 0.3849222818790878, + "learning_rate": 2.759428007315212e-07, + "loss": 0.5895, + "step": 1832 + }, + { + "epoch": 0.9776, + "grad_norm": 0.43622022139414524, + "learning_rate": 2.6326305976001055e-07, + "loss": 0.6589, + "step": 1833 + }, + { + "epoch": 0.9781333333333333, + "grad_norm": 0.38740406391785887, + "learning_rate": 2.5088114782392257e-07, + "loss": 0.548, + "step": 1834 + }, + { + "epoch": 0.9786666666666667, + "grad_norm": 0.3775418121317073, + "learning_rate": 2.3879710189753656e-07, + "loss": 0.5708, + "step": 1835 + }, + { + "epoch": 0.9792, + "grad_norm": 0.37899514424739766, + "learning_rate": 2.2701095806565432e-07, + "loss": 0.6044, + "step": 1836 + }, + { + "epoch": 0.9797333333333333, + "grad_norm": 0.3952366771236291, + "learning_rate": 2.15522751523467e-07, + "loss": 0.6132, + "step": 1837 + }, + { + "epoch": 0.9802666666666666, + "grad_norm": 0.45795803936713125, + "learning_rate": 2.0433251657653308e-07, + "loss": 0.5841, + "step": 1838 + }, + { + "epoch": 0.9808, + "grad_norm": 0.39120816038265205, + "learning_rate": 1.9344028664056713e-07, + "loss": 0.5932, + "step": 1839 + }, + { + "epoch": 0.9813333333333333, + "grad_norm": 0.45997035320821184, + "learning_rate": 1.8284609424142895e-07, + "loss": 0.6514, + "step": 1840 + }, + { + "epoch": 0.9818666666666667, + "grad_norm": 0.3990009793873229, + "learning_rate": 1.7254997101500137e-07, + "loss": 0.6181, + "step": 1841 + }, + { + "epoch": 0.9824, + "grad_norm": 0.4078619114901513, + "learning_rate": 1.6255194770704586e-07, + "loss": 0.6127, + "step": 1842 + }, + { + "epoch": 0.9829333333333333, + "grad_norm": 0.3757411604027671, + "learning_rate": 1.5285205417319149e-07, + "loss": 0.6095, + "step": 1843 + }, + { + "epoch": 0.9834666666666667, + "grad_norm": 0.38505502173080025, + "learning_rate": 1.4345031937879062e-07, + "loss": 0.6205, + "step": 1844 + }, + { + "epoch": 0.984, + "grad_norm": 0.46882005313831376, + "learning_rate": 1.3434677139885222e-07, + "loss": 0.6354, + "step": 1845 + }, + { + "epoch": 0.9845333333333334, + "grad_norm": 0.4170014431891796, + "learning_rate": 1.255414374179531e-07, + "loss": 0.6405, + "step": 1846 + }, + { + "epoch": 0.9850666666666666, + "grad_norm": 0.3966189547801234, + "learning_rate": 1.170343437301491e-07, + "loss": 0.6008, + "step": 1847 + }, + { + "epoch": 0.9856, + "grad_norm": 0.3802848692980568, + "learning_rate": 1.0882551573891953e-07, + "loss": 0.5816, + "step": 1848 + }, + { + "epoch": 0.9861333333333333, + "grad_norm": 0.4275208205666537, + "learning_rate": 1.0091497795706728e-07, + "loss": 0.6343, + "step": 1849 + }, + { + "epoch": 0.9866666666666667, + "grad_norm": 0.41248547085907605, + "learning_rate": 9.330275400666332e-08, + "loss": 0.5959, + "step": 1850 + }, + { + "epoch": 0.9872, + "grad_norm": 0.4283474840922225, + "learning_rate": 8.598886661895788e-08, + "loss": 0.6231, + "step": 1851 + }, + { + "epoch": 0.9877333333333334, + "grad_norm": 0.4245547427079539, + "learning_rate": 7.8973337634336e-08, + "loss": 0.6201, + "step": 1852 + }, + { + "epoch": 0.9882666666666666, + "grad_norm": 0.3908314103669169, + "learning_rate": 7.225618800222877e-08, + "loss": 0.5737, + "step": 1853 + }, + { + "epoch": 0.9888, + "grad_norm": 0.34079404885288034, + "learning_rate": 6.583743778106887e-08, + "loss": 0.5836, + "step": 1854 + }, + { + "epoch": 0.9893333333333333, + "grad_norm": 0.40286549864423876, + "learning_rate": 5.971710613821291e-08, + "loss": 0.5945, + "step": 1855 + }, + { + "epoch": 0.9898666666666667, + "grad_norm": 0.46596364619993086, + "learning_rate": 5.389521134989695e-08, + "loss": 0.6385, + "step": 1856 + }, + { + "epoch": 0.9904, + "grad_norm": 0.3940174311468667, + "learning_rate": 4.837177080119215e-08, + "loss": 0.6279, + "step": 1857 + }, + { + "epoch": 0.9909333333333333, + "grad_norm": 0.4203965134296234, + "learning_rate": 4.314680098592705e-08, + "loss": 0.6402, + "step": 1858 + }, + { + "epoch": 0.9914666666666667, + "grad_norm": 0.3774077358110915, + "learning_rate": 3.8220317506654226e-08, + "loss": 0.6194, + "step": 1859 + }, + { + "epoch": 0.992, + "grad_norm": 0.4008472539371659, + "learning_rate": 3.359233507459481e-08, + "loss": 0.566, + "step": 1860 + }, + { + "epoch": 0.9925333333333334, + "grad_norm": 0.4080999169238299, + "learning_rate": 2.9262867509605163e-08, + "loss": 0.6448, + "step": 1861 + }, + { + "epoch": 0.9930666666666667, + "grad_norm": 0.4077745692946297, + "learning_rate": 2.5231927740154704e-08, + "loss": 0.6125, + "step": 1862 + }, + { + "epoch": 0.9936, + "grad_norm": 0.38226107295436784, + "learning_rate": 2.1499527803214846e-08, + "loss": 0.6245, + "step": 1863 + }, + { + "epoch": 0.9941333333333333, + "grad_norm": 0.41375038124086927, + "learning_rate": 1.8065678844314538e-08, + "loss": 0.5857, + "step": 1864 + }, + { + "epoch": 0.9946666666666667, + "grad_norm": 0.3966773367374224, + "learning_rate": 1.4930391117451426e-08, + "loss": 0.6348, + "step": 1865 + }, + { + "epoch": 0.9952, + "grad_norm": 0.40442872978303074, + "learning_rate": 1.209367398504746e-08, + "loss": 0.6214, + "step": 1866 + }, + { + "epoch": 0.9957333333333334, + "grad_norm": 0.38841286408602566, + "learning_rate": 9.555535917993297e-09, + "loss": 0.5974, + "step": 1867 + }, + { + "epoch": 0.9962666666666666, + "grad_norm": 0.46383341076347295, + "learning_rate": 7.315984495548378e-09, + "loss": 0.578, + "step": 1868 + }, + { + "epoch": 0.9968, + "grad_norm": 0.49119919317599087, + "learning_rate": 5.375026405352035e-09, + "loss": 0.6599, + "step": 1869 + }, + { + "epoch": 0.9973333333333333, + "grad_norm": 0.41483896589092045, + "learning_rate": 3.732667443390181e-09, + "loss": 0.6455, + "step": 1870 + }, + { + "epoch": 0.9978666666666667, + "grad_norm": 0.3285919278758509, + "learning_rate": 2.388912514017516e-09, + "loss": 0.5492, + "step": 1871 + }, + { + "epoch": 0.9984, + "grad_norm": 0.47265575503752827, + "learning_rate": 1.3437656298687097e-09, + "loss": 0.6146, + "step": 1872 + }, + { + "epoch": 0.9989333333333333, + "grad_norm": 0.4001102881783127, + "learning_rate": 5.972299119250125e-10, + "loss": 0.626, + "step": 1873 + }, + { + "epoch": 0.9994666666666666, + "grad_norm": 0.3969808974787441, + "learning_rate": 1.4930758944764479e-10, + "loss": 0.5833, + "step": 1874 + }, + { + "epoch": 1.0, + "grad_norm": 0.4771903062703072, + "learning_rate": 0.0, + "loss": 0.6608, + "step": 1875 + }, + { + "epoch": 1.0, + "step": 1875, + "total_flos": 1649201477156864.0, + "train_loss": 0.7004796615282695, + "train_runtime": 29310.7261, + "train_samples_per_second": 1.024, + "train_steps_per_second": 0.064 + } + ], + "logging_steps": 1.0, + "max_steps": 1875, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1649201477156864.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_1_epochs_1_GA_2_lora/README.md b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_1_epochs_1_GA_2_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_1_epochs_1_GA_2_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_1_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_1_epochs_1_GA_2_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0364082e66eba99ad87431531c6484171aba51ef --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_1_epochs_1_GA_2_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "down_proj", + "k_proj", + "up_proj", + "q_proj", + "o_proj", + "gate_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a015251aba2f3baf7f2ac1be2e37668039831dad --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad3b0810ecbe492df7ac9ea0a150f743913dc2e286ae5b069645bfac33125397 +size 671150064 diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_1_epochs_1_GA_2_lora/config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_1_epochs_1_GA_2_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_1_epochs_1_GA_2_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..312bca50e02855ac65838f848b783490e0b1af5c --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77ab988b2b75ddea21d2207ee190795b609eaebf993e7b5701d6fd564233b7b2 +size 918507402 diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_1_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_1_epochs_1_GA_2_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..601aea95bb2089e62c16f16716bd520ab5e4b306 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_1_epochs_1_GA_2_lora/trainer_state.json @@ -0,0 +1,2226 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9984, + "eval_steps": 500, + "global_step": 312, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0032, + "grad_norm": 1.0020062019097136, + "learning_rate": 2e-05, + "loss": 1.4094, + "step": 1 + }, + { + "epoch": 0.0064, + "grad_norm": 1.1672913328138943, + "learning_rate": 4e-05, + "loss": 1.5713, + "step": 2 + }, + { + "epoch": 0.0096, + "grad_norm": 0.8842418536610446, + "learning_rate": 6e-05, + "loss": 1.3336, + "step": 3 + }, + { + "epoch": 0.0128, + "grad_norm": 0.7848675740570894, + "learning_rate": 8e-05, + "loss": 1.1949, + "step": 4 + }, + { + "epoch": 0.016, + "grad_norm": 0.8436775517980575, + "learning_rate": 0.0001, + "loss": 1.1434, + "step": 5 + }, + { + "epoch": 0.0192, + "grad_norm": 0.808923767814001, + "learning_rate": 0.00012, + "loss": 1.043, + "step": 6 + }, + { + "epoch": 0.0224, + "grad_norm": 0.9265473165199244, + "learning_rate": 0.00014, + "loss": 1.0547, + "step": 7 + }, + { + "epoch": 0.0256, + "grad_norm": 0.8782500227023723, + "learning_rate": 0.00016, + "loss": 1.0547, + "step": 8 + }, + { + "epoch": 0.0288, + "grad_norm": 0.7012466031858726, + "learning_rate": 0.00018, + "loss": 0.9094, + "step": 9 + }, + { + "epoch": 0.032, + "grad_norm": 0.5475305846513644, + "learning_rate": 0.0002, + "loss": 0.9137, + "step": 10 + }, + { + "epoch": 0.0352, + "grad_norm": 0.5616025381116986, + "learning_rate": 0.00019999458931878073, + "loss": 0.8534, + "step": 11 + }, + { + "epoch": 0.0384, + "grad_norm": 0.587551632299327, + "learning_rate": 0.0001999783578606323, + "loss": 0.9994, + "step": 12 + }, + { + "epoch": 0.0416, + "grad_norm": 0.5028924581189982, + "learning_rate": 0.00019995130738201966, + "loss": 0.8336, + "step": 13 + }, + { + "epoch": 0.0448, + "grad_norm": 0.5466220925904238, + "learning_rate": 0.0001999134408101731, + "loss": 0.9528, + "step": 14 + }, + { + "epoch": 0.048, + "grad_norm": 0.5712862342882589, + "learning_rate": 0.00019986476224277165, + "loss": 0.9179, + "step": 15 + }, + { + "epoch": 0.0512, + "grad_norm": 0.4988168205894673, + "learning_rate": 0.00019980527694749952, + "loss": 0.8687, + "step": 16 + }, + { + "epoch": 0.0544, + "grad_norm": 0.5180030366003013, + "learning_rate": 0.00019973499136147606, + "loss": 0.9255, + "step": 17 + }, + { + "epoch": 0.0576, + "grad_norm": 0.616942952759206, + "learning_rate": 0.0001996539130905593, + "loss": 0.9314, + "step": 18 + }, + { + "epoch": 0.0608, + "grad_norm": 0.5584932054266343, + "learning_rate": 0.0001995620509085228, + "loss": 0.9221, + "step": 19 + }, + { + "epoch": 0.064, + "grad_norm": 0.48240484153272345, + "learning_rate": 0.00019945941475610623, + "loss": 0.8666, + "step": 20 + }, + { + "epoch": 0.0672, + "grad_norm": 0.5009492975166543, + "learning_rate": 0.0001993460157399396, + "loss": 0.9445, + "step": 21 + }, + { + "epoch": 0.0704, + "grad_norm": 0.4991179929795562, + "learning_rate": 0.0001992218661313415, + "loss": 0.925, + "step": 22 + }, + { + "epoch": 0.0736, + "grad_norm": 0.44584094654822, + "learning_rate": 0.00019908697936499103, + "loss": 0.8652, + "step": 23 + }, + { + "epoch": 0.0768, + "grad_norm": 0.46242576323411977, + "learning_rate": 0.00019894137003747403, + "loss": 0.9334, + "step": 24 + }, + { + "epoch": 0.08, + "grad_norm": 0.45158839685216784, + "learning_rate": 0.00019878505390570362, + "loss": 0.8848, + "step": 25 + }, + { + "epoch": 0.0832, + "grad_norm": 0.45338770941838574, + "learning_rate": 0.00019861804788521493, + "loss": 0.7919, + "step": 26 + }, + { + "epoch": 0.0864, + "grad_norm": 0.4692829974767608, + "learning_rate": 0.00019844037004833473, + "loss": 0.8308, + "step": 27 + }, + { + "epoch": 0.0896, + "grad_norm": 0.49506573014670635, + "learning_rate": 0.00019825203962222572, + "loss": 0.872, + "step": 28 + }, + { + "epoch": 0.0928, + "grad_norm": 0.46172406685438194, + "learning_rate": 0.0001980530769868059, + "loss": 0.8506, + "step": 29 + }, + { + "epoch": 0.096, + "grad_norm": 0.4520624065427708, + "learning_rate": 0.00019784350367254322, + "loss": 0.8671, + "step": 30 + }, + { + "epoch": 0.0992, + "grad_norm": 0.5370489718418412, + "learning_rate": 0.0001976233423581255, + "loss": 0.9335, + "step": 31 + }, + { + "epoch": 0.1024, + "grad_norm": 0.4747014965518446, + "learning_rate": 0.0001973926168680066, + "loss": 0.8181, + "step": 32 + }, + { + "epoch": 0.1056, + "grad_norm": 0.5330780221340524, + "learning_rate": 0.00019715135216982798, + "loss": 0.8758, + "step": 33 + }, + { + "epoch": 0.1088, + "grad_norm": 0.4461456900225593, + "learning_rate": 0.0001968995743717171, + "loss": 0.853, + "step": 34 + }, + { + "epoch": 0.112, + "grad_norm": 0.42907229569763317, + "learning_rate": 0.00019663731071946206, + "loss": 0.8721, + "step": 35 + }, + { + "epoch": 0.1152, + "grad_norm": 0.44077146618160284, + "learning_rate": 0.00019636458959356316, + "loss": 0.8609, + "step": 36 + }, + { + "epoch": 0.1184, + "grad_norm": 0.41030197878298086, + "learning_rate": 0.0001960814405061619, + "loss": 0.8098, + "step": 37 + }, + { + "epoch": 0.1216, + "grad_norm": 0.49557317411685103, + "learning_rate": 0.00019578789409784727, + "loss": 0.8831, + "step": 38 + }, + { + "epoch": 0.1248, + "grad_norm": 0.44605271336641833, + "learning_rate": 0.00019548398213434007, + "loss": 0.8377, + "step": 39 + }, + { + "epoch": 0.128, + "grad_norm": 0.4826928471153355, + "learning_rate": 0.00019516973750305532, + "loss": 0.8205, + "step": 40 + }, + { + "epoch": 0.1312, + "grad_norm": 0.44838532908845163, + "learning_rate": 0.00019484519420954354, + "loss": 0.7902, + "step": 41 + }, + { + "epoch": 0.1344, + "grad_norm": 0.4734508589182639, + "learning_rate": 0.00019451038737381077, + "loss": 0.8583, + "step": 42 + }, + { + "epoch": 0.1376, + "grad_norm": 0.44919162382201094, + "learning_rate": 0.00019416535322651818, + "loss": 0.815, + "step": 43 + }, + { + "epoch": 0.1408, + "grad_norm": 0.4519943868115642, + "learning_rate": 0.00019381012910506146, + "loss": 0.8703, + "step": 44 + }, + { + "epoch": 0.144, + "grad_norm": 0.4229071333914242, + "learning_rate": 0.00019344475344953012, + "loss": 0.7932, + "step": 45 + }, + { + "epoch": 0.1472, + "grad_norm": 0.4738970902391505, + "learning_rate": 0.00019306926579854821, + "loss": 0.8451, + "step": 46 + }, + { + "epoch": 0.1504, + "grad_norm": 0.41887929710668087, + "learning_rate": 0.00019268370678499533, + "loss": 0.7893, + "step": 47 + }, + { + "epoch": 0.1536, + "grad_norm": 0.4799740282974879, + "learning_rate": 0.0001922881181316097, + "loss": 0.8122, + "step": 48 + }, + { + "epoch": 0.1568, + "grad_norm": 0.49665391720151236, + "learning_rate": 0.00019188254264647337, + "loss": 0.8565, + "step": 49 + }, + { + "epoch": 0.16, + "grad_norm": 0.42640442472050694, + "learning_rate": 0.0001914670242183795, + "loss": 0.8022, + "step": 50 + }, + { + "epoch": 0.1632, + "grad_norm": 0.5923745737555824, + "learning_rate": 0.0001910416078120832, + "loss": 0.8227, + "step": 51 + }, + { + "epoch": 0.1664, + "grad_norm": 0.5169313193580595, + "learning_rate": 0.0001906063394634356, + "loss": 0.8382, + "step": 52 + }, + { + "epoch": 0.1696, + "grad_norm": 0.411955879890394, + "learning_rate": 0.00019016126627440237, + "loss": 0.8153, + "step": 53 + }, + { + "epoch": 0.1728, + "grad_norm": 0.4653962529308693, + "learning_rate": 0.00018970643640796642, + "loss": 0.8063, + "step": 54 + }, + { + "epoch": 0.176, + "grad_norm": 0.46401764632163195, + "learning_rate": 0.000189241899082916, + "loss": 0.8618, + "step": 55 + }, + { + "epoch": 0.1792, + "grad_norm": 0.5098831978733065, + "learning_rate": 0.00018876770456851877, + "loss": 0.8702, + "step": 56 + }, + { + "epoch": 0.1824, + "grad_norm": 0.4550613856493534, + "learning_rate": 0.0001882839041790818, + "loss": 0.7666, + "step": 57 + }, + { + "epoch": 0.1856, + "grad_norm": 0.4334229661431585, + "learning_rate": 0.00018779055026839868, + "loss": 0.8294, + "step": 58 + }, + { + "epoch": 0.1888, + "grad_norm": 0.5014227971036841, + "learning_rate": 0.00018728769622408423, + "loss": 0.7827, + "step": 59 + }, + { + "epoch": 0.192, + "grad_norm": 0.44130053629511323, + "learning_rate": 0.00018677539646179707, + "loss": 0.8197, + "step": 60 + }, + { + "epoch": 0.1952, + "grad_norm": 0.4293140157957783, + "learning_rate": 0.00018625370641935129, + "loss": 0.7983, + "step": 61 + }, + { + "epoch": 0.1984, + "grad_norm": 0.4594223463930405, + "learning_rate": 0.00018572268255071718, + "loss": 0.8582, + "step": 62 + }, + { + "epoch": 0.2016, + "grad_norm": 0.4141932528344203, + "learning_rate": 0.00018518238231991218, + "loss": 0.842, + "step": 63 + }, + { + "epoch": 0.2048, + "grad_norm": 0.39826030960177344, + "learning_rate": 0.00018463286419478255, + "loss": 0.7777, + "step": 64 + }, + { + "epoch": 0.208, + "grad_norm": 0.4629047201613823, + "learning_rate": 0.00018407418764067627, + "loss": 0.8507, + "step": 65 + }, + { + "epoch": 0.2112, + "grad_norm": 0.5035872872686349, + "learning_rate": 0.00018350641311400812, + "loss": 0.8771, + "step": 66 + }, + { + "epoch": 0.2144, + "grad_norm": 0.4890199906535899, + "learning_rate": 0.0001829296020557174, + "loss": 0.8973, + "step": 67 + }, + { + "epoch": 0.2176, + "grad_norm": 0.45734084694048316, + "learning_rate": 0.00018234381688461942, + "loss": 0.7985, + "step": 68 + }, + { + "epoch": 0.2208, + "grad_norm": 0.4622158365476711, + "learning_rate": 0.0001817491209906506, + "loss": 0.8644, + "step": 69 + }, + { + "epoch": 0.224, + "grad_norm": 0.4043135056245759, + "learning_rate": 0.00018114557872800905, + "loss": 0.7464, + "step": 70 + }, + { + "epoch": 0.2272, + "grad_norm": 0.4043710195936969, + "learning_rate": 0.00018053325540819045, + "loss": 0.7411, + "step": 71 + }, + { + "epoch": 0.2304, + "grad_norm": 0.48797963423635654, + "learning_rate": 0.0001799122172929206, + "loss": 0.9087, + "step": 72 + }, + { + "epoch": 0.2336, + "grad_norm": 0.4514538357948917, + "learning_rate": 0.00017928253158698473, + "loss": 0.8023, + "step": 73 + }, + { + "epoch": 0.2368, + "grad_norm": 0.42213380789547084, + "learning_rate": 0.0001786442664309554, + "loss": 0.7816, + "step": 74 + }, + { + "epoch": 0.24, + "grad_norm": 0.4661438499135115, + "learning_rate": 0.0001779974908938184, + "loss": 0.808, + "step": 75 + }, + { + "epoch": 0.2432, + "grad_norm": 0.5232351498258501, + "learning_rate": 0.0001773422749654988, + "loss": 0.9008, + "step": 76 + }, + { + "epoch": 0.2464, + "grad_norm": 0.3975491846434713, + "learning_rate": 0.00017667868954928694, + "loss": 0.7479, + "step": 77 + }, + { + "epoch": 0.2496, + "grad_norm": 0.44571590233114655, + "learning_rate": 0.00017600680645416583, + "loss": 0.8146, + "step": 78 + }, + { + "epoch": 0.2528, + "grad_norm": 0.4178992442982215, + "learning_rate": 0.00017532669838704035, + "loss": 0.8273, + "step": 79 + }, + { + "epoch": 0.256, + "grad_norm": 0.4530115017535405, + "learning_rate": 0.00017463843894486937, + "loss": 0.8645, + "step": 80 + }, + { + "epoch": 0.2592, + "grad_norm": 0.4886539415071477, + "learning_rate": 0.0001739421026067017, + "loss": 0.788, + "step": 81 + }, + { + "epoch": 0.2624, + "grad_norm": 0.4056803818730166, + "learning_rate": 0.00017323776472561627, + "loss": 0.8116, + "step": 82 + }, + { + "epoch": 0.2656, + "grad_norm": 0.39073317913364825, + "learning_rate": 0.00017252550152056795, + "loss": 0.7638, + "step": 83 + }, + { + "epoch": 0.2688, + "grad_norm": 0.4688396587663738, + "learning_rate": 0.0001718053900681397, + "loss": 0.7963, + "step": 84 + }, + { + "epoch": 0.272, + "grad_norm": 0.5008490816743745, + "learning_rate": 0.00017107750829420176, + "loss": 0.8696, + "step": 85 + }, + { + "epoch": 0.2752, + "grad_norm": 0.41439579567404433, + "learning_rate": 0.00017034193496547902, + "loss": 0.7953, + "step": 86 + }, + { + "epoch": 0.2784, + "grad_norm": 0.5384060646901033, + "learning_rate": 0.00016959874968102735, + "loss": 0.85, + "step": 87 + }, + { + "epoch": 0.2816, + "grad_norm": 0.43573649346247634, + "learning_rate": 0.00016884803286362, + "loss": 0.7754, + "step": 88 + }, + { + "epoch": 0.2848, + "grad_norm": 0.5280762687540471, + "learning_rate": 0.00016808986575104465, + "loss": 0.7931, + "step": 89 + }, + { + "epoch": 0.288, + "grad_norm": 0.45320274465650234, + "learning_rate": 0.00016732433038731242, + "loss": 0.7874, + "step": 90 + }, + { + "epoch": 0.2912, + "grad_norm": 0.3980614452364451, + "learning_rate": 0.0001665515096137797, + "loss": 0.7328, + "step": 91 + }, + { + "epoch": 0.2944, + "grad_norm": 0.5159807437944037, + "learning_rate": 0.00016577148706018328, + "loss": 0.8251, + "step": 92 + }, + { + "epoch": 0.2976, + "grad_norm": 0.4798140262593944, + "learning_rate": 0.00016498434713559088, + "loss": 0.7757, + "step": 93 + }, + { + "epoch": 0.3008, + "grad_norm": 0.4452205576856887, + "learning_rate": 0.00016419017501926656, + "loss": 0.7538, + "step": 94 + }, + { + "epoch": 0.304, + "grad_norm": 0.3745168487175859, + "learning_rate": 0.0001633890566514535, + "loss": 0.7036, + "step": 95 + }, + { + "epoch": 0.3072, + "grad_norm": 0.5074584143384333, + "learning_rate": 0.00016258107872407375, + "loss": 0.804, + "step": 96 + }, + { + "epoch": 0.3104, + "grad_norm": 0.43561945612065134, + "learning_rate": 0.0001617663286713474, + "loss": 0.7436, + "step": 97 + }, + { + "epoch": 0.3136, + "grad_norm": 0.4848632241795599, + "learning_rate": 0.00016094489466033043, + "loss": 0.8231, + "step": 98 + }, + { + "epoch": 0.3168, + "grad_norm": 0.5139486642178871, + "learning_rate": 0.00016011686558137448, + "loss": 0.7744, + "step": 99 + }, + { + "epoch": 0.32, + "grad_norm": 0.5522899077467178, + "learning_rate": 0.0001592823310385073, + "loss": 0.9019, + "step": 100 + }, + { + "epoch": 0.3232, + "grad_norm": 0.47312384533831275, + "learning_rate": 0.0001584413813397364, + "loss": 0.8012, + "step": 101 + }, + { + "epoch": 0.3264, + "grad_norm": 0.46402196649732563, + "learning_rate": 0.00015759410748727662, + "loss": 0.8558, + "step": 102 + }, + { + "epoch": 0.3296, + "grad_norm": 0.41689457770001886, + "learning_rate": 0.00015674060116770236, + "loss": 0.7365, + "step": 103 + }, + { + "epoch": 0.3328, + "grad_norm": 0.4012615086815032, + "learning_rate": 0.00015588095474202595, + "loss": 0.7223, + "step": 104 + }, + { + "epoch": 0.336, + "grad_norm": 0.38947271506208403, + "learning_rate": 0.00015501526123570277, + "loss": 0.7767, + "step": 105 + }, + { + "epoch": 0.3392, + "grad_norm": 0.43740220066952523, + "learning_rate": 0.00015414361432856475, + "loss": 0.7461, + "step": 106 + }, + { + "epoch": 0.3424, + "grad_norm": 0.43193908663600816, + "learning_rate": 0.0001532661083446829, + "loss": 0.7846, + "step": 107 + }, + { + "epoch": 0.3456, + "grad_norm": 0.4301344958314146, + "learning_rate": 0.00015238283824216015, + "loss": 0.7225, + "step": 108 + }, + { + "epoch": 0.3488, + "grad_norm": 0.43533146434585573, + "learning_rate": 0.00015149389960285558, + "loss": 0.8807, + "step": 109 + }, + { + "epoch": 0.352, + "grad_norm": 0.4237347133491233, + "learning_rate": 0.00015059938862204127, + "loss": 0.7783, + "step": 110 + }, + { + "epoch": 0.3552, + "grad_norm": 0.40306999298900875, + "learning_rate": 0.00014969940209799248, + "loss": 0.7687, + "step": 111 + }, + { + "epoch": 0.3584, + "grad_norm": 0.3986230491682654, + "learning_rate": 0.00014879403742151283, + "loss": 0.7666, + "step": 112 + }, + { + "epoch": 0.3616, + "grad_norm": 0.4994081964028108, + "learning_rate": 0.00014788339256539544, + "loss": 0.8079, + "step": 113 + }, + { + "epoch": 0.3648, + "grad_norm": 0.4344344140485525, + "learning_rate": 0.0001469675660738206, + "loss": 0.8157, + "step": 114 + }, + { + "epoch": 0.368, + "grad_norm": 0.5043695566357764, + "learning_rate": 0.00014604665705169237, + "loss": 0.8553, + "step": 115 + }, + { + "epoch": 0.3712, + "grad_norm": 0.45373707073050146, + "learning_rate": 0.00014512076515391375, + "loss": 0.7418, + "step": 116 + }, + { + "epoch": 0.3744, + "grad_norm": 0.5003550553196944, + "learning_rate": 0.00014418999057460276, + "loss": 0.8143, + "step": 117 + }, + { + "epoch": 0.3776, + "grad_norm": 0.43104315436973506, + "learning_rate": 0.0001432544340362501, + "loss": 0.7843, + "step": 118 + }, + { + "epoch": 0.3808, + "grad_norm": 0.4360808885133991, + "learning_rate": 0.00014231419677881966, + "loss": 0.801, + "step": 119 + }, + { + "epoch": 0.384, + "grad_norm": 0.4413032958474301, + "learning_rate": 0.00014136938054879283, + "loss": 0.8097, + "step": 120 + }, + { + "epoch": 0.3872, + "grad_norm": 0.37330247661022875, + "learning_rate": 0.00014042008758815818, + "loss": 0.6842, + "step": 121 + }, + { + "epoch": 0.3904, + "grad_norm": 0.44689400956413305, + "learning_rate": 0.00013946642062334766, + "loss": 0.8236, + "step": 122 + }, + { + "epoch": 0.3936, + "grad_norm": 0.4503572306579569, + "learning_rate": 0.00013850848285411994, + "loss": 0.7274, + "step": 123 + }, + { + "epoch": 0.3968, + "grad_norm": 0.3883114965737896, + "learning_rate": 0.000137546377942393, + "loss": 0.7537, + "step": 124 + }, + { + "epoch": 0.4, + "grad_norm": 0.5153659790762082, + "learning_rate": 0.00013658021000102636, + "loss": 0.8219, + "step": 125 + }, + { + "epoch": 0.4032, + "grad_norm": 0.40559233592673544, + "learning_rate": 0.00013561008358255468, + "loss": 0.7438, + "step": 126 + }, + { + "epoch": 0.4064, + "grad_norm": 0.38508116038201756, + "learning_rate": 0.00013463610366787392, + "loss": 0.7713, + "step": 127 + }, + { + "epoch": 0.4096, + "grad_norm": 0.44845430495824773, + "learning_rate": 0.00013365837565488064, + "loss": 0.7845, + "step": 128 + }, + { + "epoch": 0.4128, + "grad_norm": 0.5045862534919716, + "learning_rate": 0.0001326770053470668, + "loss": 0.779, + "step": 129 + }, + { + "epoch": 0.416, + "grad_norm": 0.45811118619528685, + "learning_rate": 0.0001316920989420703, + "loss": 0.7595, + "step": 130 + }, + { + "epoch": 0.4192, + "grad_norm": 0.554927213995397, + "learning_rate": 0.00013070376302018287, + "loss": 0.7426, + "step": 131 + }, + { + "epoch": 0.4224, + "grad_norm": 0.45414281431875614, + "learning_rate": 0.00012971210453281674, + "loss": 0.7745, + "step": 132 + }, + { + "epoch": 0.4256, + "grad_norm": 0.4180034067605168, + "learning_rate": 0.000128717230790931, + "loss": 0.7423, + "step": 133 + }, + { + "epoch": 0.4288, + "grad_norm": 0.430146230620113, + "learning_rate": 0.00012771924945341906, + "loss": 0.7587, + "step": 134 + }, + { + "epoch": 0.432, + "grad_norm": 0.39963289577534084, + "learning_rate": 0.00012671826851545851, + "loss": 0.7324, + "step": 135 + }, + { + "epoch": 0.4352, + "grad_norm": 0.4398823647567502, + "learning_rate": 0.0001257143962968246, + "loss": 0.7728, + "step": 136 + }, + { + "epoch": 0.4384, + "grad_norm": 0.4141436944414363, + "learning_rate": 0.00012470774143016853, + "loss": 0.7746, + "step": 137 + }, + { + "epoch": 0.4416, + "grad_norm": 0.44479075402997625, + "learning_rate": 0.00012369841284926188, + "loss": 0.7397, + "step": 138 + }, + { + "epoch": 0.4448, + "grad_norm": 0.4285177243435202, + "learning_rate": 0.00012268651977720866, + "loss": 0.8109, + "step": 139 + }, + { + "epoch": 0.448, + "grad_norm": 0.4411044029778361, + "learning_rate": 0.00012167217171462566, + "loss": 0.8116, + "step": 140 + }, + { + "epoch": 0.4512, + "grad_norm": 0.4137959768828525, + "learning_rate": 0.0001206554784277931, + "loss": 0.7417, + "step": 141 + }, + { + "epoch": 0.4544, + "grad_norm": 0.439364759321868, + "learning_rate": 0.00011963654993677645, + "loss": 0.7878, + "step": 142 + }, + { + "epoch": 0.4576, + "grad_norm": 0.4489862198564562, + "learning_rate": 0.00011861549650352069, + "loss": 0.7126, + "step": 143 + }, + { + "epoch": 0.4608, + "grad_norm": 0.48578216440407823, + "learning_rate": 0.00011759242861991855, + "loss": 0.7635, + "step": 144 + }, + { + "epoch": 0.464, + "grad_norm": 0.45022516264972906, + "learning_rate": 0.00011656745699585371, + "loss": 0.7899, + "step": 145 + }, + { + "epoch": 0.4672, + "grad_norm": 0.40872955925173193, + "learning_rate": 0.00011554069254722051, + "loss": 0.7341, + "step": 146 + }, + { + "epoch": 0.4704, + "grad_norm": 0.42586202092150144, + "learning_rate": 0.00011451224638392129, + "loss": 0.7314, + "step": 147 + }, + { + "epoch": 0.4736, + "grad_norm": 0.4171292822856141, + "learning_rate": 0.00011348222979784289, + "loss": 0.7595, + "step": 148 + }, + { + "epoch": 0.4768, + "grad_norm": 0.480380131441867, + "learning_rate": 0.00011245075425081328, + "loss": 0.812, + "step": 149 + }, + { + "epoch": 0.48, + "grad_norm": 0.40526609693194515, + "learning_rate": 0.00011141793136253986, + "loss": 0.769, + "step": 150 + }, + { + "epoch": 0.4832, + "grad_norm": 0.3840310308424327, + "learning_rate": 0.0001103838728985307, + "loss": 0.719, + "step": 151 + }, + { + "epoch": 0.4864, + "grad_norm": 0.3752893256883143, + "learning_rate": 0.000109348690758, + "loss": 0.8175, + "step": 152 + }, + { + "epoch": 0.4896, + "grad_norm": 0.42208306169316506, + "learning_rate": 0.00010831249696175918, + "loss": 0.7695, + "step": 153 + }, + { + "epoch": 0.4928, + "grad_norm": 0.4708087065120164, + "learning_rate": 0.0001072754036400944, + "loss": 0.7581, + "step": 154 + }, + { + "epoch": 0.496, + "grad_norm": 0.43669467431527925, + "learning_rate": 0.00010623752302063283, + "loss": 0.8088, + "step": 155 + }, + { + "epoch": 0.4992, + "grad_norm": 0.3694687014511646, + "learning_rate": 0.00010519896741619803, + "loss": 0.7467, + "step": 156 + }, + { + "epoch": 0.5024, + "grad_norm": 0.40360189905265087, + "learning_rate": 0.00010415984921265609, + "loss": 0.7809, + "step": 157 + }, + { + "epoch": 0.5056, + "grad_norm": 0.5168717922014412, + "learning_rate": 0.00010312028085675391, + "loss": 0.7352, + "step": 158 + }, + { + "epoch": 0.5088, + "grad_norm": 0.3671113041044409, + "learning_rate": 0.00010208037484395114, + "loss": 0.6766, + "step": 159 + }, + { + "epoch": 0.512, + "grad_norm": 0.45002238461802757, + "learning_rate": 0.00010104024370624644, + "loss": 0.8266, + "step": 160 + }, + { + "epoch": 0.5152, + "grad_norm": 0.4309770941080891, + "learning_rate": 0.0001, + "loss": 0.7716, + "step": 161 + }, + { + "epoch": 0.5184, + "grad_norm": 0.38193922359229104, + "learning_rate": 9.895975629375359e-05, + "loss": 0.6886, + "step": 162 + }, + { + "epoch": 0.5216, + "grad_norm": 0.47255435453422723, + "learning_rate": 9.791962515604887e-05, + "loss": 0.8111, + "step": 163 + }, + { + "epoch": 0.5248, + "grad_norm": 0.41050966292043334, + "learning_rate": 9.687971914324607e-05, + "loss": 0.7198, + "step": 164 + }, + { + "epoch": 0.528, + "grad_norm": 0.40167619297449497, + "learning_rate": 9.584015078734395e-05, + "loss": 0.7607, + "step": 165 + }, + { + "epoch": 0.5312, + "grad_norm": 0.43591555439181373, + "learning_rate": 9.480103258380198e-05, + "loss": 0.7679, + "step": 166 + }, + { + "epoch": 0.5344, + "grad_norm": 0.39375915423090696, + "learning_rate": 9.376247697936719e-05, + "loss": 0.7431, + "step": 167 + }, + { + "epoch": 0.5376, + "grad_norm": 0.3911415923341417, + "learning_rate": 9.272459635990562e-05, + "loss": 0.748, + "step": 168 + }, + { + "epoch": 0.5408, + "grad_norm": 0.37806000003935913, + "learning_rate": 9.168750303824084e-05, + "loss": 0.718, + "step": 169 + }, + { + "epoch": 0.544, + "grad_norm": 0.47918580480180134, + "learning_rate": 9.065130924199998e-05, + "loss": 0.769, + "step": 170 + }, + { + "epoch": 0.5472, + "grad_norm": 0.456254451883631, + "learning_rate": 8.961612710146934e-05, + "loss": 0.7987, + "step": 171 + }, + { + "epoch": 0.5504, + "grad_norm": 0.45635188698492374, + "learning_rate": 8.858206863746018e-05, + "loss": 0.7745, + "step": 172 + }, + { + "epoch": 0.5536, + "grad_norm": 0.4062422613227234, + "learning_rate": 8.754924574918675e-05, + "loss": 0.7558, + "step": 173 + }, + { + "epoch": 0.5568, + "grad_norm": 0.44786737175933516, + "learning_rate": 8.651777020215712e-05, + "loss": 0.7439, + "step": 174 + }, + { + "epoch": 0.56, + "grad_norm": 0.4420163721822703, + "learning_rate": 8.548775361607872e-05, + "loss": 0.7468, + "step": 175 + }, + { + "epoch": 0.5632, + "grad_norm": 0.4366403785449747, + "learning_rate": 8.445930745277953e-05, + "loss": 0.8282, + "step": 176 + }, + { + "epoch": 0.5664, + "grad_norm": 0.44160567802460143, + "learning_rate": 8.343254300414628e-05, + "loss": 0.7693, + "step": 177 + }, + { + "epoch": 0.5696, + "grad_norm": 0.3950052585958004, + "learning_rate": 8.240757138008149e-05, + "loss": 0.754, + "step": 178 + }, + { + "epoch": 0.5728, + "grad_norm": 0.4864733609910337, + "learning_rate": 8.138450349647936e-05, + "loss": 0.7768, + "step": 179 + }, + { + "epoch": 0.576, + "grad_norm": 0.4203970972293743, + "learning_rate": 8.036345006322359e-05, + "loss": 0.802, + "step": 180 + }, + { + "epoch": 0.5792, + "grad_norm": 0.36364201707045174, + "learning_rate": 7.934452157220694e-05, + "loss": 0.6714, + "step": 181 + }, + { + "epoch": 0.5824, + "grad_norm": 0.3773423440015053, + "learning_rate": 7.832782828537437e-05, + "loss": 0.7531, + "step": 182 + }, + { + "epoch": 0.5856, + "grad_norm": 0.4122032139585757, + "learning_rate": 7.731348022279134e-05, + "loss": 0.7392, + "step": 183 + }, + { + "epoch": 0.5888, + "grad_norm": 0.4060138041258608, + "learning_rate": 7.630158715073813e-05, + "loss": 0.7826, + "step": 184 + }, + { + "epoch": 0.592, + "grad_norm": 0.4500005144010625, + "learning_rate": 7.52922585698315e-05, + "loss": 0.6901, + "step": 185 + }, + { + "epoch": 0.5952, + "grad_norm": 0.37550186795184387, + "learning_rate": 7.428560370317542e-05, + "loss": 0.6867, + "step": 186 + }, + { + "epoch": 0.5984, + "grad_norm": 0.451039864858227, + "learning_rate": 7.328173148454151e-05, + "loss": 0.7846, + "step": 187 + }, + { + "epoch": 0.6016, + "grad_norm": 0.38497885909127655, + "learning_rate": 7.228075054658096e-05, + "loss": 0.7088, + "step": 188 + }, + { + "epoch": 0.6048, + "grad_norm": 0.5154266123974721, + "learning_rate": 7.1282769209069e-05, + "loss": 0.8177, + "step": 189 + }, + { + "epoch": 0.608, + "grad_norm": 0.3703640026584789, + "learning_rate": 7.028789546718326e-05, + "loss": 0.6701, + "step": 190 + }, + { + "epoch": 0.6112, + "grad_norm": 0.4386400051645292, + "learning_rate": 6.929623697981718e-05, + "loss": 0.7671, + "step": 191 + }, + { + "epoch": 0.6144, + "grad_norm": 0.402822326738162, + "learning_rate": 6.830790105792973e-05, + "loss": 0.7669, + "step": 192 + }, + { + "epoch": 0.6176, + "grad_norm": 0.43335857320343696, + "learning_rate": 6.732299465293322e-05, + "loss": 0.7568, + "step": 193 + }, + { + "epoch": 0.6208, + "grad_norm": 0.44062106618622304, + "learning_rate": 6.63416243451194e-05, + "loss": 0.7122, + "step": 194 + }, + { + "epoch": 0.624, + "grad_norm": 0.46213988006162576, + "learning_rate": 6.536389633212609e-05, + "loss": 0.7027, + "step": 195 + }, + { + "epoch": 0.6272, + "grad_norm": 0.4508439790652391, + "learning_rate": 6.43899164174453e-05, + "loss": 0.7829, + "step": 196 + }, + { + "epoch": 0.6304, + "grad_norm": 0.4830750076650203, + "learning_rate": 6.341978999897365e-05, + "loss": 0.8339, + "step": 197 + }, + { + "epoch": 0.6336, + "grad_norm": 0.4822485380332219, + "learning_rate": 6.245362205760704e-05, + "loss": 0.704, + "step": 198 + }, + { + "epoch": 0.6368, + "grad_norm": 0.3450710872496977, + "learning_rate": 6.149151714588009e-05, + "loss": 0.7296, + "step": 199 + }, + { + "epoch": 0.64, + "grad_norm": 0.39481863420916286, + "learning_rate": 6.053357937665237e-05, + "loss": 0.6931, + "step": 200 + }, + { + "epoch": 0.6432, + "grad_norm": 0.49669230264861525, + "learning_rate": 5.957991241184184e-05, + "loss": 0.7934, + "step": 201 + }, + { + "epoch": 0.6464, + "grad_norm": 0.4154651254147214, + "learning_rate": 5.863061945120719e-05, + "loss": 0.7512, + "step": 202 + }, + { + "epoch": 0.6496, + "grad_norm": 0.4602137806235495, + "learning_rate": 5.768580322118034e-05, + "loss": 0.713, + "step": 203 + }, + { + "epoch": 0.6528, + "grad_norm": 0.39711180804932905, + "learning_rate": 5.6745565963749925e-05, + "loss": 0.7526, + "step": 204 + }, + { + "epoch": 0.656, + "grad_norm": 0.38725643050847247, + "learning_rate": 5.5810009425397294e-05, + "loss": 0.7804, + "step": 205 + }, + { + "epoch": 0.6592, + "grad_norm": 0.38570099899258914, + "learning_rate": 5.487923484608629e-05, + "loss": 0.763, + "step": 206 + }, + { + "epoch": 0.6624, + "grad_norm": 0.41759594142356127, + "learning_rate": 5.395334294830765e-05, + "loss": 0.6926, + "step": 207 + }, + { + "epoch": 0.6656, + "grad_norm": 0.4784349568071053, + "learning_rate": 5.3032433926179395e-05, + "loss": 0.7353, + "step": 208 + }, + { + "epoch": 0.6688, + "grad_norm": 0.3642809349585427, + "learning_rate": 5.211660743460458e-05, + "loss": 0.7078, + "step": 209 + }, + { + "epoch": 0.672, + "grad_norm": 0.42279399570363735, + "learning_rate": 5.1205962578487155e-05, + "loss": 0.7707, + "step": 210 + }, + { + "epoch": 0.6752, + "grad_norm": 0.4232770458907553, + "learning_rate": 5.030059790200756e-05, + "loss": 0.7892, + "step": 211 + }, + { + "epoch": 0.6784, + "grad_norm": 0.44417509587658666, + "learning_rate": 4.940061137795876e-05, + "loss": 0.8029, + "step": 212 + }, + { + "epoch": 0.6816, + "grad_norm": 0.4112498099064009, + "learning_rate": 4.850610039714444e-05, + "loss": 0.7499, + "step": 213 + }, + { + "epoch": 0.6848, + "grad_norm": 0.38248075464712855, + "learning_rate": 4.761716175783989e-05, + "loss": 0.694, + "step": 214 + }, + { + "epoch": 0.688, + "grad_norm": 0.388512224349573, + "learning_rate": 4.673389165531714e-05, + "loss": 0.7339, + "step": 215 + }, + { + "epoch": 0.6912, + "grad_norm": 0.4493476842881038, + "learning_rate": 4.585638567143529e-05, + "loss": 0.7018, + "step": 216 + }, + { + "epoch": 0.6944, + "grad_norm": 0.4022224547119, + "learning_rate": 4.498473876429726e-05, + "loss": 0.7408, + "step": 217 + }, + { + "epoch": 0.6976, + "grad_norm": 0.4416873626755746, + "learning_rate": 4.411904525797408e-05, + "loss": 0.7273, + "step": 218 + }, + { + "epoch": 0.7008, + "grad_norm": 0.33804220900980275, + "learning_rate": 4.325939883229766e-05, + "loss": 0.7121, + "step": 219 + }, + { + "epoch": 0.704, + "grad_norm": 0.3567832952479447, + "learning_rate": 4.240589251272342e-05, + "loss": 0.6917, + "step": 220 + }, + { + "epoch": 0.7072, + "grad_norm": 0.4449544370633313, + "learning_rate": 4.155861866026364e-05, + "loss": 0.8005, + "step": 221 + }, + { + "epoch": 0.7104, + "grad_norm": 0.447920701823006, + "learning_rate": 4.071766896149273e-05, + "loss": 0.7294, + "step": 222 + }, + { + "epoch": 0.7136, + "grad_norm": 0.3731730633660234, + "learning_rate": 3.988313441862553e-05, + "loss": 0.6956, + "step": 223 + }, + { + "epoch": 0.7168, + "grad_norm": 0.42750086388493475, + "learning_rate": 3.9055105339669595e-05, + "loss": 0.7655, + "step": 224 + }, + { + "epoch": 0.72, + "grad_norm": 0.3920466926065124, + "learning_rate": 3.823367132865265e-05, + "loss": 0.6975, + "step": 225 + }, + { + "epoch": 0.7232, + "grad_norm": 0.3611682156930113, + "learning_rate": 3.741892127592625e-05, + "loss": 0.7159, + "step": 226 + }, + { + "epoch": 0.7264, + "grad_norm": 0.41926361819237346, + "learning_rate": 3.6610943348546526e-05, + "loss": 0.7275, + "step": 227 + }, + { + "epoch": 0.7296, + "grad_norm": 0.41971660784876186, + "learning_rate": 3.580982498073344e-05, + "loss": 0.7545, + "step": 228 + }, + { + "epoch": 0.7328, + "grad_norm": 0.7413673275066183, + "learning_rate": 3.501565286440914e-05, + "loss": 0.673, + "step": 229 + }, + { + "epoch": 0.736, + "grad_norm": 0.3806863118171017, + "learning_rate": 3.422851293981676e-05, + "loss": 0.6866, + "step": 230 + }, + { + "epoch": 0.7392, + "grad_norm": 0.41477013408202706, + "learning_rate": 3.3448490386220355e-05, + "loss": 0.7455, + "step": 231 + }, + { + "epoch": 0.7424, + "grad_norm": 0.3876082407382047, + "learning_rate": 3.2675669612687565e-05, + "loss": 0.7878, + "step": 232 + }, + { + "epoch": 0.7456, + "grad_norm": 0.41300439564787, + "learning_rate": 3.191013424895536e-05, + "loss": 0.6675, + "step": 233 + }, + { + "epoch": 0.7488, + "grad_norm": 0.36066932638629273, + "learning_rate": 3.115196713638e-05, + "loss": 0.7172, + "step": 234 + }, + { + "epoch": 0.752, + "grad_norm": 0.37561012170468494, + "learning_rate": 3.040125031897264e-05, + "loss": 0.7123, + "step": 235 + }, + { + "epoch": 0.7552, + "grad_norm": 0.4408534490279242, + "learning_rate": 2.9658065034520978e-05, + "loss": 0.779, + "step": 236 + }, + { + "epoch": 0.7584, + "grad_norm": 0.4389182520268332, + "learning_rate": 2.892249170579826e-05, + "loss": 0.7727, + "step": 237 + }, + { + "epoch": 0.7616, + "grad_norm": 0.44084624793958344, + "learning_rate": 2.8194609931860316e-05, + "loss": 0.7353, + "step": 238 + }, + { + "epoch": 0.7648, + "grad_norm": 0.45965825857016396, + "learning_rate": 2.7474498479432087e-05, + "loss": 0.8268, + "step": 239 + }, + { + "epoch": 0.768, + "grad_norm": 0.40486898098610974, + "learning_rate": 2.6762235274383772e-05, + "loss": 0.7036, + "step": 240 + }, + { + "epoch": 0.7712, + "grad_norm": 0.3831590332909565, + "learning_rate": 2.6057897393298324e-05, + "loss": 0.6851, + "step": 241 + }, + { + "epoch": 0.7744, + "grad_norm": 0.6399623309462235, + "learning_rate": 2.536156105513062e-05, + "loss": 0.8118, + "step": 242 + }, + { + "epoch": 0.7776, + "grad_norm": 0.3955238681930386, + "learning_rate": 2.4673301612959654e-05, + "loss": 0.6972, + "step": 243 + }, + { + "epoch": 0.7808, + "grad_norm": 0.4022287665333359, + "learning_rate": 2.399319354583418e-05, + "loss": 0.733, + "step": 244 + }, + { + "epoch": 0.784, + "grad_norm": 0.4298350782931847, + "learning_rate": 2.3321310450713062e-05, + "loss": 0.7647, + "step": 245 + }, + { + "epoch": 0.7872, + "grad_norm": 0.49694477718490887, + "learning_rate": 2.265772503450122e-05, + "loss": 0.7549, + "step": 246 + }, + { + "epoch": 0.7904, + "grad_norm": 0.42114719991142247, + "learning_rate": 2.2002509106181624e-05, + "loss": 0.7375, + "step": 247 + }, + { + "epoch": 0.7936, + "grad_norm": 0.3809928321742791, + "learning_rate": 2.1355733569044635e-05, + "loss": 0.7021, + "step": 248 + }, + { + "epoch": 0.7968, + "grad_norm": 0.3798248954210015, + "learning_rate": 2.0717468413015283e-05, + "loss": 0.7627, + "step": 249 + }, + { + "epoch": 0.8, + "grad_norm": 0.4004290153653874, + "learning_rate": 2.008778270707944e-05, + "loss": 0.731, + "step": 250 + }, + { + "epoch": 0.8032, + "grad_norm": 0.40098733120968666, + "learning_rate": 1.946674459180955e-05, + "loss": 0.7532, + "step": 251 + }, + { + "epoch": 0.8064, + "grad_norm": 0.39095986270735283, + "learning_rate": 1.8854421271990964e-05, + "loss": 0.687, + "step": 252 + }, + { + "epoch": 0.8096, + "grad_norm": 0.4113257164040196, + "learning_rate": 1.8250879009349398e-05, + "loss": 0.7505, + "step": 253 + }, + { + "epoch": 0.8128, + "grad_norm": 0.394684889161784, + "learning_rate": 1.7656183115380577e-05, + "loss": 0.7308, + "step": 254 + }, + { + "epoch": 0.816, + "grad_norm": 0.41218562414416526, + "learning_rate": 1.707039794428259e-05, + "loss": 0.682, + "step": 255 + }, + { + "epoch": 0.8192, + "grad_norm": 0.5006514104155203, + "learning_rate": 1.649358688599191e-05, + "loss": 0.7833, + "step": 256 + }, + { + "epoch": 0.8224, + "grad_norm": 0.40635351115200874, + "learning_rate": 1.5925812359323745e-05, + "loss": 0.6955, + "step": 257 + }, + { + "epoch": 0.8256, + "grad_norm": 0.3816074195458233, + "learning_rate": 1.5367135805217458e-05, + "loss": 0.7069, + "step": 258 + }, + { + "epoch": 0.8288, + "grad_norm": 0.4225561882163358, + "learning_rate": 1.4817617680087825e-05, + "loss": 0.7832, + "step": 259 + }, + { + "epoch": 0.832, + "grad_norm": 0.41080313956124365, + "learning_rate": 1.4277317449282834e-05, + "loss": 0.714, + "step": 260 + }, + { + "epoch": 0.8352, + "grad_norm": 0.41574441763530995, + "learning_rate": 1.3746293580648717e-05, + "loss": 0.7687, + "step": 261 + }, + { + "epoch": 0.8384, + "grad_norm": 0.458118134115343, + "learning_rate": 1.3224603538202929e-05, + "loss": 0.718, + "step": 262 + }, + { + "epoch": 0.8416, + "grad_norm": 0.4244817211567879, + "learning_rate": 1.2712303775915802e-05, + "loss": 0.7004, + "step": 263 + }, + { + "epoch": 0.8448, + "grad_norm": 0.409980416870785, + "learning_rate": 1.220944973160133e-05, + "loss": 0.6762, + "step": 264 + }, + { + "epoch": 0.848, + "grad_norm": 0.4303430200185168, + "learning_rate": 1.1716095820918216e-05, + "loss": 0.6777, + "step": 265 + }, + { + "epoch": 0.8512, + "grad_norm": 0.5187693825788201, + "learning_rate": 1.1232295431481222e-05, + "loss": 0.7903, + "step": 266 + }, + { + "epoch": 0.8544, + "grad_norm": 0.43812839922116, + "learning_rate": 1.0758100917083991e-05, + "loss": 0.731, + "step": 267 + }, + { + "epoch": 0.8576, + "grad_norm": 0.4613001122623699, + "learning_rate": 1.0293563592033595e-05, + "loss": 0.6949, + "step": 268 + }, + { + "epoch": 0.8608, + "grad_norm": 0.42749542124196754, + "learning_rate": 9.838733725597615e-06, + "loss": 0.748, + "step": 269 + }, + { + "epoch": 0.864, + "grad_norm": 0.40464163859777, + "learning_rate": 9.393660536564408e-06, + "loss": 0.7253, + "step": 270 + }, + { + "epoch": 0.8672, + "grad_norm": 0.39804294526476447, + "learning_rate": 8.958392187916841e-06, + "loss": 0.7374, + "step": 271 + }, + { + "epoch": 0.8704, + "grad_norm": 0.4266101798907561, + "learning_rate": 8.532975781620512e-06, + "loss": 0.7215, + "step": 272 + }, + { + "epoch": 0.8736, + "grad_norm": 0.3885077025405382, + "learning_rate": 8.117457353526625e-06, + "loss": 0.7357, + "step": 273 + }, + { + "epoch": 0.8768, + "grad_norm": 0.4846915505912817, + "learning_rate": 7.711881868390291e-06, + "loss": 0.8117, + "step": 274 + }, + { + "epoch": 0.88, + "grad_norm": 0.452520369015205, + "learning_rate": 7.3162932150046885e-06, + "loss": 0.7179, + "step": 275 + }, + { + "epoch": 0.8832, + "grad_norm": 0.4184692167854737, + "learning_rate": 6.930734201451816e-06, + "loss": 0.6905, + "step": 276 + }, + { + "epoch": 0.8864, + "grad_norm": 0.420499872484043, + "learning_rate": 6.555246550469907e-06, + "loss": 0.6931, + "step": 277 + }, + { + "epoch": 0.8896, + "grad_norm": 0.4251946133856474, + "learning_rate": 6.189870894938587e-06, + "loss": 0.6707, + "step": 278 + }, + { + "epoch": 0.8928, + "grad_norm": 0.4272730684539094, + "learning_rate": 5.834646773481811e-06, + "loss": 0.7537, + "step": 279 + }, + { + "epoch": 0.896, + "grad_norm": 0.6309998230648284, + "learning_rate": 5.489612626189245e-06, + "loss": 0.7164, + "step": 280 + }, + { + "epoch": 0.8992, + "grad_norm": 0.3862301256612526, + "learning_rate": 5.154805790456485e-06, + "loss": 0.7741, + "step": 281 + }, + { + "epoch": 0.9024, + "grad_norm": 0.3931981740744817, + "learning_rate": 4.830262496944693e-06, + "loss": 0.6871, + "step": 282 + }, + { + "epoch": 0.9056, + "grad_norm": 0.37541414814492263, + "learning_rate": 4.516017865659949e-06, + "loss": 0.6975, + "step": 283 + }, + { + "epoch": 0.9088, + "grad_norm": 0.4028054962453646, + "learning_rate": 4.21210590215273e-06, + "loss": 0.7087, + "step": 284 + }, + { + "epoch": 0.912, + "grad_norm": 0.37853946883499306, + "learning_rate": 3.918559493838114e-06, + "loss": 0.73, + "step": 285 + }, + { + "epoch": 0.9152, + "grad_norm": 0.5788423755412683, + "learning_rate": 3.6354104064368566e-06, + "loss": 0.7262, + "step": 286 + }, + { + "epoch": 0.9184, + "grad_norm": 0.34013834265448084, + "learning_rate": 3.3626892805379562e-06, + "loss": 0.667, + "step": 287 + }, + { + "epoch": 0.9216, + "grad_norm": 0.43818614152472873, + "learning_rate": 3.100425628282899e-06, + "loss": 0.7841, + "step": 288 + }, + { + "epoch": 0.9248, + "grad_norm": 0.35493850044501696, + "learning_rate": 2.848647830172024e-06, + "loss": 0.6632, + "step": 289 + }, + { + "epoch": 0.928, + "grad_norm": 0.4196389310359502, + "learning_rate": 2.607383131993424e-06, + "loss": 0.7426, + "step": 290 + }, + { + "epoch": 0.9312, + "grad_norm": 0.41823350521904995, + "learning_rate": 2.3766576418745022e-06, + "loss": 0.7649, + "step": 291 + }, + { + "epoch": 0.9344, + "grad_norm": 0.7150743902715252, + "learning_rate": 2.1564963274568027e-06, + "loss": 0.7215, + "step": 292 + }, + { + "epoch": 0.9376, + "grad_norm": 0.45743484487287117, + "learning_rate": 1.9469230131940907e-06, + "loss": 0.804, + "step": 293 + }, + { + "epoch": 0.9408, + "grad_norm": 0.4112747257690636, + "learning_rate": 1.7479603777742938e-06, + "loss": 0.69, + "step": 294 + }, + { + "epoch": 0.944, + "grad_norm": 0.4095135251036486, + "learning_rate": 1.559629951665298e-06, + "loss": 0.7374, + "step": 295 + }, + { + "epoch": 0.9472, + "grad_norm": 0.5067443148357712, + "learning_rate": 1.3819521147851123e-06, + "loss": 0.7021, + "step": 296 + }, + { + "epoch": 0.9504, + "grad_norm": 0.3598532464586451, + "learning_rate": 1.2149460942964098e-06, + "loss": 0.7058, + "step": 297 + }, + { + "epoch": 0.9536, + "grad_norm": 0.4433523684821114, + "learning_rate": 1.05862996252597e-06, + "loss": 0.6705, + "step": 298 + }, + { + "epoch": 0.9568, + "grad_norm": 0.4160728625035564, + "learning_rate": 9.130206350089765e-07, + "loss": 0.7909, + "step": 299 + }, + { + "epoch": 0.96, + "grad_norm": 0.41033752228612996, + "learning_rate": 7.781338686584927e-07, + "loss": 0.6993, + "step": 300 + }, + { + "epoch": 0.9632, + "grad_norm": 0.4877390319538481, + "learning_rate": 6.539842600603918e-07, + "loss": 0.631, + "step": 301 + }, + { + "epoch": 0.9664, + "grad_norm": 0.44045946935661956, + "learning_rate": 5.405852438937764e-07, + "loss": 0.7106, + "step": 302 + }, + { + "epoch": 0.9696, + "grad_norm": 0.45442637015135684, + "learning_rate": 4.3794909147720773e-07, + "loss": 0.7077, + "step": 303 + }, + { + "epoch": 0.9728, + "grad_norm": 0.4241902847444021, + "learning_rate": 3.4608690944071263e-07, + "loss": 0.6472, + "step": 304 + }, + { + "epoch": 0.976, + "grad_norm": 0.3707542425270281, + "learning_rate": 2.6500863852395584e-07, + "loss": 0.7192, + "step": 305 + }, + { + "epoch": 0.9792, + "grad_norm": 0.3856152515473988, + "learning_rate": 1.947230525005006e-07, + "loss": 0.6616, + "step": 306 + }, + { + "epoch": 0.9824, + "grad_norm": 0.3917590365344406, + "learning_rate": 1.3523775722834587e-07, + "loss": 0.7024, + "step": 307 + }, + { + "epoch": 0.9856, + "grad_norm": 0.4543793555941507, + "learning_rate": 8.655918982689581e-08, + "loss": 0.7265, + "step": 308 + }, + { + "epoch": 0.9888, + "grad_norm": 0.3625434659433941, + "learning_rate": 4.8692617980350406e-08, + "loss": 0.6541, + "step": 309 + }, + { + "epoch": 0.992, + "grad_norm": 0.4035496798827605, + "learning_rate": 2.164213936770576e-08, + "loss": 0.7096, + "step": 310 + }, + { + "epoch": 0.9952, + "grad_norm": 0.3782322554766045, + "learning_rate": 5.410681219286673e-09, + "loss": 0.6533, + "step": 311 + }, + { + "epoch": 0.9984, + "grad_norm": 0.4651090593785289, + "learning_rate": 0.0, + "loss": 0.7347, + "step": 312 + }, + { + "epoch": 0.9984, + "step": 312, + "total_flos": 271218014191616.0, + "train_loss": 0.783779461414386, + "train_runtime": 4880.0845, + "train_samples_per_second": 1.025, + "train_steps_per_second": 0.064 + } + ], + "logging_steps": 1.0, + "max_steps": 312, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 271218014191616.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_1_epochs_1_GA_4_lora/README.md b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_1_epochs_1_GA_4_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_1_epochs_1_GA_4_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_1_epochs_1_GA_4_lora/adapter_config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_1_epochs_1_GA_4_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f7cf87d675e08341bc5385e4fd15960420adf621 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_1_epochs_1_GA_4_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "k_proj", + "q_proj", + "down_proj", + "v_proj", + "o_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_1_epochs_1_GA_4_lora/adapter_model.safetensors b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_1_epochs_1_GA_4_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..618c34f4b39b838fa33e06044a8d7fd38bcb5115 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_1_epochs_1_GA_4_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a98696296054cee5685487066ca076077023d0606ae9c626dfb0e2c99c2ef33 +size 671150064 diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_1_epochs_1_GA_4_lora/config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_1_epochs_1_GA_4_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_1_epochs_1_GA_4_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_1_epochs_1_GA_4_lora/non_lora_trainables.bin b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_1_epochs_1_GA_4_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..268844bfc521d01270d9785d933ed2a8e7dfdc4e --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_1_epochs_1_GA_4_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a69debb061221fb65c4c29b26cee8d8c34f90d50e87801d857baad12862433f +size 918507402 diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_1_epochs_1_GA_4_lora/trainer_state.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_1_epochs_1_GA_4_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..02792653adc727c1ee1a33fef301f301358cd438 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_1_epochs_1_GA_4_lora/trainer_state.json @@ -0,0 +1,1134 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9984, + "eval_steps": 500, + "global_step": 156, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0064, + "grad_norm": 1.002639836398211, + "learning_rate": 4e-05, + "loss": 1.4904, + "step": 1 + }, + { + "epoch": 0.0128, + "grad_norm": 0.9148778845307886, + "learning_rate": 8e-05, + "loss": 1.3615, + "step": 2 + }, + { + "epoch": 0.0192, + "grad_norm": 0.6218126570709165, + "learning_rate": 0.00012, + "loss": 1.2636, + "step": 3 + }, + { + "epoch": 0.0256, + "grad_norm": 1.1733819649415596, + "learning_rate": 0.00016, + "loss": 1.2422, + "step": 4 + }, + { + "epoch": 0.032, + "grad_norm": 0.8961458294612699, + "learning_rate": 0.0002, + "loss": 1.0297, + "step": 5 + }, + { + "epoch": 0.0384, + "grad_norm": 0.7104478974141764, + "learning_rate": 0.0001999783578606323, + "loss": 1.0069, + "step": 6 + }, + { + "epoch": 0.0448, + "grad_norm": 0.4453839306748887, + "learning_rate": 0.0001999134408101731, + "loss": 0.9213, + "step": 7 + }, + { + "epoch": 0.0512, + "grad_norm": 0.4364578353790173, + "learning_rate": 0.00019980527694749952, + "loss": 0.9275, + "step": 8 + }, + { + "epoch": 0.0576, + "grad_norm": 0.46386602430694124, + "learning_rate": 0.0001996539130905593, + "loss": 0.9556, + "step": 9 + }, + { + "epoch": 0.064, + "grad_norm": 0.4390379657400523, + "learning_rate": 0.00019945941475610623, + "loss": 0.925, + "step": 10 + }, + { + "epoch": 0.0704, + "grad_norm": 0.4562302929225185, + "learning_rate": 0.0001992218661313415, + "loss": 0.9569, + "step": 11 + }, + { + "epoch": 0.0768, + "grad_norm": 0.39341154607993345, + "learning_rate": 0.00019894137003747403, + "loss": 0.9187, + "step": 12 + }, + { + "epoch": 0.0832, + "grad_norm": 0.38162793658742256, + "learning_rate": 0.00019861804788521493, + "loss": 0.8549, + "step": 13 + }, + { + "epoch": 0.0896, + "grad_norm": 0.4132867175537064, + "learning_rate": 0.00019825203962222572, + "loss": 0.8685, + "step": 14 + }, + { + "epoch": 0.096, + "grad_norm": 0.40801041892631884, + "learning_rate": 0.00019784350367254322, + "loss": 0.878, + "step": 15 + }, + { + "epoch": 0.1024, + "grad_norm": 0.3705814731127435, + "learning_rate": 0.0001973926168680066, + "loss": 0.8901, + "step": 16 + }, + { + "epoch": 0.1088, + "grad_norm": 0.36330646232631925, + "learning_rate": 0.0001968995743717171, + "loss": 0.8798, + "step": 17 + }, + { + "epoch": 0.1152, + "grad_norm": 0.33080409264465793, + "learning_rate": 0.00019636458959356316, + "loss": 0.8781, + "step": 18 + }, + { + "epoch": 0.1216, + "grad_norm": 0.32818135077853167, + "learning_rate": 0.00019578789409784727, + "loss": 0.8538, + "step": 19 + }, + { + "epoch": 0.128, + "grad_norm": 0.3362079631555025, + "learning_rate": 0.00019516973750305532, + "loss": 0.8387, + "step": 20 + }, + { + "epoch": 0.1344, + "grad_norm": 0.33995378997861403, + "learning_rate": 0.00019451038737381077, + "loss": 0.8369, + "step": 21 + }, + { + "epoch": 0.1408, + "grad_norm": 0.3917103689027093, + "learning_rate": 0.00019381012910506146, + "loss": 0.8547, + "step": 22 + }, + { + "epoch": 0.1472, + "grad_norm": 0.3348872695789784, + "learning_rate": 0.00019306926579854821, + "loss": 0.8289, + "step": 23 + }, + { + "epoch": 0.1536, + "grad_norm": 0.35776545677090854, + "learning_rate": 0.0001922881181316097, + "loss": 0.807, + "step": 24 + }, + { + "epoch": 0.16, + "grad_norm": 0.3334221740476462, + "learning_rate": 0.0001914670242183795, + "loss": 0.8377, + "step": 25 + }, + { + "epoch": 0.1664, + "grad_norm": 0.4039360148046414, + "learning_rate": 0.0001906063394634356, + "loss": 0.8377, + "step": 26 + }, + { + "epoch": 0.1728, + "grad_norm": 0.3214933683269574, + "learning_rate": 0.00018970643640796642, + "loss": 0.8202, + "step": 27 + }, + { + "epoch": 0.1792, + "grad_norm": 0.3609707150993955, + "learning_rate": 0.00018876770456851877, + "loss": 0.8722, + "step": 28 + }, + { + "epoch": 0.1856, + "grad_norm": 0.31292033520753265, + "learning_rate": 0.00018779055026839868, + "loss": 0.7987, + "step": 29 + }, + { + "epoch": 0.192, + "grad_norm": 0.32661815417153717, + "learning_rate": 0.00018677539646179707, + "loss": 0.7988, + "step": 30 + }, + { + "epoch": 0.1984, + "grad_norm": 0.32009062981637354, + "learning_rate": 0.00018572268255071718, + "loss": 0.8287, + "step": 31 + }, + { + "epoch": 0.2048, + "grad_norm": 0.29528010997310794, + "learning_rate": 0.00018463286419478255, + "loss": 0.8102, + "step": 32 + }, + { + "epoch": 0.2112, + "grad_norm": 0.3446237393250682, + "learning_rate": 0.00018350641311400812, + "loss": 0.8627, + "step": 33 + }, + { + "epoch": 0.2176, + "grad_norm": 0.34109389969001735, + "learning_rate": 0.00018234381688461942, + "loss": 0.8496, + "step": 34 + }, + { + "epoch": 0.224, + "grad_norm": 0.3162295461366009, + "learning_rate": 0.00018114557872800905, + "loss": 0.7985, + "step": 35 + }, + { + "epoch": 0.2304, + "grad_norm": 0.35365519160873493, + "learning_rate": 0.0001799122172929206, + "loss": 0.8226, + "step": 36 + }, + { + "epoch": 0.2368, + "grad_norm": 0.33091699840279165, + "learning_rate": 0.0001786442664309554, + "loss": 0.7905, + "step": 37 + }, + { + "epoch": 0.2432, + "grad_norm": 0.3611309922925739, + "learning_rate": 0.0001773422749654988, + "loss": 0.8525, + "step": 38 + }, + { + "epoch": 0.2496, + "grad_norm": 0.3130692668968462, + "learning_rate": 0.00017600680645416583, + "loss": 0.7838, + "step": 39 + }, + { + "epoch": 0.256, + "grad_norm": 0.3078365535973117, + "learning_rate": 0.00017463843894486937, + "loss": 0.8411, + "step": 40 + }, + { + "epoch": 0.2624, + "grad_norm": 0.33350693683260196, + "learning_rate": 0.00017323776472561627, + "loss": 0.8001, + "step": 41 + }, + { + "epoch": 0.2688, + "grad_norm": 0.31163537934096214, + "learning_rate": 0.0001718053900681397, + "loss": 0.7737, + "step": 42 + }, + { + "epoch": 0.2752, + "grad_norm": 0.3408303214049586, + "learning_rate": 0.00017034193496547902, + "loss": 0.8311, + "step": 43 + }, + { + "epoch": 0.2816, + "grad_norm": 0.319324969812128, + "learning_rate": 0.00016884803286362, + "loss": 0.8081, + "step": 44 + }, + { + "epoch": 0.288, + "grad_norm": 0.3296644378182685, + "learning_rate": 0.00016732433038731242, + "loss": 0.788, + "step": 45 + }, + { + "epoch": 0.2944, + "grad_norm": 0.3381874837565884, + "learning_rate": 0.00016577148706018328, + "loss": 0.7745, + "step": 46 + }, + { + "epoch": 0.3008, + "grad_norm": 0.3439330013144162, + "learning_rate": 0.00016419017501926656, + "loss": 0.7663, + "step": 47 + }, + { + "epoch": 0.3072, + "grad_norm": 0.3395715333488213, + "learning_rate": 0.00016258107872407375, + "loss": 0.7516, + "step": 48 + }, + { + "epoch": 0.3136, + "grad_norm": 0.3475199659571279, + "learning_rate": 0.00016094489466033043, + "loss": 0.7809, + "step": 49 + }, + { + "epoch": 0.32, + "grad_norm": 0.3689529758365805, + "learning_rate": 0.0001592823310385073, + "loss": 0.8353, + "step": 50 + }, + { + "epoch": 0.3264, + "grad_norm": 0.32978099329172, + "learning_rate": 0.00015759410748727662, + "loss": 0.8185, + "step": 51 + }, + { + "epoch": 0.3328, + "grad_norm": 0.31695533987483404, + "learning_rate": 0.00015588095474202595, + "loss": 0.7293, + "step": 52 + }, + { + "epoch": 0.3392, + "grad_norm": 0.3002706071495285, + "learning_rate": 0.00015414361432856475, + "loss": 0.7607, + "step": 53 + }, + { + "epoch": 0.3456, + "grad_norm": 0.31072433005235056, + "learning_rate": 0.00015238283824216015, + "loss": 0.7501, + "step": 54 + }, + { + "epoch": 0.352, + "grad_norm": 0.3188597094696028, + "learning_rate": 0.00015059938862204127, + "loss": 0.8266, + "step": 55 + }, + { + "epoch": 0.3584, + "grad_norm": 0.3011529492928554, + "learning_rate": 0.00014879403742151283, + "loss": 0.7653, + "step": 56 + }, + { + "epoch": 0.3648, + "grad_norm": 0.33407894117308795, + "learning_rate": 0.0001469675660738206, + "loss": 0.8106, + "step": 57 + }, + { + "epoch": 0.3712, + "grad_norm": 0.3459865222149195, + "learning_rate": 0.00014512076515391375, + "loss": 0.7883, + "step": 58 + }, + { + "epoch": 0.3776, + "grad_norm": 0.30487947849131897, + "learning_rate": 0.0001432544340362501, + "loss": 0.7963, + "step": 59 + }, + { + "epoch": 0.384, + "grad_norm": 0.31576280319394795, + "learning_rate": 0.00014136938054879283, + "loss": 0.8017, + "step": 60 + }, + { + "epoch": 0.3904, + "grad_norm": 0.2983397534698475, + "learning_rate": 0.00013946642062334766, + "loss": 0.7508, + "step": 61 + }, + { + "epoch": 0.3968, + "grad_norm": 0.3020225486456565, + "learning_rate": 0.000137546377942393, + "loss": 0.7391, + "step": 62 + }, + { + "epoch": 0.4032, + "grad_norm": 0.2991633849034537, + "learning_rate": 0.00013561008358255468, + "loss": 0.7753, + "step": 63 + }, + { + "epoch": 0.4096, + "grad_norm": 0.31060377997959776, + "learning_rate": 0.00013365837565488064, + "loss": 0.7797, + "step": 64 + }, + { + "epoch": 0.416, + "grad_norm": 0.3634309228876997, + "learning_rate": 0.0001316920989420703, + "loss": 0.7659, + "step": 65 + }, + { + "epoch": 0.4224, + "grad_norm": 0.3603104135897214, + "learning_rate": 0.00012971210453281674, + "loss": 0.7476, + "step": 66 + }, + { + "epoch": 0.4288, + "grad_norm": 0.31528714964980065, + "learning_rate": 0.00012771924945341906, + "loss": 0.7504, + "step": 67 + }, + { + "epoch": 0.4352, + "grad_norm": 0.29930350113657345, + "learning_rate": 0.0001257143962968246, + "loss": 0.752, + "step": 68 + }, + { + "epoch": 0.4416, + "grad_norm": 0.289895410892452, + "learning_rate": 0.00012369841284926188, + "loss": 0.7543, + "step": 69 + }, + { + "epoch": 0.448, + "grad_norm": 0.3179723569904245, + "learning_rate": 0.00012167217171462566, + "loss": 0.8121, + "step": 70 + }, + { + "epoch": 0.4544, + "grad_norm": 0.3022004251250754, + "learning_rate": 0.00011963654993677645, + "loss": 0.7608, + "step": 71 + }, + { + "epoch": 0.4608, + "grad_norm": 0.3175969070633314, + "learning_rate": 0.00011759242861991855, + "loss": 0.7423, + "step": 72 + }, + { + "epoch": 0.4672, + "grad_norm": 0.3051849195123186, + "learning_rate": 0.00011554069254722051, + "loss": 0.7585, + "step": 73 + }, + { + "epoch": 0.4736, + "grad_norm": 0.31862225113793446, + "learning_rate": 0.00011348222979784289, + "loss": 0.7477, + "step": 74 + }, + { + "epoch": 0.48, + "grad_norm": 0.3262675703408152, + "learning_rate": 0.00011141793136253986, + "loss": 0.7909, + "step": 75 + }, + { + "epoch": 0.4864, + "grad_norm": 0.28143259427212386, + "learning_rate": 0.000109348690758, + "loss": 0.7668, + "step": 76 + }, + { + "epoch": 0.4928, + "grad_norm": 0.3273677507239454, + "learning_rate": 0.0001072754036400944, + "loss": 0.7539, + "step": 77 + }, + { + "epoch": 0.4992, + "grad_norm": 0.29009024969694625, + "learning_rate": 0.00010519896741619803, + "loss": 0.7765, + "step": 78 + }, + { + "epoch": 0.5056, + "grad_norm": 0.3317167312872358, + "learning_rate": 0.00010312028085675391, + "loss": 0.7575, + "step": 79 + }, + { + "epoch": 0.512, + "grad_norm": 0.30380303061938135, + "learning_rate": 0.00010104024370624644, + "loss": 0.7462, + "step": 80 + }, + { + "epoch": 0.5184, + "grad_norm": 0.2917701370546009, + "learning_rate": 9.895975629375359e-05, + "loss": 0.7377, + "step": 81 + }, + { + "epoch": 0.5248, + "grad_norm": 0.3395467838004645, + "learning_rate": 9.687971914324607e-05, + "loss": 0.773, + "step": 82 + }, + { + "epoch": 0.5312, + "grad_norm": 0.29853428274769916, + "learning_rate": 9.480103258380198e-05, + "loss": 0.7595, + "step": 83 + }, + { + "epoch": 0.5376, + "grad_norm": 0.2945108725430035, + "learning_rate": 9.272459635990562e-05, + "loss": 0.7495, + "step": 84 + }, + { + "epoch": 0.544, + "grad_norm": 0.32639884964875193, + "learning_rate": 9.065130924199998e-05, + "loss": 0.7449, + "step": 85 + }, + { + "epoch": 0.5504, + "grad_norm": 0.33706708071043895, + "learning_rate": 8.858206863746018e-05, + "loss": 0.786, + "step": 86 + }, + { + "epoch": 0.5568, + "grad_norm": 0.3479230154717327, + "learning_rate": 8.651777020215712e-05, + "loss": 0.7484, + "step": 87 + }, + { + "epoch": 0.5632, + "grad_norm": 0.3101170670613661, + "learning_rate": 8.445930745277953e-05, + "loss": 0.7877, + "step": 88 + }, + { + "epoch": 0.5696, + "grad_norm": 0.29316190913483436, + "learning_rate": 8.240757138008149e-05, + "loss": 0.7623, + "step": 89 + }, + { + "epoch": 0.576, + "grad_norm": 0.33768620036386193, + "learning_rate": 8.036345006322359e-05, + "loss": 0.7881, + "step": 90 + }, + { + "epoch": 0.5824, + "grad_norm": 0.2905393513774914, + "learning_rate": 7.832782828537437e-05, + "loss": 0.7145, + "step": 91 + }, + { + "epoch": 0.5888, + "grad_norm": 0.3043993112462114, + "learning_rate": 7.630158715073813e-05, + "loss": 0.7615, + "step": 92 + }, + { + "epoch": 0.5952, + "grad_norm": 0.2834334048638075, + "learning_rate": 7.428560370317542e-05, + "loss": 0.6896, + "step": 93 + }, + { + "epoch": 0.6016, + "grad_norm": 0.30493595189203726, + "learning_rate": 7.228075054658096e-05, + "loss": 0.7528, + "step": 94 + }, + { + "epoch": 0.608, + "grad_norm": 0.3092316178546374, + "learning_rate": 7.028789546718326e-05, + "loss": 0.7499, + "step": 95 + }, + { + "epoch": 0.6144, + "grad_norm": 0.30722136255670696, + "learning_rate": 6.830790105792973e-05, + "loss": 0.7695, + "step": 96 + }, + { + "epoch": 0.6208, + "grad_norm": 0.3083405078773032, + "learning_rate": 6.63416243451194e-05, + "loss": 0.7384, + "step": 97 + }, + { + "epoch": 0.6272, + "grad_norm": 0.3159443481603721, + "learning_rate": 6.43899164174453e-05, + "loss": 0.7514, + "step": 98 + }, + { + "epoch": 0.6336, + "grad_norm": 0.3450221012225604, + "learning_rate": 6.245362205760704e-05, + "loss": 0.7751, + "step": 99 + }, + { + "epoch": 0.64, + "grad_norm": 0.2733089025692975, + "learning_rate": 6.053357937665237e-05, + "loss": 0.7175, + "step": 100 + }, + { + "epoch": 0.6464, + "grad_norm": 0.33594301474514837, + "learning_rate": 5.863061945120719e-05, + "loss": 0.7783, + "step": 101 + }, + { + "epoch": 0.6528, + "grad_norm": 0.3180809001952563, + "learning_rate": 5.6745565963749925e-05, + "loss": 0.7382, + "step": 102 + }, + { + "epoch": 0.6592, + "grad_norm": 0.28901339983137636, + "learning_rate": 5.487923484608629e-05, + "loss": 0.7788, + "step": 103 + }, + { + "epoch": 0.6656, + "grad_norm": 0.31126724419239143, + "learning_rate": 5.3032433926179395e-05, + "loss": 0.7217, + "step": 104 + }, + { + "epoch": 0.672, + "grad_norm": 0.3009274137359368, + "learning_rate": 5.1205962578487155e-05, + "loss": 0.7471, + "step": 105 + }, + { + "epoch": 0.6784, + "grad_norm": 0.31539695546289276, + "learning_rate": 4.940061137795876e-05, + "loss": 0.7997, + "step": 106 + }, + { + "epoch": 0.6848, + "grad_norm": 0.286607187152073, + "learning_rate": 4.761716175783989e-05, + "loss": 0.729, + "step": 107 + }, + { + "epoch": 0.6912, + "grad_norm": 0.3018808665145025, + "learning_rate": 4.585638567143529e-05, + "loss": 0.724, + "step": 108 + }, + { + "epoch": 0.6976, + "grad_norm": 0.3127535971736329, + "learning_rate": 4.411904525797408e-05, + "loss": 0.7455, + "step": 109 + }, + { + "epoch": 0.704, + "grad_norm": 0.26012111785159153, + "learning_rate": 4.240589251272342e-05, + "loss": 0.707, + "step": 110 + }, + { + "epoch": 0.7104, + "grad_norm": 0.3051050713891758, + "learning_rate": 4.071766896149273e-05, + "loss": 0.7723, + "step": 111 + }, + { + "epoch": 0.7168, + "grad_norm": 0.3118804782861242, + "learning_rate": 3.9055105339669595e-05, + "loss": 0.737, + "step": 112 + }, + { + "epoch": 0.7232, + "grad_norm": 0.2737164240705061, + "learning_rate": 3.741892127592625e-05, + "loss": 0.7149, + "step": 113 + }, + { + "epoch": 0.7296, + "grad_norm": 0.3134425223786983, + "learning_rate": 3.580982498073344e-05, + "loss": 0.7469, + "step": 114 + }, + { + "epoch": 0.736, + "grad_norm": 0.2818906727208132, + "learning_rate": 3.422851293981676e-05, + "loss": 0.6867, + "step": 115 + }, + { + "epoch": 0.7424, + "grad_norm": 0.29056613377962315, + "learning_rate": 3.2675669612687565e-05, + "loss": 0.7748, + "step": 116 + }, + { + "epoch": 0.7488, + "grad_norm": 0.2972352441395284, + "learning_rate": 3.115196713638e-05, + "loss": 0.7007, + "step": 117 + }, + { + "epoch": 0.7552, + "grad_norm": 0.30985546794945074, + "learning_rate": 2.9658065034520978e-05, + "loss": 0.7526, + "step": 118 + }, + { + "epoch": 0.7616, + "grad_norm": 0.34058599695364866, + "learning_rate": 2.8194609931860316e-05, + "loss": 0.7624, + "step": 119 + }, + { + "epoch": 0.768, + "grad_norm": 0.30900205953471865, + "learning_rate": 2.6762235274383772e-05, + "loss": 0.7718, + "step": 120 + }, + { + "epoch": 0.7744, + "grad_norm": 0.38846582883643044, + "learning_rate": 2.536156105513062e-05, + "loss": 0.761, + "step": 121 + }, + { + "epoch": 0.7808, + "grad_norm": 0.2900413609414705, + "learning_rate": 2.399319354583418e-05, + "loss": 0.7262, + "step": 122 + }, + { + "epoch": 0.7872, + "grad_norm": 0.34617299231482673, + "learning_rate": 2.265772503450122e-05, + "loss": 0.7707, + "step": 123 + }, + { + "epoch": 0.7936, + "grad_norm": 0.31898208311746445, + "learning_rate": 2.1355733569044635e-05, + "loss": 0.7352, + "step": 124 + }, + { + "epoch": 0.8, + "grad_norm": 0.27769781138534744, + "learning_rate": 2.008778270707944e-05, + "loss": 0.7545, + "step": 125 + }, + { + "epoch": 0.8064, + "grad_norm": 0.28363617441671607, + "learning_rate": 1.8854421271990964e-05, + "loss": 0.728, + "step": 126 + }, + { + "epoch": 0.8128, + "grad_norm": 0.302373451841636, + "learning_rate": 1.7656183115380577e-05, + "loss": 0.7503, + "step": 127 + }, + { + "epoch": 0.8192, + "grad_norm": 0.3271045216797624, + "learning_rate": 1.649358688599191e-05, + "loss": 0.7423, + "step": 128 + }, + { + "epoch": 0.8256, + "grad_norm": 0.2919143721730832, + "learning_rate": 1.5367135805217458e-05, + "loss": 0.7086, + "step": 129 + }, + { + "epoch": 0.832, + "grad_norm": 0.2983360606395803, + "learning_rate": 1.4277317449282834e-05, + "loss": 0.756, + "step": 130 + }, + { + "epoch": 0.8384, + "grad_norm": 0.2982372679255095, + "learning_rate": 1.3224603538202929e-05, + "loss": 0.7521, + "step": 131 + }, + { + "epoch": 0.8448, + "grad_norm": 0.29307631961647174, + "learning_rate": 1.220944973160133e-05, + "loss": 0.7018, + "step": 132 + }, + { + "epoch": 0.8512, + "grad_norm": 0.40391346833649705, + "learning_rate": 1.1232295431481222e-05, + "loss": 0.74, + "step": 133 + }, + { + "epoch": 0.8576, + "grad_norm": 0.32945897327262835, + "learning_rate": 1.0293563592033595e-05, + "loss": 0.7246, + "step": 134 + }, + { + "epoch": 0.864, + "grad_norm": 0.2949428002691319, + "learning_rate": 9.393660536564408e-06, + "loss": 0.7463, + "step": 135 + }, + { + "epoch": 0.8704, + "grad_norm": 0.30101875454180504, + "learning_rate": 8.532975781620512e-06, + "loss": 0.7394, + "step": 136 + }, + { + "epoch": 0.8768, + "grad_norm": 0.32082387098249604, + "learning_rate": 7.711881868390291e-06, + "loss": 0.7855, + "step": 137 + }, + { + "epoch": 0.8832, + "grad_norm": 0.3086004908294671, + "learning_rate": 6.930734201451816e-06, + "loss": 0.7121, + "step": 138 + }, + { + "epoch": 0.8896, + "grad_norm": 0.30293332836986925, + "learning_rate": 6.189870894938587e-06, + "loss": 0.6969, + "step": 139 + }, + { + "epoch": 0.896, + "grad_norm": 0.31335455383145533, + "learning_rate": 5.489612626189245e-06, + "loss": 0.7462, + "step": 140 + }, + { + "epoch": 0.9024, + "grad_norm": 0.29406534419663677, + "learning_rate": 4.830262496944693e-06, + "loss": 0.7404, + "step": 141 + }, + { + "epoch": 0.9088, + "grad_norm": 0.28288570790135903, + "learning_rate": 4.21210590215273e-06, + "loss": 0.7154, + "step": 142 + }, + { + "epoch": 0.9152, + "grad_norm": 0.34788707135985963, + "learning_rate": 3.6354104064368566e-06, + "loss": 0.7389, + "step": 143 + }, + { + "epoch": 0.9216, + "grad_norm": 0.290633490117723, + "learning_rate": 3.100425628282899e-06, + "loss": 0.732, + "step": 144 + }, + { + "epoch": 0.928, + "grad_norm": 0.27377340408363693, + "learning_rate": 2.607383131993424e-06, + "loss": 0.7141, + "step": 145 + }, + { + "epoch": 0.9344, + "grad_norm": 0.32626292933455625, + "learning_rate": 2.1564963274568027e-06, + "loss": 0.7551, + "step": 146 + }, + { + "epoch": 0.9408, + "grad_norm": 0.32123614225602803, + "learning_rate": 1.7479603777742938e-06, + "loss": 0.7626, + "step": 147 + }, + { + "epoch": 0.9472, + "grad_norm": 0.35821692271121813, + "learning_rate": 1.3819521147851123e-06, + "loss": 0.7313, + "step": 148 + }, + { + "epoch": 0.9536, + "grad_norm": 0.3958937218979211, + "learning_rate": 1.05862996252597e-06, + "loss": 0.7032, + "step": 149 + }, + { + "epoch": 0.96, + "grad_norm": 0.2967738869621842, + "learning_rate": 7.781338686584927e-07, + "loss": 0.7575, + "step": 150 + }, + { + "epoch": 0.9664, + "grad_norm": 0.4158348977915751, + "learning_rate": 5.405852438937764e-07, + "loss": 0.6813, + "step": 151 + }, + { + "epoch": 0.9728, + "grad_norm": 0.31372988976948457, + "learning_rate": 3.4608690944071263e-07, + "loss": 0.6922, + "step": 152 + }, + { + "epoch": 0.9792, + "grad_norm": 0.2804679598419863, + "learning_rate": 1.947230525005006e-07, + "loss": 0.7026, + "step": 153 + }, + { + "epoch": 0.9856, + "grad_norm": 0.30239885095076025, + "learning_rate": 8.655918982689581e-08, + "loss": 0.7268, + "step": 154 + }, + { + "epoch": 0.992, + "grad_norm": 0.2686050777026808, + "learning_rate": 2.164213936770576e-08, + "loss": 0.6936, + "step": 155 + }, + { + "epoch": 0.9984, + "grad_norm": 0.308271729218658, + "learning_rate": 0.0, + "loss": 0.7043, + "step": 156 + }, + { + "epoch": 0.9984, + "step": 156, + "total_flos": 399576021729280.0, + "train_loss": 0.7931731194257736, + "train_runtime": 4808.9782, + "train_samples_per_second": 1.04, + "train_steps_per_second": 0.032 + } + ], + "logging_steps": 1.0, + "max_steps": 156, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 399576021729280.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_2_epochs_1_GA_2_lora/README.md b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_2_epochs_1_GA_2_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_2_epochs_1_GA_2_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_2_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_2_epochs_1_GA_2_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..5720738f4c1dd88ecb68abd6bf7f14dc39f5a1c3 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_2_epochs_1_GA_2_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "q_proj", + "gate_proj", + "up_proj", + "v_proj", + "down_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7ff46212d77e39e1ba1c94a12a7085c628ae8ae7 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e44ee1bdeda95c7635baa01aa6d1e96db390fb5a146b26a960dad923d98a0c95 +size 671150064 diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_2_epochs_1_GA_2_lora/config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_2_epochs_1_GA_2_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_2_epochs_1_GA_2_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..c05c6912e7deaa9e4327a23815a617bbf2e863f5 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6368aa85e21905e872b8c52af77664941a9fa812c94755bee30265b2664e7155 +size 918507402 diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_2_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_2_epochs_1_GA_2_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e47681ad3379893b82bbcaf7e204bbe3c93ad6e0 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_2_epochs_1_GA_2_lora/trainer_state.json @@ -0,0 +1,2226 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9984, + "eval_steps": 500, + "global_step": 312, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0032, + "grad_norm": 0.9396278575656405, + "learning_rate": 2e-05, + "loss": 1.3595, + "step": 1 + }, + { + "epoch": 0.0064, + "grad_norm": 1.145754195917076, + "learning_rate": 4e-05, + "loss": 1.5312, + "step": 2 + }, + { + "epoch": 0.0096, + "grad_norm": 0.9101096939447241, + "learning_rate": 6e-05, + "loss": 1.3688, + "step": 3 + }, + { + "epoch": 0.0128, + "grad_norm": 0.9044070523413997, + "learning_rate": 8e-05, + "loss": 1.4315, + "step": 4 + }, + { + "epoch": 0.016, + "grad_norm": 0.9366614139356674, + "learning_rate": 0.0001, + "loss": 1.2066, + "step": 5 + }, + { + "epoch": 0.0192, + "grad_norm": 1.697160557893367, + "learning_rate": 0.00012, + "loss": 1.1375, + "step": 6 + }, + { + "epoch": 0.0224, + "grad_norm": 0.8327698812333196, + "learning_rate": 0.00014, + "loss": 0.9662, + "step": 7 + }, + { + "epoch": 0.0256, + "grad_norm": 0.8262069347243871, + "learning_rate": 0.00016, + "loss": 1.0, + "step": 8 + }, + { + "epoch": 0.0288, + "grad_norm": 0.6074353633567875, + "learning_rate": 0.00018, + "loss": 0.8859, + "step": 9 + }, + { + "epoch": 0.032, + "grad_norm": 0.5809210582543565, + "learning_rate": 0.0002, + "loss": 0.9082, + "step": 10 + }, + { + "epoch": 0.0352, + "grad_norm": 0.5735256089738234, + "learning_rate": 0.00019999458931878073, + "loss": 0.9748, + "step": 11 + }, + { + "epoch": 0.0384, + "grad_norm": 0.5052310574234449, + "learning_rate": 0.0001999783578606323, + "loss": 0.8814, + "step": 12 + }, + { + "epoch": 0.0416, + "grad_norm": 0.5737885547299703, + "learning_rate": 0.00019995130738201966, + "loss": 0.8842, + "step": 13 + }, + { + "epoch": 0.0448, + "grad_norm": 0.5237047240583436, + "learning_rate": 0.0001999134408101731, + "loss": 0.8155, + "step": 14 + }, + { + "epoch": 0.048, + "grad_norm": 0.626094153559293, + "learning_rate": 0.00019986476224277165, + "loss": 0.8704, + "step": 15 + }, + { + "epoch": 0.0512, + "grad_norm": 0.49669603205928, + "learning_rate": 0.00019980527694749952, + "loss": 0.8471, + "step": 16 + }, + { + "epoch": 0.0544, + "grad_norm": 0.5730289800881923, + "learning_rate": 0.00019973499136147606, + "loss": 0.9009, + "step": 17 + }, + { + "epoch": 0.0576, + "grad_norm": 0.523720158885521, + "learning_rate": 0.0001996539130905593, + "loss": 0.9077, + "step": 18 + }, + { + "epoch": 0.0608, + "grad_norm": 0.5260323034094564, + "learning_rate": 0.0001995620509085228, + "loss": 0.9066, + "step": 19 + }, + { + "epoch": 0.064, + "grad_norm": 0.4652749578063743, + "learning_rate": 0.00019945941475610623, + "loss": 0.8705, + "step": 20 + }, + { + "epoch": 0.0672, + "grad_norm": 0.5261765722589641, + "learning_rate": 0.0001993460157399396, + "loss": 0.879, + "step": 21 + }, + { + "epoch": 0.0704, + "grad_norm": 0.5450535617021041, + "learning_rate": 0.0001992218661313415, + "loss": 0.9356, + "step": 22 + }, + { + "epoch": 0.0736, + "grad_norm": 0.44327870329129865, + "learning_rate": 0.00019908697936499103, + "loss": 0.8761, + "step": 23 + }, + { + "epoch": 0.0768, + "grad_norm": 0.4694133788580677, + "learning_rate": 0.00019894137003747403, + "loss": 0.7869, + "step": 24 + }, + { + "epoch": 0.08, + "grad_norm": 0.527008252484748, + "learning_rate": 0.00019878505390570362, + "loss": 0.9204, + "step": 25 + }, + { + "epoch": 0.0832, + "grad_norm": 0.4701252432281297, + "learning_rate": 0.00019861804788521493, + "loss": 0.8754, + "step": 26 + }, + { + "epoch": 0.0864, + "grad_norm": 0.48583022123196745, + "learning_rate": 0.00019844037004833473, + "loss": 0.8034, + "step": 27 + }, + { + "epoch": 0.0896, + "grad_norm": 0.4835202635149471, + "learning_rate": 0.00019825203962222572, + "loss": 0.8875, + "step": 28 + }, + { + "epoch": 0.0928, + "grad_norm": 0.506312540888068, + "learning_rate": 0.0001980530769868059, + "loss": 0.8465, + "step": 29 + }, + { + "epoch": 0.096, + "grad_norm": 0.4340404299246127, + "learning_rate": 0.00019784350367254322, + "loss": 0.8459, + "step": 30 + }, + { + "epoch": 0.0992, + "grad_norm": 0.4500528389649417, + "learning_rate": 0.0001976233423581255, + "loss": 0.8574, + "step": 31 + }, + { + "epoch": 0.1024, + "grad_norm": 0.4257338464346157, + "learning_rate": 0.0001973926168680066, + "loss": 0.8621, + "step": 32 + }, + { + "epoch": 0.1056, + "grad_norm": 0.4373720446193359, + "learning_rate": 0.00019715135216982798, + "loss": 0.8511, + "step": 33 + }, + { + "epoch": 0.1088, + "grad_norm": 0.48306945420553293, + "learning_rate": 0.0001968995743717171, + "loss": 0.8102, + "step": 34 + }, + { + "epoch": 0.112, + "grad_norm": 0.4581716737765401, + "learning_rate": 0.00019663731071946206, + "loss": 0.9058, + "step": 35 + }, + { + "epoch": 0.1152, + "grad_norm": 0.47027795850992604, + "learning_rate": 0.00019636458959356316, + "loss": 0.8383, + "step": 36 + }, + { + "epoch": 0.1184, + "grad_norm": 0.4759605551389717, + "learning_rate": 0.0001960814405061619, + "loss": 0.832, + "step": 37 + }, + { + "epoch": 0.1216, + "grad_norm": 0.44489231449356653, + "learning_rate": 0.00019578789409784727, + "loss": 0.8074, + "step": 38 + }, + { + "epoch": 0.1248, + "grad_norm": 0.41453774455730197, + "learning_rate": 0.00019548398213434007, + "loss": 0.7913, + "step": 39 + }, + { + "epoch": 0.128, + "grad_norm": 0.44096292259798536, + "learning_rate": 0.00019516973750305532, + "loss": 0.8537, + "step": 40 + }, + { + "epoch": 0.1312, + "grad_norm": 0.4578671860720513, + "learning_rate": 0.00019484519420954354, + "loss": 0.7987, + "step": 41 + }, + { + "epoch": 0.1344, + "grad_norm": 0.456394506352762, + "learning_rate": 0.00019451038737381077, + "loss": 0.9166, + "step": 42 + }, + { + "epoch": 0.1376, + "grad_norm": 0.5715403594135529, + "learning_rate": 0.00019416535322651818, + "loss": 0.8748, + "step": 43 + }, + { + "epoch": 0.1408, + "grad_norm": 0.45471408830104054, + "learning_rate": 0.00019381012910506146, + "loss": 0.8128, + "step": 44 + }, + { + "epoch": 0.144, + "grad_norm": 0.4150435625868661, + "learning_rate": 0.00019344475344953012, + "loss": 0.8093, + "step": 45 + }, + { + "epoch": 0.1472, + "grad_norm": 0.4128857957966661, + "learning_rate": 0.00019306926579854821, + "loss": 0.8009, + "step": 46 + }, + { + "epoch": 0.1504, + "grad_norm": 0.42300454464386417, + "learning_rate": 0.00019268370678499533, + "loss": 0.8088, + "step": 47 + }, + { + "epoch": 0.1536, + "grad_norm": 0.4909389370245195, + "learning_rate": 0.0001922881181316097, + "loss": 0.8211, + "step": 48 + }, + { + "epoch": 0.1568, + "grad_norm": 0.4204437825519599, + "learning_rate": 0.00019188254264647337, + "loss": 0.836, + "step": 49 + }, + { + "epoch": 0.16, + "grad_norm": 0.41517854198610105, + "learning_rate": 0.0001914670242183795, + "loss": 0.7663, + "step": 50 + }, + { + "epoch": 0.1632, + "grad_norm": 0.4126379219703116, + "learning_rate": 0.0001910416078120832, + "loss": 0.8117, + "step": 51 + }, + { + "epoch": 0.1664, + "grad_norm": 0.3995468225425603, + "learning_rate": 0.0001906063394634356, + "loss": 0.8237, + "step": 52 + }, + { + "epoch": 0.1696, + "grad_norm": 0.44308231794851455, + "learning_rate": 0.00019016126627440237, + "loss": 0.8533, + "step": 53 + }, + { + "epoch": 0.1728, + "grad_norm": 0.3933044567365795, + "learning_rate": 0.00018970643640796642, + "loss": 0.7194, + "step": 54 + }, + { + "epoch": 0.176, + "grad_norm": 0.42377677279686027, + "learning_rate": 0.000189241899082916, + "loss": 0.791, + "step": 55 + }, + { + "epoch": 0.1792, + "grad_norm": 0.44100566311913353, + "learning_rate": 0.00018876770456851877, + "loss": 0.807, + "step": 56 + }, + { + "epoch": 0.1824, + "grad_norm": 0.42240855138098576, + "learning_rate": 0.0001882839041790818, + "loss": 0.8092, + "step": 57 + }, + { + "epoch": 0.1856, + "grad_norm": 0.44453501976497334, + "learning_rate": 0.00018779055026839868, + "loss": 0.741, + "step": 58 + }, + { + "epoch": 0.1888, + "grad_norm": 0.4585697459896131, + "learning_rate": 0.00018728769622408423, + "loss": 0.7875, + "step": 59 + }, + { + "epoch": 0.192, + "grad_norm": 0.49505450824682223, + "learning_rate": 0.00018677539646179707, + "loss": 0.9184, + "step": 60 + }, + { + "epoch": 0.1952, + "grad_norm": 0.44646042397781077, + "learning_rate": 0.00018625370641935129, + "loss": 0.8436, + "step": 61 + }, + { + "epoch": 0.1984, + "grad_norm": 0.44598647925458745, + "learning_rate": 0.00018572268255071718, + "loss": 0.7649, + "step": 62 + }, + { + "epoch": 0.2016, + "grad_norm": 0.4214221216363781, + "learning_rate": 0.00018518238231991218, + "loss": 0.7784, + "step": 63 + }, + { + "epoch": 0.2048, + "grad_norm": 0.45227184298961903, + "learning_rate": 0.00018463286419478255, + "loss": 0.8158, + "step": 64 + }, + { + "epoch": 0.208, + "grad_norm": 0.4040848091911473, + "learning_rate": 0.00018407418764067627, + "loss": 0.8471, + "step": 65 + }, + { + "epoch": 0.2112, + "grad_norm": 0.4687728839581517, + "learning_rate": 0.00018350641311400812, + "loss": 0.7811, + "step": 66 + }, + { + "epoch": 0.2144, + "grad_norm": 0.40027728864449313, + "learning_rate": 0.0001829296020557174, + "loss": 0.7781, + "step": 67 + }, + { + "epoch": 0.2176, + "grad_norm": 0.44021524315744653, + "learning_rate": 0.00018234381688461942, + "loss": 0.7774, + "step": 68 + }, + { + "epoch": 0.2208, + "grad_norm": 0.4053592172372432, + "learning_rate": 0.0001817491209906506, + "loss": 0.7382, + "step": 69 + }, + { + "epoch": 0.224, + "grad_norm": 0.4064108947924071, + "learning_rate": 0.00018114557872800905, + "loss": 0.778, + "step": 70 + }, + { + "epoch": 0.2272, + "grad_norm": 0.4573478740667755, + "learning_rate": 0.00018053325540819045, + "loss": 0.8036, + "step": 71 + }, + { + "epoch": 0.2304, + "grad_norm": 0.43521713254634253, + "learning_rate": 0.0001799122172929206, + "loss": 0.7037, + "step": 72 + }, + { + "epoch": 0.2336, + "grad_norm": 0.4872643814124734, + "learning_rate": 0.00017928253158698473, + "loss": 0.8449, + "step": 73 + }, + { + "epoch": 0.2368, + "grad_norm": 0.5829514491611524, + "learning_rate": 0.0001786442664309554, + "loss": 0.9214, + "step": 74 + }, + { + "epoch": 0.24, + "grad_norm": 0.4632547941021662, + "learning_rate": 0.0001779974908938184, + "loss": 0.8129, + "step": 75 + }, + { + "epoch": 0.2432, + "grad_norm": 0.4094238985943658, + "learning_rate": 0.0001773422749654988, + "loss": 0.8073, + "step": 76 + }, + { + "epoch": 0.2464, + "grad_norm": 0.39371642360909304, + "learning_rate": 0.00017667868954928694, + "loss": 0.7885, + "step": 77 + }, + { + "epoch": 0.2496, + "grad_norm": 0.5496513465587326, + "learning_rate": 0.00017600680645416583, + "loss": 0.848, + "step": 78 + }, + { + "epoch": 0.2528, + "grad_norm": 0.5168778359980416, + "learning_rate": 0.00017532669838704035, + "loss": 0.7576, + "step": 79 + }, + { + "epoch": 0.256, + "grad_norm": 0.49770210847776, + "learning_rate": 0.00017463843894486937, + "loss": 0.7872, + "step": 80 + }, + { + "epoch": 0.2592, + "grad_norm": 0.4768471023589497, + "learning_rate": 0.0001739421026067017, + "loss": 0.9025, + "step": 81 + }, + { + "epoch": 0.2624, + "grad_norm": 0.46799804884228635, + "learning_rate": 0.00017323776472561627, + "loss": 0.7977, + "step": 82 + }, + { + "epoch": 0.2656, + "grad_norm": 0.6111257397554586, + "learning_rate": 0.00017252550152056795, + "loss": 0.8315, + "step": 83 + }, + { + "epoch": 0.2688, + "grad_norm": 0.48243384372342013, + "learning_rate": 0.0001718053900681397, + "loss": 0.7998, + "step": 84 + }, + { + "epoch": 0.272, + "grad_norm": 0.44643797481247555, + "learning_rate": 0.00017107750829420176, + "loss": 0.8077, + "step": 85 + }, + { + "epoch": 0.2752, + "grad_norm": 0.5199204456546236, + "learning_rate": 0.00017034193496547902, + "loss": 0.8154, + "step": 86 + }, + { + "epoch": 0.2784, + "grad_norm": 0.48964963393648087, + "learning_rate": 0.00016959874968102735, + "loss": 0.8594, + "step": 87 + }, + { + "epoch": 0.2816, + "grad_norm": 0.4378723113342222, + "learning_rate": 0.00016884803286362, + "loss": 0.7646, + "step": 88 + }, + { + "epoch": 0.2848, + "grad_norm": 0.43319531220408636, + "learning_rate": 0.00016808986575104465, + "loss": 0.8159, + "step": 89 + }, + { + "epoch": 0.288, + "grad_norm": 0.5565230031288813, + "learning_rate": 0.00016732433038731242, + "loss": 0.8249, + "step": 90 + }, + { + "epoch": 0.2912, + "grad_norm": 0.4749038313521005, + "learning_rate": 0.0001665515096137797, + "loss": 0.8222, + "step": 91 + }, + { + "epoch": 0.2944, + "grad_norm": 0.4413848996413543, + "learning_rate": 0.00016577148706018328, + "loss": 0.8223, + "step": 92 + }, + { + "epoch": 0.2976, + "grad_norm": 0.5162310450089371, + "learning_rate": 0.00016498434713559088, + "loss": 0.9414, + "step": 93 + }, + { + "epoch": 0.3008, + "grad_norm": 0.477965982604003, + "learning_rate": 0.00016419017501926656, + "loss": 0.8763, + "step": 94 + }, + { + "epoch": 0.304, + "grad_norm": 0.43195358779919846, + "learning_rate": 0.0001633890566514535, + "loss": 0.7583, + "step": 95 + }, + { + "epoch": 0.3072, + "grad_norm": 0.4856603711240389, + "learning_rate": 0.00016258107872407375, + "loss": 0.8111, + "step": 96 + }, + { + "epoch": 0.3104, + "grad_norm": 0.45508732740535235, + "learning_rate": 0.0001617663286713474, + "loss": 0.7746, + "step": 97 + }, + { + "epoch": 0.3136, + "grad_norm": 0.43409683925481063, + "learning_rate": 0.00016094489466033043, + "loss": 0.7758, + "step": 98 + }, + { + "epoch": 0.3168, + "grad_norm": 0.4089576078302274, + "learning_rate": 0.00016011686558137448, + "loss": 0.7664, + "step": 99 + }, + { + "epoch": 0.32, + "grad_norm": 0.4572742935111086, + "learning_rate": 0.0001592823310385073, + "loss": 0.8346, + "step": 100 + }, + { + "epoch": 0.3232, + "grad_norm": 0.4165602802496886, + "learning_rate": 0.0001584413813397364, + "loss": 0.7574, + "step": 101 + }, + { + "epoch": 0.3264, + "grad_norm": 0.42531464700157917, + "learning_rate": 0.00015759410748727662, + "loss": 0.7713, + "step": 102 + }, + { + "epoch": 0.3296, + "grad_norm": 0.4189404767776771, + "learning_rate": 0.00015674060116770236, + "loss": 0.7765, + "step": 103 + }, + { + "epoch": 0.3328, + "grad_norm": 0.41752852415604136, + "learning_rate": 0.00015588095474202595, + "loss": 0.6862, + "step": 104 + }, + { + "epoch": 0.336, + "grad_norm": 0.48896299612656, + "learning_rate": 0.00015501526123570277, + "loss": 0.7441, + "step": 105 + }, + { + "epoch": 0.3392, + "grad_norm": 0.5274334652372455, + "learning_rate": 0.00015414361432856475, + "loss": 0.8666, + "step": 106 + }, + { + "epoch": 0.3424, + "grad_norm": 0.47976529095518916, + "learning_rate": 0.0001532661083446829, + "loss": 0.8523, + "step": 107 + }, + { + "epoch": 0.3456, + "grad_norm": 0.47384189744054706, + "learning_rate": 0.00015238283824216015, + "loss": 0.8258, + "step": 108 + }, + { + "epoch": 0.3488, + "grad_norm": 0.4663707910797713, + "learning_rate": 0.00015149389960285558, + "loss": 0.8216, + "step": 109 + }, + { + "epoch": 0.352, + "grad_norm": 0.40466388928952995, + "learning_rate": 0.00015059938862204127, + "loss": 0.6917, + "step": 110 + }, + { + "epoch": 0.3552, + "grad_norm": 0.4045145628766846, + "learning_rate": 0.00014969940209799248, + "loss": 0.787, + "step": 111 + }, + { + "epoch": 0.3584, + "grad_norm": 0.40223285013388077, + "learning_rate": 0.00014879403742151283, + "loss": 0.7167, + "step": 112 + }, + { + "epoch": 0.3616, + "grad_norm": 0.39586520973460043, + "learning_rate": 0.00014788339256539544, + "loss": 0.7342, + "step": 113 + }, + { + "epoch": 0.3648, + "grad_norm": 0.515213255581294, + "learning_rate": 0.0001469675660738206, + "loss": 0.8779, + "step": 114 + }, + { + "epoch": 0.368, + "grad_norm": 0.49916465611083227, + "learning_rate": 0.00014604665705169237, + "loss": 0.737, + "step": 115 + }, + { + "epoch": 0.3712, + "grad_norm": 0.5297401665467757, + "learning_rate": 0.00014512076515391375, + "loss": 0.8885, + "step": 116 + }, + { + "epoch": 0.3744, + "grad_norm": 0.48401564617054504, + "learning_rate": 0.00014418999057460276, + "loss": 0.7736, + "step": 117 + }, + { + "epoch": 0.3776, + "grad_norm": 0.5318711299830755, + "learning_rate": 0.0001432544340362501, + "loss": 0.8255, + "step": 118 + }, + { + "epoch": 0.3808, + "grad_norm": 0.5203546383396569, + "learning_rate": 0.00014231419677881966, + "loss": 0.8579, + "step": 119 + }, + { + "epoch": 0.384, + "grad_norm": 0.4466955402618473, + "learning_rate": 0.00014136938054879283, + "loss": 0.8372, + "step": 120 + }, + { + "epoch": 0.3872, + "grad_norm": 0.43889266882348626, + "learning_rate": 0.00014042008758815818, + "loss": 0.8065, + "step": 121 + }, + { + "epoch": 0.3904, + "grad_norm": 0.4686283414276429, + "learning_rate": 0.00013946642062334766, + "loss": 0.7721, + "step": 122 + }, + { + "epoch": 0.3936, + "grad_norm": 0.46181916173875265, + "learning_rate": 0.00013850848285411994, + "loss": 0.8308, + "step": 123 + }, + { + "epoch": 0.3968, + "grad_norm": 1.1664688148777886, + "learning_rate": 0.000137546377942393, + "loss": 0.8059, + "step": 124 + }, + { + "epoch": 0.4, + "grad_norm": 0.43732852103952685, + "learning_rate": 0.00013658021000102636, + "loss": 0.858, + "step": 125 + }, + { + "epoch": 0.4032, + "grad_norm": 0.4400476906570904, + "learning_rate": 0.00013561008358255468, + "loss": 0.8223, + "step": 126 + }, + { + "epoch": 0.4064, + "grad_norm": 0.37050483573414034, + "learning_rate": 0.00013463610366787392, + "loss": 0.714, + "step": 127 + }, + { + "epoch": 0.4096, + "grad_norm": 0.45862151205497165, + "learning_rate": 0.00013365837565488064, + "loss": 0.7781, + "step": 128 + }, + { + "epoch": 0.4128, + "grad_norm": 0.4442232869655172, + "learning_rate": 0.0001326770053470668, + "loss": 0.8805, + "step": 129 + }, + { + "epoch": 0.416, + "grad_norm": 0.43746776301879925, + "learning_rate": 0.0001316920989420703, + "loss": 0.7322, + "step": 130 + }, + { + "epoch": 0.4192, + "grad_norm": 0.4886796303948087, + "learning_rate": 0.00013070376302018287, + "loss": 0.8185, + "step": 131 + }, + { + "epoch": 0.4224, + "grad_norm": 0.45164193050243195, + "learning_rate": 0.00012971210453281674, + "loss": 0.7738, + "step": 132 + }, + { + "epoch": 0.4256, + "grad_norm": 0.46359406856527713, + "learning_rate": 0.000128717230790931, + "loss": 0.7534, + "step": 133 + }, + { + "epoch": 0.4288, + "grad_norm": 0.47446632746929185, + "learning_rate": 0.00012771924945341906, + "loss": 0.7351, + "step": 134 + }, + { + "epoch": 0.432, + "grad_norm": 0.41229097641545315, + "learning_rate": 0.00012671826851545851, + "loss": 0.7103, + "step": 135 + }, + { + "epoch": 0.4352, + "grad_norm": 0.44348387315275645, + "learning_rate": 0.0001257143962968246, + "loss": 0.7993, + "step": 136 + }, + { + "epoch": 0.4384, + "grad_norm": 0.49196078833834966, + "learning_rate": 0.00012470774143016853, + "loss": 0.7868, + "step": 137 + }, + { + "epoch": 0.4416, + "grad_norm": 0.4338579023775582, + "learning_rate": 0.00012369841284926188, + "loss": 0.8031, + "step": 138 + }, + { + "epoch": 0.4448, + "grad_norm": 0.6994487653762459, + "learning_rate": 0.00012268651977720866, + "loss": 0.8465, + "step": 139 + }, + { + "epoch": 0.448, + "grad_norm": 0.559336834442375, + "learning_rate": 0.00012167217171462566, + "loss": 0.8581, + "step": 140 + }, + { + "epoch": 0.4512, + "grad_norm": 0.3917694435086478, + "learning_rate": 0.0001206554784277931, + "loss": 0.7599, + "step": 141 + }, + { + "epoch": 0.4544, + "grad_norm": 0.3764541665137546, + "learning_rate": 0.00011963654993677645, + "loss": 0.6855, + "step": 142 + }, + { + "epoch": 0.4576, + "grad_norm": 0.3824503642860104, + "learning_rate": 0.00011861549650352069, + "loss": 0.7803, + "step": 143 + }, + { + "epoch": 0.4608, + "grad_norm": 0.44439215452009584, + "learning_rate": 0.00011759242861991855, + "loss": 0.7088, + "step": 144 + }, + { + "epoch": 0.464, + "grad_norm": 0.4221325726629454, + "learning_rate": 0.00011656745699585371, + "loss": 0.7315, + "step": 145 + }, + { + "epoch": 0.4672, + "grad_norm": 0.41321303437913515, + "learning_rate": 0.00011554069254722051, + "loss": 0.8322, + "step": 146 + }, + { + "epoch": 0.4704, + "grad_norm": 0.4121968181469142, + "learning_rate": 0.00011451224638392129, + "loss": 0.7856, + "step": 147 + }, + { + "epoch": 0.4736, + "grad_norm": 0.40550187666587395, + "learning_rate": 0.00011348222979784289, + "loss": 0.7344, + "step": 148 + }, + { + "epoch": 0.4768, + "grad_norm": 0.4087353652660321, + "learning_rate": 0.00011245075425081328, + "loss": 0.7539, + "step": 149 + }, + { + "epoch": 0.48, + "grad_norm": 0.4345196746937004, + "learning_rate": 0.00011141793136253986, + "loss": 0.7812, + "step": 150 + }, + { + "epoch": 0.4832, + "grad_norm": 0.433613477877486, + "learning_rate": 0.0001103838728985307, + "loss": 0.8072, + "step": 151 + }, + { + "epoch": 0.4864, + "grad_norm": 0.4060850663863163, + "learning_rate": 0.000109348690758, + "loss": 0.7538, + "step": 152 + }, + { + "epoch": 0.4896, + "grad_norm": 0.4823324231748682, + "learning_rate": 0.00010831249696175918, + "loss": 0.7305, + "step": 153 + }, + { + "epoch": 0.4928, + "grad_norm": 0.3634968717832757, + "learning_rate": 0.0001072754036400944, + "loss": 0.754, + "step": 154 + }, + { + "epoch": 0.496, + "grad_norm": 0.5342361721517209, + "learning_rate": 0.00010623752302063283, + "loss": 0.7521, + "step": 155 + }, + { + "epoch": 0.4992, + "grad_norm": 0.4128252327373045, + "learning_rate": 0.00010519896741619803, + "loss": 0.7943, + "step": 156 + }, + { + "epoch": 0.5024, + "grad_norm": 0.3564308128607441, + "learning_rate": 0.00010415984921265609, + "loss": 0.6861, + "step": 157 + }, + { + "epoch": 0.5056, + "grad_norm": 0.36480024633628716, + "learning_rate": 0.00010312028085675391, + "loss": 0.7548, + "step": 158 + }, + { + "epoch": 0.5088, + "grad_norm": 0.4304158686063223, + "learning_rate": 0.00010208037484395114, + "loss": 0.6694, + "step": 159 + }, + { + "epoch": 0.512, + "grad_norm": 0.4179596357660802, + "learning_rate": 0.00010104024370624644, + "loss": 0.7384, + "step": 160 + }, + { + "epoch": 0.5152, + "grad_norm": 0.4450104509243802, + "learning_rate": 0.0001, + "loss": 0.7904, + "step": 161 + }, + { + "epoch": 0.5184, + "grad_norm": 0.41911222525899255, + "learning_rate": 9.895975629375359e-05, + "loss": 0.7513, + "step": 162 + }, + { + "epoch": 0.5216, + "grad_norm": 0.4370069264323097, + "learning_rate": 9.791962515604887e-05, + "loss": 0.8009, + "step": 163 + }, + { + "epoch": 0.5248, + "grad_norm": 0.48398984419907476, + "learning_rate": 9.687971914324607e-05, + "loss": 0.8622, + "step": 164 + }, + { + "epoch": 0.528, + "grad_norm": 0.42412653600318445, + "learning_rate": 9.584015078734395e-05, + "loss": 0.7707, + "step": 165 + }, + { + "epoch": 0.5312, + "grad_norm": 0.46585747957106305, + "learning_rate": 9.480103258380198e-05, + "loss": 0.7407, + "step": 166 + }, + { + "epoch": 0.5344, + "grad_norm": 0.43496655382328697, + "learning_rate": 9.376247697936719e-05, + "loss": 0.7837, + "step": 167 + }, + { + "epoch": 0.5376, + "grad_norm": 0.4160469387945875, + "learning_rate": 9.272459635990562e-05, + "loss": 0.7998, + "step": 168 + }, + { + "epoch": 0.5408, + "grad_norm": 0.45209366925053024, + "learning_rate": 9.168750303824084e-05, + "loss": 0.7506, + "step": 169 + }, + { + "epoch": 0.544, + "grad_norm": 0.45837697636906116, + "learning_rate": 9.065130924199998e-05, + "loss": 0.8359, + "step": 170 + }, + { + "epoch": 0.5472, + "grad_norm": 0.4520572966553641, + "learning_rate": 8.961612710146934e-05, + "loss": 0.7738, + "step": 171 + }, + { + "epoch": 0.5504, + "grad_norm": 0.41486392065415506, + "learning_rate": 8.858206863746018e-05, + "loss": 0.681, + "step": 172 + }, + { + "epoch": 0.5536, + "grad_norm": 0.44158687529956925, + "learning_rate": 8.754924574918675e-05, + "loss": 0.7782, + "step": 173 + }, + { + "epoch": 0.5568, + "grad_norm": 0.37624162529254834, + "learning_rate": 8.651777020215712e-05, + "loss": 0.719, + "step": 174 + }, + { + "epoch": 0.56, + "grad_norm": 0.5264135969316172, + "learning_rate": 8.548775361607872e-05, + "loss": 0.7705, + "step": 175 + }, + { + "epoch": 0.5632, + "grad_norm": 0.4157147562878753, + "learning_rate": 8.445930745277953e-05, + "loss": 0.7895, + "step": 176 + }, + { + "epoch": 0.5664, + "grad_norm": 0.3976821965716464, + "learning_rate": 8.343254300414628e-05, + "loss": 0.7239, + "step": 177 + }, + { + "epoch": 0.5696, + "grad_norm": 0.43353829558388224, + "learning_rate": 8.240757138008149e-05, + "loss": 0.7398, + "step": 178 + }, + { + "epoch": 0.5728, + "grad_norm": 0.38614809151311424, + "learning_rate": 8.138450349647936e-05, + "loss": 0.6837, + "step": 179 + }, + { + "epoch": 0.576, + "grad_norm": 0.40695965628587244, + "learning_rate": 8.036345006322359e-05, + "loss": 0.7224, + "step": 180 + }, + { + "epoch": 0.5792, + "grad_norm": 0.3705268756816273, + "learning_rate": 7.934452157220694e-05, + "loss": 0.6957, + "step": 181 + }, + { + "epoch": 0.5824, + "grad_norm": 0.37224292668224535, + "learning_rate": 7.832782828537437e-05, + "loss": 0.7061, + "step": 182 + }, + { + "epoch": 0.5856, + "grad_norm": 0.39949350081200635, + "learning_rate": 7.731348022279134e-05, + "loss": 0.7367, + "step": 183 + }, + { + "epoch": 0.5888, + "grad_norm": 0.4420689247752516, + "learning_rate": 7.630158715073813e-05, + "loss": 0.7376, + "step": 184 + }, + { + "epoch": 0.592, + "grad_norm": 0.414213858078857, + "learning_rate": 7.52922585698315e-05, + "loss": 0.6978, + "step": 185 + }, + { + "epoch": 0.5952, + "grad_norm": 0.44715963773825573, + "learning_rate": 7.428560370317542e-05, + "loss": 0.7319, + "step": 186 + }, + { + "epoch": 0.5984, + "grad_norm": 0.419463795582743, + "learning_rate": 7.328173148454151e-05, + "loss": 0.7685, + "step": 187 + }, + { + "epoch": 0.6016, + "grad_norm": 0.5305448031704577, + "learning_rate": 7.228075054658096e-05, + "loss": 0.8223, + "step": 188 + }, + { + "epoch": 0.6048, + "grad_norm": 0.44307595364597574, + "learning_rate": 7.1282769209069e-05, + "loss": 0.7755, + "step": 189 + }, + { + "epoch": 0.608, + "grad_norm": 0.3879214149914402, + "learning_rate": 7.028789546718326e-05, + "loss": 0.7146, + "step": 190 + }, + { + "epoch": 0.6112, + "grad_norm": 0.417380175997682, + "learning_rate": 6.929623697981718e-05, + "loss": 0.728, + "step": 191 + }, + { + "epoch": 0.6144, + "grad_norm": 0.3993602836290595, + "learning_rate": 6.830790105792973e-05, + "loss": 0.7149, + "step": 192 + }, + { + "epoch": 0.6176, + "grad_norm": 0.4114404798361249, + "learning_rate": 6.732299465293322e-05, + "loss": 0.7508, + "step": 193 + }, + { + "epoch": 0.6208, + "grad_norm": 0.4081368204736118, + "learning_rate": 6.63416243451194e-05, + "loss": 0.7242, + "step": 194 + }, + { + "epoch": 0.624, + "grad_norm": 0.41641053203328376, + "learning_rate": 6.536389633212609e-05, + "loss": 0.7608, + "step": 195 + }, + { + "epoch": 0.6272, + "grad_norm": 0.5162978723322689, + "learning_rate": 6.43899164174453e-05, + "loss": 0.8776, + "step": 196 + }, + { + "epoch": 0.6304, + "grad_norm": 0.4416732459374655, + "learning_rate": 6.341978999897365e-05, + "loss": 0.8061, + "step": 197 + }, + { + "epoch": 0.6336, + "grad_norm": 0.40922067204695417, + "learning_rate": 6.245362205760704e-05, + "loss": 0.7072, + "step": 198 + }, + { + "epoch": 0.6368, + "grad_norm": 0.3910408579777948, + "learning_rate": 6.149151714588009e-05, + "loss": 0.7045, + "step": 199 + }, + { + "epoch": 0.64, + "grad_norm": 0.4294596839944686, + "learning_rate": 6.053357937665237e-05, + "loss": 0.7395, + "step": 200 + }, + { + "epoch": 0.6432, + "grad_norm": 0.45617607800708243, + "learning_rate": 5.957991241184184e-05, + "loss": 0.7262, + "step": 201 + }, + { + "epoch": 0.6464, + "grad_norm": 0.4249970983415297, + "learning_rate": 5.863061945120719e-05, + "loss": 0.6206, + "step": 202 + }, + { + "epoch": 0.6496, + "grad_norm": 0.38178663102554916, + "learning_rate": 5.768580322118034e-05, + "loss": 0.7617, + "step": 203 + }, + { + "epoch": 0.6528, + "grad_norm": 0.3884939823750097, + "learning_rate": 5.6745565963749925e-05, + "loss": 0.7651, + "step": 204 + }, + { + "epoch": 0.656, + "grad_norm": 0.36729627243649654, + "learning_rate": 5.5810009425397294e-05, + "loss": 0.7126, + "step": 205 + }, + { + "epoch": 0.6592, + "grad_norm": 0.47826654307607736, + "learning_rate": 5.487923484608629e-05, + "loss": 0.7822, + "step": 206 + }, + { + "epoch": 0.6624, + "grad_norm": 0.36161038152146086, + "learning_rate": 5.395334294830765e-05, + "loss": 0.6956, + "step": 207 + }, + { + "epoch": 0.6656, + "grad_norm": 0.38006603015370655, + "learning_rate": 5.3032433926179395e-05, + "loss": 0.7106, + "step": 208 + }, + { + "epoch": 0.6688, + "grad_norm": 0.40738285628597803, + "learning_rate": 5.211660743460458e-05, + "loss": 0.7725, + "step": 209 + }, + { + "epoch": 0.672, + "grad_norm": 0.3374199271417341, + "learning_rate": 5.1205962578487155e-05, + "loss": 0.7519, + "step": 210 + }, + { + "epoch": 0.6752, + "grad_norm": 0.4761794602226593, + "learning_rate": 5.030059790200756e-05, + "loss": 0.8196, + "step": 211 + }, + { + "epoch": 0.6784, + "grad_norm": 0.41660560521680234, + "learning_rate": 4.940061137795876e-05, + "loss": 0.692, + "step": 212 + }, + { + "epoch": 0.6816, + "grad_norm": 0.40816431754864885, + "learning_rate": 4.850610039714444e-05, + "loss": 0.79, + "step": 213 + }, + { + "epoch": 0.6848, + "grad_norm": 0.4053361082334398, + "learning_rate": 4.761716175783989e-05, + "loss": 0.7295, + "step": 214 + }, + { + "epoch": 0.688, + "grad_norm": 0.42265666409916125, + "learning_rate": 4.673389165531714e-05, + "loss": 0.7094, + "step": 215 + }, + { + "epoch": 0.6912, + "grad_norm": 0.4271793829562403, + "learning_rate": 4.585638567143529e-05, + "loss": 0.6791, + "step": 216 + }, + { + "epoch": 0.6944, + "grad_norm": 0.3791251414844227, + "learning_rate": 4.498473876429726e-05, + "loss": 0.7291, + "step": 217 + }, + { + "epoch": 0.6976, + "grad_norm": 0.34542493641922667, + "learning_rate": 4.411904525797408e-05, + "loss": 0.707, + "step": 218 + }, + { + "epoch": 0.7008, + "grad_norm": 0.36758317059781026, + "learning_rate": 4.325939883229766e-05, + "loss": 0.7234, + "step": 219 + }, + { + "epoch": 0.704, + "grad_norm": 0.45619840247066157, + "learning_rate": 4.240589251272342e-05, + "loss": 0.8309, + "step": 220 + }, + { + "epoch": 0.7072, + "grad_norm": 0.3620017253708174, + "learning_rate": 4.155861866026364e-05, + "loss": 0.7362, + "step": 221 + }, + { + "epoch": 0.7104, + "grad_norm": 0.3982523955481365, + "learning_rate": 4.071766896149273e-05, + "loss": 0.7207, + "step": 222 + }, + { + "epoch": 0.7136, + "grad_norm": 0.38184227358525125, + "learning_rate": 3.988313441862553e-05, + "loss": 0.7306, + "step": 223 + }, + { + "epoch": 0.7168, + "grad_norm": 0.4472881830492068, + "learning_rate": 3.9055105339669595e-05, + "loss": 0.8214, + "step": 224 + }, + { + "epoch": 0.72, + "grad_norm": 0.5409463111197855, + "learning_rate": 3.823367132865265e-05, + "loss": 0.8137, + "step": 225 + }, + { + "epoch": 0.7232, + "grad_norm": 0.8781222763001304, + "learning_rate": 3.741892127592625e-05, + "loss": 0.8318, + "step": 226 + }, + { + "epoch": 0.7264, + "grad_norm": 0.4573105786147632, + "learning_rate": 3.6610943348546526e-05, + "loss": 0.725, + "step": 227 + }, + { + "epoch": 0.7296, + "grad_norm": 0.3878888689024318, + "learning_rate": 3.580982498073344e-05, + "loss": 0.7151, + "step": 228 + }, + { + "epoch": 0.7328, + "grad_norm": 0.36990394677897365, + "learning_rate": 3.501565286440914e-05, + "loss": 0.6794, + "step": 229 + }, + { + "epoch": 0.736, + "grad_norm": 0.7414686994665193, + "learning_rate": 3.422851293981676e-05, + "loss": 0.8003, + "step": 230 + }, + { + "epoch": 0.7392, + "grad_norm": 0.3653209812387363, + "learning_rate": 3.3448490386220355e-05, + "loss": 0.6701, + "step": 231 + }, + { + "epoch": 0.7424, + "grad_norm": 0.38286313485527906, + "learning_rate": 3.2675669612687565e-05, + "loss": 0.6541, + "step": 232 + }, + { + "epoch": 0.7456, + "grad_norm": 0.38402242018511595, + "learning_rate": 3.191013424895536e-05, + "loss": 0.7397, + "step": 233 + }, + { + "epoch": 0.7488, + "grad_norm": 0.4025903717697035, + "learning_rate": 3.115196713638e-05, + "loss": 0.7243, + "step": 234 + }, + { + "epoch": 0.752, + "grad_norm": 0.6383945559677234, + "learning_rate": 3.040125031897264e-05, + "loss": 0.7876, + "step": 235 + }, + { + "epoch": 0.7552, + "grad_norm": 0.595752420576717, + "learning_rate": 2.9658065034520978e-05, + "loss": 0.7897, + "step": 236 + }, + { + "epoch": 0.7584, + "grad_norm": 0.3650985592911939, + "learning_rate": 2.892249170579826e-05, + "loss": 0.6789, + "step": 237 + }, + { + "epoch": 0.7616, + "grad_norm": 0.49809803666203206, + "learning_rate": 2.8194609931860316e-05, + "loss": 0.7296, + "step": 238 + }, + { + "epoch": 0.7648, + "grad_norm": 0.40778843923946057, + "learning_rate": 2.7474498479432087e-05, + "loss": 0.7261, + "step": 239 + }, + { + "epoch": 0.768, + "grad_norm": 0.3814737670780612, + "learning_rate": 2.6762235274383772e-05, + "loss": 0.7428, + "step": 240 + }, + { + "epoch": 0.7712, + "grad_norm": 0.37005090547095554, + "learning_rate": 2.6057897393298324e-05, + "loss": 0.6798, + "step": 241 + }, + { + "epoch": 0.7744, + "grad_norm": 0.4021157794755927, + "learning_rate": 2.536156105513062e-05, + "loss": 0.7861, + "step": 242 + }, + { + "epoch": 0.7776, + "grad_norm": 0.4832224308224902, + "learning_rate": 2.4673301612959654e-05, + "loss": 0.7694, + "step": 243 + }, + { + "epoch": 0.7808, + "grad_norm": 0.5222420995257865, + "learning_rate": 2.399319354583418e-05, + "loss": 0.7208, + "step": 244 + }, + { + "epoch": 0.784, + "grad_norm": 0.46968945645061355, + "learning_rate": 2.3321310450713062e-05, + "loss": 0.7398, + "step": 245 + }, + { + "epoch": 0.7872, + "grad_norm": 0.40669039374836935, + "learning_rate": 2.265772503450122e-05, + "loss": 0.7821, + "step": 246 + }, + { + "epoch": 0.7904, + "grad_norm": 0.40816318256752393, + "learning_rate": 2.2002509106181624e-05, + "loss": 0.7062, + "step": 247 + }, + { + "epoch": 0.7936, + "grad_norm": 0.375101363348895, + "learning_rate": 2.1355733569044635e-05, + "loss": 0.6283, + "step": 248 + }, + { + "epoch": 0.7968, + "grad_norm": 0.4324430468772202, + "learning_rate": 2.0717468413015283e-05, + "loss": 0.731, + "step": 249 + }, + { + "epoch": 0.8, + "grad_norm": 0.38879064939889446, + "learning_rate": 2.008778270707944e-05, + "loss": 0.7415, + "step": 250 + }, + { + "epoch": 0.8032, + "grad_norm": 0.3692851453372047, + "learning_rate": 1.946674459180955e-05, + "loss": 0.7277, + "step": 251 + }, + { + "epoch": 0.8064, + "grad_norm": 0.3486513220133344, + "learning_rate": 1.8854421271990964e-05, + "loss": 0.6959, + "step": 252 + }, + { + "epoch": 0.8096, + "grad_norm": 0.44618582177638677, + "learning_rate": 1.8250879009349398e-05, + "loss": 0.7638, + "step": 253 + }, + { + "epoch": 0.8128, + "grad_norm": 0.42547013855164556, + "learning_rate": 1.7656183115380577e-05, + "loss": 0.7555, + "step": 254 + }, + { + "epoch": 0.816, + "grad_norm": 0.4369715149874801, + "learning_rate": 1.707039794428259e-05, + "loss": 0.6905, + "step": 255 + }, + { + "epoch": 0.8192, + "grad_norm": 0.41733388331032406, + "learning_rate": 1.649358688599191e-05, + "loss": 0.7008, + "step": 256 + }, + { + "epoch": 0.8224, + "grad_norm": 0.4279520566519289, + "learning_rate": 1.5925812359323745e-05, + "loss": 0.7013, + "step": 257 + }, + { + "epoch": 0.8256, + "grad_norm": 0.37575572736837176, + "learning_rate": 1.5367135805217458e-05, + "loss": 0.695, + "step": 258 + }, + { + "epoch": 0.8288, + "grad_norm": 0.4435807395712066, + "learning_rate": 1.4817617680087825e-05, + "loss": 0.8064, + "step": 259 + }, + { + "epoch": 0.832, + "grad_norm": 0.3768698645952864, + "learning_rate": 1.4277317449282834e-05, + "loss": 0.7009, + "step": 260 + }, + { + "epoch": 0.8352, + "grad_norm": 0.3726210345324877, + "learning_rate": 1.3746293580648717e-05, + "loss": 0.6723, + "step": 261 + }, + { + "epoch": 0.8384, + "grad_norm": 0.48204144200060334, + "learning_rate": 1.3224603538202929e-05, + "loss": 0.7801, + "step": 262 + }, + { + "epoch": 0.8416, + "grad_norm": 0.4046556701900165, + "learning_rate": 1.2712303775915802e-05, + "loss": 0.6695, + "step": 263 + }, + { + "epoch": 0.8448, + "grad_norm": 0.4138196095845522, + "learning_rate": 1.220944973160133e-05, + "loss": 0.7282, + "step": 264 + }, + { + "epoch": 0.848, + "grad_norm": 0.43139821829854313, + "learning_rate": 1.1716095820918216e-05, + "loss": 0.6608, + "step": 265 + }, + { + "epoch": 0.8512, + "grad_norm": 0.3717057475902722, + "learning_rate": 1.1232295431481222e-05, + "loss": 0.6974, + "step": 266 + }, + { + "epoch": 0.8544, + "grad_norm": 0.42052364318572816, + "learning_rate": 1.0758100917083991e-05, + "loss": 0.7487, + "step": 267 + }, + { + "epoch": 0.8576, + "grad_norm": 0.3569959670544782, + "learning_rate": 1.0293563592033595e-05, + "loss": 0.6482, + "step": 268 + }, + { + "epoch": 0.8608, + "grad_norm": 0.38847176021176255, + "learning_rate": 9.838733725597615e-06, + "loss": 0.6822, + "step": 269 + }, + { + "epoch": 0.864, + "grad_norm": 0.4927376300455678, + "learning_rate": 9.393660536564408e-06, + "loss": 0.6963, + "step": 270 + }, + { + "epoch": 0.8672, + "grad_norm": 0.4543632723679882, + "learning_rate": 8.958392187916841e-06, + "loss": 0.7343, + "step": 271 + }, + { + "epoch": 0.8704, + "grad_norm": 0.43839276302132524, + "learning_rate": 8.532975781620512e-06, + "loss": 0.7572, + "step": 272 + }, + { + "epoch": 0.8736, + "grad_norm": 0.4132661946203213, + "learning_rate": 8.117457353526625e-06, + "loss": 0.7811, + "step": 273 + }, + { + "epoch": 0.8768, + "grad_norm": 0.462865482643029, + "learning_rate": 7.711881868390291e-06, + "loss": 0.739, + "step": 274 + }, + { + "epoch": 0.88, + "grad_norm": 0.3872644843649215, + "learning_rate": 7.3162932150046885e-06, + "loss": 0.7639, + "step": 275 + }, + { + "epoch": 0.8832, + "grad_norm": 0.5394434644926044, + "learning_rate": 6.930734201451816e-06, + "loss": 0.7964, + "step": 276 + }, + { + "epoch": 0.8864, + "grad_norm": 0.40421908805526796, + "learning_rate": 6.555246550469907e-06, + "loss": 0.6796, + "step": 277 + }, + { + "epoch": 0.8896, + "grad_norm": 0.4243003617984817, + "learning_rate": 6.189870894938587e-06, + "loss": 0.7039, + "step": 278 + }, + { + "epoch": 0.8928, + "grad_norm": 0.45483524594960506, + "learning_rate": 5.834646773481811e-06, + "loss": 0.7669, + "step": 279 + }, + { + "epoch": 0.896, + "grad_norm": 0.3953883062663472, + "learning_rate": 5.489612626189245e-06, + "loss": 0.7068, + "step": 280 + }, + { + "epoch": 0.8992, + "grad_norm": 0.3777504055589925, + "learning_rate": 5.154805790456485e-06, + "loss": 0.7404, + "step": 281 + }, + { + "epoch": 0.9024, + "grad_norm": 0.38222760426249, + "learning_rate": 4.830262496944693e-06, + "loss": 0.7188, + "step": 282 + }, + { + "epoch": 0.9056, + "grad_norm": 0.34598602379610305, + "learning_rate": 4.516017865659949e-06, + "loss": 0.7038, + "step": 283 + }, + { + "epoch": 0.9088, + "grad_norm": 0.36538375721222566, + "learning_rate": 4.21210590215273e-06, + "loss": 0.6319, + "step": 284 + }, + { + "epoch": 0.912, + "grad_norm": 0.4020557339041932, + "learning_rate": 3.918559493838114e-06, + "loss": 0.6852, + "step": 285 + }, + { + "epoch": 0.9152, + "grad_norm": 0.3757675955529075, + "learning_rate": 3.6354104064368566e-06, + "loss": 0.6756, + "step": 286 + }, + { + "epoch": 0.9184, + "grad_norm": 0.4004774320585294, + "learning_rate": 3.3626892805379562e-06, + "loss": 0.6527, + "step": 287 + }, + { + "epoch": 0.9216, + "grad_norm": 0.42414155376682977, + "learning_rate": 3.100425628282899e-06, + "loss": 0.7401, + "step": 288 + }, + { + "epoch": 0.9248, + "grad_norm": 0.3803954584255736, + "learning_rate": 2.848647830172024e-06, + "loss": 0.7108, + "step": 289 + }, + { + "epoch": 0.928, + "grad_norm": 0.5025161262007907, + "learning_rate": 2.607383131993424e-06, + "loss": 0.7621, + "step": 290 + }, + { + "epoch": 0.9312, + "grad_norm": 0.42764338083831827, + "learning_rate": 2.3766576418745022e-06, + "loss": 0.7229, + "step": 291 + }, + { + "epoch": 0.9344, + "grad_norm": 0.4573366276957102, + "learning_rate": 2.1564963274568027e-06, + "loss": 0.7085, + "step": 292 + }, + { + "epoch": 0.9376, + "grad_norm": 0.4218408813127882, + "learning_rate": 1.9469230131940907e-06, + "loss": 0.7889, + "step": 293 + }, + { + "epoch": 0.9408, + "grad_norm": 0.48315909057614376, + "learning_rate": 1.7479603777742938e-06, + "loss": 0.7131, + "step": 294 + }, + { + "epoch": 0.944, + "grad_norm": 0.4459536132032088, + "learning_rate": 1.559629951665298e-06, + "loss": 0.8133, + "step": 295 + }, + { + "epoch": 0.9472, + "grad_norm": 0.3720322174198035, + "learning_rate": 1.3819521147851123e-06, + "loss": 0.6519, + "step": 296 + }, + { + "epoch": 0.9504, + "grad_norm": 0.3889515936331356, + "learning_rate": 1.2149460942964098e-06, + "loss": 0.6526, + "step": 297 + }, + { + "epoch": 0.9536, + "grad_norm": 0.4756886306246455, + "learning_rate": 1.05862996252597e-06, + "loss": 0.7094, + "step": 298 + }, + { + "epoch": 0.9568, + "grad_norm": 0.43376182376356753, + "learning_rate": 9.130206350089765e-07, + "loss": 0.749, + "step": 299 + }, + { + "epoch": 0.96, + "grad_norm": 0.4246866739631707, + "learning_rate": 7.781338686584927e-07, + "loss": 0.6724, + "step": 300 + }, + { + "epoch": 0.9632, + "grad_norm": 0.367498280328426, + "learning_rate": 6.539842600603918e-07, + "loss": 0.7028, + "step": 301 + }, + { + "epoch": 0.9664, + "grad_norm": 0.4259633905488669, + "learning_rate": 5.405852438937764e-07, + "loss": 0.692, + "step": 302 + }, + { + "epoch": 0.9696, + "grad_norm": 0.3840615778351889, + "learning_rate": 4.3794909147720773e-07, + "loss": 0.7006, + "step": 303 + }, + { + "epoch": 0.9728, + "grad_norm": 0.39249924769502315, + "learning_rate": 3.4608690944071263e-07, + "loss": 0.7546, + "step": 304 + }, + { + "epoch": 0.976, + "grad_norm": 0.41658451162515586, + "learning_rate": 2.6500863852395584e-07, + "loss": 0.7532, + "step": 305 + }, + { + "epoch": 0.9792, + "grad_norm": 0.495311090413998, + "learning_rate": 1.947230525005006e-07, + "loss": 0.8044, + "step": 306 + }, + { + "epoch": 0.9824, + "grad_norm": 0.4467776892689143, + "learning_rate": 1.3523775722834587e-07, + "loss": 0.7202, + "step": 307 + }, + { + "epoch": 0.9856, + "grad_norm": 0.40244598198055637, + "learning_rate": 8.655918982689581e-08, + "loss": 0.6841, + "step": 308 + }, + { + "epoch": 0.9888, + "grad_norm": 0.38046173309726555, + "learning_rate": 4.8692617980350406e-08, + "loss": 0.7399, + "step": 309 + }, + { + "epoch": 0.992, + "grad_norm": 0.35812245930234876, + "learning_rate": 2.164213936770576e-08, + "loss": 0.6724, + "step": 310 + }, + { + "epoch": 0.9952, + "grad_norm": 0.4140137745357569, + "learning_rate": 5.410681219286673e-09, + "loss": 0.7405, + "step": 311 + }, + { + "epoch": 0.9984, + "grad_norm": 0.3989913820387328, + "learning_rate": 0.0, + "loss": 0.6796, + "step": 312 + }, + { + "epoch": 0.9984, + "step": 312, + "total_flos": 268745683697664.0, + "train_loss": 0.7839559385409722, + "train_runtime": 4830.1995, + "train_samples_per_second": 1.035, + "train_steps_per_second": 0.065 + } + ], + "logging_steps": 1.0, + "max_steps": 312, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 268745683697664.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_2_epochs_1_GA_4_lora/README.md b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_2_epochs_1_GA_4_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_2_epochs_1_GA_4_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_2_epochs_1_GA_4_lora/adapter_config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_2_epochs_1_GA_4_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1ca054b4e1a4cb2f87c721040f4eb090a8d1ef56 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_2_epochs_1_GA_4_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "o_proj", + "gate_proj", + "q_proj", + "v_proj", + "up_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_2_epochs_1_GA_4_lora/adapter_model.safetensors b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_2_epochs_1_GA_4_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..10138262f50c53b138494c0a47870bd5f8b98634 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_2_epochs_1_GA_4_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2125dd55328c02732e6e1395a2477e8c8b7d1d847459aacf6f0f3d8fb1ee1f6d +size 671150064 diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_2_epochs_1_GA_4_lora/config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_2_epochs_1_GA_4_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_2_epochs_1_GA_4_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_2_epochs_1_GA_4_lora/non_lora_trainables.bin b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_2_epochs_1_GA_4_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..4d27b2535bc6037ac75c7d592638492046e7e0b6 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_2_epochs_1_GA_4_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f463687a87e9276099a7514b3bd94e68802cac249240f9380f3c4a7f1e15a1f2 +size 918507402 diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_2_epochs_1_GA_4_lora/trainer_state.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_2_epochs_1_GA_4_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..354e3ad8b86c59eae4993818defdd11d2c9d5dc5 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_2_epochs_1_GA_4_lora/trainer_state.json @@ -0,0 +1,1134 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9984, + "eval_steps": 500, + "global_step": 156, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0064, + "grad_norm": 0.9499681432110237, + "learning_rate": 4e-05, + "loss": 1.4454, + "step": 1 + }, + { + "epoch": 0.0128, + "grad_norm": 0.9758169479697124, + "learning_rate": 8e-05, + "loss": 1.5182, + "step": 2 + }, + { + "epoch": 0.0192, + "grad_norm": 0.7382712135428642, + "learning_rate": 0.00012, + "loss": 1.3954, + "step": 3 + }, + { + "epoch": 0.0256, + "grad_norm": 1.089772647132298, + "learning_rate": 0.00016, + "loss": 1.1687, + "step": 4 + }, + { + "epoch": 0.032, + "grad_norm": 1.0435192868527843, + "learning_rate": 0.0002, + "loss": 1.0096, + "step": 5 + }, + { + "epoch": 0.0384, + "grad_norm": 0.7711189389458536, + "learning_rate": 0.0001999783578606323, + "loss": 1.021, + "step": 6 + }, + { + "epoch": 0.0448, + "grad_norm": 0.48985254526410515, + "learning_rate": 0.0001999134408101731, + "loss": 0.8913, + "step": 7 + }, + { + "epoch": 0.0512, + "grad_norm": 0.47364793976366343, + "learning_rate": 0.00019980527694749952, + "loss": 0.9003, + "step": 8 + }, + { + "epoch": 0.0576, + "grad_norm": 0.49327744700652487, + "learning_rate": 0.0001996539130905593, + "loss": 0.9267, + "step": 9 + }, + { + "epoch": 0.064, + "grad_norm": 0.45752990733305904, + "learning_rate": 0.00019945941475610623, + "loss": 0.9139, + "step": 10 + }, + { + "epoch": 0.0704, + "grad_norm": 0.4724605651441254, + "learning_rate": 0.0001992218661313415, + "loss": 0.9408, + "step": 11 + }, + { + "epoch": 0.0768, + "grad_norm": 0.4153618059983818, + "learning_rate": 0.00019894137003747403, + "loss": 0.8533, + "step": 12 + }, + { + "epoch": 0.0832, + "grad_norm": 0.43872032715248077, + "learning_rate": 0.00019861804788521493, + "loss": 0.9176, + "step": 13 + }, + { + "epoch": 0.0896, + "grad_norm": 0.41300429747993544, + "learning_rate": 0.00019825203962222572, + "loss": 0.8658, + "step": 14 + }, + { + "epoch": 0.096, + "grad_norm": 0.3869759096230377, + "learning_rate": 0.00019784350367254322, + "loss": 0.8707, + "step": 15 + }, + { + "epoch": 0.1024, + "grad_norm": 0.3723540582481531, + "learning_rate": 0.0001973926168680066, + "loss": 0.8743, + "step": 16 + }, + { + "epoch": 0.1088, + "grad_norm": 0.43460617257627365, + "learning_rate": 0.0001968995743717171, + "loss": 0.8422, + "step": 17 + }, + { + "epoch": 0.1152, + "grad_norm": 0.3636882439505951, + "learning_rate": 0.00019636458959356316, + "loss": 0.885, + "step": 18 + }, + { + "epoch": 0.1216, + "grad_norm": 0.3767420330312791, + "learning_rate": 0.00019578789409784727, + "loss": 0.8368, + "step": 19 + }, + { + "epoch": 0.128, + "grad_norm": 0.3406912995511453, + "learning_rate": 0.00019516973750305532, + "loss": 0.835, + "step": 20 + }, + { + "epoch": 0.1344, + "grad_norm": 0.33117330188035315, + "learning_rate": 0.00019451038737381077, + "loss": 0.8708, + "step": 21 + }, + { + "epoch": 0.1408, + "grad_norm": 0.3742514806628294, + "learning_rate": 0.00019381012910506146, + "loss": 0.8547, + "step": 22 + }, + { + "epoch": 0.1472, + "grad_norm": 0.3318707541389069, + "learning_rate": 0.00019306926579854821, + "loss": 0.8137, + "step": 23 + }, + { + "epoch": 0.1536, + "grad_norm": 0.34753505685394287, + "learning_rate": 0.0001922881181316097, + "loss": 0.8179, + "step": 24 + }, + { + "epoch": 0.16, + "grad_norm": 0.30730689317524723, + "learning_rate": 0.0001914670242183795, + "loss": 0.809, + "step": 25 + }, + { + "epoch": 0.1664, + "grad_norm": 0.3148215426142574, + "learning_rate": 0.0001906063394634356, + "loss": 0.8289, + "step": 26 + }, + { + "epoch": 0.1728, + "grad_norm": 0.3134692640328103, + "learning_rate": 0.00018970643640796642, + "loss": 0.7951, + "step": 27 + }, + { + "epoch": 0.1792, + "grad_norm": 0.3338606409194597, + "learning_rate": 0.00018876770456851877, + "loss": 0.801, + "step": 28 + }, + { + "epoch": 0.1856, + "grad_norm": 0.3217309263031535, + "learning_rate": 0.00018779055026839868, + "loss": 0.781, + "step": 29 + }, + { + "epoch": 0.192, + "grad_norm": 0.3624509351724031, + "learning_rate": 0.00018677539646179707, + "loss": 0.86, + "step": 30 + }, + { + "epoch": 0.1984, + "grad_norm": 0.37146419689776405, + "learning_rate": 0.00018572268255071718, + "loss": 0.8107, + "step": 31 + }, + { + "epoch": 0.2048, + "grad_norm": 0.31965751320137953, + "learning_rate": 0.00018463286419478255, + "loss": 0.8001, + "step": 32 + }, + { + "epoch": 0.2112, + "grad_norm": 0.30710001958247035, + "learning_rate": 0.00018350641311400812, + "loss": 0.8178, + "step": 33 + }, + { + "epoch": 0.2176, + "grad_norm": 0.3226409081760079, + "learning_rate": 0.00018234381688461942, + "loss": 0.7828, + "step": 34 + }, + { + "epoch": 0.224, + "grad_norm": 0.30238598320280313, + "learning_rate": 0.00018114557872800905, + "loss": 0.7647, + "step": 35 + }, + { + "epoch": 0.2304, + "grad_norm": 0.3272140288265171, + "learning_rate": 0.0001799122172929206, + "loss": 0.7543, + "step": 36 + }, + { + "epoch": 0.2368, + "grad_norm": 0.37431785142236074, + "learning_rate": 0.0001786442664309554, + "loss": 0.8817, + "step": 37 + }, + { + "epoch": 0.2432, + "grad_norm": 0.3040480104274814, + "learning_rate": 0.0001773422749654988, + "loss": 0.8085, + "step": 38 + }, + { + "epoch": 0.2496, + "grad_norm": 0.3510312247121624, + "learning_rate": 0.00017600680645416583, + "loss": 0.8227, + "step": 39 + }, + { + "epoch": 0.256, + "grad_norm": 0.3423625936053891, + "learning_rate": 0.00017463843894486937, + "loss": 0.7742, + "step": 40 + }, + { + "epoch": 0.2624, + "grad_norm": 0.3648454521920254, + "learning_rate": 0.00017323776472561627, + "loss": 0.8479, + "step": 41 + }, + { + "epoch": 0.2688, + "grad_norm": 0.37752553285739987, + "learning_rate": 0.0001718053900681397, + "loss": 0.8158, + "step": 42 + }, + { + "epoch": 0.2752, + "grad_norm": 0.5139251891902187, + "learning_rate": 0.00017034193496547902, + "loss": 0.8121, + "step": 43 + }, + { + "epoch": 0.2816, + "grad_norm": 0.33201787227820184, + "learning_rate": 0.00016884803286362, + "loss": 0.8126, + "step": 44 + }, + { + "epoch": 0.288, + "grad_norm": 0.36263163138465143, + "learning_rate": 0.00016732433038731242, + "loss": 0.8131, + "step": 45 + }, + { + "epoch": 0.2944, + "grad_norm": 0.33167025843660425, + "learning_rate": 0.00016577148706018328, + "loss": 0.8204, + "step": 46 + }, + { + "epoch": 0.3008, + "grad_norm": 0.35387812318119244, + "learning_rate": 0.00016419017501926656, + "loss": 0.9064, + "step": 47 + }, + { + "epoch": 0.3072, + "grad_norm": 0.32546257173439347, + "learning_rate": 0.00016258107872407375, + "loss": 0.7826, + "step": 48 + }, + { + "epoch": 0.3136, + "grad_norm": 0.34640750110083973, + "learning_rate": 0.00016094489466033043, + "loss": 0.7741, + "step": 49 + }, + { + "epoch": 0.32, + "grad_norm": 0.3226221675147661, + "learning_rate": 0.0001592823310385073, + "loss": 0.791, + "step": 50 + }, + { + "epoch": 0.3264, + "grad_norm": 0.315562950778229, + "learning_rate": 0.00015759410748727662, + "loss": 0.7615, + "step": 51 + }, + { + "epoch": 0.3328, + "grad_norm": 0.3594636590102823, + "learning_rate": 0.00015588095474202595, + "loss": 0.7334, + "step": 52 + }, + { + "epoch": 0.3392, + "grad_norm": 0.3647397045233328, + "learning_rate": 0.00015414361432856475, + "loss": 0.8031, + "step": 53 + }, + { + "epoch": 0.3456, + "grad_norm": 0.33851642590902986, + "learning_rate": 0.00015238283824216015, + "loss": 0.837, + "step": 54 + }, + { + "epoch": 0.352, + "grad_norm": 0.29718756649280836, + "learning_rate": 0.00015059938862204127, + "loss": 0.751, + "step": 55 + }, + { + "epoch": 0.3584, + "grad_norm": 0.3142832139988199, + "learning_rate": 0.00014879403742151283, + "loss": 0.7477, + "step": 56 + }, + { + "epoch": 0.3648, + "grad_norm": 0.3325477610175841, + "learning_rate": 0.0001469675660738206, + "loss": 0.8051, + "step": 57 + }, + { + "epoch": 0.3712, + "grad_norm": 0.37146073441055394, + "learning_rate": 0.00014512076515391375, + "loss": 0.8098, + "step": 58 + }, + { + "epoch": 0.3776, + "grad_norm": 0.36048313195055315, + "learning_rate": 0.0001432544340362501, + "loss": 0.7997, + "step": 59 + }, + { + "epoch": 0.384, + "grad_norm": 0.3500392080646858, + "learning_rate": 0.00014136938054879283, + "loss": 0.8462, + "step": 60 + }, + { + "epoch": 0.3904, + "grad_norm": 0.31468963310853965, + "learning_rate": 0.00013946642062334766, + "loss": 0.7831, + "step": 61 + }, + { + "epoch": 0.3968, + "grad_norm": 0.4084100906531495, + "learning_rate": 0.000137546377942393, + "loss": 0.8192, + "step": 62 + }, + { + "epoch": 0.4032, + "grad_norm": 0.33355778834850514, + "learning_rate": 0.00013561008358255468, + "loss": 0.8336, + "step": 63 + }, + { + "epoch": 0.4096, + "grad_norm": 0.3121111615175075, + "learning_rate": 0.00013365837565488064, + "loss": 0.7478, + "step": 64 + }, + { + "epoch": 0.416, + "grad_norm": 0.330025473226303, + "learning_rate": 0.0001316920989420703, + "loss": 0.8114, + "step": 65 + }, + { + "epoch": 0.4224, + "grad_norm": 0.32343888780989766, + "learning_rate": 0.00012971210453281674, + "loss": 0.7916, + "step": 66 + }, + { + "epoch": 0.4288, + "grad_norm": 0.3303905759396039, + "learning_rate": 0.00012771924945341906, + "loss": 0.7448, + "step": 67 + }, + { + "epoch": 0.4352, + "grad_norm": 0.33442251076374147, + "learning_rate": 0.0001257143962968246, + "loss": 0.7537, + "step": 68 + }, + { + "epoch": 0.4416, + "grad_norm": 0.3253341707475316, + "learning_rate": 0.00012369841284926188, + "loss": 0.7906, + "step": 69 + }, + { + "epoch": 0.448, + "grad_norm": 0.40322939877722475, + "learning_rate": 0.00012167217171462566, + "loss": 0.8455, + "step": 70 + }, + { + "epoch": 0.4544, + "grad_norm": 0.2894092016470807, + "learning_rate": 0.00011963654993677645, + "loss": 0.7185, + "step": 71 + }, + { + "epoch": 0.4608, + "grad_norm": 0.31132315142861977, + "learning_rate": 0.00011759242861991855, + "loss": 0.7429, + "step": 72 + }, + { + "epoch": 0.4672, + "grad_norm": 0.3034494888254519, + "learning_rate": 0.00011554069254722051, + "loss": 0.7781, + "step": 73 + }, + { + "epoch": 0.4736, + "grad_norm": 0.3015266217194556, + "learning_rate": 0.00011348222979784289, + "loss": 0.7584, + "step": 74 + }, + { + "epoch": 0.48, + "grad_norm": 0.321935083376902, + "learning_rate": 0.00011141793136253986, + "loss": 0.7666, + "step": 75 + }, + { + "epoch": 0.4864, + "grad_norm": 0.33496778793189047, + "learning_rate": 0.000109348690758, + "loss": 0.7769, + "step": 76 + }, + { + "epoch": 0.4928, + "grad_norm": 0.33483461042093743, + "learning_rate": 0.0001072754036400944, + "loss": 0.7391, + "step": 77 + }, + { + "epoch": 0.4992, + "grad_norm": 0.35029433823654627, + "learning_rate": 0.00010519896741619803, + "loss": 0.7702, + "step": 78 + }, + { + "epoch": 0.5056, + "grad_norm": 0.27106878465082573, + "learning_rate": 0.00010312028085675391, + "loss": 0.7186, + "step": 79 + }, + { + "epoch": 0.512, + "grad_norm": 0.31734658980806324, + "learning_rate": 0.00010104024370624644, + "loss": 0.7045, + "step": 80 + }, + { + "epoch": 0.5184, + "grad_norm": 0.3159312041607431, + "learning_rate": 9.895975629375359e-05, + "loss": 0.7688, + "step": 81 + }, + { + "epoch": 0.5248, + "grad_norm": 0.3510053326354264, + "learning_rate": 9.687971914324607e-05, + "loss": 0.8354, + "step": 82 + }, + { + "epoch": 0.5312, + "grad_norm": 0.3313711207225331, + "learning_rate": 9.480103258380198e-05, + "loss": 0.7626, + "step": 83 + }, + { + "epoch": 0.5376, + "grad_norm": 0.31851365189344344, + "learning_rate": 9.272459635990562e-05, + "loss": 0.7941, + "step": 84 + }, + { + "epoch": 0.544, + "grad_norm": 0.3683419820928145, + "learning_rate": 9.065130924199998e-05, + "loss": 0.7969, + "step": 85 + }, + { + "epoch": 0.5504, + "grad_norm": 0.303900770492725, + "learning_rate": 8.858206863746018e-05, + "loss": 0.7311, + "step": 86 + }, + { + "epoch": 0.5568, + "grad_norm": 0.3195515925884969, + "learning_rate": 8.651777020215712e-05, + "loss": 0.7518, + "step": 87 + }, + { + "epoch": 0.5632, + "grad_norm": 0.34328458663535893, + "learning_rate": 8.445930745277953e-05, + "loss": 0.7791, + "step": 88 + }, + { + "epoch": 0.5696, + "grad_norm": 0.29192287357056207, + "learning_rate": 8.240757138008149e-05, + "loss": 0.7308, + "step": 89 + }, + { + "epoch": 0.576, + "grad_norm": 0.29828259930043177, + "learning_rate": 8.036345006322359e-05, + "loss": 0.7037, + "step": 90 + }, + { + "epoch": 0.5824, + "grad_norm": 0.2828954581458353, + "learning_rate": 7.832782828537437e-05, + "loss": 0.7057, + "step": 91 + }, + { + "epoch": 0.5888, + "grad_norm": 0.30315109744782637, + "learning_rate": 7.630158715073813e-05, + "loss": 0.7378, + "step": 92 + }, + { + "epoch": 0.5952, + "grad_norm": 0.3134055693364635, + "learning_rate": 7.428560370317542e-05, + "loss": 0.7212, + "step": 93 + }, + { + "epoch": 0.6016, + "grad_norm": 0.3451596219699706, + "learning_rate": 7.228075054658096e-05, + "loss": 0.8015, + "step": 94 + }, + { + "epoch": 0.608, + "grad_norm": 0.29775782579656, + "learning_rate": 7.028789546718326e-05, + "loss": 0.7456, + "step": 95 + }, + { + "epoch": 0.6144, + "grad_norm": 0.3291043205732606, + "learning_rate": 6.830790105792973e-05, + "loss": 0.7262, + "step": 96 + }, + { + "epoch": 0.6208, + "grad_norm": 0.2906190303452574, + "learning_rate": 6.63416243451194e-05, + "loss": 0.7429, + "step": 97 + }, + { + "epoch": 0.6272, + "grad_norm": 0.34321569626617143, + "learning_rate": 6.43899164174453e-05, + "loss": 0.8265, + "step": 98 + }, + { + "epoch": 0.6336, + "grad_norm": 0.3158233835384171, + "learning_rate": 6.245362205760704e-05, + "loss": 0.7595, + "step": 99 + }, + { + "epoch": 0.64, + "grad_norm": 0.30111907255901293, + "learning_rate": 6.053357937665237e-05, + "loss": 0.726, + "step": 100 + }, + { + "epoch": 0.6464, + "grad_norm": 0.30986176836690604, + "learning_rate": 5.863061945120719e-05, + "loss": 0.6807, + "step": 101 + }, + { + "epoch": 0.6528, + "grad_norm": 0.29450367356819085, + "learning_rate": 5.6745565963749925e-05, + "loss": 0.7687, + "step": 102 + }, + { + "epoch": 0.6592, + "grad_norm": 0.3310156455760937, + "learning_rate": 5.487923484608629e-05, + "loss": 0.7563, + "step": 103 + }, + { + "epoch": 0.6656, + "grad_norm": 0.2677526853851952, + "learning_rate": 5.3032433926179395e-05, + "loss": 0.708, + "step": 104 + }, + { + "epoch": 0.672, + "grad_norm": 0.3035262178953314, + "learning_rate": 5.1205962578487155e-05, + "loss": 0.7715, + "step": 105 + }, + { + "epoch": 0.6784, + "grad_norm": 0.33621579158338233, + "learning_rate": 4.940061137795876e-05, + "loss": 0.7644, + "step": 106 + }, + { + "epoch": 0.6848, + "grad_norm": 0.3010244557747383, + "learning_rate": 4.761716175783989e-05, + "loss": 0.7646, + "step": 107 + }, + { + "epoch": 0.6912, + "grad_norm": 0.3067826338841327, + "learning_rate": 4.585638567143529e-05, + "loss": 0.7047, + "step": 108 + }, + { + "epoch": 0.6976, + "grad_norm": 0.26568750293000415, + "learning_rate": 4.411904525797408e-05, + "loss": 0.723, + "step": 109 + }, + { + "epoch": 0.704, + "grad_norm": 0.2989424371151441, + "learning_rate": 4.240589251272342e-05, + "loss": 0.7847, + "step": 110 + }, + { + "epoch": 0.7104, + "grad_norm": 0.2917555500270605, + "learning_rate": 4.071766896149273e-05, + "loss": 0.7342, + "step": 111 + }, + { + "epoch": 0.7168, + "grad_norm": 0.29628166341367834, + "learning_rate": 3.9055105339669595e-05, + "loss": 0.782, + "step": 112 + }, + { + "epoch": 0.7232, + "grad_norm": 0.39119195663022793, + "learning_rate": 3.741892127592625e-05, + "loss": 0.8312, + "step": 113 + }, + { + "epoch": 0.7296, + "grad_norm": 0.30694775810386343, + "learning_rate": 3.580982498073344e-05, + "loss": 0.726, + "step": 114 + }, + { + "epoch": 0.736, + "grad_norm": 0.310752903744039, + "learning_rate": 3.422851293981676e-05, + "loss": 0.7483, + "step": 115 + }, + { + "epoch": 0.7424, + "grad_norm": 0.26099588812120716, + "learning_rate": 3.2675669612687565e-05, + "loss": 0.6752, + "step": 116 + }, + { + "epoch": 0.7488, + "grad_norm": 0.28492516975887444, + "learning_rate": 3.115196713638e-05, + "loss": 0.7416, + "step": 117 + }, + { + "epoch": 0.7552, + "grad_norm": 0.34517639491760527, + "learning_rate": 2.9658065034520978e-05, + "loss": 0.7962, + "step": 118 + }, + { + "epoch": 0.7616, + "grad_norm": 0.3060176941402745, + "learning_rate": 2.8194609931860316e-05, + "loss": 0.7104, + "step": 119 + }, + { + "epoch": 0.768, + "grad_norm": 0.2828848083006681, + "learning_rate": 2.6762235274383772e-05, + "loss": 0.7432, + "step": 120 + }, + { + "epoch": 0.7744, + "grad_norm": 0.2777162172957675, + "learning_rate": 2.536156105513062e-05, + "loss": 0.7413, + "step": 121 + }, + { + "epoch": 0.7808, + "grad_norm": 0.3170164491566114, + "learning_rate": 2.399319354583418e-05, + "loss": 0.7569, + "step": 122 + }, + { + "epoch": 0.7872, + "grad_norm": 0.321325071237149, + "learning_rate": 2.265772503450122e-05, + "loss": 0.7699, + "step": 123 + }, + { + "epoch": 0.7936, + "grad_norm": 0.27620495853360366, + "learning_rate": 2.1355733569044635e-05, + "loss": 0.6775, + "step": 124 + }, + { + "epoch": 0.8, + "grad_norm": 0.2941483196149313, + "learning_rate": 2.008778270707944e-05, + "loss": 0.7463, + "step": 125 + }, + { + "epoch": 0.8064, + "grad_norm": 0.2638396195061165, + "learning_rate": 1.8854421271990964e-05, + "loss": 0.719, + "step": 126 + }, + { + "epoch": 0.8128, + "grad_norm": 0.3195419763352248, + "learning_rate": 1.7656183115380577e-05, + "loss": 0.7671, + "step": 127 + }, + { + "epoch": 0.8192, + "grad_norm": 0.29556898125453157, + "learning_rate": 1.649358688599191e-05, + "loss": 0.7125, + "step": 128 + }, + { + "epoch": 0.8256, + "grad_norm": 0.29771793027755095, + "learning_rate": 1.5367135805217458e-05, + "loss": 0.7065, + "step": 129 + }, + { + "epoch": 0.832, + "grad_norm": 0.2922812432322988, + "learning_rate": 1.4277317449282834e-05, + "loss": 0.7679, + "step": 130 + }, + { + "epoch": 0.8384, + "grad_norm": 0.2878303583502017, + "learning_rate": 1.3224603538202929e-05, + "loss": 0.7354, + "step": 131 + }, + { + "epoch": 0.8448, + "grad_norm": 0.2977577857426242, + "learning_rate": 1.220944973160133e-05, + "loss": 0.7079, + "step": 132 + }, + { + "epoch": 0.8512, + "grad_norm": 0.2901915437404231, + "learning_rate": 1.1232295431481222e-05, + "loss": 0.6891, + "step": 133 + }, + { + "epoch": 0.8576, + "grad_norm": 0.28352945726609136, + "learning_rate": 1.0293563592033595e-05, + "loss": 0.7081, + "step": 134 + }, + { + "epoch": 0.864, + "grad_norm": 0.3241683039687902, + "learning_rate": 9.393660536564408e-06, + "loss": 0.7052, + "step": 135 + }, + { + "epoch": 0.8704, + "grad_norm": 0.3297607032361027, + "learning_rate": 8.532975781620512e-06, + "loss": 0.7568, + "step": 136 + }, + { + "epoch": 0.8768, + "grad_norm": 0.31796151841422116, + "learning_rate": 7.711881868390291e-06, + "loss": 0.7706, + "step": 137 + }, + { + "epoch": 0.8832, + "grad_norm": 0.3487317539823554, + "learning_rate": 6.930734201451816e-06, + "loss": 0.7904, + "step": 138 + }, + { + "epoch": 0.8896, + "grad_norm": 0.30800851312340427, + "learning_rate": 6.189870894938587e-06, + "loss": 0.7017, + "step": 139 + }, + { + "epoch": 0.896, + "grad_norm": 0.3099184827803778, + "learning_rate": 5.489612626189245e-06, + "loss": 0.7483, + "step": 140 + }, + { + "epoch": 0.9024, + "grad_norm": 0.31481061184272635, + "learning_rate": 4.830262496944693e-06, + "loss": 0.7396, + "step": 141 + }, + { + "epoch": 0.9088, + "grad_norm": 0.2681021791486132, + "learning_rate": 4.21210590215273e-06, + "loss": 0.6792, + "step": 142 + }, + { + "epoch": 0.9152, + "grad_norm": 0.2964128919386318, + "learning_rate": 3.6354104064368566e-06, + "loss": 0.6899, + "step": 143 + }, + { + "epoch": 0.9216, + "grad_norm": 0.31152184572681707, + "learning_rate": 3.100425628282899e-06, + "loss": 0.7062, + "step": 144 + }, + { + "epoch": 0.928, + "grad_norm": 0.3385766223989507, + "learning_rate": 2.607383131993424e-06, + "loss": 0.745, + "step": 145 + }, + { + "epoch": 0.9344, + "grad_norm": 0.3580776963221539, + "learning_rate": 2.1564963274568027e-06, + "loss": 0.7262, + "step": 146 + }, + { + "epoch": 0.9408, + "grad_norm": 0.33745749791756396, + "learning_rate": 1.7479603777742938e-06, + "loss": 0.7639, + "step": 147 + }, + { + "epoch": 0.9472, + "grad_norm": 0.2898419277646106, + "learning_rate": 1.3819521147851123e-06, + "loss": 0.7454, + "step": 148 + }, + { + "epoch": 0.9536, + "grad_norm": 0.3068386648982399, + "learning_rate": 1.05862996252597e-06, + "loss": 0.6922, + "step": 149 + }, + { + "epoch": 0.96, + "grad_norm": 0.3052370320816486, + "learning_rate": 7.781338686584927e-07, + "loss": 0.7228, + "step": 150 + }, + { + "epoch": 0.9664, + "grad_norm": 0.2855813856814325, + "learning_rate": 5.405852438937764e-07, + "loss": 0.7091, + "step": 151 + }, + { + "epoch": 0.9728, + "grad_norm": 0.3812616847566796, + "learning_rate": 3.4608690944071263e-07, + "loss": 0.7375, + "step": 152 + }, + { + "epoch": 0.9792, + "grad_norm": 0.34762263468639226, + "learning_rate": 1.947230525005006e-07, + "loss": 0.7907, + "step": 153 + }, + { + "epoch": 0.9856, + "grad_norm": 0.3071632002763469, + "learning_rate": 8.655918982689581e-08, + "loss": 0.7126, + "step": 154 + }, + { + "epoch": 0.992, + "grad_norm": 0.2743953863530211, + "learning_rate": 2.164213936770576e-08, + "loss": 0.7176, + "step": 155 + }, + { + "epoch": 0.9984, + "grad_norm": 0.34240904021127194, + "learning_rate": 0.0, + "loss": 0.7214, + "step": 156 + }, + { + "epoch": 0.9984, + "step": 156, + "total_flos": 391925396799488.0, + "train_loss": 0.7946632943856411, + "train_runtime": 4833.0764, + "train_samples_per_second": 1.035, + "train_steps_per_second": 0.032 + } + ], + "logging_steps": 1.0, + "max_steps": 156, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 391925396799488.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_3_epochs_1_GA_2_lora/README.md b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_3_epochs_1_GA_2_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_3_epochs_1_GA_2_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_3_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_3_epochs_1_GA_2_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..cd791ba0e59c63ed92e0aac5f633b0609a9ccb9f --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_3_epochs_1_GA_2_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "q_proj", + "up_proj", + "down_proj", + "gate_proj", + "o_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5d92c2d48634080679166b1702b7732636f61cc6 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd86f5378de423b831a40299e4fdd9539d90ceef220c946c3fe1b4905549d668 +size 671150064 diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_3_epochs_1_GA_2_lora/config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_3_epochs_1_GA_2_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_3_epochs_1_GA_2_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..5f8979bc20bc02f43ada5abf62484595711d9481 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dac86a923535fb17ad6727e74e80ce96d588fad79ee0191bd14195b64f19a13d +size 918507402 diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_3_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_3_epochs_1_GA_2_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..1edf24cfcbf106106f3fabaf556b9e35f22a84ef --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_3_epochs_1_GA_2_lora/trainer_state.json @@ -0,0 +1,2226 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9984, + "eval_steps": 500, + "global_step": 312, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0032, + "grad_norm": 1.0996556147448786, + "learning_rate": 2e-05, + "loss": 1.4738, + "step": 1 + }, + { + "epoch": 0.0064, + "grad_norm": 1.155590197159421, + "learning_rate": 4e-05, + "loss": 1.6051, + "step": 2 + }, + { + "epoch": 0.0096, + "grad_norm": 1.146820508048761, + "learning_rate": 6e-05, + "loss": 1.259, + "step": 3 + }, + { + "epoch": 0.0128, + "grad_norm": 0.9163565348699131, + "learning_rate": 8e-05, + "loss": 1.3473, + "step": 4 + }, + { + "epoch": 0.016, + "grad_norm": 0.9417661773161884, + "learning_rate": 0.0001, + "loss": 1.1457, + "step": 5 + }, + { + "epoch": 0.0192, + "grad_norm": 1.0858107165146322, + "learning_rate": 0.00012, + "loss": 1.0252, + "step": 6 + }, + { + "epoch": 0.0224, + "grad_norm": 0.9643047603536231, + "learning_rate": 0.00014, + "loss": 1.0901, + "step": 7 + }, + { + "epoch": 0.0256, + "grad_norm": 0.7884322251663225, + "learning_rate": 0.00016, + "loss": 1.0381, + "step": 8 + }, + { + "epoch": 0.0288, + "grad_norm": 0.6207317444770607, + "learning_rate": 0.00018, + "loss": 0.9786, + "step": 9 + }, + { + "epoch": 0.032, + "grad_norm": 0.5542708757243915, + "learning_rate": 0.0002, + "loss": 0.9105, + "step": 10 + }, + { + "epoch": 0.0352, + "grad_norm": 0.6661573203112948, + "learning_rate": 0.00019999458931878073, + "loss": 0.9134, + "step": 11 + }, + { + "epoch": 0.0384, + "grad_norm": 0.581721865275086, + "learning_rate": 0.0001999783578606323, + "loss": 0.8831, + "step": 12 + }, + { + "epoch": 0.0416, + "grad_norm": 0.6258602792844252, + "learning_rate": 0.00019995130738201966, + "loss": 0.9742, + "step": 13 + }, + { + "epoch": 0.0448, + "grad_norm": 0.8144383904909923, + "learning_rate": 0.0001999134408101731, + "loss": 0.8465, + "step": 14 + }, + { + "epoch": 0.048, + "grad_norm": 0.5506198986379944, + "learning_rate": 0.00019986476224277165, + "loss": 0.8677, + "step": 15 + }, + { + "epoch": 0.0512, + "grad_norm": 0.5961739562843801, + "learning_rate": 0.00019980527694749952, + "loss": 0.9714, + "step": 16 + }, + { + "epoch": 0.0544, + "grad_norm": 0.629664764526395, + "learning_rate": 0.00019973499136147606, + "loss": 0.8722, + "step": 17 + }, + { + "epoch": 0.0576, + "grad_norm": 0.5227064497725998, + "learning_rate": 0.0001996539130905593, + "loss": 0.9123, + "step": 18 + }, + { + "epoch": 0.0608, + "grad_norm": 0.5224978847489002, + "learning_rate": 0.0001995620509085228, + "loss": 0.9474, + "step": 19 + }, + { + "epoch": 0.064, + "grad_norm": 0.5416788737330189, + "learning_rate": 0.00019945941475610623, + "loss": 0.8616, + "step": 20 + }, + { + "epoch": 0.0672, + "grad_norm": 0.4664391292587506, + "learning_rate": 0.0001993460157399396, + "loss": 0.873, + "step": 21 + }, + { + "epoch": 0.0704, + "grad_norm": 0.6092522246389432, + "learning_rate": 0.0001992218661313415, + "loss": 0.9629, + "step": 22 + }, + { + "epoch": 0.0736, + "grad_norm": 0.529580513692818, + "learning_rate": 0.00019908697936499103, + "loss": 0.8706, + "step": 23 + }, + { + "epoch": 0.0768, + "grad_norm": 0.4625768462826553, + "learning_rate": 0.00019894137003747403, + "loss": 0.8322, + "step": 24 + }, + { + "epoch": 0.08, + "grad_norm": 0.6022314082445355, + "learning_rate": 0.00019878505390570362, + "loss": 0.8923, + "step": 25 + }, + { + "epoch": 0.0832, + "grad_norm": 0.495252362666634, + "learning_rate": 0.00019861804788521493, + "loss": 0.8946, + "step": 26 + }, + { + "epoch": 0.0864, + "grad_norm": 0.5573210343339869, + "learning_rate": 0.00019844037004833473, + "loss": 0.8745, + "step": 27 + }, + { + "epoch": 0.0896, + "grad_norm": 0.5263467061927173, + "learning_rate": 0.00019825203962222572, + "loss": 0.8315, + "step": 28 + }, + { + "epoch": 0.0928, + "grad_norm": 0.45530522891080205, + "learning_rate": 0.0001980530769868059, + "loss": 0.8728, + "step": 29 + }, + { + "epoch": 0.096, + "grad_norm": 0.4488557169584717, + "learning_rate": 0.00019784350367254322, + "loss": 0.8714, + "step": 30 + }, + { + "epoch": 0.0992, + "grad_norm": 0.422211135636021, + "learning_rate": 0.0001976233423581255, + "loss": 0.8085, + "step": 31 + }, + { + "epoch": 0.1024, + "grad_norm": 0.43629793688244217, + "learning_rate": 0.0001973926168680066, + "loss": 0.8506, + "step": 32 + }, + { + "epoch": 0.1056, + "grad_norm": 0.4934816820721558, + "learning_rate": 0.00019715135216982798, + "loss": 0.9185, + "step": 33 + }, + { + "epoch": 0.1088, + "grad_norm": 0.49422231415258716, + "learning_rate": 0.0001968995743717171, + "loss": 0.881, + "step": 34 + }, + { + "epoch": 0.112, + "grad_norm": 0.42607497457838045, + "learning_rate": 0.00019663731071946206, + "loss": 0.8277, + "step": 35 + }, + { + "epoch": 0.1152, + "grad_norm": 0.5082800492131699, + "learning_rate": 0.00019636458959356316, + "loss": 0.7598, + "step": 36 + }, + { + "epoch": 0.1184, + "grad_norm": 0.4241801294691474, + "learning_rate": 0.0001960814405061619, + "loss": 0.8652, + "step": 37 + }, + { + "epoch": 0.1216, + "grad_norm": 0.41316516092448563, + "learning_rate": 0.00019578789409784727, + "loss": 0.8215, + "step": 38 + }, + { + "epoch": 0.1248, + "grad_norm": 0.5063945285706223, + "learning_rate": 0.00019548398213434007, + "loss": 0.8528, + "step": 39 + }, + { + "epoch": 0.128, + "grad_norm": 0.5019548596596508, + "learning_rate": 0.00019516973750305532, + "loss": 0.8396, + "step": 40 + }, + { + "epoch": 0.1312, + "grad_norm": 0.48735346313769123, + "learning_rate": 0.00019484519420954354, + "loss": 0.8144, + "step": 41 + }, + { + "epoch": 0.1344, + "grad_norm": 0.5197868954555636, + "learning_rate": 0.00019451038737381077, + "loss": 0.8153, + "step": 42 + }, + { + "epoch": 0.1376, + "grad_norm": 0.48803173140285644, + "learning_rate": 0.00019416535322651818, + "loss": 0.8591, + "step": 43 + }, + { + "epoch": 0.1408, + "grad_norm": 0.43518894315913303, + "learning_rate": 0.00019381012910506146, + "loss": 0.8188, + "step": 44 + }, + { + "epoch": 0.144, + "grad_norm": 0.5364419130913426, + "learning_rate": 0.00019344475344953012, + "loss": 0.9385, + "step": 45 + }, + { + "epoch": 0.1472, + "grad_norm": 0.435592143632023, + "learning_rate": 0.00019306926579854821, + "loss": 0.7764, + "step": 46 + }, + { + "epoch": 0.1504, + "grad_norm": 0.48004579520498175, + "learning_rate": 0.00019268370678499533, + "loss": 0.7933, + "step": 47 + }, + { + "epoch": 0.1536, + "grad_norm": 0.527366295217339, + "learning_rate": 0.0001922881181316097, + "loss": 0.8798, + "step": 48 + }, + { + "epoch": 0.1568, + "grad_norm": 0.5380112089762901, + "learning_rate": 0.00019188254264647337, + "loss": 0.8508, + "step": 49 + }, + { + "epoch": 0.16, + "grad_norm": 0.46806208738974403, + "learning_rate": 0.0001914670242183795, + "loss": 0.8426, + "step": 50 + }, + { + "epoch": 0.1632, + "grad_norm": 0.44311714881174813, + "learning_rate": 0.0001910416078120832, + "loss": 0.7469, + "step": 51 + }, + { + "epoch": 0.1664, + "grad_norm": 0.45340181396943946, + "learning_rate": 0.0001906063394634356, + "loss": 0.8339, + "step": 52 + }, + { + "epoch": 0.1696, + "grad_norm": 0.4544588923922235, + "learning_rate": 0.00019016126627440237, + "loss": 0.8198, + "step": 53 + }, + { + "epoch": 0.1728, + "grad_norm": 0.4488945712149492, + "learning_rate": 0.00018970643640796642, + "loss": 0.8535, + "step": 54 + }, + { + "epoch": 0.176, + "grad_norm": 0.4697023213136025, + "learning_rate": 0.000189241899082916, + "loss": 0.8351, + "step": 55 + }, + { + "epoch": 0.1792, + "grad_norm": 0.44940044996293804, + "learning_rate": 0.00018876770456851877, + "loss": 0.8159, + "step": 56 + }, + { + "epoch": 0.1824, + "grad_norm": 0.5035390443723099, + "learning_rate": 0.0001882839041790818, + "loss": 0.8946, + "step": 57 + }, + { + "epoch": 0.1856, + "grad_norm": 0.5032674537128853, + "learning_rate": 0.00018779055026839868, + "loss": 0.8703, + "step": 58 + }, + { + "epoch": 0.1888, + "grad_norm": 0.42842145274459875, + "learning_rate": 0.00018728769622408423, + "loss": 0.8642, + "step": 59 + }, + { + "epoch": 0.192, + "grad_norm": 0.4972268182951825, + "learning_rate": 0.00018677539646179707, + "loss": 0.7786, + "step": 60 + }, + { + "epoch": 0.1952, + "grad_norm": 0.5821189188510526, + "learning_rate": 0.00018625370641935129, + "loss": 0.8606, + "step": 61 + }, + { + "epoch": 0.1984, + "grad_norm": 0.40727673401873243, + "learning_rate": 0.00018572268255071718, + "loss": 0.7698, + "step": 62 + }, + { + "epoch": 0.2016, + "grad_norm": 0.42146358155953934, + "learning_rate": 0.00018518238231991218, + "loss": 0.7587, + "step": 63 + }, + { + "epoch": 0.2048, + "grad_norm": 0.5060938378098108, + "learning_rate": 0.00018463286419478255, + "loss": 0.825, + "step": 64 + }, + { + "epoch": 0.208, + "grad_norm": 0.47374672312323146, + "learning_rate": 0.00018407418764067627, + "loss": 0.8218, + "step": 65 + }, + { + "epoch": 0.2112, + "grad_norm": 0.39289842872392167, + "learning_rate": 0.00018350641311400812, + "loss": 0.7907, + "step": 66 + }, + { + "epoch": 0.2144, + "grad_norm": 0.40504091920518287, + "learning_rate": 0.0001829296020557174, + "loss": 0.7731, + "step": 67 + }, + { + "epoch": 0.2176, + "grad_norm": 0.46608745877751256, + "learning_rate": 0.00018234381688461942, + "loss": 0.8955, + "step": 68 + }, + { + "epoch": 0.2208, + "grad_norm": 0.4945171098342708, + "learning_rate": 0.0001817491209906506, + "loss": 0.8577, + "step": 69 + }, + { + "epoch": 0.224, + "grad_norm": 0.8348779091493823, + "learning_rate": 0.00018114557872800905, + "loss": 0.797, + "step": 70 + }, + { + "epoch": 0.2272, + "grad_norm": 0.43537627735797024, + "learning_rate": 0.00018053325540819045, + "loss": 0.884, + "step": 71 + }, + { + "epoch": 0.2304, + "grad_norm": 0.44115911348304754, + "learning_rate": 0.0001799122172929206, + "loss": 0.7575, + "step": 72 + }, + { + "epoch": 0.2336, + "grad_norm": 0.41669159487312485, + "learning_rate": 0.00017928253158698473, + "loss": 0.8395, + "step": 73 + }, + { + "epoch": 0.2368, + "grad_norm": 0.43611174569275457, + "learning_rate": 0.0001786442664309554, + "loss": 0.7673, + "step": 74 + }, + { + "epoch": 0.24, + "grad_norm": 0.48683682101110376, + "learning_rate": 0.0001779974908938184, + "loss": 0.8394, + "step": 75 + }, + { + "epoch": 0.2432, + "grad_norm": 0.39225378081689416, + "learning_rate": 0.0001773422749654988, + "loss": 0.7666, + "step": 76 + }, + { + "epoch": 0.2464, + "grad_norm": 0.461179637171612, + "learning_rate": 0.00017667868954928694, + "loss": 0.7901, + "step": 77 + }, + { + "epoch": 0.2496, + "grad_norm": 0.4411987636616236, + "learning_rate": 0.00017600680645416583, + "loss": 0.8018, + "step": 78 + }, + { + "epoch": 0.2528, + "grad_norm": 0.5385646530712281, + "learning_rate": 0.00017532669838704035, + "loss": 0.8043, + "step": 79 + }, + { + "epoch": 0.256, + "grad_norm": 0.4357285533429329, + "learning_rate": 0.00017463843894486937, + "loss": 0.8693, + "step": 80 + }, + { + "epoch": 0.2592, + "grad_norm": 0.45406278912963355, + "learning_rate": 0.0001739421026067017, + "loss": 0.8352, + "step": 81 + }, + { + "epoch": 0.2624, + "grad_norm": 0.45523658265619166, + "learning_rate": 0.00017323776472561627, + "loss": 0.8385, + "step": 82 + }, + { + "epoch": 0.2656, + "grad_norm": 0.4244244156508769, + "learning_rate": 0.00017252550152056795, + "loss": 0.7901, + "step": 83 + }, + { + "epoch": 0.2688, + "grad_norm": 0.4569043840214544, + "learning_rate": 0.0001718053900681397, + "loss": 0.8455, + "step": 84 + }, + { + "epoch": 0.272, + "grad_norm": 0.45844501767610796, + "learning_rate": 0.00017107750829420176, + "loss": 0.8152, + "step": 85 + }, + { + "epoch": 0.2752, + "grad_norm": 0.5322576158474753, + "learning_rate": 0.00017034193496547902, + "loss": 0.7228, + "step": 86 + }, + { + "epoch": 0.2784, + "grad_norm": 0.4138777808285333, + "learning_rate": 0.00016959874968102735, + "loss": 0.8249, + "step": 87 + }, + { + "epoch": 0.2816, + "grad_norm": 0.4799177917816522, + "learning_rate": 0.00016884803286362, + "loss": 0.7839, + "step": 88 + }, + { + "epoch": 0.2848, + "grad_norm": 0.47193551015131996, + "learning_rate": 0.00016808986575104465, + "loss": 0.8329, + "step": 89 + }, + { + "epoch": 0.288, + "grad_norm": 0.505900314758877, + "learning_rate": 0.00016732433038731242, + "loss": 0.8218, + "step": 90 + }, + { + "epoch": 0.2912, + "grad_norm": 0.5310009281948525, + "learning_rate": 0.0001665515096137797, + "loss": 0.7454, + "step": 91 + }, + { + "epoch": 0.2944, + "grad_norm": 0.43552333332885873, + "learning_rate": 0.00016577148706018328, + "loss": 0.8166, + "step": 92 + }, + { + "epoch": 0.2976, + "grad_norm": 0.41823268630022065, + "learning_rate": 0.00016498434713559088, + "loss": 0.8135, + "step": 93 + }, + { + "epoch": 0.3008, + "grad_norm": 0.42006171215316657, + "learning_rate": 0.00016419017501926656, + "loss": 0.8138, + "step": 94 + }, + { + "epoch": 0.304, + "grad_norm": 0.48524525282413145, + "learning_rate": 0.0001633890566514535, + "loss": 0.8095, + "step": 95 + }, + { + "epoch": 0.3072, + "grad_norm": 0.444659269135687, + "learning_rate": 0.00016258107872407375, + "loss": 0.7406, + "step": 96 + }, + { + "epoch": 0.3104, + "grad_norm": 0.39882715463751306, + "learning_rate": 0.0001617663286713474, + "loss": 0.7929, + "step": 97 + }, + { + "epoch": 0.3136, + "grad_norm": 0.40893464140153446, + "learning_rate": 0.00016094489466033043, + "loss": 0.7789, + "step": 98 + }, + { + "epoch": 0.3168, + "grad_norm": 0.4556551565194326, + "learning_rate": 0.00016011686558137448, + "loss": 0.8106, + "step": 99 + }, + { + "epoch": 0.32, + "grad_norm": 0.4710503877413765, + "learning_rate": 0.0001592823310385073, + "loss": 0.8376, + "step": 100 + }, + { + "epoch": 0.3232, + "grad_norm": 0.44873922703843017, + "learning_rate": 0.0001584413813397364, + "loss": 0.7397, + "step": 101 + }, + { + "epoch": 0.3264, + "grad_norm": 0.4780873178804398, + "learning_rate": 0.00015759410748727662, + "loss": 0.6855, + "step": 102 + }, + { + "epoch": 0.3296, + "grad_norm": 0.4292692486434121, + "learning_rate": 0.00015674060116770236, + "loss": 0.7567, + "step": 103 + }, + { + "epoch": 0.3328, + "grad_norm": 0.451175302234134, + "learning_rate": 0.00015588095474202595, + "loss": 0.7917, + "step": 104 + }, + { + "epoch": 0.336, + "grad_norm": 0.38705001178653353, + "learning_rate": 0.00015501526123570277, + "loss": 0.7167, + "step": 105 + }, + { + "epoch": 0.3392, + "grad_norm": 0.4969376218965549, + "learning_rate": 0.00015414361432856475, + "loss": 0.7367, + "step": 106 + }, + { + "epoch": 0.3424, + "grad_norm": 0.4548344658229783, + "learning_rate": 0.0001532661083446829, + "loss": 0.7754, + "step": 107 + }, + { + "epoch": 0.3456, + "grad_norm": 0.4224808161911037, + "learning_rate": 0.00015238283824216015, + "loss": 0.7699, + "step": 108 + }, + { + "epoch": 0.3488, + "grad_norm": 0.3594920451103802, + "learning_rate": 0.00015149389960285558, + "loss": 0.7257, + "step": 109 + }, + { + "epoch": 0.352, + "grad_norm": 0.526316503013457, + "learning_rate": 0.00015059938862204127, + "loss": 0.8518, + "step": 110 + }, + { + "epoch": 0.3552, + "grad_norm": 0.4354561024147389, + "learning_rate": 0.00014969940209799248, + "loss": 0.7457, + "step": 111 + }, + { + "epoch": 0.3584, + "grad_norm": 0.3834512633324317, + "learning_rate": 0.00014879403742151283, + "loss": 0.783, + "step": 112 + }, + { + "epoch": 0.3616, + "grad_norm": 0.42602748759310605, + "learning_rate": 0.00014788339256539544, + "loss": 0.8114, + "step": 113 + }, + { + "epoch": 0.3648, + "grad_norm": 0.46161254448655237, + "learning_rate": 0.0001469675660738206, + "loss": 0.8501, + "step": 114 + }, + { + "epoch": 0.368, + "grad_norm": 0.4647927281034576, + "learning_rate": 0.00014604665705169237, + "loss": 0.8369, + "step": 115 + }, + { + "epoch": 0.3712, + "grad_norm": 0.38934195690887574, + "learning_rate": 0.00014512076515391375, + "loss": 0.7882, + "step": 116 + }, + { + "epoch": 0.3744, + "grad_norm": 0.6055647046806627, + "learning_rate": 0.00014418999057460276, + "loss": 0.798, + "step": 117 + }, + { + "epoch": 0.3776, + "grad_norm": 0.48064016323776393, + "learning_rate": 0.0001432544340362501, + "loss": 0.8316, + "step": 118 + }, + { + "epoch": 0.3808, + "grad_norm": 0.4036750964150837, + "learning_rate": 0.00014231419677881966, + "loss": 0.7445, + "step": 119 + }, + { + "epoch": 0.384, + "grad_norm": 0.4069960376767926, + "learning_rate": 0.00014136938054879283, + "loss": 0.7529, + "step": 120 + }, + { + "epoch": 0.3872, + "grad_norm": 0.3780256529635664, + "learning_rate": 0.00014042008758815818, + "loss": 0.7842, + "step": 121 + }, + { + "epoch": 0.3904, + "grad_norm": 0.37948188830044266, + "learning_rate": 0.00013946642062334766, + "loss": 0.7771, + "step": 122 + }, + { + "epoch": 0.3936, + "grad_norm": 0.38434792324076283, + "learning_rate": 0.00013850848285411994, + "loss": 0.711, + "step": 123 + }, + { + "epoch": 0.3968, + "grad_norm": 0.4078342044913948, + "learning_rate": 0.000137546377942393, + "loss": 0.7397, + "step": 124 + }, + { + "epoch": 0.4, + "grad_norm": 0.4494696168619162, + "learning_rate": 0.00013658021000102636, + "loss": 0.7915, + "step": 125 + }, + { + "epoch": 0.4032, + "grad_norm": 0.3762112960128191, + "learning_rate": 0.00013561008358255468, + "loss": 0.7708, + "step": 126 + }, + { + "epoch": 0.4064, + "grad_norm": 0.4128183849828177, + "learning_rate": 0.00013463610366787392, + "loss": 0.8064, + "step": 127 + }, + { + "epoch": 0.4096, + "grad_norm": 0.43747572041533334, + "learning_rate": 0.00013365837565488064, + "loss": 0.7968, + "step": 128 + }, + { + "epoch": 0.4128, + "grad_norm": 0.37451679628856716, + "learning_rate": 0.0001326770053470668, + "loss": 0.823, + "step": 129 + }, + { + "epoch": 0.416, + "grad_norm": 0.49343862517101406, + "learning_rate": 0.0001316920989420703, + "loss": 0.797, + "step": 130 + }, + { + "epoch": 0.4192, + "grad_norm": 0.4059370775780427, + "learning_rate": 0.00013070376302018287, + "loss": 0.7822, + "step": 131 + }, + { + "epoch": 0.4224, + "grad_norm": 0.4765446987311867, + "learning_rate": 0.00012971210453281674, + "loss": 0.8045, + "step": 132 + }, + { + "epoch": 0.4256, + "grad_norm": 0.44304669968975896, + "learning_rate": 0.000128717230790931, + "loss": 0.7983, + "step": 133 + }, + { + "epoch": 0.4288, + "grad_norm": 0.4495572284704048, + "learning_rate": 0.00012771924945341906, + "loss": 0.8173, + "step": 134 + }, + { + "epoch": 0.432, + "grad_norm": 0.49519931957917346, + "learning_rate": 0.00012671826851545851, + "loss": 0.7146, + "step": 135 + }, + { + "epoch": 0.4352, + "grad_norm": 0.45440934320219123, + "learning_rate": 0.0001257143962968246, + "loss": 0.7902, + "step": 136 + }, + { + "epoch": 0.4384, + "grad_norm": 0.46136539516278813, + "learning_rate": 0.00012470774143016853, + "loss": 0.8228, + "step": 137 + }, + { + "epoch": 0.4416, + "grad_norm": 0.46998132335066334, + "learning_rate": 0.00012369841284926188, + "loss": 0.7491, + "step": 138 + }, + { + "epoch": 0.4448, + "grad_norm": 0.4857141173134919, + "learning_rate": 0.00012268651977720866, + "loss": 0.7795, + "step": 139 + }, + { + "epoch": 0.448, + "grad_norm": 0.37304693609839623, + "learning_rate": 0.00012167217171462566, + "loss": 0.7062, + "step": 140 + }, + { + "epoch": 0.4512, + "grad_norm": 0.45211235804047367, + "learning_rate": 0.0001206554784277931, + "loss": 0.7339, + "step": 141 + }, + { + "epoch": 0.4544, + "grad_norm": 0.46871049328137226, + "learning_rate": 0.00011963654993677645, + "loss": 0.8194, + "step": 142 + }, + { + "epoch": 0.4576, + "grad_norm": 0.40545006074712786, + "learning_rate": 0.00011861549650352069, + "loss": 0.7521, + "step": 143 + }, + { + "epoch": 0.4608, + "grad_norm": 0.5132829944284613, + "learning_rate": 0.00011759242861991855, + "loss": 0.8219, + "step": 144 + }, + { + "epoch": 0.464, + "grad_norm": 0.4217086703740498, + "learning_rate": 0.00011656745699585371, + "loss": 0.8332, + "step": 145 + }, + { + "epoch": 0.4672, + "grad_norm": 0.43581538181417107, + "learning_rate": 0.00011554069254722051, + "loss": 0.7329, + "step": 146 + }, + { + "epoch": 0.4704, + "grad_norm": 0.43691731849117116, + "learning_rate": 0.00011451224638392129, + "loss": 0.8389, + "step": 147 + }, + { + "epoch": 0.4736, + "grad_norm": 0.43966396101540145, + "learning_rate": 0.00011348222979784289, + "loss": 0.7951, + "step": 148 + }, + { + "epoch": 0.4768, + "grad_norm": 0.38406120627380513, + "learning_rate": 0.00011245075425081328, + "loss": 0.7095, + "step": 149 + }, + { + "epoch": 0.48, + "grad_norm": 0.49975454028098837, + "learning_rate": 0.00011141793136253986, + "loss": 0.7603, + "step": 150 + }, + { + "epoch": 0.4832, + "grad_norm": 0.39940277248775485, + "learning_rate": 0.0001103838728985307, + "loss": 0.6845, + "step": 151 + }, + { + "epoch": 0.4864, + "grad_norm": 0.409292796075142, + "learning_rate": 0.000109348690758, + "loss": 0.7485, + "step": 152 + }, + { + "epoch": 0.4896, + "grad_norm": 0.4620447146843351, + "learning_rate": 0.00010831249696175918, + "loss": 0.7555, + "step": 153 + }, + { + "epoch": 0.4928, + "grad_norm": 0.4949103640758731, + "learning_rate": 0.0001072754036400944, + "loss": 0.7818, + "step": 154 + }, + { + "epoch": 0.496, + "grad_norm": 0.48693309628498127, + "learning_rate": 0.00010623752302063283, + "loss": 0.822, + "step": 155 + }, + { + "epoch": 0.4992, + "grad_norm": 0.4275166982224059, + "learning_rate": 0.00010519896741619803, + "loss": 0.7868, + "step": 156 + }, + { + "epoch": 0.5024, + "grad_norm": 0.47251853589629417, + "learning_rate": 0.00010415984921265609, + "loss": 0.7993, + "step": 157 + }, + { + "epoch": 0.5056, + "grad_norm": 0.4408106892384705, + "learning_rate": 0.00010312028085675391, + "loss": 0.7888, + "step": 158 + }, + { + "epoch": 0.5088, + "grad_norm": 0.5455614880532736, + "learning_rate": 0.00010208037484395114, + "loss": 0.8227, + "step": 159 + }, + { + "epoch": 0.512, + "grad_norm": 0.4297050359627414, + "learning_rate": 0.00010104024370624644, + "loss": 0.7715, + "step": 160 + }, + { + "epoch": 0.5152, + "grad_norm": 0.39100077443136555, + "learning_rate": 0.0001, + "loss": 0.7334, + "step": 161 + }, + { + "epoch": 0.5184, + "grad_norm": 0.46422894754991223, + "learning_rate": 9.895975629375359e-05, + "loss": 0.7624, + "step": 162 + }, + { + "epoch": 0.5216, + "grad_norm": 0.37146356743726766, + "learning_rate": 9.791962515604887e-05, + "loss": 0.7447, + "step": 163 + }, + { + "epoch": 0.5248, + "grad_norm": 0.38940213180935757, + "learning_rate": 9.687971914324607e-05, + "loss": 0.7478, + "step": 164 + }, + { + "epoch": 0.528, + "grad_norm": 0.43337090859888355, + "learning_rate": 9.584015078734395e-05, + "loss": 0.7858, + "step": 165 + }, + { + "epoch": 0.5312, + "grad_norm": 0.4449843824081987, + "learning_rate": 9.480103258380198e-05, + "loss": 0.6978, + "step": 166 + }, + { + "epoch": 0.5344, + "grad_norm": 0.4444574240792365, + "learning_rate": 9.376247697936719e-05, + "loss": 0.7824, + "step": 167 + }, + { + "epoch": 0.5376, + "grad_norm": 0.39391177448125786, + "learning_rate": 9.272459635990562e-05, + "loss": 0.6804, + "step": 168 + }, + { + "epoch": 0.5408, + "grad_norm": 0.434169523981853, + "learning_rate": 9.168750303824084e-05, + "loss": 0.7449, + "step": 169 + }, + { + "epoch": 0.544, + "grad_norm": 0.4052559568894996, + "learning_rate": 9.065130924199998e-05, + "loss": 0.757, + "step": 170 + }, + { + "epoch": 0.5472, + "grad_norm": 0.4093093475639199, + "learning_rate": 8.961612710146934e-05, + "loss": 0.7287, + "step": 171 + }, + { + "epoch": 0.5504, + "grad_norm": 0.5057830506402323, + "learning_rate": 8.858206863746018e-05, + "loss": 0.7704, + "step": 172 + }, + { + "epoch": 0.5536, + "grad_norm": 0.405469711690343, + "learning_rate": 8.754924574918675e-05, + "loss": 0.7555, + "step": 173 + }, + { + "epoch": 0.5568, + "grad_norm": 0.41785558744905005, + "learning_rate": 8.651777020215712e-05, + "loss": 0.7296, + "step": 174 + }, + { + "epoch": 0.56, + "grad_norm": 0.39699871829153915, + "learning_rate": 8.548775361607872e-05, + "loss": 0.6315, + "step": 175 + }, + { + "epoch": 0.5632, + "grad_norm": 0.4027572504138232, + "learning_rate": 8.445930745277953e-05, + "loss": 0.7537, + "step": 176 + }, + { + "epoch": 0.5664, + "grad_norm": 0.4350632254202678, + "learning_rate": 8.343254300414628e-05, + "loss": 0.7564, + "step": 177 + }, + { + "epoch": 0.5696, + "grad_norm": 0.3939867904779663, + "learning_rate": 8.240757138008149e-05, + "loss": 0.6851, + "step": 178 + }, + { + "epoch": 0.5728, + "grad_norm": 0.43939859961499766, + "learning_rate": 8.138450349647936e-05, + "loss": 0.7935, + "step": 179 + }, + { + "epoch": 0.576, + "grad_norm": 0.48625092642856155, + "learning_rate": 8.036345006322359e-05, + "loss": 0.7706, + "step": 180 + }, + { + "epoch": 0.5792, + "grad_norm": 0.4981537473001381, + "learning_rate": 7.934452157220694e-05, + "loss": 0.7447, + "step": 181 + }, + { + "epoch": 0.5824, + "grad_norm": 0.43905428940129065, + "learning_rate": 7.832782828537437e-05, + "loss": 0.669, + "step": 182 + }, + { + "epoch": 0.5856, + "grad_norm": 0.40314450778540883, + "learning_rate": 7.731348022279134e-05, + "loss": 0.7944, + "step": 183 + }, + { + "epoch": 0.5888, + "grad_norm": 0.47831491273949467, + "learning_rate": 7.630158715073813e-05, + "loss": 0.7644, + "step": 184 + }, + { + "epoch": 0.592, + "grad_norm": 0.376804270353829, + "learning_rate": 7.52922585698315e-05, + "loss": 0.6484, + "step": 185 + }, + { + "epoch": 0.5952, + "grad_norm": 0.40960972586278444, + "learning_rate": 7.428560370317542e-05, + "loss": 0.7668, + "step": 186 + }, + { + "epoch": 0.5984, + "grad_norm": 0.47390937055957816, + "learning_rate": 7.328173148454151e-05, + "loss": 0.7606, + "step": 187 + }, + { + "epoch": 0.6016, + "grad_norm": 0.39995555750872014, + "learning_rate": 7.228075054658096e-05, + "loss": 0.6889, + "step": 188 + }, + { + "epoch": 0.6048, + "grad_norm": 0.43246771778131426, + "learning_rate": 7.1282769209069e-05, + "loss": 0.7386, + "step": 189 + }, + { + "epoch": 0.608, + "grad_norm": 0.4511103506749145, + "learning_rate": 7.028789546718326e-05, + "loss": 0.6925, + "step": 190 + }, + { + "epoch": 0.6112, + "grad_norm": 0.3744787661442005, + "learning_rate": 6.929623697981718e-05, + "loss": 0.6582, + "step": 191 + }, + { + "epoch": 0.6144, + "grad_norm": 0.4015516253801773, + "learning_rate": 6.830790105792973e-05, + "loss": 0.6855, + "step": 192 + }, + { + "epoch": 0.6176, + "grad_norm": 0.46137894759038195, + "learning_rate": 6.732299465293322e-05, + "loss": 0.7506, + "step": 193 + }, + { + "epoch": 0.6208, + "grad_norm": 0.36133467499460037, + "learning_rate": 6.63416243451194e-05, + "loss": 0.6634, + "step": 194 + }, + { + "epoch": 0.624, + "grad_norm": 0.398949905319442, + "learning_rate": 6.536389633212609e-05, + "loss": 0.7445, + "step": 195 + }, + { + "epoch": 0.6272, + "grad_norm": 0.43191616189051685, + "learning_rate": 6.43899164174453e-05, + "loss": 0.7167, + "step": 196 + }, + { + "epoch": 0.6304, + "grad_norm": 0.4382384369223291, + "learning_rate": 6.341978999897365e-05, + "loss": 0.7552, + "step": 197 + }, + { + "epoch": 0.6336, + "grad_norm": 0.364192338002942, + "learning_rate": 6.245362205760704e-05, + "loss": 0.7017, + "step": 198 + }, + { + "epoch": 0.6368, + "grad_norm": 0.4273232505653493, + "learning_rate": 6.149151714588009e-05, + "loss": 0.7702, + "step": 199 + }, + { + "epoch": 0.64, + "grad_norm": 0.45241456258728546, + "learning_rate": 6.053357937665237e-05, + "loss": 0.6983, + "step": 200 + }, + { + "epoch": 0.6432, + "grad_norm": 0.41870882008182425, + "learning_rate": 5.957991241184184e-05, + "loss": 0.7223, + "step": 201 + }, + { + "epoch": 0.6464, + "grad_norm": 0.36662315343893065, + "learning_rate": 5.863061945120719e-05, + "loss": 0.7102, + "step": 202 + }, + { + "epoch": 0.6496, + "grad_norm": 0.34240621241431185, + "learning_rate": 5.768580322118034e-05, + "loss": 0.7169, + "step": 203 + }, + { + "epoch": 0.6528, + "grad_norm": 0.3986673895031164, + "learning_rate": 5.6745565963749925e-05, + "loss": 0.7466, + "step": 204 + }, + { + "epoch": 0.656, + "grad_norm": 0.4311966052909308, + "learning_rate": 5.5810009425397294e-05, + "loss": 0.7265, + "step": 205 + }, + { + "epoch": 0.6592, + "grad_norm": 0.5541661302074115, + "learning_rate": 5.487923484608629e-05, + "loss": 0.7585, + "step": 206 + }, + { + "epoch": 0.6624, + "grad_norm": 0.44000575936807124, + "learning_rate": 5.395334294830765e-05, + "loss": 0.6811, + "step": 207 + }, + { + "epoch": 0.6656, + "grad_norm": 0.47849947838940127, + "learning_rate": 5.3032433926179395e-05, + "loss": 0.7414, + "step": 208 + }, + { + "epoch": 0.6688, + "grad_norm": 0.43089757195163225, + "learning_rate": 5.211660743460458e-05, + "loss": 0.7657, + "step": 209 + }, + { + "epoch": 0.672, + "grad_norm": 0.44970102772398723, + "learning_rate": 5.1205962578487155e-05, + "loss": 0.6995, + "step": 210 + }, + { + "epoch": 0.6752, + "grad_norm": 0.4017912074199865, + "learning_rate": 5.030059790200756e-05, + "loss": 0.6994, + "step": 211 + }, + { + "epoch": 0.6784, + "grad_norm": 0.37774081653175595, + "learning_rate": 4.940061137795876e-05, + "loss": 0.7418, + "step": 212 + }, + { + "epoch": 0.6816, + "grad_norm": 0.4471206292757365, + "learning_rate": 4.850610039714444e-05, + "loss": 0.7225, + "step": 213 + }, + { + "epoch": 0.6848, + "grad_norm": 0.4289588771903726, + "learning_rate": 4.761716175783989e-05, + "loss": 0.7391, + "step": 214 + }, + { + "epoch": 0.688, + "grad_norm": 0.4957800599644603, + "learning_rate": 4.673389165531714e-05, + "loss": 0.6979, + "step": 215 + }, + { + "epoch": 0.6912, + "grad_norm": 0.4169874944494124, + "learning_rate": 4.585638567143529e-05, + "loss": 0.7691, + "step": 216 + }, + { + "epoch": 0.6944, + "grad_norm": 0.3873593192271462, + "learning_rate": 4.498473876429726e-05, + "loss": 0.7223, + "step": 217 + }, + { + "epoch": 0.6976, + "grad_norm": 0.35372466810924785, + "learning_rate": 4.411904525797408e-05, + "loss": 0.7342, + "step": 218 + }, + { + "epoch": 0.7008, + "grad_norm": 0.41673004219163245, + "learning_rate": 4.325939883229766e-05, + "loss": 0.8016, + "step": 219 + }, + { + "epoch": 0.704, + "grad_norm": 0.3824442425010731, + "learning_rate": 4.240589251272342e-05, + "loss": 0.7741, + "step": 220 + }, + { + "epoch": 0.7072, + "grad_norm": 0.40158610632934527, + "learning_rate": 4.155861866026364e-05, + "loss": 0.7634, + "step": 221 + }, + { + "epoch": 0.7104, + "grad_norm": 0.4244144316802503, + "learning_rate": 4.071766896149273e-05, + "loss": 0.7989, + "step": 222 + }, + { + "epoch": 0.7136, + "grad_norm": 0.6061648550240569, + "learning_rate": 3.988313441862553e-05, + "loss": 0.719, + "step": 223 + }, + { + "epoch": 0.7168, + "grad_norm": 0.3792471751631609, + "learning_rate": 3.9055105339669595e-05, + "loss": 0.735, + "step": 224 + }, + { + "epoch": 0.72, + "grad_norm": 0.3380204782373729, + "learning_rate": 3.823367132865265e-05, + "loss": 0.6407, + "step": 225 + }, + { + "epoch": 0.7232, + "grad_norm": 0.4869562528787163, + "learning_rate": 3.741892127592625e-05, + "loss": 0.7544, + "step": 226 + }, + { + "epoch": 0.7264, + "grad_norm": 0.5001594842961744, + "learning_rate": 3.6610943348546526e-05, + "loss": 0.7652, + "step": 227 + }, + { + "epoch": 0.7296, + "grad_norm": 0.45072491971435924, + "learning_rate": 3.580982498073344e-05, + "loss": 0.7817, + "step": 228 + }, + { + "epoch": 0.7328, + "grad_norm": 0.4100150948448548, + "learning_rate": 3.501565286440914e-05, + "loss": 0.6942, + "step": 229 + }, + { + "epoch": 0.736, + "grad_norm": 0.40874454525721804, + "learning_rate": 3.422851293981676e-05, + "loss": 0.7549, + "step": 230 + }, + { + "epoch": 0.7392, + "grad_norm": 0.4260284237081343, + "learning_rate": 3.3448490386220355e-05, + "loss": 0.7028, + "step": 231 + }, + { + "epoch": 0.7424, + "grad_norm": 0.3552812102786192, + "learning_rate": 3.2675669612687565e-05, + "loss": 0.681, + "step": 232 + }, + { + "epoch": 0.7456, + "grad_norm": 0.44069685250106094, + "learning_rate": 3.191013424895536e-05, + "loss": 0.8137, + "step": 233 + }, + { + "epoch": 0.7488, + "grad_norm": 0.4690337765690944, + "learning_rate": 3.115196713638e-05, + "loss": 0.7733, + "step": 234 + }, + { + "epoch": 0.752, + "grad_norm": 0.3741311567109358, + "learning_rate": 3.040125031897264e-05, + "loss": 0.699, + "step": 235 + }, + { + "epoch": 0.7552, + "grad_norm": 0.4028572108946936, + "learning_rate": 2.9658065034520978e-05, + "loss": 0.6446, + "step": 236 + }, + { + "epoch": 0.7584, + "grad_norm": 0.45854603721647536, + "learning_rate": 2.892249170579826e-05, + "loss": 0.7475, + "step": 237 + }, + { + "epoch": 0.7616, + "grad_norm": 0.3681731161185109, + "learning_rate": 2.8194609931860316e-05, + "loss": 0.7596, + "step": 238 + }, + { + "epoch": 0.7648, + "grad_norm": 0.3776434010253435, + "learning_rate": 2.7474498479432087e-05, + "loss": 0.7007, + "step": 239 + }, + { + "epoch": 0.768, + "grad_norm": 0.3738418901709509, + "learning_rate": 2.6762235274383772e-05, + "loss": 0.7149, + "step": 240 + }, + { + "epoch": 0.7712, + "grad_norm": 0.4835092594634167, + "learning_rate": 2.6057897393298324e-05, + "loss": 0.7032, + "step": 241 + }, + { + "epoch": 0.7744, + "grad_norm": 0.4017853036129437, + "learning_rate": 2.536156105513062e-05, + "loss": 0.6546, + "step": 242 + }, + { + "epoch": 0.7776, + "grad_norm": 0.4480182382215609, + "learning_rate": 2.4673301612959654e-05, + "loss": 0.6762, + "step": 243 + }, + { + "epoch": 0.7808, + "grad_norm": 0.3812030302518587, + "learning_rate": 2.399319354583418e-05, + "loss": 0.6977, + "step": 244 + }, + { + "epoch": 0.784, + "grad_norm": 0.40746694197290256, + "learning_rate": 2.3321310450713062e-05, + "loss": 0.7143, + "step": 245 + }, + { + "epoch": 0.7872, + "grad_norm": 0.4111923252375617, + "learning_rate": 2.265772503450122e-05, + "loss": 0.6488, + "step": 246 + }, + { + "epoch": 0.7904, + "grad_norm": 0.4073753912868798, + "learning_rate": 2.2002509106181624e-05, + "loss": 0.683, + "step": 247 + }, + { + "epoch": 0.7936, + "grad_norm": 0.388898515608764, + "learning_rate": 2.1355733569044635e-05, + "loss": 0.7862, + "step": 248 + }, + { + "epoch": 0.7968, + "grad_norm": 0.385706083912587, + "learning_rate": 2.0717468413015283e-05, + "loss": 0.6774, + "step": 249 + }, + { + "epoch": 0.8, + "grad_norm": 0.41895090641408606, + "learning_rate": 2.008778270707944e-05, + "loss": 0.7269, + "step": 250 + }, + { + "epoch": 0.8032, + "grad_norm": 0.47082427321169534, + "learning_rate": 1.946674459180955e-05, + "loss": 0.745, + "step": 251 + }, + { + "epoch": 0.8064, + "grad_norm": 0.38248626438781763, + "learning_rate": 1.8854421271990964e-05, + "loss": 0.6811, + "step": 252 + }, + { + "epoch": 0.8096, + "grad_norm": 0.4254072536431337, + "learning_rate": 1.8250879009349398e-05, + "loss": 0.7069, + "step": 253 + }, + { + "epoch": 0.8128, + "grad_norm": 0.37837697092083017, + "learning_rate": 1.7656183115380577e-05, + "loss": 0.6164, + "step": 254 + }, + { + "epoch": 0.816, + "grad_norm": 0.38604759856266024, + "learning_rate": 1.707039794428259e-05, + "loss": 0.7188, + "step": 255 + }, + { + "epoch": 0.8192, + "grad_norm": 0.3871010273288266, + "learning_rate": 1.649358688599191e-05, + "loss": 0.7856, + "step": 256 + }, + { + "epoch": 0.8224, + "grad_norm": 0.36665735756837253, + "learning_rate": 1.5925812359323745e-05, + "loss": 0.6653, + "step": 257 + }, + { + "epoch": 0.8256, + "grad_norm": 0.41254820532951914, + "learning_rate": 1.5367135805217458e-05, + "loss": 0.7099, + "step": 258 + }, + { + "epoch": 0.8288, + "grad_norm": 0.4222647650532378, + "learning_rate": 1.4817617680087825e-05, + "loss": 0.7257, + "step": 259 + }, + { + "epoch": 0.832, + "grad_norm": 0.3622288431601804, + "learning_rate": 1.4277317449282834e-05, + "loss": 0.7196, + "step": 260 + }, + { + "epoch": 0.8352, + "grad_norm": 0.43802019298705874, + "learning_rate": 1.3746293580648717e-05, + "loss": 0.7194, + "step": 261 + }, + { + "epoch": 0.8384, + "grad_norm": 0.515059265768125, + "learning_rate": 1.3224603538202929e-05, + "loss": 0.7283, + "step": 262 + }, + { + "epoch": 0.8416, + "grad_norm": 0.42005833634395573, + "learning_rate": 1.2712303775915802e-05, + "loss": 0.6789, + "step": 263 + }, + { + "epoch": 0.8448, + "grad_norm": 0.43683440181136446, + "learning_rate": 1.220944973160133e-05, + "loss": 0.6844, + "step": 264 + }, + { + "epoch": 0.848, + "grad_norm": 0.4115772545929074, + "learning_rate": 1.1716095820918216e-05, + "loss": 0.732, + "step": 265 + }, + { + "epoch": 0.8512, + "grad_norm": 0.34094173712492826, + "learning_rate": 1.1232295431481222e-05, + "loss": 0.6698, + "step": 266 + }, + { + "epoch": 0.8544, + "grad_norm": 0.40968587705008036, + "learning_rate": 1.0758100917083991e-05, + "loss": 0.7368, + "step": 267 + }, + { + "epoch": 0.8576, + "grad_norm": 0.341086694470386, + "learning_rate": 1.0293563592033595e-05, + "loss": 0.652, + "step": 268 + }, + { + "epoch": 0.8608, + "grad_norm": 0.39824391443335655, + "learning_rate": 9.838733725597615e-06, + "loss": 0.7309, + "step": 269 + }, + { + "epoch": 0.864, + "grad_norm": 0.40651294928363996, + "learning_rate": 9.393660536564408e-06, + "loss": 0.6989, + "step": 270 + }, + { + "epoch": 0.8672, + "grad_norm": 0.47718296599282495, + "learning_rate": 8.958392187916841e-06, + "loss": 0.7175, + "step": 271 + }, + { + "epoch": 0.8704, + "grad_norm": 0.4288787554748259, + "learning_rate": 8.532975781620512e-06, + "loss": 0.6879, + "step": 272 + }, + { + "epoch": 0.8736, + "grad_norm": 0.4376532859776581, + "learning_rate": 8.117457353526625e-06, + "loss": 0.7295, + "step": 273 + }, + { + "epoch": 0.8768, + "grad_norm": 0.4050176075548045, + "learning_rate": 7.711881868390291e-06, + "loss": 0.7493, + "step": 274 + }, + { + "epoch": 0.88, + "grad_norm": 0.41762319521896446, + "learning_rate": 7.3162932150046885e-06, + "loss": 0.6795, + "step": 275 + }, + { + "epoch": 0.8832, + "grad_norm": 0.5144078304017623, + "learning_rate": 6.930734201451816e-06, + "loss": 0.7602, + "step": 276 + }, + { + "epoch": 0.8864, + "grad_norm": 0.4664003629503036, + "learning_rate": 6.555246550469907e-06, + "loss": 0.7801, + "step": 277 + }, + { + "epoch": 0.8896, + "grad_norm": 0.428652215880614, + "learning_rate": 6.189870894938587e-06, + "loss": 0.7168, + "step": 278 + }, + { + "epoch": 0.8928, + "grad_norm": 0.39893922809100696, + "learning_rate": 5.834646773481811e-06, + "loss": 0.726, + "step": 279 + }, + { + "epoch": 0.896, + "grad_norm": 0.3518817379354701, + "learning_rate": 5.489612626189245e-06, + "loss": 0.6395, + "step": 280 + }, + { + "epoch": 0.8992, + "grad_norm": 0.4669389577355934, + "learning_rate": 5.154805790456485e-06, + "loss": 0.7581, + "step": 281 + }, + { + "epoch": 0.9024, + "grad_norm": 0.38174878134103185, + "learning_rate": 4.830262496944693e-06, + "loss": 0.7058, + "step": 282 + }, + { + "epoch": 0.9056, + "grad_norm": 0.40214897406185196, + "learning_rate": 4.516017865659949e-06, + "loss": 0.6688, + "step": 283 + }, + { + "epoch": 0.9088, + "grad_norm": 0.5337589314562605, + "learning_rate": 4.21210590215273e-06, + "loss": 0.7816, + "step": 284 + }, + { + "epoch": 0.912, + "grad_norm": 0.3904988161266009, + "learning_rate": 3.918559493838114e-06, + "loss": 0.7557, + "step": 285 + }, + { + "epoch": 0.9152, + "grad_norm": 0.38486028703867425, + "learning_rate": 3.6354104064368566e-06, + "loss": 0.7289, + "step": 286 + }, + { + "epoch": 0.9184, + "grad_norm": 0.4776346872652581, + "learning_rate": 3.3626892805379562e-06, + "loss": 0.7448, + "step": 287 + }, + { + "epoch": 0.9216, + "grad_norm": 0.43484107579010384, + "learning_rate": 3.100425628282899e-06, + "loss": 0.6657, + "step": 288 + }, + { + "epoch": 0.9248, + "grad_norm": 0.36820384941205775, + "learning_rate": 2.848647830172024e-06, + "loss": 0.7404, + "step": 289 + }, + { + "epoch": 0.928, + "grad_norm": 0.3970556699438135, + "learning_rate": 2.607383131993424e-06, + "loss": 0.6972, + "step": 290 + }, + { + "epoch": 0.9312, + "grad_norm": 2.58235750777289, + "learning_rate": 2.3766576418745022e-06, + "loss": 0.67, + "step": 291 + }, + { + "epoch": 0.9344, + "grad_norm": 0.5069706192556233, + "learning_rate": 2.1564963274568027e-06, + "loss": 0.7102, + "step": 292 + }, + { + "epoch": 0.9376, + "grad_norm": 0.4923262833153959, + "learning_rate": 1.9469230131940907e-06, + "loss": 0.7609, + "step": 293 + }, + { + "epoch": 0.9408, + "grad_norm": 0.4142497515699397, + "learning_rate": 1.7479603777742938e-06, + "loss": 0.6891, + "step": 294 + }, + { + "epoch": 0.944, + "grad_norm": 0.3792544487029188, + "learning_rate": 1.559629951665298e-06, + "loss": 0.7453, + "step": 295 + }, + { + "epoch": 0.9472, + "grad_norm": 0.45932577189942847, + "learning_rate": 1.3819521147851123e-06, + "loss": 0.7927, + "step": 296 + }, + { + "epoch": 0.9504, + "grad_norm": 0.39999104491608034, + "learning_rate": 1.2149460942964098e-06, + "loss": 0.6499, + "step": 297 + }, + { + "epoch": 0.9536, + "grad_norm": 0.4266247591676935, + "learning_rate": 1.05862996252597e-06, + "loss": 0.7357, + "step": 298 + }, + { + "epoch": 0.9568, + "grad_norm": 0.40088208414107185, + "learning_rate": 9.130206350089765e-07, + "loss": 0.7559, + "step": 299 + }, + { + "epoch": 0.96, + "grad_norm": 0.37949174127982427, + "learning_rate": 7.781338686584927e-07, + "loss": 0.7175, + "step": 300 + }, + { + "epoch": 0.9632, + "grad_norm": 0.4595268076207302, + "learning_rate": 6.539842600603918e-07, + "loss": 0.7367, + "step": 301 + }, + { + "epoch": 0.9664, + "grad_norm": 0.41448073681420394, + "learning_rate": 5.405852438937764e-07, + "loss": 0.7889, + "step": 302 + }, + { + "epoch": 0.9696, + "grad_norm": 0.45148737131128025, + "learning_rate": 4.3794909147720773e-07, + "loss": 0.7616, + "step": 303 + }, + { + "epoch": 0.9728, + "grad_norm": 0.5663093229287312, + "learning_rate": 3.4608690944071263e-07, + "loss": 0.7589, + "step": 304 + }, + { + "epoch": 0.976, + "grad_norm": 0.44329650062299025, + "learning_rate": 2.6500863852395584e-07, + "loss": 0.7482, + "step": 305 + }, + { + "epoch": 0.9792, + "grad_norm": 0.4885780490332609, + "learning_rate": 1.947230525005006e-07, + "loss": 0.796, + "step": 306 + }, + { + "epoch": 0.9824, + "grad_norm": 0.47646806559659416, + "learning_rate": 1.3523775722834587e-07, + "loss": 0.788, + "step": 307 + }, + { + "epoch": 0.9856, + "grad_norm": 0.4473469054171859, + "learning_rate": 8.655918982689581e-08, + "loss": 0.7049, + "step": 308 + }, + { + "epoch": 0.9888, + "grad_norm": 0.36500013878681864, + "learning_rate": 4.8692617980350406e-08, + "loss": 0.6675, + "step": 309 + }, + { + "epoch": 0.992, + "grad_norm": 0.3952870718353004, + "learning_rate": 2.164213936770576e-08, + "loss": 0.7168, + "step": 310 + }, + { + "epoch": 0.9952, + "grad_norm": 0.35403001404819195, + "learning_rate": 5.410681219286673e-09, + "loss": 0.7338, + "step": 311 + }, + { + "epoch": 0.9984, + "grad_norm": 0.4191626699523053, + "learning_rate": 0.0, + "loss": 0.7846, + "step": 312 + }, + { + "epoch": 0.9984, + "step": 312, + "total_flos": 273934979694592.0, + "train_loss": 0.782174748296921, + "train_runtime": 4830.1855, + "train_samples_per_second": 1.035, + "train_steps_per_second": 0.065 + } + ], + "logging_steps": 1.0, + "max_steps": 312, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 273934979694592.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_3_epochs_1_GA_4_lora/README.md b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_3_epochs_1_GA_4_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_3_epochs_1_GA_4_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_3_epochs_1_GA_4_lora/adapter_config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_3_epochs_1_GA_4_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b26b3f5e148b66cdcc899e12b200aec7e66dd956 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_3_epochs_1_GA_4_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "k_proj", + "o_proj", + "gate_proj", + "up_proj", + "down_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_3_epochs_1_GA_4_lora/adapter_model.safetensors b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_3_epochs_1_GA_4_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f9b1ecab6d4efc4660af745ca103f2591711b9e8 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_3_epochs_1_GA_4_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4b8a982da8abdd36e0f519116503d61efcac7cfc555ef7920a46d13be3e28d0 +size 671150064 diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_3_epochs_1_GA_4_lora/config.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_3_epochs_1_GA_4_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_3_epochs_1_GA_4_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_3_epochs_1_GA_4_lora/non_lora_trainables.bin b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_3_epochs_1_GA_4_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..32ba8d5e6d8d938e923bb098c9909f532859a5b7 --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_3_epochs_1_GA_4_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:090d67a5e8b539a067a31c2fbac49c8951630d0bacd85d8940f5e5a92bb6c5d5 +size 918507402 diff --git a/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_3_epochs_1_GA_4_lora/trainer_state.json b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_3_epochs_1_GA_4_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..75489b583d64b22e5968095b01180eac92b9cecd --- /dev/null +++ b/mixing_strategies/Proportional/bugsBunny-v1_1-Llama-3-8B-V-Proportional_dataset_5000_repeat_3_epochs_1_GA_4_lora/trainer_state.json @@ -0,0 +1,1134 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9984, + "eval_steps": 500, + "global_step": 156, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0064, + "grad_norm": 1.033650962864784, + "learning_rate": 4e-05, + "loss": 1.5395, + "step": 1 + }, + { + "epoch": 0.0128, + "grad_norm": 0.9072482054967783, + "learning_rate": 8e-05, + "loss": 1.4104, + "step": 2 + }, + { + "epoch": 0.0192, + "grad_norm": 0.6929696257997826, + "learning_rate": 0.00012, + "loss": 1.2895, + "step": 3 + }, + { + "epoch": 0.0256, + "grad_norm": 1.2743831532728433, + "learning_rate": 0.00016, + "loss": 1.2358, + "step": 4 + }, + { + "epoch": 0.032, + "grad_norm": 0.9635249398731617, + "learning_rate": 0.0002, + "loss": 1.0529, + "step": 5 + }, + { + "epoch": 0.0384, + "grad_norm": 0.7620736106367577, + "learning_rate": 0.0001999783578606323, + "loss": 0.9811, + "step": 6 + }, + { + "epoch": 0.0448, + "grad_norm": 0.548706344742551, + "learning_rate": 0.0001999134408101731, + "loss": 0.9539, + "step": 7 + }, + { + "epoch": 0.0512, + "grad_norm": 0.4838581066198791, + "learning_rate": 0.00019980527694749952, + "loss": 0.9446, + "step": 8 + }, + { + "epoch": 0.0576, + "grad_norm": 0.4729470711419524, + "learning_rate": 0.0001996539130905593, + "loss": 0.9248, + "step": 9 + }, + { + "epoch": 0.064, + "grad_norm": 0.4410831370616841, + "learning_rate": 0.00019945941475610623, + "loss": 0.9363, + "step": 10 + }, + { + "epoch": 0.0704, + "grad_norm": 0.4263271322679063, + "learning_rate": 0.0001992218661313415, + "loss": 0.9441, + "step": 11 + }, + { + "epoch": 0.0768, + "grad_norm": 0.43101174777747714, + "learning_rate": 0.00019894137003747403, + "loss": 0.8779, + "step": 12 + }, + { + "epoch": 0.0832, + "grad_norm": 0.37861484383938987, + "learning_rate": 0.00019861804788521493, + "loss": 0.915, + "step": 13 + }, + { + "epoch": 0.0896, + "grad_norm": 0.3801259524726845, + "learning_rate": 0.00019825203962222572, + "loss": 0.8727, + "step": 14 + }, + { + "epoch": 0.096, + "grad_norm": 0.3915827921639002, + "learning_rate": 0.00019784350367254322, + "loss": 0.8938, + "step": 15 + }, + { + "epoch": 0.1024, + "grad_norm": 0.3476027967669314, + "learning_rate": 0.0001973926168680066, + "loss": 0.8467, + "step": 16 + }, + { + "epoch": 0.1088, + "grad_norm": 0.3818502534495098, + "learning_rate": 0.0001968995743717171, + "loss": 0.9224, + "step": 17 + }, + { + "epoch": 0.1152, + "grad_norm": 0.37169132676026895, + "learning_rate": 0.00019636458959356316, + "loss": 0.8099, + "step": 18 + }, + { + "epoch": 0.1216, + "grad_norm": 0.3396087648907952, + "learning_rate": 0.00019578789409784727, + "loss": 0.8625, + "step": 19 + }, + { + "epoch": 0.128, + "grad_norm": 0.381238289245554, + "learning_rate": 0.00019516973750305532, + "loss": 0.8612, + "step": 20 + }, + { + "epoch": 0.1344, + "grad_norm": 0.38304900844693557, + "learning_rate": 0.00019451038737381077, + "loss": 0.8244, + "step": 21 + }, + { + "epoch": 0.1408, + "grad_norm": 0.3592327009433714, + "learning_rate": 0.00019381012910506146, + "loss": 0.8547, + "step": 22 + }, + { + "epoch": 0.1472, + "grad_norm": 0.40916090982578535, + "learning_rate": 0.00019306926579854821, + "loss": 0.8672, + "step": 23 + }, + { + "epoch": 0.1536, + "grad_norm": 0.3635660633456493, + "learning_rate": 0.0001922881181316097, + "loss": 0.843, + "step": 24 + }, + { + "epoch": 0.16, + "grad_norm": 0.3434073608671318, + "learning_rate": 0.0001914670242183795, + "loss": 0.8477, + "step": 25 + }, + { + "epoch": 0.1664, + "grad_norm": 0.32796730748027947, + "learning_rate": 0.0001906063394634356, + "loss": 0.7971, + "step": 26 + }, + { + "epoch": 0.1728, + "grad_norm": 0.3090897990220882, + "learning_rate": 0.00018970643640796642, + "loss": 0.8387, + "step": 27 + }, + { + "epoch": 0.1792, + "grad_norm": 0.35001291264577755, + "learning_rate": 0.00018876770456851877, + "loss": 0.8317, + "step": 28 + }, + { + "epoch": 0.1856, + "grad_norm": 0.36428491335127045, + "learning_rate": 0.00018779055026839868, + "loss": 0.8841, + "step": 29 + }, + { + "epoch": 0.192, + "grad_norm": 0.3243954309502511, + "learning_rate": 0.00018677539646179707, + "loss": 0.8171, + "step": 30 + }, + { + "epoch": 0.1984, + "grad_norm": 0.33128151860617544, + "learning_rate": 0.00018572268255071718, + "loss": 0.8143, + "step": 31 + }, + { + "epoch": 0.2048, + "grad_norm": 0.3243232431069116, + "learning_rate": 0.00018463286419478255, + "loss": 0.7925, + "step": 32 + }, + { + "epoch": 0.2112, + "grad_norm": 0.31250678604816295, + "learning_rate": 0.00018350641311400812, + "loss": 0.812, + "step": 33 + }, + { + "epoch": 0.2176, + "grad_norm": 0.32678858604781935, + "learning_rate": 0.00018234381688461942, + "loss": 0.8368, + "step": 34 + }, + { + "epoch": 0.224, + "grad_norm": 0.35597874869490725, + "learning_rate": 0.00018114557872800905, + "loss": 0.8239, + "step": 35 + }, + { + "epoch": 0.2304, + "grad_norm": 0.3089808870901914, + "learning_rate": 0.0001799122172929206, + "loss": 0.8152, + "step": 36 + }, + { + "epoch": 0.2368, + "grad_norm": 0.2946998902064204, + "learning_rate": 0.0001786442664309554, + "loss": 0.8018, + "step": 37 + }, + { + "epoch": 0.2432, + "grad_norm": 0.30147846071695233, + "learning_rate": 0.0001773422749654988, + "loss": 0.8034, + "step": 38 + }, + { + "epoch": 0.2496, + "grad_norm": 0.3495620439449062, + "learning_rate": 0.00017600680645416583, + "loss": 0.7917, + "step": 39 + }, + { + "epoch": 0.256, + "grad_norm": 0.35017459683865043, + "learning_rate": 0.00017463843894486937, + "loss": 0.8415, + "step": 40 + }, + { + "epoch": 0.2624, + "grad_norm": 0.3255738294856779, + "learning_rate": 0.00017323776472561627, + "loss": 0.8381, + "step": 41 + }, + { + "epoch": 0.2688, + "grad_norm": 0.32143555884545844, + "learning_rate": 0.0001718053900681397, + "loss": 0.8136, + "step": 42 + }, + { + "epoch": 0.2752, + "grad_norm": 0.35224643730658256, + "learning_rate": 0.00017034193496547902, + "loss": 0.7686, + "step": 43 + }, + { + "epoch": 0.2816, + "grad_norm": 0.3330551520720871, + "learning_rate": 0.00016884803286362, + "loss": 0.8107, + "step": 44 + }, + { + "epoch": 0.288, + "grad_norm": 0.33671933143502814, + "learning_rate": 0.00016732433038731242, + "loss": 0.8266, + "step": 45 + }, + { + "epoch": 0.2944, + "grad_norm": 0.33238469396347947, + "learning_rate": 0.00016577148706018328, + "loss": 0.7786, + "step": 46 + }, + { + "epoch": 0.3008, + "grad_norm": 0.3119497080343661, + "learning_rate": 0.00016419017501926656, + "loss": 0.812, + "step": 47 + }, + { + "epoch": 0.3072, + "grad_norm": 0.33556678648575605, + "learning_rate": 0.00016258107872407375, + "loss": 0.7689, + "step": 48 + }, + { + "epoch": 0.3136, + "grad_norm": 0.29754676091345855, + "learning_rate": 0.00016094489466033043, + "loss": 0.7856, + "step": 49 + }, + { + "epoch": 0.32, + "grad_norm": 0.3212550236302518, + "learning_rate": 0.0001592823310385073, + "loss": 0.8287, + "step": 50 + }, + { + "epoch": 0.3264, + "grad_norm": 0.3223260509089707, + "learning_rate": 0.00015759410748727662, + "loss": 0.7134, + "step": 51 + }, + { + "epoch": 0.3328, + "grad_norm": 0.32472077807782423, + "learning_rate": 0.00015588095474202595, + "loss": 0.7696, + "step": 52 + }, + { + "epoch": 0.3392, + "grad_norm": 0.3144612888628789, + "learning_rate": 0.00015414361432856475, + "loss": 0.7227, + "step": 53 + }, + { + "epoch": 0.3456, + "grad_norm": 0.31748264767086415, + "learning_rate": 0.00015238283824216015, + "loss": 0.7652, + "step": 54 + }, + { + "epoch": 0.352, + "grad_norm": 0.33053354962059733, + "learning_rate": 0.00015059938862204127, + "loss": 0.7867, + "step": 55 + }, + { + "epoch": 0.3584, + "grad_norm": 0.31913427175442033, + "learning_rate": 0.00014879403742151283, + "loss": 0.7623, + "step": 56 + }, + { + "epoch": 0.3648, + "grad_norm": 0.3338267291180613, + "learning_rate": 0.0001469675660738206, + "loss": 0.8261, + "step": 57 + }, + { + "epoch": 0.3712, + "grad_norm": 0.31258460736200183, + "learning_rate": 0.00014512076515391375, + "loss": 0.8039, + "step": 58 + }, + { + "epoch": 0.3776, + "grad_norm": 0.35243368970699207, + "learning_rate": 0.0001432544340362501, + "loss": 0.8099, + "step": 59 + }, + { + "epoch": 0.384, + "grad_norm": 0.30397586139928945, + "learning_rate": 0.00014136938054879283, + "loss": 0.7435, + "step": 60 + }, + { + "epoch": 0.3904, + "grad_norm": 0.2747692975616536, + "learning_rate": 0.00013946642062334766, + "loss": 0.7764, + "step": 61 + }, + { + "epoch": 0.3968, + "grad_norm": 0.29199473562926864, + "learning_rate": 0.000137546377942393, + "loss": 0.7222, + "step": 62 + }, + { + "epoch": 0.4032, + "grad_norm": 0.3078900971458151, + "learning_rate": 0.00013561008358255468, + "loss": 0.7792, + "step": 63 + }, + { + "epoch": 0.4096, + "grad_norm": 0.324691531513113, + "learning_rate": 0.00013365837565488064, + "loss": 0.798, + "step": 64 + }, + { + "epoch": 0.416, + "grad_norm": 0.3335964441250956, + "learning_rate": 0.0001316920989420703, + "loss": 0.8066, + "step": 65 + }, + { + "epoch": 0.4224, + "grad_norm": 0.3373556037280127, + "learning_rate": 0.00012971210453281674, + "loss": 0.793, + "step": 66 + }, + { + "epoch": 0.4288, + "grad_norm": 0.32941353137034224, + "learning_rate": 0.00012771924945341906, + "loss": 0.81, + "step": 67 + }, + { + "epoch": 0.4352, + "grad_norm": 0.35399403922854145, + "learning_rate": 0.0001257143962968246, + "loss": 0.7501, + "step": 68 + }, + { + "epoch": 0.4416, + "grad_norm": 0.34917009678959915, + "learning_rate": 0.00012369841284926188, + "loss": 0.789, + "step": 69 + }, + { + "epoch": 0.448, + "grad_norm": 0.33376866968787544, + "learning_rate": 0.00012167217171462566, + "loss": 0.7425, + "step": 70 + }, + { + "epoch": 0.4544, + "grad_norm": 0.33811642179908985, + "learning_rate": 0.00011963654993677645, + "loss": 0.7682, + "step": 71 + }, + { + "epoch": 0.4608, + "grad_norm": 0.3425051696044902, + "learning_rate": 0.00011759242861991855, + "loss": 0.7822, + "step": 72 + }, + { + "epoch": 0.4672, + "grad_norm": 0.3107034281284392, + "learning_rate": 0.00011554069254722051, + "loss": 0.7819, + "step": 73 + }, + { + "epoch": 0.4736, + "grad_norm": 0.33282426680005467, + "learning_rate": 0.00011348222979784289, + "loss": 0.815, + "step": 74 + }, + { + "epoch": 0.48, + "grad_norm": 0.32805129818689494, + "learning_rate": 0.00011141793136253986, + "loss": 0.7319, + "step": 75 + }, + { + "epoch": 0.4864, + "grad_norm": 0.29585326468195644, + "learning_rate": 0.000109348690758, + "loss": 0.7203, + "step": 76 + }, + { + "epoch": 0.4928, + "grad_norm": 0.3561659679909226, + "learning_rate": 0.0001072754036400944, + "loss": 0.7699, + "step": 77 + }, + { + "epoch": 0.4992, + "grad_norm": 0.3469249098416054, + "learning_rate": 0.00010519896741619803, + "loss": 0.8084, + "step": 78 + }, + { + "epoch": 0.5056, + "grad_norm": 0.33422921005367806, + "learning_rate": 0.00010312028085675391, + "loss": 0.7931, + "step": 79 + }, + { + "epoch": 0.512, + "grad_norm": 0.3503998377141623, + "learning_rate": 0.00010104024370624644, + "loss": 0.796, + "step": 80 + }, + { + "epoch": 0.5184, + "grad_norm": 0.3126441931642006, + "learning_rate": 9.895975629375359e-05, + "loss": 0.7468, + "step": 81 + }, + { + "epoch": 0.5248, + "grad_norm": 0.3021120067347973, + "learning_rate": 9.687971914324607e-05, + "loss": 0.7493, + "step": 82 + }, + { + "epoch": 0.5312, + "grad_norm": 0.3067566109741886, + "learning_rate": 9.480103258380198e-05, + "loss": 0.7414, + "step": 83 + }, + { + "epoch": 0.5376, + "grad_norm": 0.3094509650385033, + "learning_rate": 9.272459635990562e-05, + "loss": 0.7324, + "step": 84 + }, + { + "epoch": 0.544, + "grad_norm": 0.30257407103089773, + "learning_rate": 9.065130924199998e-05, + "loss": 0.7559, + "step": 85 + }, + { + "epoch": 0.5504, + "grad_norm": 0.3216531345523285, + "learning_rate": 8.858206863746018e-05, + "loss": 0.748, + "step": 86 + }, + { + "epoch": 0.5568, + "grad_norm": 0.30249740732137886, + "learning_rate": 8.651777020215712e-05, + "loss": 0.7437, + "step": 87 + }, + { + "epoch": 0.5632, + "grad_norm": 0.31341182206609874, + "learning_rate": 8.445930745277953e-05, + "loss": 0.6947, + "step": 88 + }, + { + "epoch": 0.5696, + "grad_norm": 0.3024434846916235, + "learning_rate": 8.240757138008149e-05, + "loss": 0.7259, + "step": 89 + }, + { + "epoch": 0.576, + "grad_norm": 0.3411376996056166, + "learning_rate": 8.036345006322359e-05, + "loss": 0.7815, + "step": 90 + }, + { + "epoch": 0.5824, + "grad_norm": 0.2856733234825206, + "learning_rate": 7.832782828537437e-05, + "loss": 0.7155, + "step": 91 + }, + { + "epoch": 0.5888, + "grad_norm": 0.33601917343194493, + "learning_rate": 7.630158715073813e-05, + "loss": 0.7818, + "step": 92 + }, + { + "epoch": 0.5952, + "grad_norm": 0.2918013505386562, + "learning_rate": 7.428560370317542e-05, + "loss": 0.709, + "step": 93 + }, + { + "epoch": 0.6016, + "grad_norm": 0.31526319175179096, + "learning_rate": 7.228075054658096e-05, + "loss": 0.7301, + "step": 94 + }, + { + "epoch": 0.608, + "grad_norm": 0.30250418306277116, + "learning_rate": 7.028789546718326e-05, + "loss": 0.7195, + "step": 95 + }, + { + "epoch": 0.6144, + "grad_norm": 0.2790662847069248, + "learning_rate": 6.830790105792973e-05, + "loss": 0.6778, + "step": 96 + }, + { + "epoch": 0.6208, + "grad_norm": 0.2998584844908892, + "learning_rate": 6.63416243451194e-05, + "loss": 0.7082, + "step": 97 + }, + { + "epoch": 0.6272, + "grad_norm": 0.29565362571693427, + "learning_rate": 6.43899164174453e-05, + "loss": 0.7342, + "step": 98 + }, + { + "epoch": 0.6336, + "grad_norm": 0.29865538765003863, + "learning_rate": 6.245362205760704e-05, + "loss": 0.7361, + "step": 99 + }, + { + "epoch": 0.64, + "grad_norm": 0.30332168702872603, + "learning_rate": 6.053357937665237e-05, + "loss": 0.7342, + "step": 100 + }, + { + "epoch": 0.6464, + "grad_norm": 0.29712882764596105, + "learning_rate": 5.863061945120719e-05, + "loss": 0.7177, + "step": 101 + }, + { + "epoch": 0.6528, + "grad_norm": 0.2929311235146598, + "learning_rate": 5.6745565963749925e-05, + "loss": 0.7374, + "step": 102 + }, + { + "epoch": 0.6592, + "grad_norm": 0.3118133634434965, + "learning_rate": 5.487923484608629e-05, + "loss": 0.744, + "step": 103 + }, + { + "epoch": 0.6656, + "grad_norm": 0.31646373104256637, + "learning_rate": 5.3032433926179395e-05, + "loss": 0.7162, + "step": 104 + }, + { + "epoch": 0.672, + "grad_norm": 0.3492955334143746, + "learning_rate": 5.1205962578487155e-05, + "loss": 0.7402, + "step": 105 + }, + { + "epoch": 0.6784, + "grad_norm": 0.29816146956270106, + "learning_rate": 4.940061137795876e-05, + "loss": 0.728, + "step": 106 + }, + { + "epoch": 0.6848, + "grad_norm": 0.32464952556140103, + "learning_rate": 4.761716175783989e-05, + "loss": 0.7358, + "step": 107 + }, + { + "epoch": 0.6912, + "grad_norm": 0.32711705811548675, + "learning_rate": 4.585638567143529e-05, + "loss": 0.7357, + "step": 108 + }, + { + "epoch": 0.6976, + "grad_norm": 0.2882275477535859, + "learning_rate": 4.411904525797408e-05, + "loss": 0.7364, + "step": 109 + }, + { + "epoch": 0.704, + "grad_norm": 0.2933869942149808, + "learning_rate": 4.240589251272342e-05, + "loss": 0.7941, + "step": 110 + }, + { + "epoch": 0.7104, + "grad_norm": 0.3031355662381024, + "learning_rate": 4.071766896149273e-05, + "loss": 0.7894, + "step": 111 + }, + { + "epoch": 0.7168, + "grad_norm": 0.3060693032435272, + "learning_rate": 3.9055105339669595e-05, + "loss": 0.7371, + "step": 112 + }, + { + "epoch": 0.7232, + "grad_norm": 0.3259781194694751, + "learning_rate": 3.741892127592625e-05, + "loss": 0.7029, + "step": 113 + }, + { + "epoch": 0.7296, + "grad_norm": 0.3702015020625932, + "learning_rate": 3.580982498073344e-05, + "loss": 0.7746, + "step": 114 + }, + { + "epoch": 0.736, + "grad_norm": 0.30249066864439494, + "learning_rate": 3.422851293981676e-05, + "loss": 0.7306, + "step": 115 + }, + { + "epoch": 0.7424, + "grad_norm": 0.2897777887282797, + "learning_rate": 3.2675669612687565e-05, + "loss": 0.7018, + "step": 116 + }, + { + "epoch": 0.7488, + "grad_norm": 0.3344142843455273, + "learning_rate": 3.115196713638e-05, + "loss": 0.8018, + "step": 117 + }, + { + "epoch": 0.7552, + "grad_norm": 0.29515807103280634, + "learning_rate": 2.9658065034520978e-05, + "loss": 0.6781, + "step": 118 + }, + { + "epoch": 0.7616, + "grad_norm": 0.3025305424101093, + "learning_rate": 2.8194609931860316e-05, + "loss": 0.7593, + "step": 119 + }, + { + "epoch": 0.768, + "grad_norm": 0.28223422497881434, + "learning_rate": 2.6762235274383772e-05, + "loss": 0.7145, + "step": 120 + }, + { + "epoch": 0.7744, + "grad_norm": 0.3066517795043625, + "learning_rate": 2.536156105513062e-05, + "loss": 0.6869, + "step": 121 + }, + { + "epoch": 0.7808, + "grad_norm": 0.30642475984381673, + "learning_rate": 2.399319354583418e-05, + "loss": 0.6967, + "step": 122 + }, + { + "epoch": 0.7872, + "grad_norm": 0.3006047300862463, + "learning_rate": 2.265772503450122e-05, + "loss": 0.6908, + "step": 123 + }, + { + "epoch": 0.7936, + "grad_norm": 0.2930985762959953, + "learning_rate": 2.1355733569044635e-05, + "loss": 0.7421, + "step": 124 + }, + { + "epoch": 0.8, + "grad_norm": 0.36699533414091595, + "learning_rate": 2.008778270707944e-05, + "loss": 0.7152, + "step": 125 + }, + { + "epoch": 0.8064, + "grad_norm": 0.3191184250838783, + "learning_rate": 1.8854421271990964e-05, + "loss": 0.7233, + "step": 126 + }, + { + "epoch": 0.8128, + "grad_norm": 0.2882949563889631, + "learning_rate": 1.7656183115380577e-05, + "loss": 0.6723, + "step": 127 + }, + { + "epoch": 0.8192, + "grad_norm": 0.2846618831877869, + "learning_rate": 1.649358688599191e-05, + "loss": 0.7624, + "step": 128 + }, + { + "epoch": 0.8256, + "grad_norm": 0.2859925434034348, + "learning_rate": 1.5367135805217458e-05, + "loss": 0.695, + "step": 129 + }, + { + "epoch": 0.832, + "grad_norm": 0.2939429745246576, + "learning_rate": 1.4277317449282834e-05, + "loss": 0.7305, + "step": 130 + }, + { + "epoch": 0.8384, + "grad_norm": 0.3265178745760517, + "learning_rate": 1.3224603538202929e-05, + "loss": 0.7318, + "step": 131 + }, + { + "epoch": 0.8448, + "grad_norm": 0.3091109426117566, + "learning_rate": 1.220944973160133e-05, + "loss": 0.6894, + "step": 132 + }, + { + "epoch": 0.8512, + "grad_norm": 0.2756556893678473, + "learning_rate": 1.1232295431481222e-05, + "loss": 0.7089, + "step": 133 + }, + { + "epoch": 0.8576, + "grad_norm": 0.28711391805261105, + "learning_rate": 1.0293563592033595e-05, + "loss": 0.7045, + "step": 134 + }, + { + "epoch": 0.864, + "grad_norm": 0.3018136761543984, + "learning_rate": 9.393660536564408e-06, + "loss": 0.725, + "step": 135 + }, + { + "epoch": 0.8704, + "grad_norm": 0.48785789730683393, + "learning_rate": 8.532975781620512e-06, + "loss": 0.7147, + "step": 136 + }, + { + "epoch": 0.8768, + "grad_norm": 0.2946491236531151, + "learning_rate": 7.711881868390291e-06, + "loss": 0.7513, + "step": 137 + }, + { + "epoch": 0.8832, + "grad_norm": 0.35417168372316066, + "learning_rate": 6.930734201451816e-06, + "loss": 0.7318, + "step": 138 + }, + { + "epoch": 0.8896, + "grad_norm": 0.3429059982297449, + "learning_rate": 6.189870894938587e-06, + "loss": 0.7576, + "step": 139 + }, + { + "epoch": 0.896, + "grad_norm": 0.28996922983926465, + "learning_rate": 5.489612626189245e-06, + "loss": 0.6904, + "step": 140 + }, + { + "epoch": 0.9024, + "grad_norm": 0.3139948030812815, + "learning_rate": 4.830262496944693e-06, + "loss": 0.7424, + "step": 141 + }, + { + "epoch": 0.9088, + "grad_norm": 0.3313436912829056, + "learning_rate": 4.21210590215273e-06, + "loss": 0.7348, + "step": 142 + }, + { + "epoch": 0.9152, + "grad_norm": 0.2862586930535613, + "learning_rate": 3.6354104064368566e-06, + "loss": 0.755, + "step": 143 + }, + { + "epoch": 0.9216, + "grad_norm": 0.3354881979829791, + "learning_rate": 3.100425628282899e-06, + "loss": 0.7147, + "step": 144 + }, + { + "epoch": 0.928, + "grad_norm": 0.2781947839944141, + "learning_rate": 2.607383131993424e-06, + "loss": 0.7277, + "step": 145 + }, + { + "epoch": 0.9344, + "grad_norm": 0.31691833779027606, + "learning_rate": 2.1564963274568027e-06, + "loss": 0.6999, + "step": 146 + }, + { + "epoch": 0.9408, + "grad_norm": 0.3340163070707816, + "learning_rate": 1.7479603777742938e-06, + "loss": 0.7372, + "step": 147 + }, + { + "epoch": 0.9472, + "grad_norm": 0.3170369724196319, + "learning_rate": 1.3819521147851123e-06, + "loss": 0.7801, + "step": 148 + }, + { + "epoch": 0.9536, + "grad_norm": 0.30594912415669967, + "learning_rate": 1.05862996252597e-06, + "loss": 0.702, + "step": 149 + }, + { + "epoch": 0.96, + "grad_norm": 0.28892681135303727, + "learning_rate": 7.781338686584927e-07, + "loss": 0.7478, + "step": 150 + }, + { + "epoch": 0.9664, + "grad_norm": 0.32463030584671543, + "learning_rate": 5.405852438937764e-07, + "loss": 0.7746, + "step": 151 + }, + { + "epoch": 0.9728, + "grad_norm": 0.37353097089776116, + "learning_rate": 3.4608690944071263e-07, + "loss": 0.771, + "step": 152 + }, + { + "epoch": 0.9792, + "grad_norm": 0.3437871535153094, + "learning_rate": 1.947230525005006e-07, + "loss": 0.7815, + "step": 153 + }, + { + "epoch": 0.9856, + "grad_norm": 0.3425422993987472, + "learning_rate": 8.655918982689581e-08, + "loss": 0.7599, + "step": 154 + }, + { + "epoch": 0.992, + "grad_norm": 0.2730176554215713, + "learning_rate": 2.164213936770576e-08, + "loss": 0.7038, + "step": 155 + }, + { + "epoch": 0.9984, + "grad_norm": 0.2825593416155209, + "learning_rate": 0.0, + "loss": 0.7707, + "step": 156 + }, + { + "epoch": 0.9984, + "step": 156, + "total_flos": 396278936240128.0, + "train_loss": 0.791970758101879, + "train_runtime": 4806.5232, + "train_samples_per_second": 1.04, + "train_steps_per_second": 0.032 + } + ], + "logging_steps": 1.0, + "max_steps": 156, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 396278936240128.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_1_epochs_1_GA_2_lora/README.md b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_1_epochs_1_GA_2_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_1_epochs_1_GA_2_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_1_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_1_epochs_1_GA_2_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d226730e32a62891250be7d5ae91602ca108cb02 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_1_epochs_1_GA_2_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "k_proj", + "v_proj", + "q_proj", + "o_proj", + "up_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..cb2be08dd2722e760e6af1fdf9d73005ea729bd4 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:139128bcd4e4cf0bd4fa30166f904da62fbd0bbcd8e0c6ca5a8c343433679245 +size 671150064 diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_1_epochs_1_GA_2_lora/config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_1_epochs_1_GA_2_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_1_epochs_1_GA_2_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..2f7ce6843e5d8686d1a2f7789e9524b2c9284534 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f28376e29671c5eb16c9c9ff151bb033cac16d4b7273c57016217ca38f0d5f5f +size 918507402 diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_1_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_1_epochs_1_GA_2_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..960fdcd74886c92021d74df11e31afce06029319 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_1_epochs_1_GA_2_lora/trainer_state.json @@ -0,0 +1,4417 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 625, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0016, + "grad_norm": 1.0078361387590205, + "learning_rate": 1.0526315789473684e-05, + "loss": 1.4793, + "step": 1 + }, + { + "epoch": 0.0032, + "grad_norm": 0.9951652887052043, + "learning_rate": 2.105263157894737e-05, + "loss": 1.3539, + "step": 2 + }, + { + "epoch": 0.0048, + "grad_norm": 0.896763252301588, + "learning_rate": 3.157894736842105e-05, + "loss": 1.4285, + "step": 3 + }, + { + "epoch": 0.0064, + "grad_norm": 0.8977678456462442, + "learning_rate": 4.210526315789474e-05, + "loss": 1.466, + "step": 4 + }, + { + "epoch": 0.008, + "grad_norm": 0.7792622012488023, + "learning_rate": 5.2631578947368424e-05, + "loss": 1.2075, + "step": 5 + }, + { + "epoch": 0.0096, + "grad_norm": 0.9015724813023276, + "learning_rate": 6.31578947368421e-05, + "loss": 1.2588, + "step": 6 + }, + { + "epoch": 0.0112, + "grad_norm": 0.9539912695680726, + "learning_rate": 7.368421052631579e-05, + "loss": 1.1511, + "step": 7 + }, + { + "epoch": 0.0128, + "grad_norm": 1.4709119751155775, + "learning_rate": 8.421052631578948e-05, + "loss": 1.133, + "step": 8 + }, + { + "epoch": 0.0144, + "grad_norm": 0.8781207054143169, + "learning_rate": 9.473684210526316e-05, + "loss": 1.0133, + "step": 9 + }, + { + "epoch": 0.016, + "grad_norm": 0.7745071179422164, + "learning_rate": 0.00010526315789473685, + "loss": 1.0139, + "step": 10 + }, + { + "epoch": 0.0176, + "grad_norm": 0.6668394118072065, + "learning_rate": 0.00011578947368421053, + "loss": 0.9199, + "step": 11 + }, + { + "epoch": 0.0192, + "grad_norm": 0.5593316156179626, + "learning_rate": 0.0001263157894736842, + "loss": 0.9649, + "step": 12 + }, + { + "epoch": 0.0208, + "grad_norm": 0.6349418219797407, + "learning_rate": 0.0001368421052631579, + "loss": 0.9399, + "step": 13 + }, + { + "epoch": 0.0224, + "grad_norm": 0.6020677206864543, + "learning_rate": 0.00014736842105263158, + "loss": 0.9946, + "step": 14 + }, + { + "epoch": 0.024, + "grad_norm": 0.5438695848971745, + "learning_rate": 0.00015789473684210527, + "loss": 0.93, + "step": 15 + }, + { + "epoch": 0.0256, + "grad_norm": 1.6406972588190947, + "learning_rate": 0.00016842105263157895, + "loss": 0.8839, + "step": 16 + }, + { + "epoch": 0.0272, + "grad_norm": 0.9803440490535646, + "learning_rate": 0.00017894736842105264, + "loss": 0.8615, + "step": 17 + }, + { + "epoch": 0.0288, + "grad_norm": 0.5593171972828492, + "learning_rate": 0.00018947368421052632, + "loss": 0.9058, + "step": 18 + }, + { + "epoch": 0.0304, + "grad_norm": 0.47453770192222816, + "learning_rate": 0.0002, + "loss": 0.8859, + "step": 19 + }, + { + "epoch": 0.032, + "grad_norm": 0.5909841084059502, + "learning_rate": 0.00019999865623437013, + "loss": 0.9839, + "step": 20 + }, + { + "epoch": 0.0336, + "grad_norm": 0.5533211598200701, + "learning_rate": 0.00019999462497359466, + "loss": 0.9142, + "step": 21 + }, + { + "epoch": 0.0352, + "grad_norm": 0.5058722622366572, + "learning_rate": 0.00019998790632601496, + "loss": 0.9088, + "step": 22 + }, + { + "epoch": 0.0368, + "grad_norm": 0.5227840384513545, + "learning_rate": 0.0001999785004721968, + "loss": 0.9214, + "step": 23 + }, + { + "epoch": 0.0384, + "grad_norm": 0.674085142832656, + "learning_rate": 0.00019996640766492543, + "loss": 0.9281, + "step": 24 + }, + { + "epoch": 0.04, + "grad_norm": 0.5157118294581174, + "learning_rate": 0.00019995162822919883, + "loss": 0.8217, + "step": 25 + }, + { + "epoch": 0.0416, + "grad_norm": 0.5218051567846262, + "learning_rate": 0.00019993416256221895, + "loss": 0.9299, + "step": 26 + }, + { + "epoch": 0.0432, + "grad_norm": 0.5422848174392554, + "learning_rate": 0.00019991401113338104, + "loss": 0.8486, + "step": 27 + }, + { + "epoch": 0.0448, + "grad_norm": 0.5486004540317999, + "learning_rate": 0.00019989117448426108, + "loss": 0.8745, + "step": 28 + }, + { + "epoch": 0.0464, + "grad_norm": 0.5662083598242861, + "learning_rate": 0.00019986565322860115, + "loss": 0.9145, + "step": 29 + }, + { + "epoch": 0.048, + "grad_norm": 0.5381570540759785, + "learning_rate": 0.00019983744805229296, + "loss": 0.908, + "step": 30 + }, + { + "epoch": 0.0496, + "grad_norm": 0.5211311574621448, + "learning_rate": 0.00019980655971335945, + "loss": 0.853, + "step": 31 + }, + { + "epoch": 0.0512, + "grad_norm": 0.6184615559840395, + "learning_rate": 0.00019977298904193437, + "loss": 0.9835, + "step": 32 + }, + { + "epoch": 0.0528, + "grad_norm": 0.5179203492136157, + "learning_rate": 0.00019973673694024, + "loss": 0.8231, + "step": 33 + }, + { + "epoch": 0.0544, + "grad_norm": 0.5422366829439323, + "learning_rate": 0.00019969780438256293, + "loss": 0.9447, + "step": 34 + }, + { + "epoch": 0.056, + "grad_norm": 0.4753478694892907, + "learning_rate": 0.0001996561924152278, + "loss": 0.8857, + "step": 35 + }, + { + "epoch": 0.0576, + "grad_norm": 0.4920573954678667, + "learning_rate": 0.0001996119021565693, + "loss": 0.8765, + "step": 36 + }, + { + "epoch": 0.0592, + "grad_norm": 0.4963933312524518, + "learning_rate": 0.0001995649347969019, + "loss": 0.8807, + "step": 37 + }, + { + "epoch": 0.0608, + "grad_norm": 0.5048401993711281, + "learning_rate": 0.00019951529159848805, + "loss": 0.8142, + "step": 38 + }, + { + "epoch": 0.0624, + "grad_norm": 0.4966672780201818, + "learning_rate": 0.00019946297389550433, + "loss": 0.9338, + "step": 39 + }, + { + "epoch": 0.064, + "grad_norm": 0.49280895715078565, + "learning_rate": 0.00019940798309400526, + "loss": 0.8611, + "step": 40 + }, + { + "epoch": 0.0656, + "grad_norm": 0.5812776144179074, + "learning_rate": 0.0001993503206718859, + "loss": 0.9696, + "step": 41 + }, + { + "epoch": 0.0672, + "grad_norm": 0.4418823813577595, + "learning_rate": 0.00019928998817884182, + "loss": 0.7965, + "step": 42 + }, + { + "epoch": 0.0688, + "grad_norm": 0.6151833185403617, + "learning_rate": 0.00019922698723632767, + "loss": 0.9346, + "step": 43 + }, + { + "epoch": 0.0704, + "grad_norm": 0.6356997114918449, + "learning_rate": 0.00019916131953751342, + "loss": 0.83, + "step": 44 + }, + { + "epoch": 0.072, + "grad_norm": 0.5937124928265956, + "learning_rate": 0.00019909298684723904, + "loss": 0.8051, + "step": 45 + }, + { + "epoch": 0.0736, + "grad_norm": 0.47768756181848726, + "learning_rate": 0.00019902199100196697, + "loss": 0.822, + "step": 46 + }, + { + "epoch": 0.0752, + "grad_norm": 0.6140160811543658, + "learning_rate": 0.00019894833390973266, + "loss": 0.8776, + "step": 47 + }, + { + "epoch": 0.0768, + "grad_norm": 0.516801826502444, + "learning_rate": 0.00019887201755009357, + "loss": 0.7676, + "step": 48 + }, + { + "epoch": 0.0784, + "grad_norm": 0.6327146305470971, + "learning_rate": 0.0001987930439740757, + "loss": 0.9702, + "step": 49 + }, + { + "epoch": 0.08, + "grad_norm": 0.499767257417617, + "learning_rate": 0.00019871141530411853, + "loss": 0.9066, + "step": 50 + }, + { + "epoch": 0.0816, + "grad_norm": 0.4924254887419153, + "learning_rate": 0.0001986271337340182, + "loss": 0.8891, + "step": 51 + }, + { + "epoch": 0.0832, + "grad_norm": 0.4290292230420388, + "learning_rate": 0.00019854020152886814, + "loss": 0.7612, + "step": 52 + }, + { + "epoch": 0.0848, + "grad_norm": 0.4408267627156317, + "learning_rate": 0.0001984506210249986, + "loss": 0.8049, + "step": 53 + }, + { + "epoch": 0.0864, + "grad_norm": 0.5494028698920849, + "learning_rate": 0.00019835839462991361, + "loss": 0.9337, + "step": 54 + }, + { + "epoch": 0.088, + "grad_norm": 0.5377528312119375, + "learning_rate": 0.00019826352482222638, + "loss": 0.9272, + "step": 55 + }, + { + "epoch": 0.0896, + "grad_norm": 0.4653086051429184, + "learning_rate": 0.00019816601415159263, + "loss": 0.8384, + "step": 56 + }, + { + "epoch": 0.0912, + "grad_norm": 0.47802147106541004, + "learning_rate": 0.0001980658652386421, + "loss": 0.8446, + "step": 57 + }, + { + "epoch": 0.0928, + "grad_norm": 0.43698095251762437, + "learning_rate": 0.00019796308077490817, + "loss": 0.7921, + "step": 58 + }, + { + "epoch": 0.0944, + "grad_norm": 0.496409999959844, + "learning_rate": 0.00019785766352275542, + "loss": 0.8973, + "step": 59 + }, + { + "epoch": 0.096, + "grad_norm": 0.5212049672896456, + "learning_rate": 0.00019774961631530545, + "loss": 0.8597, + "step": 60 + }, + { + "epoch": 0.0976, + "grad_norm": 0.5489720426969512, + "learning_rate": 0.00019763894205636072, + "loss": 0.8903, + "step": 61 + }, + { + "epoch": 0.0992, + "grad_norm": 0.4622687412170286, + "learning_rate": 0.00019752564372032657, + "loss": 0.8887, + "step": 62 + }, + { + "epoch": 0.1008, + "grad_norm": 0.47706889510794903, + "learning_rate": 0.00019740972435213115, + "loss": 0.8442, + "step": 63 + }, + { + "epoch": 0.1024, + "grad_norm": 0.5376371336217138, + "learning_rate": 0.00019729118706714375, + "loss": 0.8536, + "step": 64 + }, + { + "epoch": 0.104, + "grad_norm": 0.5021900107320479, + "learning_rate": 0.00019717003505109095, + "loss": 0.8365, + "step": 65 + }, + { + "epoch": 0.1056, + "grad_norm": 0.49048052824057126, + "learning_rate": 0.00019704627155997108, + "loss": 0.8106, + "step": 66 + }, + { + "epoch": 0.1072, + "grad_norm": 0.5047693137325724, + "learning_rate": 0.00019691989991996663, + "loss": 0.8552, + "step": 67 + }, + { + "epoch": 0.1088, + "grad_norm": 0.5085475864936128, + "learning_rate": 0.0001967909235273549, + "loss": 0.841, + "step": 68 + }, + { + "epoch": 0.1104, + "grad_norm": 0.5094769407721215, + "learning_rate": 0.00019665934584841682, + "loss": 0.8844, + "step": 69 + }, + { + "epoch": 0.112, + "grad_norm": 0.5874346325784344, + "learning_rate": 0.00019652517041934356, + "loss": 0.9541, + "step": 70 + }, + { + "epoch": 0.1136, + "grad_norm": 0.5113285708843512, + "learning_rate": 0.00019638840084614182, + "loss": 0.8147, + "step": 71 + }, + { + "epoch": 0.1152, + "grad_norm": 0.6470490657139865, + "learning_rate": 0.00019624904080453655, + "loss": 0.9404, + "step": 72 + }, + { + "epoch": 0.1168, + "grad_norm": 0.444158830017984, + "learning_rate": 0.00019610709403987246, + "loss": 0.7675, + "step": 73 + }, + { + "epoch": 0.1184, + "grad_norm": 0.42718865954168317, + "learning_rate": 0.00019596256436701324, + "loss": 0.8347, + "step": 74 + }, + { + "epoch": 0.12, + "grad_norm": 0.49382933622072317, + "learning_rate": 0.000195815455670239, + "loss": 0.8569, + "step": 75 + }, + { + "epoch": 0.1216, + "grad_norm": 0.44584676514435373, + "learning_rate": 0.00019566577190314197, + "loss": 0.7556, + "step": 76 + }, + { + "epoch": 0.1232, + "grad_norm": 0.4583298153167484, + "learning_rate": 0.0001955135170885202, + "loss": 0.8167, + "step": 77 + }, + { + "epoch": 0.1248, + "grad_norm": 0.4487175680774925, + "learning_rate": 0.00019535869531826937, + "loss": 0.7373, + "step": 78 + }, + { + "epoch": 0.1264, + "grad_norm": 0.5304376805035587, + "learning_rate": 0.00019520131075327298, + "loss": 0.7214, + "step": 79 + }, + { + "epoch": 0.128, + "grad_norm": 0.49477371360533506, + "learning_rate": 0.00019504136762329047, + "loss": 0.8556, + "step": 80 + }, + { + "epoch": 0.1296, + "grad_norm": 0.47898340285063606, + "learning_rate": 0.00019487887022684336, + "loss": 0.7639, + "step": 81 + }, + { + "epoch": 0.1312, + "grad_norm": 0.47588171820515307, + "learning_rate": 0.00019471382293110003, + "loss": 0.8284, + "step": 82 + }, + { + "epoch": 0.1328, + "grad_norm": 0.5177341087333374, + "learning_rate": 0.00019454623017175812, + "loss": 0.7955, + "step": 83 + }, + { + "epoch": 0.1344, + "grad_norm": 0.4972607753633679, + "learning_rate": 0.00019437609645292546, + "loss": 0.8604, + "step": 84 + }, + { + "epoch": 0.136, + "grad_norm": 0.433135078554396, + "learning_rate": 0.0001942034263469989, + "loss": 0.7556, + "step": 85 + }, + { + "epoch": 0.1376, + "grad_norm": 0.43332045728202956, + "learning_rate": 0.00019402822449454153, + "loss": 0.7386, + "step": 86 + }, + { + "epoch": 0.1392, + "grad_norm": 1.3570392266410942, + "learning_rate": 0.00019385049560415794, + "loss": 0.8819, + "step": 87 + }, + { + "epoch": 0.1408, + "grad_norm": 0.5174999688865969, + "learning_rate": 0.00019367024445236754, + "loss": 0.8708, + "step": 88 + }, + { + "epoch": 0.1424, + "grad_norm": 0.5186816574437936, + "learning_rate": 0.00019348747588347637, + "loss": 0.7533, + "step": 89 + }, + { + "epoch": 0.144, + "grad_norm": 0.4777235971030546, + "learning_rate": 0.00019330219480944694, + "loss": 0.7657, + "step": 90 + }, + { + "epoch": 0.1456, + "grad_norm": 0.4589332179072561, + "learning_rate": 0.00019311440620976597, + "loss": 0.7995, + "step": 91 + }, + { + "epoch": 0.1472, + "grad_norm": 0.4365452209568981, + "learning_rate": 0.0001929241151313108, + "loss": 0.8211, + "step": 92 + }, + { + "epoch": 0.1488, + "grad_norm": 0.5361492053581633, + "learning_rate": 0.00019273132668821364, + "loss": 0.7863, + "step": 93 + }, + { + "epoch": 0.1504, + "grad_norm": 0.5774950306753667, + "learning_rate": 0.00019253604606172417, + "loss": 0.7324, + "step": 94 + }, + { + "epoch": 0.152, + "grad_norm": 0.5311524204034596, + "learning_rate": 0.00019233827850007027, + "loss": 0.7907, + "step": 95 + }, + { + "epoch": 0.1536, + "grad_norm": 0.5211202018919038, + "learning_rate": 0.00019213802931831696, + "loss": 0.8306, + "step": 96 + }, + { + "epoch": 0.1552, + "grad_norm": 0.4493531014468885, + "learning_rate": 0.00019193530389822363, + "loss": 0.7914, + "step": 97 + }, + { + "epoch": 0.1568, + "grad_norm": 0.489894272189929, + "learning_rate": 0.00019173010768809933, + "loss": 0.83, + "step": 98 + }, + { + "epoch": 0.1584, + "grad_norm": 0.5084569157516184, + "learning_rate": 0.0001915224462026563, + "loss": 0.8685, + "step": 99 + }, + { + "epoch": 0.16, + "grad_norm": 0.4402306624297196, + "learning_rate": 0.00019131232502286188, + "loss": 0.8289, + "step": 100 + }, + { + "epoch": 0.1616, + "grad_norm": 0.44943832297185154, + "learning_rate": 0.0001910997497957885, + "loss": 0.8216, + "step": 101 + }, + { + "epoch": 0.1632, + "grad_norm": 0.5389533019974405, + "learning_rate": 0.00019088472623446183, + "loss": 0.945, + "step": 102 + }, + { + "epoch": 0.1648, + "grad_norm": 0.4105928618047756, + "learning_rate": 0.00019066726011770726, + "loss": 0.7293, + "step": 103 + }, + { + "epoch": 0.1664, + "grad_norm": 0.4966495166041549, + "learning_rate": 0.0001904473572899947, + "loss": 0.7914, + "step": 104 + }, + { + "epoch": 0.168, + "grad_norm": 0.5620926313915939, + "learning_rate": 0.00019022502366128135, + "loss": 0.8699, + "step": 105 + }, + { + "epoch": 0.1696, + "grad_norm": 0.4998050408498213, + "learning_rate": 0.00019000026520685302, + "loss": 0.7713, + "step": 106 + }, + { + "epoch": 0.1712, + "grad_norm": 0.4356477744811087, + "learning_rate": 0.0001897730879671634, + "loss": 0.8587, + "step": 107 + }, + { + "epoch": 0.1728, + "grad_norm": 0.5255567615971235, + "learning_rate": 0.00018954349804767184, + "loss": 0.8821, + "step": 108 + }, + { + "epoch": 0.1744, + "grad_norm": 0.4979809254631378, + "learning_rate": 0.00018931150161867916, + "loss": 0.776, + "step": 109 + }, + { + "epoch": 0.176, + "grad_norm": 0.4362326947314551, + "learning_rate": 0.00018907710491516199, + "loss": 0.8105, + "step": 110 + }, + { + "epoch": 0.1776, + "grad_norm": 0.4893990121965114, + "learning_rate": 0.0001888403142366049, + "loss": 0.8302, + "step": 111 + }, + { + "epoch": 0.1792, + "grad_norm": 0.41384047619369413, + "learning_rate": 0.00018860113594683148, + "loss": 0.7777, + "step": 112 + }, + { + "epoch": 0.1808, + "grad_norm": 0.5179890724609715, + "learning_rate": 0.00018835957647383303, + "loss": 0.8515, + "step": 113 + }, + { + "epoch": 0.1824, + "grad_norm": 0.4749814963541095, + "learning_rate": 0.00018811564230959588, + "loss": 0.8121, + "step": 114 + }, + { + "epoch": 0.184, + "grad_norm": 0.4605472390929984, + "learning_rate": 0.00018786934000992688, + "loss": 0.7584, + "step": 115 + }, + { + "epoch": 0.1856, + "grad_norm": 0.4817745217154679, + "learning_rate": 0.00018762067619427746, + "loss": 0.8047, + "step": 116 + }, + { + "epoch": 0.1872, + "grad_norm": 0.513266584644474, + "learning_rate": 0.00018736965754556528, + "loss": 0.7862, + "step": 117 + }, + { + "epoch": 0.1888, + "grad_norm": 0.49942743495151193, + "learning_rate": 0.00018711629080999504, + "loss": 0.8251, + "step": 118 + }, + { + "epoch": 0.1904, + "grad_norm": 0.4486948636118088, + "learning_rate": 0.00018686058279687698, + "loss": 0.8265, + "step": 119 + }, + { + "epoch": 0.192, + "grad_norm": 0.4441154164066126, + "learning_rate": 0.00018660254037844388, + "loss": 0.8011, + "step": 120 + }, + { + "epoch": 0.1936, + "grad_norm": 0.4885346589356598, + "learning_rate": 0.00018634217048966637, + "loss": 0.7148, + "step": 121 + }, + { + "epoch": 0.1952, + "grad_norm": 0.5136035090126265, + "learning_rate": 0.0001860794801280666, + "loss": 0.8426, + "step": 122 + }, + { + "epoch": 0.1968, + "grad_norm": 0.5877167718087726, + "learning_rate": 0.0001858144763535302, + "loss": 0.8982, + "step": 123 + }, + { + "epoch": 0.1984, + "grad_norm": 0.4427950970842086, + "learning_rate": 0.0001855471662881164, + "loss": 0.7747, + "step": 124 + }, + { + "epoch": 0.2, + "grad_norm": 0.49655525453326005, + "learning_rate": 0.00018527755711586678, + "loss": 0.7655, + "step": 125 + }, + { + "epoch": 0.2016, + "grad_norm": 0.5534762211200998, + "learning_rate": 0.00018500565608261214, + "loss": 0.8763, + "step": 126 + }, + { + "epoch": 0.2032, + "grad_norm": 0.5530009730644346, + "learning_rate": 0.00018473147049577774, + "loss": 0.8714, + "step": 127 + }, + { + "epoch": 0.2048, + "grad_norm": 0.5657937559041026, + "learning_rate": 0.00018445500772418697, + "loss": 0.9082, + "step": 128 + }, + { + "epoch": 0.2064, + "grad_norm": 0.48622536048605935, + "learning_rate": 0.00018417627519786315, + "loss": 0.8297, + "step": 129 + }, + { + "epoch": 0.208, + "grad_norm": 0.5158794849276632, + "learning_rate": 0.00018389528040783012, + "loss": 0.8475, + "step": 130 + }, + { + "epoch": 0.2096, + "grad_norm": 0.5045881311097942, + "learning_rate": 0.00018361203090591071, + "loss": 0.8018, + "step": 131 + }, + { + "epoch": 0.2112, + "grad_norm": 0.44554526837359076, + "learning_rate": 0.00018332653430452376, + "loss": 0.7265, + "step": 132 + }, + { + "epoch": 0.2128, + "grad_norm": 0.47940295287330836, + "learning_rate": 0.00018303879827647975, + "loss": 0.9335, + "step": 133 + }, + { + "epoch": 0.2144, + "grad_norm": 0.6038453663214071, + "learning_rate": 0.00018274883055477436, + "loss": 0.849, + "step": 134 + }, + { + "epoch": 0.216, + "grad_norm": 0.44736082282588496, + "learning_rate": 0.00018245663893238075, + "loss": 0.7963, + "step": 135 + }, + { + "epoch": 0.2176, + "grad_norm": 0.4949858443463578, + "learning_rate": 0.00018216223126204007, + "loss": 0.8341, + "step": 136 + }, + { + "epoch": 0.2192, + "grad_norm": 0.4748678665983683, + "learning_rate": 0.00018186561545605054, + "loss": 0.8569, + "step": 137 + }, + { + "epoch": 0.2208, + "grad_norm": 0.4682317494097038, + "learning_rate": 0.00018156679948605467, + "loss": 0.7801, + "step": 138 + }, + { + "epoch": 0.2224, + "grad_norm": 0.49593152677518926, + "learning_rate": 0.00018126579138282503, + "loss": 0.7977, + "step": 139 + }, + { + "epoch": 0.224, + "grad_norm": 0.5112040698042739, + "learning_rate": 0.0001809625992360485, + "loss": 0.8287, + "step": 140 + }, + { + "epoch": 0.2256, + "grad_norm": 0.5144222579666844, + "learning_rate": 0.00018065723119410884, + "loss": 0.8476, + "step": 141 + }, + { + "epoch": 0.2272, + "grad_norm": 0.4488921177561266, + "learning_rate": 0.00018034969546386757, + "loss": 0.786, + "step": 142 + }, + { + "epoch": 0.2288, + "grad_norm": 0.43113259073368637, + "learning_rate": 0.0001800400003104436, + "loss": 0.7908, + "step": 143 + }, + { + "epoch": 0.2304, + "grad_norm": 0.46919227137644687, + "learning_rate": 0.00017972815405699103, + "loss": 0.843, + "step": 144 + }, + { + "epoch": 0.232, + "grad_norm": 0.4857406827968805, + "learning_rate": 0.00017941416508447536, + "loss": 0.8096, + "step": 145 + }, + { + "epoch": 0.2336, + "grad_norm": 0.4718601050025545, + "learning_rate": 0.0001790980418314484, + "loss": 0.8514, + "step": 146 + }, + { + "epoch": 0.2352, + "grad_norm": 0.5026802643275337, + "learning_rate": 0.00017877979279382135, + "loss": 0.8073, + "step": 147 + }, + { + "epoch": 0.2368, + "grad_norm": 0.4380565959244541, + "learning_rate": 0.0001784594265246366, + "loss": 0.7799, + "step": 148 + }, + { + "epoch": 0.2384, + "grad_norm": 0.40950675857094365, + "learning_rate": 0.0001781369516338378, + "loss": 0.7193, + "step": 149 + }, + { + "epoch": 0.24, + "grad_norm": 0.5496617762158832, + "learning_rate": 0.00017781237678803847, + "loss": 0.8451, + "step": 150 + }, + { + "epoch": 0.2416, + "grad_norm": 0.44702446743165664, + "learning_rate": 0.000177485710710289, + "loss": 0.7141, + "step": 151 + }, + { + "epoch": 0.2432, + "grad_norm": 0.4148928347211178, + "learning_rate": 0.00017715696217984235, + "loss": 0.7337, + "step": 152 + }, + { + "epoch": 0.2448, + "grad_norm": 0.508432249555421, + "learning_rate": 0.00017682614003191807, + "loss": 0.8465, + "step": 153 + }, + { + "epoch": 0.2464, + "grad_norm": 0.430962234328245, + "learning_rate": 0.00017649325315746478, + "loss": 0.7478, + "step": 154 + }, + { + "epoch": 0.248, + "grad_norm": 0.44406799965662086, + "learning_rate": 0.0001761583105029213, + "loss": 0.843, + "step": 155 + }, + { + "epoch": 0.2496, + "grad_norm": 0.4534186344976709, + "learning_rate": 0.00017582132106997616, + "loss": 0.7591, + "step": 156 + }, + { + "epoch": 0.2512, + "grad_norm": 0.38372442007494273, + "learning_rate": 0.00017548229391532572, + "loss": 0.7459, + "step": 157 + }, + { + "epoch": 0.2528, + "grad_norm": 0.45382619962872733, + "learning_rate": 0.00017514123815043074, + "loss": 0.7637, + "step": 158 + }, + { + "epoch": 0.2544, + "grad_norm": 0.6556052367907826, + "learning_rate": 0.00017479816294127152, + "loss": 0.7904, + "step": 159 + }, + { + "epoch": 0.256, + "grad_norm": 0.4755089731315473, + "learning_rate": 0.0001744530775081015, + "loss": 0.7667, + "step": 160 + }, + { + "epoch": 0.2576, + "grad_norm": 0.5616630620433459, + "learning_rate": 0.0001741059911251997, + "loss": 0.9078, + "step": 161 + }, + { + "epoch": 0.2592, + "grad_norm": 0.47033499421903646, + "learning_rate": 0.000173756913120621, + "loss": 0.8548, + "step": 162 + }, + { + "epoch": 0.2608, + "grad_norm": 0.5037243767448392, + "learning_rate": 0.00017340585287594604, + "loss": 0.9103, + "step": 163 + }, + { + "epoch": 0.2624, + "grad_norm": 0.4203751041485349, + "learning_rate": 0.0001730528198260285, + "loss": 0.7424, + "step": 164 + }, + { + "epoch": 0.264, + "grad_norm": 0.411831853489757, + "learning_rate": 0.00017269782345874203, + "loss": 0.7919, + "step": 165 + }, + { + "epoch": 0.2656, + "grad_norm": 0.5326554379731826, + "learning_rate": 0.00017234087331472497, + "loss": 0.8259, + "step": 166 + }, + { + "epoch": 0.2672, + "grad_norm": 0.4403852288388378, + "learning_rate": 0.00017198197898712404, + "loss": 0.763, + "step": 167 + }, + { + "epoch": 0.2688, + "grad_norm": 0.5821163850741805, + "learning_rate": 0.00017162115012133643, + "loss": 0.9002, + "step": 168 + }, + { + "epoch": 0.2704, + "grad_norm": 0.591727532299676, + "learning_rate": 0.00017125839641475072, + "loss": 0.8832, + "step": 169 + }, + { + "epoch": 0.272, + "grad_norm": 0.46804647996398024, + "learning_rate": 0.00017089372761648616, + "loss": 0.7929, + "step": 170 + }, + { + "epoch": 0.2736, + "grad_norm": 0.42203323854415953, + "learning_rate": 0.00017052715352713075, + "loss": 0.733, + "step": 171 + }, + { + "epoch": 0.2752, + "grad_norm": 0.46752128447350355, + "learning_rate": 0.00017015868399847768, + "loss": 0.7937, + "step": 172 + }, + { + "epoch": 0.2768, + "grad_norm": 0.46846317463295867, + "learning_rate": 0.00016978832893326074, + "loss": 0.8266, + "step": 173 + }, + { + "epoch": 0.2784, + "grad_norm": 0.5263210553238438, + "learning_rate": 0.00016941609828488807, + "loss": 0.7909, + "step": 174 + }, + { + "epoch": 0.28, + "grad_norm": 0.47749438944518474, + "learning_rate": 0.0001690420020571747, + "loss": 0.7531, + "step": 175 + }, + { + "epoch": 0.2816, + "grad_norm": 0.5645333688308465, + "learning_rate": 0.0001686660503040737, + "loss": 0.8229, + "step": 176 + }, + { + "epoch": 0.2832, + "grad_norm": 0.446871568864066, + "learning_rate": 0.00016828825312940592, + "loss": 0.7403, + "step": 177 + }, + { + "epoch": 0.2848, + "grad_norm": 0.5520991124359584, + "learning_rate": 0.0001679086206865886, + "loss": 0.812, + "step": 178 + }, + { + "epoch": 0.2864, + "grad_norm": 0.4963547298286559, + "learning_rate": 0.00016752716317836229, + "loss": 0.8204, + "step": 179 + }, + { + "epoch": 0.288, + "grad_norm": 0.474444324347614, + "learning_rate": 0.0001671438908565167, + "loss": 0.8623, + "step": 180 + }, + { + "epoch": 0.2896, + "grad_norm": 0.4738044014485687, + "learning_rate": 0.00016675881402161536, + "loss": 0.715, + "step": 181 + }, + { + "epoch": 0.2912, + "grad_norm": 0.6103325138954283, + "learning_rate": 0.0001663719430227186, + "loss": 0.8508, + "step": 182 + }, + { + "epoch": 0.2928, + "grad_norm": 0.5647299154567543, + "learning_rate": 0.00016598328825710533, + "loss": 0.8472, + "step": 183 + }, + { + "epoch": 0.2944, + "grad_norm": 0.41521843096999544, + "learning_rate": 0.000165592860169994, + "loss": 0.736, + "step": 184 + }, + { + "epoch": 0.296, + "grad_norm": 0.4813369439604522, + "learning_rate": 0.00016520066925426144, + "loss": 0.7744, + "step": 185 + }, + { + "epoch": 0.2976, + "grad_norm": 0.4656646536960523, + "learning_rate": 0.0001648067260501611, + "loss": 0.8342, + "step": 186 + }, + { + "epoch": 0.2992, + "grad_norm": 0.44361921983143243, + "learning_rate": 0.0001644110411450398, + "loss": 0.8086, + "step": 187 + }, + { + "epoch": 0.3008, + "grad_norm": 0.4105783992742901, + "learning_rate": 0.00016401362517305296, + "loss": 0.767, + "step": 188 + }, + { + "epoch": 0.3024, + "grad_norm": 0.5209585768818658, + "learning_rate": 0.00016361448881487914, + "loss": 0.7964, + "step": 189 + }, + { + "epoch": 0.304, + "grad_norm": 0.507362190210602, + "learning_rate": 0.00016321364279743266, + "loss": 0.8028, + "step": 190 + }, + { + "epoch": 0.3056, + "grad_norm": 0.49789901798638553, + "learning_rate": 0.0001628110978935756, + "loss": 0.8082, + "step": 191 + }, + { + "epoch": 0.3072, + "grad_norm": 0.46776010139530455, + "learning_rate": 0.00016240686492182804, + "loss": 0.7808, + "step": 192 + }, + { + "epoch": 0.3088, + "grad_norm": 0.43939402349934514, + "learning_rate": 0.00016200095474607753, + "loss": 0.7466, + "step": 193 + }, + { + "epoch": 0.3104, + "grad_norm": 0.45520971799451754, + "learning_rate": 0.00016159337827528685, + "loss": 0.8107, + "step": 194 + }, + { + "epoch": 0.312, + "grad_norm": 0.45900953614309165, + "learning_rate": 0.0001611841464632011, + "loss": 0.7699, + "step": 195 + }, + { + "epoch": 0.3136, + "grad_norm": 0.43840709703169906, + "learning_rate": 0.0001607732703080532, + "loss": 0.7479, + "step": 196 + }, + { + "epoch": 0.3152, + "grad_norm": 0.43202304168661726, + "learning_rate": 0.00016036076085226814, + "loss": 0.6823, + "step": 197 + }, + { + "epoch": 0.3168, + "grad_norm": 0.43563508758389163, + "learning_rate": 0.0001599466291821666, + "loss": 0.8153, + "step": 198 + }, + { + "epoch": 0.3184, + "grad_norm": 0.448063176823589, + "learning_rate": 0.0001595308864276666, + "loss": 0.7478, + "step": 199 + }, + { + "epoch": 0.32, + "grad_norm": 0.48558137728839135, + "learning_rate": 0.0001591135437619847, + "loss": 0.7288, + "step": 200 + }, + { + "epoch": 0.3216, + "grad_norm": 0.508285060888712, + "learning_rate": 0.0001586946124013354, + "loss": 0.8239, + "step": 201 + }, + { + "epoch": 0.3232, + "grad_norm": 0.40613474777069003, + "learning_rate": 0.0001582741036046301, + "loss": 0.7125, + "step": 202 + }, + { + "epoch": 0.3248, + "grad_norm": 0.3996910851018506, + "learning_rate": 0.00015785202867317407, + "loss": 0.7347, + "step": 203 + }, + { + "epoch": 0.3264, + "grad_norm": 0.43747574374571657, + "learning_rate": 0.00015742839895036305, + "loss": 0.6955, + "step": 204 + }, + { + "epoch": 0.328, + "grad_norm": 0.5165934739549206, + "learning_rate": 0.00015700322582137827, + "loss": 0.8473, + "step": 205 + }, + { + "epoch": 0.3296, + "grad_norm": 0.3886022920700298, + "learning_rate": 0.0001565765207128805, + "loss": 0.7759, + "step": 206 + }, + { + "epoch": 0.3312, + "grad_norm": 0.3957656254609609, + "learning_rate": 0.0001561482950927029, + "loss": 0.697, + "step": 207 + }, + { + "epoch": 0.3328, + "grad_norm": 0.4484079810056674, + "learning_rate": 0.00015571856046954285, + "loss": 0.7516, + "step": 208 + }, + { + "epoch": 0.3344, + "grad_norm": 0.4529369609607294, + "learning_rate": 0.00015528732839265272, + "loss": 0.7369, + "step": 209 + }, + { + "epoch": 0.336, + "grad_norm": 0.5335332069111763, + "learning_rate": 0.0001548546104515294, + "loss": 0.8667, + "step": 210 + }, + { + "epoch": 0.3376, + "grad_norm": 0.5285752487579878, + "learning_rate": 0.00015442041827560274, + "loss": 0.8172, + "step": 211 + }, + { + "epoch": 0.3392, + "grad_norm": 0.3969427948794199, + "learning_rate": 0.00015398476353392323, + "loss": 0.7163, + "step": 212 + }, + { + "epoch": 0.3408, + "grad_norm": 0.4410382415479905, + "learning_rate": 0.00015354765793484834, + "loss": 0.7776, + "step": 213 + }, + { + "epoch": 0.3424, + "grad_norm": 0.4159115145082527, + "learning_rate": 0.00015310911322572753, + "loss": 0.7557, + "step": 214 + }, + { + "epoch": 0.344, + "grad_norm": 0.4742973139852936, + "learning_rate": 0.000152669141192587, + "loss": 0.7229, + "step": 215 + }, + { + "epoch": 0.3456, + "grad_norm": 0.5216060088398573, + "learning_rate": 0.00015222775365981273, + "loss": 0.8442, + "step": 216 + }, + { + "epoch": 0.3472, + "grad_norm": 0.43771511199061625, + "learning_rate": 0.00015178496248983254, + "loss": 0.7495, + "step": 217 + }, + { + "epoch": 0.3488, + "grad_norm": 0.572707160140616, + "learning_rate": 0.00015134077958279765, + "loss": 0.8126, + "step": 218 + }, + { + "epoch": 0.3504, + "grad_norm": 0.5544836753126516, + "learning_rate": 0.00015089521687626243, + "loss": 0.8129, + "step": 219 + }, + { + "epoch": 0.352, + "grad_norm": 0.4532086857255862, + "learning_rate": 0.000150448286344864, + "loss": 0.8037, + "step": 220 + }, + { + "epoch": 0.3536, + "grad_norm": 0.4918849781329419, + "learning_rate": 0.00015000000000000001, + "loss": 0.82, + "step": 221 + }, + { + "epoch": 0.3552, + "grad_norm": 0.41644324342222866, + "learning_rate": 0.00014955036988950618, + "loss": 0.8029, + "step": 222 + }, + { + "epoch": 0.3568, + "grad_norm": 0.3963250611740579, + "learning_rate": 0.00014909940809733222, + "loss": 0.7718, + "step": 223 + }, + { + "epoch": 0.3584, + "grad_norm": 0.38606317904676285, + "learning_rate": 0.00014864712674321734, + "loss": 0.707, + "step": 224 + }, + { + "epoch": 0.36, + "grad_norm": 0.44871910710492474, + "learning_rate": 0.00014819353798236427, + "loss": 0.802, + "step": 225 + }, + { + "epoch": 0.3616, + "grad_norm": 0.42275304153937604, + "learning_rate": 0.00014773865400511272, + "loss": 0.7547, + "step": 226 + }, + { + "epoch": 0.3632, + "grad_norm": 0.5053291473297197, + "learning_rate": 0.00014728248703661182, + "loss": 0.8012, + "step": 227 + }, + { + "epoch": 0.3648, + "grad_norm": 0.49099852316863113, + "learning_rate": 0.00014682504933649144, + "loss": 0.7517, + "step": 228 + }, + { + "epoch": 0.3664, + "grad_norm": 0.5178109830426583, + "learning_rate": 0.00014636635319853275, + "loss": 0.8004, + "step": 229 + }, + { + "epoch": 0.368, + "grad_norm": 0.4736718415067566, + "learning_rate": 0.00014590641095033787, + "loss": 0.7315, + "step": 230 + }, + { + "epoch": 0.3696, + "grad_norm": 0.36026009543390325, + "learning_rate": 0.00014544523495299842, + "loss": 0.7507, + "step": 231 + }, + { + "epoch": 0.3712, + "grad_norm": 0.4535422092166841, + "learning_rate": 0.0001449828376007636, + "loss": 0.7609, + "step": 232 + }, + { + "epoch": 0.3728, + "grad_norm": 0.4470489385174667, + "learning_rate": 0.0001445192313207067, + "loss": 0.7381, + "step": 233 + }, + { + "epoch": 0.3744, + "grad_norm": 0.5304272092082939, + "learning_rate": 0.0001440544285723915, + "loss": 0.8047, + "step": 234 + }, + { + "epoch": 0.376, + "grad_norm": 0.4103397249611091, + "learning_rate": 0.00014358844184753712, + "loss": 0.7025, + "step": 235 + }, + { + "epoch": 0.3776, + "grad_norm": 0.4282875131696741, + "learning_rate": 0.00014312128366968243, + "loss": 0.7825, + "step": 236 + }, + { + "epoch": 0.3792, + "grad_norm": 0.3959591611639748, + "learning_rate": 0.00014265296659384956, + "loss": 0.7256, + "step": 237 + }, + { + "epoch": 0.3808, + "grad_norm": 0.4297625468076226, + "learning_rate": 0.00014218350320620624, + "loss": 0.686, + "step": 238 + }, + { + "epoch": 0.3824, + "grad_norm": 0.4338317125135978, + "learning_rate": 0.0001417129061237278, + "loss": 0.7852, + "step": 239 + }, + { + "epoch": 0.384, + "grad_norm": 0.5220480793777401, + "learning_rate": 0.00014124118799385796, + "loss": 0.8364, + "step": 240 + }, + { + "epoch": 0.3856, + "grad_norm": 0.4718410962391697, + "learning_rate": 0.00014076836149416887, + "loss": 0.8609, + "step": 241 + }, + { + "epoch": 0.3872, + "grad_norm": 0.4425877448351768, + "learning_rate": 0.0001402944393320206, + "loss": 0.7453, + "step": 242 + }, + { + "epoch": 0.3888, + "grad_norm": 0.46482627683884487, + "learning_rate": 0.00013981943424421932, + "loss": 0.7669, + "step": 243 + }, + { + "epoch": 0.3904, + "grad_norm": 0.45474128814187065, + "learning_rate": 0.00013934335899667527, + "loss": 0.8225, + "step": 244 + }, + { + "epoch": 0.392, + "grad_norm": 0.489351688159818, + "learning_rate": 0.00013886622638405952, + "loss": 0.7513, + "step": 245 + }, + { + "epoch": 0.3936, + "grad_norm": 0.4376194429250256, + "learning_rate": 0.00013838804922946027, + "loss": 0.72, + "step": 246 + }, + { + "epoch": 0.3952, + "grad_norm": 0.41802244933852195, + "learning_rate": 0.00013790884038403795, + "loss": 0.7178, + "step": 247 + }, + { + "epoch": 0.3968, + "grad_norm": 0.409712330038051, + "learning_rate": 0.00013742861272668012, + "loss": 0.7023, + "step": 248 + }, + { + "epoch": 0.3984, + "grad_norm": 0.5079825393723467, + "learning_rate": 0.00013694737916365517, + "loss": 0.8606, + "step": 249 + }, + { + "epoch": 0.4, + "grad_norm": 0.5206399287075734, + "learning_rate": 0.00013646515262826552, + "loss": 0.8485, + "step": 250 + }, + { + "epoch": 0.4016, + "grad_norm": 0.7223531563041334, + "learning_rate": 0.0001359819460805001, + "loss": 0.8535, + "step": 251 + }, + { + "epoch": 0.4032, + "grad_norm": 0.46611260705327295, + "learning_rate": 0.0001354977725066859, + "loss": 0.7691, + "step": 252 + }, + { + "epoch": 0.4048, + "grad_norm": 0.47362875116167796, + "learning_rate": 0.00013501264491913906, + "loss": 0.802, + "step": 253 + }, + { + "epoch": 0.4064, + "grad_norm": 0.4389329527007687, + "learning_rate": 0.0001345265763558152, + "loss": 0.8546, + "step": 254 + }, + { + "epoch": 0.408, + "grad_norm": 0.42871349078083126, + "learning_rate": 0.00013403957987995882, + "loss": 0.7478, + "step": 255 + }, + { + "epoch": 0.4096, + "grad_norm": 0.4720877579208673, + "learning_rate": 0.0001335516685797525, + "loss": 0.8079, + "step": 256 + }, + { + "epoch": 0.4112, + "grad_norm": 0.4959446137649537, + "learning_rate": 0.00013306285556796495, + "loss": 0.8681, + "step": 257 + }, + { + "epoch": 0.4128, + "grad_norm": 0.4123902782366385, + "learning_rate": 0.00013257315398159864, + "loss": 0.6864, + "step": 258 + }, + { + "epoch": 0.4144, + "grad_norm": 0.401872969849858, + "learning_rate": 0.00013208257698153677, + "loss": 0.7413, + "step": 259 + }, + { + "epoch": 0.416, + "grad_norm": 0.4362674378527309, + "learning_rate": 0.00013159113775218964, + "loss": 0.747, + "step": 260 + }, + { + "epoch": 0.4176, + "grad_norm": 0.4428292876454781, + "learning_rate": 0.00013109884950114007, + "loss": 0.7856, + "step": 261 + }, + { + "epoch": 0.4192, + "grad_norm": 0.4450863577634821, + "learning_rate": 0.00013060572545878875, + "loss": 0.7419, + "step": 262 + }, + { + "epoch": 0.4208, + "grad_norm": 0.4428815143185777, + "learning_rate": 0.00013011177887799845, + "loss": 0.768, + "step": 263 + }, + { + "epoch": 0.4224, + "grad_norm": 0.46916136107051853, + "learning_rate": 0.00012961702303373795, + "loss": 0.746, + "step": 264 + }, + { + "epoch": 0.424, + "grad_norm": 0.4119649528684462, + "learning_rate": 0.00012912147122272523, + "loss": 0.7287, + "step": 265 + }, + { + "epoch": 0.4256, + "grad_norm": 0.4242183391349964, + "learning_rate": 0.00012862513676307008, + "loss": 0.724, + "step": 266 + }, + { + "epoch": 0.4272, + "grad_norm": 0.43575759754640503, + "learning_rate": 0.00012812803299391628, + "loss": 0.7238, + "step": 267 + }, + { + "epoch": 0.4288, + "grad_norm": 0.4954243974585828, + "learning_rate": 0.00012763017327508305, + "loss": 0.7593, + "step": 268 + }, + { + "epoch": 0.4304, + "grad_norm": 0.472036317691702, + "learning_rate": 0.0001271315709867059, + "loss": 0.7596, + "step": 269 + }, + { + "epoch": 0.432, + "grad_norm": 0.4092156773129198, + "learning_rate": 0.00012663223952887723, + "loss": 0.8284, + "step": 270 + }, + { + "epoch": 0.4336, + "grad_norm": 0.4588338499843614, + "learning_rate": 0.00012613219232128608, + "loss": 0.6954, + "step": 271 + }, + { + "epoch": 0.4352, + "grad_norm": 0.4902923158674284, + "learning_rate": 0.00012563144280285741, + "loss": 0.8042, + "step": 272 + }, + { + "epoch": 0.4368, + "grad_norm": 0.3951625455518061, + "learning_rate": 0.00012513000443139112, + "loss": 0.7471, + "step": 273 + }, + { + "epoch": 0.4384, + "grad_norm": 0.4398892247631177, + "learning_rate": 0.00012462789068320017, + "loss": 0.773, + "step": 274 + }, + { + "epoch": 0.44, + "grad_norm": 0.405873315768006, + "learning_rate": 0.00012412511505274844, + "loss": 0.7573, + "step": 275 + }, + { + "epoch": 0.4416, + "grad_norm": 0.44096360410755114, + "learning_rate": 0.00012362169105228826, + "loss": 0.7857, + "step": 276 + }, + { + "epoch": 0.4432, + "grad_norm": 0.5224663568877417, + "learning_rate": 0.000123117632211497, + "loss": 0.7902, + "step": 277 + }, + { + "epoch": 0.4448, + "grad_norm": 0.43451134385124535, + "learning_rate": 0.00012261295207711346, + "loss": 0.7854, + "step": 278 + }, + { + "epoch": 0.4464, + "grad_norm": 0.44300948878433516, + "learning_rate": 0.0001221076642125742, + "loss": 0.7104, + "step": 279 + }, + { + "epoch": 0.448, + "grad_norm": 0.4370012262335751, + "learning_rate": 0.00012160178219764837, + "loss": 0.7029, + "step": 280 + }, + { + "epoch": 0.4496, + "grad_norm": 0.42393845788721735, + "learning_rate": 0.00012109531962807332, + "loss": 0.7269, + "step": 281 + }, + { + "epoch": 0.4512, + "grad_norm": 0.3940993083465362, + "learning_rate": 0.00012058829011518896, + "loss": 0.7364, + "step": 282 + }, + { + "epoch": 0.4528, + "grad_norm": 0.5130244197979122, + "learning_rate": 0.00012008070728557186, + "loss": 0.7748, + "step": 283 + }, + { + "epoch": 0.4544, + "grad_norm": 0.38842129810108295, + "learning_rate": 0.00011957258478066931, + "loss": 0.7487, + "step": 284 + }, + { + "epoch": 0.456, + "grad_norm": 0.407653599404947, + "learning_rate": 0.00011906393625643244, + "loss": 0.7482, + "step": 285 + }, + { + "epoch": 0.4576, + "grad_norm": 0.3708263323809625, + "learning_rate": 0.00011855477538294935, + "loss": 0.7218, + "step": 286 + }, + { + "epoch": 0.4592, + "grad_norm": 0.4677986528867026, + "learning_rate": 0.00011804511584407763, + "loss": 0.7554, + "step": 287 + }, + { + "epoch": 0.4608, + "grad_norm": 0.42020831082895277, + "learning_rate": 0.00011753497133707679, + "loss": 0.7586, + "step": 288 + }, + { + "epoch": 0.4624, + "grad_norm": 0.4339661675175279, + "learning_rate": 0.00011702435557223987, + "loss": 0.7593, + "step": 289 + }, + { + "epoch": 0.464, + "grad_norm": 0.393783616139176, + "learning_rate": 0.00011651328227252517, + "loss": 0.7852, + "step": 290 + }, + { + "epoch": 0.4656, + "grad_norm": 0.45222289125813675, + "learning_rate": 0.00011600176517318741, + "loss": 0.7711, + "step": 291 + }, + { + "epoch": 0.4672, + "grad_norm": 0.44485794781453797, + "learning_rate": 0.00011548981802140848, + "loss": 0.8108, + "step": 292 + }, + { + "epoch": 0.4688, + "grad_norm": 0.5271931699927285, + "learning_rate": 0.00011497745457592816, + "loss": 0.8133, + "step": 293 + }, + { + "epoch": 0.4704, + "grad_norm": 0.4251533047014541, + "learning_rate": 0.00011446468860667421, + "loss": 0.6985, + "step": 294 + }, + { + "epoch": 0.472, + "grad_norm": 0.4060145386422878, + "learning_rate": 0.00011395153389439233, + "loss": 0.7187, + "step": 295 + }, + { + "epoch": 0.4736, + "grad_norm": 0.47221750761818, + "learning_rate": 0.00011343800423027582, + "loss": 0.7109, + "step": 296 + }, + { + "epoch": 0.4752, + "grad_norm": 0.3581608522783374, + "learning_rate": 0.0001129241134155949, + "loss": 0.7512, + "step": 297 + }, + { + "epoch": 0.4768, + "grad_norm": 0.8522901308197447, + "learning_rate": 0.00011240987526132594, + "loss": 0.8318, + "step": 298 + }, + { + "epoch": 0.4784, + "grad_norm": 0.4568743543110379, + "learning_rate": 0.00011189530358778005, + "loss": 0.7927, + "step": 299 + }, + { + "epoch": 0.48, + "grad_norm": 0.4809043240428301, + "learning_rate": 0.00011138041222423177, + "loss": 0.709, + "step": 300 + }, + { + "epoch": 0.4816, + "grad_norm": 0.4959346652066973, + "learning_rate": 0.00011086521500854745, + "loss": 0.7968, + "step": 301 + }, + { + "epoch": 0.4832, + "grad_norm": 0.48271862062812654, + "learning_rate": 0.00011034972578681338, + "loss": 0.8049, + "step": 302 + }, + { + "epoch": 0.4848, + "grad_norm": 0.442702118060553, + "learning_rate": 0.00010983395841296348, + "loss": 0.7725, + "step": 303 + }, + { + "epoch": 0.4864, + "grad_norm": 0.5620878781206495, + "learning_rate": 0.00010931792674840718, + "loss": 0.7311, + "step": 304 + }, + { + "epoch": 0.488, + "grad_norm": 0.39885638849328964, + "learning_rate": 0.00010880164466165674, + "loss": 0.6662, + "step": 305 + }, + { + "epoch": 0.4896, + "grad_norm": 0.484997683733805, + "learning_rate": 0.00010828512602795462, + "loss": 0.7391, + "step": 306 + }, + { + "epoch": 0.4912, + "grad_norm": 0.4482024572192048, + "learning_rate": 0.00010776838472890065, + "loss": 0.7423, + "step": 307 + }, + { + "epoch": 0.4928, + "grad_norm": 0.4592358505318248, + "learning_rate": 0.00010725143465207867, + "loss": 0.7186, + "step": 308 + }, + { + "epoch": 0.4944, + "grad_norm": 0.404374569147412, + "learning_rate": 0.00010673428969068364, + "loss": 0.7292, + "step": 309 + }, + { + "epoch": 0.496, + "grad_norm": 0.4756451675893965, + "learning_rate": 0.00010621696374314807, + "loss": 0.7022, + "step": 310 + }, + { + "epoch": 0.4976, + "grad_norm": 0.38598764122770934, + "learning_rate": 0.00010569947071276847, + "loss": 0.7359, + "step": 311 + }, + { + "epoch": 0.4992, + "grad_norm": 0.5088816244441849, + "learning_rate": 0.00010518182450733186, + "loss": 0.8187, + "step": 312 + }, + { + "epoch": 0.5008, + "grad_norm": 0.3991736960217553, + "learning_rate": 0.00010466403903874176, + "loss": 0.7517, + "step": 313 + }, + { + "epoch": 0.5024, + "grad_norm": 0.5061478823334051, + "learning_rate": 0.00010414612822264455, + "loss": 0.7845, + "step": 314 + }, + { + "epoch": 0.504, + "grad_norm": 0.38312118917493265, + "learning_rate": 0.00010362810597805526, + "loss": 0.6894, + "step": 315 + }, + { + "epoch": 0.5056, + "grad_norm": 0.40434609793297854, + "learning_rate": 0.0001031099862269837, + "loss": 0.713, + "step": 316 + }, + { + "epoch": 0.5072, + "grad_norm": 0.4226916043338718, + "learning_rate": 0.00010259178289406011, + "loss": 0.7646, + "step": 317 + }, + { + "epoch": 0.5088, + "grad_norm": 0.43692635202741587, + "learning_rate": 0.00010207350990616107, + "loss": 0.683, + "step": 318 + }, + { + "epoch": 0.5104, + "grad_norm": 0.5029615187149719, + "learning_rate": 0.0001015551811920351, + "loss": 0.7439, + "step": 319 + }, + { + "epoch": 0.512, + "grad_norm": 0.4105261731013297, + "learning_rate": 0.00010103681068192845, + "loss": 0.7488, + "step": 320 + }, + { + "epoch": 0.5136, + "grad_norm": 0.3911529679615706, + "learning_rate": 0.00010051841230721065, + "loss": 0.6928, + "step": 321 + }, + { + "epoch": 0.5152, + "grad_norm": 0.4807368991319861, + "learning_rate": 0.0001, + "loss": 0.8258, + "step": 322 + }, + { + "epoch": 0.5168, + "grad_norm": 0.5699018479220594, + "learning_rate": 9.948158769278939e-05, + "loss": 0.727, + "step": 323 + }, + { + "epoch": 0.5184, + "grad_norm": 0.3524982175685148, + "learning_rate": 9.896318931807155e-05, + "loss": 0.7082, + "step": 324 + }, + { + "epoch": 0.52, + "grad_norm": 0.4191858175776619, + "learning_rate": 9.844481880796491e-05, + "loss": 0.7889, + "step": 325 + }, + { + "epoch": 0.5216, + "grad_norm": 0.5089043426821488, + "learning_rate": 9.792649009383899e-05, + "loss": 0.8518, + "step": 326 + }, + { + "epoch": 0.5232, + "grad_norm": 0.39226352932466774, + "learning_rate": 9.740821710593989e-05, + "loss": 0.6196, + "step": 327 + }, + { + "epoch": 0.5248, + "grad_norm": 0.47792692363151024, + "learning_rate": 9.689001377301633e-05, + "loss": 0.6957, + "step": 328 + }, + { + "epoch": 0.5264, + "grad_norm": 0.44851193673389383, + "learning_rate": 9.637189402194476e-05, + "loss": 0.7451, + "step": 329 + }, + { + "epoch": 0.528, + "grad_norm": 0.36285820996546586, + "learning_rate": 9.585387177735547e-05, + "loss": 0.677, + "step": 330 + }, + { + "epoch": 0.5296, + "grad_norm": 0.4231831734407228, + "learning_rate": 9.533596096125825e-05, + "loss": 0.8222, + "step": 331 + }, + { + "epoch": 0.5312, + "grad_norm": 0.4800096092127521, + "learning_rate": 9.481817549266817e-05, + "loss": 0.6948, + "step": 332 + }, + { + "epoch": 0.5328, + "grad_norm": 0.3921582958393825, + "learning_rate": 9.430052928723153e-05, + "loss": 0.6989, + "step": 333 + }, + { + "epoch": 0.5344, + "grad_norm": 0.4605387181041602, + "learning_rate": 9.378303625685195e-05, + "loss": 0.801, + "step": 334 + }, + { + "epoch": 0.536, + "grad_norm": 0.3697875554335195, + "learning_rate": 9.326571030931637e-05, + "loss": 0.698, + "step": 335 + }, + { + "epoch": 0.5376, + "grad_norm": 0.41037288560797924, + "learning_rate": 9.274856534792138e-05, + "loss": 0.7173, + "step": 336 + }, + { + "epoch": 0.5392, + "grad_norm": 0.5513440460409281, + "learning_rate": 9.223161527109937e-05, + "loss": 0.7883, + "step": 337 + }, + { + "epoch": 0.5408, + "grad_norm": 0.47597729596937366, + "learning_rate": 9.171487397204539e-05, + "loss": 0.7454, + "step": 338 + }, + { + "epoch": 0.5424, + "grad_norm": 0.40362975305365, + "learning_rate": 9.119835533834331e-05, + "loss": 0.6796, + "step": 339 + }, + { + "epoch": 0.544, + "grad_norm": 0.47052048349930625, + "learning_rate": 9.068207325159284e-05, + "loss": 0.8046, + "step": 340 + }, + { + "epoch": 0.5456, + "grad_norm": 0.39305474918369454, + "learning_rate": 9.016604158703654e-05, + "loss": 0.7177, + "step": 341 + }, + { + "epoch": 0.5472, + "grad_norm": 0.4365853618850097, + "learning_rate": 8.965027421318665e-05, + "loss": 0.7464, + "step": 342 + }, + { + "epoch": 0.5488, + "grad_norm": 0.46443883518953344, + "learning_rate": 8.913478499145254e-05, + "loss": 0.7721, + "step": 343 + }, + { + "epoch": 0.5504, + "grad_norm": 0.6077144067602148, + "learning_rate": 8.861958777576827e-05, + "loss": 0.7881, + "step": 344 + }, + { + "epoch": 0.552, + "grad_norm": 0.4559313064847387, + "learning_rate": 8.810469641222001e-05, + "loss": 0.7696, + "step": 345 + }, + { + "epoch": 0.5536, + "grad_norm": 0.48334528459124015, + "learning_rate": 8.759012473867407e-05, + "loss": 0.7902, + "step": 346 + }, + { + "epoch": 0.5552, + "grad_norm": 0.4122577959320843, + "learning_rate": 8.707588658440511e-05, + "loss": 0.7244, + "step": 347 + }, + { + "epoch": 0.5568, + "grad_norm": 0.3967435244897451, + "learning_rate": 8.656199576972423e-05, + "loss": 0.754, + "step": 348 + }, + { + "epoch": 0.5584, + "grad_norm": 0.4959668953567126, + "learning_rate": 8.604846610560771e-05, + "loss": 0.8717, + "step": 349 + }, + { + "epoch": 0.56, + "grad_norm": 0.43450610691260594, + "learning_rate": 8.553531139332582e-05, + "loss": 0.6782, + "step": 350 + }, + { + "epoch": 0.5616, + "grad_norm": 0.49715994893977894, + "learning_rate": 8.502254542407186e-05, + "loss": 0.6976, + "step": 351 + }, + { + "epoch": 0.5632, + "grad_norm": 0.4067290311812931, + "learning_rate": 8.451018197859153e-05, + "loss": 0.7294, + "step": 352 + }, + { + "epoch": 0.5648, + "grad_norm": 0.4690739518969345, + "learning_rate": 8.399823482681262e-05, + "loss": 0.7932, + "step": 353 + }, + { + "epoch": 0.5664, + "grad_norm": 0.4981800204868436, + "learning_rate": 8.348671772747487e-05, + "loss": 0.7171, + "step": 354 + }, + { + "epoch": 0.568, + "grad_norm": 0.46035418939471695, + "learning_rate": 8.297564442776014e-05, + "loss": 0.6594, + "step": 355 + }, + { + "epoch": 0.5696, + "grad_norm": 0.4645533919344865, + "learning_rate": 8.246502866292324e-05, + "loss": 0.7845, + "step": 356 + }, + { + "epoch": 0.5712, + "grad_norm": 0.37790686444774557, + "learning_rate": 8.195488415592238e-05, + "loss": 0.6723, + "step": 357 + }, + { + "epoch": 0.5728, + "grad_norm": 0.5346137138402413, + "learning_rate": 8.144522461705067e-05, + "loss": 0.8126, + "step": 358 + }, + { + "epoch": 0.5744, + "grad_norm": 0.5802932651326059, + "learning_rate": 8.093606374356759e-05, + "loss": 0.7267, + "step": 359 + }, + { + "epoch": 0.576, + "grad_norm": 0.3998975157211213, + "learning_rate": 8.042741521933071e-05, + "loss": 0.7429, + "step": 360 + }, + { + "epoch": 0.5776, + "grad_norm": 0.42551219430876636, + "learning_rate": 7.991929271442817e-05, + "loss": 0.7446, + "step": 361 + }, + { + "epoch": 0.5792, + "grad_norm": 0.3915114759014101, + "learning_rate": 7.941170988481108e-05, + "loss": 0.7621, + "step": 362 + }, + { + "epoch": 0.5808, + "grad_norm": 0.42707938100584764, + "learning_rate": 7.89046803719267e-05, + "loss": 0.7594, + "step": 363 + }, + { + "epoch": 0.5824, + "grad_norm": 0.4217286541447519, + "learning_rate": 7.839821780235168e-05, + "loss": 0.7066, + "step": 364 + }, + { + "epoch": 0.584, + "grad_norm": 0.4813579329876303, + "learning_rate": 7.789233578742582e-05, + "loss": 0.8272, + "step": 365 + }, + { + "epoch": 0.5856, + "grad_norm": 0.5209421571253114, + "learning_rate": 7.738704792288655e-05, + "loss": 0.7684, + "step": 366 + }, + { + "epoch": 0.5872, + "grad_norm": 0.38388397495120674, + "learning_rate": 7.688236778850306e-05, + "loss": 0.6926, + "step": 367 + }, + { + "epoch": 0.5888, + "grad_norm": 0.38206811685405234, + "learning_rate": 7.637830894771175e-05, + "loss": 0.7066, + "step": 368 + }, + { + "epoch": 0.5904, + "grad_norm": 0.42079054181453934, + "learning_rate": 7.587488494725157e-05, + "loss": 0.742, + "step": 369 + }, + { + "epoch": 0.592, + "grad_norm": 0.45638369051473626, + "learning_rate": 7.537210931679987e-05, + "loss": 0.7812, + "step": 370 + }, + { + "epoch": 0.5936, + "grad_norm": 0.45426445383407044, + "learning_rate": 7.48699955686089e-05, + "loss": 0.8086, + "step": 371 + }, + { + "epoch": 0.5952, + "grad_norm": 0.5540822685134705, + "learning_rate": 7.43685571971426e-05, + "loss": 0.8019, + "step": 372 + }, + { + "epoch": 0.5968, + "grad_norm": 0.4241661579124998, + "learning_rate": 7.386780767871397e-05, + "loss": 0.7149, + "step": 373 + }, + { + "epoch": 0.5984, + "grad_norm": 0.41545262832705115, + "learning_rate": 7.336776047112276e-05, + "loss": 0.7546, + "step": 374 + }, + { + "epoch": 0.6, + "grad_norm": 0.47854883595431164, + "learning_rate": 7.286842901329412e-05, + "loss": 0.7275, + "step": 375 + }, + { + "epoch": 0.6016, + "grad_norm": 0.5417155836721987, + "learning_rate": 7.236982672491698e-05, + "loss": 0.7819, + "step": 376 + }, + { + "epoch": 0.6032, + "grad_norm": 0.3418374137491104, + "learning_rate": 7.187196700608373e-05, + "loss": 0.6197, + "step": 377 + }, + { + "epoch": 0.6048, + "grad_norm": 0.37827506427354074, + "learning_rate": 7.137486323692995e-05, + "loss": 0.7662, + "step": 378 + }, + { + "epoch": 0.6064, + "grad_norm": 0.37678842749919733, + "learning_rate": 7.087852877727481e-05, + "loss": 0.6813, + "step": 379 + }, + { + "epoch": 0.608, + "grad_norm": 0.38552606554567614, + "learning_rate": 7.038297696626206e-05, + "loss": 0.6521, + "step": 380 + }, + { + "epoch": 0.6096, + "grad_norm": 0.39221566211771824, + "learning_rate": 6.988822112200156e-05, + "loss": 0.6736, + "step": 381 + }, + { + "epoch": 0.6112, + "grad_norm": 0.5814535406006868, + "learning_rate": 6.939427454121128e-05, + "loss": 0.9052, + "step": 382 + }, + { + "epoch": 0.6128, + "grad_norm": 0.4316499224413905, + "learning_rate": 6.890115049885994e-05, + "loss": 0.7434, + "step": 383 + }, + { + "epoch": 0.6144, + "grad_norm": 0.42320188391812447, + "learning_rate": 6.84088622478104e-05, + "loss": 0.6543, + "step": 384 + }, + { + "epoch": 0.616, + "grad_norm": 0.5127523870382765, + "learning_rate": 6.791742301846326e-05, + "loss": 0.8075, + "step": 385 + }, + { + "epoch": 0.6176, + "grad_norm": 0.37951728641973953, + "learning_rate": 6.742684601840141e-05, + "loss": 0.7253, + "step": 386 + }, + { + "epoch": 0.6192, + "grad_norm": 0.3968770913229026, + "learning_rate": 6.693714443203507e-05, + "loss": 0.7194, + "step": 387 + }, + { + "epoch": 0.6208, + "grad_norm": 0.3788579760423852, + "learning_rate": 6.644833142024751e-05, + "loss": 0.7089, + "step": 388 + }, + { + "epoch": 0.6224, + "grad_norm": 0.4327607445645209, + "learning_rate": 6.59604201200412e-05, + "loss": 0.7981, + "step": 389 + }, + { + "epoch": 0.624, + "grad_norm": 0.40926501415323624, + "learning_rate": 6.547342364418481e-05, + "loss": 0.8147, + "step": 390 + }, + { + "epoch": 0.6256, + "grad_norm": 0.36710894474207795, + "learning_rate": 6.498735508086093e-05, + "loss": 0.7413, + "step": 391 + }, + { + "epoch": 0.6272, + "grad_norm": 0.37772753596676223, + "learning_rate": 6.450222749331414e-05, + "loss": 0.7627, + "step": 392 + }, + { + "epoch": 0.6288, + "grad_norm": 0.4103176842308724, + "learning_rate": 6.40180539194999e-05, + "loss": 0.7856, + "step": 393 + }, + { + "epoch": 0.6304, + "grad_norm": 0.4787503849919845, + "learning_rate": 6.35348473717345e-05, + "loss": 0.8353, + "step": 394 + }, + { + "epoch": 0.632, + "grad_norm": 0.5057142005966992, + "learning_rate": 6.305262083634488e-05, + "loss": 0.893, + "step": 395 + }, + { + "epoch": 0.6336, + "grad_norm": 0.42615384142110324, + "learning_rate": 6.25713872733199e-05, + "loss": 0.7218, + "step": 396 + }, + { + "epoch": 0.6352, + "grad_norm": 0.38574767132403925, + "learning_rate": 6.209115961596208e-05, + "loss": 0.74, + "step": 397 + }, + { + "epoch": 0.6368, + "grad_norm": 0.4207612858058705, + "learning_rate": 6.161195077053976e-05, + "loss": 0.7154, + "step": 398 + }, + { + "epoch": 0.6384, + "grad_norm": 0.39842148956461876, + "learning_rate": 6.113377361594049e-05, + "loss": 0.6989, + "step": 399 + }, + { + "epoch": 0.64, + "grad_norm": 0.4004093364012239, + "learning_rate": 6.065664100332478e-05, + "loss": 0.6954, + "step": 400 + }, + { + "epoch": 0.6416, + "grad_norm": 0.40440694527606397, + "learning_rate": 6.018056575578075e-05, + "loss": 0.7167, + "step": 401 + }, + { + "epoch": 0.6432, + "grad_norm": 0.4302673835565511, + "learning_rate": 5.970556066797941e-05, + "loss": 0.7042, + "step": 402 + }, + { + "epoch": 0.6448, + "grad_norm": 0.39755984527488586, + "learning_rate": 5.923163850583113e-05, + "loss": 0.6723, + "step": 403 + }, + { + "epoch": 0.6464, + "grad_norm": 0.4766648621407094, + "learning_rate": 5.875881200614207e-05, + "loss": 0.7455, + "step": 404 + }, + { + "epoch": 0.648, + "grad_norm": 0.414554232063145, + "learning_rate": 5.828709387627218e-05, + "loss": 0.6784, + "step": 405 + }, + { + "epoch": 0.6496, + "grad_norm": 0.43440251704921684, + "learning_rate": 5.781649679379378e-05, + "loss": 0.7554, + "step": 406 + }, + { + "epoch": 0.6512, + "grad_norm": 0.4205705236293427, + "learning_rate": 5.73470334061505e-05, + "loss": 0.7643, + "step": 407 + }, + { + "epoch": 0.6528, + "grad_norm": 0.5240881892346563, + "learning_rate": 5.687871633031754e-05, + "loss": 0.7497, + "step": 408 + }, + { + "epoch": 0.6544, + "grad_norm": 0.41002431801695394, + "learning_rate": 5.6411558152462894e-05, + "loss": 0.6746, + "step": 409 + }, + { + "epoch": 0.656, + "grad_norm": 0.39079544638877645, + "learning_rate": 5.5945571427608526e-05, + "loss": 0.7334, + "step": 410 + }, + { + "epoch": 0.6576, + "grad_norm": 0.46154129162530355, + "learning_rate": 5.54807686792933e-05, + "loss": 0.7821, + "step": 411 + }, + { + "epoch": 0.6592, + "grad_norm": 0.43857763101329666, + "learning_rate": 5.501716239923642e-05, + "loss": 0.6967, + "step": 412 + }, + { + "epoch": 0.6608, + "grad_norm": 0.4745556878551065, + "learning_rate": 5.4554765047001613e-05, + "loss": 0.721, + "step": 413 + }, + { + "epoch": 0.6624, + "grad_norm": 0.43407976222305766, + "learning_rate": 5.4093589049662175e-05, + "loss": 0.7525, + "step": 414 + }, + { + "epoch": 0.664, + "grad_norm": 0.4377749247872826, + "learning_rate": 5.363364680146725e-05, + "loss": 0.7309, + "step": 415 + }, + { + "epoch": 0.6656, + "grad_norm": 0.36927751303735046, + "learning_rate": 5.31749506635086e-05, + "loss": 0.7268, + "step": 416 + }, + { + "epoch": 0.6672, + "grad_norm": 0.3715247184775538, + "learning_rate": 5.271751296338823e-05, + "loss": 0.7297, + "step": 417 + }, + { + "epoch": 0.6688, + "grad_norm": 0.42401079947649123, + "learning_rate": 5.226134599488728e-05, + "loss": 0.6671, + "step": 418 + }, + { + "epoch": 0.6704, + "grad_norm": 0.46140173636102044, + "learning_rate": 5.180646201763577e-05, + "loss": 0.7596, + "step": 419 + }, + { + "epoch": 0.672, + "grad_norm": 0.4120529712114191, + "learning_rate": 5.135287325678271e-05, + "loss": 0.6686, + "step": 420 + }, + { + "epoch": 0.6736, + "grad_norm": 0.46392851458326273, + "learning_rate": 5.090059190266779e-05, + "loss": 0.7256, + "step": 421 + }, + { + "epoch": 0.6752, + "grad_norm": 0.41595596761697523, + "learning_rate": 5.0449630110493836e-05, + "loss": 0.692, + "step": 422 + }, + { + "epoch": 0.6768, + "grad_norm": 0.5004337449226676, + "learning_rate": 5.000000000000002e-05, + "loss": 0.801, + "step": 423 + }, + { + "epoch": 0.6784, + "grad_norm": 0.35561942584168765, + "learning_rate": 4.955171365513603e-05, + "loss": 0.6882, + "step": 424 + }, + { + "epoch": 0.68, + "grad_norm": 0.43571514552457885, + "learning_rate": 4.9104783123737566e-05, + "loss": 0.6887, + "step": 425 + }, + { + "epoch": 0.6816, + "grad_norm": 0.48997334399222864, + "learning_rate": 4.865922041720239e-05, + "loss": 0.7369, + "step": 426 + }, + { + "epoch": 0.6832, + "grad_norm": 0.48227674033755163, + "learning_rate": 4.821503751016746e-05, + "loss": 0.7833, + "step": 427 + }, + { + "epoch": 0.6848, + "grad_norm": 0.3959685747725765, + "learning_rate": 4.777224634018732e-05, + "loss": 0.7303, + "step": 428 + }, + { + "epoch": 0.6864, + "grad_norm": 0.694068235573397, + "learning_rate": 4.733085880741301e-05, + "loss": 0.7366, + "step": 429 + }, + { + "epoch": 0.688, + "grad_norm": 0.49007966016681326, + "learning_rate": 4.689088677427249e-05, + "loss": 0.7407, + "step": 430 + }, + { + "epoch": 0.6896, + "grad_norm": 0.42312320284128413, + "learning_rate": 4.645234206515171e-05, + "loss": 0.7419, + "step": 431 + }, + { + "epoch": 0.6912, + "grad_norm": 0.534244873446867, + "learning_rate": 4.6015236466076747e-05, + "loss": 0.8048, + "step": 432 + }, + { + "epoch": 0.6928, + "grad_norm": 0.3618785576090114, + "learning_rate": 4.5579581724397255e-05, + "loss": 0.7265, + "step": 433 + }, + { + "epoch": 0.6944, + "grad_norm": 0.45673110894257185, + "learning_rate": 4.514538954847064e-05, + "loss": 0.6546, + "step": 434 + }, + { + "epoch": 0.696, + "grad_norm": 0.4761630111237167, + "learning_rate": 4.471267160734731e-05, + "loss": 0.7687, + "step": 435 + }, + { + "epoch": 0.6976, + "grad_norm": 0.38102982193869156, + "learning_rate": 4.428143953045717e-05, + "loss": 0.661, + "step": 436 + }, + { + "epoch": 0.6992, + "grad_norm": 0.6090794065883615, + "learning_rate": 4.385170490729712e-05, + "loss": 0.872, + "step": 437 + }, + { + "epoch": 0.7008, + "grad_norm": 0.8268384433744854, + "learning_rate": 4.342347928711953e-05, + "loss": 0.6434, + "step": 438 + }, + { + "epoch": 0.7024, + "grad_norm": 0.42124555231335026, + "learning_rate": 4.2996774178621736e-05, + "loss": 0.8004, + "step": 439 + }, + { + "epoch": 0.704, + "grad_norm": 0.4366646958594062, + "learning_rate": 4.257160104963696e-05, + "loss": 0.7084, + "step": 440 + }, + { + "epoch": 0.7056, + "grad_norm": 0.40981492026915917, + "learning_rate": 4.2147971326825966e-05, + "loss": 0.7563, + "step": 441 + }, + { + "epoch": 0.7072, + "grad_norm": 0.3963709876378491, + "learning_rate": 4.172589639536991e-05, + "loss": 0.6875, + "step": 442 + }, + { + "epoch": 0.7088, + "grad_norm": 0.4350406188024903, + "learning_rate": 4.130538759866457e-05, + "loss": 0.6756, + "step": 443 + }, + { + "epoch": 0.7104, + "grad_norm": 0.4856520578662212, + "learning_rate": 4.088645623801534e-05, + "loss": 0.7326, + "step": 444 + }, + { + "epoch": 0.712, + "grad_norm": 0.444036509677123, + "learning_rate": 4.046911357233343e-05, + "loss": 0.7455, + "step": 445 + }, + { + "epoch": 0.7136, + "grad_norm": 0.44396341098705794, + "learning_rate": 4.00533708178334e-05, + "loss": 0.6529, + "step": 446 + }, + { + "epoch": 0.7152, + "grad_norm": 0.3636036284538335, + "learning_rate": 3.963923914773187e-05, + "loss": 0.7295, + "step": 447 + }, + { + "epoch": 0.7168, + "grad_norm": 0.5035071754980905, + "learning_rate": 3.922672969194686e-05, + "loss": 0.6848, + "step": 448 + }, + { + "epoch": 0.7184, + "grad_norm": 0.3752917947702577, + "learning_rate": 3.8815853536798904e-05, + "loss": 0.6623, + "step": 449 + }, + { + "epoch": 0.72, + "grad_norm": 0.4664576764813504, + "learning_rate": 3.840662172471315e-05, + "loss": 0.7635, + "step": 450 + }, + { + "epoch": 0.7216, + "grad_norm": 0.3555481752677377, + "learning_rate": 3.79990452539225e-05, + "loss": 0.6789, + "step": 451 + }, + { + "epoch": 0.7232, + "grad_norm": 0.38442265931936404, + "learning_rate": 3.759313507817196e-05, + "loss": 0.6852, + "step": 452 + }, + { + "epoch": 0.7248, + "grad_norm": 0.41769954474382226, + "learning_rate": 3.7188902106424416e-05, + "loss": 0.766, + "step": 453 + }, + { + "epoch": 0.7264, + "grad_norm": 0.3938081186676616, + "learning_rate": 3.678635720256737e-05, + "loss": 0.71, + "step": 454 + }, + { + "epoch": 0.728, + "grad_norm": 0.7310037019848448, + "learning_rate": 3.638551118512089e-05, + "loss": 0.6511, + "step": 455 + }, + { + "epoch": 0.7296, + "grad_norm": 0.46401448267889905, + "learning_rate": 3.5986374826947066e-05, + "loss": 0.8104, + "step": 456 + }, + { + "epoch": 0.7312, + "grad_norm": 0.4204022615858649, + "learning_rate": 3.558895885496023e-05, + "loss": 0.734, + "step": 457 + }, + { + "epoch": 0.7328, + "grad_norm": 0.6267879420567184, + "learning_rate": 3.519327394983888e-05, + "loss": 0.6532, + "step": 458 + }, + { + "epoch": 0.7344, + "grad_norm": 0.4369141796964484, + "learning_rate": 3.479933074573858e-05, + "loss": 0.713, + "step": 459 + }, + { + "epoch": 0.736, + "grad_norm": 0.42811342285199183, + "learning_rate": 3.440713983000601e-05, + "loss": 0.7457, + "step": 460 + }, + { + "epoch": 0.7376, + "grad_norm": 0.4308026861336839, + "learning_rate": 3.401671174289469e-05, + "loss": 0.7334, + "step": 461 + }, + { + "epoch": 0.7392, + "grad_norm": 0.4067378877022371, + "learning_rate": 3.362805697728145e-05, + "loss": 0.7069, + "step": 462 + }, + { + "epoch": 0.7408, + "grad_norm": 0.44304826502818534, + "learning_rate": 3.324118597838464e-05, + "loss": 0.7197, + "step": 463 + }, + { + "epoch": 0.7424, + "grad_norm": 0.4028710850374116, + "learning_rate": 3.285610914348332e-05, + "loss": 0.6856, + "step": 464 + }, + { + "epoch": 0.744, + "grad_norm": 0.4464052817108853, + "learning_rate": 3.2472836821637744e-05, + "loss": 0.7472, + "step": 465 + }, + { + "epoch": 0.7456, + "grad_norm": 0.48050418959351304, + "learning_rate": 3.209137931341143e-05, + "loss": 0.7196, + "step": 466 + }, + { + "epoch": 0.7472, + "grad_norm": 0.37982366825403496, + "learning_rate": 3.1711746870594086e-05, + "loss": 0.6638, + "step": 467 + }, + { + "epoch": 0.7488, + "grad_norm": 0.44567261953196763, + "learning_rate": 3.1333949695926324e-05, + "loss": 0.73, + "step": 468 + }, + { + "epoch": 0.7504, + "grad_norm": 0.5719709727175741, + "learning_rate": 3.0957997942825336e-05, + "loss": 0.7953, + "step": 469 + }, + { + "epoch": 0.752, + "grad_norm": 0.504208070255483, + "learning_rate": 3.058390171511196e-05, + "loss": 0.6936, + "step": 470 + }, + { + "epoch": 0.7536, + "grad_norm": 0.45824944366726106, + "learning_rate": 3.021167106673928e-05, + "loss": 0.7434, + "step": 471 + }, + { + "epoch": 0.7552, + "grad_norm": 0.44150429031169014, + "learning_rate": 2.9841316001522347e-05, + "loss": 0.7289, + "step": 472 + }, + { + "epoch": 0.7568, + "grad_norm": 0.4566032504648868, + "learning_rate": 2.9472846472869298e-05, + "loss": 0.6659, + "step": 473 + }, + { + "epoch": 0.7584, + "grad_norm": 0.42578567358612823, + "learning_rate": 2.9106272383513835e-05, + "loss": 0.6234, + "step": 474 + }, + { + "epoch": 0.76, + "grad_norm": 0.467928502515594, + "learning_rate": 2.874160358524931e-05, + "loss": 0.7224, + "step": 475 + }, + { + "epoch": 0.7616, + "grad_norm": 0.45007822419266574, + "learning_rate": 2.8378849878663628e-05, + "loss": 0.6943, + "step": 476 + }, + { + "epoch": 0.7632, + "grad_norm": 0.4108721661580886, + "learning_rate": 2.8018021012875994e-05, + "loss": 0.7003, + "step": 477 + }, + { + "epoch": 0.7648, + "grad_norm": 0.4681752861870969, + "learning_rate": 2.7659126685275027e-05, + "loss": 0.673, + "step": 478 + }, + { + "epoch": 0.7664, + "grad_norm": 0.4080367829644397, + "learning_rate": 2.7302176541257986e-05, + "loss": 0.7153, + "step": 479 + }, + { + "epoch": 0.768, + "grad_norm": 0.4133472524221295, + "learning_rate": 2.6947180173971508e-05, + "loss": 0.6626, + "step": 480 + }, + { + "epoch": 0.7696, + "grad_norm": 0.46763846734027786, + "learning_rate": 2.659414712405398e-05, + "loss": 0.663, + "step": 481 + }, + { + "epoch": 0.7712, + "grad_norm": 0.5539047759878163, + "learning_rate": 2.6243086879379e-05, + "loss": 0.7581, + "step": 482 + }, + { + "epoch": 0.7728, + "grad_norm": 0.38354130612880843, + "learning_rate": 2.5894008874800325e-05, + "loss": 0.6638, + "step": 483 + }, + { + "epoch": 0.7744, + "grad_norm": 0.4199217200454841, + "learning_rate": 2.5546922491898495e-05, + "loss": 0.744, + "step": 484 + }, + { + "epoch": 0.776, + "grad_norm": 0.42056017357147596, + "learning_rate": 2.5201837058728505e-05, + "loss": 0.7472, + "step": 485 + }, + { + "epoch": 0.7776, + "grad_norm": 0.369375746532977, + "learning_rate": 2.485876184956928e-05, + "loss": 0.6312, + "step": 486 + }, + { + "epoch": 0.7792, + "grad_norm": 0.4931786683894933, + "learning_rate": 2.451770608467432e-05, + "loss": 0.7021, + "step": 487 + }, + { + "epoch": 0.7808, + "grad_norm": 0.47565217238459334, + "learning_rate": 2.417867893002387e-05, + "loss": 0.6572, + "step": 488 + }, + { + "epoch": 0.7824, + "grad_norm": 0.47582521206217604, + "learning_rate": 2.3841689497078746e-05, + "loss": 0.7609, + "step": 489 + }, + { + "epoch": 0.784, + "grad_norm": 0.4241576219472043, + "learning_rate": 2.3506746842535242e-05, + "loss": 0.7538, + "step": 490 + }, + { + "epoch": 0.7856, + "grad_norm": 0.5125539953983531, + "learning_rate": 2.3173859968081944e-05, + "loss": 0.7831, + "step": 491 + }, + { + "epoch": 0.7872, + "grad_norm": 0.3492899730093599, + "learning_rate": 2.2843037820157675e-05, + "loss": 0.6928, + "step": 492 + }, + { + "epoch": 0.7888, + "grad_norm": 0.3876442697873921, + "learning_rate": 2.251428928971102e-05, + "loss": 0.6825, + "step": 493 + }, + { + "epoch": 0.7904, + "grad_norm": 0.3974699605820346, + "learning_rate": 2.2187623211961562e-05, + "loss": 0.6964, + "step": 494 + }, + { + "epoch": 0.792, + "grad_norm": 0.4629654109049974, + "learning_rate": 2.1863048366162208e-05, + "loss": 0.7326, + "step": 495 + }, + { + "epoch": 0.7936, + "grad_norm": 0.5108010894457006, + "learning_rate": 2.1540573475363402e-05, + "loss": 0.6617, + "step": 496 + }, + { + "epoch": 0.7952, + "grad_norm": 0.45041661415736134, + "learning_rate": 2.1220207206178688e-05, + "loss": 0.6491, + "step": 497 + }, + { + "epoch": 0.7968, + "grad_norm": 0.41031905128027335, + "learning_rate": 2.0901958168551638e-05, + "loss": 0.7253, + "step": 498 + }, + { + "epoch": 0.7984, + "grad_norm": 0.463599990726639, + "learning_rate": 2.058583491552465e-05, + "loss": 0.7482, + "step": 499 + }, + { + "epoch": 0.8, + "grad_norm": 0.44554078726064866, + "learning_rate": 2.027184594300898e-05, + "loss": 0.6834, + "step": 500 + }, + { + "epoch": 0.8016, + "grad_norm": 0.5519141136342646, + "learning_rate": 1.995999968955641e-05, + "loss": 0.8325, + "step": 501 + }, + { + "epoch": 0.8032, + "grad_norm": 0.3867379104449113, + "learning_rate": 1.9650304536132426e-05, + "loss": 0.7434, + "step": 502 + }, + { + "epoch": 0.8048, + "grad_norm": 0.46304328790007515, + "learning_rate": 1.9342768805891178e-05, + "loss": 0.7524, + "step": 503 + }, + { + "epoch": 0.8064, + "grad_norm": 0.4644753053921928, + "learning_rate": 1.903740076395151e-05, + "loss": 0.8113, + "step": 504 + }, + { + "epoch": 0.808, + "grad_norm": 0.46405361531317724, + "learning_rate": 1.8734208617174988e-05, + "loss": 0.7376, + "step": 505 + }, + { + "epoch": 0.8096, + "grad_norm": 0.47290916009529965, + "learning_rate": 1.8433200513945337e-05, + "loss": 0.7238, + "step": 506 + }, + { + "epoch": 0.8112, + "grad_norm": 0.38646347584691115, + "learning_rate": 1.8134384543949478e-05, + "loss": 0.6916, + "step": 507 + }, + { + "epoch": 0.8128, + "grad_norm": 0.4688049279162531, + "learning_rate": 1.783776873795994e-05, + "loss": 0.6537, + "step": 508 + }, + { + "epoch": 0.8144, + "grad_norm": 0.40977830334252874, + "learning_rate": 1.754336106761927e-05, + "loss": 0.6816, + "step": 509 + }, + { + "epoch": 0.816, + "grad_norm": 0.41665687032634674, + "learning_rate": 1.7251169445225657e-05, + "loss": 0.7616, + "step": 510 + }, + { + "epoch": 0.8176, + "grad_norm": 0.4449277242678797, + "learning_rate": 1.696120172352025e-05, + "loss": 0.7309, + "step": 511 + }, + { + "epoch": 0.8192, + "grad_norm": 0.3862853262939256, + "learning_rate": 1.6673465695476232e-05, + "loss": 0.6341, + "step": 512 + }, + { + "epoch": 0.8208, + "grad_norm": 0.417870667519656, + "learning_rate": 1.6387969094089316e-05, + "loss": 0.6424, + "step": 513 + }, + { + "epoch": 0.8224, + "grad_norm": 0.4121557958551208, + "learning_rate": 1.6104719592169902e-05, + "loss": 0.6814, + "step": 514 + }, + { + "epoch": 0.824, + "grad_norm": 0.42721070624989443, + "learning_rate": 1.5823724802136865e-05, + "loss": 0.6832, + "step": 515 + }, + { + "epoch": 0.8256, + "grad_norm": 0.40377802251968753, + "learning_rate": 1.5544992275813053e-05, + "loss": 0.6557, + "step": 516 + }, + { + "epoch": 0.8272, + "grad_norm": 0.3639893684851359, + "learning_rate": 1.526852950422226e-05, + "loss": 0.6833, + "step": 517 + }, + { + "epoch": 0.8288, + "grad_norm": 0.5400234489168635, + "learning_rate": 1.4994343917387854e-05, + "loss": 0.7916, + "step": 518 + }, + { + "epoch": 0.8304, + "grad_norm": 0.49982839638910553, + "learning_rate": 1.4722442884133214e-05, + "loss": 0.7992, + "step": 519 + }, + { + "epoch": 0.832, + "grad_norm": 0.49879430542119474, + "learning_rate": 1.4452833711883628e-05, + "loss": 0.668, + "step": 520 + }, + { + "epoch": 0.8336, + "grad_norm": 0.4329818300875685, + "learning_rate": 1.4185523646469822e-05, + "loss": 0.6549, + "step": 521 + }, + { + "epoch": 0.8352, + "grad_norm": 0.5055663085996884, + "learning_rate": 1.3920519871933424e-05, + "loss": 0.7629, + "step": 522 + }, + { + "epoch": 0.8368, + "grad_norm": 0.45750246094611186, + "learning_rate": 1.3657829510333654e-05, + "loss": 0.6691, + "step": 523 + }, + { + "epoch": 0.8384, + "grad_norm": 0.4497702082009352, + "learning_rate": 1.339745962155613e-05, + "loss": 0.7286, + "step": 524 + }, + { + "epoch": 0.84, + "grad_norm": 0.38766683544166136, + "learning_rate": 1.3139417203123027e-05, + "loss": 0.6497, + "step": 525 + }, + { + "epoch": 0.8416, + "grad_norm": 0.41779772342848726, + "learning_rate": 1.2883709190004955e-05, + "loss": 0.6854, + "step": 526 + }, + { + "epoch": 0.8432, + "grad_norm": 0.5014496809811316, + "learning_rate": 1.263034245443473e-05, + "loss": 0.7739, + "step": 527 + }, + { + "epoch": 0.8448, + "grad_norm": 0.3817297222875174, + "learning_rate": 1.2379323805722576e-05, + "loss": 0.7148, + "step": 528 + }, + { + "epoch": 0.8464, + "grad_norm": 0.45123970026378324, + "learning_rate": 1.2130659990073146e-05, + "loss": 0.6932, + "step": 529 + }, + { + "epoch": 0.848, + "grad_norm": 0.46221502063723513, + "learning_rate": 1.1884357690404158e-05, + "loss": 0.6627, + "step": 530 + }, + { + "epoch": 0.8496, + "grad_norm": 0.4967064651434326, + "learning_rate": 1.1640423526166988e-05, + "loss": 0.7581, + "step": 531 + }, + { + "epoch": 0.8512, + "grad_norm": 0.4087309191651596, + "learning_rate": 1.1398864053168534e-05, + "loss": 0.6832, + "step": 532 + }, + { + "epoch": 0.8528, + "grad_norm": 0.4018611215265461, + "learning_rate": 1.1159685763395111e-05, + "loss": 0.7426, + "step": 533 + }, + { + "epoch": 0.8544, + "grad_norm": 0.4238146860197108, + "learning_rate": 1.0922895084838037e-05, + "loss": 0.7402, + "step": 534 + }, + { + "epoch": 0.856, + "grad_norm": 0.45870761560589585, + "learning_rate": 1.0688498381320855e-05, + "loss": 0.7407, + "step": 535 + }, + { + "epoch": 0.8576, + "grad_norm": 0.4243634380902265, + "learning_rate": 1.045650195232819e-05, + "loss": 0.6786, + "step": 536 + }, + { + "epoch": 0.8592, + "grad_norm": 0.40647965811740283, + "learning_rate": 1.0226912032836611e-05, + "loss": 0.7112, + "step": 537 + }, + { + "epoch": 0.8608, + "grad_norm": 0.5011817292459243, + "learning_rate": 9.999734793146998e-06, + "loss": 0.7392, + "step": 538 + }, + { + "epoch": 0.8624, + "grad_norm": 0.4060613600018548, + "learning_rate": 9.774976338718677e-06, + "loss": 0.6708, + "step": 539 + }, + { + "epoch": 0.864, + "grad_norm": 0.37370989838982127, + "learning_rate": 9.552642710005299e-06, + "loss": 0.6711, + "step": 540 + }, + { + "epoch": 0.8656, + "grad_norm": 0.46309490981922485, + "learning_rate": 9.332739882292752e-06, + "loss": 0.762, + "step": 541 + }, + { + "epoch": 0.8672, + "grad_norm": 0.3494927600003019, + "learning_rate": 9.115273765538202e-06, + "loss": 0.627, + "step": 542 + }, + { + "epoch": 0.8688, + "grad_norm": 0.3818463473849368, + "learning_rate": 8.900250204211514e-06, + "loss": 0.6823, + "step": 543 + }, + { + "epoch": 0.8704, + "grad_norm": 0.41204709750745633, + "learning_rate": 8.687674977138116e-06, + "loss": 0.6753, + "step": 544 + }, + { + "epoch": 0.872, + "grad_norm": 0.4223149820814058, + "learning_rate": 8.47755379734373e-06, + "loss": 0.7162, + "step": 545 + }, + { + "epoch": 0.8736, + "grad_norm": 0.4287433314251421, + "learning_rate": 8.269892311900696e-06, + "loss": 0.6805, + "step": 546 + }, + { + "epoch": 0.8752, + "grad_norm": 0.42808426536818117, + "learning_rate": 8.064696101776358e-06, + "loss": 0.7022, + "step": 547 + }, + { + "epoch": 0.8768, + "grad_norm": 0.40190051662355125, + "learning_rate": 7.861970681683051e-06, + "loss": 0.6966, + "step": 548 + }, + { + "epoch": 0.8784, + "grad_norm": 0.4371617503095811, + "learning_rate": 7.661721499929753e-06, + "loss": 0.7524, + "step": 549 + }, + { + "epoch": 0.88, + "grad_norm": 0.4287837509396157, + "learning_rate": 7.463953938275858e-06, + "loss": 0.6495, + "step": 550 + }, + { + "epoch": 0.8816, + "grad_norm": 0.37595505918327743, + "learning_rate": 7.2686733117863784e-06, + "loss": 0.6713, + "step": 551 + }, + { + "epoch": 0.8832, + "grad_norm": 0.3987376962211029, + "learning_rate": 7.07588486868922e-06, + "loss": 0.5807, + "step": 552 + }, + { + "epoch": 0.8848, + "grad_norm": 0.43144229370725906, + "learning_rate": 6.8855937902340576e-06, + "loss": 0.6696, + "step": 553 + }, + { + "epoch": 0.8864, + "grad_norm": 0.36384872319589756, + "learning_rate": 6.6978051905530855e-06, + "loss": 0.6645, + "step": 554 + }, + { + "epoch": 0.888, + "grad_norm": 0.4038613126225733, + "learning_rate": 6.512524116523633e-06, + "loss": 0.7311, + "step": 555 + }, + { + "epoch": 0.8896, + "grad_norm": 0.4283741889682376, + "learning_rate": 6.329755547632499e-06, + "loss": 0.6764, + "step": 556 + }, + { + "epoch": 0.8912, + "grad_norm": 0.4612154444409386, + "learning_rate": 6.149504395842087e-06, + "loss": 0.8144, + "step": 557 + }, + { + "epoch": 0.8928, + "grad_norm": 0.41363644715873055, + "learning_rate": 5.971775505458444e-06, + "loss": 0.7091, + "step": 558 + }, + { + "epoch": 0.8944, + "grad_norm": 0.42739581915469604, + "learning_rate": 5.7965736530010916e-06, + "loss": 0.73, + "step": 559 + }, + { + "epoch": 0.896, + "grad_norm": 0.3669064171593529, + "learning_rate": 5.623903547074549e-06, + "loss": 0.6378, + "step": 560 + }, + { + "epoch": 0.8976, + "grad_norm": 0.6607804454509449, + "learning_rate": 5.453769828241872e-06, + "loss": 0.6361, + "step": 561 + }, + { + "epoch": 0.8992, + "grad_norm": 0.3642953445748799, + "learning_rate": 5.286177068899989e-06, + "loss": 0.6676, + "step": 562 + }, + { + "epoch": 0.9008, + "grad_norm": 0.4787090231117326, + "learning_rate": 5.121129773156663e-06, + "loss": 0.7741, + "step": 563 + }, + { + "epoch": 0.9024, + "grad_norm": 0.35372153768638664, + "learning_rate": 4.95863237670956e-06, + "loss": 0.6657, + "step": 564 + }, + { + "epoch": 0.904, + "grad_norm": 0.40850515430804185, + "learning_rate": 4.798689246727006e-06, + "loss": 0.682, + "step": 565 + }, + { + "epoch": 0.9056, + "grad_norm": 0.47580526864703004, + "learning_rate": 4.641304681730641e-06, + "loss": 0.6537, + "step": 566 + }, + { + "epoch": 0.9072, + "grad_norm": 0.43774834156062925, + "learning_rate": 4.486482911479839e-06, + "loss": 0.6584, + "step": 567 + }, + { + "epoch": 0.9088, + "grad_norm": 0.4465029012018035, + "learning_rate": 4.3342280968580285e-06, + "loss": 0.8118, + "step": 568 + }, + { + "epoch": 0.9104, + "grad_norm": 0.40382894066508757, + "learning_rate": 4.184544329761009e-06, + "loss": 0.7346, + "step": 569 + }, + { + "epoch": 0.912, + "grad_norm": 0.4004050705875985, + "learning_rate": 4.037435632986786e-06, + "loss": 0.6783, + "step": 570 + }, + { + "epoch": 0.9136, + "grad_norm": 0.46148850023139054, + "learning_rate": 3.892905960127546e-06, + "loss": 0.6339, + "step": 571 + }, + { + "epoch": 0.9152, + "grad_norm": 0.4142316676102528, + "learning_rate": 3.750959195463466e-06, + "loss": 0.7225, + "step": 572 + }, + { + "epoch": 0.9168, + "grad_norm": 0.4259084891522085, + "learning_rate": 3.611599153858214e-06, + "loss": 0.6397, + "step": 573 + }, + { + "epoch": 0.9184, + "grad_norm": 0.42425539806284446, + "learning_rate": 3.4748295806564356e-06, + "loss": 0.6534, + "step": 574 + }, + { + "epoch": 0.92, + "grad_norm": 0.4564821915530866, + "learning_rate": 3.3406541515832003e-06, + "loss": 0.6699, + "step": 575 + }, + { + "epoch": 0.9216, + "grad_norm": 0.4327125181675075, + "learning_rate": 3.209076472645112e-06, + "loss": 0.7108, + "step": 576 + }, + { + "epoch": 0.9232, + "grad_norm": 0.46541524052046224, + "learning_rate": 3.0801000800333877e-06, + "loss": 0.6956, + "step": 577 + }, + { + "epoch": 0.9248, + "grad_norm": 0.4303975211543722, + "learning_rate": 2.9537284400289355e-06, + "loss": 0.7029, + "step": 578 + }, + { + "epoch": 0.9264, + "grad_norm": 0.4677034577629268, + "learning_rate": 2.8299649489090475e-06, + "loss": 0.682, + "step": 579 + }, + { + "epoch": 0.928, + "grad_norm": 0.41970920358434805, + "learning_rate": 2.708812932856253e-06, + "loss": 0.7246, + "step": 580 + }, + { + "epoch": 0.9296, + "grad_norm": 0.44325982926714413, + "learning_rate": 2.590275647868867e-06, + "loss": 0.663, + "step": 581 + }, + { + "epoch": 0.9312, + "grad_norm": 0.543833318950384, + "learning_rate": 2.4743562796734622e-06, + "loss": 0.7259, + "step": 582 + }, + { + "epoch": 0.9328, + "grad_norm": 0.4462626991617039, + "learning_rate": 2.3610579436393e-06, + "loss": 0.673, + "step": 583 + }, + { + "epoch": 0.9344, + "grad_norm": 0.47848876324986994, + "learning_rate": 2.250383684694579e-06, + "loss": 0.7009, + "step": 584 + }, + { + "epoch": 0.936, + "grad_norm": 0.4330236099546544, + "learning_rate": 2.1423364772445887e-06, + "loss": 0.6902, + "step": 585 + }, + { + "epoch": 0.9376, + "grad_norm": 0.42637120113247723, + "learning_rate": 2.036919225091827e-06, + "loss": 0.6401, + "step": 586 + }, + { + "epoch": 0.9392, + "grad_norm": 0.3973135444199052, + "learning_rate": 1.9341347613579087e-06, + "loss": 0.6778, + "step": 587 + }, + { + "epoch": 0.9408, + "grad_norm": 0.6770882959135993, + "learning_rate": 1.8339858484073935e-06, + "loss": 0.7072, + "step": 588 + }, + { + "epoch": 0.9424, + "grad_norm": 0.4007464012163911, + "learning_rate": 1.7364751777736332e-06, + "loss": 0.6781, + "step": 589 + }, + { + "epoch": 0.944, + "grad_norm": 0.43284323516860274, + "learning_rate": 1.6416053700863964e-06, + "loss": 0.7363, + "step": 590 + }, + { + "epoch": 0.9456, + "grad_norm": 0.49863862119221625, + "learning_rate": 1.5493789750014031e-06, + "loss": 0.7979, + "step": 591 + }, + { + "epoch": 0.9472, + "grad_norm": 0.5242826580870628, + "learning_rate": 1.459798471131868e-06, + "loss": 0.7447, + "step": 592 + }, + { + "epoch": 0.9488, + "grad_norm": 0.5720376546393448, + "learning_rate": 1.3728662659818204e-06, + "loss": 0.7279, + "step": 593 + }, + { + "epoch": 0.9504, + "grad_norm": 0.39947577817568036, + "learning_rate": 1.2885846958814673e-06, + "loss": 0.6182, + "step": 594 + }, + { + "epoch": 0.952, + "grad_norm": 0.4661137521776312, + "learning_rate": 1.2069560259243328e-06, + "loss": 0.7281, + "step": 595 + }, + { + "epoch": 0.9536, + "grad_norm": 0.37893881694450166, + "learning_rate": 1.1279824499064396e-06, + "loss": 0.6584, + "step": 596 + }, + { + "epoch": 0.9552, + "grad_norm": 0.4611650389520086, + "learning_rate": 1.0516660902673448e-06, + "loss": 0.6902, + "step": 597 + }, + { + "epoch": 0.9568, + "grad_norm": 0.387985776119187, + "learning_rate": 9.780089980330642e-07, + "loss": 0.6598, + "step": 598 + }, + { + "epoch": 0.9584, + "grad_norm": 0.48483349583891894, + "learning_rate": 9.070131527609604e-07, + "loss": 0.7276, + "step": 599 + }, + { + "epoch": 0.96, + "grad_norm": 0.4412989632525287, + "learning_rate": 8.386804624865851e-07, + "loss": 0.6627, + "step": 600 + }, + { + "epoch": 0.9616, + "grad_norm": 0.41484413216610105, + "learning_rate": 7.730127636723539e-07, + "loss": 0.6308, + "step": 601 + }, + { + "epoch": 0.9632, + "grad_norm": 0.4690159050347405, + "learning_rate": 7.100118211581852e-07, + "loss": 0.642, + "step": 602 + }, + { + "epoch": 0.9648, + "grad_norm": 0.4245437890533855, + "learning_rate": 6.496793281141056e-07, + "loss": 0.7825, + "step": 603 + }, + { + "epoch": 0.9664, + "grad_norm": 0.3831070802690724, + "learning_rate": 5.920169059947411e-07, + "loss": 0.666, + "step": 604 + }, + { + "epoch": 0.968, + "grad_norm": 0.41107280846289773, + "learning_rate": 5.370261044956971e-07, + "loss": 0.6961, + "step": 605 + }, + { + "epoch": 0.9696, + "grad_norm": 0.6288518414789465, + "learning_rate": 4.847084015119574e-07, + "loss": 0.7746, + "step": 606 + }, + { + "epoch": 0.9712, + "grad_norm": 0.41613371819591594, + "learning_rate": 4.3506520309813947e-07, + "loss": 0.6798, + "step": 607 + }, + { + "epoch": 0.9728, + "grad_norm": 0.4064966673354674, + "learning_rate": 3.8809784343072366e-07, + "loss": 0.69, + "step": 608 + }, + { + "epoch": 0.9744, + "grad_norm": 0.468932838961255, + "learning_rate": 3.4380758477219333e-07, + "loss": 0.7338, + "step": 609 + }, + { + "epoch": 0.976, + "grad_norm": 0.5218631986752835, + "learning_rate": 3.0219561743707326e-07, + "loss": 0.7963, + "step": 610 + }, + { + "epoch": 0.9776, + "grad_norm": 0.3781245114300269, + "learning_rate": 2.6326305976001055e-07, + "loss": 0.6981, + "step": 611 + }, + { + "epoch": 0.9792, + "grad_norm": 0.41240953882994247, + "learning_rate": 2.2701095806565432e-07, + "loss": 0.6917, + "step": 612 + }, + { + "epoch": 0.9808, + "grad_norm": 0.43377123595954153, + "learning_rate": 1.9344028664056713e-07, + "loss": 0.6831, + "step": 613 + }, + { + "epoch": 0.9824, + "grad_norm": 0.472989623336461, + "learning_rate": 1.6255194770704586e-07, + "loss": 0.7305, + "step": 614 + }, + { + "epoch": 0.984, + "grad_norm": 0.3338845198810002, + "learning_rate": 1.3434677139885222e-07, + "loss": 0.6196, + "step": 615 + }, + { + "epoch": 0.9856, + "grad_norm": 0.43989366995203444, + "learning_rate": 1.0882551573891953e-07, + "loss": 0.6993, + "step": 616 + }, + { + "epoch": 0.9872, + "grad_norm": 0.5190236187058785, + "learning_rate": 8.598886661895788e-08, + "loss": 0.6933, + "step": 617 + }, + { + "epoch": 0.9888, + "grad_norm": 0.5606180991324131, + "learning_rate": 6.583743778106887e-08, + "loss": 0.7477, + "step": 618 + }, + { + "epoch": 0.9904, + "grad_norm": 0.41189275650330803, + "learning_rate": 4.837177080119215e-08, + "loss": 0.7116, + "step": 619 + }, + { + "epoch": 0.992, + "grad_norm": 0.3816408908934506, + "learning_rate": 3.359233507459481e-08, + "loss": 0.6426, + "step": 620 + }, + { + "epoch": 0.9936, + "grad_norm": 0.43072600762165386, + "learning_rate": 2.1499527803214846e-08, + "loss": 0.6697, + "step": 621 + }, + { + "epoch": 0.9952, + "grad_norm": 0.4098582536733861, + "learning_rate": 1.209367398504746e-08, + "loss": 0.6662, + "step": 622 + }, + { + "epoch": 0.9968, + "grad_norm": 0.4947135664257666, + "learning_rate": 5.375026405352035e-09, + "loss": 0.7612, + "step": 623 + }, + { + "epoch": 0.9984, + "grad_norm": 0.5299959699798045, + "learning_rate": 1.3437656298687097e-09, + "loss": 0.8105, + "step": 624 + }, + { + "epoch": 1.0, + "grad_norm": 0.46021437422339323, + "learning_rate": 0.0, + "loss": 0.7593, + "step": 625 + }, + { + "epoch": 1.0, + "step": 625, + "total_flos": 543415710547968.0, + "train_loss": 0.7692152653694153, + "train_runtime": 9780.5025, + "train_samples_per_second": 1.022, + "train_steps_per_second": 0.064 + } + ], + "logging_steps": 1.0, + "max_steps": 625, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 543415710547968.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_1_epochs_1_GA_4_lora/README.md b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_1_epochs_1_GA_4_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_1_epochs_1_GA_4_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_1_epochs_1_GA_4_lora/adapter_config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_1_epochs_1_GA_4_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0766b2d281496fcf3cff71bd3176c25625b3073e --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_1_epochs_1_GA_4_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "k_proj", + "up_proj", + "o_proj", + "gate_proj", + "down_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_1_epochs_1_GA_4_lora/adapter_model.safetensors b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_1_epochs_1_GA_4_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4f2ad89cb9ad3b1ffdd0fd767694f3939c7da52b --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_1_epochs_1_GA_4_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2aaac0136bc40e13ef336040fb4c6ac3a9c34aa7bdb2a455fa6188347d62fcd +size 671150064 diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_1_epochs_1_GA_4_lora/config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_1_epochs_1_GA_4_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_1_epochs_1_GA_4_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_1_epochs_1_GA_4_lora/non_lora_trainables.bin b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_1_epochs_1_GA_4_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..4c22fb562e3f3931dbbb04339c145ba33b13141f --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_1_epochs_1_GA_4_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f417253e2c0e04301233f7852cd6cd14dd424032956d69b6dd2a3ff9757fbbc8 +size 918507402 diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_1_epochs_1_GA_4_lora/trainer_state.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_1_epochs_1_GA_4_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ab6f50a3a2408f37c1998134245e2f35c15a4625 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_1_epochs_1_GA_4_lora/trainer_state.json @@ -0,0 +1,2226 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9984, + "eval_steps": 500, + "global_step": 312, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0032, + "grad_norm": 0.9163293866966448, + "learning_rate": 2e-05, + "loss": 1.4166, + "step": 1 + }, + { + "epoch": 0.0064, + "grad_norm": 0.9209812937726838, + "learning_rate": 4e-05, + "loss": 1.4921, + "step": 2 + }, + { + "epoch": 0.0096, + "grad_norm": 0.7513419496799603, + "learning_rate": 6e-05, + "loss": 1.3631, + "step": 3 + }, + { + "epoch": 0.0128, + "grad_norm": 0.754331327849121, + "learning_rate": 8e-05, + "loss": 1.3313, + "step": 4 + }, + { + "epoch": 0.016, + "grad_norm": 0.8161092277218196, + "learning_rate": 0.0001, + "loss": 1.1536, + "step": 5 + }, + { + "epoch": 0.0192, + "grad_norm": 0.7970259355167723, + "learning_rate": 0.00012, + "loss": 1.0271, + "step": 6 + }, + { + "epoch": 0.0224, + "grad_norm": 0.8143357287470446, + "learning_rate": 0.00014, + "loss": 1.0623, + "step": 7 + }, + { + "epoch": 0.0256, + "grad_norm": 0.7430977217863324, + "learning_rate": 0.00016, + "loss": 0.9672, + "step": 8 + }, + { + "epoch": 0.0288, + "grad_norm": 0.5186908227716984, + "learning_rate": 0.00018, + "loss": 0.9226, + "step": 9 + }, + { + "epoch": 0.032, + "grad_norm": 0.42489469182136214, + "learning_rate": 0.0002, + "loss": 0.9687, + "step": 10 + }, + { + "epoch": 0.0352, + "grad_norm": 0.5091331543962424, + "learning_rate": 0.00019999458931878073, + "loss": 0.9466, + "step": 11 + }, + { + "epoch": 0.0384, + "grad_norm": 0.463799838358979, + "learning_rate": 0.0001999783578606323, + "loss": 0.9561, + "step": 12 + }, + { + "epoch": 0.0416, + "grad_norm": 0.43868713681710897, + "learning_rate": 0.00019995130738201966, + "loss": 0.9048, + "step": 13 + }, + { + "epoch": 0.0448, + "grad_norm": 0.5255668414303949, + "learning_rate": 0.0001999134408101731, + "loss": 0.9016, + "step": 14 + }, + { + "epoch": 0.048, + "grad_norm": 0.4617659004309836, + "learning_rate": 0.00019986476224277165, + "loss": 0.9447, + "step": 15 + }, + { + "epoch": 0.0512, + "grad_norm": 0.4896592600229671, + "learning_rate": 0.00019980527694749952, + "loss": 0.9412, + "step": 16 + }, + { + "epoch": 0.0544, + "grad_norm": 0.4416036841873187, + "learning_rate": 0.00019973499136147606, + "loss": 0.8995, + "step": 17 + }, + { + "epoch": 0.0576, + "grad_norm": 0.37320680671820966, + "learning_rate": 0.0001996539130905593, + "loss": 0.896, + "step": 18 + }, + { + "epoch": 0.0608, + "grad_norm": 0.3781335739253041, + "learning_rate": 0.0001995620509085228, + "loss": 0.8611, + "step": 19 + }, + { + "epoch": 0.064, + "grad_norm": 0.39217880082155243, + "learning_rate": 0.00019945941475610623, + "loss": 0.9179, + "step": 20 + }, + { + "epoch": 0.0672, + "grad_norm": 0.3875066650438494, + "learning_rate": 0.0001993460157399396, + "loss": 0.8952, + "step": 21 + }, + { + "epoch": 0.0704, + "grad_norm": 0.46226087229767915, + "learning_rate": 0.0001992218661313415, + "loss": 0.8898, + "step": 22 + }, + { + "epoch": 0.0736, + "grad_norm": 0.38249225776490886, + "learning_rate": 0.00019908697936499103, + "loss": 0.8179, + "step": 23 + }, + { + "epoch": 0.0768, + "grad_norm": 0.42886624175871285, + "learning_rate": 0.00019894137003747403, + "loss": 0.8406, + "step": 24 + }, + { + "epoch": 0.08, + "grad_norm": 0.4533228894108049, + "learning_rate": 0.00019878505390570362, + "loss": 0.9388, + "step": 25 + }, + { + "epoch": 0.0832, + "grad_norm": 0.33852756379078713, + "learning_rate": 0.00019861804788521493, + "loss": 0.8298, + "step": 26 + }, + { + "epoch": 0.0864, + "grad_norm": 0.3778536242602346, + "learning_rate": 0.00019844037004833473, + "loss": 0.8718, + "step": 27 + }, + { + "epoch": 0.0896, + "grad_norm": 0.35953660416723576, + "learning_rate": 0.00019825203962222572, + "loss": 0.8864, + "step": 28 + }, + { + "epoch": 0.0928, + "grad_norm": 0.3143937264114336, + "learning_rate": 0.0001980530769868059, + "loss": 0.8184, + "step": 29 + }, + { + "epoch": 0.096, + "grad_norm": 0.3788967311933849, + "learning_rate": 0.00019784350367254322, + "loss": 0.8882, + "step": 30 + }, + { + "epoch": 0.0992, + "grad_norm": 0.36213335832310506, + "learning_rate": 0.0001976233423581255, + "loss": 0.8855, + "step": 31 + }, + { + "epoch": 0.1024, + "grad_norm": 0.37933788919137773, + "learning_rate": 0.0001973926168680066, + "loss": 0.8466, + "step": 32 + }, + { + "epoch": 0.1056, + "grad_norm": 0.3546289025572818, + "learning_rate": 0.00019715135216982798, + "loss": 0.8183, + "step": 33 + }, + { + "epoch": 0.1088, + "grad_norm": 0.37374088665594296, + "learning_rate": 0.0001968995743717171, + "loss": 0.8501, + "step": 34 + }, + { + "epoch": 0.112, + "grad_norm": 0.4253406504416569, + "learning_rate": 0.00019663731071946206, + "loss": 0.9169, + "step": 35 + }, + { + "epoch": 0.1152, + "grad_norm": 0.40895647289121645, + "learning_rate": 0.00019636458959356316, + "loss": 0.879, + "step": 36 + }, + { + "epoch": 0.1184, + "grad_norm": 0.3165715042361572, + "learning_rate": 0.0001960814405061619, + "loss": 0.8024, + "step": 37 + }, + { + "epoch": 0.1216, + "grad_norm": 0.32960741200530574, + "learning_rate": 0.00019578789409784727, + "loss": 0.8001, + "step": 38 + }, + { + "epoch": 0.1248, + "grad_norm": 0.35716428328685657, + "learning_rate": 0.00019548398213434007, + "loss": 0.7774, + "step": 39 + }, + { + "epoch": 0.128, + "grad_norm": 0.3485123728123483, + "learning_rate": 0.00019516973750305532, + "loss": 0.7808, + "step": 40 + }, + { + "epoch": 0.1312, + "grad_norm": 0.34264025569520523, + "learning_rate": 0.00019484519420954354, + "loss": 0.7901, + "step": 41 + }, + { + "epoch": 0.1344, + "grad_norm": 0.3338318387200505, + "learning_rate": 0.00019451038737381077, + "loss": 0.8176, + "step": 42 + }, + { + "epoch": 0.1376, + "grad_norm": 0.30934157025197984, + "learning_rate": 0.00019416535322651818, + "loss": 0.7435, + "step": 43 + }, + { + "epoch": 0.1408, + "grad_norm": 0.36528602365420015, + "learning_rate": 0.00019381012910506146, + "loss": 0.8768, + "step": 44 + }, + { + "epoch": 0.144, + "grad_norm": 0.33795181946399744, + "learning_rate": 0.00019344475344953012, + "loss": 0.7496, + "step": 45 + }, + { + "epoch": 0.1472, + "grad_norm": 0.31841058178664267, + "learning_rate": 0.00019306926579854821, + "loss": 0.8061, + "step": 46 + }, + { + "epoch": 0.1504, + "grad_norm": 0.3588333916772071, + "learning_rate": 0.00019268370678499533, + "loss": 0.7551, + "step": 47 + }, + { + "epoch": 0.1536, + "grad_norm": 0.32979363052773386, + "learning_rate": 0.0001922881181316097, + "loss": 0.7937, + "step": 48 + }, + { + "epoch": 0.1568, + "grad_norm": 0.3663299223535037, + "learning_rate": 0.00019188254264647337, + "loss": 0.8081, + "step": 49 + }, + { + "epoch": 0.16, + "grad_norm": 0.365420266603761, + "learning_rate": 0.0001914670242183795, + "loss": 0.8451, + "step": 50 + }, + { + "epoch": 0.1632, + "grad_norm": 0.36331027934440596, + "learning_rate": 0.0001910416078120832, + "loss": 0.8703, + "step": 51 + }, + { + "epoch": 0.1664, + "grad_norm": 0.33315022571857567, + "learning_rate": 0.0001906063394634356, + "loss": 0.7579, + "step": 52 + }, + { + "epoch": 0.1696, + "grad_norm": 0.3701346588086481, + "learning_rate": 0.00019016126627440237, + "loss": 0.8121, + "step": 53 + }, + { + "epoch": 0.1728, + "grad_norm": 0.33621803428395475, + "learning_rate": 0.00018970643640796642, + "loss": 0.8605, + "step": 54 + }, + { + "epoch": 0.176, + "grad_norm": 0.3270542438452731, + "learning_rate": 0.000189241899082916, + "loss": 0.7837, + "step": 55 + }, + { + "epoch": 0.1792, + "grad_norm": 0.3276369008956474, + "learning_rate": 0.00018876770456851877, + "loss": 0.7942, + "step": 56 + }, + { + "epoch": 0.1824, + "grad_norm": 0.34788580077673004, + "learning_rate": 0.0001882839041790818, + "loss": 0.8199, + "step": 57 + }, + { + "epoch": 0.1856, + "grad_norm": 0.3422893547509533, + "learning_rate": 0.00018779055026839868, + "loss": 0.7744, + "step": 58 + }, + { + "epoch": 0.1888, + "grad_norm": 0.38231898220230265, + "learning_rate": 0.00018728769622408423, + "loss": 0.7988, + "step": 59 + }, + { + "epoch": 0.192, + "grad_norm": 0.32894744574213003, + "learning_rate": 0.00018677539646179707, + "loss": 0.8069, + "step": 60 + }, + { + "epoch": 0.1952, + "grad_norm": 0.3238013073265371, + "learning_rate": 0.00018625370641935129, + "loss": 0.7645, + "step": 61 + }, + { + "epoch": 0.1984, + "grad_norm": 0.3642101864143703, + "learning_rate": 0.00018572268255071718, + "loss": 0.8245, + "step": 62 + }, + { + "epoch": 0.2016, + "grad_norm": 0.34996224525739966, + "learning_rate": 0.00018518238231991218, + "loss": 0.8031, + "step": 63 + }, + { + "epoch": 0.2048, + "grad_norm": 0.3792361629658678, + "learning_rate": 0.00018463286419478255, + "loss": 0.8767, + "step": 64 + }, + { + "epoch": 0.208, + "grad_norm": 0.3380215318560921, + "learning_rate": 0.00018407418764067627, + "loss": 0.825, + "step": 65 + }, + { + "epoch": 0.2112, + "grad_norm": 0.3240107838253595, + "learning_rate": 0.00018350641311400812, + "loss": 0.7525, + "step": 66 + }, + { + "epoch": 0.2144, + "grad_norm": 0.3641652055005377, + "learning_rate": 0.0001829296020557174, + "loss": 0.8785, + "step": 67 + }, + { + "epoch": 0.2176, + "grad_norm": 0.32116798048624046, + "learning_rate": 0.00018234381688461942, + "loss": 0.7986, + "step": 68 + }, + { + "epoch": 0.2208, + "grad_norm": 0.350351230087757, + "learning_rate": 0.0001817491209906506, + "loss": 0.8094, + "step": 69 + }, + { + "epoch": 0.224, + "grad_norm": 0.3343616667111424, + "learning_rate": 0.00018114557872800905, + "loss": 0.8007, + "step": 70 + }, + { + "epoch": 0.2272, + "grad_norm": 0.34400861184293763, + "learning_rate": 0.00018053325540819045, + "loss": 0.8058, + "step": 71 + }, + { + "epoch": 0.2304, + "grad_norm": 0.3217629099202676, + "learning_rate": 0.0001799122172929206, + "loss": 0.8036, + "step": 72 + }, + { + "epoch": 0.2336, + "grad_norm": 0.3448799506680468, + "learning_rate": 0.00017928253158698473, + "loss": 0.8184, + "step": 73 + }, + { + "epoch": 0.2368, + "grad_norm": 0.33561964969079927, + "learning_rate": 0.0001786442664309554, + "loss": 0.7817, + "step": 74 + }, + { + "epoch": 0.24, + "grad_norm": 0.3392407704651979, + "learning_rate": 0.0001779974908938184, + "loss": 0.7715, + "step": 75 + }, + { + "epoch": 0.2432, + "grad_norm": 0.29657870397100855, + "learning_rate": 0.0001773422749654988, + "loss": 0.709, + "step": 76 + }, + { + "epoch": 0.2464, + "grad_norm": 0.33813418600024964, + "learning_rate": 0.00017667868954928694, + "loss": 0.7871, + "step": 77 + }, + { + "epoch": 0.2496, + "grad_norm": 0.3416826898530567, + "learning_rate": 0.00017600680645416583, + "loss": 0.7969, + "step": 78 + }, + { + "epoch": 0.2528, + "grad_norm": 0.2981910538689299, + "learning_rate": 0.00017532669838704035, + "loss": 0.7411, + "step": 79 + }, + { + "epoch": 0.256, + "grad_norm": 0.4029774190488523, + "learning_rate": 0.00017463843894486937, + "loss": 0.7657, + "step": 80 + }, + { + "epoch": 0.2592, + "grad_norm": 0.3688934607028786, + "learning_rate": 0.0001739421026067017, + "loss": 0.8689, + "step": 81 + }, + { + "epoch": 0.2624, + "grad_norm": 0.33197282771751185, + "learning_rate": 0.00017323776472561627, + "loss": 0.8135, + "step": 82 + }, + { + "epoch": 0.2656, + "grad_norm": 0.3328457603624298, + "learning_rate": 0.00017252550152056795, + "loss": 0.7945, + "step": 83 + }, + { + "epoch": 0.2688, + "grad_norm": 0.3597081373185118, + "learning_rate": 0.0001718053900681397, + "loss": 0.8227, + "step": 84 + }, + { + "epoch": 0.272, + "grad_norm": 0.3624062974101868, + "learning_rate": 0.00017107750829420176, + "loss": 0.8193, + "step": 85 + }, + { + "epoch": 0.2752, + "grad_norm": 0.319880066064466, + "learning_rate": 0.00017034193496547902, + "loss": 0.7496, + "step": 86 + }, + { + "epoch": 0.2784, + "grad_norm": 0.36856807057057184, + "learning_rate": 0.00016959874968102735, + "loss": 0.7915, + "step": 87 + }, + { + "epoch": 0.2816, + "grad_norm": 0.38038518435759694, + "learning_rate": 0.00016884803286362, + "loss": 0.7681, + "step": 88 + }, + { + "epoch": 0.2848, + "grad_norm": 0.34489430198243376, + "learning_rate": 0.00016808986575104465, + "loss": 0.7606, + "step": 89 + }, + { + "epoch": 0.288, + "grad_norm": 0.3260103390407813, + "learning_rate": 0.00016732433038731242, + "loss": 0.8265, + "step": 90 + }, + { + "epoch": 0.2912, + "grad_norm": 0.3654157265599615, + "learning_rate": 0.0001665515096137797, + "loss": 0.7735, + "step": 91 + }, + { + "epoch": 0.2944, + "grad_norm": 0.33510436709742036, + "learning_rate": 0.00016577148706018328, + "loss": 0.7758, + "step": 92 + }, + { + "epoch": 0.2976, + "grad_norm": 0.3373925001019564, + "learning_rate": 0.00016498434713559088, + "loss": 0.7878, + "step": 93 + }, + { + "epoch": 0.3008, + "grad_norm": 0.30010836206908226, + "learning_rate": 0.00016419017501926656, + "loss": 0.7726, + "step": 94 + }, + { + "epoch": 0.304, + "grad_norm": 0.38889888718744725, + "learning_rate": 0.0001633890566514535, + "loss": 0.7793, + "step": 95 + }, + { + "epoch": 0.3072, + "grad_norm": 0.3711177679722351, + "learning_rate": 0.00016258107872407375, + "loss": 0.779, + "step": 96 + }, + { + "epoch": 0.3104, + "grad_norm": 0.3425210907474298, + "learning_rate": 0.0001617663286713474, + "loss": 0.7696, + "step": 97 + }, + { + "epoch": 0.3136, + "grad_norm": 0.33997751764005274, + "learning_rate": 0.00016094489466033043, + "loss": 0.7488, + "step": 98 + }, + { + "epoch": 0.3168, + "grad_norm": 0.32141948623233824, + "learning_rate": 0.00016011686558137448, + "loss": 0.7392, + "step": 99 + }, + { + "epoch": 0.32, + "grad_norm": 0.33949663819952836, + "learning_rate": 0.0001592823310385073, + "loss": 0.728, + "step": 100 + }, + { + "epoch": 0.3232, + "grad_norm": 0.3262082210447533, + "learning_rate": 0.0001584413813397364, + "loss": 0.7563, + "step": 101 + }, + { + "epoch": 0.3264, + "grad_norm": 0.3034623801146321, + "learning_rate": 0.00015759410748727662, + "loss": 0.7026, + "step": 102 + }, + { + "epoch": 0.3296, + "grad_norm": 0.3271828380287641, + "learning_rate": 0.00015674060116770236, + "loss": 0.7985, + "step": 103 + }, + { + "epoch": 0.3328, + "grad_norm": 0.29986047797674475, + "learning_rate": 0.00015588095474202595, + "loss": 0.7154, + "step": 104 + }, + { + "epoch": 0.336, + "grad_norm": 0.35317859176949684, + "learning_rate": 0.00015501526123570277, + "loss": 0.7909, + "step": 105 + }, + { + "epoch": 0.3392, + "grad_norm": 0.32786176013369694, + "learning_rate": 0.00015414361432856475, + "loss": 0.7529, + "step": 106 + }, + { + "epoch": 0.3424, + "grad_norm": 0.30299131489469927, + "learning_rate": 0.0001532661083446829, + "loss": 0.76, + "step": 107 + }, + { + "epoch": 0.3456, + "grad_norm": 0.3535296640756614, + "learning_rate": 0.00015238283824216015, + "loss": 0.774, + "step": 108 + }, + { + "epoch": 0.3488, + "grad_norm": 0.40807068025912596, + "learning_rate": 0.00015149389960285558, + "loss": 0.7698, + "step": 109 + }, + { + "epoch": 0.352, + "grad_norm": 0.37071784614450315, + "learning_rate": 0.00015059938862204127, + "loss": 0.7906, + "step": 110 + }, + { + "epoch": 0.3552, + "grad_norm": 0.3275874632790161, + "learning_rate": 0.00014969940209799248, + "loss": 0.7984, + "step": 111 + }, + { + "epoch": 0.3584, + "grad_norm": 0.2914969550625818, + "learning_rate": 0.00014879403742151283, + "loss": 0.7286, + "step": 112 + }, + { + "epoch": 0.3616, + "grad_norm": 0.32725028471181783, + "learning_rate": 0.00014788339256539544, + "loss": 0.769, + "step": 113 + }, + { + "epoch": 0.3648, + "grad_norm": 0.3727566498615632, + "learning_rate": 0.0001469675660738206, + "loss": 0.7692, + "step": 114 + }, + { + "epoch": 0.368, + "grad_norm": 0.3370225841889968, + "learning_rate": 0.00014604665705169237, + "loss": 0.7556, + "step": 115 + }, + { + "epoch": 0.3712, + "grad_norm": 0.2842272226710898, + "learning_rate": 0.00014512076515391375, + "loss": 0.7451, + "step": 116 + }, + { + "epoch": 0.3744, + "grad_norm": 0.36320902592755067, + "learning_rate": 0.00014418999057460276, + "loss": 0.7637, + "step": 117 + }, + { + "epoch": 0.3776, + "grad_norm": 0.30249909231708233, + "learning_rate": 0.0001432544340362501, + "loss": 0.733, + "step": 118 + }, + { + "epoch": 0.3808, + "grad_norm": 0.31992510603267815, + "learning_rate": 0.00014231419677881966, + "loss": 0.6979, + "step": 119 + }, + { + "epoch": 0.384, + "grad_norm": 0.35107423715677044, + "learning_rate": 0.00014136938054879283, + "loss": 0.7993, + "step": 120 + }, + { + "epoch": 0.3872, + "grad_norm": 0.33625336785734383, + "learning_rate": 0.00014042008758815818, + "loss": 0.7927, + "step": 121 + }, + { + "epoch": 0.3904, + "grad_norm": 0.3602866370778261, + "learning_rate": 0.00013946642062334766, + "loss": 0.7855, + "step": 122 + }, + { + "epoch": 0.3936, + "grad_norm": 0.3074605944536328, + "learning_rate": 0.00013850848285411994, + "loss": 0.7248, + "step": 123 + }, + { + "epoch": 0.3968, + "grad_norm": 0.3057958902745291, + "learning_rate": 0.000137546377942393, + "loss": 0.7032, + "step": 124 + }, + { + "epoch": 0.4, + "grad_norm": 0.38999508557023527, + "learning_rate": 0.00013658021000102636, + "loss": 0.8462, + "step": 125 + }, + { + "epoch": 0.4032, + "grad_norm": 0.3817235639029034, + "learning_rate": 0.00013561008358255468, + "loss": 0.801, + "step": 126 + }, + { + "epoch": 0.4064, + "grad_norm": 0.33494104617840476, + "learning_rate": 0.00013463610366787392, + "loss": 0.8228, + "step": 127 + }, + { + "epoch": 0.4096, + "grad_norm": 0.3248689476374051, + "learning_rate": 0.00013365837565488064, + "loss": 0.767, + "step": 128 + }, + { + "epoch": 0.4128, + "grad_norm": 0.3317346185237444, + "learning_rate": 0.0001326770053470668, + "loss": 0.7687, + "step": 129 + }, + { + "epoch": 0.416, + "grad_norm": 0.3126217755965711, + "learning_rate": 0.0001316920989420703, + "loss": 0.7345, + "step": 130 + }, + { + "epoch": 0.4192, + "grad_norm": 0.3342062499387303, + "learning_rate": 0.00013070376302018287, + "loss": 0.7589, + "step": 131 + }, + { + "epoch": 0.4224, + "grad_norm": 0.3295916994383612, + "learning_rate": 0.00012971210453281674, + "loss": 0.748, + "step": 132 + }, + { + "epoch": 0.4256, + "grad_norm": 0.3130205270632654, + "learning_rate": 0.000128717230790931, + "loss": 0.7199, + "step": 133 + }, + { + "epoch": 0.4288, + "grad_norm": 0.3398177033860641, + "learning_rate": 0.00012771924945341906, + "loss": 0.731, + "step": 134 + }, + { + "epoch": 0.432, + "grad_norm": 0.3155687808886285, + "learning_rate": 0.00012671826851545851, + "loss": 0.7867, + "step": 135 + }, + { + "epoch": 0.4352, + "grad_norm": 0.34238595877766037, + "learning_rate": 0.0001257143962968246, + "loss": 0.7422, + "step": 136 + }, + { + "epoch": 0.4384, + "grad_norm": 0.30576992227691874, + "learning_rate": 0.00012470774143016853, + "loss": 0.7544, + "step": 137 + }, + { + "epoch": 0.4416, + "grad_norm": 0.3076010730264354, + "learning_rate": 0.00012369841284926188, + "loss": 0.7642, + "step": 138 + }, + { + "epoch": 0.4448, + "grad_norm": 0.38408408830985924, + "learning_rate": 0.00012268651977720866, + "loss": 0.7807, + "step": 139 + }, + { + "epoch": 0.448, + "grad_norm": 0.3049755918909417, + "learning_rate": 0.00012167217171462566, + "loss": 0.6989, + "step": 140 + }, + { + "epoch": 0.4512, + "grad_norm": 0.292438543795018, + "learning_rate": 0.0001206554784277931, + "loss": 0.7232, + "step": 141 + }, + { + "epoch": 0.4544, + "grad_norm": 0.33263754585380034, + "learning_rate": 0.00011963654993677645, + "loss": 0.7598, + "step": 142 + }, + { + "epoch": 0.4576, + "grad_norm": 0.28143421334423013, + "learning_rate": 0.00011861549650352069, + "loss": 0.7277, + "step": 143 + }, + { + "epoch": 0.4608, + "grad_norm": 0.30315453345237825, + "learning_rate": 0.00011759242861991855, + "loss": 0.7473, + "step": 144 + }, + { + "epoch": 0.464, + "grad_norm": 0.30219790685112, + "learning_rate": 0.00011656745699585371, + "loss": 0.7636, + "step": 145 + }, + { + "epoch": 0.4672, + "grad_norm": 0.322073440323831, + "learning_rate": 0.00011554069254722051, + "loss": 0.7898, + "step": 146 + }, + { + "epoch": 0.4704, + "grad_norm": 0.37521946130708883, + "learning_rate": 0.00011451224638392129, + "loss": 0.7455, + "step": 147 + }, + { + "epoch": 0.4736, + "grad_norm": 0.3256047252143069, + "learning_rate": 0.00011348222979784289, + "loss": 0.7092, + "step": 148 + }, + { + "epoch": 0.4768, + "grad_norm": 0.4779417757830283, + "learning_rate": 0.00011245075425081328, + "loss": 0.7832, + "step": 149 + }, + { + "epoch": 0.48, + "grad_norm": 0.3596860434953128, + "learning_rate": 0.00011141793136253986, + "loss": 0.7472, + "step": 150 + }, + { + "epoch": 0.4832, + "grad_norm": 0.32962255387490563, + "learning_rate": 0.0001103838728985307, + "loss": 0.8, + "step": 151 + }, + { + "epoch": 0.4864, + "grad_norm": 0.3611211399160521, + "learning_rate": 0.000109348690758, + "loss": 0.7491, + "step": 152 + }, + { + "epoch": 0.4896, + "grad_norm": 0.3192671808382749, + "learning_rate": 0.00010831249696175918, + "loss": 0.695, + "step": 153 + }, + { + "epoch": 0.4928, + "grad_norm": 0.31949935931855866, + "learning_rate": 0.0001072754036400944, + "loss": 0.7193, + "step": 154 + }, + { + "epoch": 0.496, + "grad_norm": 0.31110700730922364, + "learning_rate": 0.00010623752302063283, + "loss": 0.7104, + "step": 155 + }, + { + "epoch": 0.4992, + "grad_norm": 0.41730556147696163, + "learning_rate": 0.00010519896741619803, + "loss": 0.7666, + "step": 156 + }, + { + "epoch": 0.5024, + "grad_norm": 0.3357401697744798, + "learning_rate": 0.00010415984921265609, + "loss": 0.764, + "step": 157 + }, + { + "epoch": 0.5056, + "grad_norm": 0.2924060557957266, + "learning_rate": 0.00010312028085675391, + "loss": 0.6929, + "step": 158 + }, + { + "epoch": 0.5088, + "grad_norm": 0.3193691092922649, + "learning_rate": 0.00010208037484395114, + "loss": 0.7175, + "step": 159 + }, + { + "epoch": 0.512, + "grad_norm": 0.30621879833675425, + "learning_rate": 0.00010104024370624644, + "loss": 0.7382, + "step": 160 + }, + { + "epoch": 0.5152, + "grad_norm": 0.32699795272360216, + "learning_rate": 0.0001, + "loss": 0.7488, + "step": 161 + }, + { + "epoch": 0.5184, + "grad_norm": 0.2987335478288324, + "learning_rate": 9.895975629375359e-05, + "loss": 0.7109, + "step": 162 + }, + { + "epoch": 0.5216, + "grad_norm": 0.3392541280977018, + "learning_rate": 9.791962515604887e-05, + "loss": 0.813, + "step": 163 + }, + { + "epoch": 0.5248, + "grad_norm": 0.36226136393700703, + "learning_rate": 9.687971914324607e-05, + "loss": 0.6547, + "step": 164 + }, + { + "epoch": 0.528, + "grad_norm": 0.3002445644070877, + "learning_rate": 9.584015078734395e-05, + "loss": 0.7063, + "step": 165 + }, + { + "epoch": 0.5312, + "grad_norm": 0.3299914054053417, + "learning_rate": 9.480103258380198e-05, + "loss": 0.7538, + "step": 166 + }, + { + "epoch": 0.5344, + "grad_norm": 0.3096490280262253, + "learning_rate": 9.376247697936719e-05, + "loss": 0.7447, + "step": 167 + }, + { + "epoch": 0.5376, + "grad_norm": 0.3410705854093673, + "learning_rate": 9.272459635990562e-05, + "loss": 0.7041, + "step": 168 + }, + { + "epoch": 0.5408, + "grad_norm": 0.3507201924983843, + "learning_rate": 9.168750303824084e-05, + "loss": 0.7612, + "step": 169 + }, + { + "epoch": 0.544, + "grad_norm": 0.31780260642477853, + "learning_rate": 9.065130924199998e-05, + "loss": 0.7358, + "step": 170 + }, + { + "epoch": 0.5472, + "grad_norm": 0.29922638948580016, + "learning_rate": 8.961612710146934e-05, + "loss": 0.7304, + "step": 171 + }, + { + "epoch": 0.5504, + "grad_norm": 0.39868752013565906, + "learning_rate": 8.858206863746018e-05, + "loss": 0.7745, + "step": 172 + }, + { + "epoch": 0.5536, + "grad_norm": 0.34171184725576925, + "learning_rate": 8.754924574918675e-05, + "loss": 0.7764, + "step": 173 + }, + { + "epoch": 0.5568, + "grad_norm": 0.2987398982694719, + "learning_rate": 8.651777020215712e-05, + "loss": 0.7352, + "step": 174 + }, + { + "epoch": 0.56, + "grad_norm": 0.3286939484464021, + "learning_rate": 8.548775361607872e-05, + "loss": 0.7682, + "step": 175 + }, + { + "epoch": 0.5632, + "grad_norm": 0.3422171703812053, + "learning_rate": 8.445930745277953e-05, + "loss": 0.7081, + "step": 176 + }, + { + "epoch": 0.5664, + "grad_norm": 0.5352184148808062, + "learning_rate": 8.343254300414628e-05, + "loss": 0.7529, + "step": 177 + }, + { + "epoch": 0.5696, + "grad_norm": 0.36553182360962533, + "learning_rate": 8.240757138008149e-05, + "loss": 0.7203, + "step": 178 + }, + { + "epoch": 0.5728, + "grad_norm": 0.3725899918477243, + "learning_rate": 8.138450349647936e-05, + "loss": 0.739, + "step": 179 + }, + { + "epoch": 0.576, + "grad_norm": 0.36281213507325893, + "learning_rate": 8.036345006322359e-05, + "loss": 0.7357, + "step": 180 + }, + { + "epoch": 0.5792, + "grad_norm": 0.2972337348051776, + "learning_rate": 7.934452157220694e-05, + "loss": 0.7466, + "step": 181 + }, + { + "epoch": 0.5824, + "grad_norm": 0.30521793330329594, + "learning_rate": 7.832782828537437e-05, + "loss": 0.7264, + "step": 182 + }, + { + "epoch": 0.5856, + "grad_norm": 0.3408082485075347, + "learning_rate": 7.731348022279134e-05, + "loss": 0.7924, + "step": 183 + }, + { + "epoch": 0.5888, + "grad_norm": 0.27453729413856376, + "learning_rate": 7.630158715073813e-05, + "loss": 0.6938, + "step": 184 + }, + { + "epoch": 0.592, + "grad_norm": 0.322681402186739, + "learning_rate": 7.52922585698315e-05, + "loss": 0.7584, + "step": 185 + }, + { + "epoch": 0.5952, + "grad_norm": 0.3414611441702101, + "learning_rate": 7.428560370317542e-05, + "loss": 0.8018, + "step": 186 + }, + { + "epoch": 0.5984, + "grad_norm": 0.31233590764343705, + "learning_rate": 7.328173148454151e-05, + "loss": 0.7349, + "step": 187 + }, + { + "epoch": 0.6016, + "grad_norm": 0.37213900764320684, + "learning_rate": 7.228075054658096e-05, + "loss": 0.7521, + "step": 188 + }, + { + "epoch": 0.6048, + "grad_norm": 0.2590892828912147, + "learning_rate": 7.1282769209069e-05, + "loss": 0.6945, + "step": 189 + }, + { + "epoch": 0.608, + "grad_norm": 0.2781516718298695, + "learning_rate": 7.028789546718326e-05, + "loss": 0.6655, + "step": 190 + }, + { + "epoch": 0.6112, + "grad_norm": 0.34817878862191254, + "learning_rate": 6.929623697981718e-05, + "loss": 0.786, + "step": 191 + }, + { + "epoch": 0.6144, + "grad_norm": 0.3138559242318282, + "learning_rate": 6.830790105792973e-05, + "loss": 0.6972, + "step": 192 + }, + { + "epoch": 0.6176, + "grad_norm": 0.3271741493104444, + "learning_rate": 6.732299465293322e-05, + "loss": 0.7641, + "step": 193 + }, + { + "epoch": 0.6208, + "grad_norm": 0.28973751174107104, + "learning_rate": 6.63416243451194e-05, + "loss": 0.7129, + "step": 194 + }, + { + "epoch": 0.624, + "grad_norm": 0.33571550673664213, + "learning_rate": 6.536389633212609e-05, + "loss": 0.8082, + "step": 195 + }, + { + "epoch": 0.6272, + "grad_norm": 0.26899932541943944, + "learning_rate": 6.43899164174453e-05, + "loss": 0.7517, + "step": 196 + }, + { + "epoch": 0.6304, + "grad_norm": 0.3329509997257, + "learning_rate": 6.341978999897365e-05, + "loss": 0.8128, + "step": 197 + }, + { + "epoch": 0.6336, + "grad_norm": 0.3609394516840636, + "learning_rate": 6.245362205760704e-05, + "loss": 0.8096, + "step": 198 + }, + { + "epoch": 0.6368, + "grad_norm": 0.29276928147697384, + "learning_rate": 6.149151714588009e-05, + "loss": 0.7309, + "step": 199 + }, + { + "epoch": 0.64, + "grad_norm": 0.28767354411045265, + "learning_rate": 6.053357937665237e-05, + "loss": 0.7004, + "step": 200 + }, + { + "epoch": 0.6432, + "grad_norm": 0.3088729628061986, + "learning_rate": 5.957991241184184e-05, + "loss": 0.7131, + "step": 201 + }, + { + "epoch": 0.6464, + "grad_norm": 0.3357986952575956, + "learning_rate": 5.863061945120719e-05, + "loss": 0.7105, + "step": 202 + }, + { + "epoch": 0.6496, + "grad_norm": 0.31206066419022793, + "learning_rate": 5.768580322118034e-05, + "loss": 0.7159, + "step": 203 + }, + { + "epoch": 0.6528, + "grad_norm": 0.3868313645901953, + "learning_rate": 5.6745565963749925e-05, + "loss": 0.7563, + "step": 204 + }, + { + "epoch": 0.656, + "grad_norm": 0.29313931261667653, + "learning_rate": 5.5810009425397294e-05, + "loss": 0.7047, + "step": 205 + }, + { + "epoch": 0.6592, + "grad_norm": 0.34241164911628297, + "learning_rate": 5.487923484608629e-05, + "loss": 0.7462, + "step": 206 + }, + { + "epoch": 0.6624, + "grad_norm": 0.35188915481458427, + "learning_rate": 5.395334294830765e-05, + "loss": 0.7409, + "step": 207 + }, + { + "epoch": 0.6656, + "grad_norm": 0.2932460158152136, + "learning_rate": 5.3032433926179395e-05, + "loss": 0.7333, + "step": 208 + }, + { + "epoch": 0.6688, + "grad_norm": 0.29631995985705306, + "learning_rate": 5.211660743460458e-05, + "loss": 0.7014, + "step": 209 + }, + { + "epoch": 0.672, + "grad_norm": 0.31510080269240404, + "learning_rate": 5.1205962578487155e-05, + "loss": 0.7171, + "step": 210 + }, + { + "epoch": 0.6752, + "grad_norm": 0.3295408127883741, + "learning_rate": 5.030059790200756e-05, + "loss": 0.706, + "step": 211 + }, + { + "epoch": 0.6784, + "grad_norm": 0.31233092467065754, + "learning_rate": 4.940061137795876e-05, + "loss": 0.7453, + "step": 212 + }, + { + "epoch": 0.6816, + "grad_norm": 0.35669248566869416, + "learning_rate": 4.850610039714444e-05, + "loss": 0.7156, + "step": 213 + }, + { + "epoch": 0.6848, + "grad_norm": 0.3155211412518988, + "learning_rate": 4.761716175783989e-05, + "loss": 0.7567, + "step": 214 + }, + { + "epoch": 0.688, + "grad_norm": 0.3673810003938718, + "learning_rate": 4.673389165531714e-05, + "loss": 0.7397, + "step": 215 + }, + { + "epoch": 0.6912, + "grad_norm": 0.3384960762159778, + "learning_rate": 4.585638567143529e-05, + "loss": 0.7745, + "step": 216 + }, + { + "epoch": 0.6944, + "grad_norm": 0.3109655404121361, + "learning_rate": 4.498473876429726e-05, + "loss": 0.6912, + "step": 217 + }, + { + "epoch": 0.6976, + "grad_norm": 0.3222530441680068, + "learning_rate": 4.411904525797408e-05, + "loss": 0.7148, + "step": 218 + }, + { + "epoch": 0.7008, + "grad_norm": 0.40222613798662027, + "learning_rate": 4.325939883229766e-05, + "loss": 0.758, + "step": 219 + }, + { + "epoch": 0.704, + "grad_norm": 0.31264219212955136, + "learning_rate": 4.240589251272342e-05, + "loss": 0.7595, + "step": 220 + }, + { + "epoch": 0.7072, + "grad_norm": 0.37754809999159067, + "learning_rate": 4.155861866026364e-05, + "loss": 0.7217, + "step": 221 + }, + { + "epoch": 0.7104, + "grad_norm": 0.32156745646521057, + "learning_rate": 4.071766896149273e-05, + "loss": 0.707, + "step": 222 + }, + { + "epoch": 0.7136, + "grad_norm": 0.3142160541974962, + "learning_rate": 3.988313441862553e-05, + "loss": 0.7026, + "step": 223 + }, + { + "epoch": 0.7168, + "grad_norm": 0.3295178426188756, + "learning_rate": 3.9055105339669595e-05, + "loss": 0.7076, + "step": 224 + }, + { + "epoch": 0.72, + "grad_norm": 0.30884944419028393, + "learning_rate": 3.823367132865265e-05, + "loss": 0.7142, + "step": 225 + }, + { + "epoch": 0.7232, + "grad_norm": 0.2679648303037726, + "learning_rate": 3.741892127592625e-05, + "loss": 0.6856, + "step": 226 + }, + { + "epoch": 0.7264, + "grad_norm": 0.2966145053474197, + "learning_rate": 3.6610943348546526e-05, + "loss": 0.7432, + "step": 227 + }, + { + "epoch": 0.7296, + "grad_norm": 0.3123776672696557, + "learning_rate": 3.580982498073344e-05, + "loss": 0.7356, + "step": 228 + }, + { + "epoch": 0.7328, + "grad_norm": 0.31681605079604264, + "learning_rate": 3.501565286440914e-05, + "loss": 0.6957, + "step": 229 + }, + { + "epoch": 0.736, + "grad_norm": 0.30323256782096997, + "learning_rate": 3.422851293981676e-05, + "loss": 0.731, + "step": 230 + }, + { + "epoch": 0.7392, + "grad_norm": 0.30683641302921955, + "learning_rate": 3.3448490386220355e-05, + "loss": 0.7196, + "step": 231 + }, + { + "epoch": 0.7424, + "grad_norm": 0.3099465622888912, + "learning_rate": 3.2675669612687565e-05, + "loss": 0.7059, + "step": 232 + }, + { + "epoch": 0.7456, + "grad_norm": 0.3394631523163047, + "learning_rate": 3.191013424895536e-05, + "loss": 0.7381, + "step": 233 + }, + { + "epoch": 0.7488, + "grad_norm": 0.2956908287206056, + "learning_rate": 3.115196713638e-05, + "loss": 0.6998, + "step": 234 + }, + { + "epoch": 0.752, + "grad_norm": 0.39008891875701335, + "learning_rate": 3.040125031897264e-05, + "loss": 0.7491, + "step": 235 + }, + { + "epoch": 0.7552, + "grad_norm": 0.3274417615767872, + "learning_rate": 2.9658065034520978e-05, + "loss": 0.7393, + "step": 236 + }, + { + "epoch": 0.7584, + "grad_norm": 0.32468926558137884, + "learning_rate": 2.892249170579826e-05, + "loss": 0.6482, + "step": 237 + }, + { + "epoch": 0.7616, + "grad_norm": 0.33865733040761303, + "learning_rate": 2.8194609931860316e-05, + "loss": 0.7172, + "step": 238 + }, + { + "epoch": 0.7648, + "grad_norm": 0.3239366885437433, + "learning_rate": 2.7474498479432087e-05, + "loss": 0.6902, + "step": 239 + }, + { + "epoch": 0.768, + "grad_norm": 0.30733315996560057, + "learning_rate": 2.6762235274383772e-05, + "loss": 0.6929, + "step": 240 + }, + { + "epoch": 0.7712, + "grad_norm": 0.3769329156429099, + "learning_rate": 2.6057897393298324e-05, + "loss": 0.7152, + "step": 241 + }, + { + "epoch": 0.7744, + "grad_norm": 0.3273555773412006, + "learning_rate": 2.536156105513062e-05, + "loss": 0.7099, + "step": 242 + }, + { + "epoch": 0.7776, + "grad_norm": 0.2931808275573847, + "learning_rate": 2.4673301612959654e-05, + "loss": 0.692, + "step": 243 + }, + { + "epoch": 0.7808, + "grad_norm": 0.3752786184487787, + "learning_rate": 2.399319354583418e-05, + "loss": 0.6858, + "step": 244 + }, + { + "epoch": 0.784, + "grad_norm": 0.32213365090116863, + "learning_rate": 2.3321310450713062e-05, + "loss": 0.7617, + "step": 245 + }, + { + "epoch": 0.7872, + "grad_norm": 0.3326490848140708, + "learning_rate": 2.265772503450122e-05, + "loss": 0.7433, + "step": 246 + }, + { + "epoch": 0.7904, + "grad_norm": 0.30800910739286863, + "learning_rate": 2.2002509106181624e-05, + "loss": 0.6919, + "step": 247 + }, + { + "epoch": 0.7936, + "grad_norm": 0.49114707854361583, + "learning_rate": 2.1355733569044635e-05, + "loss": 0.7006, + "step": 248 + }, + { + "epoch": 0.7968, + "grad_norm": 0.3082444913571276, + "learning_rate": 2.0717468413015283e-05, + "loss": 0.6905, + "step": 249 + }, + { + "epoch": 0.8, + "grad_norm": 0.3185600456869451, + "learning_rate": 2.008778270707944e-05, + "loss": 0.7205, + "step": 250 + }, + { + "epoch": 0.8032, + "grad_norm": 0.36133186034657844, + "learning_rate": 1.946674459180955e-05, + "loss": 0.7925, + "step": 251 + }, + { + "epoch": 0.8064, + "grad_norm": 0.34594125770304723, + "learning_rate": 1.8854421271990964e-05, + "loss": 0.7896, + "step": 252 + }, + { + "epoch": 0.8096, + "grad_norm": 0.338131887400637, + "learning_rate": 1.8250879009349398e-05, + "loss": 0.7372, + "step": 253 + }, + { + "epoch": 0.8128, + "grad_norm": 0.32059408039911114, + "learning_rate": 1.7656183115380577e-05, + "loss": 0.6755, + "step": 254 + }, + { + "epoch": 0.816, + "grad_norm": 0.2903927824066888, + "learning_rate": 1.707039794428259e-05, + "loss": 0.7277, + "step": 255 + }, + { + "epoch": 0.8192, + "grad_norm": 0.30891151567079217, + "learning_rate": 1.649358688599191e-05, + "loss": 0.6868, + "step": 256 + }, + { + "epoch": 0.8224, + "grad_norm": 0.30717699046560654, + "learning_rate": 1.5925812359323745e-05, + "loss": 0.6682, + "step": 257 + }, + { + "epoch": 0.8256, + "grad_norm": 0.30079449574608474, + "learning_rate": 1.5367135805217458e-05, + "loss": 0.6718, + "step": 258 + }, + { + "epoch": 0.8288, + "grad_norm": 0.3709182222658317, + "learning_rate": 1.4817617680087825e-05, + "loss": 0.7455, + "step": 259 + }, + { + "epoch": 0.832, + "grad_norm": 0.3480906038452339, + "learning_rate": 1.4277317449282834e-05, + "loss": 0.7436, + "step": 260 + }, + { + "epoch": 0.8352, + "grad_norm": 0.32079050007375537, + "learning_rate": 1.3746293580648717e-05, + "loss": 0.7135, + "step": 261 + }, + { + "epoch": 0.8384, + "grad_norm": 0.3820019792030307, + "learning_rate": 1.3224603538202929e-05, + "loss": 0.7058, + "step": 262 + }, + { + "epoch": 0.8416, + "grad_norm": 0.46781284966016706, + "learning_rate": 1.2712303775915802e-05, + "loss": 0.6724, + "step": 263 + }, + { + "epoch": 0.8448, + "grad_norm": 0.322646476532804, + "learning_rate": 1.220944973160133e-05, + "loss": 0.7516, + "step": 264 + }, + { + "epoch": 0.848, + "grad_norm": 0.32793966242976536, + "learning_rate": 1.1716095820918216e-05, + "loss": 0.6871, + "step": 265 + }, + { + "epoch": 0.8512, + "grad_norm": 0.32915100964470706, + "learning_rate": 1.1232295431481222e-05, + "loss": 0.7256, + "step": 266 + }, + { + "epoch": 0.8544, + "grad_norm": 0.3078508862353911, + "learning_rate": 1.0758100917083991e-05, + "loss": 0.7451, + "step": 267 + }, + { + "epoch": 0.8576, + "grad_norm": 0.3153781972650277, + "learning_rate": 1.0293563592033595e-05, + "loss": 0.716, + "step": 268 + }, + { + "epoch": 0.8608, + "grad_norm": 0.3529676795978728, + "learning_rate": 9.838733725597615e-06, + "loss": 0.73, + "step": 269 + }, + { + "epoch": 0.864, + "grad_norm": 0.28845262326412136, + "learning_rate": 9.393660536564408e-06, + "loss": 0.6776, + "step": 270 + }, + { + "epoch": 0.8672, + "grad_norm": 0.3480016710788754, + "learning_rate": 8.958392187916841e-06, + "loss": 0.7023, + "step": 271 + }, + { + "epoch": 0.8704, + "grad_norm": 0.2996303480950192, + "learning_rate": 8.532975781620512e-06, + "loss": 0.6844, + "step": 272 + }, + { + "epoch": 0.8736, + "grad_norm": 0.3183815751311741, + "learning_rate": 8.117457353526625e-06, + "loss": 0.7043, + "step": 273 + }, + { + "epoch": 0.8768, + "grad_norm": 0.3016624458410541, + "learning_rate": 7.711881868390291e-06, + "loss": 0.7089, + "step": 274 + }, + { + "epoch": 0.88, + "grad_norm": 0.3167064263070341, + "learning_rate": 7.3162932150046885e-06, + "loss": 0.709, + "step": 275 + }, + { + "epoch": 0.8832, + "grad_norm": 0.2915655775574416, + "learning_rate": 6.930734201451816e-06, + "loss": 0.6358, + "step": 276 + }, + { + "epoch": 0.8864, + "grad_norm": 0.30560329801302655, + "learning_rate": 6.555246550469907e-06, + "loss": 0.6783, + "step": 277 + }, + { + "epoch": 0.8896, + "grad_norm": 0.3006983164274014, + "learning_rate": 6.189870894938587e-06, + "loss": 0.7104, + "step": 278 + }, + { + "epoch": 0.8928, + "grad_norm": 0.32381519416112386, + "learning_rate": 5.834646773481811e-06, + "loss": 0.7695, + "step": 279 + }, + { + "epoch": 0.896, + "grad_norm": 0.28976807968257356, + "learning_rate": 5.489612626189245e-06, + "loss": 0.6894, + "step": 280 + }, + { + "epoch": 0.8992, + "grad_norm": 0.30162867207666694, + "learning_rate": 5.154805790456485e-06, + "loss": 0.6573, + "step": 281 + }, + { + "epoch": 0.9024, + "grad_norm": 0.31635142898683855, + "learning_rate": 4.830262496944693e-06, + "loss": 0.7277, + "step": 282 + }, + { + "epoch": 0.9056, + "grad_norm": 0.36265604881660407, + "learning_rate": 4.516017865659949e-06, + "loss": 0.6736, + "step": 283 + }, + { + "epoch": 0.9088, + "grad_norm": 0.32074207308074953, + "learning_rate": 4.21210590215273e-06, + "loss": 0.7407, + "step": 284 + }, + { + "epoch": 0.912, + "grad_norm": 0.2883460722478291, + "learning_rate": 3.918559493838114e-06, + "loss": 0.7147, + "step": 285 + }, + { + "epoch": 0.9152, + "grad_norm": 0.3307282963021205, + "learning_rate": 3.6354104064368566e-06, + "loss": 0.685, + "step": 286 + }, + { + "epoch": 0.9184, + "grad_norm": 0.30251432198462375, + "learning_rate": 3.3626892805379562e-06, + "loss": 0.6537, + "step": 287 + }, + { + "epoch": 0.9216, + "grad_norm": 0.302807765576497, + "learning_rate": 3.100425628282899e-06, + "loss": 0.6969, + "step": 288 + }, + { + "epoch": 0.9248, + "grad_norm": 0.32237386815312, + "learning_rate": 2.848647830172024e-06, + "loss": 0.7084, + "step": 289 + }, + { + "epoch": 0.928, + "grad_norm": 0.37620709533236935, + "learning_rate": 2.607383131993424e-06, + "loss": 0.7136, + "step": 290 + }, + { + "epoch": 0.9312, + "grad_norm": 0.3688407096189695, + "learning_rate": 2.3766576418745022e-06, + "loss": 0.7006, + "step": 291 + }, + { + "epoch": 0.9344, + "grad_norm": 0.33157890719191463, + "learning_rate": 2.1564963274568027e-06, + "loss": 0.6921, + "step": 292 + }, + { + "epoch": 0.9376, + "grad_norm": 0.31827075119431947, + "learning_rate": 1.9469230131940907e-06, + "loss": 0.6691, + "step": 293 + }, + { + "epoch": 0.9408, + "grad_norm": 0.2970866545880075, + "learning_rate": 1.7479603777742938e-06, + "loss": 0.6984, + "step": 294 + }, + { + "epoch": 0.944, + "grad_norm": 0.3010996146445767, + "learning_rate": 1.559629951665298e-06, + "loss": 0.7151, + "step": 295 + }, + { + "epoch": 0.9472, + "grad_norm": 0.3619267851120905, + "learning_rate": 1.3819521147851123e-06, + "loss": 0.7775, + "step": 296 + }, + { + "epoch": 0.9504, + "grad_norm": 0.3506925860508666, + "learning_rate": 1.2149460942964098e-06, + "loss": 0.6799, + "step": 297 + }, + { + "epoch": 0.9536, + "grad_norm": 0.3098875126472622, + "learning_rate": 1.05862996252597e-06, + "loss": 0.7015, + "step": 298 + }, + { + "epoch": 0.9568, + "grad_norm": 0.32082064744176364, + "learning_rate": 9.130206350089765e-07, + "loss": 0.6796, + "step": 299 + }, + { + "epoch": 0.96, + "grad_norm": 0.31943579966438246, + "learning_rate": 7.781338686584927e-07, + "loss": 0.7002, + "step": 300 + }, + { + "epoch": 0.9632, + "grad_norm": 0.33985923602030726, + "learning_rate": 6.539842600603918e-07, + "loss": 0.6423, + "step": 301 + }, + { + "epoch": 0.9664, + "grad_norm": 0.2892635952892666, + "learning_rate": 5.405852438937764e-07, + "loss": 0.7323, + "step": 302 + }, + { + "epoch": 0.9696, + "grad_norm": 0.37923359172715465, + "learning_rate": 4.3794909147720773e-07, + "loss": 0.7404, + "step": 303 + }, + { + "epoch": 0.9728, + "grad_norm": 0.3103628292548366, + "learning_rate": 3.4608690944071263e-07, + "loss": 0.6936, + "step": 304 + }, + { + "epoch": 0.976, + "grad_norm": 0.34823523050987065, + "learning_rate": 2.6500863852395584e-07, + "loss": 0.7705, + "step": 305 + }, + { + "epoch": 0.9792, + "grad_norm": 0.3020439440237327, + "learning_rate": 1.947230525005006e-07, + "loss": 0.7011, + "step": 306 + }, + { + "epoch": 0.9824, + "grad_norm": 0.5310387928312621, + "learning_rate": 1.3523775722834587e-07, + "loss": 0.7165, + "step": 307 + }, + { + "epoch": 0.9856, + "grad_norm": 0.28494078878406287, + "learning_rate": 8.655918982689581e-08, + "loss": 0.6638, + "step": 308 + }, + { + "epoch": 0.9888, + "grad_norm": 0.37140395118247604, + "learning_rate": 4.8692617980350406e-08, + "loss": 0.7276, + "step": 309 + }, + { + "epoch": 0.992, + "grad_norm": 0.29302849163530503, + "learning_rate": 2.164213936770576e-08, + "loss": 0.6829, + "step": 310 + }, + { + "epoch": 0.9952, + "grad_norm": 0.2949229958469089, + "learning_rate": 5.410681219286673e-09, + "loss": 0.6725, + "step": 311 + }, + { + "epoch": 0.9984, + "grad_norm": 0.37943794632171773, + "learning_rate": 0.0, + "loss": 0.7934, + "step": 312 + }, + { + "epoch": 0.9984, + "step": 312, + "total_flos": 786873517342720.0, + "train_loss": 0.7703683385864283, + "train_runtime": 9556.9637, + "train_samples_per_second": 1.046, + "train_steps_per_second": 0.033 + } + ], + "logging_steps": 1.0, + "max_steps": 312, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 786873517342720.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_2_epochs_1_GA_2_lora/README.md b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_2_epochs_1_GA_2_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_2_epochs_1_GA_2_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_2_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_2_epochs_1_GA_2_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dd2e3c9bac94eb7f24dc48c7b39242778281e781 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_2_epochs_1_GA_2_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "up_proj", + "o_proj", + "gate_proj", + "down_proj", + "v_proj", + "k_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..26ac00add58f331d7a6a8c9898692e65e2143de2 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:325eff7774f190b073e51cfe5f3e4b738c697abd6552ab41d94f592754e122f8 +size 671150064 diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_2_epochs_1_GA_2_lora/config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_2_epochs_1_GA_2_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_2_epochs_1_GA_2_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..5a466207d4f2ebc68fcc00da3e4a30375d36bb48 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:288f02279fef438d6c35b4405dc74883532028e256f714839af57fe7276025e6 +size 918507402 diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_2_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_2_epochs_1_GA_2_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..3c9f9d20b13387f57c1b60120e64f43cea86c9cf --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_2_epochs_1_GA_2_lora/trainer_state.json @@ -0,0 +1,4417 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 625, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0016, + "grad_norm": 1.0085854697429473, + "learning_rate": 1.0526315789473684e-05, + "loss": 1.4793, + "step": 1 + }, + { + "epoch": 0.0032, + "grad_norm": 0.9972559026650626, + "learning_rate": 2.105263157894737e-05, + "loss": 1.3539, + "step": 2 + }, + { + "epoch": 0.0048, + "grad_norm": 0.892014568991878, + "learning_rate": 3.157894736842105e-05, + "loss": 1.4286, + "step": 3 + }, + { + "epoch": 0.0064, + "grad_norm": 0.9307992964341537, + "learning_rate": 4.210526315789474e-05, + "loss": 1.4656, + "step": 4 + }, + { + "epoch": 0.008, + "grad_norm": 0.7602254990544522, + "learning_rate": 5.2631578947368424e-05, + "loss": 1.208, + "step": 5 + }, + { + "epoch": 0.0096, + "grad_norm": 0.9039945441243217, + "learning_rate": 6.31578947368421e-05, + "loss": 1.2587, + "step": 6 + }, + { + "epoch": 0.0112, + "grad_norm": 0.9450987941401989, + "learning_rate": 7.368421052631579e-05, + "loss": 1.1507, + "step": 7 + }, + { + "epoch": 0.0128, + "grad_norm": 1.4738996367046087, + "learning_rate": 8.421052631578948e-05, + "loss": 1.134, + "step": 8 + }, + { + "epoch": 0.0144, + "grad_norm": 0.8769935738644637, + "learning_rate": 9.473684210526316e-05, + "loss": 1.0135, + "step": 9 + }, + { + "epoch": 0.016, + "grad_norm": 0.770842931812109, + "learning_rate": 0.00010526315789473685, + "loss": 1.0147, + "step": 10 + }, + { + "epoch": 0.0176, + "grad_norm": 0.6435483023409994, + "learning_rate": 0.00011578947368421053, + "loss": 0.9206, + "step": 11 + }, + { + "epoch": 0.0192, + "grad_norm": 0.5638381409433151, + "learning_rate": 0.0001263157894736842, + "loss": 0.9648, + "step": 12 + }, + { + "epoch": 0.0208, + "grad_norm": 0.6361226951053639, + "learning_rate": 0.0001368421052631579, + "loss": 0.9395, + "step": 13 + }, + { + "epoch": 0.0224, + "grad_norm": 0.6113438676456088, + "learning_rate": 0.00014736842105263158, + "loss": 0.9943, + "step": 14 + }, + { + "epoch": 0.024, + "grad_norm": 0.5433136850170159, + "learning_rate": 0.00015789473684210527, + "loss": 0.9305, + "step": 15 + }, + { + "epoch": 0.0256, + "grad_norm": 0.6196928495947501, + "learning_rate": 0.00016842105263157895, + "loss": 0.8827, + "step": 16 + }, + { + "epoch": 0.0272, + "grad_norm": 0.5560559689651063, + "learning_rate": 0.00017894736842105264, + "loss": 0.8621, + "step": 17 + }, + { + "epoch": 0.0288, + "grad_norm": 0.5674823412664283, + "learning_rate": 0.00018947368421052632, + "loss": 0.9053, + "step": 18 + }, + { + "epoch": 0.0304, + "grad_norm": 0.4694725740359349, + "learning_rate": 0.0002, + "loss": 0.8856, + "step": 19 + }, + { + "epoch": 0.032, + "grad_norm": 0.6006019984416087, + "learning_rate": 0.00019999865623437013, + "loss": 0.9853, + "step": 20 + }, + { + "epoch": 0.0336, + "grad_norm": 0.5621161215924341, + "learning_rate": 0.00019999462497359466, + "loss": 0.9147, + "step": 21 + }, + { + "epoch": 0.0352, + "grad_norm": 0.5365689431483717, + "learning_rate": 0.00019998790632601496, + "loss": 0.9093, + "step": 22 + }, + { + "epoch": 0.0368, + "grad_norm": 0.5172863622066685, + "learning_rate": 0.0001999785004721968, + "loss": 0.923, + "step": 23 + }, + { + "epoch": 0.0384, + "grad_norm": 0.6347044714087057, + "learning_rate": 0.00019996640766492543, + "loss": 0.9312, + "step": 24 + }, + { + "epoch": 0.04, + "grad_norm": 0.5147510606941328, + "learning_rate": 0.00019995162822919883, + "loss": 0.8204, + "step": 25 + }, + { + "epoch": 0.0416, + "grad_norm": 0.6953390865660324, + "learning_rate": 0.00019993416256221895, + "loss": 0.9291, + "step": 26 + }, + { + "epoch": 0.0432, + "grad_norm": 0.5346997972592501, + "learning_rate": 0.00019991401113338104, + "loss": 0.8496, + "step": 27 + }, + { + "epoch": 0.0448, + "grad_norm": 0.5887298482561468, + "learning_rate": 0.00019989117448426108, + "loss": 0.8733, + "step": 28 + }, + { + "epoch": 0.0464, + "grad_norm": 0.5744288001668765, + "learning_rate": 0.00019986565322860115, + "loss": 0.9139, + "step": 29 + }, + { + "epoch": 0.048, + "grad_norm": 0.5817644265642208, + "learning_rate": 0.00019983744805229296, + "loss": 0.9084, + "step": 30 + }, + { + "epoch": 0.0496, + "grad_norm": 0.5170973145542248, + "learning_rate": 0.00019980655971335945, + "loss": 0.8523, + "step": 31 + }, + { + "epoch": 0.0512, + "grad_norm": 0.5704175423395856, + "learning_rate": 0.00019977298904193437, + "loss": 0.9821, + "step": 32 + }, + { + "epoch": 0.0528, + "grad_norm": 0.48982925326254356, + "learning_rate": 0.00019973673694024, + "loss": 0.8225, + "step": 33 + }, + { + "epoch": 0.0544, + "grad_norm": 0.5533715409598637, + "learning_rate": 0.00019969780438256293, + "loss": 0.944, + "step": 34 + }, + { + "epoch": 0.056, + "grad_norm": 0.58521736370168, + "learning_rate": 0.0001996561924152278, + "loss": 0.8852, + "step": 35 + }, + { + "epoch": 0.0576, + "grad_norm": 0.4982855683284927, + "learning_rate": 0.0001996119021565693, + "loss": 0.8766, + "step": 36 + }, + { + "epoch": 0.0592, + "grad_norm": 0.5098007683350397, + "learning_rate": 0.0001995649347969019, + "loss": 0.8812, + "step": 37 + }, + { + "epoch": 0.0608, + "grad_norm": 0.5125510085738779, + "learning_rate": 0.00019951529159848805, + "loss": 0.8148, + "step": 38 + }, + { + "epoch": 0.0624, + "grad_norm": 0.5086907760916723, + "learning_rate": 0.00019946297389550433, + "loss": 0.9317, + "step": 39 + }, + { + "epoch": 0.064, + "grad_norm": 0.5425362095412308, + "learning_rate": 0.00019940798309400526, + "loss": 0.8609, + "step": 40 + }, + { + "epoch": 0.0656, + "grad_norm": 0.6110549252989314, + "learning_rate": 0.0001993503206718859, + "loss": 0.9693, + "step": 41 + }, + { + "epoch": 0.0672, + "grad_norm": 0.4506531338293441, + "learning_rate": 0.00019928998817884182, + "loss": 0.7965, + "step": 42 + }, + { + "epoch": 0.0688, + "grad_norm": 0.5972484794514582, + "learning_rate": 0.00019922698723632767, + "loss": 0.9399, + "step": 43 + }, + { + "epoch": 0.0704, + "grad_norm": 0.6574173123923841, + "learning_rate": 0.00019916131953751342, + "loss": 0.8266, + "step": 44 + }, + { + "epoch": 0.072, + "grad_norm": 0.5890215740181415, + "learning_rate": 0.00019909298684723904, + "loss": 0.8091, + "step": 45 + }, + { + "epoch": 0.0736, + "grad_norm": 0.4660388302518513, + "learning_rate": 0.00019902199100196697, + "loss": 0.8218, + "step": 46 + }, + { + "epoch": 0.0752, + "grad_norm": 0.6497393240417777, + "learning_rate": 0.00019894833390973266, + "loss": 0.8786, + "step": 47 + }, + { + "epoch": 0.0768, + "grad_norm": 0.5292049724664672, + "learning_rate": 0.00019887201755009357, + "loss": 0.7716, + "step": 48 + }, + { + "epoch": 0.0784, + "grad_norm": 0.6314025950524799, + "learning_rate": 0.0001987930439740757, + "loss": 0.969, + "step": 49 + }, + { + "epoch": 0.08, + "grad_norm": 0.5200569642060622, + "learning_rate": 0.00019871141530411853, + "loss": 0.9039, + "step": 50 + }, + { + "epoch": 0.0816, + "grad_norm": 0.5003737674200991, + "learning_rate": 0.0001986271337340182, + "loss": 0.8915, + "step": 51 + }, + { + "epoch": 0.0832, + "grad_norm": 0.42454237909222065, + "learning_rate": 0.00019854020152886814, + "loss": 0.7601, + "step": 52 + }, + { + "epoch": 0.0848, + "grad_norm": 0.44440771011539576, + "learning_rate": 0.0001984506210249986, + "loss": 0.8047, + "step": 53 + }, + { + "epoch": 0.0864, + "grad_norm": 0.5465428784613534, + "learning_rate": 0.00019835839462991361, + "loss": 0.9307, + "step": 54 + }, + { + "epoch": 0.088, + "grad_norm": 0.4840542158590719, + "learning_rate": 0.00019826352482222638, + "loss": 0.9252, + "step": 55 + }, + { + "epoch": 0.0896, + "grad_norm": 0.4611462002254872, + "learning_rate": 0.00019816601415159263, + "loss": 0.8354, + "step": 56 + }, + { + "epoch": 0.0912, + "grad_norm": 0.46015103495950843, + "learning_rate": 0.0001980658652386421, + "loss": 0.8425, + "step": 57 + }, + { + "epoch": 0.0928, + "grad_norm": 0.4315958592617036, + "learning_rate": 0.00019796308077490817, + "loss": 0.7914, + "step": 58 + }, + { + "epoch": 0.0944, + "grad_norm": 0.4981308035119613, + "learning_rate": 0.00019785766352275542, + "loss": 0.901, + "step": 59 + }, + { + "epoch": 0.096, + "grad_norm": 0.5382472930103323, + "learning_rate": 0.00019774961631530545, + "loss": 0.8608, + "step": 60 + }, + { + "epoch": 0.0976, + "grad_norm": 0.535777393531857, + "learning_rate": 0.00019763894205636072, + "loss": 0.8906, + "step": 61 + }, + { + "epoch": 0.0992, + "grad_norm": 0.46334889541129715, + "learning_rate": 0.00019752564372032657, + "loss": 0.8885, + "step": 62 + }, + { + "epoch": 0.1008, + "grad_norm": 0.5049068018444929, + "learning_rate": 0.00019740972435213115, + "loss": 0.8412, + "step": 63 + }, + { + "epoch": 0.1024, + "grad_norm": 0.5453569712273227, + "learning_rate": 0.00019729118706714375, + "loss": 0.8551, + "step": 64 + }, + { + "epoch": 0.104, + "grad_norm": 0.5033554954544484, + "learning_rate": 0.00019717003505109095, + "loss": 0.8339, + "step": 65 + }, + { + "epoch": 0.1056, + "grad_norm": 0.4870927782123071, + "learning_rate": 0.00019704627155997108, + "loss": 0.8106, + "step": 66 + }, + { + "epoch": 0.1072, + "grad_norm": 0.5240204968040884, + "learning_rate": 0.00019691989991996663, + "loss": 0.8529, + "step": 67 + }, + { + "epoch": 0.1088, + "grad_norm": 0.48607963333726895, + "learning_rate": 0.0001967909235273549, + "loss": 0.8412, + "step": 68 + }, + { + "epoch": 0.1104, + "grad_norm": 0.4974957815786268, + "learning_rate": 0.00019665934584841682, + "loss": 0.8824, + "step": 69 + }, + { + "epoch": 0.112, + "grad_norm": 0.5939769044560865, + "learning_rate": 0.00019652517041934356, + "loss": 0.9555, + "step": 70 + }, + { + "epoch": 0.1136, + "grad_norm": 0.47771639209882233, + "learning_rate": 0.00019638840084614182, + "loss": 0.8176, + "step": 71 + }, + { + "epoch": 0.1152, + "grad_norm": 0.6386894662834409, + "learning_rate": 0.00019624904080453655, + "loss": 0.9388, + "step": 72 + }, + { + "epoch": 0.1168, + "grad_norm": 0.4576001640778111, + "learning_rate": 0.00019610709403987246, + "loss": 0.7665, + "step": 73 + }, + { + "epoch": 0.1184, + "grad_norm": 0.4200124513468208, + "learning_rate": 0.00019596256436701324, + "loss": 0.834, + "step": 74 + }, + { + "epoch": 0.12, + "grad_norm": 0.4970757026020292, + "learning_rate": 0.000195815455670239, + "loss": 0.8543, + "step": 75 + }, + { + "epoch": 0.1216, + "grad_norm": 0.4640803261141436, + "learning_rate": 0.00019566577190314197, + "loss": 0.7556, + "step": 76 + }, + { + "epoch": 0.1232, + "grad_norm": 0.46055917056692386, + "learning_rate": 0.0001955135170885202, + "loss": 0.8154, + "step": 77 + }, + { + "epoch": 0.1248, + "grad_norm": 0.45748233373993097, + "learning_rate": 0.00019535869531826937, + "loss": 0.7386, + "step": 78 + }, + { + "epoch": 0.1264, + "grad_norm": 0.5061480509699664, + "learning_rate": 0.00019520131075327298, + "loss": 0.7209, + "step": 79 + }, + { + "epoch": 0.128, + "grad_norm": 0.5125866939770451, + "learning_rate": 0.00019504136762329047, + "loss": 0.8597, + "step": 80 + }, + { + "epoch": 0.1296, + "grad_norm": 0.4834281117651111, + "learning_rate": 0.00019487887022684336, + "loss": 0.765, + "step": 81 + }, + { + "epoch": 0.1312, + "grad_norm": 0.48871433617747, + "learning_rate": 0.00019471382293110003, + "loss": 0.8293, + "step": 82 + }, + { + "epoch": 0.1328, + "grad_norm": 0.4533350258950691, + "learning_rate": 0.00019454623017175812, + "loss": 0.7953, + "step": 83 + }, + { + "epoch": 0.1344, + "grad_norm": 0.5020657989282823, + "learning_rate": 0.00019437609645292546, + "loss": 0.8578, + "step": 84 + }, + { + "epoch": 0.136, + "grad_norm": 0.49816197793406886, + "learning_rate": 0.0001942034263469989, + "loss": 0.7567, + "step": 85 + }, + { + "epoch": 0.1376, + "grad_norm": 0.43322873088029873, + "learning_rate": 0.00019402822449454153, + "loss": 0.7421, + "step": 86 + }, + { + "epoch": 0.1392, + "grad_norm": 0.49779652062498836, + "learning_rate": 0.00019385049560415794, + "loss": 0.8806, + "step": 87 + }, + { + "epoch": 0.1408, + "grad_norm": 0.5295204130991914, + "learning_rate": 0.00019367024445236754, + "loss": 0.8717, + "step": 88 + }, + { + "epoch": 0.1424, + "grad_norm": 0.4990838250346169, + "learning_rate": 0.00019348747588347637, + "loss": 0.7532, + "step": 89 + }, + { + "epoch": 0.144, + "grad_norm": 0.45254960997056176, + "learning_rate": 0.00019330219480944694, + "loss": 0.7629, + "step": 90 + }, + { + "epoch": 0.1456, + "grad_norm": 0.45750879146448953, + "learning_rate": 0.00019311440620976597, + "loss": 0.7991, + "step": 91 + }, + { + "epoch": 0.1472, + "grad_norm": 0.43373785740403087, + "learning_rate": 0.0001929241151313108, + "loss": 0.8199, + "step": 92 + }, + { + "epoch": 0.1488, + "grad_norm": 0.5067483645268225, + "learning_rate": 0.00019273132668821364, + "loss": 0.7793, + "step": 93 + }, + { + "epoch": 0.1504, + "grad_norm": 0.5323777835671849, + "learning_rate": 0.00019253604606172417, + "loss": 0.7334, + "step": 94 + }, + { + "epoch": 0.152, + "grad_norm": 0.5172495648535365, + "learning_rate": 0.00019233827850007027, + "loss": 0.7907, + "step": 95 + }, + { + "epoch": 0.1536, + "grad_norm": 0.49612967926384, + "learning_rate": 0.00019213802931831696, + "loss": 0.8262, + "step": 96 + }, + { + "epoch": 0.1552, + "grad_norm": 0.45671458373517876, + "learning_rate": 0.00019193530389822363, + "loss": 0.794, + "step": 97 + }, + { + "epoch": 0.1568, + "grad_norm": 0.5037946215348362, + "learning_rate": 0.00019173010768809933, + "loss": 0.83, + "step": 98 + }, + { + "epoch": 0.1584, + "grad_norm": 0.5738153638450577, + "learning_rate": 0.0001915224462026563, + "loss": 0.872, + "step": 99 + }, + { + "epoch": 0.16, + "grad_norm": 0.4265537385053224, + "learning_rate": 0.00019131232502286188, + "loss": 0.8331, + "step": 100 + }, + { + "epoch": 0.1616, + "grad_norm": 0.4439066886609966, + "learning_rate": 0.0001910997497957885, + "loss": 0.8204, + "step": 101 + }, + { + "epoch": 0.1632, + "grad_norm": 0.5125766235279255, + "learning_rate": 0.00019088472623446183, + "loss": 0.9437, + "step": 102 + }, + { + "epoch": 0.1648, + "grad_norm": 0.4027517341407651, + "learning_rate": 0.00019066726011770726, + "loss": 0.7318, + "step": 103 + }, + { + "epoch": 0.1664, + "grad_norm": 0.4784749643379989, + "learning_rate": 0.0001904473572899947, + "loss": 0.7918, + "step": 104 + }, + { + "epoch": 0.168, + "grad_norm": 0.5608708183821162, + "learning_rate": 0.00019022502366128135, + "loss": 0.869, + "step": 105 + }, + { + "epoch": 0.1696, + "grad_norm": 0.5216604137548994, + "learning_rate": 0.00019000026520685302, + "loss": 0.7746, + "step": 106 + }, + { + "epoch": 0.1712, + "grad_norm": 0.4301735202968478, + "learning_rate": 0.0001897730879671634, + "loss": 0.8609, + "step": 107 + }, + { + "epoch": 0.1728, + "grad_norm": 0.5054438035773254, + "learning_rate": 0.00018954349804767184, + "loss": 0.8819, + "step": 108 + }, + { + "epoch": 0.1744, + "grad_norm": 0.4870990362057011, + "learning_rate": 0.00018931150161867916, + "loss": 0.7721, + "step": 109 + }, + { + "epoch": 0.176, + "grad_norm": 0.42631773989843086, + "learning_rate": 0.00018907710491516199, + "loss": 0.8154, + "step": 110 + }, + { + "epoch": 0.1776, + "grad_norm": 0.468453256337647, + "learning_rate": 0.0001888403142366049, + "loss": 0.8279, + "step": 111 + }, + { + "epoch": 0.1792, + "grad_norm": 0.4050949188886547, + "learning_rate": 0.00018860113594683148, + "loss": 0.7765, + "step": 112 + }, + { + "epoch": 0.1808, + "grad_norm": 0.5029661100503304, + "learning_rate": 0.00018835957647383303, + "loss": 0.848, + "step": 113 + }, + { + "epoch": 0.1824, + "grad_norm": 0.4620683173397088, + "learning_rate": 0.00018811564230959588, + "loss": 0.8113, + "step": 114 + }, + { + "epoch": 0.184, + "grad_norm": 0.4465748334104479, + "learning_rate": 0.00018786934000992688, + "loss": 0.7604, + "step": 115 + }, + { + "epoch": 0.1856, + "grad_norm": 0.4827698092867937, + "learning_rate": 0.00018762067619427746, + "loss": 0.8053, + "step": 116 + }, + { + "epoch": 0.1872, + "grad_norm": 0.49998806429255743, + "learning_rate": 0.00018736965754556528, + "loss": 0.7869, + "step": 117 + }, + { + "epoch": 0.1888, + "grad_norm": 0.5224572558802736, + "learning_rate": 0.00018711629080999504, + "loss": 0.8282, + "step": 118 + }, + { + "epoch": 0.1904, + "grad_norm": 0.44637335744386086, + "learning_rate": 0.00018686058279687698, + "loss": 0.8276, + "step": 119 + }, + { + "epoch": 0.192, + "grad_norm": 0.45395325027658845, + "learning_rate": 0.00018660254037844388, + "loss": 0.801, + "step": 120 + }, + { + "epoch": 0.1936, + "grad_norm": 0.4668880071503474, + "learning_rate": 0.00018634217048966637, + "loss": 0.7133, + "step": 121 + }, + { + "epoch": 0.1952, + "grad_norm": 0.4582223709509232, + "learning_rate": 0.0001860794801280666, + "loss": 0.8445, + "step": 122 + }, + { + "epoch": 0.1968, + "grad_norm": 0.5683295114262743, + "learning_rate": 0.0001858144763535302, + "loss": 0.8978, + "step": 123 + }, + { + "epoch": 0.1984, + "grad_norm": 0.41420851133767184, + "learning_rate": 0.0001855471662881164, + "loss": 0.7754, + "step": 124 + }, + { + "epoch": 0.2, + "grad_norm": 0.48421886671784903, + "learning_rate": 0.00018527755711586678, + "loss": 0.7609, + "step": 125 + }, + { + "epoch": 0.2016, + "grad_norm": 0.5469804299166952, + "learning_rate": 0.00018500565608261214, + "loss": 0.8702, + "step": 126 + }, + { + "epoch": 0.2032, + "grad_norm": 0.5433538041032796, + "learning_rate": 0.00018473147049577774, + "loss": 0.8688, + "step": 127 + }, + { + "epoch": 0.2048, + "grad_norm": 0.5529201646076498, + "learning_rate": 0.00018445500772418697, + "loss": 0.9029, + "step": 128 + }, + { + "epoch": 0.2064, + "grad_norm": 0.4503807504561782, + "learning_rate": 0.00018417627519786315, + "loss": 0.8253, + "step": 129 + }, + { + "epoch": 0.208, + "grad_norm": 0.5328965011735313, + "learning_rate": 0.00018389528040783012, + "loss": 0.8496, + "step": 130 + }, + { + "epoch": 0.2096, + "grad_norm": 0.4954691357865197, + "learning_rate": 0.00018361203090591071, + "loss": 0.7994, + "step": 131 + }, + { + "epoch": 0.2112, + "grad_norm": 0.42442377421361566, + "learning_rate": 0.00018332653430452376, + "loss": 0.7241, + "step": 132 + }, + { + "epoch": 0.2128, + "grad_norm": 0.46628402843496697, + "learning_rate": 0.00018303879827647975, + "loss": 0.9319, + "step": 133 + }, + { + "epoch": 0.2144, + "grad_norm": 0.5646971847258934, + "learning_rate": 0.00018274883055477436, + "loss": 0.8479, + "step": 134 + }, + { + "epoch": 0.216, + "grad_norm": 0.43213337310707894, + "learning_rate": 0.00018245663893238075, + "loss": 0.7964, + "step": 135 + }, + { + "epoch": 0.2176, + "grad_norm": 0.48895432647000003, + "learning_rate": 0.00018216223126204007, + "loss": 0.8323, + "step": 136 + }, + { + "epoch": 0.2192, + "grad_norm": 0.46953643840086196, + "learning_rate": 0.00018186561545605054, + "loss": 0.8575, + "step": 137 + }, + { + "epoch": 0.2208, + "grad_norm": 0.46635817800169, + "learning_rate": 0.00018156679948605467, + "loss": 0.7786, + "step": 138 + }, + { + "epoch": 0.2224, + "grad_norm": 0.4863706332365572, + "learning_rate": 0.00018126579138282503, + "loss": 0.7945, + "step": 139 + }, + { + "epoch": 0.224, + "grad_norm": 0.49557735178852874, + "learning_rate": 0.0001809625992360485, + "loss": 0.8256, + "step": 140 + }, + { + "epoch": 0.2256, + "grad_norm": 0.5610746106786217, + "learning_rate": 0.00018065723119410884, + "loss": 0.8474, + "step": 141 + }, + { + "epoch": 0.2272, + "grad_norm": 0.45160520338706195, + "learning_rate": 0.00018034969546386757, + "loss": 0.7835, + "step": 142 + }, + { + "epoch": 0.2288, + "grad_norm": 0.4365881164231204, + "learning_rate": 0.0001800400003104436, + "loss": 0.7897, + "step": 143 + }, + { + "epoch": 0.2304, + "grad_norm": 0.5502156329221844, + "learning_rate": 0.00017972815405699103, + "loss": 0.843, + "step": 144 + }, + { + "epoch": 0.232, + "grad_norm": 0.46958433144630546, + "learning_rate": 0.00017941416508447536, + "loss": 0.8087, + "step": 145 + }, + { + "epoch": 0.2336, + "grad_norm": 0.47221562406214135, + "learning_rate": 0.0001790980418314484, + "loss": 0.8508, + "step": 146 + }, + { + "epoch": 0.2352, + "grad_norm": 0.5237199615035483, + "learning_rate": 0.00017877979279382135, + "loss": 0.8, + "step": 147 + }, + { + "epoch": 0.2368, + "grad_norm": 0.4376415127234786, + "learning_rate": 0.0001784594265246366, + "loss": 0.7777, + "step": 148 + }, + { + "epoch": 0.2384, + "grad_norm": 0.4169761952295241, + "learning_rate": 0.0001781369516338378, + "loss": 0.7176, + "step": 149 + }, + { + "epoch": 0.24, + "grad_norm": 0.54410249493475, + "learning_rate": 0.00017781237678803847, + "loss": 0.8416, + "step": 150 + }, + { + "epoch": 0.2416, + "grad_norm": 0.47043227050187336, + "learning_rate": 0.000177485710710289, + "loss": 0.7098, + "step": 151 + }, + { + "epoch": 0.2432, + "grad_norm": 0.40960863633131694, + "learning_rate": 0.00017715696217984235, + "loss": 0.7343, + "step": 152 + }, + { + "epoch": 0.2448, + "grad_norm": 0.5067507056604774, + "learning_rate": 0.00017682614003191807, + "loss": 0.8486, + "step": 153 + }, + { + "epoch": 0.2464, + "grad_norm": 0.4245163031138323, + "learning_rate": 0.00017649325315746478, + "loss": 0.7495, + "step": 154 + }, + { + "epoch": 0.248, + "grad_norm": 0.4556737057249965, + "learning_rate": 0.0001761583105029213, + "loss": 0.8479, + "step": 155 + }, + { + "epoch": 0.2496, + "grad_norm": 0.4493326167767346, + "learning_rate": 0.00017582132106997616, + "loss": 0.7581, + "step": 156 + }, + { + "epoch": 0.2512, + "grad_norm": 0.3716425384364705, + "learning_rate": 0.00017548229391532572, + "loss": 0.7431, + "step": 157 + }, + { + "epoch": 0.2528, + "grad_norm": 0.444819110511713, + "learning_rate": 0.00017514123815043074, + "loss": 0.7672, + "step": 158 + }, + { + "epoch": 0.2544, + "grad_norm": 0.6455832769852934, + "learning_rate": 0.00017479816294127152, + "loss": 0.8011, + "step": 159 + }, + { + "epoch": 0.256, + "grad_norm": 0.47385823913702435, + "learning_rate": 0.0001744530775081015, + "loss": 0.7675, + "step": 160 + }, + { + "epoch": 0.2576, + "grad_norm": 0.5493023801642081, + "learning_rate": 0.0001741059911251997, + "loss": 0.9084, + "step": 161 + }, + { + "epoch": 0.2592, + "grad_norm": 0.4614982241890986, + "learning_rate": 0.000173756913120621, + "loss": 0.8556, + "step": 162 + }, + { + "epoch": 0.2608, + "grad_norm": 0.5027741137636845, + "learning_rate": 0.00017340585287594604, + "loss": 0.9164, + "step": 163 + }, + { + "epoch": 0.2624, + "grad_norm": 0.4127974732677408, + "learning_rate": 0.0001730528198260285, + "loss": 0.746, + "step": 164 + }, + { + "epoch": 0.264, + "grad_norm": 0.4174843002501883, + "learning_rate": 0.00017269782345874203, + "loss": 0.7914, + "step": 165 + }, + { + "epoch": 0.2656, + "grad_norm": 0.5144471825402751, + "learning_rate": 0.00017234087331472497, + "loss": 0.8316, + "step": 166 + }, + { + "epoch": 0.2672, + "grad_norm": 0.4239583452770837, + "learning_rate": 0.00017198197898712404, + "loss": 0.7628, + "step": 167 + }, + { + "epoch": 0.2688, + "grad_norm": 0.5609278354318298, + "learning_rate": 0.00017162115012133643, + "loss": 0.9009, + "step": 168 + }, + { + "epoch": 0.2704, + "grad_norm": 0.5569568486822811, + "learning_rate": 0.00017125839641475072, + "loss": 0.8878, + "step": 169 + }, + { + "epoch": 0.272, + "grad_norm": 0.4666621535700966, + "learning_rate": 0.00017089372761648616, + "loss": 0.7921, + "step": 170 + }, + { + "epoch": 0.2736, + "grad_norm": 0.4428243771161841, + "learning_rate": 0.00017052715352713075, + "loss": 0.7353, + "step": 171 + }, + { + "epoch": 0.2752, + "grad_norm": 0.4714927066425922, + "learning_rate": 0.00017015868399847768, + "loss": 0.7925, + "step": 172 + }, + { + "epoch": 0.2768, + "grad_norm": 0.5064662007808188, + "learning_rate": 0.00016978832893326074, + "loss": 0.8268, + "step": 173 + }, + { + "epoch": 0.2784, + "grad_norm": 0.5196635747410672, + "learning_rate": 0.00016941609828488807, + "loss": 0.785, + "step": 174 + }, + { + "epoch": 0.28, + "grad_norm": 0.4933582591088359, + "learning_rate": 0.0001690420020571747, + "loss": 0.7448, + "step": 175 + }, + { + "epoch": 0.2816, + "grad_norm": 0.5666841217272961, + "learning_rate": 0.0001686660503040737, + "loss": 0.8215, + "step": 176 + }, + { + "epoch": 0.2832, + "grad_norm": 0.44991866863529384, + "learning_rate": 0.00016828825312940592, + "loss": 0.74, + "step": 177 + }, + { + "epoch": 0.2848, + "grad_norm": 0.5370065695691301, + "learning_rate": 0.0001679086206865886, + "loss": 0.8109, + "step": 178 + }, + { + "epoch": 0.2864, + "grad_norm": 0.48386915891784066, + "learning_rate": 0.00016752716317836229, + "loss": 0.8156, + "step": 179 + }, + { + "epoch": 0.288, + "grad_norm": 0.47624527776429076, + "learning_rate": 0.0001671438908565167, + "loss": 0.863, + "step": 180 + }, + { + "epoch": 0.2896, + "grad_norm": 0.46309055722372316, + "learning_rate": 0.00016675881402161536, + "loss": 0.7133, + "step": 181 + }, + { + "epoch": 0.2912, + "grad_norm": 0.5546169176698907, + "learning_rate": 0.0001663719430227186, + "loss": 0.8498, + "step": 182 + }, + { + "epoch": 0.2928, + "grad_norm": 0.5552794343943931, + "learning_rate": 0.00016598328825710533, + "loss": 0.8429, + "step": 183 + }, + { + "epoch": 0.2944, + "grad_norm": 0.42596936374424016, + "learning_rate": 0.000165592860169994, + "loss": 0.7344, + "step": 184 + }, + { + "epoch": 0.296, + "grad_norm": 0.4780424925511933, + "learning_rate": 0.00016520066925426144, + "loss": 0.7698, + "step": 185 + }, + { + "epoch": 0.2976, + "grad_norm": 0.4642570392124382, + "learning_rate": 0.0001648067260501611, + "loss": 0.8356, + "step": 186 + }, + { + "epoch": 0.2992, + "grad_norm": 0.44471874740393336, + "learning_rate": 0.0001644110411450398, + "loss": 0.8078, + "step": 187 + }, + { + "epoch": 0.3008, + "grad_norm": 0.41678047427736536, + "learning_rate": 0.00016401362517305296, + "loss": 0.7649, + "step": 188 + }, + { + "epoch": 0.3024, + "grad_norm": 0.5060691569695956, + "learning_rate": 0.00016361448881487914, + "loss": 0.7913, + "step": 189 + }, + { + "epoch": 0.304, + "grad_norm": 0.5092570360201568, + "learning_rate": 0.00016321364279743266, + "loss": 0.7979, + "step": 190 + }, + { + "epoch": 0.3056, + "grad_norm": 0.5519518990796427, + "learning_rate": 0.0001628110978935756, + "loss": 0.8106, + "step": 191 + }, + { + "epoch": 0.3072, + "grad_norm": 0.4608973951013133, + "learning_rate": 0.00016240686492182804, + "loss": 0.7798, + "step": 192 + }, + { + "epoch": 0.3088, + "grad_norm": 0.438865871044298, + "learning_rate": 0.00016200095474607753, + "loss": 0.7487, + "step": 193 + }, + { + "epoch": 0.3104, + "grad_norm": 0.44750886312268195, + "learning_rate": 0.00016159337827528685, + "loss": 0.8061, + "step": 194 + }, + { + "epoch": 0.312, + "grad_norm": 0.45558219396674005, + "learning_rate": 0.0001611841464632011, + "loss": 0.7705, + "step": 195 + }, + { + "epoch": 0.3136, + "grad_norm": 0.46146903515791504, + "learning_rate": 0.0001607732703080532, + "loss": 0.7486, + "step": 196 + }, + { + "epoch": 0.3152, + "grad_norm": 0.42258959370942567, + "learning_rate": 0.00016036076085226814, + "loss": 0.6807, + "step": 197 + }, + { + "epoch": 0.3168, + "grad_norm": 0.4420660188622748, + "learning_rate": 0.0001599466291821666, + "loss": 0.8124, + "step": 198 + }, + { + "epoch": 0.3184, + "grad_norm": 0.44872318502255715, + "learning_rate": 0.0001595308864276666, + "loss": 0.7467, + "step": 199 + }, + { + "epoch": 0.32, + "grad_norm": 0.48951210749650576, + "learning_rate": 0.0001591135437619847, + "loss": 0.7335, + "step": 200 + }, + { + "epoch": 0.3216, + "grad_norm": 0.5051382436063975, + "learning_rate": 0.0001586946124013354, + "loss": 0.8258, + "step": 201 + }, + { + "epoch": 0.3232, + "grad_norm": 0.4126253533370679, + "learning_rate": 0.0001582741036046301, + "loss": 0.7109, + "step": 202 + }, + { + "epoch": 0.3248, + "grad_norm": 0.39969454905380336, + "learning_rate": 0.00015785202867317407, + "loss": 0.7345, + "step": 203 + }, + { + "epoch": 0.3264, + "grad_norm": 0.4369673430067481, + "learning_rate": 0.00015742839895036305, + "loss": 0.6952, + "step": 204 + }, + { + "epoch": 0.328, + "grad_norm": 0.5253068641077018, + "learning_rate": 0.00015700322582137827, + "loss": 0.8515, + "step": 205 + }, + { + "epoch": 0.3296, + "grad_norm": 0.3759400648888865, + "learning_rate": 0.0001565765207128805, + "loss": 0.7759, + "step": 206 + }, + { + "epoch": 0.3312, + "grad_norm": 0.38825750674357584, + "learning_rate": 0.0001561482950927029, + "loss": 0.697, + "step": 207 + }, + { + "epoch": 0.3328, + "grad_norm": 0.43550078166914036, + "learning_rate": 0.00015571856046954285, + "loss": 0.7519, + "step": 208 + }, + { + "epoch": 0.3344, + "grad_norm": 0.4664952435365108, + "learning_rate": 0.00015528732839265272, + "loss": 0.7405, + "step": 209 + }, + { + "epoch": 0.336, + "grad_norm": 0.5219653886563514, + "learning_rate": 0.0001548546104515294, + "loss": 0.864, + "step": 210 + }, + { + "epoch": 0.3376, + "grad_norm": 0.5150644370748351, + "learning_rate": 0.00015442041827560274, + "loss": 0.8156, + "step": 211 + }, + { + "epoch": 0.3392, + "grad_norm": 0.38827105265454487, + "learning_rate": 0.00015398476353392323, + "loss": 0.7163, + "step": 212 + }, + { + "epoch": 0.3408, + "grad_norm": 0.4520282940991433, + "learning_rate": 0.00015354765793484834, + "loss": 0.7775, + "step": 213 + }, + { + "epoch": 0.3424, + "grad_norm": 0.4019485406898617, + "learning_rate": 0.00015310911322572753, + "loss": 0.7576, + "step": 214 + }, + { + "epoch": 0.344, + "grad_norm": 0.4715011902057589, + "learning_rate": 0.000152669141192587, + "loss": 0.722, + "step": 215 + }, + { + "epoch": 0.3456, + "grad_norm": 0.5281060367751487, + "learning_rate": 0.00015222775365981273, + "loss": 0.8446, + "step": 216 + }, + { + "epoch": 0.3472, + "grad_norm": 0.5490901682193271, + "learning_rate": 0.00015178496248983254, + "loss": 0.7485, + "step": 217 + }, + { + "epoch": 0.3488, + "grad_norm": 0.5550854172212718, + "learning_rate": 0.00015134077958279765, + "loss": 0.8158, + "step": 218 + }, + { + "epoch": 0.3504, + "grad_norm": 0.5539551532383808, + "learning_rate": 0.00015089521687626243, + "loss": 0.8125, + "step": 219 + }, + { + "epoch": 0.352, + "grad_norm": 0.45711728899539267, + "learning_rate": 0.000150448286344864, + "loss": 0.8025, + "step": 220 + }, + { + "epoch": 0.3536, + "grad_norm": 0.4557303163398866, + "learning_rate": 0.00015000000000000001, + "loss": 0.823, + "step": 221 + }, + { + "epoch": 0.3552, + "grad_norm": 0.42817988049854483, + "learning_rate": 0.00014955036988950618, + "loss": 0.8055, + "step": 222 + }, + { + "epoch": 0.3568, + "grad_norm": 0.405898803334239, + "learning_rate": 0.00014909940809733222, + "loss": 0.7766, + "step": 223 + }, + { + "epoch": 0.3584, + "grad_norm": 0.4070049345009079, + "learning_rate": 0.00014864712674321734, + "loss": 0.7065, + "step": 224 + }, + { + "epoch": 0.36, + "grad_norm": 0.4775835805503555, + "learning_rate": 0.00014819353798236427, + "loss": 0.8062, + "step": 225 + }, + { + "epoch": 0.3616, + "grad_norm": 0.42928661633445536, + "learning_rate": 0.00014773865400511272, + "loss": 0.7561, + "step": 226 + }, + { + "epoch": 0.3632, + "grad_norm": 0.5839596443542385, + "learning_rate": 0.00014728248703661182, + "loss": 0.806, + "step": 227 + }, + { + "epoch": 0.3648, + "grad_norm": 0.5608646212586603, + "learning_rate": 0.00014682504933649144, + "loss": 0.7526, + "step": 228 + }, + { + "epoch": 0.3664, + "grad_norm": 0.5443624713634069, + "learning_rate": 0.00014636635319853275, + "loss": 0.8007, + "step": 229 + }, + { + "epoch": 0.368, + "grad_norm": 0.4708628421536427, + "learning_rate": 0.00014590641095033787, + "loss": 0.7315, + "step": 230 + }, + { + "epoch": 0.3696, + "grad_norm": 0.3602042451103198, + "learning_rate": 0.00014544523495299842, + "loss": 0.7519, + "step": 231 + }, + { + "epoch": 0.3712, + "grad_norm": 0.4600262840841861, + "learning_rate": 0.0001449828376007636, + "loss": 0.7585, + "step": 232 + }, + { + "epoch": 0.3728, + "grad_norm": 0.45411860254567327, + "learning_rate": 0.0001445192313207067, + "loss": 0.7386, + "step": 233 + }, + { + "epoch": 0.3744, + "grad_norm": 0.5952105328243131, + "learning_rate": 0.0001440544285723915, + "loss": 0.7995, + "step": 234 + }, + { + "epoch": 0.376, + "grad_norm": 0.4315997635636127, + "learning_rate": 0.00014358844184753712, + "loss": 0.7063, + "step": 235 + }, + { + "epoch": 0.3776, + "grad_norm": 0.4457882641252247, + "learning_rate": 0.00014312128366968243, + "loss": 0.7835, + "step": 236 + }, + { + "epoch": 0.3792, + "grad_norm": 0.4051747892824553, + "learning_rate": 0.00014265296659384956, + "loss": 0.7272, + "step": 237 + }, + { + "epoch": 0.3808, + "grad_norm": 0.43009388687261124, + "learning_rate": 0.00014218350320620624, + "loss": 0.6833, + "step": 238 + }, + { + "epoch": 0.3824, + "grad_norm": 0.43541512528849646, + "learning_rate": 0.0001417129061237278, + "loss": 0.786, + "step": 239 + }, + { + "epoch": 0.384, + "grad_norm": 0.5336227600402277, + "learning_rate": 0.00014124118799385796, + "loss": 0.8376, + "step": 240 + }, + { + "epoch": 0.3856, + "grad_norm": 0.514637844230442, + "learning_rate": 0.00014076836149416887, + "loss": 0.8645, + "step": 241 + }, + { + "epoch": 0.3872, + "grad_norm": 0.460137726129797, + "learning_rate": 0.0001402944393320206, + "loss": 0.7485, + "step": 242 + }, + { + "epoch": 0.3888, + "grad_norm": 0.6221242818247743, + "learning_rate": 0.00013981943424421932, + "loss": 0.7667, + "step": 243 + }, + { + "epoch": 0.3904, + "grad_norm": 0.4618588089300471, + "learning_rate": 0.00013934335899667527, + "loss": 0.8215, + "step": 244 + }, + { + "epoch": 0.392, + "grad_norm": 0.4285571304149657, + "learning_rate": 0.00013886622638405952, + "loss": 0.751, + "step": 245 + }, + { + "epoch": 0.3936, + "grad_norm": 0.40258047650756856, + "learning_rate": 0.00013838804922946027, + "loss": 0.719, + "step": 246 + }, + { + "epoch": 0.3952, + "grad_norm": 0.41924632079464363, + "learning_rate": 0.00013790884038403795, + "loss": 0.7204, + "step": 247 + }, + { + "epoch": 0.3968, + "grad_norm": 0.4180521137734907, + "learning_rate": 0.00013742861272668012, + "loss": 0.7032, + "step": 248 + }, + { + "epoch": 0.3984, + "grad_norm": 0.5591578173202266, + "learning_rate": 0.00013694737916365517, + "loss": 0.8644, + "step": 249 + }, + { + "epoch": 0.4, + "grad_norm": 0.5129824870743946, + "learning_rate": 0.00013646515262826552, + "loss": 0.8502, + "step": 250 + }, + { + "epoch": 0.4016, + "grad_norm": 0.6307386849527056, + "learning_rate": 0.0001359819460805001, + "loss": 0.8539, + "step": 251 + }, + { + "epoch": 0.4032, + "grad_norm": 0.46736986952092735, + "learning_rate": 0.0001354977725066859, + "loss": 0.7708, + "step": 252 + }, + { + "epoch": 0.4048, + "grad_norm": 0.4799931475937041, + "learning_rate": 0.00013501264491913906, + "loss": 0.8015, + "step": 253 + }, + { + "epoch": 0.4064, + "grad_norm": 0.43476429346809675, + "learning_rate": 0.0001345265763558152, + "loss": 0.8577, + "step": 254 + }, + { + "epoch": 0.408, + "grad_norm": 0.45237012150052847, + "learning_rate": 0.00013403957987995882, + "loss": 0.7461, + "step": 255 + }, + { + "epoch": 0.4096, + "grad_norm": 0.5050499956650181, + "learning_rate": 0.0001335516685797525, + "loss": 0.8059, + "step": 256 + }, + { + "epoch": 0.4112, + "grad_norm": 0.4862202647945296, + "learning_rate": 0.00013306285556796495, + "loss": 0.8695, + "step": 257 + }, + { + "epoch": 0.4128, + "grad_norm": 0.40591057767369276, + "learning_rate": 0.00013257315398159864, + "loss": 0.6893, + "step": 258 + }, + { + "epoch": 0.4144, + "grad_norm": 0.4108968185982489, + "learning_rate": 0.00013208257698153677, + "loss": 0.7418, + "step": 259 + }, + { + "epoch": 0.416, + "grad_norm": 0.5100121891818531, + "learning_rate": 0.00013159113775218964, + "loss": 0.7532, + "step": 260 + }, + { + "epoch": 0.4176, + "grad_norm": 0.44386921266144647, + "learning_rate": 0.00013109884950114007, + "loss": 0.7848, + "step": 261 + }, + { + "epoch": 0.4192, + "grad_norm": 0.4528466297685063, + "learning_rate": 0.00013060572545878875, + "loss": 0.7398, + "step": 262 + }, + { + "epoch": 0.4208, + "grad_norm": 0.4491985544541127, + "learning_rate": 0.00013011177887799845, + "loss": 0.7668, + "step": 263 + }, + { + "epoch": 0.4224, + "grad_norm": 0.46701793529554486, + "learning_rate": 0.00012961702303373795, + "loss": 0.742, + "step": 264 + }, + { + "epoch": 0.424, + "grad_norm": 0.42062540861302994, + "learning_rate": 0.00012912147122272523, + "loss": 0.7292, + "step": 265 + }, + { + "epoch": 0.4256, + "grad_norm": 0.42052169271938605, + "learning_rate": 0.00012862513676307008, + "loss": 0.723, + "step": 266 + }, + { + "epoch": 0.4272, + "grad_norm": 0.415839073871717, + "learning_rate": 0.00012812803299391628, + "loss": 0.7236, + "step": 267 + }, + { + "epoch": 0.4288, + "grad_norm": 0.4914666164380902, + "learning_rate": 0.00012763017327508305, + "loss": 0.7554, + "step": 268 + }, + { + "epoch": 0.4304, + "grad_norm": 0.46799457192175714, + "learning_rate": 0.0001271315709867059, + "loss": 0.7626, + "step": 269 + }, + { + "epoch": 0.432, + "grad_norm": 0.410108044119609, + "learning_rate": 0.00012663223952887723, + "loss": 0.8287, + "step": 270 + }, + { + "epoch": 0.4336, + "grad_norm": 0.4091253324705526, + "learning_rate": 0.00012613219232128608, + "loss": 0.6915, + "step": 271 + }, + { + "epoch": 0.4352, + "grad_norm": 0.5009956691028776, + "learning_rate": 0.00012563144280285741, + "loss": 0.8084, + "step": 272 + }, + { + "epoch": 0.4368, + "grad_norm": 0.41570873163794475, + "learning_rate": 0.00012513000443139112, + "loss": 0.7516, + "step": 273 + }, + { + "epoch": 0.4384, + "grad_norm": 0.4489434399582018, + "learning_rate": 0.00012462789068320017, + "loss": 0.7754, + "step": 274 + }, + { + "epoch": 0.44, + "grad_norm": 0.4131301748977756, + "learning_rate": 0.00012412511505274844, + "loss": 0.7577, + "step": 275 + }, + { + "epoch": 0.4416, + "grad_norm": 0.4472588663356602, + "learning_rate": 0.00012362169105228826, + "loss": 0.7858, + "step": 276 + }, + { + "epoch": 0.4432, + "grad_norm": 0.5565824209551299, + "learning_rate": 0.000123117632211497, + "loss": 0.7866, + "step": 277 + }, + { + "epoch": 0.4448, + "grad_norm": 0.43623811369525867, + "learning_rate": 0.00012261295207711346, + "loss": 0.7859, + "step": 278 + }, + { + "epoch": 0.4464, + "grad_norm": 0.6975936438544309, + "learning_rate": 0.0001221076642125742, + "loss": 0.7157, + "step": 279 + }, + { + "epoch": 0.448, + "grad_norm": 0.42423336466106243, + "learning_rate": 0.00012160178219764837, + "loss": 0.6974, + "step": 280 + }, + { + "epoch": 0.4496, + "grad_norm": 0.40055370404173907, + "learning_rate": 0.00012109531962807332, + "loss": 0.7262, + "step": 281 + }, + { + "epoch": 0.4512, + "grad_norm": 0.39890372346188857, + "learning_rate": 0.00012058829011518896, + "loss": 0.7396, + "step": 282 + }, + { + "epoch": 0.4528, + "grad_norm": 0.5096735125273331, + "learning_rate": 0.00012008070728557186, + "loss": 0.778, + "step": 283 + }, + { + "epoch": 0.4544, + "grad_norm": 0.3795017666300686, + "learning_rate": 0.00011957258478066931, + "loss": 0.7481, + "step": 284 + }, + { + "epoch": 0.456, + "grad_norm": 0.4108604523234828, + "learning_rate": 0.00011906393625643244, + "loss": 0.7503, + "step": 285 + }, + { + "epoch": 0.4576, + "grad_norm": 0.36367980390858484, + "learning_rate": 0.00011855477538294935, + "loss": 0.7245, + "step": 286 + }, + { + "epoch": 0.4592, + "grad_norm": 0.3977646017526582, + "learning_rate": 0.00011804511584407763, + "loss": 0.7559, + "step": 287 + }, + { + "epoch": 0.4608, + "grad_norm": 0.39638159967335806, + "learning_rate": 0.00011753497133707679, + "loss": 0.7583, + "step": 288 + }, + { + "epoch": 0.4624, + "grad_norm": 0.42130562584263453, + "learning_rate": 0.00011702435557223987, + "loss": 0.7599, + "step": 289 + }, + { + "epoch": 0.464, + "grad_norm": 0.4077415100488722, + "learning_rate": 0.00011651328227252517, + "loss": 0.7852, + "step": 290 + }, + { + "epoch": 0.4656, + "grad_norm": 0.4456509250757706, + "learning_rate": 0.00011600176517318741, + "loss": 0.7719, + "step": 291 + }, + { + "epoch": 0.4672, + "grad_norm": 0.46395650572041225, + "learning_rate": 0.00011548981802140848, + "loss": 0.813, + "step": 292 + }, + { + "epoch": 0.4688, + "grad_norm": 0.5392629108164115, + "learning_rate": 0.00011497745457592816, + "loss": 0.8178, + "step": 293 + }, + { + "epoch": 0.4704, + "grad_norm": 0.4077572738017568, + "learning_rate": 0.00011446468860667421, + "loss": 0.6964, + "step": 294 + }, + { + "epoch": 0.472, + "grad_norm": 0.40929898598852493, + "learning_rate": 0.00011395153389439233, + "loss": 0.721, + "step": 295 + }, + { + "epoch": 0.4736, + "grad_norm": 0.46110445447232007, + "learning_rate": 0.00011343800423027582, + "loss": 0.7109, + "step": 296 + }, + { + "epoch": 0.4752, + "grad_norm": 0.359324747412484, + "learning_rate": 0.0001129241134155949, + "loss": 0.7509, + "step": 297 + }, + { + "epoch": 0.4768, + "grad_norm": 0.8336808335526302, + "learning_rate": 0.00011240987526132594, + "loss": 0.8277, + "step": 298 + }, + { + "epoch": 0.4784, + "grad_norm": 0.4620609468430462, + "learning_rate": 0.00011189530358778005, + "loss": 0.7921, + "step": 299 + }, + { + "epoch": 0.48, + "grad_norm": 0.48121311623043744, + "learning_rate": 0.00011138041222423177, + "loss": 0.7093, + "step": 300 + }, + { + "epoch": 0.4816, + "grad_norm": 0.4350895665343257, + "learning_rate": 0.00011086521500854745, + "loss": 0.7967, + "step": 301 + }, + { + "epoch": 0.4832, + "grad_norm": 0.5679574902032004, + "learning_rate": 0.00011034972578681338, + "loss": 0.8072, + "step": 302 + }, + { + "epoch": 0.4848, + "grad_norm": 0.46950631028416706, + "learning_rate": 0.00010983395841296348, + "loss": 0.7734, + "step": 303 + }, + { + "epoch": 0.4864, + "grad_norm": 0.570490021193672, + "learning_rate": 0.00010931792674840718, + "loss": 0.7301, + "step": 304 + }, + { + "epoch": 0.488, + "grad_norm": 0.40568313662242395, + "learning_rate": 0.00010880164466165674, + "loss": 0.6713, + "step": 305 + }, + { + "epoch": 0.4896, + "grad_norm": 0.47606013944566766, + "learning_rate": 0.00010828512602795462, + "loss": 0.7327, + "step": 306 + }, + { + "epoch": 0.4912, + "grad_norm": 0.44278344909344114, + "learning_rate": 0.00010776838472890065, + "loss": 0.7389, + "step": 307 + }, + { + "epoch": 0.4928, + "grad_norm": 0.4626487984327057, + "learning_rate": 0.00010725143465207867, + "loss": 0.7148, + "step": 308 + }, + { + "epoch": 0.4944, + "grad_norm": 0.41413130581808577, + "learning_rate": 0.00010673428969068364, + "loss": 0.7274, + "step": 309 + }, + { + "epoch": 0.496, + "grad_norm": 0.4690577728618916, + "learning_rate": 0.00010621696374314807, + "loss": 0.7047, + "step": 310 + }, + { + "epoch": 0.4976, + "grad_norm": 0.37772131967552147, + "learning_rate": 0.00010569947071276847, + "loss": 0.7371, + "step": 311 + }, + { + "epoch": 0.4992, + "grad_norm": 0.5241995894421716, + "learning_rate": 0.00010518182450733186, + "loss": 0.8145, + "step": 312 + }, + { + "epoch": 0.5008, + "grad_norm": 0.45138163052349395, + "learning_rate": 0.00010466403903874176, + "loss": 0.7504, + "step": 313 + }, + { + "epoch": 0.5024, + "grad_norm": 0.49809750128277064, + "learning_rate": 0.00010414612822264455, + "loss": 0.7833, + "step": 314 + }, + { + "epoch": 0.504, + "grad_norm": 0.3862594160358528, + "learning_rate": 0.00010362810597805526, + "loss": 0.6885, + "step": 315 + }, + { + "epoch": 0.5056, + "grad_norm": 0.3912616000490057, + "learning_rate": 0.0001031099862269837, + "loss": 0.7126, + "step": 316 + }, + { + "epoch": 0.5072, + "grad_norm": 0.42671759932825865, + "learning_rate": 0.00010259178289406011, + "loss": 0.7653, + "step": 317 + }, + { + "epoch": 0.5088, + "grad_norm": 0.43525088205981943, + "learning_rate": 0.00010207350990616107, + "loss": 0.6812, + "step": 318 + }, + { + "epoch": 0.5104, + "grad_norm": 0.44353296841742135, + "learning_rate": 0.0001015551811920351, + "loss": 0.7444, + "step": 319 + }, + { + "epoch": 0.512, + "grad_norm": 0.40347951408980726, + "learning_rate": 0.00010103681068192845, + "loss": 0.7487, + "step": 320 + }, + { + "epoch": 0.5136, + "grad_norm": 0.407563259800527, + "learning_rate": 0.00010051841230721065, + "loss": 0.6918, + "step": 321 + }, + { + "epoch": 0.5152, + "grad_norm": 0.4825945149443946, + "learning_rate": 0.0001, + "loss": 0.8286, + "step": 322 + }, + { + "epoch": 0.5168, + "grad_norm": 0.46888527256064205, + "learning_rate": 9.948158769278939e-05, + "loss": 0.7278, + "step": 323 + }, + { + "epoch": 0.5184, + "grad_norm": 0.35415300739627886, + "learning_rate": 9.896318931807155e-05, + "loss": 0.7081, + "step": 324 + }, + { + "epoch": 0.52, + "grad_norm": 0.4174570287002455, + "learning_rate": 9.844481880796491e-05, + "loss": 0.7862, + "step": 325 + }, + { + "epoch": 0.5216, + "grad_norm": 0.5042407232677681, + "learning_rate": 9.792649009383899e-05, + "loss": 0.8516, + "step": 326 + }, + { + "epoch": 0.5232, + "grad_norm": 0.4019155298012977, + "learning_rate": 9.740821710593989e-05, + "loss": 0.6205, + "step": 327 + }, + { + "epoch": 0.5248, + "grad_norm": 0.47936001841636144, + "learning_rate": 9.689001377301633e-05, + "loss": 0.6954, + "step": 328 + }, + { + "epoch": 0.5264, + "grad_norm": 0.45559451960122294, + "learning_rate": 9.637189402194476e-05, + "loss": 0.7448, + "step": 329 + }, + { + "epoch": 0.528, + "grad_norm": 0.3725050996355319, + "learning_rate": 9.585387177735547e-05, + "loss": 0.6761, + "step": 330 + }, + { + "epoch": 0.5296, + "grad_norm": 0.43152640749522186, + "learning_rate": 9.533596096125825e-05, + "loss": 0.8228, + "step": 331 + }, + { + "epoch": 0.5312, + "grad_norm": 0.5019787195538405, + "learning_rate": 9.481817549266817e-05, + "loss": 0.6953, + "step": 332 + }, + { + "epoch": 0.5328, + "grad_norm": 0.3933828775717379, + "learning_rate": 9.430052928723153e-05, + "loss": 0.6986, + "step": 333 + }, + { + "epoch": 0.5344, + "grad_norm": 0.46109184390562064, + "learning_rate": 9.378303625685195e-05, + "loss": 0.7974, + "step": 334 + }, + { + "epoch": 0.536, + "grad_norm": 0.3751402629468758, + "learning_rate": 9.326571030931637e-05, + "loss": 0.702, + "step": 335 + }, + { + "epoch": 0.5376, + "grad_norm": 0.41075744550477633, + "learning_rate": 9.274856534792138e-05, + "loss": 0.715, + "step": 336 + }, + { + "epoch": 0.5392, + "grad_norm": 0.5443171673871285, + "learning_rate": 9.223161527109937e-05, + "loss": 0.7848, + "step": 337 + }, + { + "epoch": 0.5408, + "grad_norm": 0.4429556434128808, + "learning_rate": 9.171487397204539e-05, + "loss": 0.7455, + "step": 338 + }, + { + "epoch": 0.5424, + "grad_norm": 0.4083669981733497, + "learning_rate": 9.119835533834331e-05, + "loss": 0.6803, + "step": 339 + }, + { + "epoch": 0.544, + "grad_norm": 0.4715431488867771, + "learning_rate": 9.068207325159284e-05, + "loss": 0.8047, + "step": 340 + }, + { + "epoch": 0.5456, + "grad_norm": 0.38448011267664034, + "learning_rate": 9.016604158703654e-05, + "loss": 0.713, + "step": 341 + }, + { + "epoch": 0.5472, + "grad_norm": 0.42425944294235907, + "learning_rate": 8.965027421318665e-05, + "loss": 0.748, + "step": 342 + }, + { + "epoch": 0.5488, + "grad_norm": 0.4379812604643885, + "learning_rate": 8.913478499145254e-05, + "loss": 0.7697, + "step": 343 + }, + { + "epoch": 0.5504, + "grad_norm": 0.6209340714517821, + "learning_rate": 8.861958777576827e-05, + "loss": 0.7874, + "step": 344 + }, + { + "epoch": 0.552, + "grad_norm": 0.44201198942465997, + "learning_rate": 8.810469641222001e-05, + "loss": 0.7699, + "step": 345 + }, + { + "epoch": 0.5536, + "grad_norm": 0.4764254770834733, + "learning_rate": 8.759012473867407e-05, + "loss": 0.7901, + "step": 346 + }, + { + "epoch": 0.5552, + "grad_norm": 0.4096441586651275, + "learning_rate": 8.707588658440511e-05, + "loss": 0.7228, + "step": 347 + }, + { + "epoch": 0.5568, + "grad_norm": 0.3969513853243152, + "learning_rate": 8.656199576972423e-05, + "loss": 0.7529, + "step": 348 + }, + { + "epoch": 0.5584, + "grad_norm": 0.5365493484110804, + "learning_rate": 8.604846610560771e-05, + "loss": 0.8709, + "step": 349 + }, + { + "epoch": 0.56, + "grad_norm": 0.43614872993323006, + "learning_rate": 8.553531139332582e-05, + "loss": 0.676, + "step": 350 + }, + { + "epoch": 0.5616, + "grad_norm": 0.5092788403946433, + "learning_rate": 8.502254542407186e-05, + "loss": 0.6988, + "step": 351 + }, + { + "epoch": 0.5632, + "grad_norm": 0.410847615267607, + "learning_rate": 8.451018197859153e-05, + "loss": 0.7271, + "step": 352 + }, + { + "epoch": 0.5648, + "grad_norm": 0.47391115175094317, + "learning_rate": 8.399823482681262e-05, + "loss": 0.7974, + "step": 353 + }, + { + "epoch": 0.5664, + "grad_norm": 0.5095539719795574, + "learning_rate": 8.348671772747487e-05, + "loss": 0.7132, + "step": 354 + }, + { + "epoch": 0.568, + "grad_norm": 0.4571194378914754, + "learning_rate": 8.297564442776014e-05, + "loss": 0.6577, + "step": 355 + }, + { + "epoch": 0.5696, + "grad_norm": 0.47737735497939765, + "learning_rate": 8.246502866292324e-05, + "loss": 0.784, + "step": 356 + }, + { + "epoch": 0.5712, + "grad_norm": 0.38353722509787763, + "learning_rate": 8.195488415592238e-05, + "loss": 0.6723, + "step": 357 + }, + { + "epoch": 0.5728, + "grad_norm": 0.5695389740488869, + "learning_rate": 8.144522461705067e-05, + "loss": 0.8151, + "step": 358 + }, + { + "epoch": 0.5744, + "grad_norm": 0.5391637529898152, + "learning_rate": 8.093606374356759e-05, + "loss": 0.7274, + "step": 359 + }, + { + "epoch": 0.576, + "grad_norm": 0.4006072175621919, + "learning_rate": 8.042741521933071e-05, + "loss": 0.7434, + "step": 360 + }, + { + "epoch": 0.5776, + "grad_norm": 0.4222671576070959, + "learning_rate": 7.991929271442817e-05, + "loss": 0.7428, + "step": 361 + }, + { + "epoch": 0.5792, + "grad_norm": 0.3976579330761039, + "learning_rate": 7.941170988481108e-05, + "loss": 0.7616, + "step": 362 + }, + { + "epoch": 0.5808, + "grad_norm": 0.42255730442297545, + "learning_rate": 7.89046803719267e-05, + "loss": 0.7609, + "step": 363 + }, + { + "epoch": 0.5824, + "grad_norm": 0.41246860766662846, + "learning_rate": 7.839821780235168e-05, + "loss": 0.7043, + "step": 364 + }, + { + "epoch": 0.584, + "grad_norm": 0.4910922066018139, + "learning_rate": 7.789233578742582e-05, + "loss": 0.8279, + "step": 365 + }, + { + "epoch": 0.5856, + "grad_norm": 0.4583975979375754, + "learning_rate": 7.738704792288655e-05, + "loss": 0.768, + "step": 366 + }, + { + "epoch": 0.5872, + "grad_norm": 0.39329145110417946, + "learning_rate": 7.688236778850306e-05, + "loss": 0.6895, + "step": 367 + }, + { + "epoch": 0.5888, + "grad_norm": 0.3889200786137619, + "learning_rate": 7.637830894771175e-05, + "loss": 0.7061, + "step": 368 + }, + { + "epoch": 0.5904, + "grad_norm": 0.4678598370358913, + "learning_rate": 7.587488494725157e-05, + "loss": 0.7445, + "step": 369 + }, + { + "epoch": 0.592, + "grad_norm": 0.5114556160689178, + "learning_rate": 7.537210931679987e-05, + "loss": 0.7812, + "step": 370 + }, + { + "epoch": 0.5936, + "grad_norm": 0.46430482828067804, + "learning_rate": 7.48699955686089e-05, + "loss": 0.8086, + "step": 371 + }, + { + "epoch": 0.5952, + "grad_norm": 0.4652368825183028, + "learning_rate": 7.43685571971426e-05, + "loss": 0.7996, + "step": 372 + }, + { + "epoch": 0.5968, + "grad_norm": 0.4156483527126143, + "learning_rate": 7.386780767871397e-05, + "loss": 0.7143, + "step": 373 + }, + { + "epoch": 0.5984, + "grad_norm": 0.42072838682916724, + "learning_rate": 7.336776047112276e-05, + "loss": 0.757, + "step": 374 + }, + { + "epoch": 0.6, + "grad_norm": 0.4610948866880415, + "learning_rate": 7.286842901329412e-05, + "loss": 0.7254, + "step": 375 + }, + { + "epoch": 0.6016, + "grad_norm": 0.5409017437356095, + "learning_rate": 7.236982672491698e-05, + "loss": 0.78, + "step": 376 + }, + { + "epoch": 0.6032, + "grad_norm": 0.32624499419219716, + "learning_rate": 7.187196700608373e-05, + "loss": 0.62, + "step": 377 + }, + { + "epoch": 0.6048, + "grad_norm": 0.38165761820976635, + "learning_rate": 7.137486323692995e-05, + "loss": 0.7645, + "step": 378 + }, + { + "epoch": 0.6064, + "grad_norm": 0.3828166260395758, + "learning_rate": 7.087852877727481e-05, + "loss": 0.6832, + "step": 379 + }, + { + "epoch": 0.608, + "grad_norm": 0.4046787667999106, + "learning_rate": 7.038297696626206e-05, + "loss": 0.6513, + "step": 380 + }, + { + "epoch": 0.6096, + "grad_norm": 0.3643996551478938, + "learning_rate": 6.988822112200156e-05, + "loss": 0.6731, + "step": 381 + }, + { + "epoch": 0.6112, + "grad_norm": 0.5878438451616897, + "learning_rate": 6.939427454121128e-05, + "loss": 0.9034, + "step": 382 + }, + { + "epoch": 0.6128, + "grad_norm": 0.43507716349366665, + "learning_rate": 6.890115049885994e-05, + "loss": 0.7459, + "step": 383 + }, + { + "epoch": 0.6144, + "grad_norm": 0.4675199604593121, + "learning_rate": 6.84088622478104e-05, + "loss": 0.6524, + "step": 384 + }, + { + "epoch": 0.616, + "grad_norm": 0.5220350889448834, + "learning_rate": 6.791742301846326e-05, + "loss": 0.8117, + "step": 385 + }, + { + "epoch": 0.6176, + "grad_norm": 0.38381197210212664, + "learning_rate": 6.742684601840141e-05, + "loss": 0.7285, + "step": 386 + }, + { + "epoch": 0.6192, + "grad_norm": 0.39760158207342916, + "learning_rate": 6.693714443203507e-05, + "loss": 0.7173, + "step": 387 + }, + { + "epoch": 0.6208, + "grad_norm": 0.3884490761221106, + "learning_rate": 6.644833142024751e-05, + "loss": 0.7115, + "step": 388 + }, + { + "epoch": 0.6224, + "grad_norm": 0.43957523114488356, + "learning_rate": 6.59604201200412e-05, + "loss": 0.798, + "step": 389 + }, + { + "epoch": 0.624, + "grad_norm": 0.4333978171844909, + "learning_rate": 6.547342364418481e-05, + "loss": 0.8164, + "step": 390 + }, + { + "epoch": 0.6256, + "grad_norm": 0.3732803287774478, + "learning_rate": 6.498735508086093e-05, + "loss": 0.7454, + "step": 391 + }, + { + "epoch": 0.6272, + "grad_norm": 0.3794733664568056, + "learning_rate": 6.450222749331414e-05, + "loss": 0.7613, + "step": 392 + }, + { + "epoch": 0.6288, + "grad_norm": 0.41549742261080647, + "learning_rate": 6.40180539194999e-05, + "loss": 0.7869, + "step": 393 + }, + { + "epoch": 0.6304, + "grad_norm": 0.4855203473158915, + "learning_rate": 6.35348473717345e-05, + "loss": 0.8364, + "step": 394 + }, + { + "epoch": 0.632, + "grad_norm": 0.5197083474910514, + "learning_rate": 6.305262083634488e-05, + "loss": 0.8886, + "step": 395 + }, + { + "epoch": 0.6336, + "grad_norm": 0.42369930329880323, + "learning_rate": 6.25713872733199e-05, + "loss": 0.7242, + "step": 396 + }, + { + "epoch": 0.6352, + "grad_norm": 0.4699463590136928, + "learning_rate": 6.209115961596208e-05, + "loss": 0.7397, + "step": 397 + }, + { + "epoch": 0.6368, + "grad_norm": 0.431419333342201, + "learning_rate": 6.161195077053976e-05, + "loss": 0.7159, + "step": 398 + }, + { + "epoch": 0.6384, + "grad_norm": 0.387076267269984, + "learning_rate": 6.113377361594049e-05, + "loss": 0.6996, + "step": 399 + }, + { + "epoch": 0.64, + "grad_norm": 0.39386317351097094, + "learning_rate": 6.065664100332478e-05, + "loss": 0.6947, + "step": 400 + }, + { + "epoch": 0.6416, + "grad_norm": 0.4080080280633503, + "learning_rate": 6.018056575578075e-05, + "loss": 0.7187, + "step": 401 + }, + { + "epoch": 0.6432, + "grad_norm": 0.4418437822394713, + "learning_rate": 5.970556066797941e-05, + "loss": 0.7051, + "step": 402 + }, + { + "epoch": 0.6448, + "grad_norm": 0.39331968967272185, + "learning_rate": 5.923163850583113e-05, + "loss": 0.6688, + "step": 403 + }, + { + "epoch": 0.6464, + "grad_norm": 0.4753675991971554, + "learning_rate": 5.875881200614207e-05, + "loss": 0.747, + "step": 404 + }, + { + "epoch": 0.648, + "grad_norm": 0.4301540427697761, + "learning_rate": 5.828709387627218e-05, + "loss": 0.6812, + "step": 405 + }, + { + "epoch": 0.6496, + "grad_norm": 0.44731642973716046, + "learning_rate": 5.781649679379378e-05, + "loss": 0.7549, + "step": 406 + }, + { + "epoch": 0.6512, + "grad_norm": 0.43316276103068396, + "learning_rate": 5.73470334061505e-05, + "loss": 0.7612, + "step": 407 + }, + { + "epoch": 0.6528, + "grad_norm": 0.5390689427095837, + "learning_rate": 5.687871633031754e-05, + "loss": 0.7472, + "step": 408 + }, + { + "epoch": 0.6544, + "grad_norm": 0.40013520823674165, + "learning_rate": 5.6411558152462894e-05, + "loss": 0.6761, + "step": 409 + }, + { + "epoch": 0.656, + "grad_norm": 0.39368096794889446, + "learning_rate": 5.5945571427608526e-05, + "loss": 0.7313, + "step": 410 + }, + { + "epoch": 0.6576, + "grad_norm": 0.46588282500462086, + "learning_rate": 5.54807686792933e-05, + "loss": 0.785, + "step": 411 + }, + { + "epoch": 0.6592, + "grad_norm": 0.456856055688501, + "learning_rate": 5.501716239923642e-05, + "loss": 0.6943, + "step": 412 + }, + { + "epoch": 0.6608, + "grad_norm": 0.4852444095134256, + "learning_rate": 5.4554765047001613e-05, + "loss": 0.7224, + "step": 413 + }, + { + "epoch": 0.6624, + "grad_norm": 0.4396387069700316, + "learning_rate": 5.4093589049662175e-05, + "loss": 0.7541, + "step": 414 + }, + { + "epoch": 0.664, + "grad_norm": 0.4586911399489298, + "learning_rate": 5.363364680146725e-05, + "loss": 0.7346, + "step": 415 + }, + { + "epoch": 0.6656, + "grad_norm": 0.38451486127934603, + "learning_rate": 5.31749506635086e-05, + "loss": 0.7263, + "step": 416 + }, + { + "epoch": 0.6672, + "grad_norm": 0.3734322129644761, + "learning_rate": 5.271751296338823e-05, + "loss": 0.7304, + "step": 417 + }, + { + "epoch": 0.6688, + "grad_norm": 0.4219042551455218, + "learning_rate": 5.226134599488728e-05, + "loss": 0.6699, + "step": 418 + }, + { + "epoch": 0.6704, + "grad_norm": 0.44381594455152107, + "learning_rate": 5.180646201763577e-05, + "loss": 0.7627, + "step": 419 + }, + { + "epoch": 0.672, + "grad_norm": 0.41809763823810103, + "learning_rate": 5.135287325678271e-05, + "loss": 0.6691, + "step": 420 + }, + { + "epoch": 0.6736, + "grad_norm": 0.4663430287654327, + "learning_rate": 5.090059190266779e-05, + "loss": 0.7223, + "step": 421 + }, + { + "epoch": 0.6752, + "grad_norm": 0.5170665655234487, + "learning_rate": 5.0449630110493836e-05, + "loss": 0.6877, + "step": 422 + }, + { + "epoch": 0.6768, + "grad_norm": 0.5405806804681932, + "learning_rate": 5.000000000000002e-05, + "loss": 0.7992, + "step": 423 + }, + { + "epoch": 0.6784, + "grad_norm": 0.34421740208058965, + "learning_rate": 4.955171365513603e-05, + "loss": 0.6877, + "step": 424 + }, + { + "epoch": 0.68, + "grad_norm": 0.4252508028678407, + "learning_rate": 4.9104783123737566e-05, + "loss": 0.6885, + "step": 425 + }, + { + "epoch": 0.6816, + "grad_norm": 0.6942440541587961, + "learning_rate": 4.865922041720239e-05, + "loss": 0.7351, + "step": 426 + }, + { + "epoch": 0.6832, + "grad_norm": 0.49142346057250874, + "learning_rate": 4.821503751016746e-05, + "loss": 0.7834, + "step": 427 + }, + { + "epoch": 0.6848, + "grad_norm": 0.40869861841731175, + "learning_rate": 4.777224634018732e-05, + "loss": 0.7305, + "step": 428 + }, + { + "epoch": 0.6864, + "grad_norm": 0.5489019189247966, + "learning_rate": 4.733085880741301e-05, + "loss": 0.7375, + "step": 429 + }, + { + "epoch": 0.688, + "grad_norm": 0.5050582962020529, + "learning_rate": 4.689088677427249e-05, + "loss": 0.7412, + "step": 430 + }, + { + "epoch": 0.6896, + "grad_norm": 0.4262794392530901, + "learning_rate": 4.645234206515171e-05, + "loss": 0.7407, + "step": 431 + }, + { + "epoch": 0.6912, + "grad_norm": 0.5015819918976162, + "learning_rate": 4.6015236466076747e-05, + "loss": 0.8035, + "step": 432 + }, + { + "epoch": 0.6928, + "grad_norm": 0.43399711540507313, + "learning_rate": 4.5579581724397255e-05, + "loss": 0.7248, + "step": 433 + }, + { + "epoch": 0.6944, + "grad_norm": 0.4359167953464953, + "learning_rate": 4.514538954847064e-05, + "loss": 0.6538, + "step": 434 + }, + { + "epoch": 0.696, + "grad_norm": 0.5158105983586037, + "learning_rate": 4.471267160734731e-05, + "loss": 0.7668, + "step": 435 + }, + { + "epoch": 0.6976, + "grad_norm": 0.393681478987024, + "learning_rate": 4.428143953045717e-05, + "loss": 0.6613, + "step": 436 + }, + { + "epoch": 0.6992, + "grad_norm": 0.6203780278226882, + "learning_rate": 4.385170490729712e-05, + "loss": 0.8716, + "step": 437 + }, + { + "epoch": 0.7008, + "grad_norm": 0.44289616391666736, + "learning_rate": 4.342347928711953e-05, + "loss": 0.6449, + "step": 438 + }, + { + "epoch": 0.7024, + "grad_norm": 0.42697599683590876, + "learning_rate": 4.2996774178621736e-05, + "loss": 0.8015, + "step": 439 + }, + { + "epoch": 0.704, + "grad_norm": 0.4410676387461542, + "learning_rate": 4.257160104963696e-05, + "loss": 0.7096, + "step": 440 + }, + { + "epoch": 0.7056, + "grad_norm": 0.3890591971991562, + "learning_rate": 4.2147971326825966e-05, + "loss": 0.7553, + "step": 441 + }, + { + "epoch": 0.7072, + "grad_norm": 0.3943773090859102, + "learning_rate": 4.172589639536991e-05, + "loss": 0.6896, + "step": 442 + }, + { + "epoch": 0.7088, + "grad_norm": 0.3952617637753136, + "learning_rate": 4.130538759866457e-05, + "loss": 0.675, + "step": 443 + }, + { + "epoch": 0.7104, + "grad_norm": 0.4841894572166325, + "learning_rate": 4.088645623801534e-05, + "loss": 0.7346, + "step": 444 + }, + { + "epoch": 0.712, + "grad_norm": 0.4366709343004436, + "learning_rate": 4.046911357233343e-05, + "loss": 0.7471, + "step": 445 + }, + { + "epoch": 0.7136, + "grad_norm": 0.42191672845476863, + "learning_rate": 4.00533708178334e-05, + "loss": 0.6509, + "step": 446 + }, + { + "epoch": 0.7152, + "grad_norm": 0.3775530356391045, + "learning_rate": 3.963923914773187e-05, + "loss": 0.733, + "step": 447 + }, + { + "epoch": 0.7168, + "grad_norm": 0.39105134753052495, + "learning_rate": 3.922672969194686e-05, + "loss": 0.6837, + "step": 448 + }, + { + "epoch": 0.7184, + "grad_norm": 0.43353346912673657, + "learning_rate": 3.8815853536798904e-05, + "loss": 0.6601, + "step": 449 + }, + { + "epoch": 0.72, + "grad_norm": 0.5872624794785681, + "learning_rate": 3.840662172471315e-05, + "loss": 0.7603, + "step": 450 + }, + { + "epoch": 0.7216, + "grad_norm": 0.3759509267407406, + "learning_rate": 3.79990452539225e-05, + "loss": 0.6827, + "step": 451 + }, + { + "epoch": 0.7232, + "grad_norm": 0.38646793874726243, + "learning_rate": 3.759313507817196e-05, + "loss": 0.6838, + "step": 452 + }, + { + "epoch": 0.7248, + "grad_norm": 0.41969699299771746, + "learning_rate": 3.7188902106424416e-05, + "loss": 0.7628, + "step": 453 + }, + { + "epoch": 0.7264, + "grad_norm": 0.4213752239198554, + "learning_rate": 3.678635720256737e-05, + "loss": 0.7109, + "step": 454 + }, + { + "epoch": 0.728, + "grad_norm": 0.4646135767858213, + "learning_rate": 3.638551118512089e-05, + "loss": 0.652, + "step": 455 + }, + { + "epoch": 0.7296, + "grad_norm": 0.45933925738321174, + "learning_rate": 3.5986374826947066e-05, + "loss": 0.809, + "step": 456 + }, + { + "epoch": 0.7312, + "grad_norm": 0.41950659530606604, + "learning_rate": 3.558895885496023e-05, + "loss": 0.7318, + "step": 457 + }, + { + "epoch": 0.7328, + "grad_norm": 0.4390391420303596, + "learning_rate": 3.519327394983888e-05, + "loss": 0.6524, + "step": 458 + }, + { + "epoch": 0.7344, + "grad_norm": 0.4205145624082347, + "learning_rate": 3.479933074573858e-05, + "loss": 0.7154, + "step": 459 + }, + { + "epoch": 0.736, + "grad_norm": 0.43775520420161573, + "learning_rate": 3.440713983000601e-05, + "loss": 0.7427, + "step": 460 + }, + { + "epoch": 0.7376, + "grad_norm": 0.443039095805843, + "learning_rate": 3.401671174289469e-05, + "loss": 0.7381, + "step": 461 + }, + { + "epoch": 0.7392, + "grad_norm": 0.40446606491238196, + "learning_rate": 3.362805697728145e-05, + "loss": 0.7041, + "step": 462 + }, + { + "epoch": 0.7408, + "grad_norm": 0.4565815989336157, + "learning_rate": 3.324118597838464e-05, + "loss": 0.7207, + "step": 463 + }, + { + "epoch": 0.7424, + "grad_norm": 0.40131850955302295, + "learning_rate": 3.285610914348332e-05, + "loss": 0.6876, + "step": 464 + }, + { + "epoch": 0.744, + "grad_norm": 0.44971058723052154, + "learning_rate": 3.2472836821637744e-05, + "loss": 0.7439, + "step": 465 + }, + { + "epoch": 0.7456, + "grad_norm": 0.46751286290328736, + "learning_rate": 3.209137931341143e-05, + "loss": 0.7202, + "step": 466 + }, + { + "epoch": 0.7472, + "grad_norm": 0.38823586195199966, + "learning_rate": 3.1711746870594086e-05, + "loss": 0.6626, + "step": 467 + }, + { + "epoch": 0.7488, + "grad_norm": 0.4451744270156566, + "learning_rate": 3.1333949695926324e-05, + "loss": 0.7288, + "step": 468 + }, + { + "epoch": 0.7504, + "grad_norm": 0.6022477158409618, + "learning_rate": 3.0957997942825336e-05, + "loss": 0.796, + "step": 469 + }, + { + "epoch": 0.752, + "grad_norm": 0.539410763170956, + "learning_rate": 3.058390171511196e-05, + "loss": 0.69, + "step": 470 + }, + { + "epoch": 0.7536, + "grad_norm": 0.45742363917437995, + "learning_rate": 3.021167106673928e-05, + "loss": 0.744, + "step": 471 + }, + { + "epoch": 0.7552, + "grad_norm": 0.47344603336882474, + "learning_rate": 2.9841316001522347e-05, + "loss": 0.728, + "step": 472 + }, + { + "epoch": 0.7568, + "grad_norm": 0.4135333089243574, + "learning_rate": 2.9472846472869298e-05, + "loss": 0.6671, + "step": 473 + }, + { + "epoch": 0.7584, + "grad_norm": 0.43002045441598563, + "learning_rate": 2.9106272383513835e-05, + "loss": 0.6231, + "step": 474 + }, + { + "epoch": 0.76, + "grad_norm": 0.42317314012568297, + "learning_rate": 2.874160358524931e-05, + "loss": 0.7195, + "step": 475 + }, + { + "epoch": 0.7616, + "grad_norm": 0.46626381925481214, + "learning_rate": 2.8378849878663628e-05, + "loss": 0.6961, + "step": 476 + }, + { + "epoch": 0.7632, + "grad_norm": 0.4136562321861586, + "learning_rate": 2.8018021012875994e-05, + "loss": 0.6987, + "step": 477 + }, + { + "epoch": 0.7648, + "grad_norm": 0.4784975974141901, + "learning_rate": 2.7659126685275027e-05, + "loss": 0.6789, + "step": 478 + }, + { + "epoch": 0.7664, + "grad_norm": 0.40533604299838283, + "learning_rate": 2.7302176541257986e-05, + "loss": 0.7154, + "step": 479 + }, + { + "epoch": 0.768, + "grad_norm": 0.4206369423773519, + "learning_rate": 2.6947180173971508e-05, + "loss": 0.6601, + "step": 480 + }, + { + "epoch": 0.7696, + "grad_norm": 0.42757838826384376, + "learning_rate": 2.659414712405398e-05, + "loss": 0.6631, + "step": 481 + }, + { + "epoch": 0.7712, + "grad_norm": 0.5391649221416999, + "learning_rate": 2.6243086879379e-05, + "loss": 0.7564, + "step": 482 + }, + { + "epoch": 0.7728, + "grad_norm": 0.3904100841608839, + "learning_rate": 2.5894008874800325e-05, + "loss": 0.6596, + "step": 483 + }, + { + "epoch": 0.7744, + "grad_norm": 0.42062545130796897, + "learning_rate": 2.5546922491898495e-05, + "loss": 0.7431, + "step": 484 + }, + { + "epoch": 0.776, + "grad_norm": 0.4857018013340481, + "learning_rate": 2.5201837058728505e-05, + "loss": 0.7497, + "step": 485 + }, + { + "epoch": 0.7776, + "grad_norm": 0.36259452986829344, + "learning_rate": 2.485876184956928e-05, + "loss": 0.6305, + "step": 486 + }, + { + "epoch": 0.7792, + "grad_norm": 0.37515589328507154, + "learning_rate": 2.451770608467432e-05, + "loss": 0.7014, + "step": 487 + }, + { + "epoch": 0.7808, + "grad_norm": 0.42845416708723516, + "learning_rate": 2.417867893002387e-05, + "loss": 0.6606, + "step": 488 + }, + { + "epoch": 0.7824, + "grad_norm": 0.4553734430542505, + "learning_rate": 2.3841689497078746e-05, + "loss": 0.763, + "step": 489 + }, + { + "epoch": 0.784, + "grad_norm": 0.40015155573782624, + "learning_rate": 2.3506746842535242e-05, + "loss": 0.7534, + "step": 490 + }, + { + "epoch": 0.7856, + "grad_norm": 0.5208900301952243, + "learning_rate": 2.3173859968081944e-05, + "loss": 0.7815, + "step": 491 + }, + { + "epoch": 0.7872, + "grad_norm": 0.3503748165237743, + "learning_rate": 2.2843037820157675e-05, + "loss": 0.6909, + "step": 492 + }, + { + "epoch": 0.7888, + "grad_norm": 0.3908558731476936, + "learning_rate": 2.251428928971102e-05, + "loss": 0.6813, + "step": 493 + }, + { + "epoch": 0.7904, + "grad_norm": 0.4687428745927545, + "learning_rate": 2.2187623211961562e-05, + "loss": 0.6946, + "step": 494 + }, + { + "epoch": 0.792, + "grad_norm": 0.4332820639854068, + "learning_rate": 2.1863048366162208e-05, + "loss": 0.7331, + "step": 495 + }, + { + "epoch": 0.7936, + "grad_norm": 0.42723456851448505, + "learning_rate": 2.1540573475363402e-05, + "loss": 0.6618, + "step": 496 + }, + { + "epoch": 0.7952, + "grad_norm": 0.46402871060489587, + "learning_rate": 2.1220207206178688e-05, + "loss": 0.6508, + "step": 497 + }, + { + "epoch": 0.7968, + "grad_norm": 0.40578292960530027, + "learning_rate": 2.0901958168551638e-05, + "loss": 0.7229, + "step": 498 + }, + { + "epoch": 0.7984, + "grad_norm": 0.43520879496687526, + "learning_rate": 2.058583491552465e-05, + "loss": 0.7499, + "step": 499 + }, + { + "epoch": 0.8, + "grad_norm": 0.4536989641209546, + "learning_rate": 2.027184594300898e-05, + "loss": 0.6821, + "step": 500 + }, + { + "epoch": 0.8016, + "grad_norm": 0.542222972927613, + "learning_rate": 1.995999968955641e-05, + "loss": 0.8348, + "step": 501 + }, + { + "epoch": 0.8032, + "grad_norm": 0.39390918306234635, + "learning_rate": 1.9650304536132426e-05, + "loss": 0.7415, + "step": 502 + }, + { + "epoch": 0.8048, + "grad_norm": 0.4533121637690508, + "learning_rate": 1.9342768805891178e-05, + "loss": 0.7509, + "step": 503 + }, + { + "epoch": 0.8064, + "grad_norm": 0.4687709391858201, + "learning_rate": 1.903740076395151e-05, + "loss": 0.8083, + "step": 504 + }, + { + "epoch": 0.808, + "grad_norm": 0.49228224147446814, + "learning_rate": 1.8734208617174988e-05, + "loss": 0.7353, + "step": 505 + }, + { + "epoch": 0.8096, + "grad_norm": 0.4758894189648476, + "learning_rate": 1.8433200513945337e-05, + "loss": 0.7248, + "step": 506 + }, + { + "epoch": 0.8112, + "grad_norm": 0.3559686727798047, + "learning_rate": 1.8134384543949478e-05, + "loss": 0.6908, + "step": 507 + }, + { + "epoch": 0.8128, + "grad_norm": 0.4770784964280279, + "learning_rate": 1.783776873795994e-05, + "loss": 0.6538, + "step": 508 + }, + { + "epoch": 0.8144, + "grad_norm": 0.7078603935083368, + "learning_rate": 1.754336106761927e-05, + "loss": 0.6817, + "step": 509 + }, + { + "epoch": 0.816, + "grad_norm": 0.39213160731883573, + "learning_rate": 1.7251169445225657e-05, + "loss": 0.7589, + "step": 510 + }, + { + "epoch": 0.8176, + "grad_norm": 0.47020008634550475, + "learning_rate": 1.696120172352025e-05, + "loss": 0.7336, + "step": 511 + }, + { + "epoch": 0.8192, + "grad_norm": 0.3792146423670458, + "learning_rate": 1.6673465695476232e-05, + "loss": 0.6326, + "step": 512 + }, + { + "epoch": 0.8208, + "grad_norm": 0.4262609061539926, + "learning_rate": 1.6387969094089316e-05, + "loss": 0.6411, + "step": 513 + }, + { + "epoch": 0.8224, + "grad_norm": 0.42577224096572075, + "learning_rate": 1.6104719592169902e-05, + "loss": 0.6806, + "step": 514 + }, + { + "epoch": 0.824, + "grad_norm": 0.42374880063325143, + "learning_rate": 1.5823724802136865e-05, + "loss": 0.681, + "step": 515 + }, + { + "epoch": 0.8256, + "grad_norm": 0.3952042103509857, + "learning_rate": 1.5544992275813053e-05, + "loss": 0.6542, + "step": 516 + }, + { + "epoch": 0.8272, + "grad_norm": 0.3616527944352411, + "learning_rate": 1.526852950422226e-05, + "loss": 0.6835, + "step": 517 + }, + { + "epoch": 0.8288, + "grad_norm": 0.5241196153307605, + "learning_rate": 1.4994343917387854e-05, + "loss": 0.7916, + "step": 518 + }, + { + "epoch": 0.8304, + "grad_norm": 0.5390849479993916, + "learning_rate": 1.4722442884133214e-05, + "loss": 0.7987, + "step": 519 + }, + { + "epoch": 0.832, + "grad_norm": 0.48039231414527545, + "learning_rate": 1.4452833711883628e-05, + "loss": 0.6692, + "step": 520 + }, + { + "epoch": 0.8336, + "grad_norm": 0.42707179923530125, + "learning_rate": 1.4185523646469822e-05, + "loss": 0.6568, + "step": 521 + }, + { + "epoch": 0.8352, + "grad_norm": 0.46109134311696376, + "learning_rate": 1.3920519871933424e-05, + "loss": 0.7601, + "step": 522 + }, + { + "epoch": 0.8368, + "grad_norm": 0.47142533598707004, + "learning_rate": 1.3657829510333654e-05, + "loss": 0.6752, + "step": 523 + }, + { + "epoch": 0.8384, + "grad_norm": 0.4504467554154392, + "learning_rate": 1.339745962155613e-05, + "loss": 0.7277, + "step": 524 + }, + { + "epoch": 0.84, + "grad_norm": 0.3831266438749168, + "learning_rate": 1.3139417203123027e-05, + "loss": 0.6506, + "step": 525 + }, + { + "epoch": 0.8416, + "grad_norm": 0.40016219076978526, + "learning_rate": 1.2883709190004955e-05, + "loss": 0.682, + "step": 526 + }, + { + "epoch": 0.8432, + "grad_norm": 0.44514968765868584, + "learning_rate": 1.263034245443473e-05, + "loss": 0.7746, + "step": 527 + }, + { + "epoch": 0.8448, + "grad_norm": 0.3843751561475455, + "learning_rate": 1.2379323805722576e-05, + "loss": 0.7161, + "step": 528 + }, + { + "epoch": 0.8464, + "grad_norm": 0.4669995535333905, + "learning_rate": 1.2130659990073146e-05, + "loss": 0.6942, + "step": 529 + }, + { + "epoch": 0.848, + "grad_norm": 0.4529295547385918, + "learning_rate": 1.1884357690404158e-05, + "loss": 0.6589, + "step": 530 + }, + { + "epoch": 0.8496, + "grad_norm": 0.482964134784402, + "learning_rate": 1.1640423526166988e-05, + "loss": 0.762, + "step": 531 + }, + { + "epoch": 0.8512, + "grad_norm": 0.41287764501078317, + "learning_rate": 1.1398864053168534e-05, + "loss": 0.6815, + "step": 532 + }, + { + "epoch": 0.8528, + "grad_norm": 0.4098495155093636, + "learning_rate": 1.1159685763395111e-05, + "loss": 0.741, + "step": 533 + }, + { + "epoch": 0.8544, + "grad_norm": 0.41847355992725155, + "learning_rate": 1.0922895084838037e-05, + "loss": 0.7382, + "step": 534 + }, + { + "epoch": 0.856, + "grad_norm": 0.4653257835694142, + "learning_rate": 1.0688498381320855e-05, + "loss": 0.7441, + "step": 535 + }, + { + "epoch": 0.8576, + "grad_norm": 0.43252875114325257, + "learning_rate": 1.045650195232819e-05, + "loss": 0.6772, + "step": 536 + }, + { + "epoch": 0.8592, + "grad_norm": 0.40733522956105084, + "learning_rate": 1.0226912032836611e-05, + "loss": 0.7139, + "step": 537 + }, + { + "epoch": 0.8608, + "grad_norm": 0.4956527752274115, + "learning_rate": 9.999734793146998e-06, + "loss": 0.7383, + "step": 538 + }, + { + "epoch": 0.8624, + "grad_norm": 0.39614957218583297, + "learning_rate": 9.774976338718677e-06, + "loss": 0.6676, + "step": 539 + }, + { + "epoch": 0.864, + "grad_norm": 0.3753280480518952, + "learning_rate": 9.552642710005299e-06, + "loss": 0.6703, + "step": 540 + }, + { + "epoch": 0.8656, + "grad_norm": 0.475121842569078, + "learning_rate": 9.332739882292752e-06, + "loss": 0.7619, + "step": 541 + }, + { + "epoch": 0.8672, + "grad_norm": 0.3521757249003535, + "learning_rate": 9.115273765538202e-06, + "loss": 0.6272, + "step": 542 + }, + { + "epoch": 0.8688, + "grad_norm": 0.3764674116747337, + "learning_rate": 8.900250204211514e-06, + "loss": 0.6801, + "step": 543 + }, + { + "epoch": 0.8704, + "grad_norm": 0.5801586357247913, + "learning_rate": 8.687674977138116e-06, + "loss": 0.6769, + "step": 544 + }, + { + "epoch": 0.872, + "grad_norm": 0.44486716819587074, + "learning_rate": 8.47755379734373e-06, + "loss": 0.713, + "step": 545 + }, + { + "epoch": 0.8736, + "grad_norm": 0.44155527404950856, + "learning_rate": 8.269892311900696e-06, + "loss": 0.6813, + "step": 546 + }, + { + "epoch": 0.8752, + "grad_norm": 0.44335611847768286, + "learning_rate": 8.064696101776358e-06, + "loss": 0.702, + "step": 547 + }, + { + "epoch": 0.8768, + "grad_norm": 0.46990914035756637, + "learning_rate": 7.861970681683051e-06, + "loss": 0.6971, + "step": 548 + }, + { + "epoch": 0.8784, + "grad_norm": 0.4321875894671604, + "learning_rate": 7.661721499929753e-06, + "loss": 0.7515, + "step": 549 + }, + { + "epoch": 0.88, + "grad_norm": 0.4252627880976508, + "learning_rate": 7.463953938275858e-06, + "loss": 0.6518, + "step": 550 + }, + { + "epoch": 0.8816, + "grad_norm": 0.39611014885838497, + "learning_rate": 7.2686733117863784e-06, + "loss": 0.6757, + "step": 551 + }, + { + "epoch": 0.8832, + "grad_norm": 0.4095060338394822, + "learning_rate": 7.07588486868922e-06, + "loss": 0.5828, + "step": 552 + }, + { + "epoch": 0.8848, + "grad_norm": 0.41851390395160787, + "learning_rate": 6.8855937902340576e-06, + "loss": 0.669, + "step": 553 + }, + { + "epoch": 0.8864, + "grad_norm": 0.37014421972947137, + "learning_rate": 6.6978051905530855e-06, + "loss": 0.6667, + "step": 554 + }, + { + "epoch": 0.888, + "grad_norm": 0.39958569740213346, + "learning_rate": 6.512524116523633e-06, + "loss": 0.7322, + "step": 555 + }, + { + "epoch": 0.8896, + "grad_norm": 0.4151542139680637, + "learning_rate": 6.329755547632499e-06, + "loss": 0.6759, + "step": 556 + }, + { + "epoch": 0.8912, + "grad_norm": 0.4469625332463983, + "learning_rate": 6.149504395842087e-06, + "loss": 0.8135, + "step": 557 + }, + { + "epoch": 0.8928, + "grad_norm": 0.4066896591691095, + "learning_rate": 5.971775505458444e-06, + "loss": 0.7106, + "step": 558 + }, + { + "epoch": 0.8944, + "grad_norm": 0.4163489054937086, + "learning_rate": 5.7965736530010916e-06, + "loss": 0.7317, + "step": 559 + }, + { + "epoch": 0.896, + "grad_norm": 0.36587621421685, + "learning_rate": 5.623903547074549e-06, + "loss": 0.6346, + "step": 560 + }, + { + "epoch": 0.8976, + "grad_norm": 0.37495976884733584, + "learning_rate": 5.453769828241872e-06, + "loss": 0.6356, + "step": 561 + }, + { + "epoch": 0.8992, + "grad_norm": 0.3925825846368969, + "learning_rate": 5.286177068899989e-06, + "loss": 0.6646, + "step": 562 + }, + { + "epoch": 0.9008, + "grad_norm": 0.49011988410515533, + "learning_rate": 5.121129773156663e-06, + "loss": 0.7772, + "step": 563 + }, + { + "epoch": 0.9024, + "grad_norm": 0.38832737961220826, + "learning_rate": 4.95863237670956e-06, + "loss": 0.6655, + "step": 564 + }, + { + "epoch": 0.904, + "grad_norm": 0.3968057567239704, + "learning_rate": 4.798689246727006e-06, + "loss": 0.6856, + "step": 565 + }, + { + "epoch": 0.9056, + "grad_norm": 0.48571992999735075, + "learning_rate": 4.641304681730641e-06, + "loss": 0.6528, + "step": 566 + }, + { + "epoch": 0.9072, + "grad_norm": 0.42691389717667355, + "learning_rate": 4.486482911479839e-06, + "loss": 0.6593, + "step": 567 + }, + { + "epoch": 0.9088, + "grad_norm": 0.45668408205123967, + "learning_rate": 4.3342280968580285e-06, + "loss": 0.8113, + "step": 568 + }, + { + "epoch": 0.9104, + "grad_norm": 0.40497751915448754, + "learning_rate": 4.184544329761009e-06, + "loss": 0.7361, + "step": 569 + }, + { + "epoch": 0.912, + "grad_norm": 0.39305353036446256, + "learning_rate": 4.037435632986786e-06, + "loss": 0.6766, + "step": 570 + }, + { + "epoch": 0.9136, + "grad_norm": 0.464968814745453, + "learning_rate": 3.892905960127546e-06, + "loss": 0.6357, + "step": 571 + }, + { + "epoch": 0.9152, + "grad_norm": 0.4401056844056556, + "learning_rate": 3.750959195463466e-06, + "loss": 0.7237, + "step": 572 + }, + { + "epoch": 0.9168, + "grad_norm": 0.43214470617398676, + "learning_rate": 3.611599153858214e-06, + "loss": 0.642, + "step": 573 + }, + { + "epoch": 0.9184, + "grad_norm": 0.4230345584965771, + "learning_rate": 3.4748295806564356e-06, + "loss": 0.654, + "step": 574 + }, + { + "epoch": 0.92, + "grad_norm": 0.4304618514743957, + "learning_rate": 3.3406541515832003e-06, + "loss": 0.6704, + "step": 575 + }, + { + "epoch": 0.9216, + "grad_norm": 0.4125636074837623, + "learning_rate": 3.209076472645112e-06, + "loss": 0.7088, + "step": 576 + }, + { + "epoch": 0.9232, + "grad_norm": 0.5083751865235997, + "learning_rate": 3.0801000800333877e-06, + "loss": 0.6955, + "step": 577 + }, + { + "epoch": 0.9248, + "grad_norm": 0.4480447682345799, + "learning_rate": 2.9537284400289355e-06, + "loss": 0.7025, + "step": 578 + }, + { + "epoch": 0.9264, + "grad_norm": 0.4669983763007555, + "learning_rate": 2.8299649489090475e-06, + "loss": 0.686, + "step": 579 + }, + { + "epoch": 0.928, + "grad_norm": 0.42135345865339036, + "learning_rate": 2.708812932856253e-06, + "loss": 0.724, + "step": 580 + }, + { + "epoch": 0.9296, + "grad_norm": 0.4415599946558524, + "learning_rate": 2.590275647868867e-06, + "loss": 0.6629, + "step": 581 + }, + { + "epoch": 0.9312, + "grad_norm": 0.5102318567062917, + "learning_rate": 2.4743562796734622e-06, + "loss": 0.726, + "step": 582 + }, + { + "epoch": 0.9328, + "grad_norm": 0.4339243196871157, + "learning_rate": 2.3610579436393e-06, + "loss": 0.6724, + "step": 583 + }, + { + "epoch": 0.9344, + "grad_norm": 0.4587114055215017, + "learning_rate": 2.250383684694579e-06, + "loss": 0.6998, + "step": 584 + }, + { + "epoch": 0.936, + "grad_norm": 0.4279050205280435, + "learning_rate": 2.1423364772445887e-06, + "loss": 0.6884, + "step": 585 + }, + { + "epoch": 0.9376, + "grad_norm": 0.44477451329874085, + "learning_rate": 2.036919225091827e-06, + "loss": 0.6413, + "step": 586 + }, + { + "epoch": 0.9392, + "grad_norm": 0.5444579696189681, + "learning_rate": 1.9341347613579087e-06, + "loss": 0.6767, + "step": 587 + }, + { + "epoch": 0.9408, + "grad_norm": 0.42887921931200046, + "learning_rate": 1.8339858484073935e-06, + "loss": 0.7089, + "step": 588 + }, + { + "epoch": 0.9424, + "grad_norm": 0.4004212833108163, + "learning_rate": 1.7364751777736332e-06, + "loss": 0.6784, + "step": 589 + }, + { + "epoch": 0.944, + "grad_norm": 0.4173772009199892, + "learning_rate": 1.6416053700863964e-06, + "loss": 0.7361, + "step": 590 + }, + { + "epoch": 0.9456, + "grad_norm": 0.5019148581562954, + "learning_rate": 1.5493789750014031e-06, + "loss": 0.7951, + "step": 591 + }, + { + "epoch": 0.9472, + "grad_norm": 0.5466510572971621, + "learning_rate": 1.459798471131868e-06, + "loss": 0.7459, + "step": 592 + }, + { + "epoch": 0.9488, + "grad_norm": 0.5495540131846346, + "learning_rate": 1.3728662659818204e-06, + "loss": 0.7266, + "step": 593 + }, + { + "epoch": 0.9504, + "grad_norm": 0.402119858865409, + "learning_rate": 1.2885846958814673e-06, + "loss": 0.6183, + "step": 594 + }, + { + "epoch": 0.952, + "grad_norm": 0.4746782428308101, + "learning_rate": 1.2069560259243328e-06, + "loss": 0.7274, + "step": 595 + }, + { + "epoch": 0.9536, + "grad_norm": 0.36514268392405524, + "learning_rate": 1.1279824499064396e-06, + "loss": 0.6576, + "step": 596 + }, + { + "epoch": 0.9552, + "grad_norm": 0.47592924104667145, + "learning_rate": 1.0516660902673448e-06, + "loss": 0.689, + "step": 597 + }, + { + "epoch": 0.9568, + "grad_norm": 0.39293283630147624, + "learning_rate": 9.780089980330642e-07, + "loss": 0.6597, + "step": 598 + }, + { + "epoch": 0.9584, + "grad_norm": 0.45086953532530955, + "learning_rate": 9.070131527609604e-07, + "loss": 0.7274, + "step": 599 + }, + { + "epoch": 0.96, + "grad_norm": 0.44438641462434025, + "learning_rate": 8.386804624865851e-07, + "loss": 0.6614, + "step": 600 + }, + { + "epoch": 0.9616, + "grad_norm": 0.4162291164037949, + "learning_rate": 7.730127636723539e-07, + "loss": 0.6342, + "step": 601 + }, + { + "epoch": 0.9632, + "grad_norm": 0.5466716041149762, + "learning_rate": 7.100118211581852e-07, + "loss": 0.6394, + "step": 602 + }, + { + "epoch": 0.9648, + "grad_norm": 0.42271815559211845, + "learning_rate": 6.496793281141056e-07, + "loss": 0.7811, + "step": 603 + }, + { + "epoch": 0.9664, + "grad_norm": 0.4244644129053721, + "learning_rate": 5.920169059947411e-07, + "loss": 0.6654, + "step": 604 + }, + { + "epoch": 0.968, + "grad_norm": 0.42256064647922514, + "learning_rate": 5.370261044956971e-07, + "loss": 0.6945, + "step": 605 + }, + { + "epoch": 0.9696, + "grad_norm": 0.6239794621223662, + "learning_rate": 4.847084015119574e-07, + "loss": 0.7737, + "step": 606 + }, + { + "epoch": 0.9712, + "grad_norm": 0.4172502376073947, + "learning_rate": 4.3506520309813947e-07, + "loss": 0.6797, + "step": 607 + }, + { + "epoch": 0.9728, + "grad_norm": 0.39826667579957786, + "learning_rate": 3.8809784343072366e-07, + "loss": 0.6891, + "step": 608 + }, + { + "epoch": 0.9744, + "grad_norm": 0.4601710262147938, + "learning_rate": 3.4380758477219333e-07, + "loss": 0.7329, + "step": 609 + }, + { + "epoch": 0.976, + "grad_norm": 0.5297655920286328, + "learning_rate": 3.0219561743707326e-07, + "loss": 0.7912, + "step": 610 + }, + { + "epoch": 0.9776, + "grad_norm": 0.3757166865051945, + "learning_rate": 2.6326305976001055e-07, + "loss": 0.6968, + "step": 611 + }, + { + "epoch": 0.9792, + "grad_norm": 0.3936377352463753, + "learning_rate": 2.2701095806565432e-07, + "loss": 0.6908, + "step": 612 + }, + { + "epoch": 0.9808, + "grad_norm": 0.43741423927983253, + "learning_rate": 1.9344028664056713e-07, + "loss": 0.6844, + "step": 613 + }, + { + "epoch": 0.9824, + "grad_norm": 0.47345694727007964, + "learning_rate": 1.6255194770704586e-07, + "loss": 0.7313, + "step": 614 + }, + { + "epoch": 0.984, + "grad_norm": 0.32409034288972116, + "learning_rate": 1.3434677139885222e-07, + "loss": 0.6177, + "step": 615 + }, + { + "epoch": 0.9856, + "grad_norm": 0.43997495517123886, + "learning_rate": 1.0882551573891953e-07, + "loss": 0.7024, + "step": 616 + }, + { + "epoch": 0.9872, + "grad_norm": 0.5443286523974251, + "learning_rate": 8.598886661895788e-08, + "loss": 0.6954, + "step": 617 + }, + { + "epoch": 0.9888, + "grad_norm": 0.4949478207365622, + "learning_rate": 6.583743778106887e-08, + "loss": 0.7485, + "step": 618 + }, + { + "epoch": 0.9904, + "grad_norm": 0.40186144816647534, + "learning_rate": 4.837177080119215e-08, + "loss": 0.7105, + "step": 619 + }, + { + "epoch": 0.992, + "grad_norm": 0.37852597841630553, + "learning_rate": 3.359233507459481e-08, + "loss": 0.6434, + "step": 620 + }, + { + "epoch": 0.9936, + "grad_norm": 0.44720806471916646, + "learning_rate": 2.1499527803214846e-08, + "loss": 0.6678, + "step": 621 + }, + { + "epoch": 0.9952, + "grad_norm": 0.400060471651697, + "learning_rate": 1.209367398504746e-08, + "loss": 0.6656, + "step": 622 + }, + { + "epoch": 0.9968, + "grad_norm": 0.4990555952833043, + "learning_rate": 5.375026405352035e-09, + "loss": 0.7616, + "step": 623 + }, + { + "epoch": 0.9984, + "grad_norm": 0.5903516833425473, + "learning_rate": 1.3437656298687097e-09, + "loss": 0.8112, + "step": 624 + }, + { + "epoch": 1.0, + "grad_norm": 0.43333961125183756, + "learning_rate": 0.0, + "loss": 0.7567, + "step": 625 + }, + { + "epoch": 1.0, + "step": 625, + "total_flos": 543415710547968.0, + "train_loss": 0.769115243434906, + "train_runtime": 9658.4573, + "train_samples_per_second": 1.035, + "train_steps_per_second": 0.065 + } + ], + "logging_steps": 1.0, + "max_steps": 625, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 543415710547968.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_2_epochs_1_GA_4_lora/README.md b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_2_epochs_1_GA_4_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_2_epochs_1_GA_4_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_2_epochs_1_GA_4_lora/adapter_config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_2_epochs_1_GA_4_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bf42828caa84117ccf1e34e7e03ab421595fd65b --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_2_epochs_1_GA_4_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "up_proj", + "v_proj", + "down_proj", + "gate_proj", + "o_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_2_epochs_1_GA_4_lora/adapter_model.safetensors b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_2_epochs_1_GA_4_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..bd16e72d0ffe5a221179f5c9ec253efeacb775f7 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_2_epochs_1_GA_4_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b1b105009b66ec919db82ffe954b793fe0f891faa335074e2968b8ffe2b62b8e +size 671150064 diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_2_epochs_1_GA_4_lora/config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_2_epochs_1_GA_4_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_2_epochs_1_GA_4_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_2_epochs_1_GA_4_lora/non_lora_trainables.bin b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_2_epochs_1_GA_4_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..c372d246968cddbf373a89266de2c1160f262c57 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_2_epochs_1_GA_4_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fb9795a816cf36277735654517284cac53c8a54b525688e6f5e38c12eb631a9 +size 918507402 diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_2_epochs_1_GA_4_lora/trainer_state.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_2_epochs_1_GA_4_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..839168a196563227f5e7c111544d612590d4674f --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_2_epochs_1_GA_4_lora/trainer_state.json @@ -0,0 +1,2226 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9984, + "eval_steps": 500, + "global_step": 312, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0032, + "grad_norm": 0.9103283675313728, + "learning_rate": 2e-05, + "loss": 1.4166, + "step": 1 + }, + { + "epoch": 0.0064, + "grad_norm": 0.9144320274792768, + "learning_rate": 4e-05, + "loss": 1.4921, + "step": 2 + }, + { + "epoch": 0.0096, + "grad_norm": 0.7274963361526622, + "learning_rate": 6e-05, + "loss": 1.3633, + "step": 3 + }, + { + "epoch": 0.0128, + "grad_norm": 0.7533872931679354, + "learning_rate": 8e-05, + "loss": 1.3309, + "step": 4 + }, + { + "epoch": 0.016, + "grad_norm": 0.8072799234288184, + "learning_rate": 0.0001, + "loss": 1.1534, + "step": 5 + }, + { + "epoch": 0.0192, + "grad_norm": 0.8128649163781364, + "learning_rate": 0.00012, + "loss": 1.0277, + "step": 6 + }, + { + "epoch": 0.0224, + "grad_norm": 0.8094747897805041, + "learning_rate": 0.00014, + "loss": 1.0626, + "step": 7 + }, + { + "epoch": 0.0256, + "grad_norm": 0.6501117061319456, + "learning_rate": 0.00016, + "loss": 0.9675, + "step": 8 + }, + { + "epoch": 0.0288, + "grad_norm": 0.4844842936596786, + "learning_rate": 0.00018, + "loss": 0.9223, + "step": 9 + }, + { + "epoch": 0.032, + "grad_norm": 0.42169366246618345, + "learning_rate": 0.0002, + "loss": 0.9691, + "step": 10 + }, + { + "epoch": 0.0352, + "grad_norm": 0.40603834149407325, + "learning_rate": 0.00019999458931878073, + "loss": 0.9463, + "step": 11 + }, + { + "epoch": 0.0384, + "grad_norm": 0.45702055345432224, + "learning_rate": 0.0001999783578606323, + "loss": 0.9557, + "step": 12 + }, + { + "epoch": 0.0416, + "grad_norm": 0.42227819444173914, + "learning_rate": 0.00019995130738201966, + "loss": 0.9032, + "step": 13 + }, + { + "epoch": 0.0448, + "grad_norm": 0.5015499256486801, + "learning_rate": 0.0001999134408101731, + "loss": 0.9017, + "step": 14 + }, + { + "epoch": 0.048, + "grad_norm": 0.4590177978923712, + "learning_rate": 0.00019986476224277165, + "loss": 0.9441, + "step": 15 + }, + { + "epoch": 0.0512, + "grad_norm": 0.4696048910755479, + "learning_rate": 0.00019980527694749952, + "loss": 0.9416, + "step": 16 + }, + { + "epoch": 0.0544, + "grad_norm": 0.4382509564334526, + "learning_rate": 0.00019973499136147606, + "loss": 0.8984, + "step": 17 + }, + { + "epoch": 0.0576, + "grad_norm": 0.38129492603906207, + "learning_rate": 0.0001996539130905593, + "loss": 0.8963, + "step": 18 + }, + { + "epoch": 0.0608, + "grad_norm": 0.38394803376304665, + "learning_rate": 0.0001995620509085228, + "loss": 0.8606, + "step": 19 + }, + { + "epoch": 0.064, + "grad_norm": 0.4387045971731528, + "learning_rate": 0.00019945941475610623, + "loss": 0.9166, + "step": 20 + }, + { + "epoch": 0.0672, + "grad_norm": 0.38444814054921106, + "learning_rate": 0.0001993460157399396, + "loss": 0.8944, + "step": 21 + }, + { + "epoch": 0.0704, + "grad_norm": 0.45143495292294156, + "learning_rate": 0.0001992218661313415, + "loss": 0.8889, + "step": 22 + }, + { + "epoch": 0.0736, + "grad_norm": 0.44336900265005136, + "learning_rate": 0.00019908697936499103, + "loss": 0.8192, + "step": 23 + }, + { + "epoch": 0.0768, + "grad_norm": 0.41164037436606715, + "learning_rate": 0.00019894137003747403, + "loss": 0.8402, + "step": 24 + }, + { + "epoch": 0.08, + "grad_norm": 0.4334476983961227, + "learning_rate": 0.00019878505390570362, + "loss": 0.9395, + "step": 25 + }, + { + "epoch": 0.0832, + "grad_norm": 0.3442486491029148, + "learning_rate": 0.00019861804788521493, + "loss": 0.829, + "step": 26 + }, + { + "epoch": 0.0864, + "grad_norm": 0.37737297293447203, + "learning_rate": 0.00019844037004833473, + "loss": 0.8714, + "step": 27 + }, + { + "epoch": 0.0896, + "grad_norm": 0.36390961631619445, + "learning_rate": 0.00019825203962222572, + "loss": 0.8872, + "step": 28 + }, + { + "epoch": 0.0928, + "grad_norm": 0.3354385508656872, + "learning_rate": 0.0001980530769868059, + "loss": 0.8179, + "step": 29 + }, + { + "epoch": 0.096, + "grad_norm": 0.38209135195320887, + "learning_rate": 0.00019784350367254322, + "loss": 0.8878, + "step": 30 + }, + { + "epoch": 0.0992, + "grad_norm": 0.3637649637601686, + "learning_rate": 0.0001976233423581255, + "loss": 0.8873, + "step": 31 + }, + { + "epoch": 0.1024, + "grad_norm": 0.3752837293323211, + "learning_rate": 0.0001973926168680066, + "loss": 0.8485, + "step": 32 + }, + { + "epoch": 0.1056, + "grad_norm": 0.36023976019261966, + "learning_rate": 0.00019715135216982798, + "loss": 0.8172, + "step": 33 + }, + { + "epoch": 0.1088, + "grad_norm": 0.39540517944934683, + "learning_rate": 0.0001968995743717171, + "loss": 0.8485, + "step": 34 + }, + { + "epoch": 0.112, + "grad_norm": 0.41411169758141353, + "learning_rate": 0.00019663731071946206, + "loss": 0.9169, + "step": 35 + }, + { + "epoch": 0.1152, + "grad_norm": 0.43753763956502895, + "learning_rate": 0.00019636458959356316, + "loss": 0.8761, + "step": 36 + }, + { + "epoch": 0.1184, + "grad_norm": 0.3122556706775927, + "learning_rate": 0.0001960814405061619, + "loss": 0.8011, + "step": 37 + }, + { + "epoch": 0.1216, + "grad_norm": 0.3340338425438589, + "learning_rate": 0.00019578789409784727, + "loss": 0.7981, + "step": 38 + }, + { + "epoch": 0.1248, + "grad_norm": 0.33378726558197863, + "learning_rate": 0.00019548398213434007, + "loss": 0.777, + "step": 39 + }, + { + "epoch": 0.128, + "grad_norm": 0.3620481789765836, + "learning_rate": 0.00019516973750305532, + "loss": 0.7779, + "step": 40 + }, + { + "epoch": 0.1312, + "grad_norm": 0.38288334079362785, + "learning_rate": 0.00019484519420954354, + "loss": 0.7896, + "step": 41 + }, + { + "epoch": 0.1344, + "grad_norm": 0.33613117188038577, + "learning_rate": 0.00019451038737381077, + "loss": 0.8166, + "step": 42 + }, + { + "epoch": 0.1376, + "grad_norm": 0.33308279105593896, + "learning_rate": 0.00019416535322651818, + "loss": 0.7427, + "step": 43 + }, + { + "epoch": 0.1408, + "grad_norm": 0.36908712891619183, + "learning_rate": 0.00019381012910506146, + "loss": 0.8754, + "step": 44 + }, + { + "epoch": 0.144, + "grad_norm": 0.37395396744736786, + "learning_rate": 0.00019344475344953012, + "loss": 0.749, + "step": 45 + }, + { + "epoch": 0.1472, + "grad_norm": 0.3160307397453434, + "learning_rate": 0.00019306926579854821, + "loss": 0.8054, + "step": 46 + }, + { + "epoch": 0.1504, + "grad_norm": 0.36749447963029164, + "learning_rate": 0.00019268370678499533, + "loss": 0.7574, + "step": 47 + }, + { + "epoch": 0.1536, + "grad_norm": 0.33549636928097215, + "learning_rate": 0.0001922881181316097, + "loss": 0.7923, + "step": 48 + }, + { + "epoch": 0.1568, + "grad_norm": 0.37868595879272665, + "learning_rate": 0.00019188254264647337, + "loss": 0.8073, + "step": 49 + }, + { + "epoch": 0.16, + "grad_norm": 0.38925373377460204, + "learning_rate": 0.0001914670242183795, + "loss": 0.8455, + "step": 50 + }, + { + "epoch": 0.1632, + "grad_norm": 0.3672188908686244, + "learning_rate": 0.0001910416078120832, + "loss": 0.8705, + "step": 51 + }, + { + "epoch": 0.1664, + "grad_norm": 0.33169876099654205, + "learning_rate": 0.0001906063394634356, + "loss": 0.7603, + "step": 52 + }, + { + "epoch": 0.1696, + "grad_norm": 0.3795202679699071, + "learning_rate": 0.00019016126627440237, + "loss": 0.8117, + "step": 53 + }, + { + "epoch": 0.1728, + "grad_norm": 0.3449122176312621, + "learning_rate": 0.00018970643640796642, + "loss": 0.8594, + "step": 54 + }, + { + "epoch": 0.176, + "grad_norm": 0.33161046480987155, + "learning_rate": 0.000189241899082916, + "loss": 0.7841, + "step": 55 + }, + { + "epoch": 0.1792, + "grad_norm": 0.32548937249448306, + "learning_rate": 0.00018876770456851877, + "loss": 0.7966, + "step": 56 + }, + { + "epoch": 0.1824, + "grad_norm": 0.360653048373736, + "learning_rate": 0.0001882839041790818, + "loss": 0.82, + "step": 57 + }, + { + "epoch": 0.1856, + "grad_norm": 0.351951999314567, + "learning_rate": 0.00018779055026839868, + "loss": 0.7742, + "step": 58 + }, + { + "epoch": 0.1888, + "grad_norm": 0.3701117149066087, + "learning_rate": 0.00018728769622408423, + "loss": 0.7952, + "step": 59 + }, + { + "epoch": 0.192, + "grad_norm": 0.32825144421037894, + "learning_rate": 0.00018677539646179707, + "loss": 0.8084, + "step": 60 + }, + { + "epoch": 0.1952, + "grad_norm": 0.3321888433864363, + "learning_rate": 0.00018625370641935129, + "loss": 0.7646, + "step": 61 + }, + { + "epoch": 0.1984, + "grad_norm": 0.3743402139687879, + "learning_rate": 0.00018572268255071718, + "loss": 0.8244, + "step": 62 + }, + { + "epoch": 0.2016, + "grad_norm": 0.3514231301804191, + "learning_rate": 0.00018518238231991218, + "loss": 0.8038, + "step": 63 + }, + { + "epoch": 0.2048, + "grad_norm": 0.3933077216029149, + "learning_rate": 0.00018463286419478255, + "loss": 0.879, + "step": 64 + }, + { + "epoch": 0.208, + "grad_norm": 0.3549182509819441, + "learning_rate": 0.00018407418764067627, + "loss": 0.8278, + "step": 65 + }, + { + "epoch": 0.2112, + "grad_norm": 0.34188248262920085, + "learning_rate": 0.00018350641311400812, + "loss": 0.7532, + "step": 66 + }, + { + "epoch": 0.2144, + "grad_norm": 0.3729093320580842, + "learning_rate": 0.0001829296020557174, + "loss": 0.878, + "step": 67 + }, + { + "epoch": 0.2176, + "grad_norm": 0.33083049897801425, + "learning_rate": 0.00018234381688461942, + "loss": 0.7991, + "step": 68 + }, + { + "epoch": 0.2208, + "grad_norm": 0.3546403903312824, + "learning_rate": 0.0001817491209906506, + "loss": 0.8093, + "step": 69 + }, + { + "epoch": 0.224, + "grad_norm": 0.33586476961490047, + "learning_rate": 0.00018114557872800905, + "loss": 0.7997, + "step": 70 + }, + { + "epoch": 0.2272, + "grad_norm": 0.3488750651611648, + "learning_rate": 0.00018053325540819045, + "loss": 0.8055, + "step": 71 + }, + { + "epoch": 0.2304, + "grad_norm": 0.3269454217721887, + "learning_rate": 0.0001799122172929206, + "loss": 0.8059, + "step": 72 + }, + { + "epoch": 0.2336, + "grad_norm": 0.3526119888506058, + "learning_rate": 0.00017928253158698473, + "loss": 0.8178, + "step": 73 + }, + { + "epoch": 0.2368, + "grad_norm": 0.34485108812383425, + "learning_rate": 0.0001786442664309554, + "loss": 0.7827, + "step": 74 + }, + { + "epoch": 0.24, + "grad_norm": 0.3403414943684799, + "learning_rate": 0.0001779974908938184, + "loss": 0.7709, + "step": 75 + }, + { + "epoch": 0.2432, + "grad_norm": 0.30538519888266924, + "learning_rate": 0.0001773422749654988, + "loss": 0.7102, + "step": 76 + }, + { + "epoch": 0.2464, + "grad_norm": 0.3423028009161411, + "learning_rate": 0.00017667868954928694, + "loss": 0.7889, + "step": 77 + }, + { + "epoch": 0.2496, + "grad_norm": 0.3382000191220972, + "learning_rate": 0.00017600680645416583, + "loss": 0.7956, + "step": 78 + }, + { + "epoch": 0.2528, + "grad_norm": 0.3046490107370409, + "learning_rate": 0.00017532669838704035, + "loss": 0.7416, + "step": 79 + }, + { + "epoch": 0.256, + "grad_norm": 0.4203866308689628, + "learning_rate": 0.00017463843894486937, + "loss": 0.7684, + "step": 80 + }, + { + "epoch": 0.2592, + "grad_norm": 0.4060027558515808, + "learning_rate": 0.0001739421026067017, + "loss": 0.8698, + "step": 81 + }, + { + "epoch": 0.2624, + "grad_norm": 0.33995904638689217, + "learning_rate": 0.00017323776472561627, + "loss": 0.8143, + "step": 82 + }, + { + "epoch": 0.2656, + "grad_norm": 0.3588441569836783, + "learning_rate": 0.00017252550152056795, + "loss": 0.7956, + "step": 83 + }, + { + "epoch": 0.2688, + "grad_norm": 0.3674202406542926, + "learning_rate": 0.0001718053900681397, + "loss": 0.8214, + "step": 84 + }, + { + "epoch": 0.272, + "grad_norm": 0.37482770602753407, + "learning_rate": 0.00017107750829420176, + "loss": 0.8209, + "step": 85 + }, + { + "epoch": 0.2752, + "grad_norm": 0.3464001739697096, + "learning_rate": 0.00017034193496547902, + "loss": 0.7486, + "step": 86 + }, + { + "epoch": 0.2784, + "grad_norm": 0.37285581487412267, + "learning_rate": 0.00016959874968102735, + "loss": 0.791, + "step": 87 + }, + { + "epoch": 0.2816, + "grad_norm": 0.3979485453242663, + "learning_rate": 0.00016884803286362, + "loss": 0.7703, + "step": 88 + }, + { + "epoch": 0.2848, + "grad_norm": 0.3507154145343656, + "learning_rate": 0.00016808986575104465, + "loss": 0.7614, + "step": 89 + }, + { + "epoch": 0.288, + "grad_norm": 0.34617791618977534, + "learning_rate": 0.00016732433038731242, + "loss": 0.8254, + "step": 90 + }, + { + "epoch": 0.2912, + "grad_norm": 0.37892909020804283, + "learning_rate": 0.0001665515096137797, + "loss": 0.7727, + "step": 91 + }, + { + "epoch": 0.2944, + "grad_norm": 0.3361348593587784, + "learning_rate": 0.00016577148706018328, + "loss": 0.7759, + "step": 92 + }, + { + "epoch": 0.2976, + "grad_norm": 0.3381620224272412, + "learning_rate": 0.00016498434713559088, + "loss": 0.7888, + "step": 93 + }, + { + "epoch": 0.3008, + "grad_norm": 0.3062100919489959, + "learning_rate": 0.00016419017501926656, + "loss": 0.7714, + "step": 94 + }, + { + "epoch": 0.304, + "grad_norm": 0.3849578010135168, + "learning_rate": 0.0001633890566514535, + "loss": 0.7798, + "step": 95 + }, + { + "epoch": 0.3072, + "grad_norm": 0.3650869029786026, + "learning_rate": 0.00016258107872407375, + "loss": 0.7793, + "step": 96 + }, + { + "epoch": 0.3104, + "grad_norm": 0.4220453796587287, + "learning_rate": 0.0001617663286713474, + "loss": 0.7737, + "step": 97 + }, + { + "epoch": 0.3136, + "grad_norm": 0.3445256004727701, + "learning_rate": 0.00016094489466033043, + "loss": 0.7471, + "step": 98 + }, + { + "epoch": 0.3168, + "grad_norm": 0.3195235032079133, + "learning_rate": 0.00016011686558137448, + "loss": 0.7382, + "step": 99 + }, + { + "epoch": 0.32, + "grad_norm": 0.3413154323610123, + "learning_rate": 0.0001592823310385073, + "loss": 0.7305, + "step": 100 + }, + { + "epoch": 0.3232, + "grad_norm": 0.32466681948160775, + "learning_rate": 0.0001584413813397364, + "loss": 0.756, + "step": 101 + }, + { + "epoch": 0.3264, + "grad_norm": 0.30531327493430477, + "learning_rate": 0.00015759410748727662, + "loss": 0.7009, + "step": 102 + }, + { + "epoch": 0.3296, + "grad_norm": 0.35300100306276294, + "learning_rate": 0.00015674060116770236, + "loss": 0.7995, + "step": 103 + }, + { + "epoch": 0.3328, + "grad_norm": 0.2865252501809062, + "learning_rate": 0.00015588095474202595, + "loss": 0.7136, + "step": 104 + }, + { + "epoch": 0.336, + "grad_norm": 0.38024243713722367, + "learning_rate": 0.00015501526123570277, + "loss": 0.7912, + "step": 105 + }, + { + "epoch": 0.3392, + "grad_norm": 0.33295045305208426, + "learning_rate": 0.00015414361432856475, + "loss": 0.755, + "step": 106 + }, + { + "epoch": 0.3424, + "grad_norm": 0.3071984559961337, + "learning_rate": 0.0001532661083446829, + "loss": 0.7608, + "step": 107 + }, + { + "epoch": 0.3456, + "grad_norm": 0.370771625197189, + "learning_rate": 0.00015238283824216015, + "loss": 0.7725, + "step": 108 + }, + { + "epoch": 0.3488, + "grad_norm": 0.3697591742580659, + "learning_rate": 0.00015149389960285558, + "loss": 0.7693, + "step": 109 + }, + { + "epoch": 0.352, + "grad_norm": 0.36908515335555, + "learning_rate": 0.00015059938862204127, + "loss": 0.7923, + "step": 110 + }, + { + "epoch": 0.3552, + "grad_norm": 0.3232317554930679, + "learning_rate": 0.00014969940209799248, + "loss": 0.7994, + "step": 111 + }, + { + "epoch": 0.3584, + "grad_norm": 0.2864545104930335, + "learning_rate": 0.00014879403742151283, + "loss": 0.7266, + "step": 112 + }, + { + "epoch": 0.3616, + "grad_norm": 0.3281035382183986, + "learning_rate": 0.00014788339256539544, + "loss": 0.7697, + "step": 113 + }, + { + "epoch": 0.3648, + "grad_norm": 0.432073780382289, + "learning_rate": 0.0001469675660738206, + "loss": 0.7696, + "step": 114 + }, + { + "epoch": 0.368, + "grad_norm": 0.3489210583638896, + "learning_rate": 0.00014604665705169237, + "loss": 0.7551, + "step": 115 + }, + { + "epoch": 0.3712, + "grad_norm": 0.29005250465130256, + "learning_rate": 0.00014512076515391375, + "loss": 0.7455, + "step": 116 + }, + { + "epoch": 0.3744, + "grad_norm": 0.3687315971108826, + "learning_rate": 0.00014418999057460276, + "loss": 0.7632, + "step": 117 + }, + { + "epoch": 0.3776, + "grad_norm": 0.3196634468449587, + "learning_rate": 0.0001432544340362501, + "loss": 0.7339, + "step": 118 + }, + { + "epoch": 0.3808, + "grad_norm": 0.3021021607384978, + "learning_rate": 0.00014231419677881966, + "loss": 0.6969, + "step": 119 + }, + { + "epoch": 0.384, + "grad_norm": 0.35914168156137066, + "learning_rate": 0.00014136938054879283, + "loss": 0.8014, + "step": 120 + }, + { + "epoch": 0.3872, + "grad_norm": 0.33533749250540656, + "learning_rate": 0.00014042008758815818, + "loss": 0.7945, + "step": 121 + }, + { + "epoch": 0.3904, + "grad_norm": 0.35729590877733236, + "learning_rate": 0.00013946642062334766, + "loss": 0.7868, + "step": 122 + }, + { + "epoch": 0.3936, + "grad_norm": 0.30367986407370473, + "learning_rate": 0.00013850848285411994, + "loss": 0.7287, + "step": 123 + }, + { + "epoch": 0.3968, + "grad_norm": 0.29868048103393635, + "learning_rate": 0.000137546377942393, + "loss": 0.7024, + "step": 124 + }, + { + "epoch": 0.4, + "grad_norm": 0.4254752746800852, + "learning_rate": 0.00013658021000102636, + "loss": 0.8463, + "step": 125 + }, + { + "epoch": 0.4032, + "grad_norm": 0.3701875254407606, + "learning_rate": 0.00013561008358255468, + "loss": 0.8023, + "step": 126 + }, + { + "epoch": 0.4064, + "grad_norm": 0.35082155416776684, + "learning_rate": 0.00013463610366787392, + "loss": 0.8221, + "step": 127 + }, + { + "epoch": 0.4096, + "grad_norm": 0.3240073690528122, + "learning_rate": 0.00013365837565488064, + "loss": 0.7686, + "step": 128 + }, + { + "epoch": 0.4128, + "grad_norm": 0.34042694987957256, + "learning_rate": 0.0001326770053470668, + "loss": 0.7697, + "step": 129 + }, + { + "epoch": 0.416, + "grad_norm": 0.3165193575330039, + "learning_rate": 0.0001316920989420703, + "loss": 0.7345, + "step": 130 + }, + { + "epoch": 0.4192, + "grad_norm": 0.3164043962116355, + "learning_rate": 0.00013070376302018287, + "loss": 0.7569, + "step": 131 + }, + { + "epoch": 0.4224, + "grad_norm": 0.33222007537208365, + "learning_rate": 0.00012971210453281674, + "loss": 0.7474, + "step": 132 + }, + { + "epoch": 0.4256, + "grad_norm": 0.3379494196541218, + "learning_rate": 0.000128717230790931, + "loss": 0.7189, + "step": 133 + }, + { + "epoch": 0.4288, + "grad_norm": 0.36085614892637663, + "learning_rate": 0.00012771924945341906, + "loss": 0.7306, + "step": 134 + }, + { + "epoch": 0.432, + "grad_norm": 0.31824319737856027, + "learning_rate": 0.00012671826851545851, + "loss": 0.7862, + "step": 135 + }, + { + "epoch": 0.4352, + "grad_norm": 0.3355156358831619, + "learning_rate": 0.0001257143962968246, + "loss": 0.7457, + "step": 136 + }, + { + "epoch": 0.4384, + "grad_norm": 0.3112035899857585, + "learning_rate": 0.00012470774143016853, + "loss": 0.7573, + "step": 137 + }, + { + "epoch": 0.4416, + "grad_norm": 0.30736498729966194, + "learning_rate": 0.00012369841284926188, + "loss": 0.7667, + "step": 138 + }, + { + "epoch": 0.4448, + "grad_norm": 0.35606634972260365, + "learning_rate": 0.00012268651977720866, + "loss": 0.7795, + "step": 139 + }, + { + "epoch": 0.448, + "grad_norm": 0.3059543952599057, + "learning_rate": 0.00012167217171462566, + "loss": 0.6992, + "step": 140 + }, + { + "epoch": 0.4512, + "grad_norm": 0.28162119452183293, + "learning_rate": 0.0001206554784277931, + "loss": 0.7221, + "step": 141 + }, + { + "epoch": 0.4544, + "grad_norm": 0.335028879547796, + "learning_rate": 0.00011963654993677645, + "loss": 0.7598, + "step": 142 + }, + { + "epoch": 0.4576, + "grad_norm": 0.2967556622516192, + "learning_rate": 0.00011861549650352069, + "loss": 0.7301, + "step": 143 + }, + { + "epoch": 0.4608, + "grad_norm": 0.28900459549646895, + "learning_rate": 0.00011759242861991855, + "loss": 0.7467, + "step": 144 + }, + { + "epoch": 0.464, + "grad_norm": 0.29948470291870005, + "learning_rate": 0.00011656745699585371, + "loss": 0.7643, + "step": 145 + }, + { + "epoch": 0.4672, + "grad_norm": 0.3289223426110118, + "learning_rate": 0.00011554069254722051, + "loss": 0.7917, + "step": 146 + }, + { + "epoch": 0.4704, + "grad_norm": 0.36344371877255777, + "learning_rate": 0.00011451224638392129, + "loss": 0.7449, + "step": 147 + }, + { + "epoch": 0.4736, + "grad_norm": 0.32629958114495206, + "learning_rate": 0.00011348222979784289, + "loss": 0.7101, + "step": 148 + }, + { + "epoch": 0.4768, + "grad_norm": 0.4729809069084755, + "learning_rate": 0.00011245075425081328, + "loss": 0.7837, + "step": 149 + }, + { + "epoch": 0.48, + "grad_norm": 0.3557131350847042, + "learning_rate": 0.00011141793136253986, + "loss": 0.7462, + "step": 150 + }, + { + "epoch": 0.4832, + "grad_norm": 0.3310833706511039, + "learning_rate": 0.0001103838728985307, + "loss": 0.8, + "step": 151 + }, + { + "epoch": 0.4864, + "grad_norm": 0.3557288386127655, + "learning_rate": 0.000109348690758, + "loss": 0.7474, + "step": 152 + }, + { + "epoch": 0.4896, + "grad_norm": 0.30640476176594855, + "learning_rate": 0.00010831249696175918, + "loss": 0.694, + "step": 153 + }, + { + "epoch": 0.4928, + "grad_norm": 0.35396779632156705, + "learning_rate": 0.0001072754036400944, + "loss": 0.7205, + "step": 154 + }, + { + "epoch": 0.496, + "grad_norm": 0.31843226775773836, + "learning_rate": 0.00010623752302063283, + "loss": 0.7112, + "step": 155 + }, + { + "epoch": 0.4992, + "grad_norm": 0.36432332157220065, + "learning_rate": 0.00010519896741619803, + "loss": 0.7683, + "step": 156 + }, + { + "epoch": 0.5024, + "grad_norm": 0.34809268929906395, + "learning_rate": 0.00010415984921265609, + "loss": 0.7658, + "step": 157 + }, + { + "epoch": 0.5056, + "grad_norm": 0.3264788391551195, + "learning_rate": 0.00010312028085675391, + "loss": 0.6921, + "step": 158 + }, + { + "epoch": 0.5088, + "grad_norm": 0.3400045059101885, + "learning_rate": 0.00010208037484395114, + "loss": 0.719, + "step": 159 + }, + { + "epoch": 0.512, + "grad_norm": 0.3248741495647313, + "learning_rate": 0.00010104024370624644, + "loss": 0.7398, + "step": 160 + }, + { + "epoch": 0.5152, + "grad_norm": 0.31271136364143504, + "learning_rate": 0.0001, + "loss": 0.7488, + "step": 161 + }, + { + "epoch": 0.5184, + "grad_norm": 0.3333951914768973, + "learning_rate": 9.895975629375359e-05, + "loss": 0.7105, + "step": 162 + }, + { + "epoch": 0.5216, + "grad_norm": 0.3439039077825358, + "learning_rate": 9.791962515604887e-05, + "loss": 0.8108, + "step": 163 + }, + { + "epoch": 0.5248, + "grad_norm": 0.3371739068964004, + "learning_rate": 9.687971914324607e-05, + "loss": 0.6551, + "step": 164 + }, + { + "epoch": 0.528, + "grad_norm": 0.2919025988586933, + "learning_rate": 9.584015078734395e-05, + "loss": 0.7045, + "step": 165 + }, + { + "epoch": 0.5312, + "grad_norm": 0.36491481768587625, + "learning_rate": 9.480103258380198e-05, + "loss": 0.7529, + "step": 166 + }, + { + "epoch": 0.5344, + "grad_norm": 0.31445880002906634, + "learning_rate": 9.376247697936719e-05, + "loss": 0.7426, + "step": 167 + }, + { + "epoch": 0.5376, + "grad_norm": 0.377819675471851, + "learning_rate": 9.272459635990562e-05, + "loss": 0.7031, + "step": 168 + }, + { + "epoch": 0.5408, + "grad_norm": 0.3508448667090174, + "learning_rate": 9.168750303824084e-05, + "loss": 0.7604, + "step": 169 + }, + { + "epoch": 0.544, + "grad_norm": 0.3163495470425652, + "learning_rate": 9.065130924199998e-05, + "loss": 0.7372, + "step": 170 + }, + { + "epoch": 0.5472, + "grad_norm": 0.310680946219706, + "learning_rate": 8.961612710146934e-05, + "loss": 0.73, + "step": 171 + }, + { + "epoch": 0.5504, + "grad_norm": 0.3767362502428194, + "learning_rate": 8.858206863746018e-05, + "loss": 0.7748, + "step": 172 + }, + { + "epoch": 0.5536, + "grad_norm": 0.3333698966486753, + "learning_rate": 8.754924574918675e-05, + "loss": 0.7763, + "step": 173 + }, + { + "epoch": 0.5568, + "grad_norm": 0.2953895967701411, + "learning_rate": 8.651777020215712e-05, + "loss": 0.7358, + "step": 174 + }, + { + "epoch": 0.56, + "grad_norm": 0.3263010222901318, + "learning_rate": 8.548775361607872e-05, + "loss": 0.7688, + "step": 175 + }, + { + "epoch": 0.5632, + "grad_norm": 0.3339166843318738, + "learning_rate": 8.445930745277953e-05, + "loss": 0.709, + "step": 176 + }, + { + "epoch": 0.5664, + "grad_norm": 0.39246895356613587, + "learning_rate": 8.343254300414628e-05, + "loss": 0.7528, + "step": 177 + }, + { + "epoch": 0.5696, + "grad_norm": 0.33949846938337874, + "learning_rate": 8.240757138008149e-05, + "loss": 0.7201, + "step": 178 + }, + { + "epoch": 0.5728, + "grad_norm": 0.3488329704671586, + "learning_rate": 8.138450349647936e-05, + "loss": 0.7388, + "step": 179 + }, + { + "epoch": 0.576, + "grad_norm": 0.35937058036793534, + "learning_rate": 8.036345006322359e-05, + "loss": 0.7339, + "step": 180 + }, + { + "epoch": 0.5792, + "grad_norm": 0.30344315288005674, + "learning_rate": 7.934452157220694e-05, + "loss": 0.7444, + "step": 181 + }, + { + "epoch": 0.5824, + "grad_norm": 0.30055346873379907, + "learning_rate": 7.832782828537437e-05, + "loss": 0.7264, + "step": 182 + }, + { + "epoch": 0.5856, + "grad_norm": 0.5512874605658908, + "learning_rate": 7.731348022279134e-05, + "loss": 0.7922, + "step": 183 + }, + { + "epoch": 0.5888, + "grad_norm": 0.28422384034250225, + "learning_rate": 7.630158715073813e-05, + "loss": 0.6951, + "step": 184 + }, + { + "epoch": 0.592, + "grad_norm": 0.3264059434946543, + "learning_rate": 7.52922585698315e-05, + "loss": 0.7573, + "step": 185 + }, + { + "epoch": 0.5952, + "grad_norm": 0.34136021341909206, + "learning_rate": 7.428560370317542e-05, + "loss": 0.8017, + "step": 186 + }, + { + "epoch": 0.5984, + "grad_norm": 0.31936628762037644, + "learning_rate": 7.328173148454151e-05, + "loss": 0.7352, + "step": 187 + }, + { + "epoch": 0.6016, + "grad_norm": 0.3705583084948377, + "learning_rate": 7.228075054658096e-05, + "loss": 0.753, + "step": 188 + }, + { + "epoch": 0.6048, + "grad_norm": 0.2641217408168145, + "learning_rate": 7.1282769209069e-05, + "loss": 0.6945, + "step": 189 + }, + { + "epoch": 0.608, + "grad_norm": 0.28199637025305835, + "learning_rate": 7.028789546718326e-05, + "loss": 0.6657, + "step": 190 + }, + { + "epoch": 0.6112, + "grad_norm": 0.3455130815729331, + "learning_rate": 6.929623697981718e-05, + "loss": 0.786, + "step": 191 + }, + { + "epoch": 0.6144, + "grad_norm": 0.33204650997498225, + "learning_rate": 6.830790105792973e-05, + "loss": 0.6985, + "step": 192 + }, + { + "epoch": 0.6176, + "grad_norm": 0.3363804677488833, + "learning_rate": 6.732299465293322e-05, + "loss": 0.7668, + "step": 193 + }, + { + "epoch": 0.6208, + "grad_norm": 0.28777279245420617, + "learning_rate": 6.63416243451194e-05, + "loss": 0.7124, + "step": 194 + }, + { + "epoch": 0.624, + "grad_norm": 0.32216300619118104, + "learning_rate": 6.536389633212609e-05, + "loss": 0.8082, + "step": 195 + }, + { + "epoch": 0.6272, + "grad_norm": 0.2751244879997451, + "learning_rate": 6.43899164174453e-05, + "loss": 0.7523, + "step": 196 + }, + { + "epoch": 0.6304, + "grad_norm": 0.33953479421377675, + "learning_rate": 6.341978999897365e-05, + "loss": 0.8128, + "step": 197 + }, + { + "epoch": 0.6336, + "grad_norm": 0.3698224464316948, + "learning_rate": 6.245362205760704e-05, + "loss": 0.8092, + "step": 198 + }, + { + "epoch": 0.6368, + "grad_norm": 0.2933579428054677, + "learning_rate": 6.149151714588009e-05, + "loss": 0.7312, + "step": 199 + }, + { + "epoch": 0.64, + "grad_norm": 0.3017197415602167, + "learning_rate": 6.053357937665237e-05, + "loss": 0.7024, + "step": 200 + }, + { + "epoch": 0.6432, + "grad_norm": 0.31184940241088266, + "learning_rate": 5.957991241184184e-05, + "loss": 0.7134, + "step": 201 + }, + { + "epoch": 0.6464, + "grad_norm": 0.31997021975402207, + "learning_rate": 5.863061945120719e-05, + "loss": 0.7077, + "step": 202 + }, + { + "epoch": 0.6496, + "grad_norm": 0.32029492277494004, + "learning_rate": 5.768580322118034e-05, + "loss": 0.7162, + "step": 203 + }, + { + "epoch": 0.6528, + "grad_norm": 0.34003208753770975, + "learning_rate": 5.6745565963749925e-05, + "loss": 0.7546, + "step": 204 + }, + { + "epoch": 0.656, + "grad_norm": 0.29865775478304046, + "learning_rate": 5.5810009425397294e-05, + "loss": 0.7071, + "step": 205 + }, + { + "epoch": 0.6592, + "grad_norm": 0.35713876514251774, + "learning_rate": 5.487923484608629e-05, + "loss": 0.7462, + "step": 206 + }, + { + "epoch": 0.6624, + "grad_norm": 0.354479165253479, + "learning_rate": 5.395334294830765e-05, + "loss": 0.742, + "step": 207 + }, + { + "epoch": 0.6656, + "grad_norm": 0.3072783382105601, + "learning_rate": 5.3032433926179395e-05, + "loss": 0.7326, + "step": 208 + }, + { + "epoch": 0.6688, + "grad_norm": 0.3066891047944988, + "learning_rate": 5.211660743460458e-05, + "loss": 0.702, + "step": 209 + }, + { + "epoch": 0.672, + "grad_norm": 0.32137740444038204, + "learning_rate": 5.1205962578487155e-05, + "loss": 0.7166, + "step": 210 + }, + { + "epoch": 0.6752, + "grad_norm": 0.3226385011981905, + "learning_rate": 5.030059790200756e-05, + "loss": 0.7072, + "step": 211 + }, + { + "epoch": 0.6784, + "grad_norm": 0.30654979020218126, + "learning_rate": 4.940061137795876e-05, + "loss": 0.7463, + "step": 212 + }, + { + "epoch": 0.6816, + "grad_norm": 0.364514113762281, + "learning_rate": 4.850610039714444e-05, + "loss": 0.7154, + "step": 213 + }, + { + "epoch": 0.6848, + "grad_norm": 0.31061468570850853, + "learning_rate": 4.761716175783989e-05, + "loss": 0.7561, + "step": 214 + }, + { + "epoch": 0.688, + "grad_norm": 0.3820596683556745, + "learning_rate": 4.673389165531714e-05, + "loss": 0.742, + "step": 215 + }, + { + "epoch": 0.6912, + "grad_norm": 0.3690660332818625, + "learning_rate": 4.585638567143529e-05, + "loss": 0.7749, + "step": 216 + }, + { + "epoch": 0.6944, + "grad_norm": 0.3103712139550411, + "learning_rate": 4.498473876429726e-05, + "loss": 0.6921, + "step": 217 + }, + { + "epoch": 0.6976, + "grad_norm": 0.31693912519742806, + "learning_rate": 4.411904525797408e-05, + "loss": 0.7153, + "step": 218 + }, + { + "epoch": 0.7008, + "grad_norm": 0.4035650161680625, + "learning_rate": 4.325939883229766e-05, + "loss": 0.7578, + "step": 219 + }, + { + "epoch": 0.704, + "grad_norm": 0.3155224002650208, + "learning_rate": 4.240589251272342e-05, + "loss": 0.758, + "step": 220 + }, + { + "epoch": 0.7072, + "grad_norm": 0.29153296754586383, + "learning_rate": 4.155861866026364e-05, + "loss": 0.7223, + "step": 221 + }, + { + "epoch": 0.7104, + "grad_norm": 0.36059846357452535, + "learning_rate": 4.071766896149273e-05, + "loss": 0.707, + "step": 222 + }, + { + "epoch": 0.7136, + "grad_norm": 0.30478604752052557, + "learning_rate": 3.988313441862553e-05, + "loss": 0.7, + "step": 223 + }, + { + "epoch": 0.7168, + "grad_norm": 0.2765974186524337, + "learning_rate": 3.9055105339669595e-05, + "loss": 0.7077, + "step": 224 + }, + { + "epoch": 0.72, + "grad_norm": 0.30696336878631286, + "learning_rate": 3.823367132865265e-05, + "loss": 0.7149, + "step": 225 + }, + { + "epoch": 0.7232, + "grad_norm": 0.2693060484483783, + "learning_rate": 3.741892127592625e-05, + "loss": 0.6853, + "step": 226 + }, + { + "epoch": 0.7264, + "grad_norm": 0.2996865559401467, + "learning_rate": 3.6610943348546526e-05, + "loss": 0.744, + "step": 227 + }, + { + "epoch": 0.7296, + "grad_norm": 0.32116367983000393, + "learning_rate": 3.580982498073344e-05, + "loss": 0.736, + "step": 228 + }, + { + "epoch": 0.7328, + "grad_norm": 0.30953157227898026, + "learning_rate": 3.501565286440914e-05, + "loss": 0.6969, + "step": 229 + }, + { + "epoch": 0.736, + "grad_norm": 0.32878634968048037, + "learning_rate": 3.422851293981676e-05, + "loss": 0.7321, + "step": 230 + }, + { + "epoch": 0.7392, + "grad_norm": 0.3058791861937667, + "learning_rate": 3.3448490386220355e-05, + "loss": 0.7205, + "step": 231 + }, + { + "epoch": 0.7424, + "grad_norm": 0.31626313050720906, + "learning_rate": 3.2675669612687565e-05, + "loss": 0.7064, + "step": 232 + }, + { + "epoch": 0.7456, + "grad_norm": 0.33087842953319835, + "learning_rate": 3.191013424895536e-05, + "loss": 0.7387, + "step": 233 + }, + { + "epoch": 0.7488, + "grad_norm": 0.3103197856204972, + "learning_rate": 3.115196713638e-05, + "loss": 0.7006, + "step": 234 + }, + { + "epoch": 0.752, + "grad_norm": 0.3892646864117474, + "learning_rate": 3.040125031897264e-05, + "loss": 0.7486, + "step": 235 + }, + { + "epoch": 0.7552, + "grad_norm": 0.3265850973989887, + "learning_rate": 2.9658065034520978e-05, + "loss": 0.7401, + "step": 236 + }, + { + "epoch": 0.7584, + "grad_norm": 0.30929124848146605, + "learning_rate": 2.892249170579826e-05, + "loss": 0.6481, + "step": 237 + }, + { + "epoch": 0.7616, + "grad_norm": 0.36052434727246496, + "learning_rate": 2.8194609931860316e-05, + "loss": 0.7153, + "step": 238 + }, + { + "epoch": 0.7648, + "grad_norm": 0.32077054306044706, + "learning_rate": 2.7474498479432087e-05, + "loss": 0.69, + "step": 239 + }, + { + "epoch": 0.768, + "grad_norm": 0.2942321680000018, + "learning_rate": 2.6762235274383772e-05, + "loss": 0.6921, + "step": 240 + }, + { + "epoch": 0.7712, + "grad_norm": 0.366812231700345, + "learning_rate": 2.6057897393298324e-05, + "loss": 0.7179, + "step": 241 + }, + { + "epoch": 0.7744, + "grad_norm": 0.2976605054353928, + "learning_rate": 2.536156105513062e-05, + "loss": 0.7097, + "step": 242 + }, + { + "epoch": 0.7776, + "grad_norm": 0.2806071151500792, + "learning_rate": 2.4673301612959654e-05, + "loss": 0.6906, + "step": 243 + }, + { + "epoch": 0.7808, + "grad_norm": 0.29862145430358444, + "learning_rate": 2.399319354583418e-05, + "loss": 0.6859, + "step": 244 + }, + { + "epoch": 0.784, + "grad_norm": 0.31848989816101214, + "learning_rate": 2.3321310450713062e-05, + "loss": 0.7626, + "step": 245 + }, + { + "epoch": 0.7872, + "grad_norm": 0.3175131736732565, + "learning_rate": 2.265772503450122e-05, + "loss": 0.7436, + "step": 246 + }, + { + "epoch": 0.7904, + "grad_norm": 0.28853504741657543, + "learning_rate": 2.2002509106181624e-05, + "loss": 0.6933, + "step": 247 + }, + { + "epoch": 0.7936, + "grad_norm": 0.32076762021357175, + "learning_rate": 2.1355733569044635e-05, + "loss": 0.7019, + "step": 248 + }, + { + "epoch": 0.7968, + "grad_norm": 0.34196822163816115, + "learning_rate": 2.0717468413015283e-05, + "loss": 0.6907, + "step": 249 + }, + { + "epoch": 0.8, + "grad_norm": 0.3134739522586932, + "learning_rate": 2.008778270707944e-05, + "loss": 0.7212, + "step": 250 + }, + { + "epoch": 0.8032, + "grad_norm": 0.3518835608545312, + "learning_rate": 1.946674459180955e-05, + "loss": 0.7932, + "step": 251 + }, + { + "epoch": 0.8064, + "grad_norm": 0.33140803770709104, + "learning_rate": 1.8854421271990964e-05, + "loss": 0.7868, + "step": 252 + }, + { + "epoch": 0.8096, + "grad_norm": 0.3565207578988336, + "learning_rate": 1.8250879009349398e-05, + "loss": 0.7389, + "step": 253 + }, + { + "epoch": 0.8128, + "grad_norm": 0.30549965924216654, + "learning_rate": 1.7656183115380577e-05, + "loss": 0.6764, + "step": 254 + }, + { + "epoch": 0.816, + "grad_norm": 0.29213626540883963, + "learning_rate": 1.707039794428259e-05, + "loss": 0.7273, + "step": 255 + }, + { + "epoch": 0.8192, + "grad_norm": 0.2980038332160969, + "learning_rate": 1.649358688599191e-05, + "loss": 0.6846, + "step": 256 + }, + { + "epoch": 0.8224, + "grad_norm": 0.300210767086982, + "learning_rate": 1.5925812359323745e-05, + "loss": 0.667, + "step": 257 + }, + { + "epoch": 0.8256, + "grad_norm": 0.29777208684669104, + "learning_rate": 1.5367135805217458e-05, + "loss": 0.6726, + "step": 258 + }, + { + "epoch": 0.8288, + "grad_norm": 0.32989535704655576, + "learning_rate": 1.4817617680087825e-05, + "loss": 0.7441, + "step": 259 + }, + { + "epoch": 0.832, + "grad_norm": 0.3527085657083696, + "learning_rate": 1.4277317449282834e-05, + "loss": 0.7425, + "step": 260 + }, + { + "epoch": 0.8352, + "grad_norm": 0.3413606436658763, + "learning_rate": 1.3746293580648717e-05, + "loss": 0.7133, + "step": 261 + }, + { + "epoch": 0.8384, + "grad_norm": 0.34174194853533874, + "learning_rate": 1.3224603538202929e-05, + "loss": 0.707, + "step": 262 + }, + { + "epoch": 0.8416, + "grad_norm": 0.30750666981171837, + "learning_rate": 1.2712303775915802e-05, + "loss": 0.6718, + "step": 263 + }, + { + "epoch": 0.8448, + "grad_norm": 0.34851966567817805, + "learning_rate": 1.220944973160133e-05, + "loss": 0.7525, + "step": 264 + }, + { + "epoch": 0.848, + "grad_norm": 0.3430438314400539, + "learning_rate": 1.1716095820918216e-05, + "loss": 0.6865, + "step": 265 + }, + { + "epoch": 0.8512, + "grad_norm": 0.34507491001563145, + "learning_rate": 1.1232295431481222e-05, + "loss": 0.7272, + "step": 266 + }, + { + "epoch": 0.8544, + "grad_norm": 0.3067889589850218, + "learning_rate": 1.0758100917083991e-05, + "loss": 0.7442, + "step": 267 + }, + { + "epoch": 0.8576, + "grad_norm": 0.32827798904559324, + "learning_rate": 1.0293563592033595e-05, + "loss": 0.7176, + "step": 268 + }, + { + "epoch": 0.8608, + "grad_norm": 0.40722093804907067, + "learning_rate": 9.838733725597615e-06, + "loss": 0.7319, + "step": 269 + }, + { + "epoch": 0.864, + "grad_norm": 0.2876035229559397, + "learning_rate": 9.393660536564408e-06, + "loss": 0.6754, + "step": 270 + }, + { + "epoch": 0.8672, + "grad_norm": 0.30978808958198306, + "learning_rate": 8.958392187916841e-06, + "loss": 0.7003, + "step": 271 + }, + { + "epoch": 0.8704, + "grad_norm": 0.29665723847819137, + "learning_rate": 8.532975781620512e-06, + "loss": 0.6846, + "step": 272 + }, + { + "epoch": 0.8736, + "grad_norm": 0.3242757800911127, + "learning_rate": 8.117457353526625e-06, + "loss": 0.7033, + "step": 273 + }, + { + "epoch": 0.8768, + "grad_norm": 0.3077986435730653, + "learning_rate": 7.711881868390291e-06, + "loss": 0.708, + "step": 274 + }, + { + "epoch": 0.88, + "grad_norm": 0.35347342436745144, + "learning_rate": 7.3162932150046885e-06, + "loss": 0.7094, + "step": 275 + }, + { + "epoch": 0.8832, + "grad_norm": 0.3074112596076014, + "learning_rate": 6.930734201451816e-06, + "loss": 0.6347, + "step": 276 + }, + { + "epoch": 0.8864, + "grad_norm": 0.2885076062887646, + "learning_rate": 6.555246550469907e-06, + "loss": 0.6771, + "step": 277 + }, + { + "epoch": 0.8896, + "grad_norm": 0.32272553689543476, + "learning_rate": 6.189870894938587e-06, + "loss": 0.7111, + "step": 278 + }, + { + "epoch": 0.8928, + "grad_norm": 0.310886031194646, + "learning_rate": 5.834646773481811e-06, + "loss": 0.7692, + "step": 279 + }, + { + "epoch": 0.896, + "grad_norm": 0.292492219987561, + "learning_rate": 5.489612626189245e-06, + "loss": 0.6892, + "step": 280 + }, + { + "epoch": 0.8992, + "grad_norm": 0.2868717907909984, + "learning_rate": 5.154805790456485e-06, + "loss": 0.6572, + "step": 281 + }, + { + "epoch": 0.9024, + "grad_norm": 0.30933088783540064, + "learning_rate": 4.830262496944693e-06, + "loss": 0.7281, + "step": 282 + }, + { + "epoch": 0.9056, + "grad_norm": 0.3204501856072015, + "learning_rate": 4.516017865659949e-06, + "loss": 0.672, + "step": 283 + }, + { + "epoch": 0.9088, + "grad_norm": 0.3173471684890607, + "learning_rate": 4.21210590215273e-06, + "loss": 0.742, + "step": 284 + }, + { + "epoch": 0.912, + "grad_norm": 0.2820215009364713, + "learning_rate": 3.918559493838114e-06, + "loss": 0.7136, + "step": 285 + }, + { + "epoch": 0.9152, + "grad_norm": 0.33732261505341615, + "learning_rate": 3.6354104064368566e-06, + "loss": 0.6868, + "step": 286 + }, + { + "epoch": 0.9184, + "grad_norm": 0.30613483894533183, + "learning_rate": 3.3626892805379562e-06, + "loss": 0.6549, + "step": 287 + }, + { + "epoch": 0.9216, + "grad_norm": 0.33200651204689885, + "learning_rate": 3.100425628282899e-06, + "loss": 0.6965, + "step": 288 + }, + { + "epoch": 0.9248, + "grad_norm": 0.317578222945079, + "learning_rate": 2.848647830172024e-06, + "loss": 0.707, + "step": 289 + }, + { + "epoch": 0.928, + "grad_norm": 0.3353103698687747, + "learning_rate": 2.607383131993424e-06, + "loss": 0.7123, + "step": 290 + }, + { + "epoch": 0.9312, + "grad_norm": 0.33880236092959853, + "learning_rate": 2.3766576418745022e-06, + "loss": 0.6995, + "step": 291 + }, + { + "epoch": 0.9344, + "grad_norm": 0.31986137813310067, + "learning_rate": 2.1564963274568027e-06, + "loss": 0.6933, + "step": 292 + }, + { + "epoch": 0.9376, + "grad_norm": 0.33061247165083923, + "learning_rate": 1.9469230131940907e-06, + "loss": 0.6693, + "step": 293 + }, + { + "epoch": 0.9408, + "grad_norm": 0.29596791807317174, + "learning_rate": 1.7479603777742938e-06, + "loss": 0.6963, + "step": 294 + }, + { + "epoch": 0.944, + "grad_norm": 0.30279235472694177, + "learning_rate": 1.559629951665298e-06, + "loss": 0.7168, + "step": 295 + }, + { + "epoch": 0.9472, + "grad_norm": 0.3508660189931767, + "learning_rate": 1.3819521147851123e-06, + "loss": 0.7781, + "step": 296 + }, + { + "epoch": 0.9504, + "grad_norm": 0.34720284637704274, + "learning_rate": 1.2149460942964098e-06, + "loss": 0.6795, + "step": 297 + }, + { + "epoch": 0.9536, + "grad_norm": 0.3069150363317921, + "learning_rate": 1.05862996252597e-06, + "loss": 0.7026, + "step": 298 + }, + { + "epoch": 0.9568, + "grad_norm": 0.33833933003093686, + "learning_rate": 9.130206350089765e-07, + "loss": 0.6799, + "step": 299 + }, + { + "epoch": 0.96, + "grad_norm": 0.3123949902112611, + "learning_rate": 7.781338686584927e-07, + "loss": 0.6992, + "step": 300 + }, + { + "epoch": 0.9632, + "grad_norm": 0.3649115379085674, + "learning_rate": 6.539842600603918e-07, + "loss": 0.6437, + "step": 301 + }, + { + "epoch": 0.9664, + "grad_norm": 0.3187056613092533, + "learning_rate": 5.405852438937764e-07, + "loss": 0.7325, + "step": 302 + }, + { + "epoch": 0.9696, + "grad_norm": 0.37774427223289525, + "learning_rate": 4.3794909147720773e-07, + "loss": 0.7433, + "step": 303 + }, + { + "epoch": 0.9728, + "grad_norm": 0.30360889754672465, + "learning_rate": 3.4608690944071263e-07, + "loss": 0.6937, + "step": 304 + }, + { + "epoch": 0.976, + "grad_norm": 0.36622826874025366, + "learning_rate": 2.6500863852395584e-07, + "loss": 0.7688, + "step": 305 + }, + { + "epoch": 0.9792, + "grad_norm": 0.29793100657713634, + "learning_rate": 1.947230525005006e-07, + "loss": 0.7004, + "step": 306 + }, + { + "epoch": 0.9824, + "grad_norm": 0.32667040098315064, + "learning_rate": 1.3523775722834587e-07, + "loss": 0.7172, + "step": 307 + }, + { + "epoch": 0.9856, + "grad_norm": 0.2774095648074841, + "learning_rate": 8.655918982689581e-08, + "loss": 0.6632, + "step": 308 + }, + { + "epoch": 0.9888, + "grad_norm": 0.36101836423025013, + "learning_rate": 4.8692617980350406e-08, + "loss": 0.7271, + "step": 309 + }, + { + "epoch": 0.992, + "grad_norm": 0.28363398532541456, + "learning_rate": 2.164213936770576e-08, + "loss": 0.6835, + "step": 310 + }, + { + "epoch": 0.9952, + "grad_norm": 0.3101541265795471, + "learning_rate": 5.410681219286673e-09, + "loss": 0.673, + "step": 311 + }, + { + "epoch": 0.9984, + "grad_norm": 0.4525513676593632, + "learning_rate": 0.0, + "loss": 0.7941, + "step": 312 + }, + { + "epoch": 0.9984, + "step": 312, + "total_flos": 786873517342720.0, + "train_loss": 0.7704597816635401, + "train_runtime": 9554.0421, + "train_samples_per_second": 1.047, + "train_steps_per_second": 0.033 + } + ], + "logging_steps": 1.0, + "max_steps": 312, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 786873517342720.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_3_epochs_1_GA_2_lora/README.md b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_3_epochs_1_GA_2_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_3_epochs_1_GA_2_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_3_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_3_epochs_1_GA_2_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..23a3e536e974f1eaaf496d7257728d54b167649d --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_3_epochs_1_GA_2_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "down_proj", + "k_proj", + "gate_proj", + "up_proj", + "o_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2c6ac7dd93432a68c489b02d96b5a1445c5a6d31 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:615e942fb4821335cea29abdf1b9f617266efbbef026514a2aecd3f9106caa9e +size 671150064 diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_3_epochs_1_GA_2_lora/config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_3_epochs_1_GA_2_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_3_epochs_1_GA_2_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..5d318c828d80516011d93fee452cb99f234e50c1 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ace8b7e231a47f824654d09f66cc1bb791af3142f0e069bd42bb59062967cb5d +size 918507402 diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_3_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_3_epochs_1_GA_2_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f30f002e9c734dcc89180ac3c07469166621cc10 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_3_epochs_1_GA_2_lora/trainer_state.json @@ -0,0 +1,4417 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 625, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0016, + "grad_norm": 1.0015174620485605, + "learning_rate": 1.0526315789473684e-05, + "loss": 1.4793, + "step": 1 + }, + { + "epoch": 0.0032, + "grad_norm": 0.9876415696365153, + "learning_rate": 2.105263157894737e-05, + "loss": 1.3539, + "step": 2 + }, + { + "epoch": 0.0048, + "grad_norm": 0.8871311223104527, + "learning_rate": 3.157894736842105e-05, + "loss": 1.4285, + "step": 3 + }, + { + "epoch": 0.0064, + "grad_norm": 0.8886476879769438, + "learning_rate": 4.210526315789474e-05, + "loss": 1.4661, + "step": 4 + }, + { + "epoch": 0.008, + "grad_norm": 0.7508929168773036, + "learning_rate": 5.2631578947368424e-05, + "loss": 1.2084, + "step": 5 + }, + { + "epoch": 0.0096, + "grad_norm": 0.8976026372242618, + "learning_rate": 6.31578947368421e-05, + "loss": 1.2591, + "step": 6 + }, + { + "epoch": 0.0112, + "grad_norm": 0.9424845533668358, + "learning_rate": 7.368421052631579e-05, + "loss": 1.1513, + "step": 7 + }, + { + "epoch": 0.0128, + "grad_norm": 1.455853155223566, + "learning_rate": 8.421052631578948e-05, + "loss": 1.1339, + "step": 8 + }, + { + "epoch": 0.0144, + "grad_norm": 0.8693405772483687, + "learning_rate": 9.473684210526316e-05, + "loss": 1.0139, + "step": 9 + }, + { + "epoch": 0.016, + "grad_norm": 0.7654137947938375, + "learning_rate": 0.00010526315789473685, + "loss": 1.0142, + "step": 10 + }, + { + "epoch": 0.0176, + "grad_norm": 0.6451888198745874, + "learning_rate": 0.00011578947368421053, + "loss": 0.921, + "step": 11 + }, + { + "epoch": 0.0192, + "grad_norm": 0.5674718053393812, + "learning_rate": 0.0001263157894736842, + "loss": 0.9655, + "step": 12 + }, + { + "epoch": 0.0208, + "grad_norm": 0.6338107894244854, + "learning_rate": 0.0001368421052631579, + "loss": 0.9404, + "step": 13 + }, + { + "epoch": 0.0224, + "grad_norm": 0.5970437433211514, + "learning_rate": 0.00014736842105263158, + "loss": 0.9946, + "step": 14 + }, + { + "epoch": 0.024, + "grad_norm": 0.5386631750608524, + "learning_rate": 0.00015789473684210527, + "loss": 0.9299, + "step": 15 + }, + { + "epoch": 0.0256, + "grad_norm": 0.6098521133390008, + "learning_rate": 0.00016842105263157895, + "loss": 0.8836, + "step": 16 + }, + { + "epoch": 0.0272, + "grad_norm": 0.5526053541676853, + "learning_rate": 0.00017894736842105264, + "loss": 0.8614, + "step": 17 + }, + { + "epoch": 0.0288, + "grad_norm": 0.5621891595051264, + "learning_rate": 0.00018947368421052632, + "loss": 0.9052, + "step": 18 + }, + { + "epoch": 0.0304, + "grad_norm": 0.4766365170854893, + "learning_rate": 0.0002, + "loss": 0.8847, + "step": 19 + }, + { + "epoch": 0.032, + "grad_norm": 0.5840529147180359, + "learning_rate": 0.00019999865623437013, + "loss": 0.9846, + "step": 20 + }, + { + "epoch": 0.0336, + "grad_norm": 0.5547468909516751, + "learning_rate": 0.00019999462497359466, + "loss": 0.9126, + "step": 21 + }, + { + "epoch": 0.0352, + "grad_norm": 0.5236711341322138, + "learning_rate": 0.00019998790632601496, + "loss": 0.9088, + "step": 22 + }, + { + "epoch": 0.0368, + "grad_norm": 0.5134909604364669, + "learning_rate": 0.0001999785004721968, + "loss": 0.922, + "step": 23 + }, + { + "epoch": 0.0384, + "grad_norm": 0.9697300427476122, + "learning_rate": 0.00019996640766492543, + "loss": 0.9295, + "step": 24 + }, + { + "epoch": 0.04, + "grad_norm": 0.5444734161267164, + "learning_rate": 0.00019995162822919883, + "loss": 0.8188, + "step": 25 + }, + { + "epoch": 0.0416, + "grad_norm": 0.5285720378875282, + "learning_rate": 0.00019993416256221895, + "loss": 0.9296, + "step": 26 + }, + { + "epoch": 0.0432, + "grad_norm": 0.5472043933513683, + "learning_rate": 0.00019991401113338104, + "loss": 0.8499, + "step": 27 + }, + { + "epoch": 0.0448, + "grad_norm": 0.5731832933338967, + "learning_rate": 0.00019989117448426108, + "loss": 0.8745, + "step": 28 + }, + { + "epoch": 0.0464, + "grad_norm": 0.5860201677194361, + "learning_rate": 0.00019986565322860115, + "loss": 0.9126, + "step": 29 + }, + { + "epoch": 0.048, + "grad_norm": 0.5533538251539861, + "learning_rate": 0.00019983744805229296, + "loss": 0.9087, + "step": 30 + }, + { + "epoch": 0.0496, + "grad_norm": 0.5203941717384726, + "learning_rate": 0.00019980655971335945, + "loss": 0.8501, + "step": 31 + }, + { + "epoch": 0.0512, + "grad_norm": 0.5830666219014088, + "learning_rate": 0.00019977298904193437, + "loss": 0.9825, + "step": 32 + }, + { + "epoch": 0.0528, + "grad_norm": 0.6611403135288123, + "learning_rate": 0.00019973673694024, + "loss": 0.8221, + "step": 33 + }, + { + "epoch": 0.0544, + "grad_norm": 0.5509131235427106, + "learning_rate": 0.00019969780438256293, + "loss": 0.9468, + "step": 34 + }, + { + "epoch": 0.056, + "grad_norm": 0.48262614802977466, + "learning_rate": 0.0001996561924152278, + "loss": 0.885, + "step": 35 + }, + { + "epoch": 0.0576, + "grad_norm": 0.4967468079606848, + "learning_rate": 0.0001996119021565693, + "loss": 0.8773, + "step": 36 + }, + { + "epoch": 0.0592, + "grad_norm": 0.5054682556817377, + "learning_rate": 0.0001995649347969019, + "loss": 0.8839, + "step": 37 + }, + { + "epoch": 0.0608, + "grad_norm": 0.5783100473180441, + "learning_rate": 0.00019951529159848805, + "loss": 0.8111, + "step": 38 + }, + { + "epoch": 0.0624, + "grad_norm": 0.5091982204899665, + "learning_rate": 0.00019946297389550433, + "loss": 0.9338, + "step": 39 + }, + { + "epoch": 0.064, + "grad_norm": 0.48714751969431214, + "learning_rate": 0.00019940798309400526, + "loss": 0.8598, + "step": 40 + }, + { + "epoch": 0.0656, + "grad_norm": 0.5905380509640572, + "learning_rate": 0.0001993503206718859, + "loss": 0.9708, + "step": 41 + }, + { + "epoch": 0.0672, + "grad_norm": 0.4517410539140721, + "learning_rate": 0.00019928998817884182, + "loss": 0.7965, + "step": 42 + }, + { + "epoch": 0.0688, + "grad_norm": 0.5897811049658183, + "learning_rate": 0.00019922698723632767, + "loss": 0.9407, + "step": 43 + }, + { + "epoch": 0.0704, + "grad_norm": 0.6449325148181358, + "learning_rate": 0.00019916131953751342, + "loss": 0.8267, + "step": 44 + }, + { + "epoch": 0.072, + "grad_norm": 0.590279216662507, + "learning_rate": 0.00019909298684723904, + "loss": 0.8075, + "step": 45 + }, + { + "epoch": 0.0736, + "grad_norm": 0.4803036071450405, + "learning_rate": 0.00019902199100196697, + "loss": 0.8232, + "step": 46 + }, + { + "epoch": 0.0752, + "grad_norm": 0.650057906490265, + "learning_rate": 0.00019894833390973266, + "loss": 0.8773, + "step": 47 + }, + { + "epoch": 0.0768, + "grad_norm": 0.5248713216167825, + "learning_rate": 0.00019887201755009357, + "loss": 0.7718, + "step": 48 + }, + { + "epoch": 0.0784, + "grad_norm": 0.640458383752968, + "learning_rate": 0.0001987930439740757, + "loss": 0.9686, + "step": 49 + }, + { + "epoch": 0.08, + "grad_norm": 0.5541359099271482, + "learning_rate": 0.00019871141530411853, + "loss": 0.9041, + "step": 50 + }, + { + "epoch": 0.0816, + "grad_norm": 0.5028891773577449, + "learning_rate": 0.0001986271337340182, + "loss": 0.8902, + "step": 51 + }, + { + "epoch": 0.0832, + "grad_norm": 0.4272704300386147, + "learning_rate": 0.00019854020152886814, + "loss": 0.7603, + "step": 52 + }, + { + "epoch": 0.0848, + "grad_norm": 0.47583155154421025, + "learning_rate": 0.0001984506210249986, + "loss": 0.8069, + "step": 53 + }, + { + "epoch": 0.0864, + "grad_norm": 0.5709391164161426, + "learning_rate": 0.00019835839462991361, + "loss": 0.9319, + "step": 54 + }, + { + "epoch": 0.088, + "grad_norm": 0.4817867624586776, + "learning_rate": 0.00019826352482222638, + "loss": 0.9246, + "step": 55 + }, + { + "epoch": 0.0896, + "grad_norm": 0.468122073430705, + "learning_rate": 0.00019816601415159263, + "loss": 0.835, + "step": 56 + }, + { + "epoch": 0.0912, + "grad_norm": 0.48146205808220116, + "learning_rate": 0.0001980658652386421, + "loss": 0.8406, + "step": 57 + }, + { + "epoch": 0.0928, + "grad_norm": 0.4293599866988802, + "learning_rate": 0.00019796308077490817, + "loss": 0.7913, + "step": 58 + }, + { + "epoch": 0.0944, + "grad_norm": 0.513833564071936, + "learning_rate": 0.00019785766352275542, + "loss": 0.8971, + "step": 59 + }, + { + "epoch": 0.096, + "grad_norm": 0.5306753517830095, + "learning_rate": 0.00019774961631530545, + "loss": 0.8591, + "step": 60 + }, + { + "epoch": 0.0976, + "grad_norm": 0.5352719795282737, + "learning_rate": 0.00019763894205636072, + "loss": 0.8931, + "step": 61 + }, + { + "epoch": 0.0992, + "grad_norm": 0.46783532766005226, + "learning_rate": 0.00019752564372032657, + "loss": 0.8904, + "step": 62 + }, + { + "epoch": 0.1008, + "grad_norm": 0.4614025887933628, + "learning_rate": 0.00019740972435213115, + "loss": 0.8454, + "step": 63 + }, + { + "epoch": 0.1024, + "grad_norm": 0.5292160356256126, + "learning_rate": 0.00019729118706714375, + "loss": 0.8513, + "step": 64 + }, + { + "epoch": 0.104, + "grad_norm": 0.4982992250084739, + "learning_rate": 0.00019717003505109095, + "loss": 0.8386, + "step": 65 + }, + { + "epoch": 0.1056, + "grad_norm": 0.48428581778363466, + "learning_rate": 0.00019704627155997108, + "loss": 0.8086, + "step": 66 + }, + { + "epoch": 0.1072, + "grad_norm": 0.5073401208837878, + "learning_rate": 0.00019691989991996663, + "loss": 0.8522, + "step": 67 + }, + { + "epoch": 0.1088, + "grad_norm": 0.4801419636570018, + "learning_rate": 0.0001967909235273549, + "loss": 0.8365, + "step": 68 + }, + { + "epoch": 0.1104, + "grad_norm": 0.5221625549443353, + "learning_rate": 0.00019665934584841682, + "loss": 0.8845, + "step": 69 + }, + { + "epoch": 0.112, + "grad_norm": 0.5986785798900521, + "learning_rate": 0.00019652517041934356, + "loss": 0.955, + "step": 70 + }, + { + "epoch": 0.1136, + "grad_norm": 0.48615409615097854, + "learning_rate": 0.00019638840084614182, + "loss": 0.8108, + "step": 71 + }, + { + "epoch": 0.1152, + "grad_norm": 0.6451006868580349, + "learning_rate": 0.00019624904080453655, + "loss": 0.9381, + "step": 72 + }, + { + "epoch": 0.1168, + "grad_norm": 0.45118744899163543, + "learning_rate": 0.00019610709403987246, + "loss": 0.7676, + "step": 73 + }, + { + "epoch": 0.1184, + "grad_norm": 0.4340630088345874, + "learning_rate": 0.00019596256436701324, + "loss": 0.8363, + "step": 74 + }, + { + "epoch": 0.12, + "grad_norm": 0.4992414504376428, + "learning_rate": 0.000195815455670239, + "loss": 0.8589, + "step": 75 + }, + { + "epoch": 0.1216, + "grad_norm": 0.45489056952933793, + "learning_rate": 0.00019566577190314197, + "loss": 0.7578, + "step": 76 + }, + { + "epoch": 0.1232, + "grad_norm": 0.462574375461729, + "learning_rate": 0.0001955135170885202, + "loss": 0.8144, + "step": 77 + }, + { + "epoch": 0.1248, + "grad_norm": 0.44677457011919625, + "learning_rate": 0.00019535869531826937, + "loss": 0.7371, + "step": 78 + }, + { + "epoch": 0.1264, + "grad_norm": 0.5300906125839625, + "learning_rate": 0.00019520131075327298, + "loss": 0.7211, + "step": 79 + }, + { + "epoch": 0.128, + "grad_norm": 0.4956868841920524, + "learning_rate": 0.00019504136762329047, + "loss": 0.857, + "step": 80 + }, + { + "epoch": 0.1296, + "grad_norm": 0.478050340938763, + "learning_rate": 0.00019487887022684336, + "loss": 0.7653, + "step": 81 + }, + { + "epoch": 0.1312, + "grad_norm": 0.506827815430912, + "learning_rate": 0.00019471382293110003, + "loss": 0.8293, + "step": 82 + }, + { + "epoch": 0.1328, + "grad_norm": 0.4453021459427925, + "learning_rate": 0.00019454623017175812, + "loss": 0.7946, + "step": 83 + }, + { + "epoch": 0.1344, + "grad_norm": 0.5009642065830214, + "learning_rate": 0.00019437609645292546, + "loss": 0.8618, + "step": 84 + }, + { + "epoch": 0.136, + "grad_norm": 0.43906905180332645, + "learning_rate": 0.0001942034263469989, + "loss": 0.7575, + "step": 85 + }, + { + "epoch": 0.1376, + "grad_norm": 0.41950432980358227, + "learning_rate": 0.00019402822449454153, + "loss": 0.7369, + "step": 86 + }, + { + "epoch": 0.1392, + "grad_norm": 0.5016133929837371, + "learning_rate": 0.00019385049560415794, + "loss": 0.8835, + "step": 87 + }, + { + "epoch": 0.1408, + "grad_norm": 0.516494245891528, + "learning_rate": 0.00019367024445236754, + "loss": 0.8725, + "step": 88 + }, + { + "epoch": 0.1424, + "grad_norm": 0.5021999595004972, + "learning_rate": 0.00019348747588347637, + "loss": 0.7533, + "step": 89 + }, + { + "epoch": 0.144, + "grad_norm": 0.4430566994249469, + "learning_rate": 0.00019330219480944694, + "loss": 0.7624, + "step": 90 + }, + { + "epoch": 0.1456, + "grad_norm": 0.4509312202200304, + "learning_rate": 0.00019311440620976597, + "loss": 0.799, + "step": 91 + }, + { + "epoch": 0.1472, + "grad_norm": 0.43880539468467566, + "learning_rate": 0.0001929241151313108, + "loss": 0.8197, + "step": 92 + }, + { + "epoch": 0.1488, + "grad_norm": 0.5132542928814076, + "learning_rate": 0.00019273132668821364, + "loss": 0.7823, + "step": 93 + }, + { + "epoch": 0.1504, + "grad_norm": 0.5245855062966562, + "learning_rate": 0.00019253604606172417, + "loss": 0.7357, + "step": 94 + }, + { + "epoch": 0.152, + "grad_norm": 0.4947356222564879, + "learning_rate": 0.00019233827850007027, + "loss": 0.7901, + "step": 95 + }, + { + "epoch": 0.1536, + "grad_norm": 0.5166934209160752, + "learning_rate": 0.00019213802931831696, + "loss": 0.8242, + "step": 96 + }, + { + "epoch": 0.1552, + "grad_norm": 0.567129098577025, + "learning_rate": 0.00019193530389822363, + "loss": 0.7955, + "step": 97 + }, + { + "epoch": 0.1568, + "grad_norm": 0.49886841034517554, + "learning_rate": 0.00019173010768809933, + "loss": 0.8322, + "step": 98 + }, + { + "epoch": 0.1584, + "grad_norm": 0.5001197133372923, + "learning_rate": 0.0001915224462026563, + "loss": 0.8714, + "step": 99 + }, + { + "epoch": 0.16, + "grad_norm": 0.43228199460060496, + "learning_rate": 0.00019131232502286188, + "loss": 0.8307, + "step": 100 + }, + { + "epoch": 0.1616, + "grad_norm": 0.448189800279342, + "learning_rate": 0.0001910997497957885, + "loss": 0.8203, + "step": 101 + }, + { + "epoch": 0.1632, + "grad_norm": 0.5166084063249642, + "learning_rate": 0.00019088472623446183, + "loss": 0.9382, + "step": 102 + }, + { + "epoch": 0.1648, + "grad_norm": 0.42395231697251196, + "learning_rate": 0.00019066726011770726, + "loss": 0.7309, + "step": 103 + }, + { + "epoch": 0.1664, + "grad_norm": 0.5325728930102084, + "learning_rate": 0.0001904473572899947, + "loss": 0.7919, + "step": 104 + }, + { + "epoch": 0.168, + "grad_norm": 0.5810813596160656, + "learning_rate": 0.00019022502366128135, + "loss": 0.8665, + "step": 105 + }, + { + "epoch": 0.1696, + "grad_norm": 0.5178420018432477, + "learning_rate": 0.00019000026520685302, + "loss": 0.7748, + "step": 106 + }, + { + "epoch": 0.1712, + "grad_norm": 0.43174207765650063, + "learning_rate": 0.0001897730879671634, + "loss": 0.8587, + "step": 107 + }, + { + "epoch": 0.1728, + "grad_norm": 0.5084392269746427, + "learning_rate": 0.00018954349804767184, + "loss": 0.8812, + "step": 108 + }, + { + "epoch": 0.1744, + "grad_norm": 0.4938158911376374, + "learning_rate": 0.00018931150161867916, + "loss": 0.7739, + "step": 109 + }, + { + "epoch": 0.176, + "grad_norm": 0.4209234076367087, + "learning_rate": 0.00018907710491516199, + "loss": 0.8116, + "step": 110 + }, + { + "epoch": 0.1776, + "grad_norm": 0.4902464756683709, + "learning_rate": 0.0001888403142366049, + "loss": 0.8293, + "step": 111 + }, + { + "epoch": 0.1792, + "grad_norm": 0.4029217813145927, + "learning_rate": 0.00018860113594683148, + "loss": 0.7758, + "step": 112 + }, + { + "epoch": 0.1808, + "grad_norm": 0.5027079156271373, + "learning_rate": 0.00018835957647383303, + "loss": 0.8489, + "step": 113 + }, + { + "epoch": 0.1824, + "grad_norm": 0.46448340749276795, + "learning_rate": 0.00018811564230959588, + "loss": 0.8098, + "step": 114 + }, + { + "epoch": 0.184, + "grad_norm": 0.44950359161598746, + "learning_rate": 0.00018786934000992688, + "loss": 0.7572, + "step": 115 + }, + { + "epoch": 0.1856, + "grad_norm": 0.490597035755986, + "learning_rate": 0.00018762067619427746, + "loss": 0.8055, + "step": 116 + }, + { + "epoch": 0.1872, + "grad_norm": 0.5056506525939135, + "learning_rate": 0.00018736965754556528, + "loss": 0.7864, + "step": 117 + }, + { + "epoch": 0.1888, + "grad_norm": 0.48824543659451236, + "learning_rate": 0.00018711629080999504, + "loss": 0.826, + "step": 118 + }, + { + "epoch": 0.1904, + "grad_norm": 0.438338905865828, + "learning_rate": 0.00018686058279687698, + "loss": 0.8237, + "step": 119 + }, + { + "epoch": 0.192, + "grad_norm": 0.45226308155612466, + "learning_rate": 0.00018660254037844388, + "loss": 0.8023, + "step": 120 + }, + { + "epoch": 0.1936, + "grad_norm": 0.4794240733462027, + "learning_rate": 0.00018634217048966637, + "loss": 0.7136, + "step": 121 + }, + { + "epoch": 0.1952, + "grad_norm": 0.455112467805177, + "learning_rate": 0.0001860794801280666, + "loss": 0.8448, + "step": 122 + }, + { + "epoch": 0.1968, + "grad_norm": 0.554454372998596, + "learning_rate": 0.0001858144763535302, + "loss": 0.8949, + "step": 123 + }, + { + "epoch": 0.1984, + "grad_norm": 0.40719407499070376, + "learning_rate": 0.0001855471662881164, + "loss": 0.7724, + "step": 124 + }, + { + "epoch": 0.2, + "grad_norm": 0.49565188196879706, + "learning_rate": 0.00018527755711586678, + "loss": 0.7619, + "step": 125 + }, + { + "epoch": 0.2016, + "grad_norm": 0.5315727228630976, + "learning_rate": 0.00018500565608261214, + "loss": 0.8732, + "step": 126 + }, + { + "epoch": 0.2032, + "grad_norm": 0.5278973926598879, + "learning_rate": 0.00018473147049577774, + "loss": 0.8674, + "step": 127 + }, + { + "epoch": 0.2048, + "grad_norm": 0.5451926259797549, + "learning_rate": 0.00018445500772418697, + "loss": 0.9044, + "step": 128 + }, + { + "epoch": 0.2064, + "grad_norm": 0.4471628586141413, + "learning_rate": 0.00018417627519786315, + "loss": 0.8245, + "step": 129 + }, + { + "epoch": 0.208, + "grad_norm": 0.5099190212606985, + "learning_rate": 0.00018389528040783012, + "loss": 0.8499, + "step": 130 + }, + { + "epoch": 0.2096, + "grad_norm": 0.4842632782791375, + "learning_rate": 0.00018361203090591071, + "loss": 0.7982, + "step": 131 + }, + { + "epoch": 0.2112, + "grad_norm": 0.41845249804793744, + "learning_rate": 0.00018332653430452376, + "loss": 0.7219, + "step": 132 + }, + { + "epoch": 0.2128, + "grad_norm": 0.4521500741887537, + "learning_rate": 0.00018303879827647975, + "loss": 0.9326, + "step": 133 + }, + { + "epoch": 0.2144, + "grad_norm": 0.549396544660062, + "learning_rate": 0.00018274883055477436, + "loss": 0.8468, + "step": 134 + }, + { + "epoch": 0.216, + "grad_norm": 0.4250544447837251, + "learning_rate": 0.00018245663893238075, + "loss": 0.7971, + "step": 135 + }, + { + "epoch": 0.2176, + "grad_norm": 0.49064918738470137, + "learning_rate": 0.00018216223126204007, + "loss": 0.832, + "step": 136 + }, + { + "epoch": 0.2192, + "grad_norm": 0.463827496584346, + "learning_rate": 0.00018186561545605054, + "loss": 0.854, + "step": 137 + }, + { + "epoch": 0.2208, + "grad_norm": 0.4597855238764686, + "learning_rate": 0.00018156679948605467, + "loss": 0.78, + "step": 138 + }, + { + "epoch": 0.2224, + "grad_norm": 0.4914738603082715, + "learning_rate": 0.00018126579138282503, + "loss": 0.795, + "step": 139 + }, + { + "epoch": 0.224, + "grad_norm": 0.4926261389560048, + "learning_rate": 0.0001809625992360485, + "loss": 0.8305, + "step": 140 + }, + { + "epoch": 0.2256, + "grad_norm": 0.5031856664703104, + "learning_rate": 0.00018065723119410884, + "loss": 0.8475, + "step": 141 + }, + { + "epoch": 0.2272, + "grad_norm": 0.45706982680396824, + "learning_rate": 0.00018034969546386757, + "loss": 0.7866, + "step": 142 + }, + { + "epoch": 0.2288, + "grad_norm": 0.4301082418624238, + "learning_rate": 0.0001800400003104436, + "loss": 0.7872, + "step": 143 + }, + { + "epoch": 0.2304, + "grad_norm": 0.44829906702649863, + "learning_rate": 0.00017972815405699103, + "loss": 0.8394, + "step": 144 + }, + { + "epoch": 0.232, + "grad_norm": 0.45893221626141645, + "learning_rate": 0.00017941416508447536, + "loss": 0.8098, + "step": 145 + }, + { + "epoch": 0.2336, + "grad_norm": 0.46227383900189783, + "learning_rate": 0.0001790980418314484, + "loss": 0.8534, + "step": 146 + }, + { + "epoch": 0.2352, + "grad_norm": 0.489231355056065, + "learning_rate": 0.00017877979279382135, + "loss": 0.8027, + "step": 147 + }, + { + "epoch": 0.2368, + "grad_norm": 0.4365612758393478, + "learning_rate": 0.0001784594265246366, + "loss": 0.7776, + "step": 148 + }, + { + "epoch": 0.2384, + "grad_norm": 0.42303435003406964, + "learning_rate": 0.0001781369516338378, + "loss": 0.7209, + "step": 149 + }, + { + "epoch": 0.24, + "grad_norm": 0.5323466461183587, + "learning_rate": 0.00017781237678803847, + "loss": 0.8451, + "step": 150 + }, + { + "epoch": 0.2416, + "grad_norm": 0.44172675899358993, + "learning_rate": 0.000177485710710289, + "loss": 0.7089, + "step": 151 + }, + { + "epoch": 0.2432, + "grad_norm": 0.3987636549147191, + "learning_rate": 0.00017715696217984235, + "loss": 0.7322, + "step": 152 + }, + { + "epoch": 0.2448, + "grad_norm": 0.4987262959398162, + "learning_rate": 0.00017682614003191807, + "loss": 0.8479, + "step": 153 + }, + { + "epoch": 0.2464, + "grad_norm": 0.40802324497108594, + "learning_rate": 0.00017649325315746478, + "loss": 0.7477, + "step": 154 + }, + { + "epoch": 0.248, + "grad_norm": 0.44260849482811404, + "learning_rate": 0.0001761583105029213, + "loss": 0.8477, + "step": 155 + }, + { + "epoch": 0.2496, + "grad_norm": 0.45152048928671507, + "learning_rate": 0.00017582132106997616, + "loss": 0.7607, + "step": 156 + }, + { + "epoch": 0.2512, + "grad_norm": 0.3727117850944028, + "learning_rate": 0.00017548229391532572, + "loss": 0.7461, + "step": 157 + }, + { + "epoch": 0.2528, + "grad_norm": 0.4346644018549827, + "learning_rate": 0.00017514123815043074, + "loss": 0.768, + "step": 158 + }, + { + "epoch": 0.2544, + "grad_norm": 0.6387377791538981, + "learning_rate": 0.00017479816294127152, + "loss": 0.7941, + "step": 159 + }, + { + "epoch": 0.256, + "grad_norm": 0.46738874826437676, + "learning_rate": 0.0001744530775081015, + "loss": 0.7664, + "step": 160 + }, + { + "epoch": 0.2576, + "grad_norm": 0.5469311399482338, + "learning_rate": 0.0001741059911251997, + "loss": 0.9073, + "step": 161 + }, + { + "epoch": 0.2592, + "grad_norm": 0.459477644555679, + "learning_rate": 0.000173756913120621, + "loss": 0.8536, + "step": 162 + }, + { + "epoch": 0.2608, + "grad_norm": 0.5008793140691554, + "learning_rate": 0.00017340585287594604, + "loss": 0.9077, + "step": 163 + }, + { + "epoch": 0.2624, + "grad_norm": 0.4014856961750562, + "learning_rate": 0.0001730528198260285, + "loss": 0.7414, + "step": 164 + }, + { + "epoch": 0.264, + "grad_norm": 0.4156727906348157, + "learning_rate": 0.00017269782345874203, + "loss": 0.7879, + "step": 165 + }, + { + "epoch": 0.2656, + "grad_norm": 0.524723299497917, + "learning_rate": 0.00017234087331472497, + "loss": 0.8268, + "step": 166 + }, + { + "epoch": 0.2672, + "grad_norm": 0.42793136149893307, + "learning_rate": 0.00017198197898712404, + "loss": 0.7652, + "step": 167 + }, + { + "epoch": 0.2688, + "grad_norm": 0.5827078181581881, + "learning_rate": 0.00017162115012133643, + "loss": 0.8992, + "step": 168 + }, + { + "epoch": 0.2704, + "grad_norm": 0.5770024054009357, + "learning_rate": 0.00017125839641475072, + "loss": 0.8866, + "step": 169 + }, + { + "epoch": 0.272, + "grad_norm": 0.46303670018289184, + "learning_rate": 0.00017089372761648616, + "loss": 0.7922, + "step": 170 + }, + { + "epoch": 0.2736, + "grad_norm": 0.43144009507026043, + "learning_rate": 0.00017052715352713075, + "loss": 0.7332, + "step": 171 + }, + { + "epoch": 0.2752, + "grad_norm": 0.4676567443973931, + "learning_rate": 0.00017015868399847768, + "loss": 0.7909, + "step": 172 + }, + { + "epoch": 0.2768, + "grad_norm": 0.4518971099838989, + "learning_rate": 0.00016978832893326074, + "loss": 0.8271, + "step": 173 + }, + { + "epoch": 0.2784, + "grad_norm": 0.502779035397551, + "learning_rate": 0.00016941609828488807, + "loss": 0.7827, + "step": 174 + }, + { + "epoch": 0.28, + "grad_norm": 0.4739995975160528, + "learning_rate": 0.0001690420020571747, + "loss": 0.7485, + "step": 175 + }, + { + "epoch": 0.2816, + "grad_norm": 0.5806028445201168, + "learning_rate": 0.0001686660503040737, + "loss": 0.8176, + "step": 176 + }, + { + "epoch": 0.2832, + "grad_norm": 0.4440921268581451, + "learning_rate": 0.00016828825312940592, + "loss": 0.7396, + "step": 177 + }, + { + "epoch": 0.2848, + "grad_norm": 0.5464928579487258, + "learning_rate": 0.0001679086206865886, + "loss": 0.8075, + "step": 178 + }, + { + "epoch": 0.2864, + "grad_norm": 0.518382011831501, + "learning_rate": 0.00016752716317836229, + "loss": 0.8169, + "step": 179 + }, + { + "epoch": 0.288, + "grad_norm": 0.48733722711706706, + "learning_rate": 0.0001671438908565167, + "loss": 0.8649, + "step": 180 + }, + { + "epoch": 0.2896, + "grad_norm": 0.474150897982231, + "learning_rate": 0.00016675881402161536, + "loss": 0.7129, + "step": 181 + }, + { + "epoch": 0.2912, + "grad_norm": 0.5739178426992518, + "learning_rate": 0.0001663719430227186, + "loss": 0.8426, + "step": 182 + }, + { + "epoch": 0.2928, + "grad_norm": 0.5405118513170662, + "learning_rate": 0.00016598328825710533, + "loss": 0.8424, + "step": 183 + }, + { + "epoch": 0.2944, + "grad_norm": 0.4091725361226266, + "learning_rate": 0.000165592860169994, + "loss": 0.7383, + "step": 184 + }, + { + "epoch": 0.296, + "grad_norm": 0.4941518676866939, + "learning_rate": 0.00016520066925426144, + "loss": 0.7698, + "step": 185 + }, + { + "epoch": 0.2976, + "grad_norm": 0.460410852033095, + "learning_rate": 0.0001648067260501611, + "loss": 0.8312, + "step": 186 + }, + { + "epoch": 0.2992, + "grad_norm": 0.4414176713439921, + "learning_rate": 0.0001644110411450398, + "loss": 0.8052, + "step": 187 + }, + { + "epoch": 0.3008, + "grad_norm": 0.4144602985262704, + "learning_rate": 0.00016401362517305296, + "loss": 0.7652, + "step": 188 + }, + { + "epoch": 0.3024, + "grad_norm": 0.5133854428463844, + "learning_rate": 0.00016361448881487914, + "loss": 0.7949, + "step": 189 + }, + { + "epoch": 0.304, + "grad_norm": 0.5017733040045176, + "learning_rate": 0.00016321364279743266, + "loss": 0.8006, + "step": 190 + }, + { + "epoch": 0.3056, + "grad_norm": 0.5070132146678253, + "learning_rate": 0.0001628110978935756, + "loss": 0.8106, + "step": 191 + }, + { + "epoch": 0.3072, + "grad_norm": 0.5024867833341402, + "learning_rate": 0.00016240686492182804, + "loss": 0.7785, + "step": 192 + }, + { + "epoch": 0.3088, + "grad_norm": 0.4484189925977328, + "learning_rate": 0.00016200095474607753, + "loss": 0.7487, + "step": 193 + }, + { + "epoch": 0.3104, + "grad_norm": 0.47324489960943145, + "learning_rate": 0.00016159337827528685, + "loss": 0.8092, + "step": 194 + }, + { + "epoch": 0.312, + "grad_norm": 0.46644172129515227, + "learning_rate": 0.0001611841464632011, + "loss": 0.7696, + "step": 195 + }, + { + "epoch": 0.3136, + "grad_norm": 0.4445964562034957, + "learning_rate": 0.0001607732703080532, + "loss": 0.749, + "step": 196 + }, + { + "epoch": 0.3152, + "grad_norm": 0.44053163752870483, + "learning_rate": 0.00016036076085226814, + "loss": 0.6821, + "step": 197 + }, + { + "epoch": 0.3168, + "grad_norm": 0.44006988291084115, + "learning_rate": 0.0001599466291821666, + "loss": 0.8123, + "step": 198 + }, + { + "epoch": 0.3184, + "grad_norm": 0.46162021021553556, + "learning_rate": 0.0001595308864276666, + "loss": 0.7466, + "step": 199 + }, + { + "epoch": 0.32, + "grad_norm": 0.4861509641478491, + "learning_rate": 0.0001591135437619847, + "loss": 0.7303, + "step": 200 + }, + { + "epoch": 0.3216, + "grad_norm": 0.8113002127550668, + "learning_rate": 0.0001586946124013354, + "loss": 0.8212, + "step": 201 + }, + { + "epoch": 0.3232, + "grad_norm": 0.4412783697302041, + "learning_rate": 0.0001582741036046301, + "loss": 0.7124, + "step": 202 + }, + { + "epoch": 0.3248, + "grad_norm": 0.4009577314704931, + "learning_rate": 0.00015785202867317407, + "loss": 0.7361, + "step": 203 + }, + { + "epoch": 0.3264, + "grad_norm": 0.4836043807102253, + "learning_rate": 0.00015742839895036305, + "loss": 0.6946, + "step": 204 + }, + { + "epoch": 0.328, + "grad_norm": 0.5510487709140544, + "learning_rate": 0.00015700322582137827, + "loss": 0.8482, + "step": 205 + }, + { + "epoch": 0.3296, + "grad_norm": 0.3948986077066301, + "learning_rate": 0.0001565765207128805, + "loss": 0.7756, + "step": 206 + }, + { + "epoch": 0.3312, + "grad_norm": 0.4454833183647007, + "learning_rate": 0.0001561482950927029, + "loss": 0.6966, + "step": 207 + }, + { + "epoch": 0.3328, + "grad_norm": 0.44071084413902345, + "learning_rate": 0.00015571856046954285, + "loss": 0.7498, + "step": 208 + }, + { + "epoch": 0.3344, + "grad_norm": 0.46124858780311434, + "learning_rate": 0.00015528732839265272, + "loss": 0.7398, + "step": 209 + }, + { + "epoch": 0.336, + "grad_norm": 0.5221312959740511, + "learning_rate": 0.0001548546104515294, + "loss": 0.8639, + "step": 210 + }, + { + "epoch": 0.3376, + "grad_norm": 0.527156104993205, + "learning_rate": 0.00015442041827560274, + "loss": 0.8172, + "step": 211 + }, + { + "epoch": 0.3392, + "grad_norm": 0.39619485323325615, + "learning_rate": 0.00015398476353392323, + "loss": 0.7167, + "step": 212 + }, + { + "epoch": 0.3408, + "grad_norm": 0.44701220142882203, + "learning_rate": 0.00015354765793484834, + "loss": 0.7817, + "step": 213 + }, + { + "epoch": 0.3424, + "grad_norm": 0.4272688101955897, + "learning_rate": 0.00015310911322572753, + "loss": 0.757, + "step": 214 + }, + { + "epoch": 0.344, + "grad_norm": 0.47077908804199275, + "learning_rate": 0.000152669141192587, + "loss": 0.723, + "step": 215 + }, + { + "epoch": 0.3456, + "grad_norm": 0.5195474745442558, + "learning_rate": 0.00015222775365981273, + "loss": 0.8476, + "step": 216 + }, + { + "epoch": 0.3472, + "grad_norm": 0.4407026058813082, + "learning_rate": 0.00015178496248983254, + "loss": 0.7508, + "step": 217 + }, + { + "epoch": 0.3488, + "grad_norm": 0.5789322923328071, + "learning_rate": 0.00015134077958279765, + "loss": 0.8146, + "step": 218 + }, + { + "epoch": 0.3504, + "grad_norm": 0.5501834951591267, + "learning_rate": 0.00015089521687626243, + "loss": 0.8141, + "step": 219 + }, + { + "epoch": 0.352, + "grad_norm": 0.5671286578471947, + "learning_rate": 0.000150448286344864, + "loss": 0.8022, + "step": 220 + }, + { + "epoch": 0.3536, + "grad_norm": 0.45955116950959923, + "learning_rate": 0.00015000000000000001, + "loss": 0.8235, + "step": 221 + }, + { + "epoch": 0.3552, + "grad_norm": 0.4183893271328529, + "learning_rate": 0.00014955036988950618, + "loss": 0.804, + "step": 222 + }, + { + "epoch": 0.3568, + "grad_norm": 0.4018444295412617, + "learning_rate": 0.00014909940809733222, + "loss": 0.7728, + "step": 223 + }, + { + "epoch": 0.3584, + "grad_norm": 0.3855479422960728, + "learning_rate": 0.00014864712674321734, + "loss": 0.7062, + "step": 224 + }, + { + "epoch": 0.36, + "grad_norm": 0.44530270924587134, + "learning_rate": 0.00014819353798236427, + "loss": 0.8019, + "step": 225 + }, + { + "epoch": 0.3616, + "grad_norm": 0.4274637217340129, + "learning_rate": 0.00014773865400511272, + "loss": 0.7553, + "step": 226 + }, + { + "epoch": 0.3632, + "grad_norm": 0.5473912939885323, + "learning_rate": 0.00014728248703661182, + "loss": 0.8066, + "step": 227 + }, + { + "epoch": 0.3648, + "grad_norm": 0.48002498604463145, + "learning_rate": 0.00014682504933649144, + "loss": 0.7501, + "step": 228 + }, + { + "epoch": 0.3664, + "grad_norm": 0.49341673598237357, + "learning_rate": 0.00014636635319853275, + "loss": 0.8, + "step": 229 + }, + { + "epoch": 0.368, + "grad_norm": 0.47316118480038, + "learning_rate": 0.00014590641095033787, + "loss": 0.7333, + "step": 230 + }, + { + "epoch": 0.3696, + "grad_norm": 0.36381074635090893, + "learning_rate": 0.00014544523495299842, + "loss": 0.7522, + "step": 231 + }, + { + "epoch": 0.3712, + "grad_norm": 0.4578934599507819, + "learning_rate": 0.0001449828376007636, + "loss": 0.7607, + "step": 232 + }, + { + "epoch": 0.3728, + "grad_norm": 0.4275844193572388, + "learning_rate": 0.0001445192313207067, + "loss": 0.7397, + "step": 233 + }, + { + "epoch": 0.3744, + "grad_norm": 0.547941164326141, + "learning_rate": 0.0001440544285723915, + "loss": 0.8038, + "step": 234 + }, + { + "epoch": 0.376, + "grad_norm": 0.4294673298613944, + "learning_rate": 0.00014358844184753712, + "loss": 0.7054, + "step": 235 + }, + { + "epoch": 0.3776, + "grad_norm": 0.4206285330078151, + "learning_rate": 0.00014312128366968243, + "loss": 0.7842, + "step": 236 + }, + { + "epoch": 0.3792, + "grad_norm": 0.3973909966495677, + "learning_rate": 0.00014265296659384956, + "loss": 0.7248, + "step": 237 + }, + { + "epoch": 0.3808, + "grad_norm": 0.445338802308087, + "learning_rate": 0.00014218350320620624, + "loss": 0.6854, + "step": 238 + }, + { + "epoch": 0.3824, + "grad_norm": 0.43967054769693525, + "learning_rate": 0.0001417129061237278, + "loss": 0.7839, + "step": 239 + }, + { + "epoch": 0.384, + "grad_norm": 0.5125341807543686, + "learning_rate": 0.00014124118799385796, + "loss": 0.8355, + "step": 240 + }, + { + "epoch": 0.3856, + "grad_norm": 0.48474309555396194, + "learning_rate": 0.00014076836149416887, + "loss": 0.8598, + "step": 241 + }, + { + "epoch": 0.3872, + "grad_norm": 0.4664265460912641, + "learning_rate": 0.0001402944393320206, + "loss": 0.7469, + "step": 242 + }, + { + "epoch": 0.3888, + "grad_norm": 0.4662157993678711, + "learning_rate": 0.00013981943424421932, + "loss": 0.7687, + "step": 243 + }, + { + "epoch": 0.3904, + "grad_norm": 0.4957693519995061, + "learning_rate": 0.00013934335899667527, + "loss": 0.8233, + "step": 244 + }, + { + "epoch": 0.392, + "grad_norm": 0.43547855134293173, + "learning_rate": 0.00013886622638405952, + "loss": 0.7524, + "step": 245 + }, + { + "epoch": 0.3936, + "grad_norm": 0.39986545231882026, + "learning_rate": 0.00013838804922946027, + "loss": 0.7214, + "step": 246 + }, + { + "epoch": 0.3952, + "grad_norm": 0.43736721625590835, + "learning_rate": 0.00013790884038403795, + "loss": 0.7195, + "step": 247 + }, + { + "epoch": 0.3968, + "grad_norm": 0.41678788946271195, + "learning_rate": 0.00013742861272668012, + "loss": 0.7049, + "step": 248 + }, + { + "epoch": 0.3984, + "grad_norm": 0.5097479119844933, + "learning_rate": 0.00013694737916365517, + "loss": 0.8609, + "step": 249 + }, + { + "epoch": 0.4, + "grad_norm": 0.5219257677032542, + "learning_rate": 0.00013646515262826552, + "loss": 0.8485, + "step": 250 + }, + { + "epoch": 0.4016, + "grad_norm": 0.5731982429069994, + "learning_rate": 0.0001359819460805001, + "loss": 0.8518, + "step": 251 + }, + { + "epoch": 0.4032, + "grad_norm": 0.4670563574293175, + "learning_rate": 0.0001354977725066859, + "loss": 0.7679, + "step": 252 + }, + { + "epoch": 0.4048, + "grad_norm": 0.4618462877156586, + "learning_rate": 0.00013501264491913906, + "loss": 0.8021, + "step": 253 + }, + { + "epoch": 0.4064, + "grad_norm": 0.43635999776213674, + "learning_rate": 0.0001345265763558152, + "loss": 0.8576, + "step": 254 + }, + { + "epoch": 0.408, + "grad_norm": 0.42260696560466565, + "learning_rate": 0.00013403957987995882, + "loss": 0.7485, + "step": 255 + }, + { + "epoch": 0.4096, + "grad_norm": 0.4644266252543304, + "learning_rate": 0.0001335516685797525, + "loss": 0.8067, + "step": 256 + }, + { + "epoch": 0.4112, + "grad_norm": 0.48179710380261775, + "learning_rate": 0.00013306285556796495, + "loss": 0.8687, + "step": 257 + }, + { + "epoch": 0.4128, + "grad_norm": 0.41539921332562196, + "learning_rate": 0.00013257315398159864, + "loss": 0.6865, + "step": 258 + }, + { + "epoch": 0.4144, + "grad_norm": 0.40719931133459253, + "learning_rate": 0.00013208257698153677, + "loss": 0.7393, + "step": 259 + }, + { + "epoch": 0.416, + "grad_norm": 0.5014877269699531, + "learning_rate": 0.00013159113775218964, + "loss": 0.7463, + "step": 260 + }, + { + "epoch": 0.4176, + "grad_norm": 0.4364647270317339, + "learning_rate": 0.00013109884950114007, + "loss": 0.785, + "step": 261 + }, + { + "epoch": 0.4192, + "grad_norm": 0.44651786022049156, + "learning_rate": 0.00013060572545878875, + "loss": 0.743, + "step": 262 + }, + { + "epoch": 0.4208, + "grad_norm": 0.4489730804175122, + "learning_rate": 0.00013011177887799845, + "loss": 0.7673, + "step": 263 + }, + { + "epoch": 0.4224, + "grad_norm": 0.456792277856867, + "learning_rate": 0.00012961702303373795, + "loss": 0.7405, + "step": 264 + }, + { + "epoch": 0.424, + "grad_norm": 0.4137202400333378, + "learning_rate": 0.00012912147122272523, + "loss": 0.7288, + "step": 265 + }, + { + "epoch": 0.4256, + "grad_norm": 0.41595808890975944, + "learning_rate": 0.00012862513676307008, + "loss": 0.7253, + "step": 266 + }, + { + "epoch": 0.4272, + "grad_norm": 0.4078746657180001, + "learning_rate": 0.00012812803299391628, + "loss": 0.7226, + "step": 267 + }, + { + "epoch": 0.4288, + "grad_norm": 0.48736207830058065, + "learning_rate": 0.00012763017327508305, + "loss": 0.7571, + "step": 268 + }, + { + "epoch": 0.4304, + "grad_norm": 0.4647000749119668, + "learning_rate": 0.0001271315709867059, + "loss": 0.7615, + "step": 269 + }, + { + "epoch": 0.432, + "grad_norm": 0.4105994274631069, + "learning_rate": 0.00012663223952887723, + "loss": 0.8328, + "step": 270 + }, + { + "epoch": 0.4336, + "grad_norm": 0.40839766721449233, + "learning_rate": 0.00012613219232128608, + "loss": 0.6947, + "step": 271 + }, + { + "epoch": 0.4352, + "grad_norm": 0.4823030145018645, + "learning_rate": 0.00012563144280285741, + "loss": 0.8035, + "step": 272 + }, + { + "epoch": 0.4368, + "grad_norm": 0.3840383966747989, + "learning_rate": 0.00012513000443139112, + "loss": 0.7476, + "step": 273 + }, + { + "epoch": 0.4384, + "grad_norm": 0.45375744146113045, + "learning_rate": 0.00012462789068320017, + "loss": 0.7732, + "step": 274 + }, + { + "epoch": 0.44, + "grad_norm": 0.4322322503914143, + "learning_rate": 0.00012412511505274844, + "loss": 0.7601, + "step": 275 + }, + { + "epoch": 0.4416, + "grad_norm": 0.43814929438507033, + "learning_rate": 0.00012362169105228826, + "loss": 0.781, + "step": 276 + }, + { + "epoch": 0.4432, + "grad_norm": 0.5190464537109442, + "learning_rate": 0.000123117632211497, + "loss": 0.7936, + "step": 277 + }, + { + "epoch": 0.4448, + "grad_norm": 0.44274623078075137, + "learning_rate": 0.00012261295207711346, + "loss": 0.7882, + "step": 278 + }, + { + "epoch": 0.4464, + "grad_norm": 0.4488098560364818, + "learning_rate": 0.0001221076642125742, + "loss": 0.7131, + "step": 279 + }, + { + "epoch": 0.448, + "grad_norm": 0.433049679759616, + "learning_rate": 0.00012160178219764837, + "loss": 0.7035, + "step": 280 + }, + { + "epoch": 0.4496, + "grad_norm": 0.39746170986200574, + "learning_rate": 0.00012109531962807332, + "loss": 0.727, + "step": 281 + }, + { + "epoch": 0.4512, + "grad_norm": 0.390316648352027, + "learning_rate": 0.00012058829011518896, + "loss": 0.7329, + "step": 282 + }, + { + "epoch": 0.4528, + "grad_norm": 0.5461119127144377, + "learning_rate": 0.00012008070728557186, + "loss": 0.7774, + "step": 283 + }, + { + "epoch": 0.4544, + "grad_norm": 0.37948324279727014, + "learning_rate": 0.00011957258478066931, + "loss": 0.7489, + "step": 284 + }, + { + "epoch": 0.456, + "grad_norm": 0.4129850534536828, + "learning_rate": 0.00011906393625643244, + "loss": 0.7478, + "step": 285 + }, + { + "epoch": 0.4576, + "grad_norm": 0.37358709629956094, + "learning_rate": 0.00011855477538294935, + "loss": 0.7256, + "step": 286 + }, + { + "epoch": 0.4592, + "grad_norm": 0.4030533874194643, + "learning_rate": 0.00011804511584407763, + "loss": 0.7531, + "step": 287 + }, + { + "epoch": 0.4608, + "grad_norm": 0.40792951134610655, + "learning_rate": 0.00011753497133707679, + "loss": 0.7589, + "step": 288 + }, + { + "epoch": 0.4624, + "grad_norm": 0.4494647090693021, + "learning_rate": 0.00011702435557223987, + "loss": 0.7628, + "step": 289 + }, + { + "epoch": 0.464, + "grad_norm": 0.40488625838702924, + "learning_rate": 0.00011651328227252517, + "loss": 0.7828, + "step": 290 + }, + { + "epoch": 0.4656, + "grad_norm": 0.44909496919489056, + "learning_rate": 0.00011600176517318741, + "loss": 0.7729, + "step": 291 + }, + { + "epoch": 0.4672, + "grad_norm": 0.4533143891301691, + "learning_rate": 0.00011548981802140848, + "loss": 0.8096, + "step": 292 + }, + { + "epoch": 0.4688, + "grad_norm": 0.534450701226846, + "learning_rate": 0.00011497745457592816, + "loss": 0.8171, + "step": 293 + }, + { + "epoch": 0.4704, + "grad_norm": 0.4048228043574203, + "learning_rate": 0.00011446468860667421, + "loss": 0.6978, + "step": 294 + }, + { + "epoch": 0.472, + "grad_norm": 0.4250809091760721, + "learning_rate": 0.00011395153389439233, + "loss": 0.7161, + "step": 295 + }, + { + "epoch": 0.4736, + "grad_norm": 0.4493275958933995, + "learning_rate": 0.00011343800423027582, + "loss": 0.7077, + "step": 296 + }, + { + "epoch": 0.4752, + "grad_norm": 0.36313082010522685, + "learning_rate": 0.0001129241134155949, + "loss": 0.7488, + "step": 297 + }, + { + "epoch": 0.4768, + "grad_norm": 0.8067070103111783, + "learning_rate": 0.00011240987526132594, + "loss": 0.823, + "step": 298 + }, + { + "epoch": 0.4784, + "grad_norm": 0.4732468862594865, + "learning_rate": 0.00011189530358778005, + "loss": 0.795, + "step": 299 + }, + { + "epoch": 0.48, + "grad_norm": 0.4929348065341102, + "learning_rate": 0.00011138041222423177, + "loss": 0.7082, + "step": 300 + }, + { + "epoch": 0.4816, + "grad_norm": 0.44341789397499826, + "learning_rate": 0.00011086521500854745, + "loss": 0.8003, + "step": 301 + }, + { + "epoch": 0.4832, + "grad_norm": 0.4867929464096527, + "learning_rate": 0.00011034972578681338, + "loss": 0.8053, + "step": 302 + }, + { + "epoch": 0.4848, + "grad_norm": 0.4384093233663122, + "learning_rate": 0.00010983395841296348, + "loss": 0.7734, + "step": 303 + }, + { + "epoch": 0.4864, + "grad_norm": 0.5702832504318212, + "learning_rate": 0.00010931792674840718, + "loss": 0.7345, + "step": 304 + }, + { + "epoch": 0.488, + "grad_norm": 0.39132879436845286, + "learning_rate": 0.00010880164466165674, + "loss": 0.6696, + "step": 305 + }, + { + "epoch": 0.4896, + "grad_norm": 0.4934264866325654, + "learning_rate": 0.00010828512602795462, + "loss": 0.7325, + "step": 306 + }, + { + "epoch": 0.4912, + "grad_norm": 0.4603780809347021, + "learning_rate": 0.00010776838472890065, + "loss": 0.7395, + "step": 307 + }, + { + "epoch": 0.4928, + "grad_norm": 0.46640500803625956, + "learning_rate": 0.00010725143465207867, + "loss": 0.7166, + "step": 308 + }, + { + "epoch": 0.4944, + "grad_norm": 0.3909709155701661, + "learning_rate": 0.00010673428969068364, + "loss": 0.7246, + "step": 309 + }, + { + "epoch": 0.496, + "grad_norm": 0.465947545486099, + "learning_rate": 0.00010621696374314807, + "loss": 0.7073, + "step": 310 + }, + { + "epoch": 0.4976, + "grad_norm": 0.3799746763745742, + "learning_rate": 0.00010569947071276847, + "loss": 0.7366, + "step": 311 + }, + { + "epoch": 0.4992, + "grad_norm": 0.5134298034724935, + "learning_rate": 0.00010518182450733186, + "loss": 0.8144, + "step": 312 + }, + { + "epoch": 0.5008, + "grad_norm": 0.41525626685746175, + "learning_rate": 0.00010466403903874176, + "loss": 0.7505, + "step": 313 + }, + { + "epoch": 0.5024, + "grad_norm": 0.5349137667265772, + "learning_rate": 0.00010414612822264455, + "loss": 0.7829, + "step": 314 + }, + { + "epoch": 0.504, + "grad_norm": 0.411967581572483, + "learning_rate": 0.00010362810597805526, + "loss": 0.6906, + "step": 315 + }, + { + "epoch": 0.5056, + "grad_norm": 0.40928029419771667, + "learning_rate": 0.0001031099862269837, + "loss": 0.7107, + "step": 316 + }, + { + "epoch": 0.5072, + "grad_norm": 0.4230622141851907, + "learning_rate": 0.00010259178289406011, + "loss": 0.7622, + "step": 317 + }, + { + "epoch": 0.5088, + "grad_norm": 0.49519974665945765, + "learning_rate": 0.00010207350990616107, + "loss": 0.6816, + "step": 318 + }, + { + "epoch": 0.5104, + "grad_norm": 0.450165136834805, + "learning_rate": 0.0001015551811920351, + "loss": 0.7455, + "step": 319 + }, + { + "epoch": 0.512, + "grad_norm": 0.42307727725626, + "learning_rate": 0.00010103681068192845, + "loss": 0.7468, + "step": 320 + }, + { + "epoch": 0.5136, + "grad_norm": 0.4045647312806453, + "learning_rate": 0.00010051841230721065, + "loss": 0.6947, + "step": 321 + }, + { + "epoch": 0.5152, + "grad_norm": 0.4735590110789679, + "learning_rate": 0.0001, + "loss": 0.8264, + "step": 322 + }, + { + "epoch": 0.5168, + "grad_norm": 0.5044907920990858, + "learning_rate": 9.948158769278939e-05, + "loss": 0.7254, + "step": 323 + }, + { + "epoch": 0.5184, + "grad_norm": 0.35767594936525043, + "learning_rate": 9.896318931807155e-05, + "loss": 0.7072, + "step": 324 + }, + { + "epoch": 0.52, + "grad_norm": 0.42213819388136953, + "learning_rate": 9.844481880796491e-05, + "loss": 0.7865, + "step": 325 + }, + { + "epoch": 0.5216, + "grad_norm": 0.5148095210861435, + "learning_rate": 9.792649009383899e-05, + "loss": 0.8567, + "step": 326 + }, + { + "epoch": 0.5232, + "grad_norm": 0.4085537065981769, + "learning_rate": 9.740821710593989e-05, + "loss": 0.6186, + "step": 327 + }, + { + "epoch": 0.5248, + "grad_norm": 0.46419483428766023, + "learning_rate": 9.689001377301633e-05, + "loss": 0.6963, + "step": 328 + }, + { + "epoch": 0.5264, + "grad_norm": 0.4487462892431637, + "learning_rate": 9.637189402194476e-05, + "loss": 0.7467, + "step": 329 + }, + { + "epoch": 0.528, + "grad_norm": 0.36919109547608464, + "learning_rate": 9.585387177735547e-05, + "loss": 0.6777, + "step": 330 + }, + { + "epoch": 0.5296, + "grad_norm": 0.4332804643362148, + "learning_rate": 9.533596096125825e-05, + "loss": 0.8242, + "step": 331 + }, + { + "epoch": 0.5312, + "grad_norm": 0.476007591449096, + "learning_rate": 9.481817549266817e-05, + "loss": 0.6951, + "step": 332 + }, + { + "epoch": 0.5328, + "grad_norm": 0.39042388236881836, + "learning_rate": 9.430052928723153e-05, + "loss": 0.6984, + "step": 333 + }, + { + "epoch": 0.5344, + "grad_norm": 0.4568185090445949, + "learning_rate": 9.378303625685195e-05, + "loss": 0.7992, + "step": 334 + }, + { + "epoch": 0.536, + "grad_norm": 0.379082207436869, + "learning_rate": 9.326571030931637e-05, + "loss": 0.7, + "step": 335 + }, + { + "epoch": 0.5376, + "grad_norm": 0.41974138322610066, + "learning_rate": 9.274856534792138e-05, + "loss": 0.7113, + "step": 336 + }, + { + "epoch": 0.5392, + "grad_norm": 0.5327757442213293, + "learning_rate": 9.223161527109937e-05, + "loss": 0.7839, + "step": 337 + }, + { + "epoch": 0.5408, + "grad_norm": 0.46462763671394586, + "learning_rate": 9.171487397204539e-05, + "loss": 0.747, + "step": 338 + }, + { + "epoch": 0.5424, + "grad_norm": 0.4237837783253501, + "learning_rate": 9.119835533834331e-05, + "loss": 0.679, + "step": 339 + }, + { + "epoch": 0.544, + "grad_norm": 0.47716642768101986, + "learning_rate": 9.068207325159284e-05, + "loss": 0.8039, + "step": 340 + }, + { + "epoch": 0.5456, + "grad_norm": 0.3990031991734295, + "learning_rate": 9.016604158703654e-05, + "loss": 0.7181, + "step": 341 + }, + { + "epoch": 0.5472, + "grad_norm": 0.43282253577010726, + "learning_rate": 8.965027421318665e-05, + "loss": 0.7476, + "step": 342 + }, + { + "epoch": 0.5488, + "grad_norm": 0.46147540239235874, + "learning_rate": 8.913478499145254e-05, + "loss": 0.7725, + "step": 343 + }, + { + "epoch": 0.5504, + "grad_norm": 0.6306841195084987, + "learning_rate": 8.861958777576827e-05, + "loss": 0.791, + "step": 344 + }, + { + "epoch": 0.552, + "grad_norm": 0.44299697883912553, + "learning_rate": 8.810469641222001e-05, + "loss": 0.7714, + "step": 345 + }, + { + "epoch": 0.5536, + "grad_norm": 0.4742086986588113, + "learning_rate": 8.759012473867407e-05, + "loss": 0.7892, + "step": 346 + }, + { + "epoch": 0.5552, + "grad_norm": 0.40386184053331575, + "learning_rate": 8.707588658440511e-05, + "loss": 0.7201, + "step": 347 + }, + { + "epoch": 0.5568, + "grad_norm": 0.3943880217838005, + "learning_rate": 8.656199576972423e-05, + "loss": 0.7554, + "step": 348 + }, + { + "epoch": 0.5584, + "grad_norm": 0.47769776472851405, + "learning_rate": 8.604846610560771e-05, + "loss": 0.8727, + "step": 349 + }, + { + "epoch": 0.56, + "grad_norm": 0.4309249235815185, + "learning_rate": 8.553531139332582e-05, + "loss": 0.675, + "step": 350 + }, + { + "epoch": 0.5616, + "grad_norm": 0.4968939254696869, + "learning_rate": 8.502254542407186e-05, + "loss": 0.7007, + "step": 351 + }, + { + "epoch": 0.5632, + "grad_norm": 0.581901884314562, + "learning_rate": 8.451018197859153e-05, + "loss": 0.7262, + "step": 352 + }, + { + "epoch": 0.5648, + "grad_norm": 0.47410879725161503, + "learning_rate": 8.399823482681262e-05, + "loss": 0.796, + "step": 353 + }, + { + "epoch": 0.5664, + "grad_norm": 0.5017773554214965, + "learning_rate": 8.348671772747487e-05, + "loss": 0.7131, + "step": 354 + }, + { + "epoch": 0.568, + "grad_norm": 0.4827622442496861, + "learning_rate": 8.297564442776014e-05, + "loss": 0.6633, + "step": 355 + }, + { + "epoch": 0.5696, + "grad_norm": 0.47634020407804994, + "learning_rate": 8.246502866292324e-05, + "loss": 0.788, + "step": 356 + }, + { + "epoch": 0.5712, + "grad_norm": 0.38496982587382605, + "learning_rate": 8.195488415592238e-05, + "loss": 0.6744, + "step": 357 + }, + { + "epoch": 0.5728, + "grad_norm": 0.5585107304142405, + "learning_rate": 8.144522461705067e-05, + "loss": 0.8145, + "step": 358 + }, + { + "epoch": 0.5744, + "grad_norm": 0.5340891376397738, + "learning_rate": 8.093606374356759e-05, + "loss": 0.7253, + "step": 359 + }, + { + "epoch": 0.576, + "grad_norm": 0.4069800636168623, + "learning_rate": 8.042741521933071e-05, + "loss": 0.7443, + "step": 360 + }, + { + "epoch": 0.5776, + "grad_norm": 0.4322847059956011, + "learning_rate": 7.991929271442817e-05, + "loss": 0.7448, + "step": 361 + }, + { + "epoch": 0.5792, + "grad_norm": 0.4037556172005087, + "learning_rate": 7.941170988481108e-05, + "loss": 0.7598, + "step": 362 + }, + { + "epoch": 0.5808, + "grad_norm": 0.42088050249324677, + "learning_rate": 7.89046803719267e-05, + "loss": 0.7613, + "step": 363 + }, + { + "epoch": 0.5824, + "grad_norm": 0.40703844656525856, + "learning_rate": 7.839821780235168e-05, + "loss": 0.7048, + "step": 364 + }, + { + "epoch": 0.584, + "grad_norm": 0.48121410465951564, + "learning_rate": 7.789233578742582e-05, + "loss": 0.8222, + "step": 365 + }, + { + "epoch": 0.5856, + "grad_norm": 0.4596219784863138, + "learning_rate": 7.738704792288655e-05, + "loss": 0.7658, + "step": 366 + }, + { + "epoch": 0.5872, + "grad_norm": 0.38319275608875203, + "learning_rate": 7.688236778850306e-05, + "loss": 0.69, + "step": 367 + }, + { + "epoch": 0.5888, + "grad_norm": 0.38323886885120834, + "learning_rate": 7.637830894771175e-05, + "loss": 0.7025, + "step": 368 + }, + { + "epoch": 0.5904, + "grad_norm": 0.45770876533845695, + "learning_rate": 7.587488494725157e-05, + "loss": 0.7418, + "step": 369 + }, + { + "epoch": 0.592, + "grad_norm": 0.46141885369023555, + "learning_rate": 7.537210931679987e-05, + "loss": 0.7805, + "step": 370 + }, + { + "epoch": 0.5936, + "grad_norm": 0.4594229161181626, + "learning_rate": 7.48699955686089e-05, + "loss": 0.8136, + "step": 371 + }, + { + "epoch": 0.5952, + "grad_norm": 0.4688572983621563, + "learning_rate": 7.43685571971426e-05, + "loss": 0.7967, + "step": 372 + }, + { + "epoch": 0.5968, + "grad_norm": 0.4267776312967211, + "learning_rate": 7.386780767871397e-05, + "loss": 0.7141, + "step": 373 + }, + { + "epoch": 0.5984, + "grad_norm": 0.42227533245492355, + "learning_rate": 7.336776047112276e-05, + "loss": 0.755, + "step": 374 + }, + { + "epoch": 0.6, + "grad_norm": 0.45567834015494074, + "learning_rate": 7.286842901329412e-05, + "loss": 0.7297, + "step": 375 + }, + { + "epoch": 0.6016, + "grad_norm": 0.8492105997024391, + "learning_rate": 7.236982672491698e-05, + "loss": 0.7793, + "step": 376 + }, + { + "epoch": 0.6032, + "grad_norm": 0.34574383083982096, + "learning_rate": 7.187196700608373e-05, + "loss": 0.6215, + "step": 377 + }, + { + "epoch": 0.6048, + "grad_norm": 0.37546278553434914, + "learning_rate": 7.137486323692995e-05, + "loss": 0.7651, + "step": 378 + }, + { + "epoch": 0.6064, + "grad_norm": 0.43367219836477805, + "learning_rate": 7.087852877727481e-05, + "loss": 0.6847, + "step": 379 + }, + { + "epoch": 0.608, + "grad_norm": 0.3765877233648976, + "learning_rate": 7.038297696626206e-05, + "loss": 0.6522, + "step": 380 + }, + { + "epoch": 0.6096, + "grad_norm": 0.37197353852559156, + "learning_rate": 6.988822112200156e-05, + "loss": 0.6724, + "step": 381 + }, + { + "epoch": 0.6112, + "grad_norm": 0.5893490383226295, + "learning_rate": 6.939427454121128e-05, + "loss": 0.9073, + "step": 382 + }, + { + "epoch": 0.6128, + "grad_norm": 0.4147043351321972, + "learning_rate": 6.890115049885994e-05, + "loss": 0.7421, + "step": 383 + }, + { + "epoch": 0.6144, + "grad_norm": 0.43929000502257476, + "learning_rate": 6.84088622478104e-05, + "loss": 0.6555, + "step": 384 + }, + { + "epoch": 0.616, + "grad_norm": 0.5101682678182368, + "learning_rate": 6.791742301846326e-05, + "loss": 0.8062, + "step": 385 + }, + { + "epoch": 0.6176, + "grad_norm": 0.36406036685719256, + "learning_rate": 6.742684601840141e-05, + "loss": 0.7238, + "step": 386 + }, + { + "epoch": 0.6192, + "grad_norm": 0.39799113361555244, + "learning_rate": 6.693714443203507e-05, + "loss": 0.7179, + "step": 387 + }, + { + "epoch": 0.6208, + "grad_norm": 0.39315786057172425, + "learning_rate": 6.644833142024751e-05, + "loss": 0.7108, + "step": 388 + }, + { + "epoch": 0.6224, + "grad_norm": 0.43881095481650995, + "learning_rate": 6.59604201200412e-05, + "loss": 0.7995, + "step": 389 + }, + { + "epoch": 0.624, + "grad_norm": 0.41286048761294303, + "learning_rate": 6.547342364418481e-05, + "loss": 0.8183, + "step": 390 + }, + { + "epoch": 0.6256, + "grad_norm": 0.38009599010775147, + "learning_rate": 6.498735508086093e-05, + "loss": 0.7429, + "step": 391 + }, + { + "epoch": 0.6272, + "grad_norm": 0.37823264328365747, + "learning_rate": 6.450222749331414e-05, + "loss": 0.7629, + "step": 392 + }, + { + "epoch": 0.6288, + "grad_norm": 0.4229441576338051, + "learning_rate": 6.40180539194999e-05, + "loss": 0.7849, + "step": 393 + }, + { + "epoch": 0.6304, + "grad_norm": 0.4836979525548324, + "learning_rate": 6.35348473717345e-05, + "loss": 0.8376, + "step": 394 + }, + { + "epoch": 0.632, + "grad_norm": 0.5008828679160982, + "learning_rate": 6.305262083634488e-05, + "loss": 0.8903, + "step": 395 + }, + { + "epoch": 0.6336, + "grad_norm": 0.42445687492843887, + "learning_rate": 6.25713872733199e-05, + "loss": 0.7238, + "step": 396 + }, + { + "epoch": 0.6352, + "grad_norm": 0.3806642176576031, + "learning_rate": 6.209115961596208e-05, + "loss": 0.7443, + "step": 397 + }, + { + "epoch": 0.6368, + "grad_norm": 0.4138456130919022, + "learning_rate": 6.161195077053976e-05, + "loss": 0.7149, + "step": 398 + }, + { + "epoch": 0.6384, + "grad_norm": 0.3815781184842867, + "learning_rate": 6.113377361594049e-05, + "loss": 0.6988, + "step": 399 + }, + { + "epoch": 0.64, + "grad_norm": 0.38969147637983625, + "learning_rate": 6.065664100332478e-05, + "loss": 0.6967, + "step": 400 + }, + { + "epoch": 0.6416, + "grad_norm": 0.4092585102597334, + "learning_rate": 6.018056575578075e-05, + "loss": 0.7169, + "step": 401 + }, + { + "epoch": 0.6432, + "grad_norm": 0.4432734726452317, + "learning_rate": 5.970556066797941e-05, + "loss": 0.704, + "step": 402 + }, + { + "epoch": 0.6448, + "grad_norm": 0.39276989933160955, + "learning_rate": 5.923163850583113e-05, + "loss": 0.6722, + "step": 403 + }, + { + "epoch": 0.6464, + "grad_norm": 0.47489068681029745, + "learning_rate": 5.875881200614207e-05, + "loss": 0.7481, + "step": 404 + }, + { + "epoch": 0.648, + "grad_norm": 0.4298128905708399, + "learning_rate": 5.828709387627218e-05, + "loss": 0.6799, + "step": 405 + }, + { + "epoch": 0.6496, + "grad_norm": 0.5096331770598059, + "learning_rate": 5.781649679379378e-05, + "loss": 0.7565, + "step": 406 + }, + { + "epoch": 0.6512, + "grad_norm": 0.41638459245578724, + "learning_rate": 5.73470334061505e-05, + "loss": 0.7637, + "step": 407 + }, + { + "epoch": 0.6528, + "grad_norm": 0.5261378797450693, + "learning_rate": 5.687871633031754e-05, + "loss": 0.7515, + "step": 408 + }, + { + "epoch": 0.6544, + "grad_norm": 0.3974344319305451, + "learning_rate": 5.6411558152462894e-05, + "loss": 0.6764, + "step": 409 + }, + { + "epoch": 0.656, + "grad_norm": 0.3977459350094927, + "learning_rate": 5.5945571427608526e-05, + "loss": 0.7321, + "step": 410 + }, + { + "epoch": 0.6576, + "grad_norm": 0.4677168900264908, + "learning_rate": 5.54807686792933e-05, + "loss": 0.7825, + "step": 411 + }, + { + "epoch": 0.6592, + "grad_norm": 0.4408638164696113, + "learning_rate": 5.501716239923642e-05, + "loss": 0.6959, + "step": 412 + }, + { + "epoch": 0.6608, + "grad_norm": 0.49118497531075295, + "learning_rate": 5.4554765047001613e-05, + "loss": 0.7222, + "step": 413 + }, + { + "epoch": 0.6624, + "grad_norm": 0.43683565263329055, + "learning_rate": 5.4093589049662175e-05, + "loss": 0.7531, + "step": 414 + }, + { + "epoch": 0.664, + "grad_norm": 0.4367885778809987, + "learning_rate": 5.363364680146725e-05, + "loss": 0.7301, + "step": 415 + }, + { + "epoch": 0.6656, + "grad_norm": 0.39642825524746744, + "learning_rate": 5.31749506635086e-05, + "loss": 0.7276, + "step": 416 + }, + { + "epoch": 0.6672, + "grad_norm": 0.4015203362935855, + "learning_rate": 5.271751296338823e-05, + "loss": 0.7293, + "step": 417 + }, + { + "epoch": 0.6688, + "grad_norm": 0.44012206651588054, + "learning_rate": 5.226134599488728e-05, + "loss": 0.6704, + "step": 418 + }, + { + "epoch": 0.6704, + "grad_norm": 0.4528152261382792, + "learning_rate": 5.180646201763577e-05, + "loss": 0.7571, + "step": 419 + }, + { + "epoch": 0.672, + "grad_norm": 0.4232884321834926, + "learning_rate": 5.135287325678271e-05, + "loss": 0.6692, + "step": 420 + }, + { + "epoch": 0.6736, + "grad_norm": 0.4589892785877267, + "learning_rate": 5.090059190266779e-05, + "loss": 0.7233, + "step": 421 + }, + { + "epoch": 0.6752, + "grad_norm": 0.4121692638871751, + "learning_rate": 5.0449630110493836e-05, + "loss": 0.6903, + "step": 422 + }, + { + "epoch": 0.6768, + "grad_norm": 0.4974120424904181, + "learning_rate": 5.000000000000002e-05, + "loss": 0.8023, + "step": 423 + }, + { + "epoch": 0.6784, + "grad_norm": 0.35332028323560105, + "learning_rate": 4.955171365513603e-05, + "loss": 0.6874, + "step": 424 + }, + { + "epoch": 0.68, + "grad_norm": 0.5184079608375904, + "learning_rate": 4.9104783123737566e-05, + "loss": 0.69, + "step": 425 + }, + { + "epoch": 0.6816, + "grad_norm": 0.5010633649261649, + "learning_rate": 4.865922041720239e-05, + "loss": 0.7363, + "step": 426 + }, + { + "epoch": 0.6832, + "grad_norm": 0.4882482668086425, + "learning_rate": 4.821503751016746e-05, + "loss": 0.7833, + "step": 427 + }, + { + "epoch": 0.6848, + "grad_norm": 0.39104905325649925, + "learning_rate": 4.777224634018732e-05, + "loss": 0.7307, + "step": 428 + }, + { + "epoch": 0.6864, + "grad_norm": 0.530495265099178, + "learning_rate": 4.733085880741301e-05, + "loss": 0.7411, + "step": 429 + }, + { + "epoch": 0.688, + "grad_norm": 0.4895216806828033, + "learning_rate": 4.689088677427249e-05, + "loss": 0.7393, + "step": 430 + }, + { + "epoch": 0.6896, + "grad_norm": 0.4187151068051059, + "learning_rate": 4.645234206515171e-05, + "loss": 0.742, + "step": 431 + }, + { + "epoch": 0.6912, + "grad_norm": 0.5769953743315126, + "learning_rate": 4.6015236466076747e-05, + "loss": 0.806, + "step": 432 + }, + { + "epoch": 0.6928, + "grad_norm": 0.37923197810456, + "learning_rate": 4.5579581724397255e-05, + "loss": 0.7264, + "step": 433 + }, + { + "epoch": 0.6944, + "grad_norm": 0.4117013354698191, + "learning_rate": 4.514538954847064e-05, + "loss": 0.6537, + "step": 434 + }, + { + "epoch": 0.696, + "grad_norm": 0.4866274176306346, + "learning_rate": 4.471267160734731e-05, + "loss": 0.7663, + "step": 435 + }, + { + "epoch": 0.6976, + "grad_norm": 0.37932179125869436, + "learning_rate": 4.428143953045717e-05, + "loss": 0.6625, + "step": 436 + }, + { + "epoch": 0.6992, + "grad_norm": 0.6437435262608248, + "learning_rate": 4.385170490729712e-05, + "loss": 0.8709, + "step": 437 + }, + { + "epoch": 0.7008, + "grad_norm": 0.4226429641505177, + "learning_rate": 4.342347928711953e-05, + "loss": 0.6482, + "step": 438 + }, + { + "epoch": 0.7024, + "grad_norm": 0.4129173694423031, + "learning_rate": 4.2996774178621736e-05, + "loss": 0.8046, + "step": 439 + }, + { + "epoch": 0.704, + "grad_norm": 0.4402767096651791, + "learning_rate": 4.257160104963696e-05, + "loss": 0.7119, + "step": 440 + }, + { + "epoch": 0.7056, + "grad_norm": 0.3933043200442359, + "learning_rate": 4.2147971326825966e-05, + "loss": 0.7576, + "step": 441 + }, + { + "epoch": 0.7072, + "grad_norm": 0.39368578737069154, + "learning_rate": 4.172589639536991e-05, + "loss": 0.6905, + "step": 442 + }, + { + "epoch": 0.7088, + "grad_norm": 0.4252957814217015, + "learning_rate": 4.130538759866457e-05, + "loss": 0.6766, + "step": 443 + }, + { + "epoch": 0.7104, + "grad_norm": 0.5245243270238202, + "learning_rate": 4.088645623801534e-05, + "loss": 0.7349, + "step": 444 + }, + { + "epoch": 0.712, + "grad_norm": 0.4445948767256047, + "learning_rate": 4.046911357233343e-05, + "loss": 0.7471, + "step": 445 + }, + { + "epoch": 0.7136, + "grad_norm": 0.41184138635514905, + "learning_rate": 4.00533708178334e-05, + "loss": 0.6516, + "step": 446 + }, + { + "epoch": 0.7152, + "grad_norm": 0.3658960420810612, + "learning_rate": 3.963923914773187e-05, + "loss": 0.7316, + "step": 447 + }, + { + "epoch": 0.7168, + "grad_norm": 0.4431033651023181, + "learning_rate": 3.922672969194686e-05, + "loss": 0.6836, + "step": 448 + }, + { + "epoch": 0.7184, + "grad_norm": 0.36265698022749926, + "learning_rate": 3.8815853536798904e-05, + "loss": 0.6612, + "step": 449 + }, + { + "epoch": 0.72, + "grad_norm": 0.4611539094672187, + "learning_rate": 3.840662172471315e-05, + "loss": 0.7611, + "step": 450 + }, + { + "epoch": 0.7216, + "grad_norm": 0.35399277493822134, + "learning_rate": 3.79990452539225e-05, + "loss": 0.6796, + "step": 451 + }, + { + "epoch": 0.7232, + "grad_norm": 0.39985353378296284, + "learning_rate": 3.759313507817196e-05, + "loss": 0.685, + "step": 452 + }, + { + "epoch": 0.7248, + "grad_norm": 0.4091364829563213, + "learning_rate": 3.7188902106424416e-05, + "loss": 0.765, + "step": 453 + }, + { + "epoch": 0.7264, + "grad_norm": 0.4004115744612513, + "learning_rate": 3.678635720256737e-05, + "loss": 0.7099, + "step": 454 + }, + { + "epoch": 0.728, + "grad_norm": 0.36773690341408966, + "learning_rate": 3.638551118512089e-05, + "loss": 0.6552, + "step": 455 + }, + { + "epoch": 0.7296, + "grad_norm": 0.456949920489654, + "learning_rate": 3.5986374826947066e-05, + "loss": 0.8078, + "step": 456 + }, + { + "epoch": 0.7312, + "grad_norm": 0.42208864825015796, + "learning_rate": 3.558895885496023e-05, + "loss": 0.731, + "step": 457 + }, + { + "epoch": 0.7328, + "grad_norm": 0.43779177036677636, + "learning_rate": 3.519327394983888e-05, + "loss": 0.6511, + "step": 458 + }, + { + "epoch": 0.7344, + "grad_norm": 0.40841748301998027, + "learning_rate": 3.479933074573858e-05, + "loss": 0.7144, + "step": 459 + }, + { + "epoch": 0.736, + "grad_norm": 0.42406716442802694, + "learning_rate": 3.440713983000601e-05, + "loss": 0.7453, + "step": 460 + }, + { + "epoch": 0.7376, + "grad_norm": 0.4280874157412337, + "learning_rate": 3.401671174289469e-05, + "loss": 0.7338, + "step": 461 + }, + { + "epoch": 0.7392, + "grad_norm": 0.4119513245583709, + "learning_rate": 3.362805697728145e-05, + "loss": 0.7045, + "step": 462 + }, + { + "epoch": 0.7408, + "grad_norm": 0.45724210670950644, + "learning_rate": 3.324118597838464e-05, + "loss": 0.7179, + "step": 463 + }, + { + "epoch": 0.7424, + "grad_norm": 0.405964538237769, + "learning_rate": 3.285610914348332e-05, + "loss": 0.6856, + "step": 464 + }, + { + "epoch": 0.744, + "grad_norm": 0.4527419433584937, + "learning_rate": 3.2472836821637744e-05, + "loss": 0.7454, + "step": 465 + }, + { + "epoch": 0.7456, + "grad_norm": 0.4802116690936312, + "learning_rate": 3.209137931341143e-05, + "loss": 0.721, + "step": 466 + }, + { + "epoch": 0.7472, + "grad_norm": 0.39418840743948563, + "learning_rate": 3.1711746870594086e-05, + "loss": 0.6637, + "step": 467 + }, + { + "epoch": 0.7488, + "grad_norm": 0.44283330912589575, + "learning_rate": 3.1333949695926324e-05, + "loss": 0.733, + "step": 468 + }, + { + "epoch": 0.7504, + "grad_norm": 0.576205073795686, + "learning_rate": 3.0957997942825336e-05, + "loss": 0.7945, + "step": 469 + }, + { + "epoch": 0.752, + "grad_norm": 0.5340403767407518, + "learning_rate": 3.058390171511196e-05, + "loss": 0.6905, + "step": 470 + }, + { + "epoch": 0.7536, + "grad_norm": 0.45573412321129403, + "learning_rate": 3.021167106673928e-05, + "loss": 0.7426, + "step": 471 + }, + { + "epoch": 0.7552, + "grad_norm": 0.4619023384747589, + "learning_rate": 2.9841316001522347e-05, + "loss": 0.7297, + "step": 472 + }, + { + "epoch": 0.7568, + "grad_norm": 0.4167715788679408, + "learning_rate": 2.9472846472869298e-05, + "loss": 0.666, + "step": 473 + }, + { + "epoch": 0.7584, + "grad_norm": 0.5297008170441028, + "learning_rate": 2.9106272383513835e-05, + "loss": 0.6242, + "step": 474 + }, + { + "epoch": 0.76, + "grad_norm": 0.43398173481967217, + "learning_rate": 2.874160358524931e-05, + "loss": 0.7218, + "step": 475 + }, + { + "epoch": 0.7616, + "grad_norm": 0.464824555470651, + "learning_rate": 2.8378849878663628e-05, + "loss": 0.6911, + "step": 476 + }, + { + "epoch": 0.7632, + "grad_norm": 0.4210661005022877, + "learning_rate": 2.8018021012875994e-05, + "loss": 0.6994, + "step": 477 + }, + { + "epoch": 0.7648, + "grad_norm": 0.48277212401999947, + "learning_rate": 2.7659126685275027e-05, + "loss": 0.6729, + "step": 478 + }, + { + "epoch": 0.7664, + "grad_norm": 0.40948887298085435, + "learning_rate": 2.7302176541257986e-05, + "loss": 0.715, + "step": 479 + }, + { + "epoch": 0.768, + "grad_norm": 0.42391394456952675, + "learning_rate": 2.6947180173971508e-05, + "loss": 0.6595, + "step": 480 + }, + { + "epoch": 0.7696, + "grad_norm": 0.4237343785261066, + "learning_rate": 2.659414712405398e-05, + "loss": 0.6642, + "step": 481 + }, + { + "epoch": 0.7712, + "grad_norm": 0.5415525593262541, + "learning_rate": 2.6243086879379e-05, + "loss": 0.7566, + "step": 482 + }, + { + "epoch": 0.7728, + "grad_norm": 0.38128734164176914, + "learning_rate": 2.5894008874800325e-05, + "loss": 0.6635, + "step": 483 + }, + { + "epoch": 0.7744, + "grad_norm": 0.42449619629861135, + "learning_rate": 2.5546922491898495e-05, + "loss": 0.7446, + "step": 484 + }, + { + "epoch": 0.776, + "grad_norm": 0.41246628676655095, + "learning_rate": 2.5201837058728505e-05, + "loss": 0.7489, + "step": 485 + }, + { + "epoch": 0.7776, + "grad_norm": 0.37092692640773334, + "learning_rate": 2.485876184956928e-05, + "loss": 0.6306, + "step": 486 + }, + { + "epoch": 0.7792, + "grad_norm": 0.3501301527113845, + "learning_rate": 2.451770608467432e-05, + "loss": 0.703, + "step": 487 + }, + { + "epoch": 0.7808, + "grad_norm": 0.45176747297015113, + "learning_rate": 2.417867893002387e-05, + "loss": 0.6592, + "step": 488 + }, + { + "epoch": 0.7824, + "grad_norm": 0.45908283704016833, + "learning_rate": 2.3841689497078746e-05, + "loss": 0.7617, + "step": 489 + }, + { + "epoch": 0.784, + "grad_norm": 0.39489988126749265, + "learning_rate": 2.3506746842535242e-05, + "loss": 0.7538, + "step": 490 + }, + { + "epoch": 0.7856, + "grad_norm": 0.5291265836161576, + "learning_rate": 2.3173859968081944e-05, + "loss": 0.7862, + "step": 491 + }, + { + "epoch": 0.7872, + "grad_norm": 0.34729379078982947, + "learning_rate": 2.2843037820157675e-05, + "loss": 0.6901, + "step": 492 + }, + { + "epoch": 0.7888, + "grad_norm": 0.38199809974385573, + "learning_rate": 2.251428928971102e-05, + "loss": 0.6841, + "step": 493 + }, + { + "epoch": 0.7904, + "grad_norm": 0.38819616099637727, + "learning_rate": 2.2187623211961562e-05, + "loss": 0.6942, + "step": 494 + }, + { + "epoch": 0.792, + "grad_norm": 0.43991208587073005, + "learning_rate": 2.1863048366162208e-05, + "loss": 0.7332, + "step": 495 + }, + { + "epoch": 0.7936, + "grad_norm": 0.39666762145102386, + "learning_rate": 2.1540573475363402e-05, + "loss": 0.6615, + "step": 496 + }, + { + "epoch": 0.7952, + "grad_norm": 0.4363503316776636, + "learning_rate": 2.1220207206178688e-05, + "loss": 0.6502, + "step": 497 + }, + { + "epoch": 0.7968, + "grad_norm": 0.42029988863876855, + "learning_rate": 2.0901958168551638e-05, + "loss": 0.7245, + "step": 498 + }, + { + "epoch": 0.7984, + "grad_norm": 0.4407884192788708, + "learning_rate": 2.058583491552465e-05, + "loss": 0.7472, + "step": 499 + }, + { + "epoch": 0.8, + "grad_norm": 0.4435916356849103, + "learning_rate": 2.027184594300898e-05, + "loss": 0.6827, + "step": 500 + }, + { + "epoch": 0.8016, + "grad_norm": 0.5220657409120679, + "learning_rate": 1.995999968955641e-05, + "loss": 0.8304, + "step": 501 + }, + { + "epoch": 0.8032, + "grad_norm": 0.38719546300280033, + "learning_rate": 1.9650304536132426e-05, + "loss": 0.7387, + "step": 502 + }, + { + "epoch": 0.8048, + "grad_norm": 0.4630988967754445, + "learning_rate": 1.9342768805891178e-05, + "loss": 0.7507, + "step": 503 + }, + { + "epoch": 0.8064, + "grad_norm": 0.4814085621451986, + "learning_rate": 1.903740076395151e-05, + "loss": 0.8103, + "step": 504 + }, + { + "epoch": 0.808, + "grad_norm": 0.516539394331081, + "learning_rate": 1.8734208617174988e-05, + "loss": 0.7419, + "step": 505 + }, + { + "epoch": 0.8096, + "grad_norm": 0.49268882102769035, + "learning_rate": 1.8433200513945337e-05, + "loss": 0.7238, + "step": 506 + }, + { + "epoch": 0.8112, + "grad_norm": 0.36707797580532886, + "learning_rate": 1.8134384543949478e-05, + "loss": 0.6922, + "step": 507 + }, + { + "epoch": 0.8128, + "grad_norm": 0.46391162167829475, + "learning_rate": 1.783776873795994e-05, + "loss": 0.6503, + "step": 508 + }, + { + "epoch": 0.8144, + "grad_norm": 0.39943678330719323, + "learning_rate": 1.754336106761927e-05, + "loss": 0.6815, + "step": 509 + }, + { + "epoch": 0.816, + "grad_norm": 0.41533716224726513, + "learning_rate": 1.7251169445225657e-05, + "loss": 0.7624, + "step": 510 + }, + { + "epoch": 0.8176, + "grad_norm": 0.45240632629368666, + "learning_rate": 1.696120172352025e-05, + "loss": 0.7307, + "step": 511 + }, + { + "epoch": 0.8192, + "grad_norm": 0.38785608034491204, + "learning_rate": 1.6673465695476232e-05, + "loss": 0.6346, + "step": 512 + }, + { + "epoch": 0.8208, + "grad_norm": 0.48034518746902743, + "learning_rate": 1.6387969094089316e-05, + "loss": 0.6384, + "step": 513 + }, + { + "epoch": 0.8224, + "grad_norm": 0.41332667504283876, + "learning_rate": 1.6104719592169902e-05, + "loss": 0.6829, + "step": 514 + }, + { + "epoch": 0.824, + "grad_norm": 0.4337494393162549, + "learning_rate": 1.5823724802136865e-05, + "loss": 0.6848, + "step": 515 + }, + { + "epoch": 0.8256, + "grad_norm": 0.40607918521508257, + "learning_rate": 1.5544992275813053e-05, + "loss": 0.6542, + "step": 516 + }, + { + "epoch": 0.8272, + "grad_norm": 0.35771532616714113, + "learning_rate": 1.526852950422226e-05, + "loss": 0.6839, + "step": 517 + }, + { + "epoch": 0.8288, + "grad_norm": 0.5254334891702237, + "learning_rate": 1.4994343917387854e-05, + "loss": 0.7895, + "step": 518 + }, + { + "epoch": 0.8304, + "grad_norm": 0.5118367019902981, + "learning_rate": 1.4722442884133214e-05, + "loss": 0.7985, + "step": 519 + }, + { + "epoch": 0.832, + "grad_norm": 0.5483547105729336, + "learning_rate": 1.4452833711883628e-05, + "loss": 0.6695, + "step": 520 + }, + { + "epoch": 0.8336, + "grad_norm": 0.435548249274326, + "learning_rate": 1.4185523646469822e-05, + "loss": 0.655, + "step": 521 + }, + { + "epoch": 0.8352, + "grad_norm": 0.4599163753660314, + "learning_rate": 1.3920519871933424e-05, + "loss": 0.7608, + "step": 522 + }, + { + "epoch": 0.8368, + "grad_norm": 0.47529799400955497, + "learning_rate": 1.3657829510333654e-05, + "loss": 0.6728, + "step": 523 + }, + { + "epoch": 0.8384, + "grad_norm": 0.4475815935368909, + "learning_rate": 1.339745962155613e-05, + "loss": 0.7271, + "step": 524 + }, + { + "epoch": 0.84, + "grad_norm": 0.40024151914975997, + "learning_rate": 1.3139417203123027e-05, + "loss": 0.6488, + "step": 525 + }, + { + "epoch": 0.8416, + "grad_norm": 0.4049672840482492, + "learning_rate": 1.2883709190004955e-05, + "loss": 0.6829, + "step": 526 + }, + { + "epoch": 0.8432, + "grad_norm": 0.485994698610914, + "learning_rate": 1.263034245443473e-05, + "loss": 0.7746, + "step": 527 + }, + { + "epoch": 0.8448, + "grad_norm": 0.40804461396074054, + "learning_rate": 1.2379323805722576e-05, + "loss": 0.7168, + "step": 528 + }, + { + "epoch": 0.8464, + "grad_norm": 0.44647625717321937, + "learning_rate": 1.2130659990073146e-05, + "loss": 0.6938, + "step": 529 + }, + { + "epoch": 0.848, + "grad_norm": 0.4597697470689429, + "learning_rate": 1.1884357690404158e-05, + "loss": 0.6639, + "step": 530 + }, + { + "epoch": 0.8496, + "grad_norm": 0.48485579308178456, + "learning_rate": 1.1640423526166988e-05, + "loss": 0.7607, + "step": 531 + }, + { + "epoch": 0.8512, + "grad_norm": 0.4194608779936511, + "learning_rate": 1.1398864053168534e-05, + "loss": 0.6819, + "step": 532 + }, + { + "epoch": 0.8528, + "grad_norm": 0.40211362916556903, + "learning_rate": 1.1159685763395111e-05, + "loss": 0.741, + "step": 533 + }, + { + "epoch": 0.8544, + "grad_norm": 0.43258769474831466, + "learning_rate": 1.0922895084838037e-05, + "loss": 0.7362, + "step": 534 + }, + { + "epoch": 0.856, + "grad_norm": 0.4665960330004546, + "learning_rate": 1.0688498381320855e-05, + "loss": 0.7407, + "step": 535 + }, + { + "epoch": 0.8576, + "grad_norm": 0.43638629170945437, + "learning_rate": 1.045650195232819e-05, + "loss": 0.6793, + "step": 536 + }, + { + "epoch": 0.8592, + "grad_norm": 0.41166650165988555, + "learning_rate": 1.0226912032836611e-05, + "loss": 0.7082, + "step": 537 + }, + { + "epoch": 0.8608, + "grad_norm": 0.4976658974941119, + "learning_rate": 9.999734793146998e-06, + "loss": 0.7379, + "step": 538 + }, + { + "epoch": 0.8624, + "grad_norm": 0.40432300480341915, + "learning_rate": 9.774976338718677e-06, + "loss": 0.669, + "step": 539 + }, + { + "epoch": 0.864, + "grad_norm": 0.3847586744856044, + "learning_rate": 9.552642710005299e-06, + "loss": 0.6718, + "step": 540 + }, + { + "epoch": 0.8656, + "grad_norm": 0.48610078187730693, + "learning_rate": 9.332739882292752e-06, + "loss": 0.7641, + "step": 541 + }, + { + "epoch": 0.8672, + "grad_norm": 0.3442919441790375, + "learning_rate": 9.115273765538202e-06, + "loss": 0.627, + "step": 542 + }, + { + "epoch": 0.8688, + "grad_norm": 0.40165443953952795, + "learning_rate": 8.900250204211514e-06, + "loss": 0.6798, + "step": 543 + }, + { + "epoch": 0.8704, + "grad_norm": 0.4111172074189193, + "learning_rate": 8.687674977138116e-06, + "loss": 0.6745, + "step": 544 + }, + { + "epoch": 0.872, + "grad_norm": 0.42550186302416637, + "learning_rate": 8.47755379734373e-06, + "loss": 0.7142, + "step": 545 + }, + { + "epoch": 0.8736, + "grad_norm": 0.4198048455350389, + "learning_rate": 8.269892311900696e-06, + "loss": 0.682, + "step": 546 + }, + { + "epoch": 0.8752, + "grad_norm": 0.42105356184842474, + "learning_rate": 8.064696101776358e-06, + "loss": 0.7032, + "step": 547 + }, + { + "epoch": 0.8768, + "grad_norm": 0.38694532652504987, + "learning_rate": 7.861970681683051e-06, + "loss": 0.6954, + "step": 548 + }, + { + "epoch": 0.8784, + "grad_norm": 0.4346602726592264, + "learning_rate": 7.661721499929753e-06, + "loss": 0.7521, + "step": 549 + }, + { + "epoch": 0.88, + "grad_norm": 0.42171087258624906, + "learning_rate": 7.463953938275858e-06, + "loss": 0.6507, + "step": 550 + }, + { + "epoch": 0.8816, + "grad_norm": 0.37950491418682564, + "learning_rate": 7.2686733117863784e-06, + "loss": 0.6733, + "step": 551 + }, + { + "epoch": 0.8832, + "grad_norm": 0.42228264645197383, + "learning_rate": 7.07588486868922e-06, + "loss": 0.5811, + "step": 552 + }, + { + "epoch": 0.8848, + "grad_norm": 0.42265676405283753, + "learning_rate": 6.8855937902340576e-06, + "loss": 0.6716, + "step": 553 + }, + { + "epoch": 0.8864, + "grad_norm": 0.366032607113853, + "learning_rate": 6.6978051905530855e-06, + "loss": 0.6674, + "step": 554 + }, + { + "epoch": 0.888, + "grad_norm": 0.4273846569885087, + "learning_rate": 6.512524116523633e-06, + "loss": 0.7321, + "step": 555 + }, + { + "epoch": 0.8896, + "grad_norm": 0.41439846077578496, + "learning_rate": 6.329755547632499e-06, + "loss": 0.6743, + "step": 556 + }, + { + "epoch": 0.8912, + "grad_norm": 0.4800472014547638, + "learning_rate": 6.149504395842087e-06, + "loss": 0.8113, + "step": 557 + }, + { + "epoch": 0.8928, + "grad_norm": 0.4170099701429586, + "learning_rate": 5.971775505458444e-06, + "loss": 0.7094, + "step": 558 + }, + { + "epoch": 0.8944, + "grad_norm": 0.42684358092676894, + "learning_rate": 5.7965736530010916e-06, + "loss": 0.7315, + "step": 559 + }, + { + "epoch": 0.896, + "grad_norm": 0.36759838949620705, + "learning_rate": 5.623903547074549e-06, + "loss": 0.6375, + "step": 560 + }, + { + "epoch": 0.8976, + "grad_norm": 0.39182460906574307, + "learning_rate": 5.453769828241872e-06, + "loss": 0.635, + "step": 561 + }, + { + "epoch": 0.8992, + "grad_norm": 0.37155113388771127, + "learning_rate": 5.286177068899989e-06, + "loss": 0.6679, + "step": 562 + }, + { + "epoch": 0.9008, + "grad_norm": 0.4892313200523182, + "learning_rate": 5.121129773156663e-06, + "loss": 0.7773, + "step": 563 + }, + { + "epoch": 0.9024, + "grad_norm": 0.3614705577221202, + "learning_rate": 4.95863237670956e-06, + "loss": 0.6662, + "step": 564 + }, + { + "epoch": 0.904, + "grad_norm": 0.40056040990097586, + "learning_rate": 4.798689246727006e-06, + "loss": 0.6835, + "step": 565 + }, + { + "epoch": 0.9056, + "grad_norm": 0.48693182831639875, + "learning_rate": 4.641304681730641e-06, + "loss": 0.6551, + "step": 566 + }, + { + "epoch": 0.9072, + "grad_norm": 0.43409098562506476, + "learning_rate": 4.486482911479839e-06, + "loss": 0.6583, + "step": 567 + }, + { + "epoch": 0.9088, + "grad_norm": 0.44222427351456456, + "learning_rate": 4.3342280968580285e-06, + "loss": 0.8126, + "step": 568 + }, + { + "epoch": 0.9104, + "grad_norm": 0.40367878960302256, + "learning_rate": 4.184544329761009e-06, + "loss": 0.7328, + "step": 569 + }, + { + "epoch": 0.912, + "grad_norm": 0.3878452948297235, + "learning_rate": 4.037435632986786e-06, + "loss": 0.6766, + "step": 570 + }, + { + "epoch": 0.9136, + "grad_norm": 0.4460323004358859, + "learning_rate": 3.892905960127546e-06, + "loss": 0.6327, + "step": 571 + }, + { + "epoch": 0.9152, + "grad_norm": 0.41623353636179367, + "learning_rate": 3.750959195463466e-06, + "loss": 0.7224, + "step": 572 + }, + { + "epoch": 0.9168, + "grad_norm": 0.4414137781565002, + "learning_rate": 3.611599153858214e-06, + "loss": 0.6397, + "step": 573 + }, + { + "epoch": 0.9184, + "grad_norm": 0.39615571862232873, + "learning_rate": 3.4748295806564356e-06, + "loss": 0.6526, + "step": 574 + }, + { + "epoch": 0.92, + "grad_norm": 0.42988070037803233, + "learning_rate": 3.3406541515832003e-06, + "loss": 0.6705, + "step": 575 + }, + { + "epoch": 0.9216, + "grad_norm": 0.4011019198453032, + "learning_rate": 3.209076472645112e-06, + "loss": 0.7098, + "step": 576 + }, + { + "epoch": 0.9232, + "grad_norm": 0.4526937752135543, + "learning_rate": 3.0801000800333877e-06, + "loss": 0.6943, + "step": 577 + }, + { + "epoch": 0.9248, + "grad_norm": 0.414733452155201, + "learning_rate": 2.9537284400289355e-06, + "loss": 0.7004, + "step": 578 + }, + { + "epoch": 0.9264, + "grad_norm": 0.49479347833939974, + "learning_rate": 2.8299649489090475e-06, + "loss": 0.6822, + "step": 579 + }, + { + "epoch": 0.928, + "grad_norm": 0.4196809553904605, + "learning_rate": 2.708812932856253e-06, + "loss": 0.7237, + "step": 580 + }, + { + "epoch": 0.9296, + "grad_norm": 0.4433795918252207, + "learning_rate": 2.590275647868867e-06, + "loss": 0.6647, + "step": 581 + }, + { + "epoch": 0.9312, + "grad_norm": 0.4730290908266958, + "learning_rate": 2.4743562796734622e-06, + "loss": 0.726, + "step": 582 + }, + { + "epoch": 0.9328, + "grad_norm": 0.41578174368248944, + "learning_rate": 2.3610579436393e-06, + "loss": 0.6727, + "step": 583 + }, + { + "epoch": 0.9344, + "grad_norm": 0.47562360950917576, + "learning_rate": 2.250383684694579e-06, + "loss": 0.7028, + "step": 584 + }, + { + "epoch": 0.936, + "grad_norm": 0.43046163526997777, + "learning_rate": 2.1423364772445887e-06, + "loss": 0.6867, + "step": 585 + }, + { + "epoch": 0.9376, + "grad_norm": 0.4386045894305353, + "learning_rate": 2.036919225091827e-06, + "loss": 0.6415, + "step": 586 + }, + { + "epoch": 0.9392, + "grad_norm": 0.4250490242198921, + "learning_rate": 1.9341347613579087e-06, + "loss": 0.6778, + "step": 587 + }, + { + "epoch": 0.9408, + "grad_norm": 0.43396467538084366, + "learning_rate": 1.8339858484073935e-06, + "loss": 0.7061, + "step": 588 + }, + { + "epoch": 0.9424, + "grad_norm": 0.41093749413933983, + "learning_rate": 1.7364751777736332e-06, + "loss": 0.6798, + "step": 589 + }, + { + "epoch": 0.944, + "grad_norm": 0.42716345262559524, + "learning_rate": 1.6416053700863964e-06, + "loss": 0.7352, + "step": 590 + }, + { + "epoch": 0.9456, + "grad_norm": 0.5342245884546438, + "learning_rate": 1.5493789750014031e-06, + "loss": 0.7963, + "step": 591 + }, + { + "epoch": 0.9472, + "grad_norm": 0.4869935685121873, + "learning_rate": 1.459798471131868e-06, + "loss": 0.7474, + "step": 592 + }, + { + "epoch": 0.9488, + "grad_norm": 0.5377492034909038, + "learning_rate": 1.3728662659818204e-06, + "loss": 0.7281, + "step": 593 + }, + { + "epoch": 0.9504, + "grad_norm": 0.40252067300632566, + "learning_rate": 1.2885846958814673e-06, + "loss": 0.6188, + "step": 594 + }, + { + "epoch": 0.952, + "grad_norm": 0.4732759934224106, + "learning_rate": 1.2069560259243328e-06, + "loss": 0.7283, + "step": 595 + }, + { + "epoch": 0.9536, + "grad_norm": 0.36939415621243193, + "learning_rate": 1.1279824499064396e-06, + "loss": 0.6588, + "step": 596 + }, + { + "epoch": 0.9552, + "grad_norm": 0.4768166229427656, + "learning_rate": 1.0516660902673448e-06, + "loss": 0.6875, + "step": 597 + }, + { + "epoch": 0.9568, + "grad_norm": 0.40826207503606476, + "learning_rate": 9.780089980330642e-07, + "loss": 0.6603, + "step": 598 + }, + { + "epoch": 0.9584, + "grad_norm": 0.4342070575118136, + "learning_rate": 9.070131527609604e-07, + "loss": 0.7266, + "step": 599 + }, + { + "epoch": 0.96, + "grad_norm": 0.46573911518925726, + "learning_rate": 8.386804624865851e-07, + "loss": 0.6629, + "step": 600 + }, + { + "epoch": 0.9616, + "grad_norm": 0.4163287933190186, + "learning_rate": 7.730127636723539e-07, + "loss": 0.633, + "step": 601 + }, + { + "epoch": 0.9632, + "grad_norm": 0.4599398479149586, + "learning_rate": 7.100118211581852e-07, + "loss": 0.6399, + "step": 602 + }, + { + "epoch": 0.9648, + "grad_norm": 0.4208391764061762, + "learning_rate": 6.496793281141056e-07, + "loss": 0.7828, + "step": 603 + }, + { + "epoch": 0.9664, + "grad_norm": 0.3866556900143404, + "learning_rate": 5.920169059947411e-07, + "loss": 0.6665, + "step": 604 + }, + { + "epoch": 0.968, + "grad_norm": 0.4383711039390692, + "learning_rate": 5.370261044956971e-07, + "loss": 0.6948, + "step": 605 + }, + { + "epoch": 0.9696, + "grad_norm": 0.6069140919286845, + "learning_rate": 4.847084015119574e-07, + "loss": 0.7724, + "step": 606 + }, + { + "epoch": 0.9712, + "grad_norm": 0.4134702599835318, + "learning_rate": 4.3506520309813947e-07, + "loss": 0.6775, + "step": 607 + }, + { + "epoch": 0.9728, + "grad_norm": 0.39534120734604034, + "learning_rate": 3.8809784343072366e-07, + "loss": 0.6894, + "step": 608 + }, + { + "epoch": 0.9744, + "grad_norm": 0.4500504918539995, + "learning_rate": 3.4380758477219333e-07, + "loss": 0.7333, + "step": 609 + }, + { + "epoch": 0.976, + "grad_norm": 0.526748092187756, + "learning_rate": 3.0219561743707326e-07, + "loss": 0.7924, + "step": 610 + }, + { + "epoch": 0.9776, + "grad_norm": 0.3870731710657465, + "learning_rate": 2.6326305976001055e-07, + "loss": 0.6952, + "step": 611 + }, + { + "epoch": 0.9792, + "grad_norm": 0.40571545726534053, + "learning_rate": 2.2701095806565432e-07, + "loss": 0.6893, + "step": 612 + }, + { + "epoch": 0.9808, + "grad_norm": 0.4575999319147385, + "learning_rate": 1.9344028664056713e-07, + "loss": 0.6865, + "step": 613 + }, + { + "epoch": 0.9824, + "grad_norm": 0.4799332271789479, + "learning_rate": 1.6255194770704586e-07, + "loss": 0.729, + "step": 614 + }, + { + "epoch": 0.984, + "grad_norm": 0.5887484421927903, + "learning_rate": 1.3434677139885222e-07, + "loss": 0.6209, + "step": 615 + }, + { + "epoch": 0.9856, + "grad_norm": 0.43168284858047784, + "learning_rate": 1.0882551573891953e-07, + "loss": 0.6994, + "step": 616 + }, + { + "epoch": 0.9872, + "grad_norm": 0.539889322027179, + "learning_rate": 8.598886661895788e-08, + "loss": 0.6928, + "step": 617 + }, + { + "epoch": 0.9888, + "grad_norm": 0.4984417364968895, + "learning_rate": 6.583743778106887e-08, + "loss": 0.7475, + "step": 618 + }, + { + "epoch": 0.9904, + "grad_norm": 0.39370007146273023, + "learning_rate": 4.837177080119215e-08, + "loss": 0.7119, + "step": 619 + }, + { + "epoch": 0.992, + "grad_norm": 0.38284303572252104, + "learning_rate": 3.359233507459481e-08, + "loss": 0.641, + "step": 620 + }, + { + "epoch": 0.9936, + "grad_norm": 0.4101442442522563, + "learning_rate": 2.1499527803214846e-08, + "loss": 0.6702, + "step": 621 + }, + { + "epoch": 0.9952, + "grad_norm": 0.40298946757857557, + "learning_rate": 1.209367398504746e-08, + "loss": 0.6648, + "step": 622 + }, + { + "epoch": 0.9968, + "grad_norm": 0.4983330016825017, + "learning_rate": 5.375026405352035e-09, + "loss": 0.7611, + "step": 623 + }, + { + "epoch": 0.9984, + "grad_norm": 0.5495717532741172, + "learning_rate": 1.3437656298687097e-09, + "loss": 0.816, + "step": 624 + }, + { + "epoch": 1.0, + "grad_norm": 0.454551235327257, + "learning_rate": 0.0, + "loss": 0.7579, + "step": 625 + }, + { + "epoch": 1.0, + "step": 625, + "total_flos": 543415710547968.0, + "train_loss": 0.7690219638824463, + "train_runtime": 9697.6766, + "train_samples_per_second": 1.031, + "train_steps_per_second": 0.064 + } + ], + "logging_steps": 1.0, + "max_steps": 625, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 543415710547968.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_3_epochs_1_GA_4_lora/README.md b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_3_epochs_1_GA_4_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_3_epochs_1_GA_4_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_3_epochs_1_GA_4_lora/adapter_config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_3_epochs_1_GA_4_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d9b6b576818b64d18ee005ecc9a68eeb6922180d --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_3_epochs_1_GA_4_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "down_proj", + "o_proj", + "k_proj", + "gate_proj", + "v_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_3_epochs_1_GA_4_lora/adapter_model.safetensors b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_3_epochs_1_GA_4_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..df1e3f17de7aaa6d2bc673852858fc75a3cafc79 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_3_epochs_1_GA_4_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49193811350fceac6ff35d7a8e2c44d86e2897dd1908ea8688102ba6c4dd96d6 +size 671150064 diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_3_epochs_1_GA_4_lora/config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_3_epochs_1_GA_4_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_3_epochs_1_GA_4_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_3_epochs_1_GA_4_lora/non_lora_trainables.bin b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_3_epochs_1_GA_4_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..aee3bfefd48577574a87cb32a867d54e9e47eb88 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_3_epochs_1_GA_4_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29d8d2b5eddb734787584954dba701ce75115d53d4b55cfe642afbb06871cc4f +size 918507402 diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_3_epochs_1_GA_4_lora/trainer_state.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_3_epochs_1_GA_4_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..59eb9c8ce67a02aa78b6f3284df17cde60cc7fdd --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_10000_repeat_3_epochs_1_GA_4_lora/trainer_state.json @@ -0,0 +1,2226 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9984, + "eval_steps": 500, + "global_step": 312, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0032, + "grad_norm": 0.9106693837526286, + "learning_rate": 2e-05, + "loss": 1.4166, + "step": 1 + }, + { + "epoch": 0.0064, + "grad_norm": 0.9150250397204797, + "learning_rate": 4e-05, + "loss": 1.4921, + "step": 2 + }, + { + "epoch": 0.0096, + "grad_norm": 0.7355679850665631, + "learning_rate": 6e-05, + "loss": 1.3632, + "step": 3 + }, + { + "epoch": 0.0128, + "grad_norm": 0.7891276176019939, + "learning_rate": 8e-05, + "loss": 1.3311, + "step": 4 + }, + { + "epoch": 0.016, + "grad_norm": 0.8090098490379414, + "learning_rate": 0.0001, + "loss": 1.1543, + "step": 5 + }, + { + "epoch": 0.0192, + "grad_norm": 0.8151074320199535, + "learning_rate": 0.00012, + "loss": 1.0282, + "step": 6 + }, + { + "epoch": 0.0224, + "grad_norm": 0.809346750846133, + "learning_rate": 0.00014, + "loss": 1.063, + "step": 7 + }, + { + "epoch": 0.0256, + "grad_norm": 0.6372252258219455, + "learning_rate": 0.00016, + "loss": 0.9674, + "step": 8 + }, + { + "epoch": 0.0288, + "grad_norm": 0.47471192891119973, + "learning_rate": 0.00018, + "loss": 0.9224, + "step": 9 + }, + { + "epoch": 0.032, + "grad_norm": 0.43416804788081265, + "learning_rate": 0.0002, + "loss": 0.969, + "step": 10 + }, + { + "epoch": 0.0352, + "grad_norm": 0.41593483043885393, + "learning_rate": 0.00019999458931878073, + "loss": 0.947, + "step": 11 + }, + { + "epoch": 0.0384, + "grad_norm": 0.46251156589792036, + "learning_rate": 0.0001999783578606323, + "loss": 0.9553, + "step": 12 + }, + { + "epoch": 0.0416, + "grad_norm": 0.42505280208450474, + "learning_rate": 0.00019995130738201966, + "loss": 0.9056, + "step": 13 + }, + { + "epoch": 0.0448, + "grad_norm": 0.498402240030992, + "learning_rate": 0.0001999134408101731, + "loss": 0.9016, + "step": 14 + }, + { + "epoch": 0.048, + "grad_norm": 0.4916638726847002, + "learning_rate": 0.00019986476224277165, + "loss": 0.9454, + "step": 15 + }, + { + "epoch": 0.0512, + "grad_norm": 0.49106508322501824, + "learning_rate": 0.00019980527694749952, + "loss": 0.9412, + "step": 16 + }, + { + "epoch": 0.0544, + "grad_norm": 0.43278849589900825, + "learning_rate": 0.00019973499136147606, + "loss": 0.8991, + "step": 17 + }, + { + "epoch": 0.0576, + "grad_norm": 0.4526138687244381, + "learning_rate": 0.0001996539130905593, + "loss": 0.8968, + "step": 18 + }, + { + "epoch": 0.0608, + "grad_norm": 0.37678340559461915, + "learning_rate": 0.0001995620509085228, + "loss": 0.861, + "step": 19 + }, + { + "epoch": 0.064, + "grad_norm": 0.39240358043942264, + "learning_rate": 0.00019945941475610623, + "loss": 0.9177, + "step": 20 + }, + { + "epoch": 0.0672, + "grad_norm": 0.40094288576049547, + "learning_rate": 0.0001993460157399396, + "loss": 0.8959, + "step": 21 + }, + { + "epoch": 0.0704, + "grad_norm": 0.45673105078652454, + "learning_rate": 0.0001992218661313415, + "loss": 0.8908, + "step": 22 + }, + { + "epoch": 0.0736, + "grad_norm": 0.3926631168011837, + "learning_rate": 0.00019908697936499103, + "loss": 0.8175, + "step": 23 + }, + { + "epoch": 0.0768, + "grad_norm": 0.45952477937088554, + "learning_rate": 0.00019894137003747403, + "loss": 0.8398, + "step": 24 + }, + { + "epoch": 0.08, + "grad_norm": 0.43896138205695384, + "learning_rate": 0.00019878505390570362, + "loss": 0.9402, + "step": 25 + }, + { + "epoch": 0.0832, + "grad_norm": 0.3406810043098025, + "learning_rate": 0.00019861804788521493, + "loss": 0.8302, + "step": 26 + }, + { + "epoch": 0.0864, + "grad_norm": 0.39297163285135195, + "learning_rate": 0.00019844037004833473, + "loss": 0.8725, + "step": 27 + }, + { + "epoch": 0.0896, + "grad_norm": 0.36185852199663304, + "learning_rate": 0.00019825203962222572, + "loss": 0.8859, + "step": 28 + }, + { + "epoch": 0.0928, + "grad_norm": 0.3106879438825548, + "learning_rate": 0.0001980530769868059, + "loss": 0.818, + "step": 29 + }, + { + "epoch": 0.096, + "grad_norm": 0.37724029656618624, + "learning_rate": 0.00019784350367254322, + "loss": 0.888, + "step": 30 + }, + { + "epoch": 0.0992, + "grad_norm": 0.35228625590220625, + "learning_rate": 0.0001976233423581255, + "loss": 0.8856, + "step": 31 + }, + { + "epoch": 0.1024, + "grad_norm": 0.37806390979081067, + "learning_rate": 0.0001973926168680066, + "loss": 0.8491, + "step": 32 + }, + { + "epoch": 0.1056, + "grad_norm": 0.35078254894013106, + "learning_rate": 0.00019715135216982798, + "loss": 0.8164, + "step": 33 + }, + { + "epoch": 0.1088, + "grad_norm": 0.37913950320639267, + "learning_rate": 0.0001968995743717171, + "loss": 0.8466, + "step": 34 + }, + { + "epoch": 0.112, + "grad_norm": 0.4181875377614008, + "learning_rate": 0.00019663731071946206, + "loss": 0.9186, + "step": 35 + }, + { + "epoch": 0.1152, + "grad_norm": 0.4043945986637868, + "learning_rate": 0.00019636458959356316, + "loss": 0.8783, + "step": 36 + }, + { + "epoch": 0.1184, + "grad_norm": 0.3174107328080895, + "learning_rate": 0.0001960814405061619, + "loss": 0.8019, + "step": 37 + }, + { + "epoch": 0.1216, + "grad_norm": 0.3345640199753624, + "learning_rate": 0.00019578789409784727, + "loss": 0.7977, + "step": 38 + }, + { + "epoch": 0.1248, + "grad_norm": 0.3339024057811489, + "learning_rate": 0.00019548398213434007, + "loss": 0.7785, + "step": 39 + }, + { + "epoch": 0.128, + "grad_norm": 0.4680186115901849, + "learning_rate": 0.00019516973750305532, + "loss": 0.78, + "step": 40 + }, + { + "epoch": 0.1312, + "grad_norm": 0.3503475171946669, + "learning_rate": 0.00019484519420954354, + "loss": 0.7886, + "step": 41 + }, + { + "epoch": 0.1344, + "grad_norm": 0.33065456175873204, + "learning_rate": 0.00019451038737381077, + "loss": 0.8181, + "step": 42 + }, + { + "epoch": 0.1376, + "grad_norm": 0.30800754949755055, + "learning_rate": 0.00019416535322651818, + "loss": 0.7451, + "step": 43 + }, + { + "epoch": 0.1408, + "grad_norm": 0.36298560164696553, + "learning_rate": 0.00019381012910506146, + "loss": 0.8762, + "step": 44 + }, + { + "epoch": 0.144, + "grad_norm": 0.3287578419703281, + "learning_rate": 0.00019344475344953012, + "loss": 0.7494, + "step": 45 + }, + { + "epoch": 0.1472, + "grad_norm": 0.3375297717853042, + "learning_rate": 0.00019306926579854821, + "loss": 0.8056, + "step": 46 + }, + { + "epoch": 0.1504, + "grad_norm": 0.3607363104653381, + "learning_rate": 0.00019268370678499533, + "loss": 0.7577, + "step": 47 + }, + { + "epoch": 0.1536, + "grad_norm": 0.3253489154409118, + "learning_rate": 0.0001922881181316097, + "loss": 0.7929, + "step": 48 + }, + { + "epoch": 0.1568, + "grad_norm": 0.35589811863739607, + "learning_rate": 0.00019188254264647337, + "loss": 0.8088, + "step": 49 + }, + { + "epoch": 0.16, + "grad_norm": 0.35912939752872436, + "learning_rate": 0.0001914670242183795, + "loss": 0.8456, + "step": 50 + }, + { + "epoch": 0.1632, + "grad_norm": 0.3600222012087073, + "learning_rate": 0.0001910416078120832, + "loss": 0.8688, + "step": 51 + }, + { + "epoch": 0.1664, + "grad_norm": 0.32996798554134926, + "learning_rate": 0.0001906063394634356, + "loss": 0.7576, + "step": 52 + }, + { + "epoch": 0.1696, + "grad_norm": 0.36570201256321944, + "learning_rate": 0.00019016126627440237, + "loss": 0.8121, + "step": 53 + }, + { + "epoch": 0.1728, + "grad_norm": 0.3379751853927354, + "learning_rate": 0.00018970643640796642, + "loss": 0.8599, + "step": 54 + }, + { + "epoch": 0.176, + "grad_norm": 0.3262085269268287, + "learning_rate": 0.000189241899082916, + "loss": 0.7844, + "step": 55 + }, + { + "epoch": 0.1792, + "grad_norm": 0.31957254739098756, + "learning_rate": 0.00018876770456851877, + "loss": 0.7951, + "step": 56 + }, + { + "epoch": 0.1824, + "grad_norm": 0.3558840401659398, + "learning_rate": 0.0001882839041790818, + "loss": 0.8208, + "step": 57 + }, + { + "epoch": 0.1856, + "grad_norm": 0.33425963122126895, + "learning_rate": 0.00018779055026839868, + "loss": 0.7737, + "step": 58 + }, + { + "epoch": 0.1888, + "grad_norm": 0.3694001379431872, + "learning_rate": 0.00018728769622408423, + "loss": 0.798, + "step": 59 + }, + { + "epoch": 0.192, + "grad_norm": 0.32885346367760754, + "learning_rate": 0.00018677539646179707, + "loss": 0.8075, + "step": 60 + }, + { + "epoch": 0.1952, + "grad_norm": 0.3285771099087272, + "learning_rate": 0.00018625370641935129, + "loss": 0.7653, + "step": 61 + }, + { + "epoch": 0.1984, + "grad_norm": 0.35925157593697005, + "learning_rate": 0.00018572268255071718, + "loss": 0.8244, + "step": 62 + }, + { + "epoch": 0.2016, + "grad_norm": 0.38310318033177926, + "learning_rate": 0.00018518238231991218, + "loss": 0.8022, + "step": 63 + }, + { + "epoch": 0.2048, + "grad_norm": 0.38208895577179514, + "learning_rate": 0.00018463286419478255, + "loss": 0.8795, + "step": 64 + }, + { + "epoch": 0.208, + "grad_norm": 0.34205407357191786, + "learning_rate": 0.00018407418764067627, + "loss": 0.8261, + "step": 65 + }, + { + "epoch": 0.2112, + "grad_norm": 0.3335723562628723, + "learning_rate": 0.00018350641311400812, + "loss": 0.7516, + "step": 66 + }, + { + "epoch": 0.2144, + "grad_norm": 0.39029161117617034, + "learning_rate": 0.0001829296020557174, + "loss": 0.8797, + "step": 67 + }, + { + "epoch": 0.2176, + "grad_norm": 0.3186738975446756, + "learning_rate": 0.00018234381688461942, + "loss": 0.797, + "step": 68 + }, + { + "epoch": 0.2208, + "grad_norm": 0.3492691721803383, + "learning_rate": 0.0001817491209906506, + "loss": 0.8084, + "step": 69 + }, + { + "epoch": 0.224, + "grad_norm": 0.3386299809469852, + "learning_rate": 0.00018114557872800905, + "loss": 0.8003, + "step": 70 + }, + { + "epoch": 0.2272, + "grad_norm": 0.3432083736446383, + "learning_rate": 0.00018053325540819045, + "loss": 0.8036, + "step": 71 + }, + { + "epoch": 0.2304, + "grad_norm": 0.32411203608189576, + "learning_rate": 0.0001799122172929206, + "loss": 0.8045, + "step": 72 + }, + { + "epoch": 0.2336, + "grad_norm": 0.34205019992181324, + "learning_rate": 0.00017928253158698473, + "loss": 0.8177, + "step": 73 + }, + { + "epoch": 0.2368, + "grad_norm": 0.3393413929561874, + "learning_rate": 0.0001786442664309554, + "loss": 0.7829, + "step": 74 + }, + { + "epoch": 0.24, + "grad_norm": 0.3403930731634467, + "learning_rate": 0.0001779974908938184, + "loss": 0.7705, + "step": 75 + }, + { + "epoch": 0.2432, + "grad_norm": 0.4563705848356069, + "learning_rate": 0.0001773422749654988, + "loss": 0.7094, + "step": 76 + }, + { + "epoch": 0.2464, + "grad_norm": 0.3356557868672747, + "learning_rate": 0.00017667868954928694, + "loss": 0.7877, + "step": 77 + }, + { + "epoch": 0.2496, + "grad_norm": 0.3343961782832242, + "learning_rate": 0.00017600680645416583, + "loss": 0.799, + "step": 78 + }, + { + "epoch": 0.2528, + "grad_norm": 0.29896123475559366, + "learning_rate": 0.00017532669838704035, + "loss": 0.7427, + "step": 79 + }, + { + "epoch": 0.256, + "grad_norm": 0.3884174599267879, + "learning_rate": 0.00017463843894486937, + "loss": 0.7676, + "step": 80 + }, + { + "epoch": 0.2592, + "grad_norm": 0.3762609083923659, + "learning_rate": 0.0001739421026067017, + "loss": 0.8712, + "step": 81 + }, + { + "epoch": 0.2624, + "grad_norm": 0.3238605671110351, + "learning_rate": 0.00017323776472561627, + "loss": 0.815, + "step": 82 + }, + { + "epoch": 0.2656, + "grad_norm": 0.3318596057413296, + "learning_rate": 0.00017252550152056795, + "loss": 0.7978, + "step": 83 + }, + { + "epoch": 0.2688, + "grad_norm": 0.37648246871592933, + "learning_rate": 0.0001718053900681397, + "loss": 0.8206, + "step": 84 + }, + { + "epoch": 0.272, + "grad_norm": 0.3609318128948997, + "learning_rate": 0.00017107750829420176, + "loss": 0.8217, + "step": 85 + }, + { + "epoch": 0.2752, + "grad_norm": 0.3240938832354287, + "learning_rate": 0.00017034193496547902, + "loss": 0.7493, + "step": 86 + }, + { + "epoch": 0.2784, + "grad_norm": 0.46518441453936255, + "learning_rate": 0.00016959874968102735, + "loss": 0.7919, + "step": 87 + }, + { + "epoch": 0.2816, + "grad_norm": 0.3885763765239273, + "learning_rate": 0.00016884803286362, + "loss": 0.7677, + "step": 88 + }, + { + "epoch": 0.2848, + "grad_norm": 0.34957619797079137, + "learning_rate": 0.00016808986575104465, + "loss": 0.7629, + "step": 89 + }, + { + "epoch": 0.288, + "grad_norm": 0.33173392274808255, + "learning_rate": 0.00016732433038731242, + "loss": 0.8256, + "step": 90 + }, + { + "epoch": 0.2912, + "grad_norm": 0.4146598743193586, + "learning_rate": 0.0001665515096137797, + "loss": 0.7728, + "step": 91 + }, + { + "epoch": 0.2944, + "grad_norm": 0.3292399124225849, + "learning_rate": 0.00016577148706018328, + "loss": 0.776, + "step": 92 + }, + { + "epoch": 0.2976, + "grad_norm": 0.3278418203942441, + "learning_rate": 0.00016498434713559088, + "loss": 0.7878, + "step": 93 + }, + { + "epoch": 0.3008, + "grad_norm": 0.2936962503701319, + "learning_rate": 0.00016419017501926656, + "loss": 0.7713, + "step": 94 + }, + { + "epoch": 0.304, + "grad_norm": 0.37106134444980204, + "learning_rate": 0.0001633890566514535, + "loss": 0.7824, + "step": 95 + }, + { + "epoch": 0.3072, + "grad_norm": 0.3591030974883882, + "learning_rate": 0.00016258107872407375, + "loss": 0.7817, + "step": 96 + }, + { + "epoch": 0.3104, + "grad_norm": 0.3467533106037834, + "learning_rate": 0.0001617663286713474, + "loss": 0.7725, + "step": 97 + }, + { + "epoch": 0.3136, + "grad_norm": 0.32975563402977487, + "learning_rate": 0.00016094489466033043, + "loss": 0.7484, + "step": 98 + }, + { + "epoch": 0.3168, + "grad_norm": 0.31496869619076157, + "learning_rate": 0.00016011686558137448, + "loss": 0.7379, + "step": 99 + }, + { + "epoch": 0.32, + "grad_norm": 0.3402749041335752, + "learning_rate": 0.0001592823310385073, + "loss": 0.7306, + "step": 100 + }, + { + "epoch": 0.3232, + "grad_norm": 0.31941766632295754, + "learning_rate": 0.0001584413813397364, + "loss": 0.7572, + "step": 101 + }, + { + "epoch": 0.3264, + "grad_norm": 0.3015192715759972, + "learning_rate": 0.00015759410748727662, + "loss": 0.7023, + "step": 102 + }, + { + "epoch": 0.3296, + "grad_norm": 0.32148409942770195, + "learning_rate": 0.00015674060116770236, + "loss": 0.7991, + "step": 103 + }, + { + "epoch": 0.3328, + "grad_norm": 0.290285665482951, + "learning_rate": 0.00015588095474202595, + "loss": 0.7163, + "step": 104 + }, + { + "epoch": 0.336, + "grad_norm": 0.3610296010151392, + "learning_rate": 0.00015501526123570277, + "loss": 0.792, + "step": 105 + }, + { + "epoch": 0.3392, + "grad_norm": 0.32500368638370364, + "learning_rate": 0.00015414361432856475, + "loss": 0.7554, + "step": 106 + }, + { + "epoch": 0.3424, + "grad_norm": 0.2940274549940774, + "learning_rate": 0.0001532661083446829, + "loss": 0.7591, + "step": 107 + }, + { + "epoch": 0.3456, + "grad_norm": 0.34867458636709453, + "learning_rate": 0.00015238283824216015, + "loss": 0.7721, + "step": 108 + }, + { + "epoch": 0.3488, + "grad_norm": 0.40554825556846064, + "learning_rate": 0.00015149389960285558, + "loss": 0.7677, + "step": 109 + }, + { + "epoch": 0.352, + "grad_norm": 0.3644334745678942, + "learning_rate": 0.00015059938862204127, + "loss": 0.7893, + "step": 110 + }, + { + "epoch": 0.3552, + "grad_norm": 0.3226736096226785, + "learning_rate": 0.00014969940209799248, + "loss": 0.7982, + "step": 111 + }, + { + "epoch": 0.3584, + "grad_norm": 0.29905498065898395, + "learning_rate": 0.00014879403742151283, + "loss": 0.7288, + "step": 112 + }, + { + "epoch": 0.3616, + "grad_norm": 0.4616258882615652, + "learning_rate": 0.00014788339256539544, + "loss": 0.7713, + "step": 113 + }, + { + "epoch": 0.3648, + "grad_norm": 0.38088529191812087, + "learning_rate": 0.0001469675660738206, + "loss": 0.772, + "step": 114 + }, + { + "epoch": 0.368, + "grad_norm": 0.3568898489498515, + "learning_rate": 0.00014604665705169237, + "loss": 0.7564, + "step": 115 + }, + { + "epoch": 0.3712, + "grad_norm": 0.2865326409362873, + "learning_rate": 0.00014512076515391375, + "loss": 0.7457, + "step": 116 + }, + { + "epoch": 0.3744, + "grad_norm": 0.35191316566879094, + "learning_rate": 0.00014418999057460276, + "loss": 0.7629, + "step": 117 + }, + { + "epoch": 0.3776, + "grad_norm": 0.3047226349467024, + "learning_rate": 0.0001432544340362501, + "loss": 0.7326, + "step": 118 + }, + { + "epoch": 0.3808, + "grad_norm": 0.29800211432506263, + "learning_rate": 0.00014231419677881966, + "loss": 0.6982, + "step": 119 + }, + { + "epoch": 0.384, + "grad_norm": 0.34687981717363264, + "learning_rate": 0.00014136938054879283, + "loss": 0.8012, + "step": 120 + }, + { + "epoch": 0.3872, + "grad_norm": 0.3306311760889097, + "learning_rate": 0.00014042008758815818, + "loss": 0.7926, + "step": 121 + }, + { + "epoch": 0.3904, + "grad_norm": 0.3406212491870728, + "learning_rate": 0.00013946642062334766, + "loss": 0.7864, + "step": 122 + }, + { + "epoch": 0.3936, + "grad_norm": 0.32377838938476877, + "learning_rate": 0.00013850848285411994, + "loss": 0.7276, + "step": 123 + }, + { + "epoch": 0.3968, + "grad_norm": 0.29897427831595963, + "learning_rate": 0.000137546377942393, + "loss": 0.703, + "step": 124 + }, + { + "epoch": 0.4, + "grad_norm": 0.37386473627609185, + "learning_rate": 0.00013658021000102636, + "loss": 0.8466, + "step": 125 + }, + { + "epoch": 0.4032, + "grad_norm": 0.3698269446125909, + "learning_rate": 0.00013561008358255468, + "loss": 0.8009, + "step": 126 + }, + { + "epoch": 0.4064, + "grad_norm": 0.3368127495556458, + "learning_rate": 0.00013463610366787392, + "loss": 0.8202, + "step": 127 + }, + { + "epoch": 0.4096, + "grad_norm": 0.3237476463764862, + "learning_rate": 0.00013365837565488064, + "loss": 0.7675, + "step": 128 + }, + { + "epoch": 0.4128, + "grad_norm": 0.3281958141162337, + "learning_rate": 0.0001326770053470668, + "loss": 0.7686, + "step": 129 + }, + { + "epoch": 0.416, + "grad_norm": 0.36031942199209543, + "learning_rate": 0.0001316920989420703, + "loss": 0.7342, + "step": 130 + }, + { + "epoch": 0.4192, + "grad_norm": 0.3157432648234459, + "learning_rate": 0.00013070376302018287, + "loss": 0.7574, + "step": 131 + }, + { + "epoch": 0.4224, + "grad_norm": 0.34776666895501035, + "learning_rate": 0.00012971210453281674, + "loss": 0.7506, + "step": 132 + }, + { + "epoch": 0.4256, + "grad_norm": 0.3012211162298138, + "learning_rate": 0.000128717230790931, + "loss": 0.7199, + "step": 133 + }, + { + "epoch": 0.4288, + "grad_norm": 0.3232381306031035, + "learning_rate": 0.00012771924945341906, + "loss": 0.729, + "step": 134 + }, + { + "epoch": 0.432, + "grad_norm": 0.368877912773277, + "learning_rate": 0.00012671826851545851, + "loss": 0.7859, + "step": 135 + }, + { + "epoch": 0.4352, + "grad_norm": 0.34015263865850554, + "learning_rate": 0.0001257143962968246, + "loss": 0.7422, + "step": 136 + }, + { + "epoch": 0.4384, + "grad_norm": 0.303778036696223, + "learning_rate": 0.00012470774143016853, + "loss": 0.7562, + "step": 137 + }, + { + "epoch": 0.4416, + "grad_norm": 0.30744559225466855, + "learning_rate": 0.00012369841284926188, + "loss": 0.7674, + "step": 138 + }, + { + "epoch": 0.4448, + "grad_norm": 0.3451312795488895, + "learning_rate": 0.00012268651977720866, + "loss": 0.7793, + "step": 139 + }, + { + "epoch": 0.448, + "grad_norm": 0.4812586743513979, + "learning_rate": 0.00012167217171462566, + "loss": 0.6997, + "step": 140 + }, + { + "epoch": 0.4512, + "grad_norm": 0.2795763029314522, + "learning_rate": 0.0001206554784277931, + "loss": 0.721, + "step": 141 + }, + { + "epoch": 0.4544, + "grad_norm": 0.33010817156052036, + "learning_rate": 0.00011963654993677645, + "loss": 0.7574, + "step": 142 + }, + { + "epoch": 0.4576, + "grad_norm": 0.4017483091261422, + "learning_rate": 0.00011861549650352069, + "loss": 0.7281, + "step": 143 + }, + { + "epoch": 0.4608, + "grad_norm": 0.2873340263031428, + "learning_rate": 0.00011759242861991855, + "loss": 0.7483, + "step": 144 + }, + { + "epoch": 0.464, + "grad_norm": 0.29693018943938276, + "learning_rate": 0.00011656745699585371, + "loss": 0.7638, + "step": 145 + }, + { + "epoch": 0.4672, + "grad_norm": 0.3193708762335958, + "learning_rate": 0.00011554069254722051, + "loss": 0.7897, + "step": 146 + }, + { + "epoch": 0.4704, + "grad_norm": 0.3467588383503259, + "learning_rate": 0.00011451224638392129, + "loss": 0.7431, + "step": 147 + }, + { + "epoch": 0.4736, + "grad_norm": 0.3201402192283141, + "learning_rate": 0.00011348222979784289, + "loss": 0.7126, + "step": 148 + }, + { + "epoch": 0.4768, + "grad_norm": 0.449068735615328, + "learning_rate": 0.00011245075425081328, + "loss": 0.7841, + "step": 149 + }, + { + "epoch": 0.48, + "grad_norm": 0.3493031254413651, + "learning_rate": 0.00011141793136253986, + "loss": 0.7467, + "step": 150 + }, + { + "epoch": 0.4832, + "grad_norm": 0.3369590279859377, + "learning_rate": 0.0001103838728985307, + "loss": 0.7988, + "step": 151 + }, + { + "epoch": 0.4864, + "grad_norm": 0.5187049166800942, + "learning_rate": 0.000109348690758, + "loss": 0.7486, + "step": 152 + }, + { + "epoch": 0.4896, + "grad_norm": 0.3091547818218103, + "learning_rate": 0.00010831249696175918, + "loss": 0.6941, + "step": 153 + }, + { + "epoch": 0.4928, + "grad_norm": 0.3280449119692185, + "learning_rate": 0.0001072754036400944, + "loss": 0.7202, + "step": 154 + }, + { + "epoch": 0.496, + "grad_norm": 0.3167129579508421, + "learning_rate": 0.00010623752302063283, + "loss": 0.7113, + "step": 155 + }, + { + "epoch": 0.4992, + "grad_norm": 0.34194146462229297, + "learning_rate": 0.00010519896741619803, + "loss": 0.7672, + "step": 156 + }, + { + "epoch": 0.5024, + "grad_norm": 0.3394190438831376, + "learning_rate": 0.00010415984921265609, + "loss": 0.7664, + "step": 157 + }, + { + "epoch": 0.5056, + "grad_norm": 0.2866043239924286, + "learning_rate": 0.00010312028085675391, + "loss": 0.692, + "step": 158 + }, + { + "epoch": 0.5088, + "grad_norm": 0.3193779930417426, + "learning_rate": 0.00010208037484395114, + "loss": 0.7195, + "step": 159 + }, + { + "epoch": 0.512, + "grad_norm": 0.3036615572504567, + "learning_rate": 0.00010104024370624644, + "loss": 0.7385, + "step": 160 + }, + { + "epoch": 0.5152, + "grad_norm": 0.3211627469185093, + "learning_rate": 0.0001, + "loss": 0.7513, + "step": 161 + }, + { + "epoch": 0.5184, + "grad_norm": 0.30853424987560296, + "learning_rate": 9.895975629375359e-05, + "loss": 0.7098, + "step": 162 + }, + { + "epoch": 0.5216, + "grad_norm": 0.34048566014865345, + "learning_rate": 9.791962515604887e-05, + "loss": 0.8106, + "step": 163 + }, + { + "epoch": 0.5248, + "grad_norm": 0.3555812501300445, + "learning_rate": 9.687971914324607e-05, + "loss": 0.6588, + "step": 164 + }, + { + "epoch": 0.528, + "grad_norm": 0.30003114018974325, + "learning_rate": 9.584015078734395e-05, + "loss": 0.7057, + "step": 165 + }, + { + "epoch": 0.5312, + "grad_norm": 0.34098331573546814, + "learning_rate": 9.480103258380198e-05, + "loss": 0.7539, + "step": 166 + }, + { + "epoch": 0.5344, + "grad_norm": 0.31843957799356415, + "learning_rate": 9.376247697936719e-05, + "loss": 0.744, + "step": 167 + }, + { + "epoch": 0.5376, + "grad_norm": 0.2827599722340685, + "learning_rate": 9.272459635990562e-05, + "loss": 0.7057, + "step": 168 + }, + { + "epoch": 0.5408, + "grad_norm": 0.35937660615313516, + "learning_rate": 9.168750303824084e-05, + "loss": 0.7613, + "step": 169 + }, + { + "epoch": 0.544, + "grad_norm": 0.32390203597909806, + "learning_rate": 9.065130924199998e-05, + "loss": 0.7375, + "step": 170 + }, + { + "epoch": 0.5472, + "grad_norm": 0.2982931570036195, + "learning_rate": 8.961612710146934e-05, + "loss": 0.7293, + "step": 171 + }, + { + "epoch": 0.5504, + "grad_norm": 0.37475104123017583, + "learning_rate": 8.858206863746018e-05, + "loss": 0.7729, + "step": 172 + }, + { + "epoch": 0.5536, + "grad_norm": 0.400425482560844, + "learning_rate": 8.754924574918675e-05, + "loss": 0.7764, + "step": 173 + }, + { + "epoch": 0.5568, + "grad_norm": 0.30693636509591, + "learning_rate": 8.651777020215712e-05, + "loss": 0.735, + "step": 174 + }, + { + "epoch": 0.56, + "grad_norm": 0.3312361648741088, + "learning_rate": 8.548775361607872e-05, + "loss": 0.7684, + "step": 175 + }, + { + "epoch": 0.5632, + "grad_norm": 0.342117677610307, + "learning_rate": 8.445930745277953e-05, + "loss": 0.7056, + "step": 176 + }, + { + "epoch": 0.5664, + "grad_norm": 0.4026302548599891, + "learning_rate": 8.343254300414628e-05, + "loss": 0.7536, + "step": 177 + }, + { + "epoch": 0.5696, + "grad_norm": 0.36524291372842144, + "learning_rate": 8.240757138008149e-05, + "loss": 0.7166, + "step": 178 + }, + { + "epoch": 0.5728, + "grad_norm": 0.34879105119066595, + "learning_rate": 8.138450349647936e-05, + "loss": 0.741, + "step": 179 + }, + { + "epoch": 0.576, + "grad_norm": 0.35691755313184303, + "learning_rate": 8.036345006322359e-05, + "loss": 0.7353, + "step": 180 + }, + { + "epoch": 0.5792, + "grad_norm": 0.3063866868885555, + "learning_rate": 7.934452157220694e-05, + "loss": 0.7477, + "step": 181 + }, + { + "epoch": 0.5824, + "grad_norm": 0.3354965106858531, + "learning_rate": 7.832782828537437e-05, + "loss": 0.7281, + "step": 182 + }, + { + "epoch": 0.5856, + "grad_norm": 0.3438439178838029, + "learning_rate": 7.731348022279134e-05, + "loss": 0.7948, + "step": 183 + }, + { + "epoch": 0.5888, + "grad_norm": 0.2813875656620238, + "learning_rate": 7.630158715073813e-05, + "loss": 0.696, + "step": 184 + }, + { + "epoch": 0.592, + "grad_norm": 0.32537325052888916, + "learning_rate": 7.52922585698315e-05, + "loss": 0.7581, + "step": 185 + }, + { + "epoch": 0.5952, + "grad_norm": 0.3969564770900358, + "learning_rate": 7.428560370317542e-05, + "loss": 0.8019, + "step": 186 + }, + { + "epoch": 0.5984, + "grad_norm": 0.31388734051820155, + "learning_rate": 7.328173148454151e-05, + "loss": 0.7345, + "step": 187 + }, + { + "epoch": 0.6016, + "grad_norm": 0.3741797502750705, + "learning_rate": 7.228075054658096e-05, + "loss": 0.7549, + "step": 188 + }, + { + "epoch": 0.6048, + "grad_norm": 0.25861948653068545, + "learning_rate": 7.1282769209069e-05, + "loss": 0.695, + "step": 189 + }, + { + "epoch": 0.608, + "grad_norm": 0.2857777460426956, + "learning_rate": 7.028789546718326e-05, + "loss": 0.6682, + "step": 190 + }, + { + "epoch": 0.6112, + "grad_norm": 0.35310800733438846, + "learning_rate": 6.929623697981718e-05, + "loss": 0.7848, + "step": 191 + }, + { + "epoch": 0.6144, + "grad_norm": 0.34819520216697436, + "learning_rate": 6.830790105792973e-05, + "loss": 0.6982, + "step": 192 + }, + { + "epoch": 0.6176, + "grad_norm": 0.32569889775815364, + "learning_rate": 6.732299465293322e-05, + "loss": 0.7654, + "step": 193 + }, + { + "epoch": 0.6208, + "grad_norm": 0.2945547780260103, + "learning_rate": 6.63416243451194e-05, + "loss": 0.7129, + "step": 194 + }, + { + "epoch": 0.624, + "grad_norm": 0.31955365316009615, + "learning_rate": 6.536389633212609e-05, + "loss": 0.8102, + "step": 195 + }, + { + "epoch": 0.6272, + "grad_norm": 0.2761627197268354, + "learning_rate": 6.43899164174453e-05, + "loss": 0.7524, + "step": 196 + }, + { + "epoch": 0.6304, + "grad_norm": 0.3480882847368592, + "learning_rate": 6.341978999897365e-05, + "loss": 0.8154, + "step": 197 + }, + { + "epoch": 0.6336, + "grad_norm": 0.35497213312331755, + "learning_rate": 6.245362205760704e-05, + "loss": 0.8084, + "step": 198 + }, + { + "epoch": 0.6368, + "grad_norm": 0.29073504801842387, + "learning_rate": 6.149151714588009e-05, + "loss": 0.7297, + "step": 199 + }, + { + "epoch": 0.64, + "grad_norm": 0.3006796917539789, + "learning_rate": 6.053357937665237e-05, + "loss": 0.6998, + "step": 200 + }, + { + "epoch": 0.6432, + "grad_norm": 0.31029757927940815, + "learning_rate": 5.957991241184184e-05, + "loss": 0.715, + "step": 201 + }, + { + "epoch": 0.6464, + "grad_norm": 0.3162419517676427, + "learning_rate": 5.863061945120719e-05, + "loss": 0.712, + "step": 202 + }, + { + "epoch": 0.6496, + "grad_norm": 0.31445541520912884, + "learning_rate": 5.768580322118034e-05, + "loss": 0.7179, + "step": 203 + }, + { + "epoch": 0.6528, + "grad_norm": 0.3572593415712166, + "learning_rate": 5.6745565963749925e-05, + "loss": 0.7542, + "step": 204 + }, + { + "epoch": 0.656, + "grad_norm": 0.28509856829528524, + "learning_rate": 5.5810009425397294e-05, + "loss": 0.7056, + "step": 205 + }, + { + "epoch": 0.6592, + "grad_norm": 0.340773802794312, + "learning_rate": 5.487923484608629e-05, + "loss": 0.7473, + "step": 206 + }, + { + "epoch": 0.6624, + "grad_norm": 0.3427328039402282, + "learning_rate": 5.395334294830765e-05, + "loss": 0.7442, + "step": 207 + }, + { + "epoch": 0.6656, + "grad_norm": 0.29642040414266996, + "learning_rate": 5.3032433926179395e-05, + "loss": 0.7347, + "step": 208 + }, + { + "epoch": 0.6688, + "grad_norm": 0.2969940936232079, + "learning_rate": 5.211660743460458e-05, + "loss": 0.7017, + "step": 209 + }, + { + "epoch": 0.672, + "grad_norm": 0.3065499410397904, + "learning_rate": 5.1205962578487155e-05, + "loss": 0.718, + "step": 210 + }, + { + "epoch": 0.6752, + "grad_norm": 0.3225482706679557, + "learning_rate": 5.030059790200756e-05, + "loss": 0.707, + "step": 211 + }, + { + "epoch": 0.6784, + "grad_norm": 0.3117434011223896, + "learning_rate": 4.940061137795876e-05, + "loss": 0.7452, + "step": 212 + }, + { + "epoch": 0.6816, + "grad_norm": 0.3388025296915813, + "learning_rate": 4.850610039714444e-05, + "loss": 0.7173, + "step": 213 + }, + { + "epoch": 0.6848, + "grad_norm": 0.33139733316731435, + "learning_rate": 4.761716175783989e-05, + "loss": 0.7571, + "step": 214 + }, + { + "epoch": 0.688, + "grad_norm": 1.039675293634295, + "learning_rate": 4.673389165531714e-05, + "loss": 0.742, + "step": 215 + }, + { + "epoch": 0.6912, + "grad_norm": 0.3358206672686245, + "learning_rate": 4.585638567143529e-05, + "loss": 0.7764, + "step": 216 + }, + { + "epoch": 0.6944, + "grad_norm": 0.30238318926136615, + "learning_rate": 4.498473876429726e-05, + "loss": 0.6912, + "step": 217 + }, + { + "epoch": 0.6976, + "grad_norm": 0.4638805485158481, + "learning_rate": 4.411904525797408e-05, + "loss": 0.7157, + "step": 218 + }, + { + "epoch": 0.7008, + "grad_norm": 0.385267310334133, + "learning_rate": 4.325939883229766e-05, + "loss": 0.7601, + "step": 219 + }, + { + "epoch": 0.704, + "grad_norm": 0.313400211944292, + "learning_rate": 4.240589251272342e-05, + "loss": 0.7585, + "step": 220 + }, + { + "epoch": 0.7072, + "grad_norm": 0.2893788658587654, + "learning_rate": 4.155861866026364e-05, + "loss": 0.7226, + "step": 221 + }, + { + "epoch": 0.7104, + "grad_norm": 0.3166132206984673, + "learning_rate": 4.071766896149273e-05, + "loss": 0.7063, + "step": 222 + }, + { + "epoch": 0.7136, + "grad_norm": 0.30630399306954587, + "learning_rate": 3.988313441862553e-05, + "loss": 0.6999, + "step": 223 + }, + { + "epoch": 0.7168, + "grad_norm": 0.2854119528504984, + "learning_rate": 3.9055105339669595e-05, + "loss": 0.7085, + "step": 224 + }, + { + "epoch": 0.72, + "grad_norm": 0.30178369626580587, + "learning_rate": 3.823367132865265e-05, + "loss": 0.7154, + "step": 225 + }, + { + "epoch": 0.7232, + "grad_norm": 0.27405217029411794, + "learning_rate": 3.741892127592625e-05, + "loss": 0.6868, + "step": 226 + }, + { + "epoch": 0.7264, + "grad_norm": 0.3429699375170305, + "learning_rate": 3.6610943348546526e-05, + "loss": 0.7437, + "step": 227 + }, + { + "epoch": 0.7296, + "grad_norm": 0.3896398683462038, + "learning_rate": 3.580982498073344e-05, + "loss": 0.7372, + "step": 228 + }, + { + "epoch": 0.7328, + "grad_norm": 0.31882873277957113, + "learning_rate": 3.501565286440914e-05, + "loss": 0.6966, + "step": 229 + }, + { + "epoch": 0.736, + "grad_norm": 0.30475473837546774, + "learning_rate": 3.422851293981676e-05, + "loss": 0.7325, + "step": 230 + }, + { + "epoch": 0.7392, + "grad_norm": 0.3054589522030496, + "learning_rate": 3.3448490386220355e-05, + "loss": 0.7201, + "step": 231 + }, + { + "epoch": 0.7424, + "grad_norm": 0.3073347601274914, + "learning_rate": 3.2675669612687565e-05, + "loss": 0.7077, + "step": 232 + }, + { + "epoch": 0.7456, + "grad_norm": 0.39566012301677733, + "learning_rate": 3.191013424895536e-05, + "loss": 0.7389, + "step": 233 + }, + { + "epoch": 0.7488, + "grad_norm": 0.28697664729194006, + "learning_rate": 3.115196713638e-05, + "loss": 0.7011, + "step": 234 + }, + { + "epoch": 0.752, + "grad_norm": 0.3943475964025582, + "learning_rate": 3.040125031897264e-05, + "loss": 0.7491, + "step": 235 + }, + { + "epoch": 0.7552, + "grad_norm": 0.33464501189479606, + "learning_rate": 2.9658065034520978e-05, + "loss": 0.74, + "step": 236 + }, + { + "epoch": 0.7584, + "grad_norm": 0.30655764106334443, + "learning_rate": 2.892249170579826e-05, + "loss": 0.6499, + "step": 237 + }, + { + "epoch": 0.7616, + "grad_norm": 0.3390622200008982, + "learning_rate": 2.8194609931860316e-05, + "loss": 0.7135, + "step": 238 + }, + { + "epoch": 0.7648, + "grad_norm": 0.32370061204685463, + "learning_rate": 2.7474498479432087e-05, + "loss": 0.6916, + "step": 239 + }, + { + "epoch": 0.768, + "grad_norm": 0.3102019687302034, + "learning_rate": 2.6762235274383772e-05, + "loss": 0.6939, + "step": 240 + }, + { + "epoch": 0.7712, + "grad_norm": 0.3650056108389055, + "learning_rate": 2.6057897393298324e-05, + "loss": 0.7155, + "step": 241 + }, + { + "epoch": 0.7744, + "grad_norm": 0.34275722823771554, + "learning_rate": 2.536156105513062e-05, + "loss": 0.7095, + "step": 242 + }, + { + "epoch": 0.7776, + "grad_norm": 0.2867086174953593, + "learning_rate": 2.4673301612959654e-05, + "loss": 0.6926, + "step": 243 + }, + { + "epoch": 0.7808, + "grad_norm": 0.3751058033742623, + "learning_rate": 2.399319354583418e-05, + "loss": 0.6866, + "step": 244 + }, + { + "epoch": 0.784, + "grad_norm": 0.32868973447271876, + "learning_rate": 2.3321310450713062e-05, + "loss": 0.7616, + "step": 245 + }, + { + "epoch": 0.7872, + "grad_norm": 0.32031865758716654, + "learning_rate": 2.265772503450122e-05, + "loss": 0.7433, + "step": 246 + }, + { + "epoch": 0.7904, + "grad_norm": 0.28430902291291127, + "learning_rate": 2.2002509106181624e-05, + "loss": 0.6928, + "step": 247 + }, + { + "epoch": 0.7936, + "grad_norm": 0.30244143016278335, + "learning_rate": 2.1355733569044635e-05, + "loss": 0.703, + "step": 248 + }, + { + "epoch": 0.7968, + "grad_norm": 0.35639534135164314, + "learning_rate": 2.0717468413015283e-05, + "loss": 0.6909, + "step": 249 + }, + { + "epoch": 0.8, + "grad_norm": 0.3192293375780227, + "learning_rate": 2.008778270707944e-05, + "loss": 0.7211, + "step": 250 + }, + { + "epoch": 0.8032, + "grad_norm": 0.3498635928667987, + "learning_rate": 1.946674459180955e-05, + "loss": 0.7929, + "step": 251 + }, + { + "epoch": 0.8064, + "grad_norm": 0.4134395803571079, + "learning_rate": 1.8854421271990964e-05, + "loss": 0.7873, + "step": 252 + }, + { + "epoch": 0.8096, + "grad_norm": 0.3450344604974233, + "learning_rate": 1.8250879009349398e-05, + "loss": 0.7368, + "step": 253 + }, + { + "epoch": 0.8128, + "grad_norm": 0.2969607355615406, + "learning_rate": 1.7656183115380577e-05, + "loss": 0.6761, + "step": 254 + }, + { + "epoch": 0.816, + "grad_norm": 0.2891440442732942, + "learning_rate": 1.707039794428259e-05, + "loss": 0.7284, + "step": 255 + }, + { + "epoch": 0.8192, + "grad_norm": 0.30147698742460943, + "learning_rate": 1.649358688599191e-05, + "loss": 0.6855, + "step": 256 + }, + { + "epoch": 0.8224, + "grad_norm": 0.29579967648966576, + "learning_rate": 1.5925812359323745e-05, + "loss": 0.6673, + "step": 257 + }, + { + "epoch": 0.8256, + "grad_norm": 0.29733957118580456, + "learning_rate": 1.5367135805217458e-05, + "loss": 0.6737, + "step": 258 + }, + { + "epoch": 0.8288, + "grad_norm": 0.3218007344725993, + "learning_rate": 1.4817617680087825e-05, + "loss": 0.7439, + "step": 259 + }, + { + "epoch": 0.832, + "grad_norm": 0.3491367262180285, + "learning_rate": 1.4277317449282834e-05, + "loss": 0.7418, + "step": 260 + }, + { + "epoch": 0.8352, + "grad_norm": 0.3213424044123325, + "learning_rate": 1.3746293580648717e-05, + "loss": 0.7144, + "step": 261 + }, + { + "epoch": 0.8384, + "grad_norm": 0.35003192439763947, + "learning_rate": 1.3224603538202929e-05, + "loss": 0.7077, + "step": 262 + }, + { + "epoch": 0.8416, + "grad_norm": 0.2949878045047657, + "learning_rate": 1.2712303775915802e-05, + "loss": 0.672, + "step": 263 + }, + { + "epoch": 0.8448, + "grad_norm": 0.30356393661543046, + "learning_rate": 1.220944973160133e-05, + "loss": 0.7541, + "step": 264 + }, + { + "epoch": 0.848, + "grad_norm": 0.33072054793836925, + "learning_rate": 1.1716095820918216e-05, + "loss": 0.6849, + "step": 265 + }, + { + "epoch": 0.8512, + "grad_norm": 0.3268536790375609, + "learning_rate": 1.1232295431481222e-05, + "loss": 0.7249, + "step": 266 + }, + { + "epoch": 0.8544, + "grad_norm": 0.30520956028690377, + "learning_rate": 1.0758100917083991e-05, + "loss": 0.7456, + "step": 267 + }, + { + "epoch": 0.8576, + "grad_norm": 0.3357206914705883, + "learning_rate": 1.0293563592033595e-05, + "loss": 0.7142, + "step": 268 + }, + { + "epoch": 0.8608, + "grad_norm": 0.5448399393246839, + "learning_rate": 9.838733725597615e-06, + "loss": 0.7308, + "step": 269 + }, + { + "epoch": 0.864, + "grad_norm": 0.28860683769411816, + "learning_rate": 9.393660536564408e-06, + "loss": 0.6767, + "step": 270 + }, + { + "epoch": 0.8672, + "grad_norm": 0.3100248803449033, + "learning_rate": 8.958392187916841e-06, + "loss": 0.6999, + "step": 271 + }, + { + "epoch": 0.8704, + "grad_norm": 0.3327885931301477, + "learning_rate": 8.532975781620512e-06, + "loss": 0.6844, + "step": 272 + }, + { + "epoch": 0.8736, + "grad_norm": 0.3138077626194803, + "learning_rate": 8.117457353526625e-06, + "loss": 0.7036, + "step": 273 + }, + { + "epoch": 0.8768, + "grad_norm": 0.3243992230675338, + "learning_rate": 7.711881868390291e-06, + "loss": 0.7094, + "step": 274 + }, + { + "epoch": 0.88, + "grad_norm": 0.32531144597392186, + "learning_rate": 7.3162932150046885e-06, + "loss": 0.7082, + "step": 275 + }, + { + "epoch": 0.8832, + "grad_norm": 0.2924185207467151, + "learning_rate": 6.930734201451816e-06, + "loss": 0.6355, + "step": 276 + }, + { + "epoch": 0.8864, + "grad_norm": 0.29206907180124286, + "learning_rate": 6.555246550469907e-06, + "loss": 0.6772, + "step": 277 + }, + { + "epoch": 0.8896, + "grad_norm": 0.2973521290026205, + "learning_rate": 6.189870894938587e-06, + "loss": 0.7106, + "step": 278 + }, + { + "epoch": 0.8928, + "grad_norm": 0.3145736899748929, + "learning_rate": 5.834646773481811e-06, + "loss": 0.7677, + "step": 279 + }, + { + "epoch": 0.896, + "grad_norm": 0.3157293519172516, + "learning_rate": 5.489612626189245e-06, + "loss": 0.6904, + "step": 280 + }, + { + "epoch": 0.8992, + "grad_norm": 0.2870726912108204, + "learning_rate": 5.154805790456485e-06, + "loss": 0.6586, + "step": 281 + }, + { + "epoch": 0.9024, + "grad_norm": 0.30906828128304625, + "learning_rate": 4.830262496944693e-06, + "loss": 0.7286, + "step": 282 + }, + { + "epoch": 0.9056, + "grad_norm": 0.3182529993154354, + "learning_rate": 4.516017865659949e-06, + "loss": 0.6741, + "step": 283 + }, + { + "epoch": 0.9088, + "grad_norm": 0.32179537551574167, + "learning_rate": 4.21210590215273e-06, + "loss": 0.7388, + "step": 284 + }, + { + "epoch": 0.912, + "grad_norm": 0.2810823397409509, + "learning_rate": 3.918559493838114e-06, + "loss": 0.7148, + "step": 285 + }, + { + "epoch": 0.9152, + "grad_norm": 0.32333801256970945, + "learning_rate": 3.6354104064368566e-06, + "loss": 0.6885, + "step": 286 + }, + { + "epoch": 0.9184, + "grad_norm": 0.33254138632936137, + "learning_rate": 3.3626892805379562e-06, + "loss": 0.6545, + "step": 287 + }, + { + "epoch": 0.9216, + "grad_norm": 0.3343083470572911, + "learning_rate": 3.100425628282899e-06, + "loss": 0.6984, + "step": 288 + }, + { + "epoch": 0.9248, + "grad_norm": 0.3194235142724928, + "learning_rate": 2.848647830172024e-06, + "loss": 0.708, + "step": 289 + }, + { + "epoch": 0.928, + "grad_norm": 0.32418757353426036, + "learning_rate": 2.607383131993424e-06, + "loss": 0.7122, + "step": 290 + }, + { + "epoch": 0.9312, + "grad_norm": 0.33992621355635866, + "learning_rate": 2.3766576418745022e-06, + "loss": 0.6996, + "step": 291 + }, + { + "epoch": 0.9344, + "grad_norm": 0.3226877235528863, + "learning_rate": 2.1564963274568027e-06, + "loss": 0.6956, + "step": 292 + }, + { + "epoch": 0.9376, + "grad_norm": 0.3171383430893597, + "learning_rate": 1.9469230131940907e-06, + "loss": 0.6694, + "step": 293 + }, + { + "epoch": 0.9408, + "grad_norm": 0.290782512230605, + "learning_rate": 1.7479603777742938e-06, + "loss": 0.7004, + "step": 294 + }, + { + "epoch": 0.944, + "grad_norm": 0.3263108357933877, + "learning_rate": 1.559629951665298e-06, + "loss": 0.7165, + "step": 295 + }, + { + "epoch": 0.9472, + "grad_norm": 0.3634629078675466, + "learning_rate": 1.3819521147851123e-06, + "loss": 0.7765, + "step": 296 + }, + { + "epoch": 0.9504, + "grad_norm": 0.3377377234300096, + "learning_rate": 1.2149460942964098e-06, + "loss": 0.6796, + "step": 297 + }, + { + "epoch": 0.9536, + "grad_norm": 0.3065727226496677, + "learning_rate": 1.05862996252597e-06, + "loss": 0.7016, + "step": 298 + }, + { + "epoch": 0.9568, + "grad_norm": 0.31834091430652933, + "learning_rate": 9.130206350089765e-07, + "loss": 0.6788, + "step": 299 + }, + { + "epoch": 0.96, + "grad_norm": 0.3292102282324205, + "learning_rate": 7.781338686584927e-07, + "loss": 0.7016, + "step": 300 + }, + { + "epoch": 0.9632, + "grad_norm": 0.31793484496480684, + "learning_rate": 6.539842600603918e-07, + "loss": 0.6436, + "step": 301 + }, + { + "epoch": 0.9664, + "grad_norm": 0.31122498533982473, + "learning_rate": 5.405852438937764e-07, + "loss": 0.7317, + "step": 302 + }, + { + "epoch": 0.9696, + "grad_norm": 0.38086901286840796, + "learning_rate": 4.3794909147720773e-07, + "loss": 0.7416, + "step": 303 + }, + { + "epoch": 0.9728, + "grad_norm": 0.29772767023521746, + "learning_rate": 3.4608690944071263e-07, + "loss": 0.6916, + "step": 304 + }, + { + "epoch": 0.976, + "grad_norm": 0.3532109944946722, + "learning_rate": 2.6500863852395584e-07, + "loss": 0.7684, + "step": 305 + }, + { + "epoch": 0.9792, + "grad_norm": 0.27767796371449827, + "learning_rate": 1.947230525005006e-07, + "loss": 0.701, + "step": 306 + }, + { + "epoch": 0.9824, + "grad_norm": 0.32597557938540084, + "learning_rate": 1.3523775722834587e-07, + "loss": 0.7163, + "step": 307 + }, + { + "epoch": 0.9856, + "grad_norm": 0.27544315888032217, + "learning_rate": 8.655918982689581e-08, + "loss": 0.6651, + "step": 308 + }, + { + "epoch": 0.9888, + "grad_norm": 0.3697743104755361, + "learning_rate": 4.8692617980350406e-08, + "loss": 0.7284, + "step": 309 + }, + { + "epoch": 0.992, + "grad_norm": 0.2806673046158917, + "learning_rate": 2.164213936770576e-08, + "loss": 0.6828, + "step": 310 + }, + { + "epoch": 0.9952, + "grad_norm": 0.30407061552813863, + "learning_rate": 5.410681219286673e-09, + "loss": 0.6736, + "step": 311 + }, + { + "epoch": 0.9984, + "grad_norm": 0.3850573291540645, + "learning_rate": 0.0, + "loss": 0.7934, + "step": 312 + }, + { + "epoch": 0.9984, + "step": 312, + "total_flos": 786873517342720.0, + "train_loss": 0.7706633139497194, + "train_runtime": 9555.5288, + "train_samples_per_second": 1.047, + "train_steps_per_second": 0.033 + } + ], + "logging_steps": 1.0, + "max_steps": 312, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 786873517342720.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_1_epochs_1_GA_2_lora/README.md b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_1_epochs_1_GA_2_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_1_epochs_1_GA_2_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_1_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_1_epochs_1_GA_2_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..350597a9a3f2f55f635f1b6a22363f469e3114c8 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_1_epochs_1_GA_2_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "up_proj", + "k_proj", + "down_proj", + "v_proj", + "q_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e9baa578b5762262fbf12a880ab00f06136c0034 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0756bf3eef72821dbb19ca341b1fc1f85af345bfdb07bae896a6839f50499c53 +size 671150064 diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_1_epochs_1_GA_2_lora/config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_1_epochs_1_GA_2_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_1_epochs_1_GA_2_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..f52eae82bd669af732ef02c0e91f92d85a94a515 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb1c893384f7ee2d85fa351f4f4003c8e1569154f490c2023da29d9b714ba839 +size 918507402 diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_1_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_1_epochs_1_GA_2_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..9dae2d6fe729f0b36168a515787908873b7fd4ec --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_1_epochs_1_GA_2_lora/trainer_state.json @@ -0,0 +1,8792 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 1250, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0008, + "grad_norm": 0.9738718905282882, + "learning_rate": 5.263157894736842e-06, + "loss": 1.3776, + "step": 1 + }, + { + "epoch": 0.0016, + "grad_norm": 0.9955982617174386, + "learning_rate": 1.0526315789473684e-05, + "loss": 1.3593, + "step": 2 + }, + { + "epoch": 0.0024, + "grad_norm": 1.0292629278624237, + "learning_rate": 1.5789473684210526e-05, + "loss": 1.4638, + "step": 3 + }, + { + "epoch": 0.0032, + "grad_norm": 0.8691424278818122, + "learning_rate": 2.105263157894737e-05, + "loss": 1.2431, + "step": 4 + }, + { + "epoch": 0.004, + "grad_norm": 0.9261316600857898, + "learning_rate": 2.6315789473684212e-05, + "loss": 1.3732, + "step": 5 + }, + { + "epoch": 0.0048, + "grad_norm": 0.9065393240134191, + "learning_rate": 3.157894736842105e-05, + "loss": 1.3584, + "step": 6 + }, + { + "epoch": 0.0056, + "grad_norm": 0.7144291097146667, + "learning_rate": 3.6842105263157895e-05, + "loss": 1.0929, + "step": 7 + }, + { + "epoch": 0.0064, + "grad_norm": 0.9158998506068002, + "learning_rate": 4.210526315789474e-05, + "loss": 1.1723, + "step": 8 + }, + { + "epoch": 0.0072, + "grad_norm": 0.777438611058757, + "learning_rate": 4.736842105263158e-05, + "loss": 1.0773, + "step": 9 + }, + { + "epoch": 0.008, + "grad_norm": 1.314136422443474, + "learning_rate": 5.2631578947368424e-05, + "loss": 1.1588, + "step": 10 + }, + { + "epoch": 0.0088, + "grad_norm": 0.8692683248594849, + "learning_rate": 5.789473684210527e-05, + "loss": 1.0121, + "step": 11 + }, + { + "epoch": 0.0096, + "grad_norm": 0.7621402947276783, + "learning_rate": 6.31578947368421e-05, + "loss": 0.9942, + "step": 12 + }, + { + "epoch": 0.0104, + "grad_norm": 0.8434274430077191, + "learning_rate": 6.842105263157895e-05, + "loss": 1.0715, + "step": 13 + }, + { + "epoch": 0.0112, + "grad_norm": 0.7245209141052413, + "learning_rate": 7.368421052631579e-05, + "loss": 0.9671, + "step": 14 + }, + { + "epoch": 0.012, + "grad_norm": 0.8256935204711093, + "learning_rate": 7.894736842105263e-05, + "loss": 0.9891, + "step": 15 + }, + { + "epoch": 0.0128, + "grad_norm": 0.6234866086859933, + "learning_rate": 8.421052631578948e-05, + "loss": 0.9121, + "step": 16 + }, + { + "epoch": 0.0136, + "grad_norm": 0.6670638916913187, + "learning_rate": 8.947368421052632e-05, + "loss": 0.9312, + "step": 17 + }, + { + "epoch": 0.0144, + "grad_norm": 0.5685270034758133, + "learning_rate": 9.473684210526316e-05, + "loss": 0.8739, + "step": 18 + }, + { + "epoch": 0.0152, + "grad_norm": 0.5662279425727015, + "learning_rate": 0.0001, + "loss": 0.9169, + "step": 19 + }, + { + "epoch": 0.016, + "grad_norm": 0.5321164781312352, + "learning_rate": 0.00010526315789473685, + "loss": 0.8862, + "step": 20 + }, + { + "epoch": 0.0168, + "grad_norm": 0.6452928193096523, + "learning_rate": 0.0001105263157894737, + "loss": 0.939, + "step": 21 + }, + { + "epoch": 0.0176, + "grad_norm": 0.5889563887372142, + "learning_rate": 0.00011578947368421053, + "loss": 0.9265, + "step": 22 + }, + { + "epoch": 0.0184, + "grad_norm": 0.5305268661758419, + "learning_rate": 0.00012105263157894738, + "loss": 0.8958, + "step": 23 + }, + { + "epoch": 0.0192, + "grad_norm": 0.4374645727689325, + "learning_rate": 0.0001263157894736842, + "loss": 0.8491, + "step": 24 + }, + { + "epoch": 0.02, + "grad_norm": 0.6265051901301006, + "learning_rate": 0.00013157894736842108, + "loss": 0.8892, + "step": 25 + }, + { + "epoch": 0.0208, + "grad_norm": 0.544618577492362, + "learning_rate": 0.0001368421052631579, + "loss": 0.8929, + "step": 26 + }, + { + "epoch": 0.0216, + "grad_norm": 0.5020223053534353, + "learning_rate": 0.00014210526315789474, + "loss": 0.9309, + "step": 27 + }, + { + "epoch": 0.0224, + "grad_norm": 0.6790382740530038, + "learning_rate": 0.00014736842105263158, + "loss": 0.9323, + "step": 28 + }, + { + "epoch": 0.0232, + "grad_norm": 0.6384546114121064, + "learning_rate": 0.00015263157894736845, + "loss": 1.0031, + "step": 29 + }, + { + "epoch": 0.024, + "grad_norm": 0.5551995120063944, + "learning_rate": 0.00015789473684210527, + "loss": 0.8097, + "step": 30 + }, + { + "epoch": 0.0248, + "grad_norm": 0.5615200663677613, + "learning_rate": 0.0001631578947368421, + "loss": 0.8294, + "step": 31 + }, + { + "epoch": 0.0256, + "grad_norm": 0.47874175026665533, + "learning_rate": 0.00016842105263157895, + "loss": 0.9338, + "step": 32 + }, + { + "epoch": 0.0264, + "grad_norm": 0.5645765986458627, + "learning_rate": 0.0001736842105263158, + "loss": 0.8836, + "step": 33 + }, + { + "epoch": 0.0272, + "grad_norm": 0.49163986066497906, + "learning_rate": 0.00017894736842105264, + "loss": 0.8248, + "step": 34 + }, + { + "epoch": 0.028, + "grad_norm": 0.5207241540650344, + "learning_rate": 0.00018421052631578948, + "loss": 0.8682, + "step": 35 + }, + { + "epoch": 0.0288, + "grad_norm": 0.5186393720573987, + "learning_rate": 0.00018947368421052632, + "loss": 0.9449, + "step": 36 + }, + { + "epoch": 0.0296, + "grad_norm": 0.4900931272240655, + "learning_rate": 0.00019473684210526317, + "loss": 0.8828, + "step": 37 + }, + { + "epoch": 0.0304, + "grad_norm": 0.4816533288557279, + "learning_rate": 0.0002, + "loss": 0.8889, + "step": 38 + }, + { + "epoch": 0.0312, + "grad_norm": 0.5158896150522022, + "learning_rate": 0.00019999966405802826, + "loss": 0.8074, + "step": 39 + }, + { + "epoch": 0.032, + "grad_norm": 0.5850365403221622, + "learning_rate": 0.00019999865623437013, + "loss": 0.8836, + "step": 40 + }, + { + "epoch": 0.0328, + "grad_norm": 0.4701167474534878, + "learning_rate": 0.00019999697653579705, + "loss": 0.904, + "step": 41 + }, + { + "epoch": 0.0336, + "grad_norm": 0.5549098311815541, + "learning_rate": 0.00019999462497359466, + "loss": 0.8947, + "step": 42 + }, + { + "epoch": 0.0344, + "grad_norm": 0.4648186239868878, + "learning_rate": 0.0001999916015635627, + "loss": 0.7969, + "step": 43 + }, + { + "epoch": 0.0352, + "grad_norm": 0.6391364745948533, + "learning_rate": 0.00019998790632601496, + "loss": 0.8843, + "step": 44 + }, + { + "epoch": 0.036, + "grad_norm": 0.5076423625352378, + "learning_rate": 0.00019998353928577919, + "loss": 0.8081, + "step": 45 + }, + { + "epoch": 0.0368, + "grad_norm": 0.49126687423074844, + "learning_rate": 0.0001999785004721968, + "loss": 0.8972, + "step": 46 + }, + { + "epoch": 0.0376, + "grad_norm": 0.5017726655140315, + "learning_rate": 0.0001999727899191228, + "loss": 0.8712, + "step": 47 + }, + { + "epoch": 0.0384, + "grad_norm": 0.549440628986534, + "learning_rate": 0.00019996640766492543, + "loss": 0.9312, + "step": 48 + }, + { + "epoch": 0.0392, + "grad_norm": 0.5355737096496868, + "learning_rate": 0.00019995935375248606, + "loss": 0.878, + "step": 49 + }, + { + "epoch": 0.04, + "grad_norm": 0.44999051862035433, + "learning_rate": 0.00019995162822919883, + "loss": 0.8084, + "step": 50 + }, + { + "epoch": 0.0408, + "grad_norm": 0.4900749507095087, + "learning_rate": 0.00019994323114697022, + "loss": 0.8316, + "step": 51 + }, + { + "epoch": 0.0416, + "grad_norm": 0.5117994847013144, + "learning_rate": 0.00019993416256221895, + "loss": 0.8608, + "step": 52 + }, + { + "epoch": 0.0424, + "grad_norm": 0.5793930106305719, + "learning_rate": 0.0001999244225358753, + "loss": 0.8786, + "step": 53 + }, + { + "epoch": 0.0432, + "grad_norm": 0.6203928016888752, + "learning_rate": 0.00019991401113338104, + "loss": 0.9021, + "step": 54 + }, + { + "epoch": 0.044, + "grad_norm": 0.4395642052217729, + "learning_rate": 0.00019990292842468868, + "loss": 0.804, + "step": 55 + }, + { + "epoch": 0.0448, + "grad_norm": 0.6327858480994747, + "learning_rate": 0.00019989117448426108, + "loss": 0.9353, + "step": 56 + }, + { + "epoch": 0.0456, + "grad_norm": 0.43822231696617303, + "learning_rate": 0.0001998787493910712, + "loss": 0.7973, + "step": 57 + }, + { + "epoch": 0.0464, + "grad_norm": 0.5686770926878106, + "learning_rate": 0.00019986565322860115, + "loss": 0.7992, + "step": 58 + }, + { + "epoch": 0.0472, + "grad_norm": 0.49620141131497186, + "learning_rate": 0.000199851886084842, + "loss": 0.8901, + "step": 59 + }, + { + "epoch": 0.048, + "grad_norm": 0.5336489874875604, + "learning_rate": 0.00019983744805229296, + "loss": 0.9136, + "step": 60 + }, + { + "epoch": 0.0488, + "grad_norm": 0.46455062000355124, + "learning_rate": 0.00019982233922796085, + "loss": 0.8072, + "step": 61 + }, + { + "epoch": 0.0496, + "grad_norm": 0.5169487116900677, + "learning_rate": 0.00019980655971335945, + "loss": 0.8492, + "step": 62 + }, + { + "epoch": 0.0504, + "grad_norm": 0.5561981975277991, + "learning_rate": 0.00019979010961450878, + "loss": 0.9055, + "step": 63 + }, + { + "epoch": 0.0512, + "grad_norm": 0.4812674310307808, + "learning_rate": 0.00019977298904193437, + "loss": 0.8738, + "step": 64 + }, + { + "epoch": 0.052, + "grad_norm": 0.5215732343292463, + "learning_rate": 0.00019975519811066663, + "loss": 0.8656, + "step": 65 + }, + { + "epoch": 0.0528, + "grad_norm": 0.5148244034689009, + "learning_rate": 0.00019973673694024, + "loss": 0.8197, + "step": 66 + }, + { + "epoch": 0.0536, + "grad_norm": 0.4442934586215466, + "learning_rate": 0.0001997176056546921, + "loss": 0.7518, + "step": 67 + }, + { + "epoch": 0.0544, + "grad_norm": 0.6135294026474913, + "learning_rate": 0.00019969780438256293, + "loss": 0.9257, + "step": 68 + }, + { + "epoch": 0.0552, + "grad_norm": 0.4947662845524055, + "learning_rate": 0.0001996773332568941, + "loss": 0.8183, + "step": 69 + }, + { + "epoch": 0.056, + "grad_norm": 0.429456280197822, + "learning_rate": 0.0001996561924152278, + "loss": 0.7379, + "step": 70 + }, + { + "epoch": 0.0568, + "grad_norm": 0.5542940527933353, + "learning_rate": 0.00019963438199960599, + "loss": 0.8827, + "step": 71 + }, + { + "epoch": 0.0576, + "grad_norm": 0.5532480090464903, + "learning_rate": 0.0001996119021565693, + "loss": 0.8782, + "step": 72 + }, + { + "epoch": 0.0584, + "grad_norm": 0.4697101553880468, + "learning_rate": 0.00019958875303715615, + "loss": 0.8361, + "step": 73 + }, + { + "epoch": 0.0592, + "grad_norm": 0.4868559323245861, + "learning_rate": 0.0001995649347969019, + "loss": 0.827, + "step": 74 + }, + { + "epoch": 0.06, + "grad_norm": 0.596419404041506, + "learning_rate": 0.0001995404475958373, + "loss": 0.9036, + "step": 75 + }, + { + "epoch": 0.0608, + "grad_norm": 0.4534360086341495, + "learning_rate": 0.00019951529159848805, + "loss": 0.7968, + "step": 76 + }, + { + "epoch": 0.0616, + "grad_norm": 0.45360606459520647, + "learning_rate": 0.0001994894669738732, + "loss": 0.734, + "step": 77 + }, + { + "epoch": 0.0624, + "grad_norm": 0.4719919379702672, + "learning_rate": 0.00019946297389550433, + "loss": 0.8371, + "step": 78 + }, + { + "epoch": 0.0632, + "grad_norm": 0.5251165433982562, + "learning_rate": 0.0001994358125413841, + "loss": 0.8563, + "step": 79 + }, + { + "epoch": 0.064, + "grad_norm": 0.46847022350052253, + "learning_rate": 0.00019940798309400526, + "loss": 0.8448, + "step": 80 + }, + { + "epoch": 0.0648, + "grad_norm": 0.508156193308429, + "learning_rate": 0.0001993794857403495, + "loss": 0.8555, + "step": 81 + }, + { + "epoch": 0.0656, + "grad_norm": 0.39701579915605517, + "learning_rate": 0.0001993503206718859, + "loss": 0.7728, + "step": 82 + }, + { + "epoch": 0.0664, + "grad_norm": 0.5398851920389962, + "learning_rate": 0.0001993204880845699, + "loss": 0.8473, + "step": 83 + }, + { + "epoch": 0.0672, + "grad_norm": 0.4746705891110577, + "learning_rate": 0.00019928998817884182, + "loss": 0.7767, + "step": 84 + }, + { + "epoch": 0.068, + "grad_norm": 0.5227545409963591, + "learning_rate": 0.00019925882115962568, + "loss": 0.8389, + "step": 85 + }, + { + "epoch": 0.0688, + "grad_norm": 0.5047015184751248, + "learning_rate": 0.00019922698723632767, + "loss": 0.8786, + "step": 86 + }, + { + "epoch": 0.0696, + "grad_norm": 0.5487063304465213, + "learning_rate": 0.00019919448662283478, + "loss": 0.8583, + "step": 87 + }, + { + "epoch": 0.0704, + "grad_norm": 0.5406478590263915, + "learning_rate": 0.00019916131953751342, + "loss": 0.8345, + "step": 88 + }, + { + "epoch": 0.0712, + "grad_norm": 0.4556906536085241, + "learning_rate": 0.00019912748620320794, + "loss": 0.8587, + "step": 89 + }, + { + "epoch": 0.072, + "grad_norm": 0.46457156727140836, + "learning_rate": 0.00019909298684723904, + "loss": 0.7514, + "step": 90 + }, + { + "epoch": 0.0728, + "grad_norm": 0.4734733161717522, + "learning_rate": 0.00019905782170140238, + "loss": 0.7886, + "step": 91 + }, + { + "epoch": 0.0736, + "grad_norm": 0.46010879443324293, + "learning_rate": 0.00019902199100196697, + "loss": 0.7652, + "step": 92 + }, + { + "epoch": 0.0744, + "grad_norm": 0.4997404000177325, + "learning_rate": 0.00019898549498967343, + "loss": 0.8798, + "step": 93 + }, + { + "epoch": 0.0752, + "grad_norm": 0.4790654404778781, + "learning_rate": 0.00019894833390973266, + "loss": 0.8564, + "step": 94 + }, + { + "epoch": 0.076, + "grad_norm": 0.46309032055654636, + "learning_rate": 0.000198910508011824, + "loss": 0.8327, + "step": 95 + }, + { + "epoch": 0.0768, + "grad_norm": 0.7213308815478991, + "learning_rate": 0.00019887201755009357, + "loss": 0.9652, + "step": 96 + }, + { + "epoch": 0.0776, + "grad_norm": 0.5197579060478195, + "learning_rate": 0.00019883286278315262, + "loss": 0.7527, + "step": 97 + }, + { + "epoch": 0.0784, + "grad_norm": 0.5250384341038005, + "learning_rate": 0.0001987930439740757, + "loss": 0.8537, + "step": 98 + }, + { + "epoch": 0.0792, + "grad_norm": 0.4679433715996057, + "learning_rate": 0.00019875256139039902, + "loss": 0.8266, + "step": 99 + }, + { + "epoch": 0.08, + "grad_norm": 0.519912535189573, + "learning_rate": 0.00019871141530411853, + "loss": 0.8234, + "step": 100 + }, + { + "epoch": 0.0808, + "grad_norm": 0.4760641770357719, + "learning_rate": 0.00019866960599168826, + "loss": 0.8254, + "step": 101 + }, + { + "epoch": 0.0816, + "grad_norm": 0.4603282007871293, + "learning_rate": 0.0001986271337340182, + "loss": 0.8703, + "step": 102 + }, + { + "epoch": 0.0824, + "grad_norm": 0.5473069069521743, + "learning_rate": 0.0001985839988164726, + "loss": 0.8951, + "step": 103 + }, + { + "epoch": 0.0832, + "grad_norm": 0.5206514832514219, + "learning_rate": 0.00019854020152886814, + "loss": 0.8368, + "step": 104 + }, + { + "epoch": 0.084, + "grad_norm": 0.42955480551815406, + "learning_rate": 0.00019849574216547171, + "loss": 0.7217, + "step": 105 + }, + { + "epoch": 0.0848, + "grad_norm": 0.5008905219462, + "learning_rate": 0.0001984506210249986, + "loss": 0.8121, + "step": 106 + }, + { + "epoch": 0.0856, + "grad_norm": 0.47501177433724273, + "learning_rate": 0.00019840483841061058, + "loss": 0.781, + "step": 107 + }, + { + "epoch": 0.0864, + "grad_norm": 0.49681252121417474, + "learning_rate": 0.00019835839462991361, + "loss": 0.818, + "step": 108 + }, + { + "epoch": 0.0872, + "grad_norm": 0.5914157668396626, + "learning_rate": 0.00019831128999495606, + "loss": 0.911, + "step": 109 + }, + { + "epoch": 0.088, + "grad_norm": 0.46936059512369693, + "learning_rate": 0.00019826352482222638, + "loss": 0.8305, + "step": 110 + }, + { + "epoch": 0.0888, + "grad_norm": 0.5234891664109574, + "learning_rate": 0.0001982150994326511, + "loss": 0.8048, + "step": 111 + }, + { + "epoch": 0.0896, + "grad_norm": 0.49901884728932266, + "learning_rate": 0.00019816601415159263, + "loss": 0.7315, + "step": 112 + }, + { + "epoch": 0.0904, + "grad_norm": 0.5235893588604486, + "learning_rate": 0.0001981162693088471, + "loss": 0.8104, + "step": 113 + }, + { + "epoch": 0.0912, + "grad_norm": 0.4953910108921592, + "learning_rate": 0.0001980658652386421, + "loss": 0.8195, + "step": 114 + }, + { + "epoch": 0.092, + "grad_norm": 0.4973017892932648, + "learning_rate": 0.0001980148022796345, + "loss": 0.7827, + "step": 115 + }, + { + "epoch": 0.0928, + "grad_norm": 0.5023532567363218, + "learning_rate": 0.00019796308077490817, + "loss": 0.7809, + "step": 116 + }, + { + "epoch": 0.0936, + "grad_norm": 0.48553556375331025, + "learning_rate": 0.00019791070107197153, + "loss": 0.8013, + "step": 117 + }, + { + "epoch": 0.0944, + "grad_norm": 0.4540201716573498, + "learning_rate": 0.00019785766352275542, + "loss": 0.8449, + "step": 118 + }, + { + "epoch": 0.0952, + "grad_norm": 0.46022660068927584, + "learning_rate": 0.0001978039684836106, + "loss": 0.799, + "step": 119 + }, + { + "epoch": 0.096, + "grad_norm": 0.9659308169721341, + "learning_rate": 0.00019774961631530545, + "loss": 0.9912, + "step": 120 + }, + { + "epoch": 0.0968, + "grad_norm": 0.4884710181225921, + "learning_rate": 0.0001976946073830234, + "loss": 0.7529, + "step": 121 + }, + { + "epoch": 0.0976, + "grad_norm": 0.4690345640057607, + "learning_rate": 0.00019763894205636072, + "loss": 0.8658, + "step": 122 + }, + { + "epoch": 0.0984, + "grad_norm": 0.5420200831110911, + "learning_rate": 0.00019758262070932375, + "loss": 0.9115, + "step": 123 + }, + { + "epoch": 0.0992, + "grad_norm": 0.43988541415262633, + "learning_rate": 0.00019752564372032657, + "loss": 0.787, + "step": 124 + }, + { + "epoch": 0.1, + "grad_norm": 0.49884742778224017, + "learning_rate": 0.00019746801147218842, + "loss": 0.8642, + "step": 125 + }, + { + "epoch": 0.1008, + "grad_norm": 0.7783246401612559, + "learning_rate": 0.00019740972435213115, + "loss": 0.8818, + "step": 126 + }, + { + "epoch": 0.1016, + "grad_norm": 0.46775227339587644, + "learning_rate": 0.00019735078275177654, + "loss": 0.811, + "step": 127 + }, + { + "epoch": 0.1024, + "grad_norm": 0.4463159134719736, + "learning_rate": 0.00019729118706714375, + "loss": 0.7858, + "step": 128 + }, + { + "epoch": 0.1032, + "grad_norm": 0.43890830731604036, + "learning_rate": 0.00019723093769864663, + "loss": 0.7763, + "step": 129 + }, + { + "epoch": 0.104, + "grad_norm": 0.5169698296712399, + "learning_rate": 0.00019717003505109095, + "loss": 0.8259, + "step": 130 + }, + { + "epoch": 0.1048, + "grad_norm": 0.7162596832908007, + "learning_rate": 0.0001971084795336719, + "loss": 0.7395, + "step": 131 + }, + { + "epoch": 0.1056, + "grad_norm": 0.4938875225187638, + "learning_rate": 0.00019704627155997108, + "loss": 0.7824, + "step": 132 + }, + { + "epoch": 0.1064, + "grad_norm": 0.48281860188060755, + "learning_rate": 0.00019698341154795389, + "loss": 0.7633, + "step": 133 + }, + { + "epoch": 0.1072, + "grad_norm": 0.552867873369316, + "learning_rate": 0.00019691989991996663, + "loss": 0.9141, + "step": 134 + }, + { + "epoch": 0.108, + "grad_norm": 0.4702107571476058, + "learning_rate": 0.00019685573710273376, + "loss": 0.782, + "step": 135 + }, + { + "epoch": 0.1088, + "grad_norm": 0.6591282984364222, + "learning_rate": 0.0001967909235273549, + "loss": 0.9173, + "step": 136 + }, + { + "epoch": 0.1096, + "grad_norm": 0.4801391907009083, + "learning_rate": 0.00019672545962930215, + "loss": 0.8243, + "step": 137 + }, + { + "epoch": 0.1104, + "grad_norm": 0.5655266450419159, + "learning_rate": 0.00019665934584841682, + "loss": 0.7905, + "step": 138 + }, + { + "epoch": 0.1112, + "grad_norm": 0.5214781136343931, + "learning_rate": 0.00019659258262890683, + "loss": 0.8653, + "step": 139 + }, + { + "epoch": 0.112, + "grad_norm": 0.49152275049944777, + "learning_rate": 0.00019652517041934356, + "loss": 0.8119, + "step": 140 + }, + { + "epoch": 0.1128, + "grad_norm": 0.521274832597228, + "learning_rate": 0.00019645710967265882, + "loss": 0.7943, + "step": 141 + }, + { + "epoch": 0.1136, + "grad_norm": 0.47112952096647953, + "learning_rate": 0.00019638840084614182, + "loss": 0.7712, + "step": 142 + }, + { + "epoch": 0.1144, + "grad_norm": 0.47792639731028924, + "learning_rate": 0.00019631904440143612, + "loss": 0.8015, + "step": 143 + }, + { + "epoch": 0.1152, + "grad_norm": 0.5031459552571963, + "learning_rate": 0.00019624904080453655, + "loss": 0.7739, + "step": 144 + }, + { + "epoch": 0.116, + "grad_norm": 0.4246137815285982, + "learning_rate": 0.00019617839052578603, + "loss": 0.6983, + "step": 145 + }, + { + "epoch": 0.1168, + "grad_norm": 0.6104344906082924, + "learning_rate": 0.00019610709403987246, + "loss": 0.8464, + "step": 146 + }, + { + "epoch": 0.1176, + "grad_norm": 0.4249619010799741, + "learning_rate": 0.0001960351518258255, + "loss": 0.7615, + "step": 147 + }, + { + "epoch": 0.1184, + "grad_norm": 0.5748627085289674, + "learning_rate": 0.00019596256436701324, + "loss": 0.7686, + "step": 148 + }, + { + "epoch": 0.1192, + "grad_norm": 0.5137566844133159, + "learning_rate": 0.00019588933215113926, + "loss": 0.8338, + "step": 149 + }, + { + "epoch": 0.12, + "grad_norm": 0.5002092420518957, + "learning_rate": 0.000195815455670239, + "loss": 0.7562, + "step": 150 + }, + { + "epoch": 0.1208, + "grad_norm": 0.47194552572414866, + "learning_rate": 0.00019574093542067673, + "loss": 0.7983, + "step": 151 + }, + { + "epoch": 0.1216, + "grad_norm": 0.5218303185521903, + "learning_rate": 0.00019566577190314197, + "loss": 0.7851, + "step": 152 + }, + { + "epoch": 0.1224, + "grad_norm": 0.5264832068089446, + "learning_rate": 0.0001955899656226464, + "loss": 0.8371, + "step": 153 + }, + { + "epoch": 0.1232, + "grad_norm": 0.478203962759533, + "learning_rate": 0.0001955135170885202, + "loss": 0.7616, + "step": 154 + }, + { + "epoch": 0.124, + "grad_norm": 0.5438026741934794, + "learning_rate": 0.0001954364268144088, + "loss": 0.7615, + "step": 155 + }, + { + "epoch": 0.1248, + "grad_norm": 0.43902989895135003, + "learning_rate": 0.00019535869531826937, + "loss": 0.7639, + "step": 156 + }, + { + "epoch": 0.1256, + "grad_norm": 0.47051131378806954, + "learning_rate": 0.00019528032312236736, + "loss": 0.7129, + "step": 157 + }, + { + "epoch": 0.1264, + "grad_norm": 0.5002910987291495, + "learning_rate": 0.00019520131075327298, + "loss": 0.71, + "step": 158 + }, + { + "epoch": 0.1272, + "grad_norm": 0.4076232209792227, + "learning_rate": 0.00019512165874185767, + "loss": 0.7918, + "step": 159 + }, + { + "epoch": 0.128, + "grad_norm": 0.4730770037833421, + "learning_rate": 0.00019504136762329047, + "loss": 0.7892, + "step": 160 + }, + { + "epoch": 0.1288, + "grad_norm": 0.45664931470747816, + "learning_rate": 0.0001949604379370345, + "loss": 0.7543, + "step": 161 + }, + { + "epoch": 0.1296, + "grad_norm": 0.47178849520868527, + "learning_rate": 0.00019487887022684336, + "loss": 0.8437, + "step": 162 + }, + { + "epoch": 0.1304, + "grad_norm": 0.467588792179776, + "learning_rate": 0.00019479666504075736, + "loss": 0.8021, + "step": 163 + }, + { + "epoch": 0.1312, + "grad_norm": 0.4972992438538472, + "learning_rate": 0.00019471382293110003, + "loss": 0.841, + "step": 164 + }, + { + "epoch": 0.132, + "grad_norm": 0.4468720005820029, + "learning_rate": 0.0001946303444544741, + "loss": 0.7794, + "step": 165 + }, + { + "epoch": 0.1328, + "grad_norm": 0.545370771508763, + "learning_rate": 0.00019454623017175812, + "loss": 0.838, + "step": 166 + }, + { + "epoch": 0.1336, + "grad_norm": 0.5454717084972628, + "learning_rate": 0.00019446148064810242, + "loss": 0.8629, + "step": 167 + }, + { + "epoch": 0.1344, + "grad_norm": 0.4357282147317954, + "learning_rate": 0.00019437609645292546, + "loss": 0.7779, + "step": 168 + }, + { + "epoch": 0.1352, + "grad_norm": 0.48593556883421596, + "learning_rate": 0.00019429007815990993, + "loss": 0.807, + "step": 169 + }, + { + "epoch": 0.136, + "grad_norm": 0.41599332497007774, + "learning_rate": 0.0001942034263469989, + "loss": 0.7772, + "step": 170 + }, + { + "epoch": 0.1368, + "grad_norm": 0.4836889364929825, + "learning_rate": 0.00019411614159639204, + "loss": 0.7779, + "step": 171 + }, + { + "epoch": 0.1376, + "grad_norm": 0.5252176172912414, + "learning_rate": 0.00019402822449454153, + "loss": 0.7382, + "step": 172 + }, + { + "epoch": 0.1384, + "grad_norm": 0.597492267186273, + "learning_rate": 0.00019393967563214833, + "loss": 0.793, + "step": 173 + }, + { + "epoch": 0.1392, + "grad_norm": 0.50643481417428, + "learning_rate": 0.00019385049560415794, + "loss": 0.8025, + "step": 174 + }, + { + "epoch": 0.14, + "grad_norm": 0.4863957284362853, + "learning_rate": 0.00019376068500975667, + "loss": 0.7685, + "step": 175 + }, + { + "epoch": 0.1408, + "grad_norm": 0.4126442552699547, + "learning_rate": 0.00019367024445236754, + "loss": 0.7851, + "step": 176 + }, + { + "epoch": 0.1416, + "grad_norm": 0.45326416402953634, + "learning_rate": 0.000193579174539646, + "loss": 0.7749, + "step": 177 + }, + { + "epoch": 0.1424, + "grad_norm": 0.4227596656395421, + "learning_rate": 0.00019348747588347637, + "loss": 0.7664, + "step": 178 + }, + { + "epoch": 0.1432, + "grad_norm": 0.42788815992693824, + "learning_rate": 0.00019339514909996706, + "loss": 0.7915, + "step": 179 + }, + { + "epoch": 0.144, + "grad_norm": 0.6288877822241705, + "learning_rate": 0.00019330219480944694, + "loss": 0.8891, + "step": 180 + }, + { + "epoch": 0.1448, + "grad_norm": 0.4507238893173025, + "learning_rate": 0.00019320861363646095, + "loss": 0.7759, + "step": 181 + }, + { + "epoch": 0.1456, + "grad_norm": 0.47300214317734723, + "learning_rate": 0.00019311440620976597, + "loss": 0.8498, + "step": 182 + }, + { + "epoch": 0.1464, + "grad_norm": 0.4401017701118483, + "learning_rate": 0.00019301957316232658, + "loss": 0.777, + "step": 183 + }, + { + "epoch": 0.1472, + "grad_norm": 0.5356566612020834, + "learning_rate": 0.0001929241151313108, + "loss": 0.8127, + "step": 184 + }, + { + "epoch": 0.148, + "grad_norm": 0.5113524460982788, + "learning_rate": 0.0001928280327580858, + "loss": 0.8786, + "step": 185 + }, + { + "epoch": 0.1488, + "grad_norm": 0.5291692741521824, + "learning_rate": 0.00019273132668821364, + "loss": 0.8393, + "step": 186 + }, + { + "epoch": 0.1496, + "grad_norm": 0.45843548873895057, + "learning_rate": 0.00019263399757144683, + "loss": 0.7901, + "step": 187 + }, + { + "epoch": 0.1504, + "grad_norm": 0.46690384493517495, + "learning_rate": 0.00019253604606172417, + "loss": 0.8058, + "step": 188 + }, + { + "epoch": 0.1512, + "grad_norm": 0.42269965222515665, + "learning_rate": 0.000192437472817166, + "loss": 0.7408, + "step": 189 + }, + { + "epoch": 0.152, + "grad_norm": 0.4744441145268392, + "learning_rate": 0.00019233827850007027, + "loss": 0.8013, + "step": 190 + }, + { + "epoch": 0.1528, + "grad_norm": 0.589734999819277, + "learning_rate": 0.00019223846377690754, + "loss": 0.8505, + "step": 191 + }, + { + "epoch": 0.1536, + "grad_norm": 0.4294515422334175, + "learning_rate": 0.00019213802931831696, + "loss": 0.7768, + "step": 192 + }, + { + "epoch": 0.1544, + "grad_norm": 0.4744430990923323, + "learning_rate": 0.00019203697579910154, + "loss": 0.7609, + "step": 193 + }, + { + "epoch": 0.1552, + "grad_norm": 0.4559501850152181, + "learning_rate": 0.00019193530389822363, + "loss": 0.7523, + "step": 194 + }, + { + "epoch": 0.156, + "grad_norm": 0.46357087350772636, + "learning_rate": 0.00019183301429880043, + "loss": 0.7601, + "step": 195 + }, + { + "epoch": 0.1568, + "grad_norm": 0.46023959041271456, + "learning_rate": 0.00019173010768809933, + "loss": 0.796, + "step": 196 + }, + { + "epoch": 0.1576, + "grad_norm": 0.4727064727182715, + "learning_rate": 0.00019162658475753327, + "loss": 0.7954, + "step": 197 + }, + { + "epoch": 0.1584, + "grad_norm": 0.4569969267217822, + "learning_rate": 0.0001915224462026563, + "loss": 0.8656, + "step": 198 + }, + { + "epoch": 0.1592, + "grad_norm": 0.5938746073907192, + "learning_rate": 0.00019141769272315858, + "loss": 0.7622, + "step": 199 + }, + { + "epoch": 0.16, + "grad_norm": 0.48850488082059007, + "learning_rate": 0.00019131232502286188, + "loss": 0.8259, + "step": 200 + }, + { + "epoch": 0.1608, + "grad_norm": 0.4681251940836538, + "learning_rate": 0.00019120634380971496, + "loss": 0.7065, + "step": 201 + }, + { + "epoch": 0.1616, + "grad_norm": 0.4735748030725397, + "learning_rate": 0.0001910997497957885, + "loss": 0.8068, + "step": 202 + }, + { + "epoch": 0.1624, + "grad_norm": 0.47529092333896694, + "learning_rate": 0.0001909925436972706, + "loss": 0.7315, + "step": 203 + }, + { + "epoch": 0.1632, + "grad_norm": 0.5902306759720399, + "learning_rate": 0.00019088472623446183, + "loss": 0.8338, + "step": 204 + }, + { + "epoch": 0.164, + "grad_norm": 0.5230818479222556, + "learning_rate": 0.00019077629813177036, + "loss": 0.8022, + "step": 205 + }, + { + "epoch": 0.1648, + "grad_norm": 0.4534663637266558, + "learning_rate": 0.00019066726011770726, + "loss": 0.7373, + "step": 206 + }, + { + "epoch": 0.1656, + "grad_norm": 0.4708997916866323, + "learning_rate": 0.00019055761292488142, + "loss": 0.7071, + "step": 207 + }, + { + "epoch": 0.1664, + "grad_norm": 0.49362197885694487, + "learning_rate": 0.0001904473572899947, + "loss": 0.7933, + "step": 208 + }, + { + "epoch": 0.1672, + "grad_norm": 0.45203709880515, + "learning_rate": 0.00019033649395383702, + "loss": 0.7832, + "step": 209 + }, + { + "epoch": 0.168, + "grad_norm": 0.5493177817059253, + "learning_rate": 0.00019022502366128135, + "loss": 0.8254, + "step": 210 + }, + { + "epoch": 0.1688, + "grad_norm": 0.5634237288206593, + "learning_rate": 0.00019011294716127867, + "loss": 0.8056, + "step": 211 + }, + { + "epoch": 0.1696, + "grad_norm": 0.42048467773105236, + "learning_rate": 0.00019000026520685302, + "loss": 0.7727, + "step": 212 + }, + { + "epoch": 0.1704, + "grad_norm": 0.4899154747867312, + "learning_rate": 0.0001898869785550963, + "loss": 0.8376, + "step": 213 + }, + { + "epoch": 0.1712, + "grad_norm": 0.5710692282020838, + "learning_rate": 0.0001897730879671634, + "loss": 0.8309, + "step": 214 + }, + { + "epoch": 0.172, + "grad_norm": 0.48806709144026006, + "learning_rate": 0.00018965859420826684, + "loss": 0.7442, + "step": 215 + }, + { + "epoch": 0.1728, + "grad_norm": 0.4838117679253375, + "learning_rate": 0.00018954349804767184, + "loss": 0.787, + "step": 216 + }, + { + "epoch": 0.1736, + "grad_norm": 0.4580816421898731, + "learning_rate": 0.00018942780025869098, + "loss": 0.8085, + "step": 217 + }, + { + "epoch": 0.1744, + "grad_norm": 0.47596264671869476, + "learning_rate": 0.00018931150161867916, + "loss": 0.7975, + "step": 218 + }, + { + "epoch": 0.1752, + "grad_norm": 0.44108018864388043, + "learning_rate": 0.00018919460290902826, + "loss": 0.7784, + "step": 219 + }, + { + "epoch": 0.176, + "grad_norm": 0.5149929952820737, + "learning_rate": 0.00018907710491516199, + "loss": 0.6841, + "step": 220 + }, + { + "epoch": 0.1768, + "grad_norm": 0.6399958813065738, + "learning_rate": 0.0001889590084265304, + "loss": 0.8172, + "step": 221 + }, + { + "epoch": 0.1776, + "grad_norm": 0.4698697208453852, + "learning_rate": 0.0001888403142366049, + "loss": 0.7776, + "step": 222 + }, + { + "epoch": 0.1784, + "grad_norm": 0.4801018226598063, + "learning_rate": 0.0001887210231428727, + "loss": 0.7831, + "step": 223 + }, + { + "epoch": 0.1792, + "grad_norm": 0.5010754074291259, + "learning_rate": 0.00018860113594683148, + "loss": 0.8059, + "step": 224 + }, + { + "epoch": 0.18, + "grad_norm": 0.537583317946107, + "learning_rate": 0.0001884806534539841, + "loss": 0.8351, + "step": 225 + }, + { + "epoch": 0.1808, + "grad_norm": 0.5463136427932224, + "learning_rate": 0.00018835957647383303, + "loss": 0.8493, + "step": 226 + }, + { + "epoch": 0.1816, + "grad_norm": 0.4957532756389482, + "learning_rate": 0.0001882379058198751, + "loss": 0.7829, + "step": 227 + }, + { + "epoch": 0.1824, + "grad_norm": 0.4842215035332732, + "learning_rate": 0.00018811564230959588, + "loss": 0.776, + "step": 228 + }, + { + "epoch": 0.1832, + "grad_norm": 0.48734113067673895, + "learning_rate": 0.00018799278676446423, + "loss": 0.7234, + "step": 229 + }, + { + "epoch": 0.184, + "grad_norm": 0.47276663145448383, + "learning_rate": 0.00018786934000992688, + "loss": 0.7549, + "step": 230 + }, + { + "epoch": 0.1848, + "grad_norm": 0.43463312763994316, + "learning_rate": 0.00018774530287540278, + "loss": 0.8188, + "step": 231 + }, + { + "epoch": 0.1856, + "grad_norm": 0.5307171453933577, + "learning_rate": 0.00018762067619427746, + "loss": 0.8506, + "step": 232 + }, + { + "epoch": 0.1864, + "grad_norm": 0.5793489529825944, + "learning_rate": 0.00018749546080389757, + "loss": 0.857, + "step": 233 + }, + { + "epoch": 0.1872, + "grad_norm": 0.41645727653838105, + "learning_rate": 0.00018736965754556528, + "loss": 0.7675, + "step": 234 + }, + { + "epoch": 0.188, + "grad_norm": 0.4724064375837242, + "learning_rate": 0.00018724326726453244, + "loss": 0.815, + "step": 235 + }, + { + "epoch": 0.1888, + "grad_norm": 0.533774522533951, + "learning_rate": 0.00018711629080999504, + "loss": 0.7672, + "step": 236 + }, + { + "epoch": 0.1896, + "grad_norm": 0.5541004916699475, + "learning_rate": 0.00018698872903508755, + "loss": 0.8889, + "step": 237 + }, + { + "epoch": 0.1904, + "grad_norm": 0.5661791909987512, + "learning_rate": 0.00018686058279687698, + "loss": 0.8735, + "step": 238 + }, + { + "epoch": 0.1912, + "grad_norm": 0.44873804371044396, + "learning_rate": 0.0001867318529563574, + "loss": 0.7042, + "step": 239 + }, + { + "epoch": 0.192, + "grad_norm": 0.5304400188736889, + "learning_rate": 0.00018660254037844388, + "loss": 0.7533, + "step": 240 + }, + { + "epoch": 0.1928, + "grad_norm": 0.44327993489571377, + "learning_rate": 0.00018647264593196688, + "loss": 0.7866, + "step": 241 + }, + { + "epoch": 0.1936, + "grad_norm": 0.5255641319650115, + "learning_rate": 0.00018634217048966637, + "loss": 0.8282, + "step": 242 + }, + { + "epoch": 0.1944, + "grad_norm": 0.46028409296582895, + "learning_rate": 0.00018621111492818585, + "loss": 0.7753, + "step": 243 + }, + { + "epoch": 0.1952, + "grad_norm": 0.5031247287857912, + "learning_rate": 0.0001860794801280666, + "loss": 0.8181, + "step": 244 + }, + { + "epoch": 0.196, + "grad_norm": 0.6173573256457165, + "learning_rate": 0.00018594726697374175, + "loss": 0.9006, + "step": 245 + }, + { + "epoch": 0.1968, + "grad_norm": 0.5298715602472197, + "learning_rate": 0.0001858144763535302, + "loss": 0.839, + "step": 246 + }, + { + "epoch": 0.1976, + "grad_norm": 0.4911255833322417, + "learning_rate": 0.0001856811091596308, + "loss": 0.759, + "step": 247 + }, + { + "epoch": 0.1984, + "grad_norm": 0.4906696061287372, + "learning_rate": 0.0001855471662881164, + "loss": 0.8528, + "step": 248 + }, + { + "epoch": 0.1992, + "grad_norm": 0.4668567335860601, + "learning_rate": 0.00018541264863892754, + "loss": 0.8208, + "step": 249 + }, + { + "epoch": 0.2, + "grad_norm": 0.4345032605839861, + "learning_rate": 0.00018527755711586678, + "loss": 0.8167, + "step": 250 + }, + { + "epoch": 0.2008, + "grad_norm": 0.4984211985447478, + "learning_rate": 0.00018514189262659235, + "loss": 0.8302, + "step": 251 + }, + { + "epoch": 0.2016, + "grad_norm": 0.4572714117537344, + "learning_rate": 0.00018500565608261214, + "loss": 0.8184, + "step": 252 + }, + { + "epoch": 0.2024, + "grad_norm": 0.5206899190878905, + "learning_rate": 0.00018486884839927768, + "loss": 0.8066, + "step": 253 + }, + { + "epoch": 0.2032, + "grad_norm": 0.4854794107900831, + "learning_rate": 0.00018473147049577774, + "loss": 0.7537, + "step": 254 + }, + { + "epoch": 0.204, + "grad_norm": 0.4609677848342807, + "learning_rate": 0.0001845935232951325, + "loss": 0.8063, + "step": 255 + }, + { + "epoch": 0.2048, + "grad_norm": 0.45996489083268266, + "learning_rate": 0.00018445500772418697, + "loss": 0.8506, + "step": 256 + }, + { + "epoch": 0.2056, + "grad_norm": 0.45707766102052144, + "learning_rate": 0.00018431592471360503, + "loss": 0.7912, + "step": 257 + }, + { + "epoch": 0.2064, + "grad_norm": 0.4340542190902398, + "learning_rate": 0.00018417627519786315, + "loss": 0.7575, + "step": 258 + }, + { + "epoch": 0.2072, + "grad_norm": 0.5497680661505181, + "learning_rate": 0.000184036060115244, + "loss": 0.8477, + "step": 259 + }, + { + "epoch": 0.208, + "grad_norm": 0.43255985506713984, + "learning_rate": 0.00018389528040783012, + "loss": 0.7152, + "step": 260 + }, + { + "epoch": 0.2088, + "grad_norm": 0.4520541936610211, + "learning_rate": 0.00018375393702149787, + "loss": 0.7711, + "step": 261 + }, + { + "epoch": 0.2096, + "grad_norm": 0.5552186923148112, + "learning_rate": 0.00018361203090591071, + "loss": 0.8749, + "step": 262 + }, + { + "epoch": 0.2104, + "grad_norm": 0.5240005896907025, + "learning_rate": 0.00018346956301451304, + "loss": 0.8577, + "step": 263 + }, + { + "epoch": 0.2112, + "grad_norm": 0.5080128301456308, + "learning_rate": 0.00018332653430452376, + "loss": 0.8368, + "step": 264 + }, + { + "epoch": 0.212, + "grad_norm": 0.4512704910911856, + "learning_rate": 0.00018318294573692985, + "loss": 0.7881, + "step": 265 + }, + { + "epoch": 0.2128, + "grad_norm": 0.425427268205857, + "learning_rate": 0.00018303879827647975, + "loss": 0.7531, + "step": 266 + }, + { + "epoch": 0.2136, + "grad_norm": 0.46175851059255807, + "learning_rate": 0.0001828940928916772, + "loss": 0.7513, + "step": 267 + }, + { + "epoch": 0.2144, + "grad_norm": 0.43532549688496996, + "learning_rate": 0.00018274883055477436, + "loss": 0.7672, + "step": 268 + }, + { + "epoch": 0.2152, + "grad_norm": 0.5150647397664155, + "learning_rate": 0.00018260301224176558, + "loss": 0.8248, + "step": 269 + }, + { + "epoch": 0.216, + "grad_norm": 0.5015860279713167, + "learning_rate": 0.00018245663893238075, + "loss": 0.7788, + "step": 270 + }, + { + "epoch": 0.2168, + "grad_norm": 0.4094608213167549, + "learning_rate": 0.00018230971161007853, + "loss": 0.8107, + "step": 271 + }, + { + "epoch": 0.2176, + "grad_norm": 0.4506121936050682, + "learning_rate": 0.00018216223126204007, + "loss": 0.785, + "step": 272 + }, + { + "epoch": 0.2184, + "grad_norm": 0.38251503700435563, + "learning_rate": 0.00018201419887916214, + "loss": 0.7542, + "step": 273 + }, + { + "epoch": 0.2192, + "grad_norm": 0.6011698005621798, + "learning_rate": 0.00018186561545605054, + "loss": 0.8012, + "step": 274 + }, + { + "epoch": 0.22, + "grad_norm": 0.637325574994567, + "learning_rate": 0.00018171648199101346, + "loss": 0.8774, + "step": 275 + }, + { + "epoch": 0.2208, + "grad_norm": 0.48830475781118704, + "learning_rate": 0.00018156679948605467, + "loss": 0.7632, + "step": 276 + }, + { + "epoch": 0.2216, + "grad_norm": 0.437959458199466, + "learning_rate": 0.00018141656894686689, + "loss": 0.7968, + "step": 277 + }, + { + "epoch": 0.2224, + "grad_norm": 0.40995994254323775, + "learning_rate": 0.00018126579138282503, + "loss": 0.7322, + "step": 278 + }, + { + "epoch": 0.2232, + "grad_norm": 0.4760848633445986, + "learning_rate": 0.00018111446780697929, + "loss": 0.8289, + "step": 279 + }, + { + "epoch": 0.224, + "grad_norm": 0.3968332181154129, + "learning_rate": 0.0001809625992360485, + "loss": 0.7367, + "step": 280 + }, + { + "epoch": 0.2248, + "grad_norm": 0.5853394391073075, + "learning_rate": 0.00018081018669041324, + "loss": 0.9217, + "step": 281 + }, + { + "epoch": 0.2256, + "grad_norm": 0.5055996499763276, + "learning_rate": 0.00018065723119410884, + "loss": 0.7689, + "step": 282 + }, + { + "epoch": 0.2264, + "grad_norm": 0.5171528589084252, + "learning_rate": 0.00018050373377481878, + "loss": 0.8076, + "step": 283 + }, + { + "epoch": 0.2272, + "grad_norm": 0.48753167002443065, + "learning_rate": 0.00018034969546386757, + "loss": 0.7365, + "step": 284 + }, + { + "epoch": 0.228, + "grad_norm": 0.5593692966320034, + "learning_rate": 0.0001801951172962139, + "loss": 0.8296, + "step": 285 + }, + { + "epoch": 0.2288, + "grad_norm": 0.4894130517380097, + "learning_rate": 0.0001800400003104436, + "loss": 0.7783, + "step": 286 + }, + { + "epoch": 0.2296, + "grad_norm": 0.4183032507797536, + "learning_rate": 0.0001798843455487629, + "loss": 0.7159, + "step": 287 + }, + { + "epoch": 0.2304, + "grad_norm": 0.4854918953495465, + "learning_rate": 0.00017972815405699103, + "loss": 0.7768, + "step": 288 + }, + { + "epoch": 0.2312, + "grad_norm": 0.45780025047823086, + "learning_rate": 0.00017957142688455362, + "loss": 0.8142, + "step": 289 + }, + { + "epoch": 0.232, + "grad_norm": 0.5152620899441052, + "learning_rate": 0.00017941416508447536, + "loss": 0.7786, + "step": 290 + }, + { + "epoch": 0.2328, + "grad_norm": 0.5549668785671925, + "learning_rate": 0.00017925636971337304, + "loss": 0.7827, + "step": 291 + }, + { + "epoch": 0.2336, + "grad_norm": 0.544306140170751, + "learning_rate": 0.0001790980418314484, + "loss": 0.8029, + "step": 292 + }, + { + "epoch": 0.2344, + "grad_norm": 0.46360456212875956, + "learning_rate": 0.00017893918250248104, + "loss": 0.7919, + "step": 293 + }, + { + "epoch": 0.2352, + "grad_norm": 0.5060777619170408, + "learning_rate": 0.00017877979279382135, + "loss": 0.8377, + "step": 294 + }, + { + "epoch": 0.236, + "grad_norm": 0.4496308552363371, + "learning_rate": 0.00017861987377638312, + "loss": 0.8139, + "step": 295 + }, + { + "epoch": 0.2368, + "grad_norm": 0.5007864295328988, + "learning_rate": 0.0001784594265246366, + "loss": 0.825, + "step": 296 + }, + { + "epoch": 0.2376, + "grad_norm": 0.4483008493163282, + "learning_rate": 0.0001782984521166011, + "loss": 0.7863, + "step": 297 + }, + { + "epoch": 0.2384, + "grad_norm": 0.4391112870508659, + "learning_rate": 0.0001781369516338378, + "loss": 0.7797, + "step": 298 + }, + { + "epoch": 0.2392, + "grad_norm": 0.47018233251197405, + "learning_rate": 0.00017797492616144256, + "loss": 0.7924, + "step": 299 + }, + { + "epoch": 0.24, + "grad_norm": 0.4798517777508943, + "learning_rate": 0.00017781237678803847, + "loss": 0.7745, + "step": 300 + }, + { + "epoch": 0.2408, + "grad_norm": 0.49095855616960954, + "learning_rate": 0.00017764930460576866, + "loss": 0.765, + "step": 301 + }, + { + "epoch": 0.2416, + "grad_norm": 0.48356101095398024, + "learning_rate": 0.000177485710710289, + "loss": 0.7755, + "step": 302 + }, + { + "epoch": 0.2424, + "grad_norm": 0.5580403015191725, + "learning_rate": 0.00017732159620076053, + "loss": 0.7914, + "step": 303 + }, + { + "epoch": 0.2432, + "grad_norm": 0.4713539577022985, + "learning_rate": 0.00017715696217984235, + "loss": 0.7745, + "step": 304 + }, + { + "epoch": 0.244, + "grad_norm": 0.5205108027761377, + "learning_rate": 0.00017699180975368396, + "loss": 0.7512, + "step": 305 + }, + { + "epoch": 0.2448, + "grad_norm": 0.46116331908739044, + "learning_rate": 0.00017682614003191807, + "loss": 0.8286, + "step": 306 + }, + { + "epoch": 0.2456, + "grad_norm": 0.4577523085902634, + "learning_rate": 0.00017665995412765285, + "loss": 0.7494, + "step": 307 + }, + { + "epoch": 0.2464, + "grad_norm": 0.3847018815541359, + "learning_rate": 0.00017649325315746478, + "loss": 0.7431, + "step": 308 + }, + { + "epoch": 0.2472, + "grad_norm": 0.46143177187220114, + "learning_rate": 0.00017632603824139085, + "loss": 0.8167, + "step": 309 + }, + { + "epoch": 0.248, + "grad_norm": 0.4546653053351597, + "learning_rate": 0.0001761583105029213, + "loss": 0.7887, + "step": 310 + }, + { + "epoch": 0.2488, + "grad_norm": 0.48515315632333816, + "learning_rate": 0.0001759900710689918, + "loss": 0.7682, + "step": 311 + }, + { + "epoch": 0.2496, + "grad_norm": 0.42125610166318656, + "learning_rate": 0.00017582132106997616, + "loss": 0.8073, + "step": 312 + }, + { + "epoch": 0.2504, + "grad_norm": 0.5224634896260647, + "learning_rate": 0.00017565206163967846, + "loss": 0.7344, + "step": 313 + }, + { + "epoch": 0.2512, + "grad_norm": 0.4258122656750229, + "learning_rate": 0.00017548229391532572, + "loss": 0.8215, + "step": 314 + }, + { + "epoch": 0.252, + "grad_norm": 0.4409516005421548, + "learning_rate": 0.00017531201903755994, + "loss": 0.7652, + "step": 315 + }, + { + "epoch": 0.2528, + "grad_norm": 0.5402199309929968, + "learning_rate": 0.00017514123815043074, + "loss": 0.8455, + "step": 316 + }, + { + "epoch": 0.2536, + "grad_norm": 0.4327780968915563, + "learning_rate": 0.00017496995240138744, + "loss": 0.8579, + "step": 317 + }, + { + "epoch": 0.2544, + "grad_norm": 0.36305977653280086, + "learning_rate": 0.00017479816294127152, + "loss": 0.7184, + "step": 318 + }, + { + "epoch": 0.2552, + "grad_norm": 0.48196845883166595, + "learning_rate": 0.00017462587092430875, + "loss": 0.8101, + "step": 319 + }, + { + "epoch": 0.256, + "grad_norm": 0.5203666483240921, + "learning_rate": 0.0001744530775081015, + "loss": 0.8734, + "step": 320 + }, + { + "epoch": 0.2568, + "grad_norm": 0.4146613232763745, + "learning_rate": 0.00017427978385362112, + "loss": 0.7822, + "step": 321 + }, + { + "epoch": 0.2576, + "grad_norm": 0.6393924480453517, + "learning_rate": 0.0001741059911251997, + "loss": 0.8473, + "step": 322 + }, + { + "epoch": 0.2584, + "grad_norm": 0.5014244971007478, + "learning_rate": 0.0001739317004905227, + "loss": 0.7994, + "step": 323 + }, + { + "epoch": 0.2592, + "grad_norm": 0.4467471708728824, + "learning_rate": 0.000173756913120621, + "loss": 0.715, + "step": 324 + }, + { + "epoch": 0.26, + "grad_norm": 0.44927464260407846, + "learning_rate": 0.00017358163018986282, + "loss": 0.7876, + "step": 325 + }, + { + "epoch": 0.2608, + "grad_norm": 0.452092010139311, + "learning_rate": 0.00017340585287594604, + "loss": 0.7384, + "step": 326 + }, + { + "epoch": 0.2616, + "grad_norm": 0.4505483607042589, + "learning_rate": 0.00017322958235989016, + "loss": 0.7741, + "step": 327 + }, + { + "epoch": 0.2624, + "grad_norm": 0.45737335478669194, + "learning_rate": 0.0001730528198260285, + "loss": 0.7318, + "step": 328 + }, + { + "epoch": 0.2632, + "grad_norm": 0.505105062630817, + "learning_rate": 0.00017287556646200018, + "loss": 0.8462, + "step": 329 + }, + { + "epoch": 0.264, + "grad_norm": 0.5485209182353045, + "learning_rate": 0.00017269782345874203, + "loss": 0.8205, + "step": 330 + }, + { + "epoch": 0.2648, + "grad_norm": 0.47935014604862003, + "learning_rate": 0.00017251959201048083, + "loss": 0.7916, + "step": 331 + }, + { + "epoch": 0.2656, + "grad_norm": 0.46851260385721555, + "learning_rate": 0.00017234087331472497, + "loss": 0.8087, + "step": 332 + }, + { + "epoch": 0.2664, + "grad_norm": 0.40742467259083315, + "learning_rate": 0.00017216166857225674, + "loss": 0.7288, + "step": 333 + }, + { + "epoch": 0.2672, + "grad_norm": 0.42412473687814045, + "learning_rate": 0.00017198197898712404, + "loss": 0.7749, + "step": 334 + }, + { + "epoch": 0.268, + "grad_norm": 0.4540663038141919, + "learning_rate": 0.00017180180576663228, + "loss": 0.8279, + "step": 335 + }, + { + "epoch": 0.2688, + "grad_norm": 0.4814886187089166, + "learning_rate": 0.00017162115012133643, + "loss": 0.7954, + "step": 336 + }, + { + "epoch": 0.2696, + "grad_norm": 0.38612348667503354, + "learning_rate": 0.00017144001326503273, + "loss": 0.684, + "step": 337 + }, + { + "epoch": 0.2704, + "grad_norm": 0.43797411883152226, + "learning_rate": 0.00017125839641475072, + "loss": 0.8172, + "step": 338 + }, + { + "epoch": 0.2712, + "grad_norm": 0.38992309677963055, + "learning_rate": 0.00017107630079074478, + "loss": 0.7729, + "step": 339 + }, + { + "epoch": 0.272, + "grad_norm": 0.4317859019917533, + "learning_rate": 0.00017089372761648616, + "loss": 0.8122, + "step": 340 + }, + { + "epoch": 0.2728, + "grad_norm": 0.42022534256609423, + "learning_rate": 0.00017071067811865476, + "loss": 0.777, + "step": 341 + }, + { + "epoch": 0.2736, + "grad_norm": 0.40811977801292604, + "learning_rate": 0.00017052715352713075, + "loss": 0.7008, + "step": 342 + }, + { + "epoch": 0.2744, + "grad_norm": 0.4297400709769793, + "learning_rate": 0.00017034315507498635, + "loss": 0.7693, + "step": 343 + }, + { + "epoch": 0.2752, + "grad_norm": 0.4801445875017632, + "learning_rate": 0.00017015868399847768, + "loss": 0.8153, + "step": 344 + }, + { + "epoch": 0.276, + "grad_norm": 0.4934539899070924, + "learning_rate": 0.00016997374153703625, + "loss": 0.8032, + "step": 345 + }, + { + "epoch": 0.2768, + "grad_norm": 0.42513152532831566, + "learning_rate": 0.00016978832893326074, + "loss": 0.7752, + "step": 346 + }, + { + "epoch": 0.2776, + "grad_norm": 0.4794991935777699, + "learning_rate": 0.00016960244743290868, + "loss": 0.7782, + "step": 347 + }, + { + "epoch": 0.2784, + "grad_norm": 0.5087050287378877, + "learning_rate": 0.00016941609828488807, + "loss": 0.8042, + "step": 348 + }, + { + "epoch": 0.2792, + "grad_norm": 0.393865561718882, + "learning_rate": 0.00016922928274124886, + "loss": 0.6906, + "step": 349 + }, + { + "epoch": 0.28, + "grad_norm": 0.42383628171051063, + "learning_rate": 0.0001690420020571747, + "loss": 0.7239, + "step": 350 + }, + { + "epoch": 0.2808, + "grad_norm": 0.6121834981137757, + "learning_rate": 0.00016885425749097444, + "loss": 0.828, + "step": 351 + }, + { + "epoch": 0.2816, + "grad_norm": 0.4423952298587943, + "learning_rate": 0.0001686660503040737, + "loss": 0.7793, + "step": 352 + }, + { + "epoch": 0.2824, + "grad_norm": 0.41904903851629355, + "learning_rate": 0.00016847738176100632, + "loss": 0.7222, + "step": 353 + }, + { + "epoch": 0.2832, + "grad_norm": 0.47090218219971486, + "learning_rate": 0.00016828825312940592, + "loss": 0.7864, + "step": 354 + }, + { + "epoch": 0.284, + "grad_norm": 0.3913200013056769, + "learning_rate": 0.0001680986656799975, + "loss": 0.7107, + "step": 355 + }, + { + "epoch": 0.2848, + "grad_norm": 0.4500172163246463, + "learning_rate": 0.0001679086206865886, + "loss": 0.7604, + "step": 356 + }, + { + "epoch": 0.2856, + "grad_norm": 0.47203881705422795, + "learning_rate": 0.00016771811942606108, + "loss": 0.775, + "step": 357 + }, + { + "epoch": 0.2864, + "grad_norm": 0.449367277679672, + "learning_rate": 0.00016752716317836229, + "loss": 0.7392, + "step": 358 + }, + { + "epoch": 0.2872, + "grad_norm": 0.3621692554096837, + "learning_rate": 0.00016733575322649657, + "loss": 0.6913, + "step": 359 + }, + { + "epoch": 0.288, + "grad_norm": 0.4705659025575893, + "learning_rate": 0.0001671438908565167, + "loss": 0.8159, + "step": 360 + }, + { + "epoch": 0.2888, + "grad_norm": 0.4391531794127377, + "learning_rate": 0.00016695157735751513, + "loss": 0.7539, + "step": 361 + }, + { + "epoch": 0.2896, + "grad_norm": 0.5471100071607476, + "learning_rate": 0.00016675881402161536, + "loss": 0.7569, + "step": 362 + }, + { + "epoch": 0.2904, + "grad_norm": 0.39983579832917754, + "learning_rate": 0.0001665656021439633, + "loss": 0.7594, + "step": 363 + }, + { + "epoch": 0.2912, + "grad_norm": 0.5433063322457978, + "learning_rate": 0.0001663719430227186, + "loss": 0.7739, + "step": 364 + }, + { + "epoch": 0.292, + "grad_norm": 0.5158257022171956, + "learning_rate": 0.00016617783795904565, + "loss": 0.7932, + "step": 365 + }, + { + "epoch": 0.2928, + "grad_norm": 0.43334724350730025, + "learning_rate": 0.00016598328825710533, + "loss": 0.7747, + "step": 366 + }, + { + "epoch": 0.2936, + "grad_norm": 0.5418026742631086, + "learning_rate": 0.00016578829522404583, + "loss": 0.8507, + "step": 367 + }, + { + "epoch": 0.2944, + "grad_norm": 0.49728990873135537, + "learning_rate": 0.000165592860169994, + "loss": 0.7716, + "step": 368 + }, + { + "epoch": 0.2952, + "grad_norm": 0.4930679346000872, + "learning_rate": 0.00016539698440804661, + "loss": 0.8152, + "step": 369 + }, + { + "epoch": 0.296, + "grad_norm": 0.40703810585703953, + "learning_rate": 0.00016520066925426144, + "loss": 0.7123, + "step": 370 + }, + { + "epoch": 0.2968, + "grad_norm": 0.5386416550361496, + "learning_rate": 0.0001650039160276485, + "loss": 0.788, + "step": 371 + }, + { + "epoch": 0.2976, + "grad_norm": 0.6681961851973703, + "learning_rate": 0.0001648067260501611, + "loss": 0.7419, + "step": 372 + }, + { + "epoch": 0.2984, + "grad_norm": 0.46698423556575525, + "learning_rate": 0.0001646091006466871, + "loss": 0.7497, + "step": 373 + }, + { + "epoch": 0.2992, + "grad_norm": 0.4462693969319942, + "learning_rate": 0.0001644110411450398, + "loss": 0.795, + "step": 374 + }, + { + "epoch": 0.3, + "grad_norm": 0.5162984669055487, + "learning_rate": 0.00016421254887594917, + "loss": 0.7959, + "step": 375 + }, + { + "epoch": 0.3008, + "grad_norm": 0.48234426398524705, + "learning_rate": 0.00016401362517305296, + "loss": 0.7843, + "step": 376 + }, + { + "epoch": 0.3016, + "grad_norm": 0.46155437780670966, + "learning_rate": 0.00016381427137288754, + "loss": 0.7643, + "step": 377 + }, + { + "epoch": 0.3024, + "grad_norm": 0.5104579264182729, + "learning_rate": 0.00016361448881487914, + "loss": 0.7618, + "step": 378 + }, + { + "epoch": 0.3032, + "grad_norm": 0.4576691284516531, + "learning_rate": 0.0001634142788413346, + "loss": 0.7593, + "step": 379 + }, + { + "epoch": 0.304, + "grad_norm": 0.45151819648907454, + "learning_rate": 0.00016321364279743266, + "loss": 0.7874, + "step": 380 + }, + { + "epoch": 0.3048, + "grad_norm": 0.5027216665292459, + "learning_rate": 0.00016301258203121462, + "loss": 0.7623, + "step": 381 + }, + { + "epoch": 0.3056, + "grad_norm": 0.4322680615922771, + "learning_rate": 0.0001628110978935756, + "loss": 0.7644, + "step": 382 + }, + { + "epoch": 0.3064, + "grad_norm": 0.458154934846738, + "learning_rate": 0.00016260919173825508, + "loss": 0.7795, + "step": 383 + }, + { + "epoch": 0.3072, + "grad_norm": 0.46071618810128806, + "learning_rate": 0.00016240686492182804, + "loss": 0.8209, + "step": 384 + }, + { + "epoch": 0.308, + "grad_norm": 0.4394451330648298, + "learning_rate": 0.00016220411880369601, + "loss": 0.7604, + "step": 385 + }, + { + "epoch": 0.3088, + "grad_norm": 0.42064813035167875, + "learning_rate": 0.00016200095474607753, + "loss": 0.7284, + "step": 386 + }, + { + "epoch": 0.3096, + "grad_norm": 0.4554895386626795, + "learning_rate": 0.00016179737411399926, + "loss": 0.7627, + "step": 387 + }, + { + "epoch": 0.3104, + "grad_norm": 0.5299365332857332, + "learning_rate": 0.00016159337827528685, + "loss": 0.798, + "step": 388 + }, + { + "epoch": 0.3112, + "grad_norm": 0.4636899840429361, + "learning_rate": 0.00016138896860055555, + "loss": 0.8192, + "step": 389 + }, + { + "epoch": 0.312, + "grad_norm": 0.46048503472651314, + "learning_rate": 0.0001611841464632011, + "loss": 0.7394, + "step": 390 + }, + { + "epoch": 0.3128, + "grad_norm": 0.4170627043386341, + "learning_rate": 0.00016097891323939062, + "loss": 0.763, + "step": 391 + }, + { + "epoch": 0.3136, + "grad_norm": 0.441786177043673, + "learning_rate": 0.0001607732703080532, + "loss": 0.7349, + "step": 392 + }, + { + "epoch": 0.3144, + "grad_norm": 0.3906300471935325, + "learning_rate": 0.00016056721905087056, + "loss": 0.7193, + "step": 393 + }, + { + "epoch": 0.3152, + "grad_norm": 0.3820125321487123, + "learning_rate": 0.00016036076085226814, + "loss": 0.6502, + "step": 394 + }, + { + "epoch": 0.316, + "grad_norm": 0.5236625305576288, + "learning_rate": 0.00016015389709940538, + "loss": 0.7858, + "step": 395 + }, + { + "epoch": 0.3168, + "grad_norm": 0.49316645616575255, + "learning_rate": 0.0001599466291821666, + "loss": 0.7877, + "step": 396 + }, + { + "epoch": 0.3176, + "grad_norm": 0.45537293006273666, + "learning_rate": 0.0001597389584931517, + "loss": 0.744, + "step": 397 + }, + { + "epoch": 0.3184, + "grad_norm": 0.43520294503636914, + "learning_rate": 0.0001595308864276666, + "loss": 0.7514, + "step": 398 + }, + { + "epoch": 0.3192, + "grad_norm": 0.5263441910538149, + "learning_rate": 0.0001593224143837142, + "loss": 0.7887, + "step": 399 + }, + { + "epoch": 0.32, + "grad_norm": 0.43733536031533465, + "learning_rate": 0.0001591135437619847, + "loss": 0.8091, + "step": 400 + }, + { + "epoch": 0.3208, + "grad_norm": 0.4509836961161326, + "learning_rate": 0.00015890427596584617, + "loss": 0.7972, + "step": 401 + }, + { + "epoch": 0.3216, + "grad_norm": 0.3697325898617728, + "learning_rate": 0.0001586946124013354, + "loss": 0.7448, + "step": 402 + }, + { + "epoch": 0.3224, + "grad_norm": 0.4149836456688446, + "learning_rate": 0.00015848455447714822, + "loss": 0.7589, + "step": 403 + }, + { + "epoch": 0.3232, + "grad_norm": 0.5291721262591771, + "learning_rate": 0.0001582741036046301, + "loss": 0.8298, + "step": 404 + }, + { + "epoch": 0.324, + "grad_norm": 0.4592801100745272, + "learning_rate": 0.00015806326119776663, + "loss": 0.6963, + "step": 405 + }, + { + "epoch": 0.3248, + "grad_norm": 0.48305455164459193, + "learning_rate": 0.00015785202867317407, + "loss": 0.7279, + "step": 406 + }, + { + "epoch": 0.3256, + "grad_norm": 0.43414795715124715, + "learning_rate": 0.00015764040745008988, + "loss": 0.7333, + "step": 407 + }, + { + "epoch": 0.3264, + "grad_norm": 0.4232467191471432, + "learning_rate": 0.00015742839895036305, + "loss": 0.8361, + "step": 408 + }, + { + "epoch": 0.3272, + "grad_norm": 0.37099563519311474, + "learning_rate": 0.00015721600459844468, + "loss": 0.6612, + "step": 409 + }, + { + "epoch": 0.328, + "grad_norm": 0.463495269434473, + "learning_rate": 0.00015700322582137827, + "loss": 0.7525, + "step": 410 + }, + { + "epoch": 0.3288, + "grad_norm": 0.4051273675714783, + "learning_rate": 0.00015679006404879033, + "loss": 0.7357, + "step": 411 + }, + { + "epoch": 0.3296, + "grad_norm": 0.44065204587172313, + "learning_rate": 0.0001565765207128805, + "loss": 0.7433, + "step": 412 + }, + { + "epoch": 0.3304, + "grad_norm": 0.404647063494323, + "learning_rate": 0.00015636259724841222, + "loss": 0.6644, + "step": 413 + }, + { + "epoch": 0.3312, + "grad_norm": 0.4367938419748247, + "learning_rate": 0.0001561482950927029, + "loss": 0.8084, + "step": 414 + }, + { + "epoch": 0.332, + "grad_norm": 0.4291710814932179, + "learning_rate": 0.00015593361568561428, + "loss": 0.6774, + "step": 415 + }, + { + "epoch": 0.3328, + "grad_norm": 0.3710638351771584, + "learning_rate": 0.00015571856046954285, + "loss": 0.6896, + "step": 416 + }, + { + "epoch": 0.3336, + "grad_norm": 0.47217298232188626, + "learning_rate": 0.0001555031308894101, + "loss": 0.8498, + "step": 417 + }, + { + "epoch": 0.3344, + "grad_norm": 0.5599795121408392, + "learning_rate": 0.00015528732839265272, + "loss": 0.8025, + "step": 418 + }, + { + "epoch": 0.3352, + "grad_norm": 0.4820028429977975, + "learning_rate": 0.0001550711544292131, + "loss": 0.8019, + "step": 419 + }, + { + "epoch": 0.336, + "grad_norm": 0.8103269160239212, + "learning_rate": 0.0001548546104515294, + "loss": 0.8533, + "step": 420 + }, + { + "epoch": 0.3368, + "grad_norm": 0.43886002005983965, + "learning_rate": 0.00015463769791452574, + "loss": 0.7961, + "step": 421 + }, + { + "epoch": 0.3376, + "grad_norm": 0.4264274993725706, + "learning_rate": 0.00015442041827560274, + "loss": 0.7368, + "step": 422 + }, + { + "epoch": 0.3384, + "grad_norm": 0.37652450930794035, + "learning_rate": 0.00015420277299462736, + "loss": 0.6803, + "step": 423 + }, + { + "epoch": 0.3392, + "grad_norm": 0.4843136136830526, + "learning_rate": 0.00015398476353392323, + "loss": 0.7653, + "step": 424 + }, + { + "epoch": 0.34, + "grad_norm": 0.4645581295541479, + "learning_rate": 0.00015376639135826107, + "loss": 0.7339, + "step": 425 + }, + { + "epoch": 0.3408, + "grad_norm": 0.4039874199374106, + "learning_rate": 0.00015354765793484834, + "loss": 0.7783, + "step": 426 + }, + { + "epoch": 0.3416, + "grad_norm": 0.3968349526012385, + "learning_rate": 0.00015332856473331978, + "loss": 0.7276, + "step": 427 + }, + { + "epoch": 0.3424, + "grad_norm": 0.4033776124012642, + "learning_rate": 0.00015310911322572753, + "loss": 0.7334, + "step": 428 + }, + { + "epoch": 0.3432, + "grad_norm": 0.44970638069998753, + "learning_rate": 0.00015288930488653094, + "loss": 0.7736, + "step": 429 + }, + { + "epoch": 0.344, + "grad_norm": 0.5501628335450935, + "learning_rate": 0.000152669141192587, + "loss": 0.7011, + "step": 430 + }, + { + "epoch": 0.3448, + "grad_norm": 0.5046304397885234, + "learning_rate": 0.0001524486236231402, + "loss": 0.7764, + "step": 431 + }, + { + "epoch": 0.3456, + "grad_norm": 0.40623832410355104, + "learning_rate": 0.00015222775365981273, + "loss": 0.7166, + "step": 432 + }, + { + "epoch": 0.3464, + "grad_norm": 0.4276686115329016, + "learning_rate": 0.00015200653278659432, + "loss": 0.7184, + "step": 433 + }, + { + "epoch": 0.3472, + "grad_norm": 0.423668162878283, + "learning_rate": 0.00015178496248983254, + "loss": 0.7227, + "step": 434 + }, + { + "epoch": 0.348, + "grad_norm": 0.3927314621240658, + "learning_rate": 0.00015156304425822267, + "loss": 0.69, + "step": 435 + }, + { + "epoch": 0.3488, + "grad_norm": 0.5119999368926385, + "learning_rate": 0.00015134077958279765, + "loss": 0.7183, + "step": 436 + }, + { + "epoch": 0.3496, + "grad_norm": 0.43203093657186026, + "learning_rate": 0.00015111816995691809, + "loss": 0.743, + "step": 437 + }, + { + "epoch": 0.3504, + "grad_norm": 0.46063329097074923, + "learning_rate": 0.00015089521687626243, + "loss": 0.8199, + "step": 438 + }, + { + "epoch": 0.3512, + "grad_norm": 0.41210379418090887, + "learning_rate": 0.00015067192183881658, + "loss": 0.7144, + "step": 439 + }, + { + "epoch": 0.352, + "grad_norm": 0.40093045861548976, + "learning_rate": 0.000150448286344864, + "loss": 0.7849, + "step": 440 + }, + { + "epoch": 0.3528, + "grad_norm": 0.6220898454077841, + "learning_rate": 0.00015022431189697568, + "loss": 0.8912, + "step": 441 + }, + { + "epoch": 0.3536, + "grad_norm": 0.4973328165816475, + "learning_rate": 0.00015000000000000001, + "loss": 0.8178, + "step": 442 + }, + { + "epoch": 0.3544, + "grad_norm": 0.418748668488691, + "learning_rate": 0.0001497753521610526, + "loss": 0.6893, + "step": 443 + }, + { + "epoch": 0.3552, + "grad_norm": 0.4275334770458131, + "learning_rate": 0.00014955036988950618, + "loss": 0.7458, + "step": 444 + }, + { + "epoch": 0.356, + "grad_norm": 0.4258922571938356, + "learning_rate": 0.00014932505469698052, + "loss": 0.7532, + "step": 445 + }, + { + "epoch": 0.3568, + "grad_norm": 0.47720760098269094, + "learning_rate": 0.00014909940809733222, + "loss": 0.8392, + "step": 446 + }, + { + "epoch": 0.3576, + "grad_norm": 0.44154393073423315, + "learning_rate": 0.0001488734316066446, + "loss": 0.7193, + "step": 447 + }, + { + "epoch": 0.3584, + "grad_norm": 0.5401432877123872, + "learning_rate": 0.00014864712674321734, + "loss": 0.843, + "step": 448 + }, + { + "epoch": 0.3592, + "grad_norm": 0.5545136321727958, + "learning_rate": 0.0001484204950275565, + "loss": 0.8185, + "step": 449 + }, + { + "epoch": 0.36, + "grad_norm": 0.5771994605485644, + "learning_rate": 0.00014819353798236427, + "loss": 0.8423, + "step": 450 + }, + { + "epoch": 0.3608, + "grad_norm": 0.44985387889753464, + "learning_rate": 0.00014796625713252848, + "loss": 0.683, + "step": 451 + }, + { + "epoch": 0.3616, + "grad_norm": 0.3804634086933775, + "learning_rate": 0.00014773865400511272, + "loss": 0.6856, + "step": 452 + }, + { + "epoch": 0.3624, + "grad_norm": 0.4327700820962517, + "learning_rate": 0.00014751073012934587, + "loss": 0.7165, + "step": 453 + }, + { + "epoch": 0.3632, + "grad_norm": 0.4595450183265034, + "learning_rate": 0.00014728248703661182, + "loss": 0.7779, + "step": 454 + }, + { + "epoch": 0.364, + "grad_norm": 0.41463583124442244, + "learning_rate": 0.0001470539262604393, + "loss": 0.6957, + "step": 455 + }, + { + "epoch": 0.3648, + "grad_norm": 0.4330408248305494, + "learning_rate": 0.00014682504933649144, + "loss": 0.7895, + "step": 456 + }, + { + "epoch": 0.3656, + "grad_norm": 0.4300916093302638, + "learning_rate": 0.00014659585780255556, + "loss": 0.6724, + "step": 457 + }, + { + "epoch": 0.3664, + "grad_norm": 0.44397586704291414, + "learning_rate": 0.00014636635319853275, + "loss": 0.743, + "step": 458 + }, + { + "epoch": 0.3672, + "grad_norm": 0.40246850409442514, + "learning_rate": 0.0001461365370664276, + "loss": 0.7071, + "step": 459 + }, + { + "epoch": 0.368, + "grad_norm": 0.45024266081088654, + "learning_rate": 0.00014590641095033787, + "loss": 0.739, + "step": 460 + }, + { + "epoch": 0.3688, + "grad_norm": 0.4682043754286765, + "learning_rate": 0.00014567597639644387, + "loss": 0.8009, + "step": 461 + }, + { + "epoch": 0.3696, + "grad_norm": 0.48680932957793543, + "learning_rate": 0.00014544523495299842, + "loss": 0.7452, + "step": 462 + }, + { + "epoch": 0.3704, + "grad_norm": 0.41878898691483496, + "learning_rate": 0.00014521418817031628, + "loss": 0.7418, + "step": 463 + }, + { + "epoch": 0.3712, + "grad_norm": 0.4319413092291526, + "learning_rate": 0.0001449828376007636, + "loss": 0.6993, + "step": 464 + }, + { + "epoch": 0.372, + "grad_norm": 0.39720449382891126, + "learning_rate": 0.00014475118479874774, + "loss": 0.7198, + "step": 465 + }, + { + "epoch": 0.3728, + "grad_norm": 0.46131919117475345, + "learning_rate": 0.0001445192313207067, + "loss": 0.7732, + "step": 466 + }, + { + "epoch": 0.3736, + "grad_norm": 0.47845851428090297, + "learning_rate": 0.0001442869787250987, + "loss": 0.7577, + "step": 467 + }, + { + "epoch": 0.3744, + "grad_norm": 0.4193860309688804, + "learning_rate": 0.0001440544285723915, + "loss": 0.6731, + "step": 468 + }, + { + "epoch": 0.3752, + "grad_norm": 0.5199556138782738, + "learning_rate": 0.00014382158242505234, + "loss": 0.8273, + "step": 469 + }, + { + "epoch": 0.376, + "grad_norm": 0.443457538816095, + "learning_rate": 0.00014358844184753712, + "loss": 0.7714, + "step": 470 + }, + { + "epoch": 0.3768, + "grad_norm": 0.4982474765380297, + "learning_rate": 0.00014335500840627986, + "loss": 0.7796, + "step": 471 + }, + { + "epoch": 0.3776, + "grad_norm": 0.45278238492691764, + "learning_rate": 0.00014312128366968243, + "loss": 0.7355, + "step": 472 + }, + { + "epoch": 0.3784, + "grad_norm": 0.4328358546149195, + "learning_rate": 0.0001428872692081038, + "loss": 0.765, + "step": 473 + }, + { + "epoch": 0.3792, + "grad_norm": 0.39592843909567915, + "learning_rate": 0.00014265296659384956, + "loss": 0.7776, + "step": 474 + }, + { + "epoch": 0.38, + "grad_norm": 0.46452178512022113, + "learning_rate": 0.00014241837740116132, + "loss": 0.7896, + "step": 475 + }, + { + "epoch": 0.3808, + "grad_norm": 0.41833633625501077, + "learning_rate": 0.00014218350320620624, + "loss": 0.7048, + "step": 476 + }, + { + "epoch": 0.3816, + "grad_norm": 0.514852927784867, + "learning_rate": 0.00014194834558706632, + "loss": 0.7816, + "step": 477 + }, + { + "epoch": 0.3824, + "grad_norm": 0.5257797374413313, + "learning_rate": 0.0001417129061237278, + "loss": 0.8428, + "step": 478 + }, + { + "epoch": 0.3832, + "grad_norm": 0.45636398078426904, + "learning_rate": 0.0001414771863980707, + "loss": 0.8619, + "step": 479 + }, + { + "epoch": 0.384, + "grad_norm": 0.4502039476106067, + "learning_rate": 0.00014124118799385796, + "loss": 0.7963, + "step": 480 + }, + { + "epoch": 0.3848, + "grad_norm": 0.4243820031639593, + "learning_rate": 0.00014100491249672498, + "loss": 0.784, + "step": 481 + }, + { + "epoch": 0.3856, + "grad_norm": 0.5184189715289644, + "learning_rate": 0.00014076836149416887, + "loss": 0.7735, + "step": 482 + }, + { + "epoch": 0.3864, + "grad_norm": 0.4547892891311182, + "learning_rate": 0.0001405315365755379, + "loss": 0.7617, + "step": 483 + }, + { + "epoch": 0.3872, + "grad_norm": 0.48233412931694103, + "learning_rate": 0.0001402944393320206, + "loss": 0.7788, + "step": 484 + }, + { + "epoch": 0.388, + "grad_norm": 0.40083273522540647, + "learning_rate": 0.00014005707135663527, + "loss": 0.7021, + "step": 485 + }, + { + "epoch": 0.3888, + "grad_norm": 0.6194283397199801, + "learning_rate": 0.00013981943424421932, + "loss": 0.726, + "step": 486 + }, + { + "epoch": 0.3896, + "grad_norm": 0.39974681678200796, + "learning_rate": 0.00013958152959141825, + "loss": 0.7531, + "step": 487 + }, + { + "epoch": 0.3904, + "grad_norm": 0.4422340111807094, + "learning_rate": 0.00013934335899667527, + "loss": 0.822, + "step": 488 + }, + { + "epoch": 0.3912, + "grad_norm": 0.3887067782631618, + "learning_rate": 0.00013910492406022033, + "loss": 0.6595, + "step": 489 + }, + { + "epoch": 0.392, + "grad_norm": 0.553404541087564, + "learning_rate": 0.00013886622638405952, + "loss": 0.894, + "step": 490 + }, + { + "epoch": 0.3928, + "grad_norm": 0.41896343138385883, + "learning_rate": 0.0001386272675719642, + "loss": 0.7361, + "step": 491 + }, + { + "epoch": 0.3936, + "grad_norm": 0.3738436164502759, + "learning_rate": 0.00013838804922946027, + "loss": 0.7367, + "step": 492 + }, + { + "epoch": 0.3944, + "grad_norm": 0.4483262760759903, + "learning_rate": 0.00013814857296381728, + "loss": 0.815, + "step": 493 + }, + { + "epoch": 0.3952, + "grad_norm": 0.4395461639913433, + "learning_rate": 0.00013790884038403795, + "loss": 0.7917, + "step": 494 + }, + { + "epoch": 0.396, + "grad_norm": 0.49483075047931446, + "learning_rate": 0.00013766885310084688, + "loss": 0.7537, + "step": 495 + }, + { + "epoch": 0.3968, + "grad_norm": 0.44992977192842065, + "learning_rate": 0.00013742861272668012, + "loss": 0.7668, + "step": 496 + }, + { + "epoch": 0.3976, + "grad_norm": 0.41950839692838165, + "learning_rate": 0.00013718812087567414, + "loss": 0.6612, + "step": 497 + }, + { + "epoch": 0.3984, + "grad_norm": 0.4087702174450992, + "learning_rate": 0.00013694737916365517, + "loss": 0.6966, + "step": 498 + }, + { + "epoch": 0.3992, + "grad_norm": 0.4183706402748134, + "learning_rate": 0.000136706389208128, + "loss": 0.6523, + "step": 499 + }, + { + "epoch": 0.4, + "grad_norm": 0.42319846444919756, + "learning_rate": 0.00013646515262826552, + "loss": 0.7032, + "step": 500 + }, + { + "epoch": 0.4008, + "grad_norm": 0.38483029451499023, + "learning_rate": 0.00013622367104489756, + "loss": 0.6604, + "step": 501 + }, + { + "epoch": 0.4016, + "grad_norm": 0.4034412002610339, + "learning_rate": 0.0001359819460805001, + "loss": 0.7194, + "step": 502 + }, + { + "epoch": 0.4024, + "grad_norm": 0.4929472362741926, + "learning_rate": 0.0001357399793591844, + "loss": 0.7285, + "step": 503 + }, + { + "epoch": 0.4032, + "grad_norm": 0.42801487666111165, + "learning_rate": 0.0001354977725066859, + "loss": 0.7555, + "step": 504 + }, + { + "epoch": 0.404, + "grad_norm": 0.5219648385158142, + "learning_rate": 0.00013525532715035366, + "loss": 0.7975, + "step": 505 + }, + { + "epoch": 0.4048, + "grad_norm": 0.5078211284193989, + "learning_rate": 0.00013501264491913906, + "loss": 0.8362, + "step": 506 + }, + { + "epoch": 0.4056, + "grad_norm": 0.39187948139759565, + "learning_rate": 0.00013476972744358507, + "loss": 0.6311, + "step": 507 + }, + { + "epoch": 0.4064, + "grad_norm": 0.45280979149866263, + "learning_rate": 0.0001345265763558152, + "loss": 0.7127, + "step": 508 + }, + { + "epoch": 0.4072, + "grad_norm": 0.49302221491791043, + "learning_rate": 0.00013428319328952253, + "loss": 0.7644, + "step": 509 + }, + { + "epoch": 0.408, + "grad_norm": 0.4348177938826427, + "learning_rate": 0.00013403957987995882, + "loss": 0.7979, + "step": 510 + }, + { + "epoch": 0.4088, + "grad_norm": 0.4406643805434921, + "learning_rate": 0.0001337957377639235, + "loss": 0.7121, + "step": 511 + }, + { + "epoch": 0.4096, + "grad_norm": 0.48066300539551704, + "learning_rate": 0.0001335516685797525, + "loss": 0.7621, + "step": 512 + }, + { + "epoch": 0.4104, + "grad_norm": 0.45414616840784405, + "learning_rate": 0.0001333073739673076, + "loss": 0.7462, + "step": 513 + }, + { + "epoch": 0.4112, + "grad_norm": 0.3984240043819688, + "learning_rate": 0.00013306285556796495, + "loss": 0.6972, + "step": 514 + }, + { + "epoch": 0.412, + "grad_norm": 0.6071295742863728, + "learning_rate": 0.0001328181150246045, + "loss": 0.7026, + "step": 515 + }, + { + "epoch": 0.4128, + "grad_norm": 0.4134558242804434, + "learning_rate": 0.00013257315398159864, + "loss": 0.686, + "step": 516 + }, + { + "epoch": 0.4136, + "grad_norm": 0.35424121409366033, + "learning_rate": 0.00013232797408480127, + "loss": 0.6849, + "step": 517 + }, + { + "epoch": 0.4144, + "grad_norm": 0.40853146718648076, + "learning_rate": 0.00013208257698153677, + "loss": 0.7584, + "step": 518 + }, + { + "epoch": 0.4152, + "grad_norm": 0.4504575641258578, + "learning_rate": 0.00013183696432058888, + "loss": 0.7417, + "step": 519 + }, + { + "epoch": 0.416, + "grad_norm": 0.4048789202428273, + "learning_rate": 0.00013159113775218964, + "loss": 0.7273, + "step": 520 + }, + { + "epoch": 0.4168, + "grad_norm": 0.5159833545425263, + "learning_rate": 0.00013134509892800822, + "loss": 0.7396, + "step": 521 + }, + { + "epoch": 0.4176, + "grad_norm": 0.45391289698427356, + "learning_rate": 0.00013109884950114007, + "loss": 0.7066, + "step": 522 + }, + { + "epoch": 0.4184, + "grad_norm": 0.5330981927253413, + "learning_rate": 0.00013085239112609547, + "loss": 0.7406, + "step": 523 + }, + { + "epoch": 0.4192, + "grad_norm": 0.4552377694570901, + "learning_rate": 0.00013060572545878875, + "loss": 0.7413, + "step": 524 + }, + { + "epoch": 0.42, + "grad_norm": 0.4220690556860606, + "learning_rate": 0.00013035885415652685, + "loss": 0.7691, + "step": 525 + }, + { + "epoch": 0.4208, + "grad_norm": 0.4156735467250489, + "learning_rate": 0.00013011177887799845, + "loss": 0.737, + "step": 526 + }, + { + "epoch": 0.4216, + "grad_norm": 0.4849548220572441, + "learning_rate": 0.00012986450128326266, + "loss": 0.802, + "step": 527 + }, + { + "epoch": 0.4224, + "grad_norm": 0.3821922408993975, + "learning_rate": 0.00012961702303373795, + "loss": 0.7042, + "step": 528 + }, + { + "epoch": 0.4232, + "grad_norm": 0.4391419153908085, + "learning_rate": 0.00012936934579219094, + "loss": 0.7476, + "step": 529 + }, + { + "epoch": 0.424, + "grad_norm": 0.48605094281470546, + "learning_rate": 0.00012912147122272523, + "loss": 0.7299, + "step": 530 + }, + { + "epoch": 0.4248, + "grad_norm": 0.44719999619049045, + "learning_rate": 0.00012887340099077024, + "loss": 0.6288, + "step": 531 + }, + { + "epoch": 0.4256, + "grad_norm": 0.37042463200431835, + "learning_rate": 0.00012862513676307008, + "loss": 0.6195, + "step": 532 + }, + { + "epoch": 0.4264, + "grad_norm": 0.44775857852904416, + "learning_rate": 0.0001283766802076722, + "loss": 0.7556, + "step": 533 + }, + { + "epoch": 0.4272, + "grad_norm": 0.42829522654490987, + "learning_rate": 0.00012812803299391628, + "loss": 0.681, + "step": 534 + }, + { + "epoch": 0.428, + "grad_norm": 0.5928330924887514, + "learning_rate": 0.00012787919679242306, + "loss": 0.8601, + "step": 535 + }, + { + "epoch": 0.4288, + "grad_norm": 0.4594659671403964, + "learning_rate": 0.00012763017327508305, + "loss": 0.7738, + "step": 536 + }, + { + "epoch": 0.4296, + "grad_norm": 0.4465430347318887, + "learning_rate": 0.00012738096411504522, + "loss": 0.7374, + "step": 537 + }, + { + "epoch": 0.4304, + "grad_norm": 0.4268335673175506, + "learning_rate": 0.0001271315709867059, + "loss": 0.709, + "step": 538 + }, + { + "epoch": 0.4312, + "grad_norm": 0.41663005710297984, + "learning_rate": 0.00012688199556569753, + "loss": 0.7658, + "step": 539 + }, + { + "epoch": 0.432, + "grad_norm": 0.5040094125671729, + "learning_rate": 0.00012663223952887723, + "loss": 0.7884, + "step": 540 + }, + { + "epoch": 0.4328, + "grad_norm": 0.5921053420891155, + "learning_rate": 0.0001263823045543158, + "loss": 0.7267, + "step": 541 + }, + { + "epoch": 0.4336, + "grad_norm": 0.46146114546649125, + "learning_rate": 0.00012613219232128608, + "loss": 0.7407, + "step": 542 + }, + { + "epoch": 0.4344, + "grad_norm": 0.524385845131032, + "learning_rate": 0.00012588190451025207, + "loss": 0.7407, + "step": 543 + }, + { + "epoch": 0.4352, + "grad_norm": 0.3908346833006848, + "learning_rate": 0.00012563144280285741, + "loss": 0.6748, + "step": 544 + }, + { + "epoch": 0.436, + "grad_norm": 0.6078837577120809, + "learning_rate": 0.00012538080888191408, + "loss": 0.8619, + "step": 545 + }, + { + "epoch": 0.4368, + "grad_norm": 0.5322345512743014, + "learning_rate": 0.00012513000443139112, + "loss": 0.8594, + "step": 546 + }, + { + "epoch": 0.4376, + "grad_norm": 0.4634894284627779, + "learning_rate": 0.00012487903113640337, + "loss": 0.7469, + "step": 547 + }, + { + "epoch": 0.4384, + "grad_norm": 0.39088436223660744, + "learning_rate": 0.00012462789068320017, + "loss": 0.6933, + "step": 548 + }, + { + "epoch": 0.4392, + "grad_norm": 0.6228952211808515, + "learning_rate": 0.00012437658475915377, + "loss": 0.924, + "step": 549 + }, + { + "epoch": 0.44, + "grad_norm": 0.39138157660741213, + "learning_rate": 0.00012412511505274844, + "loss": 0.7763, + "step": 550 + }, + { + "epoch": 0.4408, + "grad_norm": 0.3461019712679376, + "learning_rate": 0.00012387348325356874, + "loss": 0.701, + "step": 551 + }, + { + "epoch": 0.4416, + "grad_norm": 0.4471624419737993, + "learning_rate": 0.00012362169105228826, + "loss": 0.7772, + "step": 552 + }, + { + "epoch": 0.4424, + "grad_norm": 0.4150378407307194, + "learning_rate": 0.00012336974014065844, + "loss": 0.8258, + "step": 553 + }, + { + "epoch": 0.4432, + "grad_norm": 0.37942087611900394, + "learning_rate": 0.000123117632211497, + "loss": 0.7186, + "step": 554 + }, + { + "epoch": 0.444, + "grad_norm": 0.47123832417396183, + "learning_rate": 0.00012286536895867654, + "loss": 0.8035, + "step": 555 + }, + { + "epoch": 0.4448, + "grad_norm": 0.4504207833314274, + "learning_rate": 0.00012261295207711346, + "loss": 0.7521, + "step": 556 + }, + { + "epoch": 0.4456, + "grad_norm": 0.4428425121890588, + "learning_rate": 0.00012236038326275626, + "loss": 0.7257, + "step": 557 + }, + { + "epoch": 0.4464, + "grad_norm": 0.4015281007614013, + "learning_rate": 0.0001221076642125742, + "loss": 0.7097, + "step": 558 + }, + { + "epoch": 0.4472, + "grad_norm": 0.4478899906730631, + "learning_rate": 0.00012185479662454595, + "loss": 0.7757, + "step": 559 + }, + { + "epoch": 0.448, + "grad_norm": 0.5119729739688551, + "learning_rate": 0.00012160178219764837, + "loss": 0.7681, + "step": 560 + }, + { + "epoch": 0.4488, + "grad_norm": 0.40673486902303113, + "learning_rate": 0.00012134862263184467, + "loss": 0.7142, + "step": 561 + }, + { + "epoch": 0.4496, + "grad_norm": 0.3726644043800367, + "learning_rate": 0.00012109531962807332, + "loss": 0.7259, + "step": 562 + }, + { + "epoch": 0.4504, + "grad_norm": 0.4138540260403324, + "learning_rate": 0.00012084187488823657, + "loss": 0.722, + "step": 563 + }, + { + "epoch": 0.4512, + "grad_norm": 0.41276783524920124, + "learning_rate": 0.00012058829011518896, + "loss": 0.7666, + "step": 564 + }, + { + "epoch": 0.452, + "grad_norm": 0.3931117358126695, + "learning_rate": 0.00012033456701272576, + "loss": 0.7441, + "step": 565 + }, + { + "epoch": 0.4528, + "grad_norm": 0.5149228654463832, + "learning_rate": 0.00012008070728557186, + "loss": 0.7127, + "step": 566 + }, + { + "epoch": 0.4536, + "grad_norm": 0.38730792703104727, + "learning_rate": 0.00011982671263936995, + "loss": 0.6845, + "step": 567 + }, + { + "epoch": 0.4544, + "grad_norm": 0.5042696123128442, + "learning_rate": 0.00011957258478066931, + "loss": 0.7188, + "step": 568 + }, + { + "epoch": 0.4552, + "grad_norm": 0.5211459225121182, + "learning_rate": 0.00011931832541691418, + "loss": 0.7415, + "step": 569 + }, + { + "epoch": 0.456, + "grad_norm": 0.47237500746277383, + "learning_rate": 0.00011906393625643244, + "loss": 0.6767, + "step": 570 + }, + { + "epoch": 0.4568, + "grad_norm": 0.44101750359701125, + "learning_rate": 0.00011880941900842397, + "loss": 0.7766, + "step": 571 + }, + { + "epoch": 0.4576, + "grad_norm": 0.4376179162106715, + "learning_rate": 0.00011855477538294935, + "loss": 0.6794, + "step": 572 + }, + { + "epoch": 0.4584, + "grad_norm": 0.443132823304689, + "learning_rate": 0.00011830000709091815, + "loss": 0.7069, + "step": 573 + }, + { + "epoch": 0.4592, + "grad_norm": 0.4121264584438154, + "learning_rate": 0.00011804511584407763, + "loss": 0.7014, + "step": 574 + }, + { + "epoch": 0.46, + "grad_norm": 0.5438632693866221, + "learning_rate": 0.0001177901033550012, + "loss": 0.7113, + "step": 575 + }, + { + "epoch": 0.4608, + "grad_norm": 0.4363082522071395, + "learning_rate": 0.00011753497133707679, + "loss": 0.6613, + "step": 576 + }, + { + "epoch": 0.4616, + "grad_norm": 0.5094548228524022, + "learning_rate": 0.00011727972150449544, + "loss": 0.7701, + "step": 577 + }, + { + "epoch": 0.4624, + "grad_norm": 0.42663679470568217, + "learning_rate": 0.00011702435557223987, + "loss": 0.7509, + "step": 578 + }, + { + "epoch": 0.4632, + "grad_norm": 0.371674990751324, + "learning_rate": 0.00011676887525607271, + "loss": 0.7048, + "step": 579 + }, + { + "epoch": 0.464, + "grad_norm": 0.42718377435661903, + "learning_rate": 0.00011651328227252517, + "loss": 0.7327, + "step": 580 + }, + { + "epoch": 0.4648, + "grad_norm": 0.4654016531114095, + "learning_rate": 0.00011625757833888551, + "loss": 0.7099, + "step": 581 + }, + { + "epoch": 0.4656, + "grad_norm": 0.42069923667116677, + "learning_rate": 0.00011600176517318741, + "loss": 0.766, + "step": 582 + }, + { + "epoch": 0.4664, + "grad_norm": 0.5169680150545506, + "learning_rate": 0.0001157458444941984, + "loss": 0.7042, + "step": 583 + }, + { + "epoch": 0.4672, + "grad_norm": 0.523955961930124, + "learning_rate": 0.00011548981802140848, + "loss": 0.7869, + "step": 584 + }, + { + "epoch": 0.468, + "grad_norm": 0.4385714543537812, + "learning_rate": 0.00011523368747501839, + "loss": 0.8399, + "step": 585 + }, + { + "epoch": 0.4688, + "grad_norm": 0.6340599411671402, + "learning_rate": 0.00011497745457592816, + "loss": 0.7222, + "step": 586 + }, + { + "epoch": 0.4696, + "grad_norm": 0.47338371501262316, + "learning_rate": 0.00011472112104572547, + "loss": 0.6802, + "step": 587 + }, + { + "epoch": 0.4704, + "grad_norm": 0.4151024888514726, + "learning_rate": 0.00011446468860667421, + "loss": 0.7212, + "step": 588 + }, + { + "epoch": 0.4712, + "grad_norm": 0.4266234124682058, + "learning_rate": 0.0001142081589817027, + "loss": 0.6584, + "step": 589 + }, + { + "epoch": 0.472, + "grad_norm": 0.43762734425987504, + "learning_rate": 0.00011395153389439233, + "loss": 0.7147, + "step": 590 + }, + { + "epoch": 0.4728, + "grad_norm": 0.4134839753893968, + "learning_rate": 0.00011369481506896582, + "loss": 0.7353, + "step": 591 + }, + { + "epoch": 0.4736, + "grad_norm": 0.40694547601123865, + "learning_rate": 0.00011343800423027582, + "loss": 0.7192, + "step": 592 + }, + { + "epoch": 0.4744, + "grad_norm": 0.3564513017284066, + "learning_rate": 0.00011318110310379301, + "loss": 0.6465, + "step": 593 + }, + { + "epoch": 0.4752, + "grad_norm": 0.3845545303416391, + "learning_rate": 0.0001129241134155949, + "loss": 0.7286, + "step": 594 + }, + { + "epoch": 0.476, + "grad_norm": 0.4163438169383608, + "learning_rate": 0.00011266703689235394, + "loss": 0.6846, + "step": 595 + }, + { + "epoch": 0.4768, + "grad_norm": 0.47824121432178024, + "learning_rate": 0.00011240987526132594, + "loss": 0.6886, + "step": 596 + }, + { + "epoch": 0.4776, + "grad_norm": 0.5266270866168399, + "learning_rate": 0.00011215263025033869, + "loss": 0.7648, + "step": 597 + }, + { + "epoch": 0.4784, + "grad_norm": 0.5822376742880172, + "learning_rate": 0.00011189530358778005, + "loss": 0.8485, + "step": 598 + }, + { + "epoch": 0.4792, + "grad_norm": 0.5542816264186629, + "learning_rate": 0.00011163789700258655, + "loss": 0.7644, + "step": 599 + }, + { + "epoch": 0.48, + "grad_norm": 0.45585766253161186, + "learning_rate": 0.00011138041222423177, + "loss": 0.7989, + "step": 600 + }, + { + "epoch": 0.4808, + "grad_norm": 0.4333560334878548, + "learning_rate": 0.00011112285098271451, + "loss": 0.7552, + "step": 601 + }, + { + "epoch": 0.4816, + "grad_norm": 0.4609090181524356, + "learning_rate": 0.00011086521500854745, + "loss": 0.7392, + "step": 602 + }, + { + "epoch": 0.4824, + "grad_norm": 0.4800939290535579, + "learning_rate": 0.00011060750603274535, + "loss": 0.7406, + "step": 603 + }, + { + "epoch": 0.4832, + "grad_norm": 0.39598022061191823, + "learning_rate": 0.00011034972578681338, + "loss": 0.6952, + "step": 604 + }, + { + "epoch": 0.484, + "grad_norm": 0.5500799247212219, + "learning_rate": 0.00011009187600273566, + "loss": 0.7921, + "step": 605 + }, + { + "epoch": 0.4848, + "grad_norm": 0.47244542913085974, + "learning_rate": 0.00010983395841296348, + "loss": 0.8139, + "step": 606 + }, + { + "epoch": 0.4856, + "grad_norm": 0.42058907283470487, + "learning_rate": 0.00010957597475040373, + "loss": 0.7463, + "step": 607 + }, + { + "epoch": 0.4864, + "grad_norm": 0.5014730249496743, + "learning_rate": 0.00010931792674840718, + "loss": 0.7452, + "step": 608 + }, + { + "epoch": 0.4872, + "grad_norm": 0.4745672807425004, + "learning_rate": 0.00010905981614075693, + "loss": 0.781, + "step": 609 + }, + { + "epoch": 0.488, + "grad_norm": 0.5376525537845632, + "learning_rate": 0.00010880164466165674, + "loss": 0.8153, + "step": 610 + }, + { + "epoch": 0.4888, + "grad_norm": 0.4653577531089348, + "learning_rate": 0.00010854341404571928, + "loss": 0.7893, + "step": 611 + }, + { + "epoch": 0.4896, + "grad_norm": 0.4538491457245736, + "learning_rate": 0.00010828512602795462, + "loss": 0.7392, + "step": 612 + }, + { + "epoch": 0.4904, + "grad_norm": 0.5227869138818918, + "learning_rate": 0.00010802678234375851, + "loss": 0.8347, + "step": 613 + }, + { + "epoch": 0.4912, + "grad_norm": 0.4550754633589422, + "learning_rate": 0.00010776838472890065, + "loss": 0.7233, + "step": 614 + }, + { + "epoch": 0.492, + "grad_norm": 0.36374678340939803, + "learning_rate": 0.0001075099349195131, + "loss": 0.6873, + "step": 615 + }, + { + "epoch": 0.4928, + "grad_norm": 0.4656775039023448, + "learning_rate": 0.00010725143465207867, + "loss": 0.7299, + "step": 616 + }, + { + "epoch": 0.4936, + "grad_norm": 0.49469206366660884, + "learning_rate": 0.00010699288566341914, + "loss": 0.772, + "step": 617 + }, + { + "epoch": 0.4944, + "grad_norm": 0.392463643989344, + "learning_rate": 0.00010673428969068364, + "loss": 0.7124, + "step": 618 + }, + { + "epoch": 0.4952, + "grad_norm": 0.4133034375857957, + "learning_rate": 0.000106475648471337, + "loss": 0.7065, + "step": 619 + }, + { + "epoch": 0.496, + "grad_norm": 0.4821380238744263, + "learning_rate": 0.00010621696374314807, + "loss": 0.7867, + "step": 620 + }, + { + "epoch": 0.4968, + "grad_norm": 0.5300861647192535, + "learning_rate": 0.00010595823724417795, + "loss": 0.8107, + "step": 621 + }, + { + "epoch": 0.4976, + "grad_norm": 0.4491881480521485, + "learning_rate": 0.00010569947071276847, + "loss": 0.7575, + "step": 622 + }, + { + "epoch": 0.4984, + "grad_norm": 0.46263469968642656, + "learning_rate": 0.00010544066588753044, + "loss": 0.7372, + "step": 623 + }, + { + "epoch": 0.4992, + "grad_norm": 0.46276631407191143, + "learning_rate": 0.00010518182450733186, + "loss": 0.7248, + "step": 624 + }, + { + "epoch": 0.5, + "grad_norm": 0.477287224280608, + "learning_rate": 0.00010492294831128641, + "loss": 0.787, + "step": 625 + }, + { + "epoch": 0.5008, + "grad_norm": 0.41135177209160717, + "learning_rate": 0.00010466403903874176, + "loss": 0.7162, + "step": 626 + }, + { + "epoch": 0.5016, + "grad_norm": 0.3898977361266114, + "learning_rate": 0.00010440509842926767, + "loss": 0.6744, + "step": 627 + }, + { + "epoch": 0.5024, + "grad_norm": 0.47144778165038753, + "learning_rate": 0.00010414612822264455, + "loss": 0.8, + "step": 628 + }, + { + "epoch": 0.5032, + "grad_norm": 0.45383248559923595, + "learning_rate": 0.00010388713015885161, + "loss": 0.7055, + "step": 629 + }, + { + "epoch": 0.504, + "grad_norm": 0.4265627843656265, + "learning_rate": 0.00010362810597805526, + "loss": 0.7289, + "step": 630 + }, + { + "epoch": 0.5048, + "grad_norm": 0.4077987016103004, + "learning_rate": 0.00010336905742059742, + "loss": 0.7298, + "step": 631 + }, + { + "epoch": 0.5056, + "grad_norm": 0.4991524391829501, + "learning_rate": 0.0001031099862269837, + "loss": 0.7616, + "step": 632 + }, + { + "epoch": 0.5064, + "grad_norm": 0.4473279610179817, + "learning_rate": 0.0001028508941378719, + "loss": 0.7199, + "step": 633 + }, + { + "epoch": 0.5072, + "grad_norm": 0.433327670096491, + "learning_rate": 0.00010259178289406011, + "loss": 0.7401, + "step": 634 + }, + { + "epoch": 0.508, + "grad_norm": 0.5549931816783576, + "learning_rate": 0.00010233265423647523, + "loss": 0.8405, + "step": 635 + }, + { + "epoch": 0.5088, + "grad_norm": 0.44976384207067494, + "learning_rate": 0.00010207350990616107, + "loss": 0.7796, + "step": 636 + }, + { + "epoch": 0.5096, + "grad_norm": 0.40329785746260033, + "learning_rate": 0.00010181435164426676, + "loss": 0.6764, + "step": 637 + }, + { + "epoch": 0.5104, + "grad_norm": 0.4734808460625881, + "learning_rate": 0.0001015551811920351, + "loss": 0.678, + "step": 638 + }, + { + "epoch": 0.5112, + "grad_norm": 0.4324182697579957, + "learning_rate": 0.00010129600029079072, + "loss": 0.7169, + "step": 639 + }, + { + "epoch": 0.512, + "grad_norm": 0.5221590387949478, + "learning_rate": 0.00010103681068192845, + "loss": 0.7415, + "step": 640 + }, + { + "epoch": 0.5128, + "grad_norm": 0.5332044694962755, + "learning_rate": 0.00010077761410690172, + "loss": 0.8014, + "step": 641 + }, + { + "epoch": 0.5136, + "grad_norm": 0.571806435015935, + "learning_rate": 0.00010051841230721065, + "loss": 0.8016, + "step": 642 + }, + { + "epoch": 0.5144, + "grad_norm": 0.49967439182922213, + "learning_rate": 0.00010025920702439051, + "loss": 0.7302, + "step": 643 + }, + { + "epoch": 0.5152, + "grad_norm": 0.49118346108438604, + "learning_rate": 0.0001, + "loss": 0.6872, + "step": 644 + }, + { + "epoch": 0.516, + "grad_norm": 0.48913866368633124, + "learning_rate": 9.97407929756095e-05, + "loss": 0.7992, + "step": 645 + }, + { + "epoch": 0.5168, + "grad_norm": 0.47013137899782054, + "learning_rate": 9.948158769278939e-05, + "loss": 0.7257, + "step": 646 + }, + { + "epoch": 0.5176, + "grad_norm": 0.4658911806163373, + "learning_rate": 9.92223858930983e-05, + "loss": 0.7299, + "step": 647 + }, + { + "epoch": 0.5184, + "grad_norm": 0.39712521020306313, + "learning_rate": 9.896318931807155e-05, + "loss": 0.6794, + "step": 648 + }, + { + "epoch": 0.5192, + "grad_norm": 0.4180112431814468, + "learning_rate": 9.870399970920932e-05, + "loss": 0.677, + "step": 649 + }, + { + "epoch": 0.52, + "grad_norm": 0.4284928495017929, + "learning_rate": 9.844481880796491e-05, + "loss": 0.7733, + "step": 650 + }, + { + "epoch": 0.5208, + "grad_norm": 0.41243122791327375, + "learning_rate": 9.818564835573323e-05, + "loss": 0.701, + "step": 651 + }, + { + "epoch": 0.5216, + "grad_norm": 0.3688516156765538, + "learning_rate": 9.792649009383899e-05, + "loss": 0.6871, + "step": 652 + }, + { + "epoch": 0.5224, + "grad_norm": 0.4313396253823104, + "learning_rate": 9.766734576352478e-05, + "loss": 0.7334, + "step": 653 + }, + { + "epoch": 0.5232, + "grad_norm": 0.5298882466577094, + "learning_rate": 9.740821710593989e-05, + "loss": 0.7525, + "step": 654 + }, + { + "epoch": 0.524, + "grad_norm": 0.4753678963133096, + "learning_rate": 9.714910586212816e-05, + "loss": 0.7625, + "step": 655 + }, + { + "epoch": 0.5248, + "grad_norm": 0.4197428604140736, + "learning_rate": 9.689001377301633e-05, + "loss": 0.6601, + "step": 656 + }, + { + "epoch": 0.5256, + "grad_norm": 0.525079564441233, + "learning_rate": 9.663094257940258e-05, + "loss": 0.7347, + "step": 657 + }, + { + "epoch": 0.5264, + "grad_norm": 0.4677503452316048, + "learning_rate": 9.637189402194476e-05, + "loss": 0.7699, + "step": 658 + }, + { + "epoch": 0.5272, + "grad_norm": 0.42874889487505047, + "learning_rate": 9.611286984114841e-05, + "loss": 0.7174, + "step": 659 + }, + { + "epoch": 0.528, + "grad_norm": 0.4850354392599207, + "learning_rate": 9.585387177735547e-05, + "loss": 0.8163, + "step": 660 + }, + { + "epoch": 0.5288, + "grad_norm": 0.5325336062346893, + "learning_rate": 9.559490157073236e-05, + "loss": 0.7202, + "step": 661 + }, + { + "epoch": 0.5296, + "grad_norm": 0.3720437406408467, + "learning_rate": 9.533596096125825e-05, + "loss": 0.6578, + "step": 662 + }, + { + "epoch": 0.5304, + "grad_norm": 0.4637695460351074, + "learning_rate": 9.507705168871358e-05, + "loss": 0.715, + "step": 663 + }, + { + "epoch": 0.5312, + "grad_norm": 0.41919769770671306, + "learning_rate": 9.481817549266817e-05, + "loss": 0.6684, + "step": 664 + }, + { + "epoch": 0.532, + "grad_norm": 0.47399139930057216, + "learning_rate": 9.455933411246958e-05, + "loss": 0.7167, + "step": 665 + }, + { + "epoch": 0.5328, + "grad_norm": 0.4656387312722789, + "learning_rate": 9.430052928723153e-05, + "loss": 0.7297, + "step": 666 + }, + { + "epoch": 0.5336, + "grad_norm": 0.431769090576024, + "learning_rate": 9.404176275582208e-05, + "loss": 0.7581, + "step": 667 + }, + { + "epoch": 0.5344, + "grad_norm": 0.3803590215625145, + "learning_rate": 9.378303625685195e-05, + "loss": 0.6779, + "step": 668 + }, + { + "epoch": 0.5352, + "grad_norm": 0.5603234705750568, + "learning_rate": 9.352435152866298e-05, + "loss": 0.7024, + "step": 669 + }, + { + "epoch": 0.536, + "grad_norm": 0.5154067125968048, + "learning_rate": 9.326571030931637e-05, + "loss": 0.9104, + "step": 670 + }, + { + "epoch": 0.5368, + "grad_norm": 0.5549373309443812, + "learning_rate": 9.300711433658087e-05, + "loss": 0.815, + "step": 671 + }, + { + "epoch": 0.5376, + "grad_norm": 0.453591654438188, + "learning_rate": 9.274856534792138e-05, + "loss": 0.7581, + "step": 672 + }, + { + "epoch": 0.5384, + "grad_norm": 0.4401941647094853, + "learning_rate": 9.249006508048694e-05, + "loss": 0.6741, + "step": 673 + }, + { + "epoch": 0.5392, + "grad_norm": 0.40812457058737384, + "learning_rate": 9.223161527109937e-05, + "loss": 0.6743, + "step": 674 + }, + { + "epoch": 0.54, + "grad_norm": 0.3504583347093562, + "learning_rate": 9.197321765624152e-05, + "loss": 0.6269, + "step": 675 + }, + { + "epoch": 0.5408, + "grad_norm": 0.4825232064555932, + "learning_rate": 9.171487397204539e-05, + "loss": 0.7192, + "step": 676 + }, + { + "epoch": 0.5416, + "grad_norm": 0.4462020124779728, + "learning_rate": 9.145658595428074e-05, + "loss": 0.6748, + "step": 677 + }, + { + "epoch": 0.5424, + "grad_norm": 0.4621544035458387, + "learning_rate": 9.119835533834331e-05, + "loss": 0.7384, + "step": 678 + }, + { + "epoch": 0.5432, + "grad_norm": 0.4431544880545007, + "learning_rate": 9.09401838592431e-05, + "loss": 0.6995, + "step": 679 + }, + { + "epoch": 0.544, + "grad_norm": 0.5180969404201087, + "learning_rate": 9.068207325159284e-05, + "loss": 0.796, + "step": 680 + }, + { + "epoch": 0.5448, + "grad_norm": 0.44818368733319086, + "learning_rate": 9.04240252495963e-05, + "loss": 0.726, + "step": 681 + }, + { + "epoch": 0.5456, + "grad_norm": 0.480921118362398, + "learning_rate": 9.016604158703654e-05, + "loss": 0.7016, + "step": 682 + }, + { + "epoch": 0.5464, + "grad_norm": 0.4537241838345266, + "learning_rate": 8.990812399726435e-05, + "loss": 0.6825, + "step": 683 + }, + { + "epoch": 0.5472, + "grad_norm": 0.44379818024999296, + "learning_rate": 8.965027421318665e-05, + "loss": 0.7028, + "step": 684 + }, + { + "epoch": 0.548, + "grad_norm": 0.4827124310633417, + "learning_rate": 8.939249396725467e-05, + "loss": 0.7388, + "step": 685 + }, + { + "epoch": 0.5488, + "grad_norm": 0.45717227866934373, + "learning_rate": 8.913478499145254e-05, + "loss": 0.8172, + "step": 686 + }, + { + "epoch": 0.5496, + "grad_norm": 0.5407420530627465, + "learning_rate": 8.887714901728551e-05, + "loss": 0.7344, + "step": 687 + }, + { + "epoch": 0.5504, + "grad_norm": 0.5476220204120361, + "learning_rate": 8.861958777576827e-05, + "loss": 0.8205, + "step": 688 + }, + { + "epoch": 0.5512, + "grad_norm": 0.41236078890642863, + "learning_rate": 8.836210299741346e-05, + "loss": 0.6615, + "step": 689 + }, + { + "epoch": 0.552, + "grad_norm": 0.429507843425321, + "learning_rate": 8.810469641222001e-05, + "loss": 0.7424, + "step": 690 + }, + { + "epoch": 0.5528, + "grad_norm": 0.45789088417995133, + "learning_rate": 8.784736974966135e-05, + "loss": 0.814, + "step": 691 + }, + { + "epoch": 0.5536, + "grad_norm": 0.49840226614272215, + "learning_rate": 8.759012473867407e-05, + "loss": 0.7585, + "step": 692 + }, + { + "epoch": 0.5544, + "grad_norm": 0.4482599702245133, + "learning_rate": 8.733296310764611e-05, + "loss": 0.777, + "step": 693 + }, + { + "epoch": 0.5552, + "grad_norm": 0.41224527407806694, + "learning_rate": 8.707588658440511e-05, + "loss": 0.6788, + "step": 694 + }, + { + "epoch": 0.556, + "grad_norm": 0.41987569808059994, + "learning_rate": 8.6818896896207e-05, + "loss": 0.7615, + "step": 695 + }, + { + "epoch": 0.5568, + "grad_norm": 0.4323759431067474, + "learning_rate": 8.656199576972423e-05, + "loss": 0.7643, + "step": 696 + }, + { + "epoch": 0.5576, + "grad_norm": 0.3675059909087662, + "learning_rate": 8.63051849310342e-05, + "loss": 0.72, + "step": 697 + }, + { + "epoch": 0.5584, + "grad_norm": 0.45712053432264943, + "learning_rate": 8.604846610560771e-05, + "loss": 0.7873, + "step": 698 + }, + { + "epoch": 0.5592, + "grad_norm": 0.41935204964647294, + "learning_rate": 8.579184101829734e-05, + "loss": 0.7268, + "step": 699 + }, + { + "epoch": 0.56, + "grad_norm": 0.4739297222554467, + "learning_rate": 8.553531139332582e-05, + "loss": 0.6672, + "step": 700 + }, + { + "epoch": 0.5608, + "grad_norm": 0.44991199007186033, + "learning_rate": 8.527887895427454e-05, + "loss": 0.7452, + "step": 701 + }, + { + "epoch": 0.5616, + "grad_norm": 0.42938766367993775, + "learning_rate": 8.502254542407186e-05, + "loss": 0.693, + "step": 702 + }, + { + "epoch": 0.5624, + "grad_norm": 0.41665659561685137, + "learning_rate": 8.476631252498162e-05, + "loss": 0.6783, + "step": 703 + }, + { + "epoch": 0.5632, + "grad_norm": 0.46341772855505137, + "learning_rate": 8.451018197859153e-05, + "loss": 0.7273, + "step": 704 + }, + { + "epoch": 0.564, + "grad_norm": 0.4182288254216991, + "learning_rate": 8.425415550580162e-05, + "loss": 0.6811, + "step": 705 + }, + { + "epoch": 0.5648, + "grad_norm": 0.5676463520496755, + "learning_rate": 8.399823482681262e-05, + "loss": 0.6955, + "step": 706 + }, + { + "epoch": 0.5656, + "grad_norm": 0.5505236747614208, + "learning_rate": 8.374242166111448e-05, + "loss": 0.7125, + "step": 707 + }, + { + "epoch": 0.5664, + "grad_norm": 0.5167180992588334, + "learning_rate": 8.348671772747487e-05, + "loss": 0.6924, + "step": 708 + }, + { + "epoch": 0.5672, + "grad_norm": 0.4050515316722716, + "learning_rate": 8.323112474392731e-05, + "loss": 0.6586, + "step": 709 + }, + { + "epoch": 0.568, + "grad_norm": 0.3797701587048492, + "learning_rate": 8.297564442776014e-05, + "loss": 0.6761, + "step": 710 + }, + { + "epoch": 0.5688, + "grad_norm": 0.3820742783131323, + "learning_rate": 8.272027849550457e-05, + "loss": 0.7037, + "step": 711 + }, + { + "epoch": 0.5696, + "grad_norm": 0.4207268768764361, + "learning_rate": 8.246502866292324e-05, + "loss": 0.7005, + "step": 712 + }, + { + "epoch": 0.5704, + "grad_norm": 0.5872033549506375, + "learning_rate": 8.220989664499878e-05, + "loss": 0.751, + "step": 713 + }, + { + "epoch": 0.5712, + "grad_norm": 0.41182080280736416, + "learning_rate": 8.195488415592238e-05, + "loss": 0.681, + "step": 714 + }, + { + "epoch": 0.572, + "grad_norm": 0.48090174763861654, + "learning_rate": 8.169999290908188e-05, + "loss": 0.7419, + "step": 715 + }, + { + "epoch": 0.5728, + "grad_norm": 0.5548055849716488, + "learning_rate": 8.144522461705067e-05, + "loss": 0.7366, + "step": 716 + }, + { + "epoch": 0.5736, + "grad_norm": 0.4361079974941745, + "learning_rate": 8.119058099157604e-05, + "loss": 0.6864, + "step": 717 + }, + { + "epoch": 0.5744, + "grad_norm": 0.48496754655537594, + "learning_rate": 8.093606374356759e-05, + "loss": 0.7747, + "step": 718 + }, + { + "epoch": 0.5752, + "grad_norm": 0.4781167281093951, + "learning_rate": 8.068167458308582e-05, + "loss": 0.738, + "step": 719 + }, + { + "epoch": 0.576, + "grad_norm": 0.4206306860043934, + "learning_rate": 8.042741521933071e-05, + "loss": 0.7593, + "step": 720 + }, + { + "epoch": 0.5768, + "grad_norm": 0.40300826612193685, + "learning_rate": 8.017328736063006e-05, + "loss": 0.6671, + "step": 721 + }, + { + "epoch": 0.5776, + "grad_norm": 0.3756572888549628, + "learning_rate": 7.991929271442817e-05, + "loss": 0.6853, + "step": 722 + }, + { + "epoch": 0.5784, + "grad_norm": 0.3869805827574144, + "learning_rate": 7.966543298727425e-05, + "loss": 0.6424, + "step": 723 + }, + { + "epoch": 0.5792, + "grad_norm": 0.43657568486927284, + "learning_rate": 7.941170988481108e-05, + "loss": 0.7578, + "step": 724 + }, + { + "epoch": 0.58, + "grad_norm": 0.40679262808071176, + "learning_rate": 7.915812511176347e-05, + "loss": 0.7224, + "step": 725 + }, + { + "epoch": 0.5808, + "grad_norm": 0.441240906989389, + "learning_rate": 7.89046803719267e-05, + "loss": 0.7256, + "step": 726 + }, + { + "epoch": 0.5816, + "grad_norm": 0.4522037045041075, + "learning_rate": 7.865137736815535e-05, + "loss": 0.6383, + "step": 727 + }, + { + "epoch": 0.5824, + "grad_norm": 0.43698704397416593, + "learning_rate": 7.839821780235168e-05, + "loss": 0.6941, + "step": 728 + }, + { + "epoch": 0.5832, + "grad_norm": 0.4765084690325073, + "learning_rate": 7.814520337545406e-05, + "loss": 0.7139, + "step": 729 + }, + { + "epoch": 0.584, + "grad_norm": 0.6562954145610375, + "learning_rate": 7.789233578742582e-05, + "loss": 0.7757, + "step": 730 + }, + { + "epoch": 0.5848, + "grad_norm": 0.4105045945825311, + "learning_rate": 7.763961673724379e-05, + "loss": 0.6781, + "step": 731 + }, + { + "epoch": 0.5856, + "grad_norm": 0.5251939327477422, + "learning_rate": 7.738704792288655e-05, + "loss": 0.6777, + "step": 732 + }, + { + "epoch": 0.5864, + "grad_norm": 0.5695134795172152, + "learning_rate": 7.713463104132345e-05, + "loss": 0.7216, + "step": 733 + }, + { + "epoch": 0.5872, + "grad_norm": 0.5135868556910134, + "learning_rate": 7.688236778850306e-05, + "loss": 0.7754, + "step": 734 + }, + { + "epoch": 0.588, + "grad_norm": 0.47165858925231896, + "learning_rate": 7.663025985934158e-05, + "loss": 0.7808, + "step": 735 + }, + { + "epoch": 0.5888, + "grad_norm": 0.3880230579417375, + "learning_rate": 7.637830894771175e-05, + "loss": 0.6446, + "step": 736 + }, + { + "epoch": 0.5896, + "grad_norm": 0.5177400802415317, + "learning_rate": 7.61265167464313e-05, + "loss": 0.7938, + "step": 737 + }, + { + "epoch": 0.5904, + "grad_norm": 0.5341808993716982, + "learning_rate": 7.587488494725157e-05, + "loss": 0.6853, + "step": 738 + }, + { + "epoch": 0.5912, + "grad_norm": 0.40063988029996117, + "learning_rate": 7.562341524084623e-05, + "loss": 0.6887, + "step": 739 + }, + { + "epoch": 0.592, + "grad_norm": 0.44864706816073324, + "learning_rate": 7.537210931679987e-05, + "loss": 0.7598, + "step": 740 + }, + { + "epoch": 0.5928, + "grad_norm": 0.42089206618338676, + "learning_rate": 7.512096886359664e-05, + "loss": 0.7536, + "step": 741 + }, + { + "epoch": 0.5936, + "grad_norm": 0.45402519381629813, + "learning_rate": 7.48699955686089e-05, + "loss": 0.7069, + "step": 742 + }, + { + "epoch": 0.5944, + "grad_norm": 0.46182799064401525, + "learning_rate": 7.461919111808595e-05, + "loss": 0.6344, + "step": 743 + }, + { + "epoch": 0.5952, + "grad_norm": 0.4044480795325247, + "learning_rate": 7.43685571971426e-05, + "loss": 0.6877, + "step": 744 + }, + { + "epoch": 0.596, + "grad_norm": 0.4452394241287927, + "learning_rate": 7.411809548974792e-05, + "loss": 0.724, + "step": 745 + }, + { + "epoch": 0.5968, + "grad_norm": 0.42470769576354267, + "learning_rate": 7.386780767871397e-05, + "loss": 0.6801, + "step": 746 + }, + { + "epoch": 0.5976, + "grad_norm": 0.38070722851677474, + "learning_rate": 7.361769544568425e-05, + "loss": 0.6786, + "step": 747 + }, + { + "epoch": 0.5984, + "grad_norm": 0.4279682851412783, + "learning_rate": 7.336776047112276e-05, + "loss": 0.7142, + "step": 748 + }, + { + "epoch": 0.5992, + "grad_norm": 0.5153988835037497, + "learning_rate": 7.311800443430251e-05, + "loss": 0.8058, + "step": 749 + }, + { + "epoch": 0.6, + "grad_norm": 0.5186003033423139, + "learning_rate": 7.286842901329412e-05, + "loss": 0.7485, + "step": 750 + }, + { + "epoch": 0.6008, + "grad_norm": 0.44215384668103297, + "learning_rate": 7.26190358849548e-05, + "loss": 0.6622, + "step": 751 + }, + { + "epoch": 0.6016, + "grad_norm": 0.3953883665933851, + "learning_rate": 7.236982672491698e-05, + "loss": 0.6566, + "step": 752 + }, + { + "epoch": 0.6024, + "grad_norm": 0.5268540652115712, + "learning_rate": 7.212080320757695e-05, + "loss": 0.7611, + "step": 753 + }, + { + "epoch": 0.6032, + "grad_norm": 0.4374834330320054, + "learning_rate": 7.187196700608373e-05, + "loss": 0.7196, + "step": 754 + }, + { + "epoch": 0.604, + "grad_norm": 0.46517122973130165, + "learning_rate": 7.162331979232783e-05, + "loss": 0.7478, + "step": 755 + }, + { + "epoch": 0.6048, + "grad_norm": 0.3966377135474766, + "learning_rate": 7.137486323692995e-05, + "loss": 0.6236, + "step": 756 + }, + { + "epoch": 0.6056, + "grad_norm": 0.36283076324324215, + "learning_rate": 7.112659900922976e-05, + "loss": 0.6768, + "step": 757 + }, + { + "epoch": 0.6064, + "grad_norm": 0.4703446177196834, + "learning_rate": 7.087852877727481e-05, + "loss": 0.7447, + "step": 758 + }, + { + "epoch": 0.6072, + "grad_norm": 0.4055321259051819, + "learning_rate": 7.06306542078091e-05, + "loss": 0.6761, + "step": 759 + }, + { + "epoch": 0.608, + "grad_norm": 0.4617131807146065, + "learning_rate": 7.038297696626206e-05, + "loss": 0.7615, + "step": 760 + }, + { + "epoch": 0.6088, + "grad_norm": 0.40717874666449283, + "learning_rate": 7.013549871673736e-05, + "loss": 0.669, + "step": 761 + }, + { + "epoch": 0.6096, + "grad_norm": 0.5200251141687845, + "learning_rate": 6.988822112200156e-05, + "loss": 0.6868, + "step": 762 + }, + { + "epoch": 0.6104, + "grad_norm": 0.46930700494470096, + "learning_rate": 6.964114584347316e-05, + "loss": 0.6526, + "step": 763 + }, + { + "epoch": 0.6112, + "grad_norm": 0.4217451501463285, + "learning_rate": 6.939427454121128e-05, + "loss": 0.6825, + "step": 764 + }, + { + "epoch": 0.612, + "grad_norm": 0.4976416749467991, + "learning_rate": 6.914760887390452e-05, + "loss": 0.77, + "step": 765 + }, + { + "epoch": 0.6128, + "grad_norm": 0.4451828871570694, + "learning_rate": 6.890115049885994e-05, + "loss": 0.6973, + "step": 766 + }, + { + "epoch": 0.6136, + "grad_norm": 0.5609963646621109, + "learning_rate": 6.865490107199181e-05, + "loss": 0.6858, + "step": 767 + }, + { + "epoch": 0.6144, + "grad_norm": 0.46504384233714136, + "learning_rate": 6.84088622478104e-05, + "loss": 0.7657, + "step": 768 + }, + { + "epoch": 0.6152, + "grad_norm": 0.4346667091896403, + "learning_rate": 6.816303567941112e-05, + "loss": 0.7039, + "step": 769 + }, + { + "epoch": 0.616, + "grad_norm": 0.516554360428688, + "learning_rate": 6.791742301846326e-05, + "loss": 0.7589, + "step": 770 + }, + { + "epoch": 0.6168, + "grad_norm": 0.571785898731525, + "learning_rate": 6.767202591519875e-05, + "loss": 0.7852, + "step": 771 + }, + { + "epoch": 0.6176, + "grad_norm": 0.3745411828670773, + "learning_rate": 6.742684601840141e-05, + "loss": 0.6745, + "step": 772 + }, + { + "epoch": 0.6184, + "grad_norm": 0.5054649628331804, + "learning_rate": 6.718188497539554e-05, + "loss": 0.6905, + "step": 773 + }, + { + "epoch": 0.6192, + "grad_norm": 0.4619358628489545, + "learning_rate": 6.693714443203507e-05, + "loss": 0.6517, + "step": 774 + }, + { + "epoch": 0.62, + "grad_norm": 0.5036440082809436, + "learning_rate": 6.669262603269246e-05, + "loss": 0.7575, + "step": 775 + }, + { + "epoch": 0.6208, + "grad_norm": 0.49748178875585825, + "learning_rate": 6.644833142024751e-05, + "loss": 0.7668, + "step": 776 + }, + { + "epoch": 0.6216, + "grad_norm": 0.4050494646861793, + "learning_rate": 6.620426223607654e-05, + "loss": 0.6809, + "step": 777 + }, + { + "epoch": 0.6224, + "grad_norm": 0.41510237254019805, + "learning_rate": 6.59604201200412e-05, + "loss": 0.7229, + "step": 778 + }, + { + "epoch": 0.6232, + "grad_norm": 0.4679126409798029, + "learning_rate": 6.571680671047749e-05, + "loss": 0.7461, + "step": 779 + }, + { + "epoch": 0.624, + "grad_norm": 0.49923074616833046, + "learning_rate": 6.547342364418481e-05, + "loss": 0.7556, + "step": 780 + }, + { + "epoch": 0.6248, + "grad_norm": 0.4683006967237606, + "learning_rate": 6.523027255641493e-05, + "loss": 0.7257, + "step": 781 + }, + { + "epoch": 0.6256, + "grad_norm": 0.4369295941400887, + "learning_rate": 6.498735508086093e-05, + "loss": 0.6254, + "step": 782 + }, + { + "epoch": 0.6264, + "grad_norm": 0.4169258503447942, + "learning_rate": 6.474467284964634e-05, + "loss": 0.7278, + "step": 783 + }, + { + "epoch": 0.6272, + "grad_norm": 0.5018850814288661, + "learning_rate": 6.450222749331414e-05, + "loss": 0.7542, + "step": 784 + }, + { + "epoch": 0.628, + "grad_norm": 0.4095436898801255, + "learning_rate": 6.426002064081565e-05, + "loss": 0.7353, + "step": 785 + }, + { + "epoch": 0.6288, + "grad_norm": 0.4094236842920161, + "learning_rate": 6.40180539194999e-05, + "loss": 0.7252, + "step": 786 + }, + { + "epoch": 0.6296, + "grad_norm": 0.48128697326604447, + "learning_rate": 6.377632895510248e-05, + "loss": 0.6489, + "step": 787 + }, + { + "epoch": 0.6304, + "grad_norm": 0.44410206397414576, + "learning_rate": 6.35348473717345e-05, + "loss": 0.6078, + "step": 788 + }, + { + "epoch": 0.6312, + "grad_norm": 0.4367963196859822, + "learning_rate": 6.329361079187199e-05, + "loss": 0.6302, + "step": 789 + }, + { + "epoch": 0.632, + "grad_norm": 0.42876170705834227, + "learning_rate": 6.305262083634488e-05, + "loss": 0.6557, + "step": 790 + }, + { + "epoch": 0.6328, + "grad_norm": 0.46487569496018677, + "learning_rate": 6.281187912432587e-05, + "loss": 0.6632, + "step": 791 + }, + { + "epoch": 0.6336, + "grad_norm": 0.4117070353675247, + "learning_rate": 6.25713872733199e-05, + "loss": 0.6992, + "step": 792 + }, + { + "epoch": 0.6344, + "grad_norm": 0.4108913694228595, + "learning_rate": 6.233114689915316e-05, + "loss": 0.7661, + "step": 793 + }, + { + "epoch": 0.6352, + "grad_norm": 0.5056805355792021, + "learning_rate": 6.209115961596208e-05, + "loss": 0.78, + "step": 794 + }, + { + "epoch": 0.636, + "grad_norm": 0.4783807645427945, + "learning_rate": 6.18514270361827e-05, + "loss": 0.6945, + "step": 795 + }, + { + "epoch": 0.6368, + "grad_norm": 0.3964619349449278, + "learning_rate": 6.161195077053976e-05, + "loss": 0.6853, + "step": 796 + }, + { + "epoch": 0.6376, + "grad_norm": 0.48729188871411383, + "learning_rate": 6.13727324280358e-05, + "loss": 0.7326, + "step": 797 + }, + { + "epoch": 0.6384, + "grad_norm": 0.4332830196550788, + "learning_rate": 6.113377361594049e-05, + "loss": 0.5924, + "step": 798 + }, + { + "epoch": 0.6392, + "grad_norm": 0.4069626710751544, + "learning_rate": 6.08950759397797e-05, + "loss": 0.6983, + "step": 799 + }, + { + "epoch": 0.64, + "grad_norm": 0.438367791246098, + "learning_rate": 6.065664100332478e-05, + "loss": 0.6913, + "step": 800 + }, + { + "epoch": 0.6408, + "grad_norm": 0.4302631410104595, + "learning_rate": 6.0418470408581774e-05, + "loss": 0.7465, + "step": 801 + }, + { + "epoch": 0.6416, + "grad_norm": 0.5021691106301757, + "learning_rate": 6.018056575578075e-05, + "loss": 0.7842, + "step": 802 + }, + { + "epoch": 0.6424, + "grad_norm": 0.37936693978621344, + "learning_rate": 5.9942928643364724e-05, + "loss": 0.7254, + "step": 803 + }, + { + "epoch": 0.6432, + "grad_norm": 0.44276835352005717, + "learning_rate": 5.970556066797941e-05, + "loss": 0.6512, + "step": 804 + }, + { + "epoch": 0.644, + "grad_norm": 0.4273416655809347, + "learning_rate": 5.946846342446214e-05, + "loss": 0.6946, + "step": 805 + }, + { + "epoch": 0.6448, + "grad_norm": 0.4301337196717757, + "learning_rate": 5.923163850583113e-05, + "loss": 0.7054, + "step": 806 + }, + { + "epoch": 0.6456, + "grad_norm": 0.45556920191856143, + "learning_rate": 5.899508750327501e-05, + "loss": 0.6885, + "step": 807 + }, + { + "epoch": 0.6464, + "grad_norm": 0.4644653362345782, + "learning_rate": 5.875881200614207e-05, + "loss": 0.7561, + "step": 808 + }, + { + "epoch": 0.6472, + "grad_norm": 0.4256074124149417, + "learning_rate": 5.8522813601929324e-05, + "loss": 0.6806, + "step": 809 + }, + { + "epoch": 0.648, + "grad_norm": 0.3926899001419619, + "learning_rate": 5.828709387627218e-05, + "loss": 0.6499, + "step": 810 + }, + { + "epoch": 0.6488, + "grad_norm": 0.38092749541547033, + "learning_rate": 5.80516544129337e-05, + "loss": 0.7165, + "step": 811 + }, + { + "epoch": 0.6496, + "grad_norm": 0.4405630576123063, + "learning_rate": 5.781649679379378e-05, + "loss": 0.6989, + "step": 812 + }, + { + "epoch": 0.6504, + "grad_norm": 0.41373361036064654, + "learning_rate": 5.758162259883867e-05, + "loss": 0.6885, + "step": 813 + }, + { + "epoch": 0.6512, + "grad_norm": 0.3889753043133514, + "learning_rate": 5.73470334061505e-05, + "loss": 0.6932, + "step": 814 + }, + { + "epoch": 0.652, + "grad_norm": 0.3779158448959292, + "learning_rate": 5.7112730791896207e-05, + "loss": 0.6705, + "step": 815 + }, + { + "epoch": 0.6528, + "grad_norm": 0.4087540029329042, + "learning_rate": 5.687871633031754e-05, + "loss": 0.7023, + "step": 816 + }, + { + "epoch": 0.6536, + "grad_norm": 0.42587075011487324, + "learning_rate": 5.664499159372017e-05, + "loss": 0.6852, + "step": 817 + }, + { + "epoch": 0.6544, + "grad_norm": 0.49807996258620635, + "learning_rate": 5.6411558152462894e-05, + "loss": 0.7229, + "step": 818 + }, + { + "epoch": 0.6552, + "grad_norm": 0.373270181622042, + "learning_rate": 5.617841757494762e-05, + "loss": 0.632, + "step": 819 + }, + { + "epoch": 0.656, + "grad_norm": 0.4750774107066468, + "learning_rate": 5.5945571427608526e-05, + "loss": 0.7389, + "step": 820 + }, + { + "epoch": 0.6568, + "grad_norm": 0.41478198578627196, + "learning_rate": 5.5713021274901335e-05, + "loss": 0.6885, + "step": 821 + }, + { + "epoch": 0.6576, + "grad_norm": 0.41697709076204126, + "learning_rate": 5.54807686792933e-05, + "loss": 0.7298, + "step": 822 + }, + { + "epoch": 0.6584, + "grad_norm": 0.40997889032425583, + "learning_rate": 5.524881520125229e-05, + "loss": 0.6547, + "step": 823 + }, + { + "epoch": 0.6592, + "grad_norm": 0.42595443046949505, + "learning_rate": 5.501716239923642e-05, + "loss": 0.6919, + "step": 824 + }, + { + "epoch": 0.66, + "grad_norm": 0.4272331275871181, + "learning_rate": 5.4785811829683764e-05, + "loss": 0.69, + "step": 825 + }, + { + "epoch": 0.6608, + "grad_norm": 0.4301365121290822, + "learning_rate": 5.4554765047001613e-05, + "loss": 0.7551, + "step": 826 + }, + { + "epoch": 0.6616, + "grad_norm": 0.46430582102646983, + "learning_rate": 5.432402360355615e-05, + "loss": 0.7078, + "step": 827 + }, + { + "epoch": 0.6624, + "grad_norm": 0.43954462128486094, + "learning_rate": 5.4093589049662175e-05, + "loss": 0.6947, + "step": 828 + }, + { + "epoch": 0.6632, + "grad_norm": 0.410877966121775, + "learning_rate": 5.386346293357242e-05, + "loss": 0.7663, + "step": 829 + }, + { + "epoch": 0.664, + "grad_norm": 0.45636934444526156, + "learning_rate": 5.363364680146725e-05, + "loss": 0.6881, + "step": 830 + }, + { + "epoch": 0.6648, + "grad_norm": 0.4406791030344553, + "learning_rate": 5.3404142197444506e-05, + "loss": 0.656, + "step": 831 + }, + { + "epoch": 0.6656, + "grad_norm": 0.44908779405293825, + "learning_rate": 5.31749506635086e-05, + "loss": 0.731, + "step": 832 + }, + { + "epoch": 0.6664, + "grad_norm": 0.43081103370266594, + "learning_rate": 5.2946073739560706e-05, + "loss": 0.6481, + "step": 833 + }, + { + "epoch": 0.6672, + "grad_norm": 0.3790940002229224, + "learning_rate": 5.271751296338823e-05, + "loss": 0.66, + "step": 834 + }, + { + "epoch": 0.668, + "grad_norm": 0.7779848932614899, + "learning_rate": 5.248926987065417e-05, + "loss": 0.6787, + "step": 835 + }, + { + "epoch": 0.6688, + "grad_norm": 0.3686335127303108, + "learning_rate": 5.226134599488728e-05, + "loss": 0.6806, + "step": 836 + }, + { + "epoch": 0.6696, + "grad_norm": 0.4439345348202538, + "learning_rate": 5.203374286747158e-05, + "loss": 0.6912, + "step": 837 + }, + { + "epoch": 0.6704, + "grad_norm": 0.4424913014595794, + "learning_rate": 5.180646201763577e-05, + "loss": 0.773, + "step": 838 + }, + { + "epoch": 0.6712, + "grad_norm": 0.45621744412818693, + "learning_rate": 5.15795049724435e-05, + "loss": 0.6821, + "step": 839 + }, + { + "epoch": 0.672, + "grad_norm": 0.394941762067283, + "learning_rate": 5.135287325678271e-05, + "loss": 0.6682, + "step": 840 + }, + { + "epoch": 0.6728, + "grad_norm": 0.5333301131620256, + "learning_rate": 5.112656839335543e-05, + "loss": 0.7751, + "step": 841 + }, + { + "epoch": 0.6736, + "grad_norm": 0.4110730024792835, + "learning_rate": 5.090059190266779e-05, + "loss": 0.7006, + "step": 842 + }, + { + "epoch": 0.6744, + "grad_norm": 0.42121987615334794, + "learning_rate": 5.0674945303019526e-05, + "loss": 0.652, + "step": 843 + }, + { + "epoch": 0.6752, + "grad_norm": 0.4200436383052014, + "learning_rate": 5.0449630110493836e-05, + "loss": 0.6618, + "step": 844 + }, + { + "epoch": 0.676, + "grad_norm": 0.4739336833529893, + "learning_rate": 5.022464783894744e-05, + "loss": 0.7237, + "step": 845 + }, + { + "epoch": 0.6768, + "grad_norm": 0.47305673327349285, + "learning_rate": 5.000000000000002e-05, + "loss": 0.7095, + "step": 846 + }, + { + "epoch": 0.6776, + "grad_norm": 0.4537154952100946, + "learning_rate": 4.977568810302432e-05, + "loss": 0.7467, + "step": 847 + }, + { + "epoch": 0.6784, + "grad_norm": 0.4843827307605648, + "learning_rate": 4.955171365513603e-05, + "loss": 0.645, + "step": 848 + }, + { + "epoch": 0.6792, + "grad_norm": 0.46432031803046925, + "learning_rate": 4.9328078161183464e-05, + "loss": 0.7235, + "step": 849 + }, + { + "epoch": 0.68, + "grad_norm": 0.3876084772895051, + "learning_rate": 4.9104783123737566e-05, + "loss": 0.7036, + "step": 850 + }, + { + "epoch": 0.6808, + "grad_norm": 0.42753374737712524, + "learning_rate": 4.88818300430819e-05, + "loss": 0.7343, + "step": 851 + }, + { + "epoch": 0.6816, + "grad_norm": 0.3749563864309047, + "learning_rate": 4.865922041720239e-05, + "loss": 0.6464, + "step": 852 + }, + { + "epoch": 0.6824, + "grad_norm": 0.4659616812478141, + "learning_rate": 4.843695574177737e-05, + "loss": 0.7521, + "step": 853 + }, + { + "epoch": 0.6832, + "grad_norm": 0.5857125777562097, + "learning_rate": 4.821503751016746e-05, + "loss": 0.6817, + "step": 854 + }, + { + "epoch": 0.684, + "grad_norm": 0.46210346823419846, + "learning_rate": 4.7993467213405706e-05, + "loss": 0.6828, + "step": 855 + }, + { + "epoch": 0.6848, + "grad_norm": 0.42221537021220834, + "learning_rate": 4.777224634018732e-05, + "loss": 0.6214, + "step": 856 + }, + { + "epoch": 0.6856, + "grad_norm": 0.4457048252425589, + "learning_rate": 4.755137637685979e-05, + "loss": 0.7197, + "step": 857 + }, + { + "epoch": 0.6864, + "grad_norm": 0.48528059872326096, + "learning_rate": 4.733085880741301e-05, + "loss": 0.6839, + "step": 858 + }, + { + "epoch": 0.6872, + "grad_norm": 0.45852419572461484, + "learning_rate": 4.7110695113469085e-05, + "loss": 0.7411, + "step": 859 + }, + { + "epoch": 0.688, + "grad_norm": 0.5617887402455709, + "learning_rate": 4.689088677427249e-05, + "loss": 0.7202, + "step": 860 + }, + { + "epoch": 0.6888, + "grad_norm": 0.4498211895755781, + "learning_rate": 4.6671435266680216e-05, + "loss": 0.7038, + "step": 861 + }, + { + "epoch": 0.6896, + "grad_norm": 0.42503251286810007, + "learning_rate": 4.645234206515171e-05, + "loss": 0.6424, + "step": 862 + }, + { + "epoch": 0.6904, + "grad_norm": 0.44170758653529296, + "learning_rate": 4.623360864173893e-05, + "loss": 0.62, + "step": 863 + }, + { + "epoch": 0.6912, + "grad_norm": 0.35610057393796085, + "learning_rate": 4.6015236466076747e-05, + "loss": 0.6383, + "step": 864 + }, + { + "epoch": 0.692, + "grad_norm": 0.37425435234487286, + "learning_rate": 4.579722700537268e-05, + "loss": 0.6754, + "step": 865 + }, + { + "epoch": 0.6928, + "grad_norm": 0.3936776282263319, + "learning_rate": 4.5579581724397255e-05, + "loss": 0.6601, + "step": 866 + }, + { + "epoch": 0.6936, + "grad_norm": 0.46034838081495977, + "learning_rate": 4.5362302085474254e-05, + "loss": 0.6943, + "step": 867 + }, + { + "epoch": 0.6944, + "grad_norm": 0.4066225141906522, + "learning_rate": 4.514538954847064e-05, + "loss": 0.627, + "step": 868 + }, + { + "epoch": 0.6952, + "grad_norm": 0.3858640367503925, + "learning_rate": 4.492884557078688e-05, + "loss": 0.6342, + "step": 869 + }, + { + "epoch": 0.696, + "grad_norm": 0.44831807101287013, + "learning_rate": 4.471267160734731e-05, + "loss": 0.7039, + "step": 870 + }, + { + "epoch": 0.6968, + "grad_norm": 0.46454349710235776, + "learning_rate": 4.449686911058992e-05, + "loss": 0.7448, + "step": 871 + }, + { + "epoch": 0.6976, + "grad_norm": 0.43185515154717197, + "learning_rate": 4.428143953045717e-05, + "loss": 0.6478, + "step": 872 + }, + { + "epoch": 0.6984, + "grad_norm": 0.4039925961968265, + "learning_rate": 4.406638431438576e-05, + "loss": 0.6721, + "step": 873 + }, + { + "epoch": 0.6992, + "grad_norm": 0.4685884685660754, + "learning_rate": 4.385170490729712e-05, + "loss": 0.7064, + "step": 874 + }, + { + "epoch": 0.7, + "grad_norm": 0.437495368402922, + "learning_rate": 4.36374027515878e-05, + "loss": 0.697, + "step": 875 + }, + { + "epoch": 0.7008, + "grad_norm": 0.5013609622097205, + "learning_rate": 4.342347928711953e-05, + "loss": 0.6348, + "step": 876 + }, + { + "epoch": 0.7016, + "grad_norm": 0.3928807046293955, + "learning_rate": 4.320993595120969e-05, + "loss": 0.6506, + "step": 877 + }, + { + "epoch": 0.7024, + "grad_norm": 0.41851771065401694, + "learning_rate": 4.2996774178621736e-05, + "loss": 0.695, + "step": 878 + }, + { + "epoch": 0.7032, + "grad_norm": 0.4148397310479281, + "learning_rate": 4.278399540155536e-05, + "loss": 0.6843, + "step": 879 + }, + { + "epoch": 0.704, + "grad_norm": 0.34632427812634764, + "learning_rate": 4.257160104963696e-05, + "loss": 0.6448, + "step": 880 + }, + { + "epoch": 0.7048, + "grad_norm": 0.4434044271178348, + "learning_rate": 4.2359592549910145e-05, + "loss": 0.6966, + "step": 881 + }, + { + "epoch": 0.7056, + "grad_norm": 0.4185228933267323, + "learning_rate": 4.2147971326825966e-05, + "loss": 0.6827, + "step": 882 + }, + { + "epoch": 0.7064, + "grad_norm": 0.38459801766096763, + "learning_rate": 4.193673880223339e-05, + "loss": 0.6889, + "step": 883 + }, + { + "epoch": 0.7072, + "grad_norm": 0.3943276928660913, + "learning_rate": 4.172589639536991e-05, + "loss": 0.6177, + "step": 884 + }, + { + "epoch": 0.708, + "grad_norm": 0.41176764896341617, + "learning_rate": 4.1515445522851784e-05, + "loss": 0.6971, + "step": 885 + }, + { + "epoch": 0.7088, + "grad_norm": 0.38025970545569326, + "learning_rate": 4.130538759866457e-05, + "loss": 0.6455, + "step": 886 + }, + { + "epoch": 0.7096, + "grad_norm": 0.4420415156862601, + "learning_rate": 4.109572403415386e-05, + "loss": 0.6057, + "step": 887 + }, + { + "epoch": 0.7104, + "grad_norm": 0.43611147714187426, + "learning_rate": 4.088645623801534e-05, + "loss": 0.6748, + "step": 888 + }, + { + "epoch": 0.7112, + "grad_norm": 0.397038492096301, + "learning_rate": 4.0677585616285774e-05, + "loss": 0.6918, + "step": 889 + }, + { + "epoch": 0.712, + "grad_norm": 0.4664109049186901, + "learning_rate": 4.046911357233343e-05, + "loss": 0.7114, + "step": 890 + }, + { + "epoch": 0.7128, + "grad_norm": 0.39762390801481073, + "learning_rate": 4.026104150684835e-05, + "loss": 0.6474, + "step": 891 + }, + { + "epoch": 0.7136, + "grad_norm": 0.58165867026539, + "learning_rate": 4.00533708178334e-05, + "loss": 0.6996, + "step": 892 + }, + { + "epoch": 0.7144, + "grad_norm": 0.37818800588359214, + "learning_rate": 3.984610290059467e-05, + "loss": 0.6581, + "step": 893 + }, + { + "epoch": 0.7152, + "grad_norm": 0.41370612659127415, + "learning_rate": 3.963923914773187e-05, + "loss": 0.6719, + "step": 894 + }, + { + "epoch": 0.716, + "grad_norm": 0.4234246150614126, + "learning_rate": 3.943278094912946e-05, + "loss": 0.6955, + "step": 895 + }, + { + "epoch": 0.7168, + "grad_norm": 0.45271432227329883, + "learning_rate": 3.922672969194686e-05, + "loss": 0.7278, + "step": 896 + }, + { + "epoch": 0.7176, + "grad_norm": 0.3896901191977369, + "learning_rate": 3.902108676060937e-05, + "loss": 0.6036, + "step": 897 + }, + { + "epoch": 0.7184, + "grad_norm": 0.5609736262184741, + "learning_rate": 3.8815853536798904e-05, + "loss": 0.7335, + "step": 898 + }, + { + "epoch": 0.7192, + "grad_norm": 0.38077211332926136, + "learning_rate": 3.861103139944449e-05, + "loss": 0.6798, + "step": 899 + }, + { + "epoch": 0.72, + "grad_norm": 0.4064709776626615, + "learning_rate": 3.840662172471315e-05, + "loss": 0.6812, + "step": 900 + }, + { + "epoch": 0.7208, + "grad_norm": 0.37490874266779545, + "learning_rate": 3.820262588600074e-05, + "loss": 0.6342, + "step": 901 + }, + { + "epoch": 0.7216, + "grad_norm": 0.4330143977466438, + "learning_rate": 3.79990452539225e-05, + "loss": 0.6877, + "step": 902 + }, + { + "epoch": 0.7224, + "grad_norm": 0.39609531533999726, + "learning_rate": 3.7795881196303995e-05, + "loss": 0.6796, + "step": 903 + }, + { + "epoch": 0.7232, + "grad_norm": 0.5746872198742553, + "learning_rate": 3.759313507817196e-05, + "loss": 0.7169, + "step": 904 + }, + { + "epoch": 0.724, + "grad_norm": 0.6955975779348033, + "learning_rate": 3.739080826174498e-05, + "loss": 0.6939, + "step": 905 + }, + { + "epoch": 0.7248, + "grad_norm": 0.4583001053565282, + "learning_rate": 3.7188902106424416e-05, + "loss": 0.6649, + "step": 906 + }, + { + "epoch": 0.7256, + "grad_norm": 0.335961766805353, + "learning_rate": 3.6987417968785366e-05, + "loss": 0.6217, + "step": 907 + }, + { + "epoch": 0.7264, + "grad_norm": 0.4625730889324945, + "learning_rate": 3.678635720256737e-05, + "loss": 0.7732, + "step": 908 + }, + { + "epoch": 0.7272, + "grad_norm": 0.44874139521442424, + "learning_rate": 3.658572115866541e-05, + "loss": 0.7845, + "step": 909 + }, + { + "epoch": 0.728, + "grad_norm": 0.4216141196611632, + "learning_rate": 3.638551118512089e-05, + "loss": 0.6761, + "step": 910 + }, + { + "epoch": 0.7288, + "grad_norm": 0.44065164143016794, + "learning_rate": 3.618572862711247e-05, + "loss": 0.6558, + "step": 911 + }, + { + "epoch": 0.7296, + "grad_norm": 0.4864648977419046, + "learning_rate": 3.5986374826947066e-05, + "loss": 0.7345, + "step": 912 + }, + { + "epoch": 0.7304, + "grad_norm": 0.49567456143923777, + "learning_rate": 3.578745112405083e-05, + "loss": 0.6331, + "step": 913 + }, + { + "epoch": 0.7312, + "grad_norm": 0.38520363824229475, + "learning_rate": 3.558895885496023e-05, + "loss": 0.6846, + "step": 914 + }, + { + "epoch": 0.732, + "grad_norm": 0.4298873914826667, + "learning_rate": 3.539089935331294e-05, + "loss": 0.7205, + "step": 915 + }, + { + "epoch": 0.7328, + "grad_norm": 0.4182063839645925, + "learning_rate": 3.519327394983888e-05, + "loss": 0.6546, + "step": 916 + }, + { + "epoch": 0.7336, + "grad_norm": 0.7984264772972497, + "learning_rate": 3.4996083972351515e-05, + "loss": 0.7568, + "step": 917 + }, + { + "epoch": 0.7344, + "grad_norm": 0.45938886079604774, + "learning_rate": 3.479933074573858e-05, + "loss": 0.6834, + "step": 918 + }, + { + "epoch": 0.7352, + "grad_norm": 0.3602631663968547, + "learning_rate": 3.4603015591953395e-05, + "loss": 0.6831, + "step": 919 + }, + { + "epoch": 0.736, + "grad_norm": 0.45573146538831666, + "learning_rate": 3.440713983000601e-05, + "loss": 0.7273, + "step": 920 + }, + { + "epoch": 0.7368, + "grad_norm": 0.4411017629549524, + "learning_rate": 3.421170477595419e-05, + "loss": 0.7174, + "step": 921 + }, + { + "epoch": 0.7376, + "grad_norm": 0.4115477835089943, + "learning_rate": 3.401671174289469e-05, + "loss": 0.7038, + "step": 922 + }, + { + "epoch": 0.7384, + "grad_norm": 0.4397253268529848, + "learning_rate": 3.3822162040954354e-05, + "loss": 0.6749, + "step": 923 + }, + { + "epoch": 0.7392, + "grad_norm": 0.4153110037054647, + "learning_rate": 3.362805697728145e-05, + "loss": 0.6694, + "step": 924 + }, + { + "epoch": 0.74, + "grad_norm": 0.45330892199582196, + "learning_rate": 3.34343978560367e-05, + "loss": 0.7337, + "step": 925 + }, + { + "epoch": 0.7408, + "grad_norm": 0.45014042080415023, + "learning_rate": 3.324118597838464e-05, + "loss": 0.7086, + "step": 926 + }, + { + "epoch": 0.7416, + "grad_norm": 0.5453929600104398, + "learning_rate": 3.3048422642484886e-05, + "loss": 0.6513, + "step": 927 + }, + { + "epoch": 0.7424, + "grad_norm": 0.3941390107280701, + "learning_rate": 3.285610914348332e-05, + "loss": 0.6532, + "step": 928 + }, + { + "epoch": 0.7432, + "grad_norm": 0.46690009399066584, + "learning_rate": 3.266424677350346e-05, + "loss": 0.6889, + "step": 929 + }, + { + "epoch": 0.744, + "grad_norm": 0.4544476435460685, + "learning_rate": 3.2472836821637744e-05, + "loss": 0.6279, + "step": 930 + }, + { + "epoch": 0.7448, + "grad_norm": 0.4666156504306541, + "learning_rate": 3.228188057393895e-05, + "loss": 0.7089, + "step": 931 + }, + { + "epoch": 0.7456, + "grad_norm": 0.4000486478664046, + "learning_rate": 3.209137931341143e-05, + "loss": 0.6598, + "step": 932 + }, + { + "epoch": 0.7464, + "grad_norm": 0.3959166116092314, + "learning_rate": 3.190133432000252e-05, + "loss": 0.6406, + "step": 933 + }, + { + "epoch": 0.7472, + "grad_norm": 0.4999745227355347, + "learning_rate": 3.1711746870594086e-05, + "loss": 0.7318, + "step": 934 + }, + { + "epoch": 0.748, + "grad_norm": 0.5220824551660728, + "learning_rate": 3.1522618238993725e-05, + "loss": 0.7625, + "step": 935 + }, + { + "epoch": 0.7488, + "grad_norm": 0.39539992669258, + "learning_rate": 3.1333949695926324e-05, + "loss": 0.6679, + "step": 936 + }, + { + "epoch": 0.7496, + "grad_norm": 0.5036688882847062, + "learning_rate": 3.114574250902558e-05, + "loss": 0.7887, + "step": 937 + }, + { + "epoch": 0.7504, + "grad_norm": 0.48227512199081035, + "learning_rate": 3.0957997942825336e-05, + "loss": 0.7103, + "step": 938 + }, + { + "epoch": 0.7512, + "grad_norm": 0.5949449071155793, + "learning_rate": 3.077071725875116e-05, + "loss": 0.7775, + "step": 939 + }, + { + "epoch": 0.752, + "grad_norm": 0.40727637141086087, + "learning_rate": 3.058390171511196e-05, + "loss": 0.6921, + "step": 940 + }, + { + "epoch": 0.7528, + "grad_norm": 0.4009110429283926, + "learning_rate": 3.0397552567091337e-05, + "loss": 0.674, + "step": 941 + }, + { + "epoch": 0.7536, + "grad_norm": 0.3697649649258211, + "learning_rate": 3.021167106673928e-05, + "loss": 0.64, + "step": 942 + }, + { + "epoch": 0.7544, + "grad_norm": 0.4667293936982586, + "learning_rate": 3.0026258462963787e-05, + "loss": 0.7639, + "step": 943 + }, + { + "epoch": 0.7552, + "grad_norm": 0.39096891763990427, + "learning_rate": 2.9841316001522347e-05, + "loss": 0.6833, + "step": 944 + }, + { + "epoch": 0.756, + "grad_norm": 0.4295317564427617, + "learning_rate": 2.9656844925013637e-05, + "loss": 0.6138, + "step": 945 + }, + { + "epoch": 0.7568, + "grad_norm": 0.49473067071643123, + "learning_rate": 2.9472846472869298e-05, + "loss": 0.6537, + "step": 946 + }, + { + "epoch": 0.7576, + "grad_norm": 0.43121447094036847, + "learning_rate": 2.9289321881345254e-05, + "loss": 0.749, + "step": 947 + }, + { + "epoch": 0.7584, + "grad_norm": 0.3945702962951126, + "learning_rate": 2.9106272383513835e-05, + "loss": 0.6462, + "step": 948 + }, + { + "epoch": 0.7592, + "grad_norm": 0.4256126539096834, + "learning_rate": 2.8923699209255284e-05, + "loss": 0.6409, + "step": 949 + }, + { + "epoch": 0.76, + "grad_norm": 0.4209330020909794, + "learning_rate": 2.874160358524931e-05, + "loss": 0.7048, + "step": 950 + }, + { + "epoch": 0.7608, + "grad_norm": 0.44465171515956664, + "learning_rate": 2.8559986734967282e-05, + "loss": 0.7486, + "step": 951 + }, + { + "epoch": 0.7616, + "grad_norm": 0.470146543029111, + "learning_rate": 2.8378849878663628e-05, + "loss": 0.6958, + "step": 952 + }, + { + "epoch": 0.7624, + "grad_norm": 0.3430542192799981, + "learning_rate": 2.819819423336775e-05, + "loss": 0.6681, + "step": 953 + }, + { + "epoch": 0.7632, + "grad_norm": 0.4278662707906049, + "learning_rate": 2.8018021012875994e-05, + "loss": 0.6733, + "step": 954 + }, + { + "epoch": 0.764, + "grad_norm": 0.46376730813190903, + "learning_rate": 2.7838331427743282e-05, + "loss": 0.7474, + "step": 955 + }, + { + "epoch": 0.7648, + "grad_norm": 0.4520046856226995, + "learning_rate": 2.7659126685275027e-05, + "loss": 0.6573, + "step": 956 + }, + { + "epoch": 0.7656, + "grad_norm": 0.32907579332774634, + "learning_rate": 2.7480407989519198e-05, + "loss": 0.5786, + "step": 957 + }, + { + "epoch": 0.7664, + "grad_norm": 0.42564964837736396, + "learning_rate": 2.7302176541257986e-05, + "loss": 0.668, + "step": 958 + }, + { + "epoch": 0.7672, + "grad_norm": 0.3959031941752177, + "learning_rate": 2.712443353799984e-05, + "loss": 0.6476, + "step": 959 + }, + { + "epoch": 0.768, + "grad_norm": 0.4029997278045812, + "learning_rate": 2.6947180173971508e-05, + "loss": 0.6632, + "step": 960 + }, + { + "epoch": 0.7688, + "grad_norm": 0.43738493230571895, + "learning_rate": 2.677041764010988e-05, + "loss": 0.6769, + "step": 961 + }, + { + "epoch": 0.7696, + "grad_norm": 0.45933231703250693, + "learning_rate": 2.659414712405398e-05, + "loss": 0.6613, + "step": 962 + }, + { + "epoch": 0.7704, + "grad_norm": 0.39551741707098925, + "learning_rate": 2.6418369810137188e-05, + "loss": 0.6997, + "step": 963 + }, + { + "epoch": 0.7712, + "grad_norm": 0.4034196705758546, + "learning_rate": 2.6243086879379e-05, + "loss": 0.6626, + "step": 964 + }, + { + "epoch": 0.772, + "grad_norm": 0.4129745482055319, + "learning_rate": 2.6068299509477266e-05, + "loss": 0.6623, + "step": 965 + }, + { + "epoch": 0.7728, + "grad_norm": 0.5171749789515476, + "learning_rate": 2.5894008874800325e-05, + "loss": 0.6979, + "step": 966 + }, + { + "epoch": 0.7736, + "grad_norm": 0.4285692421209914, + "learning_rate": 2.5720216146378917e-05, + "loss": 0.6502, + "step": 967 + }, + { + "epoch": 0.7744, + "grad_norm": 0.4081031515560949, + "learning_rate": 2.5546922491898495e-05, + "loss": 0.6701, + "step": 968 + }, + { + "epoch": 0.7752, + "grad_norm": 0.3674226067898384, + "learning_rate": 2.5374129075691265e-05, + "loss": 0.6832, + "step": 969 + }, + { + "epoch": 0.776, + "grad_norm": 0.3756956369593102, + "learning_rate": 2.5201837058728505e-05, + "loss": 0.6561, + "step": 970 + }, + { + "epoch": 0.7768, + "grad_norm": 0.4487599902084762, + "learning_rate": 2.503004759861258e-05, + "loss": 0.6587, + "step": 971 + }, + { + "epoch": 0.7776, + "grad_norm": 0.46317087209149344, + "learning_rate": 2.485876184956928e-05, + "loss": 0.6616, + "step": 972 + }, + { + "epoch": 0.7784, + "grad_norm": 0.46869974121680025, + "learning_rate": 2.4687980962440072e-05, + "loss": 0.6819, + "step": 973 + }, + { + "epoch": 0.7792, + "grad_norm": 0.3929096221124496, + "learning_rate": 2.451770608467432e-05, + "loss": 0.6349, + "step": 974 + }, + { + "epoch": 0.78, + "grad_norm": 0.38727743479523596, + "learning_rate": 2.4347938360321566e-05, + "loss": 0.6367, + "step": 975 + }, + { + "epoch": 0.7808, + "grad_norm": 0.4463347660896269, + "learning_rate": 2.417867893002387e-05, + "loss": 0.6847, + "step": 976 + }, + { + "epoch": 0.7816, + "grad_norm": 0.3458659314793, + "learning_rate": 2.400992893100822e-05, + "loss": 0.6058, + "step": 977 + }, + { + "epoch": 0.7824, + "grad_norm": 0.4659795346966299, + "learning_rate": 2.3841689497078746e-05, + "loss": 0.6699, + "step": 978 + }, + { + "epoch": 0.7832, + "grad_norm": 0.43772876372166897, + "learning_rate": 2.3673961758609152e-05, + "loss": 0.6882, + "step": 979 + }, + { + "epoch": 0.784, + "grad_norm": 0.41809773512693893, + "learning_rate": 2.3506746842535242e-05, + "loss": 0.7028, + "step": 980 + }, + { + "epoch": 0.7848, + "grad_norm": 0.4515957698474849, + "learning_rate": 2.334004587234717e-05, + "loss": 0.6731, + "step": 981 + }, + { + "epoch": 0.7856, + "grad_norm": 0.5262501752393429, + "learning_rate": 2.3173859968081944e-05, + "loss": 0.7666, + "step": 982 + }, + { + "epoch": 0.7864, + "grad_norm": 0.36199340044564093, + "learning_rate": 2.300819024631603e-05, + "loss": 0.716, + "step": 983 + }, + { + "epoch": 0.7872, + "grad_norm": 0.44144751312494, + "learning_rate": 2.2843037820157675e-05, + "loss": 0.6346, + "step": 984 + }, + { + "epoch": 0.788, + "grad_norm": 0.4342055593224267, + "learning_rate": 2.26784037992395e-05, + "loss": 0.7005, + "step": 985 + }, + { + "epoch": 0.7888, + "grad_norm": 0.3626795796653051, + "learning_rate": 2.251428928971102e-05, + "loss": 0.6281, + "step": 986 + }, + { + "epoch": 0.7896, + "grad_norm": 0.4388778436201537, + "learning_rate": 2.2350695394231345e-05, + "loss": 0.6291, + "step": 987 + }, + { + "epoch": 0.7904, + "grad_norm": 0.4350075171081566, + "learning_rate": 2.2187623211961562e-05, + "loss": 0.7171, + "step": 988 + }, + { + "epoch": 0.7912, + "grad_norm": 0.46335806131502627, + "learning_rate": 2.2025073838557454e-05, + "loss": 0.7209, + "step": 989 + }, + { + "epoch": 0.792, + "grad_norm": 0.404156302898499, + "learning_rate": 2.1863048366162208e-05, + "loss": 0.6586, + "step": 990 + }, + { + "epoch": 0.7928, + "grad_norm": 0.4560385946479281, + "learning_rate": 2.1701547883398922e-05, + "loss": 0.7372, + "step": 991 + }, + { + "epoch": 0.7936, + "grad_norm": 0.3512054610727605, + "learning_rate": 2.1540573475363402e-05, + "loss": 0.583, + "step": 992 + }, + { + "epoch": 0.7944, + "grad_norm": 0.5163389460296613, + "learning_rate": 2.138012622361689e-05, + "loss": 0.7242, + "step": 993 + }, + { + "epoch": 0.7952, + "grad_norm": 0.40836637784071556, + "learning_rate": 2.1220207206178688e-05, + "loss": 0.6489, + "step": 994 + }, + { + "epoch": 0.796, + "grad_norm": 0.38595969998255003, + "learning_rate": 2.106081749751897e-05, + "loss": 0.6705, + "step": 995 + }, + { + "epoch": 0.7968, + "grad_norm": 0.44458769974886425, + "learning_rate": 2.0901958168551638e-05, + "loss": 0.7151, + "step": 996 + }, + { + "epoch": 0.7976, + "grad_norm": 0.387572491963222, + "learning_rate": 2.0743630286627002e-05, + "loss": 0.6634, + "step": 997 + }, + { + "epoch": 0.7984, + "grad_norm": 0.39831307254179926, + "learning_rate": 2.058583491552465e-05, + "loss": 0.6851, + "step": 998 + }, + { + "epoch": 0.7992, + "grad_norm": 0.4326793722012506, + "learning_rate": 2.0428573115446392e-05, + "loss": 0.6522, + "step": 999 + }, + { + "epoch": 0.8, + "grad_norm": 0.3761518009550033, + "learning_rate": 2.027184594300898e-05, + "loss": 0.6898, + "step": 1000 + }, + { + "epoch": 0.8008, + "grad_norm": 0.5318168930636895, + "learning_rate": 2.011565445123711e-05, + "loss": 0.7398, + "step": 1001 + }, + { + "epoch": 0.8016, + "grad_norm": 0.4402774976471995, + "learning_rate": 1.995999968955641e-05, + "loss": 0.6764, + "step": 1002 + }, + { + "epoch": 0.8024, + "grad_norm": 0.3636615923719273, + "learning_rate": 1.980488270378612e-05, + "loss": 0.5973, + "step": 1003 + }, + { + "epoch": 0.8032, + "grad_norm": 0.5615009760015723, + "learning_rate": 1.9650304536132426e-05, + "loss": 0.7686, + "step": 1004 + }, + { + "epoch": 0.804, + "grad_norm": 0.5488451248282121, + "learning_rate": 1.9496266225181248e-05, + "loss": 0.6594, + "step": 1005 + }, + { + "epoch": 0.8048, + "grad_norm": 0.5068495893501076, + "learning_rate": 1.9342768805891178e-05, + "loss": 0.7588, + "step": 1006 + }, + { + "epoch": 0.8056, + "grad_norm": 0.6407632462410784, + "learning_rate": 1.918981330958678e-05, + "loss": 0.7611, + "step": 1007 + }, + { + "epoch": 0.8064, + "grad_norm": 0.4900238542939908, + "learning_rate": 1.903740076395151e-05, + "loss": 0.6692, + "step": 1008 + }, + { + "epoch": 0.8072, + "grad_norm": 0.4458753891615236, + "learning_rate": 1.8885532193020704e-05, + "loss": 0.7154, + "step": 1009 + }, + { + "epoch": 0.808, + "grad_norm": 0.4268728281983383, + "learning_rate": 1.8734208617174988e-05, + "loss": 0.7403, + "step": 1010 + }, + { + "epoch": 0.8088, + "grad_norm": 0.4163307353758894, + "learning_rate": 1.8583431053133127e-05, + "loss": 0.6751, + "step": 1011 + }, + { + "epoch": 0.8096, + "grad_norm": 0.3790711693886177, + "learning_rate": 1.8433200513945337e-05, + "loss": 0.6714, + "step": 1012 + }, + { + "epoch": 0.8104, + "grad_norm": 0.43613734243006946, + "learning_rate": 1.8283518008986567e-05, + "loss": 0.6329, + "step": 1013 + }, + { + "epoch": 0.8112, + "grad_norm": 0.416625334207955, + "learning_rate": 1.8134384543949478e-05, + "loss": 0.582, + "step": 1014 + }, + { + "epoch": 0.812, + "grad_norm": 0.3573621377255294, + "learning_rate": 1.7985801120837865e-05, + "loss": 0.6391, + "step": 1015 + }, + { + "epoch": 0.8128, + "grad_norm": 0.44838677241013075, + "learning_rate": 1.783776873795994e-05, + "loss": 0.6694, + "step": 1016 + }, + { + "epoch": 0.8136, + "grad_norm": 0.49598155654857595, + "learning_rate": 1.7690288389921493e-05, + "loss": 0.7922, + "step": 1017 + }, + { + "epoch": 0.8144, + "grad_norm": 0.4195459170668918, + "learning_rate": 1.754336106761927e-05, + "loss": 0.665, + "step": 1018 + }, + { + "epoch": 0.8152, + "grad_norm": 0.40749648619205064, + "learning_rate": 1.739698775823442e-05, + "loss": 0.6148, + "step": 1019 + }, + { + "epoch": 0.816, + "grad_norm": 0.4928095362771939, + "learning_rate": 1.7251169445225657e-05, + "loss": 0.741, + "step": 1020 + }, + { + "epoch": 0.8168, + "grad_norm": 0.42196252796864997, + "learning_rate": 1.7105907108322816e-05, + "loss": 0.648, + "step": 1021 + }, + { + "epoch": 0.8176, + "grad_norm": 0.407490799028459, + "learning_rate": 1.696120172352025e-05, + "loss": 0.6745, + "step": 1022 + }, + { + "epoch": 0.8184, + "grad_norm": 0.4430637115129798, + "learning_rate": 1.6817054263070174e-05, + "loss": 0.6647, + "step": 1023 + }, + { + "epoch": 0.8192, + "grad_norm": 0.41572550766040145, + "learning_rate": 1.6673465695476232e-05, + "loss": 0.6626, + "step": 1024 + }, + { + "epoch": 0.82, + "grad_norm": 0.48442852541343595, + "learning_rate": 1.6530436985486996e-05, + "loss": 0.7474, + "step": 1025 + }, + { + "epoch": 0.8208, + "grad_norm": 0.39576276773544145, + "learning_rate": 1.6387969094089316e-05, + "loss": 0.6829, + "step": 1026 + }, + { + "epoch": 0.8216, + "grad_norm": 0.40957410219070856, + "learning_rate": 1.6246062978502164e-05, + "loss": 0.6913, + "step": 1027 + }, + { + "epoch": 0.8224, + "grad_norm": 0.40815395918551955, + "learning_rate": 1.6104719592169902e-05, + "loss": 0.6332, + "step": 1028 + }, + { + "epoch": 0.8232, + "grad_norm": 0.45848707393438665, + "learning_rate": 1.5963939884756042e-05, + "loss": 0.6935, + "step": 1029 + }, + { + "epoch": 0.824, + "grad_norm": 0.3955987497629303, + "learning_rate": 1.5823724802136865e-05, + "loss": 0.6329, + "step": 1030 + }, + { + "epoch": 0.8248, + "grad_norm": 0.48396955421661225, + "learning_rate": 1.5684075286394985e-05, + "loss": 0.6775, + "step": 1031 + }, + { + "epoch": 0.8256, + "grad_norm": 0.37748761806878395, + "learning_rate": 1.5544992275813053e-05, + "loss": 0.6406, + "step": 1032 + }, + { + "epoch": 0.8264, + "grad_norm": 0.35216481069312683, + "learning_rate": 1.5406476704867524e-05, + "loss": 0.6134, + "step": 1033 + }, + { + "epoch": 0.8272, + "grad_norm": 0.44214502121194277, + "learning_rate": 1.526852950422226e-05, + "loss": 0.644, + "step": 1034 + }, + { + "epoch": 0.828, + "grad_norm": 0.4885588405453753, + "learning_rate": 1.5131151600722337e-05, + "loss": 0.6568, + "step": 1035 + }, + { + "epoch": 0.8288, + "grad_norm": 0.4203449899965025, + "learning_rate": 1.4994343917387854e-05, + "loss": 0.6871, + "step": 1036 + }, + { + "epoch": 0.8296, + "grad_norm": 0.3679142957176437, + "learning_rate": 1.485810737340767e-05, + "loss": 0.6178, + "step": 1037 + }, + { + "epoch": 0.8304, + "grad_norm": 0.4319693861040839, + "learning_rate": 1.4722442884133214e-05, + "loss": 0.635, + "step": 1038 + }, + { + "epoch": 0.8312, + "grad_norm": 0.40801722426942594, + "learning_rate": 1.4587351361072454e-05, + "loss": 0.6086, + "step": 1039 + }, + { + "epoch": 0.832, + "grad_norm": 0.4317790754967962, + "learning_rate": 1.4452833711883628e-05, + "loss": 0.7299, + "step": 1040 + }, + { + "epoch": 0.8328, + "grad_norm": 0.39203068360957366, + "learning_rate": 1.4318890840369182e-05, + "loss": 0.6272, + "step": 1041 + }, + { + "epoch": 0.8336, + "grad_norm": 0.42847622643373917, + "learning_rate": 1.4185523646469822e-05, + "loss": 0.7315, + "step": 1042 + }, + { + "epoch": 0.8344, + "grad_norm": 0.4515897736109138, + "learning_rate": 1.4052733026258281e-05, + "loss": 0.6656, + "step": 1043 + }, + { + "epoch": 0.8352, + "grad_norm": 0.46418313495739943, + "learning_rate": 1.3920519871933424e-05, + "loss": 0.7922, + "step": 1044 + }, + { + "epoch": 0.836, + "grad_norm": 0.4190693005291634, + "learning_rate": 1.3788885071814172e-05, + "loss": 0.6779, + "step": 1045 + }, + { + "epoch": 0.8368, + "grad_norm": 0.4748448416607454, + "learning_rate": 1.3657829510333654e-05, + "loss": 0.6192, + "step": 1046 + }, + { + "epoch": 0.8376, + "grad_norm": 0.4570517756847288, + "learning_rate": 1.3527354068033139e-05, + "loss": 0.6838, + "step": 1047 + }, + { + "epoch": 0.8384, + "grad_norm": 0.43346042947082064, + "learning_rate": 1.339745962155613e-05, + "loss": 0.627, + "step": 1048 + }, + { + "epoch": 0.8392, + "grad_norm": 0.6083316409096777, + "learning_rate": 1.326814704364262e-05, + "loss": 0.7133, + "step": 1049 + }, + { + "epoch": 0.84, + "grad_norm": 0.4149318177968615, + "learning_rate": 1.3139417203123027e-05, + "loss": 0.6812, + "step": 1050 + }, + { + "epoch": 0.8408, + "grad_norm": 0.40279033289445737, + "learning_rate": 1.3011270964912459e-05, + "loss": 0.6491, + "step": 1051 + }, + { + "epoch": 0.8416, + "grad_norm": 0.3661942288150413, + "learning_rate": 1.2883709190004955e-05, + "loss": 0.6409, + "step": 1052 + }, + { + "epoch": 0.8424, + "grad_norm": 0.3492518693723157, + "learning_rate": 1.275673273546758e-05, + "loss": 0.6449, + "step": 1053 + }, + { + "epoch": 0.8432, + "grad_norm": 0.38664075688037525, + "learning_rate": 1.263034245443473e-05, + "loss": 0.6476, + "step": 1054 + }, + { + "epoch": 0.844, + "grad_norm": 0.3732950185886913, + "learning_rate": 1.2504539196102439e-05, + "loss": 0.6451, + "step": 1055 + }, + { + "epoch": 0.8448, + "grad_norm": 0.40586156896060066, + "learning_rate": 1.2379323805722576e-05, + "loss": 0.6676, + "step": 1056 + }, + { + "epoch": 0.8456, + "grad_norm": 0.39914721626542893, + "learning_rate": 1.2254697124597237e-05, + "loss": 0.6688, + "step": 1057 + }, + { + "epoch": 0.8464, + "grad_norm": 0.5858423597476015, + "learning_rate": 1.2130659990073146e-05, + "loss": 0.7109, + "step": 1058 + }, + { + "epoch": 0.8472, + "grad_norm": 0.39004996372302436, + "learning_rate": 1.2007213235535786e-05, + "loss": 0.6711, + "step": 1059 + }, + { + "epoch": 0.848, + "grad_norm": 0.4422846916746057, + "learning_rate": 1.1884357690404158e-05, + "loss": 0.7074, + "step": 1060 + }, + { + "epoch": 0.8488, + "grad_norm": 0.38360376191728496, + "learning_rate": 1.176209418012495e-05, + "loss": 0.6687, + "step": 1061 + }, + { + "epoch": 0.8496, + "grad_norm": 0.4103838694367495, + "learning_rate": 1.1640423526166988e-05, + "loss": 0.6366, + "step": 1062 + }, + { + "epoch": 0.8504, + "grad_norm": 0.4543607701441768, + "learning_rate": 1.1519346546015907e-05, + "loss": 0.6662, + "step": 1063 + }, + { + "epoch": 0.8512, + "grad_norm": 0.481168479070166, + "learning_rate": 1.1398864053168534e-05, + "loss": 0.7172, + "step": 1064 + }, + { + "epoch": 0.852, + "grad_norm": 0.3571374357856971, + "learning_rate": 1.1278976857127311e-05, + "loss": 0.5997, + "step": 1065 + }, + { + "epoch": 0.8528, + "grad_norm": 0.4656901277210353, + "learning_rate": 1.1159685763395111e-05, + "loss": 0.6883, + "step": 1066 + }, + { + "epoch": 0.8536, + "grad_norm": 0.3883852086224729, + "learning_rate": 1.1040991573469629e-05, + "loss": 0.6142, + "step": 1067 + }, + { + "epoch": 0.8544, + "grad_norm": 0.46282118180927284, + "learning_rate": 1.0922895084838037e-05, + "loss": 0.6742, + "step": 1068 + }, + { + "epoch": 0.8552, + "grad_norm": 0.44416827498862815, + "learning_rate": 1.0805397090971737e-05, + "loss": 0.7332, + "step": 1069 + }, + { + "epoch": 0.856, + "grad_norm": 0.3764661067145374, + "learning_rate": 1.0688498381320855e-05, + "loss": 0.6429, + "step": 1070 + }, + { + "epoch": 0.8568, + "grad_norm": 0.462275565700437, + "learning_rate": 1.057219974130903e-05, + "loss": 0.6839, + "step": 1071 + }, + { + "epoch": 0.8576, + "grad_norm": 0.4730388536693979, + "learning_rate": 1.045650195232819e-05, + "loss": 0.6604, + "step": 1072 + }, + { + "epoch": 0.8584, + "grad_norm": 0.4795687153560455, + "learning_rate": 1.0341405791733183e-05, + "loss": 0.7347, + "step": 1073 + }, + { + "epoch": 0.8592, + "grad_norm": 0.4716673153974263, + "learning_rate": 1.0226912032836611e-05, + "loss": 0.696, + "step": 1074 + }, + { + "epoch": 0.86, + "grad_norm": 0.4664103686844258, + "learning_rate": 1.0113021444903726e-05, + "loss": 0.7794, + "step": 1075 + }, + { + "epoch": 0.8608, + "grad_norm": 0.5018422631057337, + "learning_rate": 9.999734793146998e-06, + "loss": 0.7322, + "step": 1076 + }, + { + "epoch": 0.8616, + "grad_norm": 0.42993250880327977, + "learning_rate": 9.887052838721322e-06, + "loss": 0.6518, + "step": 1077 + }, + { + "epoch": 0.8624, + "grad_norm": 0.44171724743541363, + "learning_rate": 9.774976338718677e-06, + "loss": 0.7116, + "step": 1078 + }, + { + "epoch": 0.8632, + "grad_norm": 0.4419988093536476, + "learning_rate": 9.663506046162985e-06, + "loss": 0.6499, + "step": 1079 + }, + { + "epoch": 0.864, + "grad_norm": 0.5430040198384588, + "learning_rate": 9.552642710005299e-06, + "loss": 0.6606, + "step": 1080 + }, + { + "epoch": 0.8648, + "grad_norm": 0.4258664961000874, + "learning_rate": 9.44238707511862e-06, + "loss": 0.6453, + "step": 1081 + }, + { + "epoch": 0.8656, + "grad_norm": 0.4620311190948906, + "learning_rate": 9.332739882292752e-06, + "loss": 0.7282, + "step": 1082 + }, + { + "epoch": 0.8664, + "grad_norm": 0.474533642643054, + "learning_rate": 9.22370186822965e-06, + "loss": 0.7126, + "step": 1083 + }, + { + "epoch": 0.8672, + "grad_norm": 0.4419312126447903, + "learning_rate": 9.115273765538202e-06, + "loss": 0.6813, + "step": 1084 + }, + { + "epoch": 0.868, + "grad_norm": 0.4726161631102984, + "learning_rate": 9.0074563027294e-06, + "loss": 0.6794, + "step": 1085 + }, + { + "epoch": 0.8688, + "grad_norm": 0.3932056993668791, + "learning_rate": 8.900250204211514e-06, + "loss": 0.6492, + "step": 1086 + }, + { + "epoch": 0.8696, + "grad_norm": 0.38928214123694843, + "learning_rate": 8.79365619028507e-06, + "loss": 0.6556, + "step": 1087 + }, + { + "epoch": 0.8704, + "grad_norm": 0.4518171591491437, + "learning_rate": 8.687674977138116e-06, + "loss": 0.7284, + "step": 1088 + }, + { + "epoch": 0.8712, + "grad_norm": 0.44001482188386976, + "learning_rate": 8.582307276841462e-06, + "loss": 0.6851, + "step": 1089 + }, + { + "epoch": 0.872, + "grad_norm": 0.40605390396561136, + "learning_rate": 8.47755379734373e-06, + "loss": 0.7082, + "step": 1090 + }, + { + "epoch": 0.8728, + "grad_norm": 0.47335094277664147, + "learning_rate": 8.37341524246672e-06, + "loss": 0.7335, + "step": 1091 + }, + { + "epoch": 0.8736, + "grad_norm": 0.3744189862391875, + "learning_rate": 8.269892311900696e-06, + "loss": 0.6864, + "step": 1092 + }, + { + "epoch": 0.8744, + "grad_norm": 0.45417328532589746, + "learning_rate": 8.166985701199582e-06, + "loss": 0.6185, + "step": 1093 + }, + { + "epoch": 0.8752, + "grad_norm": 0.43987961812149784, + "learning_rate": 8.064696101776358e-06, + "loss": 0.7014, + "step": 1094 + }, + { + "epoch": 0.876, + "grad_norm": 0.42984884893442815, + "learning_rate": 7.963024200898462e-06, + "loss": 0.716, + "step": 1095 + }, + { + "epoch": 0.8768, + "grad_norm": 0.47596598245542965, + "learning_rate": 7.861970681683051e-06, + "loss": 0.761, + "step": 1096 + }, + { + "epoch": 0.8776, + "grad_norm": 0.39859236660861885, + "learning_rate": 7.761536223092458e-06, + "loss": 0.6249, + "step": 1097 + }, + { + "epoch": 0.8784, + "grad_norm": 0.4176226059997992, + "learning_rate": 7.661721499929753e-06, + "loss": 0.6545, + "step": 1098 + }, + { + "epoch": 0.8792, + "grad_norm": 0.4091784905788249, + "learning_rate": 7.562527182833978e-06, + "loss": 0.6413, + "step": 1099 + }, + { + "epoch": 0.88, + "grad_norm": 0.4479446913269882, + "learning_rate": 7.463953938275858e-06, + "loss": 0.6985, + "step": 1100 + }, + { + "epoch": 0.8808, + "grad_norm": 0.4274965995923488, + "learning_rate": 7.366002428553153e-06, + "loss": 0.6279, + "step": 1101 + }, + { + "epoch": 0.8816, + "grad_norm": 0.4175199943420985, + "learning_rate": 7.2686733117863784e-06, + "loss": 0.7025, + "step": 1102 + }, + { + "epoch": 0.8824, + "grad_norm": 0.3692704830319196, + "learning_rate": 7.171967241914224e-06, + "loss": 0.5538, + "step": 1103 + }, + { + "epoch": 0.8832, + "grad_norm": 0.41193975984275244, + "learning_rate": 7.07588486868922e-06, + "loss": 0.6576, + "step": 1104 + }, + { + "epoch": 0.884, + "grad_norm": 0.3879903506222446, + "learning_rate": 6.980426837673437e-06, + "loss": 0.5984, + "step": 1105 + }, + { + "epoch": 0.8848, + "grad_norm": 0.4788245547001526, + "learning_rate": 6.8855937902340576e-06, + "loss": 0.7042, + "step": 1106 + }, + { + "epoch": 0.8856, + "grad_norm": 0.3825999151613301, + "learning_rate": 6.791386363539065e-06, + "loss": 0.6431, + "step": 1107 + }, + { + "epoch": 0.8864, + "grad_norm": 0.3874923256907664, + "learning_rate": 6.6978051905530855e-06, + "loss": 0.7392, + "step": 1108 + }, + { + "epoch": 0.8872, + "grad_norm": 0.49706772007250527, + "learning_rate": 6.604850900032955e-06, + "loss": 0.6588, + "step": 1109 + }, + { + "epoch": 0.888, + "grad_norm": 0.446642706622744, + "learning_rate": 6.512524116523633e-06, + "loss": 0.6563, + "step": 1110 + }, + { + "epoch": 0.8888, + "grad_norm": 0.43648055985384054, + "learning_rate": 6.420825460353974e-06, + "loss": 0.675, + "step": 1111 + }, + { + "epoch": 0.8896, + "grad_norm": 0.4487207757830329, + "learning_rate": 6.329755547632499e-06, + "loss": 0.6355, + "step": 1112 + }, + { + "epoch": 0.8904, + "grad_norm": 0.6904081084293971, + "learning_rate": 6.239314990243339e-06, + "loss": 0.7009, + "step": 1113 + }, + { + "epoch": 0.8912, + "grad_norm": 0.374706566591849, + "learning_rate": 6.149504395842087e-06, + "loss": 0.6127, + "step": 1114 + }, + { + "epoch": 0.892, + "grad_norm": 0.409270401776318, + "learning_rate": 6.0603243678516995e-06, + "loss": 0.7169, + "step": 1115 + }, + { + "epoch": 0.8928, + "grad_norm": 0.5035641222209561, + "learning_rate": 5.971775505458444e-06, + "loss": 0.656, + "step": 1116 + }, + { + "epoch": 0.8936, + "grad_norm": 0.4707754134426772, + "learning_rate": 5.883858403607967e-06, + "loss": 0.7027, + "step": 1117 + }, + { + "epoch": 0.8944, + "grad_norm": 0.4249444770999359, + "learning_rate": 5.7965736530010916e-06, + "loss": 0.6529, + "step": 1118 + }, + { + "epoch": 0.8952, + "grad_norm": 0.3998860715897474, + "learning_rate": 5.7099218400900716e-06, + "loss": 0.6776, + "step": 1119 + }, + { + "epoch": 0.896, + "grad_norm": 0.4674436263252115, + "learning_rate": 5.623903547074549e-06, + "loss": 0.6552, + "step": 1120 + }, + { + "epoch": 0.8968, + "grad_norm": 0.44788320358106837, + "learning_rate": 5.538519351897575e-06, + "loss": 0.6159, + "step": 1121 + }, + { + "epoch": 0.8976, + "grad_norm": 0.41531233991063055, + "learning_rate": 5.453769828241872e-06, + "loss": 0.6908, + "step": 1122 + }, + { + "epoch": 0.8984, + "grad_norm": 0.393963733771613, + "learning_rate": 5.369655545525909e-06, + "loss": 0.6512, + "step": 1123 + }, + { + "epoch": 0.8992, + "grad_norm": 0.44347314190488557, + "learning_rate": 5.286177068899989e-06, + "loss": 0.6791, + "step": 1124 + }, + { + "epoch": 0.9, + "grad_norm": 0.41861294210651295, + "learning_rate": 5.2033349592426335e-06, + "loss": 0.7193, + "step": 1125 + }, + { + "epoch": 0.9008, + "grad_norm": 0.47729281110819066, + "learning_rate": 5.121129773156663e-06, + "loss": 0.643, + "step": 1126 + }, + { + "epoch": 0.9016, + "grad_norm": 0.3943339306536462, + "learning_rate": 5.039562062965508e-06, + "loss": 0.6673, + "step": 1127 + }, + { + "epoch": 0.9024, + "grad_norm": 0.44250450829581267, + "learning_rate": 4.95863237670956e-06, + "loss": 0.69, + "step": 1128 + }, + { + "epoch": 0.9032, + "grad_norm": 0.43638229451147903, + "learning_rate": 4.87834125814235e-06, + "loss": 0.6687, + "step": 1129 + }, + { + "epoch": 0.904, + "grad_norm": 0.47806195481300895, + "learning_rate": 4.798689246727006e-06, + "loss": 0.6697, + "step": 1130 + }, + { + "epoch": 0.9048, + "grad_norm": 0.37866928780170195, + "learning_rate": 4.719676877632639e-06, + "loss": 0.6488, + "step": 1131 + }, + { + "epoch": 0.9056, + "grad_norm": 0.4270131231094788, + "learning_rate": 4.641304681730641e-06, + "loss": 0.6531, + "step": 1132 + }, + { + "epoch": 0.9064, + "grad_norm": 0.4239074659792173, + "learning_rate": 4.563573185591219e-06, + "loss": 0.672, + "step": 1133 + }, + { + "epoch": 0.9072, + "grad_norm": 0.4010380547708696, + "learning_rate": 4.486482911479839e-06, + "loss": 0.6814, + "step": 1134 + }, + { + "epoch": 0.908, + "grad_norm": 0.38254446097538725, + "learning_rate": 4.4100343773536225e-06, + "loss": 0.59, + "step": 1135 + }, + { + "epoch": 0.9088, + "grad_norm": 0.3816544876495538, + "learning_rate": 4.3342280968580285e-06, + "loss": 0.6516, + "step": 1136 + }, + { + "epoch": 0.9096, + "grad_norm": 0.4149851969056768, + "learning_rate": 4.259064579323302e-06, + "loss": 0.6342, + "step": 1137 + }, + { + "epoch": 0.9104, + "grad_norm": 0.4939913273078448, + "learning_rate": 4.184544329761009e-06, + "loss": 0.7097, + "step": 1138 + }, + { + "epoch": 0.9112, + "grad_norm": 0.4473620301826761, + "learning_rate": 4.1106678488607495e-06, + "loss": 0.6842, + "step": 1139 + }, + { + "epoch": 0.912, + "grad_norm": 0.347898190178297, + "learning_rate": 4.037435632986786e-06, + "loss": 0.6866, + "step": 1140 + }, + { + "epoch": 0.9128, + "grad_norm": 0.4677631604683346, + "learning_rate": 3.964848174174541e-06, + "loss": 0.7153, + "step": 1141 + }, + { + "epoch": 0.9136, + "grad_norm": 0.36857826653537523, + "learning_rate": 3.892905960127546e-06, + "loss": 0.6308, + "step": 1142 + }, + { + "epoch": 0.9144, + "grad_norm": 0.4208952222294186, + "learning_rate": 3.821609474213983e-06, + "loss": 0.6234, + "step": 1143 + }, + { + "epoch": 0.9152, + "grad_norm": 0.3615947693575601, + "learning_rate": 3.750959195463466e-06, + "loss": 0.613, + "step": 1144 + }, + { + "epoch": 0.916, + "grad_norm": 0.3756762767045553, + "learning_rate": 3.6809555985639068e-06, + "loss": 0.6585, + "step": 1145 + }, + { + "epoch": 0.9168, + "grad_norm": 0.4327928230702486, + "learning_rate": 3.611599153858214e-06, + "loss": 0.6826, + "step": 1146 + }, + { + "epoch": 0.9176, + "grad_norm": 0.4042377275237032, + "learning_rate": 3.5428903273411863e-06, + "loss": 0.7428, + "step": 1147 + }, + { + "epoch": 0.9184, + "grad_norm": 0.431581157071277, + "learning_rate": 3.4748295806564356e-06, + "loss": 0.6422, + "step": 1148 + }, + { + "epoch": 0.9192, + "grad_norm": 0.3996661084377291, + "learning_rate": 3.40741737109318e-06, + "loss": 0.6932, + "step": 1149 + }, + { + "epoch": 0.92, + "grad_norm": 0.3986990368694086, + "learning_rate": 3.3406541515832003e-06, + "loss": 0.6977, + "step": 1150 + }, + { + "epoch": 0.9208, + "grad_norm": 0.4280089557109094, + "learning_rate": 3.2745403706978872e-06, + "loss": 0.7035, + "step": 1151 + }, + { + "epoch": 0.9216, + "grad_norm": 0.4437933253446788, + "learning_rate": 3.209076472645112e-06, + "loss": 0.6996, + "step": 1152 + }, + { + "epoch": 0.9224, + "grad_norm": 0.4337974583319791, + "learning_rate": 3.1442628972662704e-06, + "loss": 0.6574, + "step": 1153 + }, + { + "epoch": 0.9232, + "grad_norm": 0.5209740419143642, + "learning_rate": 3.0801000800333877e-06, + "loss": 0.6811, + "step": 1154 + }, + { + "epoch": 0.924, + "grad_norm": 0.5128391656199703, + "learning_rate": 3.0165884520461316e-06, + "loss": 0.6759, + "step": 1155 + }, + { + "epoch": 0.9248, + "grad_norm": 0.5064398613672889, + "learning_rate": 2.9537284400289355e-06, + "loss": 0.711, + "step": 1156 + }, + { + "epoch": 0.9256, + "grad_norm": 0.4040717907377643, + "learning_rate": 2.8915204663281013e-06, + "loss": 0.6127, + "step": 1157 + }, + { + "epoch": 0.9264, + "grad_norm": 0.4900021425263775, + "learning_rate": 2.8299649489090475e-06, + "loss": 0.6503, + "step": 1158 + }, + { + "epoch": 0.9272, + "grad_norm": 0.40148166495479204, + "learning_rate": 2.7690623013533976e-06, + "loss": 0.6751, + "step": 1159 + }, + { + "epoch": 0.928, + "grad_norm": 0.478024833109711, + "learning_rate": 2.708812932856253e-06, + "loss": 0.7441, + "step": 1160 + }, + { + "epoch": 0.9288, + "grad_norm": 0.3966442014026454, + "learning_rate": 2.649217248223468e-06, + "loss": 0.6398, + "step": 1161 + }, + { + "epoch": 0.9296, + "grad_norm": 0.4346210876122322, + "learning_rate": 2.590275647868867e-06, + "loss": 0.6345, + "step": 1162 + }, + { + "epoch": 0.9304, + "grad_norm": 0.4518751493966605, + "learning_rate": 2.5319885278115906e-06, + "loss": 0.6722, + "step": 1163 + }, + { + "epoch": 0.9312, + "grad_norm": 0.45645819261345977, + "learning_rate": 2.4743562796734622e-06, + "loss": 0.7766, + "step": 1164 + }, + { + "epoch": 0.932, + "grad_norm": 0.3906524036530478, + "learning_rate": 2.4173792906762804e-06, + "loss": 0.6913, + "step": 1165 + }, + { + "epoch": 0.9328, + "grad_norm": 0.43531277252955825, + "learning_rate": 2.3610579436393e-06, + "loss": 0.7008, + "step": 1166 + }, + { + "epoch": 0.9336, + "grad_norm": 0.3861596842374385, + "learning_rate": 2.3053926169765984e-06, + "loss": 0.6621, + "step": 1167 + }, + { + "epoch": 0.9344, + "grad_norm": 0.3889786430539552, + "learning_rate": 2.250383684694579e-06, + "loss": 0.6828, + "step": 1168 + }, + { + "epoch": 0.9352, + "grad_norm": 0.4686956061808216, + "learning_rate": 2.1960315163894075e-06, + "loss": 0.7167, + "step": 1169 + }, + { + "epoch": 0.936, + "grad_norm": 0.4139587613272724, + "learning_rate": 2.1423364772445887e-06, + "loss": 0.5721, + "step": 1170 + }, + { + "epoch": 0.9368, + "grad_norm": 0.3707594717031069, + "learning_rate": 2.0892989280284823e-06, + "loss": 0.6226, + "step": 1171 + }, + { + "epoch": 0.9376, + "grad_norm": 0.41238141651385263, + "learning_rate": 2.036919225091827e-06, + "loss": 0.7213, + "step": 1172 + }, + { + "epoch": 0.9384, + "grad_norm": 0.38596134444921354, + "learning_rate": 1.9851977203654835e-06, + "loss": 0.6312, + "step": 1173 + }, + { + "epoch": 0.9392, + "grad_norm": 0.5294431848347445, + "learning_rate": 1.9341347613579087e-06, + "loss": 0.7428, + "step": 1174 + }, + { + "epoch": 0.94, + "grad_norm": 0.3708889449347871, + "learning_rate": 1.8837306911529184e-06, + "loss": 0.647, + "step": 1175 + }, + { + "epoch": 0.9408, + "grad_norm": 0.39920534073294506, + "learning_rate": 1.8339858484073935e-06, + "loss": 0.6604, + "step": 1176 + }, + { + "epoch": 0.9416, + "grad_norm": 0.39860181638799774, + "learning_rate": 1.7849005673489127e-06, + "loss": 0.6281, + "step": 1177 + }, + { + "epoch": 0.9424, + "grad_norm": 0.46231369728897465, + "learning_rate": 1.7364751777736332e-06, + "loss": 0.7061, + "step": 1178 + }, + { + "epoch": 0.9432, + "grad_norm": 0.45745721560795477, + "learning_rate": 1.6887100050439587e-06, + "loss": 0.6742, + "step": 1179 + }, + { + "epoch": 0.944, + "grad_norm": 0.4689754580880514, + "learning_rate": 1.6416053700863964e-06, + "loss": 0.7039, + "step": 1180 + }, + { + "epoch": 0.9448, + "grad_norm": 0.4823618491509199, + "learning_rate": 1.595161589389449e-06, + "loss": 0.662, + "step": 1181 + }, + { + "epoch": 0.9456, + "grad_norm": 0.46336471271106144, + "learning_rate": 1.5493789750014031e-06, + "loss": 0.673, + "step": 1182 + }, + { + "epoch": 0.9464, + "grad_norm": 0.35396268660446345, + "learning_rate": 1.5042578345283108e-06, + "loss": 0.6535, + "step": 1183 + }, + { + "epoch": 0.9472, + "grad_norm": 0.5057640826513973, + "learning_rate": 1.459798471131868e-06, + "loss": 0.6975, + "step": 1184 + }, + { + "epoch": 0.948, + "grad_norm": 0.3795138987493162, + "learning_rate": 1.4160011835273934e-06, + "loss": 0.6151, + "step": 1185 + }, + { + "epoch": 0.9488, + "grad_norm": 0.44355391727254384, + "learning_rate": 1.3728662659818204e-06, + "loss": 0.6406, + "step": 1186 + }, + { + "epoch": 0.9496, + "grad_norm": 0.5399971038799832, + "learning_rate": 1.3303940083117527e-06, + "loss": 0.6719, + "step": 1187 + }, + { + "epoch": 0.9504, + "grad_norm": 0.42414602522240696, + "learning_rate": 1.2885846958814673e-06, + "loss": 0.6897, + "step": 1188 + }, + { + "epoch": 0.9512, + "grad_norm": 0.37502017005224747, + "learning_rate": 1.2474386096010039e-06, + "loss": 0.6794, + "step": 1189 + }, + { + "epoch": 0.952, + "grad_norm": 0.4314696948175594, + "learning_rate": 1.2069560259243328e-06, + "loss": 0.6796, + "step": 1190 + }, + { + "epoch": 0.9528, + "grad_norm": 0.34774182154981, + "learning_rate": 1.1671372168474138e-06, + "loss": 0.6331, + "step": 1191 + }, + { + "epoch": 0.9536, + "grad_norm": 0.4168739496808594, + "learning_rate": 1.1279824499064396e-06, + "loss": 0.6785, + "step": 1192 + }, + { + "epoch": 0.9544, + "grad_norm": 0.3909911905539331, + "learning_rate": 1.089491988176017e-06, + "loss": 0.6534, + "step": 1193 + }, + { + "epoch": 0.9552, + "grad_norm": 0.5097445480440692, + "learning_rate": 1.0516660902673448e-06, + "loss": 0.7623, + "step": 1194 + }, + { + "epoch": 0.956, + "grad_norm": 0.4059124588275709, + "learning_rate": 1.014505010326583e-06, + "loss": 0.6667, + "step": 1195 + }, + { + "epoch": 0.9568, + "grad_norm": 0.43295360086793055, + "learning_rate": 9.780089980330642e-07, + "loss": 0.7039, + "step": 1196 + }, + { + "epoch": 0.9576, + "grad_norm": 0.3585355316944381, + "learning_rate": 9.421782985976068e-07, + "loss": 0.6358, + "step": 1197 + }, + { + "epoch": 0.9584, + "grad_norm": 0.38145025657316156, + "learning_rate": 9.070131527609604e-07, + "loss": 0.6338, + "step": 1198 + }, + { + "epoch": 0.9592, + "grad_norm": 0.40944540455644174, + "learning_rate": 8.725137967920738e-07, + "loss": 0.7277, + "step": 1199 + }, + { + "epoch": 0.96, + "grad_norm": 0.4001505668384557, + "learning_rate": 8.386804624865851e-07, + "loss": 0.6646, + "step": 1200 + }, + { + "epoch": 0.9608, + "grad_norm": 0.4297407107858297, + "learning_rate": 8.055133771652345e-07, + "loss": 0.7087, + "step": 1201 + }, + { + "epoch": 0.9616, + "grad_norm": 0.4628818699611473, + "learning_rate": 7.730127636723539e-07, + "loss": 0.6693, + "step": 1202 + }, + { + "epoch": 0.9624, + "grad_norm": 0.46878217054932964, + "learning_rate": 7.411788403743237e-07, + "loss": 0.6508, + "step": 1203 + }, + { + "epoch": 0.9632, + "grad_norm": 0.4662714739357374, + "learning_rate": 7.100118211581852e-07, + "loss": 0.7321, + "step": 1204 + }, + { + "epoch": 0.964, + "grad_norm": 0.4341883993170443, + "learning_rate": 6.7951191543012e-07, + "loss": 0.7093, + "step": 1205 + }, + { + "epoch": 0.9648, + "grad_norm": 0.42279216696853417, + "learning_rate": 6.496793281141056e-07, + "loss": 0.7288, + "step": 1206 + }, + { + "epoch": 0.9656, + "grad_norm": 0.33331493654272815, + "learning_rate": 6.205142596505176e-07, + "loss": 0.5991, + "step": 1207 + }, + { + "epoch": 0.9664, + "grad_norm": 0.43282013465082525, + "learning_rate": 5.920169059947411e-07, + "loss": 0.625, + "step": 1208 + }, + { + "epoch": 0.9672, + "grad_norm": 0.4787444650581861, + "learning_rate": 5.64187458615939e-07, + "loss": 0.6435, + "step": 1209 + }, + { + "epoch": 0.968, + "grad_norm": 0.45048649980398436, + "learning_rate": 5.370261044956971e-07, + "loss": 0.6765, + "step": 1210 + }, + { + "epoch": 0.9688, + "grad_norm": 0.4651771376062177, + "learning_rate": 5.105330261267916e-07, + "loss": 0.6223, + "step": 1211 + }, + { + "epoch": 0.9696, + "grad_norm": 0.4224991553320907, + "learning_rate": 4.847084015119574e-07, + "loss": 0.6566, + "step": 1212 + }, + { + "epoch": 0.9704, + "grad_norm": 0.3582193752613605, + "learning_rate": 4.5955240416271084e-07, + "loss": 0.6721, + "step": 1213 + }, + { + "epoch": 0.9712, + "grad_norm": 0.4734287505841682, + "learning_rate": 4.3506520309813947e-07, + "loss": 0.7126, + "step": 1214 + }, + { + "epoch": 0.972, + "grad_norm": 0.45700700134634636, + "learning_rate": 4.112469628438365e-07, + "loss": 0.6413, + "step": 1215 + }, + { + "epoch": 0.9728, + "grad_norm": 0.44247153043906856, + "learning_rate": 3.8809784343072366e-07, + "loss": 0.6816, + "step": 1216 + }, + { + "epoch": 0.9736, + "grad_norm": 0.47470973799676847, + "learning_rate": 3.6561800039403016e-07, + "loss": 0.7297, + "step": 1217 + }, + { + "epoch": 0.9744, + "grad_norm": 0.5446957137639532, + "learning_rate": 3.4380758477219333e-07, + "loss": 0.643, + "step": 1218 + }, + { + "epoch": 0.9752, + "grad_norm": 0.46334686798287195, + "learning_rate": 3.2266674310589273e-07, + "loss": 0.7692, + "step": 1219 + }, + { + "epoch": 0.976, + "grad_norm": 0.41091671416864994, + "learning_rate": 3.0219561743707326e-07, + "loss": 0.6502, + "step": 1220 + }, + { + "epoch": 0.9768, + "grad_norm": 0.39459758391103256, + "learning_rate": 2.8239434530792365e-07, + "loss": 0.6413, + "step": 1221 + }, + { + "epoch": 0.9776, + "grad_norm": 0.4864800116717401, + "learning_rate": 2.6326305976001055e-07, + "loss": 0.6379, + "step": 1222 + }, + { + "epoch": 0.9784, + "grad_norm": 0.416298793772082, + "learning_rate": 2.448018893333681e-07, + "loss": 0.6878, + "step": 1223 + }, + { + "epoch": 0.9792, + "grad_norm": 0.36409187772858065, + "learning_rate": 2.2701095806565432e-07, + "loss": 0.6677, + "step": 1224 + }, + { + "epoch": 0.98, + "grad_norm": 0.4101215847095293, + "learning_rate": 2.098903854912515e-07, + "loss": 0.6696, + "step": 1225 + }, + { + "epoch": 0.9808, + "grad_norm": 0.3847472437653507, + "learning_rate": 1.9344028664056713e-07, + "loss": 0.5913, + "step": 1226 + }, + { + "epoch": 0.9816, + "grad_norm": 0.4247932929572247, + "learning_rate": 1.7766077203915655e-07, + "loss": 0.5759, + "step": 1227 + }, + { + "epoch": 0.9824, + "grad_norm": 0.3553625121999897, + "learning_rate": 1.6255194770704586e-07, + "loss": 0.635, + "step": 1228 + }, + { + "epoch": 0.9832, + "grad_norm": 0.43164768345714893, + "learning_rate": 1.481139151579991e-07, + "loss": 0.6749, + "step": 1229 + }, + { + "epoch": 0.984, + "grad_norm": 0.46418670622765706, + "learning_rate": 1.3434677139885222e-07, + "loss": 0.6552, + "step": 1230 + }, + { + "epoch": 0.9848, + "grad_norm": 0.46037860768689964, + "learning_rate": 1.2125060892881346e-07, + "loss": 0.6209, + "step": 1231 + }, + { + "epoch": 0.9856, + "grad_norm": 0.3954814721452663, + "learning_rate": 1.0882551573891953e-07, + "loss": 0.669, + "step": 1232 + }, + { + "epoch": 0.9864, + "grad_norm": 0.4632088681723823, + "learning_rate": 9.707157531134713e-08, + "loss": 0.7539, + "step": 1233 + }, + { + "epoch": 0.9872, + "grad_norm": 0.3612381684901188, + "learning_rate": 8.598886661895788e-08, + "loss": 0.7106, + "step": 1234 + }, + { + "epoch": 0.988, + "grad_norm": 0.570964249199408, + "learning_rate": 7.557746412468758e-08, + "loss": 0.7016, + "step": 1235 + }, + { + "epoch": 0.9888, + "grad_norm": 0.43290226218463623, + "learning_rate": 6.583743778106887e-08, + "loss": 0.6845, + "step": 1236 + }, + { + "epoch": 0.9896, + "grad_norm": 0.4336396133365297, + "learning_rate": 5.6768853029787184e-08, + "loss": 0.6387, + "step": 1237 + }, + { + "epoch": 0.9904, + "grad_norm": 0.34928051538481464, + "learning_rate": 4.837177080119215e-08, + "loss": 0.6331, + "step": 1238 + }, + { + "epoch": 0.9912, + "grad_norm": 0.43635113247538154, + "learning_rate": 4.064624751394242e-08, + "loss": 0.7015, + "step": 1239 + }, + { + "epoch": 0.992, + "grad_norm": 0.44316229365990106, + "learning_rate": 3.359233507459481e-08, + "loss": 0.6052, + "step": 1240 + }, + { + "epoch": 0.9928, + "grad_norm": 0.4277488879744065, + "learning_rate": 2.7210080877237976e-08, + "loss": 0.6274, + "step": 1241 + }, + { + "epoch": 0.9936, + "grad_norm": 0.4713402250138666, + "learning_rate": 2.1499527803214846e-08, + "loss": 0.7099, + "step": 1242 + }, + { + "epoch": 0.9944, + "grad_norm": 0.37742729038730555, + "learning_rate": 1.646071422083395e-08, + "loss": 0.6548, + "step": 1243 + }, + { + "epoch": 0.9952, + "grad_norm": 0.42519893779646084, + "learning_rate": 1.209367398504746e-08, + "loss": 0.6707, + "step": 1244 + }, + { + "epoch": 0.996, + "grad_norm": 0.39409367052304717, + "learning_rate": 8.398436437317969e-09, + "loss": 0.7144, + "step": 1245 + }, + { + "epoch": 0.9968, + "grad_norm": 0.4200414963488034, + "learning_rate": 5.375026405352035e-09, + "loss": 0.6496, + "step": 1246 + }, + { + "epoch": 0.9976, + "grad_norm": 0.4802378502631879, + "learning_rate": 3.023464202944748e-09, + "loss": 0.6364, + "step": 1247 + }, + { + "epoch": 0.9984, + "grad_norm": 0.4196396964710776, + "learning_rate": 1.3437656298687097e-09, + "loss": 0.5843, + "step": 1248 + }, + { + "epoch": 0.9992, + "grad_norm": 0.48797004369630237, + "learning_rate": 3.3594197175190745e-10, + "loss": 0.6952, + "step": 1249 + }, + { + "epoch": 1.0, + "grad_norm": 0.38172998753419185, + "learning_rate": 0.0, + "loss": 0.6778, + "step": 1250 + }, + { + "epoch": 1.0, + "step": 1250, + "total_flos": 1075826133532672.0, + "train_loss": 0.7417976837635041, + "train_runtime": 19115.8196, + "train_samples_per_second": 1.046, + "train_steps_per_second": 0.065 + } + ], + "logging_steps": 1.0, + "max_steps": 1250, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1075826133532672.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_2_epochs_1_GA_2_lora/README.md b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_2_epochs_1_GA_2_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_2_epochs_1_GA_2_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_2_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_2_epochs_1_GA_2_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c1f292a7f1683eaf79bddef8cffe2dee5554e074 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_2_epochs_1_GA_2_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "q_proj", + "k_proj", + "o_proj", + "gate_proj", + "up_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..cd7dd1d66ff4ab5dc97a6609c1dc54722fe489a9 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:085d39296c22c360e7c4d534d4f697116d34e17d698aae4ec57a6b035f902fbf +size 671150064 diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_2_epochs_1_GA_2_lora/config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_2_epochs_1_GA_2_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_2_epochs_1_GA_2_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..c1a629de374a3741bbd76d893f1c4b600a0de224 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27bbfba65972d389b8ad7ee2efb761df350c0f7a182b5f4e76e444e472720d34 +size 918507402 diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_2_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_2_epochs_1_GA_2_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..67a95898d230bff50174be6b7bb656fbfec7ca45 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_2_epochs_1_GA_2_lora/trainer_state.json @@ -0,0 +1,8792 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 1250, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0008, + "grad_norm": 0.9795816727549139, + "learning_rate": 5.263157894736842e-06, + "loss": 1.3776, + "step": 1 + }, + { + "epoch": 0.0016, + "grad_norm": 1.001400272852517, + "learning_rate": 1.0526315789473684e-05, + "loss": 1.3593, + "step": 2 + }, + { + "epoch": 0.0024, + "grad_norm": 1.386084578439465, + "learning_rate": 1.5789473684210526e-05, + "loss": 1.4641, + "step": 3 + }, + { + "epoch": 0.0032, + "grad_norm": 0.8968642175430531, + "learning_rate": 2.105263157894737e-05, + "loss": 1.2433, + "step": 4 + }, + { + "epoch": 0.004, + "grad_norm": 0.9172606992324467, + "learning_rate": 2.6315789473684212e-05, + "loss": 1.3735, + "step": 5 + }, + { + "epoch": 0.0048, + "grad_norm": 1.0104451804190506, + "learning_rate": 3.157894736842105e-05, + "loss": 1.3595, + "step": 6 + }, + { + "epoch": 0.0056, + "grad_norm": 0.7202998913128105, + "learning_rate": 3.6842105263157895e-05, + "loss": 1.0944, + "step": 7 + }, + { + "epoch": 0.0064, + "grad_norm": 0.909331910756895, + "learning_rate": 4.210526315789474e-05, + "loss": 1.1729, + "step": 8 + }, + { + "epoch": 0.0072, + "grad_norm": 0.7677918672527282, + "learning_rate": 4.736842105263158e-05, + "loss": 1.0781, + "step": 9 + }, + { + "epoch": 0.008, + "grad_norm": 1.320728027486979, + "learning_rate": 5.2631578947368424e-05, + "loss": 1.1596, + "step": 10 + }, + { + "epoch": 0.0088, + "grad_norm": 0.8528174600822885, + "learning_rate": 5.789473684210527e-05, + "loss": 1.0121, + "step": 11 + }, + { + "epoch": 0.0096, + "grad_norm": 0.7537754667752942, + "learning_rate": 6.31578947368421e-05, + "loss": 0.995, + "step": 12 + }, + { + "epoch": 0.0104, + "grad_norm": 0.8658238321920337, + "learning_rate": 6.842105263157895e-05, + "loss": 1.0719, + "step": 13 + }, + { + "epoch": 0.0112, + "grad_norm": 1.120365218132729, + "learning_rate": 7.368421052631579e-05, + "loss": 0.9672, + "step": 14 + }, + { + "epoch": 0.012, + "grad_norm": 0.8933667962965879, + "learning_rate": 7.894736842105263e-05, + "loss": 0.9901, + "step": 15 + }, + { + "epoch": 0.0128, + "grad_norm": 0.6626033473553431, + "learning_rate": 8.421052631578948e-05, + "loss": 0.9127, + "step": 16 + }, + { + "epoch": 0.0136, + "grad_norm": 0.6598743400498236, + "learning_rate": 8.947368421052632e-05, + "loss": 0.9322, + "step": 17 + }, + { + "epoch": 0.0144, + "grad_norm": 0.5687357947833772, + "learning_rate": 9.473684210526316e-05, + "loss": 0.8733, + "step": 18 + }, + { + "epoch": 0.0152, + "grad_norm": 0.5745221794718155, + "learning_rate": 0.0001, + "loss": 0.9183, + "step": 19 + }, + { + "epoch": 0.016, + "grad_norm": 0.5335547076637125, + "learning_rate": 0.00010526315789473685, + "loss": 0.8869, + "step": 20 + }, + { + "epoch": 0.0168, + "grad_norm": 0.6527646967762165, + "learning_rate": 0.0001105263157894737, + "loss": 0.9387, + "step": 21 + }, + { + "epoch": 0.0176, + "grad_norm": 0.6029701200334054, + "learning_rate": 0.00011578947368421053, + "loss": 0.9282, + "step": 22 + }, + { + "epoch": 0.0184, + "grad_norm": 0.5356669071254307, + "learning_rate": 0.00012105263157894738, + "loss": 0.8965, + "step": 23 + }, + { + "epoch": 0.0192, + "grad_norm": 0.4468237694215002, + "learning_rate": 0.0001263157894736842, + "loss": 0.8501, + "step": 24 + }, + { + "epoch": 0.02, + "grad_norm": 0.6820914041506152, + "learning_rate": 0.00013157894736842108, + "loss": 0.8875, + "step": 25 + }, + { + "epoch": 0.0208, + "grad_norm": 0.5468818728749623, + "learning_rate": 0.0001368421052631579, + "loss": 0.894, + "step": 26 + }, + { + "epoch": 0.0216, + "grad_norm": 0.5108690786169384, + "learning_rate": 0.00014210526315789474, + "loss": 0.9317, + "step": 27 + }, + { + "epoch": 0.0224, + "grad_norm": 0.5823547589849747, + "learning_rate": 0.00014736842105263158, + "loss": 0.9312, + "step": 28 + }, + { + "epoch": 0.0232, + "grad_norm": 0.6348175797361912, + "learning_rate": 0.00015263157894736845, + "loss": 1.0034, + "step": 29 + }, + { + "epoch": 0.024, + "grad_norm": 0.5036760470288159, + "learning_rate": 0.00015789473684210527, + "loss": 0.8095, + "step": 30 + }, + { + "epoch": 0.0248, + "grad_norm": 0.5086285965001636, + "learning_rate": 0.0001631578947368421, + "loss": 0.8288, + "step": 31 + }, + { + "epoch": 0.0256, + "grad_norm": 0.4810370375467901, + "learning_rate": 0.00016842105263157895, + "loss": 0.9338, + "step": 32 + }, + { + "epoch": 0.0264, + "grad_norm": 0.5120024960432132, + "learning_rate": 0.0001736842105263158, + "loss": 0.8819, + "step": 33 + }, + { + "epoch": 0.0272, + "grad_norm": 0.5024966363071183, + "learning_rate": 0.00017894736842105264, + "loss": 0.8253, + "step": 34 + }, + { + "epoch": 0.028, + "grad_norm": 0.5239072940748722, + "learning_rate": 0.00018421052631578948, + "loss": 0.8691, + "step": 35 + }, + { + "epoch": 0.0288, + "grad_norm": 0.5230322392779483, + "learning_rate": 0.00018947368421052632, + "loss": 0.9452, + "step": 36 + }, + { + "epoch": 0.0296, + "grad_norm": 0.48779597363311394, + "learning_rate": 0.00019473684210526317, + "loss": 0.8819, + "step": 37 + }, + { + "epoch": 0.0304, + "grad_norm": 0.4752187441381419, + "learning_rate": 0.0002, + "loss": 0.8895, + "step": 38 + }, + { + "epoch": 0.0312, + "grad_norm": 0.48115741632287223, + "learning_rate": 0.00019999966405802826, + "loss": 0.8083, + "step": 39 + }, + { + "epoch": 0.032, + "grad_norm": 0.5849041040140422, + "learning_rate": 0.00019999865623437013, + "loss": 0.8825, + "step": 40 + }, + { + "epoch": 0.0328, + "grad_norm": 0.4725278601678821, + "learning_rate": 0.00019999697653579705, + "loss": 0.9055, + "step": 41 + }, + { + "epoch": 0.0336, + "grad_norm": 0.5523659132085241, + "learning_rate": 0.00019999462497359466, + "loss": 0.8939, + "step": 42 + }, + { + "epoch": 0.0344, + "grad_norm": 0.4638269541138185, + "learning_rate": 0.0001999916015635627, + "loss": 0.7956, + "step": 43 + }, + { + "epoch": 0.0352, + "grad_norm": 0.6416053734788665, + "learning_rate": 0.00019998790632601496, + "loss": 0.8854, + "step": 44 + }, + { + "epoch": 0.036, + "grad_norm": 0.46754119288552975, + "learning_rate": 0.00019998353928577919, + "loss": 0.8075, + "step": 45 + }, + { + "epoch": 0.0368, + "grad_norm": 0.49357320270679894, + "learning_rate": 0.0001999785004721968, + "loss": 0.8955, + "step": 46 + }, + { + "epoch": 0.0376, + "grad_norm": 0.5265191782061447, + "learning_rate": 0.0001999727899191228, + "loss": 0.873, + "step": 47 + }, + { + "epoch": 0.0384, + "grad_norm": 0.5440466788183455, + "learning_rate": 0.00019996640766492543, + "loss": 0.9327, + "step": 48 + }, + { + "epoch": 0.0392, + "grad_norm": 0.5517236902037315, + "learning_rate": 0.00019995935375248606, + "loss": 0.8772, + "step": 49 + }, + { + "epoch": 0.04, + "grad_norm": 0.43798313657061366, + "learning_rate": 0.00019995162822919883, + "loss": 0.8085, + "step": 50 + }, + { + "epoch": 0.0408, + "grad_norm": 0.4508153636398181, + "learning_rate": 0.00019994323114697022, + "loss": 0.8308, + "step": 51 + }, + { + "epoch": 0.0416, + "grad_norm": 0.5119638374510451, + "learning_rate": 0.00019993416256221895, + "loss": 0.8612, + "step": 52 + }, + { + "epoch": 0.0424, + "grad_norm": 0.5382792357634911, + "learning_rate": 0.0001999244225358753, + "loss": 0.8798, + "step": 53 + }, + { + "epoch": 0.0432, + "grad_norm": 0.6076236517255771, + "learning_rate": 0.00019991401113338104, + "loss": 0.9024, + "step": 54 + }, + { + "epoch": 0.044, + "grad_norm": 0.441952994803446, + "learning_rate": 0.00019990292842468868, + "loss": 0.8043, + "step": 55 + }, + { + "epoch": 0.0448, + "grad_norm": 0.6106597165054768, + "learning_rate": 0.00019989117448426108, + "loss": 0.9344, + "step": 56 + }, + { + "epoch": 0.0456, + "grad_norm": 0.4406300276624783, + "learning_rate": 0.0001998787493910712, + "loss": 0.7952, + "step": 57 + }, + { + "epoch": 0.0464, + "grad_norm": 0.564217760542486, + "learning_rate": 0.00019986565322860115, + "loss": 0.7998, + "step": 58 + }, + { + "epoch": 0.0472, + "grad_norm": 0.48469011133740136, + "learning_rate": 0.000199851886084842, + "loss": 0.8903, + "step": 59 + }, + { + "epoch": 0.048, + "grad_norm": 0.5288039702729751, + "learning_rate": 0.00019983744805229296, + "loss": 0.9089, + "step": 60 + }, + { + "epoch": 0.0488, + "grad_norm": 0.48006551682464205, + "learning_rate": 0.00019982233922796085, + "loss": 0.8082, + "step": 61 + }, + { + "epoch": 0.0496, + "grad_norm": 0.5505340709331455, + "learning_rate": 0.00019980655971335945, + "loss": 0.848, + "step": 62 + }, + { + "epoch": 0.0504, + "grad_norm": 0.5503316668230102, + "learning_rate": 0.00019979010961450878, + "loss": 0.9087, + "step": 63 + }, + { + "epoch": 0.0512, + "grad_norm": 0.4761815526904352, + "learning_rate": 0.00019977298904193437, + "loss": 0.8735, + "step": 64 + }, + { + "epoch": 0.052, + "grad_norm": 0.5228152494367051, + "learning_rate": 0.00019975519811066663, + "loss": 0.8651, + "step": 65 + }, + { + "epoch": 0.0528, + "grad_norm": 0.48727193746734115, + "learning_rate": 0.00019973673694024, + "loss": 0.8213, + "step": 66 + }, + { + "epoch": 0.0536, + "grad_norm": 0.4541746743520036, + "learning_rate": 0.0001997176056546921, + "loss": 0.7564, + "step": 67 + }, + { + "epoch": 0.0544, + "grad_norm": 0.606930240393257, + "learning_rate": 0.00019969780438256293, + "loss": 0.9255, + "step": 68 + }, + { + "epoch": 0.0552, + "grad_norm": 0.5068161831554241, + "learning_rate": 0.0001996773332568941, + "loss": 0.8169, + "step": 69 + }, + { + "epoch": 0.056, + "grad_norm": 0.424067157048311, + "learning_rate": 0.0001996561924152278, + "loss": 0.7367, + "step": 70 + }, + { + "epoch": 0.0568, + "grad_norm": 0.6252522883664977, + "learning_rate": 0.00019963438199960599, + "loss": 0.8798, + "step": 71 + }, + { + "epoch": 0.0576, + "grad_norm": 0.5223020930728051, + "learning_rate": 0.0001996119021565693, + "loss": 0.8766, + "step": 72 + }, + { + "epoch": 0.0584, + "grad_norm": 0.4941228814132986, + "learning_rate": 0.00019958875303715615, + "loss": 0.834, + "step": 73 + }, + { + "epoch": 0.0592, + "grad_norm": 0.4873684591333978, + "learning_rate": 0.0001995649347969019, + "loss": 0.8248, + "step": 74 + }, + { + "epoch": 0.06, + "grad_norm": 0.5963432196918743, + "learning_rate": 0.0001995404475958373, + "loss": 0.9005, + "step": 75 + }, + { + "epoch": 0.0608, + "grad_norm": 0.45748157914102094, + "learning_rate": 0.00019951529159848805, + "loss": 0.7965, + "step": 76 + }, + { + "epoch": 0.0616, + "grad_norm": 0.45401161358757675, + "learning_rate": 0.0001994894669738732, + "loss": 0.7329, + "step": 77 + }, + { + "epoch": 0.0624, + "grad_norm": 0.4519919461414791, + "learning_rate": 0.00019946297389550433, + "loss": 0.8374, + "step": 78 + }, + { + "epoch": 0.0632, + "grad_norm": 0.5153754388703433, + "learning_rate": 0.0001994358125413841, + "loss": 0.8594, + "step": 79 + }, + { + "epoch": 0.064, + "grad_norm": 0.4650985354020782, + "learning_rate": 0.00019940798309400526, + "loss": 0.8447, + "step": 80 + }, + { + "epoch": 0.0648, + "grad_norm": 0.5124898203889733, + "learning_rate": 0.0001993794857403495, + "loss": 0.8575, + "step": 81 + }, + { + "epoch": 0.0656, + "grad_norm": 0.3987512573205426, + "learning_rate": 0.0001993503206718859, + "loss": 0.7722, + "step": 82 + }, + { + "epoch": 0.0664, + "grad_norm": 0.5263385255738214, + "learning_rate": 0.0001993204880845699, + "loss": 0.8442, + "step": 83 + }, + { + "epoch": 0.0672, + "grad_norm": 0.42770353755918095, + "learning_rate": 0.00019928998817884182, + "loss": 0.7781, + "step": 84 + }, + { + "epoch": 0.068, + "grad_norm": 0.5136893553522667, + "learning_rate": 0.00019925882115962568, + "loss": 0.8361, + "step": 85 + }, + { + "epoch": 0.0688, + "grad_norm": 0.5308907844287711, + "learning_rate": 0.00019922698723632767, + "loss": 0.8802, + "step": 86 + }, + { + "epoch": 0.0696, + "grad_norm": 0.6081975357536167, + "learning_rate": 0.00019919448662283478, + "loss": 0.8561, + "step": 87 + }, + { + "epoch": 0.0704, + "grad_norm": 0.5131090276495259, + "learning_rate": 0.00019916131953751342, + "loss": 0.8325, + "step": 88 + }, + { + "epoch": 0.0712, + "grad_norm": 0.4584634943727301, + "learning_rate": 0.00019912748620320794, + "loss": 0.8573, + "step": 89 + }, + { + "epoch": 0.072, + "grad_norm": 0.4458868662131762, + "learning_rate": 0.00019909298684723904, + "loss": 0.7505, + "step": 90 + }, + { + "epoch": 0.0728, + "grad_norm": 0.4795068156261207, + "learning_rate": 0.00019905782170140238, + "loss": 0.7912, + "step": 91 + }, + { + "epoch": 0.0736, + "grad_norm": 0.454801285925989, + "learning_rate": 0.00019902199100196697, + "loss": 0.7622, + "step": 92 + }, + { + "epoch": 0.0744, + "grad_norm": 0.5133998231490974, + "learning_rate": 0.00019898549498967343, + "loss": 0.8776, + "step": 93 + }, + { + "epoch": 0.0752, + "grad_norm": 0.5023671300581364, + "learning_rate": 0.00019894833390973266, + "loss": 0.8506, + "step": 94 + }, + { + "epoch": 0.076, + "grad_norm": 0.4747696577123322, + "learning_rate": 0.000198910508011824, + "loss": 0.8295, + "step": 95 + }, + { + "epoch": 0.0768, + "grad_norm": 0.7440171252294648, + "learning_rate": 0.00019887201755009357, + "loss": 0.9612, + "step": 96 + }, + { + "epoch": 0.0776, + "grad_norm": 0.47532276432413834, + "learning_rate": 0.00019883286278315262, + "loss": 0.7502, + "step": 97 + }, + { + "epoch": 0.0784, + "grad_norm": 0.5933290446396249, + "learning_rate": 0.0001987930439740757, + "loss": 0.854, + "step": 98 + }, + { + "epoch": 0.0792, + "grad_norm": 0.45899362568312485, + "learning_rate": 0.00019875256139039902, + "loss": 0.8233, + "step": 99 + }, + { + "epoch": 0.08, + "grad_norm": 0.5100520778137256, + "learning_rate": 0.00019871141530411853, + "loss": 0.8262, + "step": 100 + }, + { + "epoch": 0.0808, + "grad_norm": 0.5061044066072292, + "learning_rate": 0.00019866960599168826, + "loss": 0.8266, + "step": 101 + }, + { + "epoch": 0.0816, + "grad_norm": 0.4644796197561921, + "learning_rate": 0.0001986271337340182, + "loss": 0.8673, + "step": 102 + }, + { + "epoch": 0.0824, + "grad_norm": 0.532889532692695, + "learning_rate": 0.0001985839988164726, + "loss": 0.8944, + "step": 103 + }, + { + "epoch": 0.0832, + "grad_norm": 0.5402398258930571, + "learning_rate": 0.00019854020152886814, + "loss": 0.8318, + "step": 104 + }, + { + "epoch": 0.084, + "grad_norm": 0.4315294793014923, + "learning_rate": 0.00019849574216547171, + "loss": 0.7183, + "step": 105 + }, + { + "epoch": 0.0848, + "grad_norm": 0.5059474728944265, + "learning_rate": 0.0001984506210249986, + "loss": 0.8116, + "step": 106 + }, + { + "epoch": 0.0856, + "grad_norm": 0.7754971525338998, + "learning_rate": 0.00019840483841061058, + "loss": 0.7798, + "step": 107 + }, + { + "epoch": 0.0864, + "grad_norm": 0.5170633989504952, + "learning_rate": 0.00019835839462991361, + "loss": 0.8156, + "step": 108 + }, + { + "epoch": 0.0872, + "grad_norm": 0.578973362036397, + "learning_rate": 0.00019831128999495606, + "loss": 0.9072, + "step": 109 + }, + { + "epoch": 0.088, + "grad_norm": 0.46987237459032416, + "learning_rate": 0.00019826352482222638, + "loss": 0.832, + "step": 110 + }, + { + "epoch": 0.0888, + "grad_norm": 0.5289940796495135, + "learning_rate": 0.0001982150994326511, + "loss": 0.8089, + "step": 111 + }, + { + "epoch": 0.0896, + "grad_norm": 0.5130783014268392, + "learning_rate": 0.00019816601415159263, + "loss": 0.7312, + "step": 112 + }, + { + "epoch": 0.0904, + "grad_norm": 0.5204585053542395, + "learning_rate": 0.0001981162693088471, + "loss": 0.8148, + "step": 113 + }, + { + "epoch": 0.0912, + "grad_norm": 0.49355582746057164, + "learning_rate": 0.0001980658652386421, + "loss": 0.8228, + "step": 114 + }, + { + "epoch": 0.092, + "grad_norm": 0.49244184486559767, + "learning_rate": 0.0001980148022796345, + "loss": 0.7831, + "step": 115 + }, + { + "epoch": 0.0928, + "grad_norm": 0.4856545194485971, + "learning_rate": 0.00019796308077490817, + "loss": 0.7825, + "step": 116 + }, + { + "epoch": 0.0936, + "grad_norm": 0.48638246715216615, + "learning_rate": 0.00019791070107197153, + "loss": 0.7993, + "step": 117 + }, + { + "epoch": 0.0944, + "grad_norm": 0.45328815654698607, + "learning_rate": 0.00019785766352275542, + "loss": 0.8438, + "step": 118 + }, + { + "epoch": 0.0952, + "grad_norm": 0.4699347966122583, + "learning_rate": 0.0001978039684836106, + "loss": 0.8007, + "step": 119 + }, + { + "epoch": 0.096, + "grad_norm": 0.954925514227352, + "learning_rate": 0.00019774961631530545, + "loss": 0.993, + "step": 120 + }, + { + "epoch": 0.0968, + "grad_norm": 0.4563360256347201, + "learning_rate": 0.0001976946073830234, + "loss": 0.7498, + "step": 121 + }, + { + "epoch": 0.0976, + "grad_norm": 0.4660893825770606, + "learning_rate": 0.00019763894205636072, + "loss": 0.8683, + "step": 122 + }, + { + "epoch": 0.0984, + "grad_norm": 0.5353770623306547, + "learning_rate": 0.00019758262070932375, + "loss": 0.912, + "step": 123 + }, + { + "epoch": 0.0992, + "grad_norm": 0.4549228446911368, + "learning_rate": 0.00019752564372032657, + "loss": 0.7854, + "step": 124 + }, + { + "epoch": 0.1, + "grad_norm": 0.503929509008853, + "learning_rate": 0.00019746801147218842, + "loss": 0.8639, + "step": 125 + }, + { + "epoch": 0.1008, + "grad_norm": 0.5357250808655094, + "learning_rate": 0.00019740972435213115, + "loss": 0.8876, + "step": 126 + }, + { + "epoch": 0.1016, + "grad_norm": 0.4660867449746972, + "learning_rate": 0.00019735078275177654, + "loss": 0.8103, + "step": 127 + }, + { + "epoch": 0.1024, + "grad_norm": 0.45966669204874755, + "learning_rate": 0.00019729118706714375, + "loss": 0.7822, + "step": 128 + }, + { + "epoch": 0.1032, + "grad_norm": 0.42467515154790736, + "learning_rate": 0.00019723093769864663, + "loss": 0.7763, + "step": 129 + }, + { + "epoch": 0.104, + "grad_norm": 0.5278863210130068, + "learning_rate": 0.00019717003505109095, + "loss": 0.8289, + "step": 130 + }, + { + "epoch": 0.1048, + "grad_norm": 0.6213394450284235, + "learning_rate": 0.0001971084795336719, + "loss": 0.7384, + "step": 131 + }, + { + "epoch": 0.1056, + "grad_norm": 0.5198214516650237, + "learning_rate": 0.00019704627155997108, + "loss": 0.7867, + "step": 132 + }, + { + "epoch": 0.1064, + "grad_norm": 0.4815304018541252, + "learning_rate": 0.00019698341154795389, + "loss": 0.7645, + "step": 133 + }, + { + "epoch": 0.1072, + "grad_norm": 0.5445023318738487, + "learning_rate": 0.00019691989991996663, + "loss": 0.9136, + "step": 134 + }, + { + "epoch": 0.108, + "grad_norm": 0.44610566324522344, + "learning_rate": 0.00019685573710273376, + "loss": 0.781, + "step": 135 + }, + { + "epoch": 0.1088, + "grad_norm": 0.6020596210092647, + "learning_rate": 0.0001967909235273549, + "loss": 0.9109, + "step": 136 + }, + { + "epoch": 0.1096, + "grad_norm": 0.4769291715566396, + "learning_rate": 0.00019672545962930215, + "loss": 0.8249, + "step": 137 + }, + { + "epoch": 0.1104, + "grad_norm": 0.5547657722340099, + "learning_rate": 0.00019665934584841682, + "loss": 0.7991, + "step": 138 + }, + { + "epoch": 0.1112, + "grad_norm": 0.5051002370148648, + "learning_rate": 0.00019659258262890683, + "loss": 0.8698, + "step": 139 + }, + { + "epoch": 0.112, + "grad_norm": 0.4916122523523175, + "learning_rate": 0.00019652517041934356, + "loss": 0.8153, + "step": 140 + }, + { + "epoch": 0.1128, + "grad_norm": 0.5166782007595769, + "learning_rate": 0.00019645710967265882, + "loss": 0.7959, + "step": 141 + }, + { + "epoch": 0.1136, + "grad_norm": 0.44309227763417436, + "learning_rate": 0.00019638840084614182, + "loss": 0.7681, + "step": 142 + }, + { + "epoch": 0.1144, + "grad_norm": 0.46787767001269476, + "learning_rate": 0.00019631904440143612, + "loss": 0.8047, + "step": 143 + }, + { + "epoch": 0.1152, + "grad_norm": 0.5048137999449103, + "learning_rate": 0.00019624904080453655, + "loss": 0.7741, + "step": 144 + }, + { + "epoch": 0.116, + "grad_norm": 0.4115538527551283, + "learning_rate": 0.00019617839052578603, + "loss": 0.6979, + "step": 145 + }, + { + "epoch": 0.1168, + "grad_norm": 0.47580725304964017, + "learning_rate": 0.00019610709403987246, + "loss": 0.8439, + "step": 146 + }, + { + "epoch": 0.1176, + "grad_norm": 0.4284484275975892, + "learning_rate": 0.0001960351518258255, + "loss": 0.7631, + "step": 147 + }, + { + "epoch": 0.1184, + "grad_norm": 0.4575503969957535, + "learning_rate": 0.00019596256436701324, + "loss": 0.7662, + "step": 148 + }, + { + "epoch": 0.1192, + "grad_norm": 0.4787216610881153, + "learning_rate": 0.00019588933215113926, + "loss": 0.8322, + "step": 149 + }, + { + "epoch": 0.12, + "grad_norm": 0.472925175131021, + "learning_rate": 0.000195815455670239, + "loss": 0.7471, + "step": 150 + }, + { + "epoch": 0.1208, + "grad_norm": 0.45698351899719436, + "learning_rate": 0.00019574093542067673, + "loss": 0.7955, + "step": 151 + }, + { + "epoch": 0.1216, + "grad_norm": 0.5035216462112196, + "learning_rate": 0.00019566577190314197, + "loss": 0.7837, + "step": 152 + }, + { + "epoch": 0.1224, + "grad_norm": 0.47296199423739266, + "learning_rate": 0.0001955899656226464, + "loss": 0.833, + "step": 153 + }, + { + "epoch": 0.1232, + "grad_norm": 0.4647638019652673, + "learning_rate": 0.0001955135170885202, + "loss": 0.7565, + "step": 154 + }, + { + "epoch": 0.124, + "grad_norm": 0.4831036228689567, + "learning_rate": 0.0001954364268144088, + "loss": 0.7574, + "step": 155 + }, + { + "epoch": 0.1248, + "grad_norm": 0.44156563283458045, + "learning_rate": 0.00019535869531826937, + "loss": 0.7636, + "step": 156 + }, + { + "epoch": 0.1256, + "grad_norm": 0.44245203192588406, + "learning_rate": 0.00019528032312236736, + "loss": 0.7123, + "step": 157 + }, + { + "epoch": 0.1264, + "grad_norm": 0.582444811054162, + "learning_rate": 0.00019520131075327298, + "loss": 0.7096, + "step": 158 + }, + { + "epoch": 0.1272, + "grad_norm": 0.39873187756149414, + "learning_rate": 0.00019512165874185767, + "loss": 0.7913, + "step": 159 + }, + { + "epoch": 0.128, + "grad_norm": 0.41708637739457183, + "learning_rate": 0.00019504136762329047, + "loss": 0.7844, + "step": 160 + }, + { + "epoch": 0.1288, + "grad_norm": 0.4446437065954701, + "learning_rate": 0.0001949604379370345, + "loss": 0.7571, + "step": 161 + }, + { + "epoch": 0.1296, + "grad_norm": 0.4619420767214924, + "learning_rate": 0.00019487887022684336, + "loss": 0.8421, + "step": 162 + }, + { + "epoch": 0.1304, + "grad_norm": 0.4581600049161185, + "learning_rate": 0.00019479666504075736, + "loss": 0.8037, + "step": 163 + }, + { + "epoch": 0.1312, + "grad_norm": 0.4795136121564061, + "learning_rate": 0.00019471382293110003, + "loss": 0.8403, + "step": 164 + }, + { + "epoch": 0.132, + "grad_norm": 0.4428534252300689, + "learning_rate": 0.0001946303444544741, + "loss": 0.7776, + "step": 165 + }, + { + "epoch": 0.1328, + "grad_norm": 0.5381828922235592, + "learning_rate": 0.00019454623017175812, + "loss": 0.8372, + "step": 166 + }, + { + "epoch": 0.1336, + "grad_norm": 0.44833639039360296, + "learning_rate": 0.00019446148064810242, + "loss": 0.8648, + "step": 167 + }, + { + "epoch": 0.1344, + "grad_norm": 0.4282405270094912, + "learning_rate": 0.00019437609645292546, + "loss": 0.7795, + "step": 168 + }, + { + "epoch": 0.1352, + "grad_norm": 0.4774713545866125, + "learning_rate": 0.00019429007815990993, + "loss": 0.8042, + "step": 169 + }, + { + "epoch": 0.136, + "grad_norm": 0.410935491785557, + "learning_rate": 0.0001942034263469989, + "loss": 0.7735, + "step": 170 + }, + { + "epoch": 0.1368, + "grad_norm": 0.4759143841412201, + "learning_rate": 0.00019411614159639204, + "loss": 0.774, + "step": 171 + }, + { + "epoch": 0.1376, + "grad_norm": 0.5206608221789917, + "learning_rate": 0.00019402822449454153, + "loss": 0.7397, + "step": 172 + }, + { + "epoch": 0.1384, + "grad_norm": 0.5960796914978171, + "learning_rate": 0.00019393967563214833, + "loss": 0.7975, + "step": 173 + }, + { + "epoch": 0.1392, + "grad_norm": 0.4871539656184539, + "learning_rate": 0.00019385049560415794, + "loss": 0.7976, + "step": 174 + }, + { + "epoch": 0.14, + "grad_norm": 0.44498736464472977, + "learning_rate": 0.00019376068500975667, + "loss": 0.7699, + "step": 175 + }, + { + "epoch": 0.1408, + "grad_norm": 0.4150771843721881, + "learning_rate": 0.00019367024445236754, + "loss": 0.78, + "step": 176 + }, + { + "epoch": 0.1416, + "grad_norm": 0.44553083248616643, + "learning_rate": 0.000193579174539646, + "loss": 0.7706, + "step": 177 + }, + { + "epoch": 0.1424, + "grad_norm": 0.4224036590709276, + "learning_rate": 0.00019348747588347637, + "loss": 0.7666, + "step": 178 + }, + { + "epoch": 0.1432, + "grad_norm": 0.42371940924346846, + "learning_rate": 0.00019339514909996706, + "loss": 0.7901, + "step": 179 + }, + { + "epoch": 0.144, + "grad_norm": 0.591708075847747, + "learning_rate": 0.00019330219480944694, + "loss": 0.8886, + "step": 180 + }, + { + "epoch": 0.1448, + "grad_norm": 0.44833292498327687, + "learning_rate": 0.00019320861363646095, + "loss": 0.7761, + "step": 181 + }, + { + "epoch": 0.1456, + "grad_norm": 0.47086220959100905, + "learning_rate": 0.00019311440620976597, + "loss": 0.8488, + "step": 182 + }, + { + "epoch": 0.1464, + "grad_norm": 0.4262782763041816, + "learning_rate": 0.00019301957316232658, + "loss": 0.776, + "step": 183 + }, + { + "epoch": 0.1472, + "grad_norm": 0.5214668899140502, + "learning_rate": 0.0001929241151313108, + "loss": 0.8112, + "step": 184 + }, + { + "epoch": 0.148, + "grad_norm": 0.5001587396984958, + "learning_rate": 0.0001928280327580858, + "loss": 0.8792, + "step": 185 + }, + { + "epoch": 0.1488, + "grad_norm": 0.4979113579842723, + "learning_rate": 0.00019273132668821364, + "loss": 0.8363, + "step": 186 + }, + { + "epoch": 0.1496, + "grad_norm": 0.4497772642309642, + "learning_rate": 0.00019263399757144683, + "loss": 0.788, + "step": 187 + }, + { + "epoch": 0.1504, + "grad_norm": 0.45321282554780185, + "learning_rate": 0.00019253604606172417, + "loss": 0.8022, + "step": 188 + }, + { + "epoch": 0.1512, + "grad_norm": 0.4231460933527123, + "learning_rate": 0.000192437472817166, + "loss": 0.7393, + "step": 189 + }, + { + "epoch": 0.152, + "grad_norm": 0.4640456987297124, + "learning_rate": 0.00019233827850007027, + "loss": 0.8033, + "step": 190 + }, + { + "epoch": 0.1528, + "grad_norm": 0.5859044824831487, + "learning_rate": 0.00019223846377690754, + "loss": 0.8414, + "step": 191 + }, + { + "epoch": 0.1536, + "grad_norm": 0.43572744793730855, + "learning_rate": 0.00019213802931831696, + "loss": 0.7739, + "step": 192 + }, + { + "epoch": 0.1544, + "grad_norm": 0.5226529387531288, + "learning_rate": 0.00019203697579910154, + "loss": 0.7577, + "step": 193 + }, + { + "epoch": 0.1552, + "grad_norm": 0.4497304299400067, + "learning_rate": 0.00019193530389822363, + "loss": 0.7549, + "step": 194 + }, + { + "epoch": 0.156, + "grad_norm": 0.48876913351196766, + "learning_rate": 0.00019183301429880043, + "loss": 0.7676, + "step": 195 + }, + { + "epoch": 0.1568, + "grad_norm": 0.45111221462046336, + "learning_rate": 0.00019173010768809933, + "loss": 0.7981, + "step": 196 + }, + { + "epoch": 0.1576, + "grad_norm": 0.4585065393312021, + "learning_rate": 0.00019162658475753327, + "loss": 0.792, + "step": 197 + }, + { + "epoch": 0.1584, + "grad_norm": 0.4674904268057127, + "learning_rate": 0.0001915224462026563, + "loss": 0.8676, + "step": 198 + }, + { + "epoch": 0.1592, + "grad_norm": 0.47586101388093477, + "learning_rate": 0.00019141769272315858, + "loss": 0.7624, + "step": 199 + }, + { + "epoch": 0.16, + "grad_norm": 0.47274832683360907, + "learning_rate": 0.00019131232502286188, + "loss": 0.8281, + "step": 200 + }, + { + "epoch": 0.1608, + "grad_norm": 0.4276018360251776, + "learning_rate": 0.00019120634380971496, + "loss": 0.711, + "step": 201 + }, + { + "epoch": 0.1616, + "grad_norm": 0.4590346746005837, + "learning_rate": 0.0001910997497957885, + "loss": 0.8043, + "step": 202 + }, + { + "epoch": 0.1624, + "grad_norm": 0.4816141964583813, + "learning_rate": 0.0001909925436972706, + "loss": 0.734, + "step": 203 + }, + { + "epoch": 0.1632, + "grad_norm": 0.5692250295310609, + "learning_rate": 0.00019088472623446183, + "loss": 0.8311, + "step": 204 + }, + { + "epoch": 0.164, + "grad_norm": 0.5168964734560001, + "learning_rate": 0.00019077629813177036, + "loss": 0.8003, + "step": 205 + }, + { + "epoch": 0.1648, + "grad_norm": 0.4434269203102765, + "learning_rate": 0.00019066726011770726, + "loss": 0.7309, + "step": 206 + }, + { + "epoch": 0.1656, + "grad_norm": 0.556711306277704, + "learning_rate": 0.00019055761292488142, + "loss": 0.6997, + "step": 207 + }, + { + "epoch": 0.1664, + "grad_norm": 0.4805482120156597, + "learning_rate": 0.0001904473572899947, + "loss": 0.7914, + "step": 208 + }, + { + "epoch": 0.1672, + "grad_norm": 0.43647431344551174, + "learning_rate": 0.00019033649395383702, + "loss": 0.7815, + "step": 209 + }, + { + "epoch": 0.168, + "grad_norm": 0.5744012343987621, + "learning_rate": 0.00019022502366128135, + "loss": 0.8203, + "step": 210 + }, + { + "epoch": 0.1688, + "grad_norm": 0.5684780483215756, + "learning_rate": 0.00019011294716127867, + "loss": 0.8015, + "step": 211 + }, + { + "epoch": 0.1696, + "grad_norm": 0.4303508524158662, + "learning_rate": 0.00019000026520685302, + "loss": 0.7652, + "step": 212 + }, + { + "epoch": 0.1704, + "grad_norm": 0.4896619399112934, + "learning_rate": 0.0001898869785550963, + "loss": 0.8379, + "step": 213 + }, + { + "epoch": 0.1712, + "grad_norm": 0.5320091111275196, + "learning_rate": 0.0001897730879671634, + "loss": 0.8287, + "step": 214 + }, + { + "epoch": 0.172, + "grad_norm": 0.45903465195702364, + "learning_rate": 0.00018965859420826684, + "loss": 0.74, + "step": 215 + }, + { + "epoch": 0.1728, + "grad_norm": 0.5694127309219911, + "learning_rate": 0.00018954349804767184, + "loss": 0.7837, + "step": 216 + }, + { + "epoch": 0.1736, + "grad_norm": 0.4689176702326866, + "learning_rate": 0.00018942780025869098, + "loss": 0.807, + "step": 217 + }, + { + "epoch": 0.1744, + "grad_norm": 0.4697709903272958, + "learning_rate": 0.00018931150161867916, + "loss": 0.7965, + "step": 218 + }, + { + "epoch": 0.1752, + "grad_norm": 0.4370312389959886, + "learning_rate": 0.00018919460290902826, + "loss": 0.7761, + "step": 219 + }, + { + "epoch": 0.176, + "grad_norm": 0.4765623072933397, + "learning_rate": 0.00018907710491516199, + "loss": 0.6862, + "step": 220 + }, + { + "epoch": 0.1768, + "grad_norm": 0.6342609583307439, + "learning_rate": 0.0001889590084265304, + "loss": 0.8148, + "step": 221 + }, + { + "epoch": 0.1776, + "grad_norm": 0.4799134250255987, + "learning_rate": 0.0001888403142366049, + "loss": 0.7785, + "step": 222 + }, + { + "epoch": 0.1784, + "grad_norm": 0.48212206697204846, + "learning_rate": 0.0001887210231428727, + "loss": 0.7898, + "step": 223 + }, + { + "epoch": 0.1792, + "grad_norm": 0.4655827018767983, + "learning_rate": 0.00018860113594683148, + "loss": 0.8062, + "step": 224 + }, + { + "epoch": 0.18, + "grad_norm": 0.5265800855384186, + "learning_rate": 0.0001884806534539841, + "loss": 0.8375, + "step": 225 + }, + { + "epoch": 0.1808, + "grad_norm": 0.5774244433936847, + "learning_rate": 0.00018835957647383303, + "loss": 0.8595, + "step": 226 + }, + { + "epoch": 0.1816, + "grad_norm": 0.522574221020107, + "learning_rate": 0.0001882379058198751, + "loss": 0.7785, + "step": 227 + }, + { + "epoch": 0.1824, + "grad_norm": 0.4658974495563153, + "learning_rate": 0.00018811564230959588, + "loss": 0.7776, + "step": 228 + }, + { + "epoch": 0.1832, + "grad_norm": 0.48850391555818595, + "learning_rate": 0.00018799278676446423, + "loss": 0.7231, + "step": 229 + }, + { + "epoch": 0.184, + "grad_norm": 0.4900387100247314, + "learning_rate": 0.00018786934000992688, + "loss": 0.7539, + "step": 230 + }, + { + "epoch": 0.1848, + "grad_norm": 0.4470268992096281, + "learning_rate": 0.00018774530287540278, + "loss": 0.8205, + "step": 231 + }, + { + "epoch": 0.1856, + "grad_norm": 0.5372584408183166, + "learning_rate": 0.00018762067619427746, + "loss": 0.8523, + "step": 232 + }, + { + "epoch": 0.1864, + "grad_norm": 0.5786736632864303, + "learning_rate": 0.00018749546080389757, + "loss": 0.8516, + "step": 233 + }, + { + "epoch": 0.1872, + "grad_norm": 0.4150327411851708, + "learning_rate": 0.00018736965754556528, + "loss": 0.767, + "step": 234 + }, + { + "epoch": 0.188, + "grad_norm": 0.4744247607116235, + "learning_rate": 0.00018724326726453244, + "loss": 0.8131, + "step": 235 + }, + { + "epoch": 0.1888, + "grad_norm": 0.5212775801914824, + "learning_rate": 0.00018711629080999504, + "loss": 0.7665, + "step": 236 + }, + { + "epoch": 0.1896, + "grad_norm": 0.5315130773715251, + "learning_rate": 0.00018698872903508755, + "loss": 0.8862, + "step": 237 + }, + { + "epoch": 0.1904, + "grad_norm": 0.5811528871742159, + "learning_rate": 0.00018686058279687698, + "loss": 0.8707, + "step": 238 + }, + { + "epoch": 0.1912, + "grad_norm": 0.44617622461432815, + "learning_rate": 0.0001867318529563574, + "loss": 0.7049, + "step": 239 + }, + { + "epoch": 0.192, + "grad_norm": 0.5110629204553769, + "learning_rate": 0.00018660254037844388, + "loss": 0.7552, + "step": 240 + }, + { + "epoch": 0.1928, + "grad_norm": 0.45105577641636846, + "learning_rate": 0.00018647264593196688, + "loss": 0.7799, + "step": 241 + }, + { + "epoch": 0.1936, + "grad_norm": 0.4988548737364565, + "learning_rate": 0.00018634217048966637, + "loss": 0.8198, + "step": 242 + }, + { + "epoch": 0.1944, + "grad_norm": 0.44820254586860836, + "learning_rate": 0.00018621111492818585, + "loss": 0.7746, + "step": 243 + }, + { + "epoch": 0.1952, + "grad_norm": 0.5064379659275585, + "learning_rate": 0.0001860794801280666, + "loss": 0.8171, + "step": 244 + }, + { + "epoch": 0.196, + "grad_norm": 0.6307811170822041, + "learning_rate": 0.00018594726697374175, + "loss": 0.9037, + "step": 245 + }, + { + "epoch": 0.1968, + "grad_norm": 0.5709592111841547, + "learning_rate": 0.0001858144763535302, + "loss": 0.8456, + "step": 246 + }, + { + "epoch": 0.1976, + "grad_norm": 0.4827907950086345, + "learning_rate": 0.0001856811091596308, + "loss": 0.7598, + "step": 247 + }, + { + "epoch": 0.1984, + "grad_norm": 0.48715082409785165, + "learning_rate": 0.0001855471662881164, + "loss": 0.8504, + "step": 248 + }, + { + "epoch": 0.1992, + "grad_norm": 0.5022225482599009, + "learning_rate": 0.00018541264863892754, + "loss": 0.8216, + "step": 249 + }, + { + "epoch": 0.2, + "grad_norm": 0.4461203089196808, + "learning_rate": 0.00018527755711586678, + "loss": 0.8183, + "step": 250 + }, + { + "epoch": 0.2008, + "grad_norm": 0.48705056288283083, + "learning_rate": 0.00018514189262659235, + "loss": 0.8283, + "step": 251 + }, + { + "epoch": 0.2016, + "grad_norm": 0.49774987618821515, + "learning_rate": 0.00018500565608261214, + "loss": 0.8191, + "step": 252 + }, + { + "epoch": 0.2024, + "grad_norm": 0.5339328705007534, + "learning_rate": 0.00018486884839927768, + "loss": 0.8081, + "step": 253 + }, + { + "epoch": 0.2032, + "grad_norm": 0.4791336870020177, + "learning_rate": 0.00018473147049577774, + "loss": 0.7499, + "step": 254 + }, + { + "epoch": 0.204, + "grad_norm": 0.45800363192527943, + "learning_rate": 0.0001845935232951325, + "loss": 0.7997, + "step": 255 + }, + { + "epoch": 0.2048, + "grad_norm": 0.46519629604160584, + "learning_rate": 0.00018445500772418697, + "loss": 0.8501, + "step": 256 + }, + { + "epoch": 0.2056, + "grad_norm": 0.4715616737015773, + "learning_rate": 0.00018431592471360503, + "loss": 0.7897, + "step": 257 + }, + { + "epoch": 0.2064, + "grad_norm": 0.4245101255916471, + "learning_rate": 0.00018417627519786315, + "loss": 0.7595, + "step": 258 + }, + { + "epoch": 0.2072, + "grad_norm": 0.5362847783633511, + "learning_rate": 0.000184036060115244, + "loss": 0.846, + "step": 259 + }, + { + "epoch": 0.208, + "grad_norm": 0.4482413130364821, + "learning_rate": 0.00018389528040783012, + "loss": 0.7155, + "step": 260 + }, + { + "epoch": 0.2088, + "grad_norm": 0.45952161523507334, + "learning_rate": 0.00018375393702149787, + "loss": 0.77, + "step": 261 + }, + { + "epoch": 0.2096, + "grad_norm": 0.5546853267775046, + "learning_rate": 0.00018361203090591071, + "loss": 0.8748, + "step": 262 + }, + { + "epoch": 0.2104, + "grad_norm": 0.5427836514907146, + "learning_rate": 0.00018346956301451304, + "loss": 0.8554, + "step": 263 + }, + { + "epoch": 0.2112, + "grad_norm": 0.5170096339247156, + "learning_rate": 0.00018332653430452376, + "loss": 0.8374, + "step": 264 + }, + { + "epoch": 0.212, + "grad_norm": 0.5048070790052696, + "learning_rate": 0.00018318294573692985, + "loss": 0.7852, + "step": 265 + }, + { + "epoch": 0.2128, + "grad_norm": 0.42519373761514895, + "learning_rate": 0.00018303879827647975, + "loss": 0.7553, + "step": 266 + }, + { + "epoch": 0.2136, + "grad_norm": 0.45921813676682904, + "learning_rate": 0.0001828940928916772, + "loss": 0.7488, + "step": 267 + }, + { + "epoch": 0.2144, + "grad_norm": 0.4383175333338921, + "learning_rate": 0.00018274883055477436, + "loss": 0.7668, + "step": 268 + }, + { + "epoch": 0.2152, + "grad_norm": 0.5232721465390389, + "learning_rate": 0.00018260301224176558, + "loss": 0.822, + "step": 269 + }, + { + "epoch": 0.216, + "grad_norm": 0.4569931890966662, + "learning_rate": 0.00018245663893238075, + "loss": 0.7834, + "step": 270 + }, + { + "epoch": 0.2168, + "grad_norm": 0.40542299303164925, + "learning_rate": 0.00018230971161007853, + "loss": 0.8149, + "step": 271 + }, + { + "epoch": 0.2176, + "grad_norm": 0.44084215043019426, + "learning_rate": 0.00018216223126204007, + "loss": 0.785, + "step": 272 + }, + { + "epoch": 0.2184, + "grad_norm": 0.38904576075065306, + "learning_rate": 0.00018201419887916214, + "loss": 0.7591, + "step": 273 + }, + { + "epoch": 0.2192, + "grad_norm": 0.6117423420347984, + "learning_rate": 0.00018186561545605054, + "loss": 0.8003, + "step": 274 + }, + { + "epoch": 0.22, + "grad_norm": 0.6406690655995851, + "learning_rate": 0.00018171648199101346, + "loss": 0.8713, + "step": 275 + }, + { + "epoch": 0.2208, + "grad_norm": 0.49705391186855263, + "learning_rate": 0.00018156679948605467, + "loss": 0.7654, + "step": 276 + }, + { + "epoch": 0.2216, + "grad_norm": 0.4477634962335582, + "learning_rate": 0.00018141656894686689, + "loss": 0.7992, + "step": 277 + }, + { + "epoch": 0.2224, + "grad_norm": 0.41943609720436703, + "learning_rate": 0.00018126579138282503, + "loss": 0.7339, + "step": 278 + }, + { + "epoch": 0.2232, + "grad_norm": 0.5026472739157812, + "learning_rate": 0.00018111446780697929, + "loss": 0.835, + "step": 279 + }, + { + "epoch": 0.224, + "grad_norm": 0.39988503480991233, + "learning_rate": 0.0001809625992360485, + "loss": 0.7375, + "step": 280 + }, + { + "epoch": 0.2248, + "grad_norm": 0.5975134595609247, + "learning_rate": 0.00018081018669041324, + "loss": 0.9192, + "step": 281 + }, + { + "epoch": 0.2256, + "grad_norm": 0.49964857187934897, + "learning_rate": 0.00018065723119410884, + "loss": 0.7672, + "step": 282 + }, + { + "epoch": 0.2264, + "grad_norm": 0.507087097547393, + "learning_rate": 0.00018050373377481878, + "loss": 0.8104, + "step": 283 + }, + { + "epoch": 0.2272, + "grad_norm": 0.49188156989526316, + "learning_rate": 0.00018034969546386757, + "loss": 0.7378, + "step": 284 + }, + { + "epoch": 0.228, + "grad_norm": 0.5668204474599812, + "learning_rate": 0.0001801951172962139, + "loss": 0.8318, + "step": 285 + }, + { + "epoch": 0.2288, + "grad_norm": 0.49264377554081845, + "learning_rate": 0.0001800400003104436, + "loss": 0.7835, + "step": 286 + }, + { + "epoch": 0.2296, + "grad_norm": 0.4073619173169081, + "learning_rate": 0.0001798843455487629, + "loss": 0.7141, + "step": 287 + }, + { + "epoch": 0.2304, + "grad_norm": 0.49355344807027934, + "learning_rate": 0.00017972815405699103, + "loss": 0.7787, + "step": 288 + }, + { + "epoch": 0.2312, + "grad_norm": 0.4663434613832926, + "learning_rate": 0.00017957142688455362, + "loss": 0.8168, + "step": 289 + }, + { + "epoch": 0.232, + "grad_norm": 0.5299134075969804, + "learning_rate": 0.00017941416508447536, + "loss": 0.7807, + "step": 290 + }, + { + "epoch": 0.2328, + "grad_norm": 0.5537745502297323, + "learning_rate": 0.00017925636971337304, + "loss": 0.7876, + "step": 291 + }, + { + "epoch": 0.2336, + "grad_norm": 0.5671415781776914, + "learning_rate": 0.0001790980418314484, + "loss": 0.8057, + "step": 292 + }, + { + "epoch": 0.2344, + "grad_norm": 0.47147141641940005, + "learning_rate": 0.00017893918250248104, + "loss": 0.7887, + "step": 293 + }, + { + "epoch": 0.2352, + "grad_norm": 0.5135239239131087, + "learning_rate": 0.00017877979279382135, + "loss": 0.8399, + "step": 294 + }, + { + "epoch": 0.236, + "grad_norm": 0.4567751279486991, + "learning_rate": 0.00017861987377638312, + "loss": 0.816, + "step": 295 + }, + { + "epoch": 0.2368, + "grad_norm": 0.530766676935918, + "learning_rate": 0.0001784594265246366, + "loss": 0.8247, + "step": 296 + }, + { + "epoch": 0.2376, + "grad_norm": 0.4392004139140252, + "learning_rate": 0.0001782984521166011, + "loss": 0.7879, + "step": 297 + }, + { + "epoch": 0.2384, + "grad_norm": 0.4613367032132493, + "learning_rate": 0.0001781369516338378, + "loss": 0.7787, + "step": 298 + }, + { + "epoch": 0.2392, + "grad_norm": 0.474045585760139, + "learning_rate": 0.00017797492616144256, + "loss": 0.7962, + "step": 299 + }, + { + "epoch": 0.24, + "grad_norm": 0.5066219495984166, + "learning_rate": 0.00017781237678803847, + "loss": 0.7755, + "step": 300 + }, + { + "epoch": 0.2408, + "grad_norm": 0.48675505607185726, + "learning_rate": 0.00017764930460576866, + "loss": 0.7671, + "step": 301 + }, + { + "epoch": 0.2416, + "grad_norm": 0.47948629724067315, + "learning_rate": 0.000177485710710289, + "loss": 0.7716, + "step": 302 + }, + { + "epoch": 0.2424, + "grad_norm": 0.5599768756820828, + "learning_rate": 0.00017732159620076053, + "loss": 0.7916, + "step": 303 + }, + { + "epoch": 0.2432, + "grad_norm": 0.49403730031729814, + "learning_rate": 0.00017715696217984235, + "loss": 0.7698, + "step": 304 + }, + { + "epoch": 0.244, + "grad_norm": 0.5301548619115438, + "learning_rate": 0.00017699180975368396, + "loss": 0.7496, + "step": 305 + }, + { + "epoch": 0.2448, + "grad_norm": 0.45832223663782296, + "learning_rate": 0.00017682614003191807, + "loss": 0.8301, + "step": 306 + }, + { + "epoch": 0.2456, + "grad_norm": 0.47129035121504553, + "learning_rate": 0.00017665995412765285, + "loss": 0.7459, + "step": 307 + }, + { + "epoch": 0.2464, + "grad_norm": 0.3886867598075655, + "learning_rate": 0.00017649325315746478, + "loss": 0.7442, + "step": 308 + }, + { + "epoch": 0.2472, + "grad_norm": 0.48484427387495405, + "learning_rate": 0.00017632603824139085, + "loss": 0.816, + "step": 309 + }, + { + "epoch": 0.248, + "grad_norm": 0.47656839689614183, + "learning_rate": 0.0001761583105029213, + "loss": 0.7921, + "step": 310 + }, + { + "epoch": 0.2488, + "grad_norm": 0.48291875194295386, + "learning_rate": 0.0001759900710689918, + "loss": 0.7739, + "step": 311 + }, + { + "epoch": 0.2496, + "grad_norm": 0.43066624272424553, + "learning_rate": 0.00017582132106997616, + "loss": 0.8113, + "step": 312 + }, + { + "epoch": 0.2504, + "grad_norm": 0.4921583706771416, + "learning_rate": 0.00017565206163967846, + "loss": 0.7332, + "step": 313 + }, + { + "epoch": 0.2512, + "grad_norm": 0.4635208516179352, + "learning_rate": 0.00017548229391532572, + "loss": 0.8202, + "step": 314 + }, + { + "epoch": 0.252, + "grad_norm": 0.42605236208308095, + "learning_rate": 0.00017531201903755994, + "loss": 0.7638, + "step": 315 + }, + { + "epoch": 0.2528, + "grad_norm": 0.5928432540931313, + "learning_rate": 0.00017514123815043074, + "loss": 0.8448, + "step": 316 + }, + { + "epoch": 0.2536, + "grad_norm": 0.42223495580594994, + "learning_rate": 0.00017496995240138744, + "loss": 0.8594, + "step": 317 + }, + { + "epoch": 0.2544, + "grad_norm": 0.36180009658690626, + "learning_rate": 0.00017479816294127152, + "loss": 0.718, + "step": 318 + }, + { + "epoch": 0.2552, + "grad_norm": 0.5055409710562159, + "learning_rate": 0.00017462587092430875, + "loss": 0.8116, + "step": 319 + }, + { + "epoch": 0.256, + "grad_norm": 0.5258821591435593, + "learning_rate": 0.0001744530775081015, + "loss": 0.8743, + "step": 320 + }, + { + "epoch": 0.2568, + "grad_norm": 0.42039164828489956, + "learning_rate": 0.00017427978385362112, + "loss": 0.7835, + "step": 321 + }, + { + "epoch": 0.2576, + "grad_norm": 0.5398918499239291, + "learning_rate": 0.0001741059911251997, + "loss": 0.8457, + "step": 322 + }, + { + "epoch": 0.2584, + "grad_norm": 0.517493056428269, + "learning_rate": 0.0001739317004905227, + "loss": 0.801, + "step": 323 + }, + { + "epoch": 0.2592, + "grad_norm": 0.4592443902096945, + "learning_rate": 0.000173756913120621, + "loss": 0.7163, + "step": 324 + }, + { + "epoch": 0.26, + "grad_norm": 0.4570421656040498, + "learning_rate": 0.00017358163018986282, + "loss": 0.7891, + "step": 325 + }, + { + "epoch": 0.2608, + "grad_norm": 0.4653670519039479, + "learning_rate": 0.00017340585287594604, + "loss": 0.7404, + "step": 326 + }, + { + "epoch": 0.2616, + "grad_norm": 0.4700823361619639, + "learning_rate": 0.00017322958235989016, + "loss": 0.7739, + "step": 327 + }, + { + "epoch": 0.2624, + "grad_norm": 0.46741465910878005, + "learning_rate": 0.0001730528198260285, + "loss": 0.7353, + "step": 328 + }, + { + "epoch": 0.2632, + "grad_norm": 0.5102078699593384, + "learning_rate": 0.00017287556646200018, + "loss": 0.843, + "step": 329 + }, + { + "epoch": 0.264, + "grad_norm": 0.5575580445388119, + "learning_rate": 0.00017269782345874203, + "loss": 0.8205, + "step": 330 + }, + { + "epoch": 0.2648, + "grad_norm": 0.5053123994869361, + "learning_rate": 0.00017251959201048083, + "loss": 0.7919, + "step": 331 + }, + { + "epoch": 0.2656, + "grad_norm": 0.47986139425074736, + "learning_rate": 0.00017234087331472497, + "loss": 0.8036, + "step": 332 + }, + { + "epoch": 0.2664, + "grad_norm": 0.4106720871631404, + "learning_rate": 0.00017216166857225674, + "loss": 0.7301, + "step": 333 + }, + { + "epoch": 0.2672, + "grad_norm": 0.45195261587163915, + "learning_rate": 0.00017198197898712404, + "loss": 0.7721, + "step": 334 + }, + { + "epoch": 0.268, + "grad_norm": 0.4513538733673665, + "learning_rate": 0.00017180180576663228, + "loss": 0.8287, + "step": 335 + }, + { + "epoch": 0.2688, + "grad_norm": 0.4916534731703709, + "learning_rate": 0.00017162115012133643, + "loss": 0.7991, + "step": 336 + }, + { + "epoch": 0.2696, + "grad_norm": 0.3944159257016331, + "learning_rate": 0.00017144001326503273, + "loss": 0.6861, + "step": 337 + }, + { + "epoch": 0.2704, + "grad_norm": 1.0015835190531766, + "learning_rate": 0.00017125839641475072, + "loss": 0.8172, + "step": 338 + }, + { + "epoch": 0.2712, + "grad_norm": 0.40573909671129543, + "learning_rate": 0.00017107630079074478, + "loss": 0.7758, + "step": 339 + }, + { + "epoch": 0.272, + "grad_norm": 0.4754532932792592, + "learning_rate": 0.00017089372761648616, + "loss": 0.8139, + "step": 340 + }, + { + "epoch": 0.2728, + "grad_norm": 0.4323421835044929, + "learning_rate": 0.00017071067811865476, + "loss": 0.783, + "step": 341 + }, + { + "epoch": 0.2736, + "grad_norm": 0.4247367763012067, + "learning_rate": 0.00017052715352713075, + "loss": 0.7058, + "step": 342 + }, + { + "epoch": 0.2744, + "grad_norm": 0.4444386432236298, + "learning_rate": 0.00017034315507498635, + "loss": 0.7733, + "step": 343 + }, + { + "epoch": 0.2752, + "grad_norm": 0.48628229506613657, + "learning_rate": 0.00017015868399847768, + "loss": 0.8221, + "step": 344 + }, + { + "epoch": 0.276, + "grad_norm": 0.49253670986195996, + "learning_rate": 0.00016997374153703625, + "loss": 0.8053, + "step": 345 + }, + { + "epoch": 0.2768, + "grad_norm": 0.4339843395502114, + "learning_rate": 0.00016978832893326074, + "loss": 0.7737, + "step": 346 + }, + { + "epoch": 0.2776, + "grad_norm": 0.4833521631305639, + "learning_rate": 0.00016960244743290868, + "loss": 0.7774, + "step": 347 + }, + { + "epoch": 0.2784, + "grad_norm": 0.5205237546210102, + "learning_rate": 0.00016941609828488807, + "loss": 0.8133, + "step": 348 + }, + { + "epoch": 0.2792, + "grad_norm": 0.3987540237364512, + "learning_rate": 0.00016922928274124886, + "loss": 0.692, + "step": 349 + }, + { + "epoch": 0.28, + "grad_norm": 0.42948195691635055, + "learning_rate": 0.0001690420020571747, + "loss": 0.7249, + "step": 350 + }, + { + "epoch": 0.2808, + "grad_norm": 0.6465198840665377, + "learning_rate": 0.00016885425749097444, + "loss": 0.8311, + "step": 351 + }, + { + "epoch": 0.2816, + "grad_norm": 0.47249829805105875, + "learning_rate": 0.0001686660503040737, + "loss": 0.7841, + "step": 352 + }, + { + "epoch": 0.2824, + "grad_norm": 0.4261879264130769, + "learning_rate": 0.00016847738176100632, + "loss": 0.7186, + "step": 353 + }, + { + "epoch": 0.2832, + "grad_norm": 0.4733907759093097, + "learning_rate": 0.00016828825312940592, + "loss": 0.7866, + "step": 354 + }, + { + "epoch": 0.284, + "grad_norm": 0.4146452149695671, + "learning_rate": 0.0001680986656799975, + "loss": 0.7119, + "step": 355 + }, + { + "epoch": 0.2848, + "grad_norm": 0.456970736717012, + "learning_rate": 0.0001679086206865886, + "loss": 0.7613, + "step": 356 + }, + { + "epoch": 0.2856, + "grad_norm": 0.4861683419656093, + "learning_rate": 0.00016771811942606108, + "loss": 0.777, + "step": 357 + }, + { + "epoch": 0.2864, + "grad_norm": 0.46398036139960575, + "learning_rate": 0.00016752716317836229, + "loss": 0.7371, + "step": 358 + }, + { + "epoch": 0.2872, + "grad_norm": 0.3741787436059476, + "learning_rate": 0.00016733575322649657, + "loss": 0.691, + "step": 359 + }, + { + "epoch": 0.288, + "grad_norm": 0.48059657128466404, + "learning_rate": 0.0001671438908565167, + "loss": 0.8145, + "step": 360 + }, + { + "epoch": 0.2888, + "grad_norm": 0.4494500231240095, + "learning_rate": 0.00016695157735751513, + "loss": 0.7534, + "step": 361 + }, + { + "epoch": 0.2896, + "grad_norm": 0.5289891737414257, + "learning_rate": 0.00016675881402161536, + "loss": 0.7592, + "step": 362 + }, + { + "epoch": 0.2904, + "grad_norm": 0.5238508441860991, + "learning_rate": 0.0001665656021439633, + "loss": 0.7654, + "step": 363 + }, + { + "epoch": 0.2912, + "grad_norm": 0.5006747022563904, + "learning_rate": 0.0001663719430227186, + "loss": 0.7694, + "step": 364 + }, + { + "epoch": 0.292, + "grad_norm": 0.5261095172263893, + "learning_rate": 0.00016617783795904565, + "loss": 0.7961, + "step": 365 + }, + { + "epoch": 0.2928, + "grad_norm": 0.4465164232516254, + "learning_rate": 0.00016598328825710533, + "loss": 0.7719, + "step": 366 + }, + { + "epoch": 0.2936, + "grad_norm": 0.5509450648508075, + "learning_rate": 0.00016578829522404583, + "loss": 0.8502, + "step": 367 + }, + { + "epoch": 0.2944, + "grad_norm": 0.4966173074903064, + "learning_rate": 0.000165592860169994, + "loss": 0.7722, + "step": 368 + }, + { + "epoch": 0.2952, + "grad_norm": 0.4979829819052925, + "learning_rate": 0.00016539698440804661, + "loss": 0.8141, + "step": 369 + }, + { + "epoch": 0.296, + "grad_norm": 0.4070899764023006, + "learning_rate": 0.00016520066925426144, + "loss": 0.7126, + "step": 370 + }, + { + "epoch": 0.2968, + "grad_norm": 0.5277470259602063, + "learning_rate": 0.0001650039160276485, + "loss": 0.7905, + "step": 371 + }, + { + "epoch": 0.2976, + "grad_norm": 0.55268793628899, + "learning_rate": 0.0001648067260501611, + "loss": 0.7444, + "step": 372 + }, + { + "epoch": 0.2984, + "grad_norm": 0.4661807766951434, + "learning_rate": 0.0001646091006466871, + "loss": 0.7516, + "step": 373 + }, + { + "epoch": 0.2992, + "grad_norm": 0.43528338047912696, + "learning_rate": 0.0001644110411450398, + "loss": 0.7993, + "step": 374 + }, + { + "epoch": 0.3, + "grad_norm": 0.5169034933901338, + "learning_rate": 0.00016421254887594917, + "loss": 0.7944, + "step": 375 + }, + { + "epoch": 0.3008, + "grad_norm": 0.4659723705564739, + "learning_rate": 0.00016401362517305296, + "loss": 0.7806, + "step": 376 + }, + { + "epoch": 0.3016, + "grad_norm": 0.4621920855334705, + "learning_rate": 0.00016381427137288754, + "loss": 0.7643, + "step": 377 + }, + { + "epoch": 0.3024, + "grad_norm": 0.5033112857376137, + "learning_rate": 0.00016361448881487914, + "loss": 0.7568, + "step": 378 + }, + { + "epoch": 0.3032, + "grad_norm": 0.45914237617703174, + "learning_rate": 0.0001634142788413346, + "loss": 0.7571, + "step": 379 + }, + { + "epoch": 0.304, + "grad_norm": 0.4811632118631198, + "learning_rate": 0.00016321364279743266, + "loss": 0.7865, + "step": 380 + }, + { + "epoch": 0.3048, + "grad_norm": 0.47428555365500835, + "learning_rate": 0.00016301258203121462, + "loss": 0.7615, + "step": 381 + }, + { + "epoch": 0.3056, + "grad_norm": 0.41642206878293037, + "learning_rate": 0.0001628110978935756, + "loss": 0.7635, + "step": 382 + }, + { + "epoch": 0.3064, + "grad_norm": 0.46410825848106113, + "learning_rate": 0.00016260919173825508, + "loss": 0.7795, + "step": 383 + }, + { + "epoch": 0.3072, + "grad_norm": 0.4749681805721738, + "learning_rate": 0.00016240686492182804, + "loss": 0.8257, + "step": 384 + }, + { + "epoch": 0.308, + "grad_norm": 0.4230544131011867, + "learning_rate": 0.00016220411880369601, + "loss": 0.7637, + "step": 385 + }, + { + "epoch": 0.3088, + "grad_norm": 0.3919205953867002, + "learning_rate": 0.00016200095474607753, + "loss": 0.729, + "step": 386 + }, + { + "epoch": 0.3096, + "grad_norm": 0.4645646620710507, + "learning_rate": 0.00016179737411399926, + "loss": 0.7632, + "step": 387 + }, + { + "epoch": 0.3104, + "grad_norm": 0.5226866117554707, + "learning_rate": 0.00016159337827528685, + "loss": 0.7977, + "step": 388 + }, + { + "epoch": 0.3112, + "grad_norm": 0.4532133815875697, + "learning_rate": 0.00016138896860055555, + "loss": 0.819, + "step": 389 + }, + { + "epoch": 0.312, + "grad_norm": 0.4800058122866018, + "learning_rate": 0.0001611841464632011, + "loss": 0.742, + "step": 390 + }, + { + "epoch": 0.3128, + "grad_norm": 0.40963900125414177, + "learning_rate": 0.00016097891323939062, + "loss": 0.7623, + "step": 391 + }, + { + "epoch": 0.3136, + "grad_norm": 0.4345153368300366, + "learning_rate": 0.0001607732703080532, + "loss": 0.7405, + "step": 392 + }, + { + "epoch": 0.3144, + "grad_norm": 0.3888545931609104, + "learning_rate": 0.00016056721905087056, + "loss": 0.7214, + "step": 393 + }, + { + "epoch": 0.3152, + "grad_norm": 0.3726771135751388, + "learning_rate": 0.00016036076085226814, + "loss": 0.6504, + "step": 394 + }, + { + "epoch": 0.316, + "grad_norm": 0.4961883258832556, + "learning_rate": 0.00016015389709940538, + "loss": 0.784, + "step": 395 + }, + { + "epoch": 0.3168, + "grad_norm": 0.4808155942240947, + "learning_rate": 0.0001599466291821666, + "loss": 0.7843, + "step": 396 + }, + { + "epoch": 0.3176, + "grad_norm": 0.4360198562884242, + "learning_rate": 0.0001597389584931517, + "loss": 0.7447, + "step": 397 + }, + { + "epoch": 0.3184, + "grad_norm": 0.4290451599095809, + "learning_rate": 0.0001595308864276666, + "loss": 0.7496, + "step": 398 + }, + { + "epoch": 0.3192, + "grad_norm": 0.49075497550809033, + "learning_rate": 0.0001593224143837142, + "loss": 0.7901, + "step": 399 + }, + { + "epoch": 0.32, + "grad_norm": 0.433975566052962, + "learning_rate": 0.0001591135437619847, + "loss": 0.8071, + "step": 400 + }, + { + "epoch": 0.3208, + "grad_norm": 0.43455806077802206, + "learning_rate": 0.00015890427596584617, + "loss": 0.7979, + "step": 401 + }, + { + "epoch": 0.3216, + "grad_norm": 0.39654183107816543, + "learning_rate": 0.0001586946124013354, + "loss": 0.7455, + "step": 402 + }, + { + "epoch": 0.3224, + "grad_norm": 0.4027747900164966, + "learning_rate": 0.00015848455447714822, + "loss": 0.7612, + "step": 403 + }, + { + "epoch": 0.3232, + "grad_norm": 0.5215110040861919, + "learning_rate": 0.0001582741036046301, + "loss": 0.8311, + "step": 404 + }, + { + "epoch": 0.324, + "grad_norm": 0.458476625944734, + "learning_rate": 0.00015806326119776663, + "loss": 0.6957, + "step": 405 + }, + { + "epoch": 0.3248, + "grad_norm": 0.5032765612976655, + "learning_rate": 0.00015785202867317407, + "loss": 0.7318, + "step": 406 + }, + { + "epoch": 0.3256, + "grad_norm": 0.4282328303611774, + "learning_rate": 0.00015764040745008988, + "loss": 0.7315, + "step": 407 + }, + { + "epoch": 0.3264, + "grad_norm": 0.4283810020409699, + "learning_rate": 0.00015742839895036305, + "loss": 0.8376, + "step": 408 + }, + { + "epoch": 0.3272, + "grad_norm": 0.3956368192508352, + "learning_rate": 0.00015721600459844468, + "loss": 0.6642, + "step": 409 + }, + { + "epoch": 0.328, + "grad_norm": 0.49128944752081743, + "learning_rate": 0.00015700322582137827, + "loss": 0.7537, + "step": 410 + }, + { + "epoch": 0.3288, + "grad_norm": 0.4001949751491434, + "learning_rate": 0.00015679006404879033, + "loss": 0.736, + "step": 411 + }, + { + "epoch": 0.3296, + "grad_norm": 0.4477164246360966, + "learning_rate": 0.0001565765207128805, + "loss": 0.7402, + "step": 412 + }, + { + "epoch": 0.3304, + "grad_norm": 0.4031274937697729, + "learning_rate": 0.00015636259724841222, + "loss": 0.6603, + "step": 413 + }, + { + "epoch": 0.3312, + "grad_norm": 0.43057763802374044, + "learning_rate": 0.0001561482950927029, + "loss": 0.8079, + "step": 414 + }, + { + "epoch": 0.332, + "grad_norm": 0.43719760308136896, + "learning_rate": 0.00015593361568561428, + "loss": 0.6774, + "step": 415 + }, + { + "epoch": 0.3328, + "grad_norm": 0.3676676205070124, + "learning_rate": 0.00015571856046954285, + "loss": 0.6877, + "step": 416 + }, + { + "epoch": 0.3336, + "grad_norm": 0.458911809801583, + "learning_rate": 0.0001555031308894101, + "loss": 0.8522, + "step": 417 + }, + { + "epoch": 0.3344, + "grad_norm": 0.5571317205735612, + "learning_rate": 0.00015528732839265272, + "loss": 0.8047, + "step": 418 + }, + { + "epoch": 0.3352, + "grad_norm": 0.48626898520583006, + "learning_rate": 0.0001550711544292131, + "loss": 0.8081, + "step": 419 + }, + { + "epoch": 0.336, + "grad_norm": 0.5310547052797169, + "learning_rate": 0.0001548546104515294, + "loss": 0.8538, + "step": 420 + }, + { + "epoch": 0.3368, + "grad_norm": 0.45137021273237177, + "learning_rate": 0.00015463769791452574, + "loss": 0.7976, + "step": 421 + }, + { + "epoch": 0.3376, + "grad_norm": 0.41415089507065106, + "learning_rate": 0.00015442041827560274, + "loss": 0.7359, + "step": 422 + }, + { + "epoch": 0.3384, + "grad_norm": 0.37909456905958816, + "learning_rate": 0.00015420277299462736, + "loss": 0.6809, + "step": 423 + }, + { + "epoch": 0.3392, + "grad_norm": 0.48780051331278546, + "learning_rate": 0.00015398476353392323, + "loss": 0.7655, + "step": 424 + }, + { + "epoch": 0.34, + "grad_norm": 0.4580221180681622, + "learning_rate": 0.00015376639135826107, + "loss": 0.7339, + "step": 425 + }, + { + "epoch": 0.3408, + "grad_norm": 0.4132785494038251, + "learning_rate": 0.00015354765793484834, + "loss": 0.7837, + "step": 426 + }, + { + "epoch": 0.3416, + "grad_norm": 0.3944766874177697, + "learning_rate": 0.00015332856473331978, + "loss": 0.7302, + "step": 427 + }, + { + "epoch": 0.3424, + "grad_norm": 0.4037477141523566, + "learning_rate": 0.00015310911322572753, + "loss": 0.7347, + "step": 428 + }, + { + "epoch": 0.3432, + "grad_norm": 0.44285585008417, + "learning_rate": 0.00015288930488653094, + "loss": 0.7709, + "step": 429 + }, + { + "epoch": 0.344, + "grad_norm": 0.46780597074089453, + "learning_rate": 0.000152669141192587, + "loss": 0.6999, + "step": 430 + }, + { + "epoch": 0.3448, + "grad_norm": 0.43483794888344374, + "learning_rate": 0.0001524486236231402, + "loss": 0.7685, + "step": 431 + }, + { + "epoch": 0.3456, + "grad_norm": 0.4245764749893502, + "learning_rate": 0.00015222775365981273, + "loss": 0.7185, + "step": 432 + }, + { + "epoch": 0.3464, + "grad_norm": 0.41944137971055734, + "learning_rate": 0.00015200653278659432, + "loss": 0.7137, + "step": 433 + }, + { + "epoch": 0.3472, + "grad_norm": 0.4152331799504103, + "learning_rate": 0.00015178496248983254, + "loss": 0.7186, + "step": 434 + }, + { + "epoch": 0.348, + "grad_norm": 0.3838060086477032, + "learning_rate": 0.00015156304425822267, + "loss": 0.686, + "step": 435 + }, + { + "epoch": 0.3488, + "grad_norm": 0.5027524972661573, + "learning_rate": 0.00015134077958279765, + "loss": 0.7165, + "step": 436 + }, + { + "epoch": 0.3496, + "grad_norm": 0.451858046202718, + "learning_rate": 0.00015111816995691809, + "loss": 0.7389, + "step": 437 + }, + { + "epoch": 0.3504, + "grad_norm": 0.44750570537460616, + "learning_rate": 0.00015089521687626243, + "loss": 0.8162, + "step": 438 + }, + { + "epoch": 0.3512, + "grad_norm": 0.4193427784451634, + "learning_rate": 0.00015067192183881658, + "loss": 0.7083, + "step": 439 + }, + { + "epoch": 0.352, + "grad_norm": 0.3985119493318944, + "learning_rate": 0.000150448286344864, + "loss": 0.7839, + "step": 440 + }, + { + "epoch": 0.3528, + "grad_norm": 0.6216787661550495, + "learning_rate": 0.00015022431189697568, + "loss": 0.8906, + "step": 441 + }, + { + "epoch": 0.3536, + "grad_norm": 0.4942803545738827, + "learning_rate": 0.00015000000000000001, + "loss": 0.8168, + "step": 442 + }, + { + "epoch": 0.3544, + "grad_norm": 0.42018726726070965, + "learning_rate": 0.0001497753521610526, + "loss": 0.6902, + "step": 443 + }, + { + "epoch": 0.3552, + "grad_norm": 0.43343226435576415, + "learning_rate": 0.00014955036988950618, + "loss": 0.7445, + "step": 444 + }, + { + "epoch": 0.356, + "grad_norm": 0.42222783049209456, + "learning_rate": 0.00014932505469698052, + "loss": 0.7477, + "step": 445 + }, + { + "epoch": 0.3568, + "grad_norm": 0.4997280795026545, + "learning_rate": 0.00014909940809733222, + "loss": 0.8327, + "step": 446 + }, + { + "epoch": 0.3576, + "grad_norm": 0.4509750043267108, + "learning_rate": 0.0001488734316066446, + "loss": 0.7193, + "step": 447 + }, + { + "epoch": 0.3584, + "grad_norm": 0.5435894942324518, + "learning_rate": 0.00014864712674321734, + "loss": 0.8441, + "step": 448 + }, + { + "epoch": 0.3592, + "grad_norm": 0.5690852374518198, + "learning_rate": 0.0001484204950275565, + "loss": 0.8191, + "step": 449 + }, + { + "epoch": 0.36, + "grad_norm": 0.5901781835149241, + "learning_rate": 0.00014819353798236427, + "loss": 0.8416, + "step": 450 + }, + { + "epoch": 0.3608, + "grad_norm": 0.44095547117052597, + "learning_rate": 0.00014796625713252848, + "loss": 0.6817, + "step": 451 + }, + { + "epoch": 0.3616, + "grad_norm": 0.410082351206514, + "learning_rate": 0.00014773865400511272, + "loss": 0.6844, + "step": 452 + }, + { + "epoch": 0.3624, + "grad_norm": 0.433390304861033, + "learning_rate": 0.00014751073012934587, + "loss": 0.7151, + "step": 453 + }, + { + "epoch": 0.3632, + "grad_norm": 0.4673148852916645, + "learning_rate": 0.00014728248703661182, + "loss": 0.7742, + "step": 454 + }, + { + "epoch": 0.364, + "grad_norm": 0.5629792615802157, + "learning_rate": 0.0001470539262604393, + "loss": 0.6944, + "step": 455 + }, + { + "epoch": 0.3648, + "grad_norm": 0.4197799181102277, + "learning_rate": 0.00014682504933649144, + "loss": 0.7866, + "step": 456 + }, + { + "epoch": 0.3656, + "grad_norm": 0.42309809359489753, + "learning_rate": 0.00014659585780255556, + "loss": 0.6679, + "step": 457 + }, + { + "epoch": 0.3664, + "grad_norm": 0.5107005715491209, + "learning_rate": 0.00014636635319853275, + "loss": 0.7455, + "step": 458 + }, + { + "epoch": 0.3672, + "grad_norm": 0.4033634928151588, + "learning_rate": 0.0001461365370664276, + "loss": 0.7024, + "step": 459 + }, + { + "epoch": 0.368, + "grad_norm": 0.4384657706250393, + "learning_rate": 0.00014590641095033787, + "loss": 0.739, + "step": 460 + }, + { + "epoch": 0.3688, + "grad_norm": 0.4504037806901727, + "learning_rate": 0.00014567597639644387, + "loss": 0.7991, + "step": 461 + }, + { + "epoch": 0.3696, + "grad_norm": 0.48886688910936515, + "learning_rate": 0.00014544523495299842, + "loss": 0.7435, + "step": 462 + }, + { + "epoch": 0.3704, + "grad_norm": 0.41267602272551973, + "learning_rate": 0.00014521418817031628, + "loss": 0.7444, + "step": 463 + }, + { + "epoch": 0.3712, + "grad_norm": 0.4115970936792021, + "learning_rate": 0.0001449828376007636, + "loss": 0.696, + "step": 464 + }, + { + "epoch": 0.372, + "grad_norm": 0.38804305128365696, + "learning_rate": 0.00014475118479874774, + "loss": 0.7202, + "step": 465 + }, + { + "epoch": 0.3728, + "grad_norm": 0.45562899264909823, + "learning_rate": 0.0001445192313207067, + "loss": 0.7737, + "step": 466 + }, + { + "epoch": 0.3736, + "grad_norm": 0.47743243253657847, + "learning_rate": 0.0001442869787250987, + "loss": 0.758, + "step": 467 + }, + { + "epoch": 0.3744, + "grad_norm": 0.4395717414306346, + "learning_rate": 0.0001440544285723915, + "loss": 0.6759, + "step": 468 + }, + { + "epoch": 0.3752, + "grad_norm": 0.5150009930824007, + "learning_rate": 0.00014382158242505234, + "loss": 0.8263, + "step": 469 + }, + { + "epoch": 0.376, + "grad_norm": 0.44237543082679537, + "learning_rate": 0.00014358844184753712, + "loss": 0.7734, + "step": 470 + }, + { + "epoch": 0.3768, + "grad_norm": 0.4931934397534956, + "learning_rate": 0.00014335500840627986, + "loss": 0.7782, + "step": 471 + }, + { + "epoch": 0.3776, + "grad_norm": 0.45668570071819947, + "learning_rate": 0.00014312128366968243, + "loss": 0.7347, + "step": 472 + }, + { + "epoch": 0.3784, + "grad_norm": 0.44567464720174327, + "learning_rate": 0.0001428872692081038, + "loss": 0.764, + "step": 473 + }, + { + "epoch": 0.3792, + "grad_norm": 0.40635707404287097, + "learning_rate": 0.00014265296659384956, + "loss": 0.7772, + "step": 474 + }, + { + "epoch": 0.38, + "grad_norm": 0.4730844272067816, + "learning_rate": 0.00014241837740116132, + "loss": 0.7864, + "step": 475 + }, + { + "epoch": 0.3808, + "grad_norm": 0.42541835491027025, + "learning_rate": 0.00014218350320620624, + "loss": 0.7079, + "step": 476 + }, + { + "epoch": 0.3816, + "grad_norm": 0.5089978868540131, + "learning_rate": 0.00014194834558706632, + "loss": 0.7802, + "step": 477 + }, + { + "epoch": 0.3824, + "grad_norm": 0.5211685215608624, + "learning_rate": 0.0001417129061237278, + "loss": 0.8409, + "step": 478 + }, + { + "epoch": 0.3832, + "grad_norm": 0.47327225076322404, + "learning_rate": 0.0001414771863980707, + "loss": 0.8571, + "step": 479 + }, + { + "epoch": 0.384, + "grad_norm": 0.40936558132508094, + "learning_rate": 0.00014124118799385796, + "loss": 0.7973, + "step": 480 + }, + { + "epoch": 0.3848, + "grad_norm": 0.4362419996174767, + "learning_rate": 0.00014100491249672498, + "loss": 0.7758, + "step": 481 + }, + { + "epoch": 0.3856, + "grad_norm": 0.5238046230383762, + "learning_rate": 0.00014076836149416887, + "loss": 0.7709, + "step": 482 + }, + { + "epoch": 0.3864, + "grad_norm": 0.5497251848678121, + "learning_rate": 0.0001405315365755379, + "loss": 0.7611, + "step": 483 + }, + { + "epoch": 0.3872, + "grad_norm": 0.45884982502845467, + "learning_rate": 0.0001402944393320206, + "loss": 0.7774, + "step": 484 + }, + { + "epoch": 0.388, + "grad_norm": 0.4012956728140862, + "learning_rate": 0.00014005707135663527, + "loss": 0.7038, + "step": 485 + }, + { + "epoch": 0.3888, + "grad_norm": 0.46963958742490086, + "learning_rate": 0.00013981943424421932, + "loss": 0.7212, + "step": 486 + }, + { + "epoch": 0.3896, + "grad_norm": 0.414724977340573, + "learning_rate": 0.00013958152959141825, + "loss": 0.7483, + "step": 487 + }, + { + "epoch": 0.3904, + "grad_norm": 0.45545795630258695, + "learning_rate": 0.00013934335899667527, + "loss": 0.8236, + "step": 488 + }, + { + "epoch": 0.3912, + "grad_norm": 0.4080879001104178, + "learning_rate": 0.00013910492406022033, + "loss": 0.6612, + "step": 489 + }, + { + "epoch": 0.392, + "grad_norm": 0.5728231130149111, + "learning_rate": 0.00013886622638405952, + "loss": 0.8943, + "step": 490 + }, + { + "epoch": 0.3928, + "grad_norm": 0.4048483515111806, + "learning_rate": 0.0001386272675719642, + "loss": 0.7335, + "step": 491 + }, + { + "epoch": 0.3936, + "grad_norm": 0.3714512698859433, + "learning_rate": 0.00013838804922946027, + "loss": 0.7355, + "step": 492 + }, + { + "epoch": 0.3944, + "grad_norm": 0.4632496150613504, + "learning_rate": 0.00013814857296381728, + "loss": 0.8119, + "step": 493 + }, + { + "epoch": 0.3952, + "grad_norm": 0.44023327521008365, + "learning_rate": 0.00013790884038403795, + "loss": 0.791, + "step": 494 + }, + { + "epoch": 0.396, + "grad_norm": 0.4715924508488183, + "learning_rate": 0.00013766885310084688, + "loss": 0.7535, + "step": 495 + }, + { + "epoch": 0.3968, + "grad_norm": 0.44552213416424963, + "learning_rate": 0.00013742861272668012, + "loss": 0.7662, + "step": 496 + }, + { + "epoch": 0.3976, + "grad_norm": 0.4097714152317744, + "learning_rate": 0.00013718812087567414, + "loss": 0.6585, + "step": 497 + }, + { + "epoch": 0.3984, + "grad_norm": 0.41866497088119065, + "learning_rate": 0.00013694737916365517, + "loss": 0.6993, + "step": 498 + }, + { + "epoch": 0.3992, + "grad_norm": 0.4188938513957478, + "learning_rate": 0.000136706389208128, + "loss": 0.6517, + "step": 499 + }, + { + "epoch": 0.4, + "grad_norm": 0.414161560358882, + "learning_rate": 0.00013646515262826552, + "loss": 0.701, + "step": 500 + }, + { + "epoch": 0.4008, + "grad_norm": 0.38747983953923315, + "learning_rate": 0.00013622367104489756, + "loss": 0.6629, + "step": 501 + }, + { + "epoch": 0.4016, + "grad_norm": 0.42397825009703494, + "learning_rate": 0.0001359819460805001, + "loss": 0.7227, + "step": 502 + }, + { + "epoch": 0.4024, + "grad_norm": 0.5074637517864383, + "learning_rate": 0.0001357399793591844, + "loss": 0.7306, + "step": 503 + }, + { + "epoch": 0.4032, + "grad_norm": 0.41451661042263027, + "learning_rate": 0.0001354977725066859, + "loss": 0.7548, + "step": 504 + }, + { + "epoch": 0.404, + "grad_norm": 0.5207857570485644, + "learning_rate": 0.00013525532715035366, + "loss": 0.8035, + "step": 505 + }, + { + "epoch": 0.4048, + "grad_norm": 0.48617295332934835, + "learning_rate": 0.00013501264491913906, + "loss": 0.8309, + "step": 506 + }, + { + "epoch": 0.4056, + "grad_norm": 0.3849126756616638, + "learning_rate": 0.00013476972744358507, + "loss": 0.6318, + "step": 507 + }, + { + "epoch": 0.4064, + "grad_norm": 0.44966786994539865, + "learning_rate": 0.0001345265763558152, + "loss": 0.7125, + "step": 508 + }, + { + "epoch": 0.4072, + "grad_norm": 0.4998498402001065, + "learning_rate": 0.00013428319328952253, + "loss": 0.7635, + "step": 509 + }, + { + "epoch": 0.408, + "grad_norm": 0.40824096736907733, + "learning_rate": 0.00013403957987995882, + "loss": 0.8029, + "step": 510 + }, + { + "epoch": 0.4088, + "grad_norm": 0.42493678327155465, + "learning_rate": 0.0001337957377639235, + "loss": 0.7078, + "step": 511 + }, + { + "epoch": 0.4096, + "grad_norm": 0.4643039147700563, + "learning_rate": 0.0001335516685797525, + "loss": 0.763, + "step": 512 + }, + { + "epoch": 0.4104, + "grad_norm": 0.451550065523837, + "learning_rate": 0.0001333073739673076, + "loss": 0.7453, + "step": 513 + }, + { + "epoch": 0.4112, + "grad_norm": 0.3962407481953412, + "learning_rate": 0.00013306285556796495, + "loss": 0.6989, + "step": 514 + }, + { + "epoch": 0.412, + "grad_norm": 0.4240972513922981, + "learning_rate": 0.0001328181150246045, + "loss": 0.6977, + "step": 515 + }, + { + "epoch": 0.4128, + "grad_norm": 0.41336338106308385, + "learning_rate": 0.00013257315398159864, + "loss": 0.687, + "step": 516 + }, + { + "epoch": 0.4136, + "grad_norm": 0.37027685712700015, + "learning_rate": 0.00013232797408480127, + "loss": 0.6883, + "step": 517 + }, + { + "epoch": 0.4144, + "grad_norm": 0.40170706197125805, + "learning_rate": 0.00013208257698153677, + "loss": 0.7568, + "step": 518 + }, + { + "epoch": 0.4152, + "grad_norm": 0.4554872123840497, + "learning_rate": 0.00013183696432058888, + "loss": 0.7447, + "step": 519 + }, + { + "epoch": 0.416, + "grad_norm": 0.40574395432970756, + "learning_rate": 0.00013159113775218964, + "loss": 0.7286, + "step": 520 + }, + { + "epoch": 0.4168, + "grad_norm": 0.48608042117887207, + "learning_rate": 0.00013134509892800822, + "loss": 0.7384, + "step": 521 + }, + { + "epoch": 0.4176, + "grad_norm": 0.45949040952795284, + "learning_rate": 0.00013109884950114007, + "loss": 0.7059, + "step": 522 + }, + { + "epoch": 0.4184, + "grad_norm": 0.42454813479036535, + "learning_rate": 0.00013085239112609547, + "loss": 0.7408, + "step": 523 + }, + { + "epoch": 0.4192, + "grad_norm": 0.42970061008444677, + "learning_rate": 0.00013060572545878875, + "loss": 0.7353, + "step": 524 + }, + { + "epoch": 0.42, + "grad_norm": 0.415338461458164, + "learning_rate": 0.00013035885415652685, + "loss": 0.7682, + "step": 525 + }, + { + "epoch": 0.4208, + "grad_norm": 0.40326020822765896, + "learning_rate": 0.00013011177887799845, + "loss": 0.7332, + "step": 526 + }, + { + "epoch": 0.4216, + "grad_norm": 0.4696858589276105, + "learning_rate": 0.00012986450128326266, + "loss": 0.8016, + "step": 527 + }, + { + "epoch": 0.4224, + "grad_norm": 0.3866997010269802, + "learning_rate": 0.00012961702303373795, + "loss": 0.7031, + "step": 528 + }, + { + "epoch": 0.4232, + "grad_norm": 0.4302858454821888, + "learning_rate": 0.00012936934579219094, + "loss": 0.7477, + "step": 529 + }, + { + "epoch": 0.424, + "grad_norm": 0.5291243986938438, + "learning_rate": 0.00012912147122272523, + "loss": 0.7281, + "step": 530 + }, + { + "epoch": 0.4248, + "grad_norm": 0.414861003656148, + "learning_rate": 0.00012887340099077024, + "loss": 0.6288, + "step": 531 + }, + { + "epoch": 0.4256, + "grad_norm": 0.3745285570911487, + "learning_rate": 0.00012862513676307008, + "loss": 0.6187, + "step": 532 + }, + { + "epoch": 0.4264, + "grad_norm": 0.4433836524373664, + "learning_rate": 0.0001283766802076722, + "loss": 0.7558, + "step": 533 + }, + { + "epoch": 0.4272, + "grad_norm": 0.41415406260115645, + "learning_rate": 0.00012812803299391628, + "loss": 0.6811, + "step": 534 + }, + { + "epoch": 0.428, + "grad_norm": 0.5928849942784749, + "learning_rate": 0.00012787919679242306, + "loss": 0.8649, + "step": 535 + }, + { + "epoch": 0.4288, + "grad_norm": 0.4460533812963949, + "learning_rate": 0.00012763017327508305, + "loss": 0.7696, + "step": 536 + }, + { + "epoch": 0.4296, + "grad_norm": 0.4322610863341201, + "learning_rate": 0.00012738096411504522, + "loss": 0.7364, + "step": 537 + }, + { + "epoch": 0.4304, + "grad_norm": 0.43468893082608867, + "learning_rate": 0.0001271315709867059, + "loss": 0.7042, + "step": 538 + }, + { + "epoch": 0.4312, + "grad_norm": 0.4279277608921427, + "learning_rate": 0.00012688199556569753, + "loss": 0.7676, + "step": 539 + }, + { + "epoch": 0.432, + "grad_norm": 0.4740815355097255, + "learning_rate": 0.00012663223952887723, + "loss": 0.7874, + "step": 540 + }, + { + "epoch": 0.4328, + "grad_norm": 0.4662686508421357, + "learning_rate": 0.0001263823045543158, + "loss": 0.7279, + "step": 541 + }, + { + "epoch": 0.4336, + "grad_norm": 0.45311482424332367, + "learning_rate": 0.00012613219232128608, + "loss": 0.7358, + "step": 542 + }, + { + "epoch": 0.4344, + "grad_norm": 0.5402236483780922, + "learning_rate": 0.00012588190451025207, + "loss": 0.7455, + "step": 543 + }, + { + "epoch": 0.4352, + "grad_norm": 0.37960441149392116, + "learning_rate": 0.00012563144280285741, + "loss": 0.6726, + "step": 544 + }, + { + "epoch": 0.436, + "grad_norm": 0.6249639514914175, + "learning_rate": 0.00012538080888191408, + "loss": 0.8621, + "step": 545 + }, + { + "epoch": 0.4368, + "grad_norm": 0.5331954264682194, + "learning_rate": 0.00012513000443139112, + "loss": 0.8619, + "step": 546 + }, + { + "epoch": 0.4376, + "grad_norm": 0.43736806035408327, + "learning_rate": 0.00012487903113640337, + "loss": 0.7501, + "step": 547 + }, + { + "epoch": 0.4384, + "grad_norm": 0.39133146091624577, + "learning_rate": 0.00012462789068320017, + "loss": 0.6914, + "step": 548 + }, + { + "epoch": 0.4392, + "grad_norm": 0.5933312796253537, + "learning_rate": 0.00012437658475915377, + "loss": 0.9193, + "step": 549 + }, + { + "epoch": 0.44, + "grad_norm": 0.3847507085275376, + "learning_rate": 0.00012412511505274844, + "loss": 0.7759, + "step": 550 + }, + { + "epoch": 0.4408, + "grad_norm": 0.363405009433934, + "learning_rate": 0.00012387348325356874, + "loss": 0.7039, + "step": 551 + }, + { + "epoch": 0.4416, + "grad_norm": 0.42582514682408823, + "learning_rate": 0.00012362169105228826, + "loss": 0.7756, + "step": 552 + }, + { + "epoch": 0.4424, + "grad_norm": 0.4090049364624364, + "learning_rate": 0.00012336974014065844, + "loss": 0.8281, + "step": 553 + }, + { + "epoch": 0.4432, + "grad_norm": 0.3738580177207341, + "learning_rate": 0.000123117632211497, + "loss": 0.7165, + "step": 554 + }, + { + "epoch": 0.444, + "grad_norm": 0.4863876051491984, + "learning_rate": 0.00012286536895867654, + "loss": 0.8025, + "step": 555 + }, + { + "epoch": 0.4448, + "grad_norm": 0.45352490918087, + "learning_rate": 0.00012261295207711346, + "loss": 0.7511, + "step": 556 + }, + { + "epoch": 0.4456, + "grad_norm": 0.42160383400991047, + "learning_rate": 0.00012236038326275626, + "loss": 0.7283, + "step": 557 + }, + { + "epoch": 0.4464, + "grad_norm": 0.4083416424869398, + "learning_rate": 0.0001221076642125742, + "loss": 0.7113, + "step": 558 + }, + { + "epoch": 0.4472, + "grad_norm": 0.4499074190268108, + "learning_rate": 0.00012185479662454595, + "loss": 0.7747, + "step": 559 + }, + { + "epoch": 0.448, + "grad_norm": 0.5312004411706165, + "learning_rate": 0.00012160178219764837, + "loss": 0.7676, + "step": 560 + }, + { + "epoch": 0.4488, + "grad_norm": 0.4181180073120208, + "learning_rate": 0.00012134862263184467, + "loss": 0.7128, + "step": 561 + }, + { + "epoch": 0.4496, + "grad_norm": 0.3742699657921233, + "learning_rate": 0.00012109531962807332, + "loss": 0.7277, + "step": 562 + }, + { + "epoch": 0.4504, + "grad_norm": 0.41547253781863375, + "learning_rate": 0.00012084187488823657, + "loss": 0.7226, + "step": 563 + }, + { + "epoch": 0.4512, + "grad_norm": 0.4125901670375791, + "learning_rate": 0.00012058829011518896, + "loss": 0.7637, + "step": 564 + }, + { + "epoch": 0.452, + "grad_norm": 0.3940323995879189, + "learning_rate": 0.00012033456701272576, + "loss": 0.7455, + "step": 565 + }, + { + "epoch": 0.4528, + "grad_norm": 0.44271352119465857, + "learning_rate": 0.00012008070728557186, + "loss": 0.7134, + "step": 566 + }, + { + "epoch": 0.4536, + "grad_norm": 0.420556415407943, + "learning_rate": 0.00011982671263936995, + "loss": 0.6852, + "step": 567 + }, + { + "epoch": 0.4544, + "grad_norm": 0.5086736657220369, + "learning_rate": 0.00011957258478066931, + "loss": 0.7157, + "step": 568 + }, + { + "epoch": 0.4552, + "grad_norm": 0.5047448006462422, + "learning_rate": 0.00011931832541691418, + "loss": 0.7405, + "step": 569 + }, + { + "epoch": 0.456, + "grad_norm": 0.47623346942205214, + "learning_rate": 0.00011906393625643244, + "loss": 0.6759, + "step": 570 + }, + { + "epoch": 0.4568, + "grad_norm": 0.435263941647073, + "learning_rate": 0.00011880941900842397, + "loss": 0.7728, + "step": 571 + }, + { + "epoch": 0.4576, + "grad_norm": 0.4351854503352841, + "learning_rate": 0.00011855477538294935, + "loss": 0.6759, + "step": 572 + }, + { + "epoch": 0.4584, + "grad_norm": 0.4412934491702434, + "learning_rate": 0.00011830000709091815, + "loss": 0.7063, + "step": 573 + }, + { + "epoch": 0.4592, + "grad_norm": 0.41445704404686395, + "learning_rate": 0.00011804511584407763, + "loss": 0.7044, + "step": 574 + }, + { + "epoch": 0.46, + "grad_norm": 0.45066220450604044, + "learning_rate": 0.0001177901033550012, + "loss": 0.7094, + "step": 575 + }, + { + "epoch": 0.4608, + "grad_norm": 0.4233844751789096, + "learning_rate": 0.00011753497133707679, + "loss": 0.6544, + "step": 576 + }, + { + "epoch": 0.4616, + "grad_norm": 0.5000831029823927, + "learning_rate": 0.00011727972150449544, + "loss": 0.776, + "step": 577 + }, + { + "epoch": 0.4624, + "grad_norm": 0.418103957155811, + "learning_rate": 0.00011702435557223987, + "loss": 0.7505, + "step": 578 + }, + { + "epoch": 0.4632, + "grad_norm": 0.3835911317292797, + "learning_rate": 0.00011676887525607271, + "loss": 0.7067, + "step": 579 + }, + { + "epoch": 0.464, + "grad_norm": 0.38346700490955143, + "learning_rate": 0.00011651328227252517, + "loss": 0.7269, + "step": 580 + }, + { + "epoch": 0.4648, + "grad_norm": 0.4428443387426583, + "learning_rate": 0.00011625757833888551, + "loss": 0.7108, + "step": 581 + }, + { + "epoch": 0.4656, + "grad_norm": 0.41075676169897724, + "learning_rate": 0.00011600176517318741, + "loss": 0.7651, + "step": 582 + }, + { + "epoch": 0.4664, + "grad_norm": 0.5108612361056679, + "learning_rate": 0.0001157458444941984, + "loss": 0.703, + "step": 583 + }, + { + "epoch": 0.4672, + "grad_norm": 0.3939003031118122, + "learning_rate": 0.00011548981802140848, + "loss": 0.7866, + "step": 584 + }, + { + "epoch": 0.468, + "grad_norm": 0.44288146767279274, + "learning_rate": 0.00011523368747501839, + "loss": 0.8414, + "step": 585 + }, + { + "epoch": 0.4688, + "grad_norm": 0.6436151513443139, + "learning_rate": 0.00011497745457592816, + "loss": 0.727, + "step": 586 + }, + { + "epoch": 0.4696, + "grad_norm": 0.5314724432682242, + "learning_rate": 0.00011472112104572547, + "loss": 0.6771, + "step": 587 + }, + { + "epoch": 0.4704, + "grad_norm": 0.41804944318320336, + "learning_rate": 0.00011446468860667421, + "loss": 0.723, + "step": 588 + }, + { + "epoch": 0.4712, + "grad_norm": 0.43385673641757744, + "learning_rate": 0.0001142081589817027, + "loss": 0.6596, + "step": 589 + }, + { + "epoch": 0.472, + "grad_norm": 0.43972284723540145, + "learning_rate": 0.00011395153389439233, + "loss": 0.7128, + "step": 590 + }, + { + "epoch": 0.4728, + "grad_norm": 0.4130285489578436, + "learning_rate": 0.00011369481506896582, + "loss": 0.7377, + "step": 591 + }, + { + "epoch": 0.4736, + "grad_norm": 0.39400294155797677, + "learning_rate": 0.00011343800423027582, + "loss": 0.7172, + "step": 592 + }, + { + "epoch": 0.4744, + "grad_norm": 0.3556266444230241, + "learning_rate": 0.00011318110310379301, + "loss": 0.6508, + "step": 593 + }, + { + "epoch": 0.4752, + "grad_norm": 0.3922797153545315, + "learning_rate": 0.0001129241134155949, + "loss": 0.733, + "step": 594 + }, + { + "epoch": 0.476, + "grad_norm": 0.42557567070328217, + "learning_rate": 0.00011266703689235394, + "loss": 0.6811, + "step": 595 + }, + { + "epoch": 0.4768, + "grad_norm": 0.47518565761801196, + "learning_rate": 0.00011240987526132594, + "loss": 0.6853, + "step": 596 + }, + { + "epoch": 0.4776, + "grad_norm": 0.5243789972413347, + "learning_rate": 0.00011215263025033869, + "loss": 0.7605, + "step": 597 + }, + { + "epoch": 0.4784, + "grad_norm": 0.596499320293954, + "learning_rate": 0.00011189530358778005, + "loss": 0.845, + "step": 598 + }, + { + "epoch": 0.4792, + "grad_norm": 0.5678827706833086, + "learning_rate": 0.00011163789700258655, + "loss": 0.7664, + "step": 599 + }, + { + "epoch": 0.48, + "grad_norm": 0.46971601029972, + "learning_rate": 0.00011138041222423177, + "loss": 0.7971, + "step": 600 + }, + { + "epoch": 0.4808, + "grad_norm": 0.448639421281421, + "learning_rate": 0.00011112285098271451, + "loss": 0.7541, + "step": 601 + }, + { + "epoch": 0.4816, + "grad_norm": 0.46375060985537697, + "learning_rate": 0.00011086521500854745, + "loss": 0.7397, + "step": 602 + }, + { + "epoch": 0.4824, + "grad_norm": 0.48939844130058974, + "learning_rate": 0.00011060750603274535, + "loss": 0.7442, + "step": 603 + }, + { + "epoch": 0.4832, + "grad_norm": 0.3873308221390546, + "learning_rate": 0.00011034972578681338, + "loss": 0.6937, + "step": 604 + }, + { + "epoch": 0.484, + "grad_norm": 0.5384616961392372, + "learning_rate": 0.00011009187600273566, + "loss": 0.7938, + "step": 605 + }, + { + "epoch": 0.4848, + "grad_norm": 0.47470324845901535, + "learning_rate": 0.00010983395841296348, + "loss": 0.8125, + "step": 606 + }, + { + "epoch": 0.4856, + "grad_norm": 0.42177784078337344, + "learning_rate": 0.00010957597475040373, + "loss": 0.744, + "step": 607 + }, + { + "epoch": 0.4864, + "grad_norm": 0.4934661926935679, + "learning_rate": 0.00010931792674840718, + "loss": 0.7409, + "step": 608 + }, + { + "epoch": 0.4872, + "grad_norm": 0.4613479157092345, + "learning_rate": 0.00010905981614075693, + "loss": 0.779, + "step": 609 + }, + { + "epoch": 0.488, + "grad_norm": 0.5452939935438601, + "learning_rate": 0.00010880164466165674, + "loss": 0.8148, + "step": 610 + }, + { + "epoch": 0.4888, + "grad_norm": 0.4508393845728654, + "learning_rate": 0.00010854341404571928, + "loss": 0.7924, + "step": 611 + }, + { + "epoch": 0.4896, + "grad_norm": 0.4565799747480313, + "learning_rate": 0.00010828512602795462, + "loss": 0.737, + "step": 612 + }, + { + "epoch": 0.4904, + "grad_norm": 0.519353136825866, + "learning_rate": 0.00010802678234375851, + "loss": 0.8351, + "step": 613 + }, + { + "epoch": 0.4912, + "grad_norm": 0.4497705570041638, + "learning_rate": 0.00010776838472890065, + "loss": 0.7249, + "step": 614 + }, + { + "epoch": 0.492, + "grad_norm": 0.3677815627301932, + "learning_rate": 0.0001075099349195131, + "loss": 0.6891, + "step": 615 + }, + { + "epoch": 0.4928, + "grad_norm": 0.4838852628668298, + "learning_rate": 0.00010725143465207867, + "loss": 0.7266, + "step": 616 + }, + { + "epoch": 0.4936, + "grad_norm": 0.5254859830223015, + "learning_rate": 0.00010699288566341914, + "loss": 0.7767, + "step": 617 + }, + { + "epoch": 0.4944, + "grad_norm": 0.3919541556072212, + "learning_rate": 0.00010673428969068364, + "loss": 0.7107, + "step": 618 + }, + { + "epoch": 0.4952, + "grad_norm": 0.42827431072546535, + "learning_rate": 0.000106475648471337, + "loss": 0.7075, + "step": 619 + }, + { + "epoch": 0.496, + "grad_norm": 0.46837510300320306, + "learning_rate": 0.00010621696374314807, + "loss": 0.7855, + "step": 620 + }, + { + "epoch": 0.4968, + "grad_norm": 0.5506398108077281, + "learning_rate": 0.00010595823724417795, + "loss": 0.8101, + "step": 621 + }, + { + "epoch": 0.4976, + "grad_norm": 0.44938366772991534, + "learning_rate": 0.00010569947071276847, + "loss": 0.7572, + "step": 622 + }, + { + "epoch": 0.4984, + "grad_norm": 0.4602391712132883, + "learning_rate": 0.00010544066588753044, + "loss": 0.7316, + "step": 623 + }, + { + "epoch": 0.4992, + "grad_norm": 0.46875722931233516, + "learning_rate": 0.00010518182450733186, + "loss": 0.7303, + "step": 624 + }, + { + "epoch": 0.5, + "grad_norm": 0.4697615865717118, + "learning_rate": 0.00010492294831128641, + "loss": 0.7817, + "step": 625 + }, + { + "epoch": 0.5008, + "grad_norm": 0.41354754102731717, + "learning_rate": 0.00010466403903874176, + "loss": 0.7188, + "step": 626 + }, + { + "epoch": 0.5016, + "grad_norm": 0.3805016393702369, + "learning_rate": 0.00010440509842926767, + "loss": 0.6743, + "step": 627 + }, + { + "epoch": 0.5024, + "grad_norm": 0.5054227844560674, + "learning_rate": 0.00010414612822264455, + "loss": 0.7994, + "step": 628 + }, + { + "epoch": 0.5032, + "grad_norm": 0.46229371886933873, + "learning_rate": 0.00010388713015885161, + "loss": 0.7079, + "step": 629 + }, + { + "epoch": 0.504, + "grad_norm": 0.4376660706476633, + "learning_rate": 0.00010362810597805526, + "loss": 0.7244, + "step": 630 + }, + { + "epoch": 0.5048, + "grad_norm": 0.4004980165850154, + "learning_rate": 0.00010336905742059742, + "loss": 0.7255, + "step": 631 + }, + { + "epoch": 0.5056, + "grad_norm": 0.5464876140864391, + "learning_rate": 0.0001031099862269837, + "loss": 0.7595, + "step": 632 + }, + { + "epoch": 0.5064, + "grad_norm": 0.45246774575417864, + "learning_rate": 0.0001028508941378719, + "loss": 0.7115, + "step": 633 + }, + { + "epoch": 0.5072, + "grad_norm": 0.42679202516536824, + "learning_rate": 0.00010259178289406011, + "loss": 0.7379, + "step": 634 + }, + { + "epoch": 0.508, + "grad_norm": 0.5662659497458966, + "learning_rate": 0.00010233265423647523, + "loss": 0.8379, + "step": 635 + }, + { + "epoch": 0.5088, + "grad_norm": 0.45926997631132005, + "learning_rate": 0.00010207350990616107, + "loss": 0.7837, + "step": 636 + }, + { + "epoch": 0.5096, + "grad_norm": 0.41818847338701415, + "learning_rate": 0.00010181435164426676, + "loss": 0.6759, + "step": 637 + }, + { + "epoch": 0.5104, + "grad_norm": 0.46659325022724846, + "learning_rate": 0.0001015551811920351, + "loss": 0.6766, + "step": 638 + }, + { + "epoch": 0.5112, + "grad_norm": 0.43217241869721623, + "learning_rate": 0.00010129600029079072, + "loss": 0.7169, + "step": 639 + }, + { + "epoch": 0.512, + "grad_norm": 0.5306198437580659, + "learning_rate": 0.00010103681068192845, + "loss": 0.739, + "step": 640 + }, + { + "epoch": 0.5128, + "grad_norm": 0.5882053404532518, + "learning_rate": 0.00010077761410690172, + "loss": 0.7938, + "step": 641 + }, + { + "epoch": 0.5136, + "grad_norm": 0.5712201539772703, + "learning_rate": 0.00010051841230721065, + "loss": 0.8035, + "step": 642 + }, + { + "epoch": 0.5144, + "grad_norm": 0.5543194754396824, + "learning_rate": 0.00010025920702439051, + "loss": 0.7287, + "step": 643 + }, + { + "epoch": 0.5152, + "grad_norm": 0.6361289588756656, + "learning_rate": 0.0001, + "loss": 0.6836, + "step": 644 + }, + { + "epoch": 0.516, + "grad_norm": 0.5003807411642396, + "learning_rate": 9.97407929756095e-05, + "loss": 0.8057, + "step": 645 + }, + { + "epoch": 0.5168, + "grad_norm": 0.4805444307900423, + "learning_rate": 9.948158769278939e-05, + "loss": 0.7293, + "step": 646 + }, + { + "epoch": 0.5176, + "grad_norm": 0.4459141956754468, + "learning_rate": 9.92223858930983e-05, + "loss": 0.7287, + "step": 647 + }, + { + "epoch": 0.5184, + "grad_norm": 0.4061230096741999, + "learning_rate": 9.896318931807155e-05, + "loss": 0.6775, + "step": 648 + }, + { + "epoch": 0.5192, + "grad_norm": 0.41600117805833925, + "learning_rate": 9.870399970920932e-05, + "loss": 0.6779, + "step": 649 + }, + { + "epoch": 0.52, + "grad_norm": 0.44343437229655974, + "learning_rate": 9.844481880796491e-05, + "loss": 0.7739, + "step": 650 + }, + { + "epoch": 0.5208, + "grad_norm": 0.42608066265593547, + "learning_rate": 9.818564835573323e-05, + "loss": 0.7031, + "step": 651 + }, + { + "epoch": 0.5216, + "grad_norm": 0.3714372008628129, + "learning_rate": 9.792649009383899e-05, + "loss": 0.6887, + "step": 652 + }, + { + "epoch": 0.5224, + "grad_norm": 0.43043174543791074, + "learning_rate": 9.766734576352478e-05, + "loss": 0.7342, + "step": 653 + }, + { + "epoch": 0.5232, + "grad_norm": 0.5226011484213909, + "learning_rate": 9.740821710593989e-05, + "loss": 0.7462, + "step": 654 + }, + { + "epoch": 0.524, + "grad_norm": 0.4662867219259712, + "learning_rate": 9.714910586212816e-05, + "loss": 0.7653, + "step": 655 + }, + { + "epoch": 0.5248, + "grad_norm": 0.4187437731003551, + "learning_rate": 9.689001377301633e-05, + "loss": 0.663, + "step": 656 + }, + { + "epoch": 0.5256, + "grad_norm": 0.5355323628621518, + "learning_rate": 9.663094257940258e-05, + "loss": 0.7373, + "step": 657 + }, + { + "epoch": 0.5264, + "grad_norm": 0.4737265018487043, + "learning_rate": 9.637189402194476e-05, + "loss": 0.7729, + "step": 658 + }, + { + "epoch": 0.5272, + "grad_norm": 0.4865560065340418, + "learning_rate": 9.611286984114841e-05, + "loss": 0.7206, + "step": 659 + }, + { + "epoch": 0.528, + "grad_norm": 0.5026776434756831, + "learning_rate": 9.585387177735547e-05, + "loss": 0.8127, + "step": 660 + }, + { + "epoch": 0.5288, + "grad_norm": 0.5283868792090227, + "learning_rate": 9.559490157073236e-05, + "loss": 0.7204, + "step": 661 + }, + { + "epoch": 0.5296, + "grad_norm": 0.3719591365008973, + "learning_rate": 9.533596096125825e-05, + "loss": 0.6594, + "step": 662 + }, + { + "epoch": 0.5304, + "grad_norm": 0.4619509085697814, + "learning_rate": 9.507705168871358e-05, + "loss": 0.7127, + "step": 663 + }, + { + "epoch": 0.5312, + "grad_norm": 0.40751957206512246, + "learning_rate": 9.481817549266817e-05, + "loss": 0.6669, + "step": 664 + }, + { + "epoch": 0.532, + "grad_norm": 0.5286405314124261, + "learning_rate": 9.455933411246958e-05, + "loss": 0.718, + "step": 665 + }, + { + "epoch": 0.5328, + "grad_norm": 0.468664846462301, + "learning_rate": 9.430052928723153e-05, + "loss": 0.7331, + "step": 666 + }, + { + "epoch": 0.5336, + "grad_norm": 0.45131184728020785, + "learning_rate": 9.404176275582208e-05, + "loss": 0.7606, + "step": 667 + }, + { + "epoch": 0.5344, + "grad_norm": 0.3829838559164893, + "learning_rate": 9.378303625685195e-05, + "loss": 0.6796, + "step": 668 + }, + { + "epoch": 0.5352, + "grad_norm": 0.5079153239489907, + "learning_rate": 9.352435152866298e-05, + "loss": 0.7025, + "step": 669 + }, + { + "epoch": 0.536, + "grad_norm": 0.5085941273721384, + "learning_rate": 9.326571030931637e-05, + "loss": 0.9081, + "step": 670 + }, + { + "epoch": 0.5368, + "grad_norm": 0.5449010666211154, + "learning_rate": 9.300711433658087e-05, + "loss": 0.8133, + "step": 671 + }, + { + "epoch": 0.5376, + "grad_norm": 0.43887401346898125, + "learning_rate": 9.274856534792138e-05, + "loss": 0.761, + "step": 672 + }, + { + "epoch": 0.5384, + "grad_norm": 0.45721165998038454, + "learning_rate": 9.249006508048694e-05, + "loss": 0.6743, + "step": 673 + }, + { + "epoch": 0.5392, + "grad_norm": 0.3955203395625884, + "learning_rate": 9.223161527109937e-05, + "loss": 0.6766, + "step": 674 + }, + { + "epoch": 0.54, + "grad_norm": 0.38969950727053226, + "learning_rate": 9.197321765624152e-05, + "loss": 0.6313, + "step": 675 + }, + { + "epoch": 0.5408, + "grad_norm": 0.47811786297910214, + "learning_rate": 9.171487397204539e-05, + "loss": 0.7201, + "step": 676 + }, + { + "epoch": 0.5416, + "grad_norm": 0.4412149235369839, + "learning_rate": 9.145658595428074e-05, + "loss": 0.6747, + "step": 677 + }, + { + "epoch": 0.5424, + "grad_norm": 0.42196332818323135, + "learning_rate": 9.119835533834331e-05, + "loss": 0.742, + "step": 678 + }, + { + "epoch": 0.5432, + "grad_norm": 0.4458398334329028, + "learning_rate": 9.09401838592431e-05, + "loss": 0.6956, + "step": 679 + }, + { + "epoch": 0.544, + "grad_norm": 0.5318092860659093, + "learning_rate": 9.068207325159284e-05, + "loss": 0.7957, + "step": 680 + }, + { + "epoch": 0.5448, + "grad_norm": 0.4930020597088423, + "learning_rate": 9.04240252495963e-05, + "loss": 0.7279, + "step": 681 + }, + { + "epoch": 0.5456, + "grad_norm": 0.5017236219515888, + "learning_rate": 9.016604158703654e-05, + "loss": 0.7046, + "step": 682 + }, + { + "epoch": 0.5464, + "grad_norm": 0.4630320439388713, + "learning_rate": 8.990812399726435e-05, + "loss": 0.684, + "step": 683 + }, + { + "epoch": 0.5472, + "grad_norm": 0.4227403678850825, + "learning_rate": 8.965027421318665e-05, + "loss": 0.7033, + "step": 684 + }, + { + "epoch": 0.548, + "grad_norm": 0.4829551597107614, + "learning_rate": 8.939249396725467e-05, + "loss": 0.7424, + "step": 685 + }, + { + "epoch": 0.5488, + "grad_norm": 0.46945474281683686, + "learning_rate": 8.913478499145254e-05, + "loss": 0.8187, + "step": 686 + }, + { + "epoch": 0.5496, + "grad_norm": 0.5496894171200998, + "learning_rate": 8.887714901728551e-05, + "loss": 0.7374, + "step": 687 + }, + { + "epoch": 0.5504, + "grad_norm": 0.4663964552750982, + "learning_rate": 8.861958777576827e-05, + "loss": 0.8222, + "step": 688 + }, + { + "epoch": 0.5512, + "grad_norm": 0.4019324549889185, + "learning_rate": 8.836210299741346e-05, + "loss": 0.6636, + "step": 689 + }, + { + "epoch": 0.552, + "grad_norm": 0.4227083940417562, + "learning_rate": 8.810469641222001e-05, + "loss": 0.7451, + "step": 690 + }, + { + "epoch": 0.5528, + "grad_norm": 0.464047932365196, + "learning_rate": 8.784736974966135e-05, + "loss": 0.812, + "step": 691 + }, + { + "epoch": 0.5536, + "grad_norm": 0.49827929688770517, + "learning_rate": 8.759012473867407e-05, + "loss": 0.7623, + "step": 692 + }, + { + "epoch": 0.5544, + "grad_norm": 0.4494487600877898, + "learning_rate": 8.733296310764611e-05, + "loss": 0.7798, + "step": 693 + }, + { + "epoch": 0.5552, + "grad_norm": 0.39968387273888206, + "learning_rate": 8.707588658440511e-05, + "loss": 0.6776, + "step": 694 + }, + { + "epoch": 0.556, + "grad_norm": 0.4262359246978625, + "learning_rate": 8.6818896896207e-05, + "loss": 0.7656, + "step": 695 + }, + { + "epoch": 0.5568, + "grad_norm": 0.4307000807129045, + "learning_rate": 8.656199576972423e-05, + "loss": 0.7657, + "step": 696 + }, + { + "epoch": 0.5576, + "grad_norm": 0.36881199515602503, + "learning_rate": 8.63051849310342e-05, + "loss": 0.7187, + "step": 697 + }, + { + "epoch": 0.5584, + "grad_norm": 0.44543571007999266, + "learning_rate": 8.604846610560771e-05, + "loss": 0.7846, + "step": 698 + }, + { + "epoch": 0.5592, + "grad_norm": 0.41884765278422886, + "learning_rate": 8.579184101829734e-05, + "loss": 0.7264, + "step": 699 + }, + { + "epoch": 0.56, + "grad_norm": 0.4501268207049666, + "learning_rate": 8.553531139332582e-05, + "loss": 0.6708, + "step": 700 + }, + { + "epoch": 0.5608, + "grad_norm": 0.44298997651470345, + "learning_rate": 8.527887895427454e-05, + "loss": 0.7435, + "step": 701 + }, + { + "epoch": 0.5616, + "grad_norm": 0.4490464030296316, + "learning_rate": 8.502254542407186e-05, + "loss": 0.6915, + "step": 702 + }, + { + "epoch": 0.5624, + "grad_norm": 0.4096812132035904, + "learning_rate": 8.476631252498162e-05, + "loss": 0.68, + "step": 703 + }, + { + "epoch": 0.5632, + "grad_norm": 0.44334191450228255, + "learning_rate": 8.451018197859153e-05, + "loss": 0.7234, + "step": 704 + }, + { + "epoch": 0.564, + "grad_norm": 0.4065765469269847, + "learning_rate": 8.425415550580162e-05, + "loss": 0.6803, + "step": 705 + }, + { + "epoch": 0.5648, + "grad_norm": 0.6014789132852144, + "learning_rate": 8.399823482681262e-05, + "loss": 0.6985, + "step": 706 + }, + { + "epoch": 0.5656, + "grad_norm": 0.5409184779201434, + "learning_rate": 8.374242166111448e-05, + "loss": 0.7104, + "step": 707 + }, + { + "epoch": 0.5664, + "grad_norm": 0.4708620813419675, + "learning_rate": 8.348671772747487e-05, + "loss": 0.6942, + "step": 708 + }, + { + "epoch": 0.5672, + "grad_norm": 0.3991163373887498, + "learning_rate": 8.323112474392731e-05, + "loss": 0.6586, + "step": 709 + }, + { + "epoch": 0.568, + "grad_norm": 0.37935939327845475, + "learning_rate": 8.297564442776014e-05, + "loss": 0.6761, + "step": 710 + }, + { + "epoch": 0.5688, + "grad_norm": 0.399493088326328, + "learning_rate": 8.272027849550457e-05, + "loss": 0.7024, + "step": 711 + }, + { + "epoch": 0.5696, + "grad_norm": 0.4132077291438483, + "learning_rate": 8.246502866292324e-05, + "loss": 0.6957, + "step": 712 + }, + { + "epoch": 0.5704, + "grad_norm": 0.8187355975815758, + "learning_rate": 8.220989664499878e-05, + "loss": 0.7509, + "step": 713 + }, + { + "epoch": 0.5712, + "grad_norm": 0.40872121952512946, + "learning_rate": 8.195488415592238e-05, + "loss": 0.6791, + "step": 714 + }, + { + "epoch": 0.572, + "grad_norm": 0.4572049418635426, + "learning_rate": 8.169999290908188e-05, + "loss": 0.7393, + "step": 715 + }, + { + "epoch": 0.5728, + "grad_norm": 0.5404394783118169, + "learning_rate": 8.144522461705067e-05, + "loss": 0.7402, + "step": 716 + }, + { + "epoch": 0.5736, + "grad_norm": 0.4291348543194726, + "learning_rate": 8.119058099157604e-05, + "loss": 0.6862, + "step": 717 + }, + { + "epoch": 0.5744, + "grad_norm": 0.48511918081626154, + "learning_rate": 8.093606374356759e-05, + "loss": 0.7775, + "step": 718 + }, + { + "epoch": 0.5752, + "grad_norm": 0.5010967847627927, + "learning_rate": 8.068167458308582e-05, + "loss": 0.7399, + "step": 719 + }, + { + "epoch": 0.576, + "grad_norm": 0.4238614997566764, + "learning_rate": 8.042741521933071e-05, + "loss": 0.7594, + "step": 720 + }, + { + "epoch": 0.5768, + "grad_norm": 0.4129108067302885, + "learning_rate": 8.017328736063006e-05, + "loss": 0.6705, + "step": 721 + }, + { + "epoch": 0.5776, + "grad_norm": 0.3801850715707057, + "learning_rate": 7.991929271442817e-05, + "loss": 0.6847, + "step": 722 + }, + { + "epoch": 0.5784, + "grad_norm": 0.3850503416440505, + "learning_rate": 7.966543298727425e-05, + "loss": 0.6425, + "step": 723 + }, + { + "epoch": 0.5792, + "grad_norm": 0.4322390793478073, + "learning_rate": 7.941170988481108e-05, + "loss": 0.7555, + "step": 724 + }, + { + "epoch": 0.58, + "grad_norm": 0.40808245632812906, + "learning_rate": 7.915812511176347e-05, + "loss": 0.7237, + "step": 725 + }, + { + "epoch": 0.5808, + "grad_norm": 0.43661810536641515, + "learning_rate": 7.89046803719267e-05, + "loss": 0.7242, + "step": 726 + }, + { + "epoch": 0.5816, + "grad_norm": 0.4511391167280754, + "learning_rate": 7.865137736815535e-05, + "loss": 0.6379, + "step": 727 + }, + { + "epoch": 0.5824, + "grad_norm": 0.44596464124334795, + "learning_rate": 7.839821780235168e-05, + "loss": 0.6934, + "step": 728 + }, + { + "epoch": 0.5832, + "grad_norm": 0.4407154181297479, + "learning_rate": 7.814520337545406e-05, + "loss": 0.7147, + "step": 729 + }, + { + "epoch": 0.584, + "grad_norm": 0.45653935500776016, + "learning_rate": 7.789233578742582e-05, + "loss": 0.7739, + "step": 730 + }, + { + "epoch": 0.5848, + "grad_norm": 0.40727134095533485, + "learning_rate": 7.763961673724379e-05, + "loss": 0.6808, + "step": 731 + }, + { + "epoch": 0.5856, + "grad_norm": 0.43313671686290234, + "learning_rate": 7.738704792288655e-05, + "loss": 0.6762, + "step": 732 + }, + { + "epoch": 0.5864, + "grad_norm": 0.5747822773258235, + "learning_rate": 7.713463104132345e-05, + "loss": 0.7176, + "step": 733 + }, + { + "epoch": 0.5872, + "grad_norm": 0.49156474674882356, + "learning_rate": 7.688236778850306e-05, + "loss": 0.7738, + "step": 734 + }, + { + "epoch": 0.588, + "grad_norm": 0.4825352538155201, + "learning_rate": 7.663025985934158e-05, + "loss": 0.7841, + "step": 735 + }, + { + "epoch": 0.5888, + "grad_norm": 0.3891821855316537, + "learning_rate": 7.637830894771175e-05, + "loss": 0.6461, + "step": 736 + }, + { + "epoch": 0.5896, + "grad_norm": 0.5184218795544006, + "learning_rate": 7.61265167464313e-05, + "loss": 0.7965, + "step": 737 + }, + { + "epoch": 0.5904, + "grad_norm": 0.4545607963945075, + "learning_rate": 7.587488494725157e-05, + "loss": 0.6857, + "step": 738 + }, + { + "epoch": 0.5912, + "grad_norm": 0.399987496419545, + "learning_rate": 7.562341524084623e-05, + "loss": 0.6916, + "step": 739 + }, + { + "epoch": 0.592, + "grad_norm": 0.43975174551870333, + "learning_rate": 7.537210931679987e-05, + "loss": 0.7624, + "step": 740 + }, + { + "epoch": 0.5928, + "grad_norm": 0.43333418303998245, + "learning_rate": 7.512096886359664e-05, + "loss": 0.7543, + "step": 741 + }, + { + "epoch": 0.5936, + "grad_norm": 0.4047273320690393, + "learning_rate": 7.48699955686089e-05, + "loss": 0.7068, + "step": 742 + }, + { + "epoch": 0.5944, + "grad_norm": 0.4416971341780052, + "learning_rate": 7.461919111808595e-05, + "loss": 0.6352, + "step": 743 + }, + { + "epoch": 0.5952, + "grad_norm": 0.4464883470061259, + "learning_rate": 7.43685571971426e-05, + "loss": 0.6834, + "step": 744 + }, + { + "epoch": 0.596, + "grad_norm": 0.4425251211091578, + "learning_rate": 7.411809548974792e-05, + "loss": 0.7237, + "step": 745 + }, + { + "epoch": 0.5968, + "grad_norm": 0.4339067833924351, + "learning_rate": 7.386780767871397e-05, + "loss": 0.6823, + "step": 746 + }, + { + "epoch": 0.5976, + "grad_norm": 0.37628247079657295, + "learning_rate": 7.361769544568425e-05, + "loss": 0.678, + "step": 747 + }, + { + "epoch": 0.5984, + "grad_norm": 0.504222130084519, + "learning_rate": 7.336776047112276e-05, + "loss": 0.7172, + "step": 748 + }, + { + "epoch": 0.5992, + "grad_norm": 0.4923074669303458, + "learning_rate": 7.311800443430251e-05, + "loss": 0.8012, + "step": 749 + }, + { + "epoch": 0.6, + "grad_norm": 0.552459322864626, + "learning_rate": 7.286842901329412e-05, + "loss": 0.7479, + "step": 750 + }, + { + "epoch": 0.6008, + "grad_norm": 0.45851662219750045, + "learning_rate": 7.26190358849548e-05, + "loss": 0.663, + "step": 751 + }, + { + "epoch": 0.6016, + "grad_norm": 0.39770811850719884, + "learning_rate": 7.236982672491698e-05, + "loss": 0.66, + "step": 752 + }, + { + "epoch": 0.6024, + "grad_norm": 0.5387005644882501, + "learning_rate": 7.212080320757695e-05, + "loss": 0.76, + "step": 753 + }, + { + "epoch": 0.6032, + "grad_norm": 0.4120007240669047, + "learning_rate": 7.187196700608373e-05, + "loss": 0.7223, + "step": 754 + }, + { + "epoch": 0.604, + "grad_norm": 0.4529312181035841, + "learning_rate": 7.162331979232783e-05, + "loss": 0.7491, + "step": 755 + }, + { + "epoch": 0.6048, + "grad_norm": 0.3880839627189889, + "learning_rate": 7.137486323692995e-05, + "loss": 0.6217, + "step": 756 + }, + { + "epoch": 0.6056, + "grad_norm": 0.3613010136847369, + "learning_rate": 7.112659900922976e-05, + "loss": 0.6757, + "step": 757 + }, + { + "epoch": 0.6064, + "grad_norm": 0.47210927255794144, + "learning_rate": 7.087852877727481e-05, + "loss": 0.7446, + "step": 758 + }, + { + "epoch": 0.6072, + "grad_norm": 0.3725482277294428, + "learning_rate": 7.06306542078091e-05, + "loss": 0.6791, + "step": 759 + }, + { + "epoch": 0.608, + "grad_norm": 0.4681099311515542, + "learning_rate": 7.038297696626206e-05, + "loss": 0.7608, + "step": 760 + }, + { + "epoch": 0.6088, + "grad_norm": 0.37805816611073806, + "learning_rate": 7.013549871673736e-05, + "loss": 0.6693, + "step": 761 + }, + { + "epoch": 0.6096, + "grad_norm": 0.49364379425671195, + "learning_rate": 6.988822112200156e-05, + "loss": 0.687, + "step": 762 + }, + { + "epoch": 0.6104, + "grad_norm": 0.4757654358296188, + "learning_rate": 6.964114584347316e-05, + "loss": 0.654, + "step": 763 + }, + { + "epoch": 0.6112, + "grad_norm": 0.41975469935854065, + "learning_rate": 6.939427454121128e-05, + "loss": 0.6817, + "step": 764 + }, + { + "epoch": 0.612, + "grad_norm": 0.4948237468744432, + "learning_rate": 6.914760887390452e-05, + "loss": 0.7695, + "step": 765 + }, + { + "epoch": 0.6128, + "grad_norm": 0.43692541122160833, + "learning_rate": 6.890115049885994e-05, + "loss": 0.6984, + "step": 766 + }, + { + "epoch": 0.6136, + "grad_norm": 0.48637179718674667, + "learning_rate": 6.865490107199181e-05, + "loss": 0.6883, + "step": 767 + }, + { + "epoch": 0.6144, + "grad_norm": 0.4595795984986259, + "learning_rate": 6.84088622478104e-05, + "loss": 0.7694, + "step": 768 + }, + { + "epoch": 0.6152, + "grad_norm": 0.4577933808320737, + "learning_rate": 6.816303567941112e-05, + "loss": 0.7038, + "step": 769 + }, + { + "epoch": 0.616, + "grad_norm": 0.476120584644686, + "learning_rate": 6.791742301846326e-05, + "loss": 0.7576, + "step": 770 + }, + { + "epoch": 0.6168, + "grad_norm": 0.5645954033128227, + "learning_rate": 6.767202591519875e-05, + "loss": 0.7832, + "step": 771 + }, + { + "epoch": 0.6176, + "grad_norm": 0.38318261143506066, + "learning_rate": 6.742684601840141e-05, + "loss": 0.6781, + "step": 772 + }, + { + "epoch": 0.6184, + "grad_norm": 0.46016469252035364, + "learning_rate": 6.718188497539554e-05, + "loss": 0.6898, + "step": 773 + }, + { + "epoch": 0.6192, + "grad_norm": 0.4766016729359985, + "learning_rate": 6.693714443203507e-05, + "loss": 0.6492, + "step": 774 + }, + { + "epoch": 0.62, + "grad_norm": 0.493164015223603, + "learning_rate": 6.669262603269246e-05, + "loss": 0.7555, + "step": 775 + }, + { + "epoch": 0.6208, + "grad_norm": 0.48558809975651357, + "learning_rate": 6.644833142024751e-05, + "loss": 0.7621, + "step": 776 + }, + { + "epoch": 0.6216, + "grad_norm": 0.41546306411218065, + "learning_rate": 6.620426223607654e-05, + "loss": 0.684, + "step": 777 + }, + { + "epoch": 0.6224, + "grad_norm": 0.41016241103210943, + "learning_rate": 6.59604201200412e-05, + "loss": 0.7196, + "step": 778 + }, + { + "epoch": 0.6232, + "grad_norm": 0.47752129112664465, + "learning_rate": 6.571680671047749e-05, + "loss": 0.7473, + "step": 779 + }, + { + "epoch": 0.624, + "grad_norm": 0.4856038759001054, + "learning_rate": 6.547342364418481e-05, + "loss": 0.7563, + "step": 780 + }, + { + "epoch": 0.6248, + "grad_norm": 0.5021371546458332, + "learning_rate": 6.523027255641493e-05, + "loss": 0.7234, + "step": 781 + }, + { + "epoch": 0.6256, + "grad_norm": 0.40910017528170856, + "learning_rate": 6.498735508086093e-05, + "loss": 0.6213, + "step": 782 + }, + { + "epoch": 0.6264, + "grad_norm": 0.4263522002359121, + "learning_rate": 6.474467284964634e-05, + "loss": 0.7287, + "step": 783 + }, + { + "epoch": 0.6272, + "grad_norm": 0.5031930708975815, + "learning_rate": 6.450222749331414e-05, + "loss": 0.7488, + "step": 784 + }, + { + "epoch": 0.628, + "grad_norm": 0.4145521757788483, + "learning_rate": 6.426002064081565e-05, + "loss": 0.7347, + "step": 785 + }, + { + "epoch": 0.6288, + "grad_norm": 0.41344524756191847, + "learning_rate": 6.40180539194999e-05, + "loss": 0.7244, + "step": 786 + }, + { + "epoch": 0.6296, + "grad_norm": 0.49117563386202884, + "learning_rate": 6.377632895510248e-05, + "loss": 0.6522, + "step": 787 + }, + { + "epoch": 0.6304, + "grad_norm": 0.4459330410255157, + "learning_rate": 6.35348473717345e-05, + "loss": 0.6059, + "step": 788 + }, + { + "epoch": 0.6312, + "grad_norm": 0.44548999592648314, + "learning_rate": 6.329361079187199e-05, + "loss": 0.6308, + "step": 789 + }, + { + "epoch": 0.632, + "grad_norm": 0.4486957922043369, + "learning_rate": 6.305262083634488e-05, + "loss": 0.6552, + "step": 790 + }, + { + "epoch": 0.6328, + "grad_norm": 0.4025693300248295, + "learning_rate": 6.281187912432587e-05, + "loss": 0.6665, + "step": 791 + }, + { + "epoch": 0.6336, + "grad_norm": 0.4006004171665509, + "learning_rate": 6.25713872733199e-05, + "loss": 0.6983, + "step": 792 + }, + { + "epoch": 0.6344, + "grad_norm": 0.4082360091469701, + "learning_rate": 6.233114689915316e-05, + "loss": 0.7635, + "step": 793 + }, + { + "epoch": 0.6352, + "grad_norm": 0.509426336326775, + "learning_rate": 6.209115961596208e-05, + "loss": 0.7777, + "step": 794 + }, + { + "epoch": 0.636, + "grad_norm": 0.5219702631152514, + "learning_rate": 6.18514270361827e-05, + "loss": 0.7026, + "step": 795 + }, + { + "epoch": 0.6368, + "grad_norm": 0.409433339007301, + "learning_rate": 6.161195077053976e-05, + "loss": 0.6858, + "step": 796 + }, + { + "epoch": 0.6376, + "grad_norm": 0.5111445742851393, + "learning_rate": 6.13727324280358e-05, + "loss": 0.7345, + "step": 797 + }, + { + "epoch": 0.6384, + "grad_norm": 0.4278440622908864, + "learning_rate": 6.113377361594049e-05, + "loss": 0.5927, + "step": 798 + }, + { + "epoch": 0.6392, + "grad_norm": 0.40575034713415187, + "learning_rate": 6.08950759397797e-05, + "loss": 0.702, + "step": 799 + }, + { + "epoch": 0.64, + "grad_norm": 0.47102491851325967, + "learning_rate": 6.065664100332478e-05, + "loss": 0.6909, + "step": 800 + }, + { + "epoch": 0.6408, + "grad_norm": 0.43463529412558344, + "learning_rate": 6.0418470408581774e-05, + "loss": 0.7479, + "step": 801 + }, + { + "epoch": 0.6416, + "grad_norm": 0.514778899493937, + "learning_rate": 6.018056575578075e-05, + "loss": 0.7814, + "step": 802 + }, + { + "epoch": 0.6424, + "grad_norm": 0.3801411112782445, + "learning_rate": 5.9942928643364724e-05, + "loss": 0.7259, + "step": 803 + }, + { + "epoch": 0.6432, + "grad_norm": 0.4355576096171972, + "learning_rate": 5.970556066797941e-05, + "loss": 0.6514, + "step": 804 + }, + { + "epoch": 0.644, + "grad_norm": 0.4325118593920335, + "learning_rate": 5.946846342446214e-05, + "loss": 0.6931, + "step": 805 + }, + { + "epoch": 0.6448, + "grad_norm": 0.4258437731663501, + "learning_rate": 5.923163850583113e-05, + "loss": 0.7035, + "step": 806 + }, + { + "epoch": 0.6456, + "grad_norm": 0.4291018923022965, + "learning_rate": 5.899508750327501e-05, + "loss": 0.6857, + "step": 807 + }, + { + "epoch": 0.6464, + "grad_norm": 0.4819472695768693, + "learning_rate": 5.875881200614207e-05, + "loss": 0.7583, + "step": 808 + }, + { + "epoch": 0.6472, + "grad_norm": 0.5463902606663359, + "learning_rate": 5.8522813601929324e-05, + "loss": 0.6832, + "step": 809 + }, + { + "epoch": 0.648, + "grad_norm": 0.3861307644596196, + "learning_rate": 5.828709387627218e-05, + "loss": 0.6511, + "step": 810 + }, + { + "epoch": 0.6488, + "grad_norm": 0.41580726874585294, + "learning_rate": 5.80516544129337e-05, + "loss": 0.7147, + "step": 811 + }, + { + "epoch": 0.6496, + "grad_norm": 0.44964197208891377, + "learning_rate": 5.781649679379378e-05, + "loss": 0.7005, + "step": 812 + }, + { + "epoch": 0.6504, + "grad_norm": 0.4021668915311515, + "learning_rate": 5.758162259883867e-05, + "loss": 0.6892, + "step": 813 + }, + { + "epoch": 0.6512, + "grad_norm": 0.39523796525402727, + "learning_rate": 5.73470334061505e-05, + "loss": 0.6929, + "step": 814 + }, + { + "epoch": 0.652, + "grad_norm": 0.38565520201311354, + "learning_rate": 5.7112730791896207e-05, + "loss": 0.6682, + "step": 815 + }, + { + "epoch": 0.6528, + "grad_norm": 0.3995348928161467, + "learning_rate": 5.687871633031754e-05, + "loss": 0.6982, + "step": 816 + }, + { + "epoch": 0.6536, + "grad_norm": 0.4703906816473926, + "learning_rate": 5.664499159372017e-05, + "loss": 0.6853, + "step": 817 + }, + { + "epoch": 0.6544, + "grad_norm": 0.5033547096332919, + "learning_rate": 5.6411558152462894e-05, + "loss": 0.7224, + "step": 818 + }, + { + "epoch": 0.6552, + "grad_norm": 0.38181669979362143, + "learning_rate": 5.617841757494762e-05, + "loss": 0.6383, + "step": 819 + }, + { + "epoch": 0.656, + "grad_norm": 0.48820392994934503, + "learning_rate": 5.5945571427608526e-05, + "loss": 0.7348, + "step": 820 + }, + { + "epoch": 0.6568, + "grad_norm": 0.3970352873792021, + "learning_rate": 5.5713021274901335e-05, + "loss": 0.6915, + "step": 821 + }, + { + "epoch": 0.6576, + "grad_norm": 0.4282462226980526, + "learning_rate": 5.54807686792933e-05, + "loss": 0.7262, + "step": 822 + }, + { + "epoch": 0.6584, + "grad_norm": 0.4131366154190306, + "learning_rate": 5.524881520125229e-05, + "loss": 0.6542, + "step": 823 + }, + { + "epoch": 0.6592, + "grad_norm": 0.426275032913804, + "learning_rate": 5.501716239923642e-05, + "loss": 0.6986, + "step": 824 + }, + { + "epoch": 0.66, + "grad_norm": 0.4385390832068032, + "learning_rate": 5.4785811829683764e-05, + "loss": 0.6932, + "step": 825 + }, + { + "epoch": 0.6608, + "grad_norm": 0.4337495818622833, + "learning_rate": 5.4554765047001613e-05, + "loss": 0.7575, + "step": 826 + }, + { + "epoch": 0.6616, + "grad_norm": 0.466350734858001, + "learning_rate": 5.432402360355615e-05, + "loss": 0.7019, + "step": 827 + }, + { + "epoch": 0.6624, + "grad_norm": 0.4368116396485175, + "learning_rate": 5.4093589049662175e-05, + "loss": 0.6942, + "step": 828 + }, + { + "epoch": 0.6632, + "grad_norm": 0.4126598144026732, + "learning_rate": 5.386346293357242e-05, + "loss": 0.7673, + "step": 829 + }, + { + "epoch": 0.664, + "grad_norm": 0.6288184741220477, + "learning_rate": 5.363364680146725e-05, + "loss": 0.6909, + "step": 830 + }, + { + "epoch": 0.6648, + "grad_norm": 0.45048610759488894, + "learning_rate": 5.3404142197444506e-05, + "loss": 0.655, + "step": 831 + }, + { + "epoch": 0.6656, + "grad_norm": 0.4594592620204484, + "learning_rate": 5.31749506635086e-05, + "loss": 0.7328, + "step": 832 + }, + { + "epoch": 0.6664, + "grad_norm": 0.4338626551219975, + "learning_rate": 5.2946073739560706e-05, + "loss": 0.6485, + "step": 833 + }, + { + "epoch": 0.6672, + "grad_norm": 0.37899094255348686, + "learning_rate": 5.271751296338823e-05, + "loss": 0.6566, + "step": 834 + }, + { + "epoch": 0.668, + "grad_norm": 0.437215121696517, + "learning_rate": 5.248926987065417e-05, + "loss": 0.6853, + "step": 835 + }, + { + "epoch": 0.6688, + "grad_norm": 0.3880565714552592, + "learning_rate": 5.226134599488728e-05, + "loss": 0.6799, + "step": 836 + }, + { + "epoch": 0.6696, + "grad_norm": 0.45764446660770025, + "learning_rate": 5.203374286747158e-05, + "loss": 0.6892, + "step": 837 + }, + { + "epoch": 0.6704, + "grad_norm": 0.4483748667859573, + "learning_rate": 5.180646201763577e-05, + "loss": 0.772, + "step": 838 + }, + { + "epoch": 0.6712, + "grad_norm": 0.42973735024858706, + "learning_rate": 5.15795049724435e-05, + "loss": 0.6798, + "step": 839 + }, + { + "epoch": 0.672, + "grad_norm": 0.40383156607789245, + "learning_rate": 5.135287325678271e-05, + "loss": 0.6659, + "step": 840 + }, + { + "epoch": 0.6728, + "grad_norm": 0.8482268686810346, + "learning_rate": 5.112656839335543e-05, + "loss": 0.7819, + "step": 841 + }, + { + "epoch": 0.6736, + "grad_norm": 0.41571178141097415, + "learning_rate": 5.090059190266779e-05, + "loss": 0.7018, + "step": 842 + }, + { + "epoch": 0.6744, + "grad_norm": 0.44792133357155567, + "learning_rate": 5.0674945303019526e-05, + "loss": 0.6571, + "step": 843 + }, + { + "epoch": 0.6752, + "grad_norm": 0.42746194302730683, + "learning_rate": 5.0449630110493836e-05, + "loss": 0.6657, + "step": 844 + }, + { + "epoch": 0.676, + "grad_norm": 0.5100274591863196, + "learning_rate": 5.022464783894744e-05, + "loss": 0.725, + "step": 845 + }, + { + "epoch": 0.6768, + "grad_norm": 0.4676549283394495, + "learning_rate": 5.000000000000002e-05, + "loss": 0.7094, + "step": 846 + }, + { + "epoch": 0.6776, + "grad_norm": 0.4531270516170705, + "learning_rate": 4.977568810302432e-05, + "loss": 0.7445, + "step": 847 + }, + { + "epoch": 0.6784, + "grad_norm": 0.4702392246252879, + "learning_rate": 4.955171365513603e-05, + "loss": 0.6441, + "step": 848 + }, + { + "epoch": 0.6792, + "grad_norm": 0.46607399968829655, + "learning_rate": 4.9328078161183464e-05, + "loss": 0.7198, + "step": 849 + }, + { + "epoch": 0.68, + "grad_norm": 0.8329031061656196, + "learning_rate": 4.9104783123737566e-05, + "loss": 0.7032, + "step": 850 + }, + { + "epoch": 0.6808, + "grad_norm": 0.44674766634301843, + "learning_rate": 4.88818300430819e-05, + "loss": 0.7361, + "step": 851 + }, + { + "epoch": 0.6816, + "grad_norm": 0.3910474905924863, + "learning_rate": 4.865922041720239e-05, + "loss": 0.6427, + "step": 852 + }, + { + "epoch": 0.6824, + "grad_norm": 0.48318530717608854, + "learning_rate": 4.843695574177737e-05, + "loss": 0.7561, + "step": 853 + }, + { + "epoch": 0.6832, + "grad_norm": 0.5740067638768978, + "learning_rate": 4.821503751016746e-05, + "loss": 0.6773, + "step": 854 + }, + { + "epoch": 0.684, + "grad_norm": 0.46306313597699666, + "learning_rate": 4.7993467213405706e-05, + "loss": 0.682, + "step": 855 + }, + { + "epoch": 0.6848, + "grad_norm": 0.4921344276105706, + "learning_rate": 4.777224634018732e-05, + "loss": 0.6215, + "step": 856 + }, + { + "epoch": 0.6856, + "grad_norm": 0.4480719423734006, + "learning_rate": 4.755137637685979e-05, + "loss": 0.7183, + "step": 857 + }, + { + "epoch": 0.6864, + "grad_norm": 0.4333055760407282, + "learning_rate": 4.733085880741301e-05, + "loss": 0.6872, + "step": 858 + }, + { + "epoch": 0.6872, + "grad_norm": 0.4503796959024257, + "learning_rate": 4.7110695113469085e-05, + "loss": 0.7445, + "step": 859 + }, + { + "epoch": 0.688, + "grad_norm": 0.5498040941736474, + "learning_rate": 4.689088677427249e-05, + "loss": 0.7201, + "step": 860 + }, + { + "epoch": 0.6888, + "grad_norm": 0.4647843613174905, + "learning_rate": 4.6671435266680216e-05, + "loss": 0.7024, + "step": 861 + }, + { + "epoch": 0.6896, + "grad_norm": 0.45936528866579146, + "learning_rate": 4.645234206515171e-05, + "loss": 0.6426, + "step": 862 + }, + { + "epoch": 0.6904, + "grad_norm": 0.6008042177834249, + "learning_rate": 4.623360864173893e-05, + "loss": 0.6195, + "step": 863 + }, + { + "epoch": 0.6912, + "grad_norm": 0.40339065827934534, + "learning_rate": 4.6015236466076747e-05, + "loss": 0.642, + "step": 864 + }, + { + "epoch": 0.692, + "grad_norm": 0.3889583369421051, + "learning_rate": 4.579722700537268e-05, + "loss": 0.6752, + "step": 865 + }, + { + "epoch": 0.6928, + "grad_norm": 0.42592145162890077, + "learning_rate": 4.5579581724397255e-05, + "loss": 0.6587, + "step": 866 + }, + { + "epoch": 0.6936, + "grad_norm": 0.4644492058526666, + "learning_rate": 4.5362302085474254e-05, + "loss": 0.6938, + "step": 867 + }, + { + "epoch": 0.6944, + "grad_norm": 0.3719718621862749, + "learning_rate": 4.514538954847064e-05, + "loss": 0.6299, + "step": 868 + }, + { + "epoch": 0.6952, + "grad_norm": 0.4915898843370229, + "learning_rate": 4.492884557078688e-05, + "loss": 0.6359, + "step": 869 + }, + { + "epoch": 0.696, + "grad_norm": 0.44205598693120685, + "learning_rate": 4.471267160734731e-05, + "loss": 0.7051, + "step": 870 + }, + { + "epoch": 0.6968, + "grad_norm": 0.4768946429172093, + "learning_rate": 4.449686911058992e-05, + "loss": 0.7462, + "step": 871 + }, + { + "epoch": 0.6976, + "grad_norm": 0.4654488939460009, + "learning_rate": 4.428143953045717e-05, + "loss": 0.6449, + "step": 872 + }, + { + "epoch": 0.6984, + "grad_norm": 0.4081539910008458, + "learning_rate": 4.406638431438576e-05, + "loss": 0.6738, + "step": 873 + }, + { + "epoch": 0.6992, + "grad_norm": 0.4803114921037021, + "learning_rate": 4.385170490729712e-05, + "loss": 0.7085, + "step": 874 + }, + { + "epoch": 0.7, + "grad_norm": 0.43690457017312406, + "learning_rate": 4.36374027515878e-05, + "loss": 0.6948, + "step": 875 + }, + { + "epoch": 0.7008, + "grad_norm": 0.5079444611869034, + "learning_rate": 4.342347928711953e-05, + "loss": 0.6334, + "step": 876 + }, + { + "epoch": 0.7016, + "grad_norm": 0.37947204367856424, + "learning_rate": 4.320993595120969e-05, + "loss": 0.6523, + "step": 877 + }, + { + "epoch": 0.7024, + "grad_norm": 0.40617936016923767, + "learning_rate": 4.2996774178621736e-05, + "loss": 0.6936, + "step": 878 + }, + { + "epoch": 0.7032, + "grad_norm": 0.41296487666566845, + "learning_rate": 4.278399540155536e-05, + "loss": 0.6852, + "step": 879 + }, + { + "epoch": 0.704, + "grad_norm": 0.3539471291866326, + "learning_rate": 4.257160104963696e-05, + "loss": 0.6446, + "step": 880 + }, + { + "epoch": 0.7048, + "grad_norm": 0.4560156604490935, + "learning_rate": 4.2359592549910145e-05, + "loss": 0.7012, + "step": 881 + }, + { + "epoch": 0.7056, + "grad_norm": 0.41704253075285647, + "learning_rate": 4.2147971326825966e-05, + "loss": 0.6852, + "step": 882 + }, + { + "epoch": 0.7064, + "grad_norm": 0.38883821624010945, + "learning_rate": 4.193673880223339e-05, + "loss": 0.6869, + "step": 883 + }, + { + "epoch": 0.7072, + "grad_norm": 0.39386521161668075, + "learning_rate": 4.172589639536991e-05, + "loss": 0.6144, + "step": 884 + }, + { + "epoch": 0.708, + "grad_norm": 0.4376174108382858, + "learning_rate": 4.1515445522851784e-05, + "loss": 0.6986, + "step": 885 + }, + { + "epoch": 0.7088, + "grad_norm": 0.4065493351088732, + "learning_rate": 4.130538759866457e-05, + "loss": 0.6454, + "step": 886 + }, + { + "epoch": 0.7096, + "grad_norm": 0.3941707213877182, + "learning_rate": 4.109572403415386e-05, + "loss": 0.6007, + "step": 887 + }, + { + "epoch": 0.7104, + "grad_norm": 0.4326973562326279, + "learning_rate": 4.088645623801534e-05, + "loss": 0.6757, + "step": 888 + }, + { + "epoch": 0.7112, + "grad_norm": 0.40634280380940946, + "learning_rate": 4.0677585616285774e-05, + "loss": 0.6898, + "step": 889 + }, + { + "epoch": 0.712, + "grad_norm": 0.46060240732593877, + "learning_rate": 4.046911357233343e-05, + "loss": 0.7121, + "step": 890 + }, + { + "epoch": 0.7128, + "grad_norm": 0.39107008531256765, + "learning_rate": 4.026104150684835e-05, + "loss": 0.6439, + "step": 891 + }, + { + "epoch": 0.7136, + "grad_norm": 0.5568511298175615, + "learning_rate": 4.00533708178334e-05, + "loss": 0.6982, + "step": 892 + }, + { + "epoch": 0.7144, + "grad_norm": 0.38029120588041837, + "learning_rate": 3.984610290059467e-05, + "loss": 0.6581, + "step": 893 + }, + { + "epoch": 0.7152, + "grad_norm": 0.43775333314753384, + "learning_rate": 3.963923914773187e-05, + "loss": 0.6743, + "step": 894 + }, + { + "epoch": 0.716, + "grad_norm": 0.4223830486419374, + "learning_rate": 3.943278094912946e-05, + "loss": 0.6952, + "step": 895 + }, + { + "epoch": 0.7168, + "grad_norm": 0.4315052253643534, + "learning_rate": 3.922672969194686e-05, + "loss": 0.732, + "step": 896 + }, + { + "epoch": 0.7176, + "grad_norm": 0.39178277582854004, + "learning_rate": 3.902108676060937e-05, + "loss": 0.6027, + "step": 897 + }, + { + "epoch": 0.7184, + "grad_norm": 0.5627104260185086, + "learning_rate": 3.8815853536798904e-05, + "loss": 0.7304, + "step": 898 + }, + { + "epoch": 0.7192, + "grad_norm": 0.39350483281651716, + "learning_rate": 3.861103139944449e-05, + "loss": 0.6866, + "step": 899 + }, + { + "epoch": 0.72, + "grad_norm": 0.4095154219580877, + "learning_rate": 3.840662172471315e-05, + "loss": 0.6831, + "step": 900 + }, + { + "epoch": 0.7208, + "grad_norm": 0.37354388758571105, + "learning_rate": 3.820262588600074e-05, + "loss": 0.6328, + "step": 901 + }, + { + "epoch": 0.7216, + "grad_norm": 0.3936880749526102, + "learning_rate": 3.79990452539225e-05, + "loss": 0.6883, + "step": 902 + }, + { + "epoch": 0.7224, + "grad_norm": 0.41527899802744356, + "learning_rate": 3.7795881196303995e-05, + "loss": 0.6792, + "step": 903 + }, + { + "epoch": 0.7232, + "grad_norm": 0.7595329157942547, + "learning_rate": 3.759313507817196e-05, + "loss": 0.7183, + "step": 904 + }, + { + "epoch": 0.724, + "grad_norm": 0.4975754419510984, + "learning_rate": 3.739080826174498e-05, + "loss": 0.6937, + "step": 905 + }, + { + "epoch": 0.7248, + "grad_norm": 0.4521967913155814, + "learning_rate": 3.7188902106424416e-05, + "loss": 0.6655, + "step": 906 + }, + { + "epoch": 0.7256, + "grad_norm": 0.33020318016896827, + "learning_rate": 3.6987417968785366e-05, + "loss": 0.6167, + "step": 907 + }, + { + "epoch": 0.7264, + "grad_norm": 0.5048026377794702, + "learning_rate": 3.678635720256737e-05, + "loss": 0.7685, + "step": 908 + }, + { + "epoch": 0.7272, + "grad_norm": 0.4491681253602928, + "learning_rate": 3.658572115866541e-05, + "loss": 0.7819, + "step": 909 + }, + { + "epoch": 0.728, + "grad_norm": 0.485749096981155, + "learning_rate": 3.638551118512089e-05, + "loss": 0.674, + "step": 910 + }, + { + "epoch": 0.7288, + "grad_norm": 0.4095808744339379, + "learning_rate": 3.618572862711247e-05, + "loss": 0.6542, + "step": 911 + }, + { + "epoch": 0.7296, + "grad_norm": 0.47721915371702767, + "learning_rate": 3.5986374826947066e-05, + "loss": 0.7347, + "step": 912 + }, + { + "epoch": 0.7304, + "grad_norm": 0.47655723514124676, + "learning_rate": 3.578745112405083e-05, + "loss": 0.6334, + "step": 913 + }, + { + "epoch": 0.7312, + "grad_norm": 0.4032498972457474, + "learning_rate": 3.558895885496023e-05, + "loss": 0.6865, + "step": 914 + }, + { + "epoch": 0.732, + "grad_norm": 0.4380646892148774, + "learning_rate": 3.539089935331294e-05, + "loss": 0.7206, + "step": 915 + }, + { + "epoch": 0.7328, + "grad_norm": 0.41233642711261126, + "learning_rate": 3.519327394983888e-05, + "loss": 0.653, + "step": 916 + }, + { + "epoch": 0.7336, + "grad_norm": 0.9220102122310604, + "learning_rate": 3.4996083972351515e-05, + "loss": 0.7581, + "step": 917 + }, + { + "epoch": 0.7344, + "grad_norm": 0.4835730322570472, + "learning_rate": 3.479933074573858e-05, + "loss": 0.6839, + "step": 918 + }, + { + "epoch": 0.7352, + "grad_norm": 0.3673687066773582, + "learning_rate": 3.4603015591953395e-05, + "loss": 0.682, + "step": 919 + }, + { + "epoch": 0.736, + "grad_norm": 0.44778735795246244, + "learning_rate": 3.440713983000601e-05, + "loss": 0.7226, + "step": 920 + }, + { + "epoch": 0.7368, + "grad_norm": 0.43417465922385345, + "learning_rate": 3.421170477595419e-05, + "loss": 0.721, + "step": 921 + }, + { + "epoch": 0.7376, + "grad_norm": 0.5059754441089996, + "learning_rate": 3.401671174289469e-05, + "loss": 0.7042, + "step": 922 + }, + { + "epoch": 0.7384, + "grad_norm": 0.444759680951798, + "learning_rate": 3.3822162040954354e-05, + "loss": 0.6797, + "step": 923 + }, + { + "epoch": 0.7392, + "grad_norm": 0.41036003482243283, + "learning_rate": 3.362805697728145e-05, + "loss": 0.671, + "step": 924 + }, + { + "epoch": 0.74, + "grad_norm": 0.4546342656898402, + "learning_rate": 3.34343978560367e-05, + "loss": 0.7371, + "step": 925 + }, + { + "epoch": 0.7408, + "grad_norm": 0.4698253064161849, + "learning_rate": 3.324118597838464e-05, + "loss": 0.7089, + "step": 926 + }, + { + "epoch": 0.7416, + "grad_norm": 0.5642074583381484, + "learning_rate": 3.3048422642484886e-05, + "loss": 0.6529, + "step": 927 + }, + { + "epoch": 0.7424, + "grad_norm": 0.4030917615643339, + "learning_rate": 3.285610914348332e-05, + "loss": 0.6512, + "step": 928 + }, + { + "epoch": 0.7432, + "grad_norm": 0.4641126026083197, + "learning_rate": 3.266424677350346e-05, + "loss": 0.6812, + "step": 929 + }, + { + "epoch": 0.744, + "grad_norm": 0.4730074993629064, + "learning_rate": 3.2472836821637744e-05, + "loss": 0.6292, + "step": 930 + }, + { + "epoch": 0.7448, + "grad_norm": 0.4653932203462421, + "learning_rate": 3.228188057393895e-05, + "loss": 0.7085, + "step": 931 + }, + { + "epoch": 0.7456, + "grad_norm": 0.4017944821803112, + "learning_rate": 3.209137931341143e-05, + "loss": 0.6597, + "step": 932 + }, + { + "epoch": 0.7464, + "grad_norm": 0.38975962223847943, + "learning_rate": 3.190133432000252e-05, + "loss": 0.6442, + "step": 933 + }, + { + "epoch": 0.7472, + "grad_norm": 0.5709031936190804, + "learning_rate": 3.1711746870594086e-05, + "loss": 0.733, + "step": 934 + }, + { + "epoch": 0.748, + "grad_norm": 0.5317919974022999, + "learning_rate": 3.1522618238993725e-05, + "loss": 0.7605, + "step": 935 + }, + { + "epoch": 0.7488, + "grad_norm": 0.40589358111230983, + "learning_rate": 3.1333949695926324e-05, + "loss": 0.6684, + "step": 936 + }, + { + "epoch": 0.7496, + "grad_norm": 0.5503734655213102, + "learning_rate": 3.114574250902558e-05, + "loss": 0.7868, + "step": 937 + }, + { + "epoch": 0.7504, + "grad_norm": 0.4916943514014524, + "learning_rate": 3.0957997942825336e-05, + "loss": 0.7103, + "step": 938 + }, + { + "epoch": 0.7512, + "grad_norm": 0.6020255347762185, + "learning_rate": 3.077071725875116e-05, + "loss": 0.7783, + "step": 939 + }, + { + "epoch": 0.752, + "grad_norm": 0.41380248929043034, + "learning_rate": 3.058390171511196e-05, + "loss": 0.6904, + "step": 940 + }, + { + "epoch": 0.7528, + "grad_norm": 0.38840340966320364, + "learning_rate": 3.0397552567091337e-05, + "loss": 0.6757, + "step": 941 + }, + { + "epoch": 0.7536, + "grad_norm": 0.36884808026100746, + "learning_rate": 3.021167106673928e-05, + "loss": 0.6359, + "step": 942 + }, + { + "epoch": 0.7544, + "grad_norm": 0.4638521732939306, + "learning_rate": 3.0026258462963787e-05, + "loss": 0.7644, + "step": 943 + }, + { + "epoch": 0.7552, + "grad_norm": 0.4031733673863279, + "learning_rate": 2.9841316001522347e-05, + "loss": 0.6829, + "step": 944 + }, + { + "epoch": 0.756, + "grad_norm": 0.4455684228946421, + "learning_rate": 2.9656844925013637e-05, + "loss": 0.6175, + "step": 945 + }, + { + "epoch": 0.7568, + "grad_norm": 0.44512262470760516, + "learning_rate": 2.9472846472869298e-05, + "loss": 0.6551, + "step": 946 + }, + { + "epoch": 0.7576, + "grad_norm": 0.4392274040280466, + "learning_rate": 2.9289321881345254e-05, + "loss": 0.7532, + "step": 947 + }, + { + "epoch": 0.7584, + "grad_norm": 0.3846213730566546, + "learning_rate": 2.9106272383513835e-05, + "loss": 0.6462, + "step": 948 + }, + { + "epoch": 0.7592, + "grad_norm": 0.3908739955341278, + "learning_rate": 2.8923699209255284e-05, + "loss": 0.6389, + "step": 949 + }, + { + "epoch": 0.76, + "grad_norm": 0.4210490998284046, + "learning_rate": 2.874160358524931e-05, + "loss": 0.7032, + "step": 950 + }, + { + "epoch": 0.7608, + "grad_norm": 0.5121144943279137, + "learning_rate": 2.8559986734967282e-05, + "loss": 0.7509, + "step": 951 + }, + { + "epoch": 0.7616, + "grad_norm": 0.4276742573054915, + "learning_rate": 2.8378849878663628e-05, + "loss": 0.6979, + "step": 952 + }, + { + "epoch": 0.7624, + "grad_norm": 0.35333439799515204, + "learning_rate": 2.819819423336775e-05, + "loss": 0.6707, + "step": 953 + }, + { + "epoch": 0.7632, + "grad_norm": 0.4204807321417563, + "learning_rate": 2.8018021012875994e-05, + "loss": 0.6767, + "step": 954 + }, + { + "epoch": 0.764, + "grad_norm": 0.48716658706423366, + "learning_rate": 2.7838331427743282e-05, + "loss": 0.7501, + "step": 955 + }, + { + "epoch": 0.7648, + "grad_norm": 0.38039866799337263, + "learning_rate": 2.7659126685275027e-05, + "loss": 0.6545, + "step": 956 + }, + { + "epoch": 0.7656, + "grad_norm": 0.32672065605148515, + "learning_rate": 2.7480407989519198e-05, + "loss": 0.5797, + "step": 957 + }, + { + "epoch": 0.7664, + "grad_norm": 0.4076002889185543, + "learning_rate": 2.7302176541257986e-05, + "loss": 0.6728, + "step": 958 + }, + { + "epoch": 0.7672, + "grad_norm": 0.38753220711205655, + "learning_rate": 2.712443353799984e-05, + "loss": 0.6454, + "step": 959 + }, + { + "epoch": 0.768, + "grad_norm": 0.40540466182809887, + "learning_rate": 2.6947180173971508e-05, + "loss": 0.6637, + "step": 960 + }, + { + "epoch": 0.7688, + "grad_norm": 0.4454018587849093, + "learning_rate": 2.677041764010988e-05, + "loss": 0.6776, + "step": 961 + }, + { + "epoch": 0.7696, + "grad_norm": 0.5611693656661421, + "learning_rate": 2.659414712405398e-05, + "loss": 0.6607, + "step": 962 + }, + { + "epoch": 0.7704, + "grad_norm": 0.3825257735895134, + "learning_rate": 2.6418369810137188e-05, + "loss": 0.7003, + "step": 963 + }, + { + "epoch": 0.7712, + "grad_norm": 0.40734098381589984, + "learning_rate": 2.6243086879379e-05, + "loss": 0.6609, + "step": 964 + }, + { + "epoch": 0.772, + "grad_norm": 0.42583721416048925, + "learning_rate": 2.6068299509477266e-05, + "loss": 0.6623, + "step": 965 + }, + { + "epoch": 0.7728, + "grad_norm": 0.48805096685571947, + "learning_rate": 2.5894008874800325e-05, + "loss": 0.6934, + "step": 966 + }, + { + "epoch": 0.7736, + "grad_norm": 0.42500877249350866, + "learning_rate": 2.5720216146378917e-05, + "loss": 0.6513, + "step": 967 + }, + { + "epoch": 0.7744, + "grad_norm": 0.40493089101612706, + "learning_rate": 2.5546922491898495e-05, + "loss": 0.674, + "step": 968 + }, + { + "epoch": 0.7752, + "grad_norm": 0.37284688840143426, + "learning_rate": 2.5374129075691265e-05, + "loss": 0.683, + "step": 969 + }, + { + "epoch": 0.776, + "grad_norm": 0.3850534379139986, + "learning_rate": 2.5201837058728505e-05, + "loss": 0.656, + "step": 970 + }, + { + "epoch": 0.7768, + "grad_norm": 0.45394202888183943, + "learning_rate": 2.503004759861258e-05, + "loss": 0.661, + "step": 971 + }, + { + "epoch": 0.7776, + "grad_norm": 0.4689936892545102, + "learning_rate": 2.485876184956928e-05, + "loss": 0.6613, + "step": 972 + }, + { + "epoch": 0.7784, + "grad_norm": 0.4694811646835681, + "learning_rate": 2.4687980962440072e-05, + "loss": 0.6819, + "step": 973 + }, + { + "epoch": 0.7792, + "grad_norm": 0.3980590952745821, + "learning_rate": 2.451770608467432e-05, + "loss": 0.6348, + "step": 974 + }, + { + "epoch": 0.78, + "grad_norm": 0.3875275031787178, + "learning_rate": 2.4347938360321566e-05, + "loss": 0.6382, + "step": 975 + }, + { + "epoch": 0.7808, + "grad_norm": 0.47513560666438664, + "learning_rate": 2.417867893002387e-05, + "loss": 0.6895, + "step": 976 + }, + { + "epoch": 0.7816, + "grad_norm": 0.3563948006388464, + "learning_rate": 2.400992893100822e-05, + "loss": 0.6074, + "step": 977 + }, + { + "epoch": 0.7824, + "grad_norm": 0.43436570592978274, + "learning_rate": 2.3841689497078746e-05, + "loss": 0.6693, + "step": 978 + }, + { + "epoch": 0.7832, + "grad_norm": 0.42645668786967905, + "learning_rate": 2.3673961758609152e-05, + "loss": 0.6854, + "step": 979 + }, + { + "epoch": 0.784, + "grad_norm": 0.42641985111716113, + "learning_rate": 2.3506746842535242e-05, + "loss": 0.7029, + "step": 980 + }, + { + "epoch": 0.7848, + "grad_norm": 0.43453452583242996, + "learning_rate": 2.334004587234717e-05, + "loss": 0.6757, + "step": 981 + }, + { + "epoch": 0.7856, + "grad_norm": 0.5943868344640286, + "learning_rate": 2.3173859968081944e-05, + "loss": 0.7683, + "step": 982 + }, + { + "epoch": 0.7864, + "grad_norm": 0.35667236842417455, + "learning_rate": 2.300819024631603e-05, + "loss": 0.7153, + "step": 983 + }, + { + "epoch": 0.7872, + "grad_norm": 0.4169268668868495, + "learning_rate": 2.2843037820157675e-05, + "loss": 0.6382, + "step": 984 + }, + { + "epoch": 0.788, + "grad_norm": 0.45900056490702645, + "learning_rate": 2.26784037992395e-05, + "loss": 0.7021, + "step": 985 + }, + { + "epoch": 0.7888, + "grad_norm": 0.3743573851538462, + "learning_rate": 2.251428928971102e-05, + "loss": 0.6293, + "step": 986 + }, + { + "epoch": 0.7896, + "grad_norm": 0.431799415704186, + "learning_rate": 2.2350695394231345e-05, + "loss": 0.6307, + "step": 987 + }, + { + "epoch": 0.7904, + "grad_norm": 0.443839386021764, + "learning_rate": 2.2187623211961562e-05, + "loss": 0.7189, + "step": 988 + }, + { + "epoch": 0.7912, + "grad_norm": 0.457217602118605, + "learning_rate": 2.2025073838557454e-05, + "loss": 0.7166, + "step": 989 + }, + { + "epoch": 0.792, + "grad_norm": 0.4298899530051766, + "learning_rate": 2.1863048366162208e-05, + "loss": 0.659, + "step": 990 + }, + { + "epoch": 0.7928, + "grad_norm": 0.44483076435674174, + "learning_rate": 2.1701547883398922e-05, + "loss": 0.7343, + "step": 991 + }, + { + "epoch": 0.7936, + "grad_norm": 0.35211899297553695, + "learning_rate": 2.1540573475363402e-05, + "loss": 0.5832, + "step": 992 + }, + { + "epoch": 0.7944, + "grad_norm": 0.7731949739466223, + "learning_rate": 2.138012622361689e-05, + "loss": 0.7262, + "step": 993 + }, + { + "epoch": 0.7952, + "grad_norm": 0.41017806254830447, + "learning_rate": 2.1220207206178688e-05, + "loss": 0.6515, + "step": 994 + }, + { + "epoch": 0.796, + "grad_norm": 0.37979830419359745, + "learning_rate": 2.106081749751897e-05, + "loss": 0.6707, + "step": 995 + }, + { + "epoch": 0.7968, + "grad_norm": 0.44717119588271975, + "learning_rate": 2.0901958168551638e-05, + "loss": 0.7163, + "step": 996 + }, + { + "epoch": 0.7976, + "grad_norm": 0.4079752030933211, + "learning_rate": 2.0743630286627002e-05, + "loss": 0.665, + "step": 997 + }, + { + "epoch": 0.7984, + "grad_norm": 0.46165802291394364, + "learning_rate": 2.058583491552465e-05, + "loss": 0.6828, + "step": 998 + }, + { + "epoch": 0.7992, + "grad_norm": 0.4177470942809108, + "learning_rate": 2.0428573115446392e-05, + "loss": 0.653, + "step": 999 + }, + { + "epoch": 0.8, + "grad_norm": 0.3817091854290759, + "learning_rate": 2.027184594300898e-05, + "loss": 0.6909, + "step": 1000 + }, + { + "epoch": 0.8008, + "grad_norm": 0.5418869050582212, + "learning_rate": 2.011565445123711e-05, + "loss": 0.7432, + "step": 1001 + }, + { + "epoch": 0.8016, + "grad_norm": 0.44523306273336016, + "learning_rate": 1.995999968955641e-05, + "loss": 0.676, + "step": 1002 + }, + { + "epoch": 0.8024, + "grad_norm": 0.3619808539027808, + "learning_rate": 1.980488270378612e-05, + "loss": 0.5962, + "step": 1003 + }, + { + "epoch": 0.8032, + "grad_norm": 0.5842481216946012, + "learning_rate": 1.9650304536132426e-05, + "loss": 0.7678, + "step": 1004 + }, + { + "epoch": 0.804, + "grad_norm": 0.5461910209473555, + "learning_rate": 1.9496266225181248e-05, + "loss": 0.6585, + "step": 1005 + }, + { + "epoch": 0.8048, + "grad_norm": 0.5270273146574822, + "learning_rate": 1.9342768805891178e-05, + "loss": 0.7617, + "step": 1006 + }, + { + "epoch": 0.8056, + "grad_norm": 0.5765724533332864, + "learning_rate": 1.918981330958678e-05, + "loss": 0.7621, + "step": 1007 + }, + { + "epoch": 0.8064, + "grad_norm": 0.4174238770840169, + "learning_rate": 1.903740076395151e-05, + "loss": 0.6699, + "step": 1008 + }, + { + "epoch": 0.8072, + "grad_norm": 0.5840180362864116, + "learning_rate": 1.8885532193020704e-05, + "loss": 0.7183, + "step": 1009 + }, + { + "epoch": 0.808, + "grad_norm": 0.43782142253088613, + "learning_rate": 1.8734208617174988e-05, + "loss": 0.7388, + "step": 1010 + }, + { + "epoch": 0.8088, + "grad_norm": 0.4143539040070012, + "learning_rate": 1.8583431053133127e-05, + "loss": 0.6748, + "step": 1011 + }, + { + "epoch": 0.8096, + "grad_norm": 0.38610298875132026, + "learning_rate": 1.8433200513945337e-05, + "loss": 0.6717, + "step": 1012 + }, + { + "epoch": 0.8104, + "grad_norm": 0.4529466347927891, + "learning_rate": 1.8283518008986567e-05, + "loss": 0.6332, + "step": 1013 + }, + { + "epoch": 0.8112, + "grad_norm": 0.4104044812469816, + "learning_rate": 1.8134384543949478e-05, + "loss": 0.5814, + "step": 1014 + }, + { + "epoch": 0.812, + "grad_norm": 0.3513567415760835, + "learning_rate": 1.7985801120837865e-05, + "loss": 0.6406, + "step": 1015 + }, + { + "epoch": 0.8128, + "grad_norm": 0.4581821739165075, + "learning_rate": 1.783776873795994e-05, + "loss": 0.6671, + "step": 1016 + }, + { + "epoch": 0.8136, + "grad_norm": 0.49925155395470433, + "learning_rate": 1.7690288389921493e-05, + "loss": 0.793, + "step": 1017 + }, + { + "epoch": 0.8144, + "grad_norm": 0.4938698306908264, + "learning_rate": 1.754336106761927e-05, + "loss": 0.6664, + "step": 1018 + }, + { + "epoch": 0.8152, + "grad_norm": 0.4071602120149282, + "learning_rate": 1.739698775823442e-05, + "loss": 0.6133, + "step": 1019 + }, + { + "epoch": 0.816, + "grad_norm": 0.470231428927306, + "learning_rate": 1.7251169445225657e-05, + "loss": 0.7382, + "step": 1020 + }, + { + "epoch": 0.8168, + "grad_norm": 0.41085096374471225, + "learning_rate": 1.7105907108322816e-05, + "loss": 0.647, + "step": 1021 + }, + { + "epoch": 0.8176, + "grad_norm": 0.41451453515814146, + "learning_rate": 1.696120172352025e-05, + "loss": 0.6763, + "step": 1022 + }, + { + "epoch": 0.8184, + "grad_norm": 0.4317718419901924, + "learning_rate": 1.6817054263070174e-05, + "loss": 0.6626, + "step": 1023 + }, + { + "epoch": 0.8192, + "grad_norm": 0.3739245136514371, + "learning_rate": 1.6673465695476232e-05, + "loss": 0.6631, + "step": 1024 + }, + { + "epoch": 0.82, + "grad_norm": 0.49147832404649994, + "learning_rate": 1.6530436985486996e-05, + "loss": 0.7476, + "step": 1025 + }, + { + "epoch": 0.8208, + "grad_norm": 0.39322438754960165, + "learning_rate": 1.6387969094089316e-05, + "loss": 0.6806, + "step": 1026 + }, + { + "epoch": 0.8216, + "grad_norm": 0.41150429736076116, + "learning_rate": 1.6246062978502164e-05, + "loss": 0.6922, + "step": 1027 + }, + { + "epoch": 0.8224, + "grad_norm": 0.39706711639249687, + "learning_rate": 1.6104719592169902e-05, + "loss": 0.6346, + "step": 1028 + }, + { + "epoch": 0.8232, + "grad_norm": 0.49795204929118797, + "learning_rate": 1.5963939884756042e-05, + "loss": 0.6942, + "step": 1029 + }, + { + "epoch": 0.824, + "grad_norm": 0.38100589182329975, + "learning_rate": 1.5823724802136865e-05, + "loss": 0.6335, + "step": 1030 + }, + { + "epoch": 0.8248, + "grad_norm": 0.41974531064997533, + "learning_rate": 1.5684075286394985e-05, + "loss": 0.6788, + "step": 1031 + }, + { + "epoch": 0.8256, + "grad_norm": 0.3750730946030414, + "learning_rate": 1.5544992275813053e-05, + "loss": 0.6395, + "step": 1032 + }, + { + "epoch": 0.8264, + "grad_norm": 0.36268379646764537, + "learning_rate": 1.5406476704867524e-05, + "loss": 0.6142, + "step": 1033 + }, + { + "epoch": 0.8272, + "grad_norm": 0.423716553799071, + "learning_rate": 1.526852950422226e-05, + "loss": 0.6448, + "step": 1034 + }, + { + "epoch": 0.828, + "grad_norm": 0.4942107069468341, + "learning_rate": 1.5131151600722337e-05, + "loss": 0.6627, + "step": 1035 + }, + { + "epoch": 0.8288, + "grad_norm": 0.42456064348420397, + "learning_rate": 1.4994343917387854e-05, + "loss": 0.6894, + "step": 1036 + }, + { + "epoch": 0.8296, + "grad_norm": 0.36518083687152986, + "learning_rate": 1.485810737340767e-05, + "loss": 0.6189, + "step": 1037 + }, + { + "epoch": 0.8304, + "grad_norm": 0.4404730960139239, + "learning_rate": 1.4722442884133214e-05, + "loss": 0.6369, + "step": 1038 + }, + { + "epoch": 0.8312, + "grad_norm": 0.392567768076952, + "learning_rate": 1.4587351361072454e-05, + "loss": 0.6081, + "step": 1039 + }, + { + "epoch": 0.832, + "grad_norm": 0.4290000072468106, + "learning_rate": 1.4452833711883628e-05, + "loss": 0.7304, + "step": 1040 + }, + { + "epoch": 0.8328, + "grad_norm": 0.4199022292164449, + "learning_rate": 1.4318890840369182e-05, + "loss": 0.6237, + "step": 1041 + }, + { + "epoch": 0.8336, + "grad_norm": 0.4397419231813673, + "learning_rate": 1.4185523646469822e-05, + "loss": 0.7319, + "step": 1042 + }, + { + "epoch": 0.8344, + "grad_norm": 0.40755746261021175, + "learning_rate": 1.4052733026258281e-05, + "loss": 0.6677, + "step": 1043 + }, + { + "epoch": 0.8352, + "grad_norm": 0.46991411782334597, + "learning_rate": 1.3920519871933424e-05, + "loss": 0.7943, + "step": 1044 + }, + { + "epoch": 0.836, + "grad_norm": 0.4426440689555142, + "learning_rate": 1.3788885071814172e-05, + "loss": 0.6747, + "step": 1045 + }, + { + "epoch": 0.8368, + "grad_norm": 0.4348959755812302, + "learning_rate": 1.3657829510333654e-05, + "loss": 0.6185, + "step": 1046 + }, + { + "epoch": 0.8376, + "grad_norm": 0.433817258179641, + "learning_rate": 1.3527354068033139e-05, + "loss": 0.6844, + "step": 1047 + }, + { + "epoch": 0.8384, + "grad_norm": 0.41814622171273114, + "learning_rate": 1.339745962155613e-05, + "loss": 0.6268, + "step": 1048 + }, + { + "epoch": 0.8392, + "grad_norm": 0.47997402629270003, + "learning_rate": 1.326814704364262e-05, + "loss": 0.7138, + "step": 1049 + }, + { + "epoch": 0.84, + "grad_norm": 0.4347748528844761, + "learning_rate": 1.3139417203123027e-05, + "loss": 0.6831, + "step": 1050 + }, + { + "epoch": 0.8408, + "grad_norm": 0.4010485407026081, + "learning_rate": 1.3011270964912459e-05, + "loss": 0.6475, + "step": 1051 + }, + { + "epoch": 0.8416, + "grad_norm": 0.374961629875271, + "learning_rate": 1.2883709190004955e-05, + "loss": 0.6404, + "step": 1052 + }, + { + "epoch": 0.8424, + "grad_norm": 0.3457641040319451, + "learning_rate": 1.275673273546758e-05, + "loss": 0.6436, + "step": 1053 + }, + { + "epoch": 0.8432, + "grad_norm": 0.3969026935113594, + "learning_rate": 1.263034245443473e-05, + "loss": 0.6486, + "step": 1054 + }, + { + "epoch": 0.844, + "grad_norm": 0.36695230119895345, + "learning_rate": 1.2504539196102439e-05, + "loss": 0.6426, + "step": 1055 + }, + { + "epoch": 0.8448, + "grad_norm": 0.39454987021060967, + "learning_rate": 1.2379323805722576e-05, + "loss": 0.6672, + "step": 1056 + }, + { + "epoch": 0.8456, + "grad_norm": 0.38197236694832964, + "learning_rate": 1.2254697124597237e-05, + "loss": 0.6689, + "step": 1057 + }, + { + "epoch": 0.8464, + "grad_norm": 0.5920091225654076, + "learning_rate": 1.2130659990073146e-05, + "loss": 0.7146, + "step": 1058 + }, + { + "epoch": 0.8472, + "grad_norm": 0.3814735307252148, + "learning_rate": 1.2007213235535786e-05, + "loss": 0.6746, + "step": 1059 + }, + { + "epoch": 0.848, + "grad_norm": 0.4533789513726459, + "learning_rate": 1.1884357690404158e-05, + "loss": 0.7033, + "step": 1060 + }, + { + "epoch": 0.8488, + "grad_norm": 0.38583837856569086, + "learning_rate": 1.176209418012495e-05, + "loss": 0.6677, + "step": 1061 + }, + { + "epoch": 0.8496, + "grad_norm": 0.454085290082888, + "learning_rate": 1.1640423526166988e-05, + "loss": 0.6362, + "step": 1062 + }, + { + "epoch": 0.8504, + "grad_norm": 0.4742972977466862, + "learning_rate": 1.1519346546015907e-05, + "loss": 0.667, + "step": 1063 + }, + { + "epoch": 0.8512, + "grad_norm": 0.4824677083631548, + "learning_rate": 1.1398864053168534e-05, + "loss": 0.7184, + "step": 1064 + }, + { + "epoch": 0.852, + "grad_norm": 0.3578146858746713, + "learning_rate": 1.1278976857127311e-05, + "loss": 0.5969, + "step": 1065 + }, + { + "epoch": 0.8528, + "grad_norm": 0.457944247105592, + "learning_rate": 1.1159685763395111e-05, + "loss": 0.6866, + "step": 1066 + }, + { + "epoch": 0.8536, + "grad_norm": 0.38401135991776364, + "learning_rate": 1.1040991573469629e-05, + "loss": 0.6125, + "step": 1067 + }, + { + "epoch": 0.8544, + "grad_norm": 0.44540543819947204, + "learning_rate": 1.0922895084838037e-05, + "loss": 0.6726, + "step": 1068 + }, + { + "epoch": 0.8552, + "grad_norm": 0.437737845257865, + "learning_rate": 1.0805397090971737e-05, + "loss": 0.7318, + "step": 1069 + }, + { + "epoch": 0.856, + "grad_norm": 0.3709558725890715, + "learning_rate": 1.0688498381320855e-05, + "loss": 0.6437, + "step": 1070 + }, + { + "epoch": 0.8568, + "grad_norm": 0.44814860250802013, + "learning_rate": 1.057219974130903e-05, + "loss": 0.6883, + "step": 1071 + }, + { + "epoch": 0.8576, + "grad_norm": 0.46865254426251146, + "learning_rate": 1.045650195232819e-05, + "loss": 0.6647, + "step": 1072 + }, + { + "epoch": 0.8584, + "grad_norm": 0.5167131246294455, + "learning_rate": 1.0341405791733183e-05, + "loss": 0.7374, + "step": 1073 + }, + { + "epoch": 0.8592, + "grad_norm": 0.3890793000392378, + "learning_rate": 1.0226912032836611e-05, + "loss": 0.6978, + "step": 1074 + }, + { + "epoch": 0.86, + "grad_norm": 0.46258858660726937, + "learning_rate": 1.0113021444903726e-05, + "loss": 0.7821, + "step": 1075 + }, + { + "epoch": 0.8608, + "grad_norm": 0.4845841428225895, + "learning_rate": 9.999734793146998e-06, + "loss": 0.7329, + "step": 1076 + }, + { + "epoch": 0.8616, + "grad_norm": 0.44073517080169233, + "learning_rate": 9.887052838721322e-06, + "loss": 0.6542, + "step": 1077 + }, + { + "epoch": 0.8624, + "grad_norm": 0.4424170136027106, + "learning_rate": 9.774976338718677e-06, + "loss": 0.7131, + "step": 1078 + }, + { + "epoch": 0.8632, + "grad_norm": 0.4289198341530067, + "learning_rate": 9.663506046162985e-06, + "loss": 0.6492, + "step": 1079 + }, + { + "epoch": 0.864, + "grad_norm": 0.48072495328863873, + "learning_rate": 9.552642710005299e-06, + "loss": 0.6587, + "step": 1080 + }, + { + "epoch": 0.8648, + "grad_norm": 0.4320187801746069, + "learning_rate": 9.44238707511862e-06, + "loss": 0.6444, + "step": 1081 + }, + { + "epoch": 0.8656, + "grad_norm": 0.4561240308758049, + "learning_rate": 9.332739882292752e-06, + "loss": 0.7246, + "step": 1082 + }, + { + "epoch": 0.8664, + "grad_norm": 0.6087580846063493, + "learning_rate": 9.22370186822965e-06, + "loss": 0.7112, + "step": 1083 + }, + { + "epoch": 0.8672, + "grad_norm": 0.4530540900311635, + "learning_rate": 9.115273765538202e-06, + "loss": 0.6826, + "step": 1084 + }, + { + "epoch": 0.868, + "grad_norm": 0.473302927557411, + "learning_rate": 9.0074563027294e-06, + "loss": 0.6788, + "step": 1085 + }, + { + "epoch": 0.8688, + "grad_norm": 0.39430156773094005, + "learning_rate": 8.900250204211514e-06, + "loss": 0.6492, + "step": 1086 + }, + { + "epoch": 0.8696, + "grad_norm": 0.38741073903837653, + "learning_rate": 8.79365619028507e-06, + "loss": 0.6516, + "step": 1087 + }, + { + "epoch": 0.8704, + "grad_norm": 0.4992858738222001, + "learning_rate": 8.687674977138116e-06, + "loss": 0.7291, + "step": 1088 + }, + { + "epoch": 0.8712, + "grad_norm": 0.4448439619855907, + "learning_rate": 8.582307276841462e-06, + "loss": 0.681, + "step": 1089 + }, + { + "epoch": 0.872, + "grad_norm": 0.4043429402789075, + "learning_rate": 8.47755379734373e-06, + "loss": 0.7083, + "step": 1090 + }, + { + "epoch": 0.8728, + "grad_norm": 0.4872191603346239, + "learning_rate": 8.37341524246672e-06, + "loss": 0.7314, + "step": 1091 + }, + { + "epoch": 0.8736, + "grad_norm": 0.37327936693140723, + "learning_rate": 8.269892311900696e-06, + "loss": 0.6852, + "step": 1092 + }, + { + "epoch": 0.8744, + "grad_norm": 0.4573379195008329, + "learning_rate": 8.166985701199582e-06, + "loss": 0.623, + "step": 1093 + }, + { + "epoch": 0.8752, + "grad_norm": 0.45019933175472543, + "learning_rate": 8.064696101776358e-06, + "loss": 0.7033, + "step": 1094 + }, + { + "epoch": 0.876, + "grad_norm": 0.4188359341478893, + "learning_rate": 7.963024200898462e-06, + "loss": 0.717, + "step": 1095 + }, + { + "epoch": 0.8768, + "grad_norm": 0.4867025160886115, + "learning_rate": 7.861970681683051e-06, + "loss": 0.7638, + "step": 1096 + }, + { + "epoch": 0.8776, + "grad_norm": 0.3828268279750639, + "learning_rate": 7.761536223092458e-06, + "loss": 0.6223, + "step": 1097 + }, + { + "epoch": 0.8784, + "grad_norm": 0.44475127573228324, + "learning_rate": 7.661721499929753e-06, + "loss": 0.6524, + "step": 1098 + }, + { + "epoch": 0.8792, + "grad_norm": 0.38603117627268635, + "learning_rate": 7.562527182833978e-06, + "loss": 0.6467, + "step": 1099 + }, + { + "epoch": 0.88, + "grad_norm": 0.4514119122712399, + "learning_rate": 7.463953938275858e-06, + "loss": 0.7004, + "step": 1100 + }, + { + "epoch": 0.8808, + "grad_norm": 0.42230542927567644, + "learning_rate": 7.366002428553153e-06, + "loss": 0.6272, + "step": 1101 + }, + { + "epoch": 0.8816, + "grad_norm": 0.4091204371094442, + "learning_rate": 7.2686733117863784e-06, + "loss": 0.7014, + "step": 1102 + }, + { + "epoch": 0.8824, + "grad_norm": 0.35972169908969026, + "learning_rate": 7.171967241914224e-06, + "loss": 0.5502, + "step": 1103 + }, + { + "epoch": 0.8832, + "grad_norm": 0.4222850189851346, + "learning_rate": 7.07588486868922e-06, + "loss": 0.6569, + "step": 1104 + }, + { + "epoch": 0.884, + "grad_norm": 0.3910537015498819, + "learning_rate": 6.980426837673437e-06, + "loss": 0.5971, + "step": 1105 + }, + { + "epoch": 0.8848, + "grad_norm": 0.47414596155603655, + "learning_rate": 6.8855937902340576e-06, + "loss": 0.7066, + "step": 1106 + }, + { + "epoch": 0.8856, + "grad_norm": 0.3863146078649115, + "learning_rate": 6.791386363539065e-06, + "loss": 0.6429, + "step": 1107 + }, + { + "epoch": 0.8864, + "grad_norm": 0.387325530291395, + "learning_rate": 6.6978051905530855e-06, + "loss": 0.7361, + "step": 1108 + }, + { + "epoch": 0.8872, + "grad_norm": 0.5225291485970791, + "learning_rate": 6.604850900032955e-06, + "loss": 0.6583, + "step": 1109 + }, + { + "epoch": 0.888, + "grad_norm": 0.4506614605781981, + "learning_rate": 6.512524116523633e-06, + "loss": 0.6544, + "step": 1110 + }, + { + "epoch": 0.8888, + "grad_norm": 0.4562971013829592, + "learning_rate": 6.420825460353974e-06, + "loss": 0.6772, + "step": 1111 + }, + { + "epoch": 0.8896, + "grad_norm": 0.4478968361283968, + "learning_rate": 6.329755547632499e-06, + "loss": 0.634, + "step": 1112 + }, + { + "epoch": 0.8904, + "grad_norm": 0.625845777935862, + "learning_rate": 6.239314990243339e-06, + "loss": 0.7035, + "step": 1113 + }, + { + "epoch": 0.8912, + "grad_norm": 0.37214414526022493, + "learning_rate": 6.149504395842087e-06, + "loss": 0.6108, + "step": 1114 + }, + { + "epoch": 0.892, + "grad_norm": 0.4092861210799874, + "learning_rate": 6.0603243678516995e-06, + "loss": 0.7133, + "step": 1115 + }, + { + "epoch": 0.8928, + "grad_norm": 0.40098242819927743, + "learning_rate": 5.971775505458444e-06, + "loss": 0.6562, + "step": 1116 + }, + { + "epoch": 0.8936, + "grad_norm": 0.4600776408862824, + "learning_rate": 5.883858403607967e-06, + "loss": 0.7029, + "step": 1117 + }, + { + "epoch": 0.8944, + "grad_norm": 0.5058924463716198, + "learning_rate": 5.7965736530010916e-06, + "loss": 0.6528, + "step": 1118 + }, + { + "epoch": 0.8952, + "grad_norm": 0.39735021532152043, + "learning_rate": 5.7099218400900716e-06, + "loss": 0.6804, + "step": 1119 + }, + { + "epoch": 0.896, + "grad_norm": 0.4445361120546298, + "learning_rate": 5.623903547074549e-06, + "loss": 0.658, + "step": 1120 + }, + { + "epoch": 0.8968, + "grad_norm": 0.46753041919202154, + "learning_rate": 5.538519351897575e-06, + "loss": 0.6155, + "step": 1121 + }, + { + "epoch": 0.8976, + "grad_norm": 0.41436384791158104, + "learning_rate": 5.453769828241872e-06, + "loss": 0.6891, + "step": 1122 + }, + { + "epoch": 0.8984, + "grad_norm": 0.4077205837912534, + "learning_rate": 5.369655545525909e-06, + "loss": 0.6523, + "step": 1123 + }, + { + "epoch": 0.8992, + "grad_norm": 0.466454862461288, + "learning_rate": 5.286177068899989e-06, + "loss": 0.678, + "step": 1124 + }, + { + "epoch": 0.9, + "grad_norm": 0.409423738849171, + "learning_rate": 5.2033349592426335e-06, + "loss": 0.7214, + "step": 1125 + }, + { + "epoch": 0.9008, + "grad_norm": 0.4748124598050134, + "learning_rate": 5.121129773156663e-06, + "loss": 0.6449, + "step": 1126 + }, + { + "epoch": 0.9016, + "grad_norm": 0.3871256574050166, + "learning_rate": 5.039562062965508e-06, + "loss": 0.67, + "step": 1127 + }, + { + "epoch": 0.9024, + "grad_norm": 0.46831718930627114, + "learning_rate": 4.95863237670956e-06, + "loss": 0.6883, + "step": 1128 + }, + { + "epoch": 0.9032, + "grad_norm": 0.37504315446264486, + "learning_rate": 4.87834125814235e-06, + "loss": 0.669, + "step": 1129 + }, + { + "epoch": 0.904, + "grad_norm": 0.47688428959244344, + "learning_rate": 4.798689246727006e-06, + "loss": 0.665, + "step": 1130 + }, + { + "epoch": 0.9048, + "grad_norm": 0.3619539400651978, + "learning_rate": 4.719676877632639e-06, + "loss": 0.6505, + "step": 1131 + }, + { + "epoch": 0.9056, + "grad_norm": 0.4594127274377102, + "learning_rate": 4.641304681730641e-06, + "loss": 0.657, + "step": 1132 + }, + { + "epoch": 0.9064, + "grad_norm": 0.4280258838906803, + "learning_rate": 4.563573185591219e-06, + "loss": 0.6714, + "step": 1133 + }, + { + "epoch": 0.9072, + "grad_norm": 0.4034978688949853, + "learning_rate": 4.486482911479839e-06, + "loss": 0.6796, + "step": 1134 + }, + { + "epoch": 0.908, + "grad_norm": 0.399767578959943, + "learning_rate": 4.4100343773536225e-06, + "loss": 0.5946, + "step": 1135 + }, + { + "epoch": 0.9088, + "grad_norm": 0.36537304243469054, + "learning_rate": 4.3342280968580285e-06, + "loss": 0.6483, + "step": 1136 + }, + { + "epoch": 0.9096, + "grad_norm": 0.4429827195089348, + "learning_rate": 4.259064579323302e-06, + "loss": 0.6358, + "step": 1137 + }, + { + "epoch": 0.9104, + "grad_norm": 0.503755620863335, + "learning_rate": 4.184544329761009e-06, + "loss": 0.7091, + "step": 1138 + }, + { + "epoch": 0.9112, + "grad_norm": 0.44931627730300316, + "learning_rate": 4.1106678488607495e-06, + "loss": 0.6844, + "step": 1139 + }, + { + "epoch": 0.912, + "grad_norm": 0.355137014163896, + "learning_rate": 4.037435632986786e-06, + "loss": 0.6883, + "step": 1140 + }, + { + "epoch": 0.9128, + "grad_norm": 0.4751342717070414, + "learning_rate": 3.964848174174541e-06, + "loss": 0.7175, + "step": 1141 + }, + { + "epoch": 0.9136, + "grad_norm": 0.3934305241762361, + "learning_rate": 3.892905960127546e-06, + "loss": 0.6305, + "step": 1142 + }, + { + "epoch": 0.9144, + "grad_norm": 0.4284674206520445, + "learning_rate": 3.821609474213983e-06, + "loss": 0.6263, + "step": 1143 + }, + { + "epoch": 0.9152, + "grad_norm": 0.5304078370298239, + "learning_rate": 3.750959195463466e-06, + "loss": 0.6128, + "step": 1144 + }, + { + "epoch": 0.916, + "grad_norm": 0.3838543952700125, + "learning_rate": 3.6809555985639068e-06, + "loss": 0.6596, + "step": 1145 + }, + { + "epoch": 0.9168, + "grad_norm": 0.47214302246291556, + "learning_rate": 3.611599153858214e-06, + "loss": 0.6835, + "step": 1146 + }, + { + "epoch": 0.9176, + "grad_norm": 0.4193805545544715, + "learning_rate": 3.5428903273411863e-06, + "loss": 0.7442, + "step": 1147 + }, + { + "epoch": 0.9184, + "grad_norm": 0.46391645022575856, + "learning_rate": 3.4748295806564356e-06, + "loss": 0.6458, + "step": 1148 + }, + { + "epoch": 0.9192, + "grad_norm": 0.40499092839719736, + "learning_rate": 3.40741737109318e-06, + "loss": 0.6886, + "step": 1149 + }, + { + "epoch": 0.92, + "grad_norm": 0.3987988642580818, + "learning_rate": 3.3406541515832003e-06, + "loss": 0.698, + "step": 1150 + }, + { + "epoch": 0.9208, + "grad_norm": 0.427183617651861, + "learning_rate": 3.2745403706978872e-06, + "loss": 0.7051, + "step": 1151 + }, + { + "epoch": 0.9216, + "grad_norm": 0.4443270212370436, + "learning_rate": 3.209076472645112e-06, + "loss": 0.7027, + "step": 1152 + }, + { + "epoch": 0.9224, + "grad_norm": 0.4471706882159882, + "learning_rate": 3.1442628972662704e-06, + "loss": 0.6577, + "step": 1153 + }, + { + "epoch": 0.9232, + "grad_norm": 0.48972079173069394, + "learning_rate": 3.0801000800333877e-06, + "loss": 0.682, + "step": 1154 + }, + { + "epoch": 0.924, + "grad_norm": 0.5065267660417246, + "learning_rate": 3.0165884520461316e-06, + "loss": 0.6782, + "step": 1155 + }, + { + "epoch": 0.9248, + "grad_norm": 0.5069107515729039, + "learning_rate": 2.9537284400289355e-06, + "loss": 0.7146, + "step": 1156 + }, + { + "epoch": 0.9256, + "grad_norm": 0.40254688697544516, + "learning_rate": 2.8915204663281013e-06, + "loss": 0.6141, + "step": 1157 + }, + { + "epoch": 0.9264, + "grad_norm": 0.47425515127829737, + "learning_rate": 2.8299649489090475e-06, + "loss": 0.6517, + "step": 1158 + }, + { + "epoch": 0.9272, + "grad_norm": 0.3999312335834582, + "learning_rate": 2.7690623013533976e-06, + "loss": 0.6731, + "step": 1159 + }, + { + "epoch": 0.928, + "grad_norm": 0.50007730288023, + "learning_rate": 2.708812932856253e-06, + "loss": 0.7449, + "step": 1160 + }, + { + "epoch": 0.9288, + "grad_norm": 0.390957345630411, + "learning_rate": 2.649217248223468e-06, + "loss": 0.6375, + "step": 1161 + }, + { + "epoch": 0.9296, + "grad_norm": 0.435374353883707, + "learning_rate": 2.590275647868867e-06, + "loss": 0.6349, + "step": 1162 + }, + { + "epoch": 0.9304, + "grad_norm": 0.446772375933306, + "learning_rate": 2.5319885278115906e-06, + "loss": 0.6734, + "step": 1163 + }, + { + "epoch": 0.9312, + "grad_norm": 0.46141118177489704, + "learning_rate": 2.4743562796734622e-06, + "loss": 0.7738, + "step": 1164 + }, + { + "epoch": 0.932, + "grad_norm": 0.40101262069842275, + "learning_rate": 2.4173792906762804e-06, + "loss": 0.6909, + "step": 1165 + }, + { + "epoch": 0.9328, + "grad_norm": 0.4401243609537745, + "learning_rate": 2.3610579436393e-06, + "loss": 0.6995, + "step": 1166 + }, + { + "epoch": 0.9336, + "grad_norm": 0.4067423394336151, + "learning_rate": 2.3053926169765984e-06, + "loss": 0.6638, + "step": 1167 + }, + { + "epoch": 0.9344, + "grad_norm": 0.38393597416443764, + "learning_rate": 2.250383684694579e-06, + "loss": 0.6797, + "step": 1168 + }, + { + "epoch": 0.9352, + "grad_norm": 0.47557262884475343, + "learning_rate": 2.1960315163894075e-06, + "loss": 0.7142, + "step": 1169 + }, + { + "epoch": 0.936, + "grad_norm": 0.41047225341638655, + "learning_rate": 2.1423364772445887e-06, + "loss": 0.5714, + "step": 1170 + }, + { + "epoch": 0.9368, + "grad_norm": 0.36385385554164773, + "learning_rate": 2.0892989280284823e-06, + "loss": 0.6242, + "step": 1171 + }, + { + "epoch": 0.9376, + "grad_norm": 0.430047639568558, + "learning_rate": 2.036919225091827e-06, + "loss": 0.7213, + "step": 1172 + }, + { + "epoch": 0.9384, + "grad_norm": 0.4003759824175224, + "learning_rate": 1.9851977203654835e-06, + "loss": 0.633, + "step": 1173 + }, + { + "epoch": 0.9392, + "grad_norm": 0.5276516864778331, + "learning_rate": 1.9341347613579087e-06, + "loss": 0.7372, + "step": 1174 + }, + { + "epoch": 0.94, + "grad_norm": 0.37424234084134184, + "learning_rate": 1.8837306911529184e-06, + "loss": 0.6498, + "step": 1175 + }, + { + "epoch": 0.9408, + "grad_norm": 0.39092128322326236, + "learning_rate": 1.8339858484073935e-06, + "loss": 0.6578, + "step": 1176 + }, + { + "epoch": 0.9416, + "grad_norm": 0.40114954765241456, + "learning_rate": 1.7849005673489127e-06, + "loss": 0.6324, + "step": 1177 + }, + { + "epoch": 0.9424, + "grad_norm": 0.5654401298903662, + "learning_rate": 1.7364751777736332e-06, + "loss": 0.7025, + "step": 1178 + }, + { + "epoch": 0.9432, + "grad_norm": 0.4565313925452614, + "learning_rate": 1.6887100050439587e-06, + "loss": 0.6726, + "step": 1179 + }, + { + "epoch": 0.944, + "grad_norm": 0.45063897696530186, + "learning_rate": 1.6416053700863964e-06, + "loss": 0.7099, + "step": 1180 + }, + { + "epoch": 0.9448, + "grad_norm": 0.5070284990026516, + "learning_rate": 1.595161589389449e-06, + "loss": 0.6629, + "step": 1181 + }, + { + "epoch": 0.9456, + "grad_norm": 0.4623797593188141, + "learning_rate": 1.5493789750014031e-06, + "loss": 0.677, + "step": 1182 + }, + { + "epoch": 0.9464, + "grad_norm": 0.3542532180522851, + "learning_rate": 1.5042578345283108e-06, + "loss": 0.6542, + "step": 1183 + }, + { + "epoch": 0.9472, + "grad_norm": 0.5165101099155814, + "learning_rate": 1.459798471131868e-06, + "loss": 0.6961, + "step": 1184 + }, + { + "epoch": 0.948, + "grad_norm": 0.41578784903230226, + "learning_rate": 1.4160011835273934e-06, + "loss": 0.6144, + "step": 1185 + }, + { + "epoch": 0.9488, + "grad_norm": 0.4262885168527811, + "learning_rate": 1.3728662659818204e-06, + "loss": 0.6376, + "step": 1186 + }, + { + "epoch": 0.9496, + "grad_norm": 0.5125292108606694, + "learning_rate": 1.3303940083117527e-06, + "loss": 0.6715, + "step": 1187 + }, + { + "epoch": 0.9504, + "grad_norm": 0.43702204246654686, + "learning_rate": 1.2885846958814673e-06, + "loss": 0.6939, + "step": 1188 + }, + { + "epoch": 0.9512, + "grad_norm": 0.37114897690493825, + "learning_rate": 1.2474386096010039e-06, + "loss": 0.6775, + "step": 1189 + }, + { + "epoch": 0.952, + "grad_norm": 0.4435858684842088, + "learning_rate": 1.2069560259243328e-06, + "loss": 0.683, + "step": 1190 + }, + { + "epoch": 0.9528, + "grad_norm": 0.36459331021956454, + "learning_rate": 1.1671372168474138e-06, + "loss": 0.6328, + "step": 1191 + }, + { + "epoch": 0.9536, + "grad_norm": 0.39769689176348383, + "learning_rate": 1.1279824499064396e-06, + "loss": 0.6787, + "step": 1192 + }, + { + "epoch": 0.9544, + "grad_norm": 0.4392491742426648, + "learning_rate": 1.089491988176017e-06, + "loss": 0.6535, + "step": 1193 + }, + { + "epoch": 0.9552, + "grad_norm": 0.4802610081280981, + "learning_rate": 1.0516660902673448e-06, + "loss": 0.767, + "step": 1194 + }, + { + "epoch": 0.956, + "grad_norm": 0.4972405040039743, + "learning_rate": 1.014505010326583e-06, + "loss": 0.6684, + "step": 1195 + }, + { + "epoch": 0.9568, + "grad_norm": 0.4300061019366573, + "learning_rate": 9.780089980330642e-07, + "loss": 0.7004, + "step": 1196 + }, + { + "epoch": 0.9576, + "grad_norm": 0.37919684478557075, + "learning_rate": 9.421782985976068e-07, + "loss": 0.6371, + "step": 1197 + }, + { + "epoch": 0.9584, + "grad_norm": 0.39152226247770316, + "learning_rate": 9.070131527609604e-07, + "loss": 0.6349, + "step": 1198 + }, + { + "epoch": 0.9592, + "grad_norm": 0.41967373082518133, + "learning_rate": 8.725137967920738e-07, + "loss": 0.7265, + "step": 1199 + }, + { + "epoch": 0.96, + "grad_norm": 0.4122962224075846, + "learning_rate": 8.386804624865851e-07, + "loss": 0.6585, + "step": 1200 + }, + { + "epoch": 0.9608, + "grad_norm": 0.4436547161906095, + "learning_rate": 8.055133771652345e-07, + "loss": 0.7092, + "step": 1201 + }, + { + "epoch": 0.9616, + "grad_norm": 0.4964147457189863, + "learning_rate": 7.730127636723539e-07, + "loss": 0.6714, + "step": 1202 + }, + { + "epoch": 0.9624, + "grad_norm": 0.4106307093293199, + "learning_rate": 7.411788403743237e-07, + "loss": 0.6533, + "step": 1203 + }, + { + "epoch": 0.9632, + "grad_norm": 0.4716435870719622, + "learning_rate": 7.100118211581852e-07, + "loss": 0.7295, + "step": 1204 + }, + { + "epoch": 0.964, + "grad_norm": 0.41576863897973987, + "learning_rate": 6.7951191543012e-07, + "loss": 0.7056, + "step": 1205 + }, + { + "epoch": 0.9648, + "grad_norm": 0.43674970561506565, + "learning_rate": 6.496793281141056e-07, + "loss": 0.7308, + "step": 1206 + }, + { + "epoch": 0.9656, + "grad_norm": 0.3316605911176218, + "learning_rate": 6.205142596505176e-07, + "loss": 0.5981, + "step": 1207 + }, + { + "epoch": 0.9664, + "grad_norm": 0.4410517556841685, + "learning_rate": 5.920169059947411e-07, + "loss": 0.6245, + "step": 1208 + }, + { + "epoch": 0.9672, + "grad_norm": 0.46086759578868924, + "learning_rate": 5.64187458615939e-07, + "loss": 0.6425, + "step": 1209 + }, + { + "epoch": 0.968, + "grad_norm": 0.4266740889501166, + "learning_rate": 5.370261044956971e-07, + "loss": 0.6741, + "step": 1210 + }, + { + "epoch": 0.9688, + "grad_norm": 0.46234931271388335, + "learning_rate": 5.105330261267916e-07, + "loss": 0.623, + "step": 1211 + }, + { + "epoch": 0.9696, + "grad_norm": 0.41902737827351866, + "learning_rate": 4.847084015119574e-07, + "loss": 0.6583, + "step": 1212 + }, + { + "epoch": 0.9704, + "grad_norm": 0.3591130098650748, + "learning_rate": 4.5955240416271084e-07, + "loss": 0.6731, + "step": 1213 + }, + { + "epoch": 0.9712, + "grad_norm": 0.41096488750723664, + "learning_rate": 4.3506520309813947e-07, + "loss": 0.7105, + "step": 1214 + }, + { + "epoch": 0.972, + "grad_norm": 0.45856275241674943, + "learning_rate": 4.112469628438365e-07, + "loss": 0.6413, + "step": 1215 + }, + { + "epoch": 0.9728, + "grad_norm": 0.5754725420679759, + "learning_rate": 3.8809784343072366e-07, + "loss": 0.6829, + "step": 1216 + }, + { + "epoch": 0.9736, + "grad_norm": 0.47553566714873646, + "learning_rate": 3.6561800039403016e-07, + "loss": 0.7288, + "step": 1217 + }, + { + "epoch": 0.9744, + "grad_norm": 0.43786113087863426, + "learning_rate": 3.4380758477219333e-07, + "loss": 0.6404, + "step": 1218 + }, + { + "epoch": 0.9752, + "grad_norm": 0.46596646225893884, + "learning_rate": 3.2266674310589273e-07, + "loss": 0.7697, + "step": 1219 + }, + { + "epoch": 0.976, + "grad_norm": 0.4172807297480227, + "learning_rate": 3.0219561743707326e-07, + "loss": 0.6499, + "step": 1220 + }, + { + "epoch": 0.9768, + "grad_norm": 0.4217294242714277, + "learning_rate": 2.8239434530792365e-07, + "loss": 0.6422, + "step": 1221 + }, + { + "epoch": 0.9776, + "grad_norm": 0.4902760524078681, + "learning_rate": 2.6326305976001055e-07, + "loss": 0.6386, + "step": 1222 + }, + { + "epoch": 0.9784, + "grad_norm": 0.42289899866497593, + "learning_rate": 2.448018893333681e-07, + "loss": 0.6917, + "step": 1223 + }, + { + "epoch": 0.9792, + "grad_norm": 0.38391404447307315, + "learning_rate": 2.2701095806565432e-07, + "loss": 0.6692, + "step": 1224 + }, + { + "epoch": 0.98, + "grad_norm": 0.4028401645223473, + "learning_rate": 2.098903854912515e-07, + "loss": 0.6718, + "step": 1225 + }, + { + "epoch": 0.9808, + "grad_norm": 0.3783484008933337, + "learning_rate": 1.9344028664056713e-07, + "loss": 0.5929, + "step": 1226 + }, + { + "epoch": 0.9816, + "grad_norm": 0.4203126292842415, + "learning_rate": 1.7766077203915655e-07, + "loss": 0.5779, + "step": 1227 + }, + { + "epoch": 0.9824, + "grad_norm": 0.3733061190802989, + "learning_rate": 1.6255194770704586e-07, + "loss": 0.6366, + "step": 1228 + }, + { + "epoch": 0.9832, + "grad_norm": 0.44571185691890597, + "learning_rate": 1.481139151579991e-07, + "loss": 0.6681, + "step": 1229 + }, + { + "epoch": 0.984, + "grad_norm": 0.39590497993421114, + "learning_rate": 1.3434677139885222e-07, + "loss": 0.6524, + "step": 1230 + }, + { + "epoch": 0.9848, + "grad_norm": 0.4595685602779222, + "learning_rate": 1.2125060892881346e-07, + "loss": 0.6217, + "step": 1231 + }, + { + "epoch": 0.9856, + "grad_norm": 0.406658460916925, + "learning_rate": 1.0882551573891953e-07, + "loss": 0.6682, + "step": 1232 + }, + { + "epoch": 0.9864, + "grad_norm": 0.5966985871698207, + "learning_rate": 9.707157531134713e-08, + "loss": 0.7496, + "step": 1233 + }, + { + "epoch": 0.9872, + "grad_norm": 0.37185100028454815, + "learning_rate": 8.598886661895788e-08, + "loss": 0.7142, + "step": 1234 + }, + { + "epoch": 0.988, + "grad_norm": 0.5722795980301746, + "learning_rate": 7.557746412468758e-08, + "loss": 0.6986, + "step": 1235 + }, + { + "epoch": 0.9888, + "grad_norm": 0.4423403708247473, + "learning_rate": 6.583743778106887e-08, + "loss": 0.6844, + "step": 1236 + }, + { + "epoch": 0.9896, + "grad_norm": 0.3670252792367035, + "learning_rate": 5.6768853029787184e-08, + "loss": 0.6368, + "step": 1237 + }, + { + "epoch": 0.9904, + "grad_norm": 0.3600132217622975, + "learning_rate": 4.837177080119215e-08, + "loss": 0.632, + "step": 1238 + }, + { + "epoch": 0.9912, + "grad_norm": 0.45259612096189367, + "learning_rate": 4.064624751394242e-08, + "loss": 0.6995, + "step": 1239 + }, + { + "epoch": 0.992, + "grad_norm": 0.36480194691598306, + "learning_rate": 3.359233507459481e-08, + "loss": 0.6072, + "step": 1240 + }, + { + "epoch": 0.9928, + "grad_norm": 0.43766481771460525, + "learning_rate": 2.7210080877237976e-08, + "loss": 0.6251, + "step": 1241 + }, + { + "epoch": 0.9936, + "grad_norm": 0.49008337751938974, + "learning_rate": 2.1499527803214846e-08, + "loss": 0.7103, + "step": 1242 + }, + { + "epoch": 0.9944, + "grad_norm": 0.3860677060738148, + "learning_rate": 1.646071422083395e-08, + "loss": 0.6531, + "step": 1243 + }, + { + "epoch": 0.9952, + "grad_norm": 0.41635945899168325, + "learning_rate": 1.209367398504746e-08, + "loss": 0.6685, + "step": 1244 + }, + { + "epoch": 0.996, + "grad_norm": 0.4076160725177934, + "learning_rate": 8.398436437317969e-09, + "loss": 0.7144, + "step": 1245 + }, + { + "epoch": 0.9968, + "grad_norm": 0.4075007837183193, + "learning_rate": 5.375026405352035e-09, + "loss": 0.654, + "step": 1246 + }, + { + "epoch": 0.9976, + "grad_norm": 0.5147838413019985, + "learning_rate": 3.023464202944748e-09, + "loss": 0.6378, + "step": 1247 + }, + { + "epoch": 0.9984, + "grad_norm": 0.41380566659647616, + "learning_rate": 1.3437656298687097e-09, + "loss": 0.586, + "step": 1248 + }, + { + "epoch": 0.9992, + "grad_norm": 0.49445333550351556, + "learning_rate": 3.3594197175190745e-10, + "loss": 0.6943, + "step": 1249 + }, + { + "epoch": 1.0, + "grad_norm": 0.38169510328559775, + "learning_rate": 0.0, + "loss": 0.6749, + "step": 1250 + }, + { + "epoch": 1.0, + "step": 1250, + "total_flos": 1075826133532672.0, + "train_loss": 0.7417626914024353, + "train_runtime": 19163.927, + "train_samples_per_second": 1.044, + "train_steps_per_second": 0.065 + } + ], + "logging_steps": 1.0, + "max_steps": 1250, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1075826133532672.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_3_epochs_1_GA_2_lora/README.md b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_3_epochs_1_GA_2_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_3_epochs_1_GA_2_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_3_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_3_epochs_1_GA_2_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e285932c220f7796bbe215fa382561159e889c97 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_3_epochs_1_GA_2_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "q_proj", + "o_proj", + "up_proj", + "k_proj", + "down_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5ce1a0edb6da7c60c207707536af4faddd79ce7a --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e997f60d69e8d2b862c702246d6d2c85054d52b24dbf80c5fcfcbb271f47989 +size 671150064 diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_3_epochs_1_GA_2_lora/config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_3_epochs_1_GA_2_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_3_epochs_1_GA_2_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..dc9e219fa04676b7752adb85660fa1e8db64e8ac --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:857349115ca01448b46955f52ddcdcf4423c3f631d63ddc92e6e332bedfb0181 +size 918507402 diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_3_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_3_epochs_1_GA_2_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..8c290efbac5381465108628152698e1f7217eebd --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_20000_repeat_3_epochs_1_GA_2_lora/trainer_state.json @@ -0,0 +1,8792 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 1250, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0008, + "grad_norm": 0.9747163093624516, + "learning_rate": 5.263157894736842e-06, + "loss": 1.3776, + "step": 1 + }, + { + "epoch": 0.0016, + "grad_norm": 0.9971316137974952, + "learning_rate": 1.0526315789473684e-05, + "loss": 1.3593, + "step": 2 + }, + { + "epoch": 0.0024, + "grad_norm": 1.0294064859851075, + "learning_rate": 1.5789473684210526e-05, + "loss": 1.4638, + "step": 3 + }, + { + "epoch": 0.0032, + "grad_norm": 0.8785473701775477, + "learning_rate": 2.105263157894737e-05, + "loss": 1.2433, + "step": 4 + }, + { + "epoch": 0.004, + "grad_norm": 0.9222189298141704, + "learning_rate": 2.6315789473684212e-05, + "loss": 1.3731, + "step": 5 + }, + { + "epoch": 0.0048, + "grad_norm": 0.9510475170175133, + "learning_rate": 3.157894736842105e-05, + "loss": 1.3591, + "step": 6 + }, + { + "epoch": 0.0056, + "grad_norm": 0.7084226276467224, + "learning_rate": 3.6842105263157895e-05, + "loss": 1.0935, + "step": 7 + }, + { + "epoch": 0.0064, + "grad_norm": 0.906132971294305, + "learning_rate": 4.210526315789474e-05, + "loss": 1.1725, + "step": 8 + }, + { + "epoch": 0.0072, + "grad_norm": 0.7712213021772553, + "learning_rate": 4.736842105263158e-05, + "loss": 1.0772, + "step": 9 + }, + { + "epoch": 0.008, + "grad_norm": 1.3178145410160216, + "learning_rate": 5.2631578947368424e-05, + "loss": 1.1589, + "step": 10 + }, + { + "epoch": 0.0088, + "grad_norm": 0.850994370222525, + "learning_rate": 5.789473684210527e-05, + "loss": 1.0119, + "step": 11 + }, + { + "epoch": 0.0096, + "grad_norm": 0.7447680537725399, + "learning_rate": 6.31578947368421e-05, + "loss": 0.9941, + "step": 12 + }, + { + "epoch": 0.0104, + "grad_norm": 0.8413526953194138, + "learning_rate": 6.842105263157895e-05, + "loss": 1.0714, + "step": 13 + }, + { + "epoch": 0.0112, + "grad_norm": 0.6570030364761191, + "learning_rate": 7.368421052631579e-05, + "loss": 0.967, + "step": 14 + }, + { + "epoch": 0.012, + "grad_norm": 1.049204357937229, + "learning_rate": 7.894736842105263e-05, + "loss": 0.9898, + "step": 15 + }, + { + "epoch": 0.0128, + "grad_norm": 0.6319707351650328, + "learning_rate": 8.421052631578948e-05, + "loss": 0.9134, + "step": 16 + }, + { + "epoch": 0.0136, + "grad_norm": 0.6526982002329986, + "learning_rate": 8.947368421052632e-05, + "loss": 0.9306, + "step": 17 + }, + { + "epoch": 0.0144, + "grad_norm": 0.5434970784785387, + "learning_rate": 9.473684210526316e-05, + "loss": 0.8736, + "step": 18 + }, + { + "epoch": 0.0152, + "grad_norm": 0.5725189393741179, + "learning_rate": 0.0001, + "loss": 0.9179, + "step": 19 + }, + { + "epoch": 0.016, + "grad_norm": 0.5302888010676386, + "learning_rate": 0.00010526315789473685, + "loss": 0.8855, + "step": 20 + }, + { + "epoch": 0.0168, + "grad_norm": 0.6382052813053475, + "learning_rate": 0.0001105263157894737, + "loss": 0.9381, + "step": 21 + }, + { + "epoch": 0.0176, + "grad_norm": 0.6044670064868235, + "learning_rate": 0.00011578947368421053, + "loss": 0.9268, + "step": 22 + }, + { + "epoch": 0.0184, + "grad_norm": 0.5627824401835214, + "learning_rate": 0.00012105263157894738, + "loss": 0.894, + "step": 23 + }, + { + "epoch": 0.0192, + "grad_norm": 0.4535652110015434, + "learning_rate": 0.0001263157894736842, + "loss": 0.8492, + "step": 24 + }, + { + "epoch": 0.02, + "grad_norm": 0.6327582382001423, + "learning_rate": 0.00013157894736842108, + "loss": 0.8887, + "step": 25 + }, + { + "epoch": 0.0208, + "grad_norm": 0.5317494644379498, + "learning_rate": 0.0001368421052631579, + "loss": 0.8935, + "step": 26 + }, + { + "epoch": 0.0216, + "grad_norm": 0.5122808745979125, + "learning_rate": 0.00014210526315789474, + "loss": 0.9313, + "step": 27 + }, + { + "epoch": 0.0224, + "grad_norm": 0.5794191539976313, + "learning_rate": 0.00014736842105263158, + "loss": 0.9305, + "step": 28 + }, + { + "epoch": 0.0232, + "grad_norm": 0.6283683198317399, + "learning_rate": 0.00015263157894736845, + "loss": 1.0035, + "step": 29 + }, + { + "epoch": 0.024, + "grad_norm": 0.5306201579346713, + "learning_rate": 0.00015789473684210527, + "loss": 0.8103, + "step": 30 + }, + { + "epoch": 0.0248, + "grad_norm": 0.5486303280090739, + "learning_rate": 0.0001631578947368421, + "loss": 0.8286, + "step": 31 + }, + { + "epoch": 0.0256, + "grad_norm": 0.4823708388104478, + "learning_rate": 0.00016842105263157895, + "loss": 0.9333, + "step": 32 + }, + { + "epoch": 0.0264, + "grad_norm": 0.594896847756731, + "learning_rate": 0.0001736842105263158, + "loss": 0.8822, + "step": 33 + }, + { + "epoch": 0.0272, + "grad_norm": 0.49095929767947233, + "learning_rate": 0.00017894736842105264, + "loss": 0.8244, + "step": 34 + }, + { + "epoch": 0.028, + "grad_norm": 0.5154915662155016, + "learning_rate": 0.00018421052631578948, + "loss": 0.8675, + "step": 35 + }, + { + "epoch": 0.0288, + "grad_norm": 0.521265502362853, + "learning_rate": 0.00018947368421052632, + "loss": 0.9447, + "step": 36 + }, + { + "epoch": 0.0296, + "grad_norm": 0.4891594600069159, + "learning_rate": 0.00019473684210526317, + "loss": 0.8823, + "step": 37 + }, + { + "epoch": 0.0304, + "grad_norm": 0.591233346119749, + "learning_rate": 0.0002, + "loss": 0.8902, + "step": 38 + }, + { + "epoch": 0.0312, + "grad_norm": 0.49124794236045227, + "learning_rate": 0.00019999966405802826, + "loss": 0.8059, + "step": 39 + }, + { + "epoch": 0.032, + "grad_norm": 0.5758637086666665, + "learning_rate": 0.00019999865623437013, + "loss": 0.8825, + "step": 40 + }, + { + "epoch": 0.0328, + "grad_norm": 0.4786735766305452, + "learning_rate": 0.00019999697653579705, + "loss": 0.9047, + "step": 41 + }, + { + "epoch": 0.0336, + "grad_norm": 0.5528332925969597, + "learning_rate": 0.00019999462497359466, + "loss": 0.894, + "step": 42 + }, + { + "epoch": 0.0344, + "grad_norm": 0.5055203047790537, + "learning_rate": 0.0001999916015635627, + "loss": 0.7982, + "step": 43 + }, + { + "epoch": 0.0352, + "grad_norm": 0.6393093677384203, + "learning_rate": 0.00019998790632601496, + "loss": 0.8814, + "step": 44 + }, + { + "epoch": 0.036, + "grad_norm": 0.47679644352584477, + "learning_rate": 0.00019998353928577919, + "loss": 0.8093, + "step": 45 + }, + { + "epoch": 0.0368, + "grad_norm": 0.501127789032324, + "learning_rate": 0.0001999785004721968, + "loss": 0.898, + "step": 46 + }, + { + "epoch": 0.0376, + "grad_norm": 0.49935168203483027, + "learning_rate": 0.0001999727899191228, + "loss": 0.8727, + "step": 47 + }, + { + "epoch": 0.0384, + "grad_norm": 0.5407161169427772, + "learning_rate": 0.00019996640766492543, + "loss": 0.9324, + "step": 48 + }, + { + "epoch": 0.0392, + "grad_norm": 0.5393844437050201, + "learning_rate": 0.00019995935375248606, + "loss": 0.8809, + "step": 49 + }, + { + "epoch": 0.04, + "grad_norm": 0.4432871996924282, + "learning_rate": 0.00019995162822919883, + "loss": 0.8086, + "step": 50 + }, + { + "epoch": 0.0408, + "grad_norm": 0.4596088706929705, + "learning_rate": 0.00019994323114697022, + "loss": 0.8306, + "step": 51 + }, + { + "epoch": 0.0416, + "grad_norm": 0.5084204356374836, + "learning_rate": 0.00019993416256221895, + "loss": 0.8601, + "step": 52 + }, + { + "epoch": 0.0424, + "grad_norm": 0.5353499362354399, + "learning_rate": 0.0001999244225358753, + "loss": 0.8785, + "step": 53 + }, + { + "epoch": 0.0432, + "grad_norm": 0.6953583786517583, + "learning_rate": 0.00019991401113338104, + "loss": 0.8999, + "step": 54 + }, + { + "epoch": 0.044, + "grad_norm": 0.45453813559310047, + "learning_rate": 0.00019990292842468868, + "loss": 0.8058, + "step": 55 + }, + { + "epoch": 0.0448, + "grad_norm": 0.5955649216955119, + "learning_rate": 0.00019989117448426108, + "loss": 0.9337, + "step": 56 + }, + { + "epoch": 0.0456, + "grad_norm": 0.467998115167168, + "learning_rate": 0.0001998787493910712, + "loss": 0.7969, + "step": 57 + }, + { + "epoch": 0.0464, + "grad_norm": 0.5694988438877551, + "learning_rate": 0.00019986565322860115, + "loss": 0.803, + "step": 58 + }, + { + "epoch": 0.0472, + "grad_norm": 0.4993003636229634, + "learning_rate": 0.000199851886084842, + "loss": 0.8904, + "step": 59 + }, + { + "epoch": 0.048, + "grad_norm": 0.5265132557079555, + "learning_rate": 0.00019983744805229296, + "loss": 0.9117, + "step": 60 + }, + { + "epoch": 0.0488, + "grad_norm": 0.5260799370031813, + "learning_rate": 0.00019982233922796085, + "loss": 0.8068, + "step": 61 + }, + { + "epoch": 0.0496, + "grad_norm": 0.534529456433131, + "learning_rate": 0.00019980655971335945, + "loss": 0.8493, + "step": 62 + }, + { + "epoch": 0.0504, + "grad_norm": 0.5484845423317868, + "learning_rate": 0.00019979010961450878, + "loss": 0.9084, + "step": 63 + }, + { + "epoch": 0.0512, + "grad_norm": 0.4717299830891815, + "learning_rate": 0.00019977298904193437, + "loss": 0.8762, + "step": 64 + }, + { + "epoch": 0.052, + "grad_norm": 0.5320054509994052, + "learning_rate": 0.00019975519811066663, + "loss": 0.867, + "step": 65 + }, + { + "epoch": 0.0528, + "grad_norm": 0.4968147872972133, + "learning_rate": 0.00019973673694024, + "loss": 0.8246, + "step": 66 + }, + { + "epoch": 0.0536, + "grad_norm": 0.4556804959525972, + "learning_rate": 0.0001997176056546921, + "loss": 0.756, + "step": 67 + }, + { + "epoch": 0.0544, + "grad_norm": 0.6376392860317438, + "learning_rate": 0.00019969780438256293, + "loss": 0.9274, + "step": 68 + }, + { + "epoch": 0.0552, + "grad_norm": 0.503563685108971, + "learning_rate": 0.0001996773332568941, + "loss": 0.818, + "step": 69 + }, + { + "epoch": 0.056, + "grad_norm": 0.4332359635935246, + "learning_rate": 0.0001996561924152278, + "loss": 0.7394, + "step": 70 + }, + { + "epoch": 0.0568, + "grad_norm": 0.5442703744579824, + "learning_rate": 0.00019963438199960599, + "loss": 0.8818, + "step": 71 + }, + { + "epoch": 0.0576, + "grad_norm": 0.5029957297136984, + "learning_rate": 0.0001996119021565693, + "loss": 0.8752, + "step": 72 + }, + { + "epoch": 0.0584, + "grad_norm": 0.471330634112903, + "learning_rate": 0.00019958875303715615, + "loss": 0.8373, + "step": 73 + }, + { + "epoch": 0.0592, + "grad_norm": 0.48541377554141146, + "learning_rate": 0.0001995649347969019, + "loss": 0.8262, + "step": 74 + }, + { + "epoch": 0.06, + "grad_norm": 0.7278423656673401, + "learning_rate": 0.0001995404475958373, + "loss": 0.9023, + "step": 75 + }, + { + "epoch": 0.0608, + "grad_norm": 0.46571363881804856, + "learning_rate": 0.00019951529159848805, + "loss": 0.7964, + "step": 76 + }, + { + "epoch": 0.0616, + "grad_norm": 0.48225090517511315, + "learning_rate": 0.0001994894669738732, + "loss": 0.7375, + "step": 77 + }, + { + "epoch": 0.0624, + "grad_norm": 0.46463249793100286, + "learning_rate": 0.00019946297389550433, + "loss": 0.8404, + "step": 78 + }, + { + "epoch": 0.0632, + "grad_norm": 0.5176022140990233, + "learning_rate": 0.0001994358125413841, + "loss": 0.8608, + "step": 79 + }, + { + "epoch": 0.064, + "grad_norm": 0.4640933574111419, + "learning_rate": 0.00019940798309400526, + "loss": 0.8464, + "step": 80 + }, + { + "epoch": 0.0648, + "grad_norm": 0.5049737620655329, + "learning_rate": 0.0001993794857403495, + "loss": 0.8597, + "step": 81 + }, + { + "epoch": 0.0656, + "grad_norm": 0.4000446338602807, + "learning_rate": 0.0001993503206718859, + "loss": 0.7712, + "step": 82 + }, + { + "epoch": 0.0664, + "grad_norm": 0.5421156886912685, + "learning_rate": 0.0001993204880845699, + "loss": 0.8473, + "step": 83 + }, + { + "epoch": 0.0672, + "grad_norm": 0.43785601117177925, + "learning_rate": 0.00019928998817884182, + "loss": 0.7764, + "step": 84 + }, + { + "epoch": 0.068, + "grad_norm": 0.5777032565618996, + "learning_rate": 0.00019925882115962568, + "loss": 0.8354, + "step": 85 + }, + { + "epoch": 0.0688, + "grad_norm": 0.5102041532130068, + "learning_rate": 0.00019922698723632767, + "loss": 0.8778, + "step": 86 + }, + { + "epoch": 0.0696, + "grad_norm": 0.5812015512689956, + "learning_rate": 0.00019919448662283478, + "loss": 0.8561, + "step": 87 + }, + { + "epoch": 0.0704, + "grad_norm": 0.585857224143132, + "learning_rate": 0.00019916131953751342, + "loss": 0.8318, + "step": 88 + }, + { + "epoch": 0.0712, + "grad_norm": 0.448955209649677, + "learning_rate": 0.00019912748620320794, + "loss": 0.8565, + "step": 89 + }, + { + "epoch": 0.072, + "grad_norm": 0.45471013020840184, + "learning_rate": 0.00019909298684723904, + "loss": 0.7553, + "step": 90 + }, + { + "epoch": 0.0728, + "grad_norm": 0.4753483930706963, + "learning_rate": 0.00019905782170140238, + "loss": 0.7902, + "step": 91 + }, + { + "epoch": 0.0736, + "grad_norm": 0.5429707171943804, + "learning_rate": 0.00019902199100196697, + "loss": 0.7649, + "step": 92 + }, + { + "epoch": 0.0744, + "grad_norm": 0.4966628028743198, + "learning_rate": 0.00019898549498967343, + "loss": 0.8799, + "step": 93 + }, + { + "epoch": 0.0752, + "grad_norm": 0.5006757379214986, + "learning_rate": 0.00019894833390973266, + "loss": 0.8537, + "step": 94 + }, + { + "epoch": 0.076, + "grad_norm": 0.48092851201041953, + "learning_rate": 0.000198910508011824, + "loss": 0.8285, + "step": 95 + }, + { + "epoch": 0.0768, + "grad_norm": 0.7589312779979084, + "learning_rate": 0.00019887201755009357, + "loss": 0.9664, + "step": 96 + }, + { + "epoch": 0.0776, + "grad_norm": 0.4798855694434043, + "learning_rate": 0.00019883286278315262, + "loss": 0.7525, + "step": 97 + }, + { + "epoch": 0.0784, + "grad_norm": 0.5167176020578753, + "learning_rate": 0.0001987930439740757, + "loss": 0.8533, + "step": 98 + }, + { + "epoch": 0.0792, + "grad_norm": 0.45898646958816514, + "learning_rate": 0.00019875256139039902, + "loss": 0.8272, + "step": 99 + }, + { + "epoch": 0.08, + "grad_norm": 0.5130366805408321, + "learning_rate": 0.00019871141530411853, + "loss": 0.8246, + "step": 100 + }, + { + "epoch": 0.0808, + "grad_norm": 0.5382568732867721, + "learning_rate": 0.00019866960599168826, + "loss": 0.8248, + "step": 101 + }, + { + "epoch": 0.0816, + "grad_norm": 0.4654109966764274, + "learning_rate": 0.0001986271337340182, + "loss": 0.8667, + "step": 102 + }, + { + "epoch": 0.0824, + "grad_norm": 0.5335849522006713, + "learning_rate": 0.0001985839988164726, + "loss": 0.8978, + "step": 103 + }, + { + "epoch": 0.0832, + "grad_norm": 0.5234000189758574, + "learning_rate": 0.00019854020152886814, + "loss": 0.8356, + "step": 104 + }, + { + "epoch": 0.084, + "grad_norm": 0.43548891193506006, + "learning_rate": 0.00019849574216547171, + "loss": 0.719, + "step": 105 + }, + { + "epoch": 0.0848, + "grad_norm": 0.5086125572177761, + "learning_rate": 0.0001984506210249986, + "loss": 0.8092, + "step": 106 + }, + { + "epoch": 0.0856, + "grad_norm": 0.4834049587544824, + "learning_rate": 0.00019840483841061058, + "loss": 0.785, + "step": 107 + }, + { + "epoch": 0.0864, + "grad_norm": 0.5223605888622092, + "learning_rate": 0.00019835839462991361, + "loss": 0.8144, + "step": 108 + }, + { + "epoch": 0.0872, + "grad_norm": 0.5897705067900536, + "learning_rate": 0.00019831128999495606, + "loss": 0.9088, + "step": 109 + }, + { + "epoch": 0.088, + "grad_norm": 0.46431911679447213, + "learning_rate": 0.00019826352482222638, + "loss": 0.8309, + "step": 110 + }, + { + "epoch": 0.0888, + "grad_norm": 0.504224351663702, + "learning_rate": 0.0001982150994326511, + "loss": 0.8071, + "step": 111 + }, + { + "epoch": 0.0896, + "grad_norm": 0.547454671929799, + "learning_rate": 0.00019816601415159263, + "loss": 0.7324, + "step": 112 + }, + { + "epoch": 0.0904, + "grad_norm": 0.514111241073896, + "learning_rate": 0.0001981162693088471, + "loss": 0.8113, + "step": 113 + }, + { + "epoch": 0.0912, + "grad_norm": 0.47314412685832746, + "learning_rate": 0.0001980658652386421, + "loss": 0.823, + "step": 114 + }, + { + "epoch": 0.092, + "grad_norm": 0.4863065522426431, + "learning_rate": 0.0001980148022796345, + "loss": 0.78, + "step": 115 + }, + { + "epoch": 0.0928, + "grad_norm": 0.4875828457610195, + "learning_rate": 0.00019796308077490817, + "loss": 0.7818, + "step": 116 + }, + { + "epoch": 0.0936, + "grad_norm": 0.485080279739068, + "learning_rate": 0.00019791070107197153, + "loss": 0.8017, + "step": 117 + }, + { + "epoch": 0.0944, + "grad_norm": 0.4550894931203216, + "learning_rate": 0.00019785766352275542, + "loss": 0.847, + "step": 118 + }, + { + "epoch": 0.0952, + "grad_norm": 0.4707139910339425, + "learning_rate": 0.0001978039684836106, + "loss": 0.7968, + "step": 119 + }, + { + "epoch": 0.096, + "grad_norm": 0.977513875288363, + "learning_rate": 0.00019774961631530545, + "loss": 0.9965, + "step": 120 + }, + { + "epoch": 0.0968, + "grad_norm": 0.45037237678267117, + "learning_rate": 0.0001976946073830234, + "loss": 0.7461, + "step": 121 + }, + { + "epoch": 0.0976, + "grad_norm": 0.4609004148948183, + "learning_rate": 0.00019763894205636072, + "loss": 0.8684, + "step": 122 + }, + { + "epoch": 0.0984, + "grad_norm": 0.5405117938056486, + "learning_rate": 0.00019758262070932375, + "loss": 0.9122, + "step": 123 + }, + { + "epoch": 0.0992, + "grad_norm": 0.4445883762341974, + "learning_rate": 0.00019752564372032657, + "loss": 0.7881, + "step": 124 + }, + { + "epoch": 0.1, + "grad_norm": 0.5076132174866915, + "learning_rate": 0.00019746801147218842, + "loss": 0.8639, + "step": 125 + }, + { + "epoch": 0.1008, + "grad_norm": 0.6126248956606299, + "learning_rate": 0.00019740972435213115, + "loss": 0.8841, + "step": 126 + }, + { + "epoch": 0.1016, + "grad_norm": 0.46261186141768107, + "learning_rate": 0.00019735078275177654, + "loss": 0.8062, + "step": 127 + }, + { + "epoch": 0.1024, + "grad_norm": 0.43630485280685666, + "learning_rate": 0.00019729118706714375, + "loss": 0.7864, + "step": 128 + }, + { + "epoch": 0.1032, + "grad_norm": 0.43348950728673896, + "learning_rate": 0.00019723093769864663, + "loss": 0.7752, + "step": 129 + }, + { + "epoch": 0.104, + "grad_norm": 0.5265221514244094, + "learning_rate": 0.00019717003505109095, + "loss": 0.8304, + "step": 130 + }, + { + "epoch": 0.1048, + "grad_norm": 0.6271969859011378, + "learning_rate": 0.0001971084795336719, + "loss": 0.7354, + "step": 131 + }, + { + "epoch": 0.1056, + "grad_norm": 0.5166949010390673, + "learning_rate": 0.00019704627155997108, + "loss": 0.7814, + "step": 132 + }, + { + "epoch": 0.1064, + "grad_norm": 0.4822743087473739, + "learning_rate": 0.00019698341154795389, + "loss": 0.7615, + "step": 133 + }, + { + "epoch": 0.1072, + "grad_norm": 0.5450969403265944, + "learning_rate": 0.00019691989991996663, + "loss": 0.9126, + "step": 134 + }, + { + "epoch": 0.108, + "grad_norm": 0.4516497659644016, + "learning_rate": 0.00019685573710273376, + "loss": 0.7809, + "step": 135 + }, + { + "epoch": 0.1088, + "grad_norm": 0.6237163004882903, + "learning_rate": 0.0001967909235273549, + "loss": 0.9205, + "step": 136 + }, + { + "epoch": 0.1096, + "grad_norm": 0.47876327682526937, + "learning_rate": 0.00019672545962930215, + "loss": 0.8275, + "step": 137 + }, + { + "epoch": 0.1104, + "grad_norm": 0.5395791344208695, + "learning_rate": 0.00019665934584841682, + "loss": 0.795, + "step": 138 + }, + { + "epoch": 0.1112, + "grad_norm": 0.5067725433393416, + "learning_rate": 0.00019659258262890683, + "loss": 0.868, + "step": 139 + }, + { + "epoch": 0.112, + "grad_norm": 0.48354919242852473, + "learning_rate": 0.00019652517041934356, + "loss": 0.8144, + "step": 140 + }, + { + "epoch": 0.1128, + "grad_norm": 0.5045229343088378, + "learning_rate": 0.00019645710967265882, + "loss": 0.7937, + "step": 141 + }, + { + "epoch": 0.1136, + "grad_norm": 0.4501365842401607, + "learning_rate": 0.00019638840084614182, + "loss": 0.7669, + "step": 142 + }, + { + "epoch": 0.1144, + "grad_norm": 0.4654460898472039, + "learning_rate": 0.00019631904440143612, + "loss": 0.8003, + "step": 143 + }, + { + "epoch": 0.1152, + "grad_norm": 0.49265876580137014, + "learning_rate": 0.00019624904080453655, + "loss": 0.7711, + "step": 144 + }, + { + "epoch": 0.116, + "grad_norm": 0.4196755153909947, + "learning_rate": 0.00019617839052578603, + "loss": 0.6961, + "step": 145 + }, + { + "epoch": 0.1168, + "grad_norm": 0.4730379510588242, + "learning_rate": 0.00019610709403987246, + "loss": 0.8488, + "step": 146 + }, + { + "epoch": 0.1176, + "grad_norm": 0.4179085152791245, + "learning_rate": 0.0001960351518258255, + "loss": 0.7634, + "step": 147 + }, + { + "epoch": 0.1184, + "grad_norm": 0.4504927184964666, + "learning_rate": 0.00019596256436701324, + "loss": 0.769, + "step": 148 + }, + { + "epoch": 0.1192, + "grad_norm": 0.47912198033551545, + "learning_rate": 0.00019588933215113926, + "loss": 0.833, + "step": 149 + }, + { + "epoch": 0.12, + "grad_norm": 0.46676570068667694, + "learning_rate": 0.000195815455670239, + "loss": 0.7524, + "step": 150 + }, + { + "epoch": 0.1208, + "grad_norm": 0.45374835180690026, + "learning_rate": 0.00019574093542067673, + "loss": 0.7972, + "step": 151 + }, + { + "epoch": 0.1216, + "grad_norm": 0.49780694456258046, + "learning_rate": 0.00019566577190314197, + "loss": 0.782, + "step": 152 + }, + { + "epoch": 0.1224, + "grad_norm": 0.46717860780736153, + "learning_rate": 0.0001955899656226464, + "loss": 0.8309, + "step": 153 + }, + { + "epoch": 0.1232, + "grad_norm": 0.4727836597310324, + "learning_rate": 0.0001955135170885202, + "loss": 0.7552, + "step": 154 + }, + { + "epoch": 0.124, + "grad_norm": 0.48113301659668845, + "learning_rate": 0.0001954364268144088, + "loss": 0.7565, + "step": 155 + }, + { + "epoch": 0.1248, + "grad_norm": 0.4320875610697008, + "learning_rate": 0.00019535869531826937, + "loss": 0.7598, + "step": 156 + }, + { + "epoch": 0.1256, + "grad_norm": 0.45990659802037936, + "learning_rate": 0.00019528032312236736, + "loss": 0.7104, + "step": 157 + }, + { + "epoch": 0.1264, + "grad_norm": 0.46416518781984156, + "learning_rate": 0.00019520131075327298, + "loss": 0.708, + "step": 158 + }, + { + "epoch": 0.1272, + "grad_norm": 0.45461241381325634, + "learning_rate": 0.00019512165874185767, + "loss": 0.7886, + "step": 159 + }, + { + "epoch": 0.128, + "grad_norm": 0.4395339059749342, + "learning_rate": 0.00019504136762329047, + "loss": 0.7879, + "step": 160 + }, + { + "epoch": 0.1288, + "grad_norm": 0.45283536508024796, + "learning_rate": 0.0001949604379370345, + "loss": 0.7552, + "step": 161 + }, + { + "epoch": 0.1296, + "grad_norm": 0.48514938779893596, + "learning_rate": 0.00019487887022684336, + "loss": 0.8396, + "step": 162 + }, + { + "epoch": 0.1304, + "grad_norm": 0.45290479179907883, + "learning_rate": 0.00019479666504075736, + "loss": 0.8046, + "step": 163 + }, + { + "epoch": 0.1312, + "grad_norm": 0.48291651301776906, + "learning_rate": 0.00019471382293110003, + "loss": 0.8381, + "step": 164 + }, + { + "epoch": 0.132, + "grad_norm": 0.43975408072203875, + "learning_rate": 0.0001946303444544741, + "loss": 0.7761, + "step": 165 + }, + { + "epoch": 0.1328, + "grad_norm": 0.5494387560382519, + "learning_rate": 0.00019454623017175812, + "loss": 0.8345, + "step": 166 + }, + { + "epoch": 0.1336, + "grad_norm": 0.44048794546963965, + "learning_rate": 0.00019446148064810242, + "loss": 0.8619, + "step": 167 + }, + { + "epoch": 0.1344, + "grad_norm": 0.42475661368960177, + "learning_rate": 0.00019437609645292546, + "loss": 0.7788, + "step": 168 + }, + { + "epoch": 0.1352, + "grad_norm": 0.47211544022836005, + "learning_rate": 0.00019429007815990993, + "loss": 0.8035, + "step": 169 + }, + { + "epoch": 0.136, + "grad_norm": 0.41005411244101797, + "learning_rate": 0.0001942034263469989, + "loss": 0.7713, + "step": 170 + }, + { + "epoch": 0.1368, + "grad_norm": 0.4798370923401185, + "learning_rate": 0.00019411614159639204, + "loss": 0.7771, + "step": 171 + }, + { + "epoch": 0.1376, + "grad_norm": 0.5489860373506683, + "learning_rate": 0.00019402822449454153, + "loss": 0.7399, + "step": 172 + }, + { + "epoch": 0.1384, + "grad_norm": 0.5949651572236011, + "learning_rate": 0.00019393967563214833, + "loss": 0.7944, + "step": 173 + }, + { + "epoch": 0.1392, + "grad_norm": 0.5103101440970763, + "learning_rate": 0.00019385049560415794, + "loss": 0.804, + "step": 174 + }, + { + "epoch": 0.14, + "grad_norm": 0.4416216088387608, + "learning_rate": 0.00019376068500975667, + "loss": 0.769, + "step": 175 + }, + { + "epoch": 0.1408, + "grad_norm": 0.42314990470772496, + "learning_rate": 0.00019367024445236754, + "loss": 0.7831, + "step": 176 + }, + { + "epoch": 0.1416, + "grad_norm": 0.4364464883815751, + "learning_rate": 0.000193579174539646, + "loss": 0.7723, + "step": 177 + }, + { + "epoch": 0.1424, + "grad_norm": 0.41870456712768306, + "learning_rate": 0.00019348747588347637, + "loss": 0.7646, + "step": 178 + }, + { + "epoch": 0.1432, + "grad_norm": 0.4325493067333497, + "learning_rate": 0.00019339514909996706, + "loss": 0.7895, + "step": 179 + }, + { + "epoch": 0.144, + "grad_norm": 0.5937916976258861, + "learning_rate": 0.00019330219480944694, + "loss": 0.8872, + "step": 180 + }, + { + "epoch": 0.1448, + "grad_norm": 0.46076168375250054, + "learning_rate": 0.00019320861363646095, + "loss": 0.7762, + "step": 181 + }, + { + "epoch": 0.1456, + "grad_norm": 0.4719415378217879, + "learning_rate": 0.00019311440620976597, + "loss": 0.8442, + "step": 182 + }, + { + "epoch": 0.1464, + "grad_norm": 0.41742393603444783, + "learning_rate": 0.00019301957316232658, + "loss": 0.7724, + "step": 183 + }, + { + "epoch": 0.1472, + "grad_norm": 0.5606094174530225, + "learning_rate": 0.0001929241151313108, + "loss": 0.8143, + "step": 184 + }, + { + "epoch": 0.148, + "grad_norm": 0.5123057991525729, + "learning_rate": 0.0001928280327580858, + "loss": 0.8788, + "step": 185 + }, + { + "epoch": 0.1488, + "grad_norm": 0.5478849614118967, + "learning_rate": 0.00019273132668821364, + "loss": 0.8339, + "step": 186 + }, + { + "epoch": 0.1496, + "grad_norm": 0.5053262511022109, + "learning_rate": 0.00019263399757144683, + "loss": 0.7908, + "step": 187 + }, + { + "epoch": 0.1504, + "grad_norm": 0.4843201758268366, + "learning_rate": 0.00019253604606172417, + "loss": 0.8058, + "step": 188 + }, + { + "epoch": 0.1512, + "grad_norm": 0.4329137931409849, + "learning_rate": 0.000192437472817166, + "loss": 0.7424, + "step": 189 + }, + { + "epoch": 0.152, + "grad_norm": 0.48018937475308404, + "learning_rate": 0.00019233827850007027, + "loss": 0.8026, + "step": 190 + }, + { + "epoch": 0.1528, + "grad_norm": 0.5833152367128422, + "learning_rate": 0.00019223846377690754, + "loss": 0.8433, + "step": 191 + }, + { + "epoch": 0.1536, + "grad_norm": 0.4464496239722511, + "learning_rate": 0.00019213802931831696, + "loss": 0.7725, + "step": 192 + }, + { + "epoch": 0.1544, + "grad_norm": 0.4501347749823692, + "learning_rate": 0.00019203697579910154, + "loss": 0.7558, + "step": 193 + }, + { + "epoch": 0.1552, + "grad_norm": 0.4389798585604824, + "learning_rate": 0.00019193530389822363, + "loss": 0.752, + "step": 194 + }, + { + "epoch": 0.156, + "grad_norm": 0.47661997805132117, + "learning_rate": 0.00019183301429880043, + "loss": 0.7626, + "step": 195 + }, + { + "epoch": 0.1568, + "grad_norm": 0.5413153605782053, + "learning_rate": 0.00019173010768809933, + "loss": 0.7964, + "step": 196 + }, + { + "epoch": 0.1576, + "grad_norm": 0.4566333020208942, + "learning_rate": 0.00019162658475753327, + "loss": 0.7959, + "step": 197 + }, + { + "epoch": 0.1584, + "grad_norm": 0.4621358452276674, + "learning_rate": 0.0001915224462026563, + "loss": 0.8675, + "step": 198 + }, + { + "epoch": 0.1592, + "grad_norm": 0.48862296044982606, + "learning_rate": 0.00019141769272315858, + "loss": 0.7648, + "step": 199 + }, + { + "epoch": 0.16, + "grad_norm": 0.4753951660463967, + "learning_rate": 0.00019131232502286188, + "loss": 0.8307, + "step": 200 + }, + { + "epoch": 0.1608, + "grad_norm": 0.5141194737862564, + "learning_rate": 0.00019120634380971496, + "loss": 0.7077, + "step": 201 + }, + { + "epoch": 0.1616, + "grad_norm": 0.45234333639641944, + "learning_rate": 0.0001910997497957885, + "loss": 0.8031, + "step": 202 + }, + { + "epoch": 0.1624, + "grad_norm": 0.4515668456684562, + "learning_rate": 0.0001909925436972706, + "loss": 0.731, + "step": 203 + }, + { + "epoch": 0.1632, + "grad_norm": 0.5737095843420752, + "learning_rate": 0.00019088472623446183, + "loss": 0.8357, + "step": 204 + }, + { + "epoch": 0.164, + "grad_norm": 0.535951024744029, + "learning_rate": 0.00019077629813177036, + "loss": 0.8049, + "step": 205 + }, + { + "epoch": 0.1648, + "grad_norm": 0.4511706176519065, + "learning_rate": 0.00019066726011770726, + "loss": 0.7383, + "step": 206 + }, + { + "epoch": 0.1656, + "grad_norm": 0.46078484358501814, + "learning_rate": 0.00019055761292488142, + "loss": 0.7057, + "step": 207 + }, + { + "epoch": 0.1664, + "grad_norm": 0.4615355759209768, + "learning_rate": 0.0001904473572899947, + "loss": 0.7867, + "step": 208 + }, + { + "epoch": 0.1672, + "grad_norm": 0.4251775169661483, + "learning_rate": 0.00019033649395383702, + "loss": 0.7808, + "step": 209 + }, + { + "epoch": 0.168, + "grad_norm": 0.5973341731981406, + "learning_rate": 0.00019022502366128135, + "loss": 0.822, + "step": 210 + }, + { + "epoch": 0.1688, + "grad_norm": 0.5855646612286085, + "learning_rate": 0.00019011294716127867, + "loss": 0.8053, + "step": 211 + }, + { + "epoch": 0.1696, + "grad_norm": 0.41727662450704456, + "learning_rate": 0.00019000026520685302, + "loss": 0.7663, + "step": 212 + }, + { + "epoch": 0.1704, + "grad_norm": 0.47845229997592925, + "learning_rate": 0.0001898869785550963, + "loss": 0.838, + "step": 213 + }, + { + "epoch": 0.1712, + "grad_norm": 0.5489140143070973, + "learning_rate": 0.0001897730879671634, + "loss": 0.8267, + "step": 214 + }, + { + "epoch": 0.172, + "grad_norm": 0.467885417875514, + "learning_rate": 0.00018965859420826684, + "loss": 0.7373, + "step": 215 + }, + { + "epoch": 0.1728, + "grad_norm": 0.479561570863446, + "learning_rate": 0.00018954349804767184, + "loss": 0.7829, + "step": 216 + }, + { + "epoch": 0.1736, + "grad_norm": 0.45746076239178723, + "learning_rate": 0.00018942780025869098, + "loss": 0.8088, + "step": 217 + }, + { + "epoch": 0.1744, + "grad_norm": 0.507631601408242, + "learning_rate": 0.00018931150161867916, + "loss": 0.8016, + "step": 218 + }, + { + "epoch": 0.1752, + "grad_norm": 0.430451314261295, + "learning_rate": 0.00018919460290902826, + "loss": 0.7789, + "step": 219 + }, + { + "epoch": 0.176, + "grad_norm": 0.4624490047965098, + "learning_rate": 0.00018907710491516199, + "loss": 0.6833, + "step": 220 + }, + { + "epoch": 0.1768, + "grad_norm": 0.5668639057309102, + "learning_rate": 0.0001889590084265304, + "loss": 0.8195, + "step": 221 + }, + { + "epoch": 0.1776, + "grad_norm": 0.5179540680889814, + "learning_rate": 0.0001888403142366049, + "loss": 0.7803, + "step": 222 + }, + { + "epoch": 0.1784, + "grad_norm": 0.466776035445987, + "learning_rate": 0.0001887210231428727, + "loss": 0.787, + "step": 223 + }, + { + "epoch": 0.1792, + "grad_norm": 0.4677485384390769, + "learning_rate": 0.00018860113594683148, + "loss": 0.8059, + "step": 224 + }, + { + "epoch": 0.18, + "grad_norm": 0.5200907173542695, + "learning_rate": 0.0001884806534539841, + "loss": 0.8346, + "step": 225 + }, + { + "epoch": 0.1808, + "grad_norm": 0.5263360649310521, + "learning_rate": 0.00018835957647383303, + "loss": 0.8515, + "step": 226 + }, + { + "epoch": 0.1816, + "grad_norm": 0.5092342050626046, + "learning_rate": 0.0001882379058198751, + "loss": 0.78, + "step": 227 + }, + { + "epoch": 0.1824, + "grad_norm": 0.45337247035771755, + "learning_rate": 0.00018811564230959588, + "loss": 0.7766, + "step": 228 + }, + { + "epoch": 0.1832, + "grad_norm": 0.48261592566978495, + "learning_rate": 0.00018799278676446423, + "loss": 0.7214, + "step": 229 + }, + { + "epoch": 0.184, + "grad_norm": 0.4936136954755711, + "learning_rate": 0.00018786934000992688, + "loss": 0.7524, + "step": 230 + }, + { + "epoch": 0.1848, + "grad_norm": 0.4352438974202268, + "learning_rate": 0.00018774530287540278, + "loss": 0.8186, + "step": 231 + }, + { + "epoch": 0.1856, + "grad_norm": 0.5337821387174219, + "learning_rate": 0.00018762067619427746, + "loss": 0.8484, + "step": 232 + }, + { + "epoch": 0.1864, + "grad_norm": 0.5624469243195404, + "learning_rate": 0.00018749546080389757, + "loss": 0.846, + "step": 233 + }, + { + "epoch": 0.1872, + "grad_norm": 0.4029135127012289, + "learning_rate": 0.00018736965754556528, + "loss": 0.7663, + "step": 234 + }, + { + "epoch": 0.188, + "grad_norm": 0.5380877662892091, + "learning_rate": 0.00018724326726453244, + "loss": 0.8101, + "step": 235 + }, + { + "epoch": 0.1888, + "grad_norm": 0.5722178777673009, + "learning_rate": 0.00018711629080999504, + "loss": 0.7683, + "step": 236 + }, + { + "epoch": 0.1896, + "grad_norm": 0.5379554537100044, + "learning_rate": 0.00018698872903508755, + "loss": 0.8834, + "step": 237 + }, + { + "epoch": 0.1904, + "grad_norm": 0.5568478872582503, + "learning_rate": 0.00018686058279687698, + "loss": 0.8667, + "step": 238 + }, + { + "epoch": 0.1912, + "grad_norm": 0.5577808444837524, + "learning_rate": 0.0001867318529563574, + "loss": 0.7033, + "step": 239 + }, + { + "epoch": 0.192, + "grad_norm": 0.4962622589622996, + "learning_rate": 0.00018660254037844388, + "loss": 0.7538, + "step": 240 + }, + { + "epoch": 0.1928, + "grad_norm": 0.46147950362919277, + "learning_rate": 0.00018647264593196688, + "loss": 0.7837, + "step": 241 + }, + { + "epoch": 0.1936, + "grad_norm": 0.5015852768900653, + "learning_rate": 0.00018634217048966637, + "loss": 0.824, + "step": 242 + }, + { + "epoch": 0.1944, + "grad_norm": 0.46416418218549393, + "learning_rate": 0.00018621111492818585, + "loss": 0.7756, + "step": 243 + }, + { + "epoch": 0.1952, + "grad_norm": 0.497949749763509, + "learning_rate": 0.0001860794801280666, + "loss": 0.8204, + "step": 244 + }, + { + "epoch": 0.196, + "grad_norm": 0.6079261447552436, + "learning_rate": 0.00018594726697374175, + "loss": 0.8981, + "step": 245 + }, + { + "epoch": 0.1968, + "grad_norm": 0.526743695859469, + "learning_rate": 0.0001858144763535302, + "loss": 0.8355, + "step": 246 + }, + { + "epoch": 0.1976, + "grad_norm": 0.46954162920695786, + "learning_rate": 0.0001856811091596308, + "loss": 0.7522, + "step": 247 + }, + { + "epoch": 0.1984, + "grad_norm": 0.4901504125403584, + "learning_rate": 0.0001855471662881164, + "loss": 0.8489, + "step": 248 + }, + { + "epoch": 0.1992, + "grad_norm": 0.4698272073878121, + "learning_rate": 0.00018541264863892754, + "loss": 0.824, + "step": 249 + }, + { + "epoch": 0.2, + "grad_norm": 0.43217871238602223, + "learning_rate": 0.00018527755711586678, + "loss": 0.8172, + "step": 250 + }, + { + "epoch": 0.2008, + "grad_norm": 0.47857698799982135, + "learning_rate": 0.00018514189262659235, + "loss": 0.8286, + "step": 251 + }, + { + "epoch": 0.2016, + "grad_norm": 0.46748873118066614, + "learning_rate": 0.00018500565608261214, + "loss": 0.8197, + "step": 252 + }, + { + "epoch": 0.2024, + "grad_norm": 0.5100842368597939, + "learning_rate": 0.00018486884839927768, + "loss": 0.8053, + "step": 253 + }, + { + "epoch": 0.2032, + "grad_norm": 0.5672187705898414, + "learning_rate": 0.00018473147049577774, + "loss": 0.7568, + "step": 254 + }, + { + "epoch": 0.204, + "grad_norm": 0.4526590684351888, + "learning_rate": 0.0001845935232951325, + "loss": 0.7985, + "step": 255 + }, + { + "epoch": 0.2048, + "grad_norm": 0.46330274343290273, + "learning_rate": 0.00018445500772418697, + "loss": 0.8475, + "step": 256 + }, + { + "epoch": 0.2056, + "grad_norm": 0.4725498131891526, + "learning_rate": 0.00018431592471360503, + "loss": 0.7905, + "step": 257 + }, + { + "epoch": 0.2064, + "grad_norm": 0.4428375291128829, + "learning_rate": 0.00018417627519786315, + "loss": 0.7589, + "step": 258 + }, + { + "epoch": 0.2072, + "grad_norm": 0.5337519447527753, + "learning_rate": 0.000184036060115244, + "loss": 0.8534, + "step": 259 + }, + { + "epoch": 0.208, + "grad_norm": 0.47327745791785747, + "learning_rate": 0.00018389528040783012, + "loss": 0.7134, + "step": 260 + }, + { + "epoch": 0.2088, + "grad_norm": 0.44948160295387496, + "learning_rate": 0.00018375393702149787, + "loss": 0.7707, + "step": 261 + }, + { + "epoch": 0.2096, + "grad_norm": 0.5570473802648406, + "learning_rate": 0.00018361203090591071, + "loss": 0.8728, + "step": 262 + }, + { + "epoch": 0.2104, + "grad_norm": 0.5308676094602198, + "learning_rate": 0.00018346956301451304, + "loss": 0.8574, + "step": 263 + }, + { + "epoch": 0.2112, + "grad_norm": 0.5321765551025057, + "learning_rate": 0.00018332653430452376, + "loss": 0.8354, + "step": 264 + }, + { + "epoch": 0.212, + "grad_norm": 0.46111601239828837, + "learning_rate": 0.00018318294573692985, + "loss": 0.7851, + "step": 265 + }, + { + "epoch": 0.2128, + "grad_norm": 0.4354857475616162, + "learning_rate": 0.00018303879827647975, + "loss": 0.7556, + "step": 266 + }, + { + "epoch": 0.2136, + "grad_norm": 0.47259238349750865, + "learning_rate": 0.0001828940928916772, + "loss": 0.7517, + "step": 267 + }, + { + "epoch": 0.2144, + "grad_norm": 0.4477930536139435, + "learning_rate": 0.00018274883055477436, + "loss": 0.7668, + "step": 268 + }, + { + "epoch": 0.2152, + "grad_norm": 0.5230405087481416, + "learning_rate": 0.00018260301224176558, + "loss": 0.822, + "step": 269 + }, + { + "epoch": 0.216, + "grad_norm": 0.4631954532874808, + "learning_rate": 0.00018245663893238075, + "loss": 0.7812, + "step": 270 + }, + { + "epoch": 0.2168, + "grad_norm": 0.41168641496446173, + "learning_rate": 0.00018230971161007853, + "loss": 0.8112, + "step": 271 + }, + { + "epoch": 0.2176, + "grad_norm": 0.4832045243793889, + "learning_rate": 0.00018216223126204007, + "loss": 0.7872, + "step": 272 + }, + { + "epoch": 0.2184, + "grad_norm": 0.39738429395928343, + "learning_rate": 0.00018201419887916214, + "loss": 0.7565, + "step": 273 + }, + { + "epoch": 0.2192, + "grad_norm": 0.6618431513711611, + "learning_rate": 0.00018186561545605054, + "loss": 0.8073, + "step": 274 + }, + { + "epoch": 0.22, + "grad_norm": 0.6426765850504291, + "learning_rate": 0.00018171648199101346, + "loss": 0.8753, + "step": 275 + }, + { + "epoch": 0.2208, + "grad_norm": 0.4880857522804306, + "learning_rate": 0.00018156679948605467, + "loss": 0.764, + "step": 276 + }, + { + "epoch": 0.2216, + "grad_norm": 0.45007102188173415, + "learning_rate": 0.00018141656894686689, + "loss": 0.7981, + "step": 277 + }, + { + "epoch": 0.2224, + "grad_norm": 0.4220971270876302, + "learning_rate": 0.00018126579138282503, + "loss": 0.7344, + "step": 278 + }, + { + "epoch": 0.2232, + "grad_norm": 0.493693810778044, + "learning_rate": 0.00018111446780697929, + "loss": 0.8367, + "step": 279 + }, + { + "epoch": 0.224, + "grad_norm": 0.39592572323090214, + "learning_rate": 0.0001809625992360485, + "loss": 0.7349, + "step": 280 + }, + { + "epoch": 0.2248, + "grad_norm": 0.592672121309063, + "learning_rate": 0.00018081018669041324, + "loss": 0.9222, + "step": 281 + }, + { + "epoch": 0.2256, + "grad_norm": 0.48853563828611635, + "learning_rate": 0.00018065723119410884, + "loss": 0.7647, + "step": 282 + }, + { + "epoch": 0.2264, + "grad_norm": 0.5099885648250655, + "learning_rate": 0.00018050373377481878, + "loss": 0.8057, + "step": 283 + }, + { + "epoch": 0.2272, + "grad_norm": 0.47344188754976313, + "learning_rate": 0.00018034969546386757, + "loss": 0.7374, + "step": 284 + }, + { + "epoch": 0.228, + "grad_norm": 0.5450900217778545, + "learning_rate": 0.0001801951172962139, + "loss": 0.8305, + "step": 285 + }, + { + "epoch": 0.2288, + "grad_norm": 0.500235444800896, + "learning_rate": 0.0001800400003104436, + "loss": 0.7839, + "step": 286 + }, + { + "epoch": 0.2296, + "grad_norm": 0.4254375929183266, + "learning_rate": 0.0001798843455487629, + "loss": 0.7141, + "step": 287 + }, + { + "epoch": 0.2304, + "grad_norm": 0.4931940454951766, + "learning_rate": 0.00017972815405699103, + "loss": 0.7789, + "step": 288 + }, + { + "epoch": 0.2312, + "grad_norm": 0.4705259096844537, + "learning_rate": 0.00017957142688455362, + "loss": 0.8136, + "step": 289 + }, + { + "epoch": 0.232, + "grad_norm": 0.5208499422721253, + "learning_rate": 0.00017941416508447536, + "loss": 0.7836, + "step": 290 + }, + { + "epoch": 0.2328, + "grad_norm": 0.5676233223845931, + "learning_rate": 0.00017925636971337304, + "loss": 0.7864, + "step": 291 + }, + { + "epoch": 0.2336, + "grad_norm": 0.5591655871222605, + "learning_rate": 0.0001790980418314484, + "loss": 0.8028, + "step": 292 + }, + { + "epoch": 0.2344, + "grad_norm": 0.4754102096910578, + "learning_rate": 0.00017893918250248104, + "loss": 0.7925, + "step": 293 + }, + { + "epoch": 0.2352, + "grad_norm": 0.5643583330027085, + "learning_rate": 0.00017877979279382135, + "loss": 0.8378, + "step": 294 + }, + { + "epoch": 0.236, + "grad_norm": 0.47043092011499726, + "learning_rate": 0.00017861987377638312, + "loss": 0.8159, + "step": 295 + }, + { + "epoch": 0.2368, + "grad_norm": 0.5322038857940076, + "learning_rate": 0.0001784594265246366, + "loss": 0.826, + "step": 296 + }, + { + "epoch": 0.2376, + "grad_norm": 0.4484512256373065, + "learning_rate": 0.0001782984521166011, + "loss": 0.7877, + "step": 297 + }, + { + "epoch": 0.2384, + "grad_norm": 0.45341619903015334, + "learning_rate": 0.0001781369516338378, + "loss": 0.7785, + "step": 298 + }, + { + "epoch": 0.2392, + "grad_norm": 0.48637062714931634, + "learning_rate": 0.00017797492616144256, + "loss": 0.7895, + "step": 299 + }, + { + "epoch": 0.24, + "grad_norm": 0.47476787977687446, + "learning_rate": 0.00017781237678803847, + "loss": 0.7731, + "step": 300 + }, + { + "epoch": 0.2408, + "grad_norm": 0.490442660613966, + "learning_rate": 0.00017764930460576866, + "loss": 0.7661, + "step": 301 + }, + { + "epoch": 0.2416, + "grad_norm": 0.48217832291846996, + "learning_rate": 0.000177485710710289, + "loss": 0.7772, + "step": 302 + }, + { + "epoch": 0.2424, + "grad_norm": 0.5718547640020041, + "learning_rate": 0.00017732159620076053, + "loss": 0.7886, + "step": 303 + }, + { + "epoch": 0.2432, + "grad_norm": 0.4790260871316992, + "learning_rate": 0.00017715696217984235, + "loss": 0.7743, + "step": 304 + }, + { + "epoch": 0.244, + "grad_norm": 0.538171029229575, + "learning_rate": 0.00017699180975368396, + "loss": 0.7518, + "step": 305 + }, + { + "epoch": 0.2448, + "grad_norm": 0.47444774727621175, + "learning_rate": 0.00017682614003191807, + "loss": 0.8274, + "step": 306 + }, + { + "epoch": 0.2456, + "grad_norm": 0.4641069586982883, + "learning_rate": 0.00017665995412765285, + "loss": 0.7492, + "step": 307 + }, + { + "epoch": 0.2464, + "grad_norm": 0.39460271353192916, + "learning_rate": 0.00017649325315746478, + "loss": 0.7454, + "step": 308 + }, + { + "epoch": 0.2472, + "grad_norm": 0.49461182872373133, + "learning_rate": 0.00017632603824139085, + "loss": 0.8201, + "step": 309 + }, + { + "epoch": 0.248, + "grad_norm": 0.47081085668362577, + "learning_rate": 0.0001761583105029213, + "loss": 0.7969, + "step": 310 + }, + { + "epoch": 0.2488, + "grad_norm": 0.4681455790198126, + "learning_rate": 0.0001759900710689918, + "loss": 0.768, + "step": 311 + }, + { + "epoch": 0.2496, + "grad_norm": 0.42370965593642185, + "learning_rate": 0.00017582132106997616, + "loss": 0.8096, + "step": 312 + }, + { + "epoch": 0.2504, + "grad_norm": 0.5735821745386214, + "learning_rate": 0.00017565206163967846, + "loss": 0.7375, + "step": 313 + }, + { + "epoch": 0.2512, + "grad_norm": 0.42007123640118516, + "learning_rate": 0.00017548229391532572, + "loss": 0.8235, + "step": 314 + }, + { + "epoch": 0.252, + "grad_norm": 0.4465925599641599, + "learning_rate": 0.00017531201903755994, + "loss": 0.766, + "step": 315 + }, + { + "epoch": 0.2528, + "grad_norm": 0.531384270972724, + "learning_rate": 0.00017514123815043074, + "loss": 0.8441, + "step": 316 + }, + { + "epoch": 0.2536, + "grad_norm": 0.424343521475695, + "learning_rate": 0.00017496995240138744, + "loss": 0.8606, + "step": 317 + }, + { + "epoch": 0.2544, + "grad_norm": 0.3595778823729719, + "learning_rate": 0.00017479816294127152, + "loss": 0.7174, + "step": 318 + }, + { + "epoch": 0.2552, + "grad_norm": 0.48050316531217974, + "learning_rate": 0.00017462587092430875, + "loss": 0.8123, + "step": 319 + }, + { + "epoch": 0.256, + "grad_norm": 0.5248556492574765, + "learning_rate": 0.0001744530775081015, + "loss": 0.8686, + "step": 320 + }, + { + "epoch": 0.2568, + "grad_norm": 0.45602577632397057, + "learning_rate": 0.00017427978385362112, + "loss": 0.783, + "step": 321 + }, + { + "epoch": 0.2576, + "grad_norm": 0.5427847444476832, + "learning_rate": 0.0001741059911251997, + "loss": 0.851, + "step": 322 + }, + { + "epoch": 0.2584, + "grad_norm": 0.5117593054863468, + "learning_rate": 0.0001739317004905227, + "loss": 0.7989, + "step": 323 + }, + { + "epoch": 0.2592, + "grad_norm": 0.45803642501159125, + "learning_rate": 0.000173756913120621, + "loss": 0.7122, + "step": 324 + }, + { + "epoch": 0.26, + "grad_norm": 0.45691256458822394, + "learning_rate": 0.00017358163018986282, + "loss": 0.7875, + "step": 325 + }, + { + "epoch": 0.2608, + "grad_norm": 0.4623278173666505, + "learning_rate": 0.00017340585287594604, + "loss": 0.7427, + "step": 326 + }, + { + "epoch": 0.2616, + "grad_norm": 0.4557973067784474, + "learning_rate": 0.00017322958235989016, + "loss": 0.7717, + "step": 327 + }, + { + "epoch": 0.2624, + "grad_norm": 0.4684151122111679, + "learning_rate": 0.0001730528198260285, + "loss": 0.7318, + "step": 328 + }, + { + "epoch": 0.2632, + "grad_norm": 0.5180386857317837, + "learning_rate": 0.00017287556646200018, + "loss": 0.8477, + "step": 329 + }, + { + "epoch": 0.264, + "grad_norm": 0.5704168958071772, + "learning_rate": 0.00017269782345874203, + "loss": 0.8237, + "step": 330 + }, + { + "epoch": 0.2648, + "grad_norm": 0.48729965053396485, + "learning_rate": 0.00017251959201048083, + "loss": 0.7924, + "step": 331 + }, + { + "epoch": 0.2656, + "grad_norm": 0.46909698585705867, + "learning_rate": 0.00017234087331472497, + "loss": 0.8054, + "step": 332 + }, + { + "epoch": 0.2664, + "grad_norm": 0.40570176445398337, + "learning_rate": 0.00017216166857225674, + "loss": 0.7301, + "step": 333 + }, + { + "epoch": 0.2672, + "grad_norm": 0.43930179100756683, + "learning_rate": 0.00017198197898712404, + "loss": 0.7743, + "step": 334 + }, + { + "epoch": 0.268, + "grad_norm": 0.45451598783625535, + "learning_rate": 0.00017180180576663228, + "loss": 0.828, + "step": 335 + }, + { + "epoch": 0.2688, + "grad_norm": 0.4827073570209465, + "learning_rate": 0.00017162115012133643, + "loss": 0.7993, + "step": 336 + }, + { + "epoch": 0.2696, + "grad_norm": 0.39628655140142455, + "learning_rate": 0.00017144001326503273, + "loss": 0.6848, + "step": 337 + }, + { + "epoch": 0.2704, + "grad_norm": 0.43425422771628786, + "learning_rate": 0.00017125839641475072, + "loss": 0.8164, + "step": 338 + }, + { + "epoch": 0.2712, + "grad_norm": 0.39861016980449127, + "learning_rate": 0.00017107630079074478, + "loss": 0.7698, + "step": 339 + }, + { + "epoch": 0.272, + "grad_norm": 0.4581271153457384, + "learning_rate": 0.00017089372761648616, + "loss": 0.8106, + "step": 340 + }, + { + "epoch": 0.2728, + "grad_norm": 0.4195059927827672, + "learning_rate": 0.00017071067811865476, + "loss": 0.779, + "step": 341 + }, + { + "epoch": 0.2736, + "grad_norm": 0.4117022682648752, + "learning_rate": 0.00017052715352713075, + "loss": 0.6964, + "step": 342 + }, + { + "epoch": 0.2744, + "grad_norm": 0.4440470842584414, + "learning_rate": 0.00017034315507498635, + "loss": 0.7685, + "step": 343 + }, + { + "epoch": 0.2752, + "grad_norm": 0.49439139053437337, + "learning_rate": 0.00017015868399847768, + "loss": 0.8156, + "step": 344 + }, + { + "epoch": 0.276, + "grad_norm": 0.504797045407952, + "learning_rate": 0.00016997374153703625, + "loss": 0.8041, + "step": 345 + }, + { + "epoch": 0.2768, + "grad_norm": 0.461560523441606, + "learning_rate": 0.00016978832893326074, + "loss": 0.7754, + "step": 346 + }, + { + "epoch": 0.2776, + "grad_norm": 0.4871876444141448, + "learning_rate": 0.00016960244743290868, + "loss": 0.778, + "step": 347 + }, + { + "epoch": 0.2784, + "grad_norm": 0.5133143350269246, + "learning_rate": 0.00016941609828488807, + "loss": 0.8086, + "step": 348 + }, + { + "epoch": 0.2792, + "grad_norm": 0.3987091569511243, + "learning_rate": 0.00016922928274124886, + "loss": 0.6926, + "step": 349 + }, + { + "epoch": 0.28, + "grad_norm": 0.4196731036587334, + "learning_rate": 0.0001690420020571747, + "loss": 0.7249, + "step": 350 + }, + { + "epoch": 0.2808, + "grad_norm": 0.6019163224867493, + "learning_rate": 0.00016885425749097444, + "loss": 0.8312, + "step": 351 + }, + { + "epoch": 0.2816, + "grad_norm": 0.4795991675982521, + "learning_rate": 0.0001686660503040737, + "loss": 0.7772, + "step": 352 + }, + { + "epoch": 0.2824, + "grad_norm": 0.4353338870569304, + "learning_rate": 0.00016847738176100632, + "loss": 0.7211, + "step": 353 + }, + { + "epoch": 0.2832, + "grad_norm": 0.46691733568868105, + "learning_rate": 0.00016828825312940592, + "loss": 0.7864, + "step": 354 + }, + { + "epoch": 0.284, + "grad_norm": 0.3857036487152233, + "learning_rate": 0.0001680986656799975, + "loss": 0.7111, + "step": 355 + }, + { + "epoch": 0.2848, + "grad_norm": 0.4495366286474417, + "learning_rate": 0.0001679086206865886, + "loss": 0.759, + "step": 356 + }, + { + "epoch": 0.2856, + "grad_norm": 0.4744694748595945, + "learning_rate": 0.00016771811942606108, + "loss": 0.7717, + "step": 357 + }, + { + "epoch": 0.2864, + "grad_norm": 0.4704097676256151, + "learning_rate": 0.00016752716317836229, + "loss": 0.7382, + "step": 358 + }, + { + "epoch": 0.2872, + "grad_norm": 0.36811260781930977, + "learning_rate": 0.00016733575322649657, + "loss": 0.6893, + "step": 359 + }, + { + "epoch": 0.288, + "grad_norm": 0.4888253364404233, + "learning_rate": 0.0001671438908565167, + "loss": 0.8192, + "step": 360 + }, + { + "epoch": 0.2888, + "grad_norm": 0.4451286475399139, + "learning_rate": 0.00016695157735751513, + "loss": 0.7518, + "step": 361 + }, + { + "epoch": 0.2896, + "grad_norm": 0.5475940460741608, + "learning_rate": 0.00016675881402161536, + "loss": 0.7607, + "step": 362 + }, + { + "epoch": 0.2904, + "grad_norm": 0.40993584937801125, + "learning_rate": 0.0001665656021439633, + "loss": 0.763, + "step": 363 + }, + { + "epoch": 0.2912, + "grad_norm": 0.48824216934054654, + "learning_rate": 0.0001663719430227186, + "loss": 0.7755, + "step": 364 + }, + { + "epoch": 0.292, + "grad_norm": 0.515099943428024, + "learning_rate": 0.00016617783795904565, + "loss": 0.7924, + "step": 365 + }, + { + "epoch": 0.2928, + "grad_norm": 0.43134057267310605, + "learning_rate": 0.00016598328825710533, + "loss": 0.7754, + "step": 366 + }, + { + "epoch": 0.2936, + "grad_norm": 0.5504098337356608, + "learning_rate": 0.00016578829522404583, + "loss": 0.8429, + "step": 367 + }, + { + "epoch": 0.2944, + "grad_norm": 0.4940928471631194, + "learning_rate": 0.000165592860169994, + "loss": 0.7701, + "step": 368 + }, + { + "epoch": 0.2952, + "grad_norm": 0.5078007893178331, + "learning_rate": 0.00016539698440804661, + "loss": 0.8127, + "step": 369 + }, + { + "epoch": 0.296, + "grad_norm": 0.40809202305330516, + "learning_rate": 0.00016520066925426144, + "loss": 0.7131, + "step": 370 + }, + { + "epoch": 0.2968, + "grad_norm": 0.5267721859621575, + "learning_rate": 0.0001650039160276485, + "loss": 0.7879, + "step": 371 + }, + { + "epoch": 0.2976, + "grad_norm": 0.555623416935034, + "learning_rate": 0.0001648067260501611, + "loss": 0.7464, + "step": 372 + }, + { + "epoch": 0.2984, + "grad_norm": 0.47242108718334574, + "learning_rate": 0.0001646091006466871, + "loss": 0.7482, + "step": 373 + }, + { + "epoch": 0.2992, + "grad_norm": 0.434436157164965, + "learning_rate": 0.0001644110411450398, + "loss": 0.797, + "step": 374 + }, + { + "epoch": 0.3, + "grad_norm": 0.5143690476812447, + "learning_rate": 0.00016421254887594917, + "loss": 0.7908, + "step": 375 + }, + { + "epoch": 0.3008, + "grad_norm": 0.4712395271201534, + "learning_rate": 0.00016401362517305296, + "loss": 0.7755, + "step": 376 + }, + { + "epoch": 0.3016, + "grad_norm": 0.46279760331106856, + "learning_rate": 0.00016381427137288754, + "loss": 0.7637, + "step": 377 + }, + { + "epoch": 0.3024, + "grad_norm": 0.4952490838053615, + "learning_rate": 0.00016361448881487914, + "loss": 0.7626, + "step": 378 + }, + { + "epoch": 0.3032, + "grad_norm": 0.4603696659954351, + "learning_rate": 0.0001634142788413346, + "loss": 0.7616, + "step": 379 + }, + { + "epoch": 0.304, + "grad_norm": 0.45145665775466476, + "learning_rate": 0.00016321364279743266, + "loss": 0.7853, + "step": 380 + }, + { + "epoch": 0.3048, + "grad_norm": 0.4922555531672983, + "learning_rate": 0.00016301258203121462, + "loss": 0.7621, + "step": 381 + }, + { + "epoch": 0.3056, + "grad_norm": 0.4220625389408268, + "learning_rate": 0.0001628110978935756, + "loss": 0.7621, + "step": 382 + }, + { + "epoch": 0.3064, + "grad_norm": 0.44844932466206555, + "learning_rate": 0.00016260919173825508, + "loss": 0.7799, + "step": 383 + }, + { + "epoch": 0.3072, + "grad_norm": 0.4679186618556549, + "learning_rate": 0.00016240686492182804, + "loss": 0.8217, + "step": 384 + }, + { + "epoch": 0.308, + "grad_norm": 0.419269058125717, + "learning_rate": 0.00016220411880369601, + "loss": 0.7608, + "step": 385 + }, + { + "epoch": 0.3088, + "grad_norm": 0.41740087340015597, + "learning_rate": 0.00016200095474607753, + "loss": 0.7297, + "step": 386 + }, + { + "epoch": 0.3096, + "grad_norm": 0.45791369382882224, + "learning_rate": 0.00016179737411399926, + "loss": 0.765, + "step": 387 + }, + { + "epoch": 0.3104, + "grad_norm": 0.5255076947887414, + "learning_rate": 0.00016159337827528685, + "loss": 0.8014, + "step": 388 + }, + { + "epoch": 0.3112, + "grad_norm": 0.459584359122999, + "learning_rate": 0.00016138896860055555, + "loss": 0.8193, + "step": 389 + }, + { + "epoch": 0.312, + "grad_norm": 0.47469393989155145, + "learning_rate": 0.0001611841464632011, + "loss": 0.7453, + "step": 390 + }, + { + "epoch": 0.3128, + "grad_norm": 0.4131509273470646, + "learning_rate": 0.00016097891323939062, + "loss": 0.762, + "step": 391 + }, + { + "epoch": 0.3136, + "grad_norm": 0.42581743577782966, + "learning_rate": 0.0001607732703080532, + "loss": 0.7363, + "step": 392 + }, + { + "epoch": 0.3144, + "grad_norm": 0.38516684481175223, + "learning_rate": 0.00016056721905087056, + "loss": 0.7228, + "step": 393 + }, + { + "epoch": 0.3152, + "grad_norm": 0.3836176742214281, + "learning_rate": 0.00016036076085226814, + "loss": 0.6523, + "step": 394 + }, + { + "epoch": 0.316, + "grad_norm": 0.4915656142232788, + "learning_rate": 0.00016015389709940538, + "loss": 0.7871, + "step": 395 + }, + { + "epoch": 0.3168, + "grad_norm": 0.48349109519241057, + "learning_rate": 0.0001599466291821666, + "loss": 0.7877, + "step": 396 + }, + { + "epoch": 0.3176, + "grad_norm": 0.44163695697766775, + "learning_rate": 0.0001597389584931517, + "loss": 0.7428, + "step": 397 + }, + { + "epoch": 0.3184, + "grad_norm": 0.4328561392531088, + "learning_rate": 0.0001595308864276666, + "loss": 0.7502, + "step": 398 + }, + { + "epoch": 0.3192, + "grad_norm": 0.4938027784803824, + "learning_rate": 0.0001593224143837142, + "loss": 0.7887, + "step": 399 + }, + { + "epoch": 0.32, + "grad_norm": 0.4297049972537339, + "learning_rate": 0.0001591135437619847, + "loss": 0.8057, + "step": 400 + }, + { + "epoch": 0.3208, + "grad_norm": 0.44007381681857877, + "learning_rate": 0.00015890427596584617, + "loss": 0.7985, + "step": 401 + }, + { + "epoch": 0.3216, + "grad_norm": 0.3917744067596416, + "learning_rate": 0.0001586946124013354, + "loss": 0.7456, + "step": 402 + }, + { + "epoch": 0.3224, + "grad_norm": 0.41200502929810034, + "learning_rate": 0.00015848455447714822, + "loss": 0.7607, + "step": 403 + }, + { + "epoch": 0.3232, + "grad_norm": 0.5236098268934606, + "learning_rate": 0.0001582741036046301, + "loss": 0.8249, + "step": 404 + }, + { + "epoch": 0.324, + "grad_norm": 0.4655752655037792, + "learning_rate": 0.00015806326119776663, + "loss": 0.6992, + "step": 405 + }, + { + "epoch": 0.3248, + "grad_norm": 0.4956055625853647, + "learning_rate": 0.00015785202867317407, + "loss": 0.7292, + "step": 406 + }, + { + "epoch": 0.3256, + "grad_norm": 0.45043226529870933, + "learning_rate": 0.00015764040745008988, + "loss": 0.7357, + "step": 407 + }, + { + "epoch": 0.3264, + "grad_norm": 0.4192682347287542, + "learning_rate": 0.00015742839895036305, + "loss": 0.8331, + "step": 408 + }, + { + "epoch": 0.3272, + "grad_norm": 0.38132386168633237, + "learning_rate": 0.00015721600459844468, + "loss": 0.6636, + "step": 409 + }, + { + "epoch": 0.328, + "grad_norm": 0.46353702228952487, + "learning_rate": 0.00015700322582137827, + "loss": 0.7522, + "step": 410 + }, + { + "epoch": 0.3288, + "grad_norm": 0.4144194902717004, + "learning_rate": 0.00015679006404879033, + "loss": 0.7369, + "step": 411 + }, + { + "epoch": 0.3296, + "grad_norm": 0.4469177538620975, + "learning_rate": 0.0001565765207128805, + "loss": 0.7435, + "step": 412 + }, + { + "epoch": 0.3304, + "grad_norm": 0.3964842354715512, + "learning_rate": 0.00015636259724841222, + "loss": 0.6633, + "step": 413 + }, + { + "epoch": 0.3312, + "grad_norm": 0.4363718416537173, + "learning_rate": 0.0001561482950927029, + "loss": 0.8078, + "step": 414 + }, + { + "epoch": 0.332, + "grad_norm": 0.42421510982110067, + "learning_rate": 0.00015593361568561428, + "loss": 0.6786, + "step": 415 + }, + { + "epoch": 0.3328, + "grad_norm": 0.3736016797441009, + "learning_rate": 0.00015571856046954285, + "loss": 0.689, + "step": 416 + }, + { + "epoch": 0.3336, + "grad_norm": 0.45707318258464946, + "learning_rate": 0.0001555031308894101, + "loss": 0.8534, + "step": 417 + }, + { + "epoch": 0.3344, + "grad_norm": 0.5421905187769916, + "learning_rate": 0.00015528732839265272, + "loss": 0.8016, + "step": 418 + }, + { + "epoch": 0.3352, + "grad_norm": 0.5294648573601008, + "learning_rate": 0.0001550711544292131, + "loss": 0.8031, + "step": 419 + }, + { + "epoch": 0.336, + "grad_norm": 0.5202468416897913, + "learning_rate": 0.0001548546104515294, + "loss": 0.8505, + "step": 420 + }, + { + "epoch": 0.3368, + "grad_norm": 0.43684561909750713, + "learning_rate": 0.00015463769791452574, + "loss": 0.7959, + "step": 421 + }, + { + "epoch": 0.3376, + "grad_norm": 0.41976006424603907, + "learning_rate": 0.00015442041827560274, + "loss": 0.7377, + "step": 422 + }, + { + "epoch": 0.3384, + "grad_norm": 0.3750568280997164, + "learning_rate": 0.00015420277299462736, + "loss": 0.6778, + "step": 423 + }, + { + "epoch": 0.3392, + "grad_norm": 0.4844704168166154, + "learning_rate": 0.00015398476353392323, + "loss": 0.7729, + "step": 424 + }, + { + "epoch": 0.34, + "grad_norm": 0.4573814253014277, + "learning_rate": 0.00015376639135826107, + "loss": 0.7295, + "step": 425 + }, + { + "epoch": 0.3408, + "grad_norm": 0.4229278166795163, + "learning_rate": 0.00015354765793484834, + "loss": 0.779, + "step": 426 + }, + { + "epoch": 0.3416, + "grad_norm": 0.39134102147840255, + "learning_rate": 0.00015332856473331978, + "loss": 0.7282, + "step": 427 + }, + { + "epoch": 0.3424, + "grad_norm": 0.4018318818965121, + "learning_rate": 0.00015310911322572753, + "loss": 0.7387, + "step": 428 + }, + { + "epoch": 0.3432, + "grad_norm": 0.4525247116066564, + "learning_rate": 0.00015288930488653094, + "loss": 0.7727, + "step": 429 + }, + { + "epoch": 0.344, + "grad_norm": 0.48012378662271826, + "learning_rate": 0.000152669141192587, + "loss": 0.6921, + "step": 430 + }, + { + "epoch": 0.3448, + "grad_norm": 0.4324585146029964, + "learning_rate": 0.0001524486236231402, + "loss": 0.766, + "step": 431 + }, + { + "epoch": 0.3456, + "grad_norm": 0.40183349656608747, + "learning_rate": 0.00015222775365981273, + "loss": 0.7135, + "step": 432 + }, + { + "epoch": 0.3464, + "grad_norm": 0.4214155735954674, + "learning_rate": 0.00015200653278659432, + "loss": 0.7117, + "step": 433 + }, + { + "epoch": 0.3472, + "grad_norm": 0.4663710431593555, + "learning_rate": 0.00015178496248983254, + "loss": 0.7223, + "step": 434 + }, + { + "epoch": 0.348, + "grad_norm": 0.3905830218453315, + "learning_rate": 0.00015156304425822267, + "loss": 0.6905, + "step": 435 + }, + { + "epoch": 0.3488, + "grad_norm": 0.5105981653821051, + "learning_rate": 0.00015134077958279765, + "loss": 0.7125, + "step": 436 + }, + { + "epoch": 0.3496, + "grad_norm": 0.4410317560984314, + "learning_rate": 0.00015111816995691809, + "loss": 0.7408, + "step": 437 + }, + { + "epoch": 0.3504, + "grad_norm": 0.449784109636782, + "learning_rate": 0.00015089521687626243, + "loss": 0.8165, + "step": 438 + }, + { + "epoch": 0.3512, + "grad_norm": 0.41779437775287087, + "learning_rate": 0.00015067192183881658, + "loss": 0.71, + "step": 439 + }, + { + "epoch": 0.352, + "grad_norm": 0.4475202549139668, + "learning_rate": 0.000150448286344864, + "loss": 0.7816, + "step": 440 + }, + { + "epoch": 0.3528, + "grad_norm": 0.6204051569206457, + "learning_rate": 0.00015022431189697568, + "loss": 0.8919, + "step": 441 + }, + { + "epoch": 0.3536, + "grad_norm": 0.4800658555085623, + "learning_rate": 0.00015000000000000001, + "loss": 0.8167, + "step": 442 + }, + { + "epoch": 0.3544, + "grad_norm": 0.4226223390547385, + "learning_rate": 0.0001497753521610526, + "loss": 0.6846, + "step": 443 + }, + { + "epoch": 0.3552, + "grad_norm": 0.4283393298389574, + "learning_rate": 0.00014955036988950618, + "loss": 0.7417, + "step": 444 + }, + { + "epoch": 0.356, + "grad_norm": 0.43211339102560453, + "learning_rate": 0.00014932505469698052, + "loss": 0.7491, + "step": 445 + }, + { + "epoch": 0.3568, + "grad_norm": 0.49152903026855915, + "learning_rate": 0.00014909940809733222, + "loss": 0.838, + "step": 446 + }, + { + "epoch": 0.3576, + "grad_norm": 0.452265605480497, + "learning_rate": 0.0001488734316066446, + "loss": 0.7209, + "step": 447 + }, + { + "epoch": 0.3584, + "grad_norm": 0.545278818491295, + "learning_rate": 0.00014864712674321734, + "loss": 0.845, + "step": 448 + }, + { + "epoch": 0.3592, + "grad_norm": 0.5764244596882926, + "learning_rate": 0.0001484204950275565, + "loss": 0.8211, + "step": 449 + }, + { + "epoch": 0.36, + "grad_norm": 0.6010972612369179, + "learning_rate": 0.00014819353798236427, + "loss": 0.8423, + "step": 450 + }, + { + "epoch": 0.3608, + "grad_norm": 0.44105865182591925, + "learning_rate": 0.00014796625713252848, + "loss": 0.6825, + "step": 451 + }, + { + "epoch": 0.3616, + "grad_norm": 0.3877950674669648, + "learning_rate": 0.00014773865400511272, + "loss": 0.6835, + "step": 452 + }, + { + "epoch": 0.3624, + "grad_norm": 0.43012622299713454, + "learning_rate": 0.00014751073012934587, + "loss": 0.7166, + "step": 453 + }, + { + "epoch": 0.3632, + "grad_norm": 0.4845720628208265, + "learning_rate": 0.00014728248703661182, + "loss": 0.7753, + "step": 454 + }, + { + "epoch": 0.364, + "grad_norm": 0.4227159679849885, + "learning_rate": 0.0001470539262604393, + "loss": 0.6918, + "step": 455 + }, + { + "epoch": 0.3648, + "grad_norm": 0.42693754032896614, + "learning_rate": 0.00014682504933649144, + "loss": 0.786, + "step": 456 + }, + { + "epoch": 0.3656, + "grad_norm": 0.43350068018540505, + "learning_rate": 0.00014659585780255556, + "loss": 0.6695, + "step": 457 + }, + { + "epoch": 0.3664, + "grad_norm": 0.4539085530411711, + "learning_rate": 0.00014636635319853275, + "loss": 0.7464, + "step": 458 + }, + { + "epoch": 0.3672, + "grad_norm": 0.4076576810042241, + "learning_rate": 0.0001461365370664276, + "loss": 0.7067, + "step": 459 + }, + { + "epoch": 0.368, + "grad_norm": 0.4377396029011281, + "learning_rate": 0.00014590641095033787, + "loss": 0.7387, + "step": 460 + }, + { + "epoch": 0.3688, + "grad_norm": 0.45352747003340027, + "learning_rate": 0.00014567597639644387, + "loss": 0.8014, + "step": 461 + }, + { + "epoch": 0.3696, + "grad_norm": 0.47126093132665964, + "learning_rate": 0.00014544523495299842, + "loss": 0.7444, + "step": 462 + }, + { + "epoch": 0.3704, + "grad_norm": 0.41395625679635656, + "learning_rate": 0.00014521418817031628, + "loss": 0.7425, + "step": 463 + }, + { + "epoch": 0.3712, + "grad_norm": 0.4231225249203355, + "learning_rate": 0.0001449828376007636, + "loss": 0.6971, + "step": 464 + }, + { + "epoch": 0.372, + "grad_norm": 0.4138814527994516, + "learning_rate": 0.00014475118479874774, + "loss": 0.7182, + "step": 465 + }, + { + "epoch": 0.3728, + "grad_norm": 0.4650691019424587, + "learning_rate": 0.0001445192313207067, + "loss": 0.7749, + "step": 466 + }, + { + "epoch": 0.3736, + "grad_norm": 0.4760664405868015, + "learning_rate": 0.0001442869787250987, + "loss": 0.7578, + "step": 467 + }, + { + "epoch": 0.3744, + "grad_norm": 0.45232863318343103, + "learning_rate": 0.0001440544285723915, + "loss": 0.6736, + "step": 468 + }, + { + "epoch": 0.3752, + "grad_norm": 0.5230469103110574, + "learning_rate": 0.00014382158242505234, + "loss": 0.8259, + "step": 469 + }, + { + "epoch": 0.376, + "grad_norm": 0.4338464917355991, + "learning_rate": 0.00014358844184753712, + "loss": 0.7681, + "step": 470 + }, + { + "epoch": 0.3768, + "grad_norm": 0.49859848326183653, + "learning_rate": 0.00014335500840627986, + "loss": 0.776, + "step": 471 + }, + { + "epoch": 0.3776, + "grad_norm": 0.44729854713715494, + "learning_rate": 0.00014312128366968243, + "loss": 0.7363, + "step": 472 + }, + { + "epoch": 0.3784, + "grad_norm": 0.4322585615668971, + "learning_rate": 0.0001428872692081038, + "loss": 0.7653, + "step": 473 + }, + { + "epoch": 0.3792, + "grad_norm": 0.41427933628653707, + "learning_rate": 0.00014265296659384956, + "loss": 0.7755, + "step": 474 + }, + { + "epoch": 0.38, + "grad_norm": 0.46708127193055726, + "learning_rate": 0.00014241837740116132, + "loss": 0.7904, + "step": 475 + }, + { + "epoch": 0.3808, + "grad_norm": 0.41499455660098955, + "learning_rate": 0.00014218350320620624, + "loss": 0.7066, + "step": 476 + }, + { + "epoch": 0.3816, + "grad_norm": 0.5020715750722332, + "learning_rate": 0.00014194834558706632, + "loss": 0.7784, + "step": 477 + }, + { + "epoch": 0.3824, + "grad_norm": 0.5156058012318389, + "learning_rate": 0.0001417129061237278, + "loss": 0.8441, + "step": 478 + }, + { + "epoch": 0.3832, + "grad_norm": 0.4522808025098913, + "learning_rate": 0.0001414771863980707, + "loss": 0.8643, + "step": 479 + }, + { + "epoch": 0.384, + "grad_norm": 0.3995307603496349, + "learning_rate": 0.00014124118799385796, + "loss": 0.7943, + "step": 480 + }, + { + "epoch": 0.3848, + "grad_norm": 0.4285524294891582, + "learning_rate": 0.00014100491249672498, + "loss": 0.7826, + "step": 481 + }, + { + "epoch": 0.3856, + "grad_norm": 0.523136423212411, + "learning_rate": 0.00014076836149416887, + "loss": 0.7701, + "step": 482 + }, + { + "epoch": 0.3864, + "grad_norm": 0.4391366370665467, + "learning_rate": 0.0001405315365755379, + "loss": 0.7624, + "step": 483 + }, + { + "epoch": 0.3872, + "grad_norm": 0.46959643574814464, + "learning_rate": 0.0001402944393320206, + "loss": 0.781, + "step": 484 + }, + { + "epoch": 0.388, + "grad_norm": 0.40194940332797746, + "learning_rate": 0.00014005707135663527, + "loss": 0.7015, + "step": 485 + }, + { + "epoch": 0.3888, + "grad_norm": 0.4684327943617427, + "learning_rate": 0.00013981943424421932, + "loss": 0.7251, + "step": 486 + }, + { + "epoch": 0.3896, + "grad_norm": 0.39832501600935233, + "learning_rate": 0.00013958152959141825, + "loss": 0.7508, + "step": 487 + }, + { + "epoch": 0.3904, + "grad_norm": 0.44967938370538474, + "learning_rate": 0.00013934335899667527, + "loss": 0.823, + "step": 488 + }, + { + "epoch": 0.3912, + "grad_norm": 0.38964436200631264, + "learning_rate": 0.00013910492406022033, + "loss": 0.6597, + "step": 489 + }, + { + "epoch": 0.392, + "grad_norm": 0.5875952758046031, + "learning_rate": 0.00013886622638405952, + "loss": 0.8907, + "step": 490 + }, + { + "epoch": 0.3928, + "grad_norm": 0.41508015467363896, + "learning_rate": 0.0001386272675719642, + "loss": 0.7338, + "step": 491 + }, + { + "epoch": 0.3936, + "grad_norm": 0.36590855761782176, + "learning_rate": 0.00013838804922946027, + "loss": 0.7389, + "step": 492 + }, + { + "epoch": 0.3944, + "grad_norm": 0.490121745593381, + "learning_rate": 0.00013814857296381728, + "loss": 0.8102, + "step": 493 + }, + { + "epoch": 0.3952, + "grad_norm": 0.44088560611113053, + "learning_rate": 0.00013790884038403795, + "loss": 0.7914, + "step": 494 + }, + { + "epoch": 0.396, + "grad_norm": 0.5115681409107602, + "learning_rate": 0.00013766885310084688, + "loss": 0.7538, + "step": 495 + }, + { + "epoch": 0.3968, + "grad_norm": 0.43192003334052165, + "learning_rate": 0.00013742861272668012, + "loss": 0.7648, + "step": 496 + }, + { + "epoch": 0.3976, + "grad_norm": 0.41543866901551685, + "learning_rate": 0.00013718812087567414, + "loss": 0.6623, + "step": 497 + }, + { + "epoch": 0.3984, + "grad_norm": 0.4059960163507851, + "learning_rate": 0.00013694737916365517, + "loss": 0.6941, + "step": 498 + }, + { + "epoch": 0.3992, + "grad_norm": 0.41196465157911266, + "learning_rate": 0.000136706389208128, + "loss": 0.6495, + "step": 499 + }, + { + "epoch": 0.4, + "grad_norm": 0.4134185282392816, + "learning_rate": 0.00013646515262826552, + "loss": 0.699, + "step": 500 + }, + { + "epoch": 0.4008, + "grad_norm": 0.38557605015027085, + "learning_rate": 0.00013622367104489756, + "loss": 0.6611, + "step": 501 + }, + { + "epoch": 0.4016, + "grad_norm": 0.4095403096672025, + "learning_rate": 0.0001359819460805001, + "loss": 0.721, + "step": 502 + }, + { + "epoch": 0.4024, + "grad_norm": 0.4986737593577314, + "learning_rate": 0.0001357399793591844, + "loss": 0.7324, + "step": 503 + }, + { + "epoch": 0.4032, + "grad_norm": 0.42721250489256873, + "learning_rate": 0.0001354977725066859, + "loss": 0.7572, + "step": 504 + }, + { + "epoch": 0.404, + "grad_norm": 0.5279985164816671, + "learning_rate": 0.00013525532715035366, + "loss": 0.7984, + "step": 505 + }, + { + "epoch": 0.4048, + "grad_norm": 0.49640263739102203, + "learning_rate": 0.00013501264491913906, + "loss": 0.8373, + "step": 506 + }, + { + "epoch": 0.4056, + "grad_norm": 0.7823408697685369, + "learning_rate": 0.00013476972744358507, + "loss": 0.6345, + "step": 507 + }, + { + "epoch": 0.4064, + "grad_norm": 0.4464110012609075, + "learning_rate": 0.0001345265763558152, + "loss": 0.7183, + "step": 508 + }, + { + "epoch": 0.4072, + "grad_norm": 0.4941674866246718, + "learning_rate": 0.00013428319328952253, + "loss": 0.765, + "step": 509 + }, + { + "epoch": 0.408, + "grad_norm": 0.4212429963438416, + "learning_rate": 0.00013403957987995882, + "loss": 0.8026, + "step": 510 + }, + { + "epoch": 0.4088, + "grad_norm": 0.5015401184238121, + "learning_rate": 0.0001337957377639235, + "loss": 0.7119, + "step": 511 + }, + { + "epoch": 0.4096, + "grad_norm": 0.4712569345916714, + "learning_rate": 0.0001335516685797525, + "loss": 0.7601, + "step": 512 + }, + { + "epoch": 0.4104, + "grad_norm": 0.48298983722346495, + "learning_rate": 0.0001333073739673076, + "loss": 0.7468, + "step": 513 + }, + { + "epoch": 0.4112, + "grad_norm": 0.42128236331141106, + "learning_rate": 0.00013306285556796495, + "loss": 0.6988, + "step": 514 + }, + { + "epoch": 0.412, + "grad_norm": 0.43660086130573456, + "learning_rate": 0.0001328181150246045, + "loss": 0.7, + "step": 515 + }, + { + "epoch": 0.4128, + "grad_norm": 0.403372334702858, + "learning_rate": 0.00013257315398159864, + "loss": 0.6843, + "step": 516 + }, + { + "epoch": 0.4136, + "grad_norm": 0.35725815795608484, + "learning_rate": 0.00013232797408480127, + "loss": 0.687, + "step": 517 + }, + { + "epoch": 0.4144, + "grad_norm": 0.46502382720079394, + "learning_rate": 0.00013208257698153677, + "loss": 0.7597, + "step": 518 + }, + { + "epoch": 0.4152, + "grad_norm": 0.47555465949251546, + "learning_rate": 0.00013183696432058888, + "loss": 0.7433, + "step": 519 + }, + { + "epoch": 0.416, + "grad_norm": 0.43203790553360294, + "learning_rate": 0.00013159113775218964, + "loss": 0.7289, + "step": 520 + }, + { + "epoch": 0.4168, + "grad_norm": 0.49746515651547685, + "learning_rate": 0.00013134509892800822, + "loss": 0.7373, + "step": 521 + }, + { + "epoch": 0.4176, + "grad_norm": 0.5435992460513512, + "learning_rate": 0.00013109884950114007, + "loss": 0.7064, + "step": 522 + }, + { + "epoch": 0.4184, + "grad_norm": 0.4479501305581582, + "learning_rate": 0.00013085239112609547, + "loss": 0.7408, + "step": 523 + }, + { + "epoch": 0.4192, + "grad_norm": 0.45825882637041954, + "learning_rate": 0.00013060572545878875, + "loss": 0.7366, + "step": 524 + }, + { + "epoch": 0.42, + "grad_norm": 0.4207867694918564, + "learning_rate": 0.00013035885415652685, + "loss": 0.7701, + "step": 525 + }, + { + "epoch": 0.4208, + "grad_norm": 0.4061367178446236, + "learning_rate": 0.00013011177887799845, + "loss": 0.7346, + "step": 526 + }, + { + "epoch": 0.4216, + "grad_norm": 0.5198874812139052, + "learning_rate": 0.00012986450128326266, + "loss": 0.81, + "step": 527 + }, + { + "epoch": 0.4224, + "grad_norm": 0.37697730018063164, + "learning_rate": 0.00012961702303373795, + "loss": 0.7049, + "step": 528 + }, + { + "epoch": 0.4232, + "grad_norm": 0.44775689222934073, + "learning_rate": 0.00012936934579219094, + "loss": 0.7461, + "step": 529 + }, + { + "epoch": 0.424, + "grad_norm": 0.5033512458411837, + "learning_rate": 0.00012912147122272523, + "loss": 0.7314, + "step": 530 + }, + { + "epoch": 0.4248, + "grad_norm": 0.41579339723003056, + "learning_rate": 0.00012887340099077024, + "loss": 0.6281, + "step": 531 + }, + { + "epoch": 0.4256, + "grad_norm": 0.36809122622614143, + "learning_rate": 0.00012862513676307008, + "loss": 0.6204, + "step": 532 + }, + { + "epoch": 0.4264, + "grad_norm": 0.45194043935373646, + "learning_rate": 0.0001283766802076722, + "loss": 0.7546, + "step": 533 + }, + { + "epoch": 0.4272, + "grad_norm": 0.4288747716764305, + "learning_rate": 0.00012812803299391628, + "loss": 0.6822, + "step": 534 + }, + { + "epoch": 0.428, + "grad_norm": 0.5903941475612065, + "learning_rate": 0.00012787919679242306, + "loss": 0.8609, + "step": 535 + }, + { + "epoch": 0.4288, + "grad_norm": 0.4672594849005474, + "learning_rate": 0.00012763017327508305, + "loss": 0.7698, + "step": 536 + }, + { + "epoch": 0.4296, + "grad_norm": 0.4942648098027479, + "learning_rate": 0.00012738096411504522, + "loss": 0.7369, + "step": 537 + }, + { + "epoch": 0.4304, + "grad_norm": 0.42986225428556174, + "learning_rate": 0.0001271315709867059, + "loss": 0.7064, + "step": 538 + }, + { + "epoch": 0.4312, + "grad_norm": 0.42920038852281717, + "learning_rate": 0.00012688199556569753, + "loss": 0.7679, + "step": 539 + }, + { + "epoch": 0.432, + "grad_norm": 0.5203513666933222, + "learning_rate": 0.00012663223952887723, + "loss": 0.7905, + "step": 540 + }, + { + "epoch": 0.4328, + "grad_norm": 0.4944381595276974, + "learning_rate": 0.0001263823045543158, + "loss": 0.7235, + "step": 541 + }, + { + "epoch": 0.4336, + "grad_norm": 0.46170223055809334, + "learning_rate": 0.00012613219232128608, + "loss": 0.74, + "step": 542 + }, + { + "epoch": 0.4344, + "grad_norm": 0.5172231671804536, + "learning_rate": 0.00012588190451025207, + "loss": 0.7417, + "step": 543 + }, + { + "epoch": 0.4352, + "grad_norm": 0.38242212830027367, + "learning_rate": 0.00012563144280285741, + "loss": 0.6738, + "step": 544 + }, + { + "epoch": 0.436, + "grad_norm": 0.6621820445926183, + "learning_rate": 0.00012538080888191408, + "loss": 0.8639, + "step": 545 + }, + { + "epoch": 0.4368, + "grad_norm": 0.5380969136869799, + "learning_rate": 0.00012513000443139112, + "loss": 0.8612, + "step": 546 + }, + { + "epoch": 0.4376, + "grad_norm": 0.4413616622505353, + "learning_rate": 0.00012487903113640337, + "loss": 0.7477, + "step": 547 + }, + { + "epoch": 0.4384, + "grad_norm": 0.3926736956223621, + "learning_rate": 0.00012462789068320017, + "loss": 0.6948, + "step": 548 + }, + { + "epoch": 0.4392, + "grad_norm": 0.5991568297488339, + "learning_rate": 0.00012437658475915377, + "loss": 0.9187, + "step": 549 + }, + { + "epoch": 0.44, + "grad_norm": 0.39312025261804895, + "learning_rate": 0.00012412511505274844, + "loss": 0.7766, + "step": 550 + }, + { + "epoch": 0.4408, + "grad_norm": 0.3487157010010168, + "learning_rate": 0.00012387348325356874, + "loss": 0.703, + "step": 551 + }, + { + "epoch": 0.4416, + "grad_norm": 0.43079790932596457, + "learning_rate": 0.00012362169105228826, + "loss": 0.7772, + "step": 552 + }, + { + "epoch": 0.4424, + "grad_norm": 0.4121308140189897, + "learning_rate": 0.00012336974014065844, + "loss": 0.823, + "step": 553 + }, + { + "epoch": 0.4432, + "grad_norm": 0.38090987839878365, + "learning_rate": 0.000123117632211497, + "loss": 0.7192, + "step": 554 + }, + { + "epoch": 0.444, + "grad_norm": 0.7962407192316323, + "learning_rate": 0.00012286536895867654, + "loss": 0.8027, + "step": 555 + }, + { + "epoch": 0.4448, + "grad_norm": 0.4707226388623328, + "learning_rate": 0.00012261295207711346, + "loss": 0.7552, + "step": 556 + }, + { + "epoch": 0.4456, + "grad_norm": 0.42748653530505404, + "learning_rate": 0.00012236038326275626, + "loss": 0.7287, + "step": 557 + }, + { + "epoch": 0.4464, + "grad_norm": 0.4088509360914169, + "learning_rate": 0.0001221076642125742, + "loss": 0.7127, + "step": 558 + }, + { + "epoch": 0.4472, + "grad_norm": 0.47058586234288113, + "learning_rate": 0.00012185479662454595, + "loss": 0.7746, + "step": 559 + }, + { + "epoch": 0.448, + "grad_norm": 0.5021539526911709, + "learning_rate": 0.00012160178219764837, + "loss": 0.766, + "step": 560 + }, + { + "epoch": 0.4488, + "grad_norm": 0.42619414718966814, + "learning_rate": 0.00012134862263184467, + "loss": 0.7145, + "step": 561 + }, + { + "epoch": 0.4496, + "grad_norm": 0.3736131505311299, + "learning_rate": 0.00012109531962807332, + "loss": 0.7253, + "step": 562 + }, + { + "epoch": 0.4504, + "grad_norm": 0.4199445019074886, + "learning_rate": 0.00012084187488823657, + "loss": 0.7234, + "step": 563 + }, + { + "epoch": 0.4512, + "grad_norm": 0.4080505134188845, + "learning_rate": 0.00012058829011518896, + "loss": 0.7694, + "step": 564 + }, + { + "epoch": 0.452, + "grad_norm": 0.38891966675084405, + "learning_rate": 0.00012033456701272576, + "loss": 0.7481, + "step": 565 + }, + { + "epoch": 0.4528, + "grad_norm": 0.5546274481813345, + "learning_rate": 0.00012008070728557186, + "loss": 0.7072, + "step": 566 + }, + { + "epoch": 0.4536, + "grad_norm": 0.3842808967932505, + "learning_rate": 0.00011982671263936995, + "loss": 0.6833, + "step": 567 + }, + { + "epoch": 0.4544, + "grad_norm": 0.5323218413319508, + "learning_rate": 0.00011957258478066931, + "loss": 0.7154, + "step": 568 + }, + { + "epoch": 0.4552, + "grad_norm": 0.5229109514986995, + "learning_rate": 0.00011931832541691418, + "loss": 0.7449, + "step": 569 + }, + { + "epoch": 0.456, + "grad_norm": 0.4643395245494808, + "learning_rate": 0.00011906393625643244, + "loss": 0.6746, + "step": 570 + }, + { + "epoch": 0.4568, + "grad_norm": 0.4429720701897638, + "learning_rate": 0.00011880941900842397, + "loss": 0.7769, + "step": 571 + }, + { + "epoch": 0.4576, + "grad_norm": 0.4326668366153276, + "learning_rate": 0.00011855477538294935, + "loss": 0.6756, + "step": 572 + }, + { + "epoch": 0.4584, + "grad_norm": 0.4849046251181642, + "learning_rate": 0.00011830000709091815, + "loss": 0.7104, + "step": 573 + }, + { + "epoch": 0.4592, + "grad_norm": 0.41564972787398957, + "learning_rate": 0.00011804511584407763, + "loss": 0.7073, + "step": 574 + }, + { + "epoch": 0.46, + "grad_norm": 0.4493781091924018, + "learning_rate": 0.0001177901033550012, + "loss": 0.7065, + "step": 575 + }, + { + "epoch": 0.4608, + "grad_norm": 0.42388302507437037, + "learning_rate": 0.00011753497133707679, + "loss": 0.6564, + "step": 576 + }, + { + "epoch": 0.4616, + "grad_norm": 0.4923716589001938, + "learning_rate": 0.00011727972150449544, + "loss": 0.7741, + "step": 577 + }, + { + "epoch": 0.4624, + "grad_norm": 0.4178327260273074, + "learning_rate": 0.00011702435557223987, + "loss": 0.7493, + "step": 578 + }, + { + "epoch": 0.4632, + "grad_norm": 0.3762637913831736, + "learning_rate": 0.00011676887525607271, + "loss": 0.7049, + "step": 579 + }, + { + "epoch": 0.464, + "grad_norm": 0.3969977833164792, + "learning_rate": 0.00011651328227252517, + "loss": 0.7313, + "step": 580 + }, + { + "epoch": 0.4648, + "grad_norm": 0.4335735319780596, + "learning_rate": 0.00011625757833888551, + "loss": 0.7144, + "step": 581 + }, + { + "epoch": 0.4656, + "grad_norm": 0.4138327453913176, + "learning_rate": 0.00011600176517318741, + "loss": 0.77, + "step": 582 + }, + { + "epoch": 0.4664, + "grad_norm": 0.5047893118657163, + "learning_rate": 0.0001157458444941984, + "loss": 0.7011, + "step": 583 + }, + { + "epoch": 0.4672, + "grad_norm": 0.3950191841703804, + "learning_rate": 0.00011548981802140848, + "loss": 0.7883, + "step": 584 + }, + { + "epoch": 0.468, + "grad_norm": 0.44424528822131326, + "learning_rate": 0.00011523368747501839, + "loss": 0.8441, + "step": 585 + }, + { + "epoch": 0.4688, + "grad_norm": 0.6385143117966893, + "learning_rate": 0.00011497745457592816, + "loss": 0.7202, + "step": 586 + }, + { + "epoch": 0.4696, + "grad_norm": 0.46516466356510056, + "learning_rate": 0.00011472112104572547, + "loss": 0.6778, + "step": 587 + }, + { + "epoch": 0.4704, + "grad_norm": 0.4395435007469664, + "learning_rate": 0.00011446468860667421, + "loss": 0.7256, + "step": 588 + }, + { + "epoch": 0.4712, + "grad_norm": 0.43429061744253084, + "learning_rate": 0.0001142081589817027, + "loss": 0.6587, + "step": 589 + }, + { + "epoch": 0.472, + "grad_norm": 0.4498098946248189, + "learning_rate": 0.00011395153389439233, + "loss": 0.7129, + "step": 590 + }, + { + "epoch": 0.4728, + "grad_norm": 0.41908385366486545, + "learning_rate": 0.00011369481506896582, + "loss": 0.7348, + "step": 591 + }, + { + "epoch": 0.4736, + "grad_norm": 0.40052577581532633, + "learning_rate": 0.00011343800423027582, + "loss": 0.7171, + "step": 592 + }, + { + "epoch": 0.4744, + "grad_norm": 0.4008782072503879, + "learning_rate": 0.00011318110310379301, + "loss": 0.6499, + "step": 593 + }, + { + "epoch": 0.4752, + "grad_norm": 0.39949035280289624, + "learning_rate": 0.0001129241134155949, + "loss": 0.7325, + "step": 594 + }, + { + "epoch": 0.476, + "grad_norm": 0.4338227097132858, + "learning_rate": 0.00011266703689235394, + "loss": 0.68, + "step": 595 + }, + { + "epoch": 0.4768, + "grad_norm": 0.4792328739644269, + "learning_rate": 0.00011240987526132594, + "loss": 0.6854, + "step": 596 + }, + { + "epoch": 0.4776, + "grad_norm": 0.5433559597110078, + "learning_rate": 0.00011215263025033869, + "loss": 0.7641, + "step": 597 + }, + { + "epoch": 0.4784, + "grad_norm": 0.6008664840716792, + "learning_rate": 0.00011189530358778005, + "loss": 0.848, + "step": 598 + }, + { + "epoch": 0.4792, + "grad_norm": 0.5521876730877993, + "learning_rate": 0.00011163789700258655, + "loss": 0.7713, + "step": 599 + }, + { + "epoch": 0.48, + "grad_norm": 0.46207986616914726, + "learning_rate": 0.00011138041222423177, + "loss": 0.7989, + "step": 600 + }, + { + "epoch": 0.4808, + "grad_norm": 0.43605957294049125, + "learning_rate": 0.00011112285098271451, + "loss": 0.7604, + "step": 601 + }, + { + "epoch": 0.4816, + "grad_norm": 0.4622362897937555, + "learning_rate": 0.00011086521500854745, + "loss": 0.7442, + "step": 602 + }, + { + "epoch": 0.4824, + "grad_norm": 0.4718725513449837, + "learning_rate": 0.00011060750603274535, + "loss": 0.747, + "step": 603 + }, + { + "epoch": 0.4832, + "grad_norm": 0.3847387882100169, + "learning_rate": 0.00011034972578681338, + "loss": 0.6951, + "step": 604 + }, + { + "epoch": 0.484, + "grad_norm": 0.558777399168513, + "learning_rate": 0.00011009187600273566, + "loss": 0.7929, + "step": 605 + }, + { + "epoch": 0.4848, + "grad_norm": 0.46752261709614024, + "learning_rate": 0.00010983395841296348, + "loss": 0.8129, + "step": 606 + }, + { + "epoch": 0.4856, + "grad_norm": 0.4249726636385701, + "learning_rate": 0.00010957597475040373, + "loss": 0.7479, + "step": 607 + }, + { + "epoch": 0.4864, + "grad_norm": 0.5005423717044616, + "learning_rate": 0.00010931792674840718, + "loss": 0.7445, + "step": 608 + }, + { + "epoch": 0.4872, + "grad_norm": 0.4861351628893515, + "learning_rate": 0.00010905981614075693, + "loss": 0.7791, + "step": 609 + }, + { + "epoch": 0.488, + "grad_norm": 0.5639604973250558, + "learning_rate": 0.00010880164466165674, + "loss": 0.8149, + "step": 610 + }, + { + "epoch": 0.4888, + "grad_norm": 0.45354479874298465, + "learning_rate": 0.00010854341404571928, + "loss": 0.7934, + "step": 611 + }, + { + "epoch": 0.4896, + "grad_norm": 0.4628445069619903, + "learning_rate": 0.00010828512602795462, + "loss": 0.7374, + "step": 612 + }, + { + "epoch": 0.4904, + "grad_norm": 0.5195955341225196, + "learning_rate": 0.00010802678234375851, + "loss": 0.8352, + "step": 613 + }, + { + "epoch": 0.4912, + "grad_norm": 0.43595476728041826, + "learning_rate": 0.00010776838472890065, + "loss": 0.7256, + "step": 614 + }, + { + "epoch": 0.492, + "grad_norm": 0.3606912458601585, + "learning_rate": 0.0001075099349195131, + "loss": 0.6895, + "step": 615 + }, + { + "epoch": 0.4928, + "grad_norm": 0.4643672351016709, + "learning_rate": 0.00010725143465207867, + "loss": 0.7274, + "step": 616 + }, + { + "epoch": 0.4936, + "grad_norm": 0.5095297400473262, + "learning_rate": 0.00010699288566341914, + "loss": 0.779, + "step": 617 + }, + { + "epoch": 0.4944, + "grad_norm": 0.3910073850309763, + "learning_rate": 0.00010673428969068364, + "loss": 0.7128, + "step": 618 + }, + { + "epoch": 0.4952, + "grad_norm": 0.4219109386141692, + "learning_rate": 0.000106475648471337, + "loss": 0.7065, + "step": 619 + }, + { + "epoch": 0.496, + "grad_norm": 0.4997152339765753, + "learning_rate": 0.00010621696374314807, + "loss": 0.7847, + "step": 620 + }, + { + "epoch": 0.4968, + "grad_norm": 0.5290062282289308, + "learning_rate": 0.00010595823724417795, + "loss": 0.8088, + "step": 621 + }, + { + "epoch": 0.4976, + "grad_norm": 0.4437823849088452, + "learning_rate": 0.00010569947071276847, + "loss": 0.7569, + "step": 622 + }, + { + "epoch": 0.4984, + "grad_norm": 0.48287565653761594, + "learning_rate": 0.00010544066588753044, + "loss": 0.7397, + "step": 623 + }, + { + "epoch": 0.4992, + "grad_norm": 0.47399955722247405, + "learning_rate": 0.00010518182450733186, + "loss": 0.7304, + "step": 624 + }, + { + "epoch": 0.5, + "grad_norm": 0.48212727147786694, + "learning_rate": 0.00010492294831128641, + "loss": 0.7857, + "step": 625 + }, + { + "epoch": 0.5008, + "grad_norm": 0.4081587531539875, + "learning_rate": 0.00010466403903874176, + "loss": 0.7182, + "step": 626 + }, + { + "epoch": 0.5016, + "grad_norm": 0.38747498955693305, + "learning_rate": 0.00010440509842926767, + "loss": 0.6738, + "step": 627 + }, + { + "epoch": 0.5024, + "grad_norm": 0.4781253938986905, + "learning_rate": 0.00010414612822264455, + "loss": 0.8037, + "step": 628 + }, + { + "epoch": 0.5032, + "grad_norm": 0.44129600676019193, + "learning_rate": 0.00010388713015885161, + "loss": 0.7042, + "step": 629 + }, + { + "epoch": 0.504, + "grad_norm": 0.42560708559719823, + "learning_rate": 0.00010362810597805526, + "loss": 0.7282, + "step": 630 + }, + { + "epoch": 0.5048, + "grad_norm": 0.4477193550610481, + "learning_rate": 0.00010336905742059742, + "loss": 0.7277, + "step": 631 + }, + { + "epoch": 0.5056, + "grad_norm": 0.5024188974022757, + "learning_rate": 0.0001031099862269837, + "loss": 0.7614, + "step": 632 + }, + { + "epoch": 0.5064, + "grad_norm": 0.4616493200347517, + "learning_rate": 0.0001028508941378719, + "loss": 0.7196, + "step": 633 + }, + { + "epoch": 0.5072, + "grad_norm": 0.44716244620334705, + "learning_rate": 0.00010259178289406011, + "loss": 0.7382, + "step": 634 + }, + { + "epoch": 0.508, + "grad_norm": 0.5509178451941221, + "learning_rate": 0.00010233265423647523, + "loss": 0.8428, + "step": 635 + }, + { + "epoch": 0.5088, + "grad_norm": 0.4648368930268642, + "learning_rate": 0.00010207350990616107, + "loss": 0.7804, + "step": 636 + }, + { + "epoch": 0.5096, + "grad_norm": 0.41769829538371145, + "learning_rate": 0.00010181435164426676, + "loss": 0.6763, + "step": 637 + }, + { + "epoch": 0.5104, + "grad_norm": 0.49249667084061416, + "learning_rate": 0.0001015551811920351, + "loss": 0.6776, + "step": 638 + }, + { + "epoch": 0.5112, + "grad_norm": 0.4368224988064386, + "learning_rate": 0.00010129600029079072, + "loss": 0.7158, + "step": 639 + }, + { + "epoch": 0.512, + "grad_norm": 0.6091638158209373, + "learning_rate": 0.00010103681068192845, + "loss": 0.7423, + "step": 640 + }, + { + "epoch": 0.5128, + "grad_norm": 0.525020322096225, + "learning_rate": 0.00010077761410690172, + "loss": 0.7972, + "step": 641 + }, + { + "epoch": 0.5136, + "grad_norm": 0.5862462392458725, + "learning_rate": 0.00010051841230721065, + "loss": 0.8035, + "step": 642 + }, + { + "epoch": 0.5144, + "grad_norm": 0.5150342090196197, + "learning_rate": 0.00010025920702439051, + "loss": 0.7262, + "step": 643 + }, + { + "epoch": 0.5152, + "grad_norm": 0.4827917839076806, + "learning_rate": 0.0001, + "loss": 0.687, + "step": 644 + }, + { + "epoch": 0.516, + "grad_norm": 0.49756591436412745, + "learning_rate": 9.97407929756095e-05, + "loss": 0.7972, + "step": 645 + }, + { + "epoch": 0.5168, + "grad_norm": 0.6491106991630037, + "learning_rate": 9.948158769278939e-05, + "loss": 0.7236, + "step": 646 + }, + { + "epoch": 0.5176, + "grad_norm": 0.45070946662545036, + "learning_rate": 9.92223858930983e-05, + "loss": 0.733, + "step": 647 + }, + { + "epoch": 0.5184, + "grad_norm": 0.4404908086167703, + "learning_rate": 9.896318931807155e-05, + "loss": 0.676, + "step": 648 + }, + { + "epoch": 0.5192, + "grad_norm": 0.42822642977967657, + "learning_rate": 9.870399970920932e-05, + "loss": 0.6744, + "step": 649 + }, + { + "epoch": 0.52, + "grad_norm": 0.42878976231733074, + "learning_rate": 9.844481880796491e-05, + "loss": 0.7756, + "step": 650 + }, + { + "epoch": 0.5208, + "grad_norm": 0.41192836438899527, + "learning_rate": 9.818564835573323e-05, + "loss": 0.7018, + "step": 651 + }, + { + "epoch": 0.5216, + "grad_norm": 0.3886159645768922, + "learning_rate": 9.792649009383899e-05, + "loss": 0.6888, + "step": 652 + }, + { + "epoch": 0.5224, + "grad_norm": 0.42732623258735747, + "learning_rate": 9.766734576352478e-05, + "loss": 0.7341, + "step": 653 + }, + { + "epoch": 0.5232, + "grad_norm": 0.5206134048709076, + "learning_rate": 9.740821710593989e-05, + "loss": 0.7495, + "step": 654 + }, + { + "epoch": 0.524, + "grad_norm": 0.4736546649795084, + "learning_rate": 9.714910586212816e-05, + "loss": 0.761, + "step": 655 + }, + { + "epoch": 0.5248, + "grad_norm": 0.4325929971249264, + "learning_rate": 9.689001377301633e-05, + "loss": 0.6619, + "step": 656 + }, + { + "epoch": 0.5256, + "grad_norm": 0.5365900497916197, + "learning_rate": 9.663094257940258e-05, + "loss": 0.7366, + "step": 657 + }, + { + "epoch": 0.5264, + "grad_norm": 0.4769972496588147, + "learning_rate": 9.637189402194476e-05, + "loss": 0.7723, + "step": 658 + }, + { + "epoch": 0.5272, + "grad_norm": 0.43283977877619345, + "learning_rate": 9.611286984114841e-05, + "loss": 0.7172, + "step": 659 + }, + { + "epoch": 0.528, + "grad_norm": 0.4838519468109514, + "learning_rate": 9.585387177735547e-05, + "loss": 0.8092, + "step": 660 + }, + { + "epoch": 0.5288, + "grad_norm": 0.5482584578563439, + "learning_rate": 9.559490157073236e-05, + "loss": 0.7234, + "step": 661 + }, + { + "epoch": 0.5296, + "grad_norm": 0.3719346143111832, + "learning_rate": 9.533596096125825e-05, + "loss": 0.6572, + "step": 662 + }, + { + "epoch": 0.5304, + "grad_norm": 0.4558970091769671, + "learning_rate": 9.507705168871358e-05, + "loss": 0.7157, + "step": 663 + }, + { + "epoch": 0.5312, + "grad_norm": 0.48361966098232073, + "learning_rate": 9.481817549266817e-05, + "loss": 0.6677, + "step": 664 + }, + { + "epoch": 0.532, + "grad_norm": 0.5467192328851579, + "learning_rate": 9.455933411246958e-05, + "loss": 0.7161, + "step": 665 + }, + { + "epoch": 0.5328, + "grad_norm": 0.47218982542645954, + "learning_rate": 9.430052928723153e-05, + "loss": 0.7334, + "step": 666 + }, + { + "epoch": 0.5336, + "grad_norm": 0.44148771569782885, + "learning_rate": 9.404176275582208e-05, + "loss": 0.7564, + "step": 667 + }, + { + "epoch": 0.5344, + "grad_norm": 0.39846129196993135, + "learning_rate": 9.378303625685195e-05, + "loss": 0.6859, + "step": 668 + }, + { + "epoch": 0.5352, + "grad_norm": 0.4944897786007179, + "learning_rate": 9.352435152866298e-05, + "loss": 0.699, + "step": 669 + }, + { + "epoch": 0.536, + "grad_norm": 0.5018041502566262, + "learning_rate": 9.326571030931637e-05, + "loss": 0.9113, + "step": 670 + }, + { + "epoch": 0.5368, + "grad_norm": 0.6628582201931914, + "learning_rate": 9.300711433658087e-05, + "loss": 0.8114, + "step": 671 + }, + { + "epoch": 0.5376, + "grad_norm": 0.4295285155624496, + "learning_rate": 9.274856534792138e-05, + "loss": 0.7586, + "step": 672 + }, + { + "epoch": 0.5384, + "grad_norm": 0.4294367293079005, + "learning_rate": 9.249006508048694e-05, + "loss": 0.6762, + "step": 673 + }, + { + "epoch": 0.5392, + "grad_norm": 0.49039532092187726, + "learning_rate": 9.223161527109937e-05, + "loss": 0.674, + "step": 674 + }, + { + "epoch": 0.54, + "grad_norm": 0.36857242851828004, + "learning_rate": 9.197321765624152e-05, + "loss": 0.628, + "step": 675 + }, + { + "epoch": 0.5408, + "grad_norm": 0.4879511571759796, + "learning_rate": 9.171487397204539e-05, + "loss": 0.7196, + "step": 676 + }, + { + "epoch": 0.5416, + "grad_norm": 0.4271865794426687, + "learning_rate": 9.145658595428074e-05, + "loss": 0.6725, + "step": 677 + }, + { + "epoch": 0.5424, + "grad_norm": 0.46956215263281625, + "learning_rate": 9.119835533834331e-05, + "loss": 0.7399, + "step": 678 + }, + { + "epoch": 0.5432, + "grad_norm": 0.43799030257027066, + "learning_rate": 9.09401838592431e-05, + "loss": 0.7, + "step": 679 + }, + { + "epoch": 0.544, + "grad_norm": 0.5321456602261782, + "learning_rate": 9.068207325159284e-05, + "loss": 0.7986, + "step": 680 + }, + { + "epoch": 0.5448, + "grad_norm": 0.4675180839233065, + "learning_rate": 9.04240252495963e-05, + "loss": 0.7248, + "step": 681 + }, + { + "epoch": 0.5456, + "grad_norm": 0.5032933922334141, + "learning_rate": 9.016604158703654e-05, + "loss": 0.7039, + "step": 682 + }, + { + "epoch": 0.5464, + "grad_norm": 0.45260383572623736, + "learning_rate": 8.990812399726435e-05, + "loss": 0.6801, + "step": 683 + }, + { + "epoch": 0.5472, + "grad_norm": 0.42635949104610027, + "learning_rate": 8.965027421318665e-05, + "loss": 0.7046, + "step": 684 + }, + { + "epoch": 0.548, + "grad_norm": 0.49559611472926934, + "learning_rate": 8.939249396725467e-05, + "loss": 0.7455, + "step": 685 + }, + { + "epoch": 0.5488, + "grad_norm": 0.4777232582339619, + "learning_rate": 8.913478499145254e-05, + "loss": 0.8166, + "step": 686 + }, + { + "epoch": 0.5496, + "grad_norm": 0.5557898093105165, + "learning_rate": 8.887714901728551e-05, + "loss": 0.741, + "step": 687 + }, + { + "epoch": 0.5504, + "grad_norm": 0.48456383759255994, + "learning_rate": 8.861958777576827e-05, + "loss": 0.8253, + "step": 688 + }, + { + "epoch": 0.5512, + "grad_norm": 0.4095521800669326, + "learning_rate": 8.836210299741346e-05, + "loss": 0.6651, + "step": 689 + }, + { + "epoch": 0.552, + "grad_norm": 0.42270261837701667, + "learning_rate": 8.810469641222001e-05, + "loss": 0.7433, + "step": 690 + }, + { + "epoch": 0.5528, + "grad_norm": 0.4638294362795246, + "learning_rate": 8.784736974966135e-05, + "loss": 0.8163, + "step": 691 + }, + { + "epoch": 0.5536, + "grad_norm": 0.4935870098713025, + "learning_rate": 8.759012473867407e-05, + "loss": 0.7585, + "step": 692 + }, + { + "epoch": 0.5544, + "grad_norm": 0.45068959511196954, + "learning_rate": 8.733296310764611e-05, + "loss": 0.7753, + "step": 693 + }, + { + "epoch": 0.5552, + "grad_norm": 0.4031024404777808, + "learning_rate": 8.707588658440511e-05, + "loss": 0.6753, + "step": 694 + }, + { + "epoch": 0.556, + "grad_norm": 0.4236257907655874, + "learning_rate": 8.6818896896207e-05, + "loss": 0.764, + "step": 695 + }, + { + "epoch": 0.5568, + "grad_norm": 0.4423818785729849, + "learning_rate": 8.656199576972423e-05, + "loss": 0.7659, + "step": 696 + }, + { + "epoch": 0.5576, + "grad_norm": 0.36826114157455997, + "learning_rate": 8.63051849310342e-05, + "loss": 0.7201, + "step": 697 + }, + { + "epoch": 0.5584, + "grad_norm": 0.4490013780803883, + "learning_rate": 8.604846610560771e-05, + "loss": 0.7866, + "step": 698 + }, + { + "epoch": 0.5592, + "grad_norm": 0.42498937946884835, + "learning_rate": 8.579184101829734e-05, + "loss": 0.7276, + "step": 699 + }, + { + "epoch": 0.56, + "grad_norm": 0.4558168107297979, + "learning_rate": 8.553531139332582e-05, + "loss": 0.6715, + "step": 700 + }, + { + "epoch": 0.5608, + "grad_norm": 0.44808068761867476, + "learning_rate": 8.527887895427454e-05, + "loss": 0.7468, + "step": 701 + }, + { + "epoch": 0.5616, + "grad_norm": 0.5426271223678154, + "learning_rate": 8.502254542407186e-05, + "loss": 0.6919, + "step": 702 + }, + { + "epoch": 0.5624, + "grad_norm": 0.41706166479314355, + "learning_rate": 8.476631252498162e-05, + "loss": 0.6811, + "step": 703 + }, + { + "epoch": 0.5632, + "grad_norm": 0.449909836051013, + "learning_rate": 8.451018197859153e-05, + "loss": 0.7287, + "step": 704 + }, + { + "epoch": 0.564, + "grad_norm": 0.40904128257034156, + "learning_rate": 8.425415550580162e-05, + "loss": 0.6821, + "step": 705 + }, + { + "epoch": 0.5648, + "grad_norm": 0.5514079567161478, + "learning_rate": 8.399823482681262e-05, + "loss": 0.7014, + "step": 706 + }, + { + "epoch": 0.5656, + "grad_norm": 0.5621941021255352, + "learning_rate": 8.374242166111448e-05, + "loss": 0.714, + "step": 707 + }, + { + "epoch": 0.5664, + "grad_norm": 0.48922281319432914, + "learning_rate": 8.348671772747487e-05, + "loss": 0.6964, + "step": 708 + }, + { + "epoch": 0.5672, + "grad_norm": 0.40495064573395556, + "learning_rate": 8.323112474392731e-05, + "loss": 0.6596, + "step": 709 + }, + { + "epoch": 0.568, + "grad_norm": 0.37669628443541675, + "learning_rate": 8.297564442776014e-05, + "loss": 0.679, + "step": 710 + }, + { + "epoch": 0.5688, + "grad_norm": 0.38212624787048943, + "learning_rate": 8.272027849550457e-05, + "loss": 0.7044, + "step": 711 + }, + { + "epoch": 0.5696, + "grad_norm": 0.4164961167082215, + "learning_rate": 8.246502866292324e-05, + "loss": 0.7001, + "step": 712 + }, + { + "epoch": 0.5704, + "grad_norm": 0.5806986076006299, + "learning_rate": 8.220989664499878e-05, + "loss": 0.7471, + "step": 713 + }, + { + "epoch": 0.5712, + "grad_norm": 0.4128919453980456, + "learning_rate": 8.195488415592238e-05, + "loss": 0.6802, + "step": 714 + }, + { + "epoch": 0.572, + "grad_norm": 0.4699849720330913, + "learning_rate": 8.169999290908188e-05, + "loss": 0.7399, + "step": 715 + }, + { + "epoch": 0.5728, + "grad_norm": 0.5687699333630892, + "learning_rate": 8.144522461705067e-05, + "loss": 0.7319, + "step": 716 + }, + { + "epoch": 0.5736, + "grad_norm": 0.41632514043834745, + "learning_rate": 8.119058099157604e-05, + "loss": 0.6833, + "step": 717 + }, + { + "epoch": 0.5744, + "grad_norm": 0.48479864488960833, + "learning_rate": 8.093606374356759e-05, + "loss": 0.7826, + "step": 718 + }, + { + "epoch": 0.5752, + "grad_norm": 0.48285527273343337, + "learning_rate": 8.068167458308582e-05, + "loss": 0.7422, + "step": 719 + }, + { + "epoch": 0.576, + "grad_norm": 0.43286802937523056, + "learning_rate": 8.042741521933071e-05, + "loss": 0.7611, + "step": 720 + }, + { + "epoch": 0.5768, + "grad_norm": 0.4075535963903668, + "learning_rate": 8.017328736063006e-05, + "loss": 0.6694, + "step": 721 + }, + { + "epoch": 0.5776, + "grad_norm": 0.3753612060340906, + "learning_rate": 7.991929271442817e-05, + "loss": 0.6854, + "step": 722 + }, + { + "epoch": 0.5784, + "grad_norm": 0.3843107108459075, + "learning_rate": 7.966543298727425e-05, + "loss": 0.6427, + "step": 723 + }, + { + "epoch": 0.5792, + "grad_norm": 0.4233515369127332, + "learning_rate": 7.941170988481108e-05, + "loss": 0.7574, + "step": 724 + }, + { + "epoch": 0.58, + "grad_norm": 0.6060253045716254, + "learning_rate": 7.915812511176347e-05, + "loss": 0.7259, + "step": 725 + }, + { + "epoch": 0.5808, + "grad_norm": 0.43464604597808115, + "learning_rate": 7.89046803719267e-05, + "loss": 0.7254, + "step": 726 + }, + { + "epoch": 0.5816, + "grad_norm": 0.46273691599511046, + "learning_rate": 7.865137736815535e-05, + "loss": 0.6386, + "step": 727 + }, + { + "epoch": 0.5824, + "grad_norm": 0.4274034364439089, + "learning_rate": 7.839821780235168e-05, + "loss": 0.6916, + "step": 728 + }, + { + "epoch": 0.5832, + "grad_norm": 0.4377273291277029, + "learning_rate": 7.814520337545406e-05, + "loss": 0.7152, + "step": 729 + }, + { + "epoch": 0.584, + "grad_norm": 0.49562762295599727, + "learning_rate": 7.789233578742582e-05, + "loss": 0.7728, + "step": 730 + }, + { + "epoch": 0.5848, + "grad_norm": 0.4119996776096918, + "learning_rate": 7.763961673724379e-05, + "loss": 0.6834, + "step": 731 + }, + { + "epoch": 0.5856, + "grad_norm": 0.4360591441700369, + "learning_rate": 7.738704792288655e-05, + "loss": 0.6787, + "step": 732 + }, + { + "epoch": 0.5864, + "grad_norm": 0.5539604553956203, + "learning_rate": 7.713463104132345e-05, + "loss": 0.7181, + "step": 733 + }, + { + "epoch": 0.5872, + "grad_norm": 0.4987585680019293, + "learning_rate": 7.688236778850306e-05, + "loss": 0.7772, + "step": 734 + }, + { + "epoch": 0.588, + "grad_norm": 0.4818796800652592, + "learning_rate": 7.663025985934158e-05, + "loss": 0.7785, + "step": 735 + }, + { + "epoch": 0.5888, + "grad_norm": 0.38384019808049874, + "learning_rate": 7.637830894771175e-05, + "loss": 0.648, + "step": 736 + }, + { + "epoch": 0.5896, + "grad_norm": 0.5149515623933224, + "learning_rate": 7.61265167464313e-05, + "loss": 0.7902, + "step": 737 + }, + { + "epoch": 0.5904, + "grad_norm": 0.46017946680628197, + "learning_rate": 7.587488494725157e-05, + "loss": 0.6885, + "step": 738 + }, + { + "epoch": 0.5912, + "grad_norm": 0.3825756824730468, + "learning_rate": 7.562341524084623e-05, + "loss": 0.6916, + "step": 739 + }, + { + "epoch": 0.592, + "grad_norm": 0.44635582841279664, + "learning_rate": 7.537210931679987e-05, + "loss": 0.7616, + "step": 740 + }, + { + "epoch": 0.5928, + "grad_norm": 0.43836625715361827, + "learning_rate": 7.512096886359664e-05, + "loss": 0.7535, + "step": 741 + }, + { + "epoch": 0.5936, + "grad_norm": 0.39445380579688616, + "learning_rate": 7.48699955686089e-05, + "loss": 0.7074, + "step": 742 + }, + { + "epoch": 0.5944, + "grad_norm": 0.47492623940823814, + "learning_rate": 7.461919111808595e-05, + "loss": 0.6366, + "step": 743 + }, + { + "epoch": 0.5952, + "grad_norm": 0.4140802668973822, + "learning_rate": 7.43685571971426e-05, + "loss": 0.6884, + "step": 744 + }, + { + "epoch": 0.596, + "grad_norm": 0.43969996365262, + "learning_rate": 7.411809548974792e-05, + "loss": 0.7218, + "step": 745 + }, + { + "epoch": 0.5968, + "grad_norm": 0.4315000246809719, + "learning_rate": 7.386780767871397e-05, + "loss": 0.6829, + "step": 746 + }, + { + "epoch": 0.5976, + "grad_norm": 0.38695555281571414, + "learning_rate": 7.361769544568425e-05, + "loss": 0.6766, + "step": 747 + }, + { + "epoch": 0.5984, + "grad_norm": 0.45378683454472246, + "learning_rate": 7.336776047112276e-05, + "loss": 0.7154, + "step": 748 + }, + { + "epoch": 0.5992, + "grad_norm": 0.5025158375202976, + "learning_rate": 7.311800443430251e-05, + "loss": 0.8012, + "step": 749 + }, + { + "epoch": 0.6, + "grad_norm": 0.5368880369743676, + "learning_rate": 7.286842901329412e-05, + "loss": 0.7461, + "step": 750 + }, + { + "epoch": 0.6008, + "grad_norm": 0.4469626531765936, + "learning_rate": 7.26190358849548e-05, + "loss": 0.6611, + "step": 751 + }, + { + "epoch": 0.6016, + "grad_norm": 0.3904067865100921, + "learning_rate": 7.236982672491698e-05, + "loss": 0.6571, + "step": 752 + }, + { + "epoch": 0.6024, + "grad_norm": 0.5395588619367233, + "learning_rate": 7.212080320757695e-05, + "loss": 0.7605, + "step": 753 + }, + { + "epoch": 0.6032, + "grad_norm": 0.4063505434642953, + "learning_rate": 7.187196700608373e-05, + "loss": 0.7216, + "step": 754 + }, + { + "epoch": 0.604, + "grad_norm": 0.439733970220297, + "learning_rate": 7.162331979232783e-05, + "loss": 0.7483, + "step": 755 + }, + { + "epoch": 0.6048, + "grad_norm": 0.3903217297594714, + "learning_rate": 7.137486323692995e-05, + "loss": 0.6226, + "step": 756 + }, + { + "epoch": 0.6056, + "grad_norm": 0.35791207602972686, + "learning_rate": 7.112659900922976e-05, + "loss": 0.675, + "step": 757 + }, + { + "epoch": 0.6064, + "grad_norm": 0.46305859768985486, + "learning_rate": 7.087852877727481e-05, + "loss": 0.7425, + "step": 758 + }, + { + "epoch": 0.6072, + "grad_norm": 0.3819436794869962, + "learning_rate": 7.06306542078091e-05, + "loss": 0.6778, + "step": 759 + }, + { + "epoch": 0.608, + "grad_norm": 0.46174364856153316, + "learning_rate": 7.038297696626206e-05, + "loss": 0.7653, + "step": 760 + }, + { + "epoch": 0.6088, + "grad_norm": 0.4073259287429649, + "learning_rate": 7.013549871673736e-05, + "loss": 0.6721, + "step": 761 + }, + { + "epoch": 0.6096, + "grad_norm": 0.5036835056625664, + "learning_rate": 6.988822112200156e-05, + "loss": 0.6868, + "step": 762 + }, + { + "epoch": 0.6104, + "grad_norm": 0.4921489753535104, + "learning_rate": 6.964114584347316e-05, + "loss": 0.6549, + "step": 763 + }, + { + "epoch": 0.6112, + "grad_norm": 0.42186934467470866, + "learning_rate": 6.939427454121128e-05, + "loss": 0.6843, + "step": 764 + }, + { + "epoch": 0.612, + "grad_norm": 0.5145851167911595, + "learning_rate": 6.914760887390452e-05, + "loss": 0.7688, + "step": 765 + }, + { + "epoch": 0.6128, + "grad_norm": 0.44355951530594984, + "learning_rate": 6.890115049885994e-05, + "loss": 0.6989, + "step": 766 + }, + { + "epoch": 0.6136, + "grad_norm": 0.46686407305336036, + "learning_rate": 6.865490107199181e-05, + "loss": 0.6879, + "step": 767 + }, + { + "epoch": 0.6144, + "grad_norm": 0.46822811242944595, + "learning_rate": 6.84088622478104e-05, + "loss": 0.7705, + "step": 768 + }, + { + "epoch": 0.6152, + "grad_norm": 0.5005001982499331, + "learning_rate": 6.816303567941112e-05, + "loss": 0.7039, + "step": 769 + }, + { + "epoch": 0.616, + "grad_norm": 0.6557598549144436, + "learning_rate": 6.791742301846326e-05, + "loss": 0.7628, + "step": 770 + }, + { + "epoch": 0.6168, + "grad_norm": 0.5907766808878118, + "learning_rate": 6.767202591519875e-05, + "loss": 0.7857, + "step": 771 + }, + { + "epoch": 0.6176, + "grad_norm": 0.3754334115771533, + "learning_rate": 6.742684601840141e-05, + "loss": 0.6747, + "step": 772 + }, + { + "epoch": 0.6184, + "grad_norm": 0.46714188110580834, + "learning_rate": 6.718188497539554e-05, + "loss": 0.6875, + "step": 773 + }, + { + "epoch": 0.6192, + "grad_norm": 0.46643998644250145, + "learning_rate": 6.693714443203507e-05, + "loss": 0.6485, + "step": 774 + }, + { + "epoch": 0.62, + "grad_norm": 0.4962242868389372, + "learning_rate": 6.669262603269246e-05, + "loss": 0.7604, + "step": 775 + }, + { + "epoch": 0.6208, + "grad_norm": 0.4856780772069199, + "learning_rate": 6.644833142024751e-05, + "loss": 0.7647, + "step": 776 + }, + { + "epoch": 0.6216, + "grad_norm": 0.40212979381388003, + "learning_rate": 6.620426223607654e-05, + "loss": 0.6808, + "step": 777 + }, + { + "epoch": 0.6224, + "grad_norm": 0.4092513557560957, + "learning_rate": 6.59604201200412e-05, + "loss": 0.7228, + "step": 778 + }, + { + "epoch": 0.6232, + "grad_norm": 0.468504700291808, + "learning_rate": 6.571680671047749e-05, + "loss": 0.7467, + "step": 779 + }, + { + "epoch": 0.624, + "grad_norm": 0.48832366184278075, + "learning_rate": 6.547342364418481e-05, + "loss": 0.7542, + "step": 780 + }, + { + "epoch": 0.6248, + "grad_norm": 0.45914601401061483, + "learning_rate": 6.523027255641493e-05, + "loss": 0.7258, + "step": 781 + }, + { + "epoch": 0.6256, + "grad_norm": 0.4310698149491041, + "learning_rate": 6.498735508086093e-05, + "loss": 0.6253, + "step": 782 + }, + { + "epoch": 0.6264, + "grad_norm": 0.4269385113862225, + "learning_rate": 6.474467284964634e-05, + "loss": 0.7287, + "step": 783 + }, + { + "epoch": 0.6272, + "grad_norm": 0.5374205730161659, + "learning_rate": 6.450222749331414e-05, + "loss": 0.7529, + "step": 784 + }, + { + "epoch": 0.628, + "grad_norm": 0.4238875672676686, + "learning_rate": 6.426002064081565e-05, + "loss": 0.7317, + "step": 785 + }, + { + "epoch": 0.6288, + "grad_norm": 0.4080467891819177, + "learning_rate": 6.40180539194999e-05, + "loss": 0.7255, + "step": 786 + }, + { + "epoch": 0.6296, + "grad_norm": 0.4796802337184242, + "learning_rate": 6.377632895510248e-05, + "loss": 0.6419, + "step": 787 + }, + { + "epoch": 0.6304, + "grad_norm": 0.4398667233380474, + "learning_rate": 6.35348473717345e-05, + "loss": 0.6051, + "step": 788 + }, + { + "epoch": 0.6312, + "grad_norm": 0.4571763101900469, + "learning_rate": 6.329361079187199e-05, + "loss": 0.6306, + "step": 789 + }, + { + "epoch": 0.632, + "grad_norm": 0.44610926768714976, + "learning_rate": 6.305262083634488e-05, + "loss": 0.6575, + "step": 790 + }, + { + "epoch": 0.6328, + "grad_norm": 0.39803843376689096, + "learning_rate": 6.281187912432587e-05, + "loss": 0.6636, + "step": 791 + }, + { + "epoch": 0.6336, + "grad_norm": 0.4044923123399956, + "learning_rate": 6.25713872733199e-05, + "loss": 0.6976, + "step": 792 + }, + { + "epoch": 0.6344, + "grad_norm": 0.41279920205145776, + "learning_rate": 6.233114689915316e-05, + "loss": 0.7651, + "step": 793 + }, + { + "epoch": 0.6352, + "grad_norm": 0.49797839499652, + "learning_rate": 6.209115961596208e-05, + "loss": 0.7738, + "step": 794 + }, + { + "epoch": 0.636, + "grad_norm": 0.4941980334443274, + "learning_rate": 6.18514270361827e-05, + "loss": 0.6992, + "step": 795 + }, + { + "epoch": 0.6368, + "grad_norm": 0.4013061255393633, + "learning_rate": 6.161195077053976e-05, + "loss": 0.684, + "step": 796 + }, + { + "epoch": 0.6376, + "grad_norm": 0.5030148764960346, + "learning_rate": 6.13727324280358e-05, + "loss": 0.7332, + "step": 797 + }, + { + "epoch": 0.6384, + "grad_norm": 0.4256492278337513, + "learning_rate": 6.113377361594049e-05, + "loss": 0.594, + "step": 798 + }, + { + "epoch": 0.6392, + "grad_norm": 0.5368193272908817, + "learning_rate": 6.08950759397797e-05, + "loss": 0.7029, + "step": 799 + }, + { + "epoch": 0.64, + "grad_norm": 0.4462677045957923, + "learning_rate": 6.065664100332478e-05, + "loss": 0.6916, + "step": 800 + }, + { + "epoch": 0.6408, + "grad_norm": 0.425300862246148, + "learning_rate": 6.0418470408581774e-05, + "loss": 0.7489, + "step": 801 + }, + { + "epoch": 0.6416, + "grad_norm": 0.5005085207709417, + "learning_rate": 6.018056575578075e-05, + "loss": 0.7802, + "step": 802 + }, + { + "epoch": 0.6424, + "grad_norm": 0.3752762320754259, + "learning_rate": 5.9942928643364724e-05, + "loss": 0.7251, + "step": 803 + }, + { + "epoch": 0.6432, + "grad_norm": 0.4425702481945523, + "learning_rate": 5.970556066797941e-05, + "loss": 0.6513, + "step": 804 + }, + { + "epoch": 0.644, + "grad_norm": 0.4299899094756773, + "learning_rate": 5.946846342446214e-05, + "loss": 0.6924, + "step": 805 + }, + { + "epoch": 0.6448, + "grad_norm": 0.4226631171202636, + "learning_rate": 5.923163850583113e-05, + "loss": 0.7055, + "step": 806 + }, + { + "epoch": 0.6456, + "grad_norm": 0.4246977710889074, + "learning_rate": 5.899508750327501e-05, + "loss": 0.6853, + "step": 807 + }, + { + "epoch": 0.6464, + "grad_norm": 0.480018168315808, + "learning_rate": 5.875881200614207e-05, + "loss": 0.7548, + "step": 808 + }, + { + "epoch": 0.6472, + "grad_norm": 0.41398085518741246, + "learning_rate": 5.8522813601929324e-05, + "loss": 0.6821, + "step": 809 + }, + { + "epoch": 0.648, + "grad_norm": 0.3877776812865788, + "learning_rate": 5.828709387627218e-05, + "loss": 0.6502, + "step": 810 + }, + { + "epoch": 0.6488, + "grad_norm": 0.39052380092941585, + "learning_rate": 5.80516544129337e-05, + "loss": 0.7182, + "step": 811 + }, + { + "epoch": 0.6496, + "grad_norm": 0.44351355052746444, + "learning_rate": 5.781649679379378e-05, + "loss": 0.6959, + "step": 812 + }, + { + "epoch": 0.6504, + "grad_norm": 0.42646815820261946, + "learning_rate": 5.758162259883867e-05, + "loss": 0.6902, + "step": 813 + }, + { + "epoch": 0.6512, + "grad_norm": 0.4228695561254, + "learning_rate": 5.73470334061505e-05, + "loss": 0.6941, + "step": 814 + }, + { + "epoch": 0.652, + "grad_norm": 0.4100351559141488, + "learning_rate": 5.7112730791896207e-05, + "loss": 0.6686, + "step": 815 + }, + { + "epoch": 0.6528, + "grad_norm": 0.42136708533291056, + "learning_rate": 5.687871633031754e-05, + "loss": 0.6992, + "step": 816 + }, + { + "epoch": 0.6536, + "grad_norm": 0.42797444786900646, + "learning_rate": 5.664499159372017e-05, + "loss": 0.6868, + "step": 817 + }, + { + "epoch": 0.6544, + "grad_norm": 0.4908314945911969, + "learning_rate": 5.6411558152462894e-05, + "loss": 0.7185, + "step": 818 + }, + { + "epoch": 0.6552, + "grad_norm": 0.38129021246302564, + "learning_rate": 5.617841757494762e-05, + "loss": 0.6333, + "step": 819 + }, + { + "epoch": 0.656, + "grad_norm": 0.4654414447152424, + "learning_rate": 5.5945571427608526e-05, + "loss": 0.7339, + "step": 820 + }, + { + "epoch": 0.6568, + "grad_norm": 0.5224291883635922, + "learning_rate": 5.5713021274901335e-05, + "loss": 0.6882, + "step": 821 + }, + { + "epoch": 0.6576, + "grad_norm": 0.42876193303301113, + "learning_rate": 5.54807686792933e-05, + "loss": 0.7271, + "step": 822 + }, + { + "epoch": 0.6584, + "grad_norm": 0.416216982433621, + "learning_rate": 5.524881520125229e-05, + "loss": 0.6564, + "step": 823 + }, + { + "epoch": 0.6592, + "grad_norm": 0.4282729312446709, + "learning_rate": 5.501716239923642e-05, + "loss": 0.6968, + "step": 824 + }, + { + "epoch": 0.66, + "grad_norm": 0.44441536086740685, + "learning_rate": 5.4785811829683764e-05, + "loss": 0.6945, + "step": 825 + }, + { + "epoch": 0.6608, + "grad_norm": 0.4564075558753912, + "learning_rate": 5.4554765047001613e-05, + "loss": 0.7581, + "step": 826 + }, + { + "epoch": 0.6616, + "grad_norm": 0.4637977268593527, + "learning_rate": 5.432402360355615e-05, + "loss": 0.7087, + "step": 827 + }, + { + "epoch": 0.6624, + "grad_norm": 0.4235543963213367, + "learning_rate": 5.4093589049662175e-05, + "loss": 0.6951, + "step": 828 + }, + { + "epoch": 0.6632, + "grad_norm": 0.42142190752970404, + "learning_rate": 5.386346293357242e-05, + "loss": 0.7701, + "step": 829 + }, + { + "epoch": 0.664, + "grad_norm": 0.4605267541051739, + "learning_rate": 5.363364680146725e-05, + "loss": 0.6899, + "step": 830 + }, + { + "epoch": 0.6648, + "grad_norm": 0.44996033056969564, + "learning_rate": 5.3404142197444506e-05, + "loss": 0.6572, + "step": 831 + }, + { + "epoch": 0.6656, + "grad_norm": 0.4648115272389889, + "learning_rate": 5.31749506635086e-05, + "loss": 0.7327, + "step": 832 + }, + { + "epoch": 0.6664, + "grad_norm": 0.42083961567449996, + "learning_rate": 5.2946073739560706e-05, + "loss": 0.6481, + "step": 833 + }, + { + "epoch": 0.6672, + "grad_norm": 0.3678257321600699, + "learning_rate": 5.271751296338823e-05, + "loss": 0.6567, + "step": 834 + }, + { + "epoch": 0.668, + "grad_norm": 0.41713246313477464, + "learning_rate": 5.248926987065417e-05, + "loss": 0.6805, + "step": 835 + }, + { + "epoch": 0.6688, + "grad_norm": 0.37052743732415666, + "learning_rate": 5.226134599488728e-05, + "loss": 0.6812, + "step": 836 + }, + { + "epoch": 0.6696, + "grad_norm": 0.4344230327391137, + "learning_rate": 5.203374286747158e-05, + "loss": 0.6924, + "step": 837 + }, + { + "epoch": 0.6704, + "grad_norm": 0.44582569093875274, + "learning_rate": 5.180646201763577e-05, + "loss": 0.7686, + "step": 838 + }, + { + "epoch": 0.6712, + "grad_norm": 0.4199989527857804, + "learning_rate": 5.15795049724435e-05, + "loss": 0.6835, + "step": 839 + }, + { + "epoch": 0.672, + "grad_norm": 0.40433614049645716, + "learning_rate": 5.135287325678271e-05, + "loss": 0.6678, + "step": 840 + }, + { + "epoch": 0.6728, + "grad_norm": 0.49111083242813114, + "learning_rate": 5.112656839335543e-05, + "loss": 0.7783, + "step": 841 + }, + { + "epoch": 0.6736, + "grad_norm": 0.40452113487742064, + "learning_rate": 5.090059190266779e-05, + "loss": 0.699, + "step": 842 + }, + { + "epoch": 0.6744, + "grad_norm": 0.4185998946571293, + "learning_rate": 5.0674945303019526e-05, + "loss": 0.6536, + "step": 843 + }, + { + "epoch": 0.6752, + "grad_norm": 0.42129377379980115, + "learning_rate": 5.0449630110493836e-05, + "loss": 0.6622, + "step": 844 + }, + { + "epoch": 0.676, + "grad_norm": 0.4778900709802409, + "learning_rate": 5.022464783894744e-05, + "loss": 0.723, + "step": 845 + }, + { + "epoch": 0.6768, + "grad_norm": 0.47538770537483455, + "learning_rate": 5.000000000000002e-05, + "loss": 0.7077, + "step": 846 + }, + { + "epoch": 0.6776, + "grad_norm": 0.46213185401583895, + "learning_rate": 4.977568810302432e-05, + "loss": 0.7447, + "step": 847 + }, + { + "epoch": 0.6784, + "grad_norm": 0.46342637710809953, + "learning_rate": 4.955171365513603e-05, + "loss": 0.6423, + "step": 848 + }, + { + "epoch": 0.6792, + "grad_norm": 0.4603269264208612, + "learning_rate": 4.9328078161183464e-05, + "loss": 0.7205, + "step": 849 + }, + { + "epoch": 0.68, + "grad_norm": 0.3958040158831917, + "learning_rate": 4.9104783123737566e-05, + "loss": 0.7042, + "step": 850 + }, + { + "epoch": 0.6808, + "grad_norm": 0.4394189681367613, + "learning_rate": 4.88818300430819e-05, + "loss": 0.7338, + "step": 851 + }, + { + "epoch": 0.6816, + "grad_norm": 0.38026300704001265, + "learning_rate": 4.865922041720239e-05, + "loss": 0.6468, + "step": 852 + }, + { + "epoch": 0.6824, + "grad_norm": 0.4585870632104486, + "learning_rate": 4.843695574177737e-05, + "loss": 0.7556, + "step": 853 + }, + { + "epoch": 0.6832, + "grad_norm": 0.5708905518795303, + "learning_rate": 4.821503751016746e-05, + "loss": 0.6803, + "step": 854 + }, + { + "epoch": 0.684, + "grad_norm": 0.44859739726686104, + "learning_rate": 4.7993467213405706e-05, + "loss": 0.6862, + "step": 855 + }, + { + "epoch": 0.6848, + "grad_norm": 0.45353344720099065, + "learning_rate": 4.777224634018732e-05, + "loss": 0.6213, + "step": 856 + }, + { + "epoch": 0.6856, + "grad_norm": 0.4514540013654558, + "learning_rate": 4.755137637685979e-05, + "loss": 0.7197, + "step": 857 + }, + { + "epoch": 0.6864, + "grad_norm": 0.43520247996323425, + "learning_rate": 4.733085880741301e-05, + "loss": 0.6859, + "step": 858 + }, + { + "epoch": 0.6872, + "grad_norm": 0.44573263487448433, + "learning_rate": 4.7110695113469085e-05, + "loss": 0.7486, + "step": 859 + }, + { + "epoch": 0.688, + "grad_norm": 0.5369980225986257, + "learning_rate": 4.689088677427249e-05, + "loss": 0.7221, + "step": 860 + }, + { + "epoch": 0.6888, + "grad_norm": 0.4684742349759021, + "learning_rate": 4.6671435266680216e-05, + "loss": 0.704, + "step": 861 + }, + { + "epoch": 0.6896, + "grad_norm": 0.4247240492781613, + "learning_rate": 4.645234206515171e-05, + "loss": 0.6429, + "step": 862 + }, + { + "epoch": 0.6904, + "grad_norm": 0.4352750428153809, + "learning_rate": 4.623360864173893e-05, + "loss": 0.6176, + "step": 863 + }, + { + "epoch": 0.6912, + "grad_norm": 0.3745093117472935, + "learning_rate": 4.6015236466076747e-05, + "loss": 0.6385, + "step": 864 + }, + { + "epoch": 0.692, + "grad_norm": 0.37654844380452335, + "learning_rate": 4.579722700537268e-05, + "loss": 0.6749, + "step": 865 + }, + { + "epoch": 0.6928, + "grad_norm": 0.3795730674439509, + "learning_rate": 4.5579581724397255e-05, + "loss": 0.6537, + "step": 866 + }, + { + "epoch": 0.6936, + "grad_norm": 0.5229481781582966, + "learning_rate": 4.5362302085474254e-05, + "loss": 0.694, + "step": 867 + }, + { + "epoch": 0.6944, + "grad_norm": 0.3632109076521302, + "learning_rate": 4.514538954847064e-05, + "loss": 0.6259, + "step": 868 + }, + { + "epoch": 0.6952, + "grad_norm": 0.41367182887741916, + "learning_rate": 4.492884557078688e-05, + "loss": 0.6311, + "step": 869 + }, + { + "epoch": 0.696, + "grad_norm": 0.4537464387618468, + "learning_rate": 4.471267160734731e-05, + "loss": 0.7032, + "step": 870 + }, + { + "epoch": 0.6968, + "grad_norm": 0.4741123520579128, + "learning_rate": 4.449686911058992e-05, + "loss": 0.7449, + "step": 871 + }, + { + "epoch": 0.6976, + "grad_norm": 0.4514703347255224, + "learning_rate": 4.428143953045717e-05, + "loss": 0.6484, + "step": 872 + }, + { + "epoch": 0.6984, + "grad_norm": 0.39202622885904026, + "learning_rate": 4.406638431438576e-05, + "loss": 0.6691, + "step": 873 + }, + { + "epoch": 0.6992, + "grad_norm": 0.47745896342818134, + "learning_rate": 4.385170490729712e-05, + "loss": 0.7081, + "step": 874 + }, + { + "epoch": 0.7, + "grad_norm": 0.434060322789685, + "learning_rate": 4.36374027515878e-05, + "loss": 0.6934, + "step": 875 + }, + { + "epoch": 0.7008, + "grad_norm": 0.527185929501206, + "learning_rate": 4.342347928711953e-05, + "loss": 0.6384, + "step": 876 + }, + { + "epoch": 0.7016, + "grad_norm": 0.38250138247721094, + "learning_rate": 4.320993595120969e-05, + "loss": 0.6515, + "step": 877 + }, + { + "epoch": 0.7024, + "grad_norm": 0.40168446920004564, + "learning_rate": 4.2996774178621736e-05, + "loss": 0.6932, + "step": 878 + }, + { + "epoch": 0.7032, + "grad_norm": 0.41208113755503484, + "learning_rate": 4.278399540155536e-05, + "loss": 0.682, + "step": 879 + }, + { + "epoch": 0.704, + "grad_norm": 0.7080535876026216, + "learning_rate": 4.257160104963696e-05, + "loss": 0.6448, + "step": 880 + }, + { + "epoch": 0.7048, + "grad_norm": 0.4556640559810192, + "learning_rate": 4.2359592549910145e-05, + "loss": 0.697, + "step": 881 + }, + { + "epoch": 0.7056, + "grad_norm": 0.44159196719908, + "learning_rate": 4.2147971326825966e-05, + "loss": 0.68, + "step": 882 + }, + { + "epoch": 0.7064, + "grad_norm": 0.3754475837443555, + "learning_rate": 4.193673880223339e-05, + "loss": 0.6864, + "step": 883 + }, + { + "epoch": 0.7072, + "grad_norm": 0.40973853079506767, + "learning_rate": 4.172589639536991e-05, + "loss": 0.6169, + "step": 884 + }, + { + "epoch": 0.708, + "grad_norm": 0.40982493587788504, + "learning_rate": 4.1515445522851784e-05, + "loss": 0.6973, + "step": 885 + }, + { + "epoch": 0.7088, + "grad_norm": 0.39166629221274496, + "learning_rate": 4.130538759866457e-05, + "loss": 0.6458, + "step": 886 + }, + { + "epoch": 0.7096, + "grad_norm": 0.4396057241312862, + "learning_rate": 4.109572403415386e-05, + "loss": 0.6047, + "step": 887 + }, + { + "epoch": 0.7104, + "grad_norm": 0.45675566700922193, + "learning_rate": 4.088645623801534e-05, + "loss": 0.676, + "step": 888 + }, + { + "epoch": 0.7112, + "grad_norm": 0.3919823955872468, + "learning_rate": 4.0677585616285774e-05, + "loss": 0.6901, + "step": 889 + }, + { + "epoch": 0.712, + "grad_norm": 0.4447866341329715, + "learning_rate": 4.046911357233343e-05, + "loss": 0.7109, + "step": 890 + }, + { + "epoch": 0.7128, + "grad_norm": 0.38144466911816505, + "learning_rate": 4.026104150684835e-05, + "loss": 0.6441, + "step": 891 + }, + { + "epoch": 0.7136, + "grad_norm": 0.5572834420081134, + "learning_rate": 4.00533708178334e-05, + "loss": 0.6963, + "step": 892 + }, + { + "epoch": 0.7144, + "grad_norm": 0.3841106808846123, + "learning_rate": 3.984610290059467e-05, + "loss": 0.6573, + "step": 893 + }, + { + "epoch": 0.7152, + "grad_norm": 0.42414157313789447, + "learning_rate": 3.963923914773187e-05, + "loss": 0.6725, + "step": 894 + }, + { + "epoch": 0.716, + "grad_norm": 0.42756170338582344, + "learning_rate": 3.943278094912946e-05, + "loss": 0.6926, + "step": 895 + }, + { + "epoch": 0.7168, + "grad_norm": 0.4355363485737852, + "learning_rate": 3.922672969194686e-05, + "loss": 0.7292, + "step": 896 + }, + { + "epoch": 0.7176, + "grad_norm": 0.3934071933253508, + "learning_rate": 3.902108676060937e-05, + "loss": 0.6016, + "step": 897 + }, + { + "epoch": 0.7184, + "grad_norm": 0.5496966401262666, + "learning_rate": 3.8815853536798904e-05, + "loss": 0.7274, + "step": 898 + }, + { + "epoch": 0.7192, + "grad_norm": 0.3790492810481393, + "learning_rate": 3.861103139944449e-05, + "loss": 0.6842, + "step": 899 + }, + { + "epoch": 0.72, + "grad_norm": 0.4401342359507008, + "learning_rate": 3.840662172471315e-05, + "loss": 0.6831, + "step": 900 + }, + { + "epoch": 0.7208, + "grad_norm": 0.3971144677232109, + "learning_rate": 3.820262588600074e-05, + "loss": 0.6354, + "step": 901 + }, + { + "epoch": 0.7216, + "grad_norm": 0.39471198945480024, + "learning_rate": 3.79990452539225e-05, + "loss": 0.6892, + "step": 902 + }, + { + "epoch": 0.7224, + "grad_norm": 0.3987529330890745, + "learning_rate": 3.7795881196303995e-05, + "loss": 0.6804, + "step": 903 + }, + { + "epoch": 0.7232, + "grad_norm": 0.5701865245980139, + "learning_rate": 3.759313507817196e-05, + "loss": 0.7172, + "step": 904 + }, + { + "epoch": 0.724, + "grad_norm": 0.5008429174971117, + "learning_rate": 3.739080826174498e-05, + "loss": 0.6922, + "step": 905 + }, + { + "epoch": 0.7248, + "grad_norm": 0.4902919216380368, + "learning_rate": 3.7188902106424416e-05, + "loss": 0.6681, + "step": 906 + }, + { + "epoch": 0.7256, + "grad_norm": 0.3357650120743114, + "learning_rate": 3.6987417968785366e-05, + "loss": 0.6205, + "step": 907 + }, + { + "epoch": 0.7264, + "grad_norm": 0.4600035146676257, + "learning_rate": 3.678635720256737e-05, + "loss": 0.7735, + "step": 908 + }, + { + "epoch": 0.7272, + "grad_norm": 0.4402983412029306, + "learning_rate": 3.658572115866541e-05, + "loss": 0.7874, + "step": 909 + }, + { + "epoch": 0.728, + "grad_norm": 0.4187051978636392, + "learning_rate": 3.638551118512089e-05, + "loss": 0.6772, + "step": 910 + }, + { + "epoch": 0.7288, + "grad_norm": 0.41472219827114154, + "learning_rate": 3.618572862711247e-05, + "loss": 0.6537, + "step": 911 + }, + { + "epoch": 0.7296, + "grad_norm": 0.46571200032117377, + "learning_rate": 3.5986374826947066e-05, + "loss": 0.7351, + "step": 912 + }, + { + "epoch": 0.7304, + "grad_norm": 0.4768836196404258, + "learning_rate": 3.578745112405083e-05, + "loss": 0.6293, + "step": 913 + }, + { + "epoch": 0.7312, + "grad_norm": 0.3916949256445519, + "learning_rate": 3.558895885496023e-05, + "loss": 0.6879, + "step": 914 + }, + { + "epoch": 0.732, + "grad_norm": 0.4375340971358518, + "learning_rate": 3.539089935331294e-05, + "loss": 0.7221, + "step": 915 + }, + { + "epoch": 0.7328, + "grad_norm": 0.40701462045279024, + "learning_rate": 3.519327394983888e-05, + "loss": 0.6553, + "step": 916 + }, + { + "epoch": 0.7336, + "grad_norm": 0.7622096756879763, + "learning_rate": 3.4996083972351515e-05, + "loss": 0.7598, + "step": 917 + }, + { + "epoch": 0.7344, + "grad_norm": 0.4453193853925462, + "learning_rate": 3.479933074573858e-05, + "loss": 0.6814, + "step": 918 + }, + { + "epoch": 0.7352, + "grad_norm": 0.3594708239161031, + "learning_rate": 3.4603015591953395e-05, + "loss": 0.6828, + "step": 919 + }, + { + "epoch": 0.736, + "grad_norm": 0.49749631565866714, + "learning_rate": 3.440713983000601e-05, + "loss": 0.7251, + "step": 920 + }, + { + "epoch": 0.7368, + "grad_norm": 0.45747003096606015, + "learning_rate": 3.421170477595419e-05, + "loss": 0.7191, + "step": 921 + }, + { + "epoch": 0.7376, + "grad_norm": 0.4000803837289144, + "learning_rate": 3.401671174289469e-05, + "loss": 0.7036, + "step": 922 + }, + { + "epoch": 0.7384, + "grad_norm": 0.4446653500117122, + "learning_rate": 3.3822162040954354e-05, + "loss": 0.6766, + "step": 923 + }, + { + "epoch": 0.7392, + "grad_norm": 0.4210251334594636, + "learning_rate": 3.362805697728145e-05, + "loss": 0.6718, + "step": 924 + }, + { + "epoch": 0.74, + "grad_norm": 0.4470565411188531, + "learning_rate": 3.34343978560367e-05, + "loss": 0.734, + "step": 925 + }, + { + "epoch": 0.7408, + "grad_norm": 0.45021310458295094, + "learning_rate": 3.324118597838464e-05, + "loss": 0.7136, + "step": 926 + }, + { + "epoch": 0.7416, + "grad_norm": 0.5674513660698329, + "learning_rate": 3.3048422642484886e-05, + "loss": 0.6502, + "step": 927 + }, + { + "epoch": 0.7424, + "grad_norm": 0.44632307726456727, + "learning_rate": 3.285610914348332e-05, + "loss": 0.6521, + "step": 928 + }, + { + "epoch": 0.7432, + "grad_norm": 0.4798937095611223, + "learning_rate": 3.266424677350346e-05, + "loss": 0.687, + "step": 929 + }, + { + "epoch": 0.744, + "grad_norm": 0.45454249364117827, + "learning_rate": 3.2472836821637744e-05, + "loss": 0.6255, + "step": 930 + }, + { + "epoch": 0.7448, + "grad_norm": 0.461447759914991, + "learning_rate": 3.228188057393895e-05, + "loss": 0.7076, + "step": 931 + }, + { + "epoch": 0.7456, + "grad_norm": 0.4068559853634475, + "learning_rate": 3.209137931341143e-05, + "loss": 0.6594, + "step": 932 + }, + { + "epoch": 0.7464, + "grad_norm": 0.3704032851078179, + "learning_rate": 3.190133432000252e-05, + "loss": 0.6398, + "step": 933 + }, + { + "epoch": 0.7472, + "grad_norm": 0.491308392611432, + "learning_rate": 3.1711746870594086e-05, + "loss": 0.7316, + "step": 934 + }, + { + "epoch": 0.748, + "grad_norm": 0.5178312919433019, + "learning_rate": 3.1522618238993725e-05, + "loss": 0.7616, + "step": 935 + }, + { + "epoch": 0.7488, + "grad_norm": 0.41611180745604653, + "learning_rate": 3.1333949695926324e-05, + "loss": 0.6684, + "step": 936 + }, + { + "epoch": 0.7496, + "grad_norm": 0.49009980647577833, + "learning_rate": 3.114574250902558e-05, + "loss": 0.7914, + "step": 937 + }, + { + "epoch": 0.7504, + "grad_norm": 0.49550826102102696, + "learning_rate": 3.0957997942825336e-05, + "loss": 0.7058, + "step": 938 + }, + { + "epoch": 0.7512, + "grad_norm": 0.5778361024572423, + "learning_rate": 3.077071725875116e-05, + "loss": 0.78, + "step": 939 + }, + { + "epoch": 0.752, + "grad_norm": 0.4231097330482353, + "learning_rate": 3.058390171511196e-05, + "loss": 0.6936, + "step": 940 + }, + { + "epoch": 0.7528, + "grad_norm": 0.390916302418808, + "learning_rate": 3.0397552567091337e-05, + "loss": 0.6755, + "step": 941 + }, + { + "epoch": 0.7536, + "grad_norm": 0.3766208095255213, + "learning_rate": 3.021167106673928e-05, + "loss": 0.64, + "step": 942 + }, + { + "epoch": 0.7544, + "grad_norm": 0.44799919275752687, + "learning_rate": 3.0026258462963787e-05, + "loss": 0.7635, + "step": 943 + }, + { + "epoch": 0.7552, + "grad_norm": 0.38477675195872846, + "learning_rate": 2.9841316001522347e-05, + "loss": 0.681, + "step": 944 + }, + { + "epoch": 0.756, + "grad_norm": 0.42836343972350627, + "learning_rate": 2.9656844925013637e-05, + "loss": 0.6173, + "step": 945 + }, + { + "epoch": 0.7568, + "grad_norm": 0.4387436284079448, + "learning_rate": 2.9472846472869298e-05, + "loss": 0.6526, + "step": 946 + }, + { + "epoch": 0.7576, + "grad_norm": 0.44143701497068183, + "learning_rate": 2.9289321881345254e-05, + "loss": 0.7512, + "step": 947 + }, + { + "epoch": 0.7584, + "grad_norm": 0.402519873568703, + "learning_rate": 2.9106272383513835e-05, + "loss": 0.6481, + "step": 948 + }, + { + "epoch": 0.7592, + "grad_norm": 0.386754466514997, + "learning_rate": 2.8923699209255284e-05, + "loss": 0.6381, + "step": 949 + }, + { + "epoch": 0.76, + "grad_norm": 0.4186412142799634, + "learning_rate": 2.874160358524931e-05, + "loss": 0.7032, + "step": 950 + }, + { + "epoch": 0.7608, + "grad_norm": 0.45146307527320995, + "learning_rate": 2.8559986734967282e-05, + "loss": 0.7515, + "step": 951 + }, + { + "epoch": 0.7616, + "grad_norm": 0.4417643283832573, + "learning_rate": 2.8378849878663628e-05, + "loss": 0.698, + "step": 952 + }, + { + "epoch": 0.7624, + "grad_norm": 0.3444521337309674, + "learning_rate": 2.819819423336775e-05, + "loss": 0.6701, + "step": 953 + }, + { + "epoch": 0.7632, + "grad_norm": 0.42824992941627515, + "learning_rate": 2.8018021012875994e-05, + "loss": 0.6756, + "step": 954 + }, + { + "epoch": 0.764, + "grad_norm": 0.450921051363528, + "learning_rate": 2.7838331427743282e-05, + "loss": 0.7463, + "step": 955 + }, + { + "epoch": 0.7648, + "grad_norm": 0.3632777176469063, + "learning_rate": 2.7659126685275027e-05, + "loss": 0.6536, + "step": 956 + }, + { + "epoch": 0.7656, + "grad_norm": 0.3222506999183127, + "learning_rate": 2.7480407989519198e-05, + "loss": 0.5806, + "step": 957 + }, + { + "epoch": 0.7664, + "grad_norm": 0.40924533769273447, + "learning_rate": 2.7302176541257986e-05, + "loss": 0.6717, + "step": 958 + }, + { + "epoch": 0.7672, + "grad_norm": 0.37597076668253954, + "learning_rate": 2.712443353799984e-05, + "loss": 0.6468, + "step": 959 + }, + { + "epoch": 0.768, + "grad_norm": 0.4058231666861766, + "learning_rate": 2.6947180173971508e-05, + "loss": 0.6655, + "step": 960 + }, + { + "epoch": 0.7688, + "grad_norm": 0.4341008530986436, + "learning_rate": 2.677041764010988e-05, + "loss": 0.6763, + "step": 961 + }, + { + "epoch": 0.7696, + "grad_norm": 0.4859754406120522, + "learning_rate": 2.659414712405398e-05, + "loss": 0.6578, + "step": 962 + }, + { + "epoch": 0.7704, + "grad_norm": 0.37689948007678203, + "learning_rate": 2.6418369810137188e-05, + "loss": 0.6997, + "step": 963 + }, + { + "epoch": 0.7712, + "grad_norm": 0.40394761311539573, + "learning_rate": 2.6243086879379e-05, + "loss": 0.6621, + "step": 964 + }, + { + "epoch": 0.772, + "grad_norm": 0.40040126144967014, + "learning_rate": 2.6068299509477266e-05, + "loss": 0.6598, + "step": 965 + }, + { + "epoch": 0.7728, + "grad_norm": 0.48835841579438144, + "learning_rate": 2.5894008874800325e-05, + "loss": 0.6989, + "step": 966 + }, + { + "epoch": 0.7736, + "grad_norm": 0.4305382837026107, + "learning_rate": 2.5720216146378917e-05, + "loss": 0.6461, + "step": 967 + }, + { + "epoch": 0.7744, + "grad_norm": 0.4278732219711235, + "learning_rate": 2.5546922491898495e-05, + "loss": 0.6726, + "step": 968 + }, + { + "epoch": 0.7752, + "grad_norm": 0.3689128305355096, + "learning_rate": 2.5374129075691265e-05, + "loss": 0.6815, + "step": 969 + }, + { + "epoch": 0.776, + "grad_norm": 0.3759714721714428, + "learning_rate": 2.5201837058728505e-05, + "loss": 0.6556, + "step": 970 + }, + { + "epoch": 0.7768, + "grad_norm": 0.4067960037837653, + "learning_rate": 2.503004759861258e-05, + "loss": 0.6577, + "step": 971 + }, + { + "epoch": 0.7776, + "grad_norm": 0.4528349081101457, + "learning_rate": 2.485876184956928e-05, + "loss": 0.6634, + "step": 972 + }, + { + "epoch": 0.7784, + "grad_norm": 0.46024702381528826, + "learning_rate": 2.4687980962440072e-05, + "loss": 0.6808, + "step": 973 + }, + { + "epoch": 0.7792, + "grad_norm": 0.3950513918741196, + "learning_rate": 2.451770608467432e-05, + "loss": 0.6331, + "step": 974 + }, + { + "epoch": 0.78, + "grad_norm": 0.37157574354329653, + "learning_rate": 2.4347938360321566e-05, + "loss": 0.6373, + "step": 975 + }, + { + "epoch": 0.7808, + "grad_norm": 0.46104339234263786, + "learning_rate": 2.417867893002387e-05, + "loss": 0.6874, + "step": 976 + }, + { + "epoch": 0.7816, + "grad_norm": 0.34276429191209973, + "learning_rate": 2.400992893100822e-05, + "loss": 0.6057, + "step": 977 + }, + { + "epoch": 0.7824, + "grad_norm": 0.4371487382126511, + "learning_rate": 2.3841689497078746e-05, + "loss": 0.6717, + "step": 978 + }, + { + "epoch": 0.7832, + "grad_norm": 0.4265062723224766, + "learning_rate": 2.3673961758609152e-05, + "loss": 0.6898, + "step": 979 + }, + { + "epoch": 0.784, + "grad_norm": 0.5339472722709265, + "learning_rate": 2.3506746842535242e-05, + "loss": 0.7031, + "step": 980 + }, + { + "epoch": 0.7848, + "grad_norm": 0.42599256174757205, + "learning_rate": 2.334004587234717e-05, + "loss": 0.6726, + "step": 981 + }, + { + "epoch": 0.7856, + "grad_norm": 0.5090136399906379, + "learning_rate": 2.3173859968081944e-05, + "loss": 0.766, + "step": 982 + }, + { + "epoch": 0.7864, + "grad_norm": 0.35801248358183085, + "learning_rate": 2.300819024631603e-05, + "loss": 0.7139, + "step": 983 + }, + { + "epoch": 0.7872, + "grad_norm": 0.4168201896132661, + "learning_rate": 2.2843037820157675e-05, + "loss": 0.6341, + "step": 984 + }, + { + "epoch": 0.788, + "grad_norm": 0.4369302146936795, + "learning_rate": 2.26784037992395e-05, + "loss": 0.6986, + "step": 985 + }, + { + "epoch": 0.7888, + "grad_norm": 0.3672019449783104, + "learning_rate": 2.251428928971102e-05, + "loss": 0.6283, + "step": 986 + }, + { + "epoch": 0.7896, + "grad_norm": 0.3900868830211544, + "learning_rate": 2.2350695394231345e-05, + "loss": 0.6306, + "step": 987 + }, + { + "epoch": 0.7904, + "grad_norm": 0.40015522687040905, + "learning_rate": 2.2187623211961562e-05, + "loss": 0.7209, + "step": 988 + }, + { + "epoch": 0.7912, + "grad_norm": 0.4474832481268081, + "learning_rate": 2.2025073838557454e-05, + "loss": 0.715, + "step": 989 + }, + { + "epoch": 0.792, + "grad_norm": 0.42044492288355934, + "learning_rate": 2.1863048366162208e-05, + "loss": 0.6581, + "step": 990 + }, + { + "epoch": 0.7928, + "grad_norm": 0.45455284502168075, + "learning_rate": 2.1701547883398922e-05, + "loss": 0.7389, + "step": 991 + }, + { + "epoch": 0.7936, + "grad_norm": 0.3559518422054641, + "learning_rate": 2.1540573475363402e-05, + "loss": 0.5842, + "step": 992 + }, + { + "epoch": 0.7944, + "grad_norm": 0.4756430447722567, + "learning_rate": 2.138012622361689e-05, + "loss": 0.7225, + "step": 993 + }, + { + "epoch": 0.7952, + "grad_norm": 0.41960456820551933, + "learning_rate": 2.1220207206178688e-05, + "loss": 0.6515, + "step": 994 + }, + { + "epoch": 0.796, + "grad_norm": 0.4669997831410201, + "learning_rate": 2.106081749751897e-05, + "loss": 0.6711, + "step": 995 + }, + { + "epoch": 0.7968, + "grad_norm": 0.4570025939597547, + "learning_rate": 2.0901958168551638e-05, + "loss": 0.7142, + "step": 996 + }, + { + "epoch": 0.7976, + "grad_norm": 0.4053240160707213, + "learning_rate": 2.0743630286627002e-05, + "loss": 0.6649, + "step": 997 + }, + { + "epoch": 0.7984, + "grad_norm": 0.38111176288776105, + "learning_rate": 2.058583491552465e-05, + "loss": 0.6833, + "step": 998 + }, + { + "epoch": 0.7992, + "grad_norm": 0.4234102077802766, + "learning_rate": 2.0428573115446392e-05, + "loss": 0.6516, + "step": 999 + }, + { + "epoch": 0.8, + "grad_norm": 0.3765791652978205, + "learning_rate": 2.027184594300898e-05, + "loss": 0.6913, + "step": 1000 + }, + { + "epoch": 0.8008, + "grad_norm": 0.551148258949945, + "learning_rate": 2.011565445123711e-05, + "loss": 0.7425, + "step": 1001 + }, + { + "epoch": 0.8016, + "grad_norm": 0.42329375710806877, + "learning_rate": 1.995999968955641e-05, + "loss": 0.6798, + "step": 1002 + }, + { + "epoch": 0.8024, + "grad_norm": 0.3654041576107259, + "learning_rate": 1.980488270378612e-05, + "loss": 0.5991, + "step": 1003 + }, + { + "epoch": 0.8032, + "grad_norm": 0.5889008070953602, + "learning_rate": 1.9650304536132426e-05, + "loss": 0.7691, + "step": 1004 + }, + { + "epoch": 0.804, + "grad_norm": 0.5321320962028306, + "learning_rate": 1.9496266225181248e-05, + "loss": 0.656, + "step": 1005 + }, + { + "epoch": 0.8048, + "grad_norm": 1.3128728693670322, + "learning_rate": 1.9342768805891178e-05, + "loss": 0.7597, + "step": 1006 + }, + { + "epoch": 0.8056, + "grad_norm": 0.551359241348889, + "learning_rate": 1.918981330958678e-05, + "loss": 0.7612, + "step": 1007 + }, + { + "epoch": 0.8064, + "grad_norm": 0.418425271349496, + "learning_rate": 1.903740076395151e-05, + "loss": 0.669, + "step": 1008 + }, + { + "epoch": 0.8072, + "grad_norm": 0.48724011845090404, + "learning_rate": 1.8885532193020704e-05, + "loss": 0.7145, + "step": 1009 + }, + { + "epoch": 0.808, + "grad_norm": 0.4338801306917334, + "learning_rate": 1.8734208617174988e-05, + "loss": 0.7357, + "step": 1010 + }, + { + "epoch": 0.8088, + "grad_norm": 0.4094866422544633, + "learning_rate": 1.8583431053133127e-05, + "loss": 0.6743, + "step": 1011 + }, + { + "epoch": 0.8096, + "grad_norm": 0.38638552613338145, + "learning_rate": 1.8433200513945337e-05, + "loss": 0.6742, + "step": 1012 + }, + { + "epoch": 0.8104, + "grad_norm": 0.45972260958685873, + "learning_rate": 1.8283518008986567e-05, + "loss": 0.6301, + "step": 1013 + }, + { + "epoch": 0.8112, + "grad_norm": 0.41606548603580784, + "learning_rate": 1.8134384543949478e-05, + "loss": 0.5795, + "step": 1014 + }, + { + "epoch": 0.812, + "grad_norm": 0.3507900711228258, + "learning_rate": 1.7985801120837865e-05, + "loss": 0.6413, + "step": 1015 + }, + { + "epoch": 0.8128, + "grad_norm": 0.44622918397105277, + "learning_rate": 1.783776873795994e-05, + "loss": 0.6685, + "step": 1016 + }, + { + "epoch": 0.8136, + "grad_norm": 0.5112647107030255, + "learning_rate": 1.7690288389921493e-05, + "loss": 0.7931, + "step": 1017 + }, + { + "epoch": 0.8144, + "grad_norm": 0.4117469967232782, + "learning_rate": 1.754336106761927e-05, + "loss": 0.6661, + "step": 1018 + }, + { + "epoch": 0.8152, + "grad_norm": 0.4413963948413441, + "learning_rate": 1.739698775823442e-05, + "loss": 0.615, + "step": 1019 + }, + { + "epoch": 0.816, + "grad_norm": 0.4945503410986367, + "learning_rate": 1.7251169445225657e-05, + "loss": 0.7369, + "step": 1020 + }, + { + "epoch": 0.8168, + "grad_norm": 0.47458541759364137, + "learning_rate": 1.7105907108322816e-05, + "loss": 0.6466, + "step": 1021 + }, + { + "epoch": 0.8176, + "grad_norm": 0.42592905361928207, + "learning_rate": 1.696120172352025e-05, + "loss": 0.6743, + "step": 1022 + }, + { + "epoch": 0.8184, + "grad_norm": 0.4214118291708083, + "learning_rate": 1.6817054263070174e-05, + "loss": 0.6624, + "step": 1023 + }, + { + "epoch": 0.8192, + "grad_norm": 0.3744677763502848, + "learning_rate": 1.6673465695476232e-05, + "loss": 0.6638, + "step": 1024 + }, + { + "epoch": 0.82, + "grad_norm": 0.4918867346221103, + "learning_rate": 1.6530436985486996e-05, + "loss": 0.7477, + "step": 1025 + }, + { + "epoch": 0.8208, + "grad_norm": 0.4085862430540114, + "learning_rate": 1.6387969094089316e-05, + "loss": 0.6843, + "step": 1026 + }, + { + "epoch": 0.8216, + "grad_norm": 0.44248196572285126, + "learning_rate": 1.6246062978502164e-05, + "loss": 0.6931, + "step": 1027 + }, + { + "epoch": 0.8224, + "grad_norm": 0.396506854793697, + "learning_rate": 1.6104719592169902e-05, + "loss": 0.6379, + "step": 1028 + }, + { + "epoch": 0.8232, + "grad_norm": 0.45465112135282115, + "learning_rate": 1.5963939884756042e-05, + "loss": 0.6958, + "step": 1029 + }, + { + "epoch": 0.824, + "grad_norm": 0.38704113003396995, + "learning_rate": 1.5823724802136865e-05, + "loss": 0.6362, + "step": 1030 + }, + { + "epoch": 0.8248, + "grad_norm": 0.41941980015963803, + "learning_rate": 1.5684075286394985e-05, + "loss": 0.6779, + "step": 1031 + }, + { + "epoch": 0.8256, + "grad_norm": 0.3819239255711896, + "learning_rate": 1.5544992275813053e-05, + "loss": 0.6414, + "step": 1032 + }, + { + "epoch": 0.8264, + "grad_norm": 0.35946769399800094, + "learning_rate": 1.5406476704867524e-05, + "loss": 0.6134, + "step": 1033 + }, + { + "epoch": 0.8272, + "grad_norm": 0.412022695130824, + "learning_rate": 1.526852950422226e-05, + "loss": 0.6416, + "step": 1034 + }, + { + "epoch": 0.828, + "grad_norm": 0.43726913529127953, + "learning_rate": 1.5131151600722337e-05, + "loss": 0.6597, + "step": 1035 + }, + { + "epoch": 0.8288, + "grad_norm": 0.41608067811211547, + "learning_rate": 1.4994343917387854e-05, + "loss": 0.6861, + "step": 1036 + }, + { + "epoch": 0.8296, + "grad_norm": 0.3956515683990938, + "learning_rate": 1.485810737340767e-05, + "loss": 0.6173, + "step": 1037 + }, + { + "epoch": 0.8304, + "grad_norm": 0.43835608143316607, + "learning_rate": 1.4722442884133214e-05, + "loss": 0.6351, + "step": 1038 + }, + { + "epoch": 0.8312, + "grad_norm": 0.36496866124657096, + "learning_rate": 1.4587351361072454e-05, + "loss": 0.6065, + "step": 1039 + }, + { + "epoch": 0.832, + "grad_norm": 0.42353022496668236, + "learning_rate": 1.4452833711883628e-05, + "loss": 0.7297, + "step": 1040 + }, + { + "epoch": 0.8328, + "grad_norm": 0.3976815646227283, + "learning_rate": 1.4318890840369182e-05, + "loss": 0.6252, + "step": 1041 + }, + { + "epoch": 0.8336, + "grad_norm": 0.46867159505574235, + "learning_rate": 1.4185523646469822e-05, + "loss": 0.7288, + "step": 1042 + }, + { + "epoch": 0.8344, + "grad_norm": 0.4115098679614629, + "learning_rate": 1.4052733026258281e-05, + "loss": 0.6694, + "step": 1043 + }, + { + "epoch": 0.8352, + "grad_norm": 0.47141463490280416, + "learning_rate": 1.3920519871933424e-05, + "loss": 0.794, + "step": 1044 + }, + { + "epoch": 0.836, + "grad_norm": 0.4073181131247274, + "learning_rate": 1.3788885071814172e-05, + "loss": 0.6754, + "step": 1045 + }, + { + "epoch": 0.8368, + "grad_norm": 0.4435975572514238, + "learning_rate": 1.3657829510333654e-05, + "loss": 0.6207, + "step": 1046 + }, + { + "epoch": 0.8376, + "grad_norm": 0.4405035884942258, + "learning_rate": 1.3527354068033139e-05, + "loss": 0.6805, + "step": 1047 + }, + { + "epoch": 0.8384, + "grad_norm": 0.43165585906812914, + "learning_rate": 1.339745962155613e-05, + "loss": 0.6287, + "step": 1048 + }, + { + "epoch": 0.8392, + "grad_norm": 0.48538558114212177, + "learning_rate": 1.326814704364262e-05, + "loss": 0.7143, + "step": 1049 + }, + { + "epoch": 0.84, + "grad_norm": 0.42789275010803174, + "learning_rate": 1.3139417203123027e-05, + "loss": 0.6834, + "step": 1050 + }, + { + "epoch": 0.8408, + "grad_norm": 0.4072666150542458, + "learning_rate": 1.3011270964912459e-05, + "loss": 0.6485, + "step": 1051 + }, + { + "epoch": 0.8416, + "grad_norm": 0.3962410302880435, + "learning_rate": 1.2883709190004955e-05, + "loss": 0.6406, + "step": 1052 + }, + { + "epoch": 0.8424, + "grad_norm": 0.3436008649179981, + "learning_rate": 1.275673273546758e-05, + "loss": 0.6453, + "step": 1053 + }, + { + "epoch": 0.8432, + "grad_norm": 0.39717262188667135, + "learning_rate": 1.263034245443473e-05, + "loss": 0.6477, + "step": 1054 + }, + { + "epoch": 0.844, + "grad_norm": 0.37569259361888674, + "learning_rate": 1.2504539196102439e-05, + "loss": 0.6439, + "step": 1055 + }, + { + "epoch": 0.8448, + "grad_norm": 0.40099423473174634, + "learning_rate": 1.2379323805722576e-05, + "loss": 0.6668, + "step": 1056 + }, + { + "epoch": 0.8456, + "grad_norm": 0.3804594028424689, + "learning_rate": 1.2254697124597237e-05, + "loss": 0.6666, + "step": 1057 + }, + { + "epoch": 0.8464, + "grad_norm": 0.58156202113261, + "learning_rate": 1.2130659990073146e-05, + "loss": 0.7181, + "step": 1058 + }, + { + "epoch": 0.8472, + "grad_norm": 0.37780741332473505, + "learning_rate": 1.2007213235535786e-05, + "loss": 0.6722, + "step": 1059 + }, + { + "epoch": 0.848, + "grad_norm": 0.4567096825147793, + "learning_rate": 1.1884357690404158e-05, + "loss": 0.7099, + "step": 1060 + }, + { + "epoch": 0.8488, + "grad_norm": 0.4050563966140698, + "learning_rate": 1.176209418012495e-05, + "loss": 0.6702, + "step": 1061 + }, + { + "epoch": 0.8496, + "grad_norm": 0.4467308030979363, + "learning_rate": 1.1640423526166988e-05, + "loss": 0.6345, + "step": 1062 + }, + { + "epoch": 0.8504, + "grad_norm": 0.6440589873431481, + "learning_rate": 1.1519346546015907e-05, + "loss": 0.6698, + "step": 1063 + }, + { + "epoch": 0.8512, + "grad_norm": 0.4877697380021303, + "learning_rate": 1.1398864053168534e-05, + "loss": 0.72, + "step": 1064 + }, + { + "epoch": 0.852, + "grad_norm": 0.3703836395380511, + "learning_rate": 1.1278976857127311e-05, + "loss": 0.5958, + "step": 1065 + }, + { + "epoch": 0.8528, + "grad_norm": 0.5325718261370398, + "learning_rate": 1.1159685763395111e-05, + "loss": 0.6877, + "step": 1066 + }, + { + "epoch": 0.8536, + "grad_norm": 0.38565613271694577, + "learning_rate": 1.1040991573469629e-05, + "loss": 0.6124, + "step": 1067 + }, + { + "epoch": 0.8544, + "grad_norm": 0.45076272537905554, + "learning_rate": 1.0922895084838037e-05, + "loss": 0.6771, + "step": 1068 + }, + { + "epoch": 0.8552, + "grad_norm": 0.4467801054468768, + "learning_rate": 1.0805397090971737e-05, + "loss": 0.7325, + "step": 1069 + }, + { + "epoch": 0.856, + "grad_norm": 0.36868814232178826, + "learning_rate": 1.0688498381320855e-05, + "loss": 0.6423, + "step": 1070 + }, + { + "epoch": 0.8568, + "grad_norm": 0.4487373063261531, + "learning_rate": 1.057219974130903e-05, + "loss": 0.6863, + "step": 1071 + }, + { + "epoch": 0.8576, + "grad_norm": 0.4769150584728038, + "learning_rate": 1.045650195232819e-05, + "loss": 0.6635, + "step": 1072 + }, + { + "epoch": 0.8584, + "grad_norm": 0.4736670630391326, + "learning_rate": 1.0341405791733183e-05, + "loss": 0.7362, + "step": 1073 + }, + { + "epoch": 0.8592, + "grad_norm": 0.40655879309727255, + "learning_rate": 1.0226912032836611e-05, + "loss": 0.6957, + "step": 1074 + }, + { + "epoch": 0.86, + "grad_norm": 0.4812210423869589, + "learning_rate": 1.0113021444903726e-05, + "loss": 0.7792, + "step": 1075 + }, + { + "epoch": 0.8608, + "grad_norm": 0.4868531913578826, + "learning_rate": 9.999734793146998e-06, + "loss": 0.7368, + "step": 1076 + }, + { + "epoch": 0.8616, + "grad_norm": 0.4336837670380572, + "learning_rate": 9.887052838721322e-06, + "loss": 0.6543, + "step": 1077 + }, + { + "epoch": 0.8624, + "grad_norm": 0.4392646108534538, + "learning_rate": 9.774976338718677e-06, + "loss": 0.7096, + "step": 1078 + }, + { + "epoch": 0.8632, + "grad_norm": 0.44141586332695404, + "learning_rate": 9.663506046162985e-06, + "loss": 0.6493, + "step": 1079 + }, + { + "epoch": 0.864, + "grad_norm": 0.46424690641874294, + "learning_rate": 9.552642710005299e-06, + "loss": 0.6527, + "step": 1080 + }, + { + "epoch": 0.8648, + "grad_norm": 0.4422767467200843, + "learning_rate": 9.44238707511862e-06, + "loss": 0.6429, + "step": 1081 + }, + { + "epoch": 0.8656, + "grad_norm": 0.46743448263495774, + "learning_rate": 9.332739882292752e-06, + "loss": 0.7272, + "step": 1082 + }, + { + "epoch": 0.8664, + "grad_norm": 0.48685890683770405, + "learning_rate": 9.22370186822965e-06, + "loss": 0.7151, + "step": 1083 + }, + { + "epoch": 0.8672, + "grad_norm": 0.4377172217417621, + "learning_rate": 9.115273765538202e-06, + "loss": 0.6824, + "step": 1084 + }, + { + "epoch": 0.868, + "grad_norm": 0.4830596747463407, + "learning_rate": 9.0074563027294e-06, + "loss": 0.6788, + "step": 1085 + }, + { + "epoch": 0.8688, + "grad_norm": 0.38925973276029074, + "learning_rate": 8.900250204211514e-06, + "loss": 0.6471, + "step": 1086 + }, + { + "epoch": 0.8696, + "grad_norm": 0.3961447377270664, + "learning_rate": 8.79365619028507e-06, + "loss": 0.6551, + "step": 1087 + }, + { + "epoch": 0.8704, + "grad_norm": 0.48245056779683043, + "learning_rate": 8.687674977138116e-06, + "loss": 0.73, + "step": 1088 + }, + { + "epoch": 0.8712, + "grad_norm": 0.444345919658435, + "learning_rate": 8.582307276841462e-06, + "loss": 0.6838, + "step": 1089 + }, + { + "epoch": 0.872, + "grad_norm": 0.4079019102628482, + "learning_rate": 8.47755379734373e-06, + "loss": 0.711, + "step": 1090 + }, + { + "epoch": 0.8728, + "grad_norm": 0.47973700172631495, + "learning_rate": 8.37341524246672e-06, + "loss": 0.7308, + "step": 1091 + }, + { + "epoch": 0.8736, + "grad_norm": 0.36030570169209764, + "learning_rate": 8.269892311900696e-06, + "loss": 0.6871, + "step": 1092 + }, + { + "epoch": 0.8744, + "grad_norm": 0.45583825575182646, + "learning_rate": 8.166985701199582e-06, + "loss": 0.6205, + "step": 1093 + }, + { + "epoch": 0.8752, + "grad_norm": 0.4569274995213183, + "learning_rate": 8.064696101776358e-06, + "loss": 0.702, + "step": 1094 + }, + { + "epoch": 0.876, + "grad_norm": 0.41678505062448895, + "learning_rate": 7.963024200898462e-06, + "loss": 0.7135, + "step": 1095 + }, + { + "epoch": 0.8768, + "grad_norm": 0.47719600813805696, + "learning_rate": 7.861970681683051e-06, + "loss": 0.7628, + "step": 1096 + }, + { + "epoch": 0.8776, + "grad_norm": 0.35211680639410153, + "learning_rate": 7.761536223092458e-06, + "loss": 0.6208, + "step": 1097 + }, + { + "epoch": 0.8784, + "grad_norm": 0.41121821181807183, + "learning_rate": 7.661721499929753e-06, + "loss": 0.6527, + "step": 1098 + }, + { + "epoch": 0.8792, + "grad_norm": 0.3854040359397552, + "learning_rate": 7.562527182833978e-06, + "loss": 0.6421, + "step": 1099 + }, + { + "epoch": 0.88, + "grad_norm": 0.4685780953190182, + "learning_rate": 7.463953938275858e-06, + "loss": 0.7006, + "step": 1100 + }, + { + "epoch": 0.8808, + "grad_norm": 0.42627774566843196, + "learning_rate": 7.366002428553153e-06, + "loss": 0.6251, + "step": 1101 + }, + { + "epoch": 0.8816, + "grad_norm": 0.4376010907122848, + "learning_rate": 7.2686733117863784e-06, + "loss": 0.7067, + "step": 1102 + }, + { + "epoch": 0.8824, + "grad_norm": 0.36396613930614274, + "learning_rate": 7.171967241914224e-06, + "loss": 0.5501, + "step": 1103 + }, + { + "epoch": 0.8832, + "grad_norm": 0.39854325327008483, + "learning_rate": 7.07588486868922e-06, + "loss": 0.6593, + "step": 1104 + }, + { + "epoch": 0.884, + "grad_norm": 0.4006920610890835, + "learning_rate": 6.980426837673437e-06, + "loss": 0.5991, + "step": 1105 + }, + { + "epoch": 0.8848, + "grad_norm": 0.4804281957029272, + "learning_rate": 6.8855937902340576e-06, + "loss": 0.7043, + "step": 1106 + }, + { + "epoch": 0.8856, + "grad_norm": 0.4157677545297881, + "learning_rate": 6.791386363539065e-06, + "loss": 0.6422, + "step": 1107 + }, + { + "epoch": 0.8864, + "grad_norm": 0.3909471204538716, + "learning_rate": 6.6978051905530855e-06, + "loss": 0.7383, + "step": 1108 + }, + { + "epoch": 0.8872, + "grad_norm": 0.5182573773191494, + "learning_rate": 6.604850900032955e-06, + "loss": 0.6585, + "step": 1109 + }, + { + "epoch": 0.888, + "grad_norm": 0.4396361114163665, + "learning_rate": 6.512524116523633e-06, + "loss": 0.6524, + "step": 1110 + }, + { + "epoch": 0.8888, + "grad_norm": 0.43671327334515353, + "learning_rate": 6.420825460353974e-06, + "loss": 0.6736, + "step": 1111 + }, + { + "epoch": 0.8896, + "grad_norm": 0.4470597713138655, + "learning_rate": 6.329755547632499e-06, + "loss": 0.6371, + "step": 1112 + }, + { + "epoch": 0.8904, + "grad_norm": 0.6447410630483885, + "learning_rate": 6.239314990243339e-06, + "loss": 0.6985, + "step": 1113 + }, + { + "epoch": 0.8912, + "grad_norm": 0.3784539577065875, + "learning_rate": 6.149504395842087e-06, + "loss": 0.6113, + "step": 1114 + }, + { + "epoch": 0.892, + "grad_norm": 0.4741807533832942, + "learning_rate": 6.0603243678516995e-06, + "loss": 0.7192, + "step": 1115 + }, + { + "epoch": 0.8928, + "grad_norm": 0.40696168117855497, + "learning_rate": 5.971775505458444e-06, + "loss": 0.6546, + "step": 1116 + }, + { + "epoch": 0.8936, + "grad_norm": 0.46233229752226745, + "learning_rate": 5.883858403607967e-06, + "loss": 0.7013, + "step": 1117 + }, + { + "epoch": 0.8944, + "grad_norm": 0.4199909816108278, + "learning_rate": 5.7965736530010916e-06, + "loss": 0.6514, + "step": 1118 + }, + { + "epoch": 0.8952, + "grad_norm": 0.3903462015411026, + "learning_rate": 5.7099218400900716e-06, + "loss": 0.6777, + "step": 1119 + }, + { + "epoch": 0.896, + "grad_norm": 0.43867300626612943, + "learning_rate": 5.623903547074549e-06, + "loss": 0.6588, + "step": 1120 + }, + { + "epoch": 0.8968, + "grad_norm": 0.4451731647413953, + "learning_rate": 5.538519351897575e-06, + "loss": 0.6186, + "step": 1121 + }, + { + "epoch": 0.8976, + "grad_norm": 0.4177041897523593, + "learning_rate": 5.453769828241872e-06, + "loss": 0.6913, + "step": 1122 + }, + { + "epoch": 0.8984, + "grad_norm": 0.42231456720258476, + "learning_rate": 5.369655545525909e-06, + "loss": 0.6489, + "step": 1123 + }, + { + "epoch": 0.8992, + "grad_norm": 0.44824163944843665, + "learning_rate": 5.286177068899989e-06, + "loss": 0.6784, + "step": 1124 + }, + { + "epoch": 0.9, + "grad_norm": 0.43471567283177637, + "learning_rate": 5.2033349592426335e-06, + "loss": 0.7208, + "step": 1125 + }, + { + "epoch": 0.9008, + "grad_norm": 0.498934955818159, + "learning_rate": 5.121129773156663e-06, + "loss": 0.6437, + "step": 1126 + }, + { + "epoch": 0.9016, + "grad_norm": 0.48580807688107563, + "learning_rate": 5.039562062965508e-06, + "loss": 0.668, + "step": 1127 + }, + { + "epoch": 0.9024, + "grad_norm": 0.43952589415837257, + "learning_rate": 4.95863237670956e-06, + "loss": 0.6895, + "step": 1128 + }, + { + "epoch": 0.9032, + "grad_norm": 0.37149033642589935, + "learning_rate": 4.87834125814235e-06, + "loss": 0.67, + "step": 1129 + }, + { + "epoch": 0.904, + "grad_norm": 0.4852786012636073, + "learning_rate": 4.798689246727006e-06, + "loss": 0.6688, + "step": 1130 + }, + { + "epoch": 0.9048, + "grad_norm": 0.36469666185134314, + "learning_rate": 4.719676877632639e-06, + "loss": 0.651, + "step": 1131 + }, + { + "epoch": 0.9056, + "grad_norm": 0.44325768299991547, + "learning_rate": 4.641304681730641e-06, + "loss": 0.6559, + "step": 1132 + }, + { + "epoch": 0.9064, + "grad_norm": 0.4827536457562476, + "learning_rate": 4.563573185591219e-06, + "loss": 0.6705, + "step": 1133 + }, + { + "epoch": 0.9072, + "grad_norm": 0.3893119525845464, + "learning_rate": 4.486482911479839e-06, + "loss": 0.6809, + "step": 1134 + }, + { + "epoch": 0.908, + "grad_norm": 0.39947775593612334, + "learning_rate": 4.4100343773536225e-06, + "loss": 0.5895, + "step": 1135 + }, + { + "epoch": 0.9088, + "grad_norm": 0.4268305287341812, + "learning_rate": 4.3342280968580285e-06, + "loss": 0.6476, + "step": 1136 + }, + { + "epoch": 0.9096, + "grad_norm": 0.3904970988564565, + "learning_rate": 4.259064579323302e-06, + "loss": 0.6318, + "step": 1137 + }, + { + "epoch": 0.9104, + "grad_norm": 0.5030706139039645, + "learning_rate": 4.184544329761009e-06, + "loss": 0.711, + "step": 1138 + }, + { + "epoch": 0.9112, + "grad_norm": 0.448425693298618, + "learning_rate": 4.1106678488607495e-06, + "loss": 0.6872, + "step": 1139 + }, + { + "epoch": 0.912, + "grad_norm": 0.3676551531074703, + "learning_rate": 4.037435632986786e-06, + "loss": 0.6875, + "step": 1140 + }, + { + "epoch": 0.9128, + "grad_norm": 0.48434382375615254, + "learning_rate": 3.964848174174541e-06, + "loss": 0.7207, + "step": 1141 + }, + { + "epoch": 0.9136, + "grad_norm": 0.36427386682071533, + "learning_rate": 3.892905960127546e-06, + "loss": 0.6319, + "step": 1142 + }, + { + "epoch": 0.9144, + "grad_norm": 0.822845317066879, + "learning_rate": 3.821609474213983e-06, + "loss": 0.6233, + "step": 1143 + }, + { + "epoch": 0.9152, + "grad_norm": 0.37114922831083996, + "learning_rate": 3.750959195463466e-06, + "loss": 0.6094, + "step": 1144 + }, + { + "epoch": 0.916, + "grad_norm": 0.3754095183151161, + "learning_rate": 3.6809555985639068e-06, + "loss": 0.6578, + "step": 1145 + }, + { + "epoch": 0.9168, + "grad_norm": 0.45078676903592013, + "learning_rate": 3.611599153858214e-06, + "loss": 0.6827, + "step": 1146 + }, + { + "epoch": 0.9176, + "grad_norm": 0.4168990815173287, + "learning_rate": 3.5428903273411863e-06, + "loss": 0.7397, + "step": 1147 + }, + { + "epoch": 0.9184, + "grad_norm": 0.4507539159693322, + "learning_rate": 3.4748295806564356e-06, + "loss": 0.6386, + "step": 1148 + }, + { + "epoch": 0.9192, + "grad_norm": 0.39250543750014394, + "learning_rate": 3.40741737109318e-06, + "loss": 0.6932, + "step": 1149 + }, + { + "epoch": 0.92, + "grad_norm": 0.40505225352855967, + "learning_rate": 3.3406541515832003e-06, + "loss": 0.6981, + "step": 1150 + }, + { + "epoch": 0.9208, + "grad_norm": 0.4182252224511026, + "learning_rate": 3.2745403706978872e-06, + "loss": 0.7056, + "step": 1151 + }, + { + "epoch": 0.9216, + "grad_norm": 0.4438636305914537, + "learning_rate": 3.209076472645112e-06, + "loss": 0.7024, + "step": 1152 + }, + { + "epoch": 0.9224, + "grad_norm": 0.44466805445876056, + "learning_rate": 3.1442628972662704e-06, + "loss": 0.6573, + "step": 1153 + }, + { + "epoch": 0.9232, + "grad_norm": 0.48050301828153263, + "learning_rate": 3.0801000800333877e-06, + "loss": 0.682, + "step": 1154 + }, + { + "epoch": 0.924, + "grad_norm": 0.5135694817857425, + "learning_rate": 3.0165884520461316e-06, + "loss": 0.6818, + "step": 1155 + }, + { + "epoch": 0.9248, + "grad_norm": 0.5023625394808598, + "learning_rate": 2.9537284400289355e-06, + "loss": 0.7109, + "step": 1156 + }, + { + "epoch": 0.9256, + "grad_norm": 0.40559397685771054, + "learning_rate": 2.8915204663281013e-06, + "loss": 0.6128, + "step": 1157 + }, + { + "epoch": 0.9264, + "grad_norm": 0.48228319917260726, + "learning_rate": 2.8299649489090475e-06, + "loss": 0.652, + "step": 1158 + }, + { + "epoch": 0.9272, + "grad_norm": 0.4028627412024973, + "learning_rate": 2.7690623013533976e-06, + "loss": 0.6748, + "step": 1159 + }, + { + "epoch": 0.928, + "grad_norm": 0.4700410335749985, + "learning_rate": 2.708812932856253e-06, + "loss": 0.7462, + "step": 1160 + }, + { + "epoch": 0.9288, + "grad_norm": 0.4492313961386215, + "learning_rate": 2.649217248223468e-06, + "loss": 0.6375, + "step": 1161 + }, + { + "epoch": 0.9296, + "grad_norm": 0.563654315078854, + "learning_rate": 2.590275647868867e-06, + "loss": 0.6359, + "step": 1162 + }, + { + "epoch": 0.9304, + "grad_norm": 0.5057687381686432, + "learning_rate": 2.5319885278115906e-06, + "loss": 0.6714, + "step": 1163 + }, + { + "epoch": 0.9312, + "grad_norm": 0.4652536601650651, + "learning_rate": 2.4743562796734622e-06, + "loss": 0.7764, + "step": 1164 + }, + { + "epoch": 0.932, + "grad_norm": 0.3662343111276865, + "learning_rate": 2.4173792906762804e-06, + "loss": 0.6893, + "step": 1165 + }, + { + "epoch": 0.9328, + "grad_norm": 0.4416713848647298, + "learning_rate": 2.3610579436393e-06, + "loss": 0.7013, + "step": 1166 + }, + { + "epoch": 0.9336, + "grad_norm": 0.381951299984924, + "learning_rate": 2.3053926169765984e-06, + "loss": 0.6632, + "step": 1167 + }, + { + "epoch": 0.9344, + "grad_norm": 0.37155212677639327, + "learning_rate": 2.250383684694579e-06, + "loss": 0.6821, + "step": 1168 + }, + { + "epoch": 0.9352, + "grad_norm": 0.46471185332141823, + "learning_rate": 2.1960315163894075e-06, + "loss": 0.7189, + "step": 1169 + }, + { + "epoch": 0.936, + "grad_norm": 0.4163494889726177, + "learning_rate": 2.1423364772445887e-06, + "loss": 0.5728, + "step": 1170 + }, + { + "epoch": 0.9368, + "grad_norm": 0.38152676389057105, + "learning_rate": 2.0892989280284823e-06, + "loss": 0.6235, + "step": 1171 + }, + { + "epoch": 0.9376, + "grad_norm": 0.408199847432243, + "learning_rate": 2.036919225091827e-06, + "loss": 0.7241, + "step": 1172 + }, + { + "epoch": 0.9384, + "grad_norm": 0.3855748055325008, + "learning_rate": 1.9851977203654835e-06, + "loss": 0.6325, + "step": 1173 + }, + { + "epoch": 0.9392, + "grad_norm": 0.5347027654755969, + "learning_rate": 1.9341347613579087e-06, + "loss": 0.7415, + "step": 1174 + }, + { + "epoch": 0.94, + "grad_norm": 0.3850161743575303, + "learning_rate": 1.8837306911529184e-06, + "loss": 0.6479, + "step": 1175 + }, + { + "epoch": 0.9408, + "grad_norm": 0.39447167520068405, + "learning_rate": 1.8339858484073935e-06, + "loss": 0.6581, + "step": 1176 + }, + { + "epoch": 0.9416, + "grad_norm": 0.3954999351156733, + "learning_rate": 1.7849005673489127e-06, + "loss": 0.627, + "step": 1177 + }, + { + "epoch": 0.9424, + "grad_norm": 0.47620026982022223, + "learning_rate": 1.7364751777736332e-06, + "loss": 0.7013, + "step": 1178 + }, + { + "epoch": 0.9432, + "grad_norm": 0.47863278534676423, + "learning_rate": 1.6887100050439587e-06, + "loss": 0.6721, + "step": 1179 + }, + { + "epoch": 0.944, + "grad_norm": 0.44263756236758506, + "learning_rate": 1.6416053700863964e-06, + "loss": 0.705, + "step": 1180 + }, + { + "epoch": 0.9448, + "grad_norm": 0.48622880718242784, + "learning_rate": 1.595161589389449e-06, + "loss": 0.6606, + "step": 1181 + }, + { + "epoch": 0.9456, + "grad_norm": 0.4518729241142917, + "learning_rate": 1.5493789750014031e-06, + "loss": 0.6761, + "step": 1182 + }, + { + "epoch": 0.9464, + "grad_norm": 0.36692094947734094, + "learning_rate": 1.5042578345283108e-06, + "loss": 0.6531, + "step": 1183 + }, + { + "epoch": 0.9472, + "grad_norm": 0.49770220770437756, + "learning_rate": 1.459798471131868e-06, + "loss": 0.6987, + "step": 1184 + }, + { + "epoch": 0.948, + "grad_norm": 0.386846496222818, + "learning_rate": 1.4160011835273934e-06, + "loss": 0.6162, + "step": 1185 + }, + { + "epoch": 0.9488, + "grad_norm": 0.42295808925978085, + "learning_rate": 1.3728662659818204e-06, + "loss": 0.6378, + "step": 1186 + }, + { + "epoch": 0.9496, + "grad_norm": 0.515620590376939, + "learning_rate": 1.3303940083117527e-06, + "loss": 0.6717, + "step": 1187 + }, + { + "epoch": 0.9504, + "grad_norm": 0.4236383792998744, + "learning_rate": 1.2885846958814673e-06, + "loss": 0.694, + "step": 1188 + }, + { + "epoch": 0.9512, + "grad_norm": 0.3745933440933498, + "learning_rate": 1.2474386096010039e-06, + "loss": 0.6782, + "step": 1189 + }, + { + "epoch": 0.952, + "grad_norm": 0.44548985289368986, + "learning_rate": 1.2069560259243328e-06, + "loss": 0.6824, + "step": 1190 + }, + { + "epoch": 0.9528, + "grad_norm": 0.351935666658701, + "learning_rate": 1.1671372168474138e-06, + "loss": 0.635, + "step": 1191 + }, + { + "epoch": 0.9536, + "grad_norm": 0.3927876866660279, + "learning_rate": 1.1279824499064396e-06, + "loss": 0.6844, + "step": 1192 + }, + { + "epoch": 0.9544, + "grad_norm": 0.4543533700238643, + "learning_rate": 1.089491988176017e-06, + "loss": 0.6545, + "step": 1193 + }, + { + "epoch": 0.9552, + "grad_norm": 0.4726408949660419, + "learning_rate": 1.0516660902673448e-06, + "loss": 0.764, + "step": 1194 + }, + { + "epoch": 0.956, + "grad_norm": 0.4547022906964549, + "learning_rate": 1.014505010326583e-06, + "loss": 0.6669, + "step": 1195 + }, + { + "epoch": 0.9568, + "grad_norm": 0.4386988408797006, + "learning_rate": 9.780089980330642e-07, + "loss": 0.7046, + "step": 1196 + }, + { + "epoch": 0.9576, + "grad_norm": 0.364811545576542, + "learning_rate": 9.421782985976068e-07, + "loss": 0.6357, + "step": 1197 + }, + { + "epoch": 0.9584, + "grad_norm": 0.38316319547541294, + "learning_rate": 9.070131527609604e-07, + "loss": 0.6351, + "step": 1198 + }, + { + "epoch": 0.9592, + "grad_norm": 0.409523938404599, + "learning_rate": 8.725137967920738e-07, + "loss": 0.7255, + "step": 1199 + }, + { + "epoch": 0.96, + "grad_norm": 0.3938547911550198, + "learning_rate": 8.386804624865851e-07, + "loss": 0.6623, + "step": 1200 + }, + { + "epoch": 0.9608, + "grad_norm": 0.4130480095438861, + "learning_rate": 8.055133771652345e-07, + "loss": 0.7109, + "step": 1201 + }, + { + "epoch": 0.9616, + "grad_norm": 0.4607401947793766, + "learning_rate": 7.730127636723539e-07, + "loss": 0.6699, + "step": 1202 + }, + { + "epoch": 0.9624, + "grad_norm": 0.4095248421137157, + "learning_rate": 7.411788403743237e-07, + "loss": 0.6493, + "step": 1203 + }, + { + "epoch": 0.9632, + "grad_norm": 0.5025596720781565, + "learning_rate": 7.100118211581852e-07, + "loss": 0.7294, + "step": 1204 + }, + { + "epoch": 0.964, + "grad_norm": 0.40212299450152705, + "learning_rate": 6.7951191543012e-07, + "loss": 0.706, + "step": 1205 + }, + { + "epoch": 0.9648, + "grad_norm": 0.4353245366674736, + "learning_rate": 6.496793281141056e-07, + "loss": 0.7316, + "step": 1206 + }, + { + "epoch": 0.9656, + "grad_norm": 0.3289292410419503, + "learning_rate": 6.205142596505176e-07, + "loss": 0.5969, + "step": 1207 + }, + { + "epoch": 0.9664, + "grad_norm": 0.4485381283671457, + "learning_rate": 5.920169059947411e-07, + "loss": 0.6299, + "step": 1208 + }, + { + "epoch": 0.9672, + "grad_norm": 0.4350396855142743, + "learning_rate": 5.64187458615939e-07, + "loss": 0.6427, + "step": 1209 + }, + { + "epoch": 0.968, + "grad_norm": 0.44820779403912475, + "learning_rate": 5.370261044956971e-07, + "loss": 0.6739, + "step": 1210 + }, + { + "epoch": 0.9688, + "grad_norm": 0.45985438515252697, + "learning_rate": 5.105330261267916e-07, + "loss": 0.6196, + "step": 1211 + }, + { + "epoch": 0.9696, + "grad_norm": 0.4155698862169645, + "learning_rate": 4.847084015119574e-07, + "loss": 0.6562, + "step": 1212 + }, + { + "epoch": 0.9704, + "grad_norm": 0.35992935478436117, + "learning_rate": 4.5955240416271084e-07, + "loss": 0.6739, + "step": 1213 + }, + { + "epoch": 0.9712, + "grad_norm": 0.4210190191253645, + "learning_rate": 4.3506520309813947e-07, + "loss": 0.7084, + "step": 1214 + }, + { + "epoch": 0.972, + "grad_norm": 0.46506063315072393, + "learning_rate": 4.112469628438365e-07, + "loss": 0.641, + "step": 1215 + }, + { + "epoch": 0.9728, + "grad_norm": 0.45217811984125644, + "learning_rate": 3.8809784343072366e-07, + "loss": 0.6842, + "step": 1216 + }, + { + "epoch": 0.9736, + "grad_norm": 0.481584910999147, + "learning_rate": 3.6561800039403016e-07, + "loss": 0.7273, + "step": 1217 + }, + { + "epoch": 0.9744, + "grad_norm": 0.44495631426659343, + "learning_rate": 3.4380758477219333e-07, + "loss": 0.6449, + "step": 1218 + }, + { + "epoch": 0.9752, + "grad_norm": 0.5867587286277899, + "learning_rate": 3.2266674310589273e-07, + "loss": 0.7682, + "step": 1219 + }, + { + "epoch": 0.976, + "grad_norm": 0.41093637951300743, + "learning_rate": 3.0219561743707326e-07, + "loss": 0.6484, + "step": 1220 + }, + { + "epoch": 0.9768, + "grad_norm": 0.3994694959002646, + "learning_rate": 2.8239434530792365e-07, + "loss": 0.6421, + "step": 1221 + }, + { + "epoch": 0.9776, + "grad_norm": 0.8741986088770987, + "learning_rate": 2.6326305976001055e-07, + "loss": 0.6413, + "step": 1222 + }, + { + "epoch": 0.9784, + "grad_norm": 0.42549697370180717, + "learning_rate": 2.448018893333681e-07, + "loss": 0.6889, + "step": 1223 + }, + { + "epoch": 0.9792, + "grad_norm": 0.36065938333251124, + "learning_rate": 2.2701095806565432e-07, + "loss": 0.6689, + "step": 1224 + }, + { + "epoch": 0.98, + "grad_norm": 0.40473093376739466, + "learning_rate": 2.098903854912515e-07, + "loss": 0.6713, + "step": 1225 + }, + { + "epoch": 0.9808, + "grad_norm": 0.38014814555165943, + "learning_rate": 1.9344028664056713e-07, + "loss": 0.5928, + "step": 1226 + }, + { + "epoch": 0.9816, + "grad_norm": 0.4300530742116439, + "learning_rate": 1.7766077203915655e-07, + "loss": 0.5796, + "step": 1227 + }, + { + "epoch": 0.9824, + "grad_norm": 0.36263394110562225, + "learning_rate": 1.6255194770704586e-07, + "loss": 0.6379, + "step": 1228 + }, + { + "epoch": 0.9832, + "grad_norm": 0.43216298500361494, + "learning_rate": 1.481139151579991e-07, + "loss": 0.6708, + "step": 1229 + }, + { + "epoch": 0.984, + "grad_norm": 0.4035772994578955, + "learning_rate": 1.3434677139885222e-07, + "loss": 0.6585, + "step": 1230 + }, + { + "epoch": 0.9848, + "grad_norm": 0.46471278001401517, + "learning_rate": 1.2125060892881346e-07, + "loss": 0.6209, + "step": 1231 + }, + { + "epoch": 0.9856, + "grad_norm": 0.39742181119724646, + "learning_rate": 1.0882551573891953e-07, + "loss": 0.6714, + "step": 1232 + }, + { + "epoch": 0.9864, + "grad_norm": 0.4642886320610059, + "learning_rate": 9.707157531134713e-08, + "loss": 0.7515, + "step": 1233 + }, + { + "epoch": 0.9872, + "grad_norm": 0.3552316968541323, + "learning_rate": 8.598886661895788e-08, + "loss": 0.7118, + "step": 1234 + }, + { + "epoch": 0.988, + "grad_norm": 0.5832555040857262, + "learning_rate": 7.557746412468758e-08, + "loss": 0.702, + "step": 1235 + }, + { + "epoch": 0.9888, + "grad_norm": 0.4445173718634954, + "learning_rate": 6.583743778106887e-08, + "loss": 0.6818, + "step": 1236 + }, + { + "epoch": 0.9896, + "grad_norm": 0.36950466729677595, + "learning_rate": 5.6768853029787184e-08, + "loss": 0.6382, + "step": 1237 + }, + { + "epoch": 0.9904, + "grad_norm": 0.3563943247336583, + "learning_rate": 4.837177080119215e-08, + "loss": 0.6319, + "step": 1238 + }, + { + "epoch": 0.9912, + "grad_norm": 0.44786114440122127, + "learning_rate": 4.064624751394242e-08, + "loss": 0.6989, + "step": 1239 + }, + { + "epoch": 0.992, + "grad_norm": 0.37817784586678505, + "learning_rate": 3.359233507459481e-08, + "loss": 0.606, + "step": 1240 + }, + { + "epoch": 0.9928, + "grad_norm": 0.4459885744722213, + "learning_rate": 2.7210080877237976e-08, + "loss": 0.6281, + "step": 1241 + }, + { + "epoch": 0.9936, + "grad_norm": 0.485956429014473, + "learning_rate": 2.1499527803214846e-08, + "loss": 0.7132, + "step": 1242 + }, + { + "epoch": 0.9944, + "grad_norm": 0.39125958924755905, + "learning_rate": 1.646071422083395e-08, + "loss": 0.6556, + "step": 1243 + }, + { + "epoch": 0.9952, + "grad_norm": 0.4158554391756689, + "learning_rate": 1.209367398504746e-08, + "loss": 0.6668, + "step": 1244 + }, + { + "epoch": 0.996, + "grad_norm": 0.3993930174404647, + "learning_rate": 8.398436437317969e-09, + "loss": 0.7188, + "step": 1245 + }, + { + "epoch": 0.9968, + "grad_norm": 0.42408035043868864, + "learning_rate": 5.375026405352035e-09, + "loss": 0.6531, + "step": 1246 + }, + { + "epoch": 0.9976, + "grad_norm": 0.47997507721109506, + "learning_rate": 3.023464202944748e-09, + "loss": 0.6357, + "step": 1247 + }, + { + "epoch": 0.9984, + "grad_norm": 0.42129152338144066, + "learning_rate": 1.3437656298687097e-09, + "loss": 0.5859, + "step": 1248 + }, + { + "epoch": 0.9992, + "grad_norm": 0.5059206793056723, + "learning_rate": 3.3594197175190745e-10, + "loss": 0.6943, + "step": 1249 + }, + { + "epoch": 1.0, + "grad_norm": 0.39015189642135556, + "learning_rate": 0.0, + "loss": 0.6774, + "step": 1250 + }, + { + "epoch": 1.0, + "step": 1250, + "total_flos": 1075826133532672.0, + "train_loss": 0.7417825475215912, + "train_runtime": 19119.0811, + "train_samples_per_second": 1.046, + "train_steps_per_second": 0.065 + } + ], + "logging_steps": 1.0, + "max_steps": 1250, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1075826133532672.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_1_epochs_1_GA_2_lora/README.md b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_1_epochs_1_GA_2_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_1_epochs_1_GA_2_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_1_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_1_epochs_1_GA_2_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..eb9d3b3f3084162eed13db43858cadfc73f167bc --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_1_epochs_1_GA_2_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "k_proj", + "up_proj", + "o_proj", + "down_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ba12f28e8d2e6b1ef6d1ea09692cd8c5d81c8b5a --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e1e7a62972a4ff09f208264f5f966c407f1561455825f5d80f41c82ce076c7c +size 671150064 diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_1_epochs_1_GA_2_lora/config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_1_epochs_1_GA_2_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_1_epochs_1_GA_2_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..4ed2b96d1aab89dc2713090062115cab8f9eeb4e --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:190eefee19dd4b71cfab88915cd3c3978b227d90c118cad6087b261cf0926aa1 +size 918507402 diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_1_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_1_epochs_1_GA_2_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2147a9e04e5c7913b0682519cab83918500da535 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_1_epochs_1_GA_2_lora/trainer_state.json @@ -0,0 +1,917 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 125, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.008, + "grad_norm": 0.8884397703958815, + "learning_rate": 5e-05, + "loss": 1.4159, + "step": 1 + }, + { + "epoch": 0.016, + "grad_norm": 1.0606920004031573, + "learning_rate": 0.0001, + "loss": 1.573, + "step": 2 + }, + { + "epoch": 0.024, + "grad_norm": 0.7084696914558983, + "learning_rate": 0.00015000000000000001, + "loss": 1.1484, + "step": 3 + }, + { + "epoch": 0.032, + "grad_norm": 0.7205956606575595, + "learning_rate": 0.0002, + "loss": 1.0082, + "step": 4 + }, + { + "epoch": 0.04, + "grad_norm": 1.3784355426416521, + "learning_rate": 0.00019996629653035126, + "loss": 1.1161, + "step": 5 + }, + { + "epoch": 0.048, + "grad_norm": 0.7951498968301047, + "learning_rate": 0.00019986520883988232, + "loss": 1.0714, + "step": 6 + }, + { + "epoch": 0.056, + "grad_norm": 0.491689920077788, + "learning_rate": 0.00019969680506871137, + "loss": 0.8671, + "step": 7 + }, + { + "epoch": 0.064, + "grad_norm": 0.5745437616364344, + "learning_rate": 0.00019946119873266613, + "loss": 0.9945, + "step": 8 + }, + { + "epoch": 0.072, + "grad_norm": 0.5848934441972632, + "learning_rate": 0.00019915854864676664, + "loss": 0.943, + "step": 9 + }, + { + "epoch": 0.08, + "grad_norm": 0.569529067821101, + "learning_rate": 0.00019878905881817252, + "loss": 0.954, + "step": 10 + }, + { + "epoch": 0.088, + "grad_norm": 0.5674545381514693, + "learning_rate": 0.00019835297830866826, + "loss": 0.9394, + "step": 11 + }, + { + "epoch": 0.096, + "grad_norm": 0.6029677361735906, + "learning_rate": 0.00019785060106677818, + "loss": 0.8518, + "step": 12 + }, + { + "epoch": 0.104, + "grad_norm": 0.6336646808165951, + "learning_rate": 0.00019728226572962473, + "loss": 0.9159, + "step": 13 + }, + { + "epoch": 0.112, + "grad_norm": 0.5679261447513578, + "learning_rate": 0.0001966483553946637, + "loss": 0.8465, + "step": 14 + }, + { + "epoch": 0.12, + "grad_norm": 0.5308201822016704, + "learning_rate": 0.00019594929736144976, + "loss": 0.8743, + "step": 15 + }, + { + "epoch": 0.128, + "grad_norm": 0.5085526050026933, + "learning_rate": 0.00019518556284360696, + "loss": 0.9164, + "step": 16 + }, + { + "epoch": 0.136, + "grad_norm": 0.5142089161440812, + "learning_rate": 0.0001943576666511982, + "loss": 0.8633, + "step": 17 + }, + { + "epoch": 0.144, + "grad_norm": 0.6806076228504198, + "learning_rate": 0.0001934661668437073, + "loss": 0.9558, + "step": 18 + }, + { + "epoch": 0.152, + "grad_norm": 0.5181061101016946, + "learning_rate": 0.0001925116643538684, + "loss": 0.8307, + "step": 19 + }, + { + "epoch": 0.16, + "grad_norm": 0.5300147626591982, + "learning_rate": 0.00019149480258259533, + "loss": 0.8933, + "step": 20 + }, + { + "epoch": 0.168, + "grad_norm": 0.5303314473113463, + "learning_rate": 0.00019041626696528503, + "loss": 0.7733, + "step": 21 + }, + { + "epoch": 0.176, + "grad_norm": 0.5637808040036959, + "learning_rate": 0.0001892767845097864, + "loss": 0.9473, + "step": 22 + }, + { + "epoch": 0.184, + "grad_norm": 0.4928223181651132, + "learning_rate": 0.00018807712330634642, + "loss": 0.8076, + "step": 23 + }, + { + "epoch": 0.192, + "grad_norm": 0.46232268164750967, + "learning_rate": 0.0001868180920098644, + "loss": 0.8499, + "step": 24 + }, + { + "epoch": 0.2, + "grad_norm": 0.5801112496983529, + "learning_rate": 0.00018550053929480202, + "loss": 0.9608, + "step": 25 + }, + { + "epoch": 0.208, + "grad_norm": 0.5567373224539722, + "learning_rate": 0.00018412535328311814, + "loss": 0.8963, + "step": 26 + }, + { + "epoch": 0.216, + "grad_norm": 0.5151562065420886, + "learning_rate": 0.0001826934609456129, + "loss": 0.9003, + "step": 27 + }, + { + "epoch": 0.224, + "grad_norm": 0.5278724877114119, + "learning_rate": 0.00018120582747708502, + "loss": 0.9522, + "step": 28 + }, + { + "epoch": 0.232, + "grad_norm": 0.6208439508269434, + "learning_rate": 0.0001796634556457236, + "loss": 1.0636, + "step": 29 + }, + { + "epoch": 0.24, + "grad_norm": 0.3869070230027433, + "learning_rate": 0.0001780673851171728, + "loss": 0.7688, + "step": 30 + }, + { + "epoch": 0.248, + "grad_norm": 0.5264876039649024, + "learning_rate": 0.00017641869175372493, + "loss": 0.9261, + "step": 31 + }, + { + "epoch": 0.256, + "grad_norm": 0.5593455537479747, + "learning_rate": 0.00017471848688911464, + "loss": 1.0018, + "step": 32 + }, + { + "epoch": 0.264, + "grad_norm": 0.4808510845543371, + "learning_rate": 0.000172967916579403, + "loss": 0.8686, + "step": 33 + }, + { + "epoch": 0.272, + "grad_norm": 0.49696919912770154, + "learning_rate": 0.00017116816083045602, + "loss": 0.8674, + "step": 34 + }, + { + "epoch": 0.28, + "grad_norm": 0.4862474594477952, + "learning_rate": 0.0001693204328025389, + "loss": 0.8905, + "step": 35 + }, + { + "epoch": 0.288, + "grad_norm": 0.4180161696791824, + "learning_rate": 0.00016742597799256182, + "loss": 0.8139, + "step": 36 + }, + { + "epoch": 0.296, + "grad_norm": 0.4939094774176364, + "learning_rate": 0.00016548607339452853, + "loss": 0.8379, + "step": 37 + }, + { + "epoch": 0.304, + "grad_norm": 0.4253560025632094, + "learning_rate": 0.00016350202663875386, + "loss": 0.8151, + "step": 38 + }, + { + "epoch": 0.312, + "grad_norm": 0.5572204490964708, + "learning_rate": 0.0001614751751104301, + "loss": 0.892, + "step": 39 + }, + { + "epoch": 0.32, + "grad_norm": 0.4723541523805916, + "learning_rate": 0.00015940688504813662, + "loss": 0.8551, + "step": 40 + }, + { + "epoch": 0.328, + "grad_norm": 0.4568174408219518, + "learning_rate": 0.00015729855062290022, + "loss": 0.7905, + "step": 41 + }, + { + "epoch": 0.336, + "grad_norm": 0.447517908676915, + "learning_rate": 0.00015515159299842707, + "loss": 0.8477, + "step": 42 + }, + { + "epoch": 0.344, + "grad_norm": 0.4766204933369398, + "learning_rate": 0.00015296745937313987, + "loss": 0.8327, + "step": 43 + }, + { + "epoch": 0.352, + "grad_norm": 0.4499203108768281, + "learning_rate": 0.00015074762200466556, + "loss": 0.8197, + "step": 44 + }, + { + "epoch": 0.36, + "grad_norm": 0.528409694573524, + "learning_rate": 0.00014849357721743168, + "loss": 0.9223, + "step": 45 + }, + { + "epoch": 0.368, + "grad_norm": 0.72502564770408, + "learning_rate": 0.00014620684439403962, + "loss": 0.8012, + "step": 46 + }, + { + "epoch": 0.376, + "grad_norm": 0.5533119892695849, + "learning_rate": 0.0001438889649510956, + "loss": 0.8605, + "step": 47 + }, + { + "epoch": 0.384, + "grad_norm": 0.4049030465990177, + "learning_rate": 0.00014154150130018866, + "loss": 0.8528, + "step": 48 + }, + { + "epoch": 0.392, + "grad_norm": 0.4808870640382263, + "learning_rate": 0.00013916603579471705, + "loss": 0.87, + "step": 49 + }, + { + "epoch": 0.4, + "grad_norm": 0.52409820233413, + "learning_rate": 0.000136764169663272, + "loss": 0.8126, + "step": 50 + }, + { + "epoch": 0.408, + "grad_norm": 0.6056007216795501, + "learning_rate": 0.00013433752193029886, + "loss": 0.9253, + "step": 51 + }, + { + "epoch": 0.416, + "grad_norm": 0.4288099908418435, + "learning_rate": 0.00013188772832476188, + "loss": 0.7949, + "step": 52 + }, + { + "epoch": 0.424, + "grad_norm": 0.5189621718791575, + "learning_rate": 0.00012941644017754964, + "loss": 0.8815, + "step": 53 + }, + { + "epoch": 0.432, + "grad_norm": 0.45129110175069165, + "learning_rate": 0.00012692532330836346, + "loss": 0.786, + "step": 54 + }, + { + "epoch": 0.44, + "grad_norm": 0.6277596482686542, + "learning_rate": 0.00012441605690283915, + "loss": 0.9032, + "step": 55 + }, + { + "epoch": 0.448, + "grad_norm": 0.44406976016915317, + "learning_rate": 0.0001218903323806595, + "loss": 0.7808, + "step": 56 + }, + { + "epoch": 0.456, + "grad_norm": 0.4918218002370572, + "learning_rate": 0.00011934985225541998, + "loss": 0.8423, + "step": 57 + }, + { + "epoch": 0.464, + "grad_norm": 0.6049398582277967, + "learning_rate": 0.00011679632898701649, + "loss": 0.9006, + "step": 58 + }, + { + "epoch": 0.472, + "grad_norm": 0.49953351896163867, + "learning_rate": 0.00011423148382732853, + "loss": 0.8549, + "step": 59 + }, + { + "epoch": 0.48, + "grad_norm": 0.44150354205208875, + "learning_rate": 0.00011165704565997593, + "loss": 0.8135, + "step": 60 + }, + { + "epoch": 0.488, + "grad_norm": 0.44711567607942587, + "learning_rate": 0.00010907474983493144, + "loss": 0.7721, + "step": 61 + }, + { + "epoch": 0.496, + "grad_norm": 0.6576684715478033, + "learning_rate": 0.0001064863369987743, + "loss": 0.9037, + "step": 62 + }, + { + "epoch": 0.504, + "grad_norm": 0.45292352334579544, + "learning_rate": 0.00010389355192137377, + "loss": 0.8571, + "step": 63 + }, + { + "epoch": 0.512, + "grad_norm": 0.4241065430917788, + "learning_rate": 0.0001012981423197931, + "loss": 0.7847, + "step": 64 + }, + { + "epoch": 0.52, + "grad_norm": 0.3914694358431383, + "learning_rate": 9.870185768020693e-05, + "loss": 0.7819, + "step": 65 + }, + { + "epoch": 0.528, + "grad_norm": 0.4322735032742982, + "learning_rate": 9.610644807862625e-05, + "loss": 0.8738, + "step": 66 + }, + { + "epoch": 0.536, + "grad_norm": 0.4309376434320946, + "learning_rate": 9.35136630012257e-05, + "loss": 0.8367, + "step": 67 + }, + { + "epoch": 0.544, + "grad_norm": 0.46566584064135313, + "learning_rate": 9.092525016506858e-05, + "loss": 0.8552, + "step": 68 + }, + { + "epoch": 0.552, + "grad_norm": 0.429308500201854, + "learning_rate": 8.83429543400241e-05, + "loss": 0.8101, + "step": 69 + }, + { + "epoch": 0.56, + "grad_norm": 0.587185938724539, + "learning_rate": 8.57685161726715e-05, + "loss": 0.8888, + "step": 70 + }, + { + "epoch": 0.568, + "grad_norm": 0.5843203522868754, + "learning_rate": 8.320367101298351e-05, + "loss": 0.967, + "step": 71 + }, + { + "epoch": 0.576, + "grad_norm": 0.4892032041186544, + "learning_rate": 8.065014774458003e-05, + "loss": 0.8529, + "step": 72 + }, + { + "epoch": 0.584, + "grad_norm": 0.6313546824278242, + "learning_rate": 7.810966761934053e-05, + "loss": 0.8197, + "step": 73 + }, + { + "epoch": 0.592, + "grad_norm": 0.46706294787211833, + "learning_rate": 7.558394309716088e-05, + "loss": 0.7694, + "step": 74 + }, + { + "epoch": 0.6, + "grad_norm": 0.5194463291566068, + "learning_rate": 7.307467669163655e-05, + "loss": 0.7919, + "step": 75 + }, + { + "epoch": 0.608, + "grad_norm": 0.5438788300581358, + "learning_rate": 7.058355982245037e-05, + "loss": 0.8728, + "step": 76 + }, + { + "epoch": 0.616, + "grad_norm": 0.5701658320866863, + "learning_rate": 6.811227167523815e-05, + "loss": 0.7784, + "step": 77 + }, + { + "epoch": 0.624, + "grad_norm": 0.5172945223466745, + "learning_rate": 6.566247806970119e-05, + "loss": 0.9284, + "step": 78 + }, + { + "epoch": 0.632, + "grad_norm": 0.45218007811343347, + "learning_rate": 6.323583033672799e-05, + "loss": 0.7992, + "step": 79 + }, + { + "epoch": 0.64, + "grad_norm": 0.45924177524997184, + "learning_rate": 6.083396420528298e-05, + "loss": 0.8009, + "step": 80 + }, + { + "epoch": 0.648, + "grad_norm": 0.6083016332888105, + "learning_rate": 5.845849869981137e-05, + "loss": 0.7786, + "step": 81 + }, + { + "epoch": 0.656, + "grad_norm": 0.44739935065083747, + "learning_rate": 5.611103504890444e-05, + "loss": 0.8756, + "step": 82 + }, + { + "epoch": 0.664, + "grad_norm": 0.42971294075618266, + "learning_rate": 5.379315560596038e-05, + "loss": 0.8298, + "step": 83 + }, + { + "epoch": 0.672, + "grad_norm": 0.5815137727505052, + "learning_rate": 5.1506422782568345e-05, + "loss": 0.8224, + "step": 84 + }, + { + "epoch": 0.68, + "grad_norm": 0.38147701842400894, + "learning_rate": 4.9252377995334444e-05, + "loss": 0.7335, + "step": 85 + }, + { + "epoch": 0.688, + "grad_norm": 0.4322102435702118, + "learning_rate": 4.703254062686017e-05, + "loss": 0.7831, + "step": 86 + }, + { + "epoch": 0.696, + "grad_norm": 0.5089686624428282, + "learning_rate": 4.484840700157295e-05, + "loss": 0.8416, + "step": 87 + }, + { + "epoch": 0.704, + "grad_norm": 0.5505056218355707, + "learning_rate": 4.270144937709981e-05, + "loss": 0.8012, + "step": 88 + }, + { + "epoch": 0.712, + "grad_norm": 0.5070049831936402, + "learning_rate": 4.059311495186338e-05, + "loss": 0.8553, + "step": 89 + }, + { + "epoch": 0.72, + "grad_norm": 0.3899330884875655, + "learning_rate": 3.852482488956992e-05, + "loss": 0.7763, + "step": 90 + }, + { + "epoch": 0.728, + "grad_norm": 0.4056999323126134, + "learning_rate": 3.649797336124615e-05, + "loss": 0.7543, + "step": 91 + }, + { + "epoch": 0.736, + "grad_norm": 0.4412253695071463, + "learning_rate": 3.45139266054715e-05, + "loss": 0.8095, + "step": 92 + }, + { + "epoch": 0.744, + "grad_norm": 0.4619018727704485, + "learning_rate": 3.257402200743821e-05, + "loss": 0.7423, + "step": 93 + }, + { + "epoch": 0.752, + "grad_norm": 0.49371015234697324, + "learning_rate": 3.0679567197461134e-05, + "loss": 0.735, + "step": 94 + }, + { + "epoch": 0.76, + "grad_norm": 0.4298667400521944, + "learning_rate": 2.8831839169543996e-05, + "loss": 0.758, + "step": 95 + }, + { + "epoch": 0.768, + "grad_norm": 0.44698443987236036, + "learning_rate": 2.7032083420597e-05, + "loss": 0.7787, + "step": 96 + }, + { + "epoch": 0.776, + "grad_norm": 1.192577588135792, + "learning_rate": 2.528151311088537e-05, + "loss": 0.9124, + "step": 97 + }, + { + "epoch": 0.784, + "grad_norm": 0.544145417635771, + "learning_rate": 2.3581308246275103e-05, + "loss": 0.7845, + "step": 98 + }, + { + "epoch": 0.792, + "grad_norm": 0.42939269218996073, + "learning_rate": 2.1932614882827197e-05, + "loss": 0.7219, + "step": 99 + }, + { + "epoch": 0.8, + "grad_norm": 0.44156059519958973, + "learning_rate": 2.03365443542764e-05, + "loss": 0.8113, + "step": 100 + }, + { + "epoch": 0.808, + "grad_norm": 0.44046523940845045, + "learning_rate": 1.879417252291502e-05, + "loss": 0.8134, + "step": 101 + }, + { + "epoch": 0.816, + "grad_norm": 0.5488593124276808, + "learning_rate": 1.730653905438714e-05, + "loss": 0.8677, + "step": 102 + }, + { + "epoch": 0.824, + "grad_norm": 0.5225130363831496, + "learning_rate": 1.587464671688187e-05, + "loss": 0.768, + "step": 103 + }, + { + "epoch": 0.832, + "grad_norm": 0.4608618963436507, + "learning_rate": 1.4499460705197998e-05, + "loss": 0.7307, + "step": 104 + }, + { + "epoch": 0.84, + "grad_norm": 0.4858313632617286, + "learning_rate": 1.3181907990135622e-05, + "loss": 0.8246, + "step": 105 + }, + { + "epoch": 0.848, + "grad_norm": 0.5168158601625553, + "learning_rate": 1.1922876693653585e-05, + "loss": 0.8962, + "step": 106 + }, + { + "epoch": 0.856, + "grad_norm": 0.38964343140870383, + "learning_rate": 1.0723215490213634e-05, + "loss": 0.7524, + "step": 107 + }, + { + "epoch": 0.864, + "grad_norm": 0.44153770705871775, + "learning_rate": 9.583733034714981e-06, + "loss": 0.8091, + "step": 108 + }, + { + "epoch": 0.872, + "grad_norm": 0.4201989209720781, + "learning_rate": 8.505197417404687e-06, + "loss": 0.7202, + "step": 109 + }, + { + "epoch": 0.88, + "grad_norm": 0.5585657269663398, + "learning_rate": 7.488335646131628e-06, + "loss": 0.8157, + "step": 110 + }, + { + "epoch": 0.888, + "grad_norm": 0.39368801093203876, + "learning_rate": 6.533833156292679e-06, + "loss": 0.6741, + "step": 111 + }, + { + "epoch": 0.896, + "grad_norm": 0.5064472219894975, + "learning_rate": 5.6423333488018095e-06, + "loss": 0.8613, + "step": 112 + }, + { + "epoch": 0.904, + "grad_norm": 0.47217273527245207, + "learning_rate": 4.8144371563930476e-06, + "loss": 0.8323, + "step": 113 + }, + { + "epoch": 0.912, + "grad_norm": 0.4526610600568625, + "learning_rate": 4.050702638550275e-06, + "loss": 0.836, + "step": 114 + }, + { + "epoch": 0.92, + "grad_norm": 0.4287421695372561, + "learning_rate": 3.3516446053363015e-06, + "loss": 0.7581, + "step": 115 + }, + { + "epoch": 0.928, + "grad_norm": 0.5910599950264193, + "learning_rate": 2.717734270375272e-06, + "loss": 0.8301, + "step": 116 + }, + { + "epoch": 0.936, + "grad_norm": 0.4205494298112779, + "learning_rate": 2.1493989332218468e-06, + "loss": 0.721, + "step": 117 + }, + { + "epoch": 0.944, + "grad_norm": 0.4114647219715256, + "learning_rate": 1.6470216913317626e-06, + "loss": 0.7665, + "step": 118 + }, + { + "epoch": 0.952, + "grad_norm": 0.43372997488992915, + "learning_rate": 1.2109411818274852e-06, + "loss": 0.8017, + "step": 119 + }, + { + "epoch": 0.96, + "grad_norm": 0.4956446279256857, + "learning_rate": 8.41451353233369e-07, + "loss": 0.823, + "step": 120 + }, + { + "epoch": 0.968, + "grad_norm": 0.5412074518102485, + "learning_rate": 5.388012673338661e-07, + "loss": 0.8656, + "step": 121 + }, + { + "epoch": 0.976, + "grad_norm": 0.4602496517708938, + "learning_rate": 3.0319493128866396e-07, + "loss": 0.7907, + "step": 122 + }, + { + "epoch": 0.984, + "grad_norm": 0.4224483839145656, + "learning_rate": 1.3479116011769767e-07, + "loss": 0.7249, + "step": 123 + }, + { + "epoch": 0.992, + "grad_norm": 0.38290363636099745, + "learning_rate": 3.370346964876036e-08, + "loss": 0.715, + "step": 124 + }, + { + "epoch": 1.0, + "grad_norm": 0.4116424800917143, + "learning_rate": 0.0, + "loss": 0.8042, + "step": 125 + }, + { + "epoch": 1.0, + "step": 125, + "total_flos": 108802645983232.0, + "train_loss": 0.8561052279472351, + "train_runtime": 1918.9923, + "train_samples_per_second": 1.042, + "train_steps_per_second": 0.065 + } + ], + "logging_steps": 1.0, + "max_steps": 125, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 108802645983232.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_1_epochs_1_GA_4_lora/README.md b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_1_epochs_1_GA_4_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_1_epochs_1_GA_4_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_1_epochs_1_GA_4_lora/adapter_config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_1_epochs_1_GA_4_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ef5b433c4e8a8c22c5322be3073838a5d29dd92a --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_1_epochs_1_GA_4_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "q_proj", + "gate_proj", + "o_proj", + "k_proj", + "up_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_1_epochs_1_GA_4_lora/adapter_model.safetensors b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_1_epochs_1_GA_4_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..af0834266e95e92cb8662ec320fe4104a49f6419 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_1_epochs_1_GA_4_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:098e9c0d4365d730afb30afd26c20651b3b6d367b43b15c52091b7c2da9f2fe2 +size 671150064 diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_1_epochs_1_GA_4_lora/config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_1_epochs_1_GA_4_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_1_epochs_1_GA_4_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_1_epochs_1_GA_4_lora/non_lora_trainables.bin b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_1_epochs_1_GA_4_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..fb63f3087bab7382842ba324cedeb7a7aeafb292 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_1_epochs_1_GA_4_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd552e8627b7e5460c4b47166f452609fa9e49424e6b38e9ab8b059a7f44c099 +size 918507402 diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_1_epochs_1_GA_4_lora/trainer_state.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_1_epochs_1_GA_4_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..b4e9cd70429134f1eff34f2c9a87dd3192592245 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_1_epochs_1_GA_4_lora/trainer_state.json @@ -0,0 +1,476 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.992, + "eval_steps": 500, + "global_step": 62, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.016, + "grad_norm": 0.8881067032933168, + "learning_rate": 0.0001, + "loss": 1.4945, + "step": 1 + }, + { + "epoch": 0.032, + "grad_norm": 0.7770342479329756, + "learning_rate": 0.0002, + "loss": 1.2212, + "step": 2 + }, + { + "epoch": 0.048, + "grad_norm": 0.9220773230190236, + "learning_rate": 0.0001998629534754574, + "loss": 1.2649, + "step": 3 + }, + { + "epoch": 0.064, + "grad_norm": 0.8155613335304368, + "learning_rate": 0.00019945218953682734, + "loss": 1.0363, + "step": 4 + }, + { + "epoch": 0.08, + "grad_norm": 0.5322614055457643, + "learning_rate": 0.00019876883405951377, + "loss": 0.9997, + "step": 5 + }, + { + "epoch": 0.096, + "grad_norm": 0.44105392561516205, + "learning_rate": 0.00019781476007338058, + "loss": 0.9386, + "step": 6 + }, + { + "epoch": 0.112, + "grad_norm": 0.7833219433873575, + "learning_rate": 0.00019659258262890683, + "loss": 0.9249, + "step": 7 + }, + { + "epoch": 0.128, + "grad_norm": 0.745363100150533, + "learning_rate": 0.00019510565162951537, + "loss": 0.927, + "step": 8 + }, + { + "epoch": 0.144, + "grad_norm": 0.4819461525156197, + "learning_rate": 0.00019335804264972018, + "loss": 0.9433, + "step": 9 + }, + { + "epoch": 0.16, + "grad_norm": 0.7445630307920716, + "learning_rate": 0.0001913545457642601, + "loss": 0.9034, + "step": 10 + }, + { + "epoch": 0.176, + "grad_norm": 0.4924623708308096, + "learning_rate": 0.0001891006524188368, + "loss": 0.8899, + "step": 11 + }, + { + "epoch": 0.192, + "grad_norm": 0.403071806760822, + "learning_rate": 0.00018660254037844388, + "loss": 0.8527, + "step": 12 + }, + { + "epoch": 0.208, + "grad_norm": 0.4430564803213766, + "learning_rate": 0.00018386705679454242, + "loss": 0.9423, + "step": 13 + }, + { + "epoch": 0.224, + "grad_norm": 0.4014569573385412, + "learning_rate": 0.00018090169943749476, + "loss": 0.9413, + "step": 14 + }, + { + "epoch": 0.24, + "grad_norm": 0.44879724353918893, + "learning_rate": 0.0001777145961456971, + "loss": 0.9282, + "step": 15 + }, + { + "epoch": 0.256, + "grad_norm": 0.4431764500501959, + "learning_rate": 0.00017431448254773944, + "loss": 0.98, + "step": 16 + }, + { + "epoch": 0.272, + "grad_norm": 0.43604774988284023, + "learning_rate": 0.00017071067811865476, + "loss": 0.8848, + "step": 17 + }, + { + "epoch": 0.288, + "grad_norm": 0.41322803717963497, + "learning_rate": 0.00016691306063588583, + "loss": 0.8688, + "step": 18 + }, + { + "epoch": 0.304, + "grad_norm": 0.3438188514971592, + "learning_rate": 0.00016293203910498376, + "loss": 0.8404, + "step": 19 + }, + { + "epoch": 0.32, + "grad_norm": 0.4206534215903609, + "learning_rate": 0.00015877852522924732, + "loss": 0.8831, + "step": 20 + }, + { + "epoch": 0.336, + "grad_norm": 0.3738207537435881, + "learning_rate": 0.00015446390350150273, + "loss": 0.8456, + "step": 21 + }, + { + "epoch": 0.352, + "grad_norm": 0.3330511764130146, + "learning_rate": 0.00015000000000000001, + "loss": 0.8315, + "step": 22 + }, + { + "epoch": 0.368, + "grad_norm": 0.4268615101234299, + "learning_rate": 0.00014539904997395468, + "loss": 0.864, + "step": 23 + }, + { + "epoch": 0.384, + "grad_norm": 0.37305624152931305, + "learning_rate": 0.00014067366430758004, + "loss": 0.866, + "step": 24 + }, + { + "epoch": 0.4, + "grad_norm": 0.4441814226993271, + "learning_rate": 0.00013583679495453, + "loss": 0.8582, + "step": 25 + }, + { + "epoch": 0.416, + "grad_norm": 0.4018483090042229, + "learning_rate": 0.00013090169943749476, + "loss": 0.8736, + "step": 26 + }, + { + "epoch": 0.432, + "grad_norm": 0.4021252408952179, + "learning_rate": 0.00012588190451025207, + "loss": 0.8381, + "step": 27 + }, + { + "epoch": 0.448, + "grad_norm": 0.42098023005943175, + "learning_rate": 0.00012079116908177593, + "loss": 0.8531, + "step": 28 + }, + { + "epoch": 0.464, + "grad_norm": 0.49010442252024883, + "learning_rate": 0.0001156434465040231, + "loss": 0.8778, + "step": 29 + }, + { + "epoch": 0.48, + "grad_norm": 0.3425843067375386, + "learning_rate": 0.00011045284632676536, + "loss": 0.8429, + "step": 30 + }, + { + "epoch": 0.496, + "grad_norm": 0.41190711302446653, + "learning_rate": 0.0001052335956242944, + "loss": 0.8413, + "step": 31 + }, + { + "epoch": 0.512, + "grad_norm": 0.3228174068831545, + "learning_rate": 0.0001, + "loss": 0.8297, + "step": 32 + }, + { + "epoch": 0.528, + "grad_norm": 0.4895637910198642, + "learning_rate": 9.476640437570562e-05, + "loss": 0.8342, + "step": 33 + }, + { + "epoch": 0.544, + "grad_norm": 0.34085577946149404, + "learning_rate": 8.954715367323468e-05, + "loss": 0.8517, + "step": 34 + }, + { + "epoch": 0.56, + "grad_norm": 0.37467874711346916, + "learning_rate": 8.435655349597689e-05, + "loss": 0.8541, + "step": 35 + }, + { + "epoch": 0.576, + "grad_norm": 0.3564264425585218, + "learning_rate": 7.920883091822408e-05, + "loss": 0.9149, + "step": 36 + }, + { + "epoch": 0.592, + "grad_norm": 0.4019276143627365, + "learning_rate": 7.411809548974792e-05, + "loss": 0.8024, + "step": 37 + }, + { + "epoch": 0.608, + "grad_norm": 0.3803497603106543, + "learning_rate": 6.909830056250527e-05, + "loss": 0.8459, + "step": 38 + }, + { + "epoch": 0.624, + "grad_norm": 0.36057301404640224, + "learning_rate": 6.416320504546997e-05, + "loss": 0.8604, + "step": 39 + }, + { + "epoch": 0.64, + "grad_norm": 0.331752868629376, + "learning_rate": 5.9326335692419995e-05, + "loss": 0.8089, + "step": 40 + }, + { + "epoch": 0.656, + "grad_norm": 0.3529189783563381, + "learning_rate": 5.4600950026045326e-05, + "loss": 0.8371, + "step": 41 + }, + { + "epoch": 0.672, + "grad_norm": 0.3853558526586511, + "learning_rate": 5.000000000000002e-05, + "loss": 0.837, + "step": 42 + }, + { + "epoch": 0.688, + "grad_norm": 0.32720251243737186, + "learning_rate": 4.5536096498497295e-05, + "loss": 0.7666, + "step": 43 + }, + { + "epoch": 0.704, + "grad_norm": 0.33512069371273234, + "learning_rate": 4.12214747707527e-05, + "loss": 0.829, + "step": 44 + }, + { + "epoch": 0.72, + "grad_norm": 0.33183884892489623, + "learning_rate": 3.7067960895016275e-05, + "loss": 0.8323, + "step": 45 + }, + { + "epoch": 0.736, + "grad_norm": 0.3138210466118749, + "learning_rate": 3.308693936411421e-05, + "loss": 0.797, + "step": 46 + }, + { + "epoch": 0.752, + "grad_norm": 0.3529433722657763, + "learning_rate": 2.9289321881345254e-05, + "loss": 0.7543, + "step": 47 + }, + { + "epoch": 0.768, + "grad_norm": 0.33443578542259444, + "learning_rate": 2.5685517452260567e-05, + "loss": 0.7823, + "step": 48 + }, + { + "epoch": 0.784, + "grad_norm": 0.4351743122080329, + "learning_rate": 2.2285403854302912e-05, + "loss": 0.8616, + "step": 49 + }, + { + "epoch": 0.8, + "grad_norm": 0.31866210190267136, + "learning_rate": 1.9098300562505266e-05, + "loss": 0.78, + "step": 50 + }, + { + "epoch": 0.816, + "grad_norm": 0.35992837737853833, + "learning_rate": 1.6132943205457606e-05, + "loss": 0.8502, + "step": 51 + }, + { + "epoch": 0.832, + "grad_norm": 0.3685387261092368, + "learning_rate": 1.339745962155613e-05, + "loss": 0.7642, + "step": 52 + }, + { + "epoch": 0.848, + "grad_norm": 0.365422078433516, + "learning_rate": 1.0899347581163221e-05, + "loss": 0.8737, + "step": 53 + }, + { + "epoch": 0.864, + "grad_norm": 0.29802642872061985, + "learning_rate": 8.645454235739903e-06, + "loss": 0.7968, + "step": 54 + }, + { + "epoch": 0.88, + "grad_norm": 0.3479896048917906, + "learning_rate": 6.6419573502798374e-06, + "loss": 0.7792, + "step": 55 + }, + { + "epoch": 0.896, + "grad_norm": 0.3341330140937082, + "learning_rate": 4.8943483704846475e-06, + "loss": 0.7838, + "step": 56 + }, + { + "epoch": 0.912, + "grad_norm": 0.33890270082577884, + "learning_rate": 3.40741737109318e-06, + "loss": 0.8476, + "step": 57 + }, + { + "epoch": 0.928, + "grad_norm": 0.3855122596118422, + "learning_rate": 2.1852399266194314e-06, + "loss": 0.8064, + "step": 58 + }, + { + "epoch": 0.944, + "grad_norm": 0.3072954456332672, + "learning_rate": 1.231165940486234e-06, + "loss": 0.7585, + "step": 59 + }, + { + "epoch": 0.96, + "grad_norm": 0.32832280350096327, + "learning_rate": 5.478104631726711e-07, + "loss": 0.8263, + "step": 60 + }, + { + "epoch": 0.976, + "grad_norm": 0.432110699458833, + "learning_rate": 1.3704652454261668e-07, + "loss": 0.8444, + "step": 61 + }, + { + "epoch": 0.992, + "grad_norm": 0.2950937605671529, + "learning_rate": 0.0, + "loss": 0.7306, + "step": 62 + }, + { + "epoch": 0.992, + "step": 62, + "total_flos": 156705435025408.0, + "train_loss": 0.8773936042862553, + "train_runtime": 1891.0897, + "train_samples_per_second": 1.058, + "train_steps_per_second": 0.033 + } + ], + "logging_steps": 1.0, + "max_steps": 62, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 156705435025408.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_2_epochs_1_GA_2_lora/README.md b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_2_epochs_1_GA_2_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_2_epochs_1_GA_2_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_2_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_2_epochs_1_GA_2_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a362cdc6ced0373326ecc6d295eda4a1f3de46db --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_2_epochs_1_GA_2_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "q_proj", + "o_proj", + "up_proj", + "v_proj", + "k_proj", + "gate_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f4380f4bd32963fe315e471f9e5ade402f303390 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5e8784ade77dde109b97f7f0c0f9930d41c11c4d0b5ae9879871d8a71f482b9 +size 671150064 diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_2_epochs_1_GA_2_lora/config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_2_epochs_1_GA_2_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_2_epochs_1_GA_2_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..2888fabbc04e6a99829c2f20d51e95bcfff05706 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cace5c78df0d415786b3c943fe25ccda287dd626adea57f389fdb8761dee8337 +size 918507402 diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_2_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_2_epochs_1_GA_2_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..4fd41b48b605770a356eb324056c4adf1dd1e5c1 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_2_epochs_1_GA_2_lora/trainer_state.json @@ -0,0 +1,917 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 125, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.008, + "grad_norm": 0.8761512161539348, + "learning_rate": 5e-05, + "loss": 1.4159, + "step": 1 + }, + { + "epoch": 0.016, + "grad_norm": 1.050411291756925, + "learning_rate": 0.0001, + "loss": 1.573, + "step": 2 + }, + { + "epoch": 0.024, + "grad_norm": 0.7042707932654642, + "learning_rate": 0.00015000000000000001, + "loss": 1.1491, + "step": 3 + }, + { + "epoch": 0.032, + "grad_norm": 0.7180995438734898, + "learning_rate": 0.0002, + "loss": 1.0088, + "step": 4 + }, + { + "epoch": 0.04, + "grad_norm": 1.365840006786395, + "learning_rate": 0.00019996629653035126, + "loss": 1.117, + "step": 5 + }, + { + "epoch": 0.048, + "grad_norm": 0.7805949745223308, + "learning_rate": 0.00019986520883988232, + "loss": 1.0718, + "step": 6 + }, + { + "epoch": 0.056, + "grad_norm": 0.48790456350172695, + "learning_rate": 0.00019969680506871137, + "loss": 0.8674, + "step": 7 + }, + { + "epoch": 0.064, + "grad_norm": 0.569211715314236, + "learning_rate": 0.00019946119873266613, + "loss": 0.9944, + "step": 8 + }, + { + "epoch": 0.072, + "grad_norm": 0.6438576703248184, + "learning_rate": 0.00019915854864676664, + "loss": 0.9435, + "step": 9 + }, + { + "epoch": 0.08, + "grad_norm": 0.5691926039461578, + "learning_rate": 0.00019878905881817252, + "loss": 0.9537, + "step": 10 + }, + { + "epoch": 0.088, + "grad_norm": 0.5735710492905692, + "learning_rate": 0.00019835297830866826, + "loss": 0.9398, + "step": 11 + }, + { + "epoch": 0.096, + "grad_norm": 0.5788683720151458, + "learning_rate": 0.00019785060106677818, + "loss": 0.8513, + "step": 12 + }, + { + "epoch": 0.104, + "grad_norm": 0.6066358529501978, + "learning_rate": 0.00019728226572962473, + "loss": 0.916, + "step": 13 + }, + { + "epoch": 0.112, + "grad_norm": 0.7458357794862776, + "learning_rate": 0.0001966483553946637, + "loss": 0.847, + "step": 14 + }, + { + "epoch": 0.12, + "grad_norm": 0.5091599140879502, + "learning_rate": 0.00019594929736144976, + "loss": 0.8748, + "step": 15 + }, + { + "epoch": 0.128, + "grad_norm": 0.5021115561896219, + "learning_rate": 0.00019518556284360696, + "loss": 0.916, + "step": 16 + }, + { + "epoch": 0.136, + "grad_norm": 0.49670110297886205, + "learning_rate": 0.0001943576666511982, + "loss": 0.8618, + "step": 17 + }, + { + "epoch": 0.144, + "grad_norm": 0.6031073742650186, + "learning_rate": 0.0001934661668437073, + "loss": 0.9549, + "step": 18 + }, + { + "epoch": 0.152, + "grad_norm": 0.4695881467204669, + "learning_rate": 0.0001925116643538684, + "loss": 0.8321, + "step": 19 + }, + { + "epoch": 0.16, + "grad_norm": 0.5447846782765617, + "learning_rate": 0.00019149480258259533, + "loss": 0.8936, + "step": 20 + }, + { + "epoch": 0.168, + "grad_norm": 0.6433587271879366, + "learning_rate": 0.00019041626696528503, + "loss": 0.7734, + "step": 21 + }, + { + "epoch": 0.176, + "grad_norm": 0.569068431075782, + "learning_rate": 0.0001892767845097864, + "loss": 0.9471, + "step": 22 + }, + { + "epoch": 0.184, + "grad_norm": 0.504188410609563, + "learning_rate": 0.00018807712330634642, + "loss": 0.8076, + "step": 23 + }, + { + "epoch": 0.192, + "grad_norm": 0.45271163917212337, + "learning_rate": 0.0001868180920098644, + "loss": 0.8493, + "step": 24 + }, + { + "epoch": 0.2, + "grad_norm": 0.5882580444650141, + "learning_rate": 0.00018550053929480202, + "loss": 0.962, + "step": 25 + }, + { + "epoch": 0.208, + "grad_norm": 0.563830482355529, + "learning_rate": 0.00018412535328311814, + "loss": 0.8956, + "step": 26 + }, + { + "epoch": 0.216, + "grad_norm": 0.5110945514147994, + "learning_rate": 0.0001826934609456129, + "loss": 0.8988, + "step": 27 + }, + { + "epoch": 0.224, + "grad_norm": 0.4977997445045047, + "learning_rate": 0.00018120582747708502, + "loss": 0.9509, + "step": 28 + }, + { + "epoch": 0.232, + "grad_norm": 0.656914022247126, + "learning_rate": 0.0001796634556457236, + "loss": 1.0623, + "step": 29 + }, + { + "epoch": 0.24, + "grad_norm": 0.39541950218179106, + "learning_rate": 0.0001780673851171728, + "loss": 0.7705, + "step": 30 + }, + { + "epoch": 0.248, + "grad_norm": 0.5298013165099509, + "learning_rate": 0.00017641869175372493, + "loss": 0.927, + "step": 31 + }, + { + "epoch": 0.256, + "grad_norm": 0.5581973615381166, + "learning_rate": 0.00017471848688911464, + "loss": 1.0056, + "step": 32 + }, + { + "epoch": 0.264, + "grad_norm": 0.4863895130531955, + "learning_rate": 0.000172967916579403, + "loss": 0.8686, + "step": 33 + }, + { + "epoch": 0.272, + "grad_norm": 0.5216196632515393, + "learning_rate": 0.00017116816083045602, + "loss": 0.8659, + "step": 34 + }, + { + "epoch": 0.28, + "grad_norm": 0.4709873315394201, + "learning_rate": 0.0001693204328025389, + "loss": 0.8897, + "step": 35 + }, + { + "epoch": 0.288, + "grad_norm": 0.42140975185568336, + "learning_rate": 0.00016742597799256182, + "loss": 0.8153, + "step": 36 + }, + { + "epoch": 0.296, + "grad_norm": 0.4850686126078359, + "learning_rate": 0.00016548607339452853, + "loss": 0.839, + "step": 37 + }, + { + "epoch": 0.304, + "grad_norm": 0.42663092480266446, + "learning_rate": 0.00016350202663875386, + "loss": 0.8168, + "step": 38 + }, + { + "epoch": 0.312, + "grad_norm": 0.53592925059155, + "learning_rate": 0.0001614751751104301, + "loss": 0.8923, + "step": 39 + }, + { + "epoch": 0.32, + "grad_norm": 0.4776274769679771, + "learning_rate": 0.00015940688504813662, + "loss": 0.8552, + "step": 40 + }, + { + "epoch": 0.328, + "grad_norm": 0.4525156648112374, + "learning_rate": 0.00015729855062290022, + "loss": 0.7905, + "step": 41 + }, + { + "epoch": 0.336, + "grad_norm": 0.44609347704483965, + "learning_rate": 0.00015515159299842707, + "loss": 0.8436, + "step": 42 + }, + { + "epoch": 0.344, + "grad_norm": 0.46291422204767446, + "learning_rate": 0.00015296745937313987, + "loss": 0.831, + "step": 43 + }, + { + "epoch": 0.352, + "grad_norm": 0.43426351508110217, + "learning_rate": 0.00015074762200466556, + "loss": 0.8187, + "step": 44 + }, + { + "epoch": 0.36, + "grad_norm": 0.5096568332860442, + "learning_rate": 0.00014849357721743168, + "loss": 0.9222, + "step": 45 + }, + { + "epoch": 0.368, + "grad_norm": 0.6195395152770948, + "learning_rate": 0.00014620684439403962, + "loss": 0.7948, + "step": 46 + }, + { + "epoch": 0.376, + "grad_norm": 0.5489690096452822, + "learning_rate": 0.0001438889649510956, + "loss": 0.861, + "step": 47 + }, + { + "epoch": 0.384, + "grad_norm": 0.4175506705673952, + "learning_rate": 0.00014154150130018866, + "loss": 0.8515, + "step": 48 + }, + { + "epoch": 0.392, + "grad_norm": 0.4650054032567586, + "learning_rate": 0.00013916603579471705, + "loss": 0.8685, + "step": 49 + }, + { + "epoch": 0.4, + "grad_norm": 0.5081807510201565, + "learning_rate": 0.000136764169663272, + "loss": 0.8163, + "step": 50 + }, + { + "epoch": 0.408, + "grad_norm": 0.6056212320947826, + "learning_rate": 0.00013433752193029886, + "loss": 0.9239, + "step": 51 + }, + { + "epoch": 0.416, + "grad_norm": 0.47480587943042707, + "learning_rate": 0.00013188772832476188, + "loss": 0.7968, + "step": 52 + }, + { + "epoch": 0.424, + "grad_norm": 0.5429809344267971, + "learning_rate": 0.00012941644017754964, + "loss": 0.8816, + "step": 53 + }, + { + "epoch": 0.432, + "grad_norm": 0.4439436498680593, + "learning_rate": 0.00012692532330836346, + "loss": 0.7849, + "step": 54 + }, + { + "epoch": 0.44, + "grad_norm": 0.5744111918919593, + "learning_rate": 0.00012441605690283915, + "loss": 0.8998, + "step": 55 + }, + { + "epoch": 0.448, + "grad_norm": 0.4364047448911903, + "learning_rate": 0.0001218903323806595, + "loss": 0.7857, + "step": 56 + }, + { + "epoch": 0.456, + "grad_norm": 0.4896415969107107, + "learning_rate": 0.00011934985225541998, + "loss": 0.8444, + "step": 57 + }, + { + "epoch": 0.464, + "grad_norm": 0.6044564082237883, + "learning_rate": 0.00011679632898701649, + "loss": 0.9029, + "step": 58 + }, + { + "epoch": 0.472, + "grad_norm": 0.4677944001963764, + "learning_rate": 0.00011423148382732853, + "loss": 0.8561, + "step": 59 + }, + { + "epoch": 0.48, + "grad_norm": 0.423434247928326, + "learning_rate": 0.00011165704565997593, + "loss": 0.8139, + "step": 60 + }, + { + "epoch": 0.488, + "grad_norm": 0.43531524490158446, + "learning_rate": 0.00010907474983493144, + "loss": 0.7697, + "step": 61 + }, + { + "epoch": 0.496, + "grad_norm": 0.61656973701804, + "learning_rate": 0.0001064863369987743, + "loss": 0.9032, + "step": 62 + }, + { + "epoch": 0.504, + "grad_norm": 0.41572884209120664, + "learning_rate": 0.00010389355192137377, + "loss": 0.8574, + "step": 63 + }, + { + "epoch": 0.512, + "grad_norm": 0.4098257972778533, + "learning_rate": 0.0001012981423197931, + "loss": 0.7845, + "step": 64 + }, + { + "epoch": 0.52, + "grad_norm": 0.3966625098251103, + "learning_rate": 9.870185768020693e-05, + "loss": 0.7811, + "step": 65 + }, + { + "epoch": 0.528, + "grad_norm": 0.4344170465996877, + "learning_rate": 9.610644807862625e-05, + "loss": 0.8776, + "step": 66 + }, + { + "epoch": 0.536, + "grad_norm": 0.4276380086198874, + "learning_rate": 9.35136630012257e-05, + "loss": 0.8388, + "step": 67 + }, + { + "epoch": 0.544, + "grad_norm": 0.4674414321713914, + "learning_rate": 9.092525016506858e-05, + "loss": 0.8589, + "step": 68 + }, + { + "epoch": 0.552, + "grad_norm": 0.41318359890735357, + "learning_rate": 8.83429543400241e-05, + "loss": 0.8143, + "step": 69 + }, + { + "epoch": 0.56, + "grad_norm": 0.5788385254147412, + "learning_rate": 8.57685161726715e-05, + "loss": 0.8897, + "step": 70 + }, + { + "epoch": 0.568, + "grad_norm": 0.5405431977766236, + "learning_rate": 8.320367101298351e-05, + "loss": 0.9667, + "step": 71 + }, + { + "epoch": 0.576, + "grad_norm": 0.47114343948575044, + "learning_rate": 8.065014774458003e-05, + "loss": 0.8514, + "step": 72 + }, + { + "epoch": 0.584, + "grad_norm": 0.6246980866082275, + "learning_rate": 7.810966761934053e-05, + "loss": 0.8199, + "step": 73 + }, + { + "epoch": 0.592, + "grad_norm": 0.458244114171704, + "learning_rate": 7.558394309716088e-05, + "loss": 0.7683, + "step": 74 + }, + { + "epoch": 0.6, + "grad_norm": 0.5054436738510697, + "learning_rate": 7.307467669163655e-05, + "loss": 0.794, + "step": 75 + }, + { + "epoch": 0.608, + "grad_norm": 0.514804478607465, + "learning_rate": 7.058355982245037e-05, + "loss": 0.8751, + "step": 76 + }, + { + "epoch": 0.616, + "grad_norm": 0.44096381256801, + "learning_rate": 6.811227167523815e-05, + "loss": 0.7778, + "step": 77 + }, + { + "epoch": 0.624, + "grad_norm": 0.5144965548813178, + "learning_rate": 6.566247806970119e-05, + "loss": 0.9338, + "step": 78 + }, + { + "epoch": 0.632, + "grad_norm": 0.4378295019083376, + "learning_rate": 6.323583033672799e-05, + "loss": 0.7987, + "step": 79 + }, + { + "epoch": 0.64, + "grad_norm": 0.45383973182342063, + "learning_rate": 6.083396420528298e-05, + "loss": 0.8021, + "step": 80 + }, + { + "epoch": 0.648, + "grad_norm": 0.45819642148586637, + "learning_rate": 5.845849869981137e-05, + "loss": 0.7742, + "step": 81 + }, + { + "epoch": 0.656, + "grad_norm": 0.4516052417805142, + "learning_rate": 5.611103504890444e-05, + "loss": 0.8765, + "step": 82 + }, + { + "epoch": 0.664, + "grad_norm": 0.4408962164195109, + "learning_rate": 5.379315560596038e-05, + "loss": 0.8315, + "step": 83 + }, + { + "epoch": 0.672, + "grad_norm": 0.649416453323441, + "learning_rate": 5.1506422782568345e-05, + "loss": 0.822, + "step": 84 + }, + { + "epoch": 0.68, + "grad_norm": 0.38103530904658756, + "learning_rate": 4.9252377995334444e-05, + "loss": 0.7313, + "step": 85 + }, + { + "epoch": 0.688, + "grad_norm": 0.4408418986490477, + "learning_rate": 4.703254062686017e-05, + "loss": 0.7804, + "step": 86 + }, + { + "epoch": 0.696, + "grad_norm": 0.4777463565487696, + "learning_rate": 4.484840700157295e-05, + "loss": 0.8402, + "step": 87 + }, + { + "epoch": 0.704, + "grad_norm": 0.3918054071841872, + "learning_rate": 4.270144937709981e-05, + "loss": 0.8028, + "step": 88 + }, + { + "epoch": 0.712, + "grad_norm": 0.49300712253228707, + "learning_rate": 4.059311495186338e-05, + "loss": 0.8599, + "step": 89 + }, + { + "epoch": 0.72, + "grad_norm": 0.35718980891187124, + "learning_rate": 3.852482488956992e-05, + "loss": 0.7782, + "step": 90 + }, + { + "epoch": 0.728, + "grad_norm": 0.422830451467894, + "learning_rate": 3.649797336124615e-05, + "loss": 0.7543, + "step": 91 + }, + { + "epoch": 0.736, + "grad_norm": 0.4300306903126699, + "learning_rate": 3.45139266054715e-05, + "loss": 0.8066, + "step": 92 + }, + { + "epoch": 0.744, + "grad_norm": 0.4164786718677969, + "learning_rate": 3.257402200743821e-05, + "loss": 0.7428, + "step": 93 + }, + { + "epoch": 0.752, + "grad_norm": 0.48481755337412774, + "learning_rate": 3.0679567197461134e-05, + "loss": 0.7382, + "step": 94 + }, + { + "epoch": 0.76, + "grad_norm": 0.4288045666562381, + "learning_rate": 2.8831839169543996e-05, + "loss": 0.7569, + "step": 95 + }, + { + "epoch": 0.768, + "grad_norm": 0.42925681786070835, + "learning_rate": 2.7032083420597e-05, + "loss": 0.7759, + "step": 96 + }, + { + "epoch": 0.776, + "grad_norm": 0.7704343295584768, + "learning_rate": 2.528151311088537e-05, + "loss": 0.9136, + "step": 97 + }, + { + "epoch": 0.784, + "grad_norm": 0.5260656963642337, + "learning_rate": 2.3581308246275103e-05, + "loss": 0.7824, + "step": 98 + }, + { + "epoch": 0.792, + "grad_norm": 0.43132606359606707, + "learning_rate": 2.1932614882827197e-05, + "loss": 0.7234, + "step": 99 + }, + { + "epoch": 0.8, + "grad_norm": 0.42908704681963655, + "learning_rate": 2.03365443542764e-05, + "loss": 0.8099, + "step": 100 + }, + { + "epoch": 0.808, + "grad_norm": 0.41398213676404483, + "learning_rate": 1.879417252291502e-05, + "loss": 0.8135, + "step": 101 + }, + { + "epoch": 0.816, + "grad_norm": 0.5811640214111465, + "learning_rate": 1.730653905438714e-05, + "loss": 0.8683, + "step": 102 + }, + { + "epoch": 0.824, + "grad_norm": 0.5444268920907986, + "learning_rate": 1.587464671688187e-05, + "loss": 0.7694, + "step": 103 + }, + { + "epoch": 0.832, + "grad_norm": 0.47278317044341817, + "learning_rate": 1.4499460705197998e-05, + "loss": 0.7332, + "step": 104 + }, + { + "epoch": 0.84, + "grad_norm": 0.46965595254274717, + "learning_rate": 1.3181907990135622e-05, + "loss": 0.8266, + "step": 105 + }, + { + "epoch": 0.848, + "grad_norm": 0.49441143115174263, + "learning_rate": 1.1922876693653585e-05, + "loss": 0.8984, + "step": 106 + }, + { + "epoch": 0.856, + "grad_norm": 0.38642163233312743, + "learning_rate": 1.0723215490213634e-05, + "loss": 0.7505, + "step": 107 + }, + { + "epoch": 0.864, + "grad_norm": 0.3923023476439001, + "learning_rate": 9.583733034714981e-06, + "loss": 0.8077, + "step": 108 + }, + { + "epoch": 0.872, + "grad_norm": 0.43042764386880217, + "learning_rate": 8.505197417404687e-06, + "loss": 0.7211, + "step": 109 + }, + { + "epoch": 0.88, + "grad_norm": 0.5049229468813194, + "learning_rate": 7.488335646131628e-06, + "loss": 0.8149, + "step": 110 + }, + { + "epoch": 0.888, + "grad_norm": 0.3982449153940663, + "learning_rate": 6.533833156292679e-06, + "loss": 0.674, + "step": 111 + }, + { + "epoch": 0.896, + "grad_norm": 0.5025847657970963, + "learning_rate": 5.6423333488018095e-06, + "loss": 0.8631, + "step": 112 + }, + { + "epoch": 0.904, + "grad_norm": 0.4614716302588619, + "learning_rate": 4.8144371563930476e-06, + "loss": 0.8314, + "step": 113 + }, + { + "epoch": 0.912, + "grad_norm": 0.45098796276315717, + "learning_rate": 4.050702638550275e-06, + "loss": 0.8341, + "step": 114 + }, + { + "epoch": 0.92, + "grad_norm": 0.4716224685687741, + "learning_rate": 3.3516446053363015e-06, + "loss": 0.7573, + "step": 115 + }, + { + "epoch": 0.928, + "grad_norm": 0.5912212050693847, + "learning_rate": 2.717734270375272e-06, + "loss": 0.8294, + "step": 116 + }, + { + "epoch": 0.936, + "grad_norm": 0.41030071817831365, + "learning_rate": 2.1493989332218468e-06, + "loss": 0.7215, + "step": 117 + }, + { + "epoch": 0.944, + "grad_norm": 0.4448213077659771, + "learning_rate": 1.6470216913317626e-06, + "loss": 0.7652, + "step": 118 + }, + { + "epoch": 0.952, + "grad_norm": 0.44403906442933844, + "learning_rate": 1.2109411818274852e-06, + "loss": 0.8029, + "step": 119 + }, + { + "epoch": 0.96, + "grad_norm": 0.4561204266052629, + "learning_rate": 8.41451353233369e-07, + "loss": 0.8229, + "step": 120 + }, + { + "epoch": 0.968, + "grad_norm": 0.5281674345762041, + "learning_rate": 5.388012673338661e-07, + "loss": 0.8658, + "step": 121 + }, + { + "epoch": 0.976, + "grad_norm": 0.4788575110845754, + "learning_rate": 3.0319493128866396e-07, + "loss": 0.7892, + "step": 122 + }, + { + "epoch": 0.984, + "grad_norm": 0.4274777825254005, + "learning_rate": 1.3479116011769767e-07, + "loss": 0.7265, + "step": 123 + }, + { + "epoch": 0.992, + "grad_norm": 0.3849614574513801, + "learning_rate": 3.370346964876036e-08, + "loss": 0.7138, + "step": 124 + }, + { + "epoch": 1.0, + "grad_norm": 0.41576314638434686, + "learning_rate": 0.0, + "loss": 0.8058, + "step": 125 + }, + { + "epoch": 1.0, + "step": 125, + "total_flos": 108802645983232.0, + "train_loss": 0.8562675738334655, + "train_runtime": 1916.1262, + "train_samples_per_second": 1.044, + "train_steps_per_second": 0.065 + } + ], + "logging_steps": 1.0, + "max_steps": 125, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 108802645983232.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_2_epochs_1_GA_4_lora/README.md b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_2_epochs_1_GA_4_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_2_epochs_1_GA_4_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_2_epochs_1_GA_4_lora/adapter_config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_2_epochs_1_GA_4_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8223bbc9d4cfe40a6e4689d0ebc22ac5333f2fee --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_2_epochs_1_GA_4_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "o_proj", + "v_proj", + "gate_proj", + "up_proj", + "down_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_2_epochs_1_GA_4_lora/adapter_model.safetensors b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_2_epochs_1_GA_4_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..086fb413b4f1ce56380d39826209c56704018d42 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_2_epochs_1_GA_4_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e1e61d2d7404d40c644ae5ad81cc28471577c720faf1a08d3b73334a331a5c5b +size 671150064 diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_2_epochs_1_GA_4_lora/config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_2_epochs_1_GA_4_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_2_epochs_1_GA_4_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_2_epochs_1_GA_4_lora/non_lora_trainables.bin b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_2_epochs_1_GA_4_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..2decdb525660fba77353047c761c17f5e578a364 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_2_epochs_1_GA_4_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41c6d522067de7256ddf8ad16476dcd3bf63496b9ac6a049a6d232143081056c +size 918507402 diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_2_epochs_1_GA_4_lora/trainer_state.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_2_epochs_1_GA_4_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..df9bac2ea501484355d775e0df6cc31047895203 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_2_epochs_1_GA_4_lora/trainer_state.json @@ -0,0 +1,476 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.992, + "eval_steps": 500, + "global_step": 62, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.016, + "grad_norm": 0.8733075285737993, + "learning_rate": 0.0001, + "loss": 1.4945, + "step": 1 + }, + { + "epoch": 0.032, + "grad_norm": 0.7626200098479858, + "learning_rate": 0.0002, + "loss": 1.2212, + "step": 2 + }, + { + "epoch": 0.048, + "grad_norm": 0.9150383578127355, + "learning_rate": 0.0001998629534754574, + "loss": 1.2654, + "step": 3 + }, + { + "epoch": 0.064, + "grad_norm": 0.8096181977268556, + "learning_rate": 0.00019945218953682734, + "loss": 1.0377, + "step": 4 + }, + { + "epoch": 0.08, + "grad_norm": 0.5351923550168005, + "learning_rate": 0.00019876883405951377, + "loss": 0.9988, + "step": 5 + }, + { + "epoch": 0.096, + "grad_norm": 0.4384781912498458, + "learning_rate": 0.00019781476007338058, + "loss": 0.9383, + "step": 6 + }, + { + "epoch": 0.112, + "grad_norm": 0.7226007969998041, + "learning_rate": 0.00019659258262890683, + "loss": 0.9236, + "step": 7 + }, + { + "epoch": 0.128, + "grad_norm": 0.9113171413285274, + "learning_rate": 0.00019510565162951537, + "loss": 0.9274, + "step": 8 + }, + { + "epoch": 0.144, + "grad_norm": 0.5024327863132997, + "learning_rate": 0.00019335804264972018, + "loss": 0.9444, + "step": 9 + }, + { + "epoch": 0.16, + "grad_norm": 0.7239658506567372, + "learning_rate": 0.0001913545457642601, + "loss": 0.9094, + "step": 10 + }, + { + "epoch": 0.176, + "grad_norm": 0.4996150518477214, + "learning_rate": 0.0001891006524188368, + "loss": 0.8923, + "step": 11 + }, + { + "epoch": 0.192, + "grad_norm": 0.40452327684044664, + "learning_rate": 0.00018660254037844388, + "loss": 0.8539, + "step": 12 + }, + { + "epoch": 0.208, + "grad_norm": 0.43841751967829246, + "learning_rate": 0.00018386705679454242, + "loss": 0.9433, + "step": 13 + }, + { + "epoch": 0.224, + "grad_norm": 0.41410096043074957, + "learning_rate": 0.00018090169943749476, + "loss": 0.9434, + "step": 14 + }, + { + "epoch": 0.24, + "grad_norm": 0.4184189668224682, + "learning_rate": 0.0001777145961456971, + "loss": 0.9297, + "step": 15 + }, + { + "epoch": 0.256, + "grad_norm": 0.4457447902412213, + "learning_rate": 0.00017431448254773944, + "loss": 0.9793, + "step": 16 + }, + { + "epoch": 0.272, + "grad_norm": 0.3818143910735278, + "learning_rate": 0.00017071067811865476, + "loss": 0.8853, + "step": 17 + }, + { + "epoch": 0.288, + "grad_norm": 0.3783548112578088, + "learning_rate": 0.00016691306063588583, + "loss": 0.8698, + "step": 18 + }, + { + "epoch": 0.304, + "grad_norm": 0.34701134667864025, + "learning_rate": 0.00016293203910498376, + "loss": 0.8403, + "step": 19 + }, + { + "epoch": 0.32, + "grad_norm": 0.4089284076654726, + "learning_rate": 0.00015877852522924732, + "loss": 0.8835, + "step": 20 + }, + { + "epoch": 0.336, + "grad_norm": 0.3704098957366238, + "learning_rate": 0.00015446390350150273, + "loss": 0.8466, + "step": 21 + }, + { + "epoch": 0.352, + "grad_norm": 0.3373174414785272, + "learning_rate": 0.00015000000000000001, + "loss": 0.8322, + "step": 22 + }, + { + "epoch": 0.368, + "grad_norm": 0.43441879102459396, + "learning_rate": 0.00014539904997395468, + "loss": 0.8635, + "step": 23 + }, + { + "epoch": 0.384, + "grad_norm": 0.41692608982811813, + "learning_rate": 0.00014067366430758004, + "loss": 0.8661, + "step": 24 + }, + { + "epoch": 0.4, + "grad_norm": 0.39519755910158144, + "learning_rate": 0.00013583679495453, + "loss": 0.857, + "step": 25 + }, + { + "epoch": 0.416, + "grad_norm": 0.39860172267169647, + "learning_rate": 0.00013090169943749476, + "loss": 0.8767, + "step": 26 + }, + { + "epoch": 0.432, + "grad_norm": 0.34911065356900484, + "learning_rate": 0.00012588190451025207, + "loss": 0.8397, + "step": 27 + }, + { + "epoch": 0.448, + "grad_norm": 0.4110063667874368, + "learning_rate": 0.00012079116908177593, + "loss": 0.8537, + "step": 28 + }, + { + "epoch": 0.464, + "grad_norm": 0.4020608405658079, + "learning_rate": 0.0001156434465040231, + "loss": 0.8779, + "step": 29 + }, + { + "epoch": 0.48, + "grad_norm": 0.3281161783726237, + "learning_rate": 0.00011045284632676536, + "loss": 0.8438, + "step": 30 + }, + { + "epoch": 0.496, + "grad_norm": 0.39975007050586536, + "learning_rate": 0.0001052335956242944, + "loss": 0.8427, + "step": 31 + }, + { + "epoch": 0.512, + "grad_norm": 0.3045871941817785, + "learning_rate": 0.0001, + "loss": 0.8293, + "step": 32 + }, + { + "epoch": 0.528, + "grad_norm": 0.3131284149787335, + "learning_rate": 9.476640437570562e-05, + "loss": 0.835, + "step": 33 + }, + { + "epoch": 0.544, + "grad_norm": 0.3324456643384695, + "learning_rate": 8.954715367323468e-05, + "loss": 0.8543, + "step": 34 + }, + { + "epoch": 0.56, + "grad_norm": 0.3627683766090873, + "learning_rate": 8.435655349597689e-05, + "loss": 0.8516, + "step": 35 + }, + { + "epoch": 0.576, + "grad_norm": 0.38696135375325097, + "learning_rate": 7.920883091822408e-05, + "loss": 0.9156, + "step": 36 + }, + { + "epoch": 0.592, + "grad_norm": 0.45486873904707503, + "learning_rate": 7.411809548974792e-05, + "loss": 0.8027, + "step": 37 + }, + { + "epoch": 0.608, + "grad_norm": 0.38133658750527805, + "learning_rate": 6.909830056250527e-05, + "loss": 0.8479, + "step": 38 + }, + { + "epoch": 0.624, + "grad_norm": 0.37712780316547756, + "learning_rate": 6.416320504546997e-05, + "loss": 0.8609, + "step": 39 + }, + { + "epoch": 0.64, + "grad_norm": 0.34231024678456085, + "learning_rate": 5.9326335692419995e-05, + "loss": 0.8113, + "step": 40 + }, + { + "epoch": 0.656, + "grad_norm": 0.35345370336204857, + "learning_rate": 5.4600950026045326e-05, + "loss": 0.8387, + "step": 41 + }, + { + "epoch": 0.672, + "grad_norm": 0.3746605723300753, + "learning_rate": 5.000000000000002e-05, + "loss": 0.8365, + "step": 42 + }, + { + "epoch": 0.688, + "grad_norm": 0.34761992672468933, + "learning_rate": 4.5536096498497295e-05, + "loss": 0.7645, + "step": 43 + }, + { + "epoch": 0.704, + "grad_norm": 0.3407070364965428, + "learning_rate": 4.12214747707527e-05, + "loss": 0.8283, + "step": 44 + }, + { + "epoch": 0.72, + "grad_norm": 0.35823118986429253, + "learning_rate": 3.7067960895016275e-05, + "loss": 0.8343, + "step": 45 + }, + { + "epoch": 0.736, + "grad_norm": 0.31586195949350904, + "learning_rate": 3.308693936411421e-05, + "loss": 0.7969, + "step": 46 + }, + { + "epoch": 0.752, + "grad_norm": 0.3346111912665213, + "learning_rate": 2.9289321881345254e-05, + "loss": 0.7553, + "step": 47 + }, + { + "epoch": 0.768, + "grad_norm": 0.32509412696237266, + "learning_rate": 2.5685517452260567e-05, + "loss": 0.7828, + "step": 48 + }, + { + "epoch": 0.784, + "grad_norm": 0.4154838364632596, + "learning_rate": 2.2285403854302912e-05, + "loss": 0.861, + "step": 49 + }, + { + "epoch": 0.8, + "grad_norm": 0.33041909272930825, + "learning_rate": 1.9098300562505266e-05, + "loss": 0.7807, + "step": 50 + }, + { + "epoch": 0.816, + "grad_norm": 0.35766555810690437, + "learning_rate": 1.6132943205457606e-05, + "loss": 0.851, + "step": 51 + }, + { + "epoch": 0.832, + "grad_norm": 0.3647144892579273, + "learning_rate": 1.339745962155613e-05, + "loss": 0.7653, + "step": 52 + }, + { + "epoch": 0.848, + "grad_norm": 0.355996832861928, + "learning_rate": 1.0899347581163221e-05, + "loss": 0.8726, + "step": 53 + }, + { + "epoch": 0.864, + "grad_norm": 0.29079425564647043, + "learning_rate": 8.645454235739903e-06, + "loss": 0.7976, + "step": 54 + }, + { + "epoch": 0.88, + "grad_norm": 0.34186894289874137, + "learning_rate": 6.6419573502798374e-06, + "loss": 0.7814, + "step": 55 + }, + { + "epoch": 0.896, + "grad_norm": 0.32976235250953534, + "learning_rate": 4.8943483704846475e-06, + "loss": 0.7826, + "step": 56 + }, + { + "epoch": 0.912, + "grad_norm": 0.336610764133243, + "learning_rate": 3.40741737109318e-06, + "loss": 0.847, + "step": 57 + }, + { + "epoch": 0.928, + "grad_norm": 0.44920111583511707, + "learning_rate": 2.1852399266194314e-06, + "loss": 0.8073, + "step": 58 + }, + { + "epoch": 0.944, + "grad_norm": 0.303280682511107, + "learning_rate": 1.231165940486234e-06, + "loss": 0.7587, + "step": 59 + }, + { + "epoch": 0.96, + "grad_norm": 0.33694982931979206, + "learning_rate": 5.478104631726711e-07, + "loss": 0.8262, + "step": 60 + }, + { + "epoch": 0.976, + "grad_norm": 0.37592344631210955, + "learning_rate": 1.3704652454261668e-07, + "loss": 0.8454, + "step": 61 + }, + { + "epoch": 0.992, + "grad_norm": 0.3106932127809868, + "learning_rate": 0.0, + "loss": 0.732, + "step": 62 + }, + { + "epoch": 0.992, + "step": 62, + "total_flos": 156705435025408.0, + "train_loss": 0.87800558824693, + "train_runtime": 1890.0284, + "train_samples_per_second": 1.058, + "train_steps_per_second": 0.033 + } + ], + "logging_steps": 1.0, + "max_steps": 62, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 156705435025408.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_3_epochs_1_GA_2_lora/README.md b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_3_epochs_1_GA_2_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_3_epochs_1_GA_2_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_3_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_3_epochs_1_GA_2_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..212513f92bc58f25f51cf6ad20b940b533d87927 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_3_epochs_1_GA_2_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "gate_proj", + "o_proj", + "down_proj", + "up_proj", + "k_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f19d6a487bbb69fb73a2fb9ab5d1a5d228c9d80f --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:972b29cd4005ef2d7fdae4bbda94735be58210f4ad6bd0120728509dff4d0ece +size 671150064 diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_3_epochs_1_GA_2_lora/config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_3_epochs_1_GA_2_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_3_epochs_1_GA_2_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..e07fa1cbb481a9260e9730e10bdc64b284e0bff2 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:837a86515142dba5f3b3a152dd870395470a4649be3885ec4837d31a27d1066d +size 918507402 diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_3_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_3_epochs_1_GA_2_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..604095031c07b3b7b54574b0abcfe869efe56a53 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_3_epochs_1_GA_2_lora/trainer_state.json @@ -0,0 +1,917 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 125, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.008, + "grad_norm": 0.8879978778964512, + "learning_rate": 5e-05, + "loss": 1.4159, + "step": 1 + }, + { + "epoch": 0.016, + "grad_norm": 1.0640587353265116, + "learning_rate": 0.0001, + "loss": 1.573, + "step": 2 + }, + { + "epoch": 0.024, + "grad_norm": 0.7072842736256486, + "learning_rate": 0.00015000000000000001, + "loss": 1.1483, + "step": 3 + }, + { + "epoch": 0.032, + "grad_norm": 0.7278177254379742, + "learning_rate": 0.0002, + "loss": 1.0088, + "step": 4 + }, + { + "epoch": 0.04, + "grad_norm": 1.3856128325715313, + "learning_rate": 0.00019996629653035126, + "loss": 1.1177, + "step": 5 + }, + { + "epoch": 0.048, + "grad_norm": 0.7962273126153702, + "learning_rate": 0.00019986520883988232, + "loss": 1.0713, + "step": 6 + }, + { + "epoch": 0.056, + "grad_norm": 0.4851005926062388, + "learning_rate": 0.00019969680506871137, + "loss": 0.8668, + "step": 7 + }, + { + "epoch": 0.064, + "grad_norm": 0.5704754468532358, + "learning_rate": 0.00019946119873266613, + "loss": 0.9944, + "step": 8 + }, + { + "epoch": 0.072, + "grad_norm": 0.6370909561340997, + "learning_rate": 0.00019915854864676664, + "loss": 0.9445, + "step": 9 + }, + { + "epoch": 0.08, + "grad_norm": 0.5761117959066886, + "learning_rate": 0.00019878905881817252, + "loss": 0.9539, + "step": 10 + }, + { + "epoch": 0.088, + "grad_norm": 0.5617353727973317, + "learning_rate": 0.00019835297830866826, + "loss": 0.9408, + "step": 11 + }, + { + "epoch": 0.096, + "grad_norm": 0.6024206513926328, + "learning_rate": 0.00019785060106677818, + "loss": 0.8523, + "step": 12 + }, + { + "epoch": 0.104, + "grad_norm": 0.6260623836282878, + "learning_rate": 0.00019728226572962473, + "loss": 0.9149, + "step": 13 + }, + { + "epoch": 0.112, + "grad_norm": 0.5506502816329165, + "learning_rate": 0.0001966483553946637, + "loss": 0.8466, + "step": 14 + }, + { + "epoch": 0.12, + "grad_norm": 0.5118404705648577, + "learning_rate": 0.00019594929736144976, + "loss": 0.8748, + "step": 15 + }, + { + "epoch": 0.128, + "grad_norm": 0.5023021511003569, + "learning_rate": 0.00019518556284360696, + "loss": 0.9148, + "step": 16 + }, + { + "epoch": 0.136, + "grad_norm": 0.4929964716691261, + "learning_rate": 0.0001943576666511982, + "loss": 0.8617, + "step": 17 + }, + { + "epoch": 0.144, + "grad_norm": 0.6016874638384124, + "learning_rate": 0.0001934661668437073, + "loss": 0.9531, + "step": 18 + }, + { + "epoch": 0.152, + "grad_norm": 0.7943588764253823, + "learning_rate": 0.0001925116643538684, + "loss": 0.8304, + "step": 19 + }, + { + "epoch": 0.16, + "grad_norm": 0.5624067044042429, + "learning_rate": 0.00019149480258259533, + "loss": 0.8937, + "step": 20 + }, + { + "epoch": 0.168, + "grad_norm": 0.5137368238230682, + "learning_rate": 0.00019041626696528503, + "loss": 0.7733, + "step": 21 + }, + { + "epoch": 0.176, + "grad_norm": 0.5984174701135948, + "learning_rate": 0.0001892767845097864, + "loss": 0.9474, + "step": 22 + }, + { + "epoch": 0.184, + "grad_norm": 0.49681397248538706, + "learning_rate": 0.00018807712330634642, + "loss": 0.8082, + "step": 23 + }, + { + "epoch": 0.192, + "grad_norm": 0.454965958486758, + "learning_rate": 0.0001868180920098644, + "loss": 0.8496, + "step": 24 + }, + { + "epoch": 0.2, + "grad_norm": 0.5985658301217804, + "learning_rate": 0.00018550053929480202, + "loss": 0.9601, + "step": 25 + }, + { + "epoch": 0.208, + "grad_norm": 0.5627873164605425, + "learning_rate": 0.00018412535328311814, + "loss": 0.8974, + "step": 26 + }, + { + "epoch": 0.216, + "grad_norm": 0.5026276671463572, + "learning_rate": 0.0001826934609456129, + "loss": 0.9, + "step": 27 + }, + { + "epoch": 0.224, + "grad_norm": 0.486593416327982, + "learning_rate": 0.00018120582747708502, + "loss": 0.9505, + "step": 28 + }, + { + "epoch": 0.232, + "grad_norm": 0.6283343501267957, + "learning_rate": 0.0001796634556457236, + "loss": 1.0609, + "step": 29 + }, + { + "epoch": 0.24, + "grad_norm": 0.42772552574072337, + "learning_rate": 0.0001780673851171728, + "loss": 0.7684, + "step": 30 + }, + { + "epoch": 0.248, + "grad_norm": 0.5736507946552474, + "learning_rate": 0.00017641869175372493, + "loss": 0.9267, + "step": 31 + }, + { + "epoch": 0.256, + "grad_norm": 0.5790647433139106, + "learning_rate": 0.00017471848688911464, + "loss": 1.0016, + "step": 32 + }, + { + "epoch": 0.264, + "grad_norm": 0.47462305797922866, + "learning_rate": 0.000172967916579403, + "loss": 0.8663, + "step": 33 + }, + { + "epoch": 0.272, + "grad_norm": 0.5666313504012336, + "learning_rate": 0.00017116816083045602, + "loss": 0.871, + "step": 34 + }, + { + "epoch": 0.28, + "grad_norm": 0.4773490934060742, + "learning_rate": 0.0001693204328025389, + "loss": 0.8904, + "step": 35 + }, + { + "epoch": 0.288, + "grad_norm": 0.4234812230255282, + "learning_rate": 0.00016742597799256182, + "loss": 0.8139, + "step": 36 + }, + { + "epoch": 0.296, + "grad_norm": 0.4990989020796822, + "learning_rate": 0.00016548607339452853, + "loss": 0.8389, + "step": 37 + }, + { + "epoch": 0.304, + "grad_norm": 0.4770749552433828, + "learning_rate": 0.00016350202663875386, + "loss": 0.8146, + "step": 38 + }, + { + "epoch": 0.312, + "grad_norm": 0.5428985898936901, + "learning_rate": 0.0001614751751104301, + "loss": 0.8911, + "step": 39 + }, + { + "epoch": 0.32, + "grad_norm": 0.48556669182308065, + "learning_rate": 0.00015940688504813662, + "loss": 0.856, + "step": 40 + }, + { + "epoch": 0.328, + "grad_norm": 0.45066101718445134, + "learning_rate": 0.00015729855062290022, + "loss": 0.7889, + "step": 41 + }, + { + "epoch": 0.336, + "grad_norm": 0.5596146939817267, + "learning_rate": 0.00015515159299842707, + "loss": 0.8416, + "step": 42 + }, + { + "epoch": 0.344, + "grad_norm": 0.4637305909907296, + "learning_rate": 0.00015296745937313987, + "loss": 0.8317, + "step": 43 + }, + { + "epoch": 0.352, + "grad_norm": 0.45397598252864746, + "learning_rate": 0.00015074762200466556, + "loss": 0.8197, + "step": 44 + }, + { + "epoch": 0.36, + "grad_norm": 0.5269813475211595, + "learning_rate": 0.00014849357721743168, + "loss": 0.9236, + "step": 45 + }, + { + "epoch": 0.368, + "grad_norm": 0.6433775232426182, + "learning_rate": 0.00014620684439403962, + "loss": 0.7944, + "step": 46 + }, + { + "epoch": 0.376, + "grad_norm": 0.5740391756017929, + "learning_rate": 0.0001438889649510956, + "loss": 0.8614, + "step": 47 + }, + { + "epoch": 0.384, + "grad_norm": 0.4145784713265135, + "learning_rate": 0.00014154150130018866, + "loss": 0.8548, + "step": 48 + }, + { + "epoch": 0.392, + "grad_norm": 0.47076052554668874, + "learning_rate": 0.00013916603579471705, + "loss": 0.8696, + "step": 49 + }, + { + "epoch": 0.4, + "grad_norm": 0.5383204455379367, + "learning_rate": 0.000136764169663272, + "loss": 0.8132, + "step": 50 + }, + { + "epoch": 0.408, + "grad_norm": 0.6316835168318365, + "learning_rate": 0.00013433752193029886, + "loss": 0.9257, + "step": 51 + }, + { + "epoch": 0.416, + "grad_norm": 0.4389686857107661, + "learning_rate": 0.00013188772832476188, + "loss": 0.7944, + "step": 52 + }, + { + "epoch": 0.424, + "grad_norm": 0.528528046003657, + "learning_rate": 0.00012941644017754964, + "loss": 0.88, + "step": 53 + }, + { + "epoch": 0.432, + "grad_norm": 0.4515615106184379, + "learning_rate": 0.00012692532330836346, + "loss": 0.79, + "step": 54 + }, + { + "epoch": 0.44, + "grad_norm": 0.5638075965910785, + "learning_rate": 0.00012441605690283915, + "loss": 0.9044, + "step": 55 + }, + { + "epoch": 0.448, + "grad_norm": 0.4483567448450639, + "learning_rate": 0.0001218903323806595, + "loss": 0.7843, + "step": 56 + }, + { + "epoch": 0.456, + "grad_norm": 0.4893105736697421, + "learning_rate": 0.00011934985225541998, + "loss": 0.8439, + "step": 57 + }, + { + "epoch": 0.464, + "grad_norm": 0.7910093036400055, + "learning_rate": 0.00011679632898701649, + "loss": 0.9019, + "step": 58 + }, + { + "epoch": 0.472, + "grad_norm": 0.4593444659971679, + "learning_rate": 0.00011423148382732853, + "loss": 0.8549, + "step": 59 + }, + { + "epoch": 0.48, + "grad_norm": 0.4526634408729684, + "learning_rate": 0.00011165704565997593, + "loss": 0.8131, + "step": 60 + }, + { + "epoch": 0.488, + "grad_norm": 0.437064926080817, + "learning_rate": 0.00010907474983493144, + "loss": 0.7716, + "step": 61 + }, + { + "epoch": 0.496, + "grad_norm": 0.6253017004724702, + "learning_rate": 0.0001064863369987743, + "loss": 0.9032, + "step": 62 + }, + { + "epoch": 0.504, + "grad_norm": 0.4339983184927352, + "learning_rate": 0.00010389355192137377, + "loss": 0.8569, + "step": 63 + }, + { + "epoch": 0.512, + "grad_norm": 0.4266326998319048, + "learning_rate": 0.0001012981423197931, + "loss": 0.7823, + "step": 64 + }, + { + "epoch": 0.52, + "grad_norm": 0.42051755007256886, + "learning_rate": 9.870185768020693e-05, + "loss": 0.7825, + "step": 65 + }, + { + "epoch": 0.528, + "grad_norm": 0.44408921014152003, + "learning_rate": 9.610644807862625e-05, + "loss": 0.8759, + "step": 66 + }, + { + "epoch": 0.536, + "grad_norm": 0.40700600974757534, + "learning_rate": 9.35136630012257e-05, + "loss": 0.8383, + "step": 67 + }, + { + "epoch": 0.544, + "grad_norm": 0.4517769595509566, + "learning_rate": 9.092525016506858e-05, + "loss": 0.8548, + "step": 68 + }, + { + "epoch": 0.552, + "grad_norm": 0.41724766653934686, + "learning_rate": 8.83429543400241e-05, + "loss": 0.8084, + "step": 69 + }, + { + "epoch": 0.56, + "grad_norm": 0.5579726731830523, + "learning_rate": 8.57685161726715e-05, + "loss": 0.8889, + "step": 70 + }, + { + "epoch": 0.568, + "grad_norm": 0.5308010558194547, + "learning_rate": 8.320367101298351e-05, + "loss": 0.9703, + "step": 71 + }, + { + "epoch": 0.576, + "grad_norm": 0.4267154953057778, + "learning_rate": 8.065014774458003e-05, + "loss": 0.8524, + "step": 72 + }, + { + "epoch": 0.584, + "grad_norm": 0.8832196614039924, + "learning_rate": 7.810966761934053e-05, + "loss": 0.8189, + "step": 73 + }, + { + "epoch": 0.592, + "grad_norm": 0.4759374903354764, + "learning_rate": 7.558394309716088e-05, + "loss": 0.7674, + "step": 74 + }, + { + "epoch": 0.6, + "grad_norm": 0.590099439862415, + "learning_rate": 7.307467669163655e-05, + "loss": 0.7925, + "step": 75 + }, + { + "epoch": 0.608, + "grad_norm": 0.5315958714433993, + "learning_rate": 7.058355982245037e-05, + "loss": 0.8744, + "step": 76 + }, + { + "epoch": 0.616, + "grad_norm": 0.4757057861756777, + "learning_rate": 6.811227167523815e-05, + "loss": 0.7804, + "step": 77 + }, + { + "epoch": 0.624, + "grad_norm": 0.5836975764080544, + "learning_rate": 6.566247806970119e-05, + "loss": 0.9328, + "step": 78 + }, + { + "epoch": 0.632, + "grad_norm": 0.45735269301853904, + "learning_rate": 6.323583033672799e-05, + "loss": 0.8001, + "step": 79 + }, + { + "epoch": 0.64, + "grad_norm": 0.45530769593290443, + "learning_rate": 6.083396420528298e-05, + "loss": 0.8025, + "step": 80 + }, + { + "epoch": 0.648, + "grad_norm": 0.49428441937770373, + "learning_rate": 5.845849869981137e-05, + "loss": 0.7787, + "step": 81 + }, + { + "epoch": 0.656, + "grad_norm": 0.4457052001040438, + "learning_rate": 5.611103504890444e-05, + "loss": 0.8766, + "step": 82 + }, + { + "epoch": 0.664, + "grad_norm": 0.4708964909784785, + "learning_rate": 5.379315560596038e-05, + "loss": 0.8302, + "step": 83 + }, + { + "epoch": 0.672, + "grad_norm": 0.5617921551900704, + "learning_rate": 5.1506422782568345e-05, + "loss": 0.8202, + "step": 84 + }, + { + "epoch": 0.68, + "grad_norm": 0.39166989111298783, + "learning_rate": 4.9252377995334444e-05, + "loss": 0.7312, + "step": 85 + }, + { + "epoch": 0.688, + "grad_norm": 0.4285481525663216, + "learning_rate": 4.703254062686017e-05, + "loss": 0.7795, + "step": 86 + }, + { + "epoch": 0.696, + "grad_norm": 0.48008755427076916, + "learning_rate": 4.484840700157295e-05, + "loss": 0.8391, + "step": 87 + }, + { + "epoch": 0.704, + "grad_norm": 0.4186970025700412, + "learning_rate": 4.270144937709981e-05, + "loss": 0.8024, + "step": 88 + }, + { + "epoch": 0.712, + "grad_norm": 0.4926271179696444, + "learning_rate": 4.059311495186338e-05, + "loss": 0.8561, + "step": 89 + }, + { + "epoch": 0.72, + "grad_norm": 0.3603863978855644, + "learning_rate": 3.852482488956992e-05, + "loss": 0.7751, + "step": 90 + }, + { + "epoch": 0.728, + "grad_norm": 0.4037378309144934, + "learning_rate": 3.649797336124615e-05, + "loss": 0.7526, + "step": 91 + }, + { + "epoch": 0.736, + "grad_norm": 0.4306313376911504, + "learning_rate": 3.45139266054715e-05, + "loss": 0.8085, + "step": 92 + }, + { + "epoch": 0.744, + "grad_norm": 0.42392952585080557, + "learning_rate": 3.257402200743821e-05, + "loss": 0.7435, + "step": 93 + }, + { + "epoch": 0.752, + "grad_norm": 0.520342827531211, + "learning_rate": 3.0679567197461134e-05, + "loss": 0.7343, + "step": 94 + }, + { + "epoch": 0.76, + "grad_norm": 0.47349477569468973, + "learning_rate": 2.8831839169543996e-05, + "loss": 0.757, + "step": 95 + }, + { + "epoch": 0.768, + "grad_norm": 0.45447378956605405, + "learning_rate": 2.7032083420597e-05, + "loss": 0.7797, + "step": 96 + }, + { + "epoch": 0.776, + "grad_norm": 0.7328627041998275, + "learning_rate": 2.528151311088537e-05, + "loss": 0.9105, + "step": 97 + }, + { + "epoch": 0.784, + "grad_norm": 0.5035338569997073, + "learning_rate": 2.3581308246275103e-05, + "loss": 0.7852, + "step": 98 + }, + { + "epoch": 0.792, + "grad_norm": 0.5665842460961545, + "learning_rate": 2.1932614882827197e-05, + "loss": 0.7229, + "step": 99 + }, + { + "epoch": 0.8, + "grad_norm": 0.4494589285982409, + "learning_rate": 2.03365443542764e-05, + "loss": 0.8121, + "step": 100 + }, + { + "epoch": 0.808, + "grad_norm": 0.43254498390631047, + "learning_rate": 1.879417252291502e-05, + "loss": 0.815, + "step": 101 + }, + { + "epoch": 0.816, + "grad_norm": 0.617162777110529, + "learning_rate": 1.730653905438714e-05, + "loss": 0.8678, + "step": 102 + }, + { + "epoch": 0.824, + "grad_norm": 0.5279563176534565, + "learning_rate": 1.587464671688187e-05, + "loss": 0.7656, + "step": 103 + }, + { + "epoch": 0.832, + "grad_norm": 0.4863656459408922, + "learning_rate": 1.4499460705197998e-05, + "loss": 0.7324, + "step": 104 + }, + { + "epoch": 0.84, + "grad_norm": 0.46759428694355865, + "learning_rate": 1.3181907990135622e-05, + "loss": 0.8252, + "step": 105 + }, + { + "epoch": 0.848, + "grad_norm": 0.5053624111570554, + "learning_rate": 1.1922876693653585e-05, + "loss": 0.8967, + "step": 106 + }, + { + "epoch": 0.856, + "grad_norm": 0.39953892951805164, + "learning_rate": 1.0723215490213634e-05, + "loss": 0.7525, + "step": 107 + }, + { + "epoch": 0.864, + "grad_norm": 0.39344461115660584, + "learning_rate": 9.583733034714981e-06, + "loss": 0.8081, + "step": 108 + }, + { + "epoch": 0.872, + "grad_norm": 0.41587291844449736, + "learning_rate": 8.505197417404687e-06, + "loss": 0.72, + "step": 109 + }, + { + "epoch": 0.88, + "grad_norm": 0.5107460153618854, + "learning_rate": 7.488335646131628e-06, + "loss": 0.8137, + "step": 110 + }, + { + "epoch": 0.888, + "grad_norm": 0.4283624775790327, + "learning_rate": 6.533833156292679e-06, + "loss": 0.6745, + "step": 111 + }, + { + "epoch": 0.896, + "grad_norm": 0.49728016420382526, + "learning_rate": 5.6423333488018095e-06, + "loss": 0.8618, + "step": 112 + }, + { + "epoch": 0.904, + "grad_norm": 0.4631269586439, + "learning_rate": 4.8144371563930476e-06, + "loss": 0.8318, + "step": 113 + }, + { + "epoch": 0.912, + "grad_norm": 0.46178615828089614, + "learning_rate": 4.050702638550275e-06, + "loss": 0.8361, + "step": 114 + }, + { + "epoch": 0.92, + "grad_norm": 0.562034028031936, + "learning_rate": 3.3516446053363015e-06, + "loss": 0.7564, + "step": 115 + }, + { + "epoch": 0.928, + "grad_norm": 0.5856105220033522, + "learning_rate": 2.717734270375272e-06, + "loss": 0.8278, + "step": 116 + }, + { + "epoch": 0.936, + "grad_norm": 0.41761741807300334, + "learning_rate": 2.1493989332218468e-06, + "loss": 0.7205, + "step": 117 + }, + { + "epoch": 0.944, + "grad_norm": 0.419373493627604, + "learning_rate": 1.6470216913317626e-06, + "loss": 0.7671, + "step": 118 + }, + { + "epoch": 0.952, + "grad_norm": 0.4406709425252194, + "learning_rate": 1.2109411818274852e-06, + "loss": 0.7999, + "step": 119 + }, + { + "epoch": 0.96, + "grad_norm": 0.47182624579616544, + "learning_rate": 8.41451353233369e-07, + "loss": 0.8259, + "step": 120 + }, + { + "epoch": 0.968, + "grad_norm": 0.5641461266322076, + "learning_rate": 5.388012673338661e-07, + "loss": 0.869, + "step": 121 + }, + { + "epoch": 0.976, + "grad_norm": 0.4480973058440261, + "learning_rate": 3.0319493128866396e-07, + "loss": 0.789, + "step": 122 + }, + { + "epoch": 0.984, + "grad_norm": 0.44802480682065154, + "learning_rate": 1.3479116011769767e-07, + "loss": 0.7251, + "step": 123 + }, + { + "epoch": 0.992, + "grad_norm": 0.3824694645024767, + "learning_rate": 3.370346964876036e-08, + "loss": 0.7126, + "step": 124 + }, + { + "epoch": 1.0, + "grad_norm": 0.6799359587143076, + "learning_rate": 0.0, + "loss": 0.806, + "step": 125 + }, + { + "epoch": 1.0, + "step": 125, + "total_flos": 108802645983232.0, + "train_loss": 0.8560487117767334, + "train_runtime": 1917.3264, + "train_samples_per_second": 1.043, + "train_steps_per_second": 0.065 + } + ], + "logging_steps": 1.0, + "max_steps": 125, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 108802645983232.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_3_epochs_1_GA_4_lora/README.md b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_3_epochs_1_GA_4_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_3_epochs_1_GA_4_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_3_epochs_1_GA_4_lora/adapter_config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_3_epochs_1_GA_4_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6ced61a90bfa4f993cfdc31fc5336ee45b8b47bb --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_3_epochs_1_GA_4_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "up_proj", + "o_proj", + "v_proj", + "q_proj", + "k_proj", + "down_proj", + "gate_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_3_epochs_1_GA_4_lora/adapter_model.safetensors b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_3_epochs_1_GA_4_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..169908976f9af289dce363eb08da88f3b0610d42 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_3_epochs_1_GA_4_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22c060e9cf31c64d068903084611c41e12212e6f2260b5294053f8de5ff057b5 +size 671150064 diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_3_epochs_1_GA_4_lora/config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_3_epochs_1_GA_4_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_3_epochs_1_GA_4_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_3_epochs_1_GA_4_lora/non_lora_trainables.bin b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_3_epochs_1_GA_4_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..846b235c71968a2ab5144038ab426ec1a171b972 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_3_epochs_1_GA_4_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86d111db45517ea0f1c039b14d931edba26238ac1a7b6b84cd68ec71e83040b1 +size 918507402 diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_3_epochs_1_GA_4_lora/trainer_state.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_3_epochs_1_GA_4_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..900fc95de75dd016686c93138eeb39f3a98468ec --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_2000_repeat_3_epochs_1_GA_4_lora/trainer_state.json @@ -0,0 +1,476 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.992, + "eval_steps": 500, + "global_step": 62, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.016, + "grad_norm": 0.8867986198698434, + "learning_rate": 0.0001, + "loss": 1.4945, + "step": 1 + }, + { + "epoch": 0.032, + "grad_norm": 0.7738820351403444, + "learning_rate": 0.0002, + "loss": 1.2212, + "step": 2 + }, + { + "epoch": 0.048, + "grad_norm": 0.9266735811948859, + "learning_rate": 0.0001998629534754574, + "loss": 1.265, + "step": 3 + }, + { + "epoch": 0.064, + "grad_norm": 0.8094614882041049, + "learning_rate": 0.00019945218953682734, + "loss": 1.0362, + "step": 4 + }, + { + "epoch": 0.08, + "grad_norm": 0.5320508925575772, + "learning_rate": 0.00019876883405951377, + "loss": 0.9992, + "step": 5 + }, + { + "epoch": 0.096, + "grad_norm": 0.43898395144316243, + "learning_rate": 0.00019781476007338058, + "loss": 0.9388, + "step": 6 + }, + { + "epoch": 0.112, + "grad_norm": 0.7153392139613666, + "learning_rate": 0.00019659258262890683, + "loss": 0.9224, + "step": 7 + }, + { + "epoch": 0.128, + "grad_norm": 0.9509151430023896, + "learning_rate": 0.00019510565162951537, + "loss": 0.9274, + "step": 8 + }, + { + "epoch": 0.144, + "grad_norm": 0.5057363843742055, + "learning_rate": 0.00019335804264972018, + "loss": 0.9442, + "step": 9 + }, + { + "epoch": 0.16, + "grad_norm": 0.7551699518116433, + "learning_rate": 0.0001913545457642601, + "loss": 0.9086, + "step": 10 + }, + { + "epoch": 0.176, + "grad_norm": 0.4975706847682533, + "learning_rate": 0.0001891006524188368, + "loss": 0.8915, + "step": 11 + }, + { + "epoch": 0.192, + "grad_norm": 0.40630493493866054, + "learning_rate": 0.00018660254037844388, + "loss": 0.852, + "step": 12 + }, + { + "epoch": 0.208, + "grad_norm": 0.44773052860815865, + "learning_rate": 0.00018386705679454242, + "loss": 0.942, + "step": 13 + }, + { + "epoch": 0.224, + "grad_norm": 0.4107660930492455, + "learning_rate": 0.00018090169943749476, + "loss": 0.9426, + "step": 14 + }, + { + "epoch": 0.24, + "grad_norm": 0.4433638296469529, + "learning_rate": 0.0001777145961456971, + "loss": 0.9291, + "step": 15 + }, + { + "epoch": 0.256, + "grad_norm": 0.44739182714973635, + "learning_rate": 0.00017431448254773944, + "loss": 0.9804, + "step": 16 + }, + { + "epoch": 0.272, + "grad_norm": 0.3916367426859159, + "learning_rate": 0.00017071067811865476, + "loss": 0.8843, + "step": 17 + }, + { + "epoch": 0.288, + "grad_norm": 0.3618881232704661, + "learning_rate": 0.00016691306063588583, + "loss": 0.8707, + "step": 18 + }, + { + "epoch": 0.304, + "grad_norm": 0.3538209495683395, + "learning_rate": 0.00016293203910498376, + "loss": 0.8415, + "step": 19 + }, + { + "epoch": 0.32, + "grad_norm": 0.4145025403872177, + "learning_rate": 0.00015877852522924732, + "loss": 0.8846, + "step": 20 + }, + { + "epoch": 0.336, + "grad_norm": 0.4724154080257005, + "learning_rate": 0.00015446390350150273, + "loss": 0.8455, + "step": 21 + }, + { + "epoch": 0.352, + "grad_norm": 0.3422466750710581, + "learning_rate": 0.00015000000000000001, + "loss": 0.8319, + "step": 22 + }, + { + "epoch": 0.368, + "grad_norm": 0.44464519689398796, + "learning_rate": 0.00014539904997395468, + "loss": 0.864, + "step": 23 + }, + { + "epoch": 0.384, + "grad_norm": 0.3791084014576181, + "learning_rate": 0.00014067366430758004, + "loss": 0.8666, + "step": 24 + }, + { + "epoch": 0.4, + "grad_norm": 0.38043102645498195, + "learning_rate": 0.00013583679495453, + "loss": 0.8567, + "step": 25 + }, + { + "epoch": 0.416, + "grad_norm": 0.4175641638088068, + "learning_rate": 0.00013090169943749476, + "loss": 0.8739, + "step": 26 + }, + { + "epoch": 0.432, + "grad_norm": 0.35545734698454556, + "learning_rate": 0.00012588190451025207, + "loss": 0.8384, + "step": 27 + }, + { + "epoch": 0.448, + "grad_norm": 0.41690406097653954, + "learning_rate": 0.00012079116908177593, + "loss": 0.8525, + "step": 28 + }, + { + "epoch": 0.464, + "grad_norm": 0.4062190743040883, + "learning_rate": 0.0001156434465040231, + "loss": 0.8763, + "step": 29 + }, + { + "epoch": 0.48, + "grad_norm": 0.3363888359929472, + "learning_rate": 0.00011045284632676536, + "loss": 0.844, + "step": 30 + }, + { + "epoch": 0.496, + "grad_norm": 0.3930958740166345, + "learning_rate": 0.0001052335956242944, + "loss": 0.841, + "step": 31 + }, + { + "epoch": 0.512, + "grad_norm": 0.37847718532763863, + "learning_rate": 0.0001, + "loss": 0.8293, + "step": 32 + }, + { + "epoch": 0.528, + "grad_norm": 0.3219461813331027, + "learning_rate": 9.476640437570562e-05, + "loss": 0.8349, + "step": 33 + }, + { + "epoch": 0.544, + "grad_norm": 0.3661640714738879, + "learning_rate": 8.954715367323468e-05, + "loss": 0.8533, + "step": 34 + }, + { + "epoch": 0.56, + "grad_norm": 0.38591513795082677, + "learning_rate": 8.435655349597689e-05, + "loss": 0.8526, + "step": 35 + }, + { + "epoch": 0.576, + "grad_norm": 0.36236104928671203, + "learning_rate": 7.920883091822408e-05, + "loss": 0.9167, + "step": 36 + }, + { + "epoch": 0.592, + "grad_norm": 0.4145275115654983, + "learning_rate": 7.411809548974792e-05, + "loss": 0.8045, + "step": 37 + }, + { + "epoch": 0.608, + "grad_norm": 0.4369453021225843, + "learning_rate": 6.909830056250527e-05, + "loss": 0.8479, + "step": 38 + }, + { + "epoch": 0.624, + "grad_norm": 0.37218240505222444, + "learning_rate": 6.416320504546997e-05, + "loss": 0.8621, + "step": 39 + }, + { + "epoch": 0.64, + "grad_norm": 0.3429280973786549, + "learning_rate": 5.9326335692419995e-05, + "loss": 0.8107, + "step": 40 + }, + { + "epoch": 0.656, + "grad_norm": 0.35109691547833866, + "learning_rate": 5.4600950026045326e-05, + "loss": 0.836, + "step": 41 + }, + { + "epoch": 0.672, + "grad_norm": 0.4341433309538033, + "learning_rate": 5.000000000000002e-05, + "loss": 0.8351, + "step": 42 + }, + { + "epoch": 0.688, + "grad_norm": 0.31035920324535987, + "learning_rate": 4.5536096498497295e-05, + "loss": 0.7654, + "step": 43 + }, + { + "epoch": 0.704, + "grad_norm": 0.37521399138917083, + "learning_rate": 4.12214747707527e-05, + "loss": 0.8292, + "step": 44 + }, + { + "epoch": 0.72, + "grad_norm": 0.3411541590331352, + "learning_rate": 3.7067960895016275e-05, + "loss": 0.8334, + "step": 45 + }, + { + "epoch": 0.736, + "grad_norm": 0.44473950816153507, + "learning_rate": 3.308693936411421e-05, + "loss": 0.7954, + "step": 46 + }, + { + "epoch": 0.752, + "grad_norm": 0.3398517801939119, + "learning_rate": 2.9289321881345254e-05, + "loss": 0.7537, + "step": 47 + }, + { + "epoch": 0.768, + "grad_norm": 0.3483422078250861, + "learning_rate": 2.5685517452260567e-05, + "loss": 0.7816, + "step": 48 + }, + { + "epoch": 0.784, + "grad_norm": 0.49908330553390123, + "learning_rate": 2.2285403854302912e-05, + "loss": 0.8616, + "step": 49 + }, + { + "epoch": 0.8, + "grad_norm": 0.32219378472981447, + "learning_rate": 1.9098300562505266e-05, + "loss": 0.7809, + "step": 50 + }, + { + "epoch": 0.816, + "grad_norm": 0.3576051040484641, + "learning_rate": 1.6132943205457606e-05, + "loss": 0.8512, + "step": 51 + }, + { + "epoch": 0.832, + "grad_norm": 0.3735596853921932, + "learning_rate": 1.339745962155613e-05, + "loss": 0.7639, + "step": 52 + }, + { + "epoch": 0.848, + "grad_norm": 0.3560450371172911, + "learning_rate": 1.0899347581163221e-05, + "loss": 0.8724, + "step": 53 + }, + { + "epoch": 0.864, + "grad_norm": 0.29511183321492873, + "learning_rate": 8.645454235739903e-06, + "loss": 0.7966, + "step": 54 + }, + { + "epoch": 0.88, + "grad_norm": 0.44031345535426497, + "learning_rate": 6.6419573502798374e-06, + "loss": 0.7795, + "step": 55 + }, + { + "epoch": 0.896, + "grad_norm": 0.3309328569866633, + "learning_rate": 4.8943483704846475e-06, + "loss": 0.7844, + "step": 56 + }, + { + "epoch": 0.912, + "grad_norm": 0.3603181651932308, + "learning_rate": 3.40741737109318e-06, + "loss": 0.8482, + "step": 57 + }, + { + "epoch": 0.928, + "grad_norm": 0.466040489343428, + "learning_rate": 2.1852399266194314e-06, + "loss": 0.8068, + "step": 58 + }, + { + "epoch": 0.944, + "grad_norm": 0.30075922405904837, + "learning_rate": 1.231165940486234e-06, + "loss": 0.7587, + "step": 59 + }, + { + "epoch": 0.96, + "grad_norm": 0.3538523629140039, + "learning_rate": 5.478104631726711e-07, + "loss": 0.8258, + "step": 60 + }, + { + "epoch": 0.976, + "grad_norm": 0.3569845319895022, + "learning_rate": 1.3704652454261668e-07, + "loss": 0.8445, + "step": 61 + }, + { + "epoch": 0.992, + "grad_norm": 0.2949839015637882, + "learning_rate": 0.0, + "loss": 0.7312, + "step": 62 + }, + { + "epoch": 0.992, + "step": 62, + "total_flos": 156705435025408.0, + "train_loss": 0.877654165990891, + "train_runtime": 1892.0343, + "train_samples_per_second": 1.057, + "train_steps_per_second": 0.033 + } + ], + "logging_steps": 1.0, + "max_steps": 62, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 156705435025408.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_1_epochs_1_GA_2_lora/README.md b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_1_epochs_1_GA_2_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_1_epochs_1_GA_2_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_1_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_1_epochs_1_GA_2_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ea468be95b86557f25bc23a1f4720bbd1de0bf29 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_1_epochs_1_GA_2_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "gate_proj", + "o_proj", + "k_proj", + "q_proj", + "up_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9609247b12e0beaa02adc29ca7fe922ff26f9f81 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:32eb43786b7fabe1f87e1fef21f803622ba7d21ad538e091c646d44679c373a6 +size 671150064 diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_1_epochs_1_GA_2_lora/config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_1_epochs_1_GA_2_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_1_epochs_1_GA_2_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..2a5f37bdc5952bfadf2a7121a1723b23fb31207e --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b66f972032a2f1b66440386c612ad56a7754415649a955e982b27b3d64eecde +size 918507402 diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_1_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_1_epochs_1_GA_2_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..72283c922ee259fe9289169468447bbb978ffd40 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_1_epochs_1_GA_2_lora/trainer_state.json @@ -0,0 +1,13167 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 1875, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005333333333333334, + "grad_norm": 0.9085631201505874, + "learning_rate": 3.5087719298245615e-06, + "loss": 1.3821, + "step": 1 + }, + { + "epoch": 0.0010666666666666667, + "grad_norm": 1.068586851219567, + "learning_rate": 7.017543859649123e-06, + "loss": 1.4152, + "step": 2 + }, + { + "epoch": 0.0016, + "grad_norm": 1.1395277735386191, + "learning_rate": 1.0526315789473684e-05, + "loss": 1.5638, + "step": 3 + }, + { + "epoch": 0.0021333333333333334, + "grad_norm": 1.0135378511790643, + "learning_rate": 1.4035087719298246e-05, + "loss": 1.4522, + "step": 4 + }, + { + "epoch": 0.0026666666666666666, + "grad_norm": 0.80918697698487, + "learning_rate": 1.7543859649122806e-05, + "loss": 1.2903, + "step": 5 + }, + { + "epoch": 0.0032, + "grad_norm": 0.9639299157020399, + "learning_rate": 2.105263157894737e-05, + "loss": 1.4671, + "step": 6 + }, + { + "epoch": 0.0037333333333333333, + "grad_norm": 0.9027715291571133, + "learning_rate": 2.456140350877193e-05, + "loss": 1.2785, + "step": 7 + }, + { + "epoch": 0.004266666666666667, + "grad_norm": 0.9988954920089032, + "learning_rate": 2.8070175438596492e-05, + "loss": 1.2609, + "step": 8 + }, + { + "epoch": 0.0048, + "grad_norm": 0.8737678584771005, + "learning_rate": 3.157894736842105e-05, + "loss": 1.2049, + "step": 9 + }, + { + "epoch": 0.005333333333333333, + "grad_norm": 0.8872070696199936, + "learning_rate": 3.508771929824561e-05, + "loss": 1.0834, + "step": 10 + }, + { + "epoch": 0.005866666666666667, + "grad_norm": 0.8304838669952055, + "learning_rate": 3.859649122807018e-05, + "loss": 1.0387, + "step": 11 + }, + { + "epoch": 0.0064, + "grad_norm": 0.8864023129511615, + "learning_rate": 4.210526315789474e-05, + "loss": 1.0163, + "step": 12 + }, + { + "epoch": 0.006933333333333333, + "grad_norm": 0.7752916378982887, + "learning_rate": 4.56140350877193e-05, + "loss": 0.9885, + "step": 13 + }, + { + "epoch": 0.007466666666666667, + "grad_norm": 0.8762016533492555, + "learning_rate": 4.912280701754386e-05, + "loss": 1.0494, + "step": 14 + }, + { + "epoch": 0.008, + "grad_norm": 0.7165092903247978, + "learning_rate": 5.2631578947368424e-05, + "loss": 0.9618, + "step": 15 + }, + { + "epoch": 0.008533333333333334, + "grad_norm": 0.7726654208325324, + "learning_rate": 5.6140350877192984e-05, + "loss": 0.9431, + "step": 16 + }, + { + "epoch": 0.009066666666666667, + "grad_norm": 0.9228722088023148, + "learning_rate": 5.9649122807017544e-05, + "loss": 0.9847, + "step": 17 + }, + { + "epoch": 0.0096, + "grad_norm": 0.746543578214571, + "learning_rate": 6.31578947368421e-05, + "loss": 1.0573, + "step": 18 + }, + { + "epoch": 0.010133333333333333, + "grad_norm": 0.8047522847578392, + "learning_rate": 6.666666666666667e-05, + "loss": 1.0574, + "step": 19 + }, + { + "epoch": 0.010666666666666666, + "grad_norm": 0.5964144600984687, + "learning_rate": 7.017543859649122e-05, + "loss": 0.9676, + "step": 20 + }, + { + "epoch": 0.0112, + "grad_norm": 0.60045647351782, + "learning_rate": 7.368421052631579e-05, + "loss": 0.9894, + "step": 21 + }, + { + "epoch": 0.011733333333333333, + "grad_norm": 0.7442466900966471, + "learning_rate": 7.719298245614036e-05, + "loss": 0.9578, + "step": 22 + }, + { + "epoch": 0.012266666666666667, + "grad_norm": 0.5321420807004554, + "learning_rate": 8.070175438596491e-05, + "loss": 0.8634, + "step": 23 + }, + { + "epoch": 0.0128, + "grad_norm": 0.6032438075403467, + "learning_rate": 8.421052631578948e-05, + "loss": 0.9023, + "step": 24 + }, + { + "epoch": 0.013333333333333334, + "grad_norm": 0.7980831015576652, + "learning_rate": 8.771929824561403e-05, + "loss": 1.0827, + "step": 25 + }, + { + "epoch": 0.013866666666666666, + "grad_norm": 0.5555494381624063, + "learning_rate": 9.12280701754386e-05, + "loss": 0.9044, + "step": 26 + }, + { + "epoch": 0.0144, + "grad_norm": 0.6266601305928058, + "learning_rate": 9.473684210526316e-05, + "loss": 0.9779, + "step": 27 + }, + { + "epoch": 0.014933333333333333, + "grad_norm": 0.5186428351623814, + "learning_rate": 9.824561403508771e-05, + "loss": 0.825, + "step": 28 + }, + { + "epoch": 0.015466666666666667, + "grad_norm": 0.5084666492543204, + "learning_rate": 0.0001017543859649123, + "loss": 0.8728, + "step": 29 + }, + { + "epoch": 0.016, + "grad_norm": 0.5653541946622131, + "learning_rate": 0.00010526315789473685, + "loss": 0.9396, + "step": 30 + }, + { + "epoch": 0.016533333333333334, + "grad_norm": 0.4669593198331131, + "learning_rate": 0.00010877192982456141, + "loss": 0.8382, + "step": 31 + }, + { + "epoch": 0.017066666666666667, + "grad_norm": 0.6007810072183271, + "learning_rate": 0.00011228070175438597, + "loss": 0.9553, + "step": 32 + }, + { + "epoch": 0.0176, + "grad_norm": 0.6628881782369296, + "learning_rate": 0.00011578947368421053, + "loss": 0.915, + "step": 33 + }, + { + "epoch": 0.018133333333333335, + "grad_norm": 0.5895110692873703, + "learning_rate": 0.00011929824561403509, + "loss": 0.9507, + "step": 34 + }, + { + "epoch": 0.018666666666666668, + "grad_norm": 0.5528181448399232, + "learning_rate": 0.00012280701754385965, + "loss": 0.8627, + "step": 35 + }, + { + "epoch": 0.0192, + "grad_norm": 0.4809400678734925, + "learning_rate": 0.0001263157894736842, + "loss": 0.866, + "step": 36 + }, + { + "epoch": 0.019733333333333332, + "grad_norm": 0.6669136602328961, + "learning_rate": 0.0001298245614035088, + "loss": 0.8893, + "step": 37 + }, + { + "epoch": 0.020266666666666665, + "grad_norm": 0.45424161088883114, + "learning_rate": 0.00013333333333333334, + "loss": 0.8701, + "step": 38 + }, + { + "epoch": 0.0208, + "grad_norm": 0.9312064245199226, + "learning_rate": 0.0001368421052631579, + "loss": 0.903, + "step": 39 + }, + { + "epoch": 0.021333333333333333, + "grad_norm": 0.4876153422693338, + "learning_rate": 0.00014035087719298245, + "loss": 0.8838, + "step": 40 + }, + { + "epoch": 0.021866666666666666, + "grad_norm": 0.5678233493416778, + "learning_rate": 0.00014385964912280703, + "loss": 0.8759, + "step": 41 + }, + { + "epoch": 0.0224, + "grad_norm": 0.5465196824018712, + "learning_rate": 0.00014736842105263158, + "loss": 0.848, + "step": 42 + }, + { + "epoch": 0.022933333333333333, + "grad_norm": 0.5273025747227208, + "learning_rate": 0.00015087719298245616, + "loss": 0.9227, + "step": 43 + }, + { + "epoch": 0.023466666666666667, + "grad_norm": 0.5067360005902175, + "learning_rate": 0.0001543859649122807, + "loss": 0.8986, + "step": 44 + }, + { + "epoch": 0.024, + "grad_norm": 0.47579307220958644, + "learning_rate": 0.00015789473684210527, + "loss": 0.8301, + "step": 45 + }, + { + "epoch": 0.024533333333333334, + "grad_norm": 0.460989520919059, + "learning_rate": 0.00016140350877192982, + "loss": 0.821, + "step": 46 + }, + { + "epoch": 0.025066666666666668, + "grad_norm": 0.5989674101088642, + "learning_rate": 0.0001649122807017544, + "loss": 0.8319, + "step": 47 + }, + { + "epoch": 0.0256, + "grad_norm": 0.5960449532932831, + "learning_rate": 0.00016842105263157895, + "loss": 0.9335, + "step": 48 + }, + { + "epoch": 0.026133333333333335, + "grad_norm": 0.54580761575243, + "learning_rate": 0.00017192982456140353, + "loss": 0.8667, + "step": 49 + }, + { + "epoch": 0.02666666666666667, + "grad_norm": 0.5038509422201911, + "learning_rate": 0.00017543859649122806, + "loss": 0.8917, + "step": 50 + }, + { + "epoch": 0.0272, + "grad_norm": 0.5679216690440128, + "learning_rate": 0.00017894736842105264, + "loss": 0.8189, + "step": 51 + }, + { + "epoch": 0.027733333333333332, + "grad_norm": 0.5871413820404406, + "learning_rate": 0.0001824561403508772, + "loss": 0.8273, + "step": 52 + }, + { + "epoch": 0.028266666666666666, + "grad_norm": 0.4889911646751289, + "learning_rate": 0.00018596491228070177, + "loss": 0.8638, + "step": 53 + }, + { + "epoch": 0.0288, + "grad_norm": 0.5489075681198782, + "learning_rate": 0.00018947368421052632, + "loss": 0.8531, + "step": 54 + }, + { + "epoch": 0.029333333333333333, + "grad_norm": 0.45848667873814786, + "learning_rate": 0.00019298245614035088, + "loss": 0.8195, + "step": 55 + }, + { + "epoch": 0.029866666666666666, + "grad_norm": 0.4868086323108441, + "learning_rate": 0.00019649122807017543, + "loss": 0.8177, + "step": 56 + }, + { + "epoch": 0.0304, + "grad_norm": 0.5498866339393426, + "learning_rate": 0.0002, + "loss": 0.9293, + "step": 57 + }, + { + "epoch": 0.030933333333333334, + "grad_norm": 0.4392085273854709, + "learning_rate": 0.00019999985069241055, + "loss": 0.7674, + "step": 58 + }, + { + "epoch": 0.031466666666666664, + "grad_norm": 0.4883096165410377, + "learning_rate": 0.00019999940277008808, + "loss": 0.7862, + "step": 59 + }, + { + "epoch": 0.032, + "grad_norm": 0.564816874703866, + "learning_rate": 0.00019999865623437013, + "loss": 0.8073, + "step": 60 + }, + { + "epoch": 0.03253333333333333, + "grad_norm": 0.49496782164668834, + "learning_rate": 0.00019999761108748597, + "loss": 0.7516, + "step": 61 + }, + { + "epoch": 0.03306666666666667, + "grad_norm": 0.5796277324241305, + "learning_rate": 0.00019999626733255662, + "loss": 0.9386, + "step": 62 + }, + { + "epoch": 0.0336, + "grad_norm": 0.5380688214470547, + "learning_rate": 0.00019999462497359466, + "loss": 0.9645, + "step": 63 + }, + { + "epoch": 0.034133333333333335, + "grad_norm": 0.5143663475650041, + "learning_rate": 0.00019999268401550447, + "loss": 0.8644, + "step": 64 + }, + { + "epoch": 0.034666666666666665, + "grad_norm": 0.6383979690564672, + "learning_rate": 0.000199990444464082, + "loss": 0.8857, + "step": 65 + }, + { + "epoch": 0.0352, + "grad_norm": 0.42852494697972915, + "learning_rate": 0.00019998790632601496, + "loss": 0.7589, + "step": 66 + }, + { + "epoch": 0.03573333333333333, + "grad_norm": 0.4998944058773902, + "learning_rate": 0.00019998506960888256, + "loss": 0.8256, + "step": 67 + }, + { + "epoch": 0.03626666666666667, + "grad_norm": 0.5793129631947758, + "learning_rate": 0.00019998193432115572, + "loss": 0.8614, + "step": 68 + }, + { + "epoch": 0.0368, + "grad_norm": 0.713890759559541, + "learning_rate": 0.0001999785004721968, + "loss": 0.8407, + "step": 69 + }, + { + "epoch": 0.037333333333333336, + "grad_norm": 0.5369980492105534, + "learning_rate": 0.00019997476807225985, + "loss": 0.8322, + "step": 70 + }, + { + "epoch": 0.037866666666666667, + "grad_norm": 0.4529369284224218, + "learning_rate": 0.0001999707371324904, + "loss": 0.7193, + "step": 71 + }, + { + "epoch": 0.0384, + "grad_norm": 0.5690506518321555, + "learning_rate": 0.00019996640766492543, + "loss": 0.8708, + "step": 72 + }, + { + "epoch": 0.038933333333333334, + "grad_norm": 0.5521579446525727, + "learning_rate": 0.00019996177968249334, + "loss": 0.7786, + "step": 73 + }, + { + "epoch": 0.039466666666666664, + "grad_norm": 0.5336067285134041, + "learning_rate": 0.0001999568531990141, + "loss": 0.8664, + "step": 74 + }, + { + "epoch": 0.04, + "grad_norm": 0.5518599034723031, + "learning_rate": 0.00019995162822919883, + "loss": 0.9068, + "step": 75 + }, + { + "epoch": 0.04053333333333333, + "grad_norm": 0.6877374072176633, + "learning_rate": 0.00019994610478865011, + "loss": 0.9134, + "step": 76 + }, + { + "epoch": 0.04106666666666667, + "grad_norm": 0.4970737269568522, + "learning_rate": 0.0001999402828938618, + "loss": 0.8253, + "step": 77 + }, + { + "epoch": 0.0416, + "grad_norm": 0.5219763125913343, + "learning_rate": 0.00019993416256221895, + "loss": 0.8578, + "step": 78 + }, + { + "epoch": 0.042133333333333335, + "grad_norm": 0.4674974856082471, + "learning_rate": 0.00019992774381199778, + "loss": 0.8279, + "step": 79 + }, + { + "epoch": 0.042666666666666665, + "grad_norm": 0.45539256115663496, + "learning_rate": 0.00019992102666236566, + "loss": 0.7985, + "step": 80 + }, + { + "epoch": 0.0432, + "grad_norm": 0.4777055756746316, + "learning_rate": 0.00019991401113338104, + "loss": 0.8642, + "step": 81 + }, + { + "epoch": 0.04373333333333333, + "grad_norm": 0.5450178037063758, + "learning_rate": 0.00019990669724599336, + "loss": 0.8814, + "step": 82 + }, + { + "epoch": 0.04426666666666667, + "grad_norm": 0.4614070714159011, + "learning_rate": 0.00019989908502204292, + "loss": 0.8954, + "step": 83 + }, + { + "epoch": 0.0448, + "grad_norm": 0.5573297368971848, + "learning_rate": 0.00019989117448426108, + "loss": 0.8862, + "step": 84 + }, + { + "epoch": 0.04533333333333334, + "grad_norm": 0.5187820436747881, + "learning_rate": 0.00019988296565626987, + "loss": 0.8315, + "step": 85 + }, + { + "epoch": 0.04586666666666667, + "grad_norm": 0.5045760148611361, + "learning_rate": 0.00019987445856258206, + "loss": 0.7794, + "step": 86 + }, + { + "epoch": 0.0464, + "grad_norm": 0.5545621591400848, + "learning_rate": 0.00019986565322860115, + "loss": 0.9663, + "step": 87 + }, + { + "epoch": 0.046933333333333334, + "grad_norm": 0.5889943305281856, + "learning_rate": 0.00019985654968062122, + "loss": 0.7928, + "step": 88 + }, + { + "epoch": 0.047466666666666664, + "grad_norm": 0.4180158904494214, + "learning_rate": 0.00019984714794582683, + "loss": 0.7492, + "step": 89 + }, + { + "epoch": 0.048, + "grad_norm": 0.5208286483969923, + "learning_rate": 0.00019983744805229296, + "loss": 0.8654, + "step": 90 + }, + { + "epoch": 0.04853333333333333, + "grad_norm": 0.5379832421251245, + "learning_rate": 0.000199827450028985, + "loss": 0.9188, + "step": 91 + }, + { + "epoch": 0.04906666666666667, + "grad_norm": 0.4178084704556727, + "learning_rate": 0.00019981715390575858, + "loss": 0.7925, + "step": 92 + }, + { + "epoch": 0.0496, + "grad_norm": 0.6423083548955081, + "learning_rate": 0.00019980655971335945, + "loss": 0.8799, + "step": 93 + }, + { + "epoch": 0.050133333333333335, + "grad_norm": 0.4363063954138688, + "learning_rate": 0.00019979566748342347, + "loss": 0.7833, + "step": 94 + }, + { + "epoch": 0.050666666666666665, + "grad_norm": 0.4492942195522816, + "learning_rate": 0.00019978447724847652, + "loss": 0.7223, + "step": 95 + }, + { + "epoch": 0.0512, + "grad_norm": 0.40651863801450383, + "learning_rate": 0.00019977298904193437, + "loss": 0.7098, + "step": 96 + }, + { + "epoch": 0.05173333333333333, + "grad_norm": 0.5485179420244607, + "learning_rate": 0.00019976120289810247, + "loss": 0.8244, + "step": 97 + }, + { + "epoch": 0.05226666666666667, + "grad_norm": 0.5216996443685623, + "learning_rate": 0.00019974911885217608, + "loss": 0.8438, + "step": 98 + }, + { + "epoch": 0.0528, + "grad_norm": 0.4892007208218577, + "learning_rate": 0.00019973673694024, + "loss": 0.813, + "step": 99 + }, + { + "epoch": 0.05333333333333334, + "grad_norm": 0.5623388944452945, + "learning_rate": 0.0001997240571992685, + "loss": 0.7848, + "step": 100 + }, + { + "epoch": 0.05386666666666667, + "grad_norm": 0.4975467207735736, + "learning_rate": 0.00019971107966712518, + "loss": 0.8596, + "step": 101 + }, + { + "epoch": 0.0544, + "grad_norm": 0.5020233346000809, + "learning_rate": 0.00019969780438256293, + "loss": 0.7924, + "step": 102 + }, + { + "epoch": 0.054933333333333334, + "grad_norm": 0.509802842567309, + "learning_rate": 0.0001996842313852238, + "loss": 0.8483, + "step": 103 + }, + { + "epoch": 0.055466666666666664, + "grad_norm": 0.4752531543766072, + "learning_rate": 0.00019967036071563877, + "loss": 0.7753, + "step": 104 + }, + { + "epoch": 0.056, + "grad_norm": 0.5400673395064166, + "learning_rate": 0.0001996561924152278, + "loss": 0.8772, + "step": 105 + }, + { + "epoch": 0.05653333333333333, + "grad_norm": 0.5281212630791382, + "learning_rate": 0.0001996417265262996, + "loss": 0.8371, + "step": 106 + }, + { + "epoch": 0.05706666666666667, + "grad_norm": 0.45669538101918083, + "learning_rate": 0.00019962696309205148, + "loss": 0.8797, + "step": 107 + }, + { + "epoch": 0.0576, + "grad_norm": 0.5383261579870747, + "learning_rate": 0.0001996119021565693, + "loss": 0.8391, + "step": 108 + }, + { + "epoch": 0.058133333333333335, + "grad_norm": 0.41927024388142425, + "learning_rate": 0.0001995965437648273, + "loss": 0.7537, + "step": 109 + }, + { + "epoch": 0.058666666666666666, + "grad_norm": 0.4806245830012148, + "learning_rate": 0.00019958088796268793, + "loss": 0.8412, + "step": 110 + }, + { + "epoch": 0.0592, + "grad_norm": 0.47358504691457715, + "learning_rate": 0.0001995649347969019, + "loss": 0.8281, + "step": 111 + }, + { + "epoch": 0.05973333333333333, + "grad_norm": 0.5134737814762356, + "learning_rate": 0.00019954868431510764, + "loss": 0.8028, + "step": 112 + }, + { + "epoch": 0.06026666666666667, + "grad_norm": 0.6156393013431956, + "learning_rate": 0.00019953213656583168, + "loss": 0.9312, + "step": 113 + }, + { + "epoch": 0.0608, + "grad_norm": 0.546049450545441, + "learning_rate": 0.00019951529159848805, + "loss": 0.8717, + "step": 114 + }, + { + "epoch": 0.06133333333333333, + "grad_norm": 0.46144715715128237, + "learning_rate": 0.00019949814946337838, + "loss": 0.7497, + "step": 115 + }, + { + "epoch": 0.06186666666666667, + "grad_norm": 0.45456147951236814, + "learning_rate": 0.00019948071021169174, + "loss": 0.8142, + "step": 116 + }, + { + "epoch": 0.0624, + "grad_norm": 0.5334480256071559, + "learning_rate": 0.00019946297389550433, + "loss": 0.8136, + "step": 117 + }, + { + "epoch": 0.06293333333333333, + "grad_norm": 0.4785825495210441, + "learning_rate": 0.00019944494056777946, + "loss": 0.8399, + "step": 118 + }, + { + "epoch": 0.06346666666666667, + "grad_norm": 0.4429872267481487, + "learning_rate": 0.00019942661028236745, + "loss": 0.829, + "step": 119 + }, + { + "epoch": 0.064, + "grad_norm": 0.41957670746548625, + "learning_rate": 0.00019940798309400526, + "loss": 0.7656, + "step": 120 + }, + { + "epoch": 0.06453333333333333, + "grad_norm": 0.5226265400262154, + "learning_rate": 0.00019938905905831654, + "loss": 0.8057, + "step": 121 + }, + { + "epoch": 0.06506666666666666, + "grad_norm": 0.45197453618568495, + "learning_rate": 0.00019936983823181132, + "loss": 0.8162, + "step": 122 + }, + { + "epoch": 0.0656, + "grad_norm": 0.5370842279108822, + "learning_rate": 0.0001993503206718859, + "loss": 0.858, + "step": 123 + }, + { + "epoch": 0.06613333333333334, + "grad_norm": 0.5278598246578419, + "learning_rate": 0.00019933050643682269, + "loss": 0.8529, + "step": 124 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 0.4307941956210138, + "learning_rate": 0.00019931039558578997, + "loss": 0.7943, + "step": 125 + }, + { + "epoch": 0.0672, + "grad_norm": 0.9311392635671509, + "learning_rate": 0.00019928998817884182, + "loss": 0.8568, + "step": 126 + }, + { + "epoch": 0.06773333333333334, + "grad_norm": 0.47878729794796454, + "learning_rate": 0.00019926928427691786, + "loss": 0.8028, + "step": 127 + }, + { + "epoch": 0.06826666666666667, + "grad_norm": 0.5159824714235858, + "learning_rate": 0.00019924828394184306, + "loss": 0.7925, + "step": 128 + }, + { + "epoch": 0.0688, + "grad_norm": 0.5310097267625962, + "learning_rate": 0.00019922698723632767, + "loss": 0.7569, + "step": 129 + }, + { + "epoch": 0.06933333333333333, + "grad_norm": 0.5304017620742338, + "learning_rate": 0.0001992053942239668, + "loss": 0.9217, + "step": 130 + }, + { + "epoch": 0.06986666666666666, + "grad_norm": 0.477716276955551, + "learning_rate": 0.0001991835049692405, + "loss": 0.8125, + "step": 131 + }, + { + "epoch": 0.0704, + "grad_norm": 0.4725067764282243, + "learning_rate": 0.00019916131953751342, + "loss": 0.8805, + "step": 132 + }, + { + "epoch": 0.07093333333333333, + "grad_norm": 0.6881740377890267, + "learning_rate": 0.0001991388379950346, + "loss": 0.887, + "step": 133 + }, + { + "epoch": 0.07146666666666666, + "grad_norm": 0.4875043852244181, + "learning_rate": 0.0001991160604089374, + "loss": 0.782, + "step": 134 + }, + { + "epoch": 0.072, + "grad_norm": 0.5130414160416645, + "learning_rate": 0.00019909298684723904, + "loss": 0.7694, + "step": 135 + }, + { + "epoch": 0.07253333333333334, + "grad_norm": 0.5391783404356717, + "learning_rate": 0.00019906961737884077, + "loss": 0.8577, + "step": 136 + }, + { + "epoch": 0.07306666666666667, + "grad_norm": 0.5712372275179636, + "learning_rate": 0.00019904595207352737, + "loss": 0.7942, + "step": 137 + }, + { + "epoch": 0.0736, + "grad_norm": 0.45363887852434787, + "learning_rate": 0.00019902199100196697, + "loss": 0.839, + "step": 138 + }, + { + "epoch": 0.07413333333333333, + "grad_norm": 0.5545661226467888, + "learning_rate": 0.000198997734235711, + "loss": 0.9437, + "step": 139 + }, + { + "epoch": 0.07466666666666667, + "grad_norm": 0.595253959791159, + "learning_rate": 0.00019897318184719385, + "loss": 0.8857, + "step": 140 + }, + { + "epoch": 0.0752, + "grad_norm": 0.49420251696911627, + "learning_rate": 0.00019894833390973266, + "loss": 0.801, + "step": 141 + }, + { + "epoch": 0.07573333333333333, + "grad_norm": 0.5828222082646621, + "learning_rate": 0.0001989231904975272, + "loss": 0.8431, + "step": 142 + }, + { + "epoch": 0.07626666666666666, + "grad_norm": 0.456940614648609, + "learning_rate": 0.00019889775168565943, + "loss": 0.7611, + "step": 143 + }, + { + "epoch": 0.0768, + "grad_norm": 0.5262527757792694, + "learning_rate": 0.00019887201755009357, + "loss": 0.7483, + "step": 144 + }, + { + "epoch": 0.07733333333333334, + "grad_norm": 0.5489396698689593, + "learning_rate": 0.00019884598816767563, + "loss": 0.8635, + "step": 145 + }, + { + "epoch": 0.07786666666666667, + "grad_norm": 0.5121258488880777, + "learning_rate": 0.0001988196636161333, + "loss": 0.7931, + "step": 146 + }, + { + "epoch": 0.0784, + "grad_norm": 0.509625436737974, + "learning_rate": 0.0001987930439740757, + "loss": 0.8112, + "step": 147 + }, + { + "epoch": 0.07893333333333333, + "grad_norm": 0.579146513596682, + "learning_rate": 0.00019876612932099308, + "loss": 0.9082, + "step": 148 + }, + { + "epoch": 0.07946666666666667, + "grad_norm": 0.5797729980185812, + "learning_rate": 0.0001987389197372567, + "loss": 0.8965, + "step": 149 + }, + { + "epoch": 0.08, + "grad_norm": 0.45670114935471384, + "learning_rate": 0.00019871141530411853, + "loss": 0.8003, + "step": 150 + }, + { + "epoch": 0.08053333333333333, + "grad_norm": 0.4443985588243516, + "learning_rate": 0.00019868361610371097, + "loss": 0.7844, + "step": 151 + }, + { + "epoch": 0.08106666666666666, + "grad_norm": 0.5746325961849151, + "learning_rate": 0.00019865552221904665, + "loss": 0.9032, + "step": 152 + }, + { + "epoch": 0.0816, + "grad_norm": 0.4566250635842938, + "learning_rate": 0.0001986271337340182, + "loss": 0.7799, + "step": 153 + }, + { + "epoch": 0.08213333333333334, + "grad_norm": 0.4908818510619844, + "learning_rate": 0.00019859845073339787, + "loss": 0.7718, + "step": 154 + }, + { + "epoch": 0.08266666666666667, + "grad_norm": 0.416211607415965, + "learning_rate": 0.00019856947330283752, + "loss": 0.7958, + "step": 155 + }, + { + "epoch": 0.0832, + "grad_norm": 0.631855768937812, + "learning_rate": 0.00019854020152886814, + "loss": 0.9386, + "step": 156 + }, + { + "epoch": 0.08373333333333334, + "grad_norm": 0.4724998311690075, + "learning_rate": 0.0001985106354988997, + "loss": 0.814, + "step": 157 + }, + { + "epoch": 0.08426666666666667, + "grad_norm": 0.4803535727141574, + "learning_rate": 0.00019848077530122083, + "loss": 0.7836, + "step": 158 + }, + { + "epoch": 0.0848, + "grad_norm": 0.5313118301780214, + "learning_rate": 0.0001984506210249986, + "loss": 0.8319, + "step": 159 + }, + { + "epoch": 0.08533333333333333, + "grad_norm": 0.5191932761602162, + "learning_rate": 0.00019842017276027832, + "loss": 0.8023, + "step": 160 + }, + { + "epoch": 0.08586666666666666, + "grad_norm": 0.4806050779256414, + "learning_rate": 0.00019838943059798304, + "loss": 0.7554, + "step": 161 + }, + { + "epoch": 0.0864, + "grad_norm": 0.40209591541789275, + "learning_rate": 0.00019835839462991361, + "loss": 0.7309, + "step": 162 + }, + { + "epoch": 0.08693333333333333, + "grad_norm": 0.4604431057839949, + "learning_rate": 0.0001983270649487481, + "loss": 0.7839, + "step": 163 + }, + { + "epoch": 0.08746666666666666, + "grad_norm": 0.4769642986057626, + "learning_rate": 0.0001982954416480417, + "loss": 0.7833, + "step": 164 + }, + { + "epoch": 0.088, + "grad_norm": 0.422729784192873, + "learning_rate": 0.00019826352482222638, + "loss": 0.762, + "step": 165 + }, + { + "epoch": 0.08853333333333334, + "grad_norm": 0.44142648781893834, + "learning_rate": 0.00019823131456661063, + "loss": 0.7934, + "step": 166 + }, + { + "epoch": 0.08906666666666667, + "grad_norm": 0.5541096201508515, + "learning_rate": 0.00019819881097737915, + "loss": 0.8321, + "step": 167 + }, + { + "epoch": 0.0896, + "grad_norm": 0.5422485307509299, + "learning_rate": 0.00019816601415159263, + "loss": 0.8839, + "step": 168 + }, + { + "epoch": 0.09013333333333333, + "grad_norm": 0.5227081457902739, + "learning_rate": 0.00019813292418718732, + "loss": 0.8218, + "step": 169 + }, + { + "epoch": 0.09066666666666667, + "grad_norm": 0.534815575976312, + "learning_rate": 0.0001980995411829749, + "loss": 0.7774, + "step": 170 + }, + { + "epoch": 0.0912, + "grad_norm": 0.5403203503391552, + "learning_rate": 0.0001980658652386421, + "loss": 0.7982, + "step": 171 + }, + { + "epoch": 0.09173333333333333, + "grad_norm": 0.576693983122081, + "learning_rate": 0.0001980318964547504, + "loss": 0.9144, + "step": 172 + }, + { + "epoch": 0.09226666666666666, + "grad_norm": 0.4169461755825807, + "learning_rate": 0.0001979976349327357, + "loss": 0.8474, + "step": 173 + }, + { + "epoch": 0.0928, + "grad_norm": 0.4870863960739979, + "learning_rate": 0.00019796308077490817, + "loss": 0.8239, + "step": 174 + }, + { + "epoch": 0.09333333333333334, + "grad_norm": 0.5749497155337887, + "learning_rate": 0.00019792823408445174, + "loss": 0.8456, + "step": 175 + }, + { + "epoch": 0.09386666666666667, + "grad_norm": 0.5064223745333338, + "learning_rate": 0.0001978930949654239, + "loss": 0.9018, + "step": 176 + }, + { + "epoch": 0.0944, + "grad_norm": 0.5939486595032151, + "learning_rate": 0.00019785766352275542, + "loss": 0.9005, + "step": 177 + }, + { + "epoch": 0.09493333333333333, + "grad_norm": 0.49815470909038534, + "learning_rate": 0.00019782193986224995, + "loss": 0.8434, + "step": 178 + }, + { + "epoch": 0.09546666666666667, + "grad_norm": 0.481621086254783, + "learning_rate": 0.00019778592409058378, + "loss": 0.8234, + "step": 179 + }, + { + "epoch": 0.096, + "grad_norm": 0.505841358462432, + "learning_rate": 0.00019774961631530545, + "loss": 0.7911, + "step": 180 + }, + { + "epoch": 0.09653333333333333, + "grad_norm": 0.4751337016096947, + "learning_rate": 0.0001977130166448355, + "loss": 0.8224, + "step": 181 + }, + { + "epoch": 0.09706666666666666, + "grad_norm": 0.4888071344716639, + "learning_rate": 0.00019767612518846608, + "loss": 0.8283, + "step": 182 + }, + { + "epoch": 0.0976, + "grad_norm": 0.42852639924395564, + "learning_rate": 0.00019763894205636072, + "loss": 0.7659, + "step": 183 + }, + { + "epoch": 0.09813333333333334, + "grad_norm": 0.4894706337105787, + "learning_rate": 0.00019760146735955388, + "loss": 0.8146, + "step": 184 + }, + { + "epoch": 0.09866666666666667, + "grad_norm": 0.47995694376677095, + "learning_rate": 0.00019756370120995066, + "loss": 0.7677, + "step": 185 + }, + { + "epoch": 0.0992, + "grad_norm": 0.40956111264017714, + "learning_rate": 0.00019752564372032657, + "loss": 0.7849, + "step": 186 + }, + { + "epoch": 0.09973333333333333, + "grad_norm": 0.5005873859350766, + "learning_rate": 0.000197487295004327, + "loss": 0.8069, + "step": 187 + }, + { + "epoch": 0.10026666666666667, + "grad_norm": 0.44969092925862897, + "learning_rate": 0.00019744865517646706, + "loss": 0.8382, + "step": 188 + }, + { + "epoch": 0.1008, + "grad_norm": 0.4949683388611651, + "learning_rate": 0.00019740972435213115, + "loss": 0.8437, + "step": 189 + }, + { + "epoch": 0.10133333333333333, + "grad_norm": 0.558226216988814, + "learning_rate": 0.0001973705026475726, + "loss": 0.7963, + "step": 190 + }, + { + "epoch": 0.10186666666666666, + "grad_norm": 0.45001350872696166, + "learning_rate": 0.00019733099017991341, + "loss": 0.7551, + "step": 191 + }, + { + "epoch": 0.1024, + "grad_norm": 0.4928566894907739, + "learning_rate": 0.00019729118706714375, + "loss": 0.7632, + "step": 192 + }, + { + "epoch": 0.10293333333333334, + "grad_norm": 0.43447746260199227, + "learning_rate": 0.0001972510934281218, + "loss": 0.7922, + "step": 193 + }, + { + "epoch": 0.10346666666666667, + "grad_norm": 0.40950198847704977, + "learning_rate": 0.00019721070938257324, + "loss": 0.7839, + "step": 194 + }, + { + "epoch": 0.104, + "grad_norm": 0.451123103338457, + "learning_rate": 0.00019717003505109095, + "loss": 0.8374, + "step": 195 + }, + { + "epoch": 0.10453333333333334, + "grad_norm": 0.5638232141960325, + "learning_rate": 0.0001971290705551347, + "loss": 0.8671, + "step": 196 + }, + { + "epoch": 0.10506666666666667, + "grad_norm": 0.4926491803108822, + "learning_rate": 0.00019708781601703065, + "loss": 0.828, + "step": 197 + }, + { + "epoch": 0.1056, + "grad_norm": 0.417058057741698, + "learning_rate": 0.00019704627155997108, + "loss": 0.7885, + "step": 198 + }, + { + "epoch": 0.10613333333333333, + "grad_norm": 0.5193930068953387, + "learning_rate": 0.00019700443730801413, + "loss": 0.8408, + "step": 199 + }, + { + "epoch": 0.10666666666666667, + "grad_norm": 0.6467116680706188, + "learning_rate": 0.00019696231338608316, + "loss": 0.9473, + "step": 200 + }, + { + "epoch": 0.1072, + "grad_norm": 0.4807857740866099, + "learning_rate": 0.00019691989991996663, + "loss": 0.8567, + "step": 201 + }, + { + "epoch": 0.10773333333333333, + "grad_norm": 0.444645386230921, + "learning_rate": 0.00019687719703631755, + "loss": 0.8155, + "step": 202 + }, + { + "epoch": 0.10826666666666666, + "grad_norm": 0.605533417685553, + "learning_rate": 0.00019683420486265327, + "loss": 0.8157, + "step": 203 + }, + { + "epoch": 0.1088, + "grad_norm": 0.5625649255755447, + "learning_rate": 0.0001967909235273549, + "loss": 0.9247, + "step": 204 + }, + { + "epoch": 0.10933333333333334, + "grad_norm": 0.5199827137553125, + "learning_rate": 0.0001967473531596671, + "loss": 0.8048, + "step": 205 + }, + { + "epoch": 0.10986666666666667, + "grad_norm": 0.42294231496595297, + "learning_rate": 0.0001967034938896976, + "loss": 0.755, + "step": 206 + }, + { + "epoch": 0.1104, + "grad_norm": 0.5387133298218557, + "learning_rate": 0.00019665934584841682, + "loss": 0.8904, + "step": 207 + }, + { + "epoch": 0.11093333333333333, + "grad_norm": 0.4930695402459019, + "learning_rate": 0.0001966149091676575, + "loss": 0.8664, + "step": 208 + }, + { + "epoch": 0.11146666666666667, + "grad_norm": 0.4987578810937898, + "learning_rate": 0.00019657018398011434, + "loss": 0.7706, + "step": 209 + }, + { + "epoch": 0.112, + "grad_norm": 0.46922494214437366, + "learning_rate": 0.00019652517041934356, + "loss": 0.783, + "step": 210 + }, + { + "epoch": 0.11253333333333333, + "grad_norm": 0.5742260794727807, + "learning_rate": 0.00019647986861976246, + "loss": 0.8679, + "step": 211 + }, + { + "epoch": 0.11306666666666666, + "grad_norm": 0.42914800487842647, + "learning_rate": 0.0001964342787166491, + "loss": 0.8216, + "step": 212 + }, + { + "epoch": 0.1136, + "grad_norm": 0.42547217981059254, + "learning_rate": 0.00019638840084614182, + "loss": 0.7727, + "step": 213 + }, + { + "epoch": 0.11413333333333334, + "grad_norm": 0.5358386531443239, + "learning_rate": 0.0001963422351452389, + "loss": 0.787, + "step": 214 + }, + { + "epoch": 0.11466666666666667, + "grad_norm": 0.40809276943976613, + "learning_rate": 0.0001962957817517982, + "loss": 0.7589, + "step": 215 + }, + { + "epoch": 0.1152, + "grad_norm": 0.4424225122688686, + "learning_rate": 0.00019624904080453655, + "loss": 0.7482, + "step": 216 + }, + { + "epoch": 0.11573333333333333, + "grad_norm": 0.5158570076959549, + "learning_rate": 0.00019620201244302952, + "loss": 0.8352, + "step": 217 + }, + { + "epoch": 0.11626666666666667, + "grad_norm": 0.5310650627081238, + "learning_rate": 0.00019615469680771096, + "loss": 0.8178, + "step": 218 + }, + { + "epoch": 0.1168, + "grad_norm": 0.47431280993029923, + "learning_rate": 0.00019610709403987246, + "loss": 0.8207, + "step": 219 + }, + { + "epoch": 0.11733333333333333, + "grad_norm": 0.5369714632768122, + "learning_rate": 0.00019605920428166323, + "loss": 0.8084, + "step": 220 + }, + { + "epoch": 0.11786666666666666, + "grad_norm": 0.5593803465684531, + "learning_rate": 0.00019601102767608923, + "loss": 0.9041, + "step": 221 + }, + { + "epoch": 0.1184, + "grad_norm": 0.5610667976183104, + "learning_rate": 0.00019596256436701324, + "loss": 0.8802, + "step": 222 + }, + { + "epoch": 0.11893333333333334, + "grad_norm": 0.7061778966452937, + "learning_rate": 0.00019591381449915397, + "loss": 0.8686, + "step": 223 + }, + { + "epoch": 0.11946666666666667, + "grad_norm": 0.45701406729521243, + "learning_rate": 0.00019586477821808597, + "loss": 0.7871, + "step": 224 + }, + { + "epoch": 0.12, + "grad_norm": 0.5166685942247737, + "learning_rate": 0.000195815455670239, + "loss": 0.8284, + "step": 225 + }, + { + "epoch": 0.12053333333333334, + "grad_norm": 0.6590093338992272, + "learning_rate": 0.00019576584700289768, + "loss": 0.894, + "step": 226 + }, + { + "epoch": 0.12106666666666667, + "grad_norm": 0.4459829543325769, + "learning_rate": 0.00019571595236420102, + "loss": 0.8063, + "step": 227 + }, + { + "epoch": 0.1216, + "grad_norm": 0.41774887674303934, + "learning_rate": 0.00019566577190314197, + "loss": 0.7991, + "step": 228 + }, + { + "epoch": 0.12213333333333333, + "grad_norm": 0.5112565978792027, + "learning_rate": 0.00019561530576956703, + "loss": 0.7925, + "step": 229 + }, + { + "epoch": 0.12266666666666666, + "grad_norm": 0.7507651170717319, + "learning_rate": 0.00019556455411417573, + "loss": 0.7663, + "step": 230 + }, + { + "epoch": 0.1232, + "grad_norm": 0.5510335315201351, + "learning_rate": 0.0001955135170885202, + "loss": 0.8079, + "step": 231 + }, + { + "epoch": 0.12373333333333333, + "grad_norm": 0.43242819507526936, + "learning_rate": 0.00019546219484500475, + "loss": 0.7853, + "step": 232 + }, + { + "epoch": 0.12426666666666666, + "grad_norm": 0.4938371182719445, + "learning_rate": 0.00019541058753688538, + "loss": 0.8019, + "step": 233 + }, + { + "epoch": 0.1248, + "grad_norm": 0.4253874813934552, + "learning_rate": 0.00019535869531826937, + "loss": 0.7361, + "step": 234 + }, + { + "epoch": 0.12533333333333332, + "grad_norm": 0.43124181324287203, + "learning_rate": 0.00019530651834411474, + "loss": 0.7265, + "step": 235 + }, + { + "epoch": 0.12586666666666665, + "grad_norm": 0.5106344521185178, + "learning_rate": 0.00019525405677022989, + "loss": 0.7504, + "step": 236 + }, + { + "epoch": 0.1264, + "grad_norm": 0.5402561312252943, + "learning_rate": 0.00019520131075327298, + "loss": 0.8647, + "step": 237 + }, + { + "epoch": 0.12693333333333334, + "grad_norm": 0.4606947598104532, + "learning_rate": 0.0001951482804507517, + "loss": 0.772, + "step": 238 + }, + { + "epoch": 0.12746666666666667, + "grad_norm": 0.4397113395758952, + "learning_rate": 0.00019509496602102252, + "loss": 0.7404, + "step": 239 + }, + { + "epoch": 0.128, + "grad_norm": 0.573784095147914, + "learning_rate": 0.00019504136762329047, + "loss": 0.8604, + "step": 240 + }, + { + "epoch": 0.12853333333333333, + "grad_norm": 0.4922693165855001, + "learning_rate": 0.00019498748541760846, + "loss": 0.8037, + "step": 241 + }, + { + "epoch": 0.12906666666666666, + "grad_norm": 0.4415770650690034, + "learning_rate": 0.0001949333195648769, + "loss": 0.7467, + "step": 242 + }, + { + "epoch": 0.1296, + "grad_norm": 0.7202700055785279, + "learning_rate": 0.00019487887022684336, + "loss": 0.8997, + "step": 243 + }, + { + "epoch": 0.13013333333333332, + "grad_norm": 0.46620151369209545, + "learning_rate": 0.00019482413756610173, + "loss": 0.721, + "step": 244 + }, + { + "epoch": 0.13066666666666665, + "grad_norm": 0.48374808497981153, + "learning_rate": 0.0001947691217460921, + "loss": 0.7327, + "step": 245 + }, + { + "epoch": 0.1312, + "grad_norm": 0.46228533989157017, + "learning_rate": 0.00019471382293110003, + "loss": 0.813, + "step": 246 + }, + { + "epoch": 0.13173333333333334, + "grad_norm": 0.48389874544582195, + "learning_rate": 0.00019465824128625617, + "loss": 0.8091, + "step": 247 + }, + { + "epoch": 0.13226666666666667, + "grad_norm": 0.4495867885666229, + "learning_rate": 0.00019460237697753577, + "loss": 0.7903, + "step": 248 + }, + { + "epoch": 0.1328, + "grad_norm": 0.4943482100204703, + "learning_rate": 0.00019454623017175812, + "loss": 0.8486, + "step": 249 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 0.39190517051078805, + "learning_rate": 0.00019448980103658613, + "loss": 0.7646, + "step": 250 + }, + { + "epoch": 0.13386666666666666, + "grad_norm": 0.46164485820158224, + "learning_rate": 0.0001944330897405257, + "loss": 0.8112, + "step": 251 + }, + { + "epoch": 0.1344, + "grad_norm": 0.5599512420613735, + "learning_rate": 0.00019437609645292546, + "loss": 0.8938, + "step": 252 + }, + { + "epoch": 0.13493333333333332, + "grad_norm": 0.4994115747001175, + "learning_rate": 0.00019431882134397598, + "loss": 0.8471, + "step": 253 + }, + { + "epoch": 0.13546666666666668, + "grad_norm": 0.44477486433110613, + "learning_rate": 0.00019426126458470936, + "loss": 0.7916, + "step": 254 + }, + { + "epoch": 0.136, + "grad_norm": 0.4864415309029597, + "learning_rate": 0.0001942034263469989, + "loss": 0.8253, + "step": 255 + }, + { + "epoch": 0.13653333333333334, + "grad_norm": 0.5150608337350671, + "learning_rate": 0.00019414530680355837, + "loss": 0.8124, + "step": 256 + }, + { + "epoch": 0.13706666666666667, + "grad_norm": 0.4123466909003194, + "learning_rate": 0.00019408690612794148, + "loss": 0.7501, + "step": 257 + }, + { + "epoch": 0.1376, + "grad_norm": 0.4968072417994427, + "learning_rate": 0.00019402822449454153, + "loss": 0.7999, + "step": 258 + }, + { + "epoch": 0.13813333333333333, + "grad_norm": 0.5521519947033275, + "learning_rate": 0.00019396926207859084, + "loss": 0.883, + "step": 259 + }, + { + "epoch": 0.13866666666666666, + "grad_norm": 0.49723016490414984, + "learning_rate": 0.0001939100190561601, + "loss": 0.8368, + "step": 260 + }, + { + "epoch": 0.1392, + "grad_norm": 0.5706367109724512, + "learning_rate": 0.00019385049560415794, + "loss": 0.8496, + "step": 261 + }, + { + "epoch": 0.13973333333333332, + "grad_norm": 0.4222133193599781, + "learning_rate": 0.0001937906919003304, + "loss": 0.7991, + "step": 262 + }, + { + "epoch": 0.14026666666666668, + "grad_norm": 0.5061623424371642, + "learning_rate": 0.00019373060812326052, + "loss": 0.7889, + "step": 263 + }, + { + "epoch": 0.1408, + "grad_norm": 0.5372415116588385, + "learning_rate": 0.00019367024445236754, + "loss": 0.7752, + "step": 264 + }, + { + "epoch": 0.14133333333333334, + "grad_norm": 0.4303635042507931, + "learning_rate": 0.00019360960106790643, + "loss": 0.7048, + "step": 265 + }, + { + "epoch": 0.14186666666666667, + "grad_norm": 0.46242534095827137, + "learning_rate": 0.0001935486781509677, + "loss": 0.8492, + "step": 266 + }, + { + "epoch": 0.1424, + "grad_norm": 0.470057726299895, + "learning_rate": 0.00019348747588347637, + "loss": 0.7727, + "step": 267 + }, + { + "epoch": 0.14293333333333333, + "grad_norm": 0.6738560257297568, + "learning_rate": 0.00019342599444819168, + "loss": 0.8035, + "step": 268 + }, + { + "epoch": 0.14346666666666666, + "grad_norm": 0.45484972653733813, + "learning_rate": 0.00019336423402870653, + "loss": 0.7347, + "step": 269 + }, + { + "epoch": 0.144, + "grad_norm": 0.4165893434189079, + "learning_rate": 0.00019330219480944694, + "loss": 0.7177, + "step": 270 + }, + { + "epoch": 0.14453333333333335, + "grad_norm": 0.47973820077673446, + "learning_rate": 0.0001932398769756714, + "loss": 0.8863, + "step": 271 + }, + { + "epoch": 0.14506666666666668, + "grad_norm": 0.5063715481565947, + "learning_rate": 0.0001931772807134704, + "loss": 0.7737, + "step": 272 + }, + { + "epoch": 0.1456, + "grad_norm": 0.48835833856488436, + "learning_rate": 0.00019311440620976597, + "loss": 0.8563, + "step": 273 + }, + { + "epoch": 0.14613333333333334, + "grad_norm": 0.4395440943990699, + "learning_rate": 0.00019305125365231084, + "loss": 0.7804, + "step": 274 + }, + { + "epoch": 0.14666666666666667, + "grad_norm": 0.4979492435398362, + "learning_rate": 0.00019298782322968815, + "loss": 0.7873, + "step": 275 + }, + { + "epoch": 0.1472, + "grad_norm": 0.41945522453959166, + "learning_rate": 0.0001929241151313108, + "loss": 0.7641, + "step": 276 + }, + { + "epoch": 0.14773333333333333, + "grad_norm": 0.5756452943189352, + "learning_rate": 0.0001928601295474208, + "loss": 0.8103, + "step": 277 + }, + { + "epoch": 0.14826666666666666, + "grad_norm": 0.5278354578833819, + "learning_rate": 0.00019279586666908884, + "loss": 0.7678, + "step": 278 + }, + { + "epoch": 0.1488, + "grad_norm": 0.5187228675688395, + "learning_rate": 0.00019273132668821364, + "loss": 0.7779, + "step": 279 + }, + { + "epoch": 0.14933333333333335, + "grad_norm": 0.5084344970815391, + "learning_rate": 0.00019266650979752136, + "loss": 0.8791, + "step": 280 + }, + { + "epoch": 0.14986666666666668, + "grad_norm": 0.43249457702628513, + "learning_rate": 0.00019260141619056507, + "loss": 0.724, + "step": 281 + }, + { + "epoch": 0.1504, + "grad_norm": 0.8073685700016736, + "learning_rate": 0.00019253604606172417, + "loss": 0.9622, + "step": 282 + }, + { + "epoch": 0.15093333333333334, + "grad_norm": 0.4914654636072854, + "learning_rate": 0.0001924703996062038, + "loss": 0.8337, + "step": 283 + }, + { + "epoch": 0.15146666666666667, + "grad_norm": 0.48357646647529584, + "learning_rate": 0.0001924044770200342, + "loss": 0.7225, + "step": 284 + }, + { + "epoch": 0.152, + "grad_norm": 0.4310719779438626, + "learning_rate": 0.00019233827850007027, + "loss": 0.8143, + "step": 285 + }, + { + "epoch": 0.15253333333333333, + "grad_norm": 0.393733200337738, + "learning_rate": 0.0001922718042439908, + "loss": 0.7469, + "step": 286 + }, + { + "epoch": 0.15306666666666666, + "grad_norm": 0.46631053413694085, + "learning_rate": 0.000192205054450298, + "loss": 0.8536, + "step": 287 + }, + { + "epoch": 0.1536, + "grad_norm": 0.5267547426065811, + "learning_rate": 0.00019213802931831696, + "loss": 0.7167, + "step": 288 + }, + { + "epoch": 0.15413333333333334, + "grad_norm": 0.5298582908589178, + "learning_rate": 0.00019207072904819486, + "loss": 0.8328, + "step": 289 + }, + { + "epoch": 0.15466666666666667, + "grad_norm": 0.4962593380924952, + "learning_rate": 0.00019200315384090044, + "loss": 0.8272, + "step": 290 + }, + { + "epoch": 0.1552, + "grad_norm": 0.47633365700766406, + "learning_rate": 0.00019193530389822363, + "loss": 0.7841, + "step": 291 + }, + { + "epoch": 0.15573333333333333, + "grad_norm": 0.6184731909045871, + "learning_rate": 0.00019186717942277462, + "loss": 0.8716, + "step": 292 + }, + { + "epoch": 0.15626666666666666, + "grad_norm": 0.5423682614680138, + "learning_rate": 0.00019179878061798347, + "loss": 0.8162, + "step": 293 + }, + { + "epoch": 0.1568, + "grad_norm": 0.4753038045880565, + "learning_rate": 0.00019173010768809933, + "loss": 0.7919, + "step": 294 + }, + { + "epoch": 0.15733333333333333, + "grad_norm": 0.43126625082440967, + "learning_rate": 0.00019166116083819002, + "loss": 0.7528, + "step": 295 + }, + { + "epoch": 0.15786666666666666, + "grad_norm": 0.5098174762604958, + "learning_rate": 0.00019159194027414128, + "loss": 0.7522, + "step": 296 + }, + { + "epoch": 0.1584, + "grad_norm": 0.4521083102475084, + "learning_rate": 0.0001915224462026563, + "loss": 0.7991, + "step": 297 + }, + { + "epoch": 0.15893333333333334, + "grad_norm": 0.4491062878449407, + "learning_rate": 0.00019145267883125482, + "loss": 0.7777, + "step": 298 + }, + { + "epoch": 0.15946666666666667, + "grad_norm": 0.437593424681035, + "learning_rate": 0.00019138263836827288, + "loss": 0.763, + "step": 299 + }, + { + "epoch": 0.16, + "grad_norm": 0.5399001237902789, + "learning_rate": 0.00019131232502286188, + "loss": 0.7841, + "step": 300 + }, + { + "epoch": 0.16053333333333333, + "grad_norm": 0.4954818390198195, + "learning_rate": 0.00019124173900498818, + "loss": 0.7606, + "step": 301 + }, + { + "epoch": 0.16106666666666666, + "grad_norm": 0.4880227953266813, + "learning_rate": 0.00019117088052543233, + "loss": 0.8003, + "step": 302 + }, + { + "epoch": 0.1616, + "grad_norm": 0.4714495974948282, + "learning_rate": 0.0001910997497957885, + "loss": 0.8491, + "step": 303 + }, + { + "epoch": 0.16213333333333332, + "grad_norm": 0.4139041793775968, + "learning_rate": 0.00019102834702846387, + "loss": 0.7971, + "step": 304 + }, + { + "epoch": 0.16266666666666665, + "grad_norm": 0.4376309534083468, + "learning_rate": 0.0001909566724366779, + "loss": 0.7318, + "step": 305 + }, + { + "epoch": 0.1632, + "grad_norm": 0.4970619286626978, + "learning_rate": 0.00019088472623446183, + "loss": 0.8498, + "step": 306 + }, + { + "epoch": 0.16373333333333334, + "grad_norm": 0.4405073792264246, + "learning_rate": 0.00019081250863665794, + "loss": 0.8222, + "step": 307 + }, + { + "epoch": 0.16426666666666667, + "grad_norm": 0.4159170308634145, + "learning_rate": 0.0001907400198589189, + "loss": 0.7393, + "step": 308 + }, + { + "epoch": 0.1648, + "grad_norm": 0.4918985235769081, + "learning_rate": 0.00019066726011770726, + "loss": 0.8494, + "step": 309 + }, + { + "epoch": 0.16533333333333333, + "grad_norm": 0.43915509097101807, + "learning_rate": 0.00019059422963029464, + "loss": 0.7508, + "step": 310 + }, + { + "epoch": 0.16586666666666666, + "grad_norm": 0.4262514693238682, + "learning_rate": 0.0001905209286147611, + "loss": 0.7063, + "step": 311 + }, + { + "epoch": 0.1664, + "grad_norm": 0.44518080468138016, + "learning_rate": 0.0001904473572899947, + "loss": 0.7023, + "step": 312 + }, + { + "epoch": 0.16693333333333332, + "grad_norm": 0.44798453783013126, + "learning_rate": 0.0001903735158756905, + "loss": 0.8001, + "step": 313 + }, + { + "epoch": 0.16746666666666668, + "grad_norm": 0.5221309013695491, + "learning_rate": 0.0001902994045923502, + "loss": 0.763, + "step": 314 + }, + { + "epoch": 0.168, + "grad_norm": 0.5015011598188029, + "learning_rate": 0.00019022502366128135, + "loss": 0.7695, + "step": 315 + }, + { + "epoch": 0.16853333333333334, + "grad_norm": 0.4717158377294084, + "learning_rate": 0.0001901503733045967, + "loss": 0.7668, + "step": 316 + }, + { + "epoch": 0.16906666666666667, + "grad_norm": 0.519902869646817, + "learning_rate": 0.00019007545374521355, + "loss": 0.8886, + "step": 317 + }, + { + "epoch": 0.1696, + "grad_norm": 0.40475351672985693, + "learning_rate": 0.00019000026520685302, + "loss": 0.7456, + "step": 318 + }, + { + "epoch": 0.17013333333333333, + "grad_norm": 0.47398524364066114, + "learning_rate": 0.00018992480791403958, + "loss": 0.7906, + "step": 319 + }, + { + "epoch": 0.17066666666666666, + "grad_norm": 0.5131552000446418, + "learning_rate": 0.0001898490820921001, + "loss": 0.7802, + "step": 320 + }, + { + "epoch": 0.1712, + "grad_norm": 0.4865027391662875, + "learning_rate": 0.0001897730879671634, + "loss": 0.7945, + "step": 321 + }, + { + "epoch": 0.17173333333333332, + "grad_norm": 0.4072220371863116, + "learning_rate": 0.0001896968257661595, + "loss": 0.7384, + "step": 322 + }, + { + "epoch": 0.17226666666666668, + "grad_norm": 0.5496872257803109, + "learning_rate": 0.00018962029571681886, + "loss": 0.8269, + "step": 323 + }, + { + "epoch": 0.1728, + "grad_norm": 0.5145981423411348, + "learning_rate": 0.00018954349804767184, + "loss": 0.6772, + "step": 324 + }, + { + "epoch": 0.17333333333333334, + "grad_norm": 0.4821094201599681, + "learning_rate": 0.00018946643298804793, + "loss": 0.7669, + "step": 325 + }, + { + "epoch": 0.17386666666666667, + "grad_norm": 0.4761904012202571, + "learning_rate": 0.00018938910076807513, + "loss": 0.756, + "step": 326 + }, + { + "epoch": 0.1744, + "grad_norm": 0.5751089208138552, + "learning_rate": 0.00018931150161867916, + "loss": 0.8186, + "step": 327 + }, + { + "epoch": 0.17493333333333333, + "grad_norm": 0.5357565464675319, + "learning_rate": 0.0001892336357715829, + "loss": 0.7737, + "step": 328 + }, + { + "epoch": 0.17546666666666666, + "grad_norm": 0.44306512734774434, + "learning_rate": 0.0001891555034593055, + "loss": 0.79, + "step": 329 + }, + { + "epoch": 0.176, + "grad_norm": 0.4441691047508839, + "learning_rate": 0.00018907710491516199, + "loss": 0.812, + "step": 330 + }, + { + "epoch": 0.17653333333333332, + "grad_norm": 0.41442330109161496, + "learning_rate": 0.00018899844037326225, + "loss": 0.7088, + "step": 331 + }, + { + "epoch": 0.17706666666666668, + "grad_norm": 0.45195857961777847, + "learning_rate": 0.0001889195100685106, + "loss": 0.7549, + "step": 332 + }, + { + "epoch": 0.1776, + "grad_norm": 0.4611292039707245, + "learning_rate": 0.0001888403142366049, + "loss": 0.7933, + "step": 333 + }, + { + "epoch": 0.17813333333333334, + "grad_norm": 0.47194603710974975, + "learning_rate": 0.00018876085311403593, + "loss": 0.7678, + "step": 334 + }, + { + "epoch": 0.17866666666666667, + "grad_norm": 0.4181099450281632, + "learning_rate": 0.00018868112693808665, + "loss": 0.8026, + "step": 335 + }, + { + "epoch": 0.1792, + "grad_norm": 0.4039528236321422, + "learning_rate": 0.00018860113594683148, + "loss": 0.7123, + "step": 336 + }, + { + "epoch": 0.17973333333333333, + "grad_norm": 0.43416047403619185, + "learning_rate": 0.00018852088037913577, + "loss": 0.7105, + "step": 337 + }, + { + "epoch": 0.18026666666666666, + "grad_norm": 0.4122624485330412, + "learning_rate": 0.0001884403604746547, + "loss": 0.726, + "step": 338 + }, + { + "epoch": 0.1808, + "grad_norm": 0.5140401902686076, + "learning_rate": 0.00018835957647383303, + "loss": 0.8084, + "step": 339 + }, + { + "epoch": 0.18133333333333335, + "grad_norm": 0.4081266792459306, + "learning_rate": 0.00018827852861790398, + "loss": 0.7446, + "step": 340 + }, + { + "epoch": 0.18186666666666668, + "grad_norm": 0.4610111592380031, + "learning_rate": 0.00018819721714888877, + "loss": 0.7713, + "step": 341 + }, + { + "epoch": 0.1824, + "grad_norm": 0.5000624203011732, + "learning_rate": 0.00018811564230959588, + "loss": 0.7945, + "step": 342 + }, + { + "epoch": 0.18293333333333334, + "grad_norm": 0.5525391954986797, + "learning_rate": 0.00018803380434362, + "loss": 0.7058, + "step": 343 + }, + { + "epoch": 0.18346666666666667, + "grad_norm": 0.4935700600524609, + "learning_rate": 0.0001879517034953418, + "loss": 0.8355, + "step": 344 + }, + { + "epoch": 0.184, + "grad_norm": 0.5179993188442348, + "learning_rate": 0.00018786934000992688, + "loss": 0.7575, + "step": 345 + }, + { + "epoch": 0.18453333333333333, + "grad_norm": 0.45452978446209813, + "learning_rate": 0.00018778671413332513, + "loss": 0.6695, + "step": 346 + }, + { + "epoch": 0.18506666666666666, + "grad_norm": 0.4721788518305318, + "learning_rate": 0.00018770382611226987, + "loss": 0.762, + "step": 347 + }, + { + "epoch": 0.1856, + "grad_norm": 0.4409461330278918, + "learning_rate": 0.00018762067619427746, + "loss": 0.7311, + "step": 348 + }, + { + "epoch": 0.18613333333333335, + "grad_norm": 0.42867380804011745, + "learning_rate": 0.000187537264627646, + "loss": 0.7423, + "step": 349 + }, + { + "epoch": 0.18666666666666668, + "grad_norm": 0.4642311275096459, + "learning_rate": 0.00018745359166145523, + "loss": 0.7603, + "step": 350 + }, + { + "epoch": 0.1872, + "grad_norm": 0.5211127221392075, + "learning_rate": 0.00018736965754556528, + "loss": 0.7624, + "step": 351 + }, + { + "epoch": 0.18773333333333334, + "grad_norm": 0.50544600584653, + "learning_rate": 0.00018728546253061614, + "loss": 0.7266, + "step": 352 + }, + { + "epoch": 0.18826666666666667, + "grad_norm": 0.48471960478633924, + "learning_rate": 0.00018720100686802694, + "loss": 0.8137, + "step": 353 + }, + { + "epoch": 0.1888, + "grad_norm": 0.5004536148113095, + "learning_rate": 0.00018711629080999504, + "loss": 0.8497, + "step": 354 + }, + { + "epoch": 0.18933333333333333, + "grad_norm": 0.4341950230096382, + "learning_rate": 0.00018703131460949554, + "loss": 0.7625, + "step": 355 + }, + { + "epoch": 0.18986666666666666, + "grad_norm": 0.4163684696163548, + "learning_rate": 0.0001869460785202802, + "loss": 0.7397, + "step": 356 + }, + { + "epoch": 0.1904, + "grad_norm": 0.4621245017314665, + "learning_rate": 0.00018686058279687698, + "loss": 0.7982, + "step": 357 + }, + { + "epoch": 0.19093333333333334, + "grad_norm": 0.4986620999162991, + "learning_rate": 0.00018677482769458904, + "loss": 0.8684, + "step": 358 + }, + { + "epoch": 0.19146666666666667, + "grad_norm": 0.6101277639283995, + "learning_rate": 0.00018668881346949417, + "loss": 0.855, + "step": 359 + }, + { + "epoch": 0.192, + "grad_norm": 0.5920423165090527, + "learning_rate": 0.00018660254037844388, + "loss": 0.9037, + "step": 360 + }, + { + "epoch": 0.19253333333333333, + "grad_norm": 0.43327268782626305, + "learning_rate": 0.00018651600867906272, + "loss": 0.7659, + "step": 361 + }, + { + "epoch": 0.19306666666666666, + "grad_norm": 0.45571022796915445, + "learning_rate": 0.00018642921862974742, + "loss": 0.818, + "step": 362 + }, + { + "epoch": 0.1936, + "grad_norm": 0.5126591759054586, + "learning_rate": 0.00018634217048966637, + "loss": 0.8502, + "step": 363 + }, + { + "epoch": 0.19413333333333332, + "grad_norm": 0.5093667930965703, + "learning_rate": 0.00018625486451875843, + "loss": 0.7239, + "step": 364 + }, + { + "epoch": 0.19466666666666665, + "grad_norm": 0.4468011573800045, + "learning_rate": 0.0001861673009777325, + "loss": 0.803, + "step": 365 + }, + { + "epoch": 0.1952, + "grad_norm": 0.39014778684031004, + "learning_rate": 0.0001860794801280666, + "loss": 0.7288, + "step": 366 + }, + { + "epoch": 0.19573333333333334, + "grad_norm": 0.5208611631026197, + "learning_rate": 0.00018599140223200716, + "loss": 0.8064, + "step": 367 + }, + { + "epoch": 0.19626666666666667, + "grad_norm": 0.5658619006275716, + "learning_rate": 0.0001859030675525681, + "loss": 0.79, + "step": 368 + }, + { + "epoch": 0.1968, + "grad_norm": 0.5306869958440832, + "learning_rate": 0.0001858144763535302, + "loss": 0.8517, + "step": 369 + }, + { + "epoch": 0.19733333333333333, + "grad_norm": 0.41596871920773343, + "learning_rate": 0.0001857256288994402, + "loss": 0.7658, + "step": 370 + }, + { + "epoch": 0.19786666666666666, + "grad_norm": 0.4769088573609008, + "learning_rate": 0.00018563652545561013, + "loss": 0.8171, + "step": 371 + }, + { + "epoch": 0.1984, + "grad_norm": 0.39326204339613774, + "learning_rate": 0.0001855471662881164, + "loss": 0.6806, + "step": 372 + }, + { + "epoch": 0.19893333333333332, + "grad_norm": 0.4723424029911581, + "learning_rate": 0.000185457551663799, + "loss": 0.7456, + "step": 373 + }, + { + "epoch": 0.19946666666666665, + "grad_norm": 0.42068601070158923, + "learning_rate": 0.00018536768185026083, + "loss": 0.7408, + "step": 374 + }, + { + "epoch": 0.2, + "grad_norm": 0.47245125655841735, + "learning_rate": 0.00018527755711586678, + "loss": 0.7825, + "step": 375 + }, + { + "epoch": 0.20053333333333334, + "grad_norm": 0.45630395670920204, + "learning_rate": 0.00018518717772974302, + "loss": 0.7221, + "step": 376 + }, + { + "epoch": 0.20106666666666667, + "grad_norm": 0.45953926555663455, + "learning_rate": 0.00018509654396177609, + "loss": 0.6982, + "step": 377 + }, + { + "epoch": 0.2016, + "grad_norm": 0.4068974961988575, + "learning_rate": 0.00018500565608261214, + "loss": 0.7189, + "step": 378 + }, + { + "epoch": 0.20213333333333333, + "grad_norm": 0.5866832999064415, + "learning_rate": 0.00018491451436365627, + "loss": 0.904, + "step": 379 + }, + { + "epoch": 0.20266666666666666, + "grad_norm": 0.4332083949076997, + "learning_rate": 0.0001848231190770714, + "loss": 0.6994, + "step": 380 + }, + { + "epoch": 0.2032, + "grad_norm": 0.5086504725226426, + "learning_rate": 0.00018473147049577774, + "loss": 0.8404, + "step": 381 + }, + { + "epoch": 0.20373333333333332, + "grad_norm": 0.54160624793637, + "learning_rate": 0.00018463956889345194, + "loss": 0.7675, + "step": 382 + }, + { + "epoch": 0.20426666666666668, + "grad_norm": 0.4638294028941344, + "learning_rate": 0.00018454741454452603, + "loss": 0.8093, + "step": 383 + }, + { + "epoch": 0.2048, + "grad_norm": 0.4418799635284932, + "learning_rate": 0.00018445500772418697, + "loss": 0.7754, + "step": 384 + }, + { + "epoch": 0.20533333333333334, + "grad_norm": 0.4387775361332178, + "learning_rate": 0.00018436234870837547, + "loss": 0.7714, + "step": 385 + }, + { + "epoch": 0.20586666666666667, + "grad_norm": 0.47276823056876205, + "learning_rate": 0.00018426943777378552, + "loss": 0.6988, + "step": 386 + }, + { + "epoch": 0.2064, + "grad_norm": 0.4347940653732354, + "learning_rate": 0.00018417627519786315, + "loss": 0.7045, + "step": 387 + }, + { + "epoch": 0.20693333333333333, + "grad_norm": 0.4954054214191427, + "learning_rate": 0.00018408286125880604, + "loss": 0.7661, + "step": 388 + }, + { + "epoch": 0.20746666666666666, + "grad_norm": 0.4070067571016391, + "learning_rate": 0.00018398919623556238, + "loss": 0.7161, + "step": 389 + }, + { + "epoch": 0.208, + "grad_norm": 0.4663594433106534, + "learning_rate": 0.00018389528040783012, + "loss": 0.7952, + "step": 390 + }, + { + "epoch": 0.20853333333333332, + "grad_norm": 0.46838253955764675, + "learning_rate": 0.0001838011140560562, + "loss": 0.8083, + "step": 391 + }, + { + "epoch": 0.20906666666666668, + "grad_norm": 0.5202814517890559, + "learning_rate": 0.00018370669746143564, + "loss": 0.7643, + "step": 392 + }, + { + "epoch": 0.2096, + "grad_norm": 0.5123534446161135, + "learning_rate": 0.00018361203090591071, + "loss": 0.8352, + "step": 393 + }, + { + "epoch": 0.21013333333333334, + "grad_norm": 0.49453488048027294, + "learning_rate": 0.0001835171146721701, + "loss": 0.8083, + "step": 394 + }, + { + "epoch": 0.21066666666666667, + "grad_norm": 0.4944787757352668, + "learning_rate": 0.00018342194904364813, + "loss": 0.8, + "step": 395 + }, + { + "epoch": 0.2112, + "grad_norm": 0.6122735332559011, + "learning_rate": 0.00018332653430452376, + "loss": 0.7948, + "step": 396 + }, + { + "epoch": 0.21173333333333333, + "grad_norm": 0.6068704476523478, + "learning_rate": 0.00018323087073971993, + "loss": 0.8312, + "step": 397 + }, + { + "epoch": 0.21226666666666666, + "grad_norm": 0.56855732304193, + "learning_rate": 0.00018313495863490258, + "loss": 0.7921, + "step": 398 + }, + { + "epoch": 0.2128, + "grad_norm": 0.43206934589137125, + "learning_rate": 0.00018303879827647975, + "loss": 0.758, + "step": 399 + }, + { + "epoch": 0.21333333333333335, + "grad_norm": 0.4280314397137237, + "learning_rate": 0.00018294238995160094, + "loss": 0.7688, + "step": 400 + }, + { + "epoch": 0.21386666666666668, + "grad_norm": 0.44525241902820006, + "learning_rate": 0.00018284573394815597, + "loss": 0.7484, + "step": 401 + }, + { + "epoch": 0.2144, + "grad_norm": 0.4672559340463771, + "learning_rate": 0.00018274883055477436, + "loss": 0.7284, + "step": 402 + }, + { + "epoch": 0.21493333333333334, + "grad_norm": 0.47010851180359864, + "learning_rate": 0.00018265168006082437, + "loss": 0.7738, + "step": 403 + }, + { + "epoch": 0.21546666666666667, + "grad_norm": 0.4347907834237806, + "learning_rate": 0.00018255428275641214, + "loss": 0.7609, + "step": 404 + }, + { + "epoch": 0.216, + "grad_norm": 0.5185870081958338, + "learning_rate": 0.00018245663893238075, + "loss": 0.7889, + "step": 405 + }, + { + "epoch": 0.21653333333333333, + "grad_norm": 0.38201472047316243, + "learning_rate": 0.0001823587488803095, + "loss": 0.6871, + "step": 406 + }, + { + "epoch": 0.21706666666666666, + "grad_norm": 0.5105212067074328, + "learning_rate": 0.00018226061289251298, + "loss": 0.7924, + "step": 407 + }, + { + "epoch": 0.2176, + "grad_norm": 0.4566184882131342, + "learning_rate": 0.00018216223126204007, + "loss": 0.7331, + "step": 408 + }, + { + "epoch": 0.21813333333333335, + "grad_norm": 0.419991866669449, + "learning_rate": 0.00018206360428267332, + "loss": 0.6958, + "step": 409 + }, + { + "epoch": 0.21866666666666668, + "grad_norm": 0.42825530377397397, + "learning_rate": 0.00018196473224892784, + "loss": 0.7512, + "step": 410 + }, + { + "epoch": 0.2192, + "grad_norm": 0.39509894216822994, + "learning_rate": 0.00018186561545605054, + "loss": 0.6676, + "step": 411 + }, + { + "epoch": 0.21973333333333334, + "grad_norm": 0.426398643286428, + "learning_rate": 0.0001817662542000192, + "loss": 0.7766, + "step": 412 + }, + { + "epoch": 0.22026666666666667, + "grad_norm": 0.437866021319674, + "learning_rate": 0.0001816666487775416, + "loss": 0.7289, + "step": 413 + }, + { + "epoch": 0.2208, + "grad_norm": 0.4768363463629006, + "learning_rate": 0.00018156679948605467, + "loss": 0.7541, + "step": 414 + }, + { + "epoch": 0.22133333333333333, + "grad_norm": 0.43166674310102465, + "learning_rate": 0.00018146670662372354, + "loss": 0.7144, + "step": 415 + }, + { + "epoch": 0.22186666666666666, + "grad_norm": 0.5097188904050277, + "learning_rate": 0.0001813663704894407, + "loss": 0.7777, + "step": 416 + }, + { + "epoch": 0.2224, + "grad_norm": 0.527118178931027, + "learning_rate": 0.00018126579138282503, + "loss": 0.8253, + "step": 417 + }, + { + "epoch": 0.22293333333333334, + "grad_norm": 0.46308375087316267, + "learning_rate": 0.00018116496960422107, + "loss": 0.6781, + "step": 418 + }, + { + "epoch": 0.22346666666666667, + "grad_norm": 0.4265418606802498, + "learning_rate": 0.00018106390545469795, + "loss": 0.7129, + "step": 419 + }, + { + "epoch": 0.224, + "grad_norm": 0.5470685105632853, + "learning_rate": 0.0001809625992360485, + "loss": 0.754, + "step": 420 + }, + { + "epoch": 0.22453333333333333, + "grad_norm": 0.5449023406053731, + "learning_rate": 0.00018086105125078857, + "loss": 0.7436, + "step": 421 + }, + { + "epoch": 0.22506666666666666, + "grad_norm": 0.5002866378720431, + "learning_rate": 0.00018075926180215576, + "loss": 0.809, + "step": 422 + }, + { + "epoch": 0.2256, + "grad_norm": 0.436075859668813, + "learning_rate": 0.00018065723119410884, + "loss": 0.7931, + "step": 423 + }, + { + "epoch": 0.22613333333333333, + "grad_norm": 0.5547698136524428, + "learning_rate": 0.0001805549597313267, + "loss": 0.7451, + "step": 424 + }, + { + "epoch": 0.22666666666666666, + "grad_norm": 0.42423881229772237, + "learning_rate": 0.0001804524477192075, + "loss": 0.7424, + "step": 425 + }, + { + "epoch": 0.2272, + "grad_norm": 0.41932308479173486, + "learning_rate": 0.00018034969546386757, + "loss": 0.7207, + "step": 426 + }, + { + "epoch": 0.22773333333333334, + "grad_norm": 0.44222308190052617, + "learning_rate": 0.00018024670327214084, + "loss": 0.8142, + "step": 427 + }, + { + "epoch": 0.22826666666666667, + "grad_norm": 0.43376550354966625, + "learning_rate": 0.00018014347145157755, + "loss": 0.7304, + "step": 428 + }, + { + "epoch": 0.2288, + "grad_norm": 0.45554808496225185, + "learning_rate": 0.0001800400003104436, + "loss": 0.7902, + "step": 429 + }, + { + "epoch": 0.22933333333333333, + "grad_norm": 0.39233811953378955, + "learning_rate": 0.0001799362901577196, + "loss": 0.6922, + "step": 430 + }, + { + "epoch": 0.22986666666666666, + "grad_norm": 0.5143180631551983, + "learning_rate": 0.00017983234130309968, + "loss": 0.8508, + "step": 431 + }, + { + "epoch": 0.2304, + "grad_norm": 0.5093664841627964, + "learning_rate": 0.00017972815405699103, + "loss": 0.7694, + "step": 432 + }, + { + "epoch": 0.23093333333333332, + "grad_norm": 0.42795052337689815, + "learning_rate": 0.00017962372873051252, + "loss": 0.7434, + "step": 433 + }, + { + "epoch": 0.23146666666666665, + "grad_norm": 0.41525741778223646, + "learning_rate": 0.00017951906563549397, + "loss": 0.7508, + "step": 434 + }, + { + "epoch": 0.232, + "grad_norm": 0.47323428827658764, + "learning_rate": 0.00017941416508447536, + "loss": 0.834, + "step": 435 + }, + { + "epoch": 0.23253333333333334, + "grad_norm": 0.4801983649843368, + "learning_rate": 0.00017930902739070562, + "loss": 0.7548, + "step": 436 + }, + { + "epoch": 0.23306666666666667, + "grad_norm": 0.5060015069434444, + "learning_rate": 0.00017920365286814183, + "loss": 0.858, + "step": 437 + }, + { + "epoch": 0.2336, + "grad_norm": 0.4361409062518517, + "learning_rate": 0.0001790980418314484, + "loss": 0.7244, + "step": 438 + }, + { + "epoch": 0.23413333333333333, + "grad_norm": 0.49103920706529286, + "learning_rate": 0.0001789921945959958, + "loss": 0.7844, + "step": 439 + }, + { + "epoch": 0.23466666666666666, + "grad_norm": 0.5653299481861203, + "learning_rate": 0.00017888611147786002, + "loss": 0.7917, + "step": 440 + }, + { + "epoch": 0.2352, + "grad_norm": 0.4353872202467468, + "learning_rate": 0.00017877979279382135, + "loss": 0.7144, + "step": 441 + }, + { + "epoch": 0.23573333333333332, + "grad_norm": 0.5014059158632823, + "learning_rate": 0.00017867323886136348, + "loss": 0.7122, + "step": 442 + }, + { + "epoch": 0.23626666666666668, + "grad_norm": 0.5436335750211385, + "learning_rate": 0.00017856644999867264, + "loss": 0.7866, + "step": 443 + }, + { + "epoch": 0.2368, + "grad_norm": 0.41721531036335957, + "learning_rate": 0.0001784594265246366, + "loss": 0.6763, + "step": 444 + }, + { + "epoch": 0.23733333333333334, + "grad_norm": 0.507161918631008, + "learning_rate": 0.00017835216875884368, + "loss": 0.7603, + "step": 445 + }, + { + "epoch": 0.23786666666666667, + "grad_norm": 0.5885444045544327, + "learning_rate": 0.0001782446770215819, + "loss": 0.8038, + "step": 446 + }, + { + "epoch": 0.2384, + "grad_norm": 0.40265878560133256, + "learning_rate": 0.0001781369516338378, + "loss": 0.7168, + "step": 447 + }, + { + "epoch": 0.23893333333333333, + "grad_norm": 0.6017448386092624, + "learning_rate": 0.00017802899291729585, + "loss": 0.7982, + "step": 448 + }, + { + "epoch": 0.23946666666666666, + "grad_norm": 0.45832934136293313, + "learning_rate": 0.0001779208011943371, + "loss": 0.8608, + "step": 449 + }, + { + "epoch": 0.24, + "grad_norm": 0.41844917394248277, + "learning_rate": 0.00017781237678803847, + "loss": 0.8087, + "step": 450 + }, + { + "epoch": 0.24053333333333332, + "grad_norm": 0.46507691971111825, + "learning_rate": 0.00017770372002217172, + "loss": 0.7491, + "step": 451 + }, + { + "epoch": 0.24106666666666668, + "grad_norm": 0.47351688392240704, + "learning_rate": 0.00017759483122120238, + "loss": 0.8074, + "step": 452 + }, + { + "epoch": 0.2416, + "grad_norm": 0.4154239201061571, + "learning_rate": 0.000177485710710289, + "loss": 0.729, + "step": 453 + }, + { + "epoch": 0.24213333333333334, + "grad_norm": 0.44021123134243856, + "learning_rate": 0.00017737635881528196, + "loss": 0.6933, + "step": 454 + }, + { + "epoch": 0.24266666666666667, + "grad_norm": 0.5328921997969187, + "learning_rate": 0.00017726677586272263, + "loss": 0.8426, + "step": 455 + }, + { + "epoch": 0.2432, + "grad_norm": 0.5102523624853145, + "learning_rate": 0.00017715696217984235, + "loss": 0.7099, + "step": 456 + }, + { + "epoch": 0.24373333333333333, + "grad_norm": 0.457351829008867, + "learning_rate": 0.00017704691809456143, + "loss": 0.7707, + "step": 457 + }, + { + "epoch": 0.24426666666666666, + "grad_norm": 0.7127093511519104, + "learning_rate": 0.0001769366439354882, + "loss": 0.8578, + "step": 458 + }, + { + "epoch": 0.2448, + "grad_norm": 0.39037585116747653, + "learning_rate": 0.00017682614003191807, + "loss": 0.7079, + "step": 459 + }, + { + "epoch": 0.24533333333333332, + "grad_norm": 0.4387647959996424, + "learning_rate": 0.00017671540671383243, + "loss": 0.6804, + "step": 460 + }, + { + "epoch": 0.24586666666666668, + "grad_norm": 0.5288638738165535, + "learning_rate": 0.0001766044443118978, + "loss": 0.8139, + "step": 461 + }, + { + "epoch": 0.2464, + "grad_norm": 0.47115878113596116, + "learning_rate": 0.00017649325315746478, + "loss": 0.7384, + "step": 462 + }, + { + "epoch": 0.24693333333333334, + "grad_norm": 0.46183812959425985, + "learning_rate": 0.00017638183358256696, + "loss": 0.7397, + "step": 463 + }, + { + "epoch": 0.24746666666666667, + "grad_norm": 0.5000788732299103, + "learning_rate": 0.00017627018591992018, + "loss": 0.8112, + "step": 464 + }, + { + "epoch": 0.248, + "grad_norm": 0.48262617864872853, + "learning_rate": 0.0001761583105029213, + "loss": 0.8582, + "step": 465 + }, + { + "epoch": 0.24853333333333333, + "grad_norm": 0.42645130758657335, + "learning_rate": 0.00017604620766564723, + "loss": 0.6859, + "step": 466 + }, + { + "epoch": 0.24906666666666666, + "grad_norm": 0.48305386241043113, + "learning_rate": 0.00017593387774285412, + "loss": 0.7342, + "step": 467 + }, + { + "epoch": 0.2496, + "grad_norm": 0.5387863030614242, + "learning_rate": 0.00017582132106997616, + "loss": 0.8496, + "step": 468 + }, + { + "epoch": 0.2501333333333333, + "grad_norm": 0.45387368932873007, + "learning_rate": 0.0001757085379831246, + "loss": 0.7747, + "step": 469 + }, + { + "epoch": 0.25066666666666665, + "grad_norm": 0.4891518550832705, + "learning_rate": 0.00017559552881908695, + "loss": 0.7686, + "step": 470 + }, + { + "epoch": 0.2512, + "grad_norm": 0.6491626825140133, + "learning_rate": 0.00017548229391532572, + "loss": 0.8211, + "step": 471 + }, + { + "epoch": 0.2517333333333333, + "grad_norm": 0.40152077036105355, + "learning_rate": 0.00017536883360997743, + "loss": 0.7162, + "step": 472 + }, + { + "epoch": 0.25226666666666664, + "grad_norm": 0.48088470877038086, + "learning_rate": 0.00017525514824185185, + "loss": 0.8232, + "step": 473 + }, + { + "epoch": 0.2528, + "grad_norm": 0.4326827443571955, + "learning_rate": 0.00017514123815043074, + "loss": 0.7644, + "step": 474 + }, + { + "epoch": 0.25333333333333335, + "grad_norm": 0.4026907127984479, + "learning_rate": 0.00017502710367586687, + "loss": 0.7148, + "step": 475 + }, + { + "epoch": 0.2538666666666667, + "grad_norm": 0.3808283680729642, + "learning_rate": 0.0001749127451589832, + "loss": 0.7195, + "step": 476 + }, + { + "epoch": 0.2544, + "grad_norm": 0.4629461315833693, + "learning_rate": 0.00017479816294127152, + "loss": 0.7329, + "step": 477 + }, + { + "epoch": 0.25493333333333335, + "grad_norm": 0.47849932306356757, + "learning_rate": 0.00017468335736489177, + "loss": 0.7467, + "step": 478 + }, + { + "epoch": 0.2554666666666667, + "grad_norm": 0.4321013194596791, + "learning_rate": 0.00017456832877267084, + "loss": 0.7224, + "step": 479 + }, + { + "epoch": 0.256, + "grad_norm": 0.4296075615472621, + "learning_rate": 0.0001744530775081015, + "loss": 0.7698, + "step": 480 + }, + { + "epoch": 0.25653333333333334, + "grad_norm": 0.4501563585562932, + "learning_rate": 0.00017433760391534167, + "loss": 0.8284, + "step": 481 + }, + { + "epoch": 0.25706666666666667, + "grad_norm": 0.4298971262426949, + "learning_rate": 0.00017422190833921283, + "loss": 0.7613, + "step": 482 + }, + { + "epoch": 0.2576, + "grad_norm": 0.4659220801901008, + "learning_rate": 0.0001741059911251997, + "loss": 0.7692, + "step": 483 + }, + { + "epoch": 0.2581333333333333, + "grad_norm": 0.5060198184251572, + "learning_rate": 0.00017398985261944856, + "loss": 0.7403, + "step": 484 + }, + { + "epoch": 0.25866666666666666, + "grad_norm": 0.40249062290001575, + "learning_rate": 0.00017387349316876666, + "loss": 0.7152, + "step": 485 + }, + { + "epoch": 0.2592, + "grad_norm": 0.5330978060621342, + "learning_rate": 0.000173756913120621, + "loss": 0.791, + "step": 486 + }, + { + "epoch": 0.2597333333333333, + "grad_norm": 0.6214848365468837, + "learning_rate": 0.0001736401128231373, + "loss": 0.7896, + "step": 487 + }, + { + "epoch": 0.26026666666666665, + "grad_norm": 0.43978605057364534, + "learning_rate": 0.00017352309262509894, + "loss": 0.7661, + "step": 488 + }, + { + "epoch": 0.2608, + "grad_norm": 0.5453447632069873, + "learning_rate": 0.00017340585287594604, + "loss": 0.8464, + "step": 489 + }, + { + "epoch": 0.2613333333333333, + "grad_norm": 0.48435012249024784, + "learning_rate": 0.0001732883939257742, + "loss": 0.8194, + "step": 490 + }, + { + "epoch": 0.2618666666666667, + "grad_norm": 0.4303372788774748, + "learning_rate": 0.0001731707161253338, + "loss": 0.6974, + "step": 491 + }, + { + "epoch": 0.2624, + "grad_norm": 0.42014942047868853, + "learning_rate": 0.0001730528198260285, + "loss": 0.7546, + "step": 492 + }, + { + "epoch": 0.26293333333333335, + "grad_norm": 0.37255071927968164, + "learning_rate": 0.00017293470537991463, + "loss": 0.7495, + "step": 493 + }, + { + "epoch": 0.2634666666666667, + "grad_norm": 0.5001674040197623, + "learning_rate": 0.00017281637313969978, + "loss": 0.8021, + "step": 494 + }, + { + "epoch": 0.264, + "grad_norm": 0.3779890246400226, + "learning_rate": 0.00017269782345874203, + "loss": 0.7286, + "step": 495 + }, + { + "epoch": 0.26453333333333334, + "grad_norm": 0.5021655118073337, + "learning_rate": 0.00017257905669104874, + "loss": 0.8506, + "step": 496 + }, + { + "epoch": 0.2650666666666667, + "grad_norm": 0.45595243514127176, + "learning_rate": 0.00017246007319127545, + "loss": 0.7342, + "step": 497 + }, + { + "epoch": 0.2656, + "grad_norm": 0.4726357599590175, + "learning_rate": 0.00017234087331472497, + "loss": 0.74, + "step": 498 + }, + { + "epoch": 0.26613333333333333, + "grad_norm": 0.4521335584835581, + "learning_rate": 0.00017222145741734626, + "loss": 0.709, + "step": 499 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 0.43127508095845263, + "learning_rate": 0.00017210182585573327, + "loss": 0.7497, + "step": 500 + }, + { + "epoch": 0.2672, + "grad_norm": 0.5599697811197301, + "learning_rate": 0.00017198197898712404, + "loss": 0.8137, + "step": 501 + }, + { + "epoch": 0.2677333333333333, + "grad_norm": 0.3804029533265518, + "learning_rate": 0.00017186191716939944, + "loss": 0.6915, + "step": 502 + }, + { + "epoch": 0.26826666666666665, + "grad_norm": 0.746183002609445, + "learning_rate": 0.0001717416407610824, + "loss": 0.7697, + "step": 503 + }, + { + "epoch": 0.2688, + "grad_norm": 0.5130232918519868, + "learning_rate": 0.00017162115012133643, + "loss": 0.806, + "step": 504 + }, + { + "epoch": 0.2693333333333333, + "grad_norm": 0.5213460024416292, + "learning_rate": 0.00017150044560996488, + "loss": 0.8428, + "step": 505 + }, + { + "epoch": 0.26986666666666664, + "grad_norm": 0.41022692879256917, + "learning_rate": 0.00017137952758740978, + "loss": 0.7383, + "step": 506 + }, + { + "epoch": 0.2704, + "grad_norm": 0.5578436217570287, + "learning_rate": 0.00017125839641475072, + "loss": 0.696, + "step": 507 + }, + { + "epoch": 0.27093333333333336, + "grad_norm": 0.45664610602130395, + "learning_rate": 0.00017113705245370368, + "loss": 0.8532, + "step": 508 + }, + { + "epoch": 0.2714666666666667, + "grad_norm": 0.5585428729991476, + "learning_rate": 0.00017101549606662024, + "loss": 0.8265, + "step": 509 + }, + { + "epoch": 0.272, + "grad_norm": 0.4244871717543694, + "learning_rate": 0.00017089372761648616, + "loss": 0.7385, + "step": 510 + }, + { + "epoch": 0.27253333333333335, + "grad_norm": 0.5227791534252502, + "learning_rate": 0.00017077174746692056, + "loss": 0.8892, + "step": 511 + }, + { + "epoch": 0.2730666666666667, + "grad_norm": 0.5318292679279191, + "learning_rate": 0.00017064955598217462, + "loss": 0.8573, + "step": 512 + }, + { + "epoch": 0.2736, + "grad_norm": 0.4615248700945455, + "learning_rate": 0.00017052715352713075, + "loss": 0.7564, + "step": 513 + }, + { + "epoch": 0.27413333333333334, + "grad_norm": 0.49176912695016994, + "learning_rate": 0.00017040454046730115, + "loss": 0.8478, + "step": 514 + }, + { + "epoch": 0.27466666666666667, + "grad_norm": 0.44791191354003707, + "learning_rate": 0.00017028171716882714, + "loss": 0.7504, + "step": 515 + }, + { + "epoch": 0.2752, + "grad_norm": 0.45997484601856076, + "learning_rate": 0.00017015868399847768, + "loss": 0.736, + "step": 516 + }, + { + "epoch": 0.27573333333333333, + "grad_norm": 0.4827218607134327, + "learning_rate": 0.00017003544132364846, + "loss": 0.7537, + "step": 517 + }, + { + "epoch": 0.27626666666666666, + "grad_norm": 0.4929806803809034, + "learning_rate": 0.00016991198951236088, + "loss": 0.7408, + "step": 518 + }, + { + "epoch": 0.2768, + "grad_norm": 0.4618438899566801, + "learning_rate": 0.00016978832893326074, + "loss": 0.7292, + "step": 519 + }, + { + "epoch": 0.2773333333333333, + "grad_norm": 0.3990480991514046, + "learning_rate": 0.00016966445995561727, + "loss": 0.6469, + "step": 520 + }, + { + "epoch": 0.27786666666666665, + "grad_norm": 0.5689347089145561, + "learning_rate": 0.00016954038294932216, + "loss": 0.8524, + "step": 521 + }, + { + "epoch": 0.2784, + "grad_norm": 0.42746924542558523, + "learning_rate": 0.00016941609828488807, + "loss": 0.7454, + "step": 522 + }, + { + "epoch": 0.2789333333333333, + "grad_norm": 0.45646558867280534, + "learning_rate": 0.0001692916063334479, + "loss": 0.7797, + "step": 523 + }, + { + "epoch": 0.27946666666666664, + "grad_norm": 0.4099030337127495, + "learning_rate": 0.0001691669074667535, + "loss": 0.7424, + "step": 524 + }, + { + "epoch": 0.28, + "grad_norm": 0.4699366544016761, + "learning_rate": 0.0001690420020571747, + "loss": 0.8001, + "step": 525 + }, + { + "epoch": 0.28053333333333336, + "grad_norm": 0.48801320135287946, + "learning_rate": 0.0001689168904776979, + "loss": 0.7731, + "step": 526 + }, + { + "epoch": 0.2810666666666667, + "grad_norm": 0.4729783464102199, + "learning_rate": 0.00016879157310192535, + "loss": 0.7898, + "step": 527 + }, + { + "epoch": 0.2816, + "grad_norm": 0.366664477964398, + "learning_rate": 0.0001686660503040737, + "loss": 0.7197, + "step": 528 + }, + { + "epoch": 0.28213333333333335, + "grad_norm": 0.6489799939556533, + "learning_rate": 0.00016854032245897308, + "loss": 0.8747, + "step": 529 + }, + { + "epoch": 0.2826666666666667, + "grad_norm": 0.4004114664963537, + "learning_rate": 0.00016841438994206595, + "loss": 0.6793, + "step": 530 + }, + { + "epoch": 0.2832, + "grad_norm": 0.39106408936041676, + "learning_rate": 0.00016828825312940592, + "loss": 0.7231, + "step": 531 + }, + { + "epoch": 0.28373333333333334, + "grad_norm": 0.7550406323196349, + "learning_rate": 0.00016816191239765667, + "loss": 0.8428, + "step": 532 + }, + { + "epoch": 0.28426666666666667, + "grad_norm": 0.5840244880545498, + "learning_rate": 0.00016803536812409075, + "loss": 0.8486, + "step": 533 + }, + { + "epoch": 0.2848, + "grad_norm": 0.4751770085819043, + "learning_rate": 0.0001679086206865886, + "loss": 0.7912, + "step": 534 + }, + { + "epoch": 0.2853333333333333, + "grad_norm": 0.4490688524112541, + "learning_rate": 0.00016778167046363734, + "loss": 0.7203, + "step": 535 + }, + { + "epoch": 0.28586666666666666, + "grad_norm": 0.43911848862092817, + "learning_rate": 0.00016765451783432953, + "loss": 0.7527, + "step": 536 + }, + { + "epoch": 0.2864, + "grad_norm": 0.4024482136911001, + "learning_rate": 0.00016752716317836229, + "loss": 0.7584, + "step": 537 + }, + { + "epoch": 0.2869333333333333, + "grad_norm": 0.39530631455486304, + "learning_rate": 0.0001673996068760359, + "loss": 0.7077, + "step": 538 + }, + { + "epoch": 0.28746666666666665, + "grad_norm": 0.3803203234642549, + "learning_rate": 0.00016727184930825288, + "loss": 0.6664, + "step": 539 + }, + { + "epoch": 0.288, + "grad_norm": 0.590917846307457, + "learning_rate": 0.0001671438908565167, + "loss": 0.8629, + "step": 540 + }, + { + "epoch": 0.2885333333333333, + "grad_norm": 0.40770436203904703, + "learning_rate": 0.00016701573190293077, + "loss": 0.7312, + "step": 541 + }, + { + "epoch": 0.2890666666666667, + "grad_norm": 0.4041210416957774, + "learning_rate": 0.00016688737283019706, + "loss": 0.6492, + "step": 542 + }, + { + "epoch": 0.2896, + "grad_norm": 0.4433940191733088, + "learning_rate": 0.00016675881402161536, + "loss": 0.7532, + "step": 543 + }, + { + "epoch": 0.29013333333333335, + "grad_norm": 0.5182148914526955, + "learning_rate": 0.00016663005586108176, + "loss": 0.7671, + "step": 544 + }, + { + "epoch": 0.2906666666666667, + "grad_norm": 0.4716060340008417, + "learning_rate": 0.00016650109873308765, + "loss": 0.7363, + "step": 545 + }, + { + "epoch": 0.2912, + "grad_norm": 0.43474109735536554, + "learning_rate": 0.0001663719430227186, + "loss": 0.7864, + "step": 546 + }, + { + "epoch": 0.29173333333333334, + "grad_norm": 0.4552286667017043, + "learning_rate": 0.0001662425891156531, + "loss": 0.7119, + "step": 547 + }, + { + "epoch": 0.2922666666666667, + "grad_norm": 0.4283714125903116, + "learning_rate": 0.00016611303739816168, + "loss": 0.7459, + "step": 548 + }, + { + "epoch": 0.2928, + "grad_norm": 0.4399614523439693, + "learning_rate": 0.00016598328825710533, + "loss": 0.7647, + "step": 549 + }, + { + "epoch": 0.29333333333333333, + "grad_norm": 0.49806828571806006, + "learning_rate": 0.00016585334207993476, + "loss": 0.8131, + "step": 550 + }, + { + "epoch": 0.29386666666666666, + "grad_norm": 0.47784531729781604, + "learning_rate": 0.00016572319925468892, + "loss": 0.7075, + "step": 551 + }, + { + "epoch": 0.2944, + "grad_norm": 0.43140573525010656, + "learning_rate": 0.000165592860169994, + "loss": 0.7423, + "step": 552 + }, + { + "epoch": 0.2949333333333333, + "grad_norm": 0.45867146939001696, + "learning_rate": 0.0001654623252150624, + "loss": 0.7795, + "step": 553 + }, + { + "epoch": 0.29546666666666666, + "grad_norm": 0.47173040602918015, + "learning_rate": 0.00016533159477969122, + "loss": 0.7149, + "step": 554 + }, + { + "epoch": 0.296, + "grad_norm": 0.4463124416158129, + "learning_rate": 0.00016520066925426144, + "loss": 0.6789, + "step": 555 + }, + { + "epoch": 0.2965333333333333, + "grad_norm": 0.43562608521827223, + "learning_rate": 0.00016506954902973655, + "loss": 0.7121, + "step": 556 + }, + { + "epoch": 0.29706666666666665, + "grad_norm": 0.510107104315274, + "learning_rate": 0.00016493823449766136, + "loss": 0.77, + "step": 557 + }, + { + "epoch": 0.2976, + "grad_norm": 0.4286178998079608, + "learning_rate": 0.0001648067260501611, + "loss": 0.6464, + "step": 558 + }, + { + "epoch": 0.2981333333333333, + "grad_norm": 0.5588290670377358, + "learning_rate": 0.00016467502407993992, + "loss": 0.8497, + "step": 559 + }, + { + "epoch": 0.2986666666666667, + "grad_norm": 0.4126039314109562, + "learning_rate": 0.0001645431289802799, + "loss": 0.698, + "step": 560 + }, + { + "epoch": 0.2992, + "grad_norm": 0.4438598521550294, + "learning_rate": 0.0001644110411450398, + "loss": 0.8173, + "step": 561 + }, + { + "epoch": 0.29973333333333335, + "grad_norm": 0.49310174824619524, + "learning_rate": 0.00016427876096865394, + "loss": 0.7645, + "step": 562 + }, + { + "epoch": 0.3002666666666667, + "grad_norm": 0.4827642620974618, + "learning_rate": 0.00016414628884613107, + "loss": 0.6694, + "step": 563 + }, + { + "epoch": 0.3008, + "grad_norm": 0.62650152900039, + "learning_rate": 0.00016401362517305296, + "loss": 0.7612, + "step": 564 + }, + { + "epoch": 0.30133333333333334, + "grad_norm": 0.4136073348459315, + "learning_rate": 0.00016388077034557355, + "loss": 0.6796, + "step": 565 + }, + { + "epoch": 0.30186666666666667, + "grad_norm": 0.4156469290680636, + "learning_rate": 0.00016374772476041748, + "loss": 0.7477, + "step": 566 + }, + { + "epoch": 0.3024, + "grad_norm": 0.46252910109112666, + "learning_rate": 0.00016361448881487914, + "loss": 0.7276, + "step": 567 + }, + { + "epoch": 0.30293333333333333, + "grad_norm": 0.4838458879606349, + "learning_rate": 0.00016348106290682118, + "loss": 0.7985, + "step": 568 + }, + { + "epoch": 0.30346666666666666, + "grad_norm": 0.41936094452170086, + "learning_rate": 0.00016334744743467364, + "loss": 0.746, + "step": 569 + }, + { + "epoch": 0.304, + "grad_norm": 0.4246631931992224, + "learning_rate": 0.00016321364279743266, + "loss": 0.7663, + "step": 570 + }, + { + "epoch": 0.3045333333333333, + "grad_norm": 0.5307689308824421, + "learning_rate": 0.00016307964939465914, + "loss": 0.8089, + "step": 571 + }, + { + "epoch": 0.30506666666666665, + "grad_norm": 0.4176028930825871, + "learning_rate": 0.00016294546762647775, + "loss": 0.7023, + "step": 572 + }, + { + "epoch": 0.3056, + "grad_norm": 0.4050874027460776, + "learning_rate": 0.0001628110978935756, + "loss": 0.7081, + "step": 573 + }, + { + "epoch": 0.3061333333333333, + "grad_norm": 0.5552860031694926, + "learning_rate": 0.0001626765405972011, + "loss": 0.8476, + "step": 574 + }, + { + "epoch": 0.30666666666666664, + "grad_norm": 0.5059177529622828, + "learning_rate": 0.00016254179613916278, + "loss": 0.7602, + "step": 575 + }, + { + "epoch": 0.3072, + "grad_norm": 0.45647957438327774, + "learning_rate": 0.00016240686492182804, + "loss": 0.7639, + "step": 576 + }, + { + "epoch": 0.30773333333333336, + "grad_norm": 0.45155865053962924, + "learning_rate": 0.000162271747348122, + "loss": 0.7365, + "step": 577 + }, + { + "epoch": 0.3082666666666667, + "grad_norm": 0.522916264139643, + "learning_rate": 0.0001621364438215262, + "loss": 0.74, + "step": 578 + }, + { + "epoch": 0.3088, + "grad_norm": 0.5960253694475376, + "learning_rate": 0.00016200095474607753, + "loss": 0.8734, + "step": 579 + }, + { + "epoch": 0.30933333333333335, + "grad_norm": 0.5578338552813317, + "learning_rate": 0.00016186528052636692, + "loss": 0.7991, + "step": 580 + }, + { + "epoch": 0.3098666666666667, + "grad_norm": 0.5563161861649676, + "learning_rate": 0.0001617294215675382, + "loss": 0.8162, + "step": 581 + }, + { + "epoch": 0.3104, + "grad_norm": 0.40767014271829316, + "learning_rate": 0.00016159337827528685, + "loss": 0.7877, + "step": 582 + }, + { + "epoch": 0.31093333333333334, + "grad_norm": 0.6405265025741261, + "learning_rate": 0.0001614571510558588, + "loss": 0.8762, + "step": 583 + }, + { + "epoch": 0.31146666666666667, + "grad_norm": 0.446836140499563, + "learning_rate": 0.00016132074031604917, + "loss": 0.7816, + "step": 584 + }, + { + "epoch": 0.312, + "grad_norm": 0.40862925375593145, + "learning_rate": 0.0001611841464632011, + "loss": 0.7069, + "step": 585 + }, + { + "epoch": 0.31253333333333333, + "grad_norm": 0.47345786974218745, + "learning_rate": 0.00016104736990520468, + "loss": 0.7699, + "step": 586 + }, + { + "epoch": 0.31306666666666666, + "grad_norm": 0.3964458638261727, + "learning_rate": 0.0001609104110504954, + "loss": 0.7024, + "step": 587 + }, + { + "epoch": 0.3136, + "grad_norm": 0.4782487646804292, + "learning_rate": 0.0001607732703080532, + "loss": 0.7582, + "step": 588 + }, + { + "epoch": 0.3141333333333333, + "grad_norm": 0.4781890518885793, + "learning_rate": 0.00016063594808740113, + "loss": 0.7751, + "step": 589 + }, + { + "epoch": 0.31466666666666665, + "grad_norm": 0.4814442042715367, + "learning_rate": 0.00016049844479860422, + "loss": 0.7019, + "step": 590 + }, + { + "epoch": 0.3152, + "grad_norm": 0.5199140974588942, + "learning_rate": 0.00016036076085226814, + "loss": 0.8447, + "step": 591 + }, + { + "epoch": 0.3157333333333333, + "grad_norm": 0.5313990613206334, + "learning_rate": 0.00016022289665953808, + "loss": 0.7816, + "step": 592 + }, + { + "epoch": 0.31626666666666664, + "grad_norm": 0.4799725877181169, + "learning_rate": 0.00016008485263209742, + "loss": 0.7832, + "step": 593 + }, + { + "epoch": 0.3168, + "grad_norm": 0.5266435584773811, + "learning_rate": 0.0001599466291821666, + "loss": 0.6904, + "step": 594 + }, + { + "epoch": 0.31733333333333336, + "grad_norm": 0.5046991392874137, + "learning_rate": 0.0001598082267225018, + "loss": 0.7408, + "step": 595 + }, + { + "epoch": 0.3178666666666667, + "grad_norm": 0.45424349902581057, + "learning_rate": 0.0001596696456663938, + "loss": 0.698, + "step": 596 + }, + { + "epoch": 0.3184, + "grad_norm": 0.4793111057554418, + "learning_rate": 0.0001595308864276666, + "loss": 0.7567, + "step": 597 + }, + { + "epoch": 0.31893333333333335, + "grad_norm": 0.43227414761525423, + "learning_rate": 0.00015939194942067646, + "loss": 0.7375, + "step": 598 + }, + { + "epoch": 0.3194666666666667, + "grad_norm": 0.5643311648896452, + "learning_rate": 0.0001592528350603103, + "loss": 0.858, + "step": 599 + }, + { + "epoch": 0.32, + "grad_norm": 0.4412915834499803, + "learning_rate": 0.0001591135437619847, + "loss": 0.69, + "step": 600 + }, + { + "epoch": 0.32053333333333334, + "grad_norm": 0.4531276391170449, + "learning_rate": 0.00015897407594164467, + "loss": 0.6854, + "step": 601 + }, + { + "epoch": 0.32106666666666667, + "grad_norm": 0.4905267317151564, + "learning_rate": 0.00015883443201576225, + "loss": 0.7467, + "step": 602 + }, + { + "epoch": 0.3216, + "grad_norm": 0.4911187969046422, + "learning_rate": 0.0001586946124013354, + "loss": 0.7275, + "step": 603 + }, + { + "epoch": 0.3221333333333333, + "grad_norm": 0.5013240404148479, + "learning_rate": 0.00015855461751588677, + "loss": 0.7718, + "step": 604 + }, + { + "epoch": 0.32266666666666666, + "grad_norm": 0.46830579668224936, + "learning_rate": 0.0001584144477774623, + "loss": 0.7554, + "step": 605 + }, + { + "epoch": 0.3232, + "grad_norm": 0.47951870119945444, + "learning_rate": 0.0001582741036046301, + "loss": 0.7971, + "step": 606 + }, + { + "epoch": 0.3237333333333333, + "grad_norm": 0.43737643759649525, + "learning_rate": 0.00015813358541647915, + "loss": 0.7431, + "step": 607 + }, + { + "epoch": 0.32426666666666665, + "grad_norm": 0.4765479525151643, + "learning_rate": 0.00015799289363261813, + "loss": 0.7483, + "step": 608 + }, + { + "epoch": 0.3248, + "grad_norm": 0.4851346776119451, + "learning_rate": 0.00015785202867317407, + "loss": 0.7464, + "step": 609 + }, + { + "epoch": 0.3253333333333333, + "grad_norm": 0.45666414540496963, + "learning_rate": 0.00015771099095879108, + "loss": 0.7142, + "step": 610 + }, + { + "epoch": 0.3258666666666667, + "grad_norm": 0.47124937687476487, + "learning_rate": 0.0001575697809106292, + "loss": 0.8063, + "step": 611 + }, + { + "epoch": 0.3264, + "grad_norm": 0.4512181791665397, + "learning_rate": 0.00015742839895036305, + "loss": 0.778, + "step": 612 + }, + { + "epoch": 0.32693333333333335, + "grad_norm": 0.43799214729588204, + "learning_rate": 0.00015728684550018064, + "loss": 0.715, + "step": 613 + }, + { + "epoch": 0.3274666666666667, + "grad_norm": 0.49942633605142783, + "learning_rate": 0.0001571451209827821, + "loss": 0.7268, + "step": 614 + }, + { + "epoch": 0.328, + "grad_norm": 0.41878555210391544, + "learning_rate": 0.00015700322582137827, + "loss": 0.6978, + "step": 615 + }, + { + "epoch": 0.32853333333333334, + "grad_norm": 0.4617890713138586, + "learning_rate": 0.00015686116043968972, + "loss": 0.8091, + "step": 616 + }, + { + "epoch": 0.3290666666666667, + "grad_norm": 0.5086351450883347, + "learning_rate": 0.00015671892526194516, + "loss": 0.7992, + "step": 617 + }, + { + "epoch": 0.3296, + "grad_norm": 0.3982048330141303, + "learning_rate": 0.0001565765207128805, + "loss": 0.7772, + "step": 618 + }, + { + "epoch": 0.33013333333333333, + "grad_norm": 0.5045069965717076, + "learning_rate": 0.0001564339472177373, + "loss": 0.8574, + "step": 619 + }, + { + "epoch": 0.33066666666666666, + "grad_norm": 0.4300046902832388, + "learning_rate": 0.00015629120520226165, + "loss": 0.6929, + "step": 620 + }, + { + "epoch": 0.3312, + "grad_norm": 0.3854888620823196, + "learning_rate": 0.0001561482950927029, + "loss": 0.6634, + "step": 621 + }, + { + "epoch": 0.3317333333333333, + "grad_norm": 0.44413915656516145, + "learning_rate": 0.0001560052173158123, + "loss": 0.7947, + "step": 622 + }, + { + "epoch": 0.33226666666666665, + "grad_norm": 0.5148269240369787, + "learning_rate": 0.00015586197229884184, + "loss": 0.7268, + "step": 623 + }, + { + "epoch": 0.3328, + "grad_norm": 0.42614724746829896, + "learning_rate": 0.00015571856046954285, + "loss": 0.7286, + "step": 624 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.5188637044930834, + "learning_rate": 0.00015557498225616487, + "loss": 0.7432, + "step": 625 + }, + { + "epoch": 0.33386666666666664, + "grad_norm": 0.4437765589378293, + "learning_rate": 0.0001554312380874542, + "loss": 0.7139, + "step": 626 + }, + { + "epoch": 0.3344, + "grad_norm": 0.4782643512158861, + "learning_rate": 0.00015528732839265272, + "loss": 0.7499, + "step": 627 + }, + { + "epoch": 0.33493333333333336, + "grad_norm": 0.49675578497530054, + "learning_rate": 0.00015514325360149668, + "loss": 0.7929, + "step": 628 + }, + { + "epoch": 0.3354666666666667, + "grad_norm": 0.522848980382239, + "learning_rate": 0.0001549990141442153, + "loss": 0.8067, + "step": 629 + }, + { + "epoch": 0.336, + "grad_norm": 0.5079536115851717, + "learning_rate": 0.0001548546104515294, + "loss": 0.7452, + "step": 630 + }, + { + "epoch": 0.33653333333333335, + "grad_norm": 0.4137829685964748, + "learning_rate": 0.00015471004295465035, + "loss": 0.7043, + "step": 631 + }, + { + "epoch": 0.3370666666666667, + "grad_norm": 0.453605358620488, + "learning_rate": 0.0001545653120852787, + "loss": 0.7271, + "step": 632 + }, + { + "epoch": 0.3376, + "grad_norm": 0.4141635974805384, + "learning_rate": 0.00015442041827560274, + "loss": 0.6285, + "step": 633 + }, + { + "epoch": 0.33813333333333334, + "grad_norm": 0.5076033525490622, + "learning_rate": 0.00015427536195829742, + "loss": 0.81, + "step": 634 + }, + { + "epoch": 0.33866666666666667, + "grad_norm": 0.40817821955356365, + "learning_rate": 0.00015413014356652286, + "loss": 0.6805, + "step": 635 + }, + { + "epoch": 0.3392, + "grad_norm": 0.4983328302722857, + "learning_rate": 0.00015398476353392323, + "loss": 0.8039, + "step": 636 + }, + { + "epoch": 0.33973333333333333, + "grad_norm": 0.4383914124946045, + "learning_rate": 0.00015383922229462549, + "loss": 0.793, + "step": 637 + }, + { + "epoch": 0.34026666666666666, + "grad_norm": 0.6154682202613985, + "learning_rate": 0.00015369352028323774, + "loss": 0.8009, + "step": 638 + }, + { + "epoch": 0.3408, + "grad_norm": 0.4996894135065563, + "learning_rate": 0.00015354765793484834, + "loss": 0.6986, + "step": 639 + }, + { + "epoch": 0.3413333333333333, + "grad_norm": 0.5342737258228484, + "learning_rate": 0.0001534016356850244, + "loss": 0.7628, + "step": 640 + }, + { + "epoch": 0.34186666666666665, + "grad_norm": 0.4368134499640202, + "learning_rate": 0.0001532554539698105, + "loss": 0.7567, + "step": 641 + }, + { + "epoch": 0.3424, + "grad_norm": 0.4420806653535677, + "learning_rate": 0.00015310911322572753, + "loss": 0.71, + "step": 642 + }, + { + "epoch": 0.3429333333333333, + "grad_norm": 0.4452477626339898, + "learning_rate": 0.00015296261388977108, + "loss": 0.7052, + "step": 643 + }, + { + "epoch": 0.34346666666666664, + "grad_norm": 0.4563064594584842, + "learning_rate": 0.0001528159563994104, + "loss": 0.7361, + "step": 644 + }, + { + "epoch": 0.344, + "grad_norm": 0.45460213987624065, + "learning_rate": 0.000152669141192587, + "loss": 0.7554, + "step": 645 + }, + { + "epoch": 0.34453333333333336, + "grad_norm": 0.42295115123363, + "learning_rate": 0.00015252216870771345, + "loss": 0.6675, + "step": 646 + }, + { + "epoch": 0.3450666666666667, + "grad_norm": 0.5267021561008577, + "learning_rate": 0.00015237503938367186, + "loss": 0.7679, + "step": 647 + }, + { + "epoch": 0.3456, + "grad_norm": 0.6540071760469934, + "learning_rate": 0.00015222775365981273, + "loss": 0.8059, + "step": 648 + }, + { + "epoch": 0.34613333333333335, + "grad_norm": 0.5213193527286809, + "learning_rate": 0.00015208031197595356, + "loss": 0.7712, + "step": 649 + }, + { + "epoch": 0.3466666666666667, + "grad_norm": 0.43139377295203163, + "learning_rate": 0.0001519327147723776, + "loss": 0.7263, + "step": 650 + }, + { + "epoch": 0.3472, + "grad_norm": 0.4316050447440885, + "learning_rate": 0.00015178496248983254, + "loss": 0.7478, + "step": 651 + }, + { + "epoch": 0.34773333333333334, + "grad_norm": 0.42914517990572965, + "learning_rate": 0.0001516370555695291, + "loss": 0.7407, + "step": 652 + }, + { + "epoch": 0.34826666666666667, + "grad_norm": 0.481153045607107, + "learning_rate": 0.00015148899445313981, + "loss": 0.7459, + "step": 653 + }, + { + "epoch": 0.3488, + "grad_norm": 0.46811913912239944, + "learning_rate": 0.00015134077958279765, + "loss": 0.7748, + "step": 654 + }, + { + "epoch": 0.34933333333333333, + "grad_norm": 0.4342823363947245, + "learning_rate": 0.00015119241140109467, + "loss": 0.7907, + "step": 655 + }, + { + "epoch": 0.34986666666666666, + "grad_norm": 0.3905026412716211, + "learning_rate": 0.00015104389035108077, + "loss": 0.7316, + "step": 656 + }, + { + "epoch": 0.3504, + "grad_norm": 0.42610498488986187, + "learning_rate": 0.00015089521687626243, + "loss": 0.7399, + "step": 657 + }, + { + "epoch": 0.3509333333333333, + "grad_norm": 0.5034508370115129, + "learning_rate": 0.0001507463914206012, + "loss": 0.8344, + "step": 658 + }, + { + "epoch": 0.35146666666666665, + "grad_norm": 0.5586391402759366, + "learning_rate": 0.0001505974144285124, + "loss": 0.74, + "step": 659 + }, + { + "epoch": 0.352, + "grad_norm": 0.37125900653608684, + "learning_rate": 0.000150448286344864, + "loss": 0.6617, + "step": 660 + }, + { + "epoch": 0.3525333333333333, + "grad_norm": 0.485583458353565, + "learning_rate": 0.00015029900761497506, + "loss": 0.7774, + "step": 661 + }, + { + "epoch": 0.35306666666666664, + "grad_norm": 0.4954571050155668, + "learning_rate": 0.00015014957868461458, + "loss": 0.7906, + "step": 662 + }, + { + "epoch": 0.3536, + "grad_norm": 0.505070845435494, + "learning_rate": 0.00015000000000000001, + "loss": 0.6987, + "step": 663 + }, + { + "epoch": 0.35413333333333336, + "grad_norm": 0.4134534293232696, + "learning_rate": 0.000149850272007796, + "loss": 0.7442, + "step": 664 + }, + { + "epoch": 0.3546666666666667, + "grad_norm": 0.5263929821549653, + "learning_rate": 0.00014970039515511304, + "loss": 0.7707, + "step": 665 + }, + { + "epoch": 0.3552, + "grad_norm": 0.44457344668479776, + "learning_rate": 0.00014955036988950618, + "loss": 0.7349, + "step": 666 + }, + { + "epoch": 0.35573333333333335, + "grad_norm": 0.5817464789803348, + "learning_rate": 0.0001494001966589736, + "loss": 0.7679, + "step": 667 + }, + { + "epoch": 0.3562666666666667, + "grad_norm": 0.5217524197580704, + "learning_rate": 0.00014924987591195547, + "loss": 0.7198, + "step": 668 + }, + { + "epoch": 0.3568, + "grad_norm": 0.46930386991088263, + "learning_rate": 0.00014909940809733222, + "loss": 0.7354, + "step": 669 + }, + { + "epoch": 0.35733333333333334, + "grad_norm": 0.5051126447435591, + "learning_rate": 0.0001489487936644237, + "loss": 0.6634, + "step": 670 + }, + { + "epoch": 0.35786666666666667, + "grad_norm": 0.49067362493495315, + "learning_rate": 0.00014879803306298736, + "loss": 0.7956, + "step": 671 + }, + { + "epoch": 0.3584, + "grad_norm": 0.5424462375306275, + "learning_rate": 0.00014864712674321734, + "loss": 0.8295, + "step": 672 + }, + { + "epoch": 0.3589333333333333, + "grad_norm": 0.47017969766672846, + "learning_rate": 0.00014849607515574276, + "loss": 0.7638, + "step": 673 + }, + { + "epoch": 0.35946666666666666, + "grad_norm": 0.4540154713824308, + "learning_rate": 0.00014834487875162657, + "loss": 0.7648, + "step": 674 + }, + { + "epoch": 0.36, + "grad_norm": 0.4837506777958714, + "learning_rate": 0.00014819353798236427, + "loss": 0.6739, + "step": 675 + }, + { + "epoch": 0.3605333333333333, + "grad_norm": 0.3934598204197847, + "learning_rate": 0.00014804205329988225, + "loss": 0.7396, + "step": 676 + }, + { + "epoch": 0.36106666666666665, + "grad_norm": 0.5432294644004281, + "learning_rate": 0.00014789042515653687, + "loss": 0.6925, + "step": 677 + }, + { + "epoch": 0.3616, + "grad_norm": 0.4789363170771198, + "learning_rate": 0.00014773865400511272, + "loss": 0.7313, + "step": 678 + }, + { + "epoch": 0.3621333333333333, + "grad_norm": 0.5357514667571863, + "learning_rate": 0.00014758674029882152, + "loss": 0.8168, + "step": 679 + }, + { + "epoch": 0.3626666666666667, + "grad_norm": 0.4500945178245003, + "learning_rate": 0.00014743468449130063, + "loss": 0.7634, + "step": 680 + }, + { + "epoch": 0.3632, + "grad_norm": 0.43400383552147376, + "learning_rate": 0.00014728248703661182, + "loss": 0.7586, + "step": 681 + }, + { + "epoch": 0.36373333333333335, + "grad_norm": 0.4259067740107126, + "learning_rate": 0.00014713014838923976, + "loss": 0.7522, + "step": 682 + }, + { + "epoch": 0.3642666666666667, + "grad_norm": 0.5105416602018407, + "learning_rate": 0.00014697766900409074, + "loss": 0.7649, + "step": 683 + }, + { + "epoch": 0.3648, + "grad_norm": 0.41119635062549587, + "learning_rate": 0.00014682504933649144, + "loss": 0.6514, + "step": 684 + }, + { + "epoch": 0.36533333333333334, + "grad_norm": 0.4588668418997286, + "learning_rate": 0.0001466722898421873, + "loss": 0.7102, + "step": 685 + }, + { + "epoch": 0.3658666666666667, + "grad_norm": 0.47124117773297414, + "learning_rate": 0.0001465193909773413, + "loss": 0.8015, + "step": 686 + }, + { + "epoch": 0.3664, + "grad_norm": 0.4658260816751191, + "learning_rate": 0.00014636635319853275, + "loss": 0.8045, + "step": 687 + }, + { + "epoch": 0.36693333333333333, + "grad_norm": 0.5015444435396279, + "learning_rate": 0.00014621317696275564, + "loss": 0.7678, + "step": 688 + }, + { + "epoch": 0.36746666666666666, + "grad_norm": 0.44157284696271476, + "learning_rate": 0.00014605986272741748, + "loss": 0.7543, + "step": 689 + }, + { + "epoch": 0.368, + "grad_norm": 0.48659294076982745, + "learning_rate": 0.00014590641095033787, + "loss": 0.743, + "step": 690 + }, + { + "epoch": 0.3685333333333333, + "grad_norm": 0.4265474980449237, + "learning_rate": 0.00014575282208974702, + "loss": 0.7171, + "step": 691 + }, + { + "epoch": 0.36906666666666665, + "grad_norm": 0.4219439573412077, + "learning_rate": 0.00014559909660428468, + "loss": 0.7029, + "step": 692 + }, + { + "epoch": 0.3696, + "grad_norm": 0.3855916919420435, + "learning_rate": 0.00014544523495299842, + "loss": 0.6896, + "step": 693 + }, + { + "epoch": 0.3701333333333333, + "grad_norm": 0.5427974072840934, + "learning_rate": 0.00014529123759534255, + "loss": 0.7743, + "step": 694 + }, + { + "epoch": 0.37066666666666664, + "grad_norm": 0.3866197168558701, + "learning_rate": 0.00014513710499117647, + "loss": 0.665, + "step": 695 + }, + { + "epoch": 0.3712, + "grad_norm": 0.4977247982824666, + "learning_rate": 0.0001449828376007636, + "loss": 0.7273, + "step": 696 + }, + { + "epoch": 0.37173333333333336, + "grad_norm": 0.49913093342251724, + "learning_rate": 0.00014482843588476974, + "loss": 0.7092, + "step": 697 + }, + { + "epoch": 0.3722666666666667, + "grad_norm": 0.39274709181532047, + "learning_rate": 0.00014467390030426186, + "loss": 0.7625, + "step": 698 + }, + { + "epoch": 0.3728, + "grad_norm": 0.501820916604351, + "learning_rate": 0.0001445192313207067, + "loss": 0.7874, + "step": 699 + }, + { + "epoch": 0.37333333333333335, + "grad_norm": 0.4436715480416968, + "learning_rate": 0.0001443644293959693, + "loss": 0.7617, + "step": 700 + }, + { + "epoch": 0.3738666666666667, + "grad_norm": 0.4162035570116486, + "learning_rate": 0.00014420949499231172, + "loss": 0.6804, + "step": 701 + }, + { + "epoch": 0.3744, + "grad_norm": 0.42380178364232374, + "learning_rate": 0.0001440544285723915, + "loss": 0.7388, + "step": 702 + }, + { + "epoch": 0.37493333333333334, + "grad_norm": 0.4468586847170325, + "learning_rate": 0.00014389923059926062, + "loss": 0.7477, + "step": 703 + }, + { + "epoch": 0.37546666666666667, + "grad_norm": 0.42116717568302664, + "learning_rate": 0.0001437439015363638, + "loss": 0.7642, + "step": 704 + }, + { + "epoch": 0.376, + "grad_norm": 0.43594572116323654, + "learning_rate": 0.00014358844184753712, + "loss": 0.753, + "step": 705 + }, + { + "epoch": 0.37653333333333333, + "grad_norm": 0.41700576251172683, + "learning_rate": 0.00014343285199700683, + "loss": 0.7619, + "step": 706 + }, + { + "epoch": 0.37706666666666666, + "grad_norm": 0.49112500879611, + "learning_rate": 0.0001432771324493879, + "loss": 0.7173, + "step": 707 + }, + { + "epoch": 0.3776, + "grad_norm": 0.42664436452177007, + "learning_rate": 0.00014312128366968243, + "loss": 0.6814, + "step": 708 + }, + { + "epoch": 0.3781333333333333, + "grad_norm": 0.5755636763054665, + "learning_rate": 0.00014296530612327863, + "loss": 0.8924, + "step": 709 + }, + { + "epoch": 0.37866666666666665, + "grad_norm": 0.41226417870120463, + "learning_rate": 0.00014280920027594907, + "loss": 0.7581, + "step": 710 + }, + { + "epoch": 0.3792, + "grad_norm": 0.41409852034763495, + "learning_rate": 0.00014265296659384956, + "loss": 0.6909, + "step": 711 + }, + { + "epoch": 0.3797333333333333, + "grad_norm": 0.4824416084642549, + "learning_rate": 0.00014249660554351752, + "loss": 0.7301, + "step": 712 + }, + { + "epoch": 0.38026666666666664, + "grad_norm": 0.47526961038746723, + "learning_rate": 0.00014234011759187083, + "loss": 0.7088, + "step": 713 + }, + { + "epoch": 0.3808, + "grad_norm": 0.4382117625168252, + "learning_rate": 0.00014218350320620624, + "loss": 0.727, + "step": 714 + }, + { + "epoch": 0.38133333333333336, + "grad_norm": 0.3949202593465074, + "learning_rate": 0.00014202676285419812, + "loss": 0.6361, + "step": 715 + }, + { + "epoch": 0.3818666666666667, + "grad_norm": 0.4840819307930419, + "learning_rate": 0.00014186989700389687, + "loss": 0.6759, + "step": 716 + }, + { + "epoch": 0.3824, + "grad_norm": 0.40343798682203114, + "learning_rate": 0.0001417129061237278, + "loss": 0.665, + "step": 717 + }, + { + "epoch": 0.38293333333333335, + "grad_norm": 0.39634190171344325, + "learning_rate": 0.0001415557906824895, + "loss": 0.6856, + "step": 718 + }, + { + "epoch": 0.3834666666666667, + "grad_norm": 0.4607000428848122, + "learning_rate": 0.00014139855114935252, + "loss": 0.6786, + "step": 719 + }, + { + "epoch": 0.384, + "grad_norm": 0.49937064313118673, + "learning_rate": 0.00014124118799385796, + "loss": 0.7668, + "step": 720 + }, + { + "epoch": 0.38453333333333334, + "grad_norm": 0.47729305823170465, + "learning_rate": 0.0001410837016859161, + "loss": 0.7371, + "step": 721 + }, + { + "epoch": 0.38506666666666667, + "grad_norm": 0.5027456902687405, + "learning_rate": 0.00014092609269580496, + "loss": 0.7119, + "step": 722 + }, + { + "epoch": 0.3856, + "grad_norm": 0.43275518606600494, + "learning_rate": 0.00014076836149416887, + "loss": 0.7195, + "step": 723 + }, + { + "epoch": 0.38613333333333333, + "grad_norm": 0.44641691338004713, + "learning_rate": 0.00014061050855201723, + "loss": 0.7837, + "step": 724 + }, + { + "epoch": 0.38666666666666666, + "grad_norm": 0.5351564724110155, + "learning_rate": 0.0001404525343407228, + "loss": 0.7391, + "step": 725 + }, + { + "epoch": 0.3872, + "grad_norm": 0.4133518842584368, + "learning_rate": 0.0001402944393320206, + "loss": 0.7433, + "step": 726 + }, + { + "epoch": 0.3877333333333333, + "grad_norm": 0.4087438003882989, + "learning_rate": 0.00014013622399800627, + "loss": 0.6868, + "step": 727 + }, + { + "epoch": 0.38826666666666665, + "grad_norm": 0.47086739260029037, + "learning_rate": 0.00013997788881113489, + "loss": 0.6866, + "step": 728 + }, + { + "epoch": 0.3888, + "grad_norm": 0.45708602893778905, + "learning_rate": 0.00013981943424421932, + "loss": 0.7001, + "step": 729 + }, + { + "epoch": 0.3893333333333333, + "grad_norm": 0.4152859381495282, + "learning_rate": 0.0001396608607704289, + "loss": 0.7203, + "step": 730 + }, + { + "epoch": 0.38986666666666664, + "grad_norm": 0.5280364808819354, + "learning_rate": 0.0001395021688632882, + "loss": 0.7786, + "step": 731 + }, + { + "epoch": 0.3904, + "grad_norm": 0.4212996214832479, + "learning_rate": 0.00013934335899667527, + "loss": 0.7541, + "step": 732 + }, + { + "epoch": 0.39093333333333335, + "grad_norm": 0.4470649164782845, + "learning_rate": 0.00013918443164482046, + "loss": 0.7063, + "step": 733 + }, + { + "epoch": 0.3914666666666667, + "grad_norm": 0.4315979565521048, + "learning_rate": 0.000139025387282305, + "loss": 0.6626, + "step": 734 + }, + { + "epoch": 0.392, + "grad_norm": 0.42566473320011, + "learning_rate": 0.00013886622638405952, + "loss": 0.738, + "step": 735 + }, + { + "epoch": 0.39253333333333335, + "grad_norm": 0.40467441171444274, + "learning_rate": 0.0001387069494253626, + "loss": 0.6889, + "step": 736 + }, + { + "epoch": 0.3930666666666667, + "grad_norm": 0.37594266801130694, + "learning_rate": 0.0001385475568818394, + "loss": 0.7135, + "step": 737 + }, + { + "epoch": 0.3936, + "grad_norm": 0.4639301597199985, + "learning_rate": 0.00013838804922946027, + "loss": 0.7565, + "step": 738 + }, + { + "epoch": 0.39413333333333334, + "grad_norm": 0.48709909205666824, + "learning_rate": 0.00013822842694453924, + "loss": 0.7843, + "step": 739 + }, + { + "epoch": 0.39466666666666667, + "grad_norm": 0.4916092905206473, + "learning_rate": 0.0001380686905037327, + "loss": 0.8131, + "step": 740 + }, + { + "epoch": 0.3952, + "grad_norm": 0.49642888986543793, + "learning_rate": 0.00013790884038403795, + "loss": 0.7182, + "step": 741 + }, + { + "epoch": 0.3957333333333333, + "grad_norm": 0.45779062911232865, + "learning_rate": 0.00013774887706279165, + "loss": 0.7063, + "step": 742 + }, + { + "epoch": 0.39626666666666666, + "grad_norm": 0.41682184977545866, + "learning_rate": 0.0001375888010176686, + "loss": 0.7044, + "step": 743 + }, + { + "epoch": 0.3968, + "grad_norm": 0.5636949224513499, + "learning_rate": 0.00013742861272668012, + "loss": 0.8135, + "step": 744 + }, + { + "epoch": 0.3973333333333333, + "grad_norm": 0.5214464152961691, + "learning_rate": 0.00013726831266817278, + "loss": 0.7709, + "step": 745 + }, + { + "epoch": 0.39786666666666665, + "grad_norm": 0.4542017923917684, + "learning_rate": 0.00013710790132082692, + "loss": 0.7317, + "step": 746 + }, + { + "epoch": 0.3984, + "grad_norm": 0.47749811632424743, + "learning_rate": 0.00013694737916365517, + "loss": 0.7397, + "step": 747 + }, + { + "epoch": 0.3989333333333333, + "grad_norm": 0.3735648222986758, + "learning_rate": 0.00013678674667600102, + "loss": 0.6383, + "step": 748 + }, + { + "epoch": 0.3994666666666667, + "grad_norm": 0.4016201210529084, + "learning_rate": 0.00013662600433753745, + "loss": 0.679, + "step": 749 + }, + { + "epoch": 0.4, + "grad_norm": 0.505896793729167, + "learning_rate": 0.00013646515262826552, + "loss": 0.8141, + "step": 750 + }, + { + "epoch": 0.40053333333333335, + "grad_norm": 0.5604762114299658, + "learning_rate": 0.00013630419202851284, + "loss": 0.7992, + "step": 751 + }, + { + "epoch": 0.4010666666666667, + "grad_norm": 0.48154668494328884, + "learning_rate": 0.00013614312301893223, + "loss": 0.7482, + "step": 752 + }, + { + "epoch": 0.4016, + "grad_norm": 0.3834678300713089, + "learning_rate": 0.0001359819460805001, + "loss": 0.6933, + "step": 753 + }, + { + "epoch": 0.40213333333333334, + "grad_norm": 0.37959432979002766, + "learning_rate": 0.00013582066169451535, + "loss": 0.7097, + "step": 754 + }, + { + "epoch": 0.4026666666666667, + "grad_norm": 0.42885235257161053, + "learning_rate": 0.0001356592703425976, + "loss": 0.7522, + "step": 755 + }, + { + "epoch": 0.4032, + "grad_norm": 0.4648928612908778, + "learning_rate": 0.0001354977725066859, + "loss": 0.7111, + "step": 756 + }, + { + "epoch": 0.40373333333333333, + "grad_norm": 0.4577270811186029, + "learning_rate": 0.00013533616866903735, + "loss": 0.7886, + "step": 757 + }, + { + "epoch": 0.40426666666666666, + "grad_norm": 0.4146231641601951, + "learning_rate": 0.0001351744593122255, + "loss": 0.6729, + "step": 758 + }, + { + "epoch": 0.4048, + "grad_norm": 0.4691410563463206, + "learning_rate": 0.00013501264491913906, + "loss": 0.7931, + "step": 759 + }, + { + "epoch": 0.4053333333333333, + "grad_norm": 0.44006688074462075, + "learning_rate": 0.00013485072597298038, + "loss": 0.693, + "step": 760 + }, + { + "epoch": 0.40586666666666665, + "grad_norm": 0.49475514221150574, + "learning_rate": 0.00013468870295726398, + "loss": 0.672, + "step": 761 + }, + { + "epoch": 0.4064, + "grad_norm": 0.3784851119779148, + "learning_rate": 0.0001345265763558152, + "loss": 0.6786, + "step": 762 + }, + { + "epoch": 0.4069333333333333, + "grad_norm": 0.4031492933596611, + "learning_rate": 0.00013436434665276865, + "loss": 0.7517, + "step": 763 + }, + { + "epoch": 0.40746666666666664, + "grad_norm": 0.48680566995254776, + "learning_rate": 0.00013420201433256689, + "loss": 0.7367, + "step": 764 + }, + { + "epoch": 0.408, + "grad_norm": 0.5113939087123414, + "learning_rate": 0.00013403957987995882, + "loss": 0.7802, + "step": 765 + }, + { + "epoch": 0.40853333333333336, + "grad_norm": 0.4102874605034871, + "learning_rate": 0.00013387704377999842, + "loss": 0.7369, + "step": 766 + }, + { + "epoch": 0.4090666666666667, + "grad_norm": 0.4171096051214564, + "learning_rate": 0.00013371440651804313, + "loss": 0.7368, + "step": 767 + }, + { + "epoch": 0.4096, + "grad_norm": 0.4524192577700863, + "learning_rate": 0.0001335516685797525, + "loss": 0.7456, + "step": 768 + }, + { + "epoch": 0.41013333333333335, + "grad_norm": 0.3666143699110309, + "learning_rate": 0.00013338883045108674, + "loss": 0.7239, + "step": 769 + }, + { + "epoch": 0.4106666666666667, + "grad_norm": 0.5440299357704128, + "learning_rate": 0.00013322589261830517, + "loss": 0.8337, + "step": 770 + }, + { + "epoch": 0.4112, + "grad_norm": 0.46149288718804504, + "learning_rate": 0.00013306285556796495, + "loss": 0.7527, + "step": 771 + }, + { + "epoch": 0.41173333333333334, + "grad_norm": 0.3801147835957714, + "learning_rate": 0.0001328997197869194, + "loss": 0.6629, + "step": 772 + }, + { + "epoch": 0.41226666666666667, + "grad_norm": 0.4221683917992794, + "learning_rate": 0.0001327364857623168, + "loss": 0.6924, + "step": 773 + }, + { + "epoch": 0.4128, + "grad_norm": 0.4774992398891083, + "learning_rate": 0.00013257315398159864, + "loss": 0.7155, + "step": 774 + }, + { + "epoch": 0.41333333333333333, + "grad_norm": 0.4708910106683695, + "learning_rate": 0.00013240972493249847, + "loss": 0.7459, + "step": 775 + }, + { + "epoch": 0.41386666666666666, + "grad_norm": 0.5191159285599448, + "learning_rate": 0.0001322461991030402, + "loss": 0.7612, + "step": 776 + }, + { + "epoch": 0.4144, + "grad_norm": 0.5036982816746822, + "learning_rate": 0.00013208257698153677, + "loss": 0.7711, + "step": 777 + }, + { + "epoch": 0.4149333333333333, + "grad_norm": 0.41557325237011916, + "learning_rate": 0.00013191885905658872, + "loss": 0.6962, + "step": 778 + }, + { + "epoch": 0.41546666666666665, + "grad_norm": 0.49612511563399675, + "learning_rate": 0.0001317550458170826, + "loss": 0.7896, + "step": 779 + }, + { + "epoch": 0.416, + "grad_norm": 0.4232564445836102, + "learning_rate": 0.00013159113775218964, + "loss": 0.6845, + "step": 780 + }, + { + "epoch": 0.4165333333333333, + "grad_norm": 0.4757437427343549, + "learning_rate": 0.00013142713535136414, + "loss": 0.7722, + "step": 781 + }, + { + "epoch": 0.41706666666666664, + "grad_norm": 0.45951650279539347, + "learning_rate": 0.00013126303910434214, + "loss": 0.7107, + "step": 782 + }, + { + "epoch": 0.4176, + "grad_norm": 0.4930026379152187, + "learning_rate": 0.00013109884950114007, + "loss": 0.8066, + "step": 783 + }, + { + "epoch": 0.41813333333333336, + "grad_norm": 0.5154519145490679, + "learning_rate": 0.00013093456703205288, + "loss": 0.7432, + "step": 784 + }, + { + "epoch": 0.4186666666666667, + "grad_norm": 0.43147853903317523, + "learning_rate": 0.00013077019218765305, + "loss": 0.7335, + "step": 785 + }, + { + "epoch": 0.4192, + "grad_norm": 0.5245300853486963, + "learning_rate": 0.00013060572545878875, + "loss": 0.7684, + "step": 786 + }, + { + "epoch": 0.41973333333333335, + "grad_norm": 0.48268223350859163, + "learning_rate": 0.0001304411673365826, + "loss": 0.7963, + "step": 787 + }, + { + "epoch": 0.4202666666666667, + "grad_norm": 0.45626224106345264, + "learning_rate": 0.0001302765183124302, + "loss": 0.7106, + "step": 788 + }, + { + "epoch": 0.4208, + "grad_norm": 0.4816615903566304, + "learning_rate": 0.00013011177887799845, + "loss": 0.7542, + "step": 789 + }, + { + "epoch": 0.42133333333333334, + "grad_norm": 0.44273772291567315, + "learning_rate": 0.00012994694952522435, + "loss": 0.743, + "step": 790 + }, + { + "epoch": 0.42186666666666667, + "grad_norm": 0.4627967288211668, + "learning_rate": 0.00012978203074631334, + "loss": 0.7363, + "step": 791 + }, + { + "epoch": 0.4224, + "grad_norm": 0.4921961571550859, + "learning_rate": 0.00012961702303373795, + "loss": 0.792, + "step": 792 + }, + { + "epoch": 0.42293333333333333, + "grad_norm": 0.4950946721863211, + "learning_rate": 0.00012945192688023624, + "loss": 0.7982, + "step": 793 + }, + { + "epoch": 0.42346666666666666, + "grad_norm": 0.5001968723810016, + "learning_rate": 0.0001292867427788104, + "loss": 0.7497, + "step": 794 + }, + { + "epoch": 0.424, + "grad_norm": 0.4987396790002395, + "learning_rate": 0.00012912147122272523, + "loss": 0.7409, + "step": 795 + }, + { + "epoch": 0.4245333333333333, + "grad_norm": 0.5033106613112274, + "learning_rate": 0.00012895611270550666, + "loss": 0.7685, + "step": 796 + }, + { + "epoch": 0.42506666666666665, + "grad_norm": 0.35568005161962374, + "learning_rate": 0.0001287906677209403, + "loss": 0.6351, + "step": 797 + }, + { + "epoch": 0.4256, + "grad_norm": 0.4484372275365866, + "learning_rate": 0.00012862513676307008, + "loss": 0.7417, + "step": 798 + }, + { + "epoch": 0.4261333333333333, + "grad_norm": 0.5534419189105819, + "learning_rate": 0.0001284595203261965, + "loss": 0.8277, + "step": 799 + }, + { + "epoch": 0.4266666666666667, + "grad_norm": 0.48027637709844756, + "learning_rate": 0.00012829381890487536, + "loss": 0.7758, + "step": 800 + }, + { + "epoch": 0.4272, + "grad_norm": 0.39266254437436654, + "learning_rate": 0.00012812803299391628, + "loss": 0.6756, + "step": 801 + }, + { + "epoch": 0.42773333333333335, + "grad_norm": 0.4484161848088681, + "learning_rate": 0.00012796216308838117, + "loss": 0.7329, + "step": 802 + }, + { + "epoch": 0.4282666666666667, + "grad_norm": 0.40397101383857004, + "learning_rate": 0.00012779620968358273, + "loss": 0.7604, + "step": 803 + }, + { + "epoch": 0.4288, + "grad_norm": 0.4642499099886137, + "learning_rate": 0.00012763017327508305, + "loss": 0.7229, + "step": 804 + }, + { + "epoch": 0.42933333333333334, + "grad_norm": 0.36456119311729024, + "learning_rate": 0.00012746405435869198, + "loss": 0.6818, + "step": 805 + }, + { + "epoch": 0.4298666666666667, + "grad_norm": 0.4460536435572603, + "learning_rate": 0.00012729785343046588, + "loss": 0.7805, + "step": 806 + }, + { + "epoch": 0.4304, + "grad_norm": 0.46166211598194307, + "learning_rate": 0.0001271315709867059, + "loss": 0.7206, + "step": 807 + }, + { + "epoch": 0.43093333333333333, + "grad_norm": 0.4733167613493444, + "learning_rate": 0.00012696520752395672, + "loss": 0.8046, + "step": 808 + }, + { + "epoch": 0.43146666666666667, + "grad_norm": 0.4375935945924862, + "learning_rate": 0.00012679876353900482, + "loss": 0.7366, + "step": 809 + }, + { + "epoch": 0.432, + "grad_norm": 0.4940390389594289, + "learning_rate": 0.00012663223952887723, + "loss": 0.7666, + "step": 810 + }, + { + "epoch": 0.4325333333333333, + "grad_norm": 0.4364323796942158, + "learning_rate": 0.00012646563599083996, + "loss": 0.7708, + "step": 811 + }, + { + "epoch": 0.43306666666666666, + "grad_norm": 0.4096999973044443, + "learning_rate": 0.00012629895342239643, + "loss": 0.7294, + "step": 812 + }, + { + "epoch": 0.4336, + "grad_norm": 0.4361399645771164, + "learning_rate": 0.00012613219232128608, + "loss": 0.6885, + "step": 813 + }, + { + "epoch": 0.4341333333333333, + "grad_norm": 0.44174351872813766, + "learning_rate": 0.00012596535318548289, + "loss": 0.7446, + "step": 814 + }, + { + "epoch": 0.43466666666666665, + "grad_norm": 0.427523329003452, + "learning_rate": 0.0001257984365131938, + "loss": 0.7147, + "step": 815 + }, + { + "epoch": 0.4352, + "grad_norm": 0.5778421408101975, + "learning_rate": 0.00012563144280285741, + "loss": 0.862, + "step": 816 + }, + { + "epoch": 0.4357333333333333, + "grad_norm": 0.44459566364555486, + "learning_rate": 0.00012546437255314222, + "loss": 0.7121, + "step": 817 + }, + { + "epoch": 0.4362666666666667, + "grad_norm": 0.4706629147593683, + "learning_rate": 0.0001252972262629454, + "loss": 0.6938, + "step": 818 + }, + { + "epoch": 0.4368, + "grad_norm": 0.49030812004620705, + "learning_rate": 0.00012513000443139112, + "loss": 0.7943, + "step": 819 + }, + { + "epoch": 0.43733333333333335, + "grad_norm": 0.4994646143202072, + "learning_rate": 0.00012496270755782914, + "loss": 0.7081, + "step": 820 + }, + { + "epoch": 0.4378666666666667, + "grad_norm": 0.39750133719675035, + "learning_rate": 0.00012479533614183334, + "loss": 0.6595, + "step": 821 + }, + { + "epoch": 0.4384, + "grad_norm": 0.45194558573661464, + "learning_rate": 0.00012462789068320017, + "loss": 0.7234, + "step": 822 + }, + { + "epoch": 0.43893333333333334, + "grad_norm": 0.46899376059178066, + "learning_rate": 0.00012446037168194714, + "loss": 0.7355, + "step": 823 + }, + { + "epoch": 0.43946666666666667, + "grad_norm": 0.48068632630673874, + "learning_rate": 0.00012429277963831148, + "loss": 0.7246, + "step": 824 + }, + { + "epoch": 0.44, + "grad_norm": 0.612549402178685, + "learning_rate": 0.00012412511505274844, + "loss": 0.8644, + "step": 825 + }, + { + "epoch": 0.44053333333333333, + "grad_norm": 0.48657475109795506, + "learning_rate": 0.00012395737842592995, + "loss": 0.768, + "step": 826 + }, + { + "epoch": 0.44106666666666666, + "grad_norm": 0.5062870143187943, + "learning_rate": 0.000123789570258743, + "loss": 0.8549, + "step": 827 + }, + { + "epoch": 0.4416, + "grad_norm": 0.39742262719452504, + "learning_rate": 0.00012362169105228826, + "loss": 0.6501, + "step": 828 + }, + { + "epoch": 0.4421333333333333, + "grad_norm": 0.3975895593026467, + "learning_rate": 0.00012345374130787854, + "loss": 0.6275, + "step": 829 + }, + { + "epoch": 0.44266666666666665, + "grad_norm": 0.4420456644998677, + "learning_rate": 0.00012328572152703725, + "loss": 0.7184, + "step": 830 + }, + { + "epoch": 0.4432, + "grad_norm": 0.6000175887702327, + "learning_rate": 0.000123117632211497, + "loss": 0.797, + "step": 831 + }, + { + "epoch": 0.4437333333333333, + "grad_norm": 0.4845212329336108, + "learning_rate": 0.00012294947386319794, + "loss": 0.7368, + "step": 832 + }, + { + "epoch": 0.44426666666666664, + "grad_norm": 0.4128619727170774, + "learning_rate": 0.0001227812469842864, + "loss": 0.7111, + "step": 833 + }, + { + "epoch": 0.4448, + "grad_norm": 0.40976818344769816, + "learning_rate": 0.00012261295207711346, + "loss": 0.6721, + "step": 834 + }, + { + "epoch": 0.44533333333333336, + "grad_norm": 0.5002400319715944, + "learning_rate": 0.00012244458964423327, + "loss": 0.7052, + "step": 835 + }, + { + "epoch": 0.4458666666666667, + "grad_norm": 0.42928149943260674, + "learning_rate": 0.00012227616018840154, + "loss": 0.7343, + "step": 836 + }, + { + "epoch": 0.4464, + "grad_norm": 0.46100917796831087, + "learning_rate": 0.0001221076642125742, + "loss": 0.7286, + "step": 837 + }, + { + "epoch": 0.44693333333333335, + "grad_norm": 0.5142961676832801, + "learning_rate": 0.00012193910221990581, + "loss": 0.6487, + "step": 838 + }, + { + "epoch": 0.4474666666666667, + "grad_norm": 0.5098026662721512, + "learning_rate": 0.00012177047471374807, + "loss": 0.7197, + "step": 839 + }, + { + "epoch": 0.448, + "grad_norm": 0.4460468154341747, + "learning_rate": 0.00012160178219764837, + "loss": 0.747, + "step": 840 + }, + { + "epoch": 0.44853333333333334, + "grad_norm": 0.4775939193735352, + "learning_rate": 0.0001214330251753481, + "loss": 0.6684, + "step": 841 + }, + { + "epoch": 0.44906666666666667, + "grad_norm": 0.4661822041472872, + "learning_rate": 0.00012126420415078132, + "loss": 0.7398, + "step": 842 + }, + { + "epoch": 0.4496, + "grad_norm": 0.3917799650828493, + "learning_rate": 0.00012109531962807332, + "loss": 0.6741, + "step": 843 + }, + { + "epoch": 0.45013333333333333, + "grad_norm": 0.4467040975511843, + "learning_rate": 0.00012092637211153885, + "loss": 0.6942, + "step": 844 + }, + { + "epoch": 0.45066666666666666, + "grad_norm": 0.5170019902686592, + "learning_rate": 0.0001207573621056809, + "loss": 0.7176, + "step": 845 + }, + { + "epoch": 0.4512, + "grad_norm": 0.3994721099123077, + "learning_rate": 0.00012058829011518896, + "loss": 0.719, + "step": 846 + }, + { + "epoch": 0.4517333333333333, + "grad_norm": 0.5164083639501578, + "learning_rate": 0.00012041915664493761, + "loss": 0.7935, + "step": 847 + }, + { + "epoch": 0.45226666666666665, + "grad_norm": 0.41581390005486735, + "learning_rate": 0.00012024996219998517, + "loss": 0.7246, + "step": 848 + }, + { + "epoch": 0.4528, + "grad_norm": 0.47301085242615, + "learning_rate": 0.00012008070728557186, + "loss": 0.7147, + "step": 849 + }, + { + "epoch": 0.4533333333333333, + "grad_norm": 0.38789396006274784, + "learning_rate": 0.00011991139240711857, + "loss": 0.7101, + "step": 850 + }, + { + "epoch": 0.45386666666666664, + "grad_norm": 0.5357165894970772, + "learning_rate": 0.00011974201807022525, + "loss": 0.7068, + "step": 851 + }, + { + "epoch": 0.4544, + "grad_norm": 0.42820463843571843, + "learning_rate": 0.00011957258478066931, + "loss": 0.7453, + "step": 852 + }, + { + "epoch": 0.45493333333333336, + "grad_norm": 0.37348328995932895, + "learning_rate": 0.00011940309304440433, + "loss": 0.6773, + "step": 853 + }, + { + "epoch": 0.4554666666666667, + "grad_norm": 0.4568102293336539, + "learning_rate": 0.00011923354336755835, + "loss": 0.6935, + "step": 854 + }, + { + "epoch": 0.456, + "grad_norm": 0.465763855201166, + "learning_rate": 0.00011906393625643244, + "loss": 0.6747, + "step": 855 + }, + { + "epoch": 0.45653333333333335, + "grad_norm": 0.44343252011603657, + "learning_rate": 0.00011889427221749916, + "loss": 0.7298, + "step": 856 + }, + { + "epoch": 0.4570666666666667, + "grad_norm": 0.41756697563581, + "learning_rate": 0.00011872455175740112, + "loss": 0.7099, + "step": 857 + }, + { + "epoch": 0.4576, + "grad_norm": 0.3543448389846333, + "learning_rate": 0.00011855477538294935, + "loss": 0.7016, + "step": 858 + }, + { + "epoch": 0.45813333333333334, + "grad_norm": 0.4749102980088939, + "learning_rate": 0.00011838494360112185, + "loss": 0.7041, + "step": 859 + }, + { + "epoch": 0.45866666666666667, + "grad_norm": 0.4843989649822722, + "learning_rate": 0.00011821505691906216, + "loss": 0.7386, + "step": 860 + }, + { + "epoch": 0.4592, + "grad_norm": 0.43647707931454, + "learning_rate": 0.00011804511584407763, + "loss": 0.7496, + "step": 861 + }, + { + "epoch": 0.4597333333333333, + "grad_norm": 0.4488231733014884, + "learning_rate": 0.00011787512088363817, + "loss": 0.7443, + "step": 862 + }, + { + "epoch": 0.46026666666666666, + "grad_norm": 0.4260702659667508, + "learning_rate": 0.00011770507254537453, + "loss": 0.7232, + "step": 863 + }, + { + "epoch": 0.4608, + "grad_norm": 0.3986432094275747, + "learning_rate": 0.00011753497133707679, + "loss": 0.6624, + "step": 864 + }, + { + "epoch": 0.4613333333333333, + "grad_norm": 0.4592828621112967, + "learning_rate": 0.00011736481776669306, + "loss": 0.74, + "step": 865 + }, + { + "epoch": 0.46186666666666665, + "grad_norm": 0.4408939554219865, + "learning_rate": 0.00011719461234232764, + "loss": 0.68, + "step": 866 + }, + { + "epoch": 0.4624, + "grad_norm": 0.5086358279517801, + "learning_rate": 0.00011702435557223987, + "loss": 0.6979, + "step": 867 + }, + { + "epoch": 0.4629333333333333, + "grad_norm": 0.39525537490305795, + "learning_rate": 0.00011685404796484225, + "loss": 0.6846, + "step": 868 + }, + { + "epoch": 0.4634666666666667, + "grad_norm": 0.4885877975258833, + "learning_rate": 0.00011668369002869912, + "loss": 0.6606, + "step": 869 + }, + { + "epoch": 0.464, + "grad_norm": 0.39425244906586043, + "learning_rate": 0.00011651328227252517, + "loss": 0.7186, + "step": 870 + }, + { + "epoch": 0.46453333333333335, + "grad_norm": 0.5567465211281566, + "learning_rate": 0.00011634282520518383, + "loss": 0.7419, + "step": 871 + }, + { + "epoch": 0.4650666666666667, + "grad_norm": 0.48497221446308514, + "learning_rate": 0.00011617231933568578, + "loss": 0.7072, + "step": 872 + }, + { + "epoch": 0.4656, + "grad_norm": 0.452834992623621, + "learning_rate": 0.00011600176517318741, + "loss": 0.7357, + "step": 873 + }, + { + "epoch": 0.46613333333333334, + "grad_norm": 0.40701039380651016, + "learning_rate": 0.00011583116322698935, + "loss": 0.7171, + "step": 874 + }, + { + "epoch": 0.4666666666666667, + "grad_norm": 0.4105060383138123, + "learning_rate": 0.00011566051400653486, + "loss": 0.6545, + "step": 875 + }, + { + "epoch": 0.4672, + "grad_norm": 0.43566126345189354, + "learning_rate": 0.00011548981802140848, + "loss": 0.7283, + "step": 876 + }, + { + "epoch": 0.46773333333333333, + "grad_norm": 0.47208521131207976, + "learning_rate": 0.00011531907578133429, + "loss": 0.7091, + "step": 877 + }, + { + "epoch": 0.46826666666666666, + "grad_norm": 0.5688134151204622, + "learning_rate": 0.00011514828779617459, + "loss": 0.7425, + "step": 878 + }, + { + "epoch": 0.4688, + "grad_norm": 0.5443143232913138, + "learning_rate": 0.00011497745457592816, + "loss": 0.7445, + "step": 879 + }, + { + "epoch": 0.4693333333333333, + "grad_norm": 0.48085819843103145, + "learning_rate": 0.00011480657663072896, + "loss": 0.6705, + "step": 880 + }, + { + "epoch": 0.46986666666666665, + "grad_norm": 0.4386271549602791, + "learning_rate": 0.00011463565447084445, + "loss": 0.7251, + "step": 881 + }, + { + "epoch": 0.4704, + "grad_norm": 0.4096971176974968, + "learning_rate": 0.00011446468860667421, + "loss": 0.648, + "step": 882 + }, + { + "epoch": 0.4709333333333333, + "grad_norm": 0.4993489649886308, + "learning_rate": 0.00011429367954874819, + "loss": 0.7352, + "step": 883 + }, + { + "epoch": 0.47146666666666665, + "grad_norm": 0.5522085818405623, + "learning_rate": 0.0001141226278077254, + "loss": 0.7709, + "step": 884 + }, + { + "epoch": 0.472, + "grad_norm": 0.4400400359043301, + "learning_rate": 0.00011395153389439233, + "loss": 0.7175, + "step": 885 + }, + { + "epoch": 0.47253333333333336, + "grad_norm": 0.40620390789782984, + "learning_rate": 0.00011378039831966134, + "loss": 0.7021, + "step": 886 + }, + { + "epoch": 0.4730666666666667, + "grad_norm": 0.5572622124721404, + "learning_rate": 0.00011360922159456928, + "loss": 0.6601, + "step": 887 + }, + { + "epoch": 0.4736, + "grad_norm": 0.4323977666292517, + "learning_rate": 0.00011343800423027582, + "loss": 0.6909, + "step": 888 + }, + { + "epoch": 0.47413333333333335, + "grad_norm": 0.476603405798883, + "learning_rate": 0.00011326674673806195, + "loss": 0.7392, + "step": 889 + }, + { + "epoch": 0.4746666666666667, + "grad_norm": 0.49721643178524155, + "learning_rate": 0.00011309544962932862, + "loss": 0.8066, + "step": 890 + }, + { + "epoch": 0.4752, + "grad_norm": 0.4727030971806207, + "learning_rate": 0.0001129241134155949, + "loss": 0.7741, + "step": 891 + }, + { + "epoch": 0.47573333333333334, + "grad_norm": 0.43043729497363453, + "learning_rate": 0.00011275273860849684, + "loss": 0.734, + "step": 892 + }, + { + "epoch": 0.47626666666666667, + "grad_norm": 0.4891809662815345, + "learning_rate": 0.00011258132571978555, + "loss": 0.6993, + "step": 893 + }, + { + "epoch": 0.4768, + "grad_norm": 0.5432057177158748, + "learning_rate": 0.00011240987526132594, + "loss": 0.7441, + "step": 894 + }, + { + "epoch": 0.47733333333333333, + "grad_norm": 0.39716243401806606, + "learning_rate": 0.00011223838774509514, + "loss": 0.7452, + "step": 895 + }, + { + "epoch": 0.47786666666666666, + "grad_norm": 0.5257584489427201, + "learning_rate": 0.00011206686368318086, + "loss": 0.6369, + "step": 896 + }, + { + "epoch": 0.4784, + "grad_norm": 0.40698333482853216, + "learning_rate": 0.00011189530358778005, + "loss": 0.6332, + "step": 897 + }, + { + "epoch": 0.4789333333333333, + "grad_norm": 0.4369951475725393, + "learning_rate": 0.00011172370797119712, + "loss": 0.7275, + "step": 898 + }, + { + "epoch": 0.47946666666666665, + "grad_norm": 0.43995365876772313, + "learning_rate": 0.00011155207734584263, + "loss": 0.7402, + "step": 899 + }, + { + "epoch": 0.48, + "grad_norm": 0.40987317542711543, + "learning_rate": 0.00011138041222423177, + "loss": 0.7448, + "step": 900 + }, + { + "epoch": 0.4805333333333333, + "grad_norm": 0.4932690579196363, + "learning_rate": 0.00011120871311898254, + "loss": 0.7736, + "step": 901 + }, + { + "epoch": 0.48106666666666664, + "grad_norm": 0.5829479809151474, + "learning_rate": 0.0001110369805428146, + "loss": 0.7785, + "step": 902 + }, + { + "epoch": 0.4816, + "grad_norm": 0.5123604082665411, + "learning_rate": 0.00011086521500854745, + "loss": 0.7132, + "step": 903 + }, + { + "epoch": 0.48213333333333336, + "grad_norm": 0.47175680926389957, + "learning_rate": 0.0001106934170290991, + "loss": 0.7543, + "step": 904 + }, + { + "epoch": 0.4826666666666667, + "grad_norm": 0.463546791041696, + "learning_rate": 0.00011052158711748434, + "loss": 0.6827, + "step": 905 + }, + { + "epoch": 0.4832, + "grad_norm": 0.4736667915714236, + "learning_rate": 0.00011034972578681338, + "loss": 0.7586, + "step": 906 + }, + { + "epoch": 0.48373333333333335, + "grad_norm": 0.5095245626981498, + "learning_rate": 0.00011017783355029026, + "loss": 0.7394, + "step": 907 + }, + { + "epoch": 0.4842666666666667, + "grad_norm": 0.4079615704673969, + "learning_rate": 0.00011000591092121127, + "loss": 0.7186, + "step": 908 + }, + { + "epoch": 0.4848, + "grad_norm": 0.39594416147779593, + "learning_rate": 0.00010983395841296348, + "loss": 0.7293, + "step": 909 + }, + { + "epoch": 0.48533333333333334, + "grad_norm": 0.49022952350120763, + "learning_rate": 0.0001096619765390232, + "loss": 0.8388, + "step": 910 + }, + { + "epoch": 0.48586666666666667, + "grad_norm": 0.5171604362910454, + "learning_rate": 0.00010948996581295436, + "loss": 0.7446, + "step": 911 + }, + { + "epoch": 0.4864, + "grad_norm": 0.39036446499127037, + "learning_rate": 0.00010931792674840718, + "loss": 0.7097, + "step": 912 + }, + { + "epoch": 0.48693333333333333, + "grad_norm": 0.45214678839921707, + "learning_rate": 0.00010914585985911632, + "loss": 0.6657, + "step": 913 + }, + { + "epoch": 0.48746666666666666, + "grad_norm": 0.44036420103706336, + "learning_rate": 0.00010897376565889971, + "loss": 0.6973, + "step": 914 + }, + { + "epoch": 0.488, + "grad_norm": 0.39197642684368633, + "learning_rate": 0.00010880164466165674, + "loss": 0.7057, + "step": 915 + }, + { + "epoch": 0.4885333333333333, + "grad_norm": 0.3988590838084675, + "learning_rate": 0.00010862949738136681, + "loss": 0.6701, + "step": 916 + }, + { + "epoch": 0.48906666666666665, + "grad_norm": 0.4958658645415372, + "learning_rate": 0.00010845732433208779, + "loss": 0.7006, + "step": 917 + }, + { + "epoch": 0.4896, + "grad_norm": 0.4456206698071224, + "learning_rate": 0.00010828512602795462, + "loss": 0.8273, + "step": 918 + }, + { + "epoch": 0.4901333333333333, + "grad_norm": 0.39693850277163817, + "learning_rate": 0.00010811290298317755, + "loss": 0.6024, + "step": 919 + }, + { + "epoch": 0.49066666666666664, + "grad_norm": 0.428211053877834, + "learning_rate": 0.00010794065571204072, + "loss": 0.6739, + "step": 920 + }, + { + "epoch": 0.4912, + "grad_norm": 0.4791909619530287, + "learning_rate": 0.00010776838472890065, + "loss": 0.7601, + "step": 921 + }, + { + "epoch": 0.49173333333333336, + "grad_norm": 0.4750482800486702, + "learning_rate": 0.00010759609054818458, + "loss": 0.7445, + "step": 922 + }, + { + "epoch": 0.4922666666666667, + "grad_norm": 0.41290528170872354, + "learning_rate": 0.00010742377368438914, + "loss": 0.652, + "step": 923 + }, + { + "epoch": 0.4928, + "grad_norm": 0.42791752193612376, + "learning_rate": 0.00010725143465207867, + "loss": 0.6863, + "step": 924 + }, + { + "epoch": 0.49333333333333335, + "grad_norm": 0.40641415334107067, + "learning_rate": 0.00010707907396588361, + "loss": 0.682, + "step": 925 + }, + { + "epoch": 0.4938666666666667, + "grad_norm": 0.3878362011438658, + "learning_rate": 0.0001069066921404992, + "loss": 0.6437, + "step": 926 + }, + { + "epoch": 0.4944, + "grad_norm": 0.5024343126636618, + "learning_rate": 0.00010673428969068364, + "loss": 0.7408, + "step": 927 + }, + { + "epoch": 0.49493333333333334, + "grad_norm": 0.4962270164379621, + "learning_rate": 0.00010656186713125689, + "loss": 0.7553, + "step": 928 + }, + { + "epoch": 0.49546666666666667, + "grad_norm": 0.38549643158558333, + "learning_rate": 0.0001063894249770989, + "loss": 0.6732, + "step": 929 + }, + { + "epoch": 0.496, + "grad_norm": 0.43394431289703034, + "learning_rate": 0.00010621696374314807, + "loss": 0.6688, + "step": 930 + }, + { + "epoch": 0.4965333333333333, + "grad_norm": 0.4294888638549087, + "learning_rate": 0.00010604448394439983, + "loss": 0.7451, + "step": 931 + }, + { + "epoch": 0.49706666666666666, + "grad_norm": 0.513370767246734, + "learning_rate": 0.00010587198609590505, + "loss": 0.7381, + "step": 932 + }, + { + "epoch": 0.4976, + "grad_norm": 0.38562627685147055, + "learning_rate": 0.00010569947071276847, + "loss": 0.6666, + "step": 933 + }, + { + "epoch": 0.4981333333333333, + "grad_norm": 0.41283769392651115, + "learning_rate": 0.00010552693831014726, + "loss": 0.6441, + "step": 934 + }, + { + "epoch": 0.49866666666666665, + "grad_norm": 0.4619707404051264, + "learning_rate": 0.0001053543894032493, + "loss": 0.7493, + "step": 935 + }, + { + "epoch": 0.4992, + "grad_norm": 0.4287926829979124, + "learning_rate": 0.00010518182450733186, + "loss": 0.6777, + "step": 936 + }, + { + "epoch": 0.4997333333333333, + "grad_norm": 0.4160457016809911, + "learning_rate": 0.00010500924413769988, + "loss": 0.6726, + "step": 937 + }, + { + "epoch": 0.5002666666666666, + "grad_norm": 0.5554187905057468, + "learning_rate": 0.00010483664880970457, + "loss": 0.7604, + "step": 938 + }, + { + "epoch": 0.5008, + "grad_norm": 0.5159855991913099, + "learning_rate": 0.00010466403903874176, + "loss": 0.7763, + "step": 939 + }, + { + "epoch": 0.5013333333333333, + "grad_norm": 0.41714889656788784, + "learning_rate": 0.00010449141534025045, + "loss": 0.7219, + "step": 940 + }, + { + "epoch": 0.5018666666666667, + "grad_norm": 0.49842980465802544, + "learning_rate": 0.00010431877822971117, + "loss": 0.717, + "step": 941 + }, + { + "epoch": 0.5024, + "grad_norm": 0.5844190017128177, + "learning_rate": 0.00010414612822264455, + "loss": 0.8275, + "step": 942 + }, + { + "epoch": 0.5029333333333333, + "grad_norm": 0.4595790009260156, + "learning_rate": 0.00010397346583460971, + "loss": 0.7924, + "step": 943 + }, + { + "epoch": 0.5034666666666666, + "grad_norm": 0.41040216394669327, + "learning_rate": 0.0001038007915812028, + "loss": 0.717, + "step": 944 + }, + { + "epoch": 0.504, + "grad_norm": 0.44866665228577424, + "learning_rate": 0.00010362810597805526, + "loss": 0.6627, + "step": 945 + }, + { + "epoch": 0.5045333333333333, + "grad_norm": 0.35373256778456563, + "learning_rate": 0.0001034554095408326, + "loss": 0.6602, + "step": 946 + }, + { + "epoch": 0.5050666666666667, + "grad_norm": 0.4355239438501313, + "learning_rate": 0.00010328270278523256, + "loss": 0.7167, + "step": 947 + }, + { + "epoch": 0.5056, + "grad_norm": 0.45168989893797795, + "learning_rate": 0.0001031099862269837, + "loss": 0.7315, + "step": 948 + }, + { + "epoch": 0.5061333333333333, + "grad_norm": 0.433058877472146, + "learning_rate": 0.00010293726038184393, + "loss": 0.7167, + "step": 949 + }, + { + "epoch": 0.5066666666666667, + "grad_norm": 0.5147396969205077, + "learning_rate": 0.00010276452576559879, + "loss": 0.7996, + "step": 950 + }, + { + "epoch": 0.5072, + "grad_norm": 0.39820945052815354, + "learning_rate": 0.00010259178289406011, + "loss": 0.6183, + "step": 951 + }, + { + "epoch": 0.5077333333333334, + "grad_norm": 0.4494938100536776, + "learning_rate": 0.00010241903228306431, + "loss": 0.7202, + "step": 952 + }, + { + "epoch": 0.5082666666666666, + "grad_norm": 0.44647286417911763, + "learning_rate": 0.0001022462744484709, + "loss": 0.6793, + "step": 953 + }, + { + "epoch": 0.5088, + "grad_norm": 0.45288656938564176, + "learning_rate": 0.00010207350990616107, + "loss": 0.6569, + "step": 954 + }, + { + "epoch": 0.5093333333333333, + "grad_norm": 0.36496777988182755, + "learning_rate": 0.00010190073917203589, + "loss": 0.6806, + "step": 955 + }, + { + "epoch": 0.5098666666666667, + "grad_norm": 0.38203936765064545, + "learning_rate": 0.00010172796276201503, + "loss": 0.6965, + "step": 956 + }, + { + "epoch": 0.5104, + "grad_norm": 0.4709068591866945, + "learning_rate": 0.0001015551811920351, + "loss": 0.746, + "step": 957 + }, + { + "epoch": 0.5109333333333334, + "grad_norm": 0.4219177375044993, + "learning_rate": 0.00010138239497804804, + "loss": 0.6893, + "step": 958 + }, + { + "epoch": 0.5114666666666666, + "grad_norm": 0.5870826706336743, + "learning_rate": 0.00010120960463601976, + "loss": 0.7592, + "step": 959 + }, + { + "epoch": 0.512, + "grad_norm": 0.5327220178331229, + "learning_rate": 0.00010103681068192845, + "loss": 0.7329, + "step": 960 + }, + { + "epoch": 0.5125333333333333, + "grad_norm": 0.41305113452194786, + "learning_rate": 0.00010086401363176305, + "loss": 0.658, + "step": 961 + }, + { + "epoch": 0.5130666666666667, + "grad_norm": 0.46250787679599625, + "learning_rate": 0.00010069121400152181, + "loss": 0.6998, + "step": 962 + }, + { + "epoch": 0.5136, + "grad_norm": 0.4213464021773313, + "learning_rate": 0.00010051841230721065, + "loss": 0.6636, + "step": 963 + }, + { + "epoch": 0.5141333333333333, + "grad_norm": 0.41013232445121084, + "learning_rate": 0.0001003456090648416, + "loss": 0.6652, + "step": 964 + }, + { + "epoch": 0.5146666666666667, + "grad_norm": 0.44713210682610505, + "learning_rate": 0.00010017280479043147, + "loss": 0.694, + "step": 965 + }, + { + "epoch": 0.5152, + "grad_norm": 0.43464675714158246, + "learning_rate": 0.0001, + "loss": 0.7232, + "step": 966 + }, + { + "epoch": 0.5157333333333334, + "grad_norm": 0.4249593090191897, + "learning_rate": 9.982719520956855e-05, + "loss": 0.6795, + "step": 967 + }, + { + "epoch": 0.5162666666666667, + "grad_norm": 0.38694978508371686, + "learning_rate": 9.965439093515841e-05, + "loss": 0.6764, + "step": 968 + }, + { + "epoch": 0.5168, + "grad_norm": 0.5271099145074317, + "learning_rate": 9.948158769278939e-05, + "loss": 0.7437, + "step": 969 + }, + { + "epoch": 0.5173333333333333, + "grad_norm": 0.46321151785706344, + "learning_rate": 9.930878599847821e-05, + "loss": 0.7345, + "step": 970 + }, + { + "epoch": 0.5178666666666667, + "grad_norm": 0.45221869725403974, + "learning_rate": 9.913598636823693e-05, + "loss": 0.7907, + "step": 971 + }, + { + "epoch": 0.5184, + "grad_norm": 0.42539803281429966, + "learning_rate": 9.896318931807155e-05, + "loss": 0.6517, + "step": 972 + }, + { + "epoch": 0.5189333333333334, + "grad_norm": 0.36006699989058355, + "learning_rate": 9.879039536398024e-05, + "loss": 0.6864, + "step": 973 + }, + { + "epoch": 0.5194666666666666, + "grad_norm": 0.36030444172155374, + "learning_rate": 9.861760502195197e-05, + "loss": 0.6732, + "step": 974 + }, + { + "epoch": 0.52, + "grad_norm": 0.6375235575589436, + "learning_rate": 9.844481880796491e-05, + "loss": 0.7887, + "step": 975 + }, + { + "epoch": 0.5205333333333333, + "grad_norm": 0.4803671988291979, + "learning_rate": 9.827203723798498e-05, + "loss": 0.7565, + "step": 976 + }, + { + "epoch": 0.5210666666666667, + "grad_norm": 0.40744927417844706, + "learning_rate": 9.809926082796415e-05, + "loss": 0.7236, + "step": 977 + }, + { + "epoch": 0.5216, + "grad_norm": 0.6152319463546623, + "learning_rate": 9.792649009383899e-05, + "loss": 0.7776, + "step": 978 + }, + { + "epoch": 0.5221333333333333, + "grad_norm": 0.455641754941924, + "learning_rate": 9.775372555152912e-05, + "loss": 0.7718, + "step": 979 + }, + { + "epoch": 0.5226666666666666, + "grad_norm": 0.4261067033513993, + "learning_rate": 9.758096771693573e-05, + "loss": 0.6904, + "step": 980 + }, + { + "epoch": 0.5232, + "grad_norm": 0.45068920636638804, + "learning_rate": 9.740821710593989e-05, + "loss": 0.6989, + "step": 981 + }, + { + "epoch": 0.5237333333333334, + "grad_norm": 0.40785941519734986, + "learning_rate": 9.723547423440122e-05, + "loss": 0.6472, + "step": 982 + }, + { + "epoch": 0.5242666666666667, + "grad_norm": 0.45278998387070374, + "learning_rate": 9.70627396181561e-05, + "loss": 0.7171, + "step": 983 + }, + { + "epoch": 0.5248, + "grad_norm": 0.4702138361071314, + "learning_rate": 9.689001377301633e-05, + "loss": 0.7618, + "step": 984 + }, + { + "epoch": 0.5253333333333333, + "grad_norm": 0.42087003933684464, + "learning_rate": 9.671729721476746e-05, + "loss": 0.6856, + "step": 985 + }, + { + "epoch": 0.5258666666666667, + "grad_norm": 0.4529602989870126, + "learning_rate": 9.654459045916743e-05, + "loss": 0.6972, + "step": 986 + }, + { + "epoch": 0.5264, + "grad_norm": 0.41772132583315275, + "learning_rate": 9.637189402194476e-05, + "loss": 0.7117, + "step": 987 + }, + { + "epoch": 0.5269333333333334, + "grad_norm": 0.46866569234472266, + "learning_rate": 9.619920841879725e-05, + "loss": 0.7737, + "step": 988 + }, + { + "epoch": 0.5274666666666666, + "grad_norm": 0.41867640636136205, + "learning_rate": 9.602653416539031e-05, + "loss": 0.7189, + "step": 989 + }, + { + "epoch": 0.528, + "grad_norm": 0.44186665568231653, + "learning_rate": 9.585387177735547e-05, + "loss": 0.7619, + "step": 990 + }, + { + "epoch": 0.5285333333333333, + "grad_norm": 0.385017555406739, + "learning_rate": 9.568122177028884e-05, + "loss": 0.6549, + "step": 991 + }, + { + "epoch": 0.5290666666666667, + "grad_norm": 0.43614042454037605, + "learning_rate": 9.550858465974958e-05, + "loss": 0.7709, + "step": 992 + }, + { + "epoch": 0.5296, + "grad_norm": 0.3762296247678466, + "learning_rate": 9.533596096125825e-05, + "loss": 0.6497, + "step": 993 + }, + { + "epoch": 0.5301333333333333, + "grad_norm": 0.47830120516263364, + "learning_rate": 9.516335119029546e-05, + "loss": 0.7929, + "step": 994 + }, + { + "epoch": 0.5306666666666666, + "grad_norm": 0.5227073834645749, + "learning_rate": 9.499075586230013e-05, + "loss": 0.734, + "step": 995 + }, + { + "epoch": 0.5312, + "grad_norm": 0.44905653509857996, + "learning_rate": 9.481817549266817e-05, + "loss": 0.7087, + "step": 996 + }, + { + "epoch": 0.5317333333333333, + "grad_norm": 0.42883202891996847, + "learning_rate": 9.464561059675073e-05, + "loss": 0.6814, + "step": 997 + }, + { + "epoch": 0.5322666666666667, + "grad_norm": 0.40213194026312754, + "learning_rate": 9.44730616898528e-05, + "loss": 0.6643, + "step": 998 + }, + { + "epoch": 0.5328, + "grad_norm": 0.4333556142908201, + "learning_rate": 9.430052928723153e-05, + "loss": 0.687, + "step": 999 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 0.41318618429500226, + "learning_rate": 9.412801390409497e-05, + "loss": 0.7211, + "step": 1000 + }, + { + "epoch": 0.5338666666666667, + "grad_norm": 0.39668297734504787, + "learning_rate": 9.395551605560018e-05, + "loss": 0.6753, + "step": 1001 + }, + { + "epoch": 0.5344, + "grad_norm": 0.39651168833521083, + "learning_rate": 9.378303625685195e-05, + "loss": 0.6816, + "step": 1002 + }, + { + "epoch": 0.5349333333333334, + "grad_norm": 0.5556691657425049, + "learning_rate": 9.361057502290113e-05, + "loss": 0.7181, + "step": 1003 + }, + { + "epoch": 0.5354666666666666, + "grad_norm": 0.5302412334783085, + "learning_rate": 9.343813286874312e-05, + "loss": 0.7832, + "step": 1004 + }, + { + "epoch": 0.536, + "grad_norm": 0.5133971152143451, + "learning_rate": 9.326571030931637e-05, + "loss": 0.7923, + "step": 1005 + }, + { + "epoch": 0.5365333333333333, + "grad_norm": 0.478268161561676, + "learning_rate": 9.309330785950086e-05, + "loss": 0.671, + "step": 1006 + }, + { + "epoch": 0.5370666666666667, + "grad_norm": 0.42157207098216287, + "learning_rate": 9.292092603411641e-05, + "loss": 0.7259, + "step": 1007 + }, + { + "epoch": 0.5376, + "grad_norm": 0.46073184075292656, + "learning_rate": 9.274856534792138e-05, + "loss": 0.7224, + "step": 1008 + }, + { + "epoch": 0.5381333333333334, + "grad_norm": 0.4442118173092234, + "learning_rate": 9.257622631561085e-05, + "loss": 0.6875, + "step": 1009 + }, + { + "epoch": 0.5386666666666666, + "grad_norm": 0.3731653885908938, + "learning_rate": 9.240390945181543e-05, + "loss": 0.7004, + "step": 1010 + }, + { + "epoch": 0.5392, + "grad_norm": 0.3970596950162719, + "learning_rate": 9.223161527109937e-05, + "loss": 0.7128, + "step": 1011 + }, + { + "epoch": 0.5397333333333333, + "grad_norm": 0.5842297674203025, + "learning_rate": 9.205934428795929e-05, + "loss": 0.6944, + "step": 1012 + }, + { + "epoch": 0.5402666666666667, + "grad_norm": 0.48127990585054503, + "learning_rate": 9.188709701682247e-05, + "loss": 0.753, + "step": 1013 + }, + { + "epoch": 0.5408, + "grad_norm": 0.4600128750907082, + "learning_rate": 9.171487397204539e-05, + "loss": 0.7023, + "step": 1014 + }, + { + "epoch": 0.5413333333333333, + "grad_norm": 0.4136237359433627, + "learning_rate": 9.154267566791223e-05, + "loss": 0.6706, + "step": 1015 + }, + { + "epoch": 0.5418666666666667, + "grad_norm": 0.5587048888071383, + "learning_rate": 9.137050261863324e-05, + "loss": 0.757, + "step": 1016 + }, + { + "epoch": 0.5424, + "grad_norm": 0.3967339415854174, + "learning_rate": 9.119835533834331e-05, + "loss": 0.6573, + "step": 1017 + }, + { + "epoch": 0.5429333333333334, + "grad_norm": 0.42270362295273634, + "learning_rate": 9.102623434110028e-05, + "loss": 0.681, + "step": 1018 + }, + { + "epoch": 0.5434666666666667, + "grad_norm": 0.4363570791175844, + "learning_rate": 9.085414014088369e-05, + "loss": 0.6836, + "step": 1019 + }, + { + "epoch": 0.544, + "grad_norm": 0.4807524333102998, + "learning_rate": 9.068207325159284e-05, + "loss": 0.6569, + "step": 1020 + }, + { + "epoch": 0.5445333333333333, + "grad_norm": 0.4505781908324612, + "learning_rate": 9.051003418704565e-05, + "loss": 0.7345, + "step": 1021 + }, + { + "epoch": 0.5450666666666667, + "grad_norm": 0.41450043002651243, + "learning_rate": 9.033802346097682e-05, + "loss": 0.708, + "step": 1022 + }, + { + "epoch": 0.5456, + "grad_norm": 0.402226024781564, + "learning_rate": 9.016604158703654e-05, + "loss": 0.714, + "step": 1023 + }, + { + "epoch": 0.5461333333333334, + "grad_norm": 0.4657621112858707, + "learning_rate": 8.999408907878877e-05, + "loss": 0.6965, + "step": 1024 + }, + { + "epoch": 0.5466666666666666, + "grad_norm": 0.44556512646731455, + "learning_rate": 8.982216644970979e-05, + "loss": 0.7212, + "step": 1025 + }, + { + "epoch": 0.5472, + "grad_norm": 0.47197005330850506, + "learning_rate": 8.965027421318665e-05, + "loss": 0.7444, + "step": 1026 + }, + { + "epoch": 0.5477333333333333, + "grad_norm": 0.476622780345975, + "learning_rate": 8.947841288251568e-05, + "loss": 0.7529, + "step": 1027 + }, + { + "epoch": 0.5482666666666667, + "grad_norm": 0.48936308862553063, + "learning_rate": 8.930658297090091e-05, + "loss": 0.8097, + "step": 1028 + }, + { + "epoch": 0.5488, + "grad_norm": 0.5634106240839507, + "learning_rate": 8.913478499145254e-05, + "loss": 0.778, + "step": 1029 + }, + { + "epoch": 0.5493333333333333, + "grad_norm": 0.4122836612929818, + "learning_rate": 8.896301945718541e-05, + "loss": 0.6886, + "step": 1030 + }, + { + "epoch": 0.5498666666666666, + "grad_norm": 0.4245650764659275, + "learning_rate": 8.879128688101749e-05, + "loss": 0.679, + "step": 1031 + }, + { + "epoch": 0.5504, + "grad_norm": 0.40997646926570785, + "learning_rate": 8.861958777576827e-05, + "loss": 0.7081, + "step": 1032 + }, + { + "epoch": 0.5509333333333334, + "grad_norm": 0.5391734428164183, + "learning_rate": 8.844792265415738e-05, + "loss": 0.7203, + "step": 1033 + }, + { + "epoch": 0.5514666666666667, + "grad_norm": 0.37763196105289126, + "learning_rate": 8.827629202880293e-05, + "loss": 0.7109, + "step": 1034 + }, + { + "epoch": 0.552, + "grad_norm": 0.6457489522459617, + "learning_rate": 8.810469641222001e-05, + "loss": 0.7399, + "step": 1035 + }, + { + "epoch": 0.5525333333333333, + "grad_norm": 0.5309623016242518, + "learning_rate": 8.793313631681915e-05, + "loss": 0.7563, + "step": 1036 + }, + { + "epoch": 0.5530666666666667, + "grad_norm": 0.4714886530100088, + "learning_rate": 8.776161225490489e-05, + "loss": 0.6578, + "step": 1037 + }, + { + "epoch": 0.5536, + "grad_norm": 0.42346170210757944, + "learning_rate": 8.759012473867407e-05, + "loss": 0.7486, + "step": 1038 + }, + { + "epoch": 0.5541333333333334, + "grad_norm": 0.37881968964596124, + "learning_rate": 8.741867428021446e-05, + "loss": 0.6547, + "step": 1039 + }, + { + "epoch": 0.5546666666666666, + "grad_norm": 0.47059974199545634, + "learning_rate": 8.724726139150318e-05, + "loss": 0.6519, + "step": 1040 + }, + { + "epoch": 0.5552, + "grad_norm": 0.41995862073775514, + "learning_rate": 8.707588658440511e-05, + "loss": 0.707, + "step": 1041 + }, + { + "epoch": 0.5557333333333333, + "grad_norm": 0.4870154589149614, + "learning_rate": 8.690455037067141e-05, + "loss": 0.6871, + "step": 1042 + }, + { + "epoch": 0.5562666666666667, + "grad_norm": 0.5446337912386945, + "learning_rate": 8.673325326193806e-05, + "loss": 0.7219, + "step": 1043 + }, + { + "epoch": 0.5568, + "grad_norm": 0.4600082573187779, + "learning_rate": 8.656199576972423e-05, + "loss": 0.6752, + "step": 1044 + }, + { + "epoch": 0.5573333333333333, + "grad_norm": 0.602259027120918, + "learning_rate": 8.639077840543077e-05, + "loss": 0.7799, + "step": 1045 + }, + { + "epoch": 0.5578666666666666, + "grad_norm": 0.4195158570317328, + "learning_rate": 8.621960168033867e-05, + "loss": 0.7083, + "step": 1046 + }, + { + "epoch": 0.5584, + "grad_norm": 0.45297381521337465, + "learning_rate": 8.604846610560771e-05, + "loss": 0.76, + "step": 1047 + }, + { + "epoch": 0.5589333333333333, + "grad_norm": 0.44714546612521117, + "learning_rate": 8.587737219227462e-05, + "loss": 0.7611, + "step": 1048 + }, + { + "epoch": 0.5594666666666667, + "grad_norm": 0.5761347301832973, + "learning_rate": 8.570632045125185e-05, + "loss": 0.735, + "step": 1049 + }, + { + "epoch": 0.56, + "grad_norm": 0.4066225432883208, + "learning_rate": 8.553531139332582e-05, + "loss": 0.6534, + "step": 1050 + }, + { + "epoch": 0.5605333333333333, + "grad_norm": 0.42139487861342373, + "learning_rate": 8.536434552915556e-05, + "loss": 0.7228, + "step": 1051 + }, + { + "epoch": 0.5610666666666667, + "grad_norm": 0.4606520290345603, + "learning_rate": 8.519342336927105e-05, + "loss": 0.7882, + "step": 1052 + }, + { + "epoch": 0.5616, + "grad_norm": 0.3970252044933702, + "learning_rate": 8.502254542407186e-05, + "loss": 0.6499, + "step": 1053 + }, + { + "epoch": 0.5621333333333334, + "grad_norm": 0.4261427255932377, + "learning_rate": 8.485171220382545e-05, + "loss": 0.695, + "step": 1054 + }, + { + "epoch": 0.5626666666666666, + "grad_norm": 0.5251732898816305, + "learning_rate": 8.468092421866573e-05, + "loss": 0.714, + "step": 1055 + }, + { + "epoch": 0.5632, + "grad_norm": 0.41141372444674557, + "learning_rate": 8.451018197859153e-05, + "loss": 0.7439, + "step": 1056 + }, + { + "epoch": 0.5637333333333333, + "grad_norm": 0.41889632476168526, + "learning_rate": 8.433948599346516e-05, + "loss": 0.6985, + "step": 1057 + }, + { + "epoch": 0.5642666666666667, + "grad_norm": 0.38595234209643636, + "learning_rate": 8.416883677301069e-05, + "loss": 0.6944, + "step": 1058 + }, + { + "epoch": 0.5648, + "grad_norm": 0.4351385284929873, + "learning_rate": 8.399823482681262e-05, + "loss": 0.6671, + "step": 1059 + }, + { + "epoch": 0.5653333333333334, + "grad_norm": 0.46213495852699893, + "learning_rate": 8.382768066431425e-05, + "loss": 0.7162, + "step": 1060 + }, + { + "epoch": 0.5658666666666666, + "grad_norm": 0.38040261025201877, + "learning_rate": 8.36571747948162e-05, + "loss": 0.6336, + "step": 1061 + }, + { + "epoch": 0.5664, + "grad_norm": 0.4778906493943159, + "learning_rate": 8.348671772747487e-05, + "loss": 0.6506, + "step": 1062 + }, + { + "epoch": 0.5669333333333333, + "grad_norm": 0.5604201857765944, + "learning_rate": 8.33163099713009e-05, + "loss": 0.7572, + "step": 1063 + }, + { + "epoch": 0.5674666666666667, + "grad_norm": 0.37664708048191525, + "learning_rate": 8.31459520351578e-05, + "loss": 0.6771, + "step": 1064 + }, + { + "epoch": 0.568, + "grad_norm": 0.47181259684094984, + "learning_rate": 8.297564442776014e-05, + "loss": 0.6562, + "step": 1065 + }, + { + "epoch": 0.5685333333333333, + "grad_norm": 0.41530801771730214, + "learning_rate": 8.280538765767235e-05, + "loss": 0.6284, + "step": 1066 + }, + { + "epoch": 0.5690666666666667, + "grad_norm": 0.5174511494038859, + "learning_rate": 8.263518223330697e-05, + "loss": 0.6459, + "step": 1067 + }, + { + "epoch": 0.5696, + "grad_norm": 0.6389466638942793, + "learning_rate": 8.246502866292324e-05, + "loss": 0.7926, + "step": 1068 + }, + { + "epoch": 0.5701333333333334, + "grad_norm": 0.4948881751553678, + "learning_rate": 8.22949274546255e-05, + "loss": 0.7238, + "step": 1069 + }, + { + "epoch": 0.5706666666666667, + "grad_norm": 0.3991432212775443, + "learning_rate": 8.212487911636184e-05, + "loss": 0.6606, + "step": 1070 + }, + { + "epoch": 0.5712, + "grad_norm": 0.46658729965932916, + "learning_rate": 8.195488415592238e-05, + "loss": 0.7219, + "step": 1071 + }, + { + "epoch": 0.5717333333333333, + "grad_norm": 0.4413099807534331, + "learning_rate": 8.178494308093789e-05, + "loss": 0.7006, + "step": 1072 + }, + { + "epoch": 0.5722666666666667, + "grad_norm": 0.4958849327877985, + "learning_rate": 8.161505639887817e-05, + "loss": 0.6708, + "step": 1073 + }, + { + "epoch": 0.5728, + "grad_norm": 0.45632539986067144, + "learning_rate": 8.144522461705067e-05, + "loss": 0.6722, + "step": 1074 + }, + { + "epoch": 0.5733333333333334, + "grad_norm": 0.5116081581399695, + "learning_rate": 8.127544824259889e-05, + "loss": 0.69, + "step": 1075 + }, + { + "epoch": 0.5738666666666666, + "grad_norm": 0.4883601563346209, + "learning_rate": 8.110572778250085e-05, + "loss": 0.7335, + "step": 1076 + }, + { + "epoch": 0.5744, + "grad_norm": 0.5028990558941931, + "learning_rate": 8.093606374356759e-05, + "loss": 0.6732, + "step": 1077 + }, + { + "epoch": 0.5749333333333333, + "grad_norm": 0.4719857979519838, + "learning_rate": 8.076645663244168e-05, + "loss": 0.701, + "step": 1078 + }, + { + "epoch": 0.5754666666666667, + "grad_norm": 0.46411915311096336, + "learning_rate": 8.059690695559568e-05, + "loss": 0.7122, + "step": 1079 + }, + { + "epoch": 0.576, + "grad_norm": 0.3934084226407831, + "learning_rate": 8.042741521933071e-05, + "loss": 0.6416, + "step": 1080 + }, + { + "epoch": 0.5765333333333333, + "grad_norm": 0.4648759855900219, + "learning_rate": 8.025798192977481e-05, + "loss": 0.7334, + "step": 1081 + }, + { + "epoch": 0.5770666666666666, + "grad_norm": 0.47281681448887125, + "learning_rate": 8.008860759288147e-05, + "loss": 0.6864, + "step": 1082 + }, + { + "epoch": 0.5776, + "grad_norm": 0.4394932571747774, + "learning_rate": 7.991929271442817e-05, + "loss": 0.65, + "step": 1083 + }, + { + "epoch": 0.5781333333333334, + "grad_norm": 0.4790191382059904, + "learning_rate": 7.975003780001485e-05, + "loss": 0.7583, + "step": 1084 + }, + { + "epoch": 0.5786666666666667, + "grad_norm": 0.5237037737491155, + "learning_rate": 7.958084335506239e-05, + "loss": 0.8054, + "step": 1085 + }, + { + "epoch": 0.5792, + "grad_norm": 0.5098117771489851, + "learning_rate": 7.941170988481108e-05, + "loss": 0.6473, + "step": 1086 + }, + { + "epoch": 0.5797333333333333, + "grad_norm": 0.4682109938448841, + "learning_rate": 7.924263789431912e-05, + "loss": 0.7574, + "step": 1087 + }, + { + "epoch": 0.5802666666666667, + "grad_norm": 0.35640481541049723, + "learning_rate": 7.907362788846116e-05, + "loss": 0.7113, + "step": 1088 + }, + { + "epoch": 0.5808, + "grad_norm": 0.38739878010398104, + "learning_rate": 7.89046803719267e-05, + "loss": 0.6585, + "step": 1089 + }, + { + "epoch": 0.5813333333333334, + "grad_norm": 0.3983792120697055, + "learning_rate": 7.873579584921869e-05, + "loss": 0.7034, + "step": 1090 + }, + { + "epoch": 0.5818666666666666, + "grad_norm": 0.4113268423149, + "learning_rate": 7.856697482465196e-05, + "loss": 0.626, + "step": 1091 + }, + { + "epoch": 0.5824, + "grad_norm": 0.4088959160760906, + "learning_rate": 7.839821780235168e-05, + "loss": 0.6491, + "step": 1092 + }, + { + "epoch": 0.5829333333333333, + "grad_norm": 0.4274621841367322, + "learning_rate": 7.822952528625191e-05, + "loss": 0.6996, + "step": 1093 + }, + { + "epoch": 0.5834666666666667, + "grad_norm": 0.40221440742043035, + "learning_rate": 7.806089778009421e-05, + "loss": 0.6947, + "step": 1094 + }, + { + "epoch": 0.584, + "grad_norm": 0.5283388538749225, + "learning_rate": 7.789233578742582e-05, + "loss": 0.7336, + "step": 1095 + }, + { + "epoch": 0.5845333333333333, + "grad_norm": 0.40324576874310253, + "learning_rate": 7.772383981159849e-05, + "loss": 0.6637, + "step": 1096 + }, + { + "epoch": 0.5850666666666666, + "grad_norm": 0.42919137885726905, + "learning_rate": 7.755541035576677e-05, + "loss": 0.6753, + "step": 1097 + }, + { + "epoch": 0.5856, + "grad_norm": 0.440153196601291, + "learning_rate": 7.738704792288655e-05, + "loss": 0.7209, + "step": 1098 + }, + { + "epoch": 0.5861333333333333, + "grad_norm": 0.42001973533574605, + "learning_rate": 7.721875301571359e-05, + "loss": 0.6795, + "step": 1099 + }, + { + "epoch": 0.5866666666666667, + "grad_norm": 0.5868176666072477, + "learning_rate": 7.705052613680211e-05, + "loss": 0.6963, + "step": 1100 + }, + { + "epoch": 0.5872, + "grad_norm": 0.45549033769661046, + "learning_rate": 7.688236778850306e-05, + "loss": 0.6951, + "step": 1101 + }, + { + "epoch": 0.5877333333333333, + "grad_norm": 0.44250320987177044, + "learning_rate": 7.671427847296275e-05, + "loss": 0.6385, + "step": 1102 + }, + { + "epoch": 0.5882666666666667, + "grad_norm": 0.39035373524284306, + "learning_rate": 7.654625869212146e-05, + "loss": 0.6368, + "step": 1103 + }, + { + "epoch": 0.5888, + "grad_norm": 0.464696040293888, + "learning_rate": 7.637830894771175e-05, + "loss": 0.6875, + "step": 1104 + }, + { + "epoch": 0.5893333333333334, + "grad_norm": 0.4586068794220114, + "learning_rate": 7.6210429741257e-05, + "loss": 0.6837, + "step": 1105 + }, + { + "epoch": 0.5898666666666667, + "grad_norm": 0.41551188145094703, + "learning_rate": 7.604262157407007e-05, + "loss": 0.6586, + "step": 1106 + }, + { + "epoch": 0.5904, + "grad_norm": 0.403407353589169, + "learning_rate": 7.587488494725157e-05, + "loss": 0.7105, + "step": 1107 + }, + { + "epoch": 0.5909333333333333, + "grad_norm": 0.4209550504605238, + "learning_rate": 7.570722036168854e-05, + "loss": 0.627, + "step": 1108 + }, + { + "epoch": 0.5914666666666667, + "grad_norm": 0.4025562030361292, + "learning_rate": 7.55396283180529e-05, + "loss": 0.6829, + "step": 1109 + }, + { + "epoch": 0.592, + "grad_norm": 0.4115361755718086, + "learning_rate": 7.537210931679987e-05, + "loss": 0.6782, + "step": 1110 + }, + { + "epoch": 0.5925333333333334, + "grad_norm": 0.46293428949464527, + "learning_rate": 7.520466385816671e-05, + "loss": 0.711, + "step": 1111 + }, + { + "epoch": 0.5930666666666666, + "grad_norm": 0.4525500352174917, + "learning_rate": 7.503729244217086e-05, + "loss": 0.7214, + "step": 1112 + }, + { + "epoch": 0.5936, + "grad_norm": 0.39438228243314793, + "learning_rate": 7.48699955686089e-05, + "loss": 0.6468, + "step": 1113 + }, + { + "epoch": 0.5941333333333333, + "grad_norm": 0.49263504033528177, + "learning_rate": 7.470277373705461e-05, + "loss": 0.6148, + "step": 1114 + }, + { + "epoch": 0.5946666666666667, + "grad_norm": 0.39727946685820814, + "learning_rate": 7.453562744685778e-05, + "loss": 0.655, + "step": 1115 + }, + { + "epoch": 0.5952, + "grad_norm": 0.4596553941643732, + "learning_rate": 7.43685571971426e-05, + "loss": 0.7272, + "step": 1116 + }, + { + "epoch": 0.5957333333333333, + "grad_norm": 0.4282382813081575, + "learning_rate": 7.42015634868062e-05, + "loss": 0.6842, + "step": 1117 + }, + { + "epoch": 0.5962666666666666, + "grad_norm": 0.37547710323647004, + "learning_rate": 7.403464681451715e-05, + "loss": 0.6973, + "step": 1118 + }, + { + "epoch": 0.5968, + "grad_norm": 0.4966657023913872, + "learning_rate": 7.386780767871397e-05, + "loss": 0.654, + "step": 1119 + }, + { + "epoch": 0.5973333333333334, + "grad_norm": 0.44782442402270567, + "learning_rate": 7.370104657760361e-05, + "loss": 0.6642, + "step": 1120 + }, + { + "epoch": 0.5978666666666667, + "grad_norm": 0.4381324912137808, + "learning_rate": 7.353436400916004e-05, + "loss": 0.7392, + "step": 1121 + }, + { + "epoch": 0.5984, + "grad_norm": 0.4254768671365201, + "learning_rate": 7.336776047112276e-05, + "loss": 0.6449, + "step": 1122 + }, + { + "epoch": 0.5989333333333333, + "grad_norm": 0.4424722561990435, + "learning_rate": 7.320123646099519e-05, + "loss": 0.6821, + "step": 1123 + }, + { + "epoch": 0.5994666666666667, + "grad_norm": 0.41725615187438503, + "learning_rate": 7.303479247604332e-05, + "loss": 0.7084, + "step": 1124 + }, + { + "epoch": 0.6, + "grad_norm": 0.48119746299765054, + "learning_rate": 7.286842901329412e-05, + "loss": 0.742, + "step": 1125 + }, + { + "epoch": 0.6005333333333334, + "grad_norm": 0.5834048082380597, + "learning_rate": 7.270214656953415e-05, + "loss": 0.739, + "step": 1126 + }, + { + "epoch": 0.6010666666666666, + "grad_norm": 0.45966539280766744, + "learning_rate": 7.253594564130804e-05, + "loss": 0.7358, + "step": 1127 + }, + { + "epoch": 0.6016, + "grad_norm": 0.5246437083991626, + "learning_rate": 7.236982672491698e-05, + "loss": 0.6915, + "step": 1128 + }, + { + "epoch": 0.6021333333333333, + "grad_norm": 0.4555261618321849, + "learning_rate": 7.22037903164173e-05, + "loss": 0.6493, + "step": 1129 + }, + { + "epoch": 0.6026666666666667, + "grad_norm": 0.4235653668589521, + "learning_rate": 7.203783691161883e-05, + "loss": 0.7031, + "step": 1130 + }, + { + "epoch": 0.6032, + "grad_norm": 0.4474416245759516, + "learning_rate": 7.187196700608373e-05, + "loss": 0.6918, + "step": 1131 + }, + { + "epoch": 0.6037333333333333, + "grad_norm": 0.48694069827586794, + "learning_rate": 7.170618109512465e-05, + "loss": 0.7333, + "step": 1132 + }, + { + "epoch": 0.6042666666666666, + "grad_norm": 0.4376511170189795, + "learning_rate": 7.154047967380354e-05, + "loss": 0.7553, + "step": 1133 + }, + { + "epoch": 0.6048, + "grad_norm": 0.43980114940399667, + "learning_rate": 7.137486323692995e-05, + "loss": 0.733, + "step": 1134 + }, + { + "epoch": 0.6053333333333333, + "grad_norm": 0.4540225086137344, + "learning_rate": 7.12093322790597e-05, + "loss": 0.6927, + "step": 1135 + }, + { + "epoch": 0.6058666666666667, + "grad_norm": 0.4028754666797807, + "learning_rate": 7.104388729449338e-05, + "loss": 0.6917, + "step": 1136 + }, + { + "epoch": 0.6064, + "grad_norm": 0.5450526119568553, + "learning_rate": 7.087852877727481e-05, + "loss": 0.7245, + "step": 1137 + }, + { + "epoch": 0.6069333333333333, + "grad_norm": 0.40295745572301156, + "learning_rate": 7.071325722118963e-05, + "loss": 0.6052, + "step": 1138 + }, + { + "epoch": 0.6074666666666667, + "grad_norm": 0.4187400391892376, + "learning_rate": 7.054807311976379e-05, + "loss": 0.6958, + "step": 1139 + }, + { + "epoch": 0.608, + "grad_norm": 0.407667069869259, + "learning_rate": 7.038297696626206e-05, + "loss": 0.6963, + "step": 1140 + }, + { + "epoch": 0.6085333333333334, + "grad_norm": 0.4178318695815154, + "learning_rate": 7.021796925368667e-05, + "loss": 0.6729, + "step": 1141 + }, + { + "epoch": 0.6090666666666666, + "grad_norm": 0.4162537281857959, + "learning_rate": 7.005305047477566e-05, + "loss": 0.7507, + "step": 1142 + }, + { + "epoch": 0.6096, + "grad_norm": 0.3926901260513355, + "learning_rate": 6.988822112200156e-05, + "loss": 0.6546, + "step": 1143 + }, + { + "epoch": 0.6101333333333333, + "grad_norm": 0.48529116831628166, + "learning_rate": 6.972348168756983e-05, + "loss": 0.6648, + "step": 1144 + }, + { + "epoch": 0.6106666666666667, + "grad_norm": 0.45305284197698503, + "learning_rate": 6.955883266341741e-05, + "loss": 0.6826, + "step": 1145 + }, + { + "epoch": 0.6112, + "grad_norm": 0.4561702027539845, + "learning_rate": 6.939427454121128e-05, + "loss": 0.6977, + "step": 1146 + }, + { + "epoch": 0.6117333333333334, + "grad_norm": 0.43187343070490947, + "learning_rate": 6.922980781234699e-05, + "loss": 0.7343, + "step": 1147 + }, + { + "epoch": 0.6122666666666666, + "grad_norm": 0.5301500087261996, + "learning_rate": 6.906543296794714e-05, + "loss": 0.6863, + "step": 1148 + }, + { + "epoch": 0.6128, + "grad_norm": 0.4747256643705912, + "learning_rate": 6.890115049885994e-05, + "loss": 0.6237, + "step": 1149 + }, + { + "epoch": 0.6133333333333333, + "grad_norm": 0.4709266319943518, + "learning_rate": 6.873696089565786e-05, + "loss": 0.63, + "step": 1150 + }, + { + "epoch": 0.6138666666666667, + "grad_norm": 0.3974957399383896, + "learning_rate": 6.85728646486359e-05, + "loss": 0.6546, + "step": 1151 + }, + { + "epoch": 0.6144, + "grad_norm": 0.503966249187983, + "learning_rate": 6.84088622478104e-05, + "loss": 0.7036, + "step": 1152 + }, + { + "epoch": 0.6149333333333333, + "grad_norm": 0.41288684339233356, + "learning_rate": 6.82449541829174e-05, + "loss": 0.6381, + "step": 1153 + }, + { + "epoch": 0.6154666666666667, + "grad_norm": 0.46913523549298286, + "learning_rate": 6.80811409434113e-05, + "loss": 0.6982, + "step": 1154 + }, + { + "epoch": 0.616, + "grad_norm": 0.4752171520813708, + "learning_rate": 6.791742301846326e-05, + "loss": 0.6727, + "step": 1155 + }, + { + "epoch": 0.6165333333333334, + "grad_norm": 0.4943113707571559, + "learning_rate": 6.775380089695986e-05, + "loss": 0.7352, + "step": 1156 + }, + { + "epoch": 0.6170666666666667, + "grad_norm": 0.4264027372481861, + "learning_rate": 6.759027506750158e-05, + "loss": 0.7045, + "step": 1157 + }, + { + "epoch": 0.6176, + "grad_norm": 0.7235056153582596, + "learning_rate": 6.742684601840141e-05, + "loss": 0.7683, + "step": 1158 + }, + { + "epoch": 0.6181333333333333, + "grad_norm": 0.563107580693706, + "learning_rate": 6.726351423768322e-05, + "loss": 0.7123, + "step": 1159 + }, + { + "epoch": 0.6186666666666667, + "grad_norm": 0.40194345871372006, + "learning_rate": 6.710028021308061e-05, + "loss": 0.6651, + "step": 1160 + }, + { + "epoch": 0.6192, + "grad_norm": 0.4202305543838757, + "learning_rate": 6.693714443203507e-05, + "loss": 0.6785, + "step": 1161 + }, + { + "epoch": 0.6197333333333334, + "grad_norm": 0.46577617049452436, + "learning_rate": 6.677410738169485e-05, + "loss": 0.7267, + "step": 1162 + }, + { + "epoch": 0.6202666666666666, + "grad_norm": 0.4723083390327057, + "learning_rate": 6.661116954891328e-05, + "loss": 0.6962, + "step": 1163 + }, + { + "epoch": 0.6208, + "grad_norm": 0.39044608177447104, + "learning_rate": 6.644833142024751e-05, + "loss": 0.6706, + "step": 1164 + }, + { + "epoch": 0.6213333333333333, + "grad_norm": 0.3892753596936745, + "learning_rate": 6.62855934819569e-05, + "loss": 0.6718, + "step": 1165 + }, + { + "epoch": 0.6218666666666667, + "grad_norm": 0.4701937200996348, + "learning_rate": 6.612295622000162e-05, + "loss": 0.6883, + "step": 1166 + }, + { + "epoch": 0.6224, + "grad_norm": 0.35726664334890657, + "learning_rate": 6.59604201200412e-05, + "loss": 0.6548, + "step": 1167 + }, + { + "epoch": 0.6229333333333333, + "grad_norm": 0.4090789779534833, + "learning_rate": 6.579798566743314e-05, + "loss": 0.6547, + "step": 1168 + }, + { + "epoch": 0.6234666666666666, + "grad_norm": 0.3990999408536816, + "learning_rate": 6.563565334723134e-05, + "loss": 0.6635, + "step": 1169 + }, + { + "epoch": 0.624, + "grad_norm": 0.4363843309324732, + "learning_rate": 6.547342364418481e-05, + "loss": 0.7254, + "step": 1170 + }, + { + "epoch": 0.6245333333333334, + "grad_norm": 0.5106673101572715, + "learning_rate": 6.531129704273604e-05, + "loss": 0.7341, + "step": 1171 + }, + { + "epoch": 0.6250666666666667, + "grad_norm": 0.392666217890261, + "learning_rate": 6.514927402701964e-05, + "loss": 0.6688, + "step": 1172 + }, + { + "epoch": 0.6256, + "grad_norm": 0.48809746540697774, + "learning_rate": 6.498735508086093e-05, + "loss": 0.7258, + "step": 1173 + }, + { + "epoch": 0.6261333333333333, + "grad_norm": 0.41397786052470315, + "learning_rate": 6.48255406877745e-05, + "loss": 0.6374, + "step": 1174 + }, + { + "epoch": 0.6266666666666667, + "grad_norm": 0.37907725401780545, + "learning_rate": 6.466383133096267e-05, + "loss": 0.6633, + "step": 1175 + }, + { + "epoch": 0.6272, + "grad_norm": 0.3774594167209582, + "learning_rate": 6.450222749331414e-05, + "loss": 0.6541, + "step": 1176 + }, + { + "epoch": 0.6277333333333334, + "grad_norm": 0.4890042336911856, + "learning_rate": 6.434072965740242e-05, + "loss": 0.6494, + "step": 1177 + }, + { + "epoch": 0.6282666666666666, + "grad_norm": 0.39851877805950653, + "learning_rate": 6.417933830548467e-05, + "loss": 0.7063, + "step": 1178 + }, + { + "epoch": 0.6288, + "grad_norm": 0.329191610211986, + "learning_rate": 6.40180539194999e-05, + "loss": 0.6097, + "step": 1179 + }, + { + "epoch": 0.6293333333333333, + "grad_norm": 0.4903775189635333, + "learning_rate": 6.385687698106781e-05, + "loss": 0.7756, + "step": 1180 + }, + { + "epoch": 0.6298666666666667, + "grad_norm": 0.4962745895251398, + "learning_rate": 6.369580797148718e-05, + "loss": 0.697, + "step": 1181 + }, + { + "epoch": 0.6304, + "grad_norm": 0.4215012367902803, + "learning_rate": 6.35348473717345e-05, + "loss": 0.6696, + "step": 1182 + }, + { + "epoch": 0.6309333333333333, + "grad_norm": 0.5602802699887822, + "learning_rate": 6.337399566246257e-05, + "loss": 0.7679, + "step": 1183 + }, + { + "epoch": 0.6314666666666666, + "grad_norm": 0.42503625863823996, + "learning_rate": 6.321325332399903e-05, + "loss": 0.6987, + "step": 1184 + }, + { + "epoch": 0.632, + "grad_norm": 0.4557339264919862, + "learning_rate": 6.305262083634488e-05, + "loss": 0.7613, + "step": 1185 + }, + { + "epoch": 0.6325333333333333, + "grad_norm": 0.3894184456838005, + "learning_rate": 6.289209867917312e-05, + "loss": 0.6568, + "step": 1186 + }, + { + "epoch": 0.6330666666666667, + "grad_norm": 0.43330359333084534, + "learning_rate": 6.273168733182722e-05, + "loss": 0.7066, + "step": 1187 + }, + { + "epoch": 0.6336, + "grad_norm": 0.44576363394684887, + "learning_rate": 6.25713872733199e-05, + "loss": 0.7179, + "step": 1188 + }, + { + "epoch": 0.6341333333333333, + "grad_norm": 0.4145433388543684, + "learning_rate": 6.241119898233144e-05, + "loss": 0.7144, + "step": 1189 + }, + { + "epoch": 0.6346666666666667, + "grad_norm": 0.45566017580548274, + "learning_rate": 6.225112293720836e-05, + "loss": 0.7061, + "step": 1190 + }, + { + "epoch": 0.6352, + "grad_norm": 0.3949098601987805, + "learning_rate": 6.209115961596208e-05, + "loss": 0.6876, + "step": 1191 + }, + { + "epoch": 0.6357333333333334, + "grad_norm": 0.6521678312516719, + "learning_rate": 6.19313094962673e-05, + "loss": 0.6718, + "step": 1192 + }, + { + "epoch": 0.6362666666666666, + "grad_norm": 0.4331369024736756, + "learning_rate": 6.177157305546078e-05, + "loss": 0.6468, + "step": 1193 + }, + { + "epoch": 0.6368, + "grad_norm": 0.47666236297639186, + "learning_rate": 6.161195077053976e-05, + "loss": 0.746, + "step": 1194 + }, + { + "epoch": 0.6373333333333333, + "grad_norm": 0.44631760948927673, + "learning_rate": 6.145244311816063e-05, + "loss": 0.7215, + "step": 1195 + }, + { + "epoch": 0.6378666666666667, + "grad_norm": 0.40879940519529123, + "learning_rate": 6.129305057463741e-05, + "loss": 0.622, + "step": 1196 + }, + { + "epoch": 0.6384, + "grad_norm": 0.46886407286861076, + "learning_rate": 6.113377361594049e-05, + "loss": 0.7576, + "step": 1197 + }, + { + "epoch": 0.6389333333333334, + "grad_norm": 0.3948554102536063, + "learning_rate": 6.0974612717695004e-05, + "loss": 0.6663, + "step": 1198 + }, + { + "epoch": 0.6394666666666666, + "grad_norm": 0.4259830535046582, + "learning_rate": 6.0815568355179556e-05, + "loss": 0.7286, + "step": 1199 + }, + { + "epoch": 0.64, + "grad_norm": 0.44144107124314336, + "learning_rate": 6.065664100332478e-05, + "loss": 0.7311, + "step": 1200 + }, + { + "epoch": 0.6405333333333333, + "grad_norm": 0.543239191674462, + "learning_rate": 6.0497831136711836e-05, + "loss": 0.7578, + "step": 1201 + }, + { + "epoch": 0.6410666666666667, + "grad_norm": 0.4872870664724931, + "learning_rate": 6.0339139229571116e-05, + "loss": 0.6416, + "step": 1202 + }, + { + "epoch": 0.6416, + "grad_norm": 0.4214827835586575, + "learning_rate": 6.018056575578075e-05, + "loss": 0.6882, + "step": 1203 + }, + { + "epoch": 0.6421333333333333, + "grad_norm": 0.7102502410108671, + "learning_rate": 6.002211118886514e-05, + "loss": 0.7106, + "step": 1204 + }, + { + "epoch": 0.6426666666666667, + "grad_norm": 0.42612395704378414, + "learning_rate": 5.986377600199371e-05, + "loss": 0.6712, + "step": 1205 + }, + { + "epoch": 0.6432, + "grad_norm": 0.4112579344710362, + "learning_rate": 5.970556066797941e-05, + "loss": 0.6264, + "step": 1206 + }, + { + "epoch": 0.6437333333333334, + "grad_norm": 0.3992990179883584, + "learning_rate": 5.9547465659277215e-05, + "loss": 0.6441, + "step": 1207 + }, + { + "epoch": 0.6442666666666667, + "grad_norm": 0.49175577071302734, + "learning_rate": 5.938949144798279e-05, + "loss": 0.7145, + "step": 1208 + }, + { + "epoch": 0.6448, + "grad_norm": 0.38660563173633494, + "learning_rate": 5.923163850583113e-05, + "loss": 0.6236, + "step": 1209 + }, + { + "epoch": 0.6453333333333333, + "grad_norm": 0.4825247592155358, + "learning_rate": 5.907390730419507e-05, + "loss": 0.6602, + "step": 1210 + }, + { + "epoch": 0.6458666666666667, + "grad_norm": 0.49627838405883506, + "learning_rate": 5.8916298314083915e-05, + "loss": 0.7141, + "step": 1211 + }, + { + "epoch": 0.6464, + "grad_norm": 0.38542797334088236, + "learning_rate": 5.875881200614207e-05, + "loss": 0.6469, + "step": 1212 + }, + { + "epoch": 0.6469333333333334, + "grad_norm": 0.4036454395837023, + "learning_rate": 5.860144885064751e-05, + "loss": 0.6091, + "step": 1213 + }, + { + "epoch": 0.6474666666666666, + "grad_norm": 0.3999331565255694, + "learning_rate": 5.8444209317510514e-05, + "loss": 0.6556, + "step": 1214 + }, + { + "epoch": 0.648, + "grad_norm": 0.4047271951245858, + "learning_rate": 5.828709387627218e-05, + "loss": 0.6822, + "step": 1215 + }, + { + "epoch": 0.6485333333333333, + "grad_norm": 0.4638043930440745, + "learning_rate": 5.813010299610313e-05, + "loss": 0.7976, + "step": 1216 + }, + { + "epoch": 0.6490666666666667, + "grad_norm": 0.4140751557173669, + "learning_rate": 5.797323714580192e-05, + "loss": 0.6654, + "step": 1217 + }, + { + "epoch": 0.6496, + "grad_norm": 0.40175278138010456, + "learning_rate": 5.781649679379378e-05, + "loss": 0.6837, + "step": 1218 + }, + { + "epoch": 0.6501333333333333, + "grad_norm": 0.448361264712076, + "learning_rate": 5.765988240812921e-05, + "loss": 0.7054, + "step": 1219 + }, + { + "epoch": 0.6506666666666666, + "grad_norm": 0.4903001942235706, + "learning_rate": 5.750339445648252e-05, + "loss": 0.7065, + "step": 1220 + }, + { + "epoch": 0.6512, + "grad_norm": 0.49209525819268396, + "learning_rate": 5.73470334061505e-05, + "loss": 0.6848, + "step": 1221 + }, + { + "epoch": 0.6517333333333334, + "grad_norm": 0.40232475033847775, + "learning_rate": 5.7190799724050924e-05, + "loss": 0.6638, + "step": 1222 + }, + { + "epoch": 0.6522666666666667, + "grad_norm": 0.42607086576536785, + "learning_rate": 5.7034693876721376e-05, + "loss": 0.7159, + "step": 1223 + }, + { + "epoch": 0.6528, + "grad_norm": 0.487693420824358, + "learning_rate": 5.687871633031754e-05, + "loss": 0.7019, + "step": 1224 + }, + { + "epoch": 0.6533333333333333, + "grad_norm": 0.4213757880177138, + "learning_rate": 5.6722867550612116e-05, + "loss": 0.6985, + "step": 1225 + }, + { + "epoch": 0.6538666666666667, + "grad_norm": 0.42890405014839217, + "learning_rate": 5.6567148002993164e-05, + "loss": 0.6592, + "step": 1226 + }, + { + "epoch": 0.6544, + "grad_norm": 0.4759282538023443, + "learning_rate": 5.6411558152462894e-05, + "loss": 0.6273, + "step": 1227 + }, + { + "epoch": 0.6549333333333334, + "grad_norm": 0.45053499198596025, + "learning_rate": 5.625609846363622e-05, + "loss": 0.6976, + "step": 1228 + }, + { + "epoch": 0.6554666666666666, + "grad_norm": 0.43882677659957103, + "learning_rate": 5.6100769400739383e-05, + "loss": 0.6876, + "step": 1229 + }, + { + "epoch": 0.656, + "grad_norm": 0.7611731610544257, + "learning_rate": 5.5945571427608526e-05, + "loss": 0.7676, + "step": 1230 + }, + { + "epoch": 0.6565333333333333, + "grad_norm": 0.41985260340167807, + "learning_rate": 5.579050500768836e-05, + "loss": 0.6873, + "step": 1231 + }, + { + "epoch": 0.6570666666666667, + "grad_norm": 0.642664100308076, + "learning_rate": 5.5635570604030705e-05, + "loss": 0.7394, + "step": 1232 + }, + { + "epoch": 0.6576, + "grad_norm": 0.4490334732239554, + "learning_rate": 5.54807686792933e-05, + "loss": 0.736, + "step": 1233 + }, + { + "epoch": 0.6581333333333333, + "grad_norm": 0.6671440485736441, + "learning_rate": 5.53260996957381e-05, + "loss": 0.7755, + "step": 1234 + }, + { + "epoch": 0.6586666666666666, + "grad_norm": 0.4648759376811008, + "learning_rate": 5.5171564115230254e-05, + "loss": 0.6835, + "step": 1235 + }, + { + "epoch": 0.6592, + "grad_norm": 0.39704081725262047, + "learning_rate": 5.501716239923642e-05, + "loss": 0.6712, + "step": 1236 + }, + { + "epoch": 0.6597333333333333, + "grad_norm": 0.4341131618860968, + "learning_rate": 5.486289500882355e-05, + "loss": 0.637, + "step": 1237 + }, + { + "epoch": 0.6602666666666667, + "grad_norm": 0.5132419370155383, + "learning_rate": 5.47087624046575e-05, + "loss": 0.6802, + "step": 1238 + }, + { + "epoch": 0.6608, + "grad_norm": 0.3970131149411168, + "learning_rate": 5.4554765047001613e-05, + "loss": 0.6439, + "step": 1239 + }, + { + "epoch": 0.6613333333333333, + "grad_norm": 0.5563796139787245, + "learning_rate": 5.4400903395715366e-05, + "loss": 0.7427, + "step": 1240 + }, + { + "epoch": 0.6618666666666667, + "grad_norm": 0.4727356660983191, + "learning_rate": 5.424717791025302e-05, + "loss": 0.7454, + "step": 1241 + }, + { + "epoch": 0.6624, + "grad_norm": 0.4678442552896839, + "learning_rate": 5.4093589049662175e-05, + "loss": 0.5738, + "step": 1242 + }, + { + "epoch": 0.6629333333333334, + "grad_norm": 0.49860848199843866, + "learning_rate": 5.394013727258254e-05, + "loss": 0.7387, + "step": 1243 + }, + { + "epoch": 0.6634666666666666, + "grad_norm": 0.45505539284147367, + "learning_rate": 5.378682303724435e-05, + "loss": 0.6501, + "step": 1244 + }, + { + "epoch": 0.664, + "grad_norm": 0.40299107203734746, + "learning_rate": 5.363364680146725e-05, + "loss": 0.6747, + "step": 1245 + }, + { + "epoch": 0.6645333333333333, + "grad_norm": 0.3893315999426221, + "learning_rate": 5.348060902265871e-05, + "loss": 0.6468, + "step": 1246 + }, + { + "epoch": 0.6650666666666667, + "grad_norm": 0.4146645361836117, + "learning_rate": 5.332771015781275e-05, + "loss": 0.6628, + "step": 1247 + }, + { + "epoch": 0.6656, + "grad_norm": 0.39969819955401586, + "learning_rate": 5.31749506635086e-05, + "loss": 0.6847, + "step": 1248 + }, + { + "epoch": 0.6661333333333334, + "grad_norm": 0.5499794887435745, + "learning_rate": 5.302233099590928e-05, + "loss": 0.698, + "step": 1249 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.48647645191739924, + "learning_rate": 5.286985161076029e-05, + "loss": 0.6309, + "step": 1250 + }, + { + "epoch": 0.6672, + "grad_norm": 0.4285453792077148, + "learning_rate": 5.271751296338823e-05, + "loss": 0.6534, + "step": 1251 + }, + { + "epoch": 0.6677333333333333, + "grad_norm": 0.4324294114731592, + "learning_rate": 5.2565315508699376e-05, + "loss": 0.6817, + "step": 1252 + }, + { + "epoch": 0.6682666666666667, + "grad_norm": 0.46608044171446206, + "learning_rate": 5.2413259701178505e-05, + "loss": 0.694, + "step": 1253 + }, + { + "epoch": 0.6688, + "grad_norm": 0.4073665107443134, + "learning_rate": 5.226134599488728e-05, + "loss": 0.6852, + "step": 1254 + }, + { + "epoch": 0.6693333333333333, + "grad_norm": 0.45869252338084976, + "learning_rate": 5.210957484346314e-05, + "loss": 0.6808, + "step": 1255 + }, + { + "epoch": 0.6698666666666667, + "grad_norm": 0.5828362701029322, + "learning_rate": 5.195794670011776e-05, + "loss": 0.7766, + "step": 1256 + }, + { + "epoch": 0.6704, + "grad_norm": 0.4269296331948093, + "learning_rate": 5.180646201763577e-05, + "loss": 0.7202, + "step": 1257 + }, + { + "epoch": 0.6709333333333334, + "grad_norm": 0.4519904640620554, + "learning_rate": 5.165512124837344e-05, + "loss": 0.7003, + "step": 1258 + }, + { + "epoch": 0.6714666666666667, + "grad_norm": 0.6444979308466544, + "learning_rate": 5.150392484425728e-05, + "loss": 0.6902, + "step": 1259 + }, + { + "epoch": 0.672, + "grad_norm": 0.4699420609867648, + "learning_rate": 5.135287325678271e-05, + "loss": 0.7015, + "step": 1260 + }, + { + "epoch": 0.6725333333333333, + "grad_norm": 0.5022268277610995, + "learning_rate": 5.120196693701267e-05, + "loss": 0.7158, + "step": 1261 + }, + { + "epoch": 0.6730666666666667, + "grad_norm": 0.46082265208514983, + "learning_rate": 5.105120633557634e-05, + "loss": 0.6412, + "step": 1262 + }, + { + "epoch": 0.6736, + "grad_norm": 0.5103161933623592, + "learning_rate": 5.090059190266779e-05, + "loss": 0.7312, + "step": 1263 + }, + { + "epoch": 0.6741333333333334, + "grad_norm": 0.5147347915351437, + "learning_rate": 5.075012408804458e-05, + "loss": 0.7794, + "step": 1264 + }, + { + "epoch": 0.6746666666666666, + "grad_norm": 0.5421872091812179, + "learning_rate": 5.059980334102637e-05, + "loss": 0.668, + "step": 1265 + }, + { + "epoch": 0.6752, + "grad_norm": 0.46411215498995456, + "learning_rate": 5.0449630110493836e-05, + "loss": 0.7255, + "step": 1266 + }, + { + "epoch": 0.6757333333333333, + "grad_norm": 0.3887518797785626, + "learning_rate": 5.0299604844886985e-05, + "loss": 0.6083, + "step": 1267 + }, + { + "epoch": 0.6762666666666667, + "grad_norm": 0.45110254673438077, + "learning_rate": 5.014972799220403e-05, + "loss": 0.7377, + "step": 1268 + }, + { + "epoch": 0.6768, + "grad_norm": 0.3958967453111445, + "learning_rate": 5.000000000000002e-05, + "loss": 0.6958, + "step": 1269 + }, + { + "epoch": 0.6773333333333333, + "grad_norm": 0.423884885430956, + "learning_rate": 4.985042131538545e-05, + "loss": 0.7114, + "step": 1270 + }, + { + "epoch": 0.6778666666666666, + "grad_norm": 0.4146595568379947, + "learning_rate": 4.9700992385024934e-05, + "loss": 0.7451, + "step": 1271 + }, + { + "epoch": 0.6784, + "grad_norm": 0.4220457869659306, + "learning_rate": 4.955171365513603e-05, + "loss": 0.6996, + "step": 1272 + }, + { + "epoch": 0.6789333333333334, + "grad_norm": 0.3810707534390687, + "learning_rate": 4.940258557148765e-05, + "loss": 0.6529, + "step": 1273 + }, + { + "epoch": 0.6794666666666667, + "grad_norm": 0.3810992708104994, + "learning_rate": 4.9253608579398855e-05, + "loss": 0.632, + "step": 1274 + }, + { + "epoch": 0.68, + "grad_norm": 0.4637435578141067, + "learning_rate": 4.9104783123737566e-05, + "loss": 0.7968, + "step": 1275 + }, + { + "epoch": 0.6805333333333333, + "grad_norm": 0.4832133631853069, + "learning_rate": 4.895610964891923e-05, + "loss": 0.7038, + "step": 1276 + }, + { + "epoch": 0.6810666666666667, + "grad_norm": 0.36830718999500317, + "learning_rate": 4.880758859890536e-05, + "loss": 0.679, + "step": 1277 + }, + { + "epoch": 0.6816, + "grad_norm": 0.41448228827687683, + "learning_rate": 4.865922041720239e-05, + "loss": 0.6287, + "step": 1278 + }, + { + "epoch": 0.6821333333333334, + "grad_norm": 0.4261682036368644, + "learning_rate": 4.851100554686021e-05, + "loss": 0.7054, + "step": 1279 + }, + { + "epoch": 0.6826666666666666, + "grad_norm": 0.39173878941998846, + "learning_rate": 4.836294443047088e-05, + "loss": 0.6892, + "step": 1280 + }, + { + "epoch": 0.6832, + "grad_norm": 0.4612650363372707, + "learning_rate": 4.821503751016746e-05, + "loss": 0.7067, + "step": 1281 + }, + { + "epoch": 0.6837333333333333, + "grad_norm": 0.5415273429390662, + "learning_rate": 4.8067285227622404e-05, + "loss": 0.7567, + "step": 1282 + }, + { + "epoch": 0.6842666666666667, + "grad_norm": 0.42985550421095275, + "learning_rate": 4.791968802404648e-05, + "loss": 0.6789, + "step": 1283 + }, + { + "epoch": 0.6848, + "grad_norm": 0.4514876328128349, + "learning_rate": 4.777224634018732e-05, + "loss": 0.7334, + "step": 1284 + }, + { + "epoch": 0.6853333333333333, + "grad_norm": 0.48622030317888465, + "learning_rate": 4.762496061632814e-05, + "loss": 0.6965, + "step": 1285 + }, + { + "epoch": 0.6858666666666666, + "grad_norm": 0.48515398417323474, + "learning_rate": 4.747783129228656e-05, + "loss": 0.649, + "step": 1286 + }, + { + "epoch": 0.6864, + "grad_norm": 0.42337213509437976, + "learning_rate": 4.733085880741301e-05, + "loss": 0.5853, + "step": 1287 + }, + { + "epoch": 0.6869333333333333, + "grad_norm": 0.46739335531326504, + "learning_rate": 4.718404360058966e-05, + "loss": 0.6734, + "step": 1288 + }, + { + "epoch": 0.6874666666666667, + "grad_norm": 0.43779643398988793, + "learning_rate": 4.7037386110228985e-05, + "loss": 0.6782, + "step": 1289 + }, + { + "epoch": 0.688, + "grad_norm": 0.4600479763327274, + "learning_rate": 4.689088677427249e-05, + "loss": 0.6969, + "step": 1290 + }, + { + "epoch": 0.6885333333333333, + "grad_norm": 0.39366032769643583, + "learning_rate": 4.6744546030189486e-05, + "loss": 0.653, + "step": 1291 + }, + { + "epoch": 0.6890666666666667, + "grad_norm": 0.37558272896056105, + "learning_rate": 4.659836431497563e-05, + "loss": 0.6196, + "step": 1292 + }, + { + "epoch": 0.6896, + "grad_norm": 0.37272200377118736, + "learning_rate": 4.645234206515171e-05, + "loss": 0.6228, + "step": 1293 + }, + { + "epoch": 0.6901333333333334, + "grad_norm": 0.42945206411674275, + "learning_rate": 4.630647971676232e-05, + "loss": 0.6268, + "step": 1294 + }, + { + "epoch": 0.6906666666666667, + "grad_norm": 0.6166029413629494, + "learning_rate": 4.6160777705374524e-05, + "loss": 0.7235, + "step": 1295 + }, + { + "epoch": 0.6912, + "grad_norm": 0.42629619211839953, + "learning_rate": 4.6015236466076747e-05, + "loss": 0.7182, + "step": 1296 + }, + { + "epoch": 0.6917333333333333, + "grad_norm": 0.3671463855501407, + "learning_rate": 4.586985643347717e-05, + "loss": 0.5879, + "step": 1297 + }, + { + "epoch": 0.6922666666666667, + "grad_norm": 0.4094433707586426, + "learning_rate": 4.572463804170263e-05, + "loss": 0.6218, + "step": 1298 + }, + { + "epoch": 0.6928, + "grad_norm": 0.4157481878534906, + "learning_rate": 4.5579581724397255e-05, + "loss": 0.618, + "step": 1299 + }, + { + "epoch": 0.6933333333333334, + "grad_norm": 0.4274197173120849, + "learning_rate": 4.543468791472131e-05, + "loss": 0.669, + "step": 1300 + }, + { + "epoch": 0.6938666666666666, + "grad_norm": 0.3970755103030679, + "learning_rate": 4.5289957045349653e-05, + "loss": 0.6384, + "step": 1301 + }, + { + "epoch": 0.6944, + "grad_norm": 0.45084601233135135, + "learning_rate": 4.514538954847064e-05, + "loss": 0.7519, + "step": 1302 + }, + { + "epoch": 0.6949333333333333, + "grad_norm": 0.40924399535224726, + "learning_rate": 4.5000985855784746e-05, + "loss": 0.6525, + "step": 1303 + }, + { + "epoch": 0.6954666666666667, + "grad_norm": 0.5987694684687338, + "learning_rate": 4.485674639850333e-05, + "loss": 0.6564, + "step": 1304 + }, + { + "epoch": 0.696, + "grad_norm": 0.41156457909001926, + "learning_rate": 4.471267160734731e-05, + "loss": 0.6745, + "step": 1305 + }, + { + "epoch": 0.6965333333333333, + "grad_norm": 0.5675373666675694, + "learning_rate": 4.456876191254582e-05, + "loss": 0.7554, + "step": 1306 + }, + { + "epoch": 0.6970666666666666, + "grad_norm": 0.4530565326014445, + "learning_rate": 4.442501774383515e-05, + "loss": 0.7656, + "step": 1307 + }, + { + "epoch": 0.6976, + "grad_norm": 0.43455739336857785, + "learning_rate": 4.428143953045717e-05, + "loss": 0.7639, + "step": 1308 + }, + { + "epoch": 0.6981333333333334, + "grad_norm": 0.4625805066144766, + "learning_rate": 4.413802770115816e-05, + "loss": 0.7301, + "step": 1309 + }, + { + "epoch": 0.6986666666666667, + "grad_norm": 0.4687573326755966, + "learning_rate": 4.399478268418771e-05, + "loss": 0.6784, + "step": 1310 + }, + { + "epoch": 0.6992, + "grad_norm": 0.5400521150678994, + "learning_rate": 4.385170490729712e-05, + "loss": 0.6784, + "step": 1311 + }, + { + "epoch": 0.6997333333333333, + "grad_norm": 0.37891632845292245, + "learning_rate": 4.3708794797738375e-05, + "loss": 0.6627, + "step": 1312 + }, + { + "epoch": 0.7002666666666667, + "grad_norm": 0.4485527940764225, + "learning_rate": 4.3566052782262735e-05, + "loss": 0.6813, + "step": 1313 + }, + { + "epoch": 0.7008, + "grad_norm": 0.4540503467371925, + "learning_rate": 4.342347928711953e-05, + "loss": 0.6693, + "step": 1314 + }, + { + "epoch": 0.7013333333333334, + "grad_norm": 0.4865435283342518, + "learning_rate": 4.328107473805487e-05, + "loss": 0.715, + "step": 1315 + }, + { + "epoch": 0.7018666666666666, + "grad_norm": 0.43575127743652914, + "learning_rate": 4.3138839560310303e-05, + "loss": 0.6665, + "step": 1316 + }, + { + "epoch": 0.7024, + "grad_norm": 0.43019359102950366, + "learning_rate": 4.2996774178621736e-05, + "loss": 0.6586, + "step": 1317 + }, + { + "epoch": 0.7029333333333333, + "grad_norm": 0.5113079554729141, + "learning_rate": 4.2854879017217894e-05, + "loss": 0.748, + "step": 1318 + }, + { + "epoch": 0.7034666666666667, + "grad_norm": 0.4821203181807719, + "learning_rate": 4.271315449981934e-05, + "loss": 0.7528, + "step": 1319 + }, + { + "epoch": 0.704, + "grad_norm": 0.42134366694499037, + "learning_rate": 4.257160104963696e-05, + "loss": 0.6756, + "step": 1320 + }, + { + "epoch": 0.7045333333333333, + "grad_norm": 0.4504160027209184, + "learning_rate": 4.2430219089370823e-05, + "loss": 0.6741, + "step": 1321 + }, + { + "epoch": 0.7050666666666666, + "grad_norm": 0.4042945996342561, + "learning_rate": 4.228900904120895e-05, + "loss": 0.666, + "step": 1322 + }, + { + "epoch": 0.7056, + "grad_norm": 0.5034168527915088, + "learning_rate": 4.2147971326825966e-05, + "loss": 0.7473, + "step": 1323 + }, + { + "epoch": 0.7061333333333333, + "grad_norm": 0.4730515114462499, + "learning_rate": 4.200710636738189e-05, + "loss": 0.7597, + "step": 1324 + }, + { + "epoch": 0.7066666666666667, + "grad_norm": 0.3375261764602711, + "learning_rate": 4.1866414583520877e-05, + "loss": 0.5817, + "step": 1325 + }, + { + "epoch": 0.7072, + "grad_norm": 0.42148433248647954, + "learning_rate": 4.172589639536991e-05, + "loss": 0.6713, + "step": 1326 + }, + { + "epoch": 0.7077333333333333, + "grad_norm": 0.3536704135783801, + "learning_rate": 4.158555222253771e-05, + "loss": 0.6422, + "step": 1327 + }, + { + "epoch": 0.7082666666666667, + "grad_norm": 0.4732608797163496, + "learning_rate": 4.14453824841132e-05, + "loss": 0.7479, + "step": 1328 + }, + { + "epoch": 0.7088, + "grad_norm": 0.5396160418355413, + "learning_rate": 4.130538759866457e-05, + "loss": 0.6324, + "step": 1329 + }, + { + "epoch": 0.7093333333333334, + "grad_norm": 0.4842880189383717, + "learning_rate": 4.1165567984237764e-05, + "loss": 0.6485, + "step": 1330 + }, + { + "epoch": 0.7098666666666666, + "grad_norm": 0.5563890271607748, + "learning_rate": 4.102592405835536e-05, + "loss": 0.6521, + "step": 1331 + }, + { + "epoch": 0.7104, + "grad_norm": 0.4425083354838779, + "learning_rate": 4.088645623801534e-05, + "loss": 0.7191, + "step": 1332 + }, + { + "epoch": 0.7109333333333333, + "grad_norm": 0.5980880189944785, + "learning_rate": 4.074716493968975e-05, + "loss": 0.644, + "step": 1333 + }, + { + "epoch": 0.7114666666666667, + "grad_norm": 0.48233570275749316, + "learning_rate": 4.060805057932359e-05, + "loss": 0.7742, + "step": 1334 + }, + { + "epoch": 0.712, + "grad_norm": 0.4012527813911815, + "learning_rate": 4.046911357233343e-05, + "loss": 0.6813, + "step": 1335 + }, + { + "epoch": 0.7125333333333334, + "grad_norm": 0.3679257098356443, + "learning_rate": 4.0330354333606234e-05, + "loss": 0.605, + "step": 1336 + }, + { + "epoch": 0.7130666666666666, + "grad_norm": 0.40018105835742707, + "learning_rate": 4.019177327749822e-05, + "loss": 0.6156, + "step": 1337 + }, + { + "epoch": 0.7136, + "grad_norm": 0.4308179433569843, + "learning_rate": 4.00533708178334e-05, + "loss": 0.7381, + "step": 1338 + }, + { + "epoch": 0.7141333333333333, + "grad_norm": 0.5272604799209487, + "learning_rate": 3.991514736790258e-05, + "loss": 0.65, + "step": 1339 + }, + { + "epoch": 0.7146666666666667, + "grad_norm": 0.4421385760524196, + "learning_rate": 3.977710334046193e-05, + "loss": 0.7449, + "step": 1340 + }, + { + "epoch": 0.7152, + "grad_norm": 0.44588841990396755, + "learning_rate": 3.963923914773187e-05, + "loss": 0.6753, + "step": 1341 + }, + { + "epoch": 0.7157333333333333, + "grad_norm": 0.40033860076889666, + "learning_rate": 3.950155520139581e-05, + "loss": 0.6779, + "step": 1342 + }, + { + "epoch": 0.7162666666666667, + "grad_norm": 0.4538456241393483, + "learning_rate": 3.936405191259891e-05, + "loss": 0.6184, + "step": 1343 + }, + { + "epoch": 0.7168, + "grad_norm": 0.3666606205717183, + "learning_rate": 3.922672969194686e-05, + "loss": 0.6501, + "step": 1344 + }, + { + "epoch": 0.7173333333333334, + "grad_norm": 0.4934538346550755, + "learning_rate": 3.9089588949504655e-05, + "loss": 0.5735, + "step": 1345 + }, + { + "epoch": 0.7178666666666667, + "grad_norm": 0.6109550738227458, + "learning_rate": 3.895263009479534e-05, + "loss": 0.7158, + "step": 1346 + }, + { + "epoch": 0.7184, + "grad_norm": 0.4751922921016551, + "learning_rate": 3.8815853536798904e-05, + "loss": 0.6365, + "step": 1347 + }, + { + "epoch": 0.7189333333333333, + "grad_norm": 0.4442858112293756, + "learning_rate": 3.867925968395085e-05, + "loss": 0.6313, + "step": 1348 + }, + { + "epoch": 0.7194666666666667, + "grad_norm": 0.4502163638711434, + "learning_rate": 3.854284894414122e-05, + "loss": 0.719, + "step": 1349 + }, + { + "epoch": 0.72, + "grad_norm": 0.3850076588185577, + "learning_rate": 3.840662172471315e-05, + "loss": 0.6237, + "step": 1350 + }, + { + "epoch": 0.7205333333333334, + "grad_norm": 0.5012850995227692, + "learning_rate": 3.82705784324618e-05, + "loss": 0.6857, + "step": 1351 + }, + { + "epoch": 0.7210666666666666, + "grad_norm": 0.4261235683833505, + "learning_rate": 3.8134719473633094e-05, + "loss": 0.6391, + "step": 1352 + }, + { + "epoch": 0.7216, + "grad_norm": 0.3706138666410532, + "learning_rate": 3.79990452539225e-05, + "loss": 0.5812, + "step": 1353 + }, + { + "epoch": 0.7221333333333333, + "grad_norm": 0.4297917510015146, + "learning_rate": 3.786355617847385e-05, + "loss": 0.7014, + "step": 1354 + }, + { + "epoch": 0.7226666666666667, + "grad_norm": 0.5049950909930079, + "learning_rate": 3.772825265187802e-05, + "loss": 0.6924, + "step": 1355 + }, + { + "epoch": 0.7232, + "grad_norm": 0.410115601309095, + "learning_rate": 3.759313507817196e-05, + "loss": 0.6814, + "step": 1356 + }, + { + "epoch": 0.7237333333333333, + "grad_norm": 0.435910679791374, + "learning_rate": 3.7458203860837234e-05, + "loss": 0.6852, + "step": 1357 + }, + { + "epoch": 0.7242666666666666, + "grad_norm": 0.3304471307752587, + "learning_rate": 3.732345940279893e-05, + "loss": 0.5666, + "step": 1358 + }, + { + "epoch": 0.7248, + "grad_norm": 0.5115953250373663, + "learning_rate": 3.7188902106424416e-05, + "loss": 0.7767, + "step": 1359 + }, + { + "epoch": 0.7253333333333334, + "grad_norm": 0.45214157826179824, + "learning_rate": 3.705453237352227e-05, + "loss": 0.6494, + "step": 1360 + }, + { + "epoch": 0.7258666666666667, + "grad_norm": 0.39820922009077997, + "learning_rate": 3.692035060534088e-05, + "loss": 0.6352, + "step": 1361 + }, + { + "epoch": 0.7264, + "grad_norm": 0.4789431281088089, + "learning_rate": 3.678635720256737e-05, + "loss": 0.6866, + "step": 1362 + }, + { + "epoch": 0.7269333333333333, + "grad_norm": 0.40275237465807845, + "learning_rate": 3.665255256532638e-05, + "loss": 0.6591, + "step": 1363 + }, + { + "epoch": 0.7274666666666667, + "grad_norm": 0.3713650013349836, + "learning_rate": 3.651893709317887e-05, + "loss": 0.6425, + "step": 1364 + }, + { + "epoch": 0.728, + "grad_norm": 0.41188885924377205, + "learning_rate": 3.638551118512089e-05, + "loss": 0.6242, + "step": 1365 + }, + { + "epoch": 0.7285333333333334, + "grad_norm": 0.34893506665356233, + "learning_rate": 3.625227523958252e-05, + "loss": 0.5932, + "step": 1366 + }, + { + "epoch": 0.7290666666666666, + "grad_norm": 1.2163344446598188, + "learning_rate": 3.611922965442648e-05, + "loss": 0.6846, + "step": 1367 + }, + { + "epoch": 0.7296, + "grad_norm": 0.4067397288108614, + "learning_rate": 3.5986374826947066e-05, + "loss": 0.6401, + "step": 1368 + }, + { + "epoch": 0.7301333333333333, + "grad_norm": 0.42477780694654677, + "learning_rate": 3.5853711153868965e-05, + "loss": 0.6874, + "step": 1369 + }, + { + "epoch": 0.7306666666666667, + "grad_norm": 0.44182676800461945, + "learning_rate": 3.5721239031346066e-05, + "loss": 0.6943, + "step": 1370 + }, + { + "epoch": 0.7312, + "grad_norm": 0.41404454441862243, + "learning_rate": 3.558895885496023e-05, + "loss": 0.7061, + "step": 1371 + }, + { + "epoch": 0.7317333333333333, + "grad_norm": 0.3942082685818331, + "learning_rate": 3.545687101972013e-05, + "loss": 0.6874, + "step": 1372 + }, + { + "epoch": 0.7322666666666666, + "grad_norm": 0.4279982410771876, + "learning_rate": 3.53249759200601e-05, + "loss": 0.6441, + "step": 1373 + }, + { + "epoch": 0.7328, + "grad_norm": 0.41782546710583973, + "learning_rate": 3.519327394983888e-05, + "loss": 0.6537, + "step": 1374 + }, + { + "epoch": 0.7333333333333333, + "grad_norm": 0.41077064496653026, + "learning_rate": 3.506176550233863e-05, + "loss": 0.7045, + "step": 1375 + }, + { + "epoch": 0.7338666666666667, + "grad_norm": 0.3880835455808636, + "learning_rate": 3.4930450970263485e-05, + "loss": 0.6458, + "step": 1376 + }, + { + "epoch": 0.7344, + "grad_norm": 0.4377648053400171, + "learning_rate": 3.479933074573858e-05, + "loss": 0.6573, + "step": 1377 + }, + { + "epoch": 0.7349333333333333, + "grad_norm": 0.465498029493161, + "learning_rate": 3.46684052203088e-05, + "loss": 0.665, + "step": 1378 + }, + { + "epoch": 0.7354666666666667, + "grad_norm": 0.518912253454699, + "learning_rate": 3.4537674784937614e-05, + "loss": 0.7123, + "step": 1379 + }, + { + "epoch": 0.736, + "grad_norm": 0.4592564284357289, + "learning_rate": 3.440713983000601e-05, + "loss": 0.692, + "step": 1380 + }, + { + "epoch": 0.7365333333333334, + "grad_norm": 0.421272376232594, + "learning_rate": 3.427680074531113e-05, + "loss": 0.6788, + "step": 1381 + }, + { + "epoch": 0.7370666666666666, + "grad_norm": 0.4190421951105113, + "learning_rate": 3.4146657920065285e-05, + "loss": 0.6089, + "step": 1382 + }, + { + "epoch": 0.7376, + "grad_norm": 0.5296063611058046, + "learning_rate": 3.401671174289469e-05, + "loss": 0.7328, + "step": 1383 + }, + { + "epoch": 0.7381333333333333, + "grad_norm": 0.4119536279993497, + "learning_rate": 3.388696260183832e-05, + "loss": 0.6775, + "step": 1384 + }, + { + "epoch": 0.7386666666666667, + "grad_norm": 0.4372127695497076, + "learning_rate": 3.3757410884346894e-05, + "loss": 0.7381, + "step": 1385 + }, + { + "epoch": 0.7392, + "grad_norm": 0.4623282475892554, + "learning_rate": 3.362805697728145e-05, + "loss": 0.6621, + "step": 1386 + }, + { + "epoch": 0.7397333333333334, + "grad_norm": 0.41289449863476224, + "learning_rate": 3.3498901266912396e-05, + "loss": 0.6196, + "step": 1387 + }, + { + "epoch": 0.7402666666666666, + "grad_norm": 0.480771875212797, + "learning_rate": 3.336994413891828e-05, + "loss": 0.7198, + "step": 1388 + }, + { + "epoch": 0.7408, + "grad_norm": 0.39296287900786697, + "learning_rate": 3.324118597838464e-05, + "loss": 0.6006, + "step": 1389 + }, + { + "epoch": 0.7413333333333333, + "grad_norm": 0.41689104434774116, + "learning_rate": 3.3112627169802946e-05, + "loss": 0.6478, + "step": 1390 + }, + { + "epoch": 0.7418666666666667, + "grad_norm": 0.43257497028506464, + "learning_rate": 3.298426809706928e-05, + "loss": 0.6128, + "step": 1391 + }, + { + "epoch": 0.7424, + "grad_norm": 0.4681766355573039, + "learning_rate": 3.285610914348332e-05, + "loss": 0.703, + "step": 1392 + }, + { + "epoch": 0.7429333333333333, + "grad_norm": 0.5935400521657089, + "learning_rate": 3.2728150691747115e-05, + "loss": 0.7587, + "step": 1393 + }, + { + "epoch": 0.7434666666666667, + "grad_norm": 0.5077586112824047, + "learning_rate": 3.2600393123964113e-05, + "loss": 0.6681, + "step": 1394 + }, + { + "epoch": 0.744, + "grad_norm": 0.4606788504253986, + "learning_rate": 3.2472836821637744e-05, + "loss": 0.7005, + "step": 1395 + }, + { + "epoch": 0.7445333333333334, + "grad_norm": 0.4409361567025991, + "learning_rate": 3.234548216567049e-05, + "loss": 0.72, + "step": 1396 + }, + { + "epoch": 0.7450666666666667, + "grad_norm": 0.390447354877361, + "learning_rate": 3.2218329536362704e-05, + "loss": 0.6881, + "step": 1397 + }, + { + "epoch": 0.7456, + "grad_norm": 0.46312060958483303, + "learning_rate": 3.209137931341143e-05, + "loss": 0.5681, + "step": 1398 + }, + { + "epoch": 0.7461333333333333, + "grad_norm": 0.4367757132898544, + "learning_rate": 3.196463187590929e-05, + "loss": 0.6619, + "step": 1399 + }, + { + "epoch": 0.7466666666666667, + "grad_norm": 0.4553356814837973, + "learning_rate": 3.1838087602343344e-05, + "loss": 0.6796, + "step": 1400 + }, + { + "epoch": 0.7472, + "grad_norm": 0.45225260368863773, + "learning_rate": 3.1711746870594086e-05, + "loss": 0.6866, + "step": 1401 + }, + { + "epoch": 0.7477333333333334, + "grad_norm": 0.4536254760360822, + "learning_rate": 3.158561005793402e-05, + "loss": 0.7353, + "step": 1402 + }, + { + "epoch": 0.7482666666666666, + "grad_norm": 0.45167273611910475, + "learning_rate": 3.145967754102691e-05, + "loss": 0.6506, + "step": 1403 + }, + { + "epoch": 0.7488, + "grad_norm": 0.4653418492705205, + "learning_rate": 3.1333949695926324e-05, + "loss": 0.6487, + "step": 1404 + }, + { + "epoch": 0.7493333333333333, + "grad_norm": 0.43782464350968103, + "learning_rate": 3.120842689807468e-05, + "loss": 0.6619, + "step": 1405 + }, + { + "epoch": 0.7498666666666667, + "grad_norm": 0.4204359056290449, + "learning_rate": 3.108310952230212e-05, + "loss": 0.6473, + "step": 1406 + }, + { + "epoch": 0.7504, + "grad_norm": 0.4300521099061381, + "learning_rate": 3.0957997942825336e-05, + "loss": 0.6369, + "step": 1407 + }, + { + "epoch": 0.7509333333333333, + "grad_norm": 0.44165546763416386, + "learning_rate": 3.083309253324651e-05, + "loss": 0.6794, + "step": 1408 + }, + { + "epoch": 0.7514666666666666, + "grad_norm": 0.43339208508125643, + "learning_rate": 3.070839366655215e-05, + "loss": 0.6419, + "step": 1409 + }, + { + "epoch": 0.752, + "grad_norm": 0.40186803579797137, + "learning_rate": 3.058390171511196e-05, + "loss": 0.6658, + "step": 1410 + }, + { + "epoch": 0.7525333333333334, + "grad_norm": 0.3841899309149006, + "learning_rate": 3.0459617050677868e-05, + "loss": 0.6107, + "step": 1411 + }, + { + "epoch": 0.7530666666666667, + "grad_norm": 0.4158909919301638, + "learning_rate": 3.0335540044382694e-05, + "loss": 0.6268, + "step": 1412 + }, + { + "epoch": 0.7536, + "grad_norm": 0.659571482120607, + "learning_rate": 3.021167106673928e-05, + "loss": 0.7197, + "step": 1413 + }, + { + "epoch": 0.7541333333333333, + "grad_norm": 0.4482032932856501, + "learning_rate": 3.008801048763914e-05, + "loss": 0.6397, + "step": 1414 + }, + { + "epoch": 0.7546666666666667, + "grad_norm": 0.4761032227320628, + "learning_rate": 2.996455867635155e-05, + "loss": 0.7583, + "step": 1415 + }, + { + "epoch": 0.7552, + "grad_norm": 0.48003199160702137, + "learning_rate": 2.9841316001522347e-05, + "loss": 0.7259, + "step": 1416 + }, + { + "epoch": 0.7557333333333334, + "grad_norm": 0.4074685491078901, + "learning_rate": 2.9718282831172883e-05, + "loss": 0.6619, + "step": 1417 + }, + { + "epoch": 0.7562666666666666, + "grad_norm": 0.7582833567287415, + "learning_rate": 2.9595459532698854e-05, + "loss": 0.7086, + "step": 1418 + }, + { + "epoch": 0.7568, + "grad_norm": 0.4186350368778986, + "learning_rate": 2.9472846472869298e-05, + "loss": 0.6425, + "step": 1419 + }, + { + "epoch": 0.7573333333333333, + "grad_norm": 0.4875366017253042, + "learning_rate": 2.9350444017825385e-05, + "loss": 0.6936, + "step": 1420 + }, + { + "epoch": 0.7578666666666667, + "grad_norm": 0.47152194894082033, + "learning_rate": 2.922825253307947e-05, + "loss": 0.6941, + "step": 1421 + }, + { + "epoch": 0.7584, + "grad_norm": 0.5077366430562386, + "learning_rate": 2.9106272383513835e-05, + "loss": 0.7384, + "step": 1422 + }, + { + "epoch": 0.7589333333333333, + "grad_norm": 0.4844444506906377, + "learning_rate": 2.898450393337977e-05, + "loss": 0.6448, + "step": 1423 + }, + { + "epoch": 0.7594666666666666, + "grad_norm": 0.567970942644562, + "learning_rate": 2.8862947546296315e-05, + "loss": 0.7004, + "step": 1424 + }, + { + "epoch": 0.76, + "grad_norm": 0.45285637611447, + "learning_rate": 2.874160358524931e-05, + "loss": 0.6252, + "step": 1425 + }, + { + "epoch": 0.7605333333333333, + "grad_norm": 0.39904939346272067, + "learning_rate": 2.8620472412590228e-05, + "loss": 0.7037, + "step": 1426 + }, + { + "epoch": 0.7610666666666667, + "grad_norm": 0.40870457636421614, + "learning_rate": 2.8499554390035143e-05, + "loss": 0.6915, + "step": 1427 + }, + { + "epoch": 0.7616, + "grad_norm": 0.38517232522524636, + "learning_rate": 2.8378849878663628e-05, + "loss": 0.6231, + "step": 1428 + }, + { + "epoch": 0.7621333333333333, + "grad_norm": 0.3980631285502874, + "learning_rate": 2.8258359238917665e-05, + "loss": 0.6637, + "step": 1429 + }, + { + "epoch": 0.7626666666666667, + "grad_norm": 0.38569816720663086, + "learning_rate": 2.8138082830600554e-05, + "loss": 0.6396, + "step": 1430 + }, + { + "epoch": 0.7632, + "grad_norm": 0.4235640896081435, + "learning_rate": 2.8018021012875994e-05, + "loss": 0.6323, + "step": 1431 + }, + { + "epoch": 0.7637333333333334, + "grad_norm": 0.3669973174944435, + "learning_rate": 2.7898174144266732e-05, + "loss": 0.6365, + "step": 1432 + }, + { + "epoch": 0.7642666666666666, + "grad_norm": 0.502319781637613, + "learning_rate": 2.7778542582653744e-05, + "loss": 0.729, + "step": 1433 + }, + { + "epoch": 0.7648, + "grad_norm": 0.4443826452871617, + "learning_rate": 2.7659126685275027e-05, + "loss": 0.6004, + "step": 1434 + }, + { + "epoch": 0.7653333333333333, + "grad_norm": 0.4009814414355795, + "learning_rate": 2.753992680872457e-05, + "loss": 0.68, + "step": 1435 + }, + { + "epoch": 0.7658666666666667, + "grad_norm": 0.4072069258362513, + "learning_rate": 2.7420943308951284e-05, + "loss": 0.6438, + "step": 1436 + }, + { + "epoch": 0.7664, + "grad_norm": 0.534313603465075, + "learning_rate": 2.7302176541257986e-05, + "loss": 0.7025, + "step": 1437 + }, + { + "epoch": 0.7669333333333334, + "grad_norm": 0.38522779892576314, + "learning_rate": 2.7183626860300247e-05, + "loss": 0.6212, + "step": 1438 + }, + { + "epoch": 0.7674666666666666, + "grad_norm": 0.42540386104077615, + "learning_rate": 2.7065294620085424e-05, + "loss": 0.6505, + "step": 1439 + }, + { + "epoch": 0.768, + "grad_norm": 0.4415591223220511, + "learning_rate": 2.6947180173971508e-05, + "loss": 0.6922, + "step": 1440 + }, + { + "epoch": 0.7685333333333333, + "grad_norm": 0.4974882957691649, + "learning_rate": 2.6829283874666233e-05, + "loss": 0.6332, + "step": 1441 + }, + { + "epoch": 0.7690666666666667, + "grad_norm": 0.4572713358634199, + "learning_rate": 2.6711606074225782e-05, + "loss": 0.6756, + "step": 1442 + }, + { + "epoch": 0.7696, + "grad_norm": 0.5632009516879064, + "learning_rate": 2.659414712405398e-05, + "loss": 0.6886, + "step": 1443 + }, + { + "epoch": 0.7701333333333333, + "grad_norm": 0.5654612611201677, + "learning_rate": 2.647690737490106e-05, + "loss": 0.7607, + "step": 1444 + }, + { + "epoch": 0.7706666666666667, + "grad_norm": 0.46848708751036583, + "learning_rate": 2.6359887176862718e-05, + "loss": 0.6297, + "step": 1445 + }, + { + "epoch": 0.7712, + "grad_norm": 0.4019437430654483, + "learning_rate": 2.6243086879379e-05, + "loss": 0.6642, + "step": 1446 + }, + { + "epoch": 0.7717333333333334, + "grad_norm": 0.5391388624920869, + "learning_rate": 2.6126506831233344e-05, + "loss": 0.7346, + "step": 1447 + }, + { + "epoch": 0.7722666666666667, + "grad_norm": 0.39762033129290314, + "learning_rate": 2.6010147380551475e-05, + "loss": 0.6499, + "step": 1448 + }, + { + "epoch": 0.7728, + "grad_norm": 0.6089772015665591, + "learning_rate": 2.5894008874800325e-05, + "loss": 0.7228, + "step": 1449 + }, + { + "epoch": 0.7733333333333333, + "grad_norm": 0.4091684206732233, + "learning_rate": 2.577809166078716e-05, + "loss": 0.6212, + "step": 1450 + }, + { + "epoch": 0.7738666666666667, + "grad_norm": 0.3875516194768679, + "learning_rate": 2.566239608465838e-05, + "loss": 0.6148, + "step": 1451 + }, + { + "epoch": 0.7744, + "grad_norm": 0.5087414360417498, + "learning_rate": 2.5546922491898495e-05, + "loss": 0.669, + "step": 1452 + }, + { + "epoch": 0.7749333333333334, + "grad_norm": 0.42052223640750874, + "learning_rate": 2.543167122732918e-05, + "loss": 0.6819, + "step": 1453 + }, + { + "epoch": 0.7754666666666666, + "grad_norm": 0.5571920739198382, + "learning_rate": 2.5316642635108244e-05, + "loss": 0.7379, + "step": 1454 + }, + { + "epoch": 0.776, + "grad_norm": 0.5589725779775973, + "learning_rate": 2.5201837058728505e-05, + "loss": 0.6986, + "step": 1455 + }, + { + "epoch": 0.7765333333333333, + "grad_norm": 0.4077774865425764, + "learning_rate": 2.508725484101684e-05, + "loss": 0.5673, + "step": 1456 + }, + { + "epoch": 0.7770666666666667, + "grad_norm": 0.3966051236938129, + "learning_rate": 2.4972896324133144e-05, + "loss": 0.6, + "step": 1457 + }, + { + "epoch": 0.7776, + "grad_norm": 0.4274422908439089, + "learning_rate": 2.485876184956928e-05, + "loss": 0.6335, + "step": 1458 + }, + { + "epoch": 0.7781333333333333, + "grad_norm": 0.33771537843772004, + "learning_rate": 2.4744851758148156e-05, + "loss": 0.5881, + "step": 1459 + }, + { + "epoch": 0.7786666666666666, + "grad_norm": 0.4093214392340917, + "learning_rate": 2.4631166390022574e-05, + "loss": 0.6448, + "step": 1460 + }, + { + "epoch": 0.7792, + "grad_norm": 0.44006827822558175, + "learning_rate": 2.451770608467432e-05, + "loss": 0.6848, + "step": 1461 + }, + { + "epoch": 0.7797333333333333, + "grad_norm": 0.3957670014836868, + "learning_rate": 2.4404471180913058e-05, + "loss": 0.6361, + "step": 1462 + }, + { + "epoch": 0.7802666666666667, + "grad_norm": 0.42111179500586743, + "learning_rate": 2.429146201687538e-05, + "loss": 0.7322, + "step": 1463 + }, + { + "epoch": 0.7808, + "grad_norm": 0.447595877658861, + "learning_rate": 2.417867893002387e-05, + "loss": 0.6413, + "step": 1464 + }, + { + "epoch": 0.7813333333333333, + "grad_norm": 0.5004926499095018, + "learning_rate": 2.4066122257145894e-05, + "loss": 0.6786, + "step": 1465 + }, + { + "epoch": 0.7818666666666667, + "grad_norm": 0.465628029025143, + "learning_rate": 2.3953792334352787e-05, + "loss": 0.6439, + "step": 1466 + }, + { + "epoch": 0.7824, + "grad_norm": 0.4516378355070736, + "learning_rate": 2.3841689497078746e-05, + "loss": 0.6734, + "step": 1467 + }, + { + "epoch": 0.7829333333333334, + "grad_norm": 0.4193879525975684, + "learning_rate": 2.3729814080079816e-05, + "loss": 0.6887, + "step": 1468 + }, + { + "epoch": 0.7834666666666666, + "grad_norm": 0.37092129339802005, + "learning_rate": 2.361816641743303e-05, + "loss": 0.6312, + "step": 1469 + }, + { + "epoch": 0.784, + "grad_norm": 0.4105063644947912, + "learning_rate": 2.3506746842535242e-05, + "loss": 0.713, + "step": 1470 + }, + { + "epoch": 0.7845333333333333, + "grad_norm": 0.36374646860495985, + "learning_rate": 2.339555568810221e-05, + "loss": 0.6263, + "step": 1471 + }, + { + "epoch": 0.7850666666666667, + "grad_norm": 0.5074406776993624, + "learning_rate": 2.328459328616759e-05, + "loss": 0.6642, + "step": 1472 + }, + { + "epoch": 0.7856, + "grad_norm": 0.4128235360455528, + "learning_rate": 2.3173859968081944e-05, + "loss": 0.6121, + "step": 1473 + }, + { + "epoch": 0.7861333333333334, + "grad_norm": 0.36038965284139735, + "learning_rate": 2.306335606451181e-05, + "loss": 0.6256, + "step": 1474 + }, + { + "epoch": 0.7866666666666666, + "grad_norm": 0.39572687509499344, + "learning_rate": 2.295308190543859e-05, + "loss": 0.6351, + "step": 1475 + }, + { + "epoch": 0.7872, + "grad_norm": 0.423484534839522, + "learning_rate": 2.2843037820157675e-05, + "loss": 0.645, + "step": 1476 + }, + { + "epoch": 0.7877333333333333, + "grad_norm": 0.4386416765973164, + "learning_rate": 2.2733224137277366e-05, + "loss": 0.6079, + "step": 1477 + }, + { + "epoch": 0.7882666666666667, + "grad_norm": 0.39285308813316205, + "learning_rate": 2.262364118471805e-05, + "loss": 0.6764, + "step": 1478 + }, + { + "epoch": 0.7888, + "grad_norm": 0.4459364112421892, + "learning_rate": 2.251428928971102e-05, + "loss": 0.6397, + "step": 1479 + }, + { + "epoch": 0.7893333333333333, + "grad_norm": 0.39168469410823503, + "learning_rate": 2.2405168778797646e-05, + "loss": 0.6507, + "step": 1480 + }, + { + "epoch": 0.7898666666666667, + "grad_norm": 0.3722640125985169, + "learning_rate": 2.2296279977828337e-05, + "loss": 0.6252, + "step": 1481 + }, + { + "epoch": 0.7904, + "grad_norm": 0.4355454966323861, + "learning_rate": 2.2187623211961562e-05, + "loss": 0.7141, + "step": 1482 + }, + { + "epoch": 0.7909333333333334, + "grad_norm": 0.3430399297126906, + "learning_rate": 2.2079198805662914e-05, + "loss": 0.6126, + "step": 1483 + }, + { + "epoch": 0.7914666666666667, + "grad_norm": 0.4583537527387117, + "learning_rate": 2.1971007082704164e-05, + "loss": 0.6296, + "step": 1484 + }, + { + "epoch": 0.792, + "grad_norm": 0.4279278274481135, + "learning_rate": 2.1863048366162208e-05, + "loss": 0.645, + "step": 1485 + }, + { + "epoch": 0.7925333333333333, + "grad_norm": 0.5038785683153161, + "learning_rate": 2.1755322978418137e-05, + "loss": 0.6757, + "step": 1486 + }, + { + "epoch": 0.7930666666666667, + "grad_norm": 0.48090150321401093, + "learning_rate": 2.1647831241156302e-05, + "loss": 0.638, + "step": 1487 + }, + { + "epoch": 0.7936, + "grad_norm": 0.423908701621885, + "learning_rate": 2.1540573475363402e-05, + "loss": 0.6314, + "step": 1488 + }, + { + "epoch": 0.7941333333333334, + "grad_norm": 0.4564407646195038, + "learning_rate": 2.1433550001327373e-05, + "loss": 0.6831, + "step": 1489 + }, + { + "epoch": 0.7946666666666666, + "grad_norm": 0.5008191526678891, + "learning_rate": 2.1326761138636553e-05, + "loss": 0.6883, + "step": 1490 + }, + { + "epoch": 0.7952, + "grad_norm": 0.45733730495896463, + "learning_rate": 2.1220207206178688e-05, + "loss": 0.6117, + "step": 1491 + }, + { + "epoch": 0.7957333333333333, + "grad_norm": 0.42993085879339066, + "learning_rate": 2.111388852214001e-05, + "loss": 0.699, + "step": 1492 + }, + { + "epoch": 0.7962666666666667, + "grad_norm": 0.476292986154224, + "learning_rate": 2.1007805404004242e-05, + "loss": 0.7199, + "step": 1493 + }, + { + "epoch": 0.7968, + "grad_norm": 0.4102089662875839, + "learning_rate": 2.0901958168551638e-05, + "loss": 0.6555, + "step": 1494 + }, + { + "epoch": 0.7973333333333333, + "grad_norm": 0.4180688510909825, + "learning_rate": 2.0796347131858186e-05, + "loss": 0.6948, + "step": 1495 + }, + { + "epoch": 0.7978666666666666, + "grad_norm": 0.38743362692091166, + "learning_rate": 2.069097260929439e-05, + "loss": 0.643, + "step": 1496 + }, + { + "epoch": 0.7984, + "grad_norm": 0.4149369355782099, + "learning_rate": 2.058583491552465e-05, + "loss": 0.6803, + "step": 1497 + }, + { + "epoch": 0.7989333333333334, + "grad_norm": 0.37950622060268036, + "learning_rate": 2.048093436450603e-05, + "loss": 0.655, + "step": 1498 + }, + { + "epoch": 0.7994666666666667, + "grad_norm": 0.4943825903704621, + "learning_rate": 2.0376271269487514e-05, + "loss": 0.7194, + "step": 1499 + }, + { + "epoch": 0.8, + "grad_norm": 0.400978347396589, + "learning_rate": 2.027184594300898e-05, + "loss": 0.6154, + "step": 1500 + }, + { + "epoch": 0.8005333333333333, + "grad_norm": 0.4775873407139853, + "learning_rate": 2.0167658696900317e-05, + "loss": 0.6247, + "step": 1501 + }, + { + "epoch": 0.8010666666666667, + "grad_norm": 0.4553479375148142, + "learning_rate": 2.0063709842280432e-05, + "loss": 0.7256, + "step": 1502 + }, + { + "epoch": 0.8016, + "grad_norm": 0.4244814429336164, + "learning_rate": 1.995999968955641e-05, + "loss": 0.681, + "step": 1503 + }, + { + "epoch": 0.8021333333333334, + "grad_norm": 0.4508251541923756, + "learning_rate": 1.985652854842247e-05, + "loss": 0.6884, + "step": 1504 + }, + { + "epoch": 0.8026666666666666, + "grad_norm": 0.6243044339114079, + "learning_rate": 1.9753296727859195e-05, + "loss": 0.6957, + "step": 1505 + }, + { + "epoch": 0.8032, + "grad_norm": 0.44969579306877716, + "learning_rate": 1.9650304536132426e-05, + "loss": 0.6317, + "step": 1506 + }, + { + "epoch": 0.8037333333333333, + "grad_norm": 0.43554840209066353, + "learning_rate": 1.9547552280792524e-05, + "loss": 0.5915, + "step": 1507 + }, + { + "epoch": 0.8042666666666667, + "grad_norm": 0.5896809712135934, + "learning_rate": 1.9445040268673298e-05, + "loss": 0.7204, + "step": 1508 + }, + { + "epoch": 0.8048, + "grad_norm": 0.46883068997796684, + "learning_rate": 1.9342768805891178e-05, + "loss": 0.686, + "step": 1509 + }, + { + "epoch": 0.8053333333333333, + "grad_norm": 0.4670614214856696, + "learning_rate": 1.9240738197844278e-05, + "loss": 0.6685, + "step": 1510 + }, + { + "epoch": 0.8058666666666666, + "grad_norm": 0.46998743057063663, + "learning_rate": 1.9138948749211472e-05, + "loss": 0.7182, + "step": 1511 + }, + { + "epoch": 0.8064, + "grad_norm": 0.4386788250764763, + "learning_rate": 1.903740076395151e-05, + "loss": 0.6414, + "step": 1512 + }, + { + "epoch": 0.8069333333333333, + "grad_norm": 0.4136227444175789, + "learning_rate": 1.8936094545302095e-05, + "loss": 0.6453, + "step": 1513 + }, + { + "epoch": 0.8074666666666667, + "grad_norm": 0.5414548857949473, + "learning_rate": 1.883503039577894e-05, + "loss": 0.6031, + "step": 1514 + }, + { + "epoch": 0.808, + "grad_norm": 0.3719962187265138, + "learning_rate": 1.8734208617174988e-05, + "loss": 0.6053, + "step": 1515 + }, + { + "epoch": 0.8085333333333333, + "grad_norm": 0.4177683383290942, + "learning_rate": 1.8633629510559314e-05, + "loss": 0.6506, + "step": 1516 + }, + { + "epoch": 0.8090666666666667, + "grad_norm": 0.4729371368750245, + "learning_rate": 1.8533293376276472e-05, + "loss": 0.6814, + "step": 1517 + }, + { + "epoch": 0.8096, + "grad_norm": 0.3262855975367515, + "learning_rate": 1.8433200513945337e-05, + "loss": 0.589, + "step": 1518 + }, + { + "epoch": 0.8101333333333334, + "grad_norm": 0.5480520090986583, + "learning_rate": 1.8333351222458407e-05, + "loss": 0.773, + "step": 1519 + }, + { + "epoch": 0.8106666666666666, + "grad_norm": 0.44517043921596944, + "learning_rate": 1.8233745799980817e-05, + "loss": 0.6706, + "step": 1520 + }, + { + "epoch": 0.8112, + "grad_norm": 0.577481543003918, + "learning_rate": 1.8134384543949478e-05, + "loss": 0.7275, + "step": 1521 + }, + { + "epoch": 0.8117333333333333, + "grad_norm": 0.4397967027612655, + "learning_rate": 1.803526775107217e-05, + "loss": 0.6359, + "step": 1522 + }, + { + "epoch": 0.8122666666666667, + "grad_norm": 0.3918583356457884, + "learning_rate": 1.7936395717326704e-05, + "loss": 0.6502, + "step": 1523 + }, + { + "epoch": 0.8128, + "grad_norm": 0.4776416558413356, + "learning_rate": 1.783776873795994e-05, + "loss": 0.5677, + "step": 1524 + }, + { + "epoch": 0.8133333333333334, + "grad_norm": 0.4420162143207249, + "learning_rate": 1.773938710748706e-05, + "loss": 0.6636, + "step": 1525 + }, + { + "epoch": 0.8138666666666666, + "grad_norm": 0.4443006247321744, + "learning_rate": 1.7641251119690505e-05, + "loss": 0.6446, + "step": 1526 + }, + { + "epoch": 0.8144, + "grad_norm": 0.4503667815472766, + "learning_rate": 1.754336106761927e-05, + "loss": 0.6492, + "step": 1527 + }, + { + "epoch": 0.8149333333333333, + "grad_norm": 0.4612675657557288, + "learning_rate": 1.744571724358789e-05, + "loss": 0.6477, + "step": 1528 + }, + { + "epoch": 0.8154666666666667, + "grad_norm": 0.43209222342026876, + "learning_rate": 1.7348319939175637e-05, + "loss": 0.6564, + "step": 1529 + }, + { + "epoch": 0.816, + "grad_norm": 0.5477723334156583, + "learning_rate": 1.7251169445225657e-05, + "loss": 0.7517, + "step": 1530 + }, + { + "epoch": 0.8165333333333333, + "grad_norm": 0.40481915910603655, + "learning_rate": 1.715426605184407e-05, + "loss": 0.6442, + "step": 1531 + }, + { + "epoch": 0.8170666666666667, + "grad_norm": 0.4947924933794212, + "learning_rate": 1.705761004839911e-05, + "loss": 0.7493, + "step": 1532 + }, + { + "epoch": 0.8176, + "grad_norm": 0.5931479437790562, + "learning_rate": 1.696120172352025e-05, + "loss": 0.7587, + "step": 1533 + }, + { + "epoch": 0.8181333333333334, + "grad_norm": 0.40706977530746524, + "learning_rate": 1.6865041365097435e-05, + "loss": 0.6408, + "step": 1534 + }, + { + "epoch": 0.8186666666666667, + "grad_norm": 0.45563104066901616, + "learning_rate": 1.676912926028007e-05, + "loss": 0.6549, + "step": 1535 + }, + { + "epoch": 0.8192, + "grad_norm": 0.4315762607401044, + "learning_rate": 1.6673465695476232e-05, + "loss": 0.714, + "step": 1536 + }, + { + "epoch": 0.8197333333333333, + "grad_norm": 0.4711245706366215, + "learning_rate": 1.6578050956351886e-05, + "loss": 0.6258, + "step": 1537 + }, + { + "epoch": 0.8202666666666667, + "grad_norm": 0.46143225200024446, + "learning_rate": 1.6482885327829913e-05, + "loss": 0.5804, + "step": 1538 + }, + { + "epoch": 0.8208, + "grad_norm": 0.44584803737341006, + "learning_rate": 1.6387969094089316e-05, + "loss": 0.7378, + "step": 1539 + }, + { + "epoch": 0.8213333333333334, + "grad_norm": 0.4803316915992794, + "learning_rate": 1.6293302538564382e-05, + "loss": 0.6421, + "step": 1540 + }, + { + "epoch": 0.8218666666666666, + "grad_norm": 0.3871107029547407, + "learning_rate": 1.619888594394382e-05, + "loss": 0.6333, + "step": 1541 + }, + { + "epoch": 0.8224, + "grad_norm": 0.45914687917486513, + "learning_rate": 1.6104719592169902e-05, + "loss": 0.6542, + "step": 1542 + }, + { + "epoch": 0.8229333333333333, + "grad_norm": 0.4508811891950307, + "learning_rate": 1.601080376443763e-05, + "loss": 0.6193, + "step": 1543 + }, + { + "epoch": 0.8234666666666667, + "grad_norm": 0.5939130372404645, + "learning_rate": 1.5917138741193973e-05, + "loss": 0.6265, + "step": 1544 + }, + { + "epoch": 0.824, + "grad_norm": 0.46173118075953334, + "learning_rate": 1.5823724802136865e-05, + "loss": 0.7207, + "step": 1545 + }, + { + "epoch": 0.8245333333333333, + "grad_norm": 0.4440593688395496, + "learning_rate": 1.573056222621453e-05, + "loss": 0.6581, + "step": 1546 + }, + { + "epoch": 0.8250666666666666, + "grad_norm": 0.4251307059431981, + "learning_rate": 1.5637651291624523e-05, + "loss": 0.659, + "step": 1547 + }, + { + "epoch": 0.8256, + "grad_norm": 0.46460031624996845, + "learning_rate": 1.5544992275813053e-05, + "loss": 0.7175, + "step": 1548 + }, + { + "epoch": 0.8261333333333334, + "grad_norm": 0.5848587823882815, + "learning_rate": 1.5452585455473977e-05, + "loss": 0.7212, + "step": 1549 + }, + { + "epoch": 0.8266666666666667, + "grad_norm": 0.590168152666092, + "learning_rate": 1.536043110654809e-05, + "loss": 0.6946, + "step": 1550 + }, + { + "epoch": 0.8272, + "grad_norm": 0.4675547950081361, + "learning_rate": 1.526852950422226e-05, + "loss": 0.6913, + "step": 1551 + }, + { + "epoch": 0.8277333333333333, + "grad_norm": 0.4467296778332705, + "learning_rate": 1.5176880922928616e-05, + "loss": 0.6609, + "step": 1552 + }, + { + "epoch": 0.8282666666666667, + "grad_norm": 0.4102157735989331, + "learning_rate": 1.5085485636343755e-05, + "loss": 0.6468, + "step": 1553 + }, + { + "epoch": 0.8288, + "grad_norm": 0.4413627034705757, + "learning_rate": 1.4994343917387854e-05, + "loss": 0.6379, + "step": 1554 + }, + { + "epoch": 0.8293333333333334, + "grad_norm": 0.44763494794833336, + "learning_rate": 1.4903456038223939e-05, + "loss": 0.6661, + "step": 1555 + }, + { + "epoch": 0.8298666666666666, + "grad_norm": 0.40904324118452856, + "learning_rate": 1.4812822270257009e-05, + "loss": 0.6629, + "step": 1556 + }, + { + "epoch": 0.8304, + "grad_norm": 0.4387769979377223, + "learning_rate": 1.4722442884133214e-05, + "loss": 0.7335, + "step": 1557 + }, + { + "epoch": 0.8309333333333333, + "grad_norm": 0.5082678181995608, + "learning_rate": 1.4632318149739177e-05, + "loss": 0.6619, + "step": 1558 + }, + { + "epoch": 0.8314666666666667, + "grad_norm": 0.49378095505218844, + "learning_rate": 1.454244833620102e-05, + "loss": 0.6885, + "step": 1559 + }, + { + "epoch": 0.832, + "grad_norm": 0.3781146245279851, + "learning_rate": 1.4452833711883628e-05, + "loss": 0.6381, + "step": 1560 + }, + { + "epoch": 0.8325333333333333, + "grad_norm": 0.4403455658984126, + "learning_rate": 1.4363474544389877e-05, + "loss": 0.6521, + "step": 1561 + }, + { + "epoch": 0.8330666666666666, + "grad_norm": 0.449520980445605, + "learning_rate": 1.4274371100559791e-05, + "loss": 0.6525, + "step": 1562 + }, + { + "epoch": 0.8336, + "grad_norm": 0.5040550123853111, + "learning_rate": 1.4185523646469822e-05, + "loss": 0.6642, + "step": 1563 + }, + { + "epoch": 0.8341333333333333, + "grad_norm": 0.5171102560168016, + "learning_rate": 1.409693244743192e-05, + "loss": 0.7313, + "step": 1564 + }, + { + "epoch": 0.8346666666666667, + "grad_norm": 0.4243066672704072, + "learning_rate": 1.4008597767992871e-05, + "loss": 0.6828, + "step": 1565 + }, + { + "epoch": 0.8352, + "grad_norm": 0.5606977986193548, + "learning_rate": 1.3920519871933424e-05, + "loss": 0.7393, + "step": 1566 + }, + { + "epoch": 0.8357333333333333, + "grad_norm": 0.356765312259291, + "learning_rate": 1.3832699022267515e-05, + "loss": 0.5923, + "step": 1567 + }, + { + "epoch": 0.8362666666666667, + "grad_norm": 0.38578952542997724, + "learning_rate": 1.37451354812416e-05, + "loss": 0.6018, + "step": 1568 + }, + { + "epoch": 0.8368, + "grad_norm": 0.4055517387530648, + "learning_rate": 1.3657829510333654e-05, + "loss": 0.6542, + "step": 1569 + }, + { + "epoch": 0.8373333333333334, + "grad_norm": 0.387200711065784, + "learning_rate": 1.3570781370252582e-05, + "loss": 0.69, + "step": 1570 + }, + { + "epoch": 0.8378666666666666, + "grad_norm": 0.46122566793370784, + "learning_rate": 1.3483991320937306e-05, + "loss": 0.6456, + "step": 1571 + }, + { + "epoch": 0.8384, + "grad_norm": 0.3779164218156875, + "learning_rate": 1.339745962155613e-05, + "loss": 0.631, + "step": 1572 + }, + { + "epoch": 0.8389333333333333, + "grad_norm": 0.41905632593435094, + "learning_rate": 1.3311186530505838e-05, + "loss": 0.6634, + "step": 1573 + }, + { + "epoch": 0.8394666666666667, + "grad_norm": 0.3492251627583365, + "learning_rate": 1.322517230541096e-05, + "loss": 0.5941, + "step": 1574 + }, + { + "epoch": 0.84, + "grad_norm": 0.5269141861545891, + "learning_rate": 1.3139417203123027e-05, + "loss": 0.6662, + "step": 1575 + }, + { + "epoch": 0.8405333333333334, + "grad_norm": 0.4041237366118928, + "learning_rate": 1.30539214797198e-05, + "loss": 0.6543, + "step": 1576 + }, + { + "epoch": 0.8410666666666666, + "grad_norm": 0.3417551454751479, + "learning_rate": 1.2968685390504465e-05, + "loss": 0.5916, + "step": 1577 + }, + { + "epoch": 0.8416, + "grad_norm": 0.367928203255809, + "learning_rate": 1.2883709190004955e-05, + "loss": 0.6329, + "step": 1578 + }, + { + "epoch": 0.8421333333333333, + "grad_norm": 0.44284394360614165, + "learning_rate": 1.2798993131973091e-05, + "loss": 0.6217, + "step": 1579 + }, + { + "epoch": 0.8426666666666667, + "grad_norm": 0.5289192468404308, + "learning_rate": 1.2714537469383858e-05, + "loss": 0.6665, + "step": 1580 + }, + { + "epoch": 0.8432, + "grad_norm": 0.41253131494010853, + "learning_rate": 1.263034245443473e-05, + "loss": 0.6808, + "step": 1581 + }, + { + "epoch": 0.8437333333333333, + "grad_norm": 0.39958183092768723, + "learning_rate": 1.2546408338544769e-05, + "loss": 0.6281, + "step": 1582 + }, + { + "epoch": 0.8442666666666667, + "grad_norm": 0.40604548549275743, + "learning_rate": 1.2462735372353996e-05, + "loss": 0.6353, + "step": 1583 + }, + { + "epoch": 0.8448, + "grad_norm": 0.4127316174238306, + "learning_rate": 1.2379323805722576e-05, + "loss": 0.6235, + "step": 1584 + }, + { + "epoch": 0.8453333333333334, + "grad_norm": 0.39994693947875093, + "learning_rate": 1.2296173887730123e-05, + "loss": 0.6515, + "step": 1585 + }, + { + "epoch": 0.8458666666666667, + "grad_norm": 0.46198926568901394, + "learning_rate": 1.2213285866674905e-05, + "loss": 0.7261, + "step": 1586 + }, + { + "epoch": 0.8464, + "grad_norm": 0.36620587997299, + "learning_rate": 1.2130659990073146e-05, + "loss": 0.6, + "step": 1587 + }, + { + "epoch": 0.8469333333333333, + "grad_norm": 0.4028536167555855, + "learning_rate": 1.2048296504658207e-05, + "loss": 0.6736, + "step": 1588 + }, + { + "epoch": 0.8474666666666667, + "grad_norm": 0.45078438510895197, + "learning_rate": 1.1966195656380031e-05, + "loss": 0.6627, + "step": 1589 + }, + { + "epoch": 0.848, + "grad_norm": 0.4273797825239026, + "learning_rate": 1.1884357690404158e-05, + "loss": 0.6443, + "step": 1590 + }, + { + "epoch": 0.8485333333333334, + "grad_norm": 0.45175325555552603, + "learning_rate": 1.1802782851111205e-05, + "loss": 0.6592, + "step": 1591 + }, + { + "epoch": 0.8490666666666666, + "grad_norm": 0.5273387684892102, + "learning_rate": 1.1721471382096027e-05, + "loss": 0.7525, + "step": 1592 + }, + { + "epoch": 0.8496, + "grad_norm": 0.3901100788076458, + "learning_rate": 1.1640423526166988e-05, + "loss": 0.6339, + "step": 1593 + }, + { + "epoch": 0.8501333333333333, + "grad_norm": 0.45213561226244414, + "learning_rate": 1.1559639525345311e-05, + "loss": 0.7457, + "step": 1594 + }, + { + "epoch": 0.8506666666666667, + "grad_norm": 0.5300440695732356, + "learning_rate": 1.1479119620864276e-05, + "loss": 0.7088, + "step": 1595 + }, + { + "epoch": 0.8512, + "grad_norm": 0.49408808988181024, + "learning_rate": 1.1398864053168534e-05, + "loss": 0.728, + "step": 1596 + }, + { + "epoch": 0.8517333333333333, + "grad_norm": 0.40975088663541037, + "learning_rate": 1.1318873061913405e-05, + "loss": 0.6124, + "step": 1597 + }, + { + "epoch": 0.8522666666666666, + "grad_norm": 0.4303423444928659, + "learning_rate": 1.123914688596409e-05, + "loss": 0.6476, + "step": 1598 + }, + { + "epoch": 0.8528, + "grad_norm": 0.382474083230923, + "learning_rate": 1.1159685763395111e-05, + "loss": 0.62, + "step": 1599 + }, + { + "epoch": 0.8533333333333334, + "grad_norm": 0.47548658272504035, + "learning_rate": 1.1080489931489391e-05, + "loss": 0.6212, + "step": 1600 + }, + { + "epoch": 0.8538666666666667, + "grad_norm": 0.49857153711888064, + "learning_rate": 1.1001559626737756e-05, + "loss": 0.6032, + "step": 1601 + }, + { + "epoch": 0.8544, + "grad_norm": 0.478559876485903, + "learning_rate": 1.0922895084838037e-05, + "loss": 0.6869, + "step": 1602 + }, + { + "epoch": 0.8549333333333333, + "grad_norm": 0.43451585111407687, + "learning_rate": 1.0844496540694515e-05, + "loss": 0.659, + "step": 1603 + }, + { + "epoch": 0.8554666666666667, + "grad_norm": 0.4067218835749194, + "learning_rate": 1.0766364228417148e-05, + "loss": 0.6459, + "step": 1604 + }, + { + "epoch": 0.856, + "grad_norm": 0.4488765971746705, + "learning_rate": 1.0688498381320855e-05, + "loss": 0.7156, + "step": 1605 + }, + { + "epoch": 0.8565333333333334, + "grad_norm": 0.5039706276132117, + "learning_rate": 1.0610899231924886e-05, + "loss": 0.6485, + "step": 1606 + }, + { + "epoch": 0.8570666666666666, + "grad_norm": 0.4660787888411466, + "learning_rate": 1.0533567011952094e-05, + "loss": 0.6908, + "step": 1607 + }, + { + "epoch": 0.8576, + "grad_norm": 0.5086288860613459, + "learning_rate": 1.045650195232819e-05, + "loss": 0.7498, + "step": 1608 + }, + { + "epoch": 0.8581333333333333, + "grad_norm": 0.42840707314228765, + "learning_rate": 1.0379704283181179e-05, + "loss": 0.618, + "step": 1609 + }, + { + "epoch": 0.8586666666666667, + "grad_norm": 0.4930273945764172, + "learning_rate": 1.0303174233840528e-05, + "loss": 0.706, + "step": 1610 + }, + { + "epoch": 0.8592, + "grad_norm": 0.40006850983935494, + "learning_rate": 1.0226912032836611e-05, + "loss": 0.6097, + "step": 1611 + }, + { + "epoch": 0.8597333333333333, + "grad_norm": 0.4335664772092174, + "learning_rate": 1.0150917907899926e-05, + "loss": 0.6549, + "step": 1612 + }, + { + "epoch": 0.8602666666666666, + "grad_norm": 0.38758194133564866, + "learning_rate": 1.007519208596045e-05, + "loss": 0.6469, + "step": 1613 + }, + { + "epoch": 0.8608, + "grad_norm": 0.4352181098736593, + "learning_rate": 9.999734793146998e-06, + "loss": 0.6362, + "step": 1614 + }, + { + "epoch": 0.8613333333333333, + "grad_norm": 0.43614995094953685, + "learning_rate": 9.924546254786493e-06, + "loss": 0.6483, + "step": 1615 + }, + { + "epoch": 0.8618666666666667, + "grad_norm": 0.3734845483574157, + "learning_rate": 9.849626695403324e-06, + "loss": 0.6248, + "step": 1616 + }, + { + "epoch": 0.8624, + "grad_norm": 0.4673738756550724, + "learning_rate": 9.774976338718677e-06, + "loss": 0.6286, + "step": 1617 + }, + { + "epoch": 0.8629333333333333, + "grad_norm": 0.46931770163566056, + "learning_rate": 9.700595407649805e-06, + "loss": 0.6718, + "step": 1618 + }, + { + "epoch": 0.8634666666666667, + "grad_norm": 0.5054076199605804, + "learning_rate": 9.62648412430951e-06, + "loss": 0.6601, + "step": 1619 + }, + { + "epoch": 0.864, + "grad_norm": 0.4156887330306433, + "learning_rate": 9.552642710005299e-06, + "loss": 0.6871, + "step": 1620 + }, + { + "epoch": 0.8645333333333334, + "grad_norm": 0.4848129151230194, + "learning_rate": 9.479071385238892e-06, + "loss": 0.6318, + "step": 1621 + }, + { + "epoch": 0.8650666666666667, + "grad_norm": 0.4090143670275399, + "learning_rate": 9.40577036970538e-06, + "loss": 0.7051, + "step": 1622 + }, + { + "epoch": 0.8656, + "grad_norm": 0.47023117574919554, + "learning_rate": 9.332739882292752e-06, + "loss": 0.6403, + "step": 1623 + }, + { + "epoch": 0.8661333333333333, + "grad_norm": 0.4855511342637403, + "learning_rate": 9.259980141081115e-06, + "loss": 0.7308, + "step": 1624 + }, + { + "epoch": 0.8666666666666667, + "grad_norm": 0.42127353506362863, + "learning_rate": 9.187491363342093e-06, + "loss": 0.6605, + "step": 1625 + }, + { + "epoch": 0.8672, + "grad_norm": 0.399395936860529, + "learning_rate": 9.115273765538202e-06, + "loss": 0.6525, + "step": 1626 + }, + { + "epoch": 0.8677333333333334, + "grad_norm": 0.3648858569255673, + "learning_rate": 9.043327563322112e-06, + "loss": 0.6127, + "step": 1627 + }, + { + "epoch": 0.8682666666666666, + "grad_norm": 0.4425814065357602, + "learning_rate": 8.971652971536148e-06, + "loss": 0.6974, + "step": 1628 + }, + { + "epoch": 0.8688, + "grad_norm": 0.479016021723167, + "learning_rate": 8.900250204211514e-06, + "loss": 0.6837, + "step": 1629 + }, + { + "epoch": 0.8693333333333333, + "grad_norm": 0.44310138485519696, + "learning_rate": 8.829119474567671e-06, + "loss": 0.6285, + "step": 1630 + }, + { + "epoch": 0.8698666666666667, + "grad_norm": 0.4340924327388896, + "learning_rate": 8.758260995011825e-06, + "loss": 0.6982, + "step": 1631 + }, + { + "epoch": 0.8704, + "grad_norm": 0.43148264237357026, + "learning_rate": 8.687674977138116e-06, + "loss": 0.6611, + "step": 1632 + }, + { + "epoch": 0.8709333333333333, + "grad_norm": 0.48868884574412247, + "learning_rate": 8.617361631727138e-06, + "loss": 0.7426, + "step": 1633 + }, + { + "epoch": 0.8714666666666666, + "grad_norm": 0.4120744248577442, + "learning_rate": 8.547321168745193e-06, + "loss": 0.6261, + "step": 1634 + }, + { + "epoch": 0.872, + "grad_norm": 0.6190391045957169, + "learning_rate": 8.47755379734373e-06, + "loss": 0.7688, + "step": 1635 + }, + { + "epoch": 0.8725333333333334, + "grad_norm": 0.4049601799461113, + "learning_rate": 8.408059725858719e-06, + "loss": 0.6023, + "step": 1636 + }, + { + "epoch": 0.8730666666666667, + "grad_norm": 0.39554572306685953, + "learning_rate": 8.338839161809997e-06, + "loss": 0.686, + "step": 1637 + }, + { + "epoch": 0.8736, + "grad_norm": 0.4286225445682662, + "learning_rate": 8.269892311900696e-06, + "loss": 0.6289, + "step": 1638 + }, + { + "epoch": 0.8741333333333333, + "grad_norm": 0.4845861275686903, + "learning_rate": 8.201219382016556e-06, + "loss": 0.626, + "step": 1639 + }, + { + "epoch": 0.8746666666666667, + "grad_norm": 0.3963837981727752, + "learning_rate": 8.132820577225387e-06, + "loss": 0.5965, + "step": 1640 + }, + { + "epoch": 0.8752, + "grad_norm": 0.4134911418769395, + "learning_rate": 8.064696101776358e-06, + "loss": 0.6541, + "step": 1641 + }, + { + "epoch": 0.8757333333333334, + "grad_norm": 0.42104755345183614, + "learning_rate": 7.996846159099557e-06, + "loss": 0.6809, + "step": 1642 + }, + { + "epoch": 0.8762666666666666, + "grad_norm": 0.4259978224833793, + "learning_rate": 7.929270951805178e-06, + "loss": 0.7122, + "step": 1643 + }, + { + "epoch": 0.8768, + "grad_norm": 0.4380553719130741, + "learning_rate": 7.861970681683051e-06, + "loss": 0.6813, + "step": 1644 + }, + { + "epoch": 0.8773333333333333, + "grad_norm": 0.4273175887634649, + "learning_rate": 7.794945549701993e-06, + "loss": 0.643, + "step": 1645 + }, + { + "epoch": 0.8778666666666667, + "grad_norm": 0.6612426177197686, + "learning_rate": 7.728195756009204e-06, + "loss": 0.6687, + "step": 1646 + }, + { + "epoch": 0.8784, + "grad_norm": 0.33655424416475366, + "learning_rate": 7.661721499929753e-06, + "loss": 0.6255, + "step": 1647 + }, + { + "epoch": 0.8789333333333333, + "grad_norm": 0.4771674213689379, + "learning_rate": 7.595522979965819e-06, + "loss": 0.7404, + "step": 1648 + }, + { + "epoch": 0.8794666666666666, + "grad_norm": 0.3952446378554752, + "learning_rate": 7.529600393796232e-06, + "loss": 0.6337, + "step": 1649 + }, + { + "epoch": 0.88, + "grad_norm": 0.41448139569062153, + "learning_rate": 7.463953938275858e-06, + "loss": 0.6876, + "step": 1650 + }, + { + "epoch": 0.8805333333333333, + "grad_norm": 0.4288409230911495, + "learning_rate": 7.3985838094349444e-06, + "loss": 0.7102, + "step": 1651 + }, + { + "epoch": 0.8810666666666667, + "grad_norm": 0.3783481327021028, + "learning_rate": 7.333490202478666e-06, + "loss": 0.6534, + "step": 1652 + }, + { + "epoch": 0.8816, + "grad_norm": 0.4542569647870413, + "learning_rate": 7.2686733117863784e-06, + "loss": 0.6881, + "step": 1653 + }, + { + "epoch": 0.8821333333333333, + "grad_norm": 0.4006263879464807, + "learning_rate": 7.204133330911178e-06, + "loss": 0.694, + "step": 1654 + }, + { + "epoch": 0.8826666666666667, + "grad_norm": 0.453401714144608, + "learning_rate": 7.1398704525792e-06, + "loss": 0.5955, + "step": 1655 + }, + { + "epoch": 0.8832, + "grad_norm": 0.44396075332125057, + "learning_rate": 7.07588486868922e-06, + "loss": 0.6449, + "step": 1656 + }, + { + "epoch": 0.8837333333333334, + "grad_norm": 0.464087674884783, + "learning_rate": 7.012176770311862e-06, + "loss": 0.6529, + "step": 1657 + }, + { + "epoch": 0.8842666666666666, + "grad_norm": 0.46665628649754415, + "learning_rate": 6.948746347689183e-06, + "loss": 0.6134, + "step": 1658 + }, + { + "epoch": 0.8848, + "grad_norm": 0.4267206777431721, + "learning_rate": 6.8855937902340576e-06, + "loss": 0.6107, + "step": 1659 + }, + { + "epoch": 0.8853333333333333, + "grad_norm": 0.45369038854807936, + "learning_rate": 6.8227192865295995e-06, + "loss": 0.6156, + "step": 1660 + }, + { + "epoch": 0.8858666666666667, + "grad_norm": 0.49824396400889714, + "learning_rate": 6.760123024328624e-06, + "loss": 0.7031, + "step": 1661 + }, + { + "epoch": 0.8864, + "grad_norm": 0.41115424024889685, + "learning_rate": 6.6978051905530855e-06, + "loss": 0.5949, + "step": 1662 + }, + { + "epoch": 0.8869333333333334, + "grad_norm": 0.43144386097553383, + "learning_rate": 6.635765971293484e-06, + "loss": 0.5647, + "step": 1663 + }, + { + "epoch": 0.8874666666666666, + "grad_norm": 0.42295463970623487, + "learning_rate": 6.5740055518083375e-06, + "loss": 0.6755, + "step": 1664 + }, + { + "epoch": 0.888, + "grad_norm": 0.41488207349200906, + "learning_rate": 6.512524116523633e-06, + "loss": 0.6194, + "step": 1665 + }, + { + "epoch": 0.8885333333333333, + "grad_norm": 0.47559587547989834, + "learning_rate": 6.451321849032288e-06, + "loss": 0.6773, + "step": 1666 + }, + { + "epoch": 0.8890666666666667, + "grad_norm": 0.42148859792374616, + "learning_rate": 6.390398932093555e-06, + "loss": 0.6323, + "step": 1667 + }, + { + "epoch": 0.8896, + "grad_norm": 0.5122183779481083, + "learning_rate": 6.329755547632499e-06, + "loss": 0.6266, + "step": 1668 + }, + { + "epoch": 0.8901333333333333, + "grad_norm": 0.44424699537254064, + "learning_rate": 6.269391876739495e-06, + "loss": 0.63, + "step": 1669 + }, + { + "epoch": 0.8906666666666667, + "grad_norm": 0.442110611521818, + "learning_rate": 6.209308099669597e-06, + "loss": 0.6242, + "step": 1670 + }, + { + "epoch": 0.8912, + "grad_norm": 0.4545369208026764, + "learning_rate": 6.149504395842087e-06, + "loss": 0.727, + "step": 1671 + }, + { + "epoch": 0.8917333333333334, + "grad_norm": 0.5105146615317737, + "learning_rate": 6.089980943839924e-06, + "loss": 0.5459, + "step": 1672 + }, + { + "epoch": 0.8922666666666667, + "grad_norm": 0.49004569063844167, + "learning_rate": 6.030737921409169e-06, + "loss": 0.6407, + "step": 1673 + }, + { + "epoch": 0.8928, + "grad_norm": 0.4359967625922056, + "learning_rate": 5.971775505458444e-06, + "loss": 0.652, + "step": 1674 + }, + { + "epoch": 0.8933333333333333, + "grad_norm": 0.4677743393043125, + "learning_rate": 5.913093872058528e-06, + "loss": 0.7219, + "step": 1675 + }, + { + "epoch": 0.8938666666666667, + "grad_norm": 0.42052686908942477, + "learning_rate": 5.854693196441641e-06, + "loss": 0.6367, + "step": 1676 + }, + { + "epoch": 0.8944, + "grad_norm": 0.5224132859527966, + "learning_rate": 5.7965736530010916e-06, + "loss": 0.6171, + "step": 1677 + }, + { + "epoch": 0.8949333333333334, + "grad_norm": 0.4200233190413443, + "learning_rate": 5.738735415290642e-06, + "loss": 0.6195, + "step": 1678 + }, + { + "epoch": 0.8954666666666666, + "grad_norm": 0.525695660376052, + "learning_rate": 5.681178656024055e-06, + "loss": 0.7904, + "step": 1679 + }, + { + "epoch": 0.896, + "grad_norm": 0.40558947882316576, + "learning_rate": 5.623903547074549e-06, + "loss": 0.6588, + "step": 1680 + }, + { + "epoch": 0.8965333333333333, + "grad_norm": 0.5103020883576876, + "learning_rate": 5.566910259474289e-06, + "loss": 0.657, + "step": 1681 + }, + { + "epoch": 0.8970666666666667, + "grad_norm": 0.3873266453233226, + "learning_rate": 5.510198963413881e-06, + "loss": 0.5752, + "step": 1682 + }, + { + "epoch": 0.8976, + "grad_norm": 0.4285241457568737, + "learning_rate": 5.453769828241872e-06, + "loss": 0.6976, + "step": 1683 + }, + { + "epoch": 0.8981333333333333, + "grad_norm": 0.40857593789774727, + "learning_rate": 5.397623022464226e-06, + "loss": 0.6062, + "step": 1684 + }, + { + "epoch": 0.8986666666666666, + "grad_norm": 0.5410195800482295, + "learning_rate": 5.341758713743828e-06, + "loss": 0.7412, + "step": 1685 + }, + { + "epoch": 0.8992, + "grad_norm": 0.40797808336174124, + "learning_rate": 5.286177068899989e-06, + "loss": 0.6788, + "step": 1686 + }, + { + "epoch": 0.8997333333333334, + "grad_norm": 0.560901318464611, + "learning_rate": 5.230878253907912e-06, + "loss": 0.7575, + "step": 1687 + }, + { + "epoch": 0.9002666666666667, + "grad_norm": 0.5212007528527816, + "learning_rate": 5.175862433898282e-06, + "loss": 0.6335, + "step": 1688 + }, + { + "epoch": 0.9008, + "grad_norm": 0.5186025566667148, + "learning_rate": 5.121129773156663e-06, + "loss": 0.6791, + "step": 1689 + }, + { + "epoch": 0.9013333333333333, + "grad_norm": 0.362825359016172, + "learning_rate": 5.066680435123106e-06, + "loss": 0.6582, + "step": 1690 + }, + { + "epoch": 0.9018666666666667, + "grad_norm": 0.4175997612171381, + "learning_rate": 5.012514582391592e-06, + "loss": 0.7087, + "step": 1691 + }, + { + "epoch": 0.9024, + "grad_norm": 0.4361299595187199, + "learning_rate": 4.95863237670956e-06, + "loss": 0.6422, + "step": 1692 + }, + { + "epoch": 0.9029333333333334, + "grad_norm": 0.40379275500431583, + "learning_rate": 4.905033978977491e-06, + "loss": 0.6474, + "step": 1693 + }, + { + "epoch": 0.9034666666666666, + "grad_norm": 0.5432519210652561, + "learning_rate": 4.851719549248301e-06, + "loss": 0.7521, + "step": 1694 + }, + { + "epoch": 0.904, + "grad_norm": 0.49489904176517713, + "learning_rate": 4.798689246727006e-06, + "loss": 0.726, + "step": 1695 + }, + { + "epoch": 0.9045333333333333, + "grad_norm": 0.4778114674274999, + "learning_rate": 4.745943229770122e-06, + "loss": 0.6466, + "step": 1696 + }, + { + "epoch": 0.9050666666666667, + "grad_norm": 0.4933705071723949, + "learning_rate": 4.693481655885257e-06, + "loss": 0.665, + "step": 1697 + }, + { + "epoch": 0.9056, + "grad_norm": 0.5589503002222523, + "learning_rate": 4.641304681730641e-06, + "loss": 0.7616, + "step": 1698 + }, + { + "epoch": 0.9061333333333333, + "grad_norm": 0.47525552375930347, + "learning_rate": 4.58941246311464e-06, + "loss": 0.6679, + "step": 1699 + }, + { + "epoch": 0.9066666666666666, + "grad_norm": 0.4113964591102893, + "learning_rate": 4.537805154995278e-06, + "loss": 0.6147, + "step": 1700 + }, + { + "epoch": 0.9072, + "grad_norm": 0.3689891073538882, + "learning_rate": 4.486482911479839e-06, + "loss": 0.6669, + "step": 1701 + }, + { + "epoch": 0.9077333333333333, + "grad_norm": 0.4628204941072679, + "learning_rate": 4.435445885824285e-06, + "loss": 0.6799, + "step": 1702 + }, + { + "epoch": 0.9082666666666667, + "grad_norm": 0.4468459877387718, + "learning_rate": 4.384694230432984e-06, + "loss": 0.6032, + "step": 1703 + }, + { + "epoch": 0.9088, + "grad_norm": 0.3947347800824203, + "learning_rate": 4.3342280968580285e-06, + "loss": 0.6312, + "step": 1704 + }, + { + "epoch": 0.9093333333333333, + "grad_norm": 0.5701409670608103, + "learning_rate": 4.2840476357989825e-06, + "loss": 0.7194, + "step": 1705 + }, + { + "epoch": 0.9098666666666667, + "grad_norm": 0.4151374112707517, + "learning_rate": 4.2341529971023255e-06, + "loss": 0.6649, + "step": 1706 + }, + { + "epoch": 0.9104, + "grad_norm": 0.4793828838725344, + "learning_rate": 4.184544329761009e-06, + "loss": 0.718, + "step": 1707 + }, + { + "epoch": 0.9109333333333334, + "grad_norm": 0.7068717534009299, + "learning_rate": 4.135221781914034e-06, + "loss": 0.6686, + "step": 1708 + }, + { + "epoch": 0.9114666666666666, + "grad_norm": 0.47401068139669833, + "learning_rate": 4.0861855008460405e-06, + "loss": 0.6603, + "step": 1709 + }, + { + "epoch": 0.912, + "grad_norm": 0.44061731871766496, + "learning_rate": 4.037435632986786e-06, + "loss": 0.5891, + "step": 1710 + }, + { + "epoch": 0.9125333333333333, + "grad_norm": 0.4207110761408695, + "learning_rate": 3.988972323910778e-06, + "loss": 0.6806, + "step": 1711 + }, + { + "epoch": 0.9130666666666667, + "grad_norm": 0.3874035896421297, + "learning_rate": 3.9407957183368095e-06, + "loss": 0.59, + "step": 1712 + }, + { + "epoch": 0.9136, + "grad_norm": 0.40253686610510786, + "learning_rate": 3.892905960127546e-06, + "loss": 0.556, + "step": 1713 + }, + { + "epoch": 0.9141333333333334, + "grad_norm": 0.5415411228332792, + "learning_rate": 3.845303192289074e-06, + "loss": 0.6956, + "step": 1714 + }, + { + "epoch": 0.9146666666666666, + "grad_norm": 0.3945963732370113, + "learning_rate": 3.797987556970495e-06, + "loss": 0.5793, + "step": 1715 + }, + { + "epoch": 0.9152, + "grad_norm": 0.4366079996575693, + "learning_rate": 3.750959195463466e-06, + "loss": 0.6568, + "step": 1716 + }, + { + "epoch": 0.9157333333333333, + "grad_norm": 0.44634155267347175, + "learning_rate": 3.7042182482018075e-06, + "loss": 0.6857, + "step": 1717 + }, + { + "epoch": 0.9162666666666667, + "grad_norm": 0.4115469207308905, + "learning_rate": 3.6577648547611033e-06, + "loss": 0.609, + "step": 1718 + }, + { + "epoch": 0.9168, + "grad_norm": 0.5419266391289995, + "learning_rate": 3.611599153858214e-06, + "loss": 0.7071, + "step": 1719 + }, + { + "epoch": 0.9173333333333333, + "grad_norm": 0.47637699928223176, + "learning_rate": 3.565721283350931e-06, + "loss": 0.698, + "step": 1720 + }, + { + "epoch": 0.9178666666666667, + "grad_norm": 0.5723400801568562, + "learning_rate": 3.5201313802375456e-06, + "loss": 0.7531, + "step": 1721 + }, + { + "epoch": 0.9184, + "grad_norm": 0.4009591922291512, + "learning_rate": 3.4748295806564356e-06, + "loss": 0.6807, + "step": 1722 + }, + { + "epoch": 0.9189333333333334, + "grad_norm": 0.4331252958668281, + "learning_rate": 3.4298160198856568e-06, + "loss": 0.6509, + "step": 1723 + }, + { + "epoch": 0.9194666666666667, + "grad_norm": 0.399769059999911, + "learning_rate": 3.3850908323424967e-06, + "loss": 0.6546, + "step": 1724 + }, + { + "epoch": 0.92, + "grad_norm": 0.45085868281053915, + "learning_rate": 3.3406541515832003e-06, + "loss": 0.7071, + "step": 1725 + }, + { + "epoch": 0.9205333333333333, + "grad_norm": 0.5514151785507176, + "learning_rate": 3.296506110302422e-06, + "loss": 0.752, + "step": 1726 + }, + { + "epoch": 0.9210666666666667, + "grad_norm": 0.3959419936368739, + "learning_rate": 3.252646840332918e-06, + "loss": 0.6068, + "step": 1727 + }, + { + "epoch": 0.9216, + "grad_norm": 0.4483500145944997, + "learning_rate": 3.209076472645112e-06, + "loss": 0.6213, + "step": 1728 + }, + { + "epoch": 0.9221333333333334, + "grad_norm": 0.36596013283213197, + "learning_rate": 3.1657951373467497e-06, + "loss": 0.6135, + "step": 1729 + }, + { + "epoch": 0.9226666666666666, + "grad_norm": 0.3603482051100899, + "learning_rate": 3.1228029636824475e-06, + "loss": 0.6312, + "step": 1730 + }, + { + "epoch": 0.9232, + "grad_norm": 0.3998640794375178, + "learning_rate": 3.0801000800333877e-06, + "loss": 0.6364, + "step": 1731 + }, + { + "epoch": 0.9237333333333333, + "grad_norm": 0.428649379549905, + "learning_rate": 3.037686613916857e-06, + "loss": 0.6464, + "step": 1732 + }, + { + "epoch": 0.9242666666666667, + "grad_norm": 0.390593369924257, + "learning_rate": 2.995562691985898e-06, + "loss": 0.6983, + "step": 1733 + }, + { + "epoch": 0.9248, + "grad_norm": 0.6016004599654001, + "learning_rate": 2.9537284400289355e-06, + "loss": 0.7652, + "step": 1734 + }, + { + "epoch": 0.9253333333333333, + "grad_norm": 0.5882288235315833, + "learning_rate": 2.912183982969385e-06, + "loss": 0.6941, + "step": 1735 + }, + { + "epoch": 0.9258666666666666, + "grad_norm": 0.4613460981333321, + "learning_rate": 2.8709294448653225e-06, + "loss": 0.6764, + "step": 1736 + }, + { + "epoch": 0.9264, + "grad_norm": 0.3898156569663358, + "learning_rate": 2.8299649489090475e-06, + "loss": 0.6392, + "step": 1737 + }, + { + "epoch": 0.9269333333333334, + "grad_norm": 0.4382461918130465, + "learning_rate": 2.789290617426765e-06, + "loss": 0.6724, + "step": 1738 + }, + { + "epoch": 0.9274666666666667, + "grad_norm": 0.5305696960018302, + "learning_rate": 2.748906571878207e-06, + "loss": 0.687, + "step": 1739 + }, + { + "epoch": 0.928, + "grad_norm": 0.4187652465149059, + "learning_rate": 2.708812932856253e-06, + "loss": 0.6663, + "step": 1740 + }, + { + "epoch": 0.9285333333333333, + "grad_norm": 0.5111558907436493, + "learning_rate": 2.6690098200866098e-06, + "loss": 0.6815, + "step": 1741 + }, + { + "epoch": 0.9290666666666667, + "grad_norm": 0.34906468871197094, + "learning_rate": 2.6294973524274125e-06, + "loss": 0.5988, + "step": 1742 + }, + { + "epoch": 0.9296, + "grad_norm": 0.42478348658580695, + "learning_rate": 2.590275647868867e-06, + "loss": 0.6394, + "step": 1743 + }, + { + "epoch": 0.9301333333333334, + "grad_norm": 0.33392343235361127, + "learning_rate": 2.551344823532964e-06, + "loss": 0.5464, + "step": 1744 + }, + { + "epoch": 0.9306666666666666, + "grad_norm": 0.42316395196008005, + "learning_rate": 2.5127049956730207e-06, + "loss": 0.639, + "step": 1745 + }, + { + "epoch": 0.9312, + "grad_norm": 0.43602319797551964, + "learning_rate": 2.4743562796734622e-06, + "loss": 0.6127, + "step": 1746 + }, + { + "epoch": 0.9317333333333333, + "grad_norm": 0.34533336200222786, + "learning_rate": 2.436298790049363e-06, + "loss": 0.5881, + "step": 1747 + }, + { + "epoch": 0.9322666666666667, + "grad_norm": 0.3875861538559915, + "learning_rate": 2.3985326404461604e-06, + "loss": 0.6516, + "step": 1748 + }, + { + "epoch": 0.9328, + "grad_norm": 0.493267360371915, + "learning_rate": 2.3610579436393e-06, + "loss": 0.6516, + "step": 1749 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 0.41237596426549245, + "learning_rate": 2.3238748115339324e-06, + "loss": 0.6859, + "step": 1750 + }, + { + "epoch": 0.9338666666666666, + "grad_norm": 0.4083831627195735, + "learning_rate": 2.286983355164529e-06, + "loss": 0.6716, + "step": 1751 + }, + { + "epoch": 0.9344, + "grad_norm": 0.4115879104570026, + "learning_rate": 2.250383684694579e-06, + "loss": 0.6518, + "step": 1752 + }, + { + "epoch": 0.9349333333333333, + "grad_norm": 0.47996363631082506, + "learning_rate": 2.2140759094162467e-06, + "loss": 0.5994, + "step": 1753 + }, + { + "epoch": 0.9354666666666667, + "grad_norm": 0.41235060776600496, + "learning_rate": 2.178060137750071e-06, + "loss": 0.6065, + "step": 1754 + }, + { + "epoch": 0.936, + "grad_norm": 0.39209566990715516, + "learning_rate": 2.1423364772445887e-06, + "loss": 0.6153, + "step": 1755 + }, + { + "epoch": 0.9365333333333333, + "grad_norm": 0.39731442729571154, + "learning_rate": 2.106905034576112e-06, + "loss": 0.6739, + "step": 1756 + }, + { + "epoch": 0.9370666666666667, + "grad_norm": 0.5135213180159308, + "learning_rate": 2.0717659155482738e-06, + "loss": 0.7119, + "step": 1757 + }, + { + "epoch": 0.9376, + "grad_norm": 0.42965918561188526, + "learning_rate": 2.036919225091827e-06, + "loss": 0.6858, + "step": 1758 + }, + { + "epoch": 0.9381333333333334, + "grad_norm": 0.40389515210214166, + "learning_rate": 2.002365067264289e-06, + "loss": 0.5784, + "step": 1759 + }, + { + "epoch": 0.9386666666666666, + "grad_norm": 0.5590010970100625, + "learning_rate": 1.968103545249611e-06, + "loss": 0.649, + "step": 1760 + }, + { + "epoch": 0.9392, + "grad_norm": 0.41491347188958494, + "learning_rate": 1.9341347613579087e-06, + "loss": 0.6251, + "step": 1761 + }, + { + "epoch": 0.9397333333333333, + "grad_norm": 0.45496219950380323, + "learning_rate": 1.900458817025097e-06, + "loss": 0.5803, + "step": 1762 + }, + { + "epoch": 0.9402666666666667, + "grad_norm": 0.44954579665324645, + "learning_rate": 1.8670758128126909e-06, + "loss": 0.6048, + "step": 1763 + }, + { + "epoch": 0.9408, + "grad_norm": 0.4134655703681969, + "learning_rate": 1.8339858484073935e-06, + "loss": 0.6379, + "step": 1764 + }, + { + "epoch": 0.9413333333333334, + "grad_norm": 0.46066883694485544, + "learning_rate": 1.8011890226208527e-06, + "loss": 0.6663, + "step": 1765 + }, + { + "epoch": 0.9418666666666666, + "grad_norm": 0.3984938279054454, + "learning_rate": 1.7686854333893833e-06, + "loss": 0.6545, + "step": 1766 + }, + { + "epoch": 0.9424, + "grad_norm": 0.39823108645855065, + "learning_rate": 1.7364751777736332e-06, + "loss": 0.6879, + "step": 1767 + }, + { + "epoch": 0.9429333333333333, + "grad_norm": 0.7422035044982245, + "learning_rate": 1.7045583519583074e-06, + "loss": 0.6672, + "step": 1768 + }, + { + "epoch": 0.9434666666666667, + "grad_norm": 0.517550103030721, + "learning_rate": 1.6729350512519005e-06, + "loss": 0.7166, + "step": 1769 + }, + { + "epoch": 0.944, + "grad_norm": 0.4406193421538898, + "learning_rate": 1.6416053700863964e-06, + "loss": 0.6439, + "step": 1770 + }, + { + "epoch": 0.9445333333333333, + "grad_norm": 0.44058810187732483, + "learning_rate": 1.6105694020169593e-06, + "loss": 0.6405, + "step": 1771 + }, + { + "epoch": 0.9450666666666667, + "grad_norm": 0.3964524908097487, + "learning_rate": 1.5798272397217095e-06, + "loss": 0.6255, + "step": 1772 + }, + { + "epoch": 0.9456, + "grad_norm": 0.43469904654814184, + "learning_rate": 1.5493789750014031e-06, + "loss": 0.6235, + "step": 1773 + }, + { + "epoch": 0.9461333333333334, + "grad_norm": 0.5106227882923048, + "learning_rate": 1.5192246987791981e-06, + "loss": 0.6526, + "step": 1774 + }, + { + "epoch": 0.9466666666666667, + "grad_norm": 0.70962295043018, + "learning_rate": 1.489364501100332e-06, + "loss": 0.7534, + "step": 1775 + }, + { + "epoch": 0.9472, + "grad_norm": 0.4423402123776244, + "learning_rate": 1.459798471131868e-06, + "loss": 0.6327, + "step": 1776 + }, + { + "epoch": 0.9477333333333333, + "grad_norm": 0.45116531658470177, + "learning_rate": 1.430526697162482e-06, + "loss": 0.6964, + "step": 1777 + }, + { + "epoch": 0.9482666666666667, + "grad_norm": 0.5864062161055378, + "learning_rate": 1.4015492666021312e-06, + "loss": 0.6849, + "step": 1778 + }, + { + "epoch": 0.9488, + "grad_norm": 0.4509488119478894, + "learning_rate": 1.3728662659818204e-06, + "loss": 0.6471, + "step": 1779 + }, + { + "epoch": 0.9493333333333334, + "grad_norm": 0.4381582202926343, + "learning_rate": 1.344477780953346e-06, + "loss": 0.6643, + "step": 1780 + }, + { + "epoch": 0.9498666666666666, + "grad_norm": 0.4237960794460431, + "learning_rate": 1.3163838962890195e-06, + "loss": 0.6879, + "step": 1781 + }, + { + "epoch": 0.9504, + "grad_norm": 0.4374546398625835, + "learning_rate": 1.2885846958814673e-06, + "loss": 0.6416, + "step": 1782 + }, + { + "epoch": 0.9509333333333333, + "grad_norm": 0.5221175916812671, + "learning_rate": 1.261080262743297e-06, + "loss": 0.639, + "step": 1783 + }, + { + "epoch": 0.9514666666666667, + "grad_norm": 0.356264189927237, + "learning_rate": 1.2338706790069431e-06, + "loss": 0.5445, + "step": 1784 + }, + { + "epoch": 0.952, + "grad_norm": 0.38973756787229913, + "learning_rate": 1.2069560259243328e-06, + "loss": 0.5965, + "step": 1785 + }, + { + "epoch": 0.9525333333333333, + "grad_norm": 0.543973752427358, + "learning_rate": 1.1803363838667092e-06, + "loss": 0.805, + "step": 1786 + }, + { + "epoch": 0.9530666666666666, + "grad_norm": 0.42979347962101977, + "learning_rate": 1.1540118323243865e-06, + "loss": 0.6575, + "step": 1787 + }, + { + "epoch": 0.9536, + "grad_norm": 0.5042407052213125, + "learning_rate": 1.1279824499064396e-06, + "loss": 0.6405, + "step": 1788 + }, + { + "epoch": 0.9541333333333334, + "grad_norm": 0.5241413589166609, + "learning_rate": 1.1022483143405705e-06, + "loss": 0.674, + "step": 1789 + }, + { + "epoch": 0.9546666666666667, + "grad_norm": 0.408613955885076, + "learning_rate": 1.076809502472831e-06, + "loss": 0.7184, + "step": 1790 + }, + { + "epoch": 0.9552, + "grad_norm": 0.4379860229388707, + "learning_rate": 1.0516660902673448e-06, + "loss": 0.6262, + "step": 1791 + }, + { + "epoch": 0.9557333333333333, + "grad_norm": 0.4323480463923861, + "learning_rate": 1.0268181528061749e-06, + "loss": 0.6519, + "step": 1792 + }, + { + "epoch": 0.9562666666666667, + "grad_norm": 0.38506660557803357, + "learning_rate": 1.0022657642890231e-06, + "loss": 0.6361, + "step": 1793 + }, + { + "epoch": 0.9568, + "grad_norm": 0.37880504506736284, + "learning_rate": 9.780089980330642e-07, + "loss": 0.6247, + "step": 1794 + }, + { + "epoch": 0.9573333333333334, + "grad_norm": 0.39685044634962247, + "learning_rate": 9.540479264726676e-07, + "loss": 0.5881, + "step": 1795 + }, + { + "epoch": 0.9578666666666666, + "grad_norm": 0.4194357098536144, + "learning_rate": 9.303826211592315e-07, + "loss": 0.6035, + "step": 1796 + }, + { + "epoch": 0.9584, + "grad_norm": 0.43625634338608676, + "learning_rate": 9.070131527609604e-07, + "loss": 0.6912, + "step": 1797 + }, + { + "epoch": 0.9589333333333333, + "grad_norm": 0.4027428662526194, + "learning_rate": 8.839395910626213e-07, + "loss": 0.6952, + "step": 1798 + }, + { + "epoch": 0.9594666666666667, + "grad_norm": 0.42632449428971303, + "learning_rate": 8.611620049653879e-07, + "loss": 0.6388, + "step": 1799 + }, + { + "epoch": 0.96, + "grad_norm": 0.5337207026513404, + "learning_rate": 8.386804624865851e-07, + "loss": 0.7587, + "step": 1800 + }, + { + "epoch": 0.9605333333333334, + "grad_norm": 0.3697182493057979, + "learning_rate": 8.16495030759501e-07, + "loss": 0.5743, + "step": 1801 + }, + { + "epoch": 0.9610666666666666, + "grad_norm": 0.4531441112225663, + "learning_rate": 7.946057760332193e-07, + "loss": 0.681, + "step": 1802 + }, + { + "epoch": 0.9616, + "grad_norm": 0.3584300034986864, + "learning_rate": 7.730127636723539e-07, + "loss": 0.618, + "step": 1803 + }, + { + "epoch": 0.9621333333333333, + "grad_norm": 0.4226329000909182, + "learning_rate": 7.517160581569372e-07, + "loss": 0.7039, + "step": 1804 + }, + { + "epoch": 0.9626666666666667, + "grad_norm": 0.42418772454024434, + "learning_rate": 7.307157230821426e-07, + "loss": 0.6533, + "step": 1805 + }, + { + "epoch": 0.9632, + "grad_norm": 0.405422813829527, + "learning_rate": 7.100118211581852e-07, + "loss": 0.6175, + "step": 1806 + }, + { + "epoch": 0.9637333333333333, + "grad_norm": 0.45982871735958847, + "learning_rate": 6.896044142100433e-07, + "loss": 0.6622, + "step": 1807 + }, + { + "epoch": 0.9642666666666667, + "grad_norm": 0.4975151975727332, + "learning_rate": 6.694935631773258e-07, + "loss": 0.6809, + "step": 1808 + }, + { + "epoch": 0.9648, + "grad_norm": 0.5625783691772048, + "learning_rate": 6.496793281141056e-07, + "loss": 0.6318, + "step": 1809 + }, + { + "epoch": 0.9653333333333334, + "grad_norm": 0.4858590432799323, + "learning_rate": 6.301617681886863e-07, + "loss": 0.751, + "step": 1810 + }, + { + "epoch": 0.9658666666666667, + "grad_norm": 0.4433252691438286, + "learning_rate": 6.109409416834688e-07, + "loss": 0.6705, + "step": 1811 + }, + { + "epoch": 0.9664, + "grad_norm": 0.5128465720864208, + "learning_rate": 5.920169059947411e-07, + "loss": 0.6579, + "step": 1812 + }, + { + "epoch": 0.9669333333333333, + "grad_norm": 0.4935130039822559, + "learning_rate": 5.733897176325665e-07, + "loss": 0.6647, + "step": 1813 + }, + { + "epoch": 0.9674666666666667, + "grad_norm": 0.3941506153553523, + "learning_rate": 5.550594322205504e-07, + "loss": 0.6243, + "step": 1814 + }, + { + "epoch": 0.968, + "grad_norm": 0.4136221075040379, + "learning_rate": 5.370261044956971e-07, + "loss": 0.6288, + "step": 1815 + }, + { + "epoch": 0.9685333333333334, + "grad_norm": 0.43436322990507925, + "learning_rate": 5.192897883082747e-07, + "loss": 0.6246, + "step": 1816 + }, + { + "epoch": 0.9690666666666666, + "grad_norm": 0.45907323619964835, + "learning_rate": 5.018505366216175e-07, + "loss": 0.5889, + "step": 1817 + }, + { + "epoch": 0.9696, + "grad_norm": 0.41713400266272166, + "learning_rate": 4.847084015119574e-07, + "loss": 0.6305, + "step": 1818 + }, + { + "epoch": 0.9701333333333333, + "grad_norm": 0.3953181357796512, + "learning_rate": 4.678634341683252e-07, + "loss": 0.6505, + "step": 1819 + }, + { + "epoch": 0.9706666666666667, + "grad_norm": 0.3943289612083844, + "learning_rate": 4.5131568489236166e-07, + "loss": 0.6478, + "step": 1820 + }, + { + "epoch": 0.9712, + "grad_norm": 0.47474097580682073, + "learning_rate": 4.3506520309813947e-07, + "loss": 0.6493, + "step": 1821 + }, + { + "epoch": 0.9717333333333333, + "grad_norm": 0.5115671907494317, + "learning_rate": 4.191120373120749e-07, + "loss": 0.701, + "step": 1822 + }, + { + "epoch": 0.9722666666666666, + "grad_norm": 0.41664514407348774, + "learning_rate": 4.034562351727389e-07, + "loss": 0.581, + "step": 1823 + }, + { + "epoch": 0.9728, + "grad_norm": 0.4034158594351879, + "learning_rate": 3.8809784343072366e-07, + "loss": 0.7405, + "step": 1824 + }, + { + "epoch": 0.9733333333333334, + "grad_norm": 0.40086384740895586, + "learning_rate": 3.73036907948543e-07, + "loss": 0.6511, + "step": 1825 + }, + { + "epoch": 0.9738666666666667, + "grad_norm": 0.5085314721751837, + "learning_rate": 3.582734737004101e-07, + "loss": 0.6701, + "step": 1826 + }, + { + "epoch": 0.9744, + "grad_norm": 0.3627036576986965, + "learning_rate": 3.4380758477219333e-07, + "loss": 0.662, + "step": 1827 + }, + { + "epoch": 0.9749333333333333, + "grad_norm": 0.3969669843973643, + "learning_rate": 3.296392843612273e-07, + "loss": 0.6636, + "step": 1828 + }, + { + "epoch": 0.9754666666666667, + "grad_norm": 0.4279935043259893, + "learning_rate": 3.1576861477621287e-07, + "loss": 0.6628, + "step": 1829 + }, + { + "epoch": 0.976, + "grad_norm": 0.46402161198748143, + "learning_rate": 3.0219561743707326e-07, + "loss": 0.6347, + "step": 1830 + }, + { + "epoch": 0.9765333333333334, + "grad_norm": 0.4316028438126866, + "learning_rate": 2.889203328748424e-07, + "loss": 0.6276, + "step": 1831 + }, + { + "epoch": 0.9770666666666666, + "grad_norm": 0.4873488395493903, + "learning_rate": 2.759428007315212e-07, + "loss": 0.6645, + "step": 1832 + }, + { + "epoch": 0.9776, + "grad_norm": 0.3930856325704254, + "learning_rate": 2.6326305976001055e-07, + "loss": 0.5831, + "step": 1833 + }, + { + "epoch": 0.9781333333333333, + "grad_norm": 0.44410962630873474, + "learning_rate": 2.5088114782392257e-07, + "loss": 0.664, + "step": 1834 + }, + { + "epoch": 0.9786666666666667, + "grad_norm": 0.5129387411644766, + "learning_rate": 2.3879710189753656e-07, + "loss": 0.6779, + "step": 1835 + }, + { + "epoch": 0.9792, + "grad_norm": 0.420044883229394, + "learning_rate": 2.2701095806565432e-07, + "loss": 0.5983, + "step": 1836 + }, + { + "epoch": 0.9797333333333333, + "grad_norm": 0.42475271473599874, + "learning_rate": 2.15522751523467e-07, + "loss": 0.5825, + "step": 1837 + }, + { + "epoch": 0.9802666666666666, + "grad_norm": 0.41883555551198826, + "learning_rate": 2.0433251657653308e-07, + "loss": 0.6679, + "step": 1838 + }, + { + "epoch": 0.9808, + "grad_norm": 0.4240520730595356, + "learning_rate": 1.9344028664056713e-07, + "loss": 0.6697, + "step": 1839 + }, + { + "epoch": 0.9813333333333333, + "grad_norm": 0.42882446982717437, + "learning_rate": 1.8284609424142895e-07, + "loss": 0.6238, + "step": 1840 + }, + { + "epoch": 0.9818666666666667, + "grad_norm": 0.45484800450889074, + "learning_rate": 1.7254997101500137e-07, + "loss": 0.6526, + "step": 1841 + }, + { + "epoch": 0.9824, + "grad_norm": 0.49953186634440994, + "learning_rate": 1.6255194770704586e-07, + "loss": 0.6902, + "step": 1842 + }, + { + "epoch": 0.9829333333333333, + "grad_norm": 0.44691847958582187, + "learning_rate": 1.5285205417319149e-07, + "loss": 0.6406, + "step": 1843 + }, + { + "epoch": 0.9834666666666667, + "grad_norm": 0.43100974826637617, + "learning_rate": 1.4345031937879062e-07, + "loss": 0.5908, + "step": 1844 + }, + { + "epoch": 0.984, + "grad_norm": 0.4631337704823913, + "learning_rate": 1.3434677139885222e-07, + "loss": 0.6253, + "step": 1845 + }, + { + "epoch": 0.9845333333333334, + "grad_norm": 0.42084410540376455, + "learning_rate": 1.255414374179531e-07, + "loss": 0.6199, + "step": 1846 + }, + { + "epoch": 0.9850666666666666, + "grad_norm": 0.4552591722852423, + "learning_rate": 1.170343437301491e-07, + "loss": 0.7056, + "step": 1847 + }, + { + "epoch": 0.9856, + "grad_norm": 0.3643185236453847, + "learning_rate": 1.0882551573891953e-07, + "loss": 0.5695, + "step": 1848 + }, + { + "epoch": 0.9861333333333333, + "grad_norm": 0.48570227590289416, + "learning_rate": 1.0091497795706728e-07, + "loss": 0.6864, + "step": 1849 + }, + { + "epoch": 0.9866666666666667, + "grad_norm": 0.532717249543278, + "learning_rate": 9.330275400666332e-08, + "loss": 0.6669, + "step": 1850 + }, + { + "epoch": 0.9872, + "grad_norm": 0.43681308469909813, + "learning_rate": 8.598886661895788e-08, + "loss": 0.6378, + "step": 1851 + }, + { + "epoch": 0.9877333333333334, + "grad_norm": 0.46099230085208504, + "learning_rate": 7.8973337634336e-08, + "loss": 0.6504, + "step": 1852 + }, + { + "epoch": 0.9882666666666666, + "grad_norm": 0.44090056851603815, + "learning_rate": 7.225618800222877e-08, + "loss": 0.7049, + "step": 1853 + }, + { + "epoch": 0.9888, + "grad_norm": 0.46180798760173736, + "learning_rate": 6.583743778106887e-08, + "loss": 0.596, + "step": 1854 + }, + { + "epoch": 0.9893333333333333, + "grad_norm": 0.4391562689172221, + "learning_rate": 5.971710613821291e-08, + "loss": 0.6667, + "step": 1855 + }, + { + "epoch": 0.9898666666666667, + "grad_norm": 0.8626806184707471, + "learning_rate": 5.389521134989695e-08, + "loss": 0.6449, + "step": 1856 + }, + { + "epoch": 0.9904, + "grad_norm": 0.43409069557301966, + "learning_rate": 4.837177080119215e-08, + "loss": 0.6563, + "step": 1857 + }, + { + "epoch": 0.9909333333333333, + "grad_norm": 0.41969529930274724, + "learning_rate": 4.314680098592705e-08, + "loss": 0.6356, + "step": 1858 + }, + { + "epoch": 0.9914666666666667, + "grad_norm": 0.4222269715265879, + "learning_rate": 3.8220317506654226e-08, + "loss": 0.6429, + "step": 1859 + }, + { + "epoch": 0.992, + "grad_norm": 0.4670697419201925, + "learning_rate": 3.359233507459481e-08, + "loss": 0.6474, + "step": 1860 + }, + { + "epoch": 0.9925333333333334, + "grad_norm": 0.4452819177418937, + "learning_rate": 2.9262867509605163e-08, + "loss": 0.6875, + "step": 1861 + }, + { + "epoch": 0.9930666666666667, + "grad_norm": 0.463490060267024, + "learning_rate": 2.5231927740154704e-08, + "loss": 0.6633, + "step": 1862 + }, + { + "epoch": 0.9936, + "grad_norm": 0.3940813094632287, + "learning_rate": 2.1499527803214846e-08, + "loss": 0.65, + "step": 1863 + }, + { + "epoch": 0.9941333333333333, + "grad_norm": 0.44305740563962925, + "learning_rate": 1.8065678844314538e-08, + "loss": 0.7005, + "step": 1864 + }, + { + "epoch": 0.9946666666666667, + "grad_norm": 0.4651866154502036, + "learning_rate": 1.4930391117451426e-08, + "loss": 0.6124, + "step": 1865 + }, + { + "epoch": 0.9952, + "grad_norm": 0.4674151975111781, + "learning_rate": 1.209367398504746e-08, + "loss": 0.5779, + "step": 1866 + }, + { + "epoch": 0.9957333333333334, + "grad_norm": 0.4823441352205061, + "learning_rate": 9.555535917993297e-09, + "loss": 0.7893, + "step": 1867 + }, + { + "epoch": 0.9962666666666666, + "grad_norm": 0.5016534944918397, + "learning_rate": 7.315984495548378e-09, + "loss": 0.6479, + "step": 1868 + }, + { + "epoch": 0.9968, + "grad_norm": 0.38102575372409986, + "learning_rate": 5.375026405352035e-09, + "loss": 0.6086, + "step": 1869 + }, + { + "epoch": 0.9973333333333333, + "grad_norm": 0.4143079328856298, + "learning_rate": 3.732667443390181e-09, + "loss": 0.6794, + "step": 1870 + }, + { + "epoch": 0.9978666666666667, + "grad_norm": 0.48552698446001646, + "learning_rate": 2.388912514017516e-09, + "loss": 0.6634, + "step": 1871 + }, + { + "epoch": 0.9984, + "grad_norm": 0.4271016805768734, + "learning_rate": 1.3437656298687097e-09, + "loss": 0.5702, + "step": 1872 + }, + { + "epoch": 0.9989333333333333, + "grad_norm": 0.4467084259531285, + "learning_rate": 5.972299119250125e-10, + "loss": 0.6671, + "step": 1873 + }, + { + "epoch": 0.9994666666666666, + "grad_norm": 0.5044855288814932, + "learning_rate": 1.4930758944764479e-10, + "loss": 0.61, + "step": 1874 + }, + { + "epoch": 1.0, + "grad_norm": 0.4426977553290052, + "learning_rate": 0.0, + "loss": 0.6366, + "step": 1875 + }, + { + "epoch": 1.0, + "step": 1875, + "total_flos": 1615624508571648.0, + "train_loss": 0.728021297454834, + "train_runtime": 29030.5177, + "train_samples_per_second": 1.033, + "train_steps_per_second": 0.065 + } + ], + "logging_steps": 1.0, + "max_steps": 1875, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1615624508571648.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_2_epochs_1_GA_2_lora/README.md b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_2_epochs_1_GA_2_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_2_epochs_1_GA_2_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_2_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_2_epochs_1_GA_2_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4e4609419f1cec8e876ad4aefe228cb4adf6620a --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_2_epochs_1_GA_2_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "q_proj", + "down_proj", + "k_proj", + "up_proj", + "v_proj", + "gate_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ee8c74ecb6d491e525e8268856af3fb554b1f09a --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95a28f5ce059aa30ed757830ec3ba2e43f2714c43c7380841383a7717f868ee9 +size 671150064 diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_2_epochs_1_GA_2_lora/config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_2_epochs_1_GA_2_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_2_epochs_1_GA_2_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..89d28814449aa3d58a811a1c46c975b24271dbc6 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b7e7d8002a045bd7b515936259cf5e3afbf0ef2e0bfa46c06962624cb61706c +size 918507402 diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_2_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_2_epochs_1_GA_2_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..1dbe9cf566f9c639c583d2ab630f7845dffc4595 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_2_epochs_1_GA_2_lora/trainer_state.json @@ -0,0 +1,13167 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 1875, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005333333333333334, + "grad_norm": 0.9187046636727805, + "learning_rate": 3.5087719298245615e-06, + "loss": 1.3821, + "step": 1 + }, + { + "epoch": 0.0010666666666666667, + "grad_norm": 1.0792624480572932, + "learning_rate": 7.017543859649123e-06, + "loss": 1.4152, + "step": 2 + }, + { + "epoch": 0.0016, + "grad_norm": 1.1509679477952406, + "learning_rate": 1.0526315789473684e-05, + "loss": 1.5638, + "step": 3 + }, + { + "epoch": 0.0021333333333333334, + "grad_norm": 1.0153140074758051, + "learning_rate": 1.4035087719298246e-05, + "loss": 1.4518, + "step": 4 + }, + { + "epoch": 0.0026666666666666666, + "grad_norm": 0.8305882584346376, + "learning_rate": 1.7543859649122806e-05, + "loss": 1.2897, + "step": 5 + }, + { + "epoch": 0.0032, + "grad_norm": 0.9580646718718947, + "learning_rate": 2.105263157894737e-05, + "loss": 1.4666, + "step": 6 + }, + { + "epoch": 0.0037333333333333333, + "grad_norm": 0.904298840595612, + "learning_rate": 2.456140350877193e-05, + "loss": 1.2781, + "step": 7 + }, + { + "epoch": 0.004266666666666667, + "grad_norm": 1.0013840824825258, + "learning_rate": 2.8070175438596492e-05, + "loss": 1.2608, + "step": 8 + }, + { + "epoch": 0.0048, + "grad_norm": 0.8790398686631322, + "learning_rate": 3.157894736842105e-05, + "loss": 1.2048, + "step": 9 + }, + { + "epoch": 0.005333333333333333, + "grad_norm": 0.9106038042230968, + "learning_rate": 3.508771929824561e-05, + "loss": 1.0825, + "step": 10 + }, + { + "epoch": 0.005866666666666667, + "grad_norm": 0.8364248025570733, + "learning_rate": 3.859649122807018e-05, + "loss": 1.0387, + "step": 11 + }, + { + "epoch": 0.0064, + "grad_norm": 0.8471561810879394, + "learning_rate": 4.210526315789474e-05, + "loss": 1.015, + "step": 12 + }, + { + "epoch": 0.006933333333333333, + "grad_norm": 0.7953795863229357, + "learning_rate": 4.56140350877193e-05, + "loss": 0.9882, + "step": 13 + }, + { + "epoch": 0.007466666666666667, + "grad_norm": 0.8785775945766162, + "learning_rate": 4.912280701754386e-05, + "loss": 1.0482, + "step": 14 + }, + { + "epoch": 0.008, + "grad_norm": 0.723417623282454, + "learning_rate": 5.2631578947368424e-05, + "loss": 0.9608, + "step": 15 + }, + { + "epoch": 0.008533333333333334, + "grad_norm": 0.7611912751822496, + "learning_rate": 5.6140350877192984e-05, + "loss": 0.9426, + "step": 16 + }, + { + "epoch": 0.009066666666666667, + "grad_norm": 0.7252600959832706, + "learning_rate": 5.9649122807017544e-05, + "loss": 0.9846, + "step": 17 + }, + { + "epoch": 0.0096, + "grad_norm": 0.74931191744177, + "learning_rate": 6.31578947368421e-05, + "loss": 1.0573, + "step": 18 + }, + { + "epoch": 0.010133333333333333, + "grad_norm": 0.8045241840406074, + "learning_rate": 6.666666666666667e-05, + "loss": 1.0575, + "step": 19 + }, + { + "epoch": 0.010666666666666666, + "grad_norm": 0.6035180579178393, + "learning_rate": 7.017543859649122e-05, + "loss": 0.9674, + "step": 20 + }, + { + "epoch": 0.0112, + "grad_norm": 0.5945409277287728, + "learning_rate": 7.368421052631579e-05, + "loss": 0.9895, + "step": 21 + }, + { + "epoch": 0.011733333333333333, + "grad_norm": 0.7344825474590768, + "learning_rate": 7.719298245614036e-05, + "loss": 0.9581, + "step": 22 + }, + { + "epoch": 0.012266666666666667, + "grad_norm": 0.53022118934138, + "learning_rate": 8.070175438596491e-05, + "loss": 0.8633, + "step": 23 + }, + { + "epoch": 0.0128, + "grad_norm": 0.6036465457595838, + "learning_rate": 8.421052631578948e-05, + "loss": 0.9019, + "step": 24 + }, + { + "epoch": 0.013333333333333334, + "grad_norm": 0.7434093244468174, + "learning_rate": 8.771929824561403e-05, + "loss": 1.0828, + "step": 25 + }, + { + "epoch": 0.013866666666666666, + "grad_norm": 0.5789996465537413, + "learning_rate": 9.12280701754386e-05, + "loss": 0.9046, + "step": 26 + }, + { + "epoch": 0.0144, + "grad_norm": 0.6207781817193246, + "learning_rate": 9.473684210526316e-05, + "loss": 0.9771, + "step": 27 + }, + { + "epoch": 0.014933333333333333, + "grad_norm": 0.50118463017605, + "learning_rate": 9.824561403508771e-05, + "loss": 0.8243, + "step": 28 + }, + { + "epoch": 0.015466666666666667, + "grad_norm": 0.5150599428286029, + "learning_rate": 0.0001017543859649123, + "loss": 0.8724, + "step": 29 + }, + { + "epoch": 0.016, + "grad_norm": 0.5974204555861369, + "learning_rate": 0.00010526315789473685, + "loss": 0.9403, + "step": 30 + }, + { + "epoch": 0.016533333333333334, + "grad_norm": 0.46479309739544894, + "learning_rate": 0.00010877192982456141, + "loss": 0.8376, + "step": 31 + }, + { + "epoch": 0.017066666666666667, + "grad_norm": 0.5924711783520056, + "learning_rate": 0.00011228070175438597, + "loss": 0.9561, + "step": 32 + }, + { + "epoch": 0.0176, + "grad_norm": 0.6199971213166166, + "learning_rate": 0.00011578947368421053, + "loss": 0.9147, + "step": 33 + }, + { + "epoch": 0.018133333333333335, + "grad_norm": 0.5861895660841031, + "learning_rate": 0.00011929824561403509, + "loss": 0.9514, + "step": 34 + }, + { + "epoch": 0.018666666666666668, + "grad_norm": 0.5650218068326689, + "learning_rate": 0.00012280701754385965, + "loss": 0.8629, + "step": 35 + }, + { + "epoch": 0.0192, + "grad_norm": 0.4992585437751919, + "learning_rate": 0.0001263157894736842, + "loss": 0.8666, + "step": 36 + }, + { + "epoch": 0.019733333333333332, + "grad_norm": 0.6587479357119546, + "learning_rate": 0.0001298245614035088, + "loss": 0.8885, + "step": 37 + }, + { + "epoch": 0.020266666666666665, + "grad_norm": 0.45989413876422613, + "learning_rate": 0.00013333333333333334, + "loss": 0.8707, + "step": 38 + }, + { + "epoch": 0.0208, + "grad_norm": 0.5541375304460954, + "learning_rate": 0.0001368421052631579, + "loss": 0.9042, + "step": 39 + }, + { + "epoch": 0.021333333333333333, + "grad_norm": 0.48678040316016685, + "learning_rate": 0.00014035087719298245, + "loss": 0.8844, + "step": 40 + }, + { + "epoch": 0.021866666666666666, + "grad_norm": 0.5724673519486235, + "learning_rate": 0.00014385964912280703, + "loss": 0.8746, + "step": 41 + }, + { + "epoch": 0.0224, + "grad_norm": 0.5773561902579017, + "learning_rate": 0.00014736842105263158, + "loss": 0.8465, + "step": 42 + }, + { + "epoch": 0.022933333333333333, + "grad_norm": 0.522888769255152, + "learning_rate": 0.00015087719298245616, + "loss": 0.9228, + "step": 43 + }, + { + "epoch": 0.023466666666666667, + "grad_norm": 0.5010094318522428, + "learning_rate": 0.0001543859649122807, + "loss": 0.8968, + "step": 44 + }, + { + "epoch": 0.024, + "grad_norm": 0.7323733472135996, + "learning_rate": 0.00015789473684210527, + "loss": 0.8296, + "step": 45 + }, + { + "epoch": 0.024533333333333334, + "grad_norm": 0.46966043829200105, + "learning_rate": 0.00016140350877192982, + "loss": 0.822, + "step": 46 + }, + { + "epoch": 0.025066666666666668, + "grad_norm": 0.4784577654699424, + "learning_rate": 0.0001649122807017544, + "loss": 0.8326, + "step": 47 + }, + { + "epoch": 0.0256, + "grad_norm": 0.5225519559970078, + "learning_rate": 0.00016842105263157895, + "loss": 0.9327, + "step": 48 + }, + { + "epoch": 0.026133333333333335, + "grad_norm": 0.5305933486499982, + "learning_rate": 0.00017192982456140353, + "loss": 0.866, + "step": 49 + }, + { + "epoch": 0.02666666666666667, + "grad_norm": 0.48942561215519315, + "learning_rate": 0.00017543859649122806, + "loss": 0.8911, + "step": 50 + }, + { + "epoch": 0.0272, + "grad_norm": 0.5438001724361794, + "learning_rate": 0.00017894736842105264, + "loss": 0.8168, + "step": 51 + }, + { + "epoch": 0.027733333333333332, + "grad_norm": 0.5252206611833089, + "learning_rate": 0.0001824561403508772, + "loss": 0.8275, + "step": 52 + }, + { + "epoch": 0.028266666666666666, + "grad_norm": 0.4802938547461449, + "learning_rate": 0.00018596491228070177, + "loss": 0.8618, + "step": 53 + }, + { + "epoch": 0.0288, + "grad_norm": 0.5118393802144957, + "learning_rate": 0.00018947368421052632, + "loss": 0.8513, + "step": 54 + }, + { + "epoch": 0.029333333333333333, + "grad_norm": 0.4612064453172076, + "learning_rate": 0.00019298245614035088, + "loss": 0.8184, + "step": 55 + }, + { + "epoch": 0.029866666666666666, + "grad_norm": 0.48808752152242685, + "learning_rate": 0.00019649122807017543, + "loss": 0.8178, + "step": 56 + }, + { + "epoch": 0.0304, + "grad_norm": 0.5108056735883002, + "learning_rate": 0.0002, + "loss": 0.9261, + "step": 57 + }, + { + "epoch": 0.030933333333333334, + "grad_norm": 0.45353549861879516, + "learning_rate": 0.00019999985069241055, + "loss": 0.7657, + "step": 58 + }, + { + "epoch": 0.031466666666666664, + "grad_norm": 0.502886332585044, + "learning_rate": 0.00019999940277008808, + "loss": 0.7874, + "step": 59 + }, + { + "epoch": 0.032, + "grad_norm": 0.5072212825451821, + "learning_rate": 0.00019999865623437013, + "loss": 0.8058, + "step": 60 + }, + { + "epoch": 0.03253333333333333, + "grad_norm": 0.49202407084104344, + "learning_rate": 0.00019999761108748597, + "loss": 0.7515, + "step": 61 + }, + { + "epoch": 0.03306666666666667, + "grad_norm": 0.578940194689475, + "learning_rate": 0.00019999626733255662, + "loss": 0.9381, + "step": 62 + }, + { + "epoch": 0.0336, + "grad_norm": 0.6085015022875951, + "learning_rate": 0.00019999462497359466, + "loss": 0.9601, + "step": 63 + }, + { + "epoch": 0.034133333333333335, + "grad_norm": 0.5075157023962301, + "learning_rate": 0.00019999268401550447, + "loss": 0.8628, + "step": 64 + }, + { + "epoch": 0.034666666666666665, + "grad_norm": 0.535946755129502, + "learning_rate": 0.000199990444464082, + "loss": 0.8856, + "step": 65 + }, + { + "epoch": 0.0352, + "grad_norm": 0.42942779507639506, + "learning_rate": 0.00019998790632601496, + "loss": 0.7587, + "step": 66 + }, + { + "epoch": 0.03573333333333333, + "grad_norm": 0.48832417407332307, + "learning_rate": 0.00019998506960888256, + "loss": 0.8236, + "step": 67 + }, + { + "epoch": 0.03626666666666667, + "grad_norm": 0.5739164258014042, + "learning_rate": 0.00019998193432115572, + "loss": 0.8619, + "step": 68 + }, + { + "epoch": 0.0368, + "grad_norm": 0.7300022332551496, + "learning_rate": 0.0001999785004721968, + "loss": 0.8394, + "step": 69 + }, + { + "epoch": 0.037333333333333336, + "grad_norm": 0.557117083548834, + "learning_rate": 0.00019997476807225985, + "loss": 0.833, + "step": 70 + }, + { + "epoch": 0.037866666666666667, + "grad_norm": 0.49123527914022486, + "learning_rate": 0.0001999707371324904, + "loss": 0.722, + "step": 71 + }, + { + "epoch": 0.0384, + "grad_norm": 0.6932706617976879, + "learning_rate": 0.00019996640766492543, + "loss": 0.8676, + "step": 72 + }, + { + "epoch": 0.038933333333333334, + "grad_norm": 0.5373073147194048, + "learning_rate": 0.00019996177968249334, + "loss": 0.7757, + "step": 73 + }, + { + "epoch": 0.039466666666666664, + "grad_norm": 0.5315954660113272, + "learning_rate": 0.0001999568531990141, + "loss": 0.8667, + "step": 74 + }, + { + "epoch": 0.04, + "grad_norm": 0.5507964885116554, + "learning_rate": 0.00019995162822919883, + "loss": 0.9107, + "step": 75 + }, + { + "epoch": 0.04053333333333333, + "grad_norm": 0.5456497491184378, + "learning_rate": 0.00019994610478865011, + "loss": 0.9106, + "step": 76 + }, + { + "epoch": 0.04106666666666667, + "grad_norm": 0.5416706546625402, + "learning_rate": 0.0001999402828938618, + "loss": 0.8239, + "step": 77 + }, + { + "epoch": 0.0416, + "grad_norm": 0.4995789896391561, + "learning_rate": 0.00019993416256221895, + "loss": 0.8562, + "step": 78 + }, + { + "epoch": 0.042133333333333335, + "grad_norm": 0.4799575897021053, + "learning_rate": 0.00019992774381199778, + "loss": 0.8265, + "step": 79 + }, + { + "epoch": 0.042666666666666665, + "grad_norm": 0.4437816317223756, + "learning_rate": 0.00019992102666236566, + "loss": 0.7988, + "step": 80 + }, + { + "epoch": 0.0432, + "grad_norm": 0.46714732306952683, + "learning_rate": 0.00019991401113338104, + "loss": 0.8644, + "step": 81 + }, + { + "epoch": 0.04373333333333333, + "grad_norm": 0.5700411679789269, + "learning_rate": 0.00019990669724599336, + "loss": 0.8827, + "step": 82 + }, + { + "epoch": 0.04426666666666667, + "grad_norm": 0.47593653750268666, + "learning_rate": 0.00019989908502204292, + "loss": 0.8986, + "step": 83 + }, + { + "epoch": 0.0448, + "grad_norm": 0.5442012033062404, + "learning_rate": 0.00019989117448426108, + "loss": 0.8896, + "step": 84 + }, + { + "epoch": 0.04533333333333334, + "grad_norm": 0.5160866877355936, + "learning_rate": 0.00019988296565626987, + "loss": 0.834, + "step": 85 + }, + { + "epoch": 0.04586666666666667, + "grad_norm": 0.49848101926340194, + "learning_rate": 0.00019987445856258206, + "loss": 0.7804, + "step": 86 + }, + { + "epoch": 0.0464, + "grad_norm": 0.5434634623520227, + "learning_rate": 0.00019986565322860115, + "loss": 0.9614, + "step": 87 + }, + { + "epoch": 0.046933333333333334, + "grad_norm": 0.5625199742085497, + "learning_rate": 0.00019985654968062122, + "loss": 0.7917, + "step": 88 + }, + { + "epoch": 0.047466666666666664, + "grad_norm": 0.4231055623029346, + "learning_rate": 0.00019984714794582683, + "loss": 0.75, + "step": 89 + }, + { + "epoch": 0.048, + "grad_norm": 0.5293710754735215, + "learning_rate": 0.00019983744805229296, + "loss": 0.8658, + "step": 90 + }, + { + "epoch": 0.04853333333333333, + "grad_norm": 0.5558763172233137, + "learning_rate": 0.000199827450028985, + "loss": 0.9193, + "step": 91 + }, + { + "epoch": 0.04906666666666667, + "grad_norm": 0.41754803852935723, + "learning_rate": 0.00019981715390575858, + "loss": 0.7898, + "step": 92 + }, + { + "epoch": 0.0496, + "grad_norm": 0.6256971093758742, + "learning_rate": 0.00019980655971335945, + "loss": 0.8806, + "step": 93 + }, + { + "epoch": 0.050133333333333335, + "grad_norm": 0.43284536757002134, + "learning_rate": 0.00019979566748342347, + "loss": 0.7827, + "step": 94 + }, + { + "epoch": 0.050666666666666665, + "grad_norm": 0.44742761100470746, + "learning_rate": 0.00019978447724847652, + "loss": 0.7217, + "step": 95 + }, + { + "epoch": 0.0512, + "grad_norm": 0.4168981829618602, + "learning_rate": 0.00019977298904193437, + "loss": 0.7092, + "step": 96 + }, + { + "epoch": 0.05173333333333333, + "grad_norm": 0.5575162923981515, + "learning_rate": 0.00019976120289810247, + "loss": 0.8252, + "step": 97 + }, + { + "epoch": 0.05226666666666667, + "grad_norm": 0.5302261971536973, + "learning_rate": 0.00019974911885217608, + "loss": 0.8462, + "step": 98 + }, + { + "epoch": 0.0528, + "grad_norm": 0.47874017698553667, + "learning_rate": 0.00019973673694024, + "loss": 0.8181, + "step": 99 + }, + { + "epoch": 0.05333333333333334, + "grad_norm": 0.5012430924834531, + "learning_rate": 0.0001997240571992685, + "loss": 0.788, + "step": 100 + }, + { + "epoch": 0.05386666666666667, + "grad_norm": 0.5048653719282753, + "learning_rate": 0.00019971107966712518, + "loss": 0.8573, + "step": 101 + }, + { + "epoch": 0.0544, + "grad_norm": 0.49047578456287433, + "learning_rate": 0.00019969780438256293, + "loss": 0.7898, + "step": 102 + }, + { + "epoch": 0.054933333333333334, + "grad_norm": 0.4970060165629044, + "learning_rate": 0.0001996842313852238, + "loss": 0.8532, + "step": 103 + }, + { + "epoch": 0.055466666666666664, + "grad_norm": 0.5401711101860304, + "learning_rate": 0.00019967036071563877, + "loss": 0.7758, + "step": 104 + }, + { + "epoch": 0.056, + "grad_norm": 0.5153031149175047, + "learning_rate": 0.0001996561924152278, + "loss": 0.8737, + "step": 105 + }, + { + "epoch": 0.05653333333333333, + "grad_norm": 0.5559208186733371, + "learning_rate": 0.0001996417265262996, + "loss": 0.8369, + "step": 106 + }, + { + "epoch": 0.05706666666666667, + "grad_norm": 0.4716374538788385, + "learning_rate": 0.00019962696309205148, + "loss": 0.8803, + "step": 107 + }, + { + "epoch": 0.0576, + "grad_norm": 0.550845985188517, + "learning_rate": 0.0001996119021565693, + "loss": 0.8375, + "step": 108 + }, + { + "epoch": 0.058133333333333335, + "grad_norm": 0.42156146280243834, + "learning_rate": 0.0001995965437648273, + "loss": 0.753, + "step": 109 + }, + { + "epoch": 0.058666666666666666, + "grad_norm": 0.48509810772114836, + "learning_rate": 0.00019958088796268793, + "loss": 0.8412, + "step": 110 + }, + { + "epoch": 0.0592, + "grad_norm": 0.5023266195694092, + "learning_rate": 0.0001995649347969019, + "loss": 0.8301, + "step": 111 + }, + { + "epoch": 0.05973333333333333, + "grad_norm": 0.513339172085789, + "learning_rate": 0.00019954868431510764, + "loss": 0.8015, + "step": 112 + }, + { + "epoch": 0.06026666666666667, + "grad_norm": 0.6203368739647647, + "learning_rate": 0.00019953213656583168, + "loss": 0.9331, + "step": 113 + }, + { + "epoch": 0.0608, + "grad_norm": 0.5355397165316813, + "learning_rate": 0.00019951529159848805, + "loss": 0.8685, + "step": 114 + }, + { + "epoch": 0.06133333333333333, + "grad_norm": 0.449465322541369, + "learning_rate": 0.00019949814946337838, + "loss": 0.7457, + "step": 115 + }, + { + "epoch": 0.06186666666666667, + "grad_norm": 0.4642561650693295, + "learning_rate": 0.00019948071021169174, + "loss": 0.8127, + "step": 116 + }, + { + "epoch": 0.0624, + "grad_norm": 0.514369238120762, + "learning_rate": 0.00019946297389550433, + "loss": 0.8131, + "step": 117 + }, + { + "epoch": 0.06293333333333333, + "grad_norm": 0.47582294101026024, + "learning_rate": 0.00019944494056777946, + "loss": 0.8402, + "step": 118 + }, + { + "epoch": 0.06346666666666667, + "grad_norm": 0.4561765696046747, + "learning_rate": 0.00019942661028236745, + "loss": 0.8312, + "step": 119 + }, + { + "epoch": 0.064, + "grad_norm": 0.4088762133518043, + "learning_rate": 0.00019940798309400526, + "loss": 0.7634, + "step": 120 + }, + { + "epoch": 0.06453333333333333, + "grad_norm": 0.5327556918534464, + "learning_rate": 0.00019938905905831654, + "loss": 0.8029, + "step": 121 + }, + { + "epoch": 0.06506666666666666, + "grad_norm": 0.4452671115414226, + "learning_rate": 0.00019936983823181132, + "loss": 0.8139, + "step": 122 + }, + { + "epoch": 0.0656, + "grad_norm": 0.5333061507594044, + "learning_rate": 0.0001993503206718859, + "loss": 0.8561, + "step": 123 + }, + { + "epoch": 0.06613333333333334, + "grad_norm": 0.5268131404123165, + "learning_rate": 0.00019933050643682269, + "loss": 0.8553, + "step": 124 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 0.4290420019311977, + "learning_rate": 0.00019931039558578997, + "loss": 0.7942, + "step": 125 + }, + { + "epoch": 0.0672, + "grad_norm": 0.7470777506995214, + "learning_rate": 0.00019928998817884182, + "loss": 0.8548, + "step": 126 + }, + { + "epoch": 0.06773333333333334, + "grad_norm": 0.48833920253500857, + "learning_rate": 0.00019926928427691786, + "loss": 0.8028, + "step": 127 + }, + { + "epoch": 0.06826666666666667, + "grad_norm": 0.5456098386553334, + "learning_rate": 0.00019924828394184306, + "loss": 0.7936, + "step": 128 + }, + { + "epoch": 0.0688, + "grad_norm": 0.5519753117074031, + "learning_rate": 0.00019922698723632767, + "loss": 0.7578, + "step": 129 + }, + { + "epoch": 0.06933333333333333, + "grad_norm": 0.5262059249313784, + "learning_rate": 0.0001992053942239668, + "loss": 0.9176, + "step": 130 + }, + { + "epoch": 0.06986666666666666, + "grad_norm": 0.48905451098926384, + "learning_rate": 0.0001991835049692405, + "loss": 0.8056, + "step": 131 + }, + { + "epoch": 0.0704, + "grad_norm": 0.5041885687187533, + "learning_rate": 0.00019916131953751342, + "loss": 0.8784, + "step": 132 + }, + { + "epoch": 0.07093333333333333, + "grad_norm": 0.65992878211206, + "learning_rate": 0.0001991388379950346, + "loss": 0.8824, + "step": 133 + }, + { + "epoch": 0.07146666666666666, + "grad_norm": 0.5217525886734332, + "learning_rate": 0.0001991160604089374, + "loss": 0.7804, + "step": 134 + }, + { + "epoch": 0.072, + "grad_norm": 0.5270075627577061, + "learning_rate": 0.00019909298684723904, + "loss": 0.7688, + "step": 135 + }, + { + "epoch": 0.07253333333333334, + "grad_norm": 0.5568832639357242, + "learning_rate": 0.00019906961737884077, + "loss": 0.8603, + "step": 136 + }, + { + "epoch": 0.07306666666666667, + "grad_norm": 0.5789452452609885, + "learning_rate": 0.00019904595207352737, + "loss": 0.7881, + "step": 137 + }, + { + "epoch": 0.0736, + "grad_norm": 0.46035737500928864, + "learning_rate": 0.00019902199100196697, + "loss": 0.8328, + "step": 138 + }, + { + "epoch": 0.07413333333333333, + "grad_norm": 0.589470313748269, + "learning_rate": 0.000198997734235711, + "loss": 0.9397, + "step": 139 + }, + { + "epoch": 0.07466666666666667, + "grad_norm": 0.6242353402465898, + "learning_rate": 0.00019897318184719385, + "loss": 0.8824, + "step": 140 + }, + { + "epoch": 0.0752, + "grad_norm": 0.5050850899431011, + "learning_rate": 0.00019894833390973266, + "loss": 0.7984, + "step": 141 + }, + { + "epoch": 0.07573333333333333, + "grad_norm": 0.5655401544347279, + "learning_rate": 0.0001989231904975272, + "loss": 0.8426, + "step": 142 + }, + { + "epoch": 0.07626666666666666, + "grad_norm": 0.47035047199101027, + "learning_rate": 0.00019889775168565943, + "loss": 0.7603, + "step": 143 + }, + { + "epoch": 0.0768, + "grad_norm": 0.5223121829849106, + "learning_rate": 0.00019887201755009357, + "loss": 0.7486, + "step": 144 + }, + { + "epoch": 0.07733333333333334, + "grad_norm": 0.5794215317840429, + "learning_rate": 0.00019884598816767563, + "loss": 0.8598, + "step": 145 + }, + { + "epoch": 0.07786666666666667, + "grad_norm": 0.5378971548326088, + "learning_rate": 0.0001988196636161333, + "loss": 0.7919, + "step": 146 + }, + { + "epoch": 0.0784, + "grad_norm": 0.5343936949760187, + "learning_rate": 0.0001987930439740757, + "loss": 0.8111, + "step": 147 + }, + { + "epoch": 0.07893333333333333, + "grad_norm": 0.5811898199506572, + "learning_rate": 0.00019876612932099308, + "loss": 0.8998, + "step": 148 + }, + { + "epoch": 0.07946666666666667, + "grad_norm": 0.5835742724881763, + "learning_rate": 0.0001987389197372567, + "loss": 0.8931, + "step": 149 + }, + { + "epoch": 0.08, + "grad_norm": 0.47510131365145686, + "learning_rate": 0.00019871141530411853, + "loss": 0.8002, + "step": 150 + }, + { + "epoch": 0.08053333333333333, + "grad_norm": 0.5761874411759367, + "learning_rate": 0.00019868361610371097, + "loss": 0.786, + "step": 151 + }, + { + "epoch": 0.08106666666666666, + "grad_norm": 0.5960372142616529, + "learning_rate": 0.00019865552221904665, + "loss": 0.9059, + "step": 152 + }, + { + "epoch": 0.0816, + "grad_norm": 0.6080990060254836, + "learning_rate": 0.0001986271337340182, + "loss": 0.7834, + "step": 153 + }, + { + "epoch": 0.08213333333333334, + "grad_norm": 0.5129966938004138, + "learning_rate": 0.00019859845073339787, + "loss": 0.7771, + "step": 154 + }, + { + "epoch": 0.08266666666666667, + "grad_norm": 0.41987896899713706, + "learning_rate": 0.00019856947330283752, + "loss": 0.801, + "step": 155 + }, + { + "epoch": 0.0832, + "grad_norm": 0.6334277283030731, + "learning_rate": 0.00019854020152886814, + "loss": 0.9359, + "step": 156 + }, + { + "epoch": 0.08373333333333334, + "grad_norm": 0.5141312787322188, + "learning_rate": 0.0001985106354988997, + "loss": 0.8135, + "step": 157 + }, + { + "epoch": 0.08426666666666667, + "grad_norm": 0.5090938010451109, + "learning_rate": 0.00019848077530122083, + "loss": 0.781, + "step": 158 + }, + { + "epoch": 0.0848, + "grad_norm": 0.5379179147704597, + "learning_rate": 0.0001984506210249986, + "loss": 0.8338, + "step": 159 + }, + { + "epoch": 0.08533333333333333, + "grad_norm": 0.524448342726279, + "learning_rate": 0.00019842017276027832, + "loss": 0.7992, + "step": 160 + }, + { + "epoch": 0.08586666666666666, + "grad_norm": 0.49958124015235883, + "learning_rate": 0.00019838943059798304, + "loss": 0.758, + "step": 161 + }, + { + "epoch": 0.0864, + "grad_norm": 0.4637100100117015, + "learning_rate": 0.00019835839462991361, + "loss": 0.7357, + "step": 162 + }, + { + "epoch": 0.08693333333333333, + "grad_norm": 0.8660919139153875, + "learning_rate": 0.0001983270649487481, + "loss": 0.7904, + "step": 163 + }, + { + "epoch": 0.08746666666666666, + "grad_norm": 0.5204581180046053, + "learning_rate": 0.0001982954416480417, + "loss": 0.7835, + "step": 164 + }, + { + "epoch": 0.088, + "grad_norm": 0.6988181581621069, + "learning_rate": 0.00019826352482222638, + "loss": 0.7778, + "step": 165 + }, + { + "epoch": 0.08853333333333334, + "grad_norm": 0.4410751304362726, + "learning_rate": 0.00019823131456661063, + "loss": 0.7945, + "step": 166 + }, + { + "epoch": 0.08906666666666667, + "grad_norm": 0.5624349211581529, + "learning_rate": 0.00019819881097737915, + "loss": 0.8347, + "step": 167 + }, + { + "epoch": 0.0896, + "grad_norm": 0.5558362478822403, + "learning_rate": 0.00019816601415159263, + "loss": 0.8885, + "step": 168 + }, + { + "epoch": 0.09013333333333333, + "grad_norm": 0.5087247847943858, + "learning_rate": 0.00019813292418718732, + "loss": 0.8215, + "step": 169 + }, + { + "epoch": 0.09066666666666667, + "grad_norm": 0.5548702062919935, + "learning_rate": 0.0001980995411829749, + "loss": 0.7815, + "step": 170 + }, + { + "epoch": 0.0912, + "grad_norm": 0.5408759607428516, + "learning_rate": 0.0001980658652386421, + "loss": 0.8004, + "step": 171 + }, + { + "epoch": 0.09173333333333333, + "grad_norm": 0.5902227242683268, + "learning_rate": 0.0001980318964547504, + "loss": 0.911, + "step": 172 + }, + { + "epoch": 0.09226666666666666, + "grad_norm": 0.42505372400837477, + "learning_rate": 0.0001979976349327357, + "loss": 0.8491, + "step": 173 + }, + { + "epoch": 0.0928, + "grad_norm": 0.4700905729949172, + "learning_rate": 0.00019796308077490817, + "loss": 0.8265, + "step": 174 + }, + { + "epoch": 0.09333333333333334, + "grad_norm": 0.5905772236187293, + "learning_rate": 0.00019792823408445174, + "loss": 0.8424, + "step": 175 + }, + { + "epoch": 0.09386666666666667, + "grad_norm": 0.5331782932345496, + "learning_rate": 0.0001978930949654239, + "loss": 0.9053, + "step": 176 + }, + { + "epoch": 0.0944, + "grad_norm": 0.7431522727178561, + "learning_rate": 0.00019785766352275542, + "loss": 0.8982, + "step": 177 + }, + { + "epoch": 0.09493333333333333, + "grad_norm": 0.5079475839431324, + "learning_rate": 0.00019782193986224995, + "loss": 0.8392, + "step": 178 + }, + { + "epoch": 0.09546666666666667, + "grad_norm": 0.48924012256515104, + "learning_rate": 0.00019778592409058378, + "loss": 0.8213, + "step": 179 + }, + { + "epoch": 0.096, + "grad_norm": 0.5182419828560418, + "learning_rate": 0.00019774961631530545, + "loss": 0.7863, + "step": 180 + }, + { + "epoch": 0.09653333333333333, + "grad_norm": 0.5010632560906729, + "learning_rate": 0.0001977130166448355, + "loss": 0.8226, + "step": 181 + }, + { + "epoch": 0.09706666666666666, + "grad_norm": 0.5204323847809875, + "learning_rate": 0.00019767612518846608, + "loss": 0.8305, + "step": 182 + }, + { + "epoch": 0.0976, + "grad_norm": 0.44514518800717273, + "learning_rate": 0.00019763894205636072, + "loss": 0.7703, + "step": 183 + }, + { + "epoch": 0.09813333333333334, + "grad_norm": 0.5117132531480701, + "learning_rate": 0.00019760146735955388, + "loss": 0.8138, + "step": 184 + }, + { + "epoch": 0.09866666666666667, + "grad_norm": 0.533659319178954, + "learning_rate": 0.00019756370120995066, + "loss": 0.7735, + "step": 185 + }, + { + "epoch": 0.0992, + "grad_norm": 0.421932266465524, + "learning_rate": 0.00019752564372032657, + "loss": 0.7841, + "step": 186 + }, + { + "epoch": 0.09973333333333333, + "grad_norm": 0.5016188476076787, + "learning_rate": 0.000197487295004327, + "loss": 0.8096, + "step": 187 + }, + { + "epoch": 0.10026666666666667, + "grad_norm": 0.4866681536952131, + "learning_rate": 0.00019744865517646706, + "loss": 0.837, + "step": 188 + }, + { + "epoch": 0.1008, + "grad_norm": 0.5212920654667302, + "learning_rate": 0.00019740972435213115, + "loss": 0.843, + "step": 189 + }, + { + "epoch": 0.10133333333333333, + "grad_norm": 0.597959752214, + "learning_rate": 0.0001973705026475726, + "loss": 0.7984, + "step": 190 + }, + { + "epoch": 0.10186666666666666, + "grad_norm": 0.5432818174149677, + "learning_rate": 0.00019733099017991341, + "loss": 0.7558, + "step": 191 + }, + { + "epoch": 0.1024, + "grad_norm": 0.5810158726425654, + "learning_rate": 0.00019729118706714375, + "loss": 0.7635, + "step": 192 + }, + { + "epoch": 0.10293333333333334, + "grad_norm": 0.5099329403500529, + "learning_rate": 0.0001972510934281218, + "loss": 0.7993, + "step": 193 + }, + { + "epoch": 0.10346666666666667, + "grad_norm": 0.4615061145478328, + "learning_rate": 0.00019721070938257324, + "loss": 0.7879, + "step": 194 + }, + { + "epoch": 0.104, + "grad_norm": 0.45842820312923444, + "learning_rate": 0.00019717003505109095, + "loss": 0.8371, + "step": 195 + }, + { + "epoch": 0.10453333333333334, + "grad_norm": 0.6025405029850975, + "learning_rate": 0.0001971290705551347, + "loss": 0.8774, + "step": 196 + }, + { + "epoch": 0.10506666666666667, + "grad_norm": 0.42477595208472296, + "learning_rate": 0.00019708781601703065, + "loss": 0.8295, + "step": 197 + }, + { + "epoch": 0.1056, + "grad_norm": 0.4382418484595955, + "learning_rate": 0.00019704627155997108, + "loss": 0.79, + "step": 198 + }, + { + "epoch": 0.10613333333333333, + "grad_norm": 0.5340809779964995, + "learning_rate": 0.00019700443730801413, + "loss": 0.8423, + "step": 199 + }, + { + "epoch": 0.10666666666666667, + "grad_norm": 0.6657501599265587, + "learning_rate": 0.00019696231338608316, + "loss": 0.9512, + "step": 200 + }, + { + "epoch": 0.1072, + "grad_norm": 0.5014340782665344, + "learning_rate": 0.00019691989991996663, + "loss": 0.8572, + "step": 201 + }, + { + "epoch": 0.10773333333333333, + "grad_norm": 0.4525484927078218, + "learning_rate": 0.00019687719703631755, + "loss": 0.8157, + "step": 202 + }, + { + "epoch": 0.10826666666666666, + "grad_norm": 0.6142326781051878, + "learning_rate": 0.00019683420486265327, + "loss": 0.8234, + "step": 203 + }, + { + "epoch": 0.1088, + "grad_norm": 0.5709852030076734, + "learning_rate": 0.0001967909235273549, + "loss": 0.9266, + "step": 204 + }, + { + "epoch": 0.10933333333333334, + "grad_norm": 0.5185305175522327, + "learning_rate": 0.0001967473531596671, + "loss": 0.808, + "step": 205 + }, + { + "epoch": 0.10986666666666667, + "grad_norm": 0.4382876729973094, + "learning_rate": 0.0001967034938896976, + "loss": 0.7584, + "step": 206 + }, + { + "epoch": 0.1104, + "grad_norm": 0.5590060126752588, + "learning_rate": 0.00019665934584841682, + "loss": 0.8973, + "step": 207 + }, + { + "epoch": 0.11093333333333333, + "grad_norm": 0.49611786385112067, + "learning_rate": 0.0001966149091676575, + "loss": 0.8716, + "step": 208 + }, + { + "epoch": 0.11146666666666667, + "grad_norm": 0.5049439999019059, + "learning_rate": 0.00019657018398011434, + "loss": 0.7737, + "step": 209 + }, + { + "epoch": 0.112, + "grad_norm": 0.46712456182633916, + "learning_rate": 0.00019652517041934356, + "loss": 0.7859, + "step": 210 + }, + { + "epoch": 0.11253333333333333, + "grad_norm": 0.6081204529268448, + "learning_rate": 0.00019647986861976246, + "loss": 0.8756, + "step": 211 + }, + { + "epoch": 0.11306666666666666, + "grad_norm": 0.43838191084634615, + "learning_rate": 0.0001964342787166491, + "loss": 0.8247, + "step": 212 + }, + { + "epoch": 0.1136, + "grad_norm": 0.4258857376163505, + "learning_rate": 0.00019638840084614182, + "loss": 0.7771, + "step": 213 + }, + { + "epoch": 0.11413333333333334, + "grad_norm": 0.593866093205508, + "learning_rate": 0.0001963422351452389, + "loss": 0.7866, + "step": 214 + }, + { + "epoch": 0.11466666666666667, + "grad_norm": 0.4070136732910013, + "learning_rate": 0.0001962957817517982, + "loss": 0.7615, + "step": 215 + }, + { + "epoch": 0.1152, + "grad_norm": 0.47446825115829455, + "learning_rate": 0.00019624904080453655, + "loss": 0.7493, + "step": 216 + }, + { + "epoch": 0.11573333333333333, + "grad_norm": 0.534291016113494, + "learning_rate": 0.00019620201244302952, + "loss": 0.8327, + "step": 217 + }, + { + "epoch": 0.11626666666666667, + "grad_norm": 0.5456977282318515, + "learning_rate": 0.00019615469680771096, + "loss": 0.8143, + "step": 218 + }, + { + "epoch": 0.1168, + "grad_norm": 0.48373802875797434, + "learning_rate": 0.00019610709403987246, + "loss": 0.8229, + "step": 219 + }, + { + "epoch": 0.11733333333333333, + "grad_norm": 0.5203841178534363, + "learning_rate": 0.00019605920428166323, + "loss": 0.8083, + "step": 220 + }, + { + "epoch": 0.11786666666666666, + "grad_norm": 0.5629772908785765, + "learning_rate": 0.00019601102767608923, + "loss": 0.9057, + "step": 221 + }, + { + "epoch": 0.1184, + "grad_norm": 0.561895722100035, + "learning_rate": 0.00019596256436701324, + "loss": 0.8782, + "step": 222 + }, + { + "epoch": 0.11893333333333334, + "grad_norm": 0.7321171724122697, + "learning_rate": 0.00019591381449915397, + "loss": 0.8782, + "step": 223 + }, + { + "epoch": 0.11946666666666667, + "grad_norm": 0.4447398608848945, + "learning_rate": 0.00019586477821808597, + "loss": 0.7897, + "step": 224 + }, + { + "epoch": 0.12, + "grad_norm": 0.4949712922597885, + "learning_rate": 0.000195815455670239, + "loss": 0.825, + "step": 225 + }, + { + "epoch": 0.12053333333333334, + "grad_norm": 0.6690982394856024, + "learning_rate": 0.00019576584700289768, + "loss": 0.8946, + "step": 226 + }, + { + "epoch": 0.12106666666666667, + "grad_norm": 0.44254020743234357, + "learning_rate": 0.00019571595236420102, + "loss": 0.8046, + "step": 227 + }, + { + "epoch": 0.1216, + "grad_norm": 0.435787744927338, + "learning_rate": 0.00019566577190314197, + "loss": 0.8009, + "step": 228 + }, + { + "epoch": 0.12213333333333333, + "grad_norm": 0.5023689018286775, + "learning_rate": 0.00019561530576956703, + "loss": 0.7905, + "step": 229 + }, + { + "epoch": 0.12266666666666666, + "grad_norm": 0.4493265664785308, + "learning_rate": 0.00019556455411417573, + "loss": 0.7665, + "step": 230 + }, + { + "epoch": 0.1232, + "grad_norm": 0.5362087506786596, + "learning_rate": 0.0001955135170885202, + "loss": 0.8088, + "step": 231 + }, + { + "epoch": 0.12373333333333333, + "grad_norm": 0.4450062730399959, + "learning_rate": 0.00019546219484500475, + "loss": 0.7885, + "step": 232 + }, + { + "epoch": 0.12426666666666666, + "grad_norm": 0.4632062205666545, + "learning_rate": 0.00019541058753688538, + "loss": 0.8023, + "step": 233 + }, + { + "epoch": 0.1248, + "grad_norm": 0.4316089402576449, + "learning_rate": 0.00019535869531826937, + "loss": 0.73, + "step": 234 + }, + { + "epoch": 0.12533333333333332, + "grad_norm": 0.4253684655222452, + "learning_rate": 0.00019530651834411474, + "loss": 0.7307, + "step": 235 + }, + { + "epoch": 0.12586666666666665, + "grad_norm": 0.49194412033330853, + "learning_rate": 0.00019525405677022989, + "loss": 0.7542, + "step": 236 + }, + { + "epoch": 0.1264, + "grad_norm": 0.5524366372423557, + "learning_rate": 0.00019520131075327298, + "loss": 0.8655, + "step": 237 + }, + { + "epoch": 0.12693333333333334, + "grad_norm": 0.4569670439362406, + "learning_rate": 0.0001951482804507517, + "loss": 0.7719, + "step": 238 + }, + { + "epoch": 0.12746666666666667, + "grad_norm": 0.4621197739520012, + "learning_rate": 0.00019509496602102252, + "loss": 0.7435, + "step": 239 + }, + { + "epoch": 0.128, + "grad_norm": 0.5996689512472547, + "learning_rate": 0.00019504136762329047, + "loss": 0.8705, + "step": 240 + }, + { + "epoch": 0.12853333333333333, + "grad_norm": 0.4995831781161566, + "learning_rate": 0.00019498748541760846, + "loss": 0.8065, + "step": 241 + }, + { + "epoch": 0.12906666666666666, + "grad_norm": 0.5536628209320402, + "learning_rate": 0.0001949333195648769, + "loss": 0.7531, + "step": 242 + }, + { + "epoch": 0.1296, + "grad_norm": 0.7264022684471643, + "learning_rate": 0.00019487887022684336, + "loss": 0.9006, + "step": 243 + }, + { + "epoch": 0.13013333333333332, + "grad_norm": 0.48996010531672174, + "learning_rate": 0.00019482413756610173, + "loss": 0.7176, + "step": 244 + }, + { + "epoch": 0.13066666666666665, + "grad_norm": 0.4986301394800893, + "learning_rate": 0.0001947691217460921, + "loss": 0.7331, + "step": 245 + }, + { + "epoch": 0.1312, + "grad_norm": 0.4582507844574082, + "learning_rate": 0.00019471382293110003, + "loss": 0.8075, + "step": 246 + }, + { + "epoch": 0.13173333333333334, + "grad_norm": 0.5240490196517137, + "learning_rate": 0.00019465824128625617, + "loss": 0.8127, + "step": 247 + }, + { + "epoch": 0.13226666666666667, + "grad_norm": 0.4558191512965942, + "learning_rate": 0.00019460237697753577, + "loss": 0.7936, + "step": 248 + }, + { + "epoch": 0.1328, + "grad_norm": 0.4955689981126501, + "learning_rate": 0.00019454623017175812, + "loss": 0.8497, + "step": 249 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 0.43246762551813506, + "learning_rate": 0.00019448980103658613, + "loss": 0.7647, + "step": 250 + }, + { + "epoch": 0.13386666666666666, + "grad_norm": 0.47021749367794397, + "learning_rate": 0.0001944330897405257, + "loss": 0.8128, + "step": 251 + }, + { + "epoch": 0.1344, + "grad_norm": 0.5542410122264593, + "learning_rate": 0.00019437609645292546, + "loss": 0.8988, + "step": 252 + }, + { + "epoch": 0.13493333333333332, + "grad_norm": 0.5027820701759819, + "learning_rate": 0.00019431882134397598, + "loss": 0.8469, + "step": 253 + }, + { + "epoch": 0.13546666666666668, + "grad_norm": 0.4448189784336865, + "learning_rate": 0.00019426126458470936, + "loss": 0.7906, + "step": 254 + }, + { + "epoch": 0.136, + "grad_norm": 0.47925425360609064, + "learning_rate": 0.0001942034263469989, + "loss": 0.8258, + "step": 255 + }, + { + "epoch": 0.13653333333333334, + "grad_norm": 0.5208769466787269, + "learning_rate": 0.00019414530680355837, + "loss": 0.8222, + "step": 256 + }, + { + "epoch": 0.13706666666666667, + "grad_norm": 0.4154088441354576, + "learning_rate": 0.00019408690612794148, + "loss": 0.7513, + "step": 257 + }, + { + "epoch": 0.1376, + "grad_norm": 0.4855637437288716, + "learning_rate": 0.00019402822449454153, + "loss": 0.7959, + "step": 258 + }, + { + "epoch": 0.13813333333333333, + "grad_norm": 0.5373917222804588, + "learning_rate": 0.00019396926207859084, + "loss": 0.8802, + "step": 259 + }, + { + "epoch": 0.13866666666666666, + "grad_norm": 0.4887205340946534, + "learning_rate": 0.0001939100190561601, + "loss": 0.8318, + "step": 260 + }, + { + "epoch": 0.1392, + "grad_norm": 0.5695249330104949, + "learning_rate": 0.00019385049560415794, + "loss": 0.8529, + "step": 261 + }, + { + "epoch": 0.13973333333333332, + "grad_norm": 0.4188870625608363, + "learning_rate": 0.0001937906919003304, + "loss": 0.7977, + "step": 262 + }, + { + "epoch": 0.14026666666666668, + "grad_norm": 0.4781375918853016, + "learning_rate": 0.00019373060812326052, + "loss": 0.786, + "step": 263 + }, + { + "epoch": 0.1408, + "grad_norm": 0.5322759754142233, + "learning_rate": 0.00019367024445236754, + "loss": 0.7719, + "step": 264 + }, + { + "epoch": 0.14133333333333334, + "grad_norm": 0.4465537339360222, + "learning_rate": 0.00019360960106790643, + "loss": 0.711, + "step": 265 + }, + { + "epoch": 0.14186666666666667, + "grad_norm": 0.4649060332296204, + "learning_rate": 0.0001935486781509677, + "loss": 0.8508, + "step": 266 + }, + { + "epoch": 0.1424, + "grad_norm": 0.47043108444414206, + "learning_rate": 0.00019348747588347637, + "loss": 0.7746, + "step": 267 + }, + { + "epoch": 0.14293333333333333, + "grad_norm": 0.6758924041080338, + "learning_rate": 0.00019342599444819168, + "loss": 0.8107, + "step": 268 + }, + { + "epoch": 0.14346666666666666, + "grad_norm": 0.4559404927513359, + "learning_rate": 0.00019336423402870653, + "loss": 0.734, + "step": 269 + }, + { + "epoch": 0.144, + "grad_norm": 0.42429474328097455, + "learning_rate": 0.00019330219480944694, + "loss": 0.7219, + "step": 270 + }, + { + "epoch": 0.14453333333333335, + "grad_norm": 0.47327928036650596, + "learning_rate": 0.0001932398769756714, + "loss": 0.883, + "step": 271 + }, + { + "epoch": 0.14506666666666668, + "grad_norm": 0.502081625626178, + "learning_rate": 0.0001931772807134704, + "loss": 0.776, + "step": 272 + }, + { + "epoch": 0.1456, + "grad_norm": 0.48026421006742326, + "learning_rate": 0.00019311440620976597, + "loss": 0.8542, + "step": 273 + }, + { + "epoch": 0.14613333333333334, + "grad_norm": 0.4264192649481206, + "learning_rate": 0.00019305125365231084, + "loss": 0.7773, + "step": 274 + }, + { + "epoch": 0.14666666666666667, + "grad_norm": 0.5041368844234315, + "learning_rate": 0.00019298782322968815, + "loss": 0.7874, + "step": 275 + }, + { + "epoch": 0.1472, + "grad_norm": 0.4101983489697829, + "learning_rate": 0.0001929241151313108, + "loss": 0.7619, + "step": 276 + }, + { + "epoch": 0.14773333333333333, + "grad_norm": 0.5440978571078002, + "learning_rate": 0.0001928601295474208, + "loss": 0.8104, + "step": 277 + }, + { + "epoch": 0.14826666666666666, + "grad_norm": 0.5247219256270258, + "learning_rate": 0.00019279586666908884, + "loss": 0.774, + "step": 278 + }, + { + "epoch": 0.1488, + "grad_norm": 0.6612971325447901, + "learning_rate": 0.00019273132668821364, + "loss": 0.7768, + "step": 279 + }, + { + "epoch": 0.14933333333333335, + "grad_norm": 0.5126463491957624, + "learning_rate": 0.00019266650979752136, + "loss": 0.8796, + "step": 280 + }, + { + "epoch": 0.14986666666666668, + "grad_norm": 0.6117943304592908, + "learning_rate": 0.00019260141619056507, + "loss": 0.7273, + "step": 281 + }, + { + "epoch": 0.1504, + "grad_norm": 0.7762971712464279, + "learning_rate": 0.00019253604606172417, + "loss": 0.9606, + "step": 282 + }, + { + "epoch": 0.15093333333333334, + "grad_norm": 0.47937349115279854, + "learning_rate": 0.0001924703996062038, + "loss": 0.8303, + "step": 283 + }, + { + "epoch": 0.15146666666666667, + "grad_norm": 0.4820045733822554, + "learning_rate": 0.0001924044770200342, + "loss": 0.7213, + "step": 284 + }, + { + "epoch": 0.152, + "grad_norm": 0.4270495896837966, + "learning_rate": 0.00019233827850007027, + "loss": 0.8148, + "step": 285 + }, + { + "epoch": 0.15253333333333333, + "grad_norm": 0.3958259756289764, + "learning_rate": 0.0001922718042439908, + "loss": 0.7494, + "step": 286 + }, + { + "epoch": 0.15306666666666666, + "grad_norm": 0.45597726469299843, + "learning_rate": 0.000192205054450298, + "loss": 0.8499, + "step": 287 + }, + { + "epoch": 0.1536, + "grad_norm": 0.5244564069581154, + "learning_rate": 0.00019213802931831696, + "loss": 0.7156, + "step": 288 + }, + { + "epoch": 0.15413333333333334, + "grad_norm": 0.5227787446025458, + "learning_rate": 0.00019207072904819486, + "loss": 0.8284, + "step": 289 + }, + { + "epoch": 0.15466666666666667, + "grad_norm": 0.49291342948039957, + "learning_rate": 0.00019200315384090044, + "loss": 0.8306, + "step": 290 + }, + { + "epoch": 0.1552, + "grad_norm": 0.47180595085480614, + "learning_rate": 0.00019193530389822363, + "loss": 0.7799, + "step": 291 + }, + { + "epoch": 0.15573333333333333, + "grad_norm": 0.5651113460887758, + "learning_rate": 0.00019186717942277462, + "loss": 0.8706, + "step": 292 + }, + { + "epoch": 0.15626666666666666, + "grad_norm": 0.5342570203703145, + "learning_rate": 0.00019179878061798347, + "loss": 0.8137, + "step": 293 + }, + { + "epoch": 0.1568, + "grad_norm": 0.4412344364200379, + "learning_rate": 0.00019173010768809933, + "loss": 0.7911, + "step": 294 + }, + { + "epoch": 0.15733333333333333, + "grad_norm": 0.4375767139194271, + "learning_rate": 0.00019166116083819002, + "loss": 0.757, + "step": 295 + }, + { + "epoch": 0.15786666666666666, + "grad_norm": 0.49772498084372346, + "learning_rate": 0.00019159194027414128, + "loss": 0.7478, + "step": 296 + }, + { + "epoch": 0.1584, + "grad_norm": 0.43884306270757545, + "learning_rate": 0.0001915224462026563, + "loss": 0.7974, + "step": 297 + }, + { + "epoch": 0.15893333333333334, + "grad_norm": 0.4529967915347486, + "learning_rate": 0.00019145267883125482, + "loss": 0.783, + "step": 298 + }, + { + "epoch": 0.15946666666666667, + "grad_norm": 0.45807911831139486, + "learning_rate": 0.00019138263836827288, + "loss": 0.7645, + "step": 299 + }, + { + "epoch": 0.16, + "grad_norm": 0.5197256879061048, + "learning_rate": 0.00019131232502286188, + "loss": 0.7813, + "step": 300 + }, + { + "epoch": 0.16053333333333333, + "grad_norm": 0.5038334071231932, + "learning_rate": 0.00019124173900498818, + "loss": 0.7599, + "step": 301 + }, + { + "epoch": 0.16106666666666666, + "grad_norm": 0.49670102860084564, + "learning_rate": 0.00019117088052543233, + "loss": 0.804, + "step": 302 + }, + { + "epoch": 0.1616, + "grad_norm": 0.468516292536784, + "learning_rate": 0.0001910997497957885, + "loss": 0.8496, + "step": 303 + }, + { + "epoch": 0.16213333333333332, + "grad_norm": 0.40283967469213017, + "learning_rate": 0.00019102834702846387, + "loss": 0.7902, + "step": 304 + }, + { + "epoch": 0.16266666666666665, + "grad_norm": 0.4296043119455365, + "learning_rate": 0.0001909566724366779, + "loss": 0.734, + "step": 305 + }, + { + "epoch": 0.1632, + "grad_norm": 0.4989654907778812, + "learning_rate": 0.00019088472623446183, + "loss": 0.848, + "step": 306 + }, + { + "epoch": 0.16373333333333334, + "grad_norm": 0.4705803121651488, + "learning_rate": 0.00019081250863665794, + "loss": 0.8161, + "step": 307 + }, + { + "epoch": 0.16426666666666667, + "grad_norm": 0.4228684176978226, + "learning_rate": 0.0001907400198589189, + "loss": 0.7375, + "step": 308 + }, + { + "epoch": 0.1648, + "grad_norm": 0.49457291733883274, + "learning_rate": 0.00019066726011770726, + "loss": 0.8442, + "step": 309 + }, + { + "epoch": 0.16533333333333333, + "grad_norm": 0.4407863834288453, + "learning_rate": 0.00019059422963029464, + "loss": 0.7497, + "step": 310 + }, + { + "epoch": 0.16586666666666666, + "grad_norm": 0.4301210165073435, + "learning_rate": 0.0001905209286147611, + "loss": 0.707, + "step": 311 + }, + { + "epoch": 0.1664, + "grad_norm": 0.43292672185524217, + "learning_rate": 0.0001904473572899947, + "loss": 0.7048, + "step": 312 + }, + { + "epoch": 0.16693333333333332, + "grad_norm": 0.45849806176782326, + "learning_rate": 0.0001903735158756905, + "loss": 0.7985, + "step": 313 + }, + { + "epoch": 0.16746666666666668, + "grad_norm": 0.42800637308038647, + "learning_rate": 0.0001902994045923502, + "loss": 0.7615, + "step": 314 + }, + { + "epoch": 0.168, + "grad_norm": 0.4908767707999642, + "learning_rate": 0.00019022502366128135, + "loss": 0.762, + "step": 315 + }, + { + "epoch": 0.16853333333333334, + "grad_norm": 0.4863171414963457, + "learning_rate": 0.0001901503733045967, + "loss": 0.7672, + "step": 316 + }, + { + "epoch": 0.16906666666666667, + "grad_norm": 0.517417694309494, + "learning_rate": 0.00019007545374521355, + "loss": 0.8883, + "step": 317 + }, + { + "epoch": 0.1696, + "grad_norm": 0.40528442843383106, + "learning_rate": 0.00019000026520685302, + "loss": 0.7426, + "step": 318 + }, + { + "epoch": 0.17013333333333333, + "grad_norm": 0.46897332398261404, + "learning_rate": 0.00018992480791403958, + "loss": 0.7899, + "step": 319 + }, + { + "epoch": 0.17066666666666666, + "grad_norm": 0.5277810073048738, + "learning_rate": 0.0001898490820921001, + "loss": 0.7809, + "step": 320 + }, + { + "epoch": 0.1712, + "grad_norm": 0.4915143492398682, + "learning_rate": 0.0001897730879671634, + "loss": 0.7902, + "step": 321 + }, + { + "epoch": 0.17173333333333332, + "grad_norm": 0.4025228826702654, + "learning_rate": 0.0001896968257661595, + "loss": 0.7379, + "step": 322 + }, + { + "epoch": 0.17226666666666668, + "grad_norm": 0.5419896681574701, + "learning_rate": 0.00018962029571681886, + "loss": 0.8238, + "step": 323 + }, + { + "epoch": 0.1728, + "grad_norm": 0.49623969274297897, + "learning_rate": 0.00018954349804767184, + "loss": 0.6811, + "step": 324 + }, + { + "epoch": 0.17333333333333334, + "grad_norm": 0.4802001235125401, + "learning_rate": 0.00018946643298804793, + "loss": 0.7658, + "step": 325 + }, + { + "epoch": 0.17386666666666667, + "grad_norm": 0.4789639246740269, + "learning_rate": 0.00018938910076807513, + "loss": 0.7643, + "step": 326 + }, + { + "epoch": 0.1744, + "grad_norm": 0.5348953109627322, + "learning_rate": 0.00018931150161867916, + "loss": 0.814, + "step": 327 + }, + { + "epoch": 0.17493333333333333, + "grad_norm": 0.41953332867974436, + "learning_rate": 0.0001892336357715829, + "loss": 0.771, + "step": 328 + }, + { + "epoch": 0.17546666666666666, + "grad_norm": 0.4245439856312734, + "learning_rate": 0.0001891555034593055, + "loss": 0.7922, + "step": 329 + }, + { + "epoch": 0.176, + "grad_norm": 0.43877334769248494, + "learning_rate": 0.00018907710491516199, + "loss": 0.8109, + "step": 330 + }, + { + "epoch": 0.17653333333333332, + "grad_norm": 0.39656939792093154, + "learning_rate": 0.00018899844037326225, + "loss": 0.7076, + "step": 331 + }, + { + "epoch": 0.17706666666666668, + "grad_norm": 0.44555770939906086, + "learning_rate": 0.0001889195100685106, + "loss": 0.7564, + "step": 332 + }, + { + "epoch": 0.1776, + "grad_norm": 0.45431927372201913, + "learning_rate": 0.0001888403142366049, + "loss": 0.79, + "step": 333 + }, + { + "epoch": 0.17813333333333334, + "grad_norm": 0.4738456629646632, + "learning_rate": 0.00018876085311403593, + "loss": 0.773, + "step": 334 + }, + { + "epoch": 0.17866666666666667, + "grad_norm": 0.4080586869728145, + "learning_rate": 0.00018868112693808665, + "loss": 0.7973, + "step": 335 + }, + { + "epoch": 0.1792, + "grad_norm": 0.40915047638282037, + "learning_rate": 0.00018860113594683148, + "loss": 0.7118, + "step": 336 + }, + { + "epoch": 0.17973333333333333, + "grad_norm": 0.43249592315285584, + "learning_rate": 0.00018852088037913577, + "loss": 0.7073, + "step": 337 + }, + { + "epoch": 0.18026666666666666, + "grad_norm": 0.4021372609040263, + "learning_rate": 0.0001884403604746547, + "loss": 0.7256, + "step": 338 + }, + { + "epoch": 0.1808, + "grad_norm": 0.5175450241514117, + "learning_rate": 0.00018835957647383303, + "loss": 0.8151, + "step": 339 + }, + { + "epoch": 0.18133333333333335, + "grad_norm": 0.4532390696629612, + "learning_rate": 0.00018827852861790398, + "loss": 0.7459, + "step": 340 + }, + { + "epoch": 0.18186666666666668, + "grad_norm": 0.5160049883677659, + "learning_rate": 0.00018819721714888877, + "loss": 0.7731, + "step": 341 + }, + { + "epoch": 0.1824, + "grad_norm": 0.4880252901638021, + "learning_rate": 0.00018811564230959588, + "loss": 0.7964, + "step": 342 + }, + { + "epoch": 0.18293333333333334, + "grad_norm": 0.4270119556740443, + "learning_rate": 0.00018803380434362, + "loss": 0.702, + "step": 343 + }, + { + "epoch": 0.18346666666666667, + "grad_norm": 0.4839218946470525, + "learning_rate": 0.0001879517034953418, + "loss": 0.8351, + "step": 344 + }, + { + "epoch": 0.184, + "grad_norm": 0.551660540948947, + "learning_rate": 0.00018786934000992688, + "loss": 0.754, + "step": 345 + }, + { + "epoch": 0.18453333333333333, + "grad_norm": 0.45043280403669955, + "learning_rate": 0.00018778671413332513, + "loss": 0.6675, + "step": 346 + }, + { + "epoch": 0.18506666666666666, + "grad_norm": 0.44065277624899946, + "learning_rate": 0.00018770382611226987, + "loss": 0.7586, + "step": 347 + }, + { + "epoch": 0.1856, + "grad_norm": 0.4327253551488894, + "learning_rate": 0.00018762067619427746, + "loss": 0.727, + "step": 348 + }, + { + "epoch": 0.18613333333333335, + "grad_norm": 0.43761925651145245, + "learning_rate": 0.000187537264627646, + "loss": 0.7435, + "step": 349 + }, + { + "epoch": 0.18666666666666668, + "grad_norm": 0.4630305498739546, + "learning_rate": 0.00018745359166145523, + "loss": 0.7641, + "step": 350 + }, + { + "epoch": 0.1872, + "grad_norm": 0.5328992513303824, + "learning_rate": 0.00018736965754556528, + "loss": 0.7626, + "step": 351 + }, + { + "epoch": 0.18773333333333334, + "grad_norm": 0.5098188095859564, + "learning_rate": 0.00018728546253061614, + "loss": 0.7274, + "step": 352 + }, + { + "epoch": 0.18826666666666667, + "grad_norm": 0.4889687065223941, + "learning_rate": 0.00018720100686802694, + "loss": 0.8114, + "step": 353 + }, + { + "epoch": 0.1888, + "grad_norm": 0.49009896834756567, + "learning_rate": 0.00018711629080999504, + "loss": 0.8492, + "step": 354 + }, + { + "epoch": 0.18933333333333333, + "grad_norm": 0.4447894837155413, + "learning_rate": 0.00018703131460949554, + "loss": 0.7614, + "step": 355 + }, + { + "epoch": 0.18986666666666666, + "grad_norm": 0.4130993532940369, + "learning_rate": 0.0001869460785202802, + "loss": 0.7396, + "step": 356 + }, + { + "epoch": 0.1904, + "grad_norm": 0.47012331091275783, + "learning_rate": 0.00018686058279687698, + "loss": 0.8026, + "step": 357 + }, + { + "epoch": 0.19093333333333334, + "grad_norm": 0.4929187617955688, + "learning_rate": 0.00018677482769458904, + "loss": 0.8644, + "step": 358 + }, + { + "epoch": 0.19146666666666667, + "grad_norm": 0.5990703888599299, + "learning_rate": 0.00018668881346949417, + "loss": 0.8559, + "step": 359 + }, + { + "epoch": 0.192, + "grad_norm": 0.609077809217428, + "learning_rate": 0.00018660254037844388, + "loss": 0.9045, + "step": 360 + }, + { + "epoch": 0.19253333333333333, + "grad_norm": 0.4424878133988063, + "learning_rate": 0.00018651600867906272, + "loss": 0.7656, + "step": 361 + }, + { + "epoch": 0.19306666666666666, + "grad_norm": 0.46215474420371894, + "learning_rate": 0.00018642921862974742, + "loss": 0.8207, + "step": 362 + }, + { + "epoch": 0.1936, + "grad_norm": 0.5061061618813556, + "learning_rate": 0.00018634217048966637, + "loss": 0.8528, + "step": 363 + }, + { + "epoch": 0.19413333333333332, + "grad_norm": 0.5097362762712476, + "learning_rate": 0.00018625486451875843, + "loss": 0.7266, + "step": 364 + }, + { + "epoch": 0.19466666666666665, + "grad_norm": 0.4565147351609864, + "learning_rate": 0.0001861673009777325, + "loss": 0.804, + "step": 365 + }, + { + "epoch": 0.1952, + "grad_norm": 0.3847984326162943, + "learning_rate": 0.0001860794801280666, + "loss": 0.7273, + "step": 366 + }, + { + "epoch": 0.19573333333333334, + "grad_norm": 0.5152582717564496, + "learning_rate": 0.00018599140223200716, + "loss": 0.8093, + "step": 367 + }, + { + "epoch": 0.19626666666666667, + "grad_norm": 0.47173214566980104, + "learning_rate": 0.0001859030675525681, + "loss": 0.7882, + "step": 368 + }, + { + "epoch": 0.1968, + "grad_norm": 0.5261921918919323, + "learning_rate": 0.0001858144763535302, + "loss": 0.8545, + "step": 369 + }, + { + "epoch": 0.19733333333333333, + "grad_norm": 0.41804637000571243, + "learning_rate": 0.0001857256288994402, + "loss": 0.7672, + "step": 370 + }, + { + "epoch": 0.19786666666666666, + "grad_norm": 0.48436297650607635, + "learning_rate": 0.00018563652545561013, + "loss": 0.818, + "step": 371 + }, + { + "epoch": 0.1984, + "grad_norm": 0.3965302090838612, + "learning_rate": 0.0001855471662881164, + "loss": 0.6821, + "step": 372 + }, + { + "epoch": 0.19893333333333332, + "grad_norm": 0.49489973326421677, + "learning_rate": 0.000185457551663799, + "loss": 0.7431, + "step": 373 + }, + { + "epoch": 0.19946666666666665, + "grad_norm": 0.4122066592989025, + "learning_rate": 0.00018536768185026083, + "loss": 0.7383, + "step": 374 + }, + { + "epoch": 0.2, + "grad_norm": 0.4951284382180182, + "learning_rate": 0.00018527755711586678, + "loss": 0.7821, + "step": 375 + }, + { + "epoch": 0.20053333333333334, + "grad_norm": 0.4576128031894911, + "learning_rate": 0.00018518717772974302, + "loss": 0.7218, + "step": 376 + }, + { + "epoch": 0.20106666666666667, + "grad_norm": 0.4493219019007947, + "learning_rate": 0.00018509654396177609, + "loss": 0.6925, + "step": 377 + }, + { + "epoch": 0.2016, + "grad_norm": 0.4068671936907414, + "learning_rate": 0.00018500565608261214, + "loss": 0.7211, + "step": 378 + }, + { + "epoch": 0.20213333333333333, + "grad_norm": 0.5713010081475005, + "learning_rate": 0.00018491451436365627, + "loss": 0.9084, + "step": 379 + }, + { + "epoch": 0.20266666666666666, + "grad_norm": 0.42999463693606377, + "learning_rate": 0.0001848231190770714, + "loss": 0.6999, + "step": 380 + }, + { + "epoch": 0.2032, + "grad_norm": 0.49468756297859245, + "learning_rate": 0.00018473147049577774, + "loss": 0.8386, + "step": 381 + }, + { + "epoch": 0.20373333333333332, + "grad_norm": 0.5410893554444948, + "learning_rate": 0.00018463956889345194, + "loss": 0.765, + "step": 382 + }, + { + "epoch": 0.20426666666666668, + "grad_norm": 0.4546813988185285, + "learning_rate": 0.00018454741454452603, + "loss": 0.8114, + "step": 383 + }, + { + "epoch": 0.2048, + "grad_norm": 0.4504507568105852, + "learning_rate": 0.00018445500772418697, + "loss": 0.7781, + "step": 384 + }, + { + "epoch": 0.20533333333333334, + "grad_norm": 0.44249447326871505, + "learning_rate": 0.00018436234870837547, + "loss": 0.7737, + "step": 385 + }, + { + "epoch": 0.20586666666666667, + "grad_norm": 0.47403384634933343, + "learning_rate": 0.00018426943777378552, + "loss": 0.6962, + "step": 386 + }, + { + "epoch": 0.2064, + "grad_norm": 0.41782602249492984, + "learning_rate": 0.00018417627519786315, + "loss": 0.7021, + "step": 387 + }, + { + "epoch": 0.20693333333333333, + "grad_norm": 0.5043278124162154, + "learning_rate": 0.00018408286125880604, + "loss": 0.7627, + "step": 388 + }, + { + "epoch": 0.20746666666666666, + "grad_norm": 0.40891139304823176, + "learning_rate": 0.00018398919623556238, + "loss": 0.7153, + "step": 389 + }, + { + "epoch": 0.208, + "grad_norm": 0.4760123604558327, + "learning_rate": 0.00018389528040783012, + "loss": 0.7927, + "step": 390 + }, + { + "epoch": 0.20853333333333332, + "grad_norm": 0.4801579781447219, + "learning_rate": 0.0001838011140560562, + "loss": 0.8051, + "step": 391 + }, + { + "epoch": 0.20906666666666668, + "grad_norm": 0.5402948140632149, + "learning_rate": 0.00018370669746143564, + "loss": 0.7613, + "step": 392 + }, + { + "epoch": 0.2096, + "grad_norm": 0.5388484457151901, + "learning_rate": 0.00018361203090591071, + "loss": 0.8343, + "step": 393 + }, + { + "epoch": 0.21013333333333334, + "grad_norm": 0.5005329170598786, + "learning_rate": 0.0001835171146721701, + "loss": 0.8039, + "step": 394 + }, + { + "epoch": 0.21066666666666667, + "grad_norm": 0.5127304530705005, + "learning_rate": 0.00018342194904364813, + "loss": 0.8015, + "step": 395 + }, + { + "epoch": 0.2112, + "grad_norm": 0.5606682188930232, + "learning_rate": 0.00018332653430452376, + "loss": 0.7967, + "step": 396 + }, + { + "epoch": 0.21173333333333333, + "grad_norm": 0.5731550904456906, + "learning_rate": 0.00018323087073971993, + "loss": 0.8197, + "step": 397 + }, + { + "epoch": 0.21226666666666666, + "grad_norm": 0.437099128458472, + "learning_rate": 0.00018313495863490258, + "loss": 0.7945, + "step": 398 + }, + { + "epoch": 0.2128, + "grad_norm": 0.4401442760833195, + "learning_rate": 0.00018303879827647975, + "loss": 0.76, + "step": 399 + }, + { + "epoch": 0.21333333333333335, + "grad_norm": 0.44568903031232776, + "learning_rate": 0.00018294238995160094, + "loss": 0.7695, + "step": 400 + }, + { + "epoch": 0.21386666666666668, + "grad_norm": 0.4662245087633688, + "learning_rate": 0.00018284573394815597, + "loss": 0.7496, + "step": 401 + }, + { + "epoch": 0.2144, + "grad_norm": 0.48524667118673764, + "learning_rate": 0.00018274883055477436, + "loss": 0.7303, + "step": 402 + }, + { + "epoch": 0.21493333333333334, + "grad_norm": 0.4858668556114856, + "learning_rate": 0.00018265168006082437, + "loss": 0.7747, + "step": 403 + }, + { + "epoch": 0.21546666666666667, + "grad_norm": 0.4274364003282721, + "learning_rate": 0.00018255428275641214, + "loss": 0.7606, + "step": 404 + }, + { + "epoch": 0.216, + "grad_norm": 0.5174559878401154, + "learning_rate": 0.00018245663893238075, + "loss": 0.7883, + "step": 405 + }, + { + "epoch": 0.21653333333333333, + "grad_norm": 0.38984437501119923, + "learning_rate": 0.0001823587488803095, + "loss": 0.6835, + "step": 406 + }, + { + "epoch": 0.21706666666666666, + "grad_norm": 0.510189489917776, + "learning_rate": 0.00018226061289251298, + "loss": 0.7993, + "step": 407 + }, + { + "epoch": 0.2176, + "grad_norm": 0.46968887006552384, + "learning_rate": 0.00018216223126204007, + "loss": 0.7332, + "step": 408 + }, + { + "epoch": 0.21813333333333335, + "grad_norm": 0.4399984189996566, + "learning_rate": 0.00018206360428267332, + "loss": 0.6953, + "step": 409 + }, + { + "epoch": 0.21866666666666668, + "grad_norm": 0.4333691148080423, + "learning_rate": 0.00018196473224892784, + "loss": 0.7493, + "step": 410 + }, + { + "epoch": 0.2192, + "grad_norm": 0.41178366424868323, + "learning_rate": 0.00018186561545605054, + "loss": 0.6692, + "step": 411 + }, + { + "epoch": 0.21973333333333334, + "grad_norm": 0.44677197753635806, + "learning_rate": 0.0001817662542000192, + "loss": 0.7781, + "step": 412 + }, + { + "epoch": 0.22026666666666667, + "grad_norm": 0.43922123270983526, + "learning_rate": 0.0001816666487775416, + "loss": 0.7271, + "step": 413 + }, + { + "epoch": 0.2208, + "grad_norm": 0.5059324491995313, + "learning_rate": 0.00018156679948605467, + "loss": 0.7568, + "step": 414 + }, + { + "epoch": 0.22133333333333333, + "grad_norm": 0.42926085514645107, + "learning_rate": 0.00018146670662372354, + "loss": 0.7135, + "step": 415 + }, + { + "epoch": 0.22186666666666666, + "grad_norm": 0.5065137362294944, + "learning_rate": 0.0001813663704894407, + "loss": 0.78, + "step": 416 + }, + { + "epoch": 0.2224, + "grad_norm": 0.5253612969729855, + "learning_rate": 0.00018126579138282503, + "loss": 0.8232, + "step": 417 + }, + { + "epoch": 0.22293333333333334, + "grad_norm": 0.46219676227828016, + "learning_rate": 0.00018116496960422107, + "loss": 0.6758, + "step": 418 + }, + { + "epoch": 0.22346666666666667, + "grad_norm": 0.42711135894094, + "learning_rate": 0.00018106390545469795, + "loss": 0.7147, + "step": 419 + }, + { + "epoch": 0.224, + "grad_norm": 0.5561434628339826, + "learning_rate": 0.0001809625992360485, + "loss": 0.7576, + "step": 420 + }, + { + "epoch": 0.22453333333333333, + "grad_norm": 0.5493160920813105, + "learning_rate": 0.00018086105125078857, + "loss": 0.74, + "step": 421 + }, + { + "epoch": 0.22506666666666666, + "grad_norm": 0.4995268309248488, + "learning_rate": 0.00018075926180215576, + "loss": 0.8139, + "step": 422 + }, + { + "epoch": 0.2256, + "grad_norm": 0.4433354821933893, + "learning_rate": 0.00018065723119410884, + "loss": 0.8007, + "step": 423 + }, + { + "epoch": 0.22613333333333333, + "grad_norm": 0.5357396937903179, + "learning_rate": 0.0001805549597313267, + "loss": 0.7468, + "step": 424 + }, + { + "epoch": 0.22666666666666666, + "grad_norm": 0.4331008924018268, + "learning_rate": 0.0001804524477192075, + "loss": 0.7474, + "step": 425 + }, + { + "epoch": 0.2272, + "grad_norm": 0.41132216704825353, + "learning_rate": 0.00018034969546386757, + "loss": 0.7194, + "step": 426 + }, + { + "epoch": 0.22773333333333334, + "grad_norm": 0.4438940202038279, + "learning_rate": 0.00018024670327214084, + "loss": 0.8123, + "step": 427 + }, + { + "epoch": 0.22826666666666667, + "grad_norm": 0.45245657206075135, + "learning_rate": 0.00018014347145157755, + "loss": 0.7344, + "step": 428 + }, + { + "epoch": 0.2288, + "grad_norm": 0.4564352564095824, + "learning_rate": 0.0001800400003104436, + "loss": 0.7917, + "step": 429 + }, + { + "epoch": 0.22933333333333333, + "grad_norm": 0.3874291215657798, + "learning_rate": 0.0001799362901577196, + "loss": 0.6891, + "step": 430 + }, + { + "epoch": 0.22986666666666666, + "grad_norm": 0.5028289800240948, + "learning_rate": 0.00017983234130309968, + "loss": 0.8464, + "step": 431 + }, + { + "epoch": 0.2304, + "grad_norm": 0.5582262301866101, + "learning_rate": 0.00017972815405699103, + "loss": 0.7726, + "step": 432 + }, + { + "epoch": 0.23093333333333332, + "grad_norm": 0.42059169501629723, + "learning_rate": 0.00017962372873051252, + "loss": 0.7472, + "step": 433 + }, + { + "epoch": 0.23146666666666665, + "grad_norm": 0.41864929349079905, + "learning_rate": 0.00017951906563549397, + "loss": 0.7511, + "step": 434 + }, + { + "epoch": 0.232, + "grad_norm": 0.4871627192526415, + "learning_rate": 0.00017941416508447536, + "loss": 0.827, + "step": 435 + }, + { + "epoch": 0.23253333333333334, + "grad_norm": 0.47436886259604033, + "learning_rate": 0.00017930902739070562, + "loss": 0.7568, + "step": 436 + }, + { + "epoch": 0.23306666666666667, + "grad_norm": 0.5140453593325253, + "learning_rate": 0.00017920365286814183, + "loss": 0.8633, + "step": 437 + }, + { + "epoch": 0.2336, + "grad_norm": 0.4111140417805916, + "learning_rate": 0.0001790980418314484, + "loss": 0.7271, + "step": 438 + }, + { + "epoch": 0.23413333333333333, + "grad_norm": 0.4771323698261575, + "learning_rate": 0.0001789921945959958, + "loss": 0.7806, + "step": 439 + }, + { + "epoch": 0.23466666666666666, + "grad_norm": 0.4563126889883716, + "learning_rate": 0.00017888611147786002, + "loss": 0.7908, + "step": 440 + }, + { + "epoch": 0.2352, + "grad_norm": 0.4031187640915982, + "learning_rate": 0.00017877979279382135, + "loss": 0.711, + "step": 441 + }, + { + "epoch": 0.23573333333333332, + "grad_norm": 0.44264116877521864, + "learning_rate": 0.00017867323886136348, + "loss": 0.7092, + "step": 442 + }, + { + "epoch": 0.23626666666666668, + "grad_norm": 0.5345425982074496, + "learning_rate": 0.00017856644999867264, + "loss": 0.7842, + "step": 443 + }, + { + "epoch": 0.2368, + "grad_norm": 0.4161974533856049, + "learning_rate": 0.0001784594265246366, + "loss": 0.6786, + "step": 444 + }, + { + "epoch": 0.23733333333333334, + "grad_norm": 0.5142274058224503, + "learning_rate": 0.00017835216875884368, + "loss": 0.7535, + "step": 445 + }, + { + "epoch": 0.23786666666666667, + "grad_norm": 0.44854955901430443, + "learning_rate": 0.0001782446770215819, + "loss": 0.8062, + "step": 446 + }, + { + "epoch": 0.2384, + "grad_norm": 0.40994996559176605, + "learning_rate": 0.0001781369516338378, + "loss": 0.7155, + "step": 447 + }, + { + "epoch": 0.23893333333333333, + "grad_norm": 0.6256900755515169, + "learning_rate": 0.00017802899291729585, + "loss": 0.8068, + "step": 448 + }, + { + "epoch": 0.23946666666666666, + "grad_norm": 0.4660223730621378, + "learning_rate": 0.0001779208011943371, + "loss": 0.863, + "step": 449 + }, + { + "epoch": 0.24, + "grad_norm": 0.41447768458551026, + "learning_rate": 0.00017781237678803847, + "loss": 0.8082, + "step": 450 + }, + { + "epoch": 0.24053333333333332, + "grad_norm": 0.4671048691178654, + "learning_rate": 0.00017770372002217172, + "loss": 0.7543, + "step": 451 + }, + { + "epoch": 0.24106666666666668, + "grad_norm": 0.4793035132197613, + "learning_rate": 0.00017759483122120238, + "loss": 0.8101, + "step": 452 + }, + { + "epoch": 0.2416, + "grad_norm": 0.4194968825868918, + "learning_rate": 0.000177485710710289, + "loss": 0.734, + "step": 453 + }, + { + "epoch": 0.24213333333333334, + "grad_norm": 0.4756278061404401, + "learning_rate": 0.00017737635881528196, + "loss": 0.6976, + "step": 454 + }, + { + "epoch": 0.24266666666666667, + "grad_norm": 0.5252695532481464, + "learning_rate": 0.00017726677586272263, + "loss": 0.8406, + "step": 455 + }, + { + "epoch": 0.2432, + "grad_norm": 0.49637457592042844, + "learning_rate": 0.00017715696217984235, + "loss": 0.7017, + "step": 456 + }, + { + "epoch": 0.24373333333333333, + "grad_norm": 0.45696393902348287, + "learning_rate": 0.00017704691809456143, + "loss": 0.7684, + "step": 457 + }, + { + "epoch": 0.24426666666666666, + "grad_norm": 0.7093100010804253, + "learning_rate": 0.0001769366439354882, + "loss": 0.8497, + "step": 458 + }, + { + "epoch": 0.2448, + "grad_norm": 0.385589836558898, + "learning_rate": 0.00017682614003191807, + "loss": 0.7077, + "step": 459 + }, + { + "epoch": 0.24533333333333332, + "grad_norm": 0.4437814175166149, + "learning_rate": 0.00017671540671383243, + "loss": 0.6783, + "step": 460 + }, + { + "epoch": 0.24586666666666668, + "grad_norm": 0.5403208355435255, + "learning_rate": 0.0001766044443118978, + "loss": 0.8167, + "step": 461 + }, + { + "epoch": 0.2464, + "grad_norm": 0.4777519924737615, + "learning_rate": 0.00017649325315746478, + "loss": 0.7405, + "step": 462 + }, + { + "epoch": 0.24693333333333334, + "grad_norm": 0.47455092881601535, + "learning_rate": 0.00017638183358256696, + "loss": 0.7418, + "step": 463 + }, + { + "epoch": 0.24746666666666667, + "grad_norm": 0.5078254188770654, + "learning_rate": 0.00017627018591992018, + "loss": 0.8115, + "step": 464 + }, + { + "epoch": 0.248, + "grad_norm": 0.4848194549707084, + "learning_rate": 0.0001761583105029213, + "loss": 0.8614, + "step": 465 + }, + { + "epoch": 0.24853333333333333, + "grad_norm": 0.4199230658226091, + "learning_rate": 0.00017604620766564723, + "loss": 0.6882, + "step": 466 + }, + { + "epoch": 0.24906666666666666, + "grad_norm": 0.5610581015624099, + "learning_rate": 0.00017593387774285412, + "loss": 0.7367, + "step": 467 + }, + { + "epoch": 0.2496, + "grad_norm": 0.5203286430950974, + "learning_rate": 0.00017582132106997616, + "loss": 0.8513, + "step": 468 + }, + { + "epoch": 0.2501333333333333, + "grad_norm": 0.4507578809785903, + "learning_rate": 0.0001757085379831246, + "loss": 0.7752, + "step": 469 + }, + { + "epoch": 0.25066666666666665, + "grad_norm": 0.5027680945230145, + "learning_rate": 0.00017559552881908695, + "loss": 0.7688, + "step": 470 + }, + { + "epoch": 0.2512, + "grad_norm": 0.6395933200679313, + "learning_rate": 0.00017548229391532572, + "loss": 0.8296, + "step": 471 + }, + { + "epoch": 0.2517333333333333, + "grad_norm": 0.40110869471345095, + "learning_rate": 0.00017536883360997743, + "loss": 0.712, + "step": 472 + }, + { + "epoch": 0.25226666666666664, + "grad_norm": 0.48579031734817096, + "learning_rate": 0.00017525514824185185, + "loss": 0.8177, + "step": 473 + }, + { + "epoch": 0.2528, + "grad_norm": 0.43157045704585567, + "learning_rate": 0.00017514123815043074, + "loss": 0.7617, + "step": 474 + }, + { + "epoch": 0.25333333333333335, + "grad_norm": 0.3977611529980882, + "learning_rate": 0.00017502710367586687, + "loss": 0.7125, + "step": 475 + }, + { + "epoch": 0.2538666666666667, + "grad_norm": 0.38024132887630846, + "learning_rate": 0.0001749127451589832, + "loss": 0.7171, + "step": 476 + }, + { + "epoch": 0.2544, + "grad_norm": 0.4706211326108242, + "learning_rate": 0.00017479816294127152, + "loss": 0.7344, + "step": 477 + }, + { + "epoch": 0.25493333333333335, + "grad_norm": 0.46613957605230466, + "learning_rate": 0.00017468335736489177, + "loss": 0.7434, + "step": 478 + }, + { + "epoch": 0.2554666666666667, + "grad_norm": 0.4322078029947928, + "learning_rate": 0.00017456832877267084, + "loss": 0.7233, + "step": 479 + }, + { + "epoch": 0.256, + "grad_norm": 0.43238643453268405, + "learning_rate": 0.0001744530775081015, + "loss": 0.7688, + "step": 480 + }, + { + "epoch": 0.25653333333333334, + "grad_norm": 0.45474994966479154, + "learning_rate": 0.00017433760391534167, + "loss": 0.8313, + "step": 481 + }, + { + "epoch": 0.25706666666666667, + "grad_norm": 0.4251512961228027, + "learning_rate": 0.00017422190833921283, + "loss": 0.7558, + "step": 482 + }, + { + "epoch": 0.2576, + "grad_norm": 0.45536218212756147, + "learning_rate": 0.0001741059911251997, + "loss": 0.7753, + "step": 483 + }, + { + "epoch": 0.2581333333333333, + "grad_norm": 0.505119345914215, + "learning_rate": 0.00017398985261944856, + "loss": 0.7423, + "step": 484 + }, + { + "epoch": 0.25866666666666666, + "grad_norm": 0.39638892664668407, + "learning_rate": 0.00017387349316876666, + "loss": 0.7193, + "step": 485 + }, + { + "epoch": 0.2592, + "grad_norm": 0.5170721093418492, + "learning_rate": 0.000173756913120621, + "loss": 0.7876, + "step": 486 + }, + { + "epoch": 0.2597333333333333, + "grad_norm": 0.5830209511644934, + "learning_rate": 0.0001736401128231373, + "loss": 0.7915, + "step": 487 + }, + { + "epoch": 0.26026666666666665, + "grad_norm": 0.4250910734443285, + "learning_rate": 0.00017352309262509894, + "loss": 0.7623, + "step": 488 + }, + { + "epoch": 0.2608, + "grad_norm": 0.5508648649338562, + "learning_rate": 0.00017340585287594604, + "loss": 0.8407, + "step": 489 + }, + { + "epoch": 0.2613333333333333, + "grad_norm": 0.44339150521035003, + "learning_rate": 0.0001732883939257742, + "loss": 0.821, + "step": 490 + }, + { + "epoch": 0.2618666666666667, + "grad_norm": 0.4204590356364698, + "learning_rate": 0.0001731707161253338, + "loss": 0.7017, + "step": 491 + }, + { + "epoch": 0.2624, + "grad_norm": 0.4266174970093984, + "learning_rate": 0.0001730528198260285, + "loss": 0.7526, + "step": 492 + }, + { + "epoch": 0.26293333333333335, + "grad_norm": 0.3686983453315899, + "learning_rate": 0.00017293470537991463, + "loss": 0.7443, + "step": 493 + }, + { + "epoch": 0.2634666666666667, + "grad_norm": 0.49394407496261644, + "learning_rate": 0.00017281637313969978, + "loss": 0.8009, + "step": 494 + }, + { + "epoch": 0.264, + "grad_norm": 0.3921562939510097, + "learning_rate": 0.00017269782345874203, + "loss": 0.727, + "step": 495 + }, + { + "epoch": 0.26453333333333334, + "grad_norm": 0.4863703161638804, + "learning_rate": 0.00017257905669104874, + "loss": 0.8435, + "step": 496 + }, + { + "epoch": 0.2650666666666667, + "grad_norm": 0.45034045917608595, + "learning_rate": 0.00017246007319127545, + "loss": 0.735, + "step": 497 + }, + { + "epoch": 0.2656, + "grad_norm": 0.47138423288489745, + "learning_rate": 0.00017234087331472497, + "loss": 0.7411, + "step": 498 + }, + { + "epoch": 0.26613333333333333, + "grad_norm": 0.46312281124205457, + "learning_rate": 0.00017222145741734626, + "loss": 0.7081, + "step": 499 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 0.43680826598044026, + "learning_rate": 0.00017210182585573327, + "loss": 0.7495, + "step": 500 + }, + { + "epoch": 0.2672, + "grad_norm": 0.5512546016583748, + "learning_rate": 0.00017198197898712404, + "loss": 0.8107, + "step": 501 + }, + { + "epoch": 0.2677333333333333, + "grad_norm": 0.3735916418525817, + "learning_rate": 0.00017186191716939944, + "loss": 0.6893, + "step": 502 + }, + { + "epoch": 0.26826666666666665, + "grad_norm": 0.7983497181336426, + "learning_rate": 0.0001717416407610824, + "loss": 0.7733, + "step": 503 + }, + { + "epoch": 0.2688, + "grad_norm": 0.5070898993355771, + "learning_rate": 0.00017162115012133643, + "loss": 0.8004, + "step": 504 + }, + { + "epoch": 0.2693333333333333, + "grad_norm": 0.5557912695238505, + "learning_rate": 0.00017150044560996488, + "loss": 0.8447, + "step": 505 + }, + { + "epoch": 0.26986666666666664, + "grad_norm": 0.4265134918906563, + "learning_rate": 0.00017137952758740978, + "loss": 0.7431, + "step": 506 + }, + { + "epoch": 0.2704, + "grad_norm": 0.5131335988369845, + "learning_rate": 0.00017125839641475072, + "loss": 0.6951, + "step": 507 + }, + { + "epoch": 0.27093333333333336, + "grad_norm": 0.46778912176756743, + "learning_rate": 0.00017113705245370368, + "loss": 0.8554, + "step": 508 + }, + { + "epoch": 0.2714666666666667, + "grad_norm": 0.5839226757599676, + "learning_rate": 0.00017101549606662024, + "loss": 0.8272, + "step": 509 + }, + { + "epoch": 0.272, + "grad_norm": 0.4292373049276344, + "learning_rate": 0.00017089372761648616, + "loss": 0.7448, + "step": 510 + }, + { + "epoch": 0.27253333333333335, + "grad_norm": 0.5027417769836826, + "learning_rate": 0.00017077174746692056, + "loss": 0.8897, + "step": 511 + }, + { + "epoch": 0.2730666666666667, + "grad_norm": 0.5143260943645462, + "learning_rate": 0.00017064955598217462, + "loss": 0.8561, + "step": 512 + }, + { + "epoch": 0.2736, + "grad_norm": 0.459333183248125, + "learning_rate": 0.00017052715352713075, + "loss": 0.7571, + "step": 513 + }, + { + "epoch": 0.27413333333333334, + "grad_norm": 0.5267442512392976, + "learning_rate": 0.00017040454046730115, + "loss": 0.8453, + "step": 514 + }, + { + "epoch": 0.27466666666666667, + "grad_norm": 0.4829723879868959, + "learning_rate": 0.00017028171716882714, + "loss": 0.7506, + "step": 515 + }, + { + "epoch": 0.2752, + "grad_norm": 0.45171016539865366, + "learning_rate": 0.00017015868399847768, + "loss": 0.7365, + "step": 516 + }, + { + "epoch": 0.27573333333333333, + "grad_norm": 0.4835600138246436, + "learning_rate": 0.00017003544132364846, + "loss": 0.7528, + "step": 517 + }, + { + "epoch": 0.27626666666666666, + "grad_norm": 0.4903456759773783, + "learning_rate": 0.00016991198951236088, + "loss": 0.7437, + "step": 518 + }, + { + "epoch": 0.2768, + "grad_norm": 0.4502694723956876, + "learning_rate": 0.00016978832893326074, + "loss": 0.729, + "step": 519 + }, + { + "epoch": 0.2773333333333333, + "grad_norm": 0.39570873025854986, + "learning_rate": 0.00016966445995561727, + "loss": 0.6472, + "step": 520 + }, + { + "epoch": 0.27786666666666665, + "grad_norm": 0.5791683138724544, + "learning_rate": 0.00016954038294932216, + "loss": 0.851, + "step": 521 + }, + { + "epoch": 0.2784, + "grad_norm": 0.44369166106283936, + "learning_rate": 0.00016941609828488807, + "loss": 0.745, + "step": 522 + }, + { + "epoch": 0.2789333333333333, + "grad_norm": 0.47344841197158355, + "learning_rate": 0.0001692916063334479, + "loss": 0.7759, + "step": 523 + }, + { + "epoch": 0.27946666666666664, + "grad_norm": 0.4341654595149137, + "learning_rate": 0.0001691669074667535, + "loss": 0.7456, + "step": 524 + }, + { + "epoch": 0.28, + "grad_norm": 0.47784091229509834, + "learning_rate": 0.0001690420020571747, + "loss": 0.8005, + "step": 525 + }, + { + "epoch": 0.28053333333333336, + "grad_norm": 0.4965772109689377, + "learning_rate": 0.0001689168904776979, + "loss": 0.777, + "step": 526 + }, + { + "epoch": 0.2810666666666667, + "grad_norm": 0.489350617666454, + "learning_rate": 0.00016879157310192535, + "loss": 0.7874, + "step": 527 + }, + { + "epoch": 0.2816, + "grad_norm": 0.3797260577387539, + "learning_rate": 0.0001686660503040737, + "loss": 0.7212, + "step": 528 + }, + { + "epoch": 0.28213333333333335, + "grad_norm": 0.719418060615365, + "learning_rate": 0.00016854032245897308, + "loss": 0.8886, + "step": 529 + }, + { + "epoch": 0.2826666666666667, + "grad_norm": 0.4122537871991835, + "learning_rate": 0.00016841438994206595, + "loss": 0.6815, + "step": 530 + }, + { + "epoch": 0.2832, + "grad_norm": 0.4045742939869129, + "learning_rate": 0.00016828825312940592, + "loss": 0.7244, + "step": 531 + }, + { + "epoch": 0.28373333333333334, + "grad_norm": 0.7494132124205607, + "learning_rate": 0.00016816191239765667, + "loss": 0.8476, + "step": 532 + }, + { + "epoch": 0.28426666666666667, + "grad_norm": 0.5129079548438056, + "learning_rate": 0.00016803536812409075, + "loss": 0.8489, + "step": 533 + }, + { + "epoch": 0.2848, + "grad_norm": 0.4769566008781452, + "learning_rate": 0.0001679086206865886, + "loss": 0.7932, + "step": 534 + }, + { + "epoch": 0.2853333333333333, + "grad_norm": 0.464450980037988, + "learning_rate": 0.00016778167046363734, + "loss": 0.7201, + "step": 535 + }, + { + "epoch": 0.28586666666666666, + "grad_norm": 0.46167722460127364, + "learning_rate": 0.00016765451783432953, + "loss": 0.7502, + "step": 536 + }, + { + "epoch": 0.2864, + "grad_norm": 0.3993696568375122, + "learning_rate": 0.00016752716317836229, + "loss": 0.7628, + "step": 537 + }, + { + "epoch": 0.2869333333333333, + "grad_norm": 0.411043963699614, + "learning_rate": 0.0001673996068760359, + "loss": 0.7087, + "step": 538 + }, + { + "epoch": 0.28746666666666665, + "grad_norm": 0.4240472113939744, + "learning_rate": 0.00016727184930825288, + "loss": 0.663, + "step": 539 + }, + { + "epoch": 0.288, + "grad_norm": 0.5982234250548082, + "learning_rate": 0.0001671438908565167, + "loss": 0.8586, + "step": 540 + }, + { + "epoch": 0.2885333333333333, + "grad_norm": 0.3972292074402739, + "learning_rate": 0.00016701573190293077, + "loss": 0.7286, + "step": 541 + }, + { + "epoch": 0.2890666666666667, + "grad_norm": 0.401535905669461, + "learning_rate": 0.00016688737283019706, + "loss": 0.6492, + "step": 542 + }, + { + "epoch": 0.2896, + "grad_norm": 0.44233011442238324, + "learning_rate": 0.00016675881402161536, + "loss": 0.7529, + "step": 543 + }, + { + "epoch": 0.29013333333333335, + "grad_norm": 0.5054692473679642, + "learning_rate": 0.00016663005586108176, + "loss": 0.7722, + "step": 544 + }, + { + "epoch": 0.2906666666666667, + "grad_norm": 0.4791933574646442, + "learning_rate": 0.00016650109873308765, + "loss": 0.7403, + "step": 545 + }, + { + "epoch": 0.2912, + "grad_norm": 0.42779504040868144, + "learning_rate": 0.0001663719430227186, + "loss": 0.7867, + "step": 546 + }, + { + "epoch": 0.29173333333333334, + "grad_norm": 0.4512514222879521, + "learning_rate": 0.0001662425891156531, + "loss": 0.7094, + "step": 547 + }, + { + "epoch": 0.2922666666666667, + "grad_norm": 0.4229210794064544, + "learning_rate": 0.00016611303739816168, + "loss": 0.7424, + "step": 548 + }, + { + "epoch": 0.2928, + "grad_norm": 0.43432322426676645, + "learning_rate": 0.00016598328825710533, + "loss": 0.7651, + "step": 549 + }, + { + "epoch": 0.29333333333333333, + "grad_norm": 0.7159268812283518, + "learning_rate": 0.00016585334207993476, + "loss": 0.8139, + "step": 550 + }, + { + "epoch": 0.29386666666666666, + "grad_norm": 0.47140640466596184, + "learning_rate": 0.00016572319925468892, + "loss": 0.704, + "step": 551 + }, + { + "epoch": 0.2944, + "grad_norm": 0.4413700493453814, + "learning_rate": 0.000165592860169994, + "loss": 0.7438, + "step": 552 + }, + { + "epoch": 0.2949333333333333, + "grad_norm": 0.4648841803809812, + "learning_rate": 0.0001654623252150624, + "loss": 0.7785, + "step": 553 + }, + { + "epoch": 0.29546666666666666, + "grad_norm": 0.4807862323463371, + "learning_rate": 0.00016533159477969122, + "loss": 0.7169, + "step": 554 + }, + { + "epoch": 0.296, + "grad_norm": 0.4512623686501777, + "learning_rate": 0.00016520066925426144, + "loss": 0.6775, + "step": 555 + }, + { + "epoch": 0.2965333333333333, + "grad_norm": 0.4350999291313162, + "learning_rate": 0.00016506954902973655, + "loss": 0.7127, + "step": 556 + }, + { + "epoch": 0.29706666666666665, + "grad_norm": 0.48905734134433115, + "learning_rate": 0.00016493823449766136, + "loss": 0.7746, + "step": 557 + }, + { + "epoch": 0.2976, + "grad_norm": 0.4292773298576345, + "learning_rate": 0.0001648067260501611, + "loss": 0.645, + "step": 558 + }, + { + "epoch": 0.2981333333333333, + "grad_norm": 0.5473772978878985, + "learning_rate": 0.00016467502407993992, + "loss": 0.8532, + "step": 559 + }, + { + "epoch": 0.2986666666666667, + "grad_norm": 0.4297784283931829, + "learning_rate": 0.0001645431289802799, + "loss": 0.6987, + "step": 560 + }, + { + "epoch": 0.2992, + "grad_norm": 0.4535468380483471, + "learning_rate": 0.0001644110411450398, + "loss": 0.8215, + "step": 561 + }, + { + "epoch": 0.29973333333333335, + "grad_norm": 0.4997031553280827, + "learning_rate": 0.00016427876096865394, + "loss": 0.7631, + "step": 562 + }, + { + "epoch": 0.3002666666666667, + "grad_norm": 0.48418173193860375, + "learning_rate": 0.00016414628884613107, + "loss": 0.6708, + "step": 563 + }, + { + "epoch": 0.3008, + "grad_norm": 0.581101794864796, + "learning_rate": 0.00016401362517305296, + "loss": 0.7584, + "step": 564 + }, + { + "epoch": 0.30133333333333334, + "grad_norm": 0.41527374436791076, + "learning_rate": 0.00016388077034557355, + "loss": 0.6795, + "step": 565 + }, + { + "epoch": 0.30186666666666667, + "grad_norm": 0.41383128210916, + "learning_rate": 0.00016374772476041748, + "loss": 0.7419, + "step": 566 + }, + { + "epoch": 0.3024, + "grad_norm": 0.4498028082700358, + "learning_rate": 0.00016361448881487914, + "loss": 0.7256, + "step": 567 + }, + { + "epoch": 0.30293333333333333, + "grad_norm": 0.48312892011273806, + "learning_rate": 0.00016348106290682118, + "loss": 0.8031, + "step": 568 + }, + { + "epoch": 0.30346666666666666, + "grad_norm": 0.4363637584208029, + "learning_rate": 0.00016334744743467364, + "loss": 0.7502, + "step": 569 + }, + { + "epoch": 0.304, + "grad_norm": 0.4380402858113743, + "learning_rate": 0.00016321364279743266, + "loss": 0.7685, + "step": 570 + }, + { + "epoch": 0.3045333333333333, + "grad_norm": 0.5372704180044607, + "learning_rate": 0.00016307964939465914, + "loss": 0.8039, + "step": 571 + }, + { + "epoch": 0.30506666666666665, + "grad_norm": 0.42058772305209075, + "learning_rate": 0.00016294546762647775, + "loss": 0.6993, + "step": 572 + }, + { + "epoch": 0.3056, + "grad_norm": 0.3985817884467263, + "learning_rate": 0.0001628110978935756, + "loss": 0.7098, + "step": 573 + }, + { + "epoch": 0.3061333333333333, + "grad_norm": 0.5548108735480529, + "learning_rate": 0.0001626765405972011, + "loss": 0.8505, + "step": 574 + }, + { + "epoch": 0.30666666666666664, + "grad_norm": 0.4963906065238684, + "learning_rate": 0.00016254179613916278, + "loss": 0.7662, + "step": 575 + }, + { + "epoch": 0.3072, + "grad_norm": 0.43676747549593675, + "learning_rate": 0.00016240686492182804, + "loss": 0.7631, + "step": 576 + }, + { + "epoch": 0.30773333333333336, + "grad_norm": 0.4401411422093453, + "learning_rate": 0.000162271747348122, + "loss": 0.7322, + "step": 577 + }, + { + "epoch": 0.3082666666666667, + "grad_norm": 0.5302779178800815, + "learning_rate": 0.0001621364438215262, + "loss": 0.7402, + "step": 578 + }, + { + "epoch": 0.3088, + "grad_norm": 0.4958846122339328, + "learning_rate": 0.00016200095474607753, + "loss": 0.8676, + "step": 579 + }, + { + "epoch": 0.30933333333333335, + "grad_norm": 0.5722256043147971, + "learning_rate": 0.00016186528052636692, + "loss": 0.7983, + "step": 580 + }, + { + "epoch": 0.3098666666666667, + "grad_norm": 0.6465608576747207, + "learning_rate": 0.0001617294215675382, + "loss": 0.8204, + "step": 581 + }, + { + "epoch": 0.3104, + "grad_norm": 0.41234277879273545, + "learning_rate": 0.00016159337827528685, + "loss": 0.7872, + "step": 582 + }, + { + "epoch": 0.31093333333333334, + "grad_norm": 0.6233071464537084, + "learning_rate": 0.0001614571510558588, + "loss": 0.8753, + "step": 583 + }, + { + "epoch": 0.31146666666666667, + "grad_norm": 0.4568477377015119, + "learning_rate": 0.00016132074031604917, + "loss": 0.7846, + "step": 584 + }, + { + "epoch": 0.312, + "grad_norm": 0.41316253146665566, + "learning_rate": 0.0001611841464632011, + "loss": 0.7075, + "step": 585 + }, + { + "epoch": 0.31253333333333333, + "grad_norm": 0.465154997074634, + "learning_rate": 0.00016104736990520468, + "loss": 0.7647, + "step": 586 + }, + { + "epoch": 0.31306666666666666, + "grad_norm": 0.39978449383760306, + "learning_rate": 0.0001609104110504954, + "loss": 0.704, + "step": 587 + }, + { + "epoch": 0.3136, + "grad_norm": 0.4697537669165921, + "learning_rate": 0.0001607732703080532, + "loss": 0.7608, + "step": 588 + }, + { + "epoch": 0.3141333333333333, + "grad_norm": 0.488790150430786, + "learning_rate": 0.00016063594808740113, + "loss": 0.7766, + "step": 589 + }, + { + "epoch": 0.31466666666666665, + "grad_norm": 0.3892107195990382, + "learning_rate": 0.00016049844479860422, + "loss": 0.703, + "step": 590 + }, + { + "epoch": 0.3152, + "grad_norm": 0.5189422916788797, + "learning_rate": 0.00016036076085226814, + "loss": 0.8422, + "step": 591 + }, + { + "epoch": 0.3157333333333333, + "grad_norm": 0.5430678631199745, + "learning_rate": 0.00016022289665953808, + "loss": 0.7868, + "step": 592 + }, + { + "epoch": 0.31626666666666664, + "grad_norm": 0.47630987530323626, + "learning_rate": 0.00016008485263209742, + "loss": 0.7861, + "step": 593 + }, + { + "epoch": 0.3168, + "grad_norm": 0.5376800915747006, + "learning_rate": 0.0001599466291821666, + "loss": 0.6851, + "step": 594 + }, + { + "epoch": 0.31733333333333336, + "grad_norm": 0.4944054174823598, + "learning_rate": 0.0001598082267225018, + "loss": 0.7422, + "step": 595 + }, + { + "epoch": 0.3178666666666667, + "grad_norm": 0.44069188496058753, + "learning_rate": 0.0001596696456663938, + "loss": 0.6973, + "step": 596 + }, + { + "epoch": 0.3184, + "grad_norm": 0.46226494684837316, + "learning_rate": 0.0001595308864276666, + "loss": 0.7538, + "step": 597 + }, + { + "epoch": 0.31893333333333335, + "grad_norm": 0.4443950227191254, + "learning_rate": 0.00015939194942067646, + "loss": 0.7399, + "step": 598 + }, + { + "epoch": 0.3194666666666667, + "grad_norm": 0.5685552493768595, + "learning_rate": 0.0001592528350603103, + "loss": 0.8605, + "step": 599 + }, + { + "epoch": 0.32, + "grad_norm": 0.44119141577849835, + "learning_rate": 0.0001591135437619847, + "loss": 0.6903, + "step": 600 + }, + { + "epoch": 0.32053333333333334, + "grad_norm": 0.4605611289915192, + "learning_rate": 0.00015897407594164467, + "loss": 0.6843, + "step": 601 + }, + { + "epoch": 0.32106666666666667, + "grad_norm": 0.47782950798344975, + "learning_rate": 0.00015883443201576225, + "loss": 0.7513, + "step": 602 + }, + { + "epoch": 0.3216, + "grad_norm": 0.5029707795921143, + "learning_rate": 0.0001586946124013354, + "loss": 0.7272, + "step": 603 + }, + { + "epoch": 0.3221333333333333, + "grad_norm": 0.49064029700122663, + "learning_rate": 0.00015855461751588677, + "loss": 0.7726, + "step": 604 + }, + { + "epoch": 0.32266666666666666, + "grad_norm": 0.4568333689649895, + "learning_rate": 0.0001584144477774623, + "loss": 0.753, + "step": 605 + }, + { + "epoch": 0.3232, + "grad_norm": 0.4835061310434721, + "learning_rate": 0.0001582741036046301, + "loss": 0.7956, + "step": 606 + }, + { + "epoch": 0.3237333333333333, + "grad_norm": 0.4586904494238172, + "learning_rate": 0.00015813358541647915, + "loss": 0.7414, + "step": 607 + }, + { + "epoch": 0.32426666666666665, + "grad_norm": 0.45267699473776096, + "learning_rate": 0.00015799289363261813, + "loss": 0.7528, + "step": 608 + }, + { + "epoch": 0.3248, + "grad_norm": 0.5006990456048835, + "learning_rate": 0.00015785202867317407, + "loss": 0.7552, + "step": 609 + }, + { + "epoch": 0.3253333333333333, + "grad_norm": 0.4420243760214426, + "learning_rate": 0.00015771099095879108, + "loss": 0.7097, + "step": 610 + }, + { + "epoch": 0.3258666666666667, + "grad_norm": 0.4630932696592339, + "learning_rate": 0.0001575697809106292, + "loss": 0.8051, + "step": 611 + }, + { + "epoch": 0.3264, + "grad_norm": 0.4381474959525617, + "learning_rate": 0.00015742839895036305, + "loss": 0.7712, + "step": 612 + }, + { + "epoch": 0.32693333333333335, + "grad_norm": 0.4362637164968971, + "learning_rate": 0.00015728684550018064, + "loss": 0.7134, + "step": 613 + }, + { + "epoch": 0.3274666666666667, + "grad_norm": 0.5337956170885394, + "learning_rate": 0.0001571451209827821, + "loss": 0.733, + "step": 614 + }, + { + "epoch": 0.328, + "grad_norm": 0.42619553958238987, + "learning_rate": 0.00015700322582137827, + "loss": 0.6973, + "step": 615 + }, + { + "epoch": 0.32853333333333334, + "grad_norm": 0.45982115164566545, + "learning_rate": 0.00015686116043968972, + "loss": 0.8125, + "step": 616 + }, + { + "epoch": 0.3290666666666667, + "grad_norm": 0.5187286023606732, + "learning_rate": 0.00015671892526194516, + "loss": 0.8014, + "step": 617 + }, + { + "epoch": 0.3296, + "grad_norm": 0.4030330003792373, + "learning_rate": 0.0001565765207128805, + "loss": 0.7771, + "step": 618 + }, + { + "epoch": 0.33013333333333333, + "grad_norm": 0.5213695757622626, + "learning_rate": 0.0001564339472177373, + "loss": 0.8561, + "step": 619 + }, + { + "epoch": 0.33066666666666666, + "grad_norm": 0.4430898287378288, + "learning_rate": 0.00015629120520226165, + "loss": 0.6942, + "step": 620 + }, + { + "epoch": 0.3312, + "grad_norm": 0.3960093676614589, + "learning_rate": 0.0001561482950927029, + "loss": 0.6703, + "step": 621 + }, + { + "epoch": 0.3317333333333333, + "grad_norm": 0.4461093949820488, + "learning_rate": 0.0001560052173158123, + "loss": 0.7926, + "step": 622 + }, + { + "epoch": 0.33226666666666665, + "grad_norm": 0.5209486668921074, + "learning_rate": 0.00015586197229884184, + "loss": 0.7327, + "step": 623 + }, + { + "epoch": 0.3328, + "grad_norm": 0.4174835442054574, + "learning_rate": 0.00015571856046954285, + "loss": 0.7323, + "step": 624 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.5203101606683872, + "learning_rate": 0.00015557498225616487, + "loss": 0.7456, + "step": 625 + }, + { + "epoch": 0.33386666666666664, + "grad_norm": 0.43739688753171574, + "learning_rate": 0.0001554312380874542, + "loss": 0.7156, + "step": 626 + }, + { + "epoch": 0.3344, + "grad_norm": 0.47012908154083727, + "learning_rate": 0.00015528732839265272, + "loss": 0.7493, + "step": 627 + }, + { + "epoch": 0.33493333333333336, + "grad_norm": 0.48821262555246026, + "learning_rate": 0.00015514325360149668, + "loss": 0.7935, + "step": 628 + }, + { + "epoch": 0.3354666666666667, + "grad_norm": 0.5096400845631791, + "learning_rate": 0.0001549990141442153, + "loss": 0.8074, + "step": 629 + }, + { + "epoch": 0.336, + "grad_norm": 0.5087836103278395, + "learning_rate": 0.0001548546104515294, + "loss": 0.7491, + "step": 630 + }, + { + "epoch": 0.33653333333333335, + "grad_norm": 0.40597823511570036, + "learning_rate": 0.00015471004295465035, + "loss": 0.704, + "step": 631 + }, + { + "epoch": 0.3370666666666667, + "grad_norm": 0.44189955905098144, + "learning_rate": 0.0001545653120852787, + "loss": 0.7286, + "step": 632 + }, + { + "epoch": 0.3376, + "grad_norm": 0.4094001723854417, + "learning_rate": 0.00015442041827560274, + "loss": 0.6312, + "step": 633 + }, + { + "epoch": 0.33813333333333334, + "grad_norm": 0.47703476115864146, + "learning_rate": 0.00015427536195829742, + "loss": 0.8125, + "step": 634 + }, + { + "epoch": 0.33866666666666667, + "grad_norm": 0.4131946559039073, + "learning_rate": 0.00015413014356652286, + "loss": 0.6799, + "step": 635 + }, + { + "epoch": 0.3392, + "grad_norm": 0.5054724085494187, + "learning_rate": 0.00015398476353392323, + "loss": 0.8006, + "step": 636 + }, + { + "epoch": 0.33973333333333333, + "grad_norm": 0.4295937358340469, + "learning_rate": 0.00015383922229462549, + "loss": 0.7929, + "step": 637 + }, + { + "epoch": 0.34026666666666666, + "grad_norm": 0.6344466190997674, + "learning_rate": 0.00015369352028323774, + "loss": 0.8001, + "step": 638 + }, + { + "epoch": 0.3408, + "grad_norm": 0.5029154087876303, + "learning_rate": 0.00015354765793484834, + "loss": 0.7009, + "step": 639 + }, + { + "epoch": 0.3413333333333333, + "grad_norm": 0.48505083375787206, + "learning_rate": 0.0001534016356850244, + "loss": 0.7636, + "step": 640 + }, + { + "epoch": 0.34186666666666665, + "grad_norm": 0.4281213329908672, + "learning_rate": 0.0001532554539698105, + "loss": 0.7538, + "step": 641 + }, + { + "epoch": 0.3424, + "grad_norm": 0.45395160003970775, + "learning_rate": 0.00015310911322572753, + "loss": 0.7168, + "step": 642 + }, + { + "epoch": 0.3429333333333333, + "grad_norm": 0.42365833943737535, + "learning_rate": 0.00015296261388977108, + "loss": 0.7059, + "step": 643 + }, + { + "epoch": 0.34346666666666664, + "grad_norm": 0.45763802503026313, + "learning_rate": 0.0001528159563994104, + "loss": 0.7389, + "step": 644 + }, + { + "epoch": 0.344, + "grad_norm": 0.4545983554426908, + "learning_rate": 0.000152669141192587, + "loss": 0.7503, + "step": 645 + }, + { + "epoch": 0.34453333333333336, + "grad_norm": 0.3898745640926478, + "learning_rate": 0.00015252216870771345, + "loss": 0.6693, + "step": 646 + }, + { + "epoch": 0.3450666666666667, + "grad_norm": 0.5136523474448932, + "learning_rate": 0.00015237503938367186, + "loss": 0.7637, + "step": 647 + }, + { + "epoch": 0.3456, + "grad_norm": 0.5659363117917683, + "learning_rate": 0.00015222775365981273, + "loss": 0.8121, + "step": 648 + }, + { + "epoch": 0.34613333333333335, + "grad_norm": 0.5060082102672091, + "learning_rate": 0.00015208031197595356, + "loss": 0.7696, + "step": 649 + }, + { + "epoch": 0.3466666666666667, + "grad_norm": 0.429466840029395, + "learning_rate": 0.0001519327147723776, + "loss": 0.7262, + "step": 650 + }, + { + "epoch": 0.3472, + "grad_norm": 0.4257298981810606, + "learning_rate": 0.00015178496248983254, + "loss": 0.747, + "step": 651 + }, + { + "epoch": 0.34773333333333334, + "grad_norm": 0.4204373050848424, + "learning_rate": 0.0001516370555695291, + "loss": 0.7364, + "step": 652 + }, + { + "epoch": 0.34826666666666667, + "grad_norm": 0.46223585107318704, + "learning_rate": 0.00015148899445313981, + "loss": 0.7489, + "step": 653 + }, + { + "epoch": 0.3488, + "grad_norm": 0.45422836276114315, + "learning_rate": 0.00015134077958279765, + "loss": 0.7728, + "step": 654 + }, + { + "epoch": 0.34933333333333333, + "grad_norm": 0.43489775186795826, + "learning_rate": 0.00015119241140109467, + "loss": 0.7891, + "step": 655 + }, + { + "epoch": 0.34986666666666666, + "grad_norm": 0.39069125850805453, + "learning_rate": 0.00015104389035108077, + "loss": 0.7273, + "step": 656 + }, + { + "epoch": 0.3504, + "grad_norm": 0.4246531895440103, + "learning_rate": 0.00015089521687626243, + "loss": 0.7387, + "step": 657 + }, + { + "epoch": 0.3509333333333333, + "grad_norm": 0.49072827335108293, + "learning_rate": 0.0001507463914206012, + "loss": 0.8379, + "step": 658 + }, + { + "epoch": 0.35146666666666665, + "grad_norm": 0.5478280015009428, + "learning_rate": 0.0001505974144285124, + "loss": 0.7375, + "step": 659 + }, + { + "epoch": 0.352, + "grad_norm": 0.3769981459132424, + "learning_rate": 0.000150448286344864, + "loss": 0.6606, + "step": 660 + }, + { + "epoch": 0.3525333333333333, + "grad_norm": 0.48461775989790185, + "learning_rate": 0.00015029900761497506, + "loss": 0.7789, + "step": 661 + }, + { + "epoch": 0.35306666666666664, + "grad_norm": 0.48579640510721367, + "learning_rate": 0.00015014957868461458, + "loss": 0.7903, + "step": 662 + }, + { + "epoch": 0.3536, + "grad_norm": 0.49393967471009037, + "learning_rate": 0.00015000000000000001, + "loss": 0.6961, + "step": 663 + }, + { + "epoch": 0.35413333333333336, + "grad_norm": 0.4112538399474424, + "learning_rate": 0.000149850272007796, + "loss": 0.7437, + "step": 664 + }, + { + "epoch": 0.3546666666666667, + "grad_norm": 0.5103340356646984, + "learning_rate": 0.00014970039515511304, + "loss": 0.7634, + "step": 665 + }, + { + "epoch": 0.3552, + "grad_norm": 0.4340726265092865, + "learning_rate": 0.00014955036988950618, + "loss": 0.737, + "step": 666 + }, + { + "epoch": 0.35573333333333335, + "grad_norm": 0.5477988532675317, + "learning_rate": 0.0001494001966589736, + "loss": 0.7635, + "step": 667 + }, + { + "epoch": 0.3562666666666667, + "grad_norm": 0.49762428997080593, + "learning_rate": 0.00014924987591195547, + "loss": 0.7205, + "step": 668 + }, + { + "epoch": 0.3568, + "grad_norm": 0.4617320557674953, + "learning_rate": 0.00014909940809733222, + "loss": 0.7369, + "step": 669 + }, + { + "epoch": 0.35733333333333334, + "grad_norm": 0.41210155221307404, + "learning_rate": 0.0001489487936644237, + "loss": 0.6681, + "step": 670 + }, + { + "epoch": 0.35786666666666667, + "grad_norm": 0.4910795134552265, + "learning_rate": 0.00014879803306298736, + "loss": 0.8081, + "step": 671 + }, + { + "epoch": 0.3584, + "grad_norm": 0.5213128333185546, + "learning_rate": 0.00014864712674321734, + "loss": 0.829, + "step": 672 + }, + { + "epoch": 0.3589333333333333, + "grad_norm": 0.47921788570624435, + "learning_rate": 0.00014849607515574276, + "loss": 0.7583, + "step": 673 + }, + { + "epoch": 0.35946666666666666, + "grad_norm": 0.43595138360007896, + "learning_rate": 0.00014834487875162657, + "loss": 0.76, + "step": 674 + }, + { + "epoch": 0.36, + "grad_norm": 0.42735307273006445, + "learning_rate": 0.00014819353798236427, + "loss": 0.6781, + "step": 675 + }, + { + "epoch": 0.3605333333333333, + "grad_norm": 0.39006746091488365, + "learning_rate": 0.00014804205329988225, + "loss": 0.7359, + "step": 676 + }, + { + "epoch": 0.36106666666666665, + "grad_norm": 0.5310398949825308, + "learning_rate": 0.00014789042515653687, + "loss": 0.6893, + "step": 677 + }, + { + "epoch": 0.3616, + "grad_norm": 0.46443101262094777, + "learning_rate": 0.00014773865400511272, + "loss": 0.7285, + "step": 678 + }, + { + "epoch": 0.3621333333333333, + "grad_norm": 0.5146034622929456, + "learning_rate": 0.00014758674029882152, + "loss": 0.8202, + "step": 679 + }, + { + "epoch": 0.3626666666666667, + "grad_norm": 0.4348436928102778, + "learning_rate": 0.00014743468449130063, + "loss": 0.7602, + "step": 680 + }, + { + "epoch": 0.3632, + "grad_norm": 0.4432613163819088, + "learning_rate": 0.00014728248703661182, + "loss": 0.7545, + "step": 681 + }, + { + "epoch": 0.36373333333333335, + "grad_norm": 0.42553157744987713, + "learning_rate": 0.00014713014838923976, + "loss": 0.7512, + "step": 682 + }, + { + "epoch": 0.3642666666666667, + "grad_norm": 0.4699396848331131, + "learning_rate": 0.00014697766900409074, + "loss": 0.7622, + "step": 683 + }, + { + "epoch": 0.3648, + "grad_norm": 0.40606209684153594, + "learning_rate": 0.00014682504933649144, + "loss": 0.6484, + "step": 684 + }, + { + "epoch": 0.36533333333333334, + "grad_norm": 0.4355074989092574, + "learning_rate": 0.0001466722898421873, + "loss": 0.7134, + "step": 685 + }, + { + "epoch": 0.3658666666666667, + "grad_norm": 0.46703525087855696, + "learning_rate": 0.0001465193909773413, + "loss": 0.7999, + "step": 686 + }, + { + "epoch": 0.3664, + "grad_norm": 0.4664099267583723, + "learning_rate": 0.00014636635319853275, + "loss": 0.8064, + "step": 687 + }, + { + "epoch": 0.36693333333333333, + "grad_norm": 0.5140904956668334, + "learning_rate": 0.00014621317696275564, + "loss": 0.7683, + "step": 688 + }, + { + "epoch": 0.36746666666666666, + "grad_norm": 0.4408781952457787, + "learning_rate": 0.00014605986272741748, + "loss": 0.7514, + "step": 689 + }, + { + "epoch": 0.368, + "grad_norm": 0.4800514398310778, + "learning_rate": 0.00014590641095033787, + "loss": 0.7411, + "step": 690 + }, + { + "epoch": 0.3685333333333333, + "grad_norm": 0.4211586384557557, + "learning_rate": 0.00014575282208974702, + "loss": 0.7155, + "step": 691 + }, + { + "epoch": 0.36906666666666665, + "grad_norm": 0.421653090437245, + "learning_rate": 0.00014559909660428468, + "loss": 0.7042, + "step": 692 + }, + { + "epoch": 0.3696, + "grad_norm": 0.3904998722681181, + "learning_rate": 0.00014544523495299842, + "loss": 0.6913, + "step": 693 + }, + { + "epoch": 0.3701333333333333, + "grad_norm": 0.49633206000997504, + "learning_rate": 0.00014529123759534255, + "loss": 0.7718, + "step": 694 + }, + { + "epoch": 0.37066666666666664, + "grad_norm": 0.38695000482239966, + "learning_rate": 0.00014513710499117647, + "loss": 0.6648, + "step": 695 + }, + { + "epoch": 0.3712, + "grad_norm": 0.4846438208269051, + "learning_rate": 0.0001449828376007636, + "loss": 0.729, + "step": 696 + }, + { + "epoch": 0.37173333333333336, + "grad_norm": 0.4988732653784483, + "learning_rate": 0.00014482843588476974, + "loss": 0.7105, + "step": 697 + }, + { + "epoch": 0.3722666666666667, + "grad_norm": 0.3890334427781973, + "learning_rate": 0.00014467390030426186, + "loss": 0.7642, + "step": 698 + }, + { + "epoch": 0.3728, + "grad_norm": 0.5015995117011884, + "learning_rate": 0.0001445192313207067, + "loss": 0.7951, + "step": 699 + }, + { + "epoch": 0.37333333333333335, + "grad_norm": 0.4161455709852972, + "learning_rate": 0.0001443644293959693, + "loss": 0.7556, + "step": 700 + }, + { + "epoch": 0.3738666666666667, + "grad_norm": 0.3986358064231693, + "learning_rate": 0.00014420949499231172, + "loss": 0.6742, + "step": 701 + }, + { + "epoch": 0.3744, + "grad_norm": 0.4231105241423163, + "learning_rate": 0.0001440544285723915, + "loss": 0.7393, + "step": 702 + }, + { + "epoch": 0.37493333333333334, + "grad_norm": 0.4422640127013368, + "learning_rate": 0.00014389923059926062, + "loss": 0.7435, + "step": 703 + }, + { + "epoch": 0.37546666666666667, + "grad_norm": 0.43021573691414106, + "learning_rate": 0.0001437439015363638, + "loss": 0.759, + "step": 704 + }, + { + "epoch": 0.376, + "grad_norm": 0.42485595541853743, + "learning_rate": 0.00014358844184753712, + "loss": 0.7528, + "step": 705 + }, + { + "epoch": 0.37653333333333333, + "grad_norm": 0.40479690878576835, + "learning_rate": 0.00014343285199700683, + "loss": 0.7608, + "step": 706 + }, + { + "epoch": 0.37706666666666666, + "grad_norm": 0.4932163955361533, + "learning_rate": 0.0001432771324493879, + "loss": 0.7164, + "step": 707 + }, + { + "epoch": 0.3776, + "grad_norm": 0.42200867156661387, + "learning_rate": 0.00014312128366968243, + "loss": 0.6774, + "step": 708 + }, + { + "epoch": 0.3781333333333333, + "grad_norm": 0.5467399094318625, + "learning_rate": 0.00014296530612327863, + "loss": 0.8899, + "step": 709 + }, + { + "epoch": 0.37866666666666665, + "grad_norm": 0.4169895054651297, + "learning_rate": 0.00014280920027594907, + "loss": 0.7594, + "step": 710 + }, + { + "epoch": 0.3792, + "grad_norm": 0.3938157519829676, + "learning_rate": 0.00014265296659384956, + "loss": 0.6852, + "step": 711 + }, + { + "epoch": 0.3797333333333333, + "grad_norm": 0.48117155195670075, + "learning_rate": 0.00014249660554351752, + "loss": 0.7268, + "step": 712 + }, + { + "epoch": 0.38026666666666664, + "grad_norm": 0.4522316546093445, + "learning_rate": 0.00014234011759187083, + "loss": 0.7072, + "step": 713 + }, + { + "epoch": 0.3808, + "grad_norm": 0.4434853759843272, + "learning_rate": 0.00014218350320620624, + "loss": 0.7224, + "step": 714 + }, + { + "epoch": 0.38133333333333336, + "grad_norm": 0.3982148416540636, + "learning_rate": 0.00014202676285419812, + "loss": 0.6335, + "step": 715 + }, + { + "epoch": 0.3818666666666667, + "grad_norm": 0.48780884007878533, + "learning_rate": 0.00014186989700389687, + "loss": 0.678, + "step": 716 + }, + { + "epoch": 0.3824, + "grad_norm": 0.3942170795871845, + "learning_rate": 0.0001417129061237278, + "loss": 0.6621, + "step": 717 + }, + { + "epoch": 0.38293333333333335, + "grad_norm": 0.37315975023349823, + "learning_rate": 0.0001415557906824895, + "loss": 0.6799, + "step": 718 + }, + { + "epoch": 0.3834666666666667, + "grad_norm": 0.4804017797780288, + "learning_rate": 0.00014139855114935252, + "loss": 0.677, + "step": 719 + }, + { + "epoch": 0.384, + "grad_norm": 0.4866777828183144, + "learning_rate": 0.00014124118799385796, + "loss": 0.7712, + "step": 720 + }, + { + "epoch": 0.38453333333333334, + "grad_norm": 0.48112597267634266, + "learning_rate": 0.0001410837016859161, + "loss": 0.7361, + "step": 721 + }, + { + "epoch": 0.38506666666666667, + "grad_norm": 0.49039545768870146, + "learning_rate": 0.00014092609269580496, + "loss": 0.7098, + "step": 722 + }, + { + "epoch": 0.3856, + "grad_norm": 0.4156820194805481, + "learning_rate": 0.00014076836149416887, + "loss": 0.7215, + "step": 723 + }, + { + "epoch": 0.38613333333333333, + "grad_norm": 0.449495212879705, + "learning_rate": 0.00014061050855201723, + "loss": 0.7859, + "step": 724 + }, + { + "epoch": 0.38666666666666666, + "grad_norm": 0.5517682180192388, + "learning_rate": 0.0001404525343407228, + "loss": 0.738, + "step": 725 + }, + { + "epoch": 0.3872, + "grad_norm": 0.4078107633758721, + "learning_rate": 0.0001402944393320206, + "loss": 0.7428, + "step": 726 + }, + { + "epoch": 0.3877333333333333, + "grad_norm": 0.3848257011546897, + "learning_rate": 0.00014013622399800627, + "loss": 0.6847, + "step": 727 + }, + { + "epoch": 0.38826666666666665, + "grad_norm": 0.4705304268494785, + "learning_rate": 0.00013997788881113489, + "loss": 0.6901, + "step": 728 + }, + { + "epoch": 0.3888, + "grad_norm": 0.45351375128922244, + "learning_rate": 0.00013981943424421932, + "loss": 0.697, + "step": 729 + }, + { + "epoch": 0.3893333333333333, + "grad_norm": 0.4206152695860078, + "learning_rate": 0.0001396608607704289, + "loss": 0.7215, + "step": 730 + }, + { + "epoch": 0.38986666666666664, + "grad_norm": 0.4872622191747159, + "learning_rate": 0.0001395021688632882, + "loss": 0.783, + "step": 731 + }, + { + "epoch": 0.3904, + "grad_norm": 0.42120062669574676, + "learning_rate": 0.00013934335899667527, + "loss": 0.7548, + "step": 732 + }, + { + "epoch": 0.39093333333333335, + "grad_norm": 0.42322770575160906, + "learning_rate": 0.00013918443164482046, + "loss": 0.7057, + "step": 733 + }, + { + "epoch": 0.3914666666666667, + "grad_norm": 0.43128822409554385, + "learning_rate": 0.000139025387282305, + "loss": 0.6595, + "step": 734 + }, + { + "epoch": 0.392, + "grad_norm": 0.4192929353710216, + "learning_rate": 0.00013886622638405952, + "loss": 0.7343, + "step": 735 + }, + { + "epoch": 0.39253333333333335, + "grad_norm": 0.413275458081682, + "learning_rate": 0.0001387069494253626, + "loss": 0.6916, + "step": 736 + }, + { + "epoch": 0.3930666666666667, + "grad_norm": 0.3690774046989975, + "learning_rate": 0.0001385475568818394, + "loss": 0.7141, + "step": 737 + }, + { + "epoch": 0.3936, + "grad_norm": 0.4617461073357319, + "learning_rate": 0.00013838804922946027, + "loss": 0.7592, + "step": 738 + }, + { + "epoch": 0.39413333333333334, + "grad_norm": 0.4893411296699691, + "learning_rate": 0.00013822842694453924, + "loss": 0.7884, + "step": 739 + }, + { + "epoch": 0.39466666666666667, + "grad_norm": 0.48910757120341053, + "learning_rate": 0.0001380686905037327, + "loss": 0.815, + "step": 740 + }, + { + "epoch": 0.3952, + "grad_norm": 0.49374593997669053, + "learning_rate": 0.00013790884038403795, + "loss": 0.7179, + "step": 741 + }, + { + "epoch": 0.3957333333333333, + "grad_norm": 0.44571095106319375, + "learning_rate": 0.00013774887706279165, + "loss": 0.7098, + "step": 742 + }, + { + "epoch": 0.39626666666666666, + "grad_norm": 0.41530745351569454, + "learning_rate": 0.0001375888010176686, + "loss": 0.7024, + "step": 743 + }, + { + "epoch": 0.3968, + "grad_norm": 0.5525773689172044, + "learning_rate": 0.00013742861272668012, + "loss": 0.8158, + "step": 744 + }, + { + "epoch": 0.3973333333333333, + "grad_norm": 0.5170291279150868, + "learning_rate": 0.00013726831266817278, + "loss": 0.7695, + "step": 745 + }, + { + "epoch": 0.39786666666666665, + "grad_norm": 0.45091419541150624, + "learning_rate": 0.00013710790132082692, + "loss": 0.7352, + "step": 746 + }, + { + "epoch": 0.3984, + "grad_norm": 0.4879611538440513, + "learning_rate": 0.00013694737916365517, + "loss": 0.7401, + "step": 747 + }, + { + "epoch": 0.3989333333333333, + "grad_norm": 0.377411463980635, + "learning_rate": 0.00013678674667600102, + "loss": 0.6367, + "step": 748 + }, + { + "epoch": 0.3994666666666667, + "grad_norm": 0.39877682093200306, + "learning_rate": 0.00013662600433753745, + "loss": 0.6868, + "step": 749 + }, + { + "epoch": 0.4, + "grad_norm": 0.5110745295985535, + "learning_rate": 0.00013646515262826552, + "loss": 0.8137, + "step": 750 + }, + { + "epoch": 0.40053333333333335, + "grad_norm": 0.557627149290855, + "learning_rate": 0.00013630419202851284, + "loss": 0.7981, + "step": 751 + }, + { + "epoch": 0.4010666666666667, + "grad_norm": 0.4785536029438291, + "learning_rate": 0.00013614312301893223, + "loss": 0.7433, + "step": 752 + }, + { + "epoch": 0.4016, + "grad_norm": 0.3950924824169307, + "learning_rate": 0.0001359819460805001, + "loss": 0.6917, + "step": 753 + }, + { + "epoch": 0.40213333333333334, + "grad_norm": 0.38368541292649505, + "learning_rate": 0.00013582066169451535, + "loss": 0.7066, + "step": 754 + }, + { + "epoch": 0.4026666666666667, + "grad_norm": 0.43192553227021735, + "learning_rate": 0.0001356592703425976, + "loss": 0.7538, + "step": 755 + }, + { + "epoch": 0.4032, + "grad_norm": 0.4508907660187954, + "learning_rate": 0.0001354977725066859, + "loss": 0.7094, + "step": 756 + }, + { + "epoch": 0.40373333333333333, + "grad_norm": 0.45563511717824373, + "learning_rate": 0.00013533616866903735, + "loss": 0.7877, + "step": 757 + }, + { + "epoch": 0.40426666666666666, + "grad_norm": 0.4151235291365336, + "learning_rate": 0.0001351744593122255, + "loss": 0.6721, + "step": 758 + }, + { + "epoch": 0.4048, + "grad_norm": 0.4748985553018345, + "learning_rate": 0.00013501264491913906, + "loss": 0.7979, + "step": 759 + }, + { + "epoch": 0.4053333333333333, + "grad_norm": 0.4291758260327782, + "learning_rate": 0.00013485072597298038, + "loss": 0.6887, + "step": 760 + }, + { + "epoch": 0.40586666666666665, + "grad_norm": 0.4853152567669402, + "learning_rate": 0.00013468870295726398, + "loss": 0.667, + "step": 761 + }, + { + "epoch": 0.4064, + "grad_norm": 0.37695781697999237, + "learning_rate": 0.0001345265763558152, + "loss": 0.679, + "step": 762 + }, + { + "epoch": 0.4069333333333333, + "grad_norm": 0.4145462694391789, + "learning_rate": 0.00013436434665276865, + "loss": 0.7507, + "step": 763 + }, + { + "epoch": 0.40746666666666664, + "grad_norm": 0.4762167954960014, + "learning_rate": 0.00013420201433256689, + "loss": 0.7404, + "step": 764 + }, + { + "epoch": 0.408, + "grad_norm": 0.5049455761457096, + "learning_rate": 0.00013403957987995882, + "loss": 0.7763, + "step": 765 + }, + { + "epoch": 0.40853333333333336, + "grad_norm": 0.4188665914477433, + "learning_rate": 0.00013387704377999842, + "loss": 0.7405, + "step": 766 + }, + { + "epoch": 0.4090666666666667, + "grad_norm": 0.412248135828123, + "learning_rate": 0.00013371440651804313, + "loss": 0.7389, + "step": 767 + }, + { + "epoch": 0.4096, + "grad_norm": 0.48039414957295357, + "learning_rate": 0.0001335516685797525, + "loss": 0.7428, + "step": 768 + }, + { + "epoch": 0.41013333333333335, + "grad_norm": 0.3624702058083208, + "learning_rate": 0.00013338883045108674, + "loss": 0.7249, + "step": 769 + }, + { + "epoch": 0.4106666666666667, + "grad_norm": 0.5636795345502021, + "learning_rate": 0.00013322589261830517, + "loss": 0.8385, + "step": 770 + }, + { + "epoch": 0.4112, + "grad_norm": 0.4607953135742836, + "learning_rate": 0.00013306285556796495, + "loss": 0.756, + "step": 771 + }, + { + "epoch": 0.41173333333333334, + "grad_norm": 0.376729193046304, + "learning_rate": 0.0001328997197869194, + "loss": 0.6614, + "step": 772 + }, + { + "epoch": 0.41226666666666667, + "grad_norm": 0.3991667630290107, + "learning_rate": 0.0001327364857623168, + "loss": 0.6909, + "step": 773 + }, + { + "epoch": 0.4128, + "grad_norm": 0.47573220822123935, + "learning_rate": 0.00013257315398159864, + "loss": 0.7151, + "step": 774 + }, + { + "epoch": 0.41333333333333333, + "grad_norm": 0.4588884817829327, + "learning_rate": 0.00013240972493249847, + "loss": 0.748, + "step": 775 + }, + { + "epoch": 0.41386666666666666, + "grad_norm": 0.5163011613357176, + "learning_rate": 0.0001322461991030402, + "loss": 0.7619, + "step": 776 + }, + { + "epoch": 0.4144, + "grad_norm": 0.5202514115332239, + "learning_rate": 0.00013208257698153677, + "loss": 0.7711, + "step": 777 + }, + { + "epoch": 0.4149333333333333, + "grad_norm": 0.42017813629927897, + "learning_rate": 0.00013191885905658872, + "loss": 0.6914, + "step": 778 + }, + { + "epoch": 0.41546666666666665, + "grad_norm": 0.49298713784841597, + "learning_rate": 0.0001317550458170826, + "loss": 0.7873, + "step": 779 + }, + { + "epoch": 0.416, + "grad_norm": 0.4211312472740005, + "learning_rate": 0.00013159113775218964, + "loss": 0.6849, + "step": 780 + }, + { + "epoch": 0.4165333333333333, + "grad_norm": 0.46257868165210364, + "learning_rate": 0.00013142713535136414, + "loss": 0.7719, + "step": 781 + }, + { + "epoch": 0.41706666666666664, + "grad_norm": 0.44366844894252566, + "learning_rate": 0.00013126303910434214, + "loss": 0.7104, + "step": 782 + }, + { + "epoch": 0.4176, + "grad_norm": 0.45552496956208255, + "learning_rate": 0.00013109884950114007, + "loss": 0.8097, + "step": 783 + }, + { + "epoch": 0.41813333333333336, + "grad_norm": 0.5078343665730175, + "learning_rate": 0.00013093456703205288, + "loss": 0.741, + "step": 784 + }, + { + "epoch": 0.4186666666666667, + "grad_norm": 0.4461778897832065, + "learning_rate": 0.00013077019218765305, + "loss": 0.7362, + "step": 785 + }, + { + "epoch": 0.4192, + "grad_norm": 0.5357241749772622, + "learning_rate": 0.00013060572545878875, + "loss": 0.7631, + "step": 786 + }, + { + "epoch": 0.41973333333333335, + "grad_norm": 0.46927262105236117, + "learning_rate": 0.0001304411673365826, + "loss": 0.799, + "step": 787 + }, + { + "epoch": 0.4202666666666667, + "grad_norm": 0.4328759143850609, + "learning_rate": 0.0001302765183124302, + "loss": 0.71, + "step": 788 + }, + { + "epoch": 0.4208, + "grad_norm": 0.5007509397077923, + "learning_rate": 0.00013011177887799845, + "loss": 0.7574, + "step": 789 + }, + { + "epoch": 0.42133333333333334, + "grad_norm": 0.4416348515451264, + "learning_rate": 0.00012994694952522435, + "loss": 0.7459, + "step": 790 + }, + { + "epoch": 0.42186666666666667, + "grad_norm": 0.4698835680868725, + "learning_rate": 0.00012978203074631334, + "loss": 0.7342, + "step": 791 + }, + { + "epoch": 0.4224, + "grad_norm": 0.50252881148867, + "learning_rate": 0.00012961702303373795, + "loss": 0.7906, + "step": 792 + }, + { + "epoch": 0.42293333333333333, + "grad_norm": 0.47604069277245004, + "learning_rate": 0.00012945192688023624, + "loss": 0.7896, + "step": 793 + }, + { + "epoch": 0.42346666666666666, + "grad_norm": 0.5366904106513515, + "learning_rate": 0.0001292867427788104, + "loss": 0.7475, + "step": 794 + }, + { + "epoch": 0.424, + "grad_norm": 0.4977277201391843, + "learning_rate": 0.00012912147122272523, + "loss": 0.74, + "step": 795 + }, + { + "epoch": 0.4245333333333333, + "grad_norm": 0.5004320754854158, + "learning_rate": 0.00012895611270550666, + "loss": 0.7675, + "step": 796 + }, + { + "epoch": 0.42506666666666665, + "grad_norm": 0.3497844453998508, + "learning_rate": 0.0001287906677209403, + "loss": 0.6345, + "step": 797 + }, + { + "epoch": 0.4256, + "grad_norm": 0.43523023207752554, + "learning_rate": 0.00012862513676307008, + "loss": 0.7434, + "step": 798 + }, + { + "epoch": 0.4261333333333333, + "grad_norm": 0.5418558846172636, + "learning_rate": 0.0001284595203261965, + "loss": 0.8334, + "step": 799 + }, + { + "epoch": 0.4266666666666667, + "grad_norm": 0.47654734118721276, + "learning_rate": 0.00012829381890487536, + "loss": 0.7781, + "step": 800 + }, + { + "epoch": 0.4272, + "grad_norm": 0.3870351813154318, + "learning_rate": 0.00012812803299391628, + "loss": 0.674, + "step": 801 + }, + { + "epoch": 0.42773333333333335, + "grad_norm": 0.4461402164522316, + "learning_rate": 0.00012796216308838117, + "loss": 0.7324, + "step": 802 + }, + { + "epoch": 0.4282666666666667, + "grad_norm": 0.41058692356759663, + "learning_rate": 0.00012779620968358273, + "loss": 0.7587, + "step": 803 + }, + { + "epoch": 0.4288, + "grad_norm": 0.4618492446489887, + "learning_rate": 0.00012763017327508305, + "loss": 0.7244, + "step": 804 + }, + { + "epoch": 0.42933333333333334, + "grad_norm": 0.36974323409460746, + "learning_rate": 0.00012746405435869198, + "loss": 0.6838, + "step": 805 + }, + { + "epoch": 0.4298666666666667, + "grad_norm": 0.45087404920245155, + "learning_rate": 0.00012729785343046588, + "loss": 0.7834, + "step": 806 + }, + { + "epoch": 0.4304, + "grad_norm": 0.4579058796264781, + "learning_rate": 0.0001271315709867059, + "loss": 0.7209, + "step": 807 + }, + { + "epoch": 0.43093333333333333, + "grad_norm": 0.47498833143544217, + "learning_rate": 0.00012696520752395672, + "loss": 0.8021, + "step": 808 + }, + { + "epoch": 0.43146666666666667, + "grad_norm": 0.45453279197662866, + "learning_rate": 0.00012679876353900482, + "loss": 0.7371, + "step": 809 + }, + { + "epoch": 0.432, + "grad_norm": 0.47827951125770113, + "learning_rate": 0.00012663223952887723, + "loss": 0.7593, + "step": 810 + }, + { + "epoch": 0.4325333333333333, + "grad_norm": 0.9472114532586452, + "learning_rate": 0.00012646563599083996, + "loss": 0.7698, + "step": 811 + }, + { + "epoch": 0.43306666666666666, + "grad_norm": 0.4051855162278431, + "learning_rate": 0.00012629895342239643, + "loss": 0.7284, + "step": 812 + }, + { + "epoch": 0.4336, + "grad_norm": 0.4255203306929849, + "learning_rate": 0.00012613219232128608, + "loss": 0.6836, + "step": 813 + }, + { + "epoch": 0.4341333333333333, + "grad_norm": 0.444687852998819, + "learning_rate": 0.00012596535318548289, + "loss": 0.748, + "step": 814 + }, + { + "epoch": 0.43466666666666665, + "grad_norm": 0.41768292366931153, + "learning_rate": 0.0001257984365131938, + "loss": 0.7089, + "step": 815 + }, + { + "epoch": 0.4352, + "grad_norm": 0.5861615055236309, + "learning_rate": 0.00012563144280285741, + "loss": 0.8613, + "step": 816 + }, + { + "epoch": 0.4357333333333333, + "grad_norm": 0.44600652403455104, + "learning_rate": 0.00012546437255314222, + "loss": 0.7116, + "step": 817 + }, + { + "epoch": 0.4362666666666667, + "grad_norm": 0.43336271970583384, + "learning_rate": 0.0001252972262629454, + "loss": 0.6975, + "step": 818 + }, + { + "epoch": 0.4368, + "grad_norm": 0.48013857861997417, + "learning_rate": 0.00012513000443139112, + "loss": 0.7971, + "step": 819 + }, + { + "epoch": 0.43733333333333335, + "grad_norm": 0.5034434513967145, + "learning_rate": 0.00012496270755782914, + "loss": 0.7113, + "step": 820 + }, + { + "epoch": 0.4378666666666667, + "grad_norm": 0.3948595412837032, + "learning_rate": 0.00012479533614183334, + "loss": 0.6594, + "step": 821 + }, + { + "epoch": 0.4384, + "grad_norm": 0.4640635824991028, + "learning_rate": 0.00012462789068320017, + "loss": 0.7221, + "step": 822 + }, + { + "epoch": 0.43893333333333334, + "grad_norm": 0.470648685182896, + "learning_rate": 0.00012446037168194714, + "loss": 0.7356, + "step": 823 + }, + { + "epoch": 0.43946666666666667, + "grad_norm": 0.46249832108832006, + "learning_rate": 0.00012429277963831148, + "loss": 0.7244, + "step": 824 + }, + { + "epoch": 0.44, + "grad_norm": 0.6054596781772245, + "learning_rate": 0.00012412511505274844, + "loss": 0.8706, + "step": 825 + }, + { + "epoch": 0.44053333333333333, + "grad_norm": 0.46149602091097064, + "learning_rate": 0.00012395737842592995, + "loss": 0.7706, + "step": 826 + }, + { + "epoch": 0.44106666666666666, + "grad_norm": 0.473517667257064, + "learning_rate": 0.000123789570258743, + "loss": 0.8551, + "step": 827 + }, + { + "epoch": 0.4416, + "grad_norm": 0.40140203364399346, + "learning_rate": 0.00012362169105228826, + "loss": 0.647, + "step": 828 + }, + { + "epoch": 0.4421333333333333, + "grad_norm": 0.37806047932029274, + "learning_rate": 0.00012345374130787854, + "loss": 0.6263, + "step": 829 + }, + { + "epoch": 0.44266666666666665, + "grad_norm": 0.43531713509041514, + "learning_rate": 0.00012328572152703725, + "loss": 0.7189, + "step": 830 + }, + { + "epoch": 0.4432, + "grad_norm": 0.5729951222509617, + "learning_rate": 0.000123117632211497, + "loss": 0.7874, + "step": 831 + }, + { + "epoch": 0.4437333333333333, + "grad_norm": 0.48094925653615717, + "learning_rate": 0.00012294947386319794, + "loss": 0.7358, + "step": 832 + }, + { + "epoch": 0.44426666666666664, + "grad_norm": 0.4160815675654382, + "learning_rate": 0.0001227812469842864, + "loss": 0.7087, + "step": 833 + }, + { + "epoch": 0.4448, + "grad_norm": 0.4012351762963187, + "learning_rate": 0.00012261295207711346, + "loss": 0.6723, + "step": 834 + }, + { + "epoch": 0.44533333333333336, + "grad_norm": 0.5005619939623338, + "learning_rate": 0.00012244458964423327, + "loss": 0.7039, + "step": 835 + }, + { + "epoch": 0.4458666666666667, + "grad_norm": 0.4289880721438666, + "learning_rate": 0.00012227616018840154, + "loss": 0.7353, + "step": 836 + }, + { + "epoch": 0.4464, + "grad_norm": 0.4623424281862036, + "learning_rate": 0.0001221076642125742, + "loss": 0.7243, + "step": 837 + }, + { + "epoch": 0.44693333333333335, + "grad_norm": 0.5238256798151496, + "learning_rate": 0.00012193910221990581, + "loss": 0.6481, + "step": 838 + }, + { + "epoch": 0.4474666666666667, + "grad_norm": 0.4920251365004284, + "learning_rate": 0.00012177047471374807, + "loss": 0.7208, + "step": 839 + }, + { + "epoch": 0.448, + "grad_norm": 0.47212264759512007, + "learning_rate": 0.00012160178219764837, + "loss": 0.7468, + "step": 840 + }, + { + "epoch": 0.44853333333333334, + "grad_norm": 0.4570252945087352, + "learning_rate": 0.0001214330251753481, + "loss": 0.6692, + "step": 841 + }, + { + "epoch": 0.44906666666666667, + "grad_norm": 0.4544825791391424, + "learning_rate": 0.00012126420415078132, + "loss": 0.7347, + "step": 842 + }, + { + "epoch": 0.4496, + "grad_norm": 0.3866722439233, + "learning_rate": 0.00012109531962807332, + "loss": 0.6801, + "step": 843 + }, + { + "epoch": 0.45013333333333333, + "grad_norm": 0.434025092111616, + "learning_rate": 0.00012092637211153885, + "loss": 0.6937, + "step": 844 + }, + { + "epoch": 0.45066666666666666, + "grad_norm": 0.4873335745190174, + "learning_rate": 0.0001207573621056809, + "loss": 0.7189, + "step": 845 + }, + { + "epoch": 0.4512, + "grad_norm": 0.4059140162354565, + "learning_rate": 0.00012058829011518896, + "loss": 0.7184, + "step": 846 + }, + { + "epoch": 0.4517333333333333, + "grad_norm": 0.5122139601061256, + "learning_rate": 0.00012041915664493761, + "loss": 0.7952, + "step": 847 + }, + { + "epoch": 0.45226666666666665, + "grad_norm": 0.4153275678011465, + "learning_rate": 0.00012024996219998517, + "loss": 0.7255, + "step": 848 + }, + { + "epoch": 0.4528, + "grad_norm": 0.47828235983973083, + "learning_rate": 0.00012008070728557186, + "loss": 0.7161, + "step": 849 + }, + { + "epoch": 0.4533333333333333, + "grad_norm": 0.3917227166614342, + "learning_rate": 0.00011991139240711857, + "loss": 0.7079, + "step": 850 + }, + { + "epoch": 0.45386666666666664, + "grad_norm": 0.5210383767114197, + "learning_rate": 0.00011974201807022525, + "loss": 0.7076, + "step": 851 + }, + { + "epoch": 0.4544, + "grad_norm": 0.42984081870006136, + "learning_rate": 0.00011957258478066931, + "loss": 0.7429, + "step": 852 + }, + { + "epoch": 0.45493333333333336, + "grad_norm": 0.3743061187411529, + "learning_rate": 0.00011940309304440433, + "loss": 0.6726, + "step": 853 + }, + { + "epoch": 0.4554666666666667, + "grad_norm": 0.4448355922229668, + "learning_rate": 0.00011923354336755835, + "loss": 0.6941, + "step": 854 + }, + { + "epoch": 0.456, + "grad_norm": 0.4721939581213205, + "learning_rate": 0.00011906393625643244, + "loss": 0.6726, + "step": 855 + }, + { + "epoch": 0.45653333333333335, + "grad_norm": 0.4422985194716223, + "learning_rate": 0.00011889427221749916, + "loss": 0.7306, + "step": 856 + }, + { + "epoch": 0.4570666666666667, + "grad_norm": 0.42105536963702145, + "learning_rate": 0.00011872455175740112, + "loss": 0.7131, + "step": 857 + }, + { + "epoch": 0.4576, + "grad_norm": 0.3659156762371082, + "learning_rate": 0.00011855477538294935, + "loss": 0.7031, + "step": 858 + }, + { + "epoch": 0.45813333333333334, + "grad_norm": 0.4665359443232002, + "learning_rate": 0.00011838494360112185, + "loss": 0.7055, + "step": 859 + }, + { + "epoch": 0.45866666666666667, + "grad_norm": 0.45621341040450175, + "learning_rate": 0.00011821505691906216, + "loss": 0.7362, + "step": 860 + }, + { + "epoch": 0.4592, + "grad_norm": 0.436441349809642, + "learning_rate": 0.00011804511584407763, + "loss": 0.7538, + "step": 861 + }, + { + "epoch": 0.4597333333333333, + "grad_norm": 0.4508971403898046, + "learning_rate": 0.00011787512088363817, + "loss": 0.7453, + "step": 862 + }, + { + "epoch": 0.46026666666666666, + "grad_norm": 0.4269259836628523, + "learning_rate": 0.00011770507254537453, + "loss": 0.7214, + "step": 863 + }, + { + "epoch": 0.4608, + "grad_norm": 0.4459383523611653, + "learning_rate": 0.00011753497133707679, + "loss": 0.6567, + "step": 864 + }, + { + "epoch": 0.4613333333333333, + "grad_norm": 0.4465978200420246, + "learning_rate": 0.00011736481776669306, + "loss": 0.7387, + "step": 865 + }, + { + "epoch": 0.46186666666666665, + "grad_norm": 0.42088728726217783, + "learning_rate": 0.00011719461234232764, + "loss": 0.6753, + "step": 866 + }, + { + "epoch": 0.4624, + "grad_norm": 0.4800131441916319, + "learning_rate": 0.00011702435557223987, + "loss": 0.7005, + "step": 867 + }, + { + "epoch": 0.4629333333333333, + "grad_norm": 0.39495859931250377, + "learning_rate": 0.00011685404796484225, + "loss": 0.684, + "step": 868 + }, + { + "epoch": 0.4634666666666667, + "grad_norm": 0.38666835949944633, + "learning_rate": 0.00011668369002869912, + "loss": 0.6587, + "step": 869 + }, + { + "epoch": 0.464, + "grad_norm": 0.3785771871608222, + "learning_rate": 0.00011651328227252517, + "loss": 0.7127, + "step": 870 + }, + { + "epoch": 0.46453333333333335, + "grad_norm": 0.4997168522784999, + "learning_rate": 0.00011634282520518383, + "loss": 0.7406, + "step": 871 + }, + { + "epoch": 0.4650666666666667, + "grad_norm": 0.4552535133442266, + "learning_rate": 0.00011617231933568578, + "loss": 0.7027, + "step": 872 + }, + { + "epoch": 0.4656, + "grad_norm": 0.4281849731866267, + "learning_rate": 0.00011600176517318741, + "loss": 0.7355, + "step": 873 + }, + { + "epoch": 0.46613333333333334, + "grad_norm": 0.41856303836706704, + "learning_rate": 0.00011583116322698935, + "loss": 0.7173, + "step": 874 + }, + { + "epoch": 0.4666666666666667, + "grad_norm": 0.41641475823433494, + "learning_rate": 0.00011566051400653486, + "loss": 0.6546, + "step": 875 + }, + { + "epoch": 0.4672, + "grad_norm": 0.41990224100948037, + "learning_rate": 0.00011548981802140848, + "loss": 0.732, + "step": 876 + }, + { + "epoch": 0.46773333333333333, + "grad_norm": 0.4568532612356803, + "learning_rate": 0.00011531907578133429, + "loss": 0.7081, + "step": 877 + }, + { + "epoch": 0.46826666666666666, + "grad_norm": 0.49956252479186747, + "learning_rate": 0.00011514828779617459, + "loss": 0.7362, + "step": 878 + }, + { + "epoch": 0.4688, + "grad_norm": 0.5600248883179616, + "learning_rate": 0.00011497745457592816, + "loss": 0.7427, + "step": 879 + }, + { + "epoch": 0.4693333333333333, + "grad_norm": 0.39916392653079935, + "learning_rate": 0.00011480657663072896, + "loss": 0.67, + "step": 880 + }, + { + "epoch": 0.46986666666666665, + "grad_norm": 0.4288357738907657, + "learning_rate": 0.00011463565447084445, + "loss": 0.725, + "step": 881 + }, + { + "epoch": 0.4704, + "grad_norm": 0.45418123549236744, + "learning_rate": 0.00011446468860667421, + "loss": 0.649, + "step": 882 + }, + { + "epoch": 0.4709333333333333, + "grad_norm": 0.4903675423822203, + "learning_rate": 0.00011429367954874819, + "loss": 0.7349, + "step": 883 + }, + { + "epoch": 0.47146666666666665, + "grad_norm": 0.6131789191291735, + "learning_rate": 0.0001141226278077254, + "loss": 0.7716, + "step": 884 + }, + { + "epoch": 0.472, + "grad_norm": 0.4309385945084161, + "learning_rate": 0.00011395153389439233, + "loss": 0.7158, + "step": 885 + }, + { + "epoch": 0.47253333333333336, + "grad_norm": 0.435760732981933, + "learning_rate": 0.00011378039831966134, + "loss": 0.7038, + "step": 886 + }, + { + "epoch": 0.4730666666666667, + "grad_norm": 0.5564258754454618, + "learning_rate": 0.00011360922159456928, + "loss": 0.6625, + "step": 887 + }, + { + "epoch": 0.4736, + "grad_norm": 0.44328423885398677, + "learning_rate": 0.00011343800423027582, + "loss": 0.6896, + "step": 888 + }, + { + "epoch": 0.47413333333333335, + "grad_norm": 0.48030019973891724, + "learning_rate": 0.00011326674673806195, + "loss": 0.7395, + "step": 889 + }, + { + "epoch": 0.4746666666666667, + "grad_norm": 0.5051613444338655, + "learning_rate": 0.00011309544962932862, + "loss": 0.8078, + "step": 890 + }, + { + "epoch": 0.4752, + "grad_norm": 0.4573909168100557, + "learning_rate": 0.0001129241134155949, + "loss": 0.7745, + "step": 891 + }, + { + "epoch": 0.47573333333333334, + "grad_norm": 0.4401736616959906, + "learning_rate": 0.00011275273860849684, + "loss": 0.7356, + "step": 892 + }, + { + "epoch": 0.47626666666666667, + "grad_norm": 0.5004522458958894, + "learning_rate": 0.00011258132571978555, + "loss": 0.7012, + "step": 893 + }, + { + "epoch": 0.4768, + "grad_norm": 0.5228492610992537, + "learning_rate": 0.00011240987526132594, + "loss": 0.7444, + "step": 894 + }, + { + "epoch": 0.47733333333333333, + "grad_norm": 0.3984380461729751, + "learning_rate": 0.00011223838774509514, + "loss": 0.7443, + "step": 895 + }, + { + "epoch": 0.47786666666666666, + "grad_norm": 0.4965680892197268, + "learning_rate": 0.00011206686368318086, + "loss": 0.6394, + "step": 896 + }, + { + "epoch": 0.4784, + "grad_norm": 0.4179899100906879, + "learning_rate": 0.00011189530358778005, + "loss": 0.6331, + "step": 897 + }, + { + "epoch": 0.4789333333333333, + "grad_norm": 0.4485835346576027, + "learning_rate": 0.00011172370797119712, + "loss": 0.7241, + "step": 898 + }, + { + "epoch": 0.47946666666666665, + "grad_norm": 0.44535951940755986, + "learning_rate": 0.00011155207734584263, + "loss": 0.7378, + "step": 899 + }, + { + "epoch": 0.48, + "grad_norm": 0.41528059437441167, + "learning_rate": 0.00011138041222423177, + "loss": 0.7482, + "step": 900 + }, + { + "epoch": 0.4805333333333333, + "grad_norm": 0.4812608069912455, + "learning_rate": 0.00011120871311898254, + "loss": 0.7707, + "step": 901 + }, + { + "epoch": 0.48106666666666664, + "grad_norm": 0.6298099451387288, + "learning_rate": 0.0001110369805428146, + "loss": 0.7764, + "step": 902 + }, + { + "epoch": 0.4816, + "grad_norm": 0.49564263943530495, + "learning_rate": 0.00011086521500854745, + "loss": 0.714, + "step": 903 + }, + { + "epoch": 0.48213333333333336, + "grad_norm": 0.46763160567626094, + "learning_rate": 0.0001106934170290991, + "loss": 0.7506, + "step": 904 + }, + { + "epoch": 0.4826666666666667, + "grad_norm": 0.4166158646410712, + "learning_rate": 0.00011052158711748434, + "loss": 0.6823, + "step": 905 + }, + { + "epoch": 0.4832, + "grad_norm": 0.4752311001226654, + "learning_rate": 0.00011034972578681338, + "loss": 0.7637, + "step": 906 + }, + { + "epoch": 0.48373333333333335, + "grad_norm": 0.5010835968981501, + "learning_rate": 0.00011017783355029026, + "loss": 0.7361, + "step": 907 + }, + { + "epoch": 0.4842666666666667, + "grad_norm": 0.39733938852446354, + "learning_rate": 0.00011000591092121127, + "loss": 0.7173, + "step": 908 + }, + { + "epoch": 0.4848, + "grad_norm": 0.39211339428632275, + "learning_rate": 0.00010983395841296348, + "loss": 0.7257, + "step": 909 + }, + { + "epoch": 0.48533333333333334, + "grad_norm": 0.5105825628365223, + "learning_rate": 0.0001096619765390232, + "loss": 0.8402, + "step": 910 + }, + { + "epoch": 0.48586666666666667, + "grad_norm": 0.5193039573985059, + "learning_rate": 0.00010948996581295436, + "loss": 0.7425, + "step": 911 + }, + { + "epoch": 0.4864, + "grad_norm": 0.3862603973124171, + "learning_rate": 0.00010931792674840718, + "loss": 0.709, + "step": 912 + }, + { + "epoch": 0.48693333333333333, + "grad_norm": 0.45300153174614916, + "learning_rate": 0.00010914585985911632, + "loss": 0.6598, + "step": 913 + }, + { + "epoch": 0.48746666666666666, + "grad_norm": 0.44820320157660765, + "learning_rate": 0.00010897376565889971, + "loss": 0.6945, + "step": 914 + }, + { + "epoch": 0.488, + "grad_norm": 0.3997412603202052, + "learning_rate": 0.00010880164466165674, + "loss": 0.7067, + "step": 915 + }, + { + "epoch": 0.4885333333333333, + "grad_norm": 0.40947983018857464, + "learning_rate": 0.00010862949738136681, + "loss": 0.6708, + "step": 916 + }, + { + "epoch": 0.48906666666666665, + "grad_norm": 0.4754101993177675, + "learning_rate": 0.00010845732433208779, + "loss": 0.6983, + "step": 917 + }, + { + "epoch": 0.4896, + "grad_norm": 0.44950607717307856, + "learning_rate": 0.00010828512602795462, + "loss": 0.8272, + "step": 918 + }, + { + "epoch": 0.4901333333333333, + "grad_norm": 0.38602995140251695, + "learning_rate": 0.00010811290298317755, + "loss": 0.6056, + "step": 919 + }, + { + "epoch": 0.49066666666666664, + "grad_norm": 0.4393206083327871, + "learning_rate": 0.00010794065571204072, + "loss": 0.6795, + "step": 920 + }, + { + "epoch": 0.4912, + "grad_norm": 0.47478309312233763, + "learning_rate": 0.00010776838472890065, + "loss": 0.76, + "step": 921 + }, + { + "epoch": 0.49173333333333336, + "grad_norm": 0.47034182707880096, + "learning_rate": 0.00010759609054818458, + "loss": 0.7437, + "step": 922 + }, + { + "epoch": 0.4922666666666667, + "grad_norm": 0.48083147996893855, + "learning_rate": 0.00010742377368438914, + "loss": 0.6545, + "step": 923 + }, + { + "epoch": 0.4928, + "grad_norm": 0.4336982554683627, + "learning_rate": 0.00010725143465207867, + "loss": 0.6859, + "step": 924 + }, + { + "epoch": 0.49333333333333335, + "grad_norm": 0.4207220668613276, + "learning_rate": 0.00010707907396588361, + "loss": 0.6818, + "step": 925 + }, + { + "epoch": 0.4938666666666667, + "grad_norm": 0.3748167742989246, + "learning_rate": 0.0001069066921404992, + "loss": 0.6447, + "step": 926 + }, + { + "epoch": 0.4944, + "grad_norm": 0.5050791445432369, + "learning_rate": 0.00010673428969068364, + "loss": 0.741, + "step": 927 + }, + { + "epoch": 0.49493333333333334, + "grad_norm": 0.47384529502461137, + "learning_rate": 0.00010656186713125689, + "loss": 0.7561, + "step": 928 + }, + { + "epoch": 0.49546666666666667, + "grad_norm": 0.36608612365026283, + "learning_rate": 0.0001063894249770989, + "loss": 0.6721, + "step": 929 + }, + { + "epoch": 0.496, + "grad_norm": 0.4468455436512149, + "learning_rate": 0.00010621696374314807, + "loss": 0.6718, + "step": 930 + }, + { + "epoch": 0.4965333333333333, + "grad_norm": 0.43522896013997575, + "learning_rate": 0.00010604448394439983, + "loss": 0.7471, + "step": 931 + }, + { + "epoch": 0.49706666666666666, + "grad_norm": 0.4320446808545491, + "learning_rate": 0.00010587198609590505, + "loss": 0.7346, + "step": 932 + }, + { + "epoch": 0.4976, + "grad_norm": 0.39291534306609155, + "learning_rate": 0.00010569947071276847, + "loss": 0.6664, + "step": 933 + }, + { + "epoch": 0.4981333333333333, + "grad_norm": 0.40501450732134475, + "learning_rate": 0.00010552693831014726, + "loss": 0.6477, + "step": 934 + }, + { + "epoch": 0.49866666666666665, + "grad_norm": 0.47059878387198506, + "learning_rate": 0.0001053543894032493, + "loss": 0.7557, + "step": 935 + }, + { + "epoch": 0.4992, + "grad_norm": 0.4455770934005571, + "learning_rate": 0.00010518182450733186, + "loss": 0.6767, + "step": 936 + }, + { + "epoch": 0.4997333333333333, + "grad_norm": 0.4939088817221293, + "learning_rate": 0.00010500924413769988, + "loss": 0.6729, + "step": 937 + }, + { + "epoch": 0.5002666666666666, + "grad_norm": 0.4964218357696382, + "learning_rate": 0.00010483664880970457, + "loss": 0.7617, + "step": 938 + }, + { + "epoch": 0.5008, + "grad_norm": 0.4608211700887765, + "learning_rate": 0.00010466403903874176, + "loss": 0.7729, + "step": 939 + }, + { + "epoch": 0.5013333333333333, + "grad_norm": 0.42795111113050455, + "learning_rate": 0.00010449141534025045, + "loss": 0.7197, + "step": 940 + }, + { + "epoch": 0.5018666666666667, + "grad_norm": 0.5007745897689231, + "learning_rate": 0.00010431877822971117, + "loss": 0.7102, + "step": 941 + }, + { + "epoch": 0.5024, + "grad_norm": 0.6999720720846241, + "learning_rate": 0.00010414612822264455, + "loss": 0.8264, + "step": 942 + }, + { + "epoch": 0.5029333333333333, + "grad_norm": 0.46304362298689367, + "learning_rate": 0.00010397346583460971, + "loss": 0.7887, + "step": 943 + }, + { + "epoch": 0.5034666666666666, + "grad_norm": 0.3985616879181766, + "learning_rate": 0.0001038007915812028, + "loss": 0.7189, + "step": 944 + }, + { + "epoch": 0.504, + "grad_norm": 0.45627600783763306, + "learning_rate": 0.00010362810597805526, + "loss": 0.6619, + "step": 945 + }, + { + "epoch": 0.5045333333333333, + "grad_norm": 0.356584407952678, + "learning_rate": 0.0001034554095408326, + "loss": 0.6606, + "step": 946 + }, + { + "epoch": 0.5050666666666667, + "grad_norm": 0.43577340520676033, + "learning_rate": 0.00010328270278523256, + "loss": 0.7142, + "step": 947 + }, + { + "epoch": 0.5056, + "grad_norm": 0.4733156893422998, + "learning_rate": 0.0001031099862269837, + "loss": 0.728, + "step": 948 + }, + { + "epoch": 0.5061333333333333, + "grad_norm": 0.42808444154564773, + "learning_rate": 0.00010293726038184393, + "loss": 0.7154, + "step": 949 + }, + { + "epoch": 0.5066666666666667, + "grad_norm": 0.5007068163829674, + "learning_rate": 0.00010276452576559879, + "loss": 0.7969, + "step": 950 + }, + { + "epoch": 0.5072, + "grad_norm": 0.4130841971473934, + "learning_rate": 0.00010259178289406011, + "loss": 0.6141, + "step": 951 + }, + { + "epoch": 0.5077333333333334, + "grad_norm": 0.4353374083452603, + "learning_rate": 0.00010241903228306431, + "loss": 0.7204, + "step": 952 + }, + { + "epoch": 0.5082666666666666, + "grad_norm": 0.4366447906638421, + "learning_rate": 0.0001022462744484709, + "loss": 0.6752, + "step": 953 + }, + { + "epoch": 0.5088, + "grad_norm": 0.4136216912670512, + "learning_rate": 0.00010207350990616107, + "loss": 0.6573, + "step": 954 + }, + { + "epoch": 0.5093333333333333, + "grad_norm": 0.36380003077884254, + "learning_rate": 0.00010190073917203589, + "loss": 0.6799, + "step": 955 + }, + { + "epoch": 0.5098666666666667, + "grad_norm": 0.38066318132601257, + "learning_rate": 0.00010172796276201503, + "loss": 0.6963, + "step": 956 + }, + { + "epoch": 0.5104, + "grad_norm": 0.43640031931792256, + "learning_rate": 0.0001015551811920351, + "loss": 0.745, + "step": 957 + }, + { + "epoch": 0.5109333333333334, + "grad_norm": 0.4217155080690035, + "learning_rate": 0.00010138239497804804, + "loss": 0.6891, + "step": 958 + }, + { + "epoch": 0.5114666666666666, + "grad_norm": 0.5124061787151392, + "learning_rate": 0.00010120960463601976, + "loss": 0.7634, + "step": 959 + }, + { + "epoch": 0.512, + "grad_norm": 0.5316982626641302, + "learning_rate": 0.00010103681068192845, + "loss": 0.7308, + "step": 960 + }, + { + "epoch": 0.5125333333333333, + "grad_norm": 0.3964336531410184, + "learning_rate": 0.00010086401363176305, + "loss": 0.6524, + "step": 961 + }, + { + "epoch": 0.5130666666666667, + "grad_norm": 0.44770223191113395, + "learning_rate": 0.00010069121400152181, + "loss": 0.7057, + "step": 962 + }, + { + "epoch": 0.5136, + "grad_norm": 0.400433927645052, + "learning_rate": 0.00010051841230721065, + "loss": 0.6621, + "step": 963 + }, + { + "epoch": 0.5141333333333333, + "grad_norm": 0.41862647916803847, + "learning_rate": 0.0001003456090648416, + "loss": 0.6665, + "step": 964 + }, + { + "epoch": 0.5146666666666667, + "grad_norm": 0.4012182700945775, + "learning_rate": 0.00010017280479043147, + "loss": 0.6907, + "step": 965 + }, + { + "epoch": 0.5152, + "grad_norm": 0.42947453081849973, + "learning_rate": 0.0001, + "loss": 0.723, + "step": 966 + }, + { + "epoch": 0.5157333333333334, + "grad_norm": 0.41970579172523287, + "learning_rate": 9.982719520956855e-05, + "loss": 0.6791, + "step": 967 + }, + { + "epoch": 0.5162666666666667, + "grad_norm": 0.3714481537478899, + "learning_rate": 9.965439093515841e-05, + "loss": 0.6698, + "step": 968 + }, + { + "epoch": 0.5168, + "grad_norm": 0.5237501849426753, + "learning_rate": 9.948158769278939e-05, + "loss": 0.7405, + "step": 969 + }, + { + "epoch": 0.5173333333333333, + "grad_norm": 0.47493042786920125, + "learning_rate": 9.930878599847821e-05, + "loss": 0.7338, + "step": 970 + }, + { + "epoch": 0.5178666666666667, + "grad_norm": 0.4189611336365823, + "learning_rate": 9.913598636823693e-05, + "loss": 0.7938, + "step": 971 + }, + { + "epoch": 0.5184, + "grad_norm": 0.45444657911733427, + "learning_rate": 9.896318931807155e-05, + "loss": 0.6477, + "step": 972 + }, + { + "epoch": 0.5189333333333334, + "grad_norm": 0.37069022687128605, + "learning_rate": 9.879039536398024e-05, + "loss": 0.6885, + "step": 973 + }, + { + "epoch": 0.5194666666666666, + "grad_norm": 0.3762677879935244, + "learning_rate": 9.861760502195197e-05, + "loss": 0.672, + "step": 974 + }, + { + "epoch": 0.52, + "grad_norm": 0.6230312694472444, + "learning_rate": 9.844481880796491e-05, + "loss": 0.794, + "step": 975 + }, + { + "epoch": 0.5205333333333333, + "grad_norm": 0.5535522560335289, + "learning_rate": 9.827203723798498e-05, + "loss": 0.7513, + "step": 976 + }, + { + "epoch": 0.5210666666666667, + "grad_norm": 0.41518575881430275, + "learning_rate": 9.809926082796415e-05, + "loss": 0.7236, + "step": 977 + }, + { + "epoch": 0.5216, + "grad_norm": 0.5231666406971717, + "learning_rate": 9.792649009383899e-05, + "loss": 0.7724, + "step": 978 + }, + { + "epoch": 0.5221333333333333, + "grad_norm": 0.4375656625124024, + "learning_rate": 9.775372555152912e-05, + "loss": 0.7707, + "step": 979 + }, + { + "epoch": 0.5226666666666666, + "grad_norm": 0.4372291586618811, + "learning_rate": 9.758096771693573e-05, + "loss": 0.6907, + "step": 980 + }, + { + "epoch": 0.5232, + "grad_norm": 0.4322224825025404, + "learning_rate": 9.740821710593989e-05, + "loss": 0.6954, + "step": 981 + }, + { + "epoch": 0.5237333333333334, + "grad_norm": 0.4069345600036601, + "learning_rate": 9.723547423440122e-05, + "loss": 0.6508, + "step": 982 + }, + { + "epoch": 0.5242666666666667, + "grad_norm": 0.5586441490091297, + "learning_rate": 9.70627396181561e-05, + "loss": 0.72, + "step": 983 + }, + { + "epoch": 0.5248, + "grad_norm": 0.47394620352062794, + "learning_rate": 9.689001377301633e-05, + "loss": 0.7661, + "step": 984 + }, + { + "epoch": 0.5253333333333333, + "grad_norm": 0.44740168580886186, + "learning_rate": 9.671729721476746e-05, + "loss": 0.6853, + "step": 985 + }, + { + "epoch": 0.5258666666666667, + "grad_norm": 0.4475730347483736, + "learning_rate": 9.654459045916743e-05, + "loss": 0.6968, + "step": 986 + }, + { + "epoch": 0.5264, + "grad_norm": 0.4220184235511863, + "learning_rate": 9.637189402194476e-05, + "loss": 0.714, + "step": 987 + }, + { + "epoch": 0.5269333333333334, + "grad_norm": 0.4725389504679414, + "learning_rate": 9.619920841879725e-05, + "loss": 0.7731, + "step": 988 + }, + { + "epoch": 0.5274666666666666, + "grad_norm": 0.42852303235911554, + "learning_rate": 9.602653416539031e-05, + "loss": 0.714, + "step": 989 + }, + { + "epoch": 0.528, + "grad_norm": 0.43963207443504576, + "learning_rate": 9.585387177735547e-05, + "loss": 0.7638, + "step": 990 + }, + { + "epoch": 0.5285333333333333, + "grad_norm": 0.39489295936510793, + "learning_rate": 9.568122177028884e-05, + "loss": 0.6538, + "step": 991 + }, + { + "epoch": 0.5290666666666667, + "grad_norm": 0.43974009060802266, + "learning_rate": 9.550858465974958e-05, + "loss": 0.7697, + "step": 992 + }, + { + "epoch": 0.5296, + "grad_norm": 0.37811439525893237, + "learning_rate": 9.533596096125825e-05, + "loss": 0.6487, + "step": 993 + }, + { + "epoch": 0.5301333333333333, + "grad_norm": 0.5005279798935339, + "learning_rate": 9.516335119029546e-05, + "loss": 0.7951, + "step": 994 + }, + { + "epoch": 0.5306666666666666, + "grad_norm": 0.43524518191637235, + "learning_rate": 9.499075586230013e-05, + "loss": 0.7264, + "step": 995 + }, + { + "epoch": 0.5312, + "grad_norm": 0.45685119847738254, + "learning_rate": 9.481817549266817e-05, + "loss": 0.7069, + "step": 996 + }, + { + "epoch": 0.5317333333333333, + "grad_norm": 0.4332628472237465, + "learning_rate": 9.464561059675073e-05, + "loss": 0.6838, + "step": 997 + }, + { + "epoch": 0.5322666666666667, + "grad_norm": 0.41200605772690635, + "learning_rate": 9.44730616898528e-05, + "loss": 0.6618, + "step": 998 + }, + { + "epoch": 0.5328, + "grad_norm": 0.4271296345910832, + "learning_rate": 9.430052928723153e-05, + "loss": 0.6848, + "step": 999 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 0.4185761458588467, + "learning_rate": 9.412801390409497e-05, + "loss": 0.7216, + "step": 1000 + }, + { + "epoch": 0.5338666666666667, + "grad_norm": 0.5264701446142168, + "learning_rate": 9.395551605560018e-05, + "loss": 0.6779, + "step": 1001 + }, + { + "epoch": 0.5344, + "grad_norm": 0.4020877365544272, + "learning_rate": 9.378303625685195e-05, + "loss": 0.6837, + "step": 1002 + }, + { + "epoch": 0.5349333333333334, + "grad_norm": 0.5384160472172311, + "learning_rate": 9.361057502290113e-05, + "loss": 0.7121, + "step": 1003 + }, + { + "epoch": 0.5354666666666666, + "grad_norm": 0.5213296396932801, + "learning_rate": 9.343813286874312e-05, + "loss": 0.7804, + "step": 1004 + }, + { + "epoch": 0.536, + "grad_norm": 0.5270813891162284, + "learning_rate": 9.326571030931637e-05, + "loss": 0.7956, + "step": 1005 + }, + { + "epoch": 0.5365333333333333, + "grad_norm": 0.49575141836996933, + "learning_rate": 9.309330785950086e-05, + "loss": 0.6717, + "step": 1006 + }, + { + "epoch": 0.5370666666666667, + "grad_norm": 0.42591287362664304, + "learning_rate": 9.292092603411641e-05, + "loss": 0.7224, + "step": 1007 + }, + { + "epoch": 0.5376, + "grad_norm": 0.5725826530948611, + "learning_rate": 9.274856534792138e-05, + "loss": 0.7225, + "step": 1008 + }, + { + "epoch": 0.5381333333333334, + "grad_norm": 0.45388403632573054, + "learning_rate": 9.257622631561085e-05, + "loss": 0.6919, + "step": 1009 + }, + { + "epoch": 0.5386666666666666, + "grad_norm": 0.3771585033880052, + "learning_rate": 9.240390945181543e-05, + "loss": 0.6982, + "step": 1010 + }, + { + "epoch": 0.5392, + "grad_norm": 0.38059835446466583, + "learning_rate": 9.223161527109937e-05, + "loss": 0.7109, + "step": 1011 + }, + { + "epoch": 0.5397333333333333, + "grad_norm": 0.4437724395723168, + "learning_rate": 9.205934428795929e-05, + "loss": 0.6903, + "step": 1012 + }, + { + "epoch": 0.5402666666666667, + "grad_norm": 0.5457205907235986, + "learning_rate": 9.188709701682247e-05, + "loss": 0.7525, + "step": 1013 + }, + { + "epoch": 0.5408, + "grad_norm": 0.4916490505433348, + "learning_rate": 9.171487397204539e-05, + "loss": 0.703, + "step": 1014 + }, + { + "epoch": 0.5413333333333333, + "grad_norm": 0.4120828334760189, + "learning_rate": 9.154267566791223e-05, + "loss": 0.6729, + "step": 1015 + }, + { + "epoch": 0.5418666666666667, + "grad_norm": 0.5899202363159813, + "learning_rate": 9.137050261863324e-05, + "loss": 0.7549, + "step": 1016 + }, + { + "epoch": 0.5424, + "grad_norm": 0.40318575488783587, + "learning_rate": 9.119835533834331e-05, + "loss": 0.6565, + "step": 1017 + }, + { + "epoch": 0.5429333333333334, + "grad_norm": 0.40673107079296383, + "learning_rate": 9.102623434110028e-05, + "loss": 0.6789, + "step": 1018 + }, + { + "epoch": 0.5434666666666667, + "grad_norm": 0.4498824076995007, + "learning_rate": 9.085414014088369e-05, + "loss": 0.6853, + "step": 1019 + }, + { + "epoch": 0.544, + "grad_norm": 0.4787445125107849, + "learning_rate": 9.068207325159284e-05, + "loss": 0.6575, + "step": 1020 + }, + { + "epoch": 0.5445333333333333, + "grad_norm": 0.4505039428482767, + "learning_rate": 9.051003418704565e-05, + "loss": 0.737, + "step": 1021 + }, + { + "epoch": 0.5450666666666667, + "grad_norm": 0.4148893071953553, + "learning_rate": 9.033802346097682e-05, + "loss": 0.7067, + "step": 1022 + }, + { + "epoch": 0.5456, + "grad_norm": 0.40547430757080405, + "learning_rate": 9.016604158703654e-05, + "loss": 0.7131, + "step": 1023 + }, + { + "epoch": 0.5461333333333334, + "grad_norm": 0.45950253100425287, + "learning_rate": 8.999408907878877e-05, + "loss": 0.6953, + "step": 1024 + }, + { + "epoch": 0.5466666666666666, + "grad_norm": 0.45380750671363423, + "learning_rate": 8.982216644970979e-05, + "loss": 0.7219, + "step": 1025 + }, + { + "epoch": 0.5472, + "grad_norm": 0.4710021776162681, + "learning_rate": 8.965027421318665e-05, + "loss": 0.7447, + "step": 1026 + }, + { + "epoch": 0.5477333333333333, + "grad_norm": 0.4938304629909478, + "learning_rate": 8.947841288251568e-05, + "loss": 0.7496, + "step": 1027 + }, + { + "epoch": 0.5482666666666667, + "grad_norm": 0.48659707936829805, + "learning_rate": 8.930658297090091e-05, + "loss": 0.8105, + "step": 1028 + }, + { + "epoch": 0.5488, + "grad_norm": 0.5717747968066121, + "learning_rate": 8.913478499145254e-05, + "loss": 0.7824, + "step": 1029 + }, + { + "epoch": 0.5493333333333333, + "grad_norm": 0.41510218190079734, + "learning_rate": 8.896301945718541e-05, + "loss": 0.6887, + "step": 1030 + }, + { + "epoch": 0.5498666666666666, + "grad_norm": 0.4273989601917071, + "learning_rate": 8.879128688101749e-05, + "loss": 0.6775, + "step": 1031 + }, + { + "epoch": 0.5504, + "grad_norm": 0.4090195999130743, + "learning_rate": 8.861958777576827e-05, + "loss": 0.7076, + "step": 1032 + }, + { + "epoch": 0.5509333333333334, + "grad_norm": 0.5298938815336502, + "learning_rate": 8.844792265415738e-05, + "loss": 0.7224, + "step": 1033 + }, + { + "epoch": 0.5514666666666667, + "grad_norm": 0.36980990976501543, + "learning_rate": 8.827629202880293e-05, + "loss": 0.7112, + "step": 1034 + }, + { + "epoch": 0.552, + "grad_norm": 0.667053722173872, + "learning_rate": 8.810469641222001e-05, + "loss": 0.7419, + "step": 1035 + }, + { + "epoch": 0.5525333333333333, + "grad_norm": 0.5159519841077279, + "learning_rate": 8.793313631681915e-05, + "loss": 0.755, + "step": 1036 + }, + { + "epoch": 0.5530666666666667, + "grad_norm": 0.37328800367068515, + "learning_rate": 8.776161225490489e-05, + "loss": 0.6607, + "step": 1037 + }, + { + "epoch": 0.5536, + "grad_norm": 0.4163781319007869, + "learning_rate": 8.759012473867407e-05, + "loss": 0.7453, + "step": 1038 + }, + { + "epoch": 0.5541333333333334, + "grad_norm": 0.38413750861242135, + "learning_rate": 8.741867428021446e-05, + "loss": 0.6544, + "step": 1039 + }, + { + "epoch": 0.5546666666666666, + "grad_norm": 0.438990645670329, + "learning_rate": 8.724726139150318e-05, + "loss": 0.649, + "step": 1040 + }, + { + "epoch": 0.5552, + "grad_norm": 0.413988489597262, + "learning_rate": 8.707588658440511e-05, + "loss": 0.7088, + "step": 1041 + }, + { + "epoch": 0.5557333333333333, + "grad_norm": 0.4047062021806597, + "learning_rate": 8.690455037067141e-05, + "loss": 0.6866, + "step": 1042 + }, + { + "epoch": 0.5562666666666667, + "grad_norm": 0.5325821394653242, + "learning_rate": 8.673325326193806e-05, + "loss": 0.7167, + "step": 1043 + }, + { + "epoch": 0.5568, + "grad_norm": 0.4591998311226366, + "learning_rate": 8.656199576972423e-05, + "loss": 0.6783, + "step": 1044 + }, + { + "epoch": 0.5573333333333333, + "grad_norm": 0.5025201483899181, + "learning_rate": 8.639077840543077e-05, + "loss": 0.7875, + "step": 1045 + }, + { + "epoch": 0.5578666666666666, + "grad_norm": 0.40676184484246597, + "learning_rate": 8.621960168033867e-05, + "loss": 0.7082, + "step": 1046 + }, + { + "epoch": 0.5584, + "grad_norm": 0.45801410510596946, + "learning_rate": 8.604846610560771e-05, + "loss": 0.7613, + "step": 1047 + }, + { + "epoch": 0.5589333333333333, + "grad_norm": 0.41187950614770436, + "learning_rate": 8.587737219227462e-05, + "loss": 0.7598, + "step": 1048 + }, + { + "epoch": 0.5594666666666667, + "grad_norm": 0.5732262740881797, + "learning_rate": 8.570632045125185e-05, + "loss": 0.7329, + "step": 1049 + }, + { + "epoch": 0.56, + "grad_norm": 0.3818635915803538, + "learning_rate": 8.553531139332582e-05, + "loss": 0.6546, + "step": 1050 + }, + { + "epoch": 0.5605333333333333, + "grad_norm": 0.39724465874359377, + "learning_rate": 8.536434552915556e-05, + "loss": 0.7249, + "step": 1051 + }, + { + "epoch": 0.5610666666666667, + "grad_norm": 0.4643601322601259, + "learning_rate": 8.519342336927105e-05, + "loss": 0.7876, + "step": 1052 + }, + { + "epoch": 0.5616, + "grad_norm": 0.40560055104386716, + "learning_rate": 8.502254542407186e-05, + "loss": 0.6493, + "step": 1053 + }, + { + "epoch": 0.5621333333333334, + "grad_norm": 0.41988583263667245, + "learning_rate": 8.485171220382545e-05, + "loss": 0.6976, + "step": 1054 + }, + { + "epoch": 0.5626666666666666, + "grad_norm": 0.5281061830116633, + "learning_rate": 8.468092421866573e-05, + "loss": 0.7101, + "step": 1055 + }, + { + "epoch": 0.5632, + "grad_norm": 0.4185855771334389, + "learning_rate": 8.451018197859153e-05, + "loss": 0.7464, + "step": 1056 + }, + { + "epoch": 0.5637333333333333, + "grad_norm": 0.4169364040206677, + "learning_rate": 8.433948599346516e-05, + "loss": 0.6963, + "step": 1057 + }, + { + "epoch": 0.5642666666666667, + "grad_norm": 0.4138958003302118, + "learning_rate": 8.416883677301069e-05, + "loss": 0.6927, + "step": 1058 + }, + { + "epoch": 0.5648, + "grad_norm": 0.4281538675806205, + "learning_rate": 8.399823482681262e-05, + "loss": 0.6665, + "step": 1059 + }, + { + "epoch": 0.5653333333333334, + "grad_norm": 0.470171983661211, + "learning_rate": 8.382768066431425e-05, + "loss": 0.7198, + "step": 1060 + }, + { + "epoch": 0.5658666666666666, + "grad_norm": 0.3897485372566952, + "learning_rate": 8.36571747948162e-05, + "loss": 0.6301, + "step": 1061 + }, + { + "epoch": 0.5664, + "grad_norm": 0.47207872974235343, + "learning_rate": 8.348671772747487e-05, + "loss": 0.652, + "step": 1062 + }, + { + "epoch": 0.5669333333333333, + "grad_norm": 0.580648892234986, + "learning_rate": 8.33163099713009e-05, + "loss": 0.7559, + "step": 1063 + }, + { + "epoch": 0.5674666666666667, + "grad_norm": 0.37649919680147714, + "learning_rate": 8.31459520351578e-05, + "loss": 0.6825, + "step": 1064 + }, + { + "epoch": 0.568, + "grad_norm": 0.46132230846384087, + "learning_rate": 8.297564442776014e-05, + "loss": 0.6524, + "step": 1065 + }, + { + "epoch": 0.5685333333333333, + "grad_norm": 0.4117939450778646, + "learning_rate": 8.280538765767235e-05, + "loss": 0.633, + "step": 1066 + }, + { + "epoch": 0.5690666666666667, + "grad_norm": 0.5251640489626463, + "learning_rate": 8.263518223330697e-05, + "loss": 0.6484, + "step": 1067 + }, + { + "epoch": 0.5696, + "grad_norm": 0.5370556593506223, + "learning_rate": 8.246502866292324e-05, + "loss": 0.7925, + "step": 1068 + }, + { + "epoch": 0.5701333333333334, + "grad_norm": 0.48977171085261184, + "learning_rate": 8.22949274546255e-05, + "loss": 0.727, + "step": 1069 + }, + { + "epoch": 0.5706666666666667, + "grad_norm": 0.38950419140082826, + "learning_rate": 8.212487911636184e-05, + "loss": 0.6583, + "step": 1070 + }, + { + "epoch": 0.5712, + "grad_norm": 0.4513940074808248, + "learning_rate": 8.195488415592238e-05, + "loss": 0.7201, + "step": 1071 + }, + { + "epoch": 0.5717333333333333, + "grad_norm": 0.44536962993224344, + "learning_rate": 8.178494308093789e-05, + "loss": 0.7007, + "step": 1072 + }, + { + "epoch": 0.5722666666666667, + "grad_norm": 0.4924820800581278, + "learning_rate": 8.161505639887817e-05, + "loss": 0.671, + "step": 1073 + }, + { + "epoch": 0.5728, + "grad_norm": 0.4548878688956654, + "learning_rate": 8.144522461705067e-05, + "loss": 0.6757, + "step": 1074 + }, + { + "epoch": 0.5733333333333334, + "grad_norm": 0.5097819864945549, + "learning_rate": 8.127544824259889e-05, + "loss": 0.6906, + "step": 1075 + }, + { + "epoch": 0.5738666666666666, + "grad_norm": 0.47639256146424586, + "learning_rate": 8.110572778250085e-05, + "loss": 0.7345, + "step": 1076 + }, + { + "epoch": 0.5744, + "grad_norm": 0.5605924808144956, + "learning_rate": 8.093606374356759e-05, + "loss": 0.6721, + "step": 1077 + }, + { + "epoch": 0.5749333333333333, + "grad_norm": 0.473831236069468, + "learning_rate": 8.076645663244168e-05, + "loss": 0.702, + "step": 1078 + }, + { + "epoch": 0.5754666666666667, + "grad_norm": 0.45800812471608365, + "learning_rate": 8.059690695559568e-05, + "loss": 0.7131, + "step": 1079 + }, + { + "epoch": 0.576, + "grad_norm": 0.39545170615928676, + "learning_rate": 8.042741521933071e-05, + "loss": 0.6403, + "step": 1080 + }, + { + "epoch": 0.5765333333333333, + "grad_norm": 0.462802428054446, + "learning_rate": 8.025798192977481e-05, + "loss": 0.7314, + "step": 1081 + }, + { + "epoch": 0.5770666666666666, + "grad_norm": 0.4649214253109914, + "learning_rate": 8.008860759288147e-05, + "loss": 0.6844, + "step": 1082 + }, + { + "epoch": 0.5776, + "grad_norm": 0.4383940632651265, + "learning_rate": 7.991929271442817e-05, + "loss": 0.6449, + "step": 1083 + }, + { + "epoch": 0.5781333333333334, + "grad_norm": 0.4692348617737329, + "learning_rate": 7.975003780001485e-05, + "loss": 0.7542, + "step": 1084 + }, + { + "epoch": 0.5786666666666667, + "grad_norm": 0.5257629918914171, + "learning_rate": 7.958084335506239e-05, + "loss": 0.8086, + "step": 1085 + }, + { + "epoch": 0.5792, + "grad_norm": 0.4588194630842873, + "learning_rate": 7.941170988481108e-05, + "loss": 0.6464, + "step": 1086 + }, + { + "epoch": 0.5797333333333333, + "grad_norm": 0.48580849547296195, + "learning_rate": 7.924263789431912e-05, + "loss": 0.7594, + "step": 1087 + }, + { + "epoch": 0.5802666666666667, + "grad_norm": 0.353636881092341, + "learning_rate": 7.907362788846116e-05, + "loss": 0.7121, + "step": 1088 + }, + { + "epoch": 0.5808, + "grad_norm": 0.38104388079939217, + "learning_rate": 7.89046803719267e-05, + "loss": 0.6567, + "step": 1089 + }, + { + "epoch": 0.5813333333333334, + "grad_norm": 0.42042891837825247, + "learning_rate": 7.873579584921869e-05, + "loss": 0.7057, + "step": 1090 + }, + { + "epoch": 0.5818666666666666, + "grad_norm": 0.40031403089755774, + "learning_rate": 7.856697482465196e-05, + "loss": 0.6226, + "step": 1091 + }, + { + "epoch": 0.5824, + "grad_norm": 0.4118951225176375, + "learning_rate": 7.839821780235168e-05, + "loss": 0.6514, + "step": 1092 + }, + { + "epoch": 0.5829333333333333, + "grad_norm": 0.42668133459988783, + "learning_rate": 7.822952528625191e-05, + "loss": 0.6994, + "step": 1093 + }, + { + "epoch": 0.5834666666666667, + "grad_norm": 0.40110478497299995, + "learning_rate": 7.806089778009421e-05, + "loss": 0.6952, + "step": 1094 + }, + { + "epoch": 0.584, + "grad_norm": 0.506616334073721, + "learning_rate": 7.789233578742582e-05, + "loss": 0.7305, + "step": 1095 + }, + { + "epoch": 0.5845333333333333, + "grad_norm": 0.39826405904340123, + "learning_rate": 7.772383981159849e-05, + "loss": 0.6628, + "step": 1096 + }, + { + "epoch": 0.5850666666666666, + "grad_norm": 0.4271293723300673, + "learning_rate": 7.755541035576677e-05, + "loss": 0.676, + "step": 1097 + }, + { + "epoch": 0.5856, + "grad_norm": 0.437120180626211, + "learning_rate": 7.738704792288655e-05, + "loss": 0.7236, + "step": 1098 + }, + { + "epoch": 0.5861333333333333, + "grad_norm": 0.42255243493505024, + "learning_rate": 7.721875301571359e-05, + "loss": 0.6824, + "step": 1099 + }, + { + "epoch": 0.5866666666666667, + "grad_norm": 0.46590559822989075, + "learning_rate": 7.705052613680211e-05, + "loss": 0.6974, + "step": 1100 + }, + { + "epoch": 0.5872, + "grad_norm": 0.4463527365112677, + "learning_rate": 7.688236778850306e-05, + "loss": 0.6912, + "step": 1101 + }, + { + "epoch": 0.5877333333333333, + "grad_norm": 0.4464292756636874, + "learning_rate": 7.671427847296275e-05, + "loss": 0.634, + "step": 1102 + }, + { + "epoch": 0.5882666666666667, + "grad_norm": 0.4486764707441599, + "learning_rate": 7.654625869212146e-05, + "loss": 0.6361, + "step": 1103 + }, + { + "epoch": 0.5888, + "grad_norm": 0.41175580150636143, + "learning_rate": 7.637830894771175e-05, + "loss": 0.6901, + "step": 1104 + }, + { + "epoch": 0.5893333333333334, + "grad_norm": 0.45036368725317394, + "learning_rate": 7.6210429741257e-05, + "loss": 0.6799, + "step": 1105 + }, + { + "epoch": 0.5898666666666667, + "grad_norm": 0.41771862187974396, + "learning_rate": 7.604262157407007e-05, + "loss": 0.6541, + "step": 1106 + }, + { + "epoch": 0.5904, + "grad_norm": 0.3996706258597516, + "learning_rate": 7.587488494725157e-05, + "loss": 0.7087, + "step": 1107 + }, + { + "epoch": 0.5909333333333333, + "grad_norm": 0.43454543250144245, + "learning_rate": 7.570722036168854e-05, + "loss": 0.6238, + "step": 1108 + }, + { + "epoch": 0.5914666666666667, + "grad_norm": 0.40885044054759473, + "learning_rate": 7.55396283180529e-05, + "loss": 0.6826, + "step": 1109 + }, + { + "epoch": 0.592, + "grad_norm": 0.4164739833269892, + "learning_rate": 7.537210931679987e-05, + "loss": 0.6845, + "step": 1110 + }, + { + "epoch": 0.5925333333333334, + "grad_norm": 0.4365546905048353, + "learning_rate": 7.520466385816671e-05, + "loss": 0.7073, + "step": 1111 + }, + { + "epoch": 0.5930666666666666, + "grad_norm": 0.4538456707835925, + "learning_rate": 7.503729244217086e-05, + "loss": 0.7201, + "step": 1112 + }, + { + "epoch": 0.5936, + "grad_norm": 0.37097894672741727, + "learning_rate": 7.48699955686089e-05, + "loss": 0.6486, + "step": 1113 + }, + { + "epoch": 0.5941333333333333, + "grad_norm": 0.4474030266548989, + "learning_rate": 7.470277373705461e-05, + "loss": 0.6147, + "step": 1114 + }, + { + "epoch": 0.5946666666666667, + "grad_norm": 0.4219935785255905, + "learning_rate": 7.453562744685778e-05, + "loss": 0.655, + "step": 1115 + }, + { + "epoch": 0.5952, + "grad_norm": 0.44335158294294086, + "learning_rate": 7.43685571971426e-05, + "loss": 0.7268, + "step": 1116 + }, + { + "epoch": 0.5957333333333333, + "grad_norm": 0.4354821147625836, + "learning_rate": 7.42015634868062e-05, + "loss": 0.684, + "step": 1117 + }, + { + "epoch": 0.5962666666666666, + "grad_norm": 0.4061371543693794, + "learning_rate": 7.403464681451715e-05, + "loss": 0.6952, + "step": 1118 + }, + { + "epoch": 0.5968, + "grad_norm": 0.5064732816872987, + "learning_rate": 7.386780767871397e-05, + "loss": 0.6548, + "step": 1119 + }, + { + "epoch": 0.5973333333333334, + "grad_norm": 0.4391331847943741, + "learning_rate": 7.370104657760361e-05, + "loss": 0.6606, + "step": 1120 + }, + { + "epoch": 0.5978666666666667, + "grad_norm": 0.4335560681238166, + "learning_rate": 7.353436400916004e-05, + "loss": 0.7329, + "step": 1121 + }, + { + "epoch": 0.5984, + "grad_norm": 0.4535808370951183, + "learning_rate": 7.336776047112276e-05, + "loss": 0.6476, + "step": 1122 + }, + { + "epoch": 0.5989333333333333, + "grad_norm": 0.41898636185061233, + "learning_rate": 7.320123646099519e-05, + "loss": 0.6789, + "step": 1123 + }, + { + "epoch": 0.5994666666666667, + "grad_norm": 0.4494944636894788, + "learning_rate": 7.303479247604332e-05, + "loss": 0.7103, + "step": 1124 + }, + { + "epoch": 0.6, + "grad_norm": 0.491356584275832, + "learning_rate": 7.286842901329412e-05, + "loss": 0.745, + "step": 1125 + }, + { + "epoch": 0.6005333333333334, + "grad_norm": 0.6830744799297692, + "learning_rate": 7.270214656953415e-05, + "loss": 0.7344, + "step": 1126 + }, + { + "epoch": 0.6010666666666666, + "grad_norm": 0.47208146062633694, + "learning_rate": 7.253594564130804e-05, + "loss": 0.7296, + "step": 1127 + }, + { + "epoch": 0.6016, + "grad_norm": 0.5148113440315742, + "learning_rate": 7.236982672491698e-05, + "loss": 0.6917, + "step": 1128 + }, + { + "epoch": 0.6021333333333333, + "grad_norm": 0.46249237893405354, + "learning_rate": 7.22037903164173e-05, + "loss": 0.6511, + "step": 1129 + }, + { + "epoch": 0.6026666666666667, + "grad_norm": 0.4333329759022181, + "learning_rate": 7.203783691161883e-05, + "loss": 0.7026, + "step": 1130 + }, + { + "epoch": 0.6032, + "grad_norm": 0.45181958290943725, + "learning_rate": 7.187196700608373e-05, + "loss": 0.6934, + "step": 1131 + }, + { + "epoch": 0.6037333333333333, + "grad_norm": 0.4967025867464761, + "learning_rate": 7.170618109512465e-05, + "loss": 0.7365, + "step": 1132 + }, + { + "epoch": 0.6042666666666666, + "grad_norm": 0.44556605284460443, + "learning_rate": 7.154047967380354e-05, + "loss": 0.7594, + "step": 1133 + }, + { + "epoch": 0.6048, + "grad_norm": 0.43844495879403456, + "learning_rate": 7.137486323692995e-05, + "loss": 0.7302, + "step": 1134 + }, + { + "epoch": 0.6053333333333333, + "grad_norm": 0.4447741052768386, + "learning_rate": 7.12093322790597e-05, + "loss": 0.6862, + "step": 1135 + }, + { + "epoch": 0.6058666666666667, + "grad_norm": 0.4044188770090188, + "learning_rate": 7.104388729449338e-05, + "loss": 0.6946, + "step": 1136 + }, + { + "epoch": 0.6064, + "grad_norm": 0.5415468788093395, + "learning_rate": 7.087852877727481e-05, + "loss": 0.7217, + "step": 1137 + }, + { + "epoch": 0.6069333333333333, + "grad_norm": 0.39803291072156194, + "learning_rate": 7.071325722118963e-05, + "loss": 0.6049, + "step": 1138 + }, + { + "epoch": 0.6074666666666667, + "grad_norm": 0.42524764548934013, + "learning_rate": 7.054807311976379e-05, + "loss": 0.6956, + "step": 1139 + }, + { + "epoch": 0.608, + "grad_norm": 0.40243540895739494, + "learning_rate": 7.038297696626206e-05, + "loss": 0.6944, + "step": 1140 + }, + { + "epoch": 0.6085333333333334, + "grad_norm": 0.41108116013920964, + "learning_rate": 7.021796925368667e-05, + "loss": 0.6781, + "step": 1141 + }, + { + "epoch": 0.6090666666666666, + "grad_norm": 0.4180256922431302, + "learning_rate": 7.005305047477566e-05, + "loss": 0.7563, + "step": 1142 + }, + { + "epoch": 0.6096, + "grad_norm": 0.37883994845143076, + "learning_rate": 6.988822112200156e-05, + "loss": 0.6508, + "step": 1143 + }, + { + "epoch": 0.6101333333333333, + "grad_norm": 0.48577703163002417, + "learning_rate": 6.972348168756983e-05, + "loss": 0.6649, + "step": 1144 + }, + { + "epoch": 0.6106666666666667, + "grad_norm": 0.4496672306844674, + "learning_rate": 6.955883266341741e-05, + "loss": 0.6892, + "step": 1145 + }, + { + "epoch": 0.6112, + "grad_norm": 0.440269970232495, + "learning_rate": 6.939427454121128e-05, + "loss": 0.6979, + "step": 1146 + }, + { + "epoch": 0.6117333333333334, + "grad_norm": 0.43144139434154993, + "learning_rate": 6.922980781234699e-05, + "loss": 0.7325, + "step": 1147 + }, + { + "epoch": 0.6122666666666666, + "grad_norm": 0.4567799170805063, + "learning_rate": 6.906543296794714e-05, + "loss": 0.6869, + "step": 1148 + }, + { + "epoch": 0.6128, + "grad_norm": 0.45884689535164136, + "learning_rate": 6.890115049885994e-05, + "loss": 0.6246, + "step": 1149 + }, + { + "epoch": 0.6133333333333333, + "grad_norm": 0.4612139017442247, + "learning_rate": 6.873696089565786e-05, + "loss": 0.633, + "step": 1150 + }, + { + "epoch": 0.6138666666666667, + "grad_norm": 0.38772962724409904, + "learning_rate": 6.85728646486359e-05, + "loss": 0.6571, + "step": 1151 + }, + { + "epoch": 0.6144, + "grad_norm": 0.49487000040161555, + "learning_rate": 6.84088622478104e-05, + "loss": 0.7017, + "step": 1152 + }, + { + "epoch": 0.6149333333333333, + "grad_norm": 0.39082581245765496, + "learning_rate": 6.82449541829174e-05, + "loss": 0.6365, + "step": 1153 + }, + { + "epoch": 0.6154666666666667, + "grad_norm": 0.4691915029959711, + "learning_rate": 6.80811409434113e-05, + "loss": 0.696, + "step": 1154 + }, + { + "epoch": 0.616, + "grad_norm": 0.4654745459694697, + "learning_rate": 6.791742301846326e-05, + "loss": 0.6701, + "step": 1155 + }, + { + "epoch": 0.6165333333333334, + "grad_norm": 0.4960853830959196, + "learning_rate": 6.775380089695986e-05, + "loss": 0.7386, + "step": 1156 + }, + { + "epoch": 0.6170666666666667, + "grad_norm": 0.41269755829152927, + "learning_rate": 6.759027506750158e-05, + "loss": 0.7021, + "step": 1157 + }, + { + "epoch": 0.6176, + "grad_norm": 0.6735254066940529, + "learning_rate": 6.742684601840141e-05, + "loss": 0.7665, + "step": 1158 + }, + { + "epoch": 0.6181333333333333, + "grad_norm": 0.5474011650485257, + "learning_rate": 6.726351423768322e-05, + "loss": 0.7174, + "step": 1159 + }, + { + "epoch": 0.6186666666666667, + "grad_norm": 0.397827080451777, + "learning_rate": 6.710028021308061e-05, + "loss": 0.6678, + "step": 1160 + }, + { + "epoch": 0.6192, + "grad_norm": 0.43277723143519836, + "learning_rate": 6.693714443203507e-05, + "loss": 0.6794, + "step": 1161 + }, + { + "epoch": 0.6197333333333334, + "grad_norm": 0.4678172416628807, + "learning_rate": 6.677410738169485e-05, + "loss": 0.7276, + "step": 1162 + }, + { + "epoch": 0.6202666666666666, + "grad_norm": 0.47678291661801986, + "learning_rate": 6.661116954891328e-05, + "loss": 0.7016, + "step": 1163 + }, + { + "epoch": 0.6208, + "grad_norm": 0.38415068856345364, + "learning_rate": 6.644833142024751e-05, + "loss": 0.6685, + "step": 1164 + }, + { + "epoch": 0.6213333333333333, + "grad_norm": 0.38330519173168615, + "learning_rate": 6.62855934819569e-05, + "loss": 0.6694, + "step": 1165 + }, + { + "epoch": 0.6218666666666667, + "grad_norm": 0.45012001154278036, + "learning_rate": 6.612295622000162e-05, + "loss": 0.6893, + "step": 1166 + }, + { + "epoch": 0.6224, + "grad_norm": 0.36465998860021354, + "learning_rate": 6.59604201200412e-05, + "loss": 0.6549, + "step": 1167 + }, + { + "epoch": 0.6229333333333333, + "grad_norm": 0.4089045562238243, + "learning_rate": 6.579798566743314e-05, + "loss": 0.655, + "step": 1168 + }, + { + "epoch": 0.6234666666666666, + "grad_norm": 0.401000896754143, + "learning_rate": 6.563565334723134e-05, + "loss": 0.6687, + "step": 1169 + }, + { + "epoch": 0.624, + "grad_norm": 0.45878175518310715, + "learning_rate": 6.547342364418481e-05, + "loss": 0.7239, + "step": 1170 + }, + { + "epoch": 0.6245333333333334, + "grad_norm": 0.49041075314677157, + "learning_rate": 6.531129704273604e-05, + "loss": 0.734, + "step": 1171 + }, + { + "epoch": 0.6250666666666667, + "grad_norm": 0.39186563776406375, + "learning_rate": 6.514927402701964e-05, + "loss": 0.6688, + "step": 1172 + }, + { + "epoch": 0.6256, + "grad_norm": 0.4914328053426828, + "learning_rate": 6.498735508086093e-05, + "loss": 0.7282, + "step": 1173 + }, + { + "epoch": 0.6261333333333333, + "grad_norm": 0.40625192142635563, + "learning_rate": 6.48255406877745e-05, + "loss": 0.6386, + "step": 1174 + }, + { + "epoch": 0.6266666666666667, + "grad_norm": 0.3748063684152412, + "learning_rate": 6.466383133096267e-05, + "loss": 0.6615, + "step": 1175 + }, + { + "epoch": 0.6272, + "grad_norm": 0.5215673018623856, + "learning_rate": 6.450222749331414e-05, + "loss": 0.6575, + "step": 1176 + }, + { + "epoch": 0.6277333333333334, + "grad_norm": 0.504226772605567, + "learning_rate": 6.434072965740242e-05, + "loss": 0.6481, + "step": 1177 + }, + { + "epoch": 0.6282666666666666, + "grad_norm": 0.399025245567611, + "learning_rate": 6.417933830548467e-05, + "loss": 0.71, + "step": 1178 + }, + { + "epoch": 0.6288, + "grad_norm": 0.3330948835069077, + "learning_rate": 6.40180539194999e-05, + "loss": 0.6139, + "step": 1179 + }, + { + "epoch": 0.6293333333333333, + "grad_norm": 0.4958690052247975, + "learning_rate": 6.385687698106781e-05, + "loss": 0.7787, + "step": 1180 + }, + { + "epoch": 0.6298666666666667, + "grad_norm": 0.518376029324385, + "learning_rate": 6.369580797148718e-05, + "loss": 0.7002, + "step": 1181 + }, + { + "epoch": 0.6304, + "grad_norm": 0.43775005862410454, + "learning_rate": 6.35348473717345e-05, + "loss": 0.6702, + "step": 1182 + }, + { + "epoch": 0.6309333333333333, + "grad_norm": 0.564111859142483, + "learning_rate": 6.337399566246257e-05, + "loss": 0.7696, + "step": 1183 + }, + { + "epoch": 0.6314666666666666, + "grad_norm": 0.4406942671738771, + "learning_rate": 6.321325332399903e-05, + "loss": 0.6969, + "step": 1184 + }, + { + "epoch": 0.632, + "grad_norm": 0.4294131443095355, + "learning_rate": 6.305262083634488e-05, + "loss": 0.7576, + "step": 1185 + }, + { + "epoch": 0.6325333333333333, + "grad_norm": 0.4005698087450695, + "learning_rate": 6.289209867917312e-05, + "loss": 0.6544, + "step": 1186 + }, + { + "epoch": 0.6330666666666667, + "grad_norm": 0.4229655347526881, + "learning_rate": 6.273168733182722e-05, + "loss": 0.7072, + "step": 1187 + }, + { + "epoch": 0.6336, + "grad_norm": 0.4354204824694456, + "learning_rate": 6.25713872733199e-05, + "loss": 0.7203, + "step": 1188 + }, + { + "epoch": 0.6341333333333333, + "grad_norm": 0.4090693349291068, + "learning_rate": 6.241119898233144e-05, + "loss": 0.7133, + "step": 1189 + }, + { + "epoch": 0.6346666666666667, + "grad_norm": 0.44127117465359905, + "learning_rate": 6.225112293720836e-05, + "loss": 0.7059, + "step": 1190 + }, + { + "epoch": 0.6352, + "grad_norm": 0.40594685075351816, + "learning_rate": 6.209115961596208e-05, + "loss": 0.688, + "step": 1191 + }, + { + "epoch": 0.6357333333333334, + "grad_norm": 0.4479936053531853, + "learning_rate": 6.19313094962673e-05, + "loss": 0.6753, + "step": 1192 + }, + { + "epoch": 0.6362666666666666, + "grad_norm": 0.44240262318228424, + "learning_rate": 6.177157305546078e-05, + "loss": 0.647, + "step": 1193 + }, + { + "epoch": 0.6368, + "grad_norm": 0.4702963243682181, + "learning_rate": 6.161195077053976e-05, + "loss": 0.7452, + "step": 1194 + }, + { + "epoch": 0.6373333333333333, + "grad_norm": 0.4403693769384026, + "learning_rate": 6.145244311816063e-05, + "loss": 0.7167, + "step": 1195 + }, + { + "epoch": 0.6378666666666667, + "grad_norm": 0.40999692307058, + "learning_rate": 6.129305057463741e-05, + "loss": 0.6198, + "step": 1196 + }, + { + "epoch": 0.6384, + "grad_norm": 0.5226868856788982, + "learning_rate": 6.113377361594049e-05, + "loss": 0.7573, + "step": 1197 + }, + { + "epoch": 0.6389333333333334, + "grad_norm": 0.39179328679742803, + "learning_rate": 6.0974612717695004e-05, + "loss": 0.6695, + "step": 1198 + }, + { + "epoch": 0.6394666666666666, + "grad_norm": 0.42774356488428983, + "learning_rate": 6.0815568355179556e-05, + "loss": 0.7275, + "step": 1199 + }, + { + "epoch": 0.64, + "grad_norm": 0.43943837571035477, + "learning_rate": 6.065664100332478e-05, + "loss": 0.7273, + "step": 1200 + }, + { + "epoch": 0.6405333333333333, + "grad_norm": 0.552480001257446, + "learning_rate": 6.0497831136711836e-05, + "loss": 0.7572, + "step": 1201 + }, + { + "epoch": 0.6410666666666667, + "grad_norm": 0.48459244632161336, + "learning_rate": 6.0339139229571116e-05, + "loss": 0.6398, + "step": 1202 + }, + { + "epoch": 0.6416, + "grad_norm": 0.41043053451708134, + "learning_rate": 6.018056575578075e-05, + "loss": 0.6884, + "step": 1203 + }, + { + "epoch": 0.6421333333333333, + "grad_norm": 0.4107179284069568, + "learning_rate": 6.002211118886514e-05, + "loss": 0.7124, + "step": 1204 + }, + { + "epoch": 0.6426666666666667, + "grad_norm": 0.43047382909893095, + "learning_rate": 5.986377600199371e-05, + "loss": 0.6733, + "step": 1205 + }, + { + "epoch": 0.6432, + "grad_norm": 0.39600325407900394, + "learning_rate": 5.970556066797941e-05, + "loss": 0.6283, + "step": 1206 + }, + { + "epoch": 0.6437333333333334, + "grad_norm": 0.41500514219441187, + "learning_rate": 5.9547465659277215e-05, + "loss": 0.6449, + "step": 1207 + }, + { + "epoch": 0.6442666666666667, + "grad_norm": 0.4649038637148329, + "learning_rate": 5.938949144798279e-05, + "loss": 0.7128, + "step": 1208 + }, + { + "epoch": 0.6448, + "grad_norm": 0.3742635905884641, + "learning_rate": 5.923163850583113e-05, + "loss": 0.6253, + "step": 1209 + }, + { + "epoch": 0.6453333333333333, + "grad_norm": 0.49873594095049717, + "learning_rate": 5.907390730419507e-05, + "loss": 0.6567, + "step": 1210 + }, + { + "epoch": 0.6458666666666667, + "grad_norm": 0.5030172499643145, + "learning_rate": 5.8916298314083915e-05, + "loss": 0.7176, + "step": 1211 + }, + { + "epoch": 0.6464, + "grad_norm": 0.38035823222941917, + "learning_rate": 5.875881200614207e-05, + "loss": 0.648, + "step": 1212 + }, + { + "epoch": 0.6469333333333334, + "grad_norm": 0.403454227071662, + "learning_rate": 5.860144885064751e-05, + "loss": 0.608, + "step": 1213 + }, + { + "epoch": 0.6474666666666666, + "grad_norm": 0.39566732890510886, + "learning_rate": 5.8444209317510514e-05, + "loss": 0.6571, + "step": 1214 + }, + { + "epoch": 0.648, + "grad_norm": 0.39833030979330514, + "learning_rate": 5.828709387627218e-05, + "loss": 0.6795, + "step": 1215 + }, + { + "epoch": 0.6485333333333333, + "grad_norm": 0.4622437312038266, + "learning_rate": 5.813010299610313e-05, + "loss": 0.7967, + "step": 1216 + }, + { + "epoch": 0.6490666666666667, + "grad_norm": 0.4125561000297353, + "learning_rate": 5.797323714580192e-05, + "loss": 0.6698, + "step": 1217 + }, + { + "epoch": 0.6496, + "grad_norm": 0.41048898462940947, + "learning_rate": 5.781649679379378e-05, + "loss": 0.6844, + "step": 1218 + }, + { + "epoch": 0.6501333333333333, + "grad_norm": 0.43286647820008745, + "learning_rate": 5.765988240812921e-05, + "loss": 0.7044, + "step": 1219 + }, + { + "epoch": 0.6506666666666666, + "grad_norm": 0.487024153960826, + "learning_rate": 5.750339445648252e-05, + "loss": 0.7095, + "step": 1220 + }, + { + "epoch": 0.6512, + "grad_norm": 0.49409516688134575, + "learning_rate": 5.73470334061505e-05, + "loss": 0.6861, + "step": 1221 + }, + { + "epoch": 0.6517333333333334, + "grad_norm": 0.39803526419291674, + "learning_rate": 5.7190799724050924e-05, + "loss": 0.6631, + "step": 1222 + }, + { + "epoch": 0.6522666666666667, + "grad_norm": 0.4287146084213074, + "learning_rate": 5.7034693876721376e-05, + "loss": 0.721, + "step": 1223 + }, + { + "epoch": 0.6528, + "grad_norm": 0.46736119936228177, + "learning_rate": 5.687871633031754e-05, + "loss": 0.7044, + "step": 1224 + }, + { + "epoch": 0.6533333333333333, + "grad_norm": 0.4386736030522742, + "learning_rate": 5.6722867550612116e-05, + "loss": 0.7027, + "step": 1225 + }, + { + "epoch": 0.6538666666666667, + "grad_norm": 0.4404033746840516, + "learning_rate": 5.6567148002993164e-05, + "loss": 0.6616, + "step": 1226 + }, + { + "epoch": 0.6544, + "grad_norm": 0.45784499125755806, + "learning_rate": 5.6411558152462894e-05, + "loss": 0.6221, + "step": 1227 + }, + { + "epoch": 0.6549333333333334, + "grad_norm": 0.45788776415686483, + "learning_rate": 5.625609846363622e-05, + "loss": 0.7006, + "step": 1228 + }, + { + "epoch": 0.6554666666666666, + "grad_norm": 0.43731077678181945, + "learning_rate": 5.6100769400739383e-05, + "loss": 0.6902, + "step": 1229 + }, + { + "epoch": 0.656, + "grad_norm": 0.6796396357166941, + "learning_rate": 5.5945571427608526e-05, + "loss": 0.7697, + "step": 1230 + }, + { + "epoch": 0.6565333333333333, + "grad_norm": 0.42635858623764006, + "learning_rate": 5.579050500768836e-05, + "loss": 0.6844, + "step": 1231 + }, + { + "epoch": 0.6570666666666667, + "grad_norm": 0.5295579742057989, + "learning_rate": 5.5635570604030705e-05, + "loss": 0.7399, + "step": 1232 + }, + { + "epoch": 0.6576, + "grad_norm": 0.41825020566671456, + "learning_rate": 5.54807686792933e-05, + "loss": 0.7376, + "step": 1233 + }, + { + "epoch": 0.6581333333333333, + "grad_norm": 0.4803419664026688, + "learning_rate": 5.53260996957381e-05, + "loss": 0.778, + "step": 1234 + }, + { + "epoch": 0.6586666666666666, + "grad_norm": 0.8095754262667553, + "learning_rate": 5.5171564115230254e-05, + "loss": 0.6842, + "step": 1235 + }, + { + "epoch": 0.6592, + "grad_norm": 0.37785107552632613, + "learning_rate": 5.501716239923642e-05, + "loss": 0.6701, + "step": 1236 + }, + { + "epoch": 0.6597333333333333, + "grad_norm": 0.46132617217855576, + "learning_rate": 5.486289500882355e-05, + "loss": 0.6356, + "step": 1237 + }, + { + "epoch": 0.6602666666666667, + "grad_norm": 0.5269509945864841, + "learning_rate": 5.47087624046575e-05, + "loss": 0.6769, + "step": 1238 + }, + { + "epoch": 0.6608, + "grad_norm": 0.3922689653386676, + "learning_rate": 5.4554765047001613e-05, + "loss": 0.645, + "step": 1239 + }, + { + "epoch": 0.6613333333333333, + "grad_norm": 0.555143415465988, + "learning_rate": 5.4400903395715366e-05, + "loss": 0.7452, + "step": 1240 + }, + { + "epoch": 0.6618666666666667, + "grad_norm": 0.4475682529570901, + "learning_rate": 5.424717791025302e-05, + "loss": 0.7474, + "step": 1241 + }, + { + "epoch": 0.6624, + "grad_norm": 0.46825839397048047, + "learning_rate": 5.4093589049662175e-05, + "loss": 0.576, + "step": 1242 + }, + { + "epoch": 0.6629333333333334, + "grad_norm": 0.47736062364675896, + "learning_rate": 5.394013727258254e-05, + "loss": 0.7407, + "step": 1243 + }, + { + "epoch": 0.6634666666666666, + "grad_norm": 0.41386791103089915, + "learning_rate": 5.378682303724435e-05, + "loss": 0.6498, + "step": 1244 + }, + { + "epoch": 0.664, + "grad_norm": 0.4115071829091007, + "learning_rate": 5.363364680146725e-05, + "loss": 0.6727, + "step": 1245 + }, + { + "epoch": 0.6645333333333333, + "grad_norm": 0.38598858770760563, + "learning_rate": 5.348060902265871e-05, + "loss": 0.6473, + "step": 1246 + }, + { + "epoch": 0.6650666666666667, + "grad_norm": 0.4288611226500625, + "learning_rate": 5.332771015781275e-05, + "loss": 0.6626, + "step": 1247 + }, + { + "epoch": 0.6656, + "grad_norm": 0.39767727632829636, + "learning_rate": 5.31749506635086e-05, + "loss": 0.6833, + "step": 1248 + }, + { + "epoch": 0.6661333333333334, + "grad_norm": 0.5055050883992983, + "learning_rate": 5.302233099590928e-05, + "loss": 0.6983, + "step": 1249 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.5112414029165399, + "learning_rate": 5.286985161076029e-05, + "loss": 0.6303, + "step": 1250 + }, + { + "epoch": 0.6672, + "grad_norm": 0.42466226932803275, + "learning_rate": 5.271751296338823e-05, + "loss": 0.6509, + "step": 1251 + }, + { + "epoch": 0.6677333333333333, + "grad_norm": 0.43147014942641604, + "learning_rate": 5.2565315508699376e-05, + "loss": 0.6773, + "step": 1252 + }, + { + "epoch": 0.6682666666666667, + "grad_norm": 0.44918413339986263, + "learning_rate": 5.2413259701178505e-05, + "loss": 0.6996, + "step": 1253 + }, + { + "epoch": 0.6688, + "grad_norm": 0.40487296854779825, + "learning_rate": 5.226134599488728e-05, + "loss": 0.6872, + "step": 1254 + }, + { + "epoch": 0.6693333333333333, + "grad_norm": 0.4715949353339733, + "learning_rate": 5.210957484346314e-05, + "loss": 0.6799, + "step": 1255 + }, + { + "epoch": 0.6698666666666667, + "grad_norm": 0.6058533316508485, + "learning_rate": 5.195794670011776e-05, + "loss": 0.7764, + "step": 1256 + }, + { + "epoch": 0.6704, + "grad_norm": 0.42200509928999985, + "learning_rate": 5.180646201763577e-05, + "loss": 0.7224, + "step": 1257 + }, + { + "epoch": 0.6709333333333334, + "grad_norm": 0.44729875600539715, + "learning_rate": 5.165512124837344e-05, + "loss": 0.6985, + "step": 1258 + }, + { + "epoch": 0.6714666666666667, + "grad_norm": 0.6017859491906349, + "learning_rate": 5.150392484425728e-05, + "loss": 0.6907, + "step": 1259 + }, + { + "epoch": 0.672, + "grad_norm": 0.5070518017224357, + "learning_rate": 5.135287325678271e-05, + "loss": 0.7038, + "step": 1260 + }, + { + "epoch": 0.6725333333333333, + "grad_norm": 0.49813536052165636, + "learning_rate": 5.120196693701267e-05, + "loss": 0.7194, + "step": 1261 + }, + { + "epoch": 0.6730666666666667, + "grad_norm": 0.4184170317362622, + "learning_rate": 5.105120633557634e-05, + "loss": 0.6385, + "step": 1262 + }, + { + "epoch": 0.6736, + "grad_norm": 0.5010532458438172, + "learning_rate": 5.090059190266779e-05, + "loss": 0.7317, + "step": 1263 + }, + { + "epoch": 0.6741333333333334, + "grad_norm": 0.5005339987888526, + "learning_rate": 5.075012408804458e-05, + "loss": 0.7765, + "step": 1264 + }, + { + "epoch": 0.6746666666666666, + "grad_norm": 0.5213241438385137, + "learning_rate": 5.059980334102637e-05, + "loss": 0.669, + "step": 1265 + }, + { + "epoch": 0.6752, + "grad_norm": 0.45957806458841644, + "learning_rate": 5.0449630110493836e-05, + "loss": 0.7257, + "step": 1266 + }, + { + "epoch": 0.6757333333333333, + "grad_norm": 0.4441361698356745, + "learning_rate": 5.0299604844886985e-05, + "loss": 0.6101, + "step": 1267 + }, + { + "epoch": 0.6762666666666667, + "grad_norm": 0.4429998799364062, + "learning_rate": 5.014972799220403e-05, + "loss": 0.7337, + "step": 1268 + }, + { + "epoch": 0.6768, + "grad_norm": 0.40723007461235616, + "learning_rate": 5.000000000000002e-05, + "loss": 0.6965, + "step": 1269 + }, + { + "epoch": 0.6773333333333333, + "grad_norm": 0.41413620831446507, + "learning_rate": 4.985042131538545e-05, + "loss": 0.7109, + "step": 1270 + }, + { + "epoch": 0.6778666666666666, + "grad_norm": 0.436997370362431, + "learning_rate": 4.9700992385024934e-05, + "loss": 0.7481, + "step": 1271 + }, + { + "epoch": 0.6784, + "grad_norm": 0.4192542523884157, + "learning_rate": 4.955171365513603e-05, + "loss": 0.7023, + "step": 1272 + }, + { + "epoch": 0.6789333333333334, + "grad_norm": 0.3684514145841737, + "learning_rate": 4.940258557148765e-05, + "loss": 0.6542, + "step": 1273 + }, + { + "epoch": 0.6794666666666667, + "grad_norm": 0.4078503684704817, + "learning_rate": 4.9253608579398855e-05, + "loss": 0.6315, + "step": 1274 + }, + { + "epoch": 0.68, + "grad_norm": 0.4654049362770654, + "learning_rate": 4.9104783123737566e-05, + "loss": 0.7958, + "step": 1275 + }, + { + "epoch": 0.6805333333333333, + "grad_norm": 0.47020760443462817, + "learning_rate": 4.895610964891923e-05, + "loss": 0.7046, + "step": 1276 + }, + { + "epoch": 0.6810666666666667, + "grad_norm": 0.3660858077712867, + "learning_rate": 4.880758859890536e-05, + "loss": 0.6759, + "step": 1277 + }, + { + "epoch": 0.6816, + "grad_norm": 0.41944031281785055, + "learning_rate": 4.865922041720239e-05, + "loss": 0.6306, + "step": 1278 + }, + { + "epoch": 0.6821333333333334, + "grad_norm": 0.44192112888266855, + "learning_rate": 4.851100554686021e-05, + "loss": 0.7037, + "step": 1279 + }, + { + "epoch": 0.6826666666666666, + "grad_norm": 0.39563893570133474, + "learning_rate": 4.836294443047088e-05, + "loss": 0.6927, + "step": 1280 + }, + { + "epoch": 0.6832, + "grad_norm": 0.4527215345915951, + "learning_rate": 4.821503751016746e-05, + "loss": 0.7066, + "step": 1281 + }, + { + "epoch": 0.6837333333333333, + "grad_norm": 0.528383811512297, + "learning_rate": 4.8067285227622404e-05, + "loss": 0.7627, + "step": 1282 + }, + { + "epoch": 0.6842666666666667, + "grad_norm": 0.4226464527745641, + "learning_rate": 4.791968802404648e-05, + "loss": 0.6755, + "step": 1283 + }, + { + "epoch": 0.6848, + "grad_norm": 0.4474780169862844, + "learning_rate": 4.777224634018732e-05, + "loss": 0.7362, + "step": 1284 + }, + { + "epoch": 0.6853333333333333, + "grad_norm": 0.4972514215365334, + "learning_rate": 4.762496061632814e-05, + "loss": 0.694, + "step": 1285 + }, + { + "epoch": 0.6858666666666666, + "grad_norm": 0.4945500820847561, + "learning_rate": 4.747783129228656e-05, + "loss": 0.6474, + "step": 1286 + }, + { + "epoch": 0.6864, + "grad_norm": 0.41877512278041723, + "learning_rate": 4.733085880741301e-05, + "loss": 0.5882, + "step": 1287 + }, + { + "epoch": 0.6869333333333333, + "grad_norm": 0.483327908182511, + "learning_rate": 4.718404360058966e-05, + "loss": 0.6754, + "step": 1288 + }, + { + "epoch": 0.6874666666666667, + "grad_norm": 0.4604880955395153, + "learning_rate": 4.7037386110228985e-05, + "loss": 0.6747, + "step": 1289 + }, + { + "epoch": 0.688, + "grad_norm": 0.45350310620758705, + "learning_rate": 4.689088677427249e-05, + "loss": 0.6913, + "step": 1290 + }, + { + "epoch": 0.6885333333333333, + "grad_norm": 0.3920930895298731, + "learning_rate": 4.6744546030189486e-05, + "loss": 0.655, + "step": 1291 + }, + { + "epoch": 0.6890666666666667, + "grad_norm": 0.37192017843441927, + "learning_rate": 4.659836431497563e-05, + "loss": 0.6205, + "step": 1292 + }, + { + "epoch": 0.6896, + "grad_norm": 0.37001840877376735, + "learning_rate": 4.645234206515171e-05, + "loss": 0.6218, + "step": 1293 + }, + { + "epoch": 0.6901333333333334, + "grad_norm": 0.4198155173809062, + "learning_rate": 4.630647971676232e-05, + "loss": 0.6269, + "step": 1294 + }, + { + "epoch": 0.6906666666666667, + "grad_norm": 0.5257923572584187, + "learning_rate": 4.6160777705374524e-05, + "loss": 0.725, + "step": 1295 + }, + { + "epoch": 0.6912, + "grad_norm": 0.4231819929149378, + "learning_rate": 4.6015236466076747e-05, + "loss": 0.7147, + "step": 1296 + }, + { + "epoch": 0.6917333333333333, + "grad_norm": 0.37574676564116133, + "learning_rate": 4.586985643347717e-05, + "loss": 0.5911, + "step": 1297 + }, + { + "epoch": 0.6922666666666667, + "grad_norm": 0.4125533536902336, + "learning_rate": 4.572463804170263e-05, + "loss": 0.623, + "step": 1298 + }, + { + "epoch": 0.6928, + "grad_norm": 0.41763382077089345, + "learning_rate": 4.5579581724397255e-05, + "loss": 0.6186, + "step": 1299 + }, + { + "epoch": 0.6933333333333334, + "grad_norm": 0.47045704844553404, + "learning_rate": 4.543468791472131e-05, + "loss": 0.6692, + "step": 1300 + }, + { + "epoch": 0.6938666666666666, + "grad_norm": 0.43950313219525144, + "learning_rate": 4.5289957045349653e-05, + "loss": 0.6338, + "step": 1301 + }, + { + "epoch": 0.6944, + "grad_norm": 0.44850994906610586, + "learning_rate": 4.514538954847064e-05, + "loss": 0.7587, + "step": 1302 + }, + { + "epoch": 0.6949333333333333, + "grad_norm": 0.40076973487863327, + "learning_rate": 4.5000985855784746e-05, + "loss": 0.6528, + "step": 1303 + }, + { + "epoch": 0.6954666666666667, + "grad_norm": 0.4763486099000397, + "learning_rate": 4.485674639850333e-05, + "loss": 0.6593, + "step": 1304 + }, + { + "epoch": 0.696, + "grad_norm": 0.42317066038164286, + "learning_rate": 4.471267160734731e-05, + "loss": 0.6815, + "step": 1305 + }, + { + "epoch": 0.6965333333333333, + "grad_norm": 0.5334792578675287, + "learning_rate": 4.456876191254582e-05, + "loss": 0.7538, + "step": 1306 + }, + { + "epoch": 0.6970666666666666, + "grad_norm": 0.4563517816630481, + "learning_rate": 4.442501774383515e-05, + "loss": 0.7655, + "step": 1307 + }, + { + "epoch": 0.6976, + "grad_norm": 0.457088132652322, + "learning_rate": 4.428143953045717e-05, + "loss": 0.764, + "step": 1308 + }, + { + "epoch": 0.6981333333333334, + "grad_norm": 0.45315406826456367, + "learning_rate": 4.413802770115816e-05, + "loss": 0.7333, + "step": 1309 + }, + { + "epoch": 0.6986666666666667, + "grad_norm": 0.4785175570304063, + "learning_rate": 4.399478268418771e-05, + "loss": 0.6752, + "step": 1310 + }, + { + "epoch": 0.6992, + "grad_norm": 0.5193290540475145, + "learning_rate": 4.385170490729712e-05, + "loss": 0.6766, + "step": 1311 + }, + { + "epoch": 0.6997333333333333, + "grad_norm": 0.39166249453443075, + "learning_rate": 4.3708794797738375e-05, + "loss": 0.6641, + "step": 1312 + }, + { + "epoch": 0.7002666666666667, + "grad_norm": 0.4444251457161952, + "learning_rate": 4.3566052782262735e-05, + "loss": 0.6808, + "step": 1313 + }, + { + "epoch": 0.7008, + "grad_norm": 0.4525915252002046, + "learning_rate": 4.342347928711953e-05, + "loss": 0.6637, + "step": 1314 + }, + { + "epoch": 0.7013333333333334, + "grad_norm": 0.4445375117875077, + "learning_rate": 4.328107473805487e-05, + "loss": 0.7181, + "step": 1315 + }, + { + "epoch": 0.7018666666666666, + "grad_norm": 0.4412186282350355, + "learning_rate": 4.3138839560310303e-05, + "loss": 0.6649, + "step": 1316 + }, + { + "epoch": 0.7024, + "grad_norm": 0.4272874352081193, + "learning_rate": 4.2996774178621736e-05, + "loss": 0.6575, + "step": 1317 + }, + { + "epoch": 0.7029333333333333, + "grad_norm": 0.4992767279616866, + "learning_rate": 4.2854879017217894e-05, + "loss": 0.7525, + "step": 1318 + }, + { + "epoch": 0.7034666666666667, + "grad_norm": 0.48090114753757046, + "learning_rate": 4.271315449981934e-05, + "loss": 0.7477, + "step": 1319 + }, + { + "epoch": 0.704, + "grad_norm": 0.40733017554217016, + "learning_rate": 4.257160104963696e-05, + "loss": 0.6759, + "step": 1320 + }, + { + "epoch": 0.7045333333333333, + "grad_norm": 0.4498923712920087, + "learning_rate": 4.2430219089370823e-05, + "loss": 0.6781, + "step": 1321 + }, + { + "epoch": 0.7050666666666666, + "grad_norm": 0.3990536280518104, + "learning_rate": 4.228900904120895e-05, + "loss": 0.6663, + "step": 1322 + }, + { + "epoch": 0.7056, + "grad_norm": 0.4970635315232516, + "learning_rate": 4.2147971326825966e-05, + "loss": 0.7522, + "step": 1323 + }, + { + "epoch": 0.7061333333333333, + "grad_norm": 0.4744644664117155, + "learning_rate": 4.200710636738189e-05, + "loss": 0.765, + "step": 1324 + }, + { + "epoch": 0.7066666666666667, + "grad_norm": 0.3346482618704202, + "learning_rate": 4.1866414583520877e-05, + "loss": 0.5854, + "step": 1325 + }, + { + "epoch": 0.7072, + "grad_norm": 0.41626631799201624, + "learning_rate": 4.172589639536991e-05, + "loss": 0.6726, + "step": 1326 + }, + { + "epoch": 0.7077333333333333, + "grad_norm": 0.3640006285780399, + "learning_rate": 4.158555222253771e-05, + "loss": 0.6425, + "step": 1327 + }, + { + "epoch": 0.7082666666666667, + "grad_norm": 0.45944766881500854, + "learning_rate": 4.14453824841132e-05, + "loss": 0.7496, + "step": 1328 + }, + { + "epoch": 0.7088, + "grad_norm": 0.5295144756125102, + "learning_rate": 4.130538759866457e-05, + "loss": 0.633, + "step": 1329 + }, + { + "epoch": 0.7093333333333334, + "grad_norm": 0.47970524929365166, + "learning_rate": 4.1165567984237764e-05, + "loss": 0.6483, + "step": 1330 + }, + { + "epoch": 0.7098666666666666, + "grad_norm": 0.5288382109707465, + "learning_rate": 4.102592405835536e-05, + "loss": 0.6519, + "step": 1331 + }, + { + "epoch": 0.7104, + "grad_norm": 0.4273640335221985, + "learning_rate": 4.088645623801534e-05, + "loss": 0.7136, + "step": 1332 + }, + { + "epoch": 0.7109333333333333, + "grad_norm": 0.3698460538893361, + "learning_rate": 4.074716493968975e-05, + "loss": 0.6415, + "step": 1333 + }, + { + "epoch": 0.7114666666666667, + "grad_norm": 0.4961002979751885, + "learning_rate": 4.060805057932359e-05, + "loss": 0.7749, + "step": 1334 + }, + { + "epoch": 0.712, + "grad_norm": 0.3992713891696009, + "learning_rate": 4.046911357233343e-05, + "loss": 0.6803, + "step": 1335 + }, + { + "epoch": 0.7125333333333334, + "grad_norm": 0.35674124778456545, + "learning_rate": 4.0330354333606234e-05, + "loss": 0.607, + "step": 1336 + }, + { + "epoch": 0.7130666666666666, + "grad_norm": 0.3969586298474553, + "learning_rate": 4.019177327749822e-05, + "loss": 0.6145, + "step": 1337 + }, + { + "epoch": 0.7136, + "grad_norm": 0.4396329628072576, + "learning_rate": 4.00533708178334e-05, + "loss": 0.74, + "step": 1338 + }, + { + "epoch": 0.7141333333333333, + "grad_norm": 0.5244884710162703, + "learning_rate": 3.991514736790258e-05, + "loss": 0.6471, + "step": 1339 + }, + { + "epoch": 0.7146666666666667, + "grad_norm": 0.4569481617674653, + "learning_rate": 3.977710334046193e-05, + "loss": 0.7491, + "step": 1340 + }, + { + "epoch": 0.7152, + "grad_norm": 0.4407543083560062, + "learning_rate": 3.963923914773187e-05, + "loss": 0.6717, + "step": 1341 + }, + { + "epoch": 0.7157333333333333, + "grad_norm": 0.3996784940180213, + "learning_rate": 3.950155520139581e-05, + "loss": 0.6755, + "step": 1342 + }, + { + "epoch": 0.7162666666666667, + "grad_norm": 0.4514537088669984, + "learning_rate": 3.936405191259891e-05, + "loss": 0.6173, + "step": 1343 + }, + { + "epoch": 0.7168, + "grad_norm": 0.3664007104725477, + "learning_rate": 3.922672969194686e-05, + "loss": 0.6495, + "step": 1344 + }, + { + "epoch": 0.7173333333333334, + "grad_norm": 0.4845923303179934, + "learning_rate": 3.9089588949504655e-05, + "loss": 0.5729, + "step": 1345 + }, + { + "epoch": 0.7178666666666667, + "grad_norm": 0.5131027825325541, + "learning_rate": 3.895263009479534e-05, + "loss": 0.7162, + "step": 1346 + }, + { + "epoch": 0.7184, + "grad_norm": 0.4870252273445952, + "learning_rate": 3.8815853536798904e-05, + "loss": 0.6333, + "step": 1347 + }, + { + "epoch": 0.7189333333333333, + "grad_norm": 0.4382846518264229, + "learning_rate": 3.867925968395085e-05, + "loss": 0.6344, + "step": 1348 + }, + { + "epoch": 0.7194666666666667, + "grad_norm": 0.45462423014598224, + "learning_rate": 3.854284894414122e-05, + "loss": 0.7179, + "step": 1349 + }, + { + "epoch": 0.72, + "grad_norm": 0.38601592208424074, + "learning_rate": 3.840662172471315e-05, + "loss": 0.6247, + "step": 1350 + }, + { + "epoch": 0.7205333333333334, + "grad_norm": 0.5588254869023244, + "learning_rate": 3.82705784324618e-05, + "loss": 0.6824, + "step": 1351 + }, + { + "epoch": 0.7210666666666666, + "grad_norm": 0.38423012071444157, + "learning_rate": 3.8134719473633094e-05, + "loss": 0.6414, + "step": 1352 + }, + { + "epoch": 0.7216, + "grad_norm": 0.36722452410125017, + "learning_rate": 3.79990452539225e-05, + "loss": 0.5833, + "step": 1353 + }, + { + "epoch": 0.7221333333333333, + "grad_norm": 0.4327130367411745, + "learning_rate": 3.786355617847385e-05, + "loss": 0.7005, + "step": 1354 + }, + { + "epoch": 0.7226666666666667, + "grad_norm": 0.4903701302239034, + "learning_rate": 3.772825265187802e-05, + "loss": 0.6898, + "step": 1355 + }, + { + "epoch": 0.7232, + "grad_norm": 0.40168516926848963, + "learning_rate": 3.759313507817196e-05, + "loss": 0.6793, + "step": 1356 + }, + { + "epoch": 0.7237333333333333, + "grad_norm": 0.44376164501594506, + "learning_rate": 3.7458203860837234e-05, + "loss": 0.6857, + "step": 1357 + }, + { + "epoch": 0.7242666666666666, + "grad_norm": 0.3587503335507288, + "learning_rate": 3.732345940279893e-05, + "loss": 0.5688, + "step": 1358 + }, + { + "epoch": 0.7248, + "grad_norm": 0.4934285243182926, + "learning_rate": 3.7188902106424416e-05, + "loss": 0.7796, + "step": 1359 + }, + { + "epoch": 0.7253333333333334, + "grad_norm": 0.4404991951999115, + "learning_rate": 3.705453237352227e-05, + "loss": 0.652, + "step": 1360 + }, + { + "epoch": 0.7258666666666667, + "grad_norm": 0.3647155002598664, + "learning_rate": 3.692035060534088e-05, + "loss": 0.6375, + "step": 1361 + }, + { + "epoch": 0.7264, + "grad_norm": 0.49974533015504735, + "learning_rate": 3.678635720256737e-05, + "loss": 0.6908, + "step": 1362 + }, + { + "epoch": 0.7269333333333333, + "grad_norm": 0.39018185328648913, + "learning_rate": 3.665255256532638e-05, + "loss": 0.6532, + "step": 1363 + }, + { + "epoch": 0.7274666666666667, + "grad_norm": 0.37228594811392435, + "learning_rate": 3.651893709317887e-05, + "loss": 0.6379, + "step": 1364 + }, + { + "epoch": 0.728, + "grad_norm": 0.3971951160318089, + "learning_rate": 3.638551118512089e-05, + "loss": 0.6252, + "step": 1365 + }, + { + "epoch": 0.7285333333333334, + "grad_norm": 0.3599070866386319, + "learning_rate": 3.625227523958252e-05, + "loss": 0.5969, + "step": 1366 + }, + { + "epoch": 0.7290666666666666, + "grad_norm": 0.4379513340178431, + "learning_rate": 3.611922965442648e-05, + "loss": 0.6774, + "step": 1367 + }, + { + "epoch": 0.7296, + "grad_norm": 0.3998705472017563, + "learning_rate": 3.5986374826947066e-05, + "loss": 0.6405, + "step": 1368 + }, + { + "epoch": 0.7301333333333333, + "grad_norm": 0.41760982475547026, + "learning_rate": 3.5853711153868965e-05, + "loss": 0.6829, + "step": 1369 + }, + { + "epoch": 0.7306666666666667, + "grad_norm": 0.4366260280255711, + "learning_rate": 3.5721239031346066e-05, + "loss": 0.6926, + "step": 1370 + }, + { + "epoch": 0.7312, + "grad_norm": 0.4106177310144535, + "learning_rate": 3.558895885496023e-05, + "loss": 0.7056, + "step": 1371 + }, + { + "epoch": 0.7317333333333333, + "grad_norm": 0.3921999038233143, + "learning_rate": 3.545687101972013e-05, + "loss": 0.6919, + "step": 1372 + }, + { + "epoch": 0.7322666666666666, + "grad_norm": 0.41674680016384474, + "learning_rate": 3.53249759200601e-05, + "loss": 0.647, + "step": 1373 + }, + { + "epoch": 0.7328, + "grad_norm": 0.4119048309083103, + "learning_rate": 3.519327394983888e-05, + "loss": 0.6498, + "step": 1374 + }, + { + "epoch": 0.7333333333333333, + "grad_norm": 0.40572103472794274, + "learning_rate": 3.506176550233863e-05, + "loss": 0.7052, + "step": 1375 + }, + { + "epoch": 0.7338666666666667, + "grad_norm": 0.39057808572159125, + "learning_rate": 3.4930450970263485e-05, + "loss": 0.6491, + "step": 1376 + }, + { + "epoch": 0.7344, + "grad_norm": 0.491566833277267, + "learning_rate": 3.479933074573858e-05, + "loss": 0.6583, + "step": 1377 + }, + { + "epoch": 0.7349333333333333, + "grad_norm": 0.4762257430976011, + "learning_rate": 3.46684052203088e-05, + "loss": 0.6663, + "step": 1378 + }, + { + "epoch": 0.7354666666666667, + "grad_norm": 0.5445065980715168, + "learning_rate": 3.4537674784937614e-05, + "loss": 0.7097, + "step": 1379 + }, + { + "epoch": 0.736, + "grad_norm": 0.4543710104572786, + "learning_rate": 3.440713983000601e-05, + "loss": 0.689, + "step": 1380 + }, + { + "epoch": 0.7365333333333334, + "grad_norm": 0.407829920159229, + "learning_rate": 3.427680074531113e-05, + "loss": 0.6778, + "step": 1381 + }, + { + "epoch": 0.7370666666666666, + "grad_norm": 0.41732549426387294, + "learning_rate": 3.4146657920065285e-05, + "loss": 0.61, + "step": 1382 + }, + { + "epoch": 0.7376, + "grad_norm": 0.5353744556728132, + "learning_rate": 3.401671174289469e-05, + "loss": 0.7306, + "step": 1383 + }, + { + "epoch": 0.7381333333333333, + "grad_norm": 0.4175265894701234, + "learning_rate": 3.388696260183832e-05, + "loss": 0.6786, + "step": 1384 + }, + { + "epoch": 0.7386666666666667, + "grad_norm": 0.43922642748880425, + "learning_rate": 3.3757410884346894e-05, + "loss": 0.737, + "step": 1385 + }, + { + "epoch": 0.7392, + "grad_norm": 0.4591942338879461, + "learning_rate": 3.362805697728145e-05, + "loss": 0.6677, + "step": 1386 + }, + { + "epoch": 0.7397333333333334, + "grad_norm": 0.42002570865671857, + "learning_rate": 3.3498901266912396e-05, + "loss": 0.6193, + "step": 1387 + }, + { + "epoch": 0.7402666666666666, + "grad_norm": 0.4538601995375422, + "learning_rate": 3.336994413891828e-05, + "loss": 0.7202, + "step": 1388 + }, + { + "epoch": 0.7408, + "grad_norm": 0.3831032040092475, + "learning_rate": 3.324118597838464e-05, + "loss": 0.601, + "step": 1389 + }, + { + "epoch": 0.7413333333333333, + "grad_norm": 0.43483694718557675, + "learning_rate": 3.3112627169802946e-05, + "loss": 0.6504, + "step": 1390 + }, + { + "epoch": 0.7418666666666667, + "grad_norm": 0.433441180853285, + "learning_rate": 3.298426809706928e-05, + "loss": 0.6137, + "step": 1391 + }, + { + "epoch": 0.7424, + "grad_norm": 0.4574223889763118, + "learning_rate": 3.285610914348332e-05, + "loss": 0.6979, + "step": 1392 + }, + { + "epoch": 0.7429333333333333, + "grad_norm": 0.576370653686149, + "learning_rate": 3.2728150691747115e-05, + "loss": 0.7514, + "step": 1393 + }, + { + "epoch": 0.7434666666666667, + "grad_norm": 0.5090825579316304, + "learning_rate": 3.2600393123964113e-05, + "loss": 0.6658, + "step": 1394 + }, + { + "epoch": 0.744, + "grad_norm": 0.4570716372493963, + "learning_rate": 3.2472836821637744e-05, + "loss": 0.7022, + "step": 1395 + }, + { + "epoch": 0.7445333333333334, + "grad_norm": 0.4243893702053848, + "learning_rate": 3.234548216567049e-05, + "loss": 0.721, + "step": 1396 + }, + { + "epoch": 0.7450666666666667, + "grad_norm": 0.40463911930908225, + "learning_rate": 3.2218329536362704e-05, + "loss": 0.6872, + "step": 1397 + }, + { + "epoch": 0.7456, + "grad_norm": 0.4691413149126142, + "learning_rate": 3.209137931341143e-05, + "loss": 0.568, + "step": 1398 + }, + { + "epoch": 0.7461333333333333, + "grad_norm": 0.4410995107571646, + "learning_rate": 3.196463187590929e-05, + "loss": 0.6605, + "step": 1399 + }, + { + "epoch": 0.7466666666666667, + "grad_norm": 0.45395735012404703, + "learning_rate": 3.1838087602343344e-05, + "loss": 0.678, + "step": 1400 + }, + { + "epoch": 0.7472, + "grad_norm": 0.4455123253836739, + "learning_rate": 3.1711746870594086e-05, + "loss": 0.6884, + "step": 1401 + }, + { + "epoch": 0.7477333333333334, + "grad_norm": 0.466476454337529, + "learning_rate": 3.158561005793402e-05, + "loss": 0.7379, + "step": 1402 + }, + { + "epoch": 0.7482666666666666, + "grad_norm": 0.45254947510936605, + "learning_rate": 3.145967754102691e-05, + "loss": 0.6483, + "step": 1403 + }, + { + "epoch": 0.7488, + "grad_norm": 0.5105718886675502, + "learning_rate": 3.1333949695926324e-05, + "loss": 0.6524, + "step": 1404 + }, + { + "epoch": 0.7493333333333333, + "grad_norm": 0.4153774316188661, + "learning_rate": 3.120842689807468e-05, + "loss": 0.6585, + "step": 1405 + }, + { + "epoch": 0.7498666666666667, + "grad_norm": 0.6287557308304093, + "learning_rate": 3.108310952230212e-05, + "loss": 0.644, + "step": 1406 + }, + { + "epoch": 0.7504, + "grad_norm": 0.41060708920000044, + "learning_rate": 3.0957997942825336e-05, + "loss": 0.6393, + "step": 1407 + }, + { + "epoch": 0.7509333333333333, + "grad_norm": 0.4584936591156118, + "learning_rate": 3.083309253324651e-05, + "loss": 0.6834, + "step": 1408 + }, + { + "epoch": 0.7514666666666666, + "grad_norm": 0.42760298449155615, + "learning_rate": 3.070839366655215e-05, + "loss": 0.6391, + "step": 1409 + }, + { + "epoch": 0.752, + "grad_norm": 0.40847995117532254, + "learning_rate": 3.058390171511196e-05, + "loss": 0.6677, + "step": 1410 + }, + { + "epoch": 0.7525333333333334, + "grad_norm": 0.38598233718275865, + "learning_rate": 3.0459617050677868e-05, + "loss": 0.612, + "step": 1411 + }, + { + "epoch": 0.7530666666666667, + "grad_norm": 0.4154384657504431, + "learning_rate": 3.0335540044382694e-05, + "loss": 0.6217, + "step": 1412 + }, + { + "epoch": 0.7536, + "grad_norm": 0.6660104658498806, + "learning_rate": 3.021167106673928e-05, + "loss": 0.7194, + "step": 1413 + }, + { + "epoch": 0.7541333333333333, + "grad_norm": 0.4231194901746939, + "learning_rate": 3.008801048763914e-05, + "loss": 0.6385, + "step": 1414 + }, + { + "epoch": 0.7546666666666667, + "grad_norm": 0.4653297067314939, + "learning_rate": 2.996455867635155e-05, + "loss": 0.7533, + "step": 1415 + }, + { + "epoch": 0.7552, + "grad_norm": 0.45689436023906294, + "learning_rate": 2.9841316001522347e-05, + "loss": 0.7248, + "step": 1416 + }, + { + "epoch": 0.7557333333333334, + "grad_norm": 0.40256887514928125, + "learning_rate": 2.9718282831172883e-05, + "loss": 0.6613, + "step": 1417 + }, + { + "epoch": 0.7562666666666666, + "grad_norm": 0.6204123648495595, + "learning_rate": 2.9595459532698854e-05, + "loss": 0.7044, + "step": 1418 + }, + { + "epoch": 0.7568, + "grad_norm": 0.43213476883749513, + "learning_rate": 2.9472846472869298e-05, + "loss": 0.6409, + "step": 1419 + }, + { + "epoch": 0.7573333333333333, + "grad_norm": 0.4646837388697919, + "learning_rate": 2.9350444017825385e-05, + "loss": 0.6921, + "step": 1420 + }, + { + "epoch": 0.7578666666666667, + "grad_norm": 0.45353160693893724, + "learning_rate": 2.922825253307947e-05, + "loss": 0.6904, + "step": 1421 + }, + { + "epoch": 0.7584, + "grad_norm": 0.5023230394327901, + "learning_rate": 2.9106272383513835e-05, + "loss": 0.7396, + "step": 1422 + }, + { + "epoch": 0.7589333333333333, + "grad_norm": 0.3935264738094166, + "learning_rate": 2.898450393337977e-05, + "loss": 0.6477, + "step": 1423 + }, + { + "epoch": 0.7594666666666666, + "grad_norm": 0.48635306499044456, + "learning_rate": 2.8862947546296315e-05, + "loss": 0.6998, + "step": 1424 + }, + { + "epoch": 0.76, + "grad_norm": 0.4091423751364748, + "learning_rate": 2.874160358524931e-05, + "loss": 0.6265, + "step": 1425 + }, + { + "epoch": 0.7605333333333333, + "grad_norm": 0.40384469236774845, + "learning_rate": 2.8620472412590228e-05, + "loss": 0.6999, + "step": 1426 + }, + { + "epoch": 0.7610666666666667, + "grad_norm": 0.4120121582534377, + "learning_rate": 2.8499554390035143e-05, + "loss": 0.6909, + "step": 1427 + }, + { + "epoch": 0.7616, + "grad_norm": 0.39083906952401376, + "learning_rate": 2.8378849878663628e-05, + "loss": 0.6236, + "step": 1428 + }, + { + "epoch": 0.7621333333333333, + "grad_norm": 0.41218289972349514, + "learning_rate": 2.8258359238917665e-05, + "loss": 0.664, + "step": 1429 + }, + { + "epoch": 0.7626666666666667, + "grad_norm": 0.39384549030326926, + "learning_rate": 2.8138082830600554e-05, + "loss": 0.6389, + "step": 1430 + }, + { + "epoch": 0.7632, + "grad_norm": 0.4632752977068213, + "learning_rate": 2.8018021012875994e-05, + "loss": 0.6344, + "step": 1431 + }, + { + "epoch": 0.7637333333333334, + "grad_norm": 0.3773831474501387, + "learning_rate": 2.7898174144266732e-05, + "loss": 0.639, + "step": 1432 + }, + { + "epoch": 0.7642666666666666, + "grad_norm": 0.5038423659696732, + "learning_rate": 2.7778542582653744e-05, + "loss": 0.7311, + "step": 1433 + }, + { + "epoch": 0.7648, + "grad_norm": 0.44071340842899487, + "learning_rate": 2.7659126685275027e-05, + "loss": 0.6005, + "step": 1434 + }, + { + "epoch": 0.7653333333333333, + "grad_norm": 0.3992398623624981, + "learning_rate": 2.753992680872457e-05, + "loss": 0.6808, + "step": 1435 + }, + { + "epoch": 0.7658666666666667, + "grad_norm": 0.41390857698005834, + "learning_rate": 2.7420943308951284e-05, + "loss": 0.6412, + "step": 1436 + }, + { + "epoch": 0.7664, + "grad_norm": 0.5241813239086165, + "learning_rate": 2.7302176541257986e-05, + "loss": 0.6991, + "step": 1437 + }, + { + "epoch": 0.7669333333333334, + "grad_norm": 0.3994989775650419, + "learning_rate": 2.7183626860300247e-05, + "loss": 0.6192, + "step": 1438 + }, + { + "epoch": 0.7674666666666666, + "grad_norm": 0.4291846570848238, + "learning_rate": 2.7065294620085424e-05, + "loss": 0.6488, + "step": 1439 + }, + { + "epoch": 0.768, + "grad_norm": 0.41504568841541833, + "learning_rate": 2.6947180173971508e-05, + "loss": 0.6927, + "step": 1440 + }, + { + "epoch": 0.7685333333333333, + "grad_norm": 0.5175577081278326, + "learning_rate": 2.6829283874666233e-05, + "loss": 0.6336, + "step": 1441 + }, + { + "epoch": 0.7690666666666667, + "grad_norm": 0.4173679144944115, + "learning_rate": 2.6711606074225782e-05, + "loss": 0.676, + "step": 1442 + }, + { + "epoch": 0.7696, + "grad_norm": 0.560320983926784, + "learning_rate": 2.659414712405398e-05, + "loss": 0.6901, + "step": 1443 + }, + { + "epoch": 0.7701333333333333, + "grad_norm": 0.5554519744795914, + "learning_rate": 2.647690737490106e-05, + "loss": 0.7598, + "step": 1444 + }, + { + "epoch": 0.7706666666666667, + "grad_norm": 0.480224673602412, + "learning_rate": 2.6359887176862718e-05, + "loss": 0.632, + "step": 1445 + }, + { + "epoch": 0.7712, + "grad_norm": 0.38508153551994906, + "learning_rate": 2.6243086879379e-05, + "loss": 0.664, + "step": 1446 + }, + { + "epoch": 0.7717333333333334, + "grad_norm": 0.46055633682093294, + "learning_rate": 2.6126506831233344e-05, + "loss": 0.7333, + "step": 1447 + }, + { + "epoch": 0.7722666666666667, + "grad_norm": 0.3961097454084375, + "learning_rate": 2.6010147380551475e-05, + "loss": 0.6506, + "step": 1448 + }, + { + "epoch": 0.7728, + "grad_norm": 0.735253120855245, + "learning_rate": 2.5894008874800325e-05, + "loss": 0.7185, + "step": 1449 + }, + { + "epoch": 0.7733333333333333, + "grad_norm": 0.3489013592721229, + "learning_rate": 2.577809166078716e-05, + "loss": 0.6212, + "step": 1450 + }, + { + "epoch": 0.7738666666666667, + "grad_norm": 0.3889928529352225, + "learning_rate": 2.566239608465838e-05, + "loss": 0.6171, + "step": 1451 + }, + { + "epoch": 0.7744, + "grad_norm": 0.46397922957246035, + "learning_rate": 2.5546922491898495e-05, + "loss": 0.6655, + "step": 1452 + }, + { + "epoch": 0.7749333333333334, + "grad_norm": 0.39166678734884736, + "learning_rate": 2.543167122732918e-05, + "loss": 0.6863, + "step": 1453 + }, + { + "epoch": 0.7754666666666666, + "grad_norm": 0.5787530599310754, + "learning_rate": 2.5316642635108244e-05, + "loss": 0.7363, + "step": 1454 + }, + { + "epoch": 0.776, + "grad_norm": 0.5958819013877672, + "learning_rate": 2.5201837058728505e-05, + "loss": 0.6994, + "step": 1455 + }, + { + "epoch": 0.7765333333333333, + "grad_norm": 0.4201426615307166, + "learning_rate": 2.508725484101684e-05, + "loss": 0.5715, + "step": 1456 + }, + { + "epoch": 0.7770666666666667, + "grad_norm": 0.39623704747143423, + "learning_rate": 2.4972896324133144e-05, + "loss": 0.5991, + "step": 1457 + }, + { + "epoch": 0.7776, + "grad_norm": 0.4139662710085164, + "learning_rate": 2.485876184956928e-05, + "loss": 0.6299, + "step": 1458 + }, + { + "epoch": 0.7781333333333333, + "grad_norm": 0.33924102653209187, + "learning_rate": 2.4744851758148156e-05, + "loss": 0.5845, + "step": 1459 + }, + { + "epoch": 0.7786666666666666, + "grad_norm": 0.4007066797410344, + "learning_rate": 2.4631166390022574e-05, + "loss": 0.6402, + "step": 1460 + }, + { + "epoch": 0.7792, + "grad_norm": 0.434895471979439, + "learning_rate": 2.451770608467432e-05, + "loss": 0.6852, + "step": 1461 + }, + { + "epoch": 0.7797333333333333, + "grad_norm": 0.3921533618694153, + "learning_rate": 2.4404471180913058e-05, + "loss": 0.6419, + "step": 1462 + }, + { + "epoch": 0.7802666666666667, + "grad_norm": 0.41674674428686015, + "learning_rate": 2.429146201687538e-05, + "loss": 0.7331, + "step": 1463 + }, + { + "epoch": 0.7808, + "grad_norm": 0.4404240216773432, + "learning_rate": 2.417867893002387e-05, + "loss": 0.638, + "step": 1464 + }, + { + "epoch": 0.7813333333333333, + "grad_norm": 0.5100165492429559, + "learning_rate": 2.4066122257145894e-05, + "loss": 0.6815, + "step": 1465 + }, + { + "epoch": 0.7818666666666667, + "grad_norm": 0.45911938575850225, + "learning_rate": 2.3953792334352787e-05, + "loss": 0.6395, + "step": 1466 + }, + { + "epoch": 0.7824, + "grad_norm": 0.4188734839089894, + "learning_rate": 2.3841689497078746e-05, + "loss": 0.673, + "step": 1467 + }, + { + "epoch": 0.7829333333333334, + "grad_norm": 0.4194909092789948, + "learning_rate": 2.3729814080079816e-05, + "loss": 0.6896, + "step": 1468 + }, + { + "epoch": 0.7834666666666666, + "grad_norm": 0.3734397655475053, + "learning_rate": 2.361816641743303e-05, + "loss": 0.6286, + "step": 1469 + }, + { + "epoch": 0.784, + "grad_norm": 0.418794631207735, + "learning_rate": 2.3506746842535242e-05, + "loss": 0.7151, + "step": 1470 + }, + { + "epoch": 0.7845333333333333, + "grad_norm": 0.3668294531327107, + "learning_rate": 2.339555568810221e-05, + "loss": 0.6247, + "step": 1471 + }, + { + "epoch": 0.7850666666666667, + "grad_norm": 0.5056859106234943, + "learning_rate": 2.328459328616759e-05, + "loss": 0.6627, + "step": 1472 + }, + { + "epoch": 0.7856, + "grad_norm": 0.5031974197168696, + "learning_rate": 2.3173859968081944e-05, + "loss": 0.6123, + "step": 1473 + }, + { + "epoch": 0.7861333333333334, + "grad_norm": 0.36770625640776916, + "learning_rate": 2.306335606451181e-05, + "loss": 0.6253, + "step": 1474 + }, + { + "epoch": 0.7866666666666666, + "grad_norm": 0.3950678989681755, + "learning_rate": 2.295308190543859e-05, + "loss": 0.6347, + "step": 1475 + }, + { + "epoch": 0.7872, + "grad_norm": 0.424691665342637, + "learning_rate": 2.2843037820157675e-05, + "loss": 0.6422, + "step": 1476 + }, + { + "epoch": 0.7877333333333333, + "grad_norm": 0.3932538680943357, + "learning_rate": 2.2733224137277366e-05, + "loss": 0.6071, + "step": 1477 + }, + { + "epoch": 0.7882666666666667, + "grad_norm": 0.40241094021518636, + "learning_rate": 2.262364118471805e-05, + "loss": 0.6719, + "step": 1478 + }, + { + "epoch": 0.7888, + "grad_norm": 0.4361459984843694, + "learning_rate": 2.251428928971102e-05, + "loss": 0.6396, + "step": 1479 + }, + { + "epoch": 0.7893333333333333, + "grad_norm": 0.3982081342410596, + "learning_rate": 2.2405168778797646e-05, + "loss": 0.6533, + "step": 1480 + }, + { + "epoch": 0.7898666666666667, + "grad_norm": 0.3685938016957018, + "learning_rate": 2.2296279977828337e-05, + "loss": 0.6251, + "step": 1481 + }, + { + "epoch": 0.7904, + "grad_norm": 0.4343032209940338, + "learning_rate": 2.2187623211961562e-05, + "loss": 0.7168, + "step": 1482 + }, + { + "epoch": 0.7909333333333334, + "grad_norm": 0.33962441541583616, + "learning_rate": 2.2079198805662914e-05, + "loss": 0.6092, + "step": 1483 + }, + { + "epoch": 0.7914666666666667, + "grad_norm": 0.45358695497249857, + "learning_rate": 2.1971007082704164e-05, + "loss": 0.6314, + "step": 1484 + }, + { + "epoch": 0.792, + "grad_norm": 0.4147278661270302, + "learning_rate": 2.1863048366162208e-05, + "loss": 0.6418, + "step": 1485 + }, + { + "epoch": 0.7925333333333333, + "grad_norm": 0.43803549259230773, + "learning_rate": 2.1755322978418137e-05, + "loss": 0.6794, + "step": 1486 + }, + { + "epoch": 0.7930666666666667, + "grad_norm": 0.463212491061389, + "learning_rate": 2.1647831241156302e-05, + "loss": 0.6362, + "step": 1487 + }, + { + "epoch": 0.7936, + "grad_norm": 0.4166521400201831, + "learning_rate": 2.1540573475363402e-05, + "loss": 0.6285, + "step": 1488 + }, + { + "epoch": 0.7941333333333334, + "grad_norm": 0.46747358714893084, + "learning_rate": 2.1433550001327373e-05, + "loss": 0.6832, + "step": 1489 + }, + { + "epoch": 0.7946666666666666, + "grad_norm": 0.506818733087523, + "learning_rate": 2.1326761138636553e-05, + "loss": 0.6906, + "step": 1490 + }, + { + "epoch": 0.7952, + "grad_norm": 0.6232152586172739, + "learning_rate": 2.1220207206178688e-05, + "loss": 0.611, + "step": 1491 + }, + { + "epoch": 0.7957333333333333, + "grad_norm": 0.398009057094343, + "learning_rate": 2.111388852214001e-05, + "loss": 0.7032, + "step": 1492 + }, + { + "epoch": 0.7962666666666667, + "grad_norm": 0.4954796533712989, + "learning_rate": 2.1007805404004242e-05, + "loss": 0.7264, + "step": 1493 + }, + { + "epoch": 0.7968, + "grad_norm": 0.40108521022798854, + "learning_rate": 2.0901958168551638e-05, + "loss": 0.655, + "step": 1494 + }, + { + "epoch": 0.7973333333333333, + "grad_norm": 0.45598380321275594, + "learning_rate": 2.0796347131858186e-05, + "loss": 0.6988, + "step": 1495 + }, + { + "epoch": 0.7978666666666666, + "grad_norm": 0.3871915430991362, + "learning_rate": 2.069097260929439e-05, + "loss": 0.6438, + "step": 1496 + }, + { + "epoch": 0.7984, + "grad_norm": 0.42284413388889525, + "learning_rate": 2.058583491552465e-05, + "loss": 0.6793, + "step": 1497 + }, + { + "epoch": 0.7989333333333334, + "grad_norm": 0.38536687081551113, + "learning_rate": 2.048093436450603e-05, + "loss": 0.6552, + "step": 1498 + }, + { + "epoch": 0.7994666666666667, + "grad_norm": 0.49040526777930155, + "learning_rate": 2.0376271269487514e-05, + "loss": 0.7177, + "step": 1499 + }, + { + "epoch": 0.8, + "grad_norm": 0.372303282355137, + "learning_rate": 2.027184594300898e-05, + "loss": 0.6111, + "step": 1500 + }, + { + "epoch": 0.8005333333333333, + "grad_norm": 0.4734318146302882, + "learning_rate": 2.0167658696900317e-05, + "loss": 0.625, + "step": 1501 + }, + { + "epoch": 0.8010666666666667, + "grad_norm": 0.44864014345408976, + "learning_rate": 2.0063709842280432e-05, + "loss": 0.7238, + "step": 1502 + }, + { + "epoch": 0.8016, + "grad_norm": 0.4148463663422953, + "learning_rate": 1.995999968955641e-05, + "loss": 0.6828, + "step": 1503 + }, + { + "epoch": 0.8021333333333334, + "grad_norm": 0.4474148187744677, + "learning_rate": 1.985652854842247e-05, + "loss": 0.6902, + "step": 1504 + }, + { + "epoch": 0.8026666666666666, + "grad_norm": 0.5980958684596339, + "learning_rate": 1.9753296727859195e-05, + "loss": 0.6955, + "step": 1505 + }, + { + "epoch": 0.8032, + "grad_norm": 0.44142730805286245, + "learning_rate": 1.9650304536132426e-05, + "loss": 0.6299, + "step": 1506 + }, + { + "epoch": 0.8037333333333333, + "grad_norm": 0.4330012446563654, + "learning_rate": 1.9547552280792524e-05, + "loss": 0.5907, + "step": 1507 + }, + { + "epoch": 0.8042666666666667, + "grad_norm": 0.5569374098186959, + "learning_rate": 1.9445040268673298e-05, + "loss": 0.7162, + "step": 1508 + }, + { + "epoch": 0.8048, + "grad_norm": 0.4563590039156943, + "learning_rate": 1.9342768805891178e-05, + "loss": 0.6869, + "step": 1509 + }, + { + "epoch": 0.8053333333333333, + "grad_norm": 0.49580207639687396, + "learning_rate": 1.9240738197844278e-05, + "loss": 0.6722, + "step": 1510 + }, + { + "epoch": 0.8058666666666666, + "grad_norm": 0.4120313480188349, + "learning_rate": 1.9138948749211472e-05, + "loss": 0.7161, + "step": 1511 + }, + { + "epoch": 0.8064, + "grad_norm": 0.42589014999870267, + "learning_rate": 1.903740076395151e-05, + "loss": 0.6424, + "step": 1512 + }, + { + "epoch": 0.8069333333333333, + "grad_norm": 0.38193578525540006, + "learning_rate": 1.8936094545302095e-05, + "loss": 0.6497, + "step": 1513 + }, + { + "epoch": 0.8074666666666667, + "grad_norm": 0.6245364403023802, + "learning_rate": 1.883503039577894e-05, + "loss": 0.6005, + "step": 1514 + }, + { + "epoch": 0.808, + "grad_norm": 0.37241477418400815, + "learning_rate": 1.8734208617174988e-05, + "loss": 0.6059, + "step": 1515 + }, + { + "epoch": 0.8085333333333333, + "grad_norm": 0.39893356007131425, + "learning_rate": 1.8633629510559314e-05, + "loss": 0.6535, + "step": 1516 + }, + { + "epoch": 0.8090666666666667, + "grad_norm": 0.4889994395764013, + "learning_rate": 1.8533293376276472e-05, + "loss": 0.68, + "step": 1517 + }, + { + "epoch": 0.8096, + "grad_norm": 0.33046179141381127, + "learning_rate": 1.8433200513945337e-05, + "loss": 0.5902, + "step": 1518 + }, + { + "epoch": 0.8101333333333334, + "grad_norm": 0.5711800874080143, + "learning_rate": 1.8333351222458407e-05, + "loss": 0.7674, + "step": 1519 + }, + { + "epoch": 0.8106666666666666, + "grad_norm": 0.4549457289530646, + "learning_rate": 1.8233745799980817e-05, + "loss": 0.6665, + "step": 1520 + }, + { + "epoch": 0.8112, + "grad_norm": 0.5699317027188194, + "learning_rate": 1.8134384543949478e-05, + "loss": 0.7299, + "step": 1521 + }, + { + "epoch": 0.8117333333333333, + "grad_norm": 0.4020798210602477, + "learning_rate": 1.803526775107217e-05, + "loss": 0.6339, + "step": 1522 + }, + { + "epoch": 0.8122666666666667, + "grad_norm": 0.3995482232637711, + "learning_rate": 1.7936395717326704e-05, + "loss": 0.6528, + "step": 1523 + }, + { + "epoch": 0.8128, + "grad_norm": 0.44557820935371334, + "learning_rate": 1.783776873795994e-05, + "loss": 0.5653, + "step": 1524 + }, + { + "epoch": 0.8133333333333334, + "grad_norm": 0.4233355126969602, + "learning_rate": 1.773938710748706e-05, + "loss": 0.6653, + "step": 1525 + }, + { + "epoch": 0.8138666666666666, + "grad_norm": 0.4261010144722392, + "learning_rate": 1.7641251119690505e-05, + "loss": 0.6439, + "step": 1526 + }, + { + "epoch": 0.8144, + "grad_norm": 0.4386345128850192, + "learning_rate": 1.754336106761927e-05, + "loss": 0.6489, + "step": 1527 + }, + { + "epoch": 0.8149333333333333, + "grad_norm": 0.45087774496786187, + "learning_rate": 1.744571724358789e-05, + "loss": 0.6529, + "step": 1528 + }, + { + "epoch": 0.8154666666666667, + "grad_norm": 0.3975402796187728, + "learning_rate": 1.7348319939175637e-05, + "loss": 0.6548, + "step": 1529 + }, + { + "epoch": 0.816, + "grad_norm": 0.5480017096708946, + "learning_rate": 1.7251169445225657e-05, + "loss": 0.7525, + "step": 1530 + }, + { + "epoch": 0.8165333333333333, + "grad_norm": 0.3942083871662254, + "learning_rate": 1.715426605184407e-05, + "loss": 0.642, + "step": 1531 + }, + { + "epoch": 0.8170666666666667, + "grad_norm": 0.45959459072184644, + "learning_rate": 1.705761004839911e-05, + "loss": 0.7472, + "step": 1532 + }, + { + "epoch": 0.8176, + "grad_norm": 0.584546316076038, + "learning_rate": 1.696120172352025e-05, + "loss": 0.7557, + "step": 1533 + }, + { + "epoch": 0.8181333333333334, + "grad_norm": 0.41340745771988363, + "learning_rate": 1.6865041365097435e-05, + "loss": 0.6449, + "step": 1534 + }, + { + "epoch": 0.8186666666666667, + "grad_norm": 0.44490990245648465, + "learning_rate": 1.676912926028007e-05, + "loss": 0.6575, + "step": 1535 + }, + { + "epoch": 0.8192, + "grad_norm": 0.38421477120172864, + "learning_rate": 1.6673465695476232e-05, + "loss": 0.7134, + "step": 1536 + }, + { + "epoch": 0.8197333333333333, + "grad_norm": 0.5435579560675197, + "learning_rate": 1.6578050956351886e-05, + "loss": 0.6273, + "step": 1537 + }, + { + "epoch": 0.8202666666666667, + "grad_norm": 0.42269084778287574, + "learning_rate": 1.6482885327829913e-05, + "loss": 0.5812, + "step": 1538 + }, + { + "epoch": 0.8208, + "grad_norm": 0.44831986276577557, + "learning_rate": 1.6387969094089316e-05, + "loss": 0.7349, + "step": 1539 + }, + { + "epoch": 0.8213333333333334, + "grad_norm": 0.4834054201932014, + "learning_rate": 1.6293302538564382e-05, + "loss": 0.6461, + "step": 1540 + }, + { + "epoch": 0.8218666666666666, + "grad_norm": 0.3814636033243915, + "learning_rate": 1.619888594394382e-05, + "loss": 0.6315, + "step": 1541 + }, + { + "epoch": 0.8224, + "grad_norm": 0.4487163646690131, + "learning_rate": 1.6104719592169902e-05, + "loss": 0.6516, + "step": 1542 + }, + { + "epoch": 0.8229333333333333, + "grad_norm": 0.45954131188099634, + "learning_rate": 1.601080376443763e-05, + "loss": 0.6191, + "step": 1543 + }, + { + "epoch": 0.8234666666666667, + "grad_norm": 0.5934862372056542, + "learning_rate": 1.5917138741193973e-05, + "loss": 0.6278, + "step": 1544 + }, + { + "epoch": 0.824, + "grad_norm": 0.47139248381987847, + "learning_rate": 1.5823724802136865e-05, + "loss": 0.7213, + "step": 1545 + }, + { + "epoch": 0.8245333333333333, + "grad_norm": 0.4329217337090352, + "learning_rate": 1.573056222621453e-05, + "loss": 0.6571, + "step": 1546 + }, + { + "epoch": 0.8250666666666666, + "grad_norm": 0.40005387036558254, + "learning_rate": 1.5637651291624523e-05, + "loss": 0.6529, + "step": 1547 + }, + { + "epoch": 0.8256, + "grad_norm": 0.46448885085215247, + "learning_rate": 1.5544992275813053e-05, + "loss": 0.7176, + "step": 1548 + }, + { + "epoch": 0.8261333333333334, + "grad_norm": 0.5931599643544335, + "learning_rate": 1.5452585455473977e-05, + "loss": 0.7218, + "step": 1549 + }, + { + "epoch": 0.8266666666666667, + "grad_norm": 0.5804676849521008, + "learning_rate": 1.536043110654809e-05, + "loss": 0.6989, + "step": 1550 + }, + { + "epoch": 0.8272, + "grad_norm": 0.5437723742515328, + "learning_rate": 1.526852950422226e-05, + "loss": 0.6913, + "step": 1551 + }, + { + "epoch": 0.8277333333333333, + "grad_norm": 0.4471021431166154, + "learning_rate": 1.5176880922928616e-05, + "loss": 0.6592, + "step": 1552 + }, + { + "epoch": 0.8282666666666667, + "grad_norm": 0.40378540338053776, + "learning_rate": 1.5085485636343755e-05, + "loss": 0.6474, + "step": 1553 + }, + { + "epoch": 0.8288, + "grad_norm": 0.44201360523337074, + "learning_rate": 1.4994343917387854e-05, + "loss": 0.6388, + "step": 1554 + }, + { + "epoch": 0.8293333333333334, + "grad_norm": 0.46975363512733703, + "learning_rate": 1.4903456038223939e-05, + "loss": 0.6647, + "step": 1555 + }, + { + "epoch": 0.8298666666666666, + "grad_norm": 0.41694059202066175, + "learning_rate": 1.4812822270257009e-05, + "loss": 0.6641, + "step": 1556 + }, + { + "epoch": 0.8304, + "grad_norm": 0.43436459491647883, + "learning_rate": 1.4722442884133214e-05, + "loss": 0.7326, + "step": 1557 + }, + { + "epoch": 0.8309333333333333, + "grad_norm": 0.4914570957874253, + "learning_rate": 1.4632318149739177e-05, + "loss": 0.6595, + "step": 1558 + }, + { + "epoch": 0.8314666666666667, + "grad_norm": 0.443023839564708, + "learning_rate": 1.454244833620102e-05, + "loss": 0.6915, + "step": 1559 + }, + { + "epoch": 0.832, + "grad_norm": 0.37100018571953414, + "learning_rate": 1.4452833711883628e-05, + "loss": 0.6382, + "step": 1560 + }, + { + "epoch": 0.8325333333333333, + "grad_norm": 0.42875477380139815, + "learning_rate": 1.4363474544389877e-05, + "loss": 0.6479, + "step": 1561 + }, + { + "epoch": 0.8330666666666666, + "grad_norm": 0.4485483572251155, + "learning_rate": 1.4274371100559791e-05, + "loss": 0.65, + "step": 1562 + }, + { + "epoch": 0.8336, + "grad_norm": 0.4906779644680305, + "learning_rate": 1.4185523646469822e-05, + "loss": 0.6578, + "step": 1563 + }, + { + "epoch": 0.8341333333333333, + "grad_norm": 0.5061243094311983, + "learning_rate": 1.409693244743192e-05, + "loss": 0.73, + "step": 1564 + }, + { + "epoch": 0.8346666666666667, + "grad_norm": 0.41422267631820975, + "learning_rate": 1.4008597767992871e-05, + "loss": 0.6828, + "step": 1565 + }, + { + "epoch": 0.8352, + "grad_norm": 0.5606618613098131, + "learning_rate": 1.3920519871933424e-05, + "loss": 0.7368, + "step": 1566 + }, + { + "epoch": 0.8357333333333333, + "grad_norm": 0.3522095805748436, + "learning_rate": 1.3832699022267515e-05, + "loss": 0.5896, + "step": 1567 + }, + { + "epoch": 0.8362666666666667, + "grad_norm": 0.38823566997342707, + "learning_rate": 1.37451354812416e-05, + "loss": 0.608, + "step": 1568 + }, + { + "epoch": 0.8368, + "grad_norm": 0.4045178917126892, + "learning_rate": 1.3657829510333654e-05, + "loss": 0.6504, + "step": 1569 + }, + { + "epoch": 0.8373333333333334, + "grad_norm": 0.376864282481641, + "learning_rate": 1.3570781370252582e-05, + "loss": 0.6924, + "step": 1570 + }, + { + "epoch": 0.8378666666666666, + "grad_norm": 0.4513033853517228, + "learning_rate": 1.3483991320937306e-05, + "loss": 0.6463, + "step": 1571 + }, + { + "epoch": 0.8384, + "grad_norm": 0.39152204105908295, + "learning_rate": 1.339745962155613e-05, + "loss": 0.6315, + "step": 1572 + }, + { + "epoch": 0.8389333333333333, + "grad_norm": 0.41193491535153914, + "learning_rate": 1.3311186530505838e-05, + "loss": 0.6635, + "step": 1573 + }, + { + "epoch": 0.8394666666666667, + "grad_norm": 0.33939290424227786, + "learning_rate": 1.322517230541096e-05, + "loss": 0.5928, + "step": 1574 + }, + { + "epoch": 0.84, + "grad_norm": 0.502490061859469, + "learning_rate": 1.3139417203123027e-05, + "loss": 0.6716, + "step": 1575 + }, + { + "epoch": 0.8405333333333334, + "grad_norm": 0.37138805758821103, + "learning_rate": 1.30539214797198e-05, + "loss": 0.6555, + "step": 1576 + }, + { + "epoch": 0.8410666666666666, + "grad_norm": 0.345152734180387, + "learning_rate": 1.2968685390504465e-05, + "loss": 0.5893, + "step": 1577 + }, + { + "epoch": 0.8416, + "grad_norm": 0.36996512317730207, + "learning_rate": 1.2883709190004955e-05, + "loss": 0.6358, + "step": 1578 + }, + { + "epoch": 0.8421333333333333, + "grad_norm": 0.4232283639253023, + "learning_rate": 1.2798993131973091e-05, + "loss": 0.6177, + "step": 1579 + }, + { + "epoch": 0.8426666666666667, + "grad_norm": 0.45832069051361574, + "learning_rate": 1.2714537469383858e-05, + "loss": 0.6688, + "step": 1580 + }, + { + "epoch": 0.8432, + "grad_norm": 0.40990727691313, + "learning_rate": 1.263034245443473e-05, + "loss": 0.68, + "step": 1581 + }, + { + "epoch": 0.8437333333333333, + "grad_norm": 0.38082701571782146, + "learning_rate": 1.2546408338544769e-05, + "loss": 0.6278, + "step": 1582 + }, + { + "epoch": 0.8442666666666667, + "grad_norm": 0.40993562802627775, + "learning_rate": 1.2462735372353996e-05, + "loss": 0.633, + "step": 1583 + }, + { + "epoch": 0.8448, + "grad_norm": 0.4123608049941386, + "learning_rate": 1.2379323805722576e-05, + "loss": 0.6222, + "step": 1584 + }, + { + "epoch": 0.8453333333333334, + "grad_norm": 0.40104591477234164, + "learning_rate": 1.2296173887730123e-05, + "loss": 0.6481, + "step": 1585 + }, + { + "epoch": 0.8458666666666667, + "grad_norm": 0.4793488240086187, + "learning_rate": 1.2213285866674905e-05, + "loss": 0.7249, + "step": 1586 + }, + { + "epoch": 0.8464, + "grad_norm": 0.36096504113832656, + "learning_rate": 1.2130659990073146e-05, + "loss": 0.6014, + "step": 1587 + }, + { + "epoch": 0.8469333333333333, + "grad_norm": 0.47041779848564563, + "learning_rate": 1.2048296504658207e-05, + "loss": 0.6758, + "step": 1588 + }, + { + "epoch": 0.8474666666666667, + "grad_norm": 0.48596104980486965, + "learning_rate": 1.1966195656380031e-05, + "loss": 0.6595, + "step": 1589 + }, + { + "epoch": 0.848, + "grad_norm": 0.42683599920343207, + "learning_rate": 1.1884357690404158e-05, + "loss": 0.6439, + "step": 1590 + }, + { + "epoch": 0.8485333333333334, + "grad_norm": 0.42736175129702464, + "learning_rate": 1.1802782851111205e-05, + "loss": 0.6612, + "step": 1591 + }, + { + "epoch": 0.8490666666666666, + "grad_norm": 0.5777785499809717, + "learning_rate": 1.1721471382096027e-05, + "loss": 0.7469, + "step": 1592 + }, + { + "epoch": 0.8496, + "grad_norm": 0.39216045037865677, + "learning_rate": 1.1640423526166988e-05, + "loss": 0.6363, + "step": 1593 + }, + { + "epoch": 0.8501333333333333, + "grad_norm": 0.4561800936774624, + "learning_rate": 1.1559639525345311e-05, + "loss": 0.7438, + "step": 1594 + }, + { + "epoch": 0.8506666666666667, + "grad_norm": 0.4943567026640203, + "learning_rate": 1.1479119620864276e-05, + "loss": 0.7048, + "step": 1595 + }, + { + "epoch": 0.8512, + "grad_norm": 0.4814877788409868, + "learning_rate": 1.1398864053168534e-05, + "loss": 0.7268, + "step": 1596 + }, + { + "epoch": 0.8517333333333333, + "grad_norm": 0.4092600930058757, + "learning_rate": 1.1318873061913405e-05, + "loss": 0.6065, + "step": 1597 + }, + { + "epoch": 0.8522666666666666, + "grad_norm": 0.409685982408688, + "learning_rate": 1.123914688596409e-05, + "loss": 0.6506, + "step": 1598 + }, + { + "epoch": 0.8528, + "grad_norm": 0.3870958481224451, + "learning_rate": 1.1159685763395111e-05, + "loss": 0.6187, + "step": 1599 + }, + { + "epoch": 0.8533333333333334, + "grad_norm": 0.41553703471203224, + "learning_rate": 1.1080489931489391e-05, + "loss": 0.621, + "step": 1600 + }, + { + "epoch": 0.8538666666666667, + "grad_norm": 0.4093322391584789, + "learning_rate": 1.1001559626737756e-05, + "loss": 0.6051, + "step": 1601 + }, + { + "epoch": 0.8544, + "grad_norm": 0.47041694634040865, + "learning_rate": 1.0922895084838037e-05, + "loss": 0.6844, + "step": 1602 + }, + { + "epoch": 0.8549333333333333, + "grad_norm": 0.418429662076582, + "learning_rate": 1.0844496540694515e-05, + "loss": 0.6589, + "step": 1603 + }, + { + "epoch": 0.8554666666666667, + "grad_norm": 0.3829077339559944, + "learning_rate": 1.0766364228417148e-05, + "loss": 0.6478, + "step": 1604 + }, + { + "epoch": 0.856, + "grad_norm": 0.4534298654346946, + "learning_rate": 1.0688498381320855e-05, + "loss": 0.7143, + "step": 1605 + }, + { + "epoch": 0.8565333333333334, + "grad_norm": 0.4859898034600532, + "learning_rate": 1.0610899231924886e-05, + "loss": 0.6448, + "step": 1606 + }, + { + "epoch": 0.8570666666666666, + "grad_norm": 0.46432539091272734, + "learning_rate": 1.0533567011952094e-05, + "loss": 0.6885, + "step": 1607 + }, + { + "epoch": 0.8576, + "grad_norm": 0.5085493694017713, + "learning_rate": 1.045650195232819e-05, + "loss": 0.7544, + "step": 1608 + }, + { + "epoch": 0.8581333333333333, + "grad_norm": 0.4473521312310639, + "learning_rate": 1.0379704283181179e-05, + "loss": 0.6196, + "step": 1609 + }, + { + "epoch": 0.8586666666666667, + "grad_norm": 0.4747926932747285, + "learning_rate": 1.0303174233840528e-05, + "loss": 0.7073, + "step": 1610 + }, + { + "epoch": 0.8592, + "grad_norm": 0.4009158607052678, + "learning_rate": 1.0226912032836611e-05, + "loss": 0.6103, + "step": 1611 + }, + { + "epoch": 0.8597333333333333, + "grad_norm": 0.4475180779475372, + "learning_rate": 1.0150917907899926e-05, + "loss": 0.6572, + "step": 1612 + }, + { + "epoch": 0.8602666666666666, + "grad_norm": 0.3884492418674599, + "learning_rate": 1.007519208596045e-05, + "loss": 0.6477, + "step": 1613 + }, + { + "epoch": 0.8608, + "grad_norm": 0.4332361050436333, + "learning_rate": 9.999734793146998e-06, + "loss": 0.6365, + "step": 1614 + }, + { + "epoch": 0.8613333333333333, + "grad_norm": 0.45285475697097116, + "learning_rate": 9.924546254786493e-06, + "loss": 0.6509, + "step": 1615 + }, + { + "epoch": 0.8618666666666667, + "grad_norm": 0.37958600626287053, + "learning_rate": 9.849626695403324e-06, + "loss": 0.6218, + "step": 1616 + }, + { + "epoch": 0.8624, + "grad_norm": 0.4082527332253524, + "learning_rate": 9.774976338718677e-06, + "loss": 0.6273, + "step": 1617 + }, + { + "epoch": 0.8629333333333333, + "grad_norm": 0.45348682552554404, + "learning_rate": 9.700595407649805e-06, + "loss": 0.6677, + "step": 1618 + }, + { + "epoch": 0.8634666666666667, + "grad_norm": 0.462376062388315, + "learning_rate": 9.62648412430951e-06, + "loss": 0.6577, + "step": 1619 + }, + { + "epoch": 0.864, + "grad_norm": 0.4145611018534509, + "learning_rate": 9.552642710005299e-06, + "loss": 0.6881, + "step": 1620 + }, + { + "epoch": 0.8645333333333334, + "grad_norm": 0.49556932106832846, + "learning_rate": 9.479071385238892e-06, + "loss": 0.6319, + "step": 1621 + }, + { + "epoch": 0.8650666666666667, + "grad_norm": 0.41068976055254247, + "learning_rate": 9.40577036970538e-06, + "loss": 0.7006, + "step": 1622 + }, + { + "epoch": 0.8656, + "grad_norm": 0.43485989501407807, + "learning_rate": 9.332739882292752e-06, + "loss": 0.6392, + "step": 1623 + }, + { + "epoch": 0.8661333333333333, + "grad_norm": 0.50740804285652, + "learning_rate": 9.259980141081115e-06, + "loss": 0.7311, + "step": 1624 + }, + { + "epoch": 0.8666666666666667, + "grad_norm": 0.4047173072522602, + "learning_rate": 9.187491363342093e-06, + "loss": 0.6593, + "step": 1625 + }, + { + "epoch": 0.8672, + "grad_norm": 0.3894499674015839, + "learning_rate": 9.115273765538202e-06, + "loss": 0.6533, + "step": 1626 + }, + { + "epoch": 0.8677333333333334, + "grad_norm": 0.3659261227794051, + "learning_rate": 9.043327563322112e-06, + "loss": 0.614, + "step": 1627 + }, + { + "epoch": 0.8682666666666666, + "grad_norm": 0.451797701464948, + "learning_rate": 8.971652971536148e-06, + "loss": 0.6973, + "step": 1628 + }, + { + "epoch": 0.8688, + "grad_norm": 0.49015568983154395, + "learning_rate": 8.900250204211514e-06, + "loss": 0.6806, + "step": 1629 + }, + { + "epoch": 0.8693333333333333, + "grad_norm": 0.4382845781834796, + "learning_rate": 8.829119474567671e-06, + "loss": 0.6291, + "step": 1630 + }, + { + "epoch": 0.8698666666666667, + "grad_norm": 0.4391521683429244, + "learning_rate": 8.758260995011825e-06, + "loss": 0.7008, + "step": 1631 + }, + { + "epoch": 0.8704, + "grad_norm": 0.4304589058729017, + "learning_rate": 8.687674977138116e-06, + "loss": 0.6583, + "step": 1632 + }, + { + "epoch": 0.8709333333333333, + "grad_norm": 0.4864114334043858, + "learning_rate": 8.617361631727138e-06, + "loss": 0.7412, + "step": 1633 + }, + { + "epoch": 0.8714666666666666, + "grad_norm": 0.43922743441940315, + "learning_rate": 8.547321168745193e-06, + "loss": 0.6269, + "step": 1634 + }, + { + "epoch": 0.872, + "grad_norm": 0.5474810445951583, + "learning_rate": 8.47755379734373e-06, + "loss": 0.7699, + "step": 1635 + }, + { + "epoch": 0.8725333333333334, + "grad_norm": 0.41757604429661016, + "learning_rate": 8.408059725858719e-06, + "loss": 0.6052, + "step": 1636 + }, + { + "epoch": 0.8730666666666667, + "grad_norm": 0.4167594304186119, + "learning_rate": 8.338839161809997e-06, + "loss": 0.6862, + "step": 1637 + }, + { + "epoch": 0.8736, + "grad_norm": 0.37822319125493636, + "learning_rate": 8.269892311900696e-06, + "loss": 0.6262, + "step": 1638 + }, + { + "epoch": 0.8741333333333333, + "grad_norm": 0.5023340764183061, + "learning_rate": 8.201219382016556e-06, + "loss": 0.6251, + "step": 1639 + }, + { + "epoch": 0.8746666666666667, + "grad_norm": 0.4079889689772476, + "learning_rate": 8.132820577225387e-06, + "loss": 0.5957, + "step": 1640 + }, + { + "epoch": 0.8752, + "grad_norm": 0.406420779466018, + "learning_rate": 8.064696101776358e-06, + "loss": 0.6523, + "step": 1641 + }, + { + "epoch": 0.8757333333333334, + "grad_norm": 0.43747980019701993, + "learning_rate": 7.996846159099557e-06, + "loss": 0.6769, + "step": 1642 + }, + { + "epoch": 0.8762666666666666, + "grad_norm": 0.41352787710817335, + "learning_rate": 7.929270951805178e-06, + "loss": 0.7113, + "step": 1643 + }, + { + "epoch": 0.8768, + "grad_norm": 0.40583860313023423, + "learning_rate": 7.861970681683051e-06, + "loss": 0.6809, + "step": 1644 + }, + { + "epoch": 0.8773333333333333, + "grad_norm": 0.41158724418458603, + "learning_rate": 7.794945549701993e-06, + "loss": 0.6402, + "step": 1645 + }, + { + "epoch": 0.8778666666666667, + "grad_norm": 0.5243531711929061, + "learning_rate": 7.728195756009204e-06, + "loss": 0.673, + "step": 1646 + }, + { + "epoch": 0.8784, + "grad_norm": 0.337576549622103, + "learning_rate": 7.661721499929753e-06, + "loss": 0.6261, + "step": 1647 + }, + { + "epoch": 0.8789333333333333, + "grad_norm": 0.4754446973458503, + "learning_rate": 7.595522979965819e-06, + "loss": 0.7401, + "step": 1648 + }, + { + "epoch": 0.8794666666666666, + "grad_norm": 0.4164840399768914, + "learning_rate": 7.529600393796232e-06, + "loss": 0.6357, + "step": 1649 + }, + { + "epoch": 0.88, + "grad_norm": 0.4135462920490355, + "learning_rate": 7.463953938275858e-06, + "loss": 0.6859, + "step": 1650 + }, + { + "epoch": 0.8805333333333333, + "grad_norm": 0.7234470340955943, + "learning_rate": 7.3985838094349444e-06, + "loss": 0.7088, + "step": 1651 + }, + { + "epoch": 0.8810666666666667, + "grad_norm": 0.36432432756450434, + "learning_rate": 7.333490202478666e-06, + "loss": 0.6534, + "step": 1652 + }, + { + "epoch": 0.8816, + "grad_norm": 0.48065737527318425, + "learning_rate": 7.2686733117863784e-06, + "loss": 0.6908, + "step": 1653 + }, + { + "epoch": 0.8821333333333333, + "grad_norm": 0.3932409524931165, + "learning_rate": 7.204133330911178e-06, + "loss": 0.6908, + "step": 1654 + }, + { + "epoch": 0.8826666666666667, + "grad_norm": 0.4346120600494569, + "learning_rate": 7.1398704525792e-06, + "loss": 0.5935, + "step": 1655 + }, + { + "epoch": 0.8832, + "grad_norm": 0.44275062077364813, + "learning_rate": 7.07588486868922e-06, + "loss": 0.6455, + "step": 1656 + }, + { + "epoch": 0.8837333333333334, + "grad_norm": 0.392220127998605, + "learning_rate": 7.012176770311862e-06, + "loss": 0.6515, + "step": 1657 + }, + { + "epoch": 0.8842666666666666, + "grad_norm": 0.4643778577168819, + "learning_rate": 6.948746347689183e-06, + "loss": 0.6172, + "step": 1658 + }, + { + "epoch": 0.8848, + "grad_norm": 0.4274755369987598, + "learning_rate": 6.8855937902340576e-06, + "loss": 0.6109, + "step": 1659 + }, + { + "epoch": 0.8853333333333333, + "grad_norm": 0.46868591713180785, + "learning_rate": 6.8227192865295995e-06, + "loss": 0.6145, + "step": 1660 + }, + { + "epoch": 0.8858666666666667, + "grad_norm": 0.5042715565932409, + "learning_rate": 6.760123024328624e-06, + "loss": 0.7054, + "step": 1661 + }, + { + "epoch": 0.8864, + "grad_norm": 0.4154466335564727, + "learning_rate": 6.6978051905530855e-06, + "loss": 0.5967, + "step": 1662 + }, + { + "epoch": 0.8869333333333334, + "grad_norm": 0.4216382747965016, + "learning_rate": 6.635765971293484e-06, + "loss": 0.5687, + "step": 1663 + }, + { + "epoch": 0.8874666666666666, + "grad_norm": 0.42449643103424906, + "learning_rate": 6.5740055518083375e-06, + "loss": 0.6767, + "step": 1664 + }, + { + "epoch": 0.888, + "grad_norm": 0.4001245042411552, + "learning_rate": 6.512524116523633e-06, + "loss": 0.6211, + "step": 1665 + }, + { + "epoch": 0.8885333333333333, + "grad_norm": 0.47478289908225485, + "learning_rate": 6.451321849032288e-06, + "loss": 0.6765, + "step": 1666 + }, + { + "epoch": 0.8890666666666667, + "grad_norm": 0.38673287441234144, + "learning_rate": 6.390398932093555e-06, + "loss": 0.6299, + "step": 1667 + }, + { + "epoch": 0.8896, + "grad_norm": 0.4831165164661389, + "learning_rate": 6.329755547632499e-06, + "loss": 0.6253, + "step": 1668 + }, + { + "epoch": 0.8901333333333333, + "grad_norm": 0.4254562070627475, + "learning_rate": 6.269391876739495e-06, + "loss": 0.6319, + "step": 1669 + }, + { + "epoch": 0.8906666666666667, + "grad_norm": 0.44905117536001765, + "learning_rate": 6.209308099669597e-06, + "loss": 0.6273, + "step": 1670 + }, + { + "epoch": 0.8912, + "grad_norm": 0.4670117155323974, + "learning_rate": 6.149504395842087e-06, + "loss": 0.7235, + "step": 1671 + }, + { + "epoch": 0.8917333333333334, + "grad_norm": 0.39418772023176973, + "learning_rate": 6.089980943839924e-06, + "loss": 0.5481, + "step": 1672 + }, + { + "epoch": 0.8922666666666667, + "grad_norm": 0.4756135363588192, + "learning_rate": 6.030737921409169e-06, + "loss": 0.644, + "step": 1673 + }, + { + "epoch": 0.8928, + "grad_norm": 0.41633182755771625, + "learning_rate": 5.971775505458444e-06, + "loss": 0.6518, + "step": 1674 + }, + { + "epoch": 0.8933333333333333, + "grad_norm": 0.4621793648032974, + "learning_rate": 5.913093872058528e-06, + "loss": 0.7258, + "step": 1675 + }, + { + "epoch": 0.8938666666666667, + "grad_norm": 0.4323272116296017, + "learning_rate": 5.854693196441641e-06, + "loss": 0.6358, + "step": 1676 + }, + { + "epoch": 0.8944, + "grad_norm": 0.5295944212721158, + "learning_rate": 5.7965736530010916e-06, + "loss": 0.617, + "step": 1677 + }, + { + "epoch": 0.8949333333333334, + "grad_norm": 0.4364076762764822, + "learning_rate": 5.738735415290642e-06, + "loss": 0.6217, + "step": 1678 + }, + { + "epoch": 0.8954666666666666, + "grad_norm": 0.5300885117518196, + "learning_rate": 5.681178656024055e-06, + "loss": 0.7928, + "step": 1679 + }, + { + "epoch": 0.896, + "grad_norm": 0.41327557859176295, + "learning_rate": 5.623903547074549e-06, + "loss": 0.6586, + "step": 1680 + }, + { + "epoch": 0.8965333333333333, + "grad_norm": 0.7325507837569631, + "learning_rate": 5.566910259474289e-06, + "loss": 0.6572, + "step": 1681 + }, + { + "epoch": 0.8970666666666667, + "grad_norm": 0.40786063676707335, + "learning_rate": 5.510198963413881e-06, + "loss": 0.5755, + "step": 1682 + }, + { + "epoch": 0.8976, + "grad_norm": 0.4272122242567254, + "learning_rate": 5.453769828241872e-06, + "loss": 0.6983, + "step": 1683 + }, + { + "epoch": 0.8981333333333333, + "grad_norm": 0.3998728235135429, + "learning_rate": 5.397623022464226e-06, + "loss": 0.607, + "step": 1684 + }, + { + "epoch": 0.8986666666666666, + "grad_norm": 0.5348664801254471, + "learning_rate": 5.341758713743828e-06, + "loss": 0.7396, + "step": 1685 + }, + { + "epoch": 0.8992, + "grad_norm": 0.3995469349406626, + "learning_rate": 5.286177068899989e-06, + "loss": 0.6787, + "step": 1686 + }, + { + "epoch": 0.8997333333333334, + "grad_norm": 0.46000290049490755, + "learning_rate": 5.230878253907912e-06, + "loss": 0.7569, + "step": 1687 + }, + { + "epoch": 0.9002666666666667, + "grad_norm": 0.46010987639731693, + "learning_rate": 5.175862433898282e-06, + "loss": 0.6347, + "step": 1688 + }, + { + "epoch": 0.9008, + "grad_norm": 0.4978647672880174, + "learning_rate": 5.121129773156663e-06, + "loss": 0.6699, + "step": 1689 + }, + { + "epoch": 0.9013333333333333, + "grad_norm": 0.35737868752140106, + "learning_rate": 5.066680435123106e-06, + "loss": 0.6557, + "step": 1690 + }, + { + "epoch": 0.9018666666666667, + "grad_norm": 0.39647211628608875, + "learning_rate": 5.012514582391592e-06, + "loss": 0.7077, + "step": 1691 + }, + { + "epoch": 0.9024, + "grad_norm": 0.4513532218109532, + "learning_rate": 4.95863237670956e-06, + "loss": 0.6431, + "step": 1692 + }, + { + "epoch": 0.9029333333333334, + "grad_norm": 0.4457417615670569, + "learning_rate": 4.905033978977491e-06, + "loss": 0.6464, + "step": 1693 + }, + { + "epoch": 0.9034666666666666, + "grad_norm": 0.5455223307685727, + "learning_rate": 4.851719549248301e-06, + "loss": 0.751, + "step": 1694 + }, + { + "epoch": 0.904, + "grad_norm": 0.5051881456825834, + "learning_rate": 4.798689246727006e-06, + "loss": 0.7221, + "step": 1695 + }, + { + "epoch": 0.9045333333333333, + "grad_norm": 0.5309592517243472, + "learning_rate": 4.745943229770122e-06, + "loss": 0.6432, + "step": 1696 + }, + { + "epoch": 0.9050666666666667, + "grad_norm": 0.4867932356703566, + "learning_rate": 4.693481655885257e-06, + "loss": 0.6611, + "step": 1697 + }, + { + "epoch": 0.9056, + "grad_norm": 0.5584650616591511, + "learning_rate": 4.641304681730641e-06, + "loss": 0.7606, + "step": 1698 + }, + { + "epoch": 0.9061333333333333, + "grad_norm": 0.4728468412522664, + "learning_rate": 4.58941246311464e-06, + "loss": 0.6698, + "step": 1699 + }, + { + "epoch": 0.9066666666666666, + "grad_norm": 0.42002690440527485, + "learning_rate": 4.537805154995278e-06, + "loss": 0.6121, + "step": 1700 + }, + { + "epoch": 0.9072, + "grad_norm": 0.39957076551971166, + "learning_rate": 4.486482911479839e-06, + "loss": 0.6648, + "step": 1701 + }, + { + "epoch": 0.9077333333333333, + "grad_norm": 0.44850496464790096, + "learning_rate": 4.435445885824285e-06, + "loss": 0.6803, + "step": 1702 + }, + { + "epoch": 0.9082666666666667, + "grad_norm": 0.4436197128959715, + "learning_rate": 4.384694230432984e-06, + "loss": 0.6027, + "step": 1703 + }, + { + "epoch": 0.9088, + "grad_norm": 0.4042949955641835, + "learning_rate": 4.3342280968580285e-06, + "loss": 0.6297, + "step": 1704 + }, + { + "epoch": 0.9093333333333333, + "grad_norm": 0.5007062642674075, + "learning_rate": 4.2840476357989825e-06, + "loss": 0.7184, + "step": 1705 + }, + { + "epoch": 0.9098666666666667, + "grad_norm": 0.4205208053626174, + "learning_rate": 4.2341529971023255e-06, + "loss": 0.6676, + "step": 1706 + }, + { + "epoch": 0.9104, + "grad_norm": 0.4431673816438805, + "learning_rate": 4.184544329761009e-06, + "loss": 0.7164, + "step": 1707 + }, + { + "epoch": 0.9109333333333334, + "grad_norm": 0.4249690226422859, + "learning_rate": 4.135221781914034e-06, + "loss": 0.6665, + "step": 1708 + }, + { + "epoch": 0.9114666666666666, + "grad_norm": 0.4936236618332871, + "learning_rate": 4.0861855008460405e-06, + "loss": 0.6611, + "step": 1709 + }, + { + "epoch": 0.912, + "grad_norm": 0.4446079264504513, + "learning_rate": 4.037435632986786e-06, + "loss": 0.5919, + "step": 1710 + }, + { + "epoch": 0.9125333333333333, + "grad_norm": 0.4188549944038524, + "learning_rate": 3.988972323910778e-06, + "loss": 0.6815, + "step": 1711 + }, + { + "epoch": 0.9130666666666667, + "grad_norm": 0.37412575666763476, + "learning_rate": 3.9407957183368095e-06, + "loss": 0.5891, + "step": 1712 + }, + { + "epoch": 0.9136, + "grad_norm": 0.3973365621648777, + "learning_rate": 3.892905960127546e-06, + "loss": 0.5576, + "step": 1713 + }, + { + "epoch": 0.9141333333333334, + "grad_norm": 0.5505908525386042, + "learning_rate": 3.845303192289074e-06, + "loss": 0.6993, + "step": 1714 + }, + { + "epoch": 0.9146666666666666, + "grad_norm": 0.39051526021548083, + "learning_rate": 3.797987556970495e-06, + "loss": 0.5802, + "step": 1715 + }, + { + "epoch": 0.9152, + "grad_norm": 0.4588299711160169, + "learning_rate": 3.750959195463466e-06, + "loss": 0.6591, + "step": 1716 + }, + { + "epoch": 0.9157333333333333, + "grad_norm": 0.43305516598913746, + "learning_rate": 3.7042182482018075e-06, + "loss": 0.6858, + "step": 1717 + }, + { + "epoch": 0.9162666666666667, + "grad_norm": 0.4181562679597887, + "learning_rate": 3.6577648547611033e-06, + "loss": 0.6112, + "step": 1718 + }, + { + "epoch": 0.9168, + "grad_norm": 0.5274864643828572, + "learning_rate": 3.611599153858214e-06, + "loss": 0.7102, + "step": 1719 + }, + { + "epoch": 0.9173333333333333, + "grad_norm": 0.48775833418327424, + "learning_rate": 3.565721283350931e-06, + "loss": 0.695, + "step": 1720 + }, + { + "epoch": 0.9178666666666667, + "grad_norm": 0.5580209662077895, + "learning_rate": 3.5201313802375456e-06, + "loss": 0.755, + "step": 1721 + }, + { + "epoch": 0.9184, + "grad_norm": 0.40488479521891896, + "learning_rate": 3.4748295806564356e-06, + "loss": 0.677, + "step": 1722 + }, + { + "epoch": 0.9189333333333334, + "grad_norm": 0.4199604648333833, + "learning_rate": 3.4298160198856568e-06, + "loss": 0.6491, + "step": 1723 + }, + { + "epoch": 0.9194666666666667, + "grad_norm": 0.3844521137272803, + "learning_rate": 3.3850908323424967e-06, + "loss": 0.6531, + "step": 1724 + }, + { + "epoch": 0.92, + "grad_norm": 0.48731897208695457, + "learning_rate": 3.3406541515832003e-06, + "loss": 0.7045, + "step": 1725 + }, + { + "epoch": 0.9205333333333333, + "grad_norm": 0.5383857503345972, + "learning_rate": 3.296506110302422e-06, + "loss": 0.7467, + "step": 1726 + }, + { + "epoch": 0.9210666666666667, + "grad_norm": 0.3962907679692899, + "learning_rate": 3.252646840332918e-06, + "loss": 0.6084, + "step": 1727 + }, + { + "epoch": 0.9216, + "grad_norm": 0.4363993116628175, + "learning_rate": 3.209076472645112e-06, + "loss": 0.6224, + "step": 1728 + }, + { + "epoch": 0.9221333333333334, + "grad_norm": 0.3815183988303869, + "learning_rate": 3.1657951373467497e-06, + "loss": 0.6161, + "step": 1729 + }, + { + "epoch": 0.9226666666666666, + "grad_norm": 0.35610263564934913, + "learning_rate": 3.1228029636824475e-06, + "loss": 0.629, + "step": 1730 + }, + { + "epoch": 0.9232, + "grad_norm": 0.40053521504278355, + "learning_rate": 3.0801000800333877e-06, + "loss": 0.6371, + "step": 1731 + }, + { + "epoch": 0.9237333333333333, + "grad_norm": 0.4369059990678778, + "learning_rate": 3.037686613916857e-06, + "loss": 0.6471, + "step": 1732 + }, + { + "epoch": 0.9242666666666667, + "grad_norm": 0.40213261631132446, + "learning_rate": 2.995562691985898e-06, + "loss": 0.6986, + "step": 1733 + }, + { + "epoch": 0.9248, + "grad_norm": 0.5612624532899247, + "learning_rate": 2.9537284400289355e-06, + "loss": 0.7653, + "step": 1734 + }, + { + "epoch": 0.9253333333333333, + "grad_norm": 0.5538150635552385, + "learning_rate": 2.912183982969385e-06, + "loss": 0.6946, + "step": 1735 + }, + { + "epoch": 0.9258666666666666, + "grad_norm": 0.4705386625620636, + "learning_rate": 2.8709294448653225e-06, + "loss": 0.6737, + "step": 1736 + }, + { + "epoch": 0.9264, + "grad_norm": 0.38927429742939385, + "learning_rate": 2.8299649489090475e-06, + "loss": 0.6392, + "step": 1737 + }, + { + "epoch": 0.9269333333333334, + "grad_norm": 0.4248654288152213, + "learning_rate": 2.789290617426765e-06, + "loss": 0.6731, + "step": 1738 + }, + { + "epoch": 0.9274666666666667, + "grad_norm": 0.6866722643448364, + "learning_rate": 2.748906571878207e-06, + "loss": 0.685, + "step": 1739 + }, + { + "epoch": 0.928, + "grad_norm": 0.45430216283323627, + "learning_rate": 2.708812932856253e-06, + "loss": 0.6642, + "step": 1740 + }, + { + "epoch": 0.9285333333333333, + "grad_norm": 0.4908708531043127, + "learning_rate": 2.6690098200866098e-06, + "loss": 0.6784, + "step": 1741 + }, + { + "epoch": 0.9290666666666667, + "grad_norm": 0.39701150718508316, + "learning_rate": 2.6294973524274125e-06, + "loss": 0.5999, + "step": 1742 + }, + { + "epoch": 0.9296, + "grad_norm": 0.43096087129001465, + "learning_rate": 2.590275647868867e-06, + "loss": 0.6403, + "step": 1743 + }, + { + "epoch": 0.9301333333333334, + "grad_norm": 0.3358603543528395, + "learning_rate": 2.551344823532964e-06, + "loss": 0.5447, + "step": 1744 + }, + { + "epoch": 0.9306666666666666, + "grad_norm": 0.420568826048583, + "learning_rate": 2.5127049956730207e-06, + "loss": 0.639, + "step": 1745 + }, + { + "epoch": 0.9312, + "grad_norm": 0.4656415532714179, + "learning_rate": 2.4743562796734622e-06, + "loss": 0.6152, + "step": 1746 + }, + { + "epoch": 0.9317333333333333, + "grad_norm": 0.3394905390020499, + "learning_rate": 2.436298790049363e-06, + "loss": 0.5866, + "step": 1747 + }, + { + "epoch": 0.9322666666666667, + "grad_norm": 0.38209415876202724, + "learning_rate": 2.3985326404461604e-06, + "loss": 0.6513, + "step": 1748 + }, + { + "epoch": 0.9328, + "grad_norm": 0.39565128177302533, + "learning_rate": 2.3610579436393e-06, + "loss": 0.6567, + "step": 1749 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 0.4285550280410376, + "learning_rate": 2.3238748115339324e-06, + "loss": 0.6859, + "step": 1750 + }, + { + "epoch": 0.9338666666666666, + "grad_norm": 0.40854938238705735, + "learning_rate": 2.286983355164529e-06, + "loss": 0.6726, + "step": 1751 + }, + { + "epoch": 0.9344, + "grad_norm": 0.4158470411645834, + "learning_rate": 2.250383684694579e-06, + "loss": 0.6526, + "step": 1752 + }, + { + "epoch": 0.9349333333333333, + "grad_norm": 0.5060676799282359, + "learning_rate": 2.2140759094162467e-06, + "loss": 0.6022, + "step": 1753 + }, + { + "epoch": 0.9354666666666667, + "grad_norm": 0.42024963141870625, + "learning_rate": 2.178060137750071e-06, + "loss": 0.6029, + "step": 1754 + }, + { + "epoch": 0.936, + "grad_norm": 0.3818807214931423, + "learning_rate": 2.1423364772445887e-06, + "loss": 0.6145, + "step": 1755 + }, + { + "epoch": 0.9365333333333333, + "grad_norm": 0.41950692764051695, + "learning_rate": 2.106905034576112e-06, + "loss": 0.6712, + "step": 1756 + }, + { + "epoch": 0.9370666666666667, + "grad_norm": 0.4948719627454073, + "learning_rate": 2.0717659155482738e-06, + "loss": 0.7097, + "step": 1757 + }, + { + "epoch": 0.9376, + "grad_norm": 0.42376708716125905, + "learning_rate": 2.036919225091827e-06, + "loss": 0.685, + "step": 1758 + }, + { + "epoch": 0.9381333333333334, + "grad_norm": 0.46599713659798614, + "learning_rate": 2.002365067264289e-06, + "loss": 0.5762, + "step": 1759 + }, + { + "epoch": 0.9386666666666666, + "grad_norm": 0.502423239699822, + "learning_rate": 1.968103545249611e-06, + "loss": 0.6482, + "step": 1760 + }, + { + "epoch": 0.9392, + "grad_norm": 0.41704783463295836, + "learning_rate": 1.9341347613579087e-06, + "loss": 0.6231, + "step": 1761 + }, + { + "epoch": 0.9397333333333333, + "grad_norm": 0.4471318403366392, + "learning_rate": 1.900458817025097e-06, + "loss": 0.58, + "step": 1762 + }, + { + "epoch": 0.9402666666666667, + "grad_norm": 0.4930432533346224, + "learning_rate": 1.8670758128126909e-06, + "loss": 0.6066, + "step": 1763 + }, + { + "epoch": 0.9408, + "grad_norm": 0.4191093443697424, + "learning_rate": 1.8339858484073935e-06, + "loss": 0.6389, + "step": 1764 + }, + { + "epoch": 0.9413333333333334, + "grad_norm": 0.4643665707610858, + "learning_rate": 1.8011890226208527e-06, + "loss": 0.6667, + "step": 1765 + }, + { + "epoch": 0.9418666666666666, + "grad_norm": 0.39975257989388546, + "learning_rate": 1.7686854333893833e-06, + "loss": 0.6552, + "step": 1766 + }, + { + "epoch": 0.9424, + "grad_norm": 0.46064028104022436, + "learning_rate": 1.7364751777736332e-06, + "loss": 0.6825, + "step": 1767 + }, + { + "epoch": 0.9429333333333333, + "grad_norm": 0.4004501369453656, + "learning_rate": 1.7045583519583074e-06, + "loss": 0.6653, + "step": 1768 + }, + { + "epoch": 0.9434666666666667, + "grad_norm": 0.44625854340376037, + "learning_rate": 1.6729350512519005e-06, + "loss": 0.7159, + "step": 1769 + }, + { + "epoch": 0.944, + "grad_norm": 0.4434161255187958, + "learning_rate": 1.6416053700863964e-06, + "loss": 0.6436, + "step": 1770 + }, + { + "epoch": 0.9445333333333333, + "grad_norm": 0.4344127642206033, + "learning_rate": 1.6105694020169593e-06, + "loss": 0.6412, + "step": 1771 + }, + { + "epoch": 0.9450666666666667, + "grad_norm": 0.40063459126374223, + "learning_rate": 1.5798272397217095e-06, + "loss": 0.6225, + "step": 1772 + }, + { + "epoch": 0.9456, + "grad_norm": 0.43356689270483173, + "learning_rate": 1.5493789750014031e-06, + "loss": 0.6219, + "step": 1773 + }, + { + "epoch": 0.9461333333333334, + "grad_norm": 0.5220526369818685, + "learning_rate": 1.5192246987791981e-06, + "loss": 0.6594, + "step": 1774 + }, + { + "epoch": 0.9466666666666667, + "grad_norm": 0.5423024778568717, + "learning_rate": 1.489364501100332e-06, + "loss": 0.7521, + "step": 1775 + }, + { + "epoch": 0.9472, + "grad_norm": 0.4004007956034665, + "learning_rate": 1.459798471131868e-06, + "loss": 0.6324, + "step": 1776 + }, + { + "epoch": 0.9477333333333333, + "grad_norm": 0.46326132066593056, + "learning_rate": 1.430526697162482e-06, + "loss": 0.6959, + "step": 1777 + }, + { + "epoch": 0.9482666666666667, + "grad_norm": 0.5839550416919262, + "learning_rate": 1.4015492666021312e-06, + "loss": 0.6809, + "step": 1778 + }, + { + "epoch": 0.9488, + "grad_norm": 0.45523305572258604, + "learning_rate": 1.3728662659818204e-06, + "loss": 0.6495, + "step": 1779 + }, + { + "epoch": 0.9493333333333334, + "grad_norm": 0.4597435532782685, + "learning_rate": 1.344477780953346e-06, + "loss": 0.6678, + "step": 1780 + }, + { + "epoch": 0.9498666666666666, + "grad_norm": 0.424409525577842, + "learning_rate": 1.3163838962890195e-06, + "loss": 0.6858, + "step": 1781 + }, + { + "epoch": 0.9504, + "grad_norm": 0.46094318525091577, + "learning_rate": 1.2885846958814673e-06, + "loss": 0.6388, + "step": 1782 + }, + { + "epoch": 0.9509333333333333, + "grad_norm": 0.5355779994156216, + "learning_rate": 1.261080262743297e-06, + "loss": 0.6435, + "step": 1783 + }, + { + "epoch": 0.9514666666666667, + "grad_norm": 0.3531036043502969, + "learning_rate": 1.2338706790069431e-06, + "loss": 0.545, + "step": 1784 + }, + { + "epoch": 0.952, + "grad_norm": 0.3698317888394057, + "learning_rate": 1.2069560259243328e-06, + "loss": 0.5993, + "step": 1785 + }, + { + "epoch": 0.9525333333333333, + "grad_norm": 0.5431907428668711, + "learning_rate": 1.1803363838667092e-06, + "loss": 0.8041, + "step": 1786 + }, + { + "epoch": 0.9530666666666666, + "grad_norm": 0.4553855167796496, + "learning_rate": 1.1540118323243865e-06, + "loss": 0.6571, + "step": 1787 + }, + { + "epoch": 0.9536, + "grad_norm": 0.4350567834631104, + "learning_rate": 1.1279824499064396e-06, + "loss": 0.6423, + "step": 1788 + }, + { + "epoch": 0.9541333333333334, + "grad_norm": 0.4747873187224694, + "learning_rate": 1.1022483143405705e-06, + "loss": 0.6739, + "step": 1789 + }, + { + "epoch": 0.9546666666666667, + "grad_norm": 0.40461884370271717, + "learning_rate": 1.076809502472831e-06, + "loss": 0.72, + "step": 1790 + }, + { + "epoch": 0.9552, + "grad_norm": 0.4167798030556322, + "learning_rate": 1.0516660902673448e-06, + "loss": 0.6255, + "step": 1791 + }, + { + "epoch": 0.9557333333333333, + "grad_norm": 0.46845901849429383, + "learning_rate": 1.0268181528061749e-06, + "loss": 0.6545, + "step": 1792 + }, + { + "epoch": 0.9562666666666667, + "grad_norm": 0.3973222335473333, + "learning_rate": 1.0022657642890231e-06, + "loss": 0.6372, + "step": 1793 + }, + { + "epoch": 0.9568, + "grad_norm": 0.38073284437655835, + "learning_rate": 9.780089980330642e-07, + "loss": 0.6289, + "step": 1794 + }, + { + "epoch": 0.9573333333333334, + "grad_norm": 0.38073722863953624, + "learning_rate": 9.540479264726676e-07, + "loss": 0.5871, + "step": 1795 + }, + { + "epoch": 0.9578666666666666, + "grad_norm": 0.4190946658205246, + "learning_rate": 9.303826211592315e-07, + "loss": 0.602, + "step": 1796 + }, + { + "epoch": 0.9584, + "grad_norm": 0.4337641950485008, + "learning_rate": 9.070131527609604e-07, + "loss": 0.6911, + "step": 1797 + }, + { + "epoch": 0.9589333333333333, + "grad_norm": 0.4265497198339448, + "learning_rate": 8.839395910626213e-07, + "loss": 0.6969, + "step": 1798 + }, + { + "epoch": 0.9594666666666667, + "grad_norm": 0.41996031012540436, + "learning_rate": 8.611620049653879e-07, + "loss": 0.6376, + "step": 1799 + }, + { + "epoch": 0.96, + "grad_norm": 0.5499886782502461, + "learning_rate": 8.386804624865851e-07, + "loss": 0.7559, + "step": 1800 + }, + { + "epoch": 0.9605333333333334, + "grad_norm": 0.3828713210153735, + "learning_rate": 8.16495030759501e-07, + "loss": 0.5777, + "step": 1801 + }, + { + "epoch": 0.9610666666666666, + "grad_norm": 0.4611247428551417, + "learning_rate": 7.946057760332193e-07, + "loss": 0.6816, + "step": 1802 + }, + { + "epoch": 0.9616, + "grad_norm": 0.36152230735902663, + "learning_rate": 7.730127636723539e-07, + "loss": 0.6159, + "step": 1803 + }, + { + "epoch": 0.9621333333333333, + "grad_norm": 0.4419954083812373, + "learning_rate": 7.517160581569372e-07, + "loss": 0.7064, + "step": 1804 + }, + { + "epoch": 0.9626666666666667, + "grad_norm": 0.4265143979697728, + "learning_rate": 7.307157230821426e-07, + "loss": 0.6554, + "step": 1805 + }, + { + "epoch": 0.9632, + "grad_norm": 0.39116645586599447, + "learning_rate": 7.100118211581852e-07, + "loss": 0.6161, + "step": 1806 + }, + { + "epoch": 0.9637333333333333, + "grad_norm": 0.567563131148666, + "learning_rate": 6.896044142100433e-07, + "loss": 0.6576, + "step": 1807 + }, + { + "epoch": 0.9642666666666667, + "grad_norm": 0.5093875925229141, + "learning_rate": 6.694935631773258e-07, + "loss": 0.6774, + "step": 1808 + }, + { + "epoch": 0.9648, + "grad_norm": 0.5584664714476052, + "learning_rate": 6.496793281141056e-07, + "loss": 0.6326, + "step": 1809 + }, + { + "epoch": 0.9653333333333334, + "grad_norm": 0.487255380633235, + "learning_rate": 6.301617681886863e-07, + "loss": 0.7465, + "step": 1810 + }, + { + "epoch": 0.9658666666666667, + "grad_norm": 0.4350224105217472, + "learning_rate": 6.109409416834688e-07, + "loss": 0.6694, + "step": 1811 + }, + { + "epoch": 0.9664, + "grad_norm": 0.5007742206468339, + "learning_rate": 5.920169059947411e-07, + "loss": 0.6576, + "step": 1812 + }, + { + "epoch": 0.9669333333333333, + "grad_norm": 0.49278031797530814, + "learning_rate": 5.733897176325665e-07, + "loss": 0.6655, + "step": 1813 + }, + { + "epoch": 0.9674666666666667, + "grad_norm": 0.36776361811801445, + "learning_rate": 5.550594322205504e-07, + "loss": 0.6232, + "step": 1814 + }, + { + "epoch": 0.968, + "grad_norm": 0.43084289840079937, + "learning_rate": 5.370261044956971e-07, + "loss": 0.6286, + "step": 1815 + }, + { + "epoch": 0.9685333333333334, + "grad_norm": 0.4415496400128871, + "learning_rate": 5.192897883082747e-07, + "loss": 0.6283, + "step": 1816 + }, + { + "epoch": 0.9690666666666666, + "grad_norm": 0.3450616572510623, + "learning_rate": 5.018505366216175e-07, + "loss": 0.5864, + "step": 1817 + }, + { + "epoch": 0.9696, + "grad_norm": 0.4149338144520578, + "learning_rate": 4.847084015119574e-07, + "loss": 0.6326, + "step": 1818 + }, + { + "epoch": 0.9701333333333333, + "grad_norm": 0.38583315875884233, + "learning_rate": 4.678634341683252e-07, + "loss": 0.6516, + "step": 1819 + }, + { + "epoch": 0.9706666666666667, + "grad_norm": 0.39196067997004447, + "learning_rate": 4.5131568489236166e-07, + "loss": 0.6474, + "step": 1820 + }, + { + "epoch": 0.9712, + "grad_norm": 0.4627428355091497, + "learning_rate": 4.3506520309813947e-07, + "loss": 0.6496, + "step": 1821 + }, + { + "epoch": 0.9717333333333333, + "grad_norm": 0.5094001659759443, + "learning_rate": 4.191120373120749e-07, + "loss": 0.7016, + "step": 1822 + }, + { + "epoch": 0.9722666666666666, + "grad_norm": 0.40974755328404217, + "learning_rate": 4.034562351727389e-07, + "loss": 0.5765, + "step": 1823 + }, + { + "epoch": 0.9728, + "grad_norm": 0.40032544663655756, + "learning_rate": 3.8809784343072366e-07, + "loss": 0.7405, + "step": 1824 + }, + { + "epoch": 0.9733333333333334, + "grad_norm": 0.41338686182286327, + "learning_rate": 3.73036907948543e-07, + "loss": 0.6478, + "step": 1825 + }, + { + "epoch": 0.9738666666666667, + "grad_norm": 0.5006313374914235, + "learning_rate": 3.582734737004101e-07, + "loss": 0.6717, + "step": 1826 + }, + { + "epoch": 0.9744, + "grad_norm": 0.3584787782331542, + "learning_rate": 3.4380758477219333e-07, + "loss": 0.6608, + "step": 1827 + }, + { + "epoch": 0.9749333333333333, + "grad_norm": 0.3957933792772522, + "learning_rate": 3.296392843612273e-07, + "loss": 0.6643, + "step": 1828 + }, + { + "epoch": 0.9754666666666667, + "grad_norm": 0.4720312848719183, + "learning_rate": 3.1576861477621287e-07, + "loss": 0.6611, + "step": 1829 + }, + { + "epoch": 0.976, + "grad_norm": 0.4435292333643389, + "learning_rate": 3.0219561743707326e-07, + "loss": 0.6347, + "step": 1830 + }, + { + "epoch": 0.9765333333333334, + "grad_norm": 0.41758627561980133, + "learning_rate": 2.889203328748424e-07, + "loss": 0.6293, + "step": 1831 + }, + { + "epoch": 0.9770666666666666, + "grad_norm": 0.4785062670362954, + "learning_rate": 2.759428007315212e-07, + "loss": 0.6627, + "step": 1832 + }, + { + "epoch": 0.9776, + "grad_norm": 0.3946963967872426, + "learning_rate": 2.6326305976001055e-07, + "loss": 0.5852, + "step": 1833 + }, + { + "epoch": 0.9781333333333333, + "grad_norm": 0.42829762589529746, + "learning_rate": 2.5088114782392257e-07, + "loss": 0.6636, + "step": 1834 + }, + { + "epoch": 0.9786666666666667, + "grad_norm": 0.44071064352491884, + "learning_rate": 2.3879710189753656e-07, + "loss": 0.6802, + "step": 1835 + }, + { + "epoch": 0.9792, + "grad_norm": 0.3940438253700029, + "learning_rate": 2.2701095806565432e-07, + "loss": 0.5963, + "step": 1836 + }, + { + "epoch": 0.9797333333333333, + "grad_norm": 0.42639732872760644, + "learning_rate": 2.15522751523467e-07, + "loss": 0.5808, + "step": 1837 + }, + { + "epoch": 0.9802666666666666, + "grad_norm": 0.4166432355006099, + "learning_rate": 2.0433251657653308e-07, + "loss": 0.6667, + "step": 1838 + }, + { + "epoch": 0.9808, + "grad_norm": 0.4161754160413017, + "learning_rate": 1.9344028664056713e-07, + "loss": 0.6684, + "step": 1839 + }, + { + "epoch": 0.9813333333333333, + "grad_norm": 0.4216631747912127, + "learning_rate": 1.8284609424142895e-07, + "loss": 0.622, + "step": 1840 + }, + { + "epoch": 0.9818666666666667, + "grad_norm": 0.710984201933716, + "learning_rate": 1.7254997101500137e-07, + "loss": 0.6509, + "step": 1841 + }, + { + "epoch": 0.9824, + "grad_norm": 0.45259145045156, + "learning_rate": 1.6255194770704586e-07, + "loss": 0.6937, + "step": 1842 + }, + { + "epoch": 0.9829333333333333, + "grad_norm": 0.44887693286795777, + "learning_rate": 1.5285205417319149e-07, + "loss": 0.6366, + "step": 1843 + }, + { + "epoch": 0.9834666666666667, + "grad_norm": 0.42828981202862987, + "learning_rate": 1.4345031937879062e-07, + "loss": 0.5888, + "step": 1844 + }, + { + "epoch": 0.984, + "grad_norm": 0.46509297587211545, + "learning_rate": 1.3434677139885222e-07, + "loss": 0.6257, + "step": 1845 + }, + { + "epoch": 0.9845333333333334, + "grad_norm": 0.4244069577499102, + "learning_rate": 1.255414374179531e-07, + "loss": 0.6163, + "step": 1846 + }, + { + "epoch": 0.9850666666666666, + "grad_norm": 0.46625633515460524, + "learning_rate": 1.170343437301491e-07, + "loss": 0.7083, + "step": 1847 + }, + { + "epoch": 0.9856, + "grad_norm": 0.36187151298797665, + "learning_rate": 1.0882551573891953e-07, + "loss": 0.5688, + "step": 1848 + }, + { + "epoch": 0.9861333333333333, + "grad_norm": 0.4789112174362444, + "learning_rate": 1.0091497795706728e-07, + "loss": 0.6826, + "step": 1849 + }, + { + "epoch": 0.9866666666666667, + "grad_norm": 0.5058145200460931, + "learning_rate": 9.330275400666332e-08, + "loss": 0.6656, + "step": 1850 + }, + { + "epoch": 0.9872, + "grad_norm": 0.4508626575496384, + "learning_rate": 8.598886661895788e-08, + "loss": 0.6407, + "step": 1851 + }, + { + "epoch": 0.9877333333333334, + "grad_norm": 0.446817396692741, + "learning_rate": 7.8973337634336e-08, + "loss": 0.6505, + "step": 1852 + }, + { + "epoch": 0.9882666666666666, + "grad_norm": 0.4559128212421954, + "learning_rate": 7.225618800222877e-08, + "loss": 0.7048, + "step": 1853 + }, + { + "epoch": 0.9888, + "grad_norm": 0.4693187807566613, + "learning_rate": 6.583743778106887e-08, + "loss": 0.5972, + "step": 1854 + }, + { + "epoch": 0.9893333333333333, + "grad_norm": 0.4303581174526853, + "learning_rate": 5.971710613821291e-08, + "loss": 0.67, + "step": 1855 + }, + { + "epoch": 0.9898666666666667, + "grad_norm": 0.40639260137853606, + "learning_rate": 5.389521134989695e-08, + "loss": 0.6457, + "step": 1856 + }, + { + "epoch": 0.9904, + "grad_norm": 0.42425778703745654, + "learning_rate": 4.837177080119215e-08, + "loss": 0.6568, + "step": 1857 + }, + { + "epoch": 0.9909333333333333, + "grad_norm": 0.41936464307298676, + "learning_rate": 4.314680098592705e-08, + "loss": 0.6312, + "step": 1858 + }, + { + "epoch": 0.9914666666666667, + "grad_norm": 0.4194137029906762, + "learning_rate": 3.8220317506654226e-08, + "loss": 0.6429, + "step": 1859 + }, + { + "epoch": 0.992, + "grad_norm": 0.45631605031815015, + "learning_rate": 3.359233507459481e-08, + "loss": 0.6457, + "step": 1860 + }, + { + "epoch": 0.9925333333333334, + "grad_norm": 0.439101437133196, + "learning_rate": 2.9262867509605163e-08, + "loss": 0.6866, + "step": 1861 + }, + { + "epoch": 0.9930666666666667, + "grad_norm": 0.4851770734086949, + "learning_rate": 2.5231927740154704e-08, + "loss": 0.669, + "step": 1862 + }, + { + "epoch": 0.9936, + "grad_norm": 0.4469981928465987, + "learning_rate": 2.1499527803214846e-08, + "loss": 0.6478, + "step": 1863 + }, + { + "epoch": 0.9941333333333333, + "grad_norm": 0.43997181930545687, + "learning_rate": 1.8065678844314538e-08, + "loss": 0.6979, + "step": 1864 + }, + { + "epoch": 0.9946666666666667, + "grad_norm": 0.4445565582998424, + "learning_rate": 1.4930391117451426e-08, + "loss": 0.6102, + "step": 1865 + }, + { + "epoch": 0.9952, + "grad_norm": 0.4641788184566056, + "learning_rate": 1.209367398504746e-08, + "loss": 0.5776, + "step": 1866 + }, + { + "epoch": 0.9957333333333334, + "grad_norm": 0.46727608567067147, + "learning_rate": 9.555535917993297e-09, + "loss": 0.7867, + "step": 1867 + }, + { + "epoch": 0.9962666666666666, + "grad_norm": 0.4669485338942387, + "learning_rate": 7.315984495548378e-09, + "loss": 0.6466, + "step": 1868 + }, + { + "epoch": 0.9968, + "grad_norm": 0.38202673202303844, + "learning_rate": 5.375026405352035e-09, + "loss": 0.6139, + "step": 1869 + }, + { + "epoch": 0.9973333333333333, + "grad_norm": 0.40543450738181425, + "learning_rate": 3.732667443390181e-09, + "loss": 0.6796, + "step": 1870 + }, + { + "epoch": 0.9978666666666667, + "grad_norm": 0.46735765973973104, + "learning_rate": 2.388912514017516e-09, + "loss": 0.6674, + "step": 1871 + }, + { + "epoch": 0.9984, + "grad_norm": 0.43420510375418675, + "learning_rate": 1.3437656298687097e-09, + "loss": 0.5687, + "step": 1872 + }, + { + "epoch": 0.9989333333333333, + "grad_norm": 0.4329805481272118, + "learning_rate": 5.972299119250125e-10, + "loss": 0.6635, + "step": 1873 + }, + { + "epoch": 0.9994666666666666, + "grad_norm": 0.5417662911864329, + "learning_rate": 1.4930758944764479e-10, + "loss": 0.6103, + "step": 1874 + }, + { + "epoch": 1.0, + "grad_norm": 0.4466295705323495, + "learning_rate": 0.0, + "loss": 0.6386, + "step": 1875 + }, + { + "epoch": 1.0, + "step": 1875, + "total_flos": 1615624508571648.0, + "train_loss": 0.7279649205525716, + "train_runtime": 29022.034, + "train_samples_per_second": 1.034, + "train_steps_per_second": 0.065 + } + ], + "logging_steps": 1.0, + "max_steps": 1875, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1615624508571648.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_3_epochs_1_GA_2_lora/README.md b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_3_epochs_1_GA_2_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_3_epochs_1_GA_2_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_3_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_3_epochs_1_GA_2_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6012a4ebec36e2eba6258d3b4595f70f4a64a094 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_3_epochs_1_GA_2_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "up_proj", + "down_proj", + "q_proj", + "k_proj", + "gate_proj", + "o_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ae50c1d9bfd9d7f87e5ba15c0b44de3afa36f7f6 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba6b277f48d7dfaa5e05a643477dbb65a3549a1dfccfc581101891ed2edb6ad9 +size 671150064 diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_3_epochs_1_GA_2_lora/config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_3_epochs_1_GA_2_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_3_epochs_1_GA_2_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..b65fcb6adfc0c662578d8d8950e1f639bb56effa --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9a20fd34a6c8ca151907d24cd0b8d9b1480f4a9fd387812faa61c971b1da711 +size 918507402 diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_3_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_3_epochs_1_GA_2_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..168989d3270dcde7a3f34471ab90cbe646a6c73e --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_30000_repeat_3_epochs_1_GA_2_lora/trainer_state.json @@ -0,0 +1,13167 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 1875, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005333333333333334, + "grad_norm": 0.9119700272975201, + "learning_rate": 3.5087719298245615e-06, + "loss": 1.3821, + "step": 1 + }, + { + "epoch": 0.0010666666666666667, + "grad_norm": 1.0718196643084557, + "learning_rate": 7.017543859649123e-06, + "loss": 1.4152, + "step": 2 + }, + { + "epoch": 0.0016, + "grad_norm": 1.1393778386467817, + "learning_rate": 1.0526315789473684e-05, + "loss": 1.5635, + "step": 3 + }, + { + "epoch": 0.0021333333333333334, + "grad_norm": 1.0131635593166934, + "learning_rate": 1.4035087719298246e-05, + "loss": 1.4527, + "step": 4 + }, + { + "epoch": 0.0026666666666666666, + "grad_norm": 0.8693186056499881, + "learning_rate": 1.7543859649122806e-05, + "loss": 1.2904, + "step": 5 + }, + { + "epoch": 0.0032, + "grad_norm": 0.9535159566588861, + "learning_rate": 2.105263157894737e-05, + "loss": 1.4675, + "step": 6 + }, + { + "epoch": 0.0037333333333333333, + "grad_norm": 0.9025498048400175, + "learning_rate": 2.456140350877193e-05, + "loss": 1.2784, + "step": 7 + }, + { + "epoch": 0.004266666666666667, + "grad_norm": 0.9991334895096446, + "learning_rate": 2.8070175438596492e-05, + "loss": 1.2612, + "step": 8 + }, + { + "epoch": 0.0048, + "grad_norm": 0.8756434808752498, + "learning_rate": 3.157894736842105e-05, + "loss": 1.205, + "step": 9 + }, + { + "epoch": 0.005333333333333333, + "grad_norm": 0.900983440467934, + "learning_rate": 3.508771929824561e-05, + "loss": 1.0833, + "step": 10 + }, + { + "epoch": 0.005866666666666667, + "grad_norm": 0.8303584673970603, + "learning_rate": 3.859649122807018e-05, + "loss": 1.0393, + "step": 11 + }, + { + "epoch": 0.0064, + "grad_norm": 0.8616493818161056, + "learning_rate": 4.210526315789474e-05, + "loss": 1.0162, + "step": 12 + }, + { + "epoch": 0.006933333333333333, + "grad_norm": 0.7882583515645812, + "learning_rate": 4.56140350877193e-05, + "loss": 0.9879, + "step": 13 + }, + { + "epoch": 0.007466666666666667, + "grad_norm": 0.8990632518216867, + "learning_rate": 4.912280701754386e-05, + "loss": 1.0495, + "step": 14 + }, + { + "epoch": 0.008, + "grad_norm": 0.7193602759554054, + "learning_rate": 5.2631578947368424e-05, + "loss": 0.9614, + "step": 15 + }, + { + "epoch": 0.008533333333333334, + "grad_norm": 0.7455038933596019, + "learning_rate": 5.6140350877192984e-05, + "loss": 0.9421, + "step": 16 + }, + { + "epoch": 0.009066666666666667, + "grad_norm": 0.7662290819962464, + "learning_rate": 5.9649122807017544e-05, + "loss": 0.9845, + "step": 17 + }, + { + "epoch": 0.0096, + "grad_norm": 0.7454656513898643, + "learning_rate": 6.31578947368421e-05, + "loss": 1.0582, + "step": 18 + }, + { + "epoch": 0.010133333333333333, + "grad_norm": 0.7822446530803914, + "learning_rate": 6.666666666666667e-05, + "loss": 1.057, + "step": 19 + }, + { + "epoch": 0.010666666666666666, + "grad_norm": 0.5982877909907908, + "learning_rate": 7.017543859649122e-05, + "loss": 0.968, + "step": 20 + }, + { + "epoch": 0.0112, + "grad_norm": 0.5840007619802585, + "learning_rate": 7.368421052631579e-05, + "loss": 0.9889, + "step": 21 + }, + { + "epoch": 0.011733333333333333, + "grad_norm": 0.7592239226578813, + "learning_rate": 7.719298245614036e-05, + "loss": 0.9574, + "step": 22 + }, + { + "epoch": 0.012266666666666667, + "grad_norm": 0.5328144086451, + "learning_rate": 8.070175438596491e-05, + "loss": 0.8629, + "step": 23 + }, + { + "epoch": 0.0128, + "grad_norm": 0.6020252347294877, + "learning_rate": 8.421052631578948e-05, + "loss": 0.9021, + "step": 24 + }, + { + "epoch": 0.013333333333333334, + "grad_norm": 0.7507292569128217, + "learning_rate": 8.771929824561403e-05, + "loss": 1.0836, + "step": 25 + }, + { + "epoch": 0.013866666666666666, + "grad_norm": 0.5839345053116907, + "learning_rate": 9.12280701754386e-05, + "loss": 0.9046, + "step": 26 + }, + { + "epoch": 0.0144, + "grad_norm": 0.6215030194976022, + "learning_rate": 9.473684210526316e-05, + "loss": 0.9765, + "step": 27 + }, + { + "epoch": 0.014933333333333333, + "grad_norm": 0.4915343259310359, + "learning_rate": 9.824561403508771e-05, + "loss": 0.8249, + "step": 28 + }, + { + "epoch": 0.015466666666666667, + "grad_norm": 0.4972319010091224, + "learning_rate": 0.0001017543859649123, + "loss": 0.8724, + "step": 29 + }, + { + "epoch": 0.016, + "grad_norm": 0.59980842610381, + "learning_rate": 0.00010526315789473685, + "loss": 0.9404, + "step": 30 + }, + { + "epoch": 0.016533333333333334, + "grad_norm": 0.46809304950939246, + "learning_rate": 0.00010877192982456141, + "loss": 0.8375, + "step": 31 + }, + { + "epoch": 0.017066666666666667, + "grad_norm": 0.5945233102204843, + "learning_rate": 0.00011228070175438597, + "loss": 0.9573, + "step": 32 + }, + { + "epoch": 0.0176, + "grad_norm": 0.6260137470165155, + "learning_rate": 0.00011578947368421053, + "loss": 0.9153, + "step": 33 + }, + { + "epoch": 0.018133333333333335, + "grad_norm": 0.6023912493485183, + "learning_rate": 0.00011929824561403509, + "loss": 0.9513, + "step": 34 + }, + { + "epoch": 0.018666666666666668, + "grad_norm": 0.5556194850064836, + "learning_rate": 0.00012280701754385965, + "loss": 0.8632, + "step": 35 + }, + { + "epoch": 0.0192, + "grad_norm": 0.5021439033588748, + "learning_rate": 0.0001263157894736842, + "loss": 0.8663, + "step": 36 + }, + { + "epoch": 0.019733333333333332, + "grad_norm": 0.6624496837211223, + "learning_rate": 0.0001298245614035088, + "loss": 0.8901, + "step": 37 + }, + { + "epoch": 0.020266666666666665, + "grad_norm": 0.48543070481003414, + "learning_rate": 0.00013333333333333334, + "loss": 0.8713, + "step": 38 + }, + { + "epoch": 0.0208, + "grad_norm": 0.5724306483192274, + "learning_rate": 0.0001368421052631579, + "loss": 0.9028, + "step": 39 + }, + { + "epoch": 0.021333333333333333, + "grad_norm": 0.49738799856649346, + "learning_rate": 0.00014035087719298245, + "loss": 0.8829, + "step": 40 + }, + { + "epoch": 0.021866666666666666, + "grad_norm": 0.5778587860212584, + "learning_rate": 0.00014385964912280703, + "loss": 0.8749, + "step": 41 + }, + { + "epoch": 0.0224, + "grad_norm": 0.5426013547625161, + "learning_rate": 0.00014736842105263158, + "loss": 0.8475, + "step": 42 + }, + { + "epoch": 0.022933333333333333, + "grad_norm": 0.5662964359473427, + "learning_rate": 0.00015087719298245616, + "loss": 0.9221, + "step": 43 + }, + { + "epoch": 0.023466666666666667, + "grad_norm": 0.5117063133824206, + "learning_rate": 0.0001543859649122807, + "loss": 0.8964, + "step": 44 + }, + { + "epoch": 0.024, + "grad_norm": 0.4969149597813583, + "learning_rate": 0.00015789473684210527, + "loss": 0.8306, + "step": 45 + }, + { + "epoch": 0.024533333333333334, + "grad_norm": 0.48172094638505625, + "learning_rate": 0.00016140350877192982, + "loss": 0.8223, + "step": 46 + }, + { + "epoch": 0.025066666666666668, + "grad_norm": 0.48071715164871254, + "learning_rate": 0.0001649122807017544, + "loss": 0.8312, + "step": 47 + }, + { + "epoch": 0.0256, + "grad_norm": 0.5292675753394324, + "learning_rate": 0.00016842105263157895, + "loss": 0.9317, + "step": 48 + }, + { + "epoch": 0.026133333333333335, + "grad_norm": 0.5342209972167025, + "learning_rate": 0.00017192982456140353, + "loss": 0.8673, + "step": 49 + }, + { + "epoch": 0.02666666666666667, + "grad_norm": 0.5081316121656525, + "learning_rate": 0.00017543859649122806, + "loss": 0.8913, + "step": 50 + }, + { + "epoch": 0.0272, + "grad_norm": 0.5588855560452214, + "learning_rate": 0.00017894736842105264, + "loss": 0.8175, + "step": 51 + }, + { + "epoch": 0.027733333333333332, + "grad_norm": 0.5323609152601871, + "learning_rate": 0.0001824561403508772, + "loss": 0.8262, + "step": 52 + }, + { + "epoch": 0.028266666666666666, + "grad_norm": 0.4803683672039637, + "learning_rate": 0.00018596491228070177, + "loss": 0.8626, + "step": 53 + }, + { + "epoch": 0.0288, + "grad_norm": 0.5118088641558101, + "learning_rate": 0.00018947368421052632, + "loss": 0.8519, + "step": 54 + }, + { + "epoch": 0.029333333333333333, + "grad_norm": 0.464785397119558, + "learning_rate": 0.00019298245614035088, + "loss": 0.8197, + "step": 55 + }, + { + "epoch": 0.029866666666666666, + "grad_norm": 0.48244517394651254, + "learning_rate": 0.00019649122807017543, + "loss": 0.8196, + "step": 56 + }, + { + "epoch": 0.0304, + "grad_norm": 0.5228604284909696, + "learning_rate": 0.0002, + "loss": 0.9283, + "step": 57 + }, + { + "epoch": 0.030933333333333334, + "grad_norm": 0.43974129140484236, + "learning_rate": 0.00019999985069241055, + "loss": 0.7655, + "step": 58 + }, + { + "epoch": 0.031466666666666664, + "grad_norm": 0.4925496871049201, + "learning_rate": 0.00019999940277008808, + "loss": 0.7855, + "step": 59 + }, + { + "epoch": 0.032, + "grad_norm": 0.5410668694562225, + "learning_rate": 0.00019999865623437013, + "loss": 0.8061, + "step": 60 + }, + { + "epoch": 0.03253333333333333, + "grad_norm": 0.4921790802433175, + "learning_rate": 0.00019999761108748597, + "loss": 0.7518, + "step": 61 + }, + { + "epoch": 0.03306666666666667, + "grad_norm": 0.5760430011798686, + "learning_rate": 0.00019999626733255662, + "loss": 0.9381, + "step": 62 + }, + { + "epoch": 0.0336, + "grad_norm": 0.6915625292235289, + "learning_rate": 0.00019999462497359466, + "loss": 0.9606, + "step": 63 + }, + { + "epoch": 0.034133333333333335, + "grad_norm": 0.5017585919637912, + "learning_rate": 0.00019999268401550447, + "loss": 0.8618, + "step": 64 + }, + { + "epoch": 0.034666666666666665, + "grad_norm": 0.5509276120158647, + "learning_rate": 0.000199990444464082, + "loss": 0.886, + "step": 65 + }, + { + "epoch": 0.0352, + "grad_norm": 0.43360403605997344, + "learning_rate": 0.00019998790632601496, + "loss": 0.7596, + "step": 66 + }, + { + "epoch": 0.03573333333333333, + "grad_norm": 0.49634898390287524, + "learning_rate": 0.00019998506960888256, + "loss": 0.8257, + "step": 67 + }, + { + "epoch": 0.03626666666666667, + "grad_norm": 0.5745231614633087, + "learning_rate": 0.00019998193432115572, + "loss": 0.8586, + "step": 68 + }, + { + "epoch": 0.0368, + "grad_norm": 0.8266645215289209, + "learning_rate": 0.0001999785004721968, + "loss": 0.8365, + "step": 69 + }, + { + "epoch": 0.037333333333333336, + "grad_norm": 0.5477131617262088, + "learning_rate": 0.00019997476807225985, + "loss": 0.8333, + "step": 70 + }, + { + "epoch": 0.037866666666666667, + "grad_norm": 0.4705032521122792, + "learning_rate": 0.0001999707371324904, + "loss": 0.7209, + "step": 71 + }, + { + "epoch": 0.0384, + "grad_norm": 1.1485917970768864, + "learning_rate": 0.00019996640766492543, + "loss": 0.8667, + "step": 72 + }, + { + "epoch": 0.038933333333333334, + "grad_norm": 0.5213355277551339, + "learning_rate": 0.00019996177968249334, + "loss": 0.7743, + "step": 73 + }, + { + "epoch": 0.039466666666666664, + "grad_norm": 0.5383053332172496, + "learning_rate": 0.0001999568531990141, + "loss": 0.8704, + "step": 74 + }, + { + "epoch": 0.04, + "grad_norm": 0.6013015270279097, + "learning_rate": 0.00019995162822919883, + "loss": 0.9075, + "step": 75 + }, + { + "epoch": 0.04053333333333333, + "grad_norm": 0.5678333700613905, + "learning_rate": 0.00019994610478865011, + "loss": 0.9107, + "step": 76 + }, + { + "epoch": 0.04106666666666667, + "grad_norm": 0.5042385210985819, + "learning_rate": 0.0001999402828938618, + "loss": 0.8281, + "step": 77 + }, + { + "epoch": 0.0416, + "grad_norm": 0.5228070045040702, + "learning_rate": 0.00019993416256221895, + "loss": 0.8563, + "step": 78 + }, + { + "epoch": 0.042133333333333335, + "grad_norm": 0.48593817752220875, + "learning_rate": 0.00019992774381199778, + "loss": 0.8274, + "step": 79 + }, + { + "epoch": 0.042666666666666665, + "grad_norm": 0.45881395235536493, + "learning_rate": 0.00019992102666236566, + "loss": 0.7983, + "step": 80 + }, + { + "epoch": 0.0432, + "grad_norm": 0.4694744200503165, + "learning_rate": 0.00019991401113338104, + "loss": 0.8657, + "step": 81 + }, + { + "epoch": 0.04373333333333333, + "grad_norm": 0.586378163426478, + "learning_rate": 0.00019990669724599336, + "loss": 0.8803, + "step": 82 + }, + { + "epoch": 0.04426666666666667, + "grad_norm": 0.4795277369427708, + "learning_rate": 0.00019989908502204292, + "loss": 0.899, + "step": 83 + }, + { + "epoch": 0.0448, + "grad_norm": 0.6019636798520344, + "learning_rate": 0.00019989117448426108, + "loss": 0.889, + "step": 84 + }, + { + "epoch": 0.04533333333333334, + "grad_norm": 0.5343643349771755, + "learning_rate": 0.00019988296565626987, + "loss": 0.8313, + "step": 85 + }, + { + "epoch": 0.04586666666666667, + "grad_norm": 0.5055538415468328, + "learning_rate": 0.00019987445856258206, + "loss": 0.7763, + "step": 86 + }, + { + "epoch": 0.0464, + "grad_norm": 0.5627564761750243, + "learning_rate": 0.00019986565322860115, + "loss": 0.9671, + "step": 87 + }, + { + "epoch": 0.046933333333333334, + "grad_norm": 0.48616773415275505, + "learning_rate": 0.00019985654968062122, + "loss": 0.7953, + "step": 88 + }, + { + "epoch": 0.047466666666666664, + "grad_norm": 0.42410036246753435, + "learning_rate": 0.00019984714794582683, + "loss": 0.7518, + "step": 89 + }, + { + "epoch": 0.048, + "grad_norm": 0.5010923642769226, + "learning_rate": 0.00019983744805229296, + "loss": 0.8625, + "step": 90 + }, + { + "epoch": 0.04853333333333333, + "grad_norm": 0.5520335058511099, + "learning_rate": 0.000199827450028985, + "loss": 0.9194, + "step": 91 + }, + { + "epoch": 0.04906666666666667, + "grad_norm": 0.43426528508465045, + "learning_rate": 0.00019981715390575858, + "loss": 0.794, + "step": 92 + }, + { + "epoch": 0.0496, + "grad_norm": 0.6264895000839313, + "learning_rate": 0.00019980655971335945, + "loss": 0.8819, + "step": 93 + }, + { + "epoch": 0.050133333333333335, + "grad_norm": 0.44275750149644866, + "learning_rate": 0.00019979566748342347, + "loss": 0.7832, + "step": 94 + }, + { + "epoch": 0.050666666666666665, + "grad_norm": 0.5010512964374146, + "learning_rate": 0.00019978447724847652, + "loss": 0.7201, + "step": 95 + }, + { + "epoch": 0.0512, + "grad_norm": 0.4091834743340737, + "learning_rate": 0.00019977298904193437, + "loss": 0.7087, + "step": 96 + }, + { + "epoch": 0.05173333333333333, + "grad_norm": 0.5502180992156529, + "learning_rate": 0.00019976120289810247, + "loss": 0.8258, + "step": 97 + }, + { + "epoch": 0.05226666666666667, + "grad_norm": 0.5331677987078178, + "learning_rate": 0.00019974911885217608, + "loss": 0.8464, + "step": 98 + }, + { + "epoch": 0.0528, + "grad_norm": 0.4818881770102341, + "learning_rate": 0.00019973673694024, + "loss": 0.8135, + "step": 99 + }, + { + "epoch": 0.05333333333333334, + "grad_norm": 0.4916236843884564, + "learning_rate": 0.0001997240571992685, + "loss": 0.785, + "step": 100 + }, + { + "epoch": 0.05386666666666667, + "grad_norm": 0.48518289942846865, + "learning_rate": 0.00019971107966712518, + "loss": 0.8556, + "step": 101 + }, + { + "epoch": 0.0544, + "grad_norm": 0.4791473934410761, + "learning_rate": 0.00019969780438256293, + "loss": 0.7906, + "step": 102 + }, + { + "epoch": 0.054933333333333334, + "grad_norm": 0.49151403025248214, + "learning_rate": 0.0001996842313852238, + "loss": 0.8514, + "step": 103 + }, + { + "epoch": 0.055466666666666664, + "grad_norm": 0.4832521014843663, + "learning_rate": 0.00019967036071563877, + "loss": 0.7771, + "step": 104 + }, + { + "epoch": 0.056, + "grad_norm": 0.5078382653849669, + "learning_rate": 0.0001996561924152278, + "loss": 0.8755, + "step": 105 + }, + { + "epoch": 0.05653333333333333, + "grad_norm": 0.5340060334983346, + "learning_rate": 0.0001996417265262996, + "loss": 0.8358, + "step": 106 + }, + { + "epoch": 0.05706666666666667, + "grad_norm": 0.4805941528332125, + "learning_rate": 0.00019962696309205148, + "loss": 0.8763, + "step": 107 + }, + { + "epoch": 0.0576, + "grad_norm": 0.5376214878006288, + "learning_rate": 0.0001996119021565693, + "loss": 0.8399, + "step": 108 + }, + { + "epoch": 0.058133333333333335, + "grad_norm": 0.41915303830518985, + "learning_rate": 0.0001995965437648273, + "loss": 0.7516, + "step": 109 + }, + { + "epoch": 0.058666666666666666, + "grad_norm": 0.46136233136624094, + "learning_rate": 0.00019958088796268793, + "loss": 0.8391, + "step": 110 + }, + { + "epoch": 0.0592, + "grad_norm": 0.4763426334860097, + "learning_rate": 0.0001995649347969019, + "loss": 0.8312, + "step": 111 + }, + { + "epoch": 0.05973333333333333, + "grad_norm": 0.5026388350654547, + "learning_rate": 0.00019954868431510764, + "loss": 0.8018, + "step": 112 + }, + { + "epoch": 0.06026666666666667, + "grad_norm": 0.6182458559640106, + "learning_rate": 0.00019953213656583168, + "loss": 0.9336, + "step": 113 + }, + { + "epoch": 0.0608, + "grad_norm": 0.527715068970576, + "learning_rate": 0.00019951529159848805, + "loss": 0.8682, + "step": 114 + }, + { + "epoch": 0.06133333333333333, + "grad_norm": 0.4447773028873336, + "learning_rate": 0.00019949814946337838, + "loss": 0.7474, + "step": 115 + }, + { + "epoch": 0.06186666666666667, + "grad_norm": 0.463959281978166, + "learning_rate": 0.00019948071021169174, + "loss": 0.8171, + "step": 116 + }, + { + "epoch": 0.0624, + "grad_norm": 0.5373781665591376, + "learning_rate": 0.00019946297389550433, + "loss": 0.813, + "step": 117 + }, + { + "epoch": 0.06293333333333333, + "grad_norm": 0.4835772506454369, + "learning_rate": 0.00019944494056777946, + "loss": 0.8434, + "step": 118 + }, + { + "epoch": 0.06346666666666667, + "grad_norm": 0.4429695324944789, + "learning_rate": 0.00019942661028236745, + "loss": 0.8312, + "step": 119 + }, + { + "epoch": 0.064, + "grad_norm": 0.4128380918026377, + "learning_rate": 0.00019940798309400526, + "loss": 0.7632, + "step": 120 + }, + { + "epoch": 0.06453333333333333, + "grad_norm": 0.5210114678778003, + "learning_rate": 0.00019938905905831654, + "loss": 0.8029, + "step": 121 + }, + { + "epoch": 0.06506666666666666, + "grad_norm": 0.44628619243476736, + "learning_rate": 0.00019936983823181132, + "loss": 0.8182, + "step": 122 + }, + { + "epoch": 0.0656, + "grad_norm": 0.5332101118781163, + "learning_rate": 0.0001993503206718859, + "loss": 0.8561, + "step": 123 + }, + { + "epoch": 0.06613333333333334, + "grad_norm": 0.5342966562026665, + "learning_rate": 0.00019933050643682269, + "loss": 0.8538, + "step": 124 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 0.44840331988829313, + "learning_rate": 0.00019931039558578997, + "loss": 0.7975, + "step": 125 + }, + { + "epoch": 0.0672, + "grad_norm": 0.7289144373966412, + "learning_rate": 0.00019928998817884182, + "loss": 0.8564, + "step": 126 + }, + { + "epoch": 0.06773333333333334, + "grad_norm": 0.4942579196886622, + "learning_rate": 0.00019926928427691786, + "loss": 0.8017, + "step": 127 + }, + { + "epoch": 0.06826666666666667, + "grad_norm": 0.5188942354342044, + "learning_rate": 0.00019924828394184306, + "loss": 0.7921, + "step": 128 + }, + { + "epoch": 0.0688, + "grad_norm": 0.5287360157098405, + "learning_rate": 0.00019922698723632767, + "loss": 0.7583, + "step": 129 + }, + { + "epoch": 0.06933333333333333, + "grad_norm": 0.5252132506997987, + "learning_rate": 0.0001992053942239668, + "loss": 0.9195, + "step": 130 + }, + { + "epoch": 0.06986666666666666, + "grad_norm": 0.47226849074210947, + "learning_rate": 0.0001991835049692405, + "loss": 0.8064, + "step": 131 + }, + { + "epoch": 0.0704, + "grad_norm": 0.46085123217104457, + "learning_rate": 0.00019916131953751342, + "loss": 0.8815, + "step": 132 + }, + { + "epoch": 0.07093333333333333, + "grad_norm": 0.6590894707714906, + "learning_rate": 0.0001991388379950346, + "loss": 0.8787, + "step": 133 + }, + { + "epoch": 0.07146666666666666, + "grad_norm": 0.5029288229631176, + "learning_rate": 0.0001991160604089374, + "loss": 0.7836, + "step": 134 + }, + { + "epoch": 0.072, + "grad_norm": 0.5211207894918858, + "learning_rate": 0.00019909298684723904, + "loss": 0.7657, + "step": 135 + }, + { + "epoch": 0.07253333333333334, + "grad_norm": 0.5465988338616571, + "learning_rate": 0.00019906961737884077, + "loss": 0.8561, + "step": 136 + }, + { + "epoch": 0.07306666666666667, + "grad_norm": 0.5809027775381883, + "learning_rate": 0.00019904595207352737, + "loss": 0.7855, + "step": 137 + }, + { + "epoch": 0.0736, + "grad_norm": 0.462816765401537, + "learning_rate": 0.00019902199100196697, + "loss": 0.8331, + "step": 138 + }, + { + "epoch": 0.07413333333333333, + "grad_norm": 0.5600373856307657, + "learning_rate": 0.000198997734235711, + "loss": 0.9407, + "step": 139 + }, + { + "epoch": 0.07466666666666667, + "grad_norm": 0.6067728214054904, + "learning_rate": 0.00019897318184719385, + "loss": 0.8819, + "step": 140 + }, + { + "epoch": 0.0752, + "grad_norm": 0.49008991813846703, + "learning_rate": 0.00019894833390973266, + "loss": 0.7995, + "step": 141 + }, + { + "epoch": 0.07573333333333333, + "grad_norm": 0.5541028209169759, + "learning_rate": 0.0001989231904975272, + "loss": 0.8401, + "step": 142 + }, + { + "epoch": 0.07626666666666666, + "grad_norm": 0.464542513012385, + "learning_rate": 0.00019889775168565943, + "loss": 0.7591, + "step": 143 + }, + { + "epoch": 0.0768, + "grad_norm": 0.5154056362998911, + "learning_rate": 0.00019887201755009357, + "loss": 0.7447, + "step": 144 + }, + { + "epoch": 0.07733333333333334, + "grad_norm": 0.5717212031111361, + "learning_rate": 0.00019884598816767563, + "loss": 0.8583, + "step": 145 + }, + { + "epoch": 0.07786666666666667, + "grad_norm": 0.5254911177742544, + "learning_rate": 0.0001988196636161333, + "loss": 0.7924, + "step": 146 + }, + { + "epoch": 0.0784, + "grad_norm": 0.4974739978149858, + "learning_rate": 0.0001987930439740757, + "loss": 0.8095, + "step": 147 + }, + { + "epoch": 0.07893333333333333, + "grad_norm": 0.5769448315890482, + "learning_rate": 0.00019876612932099308, + "loss": 0.9072, + "step": 148 + }, + { + "epoch": 0.07946666666666667, + "grad_norm": 0.5638757489101307, + "learning_rate": 0.0001987389197372567, + "loss": 0.8934, + "step": 149 + }, + { + "epoch": 0.08, + "grad_norm": 0.5079642645844904, + "learning_rate": 0.00019871141530411853, + "loss": 0.796, + "step": 150 + }, + { + "epoch": 0.08053333333333333, + "grad_norm": 0.4475648715258986, + "learning_rate": 0.00019868361610371097, + "loss": 0.7826, + "step": 151 + }, + { + "epoch": 0.08106666666666666, + "grad_norm": 0.5782655589773584, + "learning_rate": 0.00019865552221904665, + "loss": 0.9027, + "step": 152 + }, + { + "epoch": 0.0816, + "grad_norm": 0.46136766235987614, + "learning_rate": 0.0001986271337340182, + "loss": 0.7798, + "step": 153 + }, + { + "epoch": 0.08213333333333334, + "grad_norm": 0.4989979612996333, + "learning_rate": 0.00019859845073339787, + "loss": 0.7765, + "step": 154 + }, + { + "epoch": 0.08266666666666667, + "grad_norm": 0.430880249853866, + "learning_rate": 0.00019856947330283752, + "loss": 0.793, + "step": 155 + }, + { + "epoch": 0.0832, + "grad_norm": 0.622610818905008, + "learning_rate": 0.00019854020152886814, + "loss": 0.9388, + "step": 156 + }, + { + "epoch": 0.08373333333333334, + "grad_norm": 0.46427227544234384, + "learning_rate": 0.0001985106354988997, + "loss": 0.8083, + "step": 157 + }, + { + "epoch": 0.08426666666666667, + "grad_norm": 0.48434244209571903, + "learning_rate": 0.00019848077530122083, + "loss": 0.7798, + "step": 158 + }, + { + "epoch": 0.0848, + "grad_norm": 0.5185043526212169, + "learning_rate": 0.0001984506210249986, + "loss": 0.8302, + "step": 159 + }, + { + "epoch": 0.08533333333333333, + "grad_norm": 0.5218948447299503, + "learning_rate": 0.00019842017276027832, + "loss": 0.7977, + "step": 160 + }, + { + "epoch": 0.08586666666666666, + "grad_norm": 0.4929951302850292, + "learning_rate": 0.00019838943059798304, + "loss": 0.7564, + "step": 161 + }, + { + "epoch": 0.0864, + "grad_norm": 0.4221293177296331, + "learning_rate": 0.00019835839462991361, + "loss": 0.7325, + "step": 162 + }, + { + "epoch": 0.08693333333333333, + "grad_norm": 0.4835300941894702, + "learning_rate": 0.0001983270649487481, + "loss": 0.7834, + "step": 163 + }, + { + "epoch": 0.08746666666666666, + "grad_norm": 0.5061016404838357, + "learning_rate": 0.0001982954416480417, + "loss": 0.7837, + "step": 164 + }, + { + "epoch": 0.088, + "grad_norm": 0.42083367657463067, + "learning_rate": 0.00019826352482222638, + "loss": 0.7589, + "step": 165 + }, + { + "epoch": 0.08853333333333334, + "grad_norm": 0.49457450556033916, + "learning_rate": 0.00019823131456661063, + "loss": 0.7924, + "step": 166 + }, + { + "epoch": 0.08906666666666667, + "grad_norm": 0.5702855770493767, + "learning_rate": 0.00019819881097737915, + "loss": 0.8282, + "step": 167 + }, + { + "epoch": 0.0896, + "grad_norm": 0.5603048847161772, + "learning_rate": 0.00019816601415159263, + "loss": 0.8849, + "step": 168 + }, + { + "epoch": 0.09013333333333333, + "grad_norm": 0.5323390999392662, + "learning_rate": 0.00019813292418718732, + "loss": 0.8207, + "step": 169 + }, + { + "epoch": 0.09066666666666667, + "grad_norm": 0.5550731344239631, + "learning_rate": 0.0001980995411829749, + "loss": 0.7816, + "step": 170 + }, + { + "epoch": 0.0912, + "grad_norm": 0.5467278587957556, + "learning_rate": 0.0001980658652386421, + "loss": 0.795, + "step": 171 + }, + { + "epoch": 0.09173333333333333, + "grad_norm": 0.5871770847988043, + "learning_rate": 0.0001980318964547504, + "loss": 0.913, + "step": 172 + }, + { + "epoch": 0.09226666666666666, + "grad_norm": 0.42477602952763593, + "learning_rate": 0.0001979976349327357, + "loss": 0.8453, + "step": 173 + }, + { + "epoch": 0.0928, + "grad_norm": 0.4775436168861191, + "learning_rate": 0.00019796308077490817, + "loss": 0.8257, + "step": 174 + }, + { + "epoch": 0.09333333333333334, + "grad_norm": 0.5809533542417403, + "learning_rate": 0.00019792823408445174, + "loss": 0.8422, + "step": 175 + }, + { + "epoch": 0.09386666666666667, + "grad_norm": 0.5223553052307842, + "learning_rate": 0.0001978930949654239, + "loss": 0.9074, + "step": 176 + }, + { + "epoch": 0.0944, + "grad_norm": 0.6509701615774982, + "learning_rate": 0.00019785766352275542, + "loss": 0.8967, + "step": 177 + }, + { + "epoch": 0.09493333333333333, + "grad_norm": 0.5107337996414248, + "learning_rate": 0.00019782193986224995, + "loss": 0.8419, + "step": 178 + }, + { + "epoch": 0.09546666666666667, + "grad_norm": 0.48571779122474323, + "learning_rate": 0.00019778592409058378, + "loss": 0.8223, + "step": 179 + }, + { + "epoch": 0.096, + "grad_norm": 0.502817573421187, + "learning_rate": 0.00019774961631530545, + "loss": 0.7862, + "step": 180 + }, + { + "epoch": 0.09653333333333333, + "grad_norm": 0.47996959799300476, + "learning_rate": 0.0001977130166448355, + "loss": 0.8256, + "step": 181 + }, + { + "epoch": 0.09706666666666666, + "grad_norm": 0.5004879222180794, + "learning_rate": 0.00019767612518846608, + "loss": 0.8276, + "step": 182 + }, + { + "epoch": 0.0976, + "grad_norm": 0.44055728969726077, + "learning_rate": 0.00019763894205636072, + "loss": 0.767, + "step": 183 + }, + { + "epoch": 0.09813333333333334, + "grad_norm": 0.48817412888697204, + "learning_rate": 0.00019760146735955388, + "loss": 0.8126, + "step": 184 + }, + { + "epoch": 0.09866666666666667, + "grad_norm": 0.5026490948743149, + "learning_rate": 0.00019756370120995066, + "loss": 0.7725, + "step": 185 + }, + { + "epoch": 0.0992, + "grad_norm": 0.42242925801537123, + "learning_rate": 0.00019752564372032657, + "loss": 0.7868, + "step": 186 + }, + { + "epoch": 0.09973333333333333, + "grad_norm": 0.502595850937941, + "learning_rate": 0.000197487295004327, + "loss": 0.8072, + "step": 187 + }, + { + "epoch": 0.10026666666666667, + "grad_norm": 0.4536894309981548, + "learning_rate": 0.00019744865517646706, + "loss": 0.8315, + "step": 188 + }, + { + "epoch": 0.1008, + "grad_norm": 0.49222831809950096, + "learning_rate": 0.00019740972435213115, + "loss": 0.8421, + "step": 189 + }, + { + "epoch": 0.10133333333333333, + "grad_norm": 0.6071361839588753, + "learning_rate": 0.0001973705026475726, + "loss": 0.797, + "step": 190 + }, + { + "epoch": 0.10186666666666666, + "grad_norm": 0.4556865828533464, + "learning_rate": 0.00019733099017991341, + "loss": 0.7521, + "step": 191 + }, + { + "epoch": 0.1024, + "grad_norm": 0.5357536923095771, + "learning_rate": 0.00019729118706714375, + "loss": 0.765, + "step": 192 + }, + { + "epoch": 0.10293333333333334, + "grad_norm": 0.45300643748452357, + "learning_rate": 0.0001972510934281218, + "loss": 0.7924, + "step": 193 + }, + { + "epoch": 0.10346666666666667, + "grad_norm": 0.4150781235551886, + "learning_rate": 0.00019721070938257324, + "loss": 0.7856, + "step": 194 + }, + { + "epoch": 0.104, + "grad_norm": 0.5396250966839642, + "learning_rate": 0.00019717003505109095, + "loss": 0.834, + "step": 195 + }, + { + "epoch": 0.10453333333333334, + "grad_norm": 0.5777663052587192, + "learning_rate": 0.0001971290705551347, + "loss": 0.8661, + "step": 196 + }, + { + "epoch": 0.10506666666666667, + "grad_norm": 0.41978092651639354, + "learning_rate": 0.00019708781601703065, + "loss": 0.8292, + "step": 197 + }, + { + "epoch": 0.1056, + "grad_norm": 0.4327189342629885, + "learning_rate": 0.00019704627155997108, + "loss": 0.7917, + "step": 198 + }, + { + "epoch": 0.10613333333333333, + "grad_norm": 0.5339549918217065, + "learning_rate": 0.00019700443730801413, + "loss": 0.843, + "step": 199 + }, + { + "epoch": 0.10666666666666667, + "grad_norm": 0.6567496789400608, + "learning_rate": 0.00019696231338608316, + "loss": 0.9519, + "step": 200 + }, + { + "epoch": 0.1072, + "grad_norm": 0.48212135366814757, + "learning_rate": 0.00019691989991996663, + "loss": 0.8556, + "step": 201 + }, + { + "epoch": 0.10773333333333333, + "grad_norm": 0.4513003393009851, + "learning_rate": 0.00019687719703631755, + "loss": 0.816, + "step": 202 + }, + { + "epoch": 0.10826666666666666, + "grad_norm": 0.5945292664973479, + "learning_rate": 0.00019683420486265327, + "loss": 0.8212, + "step": 203 + }, + { + "epoch": 0.1088, + "grad_norm": 0.5674758834098058, + "learning_rate": 0.0001967909235273549, + "loss": 0.9254, + "step": 204 + }, + { + "epoch": 0.10933333333333334, + "grad_norm": 0.5049248316067212, + "learning_rate": 0.0001967473531596671, + "loss": 0.8058, + "step": 205 + }, + { + "epoch": 0.10986666666666667, + "grad_norm": 0.4310675636099357, + "learning_rate": 0.0001967034938896976, + "loss": 0.7619, + "step": 206 + }, + { + "epoch": 0.1104, + "grad_norm": 0.5688076725316934, + "learning_rate": 0.00019665934584841682, + "loss": 0.894, + "step": 207 + }, + { + "epoch": 0.11093333333333333, + "grad_norm": 0.4825263353941901, + "learning_rate": 0.0001966149091676575, + "loss": 0.8716, + "step": 208 + }, + { + "epoch": 0.11146666666666667, + "grad_norm": 0.5204282839861062, + "learning_rate": 0.00019657018398011434, + "loss": 0.7731, + "step": 209 + }, + { + "epoch": 0.112, + "grad_norm": 0.48193418900741986, + "learning_rate": 0.00019652517041934356, + "loss": 0.7857, + "step": 210 + }, + { + "epoch": 0.11253333333333333, + "grad_norm": 0.5923084939770342, + "learning_rate": 0.00019647986861976246, + "loss": 0.8693, + "step": 211 + }, + { + "epoch": 0.11306666666666666, + "grad_norm": 0.4346421240251085, + "learning_rate": 0.0001964342787166491, + "loss": 0.8206, + "step": 212 + }, + { + "epoch": 0.1136, + "grad_norm": 0.41518975595943, + "learning_rate": 0.00019638840084614182, + "loss": 0.7725, + "step": 213 + }, + { + "epoch": 0.11413333333333334, + "grad_norm": 0.5412131324371964, + "learning_rate": 0.0001963422351452389, + "loss": 0.7835, + "step": 214 + }, + { + "epoch": 0.11466666666666667, + "grad_norm": 0.413041307949752, + "learning_rate": 0.0001962957817517982, + "loss": 0.7578, + "step": 215 + }, + { + "epoch": 0.1152, + "grad_norm": 0.4350353314909387, + "learning_rate": 0.00019624904080453655, + "loss": 0.745, + "step": 216 + }, + { + "epoch": 0.11573333333333333, + "grad_norm": 0.5156924530388096, + "learning_rate": 0.00019620201244302952, + "loss": 0.8341, + "step": 217 + }, + { + "epoch": 0.11626666666666667, + "grad_norm": 0.5216251483099174, + "learning_rate": 0.00019615469680771096, + "loss": 0.8143, + "step": 218 + }, + { + "epoch": 0.1168, + "grad_norm": 0.4847475901770463, + "learning_rate": 0.00019610709403987246, + "loss": 0.8239, + "step": 219 + }, + { + "epoch": 0.11733333333333333, + "grad_norm": 0.5130061024698517, + "learning_rate": 0.00019605920428166323, + "loss": 0.8101, + "step": 220 + }, + { + "epoch": 0.11786666666666666, + "grad_norm": 0.5627559370642568, + "learning_rate": 0.00019601102767608923, + "loss": 0.905, + "step": 221 + }, + { + "epoch": 0.1184, + "grad_norm": 0.5760016774779814, + "learning_rate": 0.00019596256436701324, + "loss": 0.878, + "step": 222 + }, + { + "epoch": 0.11893333333333334, + "grad_norm": 0.697745543680888, + "learning_rate": 0.00019591381449915397, + "loss": 0.8782, + "step": 223 + }, + { + "epoch": 0.11946666666666667, + "grad_norm": 0.45254804088248957, + "learning_rate": 0.00019586477821808597, + "loss": 0.789, + "step": 224 + }, + { + "epoch": 0.12, + "grad_norm": 0.5177321917758226, + "learning_rate": 0.000195815455670239, + "loss": 0.8278, + "step": 225 + }, + { + "epoch": 0.12053333333333334, + "grad_norm": 0.6542211526689103, + "learning_rate": 0.00019576584700289768, + "loss": 0.8947, + "step": 226 + }, + { + "epoch": 0.12106666666666667, + "grad_norm": 0.46088993389214256, + "learning_rate": 0.00019571595236420102, + "loss": 0.8063, + "step": 227 + }, + { + "epoch": 0.1216, + "grad_norm": 0.44168806734271926, + "learning_rate": 0.00019566577190314197, + "loss": 0.7994, + "step": 228 + }, + { + "epoch": 0.12213333333333333, + "grad_norm": 0.4893573959878582, + "learning_rate": 0.00019561530576956703, + "loss": 0.7875, + "step": 229 + }, + { + "epoch": 0.12266666666666666, + "grad_norm": 0.4402039706479555, + "learning_rate": 0.00019556455411417573, + "loss": 0.7653, + "step": 230 + }, + { + "epoch": 0.1232, + "grad_norm": 0.5503484096259508, + "learning_rate": 0.0001955135170885202, + "loss": 0.8052, + "step": 231 + }, + { + "epoch": 0.12373333333333333, + "grad_norm": 0.445732614986419, + "learning_rate": 0.00019546219484500475, + "loss": 0.7911, + "step": 232 + }, + { + "epoch": 0.12426666666666666, + "grad_norm": 0.4683586447728192, + "learning_rate": 0.00019541058753688538, + "loss": 0.8049, + "step": 233 + }, + { + "epoch": 0.1248, + "grad_norm": 0.43590689675821337, + "learning_rate": 0.00019535869531826937, + "loss": 0.7289, + "step": 234 + }, + { + "epoch": 0.12533333333333332, + "grad_norm": 0.4313722639335353, + "learning_rate": 0.00019530651834411474, + "loss": 0.7321, + "step": 235 + }, + { + "epoch": 0.12586666666666665, + "grad_norm": 0.4977979563104626, + "learning_rate": 0.00019525405677022989, + "loss": 0.7528, + "step": 236 + }, + { + "epoch": 0.1264, + "grad_norm": 0.5386795150252797, + "learning_rate": 0.00019520131075327298, + "loss": 0.8602, + "step": 237 + }, + { + "epoch": 0.12693333333333334, + "grad_norm": 0.4647156785894839, + "learning_rate": 0.0001951482804507517, + "loss": 0.7707, + "step": 238 + }, + { + "epoch": 0.12746666666666667, + "grad_norm": 0.43784545150260307, + "learning_rate": 0.00019509496602102252, + "loss": 0.7433, + "step": 239 + }, + { + "epoch": 0.128, + "grad_norm": 0.5418414036525606, + "learning_rate": 0.00019504136762329047, + "loss": 0.8628, + "step": 240 + }, + { + "epoch": 0.12853333333333333, + "grad_norm": 0.49907138618976254, + "learning_rate": 0.00019498748541760846, + "loss": 0.8055, + "step": 241 + }, + { + "epoch": 0.12906666666666666, + "grad_norm": 0.4478793098798483, + "learning_rate": 0.0001949333195648769, + "loss": 0.7474, + "step": 242 + }, + { + "epoch": 0.1296, + "grad_norm": 0.7411814406167593, + "learning_rate": 0.00019487887022684336, + "loss": 0.8965, + "step": 243 + }, + { + "epoch": 0.13013333333333332, + "grad_norm": 0.4671607046294163, + "learning_rate": 0.00019482413756610173, + "loss": 0.7168, + "step": 244 + }, + { + "epoch": 0.13066666666666665, + "grad_norm": 0.5183060058121355, + "learning_rate": 0.0001947691217460921, + "loss": 0.7327, + "step": 245 + }, + { + "epoch": 0.1312, + "grad_norm": 0.45676797841041233, + "learning_rate": 0.00019471382293110003, + "loss": 0.8069, + "step": 246 + }, + { + "epoch": 0.13173333333333334, + "grad_norm": 0.5003422880277102, + "learning_rate": 0.00019465824128625617, + "loss": 0.8096, + "step": 247 + }, + { + "epoch": 0.13226666666666667, + "grad_norm": 0.4720046110864816, + "learning_rate": 0.00019460237697753577, + "loss": 0.7936, + "step": 248 + }, + { + "epoch": 0.1328, + "grad_norm": 0.4659375231388962, + "learning_rate": 0.00019454623017175812, + "loss": 0.8447, + "step": 249 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 0.3991832637400293, + "learning_rate": 0.00019448980103658613, + "loss": 0.7613, + "step": 250 + }, + { + "epoch": 0.13386666666666666, + "grad_norm": 0.46609744715715673, + "learning_rate": 0.0001944330897405257, + "loss": 0.8139, + "step": 251 + }, + { + "epoch": 0.1344, + "grad_norm": 0.5473569722098224, + "learning_rate": 0.00019437609645292546, + "loss": 0.8917, + "step": 252 + }, + { + "epoch": 0.13493333333333332, + "grad_norm": 0.5125319162989075, + "learning_rate": 0.00019431882134397598, + "loss": 0.8458, + "step": 253 + }, + { + "epoch": 0.13546666666666668, + "grad_norm": 0.44520219411089196, + "learning_rate": 0.00019426126458470936, + "loss": 0.7899, + "step": 254 + }, + { + "epoch": 0.136, + "grad_norm": 0.4939429098502889, + "learning_rate": 0.0001942034263469989, + "loss": 0.8231, + "step": 255 + }, + { + "epoch": 0.13653333333333334, + "grad_norm": 0.5186461089228898, + "learning_rate": 0.00019414530680355837, + "loss": 0.819, + "step": 256 + }, + { + "epoch": 0.13706666666666667, + "grad_norm": 0.43812957642139744, + "learning_rate": 0.00019408690612794148, + "loss": 0.7547, + "step": 257 + }, + { + "epoch": 0.1376, + "grad_norm": 0.4768485119368363, + "learning_rate": 0.00019402822449454153, + "loss": 0.7975, + "step": 258 + }, + { + "epoch": 0.13813333333333333, + "grad_norm": 0.5438896641929261, + "learning_rate": 0.00019396926207859084, + "loss": 0.876, + "step": 259 + }, + { + "epoch": 0.13866666666666666, + "grad_norm": 0.5040362007931832, + "learning_rate": 0.0001939100190561601, + "loss": 0.8342, + "step": 260 + }, + { + "epoch": 0.1392, + "grad_norm": 0.5907767181497797, + "learning_rate": 0.00019385049560415794, + "loss": 0.8547, + "step": 261 + }, + { + "epoch": 0.13973333333333332, + "grad_norm": 0.4134578829391838, + "learning_rate": 0.0001937906919003304, + "loss": 0.7977, + "step": 262 + }, + { + "epoch": 0.14026666666666668, + "grad_norm": 0.49573855042537013, + "learning_rate": 0.00019373060812326052, + "loss": 0.7871, + "step": 263 + }, + { + "epoch": 0.1408, + "grad_norm": 0.5376762627254102, + "learning_rate": 0.00019367024445236754, + "loss": 0.7686, + "step": 264 + }, + { + "epoch": 0.14133333333333334, + "grad_norm": 0.4271734633171038, + "learning_rate": 0.00019360960106790643, + "loss": 0.7099, + "step": 265 + }, + { + "epoch": 0.14186666666666667, + "grad_norm": 0.48891927309885547, + "learning_rate": 0.0001935486781509677, + "loss": 0.8501, + "step": 266 + }, + { + "epoch": 0.1424, + "grad_norm": 0.46702103354673885, + "learning_rate": 0.00019348747588347637, + "loss": 0.7685, + "step": 267 + }, + { + "epoch": 0.14293333333333333, + "grad_norm": 0.6714440742270075, + "learning_rate": 0.00019342599444819168, + "loss": 0.8112, + "step": 268 + }, + { + "epoch": 0.14346666666666666, + "grad_norm": 0.46719158465041644, + "learning_rate": 0.00019336423402870653, + "loss": 0.7317, + "step": 269 + }, + { + "epoch": 0.144, + "grad_norm": 0.4731175780188826, + "learning_rate": 0.00019330219480944694, + "loss": 0.7222, + "step": 270 + }, + { + "epoch": 0.14453333333333335, + "grad_norm": 0.4813435196214086, + "learning_rate": 0.0001932398769756714, + "loss": 0.8823, + "step": 271 + }, + { + "epoch": 0.14506666666666668, + "grad_norm": 0.5365664793147633, + "learning_rate": 0.0001931772807134704, + "loss": 0.7758, + "step": 272 + }, + { + "epoch": 0.1456, + "grad_norm": 0.5151121319660465, + "learning_rate": 0.00019311440620976597, + "loss": 0.8531, + "step": 273 + }, + { + "epoch": 0.14613333333333334, + "grad_norm": 0.4507442499257661, + "learning_rate": 0.00019305125365231084, + "loss": 0.7806, + "step": 274 + }, + { + "epoch": 0.14666666666666667, + "grad_norm": 0.49363095971705245, + "learning_rate": 0.00019298782322968815, + "loss": 0.7851, + "step": 275 + }, + { + "epoch": 0.1472, + "grad_norm": 0.4239664546814916, + "learning_rate": 0.0001929241151313108, + "loss": 0.7667, + "step": 276 + }, + { + "epoch": 0.14773333333333333, + "grad_norm": 0.5449213157291893, + "learning_rate": 0.0001928601295474208, + "loss": 0.8113, + "step": 277 + }, + { + "epoch": 0.14826666666666666, + "grad_norm": 0.5308694195693685, + "learning_rate": 0.00019279586666908884, + "loss": 0.7687, + "step": 278 + }, + { + "epoch": 0.1488, + "grad_norm": 0.5226657003524614, + "learning_rate": 0.00019273132668821364, + "loss": 0.7762, + "step": 279 + }, + { + "epoch": 0.14933333333333335, + "grad_norm": 0.5066598445384889, + "learning_rate": 0.00019266650979752136, + "loss": 0.8752, + "step": 280 + }, + { + "epoch": 0.14986666666666668, + "grad_norm": 0.4451461004102294, + "learning_rate": 0.00019260141619056507, + "loss": 0.7267, + "step": 281 + }, + { + "epoch": 0.1504, + "grad_norm": 0.8399821417921948, + "learning_rate": 0.00019253604606172417, + "loss": 0.9547, + "step": 282 + }, + { + "epoch": 0.15093333333333334, + "grad_norm": 0.4897670468294, + "learning_rate": 0.0001924703996062038, + "loss": 0.8311, + "step": 283 + }, + { + "epoch": 0.15146666666666667, + "grad_norm": 0.48818058616095594, + "learning_rate": 0.0001924044770200342, + "loss": 0.7179, + "step": 284 + }, + { + "epoch": 0.152, + "grad_norm": 0.4419529212954025, + "learning_rate": 0.00019233827850007027, + "loss": 0.8136, + "step": 285 + }, + { + "epoch": 0.15253333333333333, + "grad_norm": 0.4023698844938176, + "learning_rate": 0.0001922718042439908, + "loss": 0.7455, + "step": 286 + }, + { + "epoch": 0.15306666666666666, + "grad_norm": 0.47105044808585567, + "learning_rate": 0.000192205054450298, + "loss": 0.8503, + "step": 287 + }, + { + "epoch": 0.1536, + "grad_norm": 0.5408142824082688, + "learning_rate": 0.00019213802931831696, + "loss": 0.7178, + "step": 288 + }, + { + "epoch": 0.15413333333333334, + "grad_norm": 0.5581117645528779, + "learning_rate": 0.00019207072904819486, + "loss": 0.8308, + "step": 289 + }, + { + "epoch": 0.15466666666666667, + "grad_norm": 0.493046943947969, + "learning_rate": 0.00019200315384090044, + "loss": 0.8241, + "step": 290 + }, + { + "epoch": 0.1552, + "grad_norm": 0.48141813650755766, + "learning_rate": 0.00019193530389822363, + "loss": 0.7823, + "step": 291 + }, + { + "epoch": 0.15573333333333333, + "grad_norm": 0.5783959120067039, + "learning_rate": 0.00019186717942277462, + "loss": 0.8724, + "step": 292 + }, + { + "epoch": 0.15626666666666666, + "grad_norm": 0.5329112197466159, + "learning_rate": 0.00019179878061798347, + "loss": 0.8125, + "step": 293 + }, + { + "epoch": 0.1568, + "grad_norm": 0.4479424873064313, + "learning_rate": 0.00019173010768809933, + "loss": 0.7899, + "step": 294 + }, + { + "epoch": 0.15733333333333333, + "grad_norm": 0.4376026788812274, + "learning_rate": 0.00019166116083819002, + "loss": 0.7518, + "step": 295 + }, + { + "epoch": 0.15786666666666666, + "grad_norm": 0.5184722445808722, + "learning_rate": 0.00019159194027414128, + "loss": 0.7547, + "step": 296 + }, + { + "epoch": 0.1584, + "grad_norm": 0.4561139568161695, + "learning_rate": 0.0001915224462026563, + "loss": 0.7951, + "step": 297 + }, + { + "epoch": 0.15893333333333334, + "grad_norm": 0.4938852643093283, + "learning_rate": 0.00019145267883125482, + "loss": 0.7797, + "step": 298 + }, + { + "epoch": 0.15946666666666667, + "grad_norm": 0.4819159951909378, + "learning_rate": 0.00019138263836827288, + "loss": 0.7643, + "step": 299 + }, + { + "epoch": 0.16, + "grad_norm": 0.5486220244815267, + "learning_rate": 0.00019131232502286188, + "loss": 0.788, + "step": 300 + }, + { + "epoch": 0.16053333333333333, + "grad_norm": 0.5045180731593369, + "learning_rate": 0.00019124173900498818, + "loss": 0.7667, + "step": 301 + }, + { + "epoch": 0.16106666666666666, + "grad_norm": 0.5181599899043072, + "learning_rate": 0.00019117088052543233, + "loss": 0.8043, + "step": 302 + }, + { + "epoch": 0.1616, + "grad_norm": 0.4923806565929291, + "learning_rate": 0.0001910997497957885, + "loss": 0.8508, + "step": 303 + }, + { + "epoch": 0.16213333333333332, + "grad_norm": 0.419136356006891, + "learning_rate": 0.00019102834702846387, + "loss": 0.7943, + "step": 304 + }, + { + "epoch": 0.16266666666666665, + "grad_norm": 0.4455194340299678, + "learning_rate": 0.0001909566724366779, + "loss": 0.7328, + "step": 305 + }, + { + "epoch": 0.1632, + "grad_norm": 0.4945754793624841, + "learning_rate": 0.00019088472623446183, + "loss": 0.8475, + "step": 306 + }, + { + "epoch": 0.16373333333333334, + "grad_norm": 0.4420538266510707, + "learning_rate": 0.00019081250863665794, + "loss": 0.8184, + "step": 307 + }, + { + "epoch": 0.16426666666666667, + "grad_norm": 0.4269060032710139, + "learning_rate": 0.0001907400198589189, + "loss": 0.7401, + "step": 308 + }, + { + "epoch": 0.1648, + "grad_norm": 0.5099723371851572, + "learning_rate": 0.00019066726011770726, + "loss": 0.845, + "step": 309 + }, + { + "epoch": 0.16533333333333333, + "grad_norm": 0.4434519471106178, + "learning_rate": 0.00019059422963029464, + "loss": 0.7504, + "step": 310 + }, + { + "epoch": 0.16586666666666666, + "grad_norm": 0.43784132154621286, + "learning_rate": 0.0001905209286147611, + "loss": 0.7089, + "step": 311 + }, + { + "epoch": 0.1664, + "grad_norm": 0.43873857004609185, + "learning_rate": 0.0001904473572899947, + "loss": 0.7002, + "step": 312 + }, + { + "epoch": 0.16693333333333332, + "grad_norm": 0.4506345032265064, + "learning_rate": 0.0001903735158756905, + "loss": 0.7988, + "step": 313 + }, + { + "epoch": 0.16746666666666668, + "grad_norm": 0.4544785800442285, + "learning_rate": 0.0001902994045923502, + "loss": 0.7629, + "step": 314 + }, + { + "epoch": 0.168, + "grad_norm": 0.5072986473772981, + "learning_rate": 0.00019022502366128135, + "loss": 0.7645, + "step": 315 + }, + { + "epoch": 0.16853333333333334, + "grad_norm": 0.48380278700417223, + "learning_rate": 0.0001901503733045967, + "loss": 0.7708, + "step": 316 + }, + { + "epoch": 0.16906666666666667, + "grad_norm": 0.5401366720008375, + "learning_rate": 0.00019007545374521355, + "loss": 0.8885, + "step": 317 + }, + { + "epoch": 0.1696, + "grad_norm": 0.40824039395235506, + "learning_rate": 0.00019000026520685302, + "loss": 0.7431, + "step": 318 + }, + { + "epoch": 0.17013333333333333, + "grad_norm": 0.4773952792733267, + "learning_rate": 0.00018992480791403958, + "loss": 0.7904, + "step": 319 + }, + { + "epoch": 0.17066666666666666, + "grad_norm": 0.5147375115771243, + "learning_rate": 0.0001898490820921001, + "loss": 0.7826, + "step": 320 + }, + { + "epoch": 0.1712, + "grad_norm": 0.4967881434720466, + "learning_rate": 0.0001897730879671634, + "loss": 0.7921, + "step": 321 + }, + { + "epoch": 0.17173333333333332, + "grad_norm": 0.4020471288283699, + "learning_rate": 0.0001896968257661595, + "loss": 0.7395, + "step": 322 + }, + { + "epoch": 0.17226666666666668, + "grad_norm": 0.5547001347028977, + "learning_rate": 0.00018962029571681886, + "loss": 0.8265, + "step": 323 + }, + { + "epoch": 0.1728, + "grad_norm": 0.5937416350965022, + "learning_rate": 0.00018954349804767184, + "loss": 0.6799, + "step": 324 + }, + { + "epoch": 0.17333333333333334, + "grad_norm": 0.47119996583636425, + "learning_rate": 0.00018946643298804793, + "loss": 0.7639, + "step": 325 + }, + { + "epoch": 0.17386666666666667, + "grad_norm": 0.47514075140957784, + "learning_rate": 0.00018938910076807513, + "loss": 0.756, + "step": 326 + }, + { + "epoch": 0.1744, + "grad_norm": 0.5430729858471183, + "learning_rate": 0.00018931150161867916, + "loss": 0.8153, + "step": 327 + }, + { + "epoch": 0.17493333333333333, + "grad_norm": 0.4392242790602314, + "learning_rate": 0.0001892336357715829, + "loss": 0.7723, + "step": 328 + }, + { + "epoch": 0.17546666666666666, + "grad_norm": 0.43309400523514135, + "learning_rate": 0.0001891555034593055, + "loss": 0.7881, + "step": 329 + }, + { + "epoch": 0.176, + "grad_norm": 0.4428285074966686, + "learning_rate": 0.00018907710491516199, + "loss": 0.8107, + "step": 330 + }, + { + "epoch": 0.17653333333333332, + "grad_norm": 0.39498572960290707, + "learning_rate": 0.00018899844037326225, + "loss": 0.7057, + "step": 331 + }, + { + "epoch": 0.17706666666666668, + "grad_norm": 0.45384895759122684, + "learning_rate": 0.0001889195100685106, + "loss": 0.757, + "step": 332 + }, + { + "epoch": 0.1776, + "grad_norm": 0.4645589158513839, + "learning_rate": 0.0001888403142366049, + "loss": 0.7917, + "step": 333 + }, + { + "epoch": 0.17813333333333334, + "grad_norm": 0.4828225303507632, + "learning_rate": 0.00018876085311403593, + "loss": 0.7706, + "step": 334 + }, + { + "epoch": 0.17866666666666667, + "grad_norm": 0.41676161155491853, + "learning_rate": 0.00018868112693808665, + "loss": 0.8008, + "step": 335 + }, + { + "epoch": 0.1792, + "grad_norm": 0.40768148396249776, + "learning_rate": 0.00018860113594683148, + "loss": 0.7125, + "step": 336 + }, + { + "epoch": 0.17973333333333333, + "grad_norm": 0.4206330022531057, + "learning_rate": 0.00018852088037913577, + "loss": 0.708, + "step": 337 + }, + { + "epoch": 0.18026666666666666, + "grad_norm": 0.45852669946674146, + "learning_rate": 0.0001884403604746547, + "loss": 0.7252, + "step": 338 + }, + { + "epoch": 0.1808, + "grad_norm": 0.5187458223572374, + "learning_rate": 0.00018835957647383303, + "loss": 0.8094, + "step": 339 + }, + { + "epoch": 0.18133333333333335, + "grad_norm": 0.4218113268112975, + "learning_rate": 0.00018827852861790398, + "loss": 0.7457, + "step": 340 + }, + { + "epoch": 0.18186666666666668, + "grad_norm": 0.4779548315246646, + "learning_rate": 0.00018819721714888877, + "loss": 0.7733, + "step": 341 + }, + { + "epoch": 0.1824, + "grad_norm": 0.5305887385974879, + "learning_rate": 0.00018811564230959588, + "loss": 0.794, + "step": 342 + }, + { + "epoch": 0.18293333333333334, + "grad_norm": 0.4436089689306347, + "learning_rate": 0.00018803380434362, + "loss": 0.7014, + "step": 343 + }, + { + "epoch": 0.18346666666666667, + "grad_norm": 0.505347922907718, + "learning_rate": 0.0001879517034953418, + "loss": 0.8379, + "step": 344 + }, + { + "epoch": 0.184, + "grad_norm": 0.5307748724766961, + "learning_rate": 0.00018786934000992688, + "loss": 0.7534, + "step": 345 + }, + { + "epoch": 0.18453333333333333, + "grad_norm": 0.4603281559104605, + "learning_rate": 0.00018778671413332513, + "loss": 0.6725, + "step": 346 + }, + { + "epoch": 0.18506666666666666, + "grad_norm": 0.4448136344506952, + "learning_rate": 0.00018770382611226987, + "loss": 0.7645, + "step": 347 + }, + { + "epoch": 0.1856, + "grad_norm": 0.4500479016642222, + "learning_rate": 0.00018762067619427746, + "loss": 0.7326, + "step": 348 + }, + { + "epoch": 0.18613333333333335, + "grad_norm": 0.44498235610391224, + "learning_rate": 0.000187537264627646, + "loss": 0.7449, + "step": 349 + }, + { + "epoch": 0.18666666666666668, + "grad_norm": 0.4481725927338574, + "learning_rate": 0.00018745359166145523, + "loss": 0.7592, + "step": 350 + }, + { + "epoch": 0.1872, + "grad_norm": 0.5038906250643743, + "learning_rate": 0.00018736965754556528, + "loss": 0.7622, + "step": 351 + }, + { + "epoch": 0.18773333333333334, + "grad_norm": 0.4891318264847687, + "learning_rate": 0.00018728546253061614, + "loss": 0.7279, + "step": 352 + }, + { + "epoch": 0.18826666666666667, + "grad_norm": 0.46863427713366296, + "learning_rate": 0.00018720100686802694, + "loss": 0.8108, + "step": 353 + }, + { + "epoch": 0.1888, + "grad_norm": 0.5081538565443008, + "learning_rate": 0.00018711629080999504, + "loss": 0.851, + "step": 354 + }, + { + "epoch": 0.18933333333333333, + "grad_norm": 0.44156652490572995, + "learning_rate": 0.00018703131460949554, + "loss": 0.7617, + "step": 355 + }, + { + "epoch": 0.18986666666666666, + "grad_norm": 0.4222184947608319, + "learning_rate": 0.0001869460785202802, + "loss": 0.738, + "step": 356 + }, + { + "epoch": 0.1904, + "grad_norm": 0.487228890741418, + "learning_rate": 0.00018686058279687698, + "loss": 0.8004, + "step": 357 + }, + { + "epoch": 0.19093333333333334, + "grad_norm": 0.5018825649734135, + "learning_rate": 0.00018677482769458904, + "loss": 0.8676, + "step": 358 + }, + { + "epoch": 0.19146666666666667, + "grad_norm": 0.6506669771279707, + "learning_rate": 0.00018668881346949417, + "loss": 0.862, + "step": 359 + }, + { + "epoch": 0.192, + "grad_norm": 0.6038723576749023, + "learning_rate": 0.00018660254037844388, + "loss": 0.9103, + "step": 360 + }, + { + "epoch": 0.19253333333333333, + "grad_norm": 0.4424814885937773, + "learning_rate": 0.00018651600867906272, + "loss": 0.7687, + "step": 361 + }, + { + "epoch": 0.19306666666666666, + "grad_norm": 0.46165403935919586, + "learning_rate": 0.00018642921862974742, + "loss": 0.8191, + "step": 362 + }, + { + "epoch": 0.1936, + "grad_norm": 0.5007573545248332, + "learning_rate": 0.00018634217048966637, + "loss": 0.845, + "step": 363 + }, + { + "epoch": 0.19413333333333332, + "grad_norm": 0.5423783522464709, + "learning_rate": 0.00018625486451875843, + "loss": 0.7283, + "step": 364 + }, + { + "epoch": 0.19466666666666665, + "grad_norm": 0.46307366479277484, + "learning_rate": 0.0001861673009777325, + "loss": 0.8054, + "step": 365 + }, + { + "epoch": 0.1952, + "grad_norm": 0.392444456670787, + "learning_rate": 0.0001860794801280666, + "loss": 0.7268, + "step": 366 + }, + { + "epoch": 0.19573333333333334, + "grad_norm": 0.5157382155851634, + "learning_rate": 0.00018599140223200716, + "loss": 0.8073, + "step": 367 + }, + { + "epoch": 0.19626666666666667, + "grad_norm": 0.6019794040542387, + "learning_rate": 0.0001859030675525681, + "loss": 0.7911, + "step": 368 + }, + { + "epoch": 0.1968, + "grad_norm": 0.5257744626167561, + "learning_rate": 0.0001858144763535302, + "loss": 0.8577, + "step": 369 + }, + { + "epoch": 0.19733333333333333, + "grad_norm": 0.405993749808719, + "learning_rate": 0.0001857256288994402, + "loss": 0.7631, + "step": 370 + }, + { + "epoch": 0.19786666666666666, + "grad_norm": 0.476821211198031, + "learning_rate": 0.00018563652545561013, + "loss": 0.8177, + "step": 371 + }, + { + "epoch": 0.1984, + "grad_norm": 0.3908364341021406, + "learning_rate": 0.0001855471662881164, + "loss": 0.6799, + "step": 372 + }, + { + "epoch": 0.19893333333333332, + "grad_norm": 0.46398208229538557, + "learning_rate": 0.000185457551663799, + "loss": 0.7424, + "step": 373 + }, + { + "epoch": 0.19946666666666665, + "grad_norm": 0.4222843288943092, + "learning_rate": 0.00018536768185026083, + "loss": 0.7429, + "step": 374 + }, + { + "epoch": 0.2, + "grad_norm": 0.46262208566459617, + "learning_rate": 0.00018527755711586678, + "loss": 0.7788, + "step": 375 + }, + { + "epoch": 0.20053333333333334, + "grad_norm": 0.4464732188169139, + "learning_rate": 0.00018518717772974302, + "loss": 0.7253, + "step": 376 + }, + { + "epoch": 0.20106666666666667, + "grad_norm": 0.45794068390259507, + "learning_rate": 0.00018509654396177609, + "loss": 0.7007, + "step": 377 + }, + { + "epoch": 0.2016, + "grad_norm": 0.4100964140410785, + "learning_rate": 0.00018500565608261214, + "loss": 0.7254, + "step": 378 + }, + { + "epoch": 0.20213333333333333, + "grad_norm": 0.5915915519935746, + "learning_rate": 0.00018491451436365627, + "loss": 0.916, + "step": 379 + }, + { + "epoch": 0.20266666666666666, + "grad_norm": 0.4283652869423221, + "learning_rate": 0.0001848231190770714, + "loss": 0.698, + "step": 380 + }, + { + "epoch": 0.2032, + "grad_norm": 0.4997935155568232, + "learning_rate": 0.00018473147049577774, + "loss": 0.8432, + "step": 381 + }, + { + "epoch": 0.20373333333333332, + "grad_norm": 0.5255887998793641, + "learning_rate": 0.00018463956889345194, + "loss": 0.7727, + "step": 382 + }, + { + "epoch": 0.20426666666666668, + "grad_norm": 0.4563515619886789, + "learning_rate": 0.00018454741454452603, + "loss": 0.8111, + "step": 383 + }, + { + "epoch": 0.2048, + "grad_norm": 0.44738489833947603, + "learning_rate": 0.00018445500772418697, + "loss": 0.781, + "step": 384 + }, + { + "epoch": 0.20533333333333334, + "grad_norm": 0.434618116840989, + "learning_rate": 0.00018436234870837547, + "loss": 0.7733, + "step": 385 + }, + { + "epoch": 0.20586666666666667, + "grad_norm": 0.47343657771952574, + "learning_rate": 0.00018426943777378552, + "loss": 0.7037, + "step": 386 + }, + { + "epoch": 0.2064, + "grad_norm": 0.4226790047438053, + "learning_rate": 0.00018417627519786315, + "loss": 0.7037, + "step": 387 + }, + { + "epoch": 0.20693333333333333, + "grad_norm": 0.48576369552246895, + "learning_rate": 0.00018408286125880604, + "loss": 0.7652, + "step": 388 + }, + { + "epoch": 0.20746666666666666, + "grad_norm": 0.3921296847107615, + "learning_rate": 0.00018398919623556238, + "loss": 0.7187, + "step": 389 + }, + { + "epoch": 0.208, + "grad_norm": 0.45799870157437206, + "learning_rate": 0.00018389528040783012, + "loss": 0.7962, + "step": 390 + }, + { + "epoch": 0.20853333333333332, + "grad_norm": 0.46374413527372166, + "learning_rate": 0.0001838011140560562, + "loss": 0.8047, + "step": 391 + }, + { + "epoch": 0.20906666666666668, + "grad_norm": 0.6108939178009056, + "learning_rate": 0.00018370669746143564, + "loss": 0.7613, + "step": 392 + }, + { + "epoch": 0.2096, + "grad_norm": 0.5203855562061216, + "learning_rate": 0.00018361203090591071, + "loss": 0.8278, + "step": 393 + }, + { + "epoch": 0.21013333333333334, + "grad_norm": 0.4793065235133658, + "learning_rate": 0.0001835171146721701, + "loss": 0.8114, + "step": 394 + }, + { + "epoch": 0.21066666666666667, + "grad_norm": 0.49522060969880577, + "learning_rate": 0.00018342194904364813, + "loss": 0.7992, + "step": 395 + }, + { + "epoch": 0.2112, + "grad_norm": 0.5669903074697817, + "learning_rate": 0.00018332653430452376, + "loss": 0.7945, + "step": 396 + }, + { + "epoch": 0.21173333333333333, + "grad_norm": 0.5845667221017729, + "learning_rate": 0.00018323087073971993, + "loss": 0.8268, + "step": 397 + }, + { + "epoch": 0.21226666666666666, + "grad_norm": 0.4377080202856934, + "learning_rate": 0.00018313495863490258, + "loss": 0.7976, + "step": 398 + }, + { + "epoch": 0.2128, + "grad_norm": 0.43160031783513936, + "learning_rate": 0.00018303879827647975, + "loss": 0.7563, + "step": 399 + }, + { + "epoch": 0.21333333333333335, + "grad_norm": 0.4413100309779214, + "learning_rate": 0.00018294238995160094, + "loss": 0.7697, + "step": 400 + }, + { + "epoch": 0.21386666666666668, + "grad_norm": 0.44401477827582986, + "learning_rate": 0.00018284573394815597, + "loss": 0.7469, + "step": 401 + }, + { + "epoch": 0.2144, + "grad_norm": 0.49848698437485534, + "learning_rate": 0.00018274883055477436, + "loss": 0.7272, + "step": 402 + }, + { + "epoch": 0.21493333333333334, + "grad_norm": 0.47428272112916825, + "learning_rate": 0.00018265168006082437, + "loss": 0.7728, + "step": 403 + }, + { + "epoch": 0.21546666666666667, + "grad_norm": 0.4284982233684095, + "learning_rate": 0.00018255428275641214, + "loss": 0.7661, + "step": 404 + }, + { + "epoch": 0.216, + "grad_norm": 0.5216743563351581, + "learning_rate": 0.00018245663893238075, + "loss": 0.7863, + "step": 405 + }, + { + "epoch": 0.21653333333333333, + "grad_norm": 0.44987741751980753, + "learning_rate": 0.0001823587488803095, + "loss": 0.6892, + "step": 406 + }, + { + "epoch": 0.21706666666666666, + "grad_norm": 0.5144487920497057, + "learning_rate": 0.00018226061289251298, + "loss": 0.7945, + "step": 407 + }, + { + "epoch": 0.2176, + "grad_norm": 0.45681089238630984, + "learning_rate": 0.00018216223126204007, + "loss": 0.7303, + "step": 408 + }, + { + "epoch": 0.21813333333333335, + "grad_norm": 0.40916256219685515, + "learning_rate": 0.00018206360428267332, + "loss": 0.6955, + "step": 409 + }, + { + "epoch": 0.21866666666666668, + "grad_norm": 0.44578108377027037, + "learning_rate": 0.00018196473224892784, + "loss": 0.7529, + "step": 410 + }, + { + "epoch": 0.2192, + "grad_norm": 0.3928321523460676, + "learning_rate": 0.00018186561545605054, + "loss": 0.668, + "step": 411 + }, + { + "epoch": 0.21973333333333334, + "grad_norm": 0.44742959965839746, + "learning_rate": 0.0001817662542000192, + "loss": 0.7845, + "step": 412 + }, + { + "epoch": 0.22026666666666667, + "grad_norm": 0.43637108165151556, + "learning_rate": 0.0001816666487775416, + "loss": 0.7247, + "step": 413 + }, + { + "epoch": 0.2208, + "grad_norm": 0.4702710993141143, + "learning_rate": 0.00018156679948605467, + "loss": 0.7579, + "step": 414 + }, + { + "epoch": 0.22133333333333333, + "grad_norm": 0.43545505247112853, + "learning_rate": 0.00018146670662372354, + "loss": 0.7147, + "step": 415 + }, + { + "epoch": 0.22186666666666666, + "grad_norm": 0.5139992211382781, + "learning_rate": 0.0001813663704894407, + "loss": 0.782, + "step": 416 + }, + { + "epoch": 0.2224, + "grad_norm": 0.5152609439061868, + "learning_rate": 0.00018126579138282503, + "loss": 0.8259, + "step": 417 + }, + { + "epoch": 0.22293333333333334, + "grad_norm": 0.45745088126524003, + "learning_rate": 0.00018116496960422107, + "loss": 0.6755, + "step": 418 + }, + { + "epoch": 0.22346666666666667, + "grad_norm": 0.4291324050724361, + "learning_rate": 0.00018106390545469795, + "loss": 0.7133, + "step": 419 + }, + { + "epoch": 0.224, + "grad_norm": 0.5518909640141185, + "learning_rate": 0.0001809625992360485, + "loss": 0.7545, + "step": 420 + }, + { + "epoch": 0.22453333333333333, + "grad_norm": 0.5559425812819749, + "learning_rate": 0.00018086105125078857, + "loss": 0.7434, + "step": 421 + }, + { + "epoch": 0.22506666666666666, + "grad_norm": 0.49879588892898163, + "learning_rate": 0.00018075926180215576, + "loss": 0.8135, + "step": 422 + }, + { + "epoch": 0.2256, + "grad_norm": 0.43629817554003497, + "learning_rate": 0.00018065723119410884, + "loss": 0.795, + "step": 423 + }, + { + "epoch": 0.22613333333333333, + "grad_norm": 0.5605025168067477, + "learning_rate": 0.0001805549597313267, + "loss": 0.7445, + "step": 424 + }, + { + "epoch": 0.22666666666666666, + "grad_norm": 0.4156512765357945, + "learning_rate": 0.0001804524477192075, + "loss": 0.7397, + "step": 425 + }, + { + "epoch": 0.2272, + "grad_norm": 0.4186968790638674, + "learning_rate": 0.00018034969546386757, + "loss": 0.7156, + "step": 426 + }, + { + "epoch": 0.22773333333333334, + "grad_norm": 0.4383726287017184, + "learning_rate": 0.00018024670327214084, + "loss": 0.8117, + "step": 427 + }, + { + "epoch": 0.22826666666666667, + "grad_norm": 0.4508631691000904, + "learning_rate": 0.00018014347145157755, + "loss": 0.7327, + "step": 428 + }, + { + "epoch": 0.2288, + "grad_norm": 0.4561001826749441, + "learning_rate": 0.0001800400003104436, + "loss": 0.781, + "step": 429 + }, + { + "epoch": 0.22933333333333333, + "grad_norm": 0.3923978069667923, + "learning_rate": 0.0001799362901577196, + "loss": 0.6963, + "step": 430 + }, + { + "epoch": 0.22986666666666666, + "grad_norm": 0.49431958857673547, + "learning_rate": 0.00017983234130309968, + "loss": 0.8463, + "step": 431 + }, + { + "epoch": 0.2304, + "grad_norm": 0.5327542327158376, + "learning_rate": 0.00017972815405699103, + "loss": 0.7743, + "step": 432 + }, + { + "epoch": 0.23093333333333332, + "grad_norm": 0.40841816323592084, + "learning_rate": 0.00017962372873051252, + "loss": 0.7442, + "step": 433 + }, + { + "epoch": 0.23146666666666665, + "grad_norm": 0.41469439312498524, + "learning_rate": 0.00017951906563549397, + "loss": 0.7546, + "step": 434 + }, + { + "epoch": 0.232, + "grad_norm": 0.5187820639244829, + "learning_rate": 0.00017941416508447536, + "loss": 0.8284, + "step": 435 + }, + { + "epoch": 0.23253333333333334, + "grad_norm": 0.4814434766699283, + "learning_rate": 0.00017930902739070562, + "loss": 0.7562, + "step": 436 + }, + { + "epoch": 0.23306666666666667, + "grad_norm": 0.5158481600615762, + "learning_rate": 0.00017920365286814183, + "loss": 0.8588, + "step": 437 + }, + { + "epoch": 0.2336, + "grad_norm": 0.42692458496058494, + "learning_rate": 0.0001790980418314484, + "loss": 0.7278, + "step": 438 + }, + { + "epoch": 0.23413333333333333, + "grad_norm": 0.47341159773000596, + "learning_rate": 0.0001789921945959958, + "loss": 0.7832, + "step": 439 + }, + { + "epoch": 0.23466666666666666, + "grad_norm": 0.4658179251976753, + "learning_rate": 0.00017888611147786002, + "loss": 0.7859, + "step": 440 + }, + { + "epoch": 0.2352, + "grad_norm": 0.426747121759328, + "learning_rate": 0.00017877979279382135, + "loss": 0.7149, + "step": 441 + }, + { + "epoch": 0.23573333333333332, + "grad_norm": 0.4424751358594917, + "learning_rate": 0.00017867323886136348, + "loss": 0.713, + "step": 442 + }, + { + "epoch": 0.23626666666666668, + "grad_norm": 0.534590992906093, + "learning_rate": 0.00017856644999867264, + "loss": 0.7878, + "step": 443 + }, + { + "epoch": 0.2368, + "grad_norm": 0.41020810584542494, + "learning_rate": 0.0001784594265246366, + "loss": 0.6778, + "step": 444 + }, + { + "epoch": 0.23733333333333334, + "grad_norm": 0.5089599673121381, + "learning_rate": 0.00017835216875884368, + "loss": 0.7577, + "step": 445 + }, + { + "epoch": 0.23786666666666667, + "grad_norm": 0.44368525837594225, + "learning_rate": 0.0001782446770215819, + "loss": 0.8019, + "step": 446 + }, + { + "epoch": 0.2384, + "grad_norm": 0.39948427279118004, + "learning_rate": 0.0001781369516338378, + "loss": 0.7178, + "step": 447 + }, + { + "epoch": 0.23893333333333333, + "grad_norm": 0.6373508403094503, + "learning_rate": 0.00017802899291729585, + "loss": 0.802, + "step": 448 + }, + { + "epoch": 0.23946666666666666, + "grad_norm": 0.45969661521525756, + "learning_rate": 0.0001779208011943371, + "loss": 0.8592, + "step": 449 + }, + { + "epoch": 0.24, + "grad_norm": 0.41461759877326615, + "learning_rate": 0.00017781237678803847, + "loss": 0.8089, + "step": 450 + }, + { + "epoch": 0.24053333333333332, + "grad_norm": 0.461235913306641, + "learning_rate": 0.00017770372002217172, + "loss": 0.7495, + "step": 451 + }, + { + "epoch": 0.24106666666666668, + "grad_norm": 0.4790846421787979, + "learning_rate": 0.00017759483122120238, + "loss": 0.8091, + "step": 452 + }, + { + "epoch": 0.2416, + "grad_norm": 0.4246111144159431, + "learning_rate": 0.000177485710710289, + "loss": 0.7368, + "step": 453 + }, + { + "epoch": 0.24213333333333334, + "grad_norm": 0.45585421366298007, + "learning_rate": 0.00017737635881528196, + "loss": 0.6921, + "step": 454 + }, + { + "epoch": 0.24266666666666667, + "grad_norm": 0.5284232309591278, + "learning_rate": 0.00017726677586272263, + "loss": 0.8409, + "step": 455 + }, + { + "epoch": 0.2432, + "grad_norm": 0.5084751662011151, + "learning_rate": 0.00017715696217984235, + "loss": 0.7088, + "step": 456 + }, + { + "epoch": 0.24373333333333333, + "grad_norm": 0.4935359417276924, + "learning_rate": 0.00017704691809456143, + "loss": 0.7668, + "step": 457 + }, + { + "epoch": 0.24426666666666666, + "grad_norm": 0.7144664711570883, + "learning_rate": 0.0001769366439354882, + "loss": 0.8616, + "step": 458 + }, + { + "epoch": 0.2448, + "grad_norm": 0.38572961600312905, + "learning_rate": 0.00017682614003191807, + "loss": 0.7047, + "step": 459 + }, + { + "epoch": 0.24533333333333332, + "grad_norm": 0.4413409728911689, + "learning_rate": 0.00017671540671383243, + "loss": 0.6788, + "step": 460 + }, + { + "epoch": 0.24586666666666668, + "grad_norm": 0.5071092067644272, + "learning_rate": 0.0001766044443118978, + "loss": 0.8143, + "step": 461 + }, + { + "epoch": 0.2464, + "grad_norm": 0.4537530113046635, + "learning_rate": 0.00017649325315746478, + "loss": 0.7413, + "step": 462 + }, + { + "epoch": 0.24693333333333334, + "grad_norm": 0.4824048413518666, + "learning_rate": 0.00017638183358256696, + "loss": 0.7409, + "step": 463 + }, + { + "epoch": 0.24746666666666667, + "grad_norm": 0.4999336666430477, + "learning_rate": 0.00017627018591992018, + "loss": 0.8097, + "step": 464 + }, + { + "epoch": 0.248, + "grad_norm": 0.47620574127289, + "learning_rate": 0.0001761583105029213, + "loss": 0.8534, + "step": 465 + }, + { + "epoch": 0.24853333333333333, + "grad_norm": 0.4437154788572927, + "learning_rate": 0.00017604620766564723, + "loss": 0.6857, + "step": 466 + }, + { + "epoch": 0.24906666666666666, + "grad_norm": 0.490726925790608, + "learning_rate": 0.00017593387774285412, + "loss": 0.7347, + "step": 467 + }, + { + "epoch": 0.2496, + "grad_norm": 0.5480906633151426, + "learning_rate": 0.00017582132106997616, + "loss": 0.8543, + "step": 468 + }, + { + "epoch": 0.2501333333333333, + "grad_norm": 0.46751688012032416, + "learning_rate": 0.0001757085379831246, + "loss": 0.7774, + "step": 469 + }, + { + "epoch": 0.25066666666666665, + "grad_norm": 0.5012425805809494, + "learning_rate": 0.00017559552881908695, + "loss": 0.767, + "step": 470 + }, + { + "epoch": 0.2512, + "grad_norm": 1.0992435072970104, + "learning_rate": 0.00017548229391532572, + "loss": 0.8206, + "step": 471 + }, + { + "epoch": 0.2517333333333333, + "grad_norm": 0.41011198381290903, + "learning_rate": 0.00017536883360997743, + "loss": 0.7132, + "step": 472 + }, + { + "epoch": 0.25226666666666664, + "grad_norm": 0.4851822022697597, + "learning_rate": 0.00017525514824185185, + "loss": 0.8222, + "step": 473 + }, + { + "epoch": 0.2528, + "grad_norm": 0.42959752142849994, + "learning_rate": 0.00017514123815043074, + "loss": 0.7622, + "step": 474 + }, + { + "epoch": 0.25333333333333335, + "grad_norm": 0.4016411050805096, + "learning_rate": 0.00017502710367586687, + "loss": 0.7147, + "step": 475 + }, + { + "epoch": 0.2538666666666667, + "grad_norm": 0.3769853438689814, + "learning_rate": 0.0001749127451589832, + "loss": 0.716, + "step": 476 + }, + { + "epoch": 0.2544, + "grad_norm": 0.4565265477451086, + "learning_rate": 0.00017479816294127152, + "loss": 0.733, + "step": 477 + }, + { + "epoch": 0.25493333333333335, + "grad_norm": 0.47916721260073963, + "learning_rate": 0.00017468335736489177, + "loss": 0.7493, + "step": 478 + }, + { + "epoch": 0.2554666666666667, + "grad_norm": 0.4264583107936126, + "learning_rate": 0.00017456832877267084, + "loss": 0.7257, + "step": 479 + }, + { + "epoch": 0.256, + "grad_norm": 0.4297826533229006, + "learning_rate": 0.0001744530775081015, + "loss": 0.7708, + "step": 480 + }, + { + "epoch": 0.25653333333333334, + "grad_norm": 0.45047366153254836, + "learning_rate": 0.00017433760391534167, + "loss": 0.8296, + "step": 481 + }, + { + "epoch": 0.25706666666666667, + "grad_norm": 0.4245796722615162, + "learning_rate": 0.00017422190833921283, + "loss": 0.7569, + "step": 482 + }, + { + "epoch": 0.2576, + "grad_norm": 0.45489281736063836, + "learning_rate": 0.0001741059911251997, + "loss": 0.7745, + "step": 483 + }, + { + "epoch": 0.2581333333333333, + "grad_norm": 0.5182743917225674, + "learning_rate": 0.00017398985261944856, + "loss": 0.7437, + "step": 484 + }, + { + "epoch": 0.25866666666666666, + "grad_norm": 0.40290214610290087, + "learning_rate": 0.00017387349316876666, + "loss": 0.7179, + "step": 485 + }, + { + "epoch": 0.2592, + "grad_norm": 0.5148862084632833, + "learning_rate": 0.000173756913120621, + "loss": 0.7906, + "step": 486 + }, + { + "epoch": 0.2597333333333333, + "grad_norm": 0.605194359947896, + "learning_rate": 0.0001736401128231373, + "loss": 0.7882, + "step": 487 + }, + { + "epoch": 0.26026666666666665, + "grad_norm": 0.4361179650333874, + "learning_rate": 0.00017352309262509894, + "loss": 0.7683, + "step": 488 + }, + { + "epoch": 0.2608, + "grad_norm": 0.5566658095073518, + "learning_rate": 0.00017340585287594604, + "loss": 0.8477, + "step": 489 + }, + { + "epoch": 0.2613333333333333, + "grad_norm": 0.4514734220833295, + "learning_rate": 0.0001732883939257742, + "loss": 0.8213, + "step": 490 + }, + { + "epoch": 0.2618666666666667, + "grad_norm": 0.4421968839147472, + "learning_rate": 0.0001731707161253338, + "loss": 0.7037, + "step": 491 + }, + { + "epoch": 0.2624, + "grad_norm": 0.42301251886582336, + "learning_rate": 0.0001730528198260285, + "loss": 0.747, + "step": 492 + }, + { + "epoch": 0.26293333333333335, + "grad_norm": 0.3805910704634265, + "learning_rate": 0.00017293470537991463, + "loss": 0.7443, + "step": 493 + }, + { + "epoch": 0.2634666666666667, + "grad_norm": 0.4933055681343728, + "learning_rate": 0.00017281637313969978, + "loss": 0.8022, + "step": 494 + }, + { + "epoch": 0.264, + "grad_norm": 0.3707160728618861, + "learning_rate": 0.00017269782345874203, + "loss": 0.7278, + "step": 495 + }, + { + "epoch": 0.26453333333333334, + "grad_norm": 0.49277663481926737, + "learning_rate": 0.00017257905669104874, + "loss": 0.8518, + "step": 496 + }, + { + "epoch": 0.2650666666666667, + "grad_norm": 0.46835116809800187, + "learning_rate": 0.00017246007319127545, + "loss": 0.7389, + "step": 497 + }, + { + "epoch": 0.2656, + "grad_norm": 0.4746251387265908, + "learning_rate": 0.00017234087331472497, + "loss": 0.7445, + "step": 498 + }, + { + "epoch": 0.26613333333333333, + "grad_norm": 0.5137219752720172, + "learning_rate": 0.00017222145741734626, + "loss": 0.7075, + "step": 499 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 0.43526178449759145, + "learning_rate": 0.00017210182585573327, + "loss": 0.7448, + "step": 500 + }, + { + "epoch": 0.2672, + "grad_norm": 0.5320286884413845, + "learning_rate": 0.00017198197898712404, + "loss": 0.8117, + "step": 501 + }, + { + "epoch": 0.2677333333333333, + "grad_norm": 0.36948366328283744, + "learning_rate": 0.00017186191716939944, + "loss": 0.6865, + "step": 502 + }, + { + "epoch": 0.26826666666666665, + "grad_norm": 0.8508520461548158, + "learning_rate": 0.0001717416407610824, + "loss": 0.7845, + "step": 503 + }, + { + "epoch": 0.2688, + "grad_norm": 0.5067628459496808, + "learning_rate": 0.00017162115012133643, + "loss": 0.8046, + "step": 504 + }, + { + "epoch": 0.2693333333333333, + "grad_norm": 0.5182405956683679, + "learning_rate": 0.00017150044560996488, + "loss": 0.8451, + "step": 505 + }, + { + "epoch": 0.26986666666666664, + "grad_norm": 0.466922838762588, + "learning_rate": 0.00017137952758740978, + "loss": 0.7409, + "step": 506 + }, + { + "epoch": 0.2704, + "grad_norm": 0.49081381268238045, + "learning_rate": 0.00017125839641475072, + "loss": 0.6975, + "step": 507 + }, + { + "epoch": 0.27093333333333336, + "grad_norm": 0.4706872165534357, + "learning_rate": 0.00017113705245370368, + "loss": 0.8513, + "step": 508 + }, + { + "epoch": 0.2714666666666667, + "grad_norm": 0.5646184167803093, + "learning_rate": 0.00017101549606662024, + "loss": 0.8282, + "step": 509 + }, + { + "epoch": 0.272, + "grad_norm": 0.4222250521775234, + "learning_rate": 0.00017089372761648616, + "loss": 0.7423, + "step": 510 + }, + { + "epoch": 0.27253333333333335, + "grad_norm": 0.49523475485097457, + "learning_rate": 0.00017077174746692056, + "loss": 0.8919, + "step": 511 + }, + { + "epoch": 0.2730666666666667, + "grad_norm": 0.5167773312757417, + "learning_rate": 0.00017064955598217462, + "loss": 0.8549, + "step": 512 + }, + { + "epoch": 0.2736, + "grad_norm": 0.458375374443549, + "learning_rate": 0.00017052715352713075, + "loss": 0.7527, + "step": 513 + }, + { + "epoch": 0.27413333333333334, + "grad_norm": 0.5150099768348072, + "learning_rate": 0.00017040454046730115, + "loss": 0.8448, + "step": 514 + }, + { + "epoch": 0.27466666666666667, + "grad_norm": 0.45541857275671144, + "learning_rate": 0.00017028171716882714, + "loss": 0.7478, + "step": 515 + }, + { + "epoch": 0.2752, + "grad_norm": 0.46213514087870106, + "learning_rate": 0.00017015868399847768, + "loss": 0.734, + "step": 516 + }, + { + "epoch": 0.27573333333333333, + "grad_norm": 0.49544205271878083, + "learning_rate": 0.00017003544132364846, + "loss": 0.7522, + "step": 517 + }, + { + "epoch": 0.27626666666666666, + "grad_norm": 0.496244994144503, + "learning_rate": 0.00016991198951236088, + "loss": 0.7419, + "step": 518 + }, + { + "epoch": 0.2768, + "grad_norm": 0.4508646273754306, + "learning_rate": 0.00016978832893326074, + "loss": 0.736, + "step": 519 + }, + { + "epoch": 0.2773333333333333, + "grad_norm": 0.40179344760412944, + "learning_rate": 0.00016966445995561727, + "loss": 0.645, + "step": 520 + }, + { + "epoch": 0.27786666666666665, + "grad_norm": 0.5612524453527867, + "learning_rate": 0.00016954038294932216, + "loss": 0.8474, + "step": 521 + }, + { + "epoch": 0.2784, + "grad_norm": 0.4310080732108951, + "learning_rate": 0.00016941609828488807, + "loss": 0.7457, + "step": 522 + }, + { + "epoch": 0.2789333333333333, + "grad_norm": 0.46411264235863475, + "learning_rate": 0.0001692916063334479, + "loss": 0.7787, + "step": 523 + }, + { + "epoch": 0.27946666666666664, + "grad_norm": 0.4089350662590114, + "learning_rate": 0.0001691669074667535, + "loss": 0.7444, + "step": 524 + }, + { + "epoch": 0.28, + "grad_norm": 0.4803026477066216, + "learning_rate": 0.0001690420020571747, + "loss": 0.799, + "step": 525 + }, + { + "epoch": 0.28053333333333336, + "grad_norm": 0.4877188647636562, + "learning_rate": 0.0001689168904776979, + "loss": 0.774, + "step": 526 + }, + { + "epoch": 0.2810666666666667, + "grad_norm": 0.6431853537583769, + "learning_rate": 0.00016879157310192535, + "loss": 0.7877, + "step": 527 + }, + { + "epoch": 0.2816, + "grad_norm": 0.37497102337352045, + "learning_rate": 0.0001686660503040737, + "loss": 0.7186, + "step": 528 + }, + { + "epoch": 0.28213333333333335, + "grad_norm": 0.6827334487348299, + "learning_rate": 0.00016854032245897308, + "loss": 0.8863, + "step": 529 + }, + { + "epoch": 0.2826666666666667, + "grad_norm": 0.40588936339350645, + "learning_rate": 0.00016841438994206595, + "loss": 0.68, + "step": 530 + }, + { + "epoch": 0.2832, + "grad_norm": 0.4215497216503177, + "learning_rate": 0.00016828825312940592, + "loss": 0.7247, + "step": 531 + }, + { + "epoch": 0.28373333333333334, + "grad_norm": 0.7831930794444195, + "learning_rate": 0.00016816191239765667, + "loss": 0.8478, + "step": 532 + }, + { + "epoch": 0.28426666666666667, + "grad_norm": 0.5061159945992103, + "learning_rate": 0.00016803536812409075, + "loss": 0.8517, + "step": 533 + }, + { + "epoch": 0.2848, + "grad_norm": 0.4890160436690848, + "learning_rate": 0.0001679086206865886, + "loss": 0.799, + "step": 534 + }, + { + "epoch": 0.2853333333333333, + "grad_norm": 0.5272230919346497, + "learning_rate": 0.00016778167046363734, + "loss": 0.722, + "step": 535 + }, + { + "epoch": 0.28586666666666666, + "grad_norm": 0.44480060503257196, + "learning_rate": 0.00016765451783432953, + "loss": 0.7527, + "step": 536 + }, + { + "epoch": 0.2864, + "grad_norm": 0.40239636808342866, + "learning_rate": 0.00016752716317836229, + "loss": 0.7641, + "step": 537 + }, + { + "epoch": 0.2869333333333333, + "grad_norm": 0.40126641105842026, + "learning_rate": 0.0001673996068760359, + "loss": 0.7094, + "step": 538 + }, + { + "epoch": 0.28746666666666665, + "grad_norm": 0.3811150028579922, + "learning_rate": 0.00016727184930825288, + "loss": 0.6642, + "step": 539 + }, + { + "epoch": 0.288, + "grad_norm": 0.5752993294933703, + "learning_rate": 0.0001671438908565167, + "loss": 0.8616, + "step": 540 + }, + { + "epoch": 0.2885333333333333, + "grad_norm": 0.39481132125794266, + "learning_rate": 0.00016701573190293077, + "loss": 0.731, + "step": 541 + }, + { + "epoch": 0.2890666666666667, + "grad_norm": 0.3999058489020813, + "learning_rate": 0.00016688737283019706, + "loss": 0.6489, + "step": 542 + }, + { + "epoch": 0.2896, + "grad_norm": 0.4703230878149846, + "learning_rate": 0.00016675881402161536, + "loss": 0.7545, + "step": 543 + }, + { + "epoch": 0.29013333333333335, + "grad_norm": 0.49371013141131853, + "learning_rate": 0.00016663005586108176, + "loss": 0.7686, + "step": 544 + }, + { + "epoch": 0.2906666666666667, + "grad_norm": 0.4765619157691119, + "learning_rate": 0.00016650109873308765, + "loss": 0.739, + "step": 545 + }, + { + "epoch": 0.2912, + "grad_norm": 0.441853537963337, + "learning_rate": 0.0001663719430227186, + "loss": 0.7868, + "step": 546 + }, + { + "epoch": 0.29173333333333334, + "grad_norm": 0.45815936906330557, + "learning_rate": 0.0001662425891156531, + "loss": 0.7086, + "step": 547 + }, + { + "epoch": 0.2922666666666667, + "grad_norm": 0.4272721732662985, + "learning_rate": 0.00016611303739816168, + "loss": 0.7447, + "step": 548 + }, + { + "epoch": 0.2928, + "grad_norm": 0.4785296959558227, + "learning_rate": 0.00016598328825710533, + "loss": 0.765, + "step": 549 + }, + { + "epoch": 0.29333333333333333, + "grad_norm": 0.5004299400117509, + "learning_rate": 0.00016585334207993476, + "loss": 0.8165, + "step": 550 + }, + { + "epoch": 0.29386666666666666, + "grad_norm": 0.47233982615106174, + "learning_rate": 0.00016572319925468892, + "loss": 0.705, + "step": 551 + }, + { + "epoch": 0.2944, + "grad_norm": 0.4665555158010926, + "learning_rate": 0.000165592860169994, + "loss": 0.7443, + "step": 552 + }, + { + "epoch": 0.2949333333333333, + "grad_norm": 0.460903940185544, + "learning_rate": 0.0001654623252150624, + "loss": 0.7805, + "step": 553 + }, + { + "epoch": 0.29546666666666666, + "grad_norm": 0.46508206683156555, + "learning_rate": 0.00016533159477969122, + "loss": 0.715, + "step": 554 + }, + { + "epoch": 0.296, + "grad_norm": 0.4468702112243043, + "learning_rate": 0.00016520066925426144, + "loss": 0.6781, + "step": 555 + }, + { + "epoch": 0.2965333333333333, + "grad_norm": 0.42634830048222383, + "learning_rate": 0.00016506954902973655, + "loss": 0.7117, + "step": 556 + }, + { + "epoch": 0.29706666666666665, + "grad_norm": 0.47911788834227576, + "learning_rate": 0.00016493823449766136, + "loss": 0.7687, + "step": 557 + }, + { + "epoch": 0.2976, + "grad_norm": 0.43085295474765856, + "learning_rate": 0.0001648067260501611, + "loss": 0.6401, + "step": 558 + }, + { + "epoch": 0.2981333333333333, + "grad_norm": 0.564293491424751, + "learning_rate": 0.00016467502407993992, + "loss": 0.8547, + "step": 559 + }, + { + "epoch": 0.2986666666666667, + "grad_norm": 0.4291888973881084, + "learning_rate": 0.0001645431289802799, + "loss": 0.7025, + "step": 560 + }, + { + "epoch": 0.2992, + "grad_norm": 0.44075763650614463, + "learning_rate": 0.0001644110411450398, + "loss": 0.8156, + "step": 561 + }, + { + "epoch": 0.29973333333333335, + "grad_norm": 0.48487222787768797, + "learning_rate": 0.00016427876096865394, + "loss": 0.7687, + "step": 562 + }, + { + "epoch": 0.3002666666666667, + "grad_norm": 0.4697238382791699, + "learning_rate": 0.00016414628884613107, + "loss": 0.6664, + "step": 563 + }, + { + "epoch": 0.3008, + "grad_norm": 0.5700028463426027, + "learning_rate": 0.00016401362517305296, + "loss": 0.7585, + "step": 564 + }, + { + "epoch": 0.30133333333333334, + "grad_norm": 0.4105318090148504, + "learning_rate": 0.00016388077034557355, + "loss": 0.6799, + "step": 565 + }, + { + "epoch": 0.30186666666666667, + "grad_norm": 0.4119685152772743, + "learning_rate": 0.00016374772476041748, + "loss": 0.7439, + "step": 566 + }, + { + "epoch": 0.3024, + "grad_norm": 0.4522329317929429, + "learning_rate": 0.00016361448881487914, + "loss": 0.7265, + "step": 567 + }, + { + "epoch": 0.30293333333333333, + "grad_norm": 0.4765751028037275, + "learning_rate": 0.00016348106290682118, + "loss": 0.8052, + "step": 568 + }, + { + "epoch": 0.30346666666666666, + "grad_norm": 0.4258110640284748, + "learning_rate": 0.00016334744743467364, + "loss": 0.7484, + "step": 569 + }, + { + "epoch": 0.304, + "grad_norm": 0.4461786810301393, + "learning_rate": 0.00016321364279743266, + "loss": 0.7671, + "step": 570 + }, + { + "epoch": 0.3045333333333333, + "grad_norm": 0.5429128790669628, + "learning_rate": 0.00016307964939465914, + "loss": 0.8071, + "step": 571 + }, + { + "epoch": 0.30506666666666665, + "grad_norm": 0.4390605947849205, + "learning_rate": 0.00016294546762647775, + "loss": 0.697, + "step": 572 + }, + { + "epoch": 0.3056, + "grad_norm": 0.4038533604127142, + "learning_rate": 0.0001628110978935756, + "loss": 0.7075, + "step": 573 + }, + { + "epoch": 0.3061333333333333, + "grad_norm": 0.5568686684952407, + "learning_rate": 0.0001626765405972011, + "loss": 0.8495, + "step": 574 + }, + { + "epoch": 0.30666666666666664, + "grad_norm": 0.4747017954068572, + "learning_rate": 0.00016254179613916278, + "loss": 0.7613, + "step": 575 + }, + { + "epoch": 0.3072, + "grad_norm": 0.4366834280182313, + "learning_rate": 0.00016240686492182804, + "loss": 0.7626, + "step": 576 + }, + { + "epoch": 0.30773333333333336, + "grad_norm": 0.44999454185217175, + "learning_rate": 0.000162271747348122, + "loss": 0.7343, + "step": 577 + }, + { + "epoch": 0.3082666666666667, + "grad_norm": 0.5141098953987814, + "learning_rate": 0.0001621364438215262, + "loss": 0.7381, + "step": 578 + }, + { + "epoch": 0.3088, + "grad_norm": 0.4979981541763459, + "learning_rate": 0.00016200095474607753, + "loss": 0.8697, + "step": 579 + }, + { + "epoch": 0.30933333333333335, + "grad_norm": 0.5435192399709737, + "learning_rate": 0.00016186528052636692, + "loss": 0.7988, + "step": 580 + }, + { + "epoch": 0.3098666666666667, + "grad_norm": 0.5865212864847045, + "learning_rate": 0.0001617294215675382, + "loss": 0.8213, + "step": 581 + }, + { + "epoch": 0.3104, + "grad_norm": 0.4000217603254605, + "learning_rate": 0.00016159337827528685, + "loss": 0.7828, + "step": 582 + }, + { + "epoch": 0.31093333333333334, + "grad_norm": 0.6411672155226577, + "learning_rate": 0.0001614571510558588, + "loss": 0.8783, + "step": 583 + }, + { + "epoch": 0.31146666666666667, + "grad_norm": 0.45516267410646694, + "learning_rate": 0.00016132074031604917, + "loss": 0.7786, + "step": 584 + }, + { + "epoch": 0.312, + "grad_norm": 0.41059050816186476, + "learning_rate": 0.0001611841464632011, + "loss": 0.7115, + "step": 585 + }, + { + "epoch": 0.31253333333333333, + "grad_norm": 0.46687590472932344, + "learning_rate": 0.00016104736990520468, + "loss": 0.7689, + "step": 586 + }, + { + "epoch": 0.31306666666666666, + "grad_norm": 0.38822886603753526, + "learning_rate": 0.0001609104110504954, + "loss": 0.7012, + "step": 587 + }, + { + "epoch": 0.3136, + "grad_norm": 0.47065876283619956, + "learning_rate": 0.0001607732703080532, + "loss": 0.7555, + "step": 588 + }, + { + "epoch": 0.3141333333333333, + "grad_norm": 0.5020199385720759, + "learning_rate": 0.00016063594808740113, + "loss": 0.774, + "step": 589 + }, + { + "epoch": 0.31466666666666665, + "grad_norm": 0.3921988640457385, + "learning_rate": 0.00016049844479860422, + "loss": 0.7096, + "step": 590 + }, + { + "epoch": 0.3152, + "grad_norm": 0.5201087441068271, + "learning_rate": 0.00016036076085226814, + "loss": 0.8493, + "step": 591 + }, + { + "epoch": 0.3157333333333333, + "grad_norm": 0.5451923737626841, + "learning_rate": 0.00016022289665953808, + "loss": 0.7868, + "step": 592 + }, + { + "epoch": 0.31626666666666664, + "grad_norm": 0.47105095892126675, + "learning_rate": 0.00016008485263209742, + "loss": 0.7852, + "step": 593 + }, + { + "epoch": 0.3168, + "grad_norm": 0.5609582405955111, + "learning_rate": 0.0001599466291821666, + "loss": 0.6913, + "step": 594 + }, + { + "epoch": 0.31733333333333336, + "grad_norm": 0.48611142261714435, + "learning_rate": 0.0001598082267225018, + "loss": 0.7384, + "step": 595 + }, + { + "epoch": 0.3178666666666667, + "grad_norm": 0.4406908732434517, + "learning_rate": 0.0001596696456663938, + "loss": 0.6992, + "step": 596 + }, + { + "epoch": 0.3184, + "grad_norm": 0.4589133116579723, + "learning_rate": 0.0001595308864276666, + "loss": 0.7593, + "step": 597 + }, + { + "epoch": 0.31893333333333335, + "grad_norm": 0.42418996835669126, + "learning_rate": 0.00015939194942067646, + "loss": 0.7366, + "step": 598 + }, + { + "epoch": 0.3194666666666667, + "grad_norm": 0.5763811465298075, + "learning_rate": 0.0001592528350603103, + "loss": 0.8606, + "step": 599 + }, + { + "epoch": 0.32, + "grad_norm": 0.424128657028291, + "learning_rate": 0.0001591135437619847, + "loss": 0.687, + "step": 600 + }, + { + "epoch": 0.32053333333333334, + "grad_norm": 0.43710125300169733, + "learning_rate": 0.00015897407594164467, + "loss": 0.6815, + "step": 601 + }, + { + "epoch": 0.32106666666666667, + "grad_norm": 0.46211941169802706, + "learning_rate": 0.00015883443201576225, + "loss": 0.7514, + "step": 602 + }, + { + "epoch": 0.3216, + "grad_norm": 0.4909993806968154, + "learning_rate": 0.0001586946124013354, + "loss": 0.7277, + "step": 603 + }, + { + "epoch": 0.3221333333333333, + "grad_norm": 0.4923983192411157, + "learning_rate": 0.00015855461751588677, + "loss": 0.7655, + "step": 604 + }, + { + "epoch": 0.32266666666666666, + "grad_norm": 0.4741403142399323, + "learning_rate": 0.0001584144477774623, + "loss": 0.7538, + "step": 605 + }, + { + "epoch": 0.3232, + "grad_norm": 0.49141031927588963, + "learning_rate": 0.0001582741036046301, + "loss": 0.7945, + "step": 606 + }, + { + "epoch": 0.3237333333333333, + "grad_norm": 0.44870531022352494, + "learning_rate": 0.00015813358541647915, + "loss": 0.7406, + "step": 607 + }, + { + "epoch": 0.32426666666666665, + "grad_norm": 0.45734728130848884, + "learning_rate": 0.00015799289363261813, + "loss": 0.7482, + "step": 608 + }, + { + "epoch": 0.3248, + "grad_norm": 0.49567172635911816, + "learning_rate": 0.00015785202867317407, + "loss": 0.7477, + "step": 609 + }, + { + "epoch": 0.3253333333333333, + "grad_norm": 0.43475413300778126, + "learning_rate": 0.00015771099095879108, + "loss": 0.713, + "step": 610 + }, + { + "epoch": 0.3258666666666667, + "grad_norm": 0.46977840374942775, + "learning_rate": 0.0001575697809106292, + "loss": 0.8047, + "step": 611 + }, + { + "epoch": 0.3264, + "grad_norm": 0.4463777590892542, + "learning_rate": 0.00015742839895036305, + "loss": 0.7758, + "step": 612 + }, + { + "epoch": 0.32693333333333335, + "grad_norm": 0.45214582433111267, + "learning_rate": 0.00015728684550018064, + "loss": 0.7142, + "step": 613 + }, + { + "epoch": 0.3274666666666667, + "grad_norm": 0.49773011149752655, + "learning_rate": 0.0001571451209827821, + "loss": 0.7274, + "step": 614 + }, + { + "epoch": 0.328, + "grad_norm": 0.4314692528617401, + "learning_rate": 0.00015700322582137827, + "loss": 0.7007, + "step": 615 + }, + { + "epoch": 0.32853333333333334, + "grad_norm": 0.47206850117655724, + "learning_rate": 0.00015686116043968972, + "loss": 0.8095, + "step": 616 + }, + { + "epoch": 0.3290666666666667, + "grad_norm": 0.4991695597510264, + "learning_rate": 0.00015671892526194516, + "loss": 0.7972, + "step": 617 + }, + { + "epoch": 0.3296, + "grad_norm": 0.3983939422798158, + "learning_rate": 0.0001565765207128805, + "loss": 0.7764, + "step": 618 + }, + { + "epoch": 0.33013333333333333, + "grad_norm": 0.5050203062130337, + "learning_rate": 0.0001564339472177373, + "loss": 0.859, + "step": 619 + }, + { + "epoch": 0.33066666666666666, + "grad_norm": 0.4387603838845481, + "learning_rate": 0.00015629120520226165, + "loss": 0.6947, + "step": 620 + }, + { + "epoch": 0.3312, + "grad_norm": 0.3964909034223212, + "learning_rate": 0.0001561482950927029, + "loss": 0.668, + "step": 621 + }, + { + "epoch": 0.3317333333333333, + "grad_norm": 0.43171786252638306, + "learning_rate": 0.0001560052173158123, + "loss": 0.793, + "step": 622 + }, + { + "epoch": 0.33226666666666665, + "grad_norm": 0.5187456883855971, + "learning_rate": 0.00015586197229884184, + "loss": 0.7265, + "step": 623 + }, + { + "epoch": 0.3328, + "grad_norm": 0.41460633189876106, + "learning_rate": 0.00015571856046954285, + "loss": 0.7311, + "step": 624 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.5480534679771224, + "learning_rate": 0.00015557498225616487, + "loss": 0.7459, + "step": 625 + }, + { + "epoch": 0.33386666666666664, + "grad_norm": 0.4353692803898675, + "learning_rate": 0.0001554312380874542, + "loss": 0.7122, + "step": 626 + }, + { + "epoch": 0.3344, + "grad_norm": 0.4821633256669228, + "learning_rate": 0.00015528732839265272, + "loss": 0.742, + "step": 627 + }, + { + "epoch": 0.33493333333333336, + "grad_norm": 0.5037306860164165, + "learning_rate": 0.00015514325360149668, + "loss": 0.7978, + "step": 628 + }, + { + "epoch": 0.3354666666666667, + "grad_norm": 0.507867334028557, + "learning_rate": 0.0001549990141442153, + "loss": 0.8045, + "step": 629 + }, + { + "epoch": 0.336, + "grad_norm": 0.5110555865166123, + "learning_rate": 0.0001548546104515294, + "loss": 0.7501, + "step": 630 + }, + { + "epoch": 0.33653333333333335, + "grad_norm": 0.4073410256028646, + "learning_rate": 0.00015471004295465035, + "loss": 0.7028, + "step": 631 + }, + { + "epoch": 0.3370666666666667, + "grad_norm": 0.45785855304947404, + "learning_rate": 0.0001545653120852787, + "loss": 0.7284, + "step": 632 + }, + { + "epoch": 0.3376, + "grad_norm": 0.4220091163815511, + "learning_rate": 0.00015442041827560274, + "loss": 0.6307, + "step": 633 + }, + { + "epoch": 0.33813333333333334, + "grad_norm": 0.4914744393206375, + "learning_rate": 0.00015427536195829742, + "loss": 0.809, + "step": 634 + }, + { + "epoch": 0.33866666666666667, + "grad_norm": 0.4109261497500046, + "learning_rate": 0.00015413014356652286, + "loss": 0.6795, + "step": 635 + }, + { + "epoch": 0.3392, + "grad_norm": 0.5026884808116417, + "learning_rate": 0.00015398476353392323, + "loss": 0.7994, + "step": 636 + }, + { + "epoch": 0.33973333333333333, + "grad_norm": 0.4541051107471657, + "learning_rate": 0.00015383922229462549, + "loss": 0.7953, + "step": 637 + }, + { + "epoch": 0.34026666666666666, + "grad_norm": 0.6293001395704625, + "learning_rate": 0.00015369352028323774, + "loss": 0.8012, + "step": 638 + }, + { + "epoch": 0.3408, + "grad_norm": 0.4976287602137279, + "learning_rate": 0.00015354765793484834, + "loss": 0.6998, + "step": 639 + }, + { + "epoch": 0.3413333333333333, + "grad_norm": 0.4955922406872679, + "learning_rate": 0.0001534016356850244, + "loss": 0.7605, + "step": 640 + }, + { + "epoch": 0.34186666666666665, + "grad_norm": 0.44482542538331415, + "learning_rate": 0.0001532554539698105, + "loss": 0.7485, + "step": 641 + }, + { + "epoch": 0.3424, + "grad_norm": 0.4313395133652232, + "learning_rate": 0.00015310911322572753, + "loss": 0.7156, + "step": 642 + }, + { + "epoch": 0.3429333333333333, + "grad_norm": 0.4311577164197288, + "learning_rate": 0.00015296261388977108, + "loss": 0.7096, + "step": 643 + }, + { + "epoch": 0.34346666666666664, + "grad_norm": 0.4311431692900766, + "learning_rate": 0.0001528159563994104, + "loss": 0.7385, + "step": 644 + }, + { + "epoch": 0.344, + "grad_norm": 0.4427931426113665, + "learning_rate": 0.000152669141192587, + "loss": 0.755, + "step": 645 + }, + { + "epoch": 0.34453333333333336, + "grad_norm": 0.4465549549553681, + "learning_rate": 0.00015252216870771345, + "loss": 0.6685, + "step": 646 + }, + { + "epoch": 0.3450666666666667, + "grad_norm": 0.5075226293783536, + "learning_rate": 0.00015237503938367186, + "loss": 0.7675, + "step": 647 + }, + { + "epoch": 0.3456, + "grad_norm": 0.5226164172149571, + "learning_rate": 0.00015222775365981273, + "loss": 0.8111, + "step": 648 + }, + { + "epoch": 0.34613333333333335, + "grad_norm": 0.5139809186257368, + "learning_rate": 0.00015208031197595356, + "loss": 0.767, + "step": 649 + }, + { + "epoch": 0.3466666666666667, + "grad_norm": 0.44754949926385723, + "learning_rate": 0.0001519327147723776, + "loss": 0.7254, + "step": 650 + }, + { + "epoch": 0.3472, + "grad_norm": 0.4438473672187995, + "learning_rate": 0.00015178496248983254, + "loss": 0.7482, + "step": 651 + }, + { + "epoch": 0.34773333333333334, + "grad_norm": 0.42731239047649827, + "learning_rate": 0.0001516370555695291, + "loss": 0.7416, + "step": 652 + }, + { + "epoch": 0.34826666666666667, + "grad_norm": 0.46758032555994145, + "learning_rate": 0.00015148899445313981, + "loss": 0.744, + "step": 653 + }, + { + "epoch": 0.3488, + "grad_norm": 0.47426682965763883, + "learning_rate": 0.00015134077958279765, + "loss": 0.7751, + "step": 654 + }, + { + "epoch": 0.34933333333333333, + "grad_norm": 0.44707733777243375, + "learning_rate": 0.00015119241140109467, + "loss": 0.7932, + "step": 655 + }, + { + "epoch": 0.34986666666666666, + "grad_norm": 0.4004552113265985, + "learning_rate": 0.00015104389035108077, + "loss": 0.7307, + "step": 656 + }, + { + "epoch": 0.3504, + "grad_norm": 0.5467220954498396, + "learning_rate": 0.00015089521687626243, + "loss": 0.7373, + "step": 657 + }, + { + "epoch": 0.3509333333333333, + "grad_norm": 0.5220563295258382, + "learning_rate": 0.0001507463914206012, + "loss": 0.8364, + "step": 658 + }, + { + "epoch": 0.35146666666666665, + "grad_norm": 0.5634002615178116, + "learning_rate": 0.0001505974144285124, + "loss": 0.7415, + "step": 659 + }, + { + "epoch": 0.352, + "grad_norm": 0.38745943754068546, + "learning_rate": 0.000150448286344864, + "loss": 0.6637, + "step": 660 + }, + { + "epoch": 0.3525333333333333, + "grad_norm": 0.486359386615219, + "learning_rate": 0.00015029900761497506, + "loss": 0.7836, + "step": 661 + }, + { + "epoch": 0.35306666666666664, + "grad_norm": 0.5010873605466595, + "learning_rate": 0.00015014957868461458, + "loss": 0.7917, + "step": 662 + }, + { + "epoch": 0.3536, + "grad_norm": 0.501288289888726, + "learning_rate": 0.00015000000000000001, + "loss": 0.6915, + "step": 663 + }, + { + "epoch": 0.35413333333333336, + "grad_norm": 0.42303647740091144, + "learning_rate": 0.000149850272007796, + "loss": 0.7427, + "step": 664 + }, + { + "epoch": 0.3546666666666667, + "grad_norm": 0.5633511965577845, + "learning_rate": 0.00014970039515511304, + "loss": 0.7679, + "step": 665 + }, + { + "epoch": 0.3552, + "grad_norm": 0.4532301301358009, + "learning_rate": 0.00014955036988950618, + "loss": 0.7322, + "step": 666 + }, + { + "epoch": 0.35573333333333335, + "grad_norm": 0.5658234907452527, + "learning_rate": 0.0001494001966589736, + "loss": 0.7685, + "step": 667 + }, + { + "epoch": 0.3562666666666667, + "grad_norm": 0.5118907752126777, + "learning_rate": 0.00014924987591195547, + "loss": 0.7205, + "step": 668 + }, + { + "epoch": 0.3568, + "grad_norm": 0.4700921538809694, + "learning_rate": 0.00014909940809733222, + "loss": 0.735, + "step": 669 + }, + { + "epoch": 0.35733333333333334, + "grad_norm": 0.3948866192176909, + "learning_rate": 0.0001489487936644237, + "loss": 0.6639, + "step": 670 + }, + { + "epoch": 0.35786666666666667, + "grad_norm": 0.5029679317744075, + "learning_rate": 0.00014879803306298736, + "loss": 0.8032, + "step": 671 + }, + { + "epoch": 0.3584, + "grad_norm": 0.5285095766455293, + "learning_rate": 0.00014864712674321734, + "loss": 0.8266, + "step": 672 + }, + { + "epoch": 0.3589333333333333, + "grad_norm": 0.4678610632791758, + "learning_rate": 0.00014849607515574276, + "loss": 0.7573, + "step": 673 + }, + { + "epoch": 0.35946666666666666, + "grad_norm": 0.438079569007165, + "learning_rate": 0.00014834487875162657, + "loss": 0.7636, + "step": 674 + }, + { + "epoch": 0.36, + "grad_norm": 0.4315164695282025, + "learning_rate": 0.00014819353798236427, + "loss": 0.6787, + "step": 675 + }, + { + "epoch": 0.3605333333333333, + "grad_norm": 0.402632017098526, + "learning_rate": 0.00014804205329988225, + "loss": 0.738, + "step": 676 + }, + { + "epoch": 0.36106666666666665, + "grad_norm": 0.5319956953561994, + "learning_rate": 0.00014789042515653687, + "loss": 0.6921, + "step": 677 + }, + { + "epoch": 0.3616, + "grad_norm": 0.4671769037323997, + "learning_rate": 0.00014773865400511272, + "loss": 0.7307, + "step": 678 + }, + { + "epoch": 0.3621333333333333, + "grad_norm": 0.4963743520933098, + "learning_rate": 0.00014758674029882152, + "loss": 0.8179, + "step": 679 + }, + { + "epoch": 0.3626666666666667, + "grad_norm": 0.4376609890215395, + "learning_rate": 0.00014743468449130063, + "loss": 0.7613, + "step": 680 + }, + { + "epoch": 0.3632, + "grad_norm": 0.42537507347010867, + "learning_rate": 0.00014728248703661182, + "loss": 0.7545, + "step": 681 + }, + { + "epoch": 0.36373333333333335, + "grad_norm": 0.42779369666001926, + "learning_rate": 0.00014713014838923976, + "loss": 0.7511, + "step": 682 + }, + { + "epoch": 0.3642666666666667, + "grad_norm": 0.46320255199500754, + "learning_rate": 0.00014697766900409074, + "loss": 0.7646, + "step": 683 + }, + { + "epoch": 0.3648, + "grad_norm": 0.40544842990795427, + "learning_rate": 0.00014682504933649144, + "loss": 0.6486, + "step": 684 + }, + { + "epoch": 0.36533333333333334, + "grad_norm": 0.43613137315290307, + "learning_rate": 0.0001466722898421873, + "loss": 0.7125, + "step": 685 + }, + { + "epoch": 0.3658666666666667, + "grad_norm": 0.4516560485665258, + "learning_rate": 0.0001465193909773413, + "loss": 0.7988, + "step": 686 + }, + { + "epoch": 0.3664, + "grad_norm": 0.4622059127493755, + "learning_rate": 0.00014636635319853275, + "loss": 0.8072, + "step": 687 + }, + { + "epoch": 0.36693333333333333, + "grad_norm": 0.6062904742380004, + "learning_rate": 0.00014621317696275564, + "loss": 0.7711, + "step": 688 + }, + { + "epoch": 0.36746666666666666, + "grad_norm": 0.4457959629263612, + "learning_rate": 0.00014605986272741748, + "loss": 0.7529, + "step": 689 + }, + { + "epoch": 0.368, + "grad_norm": 0.5036905895559952, + "learning_rate": 0.00014590641095033787, + "loss": 0.742, + "step": 690 + }, + { + "epoch": 0.3685333333333333, + "grad_norm": 0.43078435343308674, + "learning_rate": 0.00014575282208974702, + "loss": 0.7135, + "step": 691 + }, + { + "epoch": 0.36906666666666665, + "grad_norm": 0.4309120967334071, + "learning_rate": 0.00014559909660428468, + "loss": 0.7066, + "step": 692 + }, + { + "epoch": 0.3696, + "grad_norm": 0.3910644078370677, + "learning_rate": 0.00014544523495299842, + "loss": 0.6915, + "step": 693 + }, + { + "epoch": 0.3701333333333333, + "grad_norm": 0.49404409459857673, + "learning_rate": 0.00014529123759534255, + "loss": 0.7782, + "step": 694 + }, + { + "epoch": 0.37066666666666664, + "grad_norm": 0.3786436836018683, + "learning_rate": 0.00014513710499117647, + "loss": 0.6636, + "step": 695 + }, + { + "epoch": 0.3712, + "grad_norm": 0.4820612392270634, + "learning_rate": 0.0001449828376007636, + "loss": 0.727, + "step": 696 + }, + { + "epoch": 0.37173333333333336, + "grad_norm": 0.49517669517339263, + "learning_rate": 0.00014482843588476974, + "loss": 0.7085, + "step": 697 + }, + { + "epoch": 0.3722666666666667, + "grad_norm": 0.3894098085822135, + "learning_rate": 0.00014467390030426186, + "loss": 0.7626, + "step": 698 + }, + { + "epoch": 0.3728, + "grad_norm": 0.4809366667011091, + "learning_rate": 0.0001445192313207067, + "loss": 0.7941, + "step": 699 + }, + { + "epoch": 0.37333333333333335, + "grad_norm": 0.4334479277896229, + "learning_rate": 0.0001443644293959693, + "loss": 0.7546, + "step": 700 + }, + { + "epoch": 0.3738666666666667, + "grad_norm": 0.411286707423272, + "learning_rate": 0.00014420949499231172, + "loss": 0.6783, + "step": 701 + }, + { + "epoch": 0.3744, + "grad_norm": 0.4091591056349995, + "learning_rate": 0.0001440544285723915, + "loss": 0.7366, + "step": 702 + }, + { + "epoch": 0.37493333333333334, + "grad_norm": 0.4459957849244347, + "learning_rate": 0.00014389923059926062, + "loss": 0.7454, + "step": 703 + }, + { + "epoch": 0.37546666666666667, + "grad_norm": 0.4340937086824739, + "learning_rate": 0.0001437439015363638, + "loss": 0.7597, + "step": 704 + }, + { + "epoch": 0.376, + "grad_norm": 0.436311728081951, + "learning_rate": 0.00014358844184753712, + "loss": 0.7509, + "step": 705 + }, + { + "epoch": 0.37653333333333333, + "grad_norm": 0.4130299382442576, + "learning_rate": 0.00014343285199700683, + "loss": 0.7613, + "step": 706 + }, + { + "epoch": 0.37706666666666666, + "grad_norm": 0.5002475940688763, + "learning_rate": 0.0001432771324493879, + "loss": 0.7146, + "step": 707 + }, + { + "epoch": 0.3776, + "grad_norm": 0.4327174731401735, + "learning_rate": 0.00014312128366968243, + "loss": 0.6813, + "step": 708 + }, + { + "epoch": 0.3781333333333333, + "grad_norm": 0.5696939459056176, + "learning_rate": 0.00014296530612327863, + "loss": 0.895, + "step": 709 + }, + { + "epoch": 0.37866666666666665, + "grad_norm": 0.4131339642407923, + "learning_rate": 0.00014280920027594907, + "loss": 0.7575, + "step": 710 + }, + { + "epoch": 0.3792, + "grad_norm": 0.39482645237754493, + "learning_rate": 0.00014265296659384956, + "loss": 0.6859, + "step": 711 + }, + { + "epoch": 0.3797333333333333, + "grad_norm": 0.46843622030627735, + "learning_rate": 0.00014249660554351752, + "loss": 0.7232, + "step": 712 + }, + { + "epoch": 0.38026666666666664, + "grad_norm": 0.45430122743579016, + "learning_rate": 0.00014234011759187083, + "loss": 0.7073, + "step": 713 + }, + { + "epoch": 0.3808, + "grad_norm": 0.45158387414078693, + "learning_rate": 0.00014218350320620624, + "loss": 0.7257, + "step": 714 + }, + { + "epoch": 0.38133333333333336, + "grad_norm": 0.39216543786202235, + "learning_rate": 0.00014202676285419812, + "loss": 0.6368, + "step": 715 + }, + { + "epoch": 0.3818666666666667, + "grad_norm": 0.4848796664385961, + "learning_rate": 0.00014186989700389687, + "loss": 0.6737, + "step": 716 + }, + { + "epoch": 0.3824, + "grad_norm": 0.410014976951978, + "learning_rate": 0.0001417129061237278, + "loss": 0.6624, + "step": 717 + }, + { + "epoch": 0.38293333333333335, + "grad_norm": 0.39256947308996265, + "learning_rate": 0.0001415557906824895, + "loss": 0.6836, + "step": 718 + }, + { + "epoch": 0.3834666666666667, + "grad_norm": 0.4565237959087724, + "learning_rate": 0.00014139855114935252, + "loss": 0.6788, + "step": 719 + }, + { + "epoch": 0.384, + "grad_norm": 0.5259409688862253, + "learning_rate": 0.00014124118799385796, + "loss": 0.7694, + "step": 720 + }, + { + "epoch": 0.38453333333333334, + "grad_norm": 0.47852330100220636, + "learning_rate": 0.0001410837016859161, + "loss": 0.7403, + "step": 721 + }, + { + "epoch": 0.38506666666666667, + "grad_norm": 0.4887007053618095, + "learning_rate": 0.00014092609269580496, + "loss": 0.7099, + "step": 722 + }, + { + "epoch": 0.3856, + "grad_norm": 0.43058414711812387, + "learning_rate": 0.00014076836149416887, + "loss": 0.7203, + "step": 723 + }, + { + "epoch": 0.38613333333333333, + "grad_norm": 0.4454818052149887, + "learning_rate": 0.00014061050855201723, + "loss": 0.7833, + "step": 724 + }, + { + "epoch": 0.38666666666666666, + "grad_norm": 0.5429261826001993, + "learning_rate": 0.0001404525343407228, + "loss": 0.7327, + "step": 725 + }, + { + "epoch": 0.3872, + "grad_norm": 0.39815626556736594, + "learning_rate": 0.0001402944393320206, + "loss": 0.7461, + "step": 726 + }, + { + "epoch": 0.3877333333333333, + "grad_norm": 0.39246289798886425, + "learning_rate": 0.00014013622399800627, + "loss": 0.6866, + "step": 727 + }, + { + "epoch": 0.38826666666666665, + "grad_norm": 0.4741579535345787, + "learning_rate": 0.00013997788881113489, + "loss": 0.6839, + "step": 728 + }, + { + "epoch": 0.3888, + "grad_norm": 0.4489399709915272, + "learning_rate": 0.00013981943424421932, + "loss": 0.6996, + "step": 729 + }, + { + "epoch": 0.3893333333333333, + "grad_norm": 0.4115010330663161, + "learning_rate": 0.0001396608607704289, + "loss": 0.718, + "step": 730 + }, + { + "epoch": 0.38986666666666664, + "grad_norm": 0.4956119127490432, + "learning_rate": 0.0001395021688632882, + "loss": 0.7818, + "step": 731 + }, + { + "epoch": 0.3904, + "grad_norm": 0.4278273974928803, + "learning_rate": 0.00013934335899667527, + "loss": 0.7528, + "step": 732 + }, + { + "epoch": 0.39093333333333335, + "grad_norm": 0.42786759225456455, + "learning_rate": 0.00013918443164482046, + "loss": 0.7039, + "step": 733 + }, + { + "epoch": 0.3914666666666667, + "grad_norm": 0.4337138843974863, + "learning_rate": 0.000139025387282305, + "loss": 0.661, + "step": 734 + }, + { + "epoch": 0.392, + "grad_norm": 0.4237467557227778, + "learning_rate": 0.00013886622638405952, + "loss": 0.7349, + "step": 735 + }, + { + "epoch": 0.39253333333333335, + "grad_norm": 0.40920424786891146, + "learning_rate": 0.0001387069494253626, + "loss": 0.691, + "step": 736 + }, + { + "epoch": 0.3930666666666667, + "grad_norm": 0.3686416086717631, + "learning_rate": 0.0001385475568818394, + "loss": 0.7122, + "step": 737 + }, + { + "epoch": 0.3936, + "grad_norm": 0.4643581788066938, + "learning_rate": 0.00013838804922946027, + "loss": 0.7569, + "step": 738 + }, + { + "epoch": 0.39413333333333334, + "grad_norm": 0.4762060081185955, + "learning_rate": 0.00013822842694453924, + "loss": 0.7806, + "step": 739 + }, + { + "epoch": 0.39466666666666667, + "grad_norm": 0.5081499789746543, + "learning_rate": 0.0001380686905037327, + "loss": 0.8157, + "step": 740 + }, + { + "epoch": 0.3952, + "grad_norm": 0.5130805092024211, + "learning_rate": 0.00013790884038403795, + "loss": 0.7197, + "step": 741 + }, + { + "epoch": 0.3957333333333333, + "grad_norm": 0.45755645408144535, + "learning_rate": 0.00013774887706279165, + "loss": 0.7087, + "step": 742 + }, + { + "epoch": 0.39626666666666666, + "grad_norm": 0.4031907778603954, + "learning_rate": 0.0001375888010176686, + "loss": 0.7051, + "step": 743 + }, + { + "epoch": 0.3968, + "grad_norm": 0.5609100001570461, + "learning_rate": 0.00013742861272668012, + "loss": 0.815, + "step": 744 + }, + { + "epoch": 0.3973333333333333, + "grad_norm": 0.49628776979255335, + "learning_rate": 0.00013726831266817278, + "loss": 0.7691, + "step": 745 + }, + { + "epoch": 0.39786666666666665, + "grad_norm": 0.45510353095639755, + "learning_rate": 0.00013710790132082692, + "loss": 0.7281, + "step": 746 + }, + { + "epoch": 0.3984, + "grad_norm": 0.47158515365956716, + "learning_rate": 0.00013694737916365517, + "loss": 0.7398, + "step": 747 + }, + { + "epoch": 0.3989333333333333, + "grad_norm": 0.364542506038144, + "learning_rate": 0.00013678674667600102, + "loss": 0.6406, + "step": 748 + }, + { + "epoch": 0.3994666666666667, + "grad_norm": 0.3910998335909253, + "learning_rate": 0.00013662600433753745, + "loss": 0.6849, + "step": 749 + }, + { + "epoch": 0.4, + "grad_norm": 0.5004213445389641, + "learning_rate": 0.00013646515262826552, + "loss": 0.8172, + "step": 750 + }, + { + "epoch": 0.40053333333333335, + "grad_norm": 0.5649113104129985, + "learning_rate": 0.00013630419202851284, + "loss": 0.798, + "step": 751 + }, + { + "epoch": 0.4010666666666667, + "grad_norm": 0.4772701337982866, + "learning_rate": 0.00013614312301893223, + "loss": 0.7384, + "step": 752 + }, + { + "epoch": 0.4016, + "grad_norm": 0.3911165137008045, + "learning_rate": 0.0001359819460805001, + "loss": 0.6913, + "step": 753 + }, + { + "epoch": 0.40213333333333334, + "grad_norm": 0.3920118938414042, + "learning_rate": 0.00013582066169451535, + "loss": 0.7101, + "step": 754 + }, + { + "epoch": 0.4026666666666667, + "grad_norm": 0.43284706717797405, + "learning_rate": 0.0001356592703425976, + "loss": 0.7517, + "step": 755 + }, + { + "epoch": 0.4032, + "grad_norm": 0.4571677343916639, + "learning_rate": 0.0001354977725066859, + "loss": 0.7085, + "step": 756 + }, + { + "epoch": 0.40373333333333333, + "grad_norm": 0.46129653075125937, + "learning_rate": 0.00013533616866903735, + "loss": 0.7866, + "step": 757 + }, + { + "epoch": 0.40426666666666666, + "grad_norm": 0.4040534320766542, + "learning_rate": 0.0001351744593122255, + "loss": 0.6709, + "step": 758 + }, + { + "epoch": 0.4048, + "grad_norm": 0.48713734333464104, + "learning_rate": 0.00013501264491913906, + "loss": 0.7979, + "step": 759 + }, + { + "epoch": 0.4053333333333333, + "grad_norm": 0.4422612508515027, + "learning_rate": 0.00013485072597298038, + "loss": 0.6914, + "step": 760 + }, + { + "epoch": 0.40586666666666665, + "grad_norm": 0.497515567006007, + "learning_rate": 0.00013468870295726398, + "loss": 0.6701, + "step": 761 + }, + { + "epoch": 0.4064, + "grad_norm": 0.3749627961878381, + "learning_rate": 0.0001345265763558152, + "loss": 0.678, + "step": 762 + }, + { + "epoch": 0.4069333333333333, + "grad_norm": 0.4053618178498056, + "learning_rate": 0.00013436434665276865, + "loss": 0.7526, + "step": 763 + }, + { + "epoch": 0.40746666666666664, + "grad_norm": 0.4869231955669743, + "learning_rate": 0.00013420201433256689, + "loss": 0.7363, + "step": 764 + }, + { + "epoch": 0.408, + "grad_norm": 0.5113603633922611, + "learning_rate": 0.00013403957987995882, + "loss": 0.7795, + "step": 765 + }, + { + "epoch": 0.40853333333333336, + "grad_norm": 0.43042627566994696, + "learning_rate": 0.00013387704377999842, + "loss": 0.7381, + "step": 766 + }, + { + "epoch": 0.4090666666666667, + "grad_norm": 0.42018128767763213, + "learning_rate": 0.00013371440651804313, + "loss": 0.7372, + "step": 767 + }, + { + "epoch": 0.4096, + "grad_norm": 0.45114974776976924, + "learning_rate": 0.0001335516685797525, + "loss": 0.7467, + "step": 768 + }, + { + "epoch": 0.41013333333333335, + "grad_norm": 0.37166054472673127, + "learning_rate": 0.00013338883045108674, + "loss": 0.7265, + "step": 769 + }, + { + "epoch": 0.4106666666666667, + "grad_norm": 0.5455089847152249, + "learning_rate": 0.00013322589261830517, + "loss": 0.8383, + "step": 770 + }, + { + "epoch": 0.4112, + "grad_norm": 0.47410323882797717, + "learning_rate": 0.00013306285556796495, + "loss": 0.7555, + "step": 771 + }, + { + "epoch": 0.41173333333333334, + "grad_norm": 0.3708553682655247, + "learning_rate": 0.0001328997197869194, + "loss": 0.6607, + "step": 772 + }, + { + "epoch": 0.41226666666666667, + "grad_norm": 0.39716143357931005, + "learning_rate": 0.0001327364857623168, + "loss": 0.69, + "step": 773 + }, + { + "epoch": 0.4128, + "grad_norm": 0.4637436225554559, + "learning_rate": 0.00013257315398159864, + "loss": 0.7121, + "step": 774 + }, + { + "epoch": 0.41333333333333333, + "grad_norm": 0.4646629132988441, + "learning_rate": 0.00013240972493249847, + "loss": 0.748, + "step": 775 + }, + { + "epoch": 0.41386666666666666, + "grad_norm": 0.5259203569640928, + "learning_rate": 0.0001322461991030402, + "loss": 0.7625, + "step": 776 + }, + { + "epoch": 0.4144, + "grad_norm": 0.505306648827023, + "learning_rate": 0.00013208257698153677, + "loss": 0.7674, + "step": 777 + }, + { + "epoch": 0.4149333333333333, + "grad_norm": 0.4137847217470741, + "learning_rate": 0.00013191885905658872, + "loss": 0.696, + "step": 778 + }, + { + "epoch": 0.41546666666666665, + "grad_norm": 0.49464438417860157, + "learning_rate": 0.0001317550458170826, + "loss": 0.7926, + "step": 779 + }, + { + "epoch": 0.416, + "grad_norm": 0.4242428104136327, + "learning_rate": 0.00013159113775218964, + "loss": 0.6871, + "step": 780 + }, + { + "epoch": 0.4165333333333333, + "grad_norm": 0.4727755381179743, + "learning_rate": 0.00013142713535136414, + "loss": 0.771, + "step": 781 + }, + { + "epoch": 0.41706666666666664, + "grad_norm": 0.43294329661718095, + "learning_rate": 0.00013126303910434214, + "loss": 0.7027, + "step": 782 + }, + { + "epoch": 0.4176, + "grad_norm": 0.4398267303341206, + "learning_rate": 0.00013109884950114007, + "loss": 0.808, + "step": 783 + }, + { + "epoch": 0.41813333333333336, + "grad_norm": 0.5014322494464222, + "learning_rate": 0.00013093456703205288, + "loss": 0.7417, + "step": 784 + }, + { + "epoch": 0.4186666666666667, + "grad_norm": 0.43561358626505575, + "learning_rate": 0.00013077019218765305, + "loss": 0.7333, + "step": 785 + }, + { + "epoch": 0.4192, + "grad_norm": 0.5735756367259995, + "learning_rate": 0.00013060572545878875, + "loss": 0.7636, + "step": 786 + }, + { + "epoch": 0.41973333333333335, + "grad_norm": 0.4816319454871905, + "learning_rate": 0.0001304411673365826, + "loss": 0.7922, + "step": 787 + }, + { + "epoch": 0.4202666666666667, + "grad_norm": 0.4310653881420473, + "learning_rate": 0.0001302765183124302, + "loss": 0.7094, + "step": 788 + }, + { + "epoch": 0.4208, + "grad_norm": 0.4846420316957972, + "learning_rate": 0.00013011177887799845, + "loss": 0.7582, + "step": 789 + }, + { + "epoch": 0.42133333333333334, + "grad_norm": 0.4324312494858273, + "learning_rate": 0.00012994694952522435, + "loss": 0.7419, + "step": 790 + }, + { + "epoch": 0.42186666666666667, + "grad_norm": 0.5074981102326569, + "learning_rate": 0.00012978203074631334, + "loss": 0.7312, + "step": 791 + }, + { + "epoch": 0.4224, + "grad_norm": 0.4905776990896686, + "learning_rate": 0.00012961702303373795, + "loss": 0.7956, + "step": 792 + }, + { + "epoch": 0.42293333333333333, + "grad_norm": 0.49200158234741015, + "learning_rate": 0.00012945192688023624, + "loss": 0.7989, + "step": 793 + }, + { + "epoch": 0.42346666666666666, + "grad_norm": 0.5086000623801779, + "learning_rate": 0.0001292867427788104, + "loss": 0.7491, + "step": 794 + }, + { + "epoch": 0.424, + "grad_norm": 0.5133411173886876, + "learning_rate": 0.00012912147122272523, + "loss": 0.7377, + "step": 795 + }, + { + "epoch": 0.4245333333333333, + "grad_norm": 0.506453431859821, + "learning_rate": 0.00012895611270550666, + "loss": 0.7692, + "step": 796 + }, + { + "epoch": 0.42506666666666665, + "grad_norm": 0.35191826880004473, + "learning_rate": 0.0001287906677209403, + "loss": 0.636, + "step": 797 + }, + { + "epoch": 0.4256, + "grad_norm": 0.4385447355157215, + "learning_rate": 0.00012862513676307008, + "loss": 0.7413, + "step": 798 + }, + { + "epoch": 0.4261333333333333, + "grad_norm": 0.5454625889989327, + "learning_rate": 0.0001284595203261965, + "loss": 0.8247, + "step": 799 + }, + { + "epoch": 0.4266666666666667, + "grad_norm": 0.4749738899934725, + "learning_rate": 0.00012829381890487536, + "loss": 0.7708, + "step": 800 + }, + { + "epoch": 0.4272, + "grad_norm": 0.43017137981306225, + "learning_rate": 0.00012812803299391628, + "loss": 0.6772, + "step": 801 + }, + { + "epoch": 0.42773333333333335, + "grad_norm": 0.45323444813708, + "learning_rate": 0.00012796216308838117, + "loss": 0.7362, + "step": 802 + }, + { + "epoch": 0.4282666666666667, + "grad_norm": 0.4185420011773539, + "learning_rate": 0.00012779620968358273, + "loss": 0.7612, + "step": 803 + }, + { + "epoch": 0.4288, + "grad_norm": 0.45638549873019607, + "learning_rate": 0.00012763017327508305, + "loss": 0.722, + "step": 804 + }, + { + "epoch": 0.42933333333333334, + "grad_norm": 0.3661491978523475, + "learning_rate": 0.00012746405435869198, + "loss": 0.6836, + "step": 805 + }, + { + "epoch": 0.4298666666666667, + "grad_norm": 0.44714719395811886, + "learning_rate": 0.00012729785343046588, + "loss": 0.7836, + "step": 806 + }, + { + "epoch": 0.4304, + "grad_norm": 0.47466575379253284, + "learning_rate": 0.0001271315709867059, + "loss": 0.7224, + "step": 807 + }, + { + "epoch": 0.43093333333333333, + "grad_norm": 0.4676065851593823, + "learning_rate": 0.00012696520752395672, + "loss": 0.8039, + "step": 808 + }, + { + "epoch": 0.43146666666666667, + "grad_norm": 0.4318394753620631, + "learning_rate": 0.00012679876353900482, + "loss": 0.7355, + "step": 809 + }, + { + "epoch": 0.432, + "grad_norm": 0.47774067896149053, + "learning_rate": 0.00012663223952887723, + "loss": 0.7645, + "step": 810 + }, + { + "epoch": 0.4325333333333333, + "grad_norm": 0.4311480104137981, + "learning_rate": 0.00012646563599083996, + "loss": 0.769, + "step": 811 + }, + { + "epoch": 0.43306666666666666, + "grad_norm": 0.410371089255066, + "learning_rate": 0.00012629895342239643, + "loss": 0.7305, + "step": 812 + }, + { + "epoch": 0.4336, + "grad_norm": 0.41546251368632736, + "learning_rate": 0.00012613219232128608, + "loss": 0.6847, + "step": 813 + }, + { + "epoch": 0.4341333333333333, + "grad_norm": 0.4405943194323538, + "learning_rate": 0.00012596535318548289, + "loss": 0.7464, + "step": 814 + }, + { + "epoch": 0.43466666666666665, + "grad_norm": 0.4330941730814299, + "learning_rate": 0.0001257984365131938, + "loss": 0.7176, + "step": 815 + }, + { + "epoch": 0.4352, + "grad_norm": 0.5736085095853767, + "learning_rate": 0.00012563144280285741, + "loss": 0.8581, + "step": 816 + }, + { + "epoch": 0.4357333333333333, + "grad_norm": 0.4472813130814026, + "learning_rate": 0.00012546437255314222, + "loss": 0.7106, + "step": 817 + }, + { + "epoch": 0.4362666666666667, + "grad_norm": 0.43569787512162517, + "learning_rate": 0.0001252972262629454, + "loss": 0.6917, + "step": 818 + }, + { + "epoch": 0.4368, + "grad_norm": 0.49454833062512754, + "learning_rate": 0.00012513000443139112, + "loss": 0.7953, + "step": 819 + }, + { + "epoch": 0.43733333333333335, + "grad_norm": 0.5181978628846783, + "learning_rate": 0.00012496270755782914, + "loss": 0.7135, + "step": 820 + }, + { + "epoch": 0.4378666666666667, + "grad_norm": 0.39352616423357195, + "learning_rate": 0.00012479533614183334, + "loss": 0.6566, + "step": 821 + }, + { + "epoch": 0.4384, + "grad_norm": 0.4610612173739749, + "learning_rate": 0.00012462789068320017, + "loss": 0.7231, + "step": 822 + }, + { + "epoch": 0.43893333333333334, + "grad_norm": 0.46913616543609205, + "learning_rate": 0.00012446037168194714, + "loss": 0.7332, + "step": 823 + }, + { + "epoch": 0.43946666666666667, + "grad_norm": 0.46953178408541935, + "learning_rate": 0.00012429277963831148, + "loss": 0.7212, + "step": 824 + }, + { + "epoch": 0.44, + "grad_norm": 0.6271304886126731, + "learning_rate": 0.00012412511505274844, + "loss": 0.8707, + "step": 825 + }, + { + "epoch": 0.44053333333333333, + "grad_norm": 0.46209248966127764, + "learning_rate": 0.00012395737842592995, + "loss": 0.7695, + "step": 826 + }, + { + "epoch": 0.44106666666666666, + "grad_norm": 0.47942304215077186, + "learning_rate": 0.000123789570258743, + "loss": 0.8565, + "step": 827 + }, + { + "epoch": 0.4416, + "grad_norm": 0.4101241980577303, + "learning_rate": 0.00012362169105228826, + "loss": 0.6485, + "step": 828 + }, + { + "epoch": 0.4421333333333333, + "grad_norm": 0.3816718730312766, + "learning_rate": 0.00012345374130787854, + "loss": 0.6256, + "step": 829 + }, + { + "epoch": 0.44266666666666665, + "grad_norm": 0.4417193001861336, + "learning_rate": 0.00012328572152703725, + "loss": 0.7207, + "step": 830 + }, + { + "epoch": 0.4432, + "grad_norm": 0.5981440375482587, + "learning_rate": 0.000123117632211497, + "loss": 0.7957, + "step": 831 + }, + { + "epoch": 0.4437333333333333, + "grad_norm": 0.4647444456843219, + "learning_rate": 0.00012294947386319794, + "loss": 0.7351, + "step": 832 + }, + { + "epoch": 0.44426666666666664, + "grad_norm": 0.40296395195083007, + "learning_rate": 0.0001227812469842864, + "loss": 0.7092, + "step": 833 + }, + { + "epoch": 0.4448, + "grad_norm": 0.402563489305275, + "learning_rate": 0.00012261295207711346, + "loss": 0.6711, + "step": 834 + }, + { + "epoch": 0.44533333333333336, + "grad_norm": 0.5106001320237675, + "learning_rate": 0.00012244458964423327, + "loss": 0.7049, + "step": 835 + }, + { + "epoch": 0.4458666666666667, + "grad_norm": 0.4237461312350529, + "learning_rate": 0.00012227616018840154, + "loss": 0.7332, + "step": 836 + }, + { + "epoch": 0.4464, + "grad_norm": 0.4558093727384499, + "learning_rate": 0.0001221076642125742, + "loss": 0.7274, + "step": 837 + }, + { + "epoch": 0.44693333333333335, + "grad_norm": 0.5204878320075624, + "learning_rate": 0.00012193910221990581, + "loss": 0.6482, + "step": 838 + }, + { + "epoch": 0.4474666666666667, + "grad_norm": 0.4889693265863364, + "learning_rate": 0.00012177047471374807, + "loss": 0.7185, + "step": 839 + }, + { + "epoch": 0.448, + "grad_norm": 0.4540591387797124, + "learning_rate": 0.00012160178219764837, + "loss": 0.746, + "step": 840 + }, + { + "epoch": 0.44853333333333334, + "grad_norm": 0.46190022087141114, + "learning_rate": 0.0001214330251753481, + "loss": 0.6699, + "step": 841 + }, + { + "epoch": 0.44906666666666667, + "grad_norm": 0.46435930782938306, + "learning_rate": 0.00012126420415078132, + "loss": 0.7373, + "step": 842 + }, + { + "epoch": 0.4496, + "grad_norm": 0.385266465778187, + "learning_rate": 0.00012109531962807332, + "loss": 0.6748, + "step": 843 + }, + { + "epoch": 0.45013333333333333, + "grad_norm": 0.43843292366862363, + "learning_rate": 0.00012092637211153885, + "loss": 0.6912, + "step": 844 + }, + { + "epoch": 0.45066666666666666, + "grad_norm": 0.5162492799710219, + "learning_rate": 0.0001207573621056809, + "loss": 0.7186, + "step": 845 + }, + { + "epoch": 0.4512, + "grad_norm": 0.4043800478325061, + "learning_rate": 0.00012058829011518896, + "loss": 0.7167, + "step": 846 + }, + { + "epoch": 0.4517333333333333, + "grad_norm": 0.5791722782202059, + "learning_rate": 0.00012041915664493761, + "loss": 0.7936, + "step": 847 + }, + { + "epoch": 0.45226666666666665, + "grad_norm": 0.4128245865326509, + "learning_rate": 0.00012024996219998517, + "loss": 0.7243, + "step": 848 + }, + { + "epoch": 0.4528, + "grad_norm": 0.4768187998001638, + "learning_rate": 0.00012008070728557186, + "loss": 0.7161, + "step": 849 + }, + { + "epoch": 0.4533333333333333, + "grad_norm": 0.3975473520741925, + "learning_rate": 0.00011991139240711857, + "loss": 0.7131, + "step": 850 + }, + { + "epoch": 0.45386666666666664, + "grad_norm": 0.4369425936837535, + "learning_rate": 0.00011974201807022525, + "loss": 0.7068, + "step": 851 + }, + { + "epoch": 0.4544, + "grad_norm": 0.4348740984844573, + "learning_rate": 0.00011957258478066931, + "loss": 0.7476, + "step": 852 + }, + { + "epoch": 0.45493333333333336, + "grad_norm": 0.3751438595107849, + "learning_rate": 0.00011940309304440433, + "loss": 0.673, + "step": 853 + }, + { + "epoch": 0.4554666666666667, + "grad_norm": 0.45681190956328344, + "learning_rate": 0.00011923354336755835, + "loss": 0.6976, + "step": 854 + }, + { + "epoch": 0.456, + "grad_norm": 0.4595733453115094, + "learning_rate": 0.00011906393625643244, + "loss": 0.6722, + "step": 855 + }, + { + "epoch": 0.45653333333333335, + "grad_norm": 0.4473268550932148, + "learning_rate": 0.00011889427221749916, + "loss": 0.7311, + "step": 856 + }, + { + "epoch": 0.4570666666666667, + "grad_norm": 0.4142221153102251, + "learning_rate": 0.00011872455175740112, + "loss": 0.7096, + "step": 857 + }, + { + "epoch": 0.4576, + "grad_norm": 0.36759043619548204, + "learning_rate": 0.00011855477538294935, + "loss": 0.7007, + "step": 858 + }, + { + "epoch": 0.45813333333333334, + "grad_norm": 0.4520956488770485, + "learning_rate": 0.00011838494360112185, + "loss": 0.7029, + "step": 859 + }, + { + "epoch": 0.45866666666666667, + "grad_norm": 0.46071778297359106, + "learning_rate": 0.00011821505691906216, + "loss": 0.7405, + "step": 860 + }, + { + "epoch": 0.4592, + "grad_norm": 0.42962898937413113, + "learning_rate": 0.00011804511584407763, + "loss": 0.7513, + "step": 861 + }, + { + "epoch": 0.4597333333333333, + "grad_norm": 0.4562043519029368, + "learning_rate": 0.00011787512088363817, + "loss": 0.7418, + "step": 862 + }, + { + "epoch": 0.46026666666666666, + "grad_norm": 0.43526330928681223, + "learning_rate": 0.00011770507254537453, + "loss": 0.7218, + "step": 863 + }, + { + "epoch": 0.4608, + "grad_norm": 0.41079236934528096, + "learning_rate": 0.00011753497133707679, + "loss": 0.6617, + "step": 864 + }, + { + "epoch": 0.4613333333333333, + "grad_norm": 0.4603211357292951, + "learning_rate": 0.00011736481776669306, + "loss": 0.7396, + "step": 865 + }, + { + "epoch": 0.46186666666666665, + "grad_norm": 0.4160488570454149, + "learning_rate": 0.00011719461234232764, + "loss": 0.6784, + "step": 866 + }, + { + "epoch": 0.4624, + "grad_norm": 0.49332789032248425, + "learning_rate": 0.00011702435557223987, + "loss": 0.6971, + "step": 867 + }, + { + "epoch": 0.4629333333333333, + "grad_norm": 0.3959679849224477, + "learning_rate": 0.00011685404796484225, + "loss": 0.6869, + "step": 868 + }, + { + "epoch": 0.4634666666666667, + "grad_norm": 0.39101799382598734, + "learning_rate": 0.00011668369002869912, + "loss": 0.6586, + "step": 869 + }, + { + "epoch": 0.464, + "grad_norm": 0.3843654891183163, + "learning_rate": 0.00011651328227252517, + "loss": 0.716, + "step": 870 + }, + { + "epoch": 0.46453333333333335, + "grad_norm": 0.6167233516284031, + "learning_rate": 0.00011634282520518383, + "loss": 0.744, + "step": 871 + }, + { + "epoch": 0.4650666666666667, + "grad_norm": 0.4667836434695035, + "learning_rate": 0.00011617231933568578, + "loss": 0.7053, + "step": 872 + }, + { + "epoch": 0.4656, + "grad_norm": 0.4307218486114171, + "learning_rate": 0.00011600176517318741, + "loss": 0.7375, + "step": 873 + }, + { + "epoch": 0.46613333333333334, + "grad_norm": 0.41892455337672124, + "learning_rate": 0.00011583116322698935, + "loss": 0.7204, + "step": 874 + }, + { + "epoch": 0.4666666666666667, + "grad_norm": 0.4105597984237018, + "learning_rate": 0.00011566051400653486, + "loss": 0.6541, + "step": 875 + }, + { + "epoch": 0.4672, + "grad_norm": 0.4218351688180284, + "learning_rate": 0.00011548981802140848, + "loss": 0.7335, + "step": 876 + }, + { + "epoch": 0.46773333333333333, + "grad_norm": 0.45366576883717097, + "learning_rate": 0.00011531907578133429, + "loss": 0.7092, + "step": 877 + }, + { + "epoch": 0.46826666666666666, + "grad_norm": 0.4667696614038426, + "learning_rate": 0.00011514828779617459, + "loss": 0.7406, + "step": 878 + }, + { + "epoch": 0.4688, + "grad_norm": 0.5425681188399039, + "learning_rate": 0.00011497745457592816, + "loss": 0.7437, + "step": 879 + }, + { + "epoch": 0.4693333333333333, + "grad_norm": 0.38535317471221614, + "learning_rate": 0.00011480657663072896, + "loss": 0.6716, + "step": 880 + }, + { + "epoch": 0.46986666666666665, + "grad_norm": 0.4192535492707317, + "learning_rate": 0.00011463565447084445, + "loss": 0.7233, + "step": 881 + }, + { + "epoch": 0.4704, + "grad_norm": 0.40820513367792977, + "learning_rate": 0.00011446468860667421, + "loss": 0.6468, + "step": 882 + }, + { + "epoch": 0.4709333333333333, + "grad_norm": 0.47887645513809207, + "learning_rate": 0.00011429367954874819, + "loss": 0.7356, + "step": 883 + }, + { + "epoch": 0.47146666666666665, + "grad_norm": 0.48708392333779144, + "learning_rate": 0.0001141226278077254, + "loss": 0.7668, + "step": 884 + }, + { + "epoch": 0.472, + "grad_norm": 0.44890150234734566, + "learning_rate": 0.00011395153389439233, + "loss": 0.714, + "step": 885 + }, + { + "epoch": 0.47253333333333336, + "grad_norm": 0.4033643738540016, + "learning_rate": 0.00011378039831966134, + "loss": 0.7026, + "step": 886 + }, + { + "epoch": 0.4730666666666667, + "grad_norm": 0.5460244851697473, + "learning_rate": 0.00011360922159456928, + "loss": 0.6596, + "step": 887 + }, + { + "epoch": 0.4736, + "grad_norm": 0.4317809805820681, + "learning_rate": 0.00011343800423027582, + "loss": 0.6896, + "step": 888 + }, + { + "epoch": 0.47413333333333335, + "grad_norm": 0.47426607526559306, + "learning_rate": 0.00011326674673806195, + "loss": 0.7394, + "step": 889 + }, + { + "epoch": 0.4746666666666667, + "grad_norm": 0.4985689491133637, + "learning_rate": 0.00011309544962932862, + "loss": 0.8086, + "step": 890 + }, + { + "epoch": 0.4752, + "grad_norm": 0.4549172063415435, + "learning_rate": 0.0001129241134155949, + "loss": 0.7761, + "step": 891 + }, + { + "epoch": 0.47573333333333334, + "grad_norm": 0.4266860875624114, + "learning_rate": 0.00011275273860849684, + "loss": 0.734, + "step": 892 + }, + { + "epoch": 0.47626666666666667, + "grad_norm": 0.507563664980997, + "learning_rate": 0.00011258132571978555, + "loss": 0.6982, + "step": 893 + }, + { + "epoch": 0.4768, + "grad_norm": 0.5282820119941769, + "learning_rate": 0.00011240987526132594, + "loss": 0.7491, + "step": 894 + }, + { + "epoch": 0.47733333333333333, + "grad_norm": 0.40503093114604405, + "learning_rate": 0.00011223838774509514, + "loss": 0.7451, + "step": 895 + }, + { + "epoch": 0.47786666666666666, + "grad_norm": 0.488867826119295, + "learning_rate": 0.00011206686368318086, + "loss": 0.641, + "step": 896 + }, + { + "epoch": 0.4784, + "grad_norm": 0.40976055576829357, + "learning_rate": 0.00011189530358778005, + "loss": 0.634, + "step": 897 + }, + { + "epoch": 0.4789333333333333, + "grad_norm": 0.4552955303343864, + "learning_rate": 0.00011172370797119712, + "loss": 0.7287, + "step": 898 + }, + { + "epoch": 0.47946666666666665, + "grad_norm": 0.4503466266751716, + "learning_rate": 0.00011155207734584263, + "loss": 0.7387, + "step": 899 + }, + { + "epoch": 0.48, + "grad_norm": 0.3981204039595053, + "learning_rate": 0.00011138041222423177, + "loss": 0.7428, + "step": 900 + }, + { + "epoch": 0.4805333333333333, + "grad_norm": 0.48300892687341584, + "learning_rate": 0.00011120871311898254, + "loss": 0.7706, + "step": 901 + }, + { + "epoch": 0.48106666666666664, + "grad_norm": 0.6114796805270047, + "learning_rate": 0.0001110369805428146, + "loss": 0.7725, + "step": 902 + }, + { + "epoch": 0.4816, + "grad_norm": 0.47829190947630557, + "learning_rate": 0.00011086521500854745, + "loss": 0.7122, + "step": 903 + }, + { + "epoch": 0.48213333333333336, + "grad_norm": 0.47509235009328055, + "learning_rate": 0.0001106934170290991, + "loss": 0.7474, + "step": 904 + }, + { + "epoch": 0.4826666666666667, + "grad_norm": 0.4163568295788633, + "learning_rate": 0.00011052158711748434, + "loss": 0.6821, + "step": 905 + }, + { + "epoch": 0.4832, + "grad_norm": 0.47479401809599403, + "learning_rate": 0.00011034972578681338, + "loss": 0.7569, + "step": 906 + }, + { + "epoch": 0.48373333333333335, + "grad_norm": 0.5258693091671406, + "learning_rate": 0.00011017783355029026, + "loss": 0.7352, + "step": 907 + }, + { + "epoch": 0.4842666666666667, + "grad_norm": 0.3963908329305212, + "learning_rate": 0.00011000591092121127, + "loss": 0.7156, + "step": 908 + }, + { + "epoch": 0.4848, + "grad_norm": 0.3992466986292596, + "learning_rate": 0.00010983395841296348, + "loss": 0.7247, + "step": 909 + }, + { + "epoch": 0.48533333333333334, + "grad_norm": 0.5036668467046255, + "learning_rate": 0.0001096619765390232, + "loss": 0.8379, + "step": 910 + }, + { + "epoch": 0.48586666666666667, + "grad_norm": 0.5227935206101292, + "learning_rate": 0.00010948996581295436, + "loss": 0.7455, + "step": 911 + }, + { + "epoch": 0.4864, + "grad_norm": 0.3897132094791309, + "learning_rate": 0.00010931792674840718, + "loss": 0.7122, + "step": 912 + }, + { + "epoch": 0.48693333333333333, + "grad_norm": 0.4437971490014331, + "learning_rate": 0.00010914585985911632, + "loss": 0.6656, + "step": 913 + }, + { + "epoch": 0.48746666666666666, + "grad_norm": 0.4402770228498105, + "learning_rate": 0.00010897376565889971, + "loss": 0.7002, + "step": 914 + }, + { + "epoch": 0.488, + "grad_norm": 0.3981184649726954, + "learning_rate": 0.00010880164466165674, + "loss": 0.7065, + "step": 915 + }, + { + "epoch": 0.4885333333333333, + "grad_norm": 0.38734965456860326, + "learning_rate": 0.00010862949738136681, + "loss": 0.671, + "step": 916 + }, + { + "epoch": 0.48906666666666665, + "grad_norm": 0.45967307493813275, + "learning_rate": 0.00010845732433208779, + "loss": 0.7007, + "step": 917 + }, + { + "epoch": 0.4896, + "grad_norm": 0.4415502638993889, + "learning_rate": 0.00010828512602795462, + "loss": 0.8262, + "step": 918 + }, + { + "epoch": 0.4901333333333333, + "grad_norm": 0.3852510268950323, + "learning_rate": 0.00010811290298317755, + "loss": 0.6025, + "step": 919 + }, + { + "epoch": 0.49066666666666664, + "grad_norm": 0.43729735653926355, + "learning_rate": 0.00010794065571204072, + "loss": 0.6736, + "step": 920 + }, + { + "epoch": 0.4912, + "grad_norm": 0.483931726724541, + "learning_rate": 0.00010776838472890065, + "loss": 0.7594, + "step": 921 + }, + { + "epoch": 0.49173333333333336, + "grad_norm": 0.4711825376788208, + "learning_rate": 0.00010759609054818458, + "loss": 0.7427, + "step": 922 + }, + { + "epoch": 0.4922666666666667, + "grad_norm": 0.4158881135613916, + "learning_rate": 0.00010742377368438914, + "loss": 0.6529, + "step": 923 + }, + { + "epoch": 0.4928, + "grad_norm": 0.4229997503304087, + "learning_rate": 0.00010725143465207867, + "loss": 0.6851, + "step": 924 + }, + { + "epoch": 0.49333333333333335, + "grad_norm": 0.41529759982962977, + "learning_rate": 0.00010707907396588361, + "loss": 0.6806, + "step": 925 + }, + { + "epoch": 0.4938666666666667, + "grad_norm": 0.3823254705941637, + "learning_rate": 0.0001069066921404992, + "loss": 0.6424, + "step": 926 + }, + { + "epoch": 0.4944, + "grad_norm": 0.5036785948104029, + "learning_rate": 0.00010673428969068364, + "loss": 0.7438, + "step": 927 + }, + { + "epoch": 0.49493333333333334, + "grad_norm": 0.4657051853486107, + "learning_rate": 0.00010656186713125689, + "loss": 0.7515, + "step": 928 + }, + { + "epoch": 0.49546666666666667, + "grad_norm": 0.3659442262962532, + "learning_rate": 0.0001063894249770989, + "loss": 0.6756, + "step": 929 + }, + { + "epoch": 0.496, + "grad_norm": 0.4763932272432071, + "learning_rate": 0.00010621696374314807, + "loss": 0.6691, + "step": 930 + }, + { + "epoch": 0.4965333333333333, + "grad_norm": 0.422659681385368, + "learning_rate": 0.00010604448394439983, + "loss": 0.7436, + "step": 931 + }, + { + "epoch": 0.49706666666666666, + "grad_norm": 0.45119229899083296, + "learning_rate": 0.00010587198609590505, + "loss": 0.7411, + "step": 932 + }, + { + "epoch": 0.4976, + "grad_norm": 0.3813081513393947, + "learning_rate": 0.00010569947071276847, + "loss": 0.6633, + "step": 933 + }, + { + "epoch": 0.4981333333333333, + "grad_norm": 0.4049726948935483, + "learning_rate": 0.00010552693831014726, + "loss": 0.6458, + "step": 934 + }, + { + "epoch": 0.49866666666666665, + "grad_norm": 0.47243820951738996, + "learning_rate": 0.0001053543894032493, + "loss": 0.7511, + "step": 935 + }, + { + "epoch": 0.4992, + "grad_norm": 0.4278506901664067, + "learning_rate": 0.00010518182450733186, + "loss": 0.6735, + "step": 936 + }, + { + "epoch": 0.4997333333333333, + "grad_norm": 0.40469358253849136, + "learning_rate": 0.00010500924413769988, + "loss": 0.6765, + "step": 937 + }, + { + "epoch": 0.5002666666666666, + "grad_norm": 0.5868985156349849, + "learning_rate": 0.00010483664880970457, + "loss": 0.763, + "step": 938 + }, + { + "epoch": 0.5008, + "grad_norm": 0.48585845000143807, + "learning_rate": 0.00010466403903874176, + "loss": 0.7767, + "step": 939 + }, + { + "epoch": 0.5013333333333333, + "grad_norm": 0.41423105216125516, + "learning_rate": 0.00010449141534025045, + "loss": 0.7211, + "step": 940 + }, + { + "epoch": 0.5018666666666667, + "grad_norm": 0.46837065093491187, + "learning_rate": 0.00010431877822971117, + "loss": 0.7128, + "step": 941 + }, + { + "epoch": 0.5024, + "grad_norm": 0.549481629853275, + "learning_rate": 0.00010414612822264455, + "loss": 0.8253, + "step": 942 + }, + { + "epoch": 0.5029333333333333, + "grad_norm": 0.4633116098923696, + "learning_rate": 0.00010397346583460971, + "loss": 0.785, + "step": 943 + }, + { + "epoch": 0.5034666666666666, + "grad_norm": 0.3943224231160869, + "learning_rate": 0.0001038007915812028, + "loss": 0.7156, + "step": 944 + }, + { + "epoch": 0.504, + "grad_norm": 0.44379164362150214, + "learning_rate": 0.00010362810597805526, + "loss": 0.6587, + "step": 945 + }, + { + "epoch": 0.5045333333333333, + "grad_norm": 0.3626841490388854, + "learning_rate": 0.0001034554095408326, + "loss": 0.6571, + "step": 946 + }, + { + "epoch": 0.5050666666666667, + "grad_norm": 0.4436091443937909, + "learning_rate": 0.00010328270278523256, + "loss": 0.7152, + "step": 947 + }, + { + "epoch": 0.5056, + "grad_norm": 0.43650784153712674, + "learning_rate": 0.0001031099862269837, + "loss": 0.7308, + "step": 948 + }, + { + "epoch": 0.5061333333333333, + "grad_norm": 0.4203241445174268, + "learning_rate": 0.00010293726038184393, + "loss": 0.7165, + "step": 949 + }, + { + "epoch": 0.5066666666666667, + "grad_norm": 0.4930095807984508, + "learning_rate": 0.00010276452576559879, + "loss": 0.7946, + "step": 950 + }, + { + "epoch": 0.5072, + "grad_norm": 0.3928223631421178, + "learning_rate": 0.00010259178289406011, + "loss": 0.614, + "step": 951 + }, + { + "epoch": 0.5077333333333334, + "grad_norm": 0.4388642643374217, + "learning_rate": 0.00010241903228306431, + "loss": 0.7225, + "step": 952 + }, + { + "epoch": 0.5082666666666666, + "grad_norm": 0.44617337236204435, + "learning_rate": 0.0001022462744484709, + "loss": 0.6776, + "step": 953 + }, + { + "epoch": 0.5088, + "grad_norm": 0.4251188578699655, + "learning_rate": 0.00010207350990616107, + "loss": 0.6613, + "step": 954 + }, + { + "epoch": 0.5093333333333333, + "grad_norm": 0.36939164932819946, + "learning_rate": 0.00010190073917203589, + "loss": 0.6789, + "step": 955 + }, + { + "epoch": 0.5098666666666667, + "grad_norm": 0.37730336076580073, + "learning_rate": 0.00010172796276201503, + "loss": 0.6928, + "step": 956 + }, + { + "epoch": 0.5104, + "grad_norm": 0.4311008595792593, + "learning_rate": 0.0001015551811920351, + "loss": 0.7407, + "step": 957 + }, + { + "epoch": 0.5109333333333334, + "grad_norm": 0.40750017747843514, + "learning_rate": 0.00010138239497804804, + "loss": 0.687, + "step": 958 + }, + { + "epoch": 0.5114666666666666, + "grad_norm": 0.5241761573136788, + "learning_rate": 0.00010120960463601976, + "loss": 0.756, + "step": 959 + }, + { + "epoch": 0.512, + "grad_norm": 0.5111238515251669, + "learning_rate": 0.00010103681068192845, + "loss": 0.7308, + "step": 960 + }, + { + "epoch": 0.5125333333333333, + "grad_norm": 0.39767583558950237, + "learning_rate": 0.00010086401363176305, + "loss": 0.6527, + "step": 961 + }, + { + "epoch": 0.5130666666666667, + "grad_norm": 0.4565231247409654, + "learning_rate": 0.00010069121400152181, + "loss": 0.698, + "step": 962 + }, + { + "epoch": 0.5136, + "grad_norm": 0.41515013887448426, + "learning_rate": 0.00010051841230721065, + "loss": 0.6635, + "step": 963 + }, + { + "epoch": 0.5141333333333333, + "grad_norm": 0.4190816416159948, + "learning_rate": 0.0001003456090648416, + "loss": 0.6625, + "step": 964 + }, + { + "epoch": 0.5146666666666667, + "grad_norm": 0.4079093991482028, + "learning_rate": 0.00010017280479043147, + "loss": 0.6897, + "step": 965 + }, + { + "epoch": 0.5152, + "grad_norm": 0.431786861547457, + "learning_rate": 0.0001, + "loss": 0.727, + "step": 966 + }, + { + "epoch": 0.5157333333333334, + "grad_norm": 0.4289266535551944, + "learning_rate": 9.982719520956855e-05, + "loss": 0.6796, + "step": 967 + }, + { + "epoch": 0.5162666666666667, + "grad_norm": 0.37447856671063556, + "learning_rate": 9.965439093515841e-05, + "loss": 0.675, + "step": 968 + }, + { + "epoch": 0.5168, + "grad_norm": 0.5552197028303565, + "learning_rate": 9.948158769278939e-05, + "loss": 0.7469, + "step": 969 + }, + { + "epoch": 0.5173333333333333, + "grad_norm": 0.4798313893553535, + "learning_rate": 9.930878599847821e-05, + "loss": 0.7336, + "step": 970 + }, + { + "epoch": 0.5178666666666667, + "grad_norm": 0.43961855872777655, + "learning_rate": 9.913598636823693e-05, + "loss": 0.7894, + "step": 971 + }, + { + "epoch": 0.5184, + "grad_norm": 0.43490323371471584, + "learning_rate": 9.896318931807155e-05, + "loss": 0.6511, + "step": 972 + }, + { + "epoch": 0.5189333333333334, + "grad_norm": 0.3619713521872154, + "learning_rate": 9.879039536398024e-05, + "loss": 0.6846, + "step": 973 + }, + { + "epoch": 0.5194666666666666, + "grad_norm": 0.3763967755843891, + "learning_rate": 9.861760502195197e-05, + "loss": 0.6752, + "step": 974 + }, + { + "epoch": 0.52, + "grad_norm": 0.6151090094687006, + "learning_rate": 9.844481880796491e-05, + "loss": 0.79, + "step": 975 + }, + { + "epoch": 0.5205333333333333, + "grad_norm": 0.47605106200301756, + "learning_rate": 9.827203723798498e-05, + "loss": 0.7487, + "step": 976 + }, + { + "epoch": 0.5210666666666667, + "grad_norm": 0.43782978426625524, + "learning_rate": 9.809926082796415e-05, + "loss": 0.7237, + "step": 977 + }, + { + "epoch": 0.5216, + "grad_norm": 0.6640548217093781, + "learning_rate": 9.792649009383899e-05, + "loss": 0.7742, + "step": 978 + }, + { + "epoch": 0.5221333333333333, + "grad_norm": 0.43626631149333445, + "learning_rate": 9.775372555152912e-05, + "loss": 0.7686, + "step": 979 + }, + { + "epoch": 0.5226666666666666, + "grad_norm": 0.4356100076927952, + "learning_rate": 9.758096771693573e-05, + "loss": 0.6895, + "step": 980 + }, + { + "epoch": 0.5232, + "grad_norm": 0.49130526928805984, + "learning_rate": 9.740821710593989e-05, + "loss": 0.6991, + "step": 981 + }, + { + "epoch": 0.5237333333333334, + "grad_norm": 0.41749768320352587, + "learning_rate": 9.723547423440122e-05, + "loss": 0.6479, + "step": 982 + }, + { + "epoch": 0.5242666666666667, + "grad_norm": 0.45570962951682303, + "learning_rate": 9.70627396181561e-05, + "loss": 0.7182, + "step": 983 + }, + { + "epoch": 0.5248, + "grad_norm": 0.4760911403014369, + "learning_rate": 9.689001377301633e-05, + "loss": 0.7697, + "step": 984 + }, + { + "epoch": 0.5253333333333333, + "grad_norm": 0.41078163296557063, + "learning_rate": 9.671729721476746e-05, + "loss": 0.6836, + "step": 985 + }, + { + "epoch": 0.5258666666666667, + "grad_norm": 0.4381182811100299, + "learning_rate": 9.654459045916743e-05, + "loss": 0.6954, + "step": 986 + }, + { + "epoch": 0.5264, + "grad_norm": 0.4264639388270093, + "learning_rate": 9.637189402194476e-05, + "loss": 0.7135, + "step": 987 + }, + { + "epoch": 0.5269333333333334, + "grad_norm": 0.46443605284534845, + "learning_rate": 9.619920841879725e-05, + "loss": 0.7778, + "step": 988 + }, + { + "epoch": 0.5274666666666666, + "grad_norm": 0.42285933591883057, + "learning_rate": 9.602653416539031e-05, + "loss": 0.7208, + "step": 989 + }, + { + "epoch": 0.528, + "grad_norm": 0.44579506138805214, + "learning_rate": 9.585387177735547e-05, + "loss": 0.7628, + "step": 990 + }, + { + "epoch": 0.5285333333333333, + "grad_norm": 0.39990901331092055, + "learning_rate": 9.568122177028884e-05, + "loss": 0.6565, + "step": 991 + }, + { + "epoch": 0.5290666666666667, + "grad_norm": 0.4368271543482817, + "learning_rate": 9.550858465974958e-05, + "loss": 0.7688, + "step": 992 + }, + { + "epoch": 0.5296, + "grad_norm": 0.37718811129992086, + "learning_rate": 9.533596096125825e-05, + "loss": 0.6496, + "step": 993 + }, + { + "epoch": 0.5301333333333333, + "grad_norm": 0.47708446838070123, + "learning_rate": 9.516335119029546e-05, + "loss": 0.7916, + "step": 994 + }, + { + "epoch": 0.5306666666666666, + "grad_norm": 0.45465817813415427, + "learning_rate": 9.499075586230013e-05, + "loss": 0.7285, + "step": 995 + }, + { + "epoch": 0.5312, + "grad_norm": 0.46190590584113933, + "learning_rate": 9.481817549266817e-05, + "loss": 0.7116, + "step": 996 + }, + { + "epoch": 0.5317333333333333, + "grad_norm": 0.44740890183548143, + "learning_rate": 9.464561059675073e-05, + "loss": 0.68, + "step": 997 + }, + { + "epoch": 0.5322666666666667, + "grad_norm": 0.40837738818897246, + "learning_rate": 9.44730616898528e-05, + "loss": 0.66, + "step": 998 + }, + { + "epoch": 0.5328, + "grad_norm": 0.4393471858313049, + "learning_rate": 9.430052928723153e-05, + "loss": 0.6862, + "step": 999 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 0.4110038871541141, + "learning_rate": 9.412801390409497e-05, + "loss": 0.7181, + "step": 1000 + }, + { + "epoch": 0.5338666666666667, + "grad_norm": 0.3920397012663661, + "learning_rate": 9.395551605560018e-05, + "loss": 0.6742, + "step": 1001 + }, + { + "epoch": 0.5344, + "grad_norm": 0.4658582980896299, + "learning_rate": 9.378303625685195e-05, + "loss": 0.6823, + "step": 1002 + }, + { + "epoch": 0.5349333333333334, + "grad_norm": 0.5644670189980434, + "learning_rate": 9.361057502290113e-05, + "loss": 0.7212, + "step": 1003 + }, + { + "epoch": 0.5354666666666666, + "grad_norm": 0.5101367429450542, + "learning_rate": 9.343813286874312e-05, + "loss": 0.7766, + "step": 1004 + }, + { + "epoch": 0.536, + "grad_norm": 0.5466866564805702, + "learning_rate": 9.326571030931637e-05, + "loss": 0.7976, + "step": 1005 + }, + { + "epoch": 0.5365333333333333, + "grad_norm": 0.4940172741215113, + "learning_rate": 9.309330785950086e-05, + "loss": 0.6682, + "step": 1006 + }, + { + "epoch": 0.5370666666666667, + "grad_norm": 0.4345453586038417, + "learning_rate": 9.292092603411641e-05, + "loss": 0.7241, + "step": 1007 + }, + { + "epoch": 0.5376, + "grad_norm": 0.478324668709217, + "learning_rate": 9.274856534792138e-05, + "loss": 0.7239, + "step": 1008 + }, + { + "epoch": 0.5381333333333334, + "grad_norm": 0.45033807864970016, + "learning_rate": 9.257622631561085e-05, + "loss": 0.6897, + "step": 1009 + }, + { + "epoch": 0.5386666666666666, + "grad_norm": 0.5348174689129647, + "learning_rate": 9.240390945181543e-05, + "loss": 0.6957, + "step": 1010 + }, + { + "epoch": 0.5392, + "grad_norm": 0.3823896966212316, + "learning_rate": 9.223161527109937e-05, + "loss": 0.715, + "step": 1011 + }, + { + "epoch": 0.5397333333333333, + "grad_norm": 0.46073164939223454, + "learning_rate": 9.205934428795929e-05, + "loss": 0.6939, + "step": 1012 + }, + { + "epoch": 0.5402666666666667, + "grad_norm": 0.4801510110366479, + "learning_rate": 9.188709701682247e-05, + "loss": 0.7548, + "step": 1013 + }, + { + "epoch": 0.5408, + "grad_norm": 0.4728619897015319, + "learning_rate": 9.171487397204539e-05, + "loss": 0.7014, + "step": 1014 + }, + { + "epoch": 0.5413333333333333, + "grad_norm": 0.4218190375002509, + "learning_rate": 9.154267566791223e-05, + "loss": 0.6732, + "step": 1015 + }, + { + "epoch": 0.5418666666666667, + "grad_norm": 0.5422609846004213, + "learning_rate": 9.137050261863324e-05, + "loss": 0.7532, + "step": 1016 + }, + { + "epoch": 0.5424, + "grad_norm": 0.39433382681198076, + "learning_rate": 9.119835533834331e-05, + "loss": 0.6575, + "step": 1017 + }, + { + "epoch": 0.5429333333333334, + "grad_norm": 0.42359330314860427, + "learning_rate": 9.102623434110028e-05, + "loss": 0.6779, + "step": 1018 + }, + { + "epoch": 0.5434666666666667, + "grad_norm": 0.44146456227553116, + "learning_rate": 9.085414014088369e-05, + "loss": 0.6854, + "step": 1019 + }, + { + "epoch": 0.544, + "grad_norm": 0.47167580768786777, + "learning_rate": 9.068207325159284e-05, + "loss": 0.653, + "step": 1020 + }, + { + "epoch": 0.5445333333333333, + "grad_norm": 0.45133233237427317, + "learning_rate": 9.051003418704565e-05, + "loss": 0.7409, + "step": 1021 + }, + { + "epoch": 0.5450666666666667, + "grad_norm": 0.4201773986645445, + "learning_rate": 9.033802346097682e-05, + "loss": 0.7056, + "step": 1022 + }, + { + "epoch": 0.5456, + "grad_norm": 0.4023223296247732, + "learning_rate": 9.016604158703654e-05, + "loss": 0.7154, + "step": 1023 + }, + { + "epoch": 0.5461333333333334, + "grad_norm": 0.4692166344755956, + "learning_rate": 8.999408907878877e-05, + "loss": 0.6946, + "step": 1024 + }, + { + "epoch": 0.5466666666666666, + "grad_norm": 0.44277364227217714, + "learning_rate": 8.982216644970979e-05, + "loss": 0.7222, + "step": 1025 + }, + { + "epoch": 0.5472, + "grad_norm": 0.480199163868813, + "learning_rate": 8.965027421318665e-05, + "loss": 0.7443, + "step": 1026 + }, + { + "epoch": 0.5477333333333333, + "grad_norm": 0.5001288292413456, + "learning_rate": 8.947841288251568e-05, + "loss": 0.7551, + "step": 1027 + }, + { + "epoch": 0.5482666666666667, + "grad_norm": 0.5044486579782337, + "learning_rate": 8.930658297090091e-05, + "loss": 0.8154, + "step": 1028 + }, + { + "epoch": 0.5488, + "grad_norm": 0.5511924644753977, + "learning_rate": 8.913478499145254e-05, + "loss": 0.7777, + "step": 1029 + }, + { + "epoch": 0.5493333333333333, + "grad_norm": 0.406075888614496, + "learning_rate": 8.896301945718541e-05, + "loss": 0.691, + "step": 1030 + }, + { + "epoch": 0.5498666666666666, + "grad_norm": 0.4101635069428767, + "learning_rate": 8.879128688101749e-05, + "loss": 0.6752, + "step": 1031 + }, + { + "epoch": 0.5504, + "grad_norm": 0.4353275594534333, + "learning_rate": 8.861958777576827e-05, + "loss": 0.705, + "step": 1032 + }, + { + "epoch": 0.5509333333333334, + "grad_norm": 0.5381104154274325, + "learning_rate": 8.844792265415738e-05, + "loss": 0.7253, + "step": 1033 + }, + { + "epoch": 0.5514666666666667, + "grad_norm": 0.3675840727089868, + "learning_rate": 8.827629202880293e-05, + "loss": 0.7126, + "step": 1034 + }, + { + "epoch": 0.552, + "grad_norm": 0.6471266189518096, + "learning_rate": 8.810469641222001e-05, + "loss": 0.7445, + "step": 1035 + }, + { + "epoch": 0.5525333333333333, + "grad_norm": 0.49705826944673037, + "learning_rate": 8.793313631681915e-05, + "loss": 0.7579, + "step": 1036 + }, + { + "epoch": 0.5530666666666667, + "grad_norm": 0.3810241330113108, + "learning_rate": 8.776161225490489e-05, + "loss": 0.6612, + "step": 1037 + }, + { + "epoch": 0.5536, + "grad_norm": 0.41067557306498564, + "learning_rate": 8.759012473867407e-05, + "loss": 0.7437, + "step": 1038 + }, + { + "epoch": 0.5541333333333334, + "grad_norm": 0.3699384289898883, + "learning_rate": 8.741867428021446e-05, + "loss": 0.6543, + "step": 1039 + }, + { + "epoch": 0.5546666666666666, + "grad_norm": 0.43438727622729145, + "learning_rate": 8.724726139150318e-05, + "loss": 0.6551, + "step": 1040 + }, + { + "epoch": 0.5552, + "grad_norm": 0.41481861561299377, + "learning_rate": 8.707588658440511e-05, + "loss": 0.7064, + "step": 1041 + }, + { + "epoch": 0.5557333333333333, + "grad_norm": 0.415666985443102, + "learning_rate": 8.690455037067141e-05, + "loss": 0.6883, + "step": 1042 + }, + { + "epoch": 0.5562666666666667, + "grad_norm": 0.6092680663548886, + "learning_rate": 8.673325326193806e-05, + "loss": 0.7205, + "step": 1043 + }, + { + "epoch": 0.5568, + "grad_norm": 0.5012608718252068, + "learning_rate": 8.656199576972423e-05, + "loss": 0.6799, + "step": 1044 + }, + { + "epoch": 0.5573333333333333, + "grad_norm": 0.5032541903633672, + "learning_rate": 8.639077840543077e-05, + "loss": 0.7814, + "step": 1045 + }, + { + "epoch": 0.5578666666666666, + "grad_norm": 0.41741920545163397, + "learning_rate": 8.621960168033867e-05, + "loss": 0.7096, + "step": 1046 + }, + { + "epoch": 0.5584, + "grad_norm": 0.46294385164549157, + "learning_rate": 8.604846610560771e-05, + "loss": 0.7573, + "step": 1047 + }, + { + "epoch": 0.5589333333333333, + "grad_norm": 0.4904451208041044, + "learning_rate": 8.587737219227462e-05, + "loss": 0.7633, + "step": 1048 + }, + { + "epoch": 0.5594666666666667, + "grad_norm": 0.5795592645359291, + "learning_rate": 8.570632045125185e-05, + "loss": 0.7309, + "step": 1049 + }, + { + "epoch": 0.56, + "grad_norm": 0.3815941219656394, + "learning_rate": 8.553531139332582e-05, + "loss": 0.6547, + "step": 1050 + }, + { + "epoch": 0.5605333333333333, + "grad_norm": 0.41043655870937906, + "learning_rate": 8.536434552915556e-05, + "loss": 0.7217, + "step": 1051 + }, + { + "epoch": 0.5610666666666667, + "grad_norm": 0.48331976809373395, + "learning_rate": 8.519342336927105e-05, + "loss": 0.7879, + "step": 1052 + }, + { + "epoch": 0.5616, + "grad_norm": 0.40530779548903756, + "learning_rate": 8.502254542407186e-05, + "loss": 0.6537, + "step": 1053 + }, + { + "epoch": 0.5621333333333334, + "grad_norm": 0.4275792943736877, + "learning_rate": 8.485171220382545e-05, + "loss": 0.698, + "step": 1054 + }, + { + "epoch": 0.5626666666666666, + "grad_norm": 0.5413070152254348, + "learning_rate": 8.468092421866573e-05, + "loss": 0.7096, + "step": 1055 + }, + { + "epoch": 0.5632, + "grad_norm": 0.41309864800045915, + "learning_rate": 8.451018197859153e-05, + "loss": 0.7467, + "step": 1056 + }, + { + "epoch": 0.5637333333333333, + "grad_norm": 0.6646512903066634, + "learning_rate": 8.433948599346516e-05, + "loss": 0.6962, + "step": 1057 + }, + { + "epoch": 0.5642666666666667, + "grad_norm": 0.3871055361932072, + "learning_rate": 8.416883677301069e-05, + "loss": 0.6916, + "step": 1058 + }, + { + "epoch": 0.5648, + "grad_norm": 0.4483959435891016, + "learning_rate": 8.399823482681262e-05, + "loss": 0.6676, + "step": 1059 + }, + { + "epoch": 0.5653333333333334, + "grad_norm": 0.453937128181304, + "learning_rate": 8.382768066431425e-05, + "loss": 0.7195, + "step": 1060 + }, + { + "epoch": 0.5658666666666666, + "grad_norm": 0.38165176510745086, + "learning_rate": 8.36571747948162e-05, + "loss": 0.6322, + "step": 1061 + }, + { + "epoch": 0.5664, + "grad_norm": 0.4581503378797954, + "learning_rate": 8.348671772747487e-05, + "loss": 0.6548, + "step": 1062 + }, + { + "epoch": 0.5669333333333333, + "grad_norm": 0.5391863734865159, + "learning_rate": 8.33163099713009e-05, + "loss": 0.7539, + "step": 1063 + }, + { + "epoch": 0.5674666666666667, + "grad_norm": 0.3754003717719721, + "learning_rate": 8.31459520351578e-05, + "loss": 0.6789, + "step": 1064 + }, + { + "epoch": 0.568, + "grad_norm": 0.4972463056027732, + "learning_rate": 8.297564442776014e-05, + "loss": 0.6535, + "step": 1065 + }, + { + "epoch": 0.5685333333333333, + "grad_norm": 0.42390724311570416, + "learning_rate": 8.280538765767235e-05, + "loss": 0.6328, + "step": 1066 + }, + { + "epoch": 0.5690666666666667, + "grad_norm": 0.675660727677251, + "learning_rate": 8.263518223330697e-05, + "loss": 0.652, + "step": 1067 + }, + { + "epoch": 0.5696, + "grad_norm": 0.6435985943801711, + "learning_rate": 8.246502866292324e-05, + "loss": 0.7919, + "step": 1068 + }, + { + "epoch": 0.5701333333333334, + "grad_norm": 0.5006136042376417, + "learning_rate": 8.22949274546255e-05, + "loss": 0.723, + "step": 1069 + }, + { + "epoch": 0.5706666666666667, + "grad_norm": 0.39625902620982, + "learning_rate": 8.212487911636184e-05, + "loss": 0.6576, + "step": 1070 + }, + { + "epoch": 0.5712, + "grad_norm": 0.4545922958527993, + "learning_rate": 8.195488415592238e-05, + "loss": 0.7205, + "step": 1071 + }, + { + "epoch": 0.5717333333333333, + "grad_norm": 0.4495068479338101, + "learning_rate": 8.178494308093789e-05, + "loss": 0.7018, + "step": 1072 + }, + { + "epoch": 0.5722666666666667, + "grad_norm": 0.4709380463003231, + "learning_rate": 8.161505639887817e-05, + "loss": 0.6701, + "step": 1073 + }, + { + "epoch": 0.5728, + "grad_norm": 0.4628075359403977, + "learning_rate": 8.144522461705067e-05, + "loss": 0.6762, + "step": 1074 + }, + { + "epoch": 0.5733333333333334, + "grad_norm": 0.5041511456595938, + "learning_rate": 8.127544824259889e-05, + "loss": 0.6889, + "step": 1075 + }, + { + "epoch": 0.5738666666666666, + "grad_norm": 0.4840293659189363, + "learning_rate": 8.110572778250085e-05, + "loss": 0.7337, + "step": 1076 + }, + { + "epoch": 0.5744, + "grad_norm": 0.48667160173127905, + "learning_rate": 8.093606374356759e-05, + "loss": 0.6733, + "step": 1077 + }, + { + "epoch": 0.5749333333333333, + "grad_norm": 0.4617452426260254, + "learning_rate": 8.076645663244168e-05, + "loss": 0.6993, + "step": 1078 + }, + { + "epoch": 0.5754666666666667, + "grad_norm": 0.45671699472528243, + "learning_rate": 8.059690695559568e-05, + "loss": 0.7137, + "step": 1079 + }, + { + "epoch": 0.576, + "grad_norm": 0.3916027221909562, + "learning_rate": 8.042741521933071e-05, + "loss": 0.6407, + "step": 1080 + }, + { + "epoch": 0.5765333333333333, + "grad_norm": 0.45910141535244414, + "learning_rate": 8.025798192977481e-05, + "loss": 0.7296, + "step": 1081 + }, + { + "epoch": 0.5770666666666666, + "grad_norm": 0.4819780361829293, + "learning_rate": 8.008860759288147e-05, + "loss": 0.6869, + "step": 1082 + }, + { + "epoch": 0.5776, + "grad_norm": 0.44847446180169626, + "learning_rate": 7.991929271442817e-05, + "loss": 0.647, + "step": 1083 + }, + { + "epoch": 0.5781333333333334, + "grad_norm": 0.43917620108531935, + "learning_rate": 7.975003780001485e-05, + "loss": 0.7588, + "step": 1084 + }, + { + "epoch": 0.5786666666666667, + "grad_norm": 0.5162715358080738, + "learning_rate": 7.958084335506239e-05, + "loss": 0.8084, + "step": 1085 + }, + { + "epoch": 0.5792, + "grad_norm": 0.472443797093647, + "learning_rate": 7.941170988481108e-05, + "loss": 0.6519, + "step": 1086 + }, + { + "epoch": 0.5797333333333333, + "grad_norm": 0.4983222958473055, + "learning_rate": 7.924263789431912e-05, + "loss": 0.7586, + "step": 1087 + }, + { + "epoch": 0.5802666666666667, + "grad_norm": 0.3696287349960062, + "learning_rate": 7.907362788846116e-05, + "loss": 0.7146, + "step": 1088 + }, + { + "epoch": 0.5808, + "grad_norm": 0.38460780298635355, + "learning_rate": 7.89046803719267e-05, + "loss": 0.6523, + "step": 1089 + }, + { + "epoch": 0.5813333333333334, + "grad_norm": 0.40394723209746247, + "learning_rate": 7.873579584921869e-05, + "loss": 0.7032, + "step": 1090 + }, + { + "epoch": 0.5818666666666666, + "grad_norm": 0.40004900093478846, + "learning_rate": 7.856697482465196e-05, + "loss": 0.6215, + "step": 1091 + }, + { + "epoch": 0.5824, + "grad_norm": 0.40870489691697814, + "learning_rate": 7.839821780235168e-05, + "loss": 0.6469, + "step": 1092 + }, + { + "epoch": 0.5829333333333333, + "grad_norm": 0.4377794759908524, + "learning_rate": 7.822952528625191e-05, + "loss": 0.6999, + "step": 1093 + }, + { + "epoch": 0.5834666666666667, + "grad_norm": 0.41096064783451314, + "learning_rate": 7.806089778009421e-05, + "loss": 0.6945, + "step": 1094 + }, + { + "epoch": 0.584, + "grad_norm": 0.5247766871411668, + "learning_rate": 7.789233578742582e-05, + "loss": 0.7336, + "step": 1095 + }, + { + "epoch": 0.5845333333333333, + "grad_norm": 0.3913091904296901, + "learning_rate": 7.772383981159849e-05, + "loss": 0.6639, + "step": 1096 + }, + { + "epoch": 0.5850666666666666, + "grad_norm": 0.4321583957999197, + "learning_rate": 7.755541035576677e-05, + "loss": 0.6763, + "step": 1097 + }, + { + "epoch": 0.5856, + "grad_norm": 0.4265285273317709, + "learning_rate": 7.738704792288655e-05, + "loss": 0.7191, + "step": 1098 + }, + { + "epoch": 0.5861333333333333, + "grad_norm": 0.42939990938106737, + "learning_rate": 7.721875301571359e-05, + "loss": 0.6835, + "step": 1099 + }, + { + "epoch": 0.5866666666666667, + "grad_norm": 0.4380068744737683, + "learning_rate": 7.705052613680211e-05, + "loss": 0.6991, + "step": 1100 + }, + { + "epoch": 0.5872, + "grad_norm": 0.4562897063384828, + "learning_rate": 7.688236778850306e-05, + "loss": 0.6915, + "step": 1101 + }, + { + "epoch": 0.5877333333333333, + "grad_norm": 0.44342254603261516, + "learning_rate": 7.671427847296275e-05, + "loss": 0.6327, + "step": 1102 + }, + { + "epoch": 0.5882666666666667, + "grad_norm": 0.400996985089008, + "learning_rate": 7.654625869212146e-05, + "loss": 0.6373, + "step": 1103 + }, + { + "epoch": 0.5888, + "grad_norm": 0.4241732013273108, + "learning_rate": 7.637830894771175e-05, + "loss": 0.6915, + "step": 1104 + }, + { + "epoch": 0.5893333333333334, + "grad_norm": 0.4533395746973437, + "learning_rate": 7.6210429741257e-05, + "loss": 0.6822, + "step": 1105 + }, + { + "epoch": 0.5898666666666667, + "grad_norm": 0.4123774386387129, + "learning_rate": 7.604262157407007e-05, + "loss": 0.6568, + "step": 1106 + }, + { + "epoch": 0.5904, + "grad_norm": 0.3893319046364079, + "learning_rate": 7.587488494725157e-05, + "loss": 0.7115, + "step": 1107 + }, + { + "epoch": 0.5909333333333333, + "grad_norm": 0.4211570834760039, + "learning_rate": 7.570722036168854e-05, + "loss": 0.6263, + "step": 1108 + }, + { + "epoch": 0.5914666666666667, + "grad_norm": 0.3956464152384994, + "learning_rate": 7.55396283180529e-05, + "loss": 0.6815, + "step": 1109 + }, + { + "epoch": 0.592, + "grad_norm": 0.4130677399041131, + "learning_rate": 7.537210931679987e-05, + "loss": 0.6834, + "step": 1110 + }, + { + "epoch": 0.5925333333333334, + "grad_norm": 0.43691215060528366, + "learning_rate": 7.520466385816671e-05, + "loss": 0.7108, + "step": 1111 + }, + { + "epoch": 0.5930666666666666, + "grad_norm": 0.4604241063294278, + "learning_rate": 7.503729244217086e-05, + "loss": 0.722, + "step": 1112 + }, + { + "epoch": 0.5936, + "grad_norm": 0.3826368858897757, + "learning_rate": 7.48699955686089e-05, + "loss": 0.6454, + "step": 1113 + }, + { + "epoch": 0.5941333333333333, + "grad_norm": 0.4449266192184151, + "learning_rate": 7.470277373705461e-05, + "loss": 0.6153, + "step": 1114 + }, + { + "epoch": 0.5946666666666667, + "grad_norm": 0.4495658094813891, + "learning_rate": 7.453562744685778e-05, + "loss": 0.6583, + "step": 1115 + }, + { + "epoch": 0.5952, + "grad_norm": 0.44963815352240066, + "learning_rate": 7.43685571971426e-05, + "loss": 0.727, + "step": 1116 + }, + { + "epoch": 0.5957333333333333, + "grad_norm": 0.43131840431294677, + "learning_rate": 7.42015634868062e-05, + "loss": 0.688, + "step": 1117 + }, + { + "epoch": 0.5962666666666666, + "grad_norm": 0.3733896381579521, + "learning_rate": 7.403464681451715e-05, + "loss": 0.6977, + "step": 1118 + }, + { + "epoch": 0.5968, + "grad_norm": 0.5175259996500226, + "learning_rate": 7.386780767871397e-05, + "loss": 0.6559, + "step": 1119 + }, + { + "epoch": 0.5973333333333334, + "grad_norm": 0.43135933793781067, + "learning_rate": 7.370104657760361e-05, + "loss": 0.6584, + "step": 1120 + }, + { + "epoch": 0.5978666666666667, + "grad_norm": 0.4409186145412831, + "learning_rate": 7.353436400916004e-05, + "loss": 0.7349, + "step": 1121 + }, + { + "epoch": 0.5984, + "grad_norm": 0.4236489251890136, + "learning_rate": 7.336776047112276e-05, + "loss": 0.6436, + "step": 1122 + }, + { + "epoch": 0.5989333333333333, + "grad_norm": 0.41546574374402656, + "learning_rate": 7.320123646099519e-05, + "loss": 0.6781, + "step": 1123 + }, + { + "epoch": 0.5994666666666667, + "grad_norm": 0.4234569524859422, + "learning_rate": 7.303479247604332e-05, + "loss": 0.71, + "step": 1124 + }, + { + "epoch": 0.6, + "grad_norm": 0.5272396751585382, + "learning_rate": 7.286842901329412e-05, + "loss": 0.7429, + "step": 1125 + }, + { + "epoch": 0.6005333333333334, + "grad_norm": 0.580902171591594, + "learning_rate": 7.270214656953415e-05, + "loss": 0.7402, + "step": 1126 + }, + { + "epoch": 0.6010666666666666, + "grad_norm": 0.4777516255620962, + "learning_rate": 7.253594564130804e-05, + "loss": 0.7329, + "step": 1127 + }, + { + "epoch": 0.6016, + "grad_norm": 0.5089033422117083, + "learning_rate": 7.236982672491698e-05, + "loss": 0.6889, + "step": 1128 + }, + { + "epoch": 0.6021333333333333, + "grad_norm": 0.45600715463903174, + "learning_rate": 7.22037903164173e-05, + "loss": 0.6518, + "step": 1129 + }, + { + "epoch": 0.6026666666666667, + "grad_norm": 0.4294783096789488, + "learning_rate": 7.203783691161883e-05, + "loss": 0.7051, + "step": 1130 + }, + { + "epoch": 0.6032, + "grad_norm": 0.4576375770301386, + "learning_rate": 7.187196700608373e-05, + "loss": 0.693, + "step": 1131 + }, + { + "epoch": 0.6037333333333333, + "grad_norm": 0.5066575878603606, + "learning_rate": 7.170618109512465e-05, + "loss": 0.7335, + "step": 1132 + }, + { + "epoch": 0.6042666666666666, + "grad_norm": 0.43331802255903185, + "learning_rate": 7.154047967380354e-05, + "loss": 0.7554, + "step": 1133 + }, + { + "epoch": 0.6048, + "grad_norm": 0.48686219681220616, + "learning_rate": 7.137486323692995e-05, + "loss": 0.7369, + "step": 1134 + }, + { + "epoch": 0.6053333333333333, + "grad_norm": 0.446045834246583, + "learning_rate": 7.12093322790597e-05, + "loss": 0.6921, + "step": 1135 + }, + { + "epoch": 0.6058666666666667, + "grad_norm": 0.41500722997459405, + "learning_rate": 7.104388729449338e-05, + "loss": 0.6977, + "step": 1136 + }, + { + "epoch": 0.6064, + "grad_norm": 0.5460447655345045, + "learning_rate": 7.087852877727481e-05, + "loss": 0.7253, + "step": 1137 + }, + { + "epoch": 0.6069333333333333, + "grad_norm": 0.4023608330688251, + "learning_rate": 7.071325722118963e-05, + "loss": 0.6058, + "step": 1138 + }, + { + "epoch": 0.6074666666666667, + "grad_norm": 0.41756688712429046, + "learning_rate": 7.054807311976379e-05, + "loss": 0.6962, + "step": 1139 + }, + { + "epoch": 0.608, + "grad_norm": 0.4063462226629591, + "learning_rate": 7.038297696626206e-05, + "loss": 0.6942, + "step": 1140 + }, + { + "epoch": 0.6085333333333334, + "grad_norm": 0.4070642991216119, + "learning_rate": 7.021796925368667e-05, + "loss": 0.6769, + "step": 1141 + }, + { + "epoch": 0.6090666666666666, + "grad_norm": 0.4268586750200847, + "learning_rate": 7.005305047477566e-05, + "loss": 0.7499, + "step": 1142 + }, + { + "epoch": 0.6096, + "grad_norm": 0.387444132229861, + "learning_rate": 6.988822112200156e-05, + "loss": 0.6531, + "step": 1143 + }, + { + "epoch": 0.6101333333333333, + "grad_norm": 0.49725188955591815, + "learning_rate": 6.972348168756983e-05, + "loss": 0.6627, + "step": 1144 + }, + { + "epoch": 0.6106666666666667, + "grad_norm": 0.4436536946783928, + "learning_rate": 6.955883266341741e-05, + "loss": 0.684, + "step": 1145 + }, + { + "epoch": 0.6112, + "grad_norm": 0.4885179213576605, + "learning_rate": 6.939427454121128e-05, + "loss": 0.7008, + "step": 1146 + }, + { + "epoch": 0.6117333333333334, + "grad_norm": 0.4381017756174437, + "learning_rate": 6.922980781234699e-05, + "loss": 0.7327, + "step": 1147 + }, + { + "epoch": 0.6122666666666666, + "grad_norm": 0.44024311320657594, + "learning_rate": 6.906543296794714e-05, + "loss": 0.6876, + "step": 1148 + }, + { + "epoch": 0.6128, + "grad_norm": 0.47779912093946264, + "learning_rate": 6.890115049885994e-05, + "loss": 0.6258, + "step": 1149 + }, + { + "epoch": 0.6133333333333333, + "grad_norm": 0.46478338177148526, + "learning_rate": 6.873696089565786e-05, + "loss": 0.6318, + "step": 1150 + }, + { + "epoch": 0.6138666666666667, + "grad_norm": 0.3871467694683644, + "learning_rate": 6.85728646486359e-05, + "loss": 0.6567, + "step": 1151 + }, + { + "epoch": 0.6144, + "grad_norm": 0.5101724117368083, + "learning_rate": 6.84088622478104e-05, + "loss": 0.7032, + "step": 1152 + }, + { + "epoch": 0.6149333333333333, + "grad_norm": 0.40078174085852053, + "learning_rate": 6.82449541829174e-05, + "loss": 0.6403, + "step": 1153 + }, + { + "epoch": 0.6154666666666667, + "grad_norm": 0.4719771558727841, + "learning_rate": 6.80811409434113e-05, + "loss": 0.6991, + "step": 1154 + }, + { + "epoch": 0.616, + "grad_norm": 0.4660594729728226, + "learning_rate": 6.791742301846326e-05, + "loss": 0.6738, + "step": 1155 + }, + { + "epoch": 0.6165333333333334, + "grad_norm": 0.5176869958498652, + "learning_rate": 6.775380089695986e-05, + "loss": 0.7341, + "step": 1156 + }, + { + "epoch": 0.6170666666666667, + "grad_norm": 0.4136478069853565, + "learning_rate": 6.759027506750158e-05, + "loss": 0.7043, + "step": 1157 + }, + { + "epoch": 0.6176, + "grad_norm": 0.6700799476808592, + "learning_rate": 6.742684601840141e-05, + "loss": 0.7718, + "step": 1158 + }, + { + "epoch": 0.6181333333333333, + "grad_norm": 0.5370758797710055, + "learning_rate": 6.726351423768322e-05, + "loss": 0.7118, + "step": 1159 + }, + { + "epoch": 0.6186666666666667, + "grad_norm": 0.3962294485767609, + "learning_rate": 6.710028021308061e-05, + "loss": 0.6653, + "step": 1160 + }, + { + "epoch": 0.6192, + "grad_norm": 0.4125661428087789, + "learning_rate": 6.693714443203507e-05, + "loss": 0.6779, + "step": 1161 + }, + { + "epoch": 0.6197333333333334, + "grad_norm": 0.47755432152361943, + "learning_rate": 6.677410738169485e-05, + "loss": 0.7293, + "step": 1162 + }, + { + "epoch": 0.6202666666666666, + "grad_norm": 0.4738519129094318, + "learning_rate": 6.661116954891328e-05, + "loss": 0.7, + "step": 1163 + }, + { + "epoch": 0.6208, + "grad_norm": 0.37907866317871847, + "learning_rate": 6.644833142024751e-05, + "loss": 0.6715, + "step": 1164 + }, + { + "epoch": 0.6213333333333333, + "grad_norm": 0.39135369922972446, + "learning_rate": 6.62855934819569e-05, + "loss": 0.6745, + "step": 1165 + }, + { + "epoch": 0.6218666666666667, + "grad_norm": 0.4464862790280168, + "learning_rate": 6.612295622000162e-05, + "loss": 0.6871, + "step": 1166 + }, + { + "epoch": 0.6224, + "grad_norm": 0.350240959730002, + "learning_rate": 6.59604201200412e-05, + "loss": 0.6546, + "step": 1167 + }, + { + "epoch": 0.6229333333333333, + "grad_norm": 0.41028407238467257, + "learning_rate": 6.579798566743314e-05, + "loss": 0.6545, + "step": 1168 + }, + { + "epoch": 0.6234666666666666, + "grad_norm": 0.40466767126694686, + "learning_rate": 6.563565334723134e-05, + "loss": 0.6674, + "step": 1169 + }, + { + "epoch": 0.624, + "grad_norm": 0.447732905913209, + "learning_rate": 6.547342364418481e-05, + "loss": 0.7267, + "step": 1170 + }, + { + "epoch": 0.6245333333333334, + "grad_norm": 0.5026122896326217, + "learning_rate": 6.531129704273604e-05, + "loss": 0.7362, + "step": 1171 + }, + { + "epoch": 0.6250666666666667, + "grad_norm": 0.39518768955958067, + "learning_rate": 6.514927402701964e-05, + "loss": 0.6672, + "step": 1172 + }, + { + "epoch": 0.6256, + "grad_norm": 0.4753065981760208, + "learning_rate": 6.498735508086093e-05, + "loss": 0.7225, + "step": 1173 + }, + { + "epoch": 0.6261333333333333, + "grad_norm": 0.3898190332927234, + "learning_rate": 6.48255406877745e-05, + "loss": 0.6345, + "step": 1174 + }, + { + "epoch": 0.6266666666666667, + "grad_norm": 0.39211498039162146, + "learning_rate": 6.466383133096267e-05, + "loss": 0.6628, + "step": 1175 + }, + { + "epoch": 0.6272, + "grad_norm": 0.3737364169920672, + "learning_rate": 6.450222749331414e-05, + "loss": 0.656, + "step": 1176 + }, + { + "epoch": 0.6277333333333334, + "grad_norm": 0.5663306152299664, + "learning_rate": 6.434072965740242e-05, + "loss": 0.6456, + "step": 1177 + }, + { + "epoch": 0.6282666666666666, + "grad_norm": 0.3959433909405213, + "learning_rate": 6.417933830548467e-05, + "loss": 0.707, + "step": 1178 + }, + { + "epoch": 0.6288, + "grad_norm": 0.33966680414474154, + "learning_rate": 6.40180539194999e-05, + "loss": 0.6116, + "step": 1179 + }, + { + "epoch": 0.6293333333333333, + "grad_norm": 0.5014488892121401, + "learning_rate": 6.385687698106781e-05, + "loss": 0.7801, + "step": 1180 + }, + { + "epoch": 0.6298666666666667, + "grad_norm": 0.5480557671766723, + "learning_rate": 6.369580797148718e-05, + "loss": 0.6983, + "step": 1181 + }, + { + "epoch": 0.6304, + "grad_norm": 0.4305705098423576, + "learning_rate": 6.35348473717345e-05, + "loss": 0.6718, + "step": 1182 + }, + { + "epoch": 0.6309333333333333, + "grad_norm": 0.5638322507392332, + "learning_rate": 6.337399566246257e-05, + "loss": 0.7683, + "step": 1183 + }, + { + "epoch": 0.6314666666666666, + "grad_norm": 0.45442711692530413, + "learning_rate": 6.321325332399903e-05, + "loss": 0.6978, + "step": 1184 + }, + { + "epoch": 0.632, + "grad_norm": 0.43412941480796136, + "learning_rate": 6.305262083634488e-05, + "loss": 0.7614, + "step": 1185 + }, + { + "epoch": 0.6325333333333333, + "grad_norm": 0.3814499211124445, + "learning_rate": 6.289209867917312e-05, + "loss": 0.656, + "step": 1186 + }, + { + "epoch": 0.6330666666666667, + "grad_norm": 0.4240228713220223, + "learning_rate": 6.273168733182722e-05, + "loss": 0.7085, + "step": 1187 + }, + { + "epoch": 0.6336, + "grad_norm": 0.43844576076097735, + "learning_rate": 6.25713872733199e-05, + "loss": 0.7193, + "step": 1188 + }, + { + "epoch": 0.6341333333333333, + "grad_norm": 0.40423358488133027, + "learning_rate": 6.241119898233144e-05, + "loss": 0.7132, + "step": 1189 + }, + { + "epoch": 0.6346666666666667, + "grad_norm": 0.45430769254372433, + "learning_rate": 6.225112293720836e-05, + "loss": 0.7051, + "step": 1190 + }, + { + "epoch": 0.6352, + "grad_norm": 0.38735075879438313, + "learning_rate": 6.209115961596208e-05, + "loss": 0.6902, + "step": 1191 + }, + { + "epoch": 0.6357333333333334, + "grad_norm": 0.4476681106012167, + "learning_rate": 6.19313094962673e-05, + "loss": 0.6721, + "step": 1192 + }, + { + "epoch": 0.6362666666666666, + "grad_norm": 0.42450882043204863, + "learning_rate": 6.177157305546078e-05, + "loss": 0.6489, + "step": 1193 + }, + { + "epoch": 0.6368, + "grad_norm": 0.4759730254749979, + "learning_rate": 6.161195077053976e-05, + "loss": 0.7535, + "step": 1194 + }, + { + "epoch": 0.6373333333333333, + "grad_norm": 0.4478631755841564, + "learning_rate": 6.145244311816063e-05, + "loss": 0.7192, + "step": 1195 + }, + { + "epoch": 0.6378666666666667, + "grad_norm": 0.40612761287808763, + "learning_rate": 6.129305057463741e-05, + "loss": 0.6223, + "step": 1196 + }, + { + "epoch": 0.6384, + "grad_norm": 0.466293785073214, + "learning_rate": 6.113377361594049e-05, + "loss": 0.7579, + "step": 1197 + }, + { + "epoch": 0.6389333333333334, + "grad_norm": 0.3831884127623581, + "learning_rate": 6.0974612717695004e-05, + "loss": 0.6691, + "step": 1198 + }, + { + "epoch": 0.6394666666666666, + "grad_norm": 0.41725252769187654, + "learning_rate": 6.0815568355179556e-05, + "loss": 0.7244, + "step": 1199 + }, + { + "epoch": 0.64, + "grad_norm": 0.4410592672061088, + "learning_rate": 6.065664100332478e-05, + "loss": 0.7287, + "step": 1200 + }, + { + "epoch": 0.6405333333333333, + "grad_norm": 0.530556815181464, + "learning_rate": 6.0497831136711836e-05, + "loss": 0.7611, + "step": 1201 + }, + { + "epoch": 0.6410666666666667, + "grad_norm": 0.48591244087808333, + "learning_rate": 6.0339139229571116e-05, + "loss": 0.6385, + "step": 1202 + }, + { + "epoch": 0.6416, + "grad_norm": 0.4193839412184438, + "learning_rate": 6.018056575578075e-05, + "loss": 0.689, + "step": 1203 + }, + { + "epoch": 0.6421333333333333, + "grad_norm": 0.4214027254503024, + "learning_rate": 6.002211118886514e-05, + "loss": 0.7114, + "step": 1204 + }, + { + "epoch": 0.6426666666666667, + "grad_norm": 0.42500594343946496, + "learning_rate": 5.986377600199371e-05, + "loss": 0.673, + "step": 1205 + }, + { + "epoch": 0.6432, + "grad_norm": 0.40163503630139474, + "learning_rate": 5.970556066797941e-05, + "loss": 0.6312, + "step": 1206 + }, + { + "epoch": 0.6437333333333334, + "grad_norm": 0.4116961444002072, + "learning_rate": 5.9547465659277215e-05, + "loss": 0.6433, + "step": 1207 + }, + { + "epoch": 0.6442666666666667, + "grad_norm": 0.5148769388922457, + "learning_rate": 5.938949144798279e-05, + "loss": 0.7168, + "step": 1208 + }, + { + "epoch": 0.6448, + "grad_norm": 0.3636014962202317, + "learning_rate": 5.923163850583113e-05, + "loss": 0.6258, + "step": 1209 + }, + { + "epoch": 0.6453333333333333, + "grad_norm": 0.4428307925836097, + "learning_rate": 5.907390730419507e-05, + "loss": 0.6607, + "step": 1210 + }, + { + "epoch": 0.6458666666666667, + "grad_norm": 0.5207998952605079, + "learning_rate": 5.8916298314083915e-05, + "loss": 0.7153, + "step": 1211 + }, + { + "epoch": 0.6464, + "grad_norm": 0.3870667355173029, + "learning_rate": 5.875881200614207e-05, + "loss": 0.6459, + "step": 1212 + }, + { + "epoch": 0.6469333333333334, + "grad_norm": 0.39873895800873504, + "learning_rate": 5.860144885064751e-05, + "loss": 0.6063, + "step": 1213 + }, + { + "epoch": 0.6474666666666666, + "grad_norm": 0.4269796326987652, + "learning_rate": 5.8444209317510514e-05, + "loss": 0.659, + "step": 1214 + }, + { + "epoch": 0.648, + "grad_norm": 0.40107403756608534, + "learning_rate": 5.828709387627218e-05, + "loss": 0.6843, + "step": 1215 + }, + { + "epoch": 0.6485333333333333, + "grad_norm": 0.47558164818682663, + "learning_rate": 5.813010299610313e-05, + "loss": 0.7933, + "step": 1216 + }, + { + "epoch": 0.6490666666666667, + "grad_norm": 0.4164460670334844, + "learning_rate": 5.797323714580192e-05, + "loss": 0.6681, + "step": 1217 + }, + { + "epoch": 0.6496, + "grad_norm": 0.4133344695022582, + "learning_rate": 5.781649679379378e-05, + "loss": 0.6839, + "step": 1218 + }, + { + "epoch": 0.6501333333333333, + "grad_norm": 0.43040993213510487, + "learning_rate": 5.765988240812921e-05, + "loss": 0.704, + "step": 1219 + }, + { + "epoch": 0.6506666666666666, + "grad_norm": 0.49518477477559136, + "learning_rate": 5.750339445648252e-05, + "loss": 0.7085, + "step": 1220 + }, + { + "epoch": 0.6512, + "grad_norm": 0.4850571750526892, + "learning_rate": 5.73470334061505e-05, + "loss": 0.6833, + "step": 1221 + }, + { + "epoch": 0.6517333333333334, + "grad_norm": 0.3987796390002741, + "learning_rate": 5.7190799724050924e-05, + "loss": 0.6622, + "step": 1222 + }, + { + "epoch": 0.6522666666666667, + "grad_norm": 0.4755184923128712, + "learning_rate": 5.7034693876721376e-05, + "loss": 0.719, + "step": 1223 + }, + { + "epoch": 0.6528, + "grad_norm": 0.45590352150965446, + "learning_rate": 5.687871633031754e-05, + "loss": 0.7036, + "step": 1224 + }, + { + "epoch": 0.6533333333333333, + "grad_norm": 0.43236767622934885, + "learning_rate": 5.6722867550612116e-05, + "loss": 0.702, + "step": 1225 + }, + { + "epoch": 0.6538666666666667, + "grad_norm": 0.4025775660821899, + "learning_rate": 5.6567148002993164e-05, + "loss": 0.6568, + "step": 1226 + }, + { + "epoch": 0.6544, + "grad_norm": 0.4541628654123149, + "learning_rate": 5.6411558152462894e-05, + "loss": 0.6244, + "step": 1227 + }, + { + "epoch": 0.6549333333333334, + "grad_norm": 0.453429834217993, + "learning_rate": 5.625609846363622e-05, + "loss": 0.6979, + "step": 1228 + }, + { + "epoch": 0.6554666666666666, + "grad_norm": 0.43851463617889785, + "learning_rate": 5.6100769400739383e-05, + "loss": 0.6896, + "step": 1229 + }, + { + "epoch": 0.656, + "grad_norm": 0.6467991129526891, + "learning_rate": 5.5945571427608526e-05, + "loss": 0.768, + "step": 1230 + }, + { + "epoch": 0.6565333333333333, + "grad_norm": 0.6128183852221792, + "learning_rate": 5.579050500768836e-05, + "loss": 0.685, + "step": 1231 + }, + { + "epoch": 0.6570666666666667, + "grad_norm": 0.5330439921196597, + "learning_rate": 5.5635570604030705e-05, + "loss": 0.7423, + "step": 1232 + }, + { + "epoch": 0.6576, + "grad_norm": 0.4176249851405337, + "learning_rate": 5.54807686792933e-05, + "loss": 0.7399, + "step": 1233 + }, + { + "epoch": 0.6581333333333333, + "grad_norm": 0.4868808485395327, + "learning_rate": 5.53260996957381e-05, + "loss": 0.7761, + "step": 1234 + }, + { + "epoch": 0.6586666666666666, + "grad_norm": 0.44448990010738193, + "learning_rate": 5.5171564115230254e-05, + "loss": 0.6793, + "step": 1235 + }, + { + "epoch": 0.6592, + "grad_norm": 0.3831915951362028, + "learning_rate": 5.501716239923642e-05, + "loss": 0.6671, + "step": 1236 + }, + { + "epoch": 0.6597333333333333, + "grad_norm": 0.48647800046584916, + "learning_rate": 5.486289500882355e-05, + "loss": 0.6366, + "step": 1237 + }, + { + "epoch": 0.6602666666666667, + "grad_norm": 0.5028292794485312, + "learning_rate": 5.47087624046575e-05, + "loss": 0.6782, + "step": 1238 + }, + { + "epoch": 0.6608, + "grad_norm": 0.3969565263432366, + "learning_rate": 5.4554765047001613e-05, + "loss": 0.6453, + "step": 1239 + }, + { + "epoch": 0.6613333333333333, + "grad_norm": 0.5597339694128612, + "learning_rate": 5.4400903395715366e-05, + "loss": 0.7484, + "step": 1240 + }, + { + "epoch": 0.6618666666666667, + "grad_norm": 0.4407762969942455, + "learning_rate": 5.424717791025302e-05, + "loss": 0.7436, + "step": 1241 + }, + { + "epoch": 0.6624, + "grad_norm": 0.4701653531005452, + "learning_rate": 5.4093589049662175e-05, + "loss": 0.5762, + "step": 1242 + }, + { + "epoch": 0.6629333333333334, + "grad_norm": 0.4905199446737774, + "learning_rate": 5.394013727258254e-05, + "loss": 0.7413, + "step": 1243 + }, + { + "epoch": 0.6634666666666666, + "grad_norm": 0.4323538996619484, + "learning_rate": 5.378682303724435e-05, + "loss": 0.6507, + "step": 1244 + }, + { + "epoch": 0.664, + "grad_norm": 0.41336776735073155, + "learning_rate": 5.363364680146725e-05, + "loss": 0.6721, + "step": 1245 + }, + { + "epoch": 0.6645333333333333, + "grad_norm": 0.3865817908859722, + "learning_rate": 5.348060902265871e-05, + "loss": 0.6448, + "step": 1246 + }, + { + "epoch": 0.6650666666666667, + "grad_norm": 0.40584853488622363, + "learning_rate": 5.332771015781275e-05, + "loss": 0.6629, + "step": 1247 + }, + { + "epoch": 0.6656, + "grad_norm": 0.40284245486592835, + "learning_rate": 5.31749506635086e-05, + "loss": 0.6846, + "step": 1248 + }, + { + "epoch": 0.6661333333333334, + "grad_norm": 0.5105925650064085, + "learning_rate": 5.302233099590928e-05, + "loss": 0.6974, + "step": 1249 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.48321533653328735, + "learning_rate": 5.286985161076029e-05, + "loss": 0.6283, + "step": 1250 + }, + { + "epoch": 0.6672, + "grad_norm": 0.43708617207234074, + "learning_rate": 5.271751296338823e-05, + "loss": 0.6495, + "step": 1251 + }, + { + "epoch": 0.6677333333333333, + "grad_norm": 0.44014677243922035, + "learning_rate": 5.2565315508699376e-05, + "loss": 0.6814, + "step": 1252 + }, + { + "epoch": 0.6682666666666667, + "grad_norm": 0.4711462895209837, + "learning_rate": 5.2413259701178505e-05, + "loss": 0.6951, + "step": 1253 + }, + { + "epoch": 0.6688, + "grad_norm": 0.4161264510871055, + "learning_rate": 5.226134599488728e-05, + "loss": 0.6857, + "step": 1254 + }, + { + "epoch": 0.6693333333333333, + "grad_norm": 0.4775308524055948, + "learning_rate": 5.210957484346314e-05, + "loss": 0.6804, + "step": 1255 + }, + { + "epoch": 0.6698666666666667, + "grad_norm": 0.5808846635377214, + "learning_rate": 5.195794670011776e-05, + "loss": 0.7775, + "step": 1256 + }, + { + "epoch": 0.6704, + "grad_norm": 0.44018239694044253, + "learning_rate": 5.180646201763577e-05, + "loss": 0.7215, + "step": 1257 + }, + { + "epoch": 0.6709333333333334, + "grad_norm": 0.44483930698309265, + "learning_rate": 5.165512124837344e-05, + "loss": 0.6953, + "step": 1258 + }, + { + "epoch": 0.6714666666666667, + "grad_norm": 0.5334705549207137, + "learning_rate": 5.150392484425728e-05, + "loss": 0.6846, + "step": 1259 + }, + { + "epoch": 0.672, + "grad_norm": 0.4635334204863553, + "learning_rate": 5.135287325678271e-05, + "loss": 0.7009, + "step": 1260 + }, + { + "epoch": 0.6725333333333333, + "grad_norm": 0.48990461615390385, + "learning_rate": 5.120196693701267e-05, + "loss": 0.7134, + "step": 1261 + }, + { + "epoch": 0.6730666666666667, + "grad_norm": 0.4650288660872971, + "learning_rate": 5.105120633557634e-05, + "loss": 0.6391, + "step": 1262 + }, + { + "epoch": 0.6736, + "grad_norm": 0.5077059699948259, + "learning_rate": 5.090059190266779e-05, + "loss": 0.725, + "step": 1263 + }, + { + "epoch": 0.6741333333333334, + "grad_norm": 0.5310166258131124, + "learning_rate": 5.075012408804458e-05, + "loss": 0.7759, + "step": 1264 + }, + { + "epoch": 0.6746666666666666, + "grad_norm": 0.5158015698100694, + "learning_rate": 5.059980334102637e-05, + "loss": 0.6699, + "step": 1265 + }, + { + "epoch": 0.6752, + "grad_norm": 0.45919070240719934, + "learning_rate": 5.0449630110493836e-05, + "loss": 0.7224, + "step": 1266 + }, + { + "epoch": 0.6757333333333333, + "grad_norm": 0.40816446901171416, + "learning_rate": 5.0299604844886985e-05, + "loss": 0.6087, + "step": 1267 + }, + { + "epoch": 0.6762666666666667, + "grad_norm": 0.418617164731529, + "learning_rate": 5.014972799220403e-05, + "loss": 0.7321, + "step": 1268 + }, + { + "epoch": 0.6768, + "grad_norm": 0.3971450135224963, + "learning_rate": 5.000000000000002e-05, + "loss": 0.6935, + "step": 1269 + }, + { + "epoch": 0.6773333333333333, + "grad_norm": 0.41857208990235506, + "learning_rate": 4.985042131538545e-05, + "loss": 0.7104, + "step": 1270 + }, + { + "epoch": 0.6778666666666666, + "grad_norm": 0.43524793559071867, + "learning_rate": 4.9700992385024934e-05, + "loss": 0.7494, + "step": 1271 + }, + { + "epoch": 0.6784, + "grad_norm": 0.47499544549078676, + "learning_rate": 4.955171365513603e-05, + "loss": 0.6974, + "step": 1272 + }, + { + "epoch": 0.6789333333333334, + "grad_norm": 0.3653990388932472, + "learning_rate": 4.940258557148765e-05, + "loss": 0.6517, + "step": 1273 + }, + { + "epoch": 0.6794666666666667, + "grad_norm": 0.3826926723394425, + "learning_rate": 4.9253608579398855e-05, + "loss": 0.6352, + "step": 1274 + }, + { + "epoch": 0.68, + "grad_norm": 0.47546251831972147, + "learning_rate": 4.9104783123737566e-05, + "loss": 0.8002, + "step": 1275 + }, + { + "epoch": 0.6805333333333333, + "grad_norm": 0.46996949758067974, + "learning_rate": 4.895610964891923e-05, + "loss": 0.701, + "step": 1276 + }, + { + "epoch": 0.6810666666666667, + "grad_norm": 0.4397199542107964, + "learning_rate": 4.880758859890536e-05, + "loss": 0.6772, + "step": 1277 + }, + { + "epoch": 0.6816, + "grad_norm": 0.40995388485555545, + "learning_rate": 4.865922041720239e-05, + "loss": 0.6291, + "step": 1278 + }, + { + "epoch": 0.6821333333333334, + "grad_norm": 0.4292507699261795, + "learning_rate": 4.851100554686021e-05, + "loss": 0.7056, + "step": 1279 + }, + { + "epoch": 0.6826666666666666, + "grad_norm": 0.3959481857025995, + "learning_rate": 4.836294443047088e-05, + "loss": 0.6909, + "step": 1280 + }, + { + "epoch": 0.6832, + "grad_norm": 0.5011243027705662, + "learning_rate": 4.821503751016746e-05, + "loss": 0.7071, + "step": 1281 + }, + { + "epoch": 0.6837333333333333, + "grad_norm": 0.5264871079596701, + "learning_rate": 4.8067285227622404e-05, + "loss": 0.7536, + "step": 1282 + }, + { + "epoch": 0.6842666666666667, + "grad_norm": 0.4196101521504747, + "learning_rate": 4.791968802404648e-05, + "loss": 0.6729, + "step": 1283 + }, + { + "epoch": 0.6848, + "grad_norm": 0.4701477446986873, + "learning_rate": 4.777224634018732e-05, + "loss": 0.7349, + "step": 1284 + }, + { + "epoch": 0.6853333333333333, + "grad_norm": 0.45922516003781183, + "learning_rate": 4.762496061632814e-05, + "loss": 0.6913, + "step": 1285 + }, + { + "epoch": 0.6858666666666666, + "grad_norm": 0.5222404404840132, + "learning_rate": 4.747783129228656e-05, + "loss": 0.6479, + "step": 1286 + }, + { + "epoch": 0.6864, + "grad_norm": 0.41816096701873523, + "learning_rate": 4.733085880741301e-05, + "loss": 0.5806, + "step": 1287 + }, + { + "epoch": 0.6869333333333333, + "grad_norm": 0.487397058916506, + "learning_rate": 4.718404360058966e-05, + "loss": 0.6774, + "step": 1288 + }, + { + "epoch": 0.6874666666666667, + "grad_norm": 0.43079324609284403, + "learning_rate": 4.7037386110228985e-05, + "loss": 0.6776, + "step": 1289 + }, + { + "epoch": 0.688, + "grad_norm": 0.46102733535716617, + "learning_rate": 4.689088677427249e-05, + "loss": 0.6919, + "step": 1290 + }, + { + "epoch": 0.6885333333333333, + "grad_norm": 0.3953396431957988, + "learning_rate": 4.6744546030189486e-05, + "loss": 0.6575, + "step": 1291 + }, + { + "epoch": 0.6890666666666667, + "grad_norm": 0.39399303788299483, + "learning_rate": 4.659836431497563e-05, + "loss": 0.6213, + "step": 1292 + }, + { + "epoch": 0.6896, + "grad_norm": 0.3759089877093692, + "learning_rate": 4.645234206515171e-05, + "loss": 0.621, + "step": 1293 + }, + { + "epoch": 0.6901333333333334, + "grad_norm": 0.40925533431762995, + "learning_rate": 4.630647971676232e-05, + "loss": 0.6247, + "step": 1294 + }, + { + "epoch": 0.6906666666666667, + "grad_norm": 0.505159326387147, + "learning_rate": 4.6160777705374524e-05, + "loss": 0.7192, + "step": 1295 + }, + { + "epoch": 0.6912, + "grad_norm": 0.4221276559855802, + "learning_rate": 4.6015236466076747e-05, + "loss": 0.7134, + "step": 1296 + }, + { + "epoch": 0.6917333333333333, + "grad_norm": 0.3655578499295446, + "learning_rate": 4.586985643347717e-05, + "loss": 0.5917, + "step": 1297 + }, + { + "epoch": 0.6922666666666667, + "grad_norm": 0.434260553186921, + "learning_rate": 4.572463804170263e-05, + "loss": 0.6226, + "step": 1298 + }, + { + "epoch": 0.6928, + "grad_norm": 0.41927843946365817, + "learning_rate": 4.5579581724397255e-05, + "loss": 0.6162, + "step": 1299 + }, + { + "epoch": 0.6933333333333334, + "grad_norm": 0.43675092772193574, + "learning_rate": 4.543468791472131e-05, + "loss": 0.6711, + "step": 1300 + }, + { + "epoch": 0.6938666666666666, + "grad_norm": 0.4482351276308452, + "learning_rate": 4.5289957045349653e-05, + "loss": 0.6319, + "step": 1301 + }, + { + "epoch": 0.6944, + "grad_norm": 0.45343576093395144, + "learning_rate": 4.514538954847064e-05, + "loss": 0.7511, + "step": 1302 + }, + { + "epoch": 0.6949333333333333, + "grad_norm": 0.4028826179925407, + "learning_rate": 4.5000985855784746e-05, + "loss": 0.6549, + "step": 1303 + }, + { + "epoch": 0.6954666666666667, + "grad_norm": 0.48046840157404724, + "learning_rate": 4.485674639850333e-05, + "loss": 0.6538, + "step": 1304 + }, + { + "epoch": 0.696, + "grad_norm": 0.42045322713418654, + "learning_rate": 4.471267160734731e-05, + "loss": 0.6759, + "step": 1305 + }, + { + "epoch": 0.6965333333333333, + "grad_norm": 0.44657651646268753, + "learning_rate": 4.456876191254582e-05, + "loss": 0.7518, + "step": 1306 + }, + { + "epoch": 0.6970666666666666, + "grad_norm": 0.4556318718130518, + "learning_rate": 4.442501774383515e-05, + "loss": 0.7657, + "step": 1307 + }, + { + "epoch": 0.6976, + "grad_norm": 0.4397741660582045, + "learning_rate": 4.428143953045717e-05, + "loss": 0.7605, + "step": 1308 + }, + { + "epoch": 0.6981333333333334, + "grad_norm": 0.4531289604393625, + "learning_rate": 4.413802770115816e-05, + "loss": 0.7293, + "step": 1309 + }, + { + "epoch": 0.6986666666666667, + "grad_norm": 0.46589715983901325, + "learning_rate": 4.399478268418771e-05, + "loss": 0.6772, + "step": 1310 + }, + { + "epoch": 0.6992, + "grad_norm": 0.5073510560869048, + "learning_rate": 4.385170490729712e-05, + "loss": 0.6744, + "step": 1311 + }, + { + "epoch": 0.6997333333333333, + "grad_norm": 0.4157927043911074, + "learning_rate": 4.3708794797738375e-05, + "loss": 0.6682, + "step": 1312 + }, + { + "epoch": 0.7002666666666667, + "grad_norm": 0.4770391448646444, + "learning_rate": 4.3566052782262735e-05, + "loss": 0.6817, + "step": 1313 + }, + { + "epoch": 0.7008, + "grad_norm": 0.4801890769977989, + "learning_rate": 4.342347928711953e-05, + "loss": 0.6698, + "step": 1314 + }, + { + "epoch": 0.7013333333333334, + "grad_norm": 0.4734320911245157, + "learning_rate": 4.328107473805487e-05, + "loss": 0.7149, + "step": 1315 + }, + { + "epoch": 0.7018666666666666, + "grad_norm": 0.4420285003704658, + "learning_rate": 4.3138839560310303e-05, + "loss": 0.6655, + "step": 1316 + }, + { + "epoch": 0.7024, + "grad_norm": 0.42934215063887937, + "learning_rate": 4.2996774178621736e-05, + "loss": 0.6605, + "step": 1317 + }, + { + "epoch": 0.7029333333333333, + "grad_norm": 0.5043512124137548, + "learning_rate": 4.2854879017217894e-05, + "loss": 0.7483, + "step": 1318 + }, + { + "epoch": 0.7034666666666667, + "grad_norm": 0.48819416093841506, + "learning_rate": 4.271315449981934e-05, + "loss": 0.7453, + "step": 1319 + }, + { + "epoch": 0.704, + "grad_norm": 0.42948686180896956, + "learning_rate": 4.257160104963696e-05, + "loss": 0.6741, + "step": 1320 + }, + { + "epoch": 0.7045333333333333, + "grad_norm": 0.4634141322694243, + "learning_rate": 4.2430219089370823e-05, + "loss": 0.6771, + "step": 1321 + }, + { + "epoch": 0.7050666666666666, + "grad_norm": 0.4040112591373145, + "learning_rate": 4.228900904120895e-05, + "loss": 0.6696, + "step": 1322 + }, + { + "epoch": 0.7056, + "grad_norm": 0.4972940744230627, + "learning_rate": 4.2147971326825966e-05, + "loss": 0.7481, + "step": 1323 + }, + { + "epoch": 0.7061333333333333, + "grad_norm": 0.4779896136914934, + "learning_rate": 4.200710636738189e-05, + "loss": 0.7624, + "step": 1324 + }, + { + "epoch": 0.7066666666666667, + "grad_norm": 0.35441590514537924, + "learning_rate": 4.1866414583520877e-05, + "loss": 0.587, + "step": 1325 + }, + { + "epoch": 0.7072, + "grad_norm": 0.44120062830045637, + "learning_rate": 4.172589639536991e-05, + "loss": 0.671, + "step": 1326 + }, + { + "epoch": 0.7077333333333333, + "grad_norm": 0.3569951734313286, + "learning_rate": 4.158555222253771e-05, + "loss": 0.6416, + "step": 1327 + }, + { + "epoch": 0.7082666666666667, + "grad_norm": 0.4853865033135809, + "learning_rate": 4.14453824841132e-05, + "loss": 0.7498, + "step": 1328 + }, + { + "epoch": 0.7088, + "grad_norm": 0.5425103499740149, + "learning_rate": 4.130538759866457e-05, + "loss": 0.6326, + "step": 1329 + }, + { + "epoch": 0.7093333333333334, + "grad_norm": 0.49556347361308867, + "learning_rate": 4.1165567984237764e-05, + "loss": 0.6519, + "step": 1330 + }, + { + "epoch": 0.7098666666666666, + "grad_norm": 0.5412599360095961, + "learning_rate": 4.102592405835536e-05, + "loss": 0.6511, + "step": 1331 + }, + { + "epoch": 0.7104, + "grad_norm": 0.4397286600871548, + "learning_rate": 4.088645623801534e-05, + "loss": 0.7173, + "step": 1332 + }, + { + "epoch": 0.7109333333333333, + "grad_norm": 0.36734758083748076, + "learning_rate": 4.074716493968975e-05, + "loss": 0.6393, + "step": 1333 + }, + { + "epoch": 0.7114666666666667, + "grad_norm": 0.4783721529678446, + "learning_rate": 4.060805057932359e-05, + "loss": 0.7696, + "step": 1334 + }, + { + "epoch": 0.712, + "grad_norm": 0.40301976864384675, + "learning_rate": 4.046911357233343e-05, + "loss": 0.6797, + "step": 1335 + }, + { + "epoch": 0.7125333333333334, + "grad_norm": 0.35473872328811157, + "learning_rate": 4.0330354333606234e-05, + "loss": 0.6079, + "step": 1336 + }, + { + "epoch": 0.7130666666666666, + "grad_norm": 0.4013938109382642, + "learning_rate": 4.019177327749822e-05, + "loss": 0.6165, + "step": 1337 + }, + { + "epoch": 0.7136, + "grad_norm": 0.44058459786803794, + "learning_rate": 4.00533708178334e-05, + "loss": 0.7382, + "step": 1338 + }, + { + "epoch": 0.7141333333333333, + "grad_norm": 0.5328855124479709, + "learning_rate": 3.991514736790258e-05, + "loss": 0.651, + "step": 1339 + }, + { + "epoch": 0.7146666666666667, + "grad_norm": 0.4455698946397586, + "learning_rate": 3.977710334046193e-05, + "loss": 0.7481, + "step": 1340 + }, + { + "epoch": 0.7152, + "grad_norm": 0.43625608319365694, + "learning_rate": 3.963923914773187e-05, + "loss": 0.6686, + "step": 1341 + }, + { + "epoch": 0.7157333333333333, + "grad_norm": 0.39478402973427, + "learning_rate": 3.950155520139581e-05, + "loss": 0.6772, + "step": 1342 + }, + { + "epoch": 0.7162666666666667, + "grad_norm": 0.4447714164559942, + "learning_rate": 3.936405191259891e-05, + "loss": 0.6152, + "step": 1343 + }, + { + "epoch": 0.7168, + "grad_norm": 0.3709666292725546, + "learning_rate": 3.922672969194686e-05, + "loss": 0.6522, + "step": 1344 + }, + { + "epoch": 0.7173333333333334, + "grad_norm": 0.49935376346546406, + "learning_rate": 3.9089588949504655e-05, + "loss": 0.5718, + "step": 1345 + }, + { + "epoch": 0.7178666666666667, + "grad_norm": 0.5193097691899753, + "learning_rate": 3.895263009479534e-05, + "loss": 0.7117, + "step": 1346 + }, + { + "epoch": 0.7184, + "grad_norm": 0.49367808147553527, + "learning_rate": 3.8815853536798904e-05, + "loss": 0.6369, + "step": 1347 + }, + { + "epoch": 0.7189333333333333, + "grad_norm": 0.4446072810452819, + "learning_rate": 3.867925968395085e-05, + "loss": 0.6319, + "step": 1348 + }, + { + "epoch": 0.7194666666666667, + "grad_norm": 0.44230619891195616, + "learning_rate": 3.854284894414122e-05, + "loss": 0.7182, + "step": 1349 + }, + { + "epoch": 0.72, + "grad_norm": 0.38420112215989166, + "learning_rate": 3.840662172471315e-05, + "loss": 0.6216, + "step": 1350 + }, + { + "epoch": 0.7205333333333334, + "grad_norm": 0.5019080492723267, + "learning_rate": 3.82705784324618e-05, + "loss": 0.6856, + "step": 1351 + }, + { + "epoch": 0.7210666666666666, + "grad_norm": 0.40799296100479354, + "learning_rate": 3.8134719473633094e-05, + "loss": 0.6419, + "step": 1352 + }, + { + "epoch": 0.7216, + "grad_norm": 0.36619573541564876, + "learning_rate": 3.79990452539225e-05, + "loss": 0.5831, + "step": 1353 + }, + { + "epoch": 0.7221333333333333, + "grad_norm": 0.4294973033120836, + "learning_rate": 3.786355617847385e-05, + "loss": 0.701, + "step": 1354 + }, + { + "epoch": 0.7226666666666667, + "grad_norm": 0.42997115843154643, + "learning_rate": 3.772825265187802e-05, + "loss": 0.6917, + "step": 1355 + }, + { + "epoch": 0.7232, + "grad_norm": 0.5505262459058446, + "learning_rate": 3.759313507817196e-05, + "loss": 0.6789, + "step": 1356 + }, + { + "epoch": 0.7237333333333333, + "grad_norm": 0.4484169997765625, + "learning_rate": 3.7458203860837234e-05, + "loss": 0.6884, + "step": 1357 + }, + { + "epoch": 0.7242666666666666, + "grad_norm": 0.3253818373046138, + "learning_rate": 3.732345940279893e-05, + "loss": 0.5682, + "step": 1358 + }, + { + "epoch": 0.7248, + "grad_norm": 0.5005046168097154, + "learning_rate": 3.7188902106424416e-05, + "loss": 0.7798, + "step": 1359 + }, + { + "epoch": 0.7253333333333334, + "grad_norm": 0.4535851767705107, + "learning_rate": 3.705453237352227e-05, + "loss": 0.6535, + "step": 1360 + }, + { + "epoch": 0.7258666666666667, + "grad_norm": 0.37017306277182493, + "learning_rate": 3.692035060534088e-05, + "loss": 0.6378, + "step": 1361 + }, + { + "epoch": 0.7264, + "grad_norm": 0.4659719483337839, + "learning_rate": 3.678635720256737e-05, + "loss": 0.686, + "step": 1362 + }, + { + "epoch": 0.7269333333333333, + "grad_norm": 0.39666192885243245, + "learning_rate": 3.665255256532638e-05, + "loss": 0.6586, + "step": 1363 + }, + { + "epoch": 0.7274666666666667, + "grad_norm": 0.3715412733899542, + "learning_rate": 3.651893709317887e-05, + "loss": 0.6385, + "step": 1364 + }, + { + "epoch": 0.728, + "grad_norm": 0.4029055369944867, + "learning_rate": 3.638551118512089e-05, + "loss": 0.6223, + "step": 1365 + }, + { + "epoch": 0.7285333333333334, + "grad_norm": 0.3572924866367461, + "learning_rate": 3.625227523958252e-05, + "loss": 0.5971, + "step": 1366 + }, + { + "epoch": 0.7290666666666666, + "grad_norm": 0.4426536277758904, + "learning_rate": 3.611922965442648e-05, + "loss": 0.6824, + "step": 1367 + }, + { + "epoch": 0.7296, + "grad_norm": 0.4441399528703781, + "learning_rate": 3.5986374826947066e-05, + "loss": 0.6384, + "step": 1368 + }, + { + "epoch": 0.7301333333333333, + "grad_norm": 0.43418683226601634, + "learning_rate": 3.5853711153868965e-05, + "loss": 0.6839, + "step": 1369 + }, + { + "epoch": 0.7306666666666667, + "grad_norm": 0.4411283154623925, + "learning_rate": 3.5721239031346066e-05, + "loss": 0.6961, + "step": 1370 + }, + { + "epoch": 0.7312, + "grad_norm": 0.4195467102302851, + "learning_rate": 3.558895885496023e-05, + "loss": 0.7042, + "step": 1371 + }, + { + "epoch": 0.7317333333333333, + "grad_norm": 0.39319018327375055, + "learning_rate": 3.545687101972013e-05, + "loss": 0.6923, + "step": 1372 + }, + { + "epoch": 0.7322666666666666, + "grad_norm": 0.4232049942394652, + "learning_rate": 3.53249759200601e-05, + "loss": 0.6429, + "step": 1373 + }, + { + "epoch": 0.7328, + "grad_norm": 0.4127115823043955, + "learning_rate": 3.519327394983888e-05, + "loss": 0.6523, + "step": 1374 + }, + { + "epoch": 0.7333333333333333, + "grad_norm": 0.4071491423804467, + "learning_rate": 3.506176550233863e-05, + "loss": 0.7078, + "step": 1375 + }, + { + "epoch": 0.7338666666666667, + "grad_norm": 0.38089821859697426, + "learning_rate": 3.4930450970263485e-05, + "loss": 0.6483, + "step": 1376 + }, + { + "epoch": 0.7344, + "grad_norm": 0.4405816078153508, + "learning_rate": 3.479933074573858e-05, + "loss": 0.6595, + "step": 1377 + }, + { + "epoch": 0.7349333333333333, + "grad_norm": 0.4887840782636558, + "learning_rate": 3.46684052203088e-05, + "loss": 0.6609, + "step": 1378 + }, + { + "epoch": 0.7354666666666667, + "grad_norm": 0.5416620265321336, + "learning_rate": 3.4537674784937614e-05, + "loss": 0.7102, + "step": 1379 + }, + { + "epoch": 0.736, + "grad_norm": 0.45612068099197606, + "learning_rate": 3.440713983000601e-05, + "loss": 0.6926, + "step": 1380 + }, + { + "epoch": 0.7365333333333334, + "grad_norm": 0.41827704069272087, + "learning_rate": 3.427680074531113e-05, + "loss": 0.6795, + "step": 1381 + }, + { + "epoch": 0.7370666666666666, + "grad_norm": 0.4216592821850561, + "learning_rate": 3.4146657920065285e-05, + "loss": 0.6102, + "step": 1382 + }, + { + "epoch": 0.7376, + "grad_norm": 0.5968250083973519, + "learning_rate": 3.401671174289469e-05, + "loss": 0.7302, + "step": 1383 + }, + { + "epoch": 0.7381333333333333, + "grad_norm": 0.410665467136115, + "learning_rate": 3.388696260183832e-05, + "loss": 0.6755, + "step": 1384 + }, + { + "epoch": 0.7386666666666667, + "grad_norm": 0.4695062858668635, + "learning_rate": 3.3757410884346894e-05, + "loss": 0.738, + "step": 1385 + }, + { + "epoch": 0.7392, + "grad_norm": 0.46271244818895624, + "learning_rate": 3.362805697728145e-05, + "loss": 0.6636, + "step": 1386 + }, + { + "epoch": 0.7397333333333334, + "grad_norm": 0.4015570788728205, + "learning_rate": 3.3498901266912396e-05, + "loss": 0.6179, + "step": 1387 + }, + { + "epoch": 0.7402666666666666, + "grad_norm": 0.48364235248570314, + "learning_rate": 3.336994413891828e-05, + "loss": 0.7225, + "step": 1388 + }, + { + "epoch": 0.7408, + "grad_norm": 0.3973194957699503, + "learning_rate": 3.324118597838464e-05, + "loss": 0.6019, + "step": 1389 + }, + { + "epoch": 0.7413333333333333, + "grad_norm": 0.40642190153774327, + "learning_rate": 3.3112627169802946e-05, + "loss": 0.6479, + "step": 1390 + }, + { + "epoch": 0.7418666666666667, + "grad_norm": 0.42111818419243835, + "learning_rate": 3.298426809706928e-05, + "loss": 0.6101, + "step": 1391 + }, + { + "epoch": 0.7424, + "grad_norm": 0.479618840905392, + "learning_rate": 3.285610914348332e-05, + "loss": 0.7012, + "step": 1392 + }, + { + "epoch": 0.7429333333333333, + "grad_norm": 0.572763218453302, + "learning_rate": 3.2728150691747115e-05, + "loss": 0.753, + "step": 1393 + }, + { + "epoch": 0.7434666666666667, + "grad_norm": 0.42415380076495746, + "learning_rate": 3.2600393123964113e-05, + "loss": 0.6656, + "step": 1394 + }, + { + "epoch": 0.744, + "grad_norm": 0.44506141267476257, + "learning_rate": 3.2472836821637744e-05, + "loss": 0.6995, + "step": 1395 + }, + { + "epoch": 0.7445333333333334, + "grad_norm": 0.42713836436030106, + "learning_rate": 3.234548216567049e-05, + "loss": 0.7182, + "step": 1396 + }, + { + "epoch": 0.7450666666666667, + "grad_norm": 0.3998359784462856, + "learning_rate": 3.2218329536362704e-05, + "loss": 0.6857, + "step": 1397 + }, + { + "epoch": 0.7456, + "grad_norm": 0.5075873742854828, + "learning_rate": 3.209137931341143e-05, + "loss": 0.5661, + "step": 1398 + }, + { + "epoch": 0.7461333333333333, + "grad_norm": 0.43959844272118975, + "learning_rate": 3.196463187590929e-05, + "loss": 0.6635, + "step": 1399 + }, + { + "epoch": 0.7466666666666667, + "grad_norm": 0.45097273226825346, + "learning_rate": 3.1838087602343344e-05, + "loss": 0.6798, + "step": 1400 + }, + { + "epoch": 0.7472, + "grad_norm": 0.4528735769012832, + "learning_rate": 3.1711746870594086e-05, + "loss": 0.6903, + "step": 1401 + }, + { + "epoch": 0.7477333333333334, + "grad_norm": 0.4493233856167751, + "learning_rate": 3.158561005793402e-05, + "loss": 0.7353, + "step": 1402 + }, + { + "epoch": 0.7482666666666666, + "grad_norm": 1.1432675217729333, + "learning_rate": 3.145967754102691e-05, + "loss": 0.6526, + "step": 1403 + }, + { + "epoch": 0.7488, + "grad_norm": 0.4719850237881296, + "learning_rate": 3.1333949695926324e-05, + "loss": 0.6482, + "step": 1404 + }, + { + "epoch": 0.7493333333333333, + "grad_norm": 0.4575611421586871, + "learning_rate": 3.120842689807468e-05, + "loss": 0.6597, + "step": 1405 + }, + { + "epoch": 0.7498666666666667, + "grad_norm": 0.3955243959116776, + "learning_rate": 3.108310952230212e-05, + "loss": 0.6432, + "step": 1406 + }, + { + "epoch": 0.7504, + "grad_norm": 0.4209895420383914, + "learning_rate": 3.0957997942825336e-05, + "loss": 0.6398, + "step": 1407 + }, + { + "epoch": 0.7509333333333333, + "grad_norm": 0.436750904523161, + "learning_rate": 3.083309253324651e-05, + "loss": 0.6819, + "step": 1408 + }, + { + "epoch": 0.7514666666666666, + "grad_norm": 0.44006405567494983, + "learning_rate": 3.070839366655215e-05, + "loss": 0.6409, + "step": 1409 + }, + { + "epoch": 0.752, + "grad_norm": 0.41012660114209337, + "learning_rate": 3.058390171511196e-05, + "loss": 0.6675, + "step": 1410 + }, + { + "epoch": 0.7525333333333334, + "grad_norm": 0.40466806228202684, + "learning_rate": 3.0459617050677868e-05, + "loss": 0.6112, + "step": 1411 + }, + { + "epoch": 0.7530666666666667, + "grad_norm": 0.4320520288424673, + "learning_rate": 3.0335540044382694e-05, + "loss": 0.6214, + "step": 1412 + }, + { + "epoch": 0.7536, + "grad_norm": 0.6547957865987912, + "learning_rate": 3.021167106673928e-05, + "loss": 0.7243, + "step": 1413 + }, + { + "epoch": 0.7541333333333333, + "grad_norm": 0.493665262749511, + "learning_rate": 3.008801048763914e-05, + "loss": 0.6408, + "step": 1414 + }, + { + "epoch": 0.7546666666666667, + "grad_norm": 0.46362516582819296, + "learning_rate": 2.996455867635155e-05, + "loss": 0.7521, + "step": 1415 + }, + { + "epoch": 0.7552, + "grad_norm": 0.4838596305339148, + "learning_rate": 2.9841316001522347e-05, + "loss": 0.7279, + "step": 1416 + }, + { + "epoch": 0.7557333333333334, + "grad_norm": 0.40205261407352516, + "learning_rate": 2.9718282831172883e-05, + "loss": 0.6594, + "step": 1417 + }, + { + "epoch": 0.7562666666666666, + "grad_norm": 0.6608564183232708, + "learning_rate": 2.9595459532698854e-05, + "loss": 0.7107, + "step": 1418 + }, + { + "epoch": 0.7568, + "grad_norm": 0.40477314830392686, + "learning_rate": 2.9472846472869298e-05, + "loss": 0.6404, + "step": 1419 + }, + { + "epoch": 0.7573333333333333, + "grad_norm": 0.45862989831373807, + "learning_rate": 2.9350444017825385e-05, + "loss": 0.6943, + "step": 1420 + }, + { + "epoch": 0.7578666666666667, + "grad_norm": 0.4499846395944625, + "learning_rate": 2.922825253307947e-05, + "loss": 0.6938, + "step": 1421 + }, + { + "epoch": 0.7584, + "grad_norm": 0.5009883034739381, + "learning_rate": 2.9106272383513835e-05, + "loss": 0.7392, + "step": 1422 + }, + { + "epoch": 0.7589333333333333, + "grad_norm": 0.39787126435833514, + "learning_rate": 2.898450393337977e-05, + "loss": 0.6446, + "step": 1423 + }, + { + "epoch": 0.7594666666666666, + "grad_norm": 0.4962610327432991, + "learning_rate": 2.8862947546296315e-05, + "loss": 0.6979, + "step": 1424 + }, + { + "epoch": 0.76, + "grad_norm": 0.4112045306396125, + "learning_rate": 2.874160358524931e-05, + "loss": 0.6257, + "step": 1425 + }, + { + "epoch": 0.7605333333333333, + "grad_norm": 0.4206304928677344, + "learning_rate": 2.8620472412590228e-05, + "loss": 0.7027, + "step": 1426 + }, + { + "epoch": 0.7610666666666667, + "grad_norm": 0.42846636415891637, + "learning_rate": 2.8499554390035143e-05, + "loss": 0.6885, + "step": 1427 + }, + { + "epoch": 0.7616, + "grad_norm": 0.3764584034790101, + "learning_rate": 2.8378849878663628e-05, + "loss": 0.6238, + "step": 1428 + }, + { + "epoch": 0.7621333333333333, + "grad_norm": 0.40290372899365723, + "learning_rate": 2.8258359238917665e-05, + "loss": 0.6628, + "step": 1429 + }, + { + "epoch": 0.7626666666666667, + "grad_norm": 0.3824708549754406, + "learning_rate": 2.8138082830600554e-05, + "loss": 0.6348, + "step": 1430 + }, + { + "epoch": 0.7632, + "grad_norm": 0.4431141193201716, + "learning_rate": 2.8018021012875994e-05, + "loss": 0.6318, + "step": 1431 + }, + { + "epoch": 0.7637333333333334, + "grad_norm": 0.37440098757183116, + "learning_rate": 2.7898174144266732e-05, + "loss": 0.638, + "step": 1432 + }, + { + "epoch": 0.7642666666666666, + "grad_norm": 0.5076540492769912, + "learning_rate": 2.7778542582653744e-05, + "loss": 0.7331, + "step": 1433 + }, + { + "epoch": 0.7648, + "grad_norm": 0.43367732412243143, + "learning_rate": 2.7659126685275027e-05, + "loss": 0.5986, + "step": 1434 + }, + { + "epoch": 0.7653333333333333, + "grad_norm": 0.39999747699379296, + "learning_rate": 2.753992680872457e-05, + "loss": 0.6826, + "step": 1435 + }, + { + "epoch": 0.7658666666666667, + "grad_norm": 0.4122860559375757, + "learning_rate": 2.7420943308951284e-05, + "loss": 0.6402, + "step": 1436 + }, + { + "epoch": 0.7664, + "grad_norm": 0.5332219220904169, + "learning_rate": 2.7302176541257986e-05, + "loss": 0.6996, + "step": 1437 + }, + { + "epoch": 0.7669333333333334, + "grad_norm": 0.39457104219654593, + "learning_rate": 2.7183626860300247e-05, + "loss": 0.6206, + "step": 1438 + }, + { + "epoch": 0.7674666666666666, + "grad_norm": 0.4125437184555351, + "learning_rate": 2.7065294620085424e-05, + "loss": 0.6496, + "step": 1439 + }, + { + "epoch": 0.768, + "grad_norm": 0.4434116647478168, + "learning_rate": 2.6947180173971508e-05, + "loss": 0.6972, + "step": 1440 + }, + { + "epoch": 0.7685333333333333, + "grad_norm": 0.5242067106090085, + "learning_rate": 2.6829283874666233e-05, + "loss": 0.6346, + "step": 1441 + }, + { + "epoch": 0.7690666666666667, + "grad_norm": 0.48697919458894945, + "learning_rate": 2.6711606074225782e-05, + "loss": 0.6727, + "step": 1442 + }, + { + "epoch": 0.7696, + "grad_norm": 0.6840449583991132, + "learning_rate": 2.659414712405398e-05, + "loss": 0.6906, + "step": 1443 + }, + { + "epoch": 0.7701333333333333, + "grad_norm": 0.5723928648456147, + "learning_rate": 2.647690737490106e-05, + "loss": 0.7639, + "step": 1444 + }, + { + "epoch": 0.7706666666666667, + "grad_norm": 0.4842751313789253, + "learning_rate": 2.6359887176862718e-05, + "loss": 0.6301, + "step": 1445 + }, + { + "epoch": 0.7712, + "grad_norm": 0.37937441711304115, + "learning_rate": 2.6243086879379e-05, + "loss": 0.663, + "step": 1446 + }, + { + "epoch": 0.7717333333333334, + "grad_norm": 0.45872658764132634, + "learning_rate": 2.6126506831233344e-05, + "loss": 0.7325, + "step": 1447 + }, + { + "epoch": 0.7722666666666667, + "grad_norm": 0.4032654765599899, + "learning_rate": 2.6010147380551475e-05, + "loss": 0.6517, + "step": 1448 + }, + { + "epoch": 0.7728, + "grad_norm": 0.6299189974694519, + "learning_rate": 2.5894008874800325e-05, + "loss": 0.7193, + "step": 1449 + }, + { + "epoch": 0.7733333333333333, + "grad_norm": 0.3459084928602634, + "learning_rate": 2.577809166078716e-05, + "loss": 0.6186, + "step": 1450 + }, + { + "epoch": 0.7738666666666667, + "grad_norm": 0.40655230914119433, + "learning_rate": 2.566239608465838e-05, + "loss": 0.6147, + "step": 1451 + }, + { + "epoch": 0.7744, + "grad_norm": 0.47992854967493603, + "learning_rate": 2.5546922491898495e-05, + "loss": 0.668, + "step": 1452 + }, + { + "epoch": 0.7749333333333334, + "grad_norm": 0.40885929670105964, + "learning_rate": 2.543167122732918e-05, + "loss": 0.6854, + "step": 1453 + }, + { + "epoch": 0.7754666666666666, + "grad_norm": 0.562744141891325, + "learning_rate": 2.5316642635108244e-05, + "loss": 0.7349, + "step": 1454 + }, + { + "epoch": 0.776, + "grad_norm": 0.579209398186038, + "learning_rate": 2.5201837058728505e-05, + "loss": 0.6996, + "step": 1455 + }, + { + "epoch": 0.7765333333333333, + "grad_norm": 0.40889853330697956, + "learning_rate": 2.508725484101684e-05, + "loss": 0.5663, + "step": 1456 + }, + { + "epoch": 0.7770666666666667, + "grad_norm": 0.4039180918173561, + "learning_rate": 2.4972896324133144e-05, + "loss": 0.5978, + "step": 1457 + }, + { + "epoch": 0.7776, + "grad_norm": 0.4260067864950819, + "learning_rate": 2.485876184956928e-05, + "loss": 0.6329, + "step": 1458 + }, + { + "epoch": 0.7781333333333333, + "grad_norm": 0.35314156930204316, + "learning_rate": 2.4744851758148156e-05, + "loss": 0.5872, + "step": 1459 + }, + { + "epoch": 0.7786666666666666, + "grad_norm": 0.40319276048692143, + "learning_rate": 2.4631166390022574e-05, + "loss": 0.6405, + "step": 1460 + }, + { + "epoch": 0.7792, + "grad_norm": 0.43965680601175855, + "learning_rate": 2.451770608467432e-05, + "loss": 0.6824, + "step": 1461 + }, + { + "epoch": 0.7797333333333333, + "grad_norm": 0.4064539636314437, + "learning_rate": 2.4404471180913058e-05, + "loss": 0.6392, + "step": 1462 + }, + { + "epoch": 0.7802666666666667, + "grad_norm": 0.4483840815129231, + "learning_rate": 2.429146201687538e-05, + "loss": 0.7326, + "step": 1463 + }, + { + "epoch": 0.7808, + "grad_norm": 0.4523115976497022, + "learning_rate": 2.417867893002387e-05, + "loss": 0.6374, + "step": 1464 + }, + { + "epoch": 0.7813333333333333, + "grad_norm": 0.5091395654116365, + "learning_rate": 2.4066122257145894e-05, + "loss": 0.6796, + "step": 1465 + }, + { + "epoch": 0.7818666666666667, + "grad_norm": 0.4599915732411999, + "learning_rate": 2.3953792334352787e-05, + "loss": 0.6411, + "step": 1466 + }, + { + "epoch": 0.7824, + "grad_norm": 0.42810617569498916, + "learning_rate": 2.3841689497078746e-05, + "loss": 0.6739, + "step": 1467 + }, + { + "epoch": 0.7829333333333334, + "grad_norm": 0.42906586147037934, + "learning_rate": 2.3729814080079816e-05, + "loss": 0.6877, + "step": 1468 + }, + { + "epoch": 0.7834666666666666, + "grad_norm": 0.38841250504046165, + "learning_rate": 2.361816641743303e-05, + "loss": 0.6279, + "step": 1469 + }, + { + "epoch": 0.784, + "grad_norm": 0.4342986841252191, + "learning_rate": 2.3506746842535242e-05, + "loss": 0.7165, + "step": 1470 + }, + { + "epoch": 0.7845333333333333, + "grad_norm": 0.38872010965643383, + "learning_rate": 2.339555568810221e-05, + "loss": 0.6231, + "step": 1471 + }, + { + "epoch": 0.7850666666666667, + "grad_norm": 0.5407543818226905, + "learning_rate": 2.328459328616759e-05, + "loss": 0.6601, + "step": 1472 + }, + { + "epoch": 0.7856, + "grad_norm": 0.444503105857603, + "learning_rate": 2.3173859968081944e-05, + "loss": 0.6142, + "step": 1473 + }, + { + "epoch": 0.7861333333333334, + "grad_norm": 0.379561000692178, + "learning_rate": 2.306335606451181e-05, + "loss": 0.6243, + "step": 1474 + }, + { + "epoch": 0.7866666666666666, + "grad_norm": 0.4121710483336557, + "learning_rate": 2.295308190543859e-05, + "loss": 0.6316, + "step": 1475 + }, + { + "epoch": 0.7872, + "grad_norm": 0.4691290071900537, + "learning_rate": 2.2843037820157675e-05, + "loss": 0.6443, + "step": 1476 + }, + { + "epoch": 0.7877333333333333, + "grad_norm": 0.4055354723342302, + "learning_rate": 2.2733224137277366e-05, + "loss": 0.6107, + "step": 1477 + }, + { + "epoch": 0.7882666666666667, + "grad_norm": 0.3929569369202823, + "learning_rate": 2.262364118471805e-05, + "loss": 0.6733, + "step": 1478 + }, + { + "epoch": 0.7888, + "grad_norm": 0.4814978653308864, + "learning_rate": 2.251428928971102e-05, + "loss": 0.6407, + "step": 1479 + }, + { + "epoch": 0.7893333333333333, + "grad_norm": 0.3981649138803261, + "learning_rate": 2.2405168778797646e-05, + "loss": 0.6531, + "step": 1480 + }, + { + "epoch": 0.7898666666666667, + "grad_norm": 0.36836314340158854, + "learning_rate": 2.2296279977828337e-05, + "loss": 0.6226, + "step": 1481 + }, + { + "epoch": 0.7904, + "grad_norm": 0.4481966583544684, + "learning_rate": 2.2187623211961562e-05, + "loss": 0.7159, + "step": 1482 + }, + { + "epoch": 0.7909333333333334, + "grad_norm": 0.3500837632158312, + "learning_rate": 2.2079198805662914e-05, + "loss": 0.6112, + "step": 1483 + }, + { + "epoch": 0.7914666666666667, + "grad_norm": 0.4783183264793033, + "learning_rate": 2.1971007082704164e-05, + "loss": 0.635, + "step": 1484 + }, + { + "epoch": 0.792, + "grad_norm": 0.4443236553599415, + "learning_rate": 2.1863048366162208e-05, + "loss": 0.6441, + "step": 1485 + }, + { + "epoch": 0.7925333333333333, + "grad_norm": 0.4541438465337239, + "learning_rate": 2.1755322978418137e-05, + "loss": 0.6763, + "step": 1486 + }, + { + "epoch": 0.7930666666666667, + "grad_norm": 0.4718351012226869, + "learning_rate": 2.1647831241156302e-05, + "loss": 0.6387, + "step": 1487 + }, + { + "epoch": 0.7936, + "grad_norm": 0.4241830736068816, + "learning_rate": 2.1540573475363402e-05, + "loss": 0.6314, + "step": 1488 + }, + { + "epoch": 0.7941333333333334, + "grad_norm": 0.4680461026945068, + "learning_rate": 2.1433550001327373e-05, + "loss": 0.6812, + "step": 1489 + }, + { + "epoch": 0.7946666666666666, + "grad_norm": 0.7163228104444151, + "learning_rate": 2.1326761138636553e-05, + "loss": 0.6863, + "step": 1490 + }, + { + "epoch": 0.7952, + "grad_norm": 0.4554616306459936, + "learning_rate": 2.1220207206178688e-05, + "loss": 0.615, + "step": 1491 + }, + { + "epoch": 0.7957333333333333, + "grad_norm": 0.4007787544244606, + "learning_rate": 2.111388852214001e-05, + "loss": 0.6999, + "step": 1492 + }, + { + "epoch": 0.7962666666666667, + "grad_norm": 0.5228251349902756, + "learning_rate": 2.1007805404004242e-05, + "loss": 0.7226, + "step": 1493 + }, + { + "epoch": 0.7968, + "grad_norm": 0.4230339780418657, + "learning_rate": 2.0901958168551638e-05, + "loss": 0.6562, + "step": 1494 + }, + { + "epoch": 0.7973333333333333, + "grad_norm": 0.4289200465615318, + "learning_rate": 2.0796347131858186e-05, + "loss": 0.6975, + "step": 1495 + }, + { + "epoch": 0.7978666666666666, + "grad_norm": 0.469280677300091, + "learning_rate": 2.069097260929439e-05, + "loss": 0.6408, + "step": 1496 + }, + { + "epoch": 0.7984, + "grad_norm": 0.4367332567861892, + "learning_rate": 2.058583491552465e-05, + "loss": 0.6777, + "step": 1497 + }, + { + "epoch": 0.7989333333333334, + "grad_norm": 0.3819927261154882, + "learning_rate": 2.048093436450603e-05, + "loss": 0.6572, + "step": 1498 + }, + { + "epoch": 0.7994666666666667, + "grad_norm": 0.545684475160914, + "learning_rate": 2.0376271269487514e-05, + "loss": 0.7169, + "step": 1499 + }, + { + "epoch": 0.8, + "grad_norm": 0.3756034309671269, + "learning_rate": 2.027184594300898e-05, + "loss": 0.6111, + "step": 1500 + }, + { + "epoch": 0.8005333333333333, + "grad_norm": 0.47738980941475234, + "learning_rate": 2.0167658696900317e-05, + "loss": 0.6274, + "step": 1501 + }, + { + "epoch": 0.8010666666666667, + "grad_norm": 0.4614568106789699, + "learning_rate": 2.0063709842280432e-05, + "loss": 0.7233, + "step": 1502 + }, + { + "epoch": 0.8016, + "grad_norm": 0.4170144770443061, + "learning_rate": 1.995999968955641e-05, + "loss": 0.6834, + "step": 1503 + }, + { + "epoch": 0.8021333333333334, + "grad_norm": 0.4378867660574267, + "learning_rate": 1.985652854842247e-05, + "loss": 0.6897, + "step": 1504 + }, + { + "epoch": 0.8026666666666666, + "grad_norm": 0.6109643981863047, + "learning_rate": 1.9753296727859195e-05, + "loss": 0.6896, + "step": 1505 + }, + { + "epoch": 0.8032, + "grad_norm": 0.4362002872919862, + "learning_rate": 1.9650304536132426e-05, + "loss": 0.6262, + "step": 1506 + }, + { + "epoch": 0.8037333333333333, + "grad_norm": 0.44474390683765214, + "learning_rate": 1.9547552280792524e-05, + "loss": 0.5892, + "step": 1507 + }, + { + "epoch": 0.8042666666666667, + "grad_norm": 0.5702420791951768, + "learning_rate": 1.9445040268673298e-05, + "loss": 0.7219, + "step": 1508 + }, + { + "epoch": 0.8048, + "grad_norm": 0.4787500446260359, + "learning_rate": 1.9342768805891178e-05, + "loss": 0.6837, + "step": 1509 + }, + { + "epoch": 0.8053333333333333, + "grad_norm": 0.48508785097643087, + "learning_rate": 1.9240738197844278e-05, + "loss": 0.67, + "step": 1510 + }, + { + "epoch": 0.8058666666666666, + "grad_norm": 0.42022236520136474, + "learning_rate": 1.9138948749211472e-05, + "loss": 0.7159, + "step": 1511 + }, + { + "epoch": 0.8064, + "grad_norm": 0.4527569141085781, + "learning_rate": 1.903740076395151e-05, + "loss": 0.6423, + "step": 1512 + }, + { + "epoch": 0.8069333333333333, + "grad_norm": 0.39016311585877517, + "learning_rate": 1.8936094545302095e-05, + "loss": 0.6467, + "step": 1513 + }, + { + "epoch": 0.8074666666666667, + "grad_norm": 0.5433799100774432, + "learning_rate": 1.883503039577894e-05, + "loss": 0.6036, + "step": 1514 + }, + { + "epoch": 0.808, + "grad_norm": 0.38182838533340996, + "learning_rate": 1.8734208617174988e-05, + "loss": 0.606, + "step": 1515 + }, + { + "epoch": 0.8085333333333333, + "grad_norm": 0.4369063274675231, + "learning_rate": 1.8633629510559314e-05, + "loss": 0.6541, + "step": 1516 + }, + { + "epoch": 0.8090666666666667, + "grad_norm": 0.46879166057021016, + "learning_rate": 1.8533293376276472e-05, + "loss": 0.6798, + "step": 1517 + }, + { + "epoch": 0.8096, + "grad_norm": 0.3369820745212901, + "learning_rate": 1.8433200513945337e-05, + "loss": 0.5896, + "step": 1518 + }, + { + "epoch": 0.8101333333333334, + "grad_norm": 0.5947328223831719, + "learning_rate": 1.8333351222458407e-05, + "loss": 0.772, + "step": 1519 + }, + { + "epoch": 0.8106666666666666, + "grad_norm": 0.4633678727479024, + "learning_rate": 1.8233745799980817e-05, + "loss": 0.6713, + "step": 1520 + }, + { + "epoch": 0.8112, + "grad_norm": 0.5685596062239611, + "learning_rate": 1.8134384543949478e-05, + "loss": 0.73, + "step": 1521 + }, + { + "epoch": 0.8117333333333333, + "grad_norm": 0.4081445228850184, + "learning_rate": 1.803526775107217e-05, + "loss": 0.6344, + "step": 1522 + }, + { + "epoch": 0.8122666666666667, + "grad_norm": 0.44003364356713587, + "learning_rate": 1.7936395717326704e-05, + "loss": 0.6511, + "step": 1523 + }, + { + "epoch": 0.8128, + "grad_norm": 0.5786424582544375, + "learning_rate": 1.783776873795994e-05, + "loss": 0.5641, + "step": 1524 + }, + { + "epoch": 0.8133333333333334, + "grad_norm": 0.43733236504905026, + "learning_rate": 1.773938710748706e-05, + "loss": 0.665, + "step": 1525 + }, + { + "epoch": 0.8138666666666666, + "grad_norm": 0.4502124006287469, + "learning_rate": 1.7641251119690505e-05, + "loss": 0.6443, + "step": 1526 + }, + { + "epoch": 0.8144, + "grad_norm": 0.44235441651659346, + "learning_rate": 1.754336106761927e-05, + "loss": 0.65, + "step": 1527 + }, + { + "epoch": 0.8149333333333333, + "grad_norm": 0.5184934723274678, + "learning_rate": 1.744571724358789e-05, + "loss": 0.6528, + "step": 1528 + }, + { + "epoch": 0.8154666666666667, + "grad_norm": 0.4166153228231402, + "learning_rate": 1.7348319939175637e-05, + "loss": 0.6568, + "step": 1529 + }, + { + "epoch": 0.816, + "grad_norm": 0.5684615427199803, + "learning_rate": 1.7251169445225657e-05, + "loss": 0.7484, + "step": 1530 + }, + { + "epoch": 0.8165333333333333, + "grad_norm": 0.3977201241748141, + "learning_rate": 1.715426605184407e-05, + "loss": 0.6442, + "step": 1531 + }, + { + "epoch": 0.8170666666666667, + "grad_norm": 0.48124047636989997, + "learning_rate": 1.705761004839911e-05, + "loss": 0.7476, + "step": 1532 + }, + { + "epoch": 0.8176, + "grad_norm": 0.6256985102564921, + "learning_rate": 1.696120172352025e-05, + "loss": 0.7582, + "step": 1533 + }, + { + "epoch": 0.8181333333333334, + "grad_norm": 0.4135303417301385, + "learning_rate": 1.6865041365097435e-05, + "loss": 0.6429, + "step": 1534 + }, + { + "epoch": 0.8186666666666667, + "grad_norm": 0.4542126357851726, + "learning_rate": 1.676912926028007e-05, + "loss": 0.6585, + "step": 1535 + }, + { + "epoch": 0.8192, + "grad_norm": 0.3936954529296244, + "learning_rate": 1.6673465695476232e-05, + "loss": 0.7135, + "step": 1536 + }, + { + "epoch": 0.8197333333333333, + "grad_norm": 0.48552027393072117, + "learning_rate": 1.6578050956351886e-05, + "loss": 0.6239, + "step": 1537 + }, + { + "epoch": 0.8202666666666667, + "grad_norm": 0.4455902312851424, + "learning_rate": 1.6482885327829913e-05, + "loss": 0.5849, + "step": 1538 + }, + { + "epoch": 0.8208, + "grad_norm": 0.4632386386190594, + "learning_rate": 1.6387969094089316e-05, + "loss": 0.7358, + "step": 1539 + }, + { + "epoch": 0.8213333333333334, + "grad_norm": 0.4750722058952672, + "learning_rate": 1.6293302538564382e-05, + "loss": 0.6443, + "step": 1540 + }, + { + "epoch": 0.8218666666666666, + "grad_norm": 0.3797520738465224, + "learning_rate": 1.619888594394382e-05, + "loss": 0.6288, + "step": 1541 + }, + { + "epoch": 0.8224, + "grad_norm": 0.4437672820295281, + "learning_rate": 1.6104719592169902e-05, + "loss": 0.6568, + "step": 1542 + }, + { + "epoch": 0.8229333333333333, + "grad_norm": 0.41281488934315297, + "learning_rate": 1.601080376443763e-05, + "loss": 0.6181, + "step": 1543 + }, + { + "epoch": 0.8234666666666667, + "grad_norm": 0.6409353771099737, + "learning_rate": 1.5917138741193973e-05, + "loss": 0.6241, + "step": 1544 + }, + { + "epoch": 0.824, + "grad_norm": 0.4639950500009388, + "learning_rate": 1.5823724802136865e-05, + "loss": 0.7206, + "step": 1545 + }, + { + "epoch": 0.8245333333333333, + "grad_norm": 0.4341948855687017, + "learning_rate": 1.573056222621453e-05, + "loss": 0.6599, + "step": 1546 + }, + { + "epoch": 0.8250666666666666, + "grad_norm": 0.4144951273474816, + "learning_rate": 1.5637651291624523e-05, + "loss": 0.6564, + "step": 1547 + }, + { + "epoch": 0.8256, + "grad_norm": 0.48211622062561166, + "learning_rate": 1.5544992275813053e-05, + "loss": 0.7183, + "step": 1548 + }, + { + "epoch": 0.8261333333333334, + "grad_norm": 0.5195647645247848, + "learning_rate": 1.5452585455473977e-05, + "loss": 0.7231, + "step": 1549 + }, + { + "epoch": 0.8266666666666667, + "grad_norm": 0.567201243175725, + "learning_rate": 1.536043110654809e-05, + "loss": 0.6932, + "step": 1550 + }, + { + "epoch": 0.8272, + "grad_norm": 0.4730026109960757, + "learning_rate": 1.526852950422226e-05, + "loss": 0.6919, + "step": 1551 + }, + { + "epoch": 0.8277333333333333, + "grad_norm": 0.45433871843404566, + "learning_rate": 1.5176880922928616e-05, + "loss": 0.6607, + "step": 1552 + }, + { + "epoch": 0.8282666666666667, + "grad_norm": 0.4097757902258825, + "learning_rate": 1.5085485636343755e-05, + "loss": 0.6481, + "step": 1553 + }, + { + "epoch": 0.8288, + "grad_norm": 0.4445355264901289, + "learning_rate": 1.4994343917387854e-05, + "loss": 0.6355, + "step": 1554 + }, + { + "epoch": 0.8293333333333334, + "grad_norm": 0.44696609144607774, + "learning_rate": 1.4903456038223939e-05, + "loss": 0.6668, + "step": 1555 + }, + { + "epoch": 0.8298666666666666, + "grad_norm": 0.411072395882937, + "learning_rate": 1.4812822270257009e-05, + "loss": 0.6622, + "step": 1556 + }, + { + "epoch": 0.8304, + "grad_norm": 0.42983422126722076, + "learning_rate": 1.4722442884133214e-05, + "loss": 0.7323, + "step": 1557 + }, + { + "epoch": 0.8309333333333333, + "grad_norm": 0.4965015650726598, + "learning_rate": 1.4632318149739177e-05, + "loss": 0.6629, + "step": 1558 + }, + { + "epoch": 0.8314666666666667, + "grad_norm": 0.43943615031041566, + "learning_rate": 1.454244833620102e-05, + "loss": 0.6918, + "step": 1559 + }, + { + "epoch": 0.832, + "grad_norm": 0.3798145018855565, + "learning_rate": 1.4452833711883628e-05, + "loss": 0.64, + "step": 1560 + }, + { + "epoch": 0.8325333333333333, + "grad_norm": 0.4237693029700599, + "learning_rate": 1.4363474544389877e-05, + "loss": 0.6481, + "step": 1561 + }, + { + "epoch": 0.8330666666666666, + "grad_norm": 0.4354980981388452, + "learning_rate": 1.4274371100559791e-05, + "loss": 0.6503, + "step": 1562 + }, + { + "epoch": 0.8336, + "grad_norm": 0.49188118000721065, + "learning_rate": 1.4185523646469822e-05, + "loss": 0.6602, + "step": 1563 + }, + { + "epoch": 0.8341333333333333, + "grad_norm": 0.5150270739089561, + "learning_rate": 1.409693244743192e-05, + "loss": 0.7321, + "step": 1564 + }, + { + "epoch": 0.8346666666666667, + "grad_norm": 0.4432625754648843, + "learning_rate": 1.4008597767992871e-05, + "loss": 0.686, + "step": 1565 + }, + { + "epoch": 0.8352, + "grad_norm": 0.5666187564839488, + "learning_rate": 1.3920519871933424e-05, + "loss": 0.7402, + "step": 1566 + }, + { + "epoch": 0.8357333333333333, + "grad_norm": 0.3587049280419388, + "learning_rate": 1.3832699022267515e-05, + "loss": 0.5906, + "step": 1567 + }, + { + "epoch": 0.8362666666666667, + "grad_norm": 0.3913212053081317, + "learning_rate": 1.37451354812416e-05, + "loss": 0.6042, + "step": 1568 + }, + { + "epoch": 0.8368, + "grad_norm": 0.4018650792169943, + "learning_rate": 1.3657829510333654e-05, + "loss": 0.6515, + "step": 1569 + }, + { + "epoch": 0.8373333333333334, + "grad_norm": 0.3905729772016787, + "learning_rate": 1.3570781370252582e-05, + "loss": 0.6916, + "step": 1570 + }, + { + "epoch": 0.8378666666666666, + "grad_norm": 0.45168546096280476, + "learning_rate": 1.3483991320937306e-05, + "loss": 0.6439, + "step": 1571 + }, + { + "epoch": 0.8384, + "grad_norm": 0.39448987239315225, + "learning_rate": 1.339745962155613e-05, + "loss": 0.6338, + "step": 1572 + }, + { + "epoch": 0.8389333333333333, + "grad_norm": 0.41538214396139544, + "learning_rate": 1.3311186530505838e-05, + "loss": 0.6622, + "step": 1573 + }, + { + "epoch": 0.8394666666666667, + "grad_norm": 0.3518625690462678, + "learning_rate": 1.322517230541096e-05, + "loss": 0.5916, + "step": 1574 + }, + { + "epoch": 0.84, + "grad_norm": 0.5183792247040455, + "learning_rate": 1.3139417203123027e-05, + "loss": 0.6675, + "step": 1575 + }, + { + "epoch": 0.8405333333333334, + "grad_norm": 0.37766917561812363, + "learning_rate": 1.30539214797198e-05, + "loss": 0.6529, + "step": 1576 + }, + { + "epoch": 0.8410666666666666, + "grad_norm": 0.3643754756719974, + "learning_rate": 1.2968685390504465e-05, + "loss": 0.5925, + "step": 1577 + }, + { + "epoch": 0.8416, + "grad_norm": 0.373712533344589, + "learning_rate": 1.2883709190004955e-05, + "loss": 0.6373, + "step": 1578 + }, + { + "epoch": 0.8421333333333333, + "grad_norm": 0.48997948446422157, + "learning_rate": 1.2798993131973091e-05, + "loss": 0.6207, + "step": 1579 + }, + { + "epoch": 0.8426666666666667, + "grad_norm": 0.47513872374997856, + "learning_rate": 1.2714537469383858e-05, + "loss": 0.6708, + "step": 1580 + }, + { + "epoch": 0.8432, + "grad_norm": 0.5118449162386708, + "learning_rate": 1.263034245443473e-05, + "loss": 0.6799, + "step": 1581 + }, + { + "epoch": 0.8437333333333333, + "grad_norm": 0.3854080092446448, + "learning_rate": 1.2546408338544769e-05, + "loss": 0.6293, + "step": 1582 + }, + { + "epoch": 0.8442666666666667, + "grad_norm": 0.43422935312312305, + "learning_rate": 1.2462735372353996e-05, + "loss": 0.6373, + "step": 1583 + }, + { + "epoch": 0.8448, + "grad_norm": 0.4173380998324428, + "learning_rate": 1.2379323805722576e-05, + "loss": 0.6201, + "step": 1584 + }, + { + "epoch": 0.8453333333333334, + "grad_norm": 0.38922889827805607, + "learning_rate": 1.2296173887730123e-05, + "loss": 0.6496, + "step": 1585 + }, + { + "epoch": 0.8458666666666667, + "grad_norm": 0.4887497044658339, + "learning_rate": 1.2213285866674905e-05, + "loss": 0.7261, + "step": 1586 + }, + { + "epoch": 0.8464, + "grad_norm": 0.34874728843648506, + "learning_rate": 1.2130659990073146e-05, + "loss": 0.5998, + "step": 1587 + }, + { + "epoch": 0.8469333333333333, + "grad_norm": 0.412856453140932, + "learning_rate": 1.2048296504658207e-05, + "loss": 0.6768, + "step": 1588 + }, + { + "epoch": 0.8474666666666667, + "grad_norm": 0.4742852328898488, + "learning_rate": 1.1966195656380031e-05, + "loss": 0.6613, + "step": 1589 + }, + { + "epoch": 0.848, + "grad_norm": 0.4380092885638821, + "learning_rate": 1.1884357690404158e-05, + "loss": 0.6444, + "step": 1590 + }, + { + "epoch": 0.8485333333333334, + "grad_norm": 0.4284426210759624, + "learning_rate": 1.1802782851111205e-05, + "loss": 0.6596, + "step": 1591 + }, + { + "epoch": 0.8490666666666666, + "grad_norm": 0.5193185731149842, + "learning_rate": 1.1721471382096027e-05, + "loss": 0.7499, + "step": 1592 + }, + { + "epoch": 0.8496, + "grad_norm": 0.4049456112109701, + "learning_rate": 1.1640423526166988e-05, + "loss": 0.6365, + "step": 1593 + }, + { + "epoch": 0.8501333333333333, + "grad_norm": 0.4543968233765363, + "learning_rate": 1.1559639525345311e-05, + "loss": 0.7466, + "step": 1594 + }, + { + "epoch": 0.8506666666666667, + "grad_norm": 0.48525162102566444, + "learning_rate": 1.1479119620864276e-05, + "loss": 0.7101, + "step": 1595 + }, + { + "epoch": 0.8512, + "grad_norm": 0.49142803118979916, + "learning_rate": 1.1398864053168534e-05, + "loss": 0.7279, + "step": 1596 + }, + { + "epoch": 0.8517333333333333, + "grad_norm": 0.4121167023556667, + "learning_rate": 1.1318873061913405e-05, + "loss": 0.612, + "step": 1597 + }, + { + "epoch": 0.8522666666666666, + "grad_norm": 0.4392176348033429, + "learning_rate": 1.123914688596409e-05, + "loss": 0.6512, + "step": 1598 + }, + { + "epoch": 0.8528, + "grad_norm": 0.3763130191660776, + "learning_rate": 1.1159685763395111e-05, + "loss": 0.6198, + "step": 1599 + }, + { + "epoch": 0.8533333333333334, + "grad_norm": 0.41785549470405486, + "learning_rate": 1.1080489931489391e-05, + "loss": 0.622, + "step": 1600 + }, + { + "epoch": 0.8538666666666667, + "grad_norm": 0.41218349773786217, + "learning_rate": 1.1001559626737756e-05, + "loss": 0.6049, + "step": 1601 + }, + { + "epoch": 0.8544, + "grad_norm": 0.46669989087698827, + "learning_rate": 1.0922895084838037e-05, + "loss": 0.6865, + "step": 1602 + }, + { + "epoch": 0.8549333333333333, + "grad_norm": 0.4355246424305574, + "learning_rate": 1.0844496540694515e-05, + "loss": 0.6605, + "step": 1603 + }, + { + "epoch": 0.8554666666666667, + "grad_norm": 0.389477566637251, + "learning_rate": 1.0766364228417148e-05, + "loss": 0.6501, + "step": 1604 + }, + { + "epoch": 0.856, + "grad_norm": 0.44038632705053193, + "learning_rate": 1.0688498381320855e-05, + "loss": 0.7111, + "step": 1605 + }, + { + "epoch": 0.8565333333333334, + "grad_norm": 0.4806305889169313, + "learning_rate": 1.0610899231924886e-05, + "loss": 0.6465, + "step": 1606 + }, + { + "epoch": 0.8570666666666666, + "grad_norm": 0.46298817064117004, + "learning_rate": 1.0533567011952094e-05, + "loss": 0.6892, + "step": 1607 + }, + { + "epoch": 0.8576, + "grad_norm": 0.5003699140691916, + "learning_rate": 1.045650195232819e-05, + "loss": 0.7541, + "step": 1608 + }, + { + "epoch": 0.8581333333333333, + "grad_norm": 0.4394608676070686, + "learning_rate": 1.0379704283181179e-05, + "loss": 0.6181, + "step": 1609 + }, + { + "epoch": 0.8586666666666667, + "grad_norm": 0.4829426268887746, + "learning_rate": 1.0303174233840528e-05, + "loss": 0.7067, + "step": 1610 + }, + { + "epoch": 0.8592, + "grad_norm": 0.41210876324544893, + "learning_rate": 1.0226912032836611e-05, + "loss": 0.6117, + "step": 1611 + }, + { + "epoch": 0.8597333333333333, + "grad_norm": 0.42686456913939924, + "learning_rate": 1.0150917907899926e-05, + "loss": 0.6556, + "step": 1612 + }, + { + "epoch": 0.8602666666666666, + "grad_norm": 0.3866631681828193, + "learning_rate": 1.007519208596045e-05, + "loss": 0.6473, + "step": 1613 + }, + { + "epoch": 0.8608, + "grad_norm": 0.42655242487824907, + "learning_rate": 9.999734793146998e-06, + "loss": 0.6375, + "step": 1614 + }, + { + "epoch": 0.8613333333333333, + "grad_norm": 0.4389474578671961, + "learning_rate": 9.924546254786493e-06, + "loss": 0.6484, + "step": 1615 + }, + { + "epoch": 0.8618666666666667, + "grad_norm": 0.3820630345668428, + "learning_rate": 9.849626695403324e-06, + "loss": 0.6199, + "step": 1616 + }, + { + "epoch": 0.8624, + "grad_norm": 0.4046582607113642, + "learning_rate": 9.774976338718677e-06, + "loss": 0.6241, + "step": 1617 + }, + { + "epoch": 0.8629333333333333, + "grad_norm": 0.4311763839878247, + "learning_rate": 9.700595407649805e-06, + "loss": 0.6651, + "step": 1618 + }, + { + "epoch": 0.8634666666666667, + "grad_norm": 0.4542880511580375, + "learning_rate": 9.62648412430951e-06, + "loss": 0.6579, + "step": 1619 + }, + { + "epoch": 0.864, + "grad_norm": 0.4120580504746446, + "learning_rate": 9.552642710005299e-06, + "loss": 0.6905, + "step": 1620 + }, + { + "epoch": 0.8645333333333334, + "grad_norm": 0.4884266870435224, + "learning_rate": 9.479071385238892e-06, + "loss": 0.6292, + "step": 1621 + }, + { + "epoch": 0.8650666666666667, + "grad_norm": 0.43728759161149866, + "learning_rate": 9.40577036970538e-06, + "loss": 0.7067, + "step": 1622 + }, + { + "epoch": 0.8656, + "grad_norm": 0.439111482727896, + "learning_rate": 9.332739882292752e-06, + "loss": 0.6407, + "step": 1623 + }, + { + "epoch": 0.8661333333333333, + "grad_norm": 0.4845461222834999, + "learning_rate": 9.259980141081115e-06, + "loss": 0.7314, + "step": 1624 + }, + { + "epoch": 0.8666666666666667, + "grad_norm": 0.42911042479983624, + "learning_rate": 9.187491363342093e-06, + "loss": 0.6611, + "step": 1625 + }, + { + "epoch": 0.8672, + "grad_norm": 0.3968703255307971, + "learning_rate": 9.115273765538202e-06, + "loss": 0.6534, + "step": 1626 + }, + { + "epoch": 0.8677333333333334, + "grad_norm": 0.38961375599421977, + "learning_rate": 9.043327563322112e-06, + "loss": 0.6134, + "step": 1627 + }, + { + "epoch": 0.8682666666666666, + "grad_norm": 0.46070695937734074, + "learning_rate": 8.971652971536148e-06, + "loss": 0.6964, + "step": 1628 + }, + { + "epoch": 0.8688, + "grad_norm": 0.48319645317624976, + "learning_rate": 8.900250204211514e-06, + "loss": 0.6803, + "step": 1629 + }, + { + "epoch": 0.8693333333333333, + "grad_norm": 0.43922212202329997, + "learning_rate": 8.829119474567671e-06, + "loss": 0.6305, + "step": 1630 + }, + { + "epoch": 0.8698666666666667, + "grad_norm": 0.4385169927193048, + "learning_rate": 8.758260995011825e-06, + "loss": 0.6984, + "step": 1631 + }, + { + "epoch": 0.8704, + "grad_norm": 0.4530229102352299, + "learning_rate": 8.687674977138116e-06, + "loss": 0.6638, + "step": 1632 + }, + { + "epoch": 0.8709333333333333, + "grad_norm": 0.513428802989218, + "learning_rate": 8.617361631727138e-06, + "loss": 0.7451, + "step": 1633 + }, + { + "epoch": 0.8714666666666666, + "grad_norm": 0.40863775505880795, + "learning_rate": 8.547321168745193e-06, + "loss": 0.6235, + "step": 1634 + }, + { + "epoch": 0.872, + "grad_norm": 0.5514454993610067, + "learning_rate": 8.47755379734373e-06, + "loss": 0.769, + "step": 1635 + }, + { + "epoch": 0.8725333333333334, + "grad_norm": 0.4075714742225624, + "learning_rate": 8.408059725858719e-06, + "loss": 0.602, + "step": 1636 + }, + { + "epoch": 0.8730666666666667, + "grad_norm": 0.4042193874742804, + "learning_rate": 8.338839161809997e-06, + "loss": 0.6839, + "step": 1637 + }, + { + "epoch": 0.8736, + "grad_norm": 0.38368033873891205, + "learning_rate": 8.269892311900696e-06, + "loss": 0.6281, + "step": 1638 + }, + { + "epoch": 0.8741333333333333, + "grad_norm": 0.47887727965872756, + "learning_rate": 8.201219382016556e-06, + "loss": 0.6253, + "step": 1639 + }, + { + "epoch": 0.8746666666666667, + "grad_norm": 0.41406932588159323, + "learning_rate": 8.132820577225387e-06, + "loss": 0.5932, + "step": 1640 + }, + { + "epoch": 0.8752, + "grad_norm": 0.4321653709688338, + "learning_rate": 8.064696101776358e-06, + "loss": 0.6539, + "step": 1641 + }, + { + "epoch": 0.8757333333333334, + "grad_norm": 0.4325485273086699, + "learning_rate": 7.996846159099557e-06, + "loss": 0.6799, + "step": 1642 + }, + { + "epoch": 0.8762666666666666, + "grad_norm": 0.49747056271681245, + "learning_rate": 7.929270951805178e-06, + "loss": 0.7104, + "step": 1643 + }, + { + "epoch": 0.8768, + "grad_norm": 0.4124555922656816, + "learning_rate": 7.861970681683051e-06, + "loss": 0.6821, + "step": 1644 + }, + { + "epoch": 0.8773333333333333, + "grad_norm": 0.4278702974851724, + "learning_rate": 7.794945549701993e-06, + "loss": 0.6409, + "step": 1645 + }, + { + "epoch": 0.8778666666666667, + "grad_norm": 0.5268838421986082, + "learning_rate": 7.728195756009204e-06, + "loss": 0.6701, + "step": 1646 + }, + { + "epoch": 0.8784, + "grad_norm": 0.3356705873094502, + "learning_rate": 7.661721499929753e-06, + "loss": 0.6254, + "step": 1647 + }, + { + "epoch": 0.8789333333333333, + "grad_norm": 0.5115665554429503, + "learning_rate": 7.595522979965819e-06, + "loss": 0.7473, + "step": 1648 + }, + { + "epoch": 0.8794666666666666, + "grad_norm": 0.5549275423071007, + "learning_rate": 7.529600393796232e-06, + "loss": 0.634, + "step": 1649 + }, + { + "epoch": 0.88, + "grad_norm": 0.4129681953465492, + "learning_rate": 7.463953938275858e-06, + "loss": 0.6858, + "step": 1650 + }, + { + "epoch": 0.8805333333333333, + "grad_norm": 0.4334896205553804, + "learning_rate": 7.3985838094349444e-06, + "loss": 0.7082, + "step": 1651 + }, + { + "epoch": 0.8810666666666667, + "grad_norm": 0.37738828750884534, + "learning_rate": 7.333490202478666e-06, + "loss": 0.6513, + "step": 1652 + }, + { + "epoch": 0.8816, + "grad_norm": 0.7062015918845946, + "learning_rate": 7.2686733117863784e-06, + "loss": 0.69, + "step": 1653 + }, + { + "epoch": 0.8821333333333333, + "grad_norm": 0.3933121471822579, + "learning_rate": 7.204133330911178e-06, + "loss": 0.6891, + "step": 1654 + }, + { + "epoch": 0.8826666666666667, + "grad_norm": 0.46153451371390386, + "learning_rate": 7.1398704525792e-06, + "loss": 0.5924, + "step": 1655 + }, + { + "epoch": 0.8832, + "grad_norm": 0.4619194720743569, + "learning_rate": 7.07588486868922e-06, + "loss": 0.6446, + "step": 1656 + }, + { + "epoch": 0.8837333333333334, + "grad_norm": 0.40485432193236764, + "learning_rate": 7.012176770311862e-06, + "loss": 0.6527, + "step": 1657 + }, + { + "epoch": 0.8842666666666666, + "grad_norm": 0.46498987579859147, + "learning_rate": 6.948746347689183e-06, + "loss": 0.6156, + "step": 1658 + }, + { + "epoch": 0.8848, + "grad_norm": 0.4273623814044134, + "learning_rate": 6.8855937902340576e-06, + "loss": 0.6114, + "step": 1659 + }, + { + "epoch": 0.8853333333333333, + "grad_norm": 0.45165622762854996, + "learning_rate": 6.8227192865295995e-06, + "loss": 0.6164, + "step": 1660 + }, + { + "epoch": 0.8858666666666667, + "grad_norm": 0.5411455976859217, + "learning_rate": 6.760123024328624e-06, + "loss": 0.7047, + "step": 1661 + }, + { + "epoch": 0.8864, + "grad_norm": 0.4170236243872799, + "learning_rate": 6.6978051905530855e-06, + "loss": 0.5988, + "step": 1662 + }, + { + "epoch": 0.8869333333333334, + "grad_norm": 0.419468252827379, + "learning_rate": 6.635765971293484e-06, + "loss": 0.5675, + "step": 1663 + }, + { + "epoch": 0.8874666666666666, + "grad_norm": 0.42088947831228424, + "learning_rate": 6.5740055518083375e-06, + "loss": 0.6784, + "step": 1664 + }, + { + "epoch": 0.888, + "grad_norm": 0.42553680475341854, + "learning_rate": 6.512524116523633e-06, + "loss": 0.6179, + "step": 1665 + }, + { + "epoch": 0.8885333333333333, + "grad_norm": 0.4851315382837286, + "learning_rate": 6.451321849032288e-06, + "loss": 0.6756, + "step": 1666 + }, + { + "epoch": 0.8890666666666667, + "grad_norm": 0.39282675919244703, + "learning_rate": 6.390398932093555e-06, + "loss": 0.6327, + "step": 1667 + }, + { + "epoch": 0.8896, + "grad_norm": 0.5588568816765518, + "learning_rate": 6.329755547632499e-06, + "loss": 0.6282, + "step": 1668 + }, + { + "epoch": 0.8901333333333333, + "grad_norm": 0.4698368238260259, + "learning_rate": 6.269391876739495e-06, + "loss": 0.6312, + "step": 1669 + }, + { + "epoch": 0.8906666666666667, + "grad_norm": 0.4367667295862253, + "learning_rate": 6.209308099669597e-06, + "loss": 0.6252, + "step": 1670 + }, + { + "epoch": 0.8912, + "grad_norm": 0.4577544590908375, + "learning_rate": 6.149504395842087e-06, + "loss": 0.7267, + "step": 1671 + }, + { + "epoch": 0.8917333333333334, + "grad_norm": 0.3917080039263942, + "learning_rate": 6.089980943839924e-06, + "loss": 0.5493, + "step": 1672 + }, + { + "epoch": 0.8922666666666667, + "grad_norm": 0.4628547087686605, + "learning_rate": 6.030737921409169e-06, + "loss": 0.645, + "step": 1673 + }, + { + "epoch": 0.8928, + "grad_norm": 0.4100854716226644, + "learning_rate": 5.971775505458444e-06, + "loss": 0.6528, + "step": 1674 + }, + { + "epoch": 0.8933333333333333, + "grad_norm": 0.469383484226112, + "learning_rate": 5.913093872058528e-06, + "loss": 0.7227, + "step": 1675 + }, + { + "epoch": 0.8938666666666667, + "grad_norm": 0.42939021894209495, + "learning_rate": 5.854693196441641e-06, + "loss": 0.6351, + "step": 1676 + }, + { + "epoch": 0.8944, + "grad_norm": 0.4587514440256765, + "learning_rate": 5.7965736530010916e-06, + "loss": 0.6145, + "step": 1677 + }, + { + "epoch": 0.8949333333333334, + "grad_norm": 0.40796818894275366, + "learning_rate": 5.738735415290642e-06, + "loss": 0.6208, + "step": 1678 + }, + { + "epoch": 0.8954666666666666, + "grad_norm": 0.5245166810352607, + "learning_rate": 5.681178656024055e-06, + "loss": 0.7941, + "step": 1679 + }, + { + "epoch": 0.896, + "grad_norm": 0.41858576012266857, + "learning_rate": 5.623903547074549e-06, + "loss": 0.6605, + "step": 1680 + }, + { + "epoch": 0.8965333333333333, + "grad_norm": 0.5029441424031276, + "learning_rate": 5.566910259474289e-06, + "loss": 0.654, + "step": 1681 + }, + { + "epoch": 0.8970666666666667, + "grad_norm": 0.39588977096060385, + "learning_rate": 5.510198963413881e-06, + "loss": 0.5754, + "step": 1682 + }, + { + "epoch": 0.8976, + "grad_norm": 0.43398608387611426, + "learning_rate": 5.453769828241872e-06, + "loss": 0.6965, + "step": 1683 + }, + { + "epoch": 0.8981333333333333, + "grad_norm": 0.3956062769487065, + "learning_rate": 5.397623022464226e-06, + "loss": 0.6066, + "step": 1684 + }, + { + "epoch": 0.8986666666666666, + "grad_norm": 0.5799303774810434, + "learning_rate": 5.341758713743828e-06, + "loss": 0.7393, + "step": 1685 + }, + { + "epoch": 0.8992, + "grad_norm": 0.40473359878926773, + "learning_rate": 5.286177068899989e-06, + "loss": 0.6818, + "step": 1686 + }, + { + "epoch": 0.8997333333333334, + "grad_norm": 0.4557557799799093, + "learning_rate": 5.230878253907912e-06, + "loss": 0.7586, + "step": 1687 + }, + { + "epoch": 0.9002666666666667, + "grad_norm": 0.48245542943285297, + "learning_rate": 5.175862433898282e-06, + "loss": 0.6346, + "step": 1688 + }, + { + "epoch": 0.9008, + "grad_norm": 0.5110922381694472, + "learning_rate": 5.121129773156663e-06, + "loss": 0.6719, + "step": 1689 + }, + { + "epoch": 0.9013333333333333, + "grad_norm": 0.3613012154317809, + "learning_rate": 5.066680435123106e-06, + "loss": 0.6581, + "step": 1690 + }, + { + "epoch": 0.9018666666666667, + "grad_norm": 0.41451177222561064, + "learning_rate": 5.012514582391592e-06, + "loss": 0.7078, + "step": 1691 + }, + { + "epoch": 0.9024, + "grad_norm": 0.43499608088847636, + "learning_rate": 4.95863237670956e-06, + "loss": 0.6433, + "step": 1692 + }, + { + "epoch": 0.9029333333333334, + "grad_norm": 0.4059383660825497, + "learning_rate": 4.905033978977491e-06, + "loss": 0.6467, + "step": 1693 + }, + { + "epoch": 0.9034666666666666, + "grad_norm": 0.5582984266495177, + "learning_rate": 4.851719549248301e-06, + "loss": 0.7557, + "step": 1694 + }, + { + "epoch": 0.904, + "grad_norm": 0.4954697061395202, + "learning_rate": 4.798689246727006e-06, + "loss": 0.7238, + "step": 1695 + }, + { + "epoch": 0.9045333333333333, + "grad_norm": 0.43864156821664785, + "learning_rate": 4.745943229770122e-06, + "loss": 0.643, + "step": 1696 + }, + { + "epoch": 0.9050666666666667, + "grad_norm": 0.49264354917470277, + "learning_rate": 4.693481655885257e-06, + "loss": 0.6634, + "step": 1697 + }, + { + "epoch": 0.9056, + "grad_norm": 0.6162265954390206, + "learning_rate": 4.641304681730641e-06, + "loss": 0.7582, + "step": 1698 + }, + { + "epoch": 0.9061333333333333, + "grad_norm": 0.4710076943669117, + "learning_rate": 4.58941246311464e-06, + "loss": 0.6638, + "step": 1699 + }, + { + "epoch": 0.9066666666666666, + "grad_norm": 0.40355569907650257, + "learning_rate": 4.537805154995278e-06, + "loss": 0.6128, + "step": 1700 + }, + { + "epoch": 0.9072, + "grad_norm": 0.3829409072081762, + "learning_rate": 4.486482911479839e-06, + "loss": 0.6676, + "step": 1701 + }, + { + "epoch": 0.9077333333333333, + "grad_norm": 0.46670947512097, + "learning_rate": 4.435445885824285e-06, + "loss": 0.6814, + "step": 1702 + }, + { + "epoch": 0.9082666666666667, + "grad_norm": 0.4542642570304578, + "learning_rate": 4.384694230432984e-06, + "loss": 0.6082, + "step": 1703 + }, + { + "epoch": 0.9088, + "grad_norm": 0.38769564929516004, + "learning_rate": 4.3342280968580285e-06, + "loss": 0.6271, + "step": 1704 + }, + { + "epoch": 0.9093333333333333, + "grad_norm": 0.5254532367814335, + "learning_rate": 4.2840476357989825e-06, + "loss": 0.7208, + "step": 1705 + }, + { + "epoch": 0.9098666666666667, + "grad_norm": 0.3977113454905695, + "learning_rate": 4.2341529971023255e-06, + "loss": 0.6636, + "step": 1706 + }, + { + "epoch": 0.9104, + "grad_norm": 0.4424789105180282, + "learning_rate": 4.184544329761009e-06, + "loss": 0.7182, + "step": 1707 + }, + { + "epoch": 0.9109333333333334, + "grad_norm": 0.440539538012321, + "learning_rate": 4.135221781914034e-06, + "loss": 0.6679, + "step": 1708 + }, + { + "epoch": 0.9114666666666666, + "grad_norm": 0.4212358326675856, + "learning_rate": 4.0861855008460405e-06, + "loss": 0.659, + "step": 1709 + }, + { + "epoch": 0.912, + "grad_norm": 0.43795798542433434, + "learning_rate": 4.037435632986786e-06, + "loss": 0.5912, + "step": 1710 + }, + { + "epoch": 0.9125333333333333, + "grad_norm": 0.42344934101655535, + "learning_rate": 3.988972323910778e-06, + "loss": 0.6783, + "step": 1711 + }, + { + "epoch": 0.9130666666666667, + "grad_norm": 0.3830263313919214, + "learning_rate": 3.9407957183368095e-06, + "loss": 0.5908, + "step": 1712 + }, + { + "epoch": 0.9136, + "grad_norm": 0.39445880826051793, + "learning_rate": 3.892905960127546e-06, + "loss": 0.5571, + "step": 1713 + }, + { + "epoch": 0.9141333333333334, + "grad_norm": 0.5477526778829184, + "learning_rate": 3.845303192289074e-06, + "loss": 0.6966, + "step": 1714 + }, + { + "epoch": 0.9146666666666666, + "grad_norm": 0.385803456198678, + "learning_rate": 3.797987556970495e-06, + "loss": 0.5765, + "step": 1715 + }, + { + "epoch": 0.9152, + "grad_norm": 0.430827174675466, + "learning_rate": 3.750959195463466e-06, + "loss": 0.658, + "step": 1716 + }, + { + "epoch": 0.9157333333333333, + "grad_norm": 0.47765928045648276, + "learning_rate": 3.7042182482018075e-06, + "loss": 0.6866, + "step": 1717 + }, + { + "epoch": 0.9162666666666667, + "grad_norm": 0.42680757206130226, + "learning_rate": 3.6577648547611033e-06, + "loss": 0.6118, + "step": 1718 + }, + { + "epoch": 0.9168, + "grad_norm": 0.5240190067743186, + "learning_rate": 3.611599153858214e-06, + "loss": 0.7096, + "step": 1719 + }, + { + "epoch": 0.9173333333333333, + "grad_norm": 0.4836966132884837, + "learning_rate": 3.565721283350931e-06, + "loss": 0.6967, + "step": 1720 + }, + { + "epoch": 0.9178666666666667, + "grad_norm": 0.5477118813907119, + "learning_rate": 3.5201313802375456e-06, + "loss": 0.7548, + "step": 1721 + }, + { + "epoch": 0.9184, + "grad_norm": 0.4053454839277045, + "learning_rate": 3.4748295806564356e-06, + "loss": 0.6784, + "step": 1722 + }, + { + "epoch": 0.9189333333333334, + "grad_norm": 0.44014920474258146, + "learning_rate": 3.4298160198856568e-06, + "loss": 0.6489, + "step": 1723 + }, + { + "epoch": 0.9194666666666667, + "grad_norm": 0.39387381096905677, + "learning_rate": 3.3850908323424967e-06, + "loss": 0.6563, + "step": 1724 + }, + { + "epoch": 0.92, + "grad_norm": 0.43899280415011493, + "learning_rate": 3.3406541515832003e-06, + "loss": 0.7034, + "step": 1725 + }, + { + "epoch": 0.9205333333333333, + "grad_norm": 0.5309862604436678, + "learning_rate": 3.296506110302422e-06, + "loss": 0.7453, + "step": 1726 + }, + { + "epoch": 0.9210666666666667, + "grad_norm": 0.4032446104401672, + "learning_rate": 3.252646840332918e-06, + "loss": 0.6091, + "step": 1727 + }, + { + "epoch": 0.9216, + "grad_norm": 0.402482542954159, + "learning_rate": 3.209076472645112e-06, + "loss": 0.6215, + "step": 1728 + }, + { + "epoch": 0.9221333333333334, + "grad_norm": 0.3656568731027382, + "learning_rate": 3.1657951373467497e-06, + "loss": 0.6173, + "step": 1729 + }, + { + "epoch": 0.9226666666666666, + "grad_norm": 0.3649188766171377, + "learning_rate": 3.1228029636824475e-06, + "loss": 0.6313, + "step": 1730 + }, + { + "epoch": 0.9232, + "grad_norm": 0.404074350484019, + "learning_rate": 3.0801000800333877e-06, + "loss": 0.6398, + "step": 1731 + }, + { + "epoch": 0.9237333333333333, + "grad_norm": 0.7792532589315108, + "learning_rate": 3.037686613916857e-06, + "loss": 0.6445, + "step": 1732 + }, + { + "epoch": 0.9242666666666667, + "grad_norm": 0.4072870206549389, + "learning_rate": 2.995562691985898e-06, + "loss": 0.6979, + "step": 1733 + }, + { + "epoch": 0.9248, + "grad_norm": 0.5802046062782825, + "learning_rate": 2.9537284400289355e-06, + "loss": 0.7653, + "step": 1734 + }, + { + "epoch": 0.9253333333333333, + "grad_norm": 0.6134815136581775, + "learning_rate": 2.912183982969385e-06, + "loss": 0.6964, + "step": 1735 + }, + { + "epoch": 0.9258666666666666, + "grad_norm": 0.4566500800470463, + "learning_rate": 2.8709294448653225e-06, + "loss": 0.6755, + "step": 1736 + }, + { + "epoch": 0.9264, + "grad_norm": 0.3889385811942882, + "learning_rate": 2.8299649489090475e-06, + "loss": 0.6367, + "step": 1737 + }, + { + "epoch": 0.9269333333333334, + "grad_norm": 0.42132189571813033, + "learning_rate": 2.789290617426765e-06, + "loss": 0.6709, + "step": 1738 + }, + { + "epoch": 0.9274666666666667, + "grad_norm": 0.5502658206991807, + "learning_rate": 2.748906571878207e-06, + "loss": 0.6844, + "step": 1739 + }, + { + "epoch": 0.928, + "grad_norm": 0.4109979916703212, + "learning_rate": 2.708812932856253e-06, + "loss": 0.6656, + "step": 1740 + }, + { + "epoch": 0.9285333333333333, + "grad_norm": 0.5349626472192117, + "learning_rate": 2.6690098200866098e-06, + "loss": 0.6792, + "step": 1741 + }, + { + "epoch": 0.9290666666666667, + "grad_norm": 0.3929259820512547, + "learning_rate": 2.6294973524274125e-06, + "loss": 0.5987, + "step": 1742 + }, + { + "epoch": 0.9296, + "grad_norm": 0.4238283605425105, + "learning_rate": 2.590275647868867e-06, + "loss": 0.6342, + "step": 1743 + }, + { + "epoch": 0.9301333333333334, + "grad_norm": 0.3474079856752595, + "learning_rate": 2.551344823532964e-06, + "loss": 0.5476, + "step": 1744 + }, + { + "epoch": 0.9306666666666666, + "grad_norm": 0.4366692181154197, + "learning_rate": 2.5127049956730207e-06, + "loss": 0.6367, + "step": 1745 + }, + { + "epoch": 0.9312, + "grad_norm": 0.44436214778993216, + "learning_rate": 2.4743562796734622e-06, + "loss": 0.6152, + "step": 1746 + }, + { + "epoch": 0.9317333333333333, + "grad_norm": 0.3348354336295145, + "learning_rate": 2.436298790049363e-06, + "loss": 0.5874, + "step": 1747 + }, + { + "epoch": 0.9322666666666667, + "grad_norm": 0.38847198321126125, + "learning_rate": 2.3985326404461604e-06, + "loss": 0.6504, + "step": 1748 + }, + { + "epoch": 0.9328, + "grad_norm": 0.40424377782673143, + "learning_rate": 2.3610579436393e-06, + "loss": 0.6582, + "step": 1749 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 0.41002965922987233, + "learning_rate": 2.3238748115339324e-06, + "loss": 0.6866, + "step": 1750 + }, + { + "epoch": 0.9338666666666666, + "grad_norm": 0.4067090297962575, + "learning_rate": 2.286983355164529e-06, + "loss": 0.6706, + "step": 1751 + }, + { + "epoch": 0.9344, + "grad_norm": 0.417347825399175, + "learning_rate": 2.250383684694579e-06, + "loss": 0.6539, + "step": 1752 + }, + { + "epoch": 0.9349333333333333, + "grad_norm": 0.4726019554872595, + "learning_rate": 2.2140759094162467e-06, + "loss": 0.6016, + "step": 1753 + }, + { + "epoch": 0.9354666666666667, + "grad_norm": 0.41764107695884006, + "learning_rate": 2.178060137750071e-06, + "loss": 0.6043, + "step": 1754 + }, + { + "epoch": 0.936, + "grad_norm": 0.36693776341659945, + "learning_rate": 2.1423364772445887e-06, + "loss": 0.6144, + "step": 1755 + }, + { + "epoch": 0.9365333333333333, + "grad_norm": 0.39934235720381156, + "learning_rate": 2.106905034576112e-06, + "loss": 0.6735, + "step": 1756 + }, + { + "epoch": 0.9370666666666667, + "grad_norm": 0.4965655480591488, + "learning_rate": 2.0717659155482738e-06, + "loss": 0.7142, + "step": 1757 + }, + { + "epoch": 0.9376, + "grad_norm": 0.42875820347453275, + "learning_rate": 2.036919225091827e-06, + "loss": 0.6845, + "step": 1758 + }, + { + "epoch": 0.9381333333333334, + "grad_norm": 0.4055547482098708, + "learning_rate": 2.002365067264289e-06, + "loss": 0.5752, + "step": 1759 + }, + { + "epoch": 0.9386666666666666, + "grad_norm": 0.49959247579552135, + "learning_rate": 1.968103545249611e-06, + "loss": 0.6544, + "step": 1760 + }, + { + "epoch": 0.9392, + "grad_norm": 0.43426902373524473, + "learning_rate": 1.9341347613579087e-06, + "loss": 0.6196, + "step": 1761 + }, + { + "epoch": 0.9397333333333333, + "grad_norm": 0.4382330506712958, + "learning_rate": 1.900458817025097e-06, + "loss": 0.5806, + "step": 1762 + }, + { + "epoch": 0.9402666666666667, + "grad_norm": 0.45469155884809576, + "learning_rate": 1.8670758128126909e-06, + "loss": 0.6073, + "step": 1763 + }, + { + "epoch": 0.9408, + "grad_norm": 0.42681364610491224, + "learning_rate": 1.8339858484073935e-06, + "loss": 0.6396, + "step": 1764 + }, + { + "epoch": 0.9413333333333334, + "grad_norm": 0.4478423617494056, + "learning_rate": 1.8011890226208527e-06, + "loss": 0.6639, + "step": 1765 + }, + { + "epoch": 0.9418666666666666, + "grad_norm": 0.40181955384189383, + "learning_rate": 1.7686854333893833e-06, + "loss": 0.6568, + "step": 1766 + }, + { + "epoch": 0.9424, + "grad_norm": 0.4081222757236124, + "learning_rate": 1.7364751777736332e-06, + "loss": 0.6855, + "step": 1767 + }, + { + "epoch": 0.9429333333333333, + "grad_norm": 0.4236295476459045, + "learning_rate": 1.7045583519583074e-06, + "loss": 0.6624, + "step": 1768 + }, + { + "epoch": 0.9434666666666667, + "grad_norm": 0.4412142904850606, + "learning_rate": 1.6729350512519005e-06, + "loss": 0.7157, + "step": 1769 + }, + { + "epoch": 0.944, + "grad_norm": 0.4643292919097104, + "learning_rate": 1.6416053700863964e-06, + "loss": 0.6453, + "step": 1770 + }, + { + "epoch": 0.9445333333333333, + "grad_norm": 0.47109321823024464, + "learning_rate": 1.6105694020169593e-06, + "loss": 0.643, + "step": 1771 + }, + { + "epoch": 0.9450666666666667, + "grad_norm": 0.4169201898978646, + "learning_rate": 1.5798272397217095e-06, + "loss": 0.6233, + "step": 1772 + }, + { + "epoch": 0.9456, + "grad_norm": 0.4420269742790322, + "learning_rate": 1.5493789750014031e-06, + "loss": 0.621, + "step": 1773 + }, + { + "epoch": 0.9461333333333334, + "grad_norm": 0.5310896307326314, + "learning_rate": 1.5192246987791981e-06, + "loss": 0.6592, + "step": 1774 + }, + { + "epoch": 0.9466666666666667, + "grad_norm": 0.57693411108257, + "learning_rate": 1.489364501100332e-06, + "loss": 0.7531, + "step": 1775 + }, + { + "epoch": 0.9472, + "grad_norm": 0.40027668439784575, + "learning_rate": 1.459798471131868e-06, + "loss": 0.63, + "step": 1776 + }, + { + "epoch": 0.9477333333333333, + "grad_norm": 0.45554959525323824, + "learning_rate": 1.430526697162482e-06, + "loss": 0.6944, + "step": 1777 + }, + { + "epoch": 0.9482666666666667, + "grad_norm": 0.5800681695499549, + "learning_rate": 1.4015492666021312e-06, + "loss": 0.6839, + "step": 1778 + }, + { + "epoch": 0.9488, + "grad_norm": 0.45277302383436674, + "learning_rate": 1.3728662659818204e-06, + "loss": 0.6468, + "step": 1779 + }, + { + "epoch": 0.9493333333333334, + "grad_norm": 0.45271890416023614, + "learning_rate": 1.344477780953346e-06, + "loss": 0.671, + "step": 1780 + }, + { + "epoch": 0.9498666666666666, + "grad_norm": 0.42363472987303374, + "learning_rate": 1.3163838962890195e-06, + "loss": 0.6872, + "step": 1781 + }, + { + "epoch": 0.9504, + "grad_norm": 0.44406485937332685, + "learning_rate": 1.2885846958814673e-06, + "loss": 0.6385, + "step": 1782 + }, + { + "epoch": 0.9509333333333333, + "grad_norm": 0.5587188392936556, + "learning_rate": 1.261080262743297e-06, + "loss": 0.6416, + "step": 1783 + }, + { + "epoch": 0.9514666666666667, + "grad_norm": 0.34896010757851925, + "learning_rate": 1.2338706790069431e-06, + "loss": 0.5476, + "step": 1784 + }, + { + "epoch": 0.952, + "grad_norm": 0.37165423361942146, + "learning_rate": 1.2069560259243328e-06, + "loss": 0.5964, + "step": 1785 + }, + { + "epoch": 0.9525333333333333, + "grad_norm": 0.5551780434620531, + "learning_rate": 1.1803363838667092e-06, + "loss": 0.8058, + "step": 1786 + }, + { + "epoch": 0.9530666666666666, + "grad_norm": 0.4459932791245074, + "learning_rate": 1.1540118323243865e-06, + "loss": 0.6536, + "step": 1787 + }, + { + "epoch": 0.9536, + "grad_norm": 0.4417695511572837, + "learning_rate": 1.1279824499064396e-06, + "loss": 0.6405, + "step": 1788 + }, + { + "epoch": 0.9541333333333334, + "grad_norm": 0.45979733840097287, + "learning_rate": 1.1022483143405705e-06, + "loss": 0.6757, + "step": 1789 + }, + { + "epoch": 0.9546666666666667, + "grad_norm": 0.418860251545236, + "learning_rate": 1.076809502472831e-06, + "loss": 0.7194, + "step": 1790 + }, + { + "epoch": 0.9552, + "grad_norm": 0.43516647592718966, + "learning_rate": 1.0516660902673448e-06, + "loss": 0.6277, + "step": 1791 + }, + { + "epoch": 0.9557333333333333, + "grad_norm": 0.4298094963049132, + "learning_rate": 1.0268181528061749e-06, + "loss": 0.6526, + "step": 1792 + }, + { + "epoch": 0.9562666666666667, + "grad_norm": 0.4003875180697773, + "learning_rate": 1.0022657642890231e-06, + "loss": 0.6357, + "step": 1793 + }, + { + "epoch": 0.9568, + "grad_norm": 0.40805824598866824, + "learning_rate": 9.780089980330642e-07, + "loss": 0.6297, + "step": 1794 + }, + { + "epoch": 0.9573333333333334, + "grad_norm": 0.37244563492274635, + "learning_rate": 9.540479264726676e-07, + "loss": 0.5876, + "step": 1795 + }, + { + "epoch": 0.9578666666666666, + "grad_norm": 0.4177365233111911, + "learning_rate": 9.303826211592315e-07, + "loss": 0.6027, + "step": 1796 + }, + { + "epoch": 0.9584, + "grad_norm": 0.4353412020431162, + "learning_rate": 9.070131527609604e-07, + "loss": 0.69, + "step": 1797 + }, + { + "epoch": 0.9589333333333333, + "grad_norm": 0.4120364477361252, + "learning_rate": 8.839395910626213e-07, + "loss": 0.6957, + "step": 1798 + }, + { + "epoch": 0.9594666666666667, + "grad_norm": 0.4239066267875004, + "learning_rate": 8.611620049653879e-07, + "loss": 0.6385, + "step": 1799 + }, + { + "epoch": 0.96, + "grad_norm": 0.5342632200761349, + "learning_rate": 8.386804624865851e-07, + "loss": 0.7627, + "step": 1800 + }, + { + "epoch": 0.9605333333333334, + "grad_norm": 0.36817388197940243, + "learning_rate": 8.16495030759501e-07, + "loss": 0.5786, + "step": 1801 + }, + { + "epoch": 0.9610666666666666, + "grad_norm": 0.48533062698194884, + "learning_rate": 7.946057760332193e-07, + "loss": 0.6796, + "step": 1802 + }, + { + "epoch": 0.9616, + "grad_norm": 0.37074100093183415, + "learning_rate": 7.730127636723539e-07, + "loss": 0.6123, + "step": 1803 + }, + { + "epoch": 0.9621333333333333, + "grad_norm": 0.4141493977118879, + "learning_rate": 7.517160581569372e-07, + "loss": 0.7048, + "step": 1804 + }, + { + "epoch": 0.9626666666666667, + "grad_norm": 0.415217243574443, + "learning_rate": 7.307157230821426e-07, + "loss": 0.6546, + "step": 1805 + }, + { + "epoch": 0.9632, + "grad_norm": 0.3910056185139864, + "learning_rate": 7.100118211581852e-07, + "loss": 0.6141, + "step": 1806 + }, + { + "epoch": 0.9637333333333333, + "grad_norm": 0.5974574992263897, + "learning_rate": 6.896044142100433e-07, + "loss": 0.6627, + "step": 1807 + }, + { + "epoch": 0.9642666666666667, + "grad_norm": 0.5139361713417283, + "learning_rate": 6.694935631773258e-07, + "loss": 0.6813, + "step": 1808 + }, + { + "epoch": 0.9648, + "grad_norm": 1.1734032114524424, + "learning_rate": 6.496793281141056e-07, + "loss": 0.6291, + "step": 1809 + }, + { + "epoch": 0.9653333333333334, + "grad_norm": 0.47747074418518726, + "learning_rate": 6.301617681886863e-07, + "loss": 0.7475, + "step": 1810 + }, + { + "epoch": 0.9658666666666667, + "grad_norm": 0.4311561298803717, + "learning_rate": 6.109409416834688e-07, + "loss": 0.6713, + "step": 1811 + }, + { + "epoch": 0.9664, + "grad_norm": 0.504562022783302, + "learning_rate": 5.920169059947411e-07, + "loss": 0.6561, + "step": 1812 + }, + { + "epoch": 0.9669333333333333, + "grad_norm": 0.4773157125725159, + "learning_rate": 5.733897176325665e-07, + "loss": 0.6655, + "step": 1813 + }, + { + "epoch": 0.9674666666666667, + "grad_norm": 0.38604019845998105, + "learning_rate": 5.550594322205504e-07, + "loss": 0.6234, + "step": 1814 + }, + { + "epoch": 0.968, + "grad_norm": 0.3945973662477523, + "learning_rate": 5.370261044956971e-07, + "loss": 0.6264, + "step": 1815 + }, + { + "epoch": 0.9685333333333334, + "grad_norm": 0.43171237307055293, + "learning_rate": 5.192897883082747e-07, + "loss": 0.6261, + "step": 1816 + }, + { + "epoch": 0.9690666666666666, + "grad_norm": 0.3528323500298493, + "learning_rate": 5.018505366216175e-07, + "loss": 0.5892, + "step": 1817 + }, + { + "epoch": 0.9696, + "grad_norm": 0.4019638365618088, + "learning_rate": 4.847084015119574e-07, + "loss": 0.6306, + "step": 1818 + }, + { + "epoch": 0.9701333333333333, + "grad_norm": 0.4045721317863742, + "learning_rate": 4.678634341683252e-07, + "loss": 0.6476, + "step": 1819 + }, + { + "epoch": 0.9706666666666667, + "grad_norm": 0.3957708441520497, + "learning_rate": 4.5131568489236166e-07, + "loss": 0.6462, + "step": 1820 + }, + { + "epoch": 0.9712, + "grad_norm": 0.47367736399522625, + "learning_rate": 4.3506520309813947e-07, + "loss": 0.65, + "step": 1821 + }, + { + "epoch": 0.9717333333333333, + "grad_norm": 0.5023741182081118, + "learning_rate": 4.191120373120749e-07, + "loss": 0.6989, + "step": 1822 + }, + { + "epoch": 0.9722666666666666, + "grad_norm": 0.4551564703113851, + "learning_rate": 4.034562351727389e-07, + "loss": 0.5782, + "step": 1823 + }, + { + "epoch": 0.9728, + "grad_norm": 0.42327891958627145, + "learning_rate": 3.8809784343072366e-07, + "loss": 0.7395, + "step": 1824 + }, + { + "epoch": 0.9733333333333334, + "grad_norm": 0.48474062249707706, + "learning_rate": 3.73036907948543e-07, + "loss": 0.6498, + "step": 1825 + }, + { + "epoch": 0.9738666666666667, + "grad_norm": 0.5051658504621682, + "learning_rate": 3.582734737004101e-07, + "loss": 0.6711, + "step": 1826 + }, + { + "epoch": 0.9744, + "grad_norm": 0.3645228532726314, + "learning_rate": 3.4380758477219333e-07, + "loss": 0.6617, + "step": 1827 + }, + { + "epoch": 0.9749333333333333, + "grad_norm": 0.3975382127868151, + "learning_rate": 3.296392843612273e-07, + "loss": 0.6635, + "step": 1828 + }, + { + "epoch": 0.9754666666666667, + "grad_norm": 0.4276411180800346, + "learning_rate": 3.1576861477621287e-07, + "loss": 0.6609, + "step": 1829 + }, + { + "epoch": 0.976, + "grad_norm": 0.43384637336354825, + "learning_rate": 3.0219561743707326e-07, + "loss": 0.637, + "step": 1830 + }, + { + "epoch": 0.9765333333333334, + "grad_norm": 0.4387910782380915, + "learning_rate": 2.889203328748424e-07, + "loss": 0.6301, + "step": 1831 + }, + { + "epoch": 0.9770666666666666, + "grad_norm": 0.4999299860408389, + "learning_rate": 2.759428007315212e-07, + "loss": 0.6644, + "step": 1832 + }, + { + "epoch": 0.9776, + "grad_norm": 0.3880675231249164, + "learning_rate": 2.6326305976001055e-07, + "loss": 0.5845, + "step": 1833 + }, + { + "epoch": 0.9781333333333333, + "grad_norm": 0.44004972785404506, + "learning_rate": 2.5088114782392257e-07, + "loss": 0.6673, + "step": 1834 + }, + { + "epoch": 0.9786666666666667, + "grad_norm": 0.4460458904659256, + "learning_rate": 2.3879710189753656e-07, + "loss": 0.6807, + "step": 1835 + }, + { + "epoch": 0.9792, + "grad_norm": 0.39196638469319506, + "learning_rate": 2.2701095806565432e-07, + "loss": 0.5982, + "step": 1836 + }, + { + "epoch": 0.9797333333333333, + "grad_norm": 0.4268942353014067, + "learning_rate": 2.15522751523467e-07, + "loss": 0.5791, + "step": 1837 + }, + { + "epoch": 0.9802666666666666, + "grad_norm": 0.4165890060081552, + "learning_rate": 2.0433251657653308e-07, + "loss": 0.6684, + "step": 1838 + }, + { + "epoch": 0.9808, + "grad_norm": 0.4198533615173099, + "learning_rate": 1.9344028664056713e-07, + "loss": 0.6663, + "step": 1839 + }, + { + "epoch": 0.9813333333333333, + "grad_norm": 0.428303158648751, + "learning_rate": 1.8284609424142895e-07, + "loss": 0.6244, + "step": 1840 + }, + { + "epoch": 0.9818666666666667, + "grad_norm": 0.44118042394269336, + "learning_rate": 1.7254997101500137e-07, + "loss": 0.6533, + "step": 1841 + }, + { + "epoch": 0.9824, + "grad_norm": 0.462106473691002, + "learning_rate": 1.6255194770704586e-07, + "loss": 0.6862, + "step": 1842 + }, + { + "epoch": 0.9829333333333333, + "grad_norm": 0.45822369136078545, + "learning_rate": 1.5285205417319149e-07, + "loss": 0.6385, + "step": 1843 + }, + { + "epoch": 0.9834666666666667, + "grad_norm": 0.4416151139936162, + "learning_rate": 1.4345031937879062e-07, + "loss": 0.5906, + "step": 1844 + }, + { + "epoch": 0.984, + "grad_norm": 0.45141304264926185, + "learning_rate": 1.3434677139885222e-07, + "loss": 0.6291, + "step": 1845 + }, + { + "epoch": 0.9845333333333334, + "grad_norm": 0.4248290538485947, + "learning_rate": 1.255414374179531e-07, + "loss": 0.6195, + "step": 1846 + }, + { + "epoch": 0.9850666666666666, + "grad_norm": 0.4654856422485027, + "learning_rate": 1.170343437301491e-07, + "loss": 0.7059, + "step": 1847 + }, + { + "epoch": 0.9856, + "grad_norm": 0.3657999180986568, + "learning_rate": 1.0882551573891953e-07, + "loss": 0.5697, + "step": 1848 + }, + { + "epoch": 0.9861333333333333, + "grad_norm": 0.45784132439167596, + "learning_rate": 1.0091497795706728e-07, + "loss": 0.6836, + "step": 1849 + }, + { + "epoch": 0.9866666666666667, + "grad_norm": 0.5252729583036887, + "learning_rate": 9.330275400666332e-08, + "loss": 0.6651, + "step": 1850 + }, + { + "epoch": 0.9872, + "grad_norm": 0.4437070604664506, + "learning_rate": 8.598886661895788e-08, + "loss": 0.6383, + "step": 1851 + }, + { + "epoch": 0.9877333333333334, + "grad_norm": 0.48425128455025135, + "learning_rate": 7.8973337634336e-08, + "loss": 0.6486, + "step": 1852 + }, + { + "epoch": 0.9882666666666666, + "grad_norm": 0.44305311958641147, + "learning_rate": 7.225618800222877e-08, + "loss": 0.702, + "step": 1853 + }, + { + "epoch": 0.9888, + "grad_norm": 0.4719295320937795, + "learning_rate": 6.583743778106887e-08, + "loss": 0.5969, + "step": 1854 + }, + { + "epoch": 0.9893333333333333, + "grad_norm": 0.45356385639317337, + "learning_rate": 5.971710613821291e-08, + "loss": 0.667, + "step": 1855 + }, + { + "epoch": 0.9898666666666667, + "grad_norm": 0.4223167880531397, + "learning_rate": 5.389521134989695e-08, + "loss": 0.6455, + "step": 1856 + }, + { + "epoch": 0.9904, + "grad_norm": 0.443812423913915, + "learning_rate": 4.837177080119215e-08, + "loss": 0.6557, + "step": 1857 + }, + { + "epoch": 0.9909333333333333, + "grad_norm": 0.4098485018285767, + "learning_rate": 4.314680098592705e-08, + "loss": 0.6313, + "step": 1858 + }, + { + "epoch": 0.9914666666666667, + "grad_norm": 0.4181108336526171, + "learning_rate": 3.8220317506654226e-08, + "loss": 0.6448, + "step": 1859 + }, + { + "epoch": 0.992, + "grad_norm": 0.4616096472968845, + "learning_rate": 3.359233507459481e-08, + "loss": 0.6449, + "step": 1860 + }, + { + "epoch": 0.9925333333333334, + "grad_norm": 0.43449513413519597, + "learning_rate": 2.9262867509605163e-08, + "loss": 0.6886, + "step": 1861 + }, + { + "epoch": 0.9930666666666667, + "grad_norm": 0.4735156314757002, + "learning_rate": 2.5231927740154704e-08, + "loss": 0.663, + "step": 1862 + }, + { + "epoch": 0.9936, + "grad_norm": 0.3816892624469625, + "learning_rate": 2.1499527803214846e-08, + "loss": 0.6468, + "step": 1863 + }, + { + "epoch": 0.9941333333333333, + "grad_norm": 0.4392699187162705, + "learning_rate": 1.8065678844314538e-08, + "loss": 0.698, + "step": 1864 + }, + { + "epoch": 0.9946666666666667, + "grad_norm": 0.4374484167172048, + "learning_rate": 1.4930391117451426e-08, + "loss": 0.6153, + "step": 1865 + }, + { + "epoch": 0.9952, + "grad_norm": 0.4643901351796245, + "learning_rate": 1.209367398504746e-08, + "loss": 0.5761, + "step": 1866 + }, + { + "epoch": 0.9957333333333334, + "grad_norm": 0.44992439538756956, + "learning_rate": 9.555535917993297e-09, + "loss": 0.7871, + "step": 1867 + }, + { + "epoch": 0.9962666666666666, + "grad_norm": 0.4749681289221362, + "learning_rate": 7.315984495548378e-09, + "loss": 0.6527, + "step": 1868 + }, + { + "epoch": 0.9968, + "grad_norm": 0.37467779687969793, + "learning_rate": 5.375026405352035e-09, + "loss": 0.6104, + "step": 1869 + }, + { + "epoch": 0.9973333333333333, + "grad_norm": 0.4132835221926209, + "learning_rate": 3.732667443390181e-09, + "loss": 0.6794, + "step": 1870 + }, + { + "epoch": 0.9978666666666667, + "grad_norm": 0.5321217168579051, + "learning_rate": 2.388912514017516e-09, + "loss": 0.6663, + "step": 1871 + }, + { + "epoch": 0.9984, + "grad_norm": 0.41446334553886355, + "learning_rate": 1.3437656298687097e-09, + "loss": 0.5672, + "step": 1872 + }, + { + "epoch": 0.9989333333333333, + "grad_norm": 0.46896441580567816, + "learning_rate": 5.972299119250125e-10, + "loss": 0.6666, + "step": 1873 + }, + { + "epoch": 0.9994666666666666, + "grad_norm": 0.5147065885708658, + "learning_rate": 1.4930758944764479e-10, + "loss": 0.6069, + "step": 1874 + }, + { + "epoch": 1.0, + "grad_norm": 0.44285490265539945, + "learning_rate": 0.0, + "loss": 0.6387, + "step": 1875 + }, + { + "epoch": 1.0, + "step": 1875, + "total_flos": 1615624508571648.0, + "train_loss": 0.7278803406397502, + "train_runtime": 28964.1933, + "train_samples_per_second": 1.036, + "train_steps_per_second": 0.065 + } + ], + "logging_steps": 1.0, + "max_steps": 1875, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1615624508571648.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_1_epochs_1_GA_2_lora/README.md b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_1_epochs_1_GA_2_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_1_epochs_1_GA_2_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_1_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_1_epochs_1_GA_2_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6e899733f07bd8e65f6027369fa419218820e94d --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_1_epochs_1_GA_2_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "q_proj", + "down_proj", + "up_proj", + "v_proj", + "k_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3567871915d25449d57d4fb0d14d5ebacb292356 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63e78c71e82a882872cc4892cd944059d50f1d4b2184eb25c2484af31f6223d2 +size 671150064 diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_1_epochs_1_GA_2_lora/config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_1_epochs_1_GA_2_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_1_epochs_1_GA_2_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..dbfcbdb872aa490da15788530450efd5b4eecc48 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c06f078c202ca187faae7464d772cd8ccc710939433a26fed21074276a4b4e38 +size 918507402 diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_1_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_1_epochs_1_GA_2_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..3df8d424a3af07f79fc45f0abb5b44fafe033db6 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_1_epochs_1_GA_2_lora/trainer_state.json @@ -0,0 +1,2226 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9984, + "eval_steps": 500, + "global_step": 312, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0032, + "grad_norm": 0.9363979537893283, + "learning_rate": 2e-05, + "loss": 1.326, + "step": 1 + }, + { + "epoch": 0.0064, + "grad_norm": 0.8117640234578937, + "learning_rate": 4e-05, + "loss": 1.2227, + "step": 2 + }, + { + "epoch": 0.0096, + "grad_norm": 0.7794277562491366, + "learning_rate": 6e-05, + "loss": 1.2748, + "step": 3 + }, + { + "epoch": 0.0128, + "grad_norm": 0.7435848597042757, + "learning_rate": 8e-05, + "loss": 1.2728, + "step": 4 + }, + { + "epoch": 0.016, + "grad_norm": 0.6152566686715281, + "learning_rate": 0.0001, + "loss": 0.9413, + "step": 5 + }, + { + "epoch": 0.0192, + "grad_norm": 0.9534557360712878, + "learning_rate": 0.00012, + "loss": 1.1385, + "step": 6 + }, + { + "epoch": 0.0224, + "grad_norm": 0.8954079865789398, + "learning_rate": 0.00014, + "loss": 0.9773, + "step": 7 + }, + { + "epoch": 0.0256, + "grad_norm": 0.9440691389909568, + "learning_rate": 0.00016, + "loss": 1.1365, + "step": 8 + }, + { + "epoch": 0.0288, + "grad_norm": 0.6351021865944403, + "learning_rate": 0.00018, + "loss": 0.9804, + "step": 9 + }, + { + "epoch": 0.032, + "grad_norm": 0.6133804482420512, + "learning_rate": 0.0002, + "loss": 0.8835, + "step": 10 + }, + { + "epoch": 0.0352, + "grad_norm": 0.5663860089193683, + "learning_rate": 0.00019999458931878073, + "loss": 0.8981, + "step": 11 + }, + { + "epoch": 0.0384, + "grad_norm": 0.49963909488217073, + "learning_rate": 0.0001999783578606323, + "loss": 0.8886, + "step": 12 + }, + { + "epoch": 0.0416, + "grad_norm": 0.5599956879255387, + "learning_rate": 0.00019995130738201966, + "loss": 0.9021, + "step": 13 + }, + { + "epoch": 0.0448, + "grad_norm": 0.6737809693905482, + "learning_rate": 0.0001999134408101731, + "loss": 0.9343, + "step": 14 + }, + { + "epoch": 0.048, + "grad_norm": 0.6902953980011141, + "learning_rate": 0.00019986476224277165, + "loss": 0.9473, + "step": 15 + }, + { + "epoch": 0.0512, + "grad_norm": 0.603551811924562, + "learning_rate": 0.00019980527694749952, + "loss": 0.9259, + "step": 16 + }, + { + "epoch": 0.0544, + "grad_norm": 0.5540107660433663, + "learning_rate": 0.00019973499136147606, + "loss": 0.846, + "step": 17 + }, + { + "epoch": 0.0576, + "grad_norm": 0.5330555970663188, + "learning_rate": 0.0001996539130905593, + "loss": 0.8927, + "step": 18 + }, + { + "epoch": 0.0608, + "grad_norm": 0.5804294290736568, + "learning_rate": 0.0001995620509085228, + "loss": 1.0149, + "step": 19 + }, + { + "epoch": 0.064, + "grad_norm": 0.4956640641721262, + "learning_rate": 0.00019945941475610623, + "loss": 0.8418, + "step": 20 + }, + { + "epoch": 0.0672, + "grad_norm": 0.4996576021612778, + "learning_rate": 0.0001993460157399396, + "loss": 0.915, + "step": 21 + }, + { + "epoch": 0.0704, + "grad_norm": 0.49443061973090485, + "learning_rate": 0.0001992218661313415, + "loss": 0.9021, + "step": 22 + }, + { + "epoch": 0.0736, + "grad_norm": 0.5409978992320507, + "learning_rate": 0.00019908697936499103, + "loss": 0.9499, + "step": 23 + }, + { + "epoch": 0.0768, + "grad_norm": 0.45638969485150216, + "learning_rate": 0.00019894137003747403, + "loss": 0.8657, + "step": 24 + }, + { + "epoch": 0.08, + "grad_norm": 0.5028902860236372, + "learning_rate": 0.00019878505390570362, + "loss": 0.8335, + "step": 25 + }, + { + "epoch": 0.0832, + "grad_norm": 0.49120503529481024, + "learning_rate": 0.00019861804788521493, + "loss": 0.7453, + "step": 26 + }, + { + "epoch": 0.0864, + "grad_norm": 0.5840564683830789, + "learning_rate": 0.00019844037004833473, + "loss": 0.9129, + "step": 27 + }, + { + "epoch": 0.0896, + "grad_norm": 0.503872495625192, + "learning_rate": 0.00019825203962222572, + "loss": 0.8842, + "step": 28 + }, + { + "epoch": 0.0928, + "grad_norm": 0.5430006396028205, + "learning_rate": 0.0001980530769868059, + "loss": 0.8861, + "step": 29 + }, + { + "epoch": 0.096, + "grad_norm": 0.5944535180864793, + "learning_rate": 0.00019784350367254322, + "loss": 0.9127, + "step": 30 + }, + { + "epoch": 0.0992, + "grad_norm": 0.477912643699261, + "learning_rate": 0.0001976233423581255, + "loss": 0.8509, + "step": 31 + }, + { + "epoch": 0.1024, + "grad_norm": 0.6145012676574753, + "learning_rate": 0.0001973926168680066, + "loss": 0.9374, + "step": 32 + }, + { + "epoch": 0.1056, + "grad_norm": 0.5612215454429993, + "learning_rate": 0.00019715135216982798, + "loss": 0.8671, + "step": 33 + }, + { + "epoch": 0.1088, + "grad_norm": 0.5296079193299826, + "learning_rate": 0.0001968995743717171, + "loss": 0.8027, + "step": 34 + }, + { + "epoch": 0.112, + "grad_norm": 0.5766512796370208, + "learning_rate": 0.00019663731071946206, + "loss": 0.9182, + "step": 35 + }, + { + "epoch": 0.1152, + "grad_norm": 0.6273138421530724, + "learning_rate": 0.00019636458959356316, + "loss": 0.9288, + "step": 36 + }, + { + "epoch": 0.1184, + "grad_norm": 0.5084629286636579, + "learning_rate": 0.0001960814405061619, + "loss": 0.7973, + "step": 37 + }, + { + "epoch": 0.1216, + "grad_norm": 0.4488607448801517, + "learning_rate": 0.00019578789409784727, + "loss": 0.8201, + "step": 38 + }, + { + "epoch": 0.1248, + "grad_norm": 0.5448883697942045, + "learning_rate": 0.00019548398213434007, + "loss": 0.9148, + "step": 39 + }, + { + "epoch": 0.128, + "grad_norm": 0.4557519242439591, + "learning_rate": 0.00019516973750305532, + "loss": 0.8354, + "step": 40 + }, + { + "epoch": 0.1312, + "grad_norm": 0.4880677089404733, + "learning_rate": 0.00019484519420954354, + "loss": 0.745, + "step": 41 + }, + { + "epoch": 0.1344, + "grad_norm": 0.5357339810746296, + "learning_rate": 0.00019451038737381077, + "loss": 0.866, + "step": 42 + }, + { + "epoch": 0.1376, + "grad_norm": 0.5016337897862774, + "learning_rate": 0.00019416535322651818, + "loss": 0.7784, + "step": 43 + }, + { + "epoch": 0.1408, + "grad_norm": 0.5689677962703366, + "learning_rate": 0.00019381012910506146, + "loss": 0.8541, + "step": 44 + }, + { + "epoch": 0.144, + "grad_norm": 0.5538540615030474, + "learning_rate": 0.00019344475344953012, + "loss": 0.899, + "step": 45 + }, + { + "epoch": 0.1472, + "grad_norm": 0.8508959404814713, + "learning_rate": 0.00019306926579854821, + "loss": 0.8346, + "step": 46 + }, + { + "epoch": 0.1504, + "grad_norm": 0.5666950364950541, + "learning_rate": 0.00019268370678499533, + "loss": 0.9008, + "step": 47 + }, + { + "epoch": 0.1536, + "grad_norm": 0.49530552873035577, + "learning_rate": 0.0001922881181316097, + "loss": 0.8171, + "step": 48 + }, + { + "epoch": 0.1568, + "grad_norm": 0.44079239666446396, + "learning_rate": 0.00019188254264647337, + "loss": 0.8425, + "step": 49 + }, + { + "epoch": 0.16, + "grad_norm": 0.5430257415754854, + "learning_rate": 0.0001914670242183795, + "loss": 0.8827, + "step": 50 + }, + { + "epoch": 0.1632, + "grad_norm": 0.5122734653026103, + "learning_rate": 0.0001910416078120832, + "loss": 0.8715, + "step": 51 + }, + { + "epoch": 0.1664, + "grad_norm": 0.47037923716378616, + "learning_rate": 0.0001906063394634356, + "loss": 0.8428, + "step": 52 + }, + { + "epoch": 0.1696, + "grad_norm": 0.4071915604324042, + "learning_rate": 0.00019016126627440237, + "loss": 0.7873, + "step": 53 + }, + { + "epoch": 0.1728, + "grad_norm": 0.4930792135935222, + "learning_rate": 0.00018970643640796642, + "loss": 0.8473, + "step": 54 + }, + { + "epoch": 0.176, + "grad_norm": 0.46072983553161256, + "learning_rate": 0.000189241899082916, + "loss": 0.7934, + "step": 55 + }, + { + "epoch": 0.1792, + "grad_norm": 0.4724090002512796, + "learning_rate": 0.00018876770456851877, + "loss": 0.7856, + "step": 56 + }, + { + "epoch": 0.1824, + "grad_norm": 0.5424387356431846, + "learning_rate": 0.0001882839041790818, + "loss": 0.9459, + "step": 57 + }, + { + "epoch": 0.1856, + "grad_norm": 0.49540806475580534, + "learning_rate": 0.00018779055026839868, + "loss": 0.7716, + "step": 58 + }, + { + "epoch": 0.1888, + "grad_norm": 0.5035931795876943, + "learning_rate": 0.00018728769622408423, + "loss": 0.7834, + "step": 59 + }, + { + "epoch": 0.192, + "grad_norm": 0.8131080597684293, + "learning_rate": 0.00018677539646179707, + "loss": 0.7759, + "step": 60 + }, + { + "epoch": 0.1952, + "grad_norm": 0.5163423632671406, + "learning_rate": 0.00018625370641935129, + "loss": 0.827, + "step": 61 + }, + { + "epoch": 0.1984, + "grad_norm": 0.4291452218834803, + "learning_rate": 0.00018572268255071718, + "loss": 0.8182, + "step": 62 + }, + { + "epoch": 0.2016, + "grad_norm": 0.45224199709813434, + "learning_rate": 0.00018518238231991218, + "loss": 0.813, + "step": 63 + }, + { + "epoch": 0.2048, + "grad_norm": 0.4261090870298, + "learning_rate": 0.00018463286419478255, + "loss": 0.7053, + "step": 64 + }, + { + "epoch": 0.208, + "grad_norm": 0.5167876729083449, + "learning_rate": 0.00018407418764067627, + "loss": 0.8275, + "step": 65 + }, + { + "epoch": 0.2112, + "grad_norm": 0.4313552859242522, + "learning_rate": 0.00018350641311400812, + "loss": 0.8171, + "step": 66 + }, + { + "epoch": 0.2144, + "grad_norm": 0.45289041539428, + "learning_rate": 0.0001829296020557174, + "loss": 0.8492, + "step": 67 + }, + { + "epoch": 0.2176, + "grad_norm": 0.5166261174988512, + "learning_rate": 0.00018234381688461942, + "loss": 0.8338, + "step": 68 + }, + { + "epoch": 0.2208, + "grad_norm": 0.4157435641812972, + "learning_rate": 0.0001817491209906506, + "loss": 0.7917, + "step": 69 + }, + { + "epoch": 0.224, + "grad_norm": 0.491402132048252, + "learning_rate": 0.00018114557872800905, + "loss": 0.7981, + "step": 70 + }, + { + "epoch": 0.2272, + "grad_norm": 0.4445528461657566, + "learning_rate": 0.00018053325540819045, + "loss": 0.9052, + "step": 71 + }, + { + "epoch": 0.2304, + "grad_norm": 0.4197447499086703, + "learning_rate": 0.0001799122172929206, + "loss": 0.7493, + "step": 72 + }, + { + "epoch": 0.2336, + "grad_norm": 0.4886253206372016, + "learning_rate": 0.00017928253158698473, + "loss": 0.8029, + "step": 73 + }, + { + "epoch": 0.2368, + "grad_norm": 0.4765189446925175, + "learning_rate": 0.0001786442664309554, + "loss": 0.8344, + "step": 74 + }, + { + "epoch": 0.24, + "grad_norm": 0.5144358719291028, + "learning_rate": 0.0001779974908938184, + "loss": 0.8304, + "step": 75 + }, + { + "epoch": 0.2432, + "grad_norm": 0.46916722646695236, + "learning_rate": 0.0001773422749654988, + "loss": 0.8396, + "step": 76 + }, + { + "epoch": 0.2464, + "grad_norm": 0.4697171238614981, + "learning_rate": 0.00017667868954928694, + "loss": 0.8141, + "step": 77 + }, + { + "epoch": 0.2496, + "grad_norm": 0.4567021849997054, + "learning_rate": 0.00017600680645416583, + "loss": 0.7285, + "step": 78 + }, + { + "epoch": 0.2528, + "grad_norm": 0.4894482041948432, + "learning_rate": 0.00017532669838704035, + "loss": 0.8294, + "step": 79 + }, + { + "epoch": 0.256, + "grad_norm": 0.4904653881718381, + "learning_rate": 0.00017463843894486937, + "loss": 0.7718, + "step": 80 + }, + { + "epoch": 0.2592, + "grad_norm": 0.4234566601688781, + "learning_rate": 0.0001739421026067017, + "loss": 0.768, + "step": 81 + }, + { + "epoch": 0.2624, + "grad_norm": 0.46261981904263455, + "learning_rate": 0.00017323776472561627, + "loss": 0.7512, + "step": 82 + }, + { + "epoch": 0.2656, + "grad_norm": 0.5106893649433253, + "learning_rate": 0.00017252550152056795, + "loss": 0.8489, + "step": 83 + }, + { + "epoch": 0.2688, + "grad_norm": 0.6163005786610454, + "learning_rate": 0.0001718053900681397, + "loss": 0.8145, + "step": 84 + }, + { + "epoch": 0.272, + "grad_norm": 0.4674196739539739, + "learning_rate": 0.00017107750829420176, + "loss": 0.7815, + "step": 85 + }, + { + "epoch": 0.2752, + "grad_norm": 0.48965758838373274, + "learning_rate": 0.00017034193496547902, + "loss": 0.8273, + "step": 86 + }, + { + "epoch": 0.2784, + "grad_norm": 0.419009348485611, + "learning_rate": 0.00016959874968102735, + "loss": 0.7198, + "step": 87 + }, + { + "epoch": 0.2816, + "grad_norm": 0.5630703058755215, + "learning_rate": 0.00016884803286362, + "loss": 0.8807, + "step": 88 + }, + { + "epoch": 0.2848, + "grad_norm": 0.5974320059787537, + "learning_rate": 0.00016808986575104465, + "loss": 0.9517, + "step": 89 + }, + { + "epoch": 0.288, + "grad_norm": 0.5106332295584806, + "learning_rate": 0.00016732433038731242, + "loss": 0.8321, + "step": 90 + }, + { + "epoch": 0.2912, + "grad_norm": 0.5241845014581666, + "learning_rate": 0.0001665515096137797, + "loss": 0.8576, + "step": 91 + }, + { + "epoch": 0.2944, + "grad_norm": 0.3817337728587, + "learning_rate": 0.00016577148706018328, + "loss": 0.773, + "step": 92 + }, + { + "epoch": 0.2976, + "grad_norm": 0.40242980611290113, + "learning_rate": 0.00016498434713559088, + "loss": 0.7608, + "step": 93 + }, + { + "epoch": 0.3008, + "grad_norm": 0.49824254385091293, + "learning_rate": 0.00016419017501926656, + "loss": 0.8787, + "step": 94 + }, + { + "epoch": 0.304, + "grad_norm": 0.5597415955082202, + "learning_rate": 0.0001633890566514535, + "loss": 0.8821, + "step": 95 + }, + { + "epoch": 0.3072, + "grad_norm": 0.41719321050637004, + "learning_rate": 0.00016258107872407375, + "loss": 0.7712, + "step": 96 + }, + { + "epoch": 0.3104, + "grad_norm": 0.561733655171769, + "learning_rate": 0.0001617663286713474, + "loss": 0.9214, + "step": 97 + }, + { + "epoch": 0.3136, + "grad_norm": 0.403603783227237, + "learning_rate": 0.00016094489466033043, + "loss": 0.8177, + "step": 98 + }, + { + "epoch": 0.3168, + "grad_norm": 0.401241015246352, + "learning_rate": 0.00016011686558137448, + "loss": 0.8014, + "step": 99 + }, + { + "epoch": 0.32, + "grad_norm": 0.6093542375620725, + "learning_rate": 0.0001592823310385073, + "loss": 0.7716, + "step": 100 + }, + { + "epoch": 0.3232, + "grad_norm": 0.4174338432478432, + "learning_rate": 0.0001584413813397364, + "loss": 0.8355, + "step": 101 + }, + { + "epoch": 0.3264, + "grad_norm": 0.560369224533605, + "learning_rate": 0.00015759410748727662, + "loss": 0.8222, + "step": 102 + }, + { + "epoch": 0.3296, + "grad_norm": 0.48807236825344513, + "learning_rate": 0.00015674060116770236, + "loss": 0.6906, + "step": 103 + }, + { + "epoch": 0.3328, + "grad_norm": 0.49877772882012494, + "learning_rate": 0.00015588095474202595, + "loss": 0.8299, + "step": 104 + }, + { + "epoch": 0.336, + "grad_norm": 0.5627159269452524, + "learning_rate": 0.00015501526123570277, + "loss": 0.8169, + "step": 105 + }, + { + "epoch": 0.3392, + "grad_norm": 0.5239948943616941, + "learning_rate": 0.00015414361432856475, + "loss": 0.7465, + "step": 106 + }, + { + "epoch": 0.3424, + "grad_norm": 0.41041236366072353, + "learning_rate": 0.0001532661083446829, + "loss": 0.7824, + "step": 107 + }, + { + "epoch": 0.3456, + "grad_norm": 0.4348437359785736, + "learning_rate": 0.00015238283824216015, + "loss": 0.7417, + "step": 108 + }, + { + "epoch": 0.3488, + "grad_norm": 0.6220227374639119, + "learning_rate": 0.00015149389960285558, + "loss": 0.8912, + "step": 109 + }, + { + "epoch": 0.352, + "grad_norm": 0.4296457112545859, + "learning_rate": 0.00015059938862204127, + "loss": 0.8072, + "step": 110 + }, + { + "epoch": 0.3552, + "grad_norm": 0.4360646428952031, + "learning_rate": 0.00014969940209799248, + "loss": 0.754, + "step": 111 + }, + { + "epoch": 0.3584, + "grad_norm": 0.5054723789375187, + "learning_rate": 0.00014879403742151283, + "loss": 0.7758, + "step": 112 + }, + { + "epoch": 0.3616, + "grad_norm": 0.494290935202186, + "learning_rate": 0.00014788339256539544, + "loss": 0.8382, + "step": 113 + }, + { + "epoch": 0.3648, + "grad_norm": 0.5333671479523171, + "learning_rate": 0.0001469675660738206, + "loss": 0.8051, + "step": 114 + }, + { + "epoch": 0.368, + "grad_norm": 0.4993376861957015, + "learning_rate": 0.00014604665705169237, + "loss": 0.8322, + "step": 115 + }, + { + "epoch": 0.3712, + "grad_norm": 0.4982647162083319, + "learning_rate": 0.00014512076515391375, + "loss": 0.752, + "step": 116 + }, + { + "epoch": 0.3744, + "grad_norm": 0.4423169139813519, + "learning_rate": 0.00014418999057460276, + "loss": 0.7763, + "step": 117 + }, + { + "epoch": 0.3776, + "grad_norm": 0.5349362617362463, + "learning_rate": 0.0001432544340362501, + "loss": 0.889, + "step": 118 + }, + { + "epoch": 0.3808, + "grad_norm": 0.4435089793619459, + "learning_rate": 0.00014231419677881966, + "loss": 0.7732, + "step": 119 + }, + { + "epoch": 0.384, + "grad_norm": 0.43091617490721384, + "learning_rate": 0.00014136938054879283, + "loss": 0.8183, + "step": 120 + }, + { + "epoch": 0.3872, + "grad_norm": 0.444060503169897, + "learning_rate": 0.00014042008758815818, + "loss": 0.7883, + "step": 121 + }, + { + "epoch": 0.3904, + "grad_norm": 0.41965491835732754, + "learning_rate": 0.00013946642062334766, + "loss": 0.8106, + "step": 122 + }, + { + "epoch": 0.3936, + "grad_norm": 0.44284326089286935, + "learning_rate": 0.00013850848285411994, + "loss": 0.8015, + "step": 123 + }, + { + "epoch": 0.3968, + "grad_norm": 0.39061979734847707, + "learning_rate": 0.000137546377942393, + "loss": 0.7657, + "step": 124 + }, + { + "epoch": 0.4, + "grad_norm": 0.39703513811657537, + "learning_rate": 0.00013658021000102636, + "loss": 0.7763, + "step": 125 + }, + { + "epoch": 0.4032, + "grad_norm": 0.5043580429456066, + "learning_rate": 0.00013561008358255468, + "loss": 0.8012, + "step": 126 + }, + { + "epoch": 0.4064, + "grad_norm": 0.5629925641967249, + "learning_rate": 0.00013463610366787392, + "loss": 0.8638, + "step": 127 + }, + { + "epoch": 0.4096, + "grad_norm": 0.5460601526584461, + "learning_rate": 0.00013365837565488064, + "loss": 0.8266, + "step": 128 + }, + { + "epoch": 0.4128, + "grad_norm": 0.41985598118037926, + "learning_rate": 0.0001326770053470668, + "loss": 0.7357, + "step": 129 + }, + { + "epoch": 0.416, + "grad_norm": 0.5475399510700736, + "learning_rate": 0.0001316920989420703, + "loss": 0.8838, + "step": 130 + }, + { + "epoch": 0.4192, + "grad_norm": 1.1006791888104086, + "learning_rate": 0.00013070376302018287, + "loss": 0.7982, + "step": 131 + }, + { + "epoch": 0.4224, + "grad_norm": 0.38815953401271486, + "learning_rate": 0.00012971210453281674, + "loss": 0.7272, + "step": 132 + }, + { + "epoch": 0.4256, + "grad_norm": 0.4440664673903164, + "learning_rate": 0.000128717230790931, + "loss": 0.8308, + "step": 133 + }, + { + "epoch": 0.4288, + "grad_norm": 0.3734226061489481, + "learning_rate": 0.00012771924945341906, + "loss": 0.7624, + "step": 134 + }, + { + "epoch": 0.432, + "grad_norm": 0.43399956288340513, + "learning_rate": 0.00012671826851545851, + "loss": 0.7323, + "step": 135 + }, + { + "epoch": 0.4352, + "grad_norm": 0.42319000811636015, + "learning_rate": 0.0001257143962968246, + "loss": 0.772, + "step": 136 + }, + { + "epoch": 0.4384, + "grad_norm": 0.39397233302623236, + "learning_rate": 0.00012470774143016853, + "loss": 0.728, + "step": 137 + }, + { + "epoch": 0.4416, + "grad_norm": 0.3893213470005053, + "learning_rate": 0.00012369841284926188, + "loss": 0.7019, + "step": 138 + }, + { + "epoch": 0.4448, + "grad_norm": 0.35227510257967026, + "learning_rate": 0.00012268651977720866, + "loss": 0.7391, + "step": 139 + }, + { + "epoch": 0.448, + "grad_norm": 0.47287328029829406, + "learning_rate": 0.00012167217171462566, + "loss": 0.9142, + "step": 140 + }, + { + "epoch": 0.4512, + "grad_norm": 0.47962860635829735, + "learning_rate": 0.0001206554784277931, + "loss": 0.7615, + "step": 141 + }, + { + "epoch": 0.4544, + "grad_norm": 0.4579895901402684, + "learning_rate": 0.00011963654993677645, + "loss": 0.763, + "step": 142 + }, + { + "epoch": 0.4576, + "grad_norm": 0.5216593170553835, + "learning_rate": 0.00011861549650352069, + "loss": 0.8671, + "step": 143 + }, + { + "epoch": 0.4608, + "grad_norm": 0.5464788696394796, + "learning_rate": 0.00011759242861991855, + "loss": 0.8131, + "step": 144 + }, + { + "epoch": 0.464, + "grad_norm": 0.4167947720982017, + "learning_rate": 0.00011656745699585371, + "loss": 0.7709, + "step": 145 + }, + { + "epoch": 0.4672, + "grad_norm": 0.49472030945838674, + "learning_rate": 0.00011554069254722051, + "loss": 0.8371, + "step": 146 + }, + { + "epoch": 0.4704, + "grad_norm": 0.3744782140453558, + "learning_rate": 0.00011451224638392129, + "loss": 0.7029, + "step": 147 + }, + { + "epoch": 0.4736, + "grad_norm": 0.4528021003842892, + "learning_rate": 0.00011348222979784289, + "loss": 0.8865, + "step": 148 + }, + { + "epoch": 0.4768, + "grad_norm": 0.4589123011711344, + "learning_rate": 0.00011245075425081328, + "loss": 0.7524, + "step": 149 + }, + { + "epoch": 0.48, + "grad_norm": 0.42839347958134205, + "learning_rate": 0.00011141793136253986, + "loss": 0.8261, + "step": 150 + }, + { + "epoch": 0.4832, + "grad_norm": 0.36902286633529063, + "learning_rate": 0.0001103838728985307, + "loss": 0.7776, + "step": 151 + }, + { + "epoch": 0.4864, + "grad_norm": 0.6244510375840064, + "learning_rate": 0.000109348690758, + "loss": 0.8404, + "step": 152 + }, + { + "epoch": 0.4896, + "grad_norm": 0.44194539682664075, + "learning_rate": 0.00010831249696175918, + "loss": 0.7606, + "step": 153 + }, + { + "epoch": 0.4928, + "grad_norm": 0.4265631702274794, + "learning_rate": 0.0001072754036400944, + "loss": 0.7329, + "step": 154 + }, + { + "epoch": 0.496, + "grad_norm": 0.39222524745628834, + "learning_rate": 0.00010623752302063283, + "loss": 0.7171, + "step": 155 + }, + { + "epoch": 0.4992, + "grad_norm": 0.3705633072729717, + "learning_rate": 0.00010519896741619803, + "loss": 0.7362, + "step": 156 + }, + { + "epoch": 0.5024, + "grad_norm": 0.44923869821756857, + "learning_rate": 0.00010415984921265609, + "loss": 0.7928, + "step": 157 + }, + { + "epoch": 0.5056, + "grad_norm": 0.3860490982345257, + "learning_rate": 0.00010312028085675391, + "loss": 0.6791, + "step": 158 + }, + { + "epoch": 0.5088, + "grad_norm": 0.4741707270491527, + "learning_rate": 0.00010208037484395114, + "loss": 0.8472, + "step": 159 + }, + { + "epoch": 0.512, + "grad_norm": 0.4254148763209672, + "learning_rate": 0.00010104024370624644, + "loss": 0.7223, + "step": 160 + }, + { + "epoch": 0.5152, + "grad_norm": 0.4413502994303793, + "learning_rate": 0.0001, + "loss": 0.7815, + "step": 161 + }, + { + "epoch": 0.5184, + "grad_norm": 0.35458312224766875, + "learning_rate": 9.895975629375359e-05, + "loss": 0.7146, + "step": 162 + }, + { + "epoch": 0.5216, + "grad_norm": 0.4469590316769706, + "learning_rate": 9.791962515604887e-05, + "loss": 0.7549, + "step": 163 + }, + { + "epoch": 0.5248, + "grad_norm": 0.4263386322112164, + "learning_rate": 9.687971914324607e-05, + "loss": 0.7745, + "step": 164 + }, + { + "epoch": 0.528, + "grad_norm": 0.4149606139434068, + "learning_rate": 9.584015078734395e-05, + "loss": 0.8145, + "step": 165 + }, + { + "epoch": 0.5312, + "grad_norm": 0.4256621117844649, + "learning_rate": 9.480103258380198e-05, + "loss": 0.8236, + "step": 166 + }, + { + "epoch": 0.5344, + "grad_norm": 0.39257904027478074, + "learning_rate": 9.376247697936719e-05, + "loss": 0.7016, + "step": 167 + }, + { + "epoch": 0.5376, + "grad_norm": 0.4535928960965414, + "learning_rate": 9.272459635990562e-05, + "loss": 0.7954, + "step": 168 + }, + { + "epoch": 0.5408, + "grad_norm": 0.4233331815902363, + "learning_rate": 9.168750303824084e-05, + "loss": 0.7895, + "step": 169 + }, + { + "epoch": 0.544, + "grad_norm": 0.42189751090775524, + "learning_rate": 9.065130924199998e-05, + "loss": 0.7161, + "step": 170 + }, + { + "epoch": 0.5472, + "grad_norm": 0.43914005287019414, + "learning_rate": 8.961612710146934e-05, + "loss": 0.7527, + "step": 171 + }, + { + "epoch": 0.5504, + "grad_norm": 0.42567239550140645, + "learning_rate": 8.858206863746018e-05, + "loss": 0.8104, + "step": 172 + }, + { + "epoch": 0.5536, + "grad_norm": 0.417351543024337, + "learning_rate": 8.754924574918675e-05, + "loss": 0.771, + "step": 173 + }, + { + "epoch": 0.5568, + "grad_norm": 0.42762751595369664, + "learning_rate": 8.651777020215712e-05, + "loss": 0.7527, + "step": 174 + }, + { + "epoch": 0.56, + "grad_norm": 0.5091564310729734, + "learning_rate": 8.548775361607872e-05, + "loss": 0.7185, + "step": 175 + }, + { + "epoch": 0.5632, + "grad_norm": 0.5191352153239585, + "learning_rate": 8.445930745277953e-05, + "loss": 0.7472, + "step": 176 + }, + { + "epoch": 0.5664, + "grad_norm": 0.39052374687232233, + "learning_rate": 8.343254300414628e-05, + "loss": 0.705, + "step": 177 + }, + { + "epoch": 0.5696, + "grad_norm": 0.44439005828791617, + "learning_rate": 8.240757138008149e-05, + "loss": 0.7689, + "step": 178 + }, + { + "epoch": 0.5728, + "grad_norm": 0.49277191680042626, + "learning_rate": 8.138450349647936e-05, + "loss": 0.7774, + "step": 179 + }, + { + "epoch": 0.576, + "grad_norm": 0.4762458485994561, + "learning_rate": 8.036345006322359e-05, + "loss": 0.8411, + "step": 180 + }, + { + "epoch": 0.5792, + "grad_norm": 0.47290284538716687, + "learning_rate": 7.934452157220694e-05, + "loss": 0.8232, + "step": 181 + }, + { + "epoch": 0.5824, + "grad_norm": 0.4726519028658229, + "learning_rate": 7.832782828537437e-05, + "loss": 0.8124, + "step": 182 + }, + { + "epoch": 0.5856, + "grad_norm": 0.4203880375352218, + "learning_rate": 7.731348022279134e-05, + "loss": 0.7598, + "step": 183 + }, + { + "epoch": 0.5888, + "grad_norm": 0.3941442723055963, + "learning_rate": 7.630158715073813e-05, + "loss": 0.7341, + "step": 184 + }, + { + "epoch": 0.592, + "grad_norm": 0.5054516938068756, + "learning_rate": 7.52922585698315e-05, + "loss": 0.8368, + "step": 185 + }, + { + "epoch": 0.5952, + "grad_norm": 0.4243132536885088, + "learning_rate": 7.428560370317542e-05, + "loss": 0.7671, + "step": 186 + }, + { + "epoch": 0.5984, + "grad_norm": 0.4534177322388772, + "learning_rate": 7.328173148454151e-05, + "loss": 0.7765, + "step": 187 + }, + { + "epoch": 0.6016, + "grad_norm": 0.43927406680363684, + "learning_rate": 7.228075054658096e-05, + "loss": 0.7457, + "step": 188 + }, + { + "epoch": 0.6048, + "grad_norm": 0.44389773911521946, + "learning_rate": 7.1282769209069e-05, + "loss": 0.7933, + "step": 189 + }, + { + "epoch": 0.608, + "grad_norm": 0.4039812656926671, + "learning_rate": 7.028789546718326e-05, + "loss": 0.7207, + "step": 190 + }, + { + "epoch": 0.6112, + "grad_norm": 0.4428805100373204, + "learning_rate": 6.929623697981718e-05, + "loss": 0.8276, + "step": 191 + }, + { + "epoch": 0.6144, + "grad_norm": 0.4673188138115627, + "learning_rate": 6.830790105792973e-05, + "loss": 0.7872, + "step": 192 + }, + { + "epoch": 0.6176, + "grad_norm": 0.47265904688976484, + "learning_rate": 6.732299465293322e-05, + "loss": 0.7749, + "step": 193 + }, + { + "epoch": 0.6208, + "grad_norm": 0.42953052839076467, + "learning_rate": 6.63416243451194e-05, + "loss": 0.7322, + "step": 194 + }, + { + "epoch": 0.624, + "grad_norm": 0.4723567917036063, + "learning_rate": 6.536389633212609e-05, + "loss": 0.8228, + "step": 195 + }, + { + "epoch": 0.6272, + "grad_norm": 0.47214932017910977, + "learning_rate": 6.43899164174453e-05, + "loss": 0.7907, + "step": 196 + }, + { + "epoch": 0.6304, + "grad_norm": 0.49575133422589773, + "learning_rate": 6.341978999897365e-05, + "loss": 0.8083, + "step": 197 + }, + { + "epoch": 0.6336, + "grad_norm": 0.4971390236393325, + "learning_rate": 6.245362205760704e-05, + "loss": 0.7912, + "step": 198 + }, + { + "epoch": 0.6368, + "grad_norm": 0.46568071103960657, + "learning_rate": 6.149151714588009e-05, + "loss": 0.7432, + "step": 199 + }, + { + "epoch": 0.64, + "grad_norm": 0.35950760162351136, + "learning_rate": 6.053357937665237e-05, + "loss": 0.7595, + "step": 200 + }, + { + "epoch": 0.6432, + "grad_norm": 0.3852520082408446, + "learning_rate": 5.957991241184184e-05, + "loss": 0.719, + "step": 201 + }, + { + "epoch": 0.6464, + "grad_norm": 0.4144688519728944, + "learning_rate": 5.863061945120719e-05, + "loss": 0.7519, + "step": 202 + }, + { + "epoch": 0.6496, + "grad_norm": 0.574956448805601, + "learning_rate": 5.768580322118034e-05, + "loss": 0.8408, + "step": 203 + }, + { + "epoch": 0.6528, + "grad_norm": 0.4133163229719489, + "learning_rate": 5.6745565963749925e-05, + "loss": 0.7867, + "step": 204 + }, + { + "epoch": 0.656, + "grad_norm": 0.48036249996384844, + "learning_rate": 5.5810009425397294e-05, + "loss": 0.8117, + "step": 205 + }, + { + "epoch": 0.6592, + "grad_norm": 0.4847266611954784, + "learning_rate": 5.487923484608629e-05, + "loss": 0.7896, + "step": 206 + }, + { + "epoch": 0.6624, + "grad_norm": 0.4067581760586959, + "learning_rate": 5.395334294830765e-05, + "loss": 0.728, + "step": 207 + }, + { + "epoch": 0.6656, + "grad_norm": 0.4187734227054212, + "learning_rate": 5.3032433926179395e-05, + "loss": 0.7341, + "step": 208 + }, + { + "epoch": 0.6688, + "grad_norm": 0.503269253418501, + "learning_rate": 5.211660743460458e-05, + "loss": 0.7621, + "step": 209 + }, + { + "epoch": 0.672, + "grad_norm": 0.506259067716606, + "learning_rate": 5.1205962578487155e-05, + "loss": 0.8065, + "step": 210 + }, + { + "epoch": 0.6752, + "grad_norm": 0.4241877143025232, + "learning_rate": 5.030059790200756e-05, + "loss": 0.7265, + "step": 211 + }, + { + "epoch": 0.6784, + "grad_norm": 0.4062906237273913, + "learning_rate": 4.940061137795876e-05, + "loss": 0.7286, + "step": 212 + }, + { + "epoch": 0.6816, + "grad_norm": 0.4015033852678729, + "learning_rate": 4.850610039714444e-05, + "loss": 0.7966, + "step": 213 + }, + { + "epoch": 0.6848, + "grad_norm": 0.4427337294004432, + "learning_rate": 4.761716175783989e-05, + "loss": 0.7994, + "step": 214 + }, + { + "epoch": 0.688, + "grad_norm": 0.36670330249588673, + "learning_rate": 4.673389165531714e-05, + "loss": 0.7539, + "step": 215 + }, + { + "epoch": 0.6912, + "grad_norm": 0.46359869941524046, + "learning_rate": 4.585638567143529e-05, + "loss": 0.843, + "step": 216 + }, + { + "epoch": 0.6944, + "grad_norm": 0.5250059887349049, + "learning_rate": 4.498473876429726e-05, + "loss": 0.7989, + "step": 217 + }, + { + "epoch": 0.6976, + "grad_norm": 0.4931760616740979, + "learning_rate": 4.411904525797408e-05, + "loss": 0.8151, + "step": 218 + }, + { + "epoch": 0.7008, + "grad_norm": 0.3855682951716813, + "learning_rate": 4.325939883229766e-05, + "loss": 0.7323, + "step": 219 + }, + { + "epoch": 0.704, + "grad_norm": 0.4148347036245805, + "learning_rate": 4.240589251272342e-05, + "loss": 0.7118, + "step": 220 + }, + { + "epoch": 0.7072, + "grad_norm": 0.47022827966731523, + "learning_rate": 4.155861866026364e-05, + "loss": 0.7955, + "step": 221 + }, + { + "epoch": 0.7104, + "grad_norm": 0.3918604326849458, + "learning_rate": 4.071766896149273e-05, + "loss": 0.7425, + "step": 222 + }, + { + "epoch": 0.7136, + "grad_norm": 0.40201106317085633, + "learning_rate": 3.988313441862553e-05, + "loss": 0.6585, + "step": 223 + }, + { + "epoch": 0.7168, + "grad_norm": 0.4064711104467062, + "learning_rate": 3.9055105339669595e-05, + "loss": 0.7507, + "step": 224 + }, + { + "epoch": 0.72, + "grad_norm": 0.43230832543129016, + "learning_rate": 3.823367132865265e-05, + "loss": 0.8291, + "step": 225 + }, + { + "epoch": 0.7232, + "grad_norm": 0.41086763348387045, + "learning_rate": 3.741892127592625e-05, + "loss": 0.7254, + "step": 226 + }, + { + "epoch": 0.7264, + "grad_norm": 0.41106326675945515, + "learning_rate": 3.6610943348546526e-05, + "loss": 0.6617, + "step": 227 + }, + { + "epoch": 0.7296, + "grad_norm": 0.36682224813291225, + "learning_rate": 3.580982498073344e-05, + "loss": 0.679, + "step": 228 + }, + { + "epoch": 0.7328, + "grad_norm": 0.5166929327698213, + "learning_rate": 3.501565286440914e-05, + "loss": 0.7117, + "step": 229 + }, + { + "epoch": 0.736, + "grad_norm": 0.36736057328082905, + "learning_rate": 3.422851293981676e-05, + "loss": 0.7006, + "step": 230 + }, + { + "epoch": 0.7392, + "grad_norm": 0.4151280117511326, + "learning_rate": 3.3448490386220355e-05, + "loss": 0.7469, + "step": 231 + }, + { + "epoch": 0.7424, + "grad_norm": 0.4504996548534877, + "learning_rate": 3.2675669612687565e-05, + "loss": 0.7587, + "step": 232 + }, + { + "epoch": 0.7456, + "grad_norm": 0.36058371335471157, + "learning_rate": 3.191013424895536e-05, + "loss": 0.7505, + "step": 233 + }, + { + "epoch": 0.7488, + "grad_norm": 0.4300359809363015, + "learning_rate": 3.115196713638e-05, + "loss": 0.7914, + "step": 234 + }, + { + "epoch": 0.752, + "grad_norm": 0.4000285275376355, + "learning_rate": 3.040125031897264e-05, + "loss": 0.7143, + "step": 235 + }, + { + "epoch": 0.7552, + "grad_norm": 0.6151509764187009, + "learning_rate": 2.9658065034520978e-05, + "loss": 0.8776, + "step": 236 + }, + { + "epoch": 0.7584, + "grad_norm": 0.47029669638735594, + "learning_rate": 2.892249170579826e-05, + "loss": 0.7602, + "step": 237 + }, + { + "epoch": 0.7616, + "grad_norm": 0.4206925243151734, + "learning_rate": 2.8194609931860316e-05, + "loss": 0.678, + "step": 238 + }, + { + "epoch": 0.7648, + "grad_norm": 0.45501137804658276, + "learning_rate": 2.7474498479432087e-05, + "loss": 0.7416, + "step": 239 + }, + { + "epoch": 0.768, + "grad_norm": 0.39642161326813075, + "learning_rate": 2.6762235274383772e-05, + "loss": 0.721, + "step": 240 + }, + { + "epoch": 0.7712, + "grad_norm": 0.38016310546539966, + "learning_rate": 2.6057897393298324e-05, + "loss": 0.7745, + "step": 241 + }, + { + "epoch": 0.7744, + "grad_norm": 0.35166570303643224, + "learning_rate": 2.536156105513062e-05, + "loss": 0.6992, + "step": 242 + }, + { + "epoch": 0.7776, + "grad_norm": 0.5630100003025338, + "learning_rate": 2.4673301612959654e-05, + "loss": 0.9032, + "step": 243 + }, + { + "epoch": 0.7808, + "grad_norm": 0.46440941920744416, + "learning_rate": 2.399319354583418e-05, + "loss": 0.8255, + "step": 244 + }, + { + "epoch": 0.784, + "grad_norm": 0.5028963259761965, + "learning_rate": 2.3321310450713062e-05, + "loss": 0.7553, + "step": 245 + }, + { + "epoch": 0.7872, + "grad_norm": 0.5342454680747596, + "learning_rate": 2.265772503450122e-05, + "loss": 0.7309, + "step": 246 + }, + { + "epoch": 0.7904, + "grad_norm": 0.638561461721519, + "learning_rate": 2.2002509106181624e-05, + "loss": 0.7846, + "step": 247 + }, + { + "epoch": 0.7936, + "grad_norm": 0.3975459542376279, + "learning_rate": 2.1355733569044635e-05, + "loss": 0.6635, + "step": 248 + }, + { + "epoch": 0.7968, + "grad_norm": 0.4324235174565061, + "learning_rate": 2.0717468413015283e-05, + "loss": 0.7236, + "step": 249 + }, + { + "epoch": 0.8, + "grad_norm": 0.5051293824930861, + "learning_rate": 2.008778270707944e-05, + "loss": 0.8115, + "step": 250 + }, + { + "epoch": 0.8032, + "grad_norm": 0.44916051057375367, + "learning_rate": 1.946674459180955e-05, + "loss": 0.7492, + "step": 251 + }, + { + "epoch": 0.8064, + "grad_norm": 0.44819925713408015, + "learning_rate": 1.8854421271990964e-05, + "loss": 0.7944, + "step": 252 + }, + { + "epoch": 0.8096, + "grad_norm": 0.4955285294947814, + "learning_rate": 1.8250879009349398e-05, + "loss": 0.7128, + "step": 253 + }, + { + "epoch": 0.8128, + "grad_norm": 0.3593977013244792, + "learning_rate": 1.7656183115380577e-05, + "loss": 0.6488, + "step": 254 + }, + { + "epoch": 0.816, + "grad_norm": 0.3796390525191943, + "learning_rate": 1.707039794428259e-05, + "loss": 0.6952, + "step": 255 + }, + { + "epoch": 0.8192, + "grad_norm": 0.4320687103990978, + "learning_rate": 1.649358688599191e-05, + "loss": 0.6733, + "step": 256 + }, + { + "epoch": 0.8224, + "grad_norm": 0.3930783842804485, + "learning_rate": 1.5925812359323745e-05, + "loss": 0.6755, + "step": 257 + }, + { + "epoch": 0.8256, + "grad_norm": 0.44459322981973176, + "learning_rate": 1.5367135805217458e-05, + "loss": 0.7601, + "step": 258 + }, + { + "epoch": 0.8288, + "grad_norm": 0.367438558696493, + "learning_rate": 1.4817617680087825e-05, + "loss": 0.6373, + "step": 259 + }, + { + "epoch": 0.832, + "grad_norm": 0.49124899058752686, + "learning_rate": 1.4277317449282834e-05, + "loss": 0.7743, + "step": 260 + }, + { + "epoch": 0.8352, + "grad_norm": 0.471827562532131, + "learning_rate": 1.3746293580648717e-05, + "loss": 0.7304, + "step": 261 + }, + { + "epoch": 0.8384, + "grad_norm": 0.4361909599488475, + "learning_rate": 1.3224603538202929e-05, + "loss": 0.7808, + "step": 262 + }, + { + "epoch": 0.8416, + "grad_norm": 0.4267501395480655, + "learning_rate": 1.2712303775915802e-05, + "loss": 0.7604, + "step": 263 + }, + { + "epoch": 0.8448, + "grad_norm": 0.4183946493041821, + "learning_rate": 1.220944973160133e-05, + "loss": 0.7939, + "step": 264 + }, + { + "epoch": 0.848, + "grad_norm": 0.3877019799044561, + "learning_rate": 1.1716095820918216e-05, + "loss": 0.7391, + "step": 265 + }, + { + "epoch": 0.8512, + "grad_norm": 0.47664260993902013, + "learning_rate": 1.1232295431481222e-05, + "loss": 0.8213, + "step": 266 + }, + { + "epoch": 0.8544, + "grad_norm": 0.43437766674982364, + "learning_rate": 1.0758100917083991e-05, + "loss": 0.7556, + "step": 267 + }, + { + "epoch": 0.8576, + "grad_norm": 0.38702892913749193, + "learning_rate": 1.0293563592033595e-05, + "loss": 0.7246, + "step": 268 + }, + { + "epoch": 0.8608, + "grad_norm": 0.38285741560545905, + "learning_rate": 9.838733725597615e-06, + "loss": 0.7138, + "step": 269 + }, + { + "epoch": 0.864, + "grad_norm": 0.4355115545752378, + "learning_rate": 9.393660536564408e-06, + "loss": 0.7514, + "step": 270 + }, + { + "epoch": 0.8672, + "grad_norm": 0.3825113651972068, + "learning_rate": 8.958392187916841e-06, + "loss": 0.7203, + "step": 271 + }, + { + "epoch": 0.8704, + "grad_norm": 0.40434287867385094, + "learning_rate": 8.532975781620512e-06, + "loss": 0.7203, + "step": 272 + }, + { + "epoch": 0.8736, + "grad_norm": 0.4619415343823919, + "learning_rate": 8.117457353526625e-06, + "loss": 0.8128, + "step": 273 + }, + { + "epoch": 0.8768, + "grad_norm": 0.49297866762546866, + "learning_rate": 7.711881868390291e-06, + "loss": 0.8996, + "step": 274 + }, + { + "epoch": 0.88, + "grad_norm": 0.39966759598042595, + "learning_rate": 7.3162932150046885e-06, + "loss": 0.7499, + "step": 275 + }, + { + "epoch": 0.8832, + "grad_norm": 0.41912825742309595, + "learning_rate": 6.930734201451816e-06, + "loss": 0.7187, + "step": 276 + }, + { + "epoch": 0.8864, + "grad_norm": 0.48507379726989786, + "learning_rate": 6.555246550469907e-06, + "loss": 0.7198, + "step": 277 + }, + { + "epoch": 0.8896, + "grad_norm": 0.42079408254398726, + "learning_rate": 6.189870894938587e-06, + "loss": 0.7445, + "step": 278 + }, + { + "epoch": 0.8928, + "grad_norm": 0.40029250602009475, + "learning_rate": 5.834646773481811e-06, + "loss": 0.7318, + "step": 279 + }, + { + "epoch": 0.896, + "grad_norm": 0.5352601493061019, + "learning_rate": 5.489612626189245e-06, + "loss": 0.6953, + "step": 280 + }, + { + "epoch": 0.8992, + "grad_norm": 0.4103055555089203, + "learning_rate": 5.154805790456485e-06, + "loss": 0.8103, + "step": 281 + }, + { + "epoch": 0.9024, + "grad_norm": 0.50097946349166, + "learning_rate": 4.830262496944693e-06, + "loss": 0.7665, + "step": 282 + }, + { + "epoch": 0.9056, + "grad_norm": 0.40415390709399, + "learning_rate": 4.516017865659949e-06, + "loss": 0.7174, + "step": 283 + }, + { + "epoch": 0.9088, + "grad_norm": 0.36190735541695723, + "learning_rate": 4.21210590215273e-06, + "loss": 0.6558, + "step": 284 + }, + { + "epoch": 0.912, + "grad_norm": 0.4511855196083346, + "learning_rate": 3.918559493838114e-06, + "loss": 0.7109, + "step": 285 + }, + { + "epoch": 0.9152, + "grad_norm": 0.47199208951992744, + "learning_rate": 3.6354104064368566e-06, + "loss": 0.8111, + "step": 286 + }, + { + "epoch": 0.9184, + "grad_norm": 0.6014238116410946, + "learning_rate": 3.3626892805379562e-06, + "loss": 0.7794, + "step": 287 + }, + { + "epoch": 0.9216, + "grad_norm": 0.528098105779227, + "learning_rate": 3.100425628282899e-06, + "loss": 0.8053, + "step": 288 + }, + { + "epoch": 0.9248, + "grad_norm": 0.37455150312706875, + "learning_rate": 2.848647830172024e-06, + "loss": 0.6943, + "step": 289 + }, + { + "epoch": 0.928, + "grad_norm": 0.433073283971249, + "learning_rate": 2.607383131993424e-06, + "loss": 0.8183, + "step": 290 + }, + { + "epoch": 0.9312, + "grad_norm": 0.43399474629439905, + "learning_rate": 2.3766576418745022e-06, + "loss": 0.7625, + "step": 291 + }, + { + "epoch": 0.9344, + "grad_norm": 0.46784582464558705, + "learning_rate": 2.1564963274568027e-06, + "loss": 0.7781, + "step": 292 + }, + { + "epoch": 0.9376, + "grad_norm": 0.4010880071391638, + "learning_rate": 1.9469230131940907e-06, + "loss": 0.6853, + "step": 293 + }, + { + "epoch": 0.9408, + "grad_norm": 0.4981044915202732, + "learning_rate": 1.7479603777742938e-06, + "loss": 0.8253, + "step": 294 + }, + { + "epoch": 0.944, + "grad_norm": 0.37439237196850206, + "learning_rate": 1.559629951665298e-06, + "loss": 0.7231, + "step": 295 + }, + { + "epoch": 0.9472, + "grad_norm": 0.4433273805623505, + "learning_rate": 1.3819521147851123e-06, + "loss": 0.7276, + "step": 296 + }, + { + "epoch": 0.9504, + "grad_norm": 0.40452217918771727, + "learning_rate": 1.2149460942964098e-06, + "loss": 0.7221, + "step": 297 + }, + { + "epoch": 0.9536, + "grad_norm": 0.4150619811345609, + "learning_rate": 1.05862996252597e-06, + "loss": 0.6725, + "step": 298 + }, + { + "epoch": 0.9568, + "grad_norm": 0.413438773475868, + "learning_rate": 9.130206350089765e-07, + "loss": 0.7608, + "step": 299 + }, + { + "epoch": 0.96, + "grad_norm": 0.4377068431797678, + "learning_rate": 7.781338686584927e-07, + "loss": 0.747, + "step": 300 + }, + { + "epoch": 0.9632, + "grad_norm": 0.5277822195393723, + "learning_rate": 6.539842600603918e-07, + "loss": 0.7971, + "step": 301 + }, + { + "epoch": 0.9664, + "grad_norm": 0.43945048480422544, + "learning_rate": 5.405852438937764e-07, + "loss": 0.7237, + "step": 302 + }, + { + "epoch": 0.9696, + "grad_norm": 0.4379035408937839, + "learning_rate": 4.3794909147720773e-07, + "loss": 0.7532, + "step": 303 + }, + { + "epoch": 0.9728, + "grad_norm": 0.4342441153115403, + "learning_rate": 3.4608690944071263e-07, + "loss": 0.7447, + "step": 304 + }, + { + "epoch": 0.976, + "grad_norm": 0.6481027348435237, + "learning_rate": 2.6500863852395584e-07, + "loss": 0.7571, + "step": 305 + }, + { + "epoch": 0.9792, + "grad_norm": 0.5314548036716706, + "learning_rate": 1.947230525005006e-07, + "loss": 0.903, + "step": 306 + }, + { + "epoch": 0.9824, + "grad_norm": 0.40313175330133494, + "learning_rate": 1.3523775722834587e-07, + "loss": 0.7583, + "step": 307 + }, + { + "epoch": 0.9856, + "grad_norm": 0.40735174719853, + "learning_rate": 8.655918982689581e-08, + "loss": 0.6696, + "step": 308 + }, + { + "epoch": 0.9888, + "grad_norm": 0.42449345279075135, + "learning_rate": 4.8692617980350406e-08, + "loss": 0.6966, + "step": 309 + }, + { + "epoch": 0.992, + "grad_norm": 0.48329070024673454, + "learning_rate": 2.164213936770576e-08, + "loss": 0.7644, + "step": 310 + }, + { + "epoch": 0.9952, + "grad_norm": 0.40709003079455985, + "learning_rate": 5.410681219286673e-09, + "loss": 0.6849, + "step": 311 + }, + { + "epoch": 0.9984, + "grad_norm": 0.4243713258025949, + "learning_rate": 0.0, + "loss": 0.6685, + "step": 312 + }, + { + "epoch": 0.9984, + "step": 312, + "total_flos": 268297746972672.0, + "train_loss": 0.7987000615550921, + "train_runtime": 4814.0159, + "train_samples_per_second": 1.039, + "train_steps_per_second": 0.065 + } + ], + "logging_steps": 1.0, + "max_steps": 312, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 268297746972672.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_1_epochs_1_GA_4_lora/README.md b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_1_epochs_1_GA_4_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_1_epochs_1_GA_4_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_1_epochs_1_GA_4_lora/adapter_config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_1_epochs_1_GA_4_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1450b0c1c8fc11f7f719894e0c00265434e0f3f5 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_1_epochs_1_GA_4_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "down_proj", + "o_proj", + "k_proj", + "up_proj", + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_1_epochs_1_GA_4_lora/adapter_model.safetensors b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_1_epochs_1_GA_4_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b3f998b8e3b8566304856ad765e8aa63893fb368 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_1_epochs_1_GA_4_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50772b939e1de0f2bf3823b30fb6656d983f334cb3b840955e81494b3bd8b1bb +size 671150064 diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_1_epochs_1_GA_4_lora/config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_1_epochs_1_GA_4_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_1_epochs_1_GA_4_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_1_epochs_1_GA_4_lora/non_lora_trainables.bin b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_1_epochs_1_GA_4_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..b7eb4ea38e6d7b201b1aef80c4da749256eae7c7 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_1_epochs_1_GA_4_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6baf09c73d0dd685eb351304523c5f0b813e0ff2931735551313af2e9eb78143 +size 918507402 diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_1_epochs_1_GA_4_lora/trainer_state.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_1_epochs_1_GA_4_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..fe7ec2e7eb56566f968592e8f8564b4d6d7422a6 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_1_epochs_1_GA_4_lora/trainer_state.json @@ -0,0 +1,1134 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9984, + "eval_steps": 500, + "global_step": 156, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0064, + "grad_norm": 0.7852892419385117, + "learning_rate": 4e-05, + "loss": 1.2744, + "step": 1 + }, + { + "epoch": 0.0128, + "grad_norm": 0.8123301324016373, + "learning_rate": 8e-05, + "loss": 1.356, + "step": 2 + }, + { + "epoch": 0.0192, + "grad_norm": 0.5966672278005135, + "learning_rate": 0.00012, + "loss": 1.1992, + "step": 3 + }, + { + "epoch": 0.0256, + "grad_norm": 0.7054873167223312, + "learning_rate": 0.00016, + "loss": 1.1633, + "step": 4 + }, + { + "epoch": 0.032, + "grad_norm": 0.7603762104866024, + "learning_rate": 0.0002, + "loss": 1.0346, + "step": 5 + }, + { + "epoch": 0.0384, + "grad_norm": 0.5885181651908423, + "learning_rate": 0.0001999783578606323, + "loss": 0.9551, + "step": 6 + }, + { + "epoch": 0.0448, + "grad_norm": 0.5209000034448891, + "learning_rate": 0.0001999134408101731, + "loss": 0.9527, + "step": 7 + }, + { + "epoch": 0.0512, + "grad_norm": 0.6632635263946058, + "learning_rate": 0.00019980527694749952, + "loss": 0.9641, + "step": 8 + }, + { + "epoch": 0.0576, + "grad_norm": 0.3958275154400083, + "learning_rate": 0.0001996539130905593, + "loss": 0.8922, + "step": 9 + }, + { + "epoch": 0.064, + "grad_norm": 0.4482705445105819, + "learning_rate": 0.00019945941475610623, + "loss": 0.9568, + "step": 10 + }, + { + "epoch": 0.0704, + "grad_norm": 0.4507926704898577, + "learning_rate": 0.0001992218661313415, + "loss": 0.931, + "step": 11 + }, + { + "epoch": 0.0768, + "grad_norm": 0.5101526548814669, + "learning_rate": 0.00019894137003747403, + "loss": 0.9284, + "step": 12 + }, + { + "epoch": 0.0832, + "grad_norm": 0.38789254584648825, + "learning_rate": 0.00019861804788521493, + "loss": 0.8089, + "step": 13 + }, + { + "epoch": 0.0896, + "grad_norm": 0.4622760386572833, + "learning_rate": 0.00019825203962222572, + "loss": 0.9155, + "step": 14 + }, + { + "epoch": 0.096, + "grad_norm": 0.3982764045802694, + "learning_rate": 0.00019784350367254322, + "loss": 0.9136, + "step": 15 + }, + { + "epoch": 0.1024, + "grad_norm": 0.40694387692825534, + "learning_rate": 0.0001973926168680066, + "loss": 0.9076, + "step": 16 + }, + { + "epoch": 0.1088, + "grad_norm": 0.3874800847072502, + "learning_rate": 0.0001968995743717171, + "loss": 0.8438, + "step": 17 + }, + { + "epoch": 0.1152, + "grad_norm": 0.4451084313545965, + "learning_rate": 0.00019636458959356316, + "loss": 0.9307, + "step": 18 + }, + { + "epoch": 0.1216, + "grad_norm": 0.37554281242060195, + "learning_rate": 0.00019578789409784727, + "loss": 0.8146, + "step": 19 + }, + { + "epoch": 0.128, + "grad_norm": 0.38914086120007607, + "learning_rate": 0.00019516973750305532, + "loss": 0.8749, + "step": 20 + }, + { + "epoch": 0.1344, + "grad_norm": 0.36841739158575376, + "learning_rate": 0.00019451038737381077, + "loss": 0.8045, + "step": 21 + }, + { + "epoch": 0.1408, + "grad_norm": 0.3659882630147638, + "learning_rate": 0.00019381012910506146, + "loss": 0.8193, + "step": 22 + }, + { + "epoch": 0.1472, + "grad_norm": 0.3792566589980987, + "learning_rate": 0.00019306926579854821, + "loss": 0.8676, + "step": 23 + }, + { + "epoch": 0.1536, + "grad_norm": 0.3788031811405532, + "learning_rate": 0.0001922881181316097, + "loss": 0.8562, + "step": 24 + }, + { + "epoch": 0.16, + "grad_norm": 0.34891003346865745, + "learning_rate": 0.0001914670242183795, + "loss": 0.8594, + "step": 25 + }, + { + "epoch": 0.1664, + "grad_norm": 0.3736972128807295, + "learning_rate": 0.0001906063394634356, + "loss": 0.8575, + "step": 26 + }, + { + "epoch": 0.1728, + "grad_norm": 0.3334219023961695, + "learning_rate": 0.00018970643640796642, + "loss": 0.815, + "step": 27 + }, + { + "epoch": 0.1792, + "grad_norm": 0.3537731834400865, + "learning_rate": 0.00018876770456851877, + "loss": 0.7869, + "step": 28 + }, + { + "epoch": 0.1856, + "grad_norm": 0.3847612896751333, + "learning_rate": 0.00018779055026839868, + "loss": 0.8598, + "step": 29 + }, + { + "epoch": 0.192, + "grad_norm": 0.337348424209412, + "learning_rate": 0.00018677539646179707, + "loss": 0.7764, + "step": 30 + }, + { + "epoch": 0.1984, + "grad_norm": 0.35287051664332336, + "learning_rate": 0.00018572268255071718, + "loss": 0.8194, + "step": 31 + }, + { + "epoch": 0.2048, + "grad_norm": 0.3363048601214956, + "learning_rate": 0.00018463286419478255, + "loss": 0.7572, + "step": 32 + }, + { + "epoch": 0.2112, + "grad_norm": 0.3595884998735282, + "learning_rate": 0.00018350641311400812, + "loss": 0.8212, + "step": 33 + }, + { + "epoch": 0.2176, + "grad_norm": 0.3589845049590864, + "learning_rate": 0.00018234381688461942, + "loss": 0.8351, + "step": 34 + }, + { + "epoch": 0.224, + "grad_norm": 0.32468876356527243, + "learning_rate": 0.00018114557872800905, + "loss": 0.7845, + "step": 35 + }, + { + "epoch": 0.2304, + "grad_norm": 0.3282456892411546, + "learning_rate": 0.0001799122172929206, + "loss": 0.8261, + "step": 36 + }, + { + "epoch": 0.2368, + "grad_norm": 0.3660913696816195, + "learning_rate": 0.0001786442664309554, + "loss": 0.8152, + "step": 37 + }, + { + "epoch": 0.2432, + "grad_norm": 0.36806893017265807, + "learning_rate": 0.0001773422749654988, + "loss": 0.8252, + "step": 38 + }, + { + "epoch": 0.2496, + "grad_norm": 0.3456274494023383, + "learning_rate": 0.00017600680645416583, + "loss": 0.7714, + "step": 39 + }, + { + "epoch": 0.256, + "grad_norm": 0.335467839910172, + "learning_rate": 0.00017463843894486937, + "loss": 0.7899, + "step": 40 + }, + { + "epoch": 0.2624, + "grad_norm": 0.3206294504592128, + "learning_rate": 0.00017323776472561627, + "loss": 0.7523, + "step": 41 + }, + { + "epoch": 0.2688, + "grad_norm": 0.4305300228395664, + "learning_rate": 0.0001718053900681397, + "loss": 0.8279, + "step": 42 + }, + { + "epoch": 0.2752, + "grad_norm": 0.37912399345109443, + "learning_rate": 0.00017034193496547902, + "loss": 0.8045, + "step": 43 + }, + { + "epoch": 0.2816, + "grad_norm": 0.36339194511211986, + "learning_rate": 0.00016884803286362, + "loss": 0.797, + "step": 44 + }, + { + "epoch": 0.288, + "grad_norm": 0.5248417040381882, + "learning_rate": 0.00016732433038731242, + "loss": 0.8783, + "step": 45 + }, + { + "epoch": 0.2944, + "grad_norm": 0.3488749810196314, + "learning_rate": 0.00016577148706018328, + "loss": 0.8035, + "step": 46 + }, + { + "epoch": 0.3008, + "grad_norm": 0.3357970783896402, + "learning_rate": 0.00016419017501926656, + "loss": 0.8151, + "step": 47 + }, + { + "epoch": 0.3072, + "grad_norm": 0.3480514063611181, + "learning_rate": 0.00016258107872407375, + "loss": 0.8203, + "step": 48 + }, + { + "epoch": 0.3136, + "grad_norm": 0.3681154816060956, + "learning_rate": 0.00016094489466033043, + "loss": 0.8617, + "step": 49 + }, + { + "epoch": 0.32, + "grad_norm": 0.3358072390594222, + "learning_rate": 0.0001592823310385073, + "loss": 0.7818, + "step": 50 + }, + { + "epoch": 0.3264, + "grad_norm": 0.3479737624601583, + "learning_rate": 0.00015759410748727662, + "loss": 0.8262, + "step": 51 + }, + { + "epoch": 0.3328, + "grad_norm": 0.3316081314059503, + "learning_rate": 0.00015588095474202595, + "loss": 0.7526, + "step": 52 + }, + { + "epoch": 0.3392, + "grad_norm": 0.37147448547494943, + "learning_rate": 0.00015414361432856475, + "loss": 0.7727, + "step": 53 + }, + { + "epoch": 0.3456, + "grad_norm": 0.303285830093508, + "learning_rate": 0.00015238283824216015, + "loss": 0.7596, + "step": 54 + }, + { + "epoch": 0.352, + "grad_norm": 0.33875140638156126, + "learning_rate": 0.00015059938862204127, + "loss": 0.844, + "step": 55 + }, + { + "epoch": 0.3584, + "grad_norm": 0.34270498566617125, + "learning_rate": 0.00014879403742151283, + "loss": 0.7591, + "step": 56 + }, + { + "epoch": 0.3648, + "grad_norm": 0.3538160657726156, + "learning_rate": 0.0001469675660738206, + "loss": 0.8131, + "step": 57 + }, + { + "epoch": 0.3712, + "grad_norm": 0.335452971166759, + "learning_rate": 0.00014512076515391375, + "loss": 0.7826, + "step": 58 + }, + { + "epoch": 0.3776, + "grad_norm": 0.35385521526738917, + "learning_rate": 0.0001432544340362501, + "loss": 0.8304, + "step": 59 + }, + { + "epoch": 0.384, + "grad_norm": 0.31921207731876666, + "learning_rate": 0.00014136938054879283, + "loss": 0.7903, + "step": 60 + }, + { + "epoch": 0.3904, + "grad_norm": 0.3121946146036695, + "learning_rate": 0.00013946642062334766, + "loss": 0.7909, + "step": 61 + }, + { + "epoch": 0.3968, + "grad_norm": 0.3038170983694278, + "learning_rate": 0.000137546377942393, + "loss": 0.7746, + "step": 62 + }, + { + "epoch": 0.4032, + "grad_norm": 0.35674244978824016, + "learning_rate": 0.00013561008358255468, + "loss": 0.7855, + "step": 63 + }, + { + "epoch": 0.4096, + "grad_norm": 0.4191130455204281, + "learning_rate": 0.00013365837565488064, + "loss": 0.84, + "step": 64 + }, + { + "epoch": 0.416, + "grad_norm": 0.34596220764040214, + "learning_rate": 0.0001316920989420703, + "loss": 0.7996, + "step": 65 + }, + { + "epoch": 0.4224, + "grad_norm": 0.3126095224863489, + "learning_rate": 0.00012971210453281674, + "loss": 0.7536, + "step": 66 + }, + { + "epoch": 0.4288, + "grad_norm": 0.30503266549923264, + "learning_rate": 0.00012771924945341906, + "loss": 0.7911, + "step": 67 + }, + { + "epoch": 0.4352, + "grad_norm": 0.31065634553624866, + "learning_rate": 0.0001257143962968246, + "loss": 0.7432, + "step": 68 + }, + { + "epoch": 0.4416, + "grad_norm": 0.2913333062839636, + "learning_rate": 0.00012369841284926188, + "loss": 0.7101, + "step": 69 + }, + { + "epoch": 0.448, + "grad_norm": 0.3054011831296645, + "learning_rate": 0.00012167217171462566, + "loss": 0.8268, + "step": 70 + }, + { + "epoch": 0.4544, + "grad_norm": 0.3344018740472871, + "learning_rate": 0.00011963654993677645, + "loss": 0.7596, + "step": 71 + }, + { + "epoch": 0.4608, + "grad_norm": 0.3800228897057949, + "learning_rate": 0.00011759242861991855, + "loss": 0.8404, + "step": 72 + }, + { + "epoch": 0.4672, + "grad_norm": 0.33273799548180377, + "learning_rate": 0.00011554069254722051, + "loss": 0.8014, + "step": 73 + }, + { + "epoch": 0.4736, + "grad_norm": 0.30404580113092516, + "learning_rate": 0.00011348222979784289, + "loss": 0.7902, + "step": 74 + }, + { + "epoch": 0.48, + "grad_norm": 0.3308579272219948, + "learning_rate": 0.00011141793136253986, + "loss": 0.7864, + "step": 75 + }, + { + "epoch": 0.4864, + "grad_norm": 0.37985174493832174, + "learning_rate": 0.000109348690758, + "loss": 0.8043, + "step": 76 + }, + { + "epoch": 0.4928, + "grad_norm": 0.33269596150791886, + "learning_rate": 0.0001072754036400944, + "loss": 0.7424, + "step": 77 + }, + { + "epoch": 0.4992, + "grad_norm": 0.2725402036547681, + "learning_rate": 0.00010519896741619803, + "loss": 0.7238, + "step": 78 + }, + { + "epoch": 0.5056, + "grad_norm": 0.2962381633437675, + "learning_rate": 0.00010312028085675391, + "loss": 0.7314, + "step": 79 + }, + { + "epoch": 0.512, + "grad_norm": 0.3256654465787873, + "learning_rate": 0.00010104024370624644, + "loss": 0.7824, + "step": 80 + }, + { + "epoch": 0.5184, + "grad_norm": 0.29941414975909786, + "learning_rate": 9.895975629375359e-05, + "loss": 0.7469, + "step": 81 + }, + { + "epoch": 0.5248, + "grad_norm": 0.33592948893537333, + "learning_rate": 9.687971914324607e-05, + "loss": 0.7677, + "step": 82 + }, + { + "epoch": 0.5312, + "grad_norm": 0.30615101882806117, + "learning_rate": 9.480103258380198e-05, + "loss": 0.8134, + "step": 83 + }, + { + "epoch": 0.5376, + "grad_norm": 0.30955307869065124, + "learning_rate": 9.272459635990562e-05, + "loss": 0.7459, + "step": 84 + }, + { + "epoch": 0.544, + "grad_norm": 0.3091065912061399, + "learning_rate": 9.065130924199998e-05, + "loss": 0.7542, + "step": 85 + }, + { + "epoch": 0.5504, + "grad_norm": 0.3169309045909451, + "learning_rate": 8.858206863746018e-05, + "loss": 0.7851, + "step": 86 + }, + { + "epoch": 0.5568, + "grad_norm": 0.2923730971006948, + "learning_rate": 8.651777020215712e-05, + "loss": 0.7591, + "step": 87 + }, + { + "epoch": 0.5632, + "grad_norm": 0.33526977572554184, + "learning_rate": 8.445930745277953e-05, + "loss": 0.7355, + "step": 88 + }, + { + "epoch": 0.5696, + "grad_norm": 0.3050894393762434, + "learning_rate": 8.240757138008149e-05, + "loss": 0.7376, + "step": 89 + }, + { + "epoch": 0.576, + "grad_norm": 0.3445834949944767, + "learning_rate": 8.036345006322359e-05, + "loss": 0.811, + "step": 90 + }, + { + "epoch": 0.5824, + "grad_norm": 0.37838356459059586, + "learning_rate": 7.832782828537437e-05, + "loss": 0.8188, + "step": 91 + }, + { + "epoch": 0.5888, + "grad_norm": 0.3029282472334983, + "learning_rate": 7.630158715073813e-05, + "loss": 0.7477, + "step": 92 + }, + { + "epoch": 0.5952, + "grad_norm": 0.34607185935788465, + "learning_rate": 7.428560370317542e-05, + "loss": 0.8049, + "step": 93 + }, + { + "epoch": 0.6016, + "grad_norm": 0.3311291086559154, + "learning_rate": 7.228075054658096e-05, + "loss": 0.7643, + "step": 94 + }, + { + "epoch": 0.608, + "grad_norm": 0.31724066310492993, + "learning_rate": 7.028789546718326e-05, + "loss": 0.7581, + "step": 95 + }, + { + "epoch": 0.6144, + "grad_norm": 0.3209832103698408, + "learning_rate": 6.830790105792973e-05, + "loss": 0.8116, + "step": 96 + }, + { + "epoch": 0.6208, + "grad_norm": 0.3187970820609727, + "learning_rate": 6.63416243451194e-05, + "loss": 0.7537, + "step": 97 + }, + { + "epoch": 0.6272, + "grad_norm": 0.335328138754125, + "learning_rate": 6.43899164174453e-05, + "loss": 0.8079, + "step": 98 + }, + { + "epoch": 0.6336, + "grad_norm": 0.3592372773952761, + "learning_rate": 6.245362205760704e-05, + "loss": 0.803, + "step": 99 + }, + { + "epoch": 0.64, + "grad_norm": 0.3082878882980173, + "learning_rate": 6.053357937665237e-05, + "loss": 0.7554, + "step": 100 + }, + { + "epoch": 0.6464, + "grad_norm": 0.29065935607242344, + "learning_rate": 5.863061945120719e-05, + "loss": 0.7405, + "step": 101 + }, + { + "epoch": 0.6528, + "grad_norm": 0.3627304498422207, + "learning_rate": 5.6745565963749925e-05, + "loss": 0.815, + "step": 102 + }, + { + "epoch": 0.6592, + "grad_norm": 0.352388441861533, + "learning_rate": 5.487923484608629e-05, + "loss": 0.8045, + "step": 103 + }, + { + "epoch": 0.6656, + "grad_norm": 0.3027712417529716, + "learning_rate": 5.3032433926179395e-05, + "loss": 0.7331, + "step": 104 + }, + { + "epoch": 0.672, + "grad_norm": 0.3766636606932217, + "learning_rate": 5.1205962578487155e-05, + "loss": 0.7868, + "step": 105 + }, + { + "epoch": 0.6784, + "grad_norm": 0.3121526055286457, + "learning_rate": 4.940061137795876e-05, + "loss": 0.729, + "step": 106 + }, + { + "epoch": 0.6848, + "grad_norm": 0.3149302698562098, + "learning_rate": 4.761716175783989e-05, + "loss": 0.8027, + "step": 107 + }, + { + "epoch": 0.6912, + "grad_norm": 0.31556013440906944, + "learning_rate": 4.585638567143529e-05, + "loss": 0.8025, + "step": 108 + }, + { + "epoch": 0.6976, + "grad_norm": 0.39623874251487173, + "learning_rate": 4.411904525797408e-05, + "loss": 0.8117, + "step": 109 + }, + { + "epoch": 0.704, + "grad_norm": 0.296852741991719, + "learning_rate": 4.240589251272342e-05, + "loss": 0.7246, + "step": 110 + }, + { + "epoch": 0.7104, + "grad_norm": 0.32969648147687, + "learning_rate": 4.071766896149273e-05, + "loss": 0.7727, + "step": 111 + }, + { + "epoch": 0.7168, + "grad_norm": 0.29024538079430984, + "learning_rate": 3.9055105339669595e-05, + "loss": 0.7107, + "step": 112 + }, + { + "epoch": 0.7232, + "grad_norm": 0.30756514170597327, + "learning_rate": 3.741892127592625e-05, + "loss": 0.7812, + "step": 113 + }, + { + "epoch": 0.7296, + "grad_norm": 0.28057012148418414, + "learning_rate": 3.580982498073344e-05, + "loss": 0.6756, + "step": 114 + }, + { + "epoch": 0.736, + "grad_norm": 0.3128963704221699, + "learning_rate": 3.422851293981676e-05, + "loss": 0.7137, + "step": 115 + }, + { + "epoch": 0.7424, + "grad_norm": 0.31033894256136946, + "learning_rate": 3.2675669612687565e-05, + "loss": 0.7585, + "step": 116 + }, + { + "epoch": 0.7488, + "grad_norm": 0.2894764903995195, + "learning_rate": 3.115196713638e-05, + "loss": 0.7783, + "step": 117 + }, + { + "epoch": 0.7552, + "grad_norm": 0.37786140371536453, + "learning_rate": 2.9658065034520978e-05, + "loss": 0.8071, + "step": 118 + }, + { + "epoch": 0.7616, + "grad_norm": 0.3198328748759536, + "learning_rate": 2.8194609931860316e-05, + "loss": 0.7239, + "step": 119 + }, + { + "epoch": 0.768, + "grad_norm": 0.30767614800890697, + "learning_rate": 2.6762235274383772e-05, + "loss": 0.7363, + "step": 120 + }, + { + "epoch": 0.7744, + "grad_norm": 0.2712712834022847, + "learning_rate": 2.536156105513062e-05, + "loss": 0.7436, + "step": 121 + }, + { + "epoch": 0.7808, + "grad_norm": 0.379576246128743, + "learning_rate": 2.399319354583418e-05, + "loss": 0.8755, + "step": 122 + }, + { + "epoch": 0.7872, + "grad_norm": 0.36741356623291405, + "learning_rate": 2.265772503450122e-05, + "loss": 0.7485, + "step": 123 + }, + { + "epoch": 0.7936, + "grad_norm": 0.3946626921728505, + "learning_rate": 2.1355733569044635e-05, + "loss": 0.7333, + "step": 124 + }, + { + "epoch": 0.8, + "grad_norm": 0.340556183318827, + "learning_rate": 2.008778270707944e-05, + "loss": 0.7736, + "step": 125 + }, + { + "epoch": 0.8064, + "grad_norm": 0.3343972062233843, + "learning_rate": 1.8854421271990964e-05, + "loss": 0.7789, + "step": 126 + }, + { + "epoch": 0.8128, + "grad_norm": 0.3231610917304344, + "learning_rate": 1.7656183115380577e-05, + "loss": 0.6847, + "step": 127 + }, + { + "epoch": 0.8192, + "grad_norm": 0.2917533010139889, + "learning_rate": 1.649358688599191e-05, + "loss": 0.6881, + "step": 128 + }, + { + "epoch": 0.8256, + "grad_norm": 0.30993215869001783, + "learning_rate": 1.5367135805217458e-05, + "loss": 0.7259, + "step": 129 + }, + { + "epoch": 0.832, + "grad_norm": 0.31379134473465536, + "learning_rate": 1.4277317449282834e-05, + "loss": 0.7155, + "step": 130 + }, + { + "epoch": 0.8384, + "grad_norm": 0.34094723704701696, + "learning_rate": 1.3224603538202929e-05, + "loss": 0.7619, + "step": 131 + }, + { + "epoch": 0.8448, + "grad_norm": 0.3713863817615012, + "learning_rate": 1.220944973160133e-05, + "loss": 0.7855, + "step": 132 + }, + { + "epoch": 0.8512, + "grad_norm": 0.31240753593609405, + "learning_rate": 1.1232295431481222e-05, + "loss": 0.788, + "step": 133 + }, + { + "epoch": 0.8576, + "grad_norm": 0.30270241182524654, + "learning_rate": 1.0293563592033595e-05, + "loss": 0.75, + "step": 134 + }, + { + "epoch": 0.864, + "grad_norm": 0.295655653102901, + "learning_rate": 9.393660536564408e-06, + "loss": 0.7396, + "step": 135 + }, + { + "epoch": 0.8704, + "grad_norm": 0.28558293795235745, + "learning_rate": 8.532975781620512e-06, + "loss": 0.7301, + "step": 136 + }, + { + "epoch": 0.8768, + "grad_norm": 0.3520550476656149, + "learning_rate": 7.711881868390291e-06, + "loss": 0.8676, + "step": 137 + }, + { + "epoch": 0.8832, + "grad_norm": 0.29355394153082187, + "learning_rate": 6.930734201451816e-06, + "loss": 0.7446, + "step": 138 + }, + { + "epoch": 0.8896, + "grad_norm": 0.45225443281815797, + "learning_rate": 6.189870894938587e-06, + "loss": 0.7387, + "step": 139 + }, + { + "epoch": 0.896, + "grad_norm": 0.3417791223034832, + "learning_rate": 5.489612626189245e-06, + "loss": 0.7224, + "step": 140 + }, + { + "epoch": 0.9024, + "grad_norm": 0.3378646955827422, + "learning_rate": 4.830262496944693e-06, + "loss": 0.7994, + "step": 141 + }, + { + "epoch": 0.9088, + "grad_norm": 0.31479109212880263, + "learning_rate": 4.21210590215273e-06, + "loss": 0.6944, + "step": 142 + }, + { + "epoch": 0.9152, + "grad_norm": 0.3296702583400903, + "learning_rate": 3.6354104064368566e-06, + "loss": 0.7694, + "step": 143 + }, + { + "epoch": 0.9216, + "grad_norm": 0.40151803179348267, + "learning_rate": 3.100425628282899e-06, + "loss": 0.8046, + "step": 144 + }, + { + "epoch": 0.928, + "grad_norm": 0.3199316419603901, + "learning_rate": 2.607383131993424e-06, + "loss": 0.7664, + "step": 145 + }, + { + "epoch": 0.9344, + "grad_norm": 0.32793118837673924, + "learning_rate": 2.1564963274568027e-06, + "loss": 0.7785, + "step": 146 + }, + { + "epoch": 0.9408, + "grad_norm": 0.316461221508605, + "learning_rate": 1.7479603777742938e-06, + "loss": 0.7653, + "step": 147 + }, + { + "epoch": 0.9472, + "grad_norm": 0.29821877779102307, + "learning_rate": 1.3819521147851123e-06, + "loss": 0.7334, + "step": 148 + }, + { + "epoch": 0.9536, + "grad_norm": 0.2975960096717064, + "learning_rate": 1.05862996252597e-06, + "loss": 0.7078, + "step": 149 + }, + { + "epoch": 0.96, + "grad_norm": 0.3103506427392654, + "learning_rate": 7.781338686584927e-07, + "loss": 0.7623, + "step": 150 + }, + { + "epoch": 0.9664, + "grad_norm": 0.39376025834993006, + "learning_rate": 5.405852438937764e-07, + "loss": 0.7689, + "step": 151 + }, + { + "epoch": 0.9728, + "grad_norm": 0.3136092190540982, + "learning_rate": 3.4608690944071263e-07, + "loss": 0.7549, + "step": 152 + }, + { + "epoch": 0.9792, + "grad_norm": 0.3638008909987968, + "learning_rate": 1.947230525005006e-07, + "loss": 0.8375, + "step": 153 + }, + { + "epoch": 0.9856, + "grad_norm": 0.27970864537532564, + "learning_rate": 8.655918982689581e-08, + "loss": 0.719, + "step": 154 + }, + { + "epoch": 0.992, + "grad_norm": 0.3423685826931903, + "learning_rate": 2.164213936770576e-08, + "loss": 0.7382, + "step": 155 + }, + { + "epoch": 0.9984, + "grad_norm": 0.29697788326296587, + "learning_rate": 0.0, + "loss": 0.6871, + "step": 156 + }, + { + "epoch": 0.9984, + "step": 156, + "total_flos": 391611316568064.0, + "train_loss": 0.8044182276114439, + "train_runtime": 4779.7338, + "train_samples_per_second": 1.046, + "train_steps_per_second": 0.033 + } + ], + "logging_steps": 1.0, + "max_steps": 156, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 391611316568064.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_2_epochs_1_GA_2_lora/README.md b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_2_epochs_1_GA_2_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_2_epochs_1_GA_2_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_2_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_2_epochs_1_GA_2_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1d50a2d4ff590d638f38beed373bfb60235a3bc8 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_2_epochs_1_GA_2_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "v_proj", + "up_proj", + "q_proj", + "down_proj", + "gate_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5c16ca5213690cdc7c54be8186bafbe97a9e1b2e --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:760853fdc0fb5b855c3eb934a37009087db4445d5b71b78ae90a33afe305b742 +size 671150064 diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_2_epochs_1_GA_2_lora/config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_2_epochs_1_GA_2_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_2_epochs_1_GA_2_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..ad1ecd815b74efe7c3e59f287532d27ba3bff7df --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4f67e42a4d1bde1c7ca898b2b55103bbbbbef502a896e2df1b374fd0e203f55 +size 918507402 diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_2_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_2_epochs_1_GA_2_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f68f5c45b267cb39c080a3a4b6e96c1f158fc985 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_2_epochs_1_GA_2_lora/trainer_state.json @@ -0,0 +1,2226 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9984, + "eval_steps": 500, + "global_step": 312, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0032, + "grad_norm": 0.9403164467491822, + "learning_rate": 2e-05, + "loss": 1.326, + "step": 1 + }, + { + "epoch": 0.0064, + "grad_norm": 0.8161387978103016, + "learning_rate": 4e-05, + "loss": 1.2227, + "step": 2 + }, + { + "epoch": 0.0096, + "grad_norm": 0.7903570972127051, + "learning_rate": 6e-05, + "loss": 1.2749, + "step": 3 + }, + { + "epoch": 0.0128, + "grad_norm": 0.7372082384164766, + "learning_rate": 8e-05, + "loss": 1.2725, + "step": 4 + }, + { + "epoch": 0.016, + "grad_norm": 0.647136454087788, + "learning_rate": 0.0001, + "loss": 0.9416, + "step": 5 + }, + { + "epoch": 0.0192, + "grad_norm": 0.966111147557462, + "learning_rate": 0.00012, + "loss": 1.14, + "step": 6 + }, + { + "epoch": 0.0224, + "grad_norm": 0.8972547699219121, + "learning_rate": 0.00014, + "loss": 0.9772, + "step": 7 + }, + { + "epoch": 0.0256, + "grad_norm": 0.9556489947629087, + "learning_rate": 0.00016, + "loss": 1.136, + "step": 8 + }, + { + "epoch": 0.0288, + "grad_norm": 0.6566249511477352, + "learning_rate": 0.00018, + "loss": 0.9801, + "step": 9 + }, + { + "epoch": 0.032, + "grad_norm": 0.6088085260386118, + "learning_rate": 0.0002, + "loss": 0.8836, + "step": 10 + }, + { + "epoch": 0.0352, + "grad_norm": 0.49451119398317356, + "learning_rate": 0.00019999458931878073, + "loss": 0.8984, + "step": 11 + }, + { + "epoch": 0.0384, + "grad_norm": 0.4947685304541711, + "learning_rate": 0.0001999783578606323, + "loss": 0.8886, + "step": 12 + }, + { + "epoch": 0.0416, + "grad_norm": 0.5592752857560825, + "learning_rate": 0.00019995130738201966, + "loss": 0.9013, + "step": 13 + }, + { + "epoch": 0.0448, + "grad_norm": 0.6421409188740781, + "learning_rate": 0.0001999134408101731, + "loss": 0.9336, + "step": 14 + }, + { + "epoch": 0.048, + "grad_norm": 0.6746215654926132, + "learning_rate": 0.00019986476224277165, + "loss": 0.9473, + "step": 15 + }, + { + "epoch": 0.0512, + "grad_norm": 0.6261733851250406, + "learning_rate": 0.00019980527694749952, + "loss": 0.9251, + "step": 16 + }, + { + "epoch": 0.0544, + "grad_norm": 0.556355660919846, + "learning_rate": 0.00019973499136147606, + "loss": 0.846, + "step": 17 + }, + { + "epoch": 0.0576, + "grad_norm": 0.5333528169937837, + "learning_rate": 0.0001996539130905593, + "loss": 0.8921, + "step": 18 + }, + { + "epoch": 0.0608, + "grad_norm": 0.5948344826803486, + "learning_rate": 0.0001995620509085228, + "loss": 1.0135, + "step": 19 + }, + { + "epoch": 0.064, + "grad_norm": 0.5050731254706105, + "learning_rate": 0.00019945941475610623, + "loss": 0.8385, + "step": 20 + }, + { + "epoch": 0.0672, + "grad_norm": 0.49886787632077434, + "learning_rate": 0.0001993460157399396, + "loss": 0.9134, + "step": 21 + }, + { + "epoch": 0.0704, + "grad_norm": 0.4895502396158153, + "learning_rate": 0.0001992218661313415, + "loss": 0.8997, + "step": 22 + }, + { + "epoch": 0.0736, + "grad_norm": 0.5133303676414952, + "learning_rate": 0.00019908697936499103, + "loss": 0.9528, + "step": 23 + }, + { + "epoch": 0.0768, + "grad_norm": 0.5645289165267436, + "learning_rate": 0.00019894137003747403, + "loss": 0.8669, + "step": 24 + }, + { + "epoch": 0.08, + "grad_norm": 0.4771886806387586, + "learning_rate": 0.00019878505390570362, + "loss": 0.8335, + "step": 25 + }, + { + "epoch": 0.0832, + "grad_norm": 0.4785744263962845, + "learning_rate": 0.00019861804788521493, + "loss": 0.7468, + "step": 26 + }, + { + "epoch": 0.0864, + "grad_norm": 0.5817774914266006, + "learning_rate": 0.00019844037004833473, + "loss": 0.9147, + "step": 27 + }, + { + "epoch": 0.0896, + "grad_norm": 0.48793648235873005, + "learning_rate": 0.00019825203962222572, + "loss": 0.8854, + "step": 28 + }, + { + "epoch": 0.0928, + "grad_norm": 0.537552490827491, + "learning_rate": 0.0001980530769868059, + "loss": 0.8859, + "step": 29 + }, + { + "epoch": 0.096, + "grad_norm": 0.5220128682464678, + "learning_rate": 0.00019784350367254322, + "loss": 0.9127, + "step": 30 + }, + { + "epoch": 0.0992, + "grad_norm": 0.4380001036261557, + "learning_rate": 0.0001976233423581255, + "loss": 0.8505, + "step": 31 + }, + { + "epoch": 0.1024, + "grad_norm": 0.6010189197957251, + "learning_rate": 0.0001973926168680066, + "loss": 0.9349, + "step": 32 + }, + { + "epoch": 0.1056, + "grad_norm": 0.47811971240378376, + "learning_rate": 0.00019715135216982798, + "loss": 0.8648, + "step": 33 + }, + { + "epoch": 0.1088, + "grad_norm": 0.5132998311126512, + "learning_rate": 0.0001968995743717171, + "loss": 0.8049, + "step": 34 + }, + { + "epoch": 0.112, + "grad_norm": 0.5758538447495875, + "learning_rate": 0.00019663731071946206, + "loss": 0.9162, + "step": 35 + }, + { + "epoch": 0.1152, + "grad_norm": 0.7230922488004969, + "learning_rate": 0.00019636458959356316, + "loss": 0.9278, + "step": 36 + }, + { + "epoch": 0.1184, + "grad_norm": 0.7361533502274045, + "learning_rate": 0.0001960814405061619, + "loss": 0.8012, + "step": 37 + }, + { + "epoch": 0.1216, + "grad_norm": 0.44135926098043454, + "learning_rate": 0.00019578789409784727, + "loss": 0.8142, + "step": 38 + }, + { + "epoch": 0.1248, + "grad_norm": 0.5139493655462305, + "learning_rate": 0.00019548398213434007, + "loss": 0.9134, + "step": 39 + }, + { + "epoch": 0.128, + "grad_norm": 0.448488097956857, + "learning_rate": 0.00019516973750305532, + "loss": 0.8364, + "step": 40 + }, + { + "epoch": 0.1312, + "grad_norm": 1.000681699067436, + "learning_rate": 0.00019484519420954354, + "loss": 0.7512, + "step": 41 + }, + { + "epoch": 0.1344, + "grad_norm": 0.6650851792362262, + "learning_rate": 0.00019451038737381077, + "loss": 0.8665, + "step": 42 + }, + { + "epoch": 0.1376, + "grad_norm": 0.5589585903785106, + "learning_rate": 0.00019416535322651818, + "loss": 0.7782, + "step": 43 + }, + { + "epoch": 0.1408, + "grad_norm": 0.5661529623212822, + "learning_rate": 0.00019381012910506146, + "loss": 0.8537, + "step": 44 + }, + { + "epoch": 0.144, + "grad_norm": 0.5326090290505959, + "learning_rate": 0.00019344475344953012, + "loss": 0.902, + "step": 45 + }, + { + "epoch": 0.1472, + "grad_norm": 0.5441834018302, + "learning_rate": 0.00019306926579854821, + "loss": 0.8372, + "step": 46 + }, + { + "epoch": 0.1504, + "grad_norm": 0.5592781197360509, + "learning_rate": 0.00019268370678499533, + "loss": 0.9005, + "step": 47 + }, + { + "epoch": 0.1536, + "grad_norm": 0.5055356622946707, + "learning_rate": 0.0001922881181316097, + "loss": 0.819, + "step": 48 + }, + { + "epoch": 0.1568, + "grad_norm": 0.4494087343581663, + "learning_rate": 0.00019188254264647337, + "loss": 0.8433, + "step": 49 + }, + { + "epoch": 0.16, + "grad_norm": 0.5275668960943449, + "learning_rate": 0.0001914670242183795, + "loss": 0.8789, + "step": 50 + }, + { + "epoch": 0.1632, + "grad_norm": 0.6141329983651026, + "learning_rate": 0.0001910416078120832, + "loss": 0.8704, + "step": 51 + }, + { + "epoch": 0.1664, + "grad_norm": 0.47629567533869904, + "learning_rate": 0.0001906063394634356, + "loss": 0.845, + "step": 52 + }, + { + "epoch": 0.1696, + "grad_norm": 0.39833138970678816, + "learning_rate": 0.00019016126627440237, + "loss": 0.7865, + "step": 53 + }, + { + "epoch": 0.1728, + "grad_norm": 0.6735185416773903, + "learning_rate": 0.00018970643640796642, + "loss": 0.8505, + "step": 54 + }, + { + "epoch": 0.176, + "grad_norm": 0.46919563547695103, + "learning_rate": 0.000189241899082916, + "loss": 0.7927, + "step": 55 + }, + { + "epoch": 0.1792, + "grad_norm": 0.4804315793841404, + "learning_rate": 0.00018876770456851877, + "loss": 0.786, + "step": 56 + }, + { + "epoch": 0.1824, + "grad_norm": 0.5548874913743832, + "learning_rate": 0.0001882839041790818, + "loss": 0.9485, + "step": 57 + }, + { + "epoch": 0.1856, + "grad_norm": 0.5111549241658514, + "learning_rate": 0.00018779055026839868, + "loss": 0.7729, + "step": 58 + }, + { + "epoch": 0.1888, + "grad_norm": 0.5034137626819486, + "learning_rate": 0.00018728769622408423, + "loss": 0.7852, + "step": 59 + }, + { + "epoch": 0.192, + "grad_norm": 0.4488108829174529, + "learning_rate": 0.00018677539646179707, + "loss": 0.7742, + "step": 60 + }, + { + "epoch": 0.1952, + "grad_norm": 0.5188184982861153, + "learning_rate": 0.00018625370641935129, + "loss": 0.8249, + "step": 61 + }, + { + "epoch": 0.1984, + "grad_norm": 0.4365343224794975, + "learning_rate": 0.00018572268255071718, + "loss": 0.8146, + "step": 62 + }, + { + "epoch": 0.2016, + "grad_norm": 0.45200806267614635, + "learning_rate": 0.00018518238231991218, + "loss": 0.8113, + "step": 63 + }, + { + "epoch": 0.2048, + "grad_norm": 0.43789202866455496, + "learning_rate": 0.00018463286419478255, + "loss": 0.7082, + "step": 64 + }, + { + "epoch": 0.208, + "grad_norm": 0.5317837475598077, + "learning_rate": 0.00018407418764067627, + "loss": 0.8287, + "step": 65 + }, + { + "epoch": 0.2112, + "grad_norm": 0.4606302149091934, + "learning_rate": 0.00018350641311400812, + "loss": 0.8177, + "step": 66 + }, + { + "epoch": 0.2144, + "grad_norm": 0.4475068092534531, + "learning_rate": 0.0001829296020557174, + "loss": 0.8476, + "step": 67 + }, + { + "epoch": 0.2176, + "grad_norm": 0.5527007239826185, + "learning_rate": 0.00018234381688461942, + "loss": 0.8322, + "step": 68 + }, + { + "epoch": 0.2208, + "grad_norm": 0.42955105675771005, + "learning_rate": 0.0001817491209906506, + "loss": 0.7941, + "step": 69 + }, + { + "epoch": 0.224, + "grad_norm": 0.5103288402468691, + "learning_rate": 0.00018114557872800905, + "loss": 0.7992, + "step": 70 + }, + { + "epoch": 0.2272, + "grad_norm": 0.7192929984424484, + "learning_rate": 0.00018053325540819045, + "loss": 0.9048, + "step": 71 + }, + { + "epoch": 0.2304, + "grad_norm": 0.4309551951903274, + "learning_rate": 0.0001799122172929206, + "loss": 0.7482, + "step": 72 + }, + { + "epoch": 0.2336, + "grad_norm": 0.524905218715058, + "learning_rate": 0.00017928253158698473, + "loss": 0.8054, + "step": 73 + }, + { + "epoch": 0.2368, + "grad_norm": 0.5183750960618888, + "learning_rate": 0.0001786442664309554, + "loss": 0.8357, + "step": 74 + }, + { + "epoch": 0.24, + "grad_norm": 0.4974939734697588, + "learning_rate": 0.0001779974908938184, + "loss": 0.8283, + "step": 75 + }, + { + "epoch": 0.2432, + "grad_norm": 0.4793129645486592, + "learning_rate": 0.0001773422749654988, + "loss": 0.8369, + "step": 76 + }, + { + "epoch": 0.2464, + "grad_norm": 0.46697413868878285, + "learning_rate": 0.00017667868954928694, + "loss": 0.8139, + "step": 77 + }, + { + "epoch": 0.2496, + "grad_norm": 0.47135290687461023, + "learning_rate": 0.00017600680645416583, + "loss": 0.7284, + "step": 78 + }, + { + "epoch": 0.2528, + "grad_norm": 0.4994984531684765, + "learning_rate": 0.00017532669838704035, + "loss": 0.8283, + "step": 79 + }, + { + "epoch": 0.256, + "grad_norm": 0.4171800017001343, + "learning_rate": 0.00017463843894486937, + "loss": 0.7675, + "step": 80 + }, + { + "epoch": 0.2592, + "grad_norm": 0.42595149156295664, + "learning_rate": 0.0001739421026067017, + "loss": 0.7655, + "step": 81 + }, + { + "epoch": 0.2624, + "grad_norm": 0.482499526895452, + "learning_rate": 0.00017323776472561627, + "loss": 0.746, + "step": 82 + }, + { + "epoch": 0.2656, + "grad_norm": 0.5259500639822324, + "learning_rate": 0.00017252550152056795, + "loss": 0.8482, + "step": 83 + }, + { + "epoch": 0.2688, + "grad_norm": 0.6245585780675597, + "learning_rate": 0.0001718053900681397, + "loss": 0.8193, + "step": 84 + }, + { + "epoch": 0.272, + "grad_norm": 0.5027327496646725, + "learning_rate": 0.00017107750829420176, + "loss": 0.7842, + "step": 85 + }, + { + "epoch": 0.2752, + "grad_norm": 0.5114834099652439, + "learning_rate": 0.00017034193496547902, + "loss": 0.8276, + "step": 86 + }, + { + "epoch": 0.2784, + "grad_norm": 0.43072609121932287, + "learning_rate": 0.00016959874968102735, + "loss": 0.7218, + "step": 87 + }, + { + "epoch": 0.2816, + "grad_norm": 0.564639105906531, + "learning_rate": 0.00016884803286362, + "loss": 0.8817, + "step": 88 + }, + { + "epoch": 0.2848, + "grad_norm": 0.604682876111702, + "learning_rate": 0.00016808986575104465, + "loss": 0.9515, + "step": 89 + }, + { + "epoch": 0.288, + "grad_norm": 0.48198587224648937, + "learning_rate": 0.00016732433038731242, + "loss": 0.8297, + "step": 90 + }, + { + "epoch": 0.2912, + "grad_norm": 0.6003370287960951, + "learning_rate": 0.0001665515096137797, + "loss": 0.8569, + "step": 91 + }, + { + "epoch": 0.2944, + "grad_norm": 0.3842765293637917, + "learning_rate": 0.00016577148706018328, + "loss": 0.7724, + "step": 92 + }, + { + "epoch": 0.2976, + "grad_norm": 0.4002523327211069, + "learning_rate": 0.00016498434713559088, + "loss": 0.7597, + "step": 93 + }, + { + "epoch": 0.3008, + "grad_norm": 0.48400032239723845, + "learning_rate": 0.00016419017501926656, + "loss": 0.878, + "step": 94 + }, + { + "epoch": 0.304, + "grad_norm": 0.5542732995154189, + "learning_rate": 0.0001633890566514535, + "loss": 0.8849, + "step": 95 + }, + { + "epoch": 0.3072, + "grad_norm": 0.4261167404490184, + "learning_rate": 0.00016258107872407375, + "loss": 0.7697, + "step": 96 + }, + { + "epoch": 0.3104, + "grad_norm": 0.5643656966798687, + "learning_rate": 0.0001617663286713474, + "loss": 0.9232, + "step": 97 + }, + { + "epoch": 0.3136, + "grad_norm": 0.4206152573714265, + "learning_rate": 0.00016094489466033043, + "loss": 0.8167, + "step": 98 + }, + { + "epoch": 0.3168, + "grad_norm": 0.4278369117950475, + "learning_rate": 0.00016011686558137448, + "loss": 0.8039, + "step": 99 + }, + { + "epoch": 0.32, + "grad_norm": 0.4309359068500558, + "learning_rate": 0.0001592823310385073, + "loss": 0.7743, + "step": 100 + }, + { + "epoch": 0.3232, + "grad_norm": 0.40943336312292444, + "learning_rate": 0.0001584413813397364, + "loss": 0.8352, + "step": 101 + }, + { + "epoch": 0.3264, + "grad_norm": 0.5359876660658226, + "learning_rate": 0.00015759410748727662, + "loss": 0.822, + "step": 102 + }, + { + "epoch": 0.3296, + "grad_norm": 0.4466159127813941, + "learning_rate": 0.00015674060116770236, + "loss": 0.6908, + "step": 103 + }, + { + "epoch": 0.3328, + "grad_norm": 0.49271795050235634, + "learning_rate": 0.00015588095474202595, + "loss": 0.8305, + "step": 104 + }, + { + "epoch": 0.336, + "grad_norm": 0.5525175573731309, + "learning_rate": 0.00015501526123570277, + "loss": 0.8152, + "step": 105 + }, + { + "epoch": 0.3392, + "grad_norm": 0.5210246095031947, + "learning_rate": 0.00015414361432856475, + "loss": 0.7472, + "step": 106 + }, + { + "epoch": 0.3424, + "grad_norm": 0.4420260279231557, + "learning_rate": 0.0001532661083446829, + "loss": 0.7831, + "step": 107 + }, + { + "epoch": 0.3456, + "grad_norm": 0.6487323183372163, + "learning_rate": 0.00015238283824216015, + "loss": 0.7425, + "step": 108 + }, + { + "epoch": 0.3488, + "grad_norm": 0.4699987779256609, + "learning_rate": 0.00015149389960285558, + "loss": 0.8933, + "step": 109 + }, + { + "epoch": 0.352, + "grad_norm": 0.4592059242516297, + "learning_rate": 0.00015059938862204127, + "loss": 0.8075, + "step": 110 + }, + { + "epoch": 0.3552, + "grad_norm": 0.42880936536633296, + "learning_rate": 0.00014969940209799248, + "loss": 0.7539, + "step": 111 + }, + { + "epoch": 0.3584, + "grad_norm": 0.46857951495905303, + "learning_rate": 0.00014879403742151283, + "loss": 0.7717, + "step": 112 + }, + { + "epoch": 0.3616, + "grad_norm": 0.5012514304043896, + "learning_rate": 0.00014788339256539544, + "loss": 0.8425, + "step": 113 + }, + { + "epoch": 0.3648, + "grad_norm": 0.479922785247191, + "learning_rate": 0.0001469675660738206, + "loss": 0.8044, + "step": 114 + }, + { + "epoch": 0.368, + "grad_norm": 0.4661734763422208, + "learning_rate": 0.00014604665705169237, + "loss": 0.8304, + "step": 115 + }, + { + "epoch": 0.3712, + "grad_norm": 0.46281454027918595, + "learning_rate": 0.00014512076515391375, + "loss": 0.7486, + "step": 116 + }, + { + "epoch": 0.3744, + "grad_norm": 0.43443054674776366, + "learning_rate": 0.00014418999057460276, + "loss": 0.7757, + "step": 117 + }, + { + "epoch": 0.3776, + "grad_norm": 0.5549461110214192, + "learning_rate": 0.0001432544340362501, + "loss": 0.8918, + "step": 118 + }, + { + "epoch": 0.3808, + "grad_norm": 0.450064419920354, + "learning_rate": 0.00014231419677881966, + "loss": 0.7735, + "step": 119 + }, + { + "epoch": 0.384, + "grad_norm": 0.4380240394994644, + "learning_rate": 0.00014136938054879283, + "loss": 0.8189, + "step": 120 + }, + { + "epoch": 0.3872, + "grad_norm": 0.4455704564544761, + "learning_rate": 0.00014042008758815818, + "loss": 0.7848, + "step": 121 + }, + { + "epoch": 0.3904, + "grad_norm": 0.4265943986788346, + "learning_rate": 0.00013946642062334766, + "loss": 0.8118, + "step": 122 + }, + { + "epoch": 0.3936, + "grad_norm": 0.4387296129649349, + "learning_rate": 0.00013850848285411994, + "loss": 0.7984, + "step": 123 + }, + { + "epoch": 0.3968, + "grad_norm": 0.4082806194667678, + "learning_rate": 0.000137546377942393, + "loss": 0.7666, + "step": 124 + }, + { + "epoch": 0.4, + "grad_norm": 0.397887808109905, + "learning_rate": 0.00013658021000102636, + "loss": 0.7781, + "step": 125 + }, + { + "epoch": 0.4032, + "grad_norm": 0.5063382696810116, + "learning_rate": 0.00013561008358255468, + "loss": 0.8107, + "step": 126 + }, + { + "epoch": 0.4064, + "grad_norm": 0.5402999745501336, + "learning_rate": 0.00013463610366787392, + "loss": 0.8604, + "step": 127 + }, + { + "epoch": 0.4096, + "grad_norm": 0.5511794660615774, + "learning_rate": 0.00013365837565488064, + "loss": 0.8264, + "step": 128 + }, + { + "epoch": 0.4128, + "grad_norm": 0.4049254874424734, + "learning_rate": 0.0001326770053470668, + "loss": 0.7375, + "step": 129 + }, + { + "epoch": 0.416, + "grad_norm": 0.6030364224349011, + "learning_rate": 0.0001316920989420703, + "loss": 0.8861, + "step": 130 + }, + { + "epoch": 0.4192, + "grad_norm": 0.5268547885056347, + "learning_rate": 0.00013070376302018287, + "loss": 0.7992, + "step": 131 + }, + { + "epoch": 0.4224, + "grad_norm": 0.4038438055064155, + "learning_rate": 0.00012971210453281674, + "loss": 0.7215, + "step": 132 + }, + { + "epoch": 0.4256, + "grad_norm": 0.5467774429109289, + "learning_rate": 0.000128717230790931, + "loss": 0.8325, + "step": 133 + }, + { + "epoch": 0.4288, + "grad_norm": 0.4181719546030265, + "learning_rate": 0.00012771924945341906, + "loss": 0.7641, + "step": 134 + }, + { + "epoch": 0.432, + "grad_norm": 0.46070085076217804, + "learning_rate": 0.00012671826851545851, + "loss": 0.7321, + "step": 135 + }, + { + "epoch": 0.4352, + "grad_norm": 0.4580405416596679, + "learning_rate": 0.0001257143962968246, + "loss": 0.7703, + "step": 136 + }, + { + "epoch": 0.4384, + "grad_norm": 0.40519612037378994, + "learning_rate": 0.00012470774143016853, + "loss": 0.7263, + "step": 137 + }, + { + "epoch": 0.4416, + "grad_norm": 0.41765952251996086, + "learning_rate": 0.00012369841284926188, + "loss": 0.7021, + "step": 138 + }, + { + "epoch": 0.4448, + "grad_norm": 0.3745934515144171, + "learning_rate": 0.00012268651977720866, + "loss": 0.7374, + "step": 139 + }, + { + "epoch": 0.448, + "grad_norm": 0.5127091782649509, + "learning_rate": 0.00012167217171462566, + "loss": 0.9169, + "step": 140 + }, + { + "epoch": 0.4512, + "grad_norm": 0.48491962580869874, + "learning_rate": 0.0001206554784277931, + "loss": 0.7599, + "step": 141 + }, + { + "epoch": 0.4544, + "grad_norm": 0.49383390158049373, + "learning_rate": 0.00011963654993677645, + "loss": 0.7598, + "step": 142 + }, + { + "epoch": 0.4576, + "grad_norm": 0.5197953411631441, + "learning_rate": 0.00011861549650352069, + "loss": 0.8641, + "step": 143 + }, + { + "epoch": 0.4608, + "grad_norm": 0.5454779674561641, + "learning_rate": 0.00011759242861991855, + "loss": 0.8128, + "step": 144 + }, + { + "epoch": 0.464, + "grad_norm": 0.4313475681156491, + "learning_rate": 0.00011656745699585371, + "loss": 0.7707, + "step": 145 + }, + { + "epoch": 0.4672, + "grad_norm": 0.5132205432582659, + "learning_rate": 0.00011554069254722051, + "loss": 0.8376, + "step": 146 + }, + { + "epoch": 0.4704, + "grad_norm": 0.3982886336613627, + "learning_rate": 0.00011451224638392129, + "loss": 0.7012, + "step": 147 + }, + { + "epoch": 0.4736, + "grad_norm": 0.44730572163431054, + "learning_rate": 0.00011348222979784289, + "loss": 0.8852, + "step": 148 + }, + { + "epoch": 0.4768, + "grad_norm": 0.48304105246350115, + "learning_rate": 0.00011245075425081328, + "loss": 0.7548, + "step": 149 + }, + { + "epoch": 0.48, + "grad_norm": 0.43379715260938845, + "learning_rate": 0.00011141793136253986, + "loss": 0.8249, + "step": 150 + }, + { + "epoch": 0.4832, + "grad_norm": 0.3722269585690899, + "learning_rate": 0.0001103838728985307, + "loss": 0.7763, + "step": 151 + }, + { + "epoch": 0.4864, + "grad_norm": 0.6015833868175456, + "learning_rate": 0.000109348690758, + "loss": 0.8335, + "step": 152 + }, + { + "epoch": 0.4896, + "grad_norm": 0.45155148311203713, + "learning_rate": 0.00010831249696175918, + "loss": 0.7608, + "step": 153 + }, + { + "epoch": 0.4928, + "grad_norm": 0.4323587816287022, + "learning_rate": 0.0001072754036400944, + "loss": 0.7324, + "step": 154 + }, + { + "epoch": 0.496, + "grad_norm": 0.39303924688851216, + "learning_rate": 0.00010623752302063283, + "loss": 0.7209, + "step": 155 + }, + { + "epoch": 0.4992, + "grad_norm": 0.43511898322913245, + "learning_rate": 0.00010519896741619803, + "loss": 0.7365, + "step": 156 + }, + { + "epoch": 0.5024, + "grad_norm": 0.4498897672241139, + "learning_rate": 0.00010415984921265609, + "loss": 0.7935, + "step": 157 + }, + { + "epoch": 0.5056, + "grad_norm": 0.47264276687337253, + "learning_rate": 0.00010312028085675391, + "loss": 0.6799, + "step": 158 + }, + { + "epoch": 0.5088, + "grad_norm": 0.509864229696816, + "learning_rate": 0.00010208037484395114, + "loss": 0.8467, + "step": 159 + }, + { + "epoch": 0.512, + "grad_norm": 0.45543437884234866, + "learning_rate": 0.00010104024370624644, + "loss": 0.7217, + "step": 160 + }, + { + "epoch": 0.5152, + "grad_norm": 0.44808316462676906, + "learning_rate": 0.0001, + "loss": 0.7794, + "step": 161 + }, + { + "epoch": 0.5184, + "grad_norm": 0.35941205649478886, + "learning_rate": 9.895975629375359e-05, + "loss": 0.7158, + "step": 162 + }, + { + "epoch": 0.5216, + "grad_norm": 0.45781756180347105, + "learning_rate": 9.791962515604887e-05, + "loss": 0.7577, + "step": 163 + }, + { + "epoch": 0.5248, + "grad_norm": 0.4379183406476885, + "learning_rate": 9.687971914324607e-05, + "loss": 0.7761, + "step": 164 + }, + { + "epoch": 0.528, + "grad_norm": 0.4190665599840034, + "learning_rate": 9.584015078734395e-05, + "loss": 0.8133, + "step": 165 + }, + { + "epoch": 0.5312, + "grad_norm": 0.4362902610839825, + "learning_rate": 9.480103258380198e-05, + "loss": 0.8272, + "step": 166 + }, + { + "epoch": 0.5344, + "grad_norm": 0.4151590081592175, + "learning_rate": 9.376247697936719e-05, + "loss": 0.7042, + "step": 167 + }, + { + "epoch": 0.5376, + "grad_norm": 0.45458142730658896, + "learning_rate": 9.272459635990562e-05, + "loss": 0.8005, + "step": 168 + }, + { + "epoch": 0.5408, + "grad_norm": 0.4458988263161724, + "learning_rate": 9.168750303824084e-05, + "loss": 0.7927, + "step": 169 + }, + { + "epoch": 0.544, + "grad_norm": 0.42688999981693, + "learning_rate": 9.065130924199998e-05, + "loss": 0.7168, + "step": 170 + }, + { + "epoch": 0.5472, + "grad_norm": 0.48919574950425926, + "learning_rate": 8.961612710146934e-05, + "loss": 0.7535, + "step": 171 + }, + { + "epoch": 0.5504, + "grad_norm": 0.45653322268010754, + "learning_rate": 8.858206863746018e-05, + "loss": 0.8149, + "step": 172 + }, + { + "epoch": 0.5536, + "grad_norm": 0.4572002897224282, + "learning_rate": 8.754924574918675e-05, + "loss": 0.7722, + "step": 173 + }, + { + "epoch": 0.5568, + "grad_norm": 0.40101980297192197, + "learning_rate": 8.651777020215712e-05, + "loss": 0.7546, + "step": 174 + }, + { + "epoch": 0.56, + "grad_norm": 0.48413738730953726, + "learning_rate": 8.548775361607872e-05, + "loss": 0.717, + "step": 175 + }, + { + "epoch": 0.5632, + "grad_norm": 0.42046000647002596, + "learning_rate": 8.445930745277953e-05, + "loss": 0.7465, + "step": 176 + }, + { + "epoch": 0.5664, + "grad_norm": 0.3975721995046092, + "learning_rate": 8.343254300414628e-05, + "loss": 0.7072, + "step": 177 + }, + { + "epoch": 0.5696, + "grad_norm": 0.44807879727459876, + "learning_rate": 8.240757138008149e-05, + "loss": 0.7657, + "step": 178 + }, + { + "epoch": 0.5728, + "grad_norm": 0.4340176534043155, + "learning_rate": 8.138450349647936e-05, + "loss": 0.7796, + "step": 179 + }, + { + "epoch": 0.576, + "grad_norm": 0.5286008620789443, + "learning_rate": 8.036345006322359e-05, + "loss": 0.8444, + "step": 180 + }, + { + "epoch": 0.5792, + "grad_norm": 0.4898308649785636, + "learning_rate": 7.934452157220694e-05, + "loss": 0.8207, + "step": 181 + }, + { + "epoch": 0.5824, + "grad_norm": 0.47228683476043576, + "learning_rate": 7.832782828537437e-05, + "loss": 0.8161, + "step": 182 + }, + { + "epoch": 0.5856, + "grad_norm": 0.445910528188009, + "learning_rate": 7.731348022279134e-05, + "loss": 0.7627, + "step": 183 + }, + { + "epoch": 0.5888, + "grad_norm": 0.5839084167816913, + "learning_rate": 7.630158715073813e-05, + "loss": 0.7343, + "step": 184 + }, + { + "epoch": 0.592, + "grad_norm": 0.5419196487076774, + "learning_rate": 7.52922585698315e-05, + "loss": 0.8361, + "step": 185 + }, + { + "epoch": 0.5952, + "grad_norm": 0.434670829219984, + "learning_rate": 7.428560370317542e-05, + "loss": 0.7691, + "step": 186 + }, + { + "epoch": 0.5984, + "grad_norm": 0.46226934049123536, + "learning_rate": 7.328173148454151e-05, + "loss": 0.7731, + "step": 187 + }, + { + "epoch": 0.6016, + "grad_norm": 0.4331061643516525, + "learning_rate": 7.228075054658096e-05, + "loss": 0.7479, + "step": 188 + }, + { + "epoch": 0.6048, + "grad_norm": 0.454834956540033, + "learning_rate": 7.1282769209069e-05, + "loss": 0.793, + "step": 189 + }, + { + "epoch": 0.608, + "grad_norm": 0.4121086759968263, + "learning_rate": 7.028789546718326e-05, + "loss": 0.7219, + "step": 190 + }, + { + "epoch": 0.6112, + "grad_norm": 0.45158172523113455, + "learning_rate": 6.929623697981718e-05, + "loss": 0.8289, + "step": 191 + }, + { + "epoch": 0.6144, + "grad_norm": 0.4263714476961183, + "learning_rate": 6.830790105792973e-05, + "loss": 0.7898, + "step": 192 + }, + { + "epoch": 0.6176, + "grad_norm": 0.5238655375104982, + "learning_rate": 6.732299465293322e-05, + "loss": 0.7748, + "step": 193 + }, + { + "epoch": 0.6208, + "grad_norm": 0.42185923206259196, + "learning_rate": 6.63416243451194e-05, + "loss": 0.7328, + "step": 194 + }, + { + "epoch": 0.624, + "grad_norm": 0.441024424461854, + "learning_rate": 6.536389633212609e-05, + "loss": 0.825, + "step": 195 + }, + { + "epoch": 0.6272, + "grad_norm": 0.49032435666200364, + "learning_rate": 6.43899164174453e-05, + "loss": 0.7963, + "step": 196 + }, + { + "epoch": 0.6304, + "grad_norm": 0.5039392484271568, + "learning_rate": 6.341978999897365e-05, + "loss": 0.8126, + "step": 197 + }, + { + "epoch": 0.6336, + "grad_norm": 0.49666330579733103, + "learning_rate": 6.245362205760704e-05, + "loss": 0.7889, + "step": 198 + }, + { + "epoch": 0.6368, + "grad_norm": 0.45539269569468943, + "learning_rate": 6.149151714588009e-05, + "loss": 0.7439, + "step": 199 + }, + { + "epoch": 0.64, + "grad_norm": 0.3681914738253727, + "learning_rate": 6.053357937665237e-05, + "loss": 0.7601, + "step": 200 + }, + { + "epoch": 0.6432, + "grad_norm": 0.3879745595310054, + "learning_rate": 5.957991241184184e-05, + "loss": 0.7215, + "step": 201 + }, + { + "epoch": 0.6464, + "grad_norm": 0.559525163793432, + "learning_rate": 5.863061945120719e-05, + "loss": 0.7523, + "step": 202 + }, + { + "epoch": 0.6496, + "grad_norm": 0.5639402939049178, + "learning_rate": 5.768580322118034e-05, + "loss": 0.8384, + "step": 203 + }, + { + "epoch": 0.6528, + "grad_norm": 0.42349218652727627, + "learning_rate": 5.6745565963749925e-05, + "loss": 0.7876, + "step": 204 + }, + { + "epoch": 0.656, + "grad_norm": 0.49184743437053013, + "learning_rate": 5.5810009425397294e-05, + "loss": 0.8181, + "step": 205 + }, + { + "epoch": 0.6592, + "grad_norm": 0.5089572330169558, + "learning_rate": 5.487923484608629e-05, + "loss": 0.7941, + "step": 206 + }, + { + "epoch": 0.6624, + "grad_norm": 0.419386781205736, + "learning_rate": 5.395334294830765e-05, + "loss": 0.7303, + "step": 207 + }, + { + "epoch": 0.6656, + "grad_norm": 0.4816466559542788, + "learning_rate": 5.3032433926179395e-05, + "loss": 0.732, + "step": 208 + }, + { + "epoch": 0.6688, + "grad_norm": 0.5222937607988991, + "learning_rate": 5.211660743460458e-05, + "loss": 0.7668, + "step": 209 + }, + { + "epoch": 0.672, + "grad_norm": 0.5027928990810613, + "learning_rate": 5.1205962578487155e-05, + "loss": 0.8038, + "step": 210 + }, + { + "epoch": 0.6752, + "grad_norm": 0.4220864972323868, + "learning_rate": 5.030059790200756e-05, + "loss": 0.7285, + "step": 211 + }, + { + "epoch": 0.6784, + "grad_norm": 0.4176007443523416, + "learning_rate": 4.940061137795876e-05, + "loss": 0.7271, + "step": 212 + }, + { + "epoch": 0.6816, + "grad_norm": 0.45515802194139304, + "learning_rate": 4.850610039714444e-05, + "loss": 0.7978, + "step": 213 + }, + { + "epoch": 0.6848, + "grad_norm": 0.4577351921329997, + "learning_rate": 4.761716175783989e-05, + "loss": 0.7971, + "step": 214 + }, + { + "epoch": 0.688, + "grad_norm": 0.3790064243314445, + "learning_rate": 4.673389165531714e-05, + "loss": 0.7556, + "step": 215 + }, + { + "epoch": 0.6912, + "grad_norm": 0.4983988032374861, + "learning_rate": 4.585638567143529e-05, + "loss": 0.8468, + "step": 216 + }, + { + "epoch": 0.6944, + "grad_norm": 0.5379087413391773, + "learning_rate": 4.498473876429726e-05, + "loss": 0.7967, + "step": 217 + }, + { + "epoch": 0.6976, + "grad_norm": 0.5147776717401932, + "learning_rate": 4.411904525797408e-05, + "loss": 0.8165, + "step": 218 + }, + { + "epoch": 0.7008, + "grad_norm": 0.6249784194430409, + "learning_rate": 4.325939883229766e-05, + "loss": 0.732, + "step": 219 + }, + { + "epoch": 0.704, + "grad_norm": 0.41001673871650374, + "learning_rate": 4.240589251272342e-05, + "loss": 0.712, + "step": 220 + }, + { + "epoch": 0.7072, + "grad_norm": 0.625582050090508, + "learning_rate": 4.155861866026364e-05, + "loss": 0.7917, + "step": 221 + }, + { + "epoch": 0.7104, + "grad_norm": 0.39828725874660814, + "learning_rate": 4.071766896149273e-05, + "loss": 0.7441, + "step": 222 + }, + { + "epoch": 0.7136, + "grad_norm": 0.3819450698806339, + "learning_rate": 3.988313441862553e-05, + "loss": 0.6571, + "step": 223 + }, + { + "epoch": 0.7168, + "grad_norm": 0.3990032101593105, + "learning_rate": 3.9055105339669595e-05, + "loss": 0.7529, + "step": 224 + }, + { + "epoch": 0.72, + "grad_norm": 0.4350507667639973, + "learning_rate": 3.823367132865265e-05, + "loss": 0.8304, + "step": 225 + }, + { + "epoch": 0.7232, + "grad_norm": 0.3978367820930633, + "learning_rate": 3.741892127592625e-05, + "loss": 0.7225, + "step": 226 + }, + { + "epoch": 0.7264, + "grad_norm": 0.40257043569078077, + "learning_rate": 3.6610943348546526e-05, + "loss": 0.6622, + "step": 227 + }, + { + "epoch": 0.7296, + "grad_norm": 0.3700010193562475, + "learning_rate": 3.580982498073344e-05, + "loss": 0.6825, + "step": 228 + }, + { + "epoch": 0.7328, + "grad_norm": 0.469112884691295, + "learning_rate": 3.501565286440914e-05, + "loss": 0.7108, + "step": 229 + }, + { + "epoch": 0.736, + "grad_norm": 0.3645240671438613, + "learning_rate": 3.422851293981676e-05, + "loss": 0.6997, + "step": 230 + }, + { + "epoch": 0.7392, + "grad_norm": 0.43547382362805426, + "learning_rate": 3.3448490386220355e-05, + "loss": 0.7498, + "step": 231 + }, + { + "epoch": 0.7424, + "grad_norm": 0.5039311910226827, + "learning_rate": 3.2675669612687565e-05, + "loss": 0.7603, + "step": 232 + }, + { + "epoch": 0.7456, + "grad_norm": 0.36654507387112795, + "learning_rate": 3.191013424895536e-05, + "loss": 0.7533, + "step": 233 + }, + { + "epoch": 0.7488, + "grad_norm": 0.4369475913914052, + "learning_rate": 3.115196713638e-05, + "loss": 0.7923, + "step": 234 + }, + { + "epoch": 0.752, + "grad_norm": 0.4053219846239178, + "learning_rate": 3.040125031897264e-05, + "loss": 0.7163, + "step": 235 + }, + { + "epoch": 0.7552, + "grad_norm": 0.6110807512044899, + "learning_rate": 2.9658065034520978e-05, + "loss": 0.8785, + "step": 236 + }, + { + "epoch": 0.7584, + "grad_norm": 0.4700108092984455, + "learning_rate": 2.892249170579826e-05, + "loss": 0.7624, + "step": 237 + }, + { + "epoch": 0.7616, + "grad_norm": 0.430086120412855, + "learning_rate": 2.8194609931860316e-05, + "loss": 0.6837, + "step": 238 + }, + { + "epoch": 0.7648, + "grad_norm": 0.4471230873786409, + "learning_rate": 2.7474498479432087e-05, + "loss": 0.743, + "step": 239 + }, + { + "epoch": 0.768, + "grad_norm": 0.38842437757365395, + "learning_rate": 2.6762235274383772e-05, + "loss": 0.7212, + "step": 240 + }, + { + "epoch": 0.7712, + "grad_norm": 0.3880894085413885, + "learning_rate": 2.6057897393298324e-05, + "loss": 0.7728, + "step": 241 + }, + { + "epoch": 0.7744, + "grad_norm": 0.3550252609641288, + "learning_rate": 2.536156105513062e-05, + "loss": 0.6983, + "step": 242 + }, + { + "epoch": 0.7776, + "grad_norm": 0.5742894720748866, + "learning_rate": 2.4673301612959654e-05, + "loss": 0.9021, + "step": 243 + }, + { + "epoch": 0.7808, + "grad_norm": 0.4756840750985434, + "learning_rate": 2.399319354583418e-05, + "loss": 0.8286, + "step": 244 + }, + { + "epoch": 0.784, + "grad_norm": 0.4746175037861587, + "learning_rate": 2.3321310450713062e-05, + "loss": 0.755, + "step": 245 + }, + { + "epoch": 0.7872, + "grad_norm": 0.5305745533741653, + "learning_rate": 2.265772503450122e-05, + "loss": 0.7264, + "step": 246 + }, + { + "epoch": 0.7904, + "grad_norm": 0.6745686774384193, + "learning_rate": 2.2002509106181624e-05, + "loss": 0.7858, + "step": 247 + }, + { + "epoch": 0.7936, + "grad_norm": 0.3991822202289771, + "learning_rate": 2.1355733569044635e-05, + "loss": 0.6646, + "step": 248 + }, + { + "epoch": 0.7968, + "grad_norm": 0.4300627097278966, + "learning_rate": 2.0717468413015283e-05, + "loss": 0.7243, + "step": 249 + }, + { + "epoch": 0.8, + "grad_norm": 0.6221113967090829, + "learning_rate": 2.008778270707944e-05, + "loss": 0.8133, + "step": 250 + }, + { + "epoch": 0.8032, + "grad_norm": 0.4396383168090041, + "learning_rate": 1.946674459180955e-05, + "loss": 0.7512, + "step": 251 + }, + { + "epoch": 0.8064, + "grad_norm": 0.4322242291580685, + "learning_rate": 1.8854421271990964e-05, + "loss": 0.7927, + "step": 252 + }, + { + "epoch": 0.8096, + "grad_norm": 0.4881464046761018, + "learning_rate": 1.8250879009349398e-05, + "loss": 0.7112, + "step": 253 + }, + { + "epoch": 0.8128, + "grad_norm": 0.3635967883127235, + "learning_rate": 1.7656183115380577e-05, + "loss": 0.6488, + "step": 254 + }, + { + "epoch": 0.816, + "grad_norm": 0.9162631115013509, + "learning_rate": 1.707039794428259e-05, + "loss": 0.6927, + "step": 255 + }, + { + "epoch": 0.8192, + "grad_norm": 0.4232183438208216, + "learning_rate": 1.649358688599191e-05, + "loss": 0.675, + "step": 256 + }, + { + "epoch": 0.8224, + "grad_norm": 0.3983414780249098, + "learning_rate": 1.5925812359323745e-05, + "loss": 0.6749, + "step": 257 + }, + { + "epoch": 0.8256, + "grad_norm": 0.43736649086026275, + "learning_rate": 1.5367135805217458e-05, + "loss": 0.7566, + "step": 258 + }, + { + "epoch": 0.8288, + "grad_norm": 0.38352977687355905, + "learning_rate": 1.4817617680087825e-05, + "loss": 0.638, + "step": 259 + }, + { + "epoch": 0.832, + "grad_norm": 0.4929938245776581, + "learning_rate": 1.4277317449282834e-05, + "loss": 0.7796, + "step": 260 + }, + { + "epoch": 0.8352, + "grad_norm": 0.47575598281648185, + "learning_rate": 1.3746293580648717e-05, + "loss": 0.7288, + "step": 261 + }, + { + "epoch": 0.8384, + "grad_norm": 0.44743746064919565, + "learning_rate": 1.3224603538202929e-05, + "loss": 0.7818, + "step": 262 + }, + { + "epoch": 0.8416, + "grad_norm": 0.4277467149482736, + "learning_rate": 1.2712303775915802e-05, + "loss": 0.761, + "step": 263 + }, + { + "epoch": 0.8448, + "grad_norm": 0.4111108176793066, + "learning_rate": 1.220944973160133e-05, + "loss": 0.7931, + "step": 264 + }, + { + "epoch": 0.848, + "grad_norm": 0.38627127454224836, + "learning_rate": 1.1716095820918216e-05, + "loss": 0.7401, + "step": 265 + }, + { + "epoch": 0.8512, + "grad_norm": 0.4865842859255661, + "learning_rate": 1.1232295431481222e-05, + "loss": 0.8224, + "step": 266 + }, + { + "epoch": 0.8544, + "grad_norm": 0.43387943324067996, + "learning_rate": 1.0758100917083991e-05, + "loss": 0.7584, + "step": 267 + }, + { + "epoch": 0.8576, + "grad_norm": 0.3877846450634093, + "learning_rate": 1.0293563592033595e-05, + "loss": 0.7267, + "step": 268 + }, + { + "epoch": 0.8608, + "grad_norm": 0.3751685403252838, + "learning_rate": 9.838733725597615e-06, + "loss": 0.7147, + "step": 269 + }, + { + "epoch": 0.864, + "grad_norm": 0.43956537906372245, + "learning_rate": 9.393660536564408e-06, + "loss": 0.7505, + "step": 270 + }, + { + "epoch": 0.8672, + "grad_norm": 0.39081466236870016, + "learning_rate": 8.958392187916841e-06, + "loss": 0.7204, + "step": 271 + }, + { + "epoch": 0.8704, + "grad_norm": 0.40751146184111137, + "learning_rate": 8.532975781620512e-06, + "loss": 0.7196, + "step": 272 + }, + { + "epoch": 0.8736, + "grad_norm": 0.5113127556918922, + "learning_rate": 8.117457353526625e-06, + "loss": 0.8143, + "step": 273 + }, + { + "epoch": 0.8768, + "grad_norm": 0.49028884149407453, + "learning_rate": 7.711881868390291e-06, + "loss": 0.901, + "step": 274 + }, + { + "epoch": 0.88, + "grad_norm": 0.3963954743021353, + "learning_rate": 7.3162932150046885e-06, + "loss": 0.749, + "step": 275 + }, + { + "epoch": 0.8832, + "grad_norm": 0.43387737277821176, + "learning_rate": 6.930734201451816e-06, + "loss": 0.7181, + "step": 276 + }, + { + "epoch": 0.8864, + "grad_norm": 0.4257288320928217, + "learning_rate": 6.555246550469907e-06, + "loss": 0.7162, + "step": 277 + }, + { + "epoch": 0.8896, + "grad_norm": 0.45149432731324496, + "learning_rate": 6.189870894938587e-06, + "loss": 0.7434, + "step": 278 + }, + { + "epoch": 0.8928, + "grad_norm": 0.39795054274931096, + "learning_rate": 5.834646773481811e-06, + "loss": 0.7313, + "step": 279 + }, + { + "epoch": 0.896, + "grad_norm": 0.5787298763731419, + "learning_rate": 5.489612626189245e-06, + "loss": 0.6926, + "step": 280 + }, + { + "epoch": 0.8992, + "grad_norm": 0.41569392614976874, + "learning_rate": 5.154805790456485e-06, + "loss": 0.8113, + "step": 281 + }, + { + "epoch": 0.9024, + "grad_norm": 0.5286309729383197, + "learning_rate": 4.830262496944693e-06, + "loss": 0.7696, + "step": 282 + }, + { + "epoch": 0.9056, + "grad_norm": 0.41586498970579483, + "learning_rate": 4.516017865659949e-06, + "loss": 0.7173, + "step": 283 + }, + { + "epoch": 0.9088, + "grad_norm": 0.40098365613370385, + "learning_rate": 4.21210590215273e-06, + "loss": 0.654, + "step": 284 + }, + { + "epoch": 0.912, + "grad_norm": 0.45275196123906825, + "learning_rate": 3.918559493838114e-06, + "loss": 0.7096, + "step": 285 + }, + { + "epoch": 0.9152, + "grad_norm": 0.4704592086843827, + "learning_rate": 3.6354104064368566e-06, + "loss": 0.8154, + "step": 286 + }, + { + "epoch": 0.9184, + "grad_norm": 0.6155616558939484, + "learning_rate": 3.3626892805379562e-06, + "loss": 0.7761, + "step": 287 + }, + { + "epoch": 0.9216, + "grad_norm": 0.5371537043394504, + "learning_rate": 3.100425628282899e-06, + "loss": 0.8077, + "step": 288 + }, + { + "epoch": 0.9248, + "grad_norm": 0.3704992794499012, + "learning_rate": 2.848647830172024e-06, + "loss": 0.6935, + "step": 289 + }, + { + "epoch": 0.928, + "grad_norm": 0.43499353149792624, + "learning_rate": 2.607383131993424e-06, + "loss": 0.8202, + "step": 290 + }, + { + "epoch": 0.9312, + "grad_norm": 0.4404320324404543, + "learning_rate": 2.3766576418745022e-06, + "loss": 0.7615, + "step": 291 + }, + { + "epoch": 0.9344, + "grad_norm": 0.4656663216499453, + "learning_rate": 2.1564963274568027e-06, + "loss": 0.7776, + "step": 292 + }, + { + "epoch": 0.9376, + "grad_norm": 0.37652168273299075, + "learning_rate": 1.9469230131940907e-06, + "loss": 0.6871, + "step": 293 + }, + { + "epoch": 0.9408, + "grad_norm": 0.4839018460466476, + "learning_rate": 1.7479603777742938e-06, + "loss": 0.8256, + "step": 294 + }, + { + "epoch": 0.944, + "grad_norm": 0.37824459079415657, + "learning_rate": 1.559629951665298e-06, + "loss": 0.7259, + "step": 295 + }, + { + "epoch": 0.9472, + "grad_norm": 0.45051573211287965, + "learning_rate": 1.3819521147851123e-06, + "loss": 0.7285, + "step": 296 + }, + { + "epoch": 0.9504, + "grad_norm": 0.39407823652788115, + "learning_rate": 1.2149460942964098e-06, + "loss": 0.7209, + "step": 297 + }, + { + "epoch": 0.9536, + "grad_norm": 0.44125661559905305, + "learning_rate": 1.05862996252597e-06, + "loss": 0.6752, + "step": 298 + }, + { + "epoch": 0.9568, + "grad_norm": 0.41703806836884816, + "learning_rate": 9.130206350089765e-07, + "loss": 0.7589, + "step": 299 + }, + { + "epoch": 0.96, + "grad_norm": 0.44452416210889956, + "learning_rate": 7.781338686584927e-07, + "loss": 0.7459, + "step": 300 + }, + { + "epoch": 0.9632, + "grad_norm": 0.5897734862202882, + "learning_rate": 6.539842600603918e-07, + "loss": 0.7962, + "step": 301 + }, + { + "epoch": 0.9664, + "grad_norm": 0.47808881340202486, + "learning_rate": 5.405852438937764e-07, + "loss": 0.7262, + "step": 302 + }, + { + "epoch": 0.9696, + "grad_norm": 0.4422678402673698, + "learning_rate": 4.3794909147720773e-07, + "loss": 0.7511, + "step": 303 + }, + { + "epoch": 0.9728, + "grad_norm": 0.44324451607132576, + "learning_rate": 3.4608690944071263e-07, + "loss": 0.7456, + "step": 304 + }, + { + "epoch": 0.976, + "grad_norm": 0.5271824627724633, + "learning_rate": 2.6500863852395584e-07, + "loss": 0.7581, + "step": 305 + }, + { + "epoch": 0.9792, + "grad_norm": 0.5194352897373036, + "learning_rate": 1.947230525005006e-07, + "loss": 0.9021, + "step": 306 + }, + { + "epoch": 0.9824, + "grad_norm": 0.39981380158689, + "learning_rate": 1.3523775722834587e-07, + "loss": 0.7602, + "step": 307 + }, + { + "epoch": 0.9856, + "grad_norm": 0.4864393541236497, + "learning_rate": 8.655918982689581e-08, + "loss": 0.6679, + "step": 308 + }, + { + "epoch": 0.9888, + "grad_norm": 0.44453238041721077, + "learning_rate": 4.8692617980350406e-08, + "loss": 0.702, + "step": 309 + }, + { + "epoch": 0.992, + "grad_norm": 0.49408675694214305, + "learning_rate": 2.164213936770576e-08, + "loss": 0.7702, + "step": 310 + }, + { + "epoch": 0.9952, + "grad_norm": 0.42048527475207637, + "learning_rate": 5.410681219286673e-09, + "loss": 0.6854, + "step": 311 + }, + { + "epoch": 0.9984, + "grad_norm": 0.43465444254337887, + "learning_rate": 0.0, + "loss": 0.6674, + "step": 312 + }, + { + "epoch": 0.9984, + "step": 312, + "total_flos": 268297746972672.0, + "train_loss": 0.799005626867979, + "train_runtime": 4811.5388, + "train_samples_per_second": 1.039, + "train_steps_per_second": 0.065 + } + ], + "logging_steps": 1.0, + "max_steps": 312, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 268297746972672.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_2_epochs_1_GA_4_lora/README.md b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_2_epochs_1_GA_4_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_2_epochs_1_GA_4_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_2_epochs_1_GA_4_lora/adapter_config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_2_epochs_1_GA_4_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..25928ca95c5fd6abf34f0d48986c6bab205e802c --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_2_epochs_1_GA_4_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "up_proj", + "q_proj", + "o_proj", + "v_proj", + "k_proj", + "down_proj", + "gate_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_2_epochs_1_GA_4_lora/adapter_model.safetensors b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_2_epochs_1_GA_4_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..389fb1b8eee43f98021b7a0f22821aa2e5049075 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_2_epochs_1_GA_4_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7530b7382ce3933dd24c6111408fd392a68b67efc64148e67fc49598d846871a +size 671150064 diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_2_epochs_1_GA_4_lora/config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_2_epochs_1_GA_4_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_2_epochs_1_GA_4_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_2_epochs_1_GA_4_lora/non_lora_trainables.bin b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_2_epochs_1_GA_4_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..a73d0d68280885eaf1500b8af9523f36b09f8789 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_2_epochs_1_GA_4_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:31a054c6255fa4ba538f90a94104809531922320e7d190502cc2784e2efc5194 +size 918507402 diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_2_epochs_1_GA_4_lora/trainer_state.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_2_epochs_1_GA_4_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c31db710bff612687ef4e2b1d2fb7913a00f2beb --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_2_epochs_1_GA_4_lora/trainer_state.json @@ -0,0 +1,1134 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9984, + "eval_steps": 500, + "global_step": 156, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0064, + "grad_norm": 0.8035340793505722, + "learning_rate": 4e-05, + "loss": 1.2744, + "step": 1 + }, + { + "epoch": 0.0128, + "grad_norm": 0.8326815210726537, + "learning_rate": 8e-05, + "loss": 1.356, + "step": 2 + }, + { + "epoch": 0.0192, + "grad_norm": 0.6048062185696337, + "learning_rate": 0.00012, + "loss": 1.1985, + "step": 3 + }, + { + "epoch": 0.0256, + "grad_norm": 0.7065372561442537, + "learning_rate": 0.00016, + "loss": 1.1624, + "step": 4 + }, + { + "epoch": 0.032, + "grad_norm": 0.791038287770007, + "learning_rate": 0.0002, + "loss": 1.0353, + "step": 5 + }, + { + "epoch": 0.0384, + "grad_norm": 0.5926212931857436, + "learning_rate": 0.0001999783578606323, + "loss": 0.9565, + "step": 6 + }, + { + "epoch": 0.0448, + "grad_norm": 0.4961935514321317, + "learning_rate": 0.0001999134408101731, + "loss": 0.9528, + "step": 7 + }, + { + "epoch": 0.0512, + "grad_norm": 0.5927044662113193, + "learning_rate": 0.00019980527694749952, + "loss": 0.9635, + "step": 8 + }, + { + "epoch": 0.0576, + "grad_norm": 0.41203424704440733, + "learning_rate": 0.0001996539130905593, + "loss": 0.894, + "step": 9 + }, + { + "epoch": 0.064, + "grad_norm": 0.48187851159803763, + "learning_rate": 0.00019945941475610623, + "loss": 0.9567, + "step": 10 + }, + { + "epoch": 0.0704, + "grad_norm": 0.4577696835046481, + "learning_rate": 0.0001992218661313415, + "loss": 0.931, + "step": 11 + }, + { + "epoch": 0.0768, + "grad_norm": 0.5392759187031103, + "learning_rate": 0.00019894137003747403, + "loss": 0.9288, + "step": 12 + }, + { + "epoch": 0.0832, + "grad_norm": 0.3962306342126496, + "learning_rate": 0.00019861804788521493, + "loss": 0.8091, + "step": 13 + }, + { + "epoch": 0.0896, + "grad_norm": 0.45163495803519205, + "learning_rate": 0.00019825203962222572, + "loss": 0.9157, + "step": 14 + }, + { + "epoch": 0.096, + "grad_norm": 0.4039255911085676, + "learning_rate": 0.00019784350367254322, + "loss": 0.9128, + "step": 15 + }, + { + "epoch": 0.1024, + "grad_norm": 0.4144274624343591, + "learning_rate": 0.0001973926168680066, + "loss": 0.907, + "step": 16 + }, + { + "epoch": 0.1088, + "grad_norm": 0.39407070351038004, + "learning_rate": 0.0001968995743717171, + "loss": 0.8452, + "step": 17 + }, + { + "epoch": 0.1152, + "grad_norm": 0.43548916333524695, + "learning_rate": 0.00019636458959356316, + "loss": 0.9298, + "step": 18 + }, + { + "epoch": 0.1216, + "grad_norm": 0.38724088031902365, + "learning_rate": 0.00019578789409784727, + "loss": 0.8181, + "step": 19 + }, + { + "epoch": 0.128, + "grad_norm": 0.3811996227647024, + "learning_rate": 0.00019516973750305532, + "loss": 0.8756, + "step": 20 + }, + { + "epoch": 0.1344, + "grad_norm": 0.3720832384145608, + "learning_rate": 0.00019451038737381077, + "loss": 0.8054, + "step": 21 + }, + { + "epoch": 0.1408, + "grad_norm": 0.37162449995068403, + "learning_rate": 0.00019381012910506146, + "loss": 0.8206, + "step": 22 + }, + { + "epoch": 0.1472, + "grad_norm": 0.37721195841803645, + "learning_rate": 0.00019306926579854821, + "loss": 0.8668, + "step": 23 + }, + { + "epoch": 0.1536, + "grad_norm": 0.3781960709473528, + "learning_rate": 0.0001922881181316097, + "loss": 0.8555, + "step": 24 + }, + { + "epoch": 0.16, + "grad_norm": 0.34823275392631015, + "learning_rate": 0.0001914670242183795, + "loss": 0.8595, + "step": 25 + }, + { + "epoch": 0.1664, + "grad_norm": 0.3697283311475832, + "learning_rate": 0.0001906063394634356, + "loss": 0.8577, + "step": 26 + }, + { + "epoch": 0.1728, + "grad_norm": 0.3279726724431966, + "learning_rate": 0.00018970643640796642, + "loss": 0.8154, + "step": 27 + }, + { + "epoch": 0.1792, + "grad_norm": 0.3361144174375379, + "learning_rate": 0.00018876770456851877, + "loss": 0.787, + "step": 28 + }, + { + "epoch": 0.1856, + "grad_norm": 0.3789000717657513, + "learning_rate": 0.00018779055026839868, + "loss": 0.8598, + "step": 29 + }, + { + "epoch": 0.192, + "grad_norm": 0.33683848816432094, + "learning_rate": 0.00018677539646179707, + "loss": 0.7757, + "step": 30 + }, + { + "epoch": 0.1984, + "grad_norm": 0.35834671597383877, + "learning_rate": 0.00018572268255071718, + "loss": 0.8197, + "step": 31 + }, + { + "epoch": 0.2048, + "grad_norm": 0.3326729551946847, + "learning_rate": 0.00018463286419478255, + "loss": 0.7574, + "step": 32 + }, + { + "epoch": 0.2112, + "grad_norm": 0.34995163009170244, + "learning_rate": 0.00018350641311400812, + "loss": 0.8209, + "step": 33 + }, + { + "epoch": 0.2176, + "grad_norm": 0.3639572220576174, + "learning_rate": 0.00018234381688461942, + "loss": 0.8355, + "step": 34 + }, + { + "epoch": 0.224, + "grad_norm": 0.32212467900780717, + "learning_rate": 0.00018114557872800905, + "loss": 0.7841, + "step": 35 + }, + { + "epoch": 0.2304, + "grad_norm": 0.32501947399021053, + "learning_rate": 0.0001799122172929206, + "loss": 0.8259, + "step": 36 + }, + { + "epoch": 0.2368, + "grad_norm": 0.38201116102721167, + "learning_rate": 0.0001786442664309554, + "loss": 0.8169, + "step": 37 + }, + { + "epoch": 0.2432, + "grad_norm": 0.3626434293467508, + "learning_rate": 0.0001773422749654988, + "loss": 0.8258, + "step": 38 + }, + { + "epoch": 0.2496, + "grad_norm": 0.34313519341303317, + "learning_rate": 0.00017600680645416583, + "loss": 0.7715, + "step": 39 + }, + { + "epoch": 0.256, + "grad_norm": 0.3275583265760916, + "learning_rate": 0.00017463843894486937, + "loss": 0.79, + "step": 40 + }, + { + "epoch": 0.2624, + "grad_norm": 0.3191062587367316, + "learning_rate": 0.00017323776472561627, + "loss": 0.7519, + "step": 41 + }, + { + "epoch": 0.2688, + "grad_norm": 0.4092994506893867, + "learning_rate": 0.0001718053900681397, + "loss": 0.8274, + "step": 42 + }, + { + "epoch": 0.2752, + "grad_norm": 0.3695189858901441, + "learning_rate": 0.00017034193496547902, + "loss": 0.806, + "step": 43 + }, + { + "epoch": 0.2816, + "grad_norm": 0.3545162554280589, + "learning_rate": 0.00016884803286362, + "loss": 0.7963, + "step": 44 + }, + { + "epoch": 0.288, + "grad_norm": 0.4324311848015332, + "learning_rate": 0.00016732433038731242, + "loss": 0.88, + "step": 45 + }, + { + "epoch": 0.2944, + "grad_norm": 0.35465287211803115, + "learning_rate": 0.00016577148706018328, + "loss": 0.804, + "step": 46 + }, + { + "epoch": 0.3008, + "grad_norm": 0.31942958758251727, + "learning_rate": 0.00016419017501926656, + "loss": 0.8166, + "step": 47 + }, + { + "epoch": 0.3072, + "grad_norm": 0.3365577635582448, + "learning_rate": 0.00016258107872407375, + "loss": 0.8196, + "step": 48 + }, + { + "epoch": 0.3136, + "grad_norm": 0.36006440509046755, + "learning_rate": 0.00016094489466033043, + "loss": 0.8598, + "step": 49 + }, + { + "epoch": 0.32, + "grad_norm": 0.3306811147622879, + "learning_rate": 0.0001592823310385073, + "loss": 0.783, + "step": 50 + }, + { + "epoch": 0.3264, + "grad_norm": 0.34268843518010145, + "learning_rate": 0.00015759410748727662, + "loss": 0.8273, + "step": 51 + }, + { + "epoch": 0.3328, + "grad_norm": 0.3158941962722818, + "learning_rate": 0.00015588095474202595, + "loss": 0.7514, + "step": 52 + }, + { + "epoch": 0.3392, + "grad_norm": 0.3731863427097849, + "learning_rate": 0.00015414361432856475, + "loss": 0.7711, + "step": 53 + }, + { + "epoch": 0.3456, + "grad_norm": 0.31209125072987703, + "learning_rate": 0.00015238283824216015, + "loss": 0.76, + "step": 54 + }, + { + "epoch": 0.352, + "grad_norm": 0.3448081778380014, + "learning_rate": 0.00015059938862204127, + "loss": 0.8421, + "step": 55 + }, + { + "epoch": 0.3584, + "grad_norm": 0.3451816731920408, + "learning_rate": 0.00014879403742151283, + "loss": 0.7579, + "step": 56 + }, + { + "epoch": 0.3648, + "grad_norm": 0.3840857009752893, + "learning_rate": 0.0001469675660738206, + "loss": 0.8128, + "step": 57 + }, + { + "epoch": 0.3712, + "grad_norm": 0.33758751388577785, + "learning_rate": 0.00014512076515391375, + "loss": 0.7834, + "step": 58 + }, + { + "epoch": 0.3776, + "grad_norm": 0.3695050281343937, + "learning_rate": 0.0001432544340362501, + "loss": 0.8295, + "step": 59 + }, + { + "epoch": 0.384, + "grad_norm": 0.3346439954641722, + "learning_rate": 0.00014136938054879283, + "loss": 0.7894, + "step": 60 + }, + { + "epoch": 0.3904, + "grad_norm": 0.31930767512743163, + "learning_rate": 0.00013946642062334766, + "loss": 0.7925, + "step": 61 + }, + { + "epoch": 0.3968, + "grad_norm": 0.3019269522549601, + "learning_rate": 0.000137546377942393, + "loss": 0.7783, + "step": 62 + }, + { + "epoch": 0.4032, + "grad_norm": 0.34031256857467895, + "learning_rate": 0.00013561008358255468, + "loss": 0.7858, + "step": 63 + }, + { + "epoch": 0.4096, + "grad_norm": 0.42760295970656664, + "learning_rate": 0.00013365837565488064, + "loss": 0.8397, + "step": 64 + }, + { + "epoch": 0.416, + "grad_norm": 0.3606486382014007, + "learning_rate": 0.0001316920989420703, + "loss": 0.8028, + "step": 65 + }, + { + "epoch": 0.4224, + "grad_norm": 0.3159985053530957, + "learning_rate": 0.00012971210453281674, + "loss": 0.7535, + "step": 66 + }, + { + "epoch": 0.4288, + "grad_norm": 0.3154305870908761, + "learning_rate": 0.00012771924945341906, + "loss": 0.7915, + "step": 67 + }, + { + "epoch": 0.4352, + "grad_norm": 0.3291007330811774, + "learning_rate": 0.0001257143962968246, + "loss": 0.7444, + "step": 68 + }, + { + "epoch": 0.4416, + "grad_norm": 0.29839013247373436, + "learning_rate": 0.00012369841284926188, + "loss": 0.7099, + "step": 69 + }, + { + "epoch": 0.448, + "grad_norm": 0.31924199456699764, + "learning_rate": 0.00012167217171462566, + "loss": 0.8262, + "step": 70 + }, + { + "epoch": 0.4544, + "grad_norm": 0.3548228413411494, + "learning_rate": 0.00011963654993677645, + "loss": 0.7585, + "step": 71 + }, + { + "epoch": 0.4608, + "grad_norm": 0.40046011439240137, + "learning_rate": 0.00011759242861991855, + "loss": 0.8403, + "step": 72 + }, + { + "epoch": 0.4672, + "grad_norm": 0.33654136487192543, + "learning_rate": 0.00011554069254722051, + "loss": 0.801, + "step": 73 + }, + { + "epoch": 0.4736, + "grad_norm": 0.31460927305651093, + "learning_rate": 0.00011348222979784289, + "loss": 0.7911, + "step": 74 + }, + { + "epoch": 0.48, + "grad_norm": 0.3549394122392794, + "learning_rate": 0.00011141793136253986, + "loss": 0.7862, + "step": 75 + }, + { + "epoch": 0.4864, + "grad_norm": 0.38241670589324783, + "learning_rate": 0.000109348690758, + "loss": 0.8062, + "step": 76 + }, + { + "epoch": 0.4928, + "grad_norm": 0.34489555449239506, + "learning_rate": 0.0001072754036400944, + "loss": 0.7434, + "step": 77 + }, + { + "epoch": 0.4992, + "grad_norm": 0.2806231949464588, + "learning_rate": 0.00010519896741619803, + "loss": 0.7229, + "step": 78 + }, + { + "epoch": 0.5056, + "grad_norm": 0.3000699983392725, + "learning_rate": 0.00010312028085675391, + "loss": 0.7321, + "step": 79 + }, + { + "epoch": 0.512, + "grad_norm": 0.33595212091928367, + "learning_rate": 0.00010104024370624644, + "loss": 0.7825, + "step": 80 + }, + { + "epoch": 0.5184, + "grad_norm": 0.2995860860695928, + "learning_rate": 9.895975629375359e-05, + "loss": 0.7472, + "step": 81 + }, + { + "epoch": 0.5248, + "grad_norm": 0.33327316911841876, + "learning_rate": 9.687971914324607e-05, + "loss": 0.7664, + "step": 82 + }, + { + "epoch": 0.5312, + "grad_norm": 0.364902553727858, + "learning_rate": 9.480103258380198e-05, + "loss": 0.8141, + "step": 83 + }, + { + "epoch": 0.5376, + "grad_norm": 0.30391859609685723, + "learning_rate": 9.272459635990562e-05, + "loss": 0.7451, + "step": 84 + }, + { + "epoch": 0.544, + "grad_norm": 0.30695551868865256, + "learning_rate": 9.065130924199998e-05, + "loss": 0.7542, + "step": 85 + }, + { + "epoch": 0.5504, + "grad_norm": 0.3227497614839023, + "learning_rate": 8.858206863746018e-05, + "loss": 0.785, + "step": 86 + }, + { + "epoch": 0.5568, + "grad_norm": 0.2990974753627751, + "learning_rate": 8.651777020215712e-05, + "loss": 0.7599, + "step": 87 + }, + { + "epoch": 0.5632, + "grad_norm": 0.32988199154175873, + "learning_rate": 8.445930745277953e-05, + "loss": 0.7347, + "step": 88 + }, + { + "epoch": 0.5696, + "grad_norm": 0.32116937919851646, + "learning_rate": 8.240757138008149e-05, + "loss": 0.7402, + "step": 89 + }, + { + "epoch": 0.576, + "grad_norm": 0.3398716441196792, + "learning_rate": 8.036345006322359e-05, + "loss": 0.8113, + "step": 90 + }, + { + "epoch": 0.5824, + "grad_norm": 0.3701808895751357, + "learning_rate": 7.832782828537437e-05, + "loss": 0.8209, + "step": 91 + }, + { + "epoch": 0.5888, + "grad_norm": 0.3172929447749312, + "learning_rate": 7.630158715073813e-05, + "loss": 0.7494, + "step": 92 + }, + { + "epoch": 0.5952, + "grad_norm": 0.36440455545982475, + "learning_rate": 7.428560370317542e-05, + "loss": 0.8047, + "step": 93 + }, + { + "epoch": 0.6016, + "grad_norm": 0.3782297199151222, + "learning_rate": 7.228075054658096e-05, + "loss": 0.764, + "step": 94 + }, + { + "epoch": 0.608, + "grad_norm": 0.3302517040045045, + "learning_rate": 7.028789546718326e-05, + "loss": 0.757, + "step": 95 + }, + { + "epoch": 0.6144, + "grad_norm": 0.31925075939475256, + "learning_rate": 6.830790105792973e-05, + "loss": 0.8109, + "step": 96 + }, + { + "epoch": 0.6208, + "grad_norm": 0.34597884431932663, + "learning_rate": 6.63416243451194e-05, + "loss": 0.7528, + "step": 97 + }, + { + "epoch": 0.6272, + "grad_norm": 0.34399804653158134, + "learning_rate": 6.43899164174453e-05, + "loss": 0.8112, + "step": 98 + }, + { + "epoch": 0.6336, + "grad_norm": 0.4503304168982117, + "learning_rate": 6.245362205760704e-05, + "loss": 0.8019, + "step": 99 + }, + { + "epoch": 0.64, + "grad_norm": 0.31408882640451896, + "learning_rate": 6.053357937665237e-05, + "loss": 0.7539, + "step": 100 + }, + { + "epoch": 0.6464, + "grad_norm": 0.2950655274806464, + "learning_rate": 5.863061945120719e-05, + "loss": 0.7406, + "step": 101 + }, + { + "epoch": 0.6528, + "grad_norm": 0.36816918991595826, + "learning_rate": 5.6745565963749925e-05, + "loss": 0.8141, + "step": 102 + }, + { + "epoch": 0.6592, + "grad_norm": 0.36209149313356365, + "learning_rate": 5.487923484608629e-05, + "loss": 0.8046, + "step": 103 + }, + { + "epoch": 0.6656, + "grad_norm": 0.3134766251824321, + "learning_rate": 5.3032433926179395e-05, + "loss": 0.7352, + "step": 104 + }, + { + "epoch": 0.672, + "grad_norm": 0.48296319857057174, + "learning_rate": 5.1205962578487155e-05, + "loss": 0.7882, + "step": 105 + }, + { + "epoch": 0.6784, + "grad_norm": 0.3160209673487955, + "learning_rate": 4.940061137795876e-05, + "loss": 0.7291, + "step": 106 + }, + { + "epoch": 0.6848, + "grad_norm": 0.31709343856767247, + "learning_rate": 4.761716175783989e-05, + "loss": 0.8039, + "step": 107 + }, + { + "epoch": 0.6912, + "grad_norm": 0.31063930635038317, + "learning_rate": 4.585638567143529e-05, + "loss": 0.8023, + "step": 108 + }, + { + "epoch": 0.6976, + "grad_norm": 0.3956030948962467, + "learning_rate": 4.411904525797408e-05, + "loss": 0.8113, + "step": 109 + }, + { + "epoch": 0.704, + "grad_norm": 0.3123712643671273, + "learning_rate": 4.240589251272342e-05, + "loss": 0.7253, + "step": 110 + }, + { + "epoch": 0.7104, + "grad_norm": 0.3482739347672168, + "learning_rate": 4.071766896149273e-05, + "loss": 0.7742, + "step": 111 + }, + { + "epoch": 0.7168, + "grad_norm": 0.2863058456033559, + "learning_rate": 3.9055105339669595e-05, + "loss": 0.7096, + "step": 112 + }, + { + "epoch": 0.7232, + "grad_norm": 0.302090802722911, + "learning_rate": 3.741892127592625e-05, + "loss": 0.7803, + "step": 113 + }, + { + "epoch": 0.7296, + "grad_norm": 0.28436088477268534, + "learning_rate": 3.580982498073344e-05, + "loss": 0.6765, + "step": 114 + }, + { + "epoch": 0.736, + "grad_norm": 0.3126965341963825, + "learning_rate": 3.422851293981676e-05, + "loss": 0.7144, + "step": 115 + }, + { + "epoch": 0.7424, + "grad_norm": 0.3180784275437213, + "learning_rate": 3.2675669612687565e-05, + "loss": 0.7593, + "step": 116 + }, + { + "epoch": 0.7488, + "grad_norm": 0.2978419987936676, + "learning_rate": 3.115196713638e-05, + "loss": 0.7781, + "step": 117 + }, + { + "epoch": 0.7552, + "grad_norm": 0.41255804552728154, + "learning_rate": 2.9658065034520978e-05, + "loss": 0.8066, + "step": 118 + }, + { + "epoch": 0.7616, + "grad_norm": 0.32837895155628116, + "learning_rate": 2.8194609931860316e-05, + "loss": 0.7248, + "step": 119 + }, + { + "epoch": 0.768, + "grad_norm": 0.32726215856422297, + "learning_rate": 2.6762235274383772e-05, + "loss": 0.7365, + "step": 120 + }, + { + "epoch": 0.7744, + "grad_norm": 0.2739476755750894, + "learning_rate": 2.536156105513062e-05, + "loss": 0.7443, + "step": 121 + }, + { + "epoch": 0.7808, + "grad_norm": 0.37548914623504875, + "learning_rate": 2.399319354583418e-05, + "loss": 0.8744, + "step": 122 + }, + { + "epoch": 0.7872, + "grad_norm": 0.3604271713813052, + "learning_rate": 2.265772503450122e-05, + "loss": 0.7482, + "step": 123 + }, + { + "epoch": 0.7936, + "grad_norm": 0.4060539170115801, + "learning_rate": 2.1355733569044635e-05, + "loss": 0.7333, + "step": 124 + }, + { + "epoch": 0.8, + "grad_norm": 0.34529267555652776, + "learning_rate": 2.008778270707944e-05, + "loss": 0.7743, + "step": 125 + }, + { + "epoch": 0.8064, + "grad_norm": 0.3390417346838904, + "learning_rate": 1.8854421271990964e-05, + "loss": 0.7796, + "step": 126 + }, + { + "epoch": 0.8128, + "grad_norm": 0.33150772182289123, + "learning_rate": 1.7656183115380577e-05, + "loss": 0.6846, + "step": 127 + }, + { + "epoch": 0.8192, + "grad_norm": 0.29328252821956335, + "learning_rate": 1.649358688599191e-05, + "loss": 0.6885, + "step": 128 + }, + { + "epoch": 0.8256, + "grad_norm": 0.311185954863508, + "learning_rate": 1.5367135805217458e-05, + "loss": 0.7278, + "step": 129 + }, + { + "epoch": 0.832, + "grad_norm": 0.3285936074265703, + "learning_rate": 1.4277317449282834e-05, + "loss": 0.7144, + "step": 130 + }, + { + "epoch": 0.8384, + "grad_norm": 0.33971634630816916, + "learning_rate": 1.3224603538202929e-05, + "loss": 0.7638, + "step": 131 + }, + { + "epoch": 0.8448, + "grad_norm": 0.2978291588736038, + "learning_rate": 1.220944973160133e-05, + "loss": 0.7856, + "step": 132 + }, + { + "epoch": 0.8512, + "grad_norm": 0.3133959872428493, + "learning_rate": 1.1232295431481222e-05, + "loss": 0.7897, + "step": 133 + }, + { + "epoch": 0.8576, + "grad_norm": 0.3013524652462147, + "learning_rate": 1.0293563592033595e-05, + "loss": 0.7488, + "step": 134 + }, + { + "epoch": 0.864, + "grad_norm": 0.2944488340362683, + "learning_rate": 9.393660536564408e-06, + "loss": 0.7387, + "step": 135 + }, + { + "epoch": 0.8704, + "grad_norm": 0.502661096719573, + "learning_rate": 8.532975781620512e-06, + "loss": 0.7307, + "step": 136 + }, + { + "epoch": 0.8768, + "grad_norm": 0.3480637990631335, + "learning_rate": 7.711881868390291e-06, + "loss": 0.8673, + "step": 137 + }, + { + "epoch": 0.8832, + "grad_norm": 0.3030909147874621, + "learning_rate": 6.930734201451816e-06, + "loss": 0.7444, + "step": 138 + }, + { + "epoch": 0.8896, + "grad_norm": 0.315645662045372, + "learning_rate": 6.189870894938587e-06, + "loss": 0.7382, + "step": 139 + }, + { + "epoch": 0.896, + "grad_norm": 0.3495253840763391, + "learning_rate": 5.489612626189245e-06, + "loss": 0.7223, + "step": 140 + }, + { + "epoch": 0.9024, + "grad_norm": 0.41048950032203585, + "learning_rate": 4.830262496944693e-06, + "loss": 0.802, + "step": 141 + }, + { + "epoch": 0.9088, + "grad_norm": 0.2839975051047652, + "learning_rate": 4.21210590215273e-06, + "loss": 0.6948, + "step": 142 + }, + { + "epoch": 0.9152, + "grad_norm": 0.41928555434574294, + "learning_rate": 3.6354104064368566e-06, + "loss": 0.7693, + "step": 143 + }, + { + "epoch": 0.9216, + "grad_norm": 0.4122207518174965, + "learning_rate": 3.100425628282899e-06, + "loss": 0.8077, + "step": 144 + }, + { + "epoch": 0.928, + "grad_norm": 0.29709260516698854, + "learning_rate": 2.607383131993424e-06, + "loss": 0.7667, + "step": 145 + }, + { + "epoch": 0.9344, + "grad_norm": 0.4301082952564468, + "learning_rate": 2.1564963274568027e-06, + "loss": 0.778, + "step": 146 + }, + { + "epoch": 0.9408, + "grad_norm": 0.32064610401192734, + "learning_rate": 1.7479603777742938e-06, + "loss": 0.7654, + "step": 147 + }, + { + "epoch": 0.9472, + "grad_norm": 0.2986447331282153, + "learning_rate": 1.3819521147851123e-06, + "loss": 0.732, + "step": 148 + }, + { + "epoch": 0.9536, + "grad_norm": 0.29822579082757045, + "learning_rate": 1.05862996252597e-06, + "loss": 0.7077, + "step": 149 + }, + { + "epoch": 0.96, + "grad_norm": 0.3080627982411816, + "learning_rate": 7.781338686584927e-07, + "loss": 0.7625, + "step": 150 + }, + { + "epoch": 0.9664, + "grad_norm": 0.3623747242759375, + "learning_rate": 5.405852438937764e-07, + "loss": 0.7701, + "step": 151 + }, + { + "epoch": 0.9728, + "grad_norm": 0.30089343943365976, + "learning_rate": 3.4608690944071263e-07, + "loss": 0.7548, + "step": 152 + }, + { + "epoch": 0.9792, + "grad_norm": 0.36369213030522163, + "learning_rate": 1.947230525005006e-07, + "loss": 0.8372, + "step": 153 + }, + { + "epoch": 0.9856, + "grad_norm": 0.28995750962922084, + "learning_rate": 8.655918982689581e-08, + "loss": 0.7195, + "step": 154 + }, + { + "epoch": 0.992, + "grad_norm": 0.34177664411092873, + "learning_rate": 2.164213936770576e-08, + "loss": 0.7391, + "step": 155 + }, + { + "epoch": 0.9984, + "grad_norm": 0.3122340778319871, + "learning_rate": 0.0, + "loss": 0.6868, + "step": 156 + }, + { + "epoch": 0.9984, + "step": 156, + "total_flos": 391611316568064.0, + "train_loss": 0.8046467529657559, + "train_runtime": 4784.5328, + "train_samples_per_second": 1.045, + "train_steps_per_second": 0.033 + } + ], + "logging_steps": 1.0, + "max_steps": 156, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 391611316568064.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_3_epochs_1_GA_2_lora/README.md b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_3_epochs_1_GA_2_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_3_epochs_1_GA_2_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_3_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_3_epochs_1_GA_2_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c6cb7b0be76f388e90234ff3702db6268a74b8f7 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_3_epochs_1_GA_2_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "up_proj", + "down_proj", + "gate_proj", + "v_proj", + "k_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2515f7877f8d12eda55aa1af8124ff7a5ecd086a --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d00a83ea5723b0b88550c9d1f34887a155fa60b2a6ccefb9fdfcb07455f3424d +size 671150064 diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_3_epochs_1_GA_2_lora/config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_3_epochs_1_GA_2_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_3_epochs_1_GA_2_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..07fd6496e5ef7917a07439013b942420b77b7776 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8964d7a41830da964ca1f64c6c82fa7fdee55315b6f4e96b3cef02c9324c4f70 +size 918507402 diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_3_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_3_epochs_1_GA_2_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..9cbfde9eff16d3fd72a5bfecdf3a39fa008c5fa9 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_3_epochs_1_GA_2_lora/trainer_state.json @@ -0,0 +1,2226 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9984, + "eval_steps": 500, + "global_step": 312, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0032, + "grad_norm": 0.9475372509233241, + "learning_rate": 2e-05, + "loss": 1.326, + "step": 1 + }, + { + "epoch": 0.0064, + "grad_norm": 0.8255112618496162, + "learning_rate": 4e-05, + "loss": 1.2227, + "step": 2 + }, + { + "epoch": 0.0096, + "grad_norm": 0.8031490881581672, + "learning_rate": 6e-05, + "loss": 1.2749, + "step": 3 + }, + { + "epoch": 0.0128, + "grad_norm": 0.7386472491239978, + "learning_rate": 8e-05, + "loss": 1.2729, + "step": 4 + }, + { + "epoch": 0.016, + "grad_norm": 0.6123801345342339, + "learning_rate": 0.0001, + "loss": 0.9416, + "step": 5 + }, + { + "epoch": 0.0192, + "grad_norm": 0.9748162205092032, + "learning_rate": 0.00012, + "loss": 1.14, + "step": 6 + }, + { + "epoch": 0.0224, + "grad_norm": 0.9195383903965376, + "learning_rate": 0.00014, + "loss": 0.9798, + "step": 7 + }, + { + "epoch": 0.0256, + "grad_norm": 0.9550980567072939, + "learning_rate": 0.00016, + "loss": 1.1364, + "step": 8 + }, + { + "epoch": 0.0288, + "grad_norm": 0.6358806958616194, + "learning_rate": 0.00018, + "loss": 0.9806, + "step": 9 + }, + { + "epoch": 0.032, + "grad_norm": 0.6584899966952839, + "learning_rate": 0.0002, + "loss": 0.8835, + "step": 10 + }, + { + "epoch": 0.0352, + "grad_norm": 0.49892591939119807, + "learning_rate": 0.00019999458931878073, + "loss": 0.8981, + "step": 11 + }, + { + "epoch": 0.0384, + "grad_norm": 0.4962301388144038, + "learning_rate": 0.0001999783578606323, + "loss": 0.8893, + "step": 12 + }, + { + "epoch": 0.0416, + "grad_norm": 0.5697491748058041, + "learning_rate": 0.00019995130738201966, + "loss": 0.9026, + "step": 13 + }, + { + "epoch": 0.0448, + "grad_norm": 0.6481407921914579, + "learning_rate": 0.0001999134408101731, + "loss": 0.9356, + "step": 14 + }, + { + "epoch": 0.048, + "grad_norm": 0.678142222999374, + "learning_rate": 0.00019986476224277165, + "loss": 0.9462, + "step": 15 + }, + { + "epoch": 0.0512, + "grad_norm": 0.6228716926229473, + "learning_rate": 0.00019980527694749952, + "loss": 0.9257, + "step": 16 + }, + { + "epoch": 0.0544, + "grad_norm": 0.562202486455815, + "learning_rate": 0.00019973499136147606, + "loss": 0.8431, + "step": 17 + }, + { + "epoch": 0.0576, + "grad_norm": 0.5479913775413893, + "learning_rate": 0.0001996539130905593, + "loss": 0.8926, + "step": 18 + }, + { + "epoch": 0.0608, + "grad_norm": 0.6080925105746661, + "learning_rate": 0.0001995620509085228, + "loss": 1.014, + "step": 19 + }, + { + "epoch": 0.064, + "grad_norm": 0.5065288186619548, + "learning_rate": 0.00019945941475610623, + "loss": 0.8423, + "step": 20 + }, + { + "epoch": 0.0672, + "grad_norm": 0.5159448796732826, + "learning_rate": 0.0001993460157399396, + "loss": 0.9139, + "step": 21 + }, + { + "epoch": 0.0704, + "grad_norm": 0.5039012397213576, + "learning_rate": 0.0001992218661313415, + "loss": 0.9021, + "step": 22 + }, + { + "epoch": 0.0736, + "grad_norm": 0.5287943390341321, + "learning_rate": 0.00019908697936499103, + "loss": 0.9534, + "step": 23 + }, + { + "epoch": 0.0768, + "grad_norm": 0.4689921770463583, + "learning_rate": 0.00019894137003747403, + "loss": 0.8669, + "step": 24 + }, + { + "epoch": 0.08, + "grad_norm": 0.48284728730041415, + "learning_rate": 0.00019878505390570362, + "loss": 0.8328, + "step": 25 + }, + { + "epoch": 0.0832, + "grad_norm": 0.4965084783063032, + "learning_rate": 0.00019861804788521493, + "loss": 0.745, + "step": 26 + }, + { + "epoch": 0.0864, + "grad_norm": 0.5905989291752782, + "learning_rate": 0.00019844037004833473, + "loss": 0.9137, + "step": 27 + }, + { + "epoch": 0.0896, + "grad_norm": 0.47700271812675477, + "learning_rate": 0.00019825203962222572, + "loss": 0.8862, + "step": 28 + }, + { + "epoch": 0.0928, + "grad_norm": 0.5549112880831628, + "learning_rate": 0.0001980530769868059, + "loss": 0.8866, + "step": 29 + }, + { + "epoch": 0.096, + "grad_norm": 0.532322622933053, + "learning_rate": 0.00019784350367254322, + "loss": 0.9127, + "step": 30 + }, + { + "epoch": 0.0992, + "grad_norm": 0.443849424472922, + "learning_rate": 0.0001976233423581255, + "loss": 0.85, + "step": 31 + }, + { + "epoch": 0.1024, + "grad_norm": 0.5980883187850191, + "learning_rate": 0.0001973926168680066, + "loss": 0.9338, + "step": 32 + }, + { + "epoch": 0.1056, + "grad_norm": 0.47110264880609937, + "learning_rate": 0.00019715135216982798, + "loss": 0.8631, + "step": 33 + }, + { + "epoch": 0.1088, + "grad_norm": 0.5075464761050624, + "learning_rate": 0.0001968995743717171, + "loss": 0.8048, + "step": 34 + }, + { + "epoch": 0.112, + "grad_norm": 0.5843680350834338, + "learning_rate": 0.00019663731071946206, + "loss": 0.9154, + "step": 35 + }, + { + "epoch": 0.1152, + "grad_norm": 0.5819950358122334, + "learning_rate": 0.00019636458959356316, + "loss": 0.9255, + "step": 36 + }, + { + "epoch": 0.1184, + "grad_norm": 0.5178130228650352, + "learning_rate": 0.0001960814405061619, + "loss": 0.7974, + "step": 37 + }, + { + "epoch": 0.1216, + "grad_norm": 0.4602445891115539, + "learning_rate": 0.00019578789409784727, + "loss": 0.8179, + "step": 38 + }, + { + "epoch": 0.1248, + "grad_norm": 0.518934498544055, + "learning_rate": 0.00019548398213434007, + "loss": 0.9162, + "step": 39 + }, + { + "epoch": 0.128, + "grad_norm": 0.4747867659375649, + "learning_rate": 0.00019516973750305532, + "loss": 0.8348, + "step": 40 + }, + { + "epoch": 0.1312, + "grad_norm": 1.129356077205779, + "learning_rate": 0.00019484519420954354, + "loss": 0.7498, + "step": 41 + }, + { + "epoch": 0.1344, + "grad_norm": 0.5438136919706106, + "learning_rate": 0.00019451038737381077, + "loss": 0.8675, + "step": 42 + }, + { + "epoch": 0.1376, + "grad_norm": 0.487493061884578, + "learning_rate": 0.00019416535322651818, + "loss": 0.7767, + "step": 43 + }, + { + "epoch": 0.1408, + "grad_norm": 0.5820093925186012, + "learning_rate": 0.00019381012910506146, + "loss": 0.8538, + "step": 44 + }, + { + "epoch": 0.144, + "grad_norm": 0.514133066410687, + "learning_rate": 0.00019344475344953012, + "loss": 0.8992, + "step": 45 + }, + { + "epoch": 0.1472, + "grad_norm": 0.5543124541671927, + "learning_rate": 0.00019306926579854821, + "loss": 0.8356, + "step": 46 + }, + { + "epoch": 0.1504, + "grad_norm": 0.5700850850672268, + "learning_rate": 0.00019268370678499533, + "loss": 0.9012, + "step": 47 + }, + { + "epoch": 0.1536, + "grad_norm": 0.4921387990437491, + "learning_rate": 0.0001922881181316097, + "loss": 0.8159, + "step": 48 + }, + { + "epoch": 0.1568, + "grad_norm": 0.4351184948516209, + "learning_rate": 0.00019188254264647337, + "loss": 0.8419, + "step": 49 + }, + { + "epoch": 0.16, + "grad_norm": 0.5393404388572136, + "learning_rate": 0.0001914670242183795, + "loss": 0.8815, + "step": 50 + }, + { + "epoch": 0.1632, + "grad_norm": 0.5002819136243063, + "learning_rate": 0.0001910416078120832, + "loss": 0.8719, + "step": 51 + }, + { + "epoch": 0.1664, + "grad_norm": 0.47210034832096764, + "learning_rate": 0.0001906063394634356, + "loss": 0.8463, + "step": 52 + }, + { + "epoch": 0.1696, + "grad_norm": 0.42704344797802224, + "learning_rate": 0.00019016126627440237, + "loss": 0.7851, + "step": 53 + }, + { + "epoch": 0.1728, + "grad_norm": 0.5050122125112612, + "learning_rate": 0.00018970643640796642, + "loss": 0.8484, + "step": 54 + }, + { + "epoch": 0.176, + "grad_norm": 0.46594147112433676, + "learning_rate": 0.000189241899082916, + "loss": 0.7922, + "step": 55 + }, + { + "epoch": 0.1792, + "grad_norm": 0.6049026714505561, + "learning_rate": 0.00018876770456851877, + "loss": 0.7832, + "step": 56 + }, + { + "epoch": 0.1824, + "grad_norm": 0.5520570407070383, + "learning_rate": 0.0001882839041790818, + "loss": 0.9458, + "step": 57 + }, + { + "epoch": 0.1856, + "grad_norm": 0.5258281889886353, + "learning_rate": 0.00018779055026839868, + "loss": 0.7662, + "step": 58 + }, + { + "epoch": 0.1888, + "grad_norm": 0.5042869992665732, + "learning_rate": 0.00018728769622408423, + "loss": 0.7838, + "step": 59 + }, + { + "epoch": 0.192, + "grad_norm": 0.4415498821824666, + "learning_rate": 0.00018677539646179707, + "loss": 0.7739, + "step": 60 + }, + { + "epoch": 0.1952, + "grad_norm": 0.528068837310034, + "learning_rate": 0.00018625370641935129, + "loss": 0.8279, + "step": 61 + }, + { + "epoch": 0.1984, + "grad_norm": 0.4382614858545404, + "learning_rate": 0.00018572268255071718, + "loss": 0.816, + "step": 62 + }, + { + "epoch": 0.2016, + "grad_norm": 0.45850262081352067, + "learning_rate": 0.00018518238231991218, + "loss": 0.8095, + "step": 63 + }, + { + "epoch": 0.2048, + "grad_norm": 0.4415344758160172, + "learning_rate": 0.00018463286419478255, + "loss": 0.7094, + "step": 64 + }, + { + "epoch": 0.208, + "grad_norm": 0.5341688157184947, + "learning_rate": 0.00018407418764067627, + "loss": 0.8289, + "step": 65 + }, + { + "epoch": 0.2112, + "grad_norm": 0.4477850495284644, + "learning_rate": 0.00018350641311400812, + "loss": 0.8187, + "step": 66 + }, + { + "epoch": 0.2144, + "grad_norm": 0.46563284629446294, + "learning_rate": 0.0001829296020557174, + "loss": 0.8479, + "step": 67 + }, + { + "epoch": 0.2176, + "grad_norm": 0.5244429014213015, + "learning_rate": 0.00018234381688461942, + "loss": 0.8311, + "step": 68 + }, + { + "epoch": 0.2208, + "grad_norm": 0.4224889085914789, + "learning_rate": 0.0001817491209906506, + "loss": 0.7937, + "step": 69 + }, + { + "epoch": 0.224, + "grad_norm": 0.5169843727719514, + "learning_rate": 0.00018114557872800905, + "loss": 0.795, + "step": 70 + }, + { + "epoch": 0.2272, + "grad_norm": 0.47188379389130514, + "learning_rate": 0.00018053325540819045, + "loss": 0.9052, + "step": 71 + }, + { + "epoch": 0.2304, + "grad_norm": 0.5536682110712087, + "learning_rate": 0.0001799122172929206, + "loss": 0.7486, + "step": 72 + }, + { + "epoch": 0.2336, + "grad_norm": 0.5019047354964509, + "learning_rate": 0.00017928253158698473, + "loss": 0.8023, + "step": 73 + }, + { + "epoch": 0.2368, + "grad_norm": 0.5247331966004386, + "learning_rate": 0.0001786442664309554, + "loss": 0.8345, + "step": 74 + }, + { + "epoch": 0.24, + "grad_norm": 0.5203590067022946, + "learning_rate": 0.0001779974908938184, + "loss": 0.83, + "step": 75 + }, + { + "epoch": 0.2432, + "grad_norm": 0.4781060465248561, + "learning_rate": 0.0001773422749654988, + "loss": 0.8388, + "step": 76 + }, + { + "epoch": 0.2464, + "grad_norm": 0.4669659122826244, + "learning_rate": 0.00017667868954928694, + "loss": 0.8135, + "step": 77 + }, + { + "epoch": 0.2496, + "grad_norm": 0.46355569987707634, + "learning_rate": 0.00017600680645416583, + "loss": 0.731, + "step": 78 + }, + { + "epoch": 0.2528, + "grad_norm": 0.5457644600238496, + "learning_rate": 0.00017532669838704035, + "loss": 0.828, + "step": 79 + }, + { + "epoch": 0.256, + "grad_norm": 0.43555675848315856, + "learning_rate": 0.00017463843894486937, + "loss": 0.7712, + "step": 80 + }, + { + "epoch": 0.2592, + "grad_norm": 0.4354177922283847, + "learning_rate": 0.0001739421026067017, + "loss": 0.766, + "step": 81 + }, + { + "epoch": 0.2624, + "grad_norm": 0.4618480255495938, + "learning_rate": 0.00017323776472561627, + "loss": 0.7458, + "step": 82 + }, + { + "epoch": 0.2656, + "grad_norm": 0.5179627468498007, + "learning_rate": 0.00017252550152056795, + "loss": 0.8463, + "step": 83 + }, + { + "epoch": 0.2688, + "grad_norm": 0.6679783445582437, + "learning_rate": 0.0001718053900681397, + "loss": 0.8165, + "step": 84 + }, + { + "epoch": 0.272, + "grad_norm": 0.47708984031515167, + "learning_rate": 0.00017107750829420176, + "loss": 0.7847, + "step": 85 + }, + { + "epoch": 0.2752, + "grad_norm": 0.48916241960280127, + "learning_rate": 0.00017034193496547902, + "loss": 0.8247, + "step": 86 + }, + { + "epoch": 0.2784, + "grad_norm": 0.42819076627920327, + "learning_rate": 0.00016959874968102735, + "loss": 0.7198, + "step": 87 + }, + { + "epoch": 0.2816, + "grad_norm": 0.5551537423140956, + "learning_rate": 0.00016884803286362, + "loss": 0.8808, + "step": 88 + }, + { + "epoch": 0.2848, + "grad_norm": 0.6185891989336596, + "learning_rate": 0.00016808986575104465, + "loss": 0.9487, + "step": 89 + }, + { + "epoch": 0.288, + "grad_norm": 0.48990222253922966, + "learning_rate": 0.00016732433038731242, + "loss": 0.83, + "step": 90 + }, + { + "epoch": 0.2912, + "grad_norm": 0.584563973573214, + "learning_rate": 0.0001665515096137797, + "loss": 0.8531, + "step": 91 + }, + { + "epoch": 0.2944, + "grad_norm": 0.3822568873820786, + "learning_rate": 0.00016577148706018328, + "loss": 0.7736, + "step": 92 + }, + { + "epoch": 0.2976, + "grad_norm": 0.39153513190589123, + "learning_rate": 0.00016498434713559088, + "loss": 0.7613, + "step": 93 + }, + { + "epoch": 0.3008, + "grad_norm": 0.4722500361916666, + "learning_rate": 0.00016419017501926656, + "loss": 0.8785, + "step": 94 + }, + { + "epoch": 0.304, + "grad_norm": 0.5432777140214171, + "learning_rate": 0.0001633890566514535, + "loss": 0.8845, + "step": 95 + }, + { + "epoch": 0.3072, + "grad_norm": 0.42141522285453015, + "learning_rate": 0.00016258107872407375, + "loss": 0.7694, + "step": 96 + }, + { + "epoch": 0.3104, + "grad_norm": 0.5603710022840139, + "learning_rate": 0.0001617663286713474, + "loss": 0.9227, + "step": 97 + }, + { + "epoch": 0.3136, + "grad_norm": 0.4692676989268742, + "learning_rate": 0.00016094489466033043, + "loss": 0.8197, + "step": 98 + }, + { + "epoch": 0.3168, + "grad_norm": 0.41431728254284184, + "learning_rate": 0.00016011686558137448, + "loss": 0.804, + "step": 99 + }, + { + "epoch": 0.32, + "grad_norm": 0.4303135895029476, + "learning_rate": 0.0001592823310385073, + "loss": 0.7709, + "step": 100 + }, + { + "epoch": 0.3232, + "grad_norm": 0.4132483714354605, + "learning_rate": 0.0001584413813397364, + "loss": 0.837, + "step": 101 + }, + { + "epoch": 0.3264, + "grad_norm": 0.5513903573976224, + "learning_rate": 0.00015759410748727662, + "loss": 0.8217, + "step": 102 + }, + { + "epoch": 0.3296, + "grad_norm": 0.4367114402281363, + "learning_rate": 0.00015674060116770236, + "loss": 0.6912, + "step": 103 + }, + { + "epoch": 0.3328, + "grad_norm": 0.481909466793859, + "learning_rate": 0.00015588095474202595, + "loss": 0.8313, + "step": 104 + }, + { + "epoch": 0.336, + "grad_norm": 0.5650200354143932, + "learning_rate": 0.00015501526123570277, + "loss": 0.8164, + "step": 105 + }, + { + "epoch": 0.3392, + "grad_norm": 0.6003386843843564, + "learning_rate": 0.00015414361432856475, + "loss": 0.7437, + "step": 106 + }, + { + "epoch": 0.3424, + "grad_norm": 0.41804442073857784, + "learning_rate": 0.0001532661083446829, + "loss": 0.7807, + "step": 107 + }, + { + "epoch": 0.3456, + "grad_norm": 0.4351168603450991, + "learning_rate": 0.00015238283824216015, + "loss": 0.7445, + "step": 108 + }, + { + "epoch": 0.3488, + "grad_norm": 0.4623350020448244, + "learning_rate": 0.00015149389960285558, + "loss": 0.8933, + "step": 109 + }, + { + "epoch": 0.352, + "grad_norm": 0.4373188970399322, + "learning_rate": 0.00015059938862204127, + "loss": 0.8076, + "step": 110 + }, + { + "epoch": 0.3552, + "grad_norm": 0.45494271034653544, + "learning_rate": 0.00014969940209799248, + "loss": 0.7516, + "step": 111 + }, + { + "epoch": 0.3584, + "grad_norm": 0.46107867226622695, + "learning_rate": 0.00014879403742151283, + "loss": 0.773, + "step": 112 + }, + { + "epoch": 0.3616, + "grad_norm": 0.49119829743495835, + "learning_rate": 0.00014788339256539544, + "loss": 0.8354, + "step": 113 + }, + { + "epoch": 0.3648, + "grad_norm": 0.4899833574630175, + "learning_rate": 0.0001469675660738206, + "loss": 0.8039, + "step": 114 + }, + { + "epoch": 0.368, + "grad_norm": 0.4853333535137589, + "learning_rate": 0.00014604665705169237, + "loss": 0.8305, + "step": 115 + }, + { + "epoch": 0.3712, + "grad_norm": 0.48298296036949917, + "learning_rate": 0.00014512076515391375, + "loss": 0.7502, + "step": 116 + }, + { + "epoch": 0.3744, + "grad_norm": 0.45567966724483405, + "learning_rate": 0.00014418999057460276, + "loss": 0.7734, + "step": 117 + }, + { + "epoch": 0.3776, + "grad_norm": 0.5396135149138322, + "learning_rate": 0.0001432544340362501, + "loss": 0.8909, + "step": 118 + }, + { + "epoch": 0.3808, + "grad_norm": 0.44065113727070815, + "learning_rate": 0.00014231419677881966, + "loss": 0.7733, + "step": 119 + }, + { + "epoch": 0.384, + "grad_norm": 0.4435587464426928, + "learning_rate": 0.00014136938054879283, + "loss": 0.8202, + "step": 120 + }, + { + "epoch": 0.3872, + "grad_norm": 0.4540090705940411, + "learning_rate": 0.00014042008758815818, + "loss": 0.788, + "step": 121 + }, + { + "epoch": 0.3904, + "grad_norm": 0.422623541311413, + "learning_rate": 0.00013946642062334766, + "loss": 0.8095, + "step": 122 + }, + { + "epoch": 0.3936, + "grad_norm": 0.4333205443650321, + "learning_rate": 0.00013850848285411994, + "loss": 0.7995, + "step": 123 + }, + { + "epoch": 0.3968, + "grad_norm": 0.3981291884403016, + "learning_rate": 0.000137546377942393, + "loss": 0.7642, + "step": 124 + }, + { + "epoch": 0.4, + "grad_norm": 0.39105313135637965, + "learning_rate": 0.00013658021000102636, + "loss": 0.7737, + "step": 125 + }, + { + "epoch": 0.4032, + "grad_norm": 0.5037497450946807, + "learning_rate": 0.00013561008358255468, + "loss": 0.8073, + "step": 126 + }, + { + "epoch": 0.4064, + "grad_norm": 0.5488664229378287, + "learning_rate": 0.00013463610366787392, + "loss": 0.8581, + "step": 127 + }, + { + "epoch": 0.4096, + "grad_norm": 0.6003351085243592, + "learning_rate": 0.00013365837565488064, + "loss": 0.8258, + "step": 128 + }, + { + "epoch": 0.4128, + "grad_norm": 0.4091264449134919, + "learning_rate": 0.0001326770053470668, + "loss": 0.7357, + "step": 129 + }, + { + "epoch": 0.416, + "grad_norm": 0.5470462367821493, + "learning_rate": 0.0001316920989420703, + "loss": 0.8844, + "step": 130 + }, + { + "epoch": 0.4192, + "grad_norm": 0.4894282121746544, + "learning_rate": 0.00013070376302018287, + "loss": 0.7962, + "step": 131 + }, + { + "epoch": 0.4224, + "grad_norm": 0.4101795046595908, + "learning_rate": 0.00012971210453281674, + "loss": 0.7266, + "step": 132 + }, + { + "epoch": 0.4256, + "grad_norm": 0.5068711466438554, + "learning_rate": 0.000128717230790931, + "loss": 0.8303, + "step": 133 + }, + { + "epoch": 0.4288, + "grad_norm": 0.38242083266801624, + "learning_rate": 0.00012771924945341906, + "loss": 0.7646, + "step": 134 + }, + { + "epoch": 0.432, + "grad_norm": 0.42033032308999724, + "learning_rate": 0.00012671826851545851, + "loss": 0.7327, + "step": 135 + }, + { + "epoch": 0.4352, + "grad_norm": 0.4202205681205054, + "learning_rate": 0.0001257143962968246, + "loss": 0.7697, + "step": 136 + }, + { + "epoch": 0.4384, + "grad_norm": 0.4076882425961211, + "learning_rate": 0.00012470774143016853, + "loss": 0.7262, + "step": 137 + }, + { + "epoch": 0.4416, + "grad_norm": 0.3970760465743167, + "learning_rate": 0.00012369841284926188, + "loss": 0.7013, + "step": 138 + }, + { + "epoch": 0.4448, + "grad_norm": 0.3563358239374254, + "learning_rate": 0.00012268651977720866, + "loss": 0.7397, + "step": 139 + }, + { + "epoch": 0.448, + "grad_norm": 1.0149081113634044, + "learning_rate": 0.00012167217171462566, + "loss": 0.9153, + "step": 140 + }, + { + "epoch": 0.4512, + "grad_norm": 0.47512757583798637, + "learning_rate": 0.0001206554784277931, + "loss": 0.7641, + "step": 141 + }, + { + "epoch": 0.4544, + "grad_norm": 0.4644157016218601, + "learning_rate": 0.00011963654993677645, + "loss": 0.7605, + "step": 142 + }, + { + "epoch": 0.4576, + "grad_norm": 0.5972922545320057, + "learning_rate": 0.00011861549650352069, + "loss": 0.8634, + "step": 143 + }, + { + "epoch": 0.4608, + "grad_norm": 0.5638793356740267, + "learning_rate": 0.00011759242861991855, + "loss": 0.8134, + "step": 144 + }, + { + "epoch": 0.464, + "grad_norm": 0.4208532395312315, + "learning_rate": 0.00011656745699585371, + "loss": 0.7711, + "step": 145 + }, + { + "epoch": 0.4672, + "grad_norm": 0.501311946451056, + "learning_rate": 0.00011554069254722051, + "loss": 0.8352, + "step": 146 + }, + { + "epoch": 0.4704, + "grad_norm": 0.39223467428190856, + "learning_rate": 0.00011451224638392129, + "loss": 0.7033, + "step": 147 + }, + { + "epoch": 0.4736, + "grad_norm": 0.456994207970227, + "learning_rate": 0.00011348222979784289, + "loss": 0.8883, + "step": 148 + }, + { + "epoch": 0.4768, + "grad_norm": 0.45684422134646335, + "learning_rate": 0.00011245075425081328, + "loss": 0.7564, + "step": 149 + }, + { + "epoch": 0.48, + "grad_norm": 0.42288440860117754, + "learning_rate": 0.00011141793136253986, + "loss": 0.8268, + "step": 150 + }, + { + "epoch": 0.4832, + "grad_norm": 0.3625161326066237, + "learning_rate": 0.0001103838728985307, + "loss": 0.7752, + "step": 151 + }, + { + "epoch": 0.4864, + "grad_norm": 0.6363628609763268, + "learning_rate": 0.000109348690758, + "loss": 0.8428, + "step": 152 + }, + { + "epoch": 0.4896, + "grad_norm": 0.4388850237086708, + "learning_rate": 0.00010831249696175918, + "loss": 0.7613, + "step": 153 + }, + { + "epoch": 0.4928, + "grad_norm": 0.4238194413830428, + "learning_rate": 0.0001072754036400944, + "loss": 0.7368, + "step": 154 + }, + { + "epoch": 0.496, + "grad_norm": 0.4313229068700299, + "learning_rate": 0.00010623752302063283, + "loss": 0.7228, + "step": 155 + }, + { + "epoch": 0.4992, + "grad_norm": 0.3665458963175191, + "learning_rate": 0.00010519896741619803, + "loss": 0.7351, + "step": 156 + }, + { + "epoch": 0.5024, + "grad_norm": 0.4606084144955922, + "learning_rate": 0.00010415984921265609, + "loss": 0.7933, + "step": 157 + }, + { + "epoch": 0.5056, + "grad_norm": 0.37454884179441855, + "learning_rate": 0.00010312028085675391, + "loss": 0.6785, + "step": 158 + }, + { + "epoch": 0.5088, + "grad_norm": 0.4685857988699956, + "learning_rate": 0.00010208037484395114, + "loss": 0.8421, + "step": 159 + }, + { + "epoch": 0.512, + "grad_norm": 0.4635048557171666, + "learning_rate": 0.00010104024370624644, + "loss": 0.7237, + "step": 160 + }, + { + "epoch": 0.5152, + "grad_norm": 0.42921898225240124, + "learning_rate": 0.0001, + "loss": 0.7781, + "step": 161 + }, + { + "epoch": 0.5184, + "grad_norm": 0.3662956360921228, + "learning_rate": 9.895975629375359e-05, + "loss": 0.7159, + "step": 162 + }, + { + "epoch": 0.5216, + "grad_norm": 0.4557524466979869, + "learning_rate": 9.791962515604887e-05, + "loss": 0.7561, + "step": 163 + }, + { + "epoch": 0.5248, + "grad_norm": 0.4893732127944404, + "learning_rate": 9.687971914324607e-05, + "loss": 0.7776, + "step": 164 + }, + { + "epoch": 0.528, + "grad_norm": 0.41560284819860266, + "learning_rate": 9.584015078734395e-05, + "loss": 0.8182, + "step": 165 + }, + { + "epoch": 0.5312, + "grad_norm": 0.43583050097753673, + "learning_rate": 9.480103258380198e-05, + "loss": 0.8272, + "step": 166 + }, + { + "epoch": 0.5344, + "grad_norm": 0.3975629083125364, + "learning_rate": 9.376247697936719e-05, + "loss": 0.7031, + "step": 167 + }, + { + "epoch": 0.5376, + "grad_norm": 0.43624658704902014, + "learning_rate": 9.272459635990562e-05, + "loss": 0.8014, + "step": 168 + }, + { + "epoch": 0.5408, + "grad_norm": 0.42770672908711543, + "learning_rate": 9.168750303824084e-05, + "loss": 0.7893, + "step": 169 + }, + { + "epoch": 0.544, + "grad_norm": 0.431287601565606, + "learning_rate": 9.065130924199998e-05, + "loss": 0.7174, + "step": 170 + }, + { + "epoch": 0.5472, + "grad_norm": 0.4477161386575832, + "learning_rate": 8.961612710146934e-05, + "loss": 0.7533, + "step": 171 + }, + { + "epoch": 0.5504, + "grad_norm": 0.4378644630155987, + "learning_rate": 8.858206863746018e-05, + "loss": 0.8106, + "step": 172 + }, + { + "epoch": 0.5536, + "grad_norm": 0.4172200088521494, + "learning_rate": 8.754924574918675e-05, + "loss": 0.7713, + "step": 173 + }, + { + "epoch": 0.5568, + "grad_norm": 0.36721232643194673, + "learning_rate": 8.651777020215712e-05, + "loss": 0.7491, + "step": 174 + }, + { + "epoch": 0.56, + "grad_norm": 0.4862813172573176, + "learning_rate": 8.548775361607872e-05, + "loss": 0.7231, + "step": 175 + }, + { + "epoch": 0.5632, + "grad_norm": 0.40621933997484455, + "learning_rate": 8.445930745277953e-05, + "loss": 0.7486, + "step": 176 + }, + { + "epoch": 0.5664, + "grad_norm": 0.39390182160705, + "learning_rate": 8.343254300414628e-05, + "loss": 0.7062, + "step": 177 + }, + { + "epoch": 0.5696, + "grad_norm": 0.4423733823524504, + "learning_rate": 8.240757138008149e-05, + "loss": 0.7722, + "step": 178 + }, + { + "epoch": 0.5728, + "grad_norm": 0.42234610710821735, + "learning_rate": 8.138450349647936e-05, + "loss": 0.7751, + "step": 179 + }, + { + "epoch": 0.576, + "grad_norm": 0.4790419250342842, + "learning_rate": 8.036345006322359e-05, + "loss": 0.844, + "step": 180 + }, + { + "epoch": 0.5792, + "grad_norm": 0.47088007396822795, + "learning_rate": 7.934452157220694e-05, + "loss": 0.8215, + "step": 181 + }, + { + "epoch": 0.5824, + "grad_norm": 0.45537714110666144, + "learning_rate": 7.832782828537437e-05, + "loss": 0.8159, + "step": 182 + }, + { + "epoch": 0.5856, + "grad_norm": 0.43357137618807773, + "learning_rate": 7.731348022279134e-05, + "loss": 0.7605, + "step": 183 + }, + { + "epoch": 0.5888, + "grad_norm": 0.3827670305696227, + "learning_rate": 7.630158715073813e-05, + "loss": 0.7338, + "step": 184 + }, + { + "epoch": 0.592, + "grad_norm": 0.513998983919058, + "learning_rate": 7.52922585698315e-05, + "loss": 0.8397, + "step": 185 + }, + { + "epoch": 0.5952, + "grad_norm": 0.43364564030184066, + "learning_rate": 7.428560370317542e-05, + "loss": 0.771, + "step": 186 + }, + { + "epoch": 0.5984, + "grad_norm": 0.4675556367134949, + "learning_rate": 7.328173148454151e-05, + "loss": 0.7752, + "step": 187 + }, + { + "epoch": 0.6016, + "grad_norm": 0.4049143779921121, + "learning_rate": 7.228075054658096e-05, + "loss": 0.749, + "step": 188 + }, + { + "epoch": 0.6048, + "grad_norm": 0.4500976620473903, + "learning_rate": 7.1282769209069e-05, + "loss": 0.7935, + "step": 189 + }, + { + "epoch": 0.608, + "grad_norm": 0.3995426553234267, + "learning_rate": 7.028789546718326e-05, + "loss": 0.7244, + "step": 190 + }, + { + "epoch": 0.6112, + "grad_norm": 0.44562703640404544, + "learning_rate": 6.929623697981718e-05, + "loss": 0.8293, + "step": 191 + }, + { + "epoch": 0.6144, + "grad_norm": 0.4051528567354779, + "learning_rate": 6.830790105792973e-05, + "loss": 0.7884, + "step": 192 + }, + { + "epoch": 0.6176, + "grad_norm": 0.4771405413248115, + "learning_rate": 6.732299465293322e-05, + "loss": 0.7784, + "step": 193 + }, + { + "epoch": 0.6208, + "grad_norm": 0.3926186909030838, + "learning_rate": 6.63416243451194e-05, + "loss": 0.7335, + "step": 194 + }, + { + "epoch": 0.624, + "grad_norm": 0.41976332389254317, + "learning_rate": 6.536389633212609e-05, + "loss": 0.8242, + "step": 195 + }, + { + "epoch": 0.6272, + "grad_norm": 0.4803382328813548, + "learning_rate": 6.43899164174453e-05, + "loss": 0.7945, + "step": 196 + }, + { + "epoch": 0.6304, + "grad_norm": 0.5297101516864969, + "learning_rate": 6.341978999897365e-05, + "loss": 0.8094, + "step": 197 + }, + { + "epoch": 0.6336, + "grad_norm": 0.49411259395914287, + "learning_rate": 6.245362205760704e-05, + "loss": 0.7898, + "step": 198 + }, + { + "epoch": 0.6368, + "grad_norm": 0.46352144102877846, + "learning_rate": 6.149151714588009e-05, + "loss": 0.7439, + "step": 199 + }, + { + "epoch": 0.64, + "grad_norm": 0.3696971917361302, + "learning_rate": 6.053357937665237e-05, + "loss": 0.7595, + "step": 200 + }, + { + "epoch": 0.6432, + "grad_norm": 0.3814101337331313, + "learning_rate": 5.957991241184184e-05, + "loss": 0.721, + "step": 201 + }, + { + "epoch": 0.6464, + "grad_norm": 0.40401538235653045, + "learning_rate": 5.863061945120719e-05, + "loss": 0.7503, + "step": 202 + }, + { + "epoch": 0.6496, + "grad_norm": 0.5564174440836027, + "learning_rate": 5.768580322118034e-05, + "loss": 0.839, + "step": 203 + }, + { + "epoch": 0.6528, + "grad_norm": 0.4199095129579862, + "learning_rate": 5.6745565963749925e-05, + "loss": 0.7874, + "step": 204 + }, + { + "epoch": 0.656, + "grad_norm": 0.5007298515301487, + "learning_rate": 5.5810009425397294e-05, + "loss": 0.8153, + "step": 205 + }, + { + "epoch": 0.6592, + "grad_norm": 0.4824688967442501, + "learning_rate": 5.487923484608629e-05, + "loss": 0.7891, + "step": 206 + }, + { + "epoch": 0.6624, + "grad_norm": 0.40526171374102343, + "learning_rate": 5.395334294830765e-05, + "loss": 0.7297, + "step": 207 + }, + { + "epoch": 0.6656, + "grad_norm": 0.4166847918577331, + "learning_rate": 5.3032433926179395e-05, + "loss": 0.7311, + "step": 208 + }, + { + "epoch": 0.6688, + "grad_norm": 0.49940816867089055, + "learning_rate": 5.211660743460458e-05, + "loss": 0.7645, + "step": 209 + }, + { + "epoch": 0.672, + "grad_norm": 0.4994077839956053, + "learning_rate": 5.1205962578487155e-05, + "loss": 0.8044, + "step": 210 + }, + { + "epoch": 0.6752, + "grad_norm": 0.43752905673208514, + "learning_rate": 5.030059790200756e-05, + "loss": 0.7281, + "step": 211 + }, + { + "epoch": 0.6784, + "grad_norm": 0.4041679096739365, + "learning_rate": 4.940061137795876e-05, + "loss": 0.7274, + "step": 212 + }, + { + "epoch": 0.6816, + "grad_norm": 0.4098059795351516, + "learning_rate": 4.850610039714444e-05, + "loss": 0.7963, + "step": 213 + }, + { + "epoch": 0.6848, + "grad_norm": 0.44718722572355823, + "learning_rate": 4.761716175783989e-05, + "loss": 0.7985, + "step": 214 + }, + { + "epoch": 0.688, + "grad_norm": 0.3682360366651651, + "learning_rate": 4.673389165531714e-05, + "loss": 0.7541, + "step": 215 + }, + { + "epoch": 0.6912, + "grad_norm": 0.46498255039759495, + "learning_rate": 4.585638567143529e-05, + "loss": 0.8432, + "step": 216 + }, + { + "epoch": 0.6944, + "grad_norm": 0.5488964536417223, + "learning_rate": 4.498473876429726e-05, + "loss": 0.796, + "step": 217 + }, + { + "epoch": 0.6976, + "grad_norm": 0.5073904499922492, + "learning_rate": 4.411904525797408e-05, + "loss": 0.8135, + "step": 218 + }, + { + "epoch": 0.7008, + "grad_norm": 0.3952360275521916, + "learning_rate": 4.325939883229766e-05, + "loss": 0.7325, + "step": 219 + }, + { + "epoch": 0.704, + "grad_norm": 0.4104314612045499, + "learning_rate": 4.240589251272342e-05, + "loss": 0.7144, + "step": 220 + }, + { + "epoch": 0.7072, + "grad_norm": 0.7434300816992129, + "learning_rate": 4.155861866026364e-05, + "loss": 0.793, + "step": 221 + }, + { + "epoch": 0.7104, + "grad_norm": 0.4010089987617862, + "learning_rate": 4.071766896149273e-05, + "loss": 0.743, + "step": 222 + }, + { + "epoch": 0.7136, + "grad_norm": 0.3882211408530366, + "learning_rate": 3.988313441862553e-05, + "loss": 0.6581, + "step": 223 + }, + { + "epoch": 0.7168, + "grad_norm": 0.39187843713439985, + "learning_rate": 3.9055105339669595e-05, + "loss": 0.7531, + "step": 224 + }, + { + "epoch": 0.72, + "grad_norm": 0.4193424215582899, + "learning_rate": 3.823367132865265e-05, + "loss": 0.8272, + "step": 225 + }, + { + "epoch": 0.7232, + "grad_norm": 0.4475292298664944, + "learning_rate": 3.741892127592625e-05, + "loss": 0.7227, + "step": 226 + }, + { + "epoch": 0.7264, + "grad_norm": 0.4525161513980812, + "learning_rate": 3.6610943348546526e-05, + "loss": 0.661, + "step": 227 + }, + { + "epoch": 0.7296, + "grad_norm": 0.3651492327454228, + "learning_rate": 3.580982498073344e-05, + "loss": 0.6811, + "step": 228 + }, + { + "epoch": 0.7328, + "grad_norm": 0.47595078991005435, + "learning_rate": 3.501565286440914e-05, + "loss": 0.7122, + "step": 229 + }, + { + "epoch": 0.736, + "grad_norm": 0.40883568477553434, + "learning_rate": 3.422851293981676e-05, + "loss": 0.7013, + "step": 230 + }, + { + "epoch": 0.7392, + "grad_norm": 0.4204261201847352, + "learning_rate": 3.3448490386220355e-05, + "loss": 0.7485, + "step": 231 + }, + { + "epoch": 0.7424, + "grad_norm": 0.45569331512014916, + "learning_rate": 3.2675669612687565e-05, + "loss": 0.759, + "step": 232 + }, + { + "epoch": 0.7456, + "grad_norm": 0.3605187290705392, + "learning_rate": 3.191013424895536e-05, + "loss": 0.752, + "step": 233 + }, + { + "epoch": 0.7488, + "grad_norm": 0.43941323831982837, + "learning_rate": 3.115196713638e-05, + "loss": 0.7928, + "step": 234 + }, + { + "epoch": 0.752, + "grad_norm": 0.42392138054141804, + "learning_rate": 3.040125031897264e-05, + "loss": 0.7183, + "step": 235 + }, + { + "epoch": 0.7552, + "grad_norm": 0.6002181145748936, + "learning_rate": 2.9658065034520978e-05, + "loss": 0.8757, + "step": 236 + }, + { + "epoch": 0.7584, + "grad_norm": 0.4704650269250136, + "learning_rate": 2.892249170579826e-05, + "loss": 0.7607, + "step": 237 + }, + { + "epoch": 0.7616, + "grad_norm": 0.45646416600668926, + "learning_rate": 2.8194609931860316e-05, + "loss": 0.6787, + "step": 238 + }, + { + "epoch": 0.7648, + "grad_norm": 0.4457812809777728, + "learning_rate": 2.7474498479432087e-05, + "loss": 0.7402, + "step": 239 + }, + { + "epoch": 0.768, + "grad_norm": 0.3983232714534782, + "learning_rate": 2.6762235274383772e-05, + "loss": 0.7209, + "step": 240 + }, + { + "epoch": 0.7712, + "grad_norm": 0.3841756021972201, + "learning_rate": 2.6057897393298324e-05, + "loss": 0.7734, + "step": 241 + }, + { + "epoch": 0.7744, + "grad_norm": 0.3548275434989953, + "learning_rate": 2.536156105513062e-05, + "loss": 0.6981, + "step": 242 + }, + { + "epoch": 0.7776, + "grad_norm": 0.5542249068953132, + "learning_rate": 2.4673301612959654e-05, + "loss": 0.9009, + "step": 243 + }, + { + "epoch": 0.7808, + "grad_norm": 0.4614383625410636, + "learning_rate": 2.399319354583418e-05, + "loss": 0.8271, + "step": 244 + }, + { + "epoch": 0.784, + "grad_norm": 0.48453095060087825, + "learning_rate": 2.3321310450713062e-05, + "loss": 0.7532, + "step": 245 + }, + { + "epoch": 0.7872, + "grad_norm": 0.5453650007330265, + "learning_rate": 2.265772503450122e-05, + "loss": 0.7288, + "step": 246 + }, + { + "epoch": 0.7904, + "grad_norm": 0.6577342749465136, + "learning_rate": 2.2002509106181624e-05, + "loss": 0.7854, + "step": 247 + }, + { + "epoch": 0.7936, + "grad_norm": 0.4020345678036813, + "learning_rate": 2.1355733569044635e-05, + "loss": 0.6646, + "step": 248 + }, + { + "epoch": 0.7968, + "grad_norm": 0.428764083923936, + "learning_rate": 2.0717468413015283e-05, + "loss": 0.7225, + "step": 249 + }, + { + "epoch": 0.8, + "grad_norm": 0.6598289868683983, + "learning_rate": 2.008778270707944e-05, + "loss": 0.8133, + "step": 250 + }, + { + "epoch": 0.8032, + "grad_norm": 0.4611059514614476, + "learning_rate": 1.946674459180955e-05, + "loss": 0.7492, + "step": 251 + }, + { + "epoch": 0.8064, + "grad_norm": 0.4346383906696144, + "learning_rate": 1.8854421271990964e-05, + "loss": 0.7972, + "step": 252 + }, + { + "epoch": 0.8096, + "grad_norm": 0.4903650685366549, + "learning_rate": 1.8250879009349398e-05, + "loss": 0.7085, + "step": 253 + }, + { + "epoch": 0.8128, + "grad_norm": 0.34351000406824495, + "learning_rate": 1.7656183115380577e-05, + "loss": 0.6492, + "step": 254 + }, + { + "epoch": 0.816, + "grad_norm": 0.37624588282483906, + "learning_rate": 1.707039794428259e-05, + "loss": 0.6955, + "step": 255 + }, + { + "epoch": 0.8192, + "grad_norm": 0.42553446492000263, + "learning_rate": 1.649358688599191e-05, + "loss": 0.6734, + "step": 256 + }, + { + "epoch": 0.8224, + "grad_norm": 2.210252347918217, + "learning_rate": 1.5925812359323745e-05, + "loss": 0.6773, + "step": 257 + }, + { + "epoch": 0.8256, + "grad_norm": 0.4371212286257628, + "learning_rate": 1.5367135805217458e-05, + "loss": 0.7589, + "step": 258 + }, + { + "epoch": 0.8288, + "grad_norm": 0.36722660195541307, + "learning_rate": 1.4817617680087825e-05, + "loss": 0.6387, + "step": 259 + }, + { + "epoch": 0.832, + "grad_norm": 0.5240922919354317, + "learning_rate": 1.4277317449282834e-05, + "loss": 0.7771, + "step": 260 + }, + { + "epoch": 0.8352, + "grad_norm": 0.48516917315907054, + "learning_rate": 1.3746293580648717e-05, + "loss": 0.7286, + "step": 261 + }, + { + "epoch": 0.8384, + "grad_norm": 0.4357039541849848, + "learning_rate": 1.3224603538202929e-05, + "loss": 0.7802, + "step": 262 + }, + { + "epoch": 0.8416, + "grad_norm": 0.4283435397363714, + "learning_rate": 1.2712303775915802e-05, + "loss": 0.7599, + "step": 263 + }, + { + "epoch": 0.8448, + "grad_norm": 0.3981078881096444, + "learning_rate": 1.220944973160133e-05, + "loss": 0.7931, + "step": 264 + }, + { + "epoch": 0.848, + "grad_norm": 0.39156672531944486, + "learning_rate": 1.1716095820918216e-05, + "loss": 0.7401, + "step": 265 + }, + { + "epoch": 0.8512, + "grad_norm": 0.48543910561325665, + "learning_rate": 1.1232295431481222e-05, + "loss": 0.8197, + "step": 266 + }, + { + "epoch": 0.8544, + "grad_norm": 0.4378682167583888, + "learning_rate": 1.0758100917083991e-05, + "loss": 0.7549, + "step": 267 + }, + { + "epoch": 0.8576, + "grad_norm": 0.38647455959198573, + "learning_rate": 1.0293563592033595e-05, + "loss": 0.7263, + "step": 268 + }, + { + "epoch": 0.8608, + "grad_norm": 0.3735332505931595, + "learning_rate": 9.838733725597615e-06, + "loss": 0.7155, + "step": 269 + }, + { + "epoch": 0.864, + "grad_norm": 0.4379368538027031, + "learning_rate": 9.393660536564408e-06, + "loss": 0.7484, + "step": 270 + }, + { + "epoch": 0.8672, + "grad_norm": 0.3882393713789715, + "learning_rate": 8.958392187916841e-06, + "loss": 0.7211, + "step": 271 + }, + { + "epoch": 0.8704, + "grad_norm": 0.40882868968147507, + "learning_rate": 8.532975781620512e-06, + "loss": 0.7205, + "step": 272 + }, + { + "epoch": 0.8736, + "grad_norm": 1.3945997112526465, + "learning_rate": 8.117457353526625e-06, + "loss": 0.8109, + "step": 273 + }, + { + "epoch": 0.8768, + "grad_norm": 0.49057638899855605, + "learning_rate": 7.711881868390291e-06, + "loss": 0.9028, + "step": 274 + }, + { + "epoch": 0.88, + "grad_norm": 0.4188288937856507, + "learning_rate": 7.3162932150046885e-06, + "loss": 0.7493, + "step": 275 + }, + { + "epoch": 0.8832, + "grad_norm": 0.40361671819863904, + "learning_rate": 6.930734201451816e-06, + "loss": 0.719, + "step": 276 + }, + { + "epoch": 0.8864, + "grad_norm": 0.6218687942184031, + "learning_rate": 6.555246550469907e-06, + "loss": 0.7181, + "step": 277 + }, + { + "epoch": 0.8896, + "grad_norm": 0.4276015947492314, + "learning_rate": 6.189870894938587e-06, + "loss": 0.7432, + "step": 278 + }, + { + "epoch": 0.8928, + "grad_norm": 0.3893696064158002, + "learning_rate": 5.834646773481811e-06, + "loss": 0.7316, + "step": 279 + }, + { + "epoch": 0.896, + "grad_norm": 0.5573916049390001, + "learning_rate": 5.489612626189245e-06, + "loss": 0.6943, + "step": 280 + }, + { + "epoch": 0.8992, + "grad_norm": 0.4209353071823604, + "learning_rate": 5.154805790456485e-06, + "loss": 0.8095, + "step": 281 + }, + { + "epoch": 0.9024, + "grad_norm": 0.5022987674918924, + "learning_rate": 4.830262496944693e-06, + "loss": 0.7704, + "step": 282 + }, + { + "epoch": 0.9056, + "grad_norm": 0.4058450991396016, + "learning_rate": 4.516017865659949e-06, + "loss": 0.7177, + "step": 283 + }, + { + "epoch": 0.9088, + "grad_norm": 0.356976458199042, + "learning_rate": 4.21210590215273e-06, + "loss": 0.6557, + "step": 284 + }, + { + "epoch": 0.912, + "grad_norm": 0.4559370470859033, + "learning_rate": 3.918559493838114e-06, + "loss": 0.709, + "step": 285 + }, + { + "epoch": 0.9152, + "grad_norm": 0.4628851942692234, + "learning_rate": 3.6354104064368566e-06, + "loss": 0.8099, + "step": 286 + }, + { + "epoch": 0.9184, + "grad_norm": 0.6231695580539695, + "learning_rate": 3.3626892805379562e-06, + "loss": 0.7739, + "step": 287 + }, + { + "epoch": 0.9216, + "grad_norm": 0.5341918529954933, + "learning_rate": 3.100425628282899e-06, + "loss": 0.8079, + "step": 288 + }, + { + "epoch": 0.9248, + "grad_norm": 0.3898975739162987, + "learning_rate": 2.848647830172024e-06, + "loss": 0.6931, + "step": 289 + }, + { + "epoch": 0.928, + "grad_norm": 0.4323735885773568, + "learning_rate": 2.607383131993424e-06, + "loss": 0.8201, + "step": 290 + }, + { + "epoch": 0.9312, + "grad_norm": 0.42396609382981215, + "learning_rate": 2.3766576418745022e-06, + "loss": 0.7612, + "step": 291 + }, + { + "epoch": 0.9344, + "grad_norm": 0.46687228082218096, + "learning_rate": 2.1564963274568027e-06, + "loss": 0.7789, + "step": 292 + }, + { + "epoch": 0.9376, + "grad_norm": 0.3725562535302593, + "learning_rate": 1.9469230131940907e-06, + "loss": 0.6842, + "step": 293 + }, + { + "epoch": 0.9408, + "grad_norm": 0.49985405407575395, + "learning_rate": 1.7479603777742938e-06, + "loss": 0.8269, + "step": 294 + }, + { + "epoch": 0.944, + "grad_norm": 0.3805866889061923, + "learning_rate": 1.559629951665298e-06, + "loss": 0.7256, + "step": 295 + }, + { + "epoch": 0.9472, + "grad_norm": 0.44370791874981386, + "learning_rate": 1.3819521147851123e-06, + "loss": 0.7253, + "step": 296 + }, + { + "epoch": 0.9504, + "grad_norm": 0.4260670162379159, + "learning_rate": 1.2149460942964098e-06, + "loss": 0.7221, + "step": 297 + }, + { + "epoch": 0.9536, + "grad_norm": 0.41106896725164604, + "learning_rate": 1.05862996252597e-06, + "loss": 0.6774, + "step": 298 + }, + { + "epoch": 0.9568, + "grad_norm": 0.40712095378486146, + "learning_rate": 9.130206350089765e-07, + "loss": 0.7613, + "step": 299 + }, + { + "epoch": 0.96, + "grad_norm": 0.42998666569186, + "learning_rate": 7.781338686584927e-07, + "loss": 0.7444, + "step": 300 + }, + { + "epoch": 0.9632, + "grad_norm": 0.5435849941142359, + "learning_rate": 6.539842600603918e-07, + "loss": 0.7975, + "step": 301 + }, + { + "epoch": 0.9664, + "grad_norm": 0.4555921328168688, + "learning_rate": 5.405852438937764e-07, + "loss": 0.7239, + "step": 302 + }, + { + "epoch": 0.9696, + "grad_norm": 0.4302517927607182, + "learning_rate": 4.3794909147720773e-07, + "loss": 0.7524, + "step": 303 + }, + { + "epoch": 0.9728, + "grad_norm": 0.43365261945779077, + "learning_rate": 3.4608690944071263e-07, + "loss": 0.7458, + "step": 304 + }, + { + "epoch": 0.976, + "grad_norm": 0.4966460762888895, + "learning_rate": 2.6500863852395584e-07, + "loss": 0.7581, + "step": 305 + }, + { + "epoch": 0.9792, + "grad_norm": 0.5136891806956415, + "learning_rate": 1.947230525005006e-07, + "loss": 0.8995, + "step": 306 + }, + { + "epoch": 0.9824, + "grad_norm": 0.3911369216152323, + "learning_rate": 1.3523775722834587e-07, + "loss": 0.7568, + "step": 307 + }, + { + "epoch": 0.9856, + "grad_norm": 0.38558731844673394, + "learning_rate": 8.655918982689581e-08, + "loss": 0.6653, + "step": 308 + }, + { + "epoch": 0.9888, + "grad_norm": 0.45316318083820073, + "learning_rate": 4.8692617980350406e-08, + "loss": 0.7015, + "step": 309 + }, + { + "epoch": 0.992, + "grad_norm": 0.4903919882922413, + "learning_rate": 2.164213936770576e-08, + "loss": 0.7682, + "step": 310 + }, + { + "epoch": 0.9952, + "grad_norm": 0.40877464708263694, + "learning_rate": 5.410681219286673e-09, + "loss": 0.6875, + "step": 311 + }, + { + "epoch": 0.9984, + "grad_norm": 0.4338634341048958, + "learning_rate": 0.0, + "loss": 0.6674, + "step": 312 + }, + { + "epoch": 0.9984, + "step": 312, + "total_flos": 268297746972672.0, + "train_loss": 0.7987522908892387, + "train_runtime": 4818.1513, + "train_samples_per_second": 1.038, + "train_steps_per_second": 0.065 + } + ], + "logging_steps": 1.0, + "max_steps": 312, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 268297746972672.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_3_epochs_1_GA_4_lora/README.md b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_3_epochs_1_GA_4_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_3_epochs_1_GA_4_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_3_epochs_1_GA_4_lora/adapter_config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_3_epochs_1_GA_4_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b7f3c8ddd916a07f9ddb19b2de2d535d74d86617 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_3_epochs_1_GA_4_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "q_proj", + "v_proj", + "gate_proj", + "down_proj", + "k_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_3_epochs_1_GA_4_lora/adapter_model.safetensors b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_3_epochs_1_GA_4_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0217b29baf77a23b7fd9449433bd613b54573b26 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_3_epochs_1_GA_4_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d5ab49d0cbb59628b39a81f071d33fa784c87d1d63facadfc4e1f6ade35885d +size 671150064 diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_3_epochs_1_GA_4_lora/config.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_3_epochs_1_GA_4_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_3_epochs_1_GA_4_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_3_epochs_1_GA_4_lora/non_lora_trainables.bin b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_3_epochs_1_GA_4_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..de454bc4c8d12af371b63ecda74dc98ff8028b75 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_3_epochs_1_GA_4_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ccea06363f7010d274b2717119338b843a180aa3acd4b6a001cc95cad582498e +size 918507402 diff --git a/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_3_epochs_1_GA_4_lora/trainer_state.json b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_3_epochs_1_GA_4_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e22104f95e4f0b37dffb8c21d73c27a58f964bc9 --- /dev/null +++ b/mixing_strategies/Stratified/bugsBunny-v1_1-Llama-3-8B-V-Stratified_dataset_5000_repeat_3_epochs_1_GA_4_lora/trainer_state.json @@ -0,0 +1,1134 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9984, + "eval_steps": 500, + "global_step": 156, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0064, + "grad_norm": 0.798103541351361, + "learning_rate": 4e-05, + "loss": 1.2744, + "step": 1 + }, + { + "epoch": 0.0128, + "grad_norm": 0.8268256866411904, + "learning_rate": 8e-05, + "loss": 1.356, + "step": 2 + }, + { + "epoch": 0.0192, + "grad_norm": 0.6024838262870713, + "learning_rate": 0.00012, + "loss": 1.1987, + "step": 3 + }, + { + "epoch": 0.0256, + "grad_norm": 0.7090874925679889, + "learning_rate": 0.00016, + "loss": 1.1633, + "step": 4 + }, + { + "epoch": 0.032, + "grad_norm": 0.7767469984784777, + "learning_rate": 0.0002, + "loss": 1.0355, + "step": 5 + }, + { + "epoch": 0.0384, + "grad_norm": 0.5851130403593353, + "learning_rate": 0.0001999783578606323, + "loss": 0.9556, + "step": 6 + }, + { + "epoch": 0.0448, + "grad_norm": 0.5006197073828017, + "learning_rate": 0.0001999134408101731, + "loss": 0.9518, + "step": 7 + }, + { + "epoch": 0.0512, + "grad_norm": 0.613689066369758, + "learning_rate": 0.00019980527694749952, + "loss": 0.9631, + "step": 8 + }, + { + "epoch": 0.0576, + "grad_norm": 0.415328797805894, + "learning_rate": 0.0001996539130905593, + "loss": 0.8929, + "step": 9 + }, + { + "epoch": 0.064, + "grad_norm": 0.4515627425180524, + "learning_rate": 0.00019945941475610623, + "loss": 0.9557, + "step": 10 + }, + { + "epoch": 0.0704, + "grad_norm": 0.4575354927834304, + "learning_rate": 0.0001992218661313415, + "loss": 0.9313, + "step": 11 + }, + { + "epoch": 0.0768, + "grad_norm": 0.5521896687236068, + "learning_rate": 0.00019894137003747403, + "loss": 0.9291, + "step": 12 + }, + { + "epoch": 0.0832, + "grad_norm": 0.3977577767442164, + "learning_rate": 0.00019861804788521493, + "loss": 0.8088, + "step": 13 + }, + { + "epoch": 0.0896, + "grad_norm": 0.4657450149066985, + "learning_rate": 0.00019825203962222572, + "loss": 0.9159, + "step": 14 + }, + { + "epoch": 0.096, + "grad_norm": 0.402853389805612, + "learning_rate": 0.00019784350367254322, + "loss": 0.9126, + "step": 15 + }, + { + "epoch": 0.1024, + "grad_norm": 0.7706018254819981, + "learning_rate": 0.0001973926168680066, + "loss": 0.9101, + "step": 16 + }, + { + "epoch": 0.1088, + "grad_norm": 0.38828276045292986, + "learning_rate": 0.0001968995743717171, + "loss": 0.8458, + "step": 17 + }, + { + "epoch": 0.1152, + "grad_norm": 0.46581785750217314, + "learning_rate": 0.00019636458959356316, + "loss": 0.9317, + "step": 18 + }, + { + "epoch": 0.1216, + "grad_norm": 0.38190582131334216, + "learning_rate": 0.00019578789409784727, + "loss": 0.8162, + "step": 19 + }, + { + "epoch": 0.128, + "grad_norm": 0.3774002656162366, + "learning_rate": 0.00019516973750305532, + "loss": 0.8754, + "step": 20 + }, + { + "epoch": 0.1344, + "grad_norm": 0.4236226446364666, + "learning_rate": 0.00019451038737381077, + "loss": 0.8062, + "step": 21 + }, + { + "epoch": 0.1408, + "grad_norm": 0.38796693490725326, + "learning_rate": 0.00019381012910506146, + "loss": 0.8203, + "step": 22 + }, + { + "epoch": 0.1472, + "grad_norm": 0.38039026359896194, + "learning_rate": 0.00019306926579854821, + "loss": 0.8685, + "step": 23 + }, + { + "epoch": 0.1536, + "grad_norm": 0.386980713471517, + "learning_rate": 0.0001922881181316097, + "loss": 0.8575, + "step": 24 + }, + { + "epoch": 0.16, + "grad_norm": 0.35314630403308767, + "learning_rate": 0.0001914670242183795, + "loss": 0.8595, + "step": 25 + }, + { + "epoch": 0.1664, + "grad_norm": 0.3779210500854245, + "learning_rate": 0.0001906063394634356, + "loss": 0.8573, + "step": 26 + }, + { + "epoch": 0.1728, + "grad_norm": 0.3297621731524984, + "learning_rate": 0.00018970643640796642, + "loss": 0.8145, + "step": 27 + }, + { + "epoch": 0.1792, + "grad_norm": 0.3394455923366161, + "learning_rate": 0.00018876770456851877, + "loss": 0.7879, + "step": 28 + }, + { + "epoch": 0.1856, + "grad_norm": 0.375977988038427, + "learning_rate": 0.00018779055026839868, + "loss": 0.8579, + "step": 29 + }, + { + "epoch": 0.192, + "grad_norm": 0.340761944056107, + "learning_rate": 0.00018677539646179707, + "loss": 0.7752, + "step": 30 + }, + { + "epoch": 0.1984, + "grad_norm": 0.3455332956794238, + "learning_rate": 0.00018572268255071718, + "loss": 0.8181, + "step": 31 + }, + { + "epoch": 0.2048, + "grad_norm": 0.32710973018478223, + "learning_rate": 0.00018463286419478255, + "loss": 0.7574, + "step": 32 + }, + { + "epoch": 0.2112, + "grad_norm": 0.3500517950500944, + "learning_rate": 0.00018350641311400812, + "loss": 0.8229, + "step": 33 + }, + { + "epoch": 0.2176, + "grad_norm": 0.3595737134142622, + "learning_rate": 0.00018234381688461942, + "loss": 0.8348, + "step": 34 + }, + { + "epoch": 0.224, + "grad_norm": 0.3177571851143064, + "learning_rate": 0.00018114557872800905, + "loss": 0.7854, + "step": 35 + }, + { + "epoch": 0.2304, + "grad_norm": 0.3286868016903798, + "learning_rate": 0.0001799122172929206, + "loss": 0.8262, + "step": 36 + }, + { + "epoch": 0.2368, + "grad_norm": 0.36845715233852266, + "learning_rate": 0.0001786442664309554, + "loss": 0.8182, + "step": 37 + }, + { + "epoch": 0.2432, + "grad_norm": 0.35277395281465673, + "learning_rate": 0.0001773422749654988, + "loss": 0.8268, + "step": 38 + }, + { + "epoch": 0.2496, + "grad_norm": 0.34235646542887793, + "learning_rate": 0.00017600680645416583, + "loss": 0.774, + "step": 39 + }, + { + "epoch": 0.256, + "grad_norm": 0.32801991248244367, + "learning_rate": 0.00017463843894486937, + "loss": 0.7912, + "step": 40 + }, + { + "epoch": 0.2624, + "grad_norm": 0.3209126529312384, + "learning_rate": 0.00017323776472561627, + "loss": 0.7542, + "step": 41 + }, + { + "epoch": 0.2688, + "grad_norm": 0.40794489864543393, + "learning_rate": 0.0001718053900681397, + "loss": 0.8257, + "step": 42 + }, + { + "epoch": 0.2752, + "grad_norm": 0.3408888492524192, + "learning_rate": 0.00017034193496547902, + "loss": 0.8063, + "step": 43 + }, + { + "epoch": 0.2816, + "grad_norm": 0.35959807729052756, + "learning_rate": 0.00016884803286362, + "loss": 0.7978, + "step": 44 + }, + { + "epoch": 0.288, + "grad_norm": 0.4259774615159538, + "learning_rate": 0.00016732433038731242, + "loss": 0.8795, + "step": 45 + }, + { + "epoch": 0.2944, + "grad_norm": 0.3448967552992956, + "learning_rate": 0.00016577148706018328, + "loss": 0.8024, + "step": 46 + }, + { + "epoch": 0.3008, + "grad_norm": 0.3272033029206055, + "learning_rate": 0.00016419017501926656, + "loss": 0.8139, + "step": 47 + }, + { + "epoch": 0.3072, + "grad_norm": 0.3479088858440763, + "learning_rate": 0.00016258107872407375, + "loss": 0.8182, + "step": 48 + }, + { + "epoch": 0.3136, + "grad_norm": 0.359132852878714, + "learning_rate": 0.00016094489466033043, + "loss": 0.8594, + "step": 49 + }, + { + "epoch": 0.32, + "grad_norm": 0.31490254874430174, + "learning_rate": 0.0001592823310385073, + "loss": 0.7838, + "step": 50 + }, + { + "epoch": 0.3264, + "grad_norm": 0.354879440739144, + "learning_rate": 0.00015759410748727662, + "loss": 0.8281, + "step": 51 + }, + { + "epoch": 0.3328, + "grad_norm": 0.327843049496559, + "learning_rate": 0.00015588095474202595, + "loss": 0.7506, + "step": 52 + }, + { + "epoch": 0.3392, + "grad_norm": 0.40599580002804275, + "learning_rate": 0.00015414361432856475, + "loss": 0.7735, + "step": 53 + }, + { + "epoch": 0.3456, + "grad_norm": 0.3139259928133556, + "learning_rate": 0.00015238283824216015, + "loss": 0.7594, + "step": 54 + }, + { + "epoch": 0.352, + "grad_norm": 0.34277518713692245, + "learning_rate": 0.00015059938862204127, + "loss": 0.8431, + "step": 55 + }, + { + "epoch": 0.3584, + "grad_norm": 0.337432838751828, + "learning_rate": 0.00014879403742151283, + "loss": 0.7585, + "step": 56 + }, + { + "epoch": 0.3648, + "grad_norm": 0.36705196571229975, + "learning_rate": 0.0001469675660738206, + "loss": 0.8127, + "step": 57 + }, + { + "epoch": 0.3712, + "grad_norm": 0.33158871878903523, + "learning_rate": 0.00014512076515391375, + "loss": 0.7825, + "step": 58 + }, + { + "epoch": 0.3776, + "grad_norm": 0.3439731934454538, + "learning_rate": 0.0001432544340362501, + "loss": 0.8281, + "step": 59 + }, + { + "epoch": 0.384, + "grad_norm": 0.318083568421075, + "learning_rate": 0.00014136938054879283, + "loss": 0.7892, + "step": 60 + }, + { + "epoch": 0.3904, + "grad_norm": 0.31565952036149936, + "learning_rate": 0.00013946642062334766, + "loss": 0.7911, + "step": 61 + }, + { + "epoch": 0.3968, + "grad_norm": 0.30865906135962495, + "learning_rate": 0.000137546377942393, + "loss": 0.7773, + "step": 62 + }, + { + "epoch": 0.4032, + "grad_norm": 0.3302165770043087, + "learning_rate": 0.00013561008358255468, + "loss": 0.7857, + "step": 63 + }, + { + "epoch": 0.4096, + "grad_norm": 0.4252224960949748, + "learning_rate": 0.00013365837565488064, + "loss": 0.842, + "step": 64 + }, + { + "epoch": 0.416, + "grad_norm": 0.3603281725920143, + "learning_rate": 0.0001316920989420703, + "loss": 0.7999, + "step": 65 + }, + { + "epoch": 0.4224, + "grad_norm": 0.31189541399551407, + "learning_rate": 0.00012971210453281674, + "loss": 0.7535, + "step": 66 + }, + { + "epoch": 0.4288, + "grad_norm": 0.3087015288022916, + "learning_rate": 0.00012771924945341906, + "loss": 0.7915, + "step": 67 + }, + { + "epoch": 0.4352, + "grad_norm": 0.305018125969194, + "learning_rate": 0.0001257143962968246, + "loss": 0.7432, + "step": 68 + }, + { + "epoch": 0.4416, + "grad_norm": 0.2934928706117203, + "learning_rate": 0.00012369841284926188, + "loss": 0.7104, + "step": 69 + }, + { + "epoch": 0.448, + "grad_norm": 0.3030298084243325, + "learning_rate": 0.00012167217171462566, + "loss": 0.8268, + "step": 70 + }, + { + "epoch": 0.4544, + "grad_norm": 0.3438139485734787, + "learning_rate": 0.00011963654993677645, + "loss": 0.7616, + "step": 71 + }, + { + "epoch": 0.4608, + "grad_norm": 0.39297250837172915, + "learning_rate": 0.00011759242861991855, + "loss": 0.8398, + "step": 72 + }, + { + "epoch": 0.4672, + "grad_norm": 0.33471051721291323, + "learning_rate": 0.00011554069254722051, + "loss": 0.8027, + "step": 73 + }, + { + "epoch": 0.4736, + "grad_norm": 0.3065066927350221, + "learning_rate": 0.00011348222979784289, + "loss": 0.7905, + "step": 74 + }, + { + "epoch": 0.48, + "grad_norm": 0.33318943398753137, + "learning_rate": 0.00011141793136253986, + "loss": 0.7853, + "step": 75 + }, + { + "epoch": 0.4864, + "grad_norm": 0.42129473104900184, + "learning_rate": 0.000109348690758, + "loss": 0.8072, + "step": 76 + }, + { + "epoch": 0.4928, + "grad_norm": 0.32436875336084614, + "learning_rate": 0.0001072754036400944, + "loss": 0.7448, + "step": 77 + }, + { + "epoch": 0.4992, + "grad_norm": 0.27784398294634927, + "learning_rate": 0.00010519896741619803, + "loss": 0.7221, + "step": 78 + }, + { + "epoch": 0.5056, + "grad_norm": 0.3055115279913575, + "learning_rate": 0.00010312028085675391, + "loss": 0.7319, + "step": 79 + }, + { + "epoch": 0.512, + "grad_norm": 0.31640581286782005, + "learning_rate": 0.00010104024370624644, + "loss": 0.7814, + "step": 80 + }, + { + "epoch": 0.5184, + "grad_norm": 0.2933834108033171, + "learning_rate": 9.895975629375359e-05, + "loss": 0.7461, + "step": 81 + }, + { + "epoch": 0.5248, + "grad_norm": 0.3231004000455766, + "learning_rate": 9.687971914324607e-05, + "loss": 0.7646, + "step": 82 + }, + { + "epoch": 0.5312, + "grad_norm": 0.32688724585582374, + "learning_rate": 9.480103258380198e-05, + "loss": 0.8137, + "step": 83 + }, + { + "epoch": 0.5376, + "grad_norm": 0.3215700871727219, + "learning_rate": 9.272459635990562e-05, + "loss": 0.7467, + "step": 84 + }, + { + "epoch": 0.544, + "grad_norm": 0.3225959581760687, + "learning_rate": 9.065130924199998e-05, + "loss": 0.7534, + "step": 85 + }, + { + "epoch": 0.5504, + "grad_norm": 0.3380772436668406, + "learning_rate": 8.858206863746018e-05, + "loss": 0.7856, + "step": 86 + }, + { + "epoch": 0.5568, + "grad_norm": 0.3947076794390273, + "learning_rate": 8.651777020215712e-05, + "loss": 0.7596, + "step": 87 + }, + { + "epoch": 0.5632, + "grad_norm": 0.32682860564478994, + "learning_rate": 8.445930745277953e-05, + "loss": 0.7352, + "step": 88 + }, + { + "epoch": 0.5696, + "grad_norm": 0.3107025811034861, + "learning_rate": 8.240757138008149e-05, + "loss": 0.7378, + "step": 89 + }, + { + "epoch": 0.576, + "grad_norm": 0.3431174414700978, + "learning_rate": 8.036345006322359e-05, + "loss": 0.8123, + "step": 90 + }, + { + "epoch": 0.5824, + "grad_norm": 0.41848267196202366, + "learning_rate": 7.832782828537437e-05, + "loss": 0.8216, + "step": 91 + }, + { + "epoch": 0.5888, + "grad_norm": 0.32206129050676485, + "learning_rate": 7.630158715073813e-05, + "loss": 0.7476, + "step": 92 + }, + { + "epoch": 0.5952, + "grad_norm": 0.3797253454059359, + "learning_rate": 7.428560370317542e-05, + "loss": 0.8047, + "step": 93 + }, + { + "epoch": 0.6016, + "grad_norm": 0.37579587132321085, + "learning_rate": 7.228075054658096e-05, + "loss": 0.7657, + "step": 94 + }, + { + "epoch": 0.608, + "grad_norm": 0.3144160021303849, + "learning_rate": 7.028789546718326e-05, + "loss": 0.7588, + "step": 95 + }, + { + "epoch": 0.6144, + "grad_norm": 0.31872120942680743, + "learning_rate": 6.830790105792973e-05, + "loss": 0.8133, + "step": 96 + }, + { + "epoch": 0.6208, + "grad_norm": 0.33461256383139165, + "learning_rate": 6.63416243451194e-05, + "loss": 0.7537, + "step": 97 + }, + { + "epoch": 0.6272, + "grad_norm": 0.34544511746242657, + "learning_rate": 6.43899164174453e-05, + "loss": 0.8093, + "step": 98 + }, + { + "epoch": 0.6336, + "grad_norm": 0.3556859669902957, + "learning_rate": 6.245362205760704e-05, + "loss": 0.8024, + "step": 99 + }, + { + "epoch": 0.64, + "grad_norm": 0.2975776942983784, + "learning_rate": 6.053357937665237e-05, + "loss": 0.7544, + "step": 100 + }, + { + "epoch": 0.6464, + "grad_norm": 0.2924978663967572, + "learning_rate": 5.863061945120719e-05, + "loss": 0.7422, + "step": 101 + }, + { + "epoch": 0.6528, + "grad_norm": 0.3633837849321103, + "learning_rate": 5.6745565963749925e-05, + "loss": 0.8153, + "step": 102 + }, + { + "epoch": 0.6592, + "grad_norm": 0.3559642361047184, + "learning_rate": 5.487923484608629e-05, + "loss": 0.804, + "step": 103 + }, + { + "epoch": 0.6656, + "grad_norm": 0.30450721922184676, + "learning_rate": 5.3032433926179395e-05, + "loss": 0.7354, + "step": 104 + }, + { + "epoch": 0.672, + "grad_norm": 0.4018138547264168, + "learning_rate": 5.1205962578487155e-05, + "loss": 0.7881, + "step": 105 + }, + { + "epoch": 0.6784, + "grad_norm": 0.31572731104326274, + "learning_rate": 4.940061137795876e-05, + "loss": 0.7304, + "step": 106 + }, + { + "epoch": 0.6848, + "grad_norm": 0.3202202734392908, + "learning_rate": 4.761716175783989e-05, + "loss": 0.8013, + "step": 107 + }, + { + "epoch": 0.6912, + "grad_norm": 0.32290299485712043, + "learning_rate": 4.585638567143529e-05, + "loss": 0.8035, + "step": 108 + }, + { + "epoch": 0.6976, + "grad_norm": 0.3848146563924532, + "learning_rate": 4.411904525797408e-05, + "loss": 0.8116, + "step": 109 + }, + { + "epoch": 0.704, + "grad_norm": 0.3013951272060788, + "learning_rate": 4.240589251272342e-05, + "loss": 0.7257, + "step": 110 + }, + { + "epoch": 0.7104, + "grad_norm": 0.3322449631033, + "learning_rate": 4.071766896149273e-05, + "loss": 0.7721, + "step": 111 + }, + { + "epoch": 0.7168, + "grad_norm": 0.30745286582323006, + "learning_rate": 3.9055105339669595e-05, + "loss": 0.7092, + "step": 112 + }, + { + "epoch": 0.7232, + "grad_norm": 0.3085363085926714, + "learning_rate": 3.741892127592625e-05, + "loss": 0.7815, + "step": 113 + }, + { + "epoch": 0.7296, + "grad_norm": 0.27770721320241926, + "learning_rate": 3.580982498073344e-05, + "loss": 0.6754, + "step": 114 + }, + { + "epoch": 0.736, + "grad_norm": 0.3128107292452356, + "learning_rate": 3.422851293981676e-05, + "loss": 0.7151, + "step": 115 + }, + { + "epoch": 0.7424, + "grad_norm": 0.3136638116475556, + "learning_rate": 3.2675669612687565e-05, + "loss": 0.7577, + "step": 116 + }, + { + "epoch": 0.7488, + "grad_norm": 0.3197174021974061, + "learning_rate": 3.115196713638e-05, + "loss": 0.7777, + "step": 117 + }, + { + "epoch": 0.7552, + "grad_norm": 0.38070102732191174, + "learning_rate": 2.9658065034520978e-05, + "loss": 0.8077, + "step": 118 + }, + { + "epoch": 0.7616, + "grad_norm": 0.32484409248060714, + "learning_rate": 2.8194609931860316e-05, + "loss": 0.7242, + "step": 119 + }, + { + "epoch": 0.768, + "grad_norm": 0.30865721254742123, + "learning_rate": 2.6762235274383772e-05, + "loss": 0.7363, + "step": 120 + }, + { + "epoch": 0.7744, + "grad_norm": 0.2782127109708396, + "learning_rate": 2.536156105513062e-05, + "loss": 0.7441, + "step": 121 + }, + { + "epoch": 0.7808, + "grad_norm": 0.3862800418970225, + "learning_rate": 2.399319354583418e-05, + "loss": 0.8741, + "step": 122 + }, + { + "epoch": 0.7872, + "grad_norm": 0.36622365622341957, + "learning_rate": 2.265772503450122e-05, + "loss": 0.7481, + "step": 123 + }, + { + "epoch": 0.7936, + "grad_norm": 0.4012346259284241, + "learning_rate": 2.1355733569044635e-05, + "loss": 0.7357, + "step": 124 + }, + { + "epoch": 0.8, + "grad_norm": 0.3533622717190259, + "learning_rate": 2.008778270707944e-05, + "loss": 0.7763, + "step": 125 + }, + { + "epoch": 0.8064, + "grad_norm": 0.33243555929314145, + "learning_rate": 1.8854421271990964e-05, + "loss": 0.7787, + "step": 126 + }, + { + "epoch": 0.8128, + "grad_norm": 0.3139784350612795, + "learning_rate": 1.7656183115380577e-05, + "loss": 0.6837, + "step": 127 + }, + { + "epoch": 0.8192, + "grad_norm": 0.2930961825783699, + "learning_rate": 1.649358688599191e-05, + "loss": 0.6893, + "step": 128 + }, + { + "epoch": 0.8256, + "grad_norm": 0.3132000826330831, + "learning_rate": 1.5367135805217458e-05, + "loss": 0.7264, + "step": 129 + }, + { + "epoch": 0.832, + "grad_norm": 0.33020574839410205, + "learning_rate": 1.4277317449282834e-05, + "loss": 0.7152, + "step": 130 + }, + { + "epoch": 0.8384, + "grad_norm": 0.36208350989479543, + "learning_rate": 1.3224603538202929e-05, + "loss": 0.7622, + "step": 131 + }, + { + "epoch": 0.8448, + "grad_norm": 0.3025134339835463, + "learning_rate": 1.220944973160133e-05, + "loss": 0.7842, + "step": 132 + }, + { + "epoch": 0.8512, + "grad_norm": 0.3157431836217189, + "learning_rate": 1.1232295431481222e-05, + "loss": 0.7883, + "step": 133 + }, + { + "epoch": 0.8576, + "grad_norm": 0.30264971788974415, + "learning_rate": 1.0293563592033595e-05, + "loss": 0.7493, + "step": 134 + }, + { + "epoch": 0.864, + "grad_norm": 0.3020946816597262, + "learning_rate": 9.393660536564408e-06, + "loss": 0.7403, + "step": 135 + }, + { + "epoch": 0.8704, + "grad_norm": 0.29825579056874574, + "learning_rate": 8.532975781620512e-06, + "loss": 0.7303, + "step": 136 + }, + { + "epoch": 0.8768, + "grad_norm": 0.34805042849674267, + "learning_rate": 7.711881868390291e-06, + "loss": 0.868, + "step": 137 + }, + { + "epoch": 0.8832, + "grad_norm": 0.30748878976922644, + "learning_rate": 6.930734201451816e-06, + "loss": 0.7443, + "step": 138 + }, + { + "epoch": 0.8896, + "grad_norm": 0.31312920419810836, + "learning_rate": 6.189870894938587e-06, + "loss": 0.7381, + "step": 139 + }, + { + "epoch": 0.896, + "grad_norm": 0.3448656445594576, + "learning_rate": 5.489612626189245e-06, + "loss": 0.7215, + "step": 140 + }, + { + "epoch": 0.9024, + "grad_norm": 0.34366821186403623, + "learning_rate": 4.830262496944693e-06, + "loss": 0.7987, + "step": 141 + }, + { + "epoch": 0.9088, + "grad_norm": 0.2842539946388669, + "learning_rate": 4.21210590215273e-06, + "loss": 0.6935, + "step": 142 + }, + { + "epoch": 0.9152, + "grad_norm": 0.33709204356585964, + "learning_rate": 3.6354104064368566e-06, + "loss": 0.7689, + "step": 143 + }, + { + "epoch": 0.9216, + "grad_norm": 0.4085295140872269, + "learning_rate": 3.100425628282899e-06, + "loss": 0.8048, + "step": 144 + }, + { + "epoch": 0.928, + "grad_norm": 0.2966532953057819, + "learning_rate": 2.607383131993424e-06, + "loss": 0.7672, + "step": 145 + }, + { + "epoch": 0.9344, + "grad_norm": 0.33309653198101075, + "learning_rate": 2.1564963274568027e-06, + "loss": 0.7779, + "step": 146 + }, + { + "epoch": 0.9408, + "grad_norm": 0.3209119196082951, + "learning_rate": 1.7479603777742938e-06, + "loss": 0.7653, + "step": 147 + }, + { + "epoch": 0.9472, + "grad_norm": 0.3002663218377736, + "learning_rate": 1.3819521147851123e-06, + "loss": 0.7318, + "step": 148 + }, + { + "epoch": 0.9536, + "grad_norm": 0.2974717069333661, + "learning_rate": 1.05862996252597e-06, + "loss": 0.7078, + "step": 149 + }, + { + "epoch": 0.96, + "grad_norm": 0.3349525244186073, + "learning_rate": 7.781338686584927e-07, + "loss": 0.7619, + "step": 150 + }, + { + "epoch": 0.9664, + "grad_norm": 0.3704839046536139, + "learning_rate": 5.405852438937764e-07, + "loss": 0.7687, + "step": 151 + }, + { + "epoch": 0.9728, + "grad_norm": 0.3106434251504329, + "learning_rate": 3.4608690944071263e-07, + "loss": 0.7551, + "step": 152 + }, + { + "epoch": 0.9792, + "grad_norm": 0.4092648868636652, + "learning_rate": 1.947230525005006e-07, + "loss": 0.8391, + "step": 153 + }, + { + "epoch": 0.9856, + "grad_norm": 0.2886810815093622, + "learning_rate": 8.655918982689581e-08, + "loss": 0.7204, + "step": 154 + }, + { + "epoch": 0.992, + "grad_norm": 0.3459032074889401, + "learning_rate": 2.164213936770576e-08, + "loss": 0.7398, + "step": 155 + }, + { + "epoch": 0.9984, + "grad_norm": 0.3167726444330969, + "learning_rate": 0.0, + "loss": 0.6865, + "step": 156 + }, + { + "epoch": 0.9984, + "step": 156, + "total_flos": 391611316568064.0, + "train_loss": 0.8046577454377444, + "train_runtime": 4785.766, + "train_samples_per_second": 1.045, + "train_steps_per_second": 0.033 + } + ], + "logging_steps": 1.0, + "max_steps": 156, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 391611316568064.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_1_epochs_1_GA_2_lora/README.md b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_1_epochs_1_GA_2_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_1_epochs_1_GA_2_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_1_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_1_epochs_1_GA_2_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e66c2bb24e52f99f14dcfa8f9782b020664fc4bd --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_1_epochs_1_GA_2_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "up_proj", + "down_proj", + "gate_proj", + "q_proj", + "k_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e47d0af5868b4d3b3bf4b2bca2faf540b151083a --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65b661237c42e5f110dfe3f432de15d54a240dfed342889c942cda0caf23b07d +size 671150064 diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_1_epochs_1_GA_2_lora/config.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_1_epochs_1_GA_2_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_1_epochs_1_GA_2_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..9622252d9037762d6d42e97c1afb6aafca440ad9 --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2de6d6d50a593dd1fef4b431769a19f06bab0ed049a2e5b55eceabbd2699d14 +size 918507402 diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_1_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_1_epochs_1_GA_2_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a124a9f137de2e63b9f528ad27572d2ce2f217c9 --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_1_epochs_1_GA_2_lora/trainer_state.json @@ -0,0 +1,4417 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 625, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0016, + "grad_norm": 0.7705987921299485, + "learning_rate": 1.0526315789473684e-05, + "loss": 1.2058, + "step": 1 + }, + { + "epoch": 0.0032, + "grad_norm": 0.9287416636509861, + "learning_rate": 2.105263157894737e-05, + "loss": 1.4134, + "step": 2 + }, + { + "epoch": 0.0048, + "grad_norm": 0.6528262057381008, + "learning_rate": 3.157894736842105e-05, + "loss": 1.1559, + "step": 3 + }, + { + "epoch": 0.0064, + "grad_norm": 0.6765882832433683, + "learning_rate": 4.210526315789474e-05, + "loss": 1.1743, + "step": 4 + }, + { + "epoch": 0.008, + "grad_norm": 0.648079038578141, + "learning_rate": 5.2631578947368424e-05, + "loss": 1.2401, + "step": 5 + }, + { + "epoch": 0.0096, + "grad_norm": 0.6566661666440996, + "learning_rate": 6.31578947368421e-05, + "loss": 1.1006, + "step": 6 + }, + { + "epoch": 0.0112, + "grad_norm": 0.8548354477419776, + "learning_rate": 7.368421052631579e-05, + "loss": 1.0458, + "step": 7 + }, + { + "epoch": 0.0128, + "grad_norm": 1.543740331558598, + "learning_rate": 8.421052631578948e-05, + "loss": 0.9982, + "step": 8 + }, + { + "epoch": 0.0144, + "grad_norm": 0.7318228644416274, + "learning_rate": 9.473684210526316e-05, + "loss": 0.9406, + "step": 9 + }, + { + "epoch": 0.016, + "grad_norm": 0.7417915994317061, + "learning_rate": 0.00010526315789473685, + "loss": 1.0351, + "step": 10 + }, + { + "epoch": 0.0176, + "grad_norm": 0.5922704421680609, + "learning_rate": 0.00011578947368421053, + "loss": 0.9339, + "step": 11 + }, + { + "epoch": 0.0192, + "grad_norm": 0.5862372762823224, + "learning_rate": 0.0001263157894736842, + "loss": 0.9481, + "step": 12 + }, + { + "epoch": 0.0208, + "grad_norm": 0.535213198327505, + "learning_rate": 0.0001368421052631579, + "loss": 0.9431, + "step": 13 + }, + { + "epoch": 0.0224, + "grad_norm": 0.5089424410764984, + "learning_rate": 0.00014736842105263158, + "loss": 0.9458, + "step": 14 + }, + { + "epoch": 0.024, + "grad_norm": 0.5097782413871194, + "learning_rate": 0.00015789473684210527, + "loss": 0.9507, + "step": 15 + }, + { + "epoch": 0.0256, + "grad_norm": 0.47556040207833583, + "learning_rate": 0.00016842105263157895, + "loss": 0.8975, + "step": 16 + }, + { + "epoch": 0.0272, + "grad_norm": 0.5018954188069049, + "learning_rate": 0.00017894736842105264, + "loss": 0.9151, + "step": 17 + }, + { + "epoch": 0.0288, + "grad_norm": 0.5144020872477301, + "learning_rate": 0.00018947368421052632, + "loss": 0.9505, + "step": 18 + }, + { + "epoch": 0.0304, + "grad_norm": 0.43303216280426504, + "learning_rate": 0.0002, + "loss": 0.8126, + "step": 19 + }, + { + "epoch": 0.032, + "grad_norm": 0.5072332356758589, + "learning_rate": 0.00019999865623437013, + "loss": 0.8382, + "step": 20 + }, + { + "epoch": 0.0336, + "grad_norm": 0.5120915383249092, + "learning_rate": 0.00019999462497359466, + "loss": 0.8401, + "step": 21 + }, + { + "epoch": 0.0352, + "grad_norm": 0.5421958032379722, + "learning_rate": 0.00019998790632601496, + "loss": 0.8774, + "step": 22 + }, + { + "epoch": 0.0368, + "grad_norm": 0.5150299514545867, + "learning_rate": 0.0001999785004721968, + "loss": 0.9377, + "step": 23 + }, + { + "epoch": 0.0384, + "grad_norm": 0.4721233844843988, + "learning_rate": 0.00019996640766492543, + "loss": 0.8796, + "step": 24 + }, + { + "epoch": 0.04, + "grad_norm": 0.4823898019933978, + "learning_rate": 0.00019995162822919883, + "loss": 0.8918, + "step": 25 + }, + { + "epoch": 0.0416, + "grad_norm": 0.4541381232034085, + "learning_rate": 0.00019993416256221895, + "loss": 0.8997, + "step": 26 + }, + { + "epoch": 0.0432, + "grad_norm": 0.4798261742844607, + "learning_rate": 0.00019991401113338104, + "loss": 0.8502, + "step": 27 + }, + { + "epoch": 0.0448, + "grad_norm": 0.4551744043933959, + "learning_rate": 0.00019989117448426108, + "loss": 0.9257, + "step": 28 + }, + { + "epoch": 0.0464, + "grad_norm": 0.4775774615905478, + "learning_rate": 0.00019986565322860115, + "loss": 0.9352, + "step": 29 + }, + { + "epoch": 0.048, + "grad_norm": 0.48409776638768615, + "learning_rate": 0.00019983744805229296, + "loss": 0.8945, + "step": 30 + }, + { + "epoch": 0.0496, + "grad_norm": 0.44386948052851427, + "learning_rate": 0.00019980655971335945, + "loss": 0.8241, + "step": 31 + }, + { + "epoch": 0.0512, + "grad_norm": 0.48822961505738594, + "learning_rate": 0.00019977298904193437, + "loss": 0.9087, + "step": 32 + }, + { + "epoch": 0.0528, + "grad_norm": 0.4991405828880664, + "learning_rate": 0.00019973673694024, + "loss": 0.879, + "step": 33 + }, + { + "epoch": 0.0544, + "grad_norm": 0.47567582863477037, + "learning_rate": 0.00019969780438256293, + "loss": 0.8483, + "step": 34 + }, + { + "epoch": 0.056, + "grad_norm": 0.51873526165059, + "learning_rate": 0.0001996561924152278, + "loss": 0.8829, + "step": 35 + }, + { + "epoch": 0.0576, + "grad_norm": 0.5011861028018147, + "learning_rate": 0.0001996119021565693, + "loss": 0.912, + "step": 36 + }, + { + "epoch": 0.0592, + "grad_norm": 0.504469326779835, + "learning_rate": 0.0001995649347969019, + "loss": 0.8259, + "step": 37 + }, + { + "epoch": 0.0608, + "grad_norm": 0.3637338707859365, + "learning_rate": 0.00019951529159848805, + "loss": 0.7404, + "step": 38 + }, + { + "epoch": 0.0624, + "grad_norm": 0.4284421793999004, + "learning_rate": 0.00019946297389550433, + "loss": 0.7995, + "step": 39 + }, + { + "epoch": 0.064, + "grad_norm": 0.5034556480799789, + "learning_rate": 0.00019940798309400526, + "loss": 0.9058, + "step": 40 + }, + { + "epoch": 0.0656, + "grad_norm": 0.5436751511445966, + "learning_rate": 0.0001993503206718859, + "loss": 0.9353, + "step": 41 + }, + { + "epoch": 0.0672, + "grad_norm": 0.42969339484572733, + "learning_rate": 0.00019928998817884182, + "loss": 0.8426, + "step": 42 + }, + { + "epoch": 0.0688, + "grad_norm": 0.42490544608441266, + "learning_rate": 0.00019922698723632767, + "loss": 0.7866, + "step": 43 + }, + { + "epoch": 0.0704, + "grad_norm": 0.44823434854700056, + "learning_rate": 0.00019916131953751342, + "loss": 0.8101, + "step": 44 + }, + { + "epoch": 0.072, + "grad_norm": 0.5603168797723098, + "learning_rate": 0.00019909298684723904, + "loss": 1.0164, + "step": 45 + }, + { + "epoch": 0.0736, + "grad_norm": 0.47644193806778984, + "learning_rate": 0.00019902199100196697, + "loss": 0.8952, + "step": 46 + }, + { + "epoch": 0.0752, + "grad_norm": 0.43382576577650683, + "learning_rate": 0.00019894833390973266, + "loss": 0.7644, + "step": 47 + }, + { + "epoch": 0.0768, + "grad_norm": 0.4590678040474791, + "learning_rate": 0.00019887201755009357, + "loss": 0.797, + "step": 48 + }, + { + "epoch": 0.0784, + "grad_norm": 0.45059180280160405, + "learning_rate": 0.0001987930439740757, + "loss": 0.8682, + "step": 49 + }, + { + "epoch": 0.08, + "grad_norm": 0.458024289787967, + "learning_rate": 0.00019871141530411853, + "loss": 0.8254, + "step": 50 + }, + { + "epoch": 0.0816, + "grad_norm": 0.4708236548074165, + "learning_rate": 0.0001986271337340182, + "loss": 0.9232, + "step": 51 + }, + { + "epoch": 0.0832, + "grad_norm": 0.4107265066815477, + "learning_rate": 0.00019854020152886814, + "loss": 0.7776, + "step": 52 + }, + { + "epoch": 0.0848, + "grad_norm": 0.44617773282303985, + "learning_rate": 0.0001984506210249986, + "loss": 0.8822, + "step": 53 + }, + { + "epoch": 0.0864, + "grad_norm": 0.5143311662028887, + "learning_rate": 0.00019835839462991361, + "loss": 0.9311, + "step": 54 + }, + { + "epoch": 0.088, + "grad_norm": 0.410377419821763, + "learning_rate": 0.00019826352482222638, + "loss": 0.8152, + "step": 55 + }, + { + "epoch": 0.0896, + "grad_norm": 0.45038945127365737, + "learning_rate": 0.00019816601415159263, + "loss": 0.8273, + "step": 56 + }, + { + "epoch": 0.0912, + "grad_norm": 0.4689868980609412, + "learning_rate": 0.0001980658652386421, + "loss": 0.9484, + "step": 57 + }, + { + "epoch": 0.0928, + "grad_norm": 0.4086286046140391, + "learning_rate": 0.00019796308077490817, + "loss": 0.7996, + "step": 58 + }, + { + "epoch": 0.0944, + "grad_norm": 0.3925097264811042, + "learning_rate": 0.00019785766352275542, + "loss": 0.7805, + "step": 59 + }, + { + "epoch": 0.096, + "grad_norm": 0.4468204196738424, + "learning_rate": 0.00019774961631530545, + "loss": 0.8896, + "step": 60 + }, + { + "epoch": 0.0976, + "grad_norm": 0.427897958968694, + "learning_rate": 0.00019763894205636072, + "loss": 0.8762, + "step": 61 + }, + { + "epoch": 0.0992, + "grad_norm": 0.4694502565493321, + "learning_rate": 0.00019752564372032657, + "loss": 0.8481, + "step": 62 + }, + { + "epoch": 0.1008, + "grad_norm": 0.44286505229187695, + "learning_rate": 0.00019740972435213115, + "loss": 0.7239, + "step": 63 + }, + { + "epoch": 0.1024, + "grad_norm": 0.38451087694388386, + "learning_rate": 0.00019729118706714375, + "loss": 0.7907, + "step": 64 + }, + { + "epoch": 0.104, + "grad_norm": 0.474696902181304, + "learning_rate": 0.00019717003505109095, + "loss": 0.9338, + "step": 65 + }, + { + "epoch": 0.1056, + "grad_norm": 0.3941485079754015, + "learning_rate": 0.00019704627155997108, + "loss": 0.8234, + "step": 66 + }, + { + "epoch": 0.1072, + "grad_norm": 0.42634108625304457, + "learning_rate": 0.00019691989991996663, + "loss": 0.7857, + "step": 67 + }, + { + "epoch": 0.1088, + "grad_norm": 0.43721561580101864, + "learning_rate": 0.0001967909235273549, + "loss": 0.755, + "step": 68 + }, + { + "epoch": 0.1104, + "grad_norm": 0.41400298876290526, + "learning_rate": 0.00019665934584841682, + "loss": 0.8191, + "step": 69 + }, + { + "epoch": 0.112, + "grad_norm": 0.4111519335093886, + "learning_rate": 0.00019652517041934356, + "loss": 0.7836, + "step": 70 + }, + { + "epoch": 0.1136, + "grad_norm": 0.3981205896778025, + "learning_rate": 0.00019638840084614182, + "loss": 0.8067, + "step": 71 + }, + { + "epoch": 0.1152, + "grad_norm": 0.3944168309890051, + "learning_rate": 0.00019624904080453655, + "loss": 0.757, + "step": 72 + }, + { + "epoch": 0.1168, + "grad_norm": 0.4378681168720987, + "learning_rate": 0.00019610709403987246, + "loss": 0.823, + "step": 73 + }, + { + "epoch": 0.1184, + "grad_norm": 0.45930105215478095, + "learning_rate": 0.00019596256436701324, + "loss": 0.8343, + "step": 74 + }, + { + "epoch": 0.12, + "grad_norm": 0.44690939308144634, + "learning_rate": 0.000195815455670239, + "loss": 0.8888, + "step": 75 + }, + { + "epoch": 0.1216, + "grad_norm": 0.41052669970769035, + "learning_rate": 0.00019566577190314197, + "loss": 0.8003, + "step": 76 + }, + { + "epoch": 0.1232, + "grad_norm": 0.42790787379025635, + "learning_rate": 0.0001955135170885202, + "loss": 0.7959, + "step": 77 + }, + { + "epoch": 0.1248, + "grad_norm": 0.4175218455704906, + "learning_rate": 0.00019535869531826937, + "loss": 0.8134, + "step": 78 + }, + { + "epoch": 0.1264, + "grad_norm": 0.48152789594791556, + "learning_rate": 0.00019520131075327298, + "loss": 0.7923, + "step": 79 + }, + { + "epoch": 0.128, + "grad_norm": 0.37106931295853823, + "learning_rate": 0.00019504136762329047, + "loss": 0.7594, + "step": 80 + }, + { + "epoch": 0.1296, + "grad_norm": 0.39560355388219715, + "learning_rate": 0.00019487887022684336, + "loss": 0.8018, + "step": 81 + }, + { + "epoch": 0.1312, + "grad_norm": 0.423707151791688, + "learning_rate": 0.00019471382293110003, + "loss": 0.7834, + "step": 82 + }, + { + "epoch": 0.1328, + "grad_norm": 0.4120453080503355, + "learning_rate": 0.00019454623017175812, + "loss": 0.7546, + "step": 83 + }, + { + "epoch": 0.1344, + "grad_norm": 0.4113061305774698, + "learning_rate": 0.00019437609645292546, + "loss": 0.7901, + "step": 84 + }, + { + "epoch": 0.136, + "grad_norm": 0.46589571714326883, + "learning_rate": 0.0001942034263469989, + "loss": 0.8592, + "step": 85 + }, + { + "epoch": 0.1376, + "grad_norm": 0.4305164674690721, + "learning_rate": 0.00019402822449454153, + "loss": 0.8756, + "step": 86 + }, + { + "epoch": 0.1392, + "grad_norm": 0.42281779829677335, + "learning_rate": 0.00019385049560415794, + "loss": 0.8405, + "step": 87 + }, + { + "epoch": 0.1408, + "grad_norm": 0.4525954367363889, + "learning_rate": 0.00019367024445236754, + "loss": 0.8727, + "step": 88 + }, + { + "epoch": 0.1424, + "grad_norm": 0.45227946434780003, + "learning_rate": 0.00019348747588347637, + "loss": 0.8716, + "step": 89 + }, + { + "epoch": 0.144, + "grad_norm": 0.44211617864856445, + "learning_rate": 0.00019330219480944694, + "loss": 0.8879, + "step": 90 + }, + { + "epoch": 0.1456, + "grad_norm": 0.40256649897418, + "learning_rate": 0.00019311440620976597, + "loss": 0.7606, + "step": 91 + }, + { + "epoch": 0.1472, + "grad_norm": 0.3741521146665791, + "learning_rate": 0.0001929241151313108, + "loss": 0.6961, + "step": 92 + }, + { + "epoch": 0.1488, + "grad_norm": 0.3705518303191532, + "learning_rate": 0.00019273132668821364, + "loss": 0.7518, + "step": 93 + }, + { + "epoch": 0.1504, + "grad_norm": 0.4752188811217134, + "learning_rate": 0.00019253604606172417, + "loss": 0.8534, + "step": 94 + }, + { + "epoch": 0.152, + "grad_norm": 0.4102678943154998, + "learning_rate": 0.00019233827850007027, + "loss": 0.7634, + "step": 95 + }, + { + "epoch": 0.1536, + "grad_norm": 0.45507608170811004, + "learning_rate": 0.00019213802931831696, + "loss": 0.8659, + "step": 96 + }, + { + "epoch": 0.1552, + "grad_norm": 0.42180581722244276, + "learning_rate": 0.00019193530389822363, + "loss": 0.8058, + "step": 97 + }, + { + "epoch": 0.1568, + "grad_norm": 0.460635684487175, + "learning_rate": 0.00019173010768809933, + "loss": 0.8622, + "step": 98 + }, + { + "epoch": 0.1584, + "grad_norm": 0.3851113848164485, + "learning_rate": 0.0001915224462026563, + "loss": 0.7331, + "step": 99 + }, + { + "epoch": 0.16, + "grad_norm": 0.4544307900666547, + "learning_rate": 0.00019131232502286188, + "loss": 0.8508, + "step": 100 + }, + { + "epoch": 0.1616, + "grad_norm": 0.4125764591291438, + "learning_rate": 0.0001910997497957885, + "loss": 0.7378, + "step": 101 + }, + { + "epoch": 0.1632, + "grad_norm": 0.43655716374423936, + "learning_rate": 0.00019088472623446183, + "loss": 0.7818, + "step": 102 + }, + { + "epoch": 0.1648, + "grad_norm": 0.3955333708829416, + "learning_rate": 0.00019066726011770726, + "loss": 0.7435, + "step": 103 + }, + { + "epoch": 0.1664, + "grad_norm": 0.4356225922622882, + "learning_rate": 0.0001904473572899947, + "loss": 0.8667, + "step": 104 + }, + { + "epoch": 0.168, + "grad_norm": 0.4135943321838593, + "learning_rate": 0.00019022502366128135, + "loss": 0.7929, + "step": 105 + }, + { + "epoch": 0.1696, + "grad_norm": 0.44086428372922054, + "learning_rate": 0.00019000026520685302, + "loss": 0.8279, + "step": 106 + }, + { + "epoch": 0.1712, + "grad_norm": 0.47213515956851726, + "learning_rate": 0.0001897730879671634, + "loss": 0.9233, + "step": 107 + }, + { + "epoch": 0.1728, + "grad_norm": 0.43531197249522713, + "learning_rate": 0.00018954349804767184, + "loss": 0.8459, + "step": 108 + }, + { + "epoch": 0.1744, + "grad_norm": 0.5097410845471991, + "learning_rate": 0.00018931150161867916, + "loss": 0.882, + "step": 109 + }, + { + "epoch": 0.176, + "grad_norm": 0.46001640374981173, + "learning_rate": 0.00018907710491516199, + "loss": 0.8223, + "step": 110 + }, + { + "epoch": 0.1776, + "grad_norm": 0.4110670427542391, + "learning_rate": 0.0001888403142366049, + "loss": 0.7927, + "step": 111 + }, + { + "epoch": 0.1792, + "grad_norm": 0.42224965826201943, + "learning_rate": 0.00018860113594683148, + "loss": 0.734, + "step": 112 + }, + { + "epoch": 0.1808, + "grad_norm": 0.44770731481345016, + "learning_rate": 0.00018835957647383303, + "loss": 0.8318, + "step": 113 + }, + { + "epoch": 0.1824, + "grad_norm": 0.4718645249580385, + "learning_rate": 0.00018811564230959588, + "loss": 0.8198, + "step": 114 + }, + { + "epoch": 0.184, + "grad_norm": 0.43084154367305444, + "learning_rate": 0.00018786934000992688, + "loss": 0.7576, + "step": 115 + }, + { + "epoch": 0.1856, + "grad_norm": 0.41664958460149204, + "learning_rate": 0.00018762067619427746, + "loss": 0.8228, + "step": 116 + }, + { + "epoch": 0.1872, + "grad_norm": 0.5159539127338308, + "learning_rate": 0.00018736965754556528, + "loss": 0.8686, + "step": 117 + }, + { + "epoch": 0.1888, + "grad_norm": 0.44146899039468684, + "learning_rate": 0.00018711629080999504, + "loss": 0.7658, + "step": 118 + }, + { + "epoch": 0.1904, + "grad_norm": 0.42253703116326774, + "learning_rate": 0.00018686058279687698, + "loss": 0.7849, + "step": 119 + }, + { + "epoch": 0.192, + "grad_norm": 0.42141440612086756, + "learning_rate": 0.00018660254037844388, + "loss": 0.8632, + "step": 120 + }, + { + "epoch": 0.1936, + "grad_norm": 0.4173262660076212, + "learning_rate": 0.00018634217048966637, + "loss": 0.7567, + "step": 121 + }, + { + "epoch": 0.1952, + "grad_norm": 0.6396271573551264, + "learning_rate": 0.0001860794801280666, + "loss": 0.8004, + "step": 122 + }, + { + "epoch": 0.1968, + "grad_norm": 0.390100213608852, + "learning_rate": 0.0001858144763535302, + "loss": 0.745, + "step": 123 + }, + { + "epoch": 0.1984, + "grad_norm": 0.40109375411157056, + "learning_rate": 0.0001855471662881164, + "loss": 0.7762, + "step": 124 + }, + { + "epoch": 0.2, + "grad_norm": 0.4503019228338576, + "learning_rate": 0.00018527755711586678, + "loss": 0.8385, + "step": 125 + }, + { + "epoch": 0.2016, + "grad_norm": 0.45327963011421096, + "learning_rate": 0.00018500565608261214, + "loss": 0.8516, + "step": 126 + }, + { + "epoch": 0.2032, + "grad_norm": 0.4354314586768901, + "learning_rate": 0.00018473147049577774, + "loss": 0.7582, + "step": 127 + }, + { + "epoch": 0.2048, + "grad_norm": 0.4035901466888465, + "learning_rate": 0.00018445500772418697, + "loss": 0.7655, + "step": 128 + }, + { + "epoch": 0.2064, + "grad_norm": 0.48647013297209213, + "learning_rate": 0.00018417627519786315, + "loss": 0.8354, + "step": 129 + }, + { + "epoch": 0.208, + "grad_norm": 0.4250445114534265, + "learning_rate": 0.00018389528040783012, + "loss": 0.8026, + "step": 130 + }, + { + "epoch": 0.2096, + "grad_norm": 0.3835530554003175, + "learning_rate": 0.00018361203090591071, + "loss": 0.7443, + "step": 131 + }, + { + "epoch": 0.2112, + "grad_norm": 0.5045517877721084, + "learning_rate": 0.00018332653430452376, + "loss": 0.9247, + "step": 132 + }, + { + "epoch": 0.2128, + "grad_norm": 0.38472219697658894, + "learning_rate": 0.00018303879827647975, + "loss": 0.713, + "step": 133 + }, + { + "epoch": 0.2144, + "grad_norm": 0.3970787060759494, + "learning_rate": 0.00018274883055477436, + "loss": 0.826, + "step": 134 + }, + { + "epoch": 0.216, + "grad_norm": 0.42215465881861736, + "learning_rate": 0.00018245663893238075, + "loss": 0.832, + "step": 135 + }, + { + "epoch": 0.2176, + "grad_norm": 0.3829002961198215, + "learning_rate": 0.00018216223126204007, + "loss": 0.7654, + "step": 136 + }, + { + "epoch": 0.2192, + "grad_norm": 0.41525532207570953, + "learning_rate": 0.00018186561545605054, + "loss": 0.7949, + "step": 137 + }, + { + "epoch": 0.2208, + "grad_norm": 0.4596257126163404, + "learning_rate": 0.00018156679948605467, + "loss": 0.7851, + "step": 138 + }, + { + "epoch": 0.2224, + "grad_norm": 0.45575584823464793, + "learning_rate": 0.00018126579138282503, + "loss": 0.8233, + "step": 139 + }, + { + "epoch": 0.224, + "grad_norm": 0.4460233322200534, + "learning_rate": 0.0001809625992360485, + "loss": 0.8032, + "step": 140 + }, + { + "epoch": 0.2256, + "grad_norm": 0.4421197126830367, + "learning_rate": 0.00018065723119410884, + "loss": 0.8437, + "step": 141 + }, + { + "epoch": 0.2272, + "grad_norm": 0.3893196845859463, + "learning_rate": 0.00018034969546386757, + "loss": 0.7453, + "step": 142 + }, + { + "epoch": 0.2288, + "grad_norm": 0.4592060372904219, + "learning_rate": 0.0001800400003104436, + "loss": 0.8468, + "step": 143 + }, + { + "epoch": 0.2304, + "grad_norm": 0.4385397356658086, + "learning_rate": 0.00017972815405699103, + "loss": 0.8273, + "step": 144 + }, + { + "epoch": 0.232, + "grad_norm": 0.46710183295178787, + "learning_rate": 0.00017941416508447536, + "loss": 0.7739, + "step": 145 + }, + { + "epoch": 0.2336, + "grad_norm": 0.4176634754089223, + "learning_rate": 0.0001790980418314484, + "loss": 0.7437, + "step": 146 + }, + { + "epoch": 0.2352, + "grad_norm": 0.5003204720116483, + "learning_rate": 0.00017877979279382135, + "loss": 0.9364, + "step": 147 + }, + { + "epoch": 0.2368, + "grad_norm": 0.47028599712650937, + "learning_rate": 0.0001784594265246366, + "loss": 0.7849, + "step": 148 + }, + { + "epoch": 0.2384, + "grad_norm": 0.4147643110815873, + "learning_rate": 0.0001781369516338378, + "loss": 0.8051, + "step": 149 + }, + { + "epoch": 0.24, + "grad_norm": 0.4211866455889534, + "learning_rate": 0.00017781237678803847, + "loss": 0.7602, + "step": 150 + }, + { + "epoch": 0.2416, + "grad_norm": 0.42739548556178647, + "learning_rate": 0.000177485710710289, + "loss": 0.8049, + "step": 151 + }, + { + "epoch": 0.2432, + "grad_norm": 0.34268327409876764, + "learning_rate": 0.00017715696217984235, + "loss": 0.6774, + "step": 152 + }, + { + "epoch": 0.2448, + "grad_norm": 0.4228806711280117, + "learning_rate": 0.00017682614003191807, + "loss": 0.7527, + "step": 153 + }, + { + "epoch": 0.2464, + "grad_norm": 0.4334201330779741, + "learning_rate": 0.00017649325315746478, + "loss": 0.7945, + "step": 154 + }, + { + "epoch": 0.248, + "grad_norm": 0.5008077234098525, + "learning_rate": 0.0001761583105029213, + "loss": 0.8601, + "step": 155 + }, + { + "epoch": 0.2496, + "grad_norm": 0.44269101925884186, + "learning_rate": 0.00017582132106997616, + "loss": 0.8329, + "step": 156 + }, + { + "epoch": 0.2512, + "grad_norm": 0.46878674881645654, + "learning_rate": 0.00017548229391532572, + "loss": 0.8273, + "step": 157 + }, + { + "epoch": 0.2528, + "grad_norm": 0.44699352618503435, + "learning_rate": 0.00017514123815043074, + "loss": 0.811, + "step": 158 + }, + { + "epoch": 0.2544, + "grad_norm": 0.4350488356067898, + "learning_rate": 0.00017479816294127152, + "loss": 0.8307, + "step": 159 + }, + { + "epoch": 0.256, + "grad_norm": 0.3891456533806176, + "learning_rate": 0.0001744530775081015, + "loss": 0.7757, + "step": 160 + }, + { + "epoch": 0.2576, + "grad_norm": 0.4624821361475699, + "learning_rate": 0.0001741059911251997, + "loss": 0.9079, + "step": 161 + }, + { + "epoch": 0.2592, + "grad_norm": 0.42936710819614854, + "learning_rate": 0.000173756913120621, + "loss": 0.8596, + "step": 162 + }, + { + "epoch": 0.2608, + "grad_norm": 0.4414795774129634, + "learning_rate": 0.00017340585287594604, + "loss": 0.846, + "step": 163 + }, + { + "epoch": 0.2624, + "grad_norm": 0.4094980900461479, + "learning_rate": 0.0001730528198260285, + "loss": 0.8341, + "step": 164 + }, + { + "epoch": 0.264, + "grad_norm": 0.3935718900239617, + "learning_rate": 0.00017269782345874203, + "loss": 0.789, + "step": 165 + }, + { + "epoch": 0.2656, + "grad_norm": 0.42602983206125355, + "learning_rate": 0.00017234087331472497, + "loss": 0.8319, + "step": 166 + }, + { + "epoch": 0.2672, + "grad_norm": 0.4693988113168292, + "learning_rate": 0.00017198197898712404, + "loss": 0.8866, + "step": 167 + }, + { + "epoch": 0.2688, + "grad_norm": 0.4527066533195711, + "learning_rate": 0.00017162115012133643, + "loss": 0.8334, + "step": 168 + }, + { + "epoch": 0.2704, + "grad_norm": 0.4315756796958446, + "learning_rate": 0.00017125839641475072, + "loss": 0.8035, + "step": 169 + }, + { + "epoch": 0.272, + "grad_norm": 0.38470897900882584, + "learning_rate": 0.00017089372761648616, + "loss": 0.7471, + "step": 170 + }, + { + "epoch": 0.2736, + "grad_norm": 0.4138045470291073, + "learning_rate": 0.00017052715352713075, + "loss": 0.8217, + "step": 171 + }, + { + "epoch": 0.2752, + "grad_norm": 0.4490076288389667, + "learning_rate": 0.00017015868399847768, + "loss": 0.8114, + "step": 172 + }, + { + "epoch": 0.2768, + "grad_norm": 0.3831397215480176, + "learning_rate": 0.00016978832893326074, + "loss": 0.8037, + "step": 173 + }, + { + "epoch": 0.2784, + "grad_norm": 0.40656339965363736, + "learning_rate": 0.00016941609828488807, + "loss": 0.8041, + "step": 174 + }, + { + "epoch": 0.28, + "grad_norm": 0.4230254535846176, + "learning_rate": 0.0001690420020571747, + "loss": 0.8037, + "step": 175 + }, + { + "epoch": 0.2816, + "grad_norm": 0.4502358942022541, + "learning_rate": 0.0001686660503040737, + "loss": 0.7649, + "step": 176 + }, + { + "epoch": 0.2832, + "grad_norm": 0.4133916093154814, + "learning_rate": 0.00016828825312940592, + "loss": 0.7642, + "step": 177 + }, + { + "epoch": 0.2848, + "grad_norm": 0.3996960998479458, + "learning_rate": 0.0001679086206865886, + "loss": 0.8166, + "step": 178 + }, + { + "epoch": 0.2864, + "grad_norm": 0.44549862199892615, + "learning_rate": 0.00016752716317836229, + "loss": 0.8283, + "step": 179 + }, + { + "epoch": 0.288, + "grad_norm": 0.396495724613648, + "learning_rate": 0.0001671438908565167, + "loss": 0.8008, + "step": 180 + }, + { + "epoch": 0.2896, + "grad_norm": 0.39841238324541506, + "learning_rate": 0.00016675881402161536, + "loss": 0.7687, + "step": 181 + }, + { + "epoch": 0.2912, + "grad_norm": 0.39333262230313937, + "learning_rate": 0.0001663719430227186, + "loss": 0.7383, + "step": 182 + }, + { + "epoch": 0.2928, + "grad_norm": 0.40407206038335314, + "learning_rate": 0.00016598328825710533, + "loss": 0.774, + "step": 183 + }, + { + "epoch": 0.2944, + "grad_norm": 0.44971391392611154, + "learning_rate": 0.000165592860169994, + "loss": 0.7585, + "step": 184 + }, + { + "epoch": 0.296, + "grad_norm": 0.4461302770699972, + "learning_rate": 0.00016520066925426144, + "loss": 0.797, + "step": 185 + }, + { + "epoch": 0.2976, + "grad_norm": 0.37971869846791584, + "learning_rate": 0.0001648067260501611, + "loss": 0.7014, + "step": 186 + }, + { + "epoch": 0.2992, + "grad_norm": 0.41317053030930717, + "learning_rate": 0.0001644110411450398, + "loss": 0.7832, + "step": 187 + }, + { + "epoch": 0.3008, + "grad_norm": 0.4263929203142742, + "learning_rate": 0.00016401362517305296, + "loss": 0.7407, + "step": 188 + }, + { + "epoch": 0.3024, + "grad_norm": 0.39562339260632456, + "learning_rate": 0.00016361448881487914, + "loss": 0.7781, + "step": 189 + }, + { + "epoch": 0.304, + "grad_norm": 0.5013959628903849, + "learning_rate": 0.00016321364279743266, + "loss": 0.9173, + "step": 190 + }, + { + "epoch": 0.3056, + "grad_norm": 0.3942335902989073, + "learning_rate": 0.0001628110978935756, + "loss": 0.7969, + "step": 191 + }, + { + "epoch": 0.3072, + "grad_norm": 0.37459670529215294, + "learning_rate": 0.00016240686492182804, + "loss": 0.7294, + "step": 192 + }, + { + "epoch": 0.3088, + "grad_norm": 0.4210059470185516, + "learning_rate": 0.00016200095474607753, + "loss": 0.8715, + "step": 193 + }, + { + "epoch": 0.3104, + "grad_norm": 0.41642305626617326, + "learning_rate": 0.00016159337827528685, + "loss": 0.7845, + "step": 194 + }, + { + "epoch": 0.312, + "grad_norm": 0.39990830100754915, + "learning_rate": 0.0001611841464632011, + "loss": 0.7446, + "step": 195 + }, + { + "epoch": 0.3136, + "grad_norm": 0.4303864965308377, + "learning_rate": 0.0001607732703080532, + "loss": 0.7968, + "step": 196 + }, + { + "epoch": 0.3152, + "grad_norm": 0.39820698504610136, + "learning_rate": 0.00016036076085226814, + "loss": 0.7969, + "step": 197 + }, + { + "epoch": 0.3168, + "grad_norm": 0.42663518600017114, + "learning_rate": 0.0001599466291821666, + "loss": 0.7582, + "step": 198 + }, + { + "epoch": 0.3184, + "grad_norm": 0.41520429902798184, + "learning_rate": 0.0001595308864276666, + "loss": 0.8021, + "step": 199 + }, + { + "epoch": 0.32, + "grad_norm": 0.36322319811758375, + "learning_rate": 0.0001591135437619847, + "loss": 0.6502, + "step": 200 + }, + { + "epoch": 0.3216, + "grad_norm": 0.4163307297397158, + "learning_rate": 0.0001586946124013354, + "loss": 0.8172, + "step": 201 + }, + { + "epoch": 0.3232, + "grad_norm": 0.37562605218750017, + "learning_rate": 0.0001582741036046301, + "loss": 0.7547, + "step": 202 + }, + { + "epoch": 0.3248, + "grad_norm": 0.4230534393680675, + "learning_rate": 0.00015785202867317407, + "loss": 0.8363, + "step": 203 + }, + { + "epoch": 0.3264, + "grad_norm": 0.4276005604829845, + "learning_rate": 0.00015742839895036305, + "loss": 0.8644, + "step": 204 + }, + { + "epoch": 0.328, + "grad_norm": 0.3939117069959653, + "learning_rate": 0.00015700322582137827, + "loss": 0.7293, + "step": 205 + }, + { + "epoch": 0.3296, + "grad_norm": 0.3796784177603523, + "learning_rate": 0.0001565765207128805, + "loss": 0.6843, + "step": 206 + }, + { + "epoch": 0.3312, + "grad_norm": 0.38621167634662484, + "learning_rate": 0.0001561482950927029, + "loss": 0.8034, + "step": 207 + }, + { + "epoch": 0.3328, + "grad_norm": 0.44536606693319236, + "learning_rate": 0.00015571856046954285, + "loss": 0.8331, + "step": 208 + }, + { + "epoch": 0.3344, + "grad_norm": 0.3843524680432091, + "learning_rate": 0.00015528732839265272, + "loss": 0.6972, + "step": 209 + }, + { + "epoch": 0.336, + "grad_norm": 0.3776309616691966, + "learning_rate": 0.0001548546104515294, + "loss": 0.7132, + "step": 210 + }, + { + "epoch": 0.3376, + "grad_norm": 0.424706653591385, + "learning_rate": 0.00015442041827560274, + "loss": 0.7772, + "step": 211 + }, + { + "epoch": 0.3392, + "grad_norm": 0.4362257286558965, + "learning_rate": 0.00015398476353392323, + "loss": 0.8142, + "step": 212 + }, + { + "epoch": 0.3408, + "grad_norm": 0.4144045790572788, + "learning_rate": 0.00015354765793484834, + "loss": 0.7776, + "step": 213 + }, + { + "epoch": 0.3424, + "grad_norm": 0.4182797718014241, + "learning_rate": 0.00015310911322572753, + "loss": 0.8402, + "step": 214 + }, + { + "epoch": 0.344, + "grad_norm": 0.4042797647645218, + "learning_rate": 0.000152669141192587, + "loss": 0.7901, + "step": 215 + }, + { + "epoch": 0.3456, + "grad_norm": 0.39190545447558184, + "learning_rate": 0.00015222775365981273, + "loss": 0.7468, + "step": 216 + }, + { + "epoch": 0.3472, + "grad_norm": 0.4152515011993666, + "learning_rate": 0.00015178496248983254, + "loss": 0.8122, + "step": 217 + }, + { + "epoch": 0.3488, + "grad_norm": 0.44268429313744734, + "learning_rate": 0.00015134077958279765, + "loss": 0.8285, + "step": 218 + }, + { + "epoch": 0.3504, + "grad_norm": 0.3425723489745504, + "learning_rate": 0.00015089521687626243, + "loss": 0.7116, + "step": 219 + }, + { + "epoch": 0.352, + "grad_norm": 0.41744878887910053, + "learning_rate": 0.000150448286344864, + "loss": 0.7176, + "step": 220 + }, + { + "epoch": 0.3536, + "grad_norm": 0.46331206241115097, + "learning_rate": 0.00015000000000000001, + "loss": 0.8512, + "step": 221 + }, + { + "epoch": 0.3552, + "grad_norm": 0.3613265930834721, + "learning_rate": 0.00014955036988950618, + "loss": 0.7438, + "step": 222 + }, + { + "epoch": 0.3568, + "grad_norm": 0.38698802573239155, + "learning_rate": 0.00014909940809733222, + "loss": 0.7379, + "step": 223 + }, + { + "epoch": 0.3584, + "grad_norm": 0.386993427342769, + "learning_rate": 0.00014864712674321734, + "loss": 0.7188, + "step": 224 + }, + { + "epoch": 0.36, + "grad_norm": 0.446411015740921, + "learning_rate": 0.00014819353798236427, + "loss": 0.8473, + "step": 225 + }, + { + "epoch": 0.3616, + "grad_norm": 0.4024015048695294, + "learning_rate": 0.00014773865400511272, + "loss": 0.7998, + "step": 226 + }, + { + "epoch": 0.3632, + "grad_norm": 0.44200562601104565, + "learning_rate": 0.00014728248703661182, + "loss": 0.8643, + "step": 227 + }, + { + "epoch": 0.3648, + "grad_norm": 0.44979198811554505, + "learning_rate": 0.00014682504933649144, + "loss": 0.8779, + "step": 228 + }, + { + "epoch": 0.3664, + "grad_norm": 0.3683046668010399, + "learning_rate": 0.00014636635319853275, + "loss": 0.7407, + "step": 229 + }, + { + "epoch": 0.368, + "grad_norm": 0.3805387133659554, + "learning_rate": 0.00014590641095033787, + "loss": 0.735, + "step": 230 + }, + { + "epoch": 0.3696, + "grad_norm": 0.4335537863789879, + "learning_rate": 0.00014544523495299842, + "loss": 0.8147, + "step": 231 + }, + { + "epoch": 0.3712, + "grad_norm": 0.4515454431913857, + "learning_rate": 0.0001449828376007636, + "loss": 0.9138, + "step": 232 + }, + { + "epoch": 0.3728, + "grad_norm": 0.4071599274675956, + "learning_rate": 0.0001445192313207067, + "loss": 0.7195, + "step": 233 + }, + { + "epoch": 0.3744, + "grad_norm": 0.4448620554088051, + "learning_rate": 0.0001440544285723915, + "loss": 0.8262, + "step": 234 + }, + { + "epoch": 0.376, + "grad_norm": 0.40612385326640654, + "learning_rate": 0.00014358844184753712, + "loss": 0.7293, + "step": 235 + }, + { + "epoch": 0.3776, + "grad_norm": 0.4201371009731904, + "learning_rate": 0.00014312128366968243, + "loss": 0.7699, + "step": 236 + }, + { + "epoch": 0.3792, + "grad_norm": 0.3817394982005736, + "learning_rate": 0.00014265296659384956, + "loss": 0.7096, + "step": 237 + }, + { + "epoch": 0.3808, + "grad_norm": 0.3843192659592516, + "learning_rate": 0.00014218350320620624, + "loss": 0.6966, + "step": 238 + }, + { + "epoch": 0.3824, + "grad_norm": 0.354570749367614, + "learning_rate": 0.0001417129061237278, + "loss": 0.7078, + "step": 239 + }, + { + "epoch": 0.384, + "grad_norm": 0.4398836894186461, + "learning_rate": 0.00014124118799385796, + "loss": 0.8125, + "step": 240 + }, + { + "epoch": 0.3856, + "grad_norm": 0.4030011170075466, + "learning_rate": 0.00014076836149416887, + "loss": 0.7938, + "step": 241 + }, + { + "epoch": 0.3872, + "grad_norm": 0.43157156210319664, + "learning_rate": 0.0001402944393320206, + "loss": 0.7683, + "step": 242 + }, + { + "epoch": 0.3888, + "grad_norm": 0.5041538238832468, + "learning_rate": 0.00013981943424421932, + "loss": 0.8766, + "step": 243 + }, + { + "epoch": 0.3904, + "grad_norm": 0.3757364033441835, + "learning_rate": 0.00013934335899667527, + "loss": 0.7205, + "step": 244 + }, + { + "epoch": 0.392, + "grad_norm": 0.42081440661009456, + "learning_rate": 0.00013886622638405952, + "loss": 0.7702, + "step": 245 + }, + { + "epoch": 0.3936, + "grad_norm": 0.3894381312900156, + "learning_rate": 0.00013838804922946027, + "loss": 0.7194, + "step": 246 + }, + { + "epoch": 0.3952, + "grad_norm": 0.41722293075106853, + "learning_rate": 0.00013790884038403795, + "loss": 0.7095, + "step": 247 + }, + { + "epoch": 0.3968, + "grad_norm": 0.3775195605592594, + "learning_rate": 0.00013742861272668012, + "loss": 0.7463, + "step": 248 + }, + { + "epoch": 0.3984, + "grad_norm": 0.3781725749303525, + "learning_rate": 0.00013694737916365517, + "loss": 0.7386, + "step": 249 + }, + { + "epoch": 0.4, + "grad_norm": 0.3706384133547582, + "learning_rate": 0.00013646515262826552, + "loss": 0.718, + "step": 250 + }, + { + "epoch": 0.4016, + "grad_norm": 0.42984820535863394, + "learning_rate": 0.0001359819460805001, + "loss": 0.7752, + "step": 251 + }, + { + "epoch": 0.4032, + "grad_norm": 0.4350068963361006, + "learning_rate": 0.0001354977725066859, + "loss": 0.7418, + "step": 252 + }, + { + "epoch": 0.4048, + "grad_norm": 0.40270988988661344, + "learning_rate": 0.00013501264491913906, + "loss": 0.7238, + "step": 253 + }, + { + "epoch": 0.4064, + "grad_norm": 0.44146982128082984, + "learning_rate": 0.0001345265763558152, + "loss": 0.8166, + "step": 254 + }, + { + "epoch": 0.408, + "grad_norm": 0.4160425881027499, + "learning_rate": 0.00013403957987995882, + "loss": 0.8187, + "step": 255 + }, + { + "epoch": 0.4096, + "grad_norm": 0.3793738450125677, + "learning_rate": 0.0001335516685797525, + "loss": 0.7636, + "step": 256 + }, + { + "epoch": 0.4112, + "grad_norm": 0.39380344780082505, + "learning_rate": 0.00013306285556796495, + "loss": 0.771, + "step": 257 + }, + { + "epoch": 0.4128, + "grad_norm": 0.3999588927094165, + "learning_rate": 0.00013257315398159864, + "loss": 0.7631, + "step": 258 + }, + { + "epoch": 0.4144, + "grad_norm": 0.35011976155280244, + "learning_rate": 0.00013208257698153677, + "loss": 0.7405, + "step": 259 + }, + { + "epoch": 0.416, + "grad_norm": 0.3691023639048862, + "learning_rate": 0.00013159113775218964, + "loss": 0.7313, + "step": 260 + }, + { + "epoch": 0.4176, + "grad_norm": 0.4276610856998654, + "learning_rate": 0.00013109884950114007, + "loss": 0.789, + "step": 261 + }, + { + "epoch": 0.4192, + "grad_norm": 0.4015299875013705, + "learning_rate": 0.00013060572545878875, + "loss": 0.7685, + "step": 262 + }, + { + "epoch": 0.4208, + "grad_norm": 0.43478748123059957, + "learning_rate": 0.00013011177887799845, + "loss": 0.7292, + "step": 263 + }, + { + "epoch": 0.4224, + "grad_norm": 0.3647866649374095, + "learning_rate": 0.00012961702303373795, + "loss": 0.6115, + "step": 264 + }, + { + "epoch": 0.424, + "grad_norm": 0.3706666387908333, + "learning_rate": 0.00012912147122272523, + "loss": 0.7263, + "step": 265 + }, + { + "epoch": 0.4256, + "grad_norm": 0.4082112506331456, + "learning_rate": 0.00012862513676307008, + "loss": 0.7279, + "step": 266 + }, + { + "epoch": 0.4272, + "grad_norm": 0.3567764932832936, + "learning_rate": 0.00012812803299391628, + "loss": 0.7078, + "step": 267 + }, + { + "epoch": 0.4288, + "grad_norm": 0.32385613327994045, + "learning_rate": 0.00012763017327508305, + "loss": 0.6632, + "step": 268 + }, + { + "epoch": 0.4304, + "grad_norm": 0.41752017211552456, + "learning_rate": 0.0001271315709867059, + "loss": 0.7704, + "step": 269 + }, + { + "epoch": 0.432, + "grad_norm": 0.36484783789209513, + "learning_rate": 0.00012663223952887723, + "loss": 0.7171, + "step": 270 + }, + { + "epoch": 0.4336, + "grad_norm": 0.3800517948110794, + "learning_rate": 0.00012613219232128608, + "loss": 0.6994, + "step": 271 + }, + { + "epoch": 0.4352, + "grad_norm": 0.38449086574583846, + "learning_rate": 0.00012563144280285741, + "loss": 0.7564, + "step": 272 + }, + { + "epoch": 0.4368, + "grad_norm": 0.4375588496174668, + "learning_rate": 0.00012513000443139112, + "loss": 0.7946, + "step": 273 + }, + { + "epoch": 0.4384, + "grad_norm": 0.4170392613463579, + "learning_rate": 0.00012462789068320017, + "loss": 0.7976, + "step": 274 + }, + { + "epoch": 0.44, + "grad_norm": 0.4013437633464495, + "learning_rate": 0.00012412511505274844, + "loss": 0.7602, + "step": 275 + }, + { + "epoch": 0.4416, + "grad_norm": 0.40586730733844506, + "learning_rate": 0.00012362169105228826, + "loss": 0.7139, + "step": 276 + }, + { + "epoch": 0.4432, + "grad_norm": 0.38522148446881194, + "learning_rate": 0.000123117632211497, + "loss": 0.706, + "step": 277 + }, + { + "epoch": 0.4448, + "grad_norm": 0.42134563172884165, + "learning_rate": 0.00012261295207711346, + "loss": 0.7899, + "step": 278 + }, + { + "epoch": 0.4464, + "grad_norm": 0.46908264078789513, + "learning_rate": 0.0001221076642125742, + "loss": 0.7233, + "step": 279 + }, + { + "epoch": 0.448, + "grad_norm": 0.3456513199532403, + "learning_rate": 0.00012160178219764837, + "loss": 0.7475, + "step": 280 + }, + { + "epoch": 0.4496, + "grad_norm": 0.42382505862487274, + "learning_rate": 0.00012109531962807332, + "loss": 0.7127, + "step": 281 + }, + { + "epoch": 0.4512, + "grad_norm": 0.3848614543819859, + "learning_rate": 0.00012058829011518896, + "loss": 0.7472, + "step": 282 + }, + { + "epoch": 0.4528, + "grad_norm": 0.3754873003535694, + "learning_rate": 0.00012008070728557186, + "loss": 0.7421, + "step": 283 + }, + { + "epoch": 0.4544, + "grad_norm": 0.35672849974354826, + "learning_rate": 0.00011957258478066931, + "loss": 0.6879, + "step": 284 + }, + { + "epoch": 0.456, + "grad_norm": 0.393706100119924, + "learning_rate": 0.00011906393625643244, + "loss": 0.7442, + "step": 285 + }, + { + "epoch": 0.4576, + "grad_norm": 0.5132659461901151, + "learning_rate": 0.00011855477538294935, + "loss": 0.8223, + "step": 286 + }, + { + "epoch": 0.4592, + "grad_norm": 0.42096085231862373, + "learning_rate": 0.00011804511584407763, + "loss": 0.7911, + "step": 287 + }, + { + "epoch": 0.4608, + "grad_norm": 0.41160139009461555, + "learning_rate": 0.00011753497133707679, + "loss": 0.7602, + "step": 288 + }, + { + "epoch": 0.4624, + "grad_norm": 0.40370372309885755, + "learning_rate": 0.00011702435557223987, + "loss": 0.7522, + "step": 289 + }, + { + "epoch": 0.464, + "grad_norm": 0.3797479758946454, + "learning_rate": 0.00011651328227252517, + "loss": 0.7441, + "step": 290 + }, + { + "epoch": 0.4656, + "grad_norm": 0.4089049239884153, + "learning_rate": 0.00011600176517318741, + "loss": 0.7674, + "step": 291 + }, + { + "epoch": 0.4672, + "grad_norm": 0.3699422967708885, + "learning_rate": 0.00011548981802140848, + "loss": 0.7913, + "step": 292 + }, + { + "epoch": 0.4688, + "grad_norm": 0.4172489783980525, + "learning_rate": 0.00011497745457592816, + "loss": 0.7504, + "step": 293 + }, + { + "epoch": 0.4704, + "grad_norm": 0.3941674909826128, + "learning_rate": 0.00011446468860667421, + "loss": 0.7512, + "step": 294 + }, + { + "epoch": 0.472, + "grad_norm": 0.3717788781996277, + "learning_rate": 0.00011395153389439233, + "loss": 0.7018, + "step": 295 + }, + { + "epoch": 0.4736, + "grad_norm": 0.36612187327758705, + "learning_rate": 0.00011343800423027582, + "loss": 0.7612, + "step": 296 + }, + { + "epoch": 0.4752, + "grad_norm": 0.4240098441909349, + "learning_rate": 0.0001129241134155949, + "loss": 0.7752, + "step": 297 + }, + { + "epoch": 0.4768, + "grad_norm": 0.4052278391226007, + "learning_rate": 0.00011240987526132594, + "loss": 0.7494, + "step": 298 + }, + { + "epoch": 0.4784, + "grad_norm": 0.38791050098976654, + "learning_rate": 0.00011189530358778005, + "loss": 0.7352, + "step": 299 + }, + { + "epoch": 0.48, + "grad_norm": 0.4193408021718286, + "learning_rate": 0.00011138041222423177, + "loss": 0.8262, + "step": 300 + }, + { + "epoch": 0.4816, + "grad_norm": 0.3436289543752552, + "learning_rate": 0.00011086521500854745, + "loss": 0.6915, + "step": 301 + }, + { + "epoch": 0.4832, + "grad_norm": 0.39333500810550576, + "learning_rate": 0.00011034972578681338, + "loss": 0.7322, + "step": 302 + }, + { + "epoch": 0.4848, + "grad_norm": 0.3879541659537608, + "learning_rate": 0.00010983395841296348, + "loss": 0.7735, + "step": 303 + }, + { + "epoch": 0.4864, + "grad_norm": 0.3992620787556518, + "learning_rate": 0.00010931792674840718, + "loss": 0.7088, + "step": 304 + }, + { + "epoch": 0.488, + "grad_norm": 0.45595291894570794, + "learning_rate": 0.00010880164466165674, + "loss": 0.8255, + "step": 305 + }, + { + "epoch": 0.4896, + "grad_norm": 0.39090092696923756, + "learning_rate": 0.00010828512602795462, + "loss": 0.7784, + "step": 306 + }, + { + "epoch": 0.4912, + "grad_norm": 0.39554473367621173, + "learning_rate": 0.00010776838472890065, + "loss": 0.7205, + "step": 307 + }, + { + "epoch": 0.4928, + "grad_norm": 0.37634005674867854, + "learning_rate": 0.00010725143465207867, + "loss": 0.7633, + "step": 308 + }, + { + "epoch": 0.4944, + "grad_norm": 0.3923165222100373, + "learning_rate": 0.00010673428969068364, + "loss": 0.7269, + "step": 309 + }, + { + "epoch": 0.496, + "grad_norm": 0.3802239727476873, + "learning_rate": 0.00010621696374314807, + "loss": 0.724, + "step": 310 + }, + { + "epoch": 0.4976, + "grad_norm": 0.3887117802908206, + "learning_rate": 0.00010569947071276847, + "loss": 0.779, + "step": 311 + }, + { + "epoch": 0.4992, + "grad_norm": 0.38929208447851044, + "learning_rate": 0.00010518182450733186, + "loss": 0.7598, + "step": 312 + }, + { + "epoch": 0.5008, + "grad_norm": 0.4392462816587192, + "learning_rate": 0.00010466403903874176, + "loss": 0.8084, + "step": 313 + }, + { + "epoch": 0.5024, + "grad_norm": 0.4233610027255395, + "learning_rate": 0.00010414612822264455, + "loss": 0.8081, + "step": 314 + }, + { + "epoch": 0.504, + "grad_norm": 0.35551803094636486, + "learning_rate": 0.00010362810597805526, + "loss": 0.6632, + "step": 315 + }, + { + "epoch": 0.5056, + "grad_norm": 0.43762388215444115, + "learning_rate": 0.0001031099862269837, + "loss": 0.8377, + "step": 316 + }, + { + "epoch": 0.5072, + "grad_norm": 0.39059086643467694, + "learning_rate": 0.00010259178289406011, + "loss": 0.717, + "step": 317 + }, + { + "epoch": 0.5088, + "grad_norm": 0.48759849975439196, + "learning_rate": 0.00010207350990616107, + "loss": 0.7588, + "step": 318 + }, + { + "epoch": 0.5104, + "grad_norm": 0.36638834844002566, + "learning_rate": 0.0001015551811920351, + "loss": 0.7019, + "step": 319 + }, + { + "epoch": 0.512, + "grad_norm": 0.40404816221652523, + "learning_rate": 0.00010103681068192845, + "loss": 0.7781, + "step": 320 + }, + { + "epoch": 0.5136, + "grad_norm": 0.4178781138104464, + "learning_rate": 0.00010051841230721065, + "loss": 0.754, + "step": 321 + }, + { + "epoch": 0.5152, + "grad_norm": 0.4067391473152315, + "learning_rate": 0.0001, + "loss": 0.8082, + "step": 322 + }, + { + "epoch": 0.5168, + "grad_norm": 0.3527104968566196, + "learning_rate": 9.948158769278939e-05, + "loss": 0.6608, + "step": 323 + }, + { + "epoch": 0.5184, + "grad_norm": 0.3968723263442942, + "learning_rate": 9.896318931807155e-05, + "loss": 0.719, + "step": 324 + }, + { + "epoch": 0.52, + "grad_norm": 0.39597297154816335, + "learning_rate": 9.844481880796491e-05, + "loss": 0.7231, + "step": 325 + }, + { + "epoch": 0.5216, + "grad_norm": 0.3768104530544193, + "learning_rate": 9.792649009383899e-05, + "loss": 0.6931, + "step": 326 + }, + { + "epoch": 0.5232, + "grad_norm": 0.49124318766056924, + "learning_rate": 9.740821710593989e-05, + "loss": 0.8417, + "step": 327 + }, + { + "epoch": 0.5248, + "grad_norm": 0.4008540189956664, + "learning_rate": 9.689001377301633e-05, + "loss": 0.7466, + "step": 328 + }, + { + "epoch": 0.5264, + "grad_norm": 0.38033117101983577, + "learning_rate": 9.637189402194476e-05, + "loss": 0.6648, + "step": 329 + }, + { + "epoch": 0.528, + "grad_norm": 0.42433433907030377, + "learning_rate": 9.585387177735547e-05, + "loss": 0.6843, + "step": 330 + }, + { + "epoch": 0.5296, + "grad_norm": 0.39575325823787383, + "learning_rate": 9.533596096125825e-05, + "loss": 0.7249, + "step": 331 + }, + { + "epoch": 0.5312, + "grad_norm": 0.4350701258943475, + "learning_rate": 9.481817549266817e-05, + "loss": 0.7909, + "step": 332 + }, + { + "epoch": 0.5328, + "grad_norm": 0.3969116712819791, + "learning_rate": 9.430052928723153e-05, + "loss": 0.8239, + "step": 333 + }, + { + "epoch": 0.5344, + "grad_norm": 0.3207117103886632, + "learning_rate": 9.378303625685195e-05, + "loss": 0.6272, + "step": 334 + }, + { + "epoch": 0.536, + "grad_norm": 0.4350506568108911, + "learning_rate": 9.326571030931637e-05, + "loss": 0.7648, + "step": 335 + }, + { + "epoch": 0.5376, + "grad_norm": 0.3727988120773944, + "learning_rate": 9.274856534792138e-05, + "loss": 0.7107, + "step": 336 + }, + { + "epoch": 0.5392, + "grad_norm": 0.3684460888235613, + "learning_rate": 9.223161527109937e-05, + "loss": 0.6512, + "step": 337 + }, + { + "epoch": 0.5408, + "grad_norm": 0.3782634227770135, + "learning_rate": 9.171487397204539e-05, + "loss": 0.6954, + "step": 338 + }, + { + "epoch": 0.5424, + "grad_norm": 0.41514391282200863, + "learning_rate": 9.119835533834331e-05, + "loss": 0.7651, + "step": 339 + }, + { + "epoch": 0.544, + "grad_norm": 0.401028442423211, + "learning_rate": 9.068207325159284e-05, + "loss": 0.6985, + "step": 340 + }, + { + "epoch": 0.5456, + "grad_norm": 0.3480899665203147, + "learning_rate": 9.016604158703654e-05, + "loss": 0.695, + "step": 341 + }, + { + "epoch": 0.5472, + "grad_norm": 0.41568335772628967, + "learning_rate": 8.965027421318665e-05, + "loss": 0.7377, + "step": 342 + }, + { + "epoch": 0.5488, + "grad_norm": 0.39347165513947524, + "learning_rate": 8.913478499145254e-05, + "loss": 0.7183, + "step": 343 + }, + { + "epoch": 0.5504, + "grad_norm": 0.41172276913242867, + "learning_rate": 8.861958777576827e-05, + "loss": 0.7699, + "step": 344 + }, + { + "epoch": 0.552, + "grad_norm": 0.3618416233059816, + "learning_rate": 8.810469641222001e-05, + "loss": 0.6934, + "step": 345 + }, + { + "epoch": 0.5536, + "grad_norm": 0.411553987932596, + "learning_rate": 8.759012473867407e-05, + "loss": 0.7428, + "step": 346 + }, + { + "epoch": 0.5552, + "grad_norm": 0.3698943895704956, + "learning_rate": 8.707588658440511e-05, + "loss": 0.7472, + "step": 347 + }, + { + "epoch": 0.5568, + "grad_norm": 0.43346183874671734, + "learning_rate": 8.656199576972423e-05, + "loss": 0.7254, + "step": 348 + }, + { + "epoch": 0.5584, + "grad_norm": 0.3535648717952184, + "learning_rate": 8.604846610560771e-05, + "loss": 0.6434, + "step": 349 + }, + { + "epoch": 0.56, + "grad_norm": 0.38838719596815624, + "learning_rate": 8.553531139332582e-05, + "loss": 0.7129, + "step": 350 + }, + { + "epoch": 0.5616, + "grad_norm": 0.3424647163807555, + "learning_rate": 8.502254542407186e-05, + "loss": 0.7146, + "step": 351 + }, + { + "epoch": 0.5632, + "grad_norm": 0.442661686489916, + "learning_rate": 8.451018197859153e-05, + "loss": 0.7515, + "step": 352 + }, + { + "epoch": 0.5648, + "grad_norm": 0.42019776337895187, + "learning_rate": 8.399823482681262e-05, + "loss": 0.8075, + "step": 353 + }, + { + "epoch": 0.5664, + "grad_norm": 0.4379255306731001, + "learning_rate": 8.348671772747487e-05, + "loss": 0.7847, + "step": 354 + }, + { + "epoch": 0.568, + "grad_norm": 0.42893827182216726, + "learning_rate": 8.297564442776014e-05, + "loss": 0.7439, + "step": 355 + }, + { + "epoch": 0.5696, + "grad_norm": 0.3849300991614672, + "learning_rate": 8.246502866292324e-05, + "loss": 0.7349, + "step": 356 + }, + { + "epoch": 0.5712, + "grad_norm": 0.39934497863521856, + "learning_rate": 8.195488415592238e-05, + "loss": 0.7044, + "step": 357 + }, + { + "epoch": 0.5728, + "grad_norm": 0.3910138678129991, + "learning_rate": 8.144522461705067e-05, + "loss": 0.7603, + "step": 358 + }, + { + "epoch": 0.5744, + "grad_norm": 0.37063618096573925, + "learning_rate": 8.093606374356759e-05, + "loss": 0.708, + "step": 359 + }, + { + "epoch": 0.576, + "grad_norm": 0.3797548351310327, + "learning_rate": 8.042741521933071e-05, + "loss": 0.7516, + "step": 360 + }, + { + "epoch": 0.5776, + "grad_norm": 0.367283331582471, + "learning_rate": 7.991929271442817e-05, + "loss": 0.7282, + "step": 361 + }, + { + "epoch": 0.5792, + "grad_norm": 0.3693911710963683, + "learning_rate": 7.941170988481108e-05, + "loss": 0.742, + "step": 362 + }, + { + "epoch": 0.5808, + "grad_norm": 0.37117296819053347, + "learning_rate": 7.89046803719267e-05, + "loss": 0.7307, + "step": 363 + }, + { + "epoch": 0.5824, + "grad_norm": 0.4178706979946997, + "learning_rate": 7.839821780235168e-05, + "loss": 0.8086, + "step": 364 + }, + { + "epoch": 0.584, + "grad_norm": 0.3341662767235502, + "learning_rate": 7.789233578742582e-05, + "loss": 0.6586, + "step": 365 + }, + { + "epoch": 0.5856, + "grad_norm": 0.39130683157286156, + "learning_rate": 7.738704792288655e-05, + "loss": 0.722, + "step": 366 + }, + { + "epoch": 0.5872, + "grad_norm": 0.37536743917135806, + "learning_rate": 7.688236778850306e-05, + "loss": 0.702, + "step": 367 + }, + { + "epoch": 0.5888, + "grad_norm": 0.3894262722031057, + "learning_rate": 7.637830894771175e-05, + "loss": 0.7387, + "step": 368 + }, + { + "epoch": 0.5904, + "grad_norm": 0.41955287473679065, + "learning_rate": 7.587488494725157e-05, + "loss": 0.7357, + "step": 369 + }, + { + "epoch": 0.592, + "grad_norm": 0.4312111630989293, + "learning_rate": 7.537210931679987e-05, + "loss": 0.8228, + "step": 370 + }, + { + "epoch": 0.5936, + "grad_norm": 0.39716106325643424, + "learning_rate": 7.48699955686089e-05, + "loss": 0.8019, + "step": 371 + }, + { + "epoch": 0.5952, + "grad_norm": 0.4178377909508333, + "learning_rate": 7.43685571971426e-05, + "loss": 0.7513, + "step": 372 + }, + { + "epoch": 0.5968, + "grad_norm": 0.44004532782831524, + "learning_rate": 7.386780767871397e-05, + "loss": 0.7758, + "step": 373 + }, + { + "epoch": 0.5984, + "grad_norm": 0.4076132922530294, + "learning_rate": 7.336776047112276e-05, + "loss": 0.7571, + "step": 374 + }, + { + "epoch": 0.6, + "grad_norm": 0.4384143849615539, + "learning_rate": 7.286842901329412e-05, + "loss": 0.7597, + "step": 375 + }, + { + "epoch": 0.6016, + "grad_norm": 0.3824858096412295, + "learning_rate": 7.236982672491698e-05, + "loss": 0.7163, + "step": 376 + }, + { + "epoch": 0.6032, + "grad_norm": 0.41274801929946386, + "learning_rate": 7.187196700608373e-05, + "loss": 0.8296, + "step": 377 + }, + { + "epoch": 0.6048, + "grad_norm": 0.3778452359098377, + "learning_rate": 7.137486323692995e-05, + "loss": 0.7169, + "step": 378 + }, + { + "epoch": 0.6064, + "grad_norm": 0.83457178394802, + "learning_rate": 7.087852877727481e-05, + "loss": 0.7542, + "step": 379 + }, + { + "epoch": 0.608, + "grad_norm": 0.4051074171497204, + "learning_rate": 7.038297696626206e-05, + "loss": 0.6941, + "step": 380 + }, + { + "epoch": 0.6096, + "grad_norm": 0.40839742549491664, + "learning_rate": 6.988822112200156e-05, + "loss": 0.7335, + "step": 381 + }, + { + "epoch": 0.6112, + "grad_norm": 0.3599698010426839, + "learning_rate": 6.939427454121128e-05, + "loss": 0.6727, + "step": 382 + }, + { + "epoch": 0.6128, + "grad_norm": 0.4044892607723987, + "learning_rate": 6.890115049885994e-05, + "loss": 0.7579, + "step": 383 + }, + { + "epoch": 0.6144, + "grad_norm": 0.4136646261049131, + "learning_rate": 6.84088622478104e-05, + "loss": 0.7691, + "step": 384 + }, + { + "epoch": 0.616, + "grad_norm": 0.4023757957141722, + "learning_rate": 6.791742301846326e-05, + "loss": 0.7302, + "step": 385 + }, + { + "epoch": 0.6176, + "grad_norm": 0.3745309167149619, + "learning_rate": 6.742684601840141e-05, + "loss": 0.721, + "step": 386 + }, + { + "epoch": 0.6192, + "grad_norm": 0.4609841289633602, + "learning_rate": 6.693714443203507e-05, + "loss": 0.7652, + "step": 387 + }, + { + "epoch": 0.6208, + "grad_norm": 0.43918955408635996, + "learning_rate": 6.644833142024751e-05, + "loss": 0.7718, + "step": 388 + }, + { + "epoch": 0.6224, + "grad_norm": 0.38943471053372797, + "learning_rate": 6.59604201200412e-05, + "loss": 0.7419, + "step": 389 + }, + { + "epoch": 0.624, + "grad_norm": 0.3641328317006478, + "learning_rate": 6.547342364418481e-05, + "loss": 0.6782, + "step": 390 + }, + { + "epoch": 0.6256, + "grad_norm": 0.39054082564457465, + "learning_rate": 6.498735508086093e-05, + "loss": 0.6946, + "step": 391 + }, + { + "epoch": 0.6272, + "grad_norm": 0.4163721453017164, + "learning_rate": 6.450222749331414e-05, + "loss": 0.7334, + "step": 392 + }, + { + "epoch": 0.6288, + "grad_norm": 0.4040283264722037, + "learning_rate": 6.40180539194999e-05, + "loss": 0.746, + "step": 393 + }, + { + "epoch": 0.6304, + "grad_norm": 0.3899654799773014, + "learning_rate": 6.35348473717345e-05, + "loss": 0.7533, + "step": 394 + }, + { + "epoch": 0.632, + "grad_norm": 0.3862854476843213, + "learning_rate": 6.305262083634488e-05, + "loss": 0.6908, + "step": 395 + }, + { + "epoch": 0.6336, + "grad_norm": 0.41090067469848174, + "learning_rate": 6.25713872733199e-05, + "loss": 0.8084, + "step": 396 + }, + { + "epoch": 0.6352, + "grad_norm": 0.3903151474186511, + "learning_rate": 6.209115961596208e-05, + "loss": 0.6616, + "step": 397 + }, + { + "epoch": 0.6368, + "grad_norm": 0.3408539252470795, + "learning_rate": 6.161195077053976e-05, + "loss": 0.6419, + "step": 398 + }, + { + "epoch": 0.6384, + "grad_norm": 0.4301941156068001, + "learning_rate": 6.113377361594049e-05, + "loss": 0.6987, + "step": 399 + }, + { + "epoch": 0.64, + "grad_norm": 0.41468517833589413, + "learning_rate": 6.065664100332478e-05, + "loss": 0.8409, + "step": 400 + }, + { + "epoch": 0.6416, + "grad_norm": 0.3641103522991886, + "learning_rate": 6.018056575578075e-05, + "loss": 0.7095, + "step": 401 + }, + { + "epoch": 0.6432, + "grad_norm": 0.37981059039886306, + "learning_rate": 5.970556066797941e-05, + "loss": 0.7086, + "step": 402 + }, + { + "epoch": 0.6448, + "grad_norm": 0.361243057344294, + "learning_rate": 5.923163850583113e-05, + "loss": 0.6633, + "step": 403 + }, + { + "epoch": 0.6464, + "grad_norm": 0.388195254180337, + "learning_rate": 5.875881200614207e-05, + "loss": 0.7363, + "step": 404 + }, + { + "epoch": 0.648, + "grad_norm": 0.3905619249093549, + "learning_rate": 5.828709387627218e-05, + "loss": 0.6728, + "step": 405 + }, + { + "epoch": 0.6496, + "grad_norm": 0.3932491367342553, + "learning_rate": 5.781649679379378e-05, + "loss": 0.6946, + "step": 406 + }, + { + "epoch": 0.6512, + "grad_norm": 0.350177793935555, + "learning_rate": 5.73470334061505e-05, + "loss": 0.69, + "step": 407 + }, + { + "epoch": 0.6528, + "grad_norm": 0.3618476324694659, + "learning_rate": 5.687871633031754e-05, + "loss": 0.695, + "step": 408 + }, + { + "epoch": 0.6544, + "grad_norm": 0.36566105958982537, + "learning_rate": 5.6411558152462894e-05, + "loss": 0.6729, + "step": 409 + }, + { + "epoch": 0.656, + "grad_norm": 0.37240137101971765, + "learning_rate": 5.5945571427608526e-05, + "loss": 0.7005, + "step": 410 + }, + { + "epoch": 0.6576, + "grad_norm": 0.365534114600444, + "learning_rate": 5.54807686792933e-05, + "loss": 0.6521, + "step": 411 + }, + { + "epoch": 0.6592, + "grad_norm": 0.4278148742820072, + "learning_rate": 5.501716239923642e-05, + "loss": 0.752, + "step": 412 + }, + { + "epoch": 0.6608, + "grad_norm": 0.3446300250209333, + "learning_rate": 5.4554765047001613e-05, + "loss": 0.6819, + "step": 413 + }, + { + "epoch": 0.6624, + "grad_norm": 0.4564423647918072, + "learning_rate": 5.4093589049662175e-05, + "loss": 0.7751, + "step": 414 + }, + { + "epoch": 0.664, + "grad_norm": 0.3482488541574724, + "learning_rate": 5.363364680146725e-05, + "loss": 0.6676, + "step": 415 + }, + { + "epoch": 0.6656, + "grad_norm": 0.429744562251442, + "learning_rate": 5.31749506635086e-05, + "loss": 0.7065, + "step": 416 + }, + { + "epoch": 0.6672, + "grad_norm": 0.3555226566302722, + "learning_rate": 5.271751296338823e-05, + "loss": 0.7224, + "step": 417 + }, + { + "epoch": 0.6688, + "grad_norm": 0.40841608187723355, + "learning_rate": 5.226134599488728e-05, + "loss": 0.714, + "step": 418 + }, + { + "epoch": 0.6704, + "grad_norm": 0.38425461037068026, + "learning_rate": 5.180646201763577e-05, + "loss": 0.7388, + "step": 419 + }, + { + "epoch": 0.672, + "grad_norm": 0.3250019884787973, + "learning_rate": 5.135287325678271e-05, + "loss": 0.6108, + "step": 420 + }, + { + "epoch": 0.6736, + "grad_norm": 0.36666384292032755, + "learning_rate": 5.090059190266779e-05, + "loss": 0.7037, + "step": 421 + }, + { + "epoch": 0.6752, + "grad_norm": 0.34705434799655177, + "learning_rate": 5.0449630110493836e-05, + "loss": 0.6675, + "step": 422 + }, + { + "epoch": 0.6768, + "grad_norm": 0.39998728379268955, + "learning_rate": 5.000000000000002e-05, + "loss": 0.7274, + "step": 423 + }, + { + "epoch": 0.6784, + "grad_norm": 0.36905428382946914, + "learning_rate": 4.955171365513603e-05, + "loss": 0.7448, + "step": 424 + }, + { + "epoch": 0.68, + "grad_norm": 0.3369505694390931, + "learning_rate": 4.9104783123737566e-05, + "loss": 0.6504, + "step": 425 + }, + { + "epoch": 0.6816, + "grad_norm": 0.38658903930111854, + "learning_rate": 4.865922041720239e-05, + "loss": 0.6545, + "step": 426 + }, + { + "epoch": 0.6832, + "grad_norm": 0.38341682365351915, + "learning_rate": 4.821503751016746e-05, + "loss": 0.7368, + "step": 427 + }, + { + "epoch": 0.6848, + "grad_norm": 0.37245313752497233, + "learning_rate": 4.777224634018732e-05, + "loss": 0.7203, + "step": 428 + }, + { + "epoch": 0.6864, + "grad_norm": 0.3925999422669356, + "learning_rate": 4.733085880741301e-05, + "loss": 0.744, + "step": 429 + }, + { + "epoch": 0.688, + "grad_norm": 0.4091867154975362, + "learning_rate": 4.689088677427249e-05, + "loss": 0.7168, + "step": 430 + }, + { + "epoch": 0.6896, + "grad_norm": 0.43459939651537244, + "learning_rate": 4.645234206515171e-05, + "loss": 0.7363, + "step": 431 + }, + { + "epoch": 0.6912, + "grad_norm": 0.37977781506193403, + "learning_rate": 4.6015236466076747e-05, + "loss": 0.7212, + "step": 432 + }, + { + "epoch": 0.6928, + "grad_norm": 0.42916604552824417, + "learning_rate": 4.5579581724397255e-05, + "loss": 0.7584, + "step": 433 + }, + { + "epoch": 0.6944, + "grad_norm": 0.3811683876560328, + "learning_rate": 4.514538954847064e-05, + "loss": 0.6965, + "step": 434 + }, + { + "epoch": 0.696, + "grad_norm": 0.36854117902095374, + "learning_rate": 4.471267160734731e-05, + "loss": 0.7465, + "step": 435 + }, + { + "epoch": 0.6976, + "grad_norm": 0.3486796533755056, + "learning_rate": 4.428143953045717e-05, + "loss": 0.6606, + "step": 436 + }, + { + "epoch": 0.6992, + "grad_norm": 0.4223137021437662, + "learning_rate": 4.385170490729712e-05, + "loss": 0.7942, + "step": 437 + }, + { + "epoch": 0.7008, + "grad_norm": 0.41888004844002535, + "learning_rate": 4.342347928711953e-05, + "loss": 0.7344, + "step": 438 + }, + { + "epoch": 0.7024, + "grad_norm": 0.35508306503610504, + "learning_rate": 4.2996774178621736e-05, + "loss": 0.6782, + "step": 439 + }, + { + "epoch": 0.704, + "grad_norm": 0.3660112193162608, + "learning_rate": 4.257160104963696e-05, + "loss": 0.7241, + "step": 440 + }, + { + "epoch": 0.7056, + "grad_norm": 0.4036480896953565, + "learning_rate": 4.2147971326825966e-05, + "loss": 0.7688, + "step": 441 + }, + { + "epoch": 0.7072, + "grad_norm": 0.407694512647162, + "learning_rate": 4.172589639536991e-05, + "loss": 0.7412, + "step": 442 + }, + { + "epoch": 0.7088, + "grad_norm": 0.43662370557404956, + "learning_rate": 4.130538759866457e-05, + "loss": 0.6908, + "step": 443 + }, + { + "epoch": 0.7104, + "grad_norm": 0.3634891497837515, + "learning_rate": 4.088645623801534e-05, + "loss": 0.7257, + "step": 444 + }, + { + "epoch": 0.712, + "grad_norm": 0.40664977533567964, + "learning_rate": 4.046911357233343e-05, + "loss": 0.7068, + "step": 445 + }, + { + "epoch": 0.7136, + "grad_norm": 0.39365246871243736, + "learning_rate": 4.00533708178334e-05, + "loss": 0.7766, + "step": 446 + }, + { + "epoch": 0.7152, + "grad_norm": 0.4170646598756153, + "learning_rate": 3.963923914773187e-05, + "loss": 0.7237, + "step": 447 + }, + { + "epoch": 0.7168, + "grad_norm": 0.3949223629278131, + "learning_rate": 3.922672969194686e-05, + "loss": 0.72, + "step": 448 + }, + { + "epoch": 0.7184, + "grad_norm": 0.4066949646787602, + "learning_rate": 3.8815853536798904e-05, + "loss": 0.7605, + "step": 449 + }, + { + "epoch": 0.72, + "grad_norm": 0.3923750393538408, + "learning_rate": 3.840662172471315e-05, + "loss": 0.7038, + "step": 450 + }, + { + "epoch": 0.7216, + "grad_norm": 0.40529782786728324, + "learning_rate": 3.79990452539225e-05, + "loss": 0.7033, + "step": 451 + }, + { + "epoch": 0.7232, + "grad_norm": 0.4299209870019213, + "learning_rate": 3.759313507817196e-05, + "loss": 0.747, + "step": 452 + }, + { + "epoch": 0.7248, + "grad_norm": 0.41000023145491954, + "learning_rate": 3.7188902106424416e-05, + "loss": 0.7578, + "step": 453 + }, + { + "epoch": 0.7264, + "grad_norm": 0.4427874287197096, + "learning_rate": 3.678635720256737e-05, + "loss": 0.7156, + "step": 454 + }, + { + "epoch": 0.728, + "grad_norm": 0.4056149971087807, + "learning_rate": 3.638551118512089e-05, + "loss": 0.7204, + "step": 455 + }, + { + "epoch": 0.7296, + "grad_norm": 0.4735201037988204, + "learning_rate": 3.5986374826947066e-05, + "loss": 0.7071, + "step": 456 + }, + { + "epoch": 0.7312, + "grad_norm": 0.390191805503046, + "learning_rate": 3.558895885496023e-05, + "loss": 0.7422, + "step": 457 + }, + { + "epoch": 0.7328, + "grad_norm": 0.42161085510406815, + "learning_rate": 3.519327394983888e-05, + "loss": 0.6752, + "step": 458 + }, + { + "epoch": 0.7344, + "grad_norm": 0.3634090280667586, + "learning_rate": 3.479933074573858e-05, + "loss": 0.6786, + "step": 459 + }, + { + "epoch": 0.736, + "grad_norm": 0.4175867578453434, + "learning_rate": 3.440713983000601e-05, + "loss": 0.6741, + "step": 460 + }, + { + "epoch": 0.7376, + "grad_norm": 0.38021645890769495, + "learning_rate": 3.401671174289469e-05, + "loss": 0.6667, + "step": 461 + }, + { + "epoch": 0.7392, + "grad_norm": 0.3629416835719003, + "learning_rate": 3.362805697728145e-05, + "loss": 0.7011, + "step": 462 + }, + { + "epoch": 0.7408, + "grad_norm": 0.37758032419229504, + "learning_rate": 3.324118597838464e-05, + "loss": 0.6847, + "step": 463 + }, + { + "epoch": 0.7424, + "grad_norm": 0.37926863810244416, + "learning_rate": 3.285610914348332e-05, + "loss": 0.6843, + "step": 464 + }, + { + "epoch": 0.744, + "grad_norm": 0.40483928275961767, + "learning_rate": 3.2472836821637744e-05, + "loss": 0.7684, + "step": 465 + }, + { + "epoch": 0.7456, + "grad_norm": 0.4012428830040729, + "learning_rate": 3.209137931341143e-05, + "loss": 0.7423, + "step": 466 + }, + { + "epoch": 0.7472, + "grad_norm": 0.4531335988464249, + "learning_rate": 3.1711746870594086e-05, + "loss": 0.7868, + "step": 467 + }, + { + "epoch": 0.7488, + "grad_norm": 0.3724295350867727, + "learning_rate": 3.1333949695926324e-05, + "loss": 0.7229, + "step": 468 + }, + { + "epoch": 0.7504, + "grad_norm": 0.31844459673385905, + "learning_rate": 3.0957997942825336e-05, + "loss": 0.6571, + "step": 469 + }, + { + "epoch": 0.752, + "grad_norm": 0.3817107137939138, + "learning_rate": 3.058390171511196e-05, + "loss": 0.658, + "step": 470 + }, + { + "epoch": 0.7536, + "grad_norm": 0.41823823403293325, + "learning_rate": 3.021167106673928e-05, + "loss": 0.7522, + "step": 471 + }, + { + "epoch": 0.7552, + "grad_norm": 0.3853953899148513, + "learning_rate": 2.9841316001522347e-05, + "loss": 0.7086, + "step": 472 + }, + { + "epoch": 0.7568, + "grad_norm": 0.37444613159378504, + "learning_rate": 2.9472846472869298e-05, + "loss": 0.6852, + "step": 473 + }, + { + "epoch": 0.7584, + "grad_norm": 0.3364469465738454, + "learning_rate": 2.9106272383513835e-05, + "loss": 0.6309, + "step": 474 + }, + { + "epoch": 0.76, + "grad_norm": 0.3766916878108581, + "learning_rate": 2.874160358524931e-05, + "loss": 0.6876, + "step": 475 + }, + { + "epoch": 0.7616, + "grad_norm": 0.3687270478967296, + "learning_rate": 2.8378849878663628e-05, + "loss": 0.631, + "step": 476 + }, + { + "epoch": 0.7632, + "grad_norm": 0.43006711836241546, + "learning_rate": 2.8018021012875994e-05, + "loss": 0.7249, + "step": 477 + }, + { + "epoch": 0.7648, + "grad_norm": 0.41376371976415266, + "learning_rate": 2.7659126685275027e-05, + "loss": 0.7457, + "step": 478 + }, + { + "epoch": 0.7664, + "grad_norm": 0.39661335961253313, + "learning_rate": 2.7302176541257986e-05, + "loss": 0.7553, + "step": 479 + }, + { + "epoch": 0.768, + "grad_norm": 0.41278923081094665, + "learning_rate": 2.6947180173971508e-05, + "loss": 0.7389, + "step": 480 + }, + { + "epoch": 0.7696, + "grad_norm": 0.40790022202783777, + "learning_rate": 2.659414712405398e-05, + "loss": 0.7601, + "step": 481 + }, + { + "epoch": 0.7712, + "grad_norm": 0.37338816286727505, + "learning_rate": 2.6243086879379e-05, + "loss": 0.6919, + "step": 482 + }, + { + "epoch": 0.7728, + "grad_norm": 0.444446835884112, + "learning_rate": 2.5894008874800325e-05, + "loss": 0.6802, + "step": 483 + }, + { + "epoch": 0.7744, + "grad_norm": 0.43917108463930477, + "learning_rate": 2.5546922491898495e-05, + "loss": 0.7169, + "step": 484 + }, + { + "epoch": 0.776, + "grad_norm": 0.3782458515975163, + "learning_rate": 2.5201837058728505e-05, + "loss": 0.7329, + "step": 485 + }, + { + "epoch": 0.7776, + "grad_norm": 0.44412516224453413, + "learning_rate": 2.485876184956928e-05, + "loss": 0.7783, + "step": 486 + }, + { + "epoch": 0.7792, + "grad_norm": 0.3706509169863611, + "learning_rate": 2.451770608467432e-05, + "loss": 0.6854, + "step": 487 + }, + { + "epoch": 0.7808, + "grad_norm": 0.3857956584271855, + "learning_rate": 2.417867893002387e-05, + "loss": 0.6859, + "step": 488 + }, + { + "epoch": 0.7824, + "grad_norm": 0.4003236290741562, + "learning_rate": 2.3841689497078746e-05, + "loss": 0.7439, + "step": 489 + }, + { + "epoch": 0.784, + "grad_norm": 0.3760773928985438, + "learning_rate": 2.3506746842535242e-05, + "loss": 0.6723, + "step": 490 + }, + { + "epoch": 0.7856, + "grad_norm": 0.36045457976400214, + "learning_rate": 2.3173859968081944e-05, + "loss": 0.6893, + "step": 491 + }, + { + "epoch": 0.7872, + "grad_norm": 1.2048342969352837, + "learning_rate": 2.2843037820157675e-05, + "loss": 0.6331, + "step": 492 + }, + { + "epoch": 0.7888, + "grad_norm": 0.3674614487857438, + "learning_rate": 2.251428928971102e-05, + "loss": 0.6861, + "step": 493 + }, + { + "epoch": 0.7904, + "grad_norm": 0.4309117087357981, + "learning_rate": 2.2187623211961562e-05, + "loss": 0.7497, + "step": 494 + }, + { + "epoch": 0.792, + "grad_norm": 0.3746159140343749, + "learning_rate": 2.1863048366162208e-05, + "loss": 0.6925, + "step": 495 + }, + { + "epoch": 0.7936, + "grad_norm": 0.49526397518811116, + "learning_rate": 2.1540573475363402e-05, + "loss": 0.8221, + "step": 496 + }, + { + "epoch": 0.7952, + "grad_norm": 0.3552528891639267, + "learning_rate": 2.1220207206178688e-05, + "loss": 0.6632, + "step": 497 + }, + { + "epoch": 0.7968, + "grad_norm": 0.3850478844126527, + "learning_rate": 2.0901958168551638e-05, + "loss": 0.7187, + "step": 498 + }, + { + "epoch": 0.7984, + "grad_norm": 0.45331788252803906, + "learning_rate": 2.058583491552465e-05, + "loss": 0.7863, + "step": 499 + }, + { + "epoch": 0.8, + "grad_norm": 0.41113094587203364, + "learning_rate": 2.027184594300898e-05, + "loss": 0.7442, + "step": 500 + }, + { + "epoch": 0.8016, + "grad_norm": 0.34173299346522756, + "learning_rate": 1.995999968955641e-05, + "loss": 0.621, + "step": 501 + }, + { + "epoch": 0.8032, + "grad_norm": 0.3918093267833597, + "learning_rate": 1.9650304536132426e-05, + "loss": 0.7471, + "step": 502 + }, + { + "epoch": 0.8048, + "grad_norm": 0.4071757676646653, + "learning_rate": 1.9342768805891178e-05, + "loss": 0.6827, + "step": 503 + }, + { + "epoch": 0.8064, + "grad_norm": 0.36461233978685836, + "learning_rate": 1.903740076395151e-05, + "loss": 0.7157, + "step": 504 + }, + { + "epoch": 0.808, + "grad_norm": 0.4173732799907159, + "learning_rate": 1.8734208617174988e-05, + "loss": 0.721, + "step": 505 + }, + { + "epoch": 0.8096, + "grad_norm": 0.38034957823911675, + "learning_rate": 1.8433200513945337e-05, + "loss": 0.7238, + "step": 506 + }, + { + "epoch": 0.8112, + "grad_norm": 0.36901819783380657, + "learning_rate": 1.8134384543949478e-05, + "loss": 0.7073, + "step": 507 + }, + { + "epoch": 0.8128, + "grad_norm": 0.34235739707844226, + "learning_rate": 1.783776873795994e-05, + "loss": 0.6765, + "step": 508 + }, + { + "epoch": 0.8144, + "grad_norm": 0.359160494256085, + "learning_rate": 1.754336106761927e-05, + "loss": 0.6573, + "step": 509 + }, + { + "epoch": 0.816, + "grad_norm": 0.36841471957322675, + "learning_rate": 1.7251169445225657e-05, + "loss": 0.6714, + "step": 510 + }, + { + "epoch": 0.8176, + "grad_norm": 0.3746998392794317, + "learning_rate": 1.696120172352025e-05, + "loss": 0.7187, + "step": 511 + }, + { + "epoch": 0.8192, + "grad_norm": 0.4009926064824942, + "learning_rate": 1.6673465695476232e-05, + "loss": 0.7293, + "step": 512 + }, + { + "epoch": 0.8208, + "grad_norm": 0.39747495239080854, + "learning_rate": 1.6387969094089316e-05, + "loss": 0.7569, + "step": 513 + }, + { + "epoch": 0.8224, + "grad_norm": 0.3603387458815299, + "learning_rate": 1.6104719592169902e-05, + "loss": 0.6585, + "step": 514 + }, + { + "epoch": 0.824, + "grad_norm": 0.35799781284175525, + "learning_rate": 1.5823724802136865e-05, + "loss": 0.6808, + "step": 515 + }, + { + "epoch": 0.8256, + "grad_norm": 0.44114057001829704, + "learning_rate": 1.5544992275813053e-05, + "loss": 0.7489, + "step": 516 + }, + { + "epoch": 0.8272, + "grad_norm": 0.36499192034738037, + "learning_rate": 1.526852950422226e-05, + "loss": 0.6568, + "step": 517 + }, + { + "epoch": 0.8288, + "grad_norm": 0.40822150511019223, + "learning_rate": 1.4994343917387854e-05, + "loss": 0.7737, + "step": 518 + }, + { + "epoch": 0.8304, + "grad_norm": 0.419435383777233, + "learning_rate": 1.4722442884133214e-05, + "loss": 0.7003, + "step": 519 + }, + { + "epoch": 0.832, + "grad_norm": 0.3673923491290724, + "learning_rate": 1.4452833711883628e-05, + "loss": 0.688, + "step": 520 + }, + { + "epoch": 0.8336, + "grad_norm": 0.4014301027660096, + "learning_rate": 1.4185523646469822e-05, + "loss": 0.693, + "step": 521 + }, + { + "epoch": 0.8352, + "grad_norm": 0.36667018873927276, + "learning_rate": 1.3920519871933424e-05, + "loss": 0.679, + "step": 522 + }, + { + "epoch": 0.8368, + "grad_norm": 0.3792293799271566, + "learning_rate": 1.3657829510333654e-05, + "loss": 0.7182, + "step": 523 + }, + { + "epoch": 0.8384, + "grad_norm": 0.34501044137013914, + "learning_rate": 1.339745962155613e-05, + "loss": 0.6377, + "step": 524 + }, + { + "epoch": 0.84, + "grad_norm": 0.3958280906980653, + "learning_rate": 1.3139417203123027e-05, + "loss": 0.666, + "step": 525 + }, + { + "epoch": 0.8416, + "grad_norm": 0.33412031700283, + "learning_rate": 1.2883709190004955e-05, + "loss": 0.6487, + "step": 526 + }, + { + "epoch": 0.8432, + "grad_norm": 0.3801984010243941, + "learning_rate": 1.263034245443473e-05, + "loss": 0.731, + "step": 527 + }, + { + "epoch": 0.8448, + "grad_norm": 0.39145676353884284, + "learning_rate": 1.2379323805722576e-05, + "loss": 0.7134, + "step": 528 + }, + { + "epoch": 0.8464, + "grad_norm": 0.4504736694689959, + "learning_rate": 1.2130659990073146e-05, + "loss": 0.7915, + "step": 529 + }, + { + "epoch": 0.848, + "grad_norm": 0.38525345734357347, + "learning_rate": 1.1884357690404158e-05, + "loss": 0.7145, + "step": 530 + }, + { + "epoch": 0.8496, + "grad_norm": 0.32936707004333354, + "learning_rate": 1.1640423526166988e-05, + "loss": 0.6181, + "step": 531 + }, + { + "epoch": 0.8512, + "grad_norm": 0.35780029503587424, + "learning_rate": 1.1398864053168534e-05, + "loss": 0.6665, + "step": 532 + }, + { + "epoch": 0.8528, + "grad_norm": 0.36326556419103656, + "learning_rate": 1.1159685763395111e-05, + "loss": 0.6328, + "step": 533 + }, + { + "epoch": 0.8544, + "grad_norm": 0.41195049470576833, + "learning_rate": 1.0922895084838037e-05, + "loss": 0.7343, + "step": 534 + }, + { + "epoch": 0.856, + "grad_norm": 0.35621015351537844, + "learning_rate": 1.0688498381320855e-05, + "loss": 0.6396, + "step": 535 + }, + { + "epoch": 0.8576, + "grad_norm": 0.4792182923930832, + "learning_rate": 1.045650195232819e-05, + "loss": 0.6594, + "step": 536 + }, + { + "epoch": 0.8592, + "grad_norm": 0.6149215583954762, + "learning_rate": 1.0226912032836611e-05, + "loss": 0.688, + "step": 537 + }, + { + "epoch": 0.8608, + "grad_norm": 0.4297627637067828, + "learning_rate": 9.999734793146998e-06, + "loss": 0.7642, + "step": 538 + }, + { + "epoch": 0.8624, + "grad_norm": 0.40345632075139887, + "learning_rate": 9.774976338718677e-06, + "loss": 0.7588, + "step": 539 + }, + { + "epoch": 0.864, + "grad_norm": 0.3610496830175501, + "learning_rate": 9.552642710005299e-06, + "loss": 0.7162, + "step": 540 + }, + { + "epoch": 0.8656, + "grad_norm": 0.41011685694858785, + "learning_rate": 9.332739882292752e-06, + "loss": 0.7967, + "step": 541 + }, + { + "epoch": 0.8672, + "grad_norm": 0.3798344941908174, + "learning_rate": 9.115273765538202e-06, + "loss": 0.6582, + "step": 542 + }, + { + "epoch": 0.8688, + "grad_norm": 0.3464177728333097, + "learning_rate": 8.900250204211514e-06, + "loss": 0.6785, + "step": 543 + }, + { + "epoch": 0.8704, + "grad_norm": 0.4312418663664988, + "learning_rate": 8.687674977138116e-06, + "loss": 0.6904, + "step": 544 + }, + { + "epoch": 0.872, + "grad_norm": 0.3948205989738892, + "learning_rate": 8.47755379734373e-06, + "loss": 0.7135, + "step": 545 + }, + { + "epoch": 0.8736, + "grad_norm": 0.43551744339062237, + "learning_rate": 8.269892311900696e-06, + "loss": 0.8037, + "step": 546 + }, + { + "epoch": 0.8752, + "grad_norm": 0.42973270177790496, + "learning_rate": 8.064696101776358e-06, + "loss": 0.7149, + "step": 547 + }, + { + "epoch": 0.8768, + "grad_norm": 0.3931138997667529, + "learning_rate": 7.861970681683051e-06, + "loss": 0.7311, + "step": 548 + }, + { + "epoch": 0.8784, + "grad_norm": 0.4069832256194925, + "learning_rate": 7.661721499929753e-06, + "loss": 0.7481, + "step": 549 + }, + { + "epoch": 0.88, + "grad_norm": 0.35295605615077075, + "learning_rate": 7.463953938275858e-06, + "loss": 0.6563, + "step": 550 + }, + { + "epoch": 0.8816, + "grad_norm": 0.36970147203136483, + "learning_rate": 7.2686733117863784e-06, + "loss": 0.6784, + "step": 551 + }, + { + "epoch": 0.8832, + "grad_norm": 0.3993687277137508, + "learning_rate": 7.07588486868922e-06, + "loss": 0.7054, + "step": 552 + }, + { + "epoch": 0.8848, + "grad_norm": 0.3893239804460525, + "learning_rate": 6.8855937902340576e-06, + "loss": 0.6396, + "step": 553 + }, + { + "epoch": 0.8864, + "grad_norm": 0.3645677559830506, + "learning_rate": 6.6978051905530855e-06, + "loss": 0.6305, + "step": 554 + }, + { + "epoch": 0.888, + "grad_norm": 0.37258441212125676, + "learning_rate": 6.512524116523633e-06, + "loss": 0.6641, + "step": 555 + }, + { + "epoch": 0.8896, + "grad_norm": 0.406222026526566, + "learning_rate": 6.329755547632499e-06, + "loss": 0.7173, + "step": 556 + }, + { + "epoch": 0.8912, + "grad_norm": 0.35254682706810464, + "learning_rate": 6.149504395842087e-06, + "loss": 0.6441, + "step": 557 + }, + { + "epoch": 0.8928, + "grad_norm": 0.34578045575046695, + "learning_rate": 5.971775505458444e-06, + "loss": 0.6401, + "step": 558 + }, + { + "epoch": 0.8944, + "grad_norm": 0.40371305535068225, + "learning_rate": 5.7965736530010916e-06, + "loss": 0.7299, + "step": 559 + }, + { + "epoch": 0.896, + "grad_norm": 0.6228207476057178, + "learning_rate": 5.623903547074549e-06, + "loss": 0.7163, + "step": 560 + }, + { + "epoch": 0.8976, + "grad_norm": 0.3714632656937209, + "learning_rate": 5.453769828241872e-06, + "loss": 0.6307, + "step": 561 + }, + { + "epoch": 0.8992, + "grad_norm": 0.3977346348790776, + "learning_rate": 5.286177068899989e-06, + "loss": 0.7076, + "step": 562 + }, + { + "epoch": 0.9008, + "grad_norm": 0.3784668881873, + "learning_rate": 5.121129773156663e-06, + "loss": 0.6746, + "step": 563 + }, + { + "epoch": 0.9024, + "grad_norm": 0.37457939265841944, + "learning_rate": 4.95863237670956e-06, + "loss": 0.717, + "step": 564 + }, + { + "epoch": 0.904, + "grad_norm": 0.34966436288115627, + "learning_rate": 4.798689246727006e-06, + "loss": 0.6452, + "step": 565 + }, + { + "epoch": 0.9056, + "grad_norm": 0.3901389579345433, + "learning_rate": 4.641304681730641e-06, + "loss": 0.7063, + "step": 566 + }, + { + "epoch": 0.9072, + "grad_norm": 0.3690124987745926, + "learning_rate": 4.486482911479839e-06, + "loss": 0.7087, + "step": 567 + }, + { + "epoch": 0.9088, + "grad_norm": 0.3367381615082695, + "learning_rate": 4.3342280968580285e-06, + "loss": 0.614, + "step": 568 + }, + { + "epoch": 0.9104, + "grad_norm": 0.3846644802828723, + "learning_rate": 4.184544329761009e-06, + "loss": 0.6728, + "step": 569 + }, + { + "epoch": 0.912, + "grad_norm": 0.4241241328617195, + "learning_rate": 4.037435632986786e-06, + "loss": 0.7396, + "step": 570 + }, + { + "epoch": 0.9136, + "grad_norm": 0.39826283281007263, + "learning_rate": 3.892905960127546e-06, + "loss": 0.7462, + "step": 571 + }, + { + "epoch": 0.9152, + "grad_norm": 0.35359225029925234, + "learning_rate": 3.750959195463466e-06, + "loss": 0.7052, + "step": 572 + }, + { + "epoch": 0.9168, + "grad_norm": 0.40396954109044586, + "learning_rate": 3.611599153858214e-06, + "loss": 0.7128, + "step": 573 + }, + { + "epoch": 0.9184, + "grad_norm": 0.346954730447735, + "learning_rate": 3.4748295806564356e-06, + "loss": 0.6635, + "step": 574 + }, + { + "epoch": 0.92, + "grad_norm": 0.38781032367239804, + "learning_rate": 3.3406541515832003e-06, + "loss": 0.707, + "step": 575 + }, + { + "epoch": 0.9216, + "grad_norm": 0.42152169571432246, + "learning_rate": 3.209076472645112e-06, + "loss": 0.7352, + "step": 576 + }, + { + "epoch": 0.9232, + "grad_norm": 0.38507703413309846, + "learning_rate": 3.0801000800333877e-06, + "loss": 0.6512, + "step": 577 + }, + { + "epoch": 0.9248, + "grad_norm": 0.47196913997467754, + "learning_rate": 2.9537284400289355e-06, + "loss": 0.7447, + "step": 578 + }, + { + "epoch": 0.9264, + "grad_norm": 0.3575724745654256, + "learning_rate": 2.8299649489090475e-06, + "loss": 0.6611, + "step": 579 + }, + { + "epoch": 0.928, + "grad_norm": 0.3995944350589313, + "learning_rate": 2.708812932856253e-06, + "loss": 0.7024, + "step": 580 + }, + { + "epoch": 0.9296, + "grad_norm": 0.4241968511826328, + "learning_rate": 2.590275647868867e-06, + "loss": 0.7842, + "step": 581 + }, + { + "epoch": 0.9312, + "grad_norm": 0.37744754885360143, + "learning_rate": 2.4743562796734622e-06, + "loss": 0.7176, + "step": 582 + }, + { + "epoch": 0.9328, + "grad_norm": 0.3695777839879396, + "learning_rate": 2.3610579436393e-06, + "loss": 0.689, + "step": 583 + }, + { + "epoch": 0.9344, + "grad_norm": 0.5692474674640828, + "learning_rate": 2.250383684694579e-06, + "loss": 0.7333, + "step": 584 + }, + { + "epoch": 0.936, + "grad_norm": 0.40111766458101306, + "learning_rate": 2.1423364772445887e-06, + "loss": 0.7541, + "step": 585 + }, + { + "epoch": 0.9376, + "grad_norm": 0.401526406854631, + "learning_rate": 2.036919225091827e-06, + "loss": 0.7282, + "step": 586 + }, + { + "epoch": 0.9392, + "grad_norm": 0.34518584418627035, + "learning_rate": 1.9341347613579087e-06, + "loss": 0.6562, + "step": 587 + }, + { + "epoch": 0.9408, + "grad_norm": 0.38119101383299064, + "learning_rate": 1.8339858484073935e-06, + "loss": 0.7488, + "step": 588 + }, + { + "epoch": 0.9424, + "grad_norm": 0.3950947394304229, + "learning_rate": 1.7364751777736332e-06, + "loss": 0.6646, + "step": 589 + }, + { + "epoch": 0.944, + "grad_norm": 0.37298648609688384, + "learning_rate": 1.6416053700863964e-06, + "loss": 0.7561, + "step": 590 + }, + { + "epoch": 0.9456, + "grad_norm": 0.38545179724312784, + "learning_rate": 1.5493789750014031e-06, + "loss": 0.6542, + "step": 591 + }, + { + "epoch": 0.9472, + "grad_norm": 0.4213751776668678, + "learning_rate": 1.459798471131868e-06, + "loss": 0.7147, + "step": 592 + }, + { + "epoch": 0.9488, + "grad_norm": 0.4113797418789869, + "learning_rate": 1.3728662659818204e-06, + "loss": 0.7601, + "step": 593 + }, + { + "epoch": 0.9504, + "grad_norm": 0.38536088528820606, + "learning_rate": 1.2885846958814673e-06, + "loss": 0.7049, + "step": 594 + }, + { + "epoch": 0.952, + "grad_norm": 0.3940209686407352, + "learning_rate": 1.2069560259243328e-06, + "loss": 0.7093, + "step": 595 + }, + { + "epoch": 0.9536, + "grad_norm": 0.38690608167699586, + "learning_rate": 1.1279824499064396e-06, + "loss": 0.7008, + "step": 596 + }, + { + "epoch": 0.9552, + "grad_norm": 0.3637323418643273, + "learning_rate": 1.0516660902673448e-06, + "loss": 0.6682, + "step": 597 + }, + { + "epoch": 0.9568, + "grad_norm": 0.3613533381200542, + "learning_rate": 9.780089980330642e-07, + "loss": 0.6902, + "step": 598 + }, + { + "epoch": 0.9584, + "grad_norm": 0.3796143995634272, + "learning_rate": 9.070131527609604e-07, + "loss": 0.7431, + "step": 599 + }, + { + "epoch": 0.96, + "grad_norm": 0.3738193351099153, + "learning_rate": 8.386804624865851e-07, + "loss": 0.6407, + "step": 600 + }, + { + "epoch": 0.9616, + "grad_norm": 0.38988250774522487, + "learning_rate": 7.730127636723539e-07, + "loss": 0.6804, + "step": 601 + }, + { + "epoch": 0.9632, + "grad_norm": 0.4016807389934873, + "learning_rate": 7.100118211581852e-07, + "loss": 0.6784, + "step": 602 + }, + { + "epoch": 0.9648, + "grad_norm": 0.48043543790293974, + "learning_rate": 6.496793281141056e-07, + "loss": 0.8607, + "step": 603 + }, + { + "epoch": 0.9664, + "grad_norm": 0.38827723015107324, + "learning_rate": 5.920169059947411e-07, + "loss": 0.778, + "step": 604 + }, + { + "epoch": 0.968, + "grad_norm": 0.34769342382578017, + "learning_rate": 5.370261044956971e-07, + "loss": 0.6027, + "step": 605 + }, + { + "epoch": 0.9696, + "grad_norm": 0.4443948802872273, + "learning_rate": 4.847084015119574e-07, + "loss": 0.7918, + "step": 606 + }, + { + "epoch": 0.9712, + "grad_norm": 0.38797962276906184, + "learning_rate": 4.3506520309813947e-07, + "loss": 0.7203, + "step": 607 + }, + { + "epoch": 0.9728, + "grad_norm": 0.3728930396134099, + "learning_rate": 3.8809784343072366e-07, + "loss": 0.7108, + "step": 608 + }, + { + "epoch": 0.9744, + "grad_norm": 0.45712577327358017, + "learning_rate": 3.4380758477219333e-07, + "loss": 0.761, + "step": 609 + }, + { + "epoch": 0.976, + "grad_norm": 0.3787903003015055, + "learning_rate": 3.0219561743707326e-07, + "loss": 0.6748, + "step": 610 + }, + { + "epoch": 0.9776, + "grad_norm": 0.3828030831780167, + "learning_rate": 2.6326305976001055e-07, + "loss": 0.6948, + "step": 611 + }, + { + "epoch": 0.9792, + "grad_norm": 0.4481459645525971, + "learning_rate": 2.2701095806565432e-07, + "loss": 0.7452, + "step": 612 + }, + { + "epoch": 0.9808, + "grad_norm": 0.42796719785108117, + "learning_rate": 1.9344028664056713e-07, + "loss": 0.7581, + "step": 613 + }, + { + "epoch": 0.9824, + "grad_norm": 0.35796128731751464, + "learning_rate": 1.6255194770704586e-07, + "loss": 0.6817, + "step": 614 + }, + { + "epoch": 0.984, + "grad_norm": 0.4597394998169397, + "learning_rate": 1.3434677139885222e-07, + "loss": 0.7275, + "step": 615 + }, + { + "epoch": 0.9856, + "grad_norm": 0.4387161435546444, + "learning_rate": 1.0882551573891953e-07, + "loss": 0.767, + "step": 616 + }, + { + "epoch": 0.9872, + "grad_norm": 0.6230864849651305, + "learning_rate": 8.598886661895788e-08, + "loss": 0.7307, + "step": 617 + }, + { + "epoch": 0.9888, + "grad_norm": 0.368425023152609, + "learning_rate": 6.583743778106887e-08, + "loss": 0.643, + "step": 618 + }, + { + "epoch": 0.9904, + "grad_norm": 0.4251114006197662, + "learning_rate": 4.837177080119215e-08, + "loss": 0.7252, + "step": 619 + }, + { + "epoch": 0.992, + "grad_norm": 0.38524312435789754, + "learning_rate": 3.359233507459481e-08, + "loss": 0.6405, + "step": 620 + }, + { + "epoch": 0.9936, + "grad_norm": 0.38587067921959417, + "learning_rate": 2.1499527803214846e-08, + "loss": 0.6866, + "step": 621 + }, + { + "epoch": 0.9952, + "grad_norm": 0.5329911655371361, + "learning_rate": 1.209367398504746e-08, + "loss": 0.7437, + "step": 622 + }, + { + "epoch": 0.9968, + "grad_norm": 0.36957638046494906, + "learning_rate": 5.375026405352035e-09, + "loss": 0.7233, + "step": 623 + }, + { + "epoch": 0.9984, + "grad_norm": 0.3529599745664591, + "learning_rate": 1.3437656298687097e-09, + "loss": 0.6753, + "step": 624 + }, + { + "epoch": 1.0, + "grad_norm": 0.3222777567675539, + "learning_rate": 0.0, + "loss": 0.6019, + "step": 625 + }, + { + "epoch": 1.0, + "step": 625, + "total_flos": 564799309512704.0, + "train_loss": 0.7624192319869995, + "train_runtime": 9812.0821, + "train_samples_per_second": 1.019, + "train_steps_per_second": 0.064 + } + ], + "logging_steps": 1.0, + "max_steps": 625, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 564799309512704.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_2_epochs_1_GA_2_lora/README.md b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_2_epochs_1_GA_2_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_2_epochs_1_GA_2_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_2_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_2_epochs_1_GA_2_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8bf098c7db00ad5f90efb5ddac146a4d2ea3dd9f --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_2_epochs_1_GA_2_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "q_proj", + "down_proj", + "up_proj", + "k_proj", + "v_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..cfa7133f35b40b9c56fcdf92119575cade1bbd2e --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56d4a1fc562e1df7086b8402a70b528b9238517c664afc92a8542c141223eb31 +size 671150064 diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_2_epochs_1_GA_2_lora/config.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_2_epochs_1_GA_2_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_2_epochs_1_GA_2_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..ba4c0c90b5fb5065293dbc49c95107dc7f539abe --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1a420a0c6568db3d9b9e19aceca86a024806831d6ffa900fbf7c2a6243dd8ac +size 918507402 diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_2_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_2_epochs_1_GA_2_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d3ce7be30f414db6f6a5d953fbc46404eaff38e8 --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_2_epochs_1_GA_2_lora/trainer_state.json @@ -0,0 +1,4417 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 625, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0016, + "grad_norm": 0.8257624953110868, + "learning_rate": 1.0526315789473684e-05, + "loss": 1.27, + "step": 1 + }, + { + "epoch": 0.0032, + "grad_norm": 0.8849018671370342, + "learning_rate": 2.105263157894737e-05, + "loss": 1.4458, + "step": 2 + }, + { + "epoch": 0.0048, + "grad_norm": 0.7509311592129871, + "learning_rate": 3.157894736842105e-05, + "loss": 1.2182, + "step": 3 + }, + { + "epoch": 0.0064, + "grad_norm": 0.7137680066316227, + "learning_rate": 4.210526315789474e-05, + "loss": 1.2617, + "step": 4 + }, + { + "epoch": 0.008, + "grad_norm": 0.7316186299029664, + "learning_rate": 5.2631578947368424e-05, + "loss": 1.3126, + "step": 5 + }, + { + "epoch": 0.0096, + "grad_norm": 0.6001527072491968, + "learning_rate": 6.31578947368421e-05, + "loss": 1.0612, + "step": 6 + }, + { + "epoch": 0.0112, + "grad_norm": 0.8492302070359525, + "learning_rate": 7.368421052631579e-05, + "loss": 1.0722, + "step": 7 + }, + { + "epoch": 0.0128, + "grad_norm": 1.450302548161873, + "learning_rate": 8.421052631578948e-05, + "loss": 1.0387, + "step": 8 + }, + { + "epoch": 0.0144, + "grad_norm": 0.6960657836646971, + "learning_rate": 9.473684210526316e-05, + "loss": 0.9636, + "step": 9 + }, + { + "epoch": 0.016, + "grad_norm": 0.705172266920731, + "learning_rate": 0.00010526315789473685, + "loss": 0.9926, + "step": 10 + }, + { + "epoch": 0.0176, + "grad_norm": 0.6120426006524602, + "learning_rate": 0.00011578947368421053, + "loss": 0.9623, + "step": 11 + }, + { + "epoch": 0.0192, + "grad_norm": 0.5426022905466087, + "learning_rate": 0.0001263157894736842, + "loss": 0.9781, + "step": 12 + }, + { + "epoch": 0.0208, + "grad_norm": 0.47821872255356507, + "learning_rate": 0.0001368421052631579, + "loss": 0.9036, + "step": 13 + }, + { + "epoch": 0.0224, + "grad_norm": 0.5498607304344992, + "learning_rate": 0.00014736842105263158, + "loss": 0.9288, + "step": 14 + }, + { + "epoch": 0.024, + "grad_norm": 0.4813152944280897, + "learning_rate": 0.00015789473684210527, + "loss": 0.8484, + "step": 15 + }, + { + "epoch": 0.0256, + "grad_norm": 0.4279786143269863, + "learning_rate": 0.00016842105263157895, + "loss": 0.8211, + "step": 16 + }, + { + "epoch": 0.0272, + "grad_norm": 0.5233559183729891, + "learning_rate": 0.00017894736842105264, + "loss": 0.8525, + "step": 17 + }, + { + "epoch": 0.0288, + "grad_norm": 0.5037619024731214, + "learning_rate": 0.00018947368421052632, + "loss": 0.8096, + "step": 18 + }, + { + "epoch": 0.0304, + "grad_norm": 0.47578877531470143, + "learning_rate": 0.0002, + "loss": 0.8462, + "step": 19 + }, + { + "epoch": 0.032, + "grad_norm": 0.4972768484731378, + "learning_rate": 0.00019999865623437013, + "loss": 0.8986, + "step": 20 + }, + { + "epoch": 0.0336, + "grad_norm": 0.4899022691665234, + "learning_rate": 0.00019999462497359466, + "loss": 0.8565, + "step": 21 + }, + { + "epoch": 0.0352, + "grad_norm": 0.5182013476634282, + "learning_rate": 0.00019998790632601496, + "loss": 0.8519, + "step": 22 + }, + { + "epoch": 0.0368, + "grad_norm": 0.4885670832653695, + "learning_rate": 0.0001999785004721968, + "loss": 0.8396, + "step": 23 + }, + { + "epoch": 0.0384, + "grad_norm": 0.5051081412652413, + "learning_rate": 0.00019996640766492543, + "loss": 0.8486, + "step": 24 + }, + { + "epoch": 0.04, + "grad_norm": 0.45830988104135423, + "learning_rate": 0.00019995162822919883, + "loss": 0.9069, + "step": 25 + }, + { + "epoch": 0.0416, + "grad_norm": 0.4819368762206989, + "learning_rate": 0.00019993416256221895, + "loss": 0.9282, + "step": 26 + }, + { + "epoch": 0.0432, + "grad_norm": 0.46110598549601506, + "learning_rate": 0.00019991401113338104, + "loss": 0.9237, + "step": 27 + }, + { + "epoch": 0.0448, + "grad_norm": 0.454165240480369, + "learning_rate": 0.00019989117448426108, + "loss": 0.8468, + "step": 28 + }, + { + "epoch": 0.0464, + "grad_norm": 0.4710043873377542, + "learning_rate": 0.00019986565322860115, + "loss": 0.8353, + "step": 29 + }, + { + "epoch": 0.048, + "grad_norm": 0.42572029425373187, + "learning_rate": 0.00019983744805229296, + "loss": 0.8304, + "step": 30 + }, + { + "epoch": 0.0496, + "grad_norm": 0.40729676507467505, + "learning_rate": 0.00019980655971335945, + "loss": 0.7821, + "step": 31 + }, + { + "epoch": 0.0512, + "grad_norm": 0.5461850316857276, + "learning_rate": 0.00019977298904193437, + "loss": 0.9009, + "step": 32 + }, + { + "epoch": 0.0528, + "grad_norm": 0.5092329216983018, + "learning_rate": 0.00019973673694024, + "loss": 0.872, + "step": 33 + }, + { + "epoch": 0.0544, + "grad_norm": 0.4407306898878146, + "learning_rate": 0.00019969780438256293, + "loss": 0.815, + "step": 34 + }, + { + "epoch": 0.056, + "grad_norm": 0.47477756275372385, + "learning_rate": 0.0001996561924152278, + "loss": 0.9128, + "step": 35 + }, + { + "epoch": 0.0576, + "grad_norm": 0.637199937427722, + "learning_rate": 0.0001996119021565693, + "loss": 0.8611, + "step": 36 + }, + { + "epoch": 0.0592, + "grad_norm": 0.44257844598618723, + "learning_rate": 0.0001995649347969019, + "loss": 0.8562, + "step": 37 + }, + { + "epoch": 0.0608, + "grad_norm": 0.3839093232039565, + "learning_rate": 0.00019951529159848805, + "loss": 0.7666, + "step": 38 + }, + { + "epoch": 0.0624, + "grad_norm": 0.38982421254229743, + "learning_rate": 0.00019946297389550433, + "loss": 0.7774, + "step": 39 + }, + { + "epoch": 0.064, + "grad_norm": 0.4672412257535968, + "learning_rate": 0.00019940798309400526, + "loss": 0.87, + "step": 40 + }, + { + "epoch": 0.0656, + "grad_norm": 0.49388594602208363, + "learning_rate": 0.0001993503206718859, + "loss": 0.9593, + "step": 41 + }, + { + "epoch": 0.0672, + "grad_norm": 0.40004466148897183, + "learning_rate": 0.00019928998817884182, + "loss": 0.7857, + "step": 42 + }, + { + "epoch": 0.0688, + "grad_norm": 0.4391729474981486, + "learning_rate": 0.00019922698723632767, + "loss": 0.9029, + "step": 43 + }, + { + "epoch": 0.0704, + "grad_norm": 0.4062100678606932, + "learning_rate": 0.00019916131953751342, + "loss": 0.8269, + "step": 44 + }, + { + "epoch": 0.072, + "grad_norm": 0.4884470051347818, + "learning_rate": 0.00019909298684723904, + "loss": 0.9164, + "step": 45 + }, + { + "epoch": 0.0736, + "grad_norm": 0.4487857448368005, + "learning_rate": 0.00019902199100196697, + "loss": 0.7965, + "step": 46 + }, + { + "epoch": 0.0752, + "grad_norm": 0.39765220657903605, + "learning_rate": 0.00019894833390973266, + "loss": 0.7784, + "step": 47 + }, + { + "epoch": 0.0768, + "grad_norm": 0.4601633869157618, + "learning_rate": 0.00019887201755009357, + "loss": 0.9308, + "step": 48 + }, + { + "epoch": 0.0784, + "grad_norm": 0.4101666126792353, + "learning_rate": 0.0001987930439740757, + "loss": 0.754, + "step": 49 + }, + { + "epoch": 0.08, + "grad_norm": 0.44548198621900076, + "learning_rate": 0.00019871141530411853, + "loss": 0.856, + "step": 50 + }, + { + "epoch": 0.0816, + "grad_norm": 0.5801328126105747, + "learning_rate": 0.0001986271337340182, + "loss": 0.8932, + "step": 51 + }, + { + "epoch": 0.0832, + "grad_norm": 0.36856601944646905, + "learning_rate": 0.00019854020152886814, + "loss": 0.7528, + "step": 52 + }, + { + "epoch": 0.0848, + "grad_norm": 0.45425100785181777, + "learning_rate": 0.0001984506210249986, + "loss": 0.8669, + "step": 53 + }, + { + "epoch": 0.0864, + "grad_norm": 0.49515129914524086, + "learning_rate": 0.00019835839462991361, + "loss": 0.8666, + "step": 54 + }, + { + "epoch": 0.088, + "grad_norm": 0.449627157260385, + "learning_rate": 0.00019826352482222638, + "loss": 0.8773, + "step": 55 + }, + { + "epoch": 0.0896, + "grad_norm": 0.4519274467445987, + "learning_rate": 0.00019816601415159263, + "loss": 0.8933, + "step": 56 + }, + { + "epoch": 0.0912, + "grad_norm": 0.5315561619090179, + "learning_rate": 0.0001980658652386421, + "loss": 0.966, + "step": 57 + }, + { + "epoch": 0.0928, + "grad_norm": 0.41488901833456876, + "learning_rate": 0.00019796308077490817, + "loss": 0.7854, + "step": 58 + }, + { + "epoch": 0.0944, + "grad_norm": 0.4209767323467047, + "learning_rate": 0.00019785766352275542, + "loss": 0.754, + "step": 59 + }, + { + "epoch": 0.096, + "grad_norm": 0.4646050150852957, + "learning_rate": 0.00019774961631530545, + "loss": 0.8864, + "step": 60 + }, + { + "epoch": 0.0976, + "grad_norm": 0.40622135118720293, + "learning_rate": 0.00019763894205636072, + "loss": 0.7717, + "step": 61 + }, + { + "epoch": 0.0992, + "grad_norm": 0.4105597365968364, + "learning_rate": 0.00019752564372032657, + "loss": 0.8562, + "step": 62 + }, + { + "epoch": 0.1008, + "grad_norm": 0.3976819335123263, + "learning_rate": 0.00019740972435213115, + "loss": 0.8209, + "step": 63 + }, + { + "epoch": 0.1024, + "grad_norm": 0.3923890516912274, + "learning_rate": 0.00019729118706714375, + "loss": 0.7461, + "step": 64 + }, + { + "epoch": 0.104, + "grad_norm": 0.49781743100843345, + "learning_rate": 0.00019717003505109095, + "loss": 0.908, + "step": 65 + }, + { + "epoch": 0.1056, + "grad_norm": 0.3664119530957589, + "learning_rate": 0.00019704627155997108, + "loss": 0.748, + "step": 66 + }, + { + "epoch": 0.1072, + "grad_norm": 0.3868815767533198, + "learning_rate": 0.00019691989991996663, + "loss": 0.7793, + "step": 67 + }, + { + "epoch": 0.1088, + "grad_norm": 0.4637771670849159, + "learning_rate": 0.0001967909235273549, + "loss": 0.7995, + "step": 68 + }, + { + "epoch": 0.1104, + "grad_norm": 0.5112540673791847, + "learning_rate": 0.00019665934584841682, + "loss": 0.8757, + "step": 69 + }, + { + "epoch": 0.112, + "grad_norm": 0.5215635020199666, + "learning_rate": 0.00019652517041934356, + "loss": 0.8088, + "step": 70 + }, + { + "epoch": 0.1136, + "grad_norm": 0.4175422409943572, + "learning_rate": 0.00019638840084614182, + "loss": 0.7561, + "step": 71 + }, + { + "epoch": 0.1152, + "grad_norm": 0.4199343991060303, + "learning_rate": 0.00019624904080453655, + "loss": 0.818, + "step": 72 + }, + { + "epoch": 0.1168, + "grad_norm": 0.4286859957616902, + "learning_rate": 0.00019610709403987246, + "loss": 0.8261, + "step": 73 + }, + { + "epoch": 0.1184, + "grad_norm": 0.44104864828984747, + "learning_rate": 0.00019596256436701324, + "loss": 0.8134, + "step": 74 + }, + { + "epoch": 0.12, + "grad_norm": 0.4898049596218389, + "learning_rate": 0.000195815455670239, + "loss": 0.8684, + "step": 75 + }, + { + "epoch": 0.1216, + "grad_norm": 0.37882329742187837, + "learning_rate": 0.00019566577190314197, + "loss": 0.7615, + "step": 76 + }, + { + "epoch": 0.1232, + "grad_norm": 0.4342522279372195, + "learning_rate": 0.0001955135170885202, + "loss": 0.8552, + "step": 77 + }, + { + "epoch": 0.1248, + "grad_norm": 0.39918003097612353, + "learning_rate": 0.00019535869531826937, + "loss": 0.8081, + "step": 78 + }, + { + "epoch": 0.1264, + "grad_norm": 0.3843608742830849, + "learning_rate": 0.00019520131075327298, + "loss": 0.7525, + "step": 79 + }, + { + "epoch": 0.128, + "grad_norm": 0.41185360675496846, + "learning_rate": 0.00019504136762329047, + "loss": 0.7833, + "step": 80 + }, + { + "epoch": 0.1296, + "grad_norm": 0.4712143549738129, + "learning_rate": 0.00019487887022684336, + "loss": 0.8979, + "step": 81 + }, + { + "epoch": 0.1312, + "grad_norm": 0.46496616702516796, + "learning_rate": 0.00019471382293110003, + "loss": 0.8028, + "step": 82 + }, + { + "epoch": 0.1328, + "grad_norm": 0.39756632366171013, + "learning_rate": 0.00019454623017175812, + "loss": 0.7294, + "step": 83 + }, + { + "epoch": 0.1344, + "grad_norm": 0.48898786552510476, + "learning_rate": 0.00019437609645292546, + "loss": 0.8183, + "step": 84 + }, + { + "epoch": 0.136, + "grad_norm": 0.4650291567120808, + "learning_rate": 0.0001942034263469989, + "loss": 0.8263, + "step": 85 + }, + { + "epoch": 0.1376, + "grad_norm": 0.5712177965190297, + "learning_rate": 0.00019402822449454153, + "loss": 0.9212, + "step": 86 + }, + { + "epoch": 0.1392, + "grad_norm": 0.46084933887694657, + "learning_rate": 0.00019385049560415794, + "loss": 0.854, + "step": 87 + }, + { + "epoch": 0.1408, + "grad_norm": 0.4287071742279292, + "learning_rate": 0.00019367024445236754, + "loss": 0.7442, + "step": 88 + }, + { + "epoch": 0.1424, + "grad_norm": 0.44196836754754315, + "learning_rate": 0.00019348747588347637, + "loss": 0.8025, + "step": 89 + }, + { + "epoch": 0.144, + "grad_norm": 0.4323605793101959, + "learning_rate": 0.00019330219480944694, + "loss": 0.8067, + "step": 90 + }, + { + "epoch": 0.1456, + "grad_norm": 0.4435088889631693, + "learning_rate": 0.00019311440620976597, + "loss": 0.8735, + "step": 91 + }, + { + "epoch": 0.1472, + "grad_norm": 0.4145358036680274, + "learning_rate": 0.0001929241151313108, + "loss": 0.7529, + "step": 92 + }, + { + "epoch": 0.1488, + "grad_norm": 0.4347155628860444, + "learning_rate": 0.00019273132668821364, + "loss": 0.8356, + "step": 93 + }, + { + "epoch": 0.1504, + "grad_norm": 0.44837133567887316, + "learning_rate": 0.00019253604606172417, + "loss": 0.7801, + "step": 94 + }, + { + "epoch": 0.152, + "grad_norm": 0.37201668165668317, + "learning_rate": 0.00019233827850007027, + "loss": 0.7497, + "step": 95 + }, + { + "epoch": 0.1536, + "grad_norm": 0.5081356279683785, + "learning_rate": 0.00019213802931831696, + "loss": 0.8867, + "step": 96 + }, + { + "epoch": 0.1552, + "grad_norm": 0.3938546028533844, + "learning_rate": 0.00019193530389822363, + "loss": 0.7808, + "step": 97 + }, + { + "epoch": 0.1568, + "grad_norm": 0.4500008885494135, + "learning_rate": 0.00019173010768809933, + "loss": 0.8128, + "step": 98 + }, + { + "epoch": 0.1584, + "grad_norm": 0.4495024223333861, + "learning_rate": 0.0001915224462026563, + "loss": 0.81, + "step": 99 + }, + { + "epoch": 0.16, + "grad_norm": 0.4406379829690688, + "learning_rate": 0.00019131232502286188, + "loss": 0.7955, + "step": 100 + }, + { + "epoch": 0.1616, + "grad_norm": 0.41826780433074173, + "learning_rate": 0.0001910997497957885, + "loss": 0.8126, + "step": 101 + }, + { + "epoch": 0.1632, + "grad_norm": 0.420674633705996, + "learning_rate": 0.00019088472623446183, + "loss": 0.8429, + "step": 102 + }, + { + "epoch": 0.1648, + "grad_norm": 0.44375081790379906, + "learning_rate": 0.00019066726011770726, + "loss": 0.8415, + "step": 103 + }, + { + "epoch": 0.1664, + "grad_norm": 0.4156010025808981, + "learning_rate": 0.0001904473572899947, + "loss": 0.8246, + "step": 104 + }, + { + "epoch": 0.168, + "grad_norm": 0.4163840175778524, + "learning_rate": 0.00019022502366128135, + "loss": 0.8027, + "step": 105 + }, + { + "epoch": 0.1696, + "grad_norm": 0.42829205970203965, + "learning_rate": 0.00019000026520685302, + "loss": 0.8678, + "step": 106 + }, + { + "epoch": 0.1712, + "grad_norm": 0.5169822667669132, + "learning_rate": 0.0001897730879671634, + "loss": 0.8952, + "step": 107 + }, + { + "epoch": 0.1728, + "grad_norm": 0.4115600454299799, + "learning_rate": 0.00018954349804767184, + "loss": 0.8171, + "step": 108 + }, + { + "epoch": 0.1744, + "grad_norm": 0.5459970450426384, + "learning_rate": 0.00018931150161867916, + "loss": 0.9308, + "step": 109 + }, + { + "epoch": 0.176, + "grad_norm": 0.46653161274206084, + "learning_rate": 0.00018907710491516199, + "loss": 0.8286, + "step": 110 + }, + { + "epoch": 0.1776, + "grad_norm": 0.4169817851447276, + "learning_rate": 0.0001888403142366049, + "loss": 0.785, + "step": 111 + }, + { + "epoch": 0.1792, + "grad_norm": 0.4296986336280778, + "learning_rate": 0.00018860113594683148, + "loss": 0.7317, + "step": 112 + }, + { + "epoch": 0.1808, + "grad_norm": 0.4598289764406156, + "learning_rate": 0.00018835957647383303, + "loss": 0.8509, + "step": 113 + }, + { + "epoch": 0.1824, + "grad_norm": 0.4284789450172835, + "learning_rate": 0.00018811564230959588, + "loss": 0.8109, + "step": 114 + }, + { + "epoch": 0.184, + "grad_norm": 0.4085105133998958, + "learning_rate": 0.00018786934000992688, + "loss": 0.7483, + "step": 115 + }, + { + "epoch": 0.1856, + "grad_norm": 0.4223516139021159, + "learning_rate": 0.00018762067619427746, + "loss": 0.8008, + "step": 116 + }, + { + "epoch": 0.1872, + "grad_norm": 0.4629910355273375, + "learning_rate": 0.00018736965754556528, + "loss": 0.8232, + "step": 117 + }, + { + "epoch": 0.1888, + "grad_norm": 0.422795987366406, + "learning_rate": 0.00018711629080999504, + "loss": 0.7653, + "step": 118 + }, + { + "epoch": 0.1904, + "grad_norm": 0.4366821075522493, + "learning_rate": 0.00018686058279687698, + "loss": 0.7955, + "step": 119 + }, + { + "epoch": 0.192, + "grad_norm": 0.49327921484191617, + "learning_rate": 0.00018660254037844388, + "loss": 0.8061, + "step": 120 + }, + { + "epoch": 0.1936, + "grad_norm": 0.4130638414034734, + "learning_rate": 0.00018634217048966637, + "loss": 0.777, + "step": 121 + }, + { + "epoch": 0.1952, + "grad_norm": 0.4021659071975698, + "learning_rate": 0.0001860794801280666, + "loss": 0.7052, + "step": 122 + }, + { + "epoch": 0.1968, + "grad_norm": 0.4271748007291228, + "learning_rate": 0.0001858144763535302, + "loss": 0.7365, + "step": 123 + }, + { + "epoch": 0.1984, + "grad_norm": 0.5299687239505216, + "learning_rate": 0.0001855471662881164, + "loss": 0.8153, + "step": 124 + }, + { + "epoch": 0.2, + "grad_norm": 0.4417088026157466, + "learning_rate": 0.00018527755711586678, + "loss": 0.7942, + "step": 125 + }, + { + "epoch": 0.2016, + "grad_norm": 0.45394229276773096, + "learning_rate": 0.00018500565608261214, + "loss": 0.8192, + "step": 126 + }, + { + "epoch": 0.2032, + "grad_norm": 0.412362902033608, + "learning_rate": 0.00018473147049577774, + "loss": 0.7597, + "step": 127 + }, + { + "epoch": 0.2048, + "grad_norm": 0.4007337048627665, + "learning_rate": 0.00018445500772418697, + "loss": 0.7681, + "step": 128 + }, + { + "epoch": 0.2064, + "grad_norm": 0.4615639947139987, + "learning_rate": 0.00018417627519786315, + "loss": 0.8859, + "step": 129 + }, + { + "epoch": 0.208, + "grad_norm": 0.4302940521491683, + "learning_rate": 0.00018389528040783012, + "loss": 0.7851, + "step": 130 + }, + { + "epoch": 0.2096, + "grad_norm": 0.4627462262458623, + "learning_rate": 0.00018361203090591071, + "loss": 0.8236, + "step": 131 + }, + { + "epoch": 0.2112, + "grad_norm": 0.5225423155947857, + "learning_rate": 0.00018332653430452376, + "loss": 0.8811, + "step": 132 + }, + { + "epoch": 0.2128, + "grad_norm": 0.4118106570776621, + "learning_rate": 0.00018303879827647975, + "loss": 0.75, + "step": 133 + }, + { + "epoch": 0.2144, + "grad_norm": 0.40334530582040934, + "learning_rate": 0.00018274883055477436, + "loss": 0.7871, + "step": 134 + }, + { + "epoch": 0.216, + "grad_norm": 0.481257405404212, + "learning_rate": 0.00018245663893238075, + "loss": 0.7831, + "step": 135 + }, + { + "epoch": 0.2176, + "grad_norm": 0.395477348102348, + "learning_rate": 0.00018216223126204007, + "loss": 0.7759, + "step": 136 + }, + { + "epoch": 0.2192, + "grad_norm": 0.47736217256570684, + "learning_rate": 0.00018186561545605054, + "loss": 0.8457, + "step": 137 + }, + { + "epoch": 0.2208, + "grad_norm": 0.41422381746618936, + "learning_rate": 0.00018156679948605467, + "loss": 0.718, + "step": 138 + }, + { + "epoch": 0.2224, + "grad_norm": 0.4377367205851379, + "learning_rate": 0.00018126579138282503, + "loss": 0.8147, + "step": 139 + }, + { + "epoch": 0.224, + "grad_norm": 0.43847669953968815, + "learning_rate": 0.0001809625992360485, + "loss": 0.8322, + "step": 140 + }, + { + "epoch": 0.2256, + "grad_norm": 0.45068408744912125, + "learning_rate": 0.00018065723119410884, + "loss": 0.7826, + "step": 141 + }, + { + "epoch": 0.2272, + "grad_norm": 0.42442234856525457, + "learning_rate": 0.00018034969546386757, + "loss": 0.7558, + "step": 142 + }, + { + "epoch": 0.2288, + "grad_norm": 0.42801746913566524, + "learning_rate": 0.0001800400003104436, + "loss": 0.7609, + "step": 143 + }, + { + "epoch": 0.2304, + "grad_norm": 0.4156507051576132, + "learning_rate": 0.00017972815405699103, + "loss": 0.7683, + "step": 144 + }, + { + "epoch": 0.232, + "grad_norm": 0.4473201587405533, + "learning_rate": 0.00017941416508447536, + "loss": 0.7999, + "step": 145 + }, + { + "epoch": 0.2336, + "grad_norm": 0.4450985377903021, + "learning_rate": 0.0001790980418314484, + "loss": 0.8379, + "step": 146 + }, + { + "epoch": 0.2352, + "grad_norm": 0.47962557581652776, + "learning_rate": 0.00017877979279382135, + "loss": 0.8857, + "step": 147 + }, + { + "epoch": 0.2368, + "grad_norm": 0.4545841624688034, + "learning_rate": 0.0001784594265246366, + "loss": 0.8338, + "step": 148 + }, + { + "epoch": 0.2384, + "grad_norm": 0.4872334190206932, + "learning_rate": 0.0001781369516338378, + "loss": 0.7465, + "step": 149 + }, + { + "epoch": 0.24, + "grad_norm": 0.4115959905031825, + "learning_rate": 0.00017781237678803847, + "loss": 0.7727, + "step": 150 + }, + { + "epoch": 0.2416, + "grad_norm": 0.42134033238754837, + "learning_rate": 0.000177485710710289, + "loss": 0.8107, + "step": 151 + }, + { + "epoch": 0.2432, + "grad_norm": 0.4651537446633174, + "learning_rate": 0.00017715696217984235, + "loss": 0.7605, + "step": 152 + }, + { + "epoch": 0.2448, + "grad_norm": 0.3958512961688566, + "learning_rate": 0.00017682614003191807, + "loss": 0.7908, + "step": 153 + }, + { + "epoch": 0.2464, + "grad_norm": 0.40374748010966977, + "learning_rate": 0.00017649325315746478, + "loss": 0.8042, + "step": 154 + }, + { + "epoch": 0.248, + "grad_norm": 0.46632059523037944, + "learning_rate": 0.0001761583105029213, + "loss": 0.8747, + "step": 155 + }, + { + "epoch": 0.2496, + "grad_norm": 0.4089827861507361, + "learning_rate": 0.00017582132106997616, + "loss": 0.7378, + "step": 156 + }, + { + "epoch": 0.2512, + "grad_norm": 0.4204564786385584, + "learning_rate": 0.00017548229391532572, + "loss": 0.7679, + "step": 157 + }, + { + "epoch": 0.2528, + "grad_norm": 0.43135946170039424, + "learning_rate": 0.00017514123815043074, + "loss": 0.7364, + "step": 158 + }, + { + "epoch": 0.2544, + "grad_norm": 0.4572562720487233, + "learning_rate": 0.00017479816294127152, + "loss": 0.7828, + "step": 159 + }, + { + "epoch": 0.256, + "grad_norm": 0.3964215078366468, + "learning_rate": 0.0001744530775081015, + "loss": 0.7606, + "step": 160 + }, + { + "epoch": 0.2576, + "grad_norm": 0.4079025985071964, + "learning_rate": 0.0001741059911251997, + "loss": 0.7709, + "step": 161 + }, + { + "epoch": 0.2592, + "grad_norm": 0.41629485307444286, + "learning_rate": 0.000173756913120621, + "loss": 0.804, + "step": 162 + }, + { + "epoch": 0.2608, + "grad_norm": 0.48862714334532076, + "learning_rate": 0.00017340585287594604, + "loss": 0.8566, + "step": 163 + }, + { + "epoch": 0.2624, + "grad_norm": 0.5481987062788519, + "learning_rate": 0.0001730528198260285, + "loss": 0.8234, + "step": 164 + }, + { + "epoch": 0.264, + "grad_norm": 0.42388491477007645, + "learning_rate": 0.00017269782345874203, + "loss": 0.8151, + "step": 165 + }, + { + "epoch": 0.2656, + "grad_norm": 0.4278050985828012, + "learning_rate": 0.00017234087331472497, + "loss": 0.8268, + "step": 166 + }, + { + "epoch": 0.2672, + "grad_norm": 0.4896370556326024, + "learning_rate": 0.00017198197898712404, + "loss": 0.9108, + "step": 167 + }, + { + "epoch": 0.2688, + "grad_norm": 0.47988687499860594, + "learning_rate": 0.00017162115012133643, + "loss": 0.8804, + "step": 168 + }, + { + "epoch": 0.2704, + "grad_norm": 0.39563609410528994, + "learning_rate": 0.00017125839641475072, + "loss": 0.7929, + "step": 169 + }, + { + "epoch": 0.272, + "grad_norm": 0.4137808481888729, + "learning_rate": 0.00017089372761648616, + "loss": 0.8035, + "step": 170 + }, + { + "epoch": 0.2736, + "grad_norm": 0.44737224794448993, + "learning_rate": 0.00017052715352713075, + "loss": 0.8587, + "step": 171 + }, + { + "epoch": 0.2752, + "grad_norm": 0.4320781272251299, + "learning_rate": 0.00017015868399847768, + "loss": 0.8008, + "step": 172 + }, + { + "epoch": 0.2768, + "grad_norm": 0.5549253939838589, + "learning_rate": 0.00016978832893326074, + "loss": 0.7384, + "step": 173 + }, + { + "epoch": 0.2784, + "grad_norm": 0.40368626607297975, + "learning_rate": 0.00016941609828488807, + "loss": 0.7752, + "step": 174 + }, + { + "epoch": 0.28, + "grad_norm": 0.3919632231202716, + "learning_rate": 0.0001690420020571747, + "loss": 0.758, + "step": 175 + }, + { + "epoch": 0.2816, + "grad_norm": 0.38846970875015685, + "learning_rate": 0.0001686660503040737, + "loss": 0.7407, + "step": 176 + }, + { + "epoch": 0.2832, + "grad_norm": 0.4234695021547494, + "learning_rate": 0.00016828825312940592, + "loss": 0.8041, + "step": 177 + }, + { + "epoch": 0.2848, + "grad_norm": 0.37033350020258665, + "learning_rate": 0.0001679086206865886, + "loss": 0.7239, + "step": 178 + }, + { + "epoch": 0.2864, + "grad_norm": 0.450640661964644, + "learning_rate": 0.00016752716317836229, + "loss": 0.8635, + "step": 179 + }, + { + "epoch": 0.288, + "grad_norm": 0.414006871374076, + "learning_rate": 0.0001671438908565167, + "loss": 0.8208, + "step": 180 + }, + { + "epoch": 0.2896, + "grad_norm": 0.385575077556316, + "learning_rate": 0.00016675881402161536, + "loss": 0.7393, + "step": 181 + }, + { + "epoch": 0.2912, + "grad_norm": 0.37700313300165783, + "learning_rate": 0.0001663719430227186, + "loss": 0.7203, + "step": 182 + }, + { + "epoch": 0.2928, + "grad_norm": 0.36571802827600036, + "learning_rate": 0.00016598328825710533, + "loss": 0.7634, + "step": 183 + }, + { + "epoch": 0.2944, + "grad_norm": 0.43104127310561624, + "learning_rate": 0.000165592860169994, + "loss": 0.8244, + "step": 184 + }, + { + "epoch": 0.296, + "grad_norm": 0.4272358408847561, + "learning_rate": 0.00016520066925426144, + "loss": 0.8464, + "step": 185 + }, + { + "epoch": 0.2976, + "grad_norm": 0.4224505342065028, + "learning_rate": 0.0001648067260501611, + "loss": 0.7587, + "step": 186 + }, + { + "epoch": 0.2992, + "grad_norm": 0.44320846402142194, + "learning_rate": 0.0001644110411450398, + "loss": 0.7414, + "step": 187 + }, + { + "epoch": 0.3008, + "grad_norm": 0.40422936071607923, + "learning_rate": 0.00016401362517305296, + "loss": 0.7393, + "step": 188 + }, + { + "epoch": 0.3024, + "grad_norm": 0.38470066057426316, + "learning_rate": 0.00016361448881487914, + "loss": 0.7608, + "step": 189 + }, + { + "epoch": 0.304, + "grad_norm": 0.42680134499154426, + "learning_rate": 0.00016321364279743266, + "loss": 0.8225, + "step": 190 + }, + { + "epoch": 0.3056, + "grad_norm": 0.39199904277864656, + "learning_rate": 0.0001628110978935756, + "loss": 0.7313, + "step": 191 + }, + { + "epoch": 0.3072, + "grad_norm": 0.48430667283851375, + "learning_rate": 0.00016240686492182804, + "loss": 0.7957, + "step": 192 + }, + { + "epoch": 0.3088, + "grad_norm": 0.5008213998300665, + "learning_rate": 0.00016200095474607753, + "loss": 0.8366, + "step": 193 + }, + { + "epoch": 0.3104, + "grad_norm": 0.4258478451411289, + "learning_rate": 0.00016159337827528685, + "loss": 0.8014, + "step": 194 + }, + { + "epoch": 0.312, + "grad_norm": 0.41286270326182095, + "learning_rate": 0.0001611841464632011, + "loss": 0.7777, + "step": 195 + }, + { + "epoch": 0.3136, + "grad_norm": 0.4608928926555317, + "learning_rate": 0.0001607732703080532, + "loss": 0.7625, + "step": 196 + }, + { + "epoch": 0.3152, + "grad_norm": 0.37566052898735275, + "learning_rate": 0.00016036076085226814, + "loss": 0.7269, + "step": 197 + }, + { + "epoch": 0.3168, + "grad_norm": 0.40093337371128995, + "learning_rate": 0.0001599466291821666, + "loss": 0.7389, + "step": 198 + }, + { + "epoch": 0.3184, + "grad_norm": 0.4250575628607831, + "learning_rate": 0.0001595308864276666, + "loss": 0.7801, + "step": 199 + }, + { + "epoch": 0.32, + "grad_norm": 0.38638385236682093, + "learning_rate": 0.0001591135437619847, + "loss": 0.7088, + "step": 200 + }, + { + "epoch": 0.3216, + "grad_norm": 0.4490332176805291, + "learning_rate": 0.0001586946124013354, + "loss": 0.8242, + "step": 201 + }, + { + "epoch": 0.3232, + "grad_norm": 0.396049438484832, + "learning_rate": 0.0001582741036046301, + "loss": 0.6977, + "step": 202 + }, + { + "epoch": 0.3248, + "grad_norm": 0.44389641370159033, + "learning_rate": 0.00015785202867317407, + "loss": 0.8626, + "step": 203 + }, + { + "epoch": 0.3264, + "grad_norm": 0.4138122349178688, + "learning_rate": 0.00015742839895036305, + "loss": 0.784, + "step": 204 + }, + { + "epoch": 0.328, + "grad_norm": 0.39957142803660367, + "learning_rate": 0.00015700322582137827, + "loss": 0.7464, + "step": 205 + }, + { + "epoch": 0.3296, + "grad_norm": 0.39044490610642507, + "learning_rate": 0.0001565765207128805, + "loss": 0.7224, + "step": 206 + }, + { + "epoch": 0.3312, + "grad_norm": 0.423854098617967, + "learning_rate": 0.0001561482950927029, + "loss": 0.7765, + "step": 207 + }, + { + "epoch": 0.3328, + "grad_norm": 0.494937095647527, + "learning_rate": 0.00015571856046954285, + "loss": 0.931, + "step": 208 + }, + { + "epoch": 0.3344, + "grad_norm": 0.43593555821026797, + "learning_rate": 0.00015528732839265272, + "loss": 0.7474, + "step": 209 + }, + { + "epoch": 0.336, + "grad_norm": 0.4180729166268637, + "learning_rate": 0.0001548546104515294, + "loss": 0.7606, + "step": 210 + }, + { + "epoch": 0.3376, + "grad_norm": 0.37792494101350965, + "learning_rate": 0.00015442041827560274, + "loss": 0.7049, + "step": 211 + }, + { + "epoch": 0.3392, + "grad_norm": 0.4258499126283775, + "learning_rate": 0.00015398476353392323, + "loss": 0.7707, + "step": 212 + }, + { + "epoch": 0.3408, + "grad_norm": 0.5362628797091166, + "learning_rate": 0.00015354765793484834, + "loss": 0.7658, + "step": 213 + }, + { + "epoch": 0.3424, + "grad_norm": 0.423541897868339, + "learning_rate": 0.00015310911322572753, + "loss": 0.7839, + "step": 214 + }, + { + "epoch": 0.344, + "grad_norm": 0.4400929463964708, + "learning_rate": 0.000152669141192587, + "loss": 0.8009, + "step": 215 + }, + { + "epoch": 0.3456, + "grad_norm": 0.39823372133247276, + "learning_rate": 0.00015222775365981273, + "loss": 0.7407, + "step": 216 + }, + { + "epoch": 0.3472, + "grad_norm": 0.4012724681493264, + "learning_rate": 0.00015178496248983254, + "loss": 0.8122, + "step": 217 + }, + { + "epoch": 0.3488, + "grad_norm": 0.430027401544907, + "learning_rate": 0.00015134077958279765, + "loss": 0.8377, + "step": 218 + }, + { + "epoch": 0.3504, + "grad_norm": 0.3846442643320667, + "learning_rate": 0.00015089521687626243, + "loss": 0.7156, + "step": 219 + }, + { + "epoch": 0.352, + "grad_norm": 0.41989272346707535, + "learning_rate": 0.000150448286344864, + "loss": 0.7607, + "step": 220 + }, + { + "epoch": 0.3536, + "grad_norm": 0.492464913681311, + "learning_rate": 0.00015000000000000001, + "loss": 0.7787, + "step": 221 + }, + { + "epoch": 0.3552, + "grad_norm": 0.420887246174931, + "learning_rate": 0.00014955036988950618, + "loss": 0.7184, + "step": 222 + }, + { + "epoch": 0.3568, + "grad_norm": 0.3921935792428722, + "learning_rate": 0.00014909940809733222, + "loss": 0.6461, + "step": 223 + }, + { + "epoch": 0.3584, + "grad_norm": 0.43121996065043783, + "learning_rate": 0.00014864712674321734, + "loss": 0.8441, + "step": 224 + }, + { + "epoch": 0.36, + "grad_norm": 0.5256407145540728, + "learning_rate": 0.00014819353798236427, + "loss": 0.8053, + "step": 225 + }, + { + "epoch": 0.3616, + "grad_norm": 0.39102355519678256, + "learning_rate": 0.00014773865400511272, + "loss": 0.7677, + "step": 226 + }, + { + "epoch": 0.3632, + "grad_norm": 0.4450017516918292, + "learning_rate": 0.00014728248703661182, + "loss": 0.8406, + "step": 227 + }, + { + "epoch": 0.3648, + "grad_norm": 0.43401607108771595, + "learning_rate": 0.00014682504933649144, + "loss": 0.7895, + "step": 228 + }, + { + "epoch": 0.3664, + "grad_norm": 0.43172550282134836, + "learning_rate": 0.00014636635319853275, + "loss": 0.7656, + "step": 229 + }, + { + "epoch": 0.368, + "grad_norm": 0.40115710551859224, + "learning_rate": 0.00014590641095033787, + "loss": 0.6914, + "step": 230 + }, + { + "epoch": 0.3696, + "grad_norm": 0.47508107157931734, + "learning_rate": 0.00014544523495299842, + "loss": 0.8685, + "step": 231 + }, + { + "epoch": 0.3712, + "grad_norm": 0.4364658292422192, + "learning_rate": 0.0001449828376007636, + "loss": 0.8461, + "step": 232 + }, + { + "epoch": 0.3728, + "grad_norm": 0.405357155813672, + "learning_rate": 0.0001445192313207067, + "loss": 0.7901, + "step": 233 + }, + { + "epoch": 0.3744, + "grad_norm": 0.4268868031697948, + "learning_rate": 0.0001440544285723915, + "loss": 0.802, + "step": 234 + }, + { + "epoch": 0.376, + "grad_norm": 0.4203661553305176, + "learning_rate": 0.00014358844184753712, + "loss": 0.8016, + "step": 235 + }, + { + "epoch": 0.3776, + "grad_norm": 0.44818778300154055, + "learning_rate": 0.00014312128366968243, + "loss": 0.8305, + "step": 236 + }, + { + "epoch": 0.3792, + "grad_norm": 0.40703274262091965, + "learning_rate": 0.00014265296659384956, + "loss": 0.7762, + "step": 237 + }, + { + "epoch": 0.3808, + "grad_norm": 0.38366165022774773, + "learning_rate": 0.00014218350320620624, + "loss": 0.7233, + "step": 238 + }, + { + "epoch": 0.3824, + "grad_norm": 0.39598038159608206, + "learning_rate": 0.0001417129061237278, + "loss": 0.7415, + "step": 239 + }, + { + "epoch": 0.384, + "grad_norm": 0.3945885969531272, + "learning_rate": 0.00014124118799385796, + "loss": 0.7058, + "step": 240 + }, + { + "epoch": 0.3856, + "grad_norm": 0.43623370184657606, + "learning_rate": 0.00014076836149416887, + "loss": 0.7978, + "step": 241 + }, + { + "epoch": 0.3872, + "grad_norm": 0.4204091213025159, + "learning_rate": 0.0001402944393320206, + "loss": 0.8017, + "step": 242 + }, + { + "epoch": 0.3888, + "grad_norm": 0.5289529913397587, + "learning_rate": 0.00013981943424421932, + "loss": 0.7869, + "step": 243 + }, + { + "epoch": 0.3904, + "grad_norm": 0.39563063038232776, + "learning_rate": 0.00013934335899667527, + "loss": 0.7279, + "step": 244 + }, + { + "epoch": 0.392, + "grad_norm": 0.6646447717718736, + "learning_rate": 0.00013886622638405952, + "loss": 0.7663, + "step": 245 + }, + { + "epoch": 0.3936, + "grad_norm": 0.3436166962308719, + "learning_rate": 0.00013838804922946027, + "loss": 0.6877, + "step": 246 + }, + { + "epoch": 0.3952, + "grad_norm": 0.41497417485524535, + "learning_rate": 0.00013790884038403795, + "loss": 0.7728, + "step": 247 + }, + { + "epoch": 0.3968, + "grad_norm": 0.3940763188554032, + "learning_rate": 0.00013742861272668012, + "loss": 0.7276, + "step": 248 + }, + { + "epoch": 0.3984, + "grad_norm": 0.40170852358208364, + "learning_rate": 0.00013694737916365517, + "loss": 0.7649, + "step": 249 + }, + { + "epoch": 0.4, + "grad_norm": 0.3450250773425443, + "learning_rate": 0.00013646515262826552, + "loss": 0.6869, + "step": 250 + }, + { + "epoch": 0.4016, + "grad_norm": 0.45045967476777526, + "learning_rate": 0.0001359819460805001, + "loss": 0.7924, + "step": 251 + }, + { + "epoch": 0.4032, + "grad_norm": 0.4170294253003158, + "learning_rate": 0.0001354977725066859, + "loss": 0.7832, + "step": 252 + }, + { + "epoch": 0.4048, + "grad_norm": 0.3984311220870121, + "learning_rate": 0.00013501264491913906, + "loss": 0.7684, + "step": 253 + }, + { + "epoch": 0.4064, + "grad_norm": 0.40637574390925585, + "learning_rate": 0.0001345265763558152, + "loss": 0.7922, + "step": 254 + }, + { + "epoch": 0.408, + "grad_norm": 0.4092462090318203, + "learning_rate": 0.00013403957987995882, + "loss": 0.7896, + "step": 255 + }, + { + "epoch": 0.4096, + "grad_norm": 0.4167534061507475, + "learning_rate": 0.0001335516685797525, + "loss": 0.8078, + "step": 256 + }, + { + "epoch": 0.4112, + "grad_norm": 0.3924934029683997, + "learning_rate": 0.00013306285556796495, + "loss": 0.7332, + "step": 257 + }, + { + "epoch": 0.4128, + "grad_norm": 0.39804257535948756, + "learning_rate": 0.00013257315398159864, + "loss": 0.7392, + "step": 258 + }, + { + "epoch": 0.4144, + "grad_norm": 0.39437321700400224, + "learning_rate": 0.00013208257698153677, + "loss": 0.7295, + "step": 259 + }, + { + "epoch": 0.416, + "grad_norm": 0.4130817180305685, + "learning_rate": 0.00013159113775218964, + "loss": 0.7655, + "step": 260 + }, + { + "epoch": 0.4176, + "grad_norm": 0.46532009398453045, + "learning_rate": 0.00013109884950114007, + "loss": 0.7732, + "step": 261 + }, + { + "epoch": 0.4192, + "grad_norm": 0.4421018210383502, + "learning_rate": 0.00013060572545878875, + "loss": 0.7174, + "step": 262 + }, + { + "epoch": 0.4208, + "grad_norm": 0.4325475151037246, + "learning_rate": 0.00013011177887799845, + "loss": 0.8001, + "step": 263 + }, + { + "epoch": 0.4224, + "grad_norm": 0.3793179799376978, + "learning_rate": 0.00012961702303373795, + "loss": 0.6803, + "step": 264 + }, + { + "epoch": 0.424, + "grad_norm": 0.39957002001596764, + "learning_rate": 0.00012912147122272523, + "loss": 0.7373, + "step": 265 + }, + { + "epoch": 0.4256, + "grad_norm": 0.49396432276637636, + "learning_rate": 0.00012862513676307008, + "loss": 0.7904, + "step": 266 + }, + { + "epoch": 0.4272, + "grad_norm": 0.3966663161210732, + "learning_rate": 0.00012812803299391628, + "loss": 0.7132, + "step": 267 + }, + { + "epoch": 0.4288, + "grad_norm": 0.33985759240020935, + "learning_rate": 0.00012763017327508305, + "loss": 0.6269, + "step": 268 + }, + { + "epoch": 0.4304, + "grad_norm": 0.463495162503486, + "learning_rate": 0.0001271315709867059, + "loss": 0.8521, + "step": 269 + }, + { + "epoch": 0.432, + "grad_norm": 0.3864811707219409, + "learning_rate": 0.00012663223952887723, + "loss": 0.7731, + "step": 270 + }, + { + "epoch": 0.4336, + "grad_norm": 0.35898489171148906, + "learning_rate": 0.00012613219232128608, + "loss": 0.7166, + "step": 271 + }, + { + "epoch": 0.4352, + "grad_norm": 0.3830649830003527, + "learning_rate": 0.00012563144280285741, + "loss": 0.7008, + "step": 272 + }, + { + "epoch": 0.4368, + "grad_norm": 0.39218740129801105, + "learning_rate": 0.00012513000443139112, + "loss": 0.8066, + "step": 273 + }, + { + "epoch": 0.4384, + "grad_norm": 0.4545210140307896, + "learning_rate": 0.00012462789068320017, + "loss": 0.8088, + "step": 274 + }, + { + "epoch": 0.44, + "grad_norm": 0.4001646467869731, + "learning_rate": 0.00012412511505274844, + "loss": 0.7583, + "step": 275 + }, + { + "epoch": 0.4416, + "grad_norm": 0.501760698783621, + "learning_rate": 0.00012362169105228826, + "loss": 0.8184, + "step": 276 + }, + { + "epoch": 0.4432, + "grad_norm": 0.41495984936402913, + "learning_rate": 0.000123117632211497, + "loss": 0.7502, + "step": 277 + }, + { + "epoch": 0.4448, + "grad_norm": 0.4429571974157355, + "learning_rate": 0.00012261295207711346, + "loss": 0.7889, + "step": 278 + }, + { + "epoch": 0.4464, + "grad_norm": 0.35297438425001587, + "learning_rate": 0.0001221076642125742, + "loss": 0.7223, + "step": 279 + }, + { + "epoch": 0.448, + "grad_norm": 0.3853526802627918, + "learning_rate": 0.00012160178219764837, + "loss": 0.7349, + "step": 280 + }, + { + "epoch": 0.4496, + "grad_norm": 0.3959742690518756, + "learning_rate": 0.00012109531962807332, + "loss": 0.7553, + "step": 281 + }, + { + "epoch": 0.4512, + "grad_norm": 0.36840329577987396, + "learning_rate": 0.00012058829011518896, + "loss": 0.7237, + "step": 282 + }, + { + "epoch": 0.4528, + "grad_norm": 0.4147780404717559, + "learning_rate": 0.00012008070728557186, + "loss": 0.7562, + "step": 283 + }, + { + "epoch": 0.4544, + "grad_norm": 0.3225832480362139, + "learning_rate": 0.00011957258478066931, + "loss": 0.6579, + "step": 284 + }, + { + "epoch": 0.456, + "grad_norm": 0.4295353448094699, + "learning_rate": 0.00011906393625643244, + "loss": 0.7615, + "step": 285 + }, + { + "epoch": 0.4576, + "grad_norm": 0.3775483048845946, + "learning_rate": 0.00011855477538294935, + "loss": 0.6848, + "step": 286 + }, + { + "epoch": 0.4592, + "grad_norm": 0.4142142075691693, + "learning_rate": 0.00011804511584407763, + "loss": 0.7828, + "step": 287 + }, + { + "epoch": 0.4608, + "grad_norm": 0.5353698126363954, + "learning_rate": 0.00011753497133707679, + "loss": 0.8546, + "step": 288 + }, + { + "epoch": 0.4624, + "grad_norm": 0.3709076345312252, + "learning_rate": 0.00011702435557223987, + "loss": 0.7884, + "step": 289 + }, + { + "epoch": 0.464, + "grad_norm": 0.38094439266936675, + "learning_rate": 0.00011651328227252517, + "loss": 0.8052, + "step": 290 + }, + { + "epoch": 0.4656, + "grad_norm": 0.3648239047897685, + "learning_rate": 0.00011600176517318741, + "loss": 0.6778, + "step": 291 + }, + { + "epoch": 0.4672, + "grad_norm": 0.37621723942302054, + "learning_rate": 0.00011548981802140848, + "loss": 0.6868, + "step": 292 + }, + { + "epoch": 0.4688, + "grad_norm": 0.37294335429933, + "learning_rate": 0.00011497745457592816, + "loss": 0.6813, + "step": 293 + }, + { + "epoch": 0.4704, + "grad_norm": 0.3947357539347734, + "learning_rate": 0.00011446468860667421, + "loss": 0.776, + "step": 294 + }, + { + "epoch": 0.472, + "grad_norm": 0.42250703638121173, + "learning_rate": 0.00011395153389439233, + "loss": 0.7903, + "step": 295 + }, + { + "epoch": 0.4736, + "grad_norm": 0.3653307401093366, + "learning_rate": 0.00011343800423027582, + "loss": 0.7897, + "step": 296 + }, + { + "epoch": 0.4752, + "grad_norm": 0.43225090802331745, + "learning_rate": 0.0001129241134155949, + "loss": 0.7897, + "step": 297 + }, + { + "epoch": 0.4768, + "grad_norm": 0.35987063668403163, + "learning_rate": 0.00011240987526132594, + "loss": 0.7095, + "step": 298 + }, + { + "epoch": 0.4784, + "grad_norm": 0.3703943344110747, + "learning_rate": 0.00011189530358778005, + "loss": 0.7044, + "step": 299 + }, + { + "epoch": 0.48, + "grad_norm": 0.39396779805761384, + "learning_rate": 0.00011138041222423177, + "loss": 0.7138, + "step": 300 + }, + { + "epoch": 0.4816, + "grad_norm": 0.3669713436401558, + "learning_rate": 0.00011086521500854745, + "loss": 0.6887, + "step": 301 + }, + { + "epoch": 0.4832, + "grad_norm": 0.4115968677666093, + "learning_rate": 0.00011034972578681338, + "loss": 0.7336, + "step": 302 + }, + { + "epoch": 0.4848, + "grad_norm": 0.4053731358187222, + "learning_rate": 0.00010983395841296348, + "loss": 0.7511, + "step": 303 + }, + { + "epoch": 0.4864, + "grad_norm": 0.383167190850852, + "learning_rate": 0.00010931792674840718, + "loss": 0.7879, + "step": 304 + }, + { + "epoch": 0.488, + "grad_norm": 0.44544583953871886, + "learning_rate": 0.00010880164466165674, + "loss": 0.7624, + "step": 305 + }, + { + "epoch": 0.4896, + "grad_norm": 0.38762096582354316, + "learning_rate": 0.00010828512602795462, + "loss": 0.7064, + "step": 306 + }, + { + "epoch": 0.4912, + "grad_norm": 0.46154447729862186, + "learning_rate": 0.00010776838472890065, + "loss": 0.7773, + "step": 307 + }, + { + "epoch": 0.4928, + "grad_norm": 0.3667986654815652, + "learning_rate": 0.00010725143465207867, + "loss": 0.7164, + "step": 308 + }, + { + "epoch": 0.4944, + "grad_norm": 0.39421575859433117, + "learning_rate": 0.00010673428969068364, + "loss": 0.7849, + "step": 309 + }, + { + "epoch": 0.496, + "grad_norm": 0.39348479958744115, + "learning_rate": 0.00010621696374314807, + "loss": 0.7057, + "step": 310 + }, + { + "epoch": 0.4976, + "grad_norm": 0.3345782623398654, + "learning_rate": 0.00010569947071276847, + "loss": 0.6458, + "step": 311 + }, + { + "epoch": 0.4992, + "grad_norm": 0.38077963460395853, + "learning_rate": 0.00010518182450733186, + "loss": 0.7538, + "step": 312 + }, + { + "epoch": 0.5008, + "grad_norm": 0.39380524756070634, + "learning_rate": 0.00010466403903874176, + "loss": 0.7337, + "step": 313 + }, + { + "epoch": 0.5024, + "grad_norm": 0.4027945367600016, + "learning_rate": 0.00010414612822264455, + "loss": 0.7876, + "step": 314 + }, + { + "epoch": 0.504, + "grad_norm": 0.36455482252741356, + "learning_rate": 0.00010362810597805526, + "loss": 0.7172, + "step": 315 + }, + { + "epoch": 0.5056, + "grad_norm": 0.3935296430183471, + "learning_rate": 0.0001031099862269837, + "loss": 0.7782, + "step": 316 + }, + { + "epoch": 0.5072, + "grad_norm": 0.4290128530986274, + "learning_rate": 0.00010259178289406011, + "loss": 0.7033, + "step": 317 + }, + { + "epoch": 0.5088, + "grad_norm": 0.43106936236417626, + "learning_rate": 0.00010207350990616107, + "loss": 0.8015, + "step": 318 + }, + { + "epoch": 0.5104, + "grad_norm": 0.3895271346888774, + "learning_rate": 0.0001015551811920351, + "loss": 0.7489, + "step": 319 + }, + { + "epoch": 0.512, + "grad_norm": 0.4430065252654003, + "learning_rate": 0.00010103681068192845, + "loss": 0.8307, + "step": 320 + }, + { + "epoch": 0.5136, + "grad_norm": 0.4128390845399857, + "learning_rate": 0.00010051841230721065, + "loss": 0.7193, + "step": 321 + }, + { + "epoch": 0.5152, + "grad_norm": 0.49443405778362715, + "learning_rate": 0.0001, + "loss": 0.8562, + "step": 322 + }, + { + "epoch": 0.5168, + "grad_norm": 0.3593474485537418, + "learning_rate": 9.948158769278939e-05, + "loss": 0.7087, + "step": 323 + }, + { + "epoch": 0.5184, + "grad_norm": 0.40412028320168725, + "learning_rate": 9.896318931807155e-05, + "loss": 0.753, + "step": 324 + }, + { + "epoch": 0.52, + "grad_norm": 0.3951795079899187, + "learning_rate": 9.844481880796491e-05, + "loss": 0.7211, + "step": 325 + }, + { + "epoch": 0.5216, + "grad_norm": 0.3367446433383703, + "learning_rate": 9.792649009383899e-05, + "loss": 0.6938, + "step": 326 + }, + { + "epoch": 0.5232, + "grad_norm": 0.4841404000315512, + "learning_rate": 9.740821710593989e-05, + "loss": 0.8777, + "step": 327 + }, + { + "epoch": 0.5248, + "grad_norm": 0.3851192893210263, + "learning_rate": 9.689001377301633e-05, + "loss": 0.7634, + "step": 328 + }, + { + "epoch": 0.5264, + "grad_norm": 0.5704732313833488, + "learning_rate": 9.637189402194476e-05, + "loss": 0.7353, + "step": 329 + }, + { + "epoch": 0.528, + "grad_norm": 0.3806526251171886, + "learning_rate": 9.585387177735547e-05, + "loss": 0.7154, + "step": 330 + }, + { + "epoch": 0.5296, + "grad_norm": 0.3846988731990761, + "learning_rate": 9.533596096125825e-05, + "loss": 0.7406, + "step": 331 + }, + { + "epoch": 0.5312, + "grad_norm": 0.4010843355156052, + "learning_rate": 9.481817549266817e-05, + "loss": 0.8044, + "step": 332 + }, + { + "epoch": 0.5328, + "grad_norm": 0.4117946143014912, + "learning_rate": 9.430052928723153e-05, + "loss": 0.7466, + "step": 333 + }, + { + "epoch": 0.5344, + "grad_norm": 0.3524936105268302, + "learning_rate": 9.378303625685195e-05, + "loss": 0.6818, + "step": 334 + }, + { + "epoch": 0.536, + "grad_norm": 0.38073387417606747, + "learning_rate": 9.326571030931637e-05, + "loss": 0.6732, + "step": 335 + }, + { + "epoch": 0.5376, + "grad_norm": 0.4046186313227552, + "learning_rate": 9.274856534792138e-05, + "loss": 0.7236, + "step": 336 + }, + { + "epoch": 0.5392, + "grad_norm": 0.36007367962423836, + "learning_rate": 9.223161527109937e-05, + "loss": 0.6734, + "step": 337 + }, + { + "epoch": 0.5408, + "grad_norm": 0.37557697379206506, + "learning_rate": 9.171487397204539e-05, + "loss": 0.7367, + "step": 338 + }, + { + "epoch": 0.5424, + "grad_norm": 0.44575227355425057, + "learning_rate": 9.119835533834331e-05, + "loss": 0.7971, + "step": 339 + }, + { + "epoch": 0.544, + "grad_norm": 0.3982433187654024, + "learning_rate": 9.068207325159284e-05, + "loss": 0.7411, + "step": 340 + }, + { + "epoch": 0.5456, + "grad_norm": 0.33802865927334874, + "learning_rate": 9.016604158703654e-05, + "loss": 0.6448, + "step": 341 + }, + { + "epoch": 0.5472, + "grad_norm": 0.5241892433461196, + "learning_rate": 8.965027421318665e-05, + "loss": 0.7203, + "step": 342 + }, + { + "epoch": 0.5488, + "grad_norm": 0.3795284951339069, + "learning_rate": 8.913478499145254e-05, + "loss": 0.7399, + "step": 343 + }, + { + "epoch": 0.5504, + "grad_norm": 0.37754983890899113, + "learning_rate": 8.861958777576827e-05, + "loss": 0.7293, + "step": 344 + }, + { + "epoch": 0.552, + "grad_norm": 0.3811119514810814, + "learning_rate": 8.810469641222001e-05, + "loss": 0.7078, + "step": 345 + }, + { + "epoch": 0.5536, + "grad_norm": 0.4272118304347965, + "learning_rate": 8.759012473867407e-05, + "loss": 0.7358, + "step": 346 + }, + { + "epoch": 0.5552, + "grad_norm": 0.3527172570488226, + "learning_rate": 8.707588658440511e-05, + "loss": 0.7124, + "step": 347 + }, + { + "epoch": 0.5568, + "grad_norm": 0.41975296557179936, + "learning_rate": 8.656199576972423e-05, + "loss": 0.814, + "step": 348 + }, + { + "epoch": 0.5584, + "grad_norm": 0.4366513397993346, + "learning_rate": 8.604846610560771e-05, + "loss": 0.7753, + "step": 349 + }, + { + "epoch": 0.56, + "grad_norm": 0.36646782412326345, + "learning_rate": 8.553531139332582e-05, + "loss": 0.7218, + "step": 350 + }, + { + "epoch": 0.5616, + "grad_norm": 0.36411771714130664, + "learning_rate": 8.502254542407186e-05, + "loss": 0.7333, + "step": 351 + }, + { + "epoch": 0.5632, + "grad_norm": 0.4171863450474228, + "learning_rate": 8.451018197859153e-05, + "loss": 0.7143, + "step": 352 + }, + { + "epoch": 0.5648, + "grad_norm": 0.4589408752697243, + "learning_rate": 8.399823482681262e-05, + "loss": 0.779, + "step": 353 + }, + { + "epoch": 0.5664, + "grad_norm": 0.50266819165461, + "learning_rate": 8.348671772747487e-05, + "loss": 0.7659, + "step": 354 + }, + { + "epoch": 0.568, + "grad_norm": 0.38642188183332127, + "learning_rate": 8.297564442776014e-05, + "loss": 0.7192, + "step": 355 + }, + { + "epoch": 0.5696, + "grad_norm": 0.41730490029421946, + "learning_rate": 8.246502866292324e-05, + "loss": 0.7533, + "step": 356 + }, + { + "epoch": 0.5712, + "grad_norm": 0.39415503852998696, + "learning_rate": 8.195488415592238e-05, + "loss": 0.7477, + "step": 357 + }, + { + "epoch": 0.5728, + "grad_norm": 0.3861078446656998, + "learning_rate": 8.144522461705067e-05, + "loss": 0.7181, + "step": 358 + }, + { + "epoch": 0.5744, + "grad_norm": 0.3831644260946984, + "learning_rate": 8.093606374356759e-05, + "loss": 0.7182, + "step": 359 + }, + { + "epoch": 0.576, + "grad_norm": 0.39034010397086755, + "learning_rate": 8.042741521933071e-05, + "loss": 0.7445, + "step": 360 + }, + { + "epoch": 0.5776, + "grad_norm": 0.33999132167303014, + "learning_rate": 7.991929271442817e-05, + "loss": 0.6693, + "step": 361 + }, + { + "epoch": 0.5792, + "grad_norm": 0.37322174347140913, + "learning_rate": 7.941170988481108e-05, + "loss": 0.7158, + "step": 362 + }, + { + "epoch": 0.5808, + "grad_norm": 0.46321417604021015, + "learning_rate": 7.89046803719267e-05, + "loss": 0.7518, + "step": 363 + }, + { + "epoch": 0.5824, + "grad_norm": 0.42936758323938373, + "learning_rate": 7.839821780235168e-05, + "loss": 0.8485, + "step": 364 + }, + { + "epoch": 0.584, + "grad_norm": 0.3753215628295166, + "learning_rate": 7.789233578742582e-05, + "loss": 0.7021, + "step": 365 + }, + { + "epoch": 0.5856, + "grad_norm": 0.35094014140180074, + "learning_rate": 7.738704792288655e-05, + "loss": 0.6345, + "step": 366 + }, + { + "epoch": 0.5872, + "grad_norm": 0.431186522618548, + "learning_rate": 7.688236778850306e-05, + "loss": 0.7225, + "step": 367 + }, + { + "epoch": 0.5888, + "grad_norm": 0.37720378959640954, + "learning_rate": 7.637830894771175e-05, + "loss": 0.6884, + "step": 368 + }, + { + "epoch": 0.5904, + "grad_norm": 0.3905684871067712, + "learning_rate": 7.587488494725157e-05, + "loss": 0.741, + "step": 369 + }, + { + "epoch": 0.592, + "grad_norm": 0.5132491011476759, + "learning_rate": 7.537210931679987e-05, + "loss": 0.836, + "step": 370 + }, + { + "epoch": 0.5936, + "grad_norm": 0.47951055792597375, + "learning_rate": 7.48699955686089e-05, + "loss": 0.7884, + "step": 371 + }, + { + "epoch": 0.5952, + "grad_norm": 0.43948102408392065, + "learning_rate": 7.43685571971426e-05, + "loss": 0.8311, + "step": 372 + }, + { + "epoch": 0.5968, + "grad_norm": 0.48257264189097065, + "learning_rate": 7.386780767871397e-05, + "loss": 0.8758, + "step": 373 + }, + { + "epoch": 0.5984, + "grad_norm": 0.4447869671042225, + "learning_rate": 7.336776047112276e-05, + "loss": 0.7378, + "step": 374 + }, + { + "epoch": 0.6, + "grad_norm": 0.39888307027522596, + "learning_rate": 7.286842901329412e-05, + "loss": 0.7463, + "step": 375 + }, + { + "epoch": 0.6016, + "grad_norm": 0.35485033544439026, + "learning_rate": 7.236982672491698e-05, + "loss": 0.709, + "step": 376 + }, + { + "epoch": 0.6032, + "grad_norm": 0.435385292631864, + "learning_rate": 7.187196700608373e-05, + "loss": 0.856, + "step": 377 + }, + { + "epoch": 0.6048, + "grad_norm": 0.3654057961572283, + "learning_rate": 7.137486323692995e-05, + "loss": 0.7277, + "step": 378 + }, + { + "epoch": 0.6064, + "grad_norm": 0.3512050443870654, + "learning_rate": 7.087852877727481e-05, + "loss": 0.6231, + "step": 379 + }, + { + "epoch": 0.608, + "grad_norm": 0.3631995693895185, + "learning_rate": 7.038297696626206e-05, + "loss": 0.7405, + "step": 380 + }, + { + "epoch": 0.6096, + "grad_norm": 0.38125792459638624, + "learning_rate": 6.988822112200156e-05, + "loss": 0.7495, + "step": 381 + }, + { + "epoch": 0.6112, + "grad_norm": 0.37284718216829515, + "learning_rate": 6.939427454121128e-05, + "loss": 0.7397, + "step": 382 + }, + { + "epoch": 0.6128, + "grad_norm": 0.41853111533941406, + "learning_rate": 6.890115049885994e-05, + "loss": 0.8257, + "step": 383 + }, + { + "epoch": 0.6144, + "grad_norm": 0.42212708864466325, + "learning_rate": 6.84088622478104e-05, + "loss": 0.7621, + "step": 384 + }, + { + "epoch": 0.616, + "grad_norm": 0.4149609259978424, + "learning_rate": 6.791742301846326e-05, + "loss": 0.6828, + "step": 385 + }, + { + "epoch": 0.6176, + "grad_norm": 0.3591701977975191, + "learning_rate": 6.742684601840141e-05, + "loss": 0.6771, + "step": 386 + }, + { + "epoch": 0.6192, + "grad_norm": 0.4462034755358388, + "learning_rate": 6.693714443203507e-05, + "loss": 0.784, + "step": 387 + }, + { + "epoch": 0.6208, + "grad_norm": 0.4389177547070431, + "learning_rate": 6.644833142024751e-05, + "loss": 0.8281, + "step": 388 + }, + { + "epoch": 0.6224, + "grad_norm": 0.39498352550620824, + "learning_rate": 6.59604201200412e-05, + "loss": 0.7069, + "step": 389 + }, + { + "epoch": 0.624, + "grad_norm": 0.3902459879971352, + "learning_rate": 6.547342364418481e-05, + "loss": 0.7696, + "step": 390 + }, + { + "epoch": 0.6256, + "grad_norm": 0.3841269188816985, + "learning_rate": 6.498735508086093e-05, + "loss": 0.7026, + "step": 391 + }, + { + "epoch": 0.6272, + "grad_norm": 0.38501668513554343, + "learning_rate": 6.450222749331414e-05, + "loss": 0.7246, + "step": 392 + }, + { + "epoch": 0.6288, + "grad_norm": 0.42015739538850205, + "learning_rate": 6.40180539194999e-05, + "loss": 0.7774, + "step": 393 + }, + { + "epoch": 0.6304, + "grad_norm": 0.36397079116853126, + "learning_rate": 6.35348473717345e-05, + "loss": 0.6882, + "step": 394 + }, + { + "epoch": 0.632, + "grad_norm": 0.39594664298516524, + "learning_rate": 6.305262083634488e-05, + "loss": 0.7239, + "step": 395 + }, + { + "epoch": 0.6336, + "grad_norm": 0.39402405299278287, + "learning_rate": 6.25713872733199e-05, + "loss": 0.76, + "step": 396 + }, + { + "epoch": 0.6352, + "grad_norm": 0.3908592872704309, + "learning_rate": 6.209115961596208e-05, + "loss": 0.7243, + "step": 397 + }, + { + "epoch": 0.6368, + "grad_norm": 0.3859405443289071, + "learning_rate": 6.161195077053976e-05, + "loss": 0.7062, + "step": 398 + }, + { + "epoch": 0.6384, + "grad_norm": 0.4165749502624558, + "learning_rate": 6.113377361594049e-05, + "loss": 0.7745, + "step": 399 + }, + { + "epoch": 0.64, + "grad_norm": 0.37482765096839027, + "learning_rate": 6.065664100332478e-05, + "loss": 0.7442, + "step": 400 + }, + { + "epoch": 0.6416, + "grad_norm": 0.3460701837878398, + "learning_rate": 6.018056575578075e-05, + "loss": 0.7042, + "step": 401 + }, + { + "epoch": 0.6432, + "grad_norm": 0.38972027239062884, + "learning_rate": 5.970556066797941e-05, + "loss": 0.7041, + "step": 402 + }, + { + "epoch": 0.6448, + "grad_norm": 0.34049194096743063, + "learning_rate": 5.923163850583113e-05, + "loss": 0.6731, + "step": 403 + }, + { + "epoch": 0.6464, + "grad_norm": 0.36566354590299033, + "learning_rate": 5.875881200614207e-05, + "loss": 0.6411, + "step": 404 + }, + { + "epoch": 0.648, + "grad_norm": 0.36042562161243635, + "learning_rate": 5.828709387627218e-05, + "loss": 0.7209, + "step": 405 + }, + { + "epoch": 0.6496, + "grad_norm": 0.397309512992513, + "learning_rate": 5.781649679379378e-05, + "loss": 0.6715, + "step": 406 + }, + { + "epoch": 0.6512, + "grad_norm": 0.40629926491617097, + "learning_rate": 5.73470334061505e-05, + "loss": 0.7581, + "step": 407 + }, + { + "epoch": 0.6528, + "grad_norm": 0.38356606375229335, + "learning_rate": 5.687871633031754e-05, + "loss": 0.7244, + "step": 408 + }, + { + "epoch": 0.6544, + "grad_norm": 0.4152544313073025, + "learning_rate": 5.6411558152462894e-05, + "loss": 0.7651, + "step": 409 + }, + { + "epoch": 0.656, + "grad_norm": 0.3758133525999123, + "learning_rate": 5.5945571427608526e-05, + "loss": 0.7082, + "step": 410 + }, + { + "epoch": 0.6576, + "grad_norm": 0.3816846505519979, + "learning_rate": 5.54807686792933e-05, + "loss": 0.7805, + "step": 411 + }, + { + "epoch": 0.6592, + "grad_norm": 0.3965486514163321, + "learning_rate": 5.501716239923642e-05, + "loss": 0.717, + "step": 412 + }, + { + "epoch": 0.6608, + "grad_norm": 0.33240408453613673, + "learning_rate": 5.4554765047001613e-05, + "loss": 0.6401, + "step": 413 + }, + { + "epoch": 0.6624, + "grad_norm": 0.4175149020063599, + "learning_rate": 5.4093589049662175e-05, + "loss": 0.7771, + "step": 414 + }, + { + "epoch": 0.664, + "grad_norm": 0.3732170038705519, + "learning_rate": 5.363364680146725e-05, + "loss": 0.7214, + "step": 415 + }, + { + "epoch": 0.6656, + "grad_norm": 0.4419458585067435, + "learning_rate": 5.31749506635086e-05, + "loss": 0.7804, + "step": 416 + }, + { + "epoch": 0.6672, + "grad_norm": 0.41353646131130006, + "learning_rate": 5.271751296338823e-05, + "loss": 0.7175, + "step": 417 + }, + { + "epoch": 0.6688, + "grad_norm": 0.40766063788271767, + "learning_rate": 5.226134599488728e-05, + "loss": 0.7859, + "step": 418 + }, + { + "epoch": 0.6704, + "grad_norm": 0.35690262967116304, + "learning_rate": 5.180646201763577e-05, + "loss": 0.6778, + "step": 419 + }, + { + "epoch": 0.672, + "grad_norm": 0.3882922737844192, + "learning_rate": 5.135287325678271e-05, + "loss": 0.6904, + "step": 420 + }, + { + "epoch": 0.6736, + "grad_norm": 0.3275958128104016, + "learning_rate": 5.090059190266779e-05, + "loss": 0.6574, + "step": 421 + }, + { + "epoch": 0.6752, + "grad_norm": 0.3716514794615876, + "learning_rate": 5.0449630110493836e-05, + "loss": 0.6945, + "step": 422 + }, + { + "epoch": 0.6768, + "grad_norm": 0.39215917011327645, + "learning_rate": 5.000000000000002e-05, + "loss": 0.6868, + "step": 423 + }, + { + "epoch": 0.6784, + "grad_norm": 0.4176689926298356, + "learning_rate": 4.955171365513603e-05, + "loss": 0.7971, + "step": 424 + }, + { + "epoch": 0.68, + "grad_norm": 0.35351206530878865, + "learning_rate": 4.9104783123737566e-05, + "loss": 0.695, + "step": 425 + }, + { + "epoch": 0.6816, + "grad_norm": 0.3960789782385614, + "learning_rate": 4.865922041720239e-05, + "loss": 0.7052, + "step": 426 + }, + { + "epoch": 0.6832, + "grad_norm": 0.35111156694312495, + "learning_rate": 4.821503751016746e-05, + "loss": 0.6548, + "step": 427 + }, + { + "epoch": 0.6848, + "grad_norm": 0.39265391171644753, + "learning_rate": 4.777224634018732e-05, + "loss": 0.7084, + "step": 428 + }, + { + "epoch": 0.6864, + "grad_norm": 0.42802899420975543, + "learning_rate": 4.733085880741301e-05, + "loss": 0.7307, + "step": 429 + }, + { + "epoch": 0.688, + "grad_norm": 0.40932452653354834, + "learning_rate": 4.689088677427249e-05, + "loss": 0.7581, + "step": 430 + }, + { + "epoch": 0.6896, + "grad_norm": 0.3682883000354338, + "learning_rate": 4.645234206515171e-05, + "loss": 0.7211, + "step": 431 + }, + { + "epoch": 0.6912, + "grad_norm": 0.36143839488425517, + "learning_rate": 4.6015236466076747e-05, + "loss": 0.6962, + "step": 432 + }, + { + "epoch": 0.6928, + "grad_norm": 0.407685444205195, + "learning_rate": 4.5579581724397255e-05, + "loss": 0.7794, + "step": 433 + }, + { + "epoch": 0.6944, + "grad_norm": 0.3914127433539138, + "learning_rate": 4.514538954847064e-05, + "loss": 0.7232, + "step": 434 + }, + { + "epoch": 0.696, + "grad_norm": 0.32812967570686413, + "learning_rate": 4.471267160734731e-05, + "loss": 0.6575, + "step": 435 + }, + { + "epoch": 0.6976, + "grad_norm": 0.36512818943137537, + "learning_rate": 4.428143953045717e-05, + "loss": 0.7275, + "step": 436 + }, + { + "epoch": 0.6992, + "grad_norm": 0.39077515220878783, + "learning_rate": 4.385170490729712e-05, + "loss": 0.7408, + "step": 437 + }, + { + "epoch": 0.7008, + "grad_norm": 0.3697446229432021, + "learning_rate": 4.342347928711953e-05, + "loss": 0.7028, + "step": 438 + }, + { + "epoch": 0.7024, + "grad_norm": 0.38193183440104617, + "learning_rate": 4.2996774178621736e-05, + "loss": 0.6518, + "step": 439 + }, + { + "epoch": 0.704, + "grad_norm": 0.41531624959307795, + "learning_rate": 4.257160104963696e-05, + "loss": 0.742, + "step": 440 + }, + { + "epoch": 0.7056, + "grad_norm": 0.41139871977395037, + "learning_rate": 4.2147971326825966e-05, + "loss": 0.7581, + "step": 441 + }, + { + "epoch": 0.7072, + "grad_norm": 0.38161584010920957, + "learning_rate": 4.172589639536991e-05, + "loss": 0.7364, + "step": 442 + }, + { + "epoch": 0.7088, + "grad_norm": 0.3749220941999862, + "learning_rate": 4.130538759866457e-05, + "loss": 0.7079, + "step": 443 + }, + { + "epoch": 0.7104, + "grad_norm": 0.3777814657377101, + "learning_rate": 4.088645623801534e-05, + "loss": 0.7123, + "step": 444 + }, + { + "epoch": 0.712, + "grad_norm": 0.35130940006502914, + "learning_rate": 4.046911357233343e-05, + "loss": 0.6171, + "step": 445 + }, + { + "epoch": 0.7136, + "grad_norm": 0.4012010774522812, + "learning_rate": 4.00533708178334e-05, + "loss": 0.7606, + "step": 446 + }, + { + "epoch": 0.7152, + "grad_norm": 0.3717235290661236, + "learning_rate": 3.963923914773187e-05, + "loss": 0.7096, + "step": 447 + }, + { + "epoch": 0.7168, + "grad_norm": 0.4134247538055933, + "learning_rate": 3.922672969194686e-05, + "loss": 0.7233, + "step": 448 + }, + { + "epoch": 0.7184, + "grad_norm": 0.4055302874678319, + "learning_rate": 3.8815853536798904e-05, + "loss": 0.7306, + "step": 449 + }, + { + "epoch": 0.72, + "grad_norm": 0.3944977507504851, + "learning_rate": 3.840662172471315e-05, + "loss": 0.6487, + "step": 450 + }, + { + "epoch": 0.7216, + "grad_norm": 0.34873206606474894, + "learning_rate": 3.79990452539225e-05, + "loss": 0.7345, + "step": 451 + }, + { + "epoch": 0.7232, + "grad_norm": 0.39267676884304525, + "learning_rate": 3.759313507817196e-05, + "loss": 0.7755, + "step": 452 + }, + { + "epoch": 0.7248, + "grad_norm": 0.4053108315648405, + "learning_rate": 3.7188902106424416e-05, + "loss": 0.7137, + "step": 453 + }, + { + "epoch": 0.7264, + "grad_norm": 0.4023351316038286, + "learning_rate": 3.678635720256737e-05, + "loss": 0.8173, + "step": 454 + }, + { + "epoch": 0.728, + "grad_norm": 0.4093808107576682, + "learning_rate": 3.638551118512089e-05, + "loss": 0.7122, + "step": 455 + }, + { + "epoch": 0.7296, + "grad_norm": 0.4491558034357714, + "learning_rate": 3.5986374826947066e-05, + "loss": 0.7879, + "step": 456 + }, + { + "epoch": 0.7312, + "grad_norm": 0.3617404389519696, + "learning_rate": 3.558895885496023e-05, + "loss": 0.7268, + "step": 457 + }, + { + "epoch": 0.7328, + "grad_norm": 0.42799834376001655, + "learning_rate": 3.519327394983888e-05, + "loss": 0.8069, + "step": 458 + }, + { + "epoch": 0.7344, + "grad_norm": 0.3510121740629392, + "learning_rate": 3.479933074573858e-05, + "loss": 0.6819, + "step": 459 + }, + { + "epoch": 0.736, + "grad_norm": 0.45624590684608124, + "learning_rate": 3.440713983000601e-05, + "loss": 0.7434, + "step": 460 + }, + { + "epoch": 0.7376, + "grad_norm": 0.3698458004495236, + "learning_rate": 3.401671174289469e-05, + "loss": 0.6181, + "step": 461 + }, + { + "epoch": 0.7392, + "grad_norm": 0.37472419134033885, + "learning_rate": 3.362805697728145e-05, + "loss": 0.7097, + "step": 462 + }, + { + "epoch": 0.7408, + "grad_norm": 0.3993404575907408, + "learning_rate": 3.324118597838464e-05, + "loss": 0.7013, + "step": 463 + }, + { + "epoch": 0.7424, + "grad_norm": 0.3269754679799917, + "learning_rate": 3.285610914348332e-05, + "loss": 0.6265, + "step": 464 + }, + { + "epoch": 0.744, + "grad_norm": 0.3800469959383693, + "learning_rate": 3.2472836821637744e-05, + "loss": 0.7103, + "step": 465 + }, + { + "epoch": 0.7456, + "grad_norm": 0.3695595873693324, + "learning_rate": 3.209137931341143e-05, + "loss": 0.7032, + "step": 466 + }, + { + "epoch": 0.7472, + "grad_norm": 0.39376261966416126, + "learning_rate": 3.1711746870594086e-05, + "loss": 0.7354, + "step": 467 + }, + { + "epoch": 0.7488, + "grad_norm": 0.3801514420197968, + "learning_rate": 3.1333949695926324e-05, + "loss": 0.7268, + "step": 468 + }, + { + "epoch": 0.7504, + "grad_norm": 0.35377429778077096, + "learning_rate": 3.0957997942825336e-05, + "loss": 0.727, + "step": 469 + }, + { + "epoch": 0.752, + "grad_norm": 0.3426628136280171, + "learning_rate": 3.058390171511196e-05, + "loss": 0.684, + "step": 470 + }, + { + "epoch": 0.7536, + "grad_norm": 0.41349331661824823, + "learning_rate": 3.021167106673928e-05, + "loss": 0.7034, + "step": 471 + }, + { + "epoch": 0.7552, + "grad_norm": 0.42756822660149474, + "learning_rate": 2.9841316001522347e-05, + "loss": 0.7443, + "step": 472 + }, + { + "epoch": 0.7568, + "grad_norm": 0.38368861663048015, + "learning_rate": 2.9472846472869298e-05, + "loss": 0.6673, + "step": 473 + }, + { + "epoch": 0.7584, + "grad_norm": 0.4036097214403057, + "learning_rate": 2.9106272383513835e-05, + "loss": 0.7114, + "step": 474 + }, + { + "epoch": 0.76, + "grad_norm": 0.37126779143968575, + "learning_rate": 2.874160358524931e-05, + "loss": 0.725, + "step": 475 + }, + { + "epoch": 0.7616, + "grad_norm": 0.42227580118022057, + "learning_rate": 2.8378849878663628e-05, + "loss": 0.7536, + "step": 476 + }, + { + "epoch": 0.7632, + "grad_norm": 0.3589848733378133, + "learning_rate": 2.8018021012875994e-05, + "loss": 0.6327, + "step": 477 + }, + { + "epoch": 0.7648, + "grad_norm": 0.4447606307607397, + "learning_rate": 2.7659126685275027e-05, + "loss": 0.8961, + "step": 478 + }, + { + "epoch": 0.7664, + "grad_norm": 0.3618392218752661, + "learning_rate": 2.7302176541257986e-05, + "loss": 0.6671, + "step": 479 + }, + { + "epoch": 0.768, + "grad_norm": 0.4017181337738947, + "learning_rate": 2.6947180173971508e-05, + "loss": 0.7277, + "step": 480 + }, + { + "epoch": 0.7696, + "grad_norm": 0.36709172680448393, + "learning_rate": 2.659414712405398e-05, + "loss": 0.6982, + "step": 481 + }, + { + "epoch": 0.7712, + "grad_norm": 0.4075043074993376, + "learning_rate": 2.6243086879379e-05, + "loss": 0.7011, + "step": 482 + }, + { + "epoch": 0.7728, + "grad_norm": 0.44322286468878724, + "learning_rate": 2.5894008874800325e-05, + "loss": 0.733, + "step": 483 + }, + { + "epoch": 0.7744, + "grad_norm": 0.4160851937190195, + "learning_rate": 2.5546922491898495e-05, + "loss": 0.7779, + "step": 484 + }, + { + "epoch": 0.776, + "grad_norm": 0.4049238728038501, + "learning_rate": 2.5201837058728505e-05, + "loss": 0.743, + "step": 485 + }, + { + "epoch": 0.7776, + "grad_norm": 0.4638416235860613, + "learning_rate": 2.485876184956928e-05, + "loss": 0.8496, + "step": 486 + }, + { + "epoch": 0.7792, + "grad_norm": 0.32242176649697163, + "learning_rate": 2.451770608467432e-05, + "loss": 0.6046, + "step": 487 + }, + { + "epoch": 0.7808, + "grad_norm": 0.4056506755784108, + "learning_rate": 2.417867893002387e-05, + "loss": 0.6943, + "step": 488 + }, + { + "epoch": 0.7824, + "grad_norm": 0.4306554451678464, + "learning_rate": 2.3841689497078746e-05, + "loss": 0.7155, + "step": 489 + }, + { + "epoch": 0.784, + "grad_norm": 0.36470418661099374, + "learning_rate": 2.3506746842535242e-05, + "loss": 0.6124, + "step": 490 + }, + { + "epoch": 0.7856, + "grad_norm": 0.3947940275793731, + "learning_rate": 2.3173859968081944e-05, + "loss": 0.6759, + "step": 491 + }, + { + "epoch": 0.7872, + "grad_norm": 0.4600333734797668, + "learning_rate": 2.2843037820157675e-05, + "loss": 0.7176, + "step": 492 + }, + { + "epoch": 0.7888, + "grad_norm": 0.3570800777269948, + "learning_rate": 2.251428928971102e-05, + "loss": 0.7315, + "step": 493 + }, + { + "epoch": 0.7904, + "grad_norm": 0.38644016962962036, + "learning_rate": 2.2187623211961562e-05, + "loss": 0.6743, + "step": 494 + }, + { + "epoch": 0.792, + "grad_norm": 0.392617559246826, + "learning_rate": 2.1863048366162208e-05, + "loss": 0.7205, + "step": 495 + }, + { + "epoch": 0.7936, + "grad_norm": 0.45611966255994957, + "learning_rate": 2.1540573475363402e-05, + "loss": 0.7692, + "step": 496 + }, + { + "epoch": 0.7952, + "grad_norm": 0.37146254281751323, + "learning_rate": 2.1220207206178688e-05, + "loss": 0.6856, + "step": 497 + }, + { + "epoch": 0.7968, + "grad_norm": 0.38229305346934395, + "learning_rate": 2.0901958168551638e-05, + "loss": 0.7541, + "step": 498 + }, + { + "epoch": 0.7984, + "grad_norm": 0.440461568892407, + "learning_rate": 2.058583491552465e-05, + "loss": 0.7624, + "step": 499 + }, + { + "epoch": 0.8, + "grad_norm": 0.4020066834371898, + "learning_rate": 2.027184594300898e-05, + "loss": 0.7859, + "step": 500 + }, + { + "epoch": 0.8016, + "grad_norm": 0.3492975200839857, + "learning_rate": 1.995999968955641e-05, + "loss": 0.6619, + "step": 501 + }, + { + "epoch": 0.8032, + "grad_norm": 0.9341472418513593, + "learning_rate": 1.9650304536132426e-05, + "loss": 0.7237, + "step": 502 + }, + { + "epoch": 0.8048, + "grad_norm": 0.4025533441878727, + "learning_rate": 1.9342768805891178e-05, + "loss": 0.7634, + "step": 503 + }, + { + "epoch": 0.8064, + "grad_norm": 0.3639760736523722, + "learning_rate": 1.903740076395151e-05, + "loss": 0.6507, + "step": 504 + }, + { + "epoch": 0.808, + "grad_norm": 0.3745761817589749, + "learning_rate": 1.8734208617174988e-05, + "loss": 0.6868, + "step": 505 + }, + { + "epoch": 0.8096, + "grad_norm": 0.37884889902728264, + "learning_rate": 1.8433200513945337e-05, + "loss": 0.7736, + "step": 506 + }, + { + "epoch": 0.8112, + "grad_norm": 0.365674772684498, + "learning_rate": 1.8134384543949478e-05, + "loss": 0.7182, + "step": 507 + }, + { + "epoch": 0.8128, + "grad_norm": 0.3624795912730274, + "learning_rate": 1.783776873795994e-05, + "loss": 0.6225, + "step": 508 + }, + { + "epoch": 0.8144, + "grad_norm": 0.3683753071921724, + "learning_rate": 1.754336106761927e-05, + "loss": 0.6936, + "step": 509 + }, + { + "epoch": 0.816, + "grad_norm": 0.35542279663291104, + "learning_rate": 1.7251169445225657e-05, + "loss": 0.6602, + "step": 510 + }, + { + "epoch": 0.8176, + "grad_norm": 0.46197904230171544, + "learning_rate": 1.696120172352025e-05, + "loss": 0.7309, + "step": 511 + }, + { + "epoch": 0.8192, + "grad_norm": 0.3792773991364782, + "learning_rate": 1.6673465695476232e-05, + "loss": 0.7131, + "step": 512 + }, + { + "epoch": 0.8208, + "grad_norm": 0.43029104336412755, + "learning_rate": 1.6387969094089316e-05, + "loss": 0.7544, + "step": 513 + }, + { + "epoch": 0.8224, + "grad_norm": 0.37613372909705506, + "learning_rate": 1.6104719592169902e-05, + "loss": 0.748, + "step": 514 + }, + { + "epoch": 0.824, + "grad_norm": 0.37723167866393553, + "learning_rate": 1.5823724802136865e-05, + "loss": 0.6579, + "step": 515 + }, + { + "epoch": 0.8256, + "grad_norm": 0.5880112976429507, + "learning_rate": 1.5544992275813053e-05, + "loss": 0.6757, + "step": 516 + }, + { + "epoch": 0.8272, + "grad_norm": 0.3642343661248838, + "learning_rate": 1.526852950422226e-05, + "loss": 0.7208, + "step": 517 + }, + { + "epoch": 0.8288, + "grad_norm": 0.4139194259023456, + "learning_rate": 1.4994343917387854e-05, + "loss": 0.8235, + "step": 518 + }, + { + "epoch": 0.8304, + "grad_norm": 0.37933257982824714, + "learning_rate": 1.4722442884133214e-05, + "loss": 0.7123, + "step": 519 + }, + { + "epoch": 0.832, + "grad_norm": 0.3738416827071965, + "learning_rate": 1.4452833711883628e-05, + "loss": 0.6959, + "step": 520 + }, + { + "epoch": 0.8336, + "grad_norm": 0.40217246805698825, + "learning_rate": 1.4185523646469822e-05, + "loss": 0.7336, + "step": 521 + }, + { + "epoch": 0.8352, + "grad_norm": 0.35271916282499327, + "learning_rate": 1.3920519871933424e-05, + "loss": 0.6936, + "step": 522 + }, + { + "epoch": 0.8368, + "grad_norm": 0.3646030235779607, + "learning_rate": 1.3657829510333654e-05, + "loss": 0.6937, + "step": 523 + }, + { + "epoch": 0.8384, + "grad_norm": 0.34717235326994916, + "learning_rate": 1.339745962155613e-05, + "loss": 0.6172, + "step": 524 + }, + { + "epoch": 0.84, + "grad_norm": 0.3566662258185896, + "learning_rate": 1.3139417203123027e-05, + "loss": 0.709, + "step": 525 + }, + { + "epoch": 0.8416, + "grad_norm": 0.3333023597264441, + "learning_rate": 1.2883709190004955e-05, + "loss": 0.6998, + "step": 526 + }, + { + "epoch": 0.8432, + "grad_norm": 0.37598417466923056, + "learning_rate": 1.263034245443473e-05, + "loss": 0.7131, + "step": 527 + }, + { + "epoch": 0.8448, + "grad_norm": 0.4544822203706908, + "learning_rate": 1.2379323805722576e-05, + "loss": 0.6943, + "step": 528 + }, + { + "epoch": 0.8464, + "grad_norm": 0.4272499819402007, + "learning_rate": 1.2130659990073146e-05, + "loss": 0.7532, + "step": 529 + }, + { + "epoch": 0.848, + "grad_norm": 0.4231649747359272, + "learning_rate": 1.1884357690404158e-05, + "loss": 0.7626, + "step": 530 + }, + { + "epoch": 0.8496, + "grad_norm": 0.5451685171570724, + "learning_rate": 1.1640423526166988e-05, + "loss": 0.7421, + "step": 531 + }, + { + "epoch": 0.8512, + "grad_norm": 0.38091879800214, + "learning_rate": 1.1398864053168534e-05, + "loss": 0.6688, + "step": 532 + }, + { + "epoch": 0.8528, + "grad_norm": 0.3372074968746882, + "learning_rate": 1.1159685763395111e-05, + "loss": 0.6387, + "step": 533 + }, + { + "epoch": 0.8544, + "grad_norm": 0.3820980681553104, + "learning_rate": 1.0922895084838037e-05, + "loss": 0.7031, + "step": 534 + }, + { + "epoch": 0.856, + "grad_norm": 0.5064060662251902, + "learning_rate": 1.0688498381320855e-05, + "loss": 0.6677, + "step": 535 + }, + { + "epoch": 0.8576, + "grad_norm": 0.4379848712592458, + "learning_rate": 1.045650195232819e-05, + "loss": 0.799, + "step": 536 + }, + { + "epoch": 0.8592, + "grad_norm": 0.36116529315085705, + "learning_rate": 1.0226912032836611e-05, + "loss": 0.666, + "step": 537 + }, + { + "epoch": 0.8608, + "grad_norm": 0.3777588973708302, + "learning_rate": 9.999734793146998e-06, + "loss": 0.7148, + "step": 538 + }, + { + "epoch": 0.8624, + "grad_norm": 0.4167105277665021, + "learning_rate": 9.774976338718677e-06, + "loss": 0.7685, + "step": 539 + }, + { + "epoch": 0.864, + "grad_norm": 0.34278184923898475, + "learning_rate": 9.552642710005299e-06, + "loss": 0.6508, + "step": 540 + }, + { + "epoch": 0.8656, + "grad_norm": 0.3698009942965616, + "learning_rate": 9.332739882292752e-06, + "loss": 0.6828, + "step": 541 + }, + { + "epoch": 0.8672, + "grad_norm": 0.40029887541730624, + "learning_rate": 9.115273765538202e-06, + "loss": 0.7863, + "step": 542 + }, + { + "epoch": 0.8688, + "grad_norm": 0.3586155213422039, + "learning_rate": 8.900250204211514e-06, + "loss": 0.6712, + "step": 543 + }, + { + "epoch": 0.8704, + "grad_norm": 0.35753596513691044, + "learning_rate": 8.687674977138116e-06, + "loss": 0.6914, + "step": 544 + }, + { + "epoch": 0.872, + "grad_norm": 0.37166902509689304, + "learning_rate": 8.47755379734373e-06, + "loss": 0.7057, + "step": 545 + }, + { + "epoch": 0.8736, + "grad_norm": 0.38789242252868983, + "learning_rate": 8.269892311900696e-06, + "loss": 0.7205, + "step": 546 + }, + { + "epoch": 0.8752, + "grad_norm": 0.3613517774832937, + "learning_rate": 8.064696101776358e-06, + "loss": 0.6579, + "step": 547 + }, + { + "epoch": 0.8768, + "grad_norm": 0.36005293153058765, + "learning_rate": 7.861970681683051e-06, + "loss": 0.6636, + "step": 548 + }, + { + "epoch": 0.8784, + "grad_norm": 0.42523047864521163, + "learning_rate": 7.661721499929753e-06, + "loss": 0.6521, + "step": 549 + }, + { + "epoch": 0.88, + "grad_norm": 0.3816463390281512, + "learning_rate": 7.463953938275858e-06, + "loss": 0.6912, + "step": 550 + }, + { + "epoch": 0.8816, + "grad_norm": 0.3771823012583285, + "learning_rate": 7.2686733117863784e-06, + "loss": 0.6688, + "step": 551 + }, + { + "epoch": 0.8832, + "grad_norm": 0.41737079550125006, + "learning_rate": 7.07588486868922e-06, + "loss": 0.7161, + "step": 552 + }, + { + "epoch": 0.8848, + "grad_norm": 0.3765121293994986, + "learning_rate": 6.8855937902340576e-06, + "loss": 0.6734, + "step": 553 + }, + { + "epoch": 0.8864, + "grad_norm": 0.3693903375454175, + "learning_rate": 6.6978051905530855e-06, + "loss": 0.6272, + "step": 554 + }, + { + "epoch": 0.888, + "grad_norm": 0.339695796508919, + "learning_rate": 6.512524116523633e-06, + "loss": 0.6653, + "step": 555 + }, + { + "epoch": 0.8896, + "grad_norm": 0.3919350116426099, + "learning_rate": 6.329755547632499e-06, + "loss": 0.7107, + "step": 556 + }, + { + "epoch": 0.8912, + "grad_norm": 0.3270127503285997, + "learning_rate": 6.149504395842087e-06, + "loss": 0.6282, + "step": 557 + }, + { + "epoch": 0.8928, + "grad_norm": 0.33748229524615164, + "learning_rate": 5.971775505458444e-06, + "loss": 0.618, + "step": 558 + }, + { + "epoch": 0.8944, + "grad_norm": 0.40175416378107937, + "learning_rate": 5.7965736530010916e-06, + "loss": 0.741, + "step": 559 + }, + { + "epoch": 0.896, + "grad_norm": 0.35748115958853127, + "learning_rate": 5.623903547074549e-06, + "loss": 0.6269, + "step": 560 + }, + { + "epoch": 0.8976, + "grad_norm": 0.39710210789544903, + "learning_rate": 5.453769828241872e-06, + "loss": 0.7247, + "step": 561 + }, + { + "epoch": 0.8992, + "grad_norm": 0.3354133701651576, + "learning_rate": 5.286177068899989e-06, + "loss": 0.6524, + "step": 562 + }, + { + "epoch": 0.9008, + "grad_norm": 0.35240132550609804, + "learning_rate": 5.121129773156663e-06, + "loss": 0.6588, + "step": 563 + }, + { + "epoch": 0.9024, + "grad_norm": 0.3954769971174592, + "learning_rate": 4.95863237670956e-06, + "loss": 0.6755, + "step": 564 + }, + { + "epoch": 0.904, + "grad_norm": 0.3494405366807235, + "learning_rate": 4.798689246727006e-06, + "loss": 0.6475, + "step": 565 + }, + { + "epoch": 0.9056, + "grad_norm": 0.4137297947591601, + "learning_rate": 4.641304681730641e-06, + "loss": 0.7107, + "step": 566 + }, + { + "epoch": 0.9072, + "grad_norm": 0.3513366536788848, + "learning_rate": 4.486482911479839e-06, + "loss": 0.6973, + "step": 567 + }, + { + "epoch": 0.9088, + "grad_norm": 0.346812197226684, + "learning_rate": 4.3342280968580285e-06, + "loss": 0.705, + "step": 568 + }, + { + "epoch": 0.9104, + "grad_norm": 0.41841690235705675, + "learning_rate": 4.184544329761009e-06, + "loss": 0.7331, + "step": 569 + }, + { + "epoch": 0.912, + "grad_norm": 0.4013822537523903, + "learning_rate": 4.037435632986786e-06, + "loss": 0.7293, + "step": 570 + }, + { + "epoch": 0.9136, + "grad_norm": 0.385092970010389, + "learning_rate": 3.892905960127546e-06, + "loss": 0.7544, + "step": 571 + }, + { + "epoch": 0.9152, + "grad_norm": 0.3273345381287588, + "learning_rate": 3.750959195463466e-06, + "loss": 0.638, + "step": 572 + }, + { + "epoch": 0.9168, + "grad_norm": 0.4025834689258895, + "learning_rate": 3.611599153858214e-06, + "loss": 0.7374, + "step": 573 + }, + { + "epoch": 0.9184, + "grad_norm": 0.34894211940017017, + "learning_rate": 3.4748295806564356e-06, + "loss": 0.6825, + "step": 574 + }, + { + "epoch": 0.92, + "grad_norm": 0.39694560685304675, + "learning_rate": 3.3406541515832003e-06, + "loss": 0.7365, + "step": 575 + }, + { + "epoch": 0.9216, + "grad_norm": 0.4309222891676764, + "learning_rate": 3.209076472645112e-06, + "loss": 0.7732, + "step": 576 + }, + { + "epoch": 0.9232, + "grad_norm": 0.36303753967921265, + "learning_rate": 3.0801000800333877e-06, + "loss": 0.6455, + "step": 577 + }, + { + "epoch": 0.9248, + "grad_norm": 0.5312086623336856, + "learning_rate": 2.9537284400289355e-06, + "loss": 0.7509, + "step": 578 + }, + { + "epoch": 0.9264, + "grad_norm": 0.36877958151863327, + "learning_rate": 2.8299649489090475e-06, + "loss": 0.6938, + "step": 579 + }, + { + "epoch": 0.928, + "grad_norm": 0.427257036979857, + "learning_rate": 2.708812932856253e-06, + "loss": 0.7845, + "step": 580 + }, + { + "epoch": 0.9296, + "grad_norm": 0.37609962331991664, + "learning_rate": 2.590275647868867e-06, + "loss": 0.6788, + "step": 581 + }, + { + "epoch": 0.9312, + "grad_norm": 0.39459259246211736, + "learning_rate": 2.4743562796734622e-06, + "loss": 0.7242, + "step": 582 + }, + { + "epoch": 0.9328, + "grad_norm": 0.3960478994548615, + "learning_rate": 2.3610579436393e-06, + "loss": 0.6841, + "step": 583 + }, + { + "epoch": 0.9344, + "grad_norm": 0.4472870271167729, + "learning_rate": 2.250383684694579e-06, + "loss": 0.7984, + "step": 584 + }, + { + "epoch": 0.936, + "grad_norm": 0.4952527410376186, + "learning_rate": 2.1423364772445887e-06, + "loss": 0.7035, + "step": 585 + }, + { + "epoch": 0.9376, + "grad_norm": 0.4732747771840069, + "learning_rate": 2.036919225091827e-06, + "loss": 0.7197, + "step": 586 + }, + { + "epoch": 0.9392, + "grad_norm": 0.3741639402451771, + "learning_rate": 1.9341347613579087e-06, + "loss": 0.7303, + "step": 587 + }, + { + "epoch": 0.9408, + "grad_norm": 0.38366485382899274, + "learning_rate": 1.8339858484073935e-06, + "loss": 0.6993, + "step": 588 + }, + { + "epoch": 0.9424, + "grad_norm": 0.38509965539294083, + "learning_rate": 1.7364751777736332e-06, + "loss": 0.6775, + "step": 589 + }, + { + "epoch": 0.944, + "grad_norm": 0.3685855566624197, + "learning_rate": 1.6416053700863964e-06, + "loss": 0.6412, + "step": 590 + }, + { + "epoch": 0.9456, + "grad_norm": 0.3717389723254707, + "learning_rate": 1.5493789750014031e-06, + "loss": 0.6387, + "step": 591 + }, + { + "epoch": 0.9472, + "grad_norm": 0.3865092354235318, + "learning_rate": 1.459798471131868e-06, + "loss": 0.6676, + "step": 592 + }, + { + "epoch": 0.9488, + "grad_norm": 0.35129856196138265, + "learning_rate": 1.3728662659818204e-06, + "loss": 0.6856, + "step": 593 + }, + { + "epoch": 0.9504, + "grad_norm": 0.4692103617575446, + "learning_rate": 1.2885846958814673e-06, + "loss": 0.6423, + "step": 594 + }, + { + "epoch": 0.952, + "grad_norm": 0.3997876857378165, + "learning_rate": 1.2069560259243328e-06, + "loss": 0.7305, + "step": 595 + }, + { + "epoch": 0.9536, + "grad_norm": 0.36764496849984923, + "learning_rate": 1.1279824499064396e-06, + "loss": 0.656, + "step": 596 + }, + { + "epoch": 0.9552, + "grad_norm": 0.3693734170527303, + "learning_rate": 1.0516660902673448e-06, + "loss": 0.6596, + "step": 597 + }, + { + "epoch": 0.9568, + "grad_norm": 0.34281269584844914, + "learning_rate": 9.780089980330642e-07, + "loss": 0.6312, + "step": 598 + }, + { + "epoch": 0.9584, + "grad_norm": 1.418254312024875, + "learning_rate": 9.070131527609604e-07, + "loss": 0.757, + "step": 599 + }, + { + "epoch": 0.96, + "grad_norm": 0.34961298014672315, + "learning_rate": 8.386804624865851e-07, + "loss": 0.6155, + "step": 600 + }, + { + "epoch": 0.9616, + "grad_norm": 0.3537304281066106, + "learning_rate": 7.730127636723539e-07, + "loss": 0.5877, + "step": 601 + }, + { + "epoch": 0.9632, + "grad_norm": 0.3776264669356993, + "learning_rate": 7.100118211581852e-07, + "loss": 0.7001, + "step": 602 + }, + { + "epoch": 0.9648, + "grad_norm": 0.44861620357104404, + "learning_rate": 6.496793281141056e-07, + "loss": 0.727, + "step": 603 + }, + { + "epoch": 0.9664, + "grad_norm": 0.379647447685291, + "learning_rate": 5.920169059947411e-07, + "loss": 0.7139, + "step": 604 + }, + { + "epoch": 0.968, + "grad_norm": 0.3318674040023656, + "learning_rate": 5.370261044956971e-07, + "loss": 0.654, + "step": 605 + }, + { + "epoch": 0.9696, + "grad_norm": 0.406704858957328, + "learning_rate": 4.847084015119574e-07, + "loss": 0.7193, + "step": 606 + }, + { + "epoch": 0.9712, + "grad_norm": 0.37829259854556246, + "learning_rate": 4.3506520309813947e-07, + "loss": 0.7246, + "step": 607 + }, + { + "epoch": 0.9728, + "grad_norm": 0.40938567202470044, + "learning_rate": 3.8809784343072366e-07, + "loss": 0.6912, + "step": 608 + }, + { + "epoch": 0.9744, + "grad_norm": 0.38616679202871845, + "learning_rate": 3.4380758477219333e-07, + "loss": 0.7129, + "step": 609 + }, + { + "epoch": 0.976, + "grad_norm": 0.4166701096457814, + "learning_rate": 3.0219561743707326e-07, + "loss": 0.7468, + "step": 610 + }, + { + "epoch": 0.9776, + "grad_norm": 0.36679294774649107, + "learning_rate": 2.6326305976001055e-07, + "loss": 0.6919, + "step": 611 + }, + { + "epoch": 0.9792, + "grad_norm": 0.43166086429980705, + "learning_rate": 2.2701095806565432e-07, + "loss": 0.7879, + "step": 612 + }, + { + "epoch": 0.9808, + "grad_norm": 0.36798697225067656, + "learning_rate": 1.9344028664056713e-07, + "loss": 0.6477, + "step": 613 + }, + { + "epoch": 0.9824, + "grad_norm": 0.38266191831838003, + "learning_rate": 1.6255194770704586e-07, + "loss": 0.6782, + "step": 614 + }, + { + "epoch": 0.984, + "grad_norm": 0.4358544822598908, + "learning_rate": 1.3434677139885222e-07, + "loss": 0.7521, + "step": 615 + }, + { + "epoch": 0.9856, + "grad_norm": 0.45600870802673316, + "learning_rate": 1.0882551573891953e-07, + "loss": 0.8347, + "step": 616 + }, + { + "epoch": 0.9872, + "grad_norm": 0.43877231268351735, + "learning_rate": 8.598886661895788e-08, + "loss": 0.7373, + "step": 617 + }, + { + "epoch": 0.9888, + "grad_norm": 0.3427494084909085, + "learning_rate": 6.583743778106887e-08, + "loss": 0.703, + "step": 618 + }, + { + "epoch": 0.9904, + "grad_norm": 0.4026245808095916, + "learning_rate": 4.837177080119215e-08, + "loss": 0.7031, + "step": 619 + }, + { + "epoch": 0.992, + "grad_norm": 0.3811537659017375, + "learning_rate": 3.359233507459481e-08, + "loss": 0.6928, + "step": 620 + }, + { + "epoch": 0.9936, + "grad_norm": 0.41639384346031383, + "learning_rate": 2.1499527803214846e-08, + "loss": 0.7257, + "step": 621 + }, + { + "epoch": 0.9952, + "grad_norm": 0.41962231191955746, + "learning_rate": 1.209367398504746e-08, + "loss": 0.7018, + "step": 622 + }, + { + "epoch": 0.9968, + "grad_norm": 0.40149996237190355, + "learning_rate": 5.375026405352035e-09, + "loss": 0.7354, + "step": 623 + }, + { + "epoch": 0.9984, + "grad_norm": 0.3661174120034241, + "learning_rate": 1.3437656298687097e-09, + "loss": 0.6987, + "step": 624 + }, + { + "epoch": 1.0, + "grad_norm": 0.33420017937981583, + "learning_rate": 0.0, + "loss": 0.6408, + "step": 625 + }, + { + "epoch": 1.0, + "step": 625, + "total_flos": 569932943065088.0, + "train_loss": 0.7621833514213562, + "train_runtime": 9858.9358, + "train_samples_per_second": 1.014, + "train_steps_per_second": 0.063 + } + ], + "logging_steps": 1.0, + "max_steps": 625, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 569932943065088.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_3_epochs_1_GA_2_lora/README.md b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_3_epochs_1_GA_2_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_3_epochs_1_GA_2_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_3_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_3_epochs_1_GA_2_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b6893779a25445c0494928b1ff9d10655c35b451 --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_3_epochs_1_GA_2_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "k_proj", + "gate_proj", + "down_proj", + "q_proj", + "o_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d7eb20703d38d43b6e4984121caf4a6e0165c260 --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b40c219d5d9b5bec2f4ac07da90e719dbadba67c3be46da50b1d5098d4991dd +size 671150064 diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_3_epochs_1_GA_2_lora/config.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_3_epochs_1_GA_2_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_3_epochs_1_GA_2_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..2d6d9e259b18437d58b62934611ed505f58dbd8a --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a52359cc6dba98205907cc3a1e177ede41b865bd872a8eab037bc751b153584 +size 918507402 diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_3_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_3_epochs_1_GA_2_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..6e630da6d003cec2d4160d27bea00e853158b0a4 --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_10000_repeat_3_epochs_1_GA_2_lora/trainer_state.json @@ -0,0 +1,4417 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 625, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0016, + "grad_norm": 0.7231983852482833, + "learning_rate": 1.0526315789473684e-05, + "loss": 1.2218, + "step": 1 + }, + { + "epoch": 0.0032, + "grad_norm": 0.9239428360353562, + "learning_rate": 2.105263157894737e-05, + "loss": 1.4363, + "step": 2 + }, + { + "epoch": 0.0048, + "grad_norm": 0.7180615378365476, + "learning_rate": 3.157894736842105e-05, + "loss": 1.191, + "step": 3 + }, + { + "epoch": 0.0064, + "grad_norm": 0.6817646092846863, + "learning_rate": 4.210526315789474e-05, + "loss": 1.2504, + "step": 4 + }, + { + "epoch": 0.008, + "grad_norm": 0.6605178885709505, + "learning_rate": 5.2631578947368424e-05, + "loss": 1.2184, + "step": 5 + }, + { + "epoch": 0.0096, + "grad_norm": 0.6490829266228314, + "learning_rate": 6.31578947368421e-05, + "loss": 1.0808, + "step": 6 + }, + { + "epoch": 0.0112, + "grad_norm": 0.8602365436315741, + "learning_rate": 7.368421052631579e-05, + "loss": 1.0775, + "step": 7 + }, + { + "epoch": 0.0128, + "grad_norm": 1.538864928859595, + "learning_rate": 8.421052631578948e-05, + "loss": 1.0181, + "step": 8 + }, + { + "epoch": 0.0144, + "grad_norm": 0.7654371127001822, + "learning_rate": 9.473684210526316e-05, + "loss": 0.9833, + "step": 9 + }, + { + "epoch": 0.016, + "grad_norm": 0.7138691243196663, + "learning_rate": 0.00010526315789473685, + "loss": 1.019, + "step": 10 + }, + { + "epoch": 0.0176, + "grad_norm": 0.5886278358027343, + "learning_rate": 0.00011578947368421053, + "loss": 0.9399, + "step": 11 + }, + { + "epoch": 0.0192, + "grad_norm": 0.5458193648893132, + "learning_rate": 0.0001263157894736842, + "loss": 0.9659, + "step": 12 + }, + { + "epoch": 0.0208, + "grad_norm": 0.5177775238784846, + "learning_rate": 0.0001368421052631579, + "loss": 0.9405, + "step": 13 + }, + { + "epoch": 0.0224, + "grad_norm": 0.5251989265716136, + "learning_rate": 0.00014736842105263158, + "loss": 0.8936, + "step": 14 + }, + { + "epoch": 0.024, + "grad_norm": 0.5170180481311273, + "learning_rate": 0.00015789473684210527, + "loss": 0.9367, + "step": 15 + }, + { + "epoch": 0.0256, + "grad_norm": 0.4447650129806105, + "learning_rate": 0.00016842105263157895, + "loss": 0.8518, + "step": 16 + }, + { + "epoch": 0.0272, + "grad_norm": 0.5130114045647743, + "learning_rate": 0.00017894736842105264, + "loss": 0.921, + "step": 17 + }, + { + "epoch": 0.0288, + "grad_norm": 0.536096545149244, + "learning_rate": 0.00018947368421052632, + "loss": 0.8237, + "step": 18 + }, + { + "epoch": 0.0304, + "grad_norm": 0.4747624905240158, + "learning_rate": 0.0002, + "loss": 0.8315, + "step": 19 + }, + { + "epoch": 0.032, + "grad_norm": 0.5272007852308388, + "learning_rate": 0.00019999865623437013, + "loss": 0.8823, + "step": 20 + }, + { + "epoch": 0.0336, + "grad_norm": 0.47184598750765694, + "learning_rate": 0.00019999462497359466, + "loss": 0.8982, + "step": 21 + }, + { + "epoch": 0.0352, + "grad_norm": 0.4686585204755349, + "learning_rate": 0.00019998790632601496, + "loss": 0.8416, + "step": 22 + }, + { + "epoch": 0.0368, + "grad_norm": 0.5093496630970099, + "learning_rate": 0.0001999785004721968, + "loss": 0.8637, + "step": 23 + }, + { + "epoch": 0.0384, + "grad_norm": 0.44506071615956516, + "learning_rate": 0.00019996640766492543, + "loss": 0.8293, + "step": 24 + }, + { + "epoch": 0.04, + "grad_norm": 0.45305206064668757, + "learning_rate": 0.00019995162822919883, + "loss": 0.8356, + "step": 25 + }, + { + "epoch": 0.0416, + "grad_norm": 0.5162905796803792, + "learning_rate": 0.00019993416256221895, + "loss": 0.9218, + "step": 26 + }, + { + "epoch": 0.0432, + "grad_norm": 0.45194404186666465, + "learning_rate": 0.00019991401113338104, + "loss": 0.8877, + "step": 27 + }, + { + "epoch": 0.0448, + "grad_norm": 0.39989939508442185, + "learning_rate": 0.00019989117448426108, + "loss": 0.8065, + "step": 28 + }, + { + "epoch": 0.0464, + "grad_norm": 0.500024102912701, + "learning_rate": 0.00019986565322860115, + "loss": 0.9477, + "step": 29 + }, + { + "epoch": 0.048, + "grad_norm": 0.44488308131175786, + "learning_rate": 0.00019983744805229296, + "loss": 0.8545, + "step": 30 + }, + { + "epoch": 0.0496, + "grad_norm": 0.43483669672508, + "learning_rate": 0.00019980655971335945, + "loss": 0.8122, + "step": 31 + }, + { + "epoch": 0.0512, + "grad_norm": 0.45846847202763763, + "learning_rate": 0.00019977298904193437, + "loss": 0.8754, + "step": 32 + }, + { + "epoch": 0.0528, + "grad_norm": 0.4703483132913366, + "learning_rate": 0.00019973673694024, + "loss": 0.8578, + "step": 33 + }, + { + "epoch": 0.0544, + "grad_norm": 0.4635598888604102, + "learning_rate": 0.00019969780438256293, + "loss": 0.7912, + "step": 34 + }, + { + "epoch": 0.056, + "grad_norm": 0.4824686288787691, + "learning_rate": 0.0001996561924152278, + "loss": 0.8814, + "step": 35 + }, + { + "epoch": 0.0576, + "grad_norm": 0.43694675557467566, + "learning_rate": 0.0001996119021565693, + "loss": 0.8162, + "step": 36 + }, + { + "epoch": 0.0592, + "grad_norm": 0.425351041096402, + "learning_rate": 0.0001995649347969019, + "loss": 0.7953, + "step": 37 + }, + { + "epoch": 0.0608, + "grad_norm": 0.38452492867769794, + "learning_rate": 0.00019951529159848805, + "loss": 0.7559, + "step": 38 + }, + { + "epoch": 0.0624, + "grad_norm": 0.3702718226298968, + "learning_rate": 0.00019946297389550433, + "loss": 0.727, + "step": 39 + }, + { + "epoch": 0.064, + "grad_norm": 0.4313030058682959, + "learning_rate": 0.00019940798309400526, + "loss": 0.8624, + "step": 40 + }, + { + "epoch": 0.0656, + "grad_norm": 0.5519558150093004, + "learning_rate": 0.0001993503206718859, + "loss": 0.9452, + "step": 41 + }, + { + "epoch": 0.0672, + "grad_norm": 0.4893979262026225, + "learning_rate": 0.00019928998817884182, + "loss": 0.8985, + "step": 42 + }, + { + "epoch": 0.0688, + "grad_norm": 0.4458711425408651, + "learning_rate": 0.00019922698723632767, + "loss": 0.8849, + "step": 43 + }, + { + "epoch": 0.0704, + "grad_norm": 0.4683453183063352, + "learning_rate": 0.00019916131953751342, + "loss": 0.8542, + "step": 44 + }, + { + "epoch": 0.072, + "grad_norm": 0.5356719762148601, + "learning_rate": 0.00019909298684723904, + "loss": 1.0009, + "step": 45 + }, + { + "epoch": 0.0736, + "grad_norm": 0.4711332636236724, + "learning_rate": 0.00019902199100196697, + "loss": 0.9286, + "step": 46 + }, + { + "epoch": 0.0752, + "grad_norm": 0.4273826197745025, + "learning_rate": 0.00019894833390973266, + "loss": 0.8184, + "step": 47 + }, + { + "epoch": 0.0768, + "grad_norm": 0.4977211939271013, + "learning_rate": 0.00019887201755009357, + "loss": 0.8839, + "step": 48 + }, + { + "epoch": 0.0784, + "grad_norm": 0.4411220100726296, + "learning_rate": 0.0001987930439740757, + "loss": 0.8103, + "step": 49 + }, + { + "epoch": 0.08, + "grad_norm": 0.442763721455099, + "learning_rate": 0.00019871141530411853, + "loss": 0.84, + "step": 50 + }, + { + "epoch": 0.0816, + "grad_norm": 0.47643441433077544, + "learning_rate": 0.0001986271337340182, + "loss": 0.9073, + "step": 51 + }, + { + "epoch": 0.0832, + "grad_norm": 0.4617633420340866, + "learning_rate": 0.00019854020152886814, + "loss": 0.8034, + "step": 52 + }, + { + "epoch": 0.0848, + "grad_norm": 0.4855214192586931, + "learning_rate": 0.0001984506210249986, + "loss": 0.8373, + "step": 53 + }, + { + "epoch": 0.0864, + "grad_norm": 0.4657300294379618, + "learning_rate": 0.00019835839462991361, + "loss": 0.8669, + "step": 54 + }, + { + "epoch": 0.088, + "grad_norm": 0.4383939014328478, + "learning_rate": 0.00019826352482222638, + "loss": 0.8137, + "step": 55 + }, + { + "epoch": 0.0896, + "grad_norm": 0.4395112496004527, + "learning_rate": 0.00019816601415159263, + "loss": 0.9085, + "step": 56 + }, + { + "epoch": 0.0912, + "grad_norm": 0.5304044430296124, + "learning_rate": 0.0001980658652386421, + "loss": 1.0164, + "step": 57 + }, + { + "epoch": 0.0928, + "grad_norm": 0.3614820299027725, + "learning_rate": 0.00019796308077490817, + "loss": 0.6757, + "step": 58 + }, + { + "epoch": 0.0944, + "grad_norm": 0.45168774647002613, + "learning_rate": 0.00019785766352275542, + "loss": 0.8429, + "step": 59 + }, + { + "epoch": 0.096, + "grad_norm": 0.480628654761509, + "learning_rate": 0.00019774961631530545, + "loss": 0.9253, + "step": 60 + }, + { + "epoch": 0.0976, + "grad_norm": 0.44062603086934643, + "learning_rate": 0.00019763894205636072, + "loss": 0.8753, + "step": 61 + }, + { + "epoch": 0.0992, + "grad_norm": 0.4420390958086642, + "learning_rate": 0.00019752564372032657, + "loss": 0.8654, + "step": 62 + }, + { + "epoch": 0.1008, + "grad_norm": 0.3936872645650932, + "learning_rate": 0.00019740972435213115, + "loss": 0.8097, + "step": 63 + }, + { + "epoch": 0.1024, + "grad_norm": 0.4053073352577127, + "learning_rate": 0.00019729118706714375, + "loss": 0.8026, + "step": 64 + }, + { + "epoch": 0.104, + "grad_norm": 0.47558333628895877, + "learning_rate": 0.00019717003505109095, + "loss": 0.8552, + "step": 65 + }, + { + "epoch": 0.1056, + "grad_norm": 0.41322518957290216, + "learning_rate": 0.00019704627155997108, + "loss": 0.8326, + "step": 66 + }, + { + "epoch": 0.1072, + "grad_norm": 0.40526727949551145, + "learning_rate": 0.00019691989991996663, + "loss": 0.7565, + "step": 67 + }, + { + "epoch": 0.1088, + "grad_norm": 0.43984934584398244, + "learning_rate": 0.0001967909235273549, + "loss": 0.7443, + "step": 68 + }, + { + "epoch": 0.1104, + "grad_norm": 0.5633570704560397, + "learning_rate": 0.00019665934584841682, + "loss": 0.8122, + "step": 69 + }, + { + "epoch": 0.112, + "grad_norm": 0.4550062994140467, + "learning_rate": 0.00019652517041934356, + "loss": 0.8266, + "step": 70 + }, + { + "epoch": 0.1136, + "grad_norm": 0.44306577214515325, + "learning_rate": 0.00019638840084614182, + "loss": 0.8471, + "step": 71 + }, + { + "epoch": 0.1152, + "grad_norm": 0.460502515247839, + "learning_rate": 0.00019624904080453655, + "loss": 0.8474, + "step": 72 + }, + { + "epoch": 0.1168, + "grad_norm": 0.7040213764246828, + "learning_rate": 0.00019610709403987246, + "loss": 0.8557, + "step": 73 + }, + { + "epoch": 0.1184, + "grad_norm": 0.49032988752370005, + "learning_rate": 0.00019596256436701324, + "loss": 0.8772, + "step": 74 + }, + { + "epoch": 0.12, + "grad_norm": 0.4347823759443212, + "learning_rate": 0.000195815455670239, + "loss": 0.7704, + "step": 75 + }, + { + "epoch": 0.1216, + "grad_norm": 0.4233544696283672, + "learning_rate": 0.00019566577190314197, + "loss": 0.7436, + "step": 76 + }, + { + "epoch": 0.1232, + "grad_norm": 0.47963772898838225, + "learning_rate": 0.0001955135170885202, + "loss": 0.7285, + "step": 77 + }, + { + "epoch": 0.1248, + "grad_norm": 0.4087207360263281, + "learning_rate": 0.00019535869531826937, + "loss": 0.8843, + "step": 78 + }, + { + "epoch": 0.1264, + "grad_norm": 0.4044541070129, + "learning_rate": 0.00019520131075327298, + "loss": 0.7651, + "step": 79 + }, + { + "epoch": 0.128, + "grad_norm": 0.39898291558221216, + "learning_rate": 0.00019504136762329047, + "loss": 0.8058, + "step": 80 + }, + { + "epoch": 0.1296, + "grad_norm": 0.4325162026240143, + "learning_rate": 0.00019487887022684336, + "loss": 0.8343, + "step": 81 + }, + { + "epoch": 0.1312, + "grad_norm": 0.44678270740796794, + "learning_rate": 0.00019471382293110003, + "loss": 0.8671, + "step": 82 + }, + { + "epoch": 0.1328, + "grad_norm": 0.45888051318224715, + "learning_rate": 0.00019454623017175812, + "loss": 0.7594, + "step": 83 + }, + { + "epoch": 0.1344, + "grad_norm": 0.44831895893069373, + "learning_rate": 0.00019437609645292546, + "loss": 0.8176, + "step": 84 + }, + { + "epoch": 0.136, + "grad_norm": 0.41984502622281566, + "learning_rate": 0.0001942034263469989, + "loss": 0.8446, + "step": 85 + }, + { + "epoch": 0.1376, + "grad_norm": 0.41044107463774593, + "learning_rate": 0.00019402822449454153, + "loss": 0.8128, + "step": 86 + }, + { + "epoch": 0.1392, + "grad_norm": 0.44821025270461734, + "learning_rate": 0.00019385049560415794, + "loss": 0.8759, + "step": 87 + }, + { + "epoch": 0.1408, + "grad_norm": 0.5167747133762765, + "learning_rate": 0.00019367024445236754, + "loss": 0.8792, + "step": 88 + }, + { + "epoch": 0.1424, + "grad_norm": 0.4297095292559342, + "learning_rate": 0.00019348747588347637, + "loss": 0.7773, + "step": 89 + }, + { + "epoch": 0.144, + "grad_norm": 0.45276162661542235, + "learning_rate": 0.00019330219480944694, + "loss": 0.7908, + "step": 90 + }, + { + "epoch": 0.1456, + "grad_norm": 0.41541240217356085, + "learning_rate": 0.00019311440620976597, + "loss": 0.8016, + "step": 91 + }, + { + "epoch": 0.1472, + "grad_norm": 0.3622747630689792, + "learning_rate": 0.0001929241151313108, + "loss": 0.7264, + "step": 92 + }, + { + "epoch": 0.1488, + "grad_norm": 0.40031894800756795, + "learning_rate": 0.00019273132668821364, + "loss": 0.7834, + "step": 93 + }, + { + "epoch": 0.1504, + "grad_norm": 0.48082501114482634, + "learning_rate": 0.00019253604606172417, + "loss": 0.8875, + "step": 94 + }, + { + "epoch": 0.152, + "grad_norm": 0.35600145331928734, + "learning_rate": 0.00019233827850007027, + "loss": 0.7301, + "step": 95 + }, + { + "epoch": 0.1536, + "grad_norm": 0.45930464641779406, + "learning_rate": 0.00019213802931831696, + "loss": 0.8515, + "step": 96 + }, + { + "epoch": 0.1552, + "grad_norm": 0.41963271587690376, + "learning_rate": 0.00019193530389822363, + "loss": 0.8478, + "step": 97 + }, + { + "epoch": 0.1568, + "grad_norm": 0.4500207122798498, + "learning_rate": 0.00019173010768809933, + "loss": 0.8808, + "step": 98 + }, + { + "epoch": 0.1584, + "grad_norm": 0.35756275233876716, + "learning_rate": 0.0001915224462026563, + "loss": 0.7014, + "step": 99 + }, + { + "epoch": 0.16, + "grad_norm": 0.42620631691726657, + "learning_rate": 0.00019131232502286188, + "loss": 0.8493, + "step": 100 + }, + { + "epoch": 0.1616, + "grad_norm": 0.4229131471825618, + "learning_rate": 0.0001910997497957885, + "loss": 0.8046, + "step": 101 + }, + { + "epoch": 0.1632, + "grad_norm": 0.4475019357926235, + "learning_rate": 0.00019088472623446183, + "loss": 0.9009, + "step": 102 + }, + { + "epoch": 0.1648, + "grad_norm": 0.4063370550029476, + "learning_rate": 0.00019066726011770726, + "loss": 0.8276, + "step": 103 + }, + { + "epoch": 0.1664, + "grad_norm": 0.48025442429594223, + "learning_rate": 0.0001904473572899947, + "loss": 0.8528, + "step": 104 + }, + { + "epoch": 0.168, + "grad_norm": 0.41513887163492563, + "learning_rate": 0.00019022502366128135, + "loss": 0.8411, + "step": 105 + }, + { + "epoch": 0.1696, + "grad_norm": 0.39881679931357344, + "learning_rate": 0.00019000026520685302, + "loss": 0.8223, + "step": 106 + }, + { + "epoch": 0.1712, + "grad_norm": 0.46167228602985083, + "learning_rate": 0.0001897730879671634, + "loss": 0.8821, + "step": 107 + }, + { + "epoch": 0.1728, + "grad_norm": 0.4348299922553354, + "learning_rate": 0.00018954349804767184, + "loss": 0.8155, + "step": 108 + }, + { + "epoch": 0.1744, + "grad_norm": 0.4606340496755724, + "learning_rate": 0.00018931150161867916, + "loss": 0.8852, + "step": 109 + }, + { + "epoch": 0.176, + "grad_norm": 0.4261363671055583, + "learning_rate": 0.00018907710491516199, + "loss": 0.8074, + "step": 110 + }, + { + "epoch": 0.1776, + "grad_norm": 0.41441921857228664, + "learning_rate": 0.0001888403142366049, + "loss": 0.7537, + "step": 111 + }, + { + "epoch": 0.1792, + "grad_norm": 0.34966187240410096, + "learning_rate": 0.00018860113594683148, + "loss": 0.7092, + "step": 112 + }, + { + "epoch": 0.1808, + "grad_norm": 0.44880769270300125, + "learning_rate": 0.00018835957647383303, + "loss": 0.868, + "step": 113 + }, + { + "epoch": 0.1824, + "grad_norm": 0.4300532166432563, + "learning_rate": 0.00018811564230959588, + "loss": 0.8659, + "step": 114 + }, + { + "epoch": 0.184, + "grad_norm": 0.41459091492252037, + "learning_rate": 0.00018786934000992688, + "loss": 0.7918, + "step": 115 + }, + { + "epoch": 0.1856, + "grad_norm": 0.3936699595042386, + "learning_rate": 0.00018762067619427746, + "loss": 0.8224, + "step": 116 + }, + { + "epoch": 0.1872, + "grad_norm": 0.4299016960501296, + "learning_rate": 0.00018736965754556528, + "loss": 0.8132, + "step": 117 + }, + { + "epoch": 0.1888, + "grad_norm": 0.3859838782184397, + "learning_rate": 0.00018711629080999504, + "loss": 0.742, + "step": 118 + }, + { + "epoch": 0.1904, + "grad_norm": 0.4176535375356266, + "learning_rate": 0.00018686058279687698, + "loss": 0.8144, + "step": 119 + }, + { + "epoch": 0.192, + "grad_norm": 0.42445455897420875, + "learning_rate": 0.00018660254037844388, + "loss": 0.7717, + "step": 120 + }, + { + "epoch": 0.1936, + "grad_norm": 0.406951765218565, + "learning_rate": 0.00018634217048966637, + "loss": 0.7898, + "step": 121 + }, + { + "epoch": 0.1952, + "grad_norm": 0.4129670966436122, + "learning_rate": 0.0001860794801280666, + "loss": 0.7895, + "step": 122 + }, + { + "epoch": 0.1968, + "grad_norm": 0.3990480424499018, + "learning_rate": 0.0001858144763535302, + "loss": 0.7479, + "step": 123 + }, + { + "epoch": 0.1984, + "grad_norm": 0.37947353309371373, + "learning_rate": 0.0001855471662881164, + "loss": 0.7335, + "step": 124 + }, + { + "epoch": 0.2, + "grad_norm": 0.43753429667586186, + "learning_rate": 0.00018527755711586678, + "loss": 0.7925, + "step": 125 + }, + { + "epoch": 0.2016, + "grad_norm": 0.40563377239425713, + "learning_rate": 0.00018500565608261214, + "loss": 0.8314, + "step": 126 + }, + { + "epoch": 0.2032, + "grad_norm": 0.34805822464650815, + "learning_rate": 0.00018473147049577774, + "loss": 0.6837, + "step": 127 + }, + { + "epoch": 0.2048, + "grad_norm": 0.40339949273873393, + "learning_rate": 0.00018445500772418697, + "loss": 0.7985, + "step": 128 + }, + { + "epoch": 0.2064, + "grad_norm": 0.45478075104512755, + "learning_rate": 0.00018417627519786315, + "loss": 0.8746, + "step": 129 + }, + { + "epoch": 0.208, + "grad_norm": 0.4020475250433333, + "learning_rate": 0.00018389528040783012, + "loss": 0.8274, + "step": 130 + }, + { + "epoch": 0.2096, + "grad_norm": 0.39769891080606107, + "learning_rate": 0.00018361203090591071, + "loss": 0.7536, + "step": 131 + }, + { + "epoch": 0.2112, + "grad_norm": 0.49319202306759663, + "learning_rate": 0.00018332653430452376, + "loss": 0.9169, + "step": 132 + }, + { + "epoch": 0.2128, + "grad_norm": 0.3482894565665991, + "learning_rate": 0.00018303879827647975, + "loss": 0.7097, + "step": 133 + }, + { + "epoch": 0.2144, + "grad_norm": 0.398915525336025, + "learning_rate": 0.00018274883055477436, + "loss": 0.7959, + "step": 134 + }, + { + "epoch": 0.216, + "grad_norm": 0.3925875461027216, + "learning_rate": 0.00018245663893238075, + "loss": 0.8095, + "step": 135 + }, + { + "epoch": 0.2176, + "grad_norm": 0.3698735981233041, + "learning_rate": 0.00018216223126204007, + "loss": 0.7447, + "step": 136 + }, + { + "epoch": 0.2192, + "grad_norm": 0.45067845618559665, + "learning_rate": 0.00018186561545605054, + "loss": 0.8078, + "step": 137 + }, + { + "epoch": 0.2208, + "grad_norm": 0.41091235860575553, + "learning_rate": 0.00018156679948605467, + "loss": 0.7702, + "step": 138 + }, + { + "epoch": 0.2224, + "grad_norm": 0.4440504220821974, + "learning_rate": 0.00018126579138282503, + "loss": 0.8517, + "step": 139 + }, + { + "epoch": 0.224, + "grad_norm": 0.4538847821745842, + "learning_rate": 0.0001809625992360485, + "loss": 0.8555, + "step": 140 + }, + { + "epoch": 0.2256, + "grad_norm": 0.4614655216354403, + "learning_rate": 0.00018065723119410884, + "loss": 0.8675, + "step": 141 + }, + { + "epoch": 0.2272, + "grad_norm": 0.3807134184026545, + "learning_rate": 0.00018034969546386757, + "loss": 0.7813, + "step": 142 + }, + { + "epoch": 0.2288, + "grad_norm": 0.485645440042214, + "learning_rate": 0.0001800400003104436, + "loss": 0.8972, + "step": 143 + }, + { + "epoch": 0.2304, + "grad_norm": 0.3866526459560267, + "learning_rate": 0.00017972815405699103, + "loss": 0.7519, + "step": 144 + }, + { + "epoch": 0.232, + "grad_norm": 0.4002641639066821, + "learning_rate": 0.00017941416508447536, + "loss": 0.7843, + "step": 145 + }, + { + "epoch": 0.2336, + "grad_norm": 0.4245784518252565, + "learning_rate": 0.0001790980418314484, + "loss": 0.7682, + "step": 146 + }, + { + "epoch": 0.2352, + "grad_norm": 0.49493842111461256, + "learning_rate": 0.00017877979279382135, + "loss": 0.8838, + "step": 147 + }, + { + "epoch": 0.2368, + "grad_norm": 0.42639065046813407, + "learning_rate": 0.0001784594265246366, + "loss": 0.7984, + "step": 148 + }, + { + "epoch": 0.2384, + "grad_norm": 0.44101418425470884, + "learning_rate": 0.0001781369516338378, + "loss": 0.8171, + "step": 149 + }, + { + "epoch": 0.24, + "grad_norm": 0.38863125402033316, + "learning_rate": 0.00017781237678803847, + "loss": 0.7071, + "step": 150 + }, + { + "epoch": 0.2416, + "grad_norm": 0.4402300611993132, + "learning_rate": 0.000177485710710289, + "loss": 0.9214, + "step": 151 + }, + { + "epoch": 0.2432, + "grad_norm": 0.3958990999259298, + "learning_rate": 0.00017715696217984235, + "loss": 0.7679, + "step": 152 + }, + { + "epoch": 0.2448, + "grad_norm": 0.3922774962026427, + "learning_rate": 0.00017682614003191807, + "loss": 0.7904, + "step": 153 + }, + { + "epoch": 0.2464, + "grad_norm": 0.38548872401946643, + "learning_rate": 0.00017649325315746478, + "loss": 0.7677, + "step": 154 + }, + { + "epoch": 0.248, + "grad_norm": 0.4584703334288434, + "learning_rate": 0.0001761583105029213, + "loss": 0.8509, + "step": 155 + }, + { + "epoch": 0.2496, + "grad_norm": 0.37807672222174266, + "learning_rate": 0.00017582132106997616, + "loss": 0.7467, + "step": 156 + }, + { + "epoch": 0.2512, + "grad_norm": 0.3896875121644614, + "learning_rate": 0.00017548229391532572, + "loss": 0.8045, + "step": 157 + }, + { + "epoch": 0.2528, + "grad_norm": 0.422446706378271, + "learning_rate": 0.00017514123815043074, + "loss": 0.7931, + "step": 158 + }, + { + "epoch": 0.2544, + "grad_norm": 0.4345092830405177, + "learning_rate": 0.00017479816294127152, + "loss": 0.802, + "step": 159 + }, + { + "epoch": 0.256, + "grad_norm": 0.4109805837514671, + "learning_rate": 0.0001744530775081015, + "loss": 0.7445, + "step": 160 + }, + { + "epoch": 0.2576, + "grad_norm": 0.4456762300372841, + "learning_rate": 0.0001741059911251997, + "loss": 0.8024, + "step": 161 + }, + { + "epoch": 0.2592, + "grad_norm": 0.5806850234023688, + "learning_rate": 0.000173756913120621, + "loss": 0.777, + "step": 162 + }, + { + "epoch": 0.2608, + "grad_norm": 0.4651593782206959, + "learning_rate": 0.00017340585287594604, + "loss": 0.9195, + "step": 163 + }, + { + "epoch": 0.2624, + "grad_norm": 0.4494929684904094, + "learning_rate": 0.0001730528198260285, + "loss": 0.8541, + "step": 164 + }, + { + "epoch": 0.264, + "grad_norm": 0.40354223512618753, + "learning_rate": 0.00017269782345874203, + "loss": 0.828, + "step": 165 + }, + { + "epoch": 0.2656, + "grad_norm": 0.4319419814036804, + "learning_rate": 0.00017234087331472497, + "loss": 0.8353, + "step": 166 + }, + { + "epoch": 0.2672, + "grad_norm": 0.4675999844652303, + "learning_rate": 0.00017198197898712404, + "loss": 0.8615, + "step": 167 + }, + { + "epoch": 0.2688, + "grad_norm": 0.43362774505676915, + "learning_rate": 0.00017162115012133643, + "loss": 0.7625, + "step": 168 + }, + { + "epoch": 0.2704, + "grad_norm": 0.43816533532445734, + "learning_rate": 0.00017125839641475072, + "loss": 0.774, + "step": 169 + }, + { + "epoch": 0.272, + "grad_norm": 0.43426690755805253, + "learning_rate": 0.00017089372761648616, + "loss": 0.8425, + "step": 170 + }, + { + "epoch": 0.2736, + "grad_norm": 0.42823798037894056, + "learning_rate": 0.00017052715352713075, + "loss": 0.7345, + "step": 171 + }, + { + "epoch": 0.2752, + "grad_norm": 0.4063338463286235, + "learning_rate": 0.00017015868399847768, + "loss": 0.7897, + "step": 172 + }, + { + "epoch": 0.2768, + "grad_norm": 0.39696713946999873, + "learning_rate": 0.00016978832893326074, + "loss": 0.7894, + "step": 173 + }, + { + "epoch": 0.2784, + "grad_norm": 0.40317349721299145, + "learning_rate": 0.00016941609828488807, + "loss": 0.8119, + "step": 174 + }, + { + "epoch": 0.28, + "grad_norm": 0.399453657418752, + "learning_rate": 0.0001690420020571747, + "loss": 0.8214, + "step": 175 + }, + { + "epoch": 0.2816, + "grad_norm": 0.4265183819901487, + "learning_rate": 0.0001686660503040737, + "loss": 0.7938, + "step": 176 + }, + { + "epoch": 0.2832, + "grad_norm": 0.4403751664017105, + "learning_rate": 0.00016828825312940592, + "loss": 0.8429, + "step": 177 + }, + { + "epoch": 0.2848, + "grad_norm": 0.39862194879594887, + "learning_rate": 0.0001679086206865886, + "loss": 0.7645, + "step": 178 + }, + { + "epoch": 0.2864, + "grad_norm": 0.4716213034948484, + "learning_rate": 0.00016752716317836229, + "loss": 0.8993, + "step": 179 + }, + { + "epoch": 0.288, + "grad_norm": 0.40424827442610956, + "learning_rate": 0.0001671438908565167, + "loss": 0.7667, + "step": 180 + }, + { + "epoch": 0.2896, + "grad_norm": 0.39321379967009523, + "learning_rate": 0.00016675881402161536, + "loss": 0.7626, + "step": 181 + }, + { + "epoch": 0.2912, + "grad_norm": 0.4324463899318374, + "learning_rate": 0.0001663719430227186, + "loss": 0.7403, + "step": 182 + }, + { + "epoch": 0.2928, + "grad_norm": 0.43587080051042243, + "learning_rate": 0.00016598328825710533, + "loss": 0.8138, + "step": 183 + }, + { + "epoch": 0.2944, + "grad_norm": 0.4351378421332776, + "learning_rate": 0.000165592860169994, + "loss": 0.8495, + "step": 184 + }, + { + "epoch": 0.296, + "grad_norm": 0.42382258870089384, + "learning_rate": 0.00016520066925426144, + "loss": 0.76, + "step": 185 + }, + { + "epoch": 0.2976, + "grad_norm": 0.3735845437624687, + "learning_rate": 0.0001648067260501611, + "loss": 0.7217, + "step": 186 + }, + { + "epoch": 0.2992, + "grad_norm": 0.3746451882880628, + "learning_rate": 0.0001644110411450398, + "loss": 0.8167, + "step": 187 + }, + { + "epoch": 0.3008, + "grad_norm": 0.3757852868423534, + "learning_rate": 0.00016401362517305296, + "loss": 0.7396, + "step": 188 + }, + { + "epoch": 0.3024, + "grad_norm": 0.4193272800347855, + "learning_rate": 0.00016361448881487914, + "loss": 0.7896, + "step": 189 + }, + { + "epoch": 0.304, + "grad_norm": 0.4247677112289791, + "learning_rate": 0.00016321364279743266, + "loss": 0.8306, + "step": 190 + }, + { + "epoch": 0.3056, + "grad_norm": 0.43674433809018215, + "learning_rate": 0.0001628110978935756, + "loss": 0.8306, + "step": 191 + }, + { + "epoch": 0.3072, + "grad_norm": 0.3785698696505638, + "learning_rate": 0.00016240686492182804, + "loss": 0.7203, + "step": 192 + }, + { + "epoch": 0.3088, + "grad_norm": 0.48021686661530827, + "learning_rate": 0.00016200095474607753, + "loss": 0.8875, + "step": 193 + }, + { + "epoch": 0.3104, + "grad_norm": 0.3885208439653135, + "learning_rate": 0.00016159337827528685, + "loss": 0.7445, + "step": 194 + }, + { + "epoch": 0.312, + "grad_norm": 0.3891959958675227, + "learning_rate": 0.0001611841464632011, + "loss": 0.7809, + "step": 195 + }, + { + "epoch": 0.3136, + "grad_norm": 0.4056890355515356, + "learning_rate": 0.0001607732703080532, + "loss": 0.6909, + "step": 196 + }, + { + "epoch": 0.3152, + "grad_norm": 0.39978656560334713, + "learning_rate": 0.00016036076085226814, + "loss": 0.7134, + "step": 197 + }, + { + "epoch": 0.3168, + "grad_norm": 0.44775014777529165, + "learning_rate": 0.0001599466291821666, + "loss": 0.7949, + "step": 198 + }, + { + "epoch": 0.3184, + "grad_norm": 0.46304894978869354, + "learning_rate": 0.0001595308864276666, + "loss": 0.855, + "step": 199 + }, + { + "epoch": 0.32, + "grad_norm": 0.381657877086454, + "learning_rate": 0.0001591135437619847, + "loss": 0.7286, + "step": 200 + }, + { + "epoch": 0.3216, + "grad_norm": 0.4196539341868882, + "learning_rate": 0.0001586946124013354, + "loss": 0.83, + "step": 201 + }, + { + "epoch": 0.3232, + "grad_norm": 0.38090185944636157, + "learning_rate": 0.0001582741036046301, + "loss": 0.761, + "step": 202 + }, + { + "epoch": 0.3248, + "grad_norm": 0.45605409999909596, + "learning_rate": 0.00015785202867317407, + "loss": 0.8152, + "step": 203 + }, + { + "epoch": 0.3264, + "grad_norm": 0.4032099589518367, + "learning_rate": 0.00015742839895036305, + "loss": 0.7734, + "step": 204 + }, + { + "epoch": 0.328, + "grad_norm": 0.3893841740881215, + "learning_rate": 0.00015700322582137827, + "loss": 0.7019, + "step": 205 + }, + { + "epoch": 0.3296, + "grad_norm": 0.4012209486177639, + "learning_rate": 0.0001565765207128805, + "loss": 0.8093, + "step": 206 + }, + { + "epoch": 0.3312, + "grad_norm": 0.3902971475144759, + "learning_rate": 0.0001561482950927029, + "loss": 0.7713, + "step": 207 + }, + { + "epoch": 0.3328, + "grad_norm": 0.402787069157667, + "learning_rate": 0.00015571856046954285, + "loss": 0.7885, + "step": 208 + }, + { + "epoch": 0.3344, + "grad_norm": 0.409791028207148, + "learning_rate": 0.00015528732839265272, + "loss": 0.7777, + "step": 209 + }, + { + "epoch": 0.336, + "grad_norm": 0.44206483993103474, + "learning_rate": 0.0001548546104515294, + "loss": 0.7605, + "step": 210 + }, + { + "epoch": 0.3376, + "grad_norm": 0.5261416291348644, + "learning_rate": 0.00015442041827560274, + "loss": 0.7468, + "step": 211 + }, + { + "epoch": 0.3392, + "grad_norm": 0.41560987889298096, + "learning_rate": 0.00015398476353392323, + "loss": 0.7397, + "step": 212 + }, + { + "epoch": 0.3408, + "grad_norm": 0.39646930323212953, + "learning_rate": 0.00015354765793484834, + "loss": 0.7449, + "step": 213 + }, + { + "epoch": 0.3424, + "grad_norm": 0.4193611673285004, + "learning_rate": 0.00015310911322572753, + "loss": 0.8182, + "step": 214 + }, + { + "epoch": 0.344, + "grad_norm": 0.42154430598014214, + "learning_rate": 0.000152669141192587, + "loss": 0.8182, + "step": 215 + }, + { + "epoch": 0.3456, + "grad_norm": 0.38976448945002257, + "learning_rate": 0.00015222775365981273, + "loss": 0.8131, + "step": 216 + }, + { + "epoch": 0.3472, + "grad_norm": 0.41060492677429766, + "learning_rate": 0.00015178496248983254, + "loss": 0.8114, + "step": 217 + }, + { + "epoch": 0.3488, + "grad_norm": 0.39989288556293207, + "learning_rate": 0.00015134077958279765, + "loss": 0.7655, + "step": 218 + }, + { + "epoch": 0.3504, + "grad_norm": 0.4096185391840245, + "learning_rate": 0.00015089521687626243, + "loss": 0.7672, + "step": 219 + }, + { + "epoch": 0.352, + "grad_norm": 0.3930935461396084, + "learning_rate": 0.000150448286344864, + "loss": 0.7591, + "step": 220 + }, + { + "epoch": 0.3536, + "grad_norm": 0.46565303187633594, + "learning_rate": 0.00015000000000000001, + "loss": 0.9019, + "step": 221 + }, + { + "epoch": 0.3552, + "grad_norm": 0.39310571443586123, + "learning_rate": 0.00014955036988950618, + "loss": 0.7677, + "step": 222 + }, + { + "epoch": 0.3568, + "grad_norm": 0.3904256737304809, + "learning_rate": 0.00014909940809733222, + "loss": 0.779, + "step": 223 + }, + { + "epoch": 0.3584, + "grad_norm": 0.378364991725375, + "learning_rate": 0.00014864712674321734, + "loss": 0.8013, + "step": 224 + }, + { + "epoch": 0.36, + "grad_norm": 0.4096030420043912, + "learning_rate": 0.00014819353798236427, + "loss": 0.7956, + "step": 225 + }, + { + "epoch": 0.3616, + "grad_norm": 0.4007692023729636, + "learning_rate": 0.00014773865400511272, + "loss": 0.7729, + "step": 226 + }, + { + "epoch": 0.3632, + "grad_norm": 0.43339974493101274, + "learning_rate": 0.00014728248703661182, + "loss": 0.8337, + "step": 227 + }, + { + "epoch": 0.3648, + "grad_norm": 0.4530321071927405, + "learning_rate": 0.00014682504933649144, + "loss": 0.8465, + "step": 228 + }, + { + "epoch": 0.3664, + "grad_norm": 0.4278293397766571, + "learning_rate": 0.00014636635319853275, + "loss": 0.7763, + "step": 229 + }, + { + "epoch": 0.368, + "grad_norm": 0.41652180547380624, + "learning_rate": 0.00014590641095033787, + "loss": 0.7509, + "step": 230 + }, + { + "epoch": 0.3696, + "grad_norm": 0.4731251149617755, + "learning_rate": 0.00014544523495299842, + "loss": 0.8658, + "step": 231 + }, + { + "epoch": 0.3712, + "grad_norm": 0.4446056116287485, + "learning_rate": 0.0001449828376007636, + "loss": 0.7714, + "step": 232 + }, + { + "epoch": 0.3728, + "grad_norm": 0.3953730552417552, + "learning_rate": 0.0001445192313207067, + "loss": 0.7186, + "step": 233 + }, + { + "epoch": 0.3744, + "grad_norm": 0.4518617483617707, + "learning_rate": 0.0001440544285723915, + "loss": 0.8157, + "step": 234 + }, + { + "epoch": 0.376, + "grad_norm": 0.42051458183639395, + "learning_rate": 0.00014358844184753712, + "loss": 0.797, + "step": 235 + }, + { + "epoch": 0.3776, + "grad_norm": 0.4282040460541697, + "learning_rate": 0.00014312128366968243, + "loss": 0.7773, + "step": 236 + }, + { + "epoch": 0.3792, + "grad_norm": 0.3981659644642087, + "learning_rate": 0.00014265296659384956, + "loss": 0.7925, + "step": 237 + }, + { + "epoch": 0.3808, + "grad_norm": 0.38031682467988365, + "learning_rate": 0.00014218350320620624, + "loss": 0.7004, + "step": 238 + }, + { + "epoch": 0.3824, + "grad_norm": 0.378734742002174, + "learning_rate": 0.0001417129061237278, + "loss": 0.7099, + "step": 239 + }, + { + "epoch": 0.384, + "grad_norm": 0.40566754079083356, + "learning_rate": 0.00014124118799385796, + "loss": 0.7416, + "step": 240 + }, + { + "epoch": 0.3856, + "grad_norm": 0.4285863769040255, + "learning_rate": 0.00014076836149416887, + "loss": 0.7839, + "step": 241 + }, + { + "epoch": 0.3872, + "grad_norm": 0.4486971547349298, + "learning_rate": 0.0001402944393320206, + "loss": 0.7981, + "step": 242 + }, + { + "epoch": 0.3888, + "grad_norm": 0.401294715482731, + "learning_rate": 0.00013981943424421932, + "loss": 0.757, + "step": 243 + }, + { + "epoch": 0.3904, + "grad_norm": 0.429532769909926, + "learning_rate": 0.00013934335899667527, + "loss": 0.7652, + "step": 244 + }, + { + "epoch": 0.392, + "grad_norm": 0.4078703061524831, + "learning_rate": 0.00013886622638405952, + "loss": 0.7723, + "step": 245 + }, + { + "epoch": 0.3936, + "grad_norm": 0.38395637745009437, + "learning_rate": 0.00013838804922946027, + "loss": 0.7701, + "step": 246 + }, + { + "epoch": 0.3952, + "grad_norm": 0.46608786737309366, + "learning_rate": 0.00013790884038403795, + "loss": 0.7662, + "step": 247 + }, + { + "epoch": 0.3968, + "grad_norm": 0.35045160988081664, + "learning_rate": 0.00013742861272668012, + "loss": 0.7096, + "step": 248 + }, + { + "epoch": 0.3984, + "grad_norm": 0.37791039178233876, + "learning_rate": 0.00013694737916365517, + "loss": 0.7667, + "step": 249 + }, + { + "epoch": 0.4, + "grad_norm": 0.3701446044366339, + "learning_rate": 0.00013646515262826552, + "loss": 0.7505, + "step": 250 + }, + { + "epoch": 0.4016, + "grad_norm": 0.4072252118024341, + "learning_rate": 0.0001359819460805001, + "loss": 0.762, + "step": 251 + }, + { + "epoch": 0.4032, + "grad_norm": 0.5446744111289362, + "learning_rate": 0.0001354977725066859, + "loss": 0.7562, + "step": 252 + }, + { + "epoch": 0.4048, + "grad_norm": 0.39802460932591816, + "learning_rate": 0.00013501264491913906, + "loss": 0.7107, + "step": 253 + }, + { + "epoch": 0.4064, + "grad_norm": 0.41490940600074233, + "learning_rate": 0.0001345265763558152, + "loss": 0.812, + "step": 254 + }, + { + "epoch": 0.408, + "grad_norm": 0.382564640772057, + "learning_rate": 0.00013403957987995882, + "loss": 0.7406, + "step": 255 + }, + { + "epoch": 0.4096, + "grad_norm": 0.36688101890037983, + "learning_rate": 0.0001335516685797525, + "loss": 0.7235, + "step": 256 + }, + { + "epoch": 0.4112, + "grad_norm": 0.37758267877410473, + "learning_rate": 0.00013306285556796495, + "loss": 0.7257, + "step": 257 + }, + { + "epoch": 0.4128, + "grad_norm": 0.4092480685877459, + "learning_rate": 0.00013257315398159864, + "loss": 0.7844, + "step": 258 + }, + { + "epoch": 0.4144, + "grad_norm": 0.4344992173838129, + "learning_rate": 0.00013208257698153677, + "loss": 0.7703, + "step": 259 + }, + { + "epoch": 0.416, + "grad_norm": 0.40598239590824653, + "learning_rate": 0.00013159113775218964, + "loss": 0.7686, + "step": 260 + }, + { + "epoch": 0.4176, + "grad_norm": 0.4034797257147384, + "learning_rate": 0.00013109884950114007, + "loss": 0.8054, + "step": 261 + }, + { + "epoch": 0.4192, + "grad_norm": 0.47011639450673637, + "learning_rate": 0.00013060572545878875, + "loss": 0.8405, + "step": 262 + }, + { + "epoch": 0.4208, + "grad_norm": 0.44187715596052, + "learning_rate": 0.00013011177887799845, + "loss": 0.8516, + "step": 263 + }, + { + "epoch": 0.4224, + "grad_norm": 0.3540415585880858, + "learning_rate": 0.00012961702303373795, + "loss": 0.6861, + "step": 264 + }, + { + "epoch": 0.424, + "grad_norm": 0.3858217580819322, + "learning_rate": 0.00012912147122272523, + "loss": 0.7518, + "step": 265 + }, + { + "epoch": 0.4256, + "grad_norm": 0.4014913505425734, + "learning_rate": 0.00012862513676307008, + "loss": 0.7743, + "step": 266 + }, + { + "epoch": 0.4272, + "grad_norm": 0.3483869054788003, + "learning_rate": 0.00012812803299391628, + "loss": 0.7343, + "step": 267 + }, + { + "epoch": 0.4288, + "grad_norm": 0.3877059266312812, + "learning_rate": 0.00012763017327508305, + "loss": 0.7729, + "step": 268 + }, + { + "epoch": 0.4304, + "grad_norm": 0.4427999619091166, + "learning_rate": 0.0001271315709867059, + "loss": 0.9003, + "step": 269 + }, + { + "epoch": 0.432, + "grad_norm": 0.3669647698336677, + "learning_rate": 0.00012663223952887723, + "loss": 0.7311, + "step": 270 + }, + { + "epoch": 0.4336, + "grad_norm": 0.37632648115591405, + "learning_rate": 0.00012613219232128608, + "loss": 0.7328, + "step": 271 + }, + { + "epoch": 0.4352, + "grad_norm": 0.38890322786941506, + "learning_rate": 0.00012563144280285741, + "loss": 0.7263, + "step": 272 + }, + { + "epoch": 0.4368, + "grad_norm": 0.3939325748769751, + "learning_rate": 0.00012513000443139112, + "loss": 0.7516, + "step": 273 + }, + { + "epoch": 0.4384, + "grad_norm": 0.45711681649704555, + "learning_rate": 0.00012462789068320017, + "loss": 0.7958, + "step": 274 + }, + { + "epoch": 0.44, + "grad_norm": 0.3906650106035462, + "learning_rate": 0.00012412511505274844, + "loss": 0.7286, + "step": 275 + }, + { + "epoch": 0.4416, + "grad_norm": 0.3890071506512066, + "learning_rate": 0.00012362169105228826, + "loss": 0.7403, + "step": 276 + }, + { + "epoch": 0.4432, + "grad_norm": 0.41111095214733645, + "learning_rate": 0.000123117632211497, + "loss": 0.7467, + "step": 277 + }, + { + "epoch": 0.4448, + "grad_norm": 0.44937322110097283, + "learning_rate": 0.00012261295207711346, + "loss": 0.7609, + "step": 278 + }, + { + "epoch": 0.4464, + "grad_norm": 0.37919580505064715, + "learning_rate": 0.0001221076642125742, + "loss": 0.7705, + "step": 279 + }, + { + "epoch": 0.448, + "grad_norm": 0.37064227528614685, + "learning_rate": 0.00012160178219764837, + "loss": 0.7247, + "step": 280 + }, + { + "epoch": 0.4496, + "grad_norm": 0.3764518651002971, + "learning_rate": 0.00012109531962807332, + "loss": 0.7376, + "step": 281 + }, + { + "epoch": 0.4512, + "grad_norm": 0.3535946848882062, + "learning_rate": 0.00012058829011518896, + "loss": 0.7168, + "step": 282 + }, + { + "epoch": 0.4528, + "grad_norm": 0.3813621088573542, + "learning_rate": 0.00012008070728557186, + "loss": 0.7342, + "step": 283 + }, + { + "epoch": 0.4544, + "grad_norm": 0.3269156299946925, + "learning_rate": 0.00011957258478066931, + "loss": 0.6725, + "step": 284 + }, + { + "epoch": 0.456, + "grad_norm": 0.3946878545722184, + "learning_rate": 0.00011906393625643244, + "loss": 0.7833, + "step": 285 + }, + { + "epoch": 0.4576, + "grad_norm": 0.41330638203636794, + "learning_rate": 0.00011855477538294935, + "loss": 0.7286, + "step": 286 + }, + { + "epoch": 0.4592, + "grad_norm": 0.4361119728409083, + "learning_rate": 0.00011804511584407763, + "loss": 0.6967, + "step": 287 + }, + { + "epoch": 0.4608, + "grad_norm": 0.4183377608817122, + "learning_rate": 0.00011753497133707679, + "loss": 0.7852, + "step": 288 + }, + { + "epoch": 0.4624, + "grad_norm": 0.3953212306940315, + "learning_rate": 0.00011702435557223987, + "loss": 0.7396, + "step": 289 + }, + { + "epoch": 0.464, + "grad_norm": 0.40560920084607494, + "learning_rate": 0.00011651328227252517, + "loss": 0.8026, + "step": 290 + }, + { + "epoch": 0.4656, + "grad_norm": 0.35981353270429806, + "learning_rate": 0.00011600176517318741, + "loss": 0.7193, + "step": 291 + }, + { + "epoch": 0.4672, + "grad_norm": 0.35709845495917836, + "learning_rate": 0.00011548981802140848, + "loss": 0.6829, + "step": 292 + }, + { + "epoch": 0.4688, + "grad_norm": 0.3539121121049415, + "learning_rate": 0.00011497745457592816, + "loss": 0.6592, + "step": 293 + }, + { + "epoch": 0.4704, + "grad_norm": 0.4014672698806347, + "learning_rate": 0.00011446468860667421, + "loss": 0.8046, + "step": 294 + }, + { + "epoch": 0.472, + "grad_norm": 0.4011937126204357, + "learning_rate": 0.00011395153389439233, + "loss": 0.7273, + "step": 295 + }, + { + "epoch": 0.4736, + "grad_norm": 0.3628078110971893, + "learning_rate": 0.00011343800423027582, + "loss": 0.7352, + "step": 296 + }, + { + "epoch": 0.4752, + "grad_norm": 0.40435074788644143, + "learning_rate": 0.0001129241134155949, + "loss": 0.8142, + "step": 297 + }, + { + "epoch": 0.4768, + "grad_norm": 0.3992911802650674, + "learning_rate": 0.00011240987526132594, + "loss": 0.746, + "step": 298 + }, + { + "epoch": 0.4784, + "grad_norm": 0.35279408055295663, + "learning_rate": 0.00011189530358778005, + "loss": 0.7018, + "step": 299 + }, + { + "epoch": 0.48, + "grad_norm": 0.38884407535041665, + "learning_rate": 0.00011138041222423177, + "loss": 0.7411, + "step": 300 + }, + { + "epoch": 0.4816, + "grad_norm": 0.3658038746256643, + "learning_rate": 0.00011086521500854745, + "loss": 0.7154, + "step": 301 + }, + { + "epoch": 0.4832, + "grad_norm": 0.3958013197872446, + "learning_rate": 0.00011034972578681338, + "loss": 0.789, + "step": 302 + }, + { + "epoch": 0.4848, + "grad_norm": 0.38465550758524736, + "learning_rate": 0.00010983395841296348, + "loss": 0.7832, + "step": 303 + }, + { + "epoch": 0.4864, + "grad_norm": 0.38273579054099477, + "learning_rate": 0.00010931792674840718, + "loss": 0.7501, + "step": 304 + }, + { + "epoch": 0.488, + "grad_norm": 0.4766554804629017, + "learning_rate": 0.00010880164466165674, + "loss": 0.8083, + "step": 305 + }, + { + "epoch": 0.4896, + "grad_norm": 0.4507399064891884, + "learning_rate": 0.00010828512602795462, + "loss": 0.7602, + "step": 306 + }, + { + "epoch": 0.4912, + "grad_norm": 0.39948409245215666, + "learning_rate": 0.00010776838472890065, + "loss": 0.7124, + "step": 307 + }, + { + "epoch": 0.4928, + "grad_norm": 0.36978831021665176, + "learning_rate": 0.00010725143465207867, + "loss": 0.7425, + "step": 308 + }, + { + "epoch": 0.4944, + "grad_norm": 0.3596715994910583, + "learning_rate": 0.00010673428969068364, + "loss": 0.6993, + "step": 309 + }, + { + "epoch": 0.496, + "grad_norm": 1.0129120421227351, + "learning_rate": 0.00010621696374314807, + "loss": 0.7103, + "step": 310 + }, + { + "epoch": 0.4976, + "grad_norm": 0.37404697027334083, + "learning_rate": 0.00010569947071276847, + "loss": 0.67, + "step": 311 + }, + { + "epoch": 0.4992, + "grad_norm": 0.4063613452265822, + "learning_rate": 0.00010518182450733186, + "loss": 0.7125, + "step": 312 + }, + { + "epoch": 0.5008, + "grad_norm": 0.4311622529148588, + "learning_rate": 0.00010466403903874176, + "loss": 0.805, + "step": 313 + }, + { + "epoch": 0.5024, + "grad_norm": 0.4002862057086655, + "learning_rate": 0.00010414612822264455, + "loss": 0.7189, + "step": 314 + }, + { + "epoch": 0.504, + "grad_norm": 0.35897131829240503, + "learning_rate": 0.00010362810597805526, + "loss": 0.7231, + "step": 315 + }, + { + "epoch": 0.5056, + "grad_norm": 0.3840002215719583, + "learning_rate": 0.0001031099862269837, + "loss": 0.7237, + "step": 316 + }, + { + "epoch": 0.5072, + "grad_norm": 0.3915382457016555, + "learning_rate": 0.00010259178289406011, + "loss": 0.7846, + "step": 317 + }, + { + "epoch": 0.5088, + "grad_norm": 0.4264033461354839, + "learning_rate": 0.00010207350990616107, + "loss": 0.8394, + "step": 318 + }, + { + "epoch": 0.5104, + "grad_norm": 0.39704423994334814, + "learning_rate": 0.0001015551811920351, + "loss": 0.7571, + "step": 319 + }, + { + "epoch": 0.512, + "grad_norm": 0.3937382052302273, + "learning_rate": 0.00010103681068192845, + "loss": 0.6856, + "step": 320 + }, + { + "epoch": 0.5136, + "grad_norm": 0.4367566088275942, + "learning_rate": 0.00010051841230721065, + "loss": 0.7608, + "step": 321 + }, + { + "epoch": 0.5152, + "grad_norm": 0.40495694712817476, + "learning_rate": 0.0001, + "loss": 0.7726, + "step": 322 + }, + { + "epoch": 0.5168, + "grad_norm": 0.38098392436937367, + "learning_rate": 9.948158769278939e-05, + "loss": 0.7083, + "step": 323 + }, + { + "epoch": 0.5184, + "grad_norm": 0.38733430226751053, + "learning_rate": 9.896318931807155e-05, + "loss": 0.7439, + "step": 324 + }, + { + "epoch": 0.52, + "grad_norm": 0.44064750233715166, + "learning_rate": 9.844481880796491e-05, + "loss": 0.7715, + "step": 325 + }, + { + "epoch": 0.5216, + "grad_norm": 0.35230373552238536, + "learning_rate": 9.792649009383899e-05, + "loss": 0.6796, + "step": 326 + }, + { + "epoch": 0.5232, + "grad_norm": 0.4800720914552111, + "learning_rate": 9.740821710593989e-05, + "loss": 0.8364, + "step": 327 + }, + { + "epoch": 0.5248, + "grad_norm": 0.3805193230557943, + "learning_rate": 9.689001377301633e-05, + "loss": 0.7104, + "step": 328 + }, + { + "epoch": 0.5264, + "grad_norm": 0.5085183994179607, + "learning_rate": 9.637189402194476e-05, + "loss": 0.8648, + "step": 329 + }, + { + "epoch": 0.528, + "grad_norm": 0.37929461103153767, + "learning_rate": 9.585387177735547e-05, + "loss": 0.7171, + "step": 330 + }, + { + "epoch": 0.5296, + "grad_norm": 0.3800242130516676, + "learning_rate": 9.533596096125825e-05, + "loss": 0.7054, + "step": 331 + }, + { + "epoch": 0.5312, + "grad_norm": 0.4336901328153918, + "learning_rate": 9.481817549266817e-05, + "loss": 0.783, + "step": 332 + }, + { + "epoch": 0.5328, + "grad_norm": 0.3868825632199866, + "learning_rate": 9.430052928723153e-05, + "loss": 0.7272, + "step": 333 + }, + { + "epoch": 0.5344, + "grad_norm": 0.35573721294798893, + "learning_rate": 9.378303625685195e-05, + "loss": 0.6274, + "step": 334 + }, + { + "epoch": 0.536, + "grad_norm": 0.3877579169958648, + "learning_rate": 9.326571030931637e-05, + "loss": 0.715, + "step": 335 + }, + { + "epoch": 0.5376, + "grad_norm": 0.45314018920459403, + "learning_rate": 9.274856534792138e-05, + "loss": 0.7513, + "step": 336 + }, + { + "epoch": 0.5392, + "grad_norm": 0.38336563912064303, + "learning_rate": 9.223161527109937e-05, + "loss": 0.7191, + "step": 337 + }, + { + "epoch": 0.5408, + "grad_norm": 0.400980502516487, + "learning_rate": 9.171487397204539e-05, + "loss": 0.7804, + "step": 338 + }, + { + "epoch": 0.5424, + "grad_norm": 0.39925637895047644, + "learning_rate": 9.119835533834331e-05, + "loss": 0.7971, + "step": 339 + }, + { + "epoch": 0.544, + "grad_norm": 0.3841983259664286, + "learning_rate": 9.068207325159284e-05, + "loss": 0.663, + "step": 340 + }, + { + "epoch": 0.5456, + "grad_norm": 0.36287443046027357, + "learning_rate": 9.016604158703654e-05, + "loss": 0.7876, + "step": 341 + }, + { + "epoch": 0.5472, + "grad_norm": 0.5084040575959375, + "learning_rate": 8.965027421318665e-05, + "loss": 0.7187, + "step": 342 + }, + { + "epoch": 0.5488, + "grad_norm": 0.33765714798202046, + "learning_rate": 8.913478499145254e-05, + "loss": 0.6758, + "step": 343 + }, + { + "epoch": 0.5504, + "grad_norm": 0.4156605117410061, + "learning_rate": 8.861958777576827e-05, + "loss": 0.7416, + "step": 344 + }, + { + "epoch": 0.552, + "grad_norm": 0.3873188462882457, + "learning_rate": 8.810469641222001e-05, + "loss": 0.7115, + "step": 345 + }, + { + "epoch": 0.5536, + "grad_norm": 0.46084313761255724, + "learning_rate": 8.759012473867407e-05, + "loss": 0.7266, + "step": 346 + }, + { + "epoch": 0.5552, + "grad_norm": 0.3760503391250545, + "learning_rate": 8.707588658440511e-05, + "loss": 0.7615, + "step": 347 + }, + { + "epoch": 0.5568, + "grad_norm": 0.4141877976256147, + "learning_rate": 8.656199576972423e-05, + "loss": 0.7584, + "step": 348 + }, + { + "epoch": 0.5584, + "grad_norm": 0.39944168824261084, + "learning_rate": 8.604846610560771e-05, + "loss": 0.7102, + "step": 349 + }, + { + "epoch": 0.56, + "grad_norm": 0.36133519899954275, + "learning_rate": 8.553531139332582e-05, + "loss": 0.7132, + "step": 350 + }, + { + "epoch": 0.5616, + "grad_norm": 0.3944336816628434, + "learning_rate": 8.502254542407186e-05, + "loss": 0.7597, + "step": 351 + }, + { + "epoch": 0.5632, + "grad_norm": 0.4129981634077143, + "learning_rate": 8.451018197859153e-05, + "loss": 0.6973, + "step": 352 + }, + { + "epoch": 0.5648, + "grad_norm": 0.3980172989191709, + "learning_rate": 8.399823482681262e-05, + "loss": 0.7344, + "step": 353 + }, + { + "epoch": 0.5664, + "grad_norm": 0.4187340354016876, + "learning_rate": 8.348671772747487e-05, + "loss": 0.7642, + "step": 354 + }, + { + "epoch": 0.568, + "grad_norm": 0.43473077350523287, + "learning_rate": 8.297564442776014e-05, + "loss": 0.7718, + "step": 355 + }, + { + "epoch": 0.5696, + "grad_norm": 0.42056656040527896, + "learning_rate": 8.246502866292324e-05, + "loss": 0.7257, + "step": 356 + }, + { + "epoch": 0.5712, + "grad_norm": 0.4183719668101969, + "learning_rate": 8.195488415592238e-05, + "loss": 0.701, + "step": 357 + }, + { + "epoch": 0.5728, + "grad_norm": 0.3645065233849404, + "learning_rate": 8.144522461705067e-05, + "loss": 0.7366, + "step": 358 + }, + { + "epoch": 0.5744, + "grad_norm": 0.4036044636169178, + "learning_rate": 8.093606374356759e-05, + "loss": 0.7955, + "step": 359 + }, + { + "epoch": 0.576, + "grad_norm": 0.37323141840597074, + "learning_rate": 8.042741521933071e-05, + "loss": 0.6729, + "step": 360 + }, + { + "epoch": 0.5776, + "grad_norm": 0.36364003272999235, + "learning_rate": 7.991929271442817e-05, + "loss": 0.6943, + "step": 361 + }, + { + "epoch": 0.5792, + "grad_norm": 0.372790405258328, + "learning_rate": 7.941170988481108e-05, + "loss": 0.7192, + "step": 362 + }, + { + "epoch": 0.5808, + "grad_norm": 0.39078633590371337, + "learning_rate": 7.89046803719267e-05, + "loss": 0.7936, + "step": 363 + }, + { + "epoch": 0.5824, + "grad_norm": 0.40759819985709317, + "learning_rate": 7.839821780235168e-05, + "loss": 0.75, + "step": 364 + }, + { + "epoch": 0.584, + "grad_norm": 0.3533679152001031, + "learning_rate": 7.789233578742582e-05, + "loss": 0.6894, + "step": 365 + }, + { + "epoch": 0.5856, + "grad_norm": 0.4035765518667972, + "learning_rate": 7.738704792288655e-05, + "loss": 0.7355, + "step": 366 + }, + { + "epoch": 0.5872, + "grad_norm": 0.4178837302576139, + "learning_rate": 7.688236778850306e-05, + "loss": 0.7373, + "step": 367 + }, + { + "epoch": 0.5888, + "grad_norm": 0.47558833242507675, + "learning_rate": 7.637830894771175e-05, + "loss": 0.763, + "step": 368 + }, + { + "epoch": 0.5904, + "grad_norm": 0.46500805545229223, + "learning_rate": 7.587488494725157e-05, + "loss": 0.8097, + "step": 369 + }, + { + "epoch": 0.592, + "grad_norm": 0.46002421900473145, + "learning_rate": 7.537210931679987e-05, + "loss": 0.875, + "step": 370 + }, + { + "epoch": 0.5936, + "grad_norm": 0.40789734231352603, + "learning_rate": 7.48699955686089e-05, + "loss": 0.7226, + "step": 371 + }, + { + "epoch": 0.5952, + "grad_norm": 0.4113445916236749, + "learning_rate": 7.43685571971426e-05, + "loss": 0.7658, + "step": 372 + }, + { + "epoch": 0.5968, + "grad_norm": 0.4174838902603154, + "learning_rate": 7.386780767871397e-05, + "loss": 0.7696, + "step": 373 + }, + { + "epoch": 0.5984, + "grad_norm": 0.41789225079157627, + "learning_rate": 7.336776047112276e-05, + "loss": 0.7765, + "step": 374 + }, + { + "epoch": 0.6, + "grad_norm": 0.39444238581947505, + "learning_rate": 7.286842901329412e-05, + "loss": 0.806, + "step": 375 + }, + { + "epoch": 0.6016, + "grad_norm": 0.3596258677892891, + "learning_rate": 7.236982672491698e-05, + "loss": 0.7327, + "step": 376 + }, + { + "epoch": 0.6032, + "grad_norm": 0.40451250926454907, + "learning_rate": 7.187196700608373e-05, + "loss": 0.7671, + "step": 377 + }, + { + "epoch": 0.6048, + "grad_norm": 0.37260153134251817, + "learning_rate": 7.137486323692995e-05, + "loss": 0.7026, + "step": 378 + }, + { + "epoch": 0.6064, + "grad_norm": 0.36730061022340127, + "learning_rate": 7.087852877727481e-05, + "loss": 0.7141, + "step": 379 + }, + { + "epoch": 0.608, + "grad_norm": 0.37221569937989507, + "learning_rate": 7.038297696626206e-05, + "loss": 0.7274, + "step": 380 + }, + { + "epoch": 0.6096, + "grad_norm": 0.5459296283985436, + "learning_rate": 6.988822112200156e-05, + "loss": 0.7002, + "step": 381 + }, + { + "epoch": 0.6112, + "grad_norm": 0.38125020228424533, + "learning_rate": 6.939427454121128e-05, + "loss": 0.6742, + "step": 382 + }, + { + "epoch": 0.6128, + "grad_norm": 0.4448395502917699, + "learning_rate": 6.890115049885994e-05, + "loss": 0.7735, + "step": 383 + }, + { + "epoch": 0.6144, + "grad_norm": 0.3836625050535906, + "learning_rate": 6.84088622478104e-05, + "loss": 0.6939, + "step": 384 + }, + { + "epoch": 0.616, + "grad_norm": 0.41170316536268653, + "learning_rate": 6.791742301846326e-05, + "loss": 0.7362, + "step": 385 + }, + { + "epoch": 0.6176, + "grad_norm": 0.3720648756073222, + "learning_rate": 6.742684601840141e-05, + "loss": 0.7185, + "step": 386 + }, + { + "epoch": 0.6192, + "grad_norm": 0.43163359631403647, + "learning_rate": 6.693714443203507e-05, + "loss": 0.8141, + "step": 387 + }, + { + "epoch": 0.6208, + "grad_norm": 0.49092374396968624, + "learning_rate": 6.644833142024751e-05, + "loss": 0.7933, + "step": 388 + }, + { + "epoch": 0.6224, + "grad_norm": 0.41203465556681096, + "learning_rate": 6.59604201200412e-05, + "loss": 0.7825, + "step": 389 + }, + { + "epoch": 0.624, + "grad_norm": 0.34874330169641377, + "learning_rate": 6.547342364418481e-05, + "loss": 0.685, + "step": 390 + }, + { + "epoch": 0.6256, + "grad_norm": 0.37734527159453984, + "learning_rate": 6.498735508086093e-05, + "loss": 0.6724, + "step": 391 + }, + { + "epoch": 0.6272, + "grad_norm": 0.3834950796125026, + "learning_rate": 6.450222749331414e-05, + "loss": 0.6698, + "step": 392 + }, + { + "epoch": 0.6288, + "grad_norm": 0.3758045774328492, + "learning_rate": 6.40180539194999e-05, + "loss": 0.7996, + "step": 393 + }, + { + "epoch": 0.6304, + "grad_norm": 0.3351876872133651, + "learning_rate": 6.35348473717345e-05, + "loss": 0.6819, + "step": 394 + }, + { + "epoch": 0.632, + "grad_norm": 0.42877069393958817, + "learning_rate": 6.305262083634488e-05, + "loss": 0.8079, + "step": 395 + }, + { + "epoch": 0.6336, + "grad_norm": 0.3813152540171064, + "learning_rate": 6.25713872733199e-05, + "loss": 0.7481, + "step": 396 + }, + { + "epoch": 0.6352, + "grad_norm": 0.35937473307719536, + "learning_rate": 6.209115961596208e-05, + "loss": 0.6833, + "step": 397 + }, + { + "epoch": 0.6368, + "grad_norm": 0.39170839948506153, + "learning_rate": 6.161195077053976e-05, + "loss": 0.7506, + "step": 398 + }, + { + "epoch": 0.6384, + "grad_norm": 0.3845078779013667, + "learning_rate": 6.113377361594049e-05, + "loss": 0.7419, + "step": 399 + }, + { + "epoch": 0.64, + "grad_norm": 0.38223550799166633, + "learning_rate": 6.065664100332478e-05, + "loss": 0.7592, + "step": 400 + }, + { + "epoch": 0.6416, + "grad_norm": 0.3483973783122578, + "learning_rate": 6.018056575578075e-05, + "loss": 0.6416, + "step": 401 + }, + { + "epoch": 0.6432, + "grad_norm": 0.355210671910389, + "learning_rate": 5.970556066797941e-05, + "loss": 0.7059, + "step": 402 + }, + { + "epoch": 0.6448, + "grad_norm": 0.3394912634882092, + "learning_rate": 5.923163850583113e-05, + "loss": 0.6609, + "step": 403 + }, + { + "epoch": 0.6464, + "grad_norm": 0.34382031484106235, + "learning_rate": 5.875881200614207e-05, + "loss": 0.6563, + "step": 404 + }, + { + "epoch": 0.648, + "grad_norm": 0.3887889453512105, + "learning_rate": 5.828709387627218e-05, + "loss": 0.7017, + "step": 405 + }, + { + "epoch": 0.6496, + "grad_norm": 0.472045409910418, + "learning_rate": 5.781649679379378e-05, + "loss": 0.7268, + "step": 406 + }, + { + "epoch": 0.6512, + "grad_norm": 0.442285904294921, + "learning_rate": 5.73470334061505e-05, + "loss": 0.8207, + "step": 407 + }, + { + "epoch": 0.6528, + "grad_norm": 0.3741632001636999, + "learning_rate": 5.687871633031754e-05, + "loss": 0.7306, + "step": 408 + }, + { + "epoch": 0.6544, + "grad_norm": 0.3359492821294262, + "learning_rate": 5.6411558152462894e-05, + "loss": 0.642, + "step": 409 + }, + { + "epoch": 0.656, + "grad_norm": 0.3792909475977315, + "learning_rate": 5.5945571427608526e-05, + "loss": 0.6988, + "step": 410 + }, + { + "epoch": 0.6576, + "grad_norm": 0.4091566828975077, + "learning_rate": 5.54807686792933e-05, + "loss": 0.7251, + "step": 411 + }, + { + "epoch": 0.6592, + "grad_norm": 0.3853490829736871, + "learning_rate": 5.501716239923642e-05, + "loss": 0.7023, + "step": 412 + }, + { + "epoch": 0.6608, + "grad_norm": 0.3625820930850993, + "learning_rate": 5.4554765047001613e-05, + "loss": 0.6429, + "step": 413 + }, + { + "epoch": 0.6624, + "grad_norm": 0.4418338332757277, + "learning_rate": 5.4093589049662175e-05, + "loss": 0.7317, + "step": 414 + }, + { + "epoch": 0.664, + "grad_norm": 0.4224791555196694, + "learning_rate": 5.363364680146725e-05, + "loss": 0.7059, + "step": 415 + }, + { + "epoch": 0.6656, + "grad_norm": 0.4038881731645075, + "learning_rate": 5.31749506635086e-05, + "loss": 0.7844, + "step": 416 + }, + { + "epoch": 0.6672, + "grad_norm": 0.3976998105956467, + "learning_rate": 5.271751296338823e-05, + "loss": 0.6983, + "step": 417 + }, + { + "epoch": 0.6688, + "grad_norm": 0.4061098778627193, + "learning_rate": 5.226134599488728e-05, + "loss": 0.7284, + "step": 418 + }, + { + "epoch": 0.6704, + "grad_norm": 0.35198021454890427, + "learning_rate": 5.180646201763577e-05, + "loss": 0.6564, + "step": 419 + }, + { + "epoch": 0.672, + "grad_norm": 0.3909180160479183, + "learning_rate": 5.135287325678271e-05, + "loss": 0.6891, + "step": 420 + }, + { + "epoch": 0.6736, + "grad_norm": 0.37386668018924474, + "learning_rate": 5.090059190266779e-05, + "loss": 0.7024, + "step": 421 + }, + { + "epoch": 0.6752, + "grad_norm": 0.36816918010672717, + "learning_rate": 5.0449630110493836e-05, + "loss": 0.7013, + "step": 422 + }, + { + "epoch": 0.6768, + "grad_norm": 0.44561999090184584, + "learning_rate": 5.000000000000002e-05, + "loss": 0.7692, + "step": 423 + }, + { + "epoch": 0.6784, + "grad_norm": 0.38911370515307514, + "learning_rate": 4.955171365513603e-05, + "loss": 0.6697, + "step": 424 + }, + { + "epoch": 0.68, + "grad_norm": 0.3765715958260219, + "learning_rate": 4.9104783123737566e-05, + "loss": 0.6921, + "step": 425 + }, + { + "epoch": 0.6816, + "grad_norm": 0.38011344689911836, + "learning_rate": 4.865922041720239e-05, + "loss": 0.7282, + "step": 426 + }, + { + "epoch": 0.6832, + "grad_norm": 0.4167767279838049, + "learning_rate": 4.821503751016746e-05, + "loss": 0.7097, + "step": 427 + }, + { + "epoch": 0.6848, + "grad_norm": 0.3756370256031805, + "learning_rate": 4.777224634018732e-05, + "loss": 0.7363, + "step": 428 + }, + { + "epoch": 0.6864, + "grad_norm": 0.3493406120688842, + "learning_rate": 4.733085880741301e-05, + "loss": 0.6947, + "step": 429 + }, + { + "epoch": 0.688, + "grad_norm": 0.4134756820671265, + "learning_rate": 4.689088677427249e-05, + "loss": 0.7831, + "step": 430 + }, + { + "epoch": 0.6896, + "grad_norm": 0.3595003574635689, + "learning_rate": 4.645234206515171e-05, + "loss": 0.7227, + "step": 431 + }, + { + "epoch": 0.6912, + "grad_norm": 0.36324495205571183, + "learning_rate": 4.6015236466076747e-05, + "loss": 0.6952, + "step": 432 + }, + { + "epoch": 0.6928, + "grad_norm": 0.411860655301704, + "learning_rate": 4.5579581724397255e-05, + "loss": 0.8135, + "step": 433 + }, + { + "epoch": 0.6944, + "grad_norm": 0.45684704308654817, + "learning_rate": 4.514538954847064e-05, + "loss": 0.7334, + "step": 434 + }, + { + "epoch": 0.696, + "grad_norm": 0.38234672207344367, + "learning_rate": 4.471267160734731e-05, + "loss": 0.7387, + "step": 435 + }, + { + "epoch": 0.6976, + "grad_norm": 0.3999340583677459, + "learning_rate": 4.428143953045717e-05, + "loss": 0.6767, + "step": 436 + }, + { + "epoch": 0.6992, + "grad_norm": 0.41152430409945573, + "learning_rate": 4.385170490729712e-05, + "loss": 0.7795, + "step": 437 + }, + { + "epoch": 0.7008, + "grad_norm": 0.4038304045299731, + "learning_rate": 4.342347928711953e-05, + "loss": 0.7319, + "step": 438 + }, + { + "epoch": 0.7024, + "grad_norm": 0.40001256015158243, + "learning_rate": 4.2996774178621736e-05, + "loss": 0.7277, + "step": 439 + }, + { + "epoch": 0.704, + "grad_norm": 0.36390171030656865, + "learning_rate": 4.257160104963696e-05, + "loss": 0.7553, + "step": 440 + }, + { + "epoch": 0.7056, + "grad_norm": 0.41043851211960763, + "learning_rate": 4.2147971326825966e-05, + "loss": 0.7441, + "step": 441 + }, + { + "epoch": 0.7072, + "grad_norm": 0.40917804053191087, + "learning_rate": 4.172589639536991e-05, + "loss": 0.7092, + "step": 442 + }, + { + "epoch": 0.7088, + "grad_norm": 0.3883299801449483, + "learning_rate": 4.130538759866457e-05, + "loss": 0.6696, + "step": 443 + }, + { + "epoch": 0.7104, + "grad_norm": 1.0963461420450433, + "learning_rate": 4.088645623801534e-05, + "loss": 0.6994, + "step": 444 + }, + { + "epoch": 0.712, + "grad_norm": 0.3568816293560058, + "learning_rate": 4.046911357233343e-05, + "loss": 0.6823, + "step": 445 + }, + { + "epoch": 0.7136, + "grad_norm": 0.4023538231843036, + "learning_rate": 4.00533708178334e-05, + "loss": 0.7629, + "step": 446 + }, + { + "epoch": 0.7152, + "grad_norm": 0.40436434352028394, + "learning_rate": 3.963923914773187e-05, + "loss": 0.7225, + "step": 447 + }, + { + "epoch": 0.7168, + "grad_norm": 0.3878036814885285, + "learning_rate": 3.922672969194686e-05, + "loss": 0.7552, + "step": 448 + }, + { + "epoch": 0.7184, + "grad_norm": 0.3631674545513451, + "learning_rate": 3.8815853536798904e-05, + "loss": 0.6771, + "step": 449 + }, + { + "epoch": 0.72, + "grad_norm": 0.35252670588368107, + "learning_rate": 3.840662172471315e-05, + "loss": 0.5971, + "step": 450 + }, + { + "epoch": 0.7216, + "grad_norm": 0.3712317861407566, + "learning_rate": 3.79990452539225e-05, + "loss": 0.7034, + "step": 451 + }, + { + "epoch": 0.7232, + "grad_norm": 0.39408335238973335, + "learning_rate": 3.759313507817196e-05, + "loss": 0.7398, + "step": 452 + }, + { + "epoch": 0.7248, + "grad_norm": 0.3532409220108127, + "learning_rate": 3.7188902106424416e-05, + "loss": 0.6397, + "step": 453 + }, + { + "epoch": 0.7264, + "grad_norm": 0.36451098268841636, + "learning_rate": 3.678635720256737e-05, + "loss": 0.7524, + "step": 454 + }, + { + "epoch": 0.728, + "grad_norm": 0.40477922284122053, + "learning_rate": 3.638551118512089e-05, + "loss": 0.7629, + "step": 455 + }, + { + "epoch": 0.7296, + "grad_norm": 0.40274299719829526, + "learning_rate": 3.5986374826947066e-05, + "loss": 0.7409, + "step": 456 + }, + { + "epoch": 0.7312, + "grad_norm": 0.3909281065215058, + "learning_rate": 3.558895885496023e-05, + "loss": 0.7027, + "step": 457 + }, + { + "epoch": 0.7328, + "grad_norm": 0.4245347694176414, + "learning_rate": 3.519327394983888e-05, + "loss": 0.7496, + "step": 458 + }, + { + "epoch": 0.7344, + "grad_norm": 0.37669925736047244, + "learning_rate": 3.479933074573858e-05, + "loss": 0.6715, + "step": 459 + }, + { + "epoch": 0.736, + "grad_norm": 0.44194636784462143, + "learning_rate": 3.440713983000601e-05, + "loss": 0.718, + "step": 460 + }, + { + "epoch": 0.7376, + "grad_norm": 0.33576723517865975, + "learning_rate": 3.401671174289469e-05, + "loss": 0.7073, + "step": 461 + }, + { + "epoch": 0.7392, + "grad_norm": 0.3526403484194062, + "learning_rate": 3.362805697728145e-05, + "loss": 0.6737, + "step": 462 + }, + { + "epoch": 0.7408, + "grad_norm": 0.40613972281563926, + "learning_rate": 3.324118597838464e-05, + "loss": 0.6804, + "step": 463 + }, + { + "epoch": 0.7424, + "grad_norm": 0.3652644263225935, + "learning_rate": 3.285610914348332e-05, + "loss": 0.6482, + "step": 464 + }, + { + "epoch": 0.744, + "grad_norm": 0.44840540807785684, + "learning_rate": 3.2472836821637744e-05, + "loss": 0.7368, + "step": 465 + }, + { + "epoch": 0.7456, + "grad_norm": 0.37569432333962294, + "learning_rate": 3.209137931341143e-05, + "loss": 0.6399, + "step": 466 + }, + { + "epoch": 0.7472, + "grad_norm": 0.41847764928324893, + "learning_rate": 3.1711746870594086e-05, + "loss": 0.7856, + "step": 467 + }, + { + "epoch": 0.7488, + "grad_norm": 0.3913727858664601, + "learning_rate": 3.1333949695926324e-05, + "loss": 0.7074, + "step": 468 + }, + { + "epoch": 0.7504, + "grad_norm": 0.36295695446323223, + "learning_rate": 3.0957997942825336e-05, + "loss": 0.6346, + "step": 469 + }, + { + "epoch": 0.752, + "grad_norm": 0.37023749610856527, + "learning_rate": 3.058390171511196e-05, + "loss": 0.6911, + "step": 470 + }, + { + "epoch": 0.7536, + "grad_norm": 0.4668246705267103, + "learning_rate": 3.021167106673928e-05, + "loss": 0.7567, + "step": 471 + }, + { + "epoch": 0.7552, + "grad_norm": 0.3715919476936774, + "learning_rate": 2.9841316001522347e-05, + "loss": 0.6404, + "step": 472 + }, + { + "epoch": 0.7568, + "grad_norm": 0.35973638741996034, + "learning_rate": 2.9472846472869298e-05, + "loss": 0.662, + "step": 473 + }, + { + "epoch": 0.7584, + "grad_norm": 0.36509820892589967, + "learning_rate": 2.9106272383513835e-05, + "loss": 0.6892, + "step": 474 + }, + { + "epoch": 0.76, + "grad_norm": 0.46598374458522496, + "learning_rate": 2.874160358524931e-05, + "loss": 0.7504, + "step": 475 + }, + { + "epoch": 0.7616, + "grad_norm": 0.42417348938962923, + "learning_rate": 2.8378849878663628e-05, + "loss": 0.7465, + "step": 476 + }, + { + "epoch": 0.7632, + "grad_norm": 0.3502086934855863, + "learning_rate": 2.8018021012875994e-05, + "loss": 0.5907, + "step": 477 + }, + { + "epoch": 0.7648, + "grad_norm": 0.43110347939361826, + "learning_rate": 2.7659126685275027e-05, + "loss": 0.7651, + "step": 478 + }, + { + "epoch": 0.7664, + "grad_norm": 0.39398513650777234, + "learning_rate": 2.7302176541257986e-05, + "loss": 0.7541, + "step": 479 + }, + { + "epoch": 0.768, + "grad_norm": 0.3770624916644866, + "learning_rate": 2.6947180173971508e-05, + "loss": 0.7266, + "step": 480 + }, + { + "epoch": 0.7696, + "grad_norm": 0.39499788836012734, + "learning_rate": 2.659414712405398e-05, + "loss": 0.7463, + "step": 481 + }, + { + "epoch": 0.7712, + "grad_norm": 0.3672763525548306, + "learning_rate": 2.6243086879379e-05, + "loss": 0.6447, + "step": 482 + }, + { + "epoch": 0.7728, + "grad_norm": 0.39066711504394686, + "learning_rate": 2.5894008874800325e-05, + "loss": 0.7406, + "step": 483 + }, + { + "epoch": 0.7744, + "grad_norm": 0.43563331580206266, + "learning_rate": 2.5546922491898495e-05, + "loss": 0.7901, + "step": 484 + }, + { + "epoch": 0.776, + "grad_norm": 0.40190328232436373, + "learning_rate": 2.5201837058728505e-05, + "loss": 0.7205, + "step": 485 + }, + { + "epoch": 0.7776, + "grad_norm": 0.42431966071154587, + "learning_rate": 2.485876184956928e-05, + "loss": 0.7547, + "step": 486 + }, + { + "epoch": 0.7792, + "grad_norm": 0.3560331931544561, + "learning_rate": 2.451770608467432e-05, + "loss": 0.6801, + "step": 487 + }, + { + "epoch": 0.7808, + "grad_norm": 0.3377482649836737, + "learning_rate": 2.417867893002387e-05, + "loss": 0.6433, + "step": 488 + }, + { + "epoch": 0.7824, + "grad_norm": 0.4109657873465769, + "learning_rate": 2.3841689497078746e-05, + "loss": 0.8159, + "step": 489 + }, + { + "epoch": 0.784, + "grad_norm": 0.38005015654533436, + "learning_rate": 2.3506746842535242e-05, + "loss": 0.7088, + "step": 490 + }, + { + "epoch": 0.7856, + "grad_norm": 0.5488690593328647, + "learning_rate": 2.3173859968081944e-05, + "loss": 0.7626, + "step": 491 + }, + { + "epoch": 0.7872, + "grad_norm": 0.3962249485454088, + "learning_rate": 2.2843037820157675e-05, + "loss": 0.723, + "step": 492 + }, + { + "epoch": 0.7888, + "grad_norm": 0.419498559865438, + "learning_rate": 2.251428928971102e-05, + "loss": 0.7227, + "step": 493 + }, + { + "epoch": 0.7904, + "grad_norm": 0.41360813064611796, + "learning_rate": 2.2187623211961562e-05, + "loss": 0.7207, + "step": 494 + }, + { + "epoch": 0.792, + "grad_norm": 0.37988686552506207, + "learning_rate": 2.1863048366162208e-05, + "loss": 0.6963, + "step": 495 + }, + { + "epoch": 0.7936, + "grad_norm": 0.5041206543316924, + "learning_rate": 2.1540573475363402e-05, + "loss": 0.8632, + "step": 496 + }, + { + "epoch": 0.7952, + "grad_norm": 0.3612199502341204, + "learning_rate": 2.1220207206178688e-05, + "loss": 0.7003, + "step": 497 + }, + { + "epoch": 0.7968, + "grad_norm": 0.3751721903936928, + "learning_rate": 2.0901958168551638e-05, + "loss": 0.704, + "step": 498 + }, + { + "epoch": 0.7984, + "grad_norm": 0.38764348543483956, + "learning_rate": 2.058583491552465e-05, + "loss": 0.685, + "step": 499 + }, + { + "epoch": 0.8, + "grad_norm": 0.39281159642279473, + "learning_rate": 2.027184594300898e-05, + "loss": 0.7214, + "step": 500 + }, + { + "epoch": 0.8016, + "grad_norm": 0.34209994426755214, + "learning_rate": 1.995999968955641e-05, + "loss": 0.6792, + "step": 501 + }, + { + "epoch": 0.8032, + "grad_norm": 0.40163707214933136, + "learning_rate": 1.9650304536132426e-05, + "loss": 0.7288, + "step": 502 + }, + { + "epoch": 0.8048, + "grad_norm": 0.41573749121963943, + "learning_rate": 1.9342768805891178e-05, + "loss": 0.7534, + "step": 503 + }, + { + "epoch": 0.8064, + "grad_norm": 0.391848400237993, + "learning_rate": 1.903740076395151e-05, + "loss": 0.7321, + "step": 504 + }, + { + "epoch": 0.808, + "grad_norm": 0.43311391958987744, + "learning_rate": 1.8734208617174988e-05, + "loss": 0.7086, + "step": 505 + }, + { + "epoch": 0.8096, + "grad_norm": 0.4536007636519588, + "learning_rate": 1.8433200513945337e-05, + "loss": 0.7244, + "step": 506 + }, + { + "epoch": 0.8112, + "grad_norm": 0.3652130722470363, + "learning_rate": 1.8134384543949478e-05, + "loss": 0.6782, + "step": 507 + }, + { + "epoch": 0.8128, + "grad_norm": 0.31065133138155593, + "learning_rate": 1.783776873795994e-05, + "loss": 0.6423, + "step": 508 + }, + { + "epoch": 0.8144, + "grad_norm": 0.3348317769014843, + "learning_rate": 1.754336106761927e-05, + "loss": 0.611, + "step": 509 + }, + { + "epoch": 0.816, + "grad_norm": 0.3989098217381608, + "learning_rate": 1.7251169445225657e-05, + "loss": 0.7136, + "step": 510 + }, + { + "epoch": 0.8176, + "grad_norm": 0.4126240586209762, + "learning_rate": 1.696120172352025e-05, + "loss": 0.7518, + "step": 511 + }, + { + "epoch": 0.8192, + "grad_norm": 0.36563825916354054, + "learning_rate": 1.6673465695476232e-05, + "loss": 0.6462, + "step": 512 + }, + { + "epoch": 0.8208, + "grad_norm": 0.4101710514975283, + "learning_rate": 1.6387969094089316e-05, + "loss": 0.7773, + "step": 513 + }, + { + "epoch": 0.8224, + "grad_norm": 0.4104198614977016, + "learning_rate": 1.6104719592169902e-05, + "loss": 0.6886, + "step": 514 + }, + { + "epoch": 0.824, + "grad_norm": 0.5832241207784147, + "learning_rate": 1.5823724802136865e-05, + "loss": 0.7153, + "step": 515 + }, + { + "epoch": 0.8256, + "grad_norm": 0.42754824152447896, + "learning_rate": 1.5544992275813053e-05, + "loss": 0.7257, + "step": 516 + }, + { + "epoch": 0.8272, + "grad_norm": 0.36928201720825976, + "learning_rate": 1.526852950422226e-05, + "loss": 0.6139, + "step": 517 + }, + { + "epoch": 0.8288, + "grad_norm": 0.3974920487783144, + "learning_rate": 1.4994343917387854e-05, + "loss": 0.7925, + "step": 518 + }, + { + "epoch": 0.8304, + "grad_norm": 0.3488093554801746, + "learning_rate": 1.4722442884133214e-05, + "loss": 0.67, + "step": 519 + }, + { + "epoch": 0.832, + "grad_norm": 0.4081500958560575, + "learning_rate": 1.4452833711883628e-05, + "loss": 0.7698, + "step": 520 + }, + { + "epoch": 0.8336, + "grad_norm": 0.3510043183393745, + "learning_rate": 1.4185523646469822e-05, + "loss": 0.6505, + "step": 521 + }, + { + "epoch": 0.8352, + "grad_norm": 0.40969858220650185, + "learning_rate": 1.3920519871933424e-05, + "loss": 0.7218, + "step": 522 + }, + { + "epoch": 0.8368, + "grad_norm": 0.3533872161820804, + "learning_rate": 1.3657829510333654e-05, + "loss": 0.7261, + "step": 523 + }, + { + "epoch": 0.8384, + "grad_norm": 0.3693744283782947, + "learning_rate": 1.339745962155613e-05, + "loss": 0.5911, + "step": 524 + }, + { + "epoch": 0.84, + "grad_norm": 0.3768413564202193, + "learning_rate": 1.3139417203123027e-05, + "loss": 0.7112, + "step": 525 + }, + { + "epoch": 0.8416, + "grad_norm": 0.3298457775194949, + "learning_rate": 1.2883709190004955e-05, + "loss": 0.6377, + "step": 526 + }, + { + "epoch": 0.8432, + "grad_norm": 0.38690638394379856, + "learning_rate": 1.263034245443473e-05, + "loss": 0.7077, + "step": 527 + }, + { + "epoch": 0.8448, + "grad_norm": 0.37086823716999645, + "learning_rate": 1.2379323805722576e-05, + "loss": 0.7148, + "step": 528 + }, + { + "epoch": 0.8464, + "grad_norm": 0.42330824488193325, + "learning_rate": 1.2130659990073146e-05, + "loss": 0.7534, + "step": 529 + }, + { + "epoch": 0.848, + "grad_norm": 0.48300690616154035, + "learning_rate": 1.1884357690404158e-05, + "loss": 0.6976, + "step": 530 + }, + { + "epoch": 0.8496, + "grad_norm": 0.3900694708123389, + "learning_rate": 1.1640423526166988e-05, + "loss": 0.6809, + "step": 531 + }, + { + "epoch": 0.8512, + "grad_norm": 0.3784642140178504, + "learning_rate": 1.1398864053168534e-05, + "loss": 0.7073, + "step": 532 + }, + { + "epoch": 0.8528, + "grad_norm": 0.3732175647059916, + "learning_rate": 1.1159685763395111e-05, + "loss": 0.6731, + "step": 533 + }, + { + "epoch": 0.8544, + "grad_norm": 0.3873008012047257, + "learning_rate": 1.0922895084838037e-05, + "loss": 0.7025, + "step": 534 + }, + { + "epoch": 0.856, + "grad_norm": 0.3844093543014457, + "learning_rate": 1.0688498381320855e-05, + "loss": 0.6612, + "step": 535 + }, + { + "epoch": 0.8576, + "grad_norm": 0.41060350831625075, + "learning_rate": 1.045650195232819e-05, + "loss": 0.7457, + "step": 536 + }, + { + "epoch": 0.8592, + "grad_norm": 0.4096525505586372, + "learning_rate": 1.0226912032836611e-05, + "loss": 0.6801, + "step": 537 + }, + { + "epoch": 0.8608, + "grad_norm": 0.4007266962985699, + "learning_rate": 9.999734793146998e-06, + "loss": 0.6904, + "step": 538 + }, + { + "epoch": 0.8624, + "grad_norm": 0.4197327497544567, + "learning_rate": 9.774976338718677e-06, + "loss": 0.7528, + "step": 539 + }, + { + "epoch": 0.864, + "grad_norm": 0.35904867567271265, + "learning_rate": 9.552642710005299e-06, + "loss": 0.7328, + "step": 540 + }, + { + "epoch": 0.8656, + "grad_norm": 0.40276001415498663, + "learning_rate": 9.332739882292752e-06, + "loss": 0.6754, + "step": 541 + }, + { + "epoch": 0.8672, + "grad_norm": 0.3813333734708716, + "learning_rate": 9.115273765538202e-06, + "loss": 0.7141, + "step": 542 + }, + { + "epoch": 0.8688, + "grad_norm": 0.5093889871823375, + "learning_rate": 8.900250204211514e-06, + "loss": 0.6977, + "step": 543 + }, + { + "epoch": 0.8704, + "grad_norm": 0.400132373769689, + "learning_rate": 8.687674977138116e-06, + "loss": 0.6798, + "step": 544 + }, + { + "epoch": 0.872, + "grad_norm": 0.37453588167409657, + "learning_rate": 8.47755379734373e-06, + "loss": 0.6905, + "step": 545 + }, + { + "epoch": 0.8736, + "grad_norm": 0.445413282690777, + "learning_rate": 8.269892311900696e-06, + "loss": 0.8455, + "step": 546 + }, + { + "epoch": 0.8752, + "grad_norm": 0.40467766806413463, + "learning_rate": 8.064696101776358e-06, + "loss": 0.7624, + "step": 547 + }, + { + "epoch": 0.8768, + "grad_norm": 0.37321198097224145, + "learning_rate": 7.861970681683051e-06, + "loss": 0.6554, + "step": 548 + }, + { + "epoch": 0.8784, + "grad_norm": 0.3677412242083649, + "learning_rate": 7.661721499929753e-06, + "loss": 0.6719, + "step": 549 + }, + { + "epoch": 0.88, + "grad_norm": 0.36164040774173506, + "learning_rate": 7.463953938275858e-06, + "loss": 0.6459, + "step": 550 + }, + { + "epoch": 0.8816, + "grad_norm": 0.3650823538229352, + "learning_rate": 7.2686733117863784e-06, + "loss": 0.707, + "step": 551 + }, + { + "epoch": 0.8832, + "grad_norm": 0.3804371610295385, + "learning_rate": 7.07588486868922e-06, + "loss": 0.7594, + "step": 552 + }, + { + "epoch": 0.8848, + "grad_norm": 0.44106616411053606, + "learning_rate": 6.8855937902340576e-06, + "loss": 0.6696, + "step": 553 + }, + { + "epoch": 0.8864, + "grad_norm": 0.3324267407818426, + "learning_rate": 6.6978051905530855e-06, + "loss": 0.6161, + "step": 554 + }, + { + "epoch": 0.888, + "grad_norm": 0.37600498420808864, + "learning_rate": 6.512524116523633e-06, + "loss": 0.6506, + "step": 555 + }, + { + "epoch": 0.8896, + "grad_norm": 0.39884051843359386, + "learning_rate": 6.329755547632499e-06, + "loss": 0.7362, + "step": 556 + }, + { + "epoch": 0.8912, + "grad_norm": 0.366487353854839, + "learning_rate": 6.149504395842087e-06, + "loss": 0.6605, + "step": 557 + }, + { + "epoch": 0.8928, + "grad_norm": 0.36411247167166455, + "learning_rate": 5.971775505458444e-06, + "loss": 0.6871, + "step": 558 + }, + { + "epoch": 0.8944, + "grad_norm": 0.4065998256016423, + "learning_rate": 5.7965736530010916e-06, + "loss": 0.7495, + "step": 559 + }, + { + "epoch": 0.896, + "grad_norm": 0.41231735691420857, + "learning_rate": 5.623903547074549e-06, + "loss": 0.6689, + "step": 560 + }, + { + "epoch": 0.8976, + "grad_norm": 0.3854212616522437, + "learning_rate": 5.453769828241872e-06, + "loss": 0.7021, + "step": 561 + }, + { + "epoch": 0.8992, + "grad_norm": 0.3392210742716694, + "learning_rate": 5.286177068899989e-06, + "loss": 0.6756, + "step": 562 + }, + { + "epoch": 0.9008, + "grad_norm": 0.35168297042146535, + "learning_rate": 5.121129773156663e-06, + "loss": 0.627, + "step": 563 + }, + { + "epoch": 0.9024, + "grad_norm": 0.38103576987056464, + "learning_rate": 4.95863237670956e-06, + "loss": 0.6984, + "step": 564 + }, + { + "epoch": 0.904, + "grad_norm": 0.35298484224758003, + "learning_rate": 4.798689246727006e-06, + "loss": 0.6598, + "step": 565 + }, + { + "epoch": 0.9056, + "grad_norm": 0.41249906064070707, + "learning_rate": 4.641304681730641e-06, + "loss": 0.7793, + "step": 566 + }, + { + "epoch": 0.9072, + "grad_norm": 0.40212432590182495, + "learning_rate": 4.486482911479839e-06, + "loss": 0.6987, + "step": 567 + }, + { + "epoch": 0.9088, + "grad_norm": 0.3287284941233722, + "learning_rate": 4.3342280968580285e-06, + "loss": 0.6385, + "step": 568 + }, + { + "epoch": 0.9104, + "grad_norm": 0.41328744455408084, + "learning_rate": 4.184544329761009e-06, + "loss": 0.6854, + "step": 569 + }, + { + "epoch": 0.912, + "grad_norm": 0.3820446954303476, + "learning_rate": 4.037435632986786e-06, + "loss": 0.7051, + "step": 570 + }, + { + "epoch": 0.9136, + "grad_norm": 0.40956797346664336, + "learning_rate": 3.892905960127546e-06, + "loss": 0.7485, + "step": 571 + }, + { + "epoch": 0.9152, + "grad_norm": 0.3682969904631875, + "learning_rate": 3.750959195463466e-06, + "loss": 0.6781, + "step": 572 + }, + { + "epoch": 0.9168, + "grad_norm": 0.4012523688681302, + "learning_rate": 3.611599153858214e-06, + "loss": 0.717, + "step": 573 + }, + { + "epoch": 0.9184, + "grad_norm": 0.34306295120439945, + "learning_rate": 3.4748295806564356e-06, + "loss": 0.6434, + "step": 574 + }, + { + "epoch": 0.92, + "grad_norm": 0.41628562886755166, + "learning_rate": 3.3406541515832003e-06, + "loss": 0.7294, + "step": 575 + }, + { + "epoch": 0.9216, + "grad_norm": 0.4366815833874182, + "learning_rate": 3.209076472645112e-06, + "loss": 0.7411, + "step": 576 + }, + { + "epoch": 0.9232, + "grad_norm": 0.34382043328844686, + "learning_rate": 3.0801000800333877e-06, + "loss": 0.6047, + "step": 577 + }, + { + "epoch": 0.9248, + "grad_norm": 0.42835073132185475, + "learning_rate": 2.9537284400289355e-06, + "loss": 0.7502, + "step": 578 + }, + { + "epoch": 0.9264, + "grad_norm": 0.37914174525666405, + "learning_rate": 2.8299649489090475e-06, + "loss": 0.6901, + "step": 579 + }, + { + "epoch": 0.928, + "grad_norm": 0.37645121302077367, + "learning_rate": 2.708812932856253e-06, + "loss": 0.7065, + "step": 580 + }, + { + "epoch": 0.9296, + "grad_norm": 0.38088786213147147, + "learning_rate": 2.590275647868867e-06, + "loss": 0.7029, + "step": 581 + }, + { + "epoch": 0.9312, + "grad_norm": 0.39503162545070064, + "learning_rate": 2.4743562796734622e-06, + "loss": 0.7145, + "step": 582 + }, + { + "epoch": 0.9328, + "grad_norm": 0.3905990159592029, + "learning_rate": 2.3610579436393e-06, + "loss": 0.7125, + "step": 583 + }, + { + "epoch": 0.9344, + "grad_norm": 0.4200098993675833, + "learning_rate": 2.250383684694579e-06, + "loss": 0.7683, + "step": 584 + }, + { + "epoch": 0.936, + "grad_norm": 0.4029202613775148, + "learning_rate": 2.1423364772445887e-06, + "loss": 0.7574, + "step": 585 + }, + { + "epoch": 0.9376, + "grad_norm": 0.40597596536313096, + "learning_rate": 2.036919225091827e-06, + "loss": 0.7061, + "step": 586 + }, + { + "epoch": 0.9392, + "grad_norm": 0.4895979686581715, + "learning_rate": 1.9341347613579087e-06, + "loss": 0.756, + "step": 587 + }, + { + "epoch": 0.9408, + "grad_norm": 0.35978484679119643, + "learning_rate": 1.8339858484073935e-06, + "loss": 0.7009, + "step": 588 + }, + { + "epoch": 0.9424, + "grad_norm": 0.33757437669375695, + "learning_rate": 1.7364751777736332e-06, + "loss": 0.6475, + "step": 589 + }, + { + "epoch": 0.944, + "grad_norm": 0.3608122154310713, + "learning_rate": 1.6416053700863964e-06, + "loss": 0.6549, + "step": 590 + }, + { + "epoch": 0.9456, + "grad_norm": 0.33851915578617664, + "learning_rate": 1.5493789750014031e-06, + "loss": 0.6751, + "step": 591 + }, + { + "epoch": 0.9472, + "grad_norm": 0.393001219191284, + "learning_rate": 1.459798471131868e-06, + "loss": 0.6747, + "step": 592 + }, + { + "epoch": 0.9488, + "grad_norm": 0.3802930029455698, + "learning_rate": 1.3728662659818204e-06, + "loss": 0.6351, + "step": 593 + }, + { + "epoch": 0.9504, + "grad_norm": 0.3668943160030006, + "learning_rate": 1.2885846958814673e-06, + "loss": 0.7399, + "step": 594 + }, + { + "epoch": 0.952, + "grad_norm": 0.5284019200681044, + "learning_rate": 1.2069560259243328e-06, + "loss": 0.6583, + "step": 595 + }, + { + "epoch": 0.9536, + "grad_norm": 0.3672083835934735, + "learning_rate": 1.1279824499064396e-06, + "loss": 0.6405, + "step": 596 + }, + { + "epoch": 0.9552, + "grad_norm": 0.38329428540247795, + "learning_rate": 1.0516660902673448e-06, + "loss": 0.7362, + "step": 597 + }, + { + "epoch": 0.9568, + "grad_norm": 0.3496210739665151, + "learning_rate": 9.780089980330642e-07, + "loss": 0.6358, + "step": 598 + }, + { + "epoch": 0.9584, + "grad_norm": 0.40403696037973136, + "learning_rate": 9.070131527609604e-07, + "loss": 0.6847, + "step": 599 + }, + { + "epoch": 0.96, + "grad_norm": 0.4048348645650435, + "learning_rate": 8.386804624865851e-07, + "loss": 0.6897, + "step": 600 + }, + { + "epoch": 0.9616, + "grad_norm": 0.33153420780396853, + "learning_rate": 7.730127636723539e-07, + "loss": 0.6138, + "step": 601 + }, + { + "epoch": 0.9632, + "grad_norm": 0.39677315949372843, + "learning_rate": 7.100118211581852e-07, + "loss": 0.6974, + "step": 602 + }, + { + "epoch": 0.9648, + "grad_norm": 0.4255411496687473, + "learning_rate": 6.496793281141056e-07, + "loss": 0.7638, + "step": 603 + }, + { + "epoch": 0.9664, + "grad_norm": 0.44715029336801493, + "learning_rate": 5.920169059947411e-07, + "loss": 0.734, + "step": 604 + }, + { + "epoch": 0.968, + "grad_norm": 0.3823527796727132, + "learning_rate": 5.370261044956971e-07, + "loss": 0.69, + "step": 605 + }, + { + "epoch": 0.9696, + "grad_norm": 0.41897928091944026, + "learning_rate": 4.847084015119574e-07, + "loss": 0.8015, + "step": 606 + }, + { + "epoch": 0.9712, + "grad_norm": 0.3914526389133937, + "learning_rate": 4.3506520309813947e-07, + "loss": 0.742, + "step": 607 + }, + { + "epoch": 0.9728, + "grad_norm": 0.4178354504278549, + "learning_rate": 3.8809784343072366e-07, + "loss": 0.6297, + "step": 608 + }, + { + "epoch": 0.9744, + "grad_norm": 0.40336635899089013, + "learning_rate": 3.4380758477219333e-07, + "loss": 0.7265, + "step": 609 + }, + { + "epoch": 0.976, + "grad_norm": 0.3718102945561971, + "learning_rate": 3.0219561743707326e-07, + "loss": 0.6742, + "step": 610 + }, + { + "epoch": 0.9776, + "grad_norm": 0.3737535685050714, + "learning_rate": 2.6326305976001055e-07, + "loss": 0.7229, + "step": 611 + }, + { + "epoch": 0.9792, + "grad_norm": 0.42769307916587873, + "learning_rate": 2.2701095806565432e-07, + "loss": 0.7555, + "step": 612 + }, + { + "epoch": 0.9808, + "grad_norm": 0.3577862820441812, + "learning_rate": 1.9344028664056713e-07, + "loss": 0.6454, + "step": 613 + }, + { + "epoch": 0.9824, + "grad_norm": 0.41318583087110927, + "learning_rate": 1.6255194770704586e-07, + "loss": 0.7695, + "step": 614 + }, + { + "epoch": 0.984, + "grad_norm": 0.47977922845908477, + "learning_rate": 1.3434677139885222e-07, + "loss": 0.7193, + "step": 615 + }, + { + "epoch": 0.9856, + "grad_norm": 0.42266778140194194, + "learning_rate": 1.0882551573891953e-07, + "loss": 0.8069, + "step": 616 + }, + { + "epoch": 0.9872, + "grad_norm": 0.38078422349478713, + "learning_rate": 8.598886661895788e-08, + "loss": 0.6981, + "step": 617 + }, + { + "epoch": 0.9888, + "grad_norm": 0.3477323908052074, + "learning_rate": 6.583743778106887e-08, + "loss": 0.6715, + "step": 618 + }, + { + "epoch": 0.9904, + "grad_norm": 0.5458686512071936, + "learning_rate": 4.837177080119215e-08, + "loss": 0.7217, + "step": 619 + }, + { + "epoch": 0.992, + "grad_norm": 0.4146727887590274, + "learning_rate": 3.359233507459481e-08, + "loss": 0.7601, + "step": 620 + }, + { + "epoch": 0.9936, + "grad_norm": 0.4082313641603639, + "learning_rate": 2.1499527803214846e-08, + "loss": 0.7122, + "step": 621 + }, + { + "epoch": 0.9952, + "grad_norm": 0.39072806435910845, + "learning_rate": 1.209367398504746e-08, + "loss": 0.7094, + "step": 622 + }, + { + "epoch": 0.9968, + "grad_norm": 0.40020469038001116, + "learning_rate": 5.375026405352035e-09, + "loss": 0.73, + "step": 623 + }, + { + "epoch": 0.9984, + "grad_norm": 0.3910437689979269, + "learning_rate": 1.3437656298687097e-09, + "loss": 0.7462, + "step": 624 + }, + { + "epoch": 1.0, + "grad_norm": 0.3666639480621862, + "learning_rate": 0.0, + "loss": 0.6875, + "step": 625 + }, + { + "epoch": 1.0, + "step": 625, + "total_flos": 571506442403840.0, + "train_loss": 0.7636905529975891, + "train_runtime": 9780.3316, + "train_samples_per_second": 1.022, + "train_steps_per_second": 0.064 + } + ], + "logging_steps": 1.0, + "max_steps": 625, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 571506442403840.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_1_epochs_1_GA_2_lora/README.md b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_1_epochs_1_GA_2_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_1_epochs_1_GA_2_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_1_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_1_epochs_1_GA_2_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..98918016adb9851e1c5ca954526bb545a66ee1da --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_1_epochs_1_GA_2_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "up_proj", + "v_proj", + "q_proj", + "gate_proj", + "o_proj", + "down_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1432842b2ee569a073300a4d1387238800738fba --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54b5fd0cc5001f6e6cc9d42b52851474bbb03ef5d4b0a049aec1540a8b7ea3f7 +size 671150064 diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_1_epochs_1_GA_2_lora/config.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_1_epochs_1_GA_2_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_1_epochs_1_GA_2_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..84294b6576d96199d8126d4fc0ce6ee6e4f2d304 --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ebbbff62258d6295b0d9016e1c944413d92bc04f1cf1a11cf8a750841d39401f +size 918507402 diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_1_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_1_epochs_1_GA_2_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..4141d91c7ffae0051e6382b17e734540bf4e67ce --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_1_epochs_1_GA_2_lora/trainer_state.json @@ -0,0 +1,8792 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 1250, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0008, + "grad_norm": 0.7770269714650887, + "learning_rate": 5.263157894736842e-06, + "loss": 1.2881, + "step": 1 + }, + { + "epoch": 0.0016, + "grad_norm": 1.024047044935287, + "learning_rate": 1.0526315789473684e-05, + "loss": 1.2967, + "step": 2 + }, + { + "epoch": 0.0024, + "grad_norm": 0.9179324969023009, + "learning_rate": 1.5789473684210526e-05, + "loss": 1.3872, + "step": 3 + }, + { + "epoch": 0.0032, + "grad_norm": 0.8298904257412587, + "learning_rate": 2.105263157894737e-05, + "loss": 1.3031, + "step": 4 + }, + { + "epoch": 0.004, + "grad_norm": 0.7730459391219203, + "learning_rate": 2.6315789473684212e-05, + "loss": 1.2892, + "step": 5 + }, + { + "epoch": 0.0048, + "grad_norm": 0.6600669344799063, + "learning_rate": 3.157894736842105e-05, + "loss": 1.1634, + "step": 6 + }, + { + "epoch": 0.0056, + "grad_norm": 0.8296588219702797, + "learning_rate": 3.6842105263157895e-05, + "loss": 1.2471, + "step": 7 + }, + { + "epoch": 0.0064, + "grad_norm": 0.6964921879934483, + "learning_rate": 4.210526315789474e-05, + "loss": 1.1325, + "step": 8 + }, + { + "epoch": 0.0072, + "grad_norm": 0.6118135227381648, + "learning_rate": 4.736842105263158e-05, + "loss": 1.0626, + "step": 9 + }, + { + "epoch": 0.008, + "grad_norm": 1.121380634255772, + "learning_rate": 5.2631578947368424e-05, + "loss": 1.0503, + "step": 10 + }, + { + "epoch": 0.0088, + "grad_norm": 0.7396748278621545, + "learning_rate": 5.789473684210527e-05, + "loss": 1.0111, + "step": 11 + }, + { + "epoch": 0.0096, + "grad_norm": 0.6249189450745211, + "learning_rate": 6.31578947368421e-05, + "loss": 0.9344, + "step": 12 + }, + { + "epoch": 0.0104, + "grad_norm": 0.6499687655104689, + "learning_rate": 6.842105263157895e-05, + "loss": 0.9215, + "step": 13 + }, + { + "epoch": 0.0112, + "grad_norm": 0.5076935214653521, + "learning_rate": 7.368421052631579e-05, + "loss": 0.8516, + "step": 14 + }, + { + "epoch": 0.012, + "grad_norm": 0.6888973666116883, + "learning_rate": 7.894736842105263e-05, + "loss": 0.99, + "step": 15 + }, + { + "epoch": 0.0128, + "grad_norm": 0.5591842011521481, + "learning_rate": 8.421052631578948e-05, + "loss": 0.906, + "step": 16 + }, + { + "epoch": 0.0136, + "grad_norm": 0.5321833673418794, + "learning_rate": 8.947368421052632e-05, + "loss": 0.8883, + "step": 17 + }, + { + "epoch": 0.0144, + "grad_norm": 0.48128609176908127, + "learning_rate": 9.473684210526316e-05, + "loss": 0.8505, + "step": 18 + }, + { + "epoch": 0.0152, + "grad_norm": 0.51363735744202, + "learning_rate": 0.0001, + "loss": 0.9719, + "step": 19 + }, + { + "epoch": 0.016, + "grad_norm": 0.5957002621267714, + "learning_rate": 0.00010526315789473685, + "loss": 1.0021, + "step": 20 + }, + { + "epoch": 0.0168, + "grad_norm": 0.5163578810666871, + "learning_rate": 0.0001105263157894737, + "loss": 0.8585, + "step": 21 + }, + { + "epoch": 0.0176, + "grad_norm": 0.47777288535864215, + "learning_rate": 0.00011578947368421053, + "loss": 0.8385, + "step": 22 + }, + { + "epoch": 0.0184, + "grad_norm": 0.5330569062110371, + "learning_rate": 0.00012105263157894738, + "loss": 1.0255, + "step": 23 + }, + { + "epoch": 0.0192, + "grad_norm": 0.5980756167605328, + "learning_rate": 0.0001263157894736842, + "loss": 0.9987, + "step": 24 + }, + { + "epoch": 0.02, + "grad_norm": 0.5206638044866894, + "learning_rate": 0.00013157894736842108, + "loss": 0.9226, + "step": 25 + }, + { + "epoch": 0.0208, + "grad_norm": 0.5487650960666887, + "learning_rate": 0.0001368421052631579, + "loss": 0.9099, + "step": 26 + }, + { + "epoch": 0.0216, + "grad_norm": 0.49145451533617074, + "learning_rate": 0.00014210526315789474, + "loss": 0.8579, + "step": 27 + }, + { + "epoch": 0.0224, + "grad_norm": 0.47189502298098335, + "learning_rate": 0.00014736842105263158, + "loss": 0.8676, + "step": 28 + }, + { + "epoch": 0.0232, + "grad_norm": 0.5080289216879348, + "learning_rate": 0.00015263157894736845, + "loss": 0.895, + "step": 29 + }, + { + "epoch": 0.024, + "grad_norm": 0.5226289806694986, + "learning_rate": 0.00015789473684210527, + "loss": 0.9314, + "step": 30 + }, + { + "epoch": 0.0248, + "grad_norm": 0.4457264336120691, + "learning_rate": 0.0001631578947368421, + "loss": 0.8269, + "step": 31 + }, + { + "epoch": 0.0256, + "grad_norm": 0.4700824414453813, + "learning_rate": 0.00016842105263157895, + "loss": 0.8755, + "step": 32 + }, + { + "epoch": 0.0264, + "grad_norm": 0.5235401864352286, + "learning_rate": 0.0001736842105263158, + "loss": 0.9357, + "step": 33 + }, + { + "epoch": 0.0272, + "grad_norm": 0.5026293819871382, + "learning_rate": 0.00017894736842105264, + "loss": 0.9186, + "step": 34 + }, + { + "epoch": 0.028, + "grad_norm": 0.5855558657957484, + "learning_rate": 0.00018421052631578948, + "loss": 0.8541, + "step": 35 + }, + { + "epoch": 0.0288, + "grad_norm": 0.48471644560507177, + "learning_rate": 0.00018947368421052632, + "loss": 0.9137, + "step": 36 + }, + { + "epoch": 0.0296, + "grad_norm": 0.46478434248194045, + "learning_rate": 0.00019473684210526317, + "loss": 0.8311, + "step": 37 + }, + { + "epoch": 0.0304, + "grad_norm": 0.40363878205922327, + "learning_rate": 0.0002, + "loss": 0.789, + "step": 38 + }, + { + "epoch": 0.0312, + "grad_norm": 0.4557241944006385, + "learning_rate": 0.00019999966405802826, + "loss": 0.9019, + "step": 39 + }, + { + "epoch": 0.032, + "grad_norm": 0.5136938589333343, + "learning_rate": 0.00019999865623437013, + "loss": 0.9092, + "step": 40 + }, + { + "epoch": 0.0328, + "grad_norm": 0.4589064324372691, + "learning_rate": 0.00019999697653579705, + "loss": 0.8153, + "step": 41 + }, + { + "epoch": 0.0336, + "grad_norm": 0.4711520806636325, + "learning_rate": 0.00019999462497359466, + "loss": 0.8642, + "step": 42 + }, + { + "epoch": 0.0344, + "grad_norm": 0.4642243410765449, + "learning_rate": 0.0001999916015635627, + "loss": 0.8678, + "step": 43 + }, + { + "epoch": 0.0352, + "grad_norm": 0.47418870887286674, + "learning_rate": 0.00019998790632601496, + "loss": 0.8437, + "step": 44 + }, + { + "epoch": 0.036, + "grad_norm": 0.47714001354788554, + "learning_rate": 0.00019998353928577919, + "loss": 0.8387, + "step": 45 + }, + { + "epoch": 0.0368, + "grad_norm": 0.4084594735562426, + "learning_rate": 0.0001999785004721968, + "loss": 0.8178, + "step": 46 + }, + { + "epoch": 0.0376, + "grad_norm": 0.43334073220477404, + "learning_rate": 0.0001999727899191228, + "loss": 0.7866, + "step": 47 + }, + { + "epoch": 0.0384, + "grad_norm": 0.4094875104431594, + "learning_rate": 0.00019996640766492543, + "loss": 0.737, + "step": 48 + }, + { + "epoch": 0.0392, + "grad_norm": 0.49985762919942306, + "learning_rate": 0.00019995935375248606, + "loss": 0.8179, + "step": 49 + }, + { + "epoch": 0.04, + "grad_norm": 0.44413007504680635, + "learning_rate": 0.00019995162822919883, + "loss": 0.7888, + "step": 50 + }, + { + "epoch": 0.0408, + "grad_norm": 0.47530439086714704, + "learning_rate": 0.00019994323114697022, + "loss": 0.7823, + "step": 51 + }, + { + "epoch": 0.0416, + "grad_norm": 0.46994207893111456, + "learning_rate": 0.00019993416256221895, + "loss": 0.8512, + "step": 52 + }, + { + "epoch": 0.0424, + "grad_norm": 0.5374106034782066, + "learning_rate": 0.0001999244225358753, + "loss": 0.8281, + "step": 53 + }, + { + "epoch": 0.0432, + "grad_norm": 0.4634713085884633, + "learning_rate": 0.00019991401113338104, + "loss": 0.816, + "step": 54 + }, + { + "epoch": 0.044, + "grad_norm": 0.4990707735023886, + "learning_rate": 0.00019990292842468868, + "loss": 0.8629, + "step": 55 + }, + { + "epoch": 0.0448, + "grad_norm": 0.4823910263411063, + "learning_rate": 0.00019989117448426108, + "loss": 0.8284, + "step": 56 + }, + { + "epoch": 0.0456, + "grad_norm": 0.40558494647973825, + "learning_rate": 0.0001998787493910712, + "loss": 0.7613, + "step": 57 + }, + { + "epoch": 0.0464, + "grad_norm": 0.5058399519984393, + "learning_rate": 0.00019986565322860115, + "loss": 0.9216, + "step": 58 + }, + { + "epoch": 0.0472, + "grad_norm": 0.4430971022831468, + "learning_rate": 0.000199851886084842, + "loss": 0.8424, + "step": 59 + }, + { + "epoch": 0.048, + "grad_norm": 0.4599605477233072, + "learning_rate": 0.00019983744805229296, + "loss": 0.9151, + "step": 60 + }, + { + "epoch": 0.0488, + "grad_norm": 0.41773726348192974, + "learning_rate": 0.00019982233922796085, + "loss": 0.802, + "step": 61 + }, + { + "epoch": 0.0496, + "grad_norm": 0.5273326132401231, + "learning_rate": 0.00019980655971335945, + "loss": 0.9706, + "step": 62 + }, + { + "epoch": 0.0504, + "grad_norm": 0.4346491260010976, + "learning_rate": 0.00019979010961450878, + "loss": 0.8258, + "step": 63 + }, + { + "epoch": 0.0512, + "grad_norm": 0.439233301783603, + "learning_rate": 0.00019977298904193437, + "loss": 0.7435, + "step": 64 + }, + { + "epoch": 0.052, + "grad_norm": 0.5408845811493046, + "learning_rate": 0.00019975519811066663, + "loss": 0.9646, + "step": 65 + }, + { + "epoch": 0.0528, + "grad_norm": 0.3839446160384672, + "learning_rate": 0.00019973673694024, + "loss": 0.7484, + "step": 66 + }, + { + "epoch": 0.0536, + "grad_norm": 0.4663067629148098, + "learning_rate": 0.0001997176056546921, + "loss": 0.838, + "step": 67 + }, + { + "epoch": 0.0544, + "grad_norm": 0.4811492144146111, + "learning_rate": 0.00019969780438256293, + "loss": 0.872, + "step": 68 + }, + { + "epoch": 0.0552, + "grad_norm": 0.45118193674818063, + "learning_rate": 0.0001996773332568941, + "loss": 0.8246, + "step": 69 + }, + { + "epoch": 0.056, + "grad_norm": 0.4486434942728305, + "learning_rate": 0.0001996561924152278, + "loss": 0.7811, + "step": 70 + }, + { + "epoch": 0.0568, + "grad_norm": 0.4703205860664721, + "learning_rate": 0.00019963438199960599, + "loss": 0.8238, + "step": 71 + }, + { + "epoch": 0.0576, + "grad_norm": 0.4795012837534885, + "learning_rate": 0.0001996119021565693, + "loss": 0.881, + "step": 72 + }, + { + "epoch": 0.0584, + "grad_norm": 0.4707544400703513, + "learning_rate": 0.00019958875303715615, + "loss": 0.9132, + "step": 73 + }, + { + "epoch": 0.0592, + "grad_norm": 0.4266920813293674, + "learning_rate": 0.0001995649347969019, + "loss": 0.8602, + "step": 74 + }, + { + "epoch": 0.06, + "grad_norm": 0.4510992661418326, + "learning_rate": 0.0001995404475958373, + "loss": 0.9129, + "step": 75 + }, + { + "epoch": 0.0608, + "grad_norm": 0.5040527366718769, + "learning_rate": 0.00019951529159848805, + "loss": 0.9165, + "step": 76 + }, + { + "epoch": 0.0616, + "grad_norm": 0.41313107554868533, + "learning_rate": 0.0001994894669738732, + "loss": 0.8491, + "step": 77 + }, + { + "epoch": 0.0624, + "grad_norm": 0.4494431303697321, + "learning_rate": 0.00019946297389550433, + "loss": 0.8507, + "step": 78 + }, + { + "epoch": 0.0632, + "grad_norm": 0.4376764818391794, + "learning_rate": 0.0001994358125413841, + "loss": 0.8205, + "step": 79 + }, + { + "epoch": 0.064, + "grad_norm": 0.5122357726085491, + "learning_rate": 0.00019940798309400526, + "loss": 0.8123, + "step": 80 + }, + { + "epoch": 0.0648, + "grad_norm": 0.4115765441551589, + "learning_rate": 0.0001993794857403495, + "loss": 0.8026, + "step": 81 + }, + { + "epoch": 0.0656, + "grad_norm": 0.4509542037042227, + "learning_rate": 0.0001993503206718859, + "loss": 0.8328, + "step": 82 + }, + { + "epoch": 0.0664, + "grad_norm": 0.42594272629359003, + "learning_rate": 0.0001993204880845699, + "loss": 0.7765, + "step": 83 + }, + { + "epoch": 0.0672, + "grad_norm": 0.4260909589227541, + "learning_rate": 0.00019928998817884182, + "loss": 0.8405, + "step": 84 + }, + { + "epoch": 0.068, + "grad_norm": 0.41573168903833024, + "learning_rate": 0.00019925882115962568, + "loss": 0.8231, + "step": 85 + }, + { + "epoch": 0.0688, + "grad_norm": 0.39933734200305154, + "learning_rate": 0.00019922698723632767, + "loss": 0.8063, + "step": 86 + }, + { + "epoch": 0.0696, + "grad_norm": 0.4345644903978346, + "learning_rate": 0.00019919448662283478, + "loss": 0.8267, + "step": 87 + }, + { + "epoch": 0.0704, + "grad_norm": 0.40127693336643405, + "learning_rate": 0.00019916131953751342, + "loss": 0.7522, + "step": 88 + }, + { + "epoch": 0.0712, + "grad_norm": 0.40199553950201616, + "learning_rate": 0.00019912748620320794, + "loss": 0.8032, + "step": 89 + }, + { + "epoch": 0.072, + "grad_norm": 0.48327385003933276, + "learning_rate": 0.00019909298684723904, + "loss": 0.8217, + "step": 90 + }, + { + "epoch": 0.0728, + "grad_norm": 0.3908920050780294, + "learning_rate": 0.00019905782170140238, + "loss": 0.8312, + "step": 91 + }, + { + "epoch": 0.0736, + "grad_norm": 0.49522901161784727, + "learning_rate": 0.00019902199100196697, + "loss": 0.8805, + "step": 92 + }, + { + "epoch": 0.0744, + "grad_norm": 0.4720762873467958, + "learning_rate": 0.00019898549498967343, + "loss": 0.8353, + "step": 93 + }, + { + "epoch": 0.0752, + "grad_norm": 0.4673667387850208, + "learning_rate": 0.00019894833390973266, + "loss": 0.8156, + "step": 94 + }, + { + "epoch": 0.076, + "grad_norm": 0.48274563016239064, + "learning_rate": 0.000198910508011824, + "loss": 0.9052, + "step": 95 + }, + { + "epoch": 0.0768, + "grad_norm": 0.38141973015887337, + "learning_rate": 0.00019887201755009357, + "loss": 0.7523, + "step": 96 + }, + { + "epoch": 0.0776, + "grad_norm": 0.42718404335013443, + "learning_rate": 0.00019883286278315262, + "loss": 0.8815, + "step": 97 + }, + { + "epoch": 0.0784, + "grad_norm": 0.45654683275776975, + "learning_rate": 0.0001987930439740757, + "loss": 0.79, + "step": 98 + }, + { + "epoch": 0.0792, + "grad_norm": 0.39958380818714795, + "learning_rate": 0.00019875256139039902, + "loss": 0.8769, + "step": 99 + }, + { + "epoch": 0.08, + "grad_norm": 0.49023723846133654, + "learning_rate": 0.00019871141530411853, + "loss": 0.9067, + "step": 100 + }, + { + "epoch": 0.0808, + "grad_norm": 0.39793474917174537, + "learning_rate": 0.00019866960599168826, + "loss": 0.7653, + "step": 101 + }, + { + "epoch": 0.0816, + "grad_norm": 0.45045541780062376, + "learning_rate": 0.0001986271337340182, + "loss": 0.8277, + "step": 102 + }, + { + "epoch": 0.0824, + "grad_norm": 0.44590880781273884, + "learning_rate": 0.0001985839988164726, + "loss": 0.8155, + "step": 103 + }, + { + "epoch": 0.0832, + "grad_norm": 0.4655315399141277, + "learning_rate": 0.00019854020152886814, + "loss": 0.8375, + "step": 104 + }, + { + "epoch": 0.084, + "grad_norm": 0.4838098171615128, + "learning_rate": 0.00019849574216547171, + "loss": 0.8239, + "step": 105 + }, + { + "epoch": 0.0848, + "grad_norm": 0.40968833788240205, + "learning_rate": 0.0001984506210249986, + "loss": 0.7784, + "step": 106 + }, + { + "epoch": 0.0856, + "grad_norm": 0.4729664739308913, + "learning_rate": 0.00019840483841061058, + "loss": 0.7989, + "step": 107 + }, + { + "epoch": 0.0864, + "grad_norm": 0.4524948610255115, + "learning_rate": 0.00019835839462991361, + "loss": 0.8658, + "step": 108 + }, + { + "epoch": 0.0872, + "grad_norm": 0.42381451152368793, + "learning_rate": 0.00019831128999495606, + "loss": 0.8504, + "step": 109 + }, + { + "epoch": 0.088, + "grad_norm": 0.40214305600150546, + "learning_rate": 0.00019826352482222638, + "loss": 0.8271, + "step": 110 + }, + { + "epoch": 0.0888, + "grad_norm": 0.4628861171033263, + "learning_rate": 0.0001982150994326511, + "loss": 0.8439, + "step": 111 + }, + { + "epoch": 0.0896, + "grad_norm": 0.44188860733222457, + "learning_rate": 0.00019816601415159263, + "loss": 0.7837, + "step": 112 + }, + { + "epoch": 0.0904, + "grad_norm": 0.4483825994700799, + "learning_rate": 0.0001981162693088471, + "loss": 0.9282, + "step": 113 + }, + { + "epoch": 0.0912, + "grad_norm": 0.4710324365514096, + "learning_rate": 0.0001980658652386421, + "loss": 0.8705, + "step": 114 + }, + { + "epoch": 0.092, + "grad_norm": 0.46510154454495684, + "learning_rate": 0.0001980148022796345, + "loss": 0.8115, + "step": 115 + }, + { + "epoch": 0.0928, + "grad_norm": 0.405556088903293, + "learning_rate": 0.00019796308077490817, + "loss": 0.8209, + "step": 116 + }, + { + "epoch": 0.0936, + "grad_norm": 0.3933253178211884, + "learning_rate": 0.00019791070107197153, + "loss": 0.7824, + "step": 117 + }, + { + "epoch": 0.0944, + "grad_norm": 0.4441308602732538, + "learning_rate": 0.00019785766352275542, + "loss": 0.8158, + "step": 118 + }, + { + "epoch": 0.0952, + "grad_norm": 0.41600800450120323, + "learning_rate": 0.0001978039684836106, + "loss": 0.806, + "step": 119 + }, + { + "epoch": 0.096, + "grad_norm": 0.4505417888946031, + "learning_rate": 0.00019774961631530545, + "loss": 0.86, + "step": 120 + }, + { + "epoch": 0.0968, + "grad_norm": 0.44419662115731584, + "learning_rate": 0.0001976946073830234, + "loss": 0.7928, + "step": 121 + }, + { + "epoch": 0.0976, + "grad_norm": 0.46078770584507683, + "learning_rate": 0.00019763894205636072, + "loss": 0.759, + "step": 122 + }, + { + "epoch": 0.0984, + "grad_norm": 0.4861036696461452, + "learning_rate": 0.00019758262070932375, + "loss": 0.909, + "step": 123 + }, + { + "epoch": 0.0992, + "grad_norm": 0.47935094149952195, + "learning_rate": 0.00019752564372032657, + "loss": 0.8432, + "step": 124 + }, + { + "epoch": 0.1, + "grad_norm": 0.5021621797308456, + "learning_rate": 0.00019746801147218842, + "loss": 0.8899, + "step": 125 + }, + { + "epoch": 0.1008, + "grad_norm": 0.41210306509262695, + "learning_rate": 0.00019740972435213115, + "loss": 0.7877, + "step": 126 + }, + { + "epoch": 0.1016, + "grad_norm": 0.4398250863148764, + "learning_rate": 0.00019735078275177654, + "loss": 0.7817, + "step": 127 + }, + { + "epoch": 0.1024, + "grad_norm": 0.3828979564308189, + "learning_rate": 0.00019729118706714375, + "loss": 0.7777, + "step": 128 + }, + { + "epoch": 0.1032, + "grad_norm": 0.4624433449613049, + "learning_rate": 0.00019723093769864663, + "loss": 0.7517, + "step": 129 + }, + { + "epoch": 0.104, + "grad_norm": 0.4598025687772892, + "learning_rate": 0.00019717003505109095, + "loss": 0.7982, + "step": 130 + }, + { + "epoch": 0.1048, + "grad_norm": 0.4425508776530567, + "learning_rate": 0.0001971084795336719, + "loss": 0.8692, + "step": 131 + }, + { + "epoch": 0.1056, + "grad_norm": 0.4842287203577144, + "learning_rate": 0.00019704627155997108, + "loss": 0.84, + "step": 132 + }, + { + "epoch": 0.1064, + "grad_norm": 0.45435282722373527, + "learning_rate": 0.00019698341154795389, + "loss": 0.8132, + "step": 133 + }, + { + "epoch": 0.1072, + "grad_norm": 0.5513821385591284, + "learning_rate": 0.00019691989991996663, + "loss": 0.8852, + "step": 134 + }, + { + "epoch": 0.108, + "grad_norm": 0.44676033887524785, + "learning_rate": 0.00019685573710273376, + "loss": 0.8316, + "step": 135 + }, + { + "epoch": 0.1088, + "grad_norm": 0.4258188039172344, + "learning_rate": 0.0001967909235273549, + "loss": 0.7812, + "step": 136 + }, + { + "epoch": 0.1096, + "grad_norm": 0.4824087947941197, + "learning_rate": 0.00019672545962930215, + "loss": 0.818, + "step": 137 + }, + { + "epoch": 0.1104, + "grad_norm": 0.4187342117306936, + "learning_rate": 0.00019665934584841682, + "loss": 0.8092, + "step": 138 + }, + { + "epoch": 0.1112, + "grad_norm": 0.4017058881104004, + "learning_rate": 0.00019659258262890683, + "loss": 0.6962, + "step": 139 + }, + { + "epoch": 0.112, + "grad_norm": 0.448079610421751, + "learning_rate": 0.00019652517041934356, + "loss": 0.8068, + "step": 140 + }, + { + "epoch": 0.1128, + "grad_norm": 0.3986220602603605, + "learning_rate": 0.00019645710967265882, + "loss": 0.8267, + "step": 141 + }, + { + "epoch": 0.1136, + "grad_norm": 0.4677803646502211, + "learning_rate": 0.00019638840084614182, + "loss": 0.8303, + "step": 142 + }, + { + "epoch": 0.1144, + "grad_norm": 0.3967906879343072, + "learning_rate": 0.00019631904440143612, + "loss": 0.8039, + "step": 143 + }, + { + "epoch": 0.1152, + "grad_norm": 0.4641287483774464, + "learning_rate": 0.00019624904080453655, + "loss": 0.8314, + "step": 144 + }, + { + "epoch": 0.116, + "grad_norm": 0.4371119687642268, + "learning_rate": 0.00019617839052578603, + "loss": 0.8006, + "step": 145 + }, + { + "epoch": 0.1168, + "grad_norm": 0.4251072016794893, + "learning_rate": 0.00019610709403987246, + "loss": 0.7494, + "step": 146 + }, + { + "epoch": 0.1176, + "grad_norm": 0.43688346737206174, + "learning_rate": 0.0001960351518258255, + "loss": 0.765, + "step": 147 + }, + { + "epoch": 0.1184, + "grad_norm": 0.42924160099480074, + "learning_rate": 0.00019596256436701324, + "loss": 0.8122, + "step": 148 + }, + { + "epoch": 0.1192, + "grad_norm": 0.40941526606709683, + "learning_rate": 0.00019588933215113926, + "loss": 0.8025, + "step": 149 + }, + { + "epoch": 0.12, + "grad_norm": 0.4753545185318249, + "learning_rate": 0.000195815455670239, + "loss": 0.8097, + "step": 150 + }, + { + "epoch": 0.1208, + "grad_norm": 0.4297319370236763, + "learning_rate": 0.00019574093542067673, + "loss": 0.8136, + "step": 151 + }, + { + "epoch": 0.1216, + "grad_norm": 0.44389185288420413, + "learning_rate": 0.00019566577190314197, + "loss": 0.8309, + "step": 152 + }, + { + "epoch": 0.1224, + "grad_norm": 0.4454598228098689, + "learning_rate": 0.0001955899656226464, + "loss": 0.8655, + "step": 153 + }, + { + "epoch": 0.1232, + "grad_norm": 0.4379720315213458, + "learning_rate": 0.0001955135170885202, + "loss": 0.8103, + "step": 154 + }, + { + "epoch": 0.124, + "grad_norm": 0.5229416617277775, + "learning_rate": 0.0001954364268144088, + "loss": 0.995, + "step": 155 + }, + { + "epoch": 0.1248, + "grad_norm": 0.3861539358658946, + "learning_rate": 0.00019535869531826937, + "loss": 0.7538, + "step": 156 + }, + { + "epoch": 0.1256, + "grad_norm": 0.4277791103455093, + "learning_rate": 0.00019528032312236736, + "loss": 0.8023, + "step": 157 + }, + { + "epoch": 0.1264, + "grad_norm": 0.47272910144735575, + "learning_rate": 0.00019520131075327298, + "loss": 0.8189, + "step": 158 + }, + { + "epoch": 0.1272, + "grad_norm": 0.4464716182886099, + "learning_rate": 0.00019512165874185767, + "loss": 0.7289, + "step": 159 + }, + { + "epoch": 0.128, + "grad_norm": 0.4227171621774377, + "learning_rate": 0.00019504136762329047, + "loss": 0.8555, + "step": 160 + }, + { + "epoch": 0.1288, + "grad_norm": 0.4395286840376491, + "learning_rate": 0.0001949604379370345, + "loss": 0.8033, + "step": 161 + }, + { + "epoch": 0.1296, + "grad_norm": 0.43507375337288545, + "learning_rate": 0.00019487887022684336, + "loss": 0.8616, + "step": 162 + }, + { + "epoch": 0.1304, + "grad_norm": 0.4632310561001686, + "learning_rate": 0.00019479666504075736, + "loss": 0.7553, + "step": 163 + }, + { + "epoch": 0.1312, + "grad_norm": 0.43899436188218655, + "learning_rate": 0.00019471382293110003, + "loss": 0.7203, + "step": 164 + }, + { + "epoch": 0.132, + "grad_norm": 0.4237430849392596, + "learning_rate": 0.0001946303444544741, + "loss": 0.7229, + "step": 165 + }, + { + "epoch": 0.1328, + "grad_norm": 0.4527915263031912, + "learning_rate": 0.00019454623017175812, + "loss": 0.8702, + "step": 166 + }, + { + "epoch": 0.1336, + "grad_norm": 0.43131694202801923, + "learning_rate": 0.00019446148064810242, + "loss": 0.7961, + "step": 167 + }, + { + "epoch": 0.1344, + "grad_norm": 0.49437880601950374, + "learning_rate": 0.00019437609645292546, + "loss": 0.9102, + "step": 168 + }, + { + "epoch": 0.1352, + "grad_norm": 0.43659462457447684, + "learning_rate": 0.00019429007815990993, + "loss": 0.7614, + "step": 169 + }, + { + "epoch": 0.136, + "grad_norm": 0.41918191497256263, + "learning_rate": 0.0001942034263469989, + "loss": 0.7852, + "step": 170 + }, + { + "epoch": 0.1368, + "grad_norm": 0.4291690963237692, + "learning_rate": 0.00019411614159639204, + "loss": 0.7746, + "step": 171 + }, + { + "epoch": 0.1376, + "grad_norm": 0.6143376957940382, + "learning_rate": 0.00019402822449454153, + "loss": 0.8101, + "step": 172 + }, + { + "epoch": 0.1384, + "grad_norm": 0.4314498742806767, + "learning_rate": 0.00019393967563214833, + "loss": 0.7557, + "step": 173 + }, + { + "epoch": 0.1392, + "grad_norm": 0.48829490311943763, + "learning_rate": 0.00019385049560415794, + "loss": 0.7513, + "step": 174 + }, + { + "epoch": 0.14, + "grad_norm": 0.43851150291390606, + "learning_rate": 0.00019376068500975667, + "loss": 0.8017, + "step": 175 + }, + { + "epoch": 0.1408, + "grad_norm": 0.38046129450067623, + "learning_rate": 0.00019367024445236754, + "loss": 0.696, + "step": 176 + }, + { + "epoch": 0.1416, + "grad_norm": 0.4291326347607385, + "learning_rate": 0.000193579174539646, + "loss": 0.7703, + "step": 177 + }, + { + "epoch": 0.1424, + "grad_norm": 0.4917391816960092, + "learning_rate": 0.00019348747588347637, + "loss": 0.8717, + "step": 178 + }, + { + "epoch": 0.1432, + "grad_norm": 0.4353352524294646, + "learning_rate": 0.00019339514909996706, + "loss": 0.7938, + "step": 179 + }, + { + "epoch": 0.144, + "grad_norm": 0.4418832858666523, + "learning_rate": 0.00019330219480944694, + "loss": 0.8093, + "step": 180 + }, + { + "epoch": 0.1448, + "grad_norm": 0.470468993784615, + "learning_rate": 0.00019320861363646095, + "loss": 0.8472, + "step": 181 + }, + { + "epoch": 0.1456, + "grad_norm": 0.39426705301336207, + "learning_rate": 0.00019311440620976597, + "loss": 0.7785, + "step": 182 + }, + { + "epoch": 0.1464, + "grad_norm": 0.4018827466079507, + "learning_rate": 0.00019301957316232658, + "loss": 0.7205, + "step": 183 + }, + { + "epoch": 0.1472, + "grad_norm": 0.38227905944522594, + "learning_rate": 0.0001929241151313108, + "loss": 0.7488, + "step": 184 + }, + { + "epoch": 0.148, + "grad_norm": 0.4253768460834158, + "learning_rate": 0.0001928280327580858, + "loss": 0.7844, + "step": 185 + }, + { + "epoch": 0.1488, + "grad_norm": 0.433341824982204, + "learning_rate": 0.00019273132668821364, + "loss": 0.7815, + "step": 186 + }, + { + "epoch": 0.1496, + "grad_norm": 0.4416209521555928, + "learning_rate": 0.00019263399757144683, + "loss": 0.8655, + "step": 187 + }, + { + "epoch": 0.1504, + "grad_norm": 0.4540173442027962, + "learning_rate": 0.00019253604606172417, + "loss": 0.8846, + "step": 188 + }, + { + "epoch": 0.1512, + "grad_norm": 0.4428878025484908, + "learning_rate": 0.000192437472817166, + "loss": 0.8015, + "step": 189 + }, + { + "epoch": 0.152, + "grad_norm": 0.4904578720842893, + "learning_rate": 0.00019233827850007027, + "loss": 0.8315, + "step": 190 + }, + { + "epoch": 0.1528, + "grad_norm": 0.44336847472518826, + "learning_rate": 0.00019223846377690754, + "loss": 0.8542, + "step": 191 + }, + { + "epoch": 0.1536, + "grad_norm": 0.49226825803525953, + "learning_rate": 0.00019213802931831696, + "loss": 0.8338, + "step": 192 + }, + { + "epoch": 0.1544, + "grad_norm": 0.501756094514099, + "learning_rate": 0.00019203697579910154, + "loss": 0.7713, + "step": 193 + }, + { + "epoch": 0.1552, + "grad_norm": 0.5435447056177455, + "learning_rate": 0.00019193530389822363, + "loss": 0.8257, + "step": 194 + }, + { + "epoch": 0.156, + "grad_norm": 0.44313370250703266, + "learning_rate": 0.00019183301429880043, + "loss": 0.7919, + "step": 195 + }, + { + "epoch": 0.1568, + "grad_norm": 0.47342288389157794, + "learning_rate": 0.00019173010768809933, + "loss": 0.9051, + "step": 196 + }, + { + "epoch": 0.1576, + "grad_norm": 0.3999241068503602, + "learning_rate": 0.00019162658475753327, + "loss": 0.7938, + "step": 197 + }, + { + "epoch": 0.1584, + "grad_norm": 0.4631029137866705, + "learning_rate": 0.0001915224462026563, + "loss": 0.8189, + "step": 198 + }, + { + "epoch": 0.1592, + "grad_norm": 0.40421567014884724, + "learning_rate": 0.00019141769272315858, + "loss": 0.7728, + "step": 199 + }, + { + "epoch": 0.16, + "grad_norm": 0.3723392140339022, + "learning_rate": 0.00019131232502286188, + "loss": 0.7384, + "step": 200 + }, + { + "epoch": 0.1608, + "grad_norm": 0.430167081220453, + "learning_rate": 0.00019120634380971496, + "loss": 0.7903, + "step": 201 + }, + { + "epoch": 0.1616, + "grad_norm": 0.40241535953398005, + "learning_rate": 0.0001910997497957885, + "loss": 0.8059, + "step": 202 + }, + { + "epoch": 0.1624, + "grad_norm": 0.38901205632140157, + "learning_rate": 0.0001909925436972706, + "loss": 0.7093, + "step": 203 + }, + { + "epoch": 0.1632, + "grad_norm": 0.4899648722987168, + "learning_rate": 0.00019088472623446183, + "loss": 0.8009, + "step": 204 + }, + { + "epoch": 0.164, + "grad_norm": 0.43679788350937493, + "learning_rate": 0.00019077629813177036, + "loss": 0.8485, + "step": 205 + }, + { + "epoch": 0.1648, + "grad_norm": 0.41474942601321974, + "learning_rate": 0.00019066726011770726, + "loss": 0.7366, + "step": 206 + }, + { + "epoch": 0.1656, + "grad_norm": 0.3965788027409051, + "learning_rate": 0.00019055761292488142, + "loss": 0.6985, + "step": 207 + }, + { + "epoch": 0.1664, + "grad_norm": 0.45497223123457115, + "learning_rate": 0.0001904473572899947, + "loss": 0.8644, + "step": 208 + }, + { + "epoch": 0.1672, + "grad_norm": 0.4283454248563262, + "learning_rate": 0.00019033649395383702, + "loss": 0.8169, + "step": 209 + }, + { + "epoch": 0.168, + "grad_norm": 0.3950155450695892, + "learning_rate": 0.00019022502366128135, + "loss": 0.8138, + "step": 210 + }, + { + "epoch": 0.1688, + "grad_norm": 0.4136694620708334, + "learning_rate": 0.00019011294716127867, + "loss": 0.7565, + "step": 211 + }, + { + "epoch": 0.1696, + "grad_norm": 0.39632663144028374, + "learning_rate": 0.00019000026520685302, + "loss": 0.8258, + "step": 212 + }, + { + "epoch": 0.1704, + "grad_norm": 0.4114557247555061, + "learning_rate": 0.0001898869785550963, + "loss": 0.7951, + "step": 213 + }, + { + "epoch": 0.1712, + "grad_norm": 0.40632063758759807, + "learning_rate": 0.0001897730879671634, + "loss": 0.777, + "step": 214 + }, + { + "epoch": 0.172, + "grad_norm": 0.42803025719964816, + "learning_rate": 0.00018965859420826684, + "loss": 0.8007, + "step": 215 + }, + { + "epoch": 0.1728, + "grad_norm": 0.39737743505525436, + "learning_rate": 0.00018954349804767184, + "loss": 0.7175, + "step": 216 + }, + { + "epoch": 0.1736, + "grad_norm": 0.5020832745669306, + "learning_rate": 0.00018942780025869098, + "loss": 0.8899, + "step": 217 + }, + { + "epoch": 0.1744, + "grad_norm": 0.4080587031809224, + "learning_rate": 0.00018931150161867916, + "loss": 0.751, + "step": 218 + }, + { + "epoch": 0.1752, + "grad_norm": 0.39062572686119307, + "learning_rate": 0.00018919460290902826, + "loss": 0.7361, + "step": 219 + }, + { + "epoch": 0.176, + "grad_norm": 0.4001701095298602, + "learning_rate": 0.00018907710491516199, + "loss": 0.7102, + "step": 220 + }, + { + "epoch": 0.1768, + "grad_norm": 0.4275503431259605, + "learning_rate": 0.0001889590084265304, + "loss": 0.7437, + "step": 221 + }, + { + "epoch": 0.1776, + "grad_norm": 0.4255501208026851, + "learning_rate": 0.0001888403142366049, + "loss": 0.8217, + "step": 222 + }, + { + "epoch": 0.1784, + "grad_norm": 0.45702297406020853, + "learning_rate": 0.0001887210231428727, + "loss": 0.8768, + "step": 223 + }, + { + "epoch": 0.1792, + "grad_norm": 0.4441420909638248, + "learning_rate": 0.00018860113594683148, + "loss": 0.8756, + "step": 224 + }, + { + "epoch": 0.18, + "grad_norm": 0.4184670778035845, + "learning_rate": 0.0001884806534539841, + "loss": 0.7577, + "step": 225 + }, + { + "epoch": 0.1808, + "grad_norm": 0.4487239911209529, + "learning_rate": 0.00018835957647383303, + "loss": 0.7787, + "step": 226 + }, + { + "epoch": 0.1816, + "grad_norm": 0.4139557396543828, + "learning_rate": 0.0001882379058198751, + "loss": 0.6914, + "step": 227 + }, + { + "epoch": 0.1824, + "grad_norm": 0.3889468554789524, + "learning_rate": 0.00018811564230959588, + "loss": 0.7042, + "step": 228 + }, + { + "epoch": 0.1832, + "grad_norm": 0.43149944673496776, + "learning_rate": 0.00018799278676446423, + "loss": 0.7721, + "step": 229 + }, + { + "epoch": 0.184, + "grad_norm": 0.36052127629596975, + "learning_rate": 0.00018786934000992688, + "loss": 0.6844, + "step": 230 + }, + { + "epoch": 0.1848, + "grad_norm": 0.5080376534107922, + "learning_rate": 0.00018774530287540278, + "loss": 0.8687, + "step": 231 + }, + { + "epoch": 0.1856, + "grad_norm": 0.45171658664266007, + "learning_rate": 0.00018762067619427746, + "loss": 0.7657, + "step": 232 + }, + { + "epoch": 0.1864, + "grad_norm": 0.4738203786339585, + "learning_rate": 0.00018749546080389757, + "loss": 0.8564, + "step": 233 + }, + { + "epoch": 0.1872, + "grad_norm": 0.48625884167655675, + "learning_rate": 0.00018736965754556528, + "loss": 0.8189, + "step": 234 + }, + { + "epoch": 0.188, + "grad_norm": 0.44129173607747263, + "learning_rate": 0.00018724326726453244, + "loss": 0.7819, + "step": 235 + }, + { + "epoch": 0.1888, + "grad_norm": 0.3984774646323755, + "learning_rate": 0.00018711629080999504, + "loss": 0.7441, + "step": 236 + }, + { + "epoch": 0.1896, + "grad_norm": 0.41477157567690254, + "learning_rate": 0.00018698872903508755, + "loss": 0.7725, + "step": 237 + }, + { + "epoch": 0.1904, + "grad_norm": 0.41718842233637154, + "learning_rate": 0.00018686058279687698, + "loss": 0.7659, + "step": 238 + }, + { + "epoch": 0.1912, + "grad_norm": 0.44251795593787574, + "learning_rate": 0.0001867318529563574, + "loss": 0.8403, + "step": 239 + }, + { + "epoch": 0.192, + "grad_norm": 0.41318693117299315, + "learning_rate": 0.00018660254037844388, + "loss": 0.7442, + "step": 240 + }, + { + "epoch": 0.1928, + "grad_norm": 0.48641842669637025, + "learning_rate": 0.00018647264593196688, + "loss": 0.8138, + "step": 241 + }, + { + "epoch": 0.1936, + "grad_norm": 0.42387328232905025, + "learning_rate": 0.00018634217048966637, + "loss": 0.8126, + "step": 242 + }, + { + "epoch": 0.1944, + "grad_norm": 0.3931945023044329, + "learning_rate": 0.00018621111492818585, + "loss": 0.7536, + "step": 243 + }, + { + "epoch": 0.1952, + "grad_norm": 0.3997122267628017, + "learning_rate": 0.0001860794801280666, + "loss": 0.7441, + "step": 244 + }, + { + "epoch": 0.196, + "grad_norm": 0.404588720437904, + "learning_rate": 0.00018594726697374175, + "loss": 0.7707, + "step": 245 + }, + { + "epoch": 0.1968, + "grad_norm": 0.45216094511412713, + "learning_rate": 0.0001858144763535302, + "loss": 0.7601, + "step": 246 + }, + { + "epoch": 0.1976, + "grad_norm": 0.4326863137511993, + "learning_rate": 0.0001856811091596308, + "loss": 0.7883, + "step": 247 + }, + { + "epoch": 0.1984, + "grad_norm": 0.4071967675769803, + "learning_rate": 0.0001855471662881164, + "loss": 0.7515, + "step": 248 + }, + { + "epoch": 0.1992, + "grad_norm": 0.41938525195695797, + "learning_rate": 0.00018541264863892754, + "loss": 0.79, + "step": 249 + }, + { + "epoch": 0.2, + "grad_norm": 0.40042410823512975, + "learning_rate": 0.00018527755711586678, + "loss": 0.7623, + "step": 250 + }, + { + "epoch": 0.2008, + "grad_norm": 0.4113599828024844, + "learning_rate": 0.00018514189262659235, + "loss": 0.7854, + "step": 251 + }, + { + "epoch": 0.2016, + "grad_norm": 0.4728098642283454, + "learning_rate": 0.00018500565608261214, + "loss": 0.9151, + "step": 252 + }, + { + "epoch": 0.2024, + "grad_norm": 0.40323520325355633, + "learning_rate": 0.00018486884839927768, + "loss": 0.7357, + "step": 253 + }, + { + "epoch": 0.2032, + "grad_norm": 0.3791079153765052, + "learning_rate": 0.00018473147049577774, + "loss": 0.7362, + "step": 254 + }, + { + "epoch": 0.204, + "grad_norm": 0.42769699257246035, + "learning_rate": 0.0001845935232951325, + "loss": 0.8031, + "step": 255 + }, + { + "epoch": 0.2048, + "grad_norm": 0.3746111112502971, + "learning_rate": 0.00018445500772418697, + "loss": 0.7349, + "step": 256 + }, + { + "epoch": 0.2056, + "grad_norm": 0.4613476961289269, + "learning_rate": 0.00018431592471360503, + "loss": 0.794, + "step": 257 + }, + { + "epoch": 0.2064, + "grad_norm": 0.4303021025297545, + "learning_rate": 0.00018417627519786315, + "loss": 0.7774, + "step": 258 + }, + { + "epoch": 0.2072, + "grad_norm": 0.42397060786526264, + "learning_rate": 0.000184036060115244, + "loss": 0.7682, + "step": 259 + }, + { + "epoch": 0.208, + "grad_norm": 0.48690701433814254, + "learning_rate": 0.00018389528040783012, + "loss": 0.8032, + "step": 260 + }, + { + "epoch": 0.2088, + "grad_norm": 0.4105137729498451, + "learning_rate": 0.00018375393702149787, + "loss": 0.7547, + "step": 261 + }, + { + "epoch": 0.2096, + "grad_norm": 0.3912642589897561, + "learning_rate": 0.00018361203090591071, + "loss": 0.7919, + "step": 262 + }, + { + "epoch": 0.2104, + "grad_norm": 0.4080005214517403, + "learning_rate": 0.00018346956301451304, + "loss": 0.7707, + "step": 263 + }, + { + "epoch": 0.2112, + "grad_norm": 0.4337941426942863, + "learning_rate": 0.00018332653430452376, + "loss": 0.802, + "step": 264 + }, + { + "epoch": 0.212, + "grad_norm": 0.40181116809323447, + "learning_rate": 0.00018318294573692985, + "loss": 0.7609, + "step": 265 + }, + { + "epoch": 0.2128, + "grad_norm": 0.41880680034037987, + "learning_rate": 0.00018303879827647975, + "loss": 0.8195, + "step": 266 + }, + { + "epoch": 0.2136, + "grad_norm": 0.42910528130461506, + "learning_rate": 0.0001828940928916772, + "loss": 0.7449, + "step": 267 + }, + { + "epoch": 0.2144, + "grad_norm": 0.41345533405075885, + "learning_rate": 0.00018274883055477436, + "loss": 0.7912, + "step": 268 + }, + { + "epoch": 0.2152, + "grad_norm": 0.4094307996948985, + "learning_rate": 0.00018260301224176558, + "loss": 0.7605, + "step": 269 + }, + { + "epoch": 0.216, + "grad_norm": 0.3895176750928822, + "learning_rate": 0.00018245663893238075, + "loss": 0.77, + "step": 270 + }, + { + "epoch": 0.2168, + "grad_norm": 0.37143027527339506, + "learning_rate": 0.00018230971161007853, + "loss": 0.716, + "step": 271 + }, + { + "epoch": 0.2176, + "grad_norm": 0.4062755623152971, + "learning_rate": 0.00018216223126204007, + "loss": 0.7575, + "step": 272 + }, + { + "epoch": 0.2184, + "grad_norm": 0.41625469536375176, + "learning_rate": 0.00018201419887916214, + "loss": 0.8403, + "step": 273 + }, + { + "epoch": 0.2192, + "grad_norm": 0.41188441901889533, + "learning_rate": 0.00018186561545605054, + "loss": 0.796, + "step": 274 + }, + { + "epoch": 0.22, + "grad_norm": 0.4341196489324385, + "learning_rate": 0.00018171648199101346, + "loss": 0.7704, + "step": 275 + }, + { + "epoch": 0.2208, + "grad_norm": 0.4532063166425961, + "learning_rate": 0.00018156679948605467, + "loss": 0.8187, + "step": 276 + }, + { + "epoch": 0.2216, + "grad_norm": 0.44091211069612063, + "learning_rate": 0.00018141656894686689, + "loss": 0.7616, + "step": 277 + }, + { + "epoch": 0.2224, + "grad_norm": 0.4069755993489469, + "learning_rate": 0.00018126579138282503, + "loss": 0.8089, + "step": 278 + }, + { + "epoch": 0.2232, + "grad_norm": 0.4382020418168502, + "learning_rate": 0.00018111446780697929, + "loss": 0.7732, + "step": 279 + }, + { + "epoch": 0.224, + "grad_norm": 0.3855838394816359, + "learning_rate": 0.0001809625992360485, + "loss": 0.7443, + "step": 280 + }, + { + "epoch": 0.2248, + "grad_norm": 0.4479039803767857, + "learning_rate": 0.00018081018669041324, + "loss": 0.7873, + "step": 281 + }, + { + "epoch": 0.2256, + "grad_norm": 0.44642356676805167, + "learning_rate": 0.00018065723119410884, + "loss": 0.854, + "step": 282 + }, + { + "epoch": 0.2264, + "grad_norm": 0.3721039804434336, + "learning_rate": 0.00018050373377481878, + "loss": 0.6817, + "step": 283 + }, + { + "epoch": 0.2272, + "grad_norm": 0.40861524285292483, + "learning_rate": 0.00018034969546386757, + "loss": 0.7492, + "step": 284 + }, + { + "epoch": 0.228, + "grad_norm": 0.4416102456142807, + "learning_rate": 0.0001801951172962139, + "loss": 0.735, + "step": 285 + }, + { + "epoch": 0.2288, + "grad_norm": 0.43530327527758694, + "learning_rate": 0.0001800400003104436, + "loss": 0.7981, + "step": 286 + }, + { + "epoch": 0.2296, + "grad_norm": 0.4374583283813062, + "learning_rate": 0.0001798843455487629, + "loss": 0.7864, + "step": 287 + }, + { + "epoch": 0.2304, + "grad_norm": 0.6698972887381797, + "learning_rate": 0.00017972815405699103, + "loss": 0.7676, + "step": 288 + }, + { + "epoch": 0.2312, + "grad_norm": 0.5739032302057141, + "learning_rate": 0.00017957142688455362, + "loss": 0.8291, + "step": 289 + }, + { + "epoch": 0.232, + "grad_norm": 0.3962191799864678, + "learning_rate": 0.00017941416508447536, + "loss": 0.7208, + "step": 290 + }, + { + "epoch": 0.2328, + "grad_norm": 0.40797488365072615, + "learning_rate": 0.00017925636971337304, + "loss": 0.7671, + "step": 291 + }, + { + "epoch": 0.2336, + "grad_norm": 0.40762829380890714, + "learning_rate": 0.0001790980418314484, + "loss": 0.7676, + "step": 292 + }, + { + "epoch": 0.2344, + "grad_norm": 0.3827737010265142, + "learning_rate": 0.00017893918250248104, + "loss": 0.7365, + "step": 293 + }, + { + "epoch": 0.2352, + "grad_norm": 0.3812057580305516, + "learning_rate": 0.00017877979279382135, + "loss": 0.751, + "step": 294 + }, + { + "epoch": 0.236, + "grad_norm": 0.37992151906117433, + "learning_rate": 0.00017861987377638312, + "loss": 0.7454, + "step": 295 + }, + { + "epoch": 0.2368, + "grad_norm": 0.3590342086282725, + "learning_rate": 0.0001784594265246366, + "loss": 0.7102, + "step": 296 + }, + { + "epoch": 0.2376, + "grad_norm": 0.3989236843068554, + "learning_rate": 0.0001782984521166011, + "loss": 0.7709, + "step": 297 + }, + { + "epoch": 0.2384, + "grad_norm": 0.380756116585582, + "learning_rate": 0.0001781369516338378, + "loss": 0.7821, + "step": 298 + }, + { + "epoch": 0.2392, + "grad_norm": 0.39172912345873656, + "learning_rate": 0.00017797492616144256, + "loss": 0.7082, + "step": 299 + }, + { + "epoch": 0.24, + "grad_norm": 0.3920341448417732, + "learning_rate": 0.00017781237678803847, + "loss": 0.7288, + "step": 300 + }, + { + "epoch": 0.2408, + "grad_norm": 0.432987963265769, + "learning_rate": 0.00017764930460576866, + "loss": 0.8452, + "step": 301 + }, + { + "epoch": 0.2416, + "grad_norm": 0.43805844914407543, + "learning_rate": 0.000177485710710289, + "loss": 0.8103, + "step": 302 + }, + { + "epoch": 0.2424, + "grad_norm": 0.39726718393916, + "learning_rate": 0.00017732159620076053, + "loss": 0.7568, + "step": 303 + }, + { + "epoch": 0.2432, + "grad_norm": 0.4927533672406732, + "learning_rate": 0.00017715696217984235, + "loss": 0.8236, + "step": 304 + }, + { + "epoch": 0.244, + "grad_norm": 0.38944794969047297, + "learning_rate": 0.00017699180975368396, + "loss": 0.6928, + "step": 305 + }, + { + "epoch": 0.2448, + "grad_norm": 0.42662679526881486, + "learning_rate": 0.00017682614003191807, + "loss": 0.7484, + "step": 306 + }, + { + "epoch": 0.2456, + "grad_norm": 0.43251903973122213, + "learning_rate": 0.00017665995412765285, + "loss": 0.7706, + "step": 307 + }, + { + "epoch": 0.2464, + "grad_norm": 0.4819805473665938, + "learning_rate": 0.00017649325315746478, + "loss": 0.8178, + "step": 308 + }, + { + "epoch": 0.2472, + "grad_norm": 0.4340033691860238, + "learning_rate": 0.00017632603824139085, + "loss": 0.7853, + "step": 309 + }, + { + "epoch": 0.248, + "grad_norm": 0.4198050552753312, + "learning_rate": 0.0001761583105029213, + "loss": 0.7306, + "step": 310 + }, + { + "epoch": 0.2488, + "grad_norm": 0.3667608095911679, + "learning_rate": 0.0001759900710689918, + "loss": 0.7313, + "step": 311 + }, + { + "epoch": 0.2496, + "grad_norm": 0.45854263014244123, + "learning_rate": 0.00017582132106997616, + "loss": 0.8512, + "step": 312 + }, + { + "epoch": 0.2504, + "grad_norm": 0.3870267720499676, + "learning_rate": 0.00017565206163967846, + "loss": 0.7389, + "step": 313 + }, + { + "epoch": 0.2512, + "grad_norm": 0.3629421043115878, + "learning_rate": 0.00017548229391532572, + "loss": 0.7279, + "step": 314 + }, + { + "epoch": 0.252, + "grad_norm": 0.45435742792899936, + "learning_rate": 0.00017531201903755994, + "loss": 0.7784, + "step": 315 + }, + { + "epoch": 0.2528, + "grad_norm": 0.38769257652266653, + "learning_rate": 0.00017514123815043074, + "loss": 0.6862, + "step": 316 + }, + { + "epoch": 0.2536, + "grad_norm": 0.43544323597179285, + "learning_rate": 0.00017496995240138744, + "loss": 0.7836, + "step": 317 + }, + { + "epoch": 0.2544, + "grad_norm": 0.4303548195748457, + "learning_rate": 0.00017479816294127152, + "loss": 0.7015, + "step": 318 + }, + { + "epoch": 0.2552, + "grad_norm": 0.3988815649130947, + "learning_rate": 0.00017462587092430875, + "loss": 0.7428, + "step": 319 + }, + { + "epoch": 0.256, + "grad_norm": 0.36695351109613955, + "learning_rate": 0.0001744530775081015, + "loss": 0.6559, + "step": 320 + }, + { + "epoch": 0.2568, + "grad_norm": 0.4536080222581322, + "learning_rate": 0.00017427978385362112, + "loss": 0.7842, + "step": 321 + }, + { + "epoch": 0.2576, + "grad_norm": 0.47821792563657084, + "learning_rate": 0.0001741059911251997, + "loss": 0.8441, + "step": 322 + }, + { + "epoch": 0.2584, + "grad_norm": 0.3704778738018211, + "learning_rate": 0.0001739317004905227, + "loss": 0.6822, + "step": 323 + }, + { + "epoch": 0.2592, + "grad_norm": 0.428327767075066, + "learning_rate": 0.000173756913120621, + "loss": 0.7791, + "step": 324 + }, + { + "epoch": 0.26, + "grad_norm": 0.3961450976529274, + "learning_rate": 0.00017358163018986282, + "loss": 0.7394, + "step": 325 + }, + { + "epoch": 0.2608, + "grad_norm": 0.41929806219428845, + "learning_rate": 0.00017340585287594604, + "loss": 0.7883, + "step": 326 + }, + { + "epoch": 0.2616, + "grad_norm": 0.3894422610473363, + "learning_rate": 0.00017322958235989016, + "loss": 0.7458, + "step": 327 + }, + { + "epoch": 0.2624, + "grad_norm": 0.36295835419452754, + "learning_rate": 0.0001730528198260285, + "loss": 0.684, + "step": 328 + }, + { + "epoch": 0.2632, + "grad_norm": 0.3569673265635979, + "learning_rate": 0.00017287556646200018, + "loss": 0.7176, + "step": 329 + }, + { + "epoch": 0.264, + "grad_norm": 0.3865910785194446, + "learning_rate": 0.00017269782345874203, + "loss": 0.7707, + "step": 330 + }, + { + "epoch": 0.2648, + "grad_norm": 0.38216796057368174, + "learning_rate": 0.00017251959201048083, + "loss": 0.704, + "step": 331 + }, + { + "epoch": 0.2656, + "grad_norm": 0.45114147371211993, + "learning_rate": 0.00017234087331472497, + "loss": 0.7484, + "step": 332 + }, + { + "epoch": 0.2664, + "grad_norm": 0.3646183328954607, + "learning_rate": 0.00017216166857225674, + "loss": 0.671, + "step": 333 + }, + { + "epoch": 0.2672, + "grad_norm": 0.4368523232160318, + "learning_rate": 0.00017198197898712404, + "loss": 0.755, + "step": 334 + }, + { + "epoch": 0.268, + "grad_norm": 0.4247692667642482, + "learning_rate": 0.00017180180576663228, + "loss": 0.7828, + "step": 335 + }, + { + "epoch": 0.2688, + "grad_norm": 0.4901671777097176, + "learning_rate": 0.00017162115012133643, + "loss": 0.8043, + "step": 336 + }, + { + "epoch": 0.2696, + "grad_norm": 0.4178507084032011, + "learning_rate": 0.00017144001326503273, + "loss": 0.834, + "step": 337 + }, + { + "epoch": 0.2704, + "grad_norm": 0.42204066116677874, + "learning_rate": 0.00017125839641475072, + "loss": 0.7418, + "step": 338 + }, + { + "epoch": 0.2712, + "grad_norm": 0.4081564035171982, + "learning_rate": 0.00017107630079074478, + "loss": 0.7796, + "step": 339 + }, + { + "epoch": 0.272, + "grad_norm": 0.40402309874689885, + "learning_rate": 0.00017089372761648616, + "loss": 0.7859, + "step": 340 + }, + { + "epoch": 0.2728, + "grad_norm": 0.42467107811937443, + "learning_rate": 0.00017071067811865476, + "loss": 0.754, + "step": 341 + }, + { + "epoch": 0.2736, + "grad_norm": 0.3954585462658397, + "learning_rate": 0.00017052715352713075, + "loss": 0.7454, + "step": 342 + }, + { + "epoch": 0.2744, + "grad_norm": 0.4699401441706718, + "learning_rate": 0.00017034315507498635, + "loss": 0.7971, + "step": 343 + }, + { + "epoch": 0.2752, + "grad_norm": 0.43203384628124064, + "learning_rate": 0.00017015868399847768, + "loss": 0.7608, + "step": 344 + }, + { + "epoch": 0.276, + "grad_norm": 0.3989431542163795, + "learning_rate": 0.00016997374153703625, + "loss": 0.7594, + "step": 345 + }, + { + "epoch": 0.2768, + "grad_norm": 0.5004464475895747, + "learning_rate": 0.00016978832893326074, + "loss": 0.8348, + "step": 346 + }, + { + "epoch": 0.2776, + "grad_norm": 0.44866367241052296, + "learning_rate": 0.00016960244743290868, + "loss": 0.8461, + "step": 347 + }, + { + "epoch": 0.2784, + "grad_norm": 0.4208747472469387, + "learning_rate": 0.00016941609828488807, + "loss": 0.7646, + "step": 348 + }, + { + "epoch": 0.2792, + "grad_norm": 0.4250138345747438, + "learning_rate": 0.00016922928274124886, + "loss": 0.7661, + "step": 349 + }, + { + "epoch": 0.28, + "grad_norm": 0.42876210270998055, + "learning_rate": 0.0001690420020571747, + "loss": 0.7558, + "step": 350 + }, + { + "epoch": 0.2808, + "grad_norm": 0.3919074809576166, + "learning_rate": 0.00016885425749097444, + "loss": 0.731, + "step": 351 + }, + { + "epoch": 0.2816, + "grad_norm": 0.4390458646036582, + "learning_rate": 0.0001686660503040737, + "loss": 0.8194, + "step": 352 + }, + { + "epoch": 0.2824, + "grad_norm": 0.40728141579339516, + "learning_rate": 0.00016847738176100632, + "loss": 0.7575, + "step": 353 + }, + { + "epoch": 0.2832, + "grad_norm": 0.4007468595299471, + "learning_rate": 0.00016828825312940592, + "loss": 0.7493, + "step": 354 + }, + { + "epoch": 0.284, + "grad_norm": 1.4849889980591207, + "learning_rate": 0.0001680986656799975, + "loss": 0.7374, + "step": 355 + }, + { + "epoch": 0.2848, + "grad_norm": 0.45413652126351345, + "learning_rate": 0.0001679086206865886, + "loss": 0.7726, + "step": 356 + }, + { + "epoch": 0.2856, + "grad_norm": 0.345963315677467, + "learning_rate": 0.00016771811942606108, + "loss": 0.6959, + "step": 357 + }, + { + "epoch": 0.2864, + "grad_norm": 0.4270195519889394, + "learning_rate": 0.00016752716317836229, + "loss": 0.8031, + "step": 358 + }, + { + "epoch": 0.2872, + "grad_norm": 0.443768314112945, + "learning_rate": 0.00016733575322649657, + "loss": 0.7047, + "step": 359 + }, + { + "epoch": 0.288, + "grad_norm": 0.44007610886830467, + "learning_rate": 0.0001671438908565167, + "loss": 0.8055, + "step": 360 + }, + { + "epoch": 0.2888, + "grad_norm": 0.41782305359904554, + "learning_rate": 0.00016695157735751513, + "loss": 0.7742, + "step": 361 + }, + { + "epoch": 0.2896, + "grad_norm": 0.46127543477045563, + "learning_rate": 0.00016675881402161536, + "loss": 0.812, + "step": 362 + }, + { + "epoch": 0.2904, + "grad_norm": 0.4708279566376157, + "learning_rate": 0.0001665656021439633, + "loss": 0.8455, + "step": 363 + }, + { + "epoch": 0.2912, + "grad_norm": 0.3820793856650901, + "learning_rate": 0.0001663719430227186, + "loss": 0.7407, + "step": 364 + }, + { + "epoch": 0.292, + "grad_norm": 0.4418698534422898, + "learning_rate": 0.00016617783795904565, + "loss": 0.8059, + "step": 365 + }, + { + "epoch": 0.2928, + "grad_norm": 0.42559259411748557, + "learning_rate": 0.00016598328825710533, + "loss": 0.7907, + "step": 366 + }, + { + "epoch": 0.2936, + "grad_norm": 0.40526252997293044, + "learning_rate": 0.00016578829522404583, + "loss": 0.7387, + "step": 367 + }, + { + "epoch": 0.2944, + "grad_norm": 0.4537639887124046, + "learning_rate": 0.000165592860169994, + "loss": 0.8205, + "step": 368 + }, + { + "epoch": 0.2952, + "grad_norm": 0.41607678697428313, + "learning_rate": 0.00016539698440804661, + "loss": 0.7634, + "step": 369 + }, + { + "epoch": 0.296, + "grad_norm": 0.46647291980529587, + "learning_rate": 0.00016520066925426144, + "loss": 0.8456, + "step": 370 + }, + { + "epoch": 0.2968, + "grad_norm": 0.42211238390913575, + "learning_rate": 0.0001650039160276485, + "loss": 0.7943, + "step": 371 + }, + { + "epoch": 0.2976, + "grad_norm": 0.7479041557290732, + "learning_rate": 0.0001648067260501611, + "loss": 0.8312, + "step": 372 + }, + { + "epoch": 0.2984, + "grad_norm": 0.44974913045518755, + "learning_rate": 0.0001646091006466871, + "loss": 0.7288, + "step": 373 + }, + { + "epoch": 0.2992, + "grad_norm": 0.42900284177400566, + "learning_rate": 0.0001644110411450398, + "loss": 0.7376, + "step": 374 + }, + { + "epoch": 0.3, + "grad_norm": 0.43745295104653914, + "learning_rate": 0.00016421254887594917, + "loss": 0.8216, + "step": 375 + }, + { + "epoch": 0.3008, + "grad_norm": 0.34707937466301986, + "learning_rate": 0.00016401362517305296, + "loss": 0.6859, + "step": 376 + }, + { + "epoch": 0.3016, + "grad_norm": 0.4857945786176909, + "learning_rate": 0.00016381427137288754, + "loss": 0.8421, + "step": 377 + }, + { + "epoch": 0.3024, + "grad_norm": 0.39121290861369795, + "learning_rate": 0.00016361448881487914, + "loss": 0.7159, + "step": 378 + }, + { + "epoch": 0.3032, + "grad_norm": 0.46354388130478685, + "learning_rate": 0.0001634142788413346, + "loss": 0.7939, + "step": 379 + }, + { + "epoch": 0.304, + "grad_norm": 0.4270592250949217, + "learning_rate": 0.00016321364279743266, + "loss": 0.8159, + "step": 380 + }, + { + "epoch": 0.3048, + "grad_norm": 0.3871489562745451, + "learning_rate": 0.00016301258203121462, + "loss": 0.7152, + "step": 381 + }, + { + "epoch": 0.3056, + "grad_norm": 0.43235168230976634, + "learning_rate": 0.0001628110978935756, + "loss": 0.7932, + "step": 382 + }, + { + "epoch": 0.3064, + "grad_norm": 0.39913774093727283, + "learning_rate": 0.00016260919173825508, + "loss": 0.7317, + "step": 383 + }, + { + "epoch": 0.3072, + "grad_norm": 0.4100573891738185, + "learning_rate": 0.00016240686492182804, + "loss": 0.7645, + "step": 384 + }, + { + "epoch": 0.308, + "grad_norm": 0.4231866067832828, + "learning_rate": 0.00016220411880369601, + "loss": 0.7223, + "step": 385 + }, + { + "epoch": 0.3088, + "grad_norm": 0.43092174954976703, + "learning_rate": 0.00016200095474607753, + "loss": 0.7011, + "step": 386 + }, + { + "epoch": 0.3096, + "grad_norm": 0.44583569757552366, + "learning_rate": 0.00016179737411399926, + "loss": 0.8321, + "step": 387 + }, + { + "epoch": 0.3104, + "grad_norm": 0.4484364582764281, + "learning_rate": 0.00016159337827528685, + "loss": 0.7902, + "step": 388 + }, + { + "epoch": 0.3112, + "grad_norm": 0.3756430709642739, + "learning_rate": 0.00016138896860055555, + "loss": 0.7238, + "step": 389 + }, + { + "epoch": 0.312, + "grad_norm": 0.4302534613418457, + "learning_rate": 0.0001611841464632011, + "loss": 0.7639, + "step": 390 + }, + { + "epoch": 0.3128, + "grad_norm": 0.4366161009671638, + "learning_rate": 0.00016097891323939062, + "loss": 0.8006, + "step": 391 + }, + { + "epoch": 0.3136, + "grad_norm": 0.4080078257128167, + "learning_rate": 0.0001607732703080532, + "loss": 0.7529, + "step": 392 + }, + { + "epoch": 0.3144, + "grad_norm": 0.38655595838544954, + "learning_rate": 0.00016056721905087056, + "loss": 0.7384, + "step": 393 + }, + { + "epoch": 0.3152, + "grad_norm": 0.42310522433274306, + "learning_rate": 0.00016036076085226814, + "loss": 0.6947, + "step": 394 + }, + { + "epoch": 0.316, + "grad_norm": 0.4425047945464679, + "learning_rate": 0.00016015389709940538, + "loss": 0.7619, + "step": 395 + }, + { + "epoch": 0.3168, + "grad_norm": 0.37997601824260263, + "learning_rate": 0.0001599466291821666, + "loss": 0.7174, + "step": 396 + }, + { + "epoch": 0.3176, + "grad_norm": 0.4546294953554832, + "learning_rate": 0.0001597389584931517, + "loss": 0.7458, + "step": 397 + }, + { + "epoch": 0.3184, + "grad_norm": 0.39330551079256165, + "learning_rate": 0.0001595308864276666, + "loss": 0.6811, + "step": 398 + }, + { + "epoch": 0.3192, + "grad_norm": 0.45042609210794154, + "learning_rate": 0.0001593224143837142, + "loss": 0.7373, + "step": 399 + }, + { + "epoch": 0.32, + "grad_norm": 0.3637892815630396, + "learning_rate": 0.0001591135437619847, + "loss": 0.6868, + "step": 400 + }, + { + "epoch": 0.3208, + "grad_norm": 0.40543358824559494, + "learning_rate": 0.00015890427596584617, + "loss": 0.758, + "step": 401 + }, + { + "epoch": 0.3216, + "grad_norm": 0.4121138567827428, + "learning_rate": 0.0001586946124013354, + "loss": 0.7679, + "step": 402 + }, + { + "epoch": 0.3224, + "grad_norm": 0.40666299722383215, + "learning_rate": 0.00015848455447714822, + "loss": 0.7196, + "step": 403 + }, + { + "epoch": 0.3232, + "grad_norm": 0.41336832248445105, + "learning_rate": 0.0001582741036046301, + "loss": 0.7178, + "step": 404 + }, + { + "epoch": 0.324, + "grad_norm": 0.44566015950782456, + "learning_rate": 0.00015806326119776663, + "loss": 0.8355, + "step": 405 + }, + { + "epoch": 0.3248, + "grad_norm": 0.41409975993872233, + "learning_rate": 0.00015785202867317407, + "loss": 0.7177, + "step": 406 + }, + { + "epoch": 0.3256, + "grad_norm": 0.42395424691211214, + "learning_rate": 0.00015764040745008988, + "loss": 0.8186, + "step": 407 + }, + { + "epoch": 0.3264, + "grad_norm": 0.346635875303523, + "learning_rate": 0.00015742839895036305, + "loss": 0.6821, + "step": 408 + }, + { + "epoch": 0.3272, + "grad_norm": 0.4176984516392487, + "learning_rate": 0.00015721600459844468, + "loss": 0.797, + "step": 409 + }, + { + "epoch": 0.328, + "grad_norm": 0.45983357652109397, + "learning_rate": 0.00015700322582137827, + "loss": 0.8478, + "step": 410 + }, + { + "epoch": 0.3288, + "grad_norm": 0.4077699212094806, + "learning_rate": 0.00015679006404879033, + "loss": 0.7857, + "step": 411 + }, + { + "epoch": 0.3296, + "grad_norm": 0.36392275707834104, + "learning_rate": 0.0001565765207128805, + "loss": 0.6917, + "step": 412 + }, + { + "epoch": 0.3304, + "grad_norm": 0.40663804131126274, + "learning_rate": 0.00015636259724841222, + "loss": 0.7344, + "step": 413 + }, + { + "epoch": 0.3312, + "grad_norm": 0.4342493920150342, + "learning_rate": 0.0001561482950927029, + "loss": 0.8365, + "step": 414 + }, + { + "epoch": 0.332, + "grad_norm": 0.4227117158794363, + "learning_rate": 0.00015593361568561428, + "loss": 0.7778, + "step": 415 + }, + { + "epoch": 0.3328, + "grad_norm": 0.41270891685667144, + "learning_rate": 0.00015571856046954285, + "loss": 0.7261, + "step": 416 + }, + { + "epoch": 0.3336, + "grad_norm": 0.4158768104130257, + "learning_rate": 0.0001555031308894101, + "loss": 0.7938, + "step": 417 + }, + { + "epoch": 0.3344, + "grad_norm": 0.3775853119004546, + "learning_rate": 0.00015528732839265272, + "loss": 0.7217, + "step": 418 + }, + { + "epoch": 0.3352, + "grad_norm": 0.3941198166676895, + "learning_rate": 0.0001550711544292131, + "loss": 0.6884, + "step": 419 + }, + { + "epoch": 0.336, + "grad_norm": 0.42915734195324934, + "learning_rate": 0.0001548546104515294, + "loss": 0.7512, + "step": 420 + }, + { + "epoch": 0.3368, + "grad_norm": 0.38564142369791643, + "learning_rate": 0.00015463769791452574, + "loss": 0.6865, + "step": 421 + }, + { + "epoch": 0.3376, + "grad_norm": 0.3693977836154458, + "learning_rate": 0.00015442041827560274, + "loss": 0.6848, + "step": 422 + }, + { + "epoch": 0.3384, + "grad_norm": 0.4200972826590631, + "learning_rate": 0.00015420277299462736, + "loss": 0.7806, + "step": 423 + }, + { + "epoch": 0.3392, + "grad_norm": 0.4117289702100441, + "learning_rate": 0.00015398476353392323, + "loss": 0.7887, + "step": 424 + }, + { + "epoch": 0.34, + "grad_norm": 0.3958065401208961, + "learning_rate": 0.00015376639135826107, + "loss": 0.7447, + "step": 425 + }, + { + "epoch": 0.3408, + "grad_norm": 0.379122263890142, + "learning_rate": 0.00015354765793484834, + "loss": 0.7405, + "step": 426 + }, + { + "epoch": 0.3416, + "grad_norm": 0.38761228847035534, + "learning_rate": 0.00015332856473331978, + "loss": 0.7099, + "step": 427 + }, + { + "epoch": 0.3424, + "grad_norm": 0.37619366133112303, + "learning_rate": 0.00015310911322572753, + "loss": 0.7609, + "step": 428 + }, + { + "epoch": 0.3432, + "grad_norm": 0.4562285195993237, + "learning_rate": 0.00015288930488653094, + "loss": 0.7751, + "step": 429 + }, + { + "epoch": 0.344, + "grad_norm": 0.41552372433757795, + "learning_rate": 0.000152669141192587, + "loss": 0.7491, + "step": 430 + }, + { + "epoch": 0.3448, + "grad_norm": 0.45218399720416264, + "learning_rate": 0.0001524486236231402, + "loss": 0.8822, + "step": 431 + }, + { + "epoch": 0.3456, + "grad_norm": 0.36646616666120246, + "learning_rate": 0.00015222775365981273, + "loss": 0.7087, + "step": 432 + }, + { + "epoch": 0.3464, + "grad_norm": 0.4440092106035693, + "learning_rate": 0.00015200653278659432, + "loss": 0.7601, + "step": 433 + }, + { + "epoch": 0.3472, + "grad_norm": 0.4264021944001832, + "learning_rate": 0.00015178496248983254, + "loss": 0.7661, + "step": 434 + }, + { + "epoch": 0.348, + "grad_norm": 0.4830509634544988, + "learning_rate": 0.00015156304425822267, + "loss": 0.7945, + "step": 435 + }, + { + "epoch": 0.3488, + "grad_norm": 0.47615679985015075, + "learning_rate": 0.00015134077958279765, + "loss": 0.8693, + "step": 436 + }, + { + "epoch": 0.3496, + "grad_norm": 0.405127275114156, + "learning_rate": 0.00015111816995691809, + "loss": 0.7853, + "step": 437 + }, + { + "epoch": 0.3504, + "grad_norm": 0.43826022955665683, + "learning_rate": 0.00015089521687626243, + "loss": 0.7163, + "step": 438 + }, + { + "epoch": 0.3512, + "grad_norm": 0.43433703098792414, + "learning_rate": 0.00015067192183881658, + "loss": 0.7845, + "step": 439 + }, + { + "epoch": 0.352, + "grad_norm": 0.4056094338704598, + "learning_rate": 0.000150448286344864, + "loss": 0.7391, + "step": 440 + }, + { + "epoch": 0.3528, + "grad_norm": 0.5160656198284714, + "learning_rate": 0.00015022431189697568, + "loss": 0.8635, + "step": 441 + }, + { + "epoch": 0.3536, + "grad_norm": 0.3876592269599419, + "learning_rate": 0.00015000000000000001, + "loss": 0.7177, + "step": 442 + }, + { + "epoch": 0.3544, + "grad_norm": 0.424270065327115, + "learning_rate": 0.0001497753521610526, + "loss": 0.7222, + "step": 443 + }, + { + "epoch": 0.3552, + "grad_norm": 0.37854882642483223, + "learning_rate": 0.00014955036988950618, + "loss": 0.7027, + "step": 444 + }, + { + "epoch": 0.356, + "grad_norm": 0.431669147423509, + "learning_rate": 0.00014932505469698052, + "loss": 0.7044, + "step": 445 + }, + { + "epoch": 0.3568, + "grad_norm": 0.40362916033901547, + "learning_rate": 0.00014909940809733222, + "loss": 0.7245, + "step": 446 + }, + { + "epoch": 0.3576, + "grad_norm": 0.3651916039821122, + "learning_rate": 0.0001488734316066446, + "loss": 0.7154, + "step": 447 + }, + { + "epoch": 0.3584, + "grad_norm": 0.3948131618499892, + "learning_rate": 0.00014864712674321734, + "loss": 0.7141, + "step": 448 + }, + { + "epoch": 0.3592, + "grad_norm": 0.43297346709780593, + "learning_rate": 0.0001484204950275565, + "loss": 0.7856, + "step": 449 + }, + { + "epoch": 0.36, + "grad_norm": 0.4081859143003985, + "learning_rate": 0.00014819353798236427, + "loss": 0.7498, + "step": 450 + }, + { + "epoch": 0.3608, + "grad_norm": 0.4239090344818062, + "learning_rate": 0.00014796625713252848, + "loss": 0.7557, + "step": 451 + }, + { + "epoch": 0.3616, + "grad_norm": 0.3956920569834454, + "learning_rate": 0.00014773865400511272, + "loss": 0.7802, + "step": 452 + }, + { + "epoch": 0.3624, + "grad_norm": 0.38601196388397474, + "learning_rate": 0.00014751073012934587, + "loss": 0.738, + "step": 453 + }, + { + "epoch": 0.3632, + "grad_norm": 0.42270336528219926, + "learning_rate": 0.00014728248703661182, + "loss": 0.7315, + "step": 454 + }, + { + "epoch": 0.364, + "grad_norm": 0.4199661760942862, + "learning_rate": 0.0001470539262604393, + "loss": 0.7757, + "step": 455 + }, + { + "epoch": 0.3648, + "grad_norm": 0.39532678138017263, + "learning_rate": 0.00014682504933649144, + "loss": 0.7166, + "step": 456 + }, + { + "epoch": 0.3656, + "grad_norm": 0.3451845364961653, + "learning_rate": 0.00014659585780255556, + "loss": 0.6645, + "step": 457 + }, + { + "epoch": 0.3664, + "grad_norm": 0.3759791521821676, + "learning_rate": 0.00014636635319853275, + "loss": 0.7803, + "step": 458 + }, + { + "epoch": 0.3672, + "grad_norm": 0.3946098699766991, + "learning_rate": 0.0001461365370664276, + "loss": 0.74, + "step": 459 + }, + { + "epoch": 0.368, + "grad_norm": 0.42363585128536285, + "learning_rate": 0.00014590641095033787, + "loss": 0.7539, + "step": 460 + }, + { + "epoch": 0.3688, + "grad_norm": 0.4292403665539383, + "learning_rate": 0.00014567597639644387, + "loss": 0.704, + "step": 461 + }, + { + "epoch": 0.3696, + "grad_norm": 0.4223905551143545, + "learning_rate": 0.00014544523495299842, + "loss": 0.7423, + "step": 462 + }, + { + "epoch": 0.3704, + "grad_norm": 0.4083339126716739, + "learning_rate": 0.00014521418817031628, + "loss": 0.7534, + "step": 463 + }, + { + "epoch": 0.3712, + "grad_norm": 0.4266340273316555, + "learning_rate": 0.0001449828376007636, + "loss": 0.7944, + "step": 464 + }, + { + "epoch": 0.372, + "grad_norm": 0.4244010609467904, + "learning_rate": 0.00014475118479874774, + "loss": 0.7322, + "step": 465 + }, + { + "epoch": 0.3728, + "grad_norm": 0.3792106682271149, + "learning_rate": 0.0001445192313207067, + "loss": 0.6975, + "step": 466 + }, + { + "epoch": 0.3736, + "grad_norm": 0.3978947925627141, + "learning_rate": 0.0001442869787250987, + "loss": 0.7467, + "step": 467 + }, + { + "epoch": 0.3744, + "grad_norm": 0.4470667818288522, + "learning_rate": 0.0001440544285723915, + "loss": 0.7744, + "step": 468 + }, + { + "epoch": 0.3752, + "grad_norm": 0.42487360452471123, + "learning_rate": 0.00014382158242505234, + "loss": 0.7553, + "step": 469 + }, + { + "epoch": 0.376, + "grad_norm": 0.409805908518081, + "learning_rate": 0.00014358844184753712, + "loss": 0.7162, + "step": 470 + }, + { + "epoch": 0.3768, + "grad_norm": 0.4529557352357633, + "learning_rate": 0.00014335500840627986, + "loss": 0.7749, + "step": 471 + }, + { + "epoch": 0.3776, + "grad_norm": 0.399071833197901, + "learning_rate": 0.00014312128366968243, + "loss": 0.8104, + "step": 472 + }, + { + "epoch": 0.3784, + "grad_norm": 0.4115889798381014, + "learning_rate": 0.0001428872692081038, + "loss": 0.8162, + "step": 473 + }, + { + "epoch": 0.3792, + "grad_norm": 0.37200763367107226, + "learning_rate": 0.00014265296659384956, + "loss": 0.7473, + "step": 474 + }, + { + "epoch": 0.38, + "grad_norm": 0.3633670300337601, + "learning_rate": 0.00014241837740116132, + "loss": 0.7011, + "step": 475 + }, + { + "epoch": 0.3808, + "grad_norm": 0.3858944272926558, + "learning_rate": 0.00014218350320620624, + "loss": 0.7235, + "step": 476 + }, + { + "epoch": 0.3816, + "grad_norm": 0.40358992250738557, + "learning_rate": 0.00014194834558706632, + "loss": 0.7344, + "step": 477 + }, + { + "epoch": 0.3824, + "grad_norm": 0.4165317885836963, + "learning_rate": 0.0001417129061237278, + "loss": 0.7464, + "step": 478 + }, + { + "epoch": 0.3832, + "grad_norm": 0.4161390699346155, + "learning_rate": 0.0001414771863980707, + "loss": 0.8025, + "step": 479 + }, + { + "epoch": 0.384, + "grad_norm": 0.5327238138409245, + "learning_rate": 0.00014124118799385796, + "loss": 0.9732, + "step": 480 + }, + { + "epoch": 0.3848, + "grad_norm": 0.37746784110298387, + "learning_rate": 0.00014100491249672498, + "loss": 0.6851, + "step": 481 + }, + { + "epoch": 0.3856, + "grad_norm": 0.4096403959250972, + "learning_rate": 0.00014076836149416887, + "loss": 0.7213, + "step": 482 + }, + { + "epoch": 0.3864, + "grad_norm": 0.36094519904258365, + "learning_rate": 0.0001405315365755379, + "loss": 0.6588, + "step": 483 + }, + { + "epoch": 0.3872, + "grad_norm": 0.3926746452925469, + "learning_rate": 0.0001402944393320206, + "loss": 0.7005, + "step": 484 + }, + { + "epoch": 0.388, + "grad_norm": 0.42257281513267286, + "learning_rate": 0.00014005707135663527, + "loss": 0.8084, + "step": 485 + }, + { + "epoch": 0.3888, + "grad_norm": 0.3424676525058974, + "learning_rate": 0.00013981943424421932, + "loss": 0.638, + "step": 486 + }, + { + "epoch": 0.3896, + "grad_norm": 0.38916858808887456, + "learning_rate": 0.00013958152959141825, + "loss": 0.7487, + "step": 487 + }, + { + "epoch": 0.3904, + "grad_norm": 0.4246159738231621, + "learning_rate": 0.00013934335899667527, + "loss": 0.773, + "step": 488 + }, + { + "epoch": 0.3912, + "grad_norm": 0.4281633221166481, + "learning_rate": 0.00013910492406022033, + "loss": 0.7671, + "step": 489 + }, + { + "epoch": 0.392, + "grad_norm": 0.48387099878159534, + "learning_rate": 0.00013886622638405952, + "loss": 0.8217, + "step": 490 + }, + { + "epoch": 0.3928, + "grad_norm": 0.3703953881605659, + "learning_rate": 0.0001386272675719642, + "loss": 0.7481, + "step": 491 + }, + { + "epoch": 0.3936, + "grad_norm": 0.38374095321400503, + "learning_rate": 0.00013838804922946027, + "loss": 0.7198, + "step": 492 + }, + { + "epoch": 0.3944, + "grad_norm": 0.44187232005789906, + "learning_rate": 0.00013814857296381728, + "loss": 0.7858, + "step": 493 + }, + { + "epoch": 0.3952, + "grad_norm": 0.42479757659136425, + "learning_rate": 0.00013790884038403795, + "loss": 0.7582, + "step": 494 + }, + { + "epoch": 0.396, + "grad_norm": 0.37778309773123786, + "learning_rate": 0.00013766885310084688, + "loss": 0.7027, + "step": 495 + }, + { + "epoch": 0.3968, + "grad_norm": 0.38595074168769156, + "learning_rate": 0.00013742861272668012, + "loss": 0.6871, + "step": 496 + }, + { + "epoch": 0.3976, + "grad_norm": 0.47644774807222423, + "learning_rate": 0.00013718812087567414, + "loss": 0.8481, + "step": 497 + }, + { + "epoch": 0.3984, + "grad_norm": 0.4438823472718032, + "learning_rate": 0.00013694737916365517, + "loss": 0.7446, + "step": 498 + }, + { + "epoch": 0.3992, + "grad_norm": 0.39444101812002935, + "learning_rate": 0.000136706389208128, + "loss": 0.7029, + "step": 499 + }, + { + "epoch": 0.4, + "grad_norm": 0.3989016333638246, + "learning_rate": 0.00013646515262826552, + "loss": 0.7502, + "step": 500 + }, + { + "epoch": 0.4008, + "grad_norm": 0.3929916527658517, + "learning_rate": 0.00013622367104489756, + "loss": 0.7168, + "step": 501 + }, + { + "epoch": 0.4016, + "grad_norm": 0.4330542431318848, + "learning_rate": 0.0001359819460805001, + "loss": 0.7895, + "step": 502 + }, + { + "epoch": 0.4024, + "grad_norm": 0.46511998999871673, + "learning_rate": 0.0001357399793591844, + "loss": 0.779, + "step": 503 + }, + { + "epoch": 0.4032, + "grad_norm": 0.4120613132882409, + "learning_rate": 0.0001354977725066859, + "loss": 0.7321, + "step": 504 + }, + { + "epoch": 0.404, + "grad_norm": 0.42892550263482604, + "learning_rate": 0.00013525532715035366, + "loss": 0.7812, + "step": 505 + }, + { + "epoch": 0.4048, + "grad_norm": 0.41065320820919565, + "learning_rate": 0.00013501264491913906, + "loss": 0.7743, + "step": 506 + }, + { + "epoch": 0.4056, + "grad_norm": 0.38414982621095234, + "learning_rate": 0.00013476972744358507, + "loss": 0.6896, + "step": 507 + }, + { + "epoch": 0.4064, + "grad_norm": 0.3955180803765117, + "learning_rate": 0.0001345265763558152, + "loss": 0.7284, + "step": 508 + }, + { + "epoch": 0.4072, + "grad_norm": 0.43118579124945994, + "learning_rate": 0.00013428319328952253, + "loss": 0.7303, + "step": 509 + }, + { + "epoch": 0.408, + "grad_norm": 0.3803715757708683, + "learning_rate": 0.00013403957987995882, + "loss": 0.6969, + "step": 510 + }, + { + "epoch": 0.4088, + "grad_norm": 0.42238276187713786, + "learning_rate": 0.0001337957377639235, + "loss": 0.7433, + "step": 511 + }, + { + "epoch": 0.4096, + "grad_norm": 0.43112653439928567, + "learning_rate": 0.0001335516685797525, + "loss": 0.7976, + "step": 512 + }, + { + "epoch": 0.4104, + "grad_norm": 0.4177853496713614, + "learning_rate": 0.0001333073739673076, + "loss": 0.7387, + "step": 513 + }, + { + "epoch": 0.4112, + "grad_norm": 0.3876745806420402, + "learning_rate": 0.00013306285556796495, + "loss": 0.6952, + "step": 514 + }, + { + "epoch": 0.412, + "grad_norm": 0.3839120757027477, + "learning_rate": 0.0001328181150246045, + "loss": 0.6984, + "step": 515 + }, + { + "epoch": 0.4128, + "grad_norm": 0.430712755838551, + "learning_rate": 0.00013257315398159864, + "loss": 0.7869, + "step": 516 + }, + { + "epoch": 0.4136, + "grad_norm": 0.36874939887173747, + "learning_rate": 0.00013232797408480127, + "loss": 0.7045, + "step": 517 + }, + { + "epoch": 0.4144, + "grad_norm": 0.39771894299769867, + "learning_rate": 0.00013208257698153677, + "loss": 0.7074, + "step": 518 + }, + { + "epoch": 0.4152, + "grad_norm": 0.40636985692015276, + "learning_rate": 0.00013183696432058888, + "loss": 0.7234, + "step": 519 + }, + { + "epoch": 0.416, + "grad_norm": 0.4362192462858971, + "learning_rate": 0.00013159113775218964, + "loss": 0.7513, + "step": 520 + }, + { + "epoch": 0.4168, + "grad_norm": 0.41059237448087205, + "learning_rate": 0.00013134509892800822, + "loss": 0.7866, + "step": 521 + }, + { + "epoch": 0.4176, + "grad_norm": 0.45428657486743507, + "learning_rate": 0.00013109884950114007, + "loss": 0.7614, + "step": 522 + }, + { + "epoch": 0.4184, + "grad_norm": 0.3826319609724033, + "learning_rate": 0.00013085239112609547, + "loss": 0.7032, + "step": 523 + }, + { + "epoch": 0.4192, + "grad_norm": 0.3986937022304439, + "learning_rate": 0.00013060572545878875, + "loss": 0.6939, + "step": 524 + }, + { + "epoch": 0.42, + "grad_norm": 0.3855053555062931, + "learning_rate": 0.00013035885415652685, + "loss": 0.6806, + "step": 525 + }, + { + "epoch": 0.4208, + "grad_norm": 0.422175727203896, + "learning_rate": 0.00013011177887799845, + "loss": 0.751, + "step": 526 + }, + { + "epoch": 0.4216, + "grad_norm": 0.4114716181367486, + "learning_rate": 0.00012986450128326266, + "loss": 0.7218, + "step": 527 + }, + { + "epoch": 0.4224, + "grad_norm": 0.42006078486237336, + "learning_rate": 0.00012961702303373795, + "loss": 0.7178, + "step": 528 + }, + { + "epoch": 0.4232, + "grad_norm": 0.37219227951366723, + "learning_rate": 0.00012936934579219094, + "loss": 0.6568, + "step": 529 + }, + { + "epoch": 0.424, + "grad_norm": 0.4059513618713587, + "learning_rate": 0.00012912147122272523, + "loss": 0.7555, + "step": 530 + }, + { + "epoch": 0.4248, + "grad_norm": 0.4305993332670556, + "learning_rate": 0.00012887340099077024, + "loss": 0.7678, + "step": 531 + }, + { + "epoch": 0.4256, + "grad_norm": 0.35206527911535446, + "learning_rate": 0.00012862513676307008, + "loss": 0.6382, + "step": 532 + }, + { + "epoch": 0.4264, + "grad_norm": 0.41113202720973163, + "learning_rate": 0.0001283766802076722, + "loss": 0.7057, + "step": 533 + }, + { + "epoch": 0.4272, + "grad_norm": 0.3901613555563609, + "learning_rate": 0.00012812803299391628, + "loss": 0.7342, + "step": 534 + }, + { + "epoch": 0.428, + "grad_norm": 0.4113244043759376, + "learning_rate": 0.00012787919679242306, + "loss": 0.78, + "step": 535 + }, + { + "epoch": 0.4288, + "grad_norm": 0.4017800562369879, + "learning_rate": 0.00012763017327508305, + "loss": 0.744, + "step": 536 + }, + { + "epoch": 0.4296, + "grad_norm": 0.41562416829574583, + "learning_rate": 0.00012738096411504522, + "loss": 0.7514, + "step": 537 + }, + { + "epoch": 0.4304, + "grad_norm": 0.39518105073910975, + "learning_rate": 0.0001271315709867059, + "loss": 0.6914, + "step": 538 + }, + { + "epoch": 0.4312, + "grad_norm": 0.3695062349334831, + "learning_rate": 0.00012688199556569753, + "loss": 0.7162, + "step": 539 + }, + { + "epoch": 0.432, + "grad_norm": 0.3821071049370452, + "learning_rate": 0.00012663223952887723, + "loss": 0.7041, + "step": 540 + }, + { + "epoch": 0.4328, + "grad_norm": 0.3895151331287037, + "learning_rate": 0.0001263823045543158, + "loss": 0.7108, + "step": 541 + }, + { + "epoch": 0.4336, + "grad_norm": 0.4555783174734416, + "learning_rate": 0.00012613219232128608, + "loss": 0.7449, + "step": 542 + }, + { + "epoch": 0.4344, + "grad_norm": 0.40261820838834467, + "learning_rate": 0.00012588190451025207, + "loss": 0.8089, + "step": 543 + }, + { + "epoch": 0.4352, + "grad_norm": 0.4537195432426405, + "learning_rate": 0.00012563144280285741, + "loss": 0.8016, + "step": 544 + }, + { + "epoch": 0.436, + "grad_norm": 0.41674676712927666, + "learning_rate": 0.00012538080888191408, + "loss": 0.7191, + "step": 545 + }, + { + "epoch": 0.4368, + "grad_norm": 0.3615959762032306, + "learning_rate": 0.00012513000443139112, + "loss": 0.6631, + "step": 546 + }, + { + "epoch": 0.4376, + "grad_norm": 0.4045962545723302, + "learning_rate": 0.00012487903113640337, + "loss": 0.731, + "step": 547 + }, + { + "epoch": 0.4384, + "grad_norm": 0.39050807685917494, + "learning_rate": 0.00012462789068320017, + "loss": 0.7772, + "step": 548 + }, + { + "epoch": 0.4392, + "grad_norm": 0.3757834980550243, + "learning_rate": 0.00012437658475915377, + "loss": 0.6894, + "step": 549 + }, + { + "epoch": 0.44, + "grad_norm": 0.3689254682213251, + "learning_rate": 0.00012412511505274844, + "loss": 0.6835, + "step": 550 + }, + { + "epoch": 0.4408, + "grad_norm": 0.42097646607624756, + "learning_rate": 0.00012387348325356874, + "loss": 0.7117, + "step": 551 + }, + { + "epoch": 0.4416, + "grad_norm": 0.3528241671936896, + "learning_rate": 0.00012362169105228826, + "loss": 0.6413, + "step": 552 + }, + { + "epoch": 0.4424, + "grad_norm": 0.3809898049081442, + "learning_rate": 0.00012336974014065844, + "loss": 0.7273, + "step": 553 + }, + { + "epoch": 0.4432, + "grad_norm": 0.40189423114516165, + "learning_rate": 0.000123117632211497, + "loss": 0.7671, + "step": 554 + }, + { + "epoch": 0.444, + "grad_norm": 0.44260966954087205, + "learning_rate": 0.00012286536895867654, + "loss": 0.7555, + "step": 555 + }, + { + "epoch": 0.4448, + "grad_norm": 0.4121804448249583, + "learning_rate": 0.00012261295207711346, + "loss": 0.7634, + "step": 556 + }, + { + "epoch": 0.4456, + "grad_norm": 0.4147017931293983, + "learning_rate": 0.00012236038326275626, + "loss": 0.7624, + "step": 557 + }, + { + "epoch": 0.4464, + "grad_norm": 0.3966852226772586, + "learning_rate": 0.0001221076642125742, + "loss": 0.7028, + "step": 558 + }, + { + "epoch": 0.4472, + "grad_norm": 0.40152430826623603, + "learning_rate": 0.00012185479662454595, + "loss": 0.6969, + "step": 559 + }, + { + "epoch": 0.448, + "grad_norm": 0.45074530522342926, + "learning_rate": 0.00012160178219764837, + "loss": 0.7938, + "step": 560 + }, + { + "epoch": 0.4488, + "grad_norm": 0.4388723825809723, + "learning_rate": 0.00012134862263184467, + "loss": 0.8302, + "step": 561 + }, + { + "epoch": 0.4496, + "grad_norm": 0.40997854160175323, + "learning_rate": 0.00012109531962807332, + "loss": 0.8194, + "step": 562 + }, + { + "epoch": 0.4504, + "grad_norm": 0.37438786055649476, + "learning_rate": 0.00012084187488823657, + "loss": 0.7173, + "step": 563 + }, + { + "epoch": 0.4512, + "grad_norm": 0.3972270190292053, + "learning_rate": 0.00012058829011518896, + "loss": 0.7061, + "step": 564 + }, + { + "epoch": 0.452, + "grad_norm": 0.3795007296042973, + "learning_rate": 0.00012033456701272576, + "loss": 0.6833, + "step": 565 + }, + { + "epoch": 0.4528, + "grad_norm": 0.39273732706955466, + "learning_rate": 0.00012008070728557186, + "loss": 0.6964, + "step": 566 + }, + { + "epoch": 0.4536, + "grad_norm": 0.39557876005056253, + "learning_rate": 0.00011982671263936995, + "loss": 0.7345, + "step": 567 + }, + { + "epoch": 0.4544, + "grad_norm": 0.37767135940417723, + "learning_rate": 0.00011957258478066931, + "loss": 0.6585, + "step": 568 + }, + { + "epoch": 0.4552, + "grad_norm": 0.38797574363602066, + "learning_rate": 0.00011931832541691418, + "loss": 0.7205, + "step": 569 + }, + { + "epoch": 0.456, + "grad_norm": 0.3630726329629002, + "learning_rate": 0.00011906393625643244, + "loss": 0.677, + "step": 570 + }, + { + "epoch": 0.4568, + "grad_norm": 0.4387536553028977, + "learning_rate": 0.00011880941900842397, + "loss": 0.7619, + "step": 571 + }, + { + "epoch": 0.4576, + "grad_norm": 0.38464703104051634, + "learning_rate": 0.00011855477538294935, + "loss": 0.7789, + "step": 572 + }, + { + "epoch": 0.4584, + "grad_norm": 0.39167937019149635, + "learning_rate": 0.00011830000709091815, + "loss": 0.6815, + "step": 573 + }, + { + "epoch": 0.4592, + "grad_norm": 0.3977451991379984, + "learning_rate": 0.00011804511584407763, + "loss": 0.6787, + "step": 574 + }, + { + "epoch": 0.46, + "grad_norm": 0.3931384610009728, + "learning_rate": 0.0001177901033550012, + "loss": 0.7128, + "step": 575 + }, + { + "epoch": 0.4608, + "grad_norm": 0.4069419597097396, + "learning_rate": 0.00011753497133707679, + "loss": 0.7026, + "step": 576 + }, + { + "epoch": 0.4616, + "grad_norm": 0.39761087525955485, + "learning_rate": 0.00011727972150449544, + "loss": 0.7373, + "step": 577 + }, + { + "epoch": 0.4624, + "grad_norm": 0.3946874053496708, + "learning_rate": 0.00011702435557223987, + "loss": 0.6923, + "step": 578 + }, + { + "epoch": 0.4632, + "grad_norm": 0.38192717118076763, + "learning_rate": 0.00011676887525607271, + "loss": 0.7299, + "step": 579 + }, + { + "epoch": 0.464, + "grad_norm": 0.41838798523777854, + "learning_rate": 0.00011651328227252517, + "loss": 0.7477, + "step": 580 + }, + { + "epoch": 0.4648, + "grad_norm": 0.3694964924391603, + "learning_rate": 0.00011625757833888551, + "loss": 0.6517, + "step": 581 + }, + { + "epoch": 0.4656, + "grad_norm": 0.3969000692318527, + "learning_rate": 0.00011600176517318741, + "loss": 0.6973, + "step": 582 + }, + { + "epoch": 0.4664, + "grad_norm": 0.37636076226194104, + "learning_rate": 0.0001157458444941984, + "loss": 0.7195, + "step": 583 + }, + { + "epoch": 0.4672, + "grad_norm": 0.43492259485161183, + "learning_rate": 0.00011548981802140848, + "loss": 0.7638, + "step": 584 + }, + { + "epoch": 0.468, + "grad_norm": 0.4126453745393548, + "learning_rate": 0.00011523368747501839, + "loss": 0.7832, + "step": 585 + }, + { + "epoch": 0.4688, + "grad_norm": 0.4108148514985448, + "learning_rate": 0.00011497745457592816, + "loss": 0.721, + "step": 586 + }, + { + "epoch": 0.4696, + "grad_norm": 0.5666162307393274, + "learning_rate": 0.00011472112104572547, + "loss": 0.8376, + "step": 587 + }, + { + "epoch": 0.4704, + "grad_norm": 0.3817260351961357, + "learning_rate": 0.00011446468860667421, + "loss": 0.6471, + "step": 588 + }, + { + "epoch": 0.4712, + "grad_norm": 0.43432725967521973, + "learning_rate": 0.0001142081589817027, + "loss": 0.7645, + "step": 589 + }, + { + "epoch": 0.472, + "grad_norm": 0.427447555440724, + "learning_rate": 0.00011395153389439233, + "loss": 0.7164, + "step": 590 + }, + { + "epoch": 0.4728, + "grad_norm": 0.34959973158873514, + "learning_rate": 0.00011369481506896582, + "loss": 0.6934, + "step": 591 + }, + { + "epoch": 0.4736, + "grad_norm": 0.4375141744900696, + "learning_rate": 0.00011343800423027582, + "loss": 0.6958, + "step": 592 + }, + { + "epoch": 0.4744, + "grad_norm": 0.41045570654286445, + "learning_rate": 0.00011318110310379301, + "loss": 0.7066, + "step": 593 + }, + { + "epoch": 0.4752, + "grad_norm": 0.43400068046834894, + "learning_rate": 0.0001129241134155949, + "loss": 0.748, + "step": 594 + }, + { + "epoch": 0.476, + "grad_norm": 0.3965691779769979, + "learning_rate": 0.00011266703689235394, + "loss": 0.6862, + "step": 595 + }, + { + "epoch": 0.4768, + "grad_norm": 0.42202372939300253, + "learning_rate": 0.00011240987526132594, + "loss": 0.7473, + "step": 596 + }, + { + "epoch": 0.4776, + "grad_norm": 0.38843352566967987, + "learning_rate": 0.00011215263025033869, + "loss": 0.6793, + "step": 597 + }, + { + "epoch": 0.4784, + "grad_norm": 0.40712476358977573, + "learning_rate": 0.00011189530358778005, + "loss": 0.7815, + "step": 598 + }, + { + "epoch": 0.4792, + "grad_norm": 0.45280542740712704, + "learning_rate": 0.00011163789700258655, + "loss": 0.8345, + "step": 599 + }, + { + "epoch": 0.48, + "grad_norm": 0.3988135723412154, + "learning_rate": 0.00011138041222423177, + "loss": 0.6403, + "step": 600 + }, + { + "epoch": 0.4808, + "grad_norm": 0.4104719902931324, + "learning_rate": 0.00011112285098271451, + "loss": 0.8083, + "step": 601 + }, + { + "epoch": 0.4816, + "grad_norm": 0.43497733154663787, + "learning_rate": 0.00011086521500854745, + "loss": 0.786, + "step": 602 + }, + { + "epoch": 0.4824, + "grad_norm": 0.4282110309376388, + "learning_rate": 0.00011060750603274535, + "loss": 0.8164, + "step": 603 + }, + { + "epoch": 0.4832, + "grad_norm": 0.3805016494025352, + "learning_rate": 0.00011034972578681338, + "loss": 0.7215, + "step": 604 + }, + { + "epoch": 0.484, + "grad_norm": 0.3776584079691621, + "learning_rate": 0.00011009187600273566, + "loss": 0.7185, + "step": 605 + }, + { + "epoch": 0.4848, + "grad_norm": 0.36549179775783314, + "learning_rate": 0.00010983395841296348, + "loss": 0.6423, + "step": 606 + }, + { + "epoch": 0.4856, + "grad_norm": 0.42155805120324596, + "learning_rate": 0.00010957597475040373, + "loss": 0.7358, + "step": 607 + }, + { + "epoch": 0.4864, + "grad_norm": 0.3797360196402241, + "learning_rate": 0.00010931792674840718, + "loss": 0.7047, + "step": 608 + }, + { + "epoch": 0.4872, + "grad_norm": 0.40773799010081985, + "learning_rate": 0.00010905981614075693, + "loss": 0.6931, + "step": 609 + }, + { + "epoch": 0.488, + "grad_norm": 0.3610044222301794, + "learning_rate": 0.00010880164466165674, + "loss": 0.6668, + "step": 610 + }, + { + "epoch": 0.4888, + "grad_norm": 0.4399774369868542, + "learning_rate": 0.00010854341404571928, + "loss": 0.7299, + "step": 611 + }, + { + "epoch": 0.4896, + "grad_norm": 0.3795477589972217, + "learning_rate": 0.00010828512602795462, + "loss": 0.6782, + "step": 612 + }, + { + "epoch": 0.4904, + "grad_norm": 0.3976887802313931, + "learning_rate": 0.00010802678234375851, + "loss": 0.6831, + "step": 613 + }, + { + "epoch": 0.4912, + "grad_norm": 0.4646173856321158, + "learning_rate": 0.00010776838472890065, + "loss": 0.7669, + "step": 614 + }, + { + "epoch": 0.492, + "grad_norm": 0.3779829407750109, + "learning_rate": 0.0001075099349195131, + "loss": 0.7257, + "step": 615 + }, + { + "epoch": 0.4928, + "grad_norm": 0.39575116618933087, + "learning_rate": 0.00010725143465207867, + "loss": 0.6947, + "step": 616 + }, + { + "epoch": 0.4936, + "grad_norm": 0.39042799949121, + "learning_rate": 0.00010699288566341914, + "loss": 0.6909, + "step": 617 + }, + { + "epoch": 0.4944, + "grad_norm": 0.40122823176564265, + "learning_rate": 0.00010673428969068364, + "loss": 0.72, + "step": 618 + }, + { + "epoch": 0.4952, + "grad_norm": 0.4063853232863709, + "learning_rate": 0.000106475648471337, + "loss": 0.7005, + "step": 619 + }, + { + "epoch": 0.496, + "grad_norm": 0.3620850889881979, + "learning_rate": 0.00010621696374314807, + "loss": 0.7287, + "step": 620 + }, + { + "epoch": 0.4968, + "grad_norm": 0.4602973221668992, + "learning_rate": 0.00010595823724417795, + "loss": 0.7608, + "step": 621 + }, + { + "epoch": 0.4976, + "grad_norm": 0.34145601526624864, + "learning_rate": 0.00010569947071276847, + "loss": 0.6831, + "step": 622 + }, + { + "epoch": 0.4984, + "grad_norm": 0.40756445625679594, + "learning_rate": 0.00010544066588753044, + "loss": 0.7884, + "step": 623 + }, + { + "epoch": 0.4992, + "grad_norm": 0.4071185750181592, + "learning_rate": 0.00010518182450733186, + "loss": 0.7479, + "step": 624 + }, + { + "epoch": 0.5, + "grad_norm": 0.3645809044587315, + "learning_rate": 0.00010492294831128641, + "loss": 0.6517, + "step": 625 + }, + { + "epoch": 0.5008, + "grad_norm": 0.39643622655244104, + "learning_rate": 0.00010466403903874176, + "loss": 0.7431, + "step": 626 + }, + { + "epoch": 0.5016, + "grad_norm": 0.41844749866098513, + "learning_rate": 0.00010440509842926767, + "loss": 0.7584, + "step": 627 + }, + { + "epoch": 0.5024, + "grad_norm": 0.4490939032967661, + "learning_rate": 0.00010414612822264455, + "loss": 0.776, + "step": 628 + }, + { + "epoch": 0.5032, + "grad_norm": 0.41286117568180564, + "learning_rate": 0.00010388713015885161, + "loss": 0.7442, + "step": 629 + }, + { + "epoch": 0.504, + "grad_norm": 0.394910202672662, + "learning_rate": 0.00010362810597805526, + "loss": 0.7218, + "step": 630 + }, + { + "epoch": 0.5048, + "grad_norm": 0.34524918277952554, + "learning_rate": 0.00010336905742059742, + "loss": 0.6657, + "step": 631 + }, + { + "epoch": 0.5056, + "grad_norm": 0.4013001926497867, + "learning_rate": 0.0001031099862269837, + "loss": 0.7162, + "step": 632 + }, + { + "epoch": 0.5064, + "grad_norm": 0.3626432188482639, + "learning_rate": 0.0001028508941378719, + "loss": 0.6827, + "step": 633 + }, + { + "epoch": 0.5072, + "grad_norm": 0.40406396202226275, + "learning_rate": 0.00010259178289406011, + "loss": 0.7414, + "step": 634 + }, + { + "epoch": 0.508, + "grad_norm": 0.35123972431962647, + "learning_rate": 0.00010233265423647523, + "loss": 0.6682, + "step": 635 + }, + { + "epoch": 0.5088, + "grad_norm": 0.42085291283779447, + "learning_rate": 0.00010207350990616107, + "loss": 0.7348, + "step": 636 + }, + { + "epoch": 0.5096, + "grad_norm": 0.3919260383650782, + "learning_rate": 0.00010181435164426676, + "loss": 0.6383, + "step": 637 + }, + { + "epoch": 0.5104, + "grad_norm": 0.3846362087718709, + "learning_rate": 0.0001015551811920351, + "loss": 0.7089, + "step": 638 + }, + { + "epoch": 0.5112, + "grad_norm": 0.39416896980117394, + "learning_rate": 0.00010129600029079072, + "loss": 0.7382, + "step": 639 + }, + { + "epoch": 0.512, + "grad_norm": 0.43852824981762756, + "learning_rate": 0.00010103681068192845, + "loss": 0.7824, + "step": 640 + }, + { + "epoch": 0.5128, + "grad_norm": 0.4371752703502908, + "learning_rate": 0.00010077761410690172, + "loss": 0.7316, + "step": 641 + }, + { + "epoch": 0.5136, + "grad_norm": 0.3680561270793559, + "learning_rate": 0.00010051841230721065, + "loss": 0.6562, + "step": 642 + }, + { + "epoch": 0.5144, + "grad_norm": 0.413711406358694, + "learning_rate": 0.00010025920702439051, + "loss": 0.7163, + "step": 643 + }, + { + "epoch": 0.5152, + "grad_norm": 0.3930940635380676, + "learning_rate": 0.0001, + "loss": 0.7017, + "step": 644 + }, + { + "epoch": 0.516, + "grad_norm": 0.4046241071819806, + "learning_rate": 9.97407929756095e-05, + "loss": 0.7353, + "step": 645 + }, + { + "epoch": 0.5168, + "grad_norm": 0.36892581761882737, + "learning_rate": 9.948158769278939e-05, + "loss": 0.7146, + "step": 646 + }, + { + "epoch": 0.5176, + "grad_norm": 0.35157467797738684, + "learning_rate": 9.92223858930983e-05, + "loss": 0.6643, + "step": 647 + }, + { + "epoch": 0.5184, + "grad_norm": 0.3708634742026403, + "learning_rate": 9.896318931807155e-05, + "loss": 0.696, + "step": 648 + }, + { + "epoch": 0.5192, + "grad_norm": 0.3550674722920594, + "learning_rate": 9.870399970920932e-05, + "loss": 0.67, + "step": 649 + }, + { + "epoch": 0.52, + "grad_norm": 0.3775707423902258, + "learning_rate": 9.844481880796491e-05, + "loss": 0.6261, + "step": 650 + }, + { + "epoch": 0.5208, + "grad_norm": 0.38402987317749526, + "learning_rate": 9.818564835573323e-05, + "loss": 0.695, + "step": 651 + }, + { + "epoch": 0.5216, + "grad_norm": 0.34315102173734846, + "learning_rate": 9.792649009383899e-05, + "loss": 0.652, + "step": 652 + }, + { + "epoch": 0.5224, + "grad_norm": 0.42916661417714574, + "learning_rate": 9.766734576352478e-05, + "loss": 0.7455, + "step": 653 + }, + { + "epoch": 0.5232, + "grad_norm": 0.4010076974964933, + "learning_rate": 9.740821710593989e-05, + "loss": 0.7155, + "step": 654 + }, + { + "epoch": 0.524, + "grad_norm": 0.36917368065934286, + "learning_rate": 9.714910586212816e-05, + "loss": 0.6521, + "step": 655 + }, + { + "epoch": 0.5248, + "grad_norm": 0.3991390058129583, + "learning_rate": 9.689001377301633e-05, + "loss": 0.7011, + "step": 656 + }, + { + "epoch": 0.5256, + "grad_norm": 0.39216997183273594, + "learning_rate": 9.663094257940258e-05, + "loss": 0.7065, + "step": 657 + }, + { + "epoch": 0.5264, + "grad_norm": 0.442582130429027, + "learning_rate": 9.637189402194476e-05, + "loss": 0.6977, + "step": 658 + }, + { + "epoch": 0.5272, + "grad_norm": 0.3999220733985173, + "learning_rate": 9.611286984114841e-05, + "loss": 0.7035, + "step": 659 + }, + { + "epoch": 0.528, + "grad_norm": 0.4335486544309202, + "learning_rate": 9.585387177735547e-05, + "loss": 0.687, + "step": 660 + }, + { + "epoch": 0.5288, + "grad_norm": 0.42951244215856904, + "learning_rate": 9.559490157073236e-05, + "loss": 0.7494, + "step": 661 + }, + { + "epoch": 0.5296, + "grad_norm": 0.3916325898469945, + "learning_rate": 9.533596096125825e-05, + "loss": 0.6748, + "step": 662 + }, + { + "epoch": 0.5304, + "grad_norm": 0.39839736695567196, + "learning_rate": 9.507705168871358e-05, + "loss": 0.7084, + "step": 663 + }, + { + "epoch": 0.5312, + "grad_norm": 0.3602102298252946, + "learning_rate": 9.481817549266817e-05, + "loss": 0.6287, + "step": 664 + }, + { + "epoch": 0.532, + "grad_norm": 0.39360077695176615, + "learning_rate": 9.455933411246958e-05, + "loss": 0.6908, + "step": 665 + }, + { + "epoch": 0.5328, + "grad_norm": 0.3813408459588176, + "learning_rate": 9.430052928723153e-05, + "loss": 0.6999, + "step": 666 + }, + { + "epoch": 0.5336, + "grad_norm": 0.3824402547979255, + "learning_rate": 9.404176275582208e-05, + "loss": 0.6541, + "step": 667 + }, + { + "epoch": 0.5344, + "grad_norm": 0.5083614991019088, + "learning_rate": 9.378303625685195e-05, + "loss": 0.855, + "step": 668 + }, + { + "epoch": 0.5352, + "grad_norm": 0.445603718596326, + "learning_rate": 9.352435152866298e-05, + "loss": 0.7599, + "step": 669 + }, + { + "epoch": 0.536, + "grad_norm": 0.47146828085890363, + "learning_rate": 9.326571030931637e-05, + "loss": 0.8023, + "step": 670 + }, + { + "epoch": 0.5368, + "grad_norm": 0.43408367978037793, + "learning_rate": 9.300711433658087e-05, + "loss": 0.7014, + "step": 671 + }, + { + "epoch": 0.5376, + "grad_norm": 0.425448138566533, + "learning_rate": 9.274856534792138e-05, + "loss": 0.7164, + "step": 672 + }, + { + "epoch": 0.5384, + "grad_norm": 0.39227384599061355, + "learning_rate": 9.249006508048694e-05, + "loss": 0.6953, + "step": 673 + }, + { + "epoch": 0.5392, + "grad_norm": 0.38518524255604936, + "learning_rate": 9.223161527109937e-05, + "loss": 0.7348, + "step": 674 + }, + { + "epoch": 0.54, + "grad_norm": 0.3780220958648424, + "learning_rate": 9.197321765624152e-05, + "loss": 0.6585, + "step": 675 + }, + { + "epoch": 0.5408, + "grad_norm": 0.37491163307793673, + "learning_rate": 9.171487397204539e-05, + "loss": 0.7239, + "step": 676 + }, + { + "epoch": 0.5416, + "grad_norm": 0.4297851476523369, + "learning_rate": 9.145658595428074e-05, + "loss": 0.7229, + "step": 677 + }, + { + "epoch": 0.5424, + "grad_norm": 0.4270389158180757, + "learning_rate": 9.119835533834331e-05, + "loss": 0.7099, + "step": 678 + }, + { + "epoch": 0.5432, + "grad_norm": 0.44843851052946626, + "learning_rate": 9.09401838592431e-05, + "loss": 0.7708, + "step": 679 + }, + { + "epoch": 0.544, + "grad_norm": 0.39071195168014267, + "learning_rate": 9.068207325159284e-05, + "loss": 0.7267, + "step": 680 + }, + { + "epoch": 0.5448, + "grad_norm": 0.3686899418824359, + "learning_rate": 9.04240252495963e-05, + "loss": 0.7128, + "step": 681 + }, + { + "epoch": 0.5456, + "grad_norm": 0.484195844823364, + "learning_rate": 9.016604158703654e-05, + "loss": 0.6819, + "step": 682 + }, + { + "epoch": 0.5464, + "grad_norm": 0.3832259944118132, + "learning_rate": 8.990812399726435e-05, + "loss": 0.6828, + "step": 683 + }, + { + "epoch": 0.5472, + "grad_norm": 0.41458472472208135, + "learning_rate": 8.965027421318665e-05, + "loss": 0.7577, + "step": 684 + }, + { + "epoch": 0.548, + "grad_norm": 0.3617464827857215, + "learning_rate": 8.939249396725467e-05, + "loss": 0.7103, + "step": 685 + }, + { + "epoch": 0.5488, + "grad_norm": 0.4058552873419775, + "learning_rate": 8.913478499145254e-05, + "loss": 0.7412, + "step": 686 + }, + { + "epoch": 0.5496, + "grad_norm": 0.41965368293910055, + "learning_rate": 8.887714901728551e-05, + "loss": 0.7487, + "step": 687 + }, + { + "epoch": 0.5504, + "grad_norm": 0.44970944641672744, + "learning_rate": 8.861958777576827e-05, + "loss": 0.778, + "step": 688 + }, + { + "epoch": 0.5512, + "grad_norm": 0.3545719381566445, + "learning_rate": 8.836210299741346e-05, + "loss": 0.6984, + "step": 689 + }, + { + "epoch": 0.552, + "grad_norm": 0.3790480088341582, + "learning_rate": 8.810469641222001e-05, + "loss": 0.6866, + "step": 690 + }, + { + "epoch": 0.5528, + "grad_norm": 0.3852953689419974, + "learning_rate": 8.784736974966135e-05, + "loss": 0.7205, + "step": 691 + }, + { + "epoch": 0.5536, + "grad_norm": 0.36277085602796183, + "learning_rate": 8.759012473867407e-05, + "loss": 0.6941, + "step": 692 + }, + { + "epoch": 0.5544, + "grad_norm": 0.414706108808844, + "learning_rate": 8.733296310764611e-05, + "loss": 0.7056, + "step": 693 + }, + { + "epoch": 0.5552, + "grad_norm": 0.4476749836974746, + "learning_rate": 8.707588658440511e-05, + "loss": 0.7337, + "step": 694 + }, + { + "epoch": 0.556, + "grad_norm": 0.40156586271414774, + "learning_rate": 8.6818896896207e-05, + "loss": 0.7279, + "step": 695 + }, + { + "epoch": 0.5568, + "grad_norm": 0.36420512506545966, + "learning_rate": 8.656199576972423e-05, + "loss": 0.7013, + "step": 696 + }, + { + "epoch": 0.5576, + "grad_norm": 0.4016790412811444, + "learning_rate": 8.63051849310342e-05, + "loss": 0.7029, + "step": 697 + }, + { + "epoch": 0.5584, + "grad_norm": 0.4071846828907595, + "learning_rate": 8.604846610560771e-05, + "loss": 0.7902, + "step": 698 + }, + { + "epoch": 0.5592, + "grad_norm": 0.4193000771027037, + "learning_rate": 8.579184101829734e-05, + "loss": 0.7654, + "step": 699 + }, + { + "epoch": 0.56, + "grad_norm": 0.3475117889941879, + "learning_rate": 8.553531139332582e-05, + "loss": 0.6732, + "step": 700 + }, + { + "epoch": 0.5608, + "grad_norm": 0.35458055124710486, + "learning_rate": 8.527887895427454e-05, + "loss": 0.6897, + "step": 701 + }, + { + "epoch": 0.5616, + "grad_norm": 0.38841476335998537, + "learning_rate": 8.502254542407186e-05, + "loss": 0.7214, + "step": 702 + }, + { + "epoch": 0.5624, + "grad_norm": 0.45433014014250656, + "learning_rate": 8.476631252498162e-05, + "loss": 0.7998, + "step": 703 + }, + { + "epoch": 0.5632, + "grad_norm": 0.3586743870703551, + "learning_rate": 8.451018197859153e-05, + "loss": 0.6571, + "step": 704 + }, + { + "epoch": 0.564, + "grad_norm": 0.35312943544102154, + "learning_rate": 8.425415550580162e-05, + "loss": 0.6652, + "step": 705 + }, + { + "epoch": 0.5648, + "grad_norm": 0.39348577997621986, + "learning_rate": 8.399823482681262e-05, + "loss": 0.701, + "step": 706 + }, + { + "epoch": 0.5656, + "grad_norm": 0.3794069943977316, + "learning_rate": 8.374242166111448e-05, + "loss": 0.6863, + "step": 707 + }, + { + "epoch": 0.5664, + "grad_norm": 0.40509681006948783, + "learning_rate": 8.348671772747487e-05, + "loss": 0.6918, + "step": 708 + }, + { + "epoch": 0.5672, + "grad_norm": 0.4064801302224138, + "learning_rate": 8.323112474392731e-05, + "loss": 0.709, + "step": 709 + }, + { + "epoch": 0.568, + "grad_norm": 0.37515881106172344, + "learning_rate": 8.297564442776014e-05, + "loss": 0.7065, + "step": 710 + }, + { + "epoch": 0.5688, + "grad_norm": 0.42111045551957627, + "learning_rate": 8.272027849550457e-05, + "loss": 0.6952, + "step": 711 + }, + { + "epoch": 0.5696, + "grad_norm": 0.47257730917513174, + "learning_rate": 8.246502866292324e-05, + "loss": 0.7128, + "step": 712 + }, + { + "epoch": 0.5704, + "grad_norm": 0.5796908578294863, + "learning_rate": 8.220989664499878e-05, + "loss": 0.751, + "step": 713 + }, + { + "epoch": 0.5712, + "grad_norm": 0.3862793085569805, + "learning_rate": 8.195488415592238e-05, + "loss": 0.7303, + "step": 714 + }, + { + "epoch": 0.572, + "grad_norm": 0.37681659135460804, + "learning_rate": 8.169999290908188e-05, + "loss": 0.6707, + "step": 715 + }, + { + "epoch": 0.5728, + "grad_norm": 0.38351037595009896, + "learning_rate": 8.144522461705067e-05, + "loss": 0.6837, + "step": 716 + }, + { + "epoch": 0.5736, + "grad_norm": 0.41570489940380556, + "learning_rate": 8.119058099157604e-05, + "loss": 0.6992, + "step": 717 + }, + { + "epoch": 0.5744, + "grad_norm": 0.4070342608417595, + "learning_rate": 8.093606374356759e-05, + "loss": 0.7116, + "step": 718 + }, + { + "epoch": 0.5752, + "grad_norm": 0.37017913236409716, + "learning_rate": 8.068167458308582e-05, + "loss": 0.6707, + "step": 719 + }, + { + "epoch": 0.576, + "grad_norm": 0.3715292483742941, + "learning_rate": 8.042741521933071e-05, + "loss": 0.6981, + "step": 720 + }, + { + "epoch": 0.5768, + "grad_norm": 0.38306244022912933, + "learning_rate": 8.017328736063006e-05, + "loss": 0.637, + "step": 721 + }, + { + "epoch": 0.5776, + "grad_norm": 0.3518606534199549, + "learning_rate": 7.991929271442817e-05, + "loss": 0.6622, + "step": 722 + }, + { + "epoch": 0.5784, + "grad_norm": 0.3999015657045575, + "learning_rate": 7.966543298727425e-05, + "loss": 0.7308, + "step": 723 + }, + { + "epoch": 0.5792, + "grad_norm": 0.4110426055818609, + "learning_rate": 7.941170988481108e-05, + "loss": 0.6783, + "step": 724 + }, + { + "epoch": 0.58, + "grad_norm": 0.39756580345933124, + "learning_rate": 7.915812511176347e-05, + "loss": 0.7405, + "step": 725 + }, + { + "epoch": 0.5808, + "grad_norm": 0.36916620841185016, + "learning_rate": 7.89046803719267e-05, + "loss": 0.609, + "step": 726 + }, + { + "epoch": 0.5816, + "grad_norm": 0.34784605132280716, + "learning_rate": 7.865137736815535e-05, + "loss": 0.6209, + "step": 727 + }, + { + "epoch": 0.5824, + "grad_norm": 0.40278232284216864, + "learning_rate": 7.839821780235168e-05, + "loss": 0.728, + "step": 728 + }, + { + "epoch": 0.5832, + "grad_norm": 0.3812820287572344, + "learning_rate": 7.814520337545406e-05, + "loss": 0.7133, + "step": 729 + }, + { + "epoch": 0.584, + "grad_norm": 0.3620196046589468, + "learning_rate": 7.789233578742582e-05, + "loss": 0.6592, + "step": 730 + }, + { + "epoch": 0.5848, + "grad_norm": 0.4142686294295778, + "learning_rate": 7.763961673724379e-05, + "loss": 0.7162, + "step": 731 + }, + { + "epoch": 0.5856, + "grad_norm": 0.38944524715253764, + "learning_rate": 7.738704792288655e-05, + "loss": 0.6986, + "step": 732 + }, + { + "epoch": 0.5864, + "grad_norm": 0.3598739842145071, + "learning_rate": 7.713463104132345e-05, + "loss": 0.6821, + "step": 733 + }, + { + "epoch": 0.5872, + "grad_norm": 0.40094634739098434, + "learning_rate": 7.688236778850306e-05, + "loss": 0.67, + "step": 734 + }, + { + "epoch": 0.588, + "grad_norm": 0.41535343948306597, + "learning_rate": 7.663025985934158e-05, + "loss": 0.7362, + "step": 735 + }, + { + "epoch": 0.5888, + "grad_norm": 0.41051936650403853, + "learning_rate": 7.637830894771175e-05, + "loss": 0.6998, + "step": 736 + }, + { + "epoch": 0.5896, + "grad_norm": 0.4414551091260474, + "learning_rate": 7.61265167464313e-05, + "loss": 0.8053, + "step": 737 + }, + { + "epoch": 0.5904, + "grad_norm": 0.37054674845555746, + "learning_rate": 7.587488494725157e-05, + "loss": 0.662, + "step": 738 + }, + { + "epoch": 0.5912, + "grad_norm": 0.3803653850905569, + "learning_rate": 7.562341524084623e-05, + "loss": 0.6725, + "step": 739 + }, + { + "epoch": 0.592, + "grad_norm": 0.4279603552936147, + "learning_rate": 7.537210931679987e-05, + "loss": 0.7433, + "step": 740 + }, + { + "epoch": 0.5928, + "grad_norm": 0.39183934094472184, + "learning_rate": 7.512096886359664e-05, + "loss": 0.6898, + "step": 741 + }, + { + "epoch": 0.5936, + "grad_norm": 0.44355466321467885, + "learning_rate": 7.48699955686089e-05, + "loss": 0.6637, + "step": 742 + }, + { + "epoch": 0.5944, + "grad_norm": 0.37384479459881104, + "learning_rate": 7.461919111808595e-05, + "loss": 0.7046, + "step": 743 + }, + { + "epoch": 0.5952, + "grad_norm": 0.42085403164939966, + "learning_rate": 7.43685571971426e-05, + "loss": 0.772, + "step": 744 + }, + { + "epoch": 0.596, + "grad_norm": 0.3647244240418084, + "learning_rate": 7.411809548974792e-05, + "loss": 0.6847, + "step": 745 + }, + { + "epoch": 0.5968, + "grad_norm": 0.37293835326603875, + "learning_rate": 7.386780767871397e-05, + "loss": 0.6968, + "step": 746 + }, + { + "epoch": 0.5976, + "grad_norm": 0.4089421010509937, + "learning_rate": 7.361769544568425e-05, + "loss": 0.7072, + "step": 747 + }, + { + "epoch": 0.5984, + "grad_norm": 0.4180382473394056, + "learning_rate": 7.336776047112276e-05, + "loss": 0.7906, + "step": 748 + }, + { + "epoch": 0.5992, + "grad_norm": 0.3979438751592118, + "learning_rate": 7.311800443430251e-05, + "loss": 0.7198, + "step": 749 + }, + { + "epoch": 0.6, + "grad_norm": 0.39928123637778606, + "learning_rate": 7.286842901329412e-05, + "loss": 0.7031, + "step": 750 + }, + { + "epoch": 0.6008, + "grad_norm": 0.3674395446163755, + "learning_rate": 7.26190358849548e-05, + "loss": 0.6569, + "step": 751 + }, + { + "epoch": 0.6016, + "grad_norm": 0.3859486385750213, + "learning_rate": 7.236982672491698e-05, + "loss": 0.7024, + "step": 752 + }, + { + "epoch": 0.6024, + "grad_norm": 0.4276363677168132, + "learning_rate": 7.212080320757695e-05, + "loss": 0.7025, + "step": 753 + }, + { + "epoch": 0.6032, + "grad_norm": 0.4880835991048266, + "learning_rate": 7.187196700608373e-05, + "loss": 0.7482, + "step": 754 + }, + { + "epoch": 0.604, + "grad_norm": 0.36705791881157884, + "learning_rate": 7.162331979232783e-05, + "loss": 0.7005, + "step": 755 + }, + { + "epoch": 0.6048, + "grad_norm": 0.43099468027721016, + "learning_rate": 7.137486323692995e-05, + "loss": 0.7631, + "step": 756 + }, + { + "epoch": 0.6056, + "grad_norm": 0.36758250425564015, + "learning_rate": 7.112659900922976e-05, + "loss": 0.6947, + "step": 757 + }, + { + "epoch": 0.6064, + "grad_norm": 0.36867475791558263, + "learning_rate": 7.087852877727481e-05, + "loss": 0.694, + "step": 758 + }, + { + "epoch": 0.6072, + "grad_norm": 0.36077953534345314, + "learning_rate": 7.06306542078091e-05, + "loss": 0.6616, + "step": 759 + }, + { + "epoch": 0.608, + "grad_norm": 0.49265019267157223, + "learning_rate": 7.038297696626206e-05, + "loss": 0.7989, + "step": 760 + }, + { + "epoch": 0.6088, + "grad_norm": 0.357291492711498, + "learning_rate": 7.013549871673736e-05, + "loss": 0.6781, + "step": 761 + }, + { + "epoch": 0.6096, + "grad_norm": 0.43128032811348194, + "learning_rate": 6.988822112200156e-05, + "loss": 0.7786, + "step": 762 + }, + { + "epoch": 0.6104, + "grad_norm": 0.4351649930566372, + "learning_rate": 6.964114584347316e-05, + "loss": 0.7996, + "step": 763 + }, + { + "epoch": 0.6112, + "grad_norm": 0.3854285270860244, + "learning_rate": 6.939427454121128e-05, + "loss": 0.6873, + "step": 764 + }, + { + "epoch": 0.612, + "grad_norm": 0.4222448806722589, + "learning_rate": 6.914760887390452e-05, + "loss": 0.7686, + "step": 765 + }, + { + "epoch": 0.6128, + "grad_norm": 0.3437253460142556, + "learning_rate": 6.890115049885994e-05, + "loss": 0.6542, + "step": 766 + }, + { + "epoch": 0.6136, + "grad_norm": 0.3819110837227683, + "learning_rate": 6.865490107199181e-05, + "loss": 0.6832, + "step": 767 + }, + { + "epoch": 0.6144, + "grad_norm": 0.3574634348965136, + "learning_rate": 6.84088622478104e-05, + "loss": 0.6163, + "step": 768 + }, + { + "epoch": 0.6152, + "grad_norm": 0.3958268568472108, + "learning_rate": 6.816303567941112e-05, + "loss": 0.7076, + "step": 769 + }, + { + "epoch": 0.616, + "grad_norm": 0.3876078568257905, + "learning_rate": 6.791742301846326e-05, + "loss": 0.6768, + "step": 770 + }, + { + "epoch": 0.6168, + "grad_norm": 0.3543407877240521, + "learning_rate": 6.767202591519875e-05, + "loss": 0.6646, + "step": 771 + }, + { + "epoch": 0.6176, + "grad_norm": 0.5169624011043301, + "learning_rate": 6.742684601840141e-05, + "loss": 0.7584, + "step": 772 + }, + { + "epoch": 0.6184, + "grad_norm": 0.4080700210190037, + "learning_rate": 6.718188497539554e-05, + "loss": 0.6894, + "step": 773 + }, + { + "epoch": 0.6192, + "grad_norm": 0.39850725465254166, + "learning_rate": 6.693714443203507e-05, + "loss": 0.7047, + "step": 774 + }, + { + "epoch": 0.62, + "grad_norm": 0.3929053806516543, + "learning_rate": 6.669262603269246e-05, + "loss": 0.6746, + "step": 775 + }, + { + "epoch": 0.6208, + "grad_norm": 0.3855220091053093, + "learning_rate": 6.644833142024751e-05, + "loss": 0.6719, + "step": 776 + }, + { + "epoch": 0.6216, + "grad_norm": 0.372356925574433, + "learning_rate": 6.620426223607654e-05, + "loss": 0.6594, + "step": 777 + }, + { + "epoch": 0.6224, + "grad_norm": 0.38465464803819926, + "learning_rate": 6.59604201200412e-05, + "loss": 0.6652, + "step": 778 + }, + { + "epoch": 0.6232, + "grad_norm": 0.40130152781871936, + "learning_rate": 6.571680671047749e-05, + "loss": 0.6937, + "step": 779 + }, + { + "epoch": 0.624, + "grad_norm": 0.4351001554035714, + "learning_rate": 6.547342364418481e-05, + "loss": 0.7249, + "step": 780 + }, + { + "epoch": 0.6248, + "grad_norm": 0.4115902223631893, + "learning_rate": 6.523027255641493e-05, + "loss": 0.7151, + "step": 781 + }, + { + "epoch": 0.6256, + "grad_norm": 0.4121347859261653, + "learning_rate": 6.498735508086093e-05, + "loss": 0.702, + "step": 782 + }, + { + "epoch": 0.6264, + "grad_norm": 0.38667507587206656, + "learning_rate": 6.474467284964634e-05, + "loss": 0.678, + "step": 783 + }, + { + "epoch": 0.6272, + "grad_norm": 0.3844946676003662, + "learning_rate": 6.450222749331414e-05, + "loss": 0.6724, + "step": 784 + }, + { + "epoch": 0.628, + "grad_norm": 0.4415379812057678, + "learning_rate": 6.426002064081565e-05, + "loss": 0.7513, + "step": 785 + }, + { + "epoch": 0.6288, + "grad_norm": 0.39192207589977685, + "learning_rate": 6.40180539194999e-05, + "loss": 0.6651, + "step": 786 + }, + { + "epoch": 0.6296, + "grad_norm": 0.3389566163968938, + "learning_rate": 6.377632895510248e-05, + "loss": 0.6258, + "step": 787 + }, + { + "epoch": 0.6304, + "grad_norm": 0.44741155883279, + "learning_rate": 6.35348473717345e-05, + "loss": 0.7784, + "step": 788 + }, + { + "epoch": 0.6312, + "grad_norm": 0.4052730316786015, + "learning_rate": 6.329361079187199e-05, + "loss": 0.7085, + "step": 789 + }, + { + "epoch": 0.632, + "grad_norm": 0.40419389102631725, + "learning_rate": 6.305262083634488e-05, + "loss": 0.7488, + "step": 790 + }, + { + "epoch": 0.6328, + "grad_norm": 0.3758590354481773, + "learning_rate": 6.281187912432587e-05, + "loss": 0.681, + "step": 791 + }, + { + "epoch": 0.6336, + "grad_norm": 0.33656533436835895, + "learning_rate": 6.25713872733199e-05, + "loss": 0.6377, + "step": 792 + }, + { + "epoch": 0.6344, + "grad_norm": 0.4248891720992723, + "learning_rate": 6.233114689915316e-05, + "loss": 0.7404, + "step": 793 + }, + { + "epoch": 0.6352, + "grad_norm": 0.5083490920128887, + "learning_rate": 6.209115961596208e-05, + "loss": 0.7262, + "step": 794 + }, + { + "epoch": 0.636, + "grad_norm": 0.4226612634135987, + "learning_rate": 6.18514270361827e-05, + "loss": 0.6931, + "step": 795 + }, + { + "epoch": 0.6368, + "grad_norm": 0.37781319738057323, + "learning_rate": 6.161195077053976e-05, + "loss": 0.6793, + "step": 796 + }, + { + "epoch": 0.6376, + "grad_norm": 0.36609240474136434, + "learning_rate": 6.13727324280358e-05, + "loss": 0.6842, + "step": 797 + }, + { + "epoch": 0.6384, + "grad_norm": 0.4510442700857087, + "learning_rate": 6.113377361594049e-05, + "loss": 0.6874, + "step": 798 + }, + { + "epoch": 0.6392, + "grad_norm": 0.410218508042047, + "learning_rate": 6.08950759397797e-05, + "loss": 0.6726, + "step": 799 + }, + { + "epoch": 0.64, + "grad_norm": 0.36784783267298815, + "learning_rate": 6.065664100332478e-05, + "loss": 0.6728, + "step": 800 + }, + { + "epoch": 0.6408, + "grad_norm": 0.3508863599433168, + "learning_rate": 6.0418470408581774e-05, + "loss": 0.6246, + "step": 801 + }, + { + "epoch": 0.6416, + "grad_norm": 0.3861635430903922, + "learning_rate": 6.018056575578075e-05, + "loss": 0.6916, + "step": 802 + }, + { + "epoch": 0.6424, + "grad_norm": 0.39761478038483955, + "learning_rate": 5.9942928643364724e-05, + "loss": 0.6865, + "step": 803 + }, + { + "epoch": 0.6432, + "grad_norm": 0.4174642749474727, + "learning_rate": 5.970556066797941e-05, + "loss": 0.6636, + "step": 804 + }, + { + "epoch": 0.644, + "grad_norm": 0.3970447301181033, + "learning_rate": 5.946846342446214e-05, + "loss": 0.6712, + "step": 805 + }, + { + "epoch": 0.6448, + "grad_norm": 0.4185172143582579, + "learning_rate": 5.923163850583113e-05, + "loss": 0.7518, + "step": 806 + }, + { + "epoch": 0.6456, + "grad_norm": 0.42893284870193243, + "learning_rate": 5.899508750327501e-05, + "loss": 0.7415, + "step": 807 + }, + { + "epoch": 0.6464, + "grad_norm": 0.34074490348733405, + "learning_rate": 5.875881200614207e-05, + "loss": 0.6421, + "step": 808 + }, + { + "epoch": 0.6472, + "grad_norm": 0.4185296047112624, + "learning_rate": 5.8522813601929324e-05, + "loss": 0.7498, + "step": 809 + }, + { + "epoch": 0.648, + "grad_norm": 0.37522628453525303, + "learning_rate": 5.828709387627218e-05, + "loss": 0.6293, + "step": 810 + }, + { + "epoch": 0.6488, + "grad_norm": 0.4732993099724249, + "learning_rate": 5.80516544129337e-05, + "loss": 0.7681, + "step": 811 + }, + { + "epoch": 0.6496, + "grad_norm": 0.45813852136378314, + "learning_rate": 5.781649679379378e-05, + "loss": 0.7215, + "step": 812 + }, + { + "epoch": 0.6504, + "grad_norm": 0.3951081586207811, + "learning_rate": 5.758162259883867e-05, + "loss": 0.7173, + "step": 813 + }, + { + "epoch": 0.6512, + "grad_norm": 0.3519802287916903, + "learning_rate": 5.73470334061505e-05, + "loss": 0.6131, + "step": 814 + }, + { + "epoch": 0.652, + "grad_norm": 0.37336416938345224, + "learning_rate": 5.7112730791896207e-05, + "loss": 0.6599, + "step": 815 + }, + { + "epoch": 0.6528, + "grad_norm": 0.34585353519014184, + "learning_rate": 5.687871633031754e-05, + "loss": 0.6665, + "step": 816 + }, + { + "epoch": 0.6536, + "grad_norm": 0.38433583340493543, + "learning_rate": 5.664499159372017e-05, + "loss": 0.7129, + "step": 817 + }, + { + "epoch": 0.6544, + "grad_norm": 0.36033709634244937, + "learning_rate": 5.6411558152462894e-05, + "loss": 0.6903, + "step": 818 + }, + { + "epoch": 0.6552, + "grad_norm": 0.35504478143908713, + "learning_rate": 5.617841757494762e-05, + "loss": 0.7094, + "step": 819 + }, + { + "epoch": 0.656, + "grad_norm": 0.40254939614920054, + "learning_rate": 5.5945571427608526e-05, + "loss": 0.705, + "step": 820 + }, + { + "epoch": 0.6568, + "grad_norm": 0.3819902029976929, + "learning_rate": 5.5713021274901335e-05, + "loss": 0.6737, + "step": 821 + }, + { + "epoch": 0.6576, + "grad_norm": 0.33563599565287516, + "learning_rate": 5.54807686792933e-05, + "loss": 0.5854, + "step": 822 + }, + { + "epoch": 0.6584, + "grad_norm": 0.38048738975216345, + "learning_rate": 5.524881520125229e-05, + "loss": 0.696, + "step": 823 + }, + { + "epoch": 0.6592, + "grad_norm": 0.39463923524519234, + "learning_rate": 5.501716239923642e-05, + "loss": 0.6579, + "step": 824 + }, + { + "epoch": 0.66, + "grad_norm": 0.36919831664676683, + "learning_rate": 5.4785811829683764e-05, + "loss": 0.6612, + "step": 825 + }, + { + "epoch": 0.6608, + "grad_norm": 0.40099308760264113, + "learning_rate": 5.4554765047001613e-05, + "loss": 0.7229, + "step": 826 + }, + { + "epoch": 0.6616, + "grad_norm": 0.37460079877045543, + "learning_rate": 5.432402360355615e-05, + "loss": 0.6234, + "step": 827 + }, + { + "epoch": 0.6624, + "grad_norm": 0.4774541551947916, + "learning_rate": 5.4093589049662175e-05, + "loss": 0.7966, + "step": 828 + }, + { + "epoch": 0.6632, + "grad_norm": 0.7561719087296993, + "learning_rate": 5.386346293357242e-05, + "loss": 0.7106, + "step": 829 + }, + { + "epoch": 0.664, + "grad_norm": 0.4944294627944639, + "learning_rate": 5.363364680146725e-05, + "loss": 0.7827, + "step": 830 + }, + { + "epoch": 0.6648, + "grad_norm": 0.3988663927068123, + "learning_rate": 5.3404142197444506e-05, + "loss": 0.691, + "step": 831 + }, + { + "epoch": 0.6656, + "grad_norm": 0.387011407683691, + "learning_rate": 5.31749506635086e-05, + "loss": 0.6942, + "step": 832 + }, + { + "epoch": 0.6664, + "grad_norm": 0.37000355400540574, + "learning_rate": 5.2946073739560706e-05, + "loss": 0.6871, + "step": 833 + }, + { + "epoch": 0.6672, + "grad_norm": 0.3502077070269879, + "learning_rate": 5.271751296338823e-05, + "loss": 0.6395, + "step": 834 + }, + { + "epoch": 0.668, + "grad_norm": 0.3618854175964115, + "learning_rate": 5.248926987065417e-05, + "loss": 0.6422, + "step": 835 + }, + { + "epoch": 0.6688, + "grad_norm": 0.4110760311370138, + "learning_rate": 5.226134599488728e-05, + "loss": 0.7164, + "step": 836 + }, + { + "epoch": 0.6696, + "grad_norm": 0.3812106705856821, + "learning_rate": 5.203374286747158e-05, + "loss": 0.6677, + "step": 837 + }, + { + "epoch": 0.6704, + "grad_norm": 0.3961513132352477, + "learning_rate": 5.180646201763577e-05, + "loss": 0.6548, + "step": 838 + }, + { + "epoch": 0.6712, + "grad_norm": 0.44302798222292566, + "learning_rate": 5.15795049724435e-05, + "loss": 0.7168, + "step": 839 + }, + { + "epoch": 0.672, + "grad_norm": 0.3259426827412798, + "learning_rate": 5.135287325678271e-05, + "loss": 0.6102, + "step": 840 + }, + { + "epoch": 0.6728, + "grad_norm": 0.43795911251231234, + "learning_rate": 5.112656839335543e-05, + "loss": 0.6708, + "step": 841 + }, + { + "epoch": 0.6736, + "grad_norm": 0.4003310878287411, + "learning_rate": 5.090059190266779e-05, + "loss": 0.6586, + "step": 842 + }, + { + "epoch": 0.6744, + "grad_norm": 0.4053910786415956, + "learning_rate": 5.0674945303019526e-05, + "loss": 0.6586, + "step": 843 + }, + { + "epoch": 0.6752, + "grad_norm": 0.3598752059918062, + "learning_rate": 5.0449630110493836e-05, + "loss": 0.6041, + "step": 844 + }, + { + "epoch": 0.676, + "grad_norm": 0.5320693967787626, + "learning_rate": 5.022464783894744e-05, + "loss": 0.6331, + "step": 845 + }, + { + "epoch": 0.6768, + "grad_norm": 0.41893315449125795, + "learning_rate": 5.000000000000002e-05, + "loss": 0.6564, + "step": 846 + }, + { + "epoch": 0.6776, + "grad_norm": 0.374339592096868, + "learning_rate": 4.977568810302432e-05, + "loss": 0.6935, + "step": 847 + }, + { + "epoch": 0.6784, + "grad_norm": 0.3520734439758967, + "learning_rate": 4.955171365513603e-05, + "loss": 0.6039, + "step": 848 + }, + { + "epoch": 0.6792, + "grad_norm": 0.42883636780261936, + "learning_rate": 4.9328078161183464e-05, + "loss": 0.6703, + "step": 849 + }, + { + "epoch": 0.68, + "grad_norm": 0.42231638922587356, + "learning_rate": 4.9104783123737566e-05, + "loss": 0.6704, + "step": 850 + }, + { + "epoch": 0.6808, + "grad_norm": 0.40143325519452655, + "learning_rate": 4.88818300430819e-05, + "loss": 0.6424, + "step": 851 + }, + { + "epoch": 0.6816, + "grad_norm": 0.38747811662920445, + "learning_rate": 4.865922041720239e-05, + "loss": 0.6868, + "step": 852 + }, + { + "epoch": 0.6824, + "grad_norm": 0.4437868087919298, + "learning_rate": 4.843695574177737e-05, + "loss": 0.778, + "step": 853 + }, + { + "epoch": 0.6832, + "grad_norm": 0.40466048504673857, + "learning_rate": 4.821503751016746e-05, + "loss": 0.6558, + "step": 854 + }, + { + "epoch": 0.684, + "grad_norm": 0.38222637554375055, + "learning_rate": 4.7993467213405706e-05, + "loss": 0.7072, + "step": 855 + }, + { + "epoch": 0.6848, + "grad_norm": 0.49677925510795323, + "learning_rate": 4.777224634018732e-05, + "loss": 0.7291, + "step": 856 + }, + { + "epoch": 0.6856, + "grad_norm": 0.4326515984567661, + "learning_rate": 4.755137637685979e-05, + "loss": 0.6775, + "step": 857 + }, + { + "epoch": 0.6864, + "grad_norm": 0.3734136131599116, + "learning_rate": 4.733085880741301e-05, + "loss": 0.6105, + "step": 858 + }, + { + "epoch": 0.6872, + "grad_norm": 0.3914199028046532, + "learning_rate": 4.7110695113469085e-05, + "loss": 0.6766, + "step": 859 + }, + { + "epoch": 0.688, + "grad_norm": 0.37051539308134473, + "learning_rate": 4.689088677427249e-05, + "loss": 0.6432, + "step": 860 + }, + { + "epoch": 0.6888, + "grad_norm": 0.40630829200241264, + "learning_rate": 4.6671435266680216e-05, + "loss": 0.6693, + "step": 861 + }, + { + "epoch": 0.6896, + "grad_norm": 0.38278665269565504, + "learning_rate": 4.645234206515171e-05, + "loss": 0.6326, + "step": 862 + }, + { + "epoch": 0.6904, + "grad_norm": 0.4713791334980711, + "learning_rate": 4.623360864173893e-05, + "loss": 0.6961, + "step": 863 + }, + { + "epoch": 0.6912, + "grad_norm": 0.41525455685034257, + "learning_rate": 4.6015236466076747e-05, + "loss": 0.6906, + "step": 864 + }, + { + "epoch": 0.692, + "grad_norm": 0.47351257421378523, + "learning_rate": 4.579722700537268e-05, + "loss": 0.6856, + "step": 865 + }, + { + "epoch": 0.6928, + "grad_norm": 0.4117121674376699, + "learning_rate": 4.5579581724397255e-05, + "loss": 0.6759, + "step": 866 + }, + { + "epoch": 0.6936, + "grad_norm": 0.4065285849628081, + "learning_rate": 4.5362302085474254e-05, + "loss": 0.6924, + "step": 867 + }, + { + "epoch": 0.6944, + "grad_norm": 0.3867487613641489, + "learning_rate": 4.514538954847064e-05, + "loss": 0.6441, + "step": 868 + }, + { + "epoch": 0.6952, + "grad_norm": 0.4505060652300184, + "learning_rate": 4.492884557078688e-05, + "loss": 0.6634, + "step": 869 + }, + { + "epoch": 0.696, + "grad_norm": 0.400603244643206, + "learning_rate": 4.471267160734731e-05, + "loss": 0.6884, + "step": 870 + }, + { + "epoch": 0.6968, + "grad_norm": 0.37652240477713944, + "learning_rate": 4.449686911058992e-05, + "loss": 0.6583, + "step": 871 + }, + { + "epoch": 0.6976, + "grad_norm": 0.38526985535595526, + "learning_rate": 4.428143953045717e-05, + "loss": 0.6328, + "step": 872 + }, + { + "epoch": 0.6984, + "grad_norm": 0.423397196617986, + "learning_rate": 4.406638431438576e-05, + "loss": 0.6883, + "step": 873 + }, + { + "epoch": 0.6992, + "grad_norm": 0.3701394363125963, + "learning_rate": 4.385170490729712e-05, + "loss": 0.6213, + "step": 874 + }, + { + "epoch": 0.7, + "grad_norm": 0.3921608532534879, + "learning_rate": 4.36374027515878e-05, + "loss": 0.7311, + "step": 875 + }, + { + "epoch": 0.7008, + "grad_norm": 0.39700260154913886, + "learning_rate": 4.342347928711953e-05, + "loss": 0.7046, + "step": 876 + }, + { + "epoch": 0.7016, + "grad_norm": 0.4149741300992126, + "learning_rate": 4.320993595120969e-05, + "loss": 0.7112, + "step": 877 + }, + { + "epoch": 0.7024, + "grad_norm": 0.43756902112109974, + "learning_rate": 4.2996774178621736e-05, + "loss": 0.7412, + "step": 878 + }, + { + "epoch": 0.7032, + "grad_norm": 0.3866813232683912, + "learning_rate": 4.278399540155536e-05, + "loss": 0.6727, + "step": 879 + }, + { + "epoch": 0.704, + "grad_norm": 0.41430265524677684, + "learning_rate": 4.257160104963696e-05, + "loss": 0.722, + "step": 880 + }, + { + "epoch": 0.7048, + "grad_norm": 0.3966499810715081, + "learning_rate": 4.2359592549910145e-05, + "loss": 0.6674, + "step": 881 + }, + { + "epoch": 0.7056, + "grad_norm": 0.39833178178931555, + "learning_rate": 4.2147971326825966e-05, + "loss": 0.6995, + "step": 882 + }, + { + "epoch": 0.7064, + "grad_norm": 0.4159629589849531, + "learning_rate": 4.193673880223339e-05, + "loss": 0.7203, + "step": 883 + }, + { + "epoch": 0.7072, + "grad_norm": 0.401390774208145, + "learning_rate": 4.172589639536991e-05, + "loss": 0.6434, + "step": 884 + }, + { + "epoch": 0.708, + "grad_norm": 0.4043627983147242, + "learning_rate": 4.1515445522851784e-05, + "loss": 0.7046, + "step": 885 + }, + { + "epoch": 0.7088, + "grad_norm": 0.4324011450674324, + "learning_rate": 4.130538759866457e-05, + "loss": 0.6595, + "step": 886 + }, + { + "epoch": 0.7096, + "grad_norm": 0.37240511002958815, + "learning_rate": 4.109572403415386e-05, + "loss": 0.6738, + "step": 887 + }, + { + "epoch": 0.7104, + "grad_norm": 0.4070645096267617, + "learning_rate": 4.088645623801534e-05, + "loss": 0.677, + "step": 888 + }, + { + "epoch": 0.7112, + "grad_norm": 0.38932169604234196, + "learning_rate": 4.0677585616285774e-05, + "loss": 0.6803, + "step": 889 + }, + { + "epoch": 0.712, + "grad_norm": 0.37185870326470627, + "learning_rate": 4.046911357233343e-05, + "loss": 0.678, + "step": 890 + }, + { + "epoch": 0.7128, + "grad_norm": 0.36611981750305955, + "learning_rate": 4.026104150684835e-05, + "loss": 0.6722, + "step": 891 + }, + { + "epoch": 0.7136, + "grad_norm": 0.3442390261507083, + "learning_rate": 4.00533708178334e-05, + "loss": 0.6463, + "step": 892 + }, + { + "epoch": 0.7144, + "grad_norm": 0.4968410757107018, + "learning_rate": 3.984610290059467e-05, + "loss": 0.8251, + "step": 893 + }, + { + "epoch": 0.7152, + "grad_norm": 0.37161377786132654, + "learning_rate": 3.963923914773187e-05, + "loss": 0.6658, + "step": 894 + }, + { + "epoch": 0.716, + "grad_norm": 0.35375458841986335, + "learning_rate": 3.943278094912946e-05, + "loss": 0.5788, + "step": 895 + }, + { + "epoch": 0.7168, + "grad_norm": 0.3754231113757112, + "learning_rate": 3.922672969194686e-05, + "loss": 0.7152, + "step": 896 + }, + { + "epoch": 0.7176, + "grad_norm": 0.3758166178101138, + "learning_rate": 3.902108676060937e-05, + "loss": 0.6908, + "step": 897 + }, + { + "epoch": 0.7184, + "grad_norm": 0.3804299058221594, + "learning_rate": 3.8815853536798904e-05, + "loss": 0.6545, + "step": 898 + }, + { + "epoch": 0.7192, + "grad_norm": 0.3654851011954076, + "learning_rate": 3.861103139944449e-05, + "loss": 0.6307, + "step": 899 + }, + { + "epoch": 0.72, + "grad_norm": 0.44475715948671113, + "learning_rate": 3.840662172471315e-05, + "loss": 0.7652, + "step": 900 + }, + { + "epoch": 0.7208, + "grad_norm": 0.3950888083795495, + "learning_rate": 3.820262588600074e-05, + "loss": 0.6678, + "step": 901 + }, + { + "epoch": 0.7216, + "grad_norm": 0.3437194647641541, + "learning_rate": 3.79990452539225e-05, + "loss": 0.5405, + "step": 902 + }, + { + "epoch": 0.7224, + "grad_norm": 0.3479004406263855, + "learning_rate": 3.7795881196303995e-05, + "loss": 0.5977, + "step": 903 + }, + { + "epoch": 0.7232, + "grad_norm": 0.40146136196053567, + "learning_rate": 3.759313507817196e-05, + "loss": 0.6936, + "step": 904 + }, + { + "epoch": 0.724, + "grad_norm": 0.41475881491378036, + "learning_rate": 3.739080826174498e-05, + "loss": 0.7502, + "step": 905 + }, + { + "epoch": 0.7248, + "grad_norm": 0.4029168618382945, + "learning_rate": 3.7188902106424416e-05, + "loss": 0.6722, + "step": 906 + }, + { + "epoch": 0.7256, + "grad_norm": 0.3924302456099913, + "learning_rate": 3.6987417968785366e-05, + "loss": 0.7086, + "step": 907 + }, + { + "epoch": 0.7264, + "grad_norm": 0.36298594811321255, + "learning_rate": 3.678635720256737e-05, + "loss": 0.6135, + "step": 908 + }, + { + "epoch": 0.7272, + "grad_norm": 0.325462761884952, + "learning_rate": 3.658572115866541e-05, + "loss": 0.5866, + "step": 909 + }, + { + "epoch": 0.728, + "grad_norm": 0.3821441445340449, + "learning_rate": 3.638551118512089e-05, + "loss": 0.6541, + "step": 910 + }, + { + "epoch": 0.7288, + "grad_norm": 0.39807602958990873, + "learning_rate": 3.618572862711247e-05, + "loss": 0.7102, + "step": 911 + }, + { + "epoch": 0.7296, + "grad_norm": 0.40302811259743876, + "learning_rate": 3.5986374826947066e-05, + "loss": 0.6601, + "step": 912 + }, + { + "epoch": 0.7304, + "grad_norm": 0.4090597729618086, + "learning_rate": 3.578745112405083e-05, + "loss": 0.7116, + "step": 913 + }, + { + "epoch": 0.7312, + "grad_norm": 0.4242164133707293, + "learning_rate": 3.558895885496023e-05, + "loss": 0.6933, + "step": 914 + }, + { + "epoch": 0.732, + "grad_norm": 0.381356552185187, + "learning_rate": 3.539089935331294e-05, + "loss": 0.701, + "step": 915 + }, + { + "epoch": 0.7328, + "grad_norm": 0.42053075164360254, + "learning_rate": 3.519327394983888e-05, + "loss": 0.7176, + "step": 916 + }, + { + "epoch": 0.7336, + "grad_norm": 0.43103610147891575, + "learning_rate": 3.4996083972351515e-05, + "loss": 0.731, + "step": 917 + }, + { + "epoch": 0.7344, + "grad_norm": 0.426687475459895, + "learning_rate": 3.479933074573858e-05, + "loss": 0.6651, + "step": 918 + }, + { + "epoch": 0.7352, + "grad_norm": 0.34436746852459027, + "learning_rate": 3.4603015591953395e-05, + "loss": 0.6021, + "step": 919 + }, + { + "epoch": 0.736, + "grad_norm": 0.3322724494396109, + "learning_rate": 3.440713983000601e-05, + "loss": 0.5933, + "step": 920 + }, + { + "epoch": 0.7368, + "grad_norm": 0.46660288239031206, + "learning_rate": 3.421170477595419e-05, + "loss": 0.7253, + "step": 921 + }, + { + "epoch": 0.7376, + "grad_norm": 0.39972170697874304, + "learning_rate": 3.401671174289469e-05, + "loss": 0.6955, + "step": 922 + }, + { + "epoch": 0.7384, + "grad_norm": 0.4484926930912154, + "learning_rate": 3.3822162040954354e-05, + "loss": 0.7472, + "step": 923 + }, + { + "epoch": 0.7392, + "grad_norm": 0.37842392041542816, + "learning_rate": 3.362805697728145e-05, + "loss": 0.6795, + "step": 924 + }, + { + "epoch": 0.74, + "grad_norm": 0.33310293753759246, + "learning_rate": 3.34343978560367e-05, + "loss": 0.5907, + "step": 925 + }, + { + "epoch": 0.7408, + "grad_norm": 0.4084447231786101, + "learning_rate": 3.324118597838464e-05, + "loss": 0.683, + "step": 926 + }, + { + "epoch": 0.7416, + "grad_norm": 0.37544847644307205, + "learning_rate": 3.3048422642484886e-05, + "loss": 0.7079, + "step": 927 + }, + { + "epoch": 0.7424, + "grad_norm": 0.4048292248179034, + "learning_rate": 3.285610914348332e-05, + "loss": 0.6581, + "step": 928 + }, + { + "epoch": 0.7432, + "grad_norm": 0.39325536956079654, + "learning_rate": 3.266424677350346e-05, + "loss": 0.7142, + "step": 929 + }, + { + "epoch": 0.744, + "grad_norm": 0.387008656575126, + "learning_rate": 3.2472836821637744e-05, + "loss": 0.6504, + "step": 930 + }, + { + "epoch": 0.7448, + "grad_norm": 0.40176436480323197, + "learning_rate": 3.228188057393895e-05, + "loss": 0.6707, + "step": 931 + }, + { + "epoch": 0.7456, + "grad_norm": 0.35642080243266405, + "learning_rate": 3.209137931341143e-05, + "loss": 0.6398, + "step": 932 + }, + { + "epoch": 0.7464, + "grad_norm": 0.3840829495853328, + "learning_rate": 3.190133432000252e-05, + "loss": 0.7108, + "step": 933 + }, + { + "epoch": 0.7472, + "grad_norm": 0.34226618771386697, + "learning_rate": 3.1711746870594086e-05, + "loss": 0.5883, + "step": 934 + }, + { + "epoch": 0.748, + "grad_norm": 0.3730526708966016, + "learning_rate": 3.1522618238993725e-05, + "loss": 0.6561, + "step": 935 + }, + { + "epoch": 0.7488, + "grad_norm": 0.5133943423547346, + "learning_rate": 3.1333949695926324e-05, + "loss": 0.7501, + "step": 936 + }, + { + "epoch": 0.7496, + "grad_norm": 0.45245367415824433, + "learning_rate": 3.114574250902558e-05, + "loss": 0.7075, + "step": 937 + }, + { + "epoch": 0.7504, + "grad_norm": 0.38279937588237534, + "learning_rate": 3.0957997942825336e-05, + "loss": 0.6271, + "step": 938 + }, + { + "epoch": 0.7512, + "grad_norm": 0.48128672007504014, + "learning_rate": 3.077071725875116e-05, + "loss": 0.7937, + "step": 939 + }, + { + "epoch": 0.752, + "grad_norm": 0.38337871736257334, + "learning_rate": 3.058390171511196e-05, + "loss": 0.671, + "step": 940 + }, + { + "epoch": 0.7528, + "grad_norm": 0.5835878948960984, + "learning_rate": 3.0397552567091337e-05, + "loss": 0.5881, + "step": 941 + }, + { + "epoch": 0.7536, + "grad_norm": 0.3869196656143522, + "learning_rate": 3.021167106673928e-05, + "loss": 0.6597, + "step": 942 + }, + { + "epoch": 0.7544, + "grad_norm": 0.40884136149356204, + "learning_rate": 3.0026258462963787e-05, + "loss": 0.6587, + "step": 943 + }, + { + "epoch": 0.7552, + "grad_norm": 0.3643087609647497, + "learning_rate": 2.9841316001522347e-05, + "loss": 0.619, + "step": 944 + }, + { + "epoch": 0.756, + "grad_norm": 0.38046995770456293, + "learning_rate": 2.9656844925013637e-05, + "loss": 0.6563, + "step": 945 + }, + { + "epoch": 0.7568, + "grad_norm": 0.3673923745337542, + "learning_rate": 2.9472846472869298e-05, + "loss": 0.6593, + "step": 946 + }, + { + "epoch": 0.7576, + "grad_norm": 0.3903864667528407, + "learning_rate": 2.9289321881345254e-05, + "loss": 0.6201, + "step": 947 + }, + { + "epoch": 0.7584, + "grad_norm": 0.4334368171640703, + "learning_rate": 2.9106272383513835e-05, + "loss": 0.7329, + "step": 948 + }, + { + "epoch": 0.7592, + "grad_norm": 0.4470775758297611, + "learning_rate": 2.8923699209255284e-05, + "loss": 0.7725, + "step": 949 + }, + { + "epoch": 0.76, + "grad_norm": 0.36984125610935664, + "learning_rate": 2.874160358524931e-05, + "loss": 0.6816, + "step": 950 + }, + { + "epoch": 0.7608, + "grad_norm": 0.393785262217113, + "learning_rate": 2.8559986734967282e-05, + "loss": 0.675, + "step": 951 + }, + { + "epoch": 0.7616, + "grad_norm": 0.3862927167505613, + "learning_rate": 2.8378849878663628e-05, + "loss": 0.6531, + "step": 952 + }, + { + "epoch": 0.7624, + "grad_norm": 0.3991015244488483, + "learning_rate": 2.819819423336775e-05, + "loss": 0.7225, + "step": 953 + }, + { + "epoch": 0.7632, + "grad_norm": 0.4217027050575871, + "learning_rate": 2.8018021012875994e-05, + "loss": 0.6542, + "step": 954 + }, + { + "epoch": 0.764, + "grad_norm": 0.424375649472465, + "learning_rate": 2.7838331427743282e-05, + "loss": 0.7528, + "step": 955 + }, + { + "epoch": 0.7648, + "grad_norm": 0.3879980379001011, + "learning_rate": 2.7659126685275027e-05, + "loss": 0.6424, + "step": 956 + }, + { + "epoch": 0.7656, + "grad_norm": 0.4419791925895399, + "learning_rate": 2.7480407989519198e-05, + "loss": 0.7411, + "step": 957 + }, + { + "epoch": 0.7664, + "grad_norm": 0.560057032805807, + "learning_rate": 2.7302176541257986e-05, + "loss": 0.6546, + "step": 958 + }, + { + "epoch": 0.7672, + "grad_norm": 0.4247675870095834, + "learning_rate": 2.712443353799984e-05, + "loss": 0.6494, + "step": 959 + }, + { + "epoch": 0.768, + "grad_norm": 0.3762748956971542, + "learning_rate": 2.6947180173971508e-05, + "loss": 0.6904, + "step": 960 + }, + { + "epoch": 0.7688, + "grad_norm": 0.3702584404711336, + "learning_rate": 2.677041764010988e-05, + "loss": 0.6745, + "step": 961 + }, + { + "epoch": 0.7696, + "grad_norm": 0.38356258934630244, + "learning_rate": 2.659414712405398e-05, + "loss": 0.6497, + "step": 962 + }, + { + "epoch": 0.7704, + "grad_norm": 0.40398104418654535, + "learning_rate": 2.6418369810137188e-05, + "loss": 0.6966, + "step": 963 + }, + { + "epoch": 0.7712, + "grad_norm": 0.39109643003107714, + "learning_rate": 2.6243086879379e-05, + "loss": 0.6976, + "step": 964 + }, + { + "epoch": 0.772, + "grad_norm": 0.42202229115813555, + "learning_rate": 2.6068299509477266e-05, + "loss": 0.6624, + "step": 965 + }, + { + "epoch": 0.7728, + "grad_norm": 0.39471222639644493, + "learning_rate": 2.5894008874800325e-05, + "loss": 0.7129, + "step": 966 + }, + { + "epoch": 0.7736, + "grad_norm": 0.38103435763156235, + "learning_rate": 2.5720216146378917e-05, + "loss": 0.7002, + "step": 967 + }, + { + "epoch": 0.7744, + "grad_norm": 0.3891360443952935, + "learning_rate": 2.5546922491898495e-05, + "loss": 0.6737, + "step": 968 + }, + { + "epoch": 0.7752, + "grad_norm": 0.3891741702846402, + "learning_rate": 2.5374129075691265e-05, + "loss": 0.6595, + "step": 969 + }, + { + "epoch": 0.776, + "grad_norm": 0.39202566812660267, + "learning_rate": 2.5201837058728505e-05, + "loss": 0.714, + "step": 970 + }, + { + "epoch": 0.7768, + "grad_norm": 0.40870840104373474, + "learning_rate": 2.503004759861258e-05, + "loss": 0.709, + "step": 971 + }, + { + "epoch": 0.7776, + "grad_norm": 0.34838708575276983, + "learning_rate": 2.485876184956928e-05, + "loss": 0.6257, + "step": 972 + }, + { + "epoch": 0.7784, + "grad_norm": 0.39229349183084955, + "learning_rate": 2.4687980962440072e-05, + "loss": 0.6919, + "step": 973 + }, + { + "epoch": 0.7792, + "grad_norm": 0.3885075596834621, + "learning_rate": 2.451770608467432e-05, + "loss": 0.68, + "step": 974 + }, + { + "epoch": 0.78, + "grad_norm": 0.3653202740567082, + "learning_rate": 2.4347938360321566e-05, + "loss": 0.6456, + "step": 975 + }, + { + "epoch": 0.7808, + "grad_norm": 0.38122600677793544, + "learning_rate": 2.417867893002387e-05, + "loss": 0.6755, + "step": 976 + }, + { + "epoch": 0.7816, + "grad_norm": 0.40469846986495295, + "learning_rate": 2.400992893100822e-05, + "loss": 0.7281, + "step": 977 + }, + { + "epoch": 0.7824, + "grad_norm": 0.3874646319132377, + "learning_rate": 2.3841689497078746e-05, + "loss": 0.6987, + "step": 978 + }, + { + "epoch": 0.7832, + "grad_norm": 0.38652009318610764, + "learning_rate": 2.3673961758609152e-05, + "loss": 0.6498, + "step": 979 + }, + { + "epoch": 0.784, + "grad_norm": 0.4195905775332029, + "learning_rate": 2.3506746842535242e-05, + "loss": 0.6878, + "step": 980 + }, + { + "epoch": 0.7848, + "grad_norm": 0.3332753764965799, + "learning_rate": 2.334004587234717e-05, + "loss": 0.631, + "step": 981 + }, + { + "epoch": 0.7856, + "grad_norm": 0.4424376017968654, + "learning_rate": 2.3173859968081944e-05, + "loss": 0.6776, + "step": 982 + }, + { + "epoch": 0.7864, + "grad_norm": 0.3501051595988133, + "learning_rate": 2.300819024631603e-05, + "loss": 0.5838, + "step": 983 + }, + { + "epoch": 0.7872, + "grad_norm": 0.40590581314438823, + "learning_rate": 2.2843037820157675e-05, + "loss": 0.7233, + "step": 984 + }, + { + "epoch": 0.788, + "grad_norm": 0.3865285519165861, + "learning_rate": 2.26784037992395e-05, + "loss": 0.6915, + "step": 985 + }, + { + "epoch": 0.7888, + "grad_norm": 0.3729463126646092, + "learning_rate": 2.251428928971102e-05, + "loss": 0.6419, + "step": 986 + }, + { + "epoch": 0.7896, + "grad_norm": 0.3891660028099139, + "learning_rate": 2.2350695394231345e-05, + "loss": 0.6574, + "step": 987 + }, + { + "epoch": 0.7904, + "grad_norm": 0.37516748591806176, + "learning_rate": 2.2187623211961562e-05, + "loss": 0.694, + "step": 988 + }, + { + "epoch": 0.7912, + "grad_norm": 0.3627710948078153, + "learning_rate": 2.2025073838557454e-05, + "loss": 0.6426, + "step": 989 + }, + { + "epoch": 0.792, + "grad_norm": 0.34430677201245696, + "learning_rate": 2.1863048366162208e-05, + "loss": 0.6224, + "step": 990 + }, + { + "epoch": 0.7928, + "grad_norm": 0.34143260859065744, + "learning_rate": 2.1701547883398922e-05, + "loss": 0.616, + "step": 991 + }, + { + "epoch": 0.7936, + "grad_norm": 0.42325194237531244, + "learning_rate": 2.1540573475363402e-05, + "loss": 0.6802, + "step": 992 + }, + { + "epoch": 0.7944, + "grad_norm": 0.374011082379245, + "learning_rate": 2.138012622361689e-05, + "loss": 0.6682, + "step": 993 + }, + { + "epoch": 0.7952, + "grad_norm": 0.3924664703808738, + "learning_rate": 2.1220207206178688e-05, + "loss": 0.6557, + "step": 994 + }, + { + "epoch": 0.796, + "grad_norm": 0.3453172403567277, + "learning_rate": 2.106081749751897e-05, + "loss": 0.6273, + "step": 995 + }, + { + "epoch": 0.7968, + "grad_norm": 0.3709064056768632, + "learning_rate": 2.0901958168551638e-05, + "loss": 0.675, + "step": 996 + }, + { + "epoch": 0.7976, + "grad_norm": 0.409732990870048, + "learning_rate": 2.0743630286627002e-05, + "loss": 0.7157, + "step": 997 + }, + { + "epoch": 0.7984, + "grad_norm": 0.367012562447277, + "learning_rate": 2.058583491552465e-05, + "loss": 0.6951, + "step": 998 + }, + { + "epoch": 0.7992, + "grad_norm": 0.3509939591651848, + "learning_rate": 2.0428573115446392e-05, + "loss": 0.65, + "step": 999 + }, + { + "epoch": 0.8, + "grad_norm": 0.44200231466098977, + "learning_rate": 2.027184594300898e-05, + "loss": 0.6547, + "step": 1000 + }, + { + "epoch": 0.8008, + "grad_norm": 0.40441240990297683, + "learning_rate": 2.011565445123711e-05, + "loss": 0.7347, + "step": 1001 + }, + { + "epoch": 0.8016, + "grad_norm": 0.38852090414582974, + "learning_rate": 1.995999968955641e-05, + "loss": 0.6448, + "step": 1002 + }, + { + "epoch": 0.8024, + "grad_norm": 0.37208957771394713, + "learning_rate": 1.980488270378612e-05, + "loss": 0.6137, + "step": 1003 + }, + { + "epoch": 0.8032, + "grad_norm": 0.36708020272901665, + "learning_rate": 1.9650304536132426e-05, + "loss": 0.6749, + "step": 1004 + }, + { + "epoch": 0.804, + "grad_norm": 0.3866779440010189, + "learning_rate": 1.9496266225181248e-05, + "loss": 0.6213, + "step": 1005 + }, + { + "epoch": 0.8048, + "grad_norm": 0.4388966309440129, + "learning_rate": 1.9342768805891178e-05, + "loss": 0.7146, + "step": 1006 + }, + { + "epoch": 0.8056, + "grad_norm": 0.4163581109381185, + "learning_rate": 1.918981330958678e-05, + "loss": 0.6568, + "step": 1007 + }, + { + "epoch": 0.8064, + "grad_norm": 0.34924193907392403, + "learning_rate": 1.903740076395151e-05, + "loss": 0.6308, + "step": 1008 + }, + { + "epoch": 0.8072, + "grad_norm": 0.38487892629432047, + "learning_rate": 1.8885532193020704e-05, + "loss": 0.658, + "step": 1009 + }, + { + "epoch": 0.808, + "grad_norm": 0.419568928978815, + "learning_rate": 1.8734208617174988e-05, + "loss": 0.7342, + "step": 1010 + }, + { + "epoch": 0.8088, + "grad_norm": 0.4824537645721057, + "learning_rate": 1.8583431053133127e-05, + "loss": 0.8479, + "step": 1011 + }, + { + "epoch": 0.8096, + "grad_norm": 0.4886160452022071, + "learning_rate": 1.8433200513945337e-05, + "loss": 0.654, + "step": 1012 + }, + { + "epoch": 0.8104, + "grad_norm": 0.3899441504671415, + "learning_rate": 1.8283518008986567e-05, + "loss": 0.7006, + "step": 1013 + }, + { + "epoch": 0.8112, + "grad_norm": 0.41500226449489414, + "learning_rate": 1.8134384543949478e-05, + "loss": 0.6719, + "step": 1014 + }, + { + "epoch": 0.812, + "grad_norm": 0.36673090297500144, + "learning_rate": 1.7985801120837865e-05, + "loss": 0.6516, + "step": 1015 + }, + { + "epoch": 0.8128, + "grad_norm": 0.46624315341238187, + "learning_rate": 1.783776873795994e-05, + "loss": 0.6608, + "step": 1016 + }, + { + "epoch": 0.8136, + "grad_norm": 0.4046771077568921, + "learning_rate": 1.7690288389921493e-05, + "loss": 0.7124, + "step": 1017 + }, + { + "epoch": 0.8144, + "grad_norm": 0.45519910379364, + "learning_rate": 1.754336106761927e-05, + "loss": 0.6935, + "step": 1018 + }, + { + "epoch": 0.8152, + "grad_norm": 0.4163406513330448, + "learning_rate": 1.739698775823442e-05, + "loss": 0.6798, + "step": 1019 + }, + { + "epoch": 0.816, + "grad_norm": 0.3847504340934355, + "learning_rate": 1.7251169445225657e-05, + "loss": 0.6495, + "step": 1020 + }, + { + "epoch": 0.8168, + "grad_norm": 0.3837095644385684, + "learning_rate": 1.7105907108322816e-05, + "loss": 0.5916, + "step": 1021 + }, + { + "epoch": 0.8176, + "grad_norm": 0.3798862260824479, + "learning_rate": 1.696120172352025e-05, + "loss": 0.5924, + "step": 1022 + }, + { + "epoch": 0.8184, + "grad_norm": 0.4342551838691338, + "learning_rate": 1.6817054263070174e-05, + "loss": 0.7215, + "step": 1023 + }, + { + "epoch": 0.8192, + "grad_norm": 0.3891573762790059, + "learning_rate": 1.6673465695476232e-05, + "loss": 0.6261, + "step": 1024 + }, + { + "epoch": 0.82, + "grad_norm": 0.39374066114535394, + "learning_rate": 1.6530436985486996e-05, + "loss": 0.737, + "step": 1025 + }, + { + "epoch": 0.8208, + "grad_norm": 0.4235606753007906, + "learning_rate": 1.6387969094089316e-05, + "loss": 0.7349, + "step": 1026 + }, + { + "epoch": 0.8216, + "grad_norm": 0.38219200505475237, + "learning_rate": 1.6246062978502164e-05, + "loss": 0.6664, + "step": 1027 + }, + { + "epoch": 0.8224, + "grad_norm": 0.4168244866608693, + "learning_rate": 1.6104719592169902e-05, + "loss": 0.6793, + "step": 1028 + }, + { + "epoch": 0.8232, + "grad_norm": 0.37050304408128276, + "learning_rate": 1.5963939884756042e-05, + "loss": 0.6408, + "step": 1029 + }, + { + "epoch": 0.824, + "grad_norm": 0.7747700697770986, + "learning_rate": 1.5823724802136865e-05, + "loss": 0.7048, + "step": 1030 + }, + { + "epoch": 0.8248, + "grad_norm": 0.4379270226232551, + "learning_rate": 1.5684075286394985e-05, + "loss": 0.6912, + "step": 1031 + }, + { + "epoch": 0.8256, + "grad_norm": 0.38641061387438236, + "learning_rate": 1.5544992275813053e-05, + "loss": 0.6661, + "step": 1032 + }, + { + "epoch": 0.8264, + "grad_norm": 0.3699145427107198, + "learning_rate": 1.5406476704867524e-05, + "loss": 0.5911, + "step": 1033 + }, + { + "epoch": 0.8272, + "grad_norm": 0.37678064603454414, + "learning_rate": 1.526852950422226e-05, + "loss": 0.6379, + "step": 1034 + }, + { + "epoch": 0.828, + "grad_norm": 0.38276573145225384, + "learning_rate": 1.5131151600722337e-05, + "loss": 0.6688, + "step": 1035 + }, + { + "epoch": 0.8288, + "grad_norm": 0.35659385165223567, + "learning_rate": 1.4994343917387854e-05, + "loss": 0.6418, + "step": 1036 + }, + { + "epoch": 0.8296, + "grad_norm": 0.4385984497720718, + "learning_rate": 1.485810737340767e-05, + "loss": 0.7101, + "step": 1037 + }, + { + "epoch": 0.8304, + "grad_norm": 0.37131113697001, + "learning_rate": 1.4722442884133214e-05, + "loss": 0.6694, + "step": 1038 + }, + { + "epoch": 0.8312, + "grad_norm": 0.39251911154546165, + "learning_rate": 1.4587351361072454e-05, + "loss": 0.6851, + "step": 1039 + }, + { + "epoch": 0.832, + "grad_norm": 0.39604885394823114, + "learning_rate": 1.4452833711883628e-05, + "loss": 0.6319, + "step": 1040 + }, + { + "epoch": 0.8328, + "grad_norm": 0.40946975711386363, + "learning_rate": 1.4318890840369182e-05, + "loss": 0.6883, + "step": 1041 + }, + { + "epoch": 0.8336, + "grad_norm": 0.3962931848637794, + "learning_rate": 1.4185523646469822e-05, + "loss": 0.7025, + "step": 1042 + }, + { + "epoch": 0.8344, + "grad_norm": 0.35824411852444177, + "learning_rate": 1.4052733026258281e-05, + "loss": 0.647, + "step": 1043 + }, + { + "epoch": 0.8352, + "grad_norm": 0.44805131354996924, + "learning_rate": 1.3920519871933424e-05, + "loss": 0.7499, + "step": 1044 + }, + { + "epoch": 0.836, + "grad_norm": 0.37839428607888675, + "learning_rate": 1.3788885071814172e-05, + "loss": 0.6739, + "step": 1045 + }, + { + "epoch": 0.8368, + "grad_norm": 0.4316955359945715, + "learning_rate": 1.3657829510333654e-05, + "loss": 0.6555, + "step": 1046 + }, + { + "epoch": 0.8376, + "grad_norm": 0.485425163010363, + "learning_rate": 1.3527354068033139e-05, + "loss": 0.7962, + "step": 1047 + }, + { + "epoch": 0.8384, + "grad_norm": 0.4265406221697592, + "learning_rate": 1.339745962155613e-05, + "loss": 0.7392, + "step": 1048 + }, + { + "epoch": 0.8392, + "grad_norm": 0.3996657451117564, + "learning_rate": 1.326814704364262e-05, + "loss": 0.7219, + "step": 1049 + }, + { + "epoch": 0.84, + "grad_norm": 0.33785709813005427, + "learning_rate": 1.3139417203123027e-05, + "loss": 0.6191, + "step": 1050 + }, + { + "epoch": 0.8408, + "grad_norm": 0.3986088500140371, + "learning_rate": 1.3011270964912459e-05, + "loss": 0.7453, + "step": 1051 + }, + { + "epoch": 0.8416, + "grad_norm": 0.4304343313428344, + "learning_rate": 1.2883709190004955e-05, + "loss": 0.7237, + "step": 1052 + }, + { + "epoch": 0.8424, + "grad_norm": 0.33128590330089164, + "learning_rate": 1.275673273546758e-05, + "loss": 0.5727, + "step": 1053 + }, + { + "epoch": 0.8432, + "grad_norm": 0.3834852029532719, + "learning_rate": 1.263034245443473e-05, + "loss": 0.6106, + "step": 1054 + }, + { + "epoch": 0.844, + "grad_norm": 0.4003658469139451, + "learning_rate": 1.2504539196102439e-05, + "loss": 0.739, + "step": 1055 + }, + { + "epoch": 0.8448, + "grad_norm": 0.39407968931740417, + "learning_rate": 1.2379323805722576e-05, + "loss": 0.6444, + "step": 1056 + }, + { + "epoch": 0.8456, + "grad_norm": 0.3795392149229405, + "learning_rate": 1.2254697124597237e-05, + "loss": 0.6499, + "step": 1057 + }, + { + "epoch": 0.8464, + "grad_norm": 0.36122921429516935, + "learning_rate": 1.2130659990073146e-05, + "loss": 0.6214, + "step": 1058 + }, + { + "epoch": 0.8472, + "grad_norm": 0.35053543406634635, + "learning_rate": 1.2007213235535786e-05, + "loss": 0.6659, + "step": 1059 + }, + { + "epoch": 0.848, + "grad_norm": 0.37714585249730426, + "learning_rate": 1.1884357690404158e-05, + "loss": 0.6293, + "step": 1060 + }, + { + "epoch": 0.8488, + "grad_norm": 0.3355620267539819, + "learning_rate": 1.176209418012495e-05, + "loss": 0.6256, + "step": 1061 + }, + { + "epoch": 0.8496, + "grad_norm": 0.4178624387608608, + "learning_rate": 1.1640423526166988e-05, + "loss": 0.6817, + "step": 1062 + }, + { + "epoch": 0.8504, + "grad_norm": 0.4087431841898358, + "learning_rate": 1.1519346546015907e-05, + "loss": 0.7479, + "step": 1063 + }, + { + "epoch": 0.8512, + "grad_norm": 0.4338977119500379, + "learning_rate": 1.1398864053168534e-05, + "loss": 0.6792, + "step": 1064 + }, + { + "epoch": 0.852, + "grad_norm": 0.4014367638097192, + "learning_rate": 1.1278976857127311e-05, + "loss": 0.6514, + "step": 1065 + }, + { + "epoch": 0.8528, + "grad_norm": 0.36813594794623244, + "learning_rate": 1.1159685763395111e-05, + "loss": 0.635, + "step": 1066 + }, + { + "epoch": 0.8536, + "grad_norm": 0.5653983945250799, + "learning_rate": 1.1040991573469629e-05, + "loss": 0.7264, + "step": 1067 + }, + { + "epoch": 0.8544, + "grad_norm": 0.3957528246204374, + "learning_rate": 1.0922895084838037e-05, + "loss": 0.6174, + "step": 1068 + }, + { + "epoch": 0.8552, + "grad_norm": 0.41306059413718516, + "learning_rate": 1.0805397090971737e-05, + "loss": 0.6637, + "step": 1069 + }, + { + "epoch": 0.856, + "grad_norm": 0.3510167161446504, + "learning_rate": 1.0688498381320855e-05, + "loss": 0.6491, + "step": 1070 + }, + { + "epoch": 0.8568, + "grad_norm": 0.3266469766942705, + "learning_rate": 1.057219974130903e-05, + "loss": 0.6036, + "step": 1071 + }, + { + "epoch": 0.8576, + "grad_norm": 0.39283761058480665, + "learning_rate": 1.045650195232819e-05, + "loss": 0.6192, + "step": 1072 + }, + { + "epoch": 0.8584, + "grad_norm": 0.3823300008834905, + "learning_rate": 1.0341405791733183e-05, + "loss": 0.6677, + "step": 1073 + }, + { + "epoch": 0.8592, + "grad_norm": 0.38639058342171195, + "learning_rate": 1.0226912032836611e-05, + "loss": 0.7045, + "step": 1074 + }, + { + "epoch": 0.86, + "grad_norm": 0.34967644464562886, + "learning_rate": 1.0113021444903726e-05, + "loss": 0.6394, + "step": 1075 + }, + { + "epoch": 0.8608, + "grad_norm": 0.36660670673179757, + "learning_rate": 9.999734793146998e-06, + "loss": 0.6443, + "step": 1076 + }, + { + "epoch": 0.8616, + "grad_norm": 0.4103450467697293, + "learning_rate": 9.887052838721322e-06, + "loss": 0.693, + "step": 1077 + }, + { + "epoch": 0.8624, + "grad_norm": 0.3729181374994256, + "learning_rate": 9.774976338718677e-06, + "loss": 0.6148, + "step": 1078 + }, + { + "epoch": 0.8632, + "grad_norm": 0.380542232685799, + "learning_rate": 9.663506046162985e-06, + "loss": 0.6933, + "step": 1079 + }, + { + "epoch": 0.864, + "grad_norm": 0.3801809923588539, + "learning_rate": 9.552642710005299e-06, + "loss": 0.6373, + "step": 1080 + }, + { + "epoch": 0.8648, + "grad_norm": 0.4290350941082846, + "learning_rate": 9.44238707511862e-06, + "loss": 0.6806, + "step": 1081 + }, + { + "epoch": 0.8656, + "grad_norm": 0.40742883431682103, + "learning_rate": 9.332739882292752e-06, + "loss": 0.6822, + "step": 1082 + }, + { + "epoch": 0.8664, + "grad_norm": 0.3990060719895597, + "learning_rate": 9.22370186822965e-06, + "loss": 0.6832, + "step": 1083 + }, + { + "epoch": 0.8672, + "grad_norm": 0.42772499475750525, + "learning_rate": 9.115273765538202e-06, + "loss": 0.757, + "step": 1084 + }, + { + "epoch": 0.868, + "grad_norm": 0.4410430739892881, + "learning_rate": 9.0074563027294e-06, + "loss": 0.687, + "step": 1085 + }, + { + "epoch": 0.8688, + "grad_norm": 0.37499319503363765, + "learning_rate": 8.900250204211514e-06, + "loss": 0.7777, + "step": 1086 + }, + { + "epoch": 0.8696, + "grad_norm": 0.3857697346139549, + "learning_rate": 8.79365619028507e-06, + "loss": 0.6491, + "step": 1087 + }, + { + "epoch": 0.8704, + "grad_norm": 0.4005388401587205, + "learning_rate": 8.687674977138116e-06, + "loss": 0.6342, + "step": 1088 + }, + { + "epoch": 0.8712, + "grad_norm": 0.4245050931906198, + "learning_rate": 8.582307276841462e-06, + "loss": 0.6503, + "step": 1089 + }, + { + "epoch": 0.872, + "grad_norm": 0.39072486114874533, + "learning_rate": 8.47755379734373e-06, + "loss": 0.6484, + "step": 1090 + }, + { + "epoch": 0.8728, + "grad_norm": 0.3370705386163272, + "learning_rate": 8.37341524246672e-06, + "loss": 0.6533, + "step": 1091 + }, + { + "epoch": 0.8736, + "grad_norm": 0.4577072302250668, + "learning_rate": 8.269892311900696e-06, + "loss": 0.7518, + "step": 1092 + }, + { + "epoch": 0.8744, + "grad_norm": 0.3865547149956887, + "learning_rate": 8.166985701199582e-06, + "loss": 0.6319, + "step": 1093 + }, + { + "epoch": 0.8752, + "grad_norm": 0.4037353871825628, + "learning_rate": 8.064696101776358e-06, + "loss": 0.7181, + "step": 1094 + }, + { + "epoch": 0.876, + "grad_norm": 0.3893878280163203, + "learning_rate": 7.963024200898462e-06, + "loss": 0.6872, + "step": 1095 + }, + { + "epoch": 0.8768, + "grad_norm": 0.4353209478547202, + "learning_rate": 7.861970681683051e-06, + "loss": 0.6773, + "step": 1096 + }, + { + "epoch": 0.8776, + "grad_norm": 0.443063602249416, + "learning_rate": 7.761536223092458e-06, + "loss": 0.6929, + "step": 1097 + }, + { + "epoch": 0.8784, + "grad_norm": 0.3672377957442898, + "learning_rate": 7.661721499929753e-06, + "loss": 0.6417, + "step": 1098 + }, + { + "epoch": 0.8792, + "grad_norm": 0.4741654482655923, + "learning_rate": 7.562527182833978e-06, + "loss": 0.7018, + "step": 1099 + }, + { + "epoch": 0.88, + "grad_norm": 0.35363508243119146, + "learning_rate": 7.463953938275858e-06, + "loss": 0.6645, + "step": 1100 + }, + { + "epoch": 0.8808, + "grad_norm": 0.43547070255930115, + "learning_rate": 7.366002428553153e-06, + "loss": 0.6793, + "step": 1101 + }, + { + "epoch": 0.8816, + "grad_norm": 0.4127577831340952, + "learning_rate": 7.2686733117863784e-06, + "loss": 0.663, + "step": 1102 + }, + { + "epoch": 0.8824, + "grad_norm": 0.3984495801609078, + "learning_rate": 7.171967241914224e-06, + "loss": 0.6869, + "step": 1103 + }, + { + "epoch": 0.8832, + "grad_norm": 0.4264269134271497, + "learning_rate": 7.07588486868922e-06, + "loss": 0.6845, + "step": 1104 + }, + { + "epoch": 0.884, + "grad_norm": 0.4475181742562311, + "learning_rate": 6.980426837673437e-06, + "loss": 0.7188, + "step": 1105 + }, + { + "epoch": 0.8848, + "grad_norm": 0.38976354506503263, + "learning_rate": 6.8855937902340576e-06, + "loss": 0.6712, + "step": 1106 + }, + { + "epoch": 0.8856, + "grad_norm": 0.4072056344974993, + "learning_rate": 6.791386363539065e-06, + "loss": 0.7189, + "step": 1107 + }, + { + "epoch": 0.8864, + "grad_norm": 0.3814969538983557, + "learning_rate": 6.6978051905530855e-06, + "loss": 0.622, + "step": 1108 + }, + { + "epoch": 0.8872, + "grad_norm": 0.39198374184146784, + "learning_rate": 6.604850900032955e-06, + "loss": 0.72, + "step": 1109 + }, + { + "epoch": 0.888, + "grad_norm": 0.4299031653523382, + "learning_rate": 6.512524116523633e-06, + "loss": 0.6712, + "step": 1110 + }, + { + "epoch": 0.8888, + "grad_norm": 0.42062112658601636, + "learning_rate": 6.420825460353974e-06, + "loss": 0.6634, + "step": 1111 + }, + { + "epoch": 0.8896, + "grad_norm": 0.381095188428428, + "learning_rate": 6.329755547632499e-06, + "loss": 0.6104, + "step": 1112 + }, + { + "epoch": 0.8904, + "grad_norm": 0.3740011730979217, + "learning_rate": 6.239314990243339e-06, + "loss": 0.6459, + "step": 1113 + }, + { + "epoch": 0.8912, + "grad_norm": 0.3482311112245457, + "learning_rate": 6.149504395842087e-06, + "loss": 0.5719, + "step": 1114 + }, + { + "epoch": 0.892, + "grad_norm": 0.34026593781578274, + "learning_rate": 6.0603243678516995e-06, + "loss": 0.5916, + "step": 1115 + }, + { + "epoch": 0.8928, + "grad_norm": 0.4443275218242843, + "learning_rate": 5.971775505458444e-06, + "loss": 0.6623, + "step": 1116 + }, + { + "epoch": 0.8936, + "grad_norm": 0.37590807547566096, + "learning_rate": 5.883858403607967e-06, + "loss": 0.6557, + "step": 1117 + }, + { + "epoch": 0.8944, + "grad_norm": 0.41442118569066855, + "learning_rate": 5.7965736530010916e-06, + "loss": 0.7623, + "step": 1118 + }, + { + "epoch": 0.8952, + "grad_norm": 0.35148898910324156, + "learning_rate": 5.7099218400900716e-06, + "loss": 0.6086, + "step": 1119 + }, + { + "epoch": 0.896, + "grad_norm": 0.41219269590085295, + "learning_rate": 5.623903547074549e-06, + "loss": 0.7074, + "step": 1120 + }, + { + "epoch": 0.8968, + "grad_norm": 0.38394476824780704, + "learning_rate": 5.538519351897575e-06, + "loss": 0.7202, + "step": 1121 + }, + { + "epoch": 0.8976, + "grad_norm": 0.4086093487566508, + "learning_rate": 5.453769828241872e-06, + "loss": 0.6803, + "step": 1122 + }, + { + "epoch": 0.8984, + "grad_norm": 0.40780618120587225, + "learning_rate": 5.369655545525909e-06, + "loss": 0.6717, + "step": 1123 + }, + { + "epoch": 0.8992, + "grad_norm": 0.40160835831526687, + "learning_rate": 5.286177068899989e-06, + "loss": 0.6519, + "step": 1124 + }, + { + "epoch": 0.9, + "grad_norm": 0.42618142278286036, + "learning_rate": 5.2033349592426335e-06, + "loss": 0.7201, + "step": 1125 + }, + { + "epoch": 0.9008, + "grad_norm": 0.4495467272165354, + "learning_rate": 5.121129773156663e-06, + "loss": 0.7003, + "step": 1126 + }, + { + "epoch": 0.9016, + "grad_norm": 0.40258287954211475, + "learning_rate": 5.039562062965508e-06, + "loss": 0.7064, + "step": 1127 + }, + { + "epoch": 0.9024, + "grad_norm": 0.38701138060079804, + "learning_rate": 4.95863237670956e-06, + "loss": 0.6509, + "step": 1128 + }, + { + "epoch": 0.9032, + "grad_norm": 0.3918323584972186, + "learning_rate": 4.87834125814235e-06, + "loss": 0.6932, + "step": 1129 + }, + { + "epoch": 0.904, + "grad_norm": 0.4267477243262357, + "learning_rate": 4.798689246727006e-06, + "loss": 0.7286, + "step": 1130 + }, + { + "epoch": 0.9048, + "grad_norm": 0.4042167728382733, + "learning_rate": 4.719676877632639e-06, + "loss": 0.6026, + "step": 1131 + }, + { + "epoch": 0.9056, + "grad_norm": 0.3800089340729763, + "learning_rate": 4.641304681730641e-06, + "loss": 0.6303, + "step": 1132 + }, + { + "epoch": 0.9064, + "grad_norm": 0.40746341645128786, + "learning_rate": 4.563573185591219e-06, + "loss": 0.616, + "step": 1133 + }, + { + "epoch": 0.9072, + "grad_norm": 0.45472749983902716, + "learning_rate": 4.486482911479839e-06, + "loss": 0.7022, + "step": 1134 + }, + { + "epoch": 0.908, + "grad_norm": 0.38580009315205044, + "learning_rate": 4.4100343773536225e-06, + "loss": 0.6165, + "step": 1135 + }, + { + "epoch": 0.9088, + "grad_norm": 0.36183809853024934, + "learning_rate": 4.3342280968580285e-06, + "loss": 0.6026, + "step": 1136 + }, + { + "epoch": 0.9096, + "grad_norm": 0.3973413421604641, + "learning_rate": 4.259064579323302e-06, + "loss": 0.6289, + "step": 1137 + }, + { + "epoch": 0.9104, + "grad_norm": 0.43386908181467376, + "learning_rate": 4.184544329761009e-06, + "loss": 0.6912, + "step": 1138 + }, + { + "epoch": 0.9112, + "grad_norm": 0.40332765590901476, + "learning_rate": 4.1106678488607495e-06, + "loss": 0.6324, + "step": 1139 + }, + { + "epoch": 0.912, + "grad_norm": 0.3287877487873551, + "learning_rate": 4.037435632986786e-06, + "loss": 0.5785, + "step": 1140 + }, + { + "epoch": 0.9128, + "grad_norm": 0.400587936119373, + "learning_rate": 3.964848174174541e-06, + "loss": 0.7204, + "step": 1141 + }, + { + "epoch": 0.9136, + "grad_norm": 0.37522794345615756, + "learning_rate": 3.892905960127546e-06, + "loss": 0.6462, + "step": 1142 + }, + { + "epoch": 0.9144, + "grad_norm": 0.3804765757964083, + "learning_rate": 3.821609474213983e-06, + "loss": 0.6166, + "step": 1143 + }, + { + "epoch": 0.9152, + "grad_norm": 0.34944340362908555, + "learning_rate": 3.750959195463466e-06, + "loss": 0.5801, + "step": 1144 + }, + { + "epoch": 0.916, + "grad_norm": 0.3725022787413, + "learning_rate": 3.6809555985639068e-06, + "loss": 0.5849, + "step": 1145 + }, + { + "epoch": 0.9168, + "grad_norm": 0.3834493715771417, + "learning_rate": 3.611599153858214e-06, + "loss": 0.632, + "step": 1146 + }, + { + "epoch": 0.9176, + "grad_norm": 0.40476102820663795, + "learning_rate": 3.5428903273411863e-06, + "loss": 0.6604, + "step": 1147 + }, + { + "epoch": 0.9184, + "grad_norm": 0.34598749177431276, + "learning_rate": 3.4748295806564356e-06, + "loss": 0.5964, + "step": 1148 + }, + { + "epoch": 0.9192, + "grad_norm": 0.37544762752433286, + "learning_rate": 3.40741737109318e-06, + "loss": 0.6738, + "step": 1149 + }, + { + "epoch": 0.92, + "grad_norm": 0.3688018050984644, + "learning_rate": 3.3406541515832003e-06, + "loss": 0.6182, + "step": 1150 + }, + { + "epoch": 0.9208, + "grad_norm": 0.3762250347026776, + "learning_rate": 3.2745403706978872e-06, + "loss": 0.6341, + "step": 1151 + }, + { + "epoch": 0.9216, + "grad_norm": 0.372296414482718, + "learning_rate": 3.209076472645112e-06, + "loss": 0.6501, + "step": 1152 + }, + { + "epoch": 0.9224, + "grad_norm": 0.42738576169972864, + "learning_rate": 3.1442628972662704e-06, + "loss": 0.67, + "step": 1153 + }, + { + "epoch": 0.9232, + "grad_norm": 0.40312706215282335, + "learning_rate": 3.0801000800333877e-06, + "loss": 0.6839, + "step": 1154 + }, + { + "epoch": 0.924, + "grad_norm": 0.36720059452943676, + "learning_rate": 3.0165884520461316e-06, + "loss": 0.6225, + "step": 1155 + }, + { + "epoch": 0.9248, + "grad_norm": 0.3868428302824951, + "learning_rate": 2.9537284400289355e-06, + "loss": 0.6697, + "step": 1156 + }, + { + "epoch": 0.9256, + "grad_norm": 0.33361508488679614, + "learning_rate": 2.8915204663281013e-06, + "loss": 0.6161, + "step": 1157 + }, + { + "epoch": 0.9264, + "grad_norm": 0.3709586419165963, + "learning_rate": 2.8299649489090475e-06, + "loss": 0.6492, + "step": 1158 + }, + { + "epoch": 0.9272, + "grad_norm": 0.40523771678934845, + "learning_rate": 2.7690623013533976e-06, + "loss": 0.6375, + "step": 1159 + }, + { + "epoch": 0.928, + "grad_norm": 0.40686652277172136, + "learning_rate": 2.708812932856253e-06, + "loss": 0.7814, + "step": 1160 + }, + { + "epoch": 0.9288, + "grad_norm": 0.4035953631570672, + "learning_rate": 2.649217248223468e-06, + "loss": 0.7262, + "step": 1161 + }, + { + "epoch": 0.9296, + "grad_norm": 0.38453505690308615, + "learning_rate": 2.590275647868867e-06, + "loss": 0.6721, + "step": 1162 + }, + { + "epoch": 0.9304, + "grad_norm": 0.4893722918508808, + "learning_rate": 2.5319885278115906e-06, + "loss": 0.6595, + "step": 1163 + }, + { + "epoch": 0.9312, + "grad_norm": 0.3796131533890088, + "learning_rate": 2.4743562796734622e-06, + "loss": 0.618, + "step": 1164 + }, + { + "epoch": 0.932, + "grad_norm": 0.389940702523096, + "learning_rate": 2.4173792906762804e-06, + "loss": 0.6717, + "step": 1165 + }, + { + "epoch": 0.9328, + "grad_norm": 0.3780420711864244, + "learning_rate": 2.3610579436393e-06, + "loss": 0.6411, + "step": 1166 + }, + { + "epoch": 0.9336, + "grad_norm": 0.3766267031435862, + "learning_rate": 2.3053926169765984e-06, + "loss": 0.6437, + "step": 1167 + }, + { + "epoch": 0.9344, + "grad_norm": 0.39133134402719205, + "learning_rate": 2.250383684694579e-06, + "loss": 0.6361, + "step": 1168 + }, + { + "epoch": 0.9352, + "grad_norm": 0.3674878182398129, + "learning_rate": 2.1960315163894075e-06, + "loss": 0.6219, + "step": 1169 + }, + { + "epoch": 0.936, + "grad_norm": 0.41221470805706845, + "learning_rate": 2.1423364772445887e-06, + "loss": 0.7125, + "step": 1170 + }, + { + "epoch": 0.9368, + "grad_norm": 0.42060309637275045, + "learning_rate": 2.0892989280284823e-06, + "loss": 0.7036, + "step": 1171 + }, + { + "epoch": 0.9376, + "grad_norm": 0.4314788318508879, + "learning_rate": 2.036919225091827e-06, + "loss": 0.7002, + "step": 1172 + }, + { + "epoch": 0.9384, + "grad_norm": 0.3687559261760369, + "learning_rate": 1.9851977203654835e-06, + "loss": 0.6284, + "step": 1173 + }, + { + "epoch": 0.9392, + "grad_norm": 0.40892865584466087, + "learning_rate": 1.9341347613579087e-06, + "loss": 0.7167, + "step": 1174 + }, + { + "epoch": 0.94, + "grad_norm": 0.3927527761081754, + "learning_rate": 1.8837306911529184e-06, + "loss": 0.6535, + "step": 1175 + }, + { + "epoch": 0.9408, + "grad_norm": 0.45343822891686525, + "learning_rate": 1.8339858484073935e-06, + "loss": 0.6567, + "step": 1176 + }, + { + "epoch": 0.9416, + "grad_norm": 0.3987423062295389, + "learning_rate": 1.7849005673489127e-06, + "loss": 0.7027, + "step": 1177 + }, + { + "epoch": 0.9424, + "grad_norm": 0.34101337590193037, + "learning_rate": 1.7364751777736332e-06, + "loss": 0.6058, + "step": 1178 + }, + { + "epoch": 0.9432, + "grad_norm": 0.37467888023325496, + "learning_rate": 1.6887100050439587e-06, + "loss": 0.6611, + "step": 1179 + }, + { + "epoch": 0.944, + "grad_norm": 0.4279880666384542, + "learning_rate": 1.6416053700863964e-06, + "loss": 0.6429, + "step": 1180 + }, + { + "epoch": 0.9448, + "grad_norm": 0.4008990653373971, + "learning_rate": 1.595161589389449e-06, + "loss": 0.6791, + "step": 1181 + }, + { + "epoch": 0.9456, + "grad_norm": 0.4040976062647741, + "learning_rate": 1.5493789750014031e-06, + "loss": 0.6614, + "step": 1182 + }, + { + "epoch": 0.9464, + "grad_norm": 0.40009160645320296, + "learning_rate": 1.5042578345283108e-06, + "loss": 0.6184, + "step": 1183 + }, + { + "epoch": 0.9472, + "grad_norm": 0.3584149652915633, + "learning_rate": 1.459798471131868e-06, + "loss": 0.6534, + "step": 1184 + }, + { + "epoch": 0.948, + "grad_norm": 0.42308879251991566, + "learning_rate": 1.4160011835273934e-06, + "loss": 0.6429, + "step": 1185 + }, + { + "epoch": 0.9488, + "grad_norm": 0.3530676336565875, + "learning_rate": 1.3728662659818204e-06, + "loss": 0.5875, + "step": 1186 + }, + { + "epoch": 0.9496, + "grad_norm": 0.45952337480543376, + "learning_rate": 1.3303940083117527e-06, + "loss": 0.7039, + "step": 1187 + }, + { + "epoch": 0.9504, + "grad_norm": 0.4000313956953184, + "learning_rate": 1.2885846958814673e-06, + "loss": 0.6528, + "step": 1188 + }, + { + "epoch": 0.9512, + "grad_norm": 0.3390785776944779, + "learning_rate": 1.2474386096010039e-06, + "loss": 0.5444, + "step": 1189 + }, + { + "epoch": 0.952, + "grad_norm": 0.4164879554122682, + "learning_rate": 1.2069560259243328e-06, + "loss": 0.7395, + "step": 1190 + }, + { + "epoch": 0.9528, + "grad_norm": 0.4018332369116052, + "learning_rate": 1.1671372168474138e-06, + "loss": 0.7091, + "step": 1191 + }, + { + "epoch": 0.9536, + "grad_norm": 0.401606509590131, + "learning_rate": 1.1279824499064396e-06, + "loss": 0.7054, + "step": 1192 + }, + { + "epoch": 0.9544, + "grad_norm": 0.4385767023072862, + "learning_rate": 1.089491988176017e-06, + "loss": 0.6476, + "step": 1193 + }, + { + "epoch": 0.9552, + "grad_norm": 0.3793691830981734, + "learning_rate": 1.0516660902673448e-06, + "loss": 0.6676, + "step": 1194 + }, + { + "epoch": 0.956, + "grad_norm": 0.3992745703002846, + "learning_rate": 1.014505010326583e-06, + "loss": 0.6818, + "step": 1195 + }, + { + "epoch": 0.9568, + "grad_norm": 0.38130181064017943, + "learning_rate": 9.780089980330642e-07, + "loss": 0.6855, + "step": 1196 + }, + { + "epoch": 0.9576, + "grad_norm": 0.31922457047948377, + "learning_rate": 9.421782985976068e-07, + "loss": 0.5842, + "step": 1197 + }, + { + "epoch": 0.9584, + "grad_norm": 0.4284172920959308, + "learning_rate": 9.070131527609604e-07, + "loss": 0.6487, + "step": 1198 + }, + { + "epoch": 0.9592, + "grad_norm": 0.3492126493938296, + "learning_rate": 8.725137967920738e-07, + "loss": 0.6214, + "step": 1199 + }, + { + "epoch": 0.96, + "grad_norm": 0.40249258176511293, + "learning_rate": 8.386804624865851e-07, + "loss": 0.7139, + "step": 1200 + }, + { + "epoch": 0.9608, + "grad_norm": 0.40261627692051555, + "learning_rate": 8.055133771652345e-07, + "loss": 0.6671, + "step": 1201 + }, + { + "epoch": 0.9616, + "grad_norm": 0.37974840778161195, + "learning_rate": 7.730127636723539e-07, + "loss": 0.6705, + "step": 1202 + }, + { + "epoch": 0.9624, + "grad_norm": 0.4246235923407765, + "learning_rate": 7.411788403743237e-07, + "loss": 0.5937, + "step": 1203 + }, + { + "epoch": 0.9632, + "grad_norm": 0.37635799514836554, + "learning_rate": 7.100118211581852e-07, + "loss": 0.6311, + "step": 1204 + }, + { + "epoch": 0.964, + "grad_norm": 0.3853814024029868, + "learning_rate": 6.7951191543012e-07, + "loss": 0.6393, + "step": 1205 + }, + { + "epoch": 0.9648, + "grad_norm": 0.41252531259882763, + "learning_rate": 6.496793281141056e-07, + "loss": 0.6276, + "step": 1206 + }, + { + "epoch": 0.9656, + "grad_norm": 0.45897883538473566, + "learning_rate": 6.205142596505176e-07, + "loss": 0.7155, + "step": 1207 + }, + { + "epoch": 0.9664, + "grad_norm": 0.4379137702549422, + "learning_rate": 5.920169059947411e-07, + "loss": 0.7196, + "step": 1208 + }, + { + "epoch": 0.9672, + "grad_norm": 0.34869941274882527, + "learning_rate": 5.64187458615939e-07, + "loss": 0.6355, + "step": 1209 + }, + { + "epoch": 0.968, + "grad_norm": 0.38492263243996416, + "learning_rate": 5.370261044956971e-07, + "loss": 0.7184, + "step": 1210 + }, + { + "epoch": 0.9688, + "grad_norm": 0.3880733678371908, + "learning_rate": 5.105330261267916e-07, + "loss": 0.6957, + "step": 1211 + }, + { + "epoch": 0.9696, + "grad_norm": 0.405049813155491, + "learning_rate": 4.847084015119574e-07, + "loss": 0.7065, + "step": 1212 + }, + { + "epoch": 0.9704, + "grad_norm": 0.44029289611675254, + "learning_rate": 4.5955240416271084e-07, + "loss": 0.6582, + "step": 1213 + }, + { + "epoch": 0.9712, + "grad_norm": 0.44664165480840895, + "learning_rate": 4.3506520309813947e-07, + "loss": 0.6719, + "step": 1214 + }, + { + "epoch": 0.972, + "grad_norm": 0.35226701230834057, + "learning_rate": 4.112469628438365e-07, + "loss": 0.6235, + "step": 1215 + }, + { + "epoch": 0.9728, + "grad_norm": 0.3593587552529369, + "learning_rate": 3.8809784343072366e-07, + "loss": 0.581, + "step": 1216 + }, + { + "epoch": 0.9736, + "grad_norm": 0.4087204707252798, + "learning_rate": 3.6561800039403016e-07, + "loss": 0.6853, + "step": 1217 + }, + { + "epoch": 0.9744, + "grad_norm": 0.3891805891204021, + "learning_rate": 3.4380758477219333e-07, + "loss": 0.6547, + "step": 1218 + }, + { + "epoch": 0.9752, + "grad_norm": 0.41121914812015825, + "learning_rate": 3.2266674310589273e-07, + "loss": 0.7006, + "step": 1219 + }, + { + "epoch": 0.976, + "grad_norm": 0.3629778752294901, + "learning_rate": 3.0219561743707326e-07, + "loss": 0.6327, + "step": 1220 + }, + { + "epoch": 0.9768, + "grad_norm": 0.4013955562090015, + "learning_rate": 2.8239434530792365e-07, + "loss": 0.6526, + "step": 1221 + }, + { + "epoch": 0.9776, + "grad_norm": 0.4013227646872803, + "learning_rate": 2.6326305976001055e-07, + "loss": 0.6855, + "step": 1222 + }, + { + "epoch": 0.9784, + "grad_norm": 0.5205466487643553, + "learning_rate": 2.448018893333681e-07, + "loss": 0.6296, + "step": 1223 + }, + { + "epoch": 0.9792, + "grad_norm": 0.44612035490967855, + "learning_rate": 2.2701095806565432e-07, + "loss": 0.6549, + "step": 1224 + }, + { + "epoch": 0.98, + "grad_norm": 0.42556267254277397, + "learning_rate": 2.098903854912515e-07, + "loss": 0.6728, + "step": 1225 + }, + { + "epoch": 0.9808, + "grad_norm": 0.3732850799010509, + "learning_rate": 1.9344028664056713e-07, + "loss": 0.6635, + "step": 1226 + }, + { + "epoch": 0.9816, + "grad_norm": 0.3907179778195809, + "learning_rate": 1.7766077203915655e-07, + "loss": 0.6396, + "step": 1227 + }, + { + "epoch": 0.9824, + "grad_norm": 0.38192502548447554, + "learning_rate": 1.6255194770704586e-07, + "loss": 0.6523, + "step": 1228 + }, + { + "epoch": 0.9832, + "grad_norm": 0.3682024172847496, + "learning_rate": 1.481139151579991e-07, + "loss": 0.629, + "step": 1229 + }, + { + "epoch": 0.984, + "grad_norm": 0.3886285664402178, + "learning_rate": 1.3434677139885222e-07, + "loss": 0.6777, + "step": 1230 + }, + { + "epoch": 0.9848, + "grad_norm": 0.37918581250956984, + "learning_rate": 1.2125060892881346e-07, + "loss": 0.6488, + "step": 1231 + }, + { + "epoch": 0.9856, + "grad_norm": 0.3257783036245148, + "learning_rate": 1.0882551573891953e-07, + "loss": 0.5451, + "step": 1232 + }, + { + "epoch": 0.9864, + "grad_norm": 0.41977397713779263, + "learning_rate": 9.707157531134713e-08, + "loss": 0.6171, + "step": 1233 + }, + { + "epoch": 0.9872, + "grad_norm": 0.41724976693934324, + "learning_rate": 8.598886661895788e-08, + "loss": 0.6751, + "step": 1234 + }, + { + "epoch": 0.988, + "grad_norm": 0.4533361462180586, + "learning_rate": 7.557746412468758e-08, + "loss": 0.6946, + "step": 1235 + }, + { + "epoch": 0.9888, + "grad_norm": 0.38325422753154753, + "learning_rate": 6.583743778106887e-08, + "loss": 0.6017, + "step": 1236 + }, + { + "epoch": 0.9896, + "grad_norm": 0.3421681867758102, + "learning_rate": 5.6768853029787184e-08, + "loss": 0.6061, + "step": 1237 + }, + { + "epoch": 0.9904, + "grad_norm": 0.39514516668517125, + "learning_rate": 4.837177080119215e-08, + "loss": 0.6764, + "step": 1238 + }, + { + "epoch": 0.9912, + "grad_norm": 0.35344840546408507, + "learning_rate": 4.064624751394242e-08, + "loss": 0.6253, + "step": 1239 + }, + { + "epoch": 0.992, + "grad_norm": 0.40727430718765384, + "learning_rate": 3.359233507459481e-08, + "loss": 0.6221, + "step": 1240 + }, + { + "epoch": 0.9928, + "grad_norm": 0.4688219114010321, + "learning_rate": 2.7210080877237976e-08, + "loss": 0.7087, + "step": 1241 + }, + { + "epoch": 0.9936, + "grad_norm": 0.375242685302969, + "learning_rate": 2.1499527803214846e-08, + "loss": 0.7089, + "step": 1242 + }, + { + "epoch": 0.9944, + "grad_norm": 0.3607480887273893, + "learning_rate": 1.646071422083395e-08, + "loss": 0.5905, + "step": 1243 + }, + { + "epoch": 0.9952, + "grad_norm": 0.46496149960965294, + "learning_rate": 1.209367398504746e-08, + "loss": 0.6538, + "step": 1244 + }, + { + "epoch": 0.996, + "grad_norm": 0.5837228889124457, + "learning_rate": 8.398436437317969e-09, + "loss": 0.6289, + "step": 1245 + }, + { + "epoch": 0.9968, + "grad_norm": 0.42242509832078073, + "learning_rate": 5.375026405352035e-09, + "loss": 0.694, + "step": 1246 + }, + { + "epoch": 0.9976, + "grad_norm": 0.39721952854255504, + "learning_rate": 3.023464202944748e-09, + "loss": 0.706, + "step": 1247 + }, + { + "epoch": 0.9984, + "grad_norm": 0.3939331671218691, + "learning_rate": 1.3437656298687097e-09, + "loss": 0.6699, + "step": 1248 + }, + { + "epoch": 0.9992, + "grad_norm": 0.47584973128656116, + "learning_rate": 3.3594197175190745e-10, + "loss": 0.6701, + "step": 1249 + }, + { + "epoch": 1.0, + "grad_norm": 0.43754064618906224, + "learning_rate": 0.0, + "loss": 0.6158, + "step": 1250 + }, + { + "epoch": 1.0, + "step": 1250, + "total_flos": 1127522763669504.0, + "train_loss": 0.7323800681114196, + "train_runtime": 19553.3332, + "train_samples_per_second": 1.023, + "train_steps_per_second": 0.064 + } + ], + "logging_steps": 1.0, + "max_steps": 1250, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1127522763669504.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_2_epochs_1_GA_2_lora/README.md b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_2_epochs_1_GA_2_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_2_epochs_1_GA_2_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_2_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_2_epochs_1_GA_2_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2b8a71b16dd6a1ff6fe91480e5f7898dcadf8b92 --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_2_epochs_1_GA_2_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "up_proj", + "down_proj", + "v_proj", + "gate_proj", + "q_proj", + "o_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b390b5ad1061169f9a00619f9abaf18c1c9279e4 --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f971eca109ad6061df4d9b33538c61d8335775627482a094aab237718af7919 +size 671150064 diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_2_epochs_1_GA_2_lora/config.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_2_epochs_1_GA_2_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_2_epochs_1_GA_2_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..e10de482eb63139299526970c6edc3d9db9614d9 --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb1f04035eeab132dae7260e49c61a1c263972fd2515a86d7ab2d46b030ee2f4 +size 918507402 diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_2_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_2_epochs_1_GA_2_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..97c2e6281b4ceef0b9c6f6ffbd0c87030236670d --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_2_epochs_1_GA_2_lora/trainer_state.json @@ -0,0 +1,8792 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 1250, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0008, + "grad_norm": 0.740625375259329, + "learning_rate": 5.263157894736842e-06, + "loss": 1.2251, + "step": 1 + }, + { + "epoch": 0.0016, + "grad_norm": 0.9552990309915874, + "learning_rate": 1.0526315789473684e-05, + "loss": 1.4193, + "step": 2 + }, + { + "epoch": 0.0024, + "grad_norm": 0.9569001483724141, + "learning_rate": 1.5789473684210526e-05, + "loss": 1.397, + "step": 3 + }, + { + "epoch": 0.0032, + "grad_norm": 0.7823732381672138, + "learning_rate": 2.105263157894737e-05, + "loss": 1.3412, + "step": 4 + }, + { + "epoch": 0.004, + "grad_norm": 0.7645083869407853, + "learning_rate": 2.6315789473684212e-05, + "loss": 1.3502, + "step": 5 + }, + { + "epoch": 0.0048, + "grad_norm": 0.6154109919230503, + "learning_rate": 3.157894736842105e-05, + "loss": 1.0955, + "step": 6 + }, + { + "epoch": 0.0056, + "grad_norm": 0.8123309890741669, + "learning_rate": 3.6842105263157895e-05, + "loss": 1.2582, + "step": 7 + }, + { + "epoch": 0.0064, + "grad_norm": 0.6726504133934107, + "learning_rate": 4.210526315789474e-05, + "loss": 1.0359, + "step": 8 + }, + { + "epoch": 0.0072, + "grad_norm": 0.6791240317261311, + "learning_rate": 4.736842105263158e-05, + "loss": 1.0694, + "step": 9 + }, + { + "epoch": 0.008, + "grad_norm": 0.6903249153403939, + "learning_rate": 5.2631578947368424e-05, + "loss": 0.9503, + "step": 10 + }, + { + "epoch": 0.0088, + "grad_norm": 0.7348336374289809, + "learning_rate": 5.789473684210527e-05, + "loss": 1.0206, + "step": 11 + }, + { + "epoch": 0.0096, + "grad_norm": 0.7849115277282124, + "learning_rate": 6.31578947368421e-05, + "loss": 1.0129, + "step": 12 + }, + { + "epoch": 0.0104, + "grad_norm": 0.5834940596241346, + "learning_rate": 6.842105263157895e-05, + "loss": 0.9385, + "step": 13 + }, + { + "epoch": 0.0112, + "grad_norm": 0.6080288380250879, + "learning_rate": 7.368421052631579e-05, + "loss": 0.9041, + "step": 14 + }, + { + "epoch": 0.012, + "grad_norm": 0.6289182782111001, + "learning_rate": 7.894736842105263e-05, + "loss": 0.95, + "step": 15 + }, + { + "epoch": 0.0128, + "grad_norm": 0.5246426592790335, + "learning_rate": 8.421052631578948e-05, + "loss": 0.8953, + "step": 16 + }, + { + "epoch": 0.0136, + "grad_norm": 0.571697601687305, + "learning_rate": 8.947368421052632e-05, + "loss": 0.9473, + "step": 17 + }, + { + "epoch": 0.0144, + "grad_norm": 0.4695061930640392, + "learning_rate": 9.473684210526316e-05, + "loss": 0.9375, + "step": 18 + }, + { + "epoch": 0.0152, + "grad_norm": 0.49104776110117504, + "learning_rate": 0.0001, + "loss": 0.8592, + "step": 19 + }, + { + "epoch": 0.016, + "grad_norm": 0.557235516915619, + "learning_rate": 0.00010526315789473685, + "loss": 0.9125, + "step": 20 + }, + { + "epoch": 0.0168, + "grad_norm": 0.4773950173485819, + "learning_rate": 0.0001105263157894737, + "loss": 0.861, + "step": 21 + }, + { + "epoch": 0.0176, + "grad_norm": 0.4647782924155993, + "learning_rate": 0.00011578947368421053, + "loss": 0.8597, + "step": 22 + }, + { + "epoch": 0.0184, + "grad_norm": 0.5478978441950723, + "learning_rate": 0.00012105263157894738, + "loss": 0.9283, + "step": 23 + }, + { + "epoch": 0.0192, + "grad_norm": 0.5763142218308771, + "learning_rate": 0.0001263157894736842, + "loss": 0.9972, + "step": 24 + }, + { + "epoch": 0.02, + "grad_norm": 0.5420229317803669, + "learning_rate": 0.00013157894736842108, + "loss": 0.9065, + "step": 25 + }, + { + "epoch": 0.0208, + "grad_norm": 0.5407268561510891, + "learning_rate": 0.0001368421052631579, + "loss": 0.8659, + "step": 26 + }, + { + "epoch": 0.0216, + "grad_norm": 0.488056013662824, + "learning_rate": 0.00014210526315789474, + "loss": 0.8485, + "step": 27 + }, + { + "epoch": 0.0224, + "grad_norm": 0.476656380319156, + "learning_rate": 0.00014736842105263158, + "loss": 0.8913, + "step": 28 + }, + { + "epoch": 0.0232, + "grad_norm": 0.5546409263632991, + "learning_rate": 0.00015263157894736845, + "loss": 0.9313, + "step": 29 + }, + { + "epoch": 0.024, + "grad_norm": 0.540416485945182, + "learning_rate": 0.00015789473684210527, + "loss": 0.9657, + "step": 30 + }, + { + "epoch": 0.0248, + "grad_norm": 0.43395697982419223, + "learning_rate": 0.0001631578947368421, + "loss": 0.8576, + "step": 31 + }, + { + "epoch": 0.0256, + "grad_norm": 0.46963036387995893, + "learning_rate": 0.00016842105263157895, + "loss": 0.8423, + "step": 32 + }, + { + "epoch": 0.0264, + "grad_norm": 0.45539400239698086, + "learning_rate": 0.0001736842105263158, + "loss": 0.798, + "step": 33 + }, + { + "epoch": 0.0272, + "grad_norm": 0.4606712595303784, + "learning_rate": 0.00017894736842105264, + "loss": 0.8476, + "step": 34 + }, + { + "epoch": 0.028, + "grad_norm": 0.5017330108056747, + "learning_rate": 0.00018421052631578948, + "loss": 0.8688, + "step": 35 + }, + { + "epoch": 0.0288, + "grad_norm": 0.4274065338288124, + "learning_rate": 0.00018947368421052632, + "loss": 0.8116, + "step": 36 + }, + { + "epoch": 0.0296, + "grad_norm": 0.47603841684019044, + "learning_rate": 0.00019473684210526317, + "loss": 0.8709, + "step": 37 + }, + { + "epoch": 0.0304, + "grad_norm": 0.46980138691953705, + "learning_rate": 0.0002, + "loss": 0.8104, + "step": 38 + }, + { + "epoch": 0.0312, + "grad_norm": 0.4445247338504254, + "learning_rate": 0.00019999966405802826, + "loss": 0.8094, + "step": 39 + }, + { + "epoch": 0.032, + "grad_norm": 0.5242809775621773, + "learning_rate": 0.00019999865623437013, + "loss": 0.9212, + "step": 40 + }, + { + "epoch": 0.0328, + "grad_norm": 0.4651424134939184, + "learning_rate": 0.00019999697653579705, + "loss": 0.8692, + "step": 41 + }, + { + "epoch": 0.0336, + "grad_norm": 0.449789261867823, + "learning_rate": 0.00019999462497359466, + "loss": 0.834, + "step": 42 + }, + { + "epoch": 0.0344, + "grad_norm": 0.5203812836903858, + "learning_rate": 0.0001999916015635627, + "loss": 0.913, + "step": 43 + }, + { + "epoch": 0.0352, + "grad_norm": 0.4663304312307797, + "learning_rate": 0.00019998790632601496, + "loss": 0.8124, + "step": 44 + }, + { + "epoch": 0.036, + "grad_norm": 0.4868169772295852, + "learning_rate": 0.00019998353928577919, + "loss": 0.8797, + "step": 45 + }, + { + "epoch": 0.0368, + "grad_norm": 0.3790460899489488, + "learning_rate": 0.0001999785004721968, + "loss": 0.7733, + "step": 46 + }, + { + "epoch": 0.0376, + "grad_norm": 0.45106360736919526, + "learning_rate": 0.0001999727899191228, + "loss": 0.887, + "step": 47 + }, + { + "epoch": 0.0384, + "grad_norm": 0.408512272799759, + "learning_rate": 0.00019996640766492543, + "loss": 0.7955, + "step": 48 + }, + { + "epoch": 0.0392, + "grad_norm": 0.4292310323355621, + "learning_rate": 0.00019995935375248606, + "loss": 0.796, + "step": 49 + }, + { + "epoch": 0.04, + "grad_norm": 0.5760647088894424, + "learning_rate": 0.00019995162822919883, + "loss": 0.858, + "step": 50 + }, + { + "epoch": 0.0408, + "grad_norm": 0.46819810055070243, + "learning_rate": 0.00019994323114697022, + "loss": 0.8403, + "step": 51 + }, + { + "epoch": 0.0416, + "grad_norm": 0.5057652506692215, + "learning_rate": 0.00019993416256221895, + "loss": 0.8328, + "step": 52 + }, + { + "epoch": 0.0424, + "grad_norm": 0.5246022695539894, + "learning_rate": 0.0001999244225358753, + "loss": 0.8986, + "step": 53 + }, + { + "epoch": 0.0432, + "grad_norm": 0.5058257344249311, + "learning_rate": 0.00019991401113338104, + "loss": 0.8703, + "step": 54 + }, + { + "epoch": 0.044, + "grad_norm": 0.4501909459351609, + "learning_rate": 0.00019990292842468868, + "loss": 0.8701, + "step": 55 + }, + { + "epoch": 0.0448, + "grad_norm": 0.46649398621078, + "learning_rate": 0.00019989117448426108, + "loss": 0.8789, + "step": 56 + }, + { + "epoch": 0.0456, + "grad_norm": 0.3865589497205679, + "learning_rate": 0.0001998787493910712, + "loss": 0.762, + "step": 57 + }, + { + "epoch": 0.0464, + "grad_norm": 0.47464467097697066, + "learning_rate": 0.00019986565322860115, + "loss": 0.932, + "step": 58 + }, + { + "epoch": 0.0472, + "grad_norm": 0.4014322315447128, + "learning_rate": 0.000199851886084842, + "loss": 0.7688, + "step": 59 + }, + { + "epoch": 0.048, + "grad_norm": 0.503124900204115, + "learning_rate": 0.00019983744805229296, + "loss": 0.8828, + "step": 60 + }, + { + "epoch": 0.0488, + "grad_norm": 0.4408221447128362, + "learning_rate": 0.00019982233922796085, + "loss": 0.8346, + "step": 61 + }, + { + "epoch": 0.0496, + "grad_norm": 0.5167445238638355, + "learning_rate": 0.00019980655971335945, + "loss": 0.9163, + "step": 62 + }, + { + "epoch": 0.0504, + "grad_norm": 0.4685207059609607, + "learning_rate": 0.00019979010961450878, + "loss": 0.8311, + "step": 63 + }, + { + "epoch": 0.0512, + "grad_norm": 0.40331836132027354, + "learning_rate": 0.00019977298904193437, + "loss": 0.7652, + "step": 64 + }, + { + "epoch": 0.052, + "grad_norm": 0.4697315346699755, + "learning_rate": 0.00019975519811066663, + "loss": 0.9228, + "step": 65 + }, + { + "epoch": 0.0528, + "grad_norm": 0.45021848631012573, + "learning_rate": 0.00019973673694024, + "loss": 0.8314, + "step": 66 + }, + { + "epoch": 0.0536, + "grad_norm": 0.5515640221319575, + "learning_rate": 0.0001997176056546921, + "loss": 0.8923, + "step": 67 + }, + { + "epoch": 0.0544, + "grad_norm": 0.4826389903930987, + "learning_rate": 0.00019969780438256293, + "loss": 0.8487, + "step": 68 + }, + { + "epoch": 0.0552, + "grad_norm": 0.510083073111606, + "learning_rate": 0.0001996773332568941, + "loss": 0.8928, + "step": 69 + }, + { + "epoch": 0.056, + "grad_norm": 0.43786463946699905, + "learning_rate": 0.0001996561924152278, + "loss": 0.8515, + "step": 70 + }, + { + "epoch": 0.0568, + "grad_norm": 0.470892478542731, + "learning_rate": 0.00019963438199960599, + "loss": 0.9231, + "step": 71 + }, + { + "epoch": 0.0576, + "grad_norm": 0.5640542733886403, + "learning_rate": 0.0001996119021565693, + "loss": 0.8723, + "step": 72 + }, + { + "epoch": 0.0584, + "grad_norm": 0.4371591858764352, + "learning_rate": 0.00019958875303715615, + "loss": 0.889, + "step": 73 + }, + { + "epoch": 0.0592, + "grad_norm": 0.41219541736099313, + "learning_rate": 0.0001995649347969019, + "loss": 0.7701, + "step": 74 + }, + { + "epoch": 0.06, + "grad_norm": 0.4378002235683173, + "learning_rate": 0.0001995404475958373, + "loss": 0.8083, + "step": 75 + }, + { + "epoch": 0.0608, + "grad_norm": 0.4882060227684013, + "learning_rate": 0.00019951529159848805, + "loss": 0.8964, + "step": 76 + }, + { + "epoch": 0.0616, + "grad_norm": 0.4418048035941677, + "learning_rate": 0.0001994894669738732, + "loss": 0.8203, + "step": 77 + }, + { + "epoch": 0.0624, + "grad_norm": 0.4252195168566824, + "learning_rate": 0.00019946297389550433, + "loss": 0.7761, + "step": 78 + }, + { + "epoch": 0.0632, + "grad_norm": 0.41113673138492074, + "learning_rate": 0.0001994358125413841, + "loss": 0.7589, + "step": 79 + }, + { + "epoch": 0.064, + "grad_norm": 0.43688240390590394, + "learning_rate": 0.00019940798309400526, + "loss": 0.813, + "step": 80 + }, + { + "epoch": 0.0648, + "grad_norm": 0.48937772744416697, + "learning_rate": 0.0001993794857403495, + "loss": 0.9127, + "step": 81 + }, + { + "epoch": 0.0656, + "grad_norm": 0.48790047206375037, + "learning_rate": 0.0001993503206718859, + "loss": 0.8357, + "step": 82 + }, + { + "epoch": 0.0664, + "grad_norm": 0.44384417196333376, + "learning_rate": 0.0001993204880845699, + "loss": 0.7833, + "step": 83 + }, + { + "epoch": 0.0672, + "grad_norm": 0.4707494051494073, + "learning_rate": 0.00019928998817884182, + "loss": 0.8338, + "step": 84 + }, + { + "epoch": 0.068, + "grad_norm": 0.4104480033958369, + "learning_rate": 0.00019925882115962568, + "loss": 0.7625, + "step": 85 + }, + { + "epoch": 0.0688, + "grad_norm": 0.4435412737512262, + "learning_rate": 0.00019922698723632767, + "loss": 0.7803, + "step": 86 + }, + { + "epoch": 0.0696, + "grad_norm": 0.45424275991848656, + "learning_rate": 0.00019919448662283478, + "loss": 0.8729, + "step": 87 + }, + { + "epoch": 0.0704, + "grad_norm": 0.4042880429462926, + "learning_rate": 0.00019916131953751342, + "loss": 0.7282, + "step": 88 + }, + { + "epoch": 0.0712, + "grad_norm": 0.42099845264428704, + "learning_rate": 0.00019912748620320794, + "loss": 0.7957, + "step": 89 + }, + { + "epoch": 0.072, + "grad_norm": 0.5179888598062934, + "learning_rate": 0.00019909298684723904, + "loss": 0.8904, + "step": 90 + }, + { + "epoch": 0.0728, + "grad_norm": 0.41315649406651883, + "learning_rate": 0.00019905782170140238, + "loss": 0.8141, + "step": 91 + }, + { + "epoch": 0.0736, + "grad_norm": 0.5039344170883732, + "learning_rate": 0.00019902199100196697, + "loss": 0.9011, + "step": 92 + }, + { + "epoch": 0.0744, + "grad_norm": 0.428932906748187, + "learning_rate": 0.00019898549498967343, + "loss": 0.8359, + "step": 93 + }, + { + "epoch": 0.0752, + "grad_norm": 0.3918990139415261, + "learning_rate": 0.00019894833390973266, + "loss": 0.7698, + "step": 94 + }, + { + "epoch": 0.076, + "grad_norm": 0.4867307263306567, + "learning_rate": 0.000198910508011824, + "loss": 0.8894, + "step": 95 + }, + { + "epoch": 0.0768, + "grad_norm": 0.3849900373937041, + "learning_rate": 0.00019887201755009357, + "loss": 0.7461, + "step": 96 + }, + { + "epoch": 0.0776, + "grad_norm": 0.44339301042600654, + "learning_rate": 0.00019883286278315262, + "loss": 0.8389, + "step": 97 + }, + { + "epoch": 0.0784, + "grad_norm": 0.4403051271804401, + "learning_rate": 0.0001987930439740757, + "loss": 0.873, + "step": 98 + }, + { + "epoch": 0.0792, + "grad_norm": 0.3932316537794687, + "learning_rate": 0.00019875256139039902, + "loss": 0.7839, + "step": 99 + }, + { + "epoch": 0.08, + "grad_norm": 0.5150303092373243, + "learning_rate": 0.00019871141530411853, + "loss": 0.9516, + "step": 100 + }, + { + "epoch": 0.0808, + "grad_norm": 0.42560379190353087, + "learning_rate": 0.00019866960599168826, + "loss": 0.823, + "step": 101 + }, + { + "epoch": 0.0816, + "grad_norm": 0.4733956375036902, + "learning_rate": 0.0001986271337340182, + "loss": 0.8851, + "step": 102 + }, + { + "epoch": 0.0824, + "grad_norm": 0.41629507697959817, + "learning_rate": 0.0001985839988164726, + "loss": 0.772, + "step": 103 + }, + { + "epoch": 0.0832, + "grad_norm": 0.4488922942457398, + "learning_rate": 0.00019854020152886814, + "loss": 0.8442, + "step": 104 + }, + { + "epoch": 0.084, + "grad_norm": 0.4207918504223135, + "learning_rate": 0.00019849574216547171, + "loss": 0.8452, + "step": 105 + }, + { + "epoch": 0.0848, + "grad_norm": 0.3795656815181342, + "learning_rate": 0.0001984506210249986, + "loss": 0.7297, + "step": 106 + }, + { + "epoch": 0.0856, + "grad_norm": 0.47907397510258765, + "learning_rate": 0.00019840483841061058, + "loss": 0.8072, + "step": 107 + }, + { + "epoch": 0.0864, + "grad_norm": 0.4142080264062873, + "learning_rate": 0.00019835839462991361, + "loss": 0.8323, + "step": 108 + }, + { + "epoch": 0.0872, + "grad_norm": 0.43349213106384954, + "learning_rate": 0.00019831128999495606, + "loss": 0.8212, + "step": 109 + }, + { + "epoch": 0.088, + "grad_norm": 0.4670008337658961, + "learning_rate": 0.00019826352482222638, + "loss": 0.8888, + "step": 110 + }, + { + "epoch": 0.0888, + "grad_norm": 0.40100642981445356, + "learning_rate": 0.0001982150994326511, + "loss": 0.8043, + "step": 111 + }, + { + "epoch": 0.0896, + "grad_norm": 0.40081568877954143, + "learning_rate": 0.00019816601415159263, + "loss": 0.7262, + "step": 112 + }, + { + "epoch": 0.0904, + "grad_norm": 0.47132320831200863, + "learning_rate": 0.0001981162693088471, + "loss": 0.8547, + "step": 113 + }, + { + "epoch": 0.0912, + "grad_norm": 0.43664227626852314, + "learning_rate": 0.0001980658652386421, + "loss": 0.8621, + "step": 114 + }, + { + "epoch": 0.092, + "grad_norm": 0.42978129437132245, + "learning_rate": 0.0001980148022796345, + "loss": 0.8551, + "step": 115 + }, + { + "epoch": 0.0928, + "grad_norm": 0.43994065392023857, + "learning_rate": 0.00019796308077490817, + "loss": 0.7863, + "step": 116 + }, + { + "epoch": 0.0936, + "grad_norm": 0.41214536833658816, + "learning_rate": 0.00019791070107197153, + "loss": 0.8076, + "step": 117 + }, + { + "epoch": 0.0944, + "grad_norm": 0.5037547666595701, + "learning_rate": 0.00019785766352275542, + "loss": 0.7521, + "step": 118 + }, + { + "epoch": 0.0952, + "grad_norm": 0.4028163450562548, + "learning_rate": 0.0001978039684836106, + "loss": 0.7186, + "step": 119 + }, + { + "epoch": 0.096, + "grad_norm": 0.4341756973283765, + "learning_rate": 0.00019774961631530545, + "loss": 0.7786, + "step": 120 + }, + { + "epoch": 0.0968, + "grad_norm": 0.46218989214333206, + "learning_rate": 0.0001976946073830234, + "loss": 0.8423, + "step": 121 + }, + { + "epoch": 0.0976, + "grad_norm": 0.43517248933936953, + "learning_rate": 0.00019763894205636072, + "loss": 0.7408, + "step": 122 + }, + { + "epoch": 0.0984, + "grad_norm": 0.44795165042482726, + "learning_rate": 0.00019758262070932375, + "loss": 0.7408, + "step": 123 + }, + { + "epoch": 0.0992, + "grad_norm": 0.4545616865398823, + "learning_rate": 0.00019752564372032657, + "loss": 0.7962, + "step": 124 + }, + { + "epoch": 0.1, + "grad_norm": 0.5120527191035785, + "learning_rate": 0.00019746801147218842, + "loss": 0.7847, + "step": 125 + }, + { + "epoch": 0.1008, + "grad_norm": 0.45589451863464087, + "learning_rate": 0.00019740972435213115, + "loss": 0.7998, + "step": 126 + }, + { + "epoch": 0.1016, + "grad_norm": 0.4481327122615246, + "learning_rate": 0.00019735078275177654, + "loss": 0.791, + "step": 127 + }, + { + "epoch": 0.1024, + "grad_norm": 0.4251840844480094, + "learning_rate": 0.00019729118706714375, + "loss": 0.8316, + "step": 128 + }, + { + "epoch": 0.1032, + "grad_norm": 0.3628062981318352, + "learning_rate": 0.00019723093769864663, + "loss": 0.7163, + "step": 129 + }, + { + "epoch": 0.104, + "grad_norm": 0.4805709842925019, + "learning_rate": 0.00019717003505109095, + "loss": 0.895, + "step": 130 + }, + { + "epoch": 0.1048, + "grad_norm": 0.42458589044495515, + "learning_rate": 0.0001971084795336719, + "loss": 0.8219, + "step": 131 + }, + { + "epoch": 0.1056, + "grad_norm": 0.4113404858173765, + "learning_rate": 0.00019704627155997108, + "loss": 0.8085, + "step": 132 + }, + { + "epoch": 0.1064, + "grad_norm": 0.4860210093004792, + "learning_rate": 0.00019698341154795389, + "loss": 0.866, + "step": 133 + }, + { + "epoch": 0.1072, + "grad_norm": 0.4681857078909186, + "learning_rate": 0.00019691989991996663, + "loss": 0.8573, + "step": 134 + }, + { + "epoch": 0.108, + "grad_norm": 0.42375573358728696, + "learning_rate": 0.00019685573710273376, + "loss": 0.7784, + "step": 135 + }, + { + "epoch": 0.1088, + "grad_norm": 0.44734150733905653, + "learning_rate": 0.0001967909235273549, + "loss": 0.7925, + "step": 136 + }, + { + "epoch": 0.1096, + "grad_norm": 0.4461303725232236, + "learning_rate": 0.00019672545962930215, + "loss": 0.8014, + "step": 137 + }, + { + "epoch": 0.1104, + "grad_norm": 0.3949997712538345, + "learning_rate": 0.00019665934584841682, + "loss": 0.8022, + "step": 138 + }, + { + "epoch": 0.1112, + "grad_norm": 0.39244220314624556, + "learning_rate": 0.00019659258262890683, + "loss": 0.7385, + "step": 139 + }, + { + "epoch": 0.112, + "grad_norm": 0.40176139668836713, + "learning_rate": 0.00019652517041934356, + "loss": 0.7214, + "step": 140 + }, + { + "epoch": 0.1128, + "grad_norm": 0.4314547890333666, + "learning_rate": 0.00019645710967265882, + "loss": 0.7872, + "step": 141 + }, + { + "epoch": 0.1136, + "grad_norm": 0.4625915384488709, + "learning_rate": 0.00019638840084614182, + "loss": 0.7953, + "step": 142 + }, + { + "epoch": 0.1144, + "grad_norm": 0.40505205236175806, + "learning_rate": 0.00019631904440143612, + "loss": 0.765, + "step": 143 + }, + { + "epoch": 0.1152, + "grad_norm": 0.4623439449351562, + "learning_rate": 0.00019624904080453655, + "loss": 0.8885, + "step": 144 + }, + { + "epoch": 0.116, + "grad_norm": 0.40620045960175877, + "learning_rate": 0.00019617839052578603, + "loss": 0.7914, + "step": 145 + }, + { + "epoch": 0.1168, + "grad_norm": 0.38591399758921124, + "learning_rate": 0.00019610709403987246, + "loss": 0.7291, + "step": 146 + }, + { + "epoch": 0.1176, + "grad_norm": 0.42371161939980584, + "learning_rate": 0.0001960351518258255, + "loss": 0.8044, + "step": 147 + }, + { + "epoch": 0.1184, + "grad_norm": 0.3881700045503985, + "learning_rate": 0.00019596256436701324, + "loss": 0.7109, + "step": 148 + }, + { + "epoch": 0.1192, + "grad_norm": 0.4482791113780307, + "learning_rate": 0.00019588933215113926, + "loss": 0.8399, + "step": 149 + }, + { + "epoch": 0.12, + "grad_norm": 0.4404341406364202, + "learning_rate": 0.000195815455670239, + "loss": 0.8161, + "step": 150 + }, + { + "epoch": 0.1208, + "grad_norm": 0.4560567252709708, + "learning_rate": 0.00019574093542067673, + "loss": 0.8053, + "step": 151 + }, + { + "epoch": 0.1216, + "grad_norm": 0.48260438414333967, + "learning_rate": 0.00019566577190314197, + "loss": 0.8069, + "step": 152 + }, + { + "epoch": 0.1224, + "grad_norm": 0.42905621721548526, + "learning_rate": 0.0001955899656226464, + "loss": 0.7908, + "step": 153 + }, + { + "epoch": 0.1232, + "grad_norm": 0.4141612044013541, + "learning_rate": 0.0001955135170885202, + "loss": 0.7786, + "step": 154 + }, + { + "epoch": 0.124, + "grad_norm": 0.46782800192143464, + "learning_rate": 0.0001954364268144088, + "loss": 0.8697, + "step": 155 + }, + { + "epoch": 0.1248, + "grad_norm": 0.43284321639254114, + "learning_rate": 0.00019535869531826937, + "loss": 0.8078, + "step": 156 + }, + { + "epoch": 0.1256, + "grad_norm": 0.43673194225207906, + "learning_rate": 0.00019528032312236736, + "loss": 0.7898, + "step": 157 + }, + { + "epoch": 0.1264, + "grad_norm": 0.4300511780384195, + "learning_rate": 0.00019520131075327298, + "loss": 0.7702, + "step": 158 + }, + { + "epoch": 0.1272, + "grad_norm": 0.4165428286489318, + "learning_rate": 0.00019512165874185767, + "loss": 0.7877, + "step": 159 + }, + { + "epoch": 0.128, + "grad_norm": 0.4008560058497671, + "learning_rate": 0.00019504136762329047, + "loss": 0.7776, + "step": 160 + }, + { + "epoch": 0.1288, + "grad_norm": 0.38935253294393823, + "learning_rate": 0.0001949604379370345, + "loss": 0.6922, + "step": 161 + }, + { + "epoch": 0.1296, + "grad_norm": 0.4503657687466216, + "learning_rate": 0.00019487887022684336, + "loss": 0.8197, + "step": 162 + }, + { + "epoch": 0.1304, + "grad_norm": 0.4691887379486971, + "learning_rate": 0.00019479666504075736, + "loss": 0.7924, + "step": 163 + }, + { + "epoch": 0.1312, + "grad_norm": 0.3884194106275563, + "learning_rate": 0.00019471382293110003, + "loss": 0.7665, + "step": 164 + }, + { + "epoch": 0.132, + "grad_norm": 0.394762681258105, + "learning_rate": 0.0001946303444544741, + "loss": 0.7539, + "step": 165 + }, + { + "epoch": 0.1328, + "grad_norm": 0.3985846530495592, + "learning_rate": 0.00019454623017175812, + "loss": 0.7394, + "step": 166 + }, + { + "epoch": 0.1336, + "grad_norm": 0.4643833283764792, + "learning_rate": 0.00019446148064810242, + "loss": 0.7741, + "step": 167 + }, + { + "epoch": 0.1344, + "grad_norm": 0.42613032663048966, + "learning_rate": 0.00019437609645292546, + "loss": 0.7967, + "step": 168 + }, + { + "epoch": 0.1352, + "grad_norm": 0.3985170031600202, + "learning_rate": 0.00019429007815990993, + "loss": 0.7908, + "step": 169 + }, + { + "epoch": 0.136, + "grad_norm": 0.7356625207849016, + "learning_rate": 0.0001942034263469989, + "loss": 0.7775, + "step": 170 + }, + { + "epoch": 0.1368, + "grad_norm": 0.45094376276448905, + "learning_rate": 0.00019411614159639204, + "loss": 0.7783, + "step": 171 + }, + { + "epoch": 0.1376, + "grad_norm": 0.4636435816649117, + "learning_rate": 0.00019402822449454153, + "loss": 0.8484, + "step": 172 + }, + { + "epoch": 0.1384, + "grad_norm": 0.4621198499956239, + "learning_rate": 0.00019393967563214833, + "loss": 0.9057, + "step": 173 + }, + { + "epoch": 0.1392, + "grad_norm": 0.36922838228271354, + "learning_rate": 0.00019385049560415794, + "loss": 0.7374, + "step": 174 + }, + { + "epoch": 0.14, + "grad_norm": 0.4965688405829738, + "learning_rate": 0.00019376068500975667, + "loss": 0.8498, + "step": 175 + }, + { + "epoch": 0.1408, + "grad_norm": 0.3646938881056897, + "learning_rate": 0.00019367024445236754, + "loss": 0.7621, + "step": 176 + }, + { + "epoch": 0.1416, + "grad_norm": 0.3758101689141692, + "learning_rate": 0.000193579174539646, + "loss": 0.7237, + "step": 177 + }, + { + "epoch": 0.1424, + "grad_norm": 0.4609739513114896, + "learning_rate": 0.00019348747588347637, + "loss": 0.7893, + "step": 178 + }, + { + "epoch": 0.1432, + "grad_norm": 0.4372103808260089, + "learning_rate": 0.00019339514909996706, + "loss": 0.8498, + "step": 179 + }, + { + "epoch": 0.144, + "grad_norm": 0.5335801995001441, + "learning_rate": 0.00019330219480944694, + "loss": 0.8743, + "step": 180 + }, + { + "epoch": 0.1448, + "grad_norm": 0.46559554482857823, + "learning_rate": 0.00019320861363646095, + "loss": 0.8128, + "step": 181 + }, + { + "epoch": 0.1456, + "grad_norm": 0.38211339446149645, + "learning_rate": 0.00019311440620976597, + "loss": 0.754, + "step": 182 + }, + { + "epoch": 0.1464, + "grad_norm": 0.38983877939713985, + "learning_rate": 0.00019301957316232658, + "loss": 0.7078, + "step": 183 + }, + { + "epoch": 0.1472, + "grad_norm": 0.35728347883672906, + "learning_rate": 0.0001929241151313108, + "loss": 0.7, + "step": 184 + }, + { + "epoch": 0.148, + "grad_norm": 0.44890919500408405, + "learning_rate": 0.0001928280327580858, + "loss": 0.7341, + "step": 185 + }, + { + "epoch": 0.1488, + "grad_norm": 0.43143807366697845, + "learning_rate": 0.00019273132668821364, + "loss": 0.81, + "step": 186 + }, + { + "epoch": 0.1496, + "grad_norm": 0.5033546677466464, + "learning_rate": 0.00019263399757144683, + "loss": 0.8697, + "step": 187 + }, + { + "epoch": 0.1504, + "grad_norm": 0.47626916741189934, + "learning_rate": 0.00019253604606172417, + "loss": 0.8703, + "step": 188 + }, + { + "epoch": 0.1512, + "grad_norm": 0.48024450509527555, + "learning_rate": 0.000192437472817166, + "loss": 0.7925, + "step": 189 + }, + { + "epoch": 0.152, + "grad_norm": 0.42792645932493373, + "learning_rate": 0.00019233827850007027, + "loss": 0.7499, + "step": 190 + }, + { + "epoch": 0.1528, + "grad_norm": 0.4537067692309997, + "learning_rate": 0.00019223846377690754, + "loss": 0.7749, + "step": 191 + }, + { + "epoch": 0.1536, + "grad_norm": 0.44128448938912807, + "learning_rate": 0.00019213802931831696, + "loss": 0.7712, + "step": 192 + }, + { + "epoch": 0.1544, + "grad_norm": 0.4237325523357925, + "learning_rate": 0.00019203697579910154, + "loss": 0.7853, + "step": 193 + }, + { + "epoch": 0.1552, + "grad_norm": 0.44618220922984425, + "learning_rate": 0.00019193530389822363, + "loss": 0.8255, + "step": 194 + }, + { + "epoch": 0.156, + "grad_norm": 0.412291999280626, + "learning_rate": 0.00019183301429880043, + "loss": 0.8134, + "step": 195 + }, + { + "epoch": 0.1568, + "grad_norm": 0.476265593273783, + "learning_rate": 0.00019173010768809933, + "loss": 0.8385, + "step": 196 + }, + { + "epoch": 0.1576, + "grad_norm": 0.41576625362728403, + "learning_rate": 0.00019162658475753327, + "loss": 0.7821, + "step": 197 + }, + { + "epoch": 0.1584, + "grad_norm": 0.4289971257577167, + "learning_rate": 0.0001915224462026563, + "loss": 0.8013, + "step": 198 + }, + { + "epoch": 0.1592, + "grad_norm": 0.39105537877275937, + "learning_rate": 0.00019141769272315858, + "loss": 0.7248, + "step": 199 + }, + { + "epoch": 0.16, + "grad_norm": 0.3895336927661995, + "learning_rate": 0.00019131232502286188, + "loss": 0.7006, + "step": 200 + }, + { + "epoch": 0.1608, + "grad_norm": 0.48139123386353133, + "learning_rate": 0.00019120634380971496, + "loss": 0.7692, + "step": 201 + }, + { + "epoch": 0.1616, + "grad_norm": 0.42553092787861524, + "learning_rate": 0.0001910997497957885, + "loss": 0.7733, + "step": 202 + }, + { + "epoch": 0.1624, + "grad_norm": 0.4198528337621994, + "learning_rate": 0.0001909925436972706, + "loss": 0.7154, + "step": 203 + }, + { + "epoch": 0.1632, + "grad_norm": 0.47240006700238607, + "learning_rate": 0.00019088472623446183, + "loss": 0.8265, + "step": 204 + }, + { + "epoch": 0.164, + "grad_norm": 0.42225252570225996, + "learning_rate": 0.00019077629813177036, + "loss": 0.7502, + "step": 205 + }, + { + "epoch": 0.1648, + "grad_norm": 0.3967444589999123, + "learning_rate": 0.00019066726011770726, + "loss": 0.6841, + "step": 206 + }, + { + "epoch": 0.1656, + "grad_norm": 0.4625079056614288, + "learning_rate": 0.00019055761292488142, + "loss": 0.8298, + "step": 207 + }, + { + "epoch": 0.1664, + "grad_norm": 0.48961109504154815, + "learning_rate": 0.0001904473572899947, + "loss": 0.8815, + "step": 208 + }, + { + "epoch": 0.1672, + "grad_norm": 0.43470857997535833, + "learning_rate": 0.00019033649395383702, + "loss": 0.7719, + "step": 209 + }, + { + "epoch": 0.168, + "grad_norm": 0.4335611546115917, + "learning_rate": 0.00019022502366128135, + "loss": 0.8122, + "step": 210 + }, + { + "epoch": 0.1688, + "grad_norm": 0.4368120206369419, + "learning_rate": 0.00019011294716127867, + "loss": 0.8141, + "step": 211 + }, + { + "epoch": 0.1696, + "grad_norm": 0.3850030447261817, + "learning_rate": 0.00019000026520685302, + "loss": 0.7727, + "step": 212 + }, + { + "epoch": 0.1704, + "grad_norm": 0.40477749678485697, + "learning_rate": 0.0001898869785550963, + "loss": 0.7856, + "step": 213 + }, + { + "epoch": 0.1712, + "grad_norm": 0.42982204528288137, + "learning_rate": 0.0001897730879671634, + "loss": 0.7594, + "step": 214 + }, + { + "epoch": 0.172, + "grad_norm": 0.41021059459830667, + "learning_rate": 0.00018965859420826684, + "loss": 0.7721, + "step": 215 + }, + { + "epoch": 0.1728, + "grad_norm": 0.387973868858638, + "learning_rate": 0.00018954349804767184, + "loss": 0.712, + "step": 216 + }, + { + "epoch": 0.1736, + "grad_norm": 0.4860347187640322, + "learning_rate": 0.00018942780025869098, + "loss": 0.8645, + "step": 217 + }, + { + "epoch": 0.1744, + "grad_norm": 0.44564759178431707, + "learning_rate": 0.00018931150161867916, + "loss": 0.8202, + "step": 218 + }, + { + "epoch": 0.1752, + "grad_norm": 0.420660624361008, + "learning_rate": 0.00018919460290902826, + "loss": 0.7458, + "step": 219 + }, + { + "epoch": 0.176, + "grad_norm": 0.41015411316730255, + "learning_rate": 0.00018907710491516199, + "loss": 0.7232, + "step": 220 + }, + { + "epoch": 0.1768, + "grad_norm": 0.42515818070700523, + "learning_rate": 0.0001889590084265304, + "loss": 0.8088, + "step": 221 + }, + { + "epoch": 0.1776, + "grad_norm": 0.4610430638278789, + "learning_rate": 0.0001888403142366049, + "loss": 0.8274, + "step": 222 + }, + { + "epoch": 0.1784, + "grad_norm": 0.42056918293114154, + "learning_rate": 0.0001887210231428727, + "loss": 0.777, + "step": 223 + }, + { + "epoch": 0.1792, + "grad_norm": 0.44536383613524727, + "learning_rate": 0.00018860113594683148, + "loss": 0.8221, + "step": 224 + }, + { + "epoch": 0.18, + "grad_norm": 0.421032036109606, + "learning_rate": 0.0001884806534539841, + "loss": 0.8321, + "step": 225 + }, + { + "epoch": 0.1808, + "grad_norm": 0.43101712396074243, + "learning_rate": 0.00018835957647383303, + "loss": 0.7817, + "step": 226 + }, + { + "epoch": 0.1816, + "grad_norm": 0.38674873972167195, + "learning_rate": 0.0001882379058198751, + "loss": 0.7869, + "step": 227 + }, + { + "epoch": 0.1824, + "grad_norm": 0.3707518679142606, + "learning_rate": 0.00018811564230959588, + "loss": 0.702, + "step": 228 + }, + { + "epoch": 0.1832, + "grad_norm": 0.3916112759808893, + "learning_rate": 0.00018799278676446423, + "loss": 0.7938, + "step": 229 + }, + { + "epoch": 0.184, + "grad_norm": 0.38609299121364954, + "learning_rate": 0.00018786934000992688, + "loss": 0.7322, + "step": 230 + }, + { + "epoch": 0.1848, + "grad_norm": 0.435661825146532, + "learning_rate": 0.00018774530287540278, + "loss": 0.8045, + "step": 231 + }, + { + "epoch": 0.1856, + "grad_norm": 0.49499626868178864, + "learning_rate": 0.00018762067619427746, + "loss": 0.8234, + "step": 232 + }, + { + "epoch": 0.1864, + "grad_norm": 0.4893711424890475, + "learning_rate": 0.00018749546080389757, + "loss": 0.873, + "step": 233 + }, + { + "epoch": 0.1872, + "grad_norm": 0.4450958116436791, + "learning_rate": 0.00018736965754556528, + "loss": 0.8649, + "step": 234 + }, + { + "epoch": 0.188, + "grad_norm": 0.45547998510701876, + "learning_rate": 0.00018724326726453244, + "loss": 0.8302, + "step": 235 + }, + { + "epoch": 0.1888, + "grad_norm": 0.4295009687377971, + "learning_rate": 0.00018711629080999504, + "loss": 0.7666, + "step": 236 + }, + { + "epoch": 0.1896, + "grad_norm": 0.43034893049915784, + "learning_rate": 0.00018698872903508755, + "loss": 0.7891, + "step": 237 + }, + { + "epoch": 0.1904, + "grad_norm": 0.4114283725236567, + "learning_rate": 0.00018686058279687698, + "loss": 0.7864, + "step": 238 + }, + { + "epoch": 0.1912, + "grad_norm": 0.4110886039969523, + "learning_rate": 0.0001867318529563574, + "loss": 0.7785, + "step": 239 + }, + { + "epoch": 0.192, + "grad_norm": 0.3855114181967641, + "learning_rate": 0.00018660254037844388, + "loss": 0.7328, + "step": 240 + }, + { + "epoch": 0.1928, + "grad_norm": 0.4567790027176353, + "learning_rate": 0.00018647264593196688, + "loss": 0.8856, + "step": 241 + }, + { + "epoch": 0.1936, + "grad_norm": 0.5619354317424428, + "learning_rate": 0.00018634217048966637, + "loss": 0.7871, + "step": 242 + }, + { + "epoch": 0.1944, + "grad_norm": 0.42812378397821366, + "learning_rate": 0.00018621111492818585, + "loss": 0.8011, + "step": 243 + }, + { + "epoch": 0.1952, + "grad_norm": 0.4241679326027458, + "learning_rate": 0.0001860794801280666, + "loss": 0.7245, + "step": 244 + }, + { + "epoch": 0.196, + "grad_norm": 0.39538001527672545, + "learning_rate": 0.00018594726697374175, + "loss": 0.7696, + "step": 245 + }, + { + "epoch": 0.1968, + "grad_norm": 0.42330872777641004, + "learning_rate": 0.0001858144763535302, + "loss": 0.8117, + "step": 246 + }, + { + "epoch": 0.1976, + "grad_norm": 0.385442778809825, + "learning_rate": 0.0001856811091596308, + "loss": 0.7262, + "step": 247 + }, + { + "epoch": 0.1984, + "grad_norm": 0.48319857321141446, + "learning_rate": 0.0001855471662881164, + "loss": 0.8435, + "step": 248 + }, + { + "epoch": 0.1992, + "grad_norm": 0.42823877532031807, + "learning_rate": 0.00018541264863892754, + "loss": 0.7938, + "step": 249 + }, + { + "epoch": 0.2, + "grad_norm": 0.4218589476531892, + "learning_rate": 0.00018527755711586678, + "loss": 0.773, + "step": 250 + }, + { + "epoch": 0.2008, + "grad_norm": 0.42011382915150025, + "learning_rate": 0.00018514189262659235, + "loss": 0.7718, + "step": 251 + }, + { + "epoch": 0.2016, + "grad_norm": 0.45200703035714385, + "learning_rate": 0.00018500565608261214, + "loss": 0.8206, + "step": 252 + }, + { + "epoch": 0.2024, + "grad_norm": 0.44780678084596004, + "learning_rate": 0.00018486884839927768, + "loss": 0.8711, + "step": 253 + }, + { + "epoch": 0.2032, + "grad_norm": 0.3984521596182537, + "learning_rate": 0.00018473147049577774, + "loss": 0.7721, + "step": 254 + }, + { + "epoch": 0.204, + "grad_norm": 0.43193742325175283, + "learning_rate": 0.0001845935232951325, + "loss": 0.7781, + "step": 255 + }, + { + "epoch": 0.2048, + "grad_norm": 0.37411697466957655, + "learning_rate": 0.00018445500772418697, + "loss": 0.7818, + "step": 256 + }, + { + "epoch": 0.2056, + "grad_norm": 0.4607327939799755, + "learning_rate": 0.00018431592471360503, + "loss": 0.8509, + "step": 257 + }, + { + "epoch": 0.2064, + "grad_norm": 0.4039402676301833, + "learning_rate": 0.00018417627519786315, + "loss": 0.7639, + "step": 258 + }, + { + "epoch": 0.2072, + "grad_norm": 0.4051051026908022, + "learning_rate": 0.000184036060115244, + "loss": 0.7958, + "step": 259 + }, + { + "epoch": 0.208, + "grad_norm": 0.4681079514119628, + "learning_rate": 0.00018389528040783012, + "loss": 0.778, + "step": 260 + }, + { + "epoch": 0.2088, + "grad_norm": 0.5089045150757345, + "learning_rate": 0.00018375393702149787, + "loss": 0.8048, + "step": 261 + }, + { + "epoch": 0.2096, + "grad_norm": 0.39712170271914216, + "learning_rate": 0.00018361203090591071, + "loss": 0.8004, + "step": 262 + }, + { + "epoch": 0.2104, + "grad_norm": 0.4392432511684368, + "learning_rate": 0.00018346956301451304, + "loss": 0.8395, + "step": 263 + }, + { + "epoch": 0.2112, + "grad_norm": 0.4347253143263972, + "learning_rate": 0.00018332653430452376, + "loss": 0.8149, + "step": 264 + }, + { + "epoch": 0.212, + "grad_norm": 0.41443178774377587, + "learning_rate": 0.00018318294573692985, + "loss": 0.7329, + "step": 265 + }, + { + "epoch": 0.2128, + "grad_norm": 0.4650143976768978, + "learning_rate": 0.00018303879827647975, + "loss": 0.839, + "step": 266 + }, + { + "epoch": 0.2136, + "grad_norm": 0.43911311807537096, + "learning_rate": 0.0001828940928916772, + "loss": 0.8067, + "step": 267 + }, + { + "epoch": 0.2144, + "grad_norm": 0.39540305914208157, + "learning_rate": 0.00018274883055477436, + "loss": 0.7068, + "step": 268 + }, + { + "epoch": 0.2152, + "grad_norm": 0.42966623588159436, + "learning_rate": 0.00018260301224176558, + "loss": 0.8177, + "step": 269 + }, + { + "epoch": 0.216, + "grad_norm": 0.43133822155256435, + "learning_rate": 0.00018245663893238075, + "loss": 0.7177, + "step": 270 + }, + { + "epoch": 0.2168, + "grad_norm": 0.4244528334614769, + "learning_rate": 0.00018230971161007853, + "loss": 0.7657, + "step": 271 + }, + { + "epoch": 0.2176, + "grad_norm": 0.47176055918419235, + "learning_rate": 0.00018216223126204007, + "loss": 0.8243, + "step": 272 + }, + { + "epoch": 0.2184, + "grad_norm": 0.41788494818177235, + "learning_rate": 0.00018201419887916214, + "loss": 0.7464, + "step": 273 + }, + { + "epoch": 0.2192, + "grad_norm": 0.40337884467661256, + "learning_rate": 0.00018186561545605054, + "loss": 0.7418, + "step": 274 + }, + { + "epoch": 0.22, + "grad_norm": 0.43116229829324304, + "learning_rate": 0.00018171648199101346, + "loss": 0.837, + "step": 275 + }, + { + "epoch": 0.2208, + "grad_norm": 0.4201249121122027, + "learning_rate": 0.00018156679948605467, + "loss": 0.7734, + "step": 276 + }, + { + "epoch": 0.2216, + "grad_norm": 0.4276002086311205, + "learning_rate": 0.00018141656894686689, + "loss": 0.8, + "step": 277 + }, + { + "epoch": 0.2224, + "grad_norm": 0.462778116990809, + "learning_rate": 0.00018126579138282503, + "loss": 0.798, + "step": 278 + }, + { + "epoch": 0.2232, + "grad_norm": 0.3895954135717525, + "learning_rate": 0.00018111446780697929, + "loss": 0.7247, + "step": 279 + }, + { + "epoch": 0.224, + "grad_norm": 0.44693768901196756, + "learning_rate": 0.0001809625992360485, + "loss": 0.7611, + "step": 280 + }, + { + "epoch": 0.2248, + "grad_norm": 0.5012266359481057, + "learning_rate": 0.00018081018669041324, + "loss": 0.7782, + "step": 281 + }, + { + "epoch": 0.2256, + "grad_norm": 0.431580052077535, + "learning_rate": 0.00018065723119410884, + "loss": 0.8136, + "step": 282 + }, + { + "epoch": 0.2264, + "grad_norm": 0.34580350194592, + "learning_rate": 0.00018050373377481878, + "loss": 0.7046, + "step": 283 + }, + { + "epoch": 0.2272, + "grad_norm": 0.40521604941592043, + "learning_rate": 0.00018034969546386757, + "loss": 0.7822, + "step": 284 + }, + { + "epoch": 0.228, + "grad_norm": 0.463413863480668, + "learning_rate": 0.0001801951172962139, + "loss": 0.8068, + "step": 285 + }, + { + "epoch": 0.2288, + "grad_norm": 0.40150935644843316, + "learning_rate": 0.0001800400003104436, + "loss": 0.8233, + "step": 286 + }, + { + "epoch": 0.2296, + "grad_norm": 0.45689557364514, + "learning_rate": 0.0001798843455487629, + "loss": 0.823, + "step": 287 + }, + { + "epoch": 0.2304, + "grad_norm": 0.4670739142075554, + "learning_rate": 0.00017972815405699103, + "loss": 0.8149, + "step": 288 + }, + { + "epoch": 0.2312, + "grad_norm": 0.5307781342567004, + "learning_rate": 0.00017957142688455362, + "loss": 0.804, + "step": 289 + }, + { + "epoch": 0.232, + "grad_norm": 0.4208180995221731, + "learning_rate": 0.00017941416508447536, + "loss": 0.7791, + "step": 290 + }, + { + "epoch": 0.2328, + "grad_norm": 0.45014040176489684, + "learning_rate": 0.00017925636971337304, + "loss": 0.8117, + "step": 291 + }, + { + "epoch": 0.2336, + "grad_norm": 0.4596321645895528, + "learning_rate": 0.0001790980418314484, + "loss": 0.7931, + "step": 292 + }, + { + "epoch": 0.2344, + "grad_norm": 0.4028349356623204, + "learning_rate": 0.00017893918250248104, + "loss": 0.7381, + "step": 293 + }, + { + "epoch": 0.2352, + "grad_norm": 0.3917587239284584, + "learning_rate": 0.00017877979279382135, + "loss": 0.6936, + "step": 294 + }, + { + "epoch": 0.236, + "grad_norm": 0.3900838090279277, + "learning_rate": 0.00017861987377638312, + "loss": 0.7342, + "step": 295 + }, + { + "epoch": 0.2368, + "grad_norm": 0.43758813519958567, + "learning_rate": 0.0001784594265246366, + "loss": 0.7689, + "step": 296 + }, + { + "epoch": 0.2376, + "grad_norm": 0.40332860167731854, + "learning_rate": 0.0001782984521166011, + "loss": 0.7457, + "step": 297 + }, + { + "epoch": 0.2384, + "grad_norm": 0.39487269978666895, + "learning_rate": 0.0001781369516338378, + "loss": 0.7212, + "step": 298 + }, + { + "epoch": 0.2392, + "grad_norm": 0.4332277656743601, + "learning_rate": 0.00017797492616144256, + "loss": 0.8009, + "step": 299 + }, + { + "epoch": 0.24, + "grad_norm": 0.35652603119573323, + "learning_rate": 0.00017781237678803847, + "loss": 0.6794, + "step": 300 + }, + { + "epoch": 0.2408, + "grad_norm": 0.4438369720997122, + "learning_rate": 0.00017764930460576866, + "loss": 0.8007, + "step": 301 + }, + { + "epoch": 0.2416, + "grad_norm": 0.42018279839502726, + "learning_rate": 0.000177485710710289, + "loss": 0.758, + "step": 302 + }, + { + "epoch": 0.2424, + "grad_norm": 0.4200937067695404, + "learning_rate": 0.00017732159620076053, + "loss": 0.7503, + "step": 303 + }, + { + "epoch": 0.2432, + "grad_norm": 0.4884774124325421, + "learning_rate": 0.00017715696217984235, + "loss": 0.8924, + "step": 304 + }, + { + "epoch": 0.244, + "grad_norm": 0.3908029573556648, + "learning_rate": 0.00017699180975368396, + "loss": 0.7262, + "step": 305 + }, + { + "epoch": 0.2448, + "grad_norm": 0.41808155721721457, + "learning_rate": 0.00017682614003191807, + "loss": 0.7691, + "step": 306 + }, + { + "epoch": 0.2456, + "grad_norm": 0.40250340781508953, + "learning_rate": 0.00017665995412765285, + "loss": 0.7377, + "step": 307 + }, + { + "epoch": 0.2464, + "grad_norm": 0.44929706295556365, + "learning_rate": 0.00017649325315746478, + "loss": 0.8265, + "step": 308 + }, + { + "epoch": 0.2472, + "grad_norm": 0.4159689392637273, + "learning_rate": 0.00017632603824139085, + "loss": 0.7394, + "step": 309 + }, + { + "epoch": 0.248, + "grad_norm": 0.43527641503799935, + "learning_rate": 0.0001761583105029213, + "loss": 0.8427, + "step": 310 + }, + { + "epoch": 0.2488, + "grad_norm": 0.34219949975786723, + "learning_rate": 0.0001759900710689918, + "loss": 0.7182, + "step": 311 + }, + { + "epoch": 0.2496, + "grad_norm": 0.4254323711431976, + "learning_rate": 0.00017582132106997616, + "loss": 0.849, + "step": 312 + }, + { + "epoch": 0.2504, + "grad_norm": 0.4673742997750265, + "learning_rate": 0.00017565206163967846, + "loss": 0.8315, + "step": 313 + }, + { + "epoch": 0.2512, + "grad_norm": 0.3829077844147456, + "learning_rate": 0.00017548229391532572, + "loss": 0.7322, + "step": 314 + }, + { + "epoch": 0.252, + "grad_norm": 0.390914529242842, + "learning_rate": 0.00017531201903755994, + "loss": 0.7534, + "step": 315 + }, + { + "epoch": 0.2528, + "grad_norm": 0.42361530007513093, + "learning_rate": 0.00017514123815043074, + "loss": 0.708, + "step": 316 + }, + { + "epoch": 0.2536, + "grad_norm": 0.45358930689734117, + "learning_rate": 0.00017496995240138744, + "loss": 0.7985, + "step": 317 + }, + { + "epoch": 0.2544, + "grad_norm": 0.4215442147140462, + "learning_rate": 0.00017479816294127152, + "loss": 0.8093, + "step": 318 + }, + { + "epoch": 0.2552, + "grad_norm": 0.4094160983237034, + "learning_rate": 0.00017462587092430875, + "loss": 0.7078, + "step": 319 + }, + { + "epoch": 0.256, + "grad_norm": 0.40893592359557696, + "learning_rate": 0.0001744530775081015, + "loss": 0.7011, + "step": 320 + }, + { + "epoch": 0.2568, + "grad_norm": 0.4071121116863761, + "learning_rate": 0.00017427978385362112, + "loss": 0.755, + "step": 321 + }, + { + "epoch": 0.2576, + "grad_norm": 0.4357221168046931, + "learning_rate": 0.0001741059911251997, + "loss": 0.7251, + "step": 322 + }, + { + "epoch": 0.2584, + "grad_norm": 0.395505603922931, + "learning_rate": 0.0001739317004905227, + "loss": 0.7379, + "step": 323 + }, + { + "epoch": 0.2592, + "grad_norm": 0.37045560115272647, + "learning_rate": 0.000173756913120621, + "loss": 0.6287, + "step": 324 + }, + { + "epoch": 0.26, + "grad_norm": 0.3976537556203463, + "learning_rate": 0.00017358163018986282, + "loss": 0.7186, + "step": 325 + }, + { + "epoch": 0.2608, + "grad_norm": 0.5056725192022081, + "learning_rate": 0.00017340585287594604, + "loss": 0.8429, + "step": 326 + }, + { + "epoch": 0.2616, + "grad_norm": 0.3756217181501402, + "learning_rate": 0.00017322958235989016, + "loss": 0.736, + "step": 327 + }, + { + "epoch": 0.2624, + "grad_norm": 0.4074679927609601, + "learning_rate": 0.0001730528198260285, + "loss": 0.7919, + "step": 328 + }, + { + "epoch": 0.2632, + "grad_norm": 0.4249231127747171, + "learning_rate": 0.00017287556646200018, + "loss": 0.7303, + "step": 329 + }, + { + "epoch": 0.264, + "grad_norm": 0.39000956775536444, + "learning_rate": 0.00017269782345874203, + "loss": 0.7254, + "step": 330 + }, + { + "epoch": 0.2648, + "grad_norm": 0.3916990291806942, + "learning_rate": 0.00017251959201048083, + "loss": 0.7013, + "step": 331 + }, + { + "epoch": 0.2656, + "grad_norm": 0.42591790880822694, + "learning_rate": 0.00017234087331472497, + "loss": 0.7502, + "step": 332 + }, + { + "epoch": 0.2664, + "grad_norm": 0.37489223839291724, + "learning_rate": 0.00017216166857225674, + "loss": 0.7018, + "step": 333 + }, + { + "epoch": 0.2672, + "grad_norm": 0.4194676251178253, + "learning_rate": 0.00017198197898712404, + "loss": 0.7212, + "step": 334 + }, + { + "epoch": 0.268, + "grad_norm": 0.43101335303581556, + "learning_rate": 0.00017180180576663228, + "loss": 0.8161, + "step": 335 + }, + { + "epoch": 0.2688, + "grad_norm": 0.4387232065897843, + "learning_rate": 0.00017162115012133643, + "loss": 0.8008, + "step": 336 + }, + { + "epoch": 0.2696, + "grad_norm": 0.41382439367804846, + "learning_rate": 0.00017144001326503273, + "loss": 0.804, + "step": 337 + }, + { + "epoch": 0.2704, + "grad_norm": 0.41211004447198074, + "learning_rate": 0.00017125839641475072, + "loss": 0.7569, + "step": 338 + }, + { + "epoch": 0.2712, + "grad_norm": 0.40008071317180616, + "learning_rate": 0.00017107630079074478, + "loss": 0.7677, + "step": 339 + }, + { + "epoch": 0.272, + "grad_norm": 0.5127153714759964, + "learning_rate": 0.00017089372761648616, + "loss": 0.7871, + "step": 340 + }, + { + "epoch": 0.2728, + "grad_norm": 0.4490024504182968, + "learning_rate": 0.00017071067811865476, + "loss": 0.7269, + "step": 341 + }, + { + "epoch": 0.2736, + "grad_norm": 0.43938571973084534, + "learning_rate": 0.00017052715352713075, + "loss": 0.7702, + "step": 342 + }, + { + "epoch": 0.2744, + "grad_norm": 0.41014909069350464, + "learning_rate": 0.00017034315507498635, + "loss": 0.785, + "step": 343 + }, + { + "epoch": 0.2752, + "grad_norm": 0.41426028858989083, + "learning_rate": 0.00017015868399847768, + "loss": 0.8286, + "step": 344 + }, + { + "epoch": 0.276, + "grad_norm": 0.4115488561422381, + "learning_rate": 0.00016997374153703625, + "loss": 0.8013, + "step": 345 + }, + { + "epoch": 0.2768, + "grad_norm": 0.4214337570754401, + "learning_rate": 0.00016978832893326074, + "loss": 0.8269, + "step": 346 + }, + { + "epoch": 0.2776, + "grad_norm": 0.4043638403803839, + "learning_rate": 0.00016960244743290868, + "loss": 0.7818, + "step": 347 + }, + { + "epoch": 0.2784, + "grad_norm": 0.6022236587314866, + "learning_rate": 0.00016941609828488807, + "loss": 0.7983, + "step": 348 + }, + { + "epoch": 0.2792, + "grad_norm": 0.40289661644881036, + "learning_rate": 0.00016922928274124886, + "loss": 0.7353, + "step": 349 + }, + { + "epoch": 0.28, + "grad_norm": 0.37273367086224285, + "learning_rate": 0.0001690420020571747, + "loss": 0.6951, + "step": 350 + }, + { + "epoch": 0.2808, + "grad_norm": 0.4682536065646327, + "learning_rate": 0.00016885425749097444, + "loss": 0.7607, + "step": 351 + }, + { + "epoch": 0.2816, + "grad_norm": 0.44530767350503303, + "learning_rate": 0.0001686660503040737, + "loss": 0.8339, + "step": 352 + }, + { + "epoch": 0.2824, + "grad_norm": 0.4192781089904373, + "learning_rate": 0.00016847738176100632, + "loss": 0.7627, + "step": 353 + }, + { + "epoch": 0.2832, + "grad_norm": 0.424279947113004, + "learning_rate": 0.00016828825312940592, + "loss": 0.7607, + "step": 354 + }, + { + "epoch": 0.284, + "grad_norm": 0.3624871461837757, + "learning_rate": 0.0001680986656799975, + "loss": 0.6784, + "step": 355 + }, + { + "epoch": 0.2848, + "grad_norm": 0.4458628496104206, + "learning_rate": 0.0001679086206865886, + "loss": 0.8183, + "step": 356 + }, + { + "epoch": 0.2856, + "grad_norm": 0.37260410669057686, + "learning_rate": 0.00016771811942606108, + "loss": 0.699, + "step": 357 + }, + { + "epoch": 0.2864, + "grad_norm": 0.45846386329482036, + "learning_rate": 0.00016752716317836229, + "loss": 0.8798, + "step": 358 + }, + { + "epoch": 0.2872, + "grad_norm": 0.3880417003199712, + "learning_rate": 0.00016733575322649657, + "loss": 0.6794, + "step": 359 + }, + { + "epoch": 0.288, + "grad_norm": 0.4084947786453557, + "learning_rate": 0.0001671438908565167, + "loss": 0.7111, + "step": 360 + }, + { + "epoch": 0.2888, + "grad_norm": 0.40911708106332595, + "learning_rate": 0.00016695157735751513, + "loss": 0.7549, + "step": 361 + }, + { + "epoch": 0.2896, + "grad_norm": 0.4132999549821528, + "learning_rate": 0.00016675881402161536, + "loss": 0.791, + "step": 362 + }, + { + "epoch": 0.2904, + "grad_norm": 0.49385166506659406, + "learning_rate": 0.0001665656021439633, + "loss": 0.8749, + "step": 363 + }, + { + "epoch": 0.2912, + "grad_norm": 0.38919074019594835, + "learning_rate": 0.0001663719430227186, + "loss": 0.6928, + "step": 364 + }, + { + "epoch": 0.292, + "grad_norm": 0.43760925349144164, + "learning_rate": 0.00016617783795904565, + "loss": 0.7955, + "step": 365 + }, + { + "epoch": 0.2928, + "grad_norm": 0.38914225629869686, + "learning_rate": 0.00016598328825710533, + "loss": 0.7366, + "step": 366 + }, + { + "epoch": 0.2936, + "grad_norm": 0.36628467222820826, + "learning_rate": 0.00016578829522404583, + "loss": 0.6918, + "step": 367 + }, + { + "epoch": 0.2944, + "grad_norm": 0.42253341688016677, + "learning_rate": 0.000165592860169994, + "loss": 0.8499, + "step": 368 + }, + { + "epoch": 0.2952, + "grad_norm": 0.43265824860995267, + "learning_rate": 0.00016539698440804661, + "loss": 0.8011, + "step": 369 + }, + { + "epoch": 0.296, + "grad_norm": 0.45157281512211417, + "learning_rate": 0.00016520066925426144, + "loss": 0.8259, + "step": 370 + }, + { + "epoch": 0.2968, + "grad_norm": 0.41067404773789473, + "learning_rate": 0.0001650039160276485, + "loss": 0.7546, + "step": 371 + }, + { + "epoch": 0.2976, + "grad_norm": 0.414208097470773, + "learning_rate": 0.0001648067260501611, + "loss": 0.7532, + "step": 372 + }, + { + "epoch": 0.2984, + "grad_norm": 0.4208006567151566, + "learning_rate": 0.0001646091006466871, + "loss": 0.7728, + "step": 373 + }, + { + "epoch": 0.2992, + "grad_norm": 0.41022324623325745, + "learning_rate": 0.0001644110411450398, + "loss": 0.7424, + "step": 374 + }, + { + "epoch": 0.3, + "grad_norm": 0.4372777903897775, + "learning_rate": 0.00016421254887594917, + "loss": 0.7284, + "step": 375 + }, + { + "epoch": 0.3008, + "grad_norm": 0.4293241178806811, + "learning_rate": 0.00016401362517305296, + "loss": 0.8082, + "step": 376 + }, + { + "epoch": 0.3016, + "grad_norm": 0.4703338824301085, + "learning_rate": 0.00016381427137288754, + "loss": 0.7858, + "step": 377 + }, + { + "epoch": 0.3024, + "grad_norm": 0.3821269768908472, + "learning_rate": 0.00016361448881487914, + "loss": 0.6812, + "step": 378 + }, + { + "epoch": 0.3032, + "grad_norm": 0.4639157162424115, + "learning_rate": 0.0001634142788413346, + "loss": 0.8313, + "step": 379 + }, + { + "epoch": 0.304, + "grad_norm": 0.4066065519034959, + "learning_rate": 0.00016321364279743266, + "loss": 0.7776, + "step": 380 + }, + { + "epoch": 0.3048, + "grad_norm": 0.4069990387589458, + "learning_rate": 0.00016301258203121462, + "loss": 0.7526, + "step": 381 + }, + { + "epoch": 0.3056, + "grad_norm": 0.44425750702697925, + "learning_rate": 0.0001628110978935756, + "loss": 0.7674, + "step": 382 + }, + { + "epoch": 0.3064, + "grad_norm": 0.40915898668096295, + "learning_rate": 0.00016260919173825508, + "loss": 0.7172, + "step": 383 + }, + { + "epoch": 0.3072, + "grad_norm": 0.46277505387879625, + "learning_rate": 0.00016240686492182804, + "loss": 0.7885, + "step": 384 + }, + { + "epoch": 0.308, + "grad_norm": 0.42489735407616697, + "learning_rate": 0.00016220411880369601, + "loss": 0.7767, + "step": 385 + }, + { + "epoch": 0.3088, + "grad_norm": 0.3915573764346979, + "learning_rate": 0.00016200095474607753, + "loss": 0.7262, + "step": 386 + }, + { + "epoch": 0.3096, + "grad_norm": 0.4694645980008425, + "learning_rate": 0.00016179737411399926, + "loss": 0.8272, + "step": 387 + }, + { + "epoch": 0.3104, + "grad_norm": 0.42899293439515646, + "learning_rate": 0.00016159337827528685, + "loss": 0.804, + "step": 388 + }, + { + "epoch": 0.3112, + "grad_norm": 0.3740814824219872, + "learning_rate": 0.00016138896860055555, + "loss": 0.7416, + "step": 389 + }, + { + "epoch": 0.312, + "grad_norm": 0.39837081609459923, + "learning_rate": 0.0001611841464632011, + "loss": 0.7494, + "step": 390 + }, + { + "epoch": 0.3128, + "grad_norm": 0.4582353947458681, + "learning_rate": 0.00016097891323939062, + "loss": 0.8083, + "step": 391 + }, + { + "epoch": 0.3136, + "grad_norm": 0.4307640402194602, + "learning_rate": 0.0001607732703080532, + "loss": 0.7589, + "step": 392 + }, + { + "epoch": 0.3144, + "grad_norm": 0.36653586683967887, + "learning_rate": 0.00016056721905087056, + "loss": 0.6716, + "step": 393 + }, + { + "epoch": 0.3152, + "grad_norm": 0.544221337878332, + "learning_rate": 0.00016036076085226814, + "loss": 0.7744, + "step": 394 + }, + { + "epoch": 0.316, + "grad_norm": 0.4351542944159919, + "learning_rate": 0.00016015389709940538, + "loss": 0.7857, + "step": 395 + }, + { + "epoch": 0.3168, + "grad_norm": 0.39901121045005666, + "learning_rate": 0.0001599466291821666, + "loss": 0.7286, + "step": 396 + }, + { + "epoch": 0.3176, + "grad_norm": 0.4219620512236007, + "learning_rate": 0.0001597389584931517, + "loss": 0.7797, + "step": 397 + }, + { + "epoch": 0.3184, + "grad_norm": 0.39815337070858814, + "learning_rate": 0.0001595308864276666, + "loss": 0.7049, + "step": 398 + }, + { + "epoch": 0.3192, + "grad_norm": 0.3818173849977698, + "learning_rate": 0.0001593224143837142, + "loss": 0.7471, + "step": 399 + }, + { + "epoch": 0.32, + "grad_norm": 0.39122989783011713, + "learning_rate": 0.0001591135437619847, + "loss": 0.7306, + "step": 400 + }, + { + "epoch": 0.3208, + "grad_norm": 0.3767324234403499, + "learning_rate": 0.00015890427596584617, + "loss": 0.7385, + "step": 401 + }, + { + "epoch": 0.3216, + "grad_norm": 0.40191898660582864, + "learning_rate": 0.0001586946124013354, + "loss": 0.6948, + "step": 402 + }, + { + "epoch": 0.3224, + "grad_norm": 0.435571201323962, + "learning_rate": 0.00015848455447714822, + "loss": 0.7739, + "step": 403 + }, + { + "epoch": 0.3232, + "grad_norm": 0.4523390503836961, + "learning_rate": 0.0001582741036046301, + "loss": 0.7937, + "step": 404 + }, + { + "epoch": 0.324, + "grad_norm": 0.4485313538011009, + "learning_rate": 0.00015806326119776663, + "loss": 0.7949, + "step": 405 + }, + { + "epoch": 0.3248, + "grad_norm": 0.45596518575590006, + "learning_rate": 0.00015785202867317407, + "loss": 0.8077, + "step": 406 + }, + { + "epoch": 0.3256, + "grad_norm": 0.42809020109568713, + "learning_rate": 0.00015764040745008988, + "loss": 0.7876, + "step": 407 + }, + { + "epoch": 0.3264, + "grad_norm": 0.38023281273221554, + "learning_rate": 0.00015742839895036305, + "loss": 0.7229, + "step": 408 + }, + { + "epoch": 0.3272, + "grad_norm": 0.41311752581332717, + "learning_rate": 0.00015721600459844468, + "loss": 0.6848, + "step": 409 + }, + { + "epoch": 0.328, + "grad_norm": 0.400008581777007, + "learning_rate": 0.00015700322582137827, + "loss": 0.7318, + "step": 410 + }, + { + "epoch": 0.3288, + "grad_norm": 0.3811652194716379, + "learning_rate": 0.00015679006404879033, + "loss": 0.7159, + "step": 411 + }, + { + "epoch": 0.3296, + "grad_norm": 0.35730391502884745, + "learning_rate": 0.0001565765207128805, + "loss": 0.6486, + "step": 412 + }, + { + "epoch": 0.3304, + "grad_norm": 0.4324338486262784, + "learning_rate": 0.00015636259724841222, + "loss": 0.7947, + "step": 413 + }, + { + "epoch": 0.3312, + "grad_norm": 0.4545592323947514, + "learning_rate": 0.0001561482950927029, + "loss": 0.7782, + "step": 414 + }, + { + "epoch": 0.332, + "grad_norm": 0.42678151167414735, + "learning_rate": 0.00015593361568561428, + "loss": 0.8018, + "step": 415 + }, + { + "epoch": 0.3328, + "grad_norm": 0.39316885399720736, + "learning_rate": 0.00015571856046954285, + "loss": 0.7533, + "step": 416 + }, + { + "epoch": 0.3336, + "grad_norm": 0.40953597766510147, + "learning_rate": 0.0001555031308894101, + "loss": 0.7312, + "step": 417 + }, + { + "epoch": 0.3344, + "grad_norm": 0.3628151835596889, + "learning_rate": 0.00015528732839265272, + "loss": 0.6925, + "step": 418 + }, + { + "epoch": 0.3352, + "grad_norm": 0.3630247652160497, + "learning_rate": 0.0001550711544292131, + "loss": 0.6087, + "step": 419 + }, + { + "epoch": 0.336, + "grad_norm": 0.4076144904954131, + "learning_rate": 0.0001548546104515294, + "loss": 0.7285, + "step": 420 + }, + { + "epoch": 0.3368, + "grad_norm": 0.3948107855238599, + "learning_rate": 0.00015463769791452574, + "loss": 0.7689, + "step": 421 + }, + { + "epoch": 0.3376, + "grad_norm": 0.4045910835927847, + "learning_rate": 0.00015442041827560274, + "loss": 0.7174, + "step": 422 + }, + { + "epoch": 0.3384, + "grad_norm": 0.42784470226134785, + "learning_rate": 0.00015420277299462736, + "loss": 0.782, + "step": 423 + }, + { + "epoch": 0.3392, + "grad_norm": 0.42261162570750493, + "learning_rate": 0.00015398476353392323, + "loss": 0.7867, + "step": 424 + }, + { + "epoch": 0.34, + "grad_norm": 0.37034688847703967, + "learning_rate": 0.00015376639135826107, + "loss": 0.6975, + "step": 425 + }, + { + "epoch": 0.3408, + "grad_norm": 0.37709320180579986, + "learning_rate": 0.00015354765793484834, + "loss": 0.7107, + "step": 426 + }, + { + "epoch": 0.3416, + "grad_norm": 0.39874988129787453, + "learning_rate": 0.00015332856473331978, + "loss": 0.7501, + "step": 427 + }, + { + "epoch": 0.3424, + "grad_norm": 0.38523765832533496, + "learning_rate": 0.00015310911322572753, + "loss": 0.7129, + "step": 428 + }, + { + "epoch": 0.3432, + "grad_norm": 0.39536564455432827, + "learning_rate": 0.00015288930488653094, + "loss": 0.7266, + "step": 429 + }, + { + "epoch": 0.344, + "grad_norm": 0.3463850559775383, + "learning_rate": 0.000152669141192587, + "loss": 0.6834, + "step": 430 + }, + { + "epoch": 0.3448, + "grad_norm": 0.46367798778196645, + "learning_rate": 0.0001524486236231402, + "loss": 0.7669, + "step": 431 + }, + { + "epoch": 0.3456, + "grad_norm": 0.38663458908064, + "learning_rate": 0.00015222775365981273, + "loss": 0.7596, + "step": 432 + }, + { + "epoch": 0.3464, + "grad_norm": 0.41586262767094645, + "learning_rate": 0.00015200653278659432, + "loss": 0.7199, + "step": 433 + }, + { + "epoch": 0.3472, + "grad_norm": 0.46103511895719435, + "learning_rate": 0.00015178496248983254, + "loss": 0.8416, + "step": 434 + }, + { + "epoch": 0.348, + "grad_norm": 0.4184428407337638, + "learning_rate": 0.00015156304425822267, + "loss": 0.7527, + "step": 435 + }, + { + "epoch": 0.3488, + "grad_norm": 0.49855699805314946, + "learning_rate": 0.00015134077958279765, + "loss": 0.8474, + "step": 436 + }, + { + "epoch": 0.3496, + "grad_norm": 0.42845886618076534, + "learning_rate": 0.00015111816995691809, + "loss": 0.8049, + "step": 437 + }, + { + "epoch": 0.3504, + "grad_norm": 0.4530175233083066, + "learning_rate": 0.00015089521687626243, + "loss": 0.7144, + "step": 438 + }, + { + "epoch": 0.3512, + "grad_norm": 0.41075480954547927, + "learning_rate": 0.00015067192183881658, + "loss": 0.7384, + "step": 439 + }, + { + "epoch": 0.352, + "grad_norm": 0.40708334713709043, + "learning_rate": 0.000150448286344864, + "loss": 0.7652, + "step": 440 + }, + { + "epoch": 0.3528, + "grad_norm": 0.5818509229326395, + "learning_rate": 0.00015022431189697568, + "loss": 0.8631, + "step": 441 + }, + { + "epoch": 0.3536, + "grad_norm": 0.42118137569346126, + "learning_rate": 0.00015000000000000001, + "loss": 0.7961, + "step": 442 + }, + { + "epoch": 0.3544, + "grad_norm": 0.41010451650558516, + "learning_rate": 0.0001497753521610526, + "loss": 0.7229, + "step": 443 + }, + { + "epoch": 0.3552, + "grad_norm": 0.423125162993934, + "learning_rate": 0.00014955036988950618, + "loss": 0.7648, + "step": 444 + }, + { + "epoch": 0.356, + "grad_norm": 0.40099758305903915, + "learning_rate": 0.00014932505469698052, + "loss": 0.7493, + "step": 445 + }, + { + "epoch": 0.3568, + "grad_norm": 0.40085301663186823, + "learning_rate": 0.00014909940809733222, + "loss": 0.7125, + "step": 446 + }, + { + "epoch": 0.3576, + "grad_norm": 0.3510211250094017, + "learning_rate": 0.0001488734316066446, + "loss": 0.6731, + "step": 447 + }, + { + "epoch": 0.3584, + "grad_norm": 0.37050479877114534, + "learning_rate": 0.00014864712674321734, + "loss": 0.7004, + "step": 448 + }, + { + "epoch": 0.3592, + "grad_norm": 0.4333091776481077, + "learning_rate": 0.0001484204950275565, + "loss": 0.7667, + "step": 449 + }, + { + "epoch": 0.36, + "grad_norm": 0.4226511054807109, + "learning_rate": 0.00014819353798236427, + "loss": 0.7829, + "step": 450 + }, + { + "epoch": 0.3608, + "grad_norm": 0.4081592062701581, + "learning_rate": 0.00014796625713252848, + "loss": 0.7048, + "step": 451 + }, + { + "epoch": 0.3616, + "grad_norm": 0.423554063227864, + "learning_rate": 0.00014773865400511272, + "loss": 0.7994, + "step": 452 + }, + { + "epoch": 0.3624, + "grad_norm": 0.3743142771417236, + "learning_rate": 0.00014751073012934587, + "loss": 0.6988, + "step": 453 + }, + { + "epoch": 0.3632, + "grad_norm": 0.44351252150762505, + "learning_rate": 0.00014728248703661182, + "loss": 0.8442, + "step": 454 + }, + { + "epoch": 0.364, + "grad_norm": 0.44646063750669057, + "learning_rate": 0.0001470539262604393, + "loss": 0.7579, + "step": 455 + }, + { + "epoch": 0.3648, + "grad_norm": 0.3723488079722956, + "learning_rate": 0.00014682504933649144, + "loss": 0.7201, + "step": 456 + }, + { + "epoch": 0.3656, + "grad_norm": 0.4129286246507839, + "learning_rate": 0.00014659585780255556, + "loss": 0.7467, + "step": 457 + }, + { + "epoch": 0.3664, + "grad_norm": 0.43773886229163683, + "learning_rate": 0.00014636635319853275, + "loss": 0.7748, + "step": 458 + }, + { + "epoch": 0.3672, + "grad_norm": 0.412176687132668, + "learning_rate": 0.0001461365370664276, + "loss": 0.7605, + "step": 459 + }, + { + "epoch": 0.368, + "grad_norm": 0.41105016474383405, + "learning_rate": 0.00014590641095033787, + "loss": 0.7548, + "step": 460 + }, + { + "epoch": 0.3688, + "grad_norm": 0.3859611255262852, + "learning_rate": 0.00014567597639644387, + "loss": 0.7217, + "step": 461 + }, + { + "epoch": 0.3696, + "grad_norm": 0.3970615939573622, + "learning_rate": 0.00014544523495299842, + "loss": 0.76, + "step": 462 + }, + { + "epoch": 0.3704, + "grad_norm": 0.4677850684979814, + "learning_rate": 0.00014521418817031628, + "loss": 0.7227, + "step": 463 + }, + { + "epoch": 0.3712, + "grad_norm": 0.41074224966622186, + "learning_rate": 0.0001449828376007636, + "loss": 0.7136, + "step": 464 + }, + { + "epoch": 0.372, + "grad_norm": 0.4252016612544697, + "learning_rate": 0.00014475118479874774, + "loss": 0.7487, + "step": 465 + }, + { + "epoch": 0.3728, + "grad_norm": 0.37404689361923105, + "learning_rate": 0.0001445192313207067, + "loss": 0.7631, + "step": 466 + }, + { + "epoch": 0.3736, + "grad_norm": 0.4090939135398797, + "learning_rate": 0.0001442869787250987, + "loss": 0.7462, + "step": 467 + }, + { + "epoch": 0.3744, + "grad_norm": 0.4369664465142497, + "learning_rate": 0.0001440544285723915, + "loss": 0.7877, + "step": 468 + }, + { + "epoch": 0.3752, + "grad_norm": 0.40761629202712885, + "learning_rate": 0.00014382158242505234, + "loss": 0.6905, + "step": 469 + }, + { + "epoch": 0.376, + "grad_norm": 0.41875378199551183, + "learning_rate": 0.00014358844184753712, + "loss": 0.7472, + "step": 470 + }, + { + "epoch": 0.3768, + "grad_norm": 0.8358439241813403, + "learning_rate": 0.00014335500840627986, + "loss": 0.7236, + "step": 471 + }, + { + "epoch": 0.3776, + "grad_norm": 0.46388968370700895, + "learning_rate": 0.00014312128366968243, + "loss": 0.7863, + "step": 472 + }, + { + "epoch": 0.3784, + "grad_norm": 0.39835167876802036, + "learning_rate": 0.0001428872692081038, + "loss": 0.7549, + "step": 473 + }, + { + "epoch": 0.3792, + "grad_norm": 0.38420987796805284, + "learning_rate": 0.00014265296659384956, + "loss": 0.6909, + "step": 474 + }, + { + "epoch": 0.38, + "grad_norm": 0.4463675089401714, + "learning_rate": 0.00014241837740116132, + "loss": 0.7249, + "step": 475 + }, + { + "epoch": 0.3808, + "grad_norm": 0.3659560409886909, + "learning_rate": 0.00014218350320620624, + "loss": 0.6831, + "step": 476 + }, + { + "epoch": 0.3816, + "grad_norm": 0.41133170803453634, + "learning_rate": 0.00014194834558706632, + "loss": 0.7541, + "step": 477 + }, + { + "epoch": 0.3824, + "grad_norm": 0.42236590760206644, + "learning_rate": 0.0001417129061237278, + "loss": 0.7408, + "step": 478 + }, + { + "epoch": 0.3832, + "grad_norm": 0.3837860015366334, + "learning_rate": 0.0001414771863980707, + "loss": 0.7473, + "step": 479 + }, + { + "epoch": 0.384, + "grad_norm": 0.5160320691639759, + "learning_rate": 0.00014124118799385796, + "loss": 0.8562, + "step": 480 + }, + { + "epoch": 0.3848, + "grad_norm": 0.3849480248408463, + "learning_rate": 0.00014100491249672498, + "loss": 0.7148, + "step": 481 + }, + { + "epoch": 0.3856, + "grad_norm": 0.3941590303338292, + "learning_rate": 0.00014076836149416887, + "loss": 0.7586, + "step": 482 + }, + { + "epoch": 0.3864, + "grad_norm": 0.38721973528747744, + "learning_rate": 0.0001405315365755379, + "loss": 0.7846, + "step": 483 + }, + { + "epoch": 0.3872, + "grad_norm": 0.40064782545917166, + "learning_rate": 0.0001402944393320206, + "loss": 0.7568, + "step": 484 + }, + { + "epoch": 0.388, + "grad_norm": 0.39338870914971513, + "learning_rate": 0.00014005707135663527, + "loss": 0.7185, + "step": 485 + }, + { + "epoch": 0.3888, + "grad_norm": 0.39029507130944646, + "learning_rate": 0.00013981943424421932, + "loss": 0.6981, + "step": 486 + }, + { + "epoch": 0.3896, + "grad_norm": 0.40867482731944854, + "learning_rate": 0.00013958152959141825, + "loss": 0.6855, + "step": 487 + }, + { + "epoch": 0.3904, + "grad_norm": 0.4382076380381577, + "learning_rate": 0.00013934335899667527, + "loss": 0.75, + "step": 488 + }, + { + "epoch": 0.3912, + "grad_norm": 0.4270498186237226, + "learning_rate": 0.00013910492406022033, + "loss": 0.7405, + "step": 489 + }, + { + "epoch": 0.392, + "grad_norm": 0.45075032116777786, + "learning_rate": 0.00013886622638405952, + "loss": 0.6953, + "step": 490 + }, + { + "epoch": 0.3928, + "grad_norm": 0.38898341483309623, + "learning_rate": 0.0001386272675719642, + "loss": 0.6565, + "step": 491 + }, + { + "epoch": 0.3936, + "grad_norm": 0.4033537483239289, + "learning_rate": 0.00013838804922946027, + "loss": 0.7429, + "step": 492 + }, + { + "epoch": 0.3944, + "grad_norm": 0.42999158597516535, + "learning_rate": 0.00013814857296381728, + "loss": 0.729, + "step": 493 + }, + { + "epoch": 0.3952, + "grad_norm": 0.38974476676785047, + "learning_rate": 0.00013790884038403795, + "loss": 0.6573, + "step": 494 + }, + { + "epoch": 0.396, + "grad_norm": 0.3961133656085115, + "learning_rate": 0.00013766885310084688, + "loss": 0.6803, + "step": 495 + }, + { + "epoch": 0.3968, + "grad_norm": 0.3823908001357277, + "learning_rate": 0.00013742861272668012, + "loss": 0.685, + "step": 496 + }, + { + "epoch": 0.3976, + "grad_norm": 0.49877993731022596, + "learning_rate": 0.00013718812087567414, + "loss": 0.8469, + "step": 497 + }, + { + "epoch": 0.3984, + "grad_norm": 0.42866883973871944, + "learning_rate": 0.00013694737916365517, + "loss": 0.7263, + "step": 498 + }, + { + "epoch": 0.3992, + "grad_norm": 0.4012134571717772, + "learning_rate": 0.000136706389208128, + "loss": 0.7376, + "step": 499 + }, + { + "epoch": 0.4, + "grad_norm": 0.40274292876935464, + "learning_rate": 0.00013646515262826552, + "loss": 0.7137, + "step": 500 + }, + { + "epoch": 0.4008, + "grad_norm": 0.3890943884589063, + "learning_rate": 0.00013622367104489756, + "loss": 0.7607, + "step": 501 + }, + { + "epoch": 0.4016, + "grad_norm": 0.41888096021420373, + "learning_rate": 0.0001359819460805001, + "loss": 0.8449, + "step": 502 + }, + { + "epoch": 0.4024, + "grad_norm": 0.39795135596529974, + "learning_rate": 0.0001357399793591844, + "loss": 0.7654, + "step": 503 + }, + { + "epoch": 0.4032, + "grad_norm": 0.4176974400501837, + "learning_rate": 0.0001354977725066859, + "loss": 0.7507, + "step": 504 + }, + { + "epoch": 0.404, + "grad_norm": 0.4290373283668271, + "learning_rate": 0.00013525532715035366, + "loss": 0.7141, + "step": 505 + }, + { + "epoch": 0.4048, + "grad_norm": 0.3788056047338438, + "learning_rate": 0.00013501264491913906, + "loss": 0.6782, + "step": 506 + }, + { + "epoch": 0.4056, + "grad_norm": 0.4030490255452256, + "learning_rate": 0.00013476972744358507, + "loss": 0.7107, + "step": 507 + }, + { + "epoch": 0.4064, + "grad_norm": 0.8704552300643414, + "learning_rate": 0.0001345265763558152, + "loss": 0.7185, + "step": 508 + }, + { + "epoch": 0.4072, + "grad_norm": 0.4022264828473569, + "learning_rate": 0.00013428319328952253, + "loss": 0.7202, + "step": 509 + }, + { + "epoch": 0.408, + "grad_norm": 0.4229430400668877, + "learning_rate": 0.00013403957987995882, + "loss": 0.7891, + "step": 510 + }, + { + "epoch": 0.4088, + "grad_norm": 0.4231583162337773, + "learning_rate": 0.0001337957377639235, + "loss": 0.6874, + "step": 511 + }, + { + "epoch": 0.4096, + "grad_norm": 0.43153948757655713, + "learning_rate": 0.0001335516685797525, + "loss": 0.6786, + "step": 512 + }, + { + "epoch": 0.4104, + "grad_norm": 0.4198059022745376, + "learning_rate": 0.0001333073739673076, + "loss": 0.7531, + "step": 513 + }, + { + "epoch": 0.4112, + "grad_norm": 0.3811705461151831, + "learning_rate": 0.00013306285556796495, + "loss": 0.6858, + "step": 514 + }, + { + "epoch": 0.412, + "grad_norm": 0.4190940859985535, + "learning_rate": 0.0001328181150246045, + "loss": 0.7143, + "step": 515 + }, + { + "epoch": 0.4128, + "grad_norm": 0.4536687433193188, + "learning_rate": 0.00013257315398159864, + "loss": 0.8082, + "step": 516 + }, + { + "epoch": 0.4136, + "grad_norm": 0.41071086963109465, + "learning_rate": 0.00013232797408480127, + "loss": 0.7351, + "step": 517 + }, + { + "epoch": 0.4144, + "grad_norm": 0.47882365797373727, + "learning_rate": 0.00013208257698153677, + "loss": 0.7702, + "step": 518 + }, + { + "epoch": 0.4152, + "grad_norm": 0.38295352477242467, + "learning_rate": 0.00013183696432058888, + "loss": 0.6919, + "step": 519 + }, + { + "epoch": 0.416, + "grad_norm": 0.42881355586122355, + "learning_rate": 0.00013159113775218964, + "loss": 0.7571, + "step": 520 + }, + { + "epoch": 0.4168, + "grad_norm": 0.38449068440791867, + "learning_rate": 0.00013134509892800822, + "loss": 0.6772, + "step": 521 + }, + { + "epoch": 0.4176, + "grad_norm": 0.5086204503020627, + "learning_rate": 0.00013109884950114007, + "loss": 0.8256, + "step": 522 + }, + { + "epoch": 0.4184, + "grad_norm": 0.4301701690919157, + "learning_rate": 0.00013085239112609547, + "loss": 0.8075, + "step": 523 + }, + { + "epoch": 0.4192, + "grad_norm": 0.4058292645531843, + "learning_rate": 0.00013060572545878875, + "loss": 0.7181, + "step": 524 + }, + { + "epoch": 0.42, + "grad_norm": 0.3537948967625652, + "learning_rate": 0.00013035885415652685, + "loss": 0.674, + "step": 525 + }, + { + "epoch": 0.4208, + "grad_norm": 0.3804409614599797, + "learning_rate": 0.00013011177887799845, + "loss": 0.7096, + "step": 526 + }, + { + "epoch": 0.4216, + "grad_norm": 0.3973976042378748, + "learning_rate": 0.00012986450128326266, + "loss": 0.6934, + "step": 527 + }, + { + "epoch": 0.4224, + "grad_norm": 0.4050141353906074, + "learning_rate": 0.00012961702303373795, + "loss": 0.757, + "step": 528 + }, + { + "epoch": 0.4232, + "grad_norm": 0.3986132692795599, + "learning_rate": 0.00012936934579219094, + "loss": 0.6967, + "step": 529 + }, + { + "epoch": 0.424, + "grad_norm": 0.40584716553831934, + "learning_rate": 0.00012912147122272523, + "loss": 0.7235, + "step": 530 + }, + { + "epoch": 0.4248, + "grad_norm": 0.40044297473824103, + "learning_rate": 0.00012887340099077024, + "loss": 0.758, + "step": 531 + }, + { + "epoch": 0.4256, + "grad_norm": 0.3848338429889167, + "learning_rate": 0.00012862513676307008, + "loss": 0.7297, + "step": 532 + }, + { + "epoch": 0.4264, + "grad_norm": 0.42139677544301846, + "learning_rate": 0.0001283766802076722, + "loss": 0.7165, + "step": 533 + }, + { + "epoch": 0.4272, + "grad_norm": 0.36554253586630076, + "learning_rate": 0.00012812803299391628, + "loss": 0.6707, + "step": 534 + }, + { + "epoch": 0.428, + "grad_norm": 0.36826074350319077, + "learning_rate": 0.00012787919679242306, + "loss": 0.6653, + "step": 535 + }, + { + "epoch": 0.4288, + "grad_norm": 0.4373697897927393, + "learning_rate": 0.00012763017327508305, + "loss": 0.7336, + "step": 536 + }, + { + "epoch": 0.4296, + "grad_norm": 0.4201047732025366, + "learning_rate": 0.00012738096411504522, + "loss": 0.7428, + "step": 537 + }, + { + "epoch": 0.4304, + "grad_norm": 0.38396768744068105, + "learning_rate": 0.0001271315709867059, + "loss": 0.6727, + "step": 538 + }, + { + "epoch": 0.4312, + "grad_norm": 0.3730930808471466, + "learning_rate": 0.00012688199556569753, + "loss": 0.6945, + "step": 539 + }, + { + "epoch": 0.432, + "grad_norm": 0.42098927252534707, + "learning_rate": 0.00012663223952887723, + "loss": 0.744, + "step": 540 + }, + { + "epoch": 0.4328, + "grad_norm": 0.3951808184820184, + "learning_rate": 0.0001263823045543158, + "loss": 0.7275, + "step": 541 + }, + { + "epoch": 0.4336, + "grad_norm": 0.41872967335722294, + "learning_rate": 0.00012613219232128608, + "loss": 0.7741, + "step": 542 + }, + { + "epoch": 0.4344, + "grad_norm": 0.4282331565942988, + "learning_rate": 0.00012588190451025207, + "loss": 0.6753, + "step": 543 + }, + { + "epoch": 0.4352, + "grad_norm": 0.4366654761596964, + "learning_rate": 0.00012563144280285741, + "loss": 0.7224, + "step": 544 + }, + { + "epoch": 0.436, + "grad_norm": 0.4502231199852326, + "learning_rate": 0.00012538080888191408, + "loss": 0.869, + "step": 545 + }, + { + "epoch": 0.4368, + "grad_norm": 0.35101016716715344, + "learning_rate": 0.00012513000443139112, + "loss": 0.6528, + "step": 546 + }, + { + "epoch": 0.4376, + "grad_norm": 0.46211253437428196, + "learning_rate": 0.00012487903113640337, + "loss": 0.7658, + "step": 547 + }, + { + "epoch": 0.4384, + "grad_norm": 0.4179053401194437, + "learning_rate": 0.00012462789068320017, + "loss": 0.7627, + "step": 548 + }, + { + "epoch": 0.4392, + "grad_norm": 0.42304596138171013, + "learning_rate": 0.00012437658475915377, + "loss": 0.7285, + "step": 549 + }, + { + "epoch": 0.44, + "grad_norm": 0.38481247196526547, + "learning_rate": 0.00012412511505274844, + "loss": 0.724, + "step": 550 + }, + { + "epoch": 0.4408, + "grad_norm": 0.43738464403250726, + "learning_rate": 0.00012387348325356874, + "loss": 0.781, + "step": 551 + }, + { + "epoch": 0.4416, + "grad_norm": 0.40584209275053346, + "learning_rate": 0.00012362169105228826, + "loss": 0.7206, + "step": 552 + }, + { + "epoch": 0.4424, + "grad_norm": 0.3855582382310194, + "learning_rate": 0.00012336974014065844, + "loss": 0.6935, + "step": 553 + }, + { + "epoch": 0.4432, + "grad_norm": 0.3673208093686153, + "learning_rate": 0.000123117632211497, + "loss": 0.6593, + "step": 554 + }, + { + "epoch": 0.444, + "grad_norm": 0.41615163514555564, + "learning_rate": 0.00012286536895867654, + "loss": 0.674, + "step": 555 + }, + { + "epoch": 0.4448, + "grad_norm": 0.4624231812328161, + "learning_rate": 0.00012261295207711346, + "loss": 0.7679, + "step": 556 + }, + { + "epoch": 0.4456, + "grad_norm": 0.39163698830776683, + "learning_rate": 0.00012236038326275626, + "loss": 0.7186, + "step": 557 + }, + { + "epoch": 0.4464, + "grad_norm": 0.3696327093662802, + "learning_rate": 0.0001221076642125742, + "loss": 0.7175, + "step": 558 + }, + { + "epoch": 0.4472, + "grad_norm": 0.3721727885224274, + "learning_rate": 0.00012185479662454595, + "loss": 0.6454, + "step": 559 + }, + { + "epoch": 0.448, + "grad_norm": 0.4738677251372763, + "learning_rate": 0.00012160178219764837, + "loss": 0.7833, + "step": 560 + }, + { + "epoch": 0.4488, + "grad_norm": 0.43173794439134333, + "learning_rate": 0.00012134862263184467, + "loss": 0.7576, + "step": 561 + }, + { + "epoch": 0.4496, + "grad_norm": 0.45922038884476846, + "learning_rate": 0.00012109531962807332, + "loss": 0.8029, + "step": 562 + }, + { + "epoch": 0.4504, + "grad_norm": 0.4646870984818398, + "learning_rate": 0.00012084187488823657, + "loss": 0.7145, + "step": 563 + }, + { + "epoch": 0.4512, + "grad_norm": 0.3777597816259183, + "learning_rate": 0.00012058829011518896, + "loss": 0.6997, + "step": 564 + }, + { + "epoch": 0.452, + "grad_norm": 0.36089691320765555, + "learning_rate": 0.00012033456701272576, + "loss": 0.649, + "step": 565 + }, + { + "epoch": 0.4528, + "grad_norm": 0.3973909913995549, + "learning_rate": 0.00012008070728557186, + "loss": 0.7399, + "step": 566 + }, + { + "epoch": 0.4536, + "grad_norm": 0.44075193787463834, + "learning_rate": 0.00011982671263936995, + "loss": 0.6699, + "step": 567 + }, + { + "epoch": 0.4544, + "grad_norm": 0.38675999472414196, + "learning_rate": 0.00011957258478066931, + "loss": 0.6949, + "step": 568 + }, + { + "epoch": 0.4552, + "grad_norm": 0.3825013397690802, + "learning_rate": 0.00011931832541691418, + "loss": 0.7111, + "step": 569 + }, + { + "epoch": 0.456, + "grad_norm": 0.400911462526282, + "learning_rate": 0.00011906393625643244, + "loss": 0.7051, + "step": 570 + }, + { + "epoch": 0.4568, + "grad_norm": 0.4139699698844406, + "learning_rate": 0.00011880941900842397, + "loss": 0.7686, + "step": 571 + }, + { + "epoch": 0.4576, + "grad_norm": 0.43175536022693406, + "learning_rate": 0.00011855477538294935, + "loss": 0.7591, + "step": 572 + }, + { + "epoch": 0.4584, + "grad_norm": 0.4230263494240272, + "learning_rate": 0.00011830000709091815, + "loss": 0.6988, + "step": 573 + }, + { + "epoch": 0.4592, + "grad_norm": 0.3737762850727218, + "learning_rate": 0.00011804511584407763, + "loss": 0.6842, + "step": 574 + }, + { + "epoch": 0.46, + "grad_norm": 0.38964413678090243, + "learning_rate": 0.0001177901033550012, + "loss": 0.743, + "step": 575 + }, + { + "epoch": 0.4608, + "grad_norm": 0.37543935689794156, + "learning_rate": 0.00011753497133707679, + "loss": 0.7152, + "step": 576 + }, + { + "epoch": 0.4616, + "grad_norm": 0.43321509500863586, + "learning_rate": 0.00011727972150449544, + "loss": 0.697, + "step": 577 + }, + { + "epoch": 0.4624, + "grad_norm": 0.4303903400259315, + "learning_rate": 0.00011702435557223987, + "loss": 0.6405, + "step": 578 + }, + { + "epoch": 0.4632, + "grad_norm": 0.3760923694217793, + "learning_rate": 0.00011676887525607271, + "loss": 0.7234, + "step": 579 + }, + { + "epoch": 0.464, + "grad_norm": 0.3736237674593301, + "learning_rate": 0.00011651328227252517, + "loss": 0.6894, + "step": 580 + }, + { + "epoch": 0.4648, + "grad_norm": 0.394451489947489, + "learning_rate": 0.00011625757833888551, + "loss": 0.663, + "step": 581 + }, + { + "epoch": 0.4656, + "grad_norm": 0.4647786289303067, + "learning_rate": 0.00011600176517318741, + "loss": 0.7257, + "step": 582 + }, + { + "epoch": 0.4664, + "grad_norm": 0.3922140990005979, + "learning_rate": 0.0001157458444941984, + "loss": 0.7071, + "step": 583 + }, + { + "epoch": 0.4672, + "grad_norm": 0.4290632195123379, + "learning_rate": 0.00011548981802140848, + "loss": 0.7473, + "step": 584 + }, + { + "epoch": 0.468, + "grad_norm": 0.4060539612859277, + "learning_rate": 0.00011523368747501839, + "loss": 0.6824, + "step": 585 + }, + { + "epoch": 0.4688, + "grad_norm": 0.4227461339262886, + "learning_rate": 0.00011497745457592816, + "loss": 0.7516, + "step": 586 + }, + { + "epoch": 0.4696, + "grad_norm": 0.4431357699775886, + "learning_rate": 0.00011472112104572547, + "loss": 0.7619, + "step": 587 + }, + { + "epoch": 0.4704, + "grad_norm": 0.4093628115550298, + "learning_rate": 0.00011446468860667421, + "loss": 0.7209, + "step": 588 + }, + { + "epoch": 0.4712, + "grad_norm": 0.37441216848578235, + "learning_rate": 0.0001142081589817027, + "loss": 0.6471, + "step": 589 + }, + { + "epoch": 0.472, + "grad_norm": 0.45421403580973224, + "learning_rate": 0.00011395153389439233, + "loss": 0.7773, + "step": 590 + }, + { + "epoch": 0.4728, + "grad_norm": 0.37713032856522577, + "learning_rate": 0.00011369481506896582, + "loss": 0.6931, + "step": 591 + }, + { + "epoch": 0.4736, + "grad_norm": 0.4300256782238676, + "learning_rate": 0.00011343800423027582, + "loss": 0.7121, + "step": 592 + }, + { + "epoch": 0.4744, + "grad_norm": 0.4264632541041666, + "learning_rate": 0.00011318110310379301, + "loss": 0.7352, + "step": 593 + }, + { + "epoch": 0.4752, + "grad_norm": 0.45548600133711703, + "learning_rate": 0.0001129241134155949, + "loss": 0.7462, + "step": 594 + }, + { + "epoch": 0.476, + "grad_norm": 0.3690942812425331, + "learning_rate": 0.00011266703689235394, + "loss": 0.7231, + "step": 595 + }, + { + "epoch": 0.4768, + "grad_norm": 0.3999966576574989, + "learning_rate": 0.00011240987526132594, + "loss": 0.7114, + "step": 596 + }, + { + "epoch": 0.4776, + "grad_norm": 0.38948232123577514, + "learning_rate": 0.00011215263025033869, + "loss": 0.7409, + "step": 597 + }, + { + "epoch": 0.4784, + "grad_norm": 0.481230685207155, + "learning_rate": 0.00011189530358778005, + "loss": 0.6837, + "step": 598 + }, + { + "epoch": 0.4792, + "grad_norm": 0.4503212121059997, + "learning_rate": 0.00011163789700258655, + "loss": 0.7794, + "step": 599 + }, + { + "epoch": 0.48, + "grad_norm": 0.36614189932253133, + "learning_rate": 0.00011138041222423177, + "loss": 0.6793, + "step": 600 + }, + { + "epoch": 0.4808, + "grad_norm": 0.46491602621791206, + "learning_rate": 0.00011112285098271451, + "loss": 0.7656, + "step": 601 + }, + { + "epoch": 0.4816, + "grad_norm": 0.39604886431863673, + "learning_rate": 0.00011086521500854745, + "loss": 0.737, + "step": 602 + }, + { + "epoch": 0.4824, + "grad_norm": 0.42064628624762074, + "learning_rate": 0.00011060750603274535, + "loss": 0.7578, + "step": 603 + }, + { + "epoch": 0.4832, + "grad_norm": 0.4254702982415326, + "learning_rate": 0.00011034972578681338, + "loss": 0.7621, + "step": 604 + }, + { + "epoch": 0.484, + "grad_norm": 0.40597402249866665, + "learning_rate": 0.00011009187600273566, + "loss": 0.7541, + "step": 605 + }, + { + "epoch": 0.4848, + "grad_norm": 0.3608489448693908, + "learning_rate": 0.00010983395841296348, + "loss": 0.7173, + "step": 606 + }, + { + "epoch": 0.4856, + "grad_norm": 0.4091979039360563, + "learning_rate": 0.00010957597475040373, + "loss": 0.7701, + "step": 607 + }, + { + "epoch": 0.4864, + "grad_norm": 0.3460555035826413, + "learning_rate": 0.00010931792674840718, + "loss": 0.629, + "step": 608 + }, + { + "epoch": 0.4872, + "grad_norm": 0.42243337211845133, + "learning_rate": 0.00010905981614075693, + "loss": 0.7449, + "step": 609 + }, + { + "epoch": 0.488, + "grad_norm": 0.39767090591192195, + "learning_rate": 0.00010880164466165674, + "loss": 0.7283, + "step": 610 + }, + { + "epoch": 0.4888, + "grad_norm": 0.45863817731004536, + "learning_rate": 0.00010854341404571928, + "loss": 0.7254, + "step": 611 + }, + { + "epoch": 0.4896, + "grad_norm": 0.35430390957391245, + "learning_rate": 0.00010828512602795462, + "loss": 0.6686, + "step": 612 + }, + { + "epoch": 0.4904, + "grad_norm": 0.367535141832263, + "learning_rate": 0.00010802678234375851, + "loss": 0.6957, + "step": 613 + }, + { + "epoch": 0.4912, + "grad_norm": 0.4299938590504528, + "learning_rate": 0.00010776838472890065, + "loss": 0.7537, + "step": 614 + }, + { + "epoch": 0.492, + "grad_norm": 0.37255287859587105, + "learning_rate": 0.0001075099349195131, + "loss": 0.6413, + "step": 615 + }, + { + "epoch": 0.4928, + "grad_norm": 0.4205287420555947, + "learning_rate": 0.00010725143465207867, + "loss": 0.7039, + "step": 616 + }, + { + "epoch": 0.4936, + "grad_norm": 0.36889169736730626, + "learning_rate": 0.00010699288566341914, + "loss": 0.6985, + "step": 617 + }, + { + "epoch": 0.4944, + "grad_norm": 0.4066798466103115, + "learning_rate": 0.00010673428969068364, + "loss": 0.6832, + "step": 618 + }, + { + "epoch": 0.4952, + "grad_norm": 0.38308578601888854, + "learning_rate": 0.000106475648471337, + "loss": 0.7013, + "step": 619 + }, + { + "epoch": 0.496, + "grad_norm": 0.36880126553715525, + "learning_rate": 0.00010621696374314807, + "loss": 0.6992, + "step": 620 + }, + { + "epoch": 0.4968, + "grad_norm": 0.44451092597637976, + "learning_rate": 0.00010595823724417795, + "loss": 0.7202, + "step": 621 + }, + { + "epoch": 0.4976, + "grad_norm": 0.3716178286616881, + "learning_rate": 0.00010569947071276847, + "loss": 0.6519, + "step": 622 + }, + { + "epoch": 0.4984, + "grad_norm": 0.4426458653806619, + "learning_rate": 0.00010544066588753044, + "loss": 0.705, + "step": 623 + }, + { + "epoch": 0.4992, + "grad_norm": 0.37895263210753366, + "learning_rate": 0.00010518182450733186, + "loss": 0.7287, + "step": 624 + }, + { + "epoch": 0.5, + "grad_norm": 0.3999977558858813, + "learning_rate": 0.00010492294831128641, + "loss": 0.6768, + "step": 625 + }, + { + "epoch": 0.5008, + "grad_norm": 0.3956912810776047, + "learning_rate": 0.00010466403903874176, + "loss": 0.7209, + "step": 626 + }, + { + "epoch": 0.5016, + "grad_norm": 0.4424455275777821, + "learning_rate": 0.00010440509842926767, + "loss": 0.7579, + "step": 627 + }, + { + "epoch": 0.5024, + "grad_norm": 0.4673803319588536, + "learning_rate": 0.00010414612822264455, + "loss": 0.7861, + "step": 628 + }, + { + "epoch": 0.5032, + "grad_norm": 0.39630784564816757, + "learning_rate": 0.00010388713015885161, + "loss": 0.725, + "step": 629 + }, + { + "epoch": 0.504, + "grad_norm": 0.3804278965686192, + "learning_rate": 0.00010362810597805526, + "loss": 0.7224, + "step": 630 + }, + { + "epoch": 0.5048, + "grad_norm": 0.31830914083965456, + "learning_rate": 0.00010336905742059742, + "loss": 0.6331, + "step": 631 + }, + { + "epoch": 0.5056, + "grad_norm": 0.42548505965409894, + "learning_rate": 0.0001031099862269837, + "loss": 0.7231, + "step": 632 + }, + { + "epoch": 0.5064, + "grad_norm": 0.3585288604599067, + "learning_rate": 0.0001028508941378719, + "loss": 0.6857, + "step": 633 + }, + { + "epoch": 0.5072, + "grad_norm": 0.33174657718323153, + "learning_rate": 0.00010259178289406011, + "loss": 0.6791, + "step": 634 + }, + { + "epoch": 0.508, + "grad_norm": 0.4060636357776109, + "learning_rate": 0.00010233265423647523, + "loss": 0.7667, + "step": 635 + }, + { + "epoch": 0.5088, + "grad_norm": 0.3905132002051727, + "learning_rate": 0.00010207350990616107, + "loss": 0.7016, + "step": 636 + }, + { + "epoch": 0.5096, + "grad_norm": 0.4109437568865212, + "learning_rate": 0.00010181435164426676, + "loss": 0.7591, + "step": 637 + }, + { + "epoch": 0.5104, + "grad_norm": 0.40342604974072854, + "learning_rate": 0.0001015551811920351, + "loss": 0.6846, + "step": 638 + }, + { + "epoch": 0.5112, + "grad_norm": 0.37153699490697906, + "learning_rate": 0.00010129600029079072, + "loss": 0.6963, + "step": 639 + }, + { + "epoch": 0.512, + "grad_norm": 0.4664907888794908, + "learning_rate": 0.00010103681068192845, + "loss": 0.781, + "step": 640 + }, + { + "epoch": 0.5128, + "grad_norm": 0.3719035890782228, + "learning_rate": 0.00010077761410690172, + "loss": 0.6836, + "step": 641 + }, + { + "epoch": 0.5136, + "grad_norm": 0.3543157408407524, + "learning_rate": 0.00010051841230721065, + "loss": 0.6417, + "step": 642 + }, + { + "epoch": 0.5144, + "grad_norm": 0.3831151097875646, + "learning_rate": 0.00010025920702439051, + "loss": 0.694, + "step": 643 + }, + { + "epoch": 0.5152, + "grad_norm": 0.38888890747319, + "learning_rate": 0.0001, + "loss": 0.7479, + "step": 644 + }, + { + "epoch": 0.516, + "grad_norm": 0.3945568364244657, + "learning_rate": 9.97407929756095e-05, + "loss": 0.7192, + "step": 645 + }, + { + "epoch": 0.5168, + "grad_norm": 0.35244019029902424, + "learning_rate": 9.948158769278939e-05, + "loss": 0.6542, + "step": 646 + }, + { + "epoch": 0.5176, + "grad_norm": 0.377560958078493, + "learning_rate": 9.92223858930983e-05, + "loss": 0.6846, + "step": 647 + }, + { + "epoch": 0.5184, + "grad_norm": 0.37257558644780747, + "learning_rate": 9.896318931807155e-05, + "loss": 0.6823, + "step": 648 + }, + { + "epoch": 0.5192, + "grad_norm": 0.38257422868362584, + "learning_rate": 9.870399970920932e-05, + "loss": 0.6866, + "step": 649 + }, + { + "epoch": 0.52, + "grad_norm": 0.4855592521266635, + "learning_rate": 9.844481880796491e-05, + "loss": 0.7963, + "step": 650 + }, + { + "epoch": 0.5208, + "grad_norm": 0.40036227247467976, + "learning_rate": 9.818564835573323e-05, + "loss": 0.6637, + "step": 651 + }, + { + "epoch": 0.5216, + "grad_norm": 0.32867665206140373, + "learning_rate": 9.792649009383899e-05, + "loss": 0.584, + "step": 652 + }, + { + "epoch": 0.5224, + "grad_norm": 0.4198925818656027, + "learning_rate": 9.766734576352478e-05, + "loss": 0.7955, + "step": 653 + }, + { + "epoch": 0.5232, + "grad_norm": 0.3972556335268183, + "learning_rate": 9.740821710593989e-05, + "loss": 0.6863, + "step": 654 + }, + { + "epoch": 0.524, + "grad_norm": 0.3887101864579285, + "learning_rate": 9.714910586212816e-05, + "loss": 0.6686, + "step": 655 + }, + { + "epoch": 0.5248, + "grad_norm": 0.3781477995356685, + "learning_rate": 9.689001377301633e-05, + "loss": 0.7068, + "step": 656 + }, + { + "epoch": 0.5256, + "grad_norm": 0.3901639415685532, + "learning_rate": 9.663094257940258e-05, + "loss": 0.7113, + "step": 657 + }, + { + "epoch": 0.5264, + "grad_norm": 0.3869702546950535, + "learning_rate": 9.637189402194476e-05, + "loss": 0.718, + "step": 658 + }, + { + "epoch": 0.5272, + "grad_norm": 0.34995862952758633, + "learning_rate": 9.611286984114841e-05, + "loss": 0.6668, + "step": 659 + }, + { + "epoch": 0.528, + "grad_norm": 0.39437970951145673, + "learning_rate": 9.585387177735547e-05, + "loss": 0.6747, + "step": 660 + }, + { + "epoch": 0.5288, + "grad_norm": 0.41130325981029686, + "learning_rate": 9.559490157073236e-05, + "loss": 0.7083, + "step": 661 + }, + { + "epoch": 0.5296, + "grad_norm": 0.3741628506410771, + "learning_rate": 9.533596096125825e-05, + "loss": 0.6863, + "step": 662 + }, + { + "epoch": 0.5304, + "grad_norm": 0.39128973461786043, + "learning_rate": 9.507705168871358e-05, + "loss": 0.7628, + "step": 663 + }, + { + "epoch": 0.5312, + "grad_norm": 0.36977609063465494, + "learning_rate": 9.481817549266817e-05, + "loss": 0.6734, + "step": 664 + }, + { + "epoch": 0.532, + "grad_norm": 0.451538422818693, + "learning_rate": 9.455933411246958e-05, + "loss": 0.7499, + "step": 665 + }, + { + "epoch": 0.5328, + "grad_norm": 0.3712987515739473, + "learning_rate": 9.430052928723153e-05, + "loss": 0.6851, + "step": 666 + }, + { + "epoch": 0.5336, + "grad_norm": 0.42219531239486613, + "learning_rate": 9.404176275582208e-05, + "loss": 0.7346, + "step": 667 + }, + { + "epoch": 0.5344, + "grad_norm": 0.43330289478955075, + "learning_rate": 9.378303625685195e-05, + "loss": 0.6971, + "step": 668 + }, + { + "epoch": 0.5352, + "grad_norm": 0.4492720289986999, + "learning_rate": 9.352435152866298e-05, + "loss": 0.7711, + "step": 669 + }, + { + "epoch": 0.536, + "grad_norm": 0.4931802042202946, + "learning_rate": 9.326571030931637e-05, + "loss": 0.8143, + "step": 670 + }, + { + "epoch": 0.5368, + "grad_norm": 0.4411906793227389, + "learning_rate": 9.300711433658087e-05, + "loss": 0.7267, + "step": 671 + }, + { + "epoch": 0.5376, + "grad_norm": 0.4670472290047497, + "learning_rate": 9.274856534792138e-05, + "loss": 0.7854, + "step": 672 + }, + { + "epoch": 0.5384, + "grad_norm": 0.42677123995377886, + "learning_rate": 9.249006508048694e-05, + "loss": 0.7881, + "step": 673 + }, + { + "epoch": 0.5392, + "grad_norm": 0.393010703318978, + "learning_rate": 9.223161527109937e-05, + "loss": 0.707, + "step": 674 + }, + { + "epoch": 0.54, + "grad_norm": 0.42520822204478687, + "learning_rate": 9.197321765624152e-05, + "loss": 0.6535, + "step": 675 + }, + { + "epoch": 0.5408, + "grad_norm": 0.3829198148572694, + "learning_rate": 9.171487397204539e-05, + "loss": 0.6402, + "step": 676 + }, + { + "epoch": 0.5416, + "grad_norm": 0.42465039688004347, + "learning_rate": 9.145658595428074e-05, + "loss": 0.7564, + "step": 677 + }, + { + "epoch": 0.5424, + "grad_norm": 0.3697197107568034, + "learning_rate": 9.119835533834331e-05, + "loss": 0.7341, + "step": 678 + }, + { + "epoch": 0.5432, + "grad_norm": 0.4356568160517684, + "learning_rate": 9.09401838592431e-05, + "loss": 0.6909, + "step": 679 + }, + { + "epoch": 0.544, + "grad_norm": 0.3887868918363411, + "learning_rate": 9.068207325159284e-05, + "loss": 0.6851, + "step": 680 + }, + { + "epoch": 0.5448, + "grad_norm": 0.41502794218844097, + "learning_rate": 9.04240252495963e-05, + "loss": 0.7092, + "step": 681 + }, + { + "epoch": 0.5456, + "grad_norm": 0.3508424005258134, + "learning_rate": 9.016604158703654e-05, + "loss": 0.6415, + "step": 682 + }, + { + "epoch": 0.5464, + "grad_norm": 0.3906391535054069, + "learning_rate": 8.990812399726435e-05, + "loss": 0.7433, + "step": 683 + }, + { + "epoch": 0.5472, + "grad_norm": 0.42460964802698714, + "learning_rate": 8.965027421318665e-05, + "loss": 0.6994, + "step": 684 + }, + { + "epoch": 0.548, + "grad_norm": 0.36260277367149235, + "learning_rate": 8.939249396725467e-05, + "loss": 0.6549, + "step": 685 + }, + { + "epoch": 0.5488, + "grad_norm": 0.4021897399655954, + "learning_rate": 8.913478499145254e-05, + "loss": 0.6354, + "step": 686 + }, + { + "epoch": 0.5496, + "grad_norm": 0.41526985094243396, + "learning_rate": 8.887714901728551e-05, + "loss": 0.7671, + "step": 687 + }, + { + "epoch": 0.5504, + "grad_norm": 0.4146212411057442, + "learning_rate": 8.861958777576827e-05, + "loss": 0.7431, + "step": 688 + }, + { + "epoch": 0.5512, + "grad_norm": 0.37180097014227725, + "learning_rate": 8.836210299741346e-05, + "loss": 0.6356, + "step": 689 + }, + { + "epoch": 0.552, + "grad_norm": 0.36251068211454873, + "learning_rate": 8.810469641222001e-05, + "loss": 0.6456, + "step": 690 + }, + { + "epoch": 0.5528, + "grad_norm": 0.3586531608730886, + "learning_rate": 8.784736974966135e-05, + "loss": 0.6476, + "step": 691 + }, + { + "epoch": 0.5536, + "grad_norm": 0.4603340281684909, + "learning_rate": 8.759012473867407e-05, + "loss": 0.6984, + "step": 692 + }, + { + "epoch": 0.5544, + "grad_norm": 0.3870379005898693, + "learning_rate": 8.733296310764611e-05, + "loss": 0.7093, + "step": 693 + }, + { + "epoch": 0.5552, + "grad_norm": 0.4027472629038024, + "learning_rate": 8.707588658440511e-05, + "loss": 0.71, + "step": 694 + }, + { + "epoch": 0.556, + "grad_norm": 0.3528191478928965, + "learning_rate": 8.6818896896207e-05, + "loss": 0.6294, + "step": 695 + }, + { + "epoch": 0.5568, + "grad_norm": 0.33825869458909047, + "learning_rate": 8.656199576972423e-05, + "loss": 0.6298, + "step": 696 + }, + { + "epoch": 0.5576, + "grad_norm": 0.36037288534137707, + "learning_rate": 8.63051849310342e-05, + "loss": 0.6626, + "step": 697 + }, + { + "epoch": 0.5584, + "grad_norm": 0.4889828579133769, + "learning_rate": 8.604846610560771e-05, + "loss": 0.7661, + "step": 698 + }, + { + "epoch": 0.5592, + "grad_norm": 0.3615120502652659, + "learning_rate": 8.579184101829734e-05, + "loss": 0.659, + "step": 699 + }, + { + "epoch": 0.56, + "grad_norm": 0.3819622314915852, + "learning_rate": 8.553531139332582e-05, + "loss": 0.6899, + "step": 700 + }, + { + "epoch": 0.5608, + "grad_norm": 0.3625524164766474, + "learning_rate": 8.527887895427454e-05, + "loss": 0.7175, + "step": 701 + }, + { + "epoch": 0.5616, + "grad_norm": 0.37825883451692044, + "learning_rate": 8.502254542407186e-05, + "loss": 0.6772, + "step": 702 + }, + { + "epoch": 0.5624, + "grad_norm": 0.38042278841918303, + "learning_rate": 8.476631252498162e-05, + "loss": 0.6986, + "step": 703 + }, + { + "epoch": 0.5632, + "grad_norm": 0.35804466110483835, + "learning_rate": 8.451018197859153e-05, + "loss": 0.6795, + "step": 704 + }, + { + "epoch": 0.564, + "grad_norm": 0.3281736917944397, + "learning_rate": 8.425415550580162e-05, + "loss": 0.6371, + "step": 705 + }, + { + "epoch": 0.5648, + "grad_norm": 0.36691010384976763, + "learning_rate": 8.399823482681262e-05, + "loss": 0.7083, + "step": 706 + }, + { + "epoch": 0.5656, + "grad_norm": 0.37654825968461336, + "learning_rate": 8.374242166111448e-05, + "loss": 0.729, + "step": 707 + }, + { + "epoch": 0.5664, + "grad_norm": 0.483695314586318, + "learning_rate": 8.348671772747487e-05, + "loss": 0.8131, + "step": 708 + }, + { + "epoch": 0.5672, + "grad_norm": 0.3978851319398291, + "learning_rate": 8.323112474392731e-05, + "loss": 0.7137, + "step": 709 + }, + { + "epoch": 0.568, + "grad_norm": 1.0701915284130301, + "learning_rate": 8.297564442776014e-05, + "loss": 0.6879, + "step": 710 + }, + { + "epoch": 0.5688, + "grad_norm": 0.40766989240820967, + "learning_rate": 8.272027849550457e-05, + "loss": 0.7013, + "step": 711 + }, + { + "epoch": 0.5696, + "grad_norm": 0.42485817360229233, + "learning_rate": 8.246502866292324e-05, + "loss": 0.7917, + "step": 712 + }, + { + "epoch": 0.5704, + "grad_norm": 0.4529163662639259, + "learning_rate": 8.220989664499878e-05, + "loss": 0.7932, + "step": 713 + }, + { + "epoch": 0.5712, + "grad_norm": 0.3757526686525042, + "learning_rate": 8.195488415592238e-05, + "loss": 0.703, + "step": 714 + }, + { + "epoch": 0.572, + "grad_norm": 0.4157260966214999, + "learning_rate": 8.169999290908188e-05, + "loss": 0.7377, + "step": 715 + }, + { + "epoch": 0.5728, + "grad_norm": 0.3684872875873162, + "learning_rate": 8.144522461705067e-05, + "loss": 0.6747, + "step": 716 + }, + { + "epoch": 0.5736, + "grad_norm": 0.4352203721253499, + "learning_rate": 8.119058099157604e-05, + "loss": 0.6921, + "step": 717 + }, + { + "epoch": 0.5744, + "grad_norm": 0.3688596385164379, + "learning_rate": 8.093606374356759e-05, + "loss": 0.675, + "step": 718 + }, + { + "epoch": 0.5752, + "grad_norm": 0.38601442265441466, + "learning_rate": 8.068167458308582e-05, + "loss": 0.7195, + "step": 719 + }, + { + "epoch": 0.576, + "grad_norm": 0.3688057036357684, + "learning_rate": 8.042741521933071e-05, + "loss": 0.6881, + "step": 720 + }, + { + "epoch": 0.5768, + "grad_norm": 0.39165481815069025, + "learning_rate": 8.017328736063006e-05, + "loss": 0.6375, + "step": 721 + }, + { + "epoch": 0.5776, + "grad_norm": 0.3605859209790284, + "learning_rate": 7.991929271442817e-05, + "loss": 0.6295, + "step": 722 + }, + { + "epoch": 0.5784, + "grad_norm": 0.41339282016265194, + "learning_rate": 7.966543298727425e-05, + "loss": 0.7261, + "step": 723 + }, + { + "epoch": 0.5792, + "grad_norm": 0.3720958703979262, + "learning_rate": 7.941170988481108e-05, + "loss": 0.6324, + "step": 724 + }, + { + "epoch": 0.58, + "grad_norm": 0.38369394721037564, + "learning_rate": 7.915812511176347e-05, + "loss": 0.6528, + "step": 725 + }, + { + "epoch": 0.5808, + "grad_norm": 0.39487035674006327, + "learning_rate": 7.89046803719267e-05, + "loss": 0.644, + "step": 726 + }, + { + "epoch": 0.5816, + "grad_norm": 0.364053557875168, + "learning_rate": 7.865137736815535e-05, + "loss": 0.655, + "step": 727 + }, + { + "epoch": 0.5824, + "grad_norm": 0.4091073164543154, + "learning_rate": 7.839821780235168e-05, + "loss": 0.7148, + "step": 728 + }, + { + "epoch": 0.5832, + "grad_norm": 0.3860449431917968, + "learning_rate": 7.814520337545406e-05, + "loss": 0.7492, + "step": 729 + }, + { + "epoch": 0.584, + "grad_norm": 0.4014892334379826, + "learning_rate": 7.789233578742582e-05, + "loss": 0.684, + "step": 730 + }, + { + "epoch": 0.5848, + "grad_norm": 0.4258334466602001, + "learning_rate": 7.763961673724379e-05, + "loss": 0.7487, + "step": 731 + }, + { + "epoch": 0.5856, + "grad_norm": 0.3804284937666935, + "learning_rate": 7.738704792288655e-05, + "loss": 0.6785, + "step": 732 + }, + { + "epoch": 0.5864, + "grad_norm": 0.3680495957717326, + "learning_rate": 7.713463104132345e-05, + "loss": 0.6839, + "step": 733 + }, + { + "epoch": 0.5872, + "grad_norm": 0.35916091012297474, + "learning_rate": 7.688236778850306e-05, + "loss": 0.6831, + "step": 734 + }, + { + "epoch": 0.588, + "grad_norm": 0.4853055134210838, + "learning_rate": 7.663025985934158e-05, + "loss": 0.7819, + "step": 735 + }, + { + "epoch": 0.5888, + "grad_norm": 0.4006664858164372, + "learning_rate": 7.637830894771175e-05, + "loss": 0.6664, + "step": 736 + }, + { + "epoch": 0.5896, + "grad_norm": 0.38752120698758835, + "learning_rate": 7.61265167464313e-05, + "loss": 0.6748, + "step": 737 + }, + { + "epoch": 0.5904, + "grad_norm": 0.3593820576140191, + "learning_rate": 7.587488494725157e-05, + "loss": 0.6377, + "step": 738 + }, + { + "epoch": 0.5912, + "grad_norm": 0.3659247324494101, + "learning_rate": 7.562341524084623e-05, + "loss": 0.6446, + "step": 739 + }, + { + "epoch": 0.592, + "grad_norm": 0.37344276743184085, + "learning_rate": 7.537210931679987e-05, + "loss": 0.677, + "step": 740 + }, + { + "epoch": 0.5928, + "grad_norm": 0.383307924485725, + "learning_rate": 7.512096886359664e-05, + "loss": 0.6778, + "step": 741 + }, + { + "epoch": 0.5936, + "grad_norm": 0.3903104380329308, + "learning_rate": 7.48699955686089e-05, + "loss": 0.7101, + "step": 742 + }, + { + "epoch": 0.5944, + "grad_norm": 0.34736833042562243, + "learning_rate": 7.461919111808595e-05, + "loss": 0.6322, + "step": 743 + }, + { + "epoch": 0.5952, + "grad_norm": 0.39453909643963014, + "learning_rate": 7.43685571971426e-05, + "loss": 0.6983, + "step": 744 + }, + { + "epoch": 0.596, + "grad_norm": 0.40376938298853243, + "learning_rate": 7.411809548974792e-05, + "loss": 0.6717, + "step": 745 + }, + { + "epoch": 0.5968, + "grad_norm": 0.42368985381238794, + "learning_rate": 7.386780767871397e-05, + "loss": 0.6669, + "step": 746 + }, + { + "epoch": 0.5976, + "grad_norm": 0.4236510598985175, + "learning_rate": 7.361769544568425e-05, + "loss": 0.6447, + "step": 747 + }, + { + "epoch": 0.5984, + "grad_norm": 0.41339350765198635, + "learning_rate": 7.336776047112276e-05, + "loss": 0.6922, + "step": 748 + }, + { + "epoch": 0.5992, + "grad_norm": 0.7218375320143412, + "learning_rate": 7.311800443430251e-05, + "loss": 0.7486, + "step": 749 + }, + { + "epoch": 0.6, + "grad_norm": 0.42855190255193215, + "learning_rate": 7.286842901329412e-05, + "loss": 0.7084, + "step": 750 + }, + { + "epoch": 0.6008, + "grad_norm": 0.3853592981630134, + "learning_rate": 7.26190358849548e-05, + "loss": 0.741, + "step": 751 + }, + { + "epoch": 0.6016, + "grad_norm": 0.360165680338082, + "learning_rate": 7.236982672491698e-05, + "loss": 0.6452, + "step": 752 + }, + { + "epoch": 0.6024, + "grad_norm": 0.38043309835102074, + "learning_rate": 7.212080320757695e-05, + "loss": 0.6614, + "step": 753 + }, + { + "epoch": 0.6032, + "grad_norm": 0.4112894815413764, + "learning_rate": 7.187196700608373e-05, + "loss": 0.7781, + "step": 754 + }, + { + "epoch": 0.604, + "grad_norm": 0.34908945812710956, + "learning_rate": 7.162331979232783e-05, + "loss": 0.616, + "step": 755 + }, + { + "epoch": 0.6048, + "grad_norm": 0.4170878218052713, + "learning_rate": 7.137486323692995e-05, + "loss": 0.7444, + "step": 756 + }, + { + "epoch": 0.6056, + "grad_norm": 0.36594322008531244, + "learning_rate": 7.112659900922976e-05, + "loss": 0.6887, + "step": 757 + }, + { + "epoch": 0.6064, + "grad_norm": 0.3391957470731677, + "learning_rate": 7.087852877727481e-05, + "loss": 0.5661, + "step": 758 + }, + { + "epoch": 0.6072, + "grad_norm": 0.374998163890273, + "learning_rate": 7.06306542078091e-05, + "loss": 0.6748, + "step": 759 + }, + { + "epoch": 0.608, + "grad_norm": 0.5062788654494025, + "learning_rate": 7.038297696626206e-05, + "loss": 0.7409, + "step": 760 + }, + { + "epoch": 0.6088, + "grad_norm": 0.4033029379370935, + "learning_rate": 7.013549871673736e-05, + "loss": 0.7613, + "step": 761 + }, + { + "epoch": 0.6096, + "grad_norm": 0.4816638722246713, + "learning_rate": 6.988822112200156e-05, + "loss": 0.7674, + "step": 762 + }, + { + "epoch": 0.6104, + "grad_norm": 0.4911048448099762, + "learning_rate": 6.964114584347316e-05, + "loss": 0.7677, + "step": 763 + }, + { + "epoch": 0.6112, + "grad_norm": 0.35508407796578995, + "learning_rate": 6.939427454121128e-05, + "loss": 0.5686, + "step": 764 + }, + { + "epoch": 0.612, + "grad_norm": 0.4271338131050412, + "learning_rate": 6.914760887390452e-05, + "loss": 0.7709, + "step": 765 + }, + { + "epoch": 0.6128, + "grad_norm": 0.3829874443837163, + "learning_rate": 6.890115049885994e-05, + "loss": 0.6703, + "step": 766 + }, + { + "epoch": 0.6136, + "grad_norm": 0.3772201380222238, + "learning_rate": 6.865490107199181e-05, + "loss": 0.6416, + "step": 767 + }, + { + "epoch": 0.6144, + "grad_norm": 0.42957061904173477, + "learning_rate": 6.84088622478104e-05, + "loss": 0.6785, + "step": 768 + }, + { + "epoch": 0.6152, + "grad_norm": 0.4351015238354584, + "learning_rate": 6.816303567941112e-05, + "loss": 0.717, + "step": 769 + }, + { + "epoch": 0.616, + "grad_norm": 0.44387870196411, + "learning_rate": 6.791742301846326e-05, + "loss": 0.7456, + "step": 770 + }, + { + "epoch": 0.6168, + "grad_norm": 0.38345840918195717, + "learning_rate": 6.767202591519875e-05, + "loss": 0.6366, + "step": 771 + }, + { + "epoch": 0.6176, + "grad_norm": 0.40318390774851576, + "learning_rate": 6.742684601840141e-05, + "loss": 0.6941, + "step": 772 + }, + { + "epoch": 0.6184, + "grad_norm": 0.39216006216230787, + "learning_rate": 6.718188497539554e-05, + "loss": 0.6587, + "step": 773 + }, + { + "epoch": 0.6192, + "grad_norm": 0.4257985590229207, + "learning_rate": 6.693714443203507e-05, + "loss": 0.7713, + "step": 774 + }, + { + "epoch": 0.62, + "grad_norm": 0.3931310762318468, + "learning_rate": 6.669262603269246e-05, + "loss": 0.6648, + "step": 775 + }, + { + "epoch": 0.6208, + "grad_norm": 0.3642652770131762, + "learning_rate": 6.644833142024751e-05, + "loss": 0.6814, + "step": 776 + }, + { + "epoch": 0.6216, + "grad_norm": 0.364177253154222, + "learning_rate": 6.620426223607654e-05, + "loss": 0.6627, + "step": 777 + }, + { + "epoch": 0.6224, + "grad_norm": 0.3523087379696415, + "learning_rate": 6.59604201200412e-05, + "loss": 0.6288, + "step": 778 + }, + { + "epoch": 0.6232, + "grad_norm": 0.4059771452595751, + "learning_rate": 6.571680671047749e-05, + "loss": 0.6667, + "step": 779 + }, + { + "epoch": 0.624, + "grad_norm": 0.482718976349731, + "learning_rate": 6.547342364418481e-05, + "loss": 0.7562, + "step": 780 + }, + { + "epoch": 0.6248, + "grad_norm": 0.41578835122028934, + "learning_rate": 6.523027255641493e-05, + "loss": 0.7022, + "step": 781 + }, + { + "epoch": 0.6256, + "grad_norm": 0.36922905393572486, + "learning_rate": 6.498735508086093e-05, + "loss": 0.6519, + "step": 782 + }, + { + "epoch": 0.6264, + "grad_norm": 0.3762926224218096, + "learning_rate": 6.474467284964634e-05, + "loss": 0.7232, + "step": 783 + }, + { + "epoch": 0.6272, + "grad_norm": 0.3978686265348035, + "learning_rate": 6.450222749331414e-05, + "loss": 0.7099, + "step": 784 + }, + { + "epoch": 0.628, + "grad_norm": 0.32496498392338546, + "learning_rate": 6.426002064081565e-05, + "loss": 0.5562, + "step": 785 + }, + { + "epoch": 0.6288, + "grad_norm": 0.42086421906589716, + "learning_rate": 6.40180539194999e-05, + "loss": 0.6752, + "step": 786 + }, + { + "epoch": 0.6296, + "grad_norm": 0.3656906027819285, + "learning_rate": 6.377632895510248e-05, + "loss": 0.7032, + "step": 787 + }, + { + "epoch": 0.6304, + "grad_norm": 0.4161670937660567, + "learning_rate": 6.35348473717345e-05, + "loss": 0.7525, + "step": 788 + }, + { + "epoch": 0.6312, + "grad_norm": 0.37830212145755643, + "learning_rate": 6.329361079187199e-05, + "loss": 0.7098, + "step": 789 + }, + { + "epoch": 0.632, + "grad_norm": 0.41658258367628426, + "learning_rate": 6.305262083634488e-05, + "loss": 0.7551, + "step": 790 + }, + { + "epoch": 0.6328, + "grad_norm": 0.36045955475606195, + "learning_rate": 6.281187912432587e-05, + "loss": 0.66, + "step": 791 + }, + { + "epoch": 0.6336, + "grad_norm": 0.37412315585512523, + "learning_rate": 6.25713872733199e-05, + "loss": 0.7219, + "step": 792 + }, + { + "epoch": 0.6344, + "grad_norm": 0.3940531238984509, + "learning_rate": 6.233114689915316e-05, + "loss": 0.6799, + "step": 793 + }, + { + "epoch": 0.6352, + "grad_norm": 0.42150488983322487, + "learning_rate": 6.209115961596208e-05, + "loss": 0.7225, + "step": 794 + }, + { + "epoch": 0.636, + "grad_norm": 0.4299804274671972, + "learning_rate": 6.18514270361827e-05, + "loss": 0.7594, + "step": 795 + }, + { + "epoch": 0.6368, + "grad_norm": 0.37550706071176354, + "learning_rate": 6.161195077053976e-05, + "loss": 0.6751, + "step": 796 + }, + { + "epoch": 0.6376, + "grad_norm": 0.35985151186544345, + "learning_rate": 6.13727324280358e-05, + "loss": 0.6515, + "step": 797 + }, + { + "epoch": 0.6384, + "grad_norm": 0.4831717492898829, + "learning_rate": 6.113377361594049e-05, + "loss": 0.8192, + "step": 798 + }, + { + "epoch": 0.6392, + "grad_norm": 0.40579059753134555, + "learning_rate": 6.08950759397797e-05, + "loss": 0.7253, + "step": 799 + }, + { + "epoch": 0.64, + "grad_norm": 0.3937601854305587, + "learning_rate": 6.065664100332478e-05, + "loss": 0.7437, + "step": 800 + }, + { + "epoch": 0.6408, + "grad_norm": 0.338102934648602, + "learning_rate": 6.0418470408581774e-05, + "loss": 0.6423, + "step": 801 + }, + { + "epoch": 0.6416, + "grad_norm": 0.38581607564898074, + "learning_rate": 6.018056575578075e-05, + "loss": 0.6956, + "step": 802 + }, + { + "epoch": 0.6424, + "grad_norm": 0.4113944032955229, + "learning_rate": 5.9942928643364724e-05, + "loss": 0.6973, + "step": 803 + }, + { + "epoch": 0.6432, + "grad_norm": 0.4095900672318419, + "learning_rate": 5.970556066797941e-05, + "loss": 0.7466, + "step": 804 + }, + { + "epoch": 0.644, + "grad_norm": 0.4292823646665635, + "learning_rate": 5.946846342446214e-05, + "loss": 0.7328, + "step": 805 + }, + { + "epoch": 0.6448, + "grad_norm": 0.37476926363657725, + "learning_rate": 5.923163850583113e-05, + "loss": 0.7136, + "step": 806 + }, + { + "epoch": 0.6456, + "grad_norm": 0.4440561071976436, + "learning_rate": 5.899508750327501e-05, + "loss": 0.6965, + "step": 807 + }, + { + "epoch": 0.6464, + "grad_norm": 0.34366750397941753, + "learning_rate": 5.875881200614207e-05, + "loss": 0.6471, + "step": 808 + }, + { + "epoch": 0.6472, + "grad_norm": 0.4103654288184694, + "learning_rate": 5.8522813601929324e-05, + "loss": 0.7292, + "step": 809 + }, + { + "epoch": 0.648, + "grad_norm": 0.3362083965809128, + "learning_rate": 5.828709387627218e-05, + "loss": 0.6451, + "step": 810 + }, + { + "epoch": 0.6488, + "grad_norm": 0.4146917862331588, + "learning_rate": 5.80516544129337e-05, + "loss": 0.7293, + "step": 811 + }, + { + "epoch": 0.6496, + "grad_norm": 0.5184762847393413, + "learning_rate": 5.781649679379378e-05, + "loss": 0.6954, + "step": 812 + }, + { + "epoch": 0.6504, + "grad_norm": 0.3669542021968653, + "learning_rate": 5.758162259883867e-05, + "loss": 0.6601, + "step": 813 + }, + { + "epoch": 0.6512, + "grad_norm": 0.3851300822559215, + "learning_rate": 5.73470334061505e-05, + "loss": 0.7422, + "step": 814 + }, + { + "epoch": 0.652, + "grad_norm": 0.3600485993160188, + "learning_rate": 5.7112730791896207e-05, + "loss": 0.7124, + "step": 815 + }, + { + "epoch": 0.6528, + "grad_norm": 0.3741263093251435, + "learning_rate": 5.687871633031754e-05, + "loss": 0.6974, + "step": 816 + }, + { + "epoch": 0.6536, + "grad_norm": 0.40506980225745537, + "learning_rate": 5.664499159372017e-05, + "loss": 0.7157, + "step": 817 + }, + { + "epoch": 0.6544, + "grad_norm": 0.37142754860523913, + "learning_rate": 5.6411558152462894e-05, + "loss": 0.661, + "step": 818 + }, + { + "epoch": 0.6552, + "grad_norm": 0.34868864855170956, + "learning_rate": 5.617841757494762e-05, + "loss": 0.6331, + "step": 819 + }, + { + "epoch": 0.656, + "grad_norm": 0.3844359901312866, + "learning_rate": 5.5945571427608526e-05, + "loss": 0.6706, + "step": 820 + }, + { + "epoch": 0.6568, + "grad_norm": 0.3679428435598268, + "learning_rate": 5.5713021274901335e-05, + "loss": 0.6432, + "step": 821 + }, + { + "epoch": 0.6576, + "grad_norm": 0.3609800351229282, + "learning_rate": 5.54807686792933e-05, + "loss": 0.644, + "step": 822 + }, + { + "epoch": 0.6584, + "grad_norm": 0.40857060655903316, + "learning_rate": 5.524881520125229e-05, + "loss": 0.6928, + "step": 823 + }, + { + "epoch": 0.6592, + "grad_norm": 0.34890882720035793, + "learning_rate": 5.501716239923642e-05, + "loss": 0.6659, + "step": 824 + }, + { + "epoch": 0.66, + "grad_norm": 0.34646708731898007, + "learning_rate": 5.4785811829683764e-05, + "loss": 0.6275, + "step": 825 + }, + { + "epoch": 0.6608, + "grad_norm": 0.38567908108552645, + "learning_rate": 5.4554765047001613e-05, + "loss": 0.7153, + "step": 826 + }, + { + "epoch": 0.6616, + "grad_norm": 0.4305873982759078, + "learning_rate": 5.432402360355615e-05, + "loss": 0.7274, + "step": 827 + }, + { + "epoch": 0.6624, + "grad_norm": 0.43135102883365606, + "learning_rate": 5.4093589049662175e-05, + "loss": 0.7578, + "step": 828 + }, + { + "epoch": 0.6632, + "grad_norm": 0.4047217454206257, + "learning_rate": 5.386346293357242e-05, + "loss": 0.6523, + "step": 829 + }, + { + "epoch": 0.664, + "grad_norm": 0.38403048246137017, + "learning_rate": 5.363364680146725e-05, + "loss": 0.6723, + "step": 830 + }, + { + "epoch": 0.6648, + "grad_norm": 0.4149002869362283, + "learning_rate": 5.3404142197444506e-05, + "loss": 0.7515, + "step": 831 + }, + { + "epoch": 0.6656, + "grad_norm": 0.4718520933027861, + "learning_rate": 5.31749506635086e-05, + "loss": 0.6489, + "step": 832 + }, + { + "epoch": 0.6664, + "grad_norm": 0.38813378130757714, + "learning_rate": 5.2946073739560706e-05, + "loss": 0.658, + "step": 833 + }, + { + "epoch": 0.6672, + "grad_norm": 0.3939601361015646, + "learning_rate": 5.271751296338823e-05, + "loss": 0.658, + "step": 834 + }, + { + "epoch": 0.668, + "grad_norm": 0.3881313888834844, + "learning_rate": 5.248926987065417e-05, + "loss": 0.7287, + "step": 835 + }, + { + "epoch": 0.6688, + "grad_norm": 0.4301470833777586, + "learning_rate": 5.226134599488728e-05, + "loss": 0.7421, + "step": 836 + }, + { + "epoch": 0.6696, + "grad_norm": 0.4287457170994794, + "learning_rate": 5.203374286747158e-05, + "loss": 0.6678, + "step": 837 + }, + { + "epoch": 0.6704, + "grad_norm": 0.3813750880666755, + "learning_rate": 5.180646201763577e-05, + "loss": 0.6476, + "step": 838 + }, + { + "epoch": 0.6712, + "grad_norm": 0.4201512651843253, + "learning_rate": 5.15795049724435e-05, + "loss": 0.7701, + "step": 839 + }, + { + "epoch": 0.672, + "grad_norm": 0.40329968841178876, + "learning_rate": 5.135287325678271e-05, + "loss": 0.6548, + "step": 840 + }, + { + "epoch": 0.6728, + "grad_norm": 0.435542688212676, + "learning_rate": 5.112656839335543e-05, + "loss": 0.7366, + "step": 841 + }, + { + "epoch": 0.6736, + "grad_norm": 0.4166070391500017, + "learning_rate": 5.090059190266779e-05, + "loss": 0.7087, + "step": 842 + }, + { + "epoch": 0.6744, + "grad_norm": 0.3790452016246712, + "learning_rate": 5.0674945303019526e-05, + "loss": 0.6651, + "step": 843 + }, + { + "epoch": 0.6752, + "grad_norm": 0.4087162996368245, + "learning_rate": 5.0449630110493836e-05, + "loss": 0.7629, + "step": 844 + }, + { + "epoch": 0.676, + "grad_norm": 0.45517039116161556, + "learning_rate": 5.022464783894744e-05, + "loss": 0.6626, + "step": 845 + }, + { + "epoch": 0.6768, + "grad_norm": 0.3907054046676461, + "learning_rate": 5.000000000000002e-05, + "loss": 0.6617, + "step": 846 + }, + { + "epoch": 0.6776, + "grad_norm": 0.4112333855403147, + "learning_rate": 4.977568810302432e-05, + "loss": 0.7102, + "step": 847 + }, + { + "epoch": 0.6784, + "grad_norm": 0.39196525023148127, + "learning_rate": 4.955171365513603e-05, + "loss": 0.6492, + "step": 848 + }, + { + "epoch": 0.6792, + "grad_norm": 0.39241913793819494, + "learning_rate": 4.9328078161183464e-05, + "loss": 0.6458, + "step": 849 + }, + { + "epoch": 0.68, + "grad_norm": 0.36655779865171095, + "learning_rate": 4.9104783123737566e-05, + "loss": 0.6473, + "step": 850 + }, + { + "epoch": 0.6808, + "grad_norm": 0.3695327576372595, + "learning_rate": 4.88818300430819e-05, + "loss": 0.6977, + "step": 851 + }, + { + "epoch": 0.6816, + "grad_norm": 0.39337691074256637, + "learning_rate": 4.865922041720239e-05, + "loss": 0.6695, + "step": 852 + }, + { + "epoch": 0.6824, + "grad_norm": 0.43078872845909777, + "learning_rate": 4.843695574177737e-05, + "loss": 0.7215, + "step": 853 + }, + { + "epoch": 0.6832, + "grad_norm": 0.41677583461043594, + "learning_rate": 4.821503751016746e-05, + "loss": 0.7434, + "step": 854 + }, + { + "epoch": 0.684, + "grad_norm": 0.37409959960608413, + "learning_rate": 4.7993467213405706e-05, + "loss": 0.6174, + "step": 855 + }, + { + "epoch": 0.6848, + "grad_norm": 0.37530733361044, + "learning_rate": 4.777224634018732e-05, + "loss": 0.6646, + "step": 856 + }, + { + "epoch": 0.6856, + "grad_norm": 0.3948418672519029, + "learning_rate": 4.755137637685979e-05, + "loss": 0.7504, + "step": 857 + }, + { + "epoch": 0.6864, + "grad_norm": 0.3466028849080845, + "learning_rate": 4.733085880741301e-05, + "loss": 0.5993, + "step": 858 + }, + { + "epoch": 0.6872, + "grad_norm": 0.36659625418677094, + "learning_rate": 4.7110695113469085e-05, + "loss": 0.6604, + "step": 859 + }, + { + "epoch": 0.688, + "grad_norm": 0.37078755725390006, + "learning_rate": 4.689088677427249e-05, + "loss": 0.7182, + "step": 860 + }, + { + "epoch": 0.6888, + "grad_norm": 0.3795044276270682, + "learning_rate": 4.6671435266680216e-05, + "loss": 0.6524, + "step": 861 + }, + { + "epoch": 0.6896, + "grad_norm": 0.33482991952784324, + "learning_rate": 4.645234206515171e-05, + "loss": 0.5969, + "step": 862 + }, + { + "epoch": 0.6904, + "grad_norm": 0.4313167251920351, + "learning_rate": 4.623360864173893e-05, + "loss": 0.7922, + "step": 863 + }, + { + "epoch": 0.6912, + "grad_norm": 0.39092361080138693, + "learning_rate": 4.6015236466076747e-05, + "loss": 0.7368, + "step": 864 + }, + { + "epoch": 0.692, + "grad_norm": 0.3825571527685044, + "learning_rate": 4.579722700537268e-05, + "loss": 0.6939, + "step": 865 + }, + { + "epoch": 0.6928, + "grad_norm": 0.40444046894376534, + "learning_rate": 4.5579581724397255e-05, + "loss": 0.6593, + "step": 866 + }, + { + "epoch": 0.6936, + "grad_norm": 0.40668490269779606, + "learning_rate": 4.5362302085474254e-05, + "loss": 0.7082, + "step": 867 + }, + { + "epoch": 0.6944, + "grad_norm": 0.40044672350004745, + "learning_rate": 4.514538954847064e-05, + "loss": 0.677, + "step": 868 + }, + { + "epoch": 0.6952, + "grad_norm": 0.4690244764733942, + "learning_rate": 4.492884557078688e-05, + "loss": 0.7342, + "step": 869 + }, + { + "epoch": 0.696, + "grad_norm": 0.3793000401009645, + "learning_rate": 4.471267160734731e-05, + "loss": 0.6776, + "step": 870 + }, + { + "epoch": 0.6968, + "grad_norm": 0.37205962385934116, + "learning_rate": 4.449686911058992e-05, + "loss": 0.7073, + "step": 871 + }, + { + "epoch": 0.6976, + "grad_norm": 0.3867584388600962, + "learning_rate": 4.428143953045717e-05, + "loss": 0.597, + "step": 872 + }, + { + "epoch": 0.6984, + "grad_norm": 0.3662766708062122, + "learning_rate": 4.406638431438576e-05, + "loss": 0.6005, + "step": 873 + }, + { + "epoch": 0.6992, + "grad_norm": 0.37132830126650684, + "learning_rate": 4.385170490729712e-05, + "loss": 0.652, + "step": 874 + }, + { + "epoch": 0.7, + "grad_norm": 0.3717335656538945, + "learning_rate": 4.36374027515878e-05, + "loss": 0.6777, + "step": 875 + }, + { + "epoch": 0.7008, + "grad_norm": 0.4071587052230116, + "learning_rate": 4.342347928711953e-05, + "loss": 0.5889, + "step": 876 + }, + { + "epoch": 0.7016, + "grad_norm": 0.39199370584020554, + "learning_rate": 4.320993595120969e-05, + "loss": 0.6794, + "step": 877 + }, + { + "epoch": 0.7024, + "grad_norm": 0.4083045902562364, + "learning_rate": 4.2996774178621736e-05, + "loss": 0.6869, + "step": 878 + }, + { + "epoch": 0.7032, + "grad_norm": 0.4364514601217797, + "learning_rate": 4.278399540155536e-05, + "loss": 0.7299, + "step": 879 + }, + { + "epoch": 0.704, + "grad_norm": 0.4272030308395742, + "learning_rate": 4.257160104963696e-05, + "loss": 0.6443, + "step": 880 + }, + { + "epoch": 0.7048, + "grad_norm": 0.3830830999836823, + "learning_rate": 4.2359592549910145e-05, + "loss": 0.664, + "step": 881 + }, + { + "epoch": 0.7056, + "grad_norm": 0.45790237792918964, + "learning_rate": 4.2147971326825966e-05, + "loss": 0.7071, + "step": 882 + }, + { + "epoch": 0.7064, + "grad_norm": 0.427243169382132, + "learning_rate": 4.193673880223339e-05, + "loss": 0.6815, + "step": 883 + }, + { + "epoch": 0.7072, + "grad_norm": 0.42608561465983186, + "learning_rate": 4.172589639536991e-05, + "loss": 0.7259, + "step": 884 + }, + { + "epoch": 0.708, + "grad_norm": 0.35803222200964013, + "learning_rate": 4.1515445522851784e-05, + "loss": 0.6099, + "step": 885 + }, + { + "epoch": 0.7088, + "grad_norm": 0.4105132717548988, + "learning_rate": 4.130538759866457e-05, + "loss": 0.6861, + "step": 886 + }, + { + "epoch": 0.7096, + "grad_norm": 0.41160615603238776, + "learning_rate": 4.109572403415386e-05, + "loss": 0.6691, + "step": 887 + }, + { + "epoch": 0.7104, + "grad_norm": 0.4125210074420269, + "learning_rate": 4.088645623801534e-05, + "loss": 0.7136, + "step": 888 + }, + { + "epoch": 0.7112, + "grad_norm": 0.3947149593570214, + "learning_rate": 4.0677585616285774e-05, + "loss": 0.625, + "step": 889 + }, + { + "epoch": 0.712, + "grad_norm": 0.36897199999238556, + "learning_rate": 4.046911357233343e-05, + "loss": 0.6422, + "step": 890 + }, + { + "epoch": 0.7128, + "grad_norm": 0.3989802751482518, + "learning_rate": 4.026104150684835e-05, + "loss": 0.705, + "step": 891 + }, + { + "epoch": 0.7136, + "grad_norm": 0.3496190401577539, + "learning_rate": 4.00533708178334e-05, + "loss": 0.6207, + "step": 892 + }, + { + "epoch": 0.7144, + "grad_norm": 0.4900280407157522, + "learning_rate": 3.984610290059467e-05, + "loss": 0.8007, + "step": 893 + }, + { + "epoch": 0.7152, + "grad_norm": 0.3551601267452017, + "learning_rate": 3.963923914773187e-05, + "loss": 0.6354, + "step": 894 + }, + { + "epoch": 0.716, + "grad_norm": 0.38899309001246807, + "learning_rate": 3.943278094912946e-05, + "loss": 0.6374, + "step": 895 + }, + { + "epoch": 0.7168, + "grad_norm": 0.3390341726853945, + "learning_rate": 3.922672969194686e-05, + "loss": 0.6266, + "step": 896 + }, + { + "epoch": 0.7176, + "grad_norm": 0.40606795460219025, + "learning_rate": 3.902108676060937e-05, + "loss": 0.6824, + "step": 897 + }, + { + "epoch": 0.7184, + "grad_norm": 0.3854419536808137, + "learning_rate": 3.8815853536798904e-05, + "loss": 0.6218, + "step": 898 + }, + { + "epoch": 0.7192, + "grad_norm": 0.3897364021862041, + "learning_rate": 3.861103139944449e-05, + "loss": 0.7032, + "step": 899 + }, + { + "epoch": 0.72, + "grad_norm": 0.46000968922306035, + "learning_rate": 3.840662172471315e-05, + "loss": 0.7736, + "step": 900 + }, + { + "epoch": 0.7208, + "grad_norm": 0.3939240859217374, + "learning_rate": 3.820262588600074e-05, + "loss": 0.6861, + "step": 901 + }, + { + "epoch": 0.7216, + "grad_norm": 0.35864600105529776, + "learning_rate": 3.79990452539225e-05, + "loss": 0.6317, + "step": 902 + }, + { + "epoch": 0.7224, + "grad_norm": 0.39750925834879125, + "learning_rate": 3.7795881196303995e-05, + "loss": 0.7157, + "step": 903 + }, + { + "epoch": 0.7232, + "grad_norm": 0.40159732727993463, + "learning_rate": 3.759313507817196e-05, + "loss": 0.7057, + "step": 904 + }, + { + "epoch": 0.724, + "grad_norm": 0.4113036440798727, + "learning_rate": 3.739080826174498e-05, + "loss": 0.7105, + "step": 905 + }, + { + "epoch": 0.7248, + "grad_norm": 0.401315698737299, + "learning_rate": 3.7188902106424416e-05, + "loss": 0.6668, + "step": 906 + }, + { + "epoch": 0.7256, + "grad_norm": 0.40790670367724563, + "learning_rate": 3.6987417968785366e-05, + "loss": 0.73, + "step": 907 + }, + { + "epoch": 0.7264, + "grad_norm": 0.3727532374615525, + "learning_rate": 3.678635720256737e-05, + "loss": 0.6618, + "step": 908 + }, + { + "epoch": 0.7272, + "grad_norm": 0.3503374501600405, + "learning_rate": 3.658572115866541e-05, + "loss": 0.598, + "step": 909 + }, + { + "epoch": 0.728, + "grad_norm": 0.40563869970756045, + "learning_rate": 3.638551118512089e-05, + "loss": 0.6812, + "step": 910 + }, + { + "epoch": 0.7288, + "grad_norm": 0.3838583386528972, + "learning_rate": 3.618572862711247e-05, + "loss": 0.699, + "step": 911 + }, + { + "epoch": 0.7296, + "grad_norm": 0.40358621332732, + "learning_rate": 3.5986374826947066e-05, + "loss": 0.6888, + "step": 912 + }, + { + "epoch": 0.7304, + "grad_norm": 0.3880879506544284, + "learning_rate": 3.578745112405083e-05, + "loss": 0.6911, + "step": 913 + }, + { + "epoch": 0.7312, + "grad_norm": 0.370531095853474, + "learning_rate": 3.558895885496023e-05, + "loss": 0.6404, + "step": 914 + }, + { + "epoch": 0.732, + "grad_norm": 0.34209538531266953, + "learning_rate": 3.539089935331294e-05, + "loss": 0.5939, + "step": 915 + }, + { + "epoch": 0.7328, + "grad_norm": 0.4236755140832168, + "learning_rate": 3.519327394983888e-05, + "loss": 0.7576, + "step": 916 + }, + { + "epoch": 0.7336, + "grad_norm": 0.3658400080148741, + "learning_rate": 3.4996083972351515e-05, + "loss": 0.6159, + "step": 917 + }, + { + "epoch": 0.7344, + "grad_norm": 0.39879796431319914, + "learning_rate": 3.479933074573858e-05, + "loss": 0.6558, + "step": 918 + }, + { + "epoch": 0.7352, + "grad_norm": 0.35087830936122166, + "learning_rate": 3.4603015591953395e-05, + "loss": 0.6117, + "step": 919 + }, + { + "epoch": 0.736, + "grad_norm": 0.37387322928602545, + "learning_rate": 3.440713983000601e-05, + "loss": 0.6427, + "step": 920 + }, + { + "epoch": 0.7368, + "grad_norm": 0.4872607931728202, + "learning_rate": 3.421170477595419e-05, + "loss": 0.7606, + "step": 921 + }, + { + "epoch": 0.7376, + "grad_norm": 0.38657425724887196, + "learning_rate": 3.401671174289469e-05, + "loss": 0.6303, + "step": 922 + }, + { + "epoch": 0.7384, + "grad_norm": 0.4134505793433531, + "learning_rate": 3.3822162040954354e-05, + "loss": 0.6938, + "step": 923 + }, + { + "epoch": 0.7392, + "grad_norm": 0.3598444447813142, + "learning_rate": 3.362805697728145e-05, + "loss": 0.6509, + "step": 924 + }, + { + "epoch": 0.74, + "grad_norm": 0.3617011879924967, + "learning_rate": 3.34343978560367e-05, + "loss": 0.6574, + "step": 925 + }, + { + "epoch": 0.7408, + "grad_norm": 0.400947050757263, + "learning_rate": 3.324118597838464e-05, + "loss": 0.645, + "step": 926 + }, + { + "epoch": 0.7416, + "grad_norm": 0.4220074193387606, + "learning_rate": 3.3048422642484886e-05, + "loss": 0.7001, + "step": 927 + }, + { + "epoch": 0.7424, + "grad_norm": 0.3732493162557107, + "learning_rate": 3.285610914348332e-05, + "loss": 0.6036, + "step": 928 + }, + { + "epoch": 0.7432, + "grad_norm": 0.40301328291104704, + "learning_rate": 3.266424677350346e-05, + "loss": 0.686, + "step": 929 + }, + { + "epoch": 0.744, + "grad_norm": 0.4108059320184453, + "learning_rate": 3.2472836821637744e-05, + "loss": 0.7085, + "step": 930 + }, + { + "epoch": 0.7448, + "grad_norm": 0.4125815276658339, + "learning_rate": 3.228188057393895e-05, + "loss": 0.6322, + "step": 931 + }, + { + "epoch": 0.7456, + "grad_norm": 0.36863849906243684, + "learning_rate": 3.209137931341143e-05, + "loss": 0.6494, + "step": 932 + }, + { + "epoch": 0.7464, + "grad_norm": 0.39074398089237977, + "learning_rate": 3.190133432000252e-05, + "loss": 0.6682, + "step": 933 + }, + { + "epoch": 0.7472, + "grad_norm": 0.4584296705306387, + "learning_rate": 3.1711746870594086e-05, + "loss": 0.6281, + "step": 934 + }, + { + "epoch": 0.748, + "grad_norm": 0.38788779905511994, + "learning_rate": 3.1522618238993725e-05, + "loss": 0.6993, + "step": 935 + }, + { + "epoch": 0.7488, + "grad_norm": 0.40701330975481226, + "learning_rate": 3.1333949695926324e-05, + "loss": 0.7688, + "step": 936 + }, + { + "epoch": 0.7496, + "grad_norm": 0.4260094945414775, + "learning_rate": 3.114574250902558e-05, + "loss": 0.7242, + "step": 937 + }, + { + "epoch": 0.7504, + "grad_norm": 0.4057036035467596, + "learning_rate": 3.0957997942825336e-05, + "loss": 0.7253, + "step": 938 + }, + { + "epoch": 0.7512, + "grad_norm": 0.40661154553380663, + "learning_rate": 3.077071725875116e-05, + "loss": 0.6711, + "step": 939 + }, + { + "epoch": 0.752, + "grad_norm": 0.35937310430012653, + "learning_rate": 3.058390171511196e-05, + "loss": 0.6769, + "step": 940 + }, + { + "epoch": 0.7528, + "grad_norm": 0.35787645420113195, + "learning_rate": 3.0397552567091337e-05, + "loss": 0.6513, + "step": 941 + }, + { + "epoch": 0.7536, + "grad_norm": 0.391945553497051, + "learning_rate": 3.021167106673928e-05, + "loss": 0.6367, + "step": 942 + }, + { + "epoch": 0.7544, + "grad_norm": 0.37326197481065054, + "learning_rate": 3.0026258462963787e-05, + "loss": 0.6557, + "step": 943 + }, + { + "epoch": 0.7552, + "grad_norm": 0.36011725309369086, + "learning_rate": 2.9841316001522347e-05, + "loss": 0.623, + "step": 944 + }, + { + "epoch": 0.756, + "grad_norm": 0.38836463607963373, + "learning_rate": 2.9656844925013637e-05, + "loss": 0.6578, + "step": 945 + }, + { + "epoch": 0.7568, + "grad_norm": 0.3871726490315593, + "learning_rate": 2.9472846472869298e-05, + "loss": 0.6655, + "step": 946 + }, + { + "epoch": 0.7576, + "grad_norm": 0.5688223950738495, + "learning_rate": 2.9289321881345254e-05, + "loss": 0.7054, + "step": 947 + }, + { + "epoch": 0.7584, + "grad_norm": 0.3816671225013523, + "learning_rate": 2.9106272383513835e-05, + "loss": 0.6657, + "step": 948 + }, + { + "epoch": 0.7592, + "grad_norm": 0.4080687388597147, + "learning_rate": 2.8923699209255284e-05, + "loss": 0.6535, + "step": 949 + }, + { + "epoch": 0.76, + "grad_norm": 0.369691368379072, + "learning_rate": 2.874160358524931e-05, + "loss": 0.5877, + "step": 950 + }, + { + "epoch": 0.7608, + "grad_norm": 0.48218290349226944, + "learning_rate": 2.8559986734967282e-05, + "loss": 0.6572, + "step": 951 + }, + { + "epoch": 0.7616, + "grad_norm": 0.34774664869458893, + "learning_rate": 2.8378849878663628e-05, + "loss": 0.6428, + "step": 952 + }, + { + "epoch": 0.7624, + "grad_norm": 0.4195235418480639, + "learning_rate": 2.819819423336775e-05, + "loss": 0.7067, + "step": 953 + }, + { + "epoch": 0.7632, + "grad_norm": 0.39283130785285547, + "learning_rate": 2.8018021012875994e-05, + "loss": 0.6851, + "step": 954 + }, + { + "epoch": 0.764, + "grad_norm": 0.3655565612948235, + "learning_rate": 2.7838331427743282e-05, + "loss": 0.628, + "step": 955 + }, + { + "epoch": 0.7648, + "grad_norm": 0.3739719837457371, + "learning_rate": 2.7659126685275027e-05, + "loss": 0.6403, + "step": 956 + }, + { + "epoch": 0.7656, + "grad_norm": 0.5015560909701027, + "learning_rate": 2.7480407989519198e-05, + "loss": 0.6904, + "step": 957 + }, + { + "epoch": 0.7664, + "grad_norm": 0.39831542125162783, + "learning_rate": 2.7302176541257986e-05, + "loss": 0.6953, + "step": 958 + }, + { + "epoch": 0.7672, + "grad_norm": 0.47199034653133004, + "learning_rate": 2.712443353799984e-05, + "loss": 0.6433, + "step": 959 + }, + { + "epoch": 0.768, + "grad_norm": 0.4101857437841964, + "learning_rate": 2.6947180173971508e-05, + "loss": 0.6471, + "step": 960 + }, + { + "epoch": 0.7688, + "grad_norm": 0.3789851658317438, + "learning_rate": 2.677041764010988e-05, + "loss": 0.6639, + "step": 961 + }, + { + "epoch": 0.7696, + "grad_norm": 0.4014035059775117, + "learning_rate": 2.659414712405398e-05, + "loss": 0.6576, + "step": 962 + }, + { + "epoch": 0.7704, + "grad_norm": 0.40684685579476415, + "learning_rate": 2.6418369810137188e-05, + "loss": 0.7052, + "step": 963 + }, + { + "epoch": 0.7712, + "grad_norm": 0.4804061527698471, + "learning_rate": 2.6243086879379e-05, + "loss": 0.6641, + "step": 964 + }, + { + "epoch": 0.772, + "grad_norm": 0.4557510834099288, + "learning_rate": 2.6068299509477266e-05, + "loss": 0.7149, + "step": 965 + }, + { + "epoch": 0.7728, + "grad_norm": 0.39465648052244057, + "learning_rate": 2.5894008874800325e-05, + "loss": 0.7206, + "step": 966 + }, + { + "epoch": 0.7736, + "grad_norm": 0.392256741376962, + "learning_rate": 2.5720216146378917e-05, + "loss": 0.7162, + "step": 967 + }, + { + "epoch": 0.7744, + "grad_norm": 0.40585278793645196, + "learning_rate": 2.5546922491898495e-05, + "loss": 0.7213, + "step": 968 + }, + { + "epoch": 0.7752, + "grad_norm": 0.4105575124195198, + "learning_rate": 2.5374129075691265e-05, + "loss": 0.6508, + "step": 969 + }, + { + "epoch": 0.776, + "grad_norm": 0.3869157683581518, + "learning_rate": 2.5201837058728505e-05, + "loss": 0.6491, + "step": 970 + }, + { + "epoch": 0.7768, + "grad_norm": 0.4096673437785533, + "learning_rate": 2.503004759861258e-05, + "loss": 0.6814, + "step": 971 + }, + { + "epoch": 0.7776, + "grad_norm": 0.36368840976953926, + "learning_rate": 2.485876184956928e-05, + "loss": 0.6322, + "step": 972 + }, + { + "epoch": 0.7784, + "grad_norm": 0.41570328821661107, + "learning_rate": 2.4687980962440072e-05, + "loss": 0.658, + "step": 973 + }, + { + "epoch": 0.7792, + "grad_norm": 0.4110249505203712, + "learning_rate": 2.451770608467432e-05, + "loss": 0.6624, + "step": 974 + }, + { + "epoch": 0.78, + "grad_norm": 0.4646084606799718, + "learning_rate": 2.4347938360321566e-05, + "loss": 0.6641, + "step": 975 + }, + { + "epoch": 0.7808, + "grad_norm": 0.42541671333207876, + "learning_rate": 2.417867893002387e-05, + "loss": 0.6915, + "step": 976 + }, + { + "epoch": 0.7816, + "grad_norm": 0.43528404059096026, + "learning_rate": 2.400992893100822e-05, + "loss": 0.6656, + "step": 977 + }, + { + "epoch": 0.7824, + "grad_norm": 0.3670568988549724, + "learning_rate": 2.3841689497078746e-05, + "loss": 0.6284, + "step": 978 + }, + { + "epoch": 0.7832, + "grad_norm": 0.4289843557276394, + "learning_rate": 2.3673961758609152e-05, + "loss": 0.672, + "step": 979 + }, + { + "epoch": 0.784, + "grad_norm": 0.4135409791582618, + "learning_rate": 2.3506746842535242e-05, + "loss": 0.6799, + "step": 980 + }, + { + "epoch": 0.7848, + "grad_norm": 0.3724300445578464, + "learning_rate": 2.334004587234717e-05, + "loss": 0.7023, + "step": 981 + }, + { + "epoch": 0.7856, + "grad_norm": 0.5338609126577212, + "learning_rate": 2.3173859968081944e-05, + "loss": 0.7982, + "step": 982 + }, + { + "epoch": 0.7864, + "grad_norm": 0.35914040240435635, + "learning_rate": 2.300819024631603e-05, + "loss": 0.6683, + "step": 983 + }, + { + "epoch": 0.7872, + "grad_norm": 0.36043635690058995, + "learning_rate": 2.2843037820157675e-05, + "loss": 0.6507, + "step": 984 + }, + { + "epoch": 0.788, + "grad_norm": 0.409236991133682, + "learning_rate": 2.26784037992395e-05, + "loss": 0.6789, + "step": 985 + }, + { + "epoch": 0.7888, + "grad_norm": 0.35135827201477715, + "learning_rate": 2.251428928971102e-05, + "loss": 0.6044, + "step": 986 + }, + { + "epoch": 0.7896, + "grad_norm": 0.39791158187210746, + "learning_rate": 2.2350695394231345e-05, + "loss": 0.6608, + "step": 987 + }, + { + "epoch": 0.7904, + "grad_norm": 0.36810381494389127, + "learning_rate": 2.2187623211961562e-05, + "loss": 0.7086, + "step": 988 + }, + { + "epoch": 0.7912, + "grad_norm": 0.39214302549414787, + "learning_rate": 2.2025073838557454e-05, + "loss": 0.608, + "step": 989 + }, + { + "epoch": 0.792, + "grad_norm": 0.3482304751448067, + "learning_rate": 2.1863048366162208e-05, + "loss": 0.6041, + "step": 990 + }, + { + "epoch": 0.7928, + "grad_norm": 0.38120946174275433, + "learning_rate": 2.1701547883398922e-05, + "loss": 0.6484, + "step": 991 + }, + { + "epoch": 0.7936, + "grad_norm": 0.40585774138554015, + "learning_rate": 2.1540573475363402e-05, + "loss": 0.6494, + "step": 992 + }, + { + "epoch": 0.7944, + "grad_norm": 0.39432294021606884, + "learning_rate": 2.138012622361689e-05, + "loss": 0.6698, + "step": 993 + }, + { + "epoch": 0.7952, + "grad_norm": 0.3561326704328232, + "learning_rate": 2.1220207206178688e-05, + "loss": 0.6445, + "step": 994 + }, + { + "epoch": 0.796, + "grad_norm": 0.3920736293397295, + "learning_rate": 2.106081749751897e-05, + "loss": 0.5798, + "step": 995 + }, + { + "epoch": 0.7968, + "grad_norm": 0.4272035629768845, + "learning_rate": 2.0901958168551638e-05, + "loss": 0.7124, + "step": 996 + }, + { + "epoch": 0.7976, + "grad_norm": 0.44778088302282665, + "learning_rate": 2.0743630286627002e-05, + "loss": 0.6948, + "step": 997 + }, + { + "epoch": 0.7984, + "grad_norm": 0.3600814432343671, + "learning_rate": 2.058583491552465e-05, + "loss": 0.6212, + "step": 998 + }, + { + "epoch": 0.7992, + "grad_norm": 0.3230883470653162, + "learning_rate": 2.0428573115446392e-05, + "loss": 0.5683, + "step": 999 + }, + { + "epoch": 0.8, + "grad_norm": 0.43186613411349173, + "learning_rate": 2.027184594300898e-05, + "loss": 0.7389, + "step": 1000 + }, + { + "epoch": 0.8008, + "grad_norm": 0.40047764102962335, + "learning_rate": 2.011565445123711e-05, + "loss": 0.6792, + "step": 1001 + }, + { + "epoch": 0.8016, + "grad_norm": 0.3799604559465791, + "learning_rate": 1.995999968955641e-05, + "loss": 0.6382, + "step": 1002 + }, + { + "epoch": 0.8024, + "grad_norm": 0.37789477345298844, + "learning_rate": 1.980488270378612e-05, + "loss": 0.6248, + "step": 1003 + }, + { + "epoch": 0.8032, + "grad_norm": 0.3634941724138379, + "learning_rate": 1.9650304536132426e-05, + "loss": 0.5894, + "step": 1004 + }, + { + "epoch": 0.804, + "grad_norm": 0.41025985607363014, + "learning_rate": 1.9496266225181248e-05, + "loss": 0.7381, + "step": 1005 + }, + { + "epoch": 0.8048, + "grad_norm": 0.3969457907755805, + "learning_rate": 1.9342768805891178e-05, + "loss": 0.7193, + "step": 1006 + }, + { + "epoch": 0.8056, + "grad_norm": 0.37293574757945425, + "learning_rate": 1.918981330958678e-05, + "loss": 0.6555, + "step": 1007 + }, + { + "epoch": 0.8064, + "grad_norm": 0.38444359516308163, + "learning_rate": 1.903740076395151e-05, + "loss": 0.6224, + "step": 1008 + }, + { + "epoch": 0.8072, + "grad_norm": 0.3565223003207056, + "learning_rate": 1.8885532193020704e-05, + "loss": 0.6304, + "step": 1009 + }, + { + "epoch": 0.808, + "grad_norm": 0.4089104255913111, + "learning_rate": 1.8734208617174988e-05, + "loss": 0.7657, + "step": 1010 + }, + { + "epoch": 0.8088, + "grad_norm": 0.4613564539632201, + "learning_rate": 1.8583431053133127e-05, + "loss": 0.6718, + "step": 1011 + }, + { + "epoch": 0.8096, + "grad_norm": 0.43717369275721013, + "learning_rate": 1.8433200513945337e-05, + "loss": 0.7229, + "step": 1012 + }, + { + "epoch": 0.8104, + "grad_norm": 0.41569821826961406, + "learning_rate": 1.8283518008986567e-05, + "loss": 0.7392, + "step": 1013 + }, + { + "epoch": 0.8112, + "grad_norm": 0.39494922997660814, + "learning_rate": 1.8134384543949478e-05, + "loss": 0.7029, + "step": 1014 + }, + { + "epoch": 0.812, + "grad_norm": 0.4122895274541303, + "learning_rate": 1.7985801120837865e-05, + "loss": 0.7089, + "step": 1015 + }, + { + "epoch": 0.8128, + "grad_norm": 0.39725854975172903, + "learning_rate": 1.783776873795994e-05, + "loss": 0.6865, + "step": 1016 + }, + { + "epoch": 0.8136, + "grad_norm": 0.37464112072190875, + "learning_rate": 1.7690288389921493e-05, + "loss": 0.6389, + "step": 1017 + }, + { + "epoch": 0.8144, + "grad_norm": 0.38588828787104884, + "learning_rate": 1.754336106761927e-05, + "loss": 0.6096, + "step": 1018 + }, + { + "epoch": 0.8152, + "grad_norm": 0.40372796341268163, + "learning_rate": 1.739698775823442e-05, + "loss": 0.6824, + "step": 1019 + }, + { + "epoch": 0.816, + "grad_norm": 0.4103247224108773, + "learning_rate": 1.7251169445225657e-05, + "loss": 0.6726, + "step": 1020 + }, + { + "epoch": 0.8168, + "grad_norm": 0.35331636496976654, + "learning_rate": 1.7105907108322816e-05, + "loss": 0.607, + "step": 1021 + }, + { + "epoch": 0.8176, + "grad_norm": 0.35955029150196705, + "learning_rate": 1.696120172352025e-05, + "loss": 0.6452, + "step": 1022 + }, + { + "epoch": 0.8184, + "grad_norm": 0.44853051305761393, + "learning_rate": 1.6817054263070174e-05, + "loss": 0.7252, + "step": 1023 + }, + { + "epoch": 0.8192, + "grad_norm": 0.38700045620030166, + "learning_rate": 1.6673465695476232e-05, + "loss": 0.6843, + "step": 1024 + }, + { + "epoch": 0.82, + "grad_norm": 0.36267065747739924, + "learning_rate": 1.6530436985486996e-05, + "loss": 0.6508, + "step": 1025 + }, + { + "epoch": 0.8208, + "grad_norm": 0.42430465371526943, + "learning_rate": 1.6387969094089316e-05, + "loss": 0.7086, + "step": 1026 + }, + { + "epoch": 0.8216, + "grad_norm": 0.4038402527015271, + "learning_rate": 1.6246062978502164e-05, + "loss": 0.6854, + "step": 1027 + }, + { + "epoch": 0.8224, + "grad_norm": 0.39445659158549284, + "learning_rate": 1.6104719592169902e-05, + "loss": 0.7104, + "step": 1028 + }, + { + "epoch": 0.8232, + "grad_norm": 0.37671583143686765, + "learning_rate": 1.5963939884756042e-05, + "loss": 0.6702, + "step": 1029 + }, + { + "epoch": 0.824, + "grad_norm": 0.40260912294792317, + "learning_rate": 1.5823724802136865e-05, + "loss": 0.6585, + "step": 1030 + }, + { + "epoch": 0.8248, + "grad_norm": 0.43837270664689515, + "learning_rate": 1.5684075286394985e-05, + "loss": 0.6583, + "step": 1031 + }, + { + "epoch": 0.8256, + "grad_norm": 0.3893014770348551, + "learning_rate": 1.5544992275813053e-05, + "loss": 0.6087, + "step": 1032 + }, + { + "epoch": 0.8264, + "grad_norm": 0.36216323834138736, + "learning_rate": 1.5406476704867524e-05, + "loss": 0.5883, + "step": 1033 + }, + { + "epoch": 0.8272, + "grad_norm": 0.46412683605673033, + "learning_rate": 1.526852950422226e-05, + "loss": 0.6804, + "step": 1034 + }, + { + "epoch": 0.828, + "grad_norm": 0.40285204013158576, + "learning_rate": 1.5131151600722337e-05, + "loss": 0.7045, + "step": 1035 + }, + { + "epoch": 0.8288, + "grad_norm": 0.370915249411144, + "learning_rate": 1.4994343917387854e-05, + "loss": 0.6247, + "step": 1036 + }, + { + "epoch": 0.8296, + "grad_norm": 0.36105068358775616, + "learning_rate": 1.485810737340767e-05, + "loss": 0.6373, + "step": 1037 + }, + { + "epoch": 0.8304, + "grad_norm": 0.4631400901388791, + "learning_rate": 1.4722442884133214e-05, + "loss": 0.7066, + "step": 1038 + }, + { + "epoch": 0.8312, + "grad_norm": 0.4372476982607606, + "learning_rate": 1.4587351361072454e-05, + "loss": 0.6773, + "step": 1039 + }, + { + "epoch": 0.832, + "grad_norm": 0.4031985086379773, + "learning_rate": 1.4452833711883628e-05, + "loss": 0.6627, + "step": 1040 + }, + { + "epoch": 0.8328, + "grad_norm": 0.4077896779241173, + "learning_rate": 1.4318890840369182e-05, + "loss": 0.6703, + "step": 1041 + }, + { + "epoch": 0.8336, + "grad_norm": 0.38703008590275734, + "learning_rate": 1.4185523646469822e-05, + "loss": 0.6869, + "step": 1042 + }, + { + "epoch": 0.8344, + "grad_norm": 0.389822216835128, + "learning_rate": 1.4052733026258281e-05, + "loss": 0.6213, + "step": 1043 + }, + { + "epoch": 0.8352, + "grad_norm": 0.44884734309677693, + "learning_rate": 1.3920519871933424e-05, + "loss": 0.698, + "step": 1044 + }, + { + "epoch": 0.836, + "grad_norm": 0.362517575944606, + "learning_rate": 1.3788885071814172e-05, + "loss": 0.6735, + "step": 1045 + }, + { + "epoch": 0.8368, + "grad_norm": 0.3846628320342674, + "learning_rate": 1.3657829510333654e-05, + "loss": 0.6225, + "step": 1046 + }, + { + "epoch": 0.8376, + "grad_norm": 0.4059611932840297, + "learning_rate": 1.3527354068033139e-05, + "loss": 0.6448, + "step": 1047 + }, + { + "epoch": 0.8384, + "grad_norm": 0.39326903402007896, + "learning_rate": 1.339745962155613e-05, + "loss": 0.6267, + "step": 1048 + }, + { + "epoch": 0.8392, + "grad_norm": 0.4012011863886583, + "learning_rate": 1.326814704364262e-05, + "loss": 0.6433, + "step": 1049 + }, + { + "epoch": 0.84, + "grad_norm": 0.363230791976257, + "learning_rate": 1.3139417203123027e-05, + "loss": 0.6202, + "step": 1050 + }, + { + "epoch": 0.8408, + "grad_norm": 0.3650631404313902, + "learning_rate": 1.3011270964912459e-05, + "loss": 0.6955, + "step": 1051 + }, + { + "epoch": 0.8416, + "grad_norm": 0.37561230808243434, + "learning_rate": 1.2883709190004955e-05, + "loss": 0.5872, + "step": 1052 + }, + { + "epoch": 0.8424, + "grad_norm": 0.3623780010217766, + "learning_rate": 1.275673273546758e-05, + "loss": 0.5793, + "step": 1053 + }, + { + "epoch": 0.8432, + "grad_norm": 0.3505105560783126, + "learning_rate": 1.263034245443473e-05, + "loss": 0.6318, + "step": 1054 + }, + { + "epoch": 0.844, + "grad_norm": 0.4271666747293594, + "learning_rate": 1.2504539196102439e-05, + "loss": 0.7364, + "step": 1055 + }, + { + "epoch": 0.8448, + "grad_norm": 0.3716203795664881, + "learning_rate": 1.2379323805722576e-05, + "loss": 0.6343, + "step": 1056 + }, + { + "epoch": 0.8456, + "grad_norm": 0.36258914631174205, + "learning_rate": 1.2254697124597237e-05, + "loss": 0.6528, + "step": 1057 + }, + { + "epoch": 0.8464, + "grad_norm": 0.35711272822264123, + "learning_rate": 1.2130659990073146e-05, + "loss": 0.5952, + "step": 1058 + }, + { + "epoch": 0.8472, + "grad_norm": 0.3878037895522494, + "learning_rate": 1.2007213235535786e-05, + "loss": 0.6925, + "step": 1059 + }, + { + "epoch": 0.848, + "grad_norm": 0.37209329467917407, + "learning_rate": 1.1884357690404158e-05, + "loss": 0.6323, + "step": 1060 + }, + { + "epoch": 0.8488, + "grad_norm": 0.379412877060846, + "learning_rate": 1.176209418012495e-05, + "loss": 0.6751, + "step": 1061 + }, + { + "epoch": 0.8496, + "grad_norm": 0.7891292376304488, + "learning_rate": 1.1640423526166988e-05, + "loss": 0.6932, + "step": 1062 + }, + { + "epoch": 0.8504, + "grad_norm": 0.4081006916394357, + "learning_rate": 1.1519346546015907e-05, + "loss": 0.6861, + "step": 1063 + }, + { + "epoch": 0.8512, + "grad_norm": 0.4073538388075811, + "learning_rate": 1.1398864053168534e-05, + "loss": 0.683, + "step": 1064 + }, + { + "epoch": 0.852, + "grad_norm": 0.4414279198048497, + "learning_rate": 1.1278976857127311e-05, + "loss": 0.7051, + "step": 1065 + }, + { + "epoch": 0.8528, + "grad_norm": 0.3808359904757574, + "learning_rate": 1.1159685763395111e-05, + "loss": 0.6484, + "step": 1066 + }, + { + "epoch": 0.8536, + "grad_norm": 0.524126867630712, + "learning_rate": 1.1040991573469629e-05, + "loss": 0.7399, + "step": 1067 + }, + { + "epoch": 0.8544, + "grad_norm": 0.39951867379220407, + "learning_rate": 1.0922895084838037e-05, + "loss": 0.7088, + "step": 1068 + }, + { + "epoch": 0.8552, + "grad_norm": 0.37138460708916754, + "learning_rate": 1.0805397090971737e-05, + "loss": 0.6687, + "step": 1069 + }, + { + "epoch": 0.856, + "grad_norm": 0.34826122882986327, + "learning_rate": 1.0688498381320855e-05, + "loss": 0.5969, + "step": 1070 + }, + { + "epoch": 0.8568, + "grad_norm": 0.3410088949598137, + "learning_rate": 1.057219974130903e-05, + "loss": 0.5928, + "step": 1071 + }, + { + "epoch": 0.8576, + "grad_norm": 0.3950409737979152, + "learning_rate": 1.045650195232819e-05, + "loss": 0.6379, + "step": 1072 + }, + { + "epoch": 0.8584, + "grad_norm": 0.4102781207089552, + "learning_rate": 1.0341405791733183e-05, + "loss": 0.6908, + "step": 1073 + }, + { + "epoch": 0.8592, + "grad_norm": 0.3429034133578174, + "learning_rate": 1.0226912032836611e-05, + "loss": 0.6015, + "step": 1074 + }, + { + "epoch": 0.86, + "grad_norm": 0.3632312736357231, + "learning_rate": 1.0113021444903726e-05, + "loss": 0.5986, + "step": 1075 + }, + { + "epoch": 0.8608, + "grad_norm": 0.3826228822931815, + "learning_rate": 9.999734793146998e-06, + "loss": 0.6743, + "step": 1076 + }, + { + "epoch": 0.8616, + "grad_norm": 0.42683569045701925, + "learning_rate": 9.887052838721322e-06, + "loss": 0.621, + "step": 1077 + }, + { + "epoch": 0.8624, + "grad_norm": 0.3605030746262158, + "learning_rate": 9.774976338718677e-06, + "loss": 0.6233, + "step": 1078 + }, + { + "epoch": 0.8632, + "grad_norm": 0.41230326194585215, + "learning_rate": 9.663506046162985e-06, + "loss": 0.778, + "step": 1079 + }, + { + "epoch": 0.864, + "grad_norm": 0.4957680011505481, + "learning_rate": 9.552642710005299e-06, + "loss": 0.7457, + "step": 1080 + }, + { + "epoch": 0.8648, + "grad_norm": 0.4072547648885756, + "learning_rate": 9.44238707511862e-06, + "loss": 0.645, + "step": 1081 + }, + { + "epoch": 0.8656, + "grad_norm": 0.40754438425649125, + "learning_rate": 9.332739882292752e-06, + "loss": 0.6756, + "step": 1082 + }, + { + "epoch": 0.8664, + "grad_norm": 0.39653427847439454, + "learning_rate": 9.22370186822965e-06, + "loss": 0.7184, + "step": 1083 + }, + { + "epoch": 0.8672, + "grad_norm": 0.43177459051227624, + "learning_rate": 9.115273765538202e-06, + "loss": 0.7636, + "step": 1084 + }, + { + "epoch": 0.868, + "grad_norm": 0.39263907106487356, + "learning_rate": 9.0074563027294e-06, + "loss": 0.7081, + "step": 1085 + }, + { + "epoch": 0.8688, + "grad_norm": 0.3624972677767638, + "learning_rate": 8.900250204211514e-06, + "loss": 0.6399, + "step": 1086 + }, + { + "epoch": 0.8696, + "grad_norm": 0.33843811085319553, + "learning_rate": 8.79365619028507e-06, + "loss": 0.5777, + "step": 1087 + }, + { + "epoch": 0.8704, + "grad_norm": 0.34161536055166386, + "learning_rate": 8.687674977138116e-06, + "loss": 0.5904, + "step": 1088 + }, + { + "epoch": 0.8712, + "grad_norm": 0.44047485203652753, + "learning_rate": 8.582307276841462e-06, + "loss": 0.7361, + "step": 1089 + }, + { + "epoch": 0.872, + "grad_norm": 0.3918072282659997, + "learning_rate": 8.47755379734373e-06, + "loss": 0.6349, + "step": 1090 + }, + { + "epoch": 0.8728, + "grad_norm": 0.3590971052687072, + "learning_rate": 8.37341524246672e-06, + "loss": 0.6231, + "step": 1091 + }, + { + "epoch": 0.8736, + "grad_norm": 0.40635778568154485, + "learning_rate": 8.269892311900696e-06, + "loss": 0.7118, + "step": 1092 + }, + { + "epoch": 0.8744, + "grad_norm": 0.3788357955249413, + "learning_rate": 8.166985701199582e-06, + "loss": 0.6275, + "step": 1093 + }, + { + "epoch": 0.8752, + "grad_norm": 0.3888433203841177, + "learning_rate": 8.064696101776358e-06, + "loss": 0.6681, + "step": 1094 + }, + { + "epoch": 0.876, + "grad_norm": 0.420258808376998, + "learning_rate": 7.963024200898462e-06, + "loss": 0.7226, + "step": 1095 + }, + { + "epoch": 0.8768, + "grad_norm": 0.409072040849215, + "learning_rate": 7.861970681683051e-06, + "loss": 0.709, + "step": 1096 + }, + { + "epoch": 0.8776, + "grad_norm": 0.38157508113901245, + "learning_rate": 7.761536223092458e-06, + "loss": 0.6068, + "step": 1097 + }, + { + "epoch": 0.8784, + "grad_norm": 0.3719039482229343, + "learning_rate": 7.661721499929753e-06, + "loss": 0.6442, + "step": 1098 + }, + { + "epoch": 0.8792, + "grad_norm": 0.4128679484467576, + "learning_rate": 7.562527182833978e-06, + "loss": 0.6828, + "step": 1099 + }, + { + "epoch": 0.88, + "grad_norm": 0.37493907613039723, + "learning_rate": 7.463953938275858e-06, + "loss": 0.6248, + "step": 1100 + }, + { + "epoch": 0.8808, + "grad_norm": 0.47831421098169785, + "learning_rate": 7.366002428553153e-06, + "loss": 0.7748, + "step": 1101 + }, + { + "epoch": 0.8816, + "grad_norm": 0.3944876198518206, + "learning_rate": 7.2686733117863784e-06, + "loss": 0.6075, + "step": 1102 + }, + { + "epoch": 0.8824, + "grad_norm": 0.4602873397181344, + "learning_rate": 7.171967241914224e-06, + "loss": 0.7408, + "step": 1103 + }, + { + "epoch": 0.8832, + "grad_norm": 0.3915401092739435, + "learning_rate": 7.07588486868922e-06, + "loss": 0.6955, + "step": 1104 + }, + { + "epoch": 0.884, + "grad_norm": 0.42777489757566567, + "learning_rate": 6.980426837673437e-06, + "loss": 0.6815, + "step": 1105 + }, + { + "epoch": 0.8848, + "grad_norm": 0.3806850442829798, + "learning_rate": 6.8855937902340576e-06, + "loss": 0.6528, + "step": 1106 + }, + { + "epoch": 0.8856, + "grad_norm": 0.3971572784686375, + "learning_rate": 6.791386363539065e-06, + "loss": 0.657, + "step": 1107 + }, + { + "epoch": 0.8864, + "grad_norm": 0.3682731235206281, + "learning_rate": 6.6978051905530855e-06, + "loss": 0.67, + "step": 1108 + }, + { + "epoch": 0.8872, + "grad_norm": 0.43164071322348224, + "learning_rate": 6.604850900032955e-06, + "loss": 0.6595, + "step": 1109 + }, + { + "epoch": 0.888, + "grad_norm": 0.37823047074358207, + "learning_rate": 6.512524116523633e-06, + "loss": 0.6407, + "step": 1110 + }, + { + "epoch": 0.8888, + "grad_norm": 0.413131874588114, + "learning_rate": 6.420825460353974e-06, + "loss": 0.6021, + "step": 1111 + }, + { + "epoch": 0.8896, + "grad_norm": 0.365271643610721, + "learning_rate": 6.329755547632499e-06, + "loss": 0.6173, + "step": 1112 + }, + { + "epoch": 0.8904, + "grad_norm": 0.37148101784065213, + "learning_rate": 6.239314990243339e-06, + "loss": 0.6373, + "step": 1113 + }, + { + "epoch": 0.8912, + "grad_norm": 0.3545282942252107, + "learning_rate": 6.149504395842087e-06, + "loss": 0.6235, + "step": 1114 + }, + { + "epoch": 0.892, + "grad_norm": 0.34589376857882104, + "learning_rate": 6.0603243678516995e-06, + "loss": 0.6217, + "step": 1115 + }, + { + "epoch": 0.8928, + "grad_norm": 0.37946025391411065, + "learning_rate": 5.971775505458444e-06, + "loss": 0.6679, + "step": 1116 + }, + { + "epoch": 0.8936, + "grad_norm": 0.4179883388920593, + "learning_rate": 5.883858403607967e-06, + "loss": 0.7016, + "step": 1117 + }, + { + "epoch": 0.8944, + "grad_norm": 0.40606496916550056, + "learning_rate": 5.7965736530010916e-06, + "loss": 0.7358, + "step": 1118 + }, + { + "epoch": 0.8952, + "grad_norm": 0.34722654274842113, + "learning_rate": 5.7099218400900716e-06, + "loss": 0.6104, + "step": 1119 + }, + { + "epoch": 0.896, + "grad_norm": 0.5640657778816404, + "learning_rate": 5.623903547074549e-06, + "loss": 0.6893, + "step": 1120 + }, + { + "epoch": 0.8968, + "grad_norm": 0.34887272732285135, + "learning_rate": 5.538519351897575e-06, + "loss": 0.676, + "step": 1121 + }, + { + "epoch": 0.8976, + "grad_norm": 0.42103436552600304, + "learning_rate": 5.453769828241872e-06, + "loss": 0.6671, + "step": 1122 + }, + { + "epoch": 0.8984, + "grad_norm": 0.397458685970188, + "learning_rate": 5.369655545525909e-06, + "loss": 0.6084, + "step": 1123 + }, + { + "epoch": 0.8992, + "grad_norm": 0.4560587003423442, + "learning_rate": 5.286177068899989e-06, + "loss": 0.68, + "step": 1124 + }, + { + "epoch": 0.9, + "grad_norm": 0.38853994918812274, + "learning_rate": 5.2033349592426335e-06, + "loss": 0.6739, + "step": 1125 + }, + { + "epoch": 0.9008, + "grad_norm": 0.4607525754543247, + "learning_rate": 5.121129773156663e-06, + "loss": 0.708, + "step": 1126 + }, + { + "epoch": 0.9016, + "grad_norm": 0.35855112978980025, + "learning_rate": 5.039562062965508e-06, + "loss": 0.6008, + "step": 1127 + }, + { + "epoch": 0.9024, + "grad_norm": 0.5252440713870019, + "learning_rate": 4.95863237670956e-06, + "loss": 0.6639, + "step": 1128 + }, + { + "epoch": 0.9032, + "grad_norm": 0.397145098522711, + "learning_rate": 4.87834125814235e-06, + "loss": 0.6954, + "step": 1129 + }, + { + "epoch": 0.904, + "grad_norm": 0.3947058730584167, + "learning_rate": 4.798689246727006e-06, + "loss": 0.6591, + "step": 1130 + }, + { + "epoch": 0.9048, + "grad_norm": 0.39151628751957473, + "learning_rate": 4.719676877632639e-06, + "loss": 0.6555, + "step": 1131 + }, + { + "epoch": 0.9056, + "grad_norm": 0.3912418614245154, + "learning_rate": 4.641304681730641e-06, + "loss": 0.6689, + "step": 1132 + }, + { + "epoch": 0.9064, + "grad_norm": 0.4313141016146209, + "learning_rate": 4.563573185591219e-06, + "loss": 0.7063, + "step": 1133 + }, + { + "epoch": 0.9072, + "grad_norm": 0.4077614627946444, + "learning_rate": 4.486482911479839e-06, + "loss": 0.6593, + "step": 1134 + }, + { + "epoch": 0.908, + "grad_norm": 0.35297209831565624, + "learning_rate": 4.4100343773536225e-06, + "loss": 0.6551, + "step": 1135 + }, + { + "epoch": 0.9088, + "grad_norm": 0.37755494085747254, + "learning_rate": 4.3342280968580285e-06, + "loss": 0.6392, + "step": 1136 + }, + { + "epoch": 0.9096, + "grad_norm": 0.43709833891458216, + "learning_rate": 4.259064579323302e-06, + "loss": 0.6573, + "step": 1137 + }, + { + "epoch": 0.9104, + "grad_norm": 0.3974583590147279, + "learning_rate": 4.184544329761009e-06, + "loss": 0.6537, + "step": 1138 + }, + { + "epoch": 0.9112, + "grad_norm": 0.4071793814200911, + "learning_rate": 4.1106678488607495e-06, + "loss": 0.6909, + "step": 1139 + }, + { + "epoch": 0.912, + "grad_norm": 0.3601247563476641, + "learning_rate": 4.037435632986786e-06, + "loss": 0.5835, + "step": 1140 + }, + { + "epoch": 0.9128, + "grad_norm": 0.4482109248429826, + "learning_rate": 3.964848174174541e-06, + "loss": 0.6992, + "step": 1141 + }, + { + "epoch": 0.9136, + "grad_norm": 0.46998382865881133, + "learning_rate": 3.892905960127546e-06, + "loss": 0.6443, + "step": 1142 + }, + { + "epoch": 0.9144, + "grad_norm": 0.3357347367728721, + "learning_rate": 3.821609474213983e-06, + "loss": 0.613, + "step": 1143 + }, + { + "epoch": 0.9152, + "grad_norm": 0.3616675167607055, + "learning_rate": 3.750959195463466e-06, + "loss": 0.6755, + "step": 1144 + }, + { + "epoch": 0.916, + "grad_norm": 0.36199504917256037, + "learning_rate": 3.6809555985639068e-06, + "loss": 0.6938, + "step": 1145 + }, + { + "epoch": 0.9168, + "grad_norm": 0.38089364449249724, + "learning_rate": 3.611599153858214e-06, + "loss": 0.5978, + "step": 1146 + }, + { + "epoch": 0.9176, + "grad_norm": 0.3715196558561812, + "learning_rate": 3.5428903273411863e-06, + "loss": 0.6615, + "step": 1147 + }, + { + "epoch": 0.9184, + "grad_norm": 0.36779570170659703, + "learning_rate": 3.4748295806564356e-06, + "loss": 0.639, + "step": 1148 + }, + { + "epoch": 0.9192, + "grad_norm": 0.38633718133207556, + "learning_rate": 3.40741737109318e-06, + "loss": 0.6676, + "step": 1149 + }, + { + "epoch": 0.92, + "grad_norm": 0.40549553495675567, + "learning_rate": 3.3406541515832003e-06, + "loss": 0.6401, + "step": 1150 + }, + { + "epoch": 0.9208, + "grad_norm": 0.43144403204092113, + "learning_rate": 3.2745403706978872e-06, + "loss": 0.7053, + "step": 1151 + }, + { + "epoch": 0.9216, + "grad_norm": 0.4141940362584789, + "learning_rate": 3.209076472645112e-06, + "loss": 0.6576, + "step": 1152 + }, + { + "epoch": 0.9224, + "grad_norm": 0.4351045764338084, + "learning_rate": 3.1442628972662704e-06, + "loss": 0.651, + "step": 1153 + }, + { + "epoch": 0.9232, + "grad_norm": 0.3679347474501111, + "learning_rate": 3.0801000800333877e-06, + "loss": 0.6398, + "step": 1154 + }, + { + "epoch": 0.924, + "grad_norm": 0.3702780498157318, + "learning_rate": 3.0165884520461316e-06, + "loss": 0.5921, + "step": 1155 + }, + { + "epoch": 0.9248, + "grad_norm": 0.3713791644136914, + "learning_rate": 2.9537284400289355e-06, + "loss": 0.5971, + "step": 1156 + }, + { + "epoch": 0.9256, + "grad_norm": 0.36455460169278, + "learning_rate": 2.8915204663281013e-06, + "loss": 0.5863, + "step": 1157 + }, + { + "epoch": 0.9264, + "grad_norm": 0.4040556238367524, + "learning_rate": 2.8299649489090475e-06, + "loss": 0.6733, + "step": 1158 + }, + { + "epoch": 0.9272, + "grad_norm": 0.4197533321599414, + "learning_rate": 2.7690623013533976e-06, + "loss": 0.6676, + "step": 1159 + }, + { + "epoch": 0.928, + "grad_norm": 0.4440182289183848, + "learning_rate": 2.708812932856253e-06, + "loss": 0.7207, + "step": 1160 + }, + { + "epoch": 0.9288, + "grad_norm": 0.4413081297715539, + "learning_rate": 2.649217248223468e-06, + "loss": 0.6119, + "step": 1161 + }, + { + "epoch": 0.9296, + "grad_norm": 0.41931313708560863, + "learning_rate": 2.590275647868867e-06, + "loss": 0.6595, + "step": 1162 + }, + { + "epoch": 0.9304, + "grad_norm": 0.4246446041542137, + "learning_rate": 2.5319885278115906e-06, + "loss": 0.6851, + "step": 1163 + }, + { + "epoch": 0.9312, + "grad_norm": 0.4277447086602959, + "learning_rate": 2.4743562796734622e-06, + "loss": 0.6735, + "step": 1164 + }, + { + "epoch": 0.932, + "grad_norm": 0.41970169765480425, + "learning_rate": 2.4173792906762804e-06, + "loss": 0.675, + "step": 1165 + }, + { + "epoch": 0.9328, + "grad_norm": 0.42411333899106246, + "learning_rate": 2.3610579436393e-06, + "loss": 0.7166, + "step": 1166 + }, + { + "epoch": 0.9336, + "grad_norm": 0.4161922519419455, + "learning_rate": 2.3053926169765984e-06, + "loss": 0.6544, + "step": 1167 + }, + { + "epoch": 0.9344, + "grad_norm": 0.3939621504358724, + "learning_rate": 2.250383684694579e-06, + "loss": 0.6234, + "step": 1168 + }, + { + "epoch": 0.9352, + "grad_norm": 0.37831960885850807, + "learning_rate": 2.1960315163894075e-06, + "loss": 0.6426, + "step": 1169 + }, + { + "epoch": 0.936, + "grad_norm": 0.41250023097697036, + "learning_rate": 2.1423364772445887e-06, + "loss": 0.7239, + "step": 1170 + }, + { + "epoch": 0.9368, + "grad_norm": 0.4886043521145033, + "learning_rate": 2.0892989280284823e-06, + "loss": 0.6619, + "step": 1171 + }, + { + "epoch": 0.9376, + "grad_norm": 0.40061777538209004, + "learning_rate": 2.036919225091827e-06, + "loss": 0.6733, + "step": 1172 + }, + { + "epoch": 0.9384, + "grad_norm": 0.38345316709018273, + "learning_rate": 1.9851977203654835e-06, + "loss": 0.6978, + "step": 1173 + }, + { + "epoch": 0.9392, + "grad_norm": 0.4299670022115295, + "learning_rate": 1.9341347613579087e-06, + "loss": 0.693, + "step": 1174 + }, + { + "epoch": 0.94, + "grad_norm": 0.373034908425949, + "learning_rate": 1.8837306911529184e-06, + "loss": 0.6308, + "step": 1175 + }, + { + "epoch": 0.9408, + "grad_norm": 0.3890648152286451, + "learning_rate": 1.8339858484073935e-06, + "loss": 0.7081, + "step": 1176 + }, + { + "epoch": 0.9416, + "grad_norm": 0.38355611150181956, + "learning_rate": 1.7849005673489127e-06, + "loss": 0.6301, + "step": 1177 + }, + { + "epoch": 0.9424, + "grad_norm": 0.41208221462674705, + "learning_rate": 1.7364751777736332e-06, + "loss": 0.6756, + "step": 1178 + }, + { + "epoch": 0.9432, + "grad_norm": 0.427348556462753, + "learning_rate": 1.6887100050439587e-06, + "loss": 0.6884, + "step": 1179 + }, + { + "epoch": 0.944, + "grad_norm": 0.40449145649417545, + "learning_rate": 1.6416053700863964e-06, + "loss": 0.685, + "step": 1180 + }, + { + "epoch": 0.9448, + "grad_norm": 0.41362539723590275, + "learning_rate": 1.595161589389449e-06, + "loss": 0.6555, + "step": 1181 + }, + { + "epoch": 0.9456, + "grad_norm": 0.5162633241322898, + "learning_rate": 1.5493789750014031e-06, + "loss": 0.678, + "step": 1182 + }, + { + "epoch": 0.9464, + "grad_norm": 2.7048013739560886, + "learning_rate": 1.5042578345283108e-06, + "loss": 0.6383, + "step": 1183 + }, + { + "epoch": 0.9472, + "grad_norm": 0.38623038093351664, + "learning_rate": 1.459798471131868e-06, + "loss": 0.6599, + "step": 1184 + }, + { + "epoch": 0.948, + "grad_norm": 0.41107408813820095, + "learning_rate": 1.4160011835273934e-06, + "loss": 0.7122, + "step": 1185 + }, + { + "epoch": 0.9488, + "grad_norm": 0.3420118306844945, + "learning_rate": 1.3728662659818204e-06, + "loss": 0.5963, + "step": 1186 + }, + { + "epoch": 0.9496, + "grad_norm": 0.44860775950775655, + "learning_rate": 1.3303940083117527e-06, + "loss": 0.7645, + "step": 1187 + }, + { + "epoch": 0.9504, + "grad_norm": 0.41956358789991066, + "learning_rate": 1.2885846958814673e-06, + "loss": 0.6787, + "step": 1188 + }, + { + "epoch": 0.9512, + "grad_norm": 0.3996878685357968, + "learning_rate": 1.2474386096010039e-06, + "loss": 0.631, + "step": 1189 + }, + { + "epoch": 0.952, + "grad_norm": 0.4306761188580242, + "learning_rate": 1.2069560259243328e-06, + "loss": 0.6034, + "step": 1190 + }, + { + "epoch": 0.9528, + "grad_norm": 0.33380908699505984, + "learning_rate": 1.1671372168474138e-06, + "loss": 0.6212, + "step": 1191 + }, + { + "epoch": 0.9536, + "grad_norm": 0.380156098174372, + "learning_rate": 1.1279824499064396e-06, + "loss": 0.6877, + "step": 1192 + }, + { + "epoch": 0.9544, + "grad_norm": 0.48012803001265547, + "learning_rate": 1.089491988176017e-06, + "loss": 0.677, + "step": 1193 + }, + { + "epoch": 0.9552, + "grad_norm": 0.36836128639431226, + "learning_rate": 1.0516660902673448e-06, + "loss": 0.6486, + "step": 1194 + }, + { + "epoch": 0.956, + "grad_norm": 0.3702855334521945, + "learning_rate": 1.014505010326583e-06, + "loss": 0.6211, + "step": 1195 + }, + { + "epoch": 0.9568, + "grad_norm": 0.3680229815646194, + "learning_rate": 9.780089980330642e-07, + "loss": 0.5959, + "step": 1196 + }, + { + "epoch": 0.9576, + "grad_norm": 0.39723037587968774, + "learning_rate": 9.421782985976068e-07, + "loss": 0.5769, + "step": 1197 + }, + { + "epoch": 0.9584, + "grad_norm": 0.3994223107492055, + "learning_rate": 9.070131527609604e-07, + "loss": 0.6779, + "step": 1198 + }, + { + "epoch": 0.9592, + "grad_norm": 0.4039869427449052, + "learning_rate": 8.725137967920738e-07, + "loss": 0.7039, + "step": 1199 + }, + { + "epoch": 0.96, + "grad_norm": 0.4103412609606496, + "learning_rate": 8.386804624865851e-07, + "loss": 0.6718, + "step": 1200 + }, + { + "epoch": 0.9608, + "grad_norm": 0.3690177610632917, + "learning_rate": 8.055133771652345e-07, + "loss": 0.6632, + "step": 1201 + }, + { + "epoch": 0.9616, + "grad_norm": 0.4219196717424533, + "learning_rate": 7.730127636723539e-07, + "loss": 0.6655, + "step": 1202 + }, + { + "epoch": 0.9624, + "grad_norm": 0.4018337233141601, + "learning_rate": 7.411788403743237e-07, + "loss": 0.6778, + "step": 1203 + }, + { + "epoch": 0.9632, + "grad_norm": 0.35681086222972497, + "learning_rate": 7.100118211581852e-07, + "loss": 0.6222, + "step": 1204 + }, + { + "epoch": 0.964, + "grad_norm": 0.3292173339508648, + "learning_rate": 6.7951191543012e-07, + "loss": 0.5913, + "step": 1205 + }, + { + "epoch": 0.9648, + "grad_norm": 0.3936008885386386, + "learning_rate": 6.496793281141056e-07, + "loss": 0.6426, + "step": 1206 + }, + { + "epoch": 0.9656, + "grad_norm": 0.43919355223463497, + "learning_rate": 6.205142596505176e-07, + "loss": 0.7145, + "step": 1207 + }, + { + "epoch": 0.9664, + "grad_norm": 0.3932163380580836, + "learning_rate": 5.920169059947411e-07, + "loss": 0.6133, + "step": 1208 + }, + { + "epoch": 0.9672, + "grad_norm": 0.3636623163514951, + "learning_rate": 5.64187458615939e-07, + "loss": 0.6347, + "step": 1209 + }, + { + "epoch": 0.968, + "grad_norm": 0.39874386848746346, + "learning_rate": 5.370261044956971e-07, + "loss": 0.6228, + "step": 1210 + }, + { + "epoch": 0.9688, + "grad_norm": 0.39882330921672243, + "learning_rate": 5.105330261267916e-07, + "loss": 0.6561, + "step": 1211 + }, + { + "epoch": 0.9696, + "grad_norm": 0.5197317454420752, + "learning_rate": 4.847084015119574e-07, + "loss": 0.7055, + "step": 1212 + }, + { + "epoch": 0.9704, + "grad_norm": 0.4221316807203784, + "learning_rate": 4.5955240416271084e-07, + "loss": 0.7113, + "step": 1213 + }, + { + "epoch": 0.9712, + "grad_norm": 0.4142165008630827, + "learning_rate": 4.3506520309813947e-07, + "loss": 0.6967, + "step": 1214 + }, + { + "epoch": 0.972, + "grad_norm": 0.370064302517949, + "learning_rate": 4.112469628438365e-07, + "loss": 0.6371, + "step": 1215 + }, + { + "epoch": 0.9728, + "grad_norm": 0.3751873246821837, + "learning_rate": 3.8809784343072366e-07, + "loss": 0.673, + "step": 1216 + }, + { + "epoch": 0.9736, + "grad_norm": 0.45211984651576587, + "learning_rate": 3.6561800039403016e-07, + "loss": 0.6897, + "step": 1217 + }, + { + "epoch": 0.9744, + "grad_norm": 0.39165760279121553, + "learning_rate": 3.4380758477219333e-07, + "loss": 0.612, + "step": 1218 + }, + { + "epoch": 0.9752, + "grad_norm": 0.3748915391069381, + "learning_rate": 3.2266674310589273e-07, + "loss": 0.6178, + "step": 1219 + }, + { + "epoch": 0.976, + "grad_norm": 0.43537912317156524, + "learning_rate": 3.0219561743707326e-07, + "loss": 0.6543, + "step": 1220 + }, + { + "epoch": 0.9768, + "grad_norm": 0.46162976257051774, + "learning_rate": 2.8239434530792365e-07, + "loss": 0.6925, + "step": 1221 + }, + { + "epoch": 0.9776, + "grad_norm": 0.4029308621400298, + "learning_rate": 2.6326305976001055e-07, + "loss": 0.6833, + "step": 1222 + }, + { + "epoch": 0.9784, + "grad_norm": 0.33775265813241384, + "learning_rate": 2.448018893333681e-07, + "loss": 0.5976, + "step": 1223 + }, + { + "epoch": 0.9792, + "grad_norm": 0.37593495620956857, + "learning_rate": 2.2701095806565432e-07, + "loss": 0.6027, + "step": 1224 + }, + { + "epoch": 0.98, + "grad_norm": 0.35309989717011225, + "learning_rate": 2.098903854912515e-07, + "loss": 0.6301, + "step": 1225 + }, + { + "epoch": 0.9808, + "grad_norm": 0.39569637592322077, + "learning_rate": 1.9344028664056713e-07, + "loss": 0.6415, + "step": 1226 + }, + { + "epoch": 0.9816, + "grad_norm": 0.43208602953113595, + "learning_rate": 1.7766077203915655e-07, + "loss": 0.7, + "step": 1227 + }, + { + "epoch": 0.9824, + "grad_norm": 0.37194342945756875, + "learning_rate": 1.6255194770704586e-07, + "loss": 0.5833, + "step": 1228 + }, + { + "epoch": 0.9832, + "grad_norm": 0.38416040502013293, + "learning_rate": 1.481139151579991e-07, + "loss": 0.6857, + "step": 1229 + }, + { + "epoch": 0.984, + "grad_norm": 0.335484737327022, + "learning_rate": 1.3434677139885222e-07, + "loss": 0.6161, + "step": 1230 + }, + { + "epoch": 0.9848, + "grad_norm": 0.38061953088212563, + "learning_rate": 1.2125060892881346e-07, + "loss": 0.6358, + "step": 1231 + }, + { + "epoch": 0.9856, + "grad_norm": 0.33884120897595343, + "learning_rate": 1.0882551573891953e-07, + "loss": 0.6345, + "step": 1232 + }, + { + "epoch": 0.9864, + "grad_norm": 0.3934930628376492, + "learning_rate": 9.707157531134713e-08, + "loss": 0.6609, + "step": 1233 + }, + { + "epoch": 0.9872, + "grad_norm": 0.39204428051475615, + "learning_rate": 8.598886661895788e-08, + "loss": 0.7247, + "step": 1234 + }, + { + "epoch": 0.988, + "grad_norm": 0.39874105434440155, + "learning_rate": 7.557746412468758e-08, + "loss": 0.7189, + "step": 1235 + }, + { + "epoch": 0.9888, + "grad_norm": 0.39100579087505305, + "learning_rate": 6.583743778106887e-08, + "loss": 0.6408, + "step": 1236 + }, + { + "epoch": 0.9896, + "grad_norm": 0.36761220334225153, + "learning_rate": 5.6768853029787184e-08, + "loss": 0.6091, + "step": 1237 + }, + { + "epoch": 0.9904, + "grad_norm": 0.3700637554607591, + "learning_rate": 4.837177080119215e-08, + "loss": 0.6476, + "step": 1238 + }, + { + "epoch": 0.9912, + "grad_norm": 0.37325222898220134, + "learning_rate": 4.064624751394242e-08, + "loss": 0.6141, + "step": 1239 + }, + { + "epoch": 0.992, + "grad_norm": 0.3366722732870449, + "learning_rate": 3.359233507459481e-08, + "loss": 0.6474, + "step": 1240 + }, + { + "epoch": 0.9928, + "grad_norm": 0.4155578015379489, + "learning_rate": 2.7210080877237976e-08, + "loss": 0.6683, + "step": 1241 + }, + { + "epoch": 0.9936, + "grad_norm": 0.3892625290957849, + "learning_rate": 2.1499527803214846e-08, + "loss": 0.6482, + "step": 1242 + }, + { + "epoch": 0.9944, + "grad_norm": 0.4087626640032446, + "learning_rate": 1.646071422083395e-08, + "loss": 0.6054, + "step": 1243 + }, + { + "epoch": 0.9952, + "grad_norm": 0.6956867107166412, + "learning_rate": 1.209367398504746e-08, + "loss": 0.677, + "step": 1244 + }, + { + "epoch": 0.996, + "grad_norm": 0.4086757171312014, + "learning_rate": 8.398436437317969e-09, + "loss": 0.6356, + "step": 1245 + }, + { + "epoch": 0.9968, + "grad_norm": 0.4111855344094051, + "learning_rate": 5.375026405352035e-09, + "loss": 0.7552, + "step": 1246 + }, + { + "epoch": 0.9976, + "grad_norm": 0.38432449021004866, + "learning_rate": 3.023464202944748e-09, + "loss": 0.6583, + "step": 1247 + }, + { + "epoch": 0.9984, + "grad_norm": 0.4085405245045004, + "learning_rate": 1.3437656298687097e-09, + "loss": 0.6984, + "step": 1248 + }, + { + "epoch": 0.9992, + "grad_norm": 0.5039455367154145, + "learning_rate": 3.3594197175190745e-10, + "loss": 0.7574, + "step": 1249 + }, + { + "epoch": 1.0, + "grad_norm": 0.47813106625043633, + "learning_rate": 0.0, + "loss": 0.7561, + "step": 1250 + }, + { + "epoch": 1.0, + "step": 1250, + "total_flos": 1128654464909312.0, + "train_loss": 0.7288802340984345, + "train_runtime": 19621.0755, + "train_samples_per_second": 1.019, + "train_steps_per_second": 0.064 + } + ], + "logging_steps": 1.0, + "max_steps": 1250, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1128654464909312.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_3_epochs_1_GA_2_lora/README.md b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_3_epochs_1_GA_2_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_3_epochs_1_GA_2_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_3_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_3_epochs_1_GA_2_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..7420ff54ab7c3b9766a89c31fa2c7cb1f7c3b514 --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_3_epochs_1_GA_2_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "up_proj", + "o_proj", + "q_proj", + "v_proj", + "gate_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..afdaf70f264f5b8fea8bc868b3ba8122d7d511a1 --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:125fefd8d9ea1985becd336ceaa4bc360bbf3e48f1c6027ac96ec281a8a0a522 +size 671150064 diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_3_epochs_1_GA_2_lora/config.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_3_epochs_1_GA_2_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_3_epochs_1_GA_2_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..a251319c76b0156dc0be44c42f00bbd7b0aaf063 --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8495835d7af9abaf3748c69217f36f0d11b20f2f2dbd26d9f978efd43acd004 +size 918507402 diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_3_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_3_epochs_1_GA_2_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c6795165eef5af58159a34f34709057c673a03be --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_20000_repeat_3_epochs_1_GA_2_lora/trainer_state.json @@ -0,0 +1,8792 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 1250, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0008, + "grad_norm": 0.7888862334183341, + "learning_rate": 5.263157894736842e-06, + "loss": 1.2004, + "step": 1 + }, + { + "epoch": 0.0016, + "grad_norm": 0.8072098866777677, + "learning_rate": 1.0526315789473684e-05, + "loss": 1.3362, + "step": 2 + }, + { + "epoch": 0.0024, + "grad_norm": 0.934453822221645, + "learning_rate": 1.5789473684210526e-05, + "loss": 1.4111, + "step": 3 + }, + { + "epoch": 0.0032, + "grad_norm": 0.7995143788702597, + "learning_rate": 2.105263157894737e-05, + "loss": 1.3153, + "step": 4 + }, + { + "epoch": 0.004, + "grad_norm": 0.8045228585693739, + "learning_rate": 2.6315789473684212e-05, + "loss": 1.3712, + "step": 5 + }, + { + "epoch": 0.0048, + "grad_norm": 0.6936886266377906, + "learning_rate": 3.157894736842105e-05, + "loss": 1.1641, + "step": 6 + }, + { + "epoch": 0.0056, + "grad_norm": 0.8405796761351603, + "learning_rate": 3.6842105263157895e-05, + "loss": 1.2383, + "step": 7 + }, + { + "epoch": 0.0064, + "grad_norm": 0.6769420946388747, + "learning_rate": 4.210526315789474e-05, + "loss": 1.0148, + "step": 8 + }, + { + "epoch": 0.0072, + "grad_norm": 0.6733265030653172, + "learning_rate": 4.736842105263158e-05, + "loss": 0.9795, + "step": 9 + }, + { + "epoch": 0.008, + "grad_norm": 0.7335159233425952, + "learning_rate": 5.2631578947368424e-05, + "loss": 1.0197, + "step": 10 + }, + { + "epoch": 0.0088, + "grad_norm": 0.7045436470282542, + "learning_rate": 5.789473684210527e-05, + "loss": 0.9439, + "step": 11 + }, + { + "epoch": 0.0096, + "grad_norm": 0.7589080717502401, + "learning_rate": 6.31578947368421e-05, + "loss": 1.0242, + "step": 12 + }, + { + "epoch": 0.0104, + "grad_norm": 0.653064525079337, + "learning_rate": 6.842105263157895e-05, + "loss": 0.9467, + "step": 13 + }, + { + "epoch": 0.0112, + "grad_norm": 0.5349660353486346, + "learning_rate": 7.368421052631579e-05, + "loss": 0.851, + "step": 14 + }, + { + "epoch": 0.012, + "grad_norm": 0.6474693250698709, + "learning_rate": 7.894736842105263e-05, + "loss": 0.9351, + "step": 15 + }, + { + "epoch": 0.0128, + "grad_norm": 0.5013787309930555, + "learning_rate": 8.421052631578948e-05, + "loss": 0.8601, + "step": 16 + }, + { + "epoch": 0.0136, + "grad_norm": 0.6108292538136464, + "learning_rate": 8.947368421052632e-05, + "loss": 0.9248, + "step": 17 + }, + { + "epoch": 0.0144, + "grad_norm": 0.5445189776654845, + "learning_rate": 9.473684210526316e-05, + "loss": 0.8735, + "step": 18 + }, + { + "epoch": 0.0152, + "grad_norm": 0.4929435837776125, + "learning_rate": 0.0001, + "loss": 0.8435, + "step": 19 + }, + { + "epoch": 0.016, + "grad_norm": 0.5305886244534136, + "learning_rate": 0.00010526315789473685, + "loss": 0.9159, + "step": 20 + }, + { + "epoch": 0.0168, + "grad_norm": 0.43968440859680985, + "learning_rate": 0.0001105263157894737, + "loss": 0.9122, + "step": 21 + }, + { + "epoch": 0.0176, + "grad_norm": 0.5173469539153005, + "learning_rate": 0.00011578947368421053, + "loss": 0.8479, + "step": 22 + }, + { + "epoch": 0.0184, + "grad_norm": 0.5626542058275531, + "learning_rate": 0.00012105263157894738, + "loss": 0.9466, + "step": 23 + }, + { + "epoch": 0.0192, + "grad_norm": 0.5841597173947826, + "learning_rate": 0.0001263157894736842, + "loss": 1.0017, + "step": 24 + }, + { + "epoch": 0.02, + "grad_norm": 0.4951786473740263, + "learning_rate": 0.00013157894736842108, + "loss": 0.8452, + "step": 25 + }, + { + "epoch": 0.0208, + "grad_norm": 0.5849316515907934, + "learning_rate": 0.0001368421052631579, + "loss": 1.0015, + "step": 26 + }, + { + "epoch": 0.0216, + "grad_norm": 0.5042752384969739, + "learning_rate": 0.00014210526315789474, + "loss": 0.9226, + "step": 27 + }, + { + "epoch": 0.0224, + "grad_norm": 0.4853631688719905, + "learning_rate": 0.00014736842105263158, + "loss": 0.8992, + "step": 28 + }, + { + "epoch": 0.0232, + "grad_norm": 0.531795828719562, + "learning_rate": 0.00015263157894736845, + "loss": 0.9509, + "step": 29 + }, + { + "epoch": 0.024, + "grad_norm": 0.6638752979985171, + "learning_rate": 0.00015789473684210527, + "loss": 0.9347, + "step": 30 + }, + { + "epoch": 0.0248, + "grad_norm": 0.45278275207606566, + "learning_rate": 0.0001631578947368421, + "loss": 0.8278, + "step": 31 + }, + { + "epoch": 0.0256, + "grad_norm": 0.5132822719865044, + "learning_rate": 0.00016842105263157895, + "loss": 0.8264, + "step": 32 + }, + { + "epoch": 0.0264, + "grad_norm": 0.5022331101351186, + "learning_rate": 0.0001736842105263158, + "loss": 0.9087, + "step": 33 + }, + { + "epoch": 0.0272, + "grad_norm": 0.46575474125212074, + "learning_rate": 0.00017894736842105264, + "loss": 0.8606, + "step": 34 + }, + { + "epoch": 0.028, + "grad_norm": 0.5391928815995534, + "learning_rate": 0.00018421052631578948, + "loss": 0.8977, + "step": 35 + }, + { + "epoch": 0.0288, + "grad_norm": 0.44692616061569684, + "learning_rate": 0.00018947368421052632, + "loss": 0.8352, + "step": 36 + }, + { + "epoch": 0.0296, + "grad_norm": 0.44614250941619077, + "learning_rate": 0.00019473684210526317, + "loss": 0.8583, + "step": 37 + }, + { + "epoch": 0.0304, + "grad_norm": 0.433289640457569, + "learning_rate": 0.0002, + "loss": 0.7968, + "step": 38 + }, + { + "epoch": 0.0312, + "grad_norm": 0.44517420661683976, + "learning_rate": 0.00019999966405802826, + "loss": 0.8654, + "step": 39 + }, + { + "epoch": 0.032, + "grad_norm": 0.48805276441168305, + "learning_rate": 0.00019999865623437013, + "loss": 0.8959, + "step": 40 + }, + { + "epoch": 0.0328, + "grad_norm": 0.49550073472400363, + "learning_rate": 0.00019999697653579705, + "loss": 0.8294, + "step": 41 + }, + { + "epoch": 0.0336, + "grad_norm": 0.439426740092416, + "learning_rate": 0.00019999462497359466, + "loss": 0.863, + "step": 42 + }, + { + "epoch": 0.0344, + "grad_norm": 0.47191451920826855, + "learning_rate": 0.0001999916015635627, + "loss": 0.8798, + "step": 43 + }, + { + "epoch": 0.0352, + "grad_norm": 0.46627051102404476, + "learning_rate": 0.00019998790632601496, + "loss": 0.8755, + "step": 44 + }, + { + "epoch": 0.036, + "grad_norm": 0.42721036955736064, + "learning_rate": 0.00019998353928577919, + "loss": 0.778, + "step": 45 + }, + { + "epoch": 0.0368, + "grad_norm": 0.48334276256170616, + "learning_rate": 0.0001999785004721968, + "loss": 0.7948, + "step": 46 + }, + { + "epoch": 0.0376, + "grad_norm": 0.4557680208107082, + "learning_rate": 0.0001999727899191228, + "loss": 0.8604, + "step": 47 + }, + { + "epoch": 0.0384, + "grad_norm": 0.42223230996410294, + "learning_rate": 0.00019996640766492543, + "loss": 0.8521, + "step": 48 + }, + { + "epoch": 0.0392, + "grad_norm": 0.4905567778621406, + "learning_rate": 0.00019995935375248606, + "loss": 0.9098, + "step": 49 + }, + { + "epoch": 0.04, + "grad_norm": 0.4242061181062434, + "learning_rate": 0.00019995162822919883, + "loss": 0.7911, + "step": 50 + }, + { + "epoch": 0.0408, + "grad_norm": 0.3886571487835808, + "learning_rate": 0.00019994323114697022, + "loss": 0.7497, + "step": 51 + }, + { + "epoch": 0.0416, + "grad_norm": 0.44997119706190275, + "learning_rate": 0.00019993416256221895, + "loss": 0.8616, + "step": 52 + }, + { + "epoch": 0.0424, + "grad_norm": 0.4780377708601232, + "learning_rate": 0.0001999244225358753, + "loss": 0.8795, + "step": 53 + }, + { + "epoch": 0.0432, + "grad_norm": 0.4134394710050785, + "learning_rate": 0.00019991401113338104, + "loss": 0.8178, + "step": 54 + }, + { + "epoch": 0.044, + "grad_norm": 0.4549312991117919, + "learning_rate": 0.00019990292842468868, + "loss": 0.8209, + "step": 55 + }, + { + "epoch": 0.0448, + "grad_norm": 0.4943475167911096, + "learning_rate": 0.00019989117448426108, + "loss": 0.8577, + "step": 56 + }, + { + "epoch": 0.0456, + "grad_norm": 0.3981263934035132, + "learning_rate": 0.0001998787493910712, + "loss": 0.8101, + "step": 57 + }, + { + "epoch": 0.0464, + "grad_norm": 0.4791596433231652, + "learning_rate": 0.00019986565322860115, + "loss": 0.9053, + "step": 58 + }, + { + "epoch": 0.0472, + "grad_norm": 0.4301117473556097, + "learning_rate": 0.000199851886084842, + "loss": 0.8568, + "step": 59 + }, + { + "epoch": 0.048, + "grad_norm": 0.4553403450217621, + "learning_rate": 0.00019983744805229296, + "loss": 0.8905, + "step": 60 + }, + { + "epoch": 0.0488, + "grad_norm": 0.47092985413048144, + "learning_rate": 0.00019982233922796085, + "loss": 0.8354, + "step": 61 + }, + { + "epoch": 0.0496, + "grad_norm": 0.5091746908825188, + "learning_rate": 0.00019980655971335945, + "loss": 0.9167, + "step": 62 + }, + { + "epoch": 0.0504, + "grad_norm": 0.5308818498640732, + "learning_rate": 0.00019979010961450878, + "loss": 0.903, + "step": 63 + }, + { + "epoch": 0.0512, + "grad_norm": 0.4467185225835727, + "learning_rate": 0.00019977298904193437, + "loss": 0.8313, + "step": 64 + }, + { + "epoch": 0.052, + "grad_norm": 0.4802224442428892, + "learning_rate": 0.00019975519811066663, + "loss": 0.85, + "step": 65 + }, + { + "epoch": 0.0528, + "grad_norm": 0.44615653657864884, + "learning_rate": 0.00019973673694024, + "loss": 0.8224, + "step": 66 + }, + { + "epoch": 0.0536, + "grad_norm": 0.4450950170946168, + "learning_rate": 0.0001997176056546921, + "loss": 0.8457, + "step": 67 + }, + { + "epoch": 0.0544, + "grad_norm": 0.43739704558596904, + "learning_rate": 0.00019969780438256293, + "loss": 0.871, + "step": 68 + }, + { + "epoch": 0.0552, + "grad_norm": 0.42492503347720234, + "learning_rate": 0.0001996773332568941, + "loss": 0.8066, + "step": 69 + }, + { + "epoch": 0.056, + "grad_norm": 0.39277555946078135, + "learning_rate": 0.0001996561924152278, + "loss": 0.7761, + "step": 70 + }, + { + "epoch": 0.0568, + "grad_norm": 0.4547785885866476, + "learning_rate": 0.00019963438199960599, + "loss": 0.8484, + "step": 71 + }, + { + "epoch": 0.0576, + "grad_norm": 0.5087443545551769, + "learning_rate": 0.0001996119021565693, + "loss": 0.9514, + "step": 72 + }, + { + "epoch": 0.0584, + "grad_norm": 0.4421694324028681, + "learning_rate": 0.00019958875303715615, + "loss": 0.8356, + "step": 73 + }, + { + "epoch": 0.0592, + "grad_norm": 0.3693836667074255, + "learning_rate": 0.0001995649347969019, + "loss": 0.7293, + "step": 74 + }, + { + "epoch": 0.06, + "grad_norm": 0.4545505782833525, + "learning_rate": 0.0001995404475958373, + "loss": 0.8674, + "step": 75 + }, + { + "epoch": 0.0608, + "grad_norm": 0.5149111832904231, + "learning_rate": 0.00019951529159848805, + "loss": 0.8317, + "step": 76 + }, + { + "epoch": 0.0616, + "grad_norm": 0.4222308910851471, + "learning_rate": 0.0001994894669738732, + "loss": 0.7796, + "step": 77 + }, + { + "epoch": 0.0624, + "grad_norm": 0.42773268962424477, + "learning_rate": 0.00019946297389550433, + "loss": 0.8301, + "step": 78 + }, + { + "epoch": 0.0632, + "grad_norm": 0.4510758829243804, + "learning_rate": 0.0001994358125413841, + "loss": 0.8175, + "step": 79 + }, + { + "epoch": 0.064, + "grad_norm": 0.4386879178055002, + "learning_rate": 0.00019940798309400526, + "loss": 0.883, + "step": 80 + }, + { + "epoch": 0.0648, + "grad_norm": 0.4247094243098374, + "learning_rate": 0.0001993794857403495, + "loss": 0.8264, + "step": 81 + }, + { + "epoch": 0.0656, + "grad_norm": 0.39731806722513374, + "learning_rate": 0.0001993503206718859, + "loss": 0.8113, + "step": 82 + }, + { + "epoch": 0.0664, + "grad_norm": 0.43059843269685766, + "learning_rate": 0.0001993204880845699, + "loss": 0.8204, + "step": 83 + }, + { + "epoch": 0.0672, + "grad_norm": 0.46411656604260365, + "learning_rate": 0.00019928998817884182, + "loss": 0.9093, + "step": 84 + }, + { + "epoch": 0.068, + "grad_norm": 0.42444777199515643, + "learning_rate": 0.00019925882115962568, + "loss": 0.7659, + "step": 85 + }, + { + "epoch": 0.0688, + "grad_norm": 0.4126543869436709, + "learning_rate": 0.00019922698723632767, + "loss": 0.7975, + "step": 86 + }, + { + "epoch": 0.0696, + "grad_norm": 0.4749702188611372, + "learning_rate": 0.00019919448662283478, + "loss": 0.85, + "step": 87 + }, + { + "epoch": 0.0704, + "grad_norm": 0.3956659659924778, + "learning_rate": 0.00019916131953751342, + "loss": 0.7673, + "step": 88 + }, + { + "epoch": 0.0712, + "grad_norm": 0.4026016211878808, + "learning_rate": 0.00019912748620320794, + "loss": 0.7614, + "step": 89 + }, + { + "epoch": 0.072, + "grad_norm": 0.4369069700224426, + "learning_rate": 0.00019909298684723904, + "loss": 0.8282, + "step": 90 + }, + { + "epoch": 0.0728, + "grad_norm": 0.46820403041406944, + "learning_rate": 0.00019905782170140238, + "loss": 0.8365, + "step": 91 + }, + { + "epoch": 0.0736, + "grad_norm": 0.4993966873868318, + "learning_rate": 0.00019902199100196697, + "loss": 0.8955, + "step": 92 + }, + { + "epoch": 0.0744, + "grad_norm": 0.4559482258642977, + "learning_rate": 0.00019898549498967343, + "loss": 0.7761, + "step": 93 + }, + { + "epoch": 0.0752, + "grad_norm": 0.44067884451502826, + "learning_rate": 0.00019894833390973266, + "loss": 0.8391, + "step": 94 + }, + { + "epoch": 0.076, + "grad_norm": 0.4669300417941205, + "learning_rate": 0.000198910508011824, + "loss": 0.8143, + "step": 95 + }, + { + "epoch": 0.0768, + "grad_norm": 0.4234634201092707, + "learning_rate": 0.00019887201755009357, + "loss": 0.7585, + "step": 96 + }, + { + "epoch": 0.0776, + "grad_norm": 0.4327595021500628, + "learning_rate": 0.00019883286278315262, + "loss": 0.8077, + "step": 97 + }, + { + "epoch": 0.0784, + "grad_norm": 0.4640301849434795, + "learning_rate": 0.0001987930439740757, + "loss": 0.8626, + "step": 98 + }, + { + "epoch": 0.0792, + "grad_norm": 0.4323764808876556, + "learning_rate": 0.00019875256139039902, + "loss": 0.7699, + "step": 99 + }, + { + "epoch": 0.08, + "grad_norm": 0.49768714480089576, + "learning_rate": 0.00019871141530411853, + "loss": 0.8951, + "step": 100 + }, + { + "epoch": 0.0808, + "grad_norm": 0.450340029550535, + "learning_rate": 0.00019866960599168826, + "loss": 0.7956, + "step": 101 + }, + { + "epoch": 0.0816, + "grad_norm": 0.4162610309184303, + "learning_rate": 0.0001986271337340182, + "loss": 0.7993, + "step": 102 + }, + { + "epoch": 0.0824, + "grad_norm": 0.4498058897727285, + "learning_rate": 0.0001985839988164726, + "loss": 0.8608, + "step": 103 + }, + { + "epoch": 0.0832, + "grad_norm": 0.4386921106812282, + "learning_rate": 0.00019854020152886814, + "loss": 0.8507, + "step": 104 + }, + { + "epoch": 0.084, + "grad_norm": 0.4816905812977084, + "learning_rate": 0.00019849574216547171, + "loss": 0.8811, + "step": 105 + }, + { + "epoch": 0.0848, + "grad_norm": 0.3785246676744159, + "learning_rate": 0.0001984506210249986, + "loss": 0.8009, + "step": 106 + }, + { + "epoch": 0.0856, + "grad_norm": 0.44762669321774, + "learning_rate": 0.00019840483841061058, + "loss": 0.8272, + "step": 107 + }, + { + "epoch": 0.0864, + "grad_norm": 0.4358359003040998, + "learning_rate": 0.00019835839462991361, + "loss": 0.7362, + "step": 108 + }, + { + "epoch": 0.0872, + "grad_norm": 0.44561339298175845, + "learning_rate": 0.00019831128999495606, + "loss": 0.7828, + "step": 109 + }, + { + "epoch": 0.088, + "grad_norm": 0.4294708664538283, + "learning_rate": 0.00019826352482222638, + "loss": 0.7471, + "step": 110 + }, + { + "epoch": 0.0888, + "grad_norm": 0.47008937042593385, + "learning_rate": 0.0001982150994326511, + "loss": 0.8299, + "step": 111 + }, + { + "epoch": 0.0896, + "grad_norm": 0.4220741908718888, + "learning_rate": 0.00019816601415159263, + "loss": 0.8113, + "step": 112 + }, + { + "epoch": 0.0904, + "grad_norm": 0.4389790650159753, + "learning_rate": 0.0001981162693088471, + "loss": 0.8382, + "step": 113 + }, + { + "epoch": 0.0912, + "grad_norm": 0.4388632276201582, + "learning_rate": 0.0001980658652386421, + "loss": 0.9138, + "step": 114 + }, + { + "epoch": 0.092, + "grad_norm": 0.45562440293018436, + "learning_rate": 0.0001980148022796345, + "loss": 0.872, + "step": 115 + }, + { + "epoch": 0.0928, + "grad_norm": 0.4102332619971319, + "learning_rate": 0.00019796308077490817, + "loss": 0.7739, + "step": 116 + }, + { + "epoch": 0.0936, + "grad_norm": 0.39953771833214924, + "learning_rate": 0.00019791070107197153, + "loss": 0.7477, + "step": 117 + }, + { + "epoch": 0.0944, + "grad_norm": 0.46681097789976966, + "learning_rate": 0.00019785766352275542, + "loss": 0.857, + "step": 118 + }, + { + "epoch": 0.0952, + "grad_norm": 0.42500801352275824, + "learning_rate": 0.0001978039684836106, + "loss": 0.8298, + "step": 119 + }, + { + "epoch": 0.096, + "grad_norm": 0.44011396356640464, + "learning_rate": 0.00019774961631530545, + "loss": 0.8948, + "step": 120 + }, + { + "epoch": 0.0968, + "grad_norm": 0.5114027857148425, + "learning_rate": 0.0001976946073830234, + "loss": 0.8689, + "step": 121 + }, + { + "epoch": 0.0976, + "grad_norm": 0.4097399009368278, + "learning_rate": 0.00019763894205636072, + "loss": 0.7868, + "step": 122 + }, + { + "epoch": 0.0984, + "grad_norm": 0.46786182756581846, + "learning_rate": 0.00019758262070932375, + "loss": 0.8498, + "step": 123 + }, + { + "epoch": 0.0992, + "grad_norm": 0.42716743980313704, + "learning_rate": 0.00019752564372032657, + "loss": 0.7826, + "step": 124 + }, + { + "epoch": 0.1, + "grad_norm": 0.4603939853125694, + "learning_rate": 0.00019746801147218842, + "loss": 0.83, + "step": 125 + }, + { + "epoch": 0.1008, + "grad_norm": 0.47898291377153074, + "learning_rate": 0.00019740972435213115, + "loss": 0.81, + "step": 126 + }, + { + "epoch": 0.1016, + "grad_norm": 0.513393236930296, + "learning_rate": 0.00019735078275177654, + "loss": 0.7822, + "step": 127 + }, + { + "epoch": 0.1024, + "grad_norm": 0.45006012040918797, + "learning_rate": 0.00019729118706714375, + "loss": 0.7338, + "step": 128 + }, + { + "epoch": 0.1032, + "grad_norm": 0.4557955954957533, + "learning_rate": 0.00019723093769864663, + "loss": 0.7859, + "step": 129 + }, + { + "epoch": 0.104, + "grad_norm": 0.45112089670358757, + "learning_rate": 0.00019717003505109095, + "loss": 0.8281, + "step": 130 + }, + { + "epoch": 0.1048, + "grad_norm": 0.4099562601936963, + "learning_rate": 0.0001971084795336719, + "loss": 0.7823, + "step": 131 + }, + { + "epoch": 0.1056, + "grad_norm": 0.4363924382231477, + "learning_rate": 0.00019704627155997108, + "loss": 0.7608, + "step": 132 + }, + { + "epoch": 0.1064, + "grad_norm": 0.4716649317292637, + "learning_rate": 0.00019698341154795389, + "loss": 0.8259, + "step": 133 + }, + { + "epoch": 0.1072, + "grad_norm": 0.46078755661937293, + "learning_rate": 0.00019691989991996663, + "loss": 0.8555, + "step": 134 + }, + { + "epoch": 0.108, + "grad_norm": 0.4543933387592577, + "learning_rate": 0.00019685573710273376, + "loss": 0.7473, + "step": 135 + }, + { + "epoch": 0.1088, + "grad_norm": 0.41092088159032913, + "learning_rate": 0.0001967909235273549, + "loss": 0.7499, + "step": 136 + }, + { + "epoch": 0.1096, + "grad_norm": 0.5250637195337757, + "learning_rate": 0.00019672545962930215, + "loss": 0.8652, + "step": 137 + }, + { + "epoch": 0.1104, + "grad_norm": 0.42335216858701397, + "learning_rate": 0.00019665934584841682, + "loss": 0.8126, + "step": 138 + }, + { + "epoch": 0.1112, + "grad_norm": 0.4106919019198593, + "learning_rate": 0.00019659258262890683, + "loss": 0.7839, + "step": 139 + }, + { + "epoch": 0.112, + "grad_norm": 0.4505309788164157, + "learning_rate": 0.00019652517041934356, + "loss": 0.8319, + "step": 140 + }, + { + "epoch": 0.1128, + "grad_norm": 0.47701267935701247, + "learning_rate": 0.00019645710967265882, + "loss": 0.8866, + "step": 141 + }, + { + "epoch": 0.1136, + "grad_norm": 0.4602105353793328, + "learning_rate": 0.00019638840084614182, + "loss": 0.9068, + "step": 142 + }, + { + "epoch": 0.1144, + "grad_norm": 0.4211006037566543, + "learning_rate": 0.00019631904440143612, + "loss": 0.7682, + "step": 143 + }, + { + "epoch": 0.1152, + "grad_norm": 0.4729031861679495, + "learning_rate": 0.00019624904080453655, + "loss": 0.8815, + "step": 144 + }, + { + "epoch": 0.116, + "grad_norm": 0.5157834892568381, + "learning_rate": 0.00019617839052578603, + "loss": 0.7583, + "step": 145 + }, + { + "epoch": 0.1168, + "grad_norm": 0.389917286301472, + "learning_rate": 0.00019610709403987246, + "loss": 0.7356, + "step": 146 + }, + { + "epoch": 0.1176, + "grad_norm": 0.8020131752444241, + "learning_rate": 0.0001960351518258255, + "loss": 0.8031, + "step": 147 + }, + { + "epoch": 0.1184, + "grad_norm": 0.41886490199603604, + "learning_rate": 0.00019596256436701324, + "loss": 0.7928, + "step": 148 + }, + { + "epoch": 0.1192, + "grad_norm": 0.4562959619157432, + "learning_rate": 0.00019588933215113926, + "loss": 0.7921, + "step": 149 + }, + { + "epoch": 0.12, + "grad_norm": 0.43888692499935916, + "learning_rate": 0.000195815455670239, + "loss": 0.8582, + "step": 150 + }, + { + "epoch": 0.1208, + "grad_norm": 0.46833199056820335, + "learning_rate": 0.00019574093542067673, + "loss": 0.7979, + "step": 151 + }, + { + "epoch": 0.1216, + "grad_norm": 0.44478689994491294, + "learning_rate": 0.00019566577190314197, + "loss": 0.8095, + "step": 152 + }, + { + "epoch": 0.1224, + "grad_norm": 0.44454986358483156, + "learning_rate": 0.0001955899656226464, + "loss": 0.8201, + "step": 153 + }, + { + "epoch": 0.1232, + "grad_norm": 0.40740469459807094, + "learning_rate": 0.0001955135170885202, + "loss": 0.7693, + "step": 154 + }, + { + "epoch": 0.124, + "grad_norm": 0.47108802996625854, + "learning_rate": 0.0001954364268144088, + "loss": 0.8201, + "step": 155 + }, + { + "epoch": 0.1248, + "grad_norm": 0.3936299625831791, + "learning_rate": 0.00019535869531826937, + "loss": 0.7713, + "step": 156 + }, + { + "epoch": 0.1256, + "grad_norm": 0.4697125406176289, + "learning_rate": 0.00019528032312236736, + "loss": 0.843, + "step": 157 + }, + { + "epoch": 0.1264, + "grad_norm": 0.40928772702393124, + "learning_rate": 0.00019520131075327298, + "loss": 0.796, + "step": 158 + }, + { + "epoch": 0.1272, + "grad_norm": 0.3734963193919449, + "learning_rate": 0.00019512165874185767, + "loss": 0.8126, + "step": 159 + }, + { + "epoch": 0.128, + "grad_norm": 0.40137246222831985, + "learning_rate": 0.00019504136762329047, + "loss": 0.7547, + "step": 160 + }, + { + "epoch": 0.1288, + "grad_norm": 0.42022270376452675, + "learning_rate": 0.0001949604379370345, + "loss": 0.7602, + "step": 161 + }, + { + "epoch": 0.1296, + "grad_norm": 0.4681716232178868, + "learning_rate": 0.00019487887022684336, + "loss": 0.8285, + "step": 162 + }, + { + "epoch": 0.1304, + "grad_norm": 0.4652249797090735, + "learning_rate": 0.00019479666504075736, + "loss": 0.8439, + "step": 163 + }, + { + "epoch": 0.1312, + "grad_norm": 0.41682731645078525, + "learning_rate": 0.00019471382293110003, + "loss": 0.7696, + "step": 164 + }, + { + "epoch": 0.132, + "grad_norm": 0.44600530812927036, + "learning_rate": 0.0001946303444544741, + "loss": 0.8058, + "step": 165 + }, + { + "epoch": 0.1328, + "grad_norm": 0.39969615388564694, + "learning_rate": 0.00019454623017175812, + "loss": 0.764, + "step": 166 + }, + { + "epoch": 0.1336, + "grad_norm": 0.39813415475940356, + "learning_rate": 0.00019446148064810242, + "loss": 0.7702, + "step": 167 + }, + { + "epoch": 0.1344, + "grad_norm": 0.4760556291734454, + "learning_rate": 0.00019437609645292546, + "loss": 0.8239, + "step": 168 + }, + { + "epoch": 0.1352, + "grad_norm": 0.44706755771174195, + "learning_rate": 0.00019429007815990993, + "loss": 0.8155, + "step": 169 + }, + { + "epoch": 0.136, + "grad_norm": 0.4502000486187465, + "learning_rate": 0.0001942034263469989, + "loss": 0.8252, + "step": 170 + }, + { + "epoch": 0.1368, + "grad_norm": 0.44058029653007835, + "learning_rate": 0.00019411614159639204, + "loss": 0.7742, + "step": 171 + }, + { + "epoch": 0.1376, + "grad_norm": 0.4596690566770355, + "learning_rate": 0.00019402822449454153, + "loss": 0.7982, + "step": 172 + }, + { + "epoch": 0.1384, + "grad_norm": 0.4556227036052518, + "learning_rate": 0.00019393967563214833, + "loss": 0.7844, + "step": 173 + }, + { + "epoch": 0.1392, + "grad_norm": 0.40663953391441715, + "learning_rate": 0.00019385049560415794, + "loss": 0.7605, + "step": 174 + }, + { + "epoch": 0.14, + "grad_norm": 0.4823398406663074, + "learning_rate": 0.00019376068500975667, + "loss": 0.8604, + "step": 175 + }, + { + "epoch": 0.1408, + "grad_norm": 0.4434434432179949, + "learning_rate": 0.00019367024445236754, + "loss": 0.8015, + "step": 176 + }, + { + "epoch": 0.1416, + "grad_norm": 0.4493958616535301, + "learning_rate": 0.000193579174539646, + "loss": 0.85, + "step": 177 + }, + { + "epoch": 0.1424, + "grad_norm": 0.4803966578590346, + "learning_rate": 0.00019348747588347637, + "loss": 0.8563, + "step": 178 + }, + { + "epoch": 0.1432, + "grad_norm": 0.43226349981508544, + "learning_rate": 0.00019339514909996706, + "loss": 0.8131, + "step": 179 + }, + { + "epoch": 0.144, + "grad_norm": 0.47305039724417586, + "learning_rate": 0.00019330219480944694, + "loss": 0.8738, + "step": 180 + }, + { + "epoch": 0.1448, + "grad_norm": 0.43002078414180595, + "learning_rate": 0.00019320861363646095, + "loss": 0.7965, + "step": 181 + }, + { + "epoch": 0.1456, + "grad_norm": 0.4205158721935768, + "learning_rate": 0.00019311440620976597, + "loss": 0.7707, + "step": 182 + }, + { + "epoch": 0.1464, + "grad_norm": 0.410085310592379, + "learning_rate": 0.00019301957316232658, + "loss": 0.7176, + "step": 183 + }, + { + "epoch": 0.1472, + "grad_norm": 0.37748157807556615, + "learning_rate": 0.0001929241151313108, + "loss": 0.761, + "step": 184 + }, + { + "epoch": 0.148, + "grad_norm": 0.42171736108851915, + "learning_rate": 0.0001928280327580858, + "loss": 0.8526, + "step": 185 + }, + { + "epoch": 0.1488, + "grad_norm": 0.42954664943054666, + "learning_rate": 0.00019273132668821364, + "loss": 0.8467, + "step": 186 + }, + { + "epoch": 0.1496, + "grad_norm": 0.42428764414649567, + "learning_rate": 0.00019263399757144683, + "loss": 0.8196, + "step": 187 + }, + { + "epoch": 0.1504, + "grad_norm": 0.47698076811249246, + "learning_rate": 0.00019253604606172417, + "loss": 0.8608, + "step": 188 + }, + { + "epoch": 0.1512, + "grad_norm": 0.4640375687751282, + "learning_rate": 0.000192437472817166, + "loss": 0.8035, + "step": 189 + }, + { + "epoch": 0.152, + "grad_norm": 0.4463228546319291, + "learning_rate": 0.00019233827850007027, + "loss": 0.7651, + "step": 190 + }, + { + "epoch": 0.1528, + "grad_norm": 0.4896046005460382, + "learning_rate": 0.00019223846377690754, + "loss": 0.7356, + "step": 191 + }, + { + "epoch": 0.1536, + "grad_norm": 0.46765869984639236, + "learning_rate": 0.00019213802931831696, + "loss": 0.8544, + "step": 192 + }, + { + "epoch": 0.1544, + "grad_norm": 0.4764711702893882, + "learning_rate": 0.00019203697579910154, + "loss": 0.8524, + "step": 193 + }, + { + "epoch": 0.1552, + "grad_norm": 0.42707169579189624, + "learning_rate": 0.00019193530389822363, + "loss": 0.7988, + "step": 194 + }, + { + "epoch": 0.156, + "grad_norm": 0.4212467340691912, + "learning_rate": 0.00019183301429880043, + "loss": 0.7765, + "step": 195 + }, + { + "epoch": 0.1568, + "grad_norm": 0.45757237942238316, + "learning_rate": 0.00019173010768809933, + "loss": 0.8578, + "step": 196 + }, + { + "epoch": 0.1576, + "grad_norm": 0.4094434644819681, + "learning_rate": 0.00019162658475753327, + "loss": 0.7893, + "step": 197 + }, + { + "epoch": 0.1584, + "grad_norm": 0.40223255624486415, + "learning_rate": 0.0001915224462026563, + "loss": 0.7402, + "step": 198 + }, + { + "epoch": 0.1592, + "grad_norm": 0.4622108299481829, + "learning_rate": 0.00019141769272315858, + "loss": 0.7591, + "step": 199 + }, + { + "epoch": 0.16, + "grad_norm": 0.3964607159201426, + "learning_rate": 0.00019131232502286188, + "loss": 0.707, + "step": 200 + }, + { + "epoch": 0.1608, + "grad_norm": 0.4270168927998998, + "learning_rate": 0.00019120634380971496, + "loss": 0.8036, + "step": 201 + }, + { + "epoch": 0.1616, + "grad_norm": 0.3920550741841073, + "learning_rate": 0.0001910997497957885, + "loss": 0.7356, + "step": 202 + }, + { + "epoch": 0.1624, + "grad_norm": 0.452142428420456, + "learning_rate": 0.0001909925436972706, + "loss": 0.7425, + "step": 203 + }, + { + "epoch": 0.1632, + "grad_norm": 0.44341855235588906, + "learning_rate": 0.00019088472623446183, + "loss": 0.8787, + "step": 204 + }, + { + "epoch": 0.164, + "grad_norm": 0.4103030809347348, + "learning_rate": 0.00019077629813177036, + "loss": 0.7727, + "step": 205 + }, + { + "epoch": 0.1648, + "grad_norm": 0.40090861914740916, + "learning_rate": 0.00019066726011770726, + "loss": 0.7651, + "step": 206 + }, + { + "epoch": 0.1656, + "grad_norm": 0.3951992588577195, + "learning_rate": 0.00019055761292488142, + "loss": 0.7535, + "step": 207 + }, + { + "epoch": 0.1664, + "grad_norm": 0.5164300583980236, + "learning_rate": 0.0001904473572899947, + "loss": 0.8772, + "step": 208 + }, + { + "epoch": 0.1672, + "grad_norm": 0.40493635384459464, + "learning_rate": 0.00019033649395383702, + "loss": 0.7572, + "step": 209 + }, + { + "epoch": 0.168, + "grad_norm": 0.4996389939569627, + "learning_rate": 0.00019022502366128135, + "loss": 0.7533, + "step": 210 + }, + { + "epoch": 0.1688, + "grad_norm": 0.41091072724636457, + "learning_rate": 0.00019011294716127867, + "loss": 0.75, + "step": 211 + }, + { + "epoch": 0.1696, + "grad_norm": 0.42158327729701495, + "learning_rate": 0.00019000026520685302, + "loss": 0.7546, + "step": 212 + }, + { + "epoch": 0.1704, + "grad_norm": 0.4631383500453912, + "learning_rate": 0.0001898869785550963, + "loss": 0.8005, + "step": 213 + }, + { + "epoch": 0.1712, + "grad_norm": 0.39620595629829286, + "learning_rate": 0.0001897730879671634, + "loss": 0.7487, + "step": 214 + }, + { + "epoch": 0.172, + "grad_norm": 0.42423850878663344, + "learning_rate": 0.00018965859420826684, + "loss": 0.8103, + "step": 215 + }, + { + "epoch": 0.1728, + "grad_norm": 0.4262639407196757, + "learning_rate": 0.00018954349804767184, + "loss": 0.6982, + "step": 216 + }, + { + "epoch": 0.1736, + "grad_norm": 0.45313481608455014, + "learning_rate": 0.00018942780025869098, + "loss": 0.8694, + "step": 217 + }, + { + "epoch": 0.1744, + "grad_norm": 0.4213851398375332, + "learning_rate": 0.00018931150161867916, + "loss": 0.7663, + "step": 218 + }, + { + "epoch": 0.1752, + "grad_norm": 0.4796064199007877, + "learning_rate": 0.00018919460290902826, + "loss": 0.8103, + "step": 219 + }, + { + "epoch": 0.176, + "grad_norm": 0.40359670606514453, + "learning_rate": 0.00018907710491516199, + "loss": 0.7737, + "step": 220 + }, + { + "epoch": 0.1768, + "grad_norm": 0.42517228222089504, + "learning_rate": 0.0001889590084265304, + "loss": 0.8458, + "step": 221 + }, + { + "epoch": 0.1776, + "grad_norm": 0.39853364630036775, + "learning_rate": 0.0001888403142366049, + "loss": 0.8487, + "step": 222 + }, + { + "epoch": 0.1784, + "grad_norm": 0.42701320403105236, + "learning_rate": 0.0001887210231428727, + "loss": 0.7836, + "step": 223 + }, + { + "epoch": 0.1792, + "grad_norm": 0.4751710203159611, + "learning_rate": 0.00018860113594683148, + "loss": 0.8199, + "step": 224 + }, + { + "epoch": 0.18, + "grad_norm": 0.4177959997718054, + "learning_rate": 0.0001884806534539841, + "loss": 0.7856, + "step": 225 + }, + { + "epoch": 0.1808, + "grad_norm": 0.47126589535950586, + "learning_rate": 0.00018835957647383303, + "loss": 0.8601, + "step": 226 + }, + { + "epoch": 0.1816, + "grad_norm": 0.40956026611263685, + "learning_rate": 0.0001882379058198751, + "loss": 0.7722, + "step": 227 + }, + { + "epoch": 0.1824, + "grad_norm": 0.3786410889782343, + "learning_rate": 0.00018811564230959588, + "loss": 0.7358, + "step": 228 + }, + { + "epoch": 0.1832, + "grad_norm": 0.42441607724963093, + "learning_rate": 0.00018799278676446423, + "loss": 0.8128, + "step": 229 + }, + { + "epoch": 0.184, + "grad_norm": 0.3931184022834064, + "learning_rate": 0.00018786934000992688, + "loss": 0.7754, + "step": 230 + }, + { + "epoch": 0.1848, + "grad_norm": 0.4676541594064247, + "learning_rate": 0.00018774530287540278, + "loss": 0.8533, + "step": 231 + }, + { + "epoch": 0.1856, + "grad_norm": 0.41772326185138137, + "learning_rate": 0.00018762067619427746, + "loss": 0.7974, + "step": 232 + }, + { + "epoch": 0.1864, + "grad_norm": 0.42568144869884195, + "learning_rate": 0.00018749546080389757, + "loss": 0.7934, + "step": 233 + }, + { + "epoch": 0.1872, + "grad_norm": 0.41667139076366067, + "learning_rate": 0.00018736965754556528, + "loss": 0.7462, + "step": 234 + }, + { + "epoch": 0.188, + "grad_norm": 0.3894407998600845, + "learning_rate": 0.00018724326726453244, + "loss": 0.7018, + "step": 235 + }, + { + "epoch": 0.1888, + "grad_norm": 0.3968628191885499, + "learning_rate": 0.00018711629080999504, + "loss": 0.7229, + "step": 236 + }, + { + "epoch": 0.1896, + "grad_norm": 0.4311578529579322, + "learning_rate": 0.00018698872903508755, + "loss": 0.7536, + "step": 237 + }, + { + "epoch": 0.1904, + "grad_norm": 0.4454357510677708, + "learning_rate": 0.00018686058279687698, + "loss": 0.7686, + "step": 238 + }, + { + "epoch": 0.1912, + "grad_norm": 0.4397625012035176, + "learning_rate": 0.0001867318529563574, + "loss": 0.8069, + "step": 239 + }, + { + "epoch": 0.192, + "grad_norm": 0.4175749180024889, + "learning_rate": 0.00018660254037844388, + "loss": 0.7943, + "step": 240 + }, + { + "epoch": 0.1928, + "grad_norm": 0.48026801501164645, + "learning_rate": 0.00018647264593196688, + "loss": 0.8252, + "step": 241 + }, + { + "epoch": 0.1936, + "grad_norm": 0.43837906517453124, + "learning_rate": 0.00018634217048966637, + "loss": 0.8318, + "step": 242 + }, + { + "epoch": 0.1944, + "grad_norm": 0.40552130965049665, + "learning_rate": 0.00018621111492818585, + "loss": 0.7579, + "step": 243 + }, + { + "epoch": 0.1952, + "grad_norm": 0.41965573044241067, + "learning_rate": 0.0001860794801280666, + "loss": 0.7377, + "step": 244 + }, + { + "epoch": 0.196, + "grad_norm": 0.41379905946624335, + "learning_rate": 0.00018594726697374175, + "loss": 0.7916, + "step": 245 + }, + { + "epoch": 0.1968, + "grad_norm": 0.39928721320339905, + "learning_rate": 0.0001858144763535302, + "loss": 0.7484, + "step": 246 + }, + { + "epoch": 0.1976, + "grad_norm": 0.39695857145017316, + "learning_rate": 0.0001856811091596308, + "loss": 0.8062, + "step": 247 + }, + { + "epoch": 0.1984, + "grad_norm": 0.4257439687221929, + "learning_rate": 0.0001855471662881164, + "loss": 0.8251, + "step": 248 + }, + { + "epoch": 0.1992, + "grad_norm": 0.3935361187026131, + "learning_rate": 0.00018541264863892754, + "loss": 0.7724, + "step": 249 + }, + { + "epoch": 0.2, + "grad_norm": 0.37834209183020484, + "learning_rate": 0.00018527755711586678, + "loss": 0.7611, + "step": 250 + }, + { + "epoch": 0.2008, + "grad_norm": 0.40419910202390763, + "learning_rate": 0.00018514189262659235, + "loss": 0.7845, + "step": 251 + }, + { + "epoch": 0.2016, + "grad_norm": 0.4653326295828301, + "learning_rate": 0.00018500565608261214, + "loss": 0.8503, + "step": 252 + }, + { + "epoch": 0.2024, + "grad_norm": 0.43315318696524024, + "learning_rate": 0.00018486884839927768, + "loss": 0.8144, + "step": 253 + }, + { + "epoch": 0.2032, + "grad_norm": 0.4417707396554827, + "learning_rate": 0.00018473147049577774, + "loss": 0.8415, + "step": 254 + }, + { + "epoch": 0.204, + "grad_norm": 0.4248813489576955, + "learning_rate": 0.0001845935232951325, + "loss": 0.8443, + "step": 255 + }, + { + "epoch": 0.2048, + "grad_norm": 0.4033508599201362, + "learning_rate": 0.00018445500772418697, + "loss": 0.7995, + "step": 256 + }, + { + "epoch": 0.2056, + "grad_norm": 0.4317865167024778, + "learning_rate": 0.00018431592471360503, + "loss": 0.7791, + "step": 257 + }, + { + "epoch": 0.2064, + "grad_norm": 0.40264945344040487, + "learning_rate": 0.00018417627519786315, + "loss": 0.8031, + "step": 258 + }, + { + "epoch": 0.2072, + "grad_norm": 0.35295010248954845, + "learning_rate": 0.000184036060115244, + "loss": 0.7309, + "step": 259 + }, + { + "epoch": 0.208, + "grad_norm": 0.4605989535515705, + "learning_rate": 0.00018389528040783012, + "loss": 0.865, + "step": 260 + }, + { + "epoch": 0.2088, + "grad_norm": 0.41132925305495854, + "learning_rate": 0.00018375393702149787, + "loss": 0.7421, + "step": 261 + }, + { + "epoch": 0.2096, + "grad_norm": 0.4567431675996166, + "learning_rate": 0.00018361203090591071, + "loss": 0.8196, + "step": 262 + }, + { + "epoch": 0.2104, + "grad_norm": 0.4544080429648433, + "learning_rate": 0.00018346956301451304, + "loss": 0.8747, + "step": 263 + }, + { + "epoch": 0.2112, + "grad_norm": 0.43602304381113133, + "learning_rate": 0.00018332653430452376, + "loss": 0.8268, + "step": 264 + }, + { + "epoch": 0.212, + "grad_norm": 0.3680958867209954, + "learning_rate": 0.00018318294573692985, + "loss": 0.7274, + "step": 265 + }, + { + "epoch": 0.2128, + "grad_norm": 0.41981614126886263, + "learning_rate": 0.00018303879827647975, + "loss": 0.7739, + "step": 266 + }, + { + "epoch": 0.2136, + "grad_norm": 0.45534292093001205, + "learning_rate": 0.0001828940928916772, + "loss": 0.843, + "step": 267 + }, + { + "epoch": 0.2144, + "grad_norm": 0.4176930406124194, + "learning_rate": 0.00018274883055477436, + "loss": 0.76, + "step": 268 + }, + { + "epoch": 0.2152, + "grad_norm": 0.4250593407694702, + "learning_rate": 0.00018260301224176558, + "loss": 0.8059, + "step": 269 + }, + { + "epoch": 0.216, + "grad_norm": 0.385015379342669, + "learning_rate": 0.00018245663893238075, + "loss": 0.7183, + "step": 270 + }, + { + "epoch": 0.2168, + "grad_norm": 0.4049753323831426, + "learning_rate": 0.00018230971161007853, + "loss": 0.7731, + "step": 271 + }, + { + "epoch": 0.2176, + "grad_norm": 0.4291693562623502, + "learning_rate": 0.00018216223126204007, + "loss": 0.8287, + "step": 272 + }, + { + "epoch": 0.2184, + "grad_norm": 0.42908607036423363, + "learning_rate": 0.00018201419887916214, + "loss": 0.804, + "step": 273 + }, + { + "epoch": 0.2192, + "grad_norm": 0.41143273147413484, + "learning_rate": 0.00018186561545605054, + "loss": 0.7958, + "step": 274 + }, + { + "epoch": 0.22, + "grad_norm": 0.4219014010666351, + "learning_rate": 0.00018171648199101346, + "loss": 0.7564, + "step": 275 + }, + { + "epoch": 0.2208, + "grad_norm": 0.40528270916888237, + "learning_rate": 0.00018156679948605467, + "loss": 0.8199, + "step": 276 + }, + { + "epoch": 0.2216, + "grad_norm": 0.44757517887666887, + "learning_rate": 0.00018141656894686689, + "loss": 0.7724, + "step": 277 + }, + { + "epoch": 0.2224, + "grad_norm": 0.42027873904868446, + "learning_rate": 0.00018126579138282503, + "loss": 0.8265, + "step": 278 + }, + { + "epoch": 0.2232, + "grad_norm": 0.4148304854268926, + "learning_rate": 0.00018111446780697929, + "loss": 0.8756, + "step": 279 + }, + { + "epoch": 0.224, + "grad_norm": 0.38377520871651966, + "learning_rate": 0.0001809625992360485, + "loss": 0.7468, + "step": 280 + }, + { + "epoch": 0.2248, + "grad_norm": 0.4264543015588282, + "learning_rate": 0.00018081018669041324, + "loss": 0.8039, + "step": 281 + }, + { + "epoch": 0.2256, + "grad_norm": 0.4136300149086464, + "learning_rate": 0.00018065723119410884, + "loss": 0.8031, + "step": 282 + }, + { + "epoch": 0.2264, + "grad_norm": 0.36648543430232305, + "learning_rate": 0.00018050373377481878, + "loss": 0.6646, + "step": 283 + }, + { + "epoch": 0.2272, + "grad_norm": 0.4522172811815767, + "learning_rate": 0.00018034969546386757, + "loss": 0.8701, + "step": 284 + }, + { + "epoch": 0.228, + "grad_norm": 0.43740644922232075, + "learning_rate": 0.0001801951172962139, + "loss": 0.8038, + "step": 285 + }, + { + "epoch": 0.2288, + "grad_norm": 0.4174595987164087, + "learning_rate": 0.0001800400003104436, + "loss": 0.7608, + "step": 286 + }, + { + "epoch": 0.2296, + "grad_norm": 0.393744945703445, + "learning_rate": 0.0001798843455487629, + "loss": 0.7008, + "step": 287 + }, + { + "epoch": 0.2304, + "grad_norm": 0.4894690003563711, + "learning_rate": 0.00017972815405699103, + "loss": 0.8372, + "step": 288 + }, + { + "epoch": 0.2312, + "grad_norm": 0.4217244008041063, + "learning_rate": 0.00017957142688455362, + "loss": 0.823, + "step": 289 + }, + { + "epoch": 0.232, + "grad_norm": 0.40260737765643384, + "learning_rate": 0.00017941416508447536, + "loss": 0.76, + "step": 290 + }, + { + "epoch": 0.2328, + "grad_norm": 0.40243360935749045, + "learning_rate": 0.00017925636971337304, + "loss": 0.801, + "step": 291 + }, + { + "epoch": 0.2336, + "grad_norm": 0.4348442831307786, + "learning_rate": 0.0001790980418314484, + "loss": 0.8471, + "step": 292 + }, + { + "epoch": 0.2344, + "grad_norm": 0.43757909427752245, + "learning_rate": 0.00017893918250248104, + "loss": 0.8046, + "step": 293 + }, + { + "epoch": 0.2352, + "grad_norm": 0.4280554526661448, + "learning_rate": 0.00017877979279382135, + "loss": 0.7937, + "step": 294 + }, + { + "epoch": 0.236, + "grad_norm": 0.3803413146307526, + "learning_rate": 0.00017861987377638312, + "loss": 0.7347, + "step": 295 + }, + { + "epoch": 0.2368, + "grad_norm": 0.4256064404247689, + "learning_rate": 0.0001784594265246366, + "loss": 0.7465, + "step": 296 + }, + { + "epoch": 0.2376, + "grad_norm": 0.3954708369493568, + "learning_rate": 0.0001782984521166011, + "loss": 0.7141, + "step": 297 + }, + { + "epoch": 0.2384, + "grad_norm": 0.34956731911658284, + "learning_rate": 0.0001781369516338378, + "loss": 0.6944, + "step": 298 + }, + { + "epoch": 0.2392, + "grad_norm": 0.4281109069360621, + "learning_rate": 0.00017797492616144256, + "loss": 0.7542, + "step": 299 + }, + { + "epoch": 0.24, + "grad_norm": 0.38735441498537176, + "learning_rate": 0.00017781237678803847, + "loss": 0.6906, + "step": 300 + }, + { + "epoch": 0.2408, + "grad_norm": 0.4804515522714117, + "learning_rate": 0.00017764930460576866, + "loss": 0.8035, + "step": 301 + }, + { + "epoch": 0.2416, + "grad_norm": 0.438669543093096, + "learning_rate": 0.000177485710710289, + "loss": 0.8029, + "step": 302 + }, + { + "epoch": 0.2424, + "grad_norm": 0.42012676468526255, + "learning_rate": 0.00017732159620076053, + "loss": 0.8088, + "step": 303 + }, + { + "epoch": 0.2432, + "grad_norm": 0.5101640884332544, + "learning_rate": 0.00017715696217984235, + "loss": 0.8541, + "step": 304 + }, + { + "epoch": 0.244, + "grad_norm": 0.3703113726642919, + "learning_rate": 0.00017699180975368396, + "loss": 0.7049, + "step": 305 + }, + { + "epoch": 0.2448, + "grad_norm": 0.46783269643660297, + "learning_rate": 0.00017682614003191807, + "loss": 0.8282, + "step": 306 + }, + { + "epoch": 0.2456, + "grad_norm": 0.38274836146236746, + "learning_rate": 0.00017665995412765285, + "loss": 0.7117, + "step": 307 + }, + { + "epoch": 0.2464, + "grad_norm": 0.44467611676383684, + "learning_rate": 0.00017649325315746478, + "loss": 0.7633, + "step": 308 + }, + { + "epoch": 0.2472, + "grad_norm": 0.39640522112939164, + "learning_rate": 0.00017632603824139085, + "loss": 0.7628, + "step": 309 + }, + { + "epoch": 0.248, + "grad_norm": 0.4608950588902465, + "learning_rate": 0.0001761583105029213, + "loss": 0.7968, + "step": 310 + }, + { + "epoch": 0.2488, + "grad_norm": 0.40004221944472373, + "learning_rate": 0.0001759900710689918, + "loss": 0.7359, + "step": 311 + }, + { + "epoch": 0.2496, + "grad_norm": 0.4342650667250139, + "learning_rate": 0.00017582132106997616, + "loss": 0.8056, + "step": 312 + }, + { + "epoch": 0.2504, + "grad_norm": 0.40171348460939127, + "learning_rate": 0.00017565206163967846, + "loss": 0.7035, + "step": 313 + }, + { + "epoch": 0.2512, + "grad_norm": 0.36209279422253543, + "learning_rate": 0.00017548229391532572, + "loss": 0.6508, + "step": 314 + }, + { + "epoch": 0.252, + "grad_norm": 0.4034756327472997, + "learning_rate": 0.00017531201903755994, + "loss": 0.7368, + "step": 315 + }, + { + "epoch": 0.2528, + "grad_norm": 0.39249218081928117, + "learning_rate": 0.00017514123815043074, + "loss": 0.7819, + "step": 316 + }, + { + "epoch": 0.2536, + "grad_norm": 0.42471256142276875, + "learning_rate": 0.00017496995240138744, + "loss": 0.7308, + "step": 317 + }, + { + "epoch": 0.2544, + "grad_norm": 0.4180759336676178, + "learning_rate": 0.00017479816294127152, + "loss": 0.7141, + "step": 318 + }, + { + "epoch": 0.2552, + "grad_norm": 0.46855118885752606, + "learning_rate": 0.00017462587092430875, + "loss": 0.7186, + "step": 319 + }, + { + "epoch": 0.256, + "grad_norm": 0.4337432879128543, + "learning_rate": 0.0001744530775081015, + "loss": 0.7704, + "step": 320 + }, + { + "epoch": 0.2568, + "grad_norm": 0.4439354117273041, + "learning_rate": 0.00017427978385362112, + "loss": 0.832, + "step": 321 + }, + { + "epoch": 0.2576, + "grad_norm": 0.4074274598145603, + "learning_rate": 0.0001741059911251997, + "loss": 0.7478, + "step": 322 + }, + { + "epoch": 0.2584, + "grad_norm": 0.38959870273386044, + "learning_rate": 0.0001739317004905227, + "loss": 0.6937, + "step": 323 + }, + { + "epoch": 0.2592, + "grad_norm": 0.4582004248722656, + "learning_rate": 0.000173756913120621, + "loss": 0.8008, + "step": 324 + }, + { + "epoch": 0.26, + "grad_norm": 0.3956315002460586, + "learning_rate": 0.00017358163018986282, + "loss": 0.7315, + "step": 325 + }, + { + "epoch": 0.2608, + "grad_norm": 0.4262268133448119, + "learning_rate": 0.00017340585287594604, + "loss": 0.819, + "step": 326 + }, + { + "epoch": 0.2616, + "grad_norm": 0.391879323864864, + "learning_rate": 0.00017322958235989016, + "loss": 0.7493, + "step": 327 + }, + { + "epoch": 0.2624, + "grad_norm": 0.4274038802228928, + "learning_rate": 0.0001730528198260285, + "loss": 0.7617, + "step": 328 + }, + { + "epoch": 0.2632, + "grad_norm": 0.4290256337161772, + "learning_rate": 0.00017287556646200018, + "loss": 0.7613, + "step": 329 + }, + { + "epoch": 0.264, + "grad_norm": 0.37952638899241375, + "learning_rate": 0.00017269782345874203, + "loss": 0.6778, + "step": 330 + }, + { + "epoch": 0.2648, + "grad_norm": 0.3826196285241614, + "learning_rate": 0.00017251959201048083, + "loss": 0.6832, + "step": 331 + }, + { + "epoch": 0.2656, + "grad_norm": 0.4050950810614008, + "learning_rate": 0.00017234087331472497, + "loss": 0.7675, + "step": 332 + }, + { + "epoch": 0.2664, + "grad_norm": 0.37958456133326496, + "learning_rate": 0.00017216166857225674, + "loss": 0.6923, + "step": 333 + }, + { + "epoch": 0.2672, + "grad_norm": 0.48525278577734343, + "learning_rate": 0.00017198197898712404, + "loss": 0.8732, + "step": 334 + }, + { + "epoch": 0.268, + "grad_norm": 0.4350053478588795, + "learning_rate": 0.00017180180576663228, + "loss": 0.8142, + "step": 335 + }, + { + "epoch": 0.2688, + "grad_norm": 0.44116393153626615, + "learning_rate": 0.00017162115012133643, + "loss": 0.8282, + "step": 336 + }, + { + "epoch": 0.2696, + "grad_norm": 0.4292115231398351, + "learning_rate": 0.00017144001326503273, + "loss": 0.7402, + "step": 337 + }, + { + "epoch": 0.2704, + "grad_norm": 0.40532916649912354, + "learning_rate": 0.00017125839641475072, + "loss": 0.7299, + "step": 338 + }, + { + "epoch": 0.2712, + "grad_norm": 0.41032432819557796, + "learning_rate": 0.00017107630079074478, + "loss": 0.7953, + "step": 339 + }, + { + "epoch": 0.272, + "grad_norm": 0.44426998765207043, + "learning_rate": 0.00017089372761648616, + "loss": 0.8494, + "step": 340 + }, + { + "epoch": 0.2728, + "grad_norm": 0.47851547077634976, + "learning_rate": 0.00017071067811865476, + "loss": 0.7823, + "step": 341 + }, + { + "epoch": 0.2736, + "grad_norm": 0.3798157359553566, + "learning_rate": 0.00017052715352713075, + "loss": 0.7411, + "step": 342 + }, + { + "epoch": 0.2744, + "grad_norm": 0.4116387079907209, + "learning_rate": 0.00017034315507498635, + "loss": 0.7693, + "step": 343 + }, + { + "epoch": 0.2752, + "grad_norm": 0.4364919120832264, + "learning_rate": 0.00017015868399847768, + "loss": 0.7789, + "step": 344 + }, + { + "epoch": 0.276, + "grad_norm": 0.41713361743005883, + "learning_rate": 0.00016997374153703625, + "loss": 0.7379, + "step": 345 + }, + { + "epoch": 0.2768, + "grad_norm": 0.4206448578876803, + "learning_rate": 0.00016978832893326074, + "loss": 0.7692, + "step": 346 + }, + { + "epoch": 0.2776, + "grad_norm": 0.4592632053929459, + "learning_rate": 0.00016960244743290868, + "loss": 0.82, + "step": 347 + }, + { + "epoch": 0.2784, + "grad_norm": 0.4523816657682658, + "learning_rate": 0.00016941609828488807, + "loss": 0.7709, + "step": 348 + }, + { + "epoch": 0.2792, + "grad_norm": 0.38625339507517903, + "learning_rate": 0.00016922928274124886, + "loss": 0.7234, + "step": 349 + }, + { + "epoch": 0.28, + "grad_norm": 0.38691327725973956, + "learning_rate": 0.0001690420020571747, + "loss": 0.6854, + "step": 350 + }, + { + "epoch": 0.2808, + "grad_norm": 0.5078844842218331, + "learning_rate": 0.00016885425749097444, + "loss": 0.8543, + "step": 351 + }, + { + "epoch": 0.2816, + "grad_norm": 0.4385294481594214, + "learning_rate": 0.0001686660503040737, + "loss": 0.8188, + "step": 352 + }, + { + "epoch": 0.2824, + "grad_norm": 0.3673088434620512, + "learning_rate": 0.00016847738176100632, + "loss": 0.6995, + "step": 353 + }, + { + "epoch": 0.2832, + "grad_norm": 0.3746665041857716, + "learning_rate": 0.00016828825312940592, + "loss": 0.7012, + "step": 354 + }, + { + "epoch": 0.284, + "grad_norm": 0.3784264474612318, + "learning_rate": 0.0001680986656799975, + "loss": 0.6636, + "step": 355 + }, + { + "epoch": 0.2848, + "grad_norm": 0.42920592262138246, + "learning_rate": 0.0001679086206865886, + "loss": 0.8046, + "step": 356 + }, + { + "epoch": 0.2856, + "grad_norm": 0.38639789811582664, + "learning_rate": 0.00016771811942606108, + "loss": 0.7038, + "step": 357 + }, + { + "epoch": 0.2864, + "grad_norm": 0.453939261311069, + "learning_rate": 0.00016752716317836229, + "loss": 0.8008, + "step": 358 + }, + { + "epoch": 0.2872, + "grad_norm": 0.5125707908362529, + "learning_rate": 0.00016733575322649657, + "loss": 0.833, + "step": 359 + }, + { + "epoch": 0.288, + "grad_norm": 0.4053712383468021, + "learning_rate": 0.0001671438908565167, + "loss": 0.8229, + "step": 360 + }, + { + "epoch": 0.2888, + "grad_norm": 0.4273953244431224, + "learning_rate": 0.00016695157735751513, + "loss": 0.792, + "step": 361 + }, + { + "epoch": 0.2896, + "grad_norm": 0.40758364381427453, + "learning_rate": 0.00016675881402161536, + "loss": 0.7753, + "step": 362 + }, + { + "epoch": 0.2904, + "grad_norm": 0.46744613888999914, + "learning_rate": 0.0001665656021439633, + "loss": 0.8121, + "step": 363 + }, + { + "epoch": 0.2912, + "grad_norm": 0.42243568027474676, + "learning_rate": 0.0001663719430227186, + "loss": 0.7225, + "step": 364 + }, + { + "epoch": 0.292, + "grad_norm": 0.42334249817039404, + "learning_rate": 0.00016617783795904565, + "loss": 0.7629, + "step": 365 + }, + { + "epoch": 0.2928, + "grad_norm": 0.41496346524495037, + "learning_rate": 0.00016598328825710533, + "loss": 0.6595, + "step": 366 + }, + { + "epoch": 0.2936, + "grad_norm": 0.4041449703070248, + "learning_rate": 0.00016578829522404583, + "loss": 0.76, + "step": 367 + }, + { + "epoch": 0.2944, + "grad_norm": 0.4602823503849498, + "learning_rate": 0.000165592860169994, + "loss": 0.7463, + "step": 368 + }, + { + "epoch": 0.2952, + "grad_norm": 0.45829102214665335, + "learning_rate": 0.00016539698440804661, + "loss": 0.807, + "step": 369 + }, + { + "epoch": 0.296, + "grad_norm": 0.5194444205744141, + "learning_rate": 0.00016520066925426144, + "loss": 0.8964, + "step": 370 + }, + { + "epoch": 0.2968, + "grad_norm": 0.4274362092511141, + "learning_rate": 0.0001650039160276485, + "loss": 0.7341, + "step": 371 + }, + { + "epoch": 0.2976, + "grad_norm": 0.48382529105481803, + "learning_rate": 0.0001648067260501611, + "loss": 0.8081, + "step": 372 + }, + { + "epoch": 0.2984, + "grad_norm": 0.43814380428013716, + "learning_rate": 0.0001646091006466871, + "loss": 0.7683, + "step": 373 + }, + { + "epoch": 0.2992, + "grad_norm": 0.45029893258300174, + "learning_rate": 0.0001644110411450398, + "loss": 0.796, + "step": 374 + }, + { + "epoch": 0.3, + "grad_norm": 0.4400905848459202, + "learning_rate": 0.00016421254887594917, + "loss": 0.7439, + "step": 375 + }, + { + "epoch": 0.3008, + "grad_norm": 0.46507271493682933, + "learning_rate": 0.00016401362517305296, + "loss": 0.7493, + "step": 376 + }, + { + "epoch": 0.3016, + "grad_norm": 0.470863177488771, + "learning_rate": 0.00016381427137288754, + "loss": 0.7696, + "step": 377 + }, + { + "epoch": 0.3024, + "grad_norm": 0.4232176350661182, + "learning_rate": 0.00016361448881487914, + "loss": 0.7332, + "step": 378 + }, + { + "epoch": 0.3032, + "grad_norm": 0.44400709128201865, + "learning_rate": 0.0001634142788413346, + "loss": 0.8178, + "step": 379 + }, + { + "epoch": 0.304, + "grad_norm": 0.41675859512276436, + "learning_rate": 0.00016321364279743266, + "loss": 0.7634, + "step": 380 + }, + { + "epoch": 0.3048, + "grad_norm": 0.3989561799033566, + "learning_rate": 0.00016301258203121462, + "loss": 0.7214, + "step": 381 + }, + { + "epoch": 0.3056, + "grad_norm": 0.47417378039265345, + "learning_rate": 0.0001628110978935756, + "loss": 0.8712, + "step": 382 + }, + { + "epoch": 0.3064, + "grad_norm": 0.3907988356051514, + "learning_rate": 0.00016260919173825508, + "loss": 0.7742, + "step": 383 + }, + { + "epoch": 0.3072, + "grad_norm": 0.40727263958681575, + "learning_rate": 0.00016240686492182804, + "loss": 0.7074, + "step": 384 + }, + { + "epoch": 0.308, + "grad_norm": 0.4120998531328911, + "learning_rate": 0.00016220411880369601, + "loss": 0.794, + "step": 385 + }, + { + "epoch": 0.3088, + "grad_norm": 0.3994194092341835, + "learning_rate": 0.00016200095474607753, + "loss": 0.7515, + "step": 386 + }, + { + "epoch": 0.3096, + "grad_norm": 0.4790153356094912, + "learning_rate": 0.00016179737411399926, + "loss": 0.8656, + "step": 387 + }, + { + "epoch": 0.3104, + "grad_norm": 0.40077998577507146, + "learning_rate": 0.00016159337827528685, + "loss": 0.7278, + "step": 388 + }, + { + "epoch": 0.3112, + "grad_norm": 0.38512668766662994, + "learning_rate": 0.00016138896860055555, + "loss": 0.7802, + "step": 389 + }, + { + "epoch": 0.312, + "grad_norm": 0.4011914166317225, + "learning_rate": 0.0001611841464632011, + "loss": 0.7842, + "step": 390 + }, + { + "epoch": 0.3128, + "grad_norm": 0.43022389714437426, + "learning_rate": 0.00016097891323939062, + "loss": 0.7656, + "step": 391 + }, + { + "epoch": 0.3136, + "grad_norm": 0.39144444511121446, + "learning_rate": 0.0001607732703080532, + "loss": 0.744, + "step": 392 + }, + { + "epoch": 0.3144, + "grad_norm": 0.3694560310117428, + "learning_rate": 0.00016056721905087056, + "loss": 0.7312, + "step": 393 + }, + { + "epoch": 0.3152, + "grad_norm": 0.4234341894045398, + "learning_rate": 0.00016036076085226814, + "loss": 0.8192, + "step": 394 + }, + { + "epoch": 0.316, + "grad_norm": 0.43378554475513786, + "learning_rate": 0.00016015389709940538, + "loss": 0.7725, + "step": 395 + }, + { + "epoch": 0.3168, + "grad_norm": 0.3926939490195302, + "learning_rate": 0.0001599466291821666, + "loss": 0.7287, + "step": 396 + }, + { + "epoch": 0.3176, + "grad_norm": 0.4353029147339415, + "learning_rate": 0.0001597389584931517, + "loss": 0.7806, + "step": 397 + }, + { + "epoch": 0.3184, + "grad_norm": 0.4104470890785654, + "learning_rate": 0.0001595308864276666, + "loss": 0.7792, + "step": 398 + }, + { + "epoch": 0.3192, + "grad_norm": 0.4276700358980658, + "learning_rate": 0.0001593224143837142, + "loss": 0.7555, + "step": 399 + }, + { + "epoch": 0.32, + "grad_norm": 0.4938143578839374, + "learning_rate": 0.0001591135437619847, + "loss": 0.8334, + "step": 400 + }, + { + "epoch": 0.3208, + "grad_norm": 0.42818147606374335, + "learning_rate": 0.00015890427596584617, + "loss": 0.8428, + "step": 401 + }, + { + "epoch": 0.3216, + "grad_norm": 0.41406947240352393, + "learning_rate": 0.0001586946124013354, + "loss": 0.796, + "step": 402 + }, + { + "epoch": 0.3224, + "grad_norm": 0.44281639260862904, + "learning_rate": 0.00015848455447714822, + "loss": 0.7398, + "step": 403 + }, + { + "epoch": 0.3232, + "grad_norm": 0.4158875186808861, + "learning_rate": 0.0001582741036046301, + "loss": 0.7935, + "step": 404 + }, + { + "epoch": 0.324, + "grad_norm": 0.518693050154203, + "learning_rate": 0.00015806326119776663, + "loss": 0.8845, + "step": 405 + }, + { + "epoch": 0.3248, + "grad_norm": 0.43617889754344386, + "learning_rate": 0.00015785202867317407, + "loss": 0.738, + "step": 406 + }, + { + "epoch": 0.3256, + "grad_norm": 0.41393504365642786, + "learning_rate": 0.00015764040745008988, + "loss": 0.7662, + "step": 407 + }, + { + "epoch": 0.3264, + "grad_norm": 0.3916505340881447, + "learning_rate": 0.00015742839895036305, + "loss": 0.7218, + "step": 408 + }, + { + "epoch": 0.3272, + "grad_norm": 0.40893671612613586, + "learning_rate": 0.00015721600459844468, + "loss": 0.7065, + "step": 409 + }, + { + "epoch": 0.328, + "grad_norm": 0.42860148639099166, + "learning_rate": 0.00015700322582137827, + "loss": 0.7666, + "step": 410 + }, + { + "epoch": 0.3288, + "grad_norm": 0.3770999407298945, + "learning_rate": 0.00015679006404879033, + "loss": 0.7275, + "step": 411 + }, + { + "epoch": 0.3296, + "grad_norm": 0.3773561415348383, + "learning_rate": 0.0001565765207128805, + "loss": 0.725, + "step": 412 + }, + { + "epoch": 0.3304, + "grad_norm": 0.42739677259444697, + "learning_rate": 0.00015636259724841222, + "loss": 0.713, + "step": 413 + }, + { + "epoch": 0.3312, + "grad_norm": 0.3746420515089487, + "learning_rate": 0.0001561482950927029, + "loss": 0.7082, + "step": 414 + }, + { + "epoch": 0.332, + "grad_norm": 0.45154432295964797, + "learning_rate": 0.00015593361568561428, + "loss": 0.8361, + "step": 415 + }, + { + "epoch": 0.3328, + "grad_norm": 0.4154008294171299, + "learning_rate": 0.00015571856046954285, + "loss": 0.7281, + "step": 416 + }, + { + "epoch": 0.3336, + "grad_norm": 0.41672996846450994, + "learning_rate": 0.0001555031308894101, + "loss": 0.7824, + "step": 417 + }, + { + "epoch": 0.3344, + "grad_norm": 0.4050763828119448, + "learning_rate": 0.00015528732839265272, + "loss": 0.7457, + "step": 418 + }, + { + "epoch": 0.3352, + "grad_norm": 0.3578301373838656, + "learning_rate": 0.0001550711544292131, + "loss": 0.696, + "step": 419 + }, + { + "epoch": 0.336, + "grad_norm": 0.4112973121993917, + "learning_rate": 0.0001548546104515294, + "loss": 0.7459, + "step": 420 + }, + { + "epoch": 0.3368, + "grad_norm": 0.40075180843741315, + "learning_rate": 0.00015463769791452574, + "loss": 0.7239, + "step": 421 + }, + { + "epoch": 0.3376, + "grad_norm": 0.3843413424730113, + "learning_rate": 0.00015442041827560274, + "loss": 0.7797, + "step": 422 + }, + { + "epoch": 0.3384, + "grad_norm": 0.47035153771457855, + "learning_rate": 0.00015420277299462736, + "loss": 0.8651, + "step": 423 + }, + { + "epoch": 0.3392, + "grad_norm": 0.4043920327079687, + "learning_rate": 0.00015398476353392323, + "loss": 0.759, + "step": 424 + }, + { + "epoch": 0.34, + "grad_norm": 0.4099403010516249, + "learning_rate": 0.00015376639135826107, + "loss": 0.7971, + "step": 425 + }, + { + "epoch": 0.3408, + "grad_norm": 0.4209734147382475, + "learning_rate": 0.00015354765793484834, + "loss": 0.7346, + "step": 426 + }, + { + "epoch": 0.3416, + "grad_norm": 0.3637299336475154, + "learning_rate": 0.00015332856473331978, + "loss": 0.655, + "step": 427 + }, + { + "epoch": 0.3424, + "grad_norm": 0.3917734710599, + "learning_rate": 0.00015310911322572753, + "loss": 0.7769, + "step": 428 + }, + { + "epoch": 0.3432, + "grad_norm": 0.4185800922042766, + "learning_rate": 0.00015288930488653094, + "loss": 0.8091, + "step": 429 + }, + { + "epoch": 0.344, + "grad_norm": 0.4072904979262855, + "learning_rate": 0.000152669141192587, + "loss": 0.7374, + "step": 430 + }, + { + "epoch": 0.3448, + "grad_norm": 0.4051590420213116, + "learning_rate": 0.0001524486236231402, + "loss": 0.8019, + "step": 431 + }, + { + "epoch": 0.3456, + "grad_norm": 0.3456562009787237, + "learning_rate": 0.00015222775365981273, + "loss": 0.6344, + "step": 432 + }, + { + "epoch": 0.3464, + "grad_norm": 0.45860809030378363, + "learning_rate": 0.00015200653278659432, + "loss": 0.7723, + "step": 433 + }, + { + "epoch": 0.3472, + "grad_norm": 0.4233875124766856, + "learning_rate": 0.00015178496248983254, + "loss": 0.7464, + "step": 434 + }, + { + "epoch": 0.348, + "grad_norm": 0.46047697958856054, + "learning_rate": 0.00015156304425822267, + "loss": 0.786, + "step": 435 + }, + { + "epoch": 0.3488, + "grad_norm": 0.4482851013634399, + "learning_rate": 0.00015134077958279765, + "loss": 0.7635, + "step": 436 + }, + { + "epoch": 0.3496, + "grad_norm": 0.41374354113559086, + "learning_rate": 0.00015111816995691809, + "loss": 0.7731, + "step": 437 + }, + { + "epoch": 0.3504, + "grad_norm": 0.5537275439205439, + "learning_rate": 0.00015089521687626243, + "loss": 0.7183, + "step": 438 + }, + { + "epoch": 0.3512, + "grad_norm": 0.4090471845722831, + "learning_rate": 0.00015067192183881658, + "loss": 0.7399, + "step": 439 + }, + { + "epoch": 0.352, + "grad_norm": 0.4121289703575594, + "learning_rate": 0.000150448286344864, + "loss": 0.7735, + "step": 440 + }, + { + "epoch": 0.3528, + "grad_norm": 0.5004477427745413, + "learning_rate": 0.00015022431189697568, + "loss": 0.8698, + "step": 441 + }, + { + "epoch": 0.3536, + "grad_norm": 0.44573437201228233, + "learning_rate": 0.00015000000000000001, + "loss": 0.686, + "step": 442 + }, + { + "epoch": 0.3544, + "grad_norm": 0.4104756936070795, + "learning_rate": 0.0001497753521610526, + "loss": 0.7473, + "step": 443 + }, + { + "epoch": 0.3552, + "grad_norm": 0.3481804037211295, + "learning_rate": 0.00014955036988950618, + "loss": 0.6573, + "step": 444 + }, + { + "epoch": 0.356, + "grad_norm": 0.39354978860410955, + "learning_rate": 0.00014932505469698052, + "loss": 0.7049, + "step": 445 + }, + { + "epoch": 0.3568, + "grad_norm": 0.38871727928399735, + "learning_rate": 0.00014909940809733222, + "loss": 0.6936, + "step": 446 + }, + { + "epoch": 0.3576, + "grad_norm": 0.3971615200108154, + "learning_rate": 0.0001488734316066446, + "loss": 0.629, + "step": 447 + }, + { + "epoch": 0.3584, + "grad_norm": 0.4026758153103403, + "learning_rate": 0.00014864712674321734, + "loss": 0.7671, + "step": 448 + }, + { + "epoch": 0.3592, + "grad_norm": 0.42621146792279624, + "learning_rate": 0.0001484204950275565, + "loss": 0.7886, + "step": 449 + }, + { + "epoch": 0.36, + "grad_norm": 0.40719481023108833, + "learning_rate": 0.00014819353798236427, + "loss": 0.7683, + "step": 450 + }, + { + "epoch": 0.3608, + "grad_norm": 0.4505840013211492, + "learning_rate": 0.00014796625713252848, + "loss": 0.7966, + "step": 451 + }, + { + "epoch": 0.3616, + "grad_norm": 0.41545457724746276, + "learning_rate": 0.00014773865400511272, + "loss": 0.7301, + "step": 452 + }, + { + "epoch": 0.3624, + "grad_norm": 0.3645602116722643, + "learning_rate": 0.00014751073012934587, + "loss": 0.6745, + "step": 453 + }, + { + "epoch": 0.3632, + "grad_norm": 0.4137319078060829, + "learning_rate": 0.00014728248703661182, + "loss": 0.7097, + "step": 454 + }, + { + "epoch": 0.364, + "grad_norm": 0.41434719090532385, + "learning_rate": 0.0001470539262604393, + "loss": 0.7283, + "step": 455 + }, + { + "epoch": 0.3648, + "grad_norm": 0.4189836301797749, + "learning_rate": 0.00014682504933649144, + "loss": 0.7514, + "step": 456 + }, + { + "epoch": 0.3656, + "grad_norm": 0.4446067879616907, + "learning_rate": 0.00014659585780255556, + "loss": 0.7609, + "step": 457 + }, + { + "epoch": 0.3664, + "grad_norm": 0.3855150992819854, + "learning_rate": 0.00014636635319853275, + "loss": 0.7222, + "step": 458 + }, + { + "epoch": 0.3672, + "grad_norm": 0.4186352163434588, + "learning_rate": 0.0001461365370664276, + "loss": 0.7024, + "step": 459 + }, + { + "epoch": 0.368, + "grad_norm": 0.43010146235663815, + "learning_rate": 0.00014590641095033787, + "loss": 0.8027, + "step": 460 + }, + { + "epoch": 0.3688, + "grad_norm": 0.3920652110717218, + "learning_rate": 0.00014567597639644387, + "loss": 0.6709, + "step": 461 + }, + { + "epoch": 0.3696, + "grad_norm": 0.3764836291055801, + "learning_rate": 0.00014544523495299842, + "loss": 0.6964, + "step": 462 + }, + { + "epoch": 0.3704, + "grad_norm": 0.4054858421959242, + "learning_rate": 0.00014521418817031628, + "loss": 0.8018, + "step": 463 + }, + { + "epoch": 0.3712, + "grad_norm": 0.37245905228447623, + "learning_rate": 0.0001449828376007636, + "loss": 0.6737, + "step": 464 + }, + { + "epoch": 0.372, + "grad_norm": 0.38370190144262273, + "learning_rate": 0.00014475118479874774, + "loss": 0.7373, + "step": 465 + }, + { + "epoch": 0.3728, + "grad_norm": 0.3709038780796342, + "learning_rate": 0.0001445192313207067, + "loss": 0.7489, + "step": 466 + }, + { + "epoch": 0.3736, + "grad_norm": 0.43629126754331155, + "learning_rate": 0.0001442869787250987, + "loss": 0.7166, + "step": 467 + }, + { + "epoch": 0.3744, + "grad_norm": 0.46491387794401623, + "learning_rate": 0.0001440544285723915, + "loss": 0.8376, + "step": 468 + }, + { + "epoch": 0.3752, + "grad_norm": 0.466085825819284, + "learning_rate": 0.00014382158242505234, + "loss": 0.783, + "step": 469 + }, + { + "epoch": 0.376, + "grad_norm": 0.44202181643730803, + "learning_rate": 0.00014358844184753712, + "loss": 0.7547, + "step": 470 + }, + { + "epoch": 0.3768, + "grad_norm": 0.4072741860029995, + "learning_rate": 0.00014335500840627986, + "loss": 0.7164, + "step": 471 + }, + { + "epoch": 0.3776, + "grad_norm": 0.4275506204938012, + "learning_rate": 0.00014312128366968243, + "loss": 0.7707, + "step": 472 + }, + { + "epoch": 0.3784, + "grad_norm": 0.3857298764352471, + "learning_rate": 0.0001428872692081038, + "loss": 0.7133, + "step": 473 + }, + { + "epoch": 0.3792, + "grad_norm": 0.37400963436434526, + "learning_rate": 0.00014265296659384956, + "loss": 0.656, + "step": 474 + }, + { + "epoch": 0.38, + "grad_norm": 0.40789460594727933, + "learning_rate": 0.00014241837740116132, + "loss": 0.7493, + "step": 475 + }, + { + "epoch": 0.3808, + "grad_norm": 0.43917867571684516, + "learning_rate": 0.00014218350320620624, + "loss": 0.7404, + "step": 476 + }, + { + "epoch": 0.3816, + "grad_norm": 0.38770506133101995, + "learning_rate": 0.00014194834558706632, + "loss": 0.6963, + "step": 477 + }, + { + "epoch": 0.3824, + "grad_norm": 0.4605066053260177, + "learning_rate": 0.0001417129061237278, + "loss": 0.7401, + "step": 478 + }, + { + "epoch": 0.3832, + "grad_norm": 0.3623723615249701, + "learning_rate": 0.0001414771863980707, + "loss": 0.6946, + "step": 479 + }, + { + "epoch": 0.384, + "grad_norm": 0.472597365619049, + "learning_rate": 0.00014124118799385796, + "loss": 0.849, + "step": 480 + }, + { + "epoch": 0.3848, + "grad_norm": 0.38059398536400085, + "learning_rate": 0.00014100491249672498, + "loss": 0.6829, + "step": 481 + }, + { + "epoch": 0.3856, + "grad_norm": 0.4419328471189818, + "learning_rate": 0.00014076836149416887, + "loss": 0.7706, + "step": 482 + }, + { + "epoch": 0.3864, + "grad_norm": 0.4174488873084172, + "learning_rate": 0.0001405315365755379, + "loss": 0.7806, + "step": 483 + }, + { + "epoch": 0.3872, + "grad_norm": 0.37995649754107935, + "learning_rate": 0.0001402944393320206, + "loss": 0.7073, + "step": 484 + }, + { + "epoch": 0.388, + "grad_norm": 0.37905339289160617, + "learning_rate": 0.00014005707135663527, + "loss": 0.7059, + "step": 485 + }, + { + "epoch": 0.3888, + "grad_norm": 0.39285244383934415, + "learning_rate": 0.00013981943424421932, + "loss": 0.7278, + "step": 486 + }, + { + "epoch": 0.3896, + "grad_norm": 0.40403149537453, + "learning_rate": 0.00013958152959141825, + "loss": 0.7724, + "step": 487 + }, + { + "epoch": 0.3904, + "grad_norm": 0.45547737889839607, + "learning_rate": 0.00013934335899667527, + "loss": 0.7227, + "step": 488 + }, + { + "epoch": 0.3912, + "grad_norm": 0.39041841421012613, + "learning_rate": 0.00013910492406022033, + "loss": 0.7022, + "step": 489 + }, + { + "epoch": 0.392, + "grad_norm": 0.4409379938077856, + "learning_rate": 0.00013886622638405952, + "loss": 0.7887, + "step": 490 + }, + { + "epoch": 0.3928, + "grad_norm": 0.40077109270195066, + "learning_rate": 0.0001386272675719642, + "loss": 0.7368, + "step": 491 + }, + { + "epoch": 0.3936, + "grad_norm": 0.3868223095035165, + "learning_rate": 0.00013838804922946027, + "loss": 0.6246, + "step": 492 + }, + { + "epoch": 0.3944, + "grad_norm": 0.4553742997236335, + "learning_rate": 0.00013814857296381728, + "loss": 0.7501, + "step": 493 + }, + { + "epoch": 0.3952, + "grad_norm": 0.4312707891511658, + "learning_rate": 0.00013790884038403795, + "loss": 0.7813, + "step": 494 + }, + { + "epoch": 0.396, + "grad_norm": 0.3825011866495948, + "learning_rate": 0.00013766885310084688, + "loss": 0.7621, + "step": 495 + }, + { + "epoch": 0.3968, + "grad_norm": 0.4005274079834418, + "learning_rate": 0.00013742861272668012, + "loss": 0.7327, + "step": 496 + }, + { + "epoch": 0.3976, + "grad_norm": 0.47541170393500026, + "learning_rate": 0.00013718812087567414, + "loss": 0.8572, + "step": 497 + }, + { + "epoch": 0.3984, + "grad_norm": 0.39858455064952547, + "learning_rate": 0.00013694737916365517, + "loss": 0.7918, + "step": 498 + }, + { + "epoch": 0.3992, + "grad_norm": 0.35876780138681247, + "learning_rate": 0.000136706389208128, + "loss": 0.6632, + "step": 499 + }, + { + "epoch": 0.4, + "grad_norm": 0.4132380480989518, + "learning_rate": 0.00013646515262826552, + "loss": 0.769, + "step": 500 + }, + { + "epoch": 0.4008, + "grad_norm": 0.41498450091839245, + "learning_rate": 0.00013622367104489756, + "loss": 0.7731, + "step": 501 + }, + { + "epoch": 0.4016, + "grad_norm": 0.41062942938318003, + "learning_rate": 0.0001359819460805001, + "loss": 0.7176, + "step": 502 + }, + { + "epoch": 0.4024, + "grad_norm": 0.4241205793650312, + "learning_rate": 0.0001357399793591844, + "loss": 0.8063, + "step": 503 + }, + { + "epoch": 0.4032, + "grad_norm": 0.45146142335068495, + "learning_rate": 0.0001354977725066859, + "loss": 0.7076, + "step": 504 + }, + { + "epoch": 0.404, + "grad_norm": 0.4255936702727934, + "learning_rate": 0.00013525532715035366, + "loss": 0.7368, + "step": 505 + }, + { + "epoch": 0.4048, + "grad_norm": 0.3725789711645719, + "learning_rate": 0.00013501264491913906, + "loss": 0.6377, + "step": 506 + }, + { + "epoch": 0.4056, + "grad_norm": 0.4281771555109678, + "learning_rate": 0.00013476972744358507, + "loss": 0.7534, + "step": 507 + }, + { + "epoch": 0.4064, + "grad_norm": 0.4166453240078789, + "learning_rate": 0.0001345265763558152, + "loss": 0.7519, + "step": 508 + }, + { + "epoch": 0.4072, + "grad_norm": 0.3935184546178131, + "learning_rate": 0.00013428319328952253, + "loss": 0.7095, + "step": 509 + }, + { + "epoch": 0.408, + "grad_norm": 0.4126976340784094, + "learning_rate": 0.00013403957987995882, + "loss": 0.7341, + "step": 510 + }, + { + "epoch": 0.4088, + "grad_norm": 0.3863881482536955, + "learning_rate": 0.0001337957377639235, + "loss": 0.7071, + "step": 511 + }, + { + "epoch": 0.4096, + "grad_norm": 0.40377691461157295, + "learning_rate": 0.0001335516685797525, + "loss": 0.7482, + "step": 512 + }, + { + "epoch": 0.4104, + "grad_norm": 0.4043391983877137, + "learning_rate": 0.0001333073739673076, + "loss": 0.7771, + "step": 513 + }, + { + "epoch": 0.4112, + "grad_norm": 0.4404424404024769, + "learning_rate": 0.00013306285556796495, + "loss": 0.8005, + "step": 514 + }, + { + "epoch": 0.412, + "grad_norm": 0.40512715034400004, + "learning_rate": 0.0001328181150246045, + "loss": 0.7869, + "step": 515 + }, + { + "epoch": 0.4128, + "grad_norm": 0.3794249661172897, + "learning_rate": 0.00013257315398159864, + "loss": 0.7199, + "step": 516 + }, + { + "epoch": 0.4136, + "grad_norm": 0.36643121228895176, + "learning_rate": 0.00013232797408480127, + "loss": 0.6775, + "step": 517 + }, + { + "epoch": 0.4144, + "grad_norm": 0.41580145081995074, + "learning_rate": 0.00013208257698153677, + "loss": 0.786, + "step": 518 + }, + { + "epoch": 0.4152, + "grad_norm": 0.3629299406899404, + "learning_rate": 0.00013183696432058888, + "loss": 0.6818, + "step": 519 + }, + { + "epoch": 0.416, + "grad_norm": 0.4351438441221242, + "learning_rate": 0.00013159113775218964, + "loss": 0.786, + "step": 520 + }, + { + "epoch": 0.4168, + "grad_norm": 0.4115001032600809, + "learning_rate": 0.00013134509892800822, + "loss": 0.779, + "step": 521 + }, + { + "epoch": 0.4176, + "grad_norm": 0.4333855714956754, + "learning_rate": 0.00013109884950114007, + "loss": 0.7593, + "step": 522 + }, + { + "epoch": 0.4184, + "grad_norm": 0.37380437241387643, + "learning_rate": 0.00013085239112609547, + "loss": 0.7469, + "step": 523 + }, + { + "epoch": 0.4192, + "grad_norm": 0.40366060949773175, + "learning_rate": 0.00013060572545878875, + "loss": 0.7328, + "step": 524 + }, + { + "epoch": 0.42, + "grad_norm": 0.39096397503334457, + "learning_rate": 0.00013035885415652685, + "loss": 0.7299, + "step": 525 + }, + { + "epoch": 0.4208, + "grad_norm": 0.4060822153997327, + "learning_rate": 0.00013011177887799845, + "loss": 0.7612, + "step": 526 + }, + { + "epoch": 0.4216, + "grad_norm": 0.33592868000345044, + "learning_rate": 0.00012986450128326266, + "loss": 0.6428, + "step": 527 + }, + { + "epoch": 0.4224, + "grad_norm": 0.37586206358484725, + "learning_rate": 0.00012961702303373795, + "loss": 0.7068, + "step": 528 + }, + { + "epoch": 0.4232, + "grad_norm": 0.36027202021666715, + "learning_rate": 0.00012936934579219094, + "loss": 0.6888, + "step": 529 + }, + { + "epoch": 0.424, + "grad_norm": 0.3805555609387189, + "learning_rate": 0.00012912147122272523, + "loss": 0.7545, + "step": 530 + }, + { + "epoch": 0.4248, + "grad_norm": 0.3950942606832783, + "learning_rate": 0.00012887340099077024, + "loss": 0.7705, + "step": 531 + }, + { + "epoch": 0.4256, + "grad_norm": 0.33351189312432067, + "learning_rate": 0.00012862513676307008, + "loss": 0.6195, + "step": 532 + }, + { + "epoch": 0.4264, + "grad_norm": 0.41876839923857423, + "learning_rate": 0.0001283766802076722, + "loss": 0.7071, + "step": 533 + }, + { + "epoch": 0.4272, + "grad_norm": 0.38472473714600236, + "learning_rate": 0.00012812803299391628, + "loss": 0.6606, + "step": 534 + }, + { + "epoch": 0.428, + "grad_norm": 0.3843758442692758, + "learning_rate": 0.00012787919679242306, + "loss": 0.693, + "step": 535 + }, + { + "epoch": 0.4288, + "grad_norm": 0.4283657231928018, + "learning_rate": 0.00012763017327508305, + "loss": 0.7061, + "step": 536 + }, + { + "epoch": 0.4296, + "grad_norm": 0.42564365770391577, + "learning_rate": 0.00012738096411504522, + "loss": 0.7592, + "step": 537 + }, + { + "epoch": 0.4304, + "grad_norm": 0.4194372565938139, + "learning_rate": 0.0001271315709867059, + "loss": 0.7163, + "step": 538 + }, + { + "epoch": 0.4312, + "grad_norm": 0.3443994575649735, + "learning_rate": 0.00012688199556569753, + "loss": 0.6153, + "step": 539 + }, + { + "epoch": 0.432, + "grad_norm": 0.39972130259342203, + "learning_rate": 0.00012663223952887723, + "loss": 0.7222, + "step": 540 + }, + { + "epoch": 0.4328, + "grad_norm": 0.38675070762078495, + "learning_rate": 0.0001263823045543158, + "loss": 0.6423, + "step": 541 + }, + { + "epoch": 0.4336, + "grad_norm": 0.41841132296140043, + "learning_rate": 0.00012613219232128608, + "loss": 0.8016, + "step": 542 + }, + { + "epoch": 0.4344, + "grad_norm": 0.4167829979368189, + "learning_rate": 0.00012588190451025207, + "loss": 0.7683, + "step": 543 + }, + { + "epoch": 0.4352, + "grad_norm": 0.3876578397333228, + "learning_rate": 0.00012563144280285741, + "loss": 0.7366, + "step": 544 + }, + { + "epoch": 0.436, + "grad_norm": 0.45360324255725454, + "learning_rate": 0.00012538080888191408, + "loss": 0.8194, + "step": 545 + }, + { + "epoch": 0.4368, + "grad_norm": 0.3877847545263509, + "learning_rate": 0.00012513000443139112, + "loss": 0.7216, + "step": 546 + }, + { + "epoch": 0.4376, + "grad_norm": 0.3843877463228844, + "learning_rate": 0.00012487903113640337, + "loss": 0.7648, + "step": 547 + }, + { + "epoch": 0.4384, + "grad_norm": 0.3826732302875234, + "learning_rate": 0.00012462789068320017, + "loss": 0.7269, + "step": 548 + }, + { + "epoch": 0.4392, + "grad_norm": 0.4174993947124962, + "learning_rate": 0.00012437658475915377, + "loss": 0.6984, + "step": 549 + }, + { + "epoch": 0.44, + "grad_norm": 0.3956102928696449, + "learning_rate": 0.00012412511505274844, + "loss": 0.6817, + "step": 550 + }, + { + "epoch": 0.4408, + "grad_norm": 0.3855227847776443, + "learning_rate": 0.00012387348325356874, + "loss": 0.7379, + "step": 551 + }, + { + "epoch": 0.4416, + "grad_norm": 0.3870115797122608, + "learning_rate": 0.00012362169105228826, + "loss": 0.6653, + "step": 552 + }, + { + "epoch": 0.4424, + "grad_norm": 0.4032790635846131, + "learning_rate": 0.00012336974014065844, + "loss": 0.7051, + "step": 553 + }, + { + "epoch": 0.4432, + "grad_norm": 0.40870803711492304, + "learning_rate": 0.000123117632211497, + "loss": 0.6609, + "step": 554 + }, + { + "epoch": 0.444, + "grad_norm": 0.3836852385424524, + "learning_rate": 0.00012286536895867654, + "loss": 0.6716, + "step": 555 + }, + { + "epoch": 0.4448, + "grad_norm": 0.39027238939245784, + "learning_rate": 0.00012261295207711346, + "loss": 0.7238, + "step": 556 + }, + { + "epoch": 0.4456, + "grad_norm": 0.44435305582292883, + "learning_rate": 0.00012236038326275626, + "loss": 0.7842, + "step": 557 + }, + { + "epoch": 0.4464, + "grad_norm": 0.36807764876205074, + "learning_rate": 0.0001221076642125742, + "loss": 0.7188, + "step": 558 + }, + { + "epoch": 0.4472, + "grad_norm": 0.37974313056622594, + "learning_rate": 0.00012185479662454595, + "loss": 0.6753, + "step": 559 + }, + { + "epoch": 0.448, + "grad_norm": 0.49191848982158215, + "learning_rate": 0.00012160178219764837, + "loss": 0.7274, + "step": 560 + }, + { + "epoch": 0.4488, + "grad_norm": 0.436705325672836, + "learning_rate": 0.00012134862263184467, + "loss": 0.805, + "step": 561 + }, + { + "epoch": 0.4496, + "grad_norm": 0.4558997685379279, + "learning_rate": 0.00012109531962807332, + "loss": 0.7534, + "step": 562 + }, + { + "epoch": 0.4504, + "grad_norm": 0.3614375682478202, + "learning_rate": 0.00012084187488823657, + "loss": 0.7331, + "step": 563 + }, + { + "epoch": 0.4512, + "grad_norm": 0.40962537424818934, + "learning_rate": 0.00012058829011518896, + "loss": 0.7567, + "step": 564 + }, + { + "epoch": 0.452, + "grad_norm": 0.3717354299114109, + "learning_rate": 0.00012033456701272576, + "loss": 0.685, + "step": 565 + }, + { + "epoch": 0.4528, + "grad_norm": 0.46433120094348523, + "learning_rate": 0.00012008070728557186, + "loss": 0.8213, + "step": 566 + }, + { + "epoch": 0.4536, + "grad_norm": 0.3971642044552432, + "learning_rate": 0.00011982671263936995, + "loss": 0.7562, + "step": 567 + }, + { + "epoch": 0.4544, + "grad_norm": 0.36437109433661913, + "learning_rate": 0.00011957258478066931, + "loss": 0.6947, + "step": 568 + }, + { + "epoch": 0.4552, + "grad_norm": 0.4094693318616697, + "learning_rate": 0.00011931832541691418, + "loss": 0.7809, + "step": 569 + }, + { + "epoch": 0.456, + "grad_norm": 0.39973881274641615, + "learning_rate": 0.00011906393625643244, + "loss": 0.7591, + "step": 570 + }, + { + "epoch": 0.4568, + "grad_norm": 0.43600619875021857, + "learning_rate": 0.00011880941900842397, + "loss": 0.7489, + "step": 571 + }, + { + "epoch": 0.4576, + "grad_norm": 0.40584693339445527, + "learning_rate": 0.00011855477538294935, + "loss": 0.7284, + "step": 572 + }, + { + "epoch": 0.4584, + "grad_norm": 0.38949820467031054, + "learning_rate": 0.00011830000709091815, + "loss": 0.7687, + "step": 573 + }, + { + "epoch": 0.4592, + "grad_norm": 0.31714194772776355, + "learning_rate": 0.00011804511584407763, + "loss": 0.5904, + "step": 574 + }, + { + "epoch": 0.46, + "grad_norm": 0.4385282513881684, + "learning_rate": 0.0001177901033550012, + "loss": 0.7252, + "step": 575 + }, + { + "epoch": 0.4608, + "grad_norm": 0.39049755420507537, + "learning_rate": 0.00011753497133707679, + "loss": 0.6847, + "step": 576 + }, + { + "epoch": 0.4616, + "grad_norm": 0.40999065646712857, + "learning_rate": 0.00011727972150449544, + "loss": 0.771, + "step": 577 + }, + { + "epoch": 0.4624, + "grad_norm": 0.3443179403111987, + "learning_rate": 0.00011702435557223987, + "loss": 0.6518, + "step": 578 + }, + { + "epoch": 0.4632, + "grad_norm": 0.360005319545128, + "learning_rate": 0.00011676887525607271, + "loss": 0.6874, + "step": 579 + }, + { + "epoch": 0.464, + "grad_norm": 0.3660357434477541, + "learning_rate": 0.00011651328227252517, + "loss": 0.6991, + "step": 580 + }, + { + "epoch": 0.4648, + "grad_norm": 0.3501226577092116, + "learning_rate": 0.00011625757833888551, + "loss": 0.6327, + "step": 581 + }, + { + "epoch": 0.4656, + "grad_norm": 0.34940015104960626, + "learning_rate": 0.00011600176517318741, + "loss": 0.6964, + "step": 582 + }, + { + "epoch": 0.4664, + "grad_norm": 0.3959512088553381, + "learning_rate": 0.0001157458444941984, + "loss": 0.7498, + "step": 583 + }, + { + "epoch": 0.4672, + "grad_norm": 0.5000743050602681, + "learning_rate": 0.00011548981802140848, + "loss": 0.8117, + "step": 584 + }, + { + "epoch": 0.468, + "grad_norm": 0.45493454847087106, + "learning_rate": 0.00011523368747501839, + "loss": 0.7654, + "step": 585 + }, + { + "epoch": 0.4688, + "grad_norm": 0.4181817072560485, + "learning_rate": 0.00011497745457592816, + "loss": 0.7049, + "step": 586 + }, + { + "epoch": 0.4696, + "grad_norm": 0.4993112369023617, + "learning_rate": 0.00011472112104572547, + "loss": 0.8586, + "step": 587 + }, + { + "epoch": 0.4704, + "grad_norm": 0.3688590762804925, + "learning_rate": 0.00011446468860667421, + "loss": 0.6746, + "step": 588 + }, + { + "epoch": 0.4712, + "grad_norm": 0.4035478132113849, + "learning_rate": 0.0001142081589817027, + "loss": 0.722, + "step": 589 + }, + { + "epoch": 0.472, + "grad_norm": 0.5120096404712139, + "learning_rate": 0.00011395153389439233, + "loss": 0.8725, + "step": 590 + }, + { + "epoch": 0.4728, + "grad_norm": 0.3943051369711696, + "learning_rate": 0.00011369481506896582, + "loss": 0.7279, + "step": 591 + }, + { + "epoch": 0.4736, + "grad_norm": 0.49952239523844755, + "learning_rate": 0.00011343800423027582, + "loss": 0.806, + "step": 592 + }, + { + "epoch": 0.4744, + "grad_norm": 0.4111541198803906, + "learning_rate": 0.00011318110310379301, + "loss": 0.7445, + "step": 593 + }, + { + "epoch": 0.4752, + "grad_norm": 0.42379537377629345, + "learning_rate": 0.0001129241134155949, + "loss": 0.7724, + "step": 594 + }, + { + "epoch": 0.476, + "grad_norm": 0.3876066415267279, + "learning_rate": 0.00011266703689235394, + "loss": 0.733, + "step": 595 + }, + { + "epoch": 0.4768, + "grad_norm": 0.38414077142798214, + "learning_rate": 0.00011240987526132594, + "loss": 0.7195, + "step": 596 + }, + { + "epoch": 0.4776, + "grad_norm": 0.3961888164133105, + "learning_rate": 0.00011215263025033869, + "loss": 0.708, + "step": 597 + }, + { + "epoch": 0.4784, + "grad_norm": 0.38639229096606376, + "learning_rate": 0.00011189530358778005, + "loss": 0.7574, + "step": 598 + }, + { + "epoch": 0.4792, + "grad_norm": 0.486289906730662, + "learning_rate": 0.00011163789700258655, + "loss": 0.749, + "step": 599 + }, + { + "epoch": 0.48, + "grad_norm": 0.35051217769492726, + "learning_rate": 0.00011138041222423177, + "loss": 0.6801, + "step": 600 + }, + { + "epoch": 0.4808, + "grad_norm": 0.3992297765587727, + "learning_rate": 0.00011112285098271451, + "loss": 0.7253, + "step": 601 + }, + { + "epoch": 0.4816, + "grad_norm": 0.4191833397729842, + "learning_rate": 0.00011086521500854745, + "loss": 0.6575, + "step": 602 + }, + { + "epoch": 0.4824, + "grad_norm": 0.45621005745711735, + "learning_rate": 0.00011060750603274535, + "loss": 0.7948, + "step": 603 + }, + { + "epoch": 0.4832, + "grad_norm": 0.42468678684324007, + "learning_rate": 0.00011034972578681338, + "loss": 0.7157, + "step": 604 + }, + { + "epoch": 0.484, + "grad_norm": 0.43463087625405183, + "learning_rate": 0.00011009187600273566, + "loss": 0.7718, + "step": 605 + }, + { + "epoch": 0.4848, + "grad_norm": 0.40796885202363853, + "learning_rate": 0.00010983395841296348, + "loss": 0.6686, + "step": 606 + }, + { + "epoch": 0.4856, + "grad_norm": 0.42180210923844613, + "learning_rate": 0.00010957597475040373, + "loss": 0.7862, + "step": 607 + }, + { + "epoch": 0.4864, + "grad_norm": 0.3306181956675294, + "learning_rate": 0.00010931792674840718, + "loss": 0.6291, + "step": 608 + }, + { + "epoch": 0.4872, + "grad_norm": 0.3986594575930178, + "learning_rate": 0.00010905981614075693, + "loss": 0.7029, + "step": 609 + }, + { + "epoch": 0.488, + "grad_norm": 0.37841030070189924, + "learning_rate": 0.00010880164466165674, + "loss": 0.6921, + "step": 610 + }, + { + "epoch": 0.4888, + "grad_norm": 0.4456022887964803, + "learning_rate": 0.00010854341404571928, + "loss": 0.7539, + "step": 611 + }, + { + "epoch": 0.4896, + "grad_norm": 0.36332341374759114, + "learning_rate": 0.00010828512602795462, + "loss": 0.7275, + "step": 612 + }, + { + "epoch": 0.4904, + "grad_norm": 0.4088416033183619, + "learning_rate": 0.00010802678234375851, + "loss": 0.7169, + "step": 613 + }, + { + "epoch": 0.4912, + "grad_norm": 0.4489539684761865, + "learning_rate": 0.00010776838472890065, + "loss": 0.6952, + "step": 614 + }, + { + "epoch": 0.492, + "grad_norm": 0.37750215592470654, + "learning_rate": 0.0001075099349195131, + "loss": 0.6821, + "step": 615 + }, + { + "epoch": 0.4928, + "grad_norm": 0.42414263824607873, + "learning_rate": 0.00010725143465207867, + "loss": 0.6966, + "step": 616 + }, + { + "epoch": 0.4936, + "grad_norm": 0.4627681021566136, + "learning_rate": 0.00010699288566341914, + "loss": 0.7437, + "step": 617 + }, + { + "epoch": 0.4944, + "grad_norm": 0.4296399308739044, + "learning_rate": 0.00010673428969068364, + "loss": 0.7833, + "step": 618 + }, + { + "epoch": 0.4952, + "grad_norm": 0.39315532016308347, + "learning_rate": 0.000106475648471337, + "loss": 0.6944, + "step": 619 + }, + { + "epoch": 0.496, + "grad_norm": 0.32901112928582993, + "learning_rate": 0.00010621696374314807, + "loss": 0.636, + "step": 620 + }, + { + "epoch": 0.4968, + "grad_norm": 0.44676630419360114, + "learning_rate": 0.00010595823724417795, + "loss": 0.8107, + "step": 621 + }, + { + "epoch": 0.4976, + "grad_norm": 0.3193413326727492, + "learning_rate": 0.00010569947071276847, + "loss": 0.5789, + "step": 622 + }, + { + "epoch": 0.4984, + "grad_norm": 0.4383082208833308, + "learning_rate": 0.00010544066588753044, + "loss": 0.7389, + "step": 623 + }, + { + "epoch": 0.4992, + "grad_norm": 0.37893839035584187, + "learning_rate": 0.00010518182450733186, + "loss": 0.7232, + "step": 624 + }, + { + "epoch": 0.5, + "grad_norm": 0.40385514455332266, + "learning_rate": 0.00010492294831128641, + "loss": 0.6872, + "step": 625 + }, + { + "epoch": 0.5008, + "grad_norm": 0.4107409175696198, + "learning_rate": 0.00010466403903874176, + "loss": 0.7025, + "step": 626 + }, + { + "epoch": 0.5016, + "grad_norm": 0.45813590214281025, + "learning_rate": 0.00010440509842926767, + "loss": 0.7535, + "step": 627 + }, + { + "epoch": 0.5024, + "grad_norm": 0.48500097796997715, + "learning_rate": 0.00010414612822264455, + "loss": 0.7116, + "step": 628 + }, + { + "epoch": 0.5032, + "grad_norm": 0.4390511318468338, + "learning_rate": 0.00010388713015885161, + "loss": 0.7775, + "step": 629 + }, + { + "epoch": 0.504, + "grad_norm": 0.5744965923745033, + "learning_rate": 0.00010362810597805526, + "loss": 0.7349, + "step": 630 + }, + { + "epoch": 0.5048, + "grad_norm": 0.35685145254456246, + "learning_rate": 0.00010336905742059742, + "loss": 0.6812, + "step": 631 + }, + { + "epoch": 0.5056, + "grad_norm": 0.4500760176534382, + "learning_rate": 0.0001031099862269837, + "loss": 0.7752, + "step": 632 + }, + { + "epoch": 0.5064, + "grad_norm": 0.43381025518263794, + "learning_rate": 0.0001028508941378719, + "loss": 0.7205, + "step": 633 + }, + { + "epoch": 0.5072, + "grad_norm": 0.3743091354152876, + "learning_rate": 0.00010259178289406011, + "loss": 0.7308, + "step": 634 + }, + { + "epoch": 0.508, + "grad_norm": 0.35456208664101796, + "learning_rate": 0.00010233265423647523, + "loss": 0.7058, + "step": 635 + }, + { + "epoch": 0.5088, + "grad_norm": 0.3777273041080645, + "learning_rate": 0.00010207350990616107, + "loss": 0.6929, + "step": 636 + }, + { + "epoch": 0.5096, + "grad_norm": 0.5627296053512927, + "learning_rate": 0.00010181435164426676, + "loss": 0.7176, + "step": 637 + }, + { + "epoch": 0.5104, + "grad_norm": 0.3882411951123953, + "learning_rate": 0.0001015551811920351, + "loss": 0.6935, + "step": 638 + }, + { + "epoch": 0.5112, + "grad_norm": 0.4010106008367087, + "learning_rate": 0.00010129600029079072, + "loss": 0.7295, + "step": 639 + }, + { + "epoch": 0.512, + "grad_norm": 0.4286166627048035, + "learning_rate": 0.00010103681068192845, + "loss": 0.7521, + "step": 640 + }, + { + "epoch": 0.5128, + "grad_norm": 0.38134404488493295, + "learning_rate": 0.00010077761410690172, + "loss": 0.6871, + "step": 641 + }, + { + "epoch": 0.5136, + "grad_norm": 0.3649315007168492, + "learning_rate": 0.00010051841230721065, + "loss": 0.7501, + "step": 642 + }, + { + "epoch": 0.5144, + "grad_norm": 0.32894272639704447, + "learning_rate": 0.00010025920702439051, + "loss": 0.6173, + "step": 643 + }, + { + "epoch": 0.5152, + "grad_norm": 0.3979670296084114, + "learning_rate": 0.0001, + "loss": 0.7092, + "step": 644 + }, + { + "epoch": 0.516, + "grad_norm": 0.375818746489365, + "learning_rate": 9.97407929756095e-05, + "loss": 0.6522, + "step": 645 + }, + { + "epoch": 0.5168, + "grad_norm": 0.3298020418260722, + "learning_rate": 9.948158769278939e-05, + "loss": 0.6296, + "step": 646 + }, + { + "epoch": 0.5176, + "grad_norm": 0.35704944917221043, + "learning_rate": 9.92223858930983e-05, + "loss": 0.6848, + "step": 647 + }, + { + "epoch": 0.5184, + "grad_norm": 0.3593459758999515, + "learning_rate": 9.896318931807155e-05, + "loss": 0.6757, + "step": 648 + }, + { + "epoch": 0.5192, + "grad_norm": 0.41017337430423023, + "learning_rate": 9.870399970920932e-05, + "loss": 0.7464, + "step": 649 + }, + { + "epoch": 0.52, + "grad_norm": 0.38870788914625504, + "learning_rate": 9.844481880796491e-05, + "loss": 0.7182, + "step": 650 + }, + { + "epoch": 0.5208, + "grad_norm": 0.36075187159462824, + "learning_rate": 9.818564835573323e-05, + "loss": 0.6606, + "step": 651 + }, + { + "epoch": 0.5216, + "grad_norm": 0.3465952192890516, + "learning_rate": 9.792649009383899e-05, + "loss": 0.657, + "step": 652 + }, + { + "epoch": 0.5224, + "grad_norm": 0.3890310053274927, + "learning_rate": 9.766734576352478e-05, + "loss": 0.729, + "step": 653 + }, + { + "epoch": 0.5232, + "grad_norm": 0.36891429507117385, + "learning_rate": 9.740821710593989e-05, + "loss": 0.6528, + "step": 654 + }, + { + "epoch": 0.524, + "grad_norm": 0.3889508470707155, + "learning_rate": 9.714910586212816e-05, + "loss": 0.703, + "step": 655 + }, + { + "epoch": 0.5248, + "grad_norm": 0.3708482772935566, + "learning_rate": 9.689001377301633e-05, + "loss": 0.7199, + "step": 656 + }, + { + "epoch": 0.5256, + "grad_norm": 0.38630447929278516, + "learning_rate": 9.663094257940258e-05, + "loss": 0.6558, + "step": 657 + }, + { + "epoch": 0.5264, + "grad_norm": 0.4252677946241481, + "learning_rate": 9.637189402194476e-05, + "loss": 0.76, + "step": 658 + }, + { + "epoch": 0.5272, + "grad_norm": 0.3904895795341803, + "learning_rate": 9.611286984114841e-05, + "loss": 0.7429, + "step": 659 + }, + { + "epoch": 0.528, + "grad_norm": 0.3903828145173097, + "learning_rate": 9.585387177735547e-05, + "loss": 0.6662, + "step": 660 + }, + { + "epoch": 0.5288, + "grad_norm": 0.4197535838547231, + "learning_rate": 9.559490157073236e-05, + "loss": 0.6786, + "step": 661 + }, + { + "epoch": 0.5296, + "grad_norm": 0.39358250990389637, + "learning_rate": 9.533596096125825e-05, + "loss": 0.6959, + "step": 662 + }, + { + "epoch": 0.5304, + "grad_norm": 0.39934874751805954, + "learning_rate": 9.507705168871358e-05, + "loss": 0.7437, + "step": 663 + }, + { + "epoch": 0.5312, + "grad_norm": 0.37851355446137275, + "learning_rate": 9.481817549266817e-05, + "loss": 0.7198, + "step": 664 + }, + { + "epoch": 0.532, + "grad_norm": 0.45341797186688787, + "learning_rate": 9.455933411246958e-05, + "loss": 0.7767, + "step": 665 + }, + { + "epoch": 0.5328, + "grad_norm": 0.36237706418665944, + "learning_rate": 9.430052928723153e-05, + "loss": 0.644, + "step": 666 + }, + { + "epoch": 0.5336, + "grad_norm": 0.39774273457816467, + "learning_rate": 9.404176275582208e-05, + "loss": 0.6457, + "step": 667 + }, + { + "epoch": 0.5344, + "grad_norm": 0.47254898712867793, + "learning_rate": 9.378303625685195e-05, + "loss": 0.7675, + "step": 668 + }, + { + "epoch": 0.5352, + "grad_norm": 0.5104606186884628, + "learning_rate": 9.352435152866298e-05, + "loss": 0.8824, + "step": 669 + }, + { + "epoch": 0.536, + "grad_norm": 0.42332695730487313, + "learning_rate": 9.326571030931637e-05, + "loss": 0.7512, + "step": 670 + }, + { + "epoch": 0.5368, + "grad_norm": 0.40240037498992387, + "learning_rate": 9.300711433658087e-05, + "loss": 0.7049, + "step": 671 + }, + { + "epoch": 0.5376, + "grad_norm": 0.4098457092214055, + "learning_rate": 9.274856534792138e-05, + "loss": 0.7245, + "step": 672 + }, + { + "epoch": 0.5384, + "grad_norm": 0.4872257879709029, + "learning_rate": 9.249006508048694e-05, + "loss": 0.6227, + "step": 673 + }, + { + "epoch": 0.5392, + "grad_norm": 0.3397579782310635, + "learning_rate": 9.223161527109937e-05, + "loss": 0.6183, + "step": 674 + }, + { + "epoch": 0.54, + "grad_norm": 0.41727071576691904, + "learning_rate": 9.197321765624152e-05, + "loss": 0.757, + "step": 675 + }, + { + "epoch": 0.5408, + "grad_norm": 0.37577272312505366, + "learning_rate": 9.171487397204539e-05, + "loss": 0.6718, + "step": 676 + }, + { + "epoch": 0.5416, + "grad_norm": 0.4036720264091577, + "learning_rate": 9.145658595428074e-05, + "loss": 0.7189, + "step": 677 + }, + { + "epoch": 0.5424, + "grad_norm": 0.42896781846769505, + "learning_rate": 9.119835533834331e-05, + "loss": 0.7073, + "step": 678 + }, + { + "epoch": 0.5432, + "grad_norm": 0.47154385000572885, + "learning_rate": 9.09401838592431e-05, + "loss": 0.8136, + "step": 679 + }, + { + "epoch": 0.544, + "grad_norm": 0.38893628944026587, + "learning_rate": 9.068207325159284e-05, + "loss": 0.6901, + "step": 680 + }, + { + "epoch": 0.5448, + "grad_norm": 0.3815592852819167, + "learning_rate": 9.04240252495963e-05, + "loss": 0.7259, + "step": 681 + }, + { + "epoch": 0.5456, + "grad_norm": 0.41294996353926877, + "learning_rate": 9.016604158703654e-05, + "loss": 0.7351, + "step": 682 + }, + { + "epoch": 0.5464, + "grad_norm": 0.4175120763011901, + "learning_rate": 8.990812399726435e-05, + "loss": 0.7051, + "step": 683 + }, + { + "epoch": 0.5472, + "grad_norm": 0.46669499607969855, + "learning_rate": 8.965027421318665e-05, + "loss": 0.7749, + "step": 684 + }, + { + "epoch": 0.548, + "grad_norm": 0.34556767002863736, + "learning_rate": 8.939249396725467e-05, + "loss": 0.6551, + "step": 685 + }, + { + "epoch": 0.5488, + "grad_norm": 0.4274465736455529, + "learning_rate": 8.913478499145254e-05, + "loss": 0.7329, + "step": 686 + }, + { + "epoch": 0.5496, + "grad_norm": 0.43355152213796083, + "learning_rate": 8.887714901728551e-05, + "loss": 0.7056, + "step": 687 + }, + { + "epoch": 0.5504, + "grad_norm": 0.6048622245573282, + "learning_rate": 8.861958777576827e-05, + "loss": 0.794, + "step": 688 + }, + { + "epoch": 0.5512, + "grad_norm": 0.3441757994502071, + "learning_rate": 8.836210299741346e-05, + "loss": 0.6646, + "step": 689 + }, + { + "epoch": 0.552, + "grad_norm": 0.42196882403388164, + "learning_rate": 8.810469641222001e-05, + "loss": 0.7324, + "step": 690 + }, + { + "epoch": 0.5528, + "grad_norm": 0.4066753822516913, + "learning_rate": 8.784736974966135e-05, + "loss": 0.7334, + "step": 691 + }, + { + "epoch": 0.5536, + "grad_norm": 0.40730272513897353, + "learning_rate": 8.759012473867407e-05, + "loss": 0.709, + "step": 692 + }, + { + "epoch": 0.5544, + "grad_norm": 0.4081083754585546, + "learning_rate": 8.733296310764611e-05, + "loss": 0.7078, + "step": 693 + }, + { + "epoch": 0.5552, + "grad_norm": 0.37441700960388963, + "learning_rate": 8.707588658440511e-05, + "loss": 0.7076, + "step": 694 + }, + { + "epoch": 0.556, + "grad_norm": 0.4167855001509996, + "learning_rate": 8.6818896896207e-05, + "loss": 0.7298, + "step": 695 + }, + { + "epoch": 0.5568, + "grad_norm": 0.3893163424833126, + "learning_rate": 8.656199576972423e-05, + "loss": 0.7387, + "step": 696 + }, + { + "epoch": 0.5576, + "grad_norm": 0.39857255128153724, + "learning_rate": 8.63051849310342e-05, + "loss": 0.7165, + "step": 697 + }, + { + "epoch": 0.5584, + "grad_norm": 0.4535351896839242, + "learning_rate": 8.604846610560771e-05, + "loss": 0.7973, + "step": 698 + }, + { + "epoch": 0.5592, + "grad_norm": 0.4155102465200645, + "learning_rate": 8.579184101829734e-05, + "loss": 0.6877, + "step": 699 + }, + { + "epoch": 0.56, + "grad_norm": 0.38378824297310854, + "learning_rate": 8.553531139332582e-05, + "loss": 0.6466, + "step": 700 + }, + { + "epoch": 0.5608, + "grad_norm": 0.4113967300469492, + "learning_rate": 8.527887895427454e-05, + "loss": 0.748, + "step": 701 + }, + { + "epoch": 0.5616, + "grad_norm": 0.39313027960520175, + "learning_rate": 8.502254542407186e-05, + "loss": 0.7822, + "step": 702 + }, + { + "epoch": 0.5624, + "grad_norm": 0.44822833663854883, + "learning_rate": 8.476631252498162e-05, + "loss": 0.7345, + "step": 703 + }, + { + "epoch": 0.5632, + "grad_norm": 0.34454468305430036, + "learning_rate": 8.451018197859153e-05, + "loss": 0.677, + "step": 704 + }, + { + "epoch": 0.564, + "grad_norm": 0.3569000138406648, + "learning_rate": 8.425415550580162e-05, + "loss": 0.6789, + "step": 705 + }, + { + "epoch": 0.5648, + "grad_norm": 0.4362006511176541, + "learning_rate": 8.399823482681262e-05, + "loss": 0.6211, + "step": 706 + }, + { + "epoch": 0.5656, + "grad_norm": 0.366006201500598, + "learning_rate": 8.374242166111448e-05, + "loss": 0.6798, + "step": 707 + }, + { + "epoch": 0.5664, + "grad_norm": 0.4232382038227556, + "learning_rate": 8.348671772747487e-05, + "loss": 0.7744, + "step": 708 + }, + { + "epoch": 0.5672, + "grad_norm": 0.371496901813501, + "learning_rate": 8.323112474392731e-05, + "loss": 0.6871, + "step": 709 + }, + { + "epoch": 0.568, + "grad_norm": 0.6731000117772543, + "learning_rate": 8.297564442776014e-05, + "loss": 0.7085, + "step": 710 + }, + { + "epoch": 0.5688, + "grad_norm": 0.39080670456360833, + "learning_rate": 8.272027849550457e-05, + "loss": 0.6256, + "step": 711 + }, + { + "epoch": 0.5696, + "grad_norm": 0.4117997518231274, + "learning_rate": 8.246502866292324e-05, + "loss": 0.6984, + "step": 712 + }, + { + "epoch": 0.5704, + "grad_norm": 0.4822700301622576, + "learning_rate": 8.220989664499878e-05, + "loss": 0.7917, + "step": 713 + }, + { + "epoch": 0.5712, + "grad_norm": 0.41642009981384603, + "learning_rate": 8.195488415592238e-05, + "loss": 0.6661, + "step": 714 + }, + { + "epoch": 0.572, + "grad_norm": 0.43828493263885393, + "learning_rate": 8.169999290908188e-05, + "loss": 0.7156, + "step": 715 + }, + { + "epoch": 0.5728, + "grad_norm": 0.4240648870455564, + "learning_rate": 8.144522461705067e-05, + "loss": 0.7666, + "step": 716 + }, + { + "epoch": 0.5736, + "grad_norm": 0.5280655669164235, + "learning_rate": 8.119058099157604e-05, + "loss": 0.8215, + "step": 717 + }, + { + "epoch": 0.5744, + "grad_norm": 0.4314632075111001, + "learning_rate": 8.093606374356759e-05, + "loss": 0.7157, + "step": 718 + }, + { + "epoch": 0.5752, + "grad_norm": 0.42766372542648995, + "learning_rate": 8.068167458308582e-05, + "loss": 0.7356, + "step": 719 + }, + { + "epoch": 0.576, + "grad_norm": 0.3536232789675204, + "learning_rate": 8.042741521933071e-05, + "loss": 0.6305, + "step": 720 + }, + { + "epoch": 0.5768, + "grad_norm": 0.396787968011526, + "learning_rate": 8.017328736063006e-05, + "loss": 0.688, + "step": 721 + }, + { + "epoch": 0.5776, + "grad_norm": 0.36622359112802105, + "learning_rate": 7.991929271442817e-05, + "loss": 0.6104, + "step": 722 + }, + { + "epoch": 0.5784, + "grad_norm": 0.4293433853098972, + "learning_rate": 7.966543298727425e-05, + "loss": 0.7291, + "step": 723 + }, + { + "epoch": 0.5792, + "grad_norm": 0.3980734993834797, + "learning_rate": 7.941170988481108e-05, + "loss": 0.7039, + "step": 724 + }, + { + "epoch": 0.58, + "grad_norm": 0.3904293866446969, + "learning_rate": 7.915812511176347e-05, + "loss": 0.6716, + "step": 725 + }, + { + "epoch": 0.5808, + "grad_norm": 0.36995115448433163, + "learning_rate": 7.89046803719267e-05, + "loss": 0.6903, + "step": 726 + }, + { + "epoch": 0.5816, + "grad_norm": 0.32560215936571923, + "learning_rate": 7.865137736815535e-05, + "loss": 0.6638, + "step": 727 + }, + { + "epoch": 0.5824, + "grad_norm": 0.4174288747054683, + "learning_rate": 7.839821780235168e-05, + "loss": 0.7212, + "step": 728 + }, + { + "epoch": 0.5832, + "grad_norm": 0.3730951899363253, + "learning_rate": 7.814520337545406e-05, + "loss": 0.6651, + "step": 729 + }, + { + "epoch": 0.584, + "grad_norm": 0.4063251235311768, + "learning_rate": 7.789233578742582e-05, + "loss": 0.7293, + "step": 730 + }, + { + "epoch": 0.5848, + "grad_norm": 0.4499268916836471, + "learning_rate": 7.763961673724379e-05, + "loss": 0.7426, + "step": 731 + }, + { + "epoch": 0.5856, + "grad_norm": 0.396633091460796, + "learning_rate": 7.738704792288655e-05, + "loss": 0.6957, + "step": 732 + }, + { + "epoch": 0.5864, + "grad_norm": 0.3578247361979313, + "learning_rate": 7.713463104132345e-05, + "loss": 0.6577, + "step": 733 + }, + { + "epoch": 0.5872, + "grad_norm": 0.38166873777131227, + "learning_rate": 7.688236778850306e-05, + "loss": 0.6496, + "step": 734 + }, + { + "epoch": 0.588, + "grad_norm": 0.4152385411084173, + "learning_rate": 7.663025985934158e-05, + "loss": 0.7373, + "step": 735 + }, + { + "epoch": 0.5888, + "grad_norm": 0.4578866385374338, + "learning_rate": 7.637830894771175e-05, + "loss": 0.6814, + "step": 736 + }, + { + "epoch": 0.5896, + "grad_norm": 0.3775144819294085, + "learning_rate": 7.61265167464313e-05, + "loss": 0.7132, + "step": 737 + }, + { + "epoch": 0.5904, + "grad_norm": 0.37444458875836906, + "learning_rate": 7.587488494725157e-05, + "loss": 0.7106, + "step": 738 + }, + { + "epoch": 0.5912, + "grad_norm": 0.40369298334943393, + "learning_rate": 7.562341524084623e-05, + "loss": 0.728, + "step": 739 + }, + { + "epoch": 0.592, + "grad_norm": 0.426758911971294, + "learning_rate": 7.537210931679987e-05, + "loss": 0.7277, + "step": 740 + }, + { + "epoch": 0.5928, + "grad_norm": 0.5001292797021929, + "learning_rate": 7.512096886359664e-05, + "loss": 0.6971, + "step": 741 + }, + { + "epoch": 0.5936, + "grad_norm": 0.3597006649006697, + "learning_rate": 7.48699955686089e-05, + "loss": 0.6538, + "step": 742 + }, + { + "epoch": 0.5944, + "grad_norm": 0.3568043116224071, + "learning_rate": 7.461919111808595e-05, + "loss": 0.7017, + "step": 743 + }, + { + "epoch": 0.5952, + "grad_norm": 0.3651169402407518, + "learning_rate": 7.43685571971426e-05, + "loss": 0.6328, + "step": 744 + }, + { + "epoch": 0.596, + "grad_norm": 0.3844907448236401, + "learning_rate": 7.411809548974792e-05, + "loss": 0.7112, + "step": 745 + }, + { + "epoch": 0.5968, + "grad_norm": 0.4043079489440896, + "learning_rate": 7.386780767871397e-05, + "loss": 0.6742, + "step": 746 + }, + { + "epoch": 0.5976, + "grad_norm": 0.4227127648656904, + "learning_rate": 7.361769544568425e-05, + "loss": 0.7626, + "step": 747 + }, + { + "epoch": 0.5984, + "grad_norm": 0.44251139527084404, + "learning_rate": 7.336776047112276e-05, + "loss": 0.7496, + "step": 748 + }, + { + "epoch": 0.5992, + "grad_norm": 0.39264336976041503, + "learning_rate": 7.311800443430251e-05, + "loss": 0.7107, + "step": 749 + }, + { + "epoch": 0.6, + "grad_norm": 0.3854740044037131, + "learning_rate": 7.286842901329412e-05, + "loss": 0.6725, + "step": 750 + }, + { + "epoch": 0.6008, + "grad_norm": 0.37218277208281797, + "learning_rate": 7.26190358849548e-05, + "loss": 0.6449, + "step": 751 + }, + { + "epoch": 0.6016, + "grad_norm": 0.3725759274566575, + "learning_rate": 7.236982672491698e-05, + "loss": 0.654, + "step": 752 + }, + { + "epoch": 0.6024, + "grad_norm": 0.41710753488610125, + "learning_rate": 7.212080320757695e-05, + "loss": 0.7009, + "step": 753 + }, + { + "epoch": 0.6032, + "grad_norm": 0.4616704403436593, + "learning_rate": 7.187196700608373e-05, + "loss": 0.7656, + "step": 754 + }, + { + "epoch": 0.604, + "grad_norm": 0.3793693671206012, + "learning_rate": 7.162331979232783e-05, + "loss": 0.6986, + "step": 755 + }, + { + "epoch": 0.6048, + "grad_norm": 0.4365789904267567, + "learning_rate": 7.137486323692995e-05, + "loss": 0.723, + "step": 756 + }, + { + "epoch": 0.6056, + "grad_norm": 0.33849276678841417, + "learning_rate": 7.112659900922976e-05, + "loss": 0.5828, + "step": 757 + }, + { + "epoch": 0.6064, + "grad_norm": 0.3770313352866923, + "learning_rate": 7.087852877727481e-05, + "loss": 0.6679, + "step": 758 + }, + { + "epoch": 0.6072, + "grad_norm": 0.35909341550316776, + "learning_rate": 7.06306542078091e-05, + "loss": 0.6794, + "step": 759 + }, + { + "epoch": 0.608, + "grad_norm": 0.4952730478303068, + "learning_rate": 7.038297696626206e-05, + "loss": 0.773, + "step": 760 + }, + { + "epoch": 0.6088, + "grad_norm": 0.3779200278363108, + "learning_rate": 7.013549871673736e-05, + "loss": 0.6774, + "step": 761 + }, + { + "epoch": 0.6096, + "grad_norm": 0.5152186295001627, + "learning_rate": 6.988822112200156e-05, + "loss": 0.7447, + "step": 762 + }, + { + "epoch": 0.6104, + "grad_norm": 0.4652336562509862, + "learning_rate": 6.964114584347316e-05, + "loss": 0.8401, + "step": 763 + }, + { + "epoch": 0.6112, + "grad_norm": 0.5523220933081036, + "learning_rate": 6.939427454121128e-05, + "loss": 0.6601, + "step": 764 + }, + { + "epoch": 0.612, + "grad_norm": 0.4303701867318267, + "learning_rate": 6.914760887390452e-05, + "loss": 0.7262, + "step": 765 + }, + { + "epoch": 0.6128, + "grad_norm": 0.37474398699674083, + "learning_rate": 6.890115049885994e-05, + "loss": 0.7258, + "step": 766 + }, + { + "epoch": 0.6136, + "grad_norm": 0.35065173582230624, + "learning_rate": 6.865490107199181e-05, + "loss": 0.6654, + "step": 767 + }, + { + "epoch": 0.6144, + "grad_norm": 0.36703219558742856, + "learning_rate": 6.84088622478104e-05, + "loss": 0.6379, + "step": 768 + }, + { + "epoch": 0.6152, + "grad_norm": 0.41869079450058233, + "learning_rate": 6.816303567941112e-05, + "loss": 0.7464, + "step": 769 + }, + { + "epoch": 0.616, + "grad_norm": 0.3979451147642651, + "learning_rate": 6.791742301846326e-05, + "loss": 0.7428, + "step": 770 + }, + { + "epoch": 0.6168, + "grad_norm": 0.3846429827012229, + "learning_rate": 6.767202591519875e-05, + "loss": 0.728, + "step": 771 + }, + { + "epoch": 0.6176, + "grad_norm": 0.41960066504509924, + "learning_rate": 6.742684601840141e-05, + "loss": 0.6702, + "step": 772 + }, + { + "epoch": 0.6184, + "grad_norm": 0.3770819093470327, + "learning_rate": 6.718188497539554e-05, + "loss": 0.6202, + "step": 773 + }, + { + "epoch": 0.6192, + "grad_norm": 0.4206725247444838, + "learning_rate": 6.693714443203507e-05, + "loss": 0.7032, + "step": 774 + }, + { + "epoch": 0.62, + "grad_norm": 0.35434322028668563, + "learning_rate": 6.669262603269246e-05, + "loss": 0.591, + "step": 775 + }, + { + "epoch": 0.6208, + "grad_norm": 0.40789924022628743, + "learning_rate": 6.644833142024751e-05, + "loss": 0.6677, + "step": 776 + }, + { + "epoch": 0.6216, + "grad_norm": 0.3381255774020731, + "learning_rate": 6.620426223607654e-05, + "loss": 0.6213, + "step": 777 + }, + { + "epoch": 0.6224, + "grad_norm": 0.37479881796793857, + "learning_rate": 6.59604201200412e-05, + "loss": 0.6631, + "step": 778 + }, + { + "epoch": 0.6232, + "grad_norm": 0.3758333174731441, + "learning_rate": 6.571680671047749e-05, + "loss": 0.6406, + "step": 779 + }, + { + "epoch": 0.624, + "grad_norm": 0.42763347900484716, + "learning_rate": 6.547342364418481e-05, + "loss": 0.7203, + "step": 780 + }, + { + "epoch": 0.6248, + "grad_norm": 0.358013225260845, + "learning_rate": 6.523027255641493e-05, + "loss": 0.6402, + "step": 781 + }, + { + "epoch": 0.6256, + "grad_norm": 0.44232850144520364, + "learning_rate": 6.498735508086093e-05, + "loss": 0.736, + "step": 782 + }, + { + "epoch": 0.6264, + "grad_norm": 0.3868589738684474, + "learning_rate": 6.474467284964634e-05, + "loss": 0.6979, + "step": 783 + }, + { + "epoch": 0.6272, + "grad_norm": 0.38519475468830017, + "learning_rate": 6.450222749331414e-05, + "loss": 0.6938, + "step": 784 + }, + { + "epoch": 0.628, + "grad_norm": 0.38018019360013416, + "learning_rate": 6.426002064081565e-05, + "loss": 0.6368, + "step": 785 + }, + { + "epoch": 0.6288, + "grad_norm": 0.5303594601394425, + "learning_rate": 6.40180539194999e-05, + "loss": 0.7593, + "step": 786 + }, + { + "epoch": 0.6296, + "grad_norm": 0.3758840665843039, + "learning_rate": 6.377632895510248e-05, + "loss": 0.6798, + "step": 787 + }, + { + "epoch": 0.6304, + "grad_norm": 0.5330733022028751, + "learning_rate": 6.35348473717345e-05, + "loss": 0.7707, + "step": 788 + }, + { + "epoch": 0.6312, + "grad_norm": 0.4053420748203502, + "learning_rate": 6.329361079187199e-05, + "loss": 0.6961, + "step": 789 + }, + { + "epoch": 0.632, + "grad_norm": 0.4000744617505393, + "learning_rate": 6.305262083634488e-05, + "loss": 0.7312, + "step": 790 + }, + { + "epoch": 0.6328, + "grad_norm": 0.37461660955043896, + "learning_rate": 6.281187912432587e-05, + "loss": 0.6765, + "step": 791 + }, + { + "epoch": 0.6336, + "grad_norm": 0.3681625533583791, + "learning_rate": 6.25713872733199e-05, + "loss": 0.6539, + "step": 792 + }, + { + "epoch": 0.6344, + "grad_norm": 0.3985134606316111, + "learning_rate": 6.233114689915316e-05, + "loss": 0.6728, + "step": 793 + }, + { + "epoch": 0.6352, + "grad_norm": 0.44143279098238797, + "learning_rate": 6.209115961596208e-05, + "loss": 0.743, + "step": 794 + }, + { + "epoch": 0.636, + "grad_norm": 0.7081456800521198, + "learning_rate": 6.18514270361827e-05, + "loss": 0.7892, + "step": 795 + }, + { + "epoch": 0.6368, + "grad_norm": 0.3933123238271958, + "learning_rate": 6.161195077053976e-05, + "loss": 0.6715, + "step": 796 + }, + { + "epoch": 0.6376, + "grad_norm": 0.3601641373087553, + "learning_rate": 6.13727324280358e-05, + "loss": 0.6267, + "step": 797 + }, + { + "epoch": 0.6384, + "grad_norm": 0.4689956926826061, + "learning_rate": 6.113377361594049e-05, + "loss": 0.7738, + "step": 798 + }, + { + "epoch": 0.6392, + "grad_norm": 0.4792144464704637, + "learning_rate": 6.08950759397797e-05, + "loss": 0.7722, + "step": 799 + }, + { + "epoch": 0.64, + "grad_norm": 0.3742389485992206, + "learning_rate": 6.065664100332478e-05, + "loss": 0.6807, + "step": 800 + }, + { + "epoch": 0.6408, + "grad_norm": 0.3301905166954284, + "learning_rate": 6.0418470408581774e-05, + "loss": 0.6268, + "step": 801 + }, + { + "epoch": 0.6416, + "grad_norm": 0.35988480884110635, + "learning_rate": 6.018056575578075e-05, + "loss": 0.6726, + "step": 802 + }, + { + "epoch": 0.6424, + "grad_norm": 0.3744294865993888, + "learning_rate": 5.9942928643364724e-05, + "loss": 0.6467, + "step": 803 + }, + { + "epoch": 0.6432, + "grad_norm": 0.40255348248906203, + "learning_rate": 5.970556066797941e-05, + "loss": 0.6907, + "step": 804 + }, + { + "epoch": 0.644, + "grad_norm": 0.43247292970458023, + "learning_rate": 5.946846342446214e-05, + "loss": 0.6995, + "step": 805 + }, + { + "epoch": 0.6448, + "grad_norm": 0.36178749321741194, + "learning_rate": 5.923163850583113e-05, + "loss": 0.6855, + "step": 806 + }, + { + "epoch": 0.6456, + "grad_norm": 0.4368786163665237, + "learning_rate": 5.899508750327501e-05, + "loss": 0.6982, + "step": 807 + }, + { + "epoch": 0.6464, + "grad_norm": 0.36341677587811755, + "learning_rate": 5.875881200614207e-05, + "loss": 0.6613, + "step": 808 + }, + { + "epoch": 0.6472, + "grad_norm": 0.43050183605813286, + "learning_rate": 5.8522813601929324e-05, + "loss": 0.7721, + "step": 809 + }, + { + "epoch": 0.648, + "grad_norm": 0.32973117042968936, + "learning_rate": 5.828709387627218e-05, + "loss": 0.5817, + "step": 810 + }, + { + "epoch": 0.6488, + "grad_norm": 0.4470699631633555, + "learning_rate": 5.80516544129337e-05, + "loss": 0.6889, + "step": 811 + }, + { + "epoch": 0.6496, + "grad_norm": 0.4608207968552782, + "learning_rate": 5.781649679379378e-05, + "loss": 0.8078, + "step": 812 + }, + { + "epoch": 0.6504, + "grad_norm": 0.3593112975488266, + "learning_rate": 5.758162259883867e-05, + "loss": 0.6923, + "step": 813 + }, + { + "epoch": 0.6512, + "grad_norm": 0.3841678464380535, + "learning_rate": 5.73470334061505e-05, + "loss": 0.6425, + "step": 814 + }, + { + "epoch": 0.652, + "grad_norm": 0.3828351995854355, + "learning_rate": 5.7112730791896207e-05, + "loss": 0.681, + "step": 815 + }, + { + "epoch": 0.6528, + "grad_norm": 0.4607693007659546, + "learning_rate": 5.687871633031754e-05, + "loss": 0.7063, + "step": 816 + }, + { + "epoch": 0.6536, + "grad_norm": 0.3750724520710695, + "learning_rate": 5.664499159372017e-05, + "loss": 0.63, + "step": 817 + }, + { + "epoch": 0.6544, + "grad_norm": 0.4031151784274128, + "learning_rate": 5.6411558152462894e-05, + "loss": 0.7019, + "step": 818 + }, + { + "epoch": 0.6552, + "grad_norm": 0.3820735547833116, + "learning_rate": 5.617841757494762e-05, + "loss": 0.6912, + "step": 819 + }, + { + "epoch": 0.656, + "grad_norm": 0.3794383491039805, + "learning_rate": 5.5945571427608526e-05, + "loss": 0.6505, + "step": 820 + }, + { + "epoch": 0.6568, + "grad_norm": 0.40145334915032993, + "learning_rate": 5.5713021274901335e-05, + "loss": 0.7471, + "step": 821 + }, + { + "epoch": 0.6576, + "grad_norm": 0.36627442458854886, + "learning_rate": 5.54807686792933e-05, + "loss": 0.6942, + "step": 822 + }, + { + "epoch": 0.6584, + "grad_norm": 0.3894765216107813, + "learning_rate": 5.524881520125229e-05, + "loss": 0.6787, + "step": 823 + }, + { + "epoch": 0.6592, + "grad_norm": 0.38355286480633677, + "learning_rate": 5.501716239923642e-05, + "loss": 0.7154, + "step": 824 + }, + { + "epoch": 0.66, + "grad_norm": 0.3589843005850102, + "learning_rate": 5.4785811829683764e-05, + "loss": 0.6107, + "step": 825 + }, + { + "epoch": 0.6608, + "grad_norm": 0.37208511941913724, + "learning_rate": 5.4554765047001613e-05, + "loss": 0.6659, + "step": 826 + }, + { + "epoch": 0.6616, + "grad_norm": 0.36916335904670466, + "learning_rate": 5.432402360355615e-05, + "loss": 0.6546, + "step": 827 + }, + { + "epoch": 0.6624, + "grad_norm": 0.4353339417797532, + "learning_rate": 5.4093589049662175e-05, + "loss": 0.7128, + "step": 828 + }, + { + "epoch": 0.6632, + "grad_norm": 0.4005890631619237, + "learning_rate": 5.386346293357242e-05, + "loss": 0.7092, + "step": 829 + }, + { + "epoch": 0.664, + "grad_norm": 0.3756446420199134, + "learning_rate": 5.363364680146725e-05, + "loss": 0.6778, + "step": 830 + }, + { + "epoch": 0.6648, + "grad_norm": 0.37576922952546127, + "learning_rate": 5.3404142197444506e-05, + "loss": 0.601, + "step": 831 + }, + { + "epoch": 0.6656, + "grad_norm": 0.42520146435293865, + "learning_rate": 5.31749506635086e-05, + "loss": 0.6656, + "step": 832 + }, + { + "epoch": 0.6664, + "grad_norm": 0.39262051542575105, + "learning_rate": 5.2946073739560706e-05, + "loss": 0.7484, + "step": 833 + }, + { + "epoch": 0.6672, + "grad_norm": 0.37552559270813046, + "learning_rate": 5.271751296338823e-05, + "loss": 0.6849, + "step": 834 + }, + { + "epoch": 0.668, + "grad_norm": 0.3695064574427169, + "learning_rate": 5.248926987065417e-05, + "loss": 0.6113, + "step": 835 + }, + { + "epoch": 0.6688, + "grad_norm": 0.4279487729932737, + "learning_rate": 5.226134599488728e-05, + "loss": 0.6826, + "step": 836 + }, + { + "epoch": 0.6696, + "grad_norm": 0.3938359588528079, + "learning_rate": 5.203374286747158e-05, + "loss": 0.6431, + "step": 837 + }, + { + "epoch": 0.6704, + "grad_norm": 0.3590555518218238, + "learning_rate": 5.180646201763577e-05, + "loss": 0.6433, + "step": 838 + }, + { + "epoch": 0.6712, + "grad_norm": 0.4540490980629825, + "learning_rate": 5.15795049724435e-05, + "loss": 0.7745, + "step": 839 + }, + { + "epoch": 0.672, + "grad_norm": 0.3771570585049773, + "learning_rate": 5.135287325678271e-05, + "loss": 0.6832, + "step": 840 + }, + { + "epoch": 0.6728, + "grad_norm": 0.4415921845867145, + "learning_rate": 5.112656839335543e-05, + "loss": 0.7557, + "step": 841 + }, + { + "epoch": 0.6736, + "grad_norm": 0.40741586224130755, + "learning_rate": 5.090059190266779e-05, + "loss": 0.7413, + "step": 842 + }, + { + "epoch": 0.6744, + "grad_norm": 0.4434471970193744, + "learning_rate": 5.0674945303019526e-05, + "loss": 0.7126, + "step": 843 + }, + { + "epoch": 0.6752, + "grad_norm": 0.3894162484448366, + "learning_rate": 5.0449630110493836e-05, + "loss": 0.627, + "step": 844 + }, + { + "epoch": 0.676, + "grad_norm": 0.3863866092532346, + "learning_rate": 5.022464783894744e-05, + "loss": 0.676, + "step": 845 + }, + { + "epoch": 0.6768, + "grad_norm": 0.3776175059677852, + "learning_rate": 5.000000000000002e-05, + "loss": 0.7, + "step": 846 + }, + { + "epoch": 0.6776, + "grad_norm": 0.38833751137731987, + "learning_rate": 4.977568810302432e-05, + "loss": 0.6657, + "step": 847 + }, + { + "epoch": 0.6784, + "grad_norm": 0.3643478291891471, + "learning_rate": 4.955171365513603e-05, + "loss": 0.6462, + "step": 848 + }, + { + "epoch": 0.6792, + "grad_norm": 0.3718531297835898, + "learning_rate": 4.9328078161183464e-05, + "loss": 0.6207, + "step": 849 + }, + { + "epoch": 0.68, + "grad_norm": 0.37090002483660917, + "learning_rate": 4.9104783123737566e-05, + "loss": 0.6015, + "step": 850 + }, + { + "epoch": 0.6808, + "grad_norm": 0.3818802887497591, + "learning_rate": 4.88818300430819e-05, + "loss": 0.7237, + "step": 851 + }, + { + "epoch": 0.6816, + "grad_norm": 0.4216500531432546, + "learning_rate": 4.865922041720239e-05, + "loss": 0.6384, + "step": 852 + }, + { + "epoch": 0.6824, + "grad_norm": 0.43490214335818284, + "learning_rate": 4.843695574177737e-05, + "loss": 0.7342, + "step": 853 + }, + { + "epoch": 0.6832, + "grad_norm": 0.3874365160955942, + "learning_rate": 4.821503751016746e-05, + "loss": 0.6828, + "step": 854 + }, + { + "epoch": 0.684, + "grad_norm": 0.4101791944198361, + "learning_rate": 4.7993467213405706e-05, + "loss": 0.6987, + "step": 855 + }, + { + "epoch": 0.6848, + "grad_norm": 0.4019633596180036, + "learning_rate": 4.777224634018732e-05, + "loss": 0.6716, + "step": 856 + }, + { + "epoch": 0.6856, + "grad_norm": 0.3881943766742739, + "learning_rate": 4.755137637685979e-05, + "loss": 0.682, + "step": 857 + }, + { + "epoch": 0.6864, + "grad_norm": 0.37086431272395237, + "learning_rate": 4.733085880741301e-05, + "loss": 0.6382, + "step": 858 + }, + { + "epoch": 0.6872, + "grad_norm": 0.3229897376133167, + "learning_rate": 4.7110695113469085e-05, + "loss": 0.5634, + "step": 859 + }, + { + "epoch": 0.688, + "grad_norm": 0.37195166028459176, + "learning_rate": 4.689088677427249e-05, + "loss": 0.6126, + "step": 860 + }, + { + "epoch": 0.6888, + "grad_norm": 0.35736672026268756, + "learning_rate": 4.6671435266680216e-05, + "loss": 0.6428, + "step": 861 + }, + { + "epoch": 0.6896, + "grad_norm": 0.39181342064986285, + "learning_rate": 4.645234206515171e-05, + "loss": 0.6178, + "step": 862 + }, + { + "epoch": 0.6904, + "grad_norm": 0.4622321102444061, + "learning_rate": 4.623360864173893e-05, + "loss": 0.7532, + "step": 863 + }, + { + "epoch": 0.6912, + "grad_norm": 0.3508420398934294, + "learning_rate": 4.6015236466076747e-05, + "loss": 0.6165, + "step": 864 + }, + { + "epoch": 0.692, + "grad_norm": 0.43584792026732694, + "learning_rate": 4.579722700537268e-05, + "loss": 0.6993, + "step": 865 + }, + { + "epoch": 0.6928, + "grad_norm": 0.3865443765265882, + "learning_rate": 4.5579581724397255e-05, + "loss": 0.6686, + "step": 866 + }, + { + "epoch": 0.6936, + "grad_norm": 0.44089726242809385, + "learning_rate": 4.5362302085474254e-05, + "loss": 0.7401, + "step": 867 + }, + { + "epoch": 0.6944, + "grad_norm": 0.3636091967598959, + "learning_rate": 4.514538954847064e-05, + "loss": 0.6735, + "step": 868 + }, + { + "epoch": 0.6952, + "grad_norm": 0.45260151896253265, + "learning_rate": 4.492884557078688e-05, + "loss": 0.8166, + "step": 869 + }, + { + "epoch": 0.696, + "grad_norm": 0.3716433090985538, + "learning_rate": 4.471267160734731e-05, + "loss": 0.6785, + "step": 870 + }, + { + "epoch": 0.6968, + "grad_norm": 0.3490016723110263, + "learning_rate": 4.449686911058992e-05, + "loss": 0.6338, + "step": 871 + }, + { + "epoch": 0.6976, + "grad_norm": 0.35261459639093856, + "learning_rate": 4.428143953045717e-05, + "loss": 0.6912, + "step": 872 + }, + { + "epoch": 0.6984, + "grad_norm": 0.430517813454168, + "learning_rate": 4.406638431438576e-05, + "loss": 0.7577, + "step": 873 + }, + { + "epoch": 0.6992, + "grad_norm": 0.39080082714846687, + "learning_rate": 4.385170490729712e-05, + "loss": 0.7045, + "step": 874 + }, + { + "epoch": 0.7, + "grad_norm": 0.3845683325648363, + "learning_rate": 4.36374027515878e-05, + "loss": 0.6631, + "step": 875 + }, + { + "epoch": 0.7008, + "grad_norm": 0.4011660221395093, + "learning_rate": 4.342347928711953e-05, + "loss": 0.6698, + "step": 876 + }, + { + "epoch": 0.7016, + "grad_norm": 0.3803430175217997, + "learning_rate": 4.320993595120969e-05, + "loss": 0.7293, + "step": 877 + }, + { + "epoch": 0.7024, + "grad_norm": 0.4345641406916766, + "learning_rate": 4.2996774178621736e-05, + "loss": 0.7021, + "step": 878 + }, + { + "epoch": 0.7032, + "grad_norm": 0.37788514553335906, + "learning_rate": 4.278399540155536e-05, + "loss": 0.6335, + "step": 879 + }, + { + "epoch": 0.704, + "grad_norm": 0.3754341531521682, + "learning_rate": 4.257160104963696e-05, + "loss": 0.613, + "step": 880 + }, + { + "epoch": 0.7048, + "grad_norm": 0.3643334891575628, + "learning_rate": 4.2359592549910145e-05, + "loss": 0.6607, + "step": 881 + }, + { + "epoch": 0.7056, + "grad_norm": 0.3886671386423688, + "learning_rate": 4.2147971326825966e-05, + "loss": 0.6659, + "step": 882 + }, + { + "epoch": 0.7064, + "grad_norm": 0.3968260849187156, + "learning_rate": 4.193673880223339e-05, + "loss": 0.654, + "step": 883 + }, + { + "epoch": 0.7072, + "grad_norm": 0.4574918781588961, + "learning_rate": 4.172589639536991e-05, + "loss": 0.7591, + "step": 884 + }, + { + "epoch": 0.708, + "grad_norm": 0.39583832461725227, + "learning_rate": 4.1515445522851784e-05, + "loss": 0.6505, + "step": 885 + }, + { + "epoch": 0.7088, + "grad_norm": 0.3871856632764471, + "learning_rate": 4.130538759866457e-05, + "loss": 0.7419, + "step": 886 + }, + { + "epoch": 0.7096, + "grad_norm": 0.39802865534197646, + "learning_rate": 4.109572403415386e-05, + "loss": 0.6144, + "step": 887 + }, + { + "epoch": 0.7104, + "grad_norm": 0.4036792124871241, + "learning_rate": 4.088645623801534e-05, + "loss": 0.6651, + "step": 888 + }, + { + "epoch": 0.7112, + "grad_norm": 0.3629294435919223, + "learning_rate": 4.0677585616285774e-05, + "loss": 0.6717, + "step": 889 + }, + { + "epoch": 0.712, + "grad_norm": 0.4223531830282377, + "learning_rate": 4.046911357233343e-05, + "loss": 0.702, + "step": 890 + }, + { + "epoch": 0.7128, + "grad_norm": 0.3546390164764367, + "learning_rate": 4.026104150684835e-05, + "loss": 0.6737, + "step": 891 + }, + { + "epoch": 0.7136, + "grad_norm": 0.3606188495178436, + "learning_rate": 4.00533708178334e-05, + "loss": 0.6783, + "step": 892 + }, + { + "epoch": 0.7144, + "grad_norm": 0.467315941854401, + "learning_rate": 3.984610290059467e-05, + "loss": 0.7204, + "step": 893 + }, + { + "epoch": 0.7152, + "grad_norm": 0.3846970151375505, + "learning_rate": 3.963923914773187e-05, + "loss": 0.5743, + "step": 894 + }, + { + "epoch": 0.716, + "grad_norm": 0.37861455834306523, + "learning_rate": 3.943278094912946e-05, + "loss": 0.6426, + "step": 895 + }, + { + "epoch": 0.7168, + "grad_norm": 0.3596346935109103, + "learning_rate": 3.922672969194686e-05, + "loss": 0.6619, + "step": 896 + }, + { + "epoch": 0.7176, + "grad_norm": 0.36957034226288926, + "learning_rate": 3.902108676060937e-05, + "loss": 0.7005, + "step": 897 + }, + { + "epoch": 0.7184, + "grad_norm": 0.3847442596608294, + "learning_rate": 3.8815853536798904e-05, + "loss": 0.6349, + "step": 898 + }, + { + "epoch": 0.7192, + "grad_norm": 0.37152537084018145, + "learning_rate": 3.861103139944449e-05, + "loss": 0.6265, + "step": 899 + }, + { + "epoch": 0.72, + "grad_norm": 0.4575634958571184, + "learning_rate": 3.840662172471315e-05, + "loss": 0.7508, + "step": 900 + }, + { + "epoch": 0.7208, + "grad_norm": 0.38577413678044076, + "learning_rate": 3.820262588600074e-05, + "loss": 0.6927, + "step": 901 + }, + { + "epoch": 0.7216, + "grad_norm": 0.3302322356224181, + "learning_rate": 3.79990452539225e-05, + "loss": 0.6325, + "step": 902 + }, + { + "epoch": 0.7224, + "grad_norm": 0.39399513814722714, + "learning_rate": 3.7795881196303995e-05, + "loss": 0.7318, + "step": 903 + }, + { + "epoch": 0.7232, + "grad_norm": 0.4375238247354526, + "learning_rate": 3.759313507817196e-05, + "loss": 0.7364, + "step": 904 + }, + { + "epoch": 0.724, + "grad_norm": 0.4012366402819657, + "learning_rate": 3.739080826174498e-05, + "loss": 0.6626, + "step": 905 + }, + { + "epoch": 0.7248, + "grad_norm": 0.4153527247575827, + "learning_rate": 3.7188902106424416e-05, + "loss": 0.6455, + "step": 906 + }, + { + "epoch": 0.7256, + "grad_norm": 0.45562157771641276, + "learning_rate": 3.6987417968785366e-05, + "loss": 0.6857, + "step": 907 + }, + { + "epoch": 0.7264, + "grad_norm": 0.37143963398553415, + "learning_rate": 3.678635720256737e-05, + "loss": 0.6559, + "step": 908 + }, + { + "epoch": 0.7272, + "grad_norm": 0.35400310240819777, + "learning_rate": 3.658572115866541e-05, + "loss": 0.6326, + "step": 909 + }, + { + "epoch": 0.728, + "grad_norm": 0.4196101020601408, + "learning_rate": 3.638551118512089e-05, + "loss": 0.6649, + "step": 910 + }, + { + "epoch": 0.7288, + "grad_norm": 0.4271247927155786, + "learning_rate": 3.618572862711247e-05, + "loss": 0.7248, + "step": 911 + }, + { + "epoch": 0.7296, + "grad_norm": 0.39936532676017306, + "learning_rate": 3.5986374826947066e-05, + "loss": 0.6407, + "step": 912 + }, + { + "epoch": 0.7304, + "grad_norm": 0.40373812391774183, + "learning_rate": 3.578745112405083e-05, + "loss": 0.6659, + "step": 913 + }, + { + "epoch": 0.7312, + "grad_norm": 0.4227238458265484, + "learning_rate": 3.558895885496023e-05, + "loss": 0.7585, + "step": 914 + }, + { + "epoch": 0.732, + "grad_norm": 0.36629473480527946, + "learning_rate": 3.539089935331294e-05, + "loss": 0.6324, + "step": 915 + }, + { + "epoch": 0.7328, + "grad_norm": 0.4150563120212974, + "learning_rate": 3.519327394983888e-05, + "loss": 0.735, + "step": 916 + }, + { + "epoch": 0.7336, + "grad_norm": 0.38188336972420933, + "learning_rate": 3.4996083972351515e-05, + "loss": 0.6507, + "step": 917 + }, + { + "epoch": 0.7344, + "grad_norm": 0.42389403459769254, + "learning_rate": 3.479933074573858e-05, + "loss": 0.7406, + "step": 918 + }, + { + "epoch": 0.7352, + "grad_norm": 0.36331268006366385, + "learning_rate": 3.4603015591953395e-05, + "loss": 0.6199, + "step": 919 + }, + { + "epoch": 0.736, + "grad_norm": 0.37595893886168213, + "learning_rate": 3.440713983000601e-05, + "loss": 0.615, + "step": 920 + }, + { + "epoch": 0.7368, + "grad_norm": 0.44337545720228805, + "learning_rate": 3.421170477595419e-05, + "loss": 0.6587, + "step": 921 + }, + { + "epoch": 0.7376, + "grad_norm": 0.4048425474160189, + "learning_rate": 3.401671174289469e-05, + "loss": 0.7039, + "step": 922 + }, + { + "epoch": 0.7384, + "grad_norm": 0.399459405268422, + "learning_rate": 3.3822162040954354e-05, + "loss": 0.7177, + "step": 923 + }, + { + "epoch": 0.7392, + "grad_norm": 0.438929791874048, + "learning_rate": 3.362805697728145e-05, + "loss": 0.6438, + "step": 924 + }, + { + "epoch": 0.74, + "grad_norm": 0.32299495901286296, + "learning_rate": 3.34343978560367e-05, + "loss": 0.5864, + "step": 925 + }, + { + "epoch": 0.7408, + "grad_norm": 0.41176721861809107, + "learning_rate": 3.324118597838464e-05, + "loss": 0.7035, + "step": 926 + }, + { + "epoch": 0.7416, + "grad_norm": 0.41891098855382514, + "learning_rate": 3.3048422642484886e-05, + "loss": 0.6946, + "step": 927 + }, + { + "epoch": 0.7424, + "grad_norm": 0.34807605861789076, + "learning_rate": 3.285610914348332e-05, + "loss": 0.5932, + "step": 928 + }, + { + "epoch": 0.7432, + "grad_norm": 0.39692972686466704, + "learning_rate": 3.266424677350346e-05, + "loss": 0.7343, + "step": 929 + }, + { + "epoch": 0.744, + "grad_norm": 0.44396979022684196, + "learning_rate": 3.2472836821637744e-05, + "loss": 0.7643, + "step": 930 + }, + { + "epoch": 0.7448, + "grad_norm": 0.3504173730748221, + "learning_rate": 3.228188057393895e-05, + "loss": 0.6379, + "step": 931 + }, + { + "epoch": 0.7456, + "grad_norm": 0.3709172492987434, + "learning_rate": 3.209137931341143e-05, + "loss": 0.624, + "step": 932 + }, + { + "epoch": 0.7464, + "grad_norm": 0.3551495218636917, + "learning_rate": 3.190133432000252e-05, + "loss": 0.6517, + "step": 933 + }, + { + "epoch": 0.7472, + "grad_norm": 0.3610548339661083, + "learning_rate": 3.1711746870594086e-05, + "loss": 0.686, + "step": 934 + }, + { + "epoch": 0.748, + "grad_norm": 0.35221470234389773, + "learning_rate": 3.1522618238993725e-05, + "loss": 0.5881, + "step": 935 + }, + { + "epoch": 0.7488, + "grad_norm": 0.47129598984211485, + "learning_rate": 3.1333949695926324e-05, + "loss": 0.767, + "step": 936 + }, + { + "epoch": 0.7496, + "grad_norm": 0.4281984592959207, + "learning_rate": 3.114574250902558e-05, + "loss": 0.6818, + "step": 937 + }, + { + "epoch": 0.7504, + "grad_norm": 0.4136989144183822, + "learning_rate": 3.0957997942825336e-05, + "loss": 0.7458, + "step": 938 + }, + { + "epoch": 0.7512, + "grad_norm": 0.403398217159144, + "learning_rate": 3.077071725875116e-05, + "loss": 0.6889, + "step": 939 + }, + { + "epoch": 0.752, + "grad_norm": 0.3820759005938559, + "learning_rate": 3.058390171511196e-05, + "loss": 0.6272, + "step": 940 + }, + { + "epoch": 0.7528, + "grad_norm": 0.36206663440057796, + "learning_rate": 3.0397552567091337e-05, + "loss": 0.6344, + "step": 941 + }, + { + "epoch": 0.7536, + "grad_norm": 0.382037624579827, + "learning_rate": 3.021167106673928e-05, + "loss": 0.7231, + "step": 942 + }, + { + "epoch": 0.7544, + "grad_norm": 0.4009265534647435, + "learning_rate": 3.0026258462963787e-05, + "loss": 0.6901, + "step": 943 + }, + { + "epoch": 0.7552, + "grad_norm": 0.3668408443563656, + "learning_rate": 2.9841316001522347e-05, + "loss": 0.6501, + "step": 944 + }, + { + "epoch": 0.756, + "grad_norm": 0.46918634921474295, + "learning_rate": 2.9656844925013637e-05, + "loss": 0.7383, + "step": 945 + }, + { + "epoch": 0.7568, + "grad_norm": 0.4011047795393638, + "learning_rate": 2.9472846472869298e-05, + "loss": 0.7044, + "step": 946 + }, + { + "epoch": 0.7576, + "grad_norm": 0.3978658551973164, + "learning_rate": 2.9289321881345254e-05, + "loss": 0.6485, + "step": 947 + }, + { + "epoch": 0.7584, + "grad_norm": 0.4311999483799842, + "learning_rate": 2.9106272383513835e-05, + "loss": 0.7244, + "step": 948 + }, + { + "epoch": 0.7592, + "grad_norm": 0.4405955409446885, + "learning_rate": 2.8923699209255284e-05, + "loss": 0.8154, + "step": 949 + }, + { + "epoch": 0.76, + "grad_norm": 0.35148168463275686, + "learning_rate": 2.874160358524931e-05, + "loss": 0.6329, + "step": 950 + }, + { + "epoch": 0.7608, + "grad_norm": 0.4391708027774675, + "learning_rate": 2.8559986734967282e-05, + "loss": 0.6859, + "step": 951 + }, + { + "epoch": 0.7616, + "grad_norm": 0.401233479375333, + "learning_rate": 2.8378849878663628e-05, + "loss": 0.6705, + "step": 952 + }, + { + "epoch": 0.7624, + "grad_norm": 0.38501925261003556, + "learning_rate": 2.819819423336775e-05, + "loss": 0.6618, + "step": 953 + }, + { + "epoch": 0.7632, + "grad_norm": 0.3824208693962986, + "learning_rate": 2.8018021012875994e-05, + "loss": 0.6804, + "step": 954 + }, + { + "epoch": 0.764, + "grad_norm": 0.36708580610823355, + "learning_rate": 2.7838331427743282e-05, + "loss": 0.6485, + "step": 955 + }, + { + "epoch": 0.7648, + "grad_norm": 0.3908615234947648, + "learning_rate": 2.7659126685275027e-05, + "loss": 0.7154, + "step": 956 + }, + { + "epoch": 0.7656, + "grad_norm": 0.4450760968444355, + "learning_rate": 2.7480407989519198e-05, + "loss": 0.7355, + "step": 957 + }, + { + "epoch": 0.7664, + "grad_norm": 0.4244818859774434, + "learning_rate": 2.7302176541257986e-05, + "loss": 0.6767, + "step": 958 + }, + { + "epoch": 0.7672, + "grad_norm": 0.38961087423927715, + "learning_rate": 2.712443353799984e-05, + "loss": 0.6501, + "step": 959 + }, + { + "epoch": 0.768, + "grad_norm": 0.40128368555614596, + "learning_rate": 2.6947180173971508e-05, + "loss": 0.6782, + "step": 960 + }, + { + "epoch": 0.7688, + "grad_norm": 0.3962240832210282, + "learning_rate": 2.677041764010988e-05, + "loss": 0.6899, + "step": 961 + }, + { + "epoch": 0.7696, + "grad_norm": 0.3437743452274928, + "learning_rate": 2.659414712405398e-05, + "loss": 0.6179, + "step": 962 + }, + { + "epoch": 0.7704, + "grad_norm": 0.44070949897096634, + "learning_rate": 2.6418369810137188e-05, + "loss": 0.7175, + "step": 963 + }, + { + "epoch": 0.7712, + "grad_norm": 0.3986166249950004, + "learning_rate": 2.6243086879379e-05, + "loss": 0.6976, + "step": 964 + }, + { + "epoch": 0.772, + "grad_norm": 0.37466672276486496, + "learning_rate": 2.6068299509477266e-05, + "loss": 0.6763, + "step": 965 + }, + { + "epoch": 0.7728, + "grad_norm": 0.46823044700217803, + "learning_rate": 2.5894008874800325e-05, + "loss": 0.6756, + "step": 966 + }, + { + "epoch": 0.7736, + "grad_norm": 0.4077060986771547, + "learning_rate": 2.5720216146378917e-05, + "loss": 0.7142, + "step": 967 + }, + { + "epoch": 0.7744, + "grad_norm": 0.4096255017834381, + "learning_rate": 2.5546922491898495e-05, + "loss": 0.7017, + "step": 968 + }, + { + "epoch": 0.7752, + "grad_norm": 0.39291648839729004, + "learning_rate": 2.5374129075691265e-05, + "loss": 0.6814, + "step": 969 + }, + { + "epoch": 0.776, + "grad_norm": 0.41934210837225766, + "learning_rate": 2.5201837058728505e-05, + "loss": 0.7685, + "step": 970 + }, + { + "epoch": 0.7768, + "grad_norm": 0.42463297275213213, + "learning_rate": 2.503004759861258e-05, + "loss": 0.6305, + "step": 971 + }, + { + "epoch": 0.7776, + "grad_norm": 0.39826267719605246, + "learning_rate": 2.485876184956928e-05, + "loss": 0.6776, + "step": 972 + }, + { + "epoch": 0.7784, + "grad_norm": 0.4201012844491468, + "learning_rate": 2.4687980962440072e-05, + "loss": 0.6871, + "step": 973 + }, + { + "epoch": 0.7792, + "grad_norm": 0.42423210152452784, + "learning_rate": 2.451770608467432e-05, + "loss": 0.6828, + "step": 974 + }, + { + "epoch": 0.78, + "grad_norm": 0.3716904673400798, + "learning_rate": 2.4347938360321566e-05, + "loss": 0.65, + "step": 975 + }, + { + "epoch": 0.7808, + "grad_norm": 0.44715619118674044, + "learning_rate": 2.417867893002387e-05, + "loss": 0.7075, + "step": 976 + }, + { + "epoch": 0.7816, + "grad_norm": 0.44812690231399344, + "learning_rate": 2.400992893100822e-05, + "loss": 0.6788, + "step": 977 + }, + { + "epoch": 0.7824, + "grad_norm": 0.4148743954758556, + "learning_rate": 2.3841689497078746e-05, + "loss": 0.6515, + "step": 978 + }, + { + "epoch": 0.7832, + "grad_norm": 0.3815731146703102, + "learning_rate": 2.3673961758609152e-05, + "loss": 0.7249, + "step": 979 + }, + { + "epoch": 0.784, + "grad_norm": 0.4428462685419307, + "learning_rate": 2.3506746842535242e-05, + "loss": 0.7174, + "step": 980 + }, + { + "epoch": 0.7848, + "grad_norm": 0.3433519946274387, + "learning_rate": 2.334004587234717e-05, + "loss": 0.6333, + "step": 981 + }, + { + "epoch": 0.7856, + "grad_norm": 0.44125548907997186, + "learning_rate": 2.3173859968081944e-05, + "loss": 0.7504, + "step": 982 + }, + { + "epoch": 0.7864, + "grad_norm": 0.36862297747718953, + "learning_rate": 2.300819024631603e-05, + "loss": 0.6269, + "step": 983 + }, + { + "epoch": 0.7872, + "grad_norm": 0.37319845184821043, + "learning_rate": 2.2843037820157675e-05, + "loss": 0.6257, + "step": 984 + }, + { + "epoch": 0.788, + "grad_norm": 0.4427215035234154, + "learning_rate": 2.26784037992395e-05, + "loss": 0.753, + "step": 985 + }, + { + "epoch": 0.7888, + "grad_norm": 0.37144786727059803, + "learning_rate": 2.251428928971102e-05, + "loss": 0.6446, + "step": 986 + }, + { + "epoch": 0.7896, + "grad_norm": 0.38097204617512126, + "learning_rate": 2.2350695394231345e-05, + "loss": 0.6654, + "step": 987 + }, + { + "epoch": 0.7904, + "grad_norm": 0.4225298229068004, + "learning_rate": 2.2187623211961562e-05, + "loss": 0.6565, + "step": 988 + }, + { + "epoch": 0.7912, + "grad_norm": 0.3691817251407238, + "learning_rate": 2.2025073838557454e-05, + "loss": 0.6551, + "step": 989 + }, + { + "epoch": 0.792, + "grad_norm": 0.37136941337706175, + "learning_rate": 2.1863048366162208e-05, + "loss": 0.5964, + "step": 990 + }, + { + "epoch": 0.7928, + "grad_norm": 0.37450244628290513, + "learning_rate": 2.1701547883398922e-05, + "loss": 0.6865, + "step": 991 + }, + { + "epoch": 0.7936, + "grad_norm": 0.4130833114697751, + "learning_rate": 2.1540573475363402e-05, + "loss": 0.6977, + "step": 992 + }, + { + "epoch": 0.7944, + "grad_norm": 0.44555395651971014, + "learning_rate": 2.138012622361689e-05, + "loss": 0.6377, + "step": 993 + }, + { + "epoch": 0.7952, + "grad_norm": 0.3968898425692245, + "learning_rate": 2.1220207206178688e-05, + "loss": 0.6254, + "step": 994 + }, + { + "epoch": 0.796, + "grad_norm": 0.3712169948573636, + "learning_rate": 2.106081749751897e-05, + "loss": 0.6343, + "step": 995 + }, + { + "epoch": 0.7968, + "grad_norm": 0.3865098872535555, + "learning_rate": 2.0901958168551638e-05, + "loss": 0.6337, + "step": 996 + }, + { + "epoch": 0.7976, + "grad_norm": 0.43081481479411904, + "learning_rate": 2.0743630286627002e-05, + "loss": 0.669, + "step": 997 + }, + { + "epoch": 0.7984, + "grad_norm": 0.33824124335767314, + "learning_rate": 2.058583491552465e-05, + "loss": 0.6073, + "step": 998 + }, + { + "epoch": 0.7992, + "grad_norm": 0.3664290646462942, + "learning_rate": 2.0428573115446392e-05, + "loss": 0.6355, + "step": 999 + }, + { + "epoch": 0.8, + "grad_norm": 0.4116124972227826, + "learning_rate": 2.027184594300898e-05, + "loss": 0.6394, + "step": 1000 + }, + { + "epoch": 0.8008, + "grad_norm": 0.4214488547569396, + "learning_rate": 2.011565445123711e-05, + "loss": 0.6894, + "step": 1001 + }, + { + "epoch": 0.8016, + "grad_norm": 0.35041197043145234, + "learning_rate": 1.995999968955641e-05, + "loss": 0.6128, + "step": 1002 + }, + { + "epoch": 0.8024, + "grad_norm": 0.3928012124823455, + "learning_rate": 1.980488270378612e-05, + "loss": 0.6737, + "step": 1003 + }, + { + "epoch": 0.8032, + "grad_norm": 0.41725282345429426, + "learning_rate": 1.9650304536132426e-05, + "loss": 0.7083, + "step": 1004 + }, + { + "epoch": 0.804, + "grad_norm": 0.3615156197633144, + "learning_rate": 1.9496266225181248e-05, + "loss": 0.639, + "step": 1005 + }, + { + "epoch": 0.8048, + "grad_norm": 0.41426420962795724, + "learning_rate": 1.9342768805891178e-05, + "loss": 0.693, + "step": 1006 + }, + { + "epoch": 0.8056, + "grad_norm": 0.3683560317023612, + "learning_rate": 1.918981330958678e-05, + "loss": 0.5815, + "step": 1007 + }, + { + "epoch": 0.8064, + "grad_norm": 0.41568039625658215, + "learning_rate": 1.903740076395151e-05, + "loss": 0.6066, + "step": 1008 + }, + { + "epoch": 0.8072, + "grad_norm": 0.4006963112284083, + "learning_rate": 1.8885532193020704e-05, + "loss": 0.6937, + "step": 1009 + }, + { + "epoch": 0.808, + "grad_norm": 0.41018249261627265, + "learning_rate": 1.8734208617174988e-05, + "loss": 0.717, + "step": 1010 + }, + { + "epoch": 0.8088, + "grad_norm": 0.4995323353787287, + "learning_rate": 1.8583431053133127e-05, + "loss": 0.7592, + "step": 1011 + }, + { + "epoch": 0.8096, + "grad_norm": 0.43650575521391843, + "learning_rate": 1.8433200513945337e-05, + "loss": 0.6515, + "step": 1012 + }, + { + "epoch": 0.8104, + "grad_norm": 0.40163729148136174, + "learning_rate": 1.8283518008986567e-05, + "loss": 0.6575, + "step": 1013 + }, + { + "epoch": 0.8112, + "grad_norm": 0.40080269999384505, + "learning_rate": 1.8134384543949478e-05, + "loss": 0.6775, + "step": 1014 + }, + { + "epoch": 0.812, + "grad_norm": 0.3940333512523945, + "learning_rate": 1.7985801120837865e-05, + "loss": 0.6355, + "step": 1015 + }, + { + "epoch": 0.8128, + "grad_norm": 0.42995740013190453, + "learning_rate": 1.783776873795994e-05, + "loss": 0.7174, + "step": 1016 + }, + { + "epoch": 0.8136, + "grad_norm": 0.4008589303162475, + "learning_rate": 1.7690288389921493e-05, + "loss": 0.6797, + "step": 1017 + }, + { + "epoch": 0.8144, + "grad_norm": 0.43877207243888605, + "learning_rate": 1.754336106761927e-05, + "loss": 0.742, + "step": 1018 + }, + { + "epoch": 0.8152, + "grad_norm": 0.4143732694255588, + "learning_rate": 1.739698775823442e-05, + "loss": 0.6872, + "step": 1019 + }, + { + "epoch": 0.816, + "grad_norm": 0.362122022640218, + "learning_rate": 1.7251169445225657e-05, + "loss": 0.6237, + "step": 1020 + }, + { + "epoch": 0.8168, + "grad_norm": 0.401006384524641, + "learning_rate": 1.7105907108322816e-05, + "loss": 0.6287, + "step": 1021 + }, + { + "epoch": 0.8176, + "grad_norm": 0.41150942890868347, + "learning_rate": 1.696120172352025e-05, + "loss": 0.6576, + "step": 1022 + }, + { + "epoch": 0.8184, + "grad_norm": 0.40850542982370197, + "learning_rate": 1.6817054263070174e-05, + "loss": 0.6039, + "step": 1023 + }, + { + "epoch": 0.8192, + "grad_norm": 0.42762187690430686, + "learning_rate": 1.6673465695476232e-05, + "loss": 0.6763, + "step": 1024 + }, + { + "epoch": 0.82, + "grad_norm": 0.38675675962303874, + "learning_rate": 1.6530436985486996e-05, + "loss": 0.6441, + "step": 1025 + }, + { + "epoch": 0.8208, + "grad_norm": 0.4138391754414263, + "learning_rate": 1.6387969094089316e-05, + "loss": 0.7003, + "step": 1026 + }, + { + "epoch": 0.8216, + "grad_norm": 0.41573860167128435, + "learning_rate": 1.6246062978502164e-05, + "loss": 0.6968, + "step": 1027 + }, + { + "epoch": 0.8224, + "grad_norm": 0.42718124101470617, + "learning_rate": 1.6104719592169902e-05, + "loss": 0.6816, + "step": 1028 + }, + { + "epoch": 0.8232, + "grad_norm": 0.3716399930215204, + "learning_rate": 1.5963939884756042e-05, + "loss": 0.6321, + "step": 1029 + }, + { + "epoch": 0.824, + "grad_norm": 0.469897313356421, + "learning_rate": 1.5823724802136865e-05, + "loss": 0.7003, + "step": 1030 + }, + { + "epoch": 0.8248, + "grad_norm": 0.4041837674527376, + "learning_rate": 1.5684075286394985e-05, + "loss": 0.7056, + "step": 1031 + }, + { + "epoch": 0.8256, + "grad_norm": 0.3657042641738609, + "learning_rate": 1.5544992275813053e-05, + "loss": 0.5993, + "step": 1032 + }, + { + "epoch": 0.8264, + "grad_norm": 0.3993576544320269, + "learning_rate": 1.5406476704867524e-05, + "loss": 0.6623, + "step": 1033 + }, + { + "epoch": 0.8272, + "grad_norm": 0.35331311245911873, + "learning_rate": 1.526852950422226e-05, + "loss": 0.6126, + "step": 1034 + }, + { + "epoch": 0.828, + "grad_norm": 0.3573893675271595, + "learning_rate": 1.5131151600722337e-05, + "loss": 0.5988, + "step": 1035 + }, + { + "epoch": 0.8288, + "grad_norm": 0.3709209680359646, + "learning_rate": 1.4994343917387854e-05, + "loss": 0.6527, + "step": 1036 + }, + { + "epoch": 0.8296, + "grad_norm": 0.41541374035461237, + "learning_rate": 1.485810737340767e-05, + "loss": 0.6688, + "step": 1037 + }, + { + "epoch": 0.8304, + "grad_norm": 0.3764019643545416, + "learning_rate": 1.4722442884133214e-05, + "loss": 0.6141, + "step": 1038 + }, + { + "epoch": 0.8312, + "grad_norm": 0.4369468983893906, + "learning_rate": 1.4587351361072454e-05, + "loss": 0.6751, + "step": 1039 + }, + { + "epoch": 0.832, + "grad_norm": 0.38250970066817025, + "learning_rate": 1.4452833711883628e-05, + "loss": 0.6329, + "step": 1040 + }, + { + "epoch": 0.8328, + "grad_norm": 0.42672350476593035, + "learning_rate": 1.4318890840369182e-05, + "loss": 0.6886, + "step": 1041 + }, + { + "epoch": 0.8336, + "grad_norm": 0.37009545183960346, + "learning_rate": 1.4185523646469822e-05, + "loss": 0.6003, + "step": 1042 + }, + { + "epoch": 0.8344, + "grad_norm": 0.37229548936007656, + "learning_rate": 1.4052733026258281e-05, + "loss": 0.6348, + "step": 1043 + }, + { + "epoch": 0.8352, + "grad_norm": 0.4549849078289334, + "learning_rate": 1.3920519871933424e-05, + "loss": 0.7643, + "step": 1044 + }, + { + "epoch": 0.836, + "grad_norm": 0.37206131026446826, + "learning_rate": 1.3788885071814172e-05, + "loss": 0.6873, + "step": 1045 + }, + { + "epoch": 0.8368, + "grad_norm": 0.38122123278652426, + "learning_rate": 1.3657829510333654e-05, + "loss": 0.6317, + "step": 1046 + }, + { + "epoch": 0.8376, + "grad_norm": 0.42787305571612866, + "learning_rate": 1.3527354068033139e-05, + "loss": 0.7039, + "step": 1047 + }, + { + "epoch": 0.8384, + "grad_norm": 0.42531717697000854, + "learning_rate": 1.339745962155613e-05, + "loss": 0.714, + "step": 1048 + }, + { + "epoch": 0.8392, + "grad_norm": 0.42190425635710993, + "learning_rate": 1.326814704364262e-05, + "loss": 0.6994, + "step": 1049 + }, + { + "epoch": 0.84, + "grad_norm": 0.3275707945531675, + "learning_rate": 1.3139417203123027e-05, + "loss": 0.5978, + "step": 1050 + }, + { + "epoch": 0.8408, + "grad_norm": 0.4123729826288982, + "learning_rate": 1.3011270964912459e-05, + "loss": 0.6265, + "step": 1051 + }, + { + "epoch": 0.8416, + "grad_norm": 0.38435160511624555, + "learning_rate": 1.2883709190004955e-05, + "loss": 0.6507, + "step": 1052 + }, + { + "epoch": 0.8424, + "grad_norm": 0.37267373640836915, + "learning_rate": 1.275673273546758e-05, + "loss": 0.6474, + "step": 1053 + }, + { + "epoch": 0.8432, + "grad_norm": 0.38302673161912193, + "learning_rate": 1.263034245443473e-05, + "loss": 0.6811, + "step": 1054 + }, + { + "epoch": 0.844, + "grad_norm": 0.4091845289678434, + "learning_rate": 1.2504539196102439e-05, + "loss": 0.7164, + "step": 1055 + }, + { + "epoch": 0.8448, + "grad_norm": 0.40155070927607034, + "learning_rate": 1.2379323805722576e-05, + "loss": 0.6872, + "step": 1056 + }, + { + "epoch": 0.8456, + "grad_norm": 0.4194861227916186, + "learning_rate": 1.2254697124597237e-05, + "loss": 0.6948, + "step": 1057 + }, + { + "epoch": 0.8464, + "grad_norm": 0.38925930416460935, + "learning_rate": 1.2130659990073146e-05, + "loss": 0.6353, + "step": 1058 + }, + { + "epoch": 0.8472, + "grad_norm": 0.3343519926674715, + "learning_rate": 1.2007213235535786e-05, + "loss": 0.5882, + "step": 1059 + }, + { + "epoch": 0.848, + "grad_norm": 0.37468882377998025, + "learning_rate": 1.1884357690404158e-05, + "loss": 0.6788, + "step": 1060 + }, + { + "epoch": 0.8488, + "grad_norm": 0.415944774962567, + "learning_rate": 1.176209418012495e-05, + "loss": 0.6272, + "step": 1061 + }, + { + "epoch": 0.8496, + "grad_norm": 0.4249131063398309, + "learning_rate": 1.1640423526166988e-05, + "loss": 0.7145, + "step": 1062 + }, + { + "epoch": 0.8504, + "grad_norm": 0.41711218711666387, + "learning_rate": 1.1519346546015907e-05, + "loss": 0.6209, + "step": 1063 + }, + { + "epoch": 0.8512, + "grad_norm": 0.4128837498379851, + "learning_rate": 1.1398864053168534e-05, + "loss": 0.6976, + "step": 1064 + }, + { + "epoch": 0.852, + "grad_norm": 0.35925305553677817, + "learning_rate": 1.1278976857127311e-05, + "loss": 0.665, + "step": 1065 + }, + { + "epoch": 0.8528, + "grad_norm": 0.35258149175832587, + "learning_rate": 1.1159685763395111e-05, + "loss": 0.6029, + "step": 1066 + }, + { + "epoch": 0.8536, + "grad_norm": 0.45046396204959727, + "learning_rate": 1.1040991573469629e-05, + "loss": 0.693, + "step": 1067 + }, + { + "epoch": 0.8544, + "grad_norm": 0.3669770913635565, + "learning_rate": 1.0922895084838037e-05, + "loss": 0.6507, + "step": 1068 + }, + { + "epoch": 0.8552, + "grad_norm": 0.35453880895924644, + "learning_rate": 1.0805397090971737e-05, + "loss": 0.6464, + "step": 1069 + }, + { + "epoch": 0.856, + "grad_norm": 0.3461360282849764, + "learning_rate": 1.0688498381320855e-05, + "loss": 0.6271, + "step": 1070 + }, + { + "epoch": 0.8568, + "grad_norm": 0.3385469110829669, + "learning_rate": 1.057219974130903e-05, + "loss": 0.6163, + "step": 1071 + }, + { + "epoch": 0.8576, + "grad_norm": 0.37768309956169943, + "learning_rate": 1.045650195232819e-05, + "loss": 0.6733, + "step": 1072 + }, + { + "epoch": 0.8584, + "grad_norm": 0.43192174935318534, + "learning_rate": 1.0341405791733183e-05, + "loss": 0.7395, + "step": 1073 + }, + { + "epoch": 0.8592, + "grad_norm": 0.3896447776958473, + "learning_rate": 1.0226912032836611e-05, + "loss": 0.6399, + "step": 1074 + }, + { + "epoch": 0.86, + "grad_norm": 0.38248077399288255, + "learning_rate": 1.0113021444903726e-05, + "loss": 0.6836, + "step": 1075 + }, + { + "epoch": 0.8608, + "grad_norm": 0.3709794028821994, + "learning_rate": 9.999734793146998e-06, + "loss": 0.6068, + "step": 1076 + }, + { + "epoch": 0.8616, + "grad_norm": 0.3904703563728366, + "learning_rate": 9.887052838721322e-06, + "loss": 0.6232, + "step": 1077 + }, + { + "epoch": 0.8624, + "grad_norm": 0.36606600947344253, + "learning_rate": 9.774976338718677e-06, + "loss": 0.6544, + "step": 1078 + }, + { + "epoch": 0.8632, + "grad_norm": 0.43381197547874695, + "learning_rate": 9.663506046162985e-06, + "loss": 0.7176, + "step": 1079 + }, + { + "epoch": 0.864, + "grad_norm": 0.39471345240167216, + "learning_rate": 9.552642710005299e-06, + "loss": 0.6605, + "step": 1080 + }, + { + "epoch": 0.8648, + "grad_norm": 0.4315478752535828, + "learning_rate": 9.44238707511862e-06, + "loss": 0.6423, + "step": 1081 + }, + { + "epoch": 0.8656, + "grad_norm": 0.40013872266622436, + "learning_rate": 9.332739882292752e-06, + "loss": 0.6943, + "step": 1082 + }, + { + "epoch": 0.8664, + "grad_norm": 0.387519395093711, + "learning_rate": 9.22370186822965e-06, + "loss": 0.6568, + "step": 1083 + }, + { + "epoch": 0.8672, + "grad_norm": 0.4340248915869381, + "learning_rate": 9.115273765538202e-06, + "loss": 0.659, + "step": 1084 + }, + { + "epoch": 0.868, + "grad_norm": 0.40221129232474506, + "learning_rate": 9.0074563027294e-06, + "loss": 0.7269, + "step": 1085 + }, + { + "epoch": 0.8688, + "grad_norm": 0.40148875037197057, + "learning_rate": 8.900250204211514e-06, + "loss": 0.6207, + "step": 1086 + }, + { + "epoch": 0.8696, + "grad_norm": 0.3509492771514673, + "learning_rate": 8.79365619028507e-06, + "loss": 0.6336, + "step": 1087 + }, + { + "epoch": 0.8704, + "grad_norm": 0.397875287389267, + "learning_rate": 8.687674977138116e-06, + "loss": 0.6849, + "step": 1088 + }, + { + "epoch": 0.8712, + "grad_norm": 0.46386665430153035, + "learning_rate": 8.582307276841462e-06, + "loss": 0.7382, + "step": 1089 + }, + { + "epoch": 0.872, + "grad_norm": 0.4056797818327132, + "learning_rate": 8.47755379734373e-06, + "loss": 0.6658, + "step": 1090 + }, + { + "epoch": 0.8728, + "grad_norm": 0.38602393116264594, + "learning_rate": 8.37341524246672e-06, + "loss": 0.6476, + "step": 1091 + }, + { + "epoch": 0.8736, + "grad_norm": 0.4828539366225798, + "learning_rate": 8.269892311900696e-06, + "loss": 0.7534, + "step": 1092 + }, + { + "epoch": 0.8744, + "grad_norm": 0.37718901184575493, + "learning_rate": 8.166985701199582e-06, + "loss": 0.6525, + "step": 1093 + }, + { + "epoch": 0.8752, + "grad_norm": 0.3902132590453072, + "learning_rate": 8.064696101776358e-06, + "loss": 0.6359, + "step": 1094 + }, + { + "epoch": 0.876, + "grad_norm": 0.4060602940939635, + "learning_rate": 7.963024200898462e-06, + "loss": 0.6597, + "step": 1095 + }, + { + "epoch": 0.8768, + "grad_norm": 0.39823302264436056, + "learning_rate": 7.861970681683051e-06, + "loss": 0.6407, + "step": 1096 + }, + { + "epoch": 0.8776, + "grad_norm": 0.3495785862432801, + "learning_rate": 7.761536223092458e-06, + "loss": 0.5887, + "step": 1097 + }, + { + "epoch": 0.8784, + "grad_norm": 0.37154597449786636, + "learning_rate": 7.661721499929753e-06, + "loss": 0.6655, + "step": 1098 + }, + { + "epoch": 0.8792, + "grad_norm": 0.4258319717537172, + "learning_rate": 7.562527182833978e-06, + "loss": 0.6908, + "step": 1099 + }, + { + "epoch": 0.88, + "grad_norm": 0.4059964137720068, + "learning_rate": 7.463953938275858e-06, + "loss": 0.6829, + "step": 1100 + }, + { + "epoch": 0.8808, + "grad_norm": 0.4651857139614339, + "learning_rate": 7.366002428553153e-06, + "loss": 0.7124, + "step": 1101 + }, + { + "epoch": 0.8816, + "grad_norm": 0.4002873835289902, + "learning_rate": 7.2686733117863784e-06, + "loss": 0.7299, + "step": 1102 + }, + { + "epoch": 0.8824, + "grad_norm": 0.4182887098863189, + "learning_rate": 7.171967241914224e-06, + "loss": 0.7299, + "step": 1103 + }, + { + "epoch": 0.8832, + "grad_norm": 0.3958402531946444, + "learning_rate": 7.07588486868922e-06, + "loss": 0.6843, + "step": 1104 + }, + { + "epoch": 0.884, + "grad_norm": 0.42650265679061683, + "learning_rate": 6.980426837673437e-06, + "loss": 0.6642, + "step": 1105 + }, + { + "epoch": 0.8848, + "grad_norm": 0.36979311141228494, + "learning_rate": 6.8855937902340576e-06, + "loss": 0.724, + "step": 1106 + }, + { + "epoch": 0.8856, + "grad_norm": 0.45611193851844184, + "learning_rate": 6.791386363539065e-06, + "loss": 0.6907, + "step": 1107 + }, + { + "epoch": 0.8864, + "grad_norm": 0.33588568532605667, + "learning_rate": 6.6978051905530855e-06, + "loss": 0.6131, + "step": 1108 + }, + { + "epoch": 0.8872, + "grad_norm": 0.4061854018710927, + "learning_rate": 6.604850900032955e-06, + "loss": 0.7133, + "step": 1109 + }, + { + "epoch": 0.888, + "grad_norm": 0.38484916040022893, + "learning_rate": 6.512524116523633e-06, + "loss": 0.6908, + "step": 1110 + }, + { + "epoch": 0.8888, + "grad_norm": 0.40279673459652093, + "learning_rate": 6.420825460353974e-06, + "loss": 0.6769, + "step": 1111 + }, + { + "epoch": 0.8896, + "grad_norm": 0.35459493389580776, + "learning_rate": 6.329755547632499e-06, + "loss": 0.6024, + "step": 1112 + }, + { + "epoch": 0.8904, + "grad_norm": 0.39348496439416963, + "learning_rate": 6.239314990243339e-06, + "loss": 0.6886, + "step": 1113 + }, + { + "epoch": 0.8912, + "grad_norm": 0.36160745975341585, + "learning_rate": 6.149504395842087e-06, + "loss": 0.5982, + "step": 1114 + }, + { + "epoch": 0.892, + "grad_norm": 0.4170137122342651, + "learning_rate": 6.0603243678516995e-06, + "loss": 0.6217, + "step": 1115 + }, + { + "epoch": 0.8928, + "grad_norm": 0.4247995479288835, + "learning_rate": 5.971775505458444e-06, + "loss": 0.672, + "step": 1116 + }, + { + "epoch": 0.8936, + "grad_norm": 0.40112248826611274, + "learning_rate": 5.883858403607967e-06, + "loss": 0.6525, + "step": 1117 + }, + { + "epoch": 0.8944, + "grad_norm": 0.3830011434263009, + "learning_rate": 5.7965736530010916e-06, + "loss": 0.6717, + "step": 1118 + }, + { + "epoch": 0.8952, + "grad_norm": 0.3767053001757346, + "learning_rate": 5.7099218400900716e-06, + "loss": 0.6285, + "step": 1119 + }, + { + "epoch": 0.896, + "grad_norm": 0.4262070140089433, + "learning_rate": 5.623903547074549e-06, + "loss": 0.7382, + "step": 1120 + }, + { + "epoch": 0.8968, + "grad_norm": 0.42653574026738084, + "learning_rate": 5.538519351897575e-06, + "loss": 0.5835, + "step": 1121 + }, + { + "epoch": 0.8976, + "grad_norm": 0.3786988857351713, + "learning_rate": 5.453769828241872e-06, + "loss": 0.5964, + "step": 1122 + }, + { + "epoch": 0.8984, + "grad_norm": 0.4136283084118521, + "learning_rate": 5.369655545525909e-06, + "loss": 0.6574, + "step": 1123 + }, + { + "epoch": 0.8992, + "grad_norm": 0.3824126838189303, + "learning_rate": 5.286177068899989e-06, + "loss": 0.6647, + "step": 1124 + }, + { + "epoch": 0.9, + "grad_norm": 0.41220862608650916, + "learning_rate": 5.2033349592426335e-06, + "loss": 0.6328, + "step": 1125 + }, + { + "epoch": 0.9008, + "grad_norm": 0.4120713215748317, + "learning_rate": 5.121129773156663e-06, + "loss": 0.6746, + "step": 1126 + }, + { + "epoch": 0.9016, + "grad_norm": 0.3958022854447971, + "learning_rate": 5.039562062965508e-06, + "loss": 0.6618, + "step": 1127 + }, + { + "epoch": 0.9024, + "grad_norm": 0.40274399574312025, + "learning_rate": 4.95863237670956e-06, + "loss": 0.6905, + "step": 1128 + }, + { + "epoch": 0.9032, + "grad_norm": 0.39693457693884204, + "learning_rate": 4.87834125814235e-06, + "loss": 0.6835, + "step": 1129 + }, + { + "epoch": 0.904, + "grad_norm": 0.41969762965770036, + "learning_rate": 4.798689246727006e-06, + "loss": 0.6685, + "step": 1130 + }, + { + "epoch": 0.9048, + "grad_norm": 0.3463381295579955, + "learning_rate": 4.719676877632639e-06, + "loss": 0.6296, + "step": 1131 + }, + { + "epoch": 0.9056, + "grad_norm": 0.37401139243286946, + "learning_rate": 4.641304681730641e-06, + "loss": 0.6244, + "step": 1132 + }, + { + "epoch": 0.9064, + "grad_norm": 0.4246369519189044, + "learning_rate": 4.563573185591219e-06, + "loss": 0.7189, + "step": 1133 + }, + { + "epoch": 0.9072, + "grad_norm": 0.36601039318896783, + "learning_rate": 4.486482911479839e-06, + "loss": 0.6796, + "step": 1134 + }, + { + "epoch": 0.908, + "grad_norm": 0.3974605104680374, + "learning_rate": 4.4100343773536225e-06, + "loss": 0.619, + "step": 1135 + }, + { + "epoch": 0.9088, + "grad_norm": 0.4110936231174561, + "learning_rate": 4.3342280968580285e-06, + "loss": 0.7068, + "step": 1136 + }, + { + "epoch": 0.9096, + "grad_norm": 0.4177468932497064, + "learning_rate": 4.259064579323302e-06, + "loss": 0.6845, + "step": 1137 + }, + { + "epoch": 0.9104, + "grad_norm": 0.4264028114861248, + "learning_rate": 4.184544329761009e-06, + "loss": 0.706, + "step": 1138 + }, + { + "epoch": 0.9112, + "grad_norm": 0.40196424023510746, + "learning_rate": 4.1106678488607495e-06, + "loss": 0.684, + "step": 1139 + }, + { + "epoch": 0.912, + "grad_norm": 0.38155367513039906, + "learning_rate": 4.037435632986786e-06, + "loss": 0.6821, + "step": 1140 + }, + { + "epoch": 0.9128, + "grad_norm": 0.43322231632362984, + "learning_rate": 3.964848174174541e-06, + "loss": 0.6933, + "step": 1141 + }, + { + "epoch": 0.9136, + "grad_norm": 0.38575462841035746, + "learning_rate": 3.892905960127546e-06, + "loss": 0.6331, + "step": 1142 + }, + { + "epoch": 0.9144, + "grad_norm": 0.36988807284998965, + "learning_rate": 3.821609474213983e-06, + "loss": 0.6754, + "step": 1143 + }, + { + "epoch": 0.9152, + "grad_norm": 0.36894533488174114, + "learning_rate": 3.750959195463466e-06, + "loss": 0.6117, + "step": 1144 + }, + { + "epoch": 0.916, + "grad_norm": 0.3824389769632664, + "learning_rate": 3.6809555985639068e-06, + "loss": 0.5828, + "step": 1145 + }, + { + "epoch": 0.9168, + "grad_norm": 0.4100882503431007, + "learning_rate": 3.611599153858214e-06, + "loss": 0.6732, + "step": 1146 + }, + { + "epoch": 0.9176, + "grad_norm": 0.3939656464313838, + "learning_rate": 3.5428903273411863e-06, + "loss": 0.6479, + "step": 1147 + }, + { + "epoch": 0.9184, + "grad_norm": 0.3610380946052676, + "learning_rate": 3.4748295806564356e-06, + "loss": 0.6151, + "step": 1148 + }, + { + "epoch": 0.9192, + "grad_norm": 0.37176805505393795, + "learning_rate": 3.40741737109318e-06, + "loss": 0.6099, + "step": 1149 + }, + { + "epoch": 0.92, + "grad_norm": 0.3726635307220839, + "learning_rate": 3.3406541515832003e-06, + "loss": 0.6585, + "step": 1150 + }, + { + "epoch": 0.9208, + "grad_norm": 0.36404188224864553, + "learning_rate": 3.2745403706978872e-06, + "loss": 0.6664, + "step": 1151 + }, + { + "epoch": 0.9216, + "grad_norm": 0.3785339085319431, + "learning_rate": 3.209076472645112e-06, + "loss": 0.6647, + "step": 1152 + }, + { + "epoch": 0.9224, + "grad_norm": 0.4185980915310844, + "learning_rate": 3.1442628972662704e-06, + "loss": 0.6689, + "step": 1153 + }, + { + "epoch": 0.9232, + "grad_norm": 0.4099574635267387, + "learning_rate": 3.0801000800333877e-06, + "loss": 0.6856, + "step": 1154 + }, + { + "epoch": 0.924, + "grad_norm": 0.3818024637485815, + "learning_rate": 3.0165884520461316e-06, + "loss": 0.6691, + "step": 1155 + }, + { + "epoch": 0.9248, + "grad_norm": 0.36279268301084633, + "learning_rate": 2.9537284400289355e-06, + "loss": 0.642, + "step": 1156 + }, + { + "epoch": 0.9256, + "grad_norm": 0.38331837755282544, + "learning_rate": 2.8915204663281013e-06, + "loss": 0.6509, + "step": 1157 + }, + { + "epoch": 0.9264, + "grad_norm": 0.369111445716187, + "learning_rate": 2.8299649489090475e-06, + "loss": 0.6361, + "step": 1158 + }, + { + "epoch": 0.9272, + "grad_norm": 0.4115678486699466, + "learning_rate": 2.7690623013533976e-06, + "loss": 0.6855, + "step": 1159 + }, + { + "epoch": 0.928, + "grad_norm": 0.39808971107404223, + "learning_rate": 2.708812932856253e-06, + "loss": 0.7295, + "step": 1160 + }, + { + "epoch": 0.9288, + "grad_norm": 0.46666506769343125, + "learning_rate": 2.649217248223468e-06, + "loss": 0.7466, + "step": 1161 + }, + { + "epoch": 0.9296, + "grad_norm": 0.42147508706907266, + "learning_rate": 2.590275647868867e-06, + "loss": 0.6955, + "step": 1162 + }, + { + "epoch": 0.9304, + "grad_norm": 0.41894688199289515, + "learning_rate": 2.5319885278115906e-06, + "loss": 0.7147, + "step": 1163 + }, + { + "epoch": 0.9312, + "grad_norm": 0.45675887190903264, + "learning_rate": 2.4743562796734622e-06, + "loss": 0.7232, + "step": 1164 + }, + { + "epoch": 0.932, + "grad_norm": 0.4522821555018652, + "learning_rate": 2.4173792906762804e-06, + "loss": 0.7097, + "step": 1165 + }, + { + "epoch": 0.9328, + "grad_norm": 0.37444659689206355, + "learning_rate": 2.3610579436393e-06, + "loss": 0.6347, + "step": 1166 + }, + { + "epoch": 0.9336, + "grad_norm": 0.42113978996222756, + "learning_rate": 2.3053926169765984e-06, + "loss": 0.7053, + "step": 1167 + }, + { + "epoch": 0.9344, + "grad_norm": 0.39523331936067674, + "learning_rate": 2.250383684694579e-06, + "loss": 0.6669, + "step": 1168 + }, + { + "epoch": 0.9352, + "grad_norm": 0.4137522101946141, + "learning_rate": 2.1960315163894075e-06, + "loss": 0.6901, + "step": 1169 + }, + { + "epoch": 0.936, + "grad_norm": 0.4325293674351783, + "learning_rate": 2.1423364772445887e-06, + "loss": 0.6585, + "step": 1170 + }, + { + "epoch": 0.9368, + "grad_norm": 0.42502219933304314, + "learning_rate": 2.0892989280284823e-06, + "loss": 0.6946, + "step": 1171 + }, + { + "epoch": 0.9376, + "grad_norm": 0.4150688302051163, + "learning_rate": 2.036919225091827e-06, + "loss": 0.6896, + "step": 1172 + }, + { + "epoch": 0.9384, + "grad_norm": 0.3872164353059273, + "learning_rate": 1.9851977203654835e-06, + "loss": 0.6084, + "step": 1173 + }, + { + "epoch": 0.9392, + "grad_norm": 0.3972240093463755, + "learning_rate": 1.9341347613579087e-06, + "loss": 0.7117, + "step": 1174 + }, + { + "epoch": 0.94, + "grad_norm": 0.41689310545481006, + "learning_rate": 1.8837306911529184e-06, + "loss": 0.7383, + "step": 1175 + }, + { + "epoch": 0.9408, + "grad_norm": 0.3442155424450381, + "learning_rate": 1.8339858484073935e-06, + "loss": 0.6012, + "step": 1176 + }, + { + "epoch": 0.9416, + "grad_norm": 0.38713774050685595, + "learning_rate": 1.7849005673489127e-06, + "loss": 0.6523, + "step": 1177 + }, + { + "epoch": 0.9424, + "grad_norm": 0.3944181865000051, + "learning_rate": 1.7364751777736332e-06, + "loss": 0.6382, + "step": 1178 + }, + { + "epoch": 0.9432, + "grad_norm": 0.38923644150853126, + "learning_rate": 1.6887100050439587e-06, + "loss": 0.7153, + "step": 1179 + }, + { + "epoch": 0.944, + "grad_norm": 0.3851152231100486, + "learning_rate": 1.6416053700863964e-06, + "loss": 0.6433, + "step": 1180 + }, + { + "epoch": 0.9448, + "grad_norm": 0.4070747814345844, + "learning_rate": 1.595161589389449e-06, + "loss": 0.6767, + "step": 1181 + }, + { + "epoch": 0.9456, + "grad_norm": 0.3907538104319003, + "learning_rate": 1.5493789750014031e-06, + "loss": 0.6534, + "step": 1182 + }, + { + "epoch": 0.9464, + "grad_norm": 0.37924046239724357, + "learning_rate": 1.5042578345283108e-06, + "loss": 0.6908, + "step": 1183 + }, + { + "epoch": 0.9472, + "grad_norm": 0.3913428678490897, + "learning_rate": 1.459798471131868e-06, + "loss": 0.662, + "step": 1184 + }, + { + "epoch": 0.948, + "grad_norm": 0.39101169097306065, + "learning_rate": 1.4160011835273934e-06, + "loss": 0.7022, + "step": 1185 + }, + { + "epoch": 0.9488, + "grad_norm": 0.39544336411685543, + "learning_rate": 1.3728662659818204e-06, + "loss": 0.7007, + "step": 1186 + }, + { + "epoch": 0.9496, + "grad_norm": 0.4367205135860217, + "learning_rate": 1.3303940083117527e-06, + "loss": 0.7405, + "step": 1187 + }, + { + "epoch": 0.9504, + "grad_norm": 0.4145368019549925, + "learning_rate": 1.2885846958814673e-06, + "loss": 0.7002, + "step": 1188 + }, + { + "epoch": 0.9512, + "grad_norm": 0.35362366608330437, + "learning_rate": 1.2474386096010039e-06, + "loss": 0.6433, + "step": 1189 + }, + { + "epoch": 0.952, + "grad_norm": 0.40739311742148976, + "learning_rate": 1.2069560259243328e-06, + "loss": 0.6905, + "step": 1190 + }, + { + "epoch": 0.9528, + "grad_norm": 0.3450490486393698, + "learning_rate": 1.1671372168474138e-06, + "loss": 0.6343, + "step": 1191 + }, + { + "epoch": 0.9536, + "grad_norm": 0.36672693593050265, + "learning_rate": 1.1279824499064396e-06, + "loss": 0.6555, + "step": 1192 + }, + { + "epoch": 0.9544, + "grad_norm": 0.40443821669536556, + "learning_rate": 1.089491988176017e-06, + "loss": 0.7021, + "step": 1193 + }, + { + "epoch": 0.9552, + "grad_norm": 0.36152910649068787, + "learning_rate": 1.0516660902673448e-06, + "loss": 0.6078, + "step": 1194 + }, + { + "epoch": 0.956, + "grad_norm": 0.38822081969756744, + "learning_rate": 1.014505010326583e-06, + "loss": 0.5962, + "step": 1195 + }, + { + "epoch": 0.9568, + "grad_norm": 0.4021822336397349, + "learning_rate": 9.780089980330642e-07, + "loss": 0.6385, + "step": 1196 + }, + { + "epoch": 0.9576, + "grad_norm": 0.36888296373018903, + "learning_rate": 9.421782985976068e-07, + "loss": 0.6739, + "step": 1197 + }, + { + "epoch": 0.9584, + "grad_norm": 0.36602497625700325, + "learning_rate": 9.070131527609604e-07, + "loss": 0.6636, + "step": 1198 + }, + { + "epoch": 0.9592, + "grad_norm": 0.37834634024893005, + "learning_rate": 8.725137967920738e-07, + "loss": 0.6394, + "step": 1199 + }, + { + "epoch": 0.96, + "grad_norm": 0.41056503998473765, + "learning_rate": 8.386804624865851e-07, + "loss": 0.7028, + "step": 1200 + }, + { + "epoch": 0.9608, + "grad_norm": 0.41934071567059505, + "learning_rate": 8.055133771652345e-07, + "loss": 0.6132, + "step": 1201 + }, + { + "epoch": 0.9616, + "grad_norm": 0.3895531059623432, + "learning_rate": 7.730127636723539e-07, + "loss": 0.6956, + "step": 1202 + }, + { + "epoch": 0.9624, + "grad_norm": 0.4123458423850561, + "learning_rate": 7.411788403743237e-07, + "loss": 0.658, + "step": 1203 + }, + { + "epoch": 0.9632, + "grad_norm": 0.36695705362492903, + "learning_rate": 7.100118211581852e-07, + "loss": 0.6275, + "step": 1204 + }, + { + "epoch": 0.964, + "grad_norm": 0.37582300821639975, + "learning_rate": 6.7951191543012e-07, + "loss": 0.5752, + "step": 1205 + }, + { + "epoch": 0.9648, + "grad_norm": 0.36945052731923683, + "learning_rate": 6.496793281141056e-07, + "loss": 0.7127, + "step": 1206 + }, + { + "epoch": 0.9656, + "grad_norm": 0.41868711436006467, + "learning_rate": 6.205142596505176e-07, + "loss": 0.6849, + "step": 1207 + }, + { + "epoch": 0.9664, + "grad_norm": 0.43543255942755765, + "learning_rate": 5.920169059947411e-07, + "loss": 0.6454, + "step": 1208 + }, + { + "epoch": 0.9672, + "grad_norm": 0.37590235442187986, + "learning_rate": 5.64187458615939e-07, + "loss": 0.6789, + "step": 1209 + }, + { + "epoch": 0.968, + "grad_norm": 0.41731184653345194, + "learning_rate": 5.370261044956971e-07, + "loss": 0.6704, + "step": 1210 + }, + { + "epoch": 0.9688, + "grad_norm": 0.4143146083508496, + "learning_rate": 5.105330261267916e-07, + "loss": 0.6373, + "step": 1211 + }, + { + "epoch": 0.9696, + "grad_norm": 0.3851611980362892, + "learning_rate": 4.847084015119574e-07, + "loss": 0.6019, + "step": 1212 + }, + { + "epoch": 0.9704, + "grad_norm": 0.40780593086081135, + "learning_rate": 4.5955240416271084e-07, + "loss": 0.6518, + "step": 1213 + }, + { + "epoch": 0.9712, + "grad_norm": 0.4056988218108442, + "learning_rate": 4.3506520309813947e-07, + "loss": 0.7135, + "step": 1214 + }, + { + "epoch": 0.972, + "grad_norm": 0.3784078966028107, + "learning_rate": 4.112469628438365e-07, + "loss": 0.6387, + "step": 1215 + }, + { + "epoch": 0.9728, + "grad_norm": 0.3637434465211071, + "learning_rate": 3.8809784343072366e-07, + "loss": 0.6167, + "step": 1216 + }, + { + "epoch": 0.9736, + "grad_norm": 0.3616300978607632, + "learning_rate": 3.6561800039403016e-07, + "loss": 0.6263, + "step": 1217 + }, + { + "epoch": 0.9744, + "grad_norm": 0.4129713945666034, + "learning_rate": 3.4380758477219333e-07, + "loss": 0.6191, + "step": 1218 + }, + { + "epoch": 0.9752, + "grad_norm": 0.35762528465376253, + "learning_rate": 3.2266674310589273e-07, + "loss": 0.6434, + "step": 1219 + }, + { + "epoch": 0.976, + "grad_norm": 0.36828474797230004, + "learning_rate": 3.0219561743707326e-07, + "loss": 0.6546, + "step": 1220 + }, + { + "epoch": 0.9768, + "grad_norm": 0.3957523047687968, + "learning_rate": 2.8239434530792365e-07, + "loss": 0.6515, + "step": 1221 + }, + { + "epoch": 0.9776, + "grad_norm": 0.39174572878750363, + "learning_rate": 2.6326305976001055e-07, + "loss": 0.6431, + "step": 1222 + }, + { + "epoch": 0.9784, + "grad_norm": 0.3779064892067319, + "learning_rate": 2.448018893333681e-07, + "loss": 0.647, + "step": 1223 + }, + { + "epoch": 0.9792, + "grad_norm": 0.39824669482756286, + "learning_rate": 2.2701095806565432e-07, + "loss": 0.6405, + "step": 1224 + }, + { + "epoch": 0.98, + "grad_norm": 0.37074336916815626, + "learning_rate": 2.098903854912515e-07, + "loss": 0.633, + "step": 1225 + }, + { + "epoch": 0.9808, + "grad_norm": 0.34004430054490165, + "learning_rate": 1.9344028664056713e-07, + "loss": 0.6539, + "step": 1226 + }, + { + "epoch": 0.9816, + "grad_norm": 0.37166414703450706, + "learning_rate": 1.7766077203915655e-07, + "loss": 0.5827, + "step": 1227 + }, + { + "epoch": 0.9824, + "grad_norm": 0.3917322050265033, + "learning_rate": 1.6255194770704586e-07, + "loss": 0.651, + "step": 1228 + }, + { + "epoch": 0.9832, + "grad_norm": 0.3788898361910757, + "learning_rate": 1.481139151579991e-07, + "loss": 0.7005, + "step": 1229 + }, + { + "epoch": 0.984, + "grad_norm": 0.32905755632467365, + "learning_rate": 1.3434677139885222e-07, + "loss": 0.5832, + "step": 1230 + }, + { + "epoch": 0.9848, + "grad_norm": 0.4212142889676699, + "learning_rate": 1.2125060892881346e-07, + "loss": 0.7139, + "step": 1231 + }, + { + "epoch": 0.9856, + "grad_norm": 0.3506529790455762, + "learning_rate": 1.0882551573891953e-07, + "loss": 0.6374, + "step": 1232 + }, + { + "epoch": 0.9864, + "grad_norm": 0.40496802534144327, + "learning_rate": 9.707157531134713e-08, + "loss": 0.6205, + "step": 1233 + }, + { + "epoch": 0.9872, + "grad_norm": 0.38956207080771643, + "learning_rate": 8.598886661895788e-08, + "loss": 0.7307, + "step": 1234 + }, + { + "epoch": 0.988, + "grad_norm": 0.409276659437291, + "learning_rate": 7.557746412468758e-08, + "loss": 0.6849, + "step": 1235 + }, + { + "epoch": 0.9888, + "grad_norm": 0.33857753778177363, + "learning_rate": 6.583743778106887e-08, + "loss": 0.6282, + "step": 1236 + }, + { + "epoch": 0.9896, + "grad_norm": 0.35583207592596244, + "learning_rate": 5.6768853029787184e-08, + "loss": 0.6375, + "step": 1237 + }, + { + "epoch": 0.9904, + "grad_norm": 0.4278669236748669, + "learning_rate": 4.837177080119215e-08, + "loss": 0.6559, + "step": 1238 + }, + { + "epoch": 0.9912, + "grad_norm": 0.35771404196607504, + "learning_rate": 4.064624751394242e-08, + "loss": 0.5723, + "step": 1239 + }, + { + "epoch": 0.992, + "grad_norm": 0.3705260496346748, + "learning_rate": 3.359233507459481e-08, + "loss": 0.6327, + "step": 1240 + }, + { + "epoch": 0.9928, + "grad_norm": 0.40411456405464913, + "learning_rate": 2.7210080877237976e-08, + "loss": 0.6728, + "step": 1241 + }, + { + "epoch": 0.9936, + "grad_norm": 0.39034217032554636, + "learning_rate": 2.1499527803214846e-08, + "loss": 0.653, + "step": 1242 + }, + { + "epoch": 0.9944, + "grad_norm": 0.3684099601229621, + "learning_rate": 1.646071422083395e-08, + "loss": 0.6704, + "step": 1243 + }, + { + "epoch": 0.9952, + "grad_norm": 0.4601256797139572, + "learning_rate": 1.209367398504746e-08, + "loss": 0.7698, + "step": 1244 + }, + { + "epoch": 0.996, + "grad_norm": 0.5029579074824774, + "learning_rate": 8.398436437317969e-09, + "loss": 0.7173, + "step": 1245 + }, + { + "epoch": 0.9968, + "grad_norm": 0.4367252519126284, + "learning_rate": 5.375026405352035e-09, + "loss": 0.7101, + "step": 1246 + }, + { + "epoch": 0.9976, + "grad_norm": 0.4054879695873238, + "learning_rate": 3.023464202944748e-09, + "loss": 0.7071, + "step": 1247 + }, + { + "epoch": 0.9984, + "grad_norm": 0.3916101744254669, + "learning_rate": 1.3437656298687097e-09, + "loss": 0.6545, + "step": 1248 + }, + { + "epoch": 0.9992, + "grad_norm": 0.455720607139133, + "learning_rate": 3.3594197175190745e-10, + "loss": 0.7416, + "step": 1249 + }, + { + "epoch": 1.0, + "grad_norm": 0.3877743051460032, + "learning_rate": 0.0, + "loss": 0.6556, + "step": 1250 + }, + { + "epoch": 1.0, + "step": 1250, + "total_flos": 1118125740523520.0, + "train_loss": 0.7323009983539581, + "train_runtime": 19639.1569, + "train_samples_per_second": 1.018, + "train_steps_per_second": 0.064 + } + ], + "logging_steps": 1.0, + "max_steps": 1250, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1118125740523520.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_1_epochs_1_GA_2_lora/README.md b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_1_epochs_1_GA_2_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_1_epochs_1_GA_2_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_1_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_1_epochs_1_GA_2_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a6863c943bd46bcce560929356f6b56d66f111f7 --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_1_epochs_1_GA_2_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "up_proj", + "v_proj", + "o_proj", + "k_proj", + "q_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..436c9519a878d70d21e16a874d25a7d4a73d59bf --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:887965b3be290a805860a3539a31f509131a71329e1a3fa3ec063f825b1916d7 +size 671150064 diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_1_epochs_1_GA_2_lora/config.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_1_epochs_1_GA_2_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_1_epochs_1_GA_2_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..95503a5f7e5192f4eb7000a3d6c7db23eda2ca16 --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:132b2359c1af99b84c77dc4998a784e8c1c34f2ebf05bc9b2e9d8c9c86964125 +size 918507402 diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_1_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_1_epochs_1_GA_2_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..97a0479a48d8d53e59fdb7fdce339ddd8974f0a5 --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_1_epochs_1_GA_2_lora/trainer_state.json @@ -0,0 +1,917 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 125, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.008, + "grad_norm": 0.8230102459917927, + "learning_rate": 5e-05, + "loss": 1.2514, + "step": 1 + }, + { + "epoch": 0.016, + "grad_norm": 0.7754613104842406, + "learning_rate": 0.0001, + "loss": 1.2817, + "step": 2 + }, + { + "epoch": 0.024, + "grad_norm": 0.7476538150253536, + "learning_rate": 0.00015000000000000001, + "loss": 1.3323, + "step": 3 + }, + { + "epoch": 0.032, + "grad_norm": 0.826743930439023, + "learning_rate": 0.0002, + "loss": 1.1092, + "step": 4 + }, + { + "epoch": 0.04, + "grad_norm": 0.8815455745050119, + "learning_rate": 0.00019996629653035126, + "loss": 0.9763, + "step": 5 + }, + { + "epoch": 0.048, + "grad_norm": 0.6994265783297446, + "learning_rate": 0.00019986520883988232, + "loss": 0.9622, + "step": 6 + }, + { + "epoch": 0.056, + "grad_norm": 0.4844382211969728, + "learning_rate": 0.00019969680506871137, + "loss": 0.84, + "step": 7 + }, + { + "epoch": 0.064, + "grad_norm": 0.4831927243517946, + "learning_rate": 0.00019946119873266613, + "loss": 0.8755, + "step": 8 + }, + { + "epoch": 0.072, + "grad_norm": 0.5020615808420609, + "learning_rate": 0.00019915854864676664, + "loss": 0.8759, + "step": 9 + }, + { + "epoch": 0.08, + "grad_norm": 0.4491917358911475, + "learning_rate": 0.00019878905881817252, + "loss": 0.7923, + "step": 10 + }, + { + "epoch": 0.088, + "grad_norm": 0.5621541379413584, + "learning_rate": 0.00019835297830866826, + "loss": 0.8595, + "step": 11 + }, + { + "epoch": 0.096, + "grad_norm": 0.5229399877952087, + "learning_rate": 0.00019785060106677818, + "loss": 0.9081, + "step": 12 + }, + { + "epoch": 0.104, + "grad_norm": 0.4899483245891199, + "learning_rate": 0.00019728226572962473, + "loss": 0.8514, + "step": 13 + }, + { + "epoch": 0.112, + "grad_norm": 0.5248054422478008, + "learning_rate": 0.0001966483553946637, + "loss": 0.8841, + "step": 14 + }, + { + "epoch": 0.12, + "grad_norm": 0.4467654958381241, + "learning_rate": 0.00019594929736144976, + "loss": 0.8905, + "step": 15 + }, + { + "epoch": 0.128, + "grad_norm": 0.485282859570833, + "learning_rate": 0.00019518556284360696, + "loss": 0.9181, + "step": 16 + }, + { + "epoch": 0.136, + "grad_norm": 0.39458183468054975, + "learning_rate": 0.0001943576666511982, + "loss": 0.7556, + "step": 17 + }, + { + "epoch": 0.144, + "grad_norm": 0.4373416456058985, + "learning_rate": 0.0001934661668437073, + "loss": 0.8343, + "step": 18 + }, + { + "epoch": 0.152, + "grad_norm": 0.5027527747723598, + "learning_rate": 0.0001925116643538684, + "loss": 0.9187, + "step": 19 + }, + { + "epoch": 0.16, + "grad_norm": 0.4098235261127106, + "learning_rate": 0.00019149480258259533, + "loss": 0.8822, + "step": 20 + }, + { + "epoch": 0.168, + "grad_norm": 0.4295129059527011, + "learning_rate": 0.00019041626696528503, + "loss": 0.8656, + "step": 21 + }, + { + "epoch": 0.176, + "grad_norm": 0.3936690434586878, + "learning_rate": 0.0001892767845097864, + "loss": 0.8036, + "step": 22 + }, + { + "epoch": 0.184, + "grad_norm": 0.4417471186799036, + "learning_rate": 0.00018807712330634642, + "loss": 0.8928, + "step": 23 + }, + { + "epoch": 0.192, + "grad_norm": 0.4864785406175006, + "learning_rate": 0.0001868180920098644, + "loss": 0.9437, + "step": 24 + }, + { + "epoch": 0.2, + "grad_norm": 0.45968721046185873, + "learning_rate": 0.00018550053929480202, + "loss": 0.8372, + "step": 25 + }, + { + "epoch": 0.208, + "grad_norm": 0.4830714665889916, + "learning_rate": 0.00018412535328311814, + "loss": 0.8806, + "step": 26 + }, + { + "epoch": 0.216, + "grad_norm": 0.43737069265822076, + "learning_rate": 0.0001826934609456129, + "loss": 0.8653, + "step": 27 + }, + { + "epoch": 0.224, + "grad_norm": 0.49014680189737875, + "learning_rate": 0.00018120582747708502, + "loss": 0.9067, + "step": 28 + }, + { + "epoch": 0.232, + "grad_norm": 0.3889008209366986, + "learning_rate": 0.0001796634556457236, + "loss": 0.7482, + "step": 29 + }, + { + "epoch": 0.24, + "grad_norm": 0.46759884804944396, + "learning_rate": 0.0001780673851171728, + "loss": 0.8731, + "step": 30 + }, + { + "epoch": 0.248, + "grad_norm": 0.44882353456669494, + "learning_rate": 0.00017641869175372493, + "loss": 0.8668, + "step": 31 + }, + { + "epoch": 0.256, + "grad_norm": 0.48171393222739595, + "learning_rate": 0.00017471848688911464, + "loss": 0.8581, + "step": 32 + }, + { + "epoch": 0.264, + "grad_norm": 0.3903285974086322, + "learning_rate": 0.000172967916579403, + "loss": 0.7926, + "step": 33 + }, + { + "epoch": 0.272, + "grad_norm": 0.40684507206766596, + "learning_rate": 0.00017116816083045602, + "loss": 0.8027, + "step": 34 + }, + { + "epoch": 0.28, + "grad_norm": 0.4994530648174004, + "learning_rate": 0.0001693204328025389, + "loss": 0.8932, + "step": 35 + }, + { + "epoch": 0.288, + "grad_norm": 0.44581192002150094, + "learning_rate": 0.00016742597799256182, + "loss": 0.9286, + "step": 36 + }, + { + "epoch": 0.296, + "grad_norm": 0.5045891023767098, + "learning_rate": 0.00016548607339452853, + "loss": 0.7539, + "step": 37 + }, + { + "epoch": 0.304, + "grad_norm": 0.6705640902608652, + "learning_rate": 0.00016350202663875386, + "loss": 0.7617, + "step": 38 + }, + { + "epoch": 0.312, + "grad_norm": 0.42724800016082565, + "learning_rate": 0.0001614751751104301, + "loss": 0.7821, + "step": 39 + }, + { + "epoch": 0.32, + "grad_norm": 0.4693831019747195, + "learning_rate": 0.00015940688504813662, + "loss": 0.8945, + "step": 40 + }, + { + "epoch": 0.328, + "grad_norm": 0.4177089941624564, + "learning_rate": 0.00015729855062290022, + "loss": 0.7994, + "step": 41 + }, + { + "epoch": 0.336, + "grad_norm": 0.4545228930867604, + "learning_rate": 0.00015515159299842707, + "loss": 0.8277, + "step": 42 + }, + { + "epoch": 0.344, + "grad_norm": 0.40871977385442615, + "learning_rate": 0.00015296745937313987, + "loss": 0.8496, + "step": 43 + }, + { + "epoch": 0.352, + "grad_norm": 0.3950360210436592, + "learning_rate": 0.00015074762200466556, + "loss": 0.762, + "step": 44 + }, + { + "epoch": 0.36, + "grad_norm": 0.48892837124416993, + "learning_rate": 0.00014849357721743168, + "loss": 0.9717, + "step": 45 + }, + { + "epoch": 0.368, + "grad_norm": 0.40637100014340133, + "learning_rate": 0.00014620684439403962, + "loss": 0.7747, + "step": 46 + }, + { + "epoch": 0.376, + "grad_norm": 0.419754658262873, + "learning_rate": 0.0001438889649510956, + "loss": 0.8454, + "step": 47 + }, + { + "epoch": 0.384, + "grad_norm": 0.43116125499781977, + "learning_rate": 0.00014154150130018866, + "loss": 0.8547, + "step": 48 + }, + { + "epoch": 0.392, + "grad_norm": 0.45465691712353407, + "learning_rate": 0.00013916603579471705, + "loss": 0.8865, + "step": 49 + }, + { + "epoch": 0.4, + "grad_norm": 0.43305078198789515, + "learning_rate": 0.000136764169663272, + "loss": 0.7896, + "step": 50 + }, + { + "epoch": 0.408, + "grad_norm": 0.418300240118205, + "learning_rate": 0.00013433752193029886, + "loss": 0.7947, + "step": 51 + }, + { + "epoch": 0.416, + "grad_norm": 0.4063728927765339, + "learning_rate": 0.00013188772832476188, + "loss": 0.7833, + "step": 52 + }, + { + "epoch": 0.424, + "grad_norm": 0.434297863169188, + "learning_rate": 0.00012941644017754964, + "loss": 0.7386, + "step": 53 + }, + { + "epoch": 0.432, + "grad_norm": 0.3595960970194214, + "learning_rate": 0.00012692532330836346, + "loss": 0.7493, + "step": 54 + }, + { + "epoch": 0.44, + "grad_norm": 0.40644583570820914, + "learning_rate": 0.00012441605690283915, + "loss": 0.7941, + "step": 55 + }, + { + "epoch": 0.448, + "grad_norm": 0.45001523527084303, + "learning_rate": 0.0001218903323806595, + "loss": 0.8319, + "step": 56 + }, + { + "epoch": 0.456, + "grad_norm": 0.4453989829060359, + "learning_rate": 0.00011934985225541998, + "loss": 0.8097, + "step": 57 + }, + { + "epoch": 0.464, + "grad_norm": 0.5209708873974737, + "learning_rate": 0.00011679632898701649, + "loss": 0.8557, + "step": 58 + }, + { + "epoch": 0.472, + "grad_norm": 0.36837003736864576, + "learning_rate": 0.00011423148382732853, + "loss": 0.8406, + "step": 59 + }, + { + "epoch": 0.48, + "grad_norm": 0.42846222791516786, + "learning_rate": 0.00011165704565997593, + "loss": 0.8763, + "step": 60 + }, + { + "epoch": 0.488, + "grad_norm": 0.42477424040871764, + "learning_rate": 0.00010907474983493144, + "loss": 0.8633, + "step": 61 + }, + { + "epoch": 0.496, + "grad_norm": 0.37810519177436647, + "learning_rate": 0.0001064863369987743, + "loss": 0.7503, + "step": 62 + }, + { + "epoch": 0.504, + "grad_norm": 0.7574479348860013, + "learning_rate": 0.00010389355192137377, + "loss": 0.7541, + "step": 63 + }, + { + "epoch": 0.512, + "grad_norm": 0.4593511499288001, + "learning_rate": 0.0001012981423197931, + "loss": 0.8625, + "step": 64 + }, + { + "epoch": 0.52, + "grad_norm": 0.38782464238758474, + "learning_rate": 9.870185768020693e-05, + "loss": 0.7631, + "step": 65 + }, + { + "epoch": 0.528, + "grad_norm": 0.376653667689996, + "learning_rate": 9.610644807862625e-05, + "loss": 0.7488, + "step": 66 + }, + { + "epoch": 0.536, + "grad_norm": 0.43937504286831613, + "learning_rate": 9.35136630012257e-05, + "loss": 0.7863, + "step": 67 + }, + { + "epoch": 0.544, + "grad_norm": 0.41651678253242863, + "learning_rate": 9.092525016506858e-05, + "loss": 0.8357, + "step": 68 + }, + { + "epoch": 0.552, + "grad_norm": 0.4016459673662941, + "learning_rate": 8.83429543400241e-05, + "loss": 0.7872, + "step": 69 + }, + { + "epoch": 0.56, + "grad_norm": 0.5064989487514656, + "learning_rate": 8.57685161726715e-05, + "loss": 0.8308, + "step": 70 + }, + { + "epoch": 0.568, + "grad_norm": 0.40645253693630223, + "learning_rate": 8.320367101298351e-05, + "loss": 0.7172, + "step": 71 + }, + { + "epoch": 0.576, + "grad_norm": 0.4730501998349042, + "learning_rate": 8.065014774458003e-05, + "loss": 0.9324, + "step": 72 + }, + { + "epoch": 0.584, + "grad_norm": 0.37087713139709355, + "learning_rate": 7.810966761934053e-05, + "loss": 0.768, + "step": 73 + }, + { + "epoch": 0.592, + "grad_norm": 0.3964332480626391, + "learning_rate": 7.558394309716088e-05, + "loss": 0.8189, + "step": 74 + }, + { + "epoch": 0.6, + "grad_norm": 0.7221770560206417, + "learning_rate": 7.307467669163655e-05, + "loss": 0.8414, + "step": 75 + }, + { + "epoch": 0.608, + "grad_norm": 0.4316521393843248, + "learning_rate": 7.058355982245037e-05, + "loss": 0.771, + "step": 76 + }, + { + "epoch": 0.616, + "grad_norm": 0.4084063585831575, + "learning_rate": 6.811227167523815e-05, + "loss": 0.814, + "step": 77 + }, + { + "epoch": 0.624, + "grad_norm": 0.3942071420392019, + "learning_rate": 6.566247806970119e-05, + "loss": 0.8124, + "step": 78 + }, + { + "epoch": 0.632, + "grad_norm": 0.44230839486402507, + "learning_rate": 6.323583033672799e-05, + "loss": 0.9178, + "step": 79 + }, + { + "epoch": 0.64, + "grad_norm": 0.3996441014502741, + "learning_rate": 6.083396420528298e-05, + "loss": 0.8134, + "step": 80 + }, + { + "epoch": 0.648, + "grad_norm": 0.43362126658735817, + "learning_rate": 5.845849869981137e-05, + "loss": 0.7898, + "step": 81 + }, + { + "epoch": 0.656, + "grad_norm": 0.4031592034738205, + "learning_rate": 5.611103504890444e-05, + "loss": 0.8033, + "step": 82 + }, + { + "epoch": 0.664, + "grad_norm": 0.366707674078855, + "learning_rate": 5.379315560596038e-05, + "loss": 0.7775, + "step": 83 + }, + { + "epoch": 0.672, + "grad_norm": 0.3975638056137053, + "learning_rate": 5.1506422782568345e-05, + "loss": 0.8359, + "step": 84 + }, + { + "epoch": 0.68, + "grad_norm": 0.4045827373258958, + "learning_rate": 4.9252377995334444e-05, + "loss": 0.8101, + "step": 85 + }, + { + "epoch": 0.688, + "grad_norm": 0.4093349338659919, + "learning_rate": 4.703254062686017e-05, + "loss": 0.8367, + "step": 86 + }, + { + "epoch": 0.696, + "grad_norm": 0.36895176230401944, + "learning_rate": 4.484840700157295e-05, + "loss": 0.7753, + "step": 87 + }, + { + "epoch": 0.704, + "grad_norm": 0.41132089972959834, + "learning_rate": 4.270144937709981e-05, + "loss": 0.7604, + "step": 88 + }, + { + "epoch": 0.712, + "grad_norm": 0.350552813073951, + "learning_rate": 4.059311495186338e-05, + "loss": 0.7579, + "step": 89 + }, + { + "epoch": 0.72, + "grad_norm": 0.36441120795630105, + "learning_rate": 3.852482488956992e-05, + "loss": 0.8193, + "step": 90 + }, + { + "epoch": 0.728, + "grad_norm": 0.4088620411864537, + "learning_rate": 3.649797336124615e-05, + "loss": 0.8123, + "step": 91 + }, + { + "epoch": 0.736, + "grad_norm": 0.4482828532364913, + "learning_rate": 3.45139266054715e-05, + "loss": 0.8331, + "step": 92 + }, + { + "epoch": 0.744, + "grad_norm": 0.39655166223757243, + "learning_rate": 3.257402200743821e-05, + "loss": 0.8113, + "step": 93 + }, + { + "epoch": 0.752, + "grad_norm": 0.39906728803227215, + "learning_rate": 3.0679567197461134e-05, + "loss": 0.8395, + "step": 94 + }, + { + "epoch": 0.76, + "grad_norm": 0.39806188010556676, + "learning_rate": 2.8831839169543996e-05, + "loss": 0.8374, + "step": 95 + }, + { + "epoch": 0.768, + "grad_norm": 0.3392116012955209, + "learning_rate": 2.7032083420597e-05, + "loss": 0.7256, + "step": 96 + }, + { + "epoch": 0.776, + "grad_norm": 0.3846613142134544, + "learning_rate": 2.528151311088537e-05, + "loss": 0.8255, + "step": 97 + }, + { + "epoch": 0.784, + "grad_norm": 0.40662812774870566, + "learning_rate": 2.3581308246275103e-05, + "loss": 0.7375, + "step": 98 + }, + { + "epoch": 0.792, + "grad_norm": 0.432119270511682, + "learning_rate": 2.1932614882827197e-05, + "loss": 0.8773, + "step": 99 + }, + { + "epoch": 0.8, + "grad_norm": 0.44158999212591304, + "learning_rate": 2.03365443542764e-05, + "loss": 0.8638, + "step": 100 + }, + { + "epoch": 0.808, + "grad_norm": 0.37440088499613716, + "learning_rate": 1.879417252291502e-05, + "loss": 0.7741, + "step": 101 + }, + { + "epoch": 0.816, + "grad_norm": 0.3866735548327976, + "learning_rate": 1.730653905438714e-05, + "loss": 0.8221, + "step": 102 + }, + { + "epoch": 0.824, + "grad_norm": 0.44310425673846804, + "learning_rate": 1.587464671688187e-05, + "loss": 0.7853, + "step": 103 + }, + { + "epoch": 0.832, + "grad_norm": 0.34743929892057196, + "learning_rate": 1.4499460705197998e-05, + "loss": 0.6733, + "step": 104 + }, + { + "epoch": 0.84, + "grad_norm": 0.4703681600961128, + "learning_rate": 1.3181907990135622e-05, + "loss": 0.9532, + "step": 105 + }, + { + "epoch": 0.848, + "grad_norm": 0.4348078777013694, + "learning_rate": 1.1922876693653585e-05, + "loss": 0.7169, + "step": 106 + }, + { + "epoch": 0.856, + "grad_norm": 0.4145488466937236, + "learning_rate": 1.0723215490213634e-05, + "loss": 0.8356, + "step": 107 + }, + { + "epoch": 0.864, + "grad_norm": 0.36617204461285024, + "learning_rate": 9.583733034714981e-06, + "loss": 0.7639, + "step": 108 + }, + { + "epoch": 0.872, + "grad_norm": 0.40029865160158434, + "learning_rate": 8.505197417404687e-06, + "loss": 0.7791, + "step": 109 + }, + { + "epoch": 0.88, + "grad_norm": 0.39878953165088726, + "learning_rate": 7.488335646131628e-06, + "loss": 0.8354, + "step": 110 + }, + { + "epoch": 0.888, + "grad_norm": 0.3921062999652053, + "learning_rate": 6.533833156292679e-06, + "loss": 0.7724, + "step": 111 + }, + { + "epoch": 0.896, + "grad_norm": 0.4499808155505365, + "learning_rate": 5.6423333488018095e-06, + "loss": 0.8353, + "step": 112 + }, + { + "epoch": 0.904, + "grad_norm": 0.4388588621281375, + "learning_rate": 4.8144371563930476e-06, + "loss": 0.8147, + "step": 113 + }, + { + "epoch": 0.912, + "grad_norm": 0.40645242005253257, + "learning_rate": 4.050702638550275e-06, + "loss": 0.7926, + "step": 114 + }, + { + "epoch": 0.92, + "grad_norm": 0.35812624610283694, + "learning_rate": 3.3516446053363015e-06, + "loss": 0.7639, + "step": 115 + }, + { + "epoch": 0.928, + "grad_norm": 0.4594340329784737, + "learning_rate": 2.717734270375272e-06, + "loss": 0.755, + "step": 116 + }, + { + "epoch": 0.936, + "grad_norm": 0.39452619162037694, + "learning_rate": 2.1493989332218468e-06, + "loss": 0.7649, + "step": 117 + }, + { + "epoch": 0.944, + "grad_norm": 0.4025940361974267, + "learning_rate": 1.6470216913317626e-06, + "loss": 0.7408, + "step": 118 + }, + { + "epoch": 0.952, + "grad_norm": 0.3624064832750243, + "learning_rate": 1.2109411818274852e-06, + "loss": 0.7472, + "step": 119 + }, + { + "epoch": 0.96, + "grad_norm": 0.404072865815922, + "learning_rate": 8.41451353233369e-07, + "loss": 0.8834, + "step": 120 + }, + { + "epoch": 0.968, + "grad_norm": 0.39761468715465376, + "learning_rate": 5.388012673338661e-07, + "loss": 0.7179, + "step": 121 + }, + { + "epoch": 0.976, + "grad_norm": 0.39029133459924825, + "learning_rate": 3.0319493128866396e-07, + "loss": 0.7815, + "step": 122 + }, + { + "epoch": 0.984, + "grad_norm": 0.36841909487954966, + "learning_rate": 1.3479116011769767e-07, + "loss": 0.7708, + "step": 123 + }, + { + "epoch": 0.992, + "grad_norm": 0.3834250542279903, + "learning_rate": 3.370346964876036e-08, + "loss": 0.7298, + "step": 124 + }, + { + "epoch": 1.0, + "grad_norm": 0.4079464579421249, + "learning_rate": 0.0, + "loss": 0.7584, + "step": 125 + }, + { + "epoch": 1.0, + "step": 125, + "total_flos": 111708340486144.0, + "train_loss": 0.8341695852279664, + "train_runtime": 1961.2271, + "train_samples_per_second": 1.02, + "train_steps_per_second": 0.064 + } + ], + "logging_steps": 1.0, + "max_steps": 125, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 111708340486144.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_2_epochs_1_GA_2_lora/README.md b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_2_epochs_1_GA_2_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_2_epochs_1_GA_2_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_2_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_2_epochs_1_GA_2_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..95e637174f4c6854bec1305e143fcc97e480d899 --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_2_epochs_1_GA_2_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "up_proj", + "down_proj", + "gate_proj", + "k_proj", + "q_proj", + "o_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5dd2f56736a672d20e246768cd69181da7d17193 --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:452c66bf0be31c51efdfe47a7179b943f633956899ffc5994464dd7ea7bcb263 +size 671150064 diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_2_epochs_1_GA_2_lora/config.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_2_epochs_1_GA_2_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_2_epochs_1_GA_2_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..fc98fb6e60ae042032ed65563e1e4f42fb4c194d --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb4b39c3d0a288f2bfd5dafe524367a1fa916b14f42b8df2a5c55730e7cdaca6 +size 918507402 diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_2_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_2_epochs_1_GA_2_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..64e5b5901f4ababbbe0da1b506cd9e2a5acb37b5 --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_2_epochs_1_GA_2_lora/trainer_state.json @@ -0,0 +1,917 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 125, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.008, + "grad_norm": 0.7988098439667438, + "learning_rate": 5e-05, + "loss": 1.2723, + "step": 1 + }, + { + "epoch": 0.016, + "grad_norm": 0.8047104939359582, + "learning_rate": 0.0001, + "loss": 1.3382, + "step": 2 + }, + { + "epoch": 0.024, + "grad_norm": 0.678779971008081, + "learning_rate": 0.00015000000000000001, + "loss": 1.2853, + "step": 3 + }, + { + "epoch": 0.032, + "grad_norm": 0.9622509769111144, + "learning_rate": 0.0002, + "loss": 1.1426, + "step": 4 + }, + { + "epoch": 0.04, + "grad_norm": 0.9449202770205999, + "learning_rate": 0.00019996629653035126, + "loss": 1.0317, + "step": 5 + }, + { + "epoch": 0.048, + "grad_norm": 0.6746967486634825, + "learning_rate": 0.00019986520883988232, + "loss": 0.953, + "step": 6 + }, + { + "epoch": 0.056, + "grad_norm": 0.6007273349994086, + "learning_rate": 0.00019969680506871137, + "loss": 1.0059, + "step": 7 + }, + { + "epoch": 0.064, + "grad_norm": 0.42193348083346716, + "learning_rate": 0.00019946119873266613, + "loss": 0.8801, + "step": 8 + }, + { + "epoch": 0.072, + "grad_norm": 0.5153618836742115, + "learning_rate": 0.00019915854864676664, + "loss": 0.973, + "step": 9 + }, + { + "epoch": 0.08, + "grad_norm": 0.48297466520195975, + "learning_rate": 0.00019878905881817252, + "loss": 0.8564, + "step": 10 + }, + { + "epoch": 0.088, + "grad_norm": 0.47564449264950726, + "learning_rate": 0.00019835297830866826, + "loss": 0.8737, + "step": 11 + }, + { + "epoch": 0.096, + "grad_norm": 0.5207808832264728, + "learning_rate": 0.00019785060106677818, + "loss": 0.9143, + "step": 12 + }, + { + "epoch": 0.104, + "grad_norm": 0.4538323052406747, + "learning_rate": 0.00019728226572962473, + "loss": 0.7725, + "step": 13 + }, + { + "epoch": 0.112, + "grad_norm": 0.5352076138143766, + "learning_rate": 0.0001966483553946637, + "loss": 0.8691, + "step": 14 + }, + { + "epoch": 0.12, + "grad_norm": 0.45615441780790505, + "learning_rate": 0.00019594929736144976, + "loss": 0.9811, + "step": 15 + }, + { + "epoch": 0.128, + "grad_norm": 0.42359513948647015, + "learning_rate": 0.00019518556284360696, + "loss": 0.8762, + "step": 16 + }, + { + "epoch": 0.136, + "grad_norm": 0.5052044531432168, + "learning_rate": 0.0001943576666511982, + "loss": 0.9138, + "step": 17 + }, + { + "epoch": 0.144, + "grad_norm": 0.4748034877827455, + "learning_rate": 0.0001934661668437073, + "loss": 0.8831, + "step": 18 + }, + { + "epoch": 0.152, + "grad_norm": 0.4319246964375862, + "learning_rate": 0.0001925116643538684, + "loss": 0.894, + "step": 19 + }, + { + "epoch": 0.16, + "grad_norm": 0.4295723902379643, + "learning_rate": 0.00019149480258259533, + "loss": 0.8507, + "step": 20 + }, + { + "epoch": 0.168, + "grad_norm": 0.43225796519679444, + "learning_rate": 0.00019041626696528503, + "loss": 0.8182, + "step": 21 + }, + { + "epoch": 0.176, + "grad_norm": 0.40490037014979546, + "learning_rate": 0.0001892767845097864, + "loss": 0.7561, + "step": 22 + }, + { + "epoch": 0.184, + "grad_norm": 0.46668780783282765, + "learning_rate": 0.00018807712330634642, + "loss": 0.8036, + "step": 23 + }, + { + "epoch": 0.192, + "grad_norm": 0.5157456248655427, + "learning_rate": 0.0001868180920098644, + "loss": 0.8313, + "step": 24 + }, + { + "epoch": 0.2, + "grad_norm": 0.5128106845064673, + "learning_rate": 0.00018550053929480202, + "loss": 0.9407, + "step": 25 + }, + { + "epoch": 0.208, + "grad_norm": 0.622506863485958, + "learning_rate": 0.00018412535328311814, + "loss": 0.9093, + "step": 26 + }, + { + "epoch": 0.216, + "grad_norm": 0.48229322437865646, + "learning_rate": 0.0001826934609456129, + "loss": 0.9261, + "step": 27 + }, + { + "epoch": 0.224, + "grad_norm": 0.4880628956946326, + "learning_rate": 0.00018120582747708502, + "loss": 0.8975, + "step": 28 + }, + { + "epoch": 0.232, + "grad_norm": 0.38460486088342766, + "learning_rate": 0.0001796634556457236, + "loss": 0.7645, + "step": 29 + }, + { + "epoch": 0.24, + "grad_norm": 0.4550248150103312, + "learning_rate": 0.0001780673851171728, + "loss": 0.8774, + "step": 30 + }, + { + "epoch": 0.248, + "grad_norm": 0.384620264631093, + "learning_rate": 0.00017641869175372493, + "loss": 0.8061, + "step": 31 + }, + { + "epoch": 0.256, + "grad_norm": 0.5225658909213516, + "learning_rate": 0.00017471848688911464, + "loss": 0.8908, + "step": 32 + }, + { + "epoch": 0.264, + "grad_norm": 0.4036288051125909, + "learning_rate": 0.000172967916579403, + "loss": 0.8343, + "step": 33 + }, + { + "epoch": 0.272, + "grad_norm": 0.42633925388376787, + "learning_rate": 0.00017116816083045602, + "loss": 0.8138, + "step": 34 + }, + { + "epoch": 0.28, + "grad_norm": 0.46806271680078293, + "learning_rate": 0.0001693204328025389, + "loss": 0.8776, + "step": 35 + }, + { + "epoch": 0.288, + "grad_norm": 0.422612239326831, + "learning_rate": 0.00016742597799256182, + "loss": 0.8444, + "step": 36 + }, + { + "epoch": 0.296, + "grad_norm": 0.4388054628355334, + "learning_rate": 0.00016548607339452853, + "loss": 0.8609, + "step": 37 + }, + { + "epoch": 0.304, + "grad_norm": 0.45385481596187843, + "learning_rate": 0.00016350202663875386, + "loss": 0.7595, + "step": 38 + }, + { + "epoch": 0.312, + "grad_norm": 0.43686390157927096, + "learning_rate": 0.0001614751751104301, + "loss": 0.8916, + "step": 39 + }, + { + "epoch": 0.32, + "grad_norm": 0.4081204005411219, + "learning_rate": 0.00015940688504813662, + "loss": 0.8302, + "step": 40 + }, + { + "epoch": 0.328, + "grad_norm": 0.5653417246912735, + "learning_rate": 0.00015729855062290022, + "loss": 0.908, + "step": 41 + }, + { + "epoch": 0.336, + "grad_norm": 0.39165937516782645, + "learning_rate": 0.00015515159299842707, + "loss": 0.7948, + "step": 42 + }, + { + "epoch": 0.344, + "grad_norm": 0.42135665615054524, + "learning_rate": 0.00015296745937313987, + "loss": 0.7923, + "step": 43 + }, + { + "epoch": 0.352, + "grad_norm": 0.39143282192422507, + "learning_rate": 0.00015074762200466556, + "loss": 0.8057, + "step": 44 + }, + { + "epoch": 0.36, + "grad_norm": 0.44759789583787546, + "learning_rate": 0.00014849357721743168, + "loss": 0.954, + "step": 45 + }, + { + "epoch": 0.368, + "grad_norm": 0.43133884534448624, + "learning_rate": 0.00014620684439403962, + "loss": 0.8549, + "step": 46 + }, + { + "epoch": 0.376, + "grad_norm": 0.4415959165799372, + "learning_rate": 0.0001438889649510956, + "loss": 0.8775, + "step": 47 + }, + { + "epoch": 0.384, + "grad_norm": 0.4514681186674009, + "learning_rate": 0.00014154150130018866, + "loss": 0.8194, + "step": 48 + }, + { + "epoch": 0.392, + "grad_norm": 0.4173767895604976, + "learning_rate": 0.00013916603579471705, + "loss": 0.8604, + "step": 49 + }, + { + "epoch": 0.4, + "grad_norm": 0.4133769218943885, + "learning_rate": 0.000136764169663272, + "loss": 0.7908, + "step": 50 + }, + { + "epoch": 0.408, + "grad_norm": 0.41828754301799737, + "learning_rate": 0.00013433752193029886, + "loss": 0.8546, + "step": 51 + }, + { + "epoch": 0.416, + "grad_norm": 0.406320541980502, + "learning_rate": 0.00013188772832476188, + "loss": 0.7896, + "step": 52 + }, + { + "epoch": 0.424, + "grad_norm": 0.41489293486425927, + "learning_rate": 0.00012941644017754964, + "loss": 0.7973, + "step": 53 + }, + { + "epoch": 0.432, + "grad_norm": 0.4677216400238403, + "learning_rate": 0.00012692532330836346, + "loss": 0.8564, + "step": 54 + }, + { + "epoch": 0.44, + "grad_norm": 0.43476535746613176, + "learning_rate": 0.00012441605690283915, + "loss": 0.8244, + "step": 55 + }, + { + "epoch": 0.448, + "grad_norm": 0.4277565054201037, + "learning_rate": 0.0001218903323806595, + "loss": 0.7799, + "step": 56 + }, + { + "epoch": 0.456, + "grad_norm": 0.4389903142149982, + "learning_rate": 0.00011934985225541998, + "loss": 0.8338, + "step": 57 + }, + { + "epoch": 0.464, + "grad_norm": 0.41672364775171566, + "learning_rate": 0.00011679632898701649, + "loss": 0.8323, + "step": 58 + }, + { + "epoch": 0.472, + "grad_norm": 0.4205986930643128, + "learning_rate": 0.00011423148382732853, + "loss": 0.8028, + "step": 59 + }, + { + "epoch": 0.48, + "grad_norm": 0.4213889474829399, + "learning_rate": 0.00011165704565997593, + "loss": 0.8655, + "step": 60 + }, + { + "epoch": 0.488, + "grad_norm": 0.38890226375386644, + "learning_rate": 0.00010907474983493144, + "loss": 0.7673, + "step": 61 + }, + { + "epoch": 0.496, + "grad_norm": 0.4007064239374529, + "learning_rate": 0.0001064863369987743, + "loss": 0.7581, + "step": 62 + }, + { + "epoch": 0.504, + "grad_norm": 0.40113808394171746, + "learning_rate": 0.00010389355192137377, + "loss": 0.8179, + "step": 63 + }, + { + "epoch": 0.512, + "grad_norm": 0.4701690819238131, + "learning_rate": 0.0001012981423197931, + "loss": 0.8274, + "step": 64 + }, + { + "epoch": 0.52, + "grad_norm": 0.3782865063416312, + "learning_rate": 9.870185768020693e-05, + "loss": 0.7886, + "step": 65 + }, + { + "epoch": 0.528, + "grad_norm": 0.3695210524444637, + "learning_rate": 9.610644807862625e-05, + "loss": 0.773, + "step": 66 + }, + { + "epoch": 0.536, + "grad_norm": 0.5672358594074709, + "learning_rate": 9.35136630012257e-05, + "loss": 0.8099, + "step": 67 + }, + { + "epoch": 0.544, + "grad_norm": 0.4444450552640662, + "learning_rate": 9.092525016506858e-05, + "loss": 0.8306, + "step": 68 + }, + { + "epoch": 0.552, + "grad_norm": 0.41008705046493193, + "learning_rate": 8.83429543400241e-05, + "loss": 0.8789, + "step": 69 + }, + { + "epoch": 0.56, + "grad_norm": 0.4673737229398386, + "learning_rate": 8.57685161726715e-05, + "loss": 0.9351, + "step": 70 + }, + { + "epoch": 0.568, + "grad_norm": 0.3830397811715073, + "learning_rate": 8.320367101298351e-05, + "loss": 0.7336, + "step": 71 + }, + { + "epoch": 0.576, + "grad_norm": 0.4456941609698785, + "learning_rate": 8.065014774458003e-05, + "loss": 0.8714, + "step": 72 + }, + { + "epoch": 0.584, + "grad_norm": 0.40560461532518255, + "learning_rate": 7.810966761934053e-05, + "loss": 0.8273, + "step": 73 + }, + { + "epoch": 0.592, + "grad_norm": 0.37433262348405216, + "learning_rate": 7.558394309716088e-05, + "loss": 0.8253, + "step": 74 + }, + { + "epoch": 0.6, + "grad_norm": 0.5367018947959953, + "learning_rate": 7.307467669163655e-05, + "loss": 0.9215, + "step": 75 + }, + { + "epoch": 0.608, + "grad_norm": 0.405723420826382, + "learning_rate": 7.058355982245037e-05, + "loss": 0.7731, + "step": 76 + }, + { + "epoch": 0.616, + "grad_norm": 0.42171087090867204, + "learning_rate": 6.811227167523815e-05, + "loss": 0.7812, + "step": 77 + }, + { + "epoch": 0.624, + "grad_norm": 0.36691854390246237, + "learning_rate": 6.566247806970119e-05, + "loss": 0.7964, + "step": 78 + }, + { + "epoch": 0.632, + "grad_norm": 0.5249708367020518, + "learning_rate": 6.323583033672799e-05, + "loss": 0.9663, + "step": 79 + }, + { + "epoch": 0.64, + "grad_norm": 0.40363141580240425, + "learning_rate": 6.083396420528298e-05, + "loss": 0.7888, + "step": 80 + }, + { + "epoch": 0.648, + "grad_norm": 0.40789371199957575, + "learning_rate": 5.845849869981137e-05, + "loss": 0.7985, + "step": 81 + }, + { + "epoch": 0.656, + "grad_norm": 0.3948410406597383, + "learning_rate": 5.611103504890444e-05, + "loss": 0.7705, + "step": 82 + }, + { + "epoch": 0.664, + "grad_norm": 0.4223952800047747, + "learning_rate": 5.379315560596038e-05, + "loss": 0.826, + "step": 83 + }, + { + "epoch": 0.672, + "grad_norm": 0.3972599652479266, + "learning_rate": 5.1506422782568345e-05, + "loss": 0.7778, + "step": 84 + }, + { + "epoch": 0.68, + "grad_norm": 0.3995654426311221, + "learning_rate": 4.9252377995334444e-05, + "loss": 0.8366, + "step": 85 + }, + { + "epoch": 0.688, + "grad_norm": 0.40162821542256544, + "learning_rate": 4.703254062686017e-05, + "loss": 0.8117, + "step": 86 + }, + { + "epoch": 0.696, + "grad_norm": 0.39073027474543864, + "learning_rate": 4.484840700157295e-05, + "loss": 0.8034, + "step": 87 + }, + { + "epoch": 0.704, + "grad_norm": 0.43886821003145177, + "learning_rate": 4.270144937709981e-05, + "loss": 0.8382, + "step": 88 + }, + { + "epoch": 0.712, + "grad_norm": 0.3269490532820274, + "learning_rate": 4.059311495186338e-05, + "loss": 0.6712, + "step": 89 + }, + { + "epoch": 0.72, + "grad_norm": 0.3968517644888277, + "learning_rate": 3.852482488956992e-05, + "loss": 0.7962, + "step": 90 + }, + { + "epoch": 0.728, + "grad_norm": 0.4341010529888357, + "learning_rate": 3.649797336124615e-05, + "loss": 0.7879, + "step": 91 + }, + { + "epoch": 0.736, + "grad_norm": 0.42299890452348515, + "learning_rate": 3.45139266054715e-05, + "loss": 0.8825, + "step": 92 + }, + { + "epoch": 0.744, + "grad_norm": 0.39337741550250666, + "learning_rate": 3.257402200743821e-05, + "loss": 0.8028, + "step": 93 + }, + { + "epoch": 0.752, + "grad_norm": 0.39484171624919984, + "learning_rate": 3.0679567197461134e-05, + "loss": 0.8119, + "step": 94 + }, + { + "epoch": 0.76, + "grad_norm": 0.40769681915213646, + "learning_rate": 2.8831839169543996e-05, + "loss": 0.8004, + "step": 95 + }, + { + "epoch": 0.768, + "grad_norm": 0.3572914078439355, + "learning_rate": 2.7032083420597e-05, + "loss": 0.6857, + "step": 96 + }, + { + "epoch": 0.776, + "grad_norm": 0.3957369927951531, + "learning_rate": 2.528151311088537e-05, + "loss": 0.7213, + "step": 97 + }, + { + "epoch": 0.784, + "grad_norm": 0.37046745911855067, + "learning_rate": 2.3581308246275103e-05, + "loss": 0.7468, + "step": 98 + }, + { + "epoch": 0.792, + "grad_norm": 0.5005726568036697, + "learning_rate": 2.1932614882827197e-05, + "loss": 0.7871, + "step": 99 + }, + { + "epoch": 0.8, + "grad_norm": 0.3983543612467786, + "learning_rate": 2.03365443542764e-05, + "loss": 0.8407, + "step": 100 + }, + { + "epoch": 0.808, + "grad_norm": 0.48341697939974004, + "learning_rate": 1.879417252291502e-05, + "loss": 0.8513, + "step": 101 + }, + { + "epoch": 0.816, + "grad_norm": 0.3992372067430147, + "learning_rate": 1.730653905438714e-05, + "loss": 0.7209, + "step": 102 + }, + { + "epoch": 0.824, + "grad_norm": 0.41004749101059196, + "learning_rate": 1.587464671688187e-05, + "loss": 0.7662, + "step": 103 + }, + { + "epoch": 0.832, + "grad_norm": 0.370929509581193, + "learning_rate": 1.4499460705197998e-05, + "loss": 0.7471, + "step": 104 + }, + { + "epoch": 0.84, + "grad_norm": 0.46262922759234665, + "learning_rate": 1.3181907990135622e-05, + "loss": 0.8762, + "step": 105 + }, + { + "epoch": 0.848, + "grad_norm": 0.3589231369999426, + "learning_rate": 1.1922876693653585e-05, + "loss": 0.7501, + "step": 106 + }, + { + "epoch": 0.856, + "grad_norm": 0.41915549604282365, + "learning_rate": 1.0723215490213634e-05, + "loss": 0.7917, + "step": 107 + }, + { + "epoch": 0.864, + "grad_norm": 0.3706848594244108, + "learning_rate": 9.583733034714981e-06, + "loss": 0.7624, + "step": 108 + }, + { + "epoch": 0.872, + "grad_norm": 0.3984204104574697, + "learning_rate": 8.505197417404687e-06, + "loss": 0.8092, + "step": 109 + }, + { + "epoch": 0.88, + "grad_norm": 0.6132336734307566, + "learning_rate": 7.488335646131628e-06, + "loss": 0.8505, + "step": 110 + }, + { + "epoch": 0.888, + "grad_norm": 0.4104622166667177, + "learning_rate": 6.533833156292679e-06, + "loss": 0.7983, + "step": 111 + }, + { + "epoch": 0.896, + "grad_norm": 0.3789263447274554, + "learning_rate": 5.6423333488018095e-06, + "loss": 0.7745, + "step": 112 + }, + { + "epoch": 0.904, + "grad_norm": 0.3968337298933055, + "learning_rate": 4.8144371563930476e-06, + "loss": 0.7523, + "step": 113 + }, + { + "epoch": 0.912, + "grad_norm": 0.42083234830591265, + "learning_rate": 4.050702638550275e-06, + "loss": 0.7649, + "step": 114 + }, + { + "epoch": 0.92, + "grad_norm": 0.3605611194789269, + "learning_rate": 3.3516446053363015e-06, + "loss": 0.7152, + "step": 115 + }, + { + "epoch": 0.928, + "grad_norm": 0.36646932666512877, + "learning_rate": 2.717734270375272e-06, + "loss": 0.715, + "step": 116 + }, + { + "epoch": 0.936, + "grad_norm": 0.39664518944410565, + "learning_rate": 2.1493989332218468e-06, + "loss": 0.7701, + "step": 117 + }, + { + "epoch": 0.944, + "grad_norm": 0.4198410480684532, + "learning_rate": 1.6470216913317626e-06, + "loss": 0.8049, + "step": 118 + }, + { + "epoch": 0.952, + "grad_norm": 0.39925709056908043, + "learning_rate": 1.2109411818274852e-06, + "loss": 0.7778, + "step": 119 + }, + { + "epoch": 0.96, + "grad_norm": 0.4297078774434034, + "learning_rate": 8.41451353233369e-07, + "loss": 0.8153, + "step": 120 + }, + { + "epoch": 0.968, + "grad_norm": 0.33826156234226706, + "learning_rate": 5.388012673338661e-07, + "loss": 0.716, + "step": 121 + }, + { + "epoch": 0.976, + "grad_norm": 0.40010593332292316, + "learning_rate": 3.0319493128866396e-07, + "loss": 0.8225, + "step": 122 + }, + { + "epoch": 0.984, + "grad_norm": 0.362422594612195, + "learning_rate": 1.3479116011769767e-07, + "loss": 0.7373, + "step": 123 + }, + { + "epoch": 0.992, + "grad_norm": 0.39019640539389827, + "learning_rate": 3.370346964876036e-08, + "loss": 0.7307, + "step": 124 + }, + { + "epoch": 1.0, + "grad_norm": 0.4127359643945258, + "learning_rate": 0.0, + "loss": 0.7566, + "step": 125 + }, + { + "epoch": 1.0, + "step": 125, + "total_flos": 113027115089920.0, + "train_loss": 0.8387285661697388, + "train_runtime": 1973.4661, + "train_samples_per_second": 1.013, + "train_steps_per_second": 0.063 + } + ], + "logging_steps": 1.0, + "max_steps": 125, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 113027115089920.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_3_epochs_1_GA_2_lora/README.md b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_3_epochs_1_GA_2_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_3_epochs_1_GA_2_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_3_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_3_epochs_1_GA_2_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..7b466a38ac92462575b1e20c7147b9b4c32e8e7d --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_3_epochs_1_GA_2_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "o_proj", + "down_proj", + "up_proj", + "gate_proj", + "v_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..bd60eae6dec5fe85b046b0be52c1edcf7d2925aa --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b2b1a45abe87793a81bcd2f5ef3b7ef69557f7cec3b632c7ca42150a716cfbf +size 671150064 diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_3_epochs_1_GA_2_lora/config.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_3_epochs_1_GA_2_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_3_epochs_1_GA_2_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..32826a42ba27bc43e432f320a699d2e98d34b7e9 --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9bbeac56070177880cd170be34cdf7e19898867aecc03254b9931644a626b2ee +size 918507402 diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_3_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_3_epochs_1_GA_2_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..82faf447591a741394bf544bb4c48ee60d597aa6 --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_2000_repeat_3_epochs_1_GA_2_lora/trainer_state.json @@ -0,0 +1,917 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 125, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.008, + "grad_norm": 0.8142887741739527, + "learning_rate": 5e-05, + "loss": 1.3203, + "step": 1 + }, + { + "epoch": 0.016, + "grad_norm": 0.826263044163555, + "learning_rate": 0.0001, + "loss": 1.357, + "step": 2 + }, + { + "epoch": 0.024, + "grad_norm": 0.664560860527008, + "learning_rate": 0.00015000000000000001, + "loss": 1.3177, + "step": 3 + }, + { + "epoch": 0.032, + "grad_norm": 0.8465688159037528, + "learning_rate": 0.0002, + "loss": 1.1138, + "step": 4 + }, + { + "epoch": 0.04, + "grad_norm": 0.7328564816107398, + "learning_rate": 0.00019996629653035126, + "loss": 0.9531, + "step": 5 + }, + { + "epoch": 0.048, + "grad_norm": 0.7074016008967685, + "learning_rate": 0.00019986520883988232, + "loss": 1.0562, + "step": 6 + }, + { + "epoch": 0.056, + "grad_norm": 0.538099365465927, + "learning_rate": 0.00019969680506871137, + "loss": 0.9808, + "step": 7 + }, + { + "epoch": 0.064, + "grad_norm": 0.5176555636039095, + "learning_rate": 0.00019946119873266613, + "loss": 0.9657, + "step": 8 + }, + { + "epoch": 0.072, + "grad_norm": 0.5115786009392326, + "learning_rate": 0.00019915854864676664, + "loss": 0.9823, + "step": 9 + }, + { + "epoch": 0.08, + "grad_norm": 0.49782091794630234, + "learning_rate": 0.00019878905881817252, + "loss": 0.9706, + "step": 10 + }, + { + "epoch": 0.088, + "grad_norm": 0.5285171786772587, + "learning_rate": 0.00019835297830866826, + "loss": 0.9181, + "step": 11 + }, + { + "epoch": 0.096, + "grad_norm": 0.5045275115012817, + "learning_rate": 0.00019785060106677818, + "loss": 0.8479, + "step": 12 + }, + { + "epoch": 0.104, + "grad_norm": 0.45816614229029784, + "learning_rate": 0.00019728226572962473, + "loss": 0.8748, + "step": 13 + }, + { + "epoch": 0.112, + "grad_norm": 0.5694700429666423, + "learning_rate": 0.0001966483553946637, + "loss": 0.9133, + "step": 14 + }, + { + "epoch": 0.12, + "grad_norm": 0.49181239939969823, + "learning_rate": 0.00019594929736144976, + "loss": 0.9074, + "step": 15 + }, + { + "epoch": 0.128, + "grad_norm": 0.49316688543215115, + "learning_rate": 0.00019518556284360696, + "loss": 0.9717, + "step": 16 + }, + { + "epoch": 0.136, + "grad_norm": 0.44110871094005183, + "learning_rate": 0.0001943576666511982, + "loss": 0.8412, + "step": 17 + }, + { + "epoch": 0.144, + "grad_norm": 0.4418895277810478, + "learning_rate": 0.0001934661668437073, + "loss": 0.8382, + "step": 18 + }, + { + "epoch": 0.152, + "grad_norm": 0.42911875816940365, + "learning_rate": 0.0001925116643538684, + "loss": 0.8697, + "step": 19 + }, + { + "epoch": 0.16, + "grad_norm": 0.44258681052455573, + "learning_rate": 0.00019149480258259533, + "loss": 0.8645, + "step": 20 + }, + { + "epoch": 0.168, + "grad_norm": 0.39434812507866013, + "learning_rate": 0.00019041626696528503, + "loss": 0.8074, + "step": 21 + }, + { + "epoch": 0.176, + "grad_norm": 0.38627121086070065, + "learning_rate": 0.0001892767845097864, + "loss": 0.8043, + "step": 22 + }, + { + "epoch": 0.184, + "grad_norm": 0.42792218348914973, + "learning_rate": 0.00018807712330634642, + "loss": 0.8948, + "step": 23 + }, + { + "epoch": 0.192, + "grad_norm": 0.46226754180037904, + "learning_rate": 0.0001868180920098644, + "loss": 0.8565, + "step": 24 + }, + { + "epoch": 0.2, + "grad_norm": 0.5219612120339278, + "learning_rate": 0.00018550053929480202, + "loss": 0.9146, + "step": 25 + }, + { + "epoch": 0.208, + "grad_norm": 0.51761892515566, + "learning_rate": 0.00018412535328311814, + "loss": 0.9529, + "step": 26 + }, + { + "epoch": 0.216, + "grad_norm": 0.44024527482066445, + "learning_rate": 0.0001826934609456129, + "loss": 0.8762, + "step": 27 + }, + { + "epoch": 0.224, + "grad_norm": 0.49535408012521337, + "learning_rate": 0.00018120582747708502, + "loss": 0.8785, + "step": 28 + }, + { + "epoch": 0.232, + "grad_norm": 0.3771750038472647, + "learning_rate": 0.0001796634556457236, + "loss": 0.774, + "step": 29 + }, + { + "epoch": 0.24, + "grad_norm": 0.45040072849609114, + "learning_rate": 0.0001780673851171728, + "loss": 0.8503, + "step": 30 + }, + { + "epoch": 0.248, + "grad_norm": 0.36341393297332764, + "learning_rate": 0.00017641869175372493, + "loss": 0.7418, + "step": 31 + }, + { + "epoch": 0.256, + "grad_norm": 0.4216230059515405, + "learning_rate": 0.00017471848688911464, + "loss": 0.8598, + "step": 32 + }, + { + "epoch": 0.264, + "grad_norm": 0.4128495288664931, + "learning_rate": 0.000172967916579403, + "loss": 0.8267, + "step": 33 + }, + { + "epoch": 0.272, + "grad_norm": 0.3861751233140676, + "learning_rate": 0.00017116816083045602, + "loss": 0.8195, + "step": 34 + }, + { + "epoch": 0.28, + "grad_norm": 0.40977429983509844, + "learning_rate": 0.0001693204328025389, + "loss": 0.8812, + "step": 35 + }, + { + "epoch": 0.288, + "grad_norm": 0.4482435012846371, + "learning_rate": 0.00016742597799256182, + "loss": 0.8215, + "step": 36 + }, + { + "epoch": 0.296, + "grad_norm": 0.4083717573154022, + "learning_rate": 0.00016548607339452853, + "loss": 0.742, + "step": 37 + }, + { + "epoch": 0.304, + "grad_norm": 0.38613912307658893, + "learning_rate": 0.00016350202663875386, + "loss": 0.7913, + "step": 38 + }, + { + "epoch": 0.312, + "grad_norm": 0.3878615972435959, + "learning_rate": 0.0001614751751104301, + "loss": 0.7844, + "step": 39 + }, + { + "epoch": 0.32, + "grad_norm": 0.42355041709215796, + "learning_rate": 0.00015940688504813662, + "loss": 0.8898, + "step": 40 + }, + { + "epoch": 0.328, + "grad_norm": 0.43551615637754165, + "learning_rate": 0.00015729855062290022, + "loss": 0.8392, + "step": 41 + }, + { + "epoch": 0.336, + "grad_norm": 0.47496150335072046, + "learning_rate": 0.00015515159299842707, + "loss": 0.7912, + "step": 42 + }, + { + "epoch": 0.344, + "grad_norm": 0.388663124549581, + "learning_rate": 0.00015296745937313987, + "loss": 0.7692, + "step": 43 + }, + { + "epoch": 0.352, + "grad_norm": 0.4296499380531653, + "learning_rate": 0.00015074762200466556, + "loss": 0.7978, + "step": 44 + }, + { + "epoch": 0.36, + "grad_norm": 0.4649418059917025, + "learning_rate": 0.00014849357721743168, + "loss": 0.8377, + "step": 45 + }, + { + "epoch": 0.368, + "grad_norm": 0.4241895774181597, + "learning_rate": 0.00014620684439403962, + "loss": 0.7789, + "step": 46 + }, + { + "epoch": 0.376, + "grad_norm": 0.4375090382766959, + "learning_rate": 0.0001438889649510956, + "loss": 0.8476, + "step": 47 + }, + { + "epoch": 0.384, + "grad_norm": 0.41651911410568904, + "learning_rate": 0.00014154150130018866, + "loss": 0.7997, + "step": 48 + }, + { + "epoch": 0.392, + "grad_norm": 0.8385320653576406, + "learning_rate": 0.00013916603579471705, + "loss": 0.8468, + "step": 49 + }, + { + "epoch": 0.4, + "grad_norm": 0.4300807980092393, + "learning_rate": 0.000136764169663272, + "loss": 0.7734, + "step": 50 + }, + { + "epoch": 0.408, + "grad_norm": 0.4203570601266618, + "learning_rate": 0.00013433752193029886, + "loss": 0.839, + "step": 51 + }, + { + "epoch": 0.416, + "grad_norm": 0.39408737233710106, + "learning_rate": 0.00013188772832476188, + "loss": 0.7536, + "step": 52 + }, + { + "epoch": 0.424, + "grad_norm": 0.4186307599472403, + "learning_rate": 0.00012941644017754964, + "loss": 0.8182, + "step": 53 + }, + { + "epoch": 0.432, + "grad_norm": 0.40651416697280657, + "learning_rate": 0.00012692532330836346, + "loss": 0.8318, + "step": 54 + }, + { + "epoch": 0.44, + "grad_norm": 0.42229434810654476, + "learning_rate": 0.00012441605690283915, + "loss": 0.8055, + "step": 55 + }, + { + "epoch": 0.448, + "grad_norm": 0.4358531581187814, + "learning_rate": 0.0001218903323806595, + "loss": 0.8357, + "step": 56 + }, + { + "epoch": 0.456, + "grad_norm": 0.40822046618978647, + "learning_rate": 0.00011934985225541998, + "loss": 0.8125, + "step": 57 + }, + { + "epoch": 0.464, + "grad_norm": 0.4235880476040138, + "learning_rate": 0.00011679632898701649, + "loss": 0.8259, + "step": 58 + }, + { + "epoch": 0.472, + "grad_norm": 0.4215673331393117, + "learning_rate": 0.00011423148382732853, + "loss": 0.7784, + "step": 59 + }, + { + "epoch": 0.48, + "grad_norm": 0.4332060167133324, + "learning_rate": 0.00011165704565997593, + "loss": 0.8417, + "step": 60 + }, + { + "epoch": 0.488, + "grad_norm": 0.42219695345437325, + "learning_rate": 0.00010907474983493144, + "loss": 0.8155, + "step": 61 + }, + { + "epoch": 0.496, + "grad_norm": 0.4668267962773362, + "learning_rate": 0.0001064863369987743, + "loss": 0.7763, + "step": 62 + }, + { + "epoch": 0.504, + "grad_norm": 0.46272012154033837, + "learning_rate": 0.00010389355192137377, + "loss": 0.729, + "step": 63 + }, + { + "epoch": 0.512, + "grad_norm": 0.4100359662106381, + "learning_rate": 0.0001012981423197931, + "loss": 0.8168, + "step": 64 + }, + { + "epoch": 0.52, + "grad_norm": 0.3707040323194465, + "learning_rate": 9.870185768020693e-05, + "loss": 0.7297, + "step": 65 + }, + { + "epoch": 0.528, + "grad_norm": 0.40864796522416713, + "learning_rate": 9.610644807862625e-05, + "loss": 0.8137, + "step": 66 + }, + { + "epoch": 0.536, + "grad_norm": 0.3802310662705543, + "learning_rate": 9.35136630012257e-05, + "loss": 0.7679, + "step": 67 + }, + { + "epoch": 0.544, + "grad_norm": 0.4023556002029965, + "learning_rate": 9.092525016506858e-05, + "loss": 0.7852, + "step": 68 + }, + { + "epoch": 0.552, + "grad_norm": 0.3809774110383497, + "learning_rate": 8.83429543400241e-05, + "loss": 0.7833, + "step": 69 + }, + { + "epoch": 0.56, + "grad_norm": 0.45986536929396077, + "learning_rate": 8.57685161726715e-05, + "loss": 0.8506, + "step": 70 + }, + { + "epoch": 0.568, + "grad_norm": 0.41836206851235846, + "learning_rate": 8.320367101298351e-05, + "loss": 0.7764, + "step": 71 + }, + { + "epoch": 0.576, + "grad_norm": 0.4933876256783, + "learning_rate": 8.065014774458003e-05, + "loss": 0.9205, + "step": 72 + }, + { + "epoch": 0.584, + "grad_norm": 0.37665806383923467, + "learning_rate": 7.810966761934053e-05, + "loss": 0.8073, + "step": 73 + }, + { + "epoch": 0.592, + "grad_norm": 0.4036540744178778, + "learning_rate": 7.558394309716088e-05, + "loss": 0.8794, + "step": 74 + }, + { + "epoch": 0.6, + "grad_norm": 0.4697773652413909, + "learning_rate": 7.307467669163655e-05, + "loss": 0.9005, + "step": 75 + }, + { + "epoch": 0.608, + "grad_norm": 0.41368973764806866, + "learning_rate": 7.058355982245037e-05, + "loss": 0.7326, + "step": 76 + }, + { + "epoch": 0.616, + "grad_norm": 0.43645348493034947, + "learning_rate": 6.811227167523815e-05, + "loss": 0.8351, + "step": 77 + }, + { + "epoch": 0.624, + "grad_norm": 0.35867972751751565, + "learning_rate": 6.566247806970119e-05, + "loss": 0.771, + "step": 78 + }, + { + "epoch": 0.632, + "grad_norm": 0.44441133948036937, + "learning_rate": 6.323583033672799e-05, + "loss": 0.8379, + "step": 79 + }, + { + "epoch": 0.64, + "grad_norm": 0.38343098317284874, + "learning_rate": 6.083396420528298e-05, + "loss": 0.8411, + "step": 80 + }, + { + "epoch": 0.648, + "grad_norm": 0.35211736255650594, + "learning_rate": 5.845849869981137e-05, + "loss": 0.725, + "step": 81 + }, + { + "epoch": 0.656, + "grad_norm": 0.4100351022946677, + "learning_rate": 5.611103504890444e-05, + "loss": 0.8126, + "step": 82 + }, + { + "epoch": 0.664, + "grad_norm": 0.38466666766475, + "learning_rate": 5.379315560596038e-05, + "loss": 0.791, + "step": 83 + }, + { + "epoch": 0.672, + "grad_norm": 0.3900399017720507, + "learning_rate": 5.1506422782568345e-05, + "loss": 0.7693, + "step": 84 + }, + { + "epoch": 0.68, + "grad_norm": 0.40836104144601987, + "learning_rate": 4.9252377995334444e-05, + "loss": 0.7799, + "step": 85 + }, + { + "epoch": 0.688, + "grad_norm": 0.40113863945212963, + "learning_rate": 4.703254062686017e-05, + "loss": 0.8244, + "step": 86 + }, + { + "epoch": 0.696, + "grad_norm": 0.44475610746699673, + "learning_rate": 4.484840700157295e-05, + "loss": 0.774, + "step": 87 + }, + { + "epoch": 0.704, + "grad_norm": 0.42967156563682585, + "learning_rate": 4.270144937709981e-05, + "loss": 0.8647, + "step": 88 + }, + { + "epoch": 0.712, + "grad_norm": 0.3496551633293636, + "learning_rate": 4.059311495186338e-05, + "loss": 0.7044, + "step": 89 + }, + { + "epoch": 0.72, + "grad_norm": 0.44015822119884673, + "learning_rate": 3.852482488956992e-05, + "loss": 0.8245, + "step": 90 + }, + { + "epoch": 0.728, + "grad_norm": 0.3942139634871655, + "learning_rate": 3.649797336124615e-05, + "loss": 0.8019, + "step": 91 + }, + { + "epoch": 0.736, + "grad_norm": 0.45959491694921717, + "learning_rate": 3.45139266054715e-05, + "loss": 0.9091, + "step": 92 + }, + { + "epoch": 0.744, + "grad_norm": 0.4022454352542581, + "learning_rate": 3.257402200743821e-05, + "loss": 0.795, + "step": 93 + }, + { + "epoch": 0.752, + "grad_norm": 0.447424677957686, + "learning_rate": 3.0679567197461134e-05, + "loss": 0.8613, + "step": 94 + }, + { + "epoch": 0.76, + "grad_norm": 0.8228159028498044, + "learning_rate": 2.8831839169543996e-05, + "loss": 0.78, + "step": 95 + }, + { + "epoch": 0.768, + "grad_norm": 0.40354773729170573, + "learning_rate": 2.7032083420597e-05, + "loss": 0.8328, + "step": 96 + }, + { + "epoch": 0.776, + "grad_norm": 1.2108866556094844, + "learning_rate": 2.528151311088537e-05, + "loss": 0.7878, + "step": 97 + }, + { + "epoch": 0.784, + "grad_norm": 0.3610193731209496, + "learning_rate": 2.3581308246275103e-05, + "loss": 0.7414, + "step": 98 + }, + { + "epoch": 0.792, + "grad_norm": 0.4567058715426913, + "learning_rate": 2.1932614882827197e-05, + "loss": 0.8132, + "step": 99 + }, + { + "epoch": 0.8, + "grad_norm": 0.5064357508113733, + "learning_rate": 2.03365443542764e-05, + "loss": 0.8063, + "step": 100 + }, + { + "epoch": 0.808, + "grad_norm": 0.3638760992762728, + "learning_rate": 1.879417252291502e-05, + "loss": 0.7419, + "step": 101 + }, + { + "epoch": 0.816, + "grad_norm": 0.40459816038129287, + "learning_rate": 1.730653905438714e-05, + "loss": 0.7848, + "step": 102 + }, + { + "epoch": 0.824, + "grad_norm": 0.4336878568274014, + "learning_rate": 1.587464671688187e-05, + "loss": 0.8357, + "step": 103 + }, + { + "epoch": 0.832, + "grad_norm": 0.37345618595382063, + "learning_rate": 1.4499460705197998e-05, + "loss": 0.7668, + "step": 104 + }, + { + "epoch": 0.84, + "grad_norm": 0.49049743228491344, + "learning_rate": 1.3181907990135622e-05, + "loss": 0.9038, + "step": 105 + }, + { + "epoch": 0.848, + "grad_norm": 0.3759596761592932, + "learning_rate": 1.1922876693653585e-05, + "loss": 0.6944, + "step": 106 + }, + { + "epoch": 0.856, + "grad_norm": 0.4116623409989959, + "learning_rate": 1.0723215490213634e-05, + "loss": 0.7785, + "step": 107 + }, + { + "epoch": 0.864, + "grad_norm": 0.5133833408468815, + "learning_rate": 9.583733034714981e-06, + "loss": 0.7874, + "step": 108 + }, + { + "epoch": 0.872, + "grad_norm": 0.39965353210976334, + "learning_rate": 8.505197417404687e-06, + "loss": 0.7695, + "step": 109 + }, + { + "epoch": 0.88, + "grad_norm": 0.43476499922924694, + "learning_rate": 7.488335646131628e-06, + "loss": 0.8003, + "step": 110 + }, + { + "epoch": 0.888, + "grad_norm": 0.363404507736322, + "learning_rate": 6.533833156292679e-06, + "loss": 0.7537, + "step": 111 + }, + { + "epoch": 0.896, + "grad_norm": 0.38207424823187824, + "learning_rate": 5.6423333488018095e-06, + "loss": 0.7604, + "step": 112 + }, + { + "epoch": 0.904, + "grad_norm": 0.40645940426463006, + "learning_rate": 4.8144371563930476e-06, + "loss": 0.8689, + "step": 113 + }, + { + "epoch": 0.912, + "grad_norm": 0.38990802657425977, + "learning_rate": 4.050702638550275e-06, + "loss": 0.8056, + "step": 114 + }, + { + "epoch": 0.92, + "grad_norm": 0.3606791147269738, + "learning_rate": 3.3516446053363015e-06, + "loss": 0.7541, + "step": 115 + }, + { + "epoch": 0.928, + "grad_norm": 0.3757264538044718, + "learning_rate": 2.717734270375272e-06, + "loss": 0.7306, + "step": 116 + }, + { + "epoch": 0.936, + "grad_norm": 0.371466212528868, + "learning_rate": 2.1493989332218468e-06, + "loss": 0.7826, + "step": 117 + }, + { + "epoch": 0.944, + "grad_norm": 0.3687310415085809, + "learning_rate": 1.6470216913317626e-06, + "loss": 0.7713, + "step": 118 + }, + { + "epoch": 0.952, + "grad_norm": 0.4569668048302852, + "learning_rate": 1.2109411818274852e-06, + "loss": 0.8099, + "step": 119 + }, + { + "epoch": 0.96, + "grad_norm": 0.4528811913299572, + "learning_rate": 8.41451353233369e-07, + "loss": 0.8434, + "step": 120 + }, + { + "epoch": 0.968, + "grad_norm": 0.3577966107509443, + "learning_rate": 5.388012673338661e-07, + "loss": 0.7936, + "step": 121 + }, + { + "epoch": 0.976, + "grad_norm": 0.4989489821260528, + "learning_rate": 3.0319493128866396e-07, + "loss": 0.8581, + "step": 122 + }, + { + "epoch": 0.984, + "grad_norm": 0.38398524622969127, + "learning_rate": 1.3479116011769767e-07, + "loss": 0.7302, + "step": 123 + }, + { + "epoch": 0.992, + "grad_norm": 0.32350981307906573, + "learning_rate": 3.370346964876036e-08, + "loss": 0.7048, + "step": 124 + }, + { + "epoch": 1.0, + "grad_norm": 0.41093063999025503, + "learning_rate": 0.0, + "loss": 0.8109, + "step": 125 + }, + { + "epoch": 1.0, + "step": 125, + "total_flos": 112252921184256.0, + "train_loss": 0.8373198323249816, + "train_runtime": 1963.4796, + "train_samples_per_second": 1.019, + "train_steps_per_second": 0.064 + } + ], + "logging_steps": 1.0, + "max_steps": 125, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 112252921184256.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_1_epochs_1_GA_2_lora/README.md b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_1_epochs_1_GA_2_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_1_epochs_1_GA_2_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_1_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_1_epochs_1_GA_2_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6785ac69d5647fb96903c1ba7432ad49dcaa6fed --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_1_epochs_1_GA_2_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "up_proj", + "q_proj", + "o_proj", + "k_proj", + "gate_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f9ee08b523df0dbe39ad4124231fb5c7f8d786af --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:568dc13a0488d7d30be058a2c0dd7311dd566c7e687ce101b610ba27a191de8c +size 671150064 diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_1_epochs_1_GA_2_lora/config.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_1_epochs_1_GA_2_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_1_epochs_1_GA_2_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..882444ad5675293ffaade1cce09d35c5f089fe4d --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:932a3bcb850e75ed7733b784c557c1300e684bc602034c283d60188fe074fd64 +size 918507402 diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_1_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_1_epochs_1_GA_2_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a57f5ec125f99908cbae5f0de166dd02271e52d4 --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_1_epochs_1_GA_2_lora/trainer_state.json @@ -0,0 +1,13167 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 1875, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005333333333333334, + "grad_norm": 0.7614603133138718, + "learning_rate": 3.5087719298245615e-06, + "loss": 1.2054, + "step": 1 + }, + { + "epoch": 0.0010666666666666667, + "grad_norm": 0.9127747104502665, + "learning_rate": 7.017543859649123e-06, + "loss": 1.4222, + "step": 2 + }, + { + "epoch": 0.0016, + "grad_norm": 0.839285541892957, + "learning_rate": 1.0526315789473684e-05, + "loss": 1.4296, + "step": 3 + }, + { + "epoch": 0.0021333333333333334, + "grad_norm": 0.7672733099016094, + "learning_rate": 1.4035087719298246e-05, + "loss": 1.2582, + "step": 4 + }, + { + "epoch": 0.0026666666666666666, + "grad_norm": 0.6387348905114876, + "learning_rate": 1.7543859649122806e-05, + "loss": 1.1707, + "step": 5 + }, + { + "epoch": 0.0032, + "grad_norm": 0.7582928278378311, + "learning_rate": 2.105263157894737e-05, + "loss": 1.3103, + "step": 6 + }, + { + "epoch": 0.0037333333333333333, + "grad_norm": 0.7192602920897577, + "learning_rate": 2.456140350877193e-05, + "loss": 1.2863, + "step": 7 + }, + { + "epoch": 0.004266666666666667, + "grad_norm": 0.6167677211414274, + "learning_rate": 2.8070175438596492e-05, + "loss": 1.1186, + "step": 8 + }, + { + "epoch": 0.0048, + "grad_norm": 0.6996031262167461, + "learning_rate": 3.157894736842105e-05, + "loss": 1.0992, + "step": 9 + }, + { + "epoch": 0.005333333333333333, + "grad_norm": 0.6042192060485368, + "learning_rate": 3.508771929824561e-05, + "loss": 0.9973, + "step": 10 + }, + { + "epoch": 0.005866666666666667, + "grad_norm": 0.9060558121505383, + "learning_rate": 3.859649122807018e-05, + "loss": 1.2129, + "step": 11 + }, + { + "epoch": 0.0064, + "grad_norm": 0.8495686330600954, + "learning_rate": 4.210526315789474e-05, + "loss": 1.0312, + "step": 12 + }, + { + "epoch": 0.006933333333333333, + "grad_norm": 0.7074201545607064, + "learning_rate": 4.56140350877193e-05, + "loss": 1.0136, + "step": 13 + }, + { + "epoch": 0.007466666666666667, + "grad_norm": 1.1788839348627669, + "learning_rate": 4.912280701754386e-05, + "loss": 1.0729, + "step": 14 + }, + { + "epoch": 0.008, + "grad_norm": 0.6364794656570034, + "learning_rate": 5.2631578947368424e-05, + "loss": 0.9574, + "step": 15 + }, + { + "epoch": 0.008533333333333334, + "grad_norm": 0.5681338682121225, + "learning_rate": 5.6140350877192984e-05, + "loss": 0.9741, + "step": 16 + }, + { + "epoch": 0.009066666666666667, + "grad_norm": 0.6130526671292817, + "learning_rate": 5.9649122807017544e-05, + "loss": 0.9423, + "step": 17 + }, + { + "epoch": 0.0096, + "grad_norm": 0.5563517522802279, + "learning_rate": 6.31578947368421e-05, + "loss": 0.893, + "step": 18 + }, + { + "epoch": 0.010133333333333333, + "grad_norm": 0.6013801587529852, + "learning_rate": 6.666666666666667e-05, + "loss": 0.9249, + "step": 19 + }, + { + "epoch": 0.010666666666666666, + "grad_norm": 0.5865005378899556, + "learning_rate": 7.017543859649122e-05, + "loss": 0.924, + "step": 20 + }, + { + "epoch": 0.0112, + "grad_norm": 0.5174851167888802, + "learning_rate": 7.368421052631579e-05, + "loss": 0.8749, + "step": 21 + }, + { + "epoch": 0.011733333333333333, + "grad_norm": 0.5461881416444898, + "learning_rate": 7.719298245614036e-05, + "loss": 0.9422, + "step": 22 + }, + { + "epoch": 0.012266666666666667, + "grad_norm": 0.4747564234230771, + "learning_rate": 8.070175438596491e-05, + "loss": 0.8907, + "step": 23 + }, + { + "epoch": 0.0128, + "grad_norm": 0.43288426326437696, + "learning_rate": 8.421052631578948e-05, + "loss": 0.837, + "step": 24 + }, + { + "epoch": 0.013333333333333334, + "grad_norm": 0.4962669711011486, + "learning_rate": 8.771929824561403e-05, + "loss": 0.8915, + "step": 25 + }, + { + "epoch": 0.013866666666666666, + "grad_norm": 0.5901579206727451, + "learning_rate": 9.12280701754386e-05, + "loss": 1.0372, + "step": 26 + }, + { + "epoch": 0.0144, + "grad_norm": 0.46045329393216505, + "learning_rate": 9.473684210526316e-05, + "loss": 0.8052, + "step": 27 + }, + { + "epoch": 0.014933333333333333, + "grad_norm": 0.4567183595792635, + "learning_rate": 9.824561403508771e-05, + "loss": 0.822, + "step": 28 + }, + { + "epoch": 0.015466666666666667, + "grad_norm": 0.5127915079330957, + "learning_rate": 0.0001017543859649123, + "loss": 0.8611, + "step": 29 + }, + { + "epoch": 0.016, + "grad_norm": 0.45212541128910144, + "learning_rate": 0.00010526315789473685, + "loss": 0.9031, + "step": 30 + }, + { + "epoch": 0.016533333333333334, + "grad_norm": 0.4301450894070365, + "learning_rate": 0.00010877192982456141, + "loss": 0.8078, + "step": 31 + }, + { + "epoch": 0.017066666666666667, + "grad_norm": 0.5866260600585053, + "learning_rate": 0.00011228070175438597, + "loss": 0.9865, + "step": 32 + }, + { + "epoch": 0.0176, + "grad_norm": 0.4862368145091248, + "learning_rate": 0.00011578947368421053, + "loss": 0.8985, + "step": 33 + }, + { + "epoch": 0.018133333333333335, + "grad_norm": 0.5316394614125118, + "learning_rate": 0.00011929824561403509, + "loss": 0.7971, + "step": 34 + }, + { + "epoch": 0.018666666666666668, + "grad_norm": 0.4837818428894648, + "learning_rate": 0.00012280701754385965, + "loss": 0.8283, + "step": 35 + }, + { + "epoch": 0.0192, + "grad_norm": 0.47387291685763033, + "learning_rate": 0.0001263157894736842, + "loss": 0.8943, + "step": 36 + }, + { + "epoch": 0.019733333333333332, + "grad_norm": 0.527997166673022, + "learning_rate": 0.0001298245614035088, + "loss": 0.906, + "step": 37 + }, + { + "epoch": 0.020266666666666665, + "grad_norm": 0.48495165100397924, + "learning_rate": 0.00013333333333333334, + "loss": 0.8542, + "step": 38 + }, + { + "epoch": 0.0208, + "grad_norm": 0.48161554925204597, + "learning_rate": 0.0001368421052631579, + "loss": 0.8485, + "step": 39 + }, + { + "epoch": 0.021333333333333333, + "grad_norm": 0.48539144350667346, + "learning_rate": 0.00014035087719298245, + "loss": 0.944, + "step": 40 + }, + { + "epoch": 0.021866666666666666, + "grad_norm": 0.505239821205383, + "learning_rate": 0.00014385964912280703, + "loss": 0.867, + "step": 41 + }, + { + "epoch": 0.0224, + "grad_norm": 0.46348589602511586, + "learning_rate": 0.00014736842105263158, + "loss": 0.8555, + "step": 42 + }, + { + "epoch": 0.022933333333333333, + "grad_norm": 0.4527428066313504, + "learning_rate": 0.00015087719298245616, + "loss": 0.8377, + "step": 43 + }, + { + "epoch": 0.023466666666666667, + "grad_norm": 0.4624001837932464, + "learning_rate": 0.0001543859649122807, + "loss": 0.9011, + "step": 44 + }, + { + "epoch": 0.024, + "grad_norm": 0.49908799700671475, + "learning_rate": 0.00015789473684210527, + "loss": 0.8382, + "step": 45 + }, + { + "epoch": 0.024533333333333334, + "grad_norm": 0.4355107141235247, + "learning_rate": 0.00016140350877192982, + "loss": 0.7862, + "step": 46 + }, + { + "epoch": 0.025066666666666668, + "grad_norm": 0.4386913869679193, + "learning_rate": 0.0001649122807017544, + "loss": 0.8395, + "step": 47 + }, + { + "epoch": 0.0256, + "grad_norm": 0.4541431792135339, + "learning_rate": 0.00016842105263157895, + "loss": 0.8412, + "step": 48 + }, + { + "epoch": 0.026133333333333335, + "grad_norm": 0.46651404808286145, + "learning_rate": 0.00017192982456140353, + "loss": 0.8349, + "step": 49 + }, + { + "epoch": 0.02666666666666667, + "grad_norm": 0.5648871754628285, + "learning_rate": 0.00017543859649122806, + "loss": 0.9897, + "step": 50 + }, + { + "epoch": 0.0272, + "grad_norm": 0.4384363421914036, + "learning_rate": 0.00017894736842105264, + "loss": 0.8179, + "step": 51 + }, + { + "epoch": 0.027733333333333332, + "grad_norm": 0.45413202194745417, + "learning_rate": 0.0001824561403508772, + "loss": 0.7779, + "step": 52 + }, + { + "epoch": 0.028266666666666666, + "grad_norm": 0.4857043747858801, + "learning_rate": 0.00018596491228070177, + "loss": 0.8535, + "step": 53 + }, + { + "epoch": 0.0288, + "grad_norm": 0.4719582644464707, + "learning_rate": 0.00018947368421052632, + "loss": 0.8999, + "step": 54 + }, + { + "epoch": 0.029333333333333333, + "grad_norm": 0.42847542026076446, + "learning_rate": 0.00019298245614035088, + "loss": 0.7584, + "step": 55 + }, + { + "epoch": 0.029866666666666666, + "grad_norm": 0.5295530041959768, + "learning_rate": 0.00019649122807017543, + "loss": 0.8573, + "step": 56 + }, + { + "epoch": 0.0304, + "grad_norm": 0.49939356616520253, + "learning_rate": 0.0002, + "loss": 0.9823, + "step": 57 + }, + { + "epoch": 0.030933333333333334, + "grad_norm": 0.4588501178491851, + "learning_rate": 0.00019999985069241055, + "loss": 0.8502, + "step": 58 + }, + { + "epoch": 0.031466666666666664, + "grad_norm": 0.44087109587454565, + "learning_rate": 0.00019999940277008808, + "loss": 0.7886, + "step": 59 + }, + { + "epoch": 0.032, + "grad_norm": 0.4622585621540621, + "learning_rate": 0.00019999865623437013, + "loss": 0.8248, + "step": 60 + }, + { + "epoch": 0.03253333333333333, + "grad_norm": 0.4436021478832358, + "learning_rate": 0.00019999761108748597, + "loss": 0.7526, + "step": 61 + }, + { + "epoch": 0.03306666666666667, + "grad_norm": 0.7233960848577827, + "learning_rate": 0.00019999626733255662, + "loss": 0.8258, + "step": 62 + }, + { + "epoch": 0.0336, + "grad_norm": 0.3961220197860444, + "learning_rate": 0.00019999462497359466, + "loss": 0.7528, + "step": 63 + }, + { + "epoch": 0.034133333333333335, + "grad_norm": 0.4814306656556086, + "learning_rate": 0.00019999268401550447, + "loss": 0.8285, + "step": 64 + }, + { + "epoch": 0.034666666666666665, + "grad_norm": 0.47322986486209684, + "learning_rate": 0.000199990444464082, + "loss": 0.901, + "step": 65 + }, + { + "epoch": 0.0352, + "grad_norm": 0.38116839724000634, + "learning_rate": 0.00019998790632601496, + "loss": 0.7233, + "step": 66 + }, + { + "epoch": 0.03573333333333333, + "grad_norm": 0.4660356984106766, + "learning_rate": 0.00019998506960888256, + "loss": 0.8805, + "step": 67 + }, + { + "epoch": 0.03626666666666667, + "grad_norm": 0.48008173674551324, + "learning_rate": 0.00019998193432115572, + "loss": 0.8075, + "step": 68 + }, + { + "epoch": 0.0368, + "grad_norm": 0.43163708053698957, + "learning_rate": 0.0001999785004721968, + "loss": 0.8487, + "step": 69 + }, + { + "epoch": 0.037333333333333336, + "grad_norm": 0.46084182108467253, + "learning_rate": 0.00019997476807225985, + "loss": 0.7633, + "step": 70 + }, + { + "epoch": 0.037866666666666667, + "grad_norm": 0.4725108721087422, + "learning_rate": 0.0001999707371324904, + "loss": 0.8055, + "step": 71 + }, + { + "epoch": 0.0384, + "grad_norm": 0.4397202059126982, + "learning_rate": 0.00019996640766492543, + "loss": 0.7986, + "step": 72 + }, + { + "epoch": 0.038933333333333334, + "grad_norm": 0.43448440899059, + "learning_rate": 0.00019996177968249334, + "loss": 0.8379, + "step": 73 + }, + { + "epoch": 0.039466666666666664, + "grad_norm": 0.48113391072739375, + "learning_rate": 0.0001999568531990141, + "loss": 0.9333, + "step": 74 + }, + { + "epoch": 0.04, + "grad_norm": 0.43618973002024286, + "learning_rate": 0.00019995162822919883, + "loss": 0.8005, + "step": 75 + }, + { + "epoch": 0.04053333333333333, + "grad_norm": 0.4871652179681046, + "learning_rate": 0.00019994610478865011, + "loss": 0.8683, + "step": 76 + }, + { + "epoch": 0.04106666666666667, + "grad_norm": 0.4446358368823649, + "learning_rate": 0.0001999402828938618, + "loss": 0.8502, + "step": 77 + }, + { + "epoch": 0.0416, + "grad_norm": 0.4248813501781938, + "learning_rate": 0.00019993416256221895, + "loss": 0.8766, + "step": 78 + }, + { + "epoch": 0.042133333333333335, + "grad_norm": 0.46175056141708515, + "learning_rate": 0.00019992774381199778, + "loss": 0.7788, + "step": 79 + }, + { + "epoch": 0.042666666666666665, + "grad_norm": 0.4459040827699482, + "learning_rate": 0.00019992102666236566, + "loss": 0.8855, + "step": 80 + }, + { + "epoch": 0.0432, + "grad_norm": 0.43452898585972144, + "learning_rate": 0.00019991401113338104, + "loss": 0.7885, + "step": 81 + }, + { + "epoch": 0.04373333333333333, + "grad_norm": 0.42858107910034143, + "learning_rate": 0.00019990669724599336, + "loss": 0.8269, + "step": 82 + }, + { + "epoch": 0.04426666666666667, + "grad_norm": 0.43497214484723523, + "learning_rate": 0.00019989908502204292, + "loss": 0.8486, + "step": 83 + }, + { + "epoch": 0.0448, + "grad_norm": 0.424602618688084, + "learning_rate": 0.00019989117448426108, + "loss": 0.7366, + "step": 84 + }, + { + "epoch": 0.04533333333333334, + "grad_norm": 0.5237333285609881, + "learning_rate": 0.00019988296565626987, + "loss": 0.9504, + "step": 85 + }, + { + "epoch": 0.04586666666666667, + "grad_norm": 0.5282217133631587, + "learning_rate": 0.00019987445856258206, + "loss": 0.8777, + "step": 86 + }, + { + "epoch": 0.0464, + "grad_norm": 0.45125628762546255, + "learning_rate": 0.00019986565322860115, + "loss": 0.8052, + "step": 87 + }, + { + "epoch": 0.046933333333333334, + "grad_norm": 0.4478849100987688, + "learning_rate": 0.00019985654968062122, + "loss": 0.824, + "step": 88 + }, + { + "epoch": 0.047466666666666664, + "grad_norm": 0.5348709935122913, + "learning_rate": 0.00019984714794582683, + "loss": 0.8655, + "step": 89 + }, + { + "epoch": 0.048, + "grad_norm": 0.39791200867162885, + "learning_rate": 0.00019983744805229296, + "loss": 0.7754, + "step": 90 + }, + { + "epoch": 0.04853333333333333, + "grad_norm": 0.44945064830262604, + "learning_rate": 0.000199827450028985, + "loss": 0.784, + "step": 91 + }, + { + "epoch": 0.04906666666666667, + "grad_norm": 0.48213644490370783, + "learning_rate": 0.00019981715390575858, + "loss": 0.7894, + "step": 92 + }, + { + "epoch": 0.0496, + "grad_norm": 0.45037742809641995, + "learning_rate": 0.00019980655971335945, + "loss": 0.8057, + "step": 93 + }, + { + "epoch": 0.050133333333333335, + "grad_norm": 0.44421092222784353, + "learning_rate": 0.00019979566748342347, + "loss": 0.813, + "step": 94 + }, + { + "epoch": 0.050666666666666665, + "grad_norm": 0.4147330635700553, + "learning_rate": 0.00019978447724847652, + "loss": 0.7554, + "step": 95 + }, + { + "epoch": 0.0512, + "grad_norm": 0.4361018690916765, + "learning_rate": 0.00019977298904193437, + "loss": 0.8233, + "step": 96 + }, + { + "epoch": 0.05173333333333333, + "grad_norm": 0.4548187706211488, + "learning_rate": 0.00019976120289810247, + "loss": 0.8032, + "step": 97 + }, + { + "epoch": 0.05226666666666667, + "grad_norm": 0.49615822470903287, + "learning_rate": 0.00019974911885217608, + "loss": 0.8775, + "step": 98 + }, + { + "epoch": 0.0528, + "grad_norm": 0.43517490620884064, + "learning_rate": 0.00019973673694024, + "loss": 0.8126, + "step": 99 + }, + { + "epoch": 0.05333333333333334, + "grad_norm": 0.4189323002296131, + "learning_rate": 0.0001997240571992685, + "loss": 0.7906, + "step": 100 + }, + { + "epoch": 0.05386666666666667, + "grad_norm": 0.5545137024772776, + "learning_rate": 0.00019971107966712518, + "loss": 0.8504, + "step": 101 + }, + { + "epoch": 0.0544, + "grad_norm": 0.49566000330799753, + "learning_rate": 0.00019969780438256293, + "loss": 0.8578, + "step": 102 + }, + { + "epoch": 0.054933333333333334, + "grad_norm": 0.6582761803171807, + "learning_rate": 0.0001996842313852238, + "loss": 0.8038, + "step": 103 + }, + { + "epoch": 0.055466666666666664, + "grad_norm": 0.41933245789817597, + "learning_rate": 0.00019967036071563877, + "loss": 0.8217, + "step": 104 + }, + { + "epoch": 0.056, + "grad_norm": 0.45876305310028254, + "learning_rate": 0.0001996561924152278, + "loss": 0.8069, + "step": 105 + }, + { + "epoch": 0.05653333333333333, + "grad_norm": 0.4481832891439225, + "learning_rate": 0.0001996417265262996, + "loss": 0.7822, + "step": 106 + }, + { + "epoch": 0.05706666666666667, + "grad_norm": 0.4002701266887735, + "learning_rate": 0.00019962696309205148, + "loss": 0.7476, + "step": 107 + }, + { + "epoch": 0.0576, + "grad_norm": 0.4251044019248547, + "learning_rate": 0.0001996119021565693, + "loss": 0.7378, + "step": 108 + }, + { + "epoch": 0.058133333333333335, + "grad_norm": 0.4482236948746077, + "learning_rate": 0.0001995965437648273, + "loss": 0.7619, + "step": 109 + }, + { + "epoch": 0.058666666666666666, + "grad_norm": 0.3942240120579654, + "learning_rate": 0.00019958088796268793, + "loss": 0.7811, + "step": 110 + }, + { + "epoch": 0.0592, + "grad_norm": 0.4285156667145782, + "learning_rate": 0.0001995649347969019, + "loss": 0.7162, + "step": 111 + }, + { + "epoch": 0.05973333333333333, + "grad_norm": 0.46219900495041816, + "learning_rate": 0.00019954868431510764, + "loss": 0.7972, + "step": 112 + }, + { + "epoch": 0.06026666666666667, + "grad_norm": 0.43503169376098205, + "learning_rate": 0.00019953213656583168, + "loss": 0.8306, + "step": 113 + }, + { + "epoch": 0.0608, + "grad_norm": 0.4532641384418678, + "learning_rate": 0.00019951529159848805, + "loss": 0.858, + "step": 114 + }, + { + "epoch": 0.06133333333333333, + "grad_norm": 0.5123681549994666, + "learning_rate": 0.00019949814946337838, + "loss": 0.9013, + "step": 115 + }, + { + "epoch": 0.06186666666666667, + "grad_norm": 0.4393644420012338, + "learning_rate": 0.00019948071021169174, + "loss": 0.8141, + "step": 116 + }, + { + "epoch": 0.0624, + "grad_norm": 0.38899172381386843, + "learning_rate": 0.00019946297389550433, + "loss": 0.6798, + "step": 117 + }, + { + "epoch": 0.06293333333333333, + "grad_norm": 0.44230739320439016, + "learning_rate": 0.00019944494056777946, + "loss": 0.733, + "step": 118 + }, + { + "epoch": 0.06346666666666667, + "grad_norm": 0.5149114930756535, + "learning_rate": 0.00019942661028236745, + "loss": 0.8444, + "step": 119 + }, + { + "epoch": 0.064, + "grad_norm": 0.4485062818049623, + "learning_rate": 0.00019940798309400526, + "loss": 0.7891, + "step": 120 + }, + { + "epoch": 0.06453333333333333, + "grad_norm": 0.4636467000362323, + "learning_rate": 0.00019938905905831654, + "loss": 0.8849, + "step": 121 + }, + { + "epoch": 0.06506666666666666, + "grad_norm": 0.40070958153035385, + "learning_rate": 0.00019936983823181132, + "loss": 0.7495, + "step": 122 + }, + { + "epoch": 0.0656, + "grad_norm": 0.4268024210922533, + "learning_rate": 0.0001993503206718859, + "loss": 0.7842, + "step": 123 + }, + { + "epoch": 0.06613333333333334, + "grad_norm": 0.44017457248078234, + "learning_rate": 0.00019933050643682269, + "loss": 0.7507, + "step": 124 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 0.4902090345072743, + "learning_rate": 0.00019931039558578997, + "loss": 0.8333, + "step": 125 + }, + { + "epoch": 0.0672, + "grad_norm": 0.4015997691801862, + "learning_rate": 0.00019928998817884182, + "loss": 0.7465, + "step": 126 + }, + { + "epoch": 0.06773333333333334, + "grad_norm": 0.3998053633531983, + "learning_rate": 0.00019926928427691786, + "loss": 0.7843, + "step": 127 + }, + { + "epoch": 0.06826666666666667, + "grad_norm": 0.48673822688688767, + "learning_rate": 0.00019924828394184306, + "loss": 0.8761, + "step": 128 + }, + { + "epoch": 0.0688, + "grad_norm": 0.40929288391082347, + "learning_rate": 0.00019922698723632767, + "loss": 0.7823, + "step": 129 + }, + { + "epoch": 0.06933333333333333, + "grad_norm": 0.4408492375250808, + "learning_rate": 0.0001992053942239668, + "loss": 0.8763, + "step": 130 + }, + { + "epoch": 0.06986666666666666, + "grad_norm": 0.3877134360544658, + "learning_rate": 0.0001991835049692405, + "loss": 0.7505, + "step": 131 + }, + { + "epoch": 0.0704, + "grad_norm": 0.4271539457232771, + "learning_rate": 0.00019916131953751342, + "loss": 0.7919, + "step": 132 + }, + { + "epoch": 0.07093333333333333, + "grad_norm": 0.4794108836221678, + "learning_rate": 0.0001991388379950346, + "loss": 0.8133, + "step": 133 + }, + { + "epoch": 0.07146666666666666, + "grad_norm": 0.4969442947898306, + "learning_rate": 0.0001991160604089374, + "loss": 0.7914, + "step": 134 + }, + { + "epoch": 0.072, + "grad_norm": 0.48217440979886395, + "learning_rate": 0.00019909298684723904, + "loss": 0.8682, + "step": 135 + }, + { + "epoch": 0.07253333333333334, + "grad_norm": 0.41120601922700956, + "learning_rate": 0.00019906961737884077, + "loss": 0.7721, + "step": 136 + }, + { + "epoch": 0.07306666666666667, + "grad_norm": 0.46070134695490805, + "learning_rate": 0.00019904595207352737, + "loss": 0.8376, + "step": 137 + }, + { + "epoch": 0.0736, + "grad_norm": 0.44083539879169376, + "learning_rate": 0.00019902199100196697, + "loss": 0.8188, + "step": 138 + }, + { + "epoch": 0.07413333333333333, + "grad_norm": 0.4276897943709089, + "learning_rate": 0.000198997734235711, + "loss": 0.7807, + "step": 139 + }, + { + "epoch": 0.07466666666666667, + "grad_norm": 0.44924721877515844, + "learning_rate": 0.00019897318184719385, + "loss": 0.8032, + "step": 140 + }, + { + "epoch": 0.0752, + "grad_norm": 0.4221092408881946, + "learning_rate": 0.00019894833390973266, + "loss": 0.7598, + "step": 141 + }, + { + "epoch": 0.07573333333333333, + "grad_norm": 0.41134688433967886, + "learning_rate": 0.0001989231904975272, + "loss": 0.7283, + "step": 142 + }, + { + "epoch": 0.07626666666666666, + "grad_norm": 0.41291215830679434, + "learning_rate": 0.00019889775168565943, + "loss": 0.7853, + "step": 143 + }, + { + "epoch": 0.0768, + "grad_norm": 0.40653646989488545, + "learning_rate": 0.00019887201755009357, + "loss": 0.7657, + "step": 144 + }, + { + "epoch": 0.07733333333333334, + "grad_norm": 0.38656478464242355, + "learning_rate": 0.00019884598816767563, + "loss": 0.7072, + "step": 145 + }, + { + "epoch": 0.07786666666666667, + "grad_norm": 0.39581179446802794, + "learning_rate": 0.0001988196636161333, + "loss": 0.7828, + "step": 146 + }, + { + "epoch": 0.0784, + "grad_norm": 0.4471986990104352, + "learning_rate": 0.0001987930439740757, + "loss": 0.7872, + "step": 147 + }, + { + "epoch": 0.07893333333333333, + "grad_norm": 0.4580108314655394, + "learning_rate": 0.00019876612932099308, + "loss": 0.8182, + "step": 148 + }, + { + "epoch": 0.07946666666666667, + "grad_norm": 0.46925373163903356, + "learning_rate": 0.0001987389197372567, + "loss": 0.7852, + "step": 149 + }, + { + "epoch": 0.08, + "grad_norm": 0.4265013835353865, + "learning_rate": 0.00019871141530411853, + "loss": 0.7921, + "step": 150 + }, + { + "epoch": 0.08053333333333333, + "grad_norm": 0.4005929185954514, + "learning_rate": 0.00019868361610371097, + "loss": 0.786, + "step": 151 + }, + { + "epoch": 0.08106666666666666, + "grad_norm": 0.45260132174087747, + "learning_rate": 0.00019865552221904665, + "loss": 0.8836, + "step": 152 + }, + { + "epoch": 0.0816, + "grad_norm": 0.37954072790512317, + "learning_rate": 0.0001986271337340182, + "loss": 0.7182, + "step": 153 + }, + { + "epoch": 0.08213333333333334, + "grad_norm": 0.4522102056887562, + "learning_rate": 0.00019859845073339787, + "loss": 0.8447, + "step": 154 + }, + { + "epoch": 0.08266666666666667, + "grad_norm": 0.5369238359841023, + "learning_rate": 0.00019856947330283752, + "loss": 0.8587, + "step": 155 + }, + { + "epoch": 0.0832, + "grad_norm": 0.45061713714114243, + "learning_rate": 0.00019854020152886814, + "loss": 0.8492, + "step": 156 + }, + { + "epoch": 0.08373333333333334, + "grad_norm": 0.3965762218434296, + "learning_rate": 0.0001985106354988997, + "loss": 0.7762, + "step": 157 + }, + { + "epoch": 0.08426666666666667, + "grad_norm": 0.43393098138537767, + "learning_rate": 0.00019848077530122083, + "loss": 0.8346, + "step": 158 + }, + { + "epoch": 0.0848, + "grad_norm": 0.4222125134860982, + "learning_rate": 0.0001984506210249986, + "loss": 0.7854, + "step": 159 + }, + { + "epoch": 0.08533333333333333, + "grad_norm": 0.4327902552736384, + "learning_rate": 0.00019842017276027832, + "loss": 0.815, + "step": 160 + }, + { + "epoch": 0.08586666666666666, + "grad_norm": 0.4395089818254555, + "learning_rate": 0.00019838943059798304, + "loss": 0.8101, + "step": 161 + }, + { + "epoch": 0.0864, + "grad_norm": 0.4301132582767848, + "learning_rate": 0.00019835839462991361, + "loss": 0.7872, + "step": 162 + }, + { + "epoch": 0.08693333333333333, + "grad_norm": 0.4941306433943535, + "learning_rate": 0.0001983270649487481, + "loss": 0.861, + "step": 163 + }, + { + "epoch": 0.08746666666666666, + "grad_norm": 0.5003393724639728, + "learning_rate": 0.0001982954416480417, + "loss": 0.7904, + "step": 164 + }, + { + "epoch": 0.088, + "grad_norm": 0.4959199719206625, + "learning_rate": 0.00019826352482222638, + "loss": 0.838, + "step": 165 + }, + { + "epoch": 0.08853333333333334, + "grad_norm": 0.4606645442730648, + "learning_rate": 0.00019823131456661063, + "loss": 0.8272, + "step": 166 + }, + { + "epoch": 0.08906666666666667, + "grad_norm": 0.5755397880461268, + "learning_rate": 0.00019819881097737915, + "loss": 0.9147, + "step": 167 + }, + { + "epoch": 0.0896, + "grad_norm": 0.46518023513909984, + "learning_rate": 0.00019816601415159263, + "loss": 0.8505, + "step": 168 + }, + { + "epoch": 0.09013333333333333, + "grad_norm": 0.41777269290883634, + "learning_rate": 0.00019813292418718732, + "loss": 0.8053, + "step": 169 + }, + { + "epoch": 0.09066666666666667, + "grad_norm": 0.4079364409236901, + "learning_rate": 0.0001980995411829749, + "loss": 0.7325, + "step": 170 + }, + { + "epoch": 0.0912, + "grad_norm": 0.4635173524945271, + "learning_rate": 0.0001980658652386421, + "loss": 0.8913, + "step": 171 + }, + { + "epoch": 0.09173333333333333, + "grad_norm": 0.43020478512779425, + "learning_rate": 0.0001980318964547504, + "loss": 0.79, + "step": 172 + }, + { + "epoch": 0.09226666666666666, + "grad_norm": 0.4922312608876827, + "learning_rate": 0.0001979976349327357, + "loss": 0.844, + "step": 173 + }, + { + "epoch": 0.0928, + "grad_norm": 0.4134610957125838, + "learning_rate": 0.00019796308077490817, + "loss": 0.7729, + "step": 174 + }, + { + "epoch": 0.09333333333333334, + "grad_norm": 0.5264752394888129, + "learning_rate": 0.00019792823408445174, + "loss": 0.8994, + "step": 175 + }, + { + "epoch": 0.09386666666666667, + "grad_norm": 0.501755351477428, + "learning_rate": 0.0001978930949654239, + "loss": 0.8309, + "step": 176 + }, + { + "epoch": 0.0944, + "grad_norm": 0.4656418181692643, + "learning_rate": 0.00019785766352275542, + "loss": 0.8631, + "step": 177 + }, + { + "epoch": 0.09493333333333333, + "grad_norm": 0.4500200556366832, + "learning_rate": 0.00019782193986224995, + "loss": 0.8611, + "step": 178 + }, + { + "epoch": 0.09546666666666667, + "grad_norm": 0.40695089580446, + "learning_rate": 0.00019778592409058378, + "loss": 0.7457, + "step": 179 + }, + { + "epoch": 0.096, + "grad_norm": 0.43254424843847755, + "learning_rate": 0.00019774961631530545, + "loss": 0.8365, + "step": 180 + }, + { + "epoch": 0.09653333333333333, + "grad_norm": 0.38460837023161293, + "learning_rate": 0.0001977130166448355, + "loss": 0.7343, + "step": 181 + }, + { + "epoch": 0.09706666666666666, + "grad_norm": 0.4331189167109147, + "learning_rate": 0.00019767612518846608, + "loss": 0.7382, + "step": 182 + }, + { + "epoch": 0.0976, + "grad_norm": 0.4504243679472576, + "learning_rate": 0.00019763894205636072, + "loss": 0.818, + "step": 183 + }, + { + "epoch": 0.09813333333333334, + "grad_norm": 0.4610839042953873, + "learning_rate": 0.00019760146735955388, + "loss": 0.7838, + "step": 184 + }, + { + "epoch": 0.09866666666666667, + "grad_norm": 0.45098999422050184, + "learning_rate": 0.00019756370120995066, + "loss": 0.8295, + "step": 185 + }, + { + "epoch": 0.0992, + "grad_norm": 0.4405463454412497, + "learning_rate": 0.00019752564372032657, + "loss": 0.7709, + "step": 186 + }, + { + "epoch": 0.09973333333333333, + "grad_norm": 0.42813823129016565, + "learning_rate": 0.000197487295004327, + "loss": 0.7631, + "step": 187 + }, + { + "epoch": 0.10026666666666667, + "grad_norm": 0.5041895967649391, + "learning_rate": 0.00019744865517646706, + "loss": 0.8779, + "step": 188 + }, + { + "epoch": 0.1008, + "grad_norm": 0.4660587382794085, + "learning_rate": 0.00019740972435213115, + "loss": 0.7709, + "step": 189 + }, + { + "epoch": 0.10133333333333333, + "grad_norm": 0.45041274116588015, + "learning_rate": 0.0001973705026475726, + "loss": 0.8405, + "step": 190 + }, + { + "epoch": 0.10186666666666666, + "grad_norm": 0.42737248919635235, + "learning_rate": 0.00019733099017991341, + "loss": 0.7928, + "step": 191 + }, + { + "epoch": 0.1024, + "grad_norm": 0.4344179415196128, + "learning_rate": 0.00019729118706714375, + "loss": 0.7602, + "step": 192 + }, + { + "epoch": 0.10293333333333334, + "grad_norm": 0.47364046417924766, + "learning_rate": 0.0001972510934281218, + "loss": 0.8361, + "step": 193 + }, + { + "epoch": 0.10346666666666667, + "grad_norm": 0.4672841088278783, + "learning_rate": 0.00019721070938257324, + "loss": 0.8234, + "step": 194 + }, + { + "epoch": 0.104, + "grad_norm": 0.4325747981059211, + "learning_rate": 0.00019717003505109095, + "loss": 0.7891, + "step": 195 + }, + { + "epoch": 0.10453333333333334, + "grad_norm": 0.48082654673640257, + "learning_rate": 0.0001971290705551347, + "loss": 0.7887, + "step": 196 + }, + { + "epoch": 0.10506666666666667, + "grad_norm": 0.42110747240245316, + "learning_rate": 0.00019708781601703065, + "loss": 0.7707, + "step": 197 + }, + { + "epoch": 0.1056, + "grad_norm": 0.4522076625082216, + "learning_rate": 0.00019704627155997108, + "loss": 0.7181, + "step": 198 + }, + { + "epoch": 0.10613333333333333, + "grad_norm": 0.4596901115843113, + "learning_rate": 0.00019700443730801413, + "loss": 0.8409, + "step": 199 + }, + { + "epoch": 0.10666666666666667, + "grad_norm": 0.45036275626350614, + "learning_rate": 0.00019696231338608316, + "loss": 0.8092, + "step": 200 + }, + { + "epoch": 0.1072, + "grad_norm": 0.41744518461348945, + "learning_rate": 0.00019691989991996663, + "loss": 0.7511, + "step": 201 + }, + { + "epoch": 0.10773333333333333, + "grad_norm": 0.4835766664050528, + "learning_rate": 0.00019687719703631755, + "loss": 0.8132, + "step": 202 + }, + { + "epoch": 0.10826666666666666, + "grad_norm": 0.46211166533568315, + "learning_rate": 0.00019683420486265327, + "loss": 0.7628, + "step": 203 + }, + { + "epoch": 0.1088, + "grad_norm": 0.4546632477213524, + "learning_rate": 0.0001967909235273549, + "loss": 0.7828, + "step": 204 + }, + { + "epoch": 0.10933333333333334, + "grad_norm": 0.456838499007109, + "learning_rate": 0.0001967473531596671, + "loss": 0.789, + "step": 205 + }, + { + "epoch": 0.10986666666666667, + "grad_norm": 0.48846667241796904, + "learning_rate": 0.0001967034938896976, + "loss": 0.7639, + "step": 206 + }, + { + "epoch": 0.1104, + "grad_norm": 0.4320391164167524, + "learning_rate": 0.00019665934584841682, + "loss": 0.8361, + "step": 207 + }, + { + "epoch": 0.11093333333333333, + "grad_norm": 0.41527293292274986, + "learning_rate": 0.0001966149091676575, + "loss": 0.7995, + "step": 208 + }, + { + "epoch": 0.11146666666666667, + "grad_norm": 0.512836433667297, + "learning_rate": 0.00019657018398011434, + "loss": 0.8323, + "step": 209 + }, + { + "epoch": 0.112, + "grad_norm": 0.4237823379933958, + "learning_rate": 0.00019652517041934356, + "loss": 0.7709, + "step": 210 + }, + { + "epoch": 0.11253333333333333, + "grad_norm": 0.5352301258587794, + "learning_rate": 0.00019647986861976246, + "loss": 0.8263, + "step": 211 + }, + { + "epoch": 0.11306666666666666, + "grad_norm": 0.4231418182692326, + "learning_rate": 0.0001964342787166491, + "loss": 0.7834, + "step": 212 + }, + { + "epoch": 0.1136, + "grad_norm": 0.45272515603161934, + "learning_rate": 0.00019638840084614182, + "loss": 0.7671, + "step": 213 + }, + { + "epoch": 0.11413333333333334, + "grad_norm": 0.3920941873110034, + "learning_rate": 0.0001963422351452389, + "loss": 0.7368, + "step": 214 + }, + { + "epoch": 0.11466666666666667, + "grad_norm": 0.49631245516639083, + "learning_rate": 0.0001962957817517982, + "loss": 0.8451, + "step": 215 + }, + { + "epoch": 0.1152, + "grad_norm": 0.44609129602396586, + "learning_rate": 0.00019624904080453655, + "loss": 0.8309, + "step": 216 + }, + { + "epoch": 0.11573333333333333, + "grad_norm": 0.4022816924207915, + "learning_rate": 0.00019620201244302952, + "loss": 0.7527, + "step": 217 + }, + { + "epoch": 0.11626666666666667, + "grad_norm": 0.478976632328823, + "learning_rate": 0.00019615469680771096, + "loss": 0.7908, + "step": 218 + }, + { + "epoch": 0.1168, + "grad_norm": 0.45135239518940673, + "learning_rate": 0.00019610709403987246, + "loss": 0.7858, + "step": 219 + }, + { + "epoch": 0.11733333333333333, + "grad_norm": 0.4358801136556032, + "learning_rate": 0.00019605920428166323, + "loss": 0.8212, + "step": 220 + }, + { + "epoch": 0.11786666666666666, + "grad_norm": 0.39111685562574505, + "learning_rate": 0.00019601102767608923, + "loss": 0.8156, + "step": 221 + }, + { + "epoch": 0.1184, + "grad_norm": 0.4797254303453361, + "learning_rate": 0.00019596256436701324, + "loss": 0.8835, + "step": 222 + }, + { + "epoch": 0.11893333333333334, + "grad_norm": 0.37436051437624385, + "learning_rate": 0.00019591381449915397, + "loss": 0.7374, + "step": 223 + }, + { + "epoch": 0.11946666666666667, + "grad_norm": 0.42537579894871347, + "learning_rate": 0.00019586477821808597, + "loss": 0.8072, + "step": 224 + }, + { + "epoch": 0.12, + "grad_norm": 0.4347633584472628, + "learning_rate": 0.000195815455670239, + "loss": 0.8094, + "step": 225 + }, + { + "epoch": 0.12053333333333334, + "grad_norm": 0.42567851321973016, + "learning_rate": 0.00019576584700289768, + "loss": 0.8078, + "step": 226 + }, + { + "epoch": 0.12106666666666667, + "grad_norm": 0.41827443110157314, + "learning_rate": 0.00019571595236420102, + "loss": 0.7221, + "step": 227 + }, + { + "epoch": 0.1216, + "grad_norm": 0.4259899522879917, + "learning_rate": 0.00019566577190314197, + "loss": 0.7287, + "step": 228 + }, + { + "epoch": 0.12213333333333333, + "grad_norm": 0.4608962828880691, + "learning_rate": 0.00019561530576956703, + "loss": 0.7996, + "step": 229 + }, + { + "epoch": 0.12266666666666666, + "grad_norm": 0.4676139355530056, + "learning_rate": 0.00019556455411417573, + "loss": 0.8287, + "step": 230 + }, + { + "epoch": 0.1232, + "grad_norm": 0.4210394577520858, + "learning_rate": 0.0001955135170885202, + "loss": 0.7907, + "step": 231 + }, + { + "epoch": 0.12373333333333333, + "grad_norm": 0.46891173284904897, + "learning_rate": 0.00019546219484500475, + "loss": 0.8668, + "step": 232 + }, + { + "epoch": 0.12426666666666666, + "grad_norm": 0.3575156070876499, + "learning_rate": 0.00019541058753688538, + "loss": 0.7177, + "step": 233 + }, + { + "epoch": 0.1248, + "grad_norm": 0.3990387525951872, + "learning_rate": 0.00019535869531826937, + "loss": 0.7482, + "step": 234 + }, + { + "epoch": 0.12533333333333332, + "grad_norm": 0.41120803401329314, + "learning_rate": 0.00019530651834411474, + "loss": 0.8065, + "step": 235 + }, + { + "epoch": 0.12586666666666665, + "grad_norm": 0.4477273088222328, + "learning_rate": 0.00019525405677022989, + "loss": 0.8217, + "step": 236 + }, + { + "epoch": 0.1264, + "grad_norm": 0.44031792113180557, + "learning_rate": 0.00019520131075327298, + "loss": 0.791, + "step": 237 + }, + { + "epoch": 0.12693333333333334, + "grad_norm": 0.438521440691371, + "learning_rate": 0.0001951482804507517, + "loss": 0.8312, + "step": 238 + }, + { + "epoch": 0.12746666666666667, + "grad_norm": 0.4647933969283093, + "learning_rate": 0.00019509496602102252, + "loss": 0.7561, + "step": 239 + }, + { + "epoch": 0.128, + "grad_norm": 0.42542690295590896, + "learning_rate": 0.00019504136762329047, + "loss": 0.7849, + "step": 240 + }, + { + "epoch": 0.12853333333333333, + "grad_norm": 0.4283048163569719, + "learning_rate": 0.00019498748541760846, + "loss": 0.7429, + "step": 241 + }, + { + "epoch": 0.12906666666666666, + "grad_norm": 0.41729230349711055, + "learning_rate": 0.0001949333195648769, + "loss": 0.7911, + "step": 242 + }, + { + "epoch": 0.1296, + "grad_norm": 0.42409827036893505, + "learning_rate": 0.00019487887022684336, + "loss": 0.7667, + "step": 243 + }, + { + "epoch": 0.13013333333333332, + "grad_norm": 0.45555074238723325, + "learning_rate": 0.00019482413756610173, + "loss": 0.8111, + "step": 244 + }, + { + "epoch": 0.13066666666666665, + "grad_norm": 0.4275715808233641, + "learning_rate": 0.0001947691217460921, + "loss": 0.7544, + "step": 245 + }, + { + "epoch": 0.1312, + "grad_norm": 0.43475231347946225, + "learning_rate": 0.00019471382293110003, + "loss": 0.7598, + "step": 246 + }, + { + "epoch": 0.13173333333333334, + "grad_norm": 0.40237868403890853, + "learning_rate": 0.00019465824128625617, + "loss": 0.755, + "step": 247 + }, + { + "epoch": 0.13226666666666667, + "grad_norm": 0.4070690513157304, + "learning_rate": 0.00019460237697753577, + "loss": 0.734, + "step": 248 + }, + { + "epoch": 0.1328, + "grad_norm": 0.4524921752244452, + "learning_rate": 0.00019454623017175812, + "loss": 0.8266, + "step": 249 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 0.43520211044043444, + "learning_rate": 0.00019448980103658613, + "loss": 0.767, + "step": 250 + }, + { + "epoch": 0.13386666666666666, + "grad_norm": 0.4503370166116177, + "learning_rate": 0.0001944330897405257, + "loss": 0.7568, + "step": 251 + }, + { + "epoch": 0.1344, + "grad_norm": 0.4770182557409676, + "learning_rate": 0.00019437609645292546, + "loss": 0.8477, + "step": 252 + }, + { + "epoch": 0.13493333333333332, + "grad_norm": 0.38594426530951254, + "learning_rate": 0.00019431882134397598, + "loss": 0.683, + "step": 253 + }, + { + "epoch": 0.13546666666666668, + "grad_norm": 0.4628521389132707, + "learning_rate": 0.00019426126458470936, + "loss": 0.7948, + "step": 254 + }, + { + "epoch": 0.136, + "grad_norm": 0.4483194544871081, + "learning_rate": 0.0001942034263469989, + "loss": 0.7581, + "step": 255 + }, + { + "epoch": 0.13653333333333334, + "grad_norm": 0.4274640982303173, + "learning_rate": 0.00019414530680355837, + "loss": 0.7732, + "step": 256 + }, + { + "epoch": 0.13706666666666667, + "grad_norm": 0.4578237135150185, + "learning_rate": 0.00019408690612794148, + "loss": 0.7698, + "step": 257 + }, + { + "epoch": 0.1376, + "grad_norm": 0.40657945119684724, + "learning_rate": 0.00019402822449454153, + "loss": 0.7358, + "step": 258 + }, + { + "epoch": 0.13813333333333333, + "grad_norm": 0.45123971773383875, + "learning_rate": 0.00019396926207859084, + "loss": 0.8045, + "step": 259 + }, + { + "epoch": 0.13866666666666666, + "grad_norm": 0.4450941319936875, + "learning_rate": 0.0001939100190561601, + "loss": 0.759, + "step": 260 + }, + { + "epoch": 0.1392, + "grad_norm": 0.45149620245292893, + "learning_rate": 0.00019385049560415794, + "loss": 0.806, + "step": 261 + }, + { + "epoch": 0.13973333333333332, + "grad_norm": 0.4467897175764718, + "learning_rate": 0.0001937906919003304, + "loss": 0.7645, + "step": 262 + }, + { + "epoch": 0.14026666666666668, + "grad_norm": 0.40514445423020395, + "learning_rate": 0.00019373060812326052, + "loss": 0.751, + "step": 263 + }, + { + "epoch": 0.1408, + "grad_norm": 0.48831563084136037, + "learning_rate": 0.00019367024445236754, + "loss": 0.8054, + "step": 264 + }, + { + "epoch": 0.14133333333333334, + "grad_norm": 0.42281959883654185, + "learning_rate": 0.00019360960106790643, + "loss": 0.7842, + "step": 265 + }, + { + "epoch": 0.14186666666666667, + "grad_norm": 0.39126326593973365, + "learning_rate": 0.0001935486781509677, + "loss": 0.7645, + "step": 266 + }, + { + "epoch": 0.1424, + "grad_norm": 0.41985934472402275, + "learning_rate": 0.00019348747588347637, + "loss": 0.7772, + "step": 267 + }, + { + "epoch": 0.14293333333333333, + "grad_norm": 0.48212383198795683, + "learning_rate": 0.00019342599444819168, + "loss": 0.8167, + "step": 268 + }, + { + "epoch": 0.14346666666666666, + "grad_norm": 0.41699215934629774, + "learning_rate": 0.00019336423402870653, + "loss": 0.7826, + "step": 269 + }, + { + "epoch": 0.144, + "grad_norm": 0.3980813154245854, + "learning_rate": 0.00019330219480944694, + "loss": 0.718, + "step": 270 + }, + { + "epoch": 0.14453333333333335, + "grad_norm": 0.40892013264312427, + "learning_rate": 0.0001932398769756714, + "loss": 0.7783, + "step": 271 + }, + { + "epoch": 0.14506666666666668, + "grad_norm": 0.40845481146044704, + "learning_rate": 0.0001931772807134704, + "loss": 0.7519, + "step": 272 + }, + { + "epoch": 0.1456, + "grad_norm": 0.41275194566666135, + "learning_rate": 0.00019311440620976597, + "loss": 0.755, + "step": 273 + }, + { + "epoch": 0.14613333333333334, + "grad_norm": 0.3756995702327026, + "learning_rate": 0.00019305125365231084, + "loss": 0.7828, + "step": 274 + }, + { + "epoch": 0.14666666666666667, + "grad_norm": 0.4069881524615693, + "learning_rate": 0.00019298782322968815, + "loss": 0.8123, + "step": 275 + }, + { + "epoch": 0.1472, + "grad_norm": 0.44556873876785796, + "learning_rate": 0.0001929241151313108, + "loss": 0.7432, + "step": 276 + }, + { + "epoch": 0.14773333333333333, + "grad_norm": 0.4267954294684829, + "learning_rate": 0.0001928601295474208, + "loss": 0.7555, + "step": 277 + }, + { + "epoch": 0.14826666666666666, + "grad_norm": 0.41854023230084636, + "learning_rate": 0.00019279586666908884, + "loss": 0.8007, + "step": 278 + }, + { + "epoch": 0.1488, + "grad_norm": 0.4226803217688415, + "learning_rate": 0.00019273132668821364, + "loss": 0.7786, + "step": 279 + }, + { + "epoch": 0.14933333333333335, + "grad_norm": 0.4029724657127977, + "learning_rate": 0.00019266650979752136, + "loss": 0.754, + "step": 280 + }, + { + "epoch": 0.14986666666666668, + "grad_norm": 0.49131066949159913, + "learning_rate": 0.00019260141619056507, + "loss": 0.7747, + "step": 281 + }, + { + "epoch": 0.1504, + "grad_norm": 0.47760159762825755, + "learning_rate": 0.00019253604606172417, + "loss": 0.8425, + "step": 282 + }, + { + "epoch": 0.15093333333333334, + "grad_norm": 0.4454821627434679, + "learning_rate": 0.0001924703996062038, + "loss": 0.7882, + "step": 283 + }, + { + "epoch": 0.15146666666666667, + "grad_norm": 0.3936910423411889, + "learning_rate": 0.0001924044770200342, + "loss": 0.7664, + "step": 284 + }, + { + "epoch": 0.152, + "grad_norm": 0.6269939926797371, + "learning_rate": 0.00019233827850007027, + "loss": 0.8943, + "step": 285 + }, + { + "epoch": 0.15253333333333333, + "grad_norm": 0.4841497625636035, + "learning_rate": 0.0001922718042439908, + "loss": 0.8011, + "step": 286 + }, + { + "epoch": 0.15306666666666666, + "grad_norm": 0.4415326311332174, + "learning_rate": 0.000192205054450298, + "loss": 0.765, + "step": 287 + }, + { + "epoch": 0.1536, + "grad_norm": 0.454293611217275, + "learning_rate": 0.00019213802931831696, + "loss": 0.7912, + "step": 288 + }, + { + "epoch": 0.15413333333333334, + "grad_norm": 0.46757920591817387, + "learning_rate": 0.00019207072904819486, + "loss": 0.8217, + "step": 289 + }, + { + "epoch": 0.15466666666666667, + "grad_norm": 0.46476638206133764, + "learning_rate": 0.00019200315384090044, + "loss": 0.78, + "step": 290 + }, + { + "epoch": 0.1552, + "grad_norm": 0.4151735777273285, + "learning_rate": 0.00019193530389822363, + "loss": 0.7033, + "step": 291 + }, + { + "epoch": 0.15573333333333333, + "grad_norm": 0.4274306460164623, + "learning_rate": 0.00019186717942277462, + "loss": 0.7739, + "step": 292 + }, + { + "epoch": 0.15626666666666666, + "grad_norm": 0.455430053788848, + "learning_rate": 0.00019179878061798347, + "loss": 0.8669, + "step": 293 + }, + { + "epoch": 0.1568, + "grad_norm": 0.41159284980500294, + "learning_rate": 0.00019173010768809933, + "loss": 0.778, + "step": 294 + }, + { + "epoch": 0.15733333333333333, + "grad_norm": 0.41012325260301996, + "learning_rate": 0.00019166116083819002, + "loss": 0.754, + "step": 295 + }, + { + "epoch": 0.15786666666666666, + "grad_norm": 0.4612699513123622, + "learning_rate": 0.00019159194027414128, + "loss": 0.7306, + "step": 296 + }, + { + "epoch": 0.1584, + "grad_norm": 0.43272865871889055, + "learning_rate": 0.0001915224462026563, + "loss": 0.8341, + "step": 297 + }, + { + "epoch": 0.15893333333333334, + "grad_norm": 0.41963200331568457, + "learning_rate": 0.00019145267883125482, + "loss": 0.7798, + "step": 298 + }, + { + "epoch": 0.15946666666666667, + "grad_norm": 0.3969513919598155, + "learning_rate": 0.00019138263836827288, + "loss": 0.7619, + "step": 299 + }, + { + "epoch": 0.16, + "grad_norm": 0.4297719531502467, + "learning_rate": 0.00019131232502286188, + "loss": 0.7681, + "step": 300 + }, + { + "epoch": 0.16053333333333333, + "grad_norm": 0.45766795714776487, + "learning_rate": 0.00019124173900498818, + "loss": 0.8352, + "step": 301 + }, + { + "epoch": 0.16106666666666666, + "grad_norm": 0.42487143616388595, + "learning_rate": 0.00019117088052543233, + "loss": 0.7428, + "step": 302 + }, + { + "epoch": 0.1616, + "grad_norm": 0.5211110951690558, + "learning_rate": 0.0001910997497957885, + "loss": 0.916, + "step": 303 + }, + { + "epoch": 0.16213333333333332, + "grad_norm": 0.3778763748723273, + "learning_rate": 0.00019102834702846387, + "loss": 0.7194, + "step": 304 + }, + { + "epoch": 0.16266666666666665, + "grad_norm": 0.4077185288189954, + "learning_rate": 0.0001909566724366779, + "loss": 0.7293, + "step": 305 + }, + { + "epoch": 0.1632, + "grad_norm": 0.4208184764998574, + "learning_rate": 0.00019088472623446183, + "loss": 0.7764, + "step": 306 + }, + { + "epoch": 0.16373333333333334, + "grad_norm": 0.42027135789116177, + "learning_rate": 0.00019081250863665794, + "loss": 0.788, + "step": 307 + }, + { + "epoch": 0.16426666666666667, + "grad_norm": 0.40554900276058214, + "learning_rate": 0.0001907400198589189, + "loss": 0.761, + "step": 308 + }, + { + "epoch": 0.1648, + "grad_norm": 0.42742857255717165, + "learning_rate": 0.00019066726011770726, + "loss": 0.7439, + "step": 309 + }, + { + "epoch": 0.16533333333333333, + "grad_norm": 0.4601886210902722, + "learning_rate": 0.00019059422963029464, + "loss": 0.8274, + "step": 310 + }, + { + "epoch": 0.16586666666666666, + "grad_norm": 0.478589187295555, + "learning_rate": 0.0001905209286147611, + "loss": 0.778, + "step": 311 + }, + { + "epoch": 0.1664, + "grad_norm": 0.4633748369336836, + "learning_rate": 0.0001904473572899947, + "loss": 0.8225, + "step": 312 + }, + { + "epoch": 0.16693333333333332, + "grad_norm": 0.4083129578873108, + "learning_rate": 0.0001903735158756905, + "loss": 0.7405, + "step": 313 + }, + { + "epoch": 0.16746666666666668, + "grad_norm": 0.620832278768044, + "learning_rate": 0.0001902994045923502, + "loss": 0.7697, + "step": 314 + }, + { + "epoch": 0.168, + "grad_norm": 0.4077726180167016, + "learning_rate": 0.00019022502366128135, + "loss": 0.7279, + "step": 315 + }, + { + "epoch": 0.16853333333333334, + "grad_norm": 0.4834457372161605, + "learning_rate": 0.0001901503733045967, + "loss": 0.8183, + "step": 316 + }, + { + "epoch": 0.16906666666666667, + "grad_norm": 0.44163255685108305, + "learning_rate": 0.00019007545374521355, + "loss": 0.7696, + "step": 317 + }, + { + "epoch": 0.1696, + "grad_norm": 0.4687268712645231, + "learning_rate": 0.00019000026520685302, + "loss": 0.7692, + "step": 318 + }, + { + "epoch": 0.17013333333333333, + "grad_norm": 0.43966287562173245, + "learning_rate": 0.00018992480791403958, + "loss": 0.7565, + "step": 319 + }, + { + "epoch": 0.17066666666666666, + "grad_norm": 0.46945289527620604, + "learning_rate": 0.0001898490820921001, + "loss": 0.8155, + "step": 320 + }, + { + "epoch": 0.1712, + "grad_norm": 0.40981539825547586, + "learning_rate": 0.0001897730879671634, + "loss": 0.7615, + "step": 321 + }, + { + "epoch": 0.17173333333333332, + "grad_norm": 0.4528331049187482, + "learning_rate": 0.0001896968257661595, + "loss": 0.7866, + "step": 322 + }, + { + "epoch": 0.17226666666666668, + "grad_norm": 0.4279879948857359, + "learning_rate": 0.00018962029571681886, + "loss": 0.806, + "step": 323 + }, + { + "epoch": 0.1728, + "grad_norm": 0.4567720668919121, + "learning_rate": 0.00018954349804767184, + "loss": 0.8126, + "step": 324 + }, + { + "epoch": 0.17333333333333334, + "grad_norm": 0.42559932918287957, + "learning_rate": 0.00018946643298804793, + "loss": 0.751, + "step": 325 + }, + { + "epoch": 0.17386666666666667, + "grad_norm": 0.4374833787533294, + "learning_rate": 0.00018938910076807513, + "loss": 0.7758, + "step": 326 + }, + { + "epoch": 0.1744, + "grad_norm": 0.400521048887267, + "learning_rate": 0.00018931150161867916, + "loss": 0.7182, + "step": 327 + }, + { + "epoch": 0.17493333333333333, + "grad_norm": 0.4411305070030174, + "learning_rate": 0.0001892336357715829, + "loss": 0.7422, + "step": 328 + }, + { + "epoch": 0.17546666666666666, + "grad_norm": 0.3960947529502556, + "learning_rate": 0.0001891555034593055, + "loss": 0.706, + "step": 329 + }, + { + "epoch": 0.176, + "grad_norm": 0.3983703097172135, + "learning_rate": 0.00018907710491516199, + "loss": 0.7251, + "step": 330 + }, + { + "epoch": 0.17653333333333332, + "grad_norm": 0.45846564057134404, + "learning_rate": 0.00018899844037326225, + "loss": 0.7802, + "step": 331 + }, + { + "epoch": 0.17706666666666668, + "grad_norm": 0.41159932485400996, + "learning_rate": 0.0001889195100685106, + "loss": 0.7601, + "step": 332 + }, + { + "epoch": 0.1776, + "grad_norm": 0.447424805825611, + "learning_rate": 0.0001888403142366049, + "loss": 0.8029, + "step": 333 + }, + { + "epoch": 0.17813333333333334, + "grad_norm": 0.40604851956494914, + "learning_rate": 0.00018876085311403593, + "loss": 0.7416, + "step": 334 + }, + { + "epoch": 0.17866666666666667, + "grad_norm": 0.45501976602393485, + "learning_rate": 0.00018868112693808665, + "loss": 0.8366, + "step": 335 + }, + { + "epoch": 0.1792, + "grad_norm": 0.42459722612502765, + "learning_rate": 0.00018860113594683148, + "loss": 0.7481, + "step": 336 + }, + { + "epoch": 0.17973333333333333, + "grad_norm": 0.4919631166230819, + "learning_rate": 0.00018852088037913577, + "loss": 0.7855, + "step": 337 + }, + { + "epoch": 0.18026666666666666, + "grad_norm": 0.43033665282321837, + "learning_rate": 0.0001884403604746547, + "loss": 0.7469, + "step": 338 + }, + { + "epoch": 0.1808, + "grad_norm": 0.3804600692068997, + "learning_rate": 0.00018835957647383303, + "loss": 0.756, + "step": 339 + }, + { + "epoch": 0.18133333333333335, + "grad_norm": 0.3941496431808659, + "learning_rate": 0.00018827852861790398, + "loss": 0.6994, + "step": 340 + }, + { + "epoch": 0.18186666666666668, + "grad_norm": 0.4135361886334295, + "learning_rate": 0.00018819721714888877, + "loss": 0.787, + "step": 341 + }, + { + "epoch": 0.1824, + "grad_norm": 0.4036937660863021, + "learning_rate": 0.00018811564230959588, + "loss": 0.6809, + "step": 342 + }, + { + "epoch": 0.18293333333333334, + "grad_norm": 0.4144768391836234, + "learning_rate": 0.00018803380434362, + "loss": 0.7235, + "step": 343 + }, + { + "epoch": 0.18346666666666667, + "grad_norm": 0.414924772270386, + "learning_rate": 0.0001879517034953418, + "loss": 0.7745, + "step": 344 + }, + { + "epoch": 0.184, + "grad_norm": 0.5104440648202117, + "learning_rate": 0.00018786934000992688, + "loss": 0.814, + "step": 345 + }, + { + "epoch": 0.18453333333333333, + "grad_norm": 0.4510674205737547, + "learning_rate": 0.00018778671413332513, + "loss": 0.8003, + "step": 346 + }, + { + "epoch": 0.18506666666666666, + "grad_norm": 0.4615965741984511, + "learning_rate": 0.00018770382611226987, + "loss": 0.7685, + "step": 347 + }, + { + "epoch": 0.1856, + "grad_norm": 0.3867991235423707, + "learning_rate": 0.00018762067619427746, + "loss": 0.7224, + "step": 348 + }, + { + "epoch": 0.18613333333333335, + "grad_norm": 0.4064479524634665, + "learning_rate": 0.000187537264627646, + "loss": 0.7762, + "step": 349 + }, + { + "epoch": 0.18666666666666668, + "grad_norm": 0.4437818670338459, + "learning_rate": 0.00018745359166145523, + "loss": 0.7865, + "step": 350 + }, + { + "epoch": 0.1872, + "grad_norm": 0.45360184922661184, + "learning_rate": 0.00018736965754556528, + "loss": 0.7674, + "step": 351 + }, + { + "epoch": 0.18773333333333334, + "grad_norm": 0.4196062484442272, + "learning_rate": 0.00018728546253061614, + "loss": 0.7544, + "step": 352 + }, + { + "epoch": 0.18826666666666667, + "grad_norm": 0.4803602897556589, + "learning_rate": 0.00018720100686802694, + "loss": 0.8425, + "step": 353 + }, + { + "epoch": 0.1888, + "grad_norm": 0.49072751080915683, + "learning_rate": 0.00018711629080999504, + "loss": 0.8314, + "step": 354 + }, + { + "epoch": 0.18933333333333333, + "grad_norm": 0.47772940016280147, + "learning_rate": 0.00018703131460949554, + "loss": 0.8408, + "step": 355 + }, + { + "epoch": 0.18986666666666666, + "grad_norm": 0.42634504050193084, + "learning_rate": 0.0001869460785202802, + "loss": 0.7796, + "step": 356 + }, + { + "epoch": 0.1904, + "grad_norm": 0.34872832935612424, + "learning_rate": 0.00018686058279687698, + "loss": 0.6529, + "step": 357 + }, + { + "epoch": 0.19093333333333334, + "grad_norm": 0.4980111280217522, + "learning_rate": 0.00018677482769458904, + "loss": 0.796, + "step": 358 + }, + { + "epoch": 0.19146666666666667, + "grad_norm": 0.39347240314594795, + "learning_rate": 0.00018668881346949417, + "loss": 0.7456, + "step": 359 + }, + { + "epoch": 0.192, + "grad_norm": 0.4051626526402004, + "learning_rate": 0.00018660254037844388, + "loss": 0.7584, + "step": 360 + }, + { + "epoch": 0.19253333333333333, + "grad_norm": 0.48572228625634567, + "learning_rate": 0.00018651600867906272, + "loss": 0.747, + "step": 361 + }, + { + "epoch": 0.19306666666666666, + "grad_norm": 0.41051675505313506, + "learning_rate": 0.00018642921862974742, + "loss": 0.7866, + "step": 362 + }, + { + "epoch": 0.1936, + "grad_norm": 0.4152477619489462, + "learning_rate": 0.00018634217048966637, + "loss": 0.7572, + "step": 363 + }, + { + "epoch": 0.19413333333333332, + "grad_norm": 0.4682045694000191, + "learning_rate": 0.00018625486451875843, + "loss": 0.7866, + "step": 364 + }, + { + "epoch": 0.19466666666666665, + "grad_norm": 0.4064667329570052, + "learning_rate": 0.0001861673009777325, + "loss": 0.7861, + "step": 365 + }, + { + "epoch": 0.1952, + "grad_norm": 0.4351942533622352, + "learning_rate": 0.0001860794801280666, + "loss": 0.8325, + "step": 366 + }, + { + "epoch": 0.19573333333333334, + "grad_norm": 0.432979227691755, + "learning_rate": 0.00018599140223200716, + "loss": 0.7952, + "step": 367 + }, + { + "epoch": 0.19626666666666667, + "grad_norm": 0.44332700796138425, + "learning_rate": 0.0001859030675525681, + "loss": 0.7879, + "step": 368 + }, + { + "epoch": 0.1968, + "grad_norm": 0.39922637527320176, + "learning_rate": 0.0001858144763535302, + "loss": 0.6837, + "step": 369 + }, + { + "epoch": 0.19733333333333333, + "grad_norm": 0.41532893705236523, + "learning_rate": 0.0001857256288994402, + "loss": 0.7282, + "step": 370 + }, + { + "epoch": 0.19786666666666666, + "grad_norm": 0.45028989276063536, + "learning_rate": 0.00018563652545561013, + "loss": 0.7992, + "step": 371 + }, + { + "epoch": 0.1984, + "grad_norm": 0.4512458676474834, + "learning_rate": 0.0001855471662881164, + "loss": 0.822, + "step": 372 + }, + { + "epoch": 0.19893333333333332, + "grad_norm": 0.41483902235497905, + "learning_rate": 0.000185457551663799, + "loss": 0.7586, + "step": 373 + }, + { + "epoch": 0.19946666666666665, + "grad_norm": 0.479064586610721, + "learning_rate": 0.00018536768185026083, + "loss": 0.8433, + "step": 374 + }, + { + "epoch": 0.2, + "grad_norm": 0.42314123383647745, + "learning_rate": 0.00018527755711586678, + "loss": 0.7837, + "step": 375 + }, + { + "epoch": 0.20053333333333334, + "grad_norm": 0.3852527022550998, + "learning_rate": 0.00018518717772974302, + "loss": 0.7022, + "step": 376 + }, + { + "epoch": 0.20106666666666667, + "grad_norm": 0.34546969848499054, + "learning_rate": 0.00018509654396177609, + "loss": 0.6765, + "step": 377 + }, + { + "epoch": 0.2016, + "grad_norm": 0.39831988638266946, + "learning_rate": 0.00018500565608261214, + "loss": 0.7054, + "step": 378 + }, + { + "epoch": 0.20213333333333333, + "grad_norm": 0.40016473875548725, + "learning_rate": 0.00018491451436365627, + "loss": 0.7309, + "step": 379 + }, + { + "epoch": 0.20266666666666666, + "grad_norm": 0.41862062761699875, + "learning_rate": 0.0001848231190770714, + "loss": 0.7649, + "step": 380 + }, + { + "epoch": 0.2032, + "grad_norm": 0.4216060227393941, + "learning_rate": 0.00018473147049577774, + "loss": 0.7933, + "step": 381 + }, + { + "epoch": 0.20373333333333332, + "grad_norm": 0.38746765063921507, + "learning_rate": 0.00018463956889345194, + "loss": 0.6973, + "step": 382 + }, + { + "epoch": 0.20426666666666668, + "grad_norm": 0.42422814568525985, + "learning_rate": 0.00018454741454452603, + "loss": 0.7335, + "step": 383 + }, + { + "epoch": 0.2048, + "grad_norm": 0.4175831076791634, + "learning_rate": 0.00018445500772418697, + "loss": 0.7776, + "step": 384 + }, + { + "epoch": 0.20533333333333334, + "grad_norm": 0.4488967788947463, + "learning_rate": 0.00018436234870837547, + "loss": 0.8528, + "step": 385 + }, + { + "epoch": 0.20586666666666667, + "grad_norm": 0.3804940783199971, + "learning_rate": 0.00018426943777378552, + "loss": 0.7103, + "step": 386 + }, + { + "epoch": 0.2064, + "grad_norm": 0.39265951485970907, + "learning_rate": 0.00018417627519786315, + "loss": 0.7021, + "step": 387 + }, + { + "epoch": 0.20693333333333333, + "grad_norm": 0.4170108586130046, + "learning_rate": 0.00018408286125880604, + "loss": 0.7074, + "step": 388 + }, + { + "epoch": 0.20746666666666666, + "grad_norm": 0.41255415960615377, + "learning_rate": 0.00018398919623556238, + "loss": 0.7455, + "step": 389 + }, + { + "epoch": 0.208, + "grad_norm": 0.41316936831534123, + "learning_rate": 0.00018389528040783012, + "loss": 0.7786, + "step": 390 + }, + { + "epoch": 0.20853333333333332, + "grad_norm": 0.4522356929167597, + "learning_rate": 0.0001838011140560562, + "loss": 0.8267, + "step": 391 + }, + { + "epoch": 0.20906666666666668, + "grad_norm": 0.4199394826524308, + "learning_rate": 0.00018370669746143564, + "loss": 0.786, + "step": 392 + }, + { + "epoch": 0.2096, + "grad_norm": 0.43124818409602006, + "learning_rate": 0.00018361203090591071, + "loss": 0.697, + "step": 393 + }, + { + "epoch": 0.21013333333333334, + "grad_norm": 0.45302212707459977, + "learning_rate": 0.0001835171146721701, + "loss": 0.7375, + "step": 394 + }, + { + "epoch": 0.21066666666666667, + "grad_norm": 0.38967910480164086, + "learning_rate": 0.00018342194904364813, + "loss": 0.7245, + "step": 395 + }, + { + "epoch": 0.2112, + "grad_norm": 0.4413737693750598, + "learning_rate": 0.00018332653430452376, + "loss": 0.783, + "step": 396 + }, + { + "epoch": 0.21173333333333333, + "grad_norm": 0.4270130265037395, + "learning_rate": 0.00018323087073971993, + "loss": 0.7503, + "step": 397 + }, + { + "epoch": 0.21226666666666666, + "grad_norm": 0.3981751574168622, + "learning_rate": 0.00018313495863490258, + "loss": 0.7069, + "step": 398 + }, + { + "epoch": 0.2128, + "grad_norm": 0.4343883652859093, + "learning_rate": 0.00018303879827647975, + "loss": 0.763, + "step": 399 + }, + { + "epoch": 0.21333333333333335, + "grad_norm": 0.43844604515187074, + "learning_rate": 0.00018294238995160094, + "loss": 0.792, + "step": 400 + }, + { + "epoch": 0.21386666666666668, + "grad_norm": 0.4848368467011727, + "learning_rate": 0.00018284573394815597, + "loss": 0.7928, + "step": 401 + }, + { + "epoch": 0.2144, + "grad_norm": 0.4670982578822998, + "learning_rate": 0.00018274883055477436, + "loss": 0.8336, + "step": 402 + }, + { + "epoch": 0.21493333333333334, + "grad_norm": 0.41721696610767667, + "learning_rate": 0.00018265168006082437, + "loss": 0.7267, + "step": 403 + }, + { + "epoch": 0.21546666666666667, + "grad_norm": 0.5100412521942448, + "learning_rate": 0.00018255428275641214, + "loss": 0.8037, + "step": 404 + }, + { + "epoch": 0.216, + "grad_norm": 0.423327472770163, + "learning_rate": 0.00018245663893238075, + "loss": 0.7078, + "step": 405 + }, + { + "epoch": 0.21653333333333333, + "grad_norm": 0.39903295305294834, + "learning_rate": 0.0001823587488803095, + "loss": 0.78, + "step": 406 + }, + { + "epoch": 0.21706666666666666, + "grad_norm": 0.4273087221317984, + "learning_rate": 0.00018226061289251298, + "loss": 0.7656, + "step": 407 + }, + { + "epoch": 0.2176, + "grad_norm": 0.4701733296751263, + "learning_rate": 0.00018216223126204007, + "loss": 0.7457, + "step": 408 + }, + { + "epoch": 0.21813333333333335, + "grad_norm": 0.4849339935513332, + "learning_rate": 0.00018206360428267332, + "loss": 0.8422, + "step": 409 + }, + { + "epoch": 0.21866666666666668, + "grad_norm": 0.4365292534868585, + "learning_rate": 0.00018196473224892784, + "loss": 0.7459, + "step": 410 + }, + { + "epoch": 0.2192, + "grad_norm": 0.45540180676703806, + "learning_rate": 0.00018186561545605054, + "loss": 0.7434, + "step": 411 + }, + { + "epoch": 0.21973333333333334, + "grad_norm": 0.3821015207412149, + "learning_rate": 0.0001817662542000192, + "loss": 0.7346, + "step": 412 + }, + { + "epoch": 0.22026666666666667, + "grad_norm": 0.4237340453439172, + "learning_rate": 0.0001816666487775416, + "loss": 0.7572, + "step": 413 + }, + { + "epoch": 0.2208, + "grad_norm": 0.4260188088738735, + "learning_rate": 0.00018156679948605467, + "loss": 0.7807, + "step": 414 + }, + { + "epoch": 0.22133333333333333, + "grad_norm": 0.390168704071244, + "learning_rate": 0.00018146670662372354, + "loss": 0.7416, + "step": 415 + }, + { + "epoch": 0.22186666666666666, + "grad_norm": 0.3834271348861522, + "learning_rate": 0.0001813663704894407, + "loss": 0.7651, + "step": 416 + }, + { + "epoch": 0.2224, + "grad_norm": 0.42596724690668536, + "learning_rate": 0.00018126579138282503, + "loss": 0.8005, + "step": 417 + }, + { + "epoch": 0.22293333333333334, + "grad_norm": 0.41677988711146163, + "learning_rate": 0.00018116496960422107, + "loss": 0.7555, + "step": 418 + }, + { + "epoch": 0.22346666666666667, + "grad_norm": 0.38915845494043444, + "learning_rate": 0.00018106390545469795, + "loss": 0.7186, + "step": 419 + }, + { + "epoch": 0.224, + "grad_norm": 0.40347775870221103, + "learning_rate": 0.0001809625992360485, + "loss": 0.7605, + "step": 420 + }, + { + "epoch": 0.22453333333333333, + "grad_norm": 0.4255599589983862, + "learning_rate": 0.00018086105125078857, + "loss": 0.7362, + "step": 421 + }, + { + "epoch": 0.22506666666666666, + "grad_norm": 0.4357997850956404, + "learning_rate": 0.00018075926180215576, + "loss": 0.7813, + "step": 422 + }, + { + "epoch": 0.2256, + "grad_norm": 0.41914054066713735, + "learning_rate": 0.00018065723119410884, + "loss": 0.78, + "step": 423 + }, + { + "epoch": 0.22613333333333333, + "grad_norm": 0.44432795823533866, + "learning_rate": 0.0001805549597313267, + "loss": 0.7659, + "step": 424 + }, + { + "epoch": 0.22666666666666666, + "grad_norm": 0.38912853603617054, + "learning_rate": 0.0001804524477192075, + "loss": 0.7743, + "step": 425 + }, + { + "epoch": 0.2272, + "grad_norm": 0.46612585654323574, + "learning_rate": 0.00018034969546386757, + "loss": 0.8243, + "step": 426 + }, + { + "epoch": 0.22773333333333334, + "grad_norm": 0.3984567394637064, + "learning_rate": 0.00018024670327214084, + "loss": 0.7212, + "step": 427 + }, + { + "epoch": 0.22826666666666667, + "grad_norm": 0.40799714213879, + "learning_rate": 0.00018014347145157755, + "loss": 0.7352, + "step": 428 + }, + { + "epoch": 0.2288, + "grad_norm": 0.43347839663468984, + "learning_rate": 0.0001800400003104436, + "loss": 0.782, + "step": 429 + }, + { + "epoch": 0.22933333333333333, + "grad_norm": 0.4253061709077604, + "learning_rate": 0.0001799362901577196, + "loss": 0.7764, + "step": 430 + }, + { + "epoch": 0.22986666666666666, + "grad_norm": 0.42087057607363554, + "learning_rate": 0.00017983234130309968, + "loss": 0.7868, + "step": 431 + }, + { + "epoch": 0.2304, + "grad_norm": 0.43254532172608784, + "learning_rate": 0.00017972815405699103, + "loss": 0.7589, + "step": 432 + }, + { + "epoch": 0.23093333333333332, + "grad_norm": 0.3914305001715474, + "learning_rate": 0.00017962372873051252, + "loss": 0.7172, + "step": 433 + }, + { + "epoch": 0.23146666666666665, + "grad_norm": 0.4321606521827664, + "learning_rate": 0.00017951906563549397, + "loss": 0.8013, + "step": 434 + }, + { + "epoch": 0.232, + "grad_norm": 0.38543498665079473, + "learning_rate": 0.00017941416508447536, + "loss": 0.7057, + "step": 435 + }, + { + "epoch": 0.23253333333333334, + "grad_norm": 0.43362129868488725, + "learning_rate": 0.00017930902739070562, + "loss": 0.8642, + "step": 436 + }, + { + "epoch": 0.23306666666666667, + "grad_norm": 0.41177356200450155, + "learning_rate": 0.00017920365286814183, + "loss": 0.7745, + "step": 437 + }, + { + "epoch": 0.2336, + "grad_norm": 0.4252255336463102, + "learning_rate": 0.0001790980418314484, + "loss": 0.7475, + "step": 438 + }, + { + "epoch": 0.23413333333333333, + "grad_norm": 0.44562431381397916, + "learning_rate": 0.0001789921945959958, + "loss": 0.8311, + "step": 439 + }, + { + "epoch": 0.23466666666666666, + "grad_norm": 0.45539376204648174, + "learning_rate": 0.00017888611147786002, + "loss": 0.7755, + "step": 440 + }, + { + "epoch": 0.2352, + "grad_norm": 0.4034765961175896, + "learning_rate": 0.00017877979279382135, + "loss": 0.7289, + "step": 441 + }, + { + "epoch": 0.23573333333333332, + "grad_norm": 0.4141539447877892, + "learning_rate": 0.00017867323886136348, + "loss": 0.7284, + "step": 442 + }, + { + "epoch": 0.23626666666666668, + "grad_norm": 0.3813398646439378, + "learning_rate": 0.00017856644999867264, + "loss": 0.6716, + "step": 443 + }, + { + "epoch": 0.2368, + "grad_norm": 0.4291614520514329, + "learning_rate": 0.0001784594265246366, + "loss": 0.7606, + "step": 444 + }, + { + "epoch": 0.23733333333333334, + "grad_norm": 0.41686238869141945, + "learning_rate": 0.00017835216875884368, + "loss": 0.7308, + "step": 445 + }, + { + "epoch": 0.23786666666666667, + "grad_norm": 0.4201586702551558, + "learning_rate": 0.0001782446770215819, + "loss": 0.7733, + "step": 446 + }, + { + "epoch": 0.2384, + "grad_norm": 0.4444739909491757, + "learning_rate": 0.0001781369516338378, + "loss": 0.7768, + "step": 447 + }, + { + "epoch": 0.23893333333333333, + "grad_norm": 0.4435520129736999, + "learning_rate": 0.00017802899291729585, + "loss": 0.8036, + "step": 448 + }, + { + "epoch": 0.23946666666666666, + "grad_norm": 0.3649018873711, + "learning_rate": 0.0001779208011943371, + "loss": 0.745, + "step": 449 + }, + { + "epoch": 0.24, + "grad_norm": 0.3444176443967446, + "learning_rate": 0.00017781237678803847, + "loss": 0.6535, + "step": 450 + }, + { + "epoch": 0.24053333333333332, + "grad_norm": 0.4298665170208588, + "learning_rate": 0.00017770372002217172, + "loss": 0.8094, + "step": 451 + }, + { + "epoch": 0.24106666666666668, + "grad_norm": 0.40676712466276643, + "learning_rate": 0.00017759483122120238, + "loss": 0.7194, + "step": 452 + }, + { + "epoch": 0.2416, + "grad_norm": 0.43694320022290134, + "learning_rate": 0.000177485710710289, + "loss": 0.7734, + "step": 453 + }, + { + "epoch": 0.24213333333333334, + "grad_norm": 0.4159195167919046, + "learning_rate": 0.00017737635881528196, + "loss": 0.8063, + "step": 454 + }, + { + "epoch": 0.24266666666666667, + "grad_norm": 0.4483277895803319, + "learning_rate": 0.00017726677586272263, + "loss": 0.7298, + "step": 455 + }, + { + "epoch": 0.2432, + "grad_norm": 0.4747362557925032, + "learning_rate": 0.00017715696217984235, + "loss": 0.7752, + "step": 456 + }, + { + "epoch": 0.24373333333333333, + "grad_norm": 0.35323565552285224, + "learning_rate": 0.00017704691809456143, + "loss": 0.6711, + "step": 457 + }, + { + "epoch": 0.24426666666666666, + "grad_norm": 0.41893469671433203, + "learning_rate": 0.0001769366439354882, + "loss": 0.7222, + "step": 458 + }, + { + "epoch": 0.2448, + "grad_norm": 0.39907565151968166, + "learning_rate": 0.00017682614003191807, + "loss": 0.7211, + "step": 459 + }, + { + "epoch": 0.24533333333333332, + "grad_norm": 0.35675320654518644, + "learning_rate": 0.00017671540671383243, + "loss": 0.6646, + "step": 460 + }, + { + "epoch": 0.24586666666666668, + "grad_norm": 0.4270554230200422, + "learning_rate": 0.0001766044443118978, + "loss": 0.7759, + "step": 461 + }, + { + "epoch": 0.2464, + "grad_norm": 0.40229978004171096, + "learning_rate": 0.00017649325315746478, + "loss": 0.6955, + "step": 462 + }, + { + "epoch": 0.24693333333333334, + "grad_norm": 0.40312632692715095, + "learning_rate": 0.00017638183358256696, + "loss": 0.7573, + "step": 463 + }, + { + "epoch": 0.24746666666666667, + "grad_norm": 0.4113460109150638, + "learning_rate": 0.00017627018591992018, + "loss": 0.7848, + "step": 464 + }, + { + "epoch": 0.248, + "grad_norm": 0.3618275117102789, + "learning_rate": 0.0001761583105029213, + "loss": 0.6895, + "step": 465 + }, + { + "epoch": 0.24853333333333333, + "grad_norm": 0.37828688022097967, + "learning_rate": 0.00017604620766564723, + "loss": 0.6574, + "step": 466 + }, + { + "epoch": 0.24906666666666666, + "grad_norm": 0.36553798988091857, + "learning_rate": 0.00017593387774285412, + "loss": 0.6943, + "step": 467 + }, + { + "epoch": 0.2496, + "grad_norm": 0.42836526001614417, + "learning_rate": 0.00017582132106997616, + "loss": 0.8108, + "step": 468 + }, + { + "epoch": 0.2501333333333333, + "grad_norm": 0.3877425115957267, + "learning_rate": 0.0001757085379831246, + "loss": 0.7378, + "step": 469 + }, + { + "epoch": 0.25066666666666665, + "grad_norm": 0.3977654548543868, + "learning_rate": 0.00017559552881908695, + "loss": 0.7236, + "step": 470 + }, + { + "epoch": 0.2512, + "grad_norm": 0.37716494049838983, + "learning_rate": 0.00017548229391532572, + "loss": 0.7261, + "step": 471 + }, + { + "epoch": 0.2517333333333333, + "grad_norm": 0.45903834158387774, + "learning_rate": 0.00017536883360997743, + "loss": 0.7993, + "step": 472 + }, + { + "epoch": 0.25226666666666664, + "grad_norm": 0.4169406540559001, + "learning_rate": 0.00017525514824185185, + "loss": 0.7397, + "step": 473 + }, + { + "epoch": 0.2528, + "grad_norm": 0.45744325740569985, + "learning_rate": 0.00017514123815043074, + "loss": 0.7573, + "step": 474 + }, + { + "epoch": 0.25333333333333335, + "grad_norm": 0.4001821206604834, + "learning_rate": 0.00017502710367586687, + "loss": 0.7025, + "step": 475 + }, + { + "epoch": 0.2538666666666667, + "grad_norm": 0.39634111966326696, + "learning_rate": 0.0001749127451589832, + "loss": 0.6665, + "step": 476 + }, + { + "epoch": 0.2544, + "grad_norm": 0.5070556136631094, + "learning_rate": 0.00017479816294127152, + "loss": 0.8214, + "step": 477 + }, + { + "epoch": 0.25493333333333335, + "grad_norm": 0.38470652364492003, + "learning_rate": 0.00017468335736489177, + "loss": 0.6866, + "step": 478 + }, + { + "epoch": 0.2554666666666667, + "grad_norm": 0.40788529857724415, + "learning_rate": 0.00017456832877267084, + "loss": 0.7448, + "step": 479 + }, + { + "epoch": 0.256, + "grad_norm": 0.4278925754220809, + "learning_rate": 0.0001744530775081015, + "loss": 0.7503, + "step": 480 + }, + { + "epoch": 0.25653333333333334, + "grad_norm": 0.4428610484994966, + "learning_rate": 0.00017433760391534167, + "loss": 0.7242, + "step": 481 + }, + { + "epoch": 0.25706666666666667, + "grad_norm": 0.3869065267998443, + "learning_rate": 0.00017422190833921283, + "loss": 0.7129, + "step": 482 + }, + { + "epoch": 0.2576, + "grad_norm": 0.4282684396189687, + "learning_rate": 0.0001741059911251997, + "loss": 0.7906, + "step": 483 + }, + { + "epoch": 0.2581333333333333, + "grad_norm": 0.43378828051890717, + "learning_rate": 0.00017398985261944856, + "loss": 0.7897, + "step": 484 + }, + { + "epoch": 0.25866666666666666, + "grad_norm": 0.39590989306573193, + "learning_rate": 0.00017387349316876666, + "loss": 0.7515, + "step": 485 + }, + { + "epoch": 0.2592, + "grad_norm": 0.4865555425675011, + "learning_rate": 0.000173756913120621, + "loss": 0.7963, + "step": 486 + }, + { + "epoch": 0.2597333333333333, + "grad_norm": 0.42908700805942523, + "learning_rate": 0.0001736401128231373, + "loss": 0.8124, + "step": 487 + }, + { + "epoch": 0.26026666666666665, + "grad_norm": 0.39729606754853064, + "learning_rate": 0.00017352309262509894, + "loss": 0.7174, + "step": 488 + }, + { + "epoch": 0.2608, + "grad_norm": 0.4422087780106003, + "learning_rate": 0.00017340585287594604, + "loss": 0.8041, + "step": 489 + }, + { + "epoch": 0.2613333333333333, + "grad_norm": 0.4319822642481652, + "learning_rate": 0.0001732883939257742, + "loss": 0.8377, + "step": 490 + }, + { + "epoch": 0.2618666666666667, + "grad_norm": 0.39146511331112815, + "learning_rate": 0.0001731707161253338, + "loss": 0.7496, + "step": 491 + }, + { + "epoch": 0.2624, + "grad_norm": 0.3872969620787598, + "learning_rate": 0.0001730528198260285, + "loss": 0.742, + "step": 492 + }, + { + "epoch": 0.26293333333333335, + "grad_norm": 0.3722417379050899, + "learning_rate": 0.00017293470537991463, + "loss": 0.7036, + "step": 493 + }, + { + "epoch": 0.2634666666666667, + "grad_norm": 0.427224706505018, + "learning_rate": 0.00017281637313969978, + "loss": 0.7732, + "step": 494 + }, + { + "epoch": 0.264, + "grad_norm": 0.3659168170781193, + "learning_rate": 0.00017269782345874203, + "loss": 0.6904, + "step": 495 + }, + { + "epoch": 0.26453333333333334, + "grad_norm": 0.38312089240575625, + "learning_rate": 0.00017257905669104874, + "loss": 0.6751, + "step": 496 + }, + { + "epoch": 0.2650666666666667, + "grad_norm": 0.47528377786799064, + "learning_rate": 0.00017246007319127545, + "loss": 0.7232, + "step": 497 + }, + { + "epoch": 0.2656, + "grad_norm": 0.5043159420482285, + "learning_rate": 0.00017234087331472497, + "loss": 0.7561, + "step": 498 + }, + { + "epoch": 0.26613333333333333, + "grad_norm": 0.4071912729726721, + "learning_rate": 0.00017222145741734626, + "loss": 0.7082, + "step": 499 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 0.3692359251504417, + "learning_rate": 0.00017210182585573327, + "loss": 0.7264, + "step": 500 + }, + { + "epoch": 0.2672, + "grad_norm": 0.43310810077684875, + "learning_rate": 0.00017198197898712404, + "loss": 0.7667, + "step": 501 + }, + { + "epoch": 0.2677333333333333, + "grad_norm": 0.4303282860540202, + "learning_rate": 0.00017186191716939944, + "loss": 0.752, + "step": 502 + }, + { + "epoch": 0.26826666666666665, + "grad_norm": 0.4892142742692592, + "learning_rate": 0.0001717416407610824, + "loss": 0.7797, + "step": 503 + }, + { + "epoch": 0.2688, + "grad_norm": 0.47756245089452326, + "learning_rate": 0.00017162115012133643, + "loss": 0.8877, + "step": 504 + }, + { + "epoch": 0.2693333333333333, + "grad_norm": 0.44826648467260455, + "learning_rate": 0.00017150044560996488, + "loss": 0.7485, + "step": 505 + }, + { + "epoch": 0.26986666666666664, + "grad_norm": 0.3986453148625839, + "learning_rate": 0.00017137952758740978, + "loss": 0.7154, + "step": 506 + }, + { + "epoch": 0.2704, + "grad_norm": 0.46155684944406006, + "learning_rate": 0.00017125839641475072, + "loss": 0.7527, + "step": 507 + }, + { + "epoch": 0.27093333333333336, + "grad_norm": 0.38390418611076577, + "learning_rate": 0.00017113705245370368, + "loss": 0.7593, + "step": 508 + }, + { + "epoch": 0.2714666666666667, + "grad_norm": 0.39270062767400077, + "learning_rate": 0.00017101549606662024, + "loss": 0.7018, + "step": 509 + }, + { + "epoch": 0.272, + "grad_norm": 0.4368403876738737, + "learning_rate": 0.00017089372761648616, + "loss": 0.7272, + "step": 510 + }, + { + "epoch": 0.27253333333333335, + "grad_norm": 0.38999942509832336, + "learning_rate": 0.00017077174746692056, + "loss": 0.6816, + "step": 511 + }, + { + "epoch": 0.2730666666666667, + "grad_norm": 0.4413102369849412, + "learning_rate": 0.00017064955598217462, + "loss": 0.7619, + "step": 512 + }, + { + "epoch": 0.2736, + "grad_norm": 0.4362991249943504, + "learning_rate": 0.00017052715352713075, + "loss": 0.7747, + "step": 513 + }, + { + "epoch": 0.27413333333333334, + "grad_norm": 0.39460974516061925, + "learning_rate": 0.00017040454046730115, + "loss": 0.7176, + "step": 514 + }, + { + "epoch": 0.27466666666666667, + "grad_norm": 0.45970430321639394, + "learning_rate": 0.00017028171716882714, + "loss": 0.7949, + "step": 515 + }, + { + "epoch": 0.2752, + "grad_norm": 0.4118570173992488, + "learning_rate": 0.00017015868399847768, + "loss": 0.7681, + "step": 516 + }, + { + "epoch": 0.27573333333333333, + "grad_norm": 0.39082874113516197, + "learning_rate": 0.00017003544132364846, + "loss": 0.7562, + "step": 517 + }, + { + "epoch": 0.27626666666666666, + "grad_norm": 0.44533486466923117, + "learning_rate": 0.00016991198951236088, + "loss": 0.7804, + "step": 518 + }, + { + "epoch": 0.2768, + "grad_norm": 0.4308733152481316, + "learning_rate": 0.00016978832893326074, + "loss": 0.7762, + "step": 519 + }, + { + "epoch": 0.2773333333333333, + "grad_norm": 0.4078213675567917, + "learning_rate": 0.00016966445995561727, + "loss": 0.8142, + "step": 520 + }, + { + "epoch": 0.27786666666666665, + "grad_norm": 0.46235105153878314, + "learning_rate": 0.00016954038294932216, + "loss": 0.7766, + "step": 521 + }, + { + "epoch": 0.2784, + "grad_norm": 0.42235316836926556, + "learning_rate": 0.00016941609828488807, + "loss": 0.7069, + "step": 522 + }, + { + "epoch": 0.2789333333333333, + "grad_norm": 0.4307082196447437, + "learning_rate": 0.0001692916063334479, + "loss": 0.729, + "step": 523 + }, + { + "epoch": 0.27946666666666664, + "grad_norm": 0.40872749605019615, + "learning_rate": 0.0001691669074667535, + "loss": 0.7246, + "step": 524 + }, + { + "epoch": 0.28, + "grad_norm": 0.4170055420382616, + "learning_rate": 0.0001690420020571747, + "loss": 0.7001, + "step": 525 + }, + { + "epoch": 0.28053333333333336, + "grad_norm": 0.4946542189552747, + "learning_rate": 0.0001689168904776979, + "loss": 0.7429, + "step": 526 + }, + { + "epoch": 0.2810666666666667, + "grad_norm": 0.413904366744721, + "learning_rate": 0.00016879157310192535, + "loss": 0.6733, + "step": 527 + }, + { + "epoch": 0.2816, + "grad_norm": 0.3846958968706317, + "learning_rate": 0.0001686660503040737, + "loss": 0.6999, + "step": 528 + }, + { + "epoch": 0.28213333333333335, + "grad_norm": 0.39876829263664965, + "learning_rate": 0.00016854032245897308, + "loss": 0.7422, + "step": 529 + }, + { + "epoch": 0.2826666666666667, + "grad_norm": 0.3984058117963898, + "learning_rate": 0.00016841438994206595, + "loss": 0.7055, + "step": 530 + }, + { + "epoch": 0.2832, + "grad_norm": 0.3856828329145514, + "learning_rate": 0.00016828825312940592, + "loss": 0.7004, + "step": 531 + }, + { + "epoch": 0.28373333333333334, + "grad_norm": 0.43491636219821733, + "learning_rate": 0.00016816191239765667, + "loss": 0.7563, + "step": 532 + }, + { + "epoch": 0.28426666666666667, + "grad_norm": 0.4554947566059297, + "learning_rate": 0.00016803536812409075, + "loss": 0.7972, + "step": 533 + }, + { + "epoch": 0.2848, + "grad_norm": 0.4000537333043658, + "learning_rate": 0.0001679086206865886, + "loss": 0.7201, + "step": 534 + }, + { + "epoch": 0.2853333333333333, + "grad_norm": 0.4248067779621954, + "learning_rate": 0.00016778167046363734, + "loss": 0.6962, + "step": 535 + }, + { + "epoch": 0.28586666666666666, + "grad_norm": 0.42771621442800506, + "learning_rate": 0.00016765451783432953, + "loss": 0.7559, + "step": 536 + }, + { + "epoch": 0.2864, + "grad_norm": 0.4100187500353154, + "learning_rate": 0.00016752716317836229, + "loss": 0.7197, + "step": 537 + }, + { + "epoch": 0.2869333333333333, + "grad_norm": 0.4177323128557551, + "learning_rate": 0.0001673996068760359, + "loss": 0.7485, + "step": 538 + }, + { + "epoch": 0.28746666666666665, + "grad_norm": 0.46767615316900574, + "learning_rate": 0.00016727184930825288, + "loss": 0.8013, + "step": 539 + }, + { + "epoch": 0.288, + "grad_norm": 0.3972361306178011, + "learning_rate": 0.0001671438908565167, + "loss": 0.7479, + "step": 540 + }, + { + "epoch": 0.2885333333333333, + "grad_norm": 0.44556085338785484, + "learning_rate": 0.00016701573190293077, + "loss": 0.7711, + "step": 541 + }, + { + "epoch": 0.2890666666666667, + "grad_norm": 0.3905199839578012, + "learning_rate": 0.00016688737283019706, + "loss": 0.7301, + "step": 542 + }, + { + "epoch": 0.2896, + "grad_norm": 0.41990641788526667, + "learning_rate": 0.00016675881402161536, + "loss": 0.747, + "step": 543 + }, + { + "epoch": 0.29013333333333335, + "grad_norm": 0.39307884536736915, + "learning_rate": 0.00016663005586108176, + "loss": 0.7578, + "step": 544 + }, + { + "epoch": 0.2906666666666667, + "grad_norm": 0.4117627250864761, + "learning_rate": 0.00016650109873308765, + "loss": 0.783, + "step": 545 + }, + { + "epoch": 0.2912, + "grad_norm": 0.4932581378692019, + "learning_rate": 0.0001663719430227186, + "loss": 0.813, + "step": 546 + }, + { + "epoch": 0.29173333333333334, + "grad_norm": 0.4586273614922315, + "learning_rate": 0.0001662425891156531, + "loss": 0.8288, + "step": 547 + }, + { + "epoch": 0.2922666666666667, + "grad_norm": 0.4019580126071487, + "learning_rate": 0.00016611303739816168, + "loss": 0.7366, + "step": 548 + }, + { + "epoch": 0.2928, + "grad_norm": 0.46632431465167107, + "learning_rate": 0.00016598328825710533, + "loss": 0.7385, + "step": 549 + }, + { + "epoch": 0.29333333333333333, + "grad_norm": 0.3905508336380154, + "learning_rate": 0.00016585334207993476, + "loss": 0.7321, + "step": 550 + }, + { + "epoch": 0.29386666666666666, + "grad_norm": 0.4315518883593066, + "learning_rate": 0.00016572319925468892, + "loss": 0.746, + "step": 551 + }, + { + "epoch": 0.2944, + "grad_norm": 0.5496492145646412, + "learning_rate": 0.000165592860169994, + "loss": 0.8785, + "step": 552 + }, + { + "epoch": 0.2949333333333333, + "grad_norm": 0.3935500643047362, + "learning_rate": 0.0001654623252150624, + "loss": 0.7155, + "step": 553 + }, + { + "epoch": 0.29546666666666666, + "grad_norm": 0.3738068156508414, + "learning_rate": 0.00016533159477969122, + "loss": 0.7019, + "step": 554 + }, + { + "epoch": 0.296, + "grad_norm": 0.4522757933248983, + "learning_rate": 0.00016520066925426144, + "loss": 0.7664, + "step": 555 + }, + { + "epoch": 0.2965333333333333, + "grad_norm": 0.38570043302049833, + "learning_rate": 0.00016506954902973655, + "loss": 0.7314, + "step": 556 + }, + { + "epoch": 0.29706666666666665, + "grad_norm": 0.39520128159650453, + "learning_rate": 0.00016493823449766136, + "loss": 0.7076, + "step": 557 + }, + { + "epoch": 0.2976, + "grad_norm": 0.34875746220500975, + "learning_rate": 0.0001648067260501611, + "loss": 0.6401, + "step": 558 + }, + { + "epoch": 0.2981333333333333, + "grad_norm": 0.3850207062278375, + "learning_rate": 0.00016467502407993992, + "loss": 0.7624, + "step": 559 + }, + { + "epoch": 0.2986666666666667, + "grad_norm": 0.39356618449823616, + "learning_rate": 0.0001645431289802799, + "loss": 0.7934, + "step": 560 + }, + { + "epoch": 0.2992, + "grad_norm": 0.41023404326225077, + "learning_rate": 0.0001644110411450398, + "loss": 0.7533, + "step": 561 + }, + { + "epoch": 0.29973333333333335, + "grad_norm": 0.38185451478416427, + "learning_rate": 0.00016427876096865394, + "loss": 0.6844, + "step": 562 + }, + { + "epoch": 0.3002666666666667, + "grad_norm": 0.43521746572164594, + "learning_rate": 0.00016414628884613107, + "loss": 0.7764, + "step": 563 + }, + { + "epoch": 0.3008, + "grad_norm": 0.37052202040599447, + "learning_rate": 0.00016401362517305296, + "loss": 0.7119, + "step": 564 + }, + { + "epoch": 0.30133333333333334, + "grad_norm": 0.40201740247949785, + "learning_rate": 0.00016388077034557355, + "loss": 0.7365, + "step": 565 + }, + { + "epoch": 0.30186666666666667, + "grad_norm": 0.4173521352111898, + "learning_rate": 0.00016374772476041748, + "loss": 0.7902, + "step": 566 + }, + { + "epoch": 0.3024, + "grad_norm": 0.41543959694153865, + "learning_rate": 0.00016361448881487914, + "loss": 0.7467, + "step": 567 + }, + { + "epoch": 0.30293333333333333, + "grad_norm": 0.46137903716965895, + "learning_rate": 0.00016348106290682118, + "loss": 0.7581, + "step": 568 + }, + { + "epoch": 0.30346666666666666, + "grad_norm": 0.4078436798194787, + "learning_rate": 0.00016334744743467364, + "loss": 0.7116, + "step": 569 + }, + { + "epoch": 0.304, + "grad_norm": 0.4415643373573497, + "learning_rate": 0.00016321364279743266, + "loss": 0.7527, + "step": 570 + }, + { + "epoch": 0.3045333333333333, + "grad_norm": 0.45056751725975336, + "learning_rate": 0.00016307964939465914, + "loss": 0.7513, + "step": 571 + }, + { + "epoch": 0.30506666666666665, + "grad_norm": 0.38398698181612956, + "learning_rate": 0.00016294546762647775, + "loss": 0.7066, + "step": 572 + }, + { + "epoch": 0.3056, + "grad_norm": 0.47607256049205215, + "learning_rate": 0.0001628110978935756, + "loss": 0.8087, + "step": 573 + }, + { + "epoch": 0.3061333333333333, + "grad_norm": 0.46320237969409656, + "learning_rate": 0.0001626765405972011, + "loss": 0.7637, + "step": 574 + }, + { + "epoch": 0.30666666666666664, + "grad_norm": 0.3900926639602743, + "learning_rate": 0.00016254179613916278, + "loss": 0.759, + "step": 575 + }, + { + "epoch": 0.3072, + "grad_norm": 0.3945527842305872, + "learning_rate": 0.00016240686492182804, + "loss": 0.6851, + "step": 576 + }, + { + "epoch": 0.30773333333333336, + "grad_norm": 0.38971042494020103, + "learning_rate": 0.000162271747348122, + "loss": 0.7286, + "step": 577 + }, + { + "epoch": 0.3082666666666667, + "grad_norm": 0.4045725727158994, + "learning_rate": 0.0001621364438215262, + "loss": 0.7351, + "step": 578 + }, + { + "epoch": 0.3088, + "grad_norm": 0.44749498801010373, + "learning_rate": 0.00016200095474607753, + "loss": 0.8119, + "step": 579 + }, + { + "epoch": 0.30933333333333335, + "grad_norm": 0.42103833073666364, + "learning_rate": 0.00016186528052636692, + "loss": 0.7662, + "step": 580 + }, + { + "epoch": 0.3098666666666667, + "grad_norm": 0.3775041905408943, + "learning_rate": 0.0001617294215675382, + "loss": 0.6972, + "step": 581 + }, + { + "epoch": 0.3104, + "grad_norm": 0.43153381420568787, + "learning_rate": 0.00016159337827528685, + "loss": 0.7316, + "step": 582 + }, + { + "epoch": 0.31093333333333334, + "grad_norm": 0.39947290789932155, + "learning_rate": 0.0001614571510558588, + "loss": 0.7309, + "step": 583 + }, + { + "epoch": 0.31146666666666667, + "grad_norm": 0.45729284062204945, + "learning_rate": 0.00016132074031604917, + "loss": 0.6902, + "step": 584 + }, + { + "epoch": 0.312, + "grad_norm": 0.48660864694329276, + "learning_rate": 0.0001611841464632011, + "loss": 0.8257, + "step": 585 + }, + { + "epoch": 0.31253333333333333, + "grad_norm": 0.4825049084275357, + "learning_rate": 0.00016104736990520468, + "loss": 0.8216, + "step": 586 + }, + { + "epoch": 0.31306666666666666, + "grad_norm": 0.3935027451385052, + "learning_rate": 0.0001609104110504954, + "loss": 0.6903, + "step": 587 + }, + { + "epoch": 0.3136, + "grad_norm": 0.42125994179094617, + "learning_rate": 0.0001607732703080532, + "loss": 0.724, + "step": 588 + }, + { + "epoch": 0.3141333333333333, + "grad_norm": 0.44333295681260426, + "learning_rate": 0.00016063594808740113, + "loss": 0.6565, + "step": 589 + }, + { + "epoch": 0.31466666666666665, + "grad_norm": 0.4313425441161452, + "learning_rate": 0.00016049844479860422, + "loss": 0.7675, + "step": 590 + }, + { + "epoch": 0.3152, + "grad_norm": 0.3772164435988223, + "learning_rate": 0.00016036076085226814, + "loss": 0.6755, + "step": 591 + }, + { + "epoch": 0.3157333333333333, + "grad_norm": 0.42915178129946013, + "learning_rate": 0.00016022289665953808, + "loss": 0.7515, + "step": 592 + }, + { + "epoch": 0.31626666666666664, + "grad_norm": 0.4184007203040218, + "learning_rate": 0.00016008485263209742, + "loss": 0.7358, + "step": 593 + }, + { + "epoch": 0.3168, + "grad_norm": 0.40765011752081065, + "learning_rate": 0.0001599466291821666, + "loss": 0.7265, + "step": 594 + }, + { + "epoch": 0.31733333333333336, + "grad_norm": 0.3951190521913299, + "learning_rate": 0.0001598082267225018, + "loss": 0.6542, + "step": 595 + }, + { + "epoch": 0.3178666666666667, + "grad_norm": 0.3855910412512755, + "learning_rate": 0.0001596696456663938, + "loss": 0.7006, + "step": 596 + }, + { + "epoch": 0.3184, + "grad_norm": 0.42424025815136057, + "learning_rate": 0.0001595308864276666, + "loss": 0.7634, + "step": 597 + }, + { + "epoch": 0.31893333333333335, + "grad_norm": 0.3877096634029584, + "learning_rate": 0.00015939194942067646, + "loss": 0.7645, + "step": 598 + }, + { + "epoch": 0.3194666666666667, + "grad_norm": 0.4755030554081805, + "learning_rate": 0.0001592528350603103, + "loss": 0.7008, + "step": 599 + }, + { + "epoch": 0.32, + "grad_norm": 0.3945273065026347, + "learning_rate": 0.0001591135437619847, + "loss": 0.7261, + "step": 600 + }, + { + "epoch": 0.32053333333333334, + "grad_norm": 0.4322918940947344, + "learning_rate": 0.00015897407594164467, + "loss": 0.7855, + "step": 601 + }, + { + "epoch": 0.32106666666666667, + "grad_norm": 0.4008059808950541, + "learning_rate": 0.00015883443201576225, + "loss": 0.7038, + "step": 602 + }, + { + "epoch": 0.3216, + "grad_norm": 0.42735681713122126, + "learning_rate": 0.0001586946124013354, + "loss": 0.7303, + "step": 603 + }, + { + "epoch": 0.3221333333333333, + "grad_norm": 0.44799972860329035, + "learning_rate": 0.00015855461751588677, + "loss": 0.7592, + "step": 604 + }, + { + "epoch": 0.32266666666666666, + "grad_norm": 0.429338520686409, + "learning_rate": 0.0001584144477774623, + "loss": 0.7074, + "step": 605 + }, + { + "epoch": 0.3232, + "grad_norm": 0.4465954476240394, + "learning_rate": 0.0001582741036046301, + "loss": 0.7319, + "step": 606 + }, + { + "epoch": 0.3237333333333333, + "grad_norm": 0.4900134635713606, + "learning_rate": 0.00015813358541647915, + "loss": 0.8457, + "step": 607 + }, + { + "epoch": 0.32426666666666665, + "grad_norm": 0.4207105842197505, + "learning_rate": 0.00015799289363261813, + "loss": 0.7544, + "step": 608 + }, + { + "epoch": 0.3248, + "grad_norm": 0.42451770322137466, + "learning_rate": 0.00015785202867317407, + "loss": 0.7044, + "step": 609 + }, + { + "epoch": 0.3253333333333333, + "grad_norm": 0.3589686471116743, + "learning_rate": 0.00015771099095879108, + "loss": 0.6743, + "step": 610 + }, + { + "epoch": 0.3258666666666667, + "grad_norm": 0.4450708223693317, + "learning_rate": 0.0001575697809106292, + "loss": 0.71, + "step": 611 + }, + { + "epoch": 0.3264, + "grad_norm": 0.40686694321392736, + "learning_rate": 0.00015742839895036305, + "loss": 0.688, + "step": 612 + }, + { + "epoch": 0.32693333333333335, + "grad_norm": 0.39753103284223834, + "learning_rate": 0.00015728684550018064, + "loss": 0.7351, + "step": 613 + }, + { + "epoch": 0.3274666666666667, + "grad_norm": 0.409828206041139, + "learning_rate": 0.0001571451209827821, + "loss": 0.745, + "step": 614 + }, + { + "epoch": 0.328, + "grad_norm": 0.3667751581575778, + "learning_rate": 0.00015700322582137827, + "loss": 0.6701, + "step": 615 + }, + { + "epoch": 0.32853333333333334, + "grad_norm": 0.4484573939780761, + "learning_rate": 0.00015686116043968972, + "loss": 0.7822, + "step": 616 + }, + { + "epoch": 0.3290666666666667, + "grad_norm": 0.4242610062223099, + "learning_rate": 0.00015671892526194516, + "loss": 0.7993, + "step": 617 + }, + { + "epoch": 0.3296, + "grad_norm": 0.4532416341596631, + "learning_rate": 0.0001565765207128805, + "loss": 0.8287, + "step": 618 + }, + { + "epoch": 0.33013333333333333, + "grad_norm": 0.4887705723429555, + "learning_rate": 0.0001564339472177373, + "loss": 0.792, + "step": 619 + }, + { + "epoch": 0.33066666666666666, + "grad_norm": 0.48812937766764497, + "learning_rate": 0.00015629120520226165, + "loss": 0.7167, + "step": 620 + }, + { + "epoch": 0.3312, + "grad_norm": 0.40975242626263575, + "learning_rate": 0.0001561482950927029, + "loss": 0.6962, + "step": 621 + }, + { + "epoch": 0.3317333333333333, + "grad_norm": 0.3697513759304386, + "learning_rate": 0.0001560052173158123, + "loss": 0.7037, + "step": 622 + }, + { + "epoch": 0.33226666666666665, + "grad_norm": 0.4373933446685291, + "learning_rate": 0.00015586197229884184, + "loss": 0.7209, + "step": 623 + }, + { + "epoch": 0.3328, + "grad_norm": 0.3852841466947889, + "learning_rate": 0.00015571856046954285, + "loss": 0.7435, + "step": 624 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.44567788187910024, + "learning_rate": 0.00015557498225616487, + "loss": 0.7432, + "step": 625 + }, + { + "epoch": 0.33386666666666664, + "grad_norm": 0.4685861065616478, + "learning_rate": 0.0001554312380874542, + "loss": 0.8281, + "step": 626 + }, + { + "epoch": 0.3344, + "grad_norm": 0.5071683487190545, + "learning_rate": 0.00015528732839265272, + "loss": 0.8902, + "step": 627 + }, + { + "epoch": 0.33493333333333336, + "grad_norm": 0.4164121017068378, + "learning_rate": 0.00015514325360149668, + "loss": 0.7467, + "step": 628 + }, + { + "epoch": 0.3354666666666667, + "grad_norm": 0.4111203128036995, + "learning_rate": 0.0001549990141442153, + "loss": 0.7431, + "step": 629 + }, + { + "epoch": 0.336, + "grad_norm": 0.4100653264715916, + "learning_rate": 0.0001548546104515294, + "loss": 0.7005, + "step": 630 + }, + { + "epoch": 0.33653333333333335, + "grad_norm": 0.452828147400987, + "learning_rate": 0.00015471004295465035, + "loss": 0.7994, + "step": 631 + }, + { + "epoch": 0.3370666666666667, + "grad_norm": 0.40800931677571645, + "learning_rate": 0.0001545653120852787, + "loss": 0.7733, + "step": 632 + }, + { + "epoch": 0.3376, + "grad_norm": 0.44311289069816373, + "learning_rate": 0.00015442041827560274, + "loss": 0.7218, + "step": 633 + }, + { + "epoch": 0.33813333333333334, + "grad_norm": 0.4271579407616302, + "learning_rate": 0.00015427536195829742, + "loss": 0.8083, + "step": 634 + }, + { + "epoch": 0.33866666666666667, + "grad_norm": 0.5112305632950536, + "learning_rate": 0.00015413014356652286, + "loss": 0.8084, + "step": 635 + }, + { + "epoch": 0.3392, + "grad_norm": 0.4006633008920473, + "learning_rate": 0.00015398476353392323, + "loss": 0.7126, + "step": 636 + }, + { + "epoch": 0.33973333333333333, + "grad_norm": 0.40727304965830635, + "learning_rate": 0.00015383922229462549, + "loss": 0.7368, + "step": 637 + }, + { + "epoch": 0.34026666666666666, + "grad_norm": 0.477949911384884, + "learning_rate": 0.00015369352028323774, + "loss": 0.8195, + "step": 638 + }, + { + "epoch": 0.3408, + "grad_norm": 0.4526096288861997, + "learning_rate": 0.00015354765793484834, + "loss": 0.8008, + "step": 639 + }, + { + "epoch": 0.3413333333333333, + "grad_norm": 0.4512229875267074, + "learning_rate": 0.0001534016356850244, + "loss": 0.7436, + "step": 640 + }, + { + "epoch": 0.34186666666666665, + "grad_norm": 0.43480158539272745, + "learning_rate": 0.0001532554539698105, + "loss": 0.7677, + "step": 641 + }, + { + "epoch": 0.3424, + "grad_norm": 0.39829699026731086, + "learning_rate": 0.00015310911322572753, + "loss": 0.7303, + "step": 642 + }, + { + "epoch": 0.3429333333333333, + "grad_norm": 0.43235734102891127, + "learning_rate": 0.00015296261388977108, + "loss": 0.7614, + "step": 643 + }, + { + "epoch": 0.34346666666666664, + "grad_norm": 0.40414847698605944, + "learning_rate": 0.0001528159563994104, + "loss": 0.7194, + "step": 644 + }, + { + "epoch": 0.344, + "grad_norm": 0.42756992399038113, + "learning_rate": 0.000152669141192587, + "loss": 0.7621, + "step": 645 + }, + { + "epoch": 0.34453333333333336, + "grad_norm": 0.40257715695337964, + "learning_rate": 0.00015252216870771345, + "loss": 0.7167, + "step": 646 + }, + { + "epoch": 0.3450666666666667, + "grad_norm": 0.43021398031232083, + "learning_rate": 0.00015237503938367186, + "loss": 0.7408, + "step": 647 + }, + { + "epoch": 0.3456, + "grad_norm": 0.3759716060216146, + "learning_rate": 0.00015222775365981273, + "loss": 0.6591, + "step": 648 + }, + { + "epoch": 0.34613333333333335, + "grad_norm": 0.4931180390492152, + "learning_rate": 0.00015208031197595356, + "loss": 0.7957, + "step": 649 + }, + { + "epoch": 0.3466666666666667, + "grad_norm": 0.4355637206307468, + "learning_rate": 0.0001519327147723776, + "loss": 0.7681, + "step": 650 + }, + { + "epoch": 0.3472, + "grad_norm": 0.37464229247023656, + "learning_rate": 0.00015178496248983254, + "loss": 0.6569, + "step": 651 + }, + { + "epoch": 0.34773333333333334, + "grad_norm": 0.4020759123911303, + "learning_rate": 0.0001516370555695291, + "loss": 0.7167, + "step": 652 + }, + { + "epoch": 0.34826666666666667, + "grad_norm": 0.4221289896570997, + "learning_rate": 0.00015148899445313981, + "loss": 0.7557, + "step": 653 + }, + { + "epoch": 0.3488, + "grad_norm": 0.3939997279620741, + "learning_rate": 0.00015134077958279765, + "loss": 0.6649, + "step": 654 + }, + { + "epoch": 0.34933333333333333, + "grad_norm": 0.3917110467034314, + "learning_rate": 0.00015119241140109467, + "loss": 0.7077, + "step": 655 + }, + { + "epoch": 0.34986666666666666, + "grad_norm": 0.43924193247345256, + "learning_rate": 0.00015104389035108077, + "loss": 0.7942, + "step": 656 + }, + { + "epoch": 0.3504, + "grad_norm": 0.3969193877305016, + "learning_rate": 0.00015089521687626243, + "loss": 0.6673, + "step": 657 + }, + { + "epoch": 0.3509333333333333, + "grad_norm": 0.45038221706615134, + "learning_rate": 0.0001507463914206012, + "loss": 0.7246, + "step": 658 + }, + { + "epoch": 0.35146666666666665, + "grad_norm": 0.42164306602391805, + "learning_rate": 0.0001505974144285124, + "loss": 0.7425, + "step": 659 + }, + { + "epoch": 0.352, + "grad_norm": 0.40140230723508524, + "learning_rate": 0.000150448286344864, + "loss": 0.6991, + "step": 660 + }, + { + "epoch": 0.3525333333333333, + "grad_norm": 0.4750484854460713, + "learning_rate": 0.00015029900761497506, + "loss": 0.7824, + "step": 661 + }, + { + "epoch": 0.35306666666666664, + "grad_norm": 0.4455116350805326, + "learning_rate": 0.00015014957868461458, + "loss": 0.8367, + "step": 662 + }, + { + "epoch": 0.3536, + "grad_norm": 0.44618110095389835, + "learning_rate": 0.00015000000000000001, + "loss": 0.7152, + "step": 663 + }, + { + "epoch": 0.35413333333333336, + "grad_norm": 0.4475622955097561, + "learning_rate": 0.000149850272007796, + "loss": 0.761, + "step": 664 + }, + { + "epoch": 0.3546666666666667, + "grad_norm": 0.4359749671738415, + "learning_rate": 0.00014970039515511304, + "loss": 0.7621, + "step": 665 + }, + { + "epoch": 0.3552, + "grad_norm": 0.38747409675067007, + "learning_rate": 0.00014955036988950618, + "loss": 0.7391, + "step": 666 + }, + { + "epoch": 0.35573333333333335, + "grad_norm": 0.4164598379253978, + "learning_rate": 0.0001494001966589736, + "loss": 0.7612, + "step": 667 + }, + { + "epoch": 0.3562666666666667, + "grad_norm": 0.4229031544604517, + "learning_rate": 0.00014924987591195547, + "loss": 0.7445, + "step": 668 + }, + { + "epoch": 0.3568, + "grad_norm": 0.3673383598153167, + "learning_rate": 0.00014909940809733222, + "loss": 0.7138, + "step": 669 + }, + { + "epoch": 0.35733333333333334, + "grad_norm": 0.39070215256485175, + "learning_rate": 0.0001489487936644237, + "loss": 0.7118, + "step": 670 + }, + { + "epoch": 0.35786666666666667, + "grad_norm": 0.42452823633461184, + "learning_rate": 0.00014879803306298736, + "loss": 0.7388, + "step": 671 + }, + { + "epoch": 0.3584, + "grad_norm": 0.4263707701419675, + "learning_rate": 0.00014864712674321734, + "loss": 0.7526, + "step": 672 + }, + { + "epoch": 0.3589333333333333, + "grad_norm": 0.3689626812113726, + "learning_rate": 0.00014849607515574276, + "loss": 0.7108, + "step": 673 + }, + { + "epoch": 0.35946666666666666, + "grad_norm": 0.41376531374820275, + "learning_rate": 0.00014834487875162657, + "loss": 0.7284, + "step": 674 + }, + { + "epoch": 0.36, + "grad_norm": 0.3923399191140614, + "learning_rate": 0.00014819353798236427, + "loss": 0.7255, + "step": 675 + }, + { + "epoch": 0.3605333333333333, + "grad_norm": 0.4748090626326111, + "learning_rate": 0.00014804205329988225, + "loss": 0.7893, + "step": 676 + }, + { + "epoch": 0.36106666666666665, + "grad_norm": 0.4158425725741022, + "learning_rate": 0.00014789042515653687, + "loss": 0.7224, + "step": 677 + }, + { + "epoch": 0.3616, + "grad_norm": 0.4459627005422889, + "learning_rate": 0.00014773865400511272, + "loss": 0.7157, + "step": 678 + }, + { + "epoch": 0.3621333333333333, + "grad_norm": 0.4273511318023322, + "learning_rate": 0.00014758674029882152, + "loss": 0.7572, + "step": 679 + }, + { + "epoch": 0.3626666666666667, + "grad_norm": 0.37482426750995995, + "learning_rate": 0.00014743468449130063, + "loss": 0.6706, + "step": 680 + }, + { + "epoch": 0.3632, + "grad_norm": 0.4006428451318836, + "learning_rate": 0.00014728248703661182, + "loss": 0.7872, + "step": 681 + }, + { + "epoch": 0.36373333333333335, + "grad_norm": 0.4083100773492591, + "learning_rate": 0.00014713014838923976, + "loss": 0.7192, + "step": 682 + }, + { + "epoch": 0.3642666666666667, + "grad_norm": 0.3265496595368322, + "learning_rate": 0.00014697766900409074, + "loss": 0.6042, + "step": 683 + }, + { + "epoch": 0.3648, + "grad_norm": 0.5120580846177375, + "learning_rate": 0.00014682504933649144, + "loss": 0.8355, + "step": 684 + }, + { + "epoch": 0.36533333333333334, + "grad_norm": 0.39935262745294176, + "learning_rate": 0.0001466722898421873, + "loss": 0.6913, + "step": 685 + }, + { + "epoch": 0.3658666666666667, + "grad_norm": 0.38092835117157264, + "learning_rate": 0.0001465193909773413, + "loss": 0.7005, + "step": 686 + }, + { + "epoch": 0.3664, + "grad_norm": 0.386526250523854, + "learning_rate": 0.00014636635319853275, + "loss": 0.6555, + "step": 687 + }, + { + "epoch": 0.36693333333333333, + "grad_norm": 0.4667619972650561, + "learning_rate": 0.00014621317696275564, + "loss": 0.8299, + "step": 688 + }, + { + "epoch": 0.36746666666666666, + "grad_norm": 0.3839903028142022, + "learning_rate": 0.00014605986272741748, + "loss": 0.6692, + "step": 689 + }, + { + "epoch": 0.368, + "grad_norm": 0.48381307084576924, + "learning_rate": 0.00014590641095033787, + "loss": 0.7754, + "step": 690 + }, + { + "epoch": 0.3685333333333333, + "grad_norm": 0.45713833522429165, + "learning_rate": 0.00014575282208974702, + "loss": 0.7679, + "step": 691 + }, + { + "epoch": 0.36906666666666665, + "grad_norm": 0.4131366758433008, + "learning_rate": 0.00014559909660428468, + "loss": 0.679, + "step": 692 + }, + { + "epoch": 0.3696, + "grad_norm": 0.38089852736252117, + "learning_rate": 0.00014544523495299842, + "loss": 0.7627, + "step": 693 + }, + { + "epoch": 0.3701333333333333, + "grad_norm": 0.3699847132301396, + "learning_rate": 0.00014529123759534255, + "loss": 0.645, + "step": 694 + }, + { + "epoch": 0.37066666666666664, + "grad_norm": 0.3696014091782039, + "learning_rate": 0.00014513710499117647, + "loss": 0.7249, + "step": 695 + }, + { + "epoch": 0.3712, + "grad_norm": 0.5056206052640094, + "learning_rate": 0.0001449828376007636, + "loss": 0.8714, + "step": 696 + }, + { + "epoch": 0.37173333333333336, + "grad_norm": 0.539932145998434, + "learning_rate": 0.00014482843588476974, + "loss": 0.7293, + "step": 697 + }, + { + "epoch": 0.3722666666666667, + "grad_norm": 0.3700863336476834, + "learning_rate": 0.00014467390030426186, + "loss": 0.6846, + "step": 698 + }, + { + "epoch": 0.3728, + "grad_norm": 0.42855540848732004, + "learning_rate": 0.0001445192313207067, + "loss": 0.6818, + "step": 699 + }, + { + "epoch": 0.37333333333333335, + "grad_norm": 0.4371717624109415, + "learning_rate": 0.0001443644293959693, + "loss": 0.7209, + "step": 700 + }, + { + "epoch": 0.3738666666666667, + "grad_norm": 0.4705215508951361, + "learning_rate": 0.00014420949499231172, + "loss": 0.7311, + "step": 701 + }, + { + "epoch": 0.3744, + "grad_norm": 0.4458159431732122, + "learning_rate": 0.0001440544285723915, + "loss": 0.7703, + "step": 702 + }, + { + "epoch": 0.37493333333333334, + "grad_norm": 0.3917851894183172, + "learning_rate": 0.00014389923059926062, + "loss": 0.6679, + "step": 703 + }, + { + "epoch": 0.37546666666666667, + "grad_norm": 0.3580350158463694, + "learning_rate": 0.0001437439015363638, + "loss": 0.6806, + "step": 704 + }, + { + "epoch": 0.376, + "grad_norm": 0.5055093024285995, + "learning_rate": 0.00014358844184753712, + "loss": 0.8129, + "step": 705 + }, + { + "epoch": 0.37653333333333333, + "grad_norm": 0.3572657850969502, + "learning_rate": 0.00014343285199700683, + "loss": 0.646, + "step": 706 + }, + { + "epoch": 0.37706666666666666, + "grad_norm": 0.35043563922430143, + "learning_rate": 0.0001432771324493879, + "loss": 0.6541, + "step": 707 + }, + { + "epoch": 0.3776, + "grad_norm": 0.46945371784425727, + "learning_rate": 0.00014312128366968243, + "loss": 0.8082, + "step": 708 + }, + { + "epoch": 0.3781333333333333, + "grad_norm": 0.3687691495275247, + "learning_rate": 0.00014296530612327863, + "loss": 0.6711, + "step": 709 + }, + { + "epoch": 0.37866666666666665, + "grad_norm": 0.40667822947642723, + "learning_rate": 0.00014280920027594907, + "loss": 0.7518, + "step": 710 + }, + { + "epoch": 0.3792, + "grad_norm": 0.43797146761055344, + "learning_rate": 0.00014265296659384956, + "loss": 0.7835, + "step": 711 + }, + { + "epoch": 0.3797333333333333, + "grad_norm": 0.40210353606465987, + "learning_rate": 0.00014249660554351752, + "loss": 0.6762, + "step": 712 + }, + { + "epoch": 0.38026666666666664, + "grad_norm": 0.4270392278398279, + "learning_rate": 0.00014234011759187083, + "loss": 0.7539, + "step": 713 + }, + { + "epoch": 0.3808, + "grad_norm": 0.392145917174963, + "learning_rate": 0.00014218350320620624, + "loss": 0.7186, + "step": 714 + }, + { + "epoch": 0.38133333333333336, + "grad_norm": 0.38864957905158737, + "learning_rate": 0.00014202676285419812, + "loss": 0.7011, + "step": 715 + }, + { + "epoch": 0.3818666666666667, + "grad_norm": 0.39289194672110705, + "learning_rate": 0.00014186989700389687, + "loss": 0.7745, + "step": 716 + }, + { + "epoch": 0.3824, + "grad_norm": 0.3708892524080064, + "learning_rate": 0.0001417129061237278, + "loss": 0.6661, + "step": 717 + }, + { + "epoch": 0.38293333333333335, + "grad_norm": 0.3585159080195186, + "learning_rate": 0.0001415557906824895, + "loss": 0.6753, + "step": 718 + }, + { + "epoch": 0.3834666666666667, + "grad_norm": 0.40272715048225605, + "learning_rate": 0.00014139855114935252, + "loss": 0.7383, + "step": 719 + }, + { + "epoch": 0.384, + "grad_norm": 0.41579811759066937, + "learning_rate": 0.00014124118799385796, + "loss": 0.7354, + "step": 720 + }, + { + "epoch": 0.38453333333333334, + "grad_norm": 0.32887259036258953, + "learning_rate": 0.0001410837016859161, + "loss": 0.5956, + "step": 721 + }, + { + "epoch": 0.38506666666666667, + "grad_norm": 0.3603762966046134, + "learning_rate": 0.00014092609269580496, + "loss": 0.6439, + "step": 722 + }, + { + "epoch": 0.3856, + "grad_norm": 0.38779763759157626, + "learning_rate": 0.00014076836149416887, + "loss": 0.6249, + "step": 723 + }, + { + "epoch": 0.38613333333333333, + "grad_norm": 0.4327625662612419, + "learning_rate": 0.00014061050855201723, + "loss": 0.7579, + "step": 724 + }, + { + "epoch": 0.38666666666666666, + "grad_norm": 0.44107105407560276, + "learning_rate": 0.0001404525343407228, + "loss": 0.7354, + "step": 725 + }, + { + "epoch": 0.3872, + "grad_norm": 0.48944825347676674, + "learning_rate": 0.0001402944393320206, + "loss": 0.7462, + "step": 726 + }, + { + "epoch": 0.3877333333333333, + "grad_norm": 0.47187198334662633, + "learning_rate": 0.00014013622399800627, + "loss": 0.7667, + "step": 727 + }, + { + "epoch": 0.38826666666666665, + "grad_norm": 0.3811573487925462, + "learning_rate": 0.00013997788881113489, + "loss": 0.729, + "step": 728 + }, + { + "epoch": 0.3888, + "grad_norm": 0.3498926333357584, + "learning_rate": 0.00013981943424421932, + "loss": 0.6226, + "step": 729 + }, + { + "epoch": 0.3893333333333333, + "grad_norm": 0.3645109010829484, + "learning_rate": 0.0001396608607704289, + "loss": 0.6822, + "step": 730 + }, + { + "epoch": 0.38986666666666664, + "grad_norm": 0.45291067357258424, + "learning_rate": 0.0001395021688632882, + "loss": 0.7952, + "step": 731 + }, + { + "epoch": 0.3904, + "grad_norm": 0.39190670348598583, + "learning_rate": 0.00013934335899667527, + "loss": 0.703, + "step": 732 + }, + { + "epoch": 0.39093333333333335, + "grad_norm": 0.38835265989584566, + "learning_rate": 0.00013918443164482046, + "loss": 0.6677, + "step": 733 + }, + { + "epoch": 0.3914666666666667, + "grad_norm": 0.40703525351336206, + "learning_rate": 0.000139025387282305, + "loss": 0.805, + "step": 734 + }, + { + "epoch": 0.392, + "grad_norm": 0.4110630421712686, + "learning_rate": 0.00013886622638405952, + "loss": 0.7354, + "step": 735 + }, + { + "epoch": 0.39253333333333335, + "grad_norm": 0.3840940545901596, + "learning_rate": 0.0001387069494253626, + "loss": 0.7197, + "step": 736 + }, + { + "epoch": 0.3930666666666667, + "grad_norm": 0.4467381460243381, + "learning_rate": 0.0001385475568818394, + "loss": 0.7054, + "step": 737 + }, + { + "epoch": 0.3936, + "grad_norm": 0.46166401359335174, + "learning_rate": 0.00013838804922946027, + "loss": 0.7335, + "step": 738 + }, + { + "epoch": 0.39413333333333334, + "grad_norm": 0.36351310619125765, + "learning_rate": 0.00013822842694453924, + "loss": 0.667, + "step": 739 + }, + { + "epoch": 0.39466666666666667, + "grad_norm": 0.38036769676459325, + "learning_rate": 0.0001380686905037327, + "loss": 0.7207, + "step": 740 + }, + { + "epoch": 0.3952, + "grad_norm": 0.41443306808069247, + "learning_rate": 0.00013790884038403795, + "loss": 0.7301, + "step": 741 + }, + { + "epoch": 0.3957333333333333, + "grad_norm": 0.3976722254079516, + "learning_rate": 0.00013774887706279165, + "loss": 0.7417, + "step": 742 + }, + { + "epoch": 0.39626666666666666, + "grad_norm": 0.42931174429527336, + "learning_rate": 0.0001375888010176686, + "loss": 0.7351, + "step": 743 + }, + { + "epoch": 0.3968, + "grad_norm": 0.4231884462170058, + "learning_rate": 0.00013742861272668012, + "loss": 0.7328, + "step": 744 + }, + { + "epoch": 0.3973333333333333, + "grad_norm": 0.37324639296983, + "learning_rate": 0.00013726831266817278, + "loss": 0.659, + "step": 745 + }, + { + "epoch": 0.39786666666666665, + "grad_norm": 0.3905727457838949, + "learning_rate": 0.00013710790132082692, + "loss": 0.6958, + "step": 746 + }, + { + "epoch": 0.3984, + "grad_norm": 0.40036871893220416, + "learning_rate": 0.00013694737916365517, + "loss": 0.7091, + "step": 747 + }, + { + "epoch": 0.3989333333333333, + "grad_norm": 0.38528150187429655, + "learning_rate": 0.00013678674667600102, + "loss": 0.6755, + "step": 748 + }, + { + "epoch": 0.3994666666666667, + "grad_norm": 0.3797882884409939, + "learning_rate": 0.00013662600433753745, + "loss": 0.6851, + "step": 749 + }, + { + "epoch": 0.4, + "grad_norm": 0.44454612812770267, + "learning_rate": 0.00013646515262826552, + "loss": 0.7884, + "step": 750 + }, + { + "epoch": 0.40053333333333335, + "grad_norm": 0.4253589924655092, + "learning_rate": 0.00013630419202851284, + "loss": 0.6989, + "step": 751 + }, + { + "epoch": 0.4010666666666667, + "grad_norm": 0.47466516275733234, + "learning_rate": 0.00013614312301893223, + "loss": 0.6635, + "step": 752 + }, + { + "epoch": 0.4016, + "grad_norm": 0.4277493972756891, + "learning_rate": 0.0001359819460805001, + "loss": 0.7397, + "step": 753 + }, + { + "epoch": 0.40213333333333334, + "grad_norm": 0.4150799867682261, + "learning_rate": 0.00013582066169451535, + "loss": 0.7359, + "step": 754 + }, + { + "epoch": 0.4026666666666667, + "grad_norm": 0.4079182840730704, + "learning_rate": 0.0001356592703425976, + "loss": 0.7243, + "step": 755 + }, + { + "epoch": 0.4032, + "grad_norm": 0.3961983582766719, + "learning_rate": 0.0001354977725066859, + "loss": 0.7152, + "step": 756 + }, + { + "epoch": 0.40373333333333333, + "grad_norm": 0.37186248794663584, + "learning_rate": 0.00013533616866903735, + "loss": 0.6974, + "step": 757 + }, + { + "epoch": 0.40426666666666666, + "grad_norm": 0.43562388220908504, + "learning_rate": 0.0001351744593122255, + "loss": 0.721, + "step": 758 + }, + { + "epoch": 0.4048, + "grad_norm": 0.4005835205040354, + "learning_rate": 0.00013501264491913906, + "loss": 0.7137, + "step": 759 + }, + { + "epoch": 0.4053333333333333, + "grad_norm": 0.3687616415807105, + "learning_rate": 0.00013485072597298038, + "loss": 0.6721, + "step": 760 + }, + { + "epoch": 0.40586666666666665, + "grad_norm": 0.4346945852782822, + "learning_rate": 0.00013468870295726398, + "loss": 0.7192, + "step": 761 + }, + { + "epoch": 0.4064, + "grad_norm": 0.40119708767717255, + "learning_rate": 0.0001345265763558152, + "loss": 0.7319, + "step": 762 + }, + { + "epoch": 0.4069333333333333, + "grad_norm": 0.40914752470278004, + "learning_rate": 0.00013436434665276865, + "loss": 0.725, + "step": 763 + }, + { + "epoch": 0.40746666666666664, + "grad_norm": 0.4515602601088684, + "learning_rate": 0.00013420201433256689, + "loss": 0.7346, + "step": 764 + }, + { + "epoch": 0.408, + "grad_norm": 0.41088961193235307, + "learning_rate": 0.00013403957987995882, + "loss": 0.7259, + "step": 765 + }, + { + "epoch": 0.40853333333333336, + "grad_norm": 0.41329628712878974, + "learning_rate": 0.00013387704377999842, + "loss": 0.6726, + "step": 766 + }, + { + "epoch": 0.4090666666666667, + "grad_norm": 0.37944231545414464, + "learning_rate": 0.00013371440651804313, + "loss": 0.6497, + "step": 767 + }, + { + "epoch": 0.4096, + "grad_norm": 0.39338187736531505, + "learning_rate": 0.0001335516685797525, + "loss": 0.6974, + "step": 768 + }, + { + "epoch": 0.41013333333333335, + "grad_norm": 0.3584335156406577, + "learning_rate": 0.00013338883045108674, + "loss": 0.6539, + "step": 769 + }, + { + "epoch": 0.4106666666666667, + "grad_norm": 0.3996459812552077, + "learning_rate": 0.00013322589261830517, + "loss": 0.7899, + "step": 770 + }, + { + "epoch": 0.4112, + "grad_norm": 0.4186247409116387, + "learning_rate": 0.00013306285556796495, + "loss": 0.7389, + "step": 771 + }, + { + "epoch": 0.41173333333333334, + "grad_norm": 0.45203679024896604, + "learning_rate": 0.0001328997197869194, + "loss": 0.7771, + "step": 772 + }, + { + "epoch": 0.41226666666666667, + "grad_norm": 0.4127072642765617, + "learning_rate": 0.0001327364857623168, + "loss": 0.7293, + "step": 773 + }, + { + "epoch": 0.4128, + "grad_norm": 0.40356005146175844, + "learning_rate": 0.00013257315398159864, + "loss": 0.7156, + "step": 774 + }, + { + "epoch": 0.41333333333333333, + "grad_norm": 0.39594502540516324, + "learning_rate": 0.00013240972493249847, + "loss": 0.694, + "step": 775 + }, + { + "epoch": 0.41386666666666666, + "grad_norm": 0.40434994474435, + "learning_rate": 0.0001322461991030402, + "loss": 0.6816, + "step": 776 + }, + { + "epoch": 0.4144, + "grad_norm": 0.4323229255072922, + "learning_rate": 0.00013208257698153677, + "loss": 0.7809, + "step": 777 + }, + { + "epoch": 0.4149333333333333, + "grad_norm": 0.3869847380684972, + "learning_rate": 0.00013191885905658872, + "loss": 0.6869, + "step": 778 + }, + { + "epoch": 0.41546666666666665, + "grad_norm": 0.4210267062837234, + "learning_rate": 0.0001317550458170826, + "loss": 0.7502, + "step": 779 + }, + { + "epoch": 0.416, + "grad_norm": 0.4063539287165066, + "learning_rate": 0.00013159113775218964, + "loss": 0.723, + "step": 780 + }, + { + "epoch": 0.4165333333333333, + "grad_norm": 0.3740075391352541, + "learning_rate": 0.00013142713535136414, + "loss": 0.6909, + "step": 781 + }, + { + "epoch": 0.41706666666666664, + "grad_norm": 0.41440904326048766, + "learning_rate": 0.00013126303910434214, + "loss": 0.6842, + "step": 782 + }, + { + "epoch": 0.4176, + "grad_norm": 0.38046217217993816, + "learning_rate": 0.00013109884950114007, + "loss": 0.6394, + "step": 783 + }, + { + "epoch": 0.41813333333333336, + "grad_norm": 0.4247582721688887, + "learning_rate": 0.00013093456703205288, + "loss": 0.683, + "step": 784 + }, + { + "epoch": 0.4186666666666667, + "grad_norm": 0.4434915669750256, + "learning_rate": 0.00013077019218765305, + "loss": 0.7719, + "step": 785 + }, + { + "epoch": 0.4192, + "grad_norm": 0.4982210512879041, + "learning_rate": 0.00013060572545878875, + "loss": 0.8067, + "step": 786 + }, + { + "epoch": 0.41973333333333335, + "grad_norm": 0.371623762692025, + "learning_rate": 0.0001304411673365826, + "loss": 0.658, + "step": 787 + }, + { + "epoch": 0.4202666666666667, + "grad_norm": 0.48032846665453766, + "learning_rate": 0.0001302765183124302, + "loss": 0.7838, + "step": 788 + }, + { + "epoch": 0.4208, + "grad_norm": 0.46365591562736086, + "learning_rate": 0.00013011177887799845, + "loss": 0.7915, + "step": 789 + }, + { + "epoch": 0.42133333333333334, + "grad_norm": 0.46413594665498614, + "learning_rate": 0.00012994694952522435, + "loss": 0.7562, + "step": 790 + }, + { + "epoch": 0.42186666666666667, + "grad_norm": 0.42736336506004996, + "learning_rate": 0.00012978203074631334, + "loss": 0.7377, + "step": 791 + }, + { + "epoch": 0.4224, + "grad_norm": 0.3786148512927807, + "learning_rate": 0.00012961702303373795, + "loss": 0.7068, + "step": 792 + }, + { + "epoch": 0.42293333333333333, + "grad_norm": 0.4600288748192486, + "learning_rate": 0.00012945192688023624, + "loss": 0.8246, + "step": 793 + }, + { + "epoch": 0.42346666666666666, + "grad_norm": 0.4337822021767261, + "learning_rate": 0.0001292867427788104, + "loss": 0.7531, + "step": 794 + }, + { + "epoch": 0.424, + "grad_norm": 0.3747108498459979, + "learning_rate": 0.00012912147122272523, + "loss": 0.6639, + "step": 795 + }, + { + "epoch": 0.4245333333333333, + "grad_norm": 0.3738286018775399, + "learning_rate": 0.00012895611270550666, + "loss": 0.7045, + "step": 796 + }, + { + "epoch": 0.42506666666666665, + "grad_norm": 0.3924295584496147, + "learning_rate": 0.0001287906677209403, + "loss": 0.7193, + "step": 797 + }, + { + "epoch": 0.4256, + "grad_norm": 0.4617688049197154, + "learning_rate": 0.00012862513676307008, + "loss": 0.7818, + "step": 798 + }, + { + "epoch": 0.4261333333333333, + "grad_norm": 0.37537774354945436, + "learning_rate": 0.0001284595203261965, + "loss": 0.6879, + "step": 799 + }, + { + "epoch": 0.4266666666666667, + "grad_norm": 0.4054147302983524, + "learning_rate": 0.00012829381890487536, + "loss": 0.6908, + "step": 800 + }, + { + "epoch": 0.4272, + "grad_norm": 0.41882507548900794, + "learning_rate": 0.00012812803299391628, + "loss": 0.717, + "step": 801 + }, + { + "epoch": 0.42773333333333335, + "grad_norm": 0.432655112010318, + "learning_rate": 0.00012796216308838117, + "loss": 0.6868, + "step": 802 + }, + { + "epoch": 0.4282666666666667, + "grad_norm": 0.389907435925452, + "learning_rate": 0.00012779620968358273, + "loss": 0.7122, + "step": 803 + }, + { + "epoch": 0.4288, + "grad_norm": 0.3816531957696866, + "learning_rate": 0.00012763017327508305, + "loss": 0.6609, + "step": 804 + }, + { + "epoch": 0.42933333333333334, + "grad_norm": 0.3702862962953432, + "learning_rate": 0.00012746405435869198, + "loss": 0.6818, + "step": 805 + }, + { + "epoch": 0.4298666666666667, + "grad_norm": 0.40667358788801045, + "learning_rate": 0.00012729785343046588, + "loss": 0.6645, + "step": 806 + }, + { + "epoch": 0.4304, + "grad_norm": 0.3784239646653406, + "learning_rate": 0.0001271315709867059, + "loss": 0.6443, + "step": 807 + }, + { + "epoch": 0.43093333333333333, + "grad_norm": 0.49086708606364227, + "learning_rate": 0.00012696520752395672, + "loss": 0.7911, + "step": 808 + }, + { + "epoch": 0.43146666666666667, + "grad_norm": 0.3961628703192048, + "learning_rate": 0.00012679876353900482, + "loss": 0.6759, + "step": 809 + }, + { + "epoch": 0.432, + "grad_norm": 0.4463250420691135, + "learning_rate": 0.00012663223952887723, + "loss": 0.7735, + "step": 810 + }, + { + "epoch": 0.4325333333333333, + "grad_norm": 0.4686810913140729, + "learning_rate": 0.00012646563599083996, + "loss": 0.7573, + "step": 811 + }, + { + "epoch": 0.43306666666666666, + "grad_norm": 0.40909977652780205, + "learning_rate": 0.00012629895342239643, + "loss": 0.7469, + "step": 812 + }, + { + "epoch": 0.4336, + "grad_norm": 0.3727309167141433, + "learning_rate": 0.00012613219232128608, + "loss": 0.6745, + "step": 813 + }, + { + "epoch": 0.4341333333333333, + "grad_norm": 0.38008510283465036, + "learning_rate": 0.00012596535318548289, + "loss": 0.658, + "step": 814 + }, + { + "epoch": 0.43466666666666665, + "grad_norm": 0.4033939911249178, + "learning_rate": 0.0001257984365131938, + "loss": 0.6661, + "step": 815 + }, + { + "epoch": 0.4352, + "grad_norm": 0.3718141592194589, + "learning_rate": 0.00012563144280285741, + "loss": 0.6621, + "step": 816 + }, + { + "epoch": 0.4357333333333333, + "grad_norm": 0.41062395716625416, + "learning_rate": 0.00012546437255314222, + "loss": 0.694, + "step": 817 + }, + { + "epoch": 0.4362666666666667, + "grad_norm": 0.3862113919135583, + "learning_rate": 0.0001252972262629454, + "loss": 0.7508, + "step": 818 + }, + { + "epoch": 0.4368, + "grad_norm": 0.4045790094090187, + "learning_rate": 0.00012513000443139112, + "loss": 0.7028, + "step": 819 + }, + { + "epoch": 0.43733333333333335, + "grad_norm": 0.38043534595271034, + "learning_rate": 0.00012496270755782914, + "loss": 0.6517, + "step": 820 + }, + { + "epoch": 0.4378666666666667, + "grad_norm": 0.42898380122884394, + "learning_rate": 0.00012479533614183334, + "loss": 0.765, + "step": 821 + }, + { + "epoch": 0.4384, + "grad_norm": 0.4195425785194382, + "learning_rate": 0.00012462789068320017, + "loss": 0.7503, + "step": 822 + }, + { + "epoch": 0.43893333333333334, + "grad_norm": 0.3797934256521421, + "learning_rate": 0.00012446037168194714, + "loss": 0.7062, + "step": 823 + }, + { + "epoch": 0.43946666666666667, + "grad_norm": 0.4054779301842965, + "learning_rate": 0.00012429277963831148, + "loss": 0.7178, + "step": 824 + }, + { + "epoch": 0.44, + "grad_norm": 0.386901638774086, + "learning_rate": 0.00012412511505274844, + "loss": 0.6683, + "step": 825 + }, + { + "epoch": 0.44053333333333333, + "grad_norm": 0.36400702988521083, + "learning_rate": 0.00012395737842592995, + "loss": 0.6829, + "step": 826 + }, + { + "epoch": 0.44106666666666666, + "grad_norm": 0.40184726566934065, + "learning_rate": 0.000123789570258743, + "loss": 0.726, + "step": 827 + }, + { + "epoch": 0.4416, + "grad_norm": 0.39365513267463004, + "learning_rate": 0.00012362169105228826, + "loss": 0.676, + "step": 828 + }, + { + "epoch": 0.4421333333333333, + "grad_norm": 0.39919769902042423, + "learning_rate": 0.00012345374130787854, + "loss": 0.7201, + "step": 829 + }, + { + "epoch": 0.44266666666666665, + "grad_norm": 0.3665210804076896, + "learning_rate": 0.00012328572152703725, + "loss": 0.6659, + "step": 830 + }, + { + "epoch": 0.4432, + "grad_norm": 0.44196580313984124, + "learning_rate": 0.000123117632211497, + "loss": 0.7519, + "step": 831 + }, + { + "epoch": 0.4437333333333333, + "grad_norm": 0.4614342280148402, + "learning_rate": 0.00012294947386319794, + "loss": 0.7913, + "step": 832 + }, + { + "epoch": 0.44426666666666664, + "grad_norm": 0.3685609530040209, + "learning_rate": 0.0001227812469842864, + "loss": 0.702, + "step": 833 + }, + { + "epoch": 0.4448, + "grad_norm": 0.5376246603998738, + "learning_rate": 0.00012261295207711346, + "loss": 0.7592, + "step": 834 + }, + { + "epoch": 0.44533333333333336, + "grad_norm": 0.43015997174916043, + "learning_rate": 0.00012244458964423327, + "loss": 0.7116, + "step": 835 + }, + { + "epoch": 0.4458666666666667, + "grad_norm": 0.45254388676147145, + "learning_rate": 0.00012227616018840154, + "loss": 0.7703, + "step": 836 + }, + { + "epoch": 0.4464, + "grad_norm": 0.4368478152575418, + "learning_rate": 0.0001221076642125742, + "loss": 0.712, + "step": 837 + }, + { + "epoch": 0.44693333333333335, + "grad_norm": 0.40621015655682485, + "learning_rate": 0.00012193910221990581, + "loss": 0.7853, + "step": 838 + }, + { + "epoch": 0.4474666666666667, + "grad_norm": 0.45605323048807755, + "learning_rate": 0.00012177047471374807, + "loss": 0.7578, + "step": 839 + }, + { + "epoch": 0.448, + "grad_norm": 0.4152530436713154, + "learning_rate": 0.00012160178219764837, + "loss": 0.7036, + "step": 840 + }, + { + "epoch": 0.44853333333333334, + "grad_norm": 0.4014292071555056, + "learning_rate": 0.0001214330251753481, + "loss": 0.7025, + "step": 841 + }, + { + "epoch": 0.44906666666666667, + "grad_norm": 0.36497959792830553, + "learning_rate": 0.00012126420415078132, + "loss": 0.6592, + "step": 842 + }, + { + "epoch": 0.4496, + "grad_norm": 0.3880987257987707, + "learning_rate": 0.00012109531962807332, + "loss": 0.7066, + "step": 843 + }, + { + "epoch": 0.45013333333333333, + "grad_norm": 0.4352812133149904, + "learning_rate": 0.00012092637211153885, + "loss": 0.6807, + "step": 844 + }, + { + "epoch": 0.45066666666666666, + "grad_norm": 0.39263478153811443, + "learning_rate": 0.0001207573621056809, + "loss": 0.6943, + "step": 845 + }, + { + "epoch": 0.4512, + "grad_norm": 0.367181541698975, + "learning_rate": 0.00012058829011518896, + "loss": 0.7061, + "step": 846 + }, + { + "epoch": 0.4517333333333333, + "grad_norm": 0.441297066790881, + "learning_rate": 0.00012041915664493761, + "loss": 0.7277, + "step": 847 + }, + { + "epoch": 0.45226666666666665, + "grad_norm": 0.464626225780676, + "learning_rate": 0.00012024996219998517, + "loss": 0.7754, + "step": 848 + }, + { + "epoch": 0.4528, + "grad_norm": 0.3887689717509452, + "learning_rate": 0.00012008070728557186, + "loss": 0.6797, + "step": 849 + }, + { + "epoch": 0.4533333333333333, + "grad_norm": 0.38715143855728307, + "learning_rate": 0.00011991139240711857, + "loss": 0.7329, + "step": 850 + }, + { + "epoch": 0.45386666666666664, + "grad_norm": 0.37379709303278474, + "learning_rate": 0.00011974201807022525, + "loss": 0.6773, + "step": 851 + }, + { + "epoch": 0.4544, + "grad_norm": 0.4160027753940018, + "learning_rate": 0.00011957258478066931, + "loss": 0.7129, + "step": 852 + }, + { + "epoch": 0.45493333333333336, + "grad_norm": 0.3758872943401131, + "learning_rate": 0.00011940309304440433, + "loss": 0.6992, + "step": 853 + }, + { + "epoch": 0.4554666666666667, + "grad_norm": 0.4019839763291867, + "learning_rate": 0.00011923354336755835, + "loss": 0.6855, + "step": 854 + }, + { + "epoch": 0.456, + "grad_norm": 0.5039043350205292, + "learning_rate": 0.00011906393625643244, + "loss": 0.659, + "step": 855 + }, + { + "epoch": 0.45653333333333335, + "grad_norm": 0.42677069004798573, + "learning_rate": 0.00011889427221749916, + "loss": 0.7523, + "step": 856 + }, + { + "epoch": 0.4570666666666667, + "grad_norm": 0.35839391585223196, + "learning_rate": 0.00011872455175740112, + "loss": 0.6313, + "step": 857 + }, + { + "epoch": 0.4576, + "grad_norm": 0.45502376077565315, + "learning_rate": 0.00011855477538294935, + "loss": 0.6875, + "step": 858 + }, + { + "epoch": 0.45813333333333334, + "grad_norm": 0.4431890444060972, + "learning_rate": 0.00011838494360112185, + "loss": 0.7041, + "step": 859 + }, + { + "epoch": 0.45866666666666667, + "grad_norm": 0.37562137625609004, + "learning_rate": 0.00011821505691906216, + "loss": 0.6908, + "step": 860 + }, + { + "epoch": 0.4592, + "grad_norm": 0.3817588509796904, + "learning_rate": 0.00011804511584407763, + "loss": 0.6974, + "step": 861 + }, + { + "epoch": 0.4597333333333333, + "grad_norm": 0.45904913572049466, + "learning_rate": 0.00011787512088363817, + "loss": 0.8008, + "step": 862 + }, + { + "epoch": 0.46026666666666666, + "grad_norm": 0.41227840756214557, + "learning_rate": 0.00011770507254537453, + "loss": 0.7239, + "step": 863 + }, + { + "epoch": 0.4608, + "grad_norm": 0.4055746575625792, + "learning_rate": 0.00011753497133707679, + "loss": 0.7047, + "step": 864 + }, + { + "epoch": 0.4613333333333333, + "grad_norm": 0.4056965741181116, + "learning_rate": 0.00011736481776669306, + "loss": 0.6877, + "step": 865 + }, + { + "epoch": 0.46186666666666665, + "grad_norm": 0.4386878658210948, + "learning_rate": 0.00011719461234232764, + "loss": 0.7164, + "step": 866 + }, + { + "epoch": 0.4624, + "grad_norm": 0.5597734463574393, + "learning_rate": 0.00011702435557223987, + "loss": 0.8012, + "step": 867 + }, + { + "epoch": 0.4629333333333333, + "grad_norm": 0.41627029095711515, + "learning_rate": 0.00011685404796484225, + "loss": 0.7141, + "step": 868 + }, + { + "epoch": 0.4634666666666667, + "grad_norm": 0.394981353695425, + "learning_rate": 0.00011668369002869912, + "loss": 0.7664, + "step": 869 + }, + { + "epoch": 0.464, + "grad_norm": 0.4845057923121737, + "learning_rate": 0.00011651328227252517, + "loss": 0.7589, + "step": 870 + }, + { + "epoch": 0.46453333333333335, + "grad_norm": 0.42184021473800426, + "learning_rate": 0.00011634282520518383, + "loss": 0.6862, + "step": 871 + }, + { + "epoch": 0.4650666666666667, + "grad_norm": 0.3685228332680087, + "learning_rate": 0.00011617231933568578, + "loss": 0.6971, + "step": 872 + }, + { + "epoch": 0.4656, + "grad_norm": 0.48125970533061785, + "learning_rate": 0.00011600176517318741, + "loss": 0.7705, + "step": 873 + }, + { + "epoch": 0.46613333333333334, + "grad_norm": 0.38603224204001146, + "learning_rate": 0.00011583116322698935, + "loss": 0.6801, + "step": 874 + }, + { + "epoch": 0.4666666666666667, + "grad_norm": 0.40031986426280525, + "learning_rate": 0.00011566051400653486, + "loss": 0.6784, + "step": 875 + }, + { + "epoch": 0.4672, + "grad_norm": 0.382980299985652, + "learning_rate": 0.00011548981802140848, + "loss": 0.6637, + "step": 876 + }, + { + "epoch": 0.46773333333333333, + "grad_norm": 0.37255439825452225, + "learning_rate": 0.00011531907578133429, + "loss": 0.6463, + "step": 877 + }, + { + "epoch": 0.46826666666666666, + "grad_norm": 0.38972893339230685, + "learning_rate": 0.00011514828779617459, + "loss": 0.6787, + "step": 878 + }, + { + "epoch": 0.4688, + "grad_norm": 0.3879058499146623, + "learning_rate": 0.00011497745457592816, + "loss": 0.6504, + "step": 879 + }, + { + "epoch": 0.4693333333333333, + "grad_norm": 0.36745829887480014, + "learning_rate": 0.00011480657663072896, + "loss": 0.648, + "step": 880 + }, + { + "epoch": 0.46986666666666665, + "grad_norm": 0.40196670311819926, + "learning_rate": 0.00011463565447084445, + "loss": 0.6694, + "step": 881 + }, + { + "epoch": 0.4704, + "grad_norm": 0.43880697355150333, + "learning_rate": 0.00011446468860667421, + "loss": 0.6748, + "step": 882 + }, + { + "epoch": 0.4709333333333333, + "grad_norm": 0.367688992998293, + "learning_rate": 0.00011429367954874819, + "loss": 0.6344, + "step": 883 + }, + { + "epoch": 0.47146666666666665, + "grad_norm": 0.4286921521160196, + "learning_rate": 0.0001141226278077254, + "loss": 0.7725, + "step": 884 + }, + { + "epoch": 0.472, + "grad_norm": 0.4276181045914976, + "learning_rate": 0.00011395153389439233, + "loss": 0.7045, + "step": 885 + }, + { + "epoch": 0.47253333333333336, + "grad_norm": 0.4602914646784364, + "learning_rate": 0.00011378039831966134, + "loss": 0.7106, + "step": 886 + }, + { + "epoch": 0.4730666666666667, + "grad_norm": 0.43716536637456466, + "learning_rate": 0.00011360922159456928, + "loss": 0.7746, + "step": 887 + }, + { + "epoch": 0.4736, + "grad_norm": 0.3626625983037319, + "learning_rate": 0.00011343800423027582, + "loss": 0.6579, + "step": 888 + }, + { + "epoch": 0.47413333333333335, + "grad_norm": 0.46860471157021877, + "learning_rate": 0.00011326674673806195, + "loss": 0.764, + "step": 889 + }, + { + "epoch": 0.4746666666666667, + "grad_norm": 0.41045660210116464, + "learning_rate": 0.00011309544962932862, + "loss": 0.731, + "step": 890 + }, + { + "epoch": 0.4752, + "grad_norm": 0.3752662286080647, + "learning_rate": 0.0001129241134155949, + "loss": 0.6549, + "step": 891 + }, + { + "epoch": 0.47573333333333334, + "grad_norm": 0.4224597961900508, + "learning_rate": 0.00011275273860849684, + "loss": 0.775, + "step": 892 + }, + { + "epoch": 0.47626666666666667, + "grad_norm": 0.4024833252997957, + "learning_rate": 0.00011258132571978555, + "loss": 0.6596, + "step": 893 + }, + { + "epoch": 0.4768, + "grad_norm": 0.37603725859632336, + "learning_rate": 0.00011240987526132594, + "loss": 0.6945, + "step": 894 + }, + { + "epoch": 0.47733333333333333, + "grad_norm": 0.3687870047601706, + "learning_rate": 0.00011223838774509514, + "loss": 0.6411, + "step": 895 + }, + { + "epoch": 0.47786666666666666, + "grad_norm": 0.3932754469285646, + "learning_rate": 0.00011206686368318086, + "loss": 0.6972, + "step": 896 + }, + { + "epoch": 0.4784, + "grad_norm": 0.330723604928933, + "learning_rate": 0.00011189530358778005, + "loss": 0.6111, + "step": 897 + }, + { + "epoch": 0.4789333333333333, + "grad_norm": 0.3881987511650391, + "learning_rate": 0.00011172370797119712, + "loss": 0.6641, + "step": 898 + }, + { + "epoch": 0.47946666666666665, + "grad_norm": 0.37490505625431775, + "learning_rate": 0.00011155207734584263, + "loss": 0.6733, + "step": 899 + }, + { + "epoch": 0.48, + "grad_norm": 0.3797740850467834, + "learning_rate": 0.00011138041222423177, + "loss": 0.684, + "step": 900 + }, + { + "epoch": 0.4805333333333333, + "grad_norm": 0.37245630143840797, + "learning_rate": 0.00011120871311898254, + "loss": 0.6783, + "step": 901 + }, + { + "epoch": 0.48106666666666664, + "grad_norm": 0.4368289721232358, + "learning_rate": 0.0001110369805428146, + "loss": 0.8057, + "step": 902 + }, + { + "epoch": 0.4816, + "grad_norm": 0.37089523762776133, + "learning_rate": 0.00011086521500854745, + "loss": 0.6965, + "step": 903 + }, + { + "epoch": 0.48213333333333336, + "grad_norm": 0.3837018215606554, + "learning_rate": 0.0001106934170290991, + "loss": 0.6918, + "step": 904 + }, + { + "epoch": 0.4826666666666667, + "grad_norm": 0.4107940671971852, + "learning_rate": 0.00011052158711748434, + "loss": 0.7647, + "step": 905 + }, + { + "epoch": 0.4832, + "grad_norm": 0.38062943148129663, + "learning_rate": 0.00011034972578681338, + "loss": 0.6153, + "step": 906 + }, + { + "epoch": 0.48373333333333335, + "grad_norm": 0.347440286463666, + "learning_rate": 0.00011017783355029026, + "loss": 0.6966, + "step": 907 + }, + { + "epoch": 0.4842666666666667, + "grad_norm": 0.4138494786190947, + "learning_rate": 0.00011000591092121127, + "loss": 0.6617, + "step": 908 + }, + { + "epoch": 0.4848, + "grad_norm": 0.435120628912503, + "learning_rate": 0.00010983395841296348, + "loss": 0.6947, + "step": 909 + }, + { + "epoch": 0.48533333333333334, + "grad_norm": 0.5156696038671611, + "learning_rate": 0.0001096619765390232, + "loss": 0.8521, + "step": 910 + }, + { + "epoch": 0.48586666666666667, + "grad_norm": 0.40002449111770144, + "learning_rate": 0.00010948996581295436, + "loss": 0.7086, + "step": 911 + }, + { + "epoch": 0.4864, + "grad_norm": 0.4584453650081408, + "learning_rate": 0.00010931792674840718, + "loss": 0.7203, + "step": 912 + }, + { + "epoch": 0.48693333333333333, + "grad_norm": 0.42971470634919057, + "learning_rate": 0.00010914585985911632, + "loss": 0.6927, + "step": 913 + }, + { + "epoch": 0.48746666666666666, + "grad_norm": 0.46160825121281207, + "learning_rate": 0.00010897376565889971, + "loss": 0.753, + "step": 914 + }, + { + "epoch": 0.488, + "grad_norm": 0.4484481414251196, + "learning_rate": 0.00010880164466165674, + "loss": 0.7451, + "step": 915 + }, + { + "epoch": 0.4885333333333333, + "grad_norm": 0.353331022440045, + "learning_rate": 0.00010862949738136681, + "loss": 0.613, + "step": 916 + }, + { + "epoch": 0.48906666666666665, + "grad_norm": 0.39814982162243234, + "learning_rate": 0.00010845732433208779, + "loss": 0.6719, + "step": 917 + }, + { + "epoch": 0.4896, + "grad_norm": 0.3670872104876671, + "learning_rate": 0.00010828512602795462, + "loss": 0.6711, + "step": 918 + }, + { + "epoch": 0.4901333333333333, + "grad_norm": 0.44024385623542284, + "learning_rate": 0.00010811290298317755, + "loss": 0.7356, + "step": 919 + }, + { + "epoch": 0.49066666666666664, + "grad_norm": 0.41161265951685283, + "learning_rate": 0.00010794065571204072, + "loss": 0.7264, + "step": 920 + }, + { + "epoch": 0.4912, + "grad_norm": 0.3984442796149138, + "learning_rate": 0.00010776838472890065, + "loss": 0.6644, + "step": 921 + }, + { + "epoch": 0.49173333333333336, + "grad_norm": 0.4294764379918491, + "learning_rate": 0.00010759609054818458, + "loss": 0.7437, + "step": 922 + }, + { + "epoch": 0.4922666666666667, + "grad_norm": 0.44704886237002023, + "learning_rate": 0.00010742377368438914, + "loss": 0.7411, + "step": 923 + }, + { + "epoch": 0.4928, + "grad_norm": 0.41290555750302516, + "learning_rate": 0.00010725143465207867, + "loss": 0.7867, + "step": 924 + }, + { + "epoch": 0.49333333333333335, + "grad_norm": 0.3613358302667413, + "learning_rate": 0.00010707907396588361, + "loss": 0.654, + "step": 925 + }, + { + "epoch": 0.4938666666666667, + "grad_norm": 0.361773515899362, + "learning_rate": 0.0001069066921404992, + "loss": 0.7043, + "step": 926 + }, + { + "epoch": 0.4944, + "grad_norm": 0.4082345023649185, + "learning_rate": 0.00010673428969068364, + "loss": 0.7041, + "step": 927 + }, + { + "epoch": 0.49493333333333334, + "grad_norm": 0.4135580682589969, + "learning_rate": 0.00010656186713125689, + "loss": 0.7068, + "step": 928 + }, + { + "epoch": 0.49546666666666667, + "grad_norm": 0.4394394752224285, + "learning_rate": 0.0001063894249770989, + "loss": 0.6995, + "step": 929 + }, + { + "epoch": 0.496, + "grad_norm": 0.4332952825333924, + "learning_rate": 0.00010621696374314807, + "loss": 0.7289, + "step": 930 + }, + { + "epoch": 0.4965333333333333, + "grad_norm": 0.36654579801605636, + "learning_rate": 0.00010604448394439983, + "loss": 0.662, + "step": 931 + }, + { + "epoch": 0.49706666666666666, + "grad_norm": 0.40193997576120905, + "learning_rate": 0.00010587198609590505, + "loss": 0.7351, + "step": 932 + }, + { + "epoch": 0.4976, + "grad_norm": 0.37513486505181953, + "learning_rate": 0.00010569947071276847, + "loss": 0.7507, + "step": 933 + }, + { + "epoch": 0.4981333333333333, + "grad_norm": 0.3961975057784801, + "learning_rate": 0.00010552693831014726, + "loss": 0.6518, + "step": 934 + }, + { + "epoch": 0.49866666666666665, + "grad_norm": 0.4097332705661171, + "learning_rate": 0.0001053543894032493, + "loss": 0.7519, + "step": 935 + }, + { + "epoch": 0.4992, + "grad_norm": 0.39234400713069595, + "learning_rate": 0.00010518182450733186, + "loss": 0.6576, + "step": 936 + }, + { + "epoch": 0.4997333333333333, + "grad_norm": 0.43052107521153943, + "learning_rate": 0.00010500924413769988, + "loss": 0.7585, + "step": 937 + }, + { + "epoch": 0.5002666666666666, + "grad_norm": 0.3936263192518126, + "learning_rate": 0.00010483664880970457, + "loss": 0.6704, + "step": 938 + }, + { + "epoch": 0.5008, + "grad_norm": 0.4657507551578899, + "learning_rate": 0.00010466403903874176, + "loss": 0.6868, + "step": 939 + }, + { + "epoch": 0.5013333333333333, + "grad_norm": 0.5021464715895197, + "learning_rate": 0.00010449141534025045, + "loss": 0.6723, + "step": 940 + }, + { + "epoch": 0.5018666666666667, + "grad_norm": 0.43506222988133986, + "learning_rate": 0.00010431877822971117, + "loss": 0.704, + "step": 941 + }, + { + "epoch": 0.5024, + "grad_norm": 0.4156106779609884, + "learning_rate": 0.00010414612822264455, + "loss": 0.7259, + "step": 942 + }, + { + "epoch": 0.5029333333333333, + "grad_norm": 0.3957962157850304, + "learning_rate": 0.00010397346583460971, + "loss": 0.666, + "step": 943 + }, + { + "epoch": 0.5034666666666666, + "grad_norm": 0.42205766234785436, + "learning_rate": 0.0001038007915812028, + "loss": 0.7411, + "step": 944 + }, + { + "epoch": 0.504, + "grad_norm": 0.3900808669526344, + "learning_rate": 0.00010362810597805526, + "loss": 0.6336, + "step": 945 + }, + { + "epoch": 0.5045333333333333, + "grad_norm": 0.4076171734832918, + "learning_rate": 0.0001034554095408326, + "loss": 0.7344, + "step": 946 + }, + { + "epoch": 0.5050666666666667, + "grad_norm": 0.3847244306006347, + "learning_rate": 0.00010328270278523256, + "loss": 0.6885, + "step": 947 + }, + { + "epoch": 0.5056, + "grad_norm": 0.40880811223071983, + "learning_rate": 0.0001031099862269837, + "loss": 0.6969, + "step": 948 + }, + { + "epoch": 0.5061333333333333, + "grad_norm": 0.42801078617179544, + "learning_rate": 0.00010293726038184393, + "loss": 0.7446, + "step": 949 + }, + { + "epoch": 0.5066666666666667, + "grad_norm": 0.48044160927138174, + "learning_rate": 0.00010276452576559879, + "loss": 0.6577, + "step": 950 + }, + { + "epoch": 0.5072, + "grad_norm": 0.4257807920668605, + "learning_rate": 0.00010259178289406011, + "loss": 0.7137, + "step": 951 + }, + { + "epoch": 0.5077333333333334, + "grad_norm": 0.40497149124415505, + "learning_rate": 0.00010241903228306431, + "loss": 0.6872, + "step": 952 + }, + { + "epoch": 0.5082666666666666, + "grad_norm": 0.4335781038181838, + "learning_rate": 0.0001022462744484709, + "loss": 0.6696, + "step": 953 + }, + { + "epoch": 0.5088, + "grad_norm": 0.3985487513226291, + "learning_rate": 0.00010207350990616107, + "loss": 0.6448, + "step": 954 + }, + { + "epoch": 0.5093333333333333, + "grad_norm": 0.4269506200122535, + "learning_rate": 0.00010190073917203589, + "loss": 0.754, + "step": 955 + }, + { + "epoch": 0.5098666666666667, + "grad_norm": 0.4301253765909779, + "learning_rate": 0.00010172796276201503, + "loss": 0.708, + "step": 956 + }, + { + "epoch": 0.5104, + "grad_norm": 0.3930380897648193, + "learning_rate": 0.0001015551811920351, + "loss": 0.6582, + "step": 957 + }, + { + "epoch": 0.5109333333333334, + "grad_norm": 0.41425780092834197, + "learning_rate": 0.00010138239497804804, + "loss": 0.7319, + "step": 958 + }, + { + "epoch": 0.5114666666666666, + "grad_norm": 0.4496872681858521, + "learning_rate": 0.00010120960463601976, + "loss": 0.7825, + "step": 959 + }, + { + "epoch": 0.512, + "grad_norm": 0.429109992059253, + "learning_rate": 0.00010103681068192845, + "loss": 0.7563, + "step": 960 + }, + { + "epoch": 0.5125333333333333, + "grad_norm": 0.3533786886319857, + "learning_rate": 0.00010086401363176305, + "loss": 0.6602, + "step": 961 + }, + { + "epoch": 0.5130666666666667, + "grad_norm": 0.36720268355982055, + "learning_rate": 0.00010069121400152181, + "loss": 0.6362, + "step": 962 + }, + { + "epoch": 0.5136, + "grad_norm": 0.37055946076736773, + "learning_rate": 0.00010051841230721065, + "loss": 0.7085, + "step": 963 + }, + { + "epoch": 0.5141333333333333, + "grad_norm": 0.38241199146178695, + "learning_rate": 0.0001003456090648416, + "loss": 0.6914, + "step": 964 + }, + { + "epoch": 0.5146666666666667, + "grad_norm": 0.38049684847414167, + "learning_rate": 0.00010017280479043147, + "loss": 0.7164, + "step": 965 + }, + { + "epoch": 0.5152, + "grad_norm": 0.36780014919222614, + "learning_rate": 0.0001, + "loss": 0.6807, + "step": 966 + }, + { + "epoch": 0.5157333333333334, + "grad_norm": 0.4176219481881425, + "learning_rate": 9.982719520956855e-05, + "loss": 0.6884, + "step": 967 + }, + { + "epoch": 0.5162666666666667, + "grad_norm": 0.43880841323776304, + "learning_rate": 9.965439093515841e-05, + "loss": 0.7612, + "step": 968 + }, + { + "epoch": 0.5168, + "grad_norm": 0.4083242613195229, + "learning_rate": 9.948158769278939e-05, + "loss": 0.6761, + "step": 969 + }, + { + "epoch": 0.5173333333333333, + "grad_norm": 0.4035241252326839, + "learning_rate": 9.930878599847821e-05, + "loss": 0.6413, + "step": 970 + }, + { + "epoch": 0.5178666666666667, + "grad_norm": 0.4202181352789713, + "learning_rate": 9.913598636823693e-05, + "loss": 0.7091, + "step": 971 + }, + { + "epoch": 0.5184, + "grad_norm": 0.410067103211802, + "learning_rate": 9.896318931807155e-05, + "loss": 0.6901, + "step": 972 + }, + { + "epoch": 0.5189333333333334, + "grad_norm": 0.4316666564424048, + "learning_rate": 9.879039536398024e-05, + "loss": 0.7605, + "step": 973 + }, + { + "epoch": 0.5194666666666666, + "grad_norm": 0.5172372427864208, + "learning_rate": 9.861760502195197e-05, + "loss": 0.7423, + "step": 974 + }, + { + "epoch": 0.52, + "grad_norm": 0.3822087471114855, + "learning_rate": 9.844481880796491e-05, + "loss": 0.6793, + "step": 975 + }, + { + "epoch": 0.5205333333333333, + "grad_norm": 0.4211041483634554, + "learning_rate": 9.827203723798498e-05, + "loss": 0.699, + "step": 976 + }, + { + "epoch": 0.5210666666666667, + "grad_norm": 0.34983710521200895, + "learning_rate": 9.809926082796415e-05, + "loss": 0.6348, + "step": 977 + }, + { + "epoch": 0.5216, + "grad_norm": 0.3910577540068633, + "learning_rate": 9.792649009383899e-05, + "loss": 0.7243, + "step": 978 + }, + { + "epoch": 0.5221333333333333, + "grad_norm": 0.3960657703459683, + "learning_rate": 9.775372555152912e-05, + "loss": 0.7049, + "step": 979 + }, + { + "epoch": 0.5226666666666666, + "grad_norm": 0.40337191060802974, + "learning_rate": 9.758096771693573e-05, + "loss": 0.6345, + "step": 980 + }, + { + "epoch": 0.5232, + "grad_norm": 0.3703239762196236, + "learning_rate": 9.740821710593989e-05, + "loss": 0.6558, + "step": 981 + }, + { + "epoch": 0.5237333333333334, + "grad_norm": 0.3634029551035167, + "learning_rate": 9.723547423440122e-05, + "loss": 0.6327, + "step": 982 + }, + { + "epoch": 0.5242666666666667, + "grad_norm": 0.48215601756608845, + "learning_rate": 9.70627396181561e-05, + "loss": 0.745, + "step": 983 + }, + { + "epoch": 0.5248, + "grad_norm": 0.3638409558633442, + "learning_rate": 9.689001377301633e-05, + "loss": 0.6507, + "step": 984 + }, + { + "epoch": 0.5253333333333333, + "grad_norm": 0.41204323574299007, + "learning_rate": 9.671729721476746e-05, + "loss": 0.7287, + "step": 985 + }, + { + "epoch": 0.5258666666666667, + "grad_norm": 0.3852887012348035, + "learning_rate": 9.654459045916743e-05, + "loss": 0.7447, + "step": 986 + }, + { + "epoch": 0.5264, + "grad_norm": 0.3841897472928867, + "learning_rate": 9.637189402194476e-05, + "loss": 0.684, + "step": 987 + }, + { + "epoch": 0.5269333333333334, + "grad_norm": 0.40780006190278706, + "learning_rate": 9.619920841879725e-05, + "loss": 0.7232, + "step": 988 + }, + { + "epoch": 0.5274666666666666, + "grad_norm": 0.39603259505315236, + "learning_rate": 9.602653416539031e-05, + "loss": 0.6245, + "step": 989 + }, + { + "epoch": 0.528, + "grad_norm": 0.45406690853388815, + "learning_rate": 9.585387177735547e-05, + "loss": 0.78, + "step": 990 + }, + { + "epoch": 0.5285333333333333, + "grad_norm": 0.38904989956359265, + "learning_rate": 9.568122177028884e-05, + "loss": 0.7168, + "step": 991 + }, + { + "epoch": 0.5290666666666667, + "grad_norm": 0.4325600684124193, + "learning_rate": 9.550858465974958e-05, + "loss": 0.6561, + "step": 992 + }, + { + "epoch": 0.5296, + "grad_norm": 0.3830034699641413, + "learning_rate": 9.533596096125825e-05, + "loss": 0.7008, + "step": 993 + }, + { + "epoch": 0.5301333333333333, + "grad_norm": 0.37968275486463826, + "learning_rate": 9.516335119029546e-05, + "loss": 0.6589, + "step": 994 + }, + { + "epoch": 0.5306666666666666, + "grad_norm": 0.4138078396366489, + "learning_rate": 9.499075586230013e-05, + "loss": 0.7354, + "step": 995 + }, + { + "epoch": 0.5312, + "grad_norm": 0.3678922571932153, + "learning_rate": 9.481817549266817e-05, + "loss": 0.7075, + "step": 996 + }, + { + "epoch": 0.5317333333333333, + "grad_norm": 0.3394208426931613, + "learning_rate": 9.464561059675073e-05, + "loss": 0.6508, + "step": 997 + }, + { + "epoch": 0.5322666666666667, + "grad_norm": 0.4532313710316155, + "learning_rate": 9.44730616898528e-05, + "loss": 0.6466, + "step": 998 + }, + { + "epoch": 0.5328, + "grad_norm": 0.3531050111256949, + "learning_rate": 9.430052928723153e-05, + "loss": 0.6072, + "step": 999 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 0.4473101196705385, + "learning_rate": 9.412801390409497e-05, + "loss": 0.7153, + "step": 1000 + }, + { + "epoch": 0.5338666666666667, + "grad_norm": 0.4056044330445051, + "learning_rate": 9.395551605560018e-05, + "loss": 0.6826, + "step": 1001 + }, + { + "epoch": 0.5344, + "grad_norm": 0.4204381386331875, + "learning_rate": 9.378303625685195e-05, + "loss": 0.6903, + "step": 1002 + }, + { + "epoch": 0.5349333333333334, + "grad_norm": 0.390453552431553, + "learning_rate": 9.361057502290113e-05, + "loss": 0.6792, + "step": 1003 + }, + { + "epoch": 0.5354666666666666, + "grad_norm": 0.3792222651183991, + "learning_rate": 9.343813286874312e-05, + "loss": 0.6492, + "step": 1004 + }, + { + "epoch": 0.536, + "grad_norm": 0.415249245822204, + "learning_rate": 9.326571030931637e-05, + "loss": 0.7074, + "step": 1005 + }, + { + "epoch": 0.5365333333333333, + "grad_norm": 0.37759188533924803, + "learning_rate": 9.309330785950086e-05, + "loss": 0.605, + "step": 1006 + }, + { + "epoch": 0.5370666666666667, + "grad_norm": 0.42878386883350156, + "learning_rate": 9.292092603411641e-05, + "loss": 0.6881, + "step": 1007 + }, + { + "epoch": 0.5376, + "grad_norm": 0.4605947532728995, + "learning_rate": 9.274856534792138e-05, + "loss": 0.7806, + "step": 1008 + }, + { + "epoch": 0.5381333333333334, + "grad_norm": 0.4243274933456629, + "learning_rate": 9.257622631561085e-05, + "loss": 0.7261, + "step": 1009 + }, + { + "epoch": 0.5386666666666666, + "grad_norm": 0.4216238590088995, + "learning_rate": 9.240390945181543e-05, + "loss": 0.6769, + "step": 1010 + }, + { + "epoch": 0.5392, + "grad_norm": 0.3792720960569066, + "learning_rate": 9.223161527109937e-05, + "loss": 0.6353, + "step": 1011 + }, + { + "epoch": 0.5397333333333333, + "grad_norm": 0.41176723371050467, + "learning_rate": 9.205934428795929e-05, + "loss": 0.7357, + "step": 1012 + }, + { + "epoch": 0.5402666666666667, + "grad_norm": 0.3627687163634997, + "learning_rate": 9.188709701682247e-05, + "loss": 0.6817, + "step": 1013 + }, + { + "epoch": 0.5408, + "grad_norm": 0.47965484245707873, + "learning_rate": 9.171487397204539e-05, + "loss": 0.7677, + "step": 1014 + }, + { + "epoch": 0.5413333333333333, + "grad_norm": 0.39258008589673843, + "learning_rate": 9.154267566791223e-05, + "loss": 0.7012, + "step": 1015 + }, + { + "epoch": 0.5418666666666667, + "grad_norm": 0.35999804522968204, + "learning_rate": 9.137050261863324e-05, + "loss": 0.7255, + "step": 1016 + }, + { + "epoch": 0.5424, + "grad_norm": 0.3982795020365924, + "learning_rate": 9.119835533834331e-05, + "loss": 0.6431, + "step": 1017 + }, + { + "epoch": 0.5429333333333334, + "grad_norm": 0.40106859970171016, + "learning_rate": 9.102623434110028e-05, + "loss": 0.7, + "step": 1018 + }, + { + "epoch": 0.5434666666666667, + "grad_norm": 0.42965692134016903, + "learning_rate": 9.085414014088369e-05, + "loss": 0.7035, + "step": 1019 + }, + { + "epoch": 0.544, + "grad_norm": 0.43705161139558246, + "learning_rate": 9.068207325159284e-05, + "loss": 0.7427, + "step": 1020 + }, + { + "epoch": 0.5445333333333333, + "grad_norm": 0.3918766943083777, + "learning_rate": 9.051003418704565e-05, + "loss": 0.6308, + "step": 1021 + }, + { + "epoch": 0.5450666666666667, + "grad_norm": 0.3985622780590428, + "learning_rate": 9.033802346097682e-05, + "loss": 0.6845, + "step": 1022 + }, + { + "epoch": 0.5456, + "grad_norm": 0.42672293533483363, + "learning_rate": 9.016604158703654e-05, + "loss": 0.6507, + "step": 1023 + }, + { + "epoch": 0.5461333333333334, + "grad_norm": 0.4143702320792897, + "learning_rate": 8.999408907878877e-05, + "loss": 0.6889, + "step": 1024 + }, + { + "epoch": 0.5466666666666666, + "grad_norm": 0.471380879969022, + "learning_rate": 8.982216644970979e-05, + "loss": 0.7823, + "step": 1025 + }, + { + "epoch": 0.5472, + "grad_norm": 0.46480075498108553, + "learning_rate": 8.965027421318665e-05, + "loss": 0.7272, + "step": 1026 + }, + { + "epoch": 0.5477333333333333, + "grad_norm": 0.36976265158301574, + "learning_rate": 8.947841288251568e-05, + "loss": 0.7031, + "step": 1027 + }, + { + "epoch": 0.5482666666666667, + "grad_norm": 0.42436037298433815, + "learning_rate": 8.930658297090091e-05, + "loss": 0.7187, + "step": 1028 + }, + { + "epoch": 0.5488, + "grad_norm": 0.41953791874187923, + "learning_rate": 8.913478499145254e-05, + "loss": 0.741, + "step": 1029 + }, + { + "epoch": 0.5493333333333333, + "grad_norm": 0.41981610603411884, + "learning_rate": 8.896301945718541e-05, + "loss": 0.7184, + "step": 1030 + }, + { + "epoch": 0.5498666666666666, + "grad_norm": 0.4190252188169704, + "learning_rate": 8.879128688101749e-05, + "loss": 0.7345, + "step": 1031 + }, + { + "epoch": 0.5504, + "grad_norm": 0.4184101905417997, + "learning_rate": 8.861958777576827e-05, + "loss": 0.7642, + "step": 1032 + }, + { + "epoch": 0.5509333333333334, + "grad_norm": 0.3879175502059154, + "learning_rate": 8.844792265415738e-05, + "loss": 0.7347, + "step": 1033 + }, + { + "epoch": 0.5514666666666667, + "grad_norm": 0.32715542633091615, + "learning_rate": 8.827629202880293e-05, + "loss": 0.6334, + "step": 1034 + }, + { + "epoch": 0.552, + "grad_norm": 0.37958261854030473, + "learning_rate": 8.810469641222001e-05, + "loss": 0.6843, + "step": 1035 + }, + { + "epoch": 0.5525333333333333, + "grad_norm": 0.38701966625163015, + "learning_rate": 8.793313631681915e-05, + "loss": 0.7068, + "step": 1036 + }, + { + "epoch": 0.5530666666666667, + "grad_norm": 0.45127483979897365, + "learning_rate": 8.776161225490489e-05, + "loss": 0.7066, + "step": 1037 + }, + { + "epoch": 0.5536, + "grad_norm": 0.4101133580417039, + "learning_rate": 8.759012473867407e-05, + "loss": 0.6577, + "step": 1038 + }, + { + "epoch": 0.5541333333333334, + "grad_norm": 0.3870440706258641, + "learning_rate": 8.741867428021446e-05, + "loss": 0.6597, + "step": 1039 + }, + { + "epoch": 0.5546666666666666, + "grad_norm": 0.4096386184294334, + "learning_rate": 8.724726139150318e-05, + "loss": 0.7066, + "step": 1040 + }, + { + "epoch": 0.5552, + "grad_norm": 0.4397059282577514, + "learning_rate": 8.707588658440511e-05, + "loss": 0.7115, + "step": 1041 + }, + { + "epoch": 0.5557333333333333, + "grad_norm": 0.643330271525409, + "learning_rate": 8.690455037067141e-05, + "loss": 0.6834, + "step": 1042 + }, + { + "epoch": 0.5562666666666667, + "grad_norm": 0.4134643699688854, + "learning_rate": 8.673325326193806e-05, + "loss": 0.7021, + "step": 1043 + }, + { + "epoch": 0.5568, + "grad_norm": 0.4091055590321153, + "learning_rate": 8.656199576972423e-05, + "loss": 0.7012, + "step": 1044 + }, + { + "epoch": 0.5573333333333333, + "grad_norm": 0.3450013466049551, + "learning_rate": 8.639077840543077e-05, + "loss": 0.6151, + "step": 1045 + }, + { + "epoch": 0.5578666666666666, + "grad_norm": 0.47085742331395813, + "learning_rate": 8.621960168033867e-05, + "loss": 0.7699, + "step": 1046 + }, + { + "epoch": 0.5584, + "grad_norm": 0.39515177914325017, + "learning_rate": 8.604846610560771e-05, + "loss": 0.6866, + "step": 1047 + }, + { + "epoch": 0.5589333333333333, + "grad_norm": 0.43479194884347794, + "learning_rate": 8.587737219227462e-05, + "loss": 0.7113, + "step": 1048 + }, + { + "epoch": 0.5594666666666667, + "grad_norm": 0.43860089142479186, + "learning_rate": 8.570632045125185e-05, + "loss": 0.6748, + "step": 1049 + }, + { + "epoch": 0.56, + "grad_norm": 0.327668756532882, + "learning_rate": 8.553531139332582e-05, + "loss": 0.6482, + "step": 1050 + }, + { + "epoch": 0.5605333333333333, + "grad_norm": 0.3769926364366294, + "learning_rate": 8.536434552915556e-05, + "loss": 0.6948, + "step": 1051 + }, + { + "epoch": 0.5610666666666667, + "grad_norm": 0.4222610092944222, + "learning_rate": 8.519342336927105e-05, + "loss": 0.7279, + "step": 1052 + }, + { + "epoch": 0.5616, + "grad_norm": 0.5048788519883153, + "learning_rate": 8.502254542407186e-05, + "loss": 0.7563, + "step": 1053 + }, + { + "epoch": 0.5621333333333334, + "grad_norm": 0.43196567813218695, + "learning_rate": 8.485171220382545e-05, + "loss": 0.6819, + "step": 1054 + }, + { + "epoch": 0.5626666666666666, + "grad_norm": 0.4293025052867687, + "learning_rate": 8.468092421866573e-05, + "loss": 0.712, + "step": 1055 + }, + { + "epoch": 0.5632, + "grad_norm": 0.42894131592218576, + "learning_rate": 8.451018197859153e-05, + "loss": 0.6954, + "step": 1056 + }, + { + "epoch": 0.5637333333333333, + "grad_norm": 0.5022696074419618, + "learning_rate": 8.433948599346516e-05, + "loss": 0.681, + "step": 1057 + }, + { + "epoch": 0.5642666666666667, + "grad_norm": 0.33992536862717254, + "learning_rate": 8.416883677301069e-05, + "loss": 0.6073, + "step": 1058 + }, + { + "epoch": 0.5648, + "grad_norm": 0.3643096903622327, + "learning_rate": 8.399823482681262e-05, + "loss": 0.6004, + "step": 1059 + }, + { + "epoch": 0.5653333333333334, + "grad_norm": 0.4516354617724274, + "learning_rate": 8.382768066431425e-05, + "loss": 0.7386, + "step": 1060 + }, + { + "epoch": 0.5658666666666666, + "grad_norm": 0.4416293943588361, + "learning_rate": 8.36571747948162e-05, + "loss": 0.7443, + "step": 1061 + }, + { + "epoch": 0.5664, + "grad_norm": 0.41531808973420403, + "learning_rate": 8.348671772747487e-05, + "loss": 0.7243, + "step": 1062 + }, + { + "epoch": 0.5669333333333333, + "grad_norm": 0.3935473285416105, + "learning_rate": 8.33163099713009e-05, + "loss": 0.6611, + "step": 1063 + }, + { + "epoch": 0.5674666666666667, + "grad_norm": 0.3474446139720467, + "learning_rate": 8.31459520351578e-05, + "loss": 0.6343, + "step": 1064 + }, + { + "epoch": 0.568, + "grad_norm": 0.430512345727379, + "learning_rate": 8.297564442776014e-05, + "loss": 0.6738, + "step": 1065 + }, + { + "epoch": 0.5685333333333333, + "grad_norm": 0.3641433058460538, + "learning_rate": 8.280538765767235e-05, + "loss": 0.6634, + "step": 1066 + }, + { + "epoch": 0.5690666666666667, + "grad_norm": 0.4136357697412744, + "learning_rate": 8.263518223330697e-05, + "loss": 0.6672, + "step": 1067 + }, + { + "epoch": 0.5696, + "grad_norm": 0.3808396291501075, + "learning_rate": 8.246502866292324e-05, + "loss": 0.6345, + "step": 1068 + }, + { + "epoch": 0.5701333333333334, + "grad_norm": 0.3999041628392743, + "learning_rate": 8.22949274546255e-05, + "loss": 0.6915, + "step": 1069 + }, + { + "epoch": 0.5706666666666667, + "grad_norm": 0.39809173016582916, + "learning_rate": 8.212487911636184e-05, + "loss": 0.6691, + "step": 1070 + }, + { + "epoch": 0.5712, + "grad_norm": 0.46464138468826066, + "learning_rate": 8.195488415592238e-05, + "loss": 0.7021, + "step": 1071 + }, + { + "epoch": 0.5717333333333333, + "grad_norm": 0.4222476443961406, + "learning_rate": 8.178494308093789e-05, + "loss": 0.722, + "step": 1072 + }, + { + "epoch": 0.5722666666666667, + "grad_norm": 0.4231280382809865, + "learning_rate": 8.161505639887817e-05, + "loss": 0.7378, + "step": 1073 + }, + { + "epoch": 0.5728, + "grad_norm": 0.4691286662465898, + "learning_rate": 8.144522461705067e-05, + "loss": 0.71, + "step": 1074 + }, + { + "epoch": 0.5733333333333334, + "grad_norm": 0.41505030023745515, + "learning_rate": 8.127544824259889e-05, + "loss": 0.695, + "step": 1075 + }, + { + "epoch": 0.5738666666666666, + "grad_norm": 0.38762458314861936, + "learning_rate": 8.110572778250085e-05, + "loss": 0.6926, + "step": 1076 + }, + { + "epoch": 0.5744, + "grad_norm": 0.5153169192478391, + "learning_rate": 8.093606374356759e-05, + "loss": 0.6735, + "step": 1077 + }, + { + "epoch": 0.5749333333333333, + "grad_norm": 0.41114335332061647, + "learning_rate": 8.076645663244168e-05, + "loss": 0.6218, + "step": 1078 + }, + { + "epoch": 0.5754666666666667, + "grad_norm": 0.38947490596295675, + "learning_rate": 8.059690695559568e-05, + "loss": 0.6606, + "step": 1079 + }, + { + "epoch": 0.576, + "grad_norm": 0.4379699189436385, + "learning_rate": 8.042741521933071e-05, + "loss": 0.7664, + "step": 1080 + }, + { + "epoch": 0.5765333333333333, + "grad_norm": 0.4361131199878347, + "learning_rate": 8.025798192977481e-05, + "loss": 0.7081, + "step": 1081 + }, + { + "epoch": 0.5770666666666666, + "grad_norm": 0.41313146945672086, + "learning_rate": 8.008860759288147e-05, + "loss": 0.6189, + "step": 1082 + }, + { + "epoch": 0.5776, + "grad_norm": 0.5022765303498697, + "learning_rate": 7.991929271442817e-05, + "loss": 0.7532, + "step": 1083 + }, + { + "epoch": 0.5781333333333334, + "grad_norm": 0.4227556056013937, + "learning_rate": 7.975003780001485e-05, + "loss": 0.687, + "step": 1084 + }, + { + "epoch": 0.5786666666666667, + "grad_norm": 0.4223792352159664, + "learning_rate": 7.958084335506239e-05, + "loss": 0.7141, + "step": 1085 + }, + { + "epoch": 0.5792, + "grad_norm": 0.44012250934714675, + "learning_rate": 7.941170988481108e-05, + "loss": 0.6946, + "step": 1086 + }, + { + "epoch": 0.5797333333333333, + "grad_norm": 0.426144893073359, + "learning_rate": 7.924263789431912e-05, + "loss": 0.6756, + "step": 1087 + }, + { + "epoch": 0.5802666666666667, + "grad_norm": 0.38795705651306933, + "learning_rate": 7.907362788846116e-05, + "loss": 0.6604, + "step": 1088 + }, + { + "epoch": 0.5808, + "grad_norm": 0.36687962675189395, + "learning_rate": 7.89046803719267e-05, + "loss": 0.6262, + "step": 1089 + }, + { + "epoch": 0.5813333333333334, + "grad_norm": 0.4227039580651776, + "learning_rate": 7.873579584921869e-05, + "loss": 0.6616, + "step": 1090 + }, + { + "epoch": 0.5818666666666666, + "grad_norm": 0.39994912985744235, + "learning_rate": 7.856697482465196e-05, + "loss": 0.6565, + "step": 1091 + }, + { + "epoch": 0.5824, + "grad_norm": 0.36900012142350147, + "learning_rate": 7.839821780235168e-05, + "loss": 0.6177, + "step": 1092 + }, + { + "epoch": 0.5829333333333333, + "grad_norm": 0.3751018480152569, + "learning_rate": 7.822952528625191e-05, + "loss": 0.622, + "step": 1093 + }, + { + "epoch": 0.5834666666666667, + "grad_norm": 0.3883694896917854, + "learning_rate": 7.806089778009421e-05, + "loss": 0.6898, + "step": 1094 + }, + { + "epoch": 0.584, + "grad_norm": 0.3619978615033815, + "learning_rate": 7.789233578742582e-05, + "loss": 0.6017, + "step": 1095 + }, + { + "epoch": 0.5845333333333333, + "grad_norm": 0.4168897067771285, + "learning_rate": 7.772383981159849e-05, + "loss": 0.6596, + "step": 1096 + }, + { + "epoch": 0.5850666666666666, + "grad_norm": 0.38026149451722435, + "learning_rate": 7.755541035576677e-05, + "loss": 0.6571, + "step": 1097 + }, + { + "epoch": 0.5856, + "grad_norm": 0.42324690565350187, + "learning_rate": 7.738704792288655e-05, + "loss": 0.741, + "step": 1098 + }, + { + "epoch": 0.5861333333333333, + "grad_norm": 0.38849404207526655, + "learning_rate": 7.721875301571359e-05, + "loss": 0.6486, + "step": 1099 + }, + { + "epoch": 0.5866666666666667, + "grad_norm": 0.44791957872685967, + "learning_rate": 7.705052613680211e-05, + "loss": 0.7033, + "step": 1100 + }, + { + "epoch": 0.5872, + "grad_norm": 0.474403779731019, + "learning_rate": 7.688236778850306e-05, + "loss": 0.769, + "step": 1101 + }, + { + "epoch": 0.5877333333333333, + "grad_norm": 0.38980569878904037, + "learning_rate": 7.671427847296275e-05, + "loss": 0.6718, + "step": 1102 + }, + { + "epoch": 0.5882666666666667, + "grad_norm": 0.39144939025603787, + "learning_rate": 7.654625869212146e-05, + "loss": 0.6537, + "step": 1103 + }, + { + "epoch": 0.5888, + "grad_norm": 0.4139639592906771, + "learning_rate": 7.637830894771175e-05, + "loss": 0.7053, + "step": 1104 + }, + { + "epoch": 0.5893333333333334, + "grad_norm": 0.3813478824967945, + "learning_rate": 7.6210429741257e-05, + "loss": 0.6919, + "step": 1105 + }, + { + "epoch": 0.5898666666666667, + "grad_norm": 0.34470477488748674, + "learning_rate": 7.604262157407007e-05, + "loss": 0.6022, + "step": 1106 + }, + { + "epoch": 0.5904, + "grad_norm": 0.349710254993917, + "learning_rate": 7.587488494725157e-05, + "loss": 0.6017, + "step": 1107 + }, + { + "epoch": 0.5909333333333333, + "grad_norm": 0.526494869740576, + "learning_rate": 7.570722036168854e-05, + "loss": 0.8012, + "step": 1108 + }, + { + "epoch": 0.5914666666666667, + "grad_norm": 0.354436642808636, + "learning_rate": 7.55396283180529e-05, + "loss": 0.6422, + "step": 1109 + }, + { + "epoch": 0.592, + "grad_norm": 0.40255733702132107, + "learning_rate": 7.537210931679987e-05, + "loss": 0.6722, + "step": 1110 + }, + { + "epoch": 0.5925333333333334, + "grad_norm": 0.3862620472965459, + "learning_rate": 7.520466385816671e-05, + "loss": 0.6753, + "step": 1111 + }, + { + "epoch": 0.5930666666666666, + "grad_norm": 0.5682438764980138, + "learning_rate": 7.503729244217086e-05, + "loss": 0.8275, + "step": 1112 + }, + { + "epoch": 0.5936, + "grad_norm": 0.38896197148966755, + "learning_rate": 7.48699955686089e-05, + "loss": 0.6761, + "step": 1113 + }, + { + "epoch": 0.5941333333333333, + "grad_norm": 0.36915780047158037, + "learning_rate": 7.470277373705461e-05, + "loss": 0.647, + "step": 1114 + }, + { + "epoch": 0.5946666666666667, + "grad_norm": 0.4247439920018155, + "learning_rate": 7.453562744685778e-05, + "loss": 0.7008, + "step": 1115 + }, + { + "epoch": 0.5952, + "grad_norm": 0.3757141023503735, + "learning_rate": 7.43685571971426e-05, + "loss": 0.705, + "step": 1116 + }, + { + "epoch": 0.5957333333333333, + "grad_norm": 0.380872384717392, + "learning_rate": 7.42015634868062e-05, + "loss": 0.6479, + "step": 1117 + }, + { + "epoch": 0.5962666666666666, + "grad_norm": 0.3710352051491737, + "learning_rate": 7.403464681451715e-05, + "loss": 0.6877, + "step": 1118 + }, + { + "epoch": 0.5968, + "grad_norm": 0.36921679104831656, + "learning_rate": 7.386780767871397e-05, + "loss": 0.7111, + "step": 1119 + }, + { + "epoch": 0.5973333333333334, + "grad_norm": 0.40081663228508874, + "learning_rate": 7.370104657760361e-05, + "loss": 0.7137, + "step": 1120 + }, + { + "epoch": 0.5978666666666667, + "grad_norm": 0.43909280291070063, + "learning_rate": 7.353436400916004e-05, + "loss": 0.7445, + "step": 1121 + }, + { + "epoch": 0.5984, + "grad_norm": 0.40435688760211064, + "learning_rate": 7.336776047112276e-05, + "loss": 0.7437, + "step": 1122 + }, + { + "epoch": 0.5989333333333333, + "grad_norm": 0.37014311897723245, + "learning_rate": 7.320123646099519e-05, + "loss": 0.6338, + "step": 1123 + }, + { + "epoch": 0.5994666666666667, + "grad_norm": 0.3771698597332727, + "learning_rate": 7.303479247604332e-05, + "loss": 0.6736, + "step": 1124 + }, + { + "epoch": 0.6, + "grad_norm": 0.33845853276475274, + "learning_rate": 7.286842901329412e-05, + "loss": 0.5986, + "step": 1125 + }, + { + "epoch": 0.6005333333333334, + "grad_norm": 0.4146528022896513, + "learning_rate": 7.270214656953415e-05, + "loss": 0.6821, + "step": 1126 + }, + { + "epoch": 0.6010666666666666, + "grad_norm": 0.4878004732486644, + "learning_rate": 7.253594564130804e-05, + "loss": 0.7403, + "step": 1127 + }, + { + "epoch": 0.6016, + "grad_norm": 0.45801342010665835, + "learning_rate": 7.236982672491698e-05, + "loss": 0.7183, + "step": 1128 + }, + { + "epoch": 0.6021333333333333, + "grad_norm": 0.502859319845073, + "learning_rate": 7.22037903164173e-05, + "loss": 0.7439, + "step": 1129 + }, + { + "epoch": 0.6026666666666667, + "grad_norm": 0.392322837743993, + "learning_rate": 7.203783691161883e-05, + "loss": 0.6615, + "step": 1130 + }, + { + "epoch": 0.6032, + "grad_norm": 0.3751114926081077, + "learning_rate": 7.187196700608373e-05, + "loss": 0.6512, + "step": 1131 + }, + { + "epoch": 0.6037333333333333, + "grad_norm": 0.3901322696216327, + "learning_rate": 7.170618109512465e-05, + "loss": 0.6758, + "step": 1132 + }, + { + "epoch": 0.6042666666666666, + "grad_norm": 0.35435486379734865, + "learning_rate": 7.154047967380354e-05, + "loss": 0.5914, + "step": 1133 + }, + { + "epoch": 0.6048, + "grad_norm": 0.3652790684399189, + "learning_rate": 7.137486323692995e-05, + "loss": 0.5878, + "step": 1134 + }, + { + "epoch": 0.6053333333333333, + "grad_norm": 0.4087224220507586, + "learning_rate": 7.12093322790597e-05, + "loss": 0.6718, + "step": 1135 + }, + { + "epoch": 0.6058666666666667, + "grad_norm": 0.3792542895196207, + "learning_rate": 7.104388729449338e-05, + "loss": 0.6574, + "step": 1136 + }, + { + "epoch": 0.6064, + "grad_norm": 0.39634558554168403, + "learning_rate": 7.087852877727481e-05, + "loss": 0.6474, + "step": 1137 + }, + { + "epoch": 0.6069333333333333, + "grad_norm": 0.38772305877199503, + "learning_rate": 7.071325722118963e-05, + "loss": 0.6769, + "step": 1138 + }, + { + "epoch": 0.6074666666666667, + "grad_norm": 0.41830416685735256, + "learning_rate": 7.054807311976379e-05, + "loss": 0.6715, + "step": 1139 + }, + { + "epoch": 0.608, + "grad_norm": 0.4545498947925662, + "learning_rate": 7.038297696626206e-05, + "loss": 0.6623, + "step": 1140 + }, + { + "epoch": 0.6085333333333334, + "grad_norm": 0.4042365730812182, + "learning_rate": 7.021796925368667e-05, + "loss": 0.6494, + "step": 1141 + }, + { + "epoch": 0.6090666666666666, + "grad_norm": 0.35185184266505637, + "learning_rate": 7.005305047477566e-05, + "loss": 0.5844, + "step": 1142 + }, + { + "epoch": 0.6096, + "grad_norm": 0.43979189245947675, + "learning_rate": 6.988822112200156e-05, + "loss": 0.6531, + "step": 1143 + }, + { + "epoch": 0.6101333333333333, + "grad_norm": 0.4105425607854786, + "learning_rate": 6.972348168756983e-05, + "loss": 0.7003, + "step": 1144 + }, + { + "epoch": 0.6106666666666667, + "grad_norm": 0.44073988871122777, + "learning_rate": 6.955883266341741e-05, + "loss": 0.6725, + "step": 1145 + }, + { + "epoch": 0.6112, + "grad_norm": 0.40715286216987917, + "learning_rate": 6.939427454121128e-05, + "loss": 0.6498, + "step": 1146 + }, + { + "epoch": 0.6117333333333334, + "grad_norm": 0.4168147465359035, + "learning_rate": 6.922980781234699e-05, + "loss": 0.7153, + "step": 1147 + }, + { + "epoch": 0.6122666666666666, + "grad_norm": 0.34734906442239943, + "learning_rate": 6.906543296794714e-05, + "loss": 0.5973, + "step": 1148 + }, + { + "epoch": 0.6128, + "grad_norm": 0.4141761269341916, + "learning_rate": 6.890115049885994e-05, + "loss": 0.6778, + "step": 1149 + }, + { + "epoch": 0.6133333333333333, + "grad_norm": 0.46154360872865435, + "learning_rate": 6.873696089565786e-05, + "loss": 0.6951, + "step": 1150 + }, + { + "epoch": 0.6138666666666667, + "grad_norm": 0.39235942831177384, + "learning_rate": 6.85728646486359e-05, + "loss": 0.6164, + "step": 1151 + }, + { + "epoch": 0.6144, + "grad_norm": 0.4086673976195869, + "learning_rate": 6.84088622478104e-05, + "loss": 0.6314, + "step": 1152 + }, + { + "epoch": 0.6149333333333333, + "grad_norm": 0.38699047434733536, + "learning_rate": 6.82449541829174e-05, + "loss": 0.637, + "step": 1153 + }, + { + "epoch": 0.6154666666666667, + "grad_norm": 0.3864206136275288, + "learning_rate": 6.80811409434113e-05, + "loss": 0.6444, + "step": 1154 + }, + { + "epoch": 0.616, + "grad_norm": 0.6443641593948699, + "learning_rate": 6.791742301846326e-05, + "loss": 0.6711, + "step": 1155 + }, + { + "epoch": 0.6165333333333334, + "grad_norm": 0.4639491609604134, + "learning_rate": 6.775380089695986e-05, + "loss": 0.7731, + "step": 1156 + }, + { + "epoch": 0.6170666666666667, + "grad_norm": 0.4338398092876548, + "learning_rate": 6.759027506750158e-05, + "loss": 0.6755, + "step": 1157 + }, + { + "epoch": 0.6176, + "grad_norm": 0.4216504465825253, + "learning_rate": 6.742684601840141e-05, + "loss": 0.7517, + "step": 1158 + }, + { + "epoch": 0.6181333333333333, + "grad_norm": 0.39691900859881774, + "learning_rate": 6.726351423768322e-05, + "loss": 0.6456, + "step": 1159 + }, + { + "epoch": 0.6186666666666667, + "grad_norm": 0.39944058550910694, + "learning_rate": 6.710028021308061e-05, + "loss": 0.7185, + "step": 1160 + }, + { + "epoch": 0.6192, + "grad_norm": 0.37929747161823074, + "learning_rate": 6.693714443203507e-05, + "loss": 0.6129, + "step": 1161 + }, + { + "epoch": 0.6197333333333334, + "grad_norm": 0.4121689131249372, + "learning_rate": 6.677410738169485e-05, + "loss": 0.6424, + "step": 1162 + }, + { + "epoch": 0.6202666666666666, + "grad_norm": 0.3811750190715987, + "learning_rate": 6.661116954891328e-05, + "loss": 0.6755, + "step": 1163 + }, + { + "epoch": 0.6208, + "grad_norm": 0.3789942012694531, + "learning_rate": 6.644833142024751e-05, + "loss": 0.6399, + "step": 1164 + }, + { + "epoch": 0.6213333333333333, + "grad_norm": 0.43042891265566546, + "learning_rate": 6.62855934819569e-05, + "loss": 0.6859, + "step": 1165 + }, + { + "epoch": 0.6218666666666667, + "grad_norm": 0.3753988772879043, + "learning_rate": 6.612295622000162e-05, + "loss": 0.6517, + "step": 1166 + }, + { + "epoch": 0.6224, + "grad_norm": 0.4132369716226117, + "learning_rate": 6.59604201200412e-05, + "loss": 0.6835, + "step": 1167 + }, + { + "epoch": 0.6229333333333333, + "grad_norm": 0.39115922751091403, + "learning_rate": 6.579798566743314e-05, + "loss": 0.6629, + "step": 1168 + }, + { + "epoch": 0.6234666666666666, + "grad_norm": 0.4039177207102229, + "learning_rate": 6.563565334723134e-05, + "loss": 0.6752, + "step": 1169 + }, + { + "epoch": 0.624, + "grad_norm": 0.39008089390378314, + "learning_rate": 6.547342364418481e-05, + "loss": 0.7208, + "step": 1170 + }, + { + "epoch": 0.6245333333333334, + "grad_norm": 0.3608937832830116, + "learning_rate": 6.531129704273604e-05, + "loss": 0.624, + "step": 1171 + }, + { + "epoch": 0.6250666666666667, + "grad_norm": 0.43383834658583326, + "learning_rate": 6.514927402701964e-05, + "loss": 0.6717, + "step": 1172 + }, + { + "epoch": 0.6256, + "grad_norm": 0.4405346402821102, + "learning_rate": 6.498735508086093e-05, + "loss": 0.742, + "step": 1173 + }, + { + "epoch": 0.6261333333333333, + "grad_norm": 0.41521861877120464, + "learning_rate": 6.48255406877745e-05, + "loss": 0.7284, + "step": 1174 + }, + { + "epoch": 0.6266666666666667, + "grad_norm": 0.37123179121419514, + "learning_rate": 6.466383133096267e-05, + "loss": 0.6345, + "step": 1175 + }, + { + "epoch": 0.6272, + "grad_norm": 0.38631151315292184, + "learning_rate": 6.450222749331414e-05, + "loss": 0.6942, + "step": 1176 + }, + { + "epoch": 0.6277333333333334, + "grad_norm": 0.38387266191895614, + "learning_rate": 6.434072965740242e-05, + "loss": 0.6166, + "step": 1177 + }, + { + "epoch": 0.6282666666666666, + "grad_norm": 0.3582324416184515, + "learning_rate": 6.417933830548467e-05, + "loss": 0.6622, + "step": 1178 + }, + { + "epoch": 0.6288, + "grad_norm": 0.39699462890408693, + "learning_rate": 6.40180539194999e-05, + "loss": 0.6902, + "step": 1179 + }, + { + "epoch": 0.6293333333333333, + "grad_norm": 0.38693347836303943, + "learning_rate": 6.385687698106781e-05, + "loss": 0.6592, + "step": 1180 + }, + { + "epoch": 0.6298666666666667, + "grad_norm": 0.4052606688475977, + "learning_rate": 6.369580797148718e-05, + "loss": 0.6575, + "step": 1181 + }, + { + "epoch": 0.6304, + "grad_norm": 0.5729492026539168, + "learning_rate": 6.35348473717345e-05, + "loss": 0.7168, + "step": 1182 + }, + { + "epoch": 0.6309333333333333, + "grad_norm": 0.45176947535627326, + "learning_rate": 6.337399566246257e-05, + "loss": 0.6704, + "step": 1183 + }, + { + "epoch": 0.6314666666666666, + "grad_norm": 0.3954642199930997, + "learning_rate": 6.321325332399903e-05, + "loss": 0.669, + "step": 1184 + }, + { + "epoch": 0.632, + "grad_norm": 0.3705010476821673, + "learning_rate": 6.305262083634488e-05, + "loss": 0.6375, + "step": 1185 + }, + { + "epoch": 0.6325333333333333, + "grad_norm": 0.4110192203534155, + "learning_rate": 6.289209867917312e-05, + "loss": 0.7027, + "step": 1186 + }, + { + "epoch": 0.6330666666666667, + "grad_norm": 0.49477491154053943, + "learning_rate": 6.273168733182722e-05, + "loss": 0.6808, + "step": 1187 + }, + { + "epoch": 0.6336, + "grad_norm": 0.42176395673777783, + "learning_rate": 6.25713872733199e-05, + "loss": 0.7151, + "step": 1188 + }, + { + "epoch": 0.6341333333333333, + "grad_norm": 0.41520653526248, + "learning_rate": 6.241119898233144e-05, + "loss": 0.6729, + "step": 1189 + }, + { + "epoch": 0.6346666666666667, + "grad_norm": 0.3758987690503946, + "learning_rate": 6.225112293720836e-05, + "loss": 0.6589, + "step": 1190 + }, + { + "epoch": 0.6352, + "grad_norm": 0.3961954582396681, + "learning_rate": 6.209115961596208e-05, + "loss": 0.671, + "step": 1191 + }, + { + "epoch": 0.6357333333333334, + "grad_norm": 0.39901339556539506, + "learning_rate": 6.19313094962673e-05, + "loss": 0.6496, + "step": 1192 + }, + { + "epoch": 0.6362666666666666, + "grad_norm": 0.4580813497560127, + "learning_rate": 6.177157305546078e-05, + "loss": 0.79, + "step": 1193 + }, + { + "epoch": 0.6368, + "grad_norm": 0.46003217150633746, + "learning_rate": 6.161195077053976e-05, + "loss": 0.7883, + "step": 1194 + }, + { + "epoch": 0.6373333333333333, + "grad_norm": 0.3964260553568906, + "learning_rate": 6.145244311816063e-05, + "loss": 0.6181, + "step": 1195 + }, + { + "epoch": 0.6378666666666667, + "grad_norm": 0.3807548731983716, + "learning_rate": 6.129305057463741e-05, + "loss": 0.6998, + "step": 1196 + }, + { + "epoch": 0.6384, + "grad_norm": 0.4360297205208044, + "learning_rate": 6.113377361594049e-05, + "loss": 0.6262, + "step": 1197 + }, + { + "epoch": 0.6389333333333334, + "grad_norm": 0.3865799540892062, + "learning_rate": 6.0974612717695004e-05, + "loss": 0.6751, + "step": 1198 + }, + { + "epoch": 0.6394666666666666, + "grad_norm": 0.3996530539284033, + "learning_rate": 6.0815568355179556e-05, + "loss": 0.6568, + "step": 1199 + }, + { + "epoch": 0.64, + "grad_norm": 0.3727877088707968, + "learning_rate": 6.065664100332478e-05, + "loss": 0.6643, + "step": 1200 + }, + { + "epoch": 0.6405333333333333, + "grad_norm": 0.3506964916728646, + "learning_rate": 6.0497831136711836e-05, + "loss": 0.633, + "step": 1201 + }, + { + "epoch": 0.6410666666666667, + "grad_norm": 0.4065845009917423, + "learning_rate": 6.0339139229571116e-05, + "loss": 0.6625, + "step": 1202 + }, + { + "epoch": 0.6416, + "grad_norm": 0.40512894680210737, + "learning_rate": 6.018056575578075e-05, + "loss": 0.6539, + "step": 1203 + }, + { + "epoch": 0.6421333333333333, + "grad_norm": 0.41785748473259554, + "learning_rate": 6.002211118886514e-05, + "loss": 0.6112, + "step": 1204 + }, + { + "epoch": 0.6426666666666667, + "grad_norm": 0.37645224956069007, + "learning_rate": 5.986377600199371e-05, + "loss": 0.6089, + "step": 1205 + }, + { + "epoch": 0.6432, + "grad_norm": 0.38218319419326885, + "learning_rate": 5.970556066797941e-05, + "loss": 0.6297, + "step": 1206 + }, + { + "epoch": 0.6437333333333334, + "grad_norm": 0.45907322660182703, + "learning_rate": 5.9547465659277215e-05, + "loss": 0.7344, + "step": 1207 + }, + { + "epoch": 0.6442666666666667, + "grad_norm": 0.36262594700865114, + "learning_rate": 5.938949144798279e-05, + "loss": 0.5988, + "step": 1208 + }, + { + "epoch": 0.6448, + "grad_norm": 0.39423976779658926, + "learning_rate": 5.923163850583113e-05, + "loss": 0.6889, + "step": 1209 + }, + { + "epoch": 0.6453333333333333, + "grad_norm": 0.37613984271554846, + "learning_rate": 5.907390730419507e-05, + "loss": 0.6263, + "step": 1210 + }, + { + "epoch": 0.6458666666666667, + "grad_norm": 0.4122433587935938, + "learning_rate": 5.8916298314083915e-05, + "loss": 0.7386, + "step": 1211 + }, + { + "epoch": 0.6464, + "grad_norm": 0.43667685771125925, + "learning_rate": 5.875881200614207e-05, + "loss": 0.6977, + "step": 1212 + }, + { + "epoch": 0.6469333333333334, + "grad_norm": 0.3711145671652538, + "learning_rate": 5.860144885064751e-05, + "loss": 0.6362, + "step": 1213 + }, + { + "epoch": 0.6474666666666666, + "grad_norm": 0.37451275746745327, + "learning_rate": 5.8444209317510514e-05, + "loss": 0.6478, + "step": 1214 + }, + { + "epoch": 0.648, + "grad_norm": 0.3979133670270158, + "learning_rate": 5.828709387627218e-05, + "loss": 0.6893, + "step": 1215 + }, + { + "epoch": 0.6485333333333333, + "grad_norm": 0.3484883827048639, + "learning_rate": 5.813010299610313e-05, + "loss": 0.6092, + "step": 1216 + }, + { + "epoch": 0.6490666666666667, + "grad_norm": 0.34144110736344746, + "learning_rate": 5.797323714580192e-05, + "loss": 0.5815, + "step": 1217 + }, + { + "epoch": 0.6496, + "grad_norm": 0.4301704864516479, + "learning_rate": 5.781649679379378e-05, + "loss": 0.6662, + "step": 1218 + }, + { + "epoch": 0.6501333333333333, + "grad_norm": 0.4348656862576235, + "learning_rate": 5.765988240812921e-05, + "loss": 0.7228, + "step": 1219 + }, + { + "epoch": 0.6506666666666666, + "grad_norm": 0.35639601367192286, + "learning_rate": 5.750339445648252e-05, + "loss": 0.6469, + "step": 1220 + }, + { + "epoch": 0.6512, + "grad_norm": 0.40792386695573035, + "learning_rate": 5.73470334061505e-05, + "loss": 0.7141, + "step": 1221 + }, + { + "epoch": 0.6517333333333334, + "grad_norm": 0.40742722168314527, + "learning_rate": 5.7190799724050924e-05, + "loss": 0.6018, + "step": 1222 + }, + { + "epoch": 0.6522666666666667, + "grad_norm": 0.38447771453604046, + "learning_rate": 5.7034693876721376e-05, + "loss": 0.6091, + "step": 1223 + }, + { + "epoch": 0.6528, + "grad_norm": 0.38990919056436035, + "learning_rate": 5.687871633031754e-05, + "loss": 0.6791, + "step": 1224 + }, + { + "epoch": 0.6533333333333333, + "grad_norm": 0.4344561767186191, + "learning_rate": 5.6722867550612116e-05, + "loss": 0.6499, + "step": 1225 + }, + { + "epoch": 0.6538666666666667, + "grad_norm": 0.3548734099373113, + "learning_rate": 5.6567148002993164e-05, + "loss": 0.5503, + "step": 1226 + }, + { + "epoch": 0.6544, + "grad_norm": 0.38248543772581156, + "learning_rate": 5.6411558152462894e-05, + "loss": 0.657, + "step": 1227 + }, + { + "epoch": 0.6549333333333334, + "grad_norm": 0.338733827740271, + "learning_rate": 5.625609846363622e-05, + "loss": 0.6066, + "step": 1228 + }, + { + "epoch": 0.6554666666666666, + "grad_norm": 0.41067698761926447, + "learning_rate": 5.6100769400739383e-05, + "loss": 0.6894, + "step": 1229 + }, + { + "epoch": 0.656, + "grad_norm": 0.4348889769321385, + "learning_rate": 5.5945571427608526e-05, + "loss": 0.6455, + "step": 1230 + }, + { + "epoch": 0.6565333333333333, + "grad_norm": 0.39813712105076304, + "learning_rate": 5.579050500768836e-05, + "loss": 0.6901, + "step": 1231 + }, + { + "epoch": 0.6570666666666667, + "grad_norm": 0.4265205105745672, + "learning_rate": 5.5635570604030705e-05, + "loss": 0.5998, + "step": 1232 + }, + { + "epoch": 0.6576, + "grad_norm": 0.4330741219625014, + "learning_rate": 5.54807686792933e-05, + "loss": 0.7161, + "step": 1233 + }, + { + "epoch": 0.6581333333333333, + "grad_norm": 0.3657696700746177, + "learning_rate": 5.53260996957381e-05, + "loss": 0.6783, + "step": 1234 + }, + { + "epoch": 0.6586666666666666, + "grad_norm": 0.36644757405630646, + "learning_rate": 5.5171564115230254e-05, + "loss": 0.6734, + "step": 1235 + }, + { + "epoch": 0.6592, + "grad_norm": 0.45947345643473103, + "learning_rate": 5.501716239923642e-05, + "loss": 0.7396, + "step": 1236 + }, + { + "epoch": 0.6597333333333333, + "grad_norm": 0.3489668548183046, + "learning_rate": 5.486289500882355e-05, + "loss": 0.6197, + "step": 1237 + }, + { + "epoch": 0.6602666666666667, + "grad_norm": 0.3690643783308543, + "learning_rate": 5.47087624046575e-05, + "loss": 0.6344, + "step": 1238 + }, + { + "epoch": 0.6608, + "grad_norm": 0.456107720397437, + "learning_rate": 5.4554765047001613e-05, + "loss": 0.7215, + "step": 1239 + }, + { + "epoch": 0.6613333333333333, + "grad_norm": 0.40203323005580555, + "learning_rate": 5.4400903395715366e-05, + "loss": 0.7176, + "step": 1240 + }, + { + "epoch": 0.6618666666666667, + "grad_norm": 0.38494287502414254, + "learning_rate": 5.424717791025302e-05, + "loss": 0.6035, + "step": 1241 + }, + { + "epoch": 0.6624, + "grad_norm": 0.3734995791052284, + "learning_rate": 5.4093589049662175e-05, + "loss": 0.6354, + "step": 1242 + }, + { + "epoch": 0.6629333333333334, + "grad_norm": 0.3936094263134665, + "learning_rate": 5.394013727258254e-05, + "loss": 0.6424, + "step": 1243 + }, + { + "epoch": 0.6634666666666666, + "grad_norm": 0.44562805874341715, + "learning_rate": 5.378682303724435e-05, + "loss": 0.7226, + "step": 1244 + }, + { + "epoch": 0.664, + "grad_norm": 0.3839119816128122, + "learning_rate": 5.363364680146725e-05, + "loss": 0.6603, + "step": 1245 + }, + { + "epoch": 0.6645333333333333, + "grad_norm": 0.4483123370581339, + "learning_rate": 5.348060902265871e-05, + "loss": 0.7056, + "step": 1246 + }, + { + "epoch": 0.6650666666666667, + "grad_norm": 0.3468698326603632, + "learning_rate": 5.332771015781275e-05, + "loss": 0.6133, + "step": 1247 + }, + { + "epoch": 0.6656, + "grad_norm": 0.4149625705227579, + "learning_rate": 5.31749506635086e-05, + "loss": 0.6367, + "step": 1248 + }, + { + "epoch": 0.6661333333333334, + "grad_norm": 0.46041886071476645, + "learning_rate": 5.302233099590928e-05, + "loss": 0.7283, + "step": 1249 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.35143148139417685, + "learning_rate": 5.286985161076029e-05, + "loss": 0.6678, + "step": 1250 + }, + { + "epoch": 0.6672, + "grad_norm": 0.3556389058456103, + "learning_rate": 5.271751296338823e-05, + "loss": 0.587, + "step": 1251 + }, + { + "epoch": 0.6677333333333333, + "grad_norm": 0.4266997961251182, + "learning_rate": 5.2565315508699376e-05, + "loss": 0.6836, + "step": 1252 + }, + { + "epoch": 0.6682666666666667, + "grad_norm": 0.3936042753495862, + "learning_rate": 5.2413259701178505e-05, + "loss": 0.6242, + "step": 1253 + }, + { + "epoch": 0.6688, + "grad_norm": 0.4231806637101834, + "learning_rate": 5.226134599488728e-05, + "loss": 0.7319, + "step": 1254 + }, + { + "epoch": 0.6693333333333333, + "grad_norm": 0.41705570276158715, + "learning_rate": 5.210957484346314e-05, + "loss": 0.7147, + "step": 1255 + }, + { + "epoch": 0.6698666666666667, + "grad_norm": 0.43294963616605125, + "learning_rate": 5.195794670011776e-05, + "loss": 0.6583, + "step": 1256 + }, + { + "epoch": 0.6704, + "grad_norm": 0.4156204083871984, + "learning_rate": 5.180646201763577e-05, + "loss": 0.6546, + "step": 1257 + }, + { + "epoch": 0.6709333333333334, + "grad_norm": 0.35747998071692966, + "learning_rate": 5.165512124837344e-05, + "loss": 0.6083, + "step": 1258 + }, + { + "epoch": 0.6714666666666667, + "grad_norm": 0.3688837870547564, + "learning_rate": 5.150392484425728e-05, + "loss": 0.649, + "step": 1259 + }, + { + "epoch": 0.672, + "grad_norm": 0.40179747105550656, + "learning_rate": 5.135287325678271e-05, + "loss": 0.6527, + "step": 1260 + }, + { + "epoch": 0.6725333333333333, + "grad_norm": 0.37358230952691235, + "learning_rate": 5.120196693701267e-05, + "loss": 0.6378, + "step": 1261 + }, + { + "epoch": 0.6730666666666667, + "grad_norm": 0.35890694501042475, + "learning_rate": 5.105120633557634e-05, + "loss": 0.5566, + "step": 1262 + }, + { + "epoch": 0.6736, + "grad_norm": 0.49052617470132837, + "learning_rate": 5.090059190266779e-05, + "loss": 0.7281, + "step": 1263 + }, + { + "epoch": 0.6741333333333334, + "grad_norm": 0.46210192447504694, + "learning_rate": 5.075012408804458e-05, + "loss": 0.689, + "step": 1264 + }, + { + "epoch": 0.6746666666666666, + "grad_norm": 0.37130683789508834, + "learning_rate": 5.059980334102637e-05, + "loss": 0.6301, + "step": 1265 + }, + { + "epoch": 0.6752, + "grad_norm": 0.4074895758430878, + "learning_rate": 5.0449630110493836e-05, + "loss": 0.6395, + "step": 1266 + }, + { + "epoch": 0.6757333333333333, + "grad_norm": 0.37935208826673683, + "learning_rate": 5.0299604844886985e-05, + "loss": 0.6514, + "step": 1267 + }, + { + "epoch": 0.6762666666666667, + "grad_norm": 0.34797199775753, + "learning_rate": 5.014972799220403e-05, + "loss": 0.6002, + "step": 1268 + }, + { + "epoch": 0.6768, + "grad_norm": 0.3522962014137995, + "learning_rate": 5.000000000000002e-05, + "loss": 0.6155, + "step": 1269 + }, + { + "epoch": 0.6773333333333333, + "grad_norm": 0.42253423847456223, + "learning_rate": 4.985042131538545e-05, + "loss": 0.7097, + "step": 1270 + }, + { + "epoch": 0.6778666666666666, + "grad_norm": 0.3778680512518862, + "learning_rate": 4.9700992385024934e-05, + "loss": 0.6362, + "step": 1271 + }, + { + "epoch": 0.6784, + "grad_norm": 0.4296025660232695, + "learning_rate": 4.955171365513603e-05, + "loss": 0.7065, + "step": 1272 + }, + { + "epoch": 0.6789333333333334, + "grad_norm": 0.4062404775585464, + "learning_rate": 4.940258557148765e-05, + "loss": 0.6829, + "step": 1273 + }, + { + "epoch": 0.6794666666666667, + "grad_norm": 0.4858595019574037, + "learning_rate": 4.9253608579398855e-05, + "loss": 0.7522, + "step": 1274 + }, + { + "epoch": 0.68, + "grad_norm": 0.377448766992822, + "learning_rate": 4.9104783123737566e-05, + "loss": 0.6163, + "step": 1275 + }, + { + "epoch": 0.6805333333333333, + "grad_norm": 0.43079110371344853, + "learning_rate": 4.895610964891923e-05, + "loss": 0.6869, + "step": 1276 + }, + { + "epoch": 0.6810666666666667, + "grad_norm": 0.39744249709827023, + "learning_rate": 4.880758859890536e-05, + "loss": 0.6974, + "step": 1277 + }, + { + "epoch": 0.6816, + "grad_norm": 0.382931377970497, + "learning_rate": 4.865922041720239e-05, + "loss": 0.6295, + "step": 1278 + }, + { + "epoch": 0.6821333333333334, + "grad_norm": 0.5014922132487974, + "learning_rate": 4.851100554686021e-05, + "loss": 0.8143, + "step": 1279 + }, + { + "epoch": 0.6826666666666666, + "grad_norm": 0.3613806912915405, + "learning_rate": 4.836294443047088e-05, + "loss": 0.6521, + "step": 1280 + }, + { + "epoch": 0.6832, + "grad_norm": 0.46920678429921814, + "learning_rate": 4.821503751016746e-05, + "loss": 0.6624, + "step": 1281 + }, + { + "epoch": 0.6837333333333333, + "grad_norm": 0.42141869157299844, + "learning_rate": 4.8067285227622404e-05, + "loss": 0.6487, + "step": 1282 + }, + { + "epoch": 0.6842666666666667, + "grad_norm": 0.3974521208398004, + "learning_rate": 4.791968802404648e-05, + "loss": 0.6627, + "step": 1283 + }, + { + "epoch": 0.6848, + "grad_norm": 0.38704435042495616, + "learning_rate": 4.777224634018732e-05, + "loss": 0.722, + "step": 1284 + }, + { + "epoch": 0.6853333333333333, + "grad_norm": 0.3713696259428589, + "learning_rate": 4.762496061632814e-05, + "loss": 0.6285, + "step": 1285 + }, + { + "epoch": 0.6858666666666666, + "grad_norm": 0.371014800468022, + "learning_rate": 4.747783129228656e-05, + "loss": 0.6435, + "step": 1286 + }, + { + "epoch": 0.6864, + "grad_norm": 0.4024601896130607, + "learning_rate": 4.733085880741301e-05, + "loss": 0.6752, + "step": 1287 + }, + { + "epoch": 0.6869333333333333, + "grad_norm": 0.34903249981417445, + "learning_rate": 4.718404360058966e-05, + "loss": 0.606, + "step": 1288 + }, + { + "epoch": 0.6874666666666667, + "grad_norm": 0.40509439359192134, + "learning_rate": 4.7037386110228985e-05, + "loss": 0.6945, + "step": 1289 + }, + { + "epoch": 0.688, + "grad_norm": 0.3694649482760305, + "learning_rate": 4.689088677427249e-05, + "loss": 0.6442, + "step": 1290 + }, + { + "epoch": 0.6885333333333333, + "grad_norm": 0.39873395808385487, + "learning_rate": 4.6744546030189486e-05, + "loss": 0.6284, + "step": 1291 + }, + { + "epoch": 0.6890666666666667, + "grad_norm": 0.3407261775615519, + "learning_rate": 4.659836431497563e-05, + "loss": 0.5643, + "step": 1292 + }, + { + "epoch": 0.6896, + "grad_norm": 0.41501238449358774, + "learning_rate": 4.645234206515171e-05, + "loss": 0.6191, + "step": 1293 + }, + { + "epoch": 0.6901333333333334, + "grad_norm": 0.4104495898247997, + "learning_rate": 4.630647971676232e-05, + "loss": 0.6784, + "step": 1294 + }, + { + "epoch": 0.6906666666666667, + "grad_norm": 0.3807323293291751, + "learning_rate": 4.6160777705374524e-05, + "loss": 0.635, + "step": 1295 + }, + { + "epoch": 0.6912, + "grad_norm": 0.4595544098150067, + "learning_rate": 4.6015236466076747e-05, + "loss": 0.6875, + "step": 1296 + }, + { + "epoch": 0.6917333333333333, + "grad_norm": 0.39532369727474764, + "learning_rate": 4.586985643347717e-05, + "loss": 0.6348, + "step": 1297 + }, + { + "epoch": 0.6922666666666667, + "grad_norm": 0.40109321782211227, + "learning_rate": 4.572463804170263e-05, + "loss": 0.6506, + "step": 1298 + }, + { + "epoch": 0.6928, + "grad_norm": 0.4037411071068034, + "learning_rate": 4.5579581724397255e-05, + "loss": 0.6508, + "step": 1299 + }, + { + "epoch": 0.6933333333333334, + "grad_norm": 0.44266104893603353, + "learning_rate": 4.543468791472131e-05, + "loss": 0.6878, + "step": 1300 + }, + { + "epoch": 0.6938666666666666, + "grad_norm": 0.4078006050037638, + "learning_rate": 4.5289957045349653e-05, + "loss": 0.6487, + "step": 1301 + }, + { + "epoch": 0.6944, + "grad_norm": 0.41100819450479764, + "learning_rate": 4.514538954847064e-05, + "loss": 0.6743, + "step": 1302 + }, + { + "epoch": 0.6949333333333333, + "grad_norm": 0.3538887019823367, + "learning_rate": 4.5000985855784746e-05, + "loss": 0.6215, + "step": 1303 + }, + { + "epoch": 0.6954666666666667, + "grad_norm": 0.4446714112666036, + "learning_rate": 4.485674639850333e-05, + "loss": 0.7301, + "step": 1304 + }, + { + "epoch": 0.696, + "grad_norm": 0.41170644722765004, + "learning_rate": 4.471267160734731e-05, + "loss": 0.7103, + "step": 1305 + }, + { + "epoch": 0.6965333333333333, + "grad_norm": 0.4605427485758196, + "learning_rate": 4.456876191254582e-05, + "loss": 0.7189, + "step": 1306 + }, + { + "epoch": 0.6970666666666666, + "grad_norm": 0.4061454260270999, + "learning_rate": 4.442501774383515e-05, + "loss": 0.6497, + "step": 1307 + }, + { + "epoch": 0.6976, + "grad_norm": 0.3603132790154237, + "learning_rate": 4.428143953045717e-05, + "loss": 0.61, + "step": 1308 + }, + { + "epoch": 0.6981333333333334, + "grad_norm": 0.4302046479056169, + "learning_rate": 4.413802770115816e-05, + "loss": 0.6986, + "step": 1309 + }, + { + "epoch": 0.6986666666666667, + "grad_norm": 0.3576314792300624, + "learning_rate": 4.399478268418771e-05, + "loss": 0.7011, + "step": 1310 + }, + { + "epoch": 0.6992, + "grad_norm": 0.43070992131470415, + "learning_rate": 4.385170490729712e-05, + "loss": 0.6702, + "step": 1311 + }, + { + "epoch": 0.6997333333333333, + "grad_norm": 0.3970467815078321, + "learning_rate": 4.3708794797738375e-05, + "loss": 0.684, + "step": 1312 + }, + { + "epoch": 0.7002666666666667, + "grad_norm": 0.4261021129159375, + "learning_rate": 4.3566052782262735e-05, + "loss": 0.7218, + "step": 1313 + }, + { + "epoch": 0.7008, + "grad_norm": 0.4204848942396004, + "learning_rate": 4.342347928711953e-05, + "loss": 0.6163, + "step": 1314 + }, + { + "epoch": 0.7013333333333334, + "grad_norm": 0.41785258652619117, + "learning_rate": 4.328107473805487e-05, + "loss": 0.6481, + "step": 1315 + }, + { + "epoch": 0.7018666666666666, + "grad_norm": 0.40126603920945875, + "learning_rate": 4.3138839560310303e-05, + "loss": 0.6839, + "step": 1316 + }, + { + "epoch": 0.7024, + "grad_norm": 0.39448733805358843, + "learning_rate": 4.2996774178621736e-05, + "loss": 0.6515, + "step": 1317 + }, + { + "epoch": 0.7029333333333333, + "grad_norm": 0.36827460951186997, + "learning_rate": 4.2854879017217894e-05, + "loss": 0.6259, + "step": 1318 + }, + { + "epoch": 0.7034666666666667, + "grad_norm": 0.42407810403447627, + "learning_rate": 4.271315449981934e-05, + "loss": 0.6963, + "step": 1319 + }, + { + "epoch": 0.704, + "grad_norm": 0.4594114457629619, + "learning_rate": 4.257160104963696e-05, + "loss": 0.6613, + "step": 1320 + }, + { + "epoch": 0.7045333333333333, + "grad_norm": 0.3810204216023159, + "learning_rate": 4.2430219089370823e-05, + "loss": 0.5945, + "step": 1321 + }, + { + "epoch": 0.7050666666666666, + "grad_norm": 0.43608545709597685, + "learning_rate": 4.228900904120895e-05, + "loss": 0.6711, + "step": 1322 + }, + { + "epoch": 0.7056, + "grad_norm": 0.3879651248398124, + "learning_rate": 4.2147971326825966e-05, + "loss": 0.6486, + "step": 1323 + }, + { + "epoch": 0.7061333333333333, + "grad_norm": 0.40586294188569094, + "learning_rate": 4.200710636738189e-05, + "loss": 0.713, + "step": 1324 + }, + { + "epoch": 0.7066666666666667, + "grad_norm": 0.40727511212981865, + "learning_rate": 4.1866414583520877e-05, + "loss": 0.6809, + "step": 1325 + }, + { + "epoch": 0.7072, + "grad_norm": 0.4776969230797926, + "learning_rate": 4.172589639536991e-05, + "loss": 0.7639, + "step": 1326 + }, + { + "epoch": 0.7077333333333333, + "grad_norm": 0.3578661709379814, + "learning_rate": 4.158555222253771e-05, + "loss": 0.6136, + "step": 1327 + }, + { + "epoch": 0.7082666666666667, + "grad_norm": 0.39325973556204963, + "learning_rate": 4.14453824841132e-05, + "loss": 0.6406, + "step": 1328 + }, + { + "epoch": 0.7088, + "grad_norm": 0.4229079575333293, + "learning_rate": 4.130538759866457e-05, + "loss": 0.6801, + "step": 1329 + }, + { + "epoch": 0.7093333333333334, + "grad_norm": 0.3923372999138519, + "learning_rate": 4.1165567984237764e-05, + "loss": 0.638, + "step": 1330 + }, + { + "epoch": 0.7098666666666666, + "grad_norm": 0.4059336587960118, + "learning_rate": 4.102592405835536e-05, + "loss": 0.664, + "step": 1331 + }, + { + "epoch": 0.7104, + "grad_norm": 0.3956049748736085, + "learning_rate": 4.088645623801534e-05, + "loss": 0.7063, + "step": 1332 + }, + { + "epoch": 0.7109333333333333, + "grad_norm": 0.37588095935875665, + "learning_rate": 4.074716493968975e-05, + "loss": 0.6065, + "step": 1333 + }, + { + "epoch": 0.7114666666666667, + "grad_norm": 0.3913508936345303, + "learning_rate": 4.060805057932359e-05, + "loss": 0.6192, + "step": 1334 + }, + { + "epoch": 0.712, + "grad_norm": 0.4283208983155436, + "learning_rate": 4.046911357233343e-05, + "loss": 0.6759, + "step": 1335 + }, + { + "epoch": 0.7125333333333334, + "grad_norm": 0.43955154453963924, + "learning_rate": 4.0330354333606234e-05, + "loss": 0.6939, + "step": 1336 + }, + { + "epoch": 0.7130666666666666, + "grad_norm": 0.42282096656173385, + "learning_rate": 4.019177327749822e-05, + "loss": 0.7184, + "step": 1337 + }, + { + "epoch": 0.7136, + "grad_norm": 0.395847147798968, + "learning_rate": 4.00533708178334e-05, + "loss": 0.6204, + "step": 1338 + }, + { + "epoch": 0.7141333333333333, + "grad_norm": 0.3622357159520991, + "learning_rate": 3.991514736790258e-05, + "loss": 0.6509, + "step": 1339 + }, + { + "epoch": 0.7146666666666667, + "grad_norm": 0.4081229826979606, + "learning_rate": 3.977710334046193e-05, + "loss": 0.6628, + "step": 1340 + }, + { + "epoch": 0.7152, + "grad_norm": 0.4219479283383955, + "learning_rate": 3.963923914773187e-05, + "loss": 0.6971, + "step": 1341 + }, + { + "epoch": 0.7157333333333333, + "grad_norm": 0.39957701543592067, + "learning_rate": 3.950155520139581e-05, + "loss": 0.6504, + "step": 1342 + }, + { + "epoch": 0.7162666666666667, + "grad_norm": 0.43649174920643524, + "learning_rate": 3.936405191259891e-05, + "loss": 0.6645, + "step": 1343 + }, + { + "epoch": 0.7168, + "grad_norm": 0.43141958081496057, + "learning_rate": 3.922672969194686e-05, + "loss": 0.6191, + "step": 1344 + }, + { + "epoch": 0.7173333333333334, + "grad_norm": 0.6286525667864393, + "learning_rate": 3.9089588949504655e-05, + "loss": 0.7388, + "step": 1345 + }, + { + "epoch": 0.7178666666666667, + "grad_norm": 0.38817772595331723, + "learning_rate": 3.895263009479534e-05, + "loss": 0.6446, + "step": 1346 + }, + { + "epoch": 0.7184, + "grad_norm": 0.412114221055016, + "learning_rate": 3.8815853536798904e-05, + "loss": 0.6787, + "step": 1347 + }, + { + "epoch": 0.7189333333333333, + "grad_norm": 0.4102293474560031, + "learning_rate": 3.867925968395085e-05, + "loss": 0.6781, + "step": 1348 + }, + { + "epoch": 0.7194666666666667, + "grad_norm": 0.3884376821622926, + "learning_rate": 3.854284894414122e-05, + "loss": 0.6717, + "step": 1349 + }, + { + "epoch": 0.72, + "grad_norm": 0.38493039643014365, + "learning_rate": 3.840662172471315e-05, + "loss": 0.6733, + "step": 1350 + }, + { + "epoch": 0.7205333333333334, + "grad_norm": 0.43585606362873025, + "learning_rate": 3.82705784324618e-05, + "loss": 0.6728, + "step": 1351 + }, + { + "epoch": 0.7210666666666666, + "grad_norm": 0.4280181141336135, + "learning_rate": 3.8134719473633094e-05, + "loss": 0.7325, + "step": 1352 + }, + { + "epoch": 0.7216, + "grad_norm": 0.38025146419690364, + "learning_rate": 3.79990452539225e-05, + "loss": 0.6435, + "step": 1353 + }, + { + "epoch": 0.7221333333333333, + "grad_norm": 0.5592834753588812, + "learning_rate": 3.786355617847385e-05, + "loss": 0.6667, + "step": 1354 + }, + { + "epoch": 0.7226666666666667, + "grad_norm": 0.4520181137303948, + "learning_rate": 3.772825265187802e-05, + "loss": 0.6851, + "step": 1355 + }, + { + "epoch": 0.7232, + "grad_norm": 0.39559532065065467, + "learning_rate": 3.759313507817196e-05, + "loss": 0.6669, + "step": 1356 + }, + { + "epoch": 0.7237333333333333, + "grad_norm": 0.35950931405771885, + "learning_rate": 3.7458203860837234e-05, + "loss": 0.6008, + "step": 1357 + }, + { + "epoch": 0.7242666666666666, + "grad_norm": 0.43270266536877905, + "learning_rate": 3.732345940279893e-05, + "loss": 0.6514, + "step": 1358 + }, + { + "epoch": 0.7248, + "grad_norm": 0.4344914160567954, + "learning_rate": 3.7188902106424416e-05, + "loss": 0.6384, + "step": 1359 + }, + { + "epoch": 0.7253333333333334, + "grad_norm": 0.4212639702085023, + "learning_rate": 3.705453237352227e-05, + "loss": 0.6417, + "step": 1360 + }, + { + "epoch": 0.7258666666666667, + "grad_norm": 0.3616803952409905, + "learning_rate": 3.692035060534088e-05, + "loss": 0.6676, + "step": 1361 + }, + { + "epoch": 0.7264, + "grad_norm": 0.4411224765959474, + "learning_rate": 3.678635720256737e-05, + "loss": 0.6665, + "step": 1362 + }, + { + "epoch": 0.7269333333333333, + "grad_norm": 0.38343553384175344, + "learning_rate": 3.665255256532638e-05, + "loss": 0.6612, + "step": 1363 + }, + { + "epoch": 0.7274666666666667, + "grad_norm": 0.32958688063092734, + "learning_rate": 3.651893709317887e-05, + "loss": 0.6135, + "step": 1364 + }, + { + "epoch": 0.728, + "grad_norm": 0.4026249083615412, + "learning_rate": 3.638551118512089e-05, + "loss": 0.6757, + "step": 1365 + }, + { + "epoch": 0.7285333333333334, + "grad_norm": 0.4048994078001853, + "learning_rate": 3.625227523958252e-05, + "loss": 0.6909, + "step": 1366 + }, + { + "epoch": 0.7290666666666666, + "grad_norm": 0.41934958724390453, + "learning_rate": 3.611922965442648e-05, + "loss": 0.598, + "step": 1367 + }, + { + "epoch": 0.7296, + "grad_norm": 0.42175418099779427, + "learning_rate": 3.5986374826947066e-05, + "loss": 0.6259, + "step": 1368 + }, + { + "epoch": 0.7301333333333333, + "grad_norm": 0.3610517497042293, + "learning_rate": 3.5853711153868965e-05, + "loss": 0.5823, + "step": 1369 + }, + { + "epoch": 0.7306666666666667, + "grad_norm": 0.4244103644608558, + "learning_rate": 3.5721239031346066e-05, + "loss": 0.6816, + "step": 1370 + }, + { + "epoch": 0.7312, + "grad_norm": 0.48352806735759074, + "learning_rate": 3.558895885496023e-05, + "loss": 0.7407, + "step": 1371 + }, + { + "epoch": 0.7317333333333333, + "grad_norm": 0.4817810985247976, + "learning_rate": 3.545687101972013e-05, + "loss": 0.6564, + "step": 1372 + }, + { + "epoch": 0.7322666666666666, + "grad_norm": 0.4697099875994108, + "learning_rate": 3.53249759200601e-05, + "loss": 0.7001, + "step": 1373 + }, + { + "epoch": 0.7328, + "grad_norm": 0.4572944881204827, + "learning_rate": 3.519327394983888e-05, + "loss": 0.6832, + "step": 1374 + }, + { + "epoch": 0.7333333333333333, + "grad_norm": 0.4525154876758304, + "learning_rate": 3.506176550233863e-05, + "loss": 0.714, + "step": 1375 + }, + { + "epoch": 0.7338666666666667, + "grad_norm": 0.4471117031442454, + "learning_rate": 3.4930450970263485e-05, + "loss": 0.6757, + "step": 1376 + }, + { + "epoch": 0.7344, + "grad_norm": 0.44471878985597685, + "learning_rate": 3.479933074573858e-05, + "loss": 0.6996, + "step": 1377 + }, + { + "epoch": 0.7349333333333333, + "grad_norm": 0.36843211872059467, + "learning_rate": 3.46684052203088e-05, + "loss": 0.6277, + "step": 1378 + }, + { + "epoch": 0.7354666666666667, + "grad_norm": 0.4441944802513513, + "learning_rate": 3.4537674784937614e-05, + "loss": 0.6573, + "step": 1379 + }, + { + "epoch": 0.736, + "grad_norm": 0.39347093014029866, + "learning_rate": 3.440713983000601e-05, + "loss": 0.6253, + "step": 1380 + }, + { + "epoch": 0.7365333333333334, + "grad_norm": 0.3952747929672544, + "learning_rate": 3.427680074531113e-05, + "loss": 0.6505, + "step": 1381 + }, + { + "epoch": 0.7370666666666666, + "grad_norm": 0.3618825221767066, + "learning_rate": 3.4146657920065285e-05, + "loss": 0.6592, + "step": 1382 + }, + { + "epoch": 0.7376, + "grad_norm": 0.42388720908295424, + "learning_rate": 3.401671174289469e-05, + "loss": 0.6772, + "step": 1383 + }, + { + "epoch": 0.7381333333333333, + "grad_norm": 0.3689456209657369, + "learning_rate": 3.388696260183832e-05, + "loss": 0.5967, + "step": 1384 + }, + { + "epoch": 0.7386666666666667, + "grad_norm": 0.424668041789921, + "learning_rate": 3.3757410884346894e-05, + "loss": 0.662, + "step": 1385 + }, + { + "epoch": 0.7392, + "grad_norm": 0.3754656171823722, + "learning_rate": 3.362805697728145e-05, + "loss": 0.6066, + "step": 1386 + }, + { + "epoch": 0.7397333333333334, + "grad_norm": 0.38912284150881926, + "learning_rate": 3.3498901266912396e-05, + "loss": 0.6442, + "step": 1387 + }, + { + "epoch": 0.7402666666666666, + "grad_norm": 0.3548864565059086, + "learning_rate": 3.336994413891828e-05, + "loss": 0.6006, + "step": 1388 + }, + { + "epoch": 0.7408, + "grad_norm": 0.3919077760510013, + "learning_rate": 3.324118597838464e-05, + "loss": 0.6277, + "step": 1389 + }, + { + "epoch": 0.7413333333333333, + "grad_norm": 0.39380410841258484, + "learning_rate": 3.3112627169802946e-05, + "loss": 0.6184, + "step": 1390 + }, + { + "epoch": 0.7418666666666667, + "grad_norm": 0.45027174851268076, + "learning_rate": 3.298426809706928e-05, + "loss": 0.676, + "step": 1391 + }, + { + "epoch": 0.7424, + "grad_norm": 0.40225833642099534, + "learning_rate": 3.285610914348332e-05, + "loss": 0.6047, + "step": 1392 + }, + { + "epoch": 0.7429333333333333, + "grad_norm": 0.4376514664438265, + "learning_rate": 3.2728150691747115e-05, + "loss": 0.7156, + "step": 1393 + }, + { + "epoch": 0.7434666666666667, + "grad_norm": 0.41199669510760717, + "learning_rate": 3.2600393123964113e-05, + "loss": 0.6382, + "step": 1394 + }, + { + "epoch": 0.744, + "grad_norm": 0.39735576466240907, + "learning_rate": 3.2472836821637744e-05, + "loss": 0.6172, + "step": 1395 + }, + { + "epoch": 0.7445333333333334, + "grad_norm": 0.3713753826248314, + "learning_rate": 3.234548216567049e-05, + "loss": 0.6646, + "step": 1396 + }, + { + "epoch": 0.7450666666666667, + "grad_norm": 0.44678575189964775, + "learning_rate": 3.2218329536362704e-05, + "loss": 0.7011, + "step": 1397 + }, + { + "epoch": 0.7456, + "grad_norm": 0.4602294032193928, + "learning_rate": 3.209137931341143e-05, + "loss": 0.753, + "step": 1398 + }, + { + "epoch": 0.7461333333333333, + "grad_norm": 0.46650053271103575, + "learning_rate": 3.196463187590929e-05, + "loss": 0.6214, + "step": 1399 + }, + { + "epoch": 0.7466666666666667, + "grad_norm": 0.40317733613726847, + "learning_rate": 3.1838087602343344e-05, + "loss": 0.5471, + "step": 1400 + }, + { + "epoch": 0.7472, + "grad_norm": 0.4230092995633492, + "learning_rate": 3.1711746870594086e-05, + "loss": 0.6656, + "step": 1401 + }, + { + "epoch": 0.7477333333333334, + "grad_norm": 0.40055090409544536, + "learning_rate": 3.158561005793402e-05, + "loss": 0.7061, + "step": 1402 + }, + { + "epoch": 0.7482666666666666, + "grad_norm": 0.4482564049775638, + "learning_rate": 3.145967754102691e-05, + "loss": 0.6839, + "step": 1403 + }, + { + "epoch": 0.7488, + "grad_norm": 0.43519562712294285, + "learning_rate": 3.1333949695926324e-05, + "loss": 0.6938, + "step": 1404 + }, + { + "epoch": 0.7493333333333333, + "grad_norm": 0.39940095757081484, + "learning_rate": 3.120842689807468e-05, + "loss": 0.6529, + "step": 1405 + }, + { + "epoch": 0.7498666666666667, + "grad_norm": 0.4244148756048259, + "learning_rate": 3.108310952230212e-05, + "loss": 0.691, + "step": 1406 + }, + { + "epoch": 0.7504, + "grad_norm": 0.44833681532028524, + "learning_rate": 3.0957997942825336e-05, + "loss": 0.6188, + "step": 1407 + }, + { + "epoch": 0.7509333333333333, + "grad_norm": 0.4470535185148768, + "learning_rate": 3.083309253324651e-05, + "loss": 0.6594, + "step": 1408 + }, + { + "epoch": 0.7514666666666666, + "grad_norm": 0.3796134492665104, + "learning_rate": 3.070839366655215e-05, + "loss": 0.6161, + "step": 1409 + }, + { + "epoch": 0.752, + "grad_norm": 0.4191151668570498, + "learning_rate": 3.058390171511196e-05, + "loss": 0.6627, + "step": 1410 + }, + { + "epoch": 0.7525333333333334, + "grad_norm": 0.3427507534658426, + "learning_rate": 3.0459617050677868e-05, + "loss": 0.5615, + "step": 1411 + }, + { + "epoch": 0.7530666666666667, + "grad_norm": 0.43880115078621856, + "learning_rate": 3.0335540044382694e-05, + "loss": 0.6187, + "step": 1412 + }, + { + "epoch": 0.7536, + "grad_norm": 0.4215481081683841, + "learning_rate": 3.021167106673928e-05, + "loss": 0.6477, + "step": 1413 + }, + { + "epoch": 0.7541333333333333, + "grad_norm": 0.4355019017233244, + "learning_rate": 3.008801048763914e-05, + "loss": 0.7073, + "step": 1414 + }, + { + "epoch": 0.7546666666666667, + "grad_norm": 0.4370484357019805, + "learning_rate": 2.996455867635155e-05, + "loss": 0.6765, + "step": 1415 + }, + { + "epoch": 0.7552, + "grad_norm": 0.43665035912216993, + "learning_rate": 2.9841316001522347e-05, + "loss": 0.6749, + "step": 1416 + }, + { + "epoch": 0.7557333333333334, + "grad_norm": 0.3958647254636817, + "learning_rate": 2.9718282831172883e-05, + "loss": 0.6386, + "step": 1417 + }, + { + "epoch": 0.7562666666666666, + "grad_norm": 0.41915594147590046, + "learning_rate": 2.9595459532698854e-05, + "loss": 0.614, + "step": 1418 + }, + { + "epoch": 0.7568, + "grad_norm": 0.42218803321434667, + "learning_rate": 2.9472846472869298e-05, + "loss": 0.6571, + "step": 1419 + }, + { + "epoch": 0.7573333333333333, + "grad_norm": 0.42335751599875865, + "learning_rate": 2.9350444017825385e-05, + "loss": 0.6804, + "step": 1420 + }, + { + "epoch": 0.7578666666666667, + "grad_norm": 0.4153881746916546, + "learning_rate": 2.922825253307947e-05, + "loss": 0.6362, + "step": 1421 + }, + { + "epoch": 0.7584, + "grad_norm": 0.38307782632840287, + "learning_rate": 2.9106272383513835e-05, + "loss": 0.6581, + "step": 1422 + }, + { + "epoch": 0.7589333333333333, + "grad_norm": 0.5095928880331001, + "learning_rate": 2.898450393337977e-05, + "loss": 0.6828, + "step": 1423 + }, + { + "epoch": 0.7594666666666666, + "grad_norm": 0.3875864408922726, + "learning_rate": 2.8862947546296315e-05, + "loss": 0.6478, + "step": 1424 + }, + { + "epoch": 0.76, + "grad_norm": 0.44345363702168855, + "learning_rate": 2.874160358524931e-05, + "loss": 0.6437, + "step": 1425 + }, + { + "epoch": 0.7605333333333333, + "grad_norm": 0.34961625306269245, + "learning_rate": 2.8620472412590228e-05, + "loss": 0.5871, + "step": 1426 + }, + { + "epoch": 0.7610666666666667, + "grad_norm": 0.42627691075353674, + "learning_rate": 2.8499554390035143e-05, + "loss": 0.7225, + "step": 1427 + }, + { + "epoch": 0.7616, + "grad_norm": 0.4429482812974094, + "learning_rate": 2.8378849878663628e-05, + "loss": 0.6531, + "step": 1428 + }, + { + "epoch": 0.7621333333333333, + "grad_norm": 0.3674344162611938, + "learning_rate": 2.8258359238917665e-05, + "loss": 0.6432, + "step": 1429 + }, + { + "epoch": 0.7626666666666667, + "grad_norm": 0.4037733321667493, + "learning_rate": 2.8138082830600554e-05, + "loss": 0.617, + "step": 1430 + }, + { + "epoch": 0.7632, + "grad_norm": 0.3982375894729625, + "learning_rate": 2.8018021012875994e-05, + "loss": 0.6214, + "step": 1431 + }, + { + "epoch": 0.7637333333333334, + "grad_norm": 0.41177143165150104, + "learning_rate": 2.7898174144266732e-05, + "loss": 0.6611, + "step": 1432 + }, + { + "epoch": 0.7642666666666666, + "grad_norm": 0.4277176910110703, + "learning_rate": 2.7778542582653744e-05, + "loss": 0.6856, + "step": 1433 + }, + { + "epoch": 0.7648, + "grad_norm": 0.6910272239324808, + "learning_rate": 2.7659126685275027e-05, + "loss": 0.701, + "step": 1434 + }, + { + "epoch": 0.7653333333333333, + "grad_norm": 0.4562352924729361, + "learning_rate": 2.753992680872457e-05, + "loss": 0.6448, + "step": 1435 + }, + { + "epoch": 0.7658666666666667, + "grad_norm": 0.39479950172562867, + "learning_rate": 2.7420943308951284e-05, + "loss": 0.6421, + "step": 1436 + }, + { + "epoch": 0.7664, + "grad_norm": 0.37918581631614484, + "learning_rate": 2.7302176541257986e-05, + "loss": 0.5989, + "step": 1437 + }, + { + "epoch": 0.7669333333333334, + "grad_norm": 0.4670561638981511, + "learning_rate": 2.7183626860300247e-05, + "loss": 0.64, + "step": 1438 + }, + { + "epoch": 0.7674666666666666, + "grad_norm": 0.4052900218399078, + "learning_rate": 2.7065294620085424e-05, + "loss": 0.6371, + "step": 1439 + }, + { + "epoch": 0.768, + "grad_norm": 0.3911527017578945, + "learning_rate": 2.6947180173971508e-05, + "loss": 0.6805, + "step": 1440 + }, + { + "epoch": 0.7685333333333333, + "grad_norm": 0.4202251589495763, + "learning_rate": 2.6829283874666233e-05, + "loss": 0.6421, + "step": 1441 + }, + { + "epoch": 0.7690666666666667, + "grad_norm": 0.3981793324863265, + "learning_rate": 2.6711606074225782e-05, + "loss": 0.6573, + "step": 1442 + }, + { + "epoch": 0.7696, + "grad_norm": 0.412000437682294, + "learning_rate": 2.659414712405398e-05, + "loss": 0.6247, + "step": 1443 + }, + { + "epoch": 0.7701333333333333, + "grad_norm": 0.42914580210807063, + "learning_rate": 2.647690737490106e-05, + "loss": 0.6525, + "step": 1444 + }, + { + "epoch": 0.7706666666666667, + "grad_norm": 0.43285055947384987, + "learning_rate": 2.6359887176862718e-05, + "loss": 0.7626, + "step": 1445 + }, + { + "epoch": 0.7712, + "grad_norm": 0.4413730077936835, + "learning_rate": 2.6243086879379e-05, + "loss": 0.7046, + "step": 1446 + }, + { + "epoch": 0.7717333333333334, + "grad_norm": 0.43148966043553777, + "learning_rate": 2.6126506831233344e-05, + "loss": 0.6982, + "step": 1447 + }, + { + "epoch": 0.7722666666666667, + "grad_norm": 0.4401444936026171, + "learning_rate": 2.6010147380551475e-05, + "loss": 0.6598, + "step": 1448 + }, + { + "epoch": 0.7728, + "grad_norm": 0.3874622319447385, + "learning_rate": 2.5894008874800325e-05, + "loss": 0.6194, + "step": 1449 + }, + { + "epoch": 0.7733333333333333, + "grad_norm": 0.41266089083924673, + "learning_rate": 2.577809166078716e-05, + "loss": 0.6142, + "step": 1450 + }, + { + "epoch": 0.7738666666666667, + "grad_norm": 0.3625437521697885, + "learning_rate": 2.566239608465838e-05, + "loss": 0.5642, + "step": 1451 + }, + { + "epoch": 0.7744, + "grad_norm": 0.4604695011057813, + "learning_rate": 2.5546922491898495e-05, + "loss": 0.6657, + "step": 1452 + }, + { + "epoch": 0.7749333333333334, + "grad_norm": 0.40972133019861046, + "learning_rate": 2.543167122732918e-05, + "loss": 0.6823, + "step": 1453 + }, + { + "epoch": 0.7754666666666666, + "grad_norm": 0.4084510280537911, + "learning_rate": 2.5316642635108244e-05, + "loss": 0.6042, + "step": 1454 + }, + { + "epoch": 0.776, + "grad_norm": 0.44398935321392907, + "learning_rate": 2.5201837058728505e-05, + "loss": 0.7326, + "step": 1455 + }, + { + "epoch": 0.7765333333333333, + "grad_norm": 0.4456134856463968, + "learning_rate": 2.508725484101684e-05, + "loss": 0.6628, + "step": 1456 + }, + { + "epoch": 0.7770666666666667, + "grad_norm": 0.43285530385725185, + "learning_rate": 2.4972896324133144e-05, + "loss": 0.6711, + "step": 1457 + }, + { + "epoch": 0.7776, + "grad_norm": 0.39350930186551425, + "learning_rate": 2.485876184956928e-05, + "loss": 0.6128, + "step": 1458 + }, + { + "epoch": 0.7781333333333333, + "grad_norm": 0.40527366985463825, + "learning_rate": 2.4744851758148156e-05, + "loss": 0.6704, + "step": 1459 + }, + { + "epoch": 0.7786666666666666, + "grad_norm": 0.35349897082768456, + "learning_rate": 2.4631166390022574e-05, + "loss": 0.6075, + "step": 1460 + }, + { + "epoch": 0.7792, + "grad_norm": 0.4106361016149464, + "learning_rate": 2.451770608467432e-05, + "loss": 0.6751, + "step": 1461 + }, + { + "epoch": 0.7797333333333333, + "grad_norm": 0.3891782128191905, + "learning_rate": 2.4404471180913058e-05, + "loss": 0.6061, + "step": 1462 + }, + { + "epoch": 0.7802666666666667, + "grad_norm": 0.3850547132027167, + "learning_rate": 2.429146201687538e-05, + "loss": 0.6236, + "step": 1463 + }, + { + "epoch": 0.7808, + "grad_norm": 0.44083105540317863, + "learning_rate": 2.417867893002387e-05, + "loss": 0.6371, + "step": 1464 + }, + { + "epoch": 0.7813333333333333, + "grad_norm": 0.38691339636046546, + "learning_rate": 2.4066122257145894e-05, + "loss": 0.6539, + "step": 1465 + }, + { + "epoch": 0.7818666666666667, + "grad_norm": 0.4420340889763627, + "learning_rate": 2.3953792334352787e-05, + "loss": 0.6601, + "step": 1466 + }, + { + "epoch": 0.7824, + "grad_norm": 0.3886789068327339, + "learning_rate": 2.3841689497078746e-05, + "loss": 0.6234, + "step": 1467 + }, + { + "epoch": 0.7829333333333334, + "grad_norm": 0.38766142759886435, + "learning_rate": 2.3729814080079816e-05, + "loss": 0.6049, + "step": 1468 + }, + { + "epoch": 0.7834666666666666, + "grad_norm": 0.3506735481023233, + "learning_rate": 2.361816641743303e-05, + "loss": 0.6461, + "step": 1469 + }, + { + "epoch": 0.784, + "grad_norm": 0.4709768903977795, + "learning_rate": 2.3506746842535242e-05, + "loss": 0.7488, + "step": 1470 + }, + { + "epoch": 0.7845333333333333, + "grad_norm": 0.46595859291323644, + "learning_rate": 2.339555568810221e-05, + "loss": 0.6949, + "step": 1471 + }, + { + "epoch": 0.7850666666666667, + "grad_norm": 0.37820360769280126, + "learning_rate": 2.328459328616759e-05, + "loss": 0.6135, + "step": 1472 + }, + { + "epoch": 0.7856, + "grad_norm": 0.4254358917377633, + "learning_rate": 2.3173859968081944e-05, + "loss": 0.6419, + "step": 1473 + }, + { + "epoch": 0.7861333333333334, + "grad_norm": 0.4207943846089345, + "learning_rate": 2.306335606451181e-05, + "loss": 0.6858, + "step": 1474 + }, + { + "epoch": 0.7866666666666666, + "grad_norm": 0.35872268946323377, + "learning_rate": 2.295308190543859e-05, + "loss": 0.5893, + "step": 1475 + }, + { + "epoch": 0.7872, + "grad_norm": 0.3913744045783546, + "learning_rate": 2.2843037820157675e-05, + "loss": 0.6081, + "step": 1476 + }, + { + "epoch": 0.7877333333333333, + "grad_norm": 0.40900627500047226, + "learning_rate": 2.2733224137277366e-05, + "loss": 0.6166, + "step": 1477 + }, + { + "epoch": 0.7882666666666667, + "grad_norm": 0.3684146909254557, + "learning_rate": 2.262364118471805e-05, + "loss": 0.5835, + "step": 1478 + }, + { + "epoch": 0.7888, + "grad_norm": 0.37875513247523523, + "learning_rate": 2.251428928971102e-05, + "loss": 0.6543, + "step": 1479 + }, + { + "epoch": 0.7893333333333333, + "grad_norm": 0.3912267090240937, + "learning_rate": 2.2405168778797646e-05, + "loss": 0.6753, + "step": 1480 + }, + { + "epoch": 0.7898666666666667, + "grad_norm": 0.35805656895046295, + "learning_rate": 2.2296279977828337e-05, + "loss": 0.6059, + "step": 1481 + }, + { + "epoch": 0.7904, + "grad_norm": 0.39932907661604244, + "learning_rate": 2.2187623211961562e-05, + "loss": 0.6439, + "step": 1482 + }, + { + "epoch": 0.7909333333333334, + "grad_norm": 0.4218337932594207, + "learning_rate": 2.2079198805662914e-05, + "loss": 0.6988, + "step": 1483 + }, + { + "epoch": 0.7914666666666667, + "grad_norm": 0.40449171604535267, + "learning_rate": 2.1971007082704164e-05, + "loss": 0.6424, + "step": 1484 + }, + { + "epoch": 0.792, + "grad_norm": 0.44683141125326076, + "learning_rate": 2.1863048366162208e-05, + "loss": 0.6404, + "step": 1485 + }, + { + "epoch": 0.7925333333333333, + "grad_norm": 0.3546349444314611, + "learning_rate": 2.1755322978418137e-05, + "loss": 0.56, + "step": 1486 + }, + { + "epoch": 0.7930666666666667, + "grad_norm": 0.41878688706271866, + "learning_rate": 2.1647831241156302e-05, + "loss": 0.6986, + "step": 1487 + }, + { + "epoch": 0.7936, + "grad_norm": 0.41241198092673154, + "learning_rate": 2.1540573475363402e-05, + "loss": 0.6535, + "step": 1488 + }, + { + "epoch": 0.7941333333333334, + "grad_norm": 0.4356041886485949, + "learning_rate": 2.1433550001327373e-05, + "loss": 0.6944, + "step": 1489 + }, + { + "epoch": 0.7946666666666666, + "grad_norm": 0.4851579940295009, + "learning_rate": 2.1326761138636553e-05, + "loss": 0.6417, + "step": 1490 + }, + { + "epoch": 0.7952, + "grad_norm": 0.4628739332323059, + "learning_rate": 2.1220207206178688e-05, + "loss": 0.7199, + "step": 1491 + }, + { + "epoch": 0.7957333333333333, + "grad_norm": 0.388260301176067, + "learning_rate": 2.111388852214001e-05, + "loss": 0.6158, + "step": 1492 + }, + { + "epoch": 0.7962666666666667, + "grad_norm": 0.3809160374646632, + "learning_rate": 2.1007805404004242e-05, + "loss": 0.6552, + "step": 1493 + }, + { + "epoch": 0.7968, + "grad_norm": 0.38767748502294097, + "learning_rate": 2.0901958168551638e-05, + "loss": 0.6526, + "step": 1494 + }, + { + "epoch": 0.7973333333333333, + "grad_norm": 0.3828024778783532, + "learning_rate": 2.0796347131858186e-05, + "loss": 0.6361, + "step": 1495 + }, + { + "epoch": 0.7978666666666666, + "grad_norm": 0.39807422851630386, + "learning_rate": 2.069097260929439e-05, + "loss": 0.6369, + "step": 1496 + }, + { + "epoch": 0.7984, + "grad_norm": 0.4059228476243633, + "learning_rate": 2.058583491552465e-05, + "loss": 0.6717, + "step": 1497 + }, + { + "epoch": 0.7989333333333334, + "grad_norm": 0.38967800340072495, + "learning_rate": 2.048093436450603e-05, + "loss": 0.6315, + "step": 1498 + }, + { + "epoch": 0.7994666666666667, + "grad_norm": 0.35357562122949704, + "learning_rate": 2.0376271269487514e-05, + "loss": 0.5684, + "step": 1499 + }, + { + "epoch": 0.8, + "grad_norm": 0.38258943119478694, + "learning_rate": 2.027184594300898e-05, + "loss": 0.6516, + "step": 1500 + }, + { + "epoch": 0.8005333333333333, + "grad_norm": 0.4419137450965979, + "learning_rate": 2.0167658696900317e-05, + "loss": 0.6848, + "step": 1501 + }, + { + "epoch": 0.8010666666666667, + "grad_norm": 0.43770767176075226, + "learning_rate": 2.0063709842280432e-05, + "loss": 0.6657, + "step": 1502 + }, + { + "epoch": 0.8016, + "grad_norm": 0.4488926601508519, + "learning_rate": 1.995999968955641e-05, + "loss": 0.75, + "step": 1503 + }, + { + "epoch": 0.8021333333333334, + "grad_norm": 0.3944234714230744, + "learning_rate": 1.985652854842247e-05, + "loss": 0.6434, + "step": 1504 + }, + { + "epoch": 0.8026666666666666, + "grad_norm": 0.37644829072906166, + "learning_rate": 1.9753296727859195e-05, + "loss": 0.6453, + "step": 1505 + }, + { + "epoch": 0.8032, + "grad_norm": 0.4044516702226617, + "learning_rate": 1.9650304536132426e-05, + "loss": 0.6403, + "step": 1506 + }, + { + "epoch": 0.8037333333333333, + "grad_norm": 0.3980673569707095, + "learning_rate": 1.9547552280792524e-05, + "loss": 0.6476, + "step": 1507 + }, + { + "epoch": 0.8042666666666667, + "grad_norm": 0.5475183862366019, + "learning_rate": 1.9445040268673298e-05, + "loss": 0.707, + "step": 1508 + }, + { + "epoch": 0.8048, + "grad_norm": 0.3843448951572416, + "learning_rate": 1.9342768805891178e-05, + "loss": 0.637, + "step": 1509 + }, + { + "epoch": 0.8053333333333333, + "grad_norm": 0.40636938543825235, + "learning_rate": 1.9240738197844278e-05, + "loss": 0.6367, + "step": 1510 + }, + { + "epoch": 0.8058666666666666, + "grad_norm": 0.39095803152116365, + "learning_rate": 1.9138948749211472e-05, + "loss": 0.6309, + "step": 1511 + }, + { + "epoch": 0.8064, + "grad_norm": 0.36194859493401127, + "learning_rate": 1.903740076395151e-05, + "loss": 0.5899, + "step": 1512 + }, + { + "epoch": 0.8069333333333333, + "grad_norm": 0.48238799708196023, + "learning_rate": 1.8936094545302095e-05, + "loss": 0.697, + "step": 1513 + }, + { + "epoch": 0.8074666666666667, + "grad_norm": 0.42831654623021886, + "learning_rate": 1.883503039577894e-05, + "loss": 0.6778, + "step": 1514 + }, + { + "epoch": 0.808, + "grad_norm": 0.3290612016576017, + "learning_rate": 1.8734208617174988e-05, + "loss": 0.5184, + "step": 1515 + }, + { + "epoch": 0.8085333333333333, + "grad_norm": 0.43306775384722507, + "learning_rate": 1.8633629510559314e-05, + "loss": 0.6517, + "step": 1516 + }, + { + "epoch": 0.8090666666666667, + "grad_norm": 0.37939300241931767, + "learning_rate": 1.8533293376276472e-05, + "loss": 0.6448, + "step": 1517 + }, + { + "epoch": 0.8096, + "grad_norm": 0.4174859091757688, + "learning_rate": 1.8433200513945337e-05, + "loss": 0.7248, + "step": 1518 + }, + { + "epoch": 0.8101333333333334, + "grad_norm": 0.42897070111080293, + "learning_rate": 1.8333351222458407e-05, + "loss": 0.6792, + "step": 1519 + }, + { + "epoch": 0.8106666666666666, + "grad_norm": 0.44468491812494343, + "learning_rate": 1.8233745799980817e-05, + "loss": 0.7148, + "step": 1520 + }, + { + "epoch": 0.8112, + "grad_norm": 0.5270997886680009, + "learning_rate": 1.8134384543949478e-05, + "loss": 0.7384, + "step": 1521 + }, + { + "epoch": 0.8117333333333333, + "grad_norm": 0.4003108742784879, + "learning_rate": 1.803526775107217e-05, + "loss": 0.6593, + "step": 1522 + }, + { + "epoch": 0.8122666666666667, + "grad_norm": 0.39089226418237993, + "learning_rate": 1.7936395717326704e-05, + "loss": 0.6374, + "step": 1523 + }, + { + "epoch": 0.8128, + "grad_norm": 0.35194465381329293, + "learning_rate": 1.783776873795994e-05, + "loss": 0.5823, + "step": 1524 + }, + { + "epoch": 0.8133333333333334, + "grad_norm": 0.34472080784247255, + "learning_rate": 1.773938710748706e-05, + "loss": 0.5776, + "step": 1525 + }, + { + "epoch": 0.8138666666666666, + "grad_norm": 0.338792768683901, + "learning_rate": 1.7641251119690505e-05, + "loss": 0.5921, + "step": 1526 + }, + { + "epoch": 0.8144, + "grad_norm": 0.376611047079288, + "learning_rate": 1.754336106761927e-05, + "loss": 0.5993, + "step": 1527 + }, + { + "epoch": 0.8149333333333333, + "grad_norm": 0.37387556233721764, + "learning_rate": 1.744571724358789e-05, + "loss": 0.6193, + "step": 1528 + }, + { + "epoch": 0.8154666666666667, + "grad_norm": 0.4021204250187721, + "learning_rate": 1.7348319939175637e-05, + "loss": 0.6376, + "step": 1529 + }, + { + "epoch": 0.816, + "grad_norm": 0.4083914805445237, + "learning_rate": 1.7251169445225657e-05, + "loss": 0.6396, + "step": 1530 + }, + { + "epoch": 0.8165333333333333, + "grad_norm": 0.3962488022093343, + "learning_rate": 1.715426605184407e-05, + "loss": 0.6472, + "step": 1531 + }, + { + "epoch": 0.8170666666666667, + "grad_norm": 0.4124834308999355, + "learning_rate": 1.705761004839911e-05, + "loss": 0.6465, + "step": 1532 + }, + { + "epoch": 0.8176, + "grad_norm": 0.35479589603691897, + "learning_rate": 1.696120172352025e-05, + "loss": 0.6233, + "step": 1533 + }, + { + "epoch": 0.8181333333333334, + "grad_norm": 0.4080382065749448, + "learning_rate": 1.6865041365097435e-05, + "loss": 0.6461, + "step": 1534 + }, + { + "epoch": 0.8186666666666667, + "grad_norm": 0.4493104764695561, + "learning_rate": 1.676912926028007e-05, + "loss": 0.6507, + "step": 1535 + }, + { + "epoch": 0.8192, + "grad_norm": 0.39692865212200285, + "learning_rate": 1.6673465695476232e-05, + "loss": 0.6339, + "step": 1536 + }, + { + "epoch": 0.8197333333333333, + "grad_norm": 0.4061488300472044, + "learning_rate": 1.6578050956351886e-05, + "loss": 0.682, + "step": 1537 + }, + { + "epoch": 0.8202666666666667, + "grad_norm": 0.39685722765982656, + "learning_rate": 1.6482885327829913e-05, + "loss": 0.621, + "step": 1538 + }, + { + "epoch": 0.8208, + "grad_norm": 0.4150782486054149, + "learning_rate": 1.6387969094089316e-05, + "loss": 0.6831, + "step": 1539 + }, + { + "epoch": 0.8213333333333334, + "grad_norm": 0.3752175206795003, + "learning_rate": 1.6293302538564382e-05, + "loss": 0.5972, + "step": 1540 + }, + { + "epoch": 0.8218666666666666, + "grad_norm": 0.42912100246172646, + "learning_rate": 1.619888594394382e-05, + "loss": 0.6303, + "step": 1541 + }, + { + "epoch": 0.8224, + "grad_norm": 0.411690212726029, + "learning_rate": 1.6104719592169902e-05, + "loss": 0.6656, + "step": 1542 + }, + { + "epoch": 0.8229333333333333, + "grad_norm": 0.4642829889474915, + "learning_rate": 1.601080376443763e-05, + "loss": 0.6909, + "step": 1543 + }, + { + "epoch": 0.8234666666666667, + "grad_norm": 0.4709376800450816, + "learning_rate": 1.5917138741193973e-05, + "loss": 0.6764, + "step": 1544 + }, + { + "epoch": 0.824, + "grad_norm": 0.38226574602583446, + "learning_rate": 1.5823724802136865e-05, + "loss": 0.6403, + "step": 1545 + }, + { + "epoch": 0.8245333333333333, + "grad_norm": 0.44060871721643696, + "learning_rate": 1.573056222621453e-05, + "loss": 0.718, + "step": 1546 + }, + { + "epoch": 0.8250666666666666, + "grad_norm": 0.37380761264975537, + "learning_rate": 1.5637651291624523e-05, + "loss": 0.6535, + "step": 1547 + }, + { + "epoch": 0.8256, + "grad_norm": 0.3917189288001096, + "learning_rate": 1.5544992275813053e-05, + "loss": 0.634, + "step": 1548 + }, + { + "epoch": 0.8261333333333334, + "grad_norm": 0.43458844590345663, + "learning_rate": 1.5452585455473977e-05, + "loss": 0.6715, + "step": 1549 + }, + { + "epoch": 0.8266666666666667, + "grad_norm": 0.37911668947364086, + "learning_rate": 1.536043110654809e-05, + "loss": 0.6382, + "step": 1550 + }, + { + "epoch": 0.8272, + "grad_norm": 0.3793806945062769, + "learning_rate": 1.526852950422226e-05, + "loss": 0.6328, + "step": 1551 + }, + { + "epoch": 0.8277333333333333, + "grad_norm": 0.43584474096730086, + "learning_rate": 1.5176880922928616e-05, + "loss": 0.6113, + "step": 1552 + }, + { + "epoch": 0.8282666666666667, + "grad_norm": 0.4109297686822174, + "learning_rate": 1.5085485636343755e-05, + "loss": 0.6478, + "step": 1553 + }, + { + "epoch": 0.8288, + "grad_norm": 0.4063417624152271, + "learning_rate": 1.4994343917387854e-05, + "loss": 0.7134, + "step": 1554 + }, + { + "epoch": 0.8293333333333334, + "grad_norm": 0.3709854276766787, + "learning_rate": 1.4903456038223939e-05, + "loss": 0.6751, + "step": 1555 + }, + { + "epoch": 0.8298666666666666, + "grad_norm": 0.3999953238741327, + "learning_rate": 1.4812822270257009e-05, + "loss": 0.6367, + "step": 1556 + }, + { + "epoch": 0.8304, + "grad_norm": 0.34047078430130645, + "learning_rate": 1.4722442884133214e-05, + "loss": 0.593, + "step": 1557 + }, + { + "epoch": 0.8309333333333333, + "grad_norm": 0.37818008434984063, + "learning_rate": 1.4632318149739177e-05, + "loss": 0.6541, + "step": 1558 + }, + { + "epoch": 0.8314666666666667, + "grad_norm": 0.3883142550297361, + "learning_rate": 1.454244833620102e-05, + "loss": 0.5832, + "step": 1559 + }, + { + "epoch": 0.832, + "grad_norm": 0.3607911014474159, + "learning_rate": 1.4452833711883628e-05, + "loss": 0.6276, + "step": 1560 + }, + { + "epoch": 0.8325333333333333, + "grad_norm": 0.3862760347704396, + "learning_rate": 1.4363474544389877e-05, + "loss": 0.6119, + "step": 1561 + }, + { + "epoch": 0.8330666666666666, + "grad_norm": 0.36933347067867883, + "learning_rate": 1.4274371100559791e-05, + "loss": 0.5914, + "step": 1562 + }, + { + "epoch": 0.8336, + "grad_norm": 0.36033685441672925, + "learning_rate": 1.4185523646469822e-05, + "loss": 0.6303, + "step": 1563 + }, + { + "epoch": 0.8341333333333333, + "grad_norm": 0.43306524792713263, + "learning_rate": 1.409693244743192e-05, + "loss": 0.6972, + "step": 1564 + }, + { + "epoch": 0.8346666666666667, + "grad_norm": 0.3996021976393818, + "learning_rate": 1.4008597767992871e-05, + "loss": 0.629, + "step": 1565 + }, + { + "epoch": 0.8352, + "grad_norm": 0.3963386564272573, + "learning_rate": 1.3920519871933424e-05, + "loss": 0.6189, + "step": 1566 + }, + { + "epoch": 0.8357333333333333, + "grad_norm": 0.36919022205639285, + "learning_rate": 1.3832699022267515e-05, + "loss": 0.6503, + "step": 1567 + }, + { + "epoch": 0.8362666666666667, + "grad_norm": 0.34735951895870093, + "learning_rate": 1.37451354812416e-05, + "loss": 0.5938, + "step": 1568 + }, + { + "epoch": 0.8368, + "grad_norm": 0.4077481894373184, + "learning_rate": 1.3657829510333654e-05, + "loss": 0.6124, + "step": 1569 + }, + { + "epoch": 0.8373333333333334, + "grad_norm": 0.36925086630683907, + "learning_rate": 1.3570781370252582e-05, + "loss": 0.6419, + "step": 1570 + }, + { + "epoch": 0.8378666666666666, + "grad_norm": 0.41118593099557654, + "learning_rate": 1.3483991320937306e-05, + "loss": 0.6309, + "step": 1571 + }, + { + "epoch": 0.8384, + "grad_norm": 0.3822208135731575, + "learning_rate": 1.339745962155613e-05, + "loss": 0.6795, + "step": 1572 + }, + { + "epoch": 0.8389333333333333, + "grad_norm": 0.4215040066887796, + "learning_rate": 1.3311186530505838e-05, + "loss": 0.6591, + "step": 1573 + }, + { + "epoch": 0.8394666666666667, + "grad_norm": 0.38502469192610717, + "learning_rate": 1.322517230541096e-05, + "loss": 0.6466, + "step": 1574 + }, + { + "epoch": 0.84, + "grad_norm": 0.3805668773253161, + "learning_rate": 1.3139417203123027e-05, + "loss": 0.6322, + "step": 1575 + }, + { + "epoch": 0.8405333333333334, + "grad_norm": 0.43267277488087297, + "learning_rate": 1.30539214797198e-05, + "loss": 0.6539, + "step": 1576 + }, + { + "epoch": 0.8410666666666666, + "grad_norm": 0.3841521427387812, + "learning_rate": 1.2968685390504465e-05, + "loss": 0.6485, + "step": 1577 + }, + { + "epoch": 0.8416, + "grad_norm": 0.38780750587337015, + "learning_rate": 1.2883709190004955e-05, + "loss": 0.6028, + "step": 1578 + }, + { + "epoch": 0.8421333333333333, + "grad_norm": 0.48541801549351904, + "learning_rate": 1.2798993131973091e-05, + "loss": 0.6351, + "step": 1579 + }, + { + "epoch": 0.8426666666666667, + "grad_norm": 0.42295674367595737, + "learning_rate": 1.2714537469383858e-05, + "loss": 0.6728, + "step": 1580 + }, + { + "epoch": 0.8432, + "grad_norm": 0.3900700328891032, + "learning_rate": 1.263034245443473e-05, + "loss": 0.651, + "step": 1581 + }, + { + "epoch": 0.8437333333333333, + "grad_norm": 0.326400083531713, + "learning_rate": 1.2546408338544769e-05, + "loss": 0.5693, + "step": 1582 + }, + { + "epoch": 0.8442666666666667, + "grad_norm": 0.38358760875407694, + "learning_rate": 1.2462735372353996e-05, + "loss": 0.6231, + "step": 1583 + }, + { + "epoch": 0.8448, + "grad_norm": 0.3826742526760289, + "learning_rate": 1.2379323805722576e-05, + "loss": 0.6546, + "step": 1584 + }, + { + "epoch": 0.8453333333333334, + "grad_norm": 0.4025249079908315, + "learning_rate": 1.2296173887730123e-05, + "loss": 0.6202, + "step": 1585 + }, + { + "epoch": 0.8458666666666667, + "grad_norm": 0.3541133912664888, + "learning_rate": 1.2213285866674905e-05, + "loss": 0.6388, + "step": 1586 + }, + { + "epoch": 0.8464, + "grad_norm": 0.3815883631911988, + "learning_rate": 1.2130659990073146e-05, + "loss": 0.6464, + "step": 1587 + }, + { + "epoch": 0.8469333333333333, + "grad_norm": 0.4534226299418841, + "learning_rate": 1.2048296504658207e-05, + "loss": 0.7443, + "step": 1588 + }, + { + "epoch": 0.8474666666666667, + "grad_norm": 0.4119821518537961, + "learning_rate": 1.1966195656380031e-05, + "loss": 0.6913, + "step": 1589 + }, + { + "epoch": 0.848, + "grad_norm": 0.3924157628562887, + "learning_rate": 1.1884357690404158e-05, + "loss": 0.6496, + "step": 1590 + }, + { + "epoch": 0.8485333333333334, + "grad_norm": 0.36369766672205334, + "learning_rate": 1.1802782851111205e-05, + "loss": 0.6482, + "step": 1591 + }, + { + "epoch": 0.8490666666666666, + "grad_norm": 0.3787176470885485, + "learning_rate": 1.1721471382096027e-05, + "loss": 0.6009, + "step": 1592 + }, + { + "epoch": 0.8496, + "grad_norm": 0.4435549594830049, + "learning_rate": 1.1640423526166988e-05, + "loss": 0.6869, + "step": 1593 + }, + { + "epoch": 0.8501333333333333, + "grad_norm": 0.4346278720040936, + "learning_rate": 1.1559639525345311e-05, + "loss": 0.6602, + "step": 1594 + }, + { + "epoch": 0.8506666666666667, + "grad_norm": 0.4482220380875183, + "learning_rate": 1.1479119620864276e-05, + "loss": 0.6344, + "step": 1595 + }, + { + "epoch": 0.8512, + "grad_norm": 0.48419033078342644, + "learning_rate": 1.1398864053168534e-05, + "loss": 0.7642, + "step": 1596 + }, + { + "epoch": 0.8517333333333333, + "grad_norm": 0.36179924130776503, + "learning_rate": 1.1318873061913405e-05, + "loss": 0.5854, + "step": 1597 + }, + { + "epoch": 0.8522666666666666, + "grad_norm": 0.39538725881852815, + "learning_rate": 1.123914688596409e-05, + "loss": 0.6469, + "step": 1598 + }, + { + "epoch": 0.8528, + "grad_norm": 0.4208253894034653, + "learning_rate": 1.1159685763395111e-05, + "loss": 0.6222, + "step": 1599 + }, + { + "epoch": 0.8533333333333334, + "grad_norm": 0.3659376261736916, + "learning_rate": 1.1080489931489391e-05, + "loss": 0.5881, + "step": 1600 + }, + { + "epoch": 0.8538666666666667, + "grad_norm": 0.418286974140492, + "learning_rate": 1.1001559626737756e-05, + "loss": 0.6523, + "step": 1601 + }, + { + "epoch": 0.8544, + "grad_norm": 0.4016537477945244, + "learning_rate": 1.0922895084838037e-05, + "loss": 0.5902, + "step": 1602 + }, + { + "epoch": 0.8549333333333333, + "grad_norm": 0.34525462758911396, + "learning_rate": 1.0844496540694515e-05, + "loss": 0.5599, + "step": 1603 + }, + { + "epoch": 0.8554666666666667, + "grad_norm": 0.42721498656573004, + "learning_rate": 1.0766364228417148e-05, + "loss": 0.666, + "step": 1604 + }, + { + "epoch": 0.856, + "grad_norm": 0.3802210622399738, + "learning_rate": 1.0688498381320855e-05, + "loss": 0.6078, + "step": 1605 + }, + { + "epoch": 0.8565333333333334, + "grad_norm": 0.3598703676282435, + "learning_rate": 1.0610899231924886e-05, + "loss": 0.6005, + "step": 1606 + }, + { + "epoch": 0.8570666666666666, + "grad_norm": 0.3994187417531796, + "learning_rate": 1.0533567011952094e-05, + "loss": 0.6003, + "step": 1607 + }, + { + "epoch": 0.8576, + "grad_norm": 0.3684169799709176, + "learning_rate": 1.045650195232819e-05, + "loss": 0.612, + "step": 1608 + }, + { + "epoch": 0.8581333333333333, + "grad_norm": 0.39500917760935933, + "learning_rate": 1.0379704283181179e-05, + "loss": 0.6579, + "step": 1609 + }, + { + "epoch": 0.8586666666666667, + "grad_norm": 0.41204006075109184, + "learning_rate": 1.0303174233840528e-05, + "loss": 0.6577, + "step": 1610 + }, + { + "epoch": 0.8592, + "grad_norm": 0.3798700478362599, + "learning_rate": 1.0226912032836611e-05, + "loss": 0.6043, + "step": 1611 + }, + { + "epoch": 0.8597333333333333, + "grad_norm": 0.38671422024474944, + "learning_rate": 1.0150917907899926e-05, + "loss": 0.6305, + "step": 1612 + }, + { + "epoch": 0.8602666666666666, + "grad_norm": 0.44027807270041436, + "learning_rate": 1.007519208596045e-05, + "loss": 0.7012, + "step": 1613 + }, + { + "epoch": 0.8608, + "grad_norm": 0.4621190002468634, + "learning_rate": 9.999734793146998e-06, + "loss": 0.6052, + "step": 1614 + }, + { + "epoch": 0.8613333333333333, + "grad_norm": 0.3891856313705041, + "learning_rate": 9.924546254786493e-06, + "loss": 0.6663, + "step": 1615 + }, + { + "epoch": 0.8618666666666667, + "grad_norm": 0.3523057343628837, + "learning_rate": 9.849626695403324e-06, + "loss": 0.5846, + "step": 1616 + }, + { + "epoch": 0.8624, + "grad_norm": 0.41956867817442073, + "learning_rate": 9.774976338718677e-06, + "loss": 0.6058, + "step": 1617 + }, + { + "epoch": 0.8629333333333333, + "grad_norm": 0.5114192626033517, + "learning_rate": 9.700595407649805e-06, + "loss": 0.7443, + "step": 1618 + }, + { + "epoch": 0.8634666666666667, + "grad_norm": 0.43309849127406347, + "learning_rate": 9.62648412430951e-06, + "loss": 0.7187, + "step": 1619 + }, + { + "epoch": 0.864, + "grad_norm": 0.41631454039150007, + "learning_rate": 9.552642710005299e-06, + "loss": 0.6188, + "step": 1620 + }, + { + "epoch": 0.8645333333333334, + "grad_norm": 0.47788207709264857, + "learning_rate": 9.479071385238892e-06, + "loss": 0.7718, + "step": 1621 + }, + { + "epoch": 0.8650666666666667, + "grad_norm": 0.3922934070751977, + "learning_rate": 9.40577036970538e-06, + "loss": 0.5966, + "step": 1622 + }, + { + "epoch": 0.8656, + "grad_norm": 0.408553915368954, + "learning_rate": 9.332739882292752e-06, + "loss": 0.6195, + "step": 1623 + }, + { + "epoch": 0.8661333333333333, + "grad_norm": 0.4285867334060612, + "learning_rate": 9.259980141081115e-06, + "loss": 0.665, + "step": 1624 + }, + { + "epoch": 0.8666666666666667, + "grad_norm": 0.3859374780012439, + "learning_rate": 9.187491363342093e-06, + "loss": 0.6101, + "step": 1625 + }, + { + "epoch": 0.8672, + "grad_norm": 0.3692533490464752, + "learning_rate": 9.115273765538202e-06, + "loss": 0.6655, + "step": 1626 + }, + { + "epoch": 0.8677333333333334, + "grad_norm": 0.5004821979189422, + "learning_rate": 9.043327563322112e-06, + "loss": 0.6644, + "step": 1627 + }, + { + "epoch": 0.8682666666666666, + "grad_norm": 0.3678805736593292, + "learning_rate": 8.971652971536148e-06, + "loss": 0.5579, + "step": 1628 + }, + { + "epoch": 0.8688, + "grad_norm": 0.4232198324354022, + "learning_rate": 8.900250204211514e-06, + "loss": 0.6853, + "step": 1629 + }, + { + "epoch": 0.8693333333333333, + "grad_norm": 0.3999592803772727, + "learning_rate": 8.829119474567671e-06, + "loss": 0.6828, + "step": 1630 + }, + { + "epoch": 0.8698666666666667, + "grad_norm": 0.393928049506523, + "learning_rate": 8.758260995011825e-06, + "loss": 0.6094, + "step": 1631 + }, + { + "epoch": 0.8704, + "grad_norm": 0.4534043147000788, + "learning_rate": 8.687674977138116e-06, + "loss": 0.6925, + "step": 1632 + }, + { + "epoch": 0.8709333333333333, + "grad_norm": 0.4238057057903342, + "learning_rate": 8.617361631727138e-06, + "loss": 0.6005, + "step": 1633 + }, + { + "epoch": 0.8714666666666666, + "grad_norm": 0.6057842523033262, + "learning_rate": 8.547321168745193e-06, + "loss": 0.8187, + "step": 1634 + }, + { + "epoch": 0.872, + "grad_norm": 0.35961587811473195, + "learning_rate": 8.47755379734373e-06, + "loss": 0.6429, + "step": 1635 + }, + { + "epoch": 0.8725333333333334, + "grad_norm": 0.40056045994994294, + "learning_rate": 8.408059725858719e-06, + "loss": 0.6963, + "step": 1636 + }, + { + "epoch": 0.8730666666666667, + "grad_norm": 0.4534110862461603, + "learning_rate": 8.338839161809997e-06, + "loss": 0.6415, + "step": 1637 + }, + { + "epoch": 0.8736, + "grad_norm": 0.3829854967675032, + "learning_rate": 8.269892311900696e-06, + "loss": 0.5945, + "step": 1638 + }, + { + "epoch": 0.8741333333333333, + "grad_norm": 0.39346706379921986, + "learning_rate": 8.201219382016556e-06, + "loss": 0.623, + "step": 1639 + }, + { + "epoch": 0.8746666666666667, + "grad_norm": 0.3275204922619166, + "learning_rate": 8.132820577225387e-06, + "loss": 0.6086, + "step": 1640 + }, + { + "epoch": 0.8752, + "grad_norm": 0.3641486535465027, + "learning_rate": 8.064696101776358e-06, + "loss": 0.6114, + "step": 1641 + }, + { + "epoch": 0.8757333333333334, + "grad_norm": 0.42017073449067316, + "learning_rate": 7.996846159099557e-06, + "loss": 0.6276, + "step": 1642 + }, + { + "epoch": 0.8762666666666666, + "grad_norm": 0.38574038783865483, + "learning_rate": 7.929270951805178e-06, + "loss": 0.6052, + "step": 1643 + }, + { + "epoch": 0.8768, + "grad_norm": 0.384585616150103, + "learning_rate": 7.861970681683051e-06, + "loss": 0.5766, + "step": 1644 + }, + { + "epoch": 0.8773333333333333, + "grad_norm": 0.40568424059538666, + "learning_rate": 7.794945549701993e-06, + "loss": 0.6343, + "step": 1645 + }, + { + "epoch": 0.8778666666666667, + "grad_norm": 0.4034784043026029, + "learning_rate": 7.728195756009204e-06, + "loss": 0.6497, + "step": 1646 + }, + { + "epoch": 0.8784, + "grad_norm": 0.45139872187066754, + "learning_rate": 7.661721499929753e-06, + "loss": 0.6593, + "step": 1647 + }, + { + "epoch": 0.8789333333333333, + "grad_norm": 0.5217615224525861, + "learning_rate": 7.595522979965819e-06, + "loss": 0.8341, + "step": 1648 + }, + { + "epoch": 0.8794666666666666, + "grad_norm": 0.39419060670680345, + "learning_rate": 7.529600393796232e-06, + "loss": 0.6293, + "step": 1649 + }, + { + "epoch": 0.88, + "grad_norm": 0.33519106053638126, + "learning_rate": 7.463953938275858e-06, + "loss": 0.5672, + "step": 1650 + }, + { + "epoch": 0.8805333333333333, + "grad_norm": 0.4420634099553051, + "learning_rate": 7.3985838094349444e-06, + "loss": 0.6521, + "step": 1651 + }, + { + "epoch": 0.8810666666666667, + "grad_norm": 0.45971888432388164, + "learning_rate": 7.333490202478666e-06, + "loss": 0.6387, + "step": 1652 + }, + { + "epoch": 0.8816, + "grad_norm": 0.40838075404353097, + "learning_rate": 7.2686733117863784e-06, + "loss": 0.6919, + "step": 1653 + }, + { + "epoch": 0.8821333333333333, + "grad_norm": 0.4379494947546668, + "learning_rate": 7.204133330911178e-06, + "loss": 0.6343, + "step": 1654 + }, + { + "epoch": 0.8826666666666667, + "grad_norm": 0.45690337243309503, + "learning_rate": 7.1398704525792e-06, + "loss": 0.6831, + "step": 1655 + }, + { + "epoch": 0.8832, + "grad_norm": 0.4453452943975889, + "learning_rate": 7.07588486868922e-06, + "loss": 0.6643, + "step": 1656 + }, + { + "epoch": 0.8837333333333334, + "grad_norm": 0.34625404756407663, + "learning_rate": 7.012176770311862e-06, + "loss": 0.5578, + "step": 1657 + }, + { + "epoch": 0.8842666666666666, + "grad_norm": 0.5162864274893599, + "learning_rate": 6.948746347689183e-06, + "loss": 0.63, + "step": 1658 + }, + { + "epoch": 0.8848, + "grad_norm": 0.3876031911363762, + "learning_rate": 6.8855937902340576e-06, + "loss": 0.7014, + "step": 1659 + }, + { + "epoch": 0.8853333333333333, + "grad_norm": 0.38583678426931806, + "learning_rate": 6.8227192865295995e-06, + "loss": 0.6304, + "step": 1660 + }, + { + "epoch": 0.8858666666666667, + "grad_norm": 0.44534207598066183, + "learning_rate": 6.760123024328624e-06, + "loss": 0.6884, + "step": 1661 + }, + { + "epoch": 0.8864, + "grad_norm": 0.36993466736604363, + "learning_rate": 6.6978051905530855e-06, + "loss": 0.5922, + "step": 1662 + }, + { + "epoch": 0.8869333333333334, + "grad_norm": 0.37483799699892645, + "learning_rate": 6.635765971293484e-06, + "loss": 0.603, + "step": 1663 + }, + { + "epoch": 0.8874666666666666, + "grad_norm": 0.35827519311964573, + "learning_rate": 6.5740055518083375e-06, + "loss": 0.5881, + "step": 1664 + }, + { + "epoch": 0.888, + "grad_norm": 0.3550182360212827, + "learning_rate": 6.512524116523633e-06, + "loss": 0.6339, + "step": 1665 + }, + { + "epoch": 0.8885333333333333, + "grad_norm": 0.36409601536960257, + "learning_rate": 6.451321849032288e-06, + "loss": 0.5744, + "step": 1666 + }, + { + "epoch": 0.8890666666666667, + "grad_norm": 0.3762401812448924, + "learning_rate": 6.390398932093555e-06, + "loss": 0.6236, + "step": 1667 + }, + { + "epoch": 0.8896, + "grad_norm": 0.424113129594775, + "learning_rate": 6.329755547632499e-06, + "loss": 0.6362, + "step": 1668 + }, + { + "epoch": 0.8901333333333333, + "grad_norm": 0.41865579496554856, + "learning_rate": 6.269391876739495e-06, + "loss": 0.6299, + "step": 1669 + }, + { + "epoch": 0.8906666666666667, + "grad_norm": 0.390327087115672, + "learning_rate": 6.209308099669597e-06, + "loss": 0.59, + "step": 1670 + }, + { + "epoch": 0.8912, + "grad_norm": 0.3943126666647335, + "learning_rate": 6.149504395842087e-06, + "loss": 0.6667, + "step": 1671 + }, + { + "epoch": 0.8917333333333334, + "grad_norm": 0.39696441375321984, + "learning_rate": 6.089980943839924e-06, + "loss": 0.6875, + "step": 1672 + }, + { + "epoch": 0.8922666666666667, + "grad_norm": 0.4066980964646866, + "learning_rate": 6.030737921409169e-06, + "loss": 0.6633, + "step": 1673 + }, + { + "epoch": 0.8928, + "grad_norm": 0.3730617767279351, + "learning_rate": 5.971775505458444e-06, + "loss": 0.6188, + "step": 1674 + }, + { + "epoch": 0.8933333333333333, + "grad_norm": 0.3786613638622962, + "learning_rate": 5.913093872058528e-06, + "loss": 0.5944, + "step": 1675 + }, + { + "epoch": 0.8938666666666667, + "grad_norm": 0.3920315764100113, + "learning_rate": 5.854693196441641e-06, + "loss": 0.5792, + "step": 1676 + }, + { + "epoch": 0.8944, + "grad_norm": 0.3951793563281852, + "learning_rate": 5.7965736530010916e-06, + "loss": 0.6323, + "step": 1677 + }, + { + "epoch": 0.8949333333333334, + "grad_norm": 0.40738210139455694, + "learning_rate": 5.738735415290642e-06, + "loss": 0.5666, + "step": 1678 + }, + { + "epoch": 0.8954666666666666, + "grad_norm": 0.4327654679559294, + "learning_rate": 5.681178656024055e-06, + "loss": 0.6903, + "step": 1679 + }, + { + "epoch": 0.896, + "grad_norm": 0.3866218525119125, + "learning_rate": 5.623903547074549e-06, + "loss": 0.6275, + "step": 1680 + }, + { + "epoch": 0.8965333333333333, + "grad_norm": 0.41676921894489855, + "learning_rate": 5.566910259474289e-06, + "loss": 0.6185, + "step": 1681 + }, + { + "epoch": 0.8970666666666667, + "grad_norm": 0.36819851050984187, + "learning_rate": 5.510198963413881e-06, + "loss": 0.6003, + "step": 1682 + }, + { + "epoch": 0.8976, + "grad_norm": 0.4289014562150203, + "learning_rate": 5.453769828241872e-06, + "loss": 0.6426, + "step": 1683 + }, + { + "epoch": 0.8981333333333333, + "grad_norm": 0.3788170827299631, + "learning_rate": 5.397623022464226e-06, + "loss": 0.6143, + "step": 1684 + }, + { + "epoch": 0.8986666666666666, + "grad_norm": 0.3865590657220277, + "learning_rate": 5.341758713743828e-06, + "loss": 0.6638, + "step": 1685 + }, + { + "epoch": 0.8992, + "grad_norm": 0.4365607959056366, + "learning_rate": 5.286177068899989e-06, + "loss": 0.6938, + "step": 1686 + }, + { + "epoch": 0.8997333333333334, + "grad_norm": 0.44866874502860077, + "learning_rate": 5.230878253907912e-06, + "loss": 0.6426, + "step": 1687 + }, + { + "epoch": 0.9002666666666667, + "grad_norm": 0.4296859079499077, + "learning_rate": 5.175862433898282e-06, + "loss": 0.7046, + "step": 1688 + }, + { + "epoch": 0.9008, + "grad_norm": 0.41822921407633734, + "learning_rate": 5.121129773156663e-06, + "loss": 0.6482, + "step": 1689 + }, + { + "epoch": 0.9013333333333333, + "grad_norm": 0.38099014633383893, + "learning_rate": 5.066680435123106e-06, + "loss": 0.6109, + "step": 1690 + }, + { + "epoch": 0.9018666666666667, + "grad_norm": 0.4004957870733732, + "learning_rate": 5.012514582391592e-06, + "loss": 0.6591, + "step": 1691 + }, + { + "epoch": 0.9024, + "grad_norm": 0.4331026824841501, + "learning_rate": 4.95863237670956e-06, + "loss": 0.636, + "step": 1692 + }, + { + "epoch": 0.9029333333333334, + "grad_norm": 0.40954588959020344, + "learning_rate": 4.905033978977491e-06, + "loss": 0.6816, + "step": 1693 + }, + { + "epoch": 0.9034666666666666, + "grad_norm": 0.37477638008580616, + "learning_rate": 4.851719549248301e-06, + "loss": 0.5575, + "step": 1694 + }, + { + "epoch": 0.904, + "grad_norm": 0.3677464157925147, + "learning_rate": 4.798689246727006e-06, + "loss": 0.6233, + "step": 1695 + }, + { + "epoch": 0.9045333333333333, + "grad_norm": 0.3830645321769917, + "learning_rate": 4.745943229770122e-06, + "loss": 0.6373, + "step": 1696 + }, + { + "epoch": 0.9050666666666667, + "grad_norm": 0.4926300639629245, + "learning_rate": 4.693481655885257e-06, + "loss": 0.62, + "step": 1697 + }, + { + "epoch": 0.9056, + "grad_norm": 0.40984080239893733, + "learning_rate": 4.641304681730641e-06, + "loss": 0.653, + "step": 1698 + }, + { + "epoch": 0.9061333333333333, + "grad_norm": 0.4534047509134345, + "learning_rate": 4.58941246311464e-06, + "loss": 0.6164, + "step": 1699 + }, + { + "epoch": 0.9066666666666666, + "grad_norm": 0.41060612583947487, + "learning_rate": 4.537805154995278e-06, + "loss": 0.6648, + "step": 1700 + }, + { + "epoch": 0.9072, + "grad_norm": 0.4140544791594831, + "learning_rate": 4.486482911479839e-06, + "loss": 0.6679, + "step": 1701 + }, + { + "epoch": 0.9077333333333333, + "grad_norm": 0.44852372679056324, + "learning_rate": 4.435445885824285e-06, + "loss": 0.6977, + "step": 1702 + }, + { + "epoch": 0.9082666666666667, + "grad_norm": 0.3766048639983215, + "learning_rate": 4.384694230432984e-06, + "loss": 0.6351, + "step": 1703 + }, + { + "epoch": 0.9088, + "grad_norm": 0.35319560193664873, + "learning_rate": 4.3342280968580285e-06, + "loss": 0.5554, + "step": 1704 + }, + { + "epoch": 0.9093333333333333, + "grad_norm": 0.48616138562802547, + "learning_rate": 4.2840476357989825e-06, + "loss": 0.6901, + "step": 1705 + }, + { + "epoch": 0.9098666666666667, + "grad_norm": 0.3993370068830059, + "learning_rate": 4.2341529971023255e-06, + "loss": 0.7132, + "step": 1706 + }, + { + "epoch": 0.9104, + "grad_norm": 0.46021025343425365, + "learning_rate": 4.184544329761009e-06, + "loss": 0.6835, + "step": 1707 + }, + { + "epoch": 0.9109333333333334, + "grad_norm": 0.3492644325618928, + "learning_rate": 4.135221781914034e-06, + "loss": 0.5862, + "step": 1708 + }, + { + "epoch": 0.9114666666666666, + "grad_norm": 0.4100686532285506, + "learning_rate": 4.0861855008460405e-06, + "loss": 0.6605, + "step": 1709 + }, + { + "epoch": 0.912, + "grad_norm": 0.4259304835012556, + "learning_rate": 4.037435632986786e-06, + "loss": 0.658, + "step": 1710 + }, + { + "epoch": 0.9125333333333333, + "grad_norm": 0.4250219131039423, + "learning_rate": 3.988972323910778e-06, + "loss": 0.6988, + "step": 1711 + }, + { + "epoch": 0.9130666666666667, + "grad_norm": 0.4028103776943648, + "learning_rate": 3.9407957183368095e-06, + "loss": 0.6674, + "step": 1712 + }, + { + "epoch": 0.9136, + "grad_norm": 0.3777656548476676, + "learning_rate": 3.892905960127546e-06, + "loss": 0.5894, + "step": 1713 + }, + { + "epoch": 0.9141333333333334, + "grad_norm": 0.42749975757907166, + "learning_rate": 3.845303192289074e-06, + "loss": 0.6723, + "step": 1714 + }, + { + "epoch": 0.9146666666666666, + "grad_norm": 0.43216473413347745, + "learning_rate": 3.797987556970495e-06, + "loss": 0.5931, + "step": 1715 + }, + { + "epoch": 0.9152, + "grad_norm": 0.4902906985721607, + "learning_rate": 3.750959195463466e-06, + "loss": 0.6397, + "step": 1716 + }, + { + "epoch": 0.9157333333333333, + "grad_norm": 0.3438665823882658, + "learning_rate": 3.7042182482018075e-06, + "loss": 0.5812, + "step": 1717 + }, + { + "epoch": 0.9162666666666667, + "grad_norm": 0.3658476490393041, + "learning_rate": 3.6577648547611033e-06, + "loss": 0.5835, + "step": 1718 + }, + { + "epoch": 0.9168, + "grad_norm": 0.3780446322788056, + "learning_rate": 3.611599153858214e-06, + "loss": 0.595, + "step": 1719 + }, + { + "epoch": 0.9173333333333333, + "grad_norm": 0.41814618617778465, + "learning_rate": 3.565721283350931e-06, + "loss": 0.6324, + "step": 1720 + }, + { + "epoch": 0.9178666666666667, + "grad_norm": 0.3901178105595818, + "learning_rate": 3.5201313802375456e-06, + "loss": 0.6639, + "step": 1721 + }, + { + "epoch": 0.9184, + "grad_norm": 0.38286272643945235, + "learning_rate": 3.4748295806564356e-06, + "loss": 0.6099, + "step": 1722 + }, + { + "epoch": 0.9189333333333334, + "grad_norm": 0.3865628722920474, + "learning_rate": 3.4298160198856568e-06, + "loss": 0.6315, + "step": 1723 + }, + { + "epoch": 0.9194666666666667, + "grad_norm": 0.5419359515697314, + "learning_rate": 3.3850908323424967e-06, + "loss": 0.5427, + "step": 1724 + }, + { + "epoch": 0.92, + "grad_norm": 0.3899876700160922, + "learning_rate": 3.3406541515832003e-06, + "loss": 0.6366, + "step": 1725 + }, + { + "epoch": 0.9205333333333333, + "grad_norm": 0.3616030779346872, + "learning_rate": 3.296506110302422e-06, + "loss": 0.594, + "step": 1726 + }, + { + "epoch": 0.9210666666666667, + "grad_norm": 0.4240669327876325, + "learning_rate": 3.252646840332918e-06, + "loss": 0.632, + "step": 1727 + }, + { + "epoch": 0.9216, + "grad_norm": 0.44957973793027783, + "learning_rate": 3.209076472645112e-06, + "loss": 0.6851, + "step": 1728 + }, + { + "epoch": 0.9221333333333334, + "grad_norm": 0.3803640458713458, + "learning_rate": 3.1657951373467497e-06, + "loss": 0.6266, + "step": 1729 + }, + { + "epoch": 0.9226666666666666, + "grad_norm": 0.4255318402766365, + "learning_rate": 3.1228029636824475e-06, + "loss": 0.5948, + "step": 1730 + }, + { + "epoch": 0.9232, + "grad_norm": 0.5052336267324755, + "learning_rate": 3.0801000800333877e-06, + "loss": 0.5732, + "step": 1731 + }, + { + "epoch": 0.9237333333333333, + "grad_norm": 0.34828984780876193, + "learning_rate": 3.037686613916857e-06, + "loss": 0.5604, + "step": 1732 + }, + { + "epoch": 0.9242666666666667, + "grad_norm": 0.346773846061091, + "learning_rate": 2.995562691985898e-06, + "loss": 0.6007, + "step": 1733 + }, + { + "epoch": 0.9248, + "grad_norm": 0.3488831216331237, + "learning_rate": 2.9537284400289355e-06, + "loss": 0.5746, + "step": 1734 + }, + { + "epoch": 0.9253333333333333, + "grad_norm": 0.353819959211536, + "learning_rate": 2.912183982969385e-06, + "loss": 0.5998, + "step": 1735 + }, + { + "epoch": 0.9258666666666666, + "grad_norm": 0.4182713925300156, + "learning_rate": 2.8709294448653225e-06, + "loss": 0.6411, + "step": 1736 + }, + { + "epoch": 0.9264, + "grad_norm": 0.43154467378249295, + "learning_rate": 2.8299649489090475e-06, + "loss": 0.6322, + "step": 1737 + }, + { + "epoch": 0.9269333333333334, + "grad_norm": 0.4037799248955002, + "learning_rate": 2.789290617426765e-06, + "loss": 0.6531, + "step": 1738 + }, + { + "epoch": 0.9274666666666667, + "grad_norm": 0.45604664148272145, + "learning_rate": 2.748906571878207e-06, + "loss": 0.66, + "step": 1739 + }, + { + "epoch": 0.928, + "grad_norm": 0.44150611513550986, + "learning_rate": 2.708812932856253e-06, + "loss": 0.6566, + "step": 1740 + }, + { + "epoch": 0.9285333333333333, + "grad_norm": 0.4490419569292603, + "learning_rate": 2.6690098200866098e-06, + "loss": 0.6412, + "step": 1741 + }, + { + "epoch": 0.9290666666666667, + "grad_norm": 0.4074018171974381, + "learning_rate": 2.6294973524274125e-06, + "loss": 0.6246, + "step": 1742 + }, + { + "epoch": 0.9296, + "grad_norm": 0.4435900300223481, + "learning_rate": 2.590275647868867e-06, + "loss": 0.6613, + "step": 1743 + }, + { + "epoch": 0.9301333333333334, + "grad_norm": 0.3600253066809195, + "learning_rate": 2.551344823532964e-06, + "loss": 0.5667, + "step": 1744 + }, + { + "epoch": 0.9306666666666666, + "grad_norm": 0.39696343675793055, + "learning_rate": 2.5127049956730207e-06, + "loss": 0.611, + "step": 1745 + }, + { + "epoch": 0.9312, + "grad_norm": 0.5182802937795672, + "learning_rate": 2.4743562796734622e-06, + "loss": 0.707, + "step": 1746 + }, + { + "epoch": 0.9317333333333333, + "grad_norm": 0.36385089190171926, + "learning_rate": 2.436298790049363e-06, + "loss": 0.6002, + "step": 1747 + }, + { + "epoch": 0.9322666666666667, + "grad_norm": 0.4045286031388774, + "learning_rate": 2.3985326404461604e-06, + "loss": 0.6688, + "step": 1748 + }, + { + "epoch": 0.9328, + "grad_norm": 0.4468725830239745, + "learning_rate": 2.3610579436393e-06, + "loss": 0.6733, + "step": 1749 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 0.37429865121265865, + "learning_rate": 2.3238748115339324e-06, + "loss": 0.6916, + "step": 1750 + }, + { + "epoch": 0.9338666666666666, + "grad_norm": 0.3572136019687655, + "learning_rate": 2.286983355164529e-06, + "loss": 0.6035, + "step": 1751 + }, + { + "epoch": 0.9344, + "grad_norm": 0.4486284312165138, + "learning_rate": 2.250383684694579e-06, + "loss": 0.6861, + "step": 1752 + }, + { + "epoch": 0.9349333333333333, + "grad_norm": 0.3553972877760231, + "learning_rate": 2.2140759094162467e-06, + "loss": 0.594, + "step": 1753 + }, + { + "epoch": 0.9354666666666667, + "grad_norm": 0.4056341604847936, + "learning_rate": 2.178060137750071e-06, + "loss": 0.6248, + "step": 1754 + }, + { + "epoch": 0.936, + "grad_norm": 0.36558159733533085, + "learning_rate": 2.1423364772445887e-06, + "loss": 0.6066, + "step": 1755 + }, + { + "epoch": 0.9365333333333333, + "grad_norm": 0.4061970922594267, + "learning_rate": 2.106905034576112e-06, + "loss": 0.633, + "step": 1756 + }, + { + "epoch": 0.9370666666666667, + "grad_norm": 0.3811504643296473, + "learning_rate": 2.0717659155482738e-06, + "loss": 0.5499, + "step": 1757 + }, + { + "epoch": 0.9376, + "grad_norm": 0.37384507290136076, + "learning_rate": 2.036919225091827e-06, + "loss": 0.6049, + "step": 1758 + }, + { + "epoch": 0.9381333333333334, + "grad_norm": 0.3973429118899678, + "learning_rate": 2.002365067264289e-06, + "loss": 0.6284, + "step": 1759 + }, + { + "epoch": 0.9386666666666666, + "grad_norm": 0.47537615922802734, + "learning_rate": 1.968103545249611e-06, + "loss": 0.7432, + "step": 1760 + }, + { + "epoch": 0.9392, + "grad_norm": 0.4104457549264363, + "learning_rate": 1.9341347613579087e-06, + "loss": 0.7001, + "step": 1761 + }, + { + "epoch": 0.9397333333333333, + "grad_norm": 0.4430266939888034, + "learning_rate": 1.900458817025097e-06, + "loss": 0.5947, + "step": 1762 + }, + { + "epoch": 0.9402666666666667, + "grad_norm": 0.3967446665343882, + "learning_rate": 1.8670758128126909e-06, + "loss": 0.5761, + "step": 1763 + }, + { + "epoch": 0.9408, + "grad_norm": 0.4258095715960273, + "learning_rate": 1.8339858484073935e-06, + "loss": 0.6796, + "step": 1764 + }, + { + "epoch": 0.9413333333333334, + "grad_norm": 0.34786398810414526, + "learning_rate": 1.8011890226208527e-06, + "loss": 0.5575, + "step": 1765 + }, + { + "epoch": 0.9418666666666666, + "grad_norm": 0.4275895681198496, + "learning_rate": 1.7686854333893833e-06, + "loss": 0.679, + "step": 1766 + }, + { + "epoch": 0.9424, + "grad_norm": 0.5290437124970101, + "learning_rate": 1.7364751777736332e-06, + "loss": 0.6052, + "step": 1767 + }, + { + "epoch": 0.9429333333333333, + "grad_norm": 0.5359355147252487, + "learning_rate": 1.7045583519583074e-06, + "loss": 0.7417, + "step": 1768 + }, + { + "epoch": 0.9434666666666667, + "grad_norm": 0.37930293748445637, + "learning_rate": 1.6729350512519005e-06, + "loss": 0.5852, + "step": 1769 + }, + { + "epoch": 0.944, + "grad_norm": 0.4633348667320413, + "learning_rate": 1.6416053700863964e-06, + "loss": 0.7025, + "step": 1770 + }, + { + "epoch": 0.9445333333333333, + "grad_norm": 0.392749110225333, + "learning_rate": 1.6105694020169593e-06, + "loss": 0.6089, + "step": 1771 + }, + { + "epoch": 0.9450666666666667, + "grad_norm": 0.41487626459388594, + "learning_rate": 1.5798272397217095e-06, + "loss": 0.6547, + "step": 1772 + }, + { + "epoch": 0.9456, + "grad_norm": 0.41929688237722923, + "learning_rate": 1.5493789750014031e-06, + "loss": 0.6599, + "step": 1773 + }, + { + "epoch": 0.9461333333333334, + "grad_norm": 0.43257949909434595, + "learning_rate": 1.5192246987791981e-06, + "loss": 0.6243, + "step": 1774 + }, + { + "epoch": 0.9466666666666667, + "grad_norm": 0.40014876474338457, + "learning_rate": 1.489364501100332e-06, + "loss": 0.6399, + "step": 1775 + }, + { + "epoch": 0.9472, + "grad_norm": 0.44297625725316786, + "learning_rate": 1.459798471131868e-06, + "loss": 0.6189, + "step": 1776 + }, + { + "epoch": 0.9477333333333333, + "grad_norm": 0.391386338820982, + "learning_rate": 1.430526697162482e-06, + "loss": 0.6101, + "step": 1777 + }, + { + "epoch": 0.9482666666666667, + "grad_norm": 0.40289981420138316, + "learning_rate": 1.4015492666021312e-06, + "loss": 0.5999, + "step": 1778 + }, + { + "epoch": 0.9488, + "grad_norm": 0.4726234303201714, + "learning_rate": 1.3728662659818204e-06, + "loss": 0.6732, + "step": 1779 + }, + { + "epoch": 0.9493333333333334, + "grad_norm": 0.3880707836498278, + "learning_rate": 1.344477780953346e-06, + "loss": 0.6831, + "step": 1780 + }, + { + "epoch": 0.9498666666666666, + "grad_norm": 0.42009141047301407, + "learning_rate": 1.3163838962890195e-06, + "loss": 0.5946, + "step": 1781 + }, + { + "epoch": 0.9504, + "grad_norm": 0.31459151848912936, + "learning_rate": 1.2885846958814673e-06, + "loss": 0.4982, + "step": 1782 + }, + { + "epoch": 0.9509333333333333, + "grad_norm": 0.3808332904925995, + "learning_rate": 1.261080262743297e-06, + "loss": 0.6033, + "step": 1783 + }, + { + "epoch": 0.9514666666666667, + "grad_norm": 0.3971412010894983, + "learning_rate": 1.2338706790069431e-06, + "loss": 0.6068, + "step": 1784 + }, + { + "epoch": 0.952, + "grad_norm": 0.42138215375453464, + "learning_rate": 1.2069560259243328e-06, + "loss": 0.5968, + "step": 1785 + }, + { + "epoch": 0.9525333333333333, + "grad_norm": 0.45593227137872594, + "learning_rate": 1.1803363838667092e-06, + "loss": 0.6271, + "step": 1786 + }, + { + "epoch": 0.9530666666666666, + "grad_norm": 0.4370834348661708, + "learning_rate": 1.1540118323243865e-06, + "loss": 0.6853, + "step": 1787 + }, + { + "epoch": 0.9536, + "grad_norm": 0.40604479576739955, + "learning_rate": 1.1279824499064396e-06, + "loss": 0.6261, + "step": 1788 + }, + { + "epoch": 0.9541333333333334, + "grad_norm": 0.36332079540179196, + "learning_rate": 1.1022483143405705e-06, + "loss": 0.6037, + "step": 1789 + }, + { + "epoch": 0.9546666666666667, + "grad_norm": 0.46083204959098173, + "learning_rate": 1.076809502472831e-06, + "loss": 0.636, + "step": 1790 + }, + { + "epoch": 0.9552, + "grad_norm": 0.4331820455251136, + "learning_rate": 1.0516660902673448e-06, + "loss": 0.6008, + "step": 1791 + }, + { + "epoch": 0.9557333333333333, + "grad_norm": 0.39838234530173705, + "learning_rate": 1.0268181528061749e-06, + "loss": 0.6939, + "step": 1792 + }, + { + "epoch": 0.9562666666666667, + "grad_norm": 0.4029315687366071, + "learning_rate": 1.0022657642890231e-06, + "loss": 0.6374, + "step": 1793 + }, + { + "epoch": 0.9568, + "grad_norm": 0.36723184689622324, + "learning_rate": 9.780089980330642e-07, + "loss": 0.5428, + "step": 1794 + }, + { + "epoch": 0.9573333333333334, + "grad_norm": 0.41260438226003665, + "learning_rate": 9.540479264726676e-07, + "loss": 0.593, + "step": 1795 + }, + { + "epoch": 0.9578666666666666, + "grad_norm": 0.3819459436471279, + "learning_rate": 9.303826211592315e-07, + "loss": 0.597, + "step": 1796 + }, + { + "epoch": 0.9584, + "grad_norm": 0.5147018245810597, + "learning_rate": 9.070131527609604e-07, + "loss": 0.7745, + "step": 1797 + }, + { + "epoch": 0.9589333333333333, + "grad_norm": 0.3781968630865498, + "learning_rate": 8.839395910626213e-07, + "loss": 0.6202, + "step": 1798 + }, + { + "epoch": 0.9594666666666667, + "grad_norm": 0.40083163294255647, + "learning_rate": 8.611620049653879e-07, + "loss": 0.6093, + "step": 1799 + }, + { + "epoch": 0.96, + "grad_norm": 0.4013706045863398, + "learning_rate": 8.386804624865851e-07, + "loss": 0.6523, + "step": 1800 + }, + { + "epoch": 0.9605333333333334, + "grad_norm": 0.45426433863489035, + "learning_rate": 8.16495030759501e-07, + "loss": 0.623, + "step": 1801 + }, + { + "epoch": 0.9610666666666666, + "grad_norm": 0.35118711218518495, + "learning_rate": 7.946057760332193e-07, + "loss": 0.6, + "step": 1802 + }, + { + "epoch": 0.9616, + "grad_norm": 0.4139965455148424, + "learning_rate": 7.730127636723539e-07, + "loss": 0.6918, + "step": 1803 + }, + { + "epoch": 0.9621333333333333, + "grad_norm": 0.4580436016800682, + "learning_rate": 7.517160581569372e-07, + "loss": 0.7366, + "step": 1804 + }, + { + "epoch": 0.9626666666666667, + "grad_norm": 0.41031093686075293, + "learning_rate": 7.307157230821426e-07, + "loss": 0.6508, + "step": 1805 + }, + { + "epoch": 0.9632, + "grad_norm": 0.42894494322204385, + "learning_rate": 7.100118211581852e-07, + "loss": 0.6746, + "step": 1806 + }, + { + "epoch": 0.9637333333333333, + "grad_norm": 0.4277586928186075, + "learning_rate": 6.896044142100433e-07, + "loss": 0.5941, + "step": 1807 + }, + { + "epoch": 0.9642666666666667, + "grad_norm": 0.3726116360265131, + "learning_rate": 6.694935631773258e-07, + "loss": 0.6289, + "step": 1808 + }, + { + "epoch": 0.9648, + "grad_norm": 0.42678020531448374, + "learning_rate": 6.496793281141056e-07, + "loss": 0.7038, + "step": 1809 + }, + { + "epoch": 0.9653333333333334, + "grad_norm": 0.41457849722470047, + "learning_rate": 6.301617681886863e-07, + "loss": 0.5832, + "step": 1810 + }, + { + "epoch": 0.9658666666666667, + "grad_norm": 0.5517044690279252, + "learning_rate": 6.109409416834688e-07, + "loss": 0.6336, + "step": 1811 + }, + { + "epoch": 0.9664, + "grad_norm": 0.43149313950848506, + "learning_rate": 5.920169059947411e-07, + "loss": 0.6666, + "step": 1812 + }, + { + "epoch": 0.9669333333333333, + "grad_norm": 0.3616916487578632, + "learning_rate": 5.733897176325665e-07, + "loss": 0.5937, + "step": 1813 + }, + { + "epoch": 0.9674666666666667, + "grad_norm": 0.3710368806138515, + "learning_rate": 5.550594322205504e-07, + "loss": 0.5398, + "step": 1814 + }, + { + "epoch": 0.968, + "grad_norm": 0.3560929224051006, + "learning_rate": 5.370261044956971e-07, + "loss": 0.5422, + "step": 1815 + }, + { + "epoch": 0.9685333333333334, + "grad_norm": 0.3975214060376899, + "learning_rate": 5.192897883082747e-07, + "loss": 0.6349, + "step": 1816 + }, + { + "epoch": 0.9690666666666666, + "grad_norm": 0.39721197754283144, + "learning_rate": 5.018505366216175e-07, + "loss": 0.658, + "step": 1817 + }, + { + "epoch": 0.9696, + "grad_norm": 0.40380837462984304, + "learning_rate": 4.847084015119574e-07, + "loss": 0.6511, + "step": 1818 + }, + { + "epoch": 0.9701333333333333, + "grad_norm": 0.41516731626442, + "learning_rate": 4.678634341683252e-07, + "loss": 0.6576, + "step": 1819 + }, + { + "epoch": 0.9706666666666667, + "grad_norm": 0.44443986255157986, + "learning_rate": 4.5131568489236166e-07, + "loss": 0.597, + "step": 1820 + }, + { + "epoch": 0.9712, + "grad_norm": 0.46690678254721785, + "learning_rate": 4.3506520309813947e-07, + "loss": 0.6744, + "step": 1821 + }, + { + "epoch": 0.9717333333333333, + "grad_norm": 0.4236247311231795, + "learning_rate": 4.191120373120749e-07, + "loss": 0.6958, + "step": 1822 + }, + { + "epoch": 0.9722666666666666, + "grad_norm": 0.4218011524784495, + "learning_rate": 4.034562351727389e-07, + "loss": 0.6064, + "step": 1823 + }, + { + "epoch": 0.9728, + "grad_norm": 0.4180589874972332, + "learning_rate": 3.8809784343072366e-07, + "loss": 0.6633, + "step": 1824 + }, + { + "epoch": 0.9733333333333334, + "grad_norm": 0.40482637284568534, + "learning_rate": 3.73036907948543e-07, + "loss": 0.6008, + "step": 1825 + }, + { + "epoch": 0.9738666666666667, + "grad_norm": 0.4127301984330829, + "learning_rate": 3.582734737004101e-07, + "loss": 0.6495, + "step": 1826 + }, + { + "epoch": 0.9744, + "grad_norm": 0.43671439107411486, + "learning_rate": 3.4380758477219333e-07, + "loss": 0.7455, + "step": 1827 + }, + { + "epoch": 0.9749333333333333, + "grad_norm": 0.37837694112342424, + "learning_rate": 3.296392843612273e-07, + "loss": 0.6003, + "step": 1828 + }, + { + "epoch": 0.9754666666666667, + "grad_norm": 0.49685366602989023, + "learning_rate": 3.1576861477621287e-07, + "loss": 0.7369, + "step": 1829 + }, + { + "epoch": 0.976, + "grad_norm": 0.43391209681434595, + "learning_rate": 3.0219561743707326e-07, + "loss": 0.6135, + "step": 1830 + }, + { + "epoch": 0.9765333333333334, + "grad_norm": 0.40530873199447187, + "learning_rate": 2.889203328748424e-07, + "loss": 0.6963, + "step": 1831 + }, + { + "epoch": 0.9770666666666666, + "grad_norm": 0.4132260658725821, + "learning_rate": 2.759428007315212e-07, + "loss": 0.6623, + "step": 1832 + }, + { + "epoch": 0.9776, + "grad_norm": 0.37671125150722135, + "learning_rate": 2.6326305976001055e-07, + "loss": 0.6389, + "step": 1833 + }, + { + "epoch": 0.9781333333333333, + "grad_norm": 0.5129910304033822, + "learning_rate": 2.5088114782392257e-07, + "loss": 0.7407, + "step": 1834 + }, + { + "epoch": 0.9786666666666667, + "grad_norm": 0.4652504700471368, + "learning_rate": 2.3879710189753656e-07, + "loss": 0.6826, + "step": 1835 + }, + { + "epoch": 0.9792, + "grad_norm": 0.4363947547988871, + "learning_rate": 2.2701095806565432e-07, + "loss": 0.6753, + "step": 1836 + }, + { + "epoch": 0.9797333333333333, + "grad_norm": 0.4218095754007212, + "learning_rate": 2.15522751523467e-07, + "loss": 0.6141, + "step": 1837 + }, + { + "epoch": 0.9802666666666666, + "grad_norm": 0.40715115291783716, + "learning_rate": 2.0433251657653308e-07, + "loss": 0.645, + "step": 1838 + }, + { + "epoch": 0.9808, + "grad_norm": 0.4201761417092367, + "learning_rate": 1.9344028664056713e-07, + "loss": 0.6296, + "step": 1839 + }, + { + "epoch": 0.9813333333333333, + "grad_norm": 0.4182657198519536, + "learning_rate": 1.8284609424142895e-07, + "loss": 0.6972, + "step": 1840 + }, + { + "epoch": 0.9818666666666667, + "grad_norm": 0.4281553244352119, + "learning_rate": 1.7254997101500137e-07, + "loss": 0.6501, + "step": 1841 + }, + { + "epoch": 0.9824, + "grad_norm": 0.4197015546849949, + "learning_rate": 1.6255194770704586e-07, + "loss": 0.6409, + "step": 1842 + }, + { + "epoch": 0.9829333333333333, + "grad_norm": 0.41746548528971333, + "learning_rate": 1.5285205417319149e-07, + "loss": 0.6475, + "step": 1843 + }, + { + "epoch": 0.9834666666666667, + "grad_norm": 0.4194385161714978, + "learning_rate": 1.4345031937879062e-07, + "loss": 0.6729, + "step": 1844 + }, + { + "epoch": 0.984, + "grad_norm": 0.4707694959493561, + "learning_rate": 1.3434677139885222e-07, + "loss": 0.6543, + "step": 1845 + }, + { + "epoch": 0.9845333333333334, + "grad_norm": 0.4540084220958516, + "learning_rate": 1.255414374179531e-07, + "loss": 0.6682, + "step": 1846 + }, + { + "epoch": 0.9850666666666666, + "grad_norm": 0.4069028356392501, + "learning_rate": 1.170343437301491e-07, + "loss": 0.6074, + "step": 1847 + }, + { + "epoch": 0.9856, + "grad_norm": 0.5188126407003398, + "learning_rate": 1.0882551573891953e-07, + "loss": 0.6645, + "step": 1848 + }, + { + "epoch": 0.9861333333333333, + "grad_norm": 0.39146103679729155, + "learning_rate": 1.0091497795706728e-07, + "loss": 0.5945, + "step": 1849 + }, + { + "epoch": 0.9866666666666667, + "grad_norm": 0.4265384478677555, + "learning_rate": 9.330275400666332e-08, + "loss": 0.6236, + "step": 1850 + }, + { + "epoch": 0.9872, + "grad_norm": 0.4159128225073078, + "learning_rate": 8.598886661895788e-08, + "loss": 0.6616, + "step": 1851 + }, + { + "epoch": 0.9877333333333334, + "grad_norm": 0.41964616900270324, + "learning_rate": 7.8973337634336e-08, + "loss": 0.6243, + "step": 1852 + }, + { + "epoch": 0.9882666666666666, + "grad_norm": 0.579955875571895, + "learning_rate": 7.225618800222877e-08, + "loss": 0.6825, + "step": 1853 + }, + { + "epoch": 0.9888, + "grad_norm": 0.4478208448163976, + "learning_rate": 6.583743778106887e-08, + "loss": 0.6358, + "step": 1854 + }, + { + "epoch": 0.9893333333333333, + "grad_norm": 0.4127808255305365, + "learning_rate": 5.971710613821291e-08, + "loss": 0.6365, + "step": 1855 + }, + { + "epoch": 0.9898666666666667, + "grad_norm": 0.420118472388387, + "learning_rate": 5.389521134989695e-08, + "loss": 0.6653, + "step": 1856 + }, + { + "epoch": 0.9904, + "grad_norm": 0.46142875720538856, + "learning_rate": 4.837177080119215e-08, + "loss": 0.6428, + "step": 1857 + }, + { + "epoch": 0.9909333333333333, + "grad_norm": 0.4048587926915659, + "learning_rate": 4.314680098592705e-08, + "loss": 0.6279, + "step": 1858 + }, + { + "epoch": 0.9914666666666667, + "grad_norm": 0.3458469497176294, + "learning_rate": 3.8220317506654226e-08, + "loss": 0.6202, + "step": 1859 + }, + { + "epoch": 0.992, + "grad_norm": 0.4076662738844198, + "learning_rate": 3.359233507459481e-08, + "loss": 0.6159, + "step": 1860 + }, + { + "epoch": 0.9925333333333334, + "grad_norm": 0.38112266212449186, + "learning_rate": 2.9262867509605163e-08, + "loss": 0.635, + "step": 1861 + }, + { + "epoch": 0.9930666666666667, + "grad_norm": 0.38108114881334026, + "learning_rate": 2.5231927740154704e-08, + "loss": 0.6266, + "step": 1862 + }, + { + "epoch": 0.9936, + "grad_norm": 0.3657179361705824, + "learning_rate": 2.1499527803214846e-08, + "loss": 0.6321, + "step": 1863 + }, + { + "epoch": 0.9941333333333333, + "grad_norm": 0.37421511625080134, + "learning_rate": 1.8065678844314538e-08, + "loss": 0.6465, + "step": 1864 + }, + { + "epoch": 0.9946666666666667, + "grad_norm": 0.4760479839858473, + "learning_rate": 1.4930391117451426e-08, + "loss": 0.6844, + "step": 1865 + }, + { + "epoch": 0.9952, + "grad_norm": 0.4045266852361633, + "learning_rate": 1.209367398504746e-08, + "loss": 0.6967, + "step": 1866 + }, + { + "epoch": 0.9957333333333334, + "grad_norm": 0.3868844486384437, + "learning_rate": 9.555535917993297e-09, + "loss": 0.6766, + "step": 1867 + }, + { + "epoch": 0.9962666666666666, + "grad_norm": 0.3705547133425217, + "learning_rate": 7.315984495548378e-09, + "loss": 0.5429, + "step": 1868 + }, + { + "epoch": 0.9968, + "grad_norm": 0.3857037725600411, + "learning_rate": 5.375026405352035e-09, + "loss": 0.6145, + "step": 1869 + }, + { + "epoch": 0.9973333333333333, + "grad_norm": 0.3690888658164784, + "learning_rate": 3.732667443390181e-09, + "loss": 0.5999, + "step": 1870 + }, + { + "epoch": 0.9978666666666667, + "grad_norm": 0.37791030559543104, + "learning_rate": 2.388912514017516e-09, + "loss": 0.6228, + "step": 1871 + }, + { + "epoch": 0.9984, + "grad_norm": 0.40303393262956455, + "learning_rate": 1.3437656298687097e-09, + "loss": 0.586, + "step": 1872 + }, + { + "epoch": 0.9989333333333333, + "grad_norm": 0.3804108106985691, + "learning_rate": 5.972299119250125e-10, + "loss": 0.6194, + "step": 1873 + }, + { + "epoch": 0.9994666666666666, + "grad_norm": 0.37964342880483826, + "learning_rate": 1.4930758944764479e-10, + "loss": 0.5963, + "step": 1874 + }, + { + "epoch": 1.0, + "grad_norm": 0.3654631867371486, + "learning_rate": 0.0, + "loss": 0.6098, + "step": 1875 + }, + { + "epoch": 1.0, + "step": 1875, + "total_flos": 1673623541907456.0, + "train_loss": 0.7114526748339335, + "train_runtime": 29135.8629, + "train_samples_per_second": 1.03, + "train_steps_per_second": 0.064 + } + ], + "logging_steps": 1.0, + "max_steps": 1875, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1673623541907456.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_1_epochs_1_GA_4_lora/README.md b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_1_epochs_1_GA_4_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_1_epochs_1_GA_4_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_1_epochs_1_GA_4_lora/adapter_config.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_1_epochs_1_GA_4_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..5f3d2c0fab2a37b55c7692981509f5579150eb23 --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_1_epochs_1_GA_4_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "down_proj", + "o_proj", + "up_proj", + "q_proj", + "gate_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_1_epochs_1_GA_4_lora/adapter_model.safetensors b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_1_epochs_1_GA_4_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..20ec7ae0d16b0a97f45a8584a4de91d16ca081c0 --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_1_epochs_1_GA_4_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2814d25019d3e9d21a6e924c8125532b4fc62ab211a13d11fdfa494d7378f0aa +size 671150064 diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_1_epochs_1_GA_4_lora/config.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_1_epochs_1_GA_4_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_1_epochs_1_GA_4_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_1_epochs_1_GA_4_lora/non_lora_trainables.bin b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_1_epochs_1_GA_4_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..55e68a7295ebea3d87d13fef6bb0dc75c7d0329d --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_1_epochs_1_GA_4_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8f3c5c81f5a6de3f134118146a9b959c53799922dce1d8318db881a072efa39 +size 918507402 diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_1_epochs_1_GA_4_lora/trainer_state.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_1_epochs_1_GA_4_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..24fd824dc0a80d79b0c186ddaecfc78b86990c11 --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_1_epochs_1_GA_4_lora/trainer_state.json @@ -0,0 +1,6601 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9994666666666666, + "eval_steps": 500, + "global_step": 937, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0010666666666666667, + "grad_norm": 0.7150404507746235, + "learning_rate": 6.896551724137932e-06, + "loss": 1.3138, + "step": 1 + }, + { + "epoch": 0.0021333333333333334, + "grad_norm": 0.7517230768793969, + "learning_rate": 1.3793103448275863e-05, + "loss": 1.3487, + "step": 2 + }, + { + "epoch": 0.0032, + "grad_norm": 0.6850986826401334, + "learning_rate": 2.0689655172413793e-05, + "loss": 1.2781, + "step": 3 + }, + { + "epoch": 0.004266666666666667, + "grad_norm": 0.6669116276208316, + "learning_rate": 2.7586206896551727e-05, + "loss": 1.2872, + "step": 4 + }, + { + "epoch": 0.005333333333333333, + "grad_norm": 0.5513124829727151, + "learning_rate": 3.4482758620689657e-05, + "loss": 1.1467, + "step": 5 + }, + { + "epoch": 0.0064, + "grad_norm": 0.6839012744129601, + "learning_rate": 4.1379310344827587e-05, + "loss": 1.263, + "step": 6 + }, + { + "epoch": 0.007466666666666667, + "grad_norm": 0.5916540505648946, + "learning_rate": 4.827586206896552e-05, + "loss": 1.1601, + "step": 7 + }, + { + "epoch": 0.008533333333333334, + "grad_norm": 0.5681708162575256, + "learning_rate": 5.517241379310345e-05, + "loss": 1.063, + "step": 8 + }, + { + "epoch": 0.0096, + "grad_norm": 0.9175268922331461, + "learning_rate": 6.206896551724138e-05, + "loss": 1.0006, + "step": 9 + }, + { + "epoch": 0.010666666666666666, + "grad_norm": 0.689533234355985, + "learning_rate": 6.896551724137931e-05, + "loss": 1.0061, + "step": 10 + }, + { + "epoch": 0.011733333333333333, + "grad_norm": 0.6058128511088969, + "learning_rate": 7.586206896551724e-05, + "loss": 0.9672, + "step": 11 + }, + { + "epoch": 0.0128, + "grad_norm": 0.4697515045752619, + "learning_rate": 8.275862068965517e-05, + "loss": 0.9108, + "step": 12 + }, + { + "epoch": 0.013866666666666666, + "grad_norm": 0.5136927014165928, + "learning_rate": 8.96551724137931e-05, + "loss": 1.0057, + "step": 13 + }, + { + "epoch": 0.014933333333333333, + "grad_norm": 0.4050192125844374, + "learning_rate": 9.655172413793105e-05, + "loss": 0.8527, + "step": 14 + }, + { + "epoch": 0.016, + "grad_norm": 0.37227559692849704, + "learning_rate": 0.00010344827586206898, + "loss": 0.913, + "step": 15 + }, + { + "epoch": 0.017066666666666667, + "grad_norm": 0.38229389281594744, + "learning_rate": 0.0001103448275862069, + "loss": 0.9282, + "step": 16 + }, + { + "epoch": 0.018133333333333335, + "grad_norm": 0.3914291297898359, + "learning_rate": 0.00011724137931034482, + "loss": 0.8802, + "step": 17 + }, + { + "epoch": 0.0192, + "grad_norm": 0.36356337251375576, + "learning_rate": 0.00012413793103448277, + "loss": 0.8928, + "step": 18 + }, + { + "epoch": 0.020266666666666665, + "grad_norm": 0.40831342927814046, + "learning_rate": 0.00013103448275862068, + "loss": 0.9102, + "step": 19 + }, + { + "epoch": 0.021333333333333333, + "grad_norm": 0.3984609271059379, + "learning_rate": 0.00013793103448275863, + "loss": 0.9177, + "step": 20 + }, + { + "epoch": 0.0224, + "grad_norm": 0.3789014734433653, + "learning_rate": 0.00014482758620689657, + "loss": 0.8836, + "step": 21 + }, + { + "epoch": 0.023466666666666667, + "grad_norm": 0.36284925629079756, + "learning_rate": 0.00015172413793103449, + "loss": 0.8911, + "step": 22 + }, + { + "epoch": 0.024533333333333334, + "grad_norm": 0.3782325896198143, + "learning_rate": 0.00015862068965517243, + "loss": 0.8294, + "step": 23 + }, + { + "epoch": 0.0256, + "grad_norm": 0.35647166395970625, + "learning_rate": 0.00016551724137931035, + "loss": 0.8587, + "step": 24 + }, + { + "epoch": 0.02666666666666667, + "grad_norm": 0.38566819707562905, + "learning_rate": 0.00017241379310344826, + "loss": 0.9287, + "step": 25 + }, + { + "epoch": 0.027733333333333332, + "grad_norm": 0.34689221911048684, + "learning_rate": 0.0001793103448275862, + "loss": 0.8166, + "step": 26 + }, + { + "epoch": 0.0288, + "grad_norm": 0.35163565590631973, + "learning_rate": 0.00018620689655172415, + "loss": 0.8877, + "step": 27 + }, + { + "epoch": 0.029866666666666666, + "grad_norm": 0.3602698831545272, + "learning_rate": 0.0001931034482758621, + "loss": 0.8118, + "step": 28 + }, + { + "epoch": 0.030933333333333334, + "grad_norm": 0.4717780613184218, + "learning_rate": 0.0002, + "loss": 0.9208, + "step": 29 + }, + { + "epoch": 0.032, + "grad_norm": 0.31657851949244914, + "learning_rate": 0.00019999940145388063, + "loss": 0.8136, + "step": 30 + }, + { + "epoch": 0.03306666666666667, + "grad_norm": 0.33567964751978563, + "learning_rate": 0.00019999760582268763, + "loss": 0.7925, + "step": 31 + }, + { + "epoch": 0.034133333333333335, + "grad_norm": 0.6991343865783637, + "learning_rate": 0.00019999461312791638, + "loss": 0.7912, + "step": 32 + }, + { + "epoch": 0.0352, + "grad_norm": 0.3324181376982427, + "learning_rate": 0.0001999904234053922, + "loss": 0.8135, + "step": 33 + }, + { + "epoch": 0.03626666666666667, + "grad_norm": 0.36123274074779893, + "learning_rate": 0.00019998503670526994, + "loss": 0.8447, + "step": 34 + }, + { + "epoch": 0.037333333333333336, + "grad_norm": 0.3332897101654105, + "learning_rate": 0.00019997845309203334, + "loss": 0.808, + "step": 35 + }, + { + "epoch": 0.0384, + "grad_norm": 0.33841161377796475, + "learning_rate": 0.00019997067264449433, + "loss": 0.8057, + "step": 36 + }, + { + "epoch": 0.039466666666666664, + "grad_norm": 0.3433711214041263, + "learning_rate": 0.00019996169545579207, + "loss": 0.8823, + "step": 37 + }, + { + "epoch": 0.04053333333333333, + "grad_norm": 0.3320327339933086, + "learning_rate": 0.00019995152163339178, + "loss": 0.8295, + "step": 38 + }, + { + "epoch": 0.0416, + "grad_norm": 0.35328695696685514, + "learning_rate": 0.00019994015129908346, + "loss": 0.8567, + "step": 39 + }, + { + "epoch": 0.042666666666666665, + "grad_norm": 0.32055484686699165, + "learning_rate": 0.00019992758458898055, + "loss": 0.8269, + "step": 40 + }, + { + "epoch": 0.04373333333333333, + "grad_norm": 0.2988079364489933, + "learning_rate": 0.00019991382165351814, + "loss": 0.8004, + "step": 41 + }, + { + "epoch": 0.0448, + "grad_norm": 0.31076214749417785, + "learning_rate": 0.00019989886265745128, + "loss": 0.7915, + "step": 42 + }, + { + "epoch": 0.04586666666666667, + "grad_norm": 0.36495247754832527, + "learning_rate": 0.00019988270777985292, + "loss": 0.905, + "step": 43 + }, + { + "epoch": 0.046933333333333334, + "grad_norm": 0.3201346595140102, + "learning_rate": 0.00019986535721411186, + "loss": 0.8098, + "step": 44 + }, + { + "epoch": 0.048, + "grad_norm": 0.31431333652249416, + "learning_rate": 0.00019984681116793038, + "loss": 0.8142, + "step": 45 + }, + { + "epoch": 0.04906666666666667, + "grad_norm": 0.3174046054807775, + "learning_rate": 0.00019982706986332175, + "loss": 0.7825, + "step": 46 + }, + { + "epoch": 0.050133333333333335, + "grad_norm": 0.3416574223684478, + "learning_rate": 0.00019980613353660763, + "loss": 0.8033, + "step": 47 + }, + { + "epoch": 0.0512, + "grad_norm": 0.29105704043049, + "learning_rate": 0.00019978400243841508, + "loss": 0.7839, + "step": 48 + }, + { + "epoch": 0.05226666666666667, + "grad_norm": 0.344561985887665, + "learning_rate": 0.00019976067683367385, + "loss": 0.8317, + "step": 49 + }, + { + "epoch": 0.05333333333333334, + "grad_norm": 0.2971171538224461, + "learning_rate": 0.0001997361570016129, + "loss": 0.7912, + "step": 50 + }, + { + "epoch": 0.0544, + "grad_norm": 0.3367067040104621, + "learning_rate": 0.00019971044323575728, + "loss": 0.8412, + "step": 51 + }, + { + "epoch": 0.055466666666666664, + "grad_norm": 0.30951669789643976, + "learning_rate": 0.0001996835358439244, + "loss": 0.8032, + "step": 52 + }, + { + "epoch": 0.05653333333333333, + "grad_norm": 0.31671201739557314, + "learning_rate": 0.00019965543514822062, + "loss": 0.7848, + "step": 53 + }, + { + "epoch": 0.0576, + "grad_norm": 0.2962489428837048, + "learning_rate": 0.00019962614148503718, + "loss": 0.7344, + "step": 54 + }, + { + "epoch": 0.058666666666666666, + "grad_norm": 0.2945631273987315, + "learning_rate": 0.00019959565520504623, + "loss": 0.7634, + "step": 55 + }, + { + "epoch": 0.05973333333333333, + "grad_norm": 0.3133817043715632, + "learning_rate": 0.00019956397667319668, + "loss": 0.7434, + "step": 56 + }, + { + "epoch": 0.0608, + "grad_norm": 0.3106632817607619, + "learning_rate": 0.00019953110626870979, + "loss": 0.8277, + "step": 57 + }, + { + "epoch": 0.06186666666666667, + "grad_norm": 0.33440647782775346, + "learning_rate": 0.00019949704438507459, + "loss": 0.8456, + "step": 58 + }, + { + "epoch": 0.06293333333333333, + "grad_norm": 0.2859149686600027, + "learning_rate": 0.00019946179143004325, + "loss": 0.6912, + "step": 59 + }, + { + "epoch": 0.064, + "grad_norm": 0.3293297422558948, + "learning_rate": 0.0001994253478256262, + "loss": 0.8083, + "step": 60 + }, + { + "epoch": 0.06506666666666666, + "grad_norm": 0.29984639159162674, + "learning_rate": 0.0001993877140080869, + "loss": 0.8117, + "step": 61 + }, + { + "epoch": 0.06613333333333334, + "grad_norm": 0.30330771622679087, + "learning_rate": 0.000199348890427937, + "loss": 0.757, + "step": 62 + }, + { + "epoch": 0.0672, + "grad_norm": 0.31300353466341463, + "learning_rate": 0.00019930887754993044, + "loss": 0.7748, + "step": 63 + }, + { + "epoch": 0.06826666666666667, + "grad_norm": 0.3522154834722942, + "learning_rate": 0.00019926767585305835, + "loss": 0.8176, + "step": 64 + }, + { + "epoch": 0.06933333333333333, + "grad_norm": 0.31586643449288965, + "learning_rate": 0.000199225285830543, + "loss": 0.8185, + "step": 65 + }, + { + "epoch": 0.0704, + "grad_norm": 0.2869955582760285, + "learning_rate": 0.00019918170798983211, + "loss": 0.7617, + "step": 66 + }, + { + "epoch": 0.07146666666666666, + "grad_norm": 0.35210811616977333, + "learning_rate": 0.00019913694285259256, + "loss": 0.791, + "step": 67 + }, + { + "epoch": 0.07253333333333334, + "grad_norm": 0.31706716634256427, + "learning_rate": 0.00019909099095470444, + "loss": 0.8021, + "step": 68 + }, + { + "epoch": 0.0736, + "grad_norm": 0.3339013269934293, + "learning_rate": 0.00019904385284625424, + "loss": 0.8119, + "step": 69 + }, + { + "epoch": 0.07466666666666667, + "grad_norm": 0.31877870971738437, + "learning_rate": 0.00019899552909152866, + "loss": 0.7794, + "step": 70 + }, + { + "epoch": 0.07573333333333333, + "grad_norm": 0.38541610907281726, + "learning_rate": 0.00019894602026900758, + "loss": 0.7312, + "step": 71 + }, + { + "epoch": 0.0768, + "grad_norm": 0.30837800553536693, + "learning_rate": 0.00019889532697135734, + "loss": 0.7584, + "step": 72 + }, + { + "epoch": 0.07786666666666667, + "grad_norm": 0.29549951927182116, + "learning_rate": 0.00019884344980542338, + "loss": 0.7328, + "step": 73 + }, + { + "epoch": 0.07893333333333333, + "grad_norm": 0.32164400658443787, + "learning_rate": 0.00019879038939222329, + "loss": 0.7841, + "step": 74 + }, + { + "epoch": 0.08, + "grad_norm": 0.32324949897356936, + "learning_rate": 0.0001987361463669392, + "loss": 0.7781, + "step": 75 + }, + { + "epoch": 0.08106666666666666, + "grad_norm": 0.3208249120187674, + "learning_rate": 0.00019868072137891002, + "loss": 0.8233, + "step": 76 + }, + { + "epoch": 0.08213333333333334, + "grad_norm": 0.30597753006893685, + "learning_rate": 0.00019862411509162406, + "loss": 0.7677, + "step": 77 + }, + { + "epoch": 0.0832, + "grad_norm": 0.3369333017450298, + "learning_rate": 0.0001985663281827108, + "loss": 0.8387, + "step": 78 + }, + { + "epoch": 0.08426666666666667, + "grad_norm": 0.30688012588113905, + "learning_rate": 0.00019850736134393286, + "loss": 0.7934, + "step": 79 + }, + { + "epoch": 0.08533333333333333, + "grad_norm": 0.36117693414215873, + "learning_rate": 0.00019844721528117766, + "loss": 0.7943, + "step": 80 + }, + { + "epoch": 0.0864, + "grad_norm": 0.3145122076331781, + "learning_rate": 0.00019838589071444903, + "loss": 0.7786, + "step": 81 + }, + { + "epoch": 0.08746666666666666, + "grad_norm": 0.350600304350432, + "learning_rate": 0.00019832338837785863, + "loss": 0.8077, + "step": 82 + }, + { + "epoch": 0.08853333333333334, + "grad_norm": 0.3520435592531511, + "learning_rate": 0.00019825970901961705, + "loss": 0.8227, + "step": 83 + }, + { + "epoch": 0.0896, + "grad_norm": 0.34115733128930387, + "learning_rate": 0.000198194853402025, + "loss": 0.8598, + "step": 84 + }, + { + "epoch": 0.09066666666666667, + "grad_norm": 0.2970478438067611, + "learning_rate": 0.00019812882230146398, + "loss": 0.7553, + "step": 85 + }, + { + "epoch": 0.09173333333333333, + "grad_norm": 0.31146294972146893, + "learning_rate": 0.00019806161650838723, + "loss": 0.8243, + "step": 86 + }, + { + "epoch": 0.0928, + "grad_norm": 0.3435563021382438, + "learning_rate": 0.00019799323682731, + "loss": 0.7977, + "step": 87 + }, + { + "epoch": 0.09386666666666667, + "grad_norm": 0.35481914703830225, + "learning_rate": 0.00019792368407680025, + "loss": 0.847, + "step": 88 + }, + { + "epoch": 0.09493333333333333, + "grad_norm": 0.3255425174480584, + "learning_rate": 0.00019785295908946848, + "loss": 0.8486, + "step": 89 + }, + { + "epoch": 0.096, + "grad_norm": 0.31142872104302455, + "learning_rate": 0.00019778106271195806, + "loss": 0.7739, + "step": 90 + }, + { + "epoch": 0.09706666666666666, + "grad_norm": 0.31099434553696265, + "learning_rate": 0.00019770799580493494, + "loss": 0.7232, + "step": 91 + }, + { + "epoch": 0.09813333333333334, + "grad_norm": 0.33390378245723135, + "learning_rate": 0.00019763375924307735, + "loss": 0.7922, + "step": 92 + }, + { + "epoch": 0.0992, + "grad_norm": 0.3384282157569822, + "learning_rate": 0.0001975583539150655, + "loss": 0.7837, + "step": 93 + }, + { + "epoch": 0.10026666666666667, + "grad_norm": 0.34501336210884226, + "learning_rate": 0.00019748178072357065, + "loss": 0.8115, + "step": 94 + }, + { + "epoch": 0.10133333333333333, + "grad_norm": 0.3414289964926809, + "learning_rate": 0.00019740404058524457, + "loss": 0.7936, + "step": 95 + }, + { + "epoch": 0.1024, + "grad_norm": 0.3268309843645721, + "learning_rate": 0.00019732513443070836, + "loss": 0.7667, + "step": 96 + }, + { + "epoch": 0.10346666666666667, + "grad_norm": 0.34769874408769946, + "learning_rate": 0.00019724506320454153, + "loss": 0.8125, + "step": 97 + }, + { + "epoch": 0.10453333333333334, + "grad_norm": 0.3347227330823757, + "learning_rate": 0.0001971638278652705, + "loss": 0.7706, + "step": 98 + }, + { + "epoch": 0.1056, + "grad_norm": 0.341218992013308, + "learning_rate": 0.0001970814293853572, + "loss": 0.7264, + "step": 99 + }, + { + "epoch": 0.10666666666666667, + "grad_norm": 0.3221983151649732, + "learning_rate": 0.00019699786875118747, + "loss": 0.8102, + "step": 100 + }, + { + "epoch": 0.10773333333333333, + "grad_norm": 0.32357950261485013, + "learning_rate": 0.00019691314696305913, + "loss": 0.759, + "step": 101 + }, + { + "epoch": 0.1088, + "grad_norm": 0.3688826265112733, + "learning_rate": 0.00019682726503517017, + "loss": 0.7623, + "step": 102 + }, + { + "epoch": 0.10986666666666667, + "grad_norm": 0.3157913419017387, + "learning_rate": 0.00019674022399560648, + "loss": 0.7549, + "step": 103 + }, + { + "epoch": 0.11093333333333333, + "grad_norm": 0.3199130056921602, + "learning_rate": 0.00019665202488632956, + "loss": 0.797, + "step": 104 + }, + { + "epoch": 0.112, + "grad_norm": 0.34271816025751367, + "learning_rate": 0.0001965626687631641, + "loss": 0.7837, + "step": 105 + }, + { + "epoch": 0.11306666666666666, + "grad_norm": 1.1749314330347793, + "learning_rate": 0.00019647215669578536, + "loss": 0.7909, + "step": 106 + }, + { + "epoch": 0.11413333333333334, + "grad_norm": 0.3047250811610679, + "learning_rate": 0.00019638048976770628, + "loss": 0.7369, + "step": 107 + }, + { + "epoch": 0.1152, + "grad_norm": 0.3471502915554745, + "learning_rate": 0.00019628766907626446, + "loss": 0.8222, + "step": 108 + }, + { + "epoch": 0.11626666666666667, + "grad_norm": 0.31661974043563507, + "learning_rate": 0.00019619369573260924, + "loss": 0.7579, + "step": 109 + }, + { + "epoch": 0.11733333333333333, + "grad_norm": 0.3414034529581211, + "learning_rate": 0.00019609857086168823, + "loss": 0.7917, + "step": 110 + }, + { + "epoch": 0.1184, + "grad_norm": 0.32742097941667897, + "learning_rate": 0.00019600229560223388, + "loss": 0.8256, + "step": 111 + }, + { + "epoch": 0.11946666666666667, + "grad_norm": 0.3132305449791067, + "learning_rate": 0.00019590487110674983, + "loss": 0.7544, + "step": 112 + }, + { + "epoch": 0.12053333333333334, + "grad_norm": 0.3248074990909413, + "learning_rate": 0.0001958062985414972, + "loss": 0.7896, + "step": 113 + }, + { + "epoch": 0.1216, + "grad_norm": 0.31958249201953143, + "learning_rate": 0.00019570657908648048, + "loss": 0.7124, + "step": 114 + }, + { + "epoch": 0.12266666666666666, + "grad_norm": 0.3302381937004531, + "learning_rate": 0.0001956057139354335, + "loss": 0.7946, + "step": 115 + }, + { + "epoch": 0.12373333333333333, + "grad_norm": 0.35980190664873524, + "learning_rate": 0.0001955037042958052, + "loss": 0.8117, + "step": 116 + }, + { + "epoch": 0.1248, + "grad_norm": 0.28062167044982156, + "learning_rate": 0.00019540055138874505, + "loss": 0.7172, + "step": 117 + }, + { + "epoch": 0.12586666666666665, + "grad_norm": 0.36503321665028093, + "learning_rate": 0.00019529625644908847, + "loss": 0.7969, + "step": 118 + }, + { + "epoch": 0.12693333333333334, + "grad_norm": 0.3304952954224699, + "learning_rate": 0.0001951908207253421, + "loss": 0.8017, + "step": 119 + }, + { + "epoch": 0.128, + "grad_norm": 0.3504957437371224, + "learning_rate": 0.00019508424547966884, + "loss": 0.7584, + "step": 120 + }, + { + "epoch": 0.12906666666666666, + "grad_norm": 0.3099650890513928, + "learning_rate": 0.00019497653198787264, + "loss": 0.7527, + "step": 121 + }, + { + "epoch": 0.13013333333333332, + "grad_norm": 0.32179057694112095, + "learning_rate": 0.00019486768153938338, + "loss": 0.7741, + "step": 122 + }, + { + "epoch": 0.1312, + "grad_norm": 0.30322277540797365, + "learning_rate": 0.0001947576954372413, + "loss": 0.7481, + "step": 123 + }, + { + "epoch": 0.13226666666666667, + "grad_norm": 0.2925102959156477, + "learning_rate": 0.00019464657499808152, + "loss": 0.73, + "step": 124 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 0.3370938546775735, + "learning_rate": 0.0001945343215521182, + "loss": 0.7842, + "step": 125 + }, + { + "epoch": 0.1344, + "grad_norm": 0.33903637260094704, + "learning_rate": 0.0001944209364431286, + "loss": 0.784, + "step": 126 + }, + { + "epoch": 0.13546666666666668, + "grad_norm": 0.3145777113723304, + "learning_rate": 0.00019430642102843707, + "loss": 0.7201, + "step": 127 + }, + { + "epoch": 0.13653333333333334, + "grad_norm": 0.3258583284186643, + "learning_rate": 0.00019419077667889872, + "loss": 0.7463, + "step": 128 + }, + { + "epoch": 0.1376, + "grad_norm": 0.29104955145028755, + "learning_rate": 0.00019407400477888315, + "loss": 0.7396, + "step": 129 + }, + { + "epoch": 0.13866666666666666, + "grad_norm": 0.320537361595748, + "learning_rate": 0.00019395610672625767, + "loss": 0.7645, + "step": 130 + }, + { + "epoch": 0.13973333333333332, + "grad_norm": 0.32773393358493336, + "learning_rate": 0.00019383708393237075, + "loss": 0.776, + "step": 131 + }, + { + "epoch": 0.1408, + "grad_norm": 0.30795208108296096, + "learning_rate": 0.00019371693782203498, + "loss": 0.7558, + "step": 132 + }, + { + "epoch": 0.14186666666666667, + "grad_norm": 0.31080726259715846, + "learning_rate": 0.00019359566983351013, + "loss": 0.7598, + "step": 133 + }, + { + "epoch": 0.14293333333333333, + "grad_norm": 0.30560257019675857, + "learning_rate": 0.0001934732814184859, + "loss": 0.7791, + "step": 134 + }, + { + "epoch": 0.144, + "grad_norm": 0.30215125661680314, + "learning_rate": 0.00019334977404206443, + "loss": 0.7399, + "step": 135 + }, + { + "epoch": 0.14506666666666668, + "grad_norm": 0.31244001707200425, + "learning_rate": 0.00019322514918274308, + "loss": 0.746, + "step": 136 + }, + { + "epoch": 0.14613333333333334, + "grad_norm": 0.30641046306359077, + "learning_rate": 0.00019309940833239626, + "loss": 0.7559, + "step": 137 + }, + { + "epoch": 0.1472, + "grad_norm": 0.3077699271034951, + "learning_rate": 0.00019297255299625797, + "loss": 0.7623, + "step": 138 + }, + { + "epoch": 0.14826666666666666, + "grad_norm": 0.3006958802209959, + "learning_rate": 0.00019284458469290354, + "loss": 0.7601, + "step": 139 + }, + { + "epoch": 0.14933333333333335, + "grad_norm": 0.28871077834134756, + "learning_rate": 0.00019271550495423168, + "loss": 0.7512, + "step": 140 + }, + { + "epoch": 0.1504, + "grad_norm": 0.30479592833280356, + "learning_rate": 0.00019258531532544585, + "loss": 0.7887, + "step": 141 + }, + { + "epoch": 0.15146666666666667, + "grad_norm": 0.29438431708112245, + "learning_rate": 0.00019245401736503608, + "loss": 0.7641, + "step": 142 + }, + { + "epoch": 0.15253333333333333, + "grad_norm": 0.3698190550809041, + "learning_rate": 0.00019232161264475997, + "loss": 0.8312, + "step": 143 + }, + { + "epoch": 0.1536, + "grad_norm": 0.32113905470036225, + "learning_rate": 0.00019218810274962417, + "loss": 0.7608, + "step": 144 + }, + { + "epoch": 0.15466666666666667, + "grad_norm": 0.34743845197123385, + "learning_rate": 0.00019205348927786532, + "loss": 0.7892, + "step": 145 + }, + { + "epoch": 0.15573333333333333, + "grad_norm": 0.3140052946624941, + "learning_rate": 0.00019191777384093081, + "loss": 0.7224, + "step": 146 + }, + { + "epoch": 0.1568, + "grad_norm": 0.31360127159357665, + "learning_rate": 0.0001917809580634596, + "loss": 0.8009, + "step": 147 + }, + { + "epoch": 0.15786666666666666, + "grad_norm": 0.2964427898232695, + "learning_rate": 0.00019164304358326275, + "loss": 0.7203, + "step": 148 + }, + { + "epoch": 0.15893333333333334, + "grad_norm": 0.3235804028497726, + "learning_rate": 0.00019150403205130383, + "loss": 0.7871, + "step": 149 + }, + { + "epoch": 0.16, + "grad_norm": 0.30056816519980334, + "learning_rate": 0.00019136392513167903, + "loss": 0.7468, + "step": 150 + }, + { + "epoch": 0.16106666666666666, + "grad_norm": 0.3002837519853485, + "learning_rate": 0.00019122272450159745, + "loss": 0.7717, + "step": 151 + }, + { + "epoch": 0.16213333333333332, + "grad_norm": 0.31696226829654534, + "learning_rate": 0.0001910804318513609, + "loss": 0.8007, + "step": 152 + }, + { + "epoch": 0.1632, + "grad_norm": 0.30048603236796223, + "learning_rate": 0.0001909370488843436, + "loss": 0.7377, + "step": 153 + }, + { + "epoch": 0.16426666666666667, + "grad_norm": 0.3178311258315863, + "learning_rate": 0.00019079257731697196, + "loss": 0.7642, + "step": 154 + }, + { + "epoch": 0.16533333333333333, + "grad_norm": 0.3607299857080969, + "learning_rate": 0.0001906470188787039, + "loss": 0.7684, + "step": 155 + }, + { + "epoch": 0.1664, + "grad_norm": 0.33000609227428035, + "learning_rate": 0.00019050037531200814, + "loss": 0.7874, + "step": 156 + }, + { + "epoch": 0.16746666666666668, + "grad_norm": 0.32170495531523496, + "learning_rate": 0.00019035264837234347, + "loss": 0.7373, + "step": 157 + }, + { + "epoch": 0.16853333333333334, + "grad_norm": 0.318770211379808, + "learning_rate": 0.00019020383982813765, + "loss": 0.7532, + "step": 158 + }, + { + "epoch": 0.1696, + "grad_norm": 0.31962795313591713, + "learning_rate": 0.00019005395146076616, + "loss": 0.7478, + "step": 159 + }, + { + "epoch": 0.17066666666666666, + "grad_norm": 0.32498100332958324, + "learning_rate": 0.00018990298506453104, + "loss": 0.7682, + "step": 160 + }, + { + "epoch": 0.17173333333333332, + "grad_norm": 0.3122681763794844, + "learning_rate": 0.0001897509424466393, + "loss": 0.7533, + "step": 161 + }, + { + "epoch": 0.1728, + "grad_norm": 0.32638067359801876, + "learning_rate": 0.00018959782542718128, + "loss": 0.792, + "step": 162 + }, + { + "epoch": 0.17386666666666667, + "grad_norm": 0.31987561741947407, + "learning_rate": 0.000189443635839109, + "loss": 0.7518, + "step": 163 + }, + { + "epoch": 0.17493333333333333, + "grad_norm": 0.30041151587312104, + "learning_rate": 0.00018928837552821404, + "loss": 0.7134, + "step": 164 + }, + { + "epoch": 0.176, + "grad_norm": 0.2970901377274106, + "learning_rate": 0.0001891320463531055, + "loss": 0.6934, + "step": 165 + }, + { + "epoch": 0.17706666666666668, + "grad_norm": 0.32022861895985255, + "learning_rate": 0.00018897465018518782, + "loss": 0.7538, + "step": 166 + }, + { + "epoch": 0.17813333333333334, + "grad_norm": 0.30419609260958613, + "learning_rate": 0.0001888161889086383, + "loss": 0.7491, + "step": 167 + }, + { + "epoch": 0.1792, + "grad_norm": 0.31972446389836073, + "learning_rate": 0.00018865666442038456, + "loss": 0.7714, + "step": 168 + }, + { + "epoch": 0.18026666666666666, + "grad_norm": 0.3299470052921253, + "learning_rate": 0.00018849607863008193, + "loss": 0.7506, + "step": 169 + }, + { + "epoch": 0.18133333333333335, + "grad_norm": 0.28517024164539245, + "learning_rate": 0.0001883344334600904, + "loss": 0.7135, + "step": 170 + }, + { + "epoch": 0.1824, + "grad_norm": 0.3224558700250482, + "learning_rate": 0.00018817173084545176, + "loss": 0.7158, + "step": 171 + }, + { + "epoch": 0.18346666666666667, + "grad_norm": 0.30270891189616717, + "learning_rate": 0.0001880079727338664, + "loss": 0.7356, + "step": 172 + }, + { + "epoch": 0.18453333333333333, + "grad_norm": 0.3388519846376595, + "learning_rate": 0.00018784316108566996, + "loss": 0.7896, + "step": 173 + }, + { + "epoch": 0.1856, + "grad_norm": 0.2816035133503061, + "learning_rate": 0.00018767729787380985, + "loss": 0.7268, + "step": 174 + }, + { + "epoch": 0.18666666666666668, + "grad_norm": 0.2981926946358348, + "learning_rate": 0.00018751038508382176, + "loss": 0.7608, + "step": 175 + }, + { + "epoch": 0.18773333333333334, + "grad_norm": 0.3011299811184157, + "learning_rate": 0.00018734242471380572, + "loss": 0.7432, + "step": 176 + }, + { + "epoch": 0.1888, + "grad_norm": 0.32803280465133966, + "learning_rate": 0.00018717341877440226, + "loss": 0.8206, + "step": 177 + }, + { + "epoch": 0.18986666666666666, + "grad_norm": 0.313769001249352, + "learning_rate": 0.0001870033692887684, + "loss": 0.7918, + "step": 178 + }, + { + "epoch": 0.19093333333333334, + "grad_norm": 0.30339604523132696, + "learning_rate": 0.00018683227829255334, + "loss": 0.7099, + "step": 179 + }, + { + "epoch": 0.192, + "grad_norm": 0.3006045830411854, + "learning_rate": 0.00018666014783387408, + "loss": 0.7431, + "step": 180 + }, + { + "epoch": 0.19306666666666666, + "grad_norm": 0.314679925997079, + "learning_rate": 0.000186486979973291, + "loss": 0.7509, + "step": 181 + }, + { + "epoch": 0.19413333333333332, + "grad_norm": 0.33019330905158284, + "learning_rate": 0.0001863127767837831, + "loss": 0.7525, + "step": 182 + }, + { + "epoch": 0.1952, + "grad_norm": 0.31765891033121607, + "learning_rate": 0.0001861375403507233, + "loss": 0.7908, + "step": 183 + }, + { + "epoch": 0.19626666666666667, + "grad_norm": 0.31004466226608485, + "learning_rate": 0.00018596127277185329, + "loss": 0.7735, + "step": 184 + }, + { + "epoch": 0.19733333333333333, + "grad_norm": 0.2877227986417195, + "learning_rate": 0.0001857839761572586, + "loss": 0.6979, + "step": 185 + }, + { + "epoch": 0.1984, + "grad_norm": 0.3345401794189286, + "learning_rate": 0.00018560565262934318, + "loss": 0.7938, + "step": 186 + }, + { + "epoch": 0.19946666666666665, + "grad_norm": 0.31992280535145035, + "learning_rate": 0.00018542630432280422, + "loss": 0.7865, + "step": 187 + }, + { + "epoch": 0.20053333333333334, + "grad_norm": 0.29618585685440163, + "learning_rate": 0.00018524593338460635, + "loss": 0.7257, + "step": 188 + }, + { + "epoch": 0.2016, + "grad_norm": 0.2808123672759863, + "learning_rate": 0.00018506454197395606, + "loss": 0.6761, + "step": 189 + }, + { + "epoch": 0.20266666666666666, + "grad_norm": 0.32676433205976424, + "learning_rate": 0.00018488213226227588, + "loss": 0.7334, + "step": 190 + }, + { + "epoch": 0.20373333333333332, + "grad_norm": 0.3139748760986525, + "learning_rate": 0.0001846987064331783, + "loss": 0.7282, + "step": 191 + }, + { + "epoch": 0.2048, + "grad_norm": 0.32007869336218764, + "learning_rate": 0.00018451426668243963, + "loss": 0.7419, + "step": 192 + }, + { + "epoch": 0.20586666666666667, + "grad_norm": 0.2976081340068327, + "learning_rate": 0.0001843288152179739, + "loss": 0.7606, + "step": 193 + }, + { + "epoch": 0.20693333333333333, + "grad_norm": 0.286432862770985, + "learning_rate": 0.00018414235425980616, + "loss": 0.6929, + "step": 194 + }, + { + "epoch": 0.208, + "grad_norm": 0.2859293393462299, + "learning_rate": 0.00018395488604004603, + "loss": 0.7439, + "step": 195 + }, + { + "epoch": 0.20906666666666668, + "grad_norm": 0.3280792485884969, + "learning_rate": 0.00018376641280286107, + "loss": 0.7909, + "step": 196 + }, + { + "epoch": 0.21013333333333334, + "grad_norm": 0.2950017517796392, + "learning_rate": 0.00018357693680444976, + "loss": 0.6992, + "step": 197 + }, + { + "epoch": 0.2112, + "grad_norm": 0.3010421415813137, + "learning_rate": 0.00018338646031301458, + "loss": 0.738, + "step": 198 + }, + { + "epoch": 0.21226666666666666, + "grad_norm": 0.2871613748711631, + "learning_rate": 0.00018319498560873476, + "loss": 0.7175, + "step": 199 + }, + { + "epoch": 0.21333333333333335, + "grad_norm": 0.3178582558772246, + "learning_rate": 0.00018300251498373923, + "loss": 0.7615, + "step": 200 + }, + { + "epoch": 0.2144, + "grad_norm": 0.3479160766860369, + "learning_rate": 0.00018280905074207884, + "loss": 0.7944, + "step": 201 + }, + { + "epoch": 0.21546666666666667, + "grad_norm": 0.32393029697176035, + "learning_rate": 0.000182614595199699, + "loss": 0.7462, + "step": 202 + }, + { + "epoch": 0.21653333333333333, + "grad_norm": 0.2979591990515668, + "learning_rate": 0.00018241915068441196, + "loss": 0.7283, + "step": 203 + }, + { + "epoch": 0.2176, + "grad_norm": 0.32340201194214113, + "learning_rate": 0.00018222271953586883, + "loss": 0.7416, + "step": 204 + }, + { + "epoch": 0.21866666666666668, + "grad_norm": 0.3342851972955029, + "learning_rate": 0.00018202530410553163, + "loss": 0.7808, + "step": 205 + }, + { + "epoch": 0.21973333333333334, + "grad_norm": 0.33113966231844666, + "learning_rate": 0.00018182690675664514, + "loss": 0.7259, + "step": 206 + }, + { + "epoch": 0.2208, + "grad_norm": 0.31090205478518784, + "learning_rate": 0.00018162752986420868, + "loss": 0.7502, + "step": 207 + }, + { + "epoch": 0.22186666666666666, + "grad_norm": 0.2781076391987126, + "learning_rate": 0.0001814271758149475, + "loss": 0.7376, + "step": 208 + }, + { + "epoch": 0.22293333333333334, + "grad_norm": 0.29378185169500126, + "learning_rate": 0.00018122584700728443, + "loss": 0.7587, + "step": 209 + }, + { + "epoch": 0.224, + "grad_norm": 0.2948693355021739, + "learning_rate": 0.00018102354585131092, + "loss": 0.7272, + "step": 210 + }, + { + "epoch": 0.22506666666666666, + "grad_norm": 0.31430848427810804, + "learning_rate": 0.00018082027476875847, + "loss": 0.7399, + "step": 211 + }, + { + "epoch": 0.22613333333333333, + "grad_norm": 0.323709165255713, + "learning_rate": 0.00018061603619296942, + "loss": 0.7624, + "step": 212 + }, + { + "epoch": 0.2272, + "grad_norm": 0.2968511519545849, + "learning_rate": 0.0001804108325688679, + "loss": 0.7807, + "step": 213 + }, + { + "epoch": 0.22826666666666667, + "grad_norm": 0.2819216924645662, + "learning_rate": 0.00018020466635293057, + "loss": 0.7111, + "step": 214 + }, + { + "epoch": 0.22933333333333333, + "grad_norm": 0.3025250296590155, + "learning_rate": 0.0001799975400131572, + "loss": 0.7606, + "step": 215 + }, + { + "epoch": 0.2304, + "grad_norm": 0.3061912629543229, + "learning_rate": 0.00017978945602904116, + "loss": 0.7556, + "step": 216 + }, + { + "epoch": 0.23146666666666665, + "grad_norm": 0.2827904474986111, + "learning_rate": 0.0001795804168915396, + "loss": 0.7395, + "step": 217 + }, + { + "epoch": 0.23253333333333334, + "grad_norm": 0.28857293213283614, + "learning_rate": 0.00017937042510304392, + "loss": 0.7666, + "step": 218 + }, + { + "epoch": 0.2336, + "grad_norm": 0.3128780749972699, + "learning_rate": 0.00017915948317734942, + "loss": 0.7465, + "step": 219 + }, + { + "epoch": 0.23466666666666666, + "grad_norm": 0.32906725700855655, + "learning_rate": 0.00017894759363962554, + "loss": 0.7839, + "step": 220 + }, + { + "epoch": 0.23573333333333332, + "grad_norm": 0.29731516228232285, + "learning_rate": 0.00017873475902638553, + "loss": 0.7153, + "step": 221 + }, + { + "epoch": 0.2368, + "grad_norm": 0.2928230641997966, + "learning_rate": 0.00017852098188545602, + "loss": 0.7008, + "step": 222 + }, + { + "epoch": 0.23786666666666667, + "grad_norm": 0.2914718534521632, + "learning_rate": 0.00017830626477594654, + "loss": 0.7333, + "step": 223 + }, + { + "epoch": 0.23893333333333333, + "grad_norm": 0.32700146595596147, + "learning_rate": 0.00017809061026821896, + "loss": 0.7801, + "step": 224 + }, + { + "epoch": 0.24, + "grad_norm": 0.2763731273064379, + "learning_rate": 0.00017787402094385666, + "loss": 0.685, + "step": 225 + }, + { + "epoch": 0.24106666666666668, + "grad_norm": 0.2929504436109405, + "learning_rate": 0.00017765649939563365, + "loss": 0.7488, + "step": 226 + }, + { + "epoch": 0.24213333333333334, + "grad_norm": 0.2970130447161082, + "learning_rate": 0.00017743804822748345, + "loss": 0.7703, + "step": 227 + }, + { + "epoch": 0.2432, + "grad_norm": 0.33159059092356574, + "learning_rate": 0.00017721867005446806, + "loss": 0.7384, + "step": 228 + }, + { + "epoch": 0.24426666666666666, + "grad_norm": 0.2742392922768844, + "learning_rate": 0.00017699836750274662, + "loss": 0.6779, + "step": 229 + }, + { + "epoch": 0.24533333333333332, + "grad_norm": 0.2768590001638933, + "learning_rate": 0.00017677714320954378, + "loss": 0.6825, + "step": 230 + }, + { + "epoch": 0.2464, + "grad_norm": 0.32132265567577706, + "learning_rate": 0.00017655499982311847, + "loss": 0.7196, + "step": 231 + }, + { + "epoch": 0.24746666666666667, + "grad_norm": 0.29627429210170386, + "learning_rate": 0.00017633194000273188, + "loss": 0.7544, + "step": 232 + }, + { + "epoch": 0.24853333333333333, + "grad_norm": 0.27463251072949696, + "learning_rate": 0.00017610796641861581, + "loss": 0.6652, + "step": 233 + }, + { + "epoch": 0.2496, + "grad_norm": 0.2932904007753125, + "learning_rate": 0.0001758830817519407, + "loss": 0.7397, + "step": 234 + }, + { + "epoch": 0.25066666666666665, + "grad_norm": 0.28310614670643797, + "learning_rate": 0.00017565728869478337, + "loss": 0.7179, + "step": 235 + }, + { + "epoch": 0.2517333333333333, + "grad_norm": 0.3024834857591237, + "learning_rate": 0.00017543058995009503, + "loss": 0.7529, + "step": 236 + }, + { + "epoch": 0.2528, + "grad_norm": 0.3136320435854579, + "learning_rate": 0.00017520298823166873, + "loss": 0.7364, + "step": 237 + }, + { + "epoch": 0.2538666666666667, + "grad_norm": 0.2907929595807934, + "learning_rate": 0.000174974486264107, + "loss": 0.6681, + "step": 238 + }, + { + "epoch": 0.25493333333333335, + "grad_norm": 0.3131364120157505, + "learning_rate": 0.00017474508678278915, + "loss": 0.7363, + "step": 239 + }, + { + "epoch": 0.256, + "grad_norm": 0.29983111182956595, + "learning_rate": 0.00017451479253383857, + "loss": 0.7326, + "step": 240 + }, + { + "epoch": 0.25706666666666667, + "grad_norm": 0.3207725269511036, + "learning_rate": 0.00017428360627408978, + "loss": 0.701, + "step": 241 + }, + { + "epoch": 0.2581333333333333, + "grad_norm": 0.3861142552859371, + "learning_rate": 0.0001740515307710557, + "loss": 0.7684, + "step": 242 + }, + { + "epoch": 0.2592, + "grad_norm": 0.31487307045532403, + "learning_rate": 0.000173818568802894, + "loss": 0.7591, + "step": 243 + }, + { + "epoch": 0.26026666666666665, + "grad_norm": 0.2958529377111997, + "learning_rate": 0.00017358472315837447, + "loss": 0.7488, + "step": 244 + }, + { + "epoch": 0.2613333333333333, + "grad_norm": 0.3068972347088698, + "learning_rate": 0.00017334999663684504, + "loss": 0.8011, + "step": 245 + }, + { + "epoch": 0.2624, + "grad_norm": 0.28791528208628586, + "learning_rate": 0.00017311439204819874, + "loss": 0.7263, + "step": 246 + }, + { + "epoch": 0.2634666666666667, + "grad_norm": 0.29223396448803846, + "learning_rate": 0.00017287791221283984, + "loss": 0.7267, + "step": 247 + }, + { + "epoch": 0.26453333333333334, + "grad_norm": 0.2666690598809469, + "learning_rate": 0.00017264055996165007, + "loss": 0.6668, + "step": 248 + }, + { + "epoch": 0.2656, + "grad_norm": 0.3309840970310266, + "learning_rate": 0.00017240233813595478, + "loss": 0.7258, + "step": 249 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 0.2691754319488731, + "learning_rate": 0.000172163249587489, + "loss": 0.7061, + "step": 250 + }, + { + "epoch": 0.2677333333333333, + "grad_norm": 0.29528374904113114, + "learning_rate": 0.00017192329717836315, + "loss": 0.7466, + "step": 251 + }, + { + "epoch": 0.2688, + "grad_norm": 0.33205946639312367, + "learning_rate": 0.00017168248378102892, + "loss": 0.8233, + "step": 252 + }, + { + "epoch": 0.26986666666666664, + "grad_norm": 0.29562064965308293, + "learning_rate": 0.0001714408122782448, + "loss": 0.7121, + "step": 253 + }, + { + "epoch": 0.27093333333333336, + "grad_norm": 0.3012135977466715, + "learning_rate": 0.0001711982855630416, + "loss": 0.7437, + "step": 254 + }, + { + "epoch": 0.272, + "grad_norm": 0.2827803225931734, + "learning_rate": 0.00017095490653868778, + "loss": 0.7008, + "step": 255 + }, + { + "epoch": 0.2730666666666667, + "grad_norm": 0.29491210506131904, + "learning_rate": 0.00017071067811865476, + "loss": 0.7075, + "step": 256 + }, + { + "epoch": 0.27413333333333334, + "grad_norm": 0.3051925897943795, + "learning_rate": 0.000170465603226582, + "loss": 0.7325, + "step": 257 + }, + { + "epoch": 0.2752, + "grad_norm": 0.31857535328982267, + "learning_rate": 0.00017021968479624203, + "loss": 0.768, + "step": 258 + }, + { + "epoch": 0.27626666666666666, + "grad_norm": 0.3009770540923035, + "learning_rate": 0.00016997292577150528, + "loss": 0.7494, + "step": 259 + }, + { + "epoch": 0.2773333333333333, + "grad_norm": 0.2921214285940776, + "learning_rate": 0.0001697253291063049, + "loss": 0.7767, + "step": 260 + }, + { + "epoch": 0.2784, + "grad_norm": 0.29498942574732173, + "learning_rate": 0.0001694768977646013, + "loss": 0.726, + "step": 261 + }, + { + "epoch": 0.27946666666666664, + "grad_norm": 0.2916601404697442, + "learning_rate": 0.00016922763472034685, + "loss": 0.7162, + "step": 262 + }, + { + "epoch": 0.28053333333333336, + "grad_norm": 0.28962661000374823, + "learning_rate": 0.00016897754295745008, + "loss": 0.7053, + "step": 263 + }, + { + "epoch": 0.2816, + "grad_norm": 0.29026392914900373, + "learning_rate": 0.00016872662546974008, + "loss": 0.6746, + "step": 264 + }, + { + "epoch": 0.2826666666666667, + "grad_norm": 0.2989724726717724, + "learning_rate": 0.0001684748852609306, + "loss": 0.7048, + "step": 265 + }, + { + "epoch": 0.28373333333333334, + "grad_norm": 0.29731514656338986, + "learning_rate": 0.00016822232534458416, + "loss": 0.7135, + "step": 266 + }, + { + "epoch": 0.2848, + "grad_norm": 0.3043035518302015, + "learning_rate": 0.00016796894874407595, + "loss": 0.7428, + "step": 267 + }, + { + "epoch": 0.28586666666666666, + "grad_norm": 0.29378761374146967, + "learning_rate": 0.00016771475849255754, + "loss": 0.7102, + "step": 268 + }, + { + "epoch": 0.2869333333333333, + "grad_norm": 0.29062578316610915, + "learning_rate": 0.0001674597576329207, + "loss": 0.7206, + "step": 269 + }, + { + "epoch": 0.288, + "grad_norm": 0.3093426154655176, + "learning_rate": 0.00016720394921776097, + "loss": 0.7584, + "step": 270 + }, + { + "epoch": 0.2890666666666667, + "grad_norm": 0.30174166383165807, + "learning_rate": 0.000166947336309341, + "loss": 0.7342, + "step": 271 + }, + { + "epoch": 0.29013333333333335, + "grad_norm": 0.29456769132732447, + "learning_rate": 0.00016668992197955398, + "loss": 0.7371, + "step": 272 + }, + { + "epoch": 0.2912, + "grad_norm": 0.3297911302830033, + "learning_rate": 0.00016643170930988698, + "loss": 0.7862, + "step": 273 + }, + { + "epoch": 0.2922666666666667, + "grad_norm": 0.30393752220479153, + "learning_rate": 0.00016617270139138371, + "loss": 0.7646, + "step": 274 + }, + { + "epoch": 0.29333333333333333, + "grad_norm": 0.30104010921831503, + "learning_rate": 0.0001659129013246079, + "loss": 0.7163, + "step": 275 + }, + { + "epoch": 0.2944, + "grad_norm": 0.35874325189483025, + "learning_rate": 0.000165652312219606, + "loss": 0.7971, + "step": 276 + }, + { + "epoch": 0.29546666666666666, + "grad_norm": 0.26915126189349153, + "learning_rate": 0.00016539093719586994, + "loss": 0.6894, + "step": 277 + }, + { + "epoch": 0.2965333333333333, + "grad_norm": 0.30440693909445965, + "learning_rate": 0.00016512877938229986, + "loss": 0.7389, + "step": 278 + }, + { + "epoch": 0.2976, + "grad_norm": 0.2795376731367992, + "learning_rate": 0.0001648658419171666, + "loss": 0.6617, + "step": 279 + }, + { + "epoch": 0.2986666666666667, + "grad_norm": 0.27639176627018, + "learning_rate": 0.00016460212794807414, + "loss": 0.7544, + "step": 280 + }, + { + "epoch": 0.29973333333333335, + "grad_norm": 0.28378425437363497, + "learning_rate": 0.00016433764063192194, + "loss": 0.7048, + "step": 281 + }, + { + "epoch": 0.3008, + "grad_norm": 0.28315366675197184, + "learning_rate": 0.00016407238313486712, + "loss": 0.7256, + "step": 282 + }, + { + "epoch": 0.30186666666666667, + "grad_norm": 0.29143112277983785, + "learning_rate": 0.0001638063586322866, + "loss": 0.7477, + "step": 283 + }, + { + "epoch": 0.30293333333333333, + "grad_norm": 0.3109336287354725, + "learning_rate": 0.0001635395703087391, + "loss": 0.7458, + "step": 284 + }, + { + "epoch": 0.304, + "grad_norm": 0.30280188045828377, + "learning_rate": 0.00016327202135792685, + "loss": 0.7217, + "step": 285 + }, + { + "epoch": 0.30506666666666665, + "grad_norm": 0.2854188665742502, + "learning_rate": 0.00016300371498265763, + "loss": 0.7083, + "step": 286 + }, + { + "epoch": 0.3061333333333333, + "grad_norm": 0.33213256633408716, + "learning_rate": 0.00016273465439480618, + "loss": 0.7722, + "step": 287 + }, + { + "epoch": 0.3072, + "grad_norm": 0.3178323971874437, + "learning_rate": 0.000162464842815276, + "loss": 0.7055, + "step": 288 + }, + { + "epoch": 0.3082666666666667, + "grad_norm": 0.28764951172464187, + "learning_rate": 0.00016219428347396053, + "loss": 0.7144, + "step": 289 + }, + { + "epoch": 0.30933333333333335, + "grad_norm": 0.3164517443762592, + "learning_rate": 0.0001619229796097046, + "loss": 0.7692, + "step": 290 + }, + { + "epoch": 0.3104, + "grad_norm": 0.2862059612445801, + "learning_rate": 0.0001616509344702658, + "loss": 0.695, + "step": 291 + }, + { + "epoch": 0.31146666666666667, + "grad_norm": 0.29574185361704974, + "learning_rate": 0.00016137815131227526, + "loss": 0.6988, + "step": 292 + }, + { + "epoch": 0.31253333333333333, + "grad_norm": 0.33822117083312314, + "learning_rate": 0.00016110463340119913, + "loss": 0.8098, + "step": 293 + }, + { + "epoch": 0.3136, + "grad_norm": 0.27989243149301973, + "learning_rate": 0.000160830384011299, + "loss": 0.6968, + "step": 294 + }, + { + "epoch": 0.31466666666666665, + "grad_norm": 0.29850067554920934, + "learning_rate": 0.00016055540642559305, + "loss": 0.6945, + "step": 295 + }, + { + "epoch": 0.3157333333333333, + "grad_norm": 0.2861496386709042, + "learning_rate": 0.00016027970393581666, + "loss": 0.6975, + "step": 296 + }, + { + "epoch": 0.3168, + "grad_norm": 0.2990508342435385, + "learning_rate": 0.00016000327984238292, + "loss": 0.719, + "step": 297 + }, + { + "epoch": 0.3178666666666667, + "grad_norm": 0.28785342962563926, + "learning_rate": 0.00015972613745434314, + "loss": 0.6655, + "step": 298 + }, + { + "epoch": 0.31893333333333335, + "grad_norm": 0.30001158652537224, + "learning_rate": 0.0001594482800893474, + "loss": 0.748, + "step": 299 + }, + { + "epoch": 0.32, + "grad_norm": 0.29072964378791066, + "learning_rate": 0.00015916971107360461, + "loss": 0.7082, + "step": 300 + }, + { + "epoch": 0.32106666666666667, + "grad_norm": 0.29890261400540596, + "learning_rate": 0.00015889043374184286, + "loss": 0.7327, + "step": 301 + }, + { + "epoch": 0.3221333333333333, + "grad_norm": 0.3228171591846049, + "learning_rate": 0.00015861045143726946, + "loss": 0.7367, + "step": 302 + }, + { + "epoch": 0.3232, + "grad_norm": 0.2902748295223263, + "learning_rate": 0.00015832976751153078, + "loss": 0.7129, + "step": 303 + }, + { + "epoch": 0.32426666666666665, + "grad_norm": 0.30275292475490034, + "learning_rate": 0.0001580483853246723, + "loss": 0.7838, + "step": 304 + }, + { + "epoch": 0.3253333333333333, + "grad_norm": 0.2668890914947514, + "learning_rate": 0.0001577663082450984, + "loss": 0.675, + "step": 305 + }, + { + "epoch": 0.3264, + "grad_norm": 0.3040406339357325, + "learning_rate": 0.00015748353964953186, + "loss": 0.687, + "step": 306 + }, + { + "epoch": 0.3274666666666667, + "grad_norm": 0.29342135112984513, + "learning_rate": 0.00015720008292297364, + "loss": 0.7259, + "step": 307 + }, + { + "epoch": 0.32853333333333334, + "grad_norm": 0.28860332771058517, + "learning_rate": 0.00015691594145866215, + "loss": 0.7066, + "step": 308 + }, + { + "epoch": 0.3296, + "grad_norm": 0.31672975257705877, + "learning_rate": 0.00015663111865803285, + "loss": 0.7977, + "step": 309 + }, + { + "epoch": 0.33066666666666666, + "grad_norm": 0.3171367453337259, + "learning_rate": 0.00015634561793067737, + "loss": 0.7414, + "step": 310 + }, + { + "epoch": 0.3317333333333333, + "grad_norm": 0.29429343878253417, + "learning_rate": 0.00015605944269430277, + "loss": 0.6851, + "step": 311 + }, + { + "epoch": 0.3328, + "grad_norm": 0.2833825568619266, + "learning_rate": 0.00015577259637469058, + "loss": 0.7171, + "step": 312 + }, + { + "epoch": 0.33386666666666664, + "grad_norm": 0.31890104033818384, + "learning_rate": 0.00015548508240565583, + "loss": 0.7667, + "step": 313 + }, + { + "epoch": 0.33493333333333336, + "grad_norm": 0.3244450924884595, + "learning_rate": 0.00015519690422900593, + "loss": 0.8027, + "step": 314 + }, + { + "epoch": 0.336, + "grad_norm": 0.30332414268590085, + "learning_rate": 0.00015490806529449945, + "loss": 0.7086, + "step": 315 + }, + { + "epoch": 0.3370666666666667, + "grad_norm": 0.31232628731556034, + "learning_rate": 0.0001546185690598049, + "loss": 0.7693, + "step": 316 + }, + { + "epoch": 0.33813333333333334, + "grad_norm": 0.31348120638983434, + "learning_rate": 0.0001543284189904592, + "loss": 0.7452, + "step": 317 + }, + { + "epoch": 0.3392, + "grad_norm": 0.3118912344088803, + "learning_rate": 0.00015403761855982631, + "loss": 0.7417, + "step": 318 + }, + { + "epoch": 0.34026666666666666, + "grad_norm": 0.30450479497278976, + "learning_rate": 0.00015374617124905564, + "loss": 0.7544, + "step": 319 + }, + { + "epoch": 0.3413333333333333, + "grad_norm": 0.3239981443932927, + "learning_rate": 0.0001534540805470403, + "loss": 0.7603, + "step": 320 + }, + { + "epoch": 0.3424, + "grad_norm": 0.29261333880112783, + "learning_rate": 0.00015316134995037545, + "loss": 0.7358, + "step": 321 + }, + { + "epoch": 0.34346666666666664, + "grad_norm": 0.2885874006177945, + "learning_rate": 0.00015286798296331632, + "loss": 0.7224, + "step": 322 + }, + { + "epoch": 0.34453333333333336, + "grad_norm": 0.29543008226112166, + "learning_rate": 0.00015257398309773633, + "loss": 0.7312, + "step": 323 + }, + { + "epoch": 0.3456, + "grad_norm": 0.2957848212852695, + "learning_rate": 0.00015227935387308511, + "loss": 0.6929, + "step": 324 + }, + { + "epoch": 0.3466666666666667, + "grad_norm": 0.32926130408501947, + "learning_rate": 0.00015198409881634617, + "loss": 0.772, + "step": 325 + }, + { + "epoch": 0.34773333333333334, + "grad_norm": 0.2809669129045362, + "learning_rate": 0.0001516882214619949, + "loss": 0.6751, + "step": 326 + }, + { + "epoch": 0.3488, + "grad_norm": 0.28636906382774335, + "learning_rate": 0.00015139172535195617, + "loss": 0.6989, + "step": 327 + }, + { + "epoch": 0.34986666666666666, + "grad_norm": 0.30016437496912274, + "learning_rate": 0.0001510946140355619, + "loss": 0.7392, + "step": 328 + }, + { + "epoch": 0.3509333333333333, + "grad_norm": 0.2942959821556821, + "learning_rate": 0.00015079689106950854, + "loss": 0.6848, + "step": 329 + }, + { + "epoch": 0.352, + "grad_norm": 0.30165997763270475, + "learning_rate": 0.0001504985600178147, + "loss": 0.7109, + "step": 330 + }, + { + "epoch": 0.35306666666666664, + "grad_norm": 0.3167869656534145, + "learning_rate": 0.00015019962445177819, + "loss": 0.7919, + "step": 331 + }, + { + "epoch": 0.35413333333333336, + "grad_norm": 0.3134628091835621, + "learning_rate": 0.00014990008794993345, + "loss": 0.7248, + "step": 332 + }, + { + "epoch": 0.3552, + "grad_norm": 0.2903622009994375, + "learning_rate": 0.00014959995409800873, + "loss": 0.736, + "step": 333 + }, + { + "epoch": 0.3562666666666667, + "grad_norm": 0.29401512481649594, + "learning_rate": 0.00014929922648888308, + "loss": 0.7384, + "step": 334 + }, + { + "epoch": 0.35733333333333334, + "grad_norm": 0.28172948649947277, + "learning_rate": 0.0001489979087225434, + "loss": 0.6986, + "step": 335 + }, + { + "epoch": 0.3584, + "grad_norm": 0.2850310534609354, + "learning_rate": 0.00014869600440604118, + "loss": 0.7252, + "step": 336 + }, + { + "epoch": 0.35946666666666666, + "grad_norm": 0.2767374226067079, + "learning_rate": 0.00014839351715344968, + "loss": 0.7054, + "step": 337 + }, + { + "epoch": 0.3605333333333333, + "grad_norm": 0.31246054503597304, + "learning_rate": 0.00014809045058582026, + "loss": 0.7466, + "step": 338 + }, + { + "epoch": 0.3616, + "grad_norm": 0.30447686790541406, + "learning_rate": 0.00014778680833113926, + "loss": 0.7072, + "step": 339 + }, + { + "epoch": 0.3626666666666667, + "grad_norm": 0.27477522093219053, + "learning_rate": 0.00014748259402428462, + "loss": 0.6963, + "step": 340 + }, + { + "epoch": 0.36373333333333335, + "grad_norm": 0.29095451951383444, + "learning_rate": 0.00014717781130698212, + "loss": 0.7421, + "step": 341 + }, + { + "epoch": 0.3648, + "grad_norm": 0.30372583676503134, + "learning_rate": 0.00014687246382776205, + "loss": 0.7089, + "step": 342 + }, + { + "epoch": 0.3658666666666667, + "grad_norm": 0.28126555993254804, + "learning_rate": 0.00014656655524191537, + "loss": 0.6845, + "step": 343 + }, + { + "epoch": 0.36693333333333333, + "grad_norm": 0.30767316808921585, + "learning_rate": 0.0001462600892114501, + "loss": 0.733, + "step": 344 + }, + { + "epoch": 0.368, + "grad_norm": 0.31251721417449835, + "learning_rate": 0.00014595306940504716, + "loss": 0.7065, + "step": 345 + }, + { + "epoch": 0.36906666666666665, + "grad_norm": 0.3108818460327434, + "learning_rate": 0.00014564549949801694, + "loss": 0.7124, + "step": 346 + }, + { + "epoch": 0.3701333333333333, + "grad_norm": 0.282923514009023, + "learning_rate": 0.00014533738317225485, + "loss": 0.6908, + "step": 347 + }, + { + "epoch": 0.3712, + "grad_norm": 0.3303366957193466, + "learning_rate": 0.00014502872411619757, + "loss": 0.78, + "step": 348 + }, + { + "epoch": 0.3722666666666667, + "grad_norm": 0.3037307159828496, + "learning_rate": 0.00014471952602477866, + "loss": 0.6914, + "step": 349 + }, + { + "epoch": 0.37333333333333335, + "grad_norm": 0.2916449803269286, + "learning_rate": 0.0001444097925993845, + "loss": 0.6869, + "step": 350 + }, + { + "epoch": 0.3744, + "grad_norm": 0.3138562296402307, + "learning_rate": 0.0001440995275478099, + "loss": 0.7332, + "step": 351 + }, + { + "epoch": 0.37546666666666667, + "grad_norm": 0.2597771712776934, + "learning_rate": 0.0001437887345842137, + "loss": 0.6605, + "step": 352 + }, + { + "epoch": 0.37653333333333333, + "grad_norm": 0.3027758447845734, + "learning_rate": 0.00014347741742907433, + "loss": 0.7191, + "step": 353 + }, + { + "epoch": 0.3776, + "grad_norm": 0.2893071571942425, + "learning_rate": 0.00014316557980914528, + "loss": 0.7116, + "step": 354 + }, + { + "epoch": 0.37866666666666665, + "grad_norm": 0.2797832212193401, + "learning_rate": 0.00014285322545741052, + "loss": 0.6991, + "step": 355 + }, + { + "epoch": 0.3797333333333333, + "grad_norm": 0.3124620780913201, + "learning_rate": 0.0001425403581130398, + "loss": 0.7154, + "step": 356 + }, + { + "epoch": 0.3808, + "grad_norm": 0.2941373724733284, + "learning_rate": 0.00014222698152134374, + "loss": 0.7211, + "step": 357 + }, + { + "epoch": 0.3818666666666667, + "grad_norm": 0.2782094160086249, + "learning_rate": 0.0001419130994337292, + "loss": 0.7214, + "step": 358 + }, + { + "epoch": 0.38293333333333335, + "grad_norm": 0.2804505006644943, + "learning_rate": 0.00014159871560765432, + "loss": 0.6604, + "step": 359 + }, + { + "epoch": 0.384, + "grad_norm": 0.30212878218473016, + "learning_rate": 0.0001412838338065835, + "loss": 0.7252, + "step": 360 + }, + { + "epoch": 0.38506666666666667, + "grad_norm": 0.2702118918080904, + "learning_rate": 0.0001409684577999423, + "loss": 0.6071, + "step": 361 + }, + { + "epoch": 0.38613333333333333, + "grad_norm": 0.2892472891674336, + "learning_rate": 0.00014065259136307242, + "loss": 0.6771, + "step": 362 + }, + { + "epoch": 0.3872, + "grad_norm": 0.3284962152202465, + "learning_rate": 0.0001403362382771865, + "loss": 0.7286, + "step": 363 + }, + { + "epoch": 0.38826666666666665, + "grad_norm": 0.3159813553417371, + "learning_rate": 0.0001400194023293228, + "loss": 0.7421, + "step": 364 + }, + { + "epoch": 0.3893333333333333, + "grad_norm": 0.2680233418669452, + "learning_rate": 0.00013970208731229974, + "loss": 0.6407, + "step": 365 + }, + { + "epoch": 0.3904, + "grad_norm": 0.3696073633336626, + "learning_rate": 0.00013938429702467086, + "loss": 0.7358, + "step": 366 + }, + { + "epoch": 0.3914666666666667, + "grad_norm": 0.286676388334349, + "learning_rate": 0.000139066035270679, + "loss": 0.7204, + "step": 367 + }, + { + "epoch": 0.39253333333333335, + "grad_norm": 0.2839122477820604, + "learning_rate": 0.00013874730586021093, + "loss": 0.7149, + "step": 368 + }, + { + "epoch": 0.3936, + "grad_norm": 0.33071584092148254, + "learning_rate": 0.00013842811260875168, + "loss": 0.7061, + "step": 369 + }, + { + "epoch": 0.39466666666666667, + "grad_norm": 0.27418658262569773, + "learning_rate": 0.0001381084593373389, + "loss": 0.6862, + "step": 370 + }, + { + "epoch": 0.3957333333333333, + "grad_norm": 0.2804854915936261, + "learning_rate": 0.00013778834987251707, + "loss": 0.7242, + "step": 371 + }, + { + "epoch": 0.3968, + "grad_norm": 0.30504085364295, + "learning_rate": 0.00013746778804629177, + "loss": 0.7233, + "step": 372 + }, + { + "epoch": 0.39786666666666665, + "grad_norm": 0.27377761971187475, + "learning_rate": 0.0001371467776960837, + "loss": 0.6684, + "step": 373 + }, + { + "epoch": 0.3989333333333333, + "grad_norm": 0.30178576642031846, + "learning_rate": 0.0001368253226646829, + "loss": 0.684, + "step": 374 + }, + { + "epoch": 0.4, + "grad_norm": 0.29748127767091714, + "learning_rate": 0.00013650342680020258, + "loss": 0.7274, + "step": 375 + }, + { + "epoch": 0.4010666666666667, + "grad_norm": 0.29599308598306373, + "learning_rate": 0.00013618109395603317, + "loss": 0.6669, + "step": 376 + }, + { + "epoch": 0.40213333333333334, + "grad_norm": 0.29901688850136304, + "learning_rate": 0.0001358583279907961, + "loss": 0.7298, + "step": 377 + }, + { + "epoch": 0.4032, + "grad_norm": 0.2800474421972958, + "learning_rate": 0.0001355351327682977, + "loss": 0.7102, + "step": 378 + }, + { + "epoch": 0.40426666666666666, + "grad_norm": 0.292933712030632, + "learning_rate": 0.0001352115121574829, + "loss": 0.703, + "step": 379 + }, + { + "epoch": 0.4053333333333333, + "grad_norm": 0.26604413420465156, + "learning_rate": 0.00013488747003238892, + "loss": 0.6801, + "step": 380 + }, + { + "epoch": 0.4064, + "grad_norm": 0.2984747285500766, + "learning_rate": 0.00013456301027209882, + "loss": 0.7136, + "step": 381 + }, + { + "epoch": 0.40746666666666664, + "grad_norm": 0.3010120016237503, + "learning_rate": 0.00013423813676069534, + "loss": 0.7163, + "step": 382 + }, + { + "epoch": 0.40853333333333336, + "grad_norm": 0.29077295670348746, + "learning_rate": 0.000133912853387214, + "loss": 0.6896, + "step": 383 + }, + { + "epoch": 0.4096, + "grad_norm": 0.2734708039635259, + "learning_rate": 0.0001335871640455968, + "loss": 0.6581, + "step": 384 + }, + { + "epoch": 0.4106666666666667, + "grad_norm": 0.266365248452827, + "learning_rate": 0.00013326107263464558, + "loss": 0.7078, + "step": 385 + }, + { + "epoch": 0.41173333333333334, + "grad_norm": 0.3119455472011162, + "learning_rate": 0.00013293458305797533, + "loss": 0.7513, + "step": 386 + }, + { + "epoch": 0.4128, + "grad_norm": 0.31817694431513605, + "learning_rate": 0.0001326076992239674, + "loss": 0.7097, + "step": 387 + }, + { + "epoch": 0.41386666666666666, + "grad_norm": 0.2718709183308422, + "learning_rate": 0.00013228042504572285, + "loss": 0.6692, + "step": 388 + }, + { + "epoch": 0.4149333333333333, + "grad_norm": 0.2900547973410943, + "learning_rate": 0.00013195276444101547, + "loss": 0.7265, + "step": 389 + }, + { + "epoch": 0.416, + "grad_norm": 0.3022191928802937, + "learning_rate": 0.00013162472133224483, + "loss": 0.7225, + "step": 390 + }, + { + "epoch": 0.41706666666666664, + "grad_norm": 0.2822429057631901, + "learning_rate": 0.0001312962996463896, + "loss": 0.679, + "step": 391 + }, + { + "epoch": 0.41813333333333336, + "grad_norm": 0.27940934892113045, + "learning_rate": 0.00013096750331496033, + "loss": 0.6523, + "step": 392 + }, + { + "epoch": 0.4192, + "grad_norm": 0.31788555354012227, + "learning_rate": 0.0001306383362739523, + "loss": 0.7764, + "step": 393 + }, + { + "epoch": 0.4202666666666667, + "grad_norm": 0.2934418110735407, + "learning_rate": 0.00013030880246379866, + "loss": 0.7124, + "step": 394 + }, + { + "epoch": 0.42133333333333334, + "grad_norm": 0.3165821382129767, + "learning_rate": 0.00012997890582932303, + "loss": 0.7585, + "step": 395 + }, + { + "epoch": 0.4224, + "grad_norm": 0.308246917214587, + "learning_rate": 0.00012964865031969252, + "loss": 0.709, + "step": 396 + }, + { + "epoch": 0.42346666666666666, + "grad_norm": 0.3092407284421468, + "learning_rate": 0.0001293180398883701, + "loss": 0.773, + "step": 397 + }, + { + "epoch": 0.4245333333333333, + "grad_norm": 0.29363551590447556, + "learning_rate": 0.00012898707849306763, + "loss": 0.6796, + "step": 398 + }, + { + "epoch": 0.4256, + "grad_norm": 0.2998652820938399, + "learning_rate": 0.00012865577009569824, + "loss": 0.7374, + "step": 399 + }, + { + "epoch": 0.4266666666666667, + "grad_norm": 0.2783646425542507, + "learning_rate": 0.0001283241186623291, + "loss": 0.6784, + "step": 400 + }, + { + "epoch": 0.42773333333333335, + "grad_norm": 0.29705300656456096, + "learning_rate": 0.00012799212816313376, + "loss": 0.6951, + "step": 401 + }, + { + "epoch": 0.4288, + "grad_norm": 0.26911596672632715, + "learning_rate": 0.00012765980257234473, + "loss": 0.6754, + "step": 402 + }, + { + "epoch": 0.4298666666666667, + "grad_norm": 0.2711186894753625, + "learning_rate": 0.00012732714586820583, + "loss": 0.6634, + "step": 403 + }, + { + "epoch": 0.43093333333333333, + "grad_norm": 0.3062968880528903, + "learning_rate": 0.00012699416203292466, + "loss": 0.7103, + "step": 404 + }, + { + "epoch": 0.432, + "grad_norm": 0.29724964491767264, + "learning_rate": 0.00012666085505262485, + "loss": 0.7129, + "step": 405 + }, + { + "epoch": 0.43306666666666666, + "grad_norm": 0.3276764158913423, + "learning_rate": 0.00012632722891729845, + "loss": 0.7434, + "step": 406 + }, + { + "epoch": 0.4341333333333333, + "grad_norm": 0.2760680245789975, + "learning_rate": 0.000125993287620758, + "loss": 0.6518, + "step": 407 + }, + { + "epoch": 0.4352, + "grad_norm": 0.28691338701577757, + "learning_rate": 0.00012565903516058882, + "loss": 0.6537, + "step": 408 + }, + { + "epoch": 0.4362666666666667, + "grad_norm": 0.30041569017505154, + "learning_rate": 0.00012532447553810126, + "loss": 0.7138, + "step": 409 + }, + { + "epoch": 0.43733333333333335, + "grad_norm": 0.27959002497055496, + "learning_rate": 0.00012498961275828247, + "loss": 0.6669, + "step": 410 + }, + { + "epoch": 0.4384, + "grad_norm": 0.2962430647749866, + "learning_rate": 0.00012465445082974886, + "loss": 0.7467, + "step": 411 + }, + { + "epoch": 0.43946666666666667, + "grad_norm": 0.27405141310923437, + "learning_rate": 0.00012431899376469784, + "loss": 0.7013, + "step": 412 + }, + { + "epoch": 0.44053333333333333, + "grad_norm": 0.2684139536538161, + "learning_rate": 0.00012398324557885994, + "loss": 0.6689, + "step": 413 + }, + { + "epoch": 0.4416, + "grad_norm": 0.2806857537453147, + "learning_rate": 0.0001236472102914506, + "loss": 0.6913, + "step": 414 + }, + { + "epoch": 0.44266666666666665, + "grad_norm": 0.2778467641703905, + "learning_rate": 0.00012331089192512218, + "loss": 0.68, + "step": 415 + }, + { + "epoch": 0.4437333333333333, + "grad_norm": 0.3207849562533483, + "learning_rate": 0.00012297429450591575, + "loss": 0.7562, + "step": 416 + }, + { + "epoch": 0.4448, + "grad_norm": 0.3021635998972281, + "learning_rate": 0.00012263742206321287, + "loss": 0.7177, + "step": 417 + }, + { + "epoch": 0.4458666666666667, + "grad_norm": 0.3092902309614487, + "learning_rate": 0.00012230027862968743, + "loss": 0.7299, + "step": 418 + }, + { + "epoch": 0.44693333333333335, + "grad_norm": 0.3036539312661521, + "learning_rate": 0.00012196286824125726, + "loss": 0.7413, + "step": 419 + }, + { + "epoch": 0.448, + "grad_norm": 0.31927918690705254, + "learning_rate": 0.000121625194937036, + "loss": 0.7218, + "step": 420 + }, + { + "epoch": 0.44906666666666667, + "grad_norm": 0.2884783502399563, + "learning_rate": 0.0001212872627592845, + "loss": 0.6676, + "step": 421 + }, + { + "epoch": 0.45013333333333333, + "grad_norm": 0.28669541960623074, + "learning_rate": 0.00012094907575336267, + "loss": 0.6799, + "step": 422 + }, + { + "epoch": 0.4512, + "grad_norm": 0.2704411179231478, + "learning_rate": 0.0001206106379676809, + "loss": 0.6891, + "step": 423 + }, + { + "epoch": 0.45226666666666665, + "grad_norm": 0.3254286958434096, + "learning_rate": 0.00012027195345365167, + "loss": 0.7372, + "step": 424 + }, + { + "epoch": 0.4533333333333333, + "grad_norm": 0.2731466535433553, + "learning_rate": 0.00011993302626564102, + "loss": 0.6967, + "step": 425 + }, + { + "epoch": 0.4544, + "grad_norm": 0.2807729185976604, + "learning_rate": 0.00011959386046091998, + "loss": 0.6839, + "step": 426 + }, + { + "epoch": 0.4554666666666667, + "grad_norm": 0.2787040118325204, + "learning_rate": 0.00011925446009961607, + "loss": 0.6816, + "step": 427 + }, + { + "epoch": 0.45653333333333335, + "grad_norm": 0.27789017423965373, + "learning_rate": 0.00011891482924466471, + "loss": 0.6918, + "step": 428 + }, + { + "epoch": 0.4576, + "grad_norm": 0.3182433579237445, + "learning_rate": 0.00011857497196176049, + "loss": 0.6531, + "step": 429 + }, + { + "epoch": 0.45866666666666667, + "grad_norm": 0.290394658448927, + "learning_rate": 0.00011823489231930854, + "loss": 0.6952, + "step": 430 + }, + { + "epoch": 0.4597333333333333, + "grad_norm": 0.2983989954278844, + "learning_rate": 0.00011789459438837589, + "loss": 0.7416, + "step": 431 + }, + { + "epoch": 0.4608, + "grad_norm": 0.2887353500957207, + "learning_rate": 0.00011755408224264269, + "loss": 0.7064, + "step": 432 + }, + { + "epoch": 0.46186666666666665, + "grad_norm": 0.28895397897697245, + "learning_rate": 0.00011721335995835336, + "loss": 0.6949, + "step": 433 + }, + { + "epoch": 0.4629333333333333, + "grad_norm": 0.3042478870143878, + "learning_rate": 0.00011687243161426793, + "loss": 0.747, + "step": 434 + }, + { + "epoch": 0.464, + "grad_norm": 0.299282151006539, + "learning_rate": 0.00011653130129161316, + "loss": 0.7504, + "step": 435 + }, + { + "epoch": 0.4650666666666667, + "grad_norm": 0.2949167146250877, + "learning_rate": 0.00011618997307403367, + "loss": 0.6774, + "step": 436 + }, + { + "epoch": 0.46613333333333334, + "grad_norm": 0.3030249400725251, + "learning_rate": 0.00011584845104754304, + "loss": 0.7105, + "step": 437 + }, + { + "epoch": 0.4672, + "grad_norm": 0.29589961404518833, + "learning_rate": 0.00011550673930047498, + "loss": 0.6639, + "step": 438 + }, + { + "epoch": 0.46826666666666666, + "grad_norm": 0.2685570345029132, + "learning_rate": 0.00011516484192343425, + "loss": 0.6516, + "step": 439 + }, + { + "epoch": 0.4693333333333333, + "grad_norm": 0.272942272719098, + "learning_rate": 0.00011482276300924782, + "loss": 0.6465, + "step": 440 + }, + { + "epoch": 0.4704, + "grad_norm": 0.2859207332260473, + "learning_rate": 0.00011448050665291587, + "loss": 0.6676, + "step": 441 + }, + { + "epoch": 0.47146666666666665, + "grad_norm": 0.2866673263857679, + "learning_rate": 0.00011413807695156262, + "loss": 0.6987, + "step": 442 + }, + { + "epoch": 0.47253333333333336, + "grad_norm": 0.32704950699804464, + "learning_rate": 0.00011379547800438747, + "loss": 0.6961, + "step": 443 + }, + { + "epoch": 0.4736, + "grad_norm": 0.3024224748565969, + "learning_rate": 0.00011345271391261584, + "loss": 0.7044, + "step": 444 + }, + { + "epoch": 0.4746666666666667, + "grad_norm": 0.31228993211722267, + "learning_rate": 0.00011310978877945007, + "loss": 0.7366, + "step": 445 + }, + { + "epoch": 0.47573333333333334, + "grad_norm": 0.28716266186260053, + "learning_rate": 0.00011276670671002028, + "loss": 0.7087, + "step": 446 + }, + { + "epoch": 0.4768, + "grad_norm": 0.2715422982460639, + "learning_rate": 0.00011242347181133533, + "loss": 0.6649, + "step": 447 + }, + { + "epoch": 0.47786666666666666, + "grad_norm": 0.27227233928386024, + "learning_rate": 0.00011208008819223354, + "loss": 0.6582, + "step": 448 + }, + { + "epoch": 0.4789333333333333, + "grad_norm": 0.2558079132567336, + "learning_rate": 0.00011173655996333357, + "loss": 0.6352, + "step": 449 + }, + { + "epoch": 0.48, + "grad_norm": 0.274934500937583, + "learning_rate": 0.00011139289123698518, + "loss": 0.6692, + "step": 450 + }, + { + "epoch": 0.48106666666666664, + "grad_norm": 0.28820425858138565, + "learning_rate": 0.00011104908612722001, + "loss": 0.7306, + "step": 451 + }, + { + "epoch": 0.48213333333333336, + "grad_norm": 0.2786676865531545, + "learning_rate": 0.00011070514874970237, + "loss": 0.6813, + "step": 452 + }, + { + "epoch": 0.4832, + "grad_norm": 0.28781836180871956, + "learning_rate": 0.00011036108322167988, + "loss": 0.6828, + "step": 453 + }, + { + "epoch": 0.4842666666666667, + "grad_norm": 0.2628916240402343, + "learning_rate": 0.00011001689366193433, + "loss": 0.6712, + "step": 454 + }, + { + "epoch": 0.48533333333333334, + "grad_norm": 0.3316475169969365, + "learning_rate": 0.00010967258419073217, + "loss": 0.7702, + "step": 455 + }, + { + "epoch": 0.4864, + "grad_norm": 0.29632582000702057, + "learning_rate": 0.00010932815892977535, + "loss": 0.7145, + "step": 456 + }, + { + "epoch": 0.48746666666666666, + "grad_norm": 0.3033796550777103, + "learning_rate": 0.00010898362200215197, + "loss": 0.7128, + "step": 457 + }, + { + "epoch": 0.4885333333333333, + "grad_norm": 0.2812670956871168, + "learning_rate": 0.00010863897753228687, + "loss": 0.673, + "step": 458 + }, + { + "epoch": 0.4896, + "grad_norm": 0.27284272460274583, + "learning_rate": 0.0001082942296458922, + "loss": 0.6663, + "step": 459 + }, + { + "epoch": 0.49066666666666664, + "grad_norm": 0.32150331711989816, + "learning_rate": 0.00010794938246991817, + "loss": 0.7248, + "step": 460 + }, + { + "epoch": 0.49173333333333336, + "grad_norm": 0.3124675705383241, + "learning_rate": 0.0001076044401325036, + "loss": 0.7058, + "step": 461 + }, + { + "epoch": 0.4928, + "grad_norm": 0.3056934528138313, + "learning_rate": 0.00010725940676292636, + "loss": 0.7524, + "step": 462 + }, + { + "epoch": 0.4938666666666667, + "grad_norm": 0.25821166006218316, + "learning_rate": 0.0001069142864915542, + "loss": 0.6728, + "step": 463 + }, + { + "epoch": 0.49493333333333334, + "grad_norm": 0.29667202104180407, + "learning_rate": 0.00010656908344979506, + "loss": 0.6928, + "step": 464 + }, + { + "epoch": 0.496, + "grad_norm": 0.3082632780946926, + "learning_rate": 0.0001062238017700478, + "loss": 0.7028, + "step": 465 + }, + { + "epoch": 0.49706666666666666, + "grad_norm": 0.2732022844493322, + "learning_rate": 0.00010587844558565261, + "loss": 0.6891, + "step": 466 + }, + { + "epoch": 0.4981333333333333, + "grad_norm": 0.2694717190175374, + "learning_rate": 0.00010553301903084157, + "loss": 0.6945, + "step": 467 + }, + { + "epoch": 0.4992, + "grad_norm": 0.29173066404677117, + "learning_rate": 0.00010518752624068911, + "loss": 0.6918, + "step": 468 + }, + { + "epoch": 0.5002666666666666, + "grad_norm": 0.2991188095881304, + "learning_rate": 0.00010484197135106263, + "loss": 0.7075, + "step": 469 + }, + { + "epoch": 0.5013333333333333, + "grad_norm": 0.3266209656618395, + "learning_rate": 0.0001044963584985729, + "loss": 0.6726, + "step": 470 + }, + { + "epoch": 0.5024, + "grad_norm": 0.30257739145875007, + "learning_rate": 0.0001041506918205246, + "loss": 0.7064, + "step": 471 + }, + { + "epoch": 0.5034666666666666, + "grad_norm": 0.2899570070928348, + "learning_rate": 0.00010380497545486663, + "loss": 0.6933, + "step": 472 + }, + { + "epoch": 0.5045333333333333, + "grad_norm": 0.2842242620832916, + "learning_rate": 0.00010345921354014279, + "loss": 0.6764, + "step": 473 + }, + { + "epoch": 0.5056, + "grad_norm": 0.27906256261403156, + "learning_rate": 0.00010311341021544218, + "loss": 0.6855, + "step": 474 + }, + { + "epoch": 0.5066666666666667, + "grad_norm": 0.3159988605982528, + "learning_rate": 0.0001027675696203495, + "loss": 0.6977, + "step": 475 + }, + { + "epoch": 0.5077333333333334, + "grad_norm": 0.30054675574873646, + "learning_rate": 0.00010242169589489568, + "loss": 0.6937, + "step": 476 + }, + { + "epoch": 0.5088, + "grad_norm": 0.28457048083757125, + "learning_rate": 0.00010207579317950827, + "loss": 0.6512, + "step": 477 + }, + { + "epoch": 0.5098666666666667, + "grad_norm": 0.30356306008660766, + "learning_rate": 0.0001017298656149618, + "loss": 0.7225, + "step": 478 + }, + { + "epoch": 0.5109333333333334, + "grad_norm": 0.28317457056646794, + "learning_rate": 0.00010138391734232832, + "loss": 0.6838, + "step": 479 + }, + { + "epoch": 0.512, + "grad_norm": 0.31768004643728465, + "learning_rate": 0.00010103795250292778, + "loss": 0.7606, + "step": 480 + }, + { + "epoch": 0.5130666666666667, + "grad_norm": 0.2674934346224266, + "learning_rate": 0.00010069197523827833, + "loss": 0.6467, + "step": 481 + }, + { + "epoch": 0.5141333333333333, + "grad_norm": 0.2841931782123327, + "learning_rate": 0.00010034598969004705, + "loss": 0.6924, + "step": 482 + }, + { + "epoch": 0.5152, + "grad_norm": 0.2976191946843458, + "learning_rate": 0.0001, + "loss": 0.6877, + "step": 483 + }, + { + "epoch": 0.5162666666666667, + "grad_norm": 0.3045853152591595, + "learning_rate": 9.965401030995301e-05, + "loss": 0.7188, + "step": 484 + }, + { + "epoch": 0.5173333333333333, + "grad_norm": 0.30795860143094705, + "learning_rate": 9.930802476172169e-05, + "loss": 0.6544, + "step": 485 + }, + { + "epoch": 0.5184, + "grad_norm": 0.28724834345193667, + "learning_rate": 9.896204749707228e-05, + "loss": 0.6935, + "step": 486 + }, + { + "epoch": 0.5194666666666666, + "grad_norm": 0.33534202228153537, + "learning_rate": 9.861608265767167e-05, + "loss": 0.7443, + "step": 487 + }, + { + "epoch": 0.5205333333333333, + "grad_norm": 0.29500696251450703, + "learning_rate": 9.827013438503822e-05, + "loss": 0.6852, + "step": 488 + }, + { + "epoch": 0.5216, + "grad_norm": 0.2774665215507267, + "learning_rate": 9.792420682049174e-05, + "loss": 0.6714, + "step": 489 + }, + { + "epoch": 0.5226666666666666, + "grad_norm": 0.27879423976510714, + "learning_rate": 9.757830410510433e-05, + "loss": 0.6628, + "step": 490 + }, + { + "epoch": 0.5237333333333334, + "grad_norm": 0.24864113181101832, + "learning_rate": 9.723243037965056e-05, + "loss": 0.6392, + "step": 491 + }, + { + "epoch": 0.5248, + "grad_norm": 0.29714731482588147, + "learning_rate": 9.688658978455784e-05, + "loss": 0.6926, + "step": 492 + }, + { + "epoch": 0.5258666666666667, + "grad_norm": 0.27833391768791793, + "learning_rate": 9.654078645985722e-05, + "loss": 0.736, + "step": 493 + }, + { + "epoch": 0.5269333333333334, + "grad_norm": 0.2694171559355083, + "learning_rate": 9.619502454513338e-05, + "loss": 0.6975, + "step": 494 + }, + { + "epoch": 0.528, + "grad_norm": 0.29810632742874227, + "learning_rate": 9.584930817947544e-05, + "loss": 0.7008, + "step": 495 + }, + { + "epoch": 0.5290666666666667, + "grad_norm": 0.2948716126782792, + "learning_rate": 9.550364150142713e-05, + "loss": 0.6776, + "step": 496 + }, + { + "epoch": 0.5301333333333333, + "grad_norm": 0.2946985777896799, + "learning_rate": 9.515802864893739e-05, + "loss": 0.6741, + "step": 497 + }, + { + "epoch": 0.5312, + "grad_norm": 0.284247571441035, + "learning_rate": 9.481247375931094e-05, + "loss": 0.7164, + "step": 498 + }, + { + "epoch": 0.5322666666666667, + "grad_norm": 0.27581045832111845, + "learning_rate": 9.446698096915847e-05, + "loss": 0.6428, + "step": 499 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 0.30837432342203425, + "learning_rate": 9.412155441434741e-05, + "loss": 0.6578, + "step": 500 + }, + { + "epoch": 0.5344, + "grad_norm": 0.2990392539662064, + "learning_rate": 9.377619822995219e-05, + "loss": 0.6824, + "step": 501 + }, + { + "epoch": 0.5354666666666666, + "grad_norm": 0.29584192832682144, + "learning_rate": 9.343091655020495e-05, + "loss": 0.6581, + "step": 502 + }, + { + "epoch": 0.5365333333333333, + "grad_norm": 0.2756642495486731, + "learning_rate": 9.308571350844584e-05, + "loss": 0.6553, + "step": 503 + }, + { + "epoch": 0.5376, + "grad_norm": 0.31625137977434126, + "learning_rate": 9.274059323707366e-05, + "loss": 0.7254, + "step": 504 + }, + { + "epoch": 0.5386666666666666, + "grad_norm": 0.2888163041939396, + "learning_rate": 9.239555986749645e-05, + "loss": 0.6902, + "step": 505 + }, + { + "epoch": 0.5397333333333333, + "grad_norm": 0.2823616564888589, + "learning_rate": 9.205061753008183e-05, + "loss": 0.6761, + "step": 506 + }, + { + "epoch": 0.5408, + "grad_norm": 0.312245375551478, + "learning_rate": 9.170577035410783e-05, + "loss": 0.7174, + "step": 507 + }, + { + "epoch": 0.5418666666666667, + "grad_norm": 0.27329110126372763, + "learning_rate": 9.136102246771314e-05, + "loss": 0.7092, + "step": 508 + }, + { + "epoch": 0.5429333333333334, + "grad_norm": 0.28289621600363557, + "learning_rate": 9.101637799784804e-05, + "loss": 0.6726, + "step": 509 + }, + { + "epoch": 0.544, + "grad_norm": 0.30378125283881025, + "learning_rate": 9.06718410702247e-05, + "loss": 0.7135, + "step": 510 + }, + { + "epoch": 0.5450666666666667, + "grad_norm": 0.27386661465405454, + "learning_rate": 9.032741580926787e-05, + "loss": 0.6595, + "step": 511 + }, + { + "epoch": 0.5461333333333334, + "grad_norm": 0.2917119627165848, + "learning_rate": 8.998310633806571e-05, + "loss": 0.667, + "step": 512 + }, + { + "epoch": 0.5472, + "grad_norm": 0.32680184848419386, + "learning_rate": 8.963891677832011e-05, + "loss": 0.7503, + "step": 513 + }, + { + "epoch": 0.5482666666666667, + "grad_norm": 0.2887453813002438, + "learning_rate": 8.929485125029766e-05, + "loss": 0.703, + "step": 514 + }, + { + "epoch": 0.5493333333333333, + "grad_norm": 0.3123646066338202, + "learning_rate": 8.895091387277999e-05, + "loss": 0.7221, + "step": 515 + }, + { + "epoch": 0.5504, + "grad_norm": 0.3043523712481793, + "learning_rate": 8.860710876301484e-05, + "loss": 0.7392, + "step": 516 + }, + { + "epoch": 0.5514666666666667, + "grad_norm": 0.26371171892204565, + "learning_rate": 8.826344003666647e-05, + "loss": 0.6779, + "step": 517 + }, + { + "epoch": 0.5525333333333333, + "grad_norm": 0.28268771266944587, + "learning_rate": 8.791991180776648e-05, + "loss": 0.6895, + "step": 518 + }, + { + "epoch": 0.5536, + "grad_norm": 0.30496944666575654, + "learning_rate": 8.757652818866471e-05, + "loss": 0.6762, + "step": 519 + }, + { + "epoch": 0.5546666666666666, + "grad_norm": 0.27800056978244325, + "learning_rate": 8.723329328997973e-05, + "loss": 0.6773, + "step": 520 + }, + { + "epoch": 0.5557333333333333, + "grad_norm": 0.3036649295616628, + "learning_rate": 8.689021122054996e-05, + "loss": 0.695, + "step": 521 + }, + { + "epoch": 0.5568, + "grad_norm": 0.2932496959497221, + "learning_rate": 8.654728608738418e-05, + "loss": 0.6949, + "step": 522 + }, + { + "epoch": 0.5578666666666666, + "grad_norm": 0.28868433300841473, + "learning_rate": 8.620452199561254e-05, + "loss": 0.6866, + "step": 523 + }, + { + "epoch": 0.5589333333333333, + "grad_norm": 0.29142913825254585, + "learning_rate": 8.58619230484374e-05, + "loss": 0.691, + "step": 524 + }, + { + "epoch": 0.56, + "grad_norm": 0.26612842848899165, + "learning_rate": 8.551949334708415e-05, + "loss": 0.662, + "step": 525 + }, + { + "epoch": 0.5610666666666667, + "grad_norm": 0.2815041444215433, + "learning_rate": 8.51772369907522e-05, + "loss": 0.7068, + "step": 526 + }, + { + "epoch": 0.5621333333333334, + "grad_norm": 0.32699659703784895, + "learning_rate": 8.483515807656576e-05, + "loss": 0.7203, + "step": 527 + }, + { + "epoch": 0.5632, + "grad_norm": 0.2971502635394618, + "learning_rate": 8.449326069952506e-05, + "loss": 0.6996, + "step": 528 + }, + { + "epoch": 0.5642666666666667, + "grad_norm": 0.2888587243184356, + "learning_rate": 8.415154895245697e-05, + "loss": 0.6374, + "step": 529 + }, + { + "epoch": 0.5653333333333334, + "grad_norm": 0.3902497947590015, + "learning_rate": 8.381002692596635e-05, + "loss": 0.6656, + "step": 530 + }, + { + "epoch": 0.5664, + "grad_norm": 0.307671581911795, + "learning_rate": 8.346869870838685e-05, + "loss": 0.7331, + "step": 531 + }, + { + "epoch": 0.5674666666666667, + "grad_norm": 0.2724483232898439, + "learning_rate": 8.312756838573208e-05, + "loss": 0.6452, + "step": 532 + }, + { + "epoch": 0.5685333333333333, + "grad_norm": 0.2983827706569765, + "learning_rate": 8.278664004164665e-05, + "loss": 0.6677, + "step": 533 + }, + { + "epoch": 0.5696, + "grad_norm": 0.2787531753541985, + "learning_rate": 8.244591775735732e-05, + "loss": 0.6478, + "step": 534 + }, + { + "epoch": 0.5706666666666667, + "grad_norm": 0.283768099032677, + "learning_rate": 8.210540561162412e-05, + "loss": 0.6752, + "step": 535 + }, + { + "epoch": 0.5717333333333333, + "grad_norm": 0.291717309703545, + "learning_rate": 8.176510768069147e-05, + "loss": 0.7109, + "step": 536 + }, + { + "epoch": 0.5728, + "grad_norm": 0.3031358111163699, + "learning_rate": 8.142502803823955e-05, + "loss": 0.7195, + "step": 537 + }, + { + "epoch": 0.5738666666666666, + "grad_norm": 0.29478240014249535, + "learning_rate": 8.108517075533531e-05, + "loss": 0.691, + "step": 538 + }, + { + "epoch": 0.5749333333333333, + "grad_norm": 0.29709353597605226, + "learning_rate": 8.074553990038395e-05, + "loss": 0.6488, + "step": 539 + }, + { + "epoch": 0.576, + "grad_norm": 0.29440998991459344, + "learning_rate": 8.040613953908005e-05, + "loss": 0.7157, + "step": 540 + }, + { + "epoch": 0.5770666666666666, + "grad_norm": 0.29573345625739195, + "learning_rate": 8.0066973734359e-05, + "loss": 0.6637, + "step": 541 + }, + { + "epoch": 0.5781333333333334, + "grad_norm": 0.32999656344904255, + "learning_rate": 7.972804654634834e-05, + "loss": 0.7149, + "step": 542 + }, + { + "epoch": 0.5792, + "grad_norm": 0.306402388226421, + "learning_rate": 7.938936203231912e-05, + "loss": 0.7093, + "step": 543 + }, + { + "epoch": 0.5802666666666667, + "grad_norm": 0.29023525020313806, + "learning_rate": 7.905092424663735e-05, + "loss": 0.6672, + "step": 544 + }, + { + "epoch": 0.5813333333333334, + "grad_norm": 0.2831298509229484, + "learning_rate": 7.871273724071553e-05, + "loss": 0.6462, + "step": 545 + }, + { + "epoch": 0.5824, + "grad_norm": 0.2813178654912594, + "learning_rate": 7.837480506296404e-05, + "loss": 0.6305, + "step": 546 + }, + { + "epoch": 0.5834666666666667, + "grad_norm": 0.2662660017764881, + "learning_rate": 7.803713175874275e-05, + "loss": 0.6498, + "step": 547 + }, + { + "epoch": 0.5845333333333333, + "grad_norm": 0.2817750011294188, + "learning_rate": 7.769972137031262e-05, + "loss": 0.6254, + "step": 548 + }, + { + "epoch": 0.5856, + "grad_norm": 0.29468313237397137, + "learning_rate": 7.736257793678714e-05, + "loss": 0.6975, + "step": 549 + }, + { + "epoch": 0.5866666666666667, + "grad_norm": 0.3032568151281726, + "learning_rate": 7.702570549408428e-05, + "loss": 0.67, + "step": 550 + }, + { + "epoch": 0.5877333333333333, + "grad_norm": 0.31411653264107314, + "learning_rate": 7.668910807487783e-05, + "loss": 0.7137, + "step": 551 + }, + { + "epoch": 0.5888, + "grad_norm": 0.2938706074022764, + "learning_rate": 7.635278970854943e-05, + "loss": 0.6789, + "step": 552 + }, + { + "epoch": 0.5898666666666667, + "grad_norm": 0.2623978006966498, + "learning_rate": 7.601675442114009e-05, + "loss": 0.6418, + "step": 553 + }, + { + "epoch": 0.5909333333333333, + "grad_norm": 0.3092538965585159, + "learning_rate": 7.568100623530217e-05, + "loss": 0.7048, + "step": 554 + }, + { + "epoch": 0.592, + "grad_norm": 0.2895798939877575, + "learning_rate": 7.534554917025119e-05, + "loss": 0.656, + "step": 555 + }, + { + "epoch": 0.5930666666666666, + "grad_norm": 0.32535827755481617, + "learning_rate": 7.501038724171756e-05, + "loss": 0.7482, + "step": 556 + }, + { + "epoch": 0.5941333333333333, + "grad_norm": 0.28040339929597097, + "learning_rate": 7.46755244618988e-05, + "loss": 0.6523, + "step": 557 + }, + { + "epoch": 0.5952, + "grad_norm": 0.28481551363714974, + "learning_rate": 7.434096483941115e-05, + "loss": 0.6937, + "step": 558 + }, + { + "epoch": 0.5962666666666666, + "grad_norm": 0.27658244979840724, + "learning_rate": 7.400671237924202e-05, + "loss": 0.6632, + "step": 559 + }, + { + "epoch": 0.5973333333333334, + "grad_norm": 0.28672018705481, + "learning_rate": 7.367277108270156e-05, + "loss": 0.7122, + "step": 560 + }, + { + "epoch": 0.5984, + "grad_norm": 0.3054070599002515, + "learning_rate": 7.333914494737514e-05, + "loss": 0.7462, + "step": 561 + }, + { + "epoch": 0.5994666666666667, + "grad_norm": 0.26066922267536863, + "learning_rate": 7.300583796707539e-05, + "loss": 0.6526, + "step": 562 + }, + { + "epoch": 0.6005333333333334, + "grad_norm": 0.2663942533888753, + "learning_rate": 7.267285413179421e-05, + "loss": 0.6401, + "step": 563 + }, + { + "epoch": 0.6016, + "grad_norm": 0.3074810965889515, + "learning_rate": 7.234019742765532e-05, + "loss": 0.7171, + "step": 564 + }, + { + "epoch": 0.6026666666666667, + "grad_norm": 0.32327663932539985, + "learning_rate": 7.200787183686625e-05, + "loss": 0.7023, + "step": 565 + }, + { + "epoch": 0.6037333333333333, + "grad_norm": 0.27775451757768665, + "learning_rate": 7.167588133767091e-05, + "loss": 0.6617, + "step": 566 + }, + { + "epoch": 0.6048, + "grad_norm": 0.26045639881270616, + "learning_rate": 7.134422990430176e-05, + "loss": 0.5921, + "step": 567 + }, + { + "epoch": 0.6058666666666667, + "grad_norm": 0.2760200078362994, + "learning_rate": 7.101292150693241e-05, + "loss": 0.6624, + "step": 568 + }, + { + "epoch": 0.6069333333333333, + "grad_norm": 0.284862759323287, + "learning_rate": 7.068196011162994e-05, + "loss": 0.6677, + "step": 569 + }, + { + "epoch": 0.608, + "grad_norm": 0.3102663214082485, + "learning_rate": 7.03513496803075e-05, + "loss": 0.665, + "step": 570 + }, + { + "epoch": 0.6090666666666666, + "grad_norm": 0.3204096685591165, + "learning_rate": 7.002109417067697e-05, + "loss": 0.6168, + "step": 571 + }, + { + "epoch": 0.6101333333333333, + "grad_norm": 0.2809619276552323, + "learning_rate": 6.969119753620135e-05, + "loss": 0.6771, + "step": 572 + }, + { + "epoch": 0.6112, + "grad_norm": 0.27930273371137115, + "learning_rate": 6.936166372604773e-05, + "loss": 0.6595, + "step": 573 + }, + { + "epoch": 0.6122666666666666, + "grad_norm": 0.27393534135607, + "learning_rate": 6.903249668503972e-05, + "loss": 0.6543, + "step": 574 + }, + { + "epoch": 0.6133333333333333, + "grad_norm": 0.3133322498095281, + "learning_rate": 6.87037003536104e-05, + "loss": 0.6845, + "step": 575 + }, + { + "epoch": 0.6144, + "grad_norm": 0.292942414360091, + "learning_rate": 6.837527866775522e-05, + "loss": 0.6255, + "step": 576 + }, + { + "epoch": 0.6154666666666667, + "grad_norm": 0.2800959719802186, + "learning_rate": 6.804723555898458e-05, + "loss": 0.6409, + "step": 577 + }, + { + "epoch": 0.6165333333333334, + "grad_norm": 0.31656690985698044, + "learning_rate": 6.771957495427716e-05, + "loss": 0.7182, + "step": 578 + }, + { + "epoch": 0.6176, + "grad_norm": 0.33320353446119055, + "learning_rate": 6.739230077603259e-05, + "loss": 0.7104, + "step": 579 + }, + { + "epoch": 0.6186666666666667, + "grad_norm": 0.28493767473138093, + "learning_rate": 6.706541694202471e-05, + "loss": 0.6851, + "step": 580 + }, + { + "epoch": 0.6197333333333334, + "grad_norm": 0.27565430496802135, + "learning_rate": 6.673892736535448e-05, + "loss": 0.6306, + "step": 581 + }, + { + "epoch": 0.6208, + "grad_norm": 0.26208129531868224, + "learning_rate": 6.641283595440323e-05, + "loss": 0.6525, + "step": 582 + }, + { + "epoch": 0.6218666666666667, + "grad_norm": 0.28243055346170187, + "learning_rate": 6.608714661278606e-05, + "loss": 0.6653, + "step": 583 + }, + { + "epoch": 0.6229333333333333, + "grad_norm": 0.2871697561300585, + "learning_rate": 6.576186323930466e-05, + "loss": 0.672, + "step": 584 + }, + { + "epoch": 0.624, + "grad_norm": 0.2843702328109472, + "learning_rate": 6.543698972790117e-05, + "loss": 0.6925, + "step": 585 + }, + { + "epoch": 0.6250666666666667, + "grad_norm": 0.2841216673228526, + "learning_rate": 6.51125299676111e-05, + "loss": 0.6489, + "step": 586 + }, + { + "epoch": 0.6261333333333333, + "grad_norm": 0.30262406942134606, + "learning_rate": 6.478848784251713e-05, + "loss": 0.7315, + "step": 587 + }, + { + "epoch": 0.6272, + "grad_norm": 0.2798472247395526, + "learning_rate": 6.446486723170236e-05, + "loss": 0.6645, + "step": 588 + }, + { + "epoch": 0.6282666666666666, + "grad_norm": 0.2667725923356366, + "learning_rate": 6.414167200920391e-05, + "loss": 0.6398, + "step": 589 + }, + { + "epoch": 0.6293333333333333, + "grad_norm": 0.3901233462085959, + "learning_rate": 6.381890604396687e-05, + "loss": 0.671, + "step": 590 + }, + { + "epoch": 0.6304, + "grad_norm": 0.32802964775015336, + "learning_rate": 6.349657319979742e-05, + "loss": 0.6896, + "step": 591 + }, + { + "epoch": 0.6314666666666666, + "grad_norm": 0.29918042413694523, + "learning_rate": 6.317467733531712e-05, + "loss": 0.6683, + "step": 592 + }, + { + "epoch": 0.6325333333333333, + "grad_norm": 0.27730506676072064, + "learning_rate": 6.28532223039163e-05, + "loss": 0.671, + "step": 593 + }, + { + "epoch": 0.6336, + "grad_norm": 0.31656170023615376, + "learning_rate": 6.253221195370826e-05, + "loss": 0.6984, + "step": 594 + }, + { + "epoch": 0.6346666666666667, + "grad_norm": 0.27858978923058714, + "learning_rate": 6.221165012748297e-05, + "loss": 0.6648, + "step": 595 + }, + { + "epoch": 0.6357333333333334, + "grad_norm": 0.27716740679748453, + "learning_rate": 6.189154066266112e-05, + "loss": 0.6603, + "step": 596 + }, + { + "epoch": 0.6368, + "grad_norm": 0.32373094969876837, + "learning_rate": 6.157188739124834e-05, + "loss": 0.7869, + "step": 597 + }, + { + "epoch": 0.6378666666666667, + "grad_norm": 0.32905428766166533, + "learning_rate": 6.125269413978907e-05, + "loss": 0.6604, + "step": 598 + }, + { + "epoch": 0.6389333333333334, + "grad_norm": 0.2827813439312809, + "learning_rate": 6.093396472932103e-05, + "loss": 0.6503, + "step": 599 + }, + { + "epoch": 0.64, + "grad_norm": 0.27990982670223624, + "learning_rate": 6.0615702975329194e-05, + "loss": 0.6586, + "step": 600 + }, + { + "epoch": 0.6410666666666667, + "grad_norm": 0.31279521216643086, + "learning_rate": 6.029791268770029e-05, + "loss": 0.6517, + "step": 601 + }, + { + "epoch": 0.6421333333333333, + "grad_norm": 0.2911609136627562, + "learning_rate": 5.998059767067728e-05, + "loss": 0.6345, + "step": 602 + }, + { + "epoch": 0.6432, + "grad_norm": 0.27966152609036526, + "learning_rate": 5.9663761722813495e-05, + "loss": 0.6227, + "step": 603 + }, + { + "epoch": 0.6442666666666667, + "grad_norm": 0.2939451053442902, + "learning_rate": 5.934740863692759e-05, + "loss": 0.6706, + "step": 604 + }, + { + "epoch": 0.6453333333333333, + "grad_norm": 0.29298604755571256, + "learning_rate": 5.903154220005771e-05, + "loss": 0.6552, + "step": 605 + }, + { + "epoch": 0.6464, + "grad_norm": 0.2946108030220765, + "learning_rate": 5.871616619341653e-05, + "loss": 0.7258, + "step": 606 + }, + { + "epoch": 0.6474666666666666, + "grad_norm": 0.2695643421152406, + "learning_rate": 5.840128439234571e-05, + "loss": 0.6397, + "step": 607 + }, + { + "epoch": 0.6485333333333333, + "grad_norm": 0.2685510979232542, + "learning_rate": 5.80869005662708e-05, + "loss": 0.6463, + "step": 608 + }, + { + "epoch": 0.6496, + "grad_norm": 0.26743508036139063, + "learning_rate": 5.777301847865629e-05, + "loss": 0.622, + "step": 609 + }, + { + "epoch": 0.6506666666666666, + "grad_norm": 0.2861940883382232, + "learning_rate": 5.7459641886960244e-05, + "loss": 0.6847, + "step": 610 + }, + { + "epoch": 0.6517333333333334, + "grad_norm": 0.28751647313232404, + "learning_rate": 5.714677454258947e-05, + "loss": 0.6553, + "step": 611 + }, + { + "epoch": 0.6528, + "grad_norm": 0.2973098924609291, + "learning_rate": 5.6834420190854745e-05, + "loss": 0.6411, + "step": 612 + }, + { + "epoch": 0.6538666666666667, + "grad_norm": 0.26866698877101813, + "learning_rate": 5.652258257092569e-05, + "loss": 0.6042, + "step": 613 + }, + { + "epoch": 0.6549333333333334, + "grad_norm": 0.27277054436317155, + "learning_rate": 5.621126541578632e-05, + "loss": 0.6305, + "step": 614 + }, + { + "epoch": 0.656, + "grad_norm": 0.29334647465461267, + "learning_rate": 5.590047245219009e-05, + "loss": 0.6677, + "step": 615 + }, + { + "epoch": 0.6570666666666667, + "grad_norm": 0.2692181748192527, + "learning_rate": 5.559020740061549e-05, + "loss": 0.6441, + "step": 616 + }, + { + "epoch": 0.6581333333333333, + "grad_norm": 0.2875655942770087, + "learning_rate": 5.528047397522133e-05, + "loss": 0.7035, + "step": 617 + }, + { + "epoch": 0.6592, + "grad_norm": 0.3196039710835889, + "learning_rate": 5.497127588380244e-05, + "loss": 0.7077, + "step": 618 + }, + { + "epoch": 0.6602666666666667, + "grad_norm": 0.25514688966767723, + "learning_rate": 5.4662616827745185e-05, + "loss": 0.6279, + "step": 619 + }, + { + "epoch": 0.6613333333333333, + "grad_norm": 0.30966996310288775, + "learning_rate": 5.4354500501983074e-05, + "loss": 0.7202, + "step": 620 + }, + { + "epoch": 0.6624, + "grad_norm": 0.2749126392137727, + "learning_rate": 5.404693059495285e-05, + "loss": 0.6205, + "step": 621 + }, + { + "epoch": 0.6634666666666666, + "grad_norm": 0.3829206361819499, + "learning_rate": 5.373991078854992e-05, + "loss": 0.6853, + "step": 622 + }, + { + "epoch": 0.6645333333333333, + "grad_norm": 0.302528210142608, + "learning_rate": 5.3433444758084604e-05, + "loss": 0.6858, + "step": 623 + }, + { + "epoch": 0.6656, + "grad_norm": 0.2770927219737785, + "learning_rate": 5.312753617223794e-05, + "loss": 0.6262, + "step": 624 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.29440427360769594, + "learning_rate": 5.282218869301788e-05, + "loss": 0.6997, + "step": 625 + }, + { + "epoch": 0.6677333333333333, + "grad_norm": 0.2738871723904596, + "learning_rate": 5.251740597571542e-05, + "loss": 0.6366, + "step": 626 + }, + { + "epoch": 0.6688, + "grad_norm": 0.29430307578778014, + "learning_rate": 5.221319166886073e-05, + "loss": 0.6784, + "step": 627 + }, + { + "epoch": 0.6698666666666667, + "grad_norm": 0.2931850021619955, + "learning_rate": 5.190954941417977e-05, + "loss": 0.6877, + "step": 628 + }, + { + "epoch": 0.6709333333333334, + "grad_norm": 0.2766784624087078, + "learning_rate": 5.160648284655032e-05, + "loss": 0.6297, + "step": 629 + }, + { + "epoch": 0.672, + "grad_norm": 0.28563037511990763, + "learning_rate": 5.1303995593958824e-05, + "loss": 0.6565, + "step": 630 + }, + { + "epoch": 0.6730666666666667, + "grad_norm": 0.2571946346254562, + "learning_rate": 5.100209127745661e-05, + "loss": 0.6004, + "step": 631 + }, + { + "epoch": 0.6741333333333334, + "grad_norm": 0.335776447635849, + "learning_rate": 5.0700773511116906e-05, + "loss": 0.7161, + "step": 632 + }, + { + "epoch": 0.6752, + "grad_norm": 0.27596423062496944, + "learning_rate": 5.040004590199128e-05, + "loss": 0.6356, + "step": 633 + }, + { + "epoch": 0.6762666666666667, + "grad_norm": 0.26803594304831474, + "learning_rate": 5.0099912050066556e-05, + "loss": 0.6321, + "step": 634 + }, + { + "epoch": 0.6773333333333333, + "grad_norm": 0.2841437162450076, + "learning_rate": 4.9800375548221845e-05, + "loss": 0.6633, + "step": 635 + }, + { + "epoch": 0.6784, + "grad_norm": 0.2909509540859932, + "learning_rate": 4.950143998218531e-05, + "loss": 0.6738, + "step": 636 + }, + { + "epoch": 0.6794666666666667, + "grad_norm": 0.4839290382099412, + "learning_rate": 4.920310893049146e-05, + "loss": 0.7175, + "step": 637 + }, + { + "epoch": 0.6805333333333333, + "grad_norm": 0.28786760320456234, + "learning_rate": 4.89053859644381e-05, + "loss": 0.6489, + "step": 638 + }, + { + "epoch": 0.6816, + "grad_norm": 0.3066173657523897, + "learning_rate": 4.860827464804383e-05, + "loss": 0.6622, + "step": 639 + }, + { + "epoch": 0.6826666666666666, + "grad_norm": 0.31712847314029313, + "learning_rate": 4.831177853800511e-05, + "loss": 0.7302, + "step": 640 + }, + { + "epoch": 0.6837333333333333, + "grad_norm": 0.314127065264689, + "learning_rate": 4.801590118365383e-05, + "loss": 0.6554, + "step": 641 + }, + { + "epoch": 0.6848, + "grad_norm": 0.2912563986330789, + "learning_rate": 4.77206461269149e-05, + "loss": 0.6997, + "step": 642 + }, + { + "epoch": 0.6858666666666666, + "grad_norm": 0.2723475608329597, + "learning_rate": 4.7426016902263636e-05, + "loss": 0.6366, + "step": 643 + }, + { + "epoch": 0.6869333333333333, + "grad_norm": 0.2857812171044624, + "learning_rate": 4.713201703668367e-05, + "loss": 0.6396, + "step": 644 + }, + { + "epoch": 0.688, + "grad_norm": 0.269722291822516, + "learning_rate": 4.683865004962452e-05, + "loss": 0.6718, + "step": 645 + }, + { + "epoch": 0.6890666666666667, + "grad_norm": 0.27127114300010835, + "learning_rate": 4.654591945295969e-05, + "loss": 0.601, + "step": 646 + }, + { + "epoch": 0.6901333333333334, + "grad_norm": 0.2875125469368505, + "learning_rate": 4.6253828750944375e-05, + "loss": 0.6491, + "step": 647 + }, + { + "epoch": 0.6912, + "grad_norm": 0.2820172283412604, + "learning_rate": 4.596238144017369e-05, + "loss": 0.6605, + "step": 648 + }, + { + "epoch": 0.6922666666666667, + "grad_norm": 0.28215979461344326, + "learning_rate": 4.567158100954083e-05, + "loss": 0.6455, + "step": 649 + }, + { + "epoch": 0.6933333333333334, + "grad_norm": 0.30445950051799525, + "learning_rate": 4.53814309401951e-05, + "loss": 0.6693, + "step": 650 + }, + { + "epoch": 0.6944, + "grad_norm": 0.289393432300126, + "learning_rate": 4.509193470550056e-05, + "loss": 0.6626, + "step": 651 + }, + { + "epoch": 0.6954666666666667, + "grad_norm": 0.29159567744282455, + "learning_rate": 4.4803095770994106e-05, + "loss": 0.6772, + "step": 652 + }, + { + "epoch": 0.6965333333333333, + "grad_norm": 0.3153818045447706, + "learning_rate": 4.4514917594344184e-05, + "loss": 0.7167, + "step": 653 + }, + { + "epoch": 0.6976, + "grad_norm": 0.2900916187308584, + "learning_rate": 4.422740362530945e-05, + "loss": 0.6325, + "step": 654 + }, + { + "epoch": 0.6986666666666667, + "grad_norm": 0.35486349786173416, + "learning_rate": 4.3940557305697226e-05, + "loss": 0.6996, + "step": 655 + }, + { + "epoch": 0.6997333333333333, + "grad_norm": 0.2947295595910589, + "learning_rate": 4.3654382069322644e-05, + "loss": 0.6797, + "step": 656 + }, + { + "epoch": 0.7008, + "grad_norm": 0.29827049603167827, + "learning_rate": 4.3368881341967135e-05, + "loss": 0.6707, + "step": 657 + }, + { + "epoch": 0.7018666666666666, + "grad_norm": 0.2930065259590927, + "learning_rate": 4.308405854133786e-05, + "loss": 0.6626, + "step": 658 + }, + { + "epoch": 0.7029333333333333, + "grad_norm": 0.2684168453018188, + "learning_rate": 4.2799917077026394e-05, + "loss": 0.6452, + "step": 659 + }, + { + "epoch": 0.704, + "grad_norm": 0.2933993105793805, + "learning_rate": 4.251646035046814e-05, + "loss": 0.6877, + "step": 660 + }, + { + "epoch": 0.7050666666666666, + "grad_norm": 0.28510513967157963, + "learning_rate": 4.223369175490162e-05, + "loss": 0.6395, + "step": 661 + }, + { + "epoch": 0.7061333333333333, + "grad_norm": 0.28430008291021985, + "learning_rate": 4.195161467532769e-05, + "loss": 0.6833, + "step": 662 + }, + { + "epoch": 0.7072, + "grad_norm": 0.3098944003473299, + "learning_rate": 4.167023248846925e-05, + "loss": 0.7245, + "step": 663 + }, + { + "epoch": 0.7082666666666667, + "grad_norm": 0.26285919211526726, + "learning_rate": 4.138954856273054e-05, + "loss": 0.6256, + "step": 664 + }, + { + "epoch": 0.7093333333333334, + "grad_norm": 0.29079985687631693, + "learning_rate": 4.110956625815713e-05, + "loss": 0.6689, + "step": 665 + }, + { + "epoch": 0.7104, + "grad_norm": 0.3038385015807507, + "learning_rate": 4.083028892639541e-05, + "loss": 0.687, + "step": 666 + }, + { + "epoch": 0.7114666666666667, + "grad_norm": 0.31326059865547873, + "learning_rate": 4.055171991065262e-05, + "loss": 0.6164, + "step": 667 + }, + { + "epoch": 0.7125333333333334, + "grad_norm": 0.31823161383866727, + "learning_rate": 4.027386254565688e-05, + "loss": 0.6914, + "step": 668 + }, + { + "epoch": 0.7136, + "grad_norm": 0.28351722713064437, + "learning_rate": 3.9996720157617094e-05, + "loss": 0.6716, + "step": 669 + }, + { + "epoch": 0.7146666666666667, + "grad_norm": 0.2863134817616585, + "learning_rate": 3.972029606418335e-05, + "loss": 0.66, + "step": 670 + }, + { + "epoch": 0.7157333333333333, + "grad_norm": 0.2890830404906038, + "learning_rate": 3.9444593574406915e-05, + "loss": 0.6781, + "step": 671 + }, + { + "epoch": 0.7168, + "grad_norm": 0.3008629834692028, + "learning_rate": 3.9169615988701e-05, + "loss": 0.6478, + "step": 672 + }, + { + "epoch": 0.7178666666666667, + "grad_norm": 0.30229004417007266, + "learning_rate": 3.8895366598800896e-05, + "loss": 0.6962, + "step": 673 + }, + { + "epoch": 0.7189333333333333, + "grad_norm": 0.29431364789393194, + "learning_rate": 3.862184868772473e-05, + "loss": 0.6799, + "step": 674 + }, + { + "epoch": 0.72, + "grad_norm": 0.2846214083775447, + "learning_rate": 3.834906552973424e-05, + "loss": 0.6785, + "step": 675 + }, + { + "epoch": 0.7210666666666666, + "grad_norm": 0.3012354151426349, + "learning_rate": 3.807702039029539e-05, + "loss": 0.7035, + "step": 676 + }, + { + "epoch": 0.7221333333333333, + "grad_norm": 0.3063494102115359, + "learning_rate": 3.780571652603949e-05, + "loss": 0.6569, + "step": 677 + }, + { + "epoch": 0.7232, + "grad_norm": 0.29788653775709056, + "learning_rate": 3.753515718472402e-05, + "loss": 0.6765, + "step": 678 + }, + { + "epoch": 0.7242666666666666, + "grad_norm": 0.2664276148587841, + "learning_rate": 3.726534560519381e-05, + "loss": 0.6371, + "step": 679 + }, + { + "epoch": 0.7253333333333334, + "grad_norm": 0.3041428329482516, + "learning_rate": 3.6996285017342406e-05, + "loss": 0.6453, + "step": 680 + }, + { + "epoch": 0.7264, + "grad_norm": 0.2907425254813268, + "learning_rate": 3.672797864207316e-05, + "loss": 0.6739, + "step": 681 + }, + { + "epoch": 0.7274666666666667, + "grad_norm": 0.27481374083401466, + "learning_rate": 3.646042969126093e-05, + "loss": 0.6422, + "step": 682 + }, + { + "epoch": 0.7285333333333334, + "grad_norm": 0.28869767017809994, + "learning_rate": 3.619364136771337e-05, + "loss": 0.685, + "step": 683 + }, + { + "epoch": 0.7296, + "grad_norm": 0.27629920246099, + "learning_rate": 3.5927616865132884e-05, + "loss": 0.62, + "step": 684 + }, + { + "epoch": 0.7306666666666667, + "grad_norm": 0.36573334865015195, + "learning_rate": 3.566235936807808e-05, + "loss": 0.6385, + "step": 685 + }, + { + "epoch": 0.7317333333333333, + "grad_norm": 0.3306751761302303, + "learning_rate": 3.539787205192586e-05, + "loss": 0.7023, + "step": 686 + }, + { + "epoch": 0.7328, + "grad_norm": 0.29786552166099595, + "learning_rate": 3.513415808283341e-05, + "loss": 0.6945, + "step": 687 + }, + { + "epoch": 0.7338666666666667, + "grad_norm": 0.3200615573895231, + "learning_rate": 3.4871220617700126e-05, + "loss": 0.697, + "step": 688 + }, + { + "epoch": 0.7349333333333333, + "grad_norm": 0.2917494354935264, + "learning_rate": 3.460906280413007e-05, + "loss": 0.6687, + "step": 689 + }, + { + "epoch": 0.736, + "grad_norm": 0.2881025206406779, + "learning_rate": 3.4347687780394e-05, + "loss": 0.65, + "step": 690 + }, + { + "epoch": 0.7370666666666666, + "grad_norm": 0.277026993832784, + "learning_rate": 3.4087098675392104e-05, + "loss": 0.6642, + "step": 691 + }, + { + "epoch": 0.7381333333333333, + "grad_norm": 0.28718483143700635, + "learning_rate": 3.382729860861632e-05, + "loss": 0.6399, + "step": 692 + }, + { + "epoch": 0.7392, + "grad_norm": 0.2861967168150272, + "learning_rate": 3.3568290690113034e-05, + "loss": 0.6412, + "step": 693 + }, + { + "epoch": 0.7402666666666666, + "grad_norm": 0.28223210863198006, + "learning_rate": 3.331007802044601e-05, + "loss": 0.6248, + "step": 694 + }, + { + "epoch": 0.7413333333333333, + "grad_norm": 0.29237354581718883, + "learning_rate": 3.305266369065901e-05, + "loss": 0.6278, + "step": 695 + }, + { + "epoch": 0.7424, + "grad_norm": 0.30077367438476793, + "learning_rate": 3.279605078223906e-05, + "loss": 0.651, + "step": 696 + }, + { + "epoch": 0.7434666666666667, + "grad_norm": 0.30182926963265716, + "learning_rate": 3.25402423670793e-05, + "loss": 0.6802, + "step": 697 + }, + { + "epoch": 0.7445333333333334, + "grad_norm": 0.27123008878769184, + "learning_rate": 3.228524150744249e-05, + "loss": 0.6459, + "step": 698 + }, + { + "epoch": 0.7456, + "grad_norm": 0.31528566649855466, + "learning_rate": 3.2031051255924085e-05, + "loss": 0.7312, + "step": 699 + }, + { + "epoch": 0.7466666666666667, + "grad_norm": 0.2888019042238811, + "learning_rate": 3.1777674655415834e-05, + "loss": 0.5961, + "step": 700 + }, + { + "epoch": 0.7477333333333334, + "grad_norm": 0.28995831934582733, + "learning_rate": 3.1525114739069415e-05, + "loss": 0.6922, + "step": 701 + }, + { + "epoch": 0.7488, + "grad_norm": 0.2980666425502893, + "learning_rate": 3.127337453025994e-05, + "loss": 0.6972, + "step": 702 + }, + { + "epoch": 0.7498666666666667, + "grad_norm": 0.2940912890470747, + "learning_rate": 3.102245704254995e-05, + "loss": 0.6777, + "step": 703 + }, + { + "epoch": 0.7509333333333333, + "grad_norm": 0.27656670242701575, + "learning_rate": 3.077236527965318e-05, + "loss": 0.6446, + "step": 704 + }, + { + "epoch": 0.752, + "grad_norm": 0.2728768920242138, + "learning_rate": 3.0523102235398714e-05, + "loss": 0.6429, + "step": 705 + }, + { + "epoch": 0.7530666666666667, + "grad_norm": 0.2651053636028773, + "learning_rate": 3.0274670893695147e-05, + "loss": 0.5969, + "step": 706 + }, + { + "epoch": 0.7541333333333333, + "grad_norm": 0.290043020958257, + "learning_rate": 3.002707422849472e-05, + "loss": 0.6808, + "step": 707 + }, + { + "epoch": 0.7552, + "grad_norm": 0.30388118204687087, + "learning_rate": 2.978031520375798e-05, + "loss": 0.6818, + "step": 708 + }, + { + "epoch": 0.7562666666666666, + "grad_norm": 0.2872867060365483, + "learning_rate": 2.9534396773417994e-05, + "loss": 0.6322, + "step": 709 + }, + { + "epoch": 0.7573333333333333, + "grad_norm": 0.3012089202756257, + "learning_rate": 2.9289321881345254e-05, + "loss": 0.6655, + "step": 710 + }, + { + "epoch": 0.7584, + "grad_norm": 0.28988814577536387, + "learning_rate": 2.9045093461312258e-05, + "loss": 0.6522, + "step": 711 + }, + { + "epoch": 0.7594666666666666, + "grad_norm": 0.28659804356967133, + "learning_rate": 2.8801714436958416e-05, + "loss": 0.6688, + "step": 712 + }, + { + "epoch": 0.7605333333333333, + "grad_norm": 0.27869443672430017, + "learning_rate": 2.855918772175522e-05, + "loss": 0.6187, + "step": 713 + }, + { + "epoch": 0.7616, + "grad_norm": 0.3218060159736909, + "learning_rate": 2.8317516218971073e-05, + "loss": 0.6971, + "step": 714 + }, + { + "epoch": 0.7626666666666667, + "grad_norm": 0.2735284606053201, + "learning_rate": 2.8076702821636867e-05, + "loss": 0.6375, + "step": 715 + }, + { + "epoch": 0.7637333333333334, + "grad_norm": 0.2890031173362515, + "learning_rate": 2.7836750412511016e-05, + "loss": 0.6506, + "step": 716 + }, + { + "epoch": 0.7648, + "grad_norm": 0.29823710588390784, + "learning_rate": 2.7597661864045233e-05, + "loss": 0.7002, + "step": 717 + }, + { + "epoch": 0.7658666666666667, + "grad_norm": 0.32196430556211336, + "learning_rate": 2.735944003834997e-05, + "loss": 0.6594, + "step": 718 + }, + { + "epoch": 0.7669333333333334, + "grad_norm": 0.2952511186657231, + "learning_rate": 2.7122087787160166e-05, + "loss": 0.6295, + "step": 719 + }, + { + "epoch": 0.768, + "grad_norm": 0.2781234638273664, + "learning_rate": 2.688560795180126e-05, + "loss": 0.6578, + "step": 720 + }, + { + "epoch": 0.7690666666666667, + "grad_norm": 0.2908767563059737, + "learning_rate": 2.6650003363154963e-05, + "loss": 0.6637, + "step": 721 + }, + { + "epoch": 0.7701333333333333, + "grad_norm": 0.2768874140636074, + "learning_rate": 2.641527684162556e-05, + "loss": 0.6426, + "step": 722 + }, + { + "epoch": 0.7712, + "grad_norm": 0.31144606853893736, + "learning_rate": 2.6181431197105998e-05, + "loss": 0.7357, + "step": 723 + }, + { + "epoch": 0.7722666666666667, + "grad_norm": 0.982380863411313, + "learning_rate": 2.5948469228944318e-05, + "loss": 0.6872, + "step": 724 + }, + { + "epoch": 0.7733333333333333, + "grad_norm": 0.2879611855890254, + "learning_rate": 2.5716393725910215e-05, + "loss": 0.6256, + "step": 725 + }, + { + "epoch": 0.7744, + "grad_norm": 0.2974019467722814, + "learning_rate": 2.5485207466161466e-05, + "loss": 0.6188, + "step": 726 + }, + { + "epoch": 0.7754666666666666, + "grad_norm": 0.28614307588026977, + "learning_rate": 2.5254913217210886e-05, + "loss": 0.6449, + "step": 727 + }, + { + "epoch": 0.7765333333333333, + "grad_norm": 0.30905402203640764, + "learning_rate": 2.5025513735893014e-05, + "loss": 0.7016, + "step": 728 + }, + { + "epoch": 0.7776, + "grad_norm": 0.30339890688065574, + "learning_rate": 2.47970117683313e-05, + "loss": 0.6477, + "step": 729 + }, + { + "epoch": 0.7786666666666666, + "grad_norm": 0.2700728879961785, + "learning_rate": 2.4569410049905016e-05, + "loss": 0.6425, + "step": 730 + }, + { + "epoch": 0.7797333333333333, + "grad_norm": 0.28514224544409006, + "learning_rate": 2.434271130521666e-05, + "loss": 0.6494, + "step": 731 + }, + { + "epoch": 0.7808, + "grad_norm": 0.2952141878210136, + "learning_rate": 2.411691824805934e-05, + "loss": 0.6424, + "step": 732 + }, + { + "epoch": 0.7818666666666667, + "grad_norm": 0.28925194465207066, + "learning_rate": 2.389203358138419e-05, + "loss": 0.6669, + "step": 733 + }, + { + "epoch": 0.7829333333333334, + "grad_norm": 0.26771131590299224, + "learning_rate": 2.3668059997268144e-05, + "loss": 0.6254, + "step": 734 + }, + { + "epoch": 0.784, + "grad_norm": 0.29576421959665344, + "learning_rate": 2.3445000176881537e-05, + "loss": 0.6986, + "step": 735 + }, + { + "epoch": 0.7850666666666667, + "grad_norm": 0.310596638325244, + "learning_rate": 2.3222856790456226e-05, + "loss": 0.6636, + "step": 736 + }, + { + "epoch": 0.7861333333333334, + "grad_norm": 0.30693727798424675, + "learning_rate": 2.3001632497253424e-05, + "loss": 0.6667, + "step": 737 + }, + { + "epoch": 0.7872, + "grad_norm": 0.2626316622716067, + "learning_rate": 2.2781329945531936e-05, + "loss": 0.6046, + "step": 738 + }, + { + "epoch": 0.7882666666666667, + "grad_norm": 0.28520013572402136, + "learning_rate": 2.2561951772516587e-05, + "loss": 0.607, + "step": 739 + }, + { + "epoch": 0.7893333333333333, + "grad_norm": 0.2787390836412758, + "learning_rate": 2.2343500604366374e-05, + "loss": 0.6679, + "step": 740 + }, + { + "epoch": 0.7904, + "grad_norm": 0.2675406467690718, + "learning_rate": 2.2125979056143364e-05, + "loss": 0.6282, + "step": 741 + }, + { + "epoch": 0.7914666666666667, + "grad_norm": 0.29308253389415895, + "learning_rate": 2.190938973178105e-05, + "loss": 0.6745, + "step": 742 + }, + { + "epoch": 0.7925333333333333, + "grad_norm": 0.2963475640253147, + "learning_rate": 2.169373522405349e-05, + "loss": 0.6075, + "step": 743 + }, + { + "epoch": 0.7936, + "grad_norm": 0.3029314372436365, + "learning_rate": 2.1479018114544026e-05, + "loss": 0.6816, + "step": 744 + }, + { + "epoch": 0.7946666666666666, + "grad_norm": 0.33429077648590755, + "learning_rate": 2.1265240973614486e-05, + "loss": 0.6748, + "step": 745 + }, + { + "epoch": 0.7957333333333333, + "grad_norm": 0.2914498582316359, + "learning_rate": 2.105240636037449e-05, + "loss": 0.6704, + "step": 746 + }, + { + "epoch": 0.7968, + "grad_norm": 0.2751680280645356, + "learning_rate": 2.0840516822650614e-05, + "loss": 0.6588, + "step": 747 + }, + { + "epoch": 0.7978666666666666, + "grad_norm": 0.2806967013216181, + "learning_rate": 2.0629574896956126e-05, + "loss": 0.6421, + "step": 748 + }, + { + "epoch": 0.7989333333333334, + "grad_norm": 0.2938583566093967, + "learning_rate": 2.0419583108460418e-05, + "loss": 0.6584, + "step": 749 + }, + { + "epoch": 0.8, + "grad_norm": 0.2648036676215061, + "learning_rate": 2.0210543970958872e-05, + "loss": 0.6172, + "step": 750 + }, + { + "epoch": 0.8010666666666667, + "grad_norm": 0.30923057214752575, + "learning_rate": 2.0002459986842825e-05, + "loss": 0.6791, + "step": 751 + }, + { + "epoch": 0.8021333333333334, + "grad_norm": 0.30555190499212365, + "learning_rate": 1.9795333647069448e-05, + "loss": 0.7012, + "step": 752 + }, + { + "epoch": 0.8032, + "grad_norm": 0.2892376855641532, + "learning_rate": 1.958916743113214e-05, + "loss": 0.6503, + "step": 753 + }, + { + "epoch": 0.8042666666666667, + "grad_norm": 0.3130207684063405, + "learning_rate": 1.93839638070306e-05, + "loss": 0.688, + "step": 754 + }, + { + "epoch": 0.8053333333333333, + "grad_norm": 0.3038811959433142, + "learning_rate": 1.9179725231241564e-05, + "loss": 0.6412, + "step": 755 + }, + { + "epoch": 0.8064, + "grad_norm": 0.27038678722207404, + "learning_rate": 1.8976454148689127e-05, + "loss": 0.6132, + "step": 756 + }, + { + "epoch": 0.8074666666666667, + "grad_norm": 0.32290200573388256, + "learning_rate": 1.877415299271561e-05, + "loss": 0.6892, + "step": 757 + }, + { + "epoch": 0.8085333333333333, + "grad_norm": 0.29953694143025295, + "learning_rate": 1.857282418505253e-05, + "loss": 0.5971, + "step": 758 + }, + { + "epoch": 0.8096, + "grad_norm": 0.29367145518346255, + "learning_rate": 1.8372470135791344e-05, + "loss": 0.6953, + "step": 759 + }, + { + "epoch": 0.8106666666666666, + "grad_norm": 0.35767355711505755, + "learning_rate": 1.8173093243354878e-05, + "loss": 0.704, + "step": 760 + }, + { + "epoch": 0.8117333333333333, + "grad_norm": 0.3357107557671172, + "learning_rate": 1.7974695894468384e-05, + "loss": 0.7061, + "step": 761 + }, + { + "epoch": 0.8128, + "grad_norm": 0.2668410546595407, + "learning_rate": 1.7777280464131197e-05, + "loss": 0.6159, + "step": 762 + }, + { + "epoch": 0.8138666666666666, + "grad_norm": 0.24192534198216858, + "learning_rate": 1.7580849315588068e-05, + "loss": 0.5873, + "step": 763 + }, + { + "epoch": 0.8149333333333333, + "grad_norm": 0.2690632019973026, + "learning_rate": 1.7385404800301007e-05, + "loss": 0.6209, + "step": 764 + }, + { + "epoch": 0.816, + "grad_norm": 0.2932124349468624, + "learning_rate": 1.7190949257921196e-05, + "loss": 0.6489, + "step": 765 + }, + { + "epoch": 0.8170666666666667, + "grad_norm": 0.2973507608728614, + "learning_rate": 1.6997485016260793e-05, + "loss": 0.6502, + "step": 766 + }, + { + "epoch": 0.8181333333333334, + "grad_norm": 0.27381684125841704, + "learning_rate": 1.680501439126525e-05, + "loss": 0.6415, + "step": 767 + }, + { + "epoch": 0.8192, + "grad_norm": 0.3001301983794928, + "learning_rate": 1.6613539686985458e-05, + "loss": 0.6475, + "step": 768 + }, + { + "epoch": 0.8202666666666667, + "grad_norm": 0.3189832068704782, + "learning_rate": 1.642306319555027e-05, + "loss": 0.659, + "step": 769 + }, + { + "epoch": 0.8213333333333334, + "grad_norm": 0.28189982638652405, + "learning_rate": 1.6233587197138968e-05, + "loss": 0.6468, + "step": 770 + }, + { + "epoch": 0.8224, + "grad_norm": 0.30258418883378585, + "learning_rate": 1.6045113959953985e-05, + "loss": 0.6568, + "step": 771 + }, + { + "epoch": 0.8234666666666667, + "grad_norm": 0.31503912109399296, + "learning_rate": 1.585764574019388e-05, + "loss": 0.6878, + "step": 772 + }, + { + "epoch": 0.8245333333333333, + "grad_norm": 0.2980185956983998, + "learning_rate": 1.5671184782026106e-05, + "loss": 0.6806, + "step": 773 + }, + { + "epoch": 0.8256, + "grad_norm": 0.28729609894612906, + "learning_rate": 1.548573331756038e-05, + "loss": 0.6475, + "step": 774 + }, + { + "epoch": 0.8266666666666667, + "grad_norm": 0.290456116643064, + "learning_rate": 1.530129356682175e-05, + "loss": 0.6642, + "step": 775 + }, + { + "epoch": 0.8277333333333333, + "grad_norm": 0.2676401795213975, + "learning_rate": 1.5117867737724134e-05, + "loss": 0.6322, + "step": 776 + }, + { + "epoch": 0.8288, + "grad_norm": 0.291436408710233, + "learning_rate": 1.4935458026043959e-05, + "loss": 0.6869, + "step": 777 + }, + { + "epoch": 0.8298666666666666, + "grad_norm": 0.28161522370484643, + "learning_rate": 1.4754066615393668e-05, + "loss": 0.6575, + "step": 778 + }, + { + "epoch": 0.8309333333333333, + "grad_norm": 0.25659119663294533, + "learning_rate": 1.457369567719581e-05, + "loss": 0.6293, + "step": 779 + }, + { + "epoch": 0.832, + "grad_norm": 0.2677282975104176, + "learning_rate": 1.4394347370656836e-05, + "loss": 0.6139, + "step": 780 + }, + { + "epoch": 0.8330666666666666, + "grad_norm": 0.2669011346374821, + "learning_rate": 1.4216023842741455e-05, + "loss": 0.6118, + "step": 781 + }, + { + "epoch": 0.8341333333333333, + "grad_norm": 0.29119163663377057, + "learning_rate": 1.4038727228146753e-05, + "loss": 0.6697, + "step": 782 + }, + { + "epoch": 0.8352, + "grad_norm": 0.2848690193602461, + "learning_rate": 1.3862459649276715e-05, + "loss": 0.637, + "step": 783 + }, + { + "epoch": 0.8362666666666667, + "grad_norm": 0.258917119913294, + "learning_rate": 1.3687223216216904e-05, + "loss": 0.6252, + "step": 784 + }, + { + "epoch": 0.8373333333333334, + "grad_norm": 0.28269640609184815, + "learning_rate": 1.3513020026709023e-05, + "loss": 0.6346, + "step": 785 + }, + { + "epoch": 0.8384, + "grad_norm": 0.28282334610546944, + "learning_rate": 1.3339852166125954e-05, + "loss": 0.6553, + "step": 786 + }, + { + "epoch": 0.8394666666666667, + "grad_norm": 0.28753951701300084, + "learning_rate": 1.3167721707446678e-05, + "loss": 0.6618, + "step": 787 + }, + { + "epoch": 0.8405333333333334, + "grad_norm": 0.29267116605757804, + "learning_rate": 1.2996630711231616e-05, + "loss": 0.6513, + "step": 788 + }, + { + "epoch": 0.8416, + "grad_norm": 0.3007934582902463, + "learning_rate": 1.2826581225597767e-05, + "loss": 0.6325, + "step": 789 + }, + { + "epoch": 0.8426666666666667, + "grad_norm": 0.2956581913246511, + "learning_rate": 1.26575752861943e-05, + "loss": 0.6602, + "step": 790 + }, + { + "epoch": 0.8437333333333333, + "grad_norm": 0.25926169436931107, + "learning_rate": 1.248961491617826e-05, + "loss": 0.6141, + "step": 791 + }, + { + "epoch": 0.8448, + "grad_norm": 0.27382235573934904, + "learning_rate": 1.2322702126190156e-05, + "loss": 0.6434, + "step": 792 + }, + { + "epoch": 0.8458666666666667, + "grad_norm": 0.26936270513648847, + "learning_rate": 1.2156838914330072e-05, + "loss": 0.6398, + "step": 793 + }, + { + "epoch": 0.8469333333333333, + "grad_norm": 0.298932807167484, + "learning_rate": 1.1992027266133598e-05, + "loss": 0.703, + "step": 794 + }, + { + "epoch": 0.848, + "grad_norm": 0.2837987918824917, + "learning_rate": 1.1828269154548244e-05, + "loss": 0.6735, + "step": 795 + }, + { + "epoch": 0.8490666666666666, + "grad_norm": 0.287152453838261, + "learning_rate": 1.1665566539909623e-05, + "loss": 0.6303, + "step": 796 + }, + { + "epoch": 0.8501333333333333, + "grad_norm": 0.3067661790509156, + "learning_rate": 1.1503921369918091e-05, + "loss": 0.6821, + "step": 797 + }, + { + "epoch": 0.8512, + "grad_norm": 0.33194786910939317, + "learning_rate": 1.1343335579615467e-05, + "loss": 0.7115, + "step": 798 + }, + { + "epoch": 0.8522666666666666, + "grad_norm": 0.26636863988989745, + "learning_rate": 1.118381109136174e-05, + "loss": 0.6227, + "step": 799 + }, + { + "epoch": 0.8533333333333334, + "grad_norm": 0.2607029457520605, + "learning_rate": 1.1025349814812224e-05, + "loss": 0.6156, + "step": 800 + }, + { + "epoch": 0.8544, + "grad_norm": 0.2963260952267333, + "learning_rate": 1.0867953646894525e-05, + "loss": 0.623, + "step": 801 + }, + { + "epoch": 0.8554666666666667, + "grad_norm": 0.27555470116551417, + "learning_rate": 1.0711624471785986e-05, + "loss": 0.6176, + "step": 802 + }, + { + "epoch": 0.8565333333333334, + "grad_norm": 0.2671017143664467, + "learning_rate": 1.055636416089102e-05, + "loss": 0.6119, + "step": 803 + }, + { + "epoch": 0.8576, + "grad_norm": 0.2743025463236183, + "learning_rate": 1.0402174572818723e-05, + "loss": 0.6139, + "step": 804 + }, + { + "epoch": 0.8586666666666667, + "grad_norm": 0.2901714095169638, + "learning_rate": 1.0249057553360742e-05, + "loss": 0.6618, + "step": 805 + }, + { + "epoch": 0.8597333333333333, + "grad_norm": 0.2738482219598366, + "learning_rate": 1.0097014935468984e-05, + "loss": 0.6259, + "step": 806 + }, + { + "epoch": 0.8608, + "grad_norm": 0.29202567106931127, + "learning_rate": 9.946048539233865e-06, + "loss": 0.6599, + "step": 807 + }, + { + "epoch": 0.8618666666666667, + "grad_norm": 0.3243425703816153, + "learning_rate": 9.796160171862367e-06, + "loss": 0.6358, + "step": 808 + }, + { + "epoch": 0.8629333333333333, + "grad_norm": 0.33226356431758064, + "learning_rate": 9.647351627656543e-06, + "loss": 0.6853, + "step": 809 + }, + { + "epoch": 0.864, + "grad_norm": 0.30129256888672756, + "learning_rate": 9.499624687991871e-06, + "loss": 0.6809, + "step": 810 + }, + { + "epoch": 0.8650666666666667, + "grad_norm": 0.3234482699999095, + "learning_rate": 9.352981121296134e-06, + "loss": 0.6959, + "step": 811 + }, + { + "epoch": 0.8661333333333333, + "grad_norm": 0.3007241867654337, + "learning_rate": 9.207422683028066e-06, + "loss": 0.6534, + "step": 812 + }, + { + "epoch": 0.8672, + "grad_norm": 0.2714633750452451, + "learning_rate": 9.062951115656403e-06, + "loss": 0.6419, + "step": 813 + }, + { + "epoch": 0.8682666666666666, + "grad_norm": 0.3022747403133687, + "learning_rate": 8.919568148639123e-06, + "loss": 0.6165, + "step": 814 + }, + { + "epoch": 0.8693333333333333, + "grad_norm": 0.28264368788822275, + "learning_rate": 8.777275498402548e-06, + "loss": 0.6908, + "step": 815 + }, + { + "epoch": 0.8704, + "grad_norm": 0.2972550502683997, + "learning_rate": 8.636074868320987e-06, + "loss": 0.6598, + "step": 816 + }, + { + "epoch": 0.8714666666666666, + "grad_norm": 0.3568305968989144, + "learning_rate": 8.495967948696192e-06, + "loss": 0.7169, + "step": 817 + }, + { + "epoch": 0.8725333333333334, + "grad_norm": 0.28174549663120607, + "learning_rate": 8.35695641673725e-06, + "loss": 0.6702, + "step": 818 + }, + { + "epoch": 0.8736, + "grad_norm": 0.3045692578885802, + "learning_rate": 8.219041936540395e-06, + "loss": 0.625, + "step": 819 + }, + { + "epoch": 0.8746666666666667, + "grad_norm": 0.2693662188706793, + "learning_rate": 8.082226159069196e-06, + "loss": 0.6227, + "step": 820 + }, + { + "epoch": 0.8757333333333334, + "grad_norm": 0.2880666514678028, + "learning_rate": 7.946510722134692e-06, + "loss": 0.6244, + "step": 821 + }, + { + "epoch": 0.8768, + "grad_norm": 0.2734145366439765, + "learning_rate": 7.811897250375833e-06, + "loss": 0.598, + "step": 822 + }, + { + "epoch": 0.8778666666666667, + "grad_norm": 0.28781638193912634, + "learning_rate": 7.678387355240057e-06, + "loss": 0.6503, + "step": 823 + }, + { + "epoch": 0.8789333333333333, + "grad_norm": 0.3648063431291061, + "learning_rate": 7.5459826349639436e-06, + "loss": 0.7601, + "step": 824 + }, + { + "epoch": 0.88, + "grad_norm": 0.260198966878604, + "learning_rate": 7.4146846745541506e-06, + "loss": 0.607, + "step": 825 + }, + { + "epoch": 0.8810666666666667, + "grad_norm": 0.2940453995650147, + "learning_rate": 7.284495045768325e-06, + "loss": 0.6568, + "step": 826 + }, + { + "epoch": 0.8821333333333333, + "grad_norm": 0.30230013785921245, + "learning_rate": 7.155415307096458e-06, + "loss": 0.6672, + "step": 827 + }, + { + "epoch": 0.8832, + "grad_norm": 0.3185613783366778, + "learning_rate": 7.027447003742071e-06, + "loss": 0.6819, + "step": 828 + }, + { + "epoch": 0.8842666666666666, + "grad_norm": 0.2640093202070568, + "learning_rate": 6.900591667603751e-06, + "loss": 0.6028, + "step": 829 + }, + { + "epoch": 0.8853333333333333, + "grad_norm": 0.2742802557119247, + "learning_rate": 6.774850817256939e-06, + "loss": 0.6684, + "step": 830 + }, + { + "epoch": 0.8864, + "grad_norm": 0.28670109048885123, + "learning_rate": 6.650225957935552e-06, + "loss": 0.6473, + "step": 831 + }, + { + "epoch": 0.8874666666666666, + "grad_norm": 0.5482357298278248, + "learning_rate": 6.5267185815141355e-06, + "loss": 0.5979, + "step": 832 + }, + { + "epoch": 0.8885333333333333, + "grad_norm": 0.2636099474261866, + "learning_rate": 6.40433016648988e-06, + "loss": 0.609, + "step": 833 + }, + { + "epoch": 0.8896, + "grad_norm": 0.2827836449010556, + "learning_rate": 6.283062177965038e-06, + "loss": 0.6358, + "step": 834 + }, + { + "epoch": 0.8906666666666667, + "grad_norm": 0.28553530158050655, + "learning_rate": 6.162916067629254e-06, + "loss": 0.6199, + "step": 835 + }, + { + "epoch": 0.8917333333333334, + "grad_norm": 0.29957610722095446, + "learning_rate": 6.043893273742329e-06, + "loss": 0.6825, + "step": 836 + }, + { + "epoch": 0.8928, + "grad_norm": 0.29158334974628386, + "learning_rate": 5.925995221116853e-06, + "loss": 0.6493, + "step": 837 + }, + { + "epoch": 0.8938666666666667, + "grad_norm": 0.2725657370081221, + "learning_rate": 5.809223321101276e-06, + "loss": 0.6001, + "step": 838 + }, + { + "epoch": 0.8949333333333334, + "grad_norm": 0.2937486643773653, + "learning_rate": 5.693578971562963e-06, + "loss": 0.6055, + "step": 839 + }, + { + "epoch": 0.896, + "grad_norm": 0.29609537125577257, + "learning_rate": 5.5790635568714224e-06, + "loss": 0.6704, + "step": 840 + }, + { + "epoch": 0.8970666666666667, + "grad_norm": 0.2712092200190641, + "learning_rate": 5.465678447881828e-06, + "loss": 0.618, + "step": 841 + }, + { + "epoch": 0.8981333333333333, + "grad_norm": 0.27920206556433336, + "learning_rate": 5.3534250019184774e-06, + "loss": 0.6365, + "step": 842 + }, + { + "epoch": 0.8992, + "grad_norm": 0.2998923856232532, + "learning_rate": 5.242304562758704e-06, + "loss": 0.6897, + "step": 843 + }, + { + "epoch": 0.9002666666666667, + "grad_norm": 0.31632903764839043, + "learning_rate": 5.132318460616625e-06, + "loss": 0.6813, + "step": 844 + }, + { + "epoch": 0.9013333333333333, + "grad_norm": 0.2850338246119351, + "learning_rate": 5.023468012127364e-06, + "loss": 0.6378, + "step": 845 + }, + { + "epoch": 0.9024, + "grad_norm": 0.2759659766451079, + "learning_rate": 4.915754520331173e-06, + "loss": 0.6499, + "step": 846 + }, + { + "epoch": 0.9034666666666666, + "grad_norm": 0.27832284175686606, + "learning_rate": 4.8091792746578935e-06, + "loss": 0.6286, + "step": 847 + }, + { + "epoch": 0.9045333333333333, + "grad_norm": 0.2909582769905804, + "learning_rate": 4.703743550911543e-06, + "loss": 0.6376, + "step": 848 + }, + { + "epoch": 0.9056, + "grad_norm": 0.2946038404031169, + "learning_rate": 4.599448611254964e-06, + "loss": 0.6479, + "step": 849 + }, + { + "epoch": 0.9066666666666666, + "grad_norm": 0.2963271540545558, + "learning_rate": 4.496295704194819e-06, + "loss": 0.6479, + "step": 850 + }, + { + "epoch": 0.9077333333333333, + "grad_norm": 0.30473111778805456, + "learning_rate": 4.394286064566511e-06, + "loss": 0.6919, + "step": 851 + }, + { + "epoch": 0.9088, + "grad_norm": 0.3031317457354698, + "learning_rate": 4.293420913519541e-06, + "loss": 0.6035, + "step": 852 + }, + { + "epoch": 0.9098666666666667, + "grad_norm": 0.34334348312182256, + "learning_rate": 4.193701458502807e-06, + "loss": 0.7114, + "step": 853 + }, + { + "epoch": 0.9109333333333334, + "grad_norm": 0.28957551815838123, + "learning_rate": 4.095128893250156e-06, + "loss": 0.6422, + "step": 854 + }, + { + "epoch": 0.912, + "grad_norm": 0.3065754791370688, + "learning_rate": 3.997704397766122e-06, + "loss": 0.6658, + "step": 855 + }, + { + "epoch": 0.9130666666666667, + "grad_norm": 0.3005845115860106, + "learning_rate": 3.901429138311763e-06, + "loss": 0.6942, + "step": 856 + }, + { + "epoch": 0.9141333333333334, + "grad_norm": 0.3098675540355066, + "learning_rate": 3.80630426739077e-06, + "loss": 0.6429, + "step": 857 + }, + { + "epoch": 0.9152, + "grad_norm": 0.31337222049268587, + "learning_rate": 3.712330923735563e-06, + "loss": 0.6247, + "step": 858 + }, + { + "epoch": 0.9162666666666667, + "grad_norm": 0.2505119049981171, + "learning_rate": 3.6195102322937545e-06, + "loss": 0.5928, + "step": 859 + }, + { + "epoch": 0.9173333333333333, + "grad_norm": 0.28032497510759613, + "learning_rate": 3.5278433042146397e-06, + "loss": 0.6205, + "step": 860 + }, + { + "epoch": 0.9184, + "grad_norm": 0.28657416861883417, + "learning_rate": 3.4373312368358944e-06, + "loss": 0.6377, + "step": 861 + }, + { + "epoch": 0.9194666666666667, + "grad_norm": 0.26747839757924763, + "learning_rate": 3.347975113670454e-06, + "loss": 0.5963, + "step": 862 + }, + { + "epoch": 0.9205333333333333, + "grad_norm": 0.2674669659014821, + "learning_rate": 3.259776004393533e-06, + "loss": 0.6228, + "step": 863 + }, + { + "epoch": 0.9216, + "grad_norm": 0.3021573273362115, + "learning_rate": 3.1727349648298267e-06, + "loss": 0.6704, + "step": 864 + }, + { + "epoch": 0.9226666666666666, + "grad_norm": 0.2880074026527094, + "learning_rate": 3.086853036940862e-06, + "loss": 0.6196, + "step": 865 + }, + { + "epoch": 0.9237333333333333, + "grad_norm": 0.246973360426054, + "learning_rate": 3.0021312488125454e-06, + "loss": 0.5727, + "step": 866 + }, + { + "epoch": 0.9248, + "grad_norm": 0.24550040548276553, + "learning_rate": 2.9185706146428017e-06, + "loss": 0.5936, + "step": 867 + }, + { + "epoch": 0.9258666666666666, + "grad_norm": 0.2809872557601715, + "learning_rate": 2.836172134729509e-06, + "loss": 0.6311, + "step": 868 + }, + { + "epoch": 0.9269333333333334, + "grad_norm": 0.41589770308234464, + "learning_rate": 2.754936795458485e-06, + "loss": 0.6519, + "step": 869 + }, + { + "epoch": 0.928, + "grad_norm": 0.4032233034895968, + "learning_rate": 2.674865569291651e-06, + "loss": 0.6586, + "step": 870 + }, + { + "epoch": 0.9290666666666667, + "grad_norm": 0.3046052266443494, + "learning_rate": 2.5959594147554667e-06, + "loss": 0.6481, + "step": 871 + }, + { + "epoch": 0.9301333333333334, + "grad_norm": 0.2945935251929872, + "learning_rate": 2.5182192764293567e-06, + "loss": 0.6257, + "step": 872 + }, + { + "epoch": 0.9312, + "grad_norm": 0.29713554063421327, + "learning_rate": 2.4416460849345123e-06, + "loss": 0.6704, + "step": 873 + }, + { + "epoch": 0.9322666666666667, + "grad_norm": 0.28299124531891034, + "learning_rate": 2.366240756922644e-06, + "loss": 0.6435, + "step": 874 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 0.29507483432351955, + "learning_rate": 2.2920041950650783e-06, + "loss": 0.689, + "step": 875 + }, + { + "epoch": 0.9344, + "grad_norm": 0.2751642964206543, + "learning_rate": 2.218937288041956e-06, + "loss": 0.6516, + "step": 876 + }, + { + "epoch": 0.9354666666666667, + "grad_norm": 0.27890844581327046, + "learning_rate": 2.1470409105315283e-06, + "loss": 0.6187, + "step": 877 + }, + { + "epoch": 0.9365333333333333, + "grad_norm": 0.2927404678505194, + "learning_rate": 2.0763159231997674e-06, + "loss": 0.625, + "step": 878 + }, + { + "epoch": 0.9376, + "grad_norm": 0.2574656127587245, + "learning_rate": 2.0067631726899962e-06, + "loss": 0.5824, + "step": 879 + }, + { + "epoch": 0.9386666666666666, + "grad_norm": 0.32738864261467976, + "learning_rate": 1.938383491612794e-06, + "loss": 0.693, + "step": 880 + }, + { + "epoch": 0.9397333333333333, + "grad_norm": 0.2923653428142808, + "learning_rate": 1.8711776985360308e-06, + "loss": 0.6562, + "step": 881 + }, + { + "epoch": 0.9408, + "grad_norm": 0.28754266408852613, + "learning_rate": 1.805146597975016e-06, + "loss": 0.6359, + "step": 882 + }, + { + "epoch": 0.9418666666666666, + "grad_norm": 0.2793573603542659, + "learning_rate": 1.7402909803829525e-06, + "loss": 0.6263, + "step": 883 + }, + { + "epoch": 0.9429333333333333, + "grad_norm": 0.3242947248714171, + "learning_rate": 1.6766116221413774e-06, + "loss": 0.6826, + "step": 884 + }, + { + "epoch": 0.944, + "grad_norm": 0.302229428057383, + "learning_rate": 1.61410928555098e-06, + "loss": 0.6544, + "step": 885 + }, + { + "epoch": 0.9450666666666667, + "grad_norm": 0.2947576789054036, + "learning_rate": 1.5527847188223644e-06, + "loss": 0.6433, + "step": 886 + }, + { + "epoch": 0.9461333333333334, + "grad_norm": 0.30696808944507264, + "learning_rate": 1.4926386560671358e-06, + "loss": 0.6502, + "step": 887 + }, + { + "epoch": 0.9472, + "grad_norm": 0.2950325951795179, + "learning_rate": 1.433671817289184e-06, + "loss": 0.6377, + "step": 888 + }, + { + "epoch": 0.9482666666666667, + "grad_norm": 0.29142776403284254, + "learning_rate": 1.3758849083759352e-06, + "loss": 0.6108, + "step": 889 + }, + { + "epoch": 0.9493333333333334, + "grad_norm": 0.30799952597440694, + "learning_rate": 1.3192786210900033e-06, + "loss": 0.6893, + "step": 890 + }, + { + "epoch": 0.9504, + "grad_norm": 0.2649671511757792, + "learning_rate": 1.2638536330608408e-06, + "loss": 0.5532, + "step": 891 + }, + { + "epoch": 0.9514666666666667, + "grad_norm": 0.27907304352974543, + "learning_rate": 1.2096106077767011e-06, + "loss": 0.6194, + "step": 892 + }, + { + "epoch": 0.9525333333333333, + "grad_norm": 0.28586285709978776, + "learning_rate": 1.1565501945766222e-06, + "loss": 0.6213, + "step": 893 + }, + { + "epoch": 0.9536, + "grad_norm": 0.31334672185860857, + "learning_rate": 1.1046730286426775e-06, + "loss": 0.6706, + "step": 894 + }, + { + "epoch": 0.9546666666666667, + "grad_norm": 0.27241147401690935, + "learning_rate": 1.053979730992416e-06, + "loss": 0.6247, + "step": 895 + }, + { + "epoch": 0.9557333333333333, + "grad_norm": 0.30775552519743304, + "learning_rate": 1.0044709084713554e-06, + "loss": 0.6552, + "step": 896 + }, + { + "epoch": 0.9568, + "grad_norm": 0.2756705452562053, + "learning_rate": 9.56147153745779e-07, + "loss": 0.5943, + "step": 897 + }, + { + "epoch": 0.9578666666666666, + "grad_norm": 0.27034826424595554, + "learning_rate": 9.090090452955835e-07, + "loss": 0.6033, + "step": 898 + }, + { + "epoch": 0.9589333333333333, + "grad_norm": 0.3245743296461917, + "learning_rate": 8.630571474074311e-07, + "loss": 0.7088, + "step": 899 + }, + { + "epoch": 0.96, + "grad_norm": 0.28363528174996244, + "learning_rate": 8.182920101679092e-07, + "loss": 0.6403, + "step": 900 + }, + { + "epoch": 0.9610666666666666, + "grad_norm": 0.2894840028292518, + "learning_rate": 7.747141694570026e-07, + "loss": 0.619, + "step": 901 + }, + { + "epoch": 0.9621333333333333, + "grad_norm": 0.3054324030772581, + "learning_rate": 7.323241469416764e-07, + "loss": 0.7242, + "step": 902 + }, + { + "epoch": 0.9632, + "grad_norm": 0.2992287194918836, + "learning_rate": 6.911224500695702e-07, + "loss": 0.6688, + "step": 903 + }, + { + "epoch": 0.9642666666666667, + "grad_norm": 0.26701348964873584, + "learning_rate": 6.511095720630244e-07, + "loss": 0.6208, + "step": 904 + }, + { + "epoch": 0.9653333333333334, + "grad_norm": 0.2903174348444887, + "learning_rate": 6.122859919130974e-07, + "loss": 0.6571, + "step": 905 + }, + { + "epoch": 0.9664, + "grad_norm": 0.29368806634671657, + "learning_rate": 5.746521743738354e-07, + "loss": 0.6569, + "step": 906 + }, + { + "epoch": 0.9674666666666667, + "grad_norm": 0.267650628468648, + "learning_rate": 5.382085699567552e-07, + "loss": 0.5774, + "step": 907 + }, + { + "epoch": 0.9685333333333334, + "grad_norm": 0.27530275791049835, + "learning_rate": 5.029556149254266e-07, + "loss": 0.6016, + "step": 908 + }, + { + "epoch": 0.9696, + "grad_norm": 0.28684894549841344, + "learning_rate": 4.6889373129022085e-07, + "loss": 0.6626, + "step": 909 + }, + { + "epoch": 0.9706666666666667, + "grad_norm": 0.2858549518354566, + "learning_rate": 4.3602332680331425e-07, + "loss": 0.642, + "step": 910 + }, + { + "epoch": 0.9717333333333333, + "grad_norm": 0.31470427156630587, + "learning_rate": 4.0434479495378155e-07, + "loss": 0.6986, + "step": 911 + }, + { + "epoch": 0.9728, + "grad_norm": 0.29453088944233013, + "learning_rate": 3.7385851496284374e-07, + "loss": 0.6459, + "step": 912 + }, + { + "epoch": 0.9738666666666667, + "grad_norm": 0.2919790424173549, + "learning_rate": 3.445648517793942e-07, + "loss": 0.6313, + "step": 913 + }, + { + "epoch": 0.9749333333333333, + "grad_norm": 0.2903855362643007, + "learning_rate": 3.164641560756132e-07, + "loss": 0.6834, + "step": 914 + }, + { + "epoch": 0.976, + "grad_norm": 0.31817553626214073, + "learning_rate": 2.895567642427488e-07, + "loss": 0.6859, + "step": 915 + }, + { + "epoch": 0.9770666666666666, + "grad_norm": 0.2939831986766623, + "learning_rate": 2.638429983870983e-07, + "loss": 0.685, + "step": 916 + }, + { + "epoch": 0.9781333333333333, + "grad_norm": 0.3222959683434266, + "learning_rate": 2.3932316632614416e-07, + "loss": 0.7028, + "step": 917 + }, + { + "epoch": 0.9792, + "grad_norm": 0.32057820854732816, + "learning_rate": 2.15997561584913e-07, + "loss": 0.6953, + "step": 918 + }, + { + "epoch": 0.9802666666666666, + "grad_norm": 0.3059488091081678, + "learning_rate": 1.9386646339238924e-07, + "loss": 0.6399, + "step": 919 + }, + { + "epoch": 0.9813333333333333, + "grad_norm": 0.2939641229503454, + "learning_rate": 1.7293013667825098e-07, + "loss": 0.6716, + "step": 920 + }, + { + "epoch": 0.9824, + "grad_norm": 0.2994233155434253, + "learning_rate": 1.5318883206962842e-07, + "loss": 0.6587, + "step": 921 + }, + { + "epoch": 0.9834666666666667, + "grad_norm": 0.33888384623850104, + "learning_rate": 1.3464278588815048e-07, + "loss": 0.6649, + "step": 922 + }, + { + "epoch": 0.9845333333333334, + "grad_norm": 0.325329202626534, + "learning_rate": 1.1729222014709162e-07, + "loss": 0.6735, + "step": 923 + }, + { + "epoch": 0.9856, + "grad_norm": 0.3105290082581944, + "learning_rate": 1.0113734254872942e-07, + "loss": 0.6546, + "step": 924 + }, + { + "epoch": 0.9866666666666667, + "grad_norm": 0.3276838609071141, + "learning_rate": 8.617834648185774e-08, + "loss": 0.625, + "step": 925 + }, + { + "epoch": 0.9877333333333334, + "grad_norm": 0.2948843407042844, + "learning_rate": 7.241541101945526e-08, + "loss": 0.6566, + "step": 926 + }, + { + "epoch": 0.9888, + "grad_norm": 0.3143572595824, + "learning_rate": 5.984870091654271e-08, + "loss": 0.6728, + "step": 927 + }, + { + "epoch": 0.9898666666666667, + "grad_norm": 0.29404292704417323, + "learning_rate": 4.847836660824001e-08, + "loss": 0.6596, + "step": 928 + }, + { + "epoch": 0.9909333333333333, + "grad_norm": 0.3604294327978392, + "learning_rate": 3.8304544207945495e-08, + "loss": 0.6449, + "step": 929 + }, + { + "epoch": 0.992, + "grad_norm": 0.274139263231835, + "learning_rate": 2.9327355505681663e-08, + "loss": 0.6238, + "step": 930 + }, + { + "epoch": 0.9930666666666667, + "grad_norm": 0.27272103294667494, + "learning_rate": 2.1546907966685236e-08, + "loss": 0.64, + "step": 931 + }, + { + "epoch": 0.9941333333333333, + "grad_norm": 0.2587150141920648, + "learning_rate": 1.496329473008595e-08, + "loss": 0.6438, + "step": 932 + }, + { + "epoch": 0.9952, + "grad_norm": 0.28797703518091533, + "learning_rate": 9.576594607807465e-09, + "loss": 0.705, + "step": 933 + }, + { + "epoch": 0.9962666666666666, + "grad_norm": 0.2683094544959566, + "learning_rate": 5.3868720836236506e-09, + "loss": 0.6221, + "step": 934 + }, + { + "epoch": 0.9973333333333333, + "grad_norm": 0.27433839910302826, + "learning_rate": 2.3941773123814516e-09, + "loss": 0.6151, + "step": 935 + }, + { + "epoch": 0.9984, + "grad_norm": 0.2761447567453666, + "learning_rate": 5.985461193791509e-10, + "loss": 0.6213, + "step": 936 + }, + { + "epoch": 0.9994666666666666, + "grad_norm": 0.46724154826656267, + "learning_rate": 0.0, + "loss": 0.6136, + "step": 937 + }, + { + "epoch": 0.9994666666666666, + "step": 937, + "total_flos": 2443854479097856.0, + "train_loss": 0.7083672744360143, + "train_runtime": 29188.8069, + "train_samples_per_second": 1.028, + "train_steps_per_second": 0.032 + } + ], + "logging_steps": 1.0, + "max_steps": 937, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2443854479097856.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_2_epochs_1_GA_2_lora/README.md b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_2_epochs_1_GA_2_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_2_epochs_1_GA_2_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_2_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_2_epochs_1_GA_2_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..512cbdf5265a510ca580f516f4cd258cdab4fbf7 --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_2_epochs_1_GA_2_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "down_proj", + "q_proj", + "k_proj", + "o_proj", + "v_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9b7cb1a32600557510d34f2f979ff99ba26a9e62 --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f9535a4f6a448826a21c3a267011f64cd8747b385292cc0463e5b5fc1f24223 +size 671150064 diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_2_epochs_1_GA_2_lora/config.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_2_epochs_1_GA_2_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_2_epochs_1_GA_2_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..45e5ccdd133ad1d7c402b7953a51772bcee080da --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76eee359f5364c4d527493ac0546a952632f483327c3eb87a542d89502502844 +size 918507402 diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_2_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_2_epochs_1_GA_2_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2904470624b646fb832ede5753619c202987ceca --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_2_epochs_1_GA_2_lora/trainer_state.json @@ -0,0 +1,13167 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 1875, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005333333333333334, + "grad_norm": 0.7628357242415116, + "learning_rate": 3.5087719298245615e-06, + "loss": 1.255, + "step": 1 + }, + { + "epoch": 0.0010666666666666667, + "grad_norm": 0.8194757458054642, + "learning_rate": 7.017543859649123e-06, + "loss": 1.2986, + "step": 2 + }, + { + "epoch": 0.0016, + "grad_norm": 0.7958014474683687, + "learning_rate": 1.0526315789473684e-05, + "loss": 1.3299, + "step": 3 + }, + { + "epoch": 0.0021333333333333334, + "grad_norm": 0.7807215158428453, + "learning_rate": 1.4035087719298246e-05, + "loss": 1.3331, + "step": 4 + }, + { + "epoch": 0.0026666666666666666, + "grad_norm": 1.10867974246785, + "learning_rate": 1.7543859649122806e-05, + "loss": 1.1955, + "step": 5 + }, + { + "epoch": 0.0032, + "grad_norm": 0.7501373704184361, + "learning_rate": 2.105263157894737e-05, + "loss": 1.3464, + "step": 6 + }, + { + "epoch": 0.0037333333333333333, + "grad_norm": 0.7594642077016195, + "learning_rate": 2.456140350877193e-05, + "loss": 1.322, + "step": 7 + }, + { + "epoch": 0.004266666666666667, + "grad_norm": 0.6424306160023425, + "learning_rate": 2.8070175438596492e-05, + "loss": 1.1769, + "step": 8 + }, + { + "epoch": 0.0048, + "grad_norm": 0.6621813680604755, + "learning_rate": 3.157894736842105e-05, + "loss": 1.1253, + "step": 9 + }, + { + "epoch": 0.005333333333333333, + "grad_norm": 0.6072150043621733, + "learning_rate": 3.508771929824561e-05, + "loss": 1.0355, + "step": 10 + }, + { + "epoch": 0.005866666666666667, + "grad_norm": 0.9512685642147044, + "learning_rate": 3.859649122807018e-05, + "loss": 1.0948, + "step": 11 + }, + { + "epoch": 0.0064, + "grad_norm": 0.9654646660953872, + "learning_rate": 4.210526315789474e-05, + "loss": 0.9527, + "step": 12 + }, + { + "epoch": 0.006933333333333333, + "grad_norm": 0.7158956275127898, + "learning_rate": 4.56140350877193e-05, + "loss": 0.9619, + "step": 13 + }, + { + "epoch": 0.007466666666666667, + "grad_norm": 0.7510755826538486, + "learning_rate": 4.912280701754386e-05, + "loss": 0.9568, + "step": 14 + }, + { + "epoch": 0.008, + "grad_norm": 0.6761493950332309, + "learning_rate": 5.2631578947368424e-05, + "loss": 0.9471, + "step": 15 + }, + { + "epoch": 0.008533333333333334, + "grad_norm": 0.6442709212470771, + "learning_rate": 5.6140350877192984e-05, + "loss": 0.9905, + "step": 16 + }, + { + "epoch": 0.009066666666666667, + "grad_norm": 0.6140912856378853, + "learning_rate": 5.9649122807017544e-05, + "loss": 0.8823, + "step": 17 + }, + { + "epoch": 0.0096, + "grad_norm": 0.5927185337725561, + "learning_rate": 6.31578947368421e-05, + "loss": 0.9203, + "step": 18 + }, + { + "epoch": 0.010133333333333333, + "grad_norm": 0.5885011384967356, + "learning_rate": 6.666666666666667e-05, + "loss": 0.8946, + "step": 19 + }, + { + "epoch": 0.010666666666666666, + "grad_norm": 0.569978601461122, + "learning_rate": 7.017543859649122e-05, + "loss": 0.9413, + "step": 20 + }, + { + "epoch": 0.0112, + "grad_norm": 0.5450626692942131, + "learning_rate": 7.368421052631579e-05, + "loss": 0.8821, + "step": 21 + }, + { + "epoch": 0.011733333333333333, + "grad_norm": 0.5869407362791327, + "learning_rate": 7.719298245614036e-05, + "loss": 0.9553, + "step": 22 + }, + { + "epoch": 0.012266666666666667, + "grad_norm": 0.5514964151706281, + "learning_rate": 8.070175438596491e-05, + "loss": 0.8503, + "step": 23 + }, + { + "epoch": 0.0128, + "grad_norm": 0.4576203847614647, + "learning_rate": 8.421052631578948e-05, + "loss": 0.8835, + "step": 24 + }, + { + "epoch": 0.013333333333333334, + "grad_norm": 0.4656587189395512, + "learning_rate": 8.771929824561403e-05, + "loss": 0.8603, + "step": 25 + }, + { + "epoch": 0.013866666666666666, + "grad_norm": 0.6353641328243992, + "learning_rate": 9.12280701754386e-05, + "loss": 0.9426, + "step": 26 + }, + { + "epoch": 0.0144, + "grad_norm": 0.4911824668625709, + "learning_rate": 9.473684210526316e-05, + "loss": 0.887, + "step": 27 + }, + { + "epoch": 0.014933333333333333, + "grad_norm": 0.5022750080958399, + "learning_rate": 9.824561403508771e-05, + "loss": 0.8704, + "step": 28 + }, + { + "epoch": 0.015466666666666667, + "grad_norm": 0.5488540150230059, + "learning_rate": 0.0001017543859649123, + "loss": 0.9003, + "step": 29 + }, + { + "epoch": 0.016, + "grad_norm": 0.49086221281679715, + "learning_rate": 0.00010526315789473685, + "loss": 0.8717, + "step": 30 + }, + { + "epoch": 0.016533333333333334, + "grad_norm": 0.4519977501834657, + "learning_rate": 0.00010877192982456141, + "loss": 0.8111, + "step": 31 + }, + { + "epoch": 0.017066666666666667, + "grad_norm": 0.5667972847554228, + "learning_rate": 0.00011228070175438597, + "loss": 0.9719, + "step": 32 + }, + { + "epoch": 0.0176, + "grad_norm": 0.5243396224563087, + "learning_rate": 0.00011578947368421053, + "loss": 0.8918, + "step": 33 + }, + { + "epoch": 0.018133333333333335, + "grad_norm": 0.4206257007883853, + "learning_rate": 0.00011929824561403509, + "loss": 0.8267, + "step": 34 + }, + { + "epoch": 0.018666666666666668, + "grad_norm": 0.4408623734911278, + "learning_rate": 0.00012280701754385965, + "loss": 0.8421, + "step": 35 + }, + { + "epoch": 0.0192, + "grad_norm": 0.4310398203704329, + "learning_rate": 0.0001263157894736842, + "loss": 0.8038, + "step": 36 + }, + { + "epoch": 0.019733333333333332, + "grad_norm": 0.41152712397478197, + "learning_rate": 0.0001298245614035088, + "loss": 0.7113, + "step": 37 + }, + { + "epoch": 0.020266666666666665, + "grad_norm": 0.4698983552275035, + "learning_rate": 0.00013333333333333334, + "loss": 0.8961, + "step": 38 + }, + { + "epoch": 0.0208, + "grad_norm": 0.4496152385894507, + "learning_rate": 0.0001368421052631579, + "loss": 0.8877, + "step": 39 + }, + { + "epoch": 0.021333333333333333, + "grad_norm": 0.5243923818385874, + "learning_rate": 0.00014035087719298245, + "loss": 0.8824, + "step": 40 + }, + { + "epoch": 0.021866666666666666, + "grad_norm": 0.4911900741487491, + "learning_rate": 0.00014385964912280703, + "loss": 0.9176, + "step": 41 + }, + { + "epoch": 0.0224, + "grad_norm": 0.4856857712861841, + "learning_rate": 0.00014736842105263158, + "loss": 0.7998, + "step": 42 + }, + { + "epoch": 0.022933333333333333, + "grad_norm": 0.49859929276212756, + "learning_rate": 0.00015087719298245616, + "loss": 0.8706, + "step": 43 + }, + { + "epoch": 0.023466666666666667, + "grad_norm": 0.4377547229110765, + "learning_rate": 0.0001543859649122807, + "loss": 0.8236, + "step": 44 + }, + { + "epoch": 0.024, + "grad_norm": 0.6531009589606827, + "learning_rate": 0.00015789473684210527, + "loss": 0.8414, + "step": 45 + }, + { + "epoch": 0.024533333333333334, + "grad_norm": 0.5033954891454102, + "learning_rate": 0.00016140350877192982, + "loss": 0.8178, + "step": 46 + }, + { + "epoch": 0.025066666666666668, + "grad_norm": 0.44798636015217747, + "learning_rate": 0.0001649122807017544, + "loss": 0.827, + "step": 47 + }, + { + "epoch": 0.0256, + "grad_norm": 0.45582754630713557, + "learning_rate": 0.00016842105263157895, + "loss": 0.8612, + "step": 48 + }, + { + "epoch": 0.026133333333333335, + "grad_norm": 0.45872918777303245, + "learning_rate": 0.00017192982456140353, + "loss": 0.904, + "step": 49 + }, + { + "epoch": 0.02666666666666667, + "grad_norm": 0.5931742003230699, + "learning_rate": 0.00017543859649122806, + "loss": 0.8941, + "step": 50 + }, + { + "epoch": 0.0272, + "grad_norm": 0.46435386514124116, + "learning_rate": 0.00017894736842105264, + "loss": 0.8459, + "step": 51 + }, + { + "epoch": 0.027733333333333332, + "grad_norm": 0.4103906134060393, + "learning_rate": 0.0001824561403508772, + "loss": 0.802, + "step": 52 + }, + { + "epoch": 0.028266666666666666, + "grad_norm": 0.4527873188642011, + "learning_rate": 0.00018596491228070177, + "loss": 0.8612, + "step": 53 + }, + { + "epoch": 0.0288, + "grad_norm": 0.5377033391169986, + "learning_rate": 0.00018947368421052632, + "loss": 0.8861, + "step": 54 + }, + { + "epoch": 0.029333333333333333, + "grad_norm": 0.44354325477395923, + "learning_rate": 0.00019298245614035088, + "loss": 0.8266, + "step": 55 + }, + { + "epoch": 0.029866666666666666, + "grad_norm": 0.46965440945573167, + "learning_rate": 0.00019649122807017543, + "loss": 0.7932, + "step": 56 + }, + { + "epoch": 0.0304, + "grad_norm": 0.49861368848703796, + "learning_rate": 0.0002, + "loss": 0.9251, + "step": 57 + }, + { + "epoch": 0.030933333333333334, + "grad_norm": 0.5358886796410643, + "learning_rate": 0.00019999985069241055, + "loss": 0.8537, + "step": 58 + }, + { + "epoch": 0.031466666666666664, + "grad_norm": 0.4199111933454957, + "learning_rate": 0.00019999940277008808, + "loss": 0.8055, + "step": 59 + }, + { + "epoch": 0.032, + "grad_norm": 0.4440651139054541, + "learning_rate": 0.00019999865623437013, + "loss": 0.8195, + "step": 60 + }, + { + "epoch": 0.03253333333333333, + "grad_norm": 0.38496763848220006, + "learning_rate": 0.00019999761108748597, + "loss": 0.7665, + "step": 61 + }, + { + "epoch": 0.03306666666666667, + "grad_norm": 0.4601442010805776, + "learning_rate": 0.00019999626733255662, + "loss": 0.7948, + "step": 62 + }, + { + "epoch": 0.0336, + "grad_norm": 0.4538395973660438, + "learning_rate": 0.00019999462497359466, + "loss": 0.8292, + "step": 63 + }, + { + "epoch": 0.034133333333333335, + "grad_norm": 0.4103974919312309, + "learning_rate": 0.00019999268401550447, + "loss": 0.8019, + "step": 64 + }, + { + "epoch": 0.034666666666666665, + "grad_norm": 0.5126054306975563, + "learning_rate": 0.000199990444464082, + "loss": 0.7964, + "step": 65 + }, + { + "epoch": 0.0352, + "grad_norm": 0.42358893462331326, + "learning_rate": 0.00019998790632601496, + "loss": 0.8085, + "step": 66 + }, + { + "epoch": 0.03573333333333333, + "grad_norm": 0.4935773347491403, + "learning_rate": 0.00019998506960888256, + "loss": 0.8605, + "step": 67 + }, + { + "epoch": 0.03626666666666667, + "grad_norm": 0.5105035840801055, + "learning_rate": 0.00019998193432115572, + "loss": 0.8788, + "step": 68 + }, + { + "epoch": 0.0368, + "grad_norm": 0.44823839807865595, + "learning_rate": 0.0001999785004721968, + "loss": 0.8743, + "step": 69 + }, + { + "epoch": 0.037333333333333336, + "grad_norm": 0.4644685890239903, + "learning_rate": 0.00019997476807225985, + "loss": 0.8567, + "step": 70 + }, + { + "epoch": 0.037866666666666667, + "grad_norm": 0.471562980689817, + "learning_rate": 0.0001999707371324904, + "loss": 0.8949, + "step": 71 + }, + { + "epoch": 0.0384, + "grad_norm": 0.41416387277398714, + "learning_rate": 0.00019996640766492543, + "loss": 0.8016, + "step": 72 + }, + { + "epoch": 0.038933333333333334, + "grad_norm": 0.4067330878703305, + "learning_rate": 0.00019996177968249334, + "loss": 0.7976, + "step": 73 + }, + { + "epoch": 0.039466666666666664, + "grad_norm": 0.4680547465198519, + "learning_rate": 0.0001999568531990141, + "loss": 0.8615, + "step": 74 + }, + { + "epoch": 0.04, + "grad_norm": 0.466684378170647, + "learning_rate": 0.00019995162822919883, + "loss": 0.8326, + "step": 75 + }, + { + "epoch": 0.04053333333333333, + "grad_norm": 0.5843105014034774, + "learning_rate": 0.00019994610478865011, + "loss": 0.9223, + "step": 76 + }, + { + "epoch": 0.04106666666666667, + "grad_norm": 0.4516958986843274, + "learning_rate": 0.0001999402828938618, + "loss": 0.8231, + "step": 77 + }, + { + "epoch": 0.0416, + "grad_norm": 0.46416836784344917, + "learning_rate": 0.00019993416256221895, + "loss": 0.8305, + "step": 78 + }, + { + "epoch": 0.042133333333333335, + "grad_norm": 0.3697772206556224, + "learning_rate": 0.00019992774381199778, + "loss": 0.699, + "step": 79 + }, + { + "epoch": 0.042666666666666665, + "grad_norm": 0.46254325283397635, + "learning_rate": 0.00019992102666236566, + "loss": 0.8587, + "step": 80 + }, + { + "epoch": 0.0432, + "grad_norm": 0.41761565245303794, + "learning_rate": 0.00019991401113338104, + "loss": 0.7599, + "step": 81 + }, + { + "epoch": 0.04373333333333333, + "grad_norm": 0.43848110998457335, + "learning_rate": 0.00019990669724599336, + "loss": 0.8359, + "step": 82 + }, + { + "epoch": 0.04426666666666667, + "grad_norm": 0.4720306109707181, + "learning_rate": 0.00019989908502204292, + "loss": 0.8616, + "step": 83 + }, + { + "epoch": 0.0448, + "grad_norm": 0.9634253575197086, + "learning_rate": 0.00019989117448426108, + "loss": 0.7746, + "step": 84 + }, + { + "epoch": 0.04533333333333334, + "grad_norm": 0.5086670772892667, + "learning_rate": 0.00019988296565626987, + "loss": 0.8447, + "step": 85 + }, + { + "epoch": 0.04586666666666667, + "grad_norm": 0.4851308210699472, + "learning_rate": 0.00019987445856258206, + "loss": 0.7816, + "step": 86 + }, + { + "epoch": 0.0464, + "grad_norm": 0.43399889675256137, + "learning_rate": 0.00019986565322860115, + "loss": 0.8149, + "step": 87 + }, + { + "epoch": 0.046933333333333334, + "grad_norm": 0.45522208083620896, + "learning_rate": 0.00019985654968062122, + "loss": 0.8185, + "step": 88 + }, + { + "epoch": 0.047466666666666664, + "grad_norm": 0.41206090460997447, + "learning_rate": 0.00019984714794582683, + "loss": 0.7626, + "step": 89 + }, + { + "epoch": 0.048, + "grad_norm": 0.3927806603634916, + "learning_rate": 0.00019983744805229296, + "loss": 0.7618, + "step": 90 + }, + { + "epoch": 0.04853333333333333, + "grad_norm": 0.445857279114616, + "learning_rate": 0.000199827450028985, + "loss": 0.7935, + "step": 91 + }, + { + "epoch": 0.04906666666666667, + "grad_norm": 0.4019642963276086, + "learning_rate": 0.00019981715390575858, + "loss": 0.7742, + "step": 92 + }, + { + "epoch": 0.0496, + "grad_norm": 0.45097234549832604, + "learning_rate": 0.00019980655971335945, + "loss": 0.8406, + "step": 93 + }, + { + "epoch": 0.050133333333333335, + "grad_norm": 0.4527733588093272, + "learning_rate": 0.00019979566748342347, + "loss": 0.8657, + "step": 94 + }, + { + "epoch": 0.050666666666666665, + "grad_norm": 0.43058663300109606, + "learning_rate": 0.00019978447724847652, + "loss": 0.7905, + "step": 95 + }, + { + "epoch": 0.0512, + "grad_norm": 0.4223369658847847, + "learning_rate": 0.00019977298904193437, + "loss": 0.7845, + "step": 96 + }, + { + "epoch": 0.05173333333333333, + "grad_norm": 0.455684068704479, + "learning_rate": 0.00019976120289810247, + "loss": 0.8505, + "step": 97 + }, + { + "epoch": 0.05226666666666667, + "grad_norm": 0.4519630049361916, + "learning_rate": 0.00019974911885217608, + "loss": 0.7986, + "step": 98 + }, + { + "epoch": 0.0528, + "grad_norm": 0.4351535348540937, + "learning_rate": 0.00019973673694024, + "loss": 0.8325, + "step": 99 + }, + { + "epoch": 0.05333333333333334, + "grad_norm": 0.43908220372345264, + "learning_rate": 0.0001997240571992685, + "loss": 0.81, + "step": 100 + }, + { + "epoch": 0.05386666666666667, + "grad_norm": 0.4476311325416776, + "learning_rate": 0.00019971107966712518, + "loss": 0.7769, + "step": 101 + }, + { + "epoch": 0.0544, + "grad_norm": 0.4726063952110799, + "learning_rate": 0.00019969780438256293, + "loss": 0.8528, + "step": 102 + }, + { + "epoch": 0.054933333333333334, + "grad_norm": 0.4201995743808383, + "learning_rate": 0.0001996842313852238, + "loss": 0.813, + "step": 103 + }, + { + "epoch": 0.055466666666666664, + "grad_norm": 0.4148956484560226, + "learning_rate": 0.00019967036071563877, + "loss": 0.7447, + "step": 104 + }, + { + "epoch": 0.056, + "grad_norm": 0.5015600684967997, + "learning_rate": 0.0001996561924152278, + "loss": 0.9028, + "step": 105 + }, + { + "epoch": 0.05653333333333333, + "grad_norm": 0.4310765367841715, + "learning_rate": 0.0001996417265262996, + "loss": 0.7803, + "step": 106 + }, + { + "epoch": 0.05706666666666667, + "grad_norm": 0.44752749930977775, + "learning_rate": 0.00019962696309205148, + "loss": 0.8099, + "step": 107 + }, + { + "epoch": 0.0576, + "grad_norm": 0.40063612430475026, + "learning_rate": 0.0001996119021565693, + "loss": 0.7469, + "step": 108 + }, + { + "epoch": 0.058133333333333335, + "grad_norm": 0.43486569204981024, + "learning_rate": 0.0001995965437648273, + "loss": 0.8109, + "step": 109 + }, + { + "epoch": 0.058666666666666666, + "grad_norm": 0.4396585907015772, + "learning_rate": 0.00019958088796268793, + "loss": 0.8279, + "step": 110 + }, + { + "epoch": 0.0592, + "grad_norm": 0.390314497464367, + "learning_rate": 0.0001995649347969019, + "loss": 0.7306, + "step": 111 + }, + { + "epoch": 0.05973333333333333, + "grad_norm": 0.42267774616233733, + "learning_rate": 0.00019954868431510764, + "loss": 0.7179, + "step": 112 + }, + { + "epoch": 0.06026666666666667, + "grad_norm": 0.45427963486348716, + "learning_rate": 0.00019953213656583168, + "loss": 0.8086, + "step": 113 + }, + { + "epoch": 0.0608, + "grad_norm": 0.4527011734468865, + "learning_rate": 0.00019951529159848805, + "loss": 0.8277, + "step": 114 + }, + { + "epoch": 0.06133333333333333, + "grad_norm": 0.49169284050246814, + "learning_rate": 0.00019949814946337838, + "loss": 0.8489, + "step": 115 + }, + { + "epoch": 0.06186666666666667, + "grad_norm": 0.4279168406154108, + "learning_rate": 0.00019948071021169174, + "loss": 0.7653, + "step": 116 + }, + { + "epoch": 0.0624, + "grad_norm": 0.4121276367654202, + "learning_rate": 0.00019946297389550433, + "loss": 0.7742, + "step": 117 + }, + { + "epoch": 0.06293333333333333, + "grad_norm": 0.4536726030404313, + "learning_rate": 0.00019944494056777946, + "loss": 0.8194, + "step": 118 + }, + { + "epoch": 0.06346666666666667, + "grad_norm": 0.4486295176595619, + "learning_rate": 0.00019942661028236745, + "loss": 0.8214, + "step": 119 + }, + { + "epoch": 0.064, + "grad_norm": 0.42898423421445664, + "learning_rate": 0.00019940798309400526, + "loss": 0.7394, + "step": 120 + }, + { + "epoch": 0.06453333333333333, + "grad_norm": 0.45190207354720013, + "learning_rate": 0.00019938905905831654, + "loss": 0.8329, + "step": 121 + }, + { + "epoch": 0.06506666666666666, + "grad_norm": 0.39971546567051797, + "learning_rate": 0.00019936983823181132, + "loss": 0.7784, + "step": 122 + }, + { + "epoch": 0.0656, + "grad_norm": 0.46504120999614906, + "learning_rate": 0.0001993503206718859, + "loss": 0.824, + "step": 123 + }, + { + "epoch": 0.06613333333333334, + "grad_norm": 0.40648705882743386, + "learning_rate": 0.00019933050643682269, + "loss": 0.7704, + "step": 124 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 0.462891583179439, + "learning_rate": 0.00019931039558578997, + "loss": 0.8047, + "step": 125 + }, + { + "epoch": 0.0672, + "grad_norm": 0.44835435341660984, + "learning_rate": 0.00019928998817884182, + "loss": 0.8025, + "step": 126 + }, + { + "epoch": 0.06773333333333334, + "grad_norm": 0.47607297228300677, + "learning_rate": 0.00019926928427691786, + "loss": 0.8257, + "step": 127 + }, + { + "epoch": 0.06826666666666667, + "grad_norm": 0.5193575319855029, + "learning_rate": 0.00019924828394184306, + "loss": 0.9143, + "step": 128 + }, + { + "epoch": 0.0688, + "grad_norm": 0.4766643858963425, + "learning_rate": 0.00019922698723632767, + "loss": 0.8641, + "step": 129 + }, + { + "epoch": 0.06933333333333333, + "grad_norm": 0.49597963705967496, + "learning_rate": 0.0001992053942239668, + "loss": 0.8312, + "step": 130 + }, + { + "epoch": 0.06986666666666666, + "grad_norm": 0.4808812880130579, + "learning_rate": 0.0001991835049692405, + "loss": 0.7986, + "step": 131 + }, + { + "epoch": 0.0704, + "grad_norm": 0.44599017255723294, + "learning_rate": 0.00019916131953751342, + "loss": 0.8068, + "step": 132 + }, + { + "epoch": 0.07093333333333333, + "grad_norm": 0.48750979954844126, + "learning_rate": 0.0001991388379950346, + "loss": 0.8945, + "step": 133 + }, + { + "epoch": 0.07146666666666666, + "grad_norm": 0.6254679334048621, + "learning_rate": 0.0001991160604089374, + "loss": 0.7425, + "step": 134 + }, + { + "epoch": 0.072, + "grad_norm": 0.43829115222454273, + "learning_rate": 0.00019909298684723904, + "loss": 0.7859, + "step": 135 + }, + { + "epoch": 0.07253333333333334, + "grad_norm": 0.4704662955531175, + "learning_rate": 0.00019906961737884077, + "loss": 0.7688, + "step": 136 + }, + { + "epoch": 0.07306666666666667, + "grad_norm": 0.4409529564290118, + "learning_rate": 0.00019904595207352737, + "loss": 0.8644, + "step": 137 + }, + { + "epoch": 0.0736, + "grad_norm": 0.4344184221831154, + "learning_rate": 0.00019902199100196697, + "loss": 0.7998, + "step": 138 + }, + { + "epoch": 0.07413333333333333, + "grad_norm": 0.4046224512304256, + "learning_rate": 0.000198997734235711, + "loss": 0.7475, + "step": 139 + }, + { + "epoch": 0.07466666666666667, + "grad_norm": 0.45801000153339994, + "learning_rate": 0.00019897318184719385, + "loss": 0.8505, + "step": 140 + }, + { + "epoch": 0.0752, + "grad_norm": 0.4141957413731039, + "learning_rate": 0.00019894833390973266, + "loss": 0.7275, + "step": 141 + }, + { + "epoch": 0.07573333333333333, + "grad_norm": 0.4314403142961352, + "learning_rate": 0.0001989231904975272, + "loss": 0.8589, + "step": 142 + }, + { + "epoch": 0.07626666666666666, + "grad_norm": 0.4385517025195832, + "learning_rate": 0.00019889775168565943, + "loss": 0.8784, + "step": 143 + }, + { + "epoch": 0.0768, + "grad_norm": 0.4031136160078963, + "learning_rate": 0.00019887201755009357, + "loss": 0.7516, + "step": 144 + }, + { + "epoch": 0.07733333333333334, + "grad_norm": 0.43675739098920885, + "learning_rate": 0.00019884598816767563, + "loss": 0.7659, + "step": 145 + }, + { + "epoch": 0.07786666666666667, + "grad_norm": 0.49445359229428, + "learning_rate": 0.0001988196636161333, + "loss": 0.7858, + "step": 146 + }, + { + "epoch": 0.0784, + "grad_norm": 0.43646456852784826, + "learning_rate": 0.0001987930439740757, + "loss": 0.7629, + "step": 147 + }, + { + "epoch": 0.07893333333333333, + "grad_norm": 0.42977797610808016, + "learning_rate": 0.00019876612932099308, + "loss": 0.7425, + "step": 148 + }, + { + "epoch": 0.07946666666666667, + "grad_norm": 0.4920222624489503, + "learning_rate": 0.0001987389197372567, + "loss": 0.8062, + "step": 149 + }, + { + "epoch": 0.08, + "grad_norm": 0.4479600663744989, + "learning_rate": 0.00019871141530411853, + "loss": 0.8066, + "step": 150 + }, + { + "epoch": 0.08053333333333333, + "grad_norm": 0.41424496856493076, + "learning_rate": 0.00019868361610371097, + "loss": 0.7106, + "step": 151 + }, + { + "epoch": 0.08106666666666666, + "grad_norm": 0.4880984065020837, + "learning_rate": 0.00019865552221904665, + "loss": 0.9123, + "step": 152 + }, + { + "epoch": 0.0816, + "grad_norm": 0.43933091597767177, + "learning_rate": 0.0001986271337340182, + "loss": 0.784, + "step": 153 + }, + { + "epoch": 0.08213333333333334, + "grad_norm": 0.4451132220141123, + "learning_rate": 0.00019859845073339787, + "loss": 0.8551, + "step": 154 + }, + { + "epoch": 0.08266666666666667, + "grad_norm": 0.4483482119194952, + "learning_rate": 0.00019856947330283752, + "loss": 0.8209, + "step": 155 + }, + { + "epoch": 0.0832, + "grad_norm": 0.41615558492968413, + "learning_rate": 0.00019854020152886814, + "loss": 0.813, + "step": 156 + }, + { + "epoch": 0.08373333333333334, + "grad_norm": 0.39848951490249424, + "learning_rate": 0.0001985106354988997, + "loss": 0.6954, + "step": 157 + }, + { + "epoch": 0.08426666666666667, + "grad_norm": 0.4241471112610877, + "learning_rate": 0.00019848077530122083, + "loss": 0.78, + "step": 158 + }, + { + "epoch": 0.0848, + "grad_norm": 0.4242752278974099, + "learning_rate": 0.0001984506210249986, + "loss": 0.7545, + "step": 159 + }, + { + "epoch": 0.08533333333333333, + "grad_norm": 0.44284383218892404, + "learning_rate": 0.00019842017276027832, + "loss": 0.8226, + "step": 160 + }, + { + "epoch": 0.08586666666666666, + "grad_norm": 0.4475246490821707, + "learning_rate": 0.00019838943059798304, + "loss": 0.7354, + "step": 161 + }, + { + "epoch": 0.0864, + "grad_norm": 0.5187652451271928, + "learning_rate": 0.00019835839462991361, + "loss": 0.886, + "step": 162 + }, + { + "epoch": 0.08693333333333333, + "grad_norm": 0.4805342879602958, + "learning_rate": 0.0001983270649487481, + "loss": 0.8325, + "step": 163 + }, + { + "epoch": 0.08746666666666666, + "grad_norm": 0.4844268011021575, + "learning_rate": 0.0001982954416480417, + "loss": 0.8778, + "step": 164 + }, + { + "epoch": 0.088, + "grad_norm": 0.43007715527336304, + "learning_rate": 0.00019826352482222638, + "loss": 0.8081, + "step": 165 + }, + { + "epoch": 0.08853333333333334, + "grad_norm": 0.43584800742577234, + "learning_rate": 0.00019823131456661063, + "loss": 0.7957, + "step": 166 + }, + { + "epoch": 0.08906666666666667, + "grad_norm": 0.49337379328966763, + "learning_rate": 0.00019819881097737915, + "loss": 0.8278, + "step": 167 + }, + { + "epoch": 0.0896, + "grad_norm": 0.4714725906519646, + "learning_rate": 0.00019816601415159263, + "loss": 0.8136, + "step": 168 + }, + { + "epoch": 0.09013333333333333, + "grad_norm": 0.4584132090446889, + "learning_rate": 0.00019813292418718732, + "loss": 0.8557, + "step": 169 + }, + { + "epoch": 0.09066666666666667, + "grad_norm": 0.4817876126266651, + "learning_rate": 0.0001980995411829749, + "loss": 0.8136, + "step": 170 + }, + { + "epoch": 0.0912, + "grad_norm": 0.4483299087194101, + "learning_rate": 0.0001980658652386421, + "loss": 0.8202, + "step": 171 + }, + { + "epoch": 0.09173333333333333, + "grad_norm": 0.4859168307388182, + "learning_rate": 0.0001980318964547504, + "loss": 0.8224, + "step": 172 + }, + { + "epoch": 0.09226666666666666, + "grad_norm": 0.39941565954850006, + "learning_rate": 0.0001979976349327357, + "loss": 0.7702, + "step": 173 + }, + { + "epoch": 0.0928, + "grad_norm": 0.4611323663241478, + "learning_rate": 0.00019796308077490817, + "loss": 0.828, + "step": 174 + }, + { + "epoch": 0.09333333333333334, + "grad_norm": 0.4482217525906128, + "learning_rate": 0.00019792823408445174, + "loss": 0.7599, + "step": 175 + }, + { + "epoch": 0.09386666666666667, + "grad_norm": 0.44027942062871145, + "learning_rate": 0.0001978930949654239, + "loss": 0.8377, + "step": 176 + }, + { + "epoch": 0.0944, + "grad_norm": 0.4659582801677607, + "learning_rate": 0.00019785766352275542, + "loss": 0.9079, + "step": 177 + }, + { + "epoch": 0.09493333333333333, + "grad_norm": 0.4505204473100588, + "learning_rate": 0.00019782193986224995, + "loss": 0.8439, + "step": 178 + }, + { + "epoch": 0.09546666666666667, + "grad_norm": 0.45960903450500473, + "learning_rate": 0.00019778592409058378, + "loss": 0.8309, + "step": 179 + }, + { + "epoch": 0.096, + "grad_norm": 0.5377935625560539, + "learning_rate": 0.00019774961631530545, + "loss": 0.8042, + "step": 180 + }, + { + "epoch": 0.09653333333333333, + "grad_norm": 0.41565057105304903, + "learning_rate": 0.0001977130166448355, + "loss": 0.8002, + "step": 181 + }, + { + "epoch": 0.09706666666666666, + "grad_norm": 0.41460320605279716, + "learning_rate": 0.00019767612518846608, + "loss": 0.7237, + "step": 182 + }, + { + "epoch": 0.0976, + "grad_norm": 0.49178728981780256, + "learning_rate": 0.00019763894205636072, + "loss": 0.8454, + "step": 183 + }, + { + "epoch": 0.09813333333333334, + "grad_norm": 0.44755309029442725, + "learning_rate": 0.00019760146735955388, + "loss": 0.8503, + "step": 184 + }, + { + "epoch": 0.09866666666666667, + "grad_norm": 0.38714457804353636, + "learning_rate": 0.00019756370120995066, + "loss": 0.7342, + "step": 185 + }, + { + "epoch": 0.0992, + "grad_norm": 0.4760548365486237, + "learning_rate": 0.00019752564372032657, + "loss": 0.835, + "step": 186 + }, + { + "epoch": 0.09973333333333333, + "grad_norm": 0.43674343320545966, + "learning_rate": 0.000197487295004327, + "loss": 0.8249, + "step": 187 + }, + { + "epoch": 0.10026666666666667, + "grad_norm": 0.4531365469989709, + "learning_rate": 0.00019744865517646706, + "loss": 0.8027, + "step": 188 + }, + { + "epoch": 0.1008, + "grad_norm": 0.43823941560176616, + "learning_rate": 0.00019740972435213115, + "loss": 0.7959, + "step": 189 + }, + { + "epoch": 0.10133333333333333, + "grad_norm": 0.3929592720994812, + "learning_rate": 0.0001973705026475726, + "loss": 0.6907, + "step": 190 + }, + { + "epoch": 0.10186666666666666, + "grad_norm": 0.39972921626850777, + "learning_rate": 0.00019733099017991341, + "loss": 0.814, + "step": 191 + }, + { + "epoch": 0.1024, + "grad_norm": 0.43710404147202, + "learning_rate": 0.00019729118706714375, + "loss": 0.8194, + "step": 192 + }, + { + "epoch": 0.10293333333333334, + "grad_norm": 0.4342108444860517, + "learning_rate": 0.0001972510934281218, + "loss": 0.812, + "step": 193 + }, + { + "epoch": 0.10346666666666667, + "grad_norm": 0.40976023474313855, + "learning_rate": 0.00019721070938257324, + "loss": 0.7932, + "step": 194 + }, + { + "epoch": 0.104, + "grad_norm": 0.42537748527448105, + "learning_rate": 0.00019717003505109095, + "loss": 0.8032, + "step": 195 + }, + { + "epoch": 0.10453333333333334, + "grad_norm": 0.42091074697602304, + "learning_rate": 0.0001971290705551347, + "loss": 0.7582, + "step": 196 + }, + { + "epoch": 0.10506666666666667, + "grad_norm": 0.3966725914002177, + "learning_rate": 0.00019708781601703065, + "loss": 0.7514, + "step": 197 + }, + { + "epoch": 0.1056, + "grad_norm": 0.46048178101311826, + "learning_rate": 0.00019704627155997108, + "loss": 0.8245, + "step": 198 + }, + { + "epoch": 0.10613333333333333, + "grad_norm": 0.46331211907377207, + "learning_rate": 0.00019700443730801413, + "loss": 0.8152, + "step": 199 + }, + { + "epoch": 0.10666666666666667, + "grad_norm": 0.472286790301323, + "learning_rate": 0.00019696231338608316, + "loss": 0.8484, + "step": 200 + }, + { + "epoch": 0.1072, + "grad_norm": 0.4664840395567395, + "learning_rate": 0.00019691989991996663, + "loss": 0.8278, + "step": 201 + }, + { + "epoch": 0.10773333333333333, + "grad_norm": 0.44170299279219305, + "learning_rate": 0.00019687719703631755, + "loss": 0.8216, + "step": 202 + }, + { + "epoch": 0.10826666666666666, + "grad_norm": 0.43538865688890044, + "learning_rate": 0.00019683420486265327, + "loss": 0.8423, + "step": 203 + }, + { + "epoch": 0.1088, + "grad_norm": 0.43026680130665573, + "learning_rate": 0.0001967909235273549, + "loss": 0.7704, + "step": 204 + }, + { + "epoch": 0.10933333333333334, + "grad_norm": 0.4402184198715376, + "learning_rate": 0.0001967473531596671, + "loss": 0.8485, + "step": 205 + }, + { + "epoch": 0.10986666666666667, + "grad_norm": 0.4597257248557568, + "learning_rate": 0.0001967034938896976, + "loss": 0.7699, + "step": 206 + }, + { + "epoch": 0.1104, + "grad_norm": 0.43125968937925885, + "learning_rate": 0.00019665934584841682, + "loss": 0.8024, + "step": 207 + }, + { + "epoch": 0.11093333333333333, + "grad_norm": 0.45240591752822273, + "learning_rate": 0.0001966149091676575, + "loss": 0.8629, + "step": 208 + }, + { + "epoch": 0.11146666666666667, + "grad_norm": 0.49186335474232923, + "learning_rate": 0.00019657018398011434, + "loss": 0.8732, + "step": 209 + }, + { + "epoch": 0.112, + "grad_norm": 0.45901536135888155, + "learning_rate": 0.00019652517041934356, + "loss": 0.8123, + "step": 210 + }, + { + "epoch": 0.11253333333333333, + "grad_norm": 0.4596031446817396, + "learning_rate": 0.00019647986861976246, + "loss": 0.8852, + "step": 211 + }, + { + "epoch": 0.11306666666666666, + "grad_norm": 0.46618706944182065, + "learning_rate": 0.0001964342787166491, + "loss": 0.7485, + "step": 212 + }, + { + "epoch": 0.1136, + "grad_norm": 0.6902121551823582, + "learning_rate": 0.00019638840084614182, + "loss": 0.7188, + "step": 213 + }, + { + "epoch": 0.11413333333333334, + "grad_norm": 0.4083684587846421, + "learning_rate": 0.0001963422351452389, + "loss": 0.7432, + "step": 214 + }, + { + "epoch": 0.11466666666666667, + "grad_norm": 0.4961646801394343, + "learning_rate": 0.0001962957817517982, + "loss": 0.8278, + "step": 215 + }, + { + "epoch": 0.1152, + "grad_norm": 0.4400375530479702, + "learning_rate": 0.00019624904080453655, + "loss": 0.8287, + "step": 216 + }, + { + "epoch": 0.11573333333333333, + "grad_norm": 0.4190005436683279, + "learning_rate": 0.00019620201244302952, + "loss": 0.8171, + "step": 217 + }, + { + "epoch": 0.11626666666666667, + "grad_norm": 0.4884107398069035, + "learning_rate": 0.00019615469680771096, + "loss": 0.8943, + "step": 218 + }, + { + "epoch": 0.1168, + "grad_norm": 0.41500498773944605, + "learning_rate": 0.00019610709403987246, + "loss": 0.7398, + "step": 219 + }, + { + "epoch": 0.11733333333333333, + "grad_norm": 0.41954265563184173, + "learning_rate": 0.00019605920428166323, + "loss": 0.7739, + "step": 220 + }, + { + "epoch": 0.11786666666666666, + "grad_norm": 0.4513842667939194, + "learning_rate": 0.00019601102767608923, + "loss": 0.7367, + "step": 221 + }, + { + "epoch": 0.1184, + "grad_norm": 0.5733483427976973, + "learning_rate": 0.00019596256436701324, + "loss": 0.8087, + "step": 222 + }, + { + "epoch": 0.11893333333333334, + "grad_norm": 0.43191274270691143, + "learning_rate": 0.00019591381449915397, + "loss": 0.7329, + "step": 223 + }, + { + "epoch": 0.11946666666666667, + "grad_norm": 0.43845824001522565, + "learning_rate": 0.00019586477821808597, + "loss": 0.7939, + "step": 224 + }, + { + "epoch": 0.12, + "grad_norm": 0.4464057108322274, + "learning_rate": 0.000195815455670239, + "loss": 0.8048, + "step": 225 + }, + { + "epoch": 0.12053333333333334, + "grad_norm": 0.46481050734827417, + "learning_rate": 0.00019576584700289768, + "loss": 0.7878, + "step": 226 + }, + { + "epoch": 0.12106666666666667, + "grad_norm": 0.43270851550710027, + "learning_rate": 0.00019571595236420102, + "loss": 0.778, + "step": 227 + }, + { + "epoch": 0.1216, + "grad_norm": 0.4367449983041571, + "learning_rate": 0.00019566577190314197, + "loss": 0.7888, + "step": 228 + }, + { + "epoch": 0.12213333333333333, + "grad_norm": 0.40036781719061393, + "learning_rate": 0.00019561530576956703, + "loss": 0.7315, + "step": 229 + }, + { + "epoch": 0.12266666666666666, + "grad_norm": 0.44202834492107124, + "learning_rate": 0.00019556455411417573, + "loss": 0.8316, + "step": 230 + }, + { + "epoch": 0.1232, + "grad_norm": 0.44133307517411113, + "learning_rate": 0.0001955135170885202, + "loss": 0.772, + "step": 231 + }, + { + "epoch": 0.12373333333333333, + "grad_norm": 0.45447721349442555, + "learning_rate": 0.00019546219484500475, + "loss": 0.8157, + "step": 232 + }, + { + "epoch": 0.12426666666666666, + "grad_norm": 0.38817072137937025, + "learning_rate": 0.00019541058753688538, + "loss": 0.7261, + "step": 233 + }, + { + "epoch": 0.1248, + "grad_norm": 0.4562599393937972, + "learning_rate": 0.00019535869531826937, + "loss": 0.7925, + "step": 234 + }, + { + "epoch": 0.12533333333333332, + "grad_norm": 0.4451932832529039, + "learning_rate": 0.00019530651834411474, + "loss": 0.7445, + "step": 235 + }, + { + "epoch": 0.12586666666666665, + "grad_norm": 0.4425698163551616, + "learning_rate": 0.00019525405677022989, + "loss": 0.8399, + "step": 236 + }, + { + "epoch": 0.1264, + "grad_norm": 0.4044960659500061, + "learning_rate": 0.00019520131075327298, + "loss": 0.7836, + "step": 237 + }, + { + "epoch": 0.12693333333333334, + "grad_norm": 0.4454396806934391, + "learning_rate": 0.0001951482804507517, + "loss": 0.8236, + "step": 238 + }, + { + "epoch": 0.12746666666666667, + "grad_norm": 0.40740737398119914, + "learning_rate": 0.00019509496602102252, + "loss": 0.746, + "step": 239 + }, + { + "epoch": 0.128, + "grad_norm": 0.4073032195596395, + "learning_rate": 0.00019504136762329047, + "loss": 0.8102, + "step": 240 + }, + { + "epoch": 0.12853333333333333, + "grad_norm": 0.4149416935466569, + "learning_rate": 0.00019498748541760846, + "loss": 0.7821, + "step": 241 + }, + { + "epoch": 0.12906666666666666, + "grad_norm": 0.39364797701196813, + "learning_rate": 0.0001949333195648769, + "loss": 0.753, + "step": 242 + }, + { + "epoch": 0.1296, + "grad_norm": 0.3705972966731362, + "learning_rate": 0.00019487887022684336, + "loss": 0.7606, + "step": 243 + }, + { + "epoch": 0.13013333333333332, + "grad_norm": 0.4591223904326906, + "learning_rate": 0.00019482413756610173, + "loss": 0.8666, + "step": 244 + }, + { + "epoch": 0.13066666666666665, + "grad_norm": 0.3908139311476786, + "learning_rate": 0.0001947691217460921, + "loss": 0.7006, + "step": 245 + }, + { + "epoch": 0.1312, + "grad_norm": 0.38215926157824187, + "learning_rate": 0.00019471382293110003, + "loss": 0.7295, + "step": 246 + }, + { + "epoch": 0.13173333333333334, + "grad_norm": 0.41336248255289815, + "learning_rate": 0.00019465824128625617, + "loss": 0.7479, + "step": 247 + }, + { + "epoch": 0.13226666666666667, + "grad_norm": 0.4259106348010246, + "learning_rate": 0.00019460237697753577, + "loss": 0.7163, + "step": 248 + }, + { + "epoch": 0.1328, + "grad_norm": 0.45386495326525006, + "learning_rate": 0.00019454623017175812, + "loss": 0.8195, + "step": 249 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 0.4390682586736766, + "learning_rate": 0.00019448980103658613, + "loss": 0.8298, + "step": 250 + }, + { + "epoch": 0.13386666666666666, + "grad_norm": 0.47482501127831234, + "learning_rate": 0.0001944330897405257, + "loss": 0.8094, + "step": 251 + }, + { + "epoch": 0.1344, + "grad_norm": 0.4258694435585488, + "learning_rate": 0.00019437609645292546, + "loss": 0.8671, + "step": 252 + }, + { + "epoch": 0.13493333333333332, + "grad_norm": 0.4042887335019348, + "learning_rate": 0.00019431882134397598, + "loss": 0.7795, + "step": 253 + }, + { + "epoch": 0.13546666666666668, + "grad_norm": 0.4060175563463174, + "learning_rate": 0.00019426126458470936, + "loss": 0.7552, + "step": 254 + }, + { + "epoch": 0.136, + "grad_norm": 0.42325218335845294, + "learning_rate": 0.0001942034263469989, + "loss": 0.8127, + "step": 255 + }, + { + "epoch": 0.13653333333333334, + "grad_norm": 0.42025220582760886, + "learning_rate": 0.00019414530680355837, + "loss": 0.7834, + "step": 256 + }, + { + "epoch": 0.13706666666666667, + "grad_norm": 0.3899607546060976, + "learning_rate": 0.00019408690612794148, + "loss": 0.7461, + "step": 257 + }, + { + "epoch": 0.1376, + "grad_norm": 0.44293320632178584, + "learning_rate": 0.00019402822449454153, + "loss": 0.8554, + "step": 258 + }, + { + "epoch": 0.13813333333333333, + "grad_norm": 0.38414290035390497, + "learning_rate": 0.00019396926207859084, + "loss": 0.7265, + "step": 259 + }, + { + "epoch": 0.13866666666666666, + "grad_norm": 0.39697025700360344, + "learning_rate": 0.0001939100190561601, + "loss": 0.7204, + "step": 260 + }, + { + "epoch": 0.1392, + "grad_norm": 0.35223629771328, + "learning_rate": 0.00019385049560415794, + "loss": 0.719, + "step": 261 + }, + { + "epoch": 0.13973333333333332, + "grad_norm": 0.43080921604558886, + "learning_rate": 0.0001937906919003304, + "loss": 0.8576, + "step": 262 + }, + { + "epoch": 0.14026666666666668, + "grad_norm": 0.41838324723144754, + "learning_rate": 0.00019373060812326052, + "loss": 0.7771, + "step": 263 + }, + { + "epoch": 0.1408, + "grad_norm": 0.4293837353499623, + "learning_rate": 0.00019367024445236754, + "loss": 0.8393, + "step": 264 + }, + { + "epoch": 0.14133333333333334, + "grad_norm": 0.4170972005584217, + "learning_rate": 0.00019360960106790643, + "loss": 0.7614, + "step": 265 + }, + { + "epoch": 0.14186666666666667, + "grad_norm": 0.4003287638349002, + "learning_rate": 0.0001935486781509677, + "loss": 0.8092, + "step": 266 + }, + { + "epoch": 0.1424, + "grad_norm": 0.3791029707393735, + "learning_rate": 0.00019348747588347637, + "loss": 0.7869, + "step": 267 + }, + { + "epoch": 0.14293333333333333, + "grad_norm": 0.4461110806714854, + "learning_rate": 0.00019342599444819168, + "loss": 0.8196, + "step": 268 + }, + { + "epoch": 0.14346666666666666, + "grad_norm": 0.4032553825749664, + "learning_rate": 0.00019336423402870653, + "loss": 0.8043, + "step": 269 + }, + { + "epoch": 0.144, + "grad_norm": 0.4489374564257814, + "learning_rate": 0.00019330219480944694, + "loss": 0.7999, + "step": 270 + }, + { + "epoch": 0.14453333333333335, + "grad_norm": 0.3931813135152387, + "learning_rate": 0.0001932398769756714, + "loss": 0.7548, + "step": 271 + }, + { + "epoch": 0.14506666666666668, + "grad_norm": 0.43213500052159126, + "learning_rate": 0.0001931772807134704, + "loss": 0.7404, + "step": 272 + }, + { + "epoch": 0.1456, + "grad_norm": 0.463999438603729, + "learning_rate": 0.00019311440620976597, + "loss": 0.7251, + "step": 273 + }, + { + "epoch": 0.14613333333333334, + "grad_norm": 0.4202383869284423, + "learning_rate": 0.00019305125365231084, + "loss": 0.7599, + "step": 274 + }, + { + "epoch": 0.14666666666666667, + "grad_norm": 0.4161050003557798, + "learning_rate": 0.00019298782322968815, + "loss": 0.7888, + "step": 275 + }, + { + "epoch": 0.1472, + "grad_norm": 0.4580749494162176, + "learning_rate": 0.0001929241151313108, + "loss": 0.7753, + "step": 276 + }, + { + "epoch": 0.14773333333333333, + "grad_norm": 0.4500640596888444, + "learning_rate": 0.0001928601295474208, + "loss": 0.8523, + "step": 277 + }, + { + "epoch": 0.14826666666666666, + "grad_norm": 0.4417347445263267, + "learning_rate": 0.00019279586666908884, + "loss": 0.7784, + "step": 278 + }, + { + "epoch": 0.1488, + "grad_norm": 0.4191198632455908, + "learning_rate": 0.00019273132668821364, + "loss": 0.8291, + "step": 279 + }, + { + "epoch": 0.14933333333333335, + "grad_norm": 0.40465574964154416, + "learning_rate": 0.00019266650979752136, + "loss": 0.7461, + "step": 280 + }, + { + "epoch": 0.14986666666666668, + "grad_norm": 0.38044221580169696, + "learning_rate": 0.00019260141619056507, + "loss": 0.8062, + "step": 281 + }, + { + "epoch": 0.1504, + "grad_norm": 0.41989426187836015, + "learning_rate": 0.00019253604606172417, + "loss": 0.7404, + "step": 282 + }, + { + "epoch": 0.15093333333333334, + "grad_norm": 0.4174598718201883, + "learning_rate": 0.0001924703996062038, + "loss": 0.8005, + "step": 283 + }, + { + "epoch": 0.15146666666666667, + "grad_norm": 0.39746626227909265, + "learning_rate": 0.0001924044770200342, + "loss": 0.77, + "step": 284 + }, + { + "epoch": 0.152, + "grad_norm": 0.49211859112155604, + "learning_rate": 0.00019233827850007027, + "loss": 0.9203, + "step": 285 + }, + { + "epoch": 0.15253333333333333, + "grad_norm": 0.44548839442310045, + "learning_rate": 0.0001922718042439908, + "loss": 0.8274, + "step": 286 + }, + { + "epoch": 0.15306666666666666, + "grad_norm": 0.4206063627886071, + "learning_rate": 0.000192205054450298, + "loss": 0.7462, + "step": 287 + }, + { + "epoch": 0.1536, + "grad_norm": 0.38061066166890833, + "learning_rate": 0.00019213802931831696, + "loss": 0.7482, + "step": 288 + }, + { + "epoch": 0.15413333333333334, + "grad_norm": 0.46789180687659593, + "learning_rate": 0.00019207072904819486, + "loss": 0.8544, + "step": 289 + }, + { + "epoch": 0.15466666666666667, + "grad_norm": 0.459318329460134, + "learning_rate": 0.00019200315384090044, + "loss": 0.77, + "step": 290 + }, + { + "epoch": 0.1552, + "grad_norm": 0.3964244032086574, + "learning_rate": 0.00019193530389822363, + "loss": 0.6858, + "step": 291 + }, + { + "epoch": 0.15573333333333333, + "grad_norm": 0.4021983466035418, + "learning_rate": 0.00019186717942277462, + "loss": 0.7392, + "step": 292 + }, + { + "epoch": 0.15626666666666666, + "grad_norm": 0.4463722717797184, + "learning_rate": 0.00019179878061798347, + "loss": 0.814, + "step": 293 + }, + { + "epoch": 0.1568, + "grad_norm": 0.40817539027781785, + "learning_rate": 0.00019173010768809933, + "loss": 0.7085, + "step": 294 + }, + { + "epoch": 0.15733333333333333, + "grad_norm": 0.44904259139519614, + "learning_rate": 0.00019166116083819002, + "loss": 0.7432, + "step": 295 + }, + { + "epoch": 0.15786666666666666, + "grad_norm": 0.38360848103639306, + "learning_rate": 0.00019159194027414128, + "loss": 0.7165, + "step": 296 + }, + { + "epoch": 0.1584, + "grad_norm": 0.443531638191723, + "learning_rate": 0.0001915224462026563, + "loss": 0.8256, + "step": 297 + }, + { + "epoch": 0.15893333333333334, + "grad_norm": 0.38095191702642245, + "learning_rate": 0.00019145267883125482, + "loss": 0.7611, + "step": 298 + }, + { + "epoch": 0.15946666666666667, + "grad_norm": 0.3945116950378279, + "learning_rate": 0.00019138263836827288, + "loss": 0.779, + "step": 299 + }, + { + "epoch": 0.16, + "grad_norm": 0.397456986825466, + "learning_rate": 0.00019131232502286188, + "loss": 0.8102, + "step": 300 + }, + { + "epoch": 0.16053333333333333, + "grad_norm": 0.6803448460607161, + "learning_rate": 0.00019124173900498818, + "loss": 0.8261, + "step": 301 + }, + { + "epoch": 0.16106666666666666, + "grad_norm": 0.40842654911594944, + "learning_rate": 0.00019117088052543233, + "loss": 0.7522, + "step": 302 + }, + { + "epoch": 0.1616, + "grad_norm": 0.5638276816528227, + "learning_rate": 0.0001910997497957885, + "loss": 0.9793, + "step": 303 + }, + { + "epoch": 0.16213333333333332, + "grad_norm": 0.36684193003955196, + "learning_rate": 0.00019102834702846387, + "loss": 0.7141, + "step": 304 + }, + { + "epoch": 0.16266666666666665, + "grad_norm": 0.4537436788475935, + "learning_rate": 0.0001909566724366779, + "loss": 0.8118, + "step": 305 + }, + { + "epoch": 0.1632, + "grad_norm": 0.4062097659957066, + "learning_rate": 0.00019088472623446183, + "loss": 0.7537, + "step": 306 + }, + { + "epoch": 0.16373333333333334, + "grad_norm": 0.4514769645704028, + "learning_rate": 0.00019081250863665794, + "loss": 0.7657, + "step": 307 + }, + { + "epoch": 0.16426666666666667, + "grad_norm": 0.44614717708012835, + "learning_rate": 0.0001907400198589189, + "loss": 0.8617, + "step": 308 + }, + { + "epoch": 0.1648, + "grad_norm": 0.41963744513674606, + "learning_rate": 0.00019066726011770726, + "loss": 0.7844, + "step": 309 + }, + { + "epoch": 0.16533333333333333, + "grad_norm": 0.42682085703394257, + "learning_rate": 0.00019059422963029464, + "loss": 0.7684, + "step": 310 + }, + { + "epoch": 0.16586666666666666, + "grad_norm": 0.39937427047954077, + "learning_rate": 0.0001905209286147611, + "loss": 0.7551, + "step": 311 + }, + { + "epoch": 0.1664, + "grad_norm": 0.43952243579704303, + "learning_rate": 0.0001904473572899947, + "loss": 0.8068, + "step": 312 + }, + { + "epoch": 0.16693333333333332, + "grad_norm": 0.5151466577662714, + "learning_rate": 0.0001903735158756905, + "loss": 0.727, + "step": 313 + }, + { + "epoch": 0.16746666666666668, + "grad_norm": 0.46431394111704427, + "learning_rate": 0.0001902994045923502, + "loss": 0.8529, + "step": 314 + }, + { + "epoch": 0.168, + "grad_norm": 0.4086453319288397, + "learning_rate": 0.00019022502366128135, + "loss": 0.7239, + "step": 315 + }, + { + "epoch": 0.16853333333333334, + "grad_norm": 0.47172791859158825, + "learning_rate": 0.0001901503733045967, + "loss": 0.8012, + "step": 316 + }, + { + "epoch": 0.16906666666666667, + "grad_norm": 0.4939060978903562, + "learning_rate": 0.00019007545374521355, + "loss": 0.8453, + "step": 317 + }, + { + "epoch": 0.1696, + "grad_norm": 0.42505393841937944, + "learning_rate": 0.00019000026520685302, + "loss": 0.8035, + "step": 318 + }, + { + "epoch": 0.17013333333333333, + "grad_norm": 0.40056450114313424, + "learning_rate": 0.00018992480791403958, + "loss": 0.7788, + "step": 319 + }, + { + "epoch": 0.17066666666666666, + "grad_norm": 0.43859811381683717, + "learning_rate": 0.0001898490820921001, + "loss": 0.7728, + "step": 320 + }, + { + "epoch": 0.1712, + "grad_norm": 0.38868079227357577, + "learning_rate": 0.0001897730879671634, + "loss": 0.7384, + "step": 321 + }, + { + "epoch": 0.17173333333333332, + "grad_norm": 0.4169781202695756, + "learning_rate": 0.0001896968257661595, + "loss": 0.7412, + "step": 322 + }, + { + "epoch": 0.17226666666666668, + "grad_norm": 0.4758209190095243, + "learning_rate": 0.00018962029571681886, + "loss": 0.7737, + "step": 323 + }, + { + "epoch": 0.1728, + "grad_norm": 0.44608960722227414, + "learning_rate": 0.00018954349804767184, + "loss": 0.7766, + "step": 324 + }, + { + "epoch": 0.17333333333333334, + "grad_norm": 0.4384190504043158, + "learning_rate": 0.00018946643298804793, + "loss": 0.8301, + "step": 325 + }, + { + "epoch": 0.17386666666666667, + "grad_norm": 0.4664490211064372, + "learning_rate": 0.00018938910076807513, + "loss": 0.8381, + "step": 326 + }, + { + "epoch": 0.1744, + "grad_norm": 0.37886426668765194, + "learning_rate": 0.00018931150161867916, + "loss": 0.7254, + "step": 327 + }, + { + "epoch": 0.17493333333333333, + "grad_norm": 0.46384866324389645, + "learning_rate": 0.0001892336357715829, + "loss": 0.8627, + "step": 328 + }, + { + "epoch": 0.17546666666666666, + "grad_norm": 0.44489240090318244, + "learning_rate": 0.0001891555034593055, + "loss": 0.7049, + "step": 329 + }, + { + "epoch": 0.176, + "grad_norm": 0.40644703358089984, + "learning_rate": 0.00018907710491516199, + "loss": 0.7327, + "step": 330 + }, + { + "epoch": 0.17653333333333332, + "grad_norm": 0.42800900543499615, + "learning_rate": 0.00018899844037326225, + "loss": 0.797, + "step": 331 + }, + { + "epoch": 0.17706666666666668, + "grad_norm": 0.42280954690894723, + "learning_rate": 0.0001889195100685106, + "loss": 0.7859, + "step": 332 + }, + { + "epoch": 0.1776, + "grad_norm": 0.427761653048338, + "learning_rate": 0.0001888403142366049, + "loss": 0.7594, + "step": 333 + }, + { + "epoch": 0.17813333333333334, + "grad_norm": 0.3892262470310244, + "learning_rate": 0.00018876085311403593, + "loss": 0.7503, + "step": 334 + }, + { + "epoch": 0.17866666666666667, + "grad_norm": 0.4207635391805858, + "learning_rate": 0.00018868112693808665, + "loss": 0.7673, + "step": 335 + }, + { + "epoch": 0.1792, + "grad_norm": 0.43435097159311664, + "learning_rate": 0.00018860113594683148, + "loss": 0.8014, + "step": 336 + }, + { + "epoch": 0.17973333333333333, + "grad_norm": 0.40937023088469, + "learning_rate": 0.00018852088037913577, + "loss": 0.7859, + "step": 337 + }, + { + "epoch": 0.18026666666666666, + "grad_norm": 0.41001083482566403, + "learning_rate": 0.0001884403604746547, + "loss": 0.7813, + "step": 338 + }, + { + "epoch": 0.1808, + "grad_norm": 0.3812789606403339, + "learning_rate": 0.00018835957647383303, + "loss": 0.7912, + "step": 339 + }, + { + "epoch": 0.18133333333333335, + "grad_norm": 0.38260579878749484, + "learning_rate": 0.00018827852861790398, + "loss": 0.7276, + "step": 340 + }, + { + "epoch": 0.18186666666666668, + "grad_norm": 0.4188375153138234, + "learning_rate": 0.00018819721714888877, + "loss": 0.7856, + "step": 341 + }, + { + "epoch": 0.1824, + "grad_norm": 0.4191073265804016, + "learning_rate": 0.00018811564230959588, + "loss": 0.8182, + "step": 342 + }, + { + "epoch": 0.18293333333333334, + "grad_norm": 0.4409245526215755, + "learning_rate": 0.00018803380434362, + "loss": 0.8265, + "step": 343 + }, + { + "epoch": 0.18346666666666667, + "grad_norm": 0.4875836047329921, + "learning_rate": 0.0001879517034953418, + "loss": 0.8378, + "step": 344 + }, + { + "epoch": 0.184, + "grad_norm": 0.43206958362534076, + "learning_rate": 0.00018786934000992688, + "loss": 0.7166, + "step": 345 + }, + { + "epoch": 0.18453333333333333, + "grad_norm": 0.45230825476446884, + "learning_rate": 0.00018778671413332513, + "loss": 0.8139, + "step": 346 + }, + { + "epoch": 0.18506666666666666, + "grad_norm": 0.4403885177275518, + "learning_rate": 0.00018770382611226987, + "loss": 0.7746, + "step": 347 + }, + { + "epoch": 0.1856, + "grad_norm": 0.36705477948609244, + "learning_rate": 0.00018762067619427746, + "loss": 0.6628, + "step": 348 + }, + { + "epoch": 0.18613333333333335, + "grad_norm": 0.4309350434473072, + "learning_rate": 0.000187537264627646, + "loss": 0.7983, + "step": 349 + }, + { + "epoch": 0.18666666666666668, + "grad_norm": 0.4561987670471556, + "learning_rate": 0.00018745359166145523, + "loss": 0.8956, + "step": 350 + }, + { + "epoch": 0.1872, + "grad_norm": 0.4506239667312874, + "learning_rate": 0.00018736965754556528, + "loss": 0.79, + "step": 351 + }, + { + "epoch": 0.18773333333333334, + "grad_norm": 0.41185547045134, + "learning_rate": 0.00018728546253061614, + "loss": 0.8051, + "step": 352 + }, + { + "epoch": 0.18826666666666667, + "grad_norm": 0.4608890231919324, + "learning_rate": 0.00018720100686802694, + "loss": 0.8128, + "step": 353 + }, + { + "epoch": 0.1888, + "grad_norm": 0.40548159698715563, + "learning_rate": 0.00018711629080999504, + "loss": 0.7737, + "step": 354 + }, + { + "epoch": 0.18933333333333333, + "grad_norm": 0.46337948526039613, + "learning_rate": 0.00018703131460949554, + "loss": 0.8235, + "step": 355 + }, + { + "epoch": 0.18986666666666666, + "grad_norm": 0.4474937103830373, + "learning_rate": 0.0001869460785202802, + "loss": 0.7144, + "step": 356 + }, + { + "epoch": 0.1904, + "grad_norm": 0.42890106376323245, + "learning_rate": 0.00018686058279687698, + "loss": 0.7508, + "step": 357 + }, + { + "epoch": 0.19093333333333334, + "grad_norm": 0.47516968882081667, + "learning_rate": 0.00018677482769458904, + "loss": 0.7866, + "step": 358 + }, + { + "epoch": 0.19146666666666667, + "grad_norm": 0.44556770166104803, + "learning_rate": 0.00018668881346949417, + "loss": 0.802, + "step": 359 + }, + { + "epoch": 0.192, + "grad_norm": 0.4109752045262668, + "learning_rate": 0.00018660254037844388, + "loss": 0.7588, + "step": 360 + }, + { + "epoch": 0.19253333333333333, + "grad_norm": 0.4374877722878183, + "learning_rate": 0.00018651600867906272, + "loss": 0.7432, + "step": 361 + }, + { + "epoch": 0.19306666666666666, + "grad_norm": 0.45922625029093916, + "learning_rate": 0.00018642921862974742, + "loss": 0.7508, + "step": 362 + }, + { + "epoch": 0.1936, + "grad_norm": 0.4433410526588002, + "learning_rate": 0.00018634217048966637, + "loss": 0.8436, + "step": 363 + }, + { + "epoch": 0.19413333333333332, + "grad_norm": 0.45662892382022074, + "learning_rate": 0.00018625486451875843, + "loss": 0.8183, + "step": 364 + }, + { + "epoch": 0.19466666666666665, + "grad_norm": 0.48004506628680943, + "learning_rate": 0.0001861673009777325, + "loss": 0.856, + "step": 365 + }, + { + "epoch": 0.1952, + "grad_norm": 0.413905602909353, + "learning_rate": 0.0001860794801280666, + "loss": 0.755, + "step": 366 + }, + { + "epoch": 0.19573333333333334, + "grad_norm": 0.44171838117859225, + "learning_rate": 0.00018599140223200716, + "loss": 0.7949, + "step": 367 + }, + { + "epoch": 0.19626666666666667, + "grad_norm": 0.4121760243310849, + "learning_rate": 0.0001859030675525681, + "loss": 0.7873, + "step": 368 + }, + { + "epoch": 0.1968, + "grad_norm": 0.4175764433109541, + "learning_rate": 0.0001858144763535302, + "loss": 0.7996, + "step": 369 + }, + { + "epoch": 0.19733333333333333, + "grad_norm": 0.36921453998206777, + "learning_rate": 0.0001857256288994402, + "loss": 0.6906, + "step": 370 + }, + { + "epoch": 0.19786666666666666, + "grad_norm": 0.42607268744743987, + "learning_rate": 0.00018563652545561013, + "loss": 0.8004, + "step": 371 + }, + { + "epoch": 0.1984, + "grad_norm": 0.4485820639155176, + "learning_rate": 0.0001855471662881164, + "loss": 0.8138, + "step": 372 + }, + { + "epoch": 0.19893333333333332, + "grad_norm": 0.4233782391868254, + "learning_rate": 0.000185457551663799, + "loss": 0.8149, + "step": 373 + }, + { + "epoch": 0.19946666666666665, + "grad_norm": 0.4019541542751352, + "learning_rate": 0.00018536768185026083, + "loss": 0.746, + "step": 374 + }, + { + "epoch": 0.2, + "grad_norm": 0.44813233661492907, + "learning_rate": 0.00018527755711586678, + "loss": 0.8055, + "step": 375 + }, + { + "epoch": 0.20053333333333334, + "grad_norm": 0.40332082604890734, + "learning_rate": 0.00018518717772974302, + "loss": 0.6846, + "step": 376 + }, + { + "epoch": 0.20106666666666667, + "grad_norm": 0.3894478056645389, + "learning_rate": 0.00018509654396177609, + "loss": 0.7378, + "step": 377 + }, + { + "epoch": 0.2016, + "grad_norm": 0.442120712309075, + "learning_rate": 0.00018500565608261214, + "loss": 0.8193, + "step": 378 + }, + { + "epoch": 0.20213333333333333, + "grad_norm": 0.36333630977196085, + "learning_rate": 0.00018491451436365627, + "loss": 0.6819, + "step": 379 + }, + { + "epoch": 0.20266666666666666, + "grad_norm": 0.4354306266272364, + "learning_rate": 0.0001848231190770714, + "loss": 0.6946, + "step": 380 + }, + { + "epoch": 0.2032, + "grad_norm": 0.40743520887541135, + "learning_rate": 0.00018473147049577774, + "loss": 0.7572, + "step": 381 + }, + { + "epoch": 0.20373333333333332, + "grad_norm": 0.42090155183192574, + "learning_rate": 0.00018463956889345194, + "loss": 0.7501, + "step": 382 + }, + { + "epoch": 0.20426666666666668, + "grad_norm": 0.38910082938456136, + "learning_rate": 0.00018454741454452603, + "loss": 0.7161, + "step": 383 + }, + { + "epoch": 0.2048, + "grad_norm": 0.4029892976127122, + "learning_rate": 0.00018445500772418697, + "loss": 0.7463, + "step": 384 + }, + { + "epoch": 0.20533333333333334, + "grad_norm": 0.4276425202170697, + "learning_rate": 0.00018436234870837547, + "loss": 0.7936, + "step": 385 + }, + { + "epoch": 0.20586666666666667, + "grad_norm": 0.46679796325534834, + "learning_rate": 0.00018426943777378552, + "loss": 0.8171, + "step": 386 + }, + { + "epoch": 0.2064, + "grad_norm": 0.4478670167653976, + "learning_rate": 0.00018417627519786315, + "loss": 0.7552, + "step": 387 + }, + { + "epoch": 0.20693333333333333, + "grad_norm": 0.42859027674050987, + "learning_rate": 0.00018408286125880604, + "loss": 0.7506, + "step": 388 + }, + { + "epoch": 0.20746666666666666, + "grad_norm": 0.39325455786258395, + "learning_rate": 0.00018398919623556238, + "loss": 0.7574, + "step": 389 + }, + { + "epoch": 0.208, + "grad_norm": 0.3792874715373212, + "learning_rate": 0.00018389528040783012, + "loss": 0.6801, + "step": 390 + }, + { + "epoch": 0.20853333333333332, + "grad_norm": 0.43147352714426557, + "learning_rate": 0.0001838011140560562, + "loss": 0.8133, + "step": 391 + }, + { + "epoch": 0.20906666666666668, + "grad_norm": 0.44500228600342345, + "learning_rate": 0.00018370669746143564, + "loss": 0.7759, + "step": 392 + }, + { + "epoch": 0.2096, + "grad_norm": 0.40390199826261264, + "learning_rate": 0.00018361203090591071, + "loss": 0.7525, + "step": 393 + }, + { + "epoch": 0.21013333333333334, + "grad_norm": 0.3972330956103031, + "learning_rate": 0.0001835171146721701, + "loss": 0.7708, + "step": 394 + }, + { + "epoch": 0.21066666666666667, + "grad_norm": 0.4143928993768653, + "learning_rate": 0.00018342194904364813, + "loss": 0.767, + "step": 395 + }, + { + "epoch": 0.2112, + "grad_norm": 0.43262304517242256, + "learning_rate": 0.00018332653430452376, + "loss": 0.8193, + "step": 396 + }, + { + "epoch": 0.21173333333333333, + "grad_norm": 0.40219774460267915, + "learning_rate": 0.00018323087073971993, + "loss": 0.7255, + "step": 397 + }, + { + "epoch": 0.21226666666666666, + "grad_norm": 0.43205612834254487, + "learning_rate": 0.00018313495863490258, + "loss": 0.7399, + "step": 398 + }, + { + "epoch": 0.2128, + "grad_norm": 0.4598219347282703, + "learning_rate": 0.00018303879827647975, + "loss": 0.7702, + "step": 399 + }, + { + "epoch": 0.21333333333333335, + "grad_norm": 0.39448665452304327, + "learning_rate": 0.00018294238995160094, + "loss": 0.7211, + "step": 400 + }, + { + "epoch": 0.21386666666666668, + "grad_norm": 0.4206393615135302, + "learning_rate": 0.00018284573394815597, + "loss": 0.7889, + "step": 401 + }, + { + "epoch": 0.2144, + "grad_norm": 0.4846467223856484, + "learning_rate": 0.00018274883055477436, + "loss": 0.8837, + "step": 402 + }, + { + "epoch": 0.21493333333333334, + "grad_norm": 0.41667485615355143, + "learning_rate": 0.00018265168006082437, + "loss": 0.7189, + "step": 403 + }, + { + "epoch": 0.21546666666666667, + "grad_norm": 0.4604796342144686, + "learning_rate": 0.00018255428275641214, + "loss": 0.7937, + "step": 404 + }, + { + "epoch": 0.216, + "grad_norm": 0.45259207316683736, + "learning_rate": 0.00018245663893238075, + "loss": 0.7974, + "step": 405 + }, + { + "epoch": 0.21653333333333333, + "grad_norm": 0.4341042311882465, + "learning_rate": 0.0001823587488803095, + "loss": 0.738, + "step": 406 + }, + { + "epoch": 0.21706666666666666, + "grad_norm": 0.4683448540774323, + "learning_rate": 0.00018226061289251298, + "loss": 0.7193, + "step": 407 + }, + { + "epoch": 0.2176, + "grad_norm": 0.41408110761574435, + "learning_rate": 0.00018216223126204007, + "loss": 0.7251, + "step": 408 + }, + { + "epoch": 0.21813333333333335, + "grad_norm": 0.4542005341965333, + "learning_rate": 0.00018206360428267332, + "loss": 0.7984, + "step": 409 + }, + { + "epoch": 0.21866666666666668, + "grad_norm": 0.3939031674123216, + "learning_rate": 0.00018196473224892784, + "loss": 0.7133, + "step": 410 + }, + { + "epoch": 0.2192, + "grad_norm": 0.4819986008493197, + "learning_rate": 0.00018186561545605054, + "loss": 0.8371, + "step": 411 + }, + { + "epoch": 0.21973333333333334, + "grad_norm": 0.3960782226169457, + "learning_rate": 0.0001817662542000192, + "loss": 0.7093, + "step": 412 + }, + { + "epoch": 0.22026666666666667, + "grad_norm": 0.4625689307705016, + "learning_rate": 0.0001816666487775416, + "loss": 0.8074, + "step": 413 + }, + { + "epoch": 0.2208, + "grad_norm": 0.42313403473960337, + "learning_rate": 0.00018156679948605467, + "loss": 0.7474, + "step": 414 + }, + { + "epoch": 0.22133333333333333, + "grad_norm": 0.4270307709479224, + "learning_rate": 0.00018146670662372354, + "loss": 0.7943, + "step": 415 + }, + { + "epoch": 0.22186666666666666, + "grad_norm": 0.4220964234581579, + "learning_rate": 0.0001813663704894407, + "loss": 0.8479, + "step": 416 + }, + { + "epoch": 0.2224, + "grad_norm": 0.4055106834380055, + "learning_rate": 0.00018126579138282503, + "loss": 0.7673, + "step": 417 + }, + { + "epoch": 0.22293333333333334, + "grad_norm": 0.39707451997168985, + "learning_rate": 0.00018116496960422107, + "loss": 0.7355, + "step": 418 + }, + { + "epoch": 0.22346666666666667, + "grad_norm": 0.41645880432509597, + "learning_rate": 0.00018106390545469795, + "loss": 0.7881, + "step": 419 + }, + { + "epoch": 0.224, + "grad_norm": 0.40219537606475975, + "learning_rate": 0.0001809625992360485, + "loss": 0.759, + "step": 420 + }, + { + "epoch": 0.22453333333333333, + "grad_norm": 0.40028445203067436, + "learning_rate": 0.00018086105125078857, + "loss": 0.758, + "step": 421 + }, + { + "epoch": 0.22506666666666666, + "grad_norm": 0.45868882119219095, + "learning_rate": 0.00018075926180215576, + "loss": 0.7778, + "step": 422 + }, + { + "epoch": 0.2256, + "grad_norm": 0.5055865468343858, + "learning_rate": 0.00018065723119410884, + "loss": 0.8473, + "step": 423 + }, + { + "epoch": 0.22613333333333333, + "grad_norm": 0.4260233233671494, + "learning_rate": 0.0001805549597313267, + "loss": 0.7326, + "step": 424 + }, + { + "epoch": 0.22666666666666666, + "grad_norm": 0.4508177776857487, + "learning_rate": 0.0001804524477192075, + "loss": 0.7635, + "step": 425 + }, + { + "epoch": 0.2272, + "grad_norm": 0.5457963141188263, + "learning_rate": 0.00018034969546386757, + "loss": 0.8356, + "step": 426 + }, + { + "epoch": 0.22773333333333334, + "grad_norm": 0.36218033840482383, + "learning_rate": 0.00018024670327214084, + "loss": 0.6857, + "step": 427 + }, + { + "epoch": 0.22826666666666667, + "grad_norm": 0.4265971390621618, + "learning_rate": 0.00018014347145157755, + "loss": 0.7591, + "step": 428 + }, + { + "epoch": 0.2288, + "grad_norm": 0.44138850691136283, + "learning_rate": 0.0001800400003104436, + "loss": 0.7895, + "step": 429 + }, + { + "epoch": 0.22933333333333333, + "grad_norm": 0.4253692589500084, + "learning_rate": 0.0001799362901577196, + "loss": 0.7274, + "step": 430 + }, + { + "epoch": 0.22986666666666666, + "grad_norm": 0.4172888167913838, + "learning_rate": 0.00017983234130309968, + "loss": 0.7607, + "step": 431 + }, + { + "epoch": 0.2304, + "grad_norm": 0.43693502598088874, + "learning_rate": 0.00017972815405699103, + "loss": 0.7521, + "step": 432 + }, + { + "epoch": 0.23093333333333332, + "grad_norm": 0.3504487378185642, + "learning_rate": 0.00017962372873051252, + "loss": 0.6614, + "step": 433 + }, + { + "epoch": 0.23146666666666665, + "grad_norm": 0.43630167985015783, + "learning_rate": 0.00017951906563549397, + "loss": 0.8651, + "step": 434 + }, + { + "epoch": 0.232, + "grad_norm": 0.3823016448913495, + "learning_rate": 0.00017941416508447536, + "loss": 0.7414, + "step": 435 + }, + { + "epoch": 0.23253333333333334, + "grad_norm": 0.4174112019252735, + "learning_rate": 0.00017930902739070562, + "loss": 0.7915, + "step": 436 + }, + { + "epoch": 0.23306666666666667, + "grad_norm": 0.3953902231311026, + "learning_rate": 0.00017920365286814183, + "loss": 0.7429, + "step": 437 + }, + { + "epoch": 0.2336, + "grad_norm": 0.4781709856899902, + "learning_rate": 0.0001790980418314484, + "loss": 0.7811, + "step": 438 + }, + { + "epoch": 0.23413333333333333, + "grad_norm": 0.47252415534413705, + "learning_rate": 0.0001789921945959958, + "loss": 0.8806, + "step": 439 + }, + { + "epoch": 0.23466666666666666, + "grad_norm": 0.4865219929578287, + "learning_rate": 0.00017888611147786002, + "loss": 0.8077, + "step": 440 + }, + { + "epoch": 0.2352, + "grad_norm": 0.4425115033476329, + "learning_rate": 0.00017877979279382135, + "loss": 0.7107, + "step": 441 + }, + { + "epoch": 0.23573333333333332, + "grad_norm": 0.4543841872674269, + "learning_rate": 0.00017867323886136348, + "loss": 0.7758, + "step": 442 + }, + { + "epoch": 0.23626666666666668, + "grad_norm": 0.3867618941791085, + "learning_rate": 0.00017856644999867264, + "loss": 0.7309, + "step": 443 + }, + { + "epoch": 0.2368, + "grad_norm": 0.39943496398845935, + "learning_rate": 0.0001784594265246366, + "loss": 0.7168, + "step": 444 + }, + { + "epoch": 0.23733333333333334, + "grad_norm": 0.3910802366662262, + "learning_rate": 0.00017835216875884368, + "loss": 0.7196, + "step": 445 + }, + { + "epoch": 0.23786666666666667, + "grad_norm": 0.4535836072431583, + "learning_rate": 0.0001782446770215819, + "loss": 0.7941, + "step": 446 + }, + { + "epoch": 0.2384, + "grad_norm": 0.4406511069170446, + "learning_rate": 0.0001781369516338378, + "loss": 0.7642, + "step": 447 + }, + { + "epoch": 0.23893333333333333, + "grad_norm": 0.4623494611245526, + "learning_rate": 0.00017802899291729585, + "loss": 0.8283, + "step": 448 + }, + { + "epoch": 0.23946666666666666, + "grad_norm": 0.4636825420493861, + "learning_rate": 0.0001779208011943371, + "loss": 0.6656, + "step": 449 + }, + { + "epoch": 0.24, + "grad_norm": 0.40609425486236683, + "learning_rate": 0.00017781237678803847, + "loss": 0.7222, + "step": 450 + }, + { + "epoch": 0.24053333333333332, + "grad_norm": 0.4573656546601646, + "learning_rate": 0.00017770372002217172, + "loss": 0.8383, + "step": 451 + }, + { + "epoch": 0.24106666666666668, + "grad_norm": 0.4222446845886771, + "learning_rate": 0.00017759483122120238, + "loss": 0.7961, + "step": 452 + }, + { + "epoch": 0.2416, + "grad_norm": 0.41142821506943217, + "learning_rate": 0.000177485710710289, + "loss": 0.7553, + "step": 453 + }, + { + "epoch": 0.24213333333333334, + "grad_norm": 0.41008012075185146, + "learning_rate": 0.00017737635881528196, + "loss": 0.7323, + "step": 454 + }, + { + "epoch": 0.24266666666666667, + "grad_norm": 0.4370533231761914, + "learning_rate": 0.00017726677586272263, + "loss": 0.8049, + "step": 455 + }, + { + "epoch": 0.2432, + "grad_norm": 0.47113539643196534, + "learning_rate": 0.00017715696217984235, + "loss": 0.7169, + "step": 456 + }, + { + "epoch": 0.24373333333333333, + "grad_norm": 0.39445042339966896, + "learning_rate": 0.00017704691809456143, + "loss": 0.693, + "step": 457 + }, + { + "epoch": 0.24426666666666666, + "grad_norm": 0.42072701077110986, + "learning_rate": 0.0001769366439354882, + "loss": 0.7396, + "step": 458 + }, + { + "epoch": 0.2448, + "grad_norm": 0.47204364252109704, + "learning_rate": 0.00017682614003191807, + "loss": 0.7521, + "step": 459 + }, + { + "epoch": 0.24533333333333332, + "grad_norm": 0.4200290643447494, + "learning_rate": 0.00017671540671383243, + "loss": 0.6861, + "step": 460 + }, + { + "epoch": 0.24586666666666668, + "grad_norm": 0.4502529045960976, + "learning_rate": 0.0001766044443118978, + "loss": 0.7767, + "step": 461 + }, + { + "epoch": 0.2464, + "grad_norm": 0.3695645276857633, + "learning_rate": 0.00017649325315746478, + "loss": 0.7027, + "step": 462 + }, + { + "epoch": 0.24693333333333334, + "grad_norm": 0.4117470459210072, + "learning_rate": 0.00017638183358256696, + "loss": 0.7655, + "step": 463 + }, + { + "epoch": 0.24746666666666667, + "grad_norm": 0.47394585526275107, + "learning_rate": 0.00017627018591992018, + "loss": 0.7737, + "step": 464 + }, + { + "epoch": 0.248, + "grad_norm": 0.394885661547385, + "learning_rate": 0.0001761583105029213, + "loss": 0.7386, + "step": 465 + }, + { + "epoch": 0.24853333333333333, + "grad_norm": 0.4201613770727662, + "learning_rate": 0.00017604620766564723, + "loss": 0.7362, + "step": 466 + }, + { + "epoch": 0.24906666666666666, + "grad_norm": 0.3746099633885623, + "learning_rate": 0.00017593387774285412, + "loss": 0.7034, + "step": 467 + }, + { + "epoch": 0.2496, + "grad_norm": 0.44593986708903155, + "learning_rate": 0.00017582132106997616, + "loss": 0.8034, + "step": 468 + }, + { + "epoch": 0.2501333333333333, + "grad_norm": 0.38542476841928264, + "learning_rate": 0.0001757085379831246, + "loss": 0.7806, + "step": 469 + }, + { + "epoch": 0.25066666666666665, + "grad_norm": 0.461235269742075, + "learning_rate": 0.00017559552881908695, + "loss": 0.793, + "step": 470 + }, + { + "epoch": 0.2512, + "grad_norm": 0.3904459463021993, + "learning_rate": 0.00017548229391532572, + "loss": 0.7182, + "step": 471 + }, + { + "epoch": 0.2517333333333333, + "grad_norm": 0.5029031182752419, + "learning_rate": 0.00017536883360997743, + "loss": 0.7641, + "step": 472 + }, + { + "epoch": 0.25226666666666664, + "grad_norm": 0.4620326356465615, + "learning_rate": 0.00017525514824185185, + "loss": 0.8201, + "step": 473 + }, + { + "epoch": 0.2528, + "grad_norm": 0.5001062434922018, + "learning_rate": 0.00017514123815043074, + "loss": 0.8608, + "step": 474 + }, + { + "epoch": 0.25333333333333335, + "grad_norm": 0.44384500980465535, + "learning_rate": 0.00017502710367586687, + "loss": 0.7526, + "step": 475 + }, + { + "epoch": 0.2538666666666667, + "grad_norm": 0.4415719249105902, + "learning_rate": 0.0001749127451589832, + "loss": 0.7334, + "step": 476 + }, + { + "epoch": 0.2544, + "grad_norm": 0.47188387831377115, + "learning_rate": 0.00017479816294127152, + "loss": 0.823, + "step": 477 + }, + { + "epoch": 0.25493333333333335, + "grad_norm": 0.3732688102135017, + "learning_rate": 0.00017468335736489177, + "loss": 0.6511, + "step": 478 + }, + { + "epoch": 0.2554666666666667, + "grad_norm": 0.47634472209126383, + "learning_rate": 0.00017456832877267084, + "loss": 0.7349, + "step": 479 + }, + { + "epoch": 0.256, + "grad_norm": 0.4364189059028313, + "learning_rate": 0.0001744530775081015, + "loss": 0.7164, + "step": 480 + }, + { + "epoch": 0.25653333333333334, + "grad_norm": 0.4172250477477957, + "learning_rate": 0.00017433760391534167, + "loss": 0.793, + "step": 481 + }, + { + "epoch": 0.25706666666666667, + "grad_norm": 0.3687764366502754, + "learning_rate": 0.00017422190833921283, + "loss": 0.7177, + "step": 482 + }, + { + "epoch": 0.2576, + "grad_norm": 0.40383041751979587, + "learning_rate": 0.0001741059911251997, + "loss": 0.6801, + "step": 483 + }, + { + "epoch": 0.2581333333333333, + "grad_norm": 0.4355190780186118, + "learning_rate": 0.00017398985261944856, + "loss": 0.8137, + "step": 484 + }, + { + "epoch": 0.25866666666666666, + "grad_norm": 0.38382928686611334, + "learning_rate": 0.00017387349316876666, + "loss": 0.7865, + "step": 485 + }, + { + "epoch": 0.2592, + "grad_norm": 0.5227623406808979, + "learning_rate": 0.000173756913120621, + "loss": 0.7953, + "step": 486 + }, + { + "epoch": 0.2597333333333333, + "grad_norm": 0.4356224918905176, + "learning_rate": 0.0001736401128231373, + "loss": 0.8426, + "step": 487 + }, + { + "epoch": 0.26026666666666665, + "grad_norm": 0.40711285939059755, + "learning_rate": 0.00017352309262509894, + "loss": 0.7079, + "step": 488 + }, + { + "epoch": 0.2608, + "grad_norm": 0.47183725682436484, + "learning_rate": 0.00017340585287594604, + "loss": 0.8434, + "step": 489 + }, + { + "epoch": 0.2613333333333333, + "grad_norm": 0.4550066465514676, + "learning_rate": 0.0001732883939257742, + "loss": 0.7661, + "step": 490 + }, + { + "epoch": 0.2618666666666667, + "grad_norm": 0.3696661486354159, + "learning_rate": 0.0001731707161253338, + "loss": 0.7439, + "step": 491 + }, + { + "epoch": 0.2624, + "grad_norm": 0.39398364659212626, + "learning_rate": 0.0001730528198260285, + "loss": 0.773, + "step": 492 + }, + { + "epoch": 0.26293333333333335, + "grad_norm": 0.4026020601598246, + "learning_rate": 0.00017293470537991463, + "loss": 0.7006, + "step": 493 + }, + { + "epoch": 0.2634666666666667, + "grad_norm": 0.4362868754323538, + "learning_rate": 0.00017281637313969978, + "loss": 0.8482, + "step": 494 + }, + { + "epoch": 0.264, + "grad_norm": 0.3515804653964102, + "learning_rate": 0.00017269782345874203, + "loss": 0.6581, + "step": 495 + }, + { + "epoch": 0.26453333333333334, + "grad_norm": 0.4219097245554728, + "learning_rate": 0.00017257905669104874, + "loss": 0.73, + "step": 496 + }, + { + "epoch": 0.2650666666666667, + "grad_norm": 0.38933215122078296, + "learning_rate": 0.00017246007319127545, + "loss": 0.6736, + "step": 497 + }, + { + "epoch": 0.2656, + "grad_norm": 0.44852673008214267, + "learning_rate": 0.00017234087331472497, + "loss": 0.7977, + "step": 498 + }, + { + "epoch": 0.26613333333333333, + "grad_norm": 0.5126751292572221, + "learning_rate": 0.00017222145741734626, + "loss": 0.8378, + "step": 499 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 0.3800039625006193, + "learning_rate": 0.00017210182585573327, + "loss": 0.7026, + "step": 500 + }, + { + "epoch": 0.2672, + "grad_norm": 0.429692197429523, + "learning_rate": 0.00017198197898712404, + "loss": 0.8146, + "step": 501 + }, + { + "epoch": 0.2677333333333333, + "grad_norm": 0.3715916865545978, + "learning_rate": 0.00017186191716939944, + "loss": 0.697, + "step": 502 + }, + { + "epoch": 0.26826666666666665, + "grad_norm": 0.4206690095658035, + "learning_rate": 0.0001717416407610824, + "loss": 0.7826, + "step": 503 + }, + { + "epoch": 0.2688, + "grad_norm": 0.4215409696210893, + "learning_rate": 0.00017162115012133643, + "loss": 0.7789, + "step": 504 + }, + { + "epoch": 0.2693333333333333, + "grad_norm": 0.4327914622311286, + "learning_rate": 0.00017150044560996488, + "loss": 0.7741, + "step": 505 + }, + { + "epoch": 0.26986666666666664, + "grad_norm": 0.41717016511194416, + "learning_rate": 0.00017137952758740978, + "loss": 0.7483, + "step": 506 + }, + { + "epoch": 0.2704, + "grad_norm": 0.42355520239782923, + "learning_rate": 0.00017125839641475072, + "loss": 0.7476, + "step": 507 + }, + { + "epoch": 0.27093333333333336, + "grad_norm": 0.3867276064443795, + "learning_rate": 0.00017113705245370368, + "loss": 0.7566, + "step": 508 + }, + { + "epoch": 0.2714666666666667, + "grad_norm": 0.3877790135354421, + "learning_rate": 0.00017101549606662024, + "loss": 0.702, + "step": 509 + }, + { + "epoch": 0.272, + "grad_norm": 0.4277111086201661, + "learning_rate": 0.00017089372761648616, + "loss": 0.7433, + "step": 510 + }, + { + "epoch": 0.27253333333333335, + "grad_norm": 0.3837665247016121, + "learning_rate": 0.00017077174746692056, + "loss": 0.7263, + "step": 511 + }, + { + "epoch": 0.2730666666666667, + "grad_norm": 0.4131361729455296, + "learning_rate": 0.00017064955598217462, + "loss": 0.7716, + "step": 512 + }, + { + "epoch": 0.2736, + "grad_norm": 0.39463662549594924, + "learning_rate": 0.00017052715352713075, + "loss": 0.704, + "step": 513 + }, + { + "epoch": 0.27413333333333334, + "grad_norm": 0.46853489829372913, + "learning_rate": 0.00017040454046730115, + "loss": 0.7531, + "step": 514 + }, + { + "epoch": 0.27466666666666667, + "grad_norm": 0.4275530686883492, + "learning_rate": 0.00017028171716882714, + "loss": 0.7422, + "step": 515 + }, + { + "epoch": 0.2752, + "grad_norm": 0.44886232700057555, + "learning_rate": 0.00017015868399847768, + "loss": 0.8124, + "step": 516 + }, + { + "epoch": 0.27573333333333333, + "grad_norm": 0.7722961059754266, + "learning_rate": 0.00017003544132364846, + "loss": 0.7464, + "step": 517 + }, + { + "epoch": 0.27626666666666666, + "grad_norm": 0.4051253424216697, + "learning_rate": 0.00016991198951236088, + "loss": 0.7498, + "step": 518 + }, + { + "epoch": 0.2768, + "grad_norm": 0.4070330323475715, + "learning_rate": 0.00016978832893326074, + "loss": 0.766, + "step": 519 + }, + { + "epoch": 0.2773333333333333, + "grad_norm": 0.39832875640641235, + "learning_rate": 0.00016966445995561727, + "loss": 0.7308, + "step": 520 + }, + { + "epoch": 0.27786666666666665, + "grad_norm": 0.42205323080764595, + "learning_rate": 0.00016954038294932216, + "loss": 0.7341, + "step": 521 + }, + { + "epoch": 0.2784, + "grad_norm": 0.38204097319011965, + "learning_rate": 0.00016941609828488807, + "loss": 0.6862, + "step": 522 + }, + { + "epoch": 0.2789333333333333, + "grad_norm": 0.3842199481630002, + "learning_rate": 0.0001692916063334479, + "loss": 0.7138, + "step": 523 + }, + { + "epoch": 0.27946666666666664, + "grad_norm": 0.42338750483674525, + "learning_rate": 0.0001691669074667535, + "loss": 0.7055, + "step": 524 + }, + { + "epoch": 0.28, + "grad_norm": 0.3791350496852627, + "learning_rate": 0.0001690420020571747, + "loss": 0.6839, + "step": 525 + }, + { + "epoch": 0.28053333333333336, + "grad_norm": 0.41809923777937463, + "learning_rate": 0.0001689168904776979, + "loss": 0.7568, + "step": 526 + }, + { + "epoch": 0.2810666666666667, + "grad_norm": 0.41242984626947155, + "learning_rate": 0.00016879157310192535, + "loss": 0.6956, + "step": 527 + }, + { + "epoch": 0.2816, + "grad_norm": 0.4078573922896774, + "learning_rate": 0.0001686660503040737, + "loss": 0.7231, + "step": 528 + }, + { + "epoch": 0.28213333333333335, + "grad_norm": 0.3967851891990228, + "learning_rate": 0.00016854032245897308, + "loss": 0.7102, + "step": 529 + }, + { + "epoch": 0.2826666666666667, + "grad_norm": 0.3916542742680514, + "learning_rate": 0.00016841438994206595, + "loss": 0.7226, + "step": 530 + }, + { + "epoch": 0.2832, + "grad_norm": 0.39236733974013766, + "learning_rate": 0.00016828825312940592, + "loss": 0.762, + "step": 531 + }, + { + "epoch": 0.28373333333333334, + "grad_norm": 0.42339914460681044, + "learning_rate": 0.00016816191239765667, + "loss": 0.7814, + "step": 532 + }, + { + "epoch": 0.28426666666666667, + "grad_norm": 0.42067552663350977, + "learning_rate": 0.00016803536812409075, + "loss": 0.7906, + "step": 533 + }, + { + "epoch": 0.2848, + "grad_norm": 0.3764735729388975, + "learning_rate": 0.0001679086206865886, + "loss": 0.7398, + "step": 534 + }, + { + "epoch": 0.2853333333333333, + "grad_norm": 0.3834201056794837, + "learning_rate": 0.00016778167046363734, + "loss": 0.7034, + "step": 535 + }, + { + "epoch": 0.28586666666666666, + "grad_norm": 0.37414714001659194, + "learning_rate": 0.00016765451783432953, + "loss": 0.7126, + "step": 536 + }, + { + "epoch": 0.2864, + "grad_norm": 0.4343418279165122, + "learning_rate": 0.00016752716317836229, + "loss": 0.7961, + "step": 537 + }, + { + "epoch": 0.2869333333333333, + "grad_norm": 0.3624621540174937, + "learning_rate": 0.0001673996068760359, + "loss": 0.682, + "step": 538 + }, + { + "epoch": 0.28746666666666665, + "grad_norm": 0.45200035616835804, + "learning_rate": 0.00016727184930825288, + "loss": 0.7728, + "step": 539 + }, + { + "epoch": 0.288, + "grad_norm": 0.4124796415417627, + "learning_rate": 0.0001671438908565167, + "loss": 0.7864, + "step": 540 + }, + { + "epoch": 0.2885333333333333, + "grad_norm": 0.36882536374265185, + "learning_rate": 0.00016701573190293077, + "loss": 0.6878, + "step": 541 + }, + { + "epoch": 0.2890666666666667, + "grad_norm": 0.40553927671667717, + "learning_rate": 0.00016688737283019706, + "loss": 0.7213, + "step": 542 + }, + { + "epoch": 0.2896, + "grad_norm": 0.3934992140528808, + "learning_rate": 0.00016675881402161536, + "loss": 0.7619, + "step": 543 + }, + { + "epoch": 0.29013333333333335, + "grad_norm": 0.397573986004267, + "learning_rate": 0.00016663005586108176, + "loss": 0.7675, + "step": 544 + }, + { + "epoch": 0.2906666666666667, + "grad_norm": 0.40835174885439235, + "learning_rate": 0.00016650109873308765, + "loss": 0.7699, + "step": 545 + }, + { + "epoch": 0.2912, + "grad_norm": 0.46876368099461874, + "learning_rate": 0.0001663719430227186, + "loss": 0.7934, + "step": 546 + }, + { + "epoch": 0.29173333333333334, + "grad_norm": 0.49615378169211716, + "learning_rate": 0.0001662425891156531, + "loss": 0.7816, + "step": 547 + }, + { + "epoch": 0.2922666666666667, + "grad_norm": 0.4053875411080947, + "learning_rate": 0.00016611303739816168, + "loss": 0.7561, + "step": 548 + }, + { + "epoch": 0.2928, + "grad_norm": 0.42561219312927, + "learning_rate": 0.00016598328825710533, + "loss": 0.7132, + "step": 549 + }, + { + "epoch": 0.29333333333333333, + "grad_norm": 0.40219316282167905, + "learning_rate": 0.00016585334207993476, + "loss": 0.7283, + "step": 550 + }, + { + "epoch": 0.29386666666666666, + "grad_norm": 0.42082404021855246, + "learning_rate": 0.00016572319925468892, + "loss": 0.762, + "step": 551 + }, + { + "epoch": 0.2944, + "grad_norm": 0.4571926838964318, + "learning_rate": 0.000165592860169994, + "loss": 0.7537, + "step": 552 + }, + { + "epoch": 0.2949333333333333, + "grad_norm": 0.4206596352880861, + "learning_rate": 0.0001654623252150624, + "loss": 0.7656, + "step": 553 + }, + { + "epoch": 0.29546666666666666, + "grad_norm": 0.38463464009826004, + "learning_rate": 0.00016533159477969122, + "loss": 0.7162, + "step": 554 + }, + { + "epoch": 0.296, + "grad_norm": 0.44155313503274923, + "learning_rate": 0.00016520066925426144, + "loss": 0.8467, + "step": 555 + }, + { + "epoch": 0.2965333333333333, + "grad_norm": 0.3611117779925444, + "learning_rate": 0.00016506954902973655, + "loss": 0.6999, + "step": 556 + }, + { + "epoch": 0.29706666666666665, + "grad_norm": 0.3972195348207249, + "learning_rate": 0.00016493823449766136, + "loss": 0.7503, + "step": 557 + }, + { + "epoch": 0.2976, + "grad_norm": 0.3535015203682368, + "learning_rate": 0.0001648067260501611, + "loss": 0.6968, + "step": 558 + }, + { + "epoch": 0.2981333333333333, + "grad_norm": 0.3699748586388991, + "learning_rate": 0.00016467502407993992, + "loss": 0.7448, + "step": 559 + }, + { + "epoch": 0.2986666666666667, + "grad_norm": 0.4424967904763217, + "learning_rate": 0.0001645431289802799, + "loss": 0.791, + "step": 560 + }, + { + "epoch": 0.2992, + "grad_norm": 0.4498128558965614, + "learning_rate": 0.0001644110411450398, + "loss": 0.8088, + "step": 561 + }, + { + "epoch": 0.29973333333333335, + "grad_norm": 0.43667457643021707, + "learning_rate": 0.00016427876096865394, + "loss": 0.766, + "step": 562 + }, + { + "epoch": 0.3002666666666667, + "grad_norm": 0.44226026815601244, + "learning_rate": 0.00016414628884613107, + "loss": 0.7885, + "step": 563 + }, + { + "epoch": 0.3008, + "grad_norm": 0.3916530667440351, + "learning_rate": 0.00016401362517305296, + "loss": 0.6686, + "step": 564 + }, + { + "epoch": 0.30133333333333334, + "grad_norm": 0.41850974804992097, + "learning_rate": 0.00016388077034557355, + "loss": 0.7376, + "step": 565 + }, + { + "epoch": 0.30186666666666667, + "grad_norm": 0.41055031607605963, + "learning_rate": 0.00016374772476041748, + "loss": 0.769, + "step": 566 + }, + { + "epoch": 0.3024, + "grad_norm": 0.46206511511307036, + "learning_rate": 0.00016361448881487914, + "loss": 0.797, + "step": 567 + }, + { + "epoch": 0.30293333333333333, + "grad_norm": 0.4728340133986983, + "learning_rate": 0.00016348106290682118, + "loss": 0.7736, + "step": 568 + }, + { + "epoch": 0.30346666666666666, + "grad_norm": 0.3730680447553586, + "learning_rate": 0.00016334744743467364, + "loss": 0.6992, + "step": 569 + }, + { + "epoch": 0.304, + "grad_norm": 0.4074647159116535, + "learning_rate": 0.00016321364279743266, + "loss": 0.701, + "step": 570 + }, + { + "epoch": 0.3045333333333333, + "grad_norm": 0.4416522556853388, + "learning_rate": 0.00016307964939465914, + "loss": 0.767, + "step": 571 + }, + { + "epoch": 0.30506666666666665, + "grad_norm": 0.42544120300347543, + "learning_rate": 0.00016294546762647775, + "loss": 0.7523, + "step": 572 + }, + { + "epoch": 0.3056, + "grad_norm": 0.5021753292292243, + "learning_rate": 0.0001628110978935756, + "loss": 0.813, + "step": 573 + }, + { + "epoch": 0.3061333333333333, + "grad_norm": 0.49752288780509973, + "learning_rate": 0.0001626765405972011, + "loss": 0.8251, + "step": 574 + }, + { + "epoch": 0.30666666666666664, + "grad_norm": 0.43609273060819986, + "learning_rate": 0.00016254179613916278, + "loss": 0.7778, + "step": 575 + }, + { + "epoch": 0.3072, + "grad_norm": 0.3910594087749003, + "learning_rate": 0.00016240686492182804, + "loss": 0.7076, + "step": 576 + }, + { + "epoch": 0.30773333333333336, + "grad_norm": 0.42399806873192075, + "learning_rate": 0.000162271747348122, + "loss": 0.76, + "step": 577 + }, + { + "epoch": 0.3082666666666667, + "grad_norm": 0.4455481847186077, + "learning_rate": 0.0001621364438215262, + "loss": 0.7565, + "step": 578 + }, + { + "epoch": 0.3088, + "grad_norm": 0.4159572957710151, + "learning_rate": 0.00016200095474607753, + "loss": 0.7618, + "step": 579 + }, + { + "epoch": 0.30933333333333335, + "grad_norm": 0.3909709559237449, + "learning_rate": 0.00016186528052636692, + "loss": 0.7156, + "step": 580 + }, + { + "epoch": 0.3098666666666667, + "grad_norm": 0.35451495148952406, + "learning_rate": 0.0001617294215675382, + "loss": 0.6832, + "step": 581 + }, + { + "epoch": 0.3104, + "grad_norm": 0.3848727942305188, + "learning_rate": 0.00016159337827528685, + "loss": 0.7181, + "step": 582 + }, + { + "epoch": 0.31093333333333334, + "grad_norm": 0.43545144621771725, + "learning_rate": 0.0001614571510558588, + "loss": 0.7785, + "step": 583 + }, + { + "epoch": 0.31146666666666667, + "grad_norm": 0.3995989663529192, + "learning_rate": 0.00016132074031604917, + "loss": 0.7385, + "step": 584 + }, + { + "epoch": 0.312, + "grad_norm": 0.495353342458206, + "learning_rate": 0.0001611841464632011, + "loss": 0.7671, + "step": 585 + }, + { + "epoch": 0.31253333333333333, + "grad_norm": 0.47937900411962037, + "learning_rate": 0.00016104736990520468, + "loss": 0.8072, + "step": 586 + }, + { + "epoch": 0.31306666666666666, + "grad_norm": 0.43175232387396334, + "learning_rate": 0.0001609104110504954, + "loss": 0.7469, + "step": 587 + }, + { + "epoch": 0.3136, + "grad_norm": 0.4025463144197025, + "learning_rate": 0.0001607732703080532, + "loss": 0.7408, + "step": 588 + }, + { + "epoch": 0.3141333333333333, + "grad_norm": 0.4013799939839655, + "learning_rate": 0.00016063594808740113, + "loss": 0.7749, + "step": 589 + }, + { + "epoch": 0.31466666666666665, + "grad_norm": 0.43345023453575765, + "learning_rate": 0.00016049844479860422, + "loss": 0.7829, + "step": 590 + }, + { + "epoch": 0.3152, + "grad_norm": 0.4456430739648928, + "learning_rate": 0.00016036076085226814, + "loss": 0.7576, + "step": 591 + }, + { + "epoch": 0.3157333333333333, + "grad_norm": 0.38188836656420205, + "learning_rate": 0.00016022289665953808, + "loss": 0.6475, + "step": 592 + }, + { + "epoch": 0.31626666666666664, + "grad_norm": 0.4574232945912238, + "learning_rate": 0.00016008485263209742, + "loss": 0.735, + "step": 593 + }, + { + "epoch": 0.3168, + "grad_norm": 0.39777968133725466, + "learning_rate": 0.0001599466291821666, + "loss": 0.7652, + "step": 594 + }, + { + "epoch": 0.31733333333333336, + "grad_norm": 0.45612112090003415, + "learning_rate": 0.0001598082267225018, + "loss": 0.7095, + "step": 595 + }, + { + "epoch": 0.3178666666666667, + "grad_norm": 0.4075773787749344, + "learning_rate": 0.0001596696456663938, + "loss": 0.7045, + "step": 596 + }, + { + "epoch": 0.3184, + "grad_norm": 0.3862909785511598, + "learning_rate": 0.0001595308864276666, + "loss": 0.7453, + "step": 597 + }, + { + "epoch": 0.31893333333333335, + "grad_norm": 0.399928635845262, + "learning_rate": 0.00015939194942067646, + "loss": 0.6982, + "step": 598 + }, + { + "epoch": 0.3194666666666667, + "grad_norm": 0.41249320195892486, + "learning_rate": 0.0001592528350603103, + "loss": 0.7569, + "step": 599 + }, + { + "epoch": 0.32, + "grad_norm": 0.3911247474399028, + "learning_rate": 0.0001591135437619847, + "loss": 0.7288, + "step": 600 + }, + { + "epoch": 0.32053333333333334, + "grad_norm": 0.4004810545842315, + "learning_rate": 0.00015897407594164467, + "loss": 0.726, + "step": 601 + }, + { + "epoch": 0.32106666666666667, + "grad_norm": 0.3841028204352315, + "learning_rate": 0.00015883443201576225, + "loss": 0.7346, + "step": 602 + }, + { + "epoch": 0.3216, + "grad_norm": 0.41410558984432494, + "learning_rate": 0.0001586946124013354, + "loss": 0.7583, + "step": 603 + }, + { + "epoch": 0.3221333333333333, + "grad_norm": 0.4200283259363451, + "learning_rate": 0.00015855461751588677, + "loss": 0.7339, + "step": 604 + }, + { + "epoch": 0.32266666666666666, + "grad_norm": 0.4206229025591576, + "learning_rate": 0.0001584144477774623, + "loss": 0.7344, + "step": 605 + }, + { + "epoch": 0.3232, + "grad_norm": 0.4156803145577367, + "learning_rate": 0.0001582741036046301, + "loss": 0.7744, + "step": 606 + }, + { + "epoch": 0.3237333333333333, + "grad_norm": 0.4012540194977796, + "learning_rate": 0.00015813358541647915, + "loss": 0.7509, + "step": 607 + }, + { + "epoch": 0.32426666666666665, + "grad_norm": 0.4085776502061583, + "learning_rate": 0.00015799289363261813, + "loss": 0.7551, + "step": 608 + }, + { + "epoch": 0.3248, + "grad_norm": 0.39676717102139497, + "learning_rate": 0.00015785202867317407, + "loss": 0.7408, + "step": 609 + }, + { + "epoch": 0.3253333333333333, + "grad_norm": 0.38378286227930847, + "learning_rate": 0.00015771099095879108, + "loss": 0.7295, + "step": 610 + }, + { + "epoch": 0.3258666666666667, + "grad_norm": 0.421053547266049, + "learning_rate": 0.0001575697809106292, + "loss": 0.8521, + "step": 611 + }, + { + "epoch": 0.3264, + "grad_norm": 0.46088047457782805, + "learning_rate": 0.00015742839895036305, + "loss": 0.8125, + "step": 612 + }, + { + "epoch": 0.32693333333333335, + "grad_norm": 0.4276792719691738, + "learning_rate": 0.00015728684550018064, + "loss": 0.7634, + "step": 613 + }, + { + "epoch": 0.3274666666666667, + "grad_norm": 0.3925885280285251, + "learning_rate": 0.0001571451209827821, + "loss": 0.6892, + "step": 614 + }, + { + "epoch": 0.328, + "grad_norm": 0.5430540476336514, + "learning_rate": 0.00015700322582137827, + "loss": 0.7434, + "step": 615 + }, + { + "epoch": 0.32853333333333334, + "grad_norm": 0.3887821052046365, + "learning_rate": 0.00015686116043968972, + "loss": 0.7168, + "step": 616 + }, + { + "epoch": 0.3290666666666667, + "grad_norm": 0.4079441568956553, + "learning_rate": 0.00015671892526194516, + "loss": 0.7344, + "step": 617 + }, + { + "epoch": 0.3296, + "grad_norm": 0.4346419986751891, + "learning_rate": 0.0001565765207128805, + "loss": 0.7316, + "step": 618 + }, + { + "epoch": 0.33013333333333333, + "grad_norm": 0.43325646364544634, + "learning_rate": 0.0001564339472177373, + "loss": 0.7776, + "step": 619 + }, + { + "epoch": 0.33066666666666666, + "grad_norm": 0.3984917555196218, + "learning_rate": 0.00015629120520226165, + "loss": 0.6962, + "step": 620 + }, + { + "epoch": 0.3312, + "grad_norm": 0.4082803946841214, + "learning_rate": 0.0001561482950927029, + "loss": 0.7923, + "step": 621 + }, + { + "epoch": 0.3317333333333333, + "grad_norm": 0.3641715820819403, + "learning_rate": 0.0001560052173158123, + "loss": 0.6643, + "step": 622 + }, + { + "epoch": 0.33226666666666665, + "grad_norm": 0.3991432810815113, + "learning_rate": 0.00015586197229884184, + "loss": 0.7311, + "step": 623 + }, + { + "epoch": 0.3328, + "grad_norm": 0.3863300926249194, + "learning_rate": 0.00015571856046954285, + "loss": 0.74, + "step": 624 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.4859436336249122, + "learning_rate": 0.00015557498225616487, + "loss": 0.7763, + "step": 625 + }, + { + "epoch": 0.33386666666666664, + "grad_norm": 0.47830405925011366, + "learning_rate": 0.0001554312380874542, + "loss": 0.8074, + "step": 626 + }, + { + "epoch": 0.3344, + "grad_norm": 0.4657261158809216, + "learning_rate": 0.00015528732839265272, + "loss": 0.8706, + "step": 627 + }, + { + "epoch": 0.33493333333333336, + "grad_norm": 0.36286031031171573, + "learning_rate": 0.00015514325360149668, + "loss": 0.673, + "step": 628 + }, + { + "epoch": 0.3354666666666667, + "grad_norm": 0.45028701724880094, + "learning_rate": 0.0001549990141442153, + "loss": 0.7509, + "step": 629 + }, + { + "epoch": 0.336, + "grad_norm": 0.4201810736786452, + "learning_rate": 0.0001548546104515294, + "loss": 0.7447, + "step": 630 + }, + { + "epoch": 0.33653333333333335, + "grad_norm": 0.4939152221740485, + "learning_rate": 0.00015471004295465035, + "loss": 0.7872, + "step": 631 + }, + { + "epoch": 0.3370666666666667, + "grad_norm": 0.4066915162517629, + "learning_rate": 0.0001545653120852787, + "loss": 0.7218, + "step": 632 + }, + { + "epoch": 0.3376, + "grad_norm": 0.48413692220193694, + "learning_rate": 0.00015442041827560274, + "loss": 0.7152, + "step": 633 + }, + { + "epoch": 0.33813333333333334, + "grad_norm": 0.42940795553738864, + "learning_rate": 0.00015427536195829742, + "loss": 0.7457, + "step": 634 + }, + { + "epoch": 0.33866666666666667, + "grad_norm": 0.4828547742134924, + "learning_rate": 0.00015413014356652286, + "loss": 0.825, + "step": 635 + }, + { + "epoch": 0.3392, + "grad_norm": 0.40041206947874025, + "learning_rate": 0.00015398476353392323, + "loss": 0.6988, + "step": 636 + }, + { + "epoch": 0.33973333333333333, + "grad_norm": 0.41485634840986896, + "learning_rate": 0.00015383922229462549, + "loss": 0.7078, + "step": 637 + }, + { + "epoch": 0.34026666666666666, + "grad_norm": 0.4409892443135497, + "learning_rate": 0.00015369352028323774, + "loss": 0.8183, + "step": 638 + }, + { + "epoch": 0.3408, + "grad_norm": 0.39035772942470676, + "learning_rate": 0.00015354765793484834, + "loss": 0.7061, + "step": 639 + }, + { + "epoch": 0.3413333333333333, + "grad_norm": 0.40006171935545326, + "learning_rate": 0.0001534016356850244, + "loss": 0.6999, + "step": 640 + }, + { + "epoch": 0.34186666666666665, + "grad_norm": 0.446260087804474, + "learning_rate": 0.0001532554539698105, + "loss": 0.8193, + "step": 641 + }, + { + "epoch": 0.3424, + "grad_norm": 0.3958759602788678, + "learning_rate": 0.00015310911322572753, + "loss": 0.7817, + "step": 642 + }, + { + "epoch": 0.3429333333333333, + "grad_norm": 0.45673771090859966, + "learning_rate": 0.00015296261388977108, + "loss": 0.7665, + "step": 643 + }, + { + "epoch": 0.34346666666666664, + "grad_norm": 0.3998799624324905, + "learning_rate": 0.0001528159563994104, + "loss": 0.7017, + "step": 644 + }, + { + "epoch": 0.344, + "grad_norm": 0.47534550520921026, + "learning_rate": 0.000152669141192587, + "loss": 0.8133, + "step": 645 + }, + { + "epoch": 0.34453333333333336, + "grad_norm": 0.38642056247769024, + "learning_rate": 0.00015252216870771345, + "loss": 0.7129, + "step": 646 + }, + { + "epoch": 0.3450666666666667, + "grad_norm": 0.40304098042143616, + "learning_rate": 0.00015237503938367186, + "loss": 0.7441, + "step": 647 + }, + { + "epoch": 0.3456, + "grad_norm": 0.382119132421216, + "learning_rate": 0.00015222775365981273, + "loss": 0.6694, + "step": 648 + }, + { + "epoch": 0.34613333333333335, + "grad_norm": 0.3711789028725146, + "learning_rate": 0.00015208031197595356, + "loss": 0.67, + "step": 649 + }, + { + "epoch": 0.3466666666666667, + "grad_norm": 0.4001171072797071, + "learning_rate": 0.0001519327147723776, + "loss": 0.7339, + "step": 650 + }, + { + "epoch": 0.3472, + "grad_norm": 0.4008137127909884, + "learning_rate": 0.00015178496248983254, + "loss": 0.7103, + "step": 651 + }, + { + "epoch": 0.34773333333333334, + "grad_norm": 0.42932147112891744, + "learning_rate": 0.0001516370555695291, + "loss": 0.7508, + "step": 652 + }, + { + "epoch": 0.34826666666666667, + "grad_norm": 0.443665893254457, + "learning_rate": 0.00015148899445313981, + "loss": 0.7586, + "step": 653 + }, + { + "epoch": 0.3488, + "grad_norm": 0.41413894710051435, + "learning_rate": 0.00015134077958279765, + "loss": 0.7637, + "step": 654 + }, + { + "epoch": 0.34933333333333333, + "grad_norm": 0.4414564870415473, + "learning_rate": 0.00015119241140109467, + "loss": 0.8341, + "step": 655 + }, + { + "epoch": 0.34986666666666666, + "grad_norm": 0.40053786190114776, + "learning_rate": 0.00015104389035108077, + "loss": 0.7158, + "step": 656 + }, + { + "epoch": 0.3504, + "grad_norm": 0.3964999448735445, + "learning_rate": 0.00015089521687626243, + "loss": 0.7802, + "step": 657 + }, + { + "epoch": 0.3509333333333333, + "grad_norm": 0.38901853503797473, + "learning_rate": 0.0001507463914206012, + "loss": 0.6997, + "step": 658 + }, + { + "epoch": 0.35146666666666665, + "grad_norm": 0.38298133405679025, + "learning_rate": 0.0001505974144285124, + "loss": 0.6957, + "step": 659 + }, + { + "epoch": 0.352, + "grad_norm": 0.39563189030321416, + "learning_rate": 0.000150448286344864, + "loss": 0.7326, + "step": 660 + }, + { + "epoch": 0.3525333333333333, + "grad_norm": 0.4893833016542019, + "learning_rate": 0.00015029900761497506, + "loss": 0.827, + "step": 661 + }, + { + "epoch": 0.35306666666666664, + "grad_norm": 0.4701387507607022, + "learning_rate": 0.00015014957868461458, + "loss": 0.8191, + "step": 662 + }, + { + "epoch": 0.3536, + "grad_norm": 0.4121261824700458, + "learning_rate": 0.00015000000000000001, + "loss": 0.7452, + "step": 663 + }, + { + "epoch": 0.35413333333333336, + "grad_norm": 0.4368116591376392, + "learning_rate": 0.000149850272007796, + "loss": 0.7436, + "step": 664 + }, + { + "epoch": 0.3546666666666667, + "grad_norm": 0.4600104278464735, + "learning_rate": 0.00014970039515511304, + "loss": 0.761, + "step": 665 + }, + { + "epoch": 0.3552, + "grad_norm": 0.41678022759529676, + "learning_rate": 0.00014955036988950618, + "loss": 0.8406, + "step": 666 + }, + { + "epoch": 0.35573333333333335, + "grad_norm": 0.4335627895419274, + "learning_rate": 0.0001494001966589736, + "loss": 0.7462, + "step": 667 + }, + { + "epoch": 0.3562666666666667, + "grad_norm": 0.3972841911355577, + "learning_rate": 0.00014924987591195547, + "loss": 0.7077, + "step": 668 + }, + { + "epoch": 0.3568, + "grad_norm": 0.40045395994328453, + "learning_rate": 0.00014909940809733222, + "loss": 0.6897, + "step": 669 + }, + { + "epoch": 0.35733333333333334, + "grad_norm": 0.4061394581635898, + "learning_rate": 0.0001489487936644237, + "loss": 0.7358, + "step": 670 + }, + { + "epoch": 0.35786666666666667, + "grad_norm": 0.42942956602146515, + "learning_rate": 0.00014879803306298736, + "loss": 0.7192, + "step": 671 + }, + { + "epoch": 0.3584, + "grad_norm": 0.4075239157016998, + "learning_rate": 0.00014864712674321734, + "loss": 0.7562, + "step": 672 + }, + { + "epoch": 0.3589333333333333, + "grad_norm": 0.3862678290075918, + "learning_rate": 0.00014849607515574276, + "loss": 0.7188, + "step": 673 + }, + { + "epoch": 0.35946666666666666, + "grad_norm": 0.40313571654281727, + "learning_rate": 0.00014834487875162657, + "loss": 0.8012, + "step": 674 + }, + { + "epoch": 0.36, + "grad_norm": 0.4254581098635978, + "learning_rate": 0.00014819353798236427, + "loss": 0.6991, + "step": 675 + }, + { + "epoch": 0.3605333333333333, + "grad_norm": 0.43773082579986694, + "learning_rate": 0.00014804205329988225, + "loss": 0.7523, + "step": 676 + }, + { + "epoch": 0.36106666666666665, + "grad_norm": 0.3802979587947457, + "learning_rate": 0.00014789042515653687, + "loss": 0.6875, + "step": 677 + }, + { + "epoch": 0.3616, + "grad_norm": 0.3922326377194318, + "learning_rate": 0.00014773865400511272, + "loss": 0.7306, + "step": 678 + }, + { + "epoch": 0.3621333333333333, + "grad_norm": 0.4136677784543476, + "learning_rate": 0.00014758674029882152, + "loss": 0.7304, + "step": 679 + }, + { + "epoch": 0.3626666666666667, + "grad_norm": 0.3705776881560074, + "learning_rate": 0.00014743468449130063, + "loss": 0.6981, + "step": 680 + }, + { + "epoch": 0.3632, + "grad_norm": 0.4052160744898137, + "learning_rate": 0.00014728248703661182, + "loss": 0.7498, + "step": 681 + }, + { + "epoch": 0.36373333333333335, + "grad_norm": 0.424671283876574, + "learning_rate": 0.00014713014838923976, + "loss": 0.6655, + "step": 682 + }, + { + "epoch": 0.3642666666666667, + "grad_norm": 0.3648877686813302, + "learning_rate": 0.00014697766900409074, + "loss": 0.6902, + "step": 683 + }, + { + "epoch": 0.3648, + "grad_norm": 0.4557973911607531, + "learning_rate": 0.00014682504933649144, + "loss": 0.8138, + "step": 684 + }, + { + "epoch": 0.36533333333333334, + "grad_norm": 0.38960918349115975, + "learning_rate": 0.0001466722898421873, + "loss": 0.6965, + "step": 685 + }, + { + "epoch": 0.3658666666666667, + "grad_norm": 0.36731589024834815, + "learning_rate": 0.0001465193909773413, + "loss": 0.6657, + "step": 686 + }, + { + "epoch": 0.3664, + "grad_norm": 0.3915171183405222, + "learning_rate": 0.00014636635319853275, + "loss": 0.7516, + "step": 687 + }, + { + "epoch": 0.36693333333333333, + "grad_norm": 0.415460175616078, + "learning_rate": 0.00014621317696275564, + "loss": 0.7333, + "step": 688 + }, + { + "epoch": 0.36746666666666666, + "grad_norm": 0.3800235477148507, + "learning_rate": 0.00014605986272741748, + "loss": 0.7432, + "step": 689 + }, + { + "epoch": 0.368, + "grad_norm": 0.4438067155328549, + "learning_rate": 0.00014590641095033787, + "loss": 0.7023, + "step": 690 + }, + { + "epoch": 0.3685333333333333, + "grad_norm": 0.4208286667125372, + "learning_rate": 0.00014575282208974702, + "loss": 0.7524, + "step": 691 + }, + { + "epoch": 0.36906666666666665, + "grad_norm": 0.40353530011730415, + "learning_rate": 0.00014559909660428468, + "loss": 0.688, + "step": 692 + }, + { + "epoch": 0.3696, + "grad_norm": 0.4199313683077951, + "learning_rate": 0.00014544523495299842, + "loss": 0.7732, + "step": 693 + }, + { + "epoch": 0.3701333333333333, + "grad_norm": 0.3691237934804446, + "learning_rate": 0.00014529123759534255, + "loss": 0.7028, + "step": 694 + }, + { + "epoch": 0.37066666666666664, + "grad_norm": 0.40556832944039695, + "learning_rate": 0.00014513710499117647, + "loss": 0.7003, + "step": 695 + }, + { + "epoch": 0.3712, + "grad_norm": 0.5177498432332827, + "learning_rate": 0.0001449828376007636, + "loss": 0.8535, + "step": 696 + }, + { + "epoch": 0.37173333333333336, + "grad_norm": 0.4793977878312778, + "learning_rate": 0.00014482843588476974, + "loss": 0.7669, + "step": 697 + }, + { + "epoch": 0.3722666666666667, + "grad_norm": 0.36899268429416426, + "learning_rate": 0.00014467390030426186, + "loss": 0.6601, + "step": 698 + }, + { + "epoch": 0.3728, + "grad_norm": 0.3932280743144855, + "learning_rate": 0.0001445192313207067, + "loss": 0.7379, + "step": 699 + }, + { + "epoch": 0.37333333333333335, + "grad_norm": 0.36856723406712216, + "learning_rate": 0.0001443644293959693, + "loss": 0.6514, + "step": 700 + }, + { + "epoch": 0.3738666666666667, + "grad_norm": 0.4495881605134991, + "learning_rate": 0.00014420949499231172, + "loss": 0.7473, + "step": 701 + }, + { + "epoch": 0.3744, + "grad_norm": 0.4710259029838652, + "learning_rate": 0.0001440544285723915, + "loss": 0.7786, + "step": 702 + }, + { + "epoch": 0.37493333333333334, + "grad_norm": 0.36807343178593516, + "learning_rate": 0.00014389923059926062, + "loss": 0.6731, + "step": 703 + }, + { + "epoch": 0.37546666666666667, + "grad_norm": 0.34658569975565173, + "learning_rate": 0.0001437439015363638, + "loss": 0.6782, + "step": 704 + }, + { + "epoch": 0.376, + "grad_norm": 0.40579788888054424, + "learning_rate": 0.00014358844184753712, + "loss": 0.6872, + "step": 705 + }, + { + "epoch": 0.37653333333333333, + "grad_norm": 0.38918742202678164, + "learning_rate": 0.00014343285199700683, + "loss": 0.7208, + "step": 706 + }, + { + "epoch": 0.37706666666666666, + "grad_norm": 0.3739971026467054, + "learning_rate": 0.0001432771324493879, + "loss": 0.6854, + "step": 707 + }, + { + "epoch": 0.3776, + "grad_norm": 0.47901758944800493, + "learning_rate": 0.00014312128366968243, + "loss": 0.8333, + "step": 708 + }, + { + "epoch": 0.3781333333333333, + "grad_norm": 0.4303849937384296, + "learning_rate": 0.00014296530612327863, + "loss": 0.7523, + "step": 709 + }, + { + "epoch": 0.37866666666666665, + "grad_norm": 0.4374209556151092, + "learning_rate": 0.00014280920027594907, + "loss": 0.7678, + "step": 710 + }, + { + "epoch": 0.3792, + "grad_norm": 0.47198809216605603, + "learning_rate": 0.00014265296659384956, + "loss": 0.8356, + "step": 711 + }, + { + "epoch": 0.3797333333333333, + "grad_norm": 0.4024862399022665, + "learning_rate": 0.00014249660554351752, + "loss": 0.7007, + "step": 712 + }, + { + "epoch": 0.38026666666666664, + "grad_norm": 0.3792924443405352, + "learning_rate": 0.00014234011759187083, + "loss": 0.7153, + "step": 713 + }, + { + "epoch": 0.3808, + "grad_norm": 0.430053974658381, + "learning_rate": 0.00014218350320620624, + "loss": 0.7302, + "step": 714 + }, + { + "epoch": 0.38133333333333336, + "grad_norm": 0.4012972328411576, + "learning_rate": 0.00014202676285419812, + "loss": 0.774, + "step": 715 + }, + { + "epoch": 0.3818666666666667, + "grad_norm": 0.39699382652413323, + "learning_rate": 0.00014186989700389687, + "loss": 0.7181, + "step": 716 + }, + { + "epoch": 0.3824, + "grad_norm": 0.3719421065089118, + "learning_rate": 0.0001417129061237278, + "loss": 0.6906, + "step": 717 + }, + { + "epoch": 0.38293333333333335, + "grad_norm": 0.3887516144237888, + "learning_rate": 0.0001415557906824895, + "loss": 0.6608, + "step": 718 + }, + { + "epoch": 0.3834666666666667, + "grad_norm": 0.3895210635046898, + "learning_rate": 0.00014139855114935252, + "loss": 0.7179, + "step": 719 + }, + { + "epoch": 0.384, + "grad_norm": 0.45450082390451063, + "learning_rate": 0.00014124118799385796, + "loss": 0.8367, + "step": 720 + }, + { + "epoch": 0.38453333333333334, + "grad_norm": 0.3725659355692262, + "learning_rate": 0.0001410837016859161, + "loss": 0.6959, + "step": 721 + }, + { + "epoch": 0.38506666666666667, + "grad_norm": 0.35309941086865837, + "learning_rate": 0.00014092609269580496, + "loss": 0.6689, + "step": 722 + }, + { + "epoch": 0.3856, + "grad_norm": 0.3963476275709724, + "learning_rate": 0.00014076836149416887, + "loss": 0.6858, + "step": 723 + }, + { + "epoch": 0.38613333333333333, + "grad_norm": 0.4528365425048492, + "learning_rate": 0.00014061050855201723, + "loss": 0.7301, + "step": 724 + }, + { + "epoch": 0.38666666666666666, + "grad_norm": 0.3795630731390314, + "learning_rate": 0.0001404525343407228, + "loss": 0.6641, + "step": 725 + }, + { + "epoch": 0.3872, + "grad_norm": 0.4496395597025512, + "learning_rate": 0.0001402944393320206, + "loss": 0.7653, + "step": 726 + }, + { + "epoch": 0.3877333333333333, + "grad_norm": 0.4691699332043802, + "learning_rate": 0.00014013622399800627, + "loss": 0.7242, + "step": 727 + }, + { + "epoch": 0.38826666666666665, + "grad_norm": 0.4069917374756741, + "learning_rate": 0.00013997788881113489, + "loss": 0.7501, + "step": 728 + }, + { + "epoch": 0.3888, + "grad_norm": 0.4323151852428113, + "learning_rate": 0.00013981943424421932, + "loss": 0.7484, + "step": 729 + }, + { + "epoch": 0.3893333333333333, + "grad_norm": 0.3829524315730722, + "learning_rate": 0.0001396608607704289, + "loss": 0.7703, + "step": 730 + }, + { + "epoch": 0.38986666666666664, + "grad_norm": 0.479005062678588, + "learning_rate": 0.0001395021688632882, + "loss": 0.8218, + "step": 731 + }, + { + "epoch": 0.3904, + "grad_norm": 0.3976082145679388, + "learning_rate": 0.00013934335899667527, + "loss": 0.7109, + "step": 732 + }, + { + "epoch": 0.39093333333333335, + "grad_norm": 0.376953678622604, + "learning_rate": 0.00013918443164482046, + "loss": 0.6589, + "step": 733 + }, + { + "epoch": 0.3914666666666667, + "grad_norm": 0.41759204644824766, + "learning_rate": 0.000139025387282305, + "loss": 0.751, + "step": 734 + }, + { + "epoch": 0.392, + "grad_norm": 0.44479204319411103, + "learning_rate": 0.00013886622638405952, + "loss": 0.773, + "step": 735 + }, + { + "epoch": 0.39253333333333335, + "grad_norm": 0.40128061601500625, + "learning_rate": 0.0001387069494253626, + "loss": 0.7418, + "step": 736 + }, + { + "epoch": 0.3930666666666667, + "grad_norm": 0.43936434787238515, + "learning_rate": 0.0001385475568818394, + "loss": 0.7049, + "step": 737 + }, + { + "epoch": 0.3936, + "grad_norm": 0.46942071117925305, + "learning_rate": 0.00013838804922946027, + "loss": 0.8682, + "step": 738 + }, + { + "epoch": 0.39413333333333334, + "grad_norm": 0.3505449819227317, + "learning_rate": 0.00013822842694453924, + "loss": 0.6281, + "step": 739 + }, + { + "epoch": 0.39466666666666667, + "grad_norm": 0.3886842803955136, + "learning_rate": 0.0001380686905037327, + "loss": 0.6781, + "step": 740 + }, + { + "epoch": 0.3952, + "grad_norm": 0.4250225771278342, + "learning_rate": 0.00013790884038403795, + "loss": 0.7187, + "step": 741 + }, + { + "epoch": 0.3957333333333333, + "grad_norm": 0.38015548304077507, + "learning_rate": 0.00013774887706279165, + "loss": 0.7301, + "step": 742 + }, + { + "epoch": 0.39626666666666666, + "grad_norm": 0.4090302692938638, + "learning_rate": 0.0001375888010176686, + "loss": 0.7224, + "step": 743 + }, + { + "epoch": 0.3968, + "grad_norm": 0.41420052771101323, + "learning_rate": 0.00013742861272668012, + "loss": 0.7579, + "step": 744 + }, + { + "epoch": 0.3973333333333333, + "grad_norm": 0.3865481278497134, + "learning_rate": 0.00013726831266817278, + "loss": 0.7213, + "step": 745 + }, + { + "epoch": 0.39786666666666665, + "grad_norm": 0.3711153136441077, + "learning_rate": 0.00013710790132082692, + "loss": 0.6416, + "step": 746 + }, + { + "epoch": 0.3984, + "grad_norm": 0.3994148069993138, + "learning_rate": 0.00013694737916365517, + "loss": 0.708, + "step": 747 + }, + { + "epoch": 0.3989333333333333, + "grad_norm": 0.43846752150037704, + "learning_rate": 0.00013678674667600102, + "loss": 0.6679, + "step": 748 + }, + { + "epoch": 0.3994666666666667, + "grad_norm": 0.3943227265598233, + "learning_rate": 0.00013662600433753745, + "loss": 0.6761, + "step": 749 + }, + { + "epoch": 0.4, + "grad_norm": 0.4690445251675281, + "learning_rate": 0.00013646515262826552, + "loss": 0.7539, + "step": 750 + }, + { + "epoch": 0.40053333333333335, + "grad_norm": 0.38284744944500254, + "learning_rate": 0.00013630419202851284, + "loss": 0.6997, + "step": 751 + }, + { + "epoch": 0.4010666666666667, + "grad_norm": 0.39521289749916383, + "learning_rate": 0.00013614312301893223, + "loss": 0.6664, + "step": 752 + }, + { + "epoch": 0.4016, + "grad_norm": 0.48568882811090963, + "learning_rate": 0.0001359819460805001, + "loss": 0.7892, + "step": 753 + }, + { + "epoch": 0.40213333333333334, + "grad_norm": 0.44396111349935563, + "learning_rate": 0.00013582066169451535, + "loss": 0.8052, + "step": 754 + }, + { + "epoch": 0.4026666666666667, + "grad_norm": 0.3890595348444515, + "learning_rate": 0.0001356592703425976, + "loss": 0.6517, + "step": 755 + }, + { + "epoch": 0.4032, + "grad_norm": 0.417056206989733, + "learning_rate": 0.0001354977725066859, + "loss": 0.7816, + "step": 756 + }, + { + "epoch": 0.40373333333333333, + "grad_norm": 0.37572599477212015, + "learning_rate": 0.00013533616866903735, + "loss": 0.6736, + "step": 757 + }, + { + "epoch": 0.40426666666666666, + "grad_norm": 0.43568748018498377, + "learning_rate": 0.0001351744593122255, + "loss": 0.7693, + "step": 758 + }, + { + "epoch": 0.4048, + "grad_norm": 0.4053396712735877, + "learning_rate": 0.00013501264491913906, + "loss": 0.6891, + "step": 759 + }, + { + "epoch": 0.4053333333333333, + "grad_norm": 0.3575046459966235, + "learning_rate": 0.00013485072597298038, + "loss": 0.6732, + "step": 760 + }, + { + "epoch": 0.40586666666666665, + "grad_norm": 0.40571273371726574, + "learning_rate": 0.00013468870295726398, + "loss": 0.7085, + "step": 761 + }, + { + "epoch": 0.4064, + "grad_norm": 0.3756296277014389, + "learning_rate": 0.0001345265763558152, + "loss": 0.6825, + "step": 762 + }, + { + "epoch": 0.4069333333333333, + "grad_norm": 0.37585736144686316, + "learning_rate": 0.00013436434665276865, + "loss": 0.6686, + "step": 763 + }, + { + "epoch": 0.40746666666666664, + "grad_norm": 0.47134961235009076, + "learning_rate": 0.00013420201433256689, + "loss": 0.8003, + "step": 764 + }, + { + "epoch": 0.408, + "grad_norm": 0.4229574470837783, + "learning_rate": 0.00013403957987995882, + "loss": 0.7446, + "step": 765 + }, + { + "epoch": 0.40853333333333336, + "grad_norm": 0.41021182123563316, + "learning_rate": 0.00013387704377999842, + "loss": 0.7067, + "step": 766 + }, + { + "epoch": 0.4090666666666667, + "grad_norm": 0.43462186873872444, + "learning_rate": 0.00013371440651804313, + "loss": 0.7266, + "step": 767 + }, + { + "epoch": 0.4096, + "grad_norm": 0.39502082983477904, + "learning_rate": 0.0001335516685797525, + "loss": 0.7415, + "step": 768 + }, + { + "epoch": 0.41013333333333335, + "grad_norm": 0.41845170832603323, + "learning_rate": 0.00013338883045108674, + "loss": 0.7058, + "step": 769 + }, + { + "epoch": 0.4106666666666667, + "grad_norm": 0.38417517485198227, + "learning_rate": 0.00013322589261830517, + "loss": 0.7297, + "step": 770 + }, + { + "epoch": 0.4112, + "grad_norm": 0.46865854857087563, + "learning_rate": 0.00013306285556796495, + "loss": 0.7911, + "step": 771 + }, + { + "epoch": 0.41173333333333334, + "grad_norm": 0.432714007982488, + "learning_rate": 0.0001328997197869194, + "loss": 0.7296, + "step": 772 + }, + { + "epoch": 0.41226666666666667, + "grad_norm": 0.4130104753805327, + "learning_rate": 0.0001327364857623168, + "loss": 0.7109, + "step": 773 + }, + { + "epoch": 0.4128, + "grad_norm": 0.3965384097121275, + "learning_rate": 0.00013257315398159864, + "loss": 0.712, + "step": 774 + }, + { + "epoch": 0.41333333333333333, + "grad_norm": 0.39753345856911626, + "learning_rate": 0.00013240972493249847, + "loss": 0.7294, + "step": 775 + }, + { + "epoch": 0.41386666666666666, + "grad_norm": 0.37337568391982695, + "learning_rate": 0.0001322461991030402, + "loss": 0.6801, + "step": 776 + }, + { + "epoch": 0.4144, + "grad_norm": 0.42495863013532975, + "learning_rate": 0.00013208257698153677, + "loss": 0.7795, + "step": 777 + }, + { + "epoch": 0.4149333333333333, + "grad_norm": 0.3694816298883662, + "learning_rate": 0.00013191885905658872, + "loss": 0.7609, + "step": 778 + }, + { + "epoch": 0.41546666666666665, + "grad_norm": 0.422197920472493, + "learning_rate": 0.0001317550458170826, + "loss": 0.727, + "step": 779 + }, + { + "epoch": 0.416, + "grad_norm": 0.42133160117778606, + "learning_rate": 0.00013159113775218964, + "loss": 0.74, + "step": 780 + }, + { + "epoch": 0.4165333333333333, + "grad_norm": 0.40522540034554205, + "learning_rate": 0.00013142713535136414, + "loss": 0.7057, + "step": 781 + }, + { + "epoch": 0.41706666666666664, + "grad_norm": 0.4257453396923358, + "learning_rate": 0.00013126303910434214, + "loss": 0.7536, + "step": 782 + }, + { + "epoch": 0.4176, + "grad_norm": 0.45821836303267105, + "learning_rate": 0.00013109884950114007, + "loss": 0.7428, + "step": 783 + }, + { + "epoch": 0.41813333333333336, + "grad_norm": 0.40996940512801966, + "learning_rate": 0.00013093456703205288, + "loss": 0.7069, + "step": 784 + }, + { + "epoch": 0.4186666666666667, + "grad_norm": 0.4136682843634082, + "learning_rate": 0.00013077019218765305, + "loss": 0.7174, + "step": 785 + }, + { + "epoch": 0.4192, + "grad_norm": 0.4330694095649913, + "learning_rate": 0.00013060572545878875, + "loss": 0.7558, + "step": 786 + }, + { + "epoch": 0.41973333333333335, + "grad_norm": 0.38072139633099955, + "learning_rate": 0.0001304411673365826, + "loss": 0.7485, + "step": 787 + }, + { + "epoch": 0.4202666666666667, + "grad_norm": 0.44681542398150365, + "learning_rate": 0.0001302765183124302, + "loss": 0.752, + "step": 788 + }, + { + "epoch": 0.4208, + "grad_norm": 0.4835967639933307, + "learning_rate": 0.00013011177887799845, + "loss": 0.7112, + "step": 789 + }, + { + "epoch": 0.42133333333333334, + "grad_norm": 0.4337798575706466, + "learning_rate": 0.00012994694952522435, + "loss": 0.7119, + "step": 790 + }, + { + "epoch": 0.42186666666666667, + "grad_norm": 0.4394053368246134, + "learning_rate": 0.00012978203074631334, + "loss": 0.7606, + "step": 791 + }, + { + "epoch": 0.4224, + "grad_norm": 0.4184670488382918, + "learning_rate": 0.00012961702303373795, + "loss": 0.7899, + "step": 792 + }, + { + "epoch": 0.42293333333333333, + "grad_norm": 0.4486236155519449, + "learning_rate": 0.00012945192688023624, + "loss": 0.6806, + "step": 793 + }, + { + "epoch": 0.42346666666666666, + "grad_norm": 0.37217166593542145, + "learning_rate": 0.0001292867427788104, + "loss": 0.6633, + "step": 794 + }, + { + "epoch": 0.424, + "grad_norm": 0.4056099913698769, + "learning_rate": 0.00012912147122272523, + "loss": 0.6699, + "step": 795 + }, + { + "epoch": 0.4245333333333333, + "grad_norm": 0.3748965966249936, + "learning_rate": 0.00012895611270550666, + "loss": 0.6827, + "step": 796 + }, + { + "epoch": 0.42506666666666665, + "grad_norm": 0.3682053282598978, + "learning_rate": 0.0001287906677209403, + "loss": 0.6746, + "step": 797 + }, + { + "epoch": 0.4256, + "grad_norm": 0.4648781783236153, + "learning_rate": 0.00012862513676307008, + "loss": 0.7523, + "step": 798 + }, + { + "epoch": 0.4261333333333333, + "grad_norm": 0.3927181598506785, + "learning_rate": 0.0001284595203261965, + "loss": 0.6646, + "step": 799 + }, + { + "epoch": 0.4266666666666667, + "grad_norm": 0.4302304548474638, + "learning_rate": 0.00012829381890487536, + "loss": 0.7261, + "step": 800 + }, + { + "epoch": 0.4272, + "grad_norm": 0.4037203306983247, + "learning_rate": 0.00012812803299391628, + "loss": 0.717, + "step": 801 + }, + { + "epoch": 0.42773333333333335, + "grad_norm": 0.45973219805149657, + "learning_rate": 0.00012796216308838117, + "loss": 0.7974, + "step": 802 + }, + { + "epoch": 0.4282666666666667, + "grad_norm": 0.4156731658562213, + "learning_rate": 0.00012779620968358273, + "loss": 0.6816, + "step": 803 + }, + { + "epoch": 0.4288, + "grad_norm": 0.38686160742998493, + "learning_rate": 0.00012763017327508305, + "loss": 0.6187, + "step": 804 + }, + { + "epoch": 0.42933333333333334, + "grad_norm": 0.38541876275141473, + "learning_rate": 0.00012746405435869198, + "loss": 0.731, + "step": 805 + }, + { + "epoch": 0.4298666666666667, + "grad_norm": 0.3650985882425949, + "learning_rate": 0.00012729785343046588, + "loss": 0.6375, + "step": 806 + }, + { + "epoch": 0.4304, + "grad_norm": 0.3820816035240978, + "learning_rate": 0.0001271315709867059, + "loss": 0.6481, + "step": 807 + }, + { + "epoch": 0.43093333333333333, + "grad_norm": 0.4098100896890776, + "learning_rate": 0.00012696520752395672, + "loss": 0.7088, + "step": 808 + }, + { + "epoch": 0.43146666666666667, + "grad_norm": 0.4019992067110189, + "learning_rate": 0.00012679876353900482, + "loss": 0.6383, + "step": 809 + }, + { + "epoch": 0.432, + "grad_norm": 0.4477475970231683, + "learning_rate": 0.00012663223952887723, + "loss": 0.7161, + "step": 810 + }, + { + "epoch": 0.4325333333333333, + "grad_norm": 0.4832723278823965, + "learning_rate": 0.00012646563599083996, + "loss": 0.8073, + "step": 811 + }, + { + "epoch": 0.43306666666666666, + "grad_norm": 0.39144528129308914, + "learning_rate": 0.00012629895342239643, + "loss": 0.7019, + "step": 812 + }, + { + "epoch": 0.4336, + "grad_norm": 0.48014585161105533, + "learning_rate": 0.00012613219232128608, + "loss": 0.7228, + "step": 813 + }, + { + "epoch": 0.4341333333333333, + "grad_norm": 0.38467131682164596, + "learning_rate": 0.00012596535318548289, + "loss": 0.6221, + "step": 814 + }, + { + "epoch": 0.43466666666666665, + "grad_norm": 0.4530745317538008, + "learning_rate": 0.0001257984365131938, + "loss": 0.718, + "step": 815 + }, + { + "epoch": 0.4352, + "grad_norm": 0.4082702248795799, + "learning_rate": 0.00012563144280285741, + "loss": 0.6797, + "step": 816 + }, + { + "epoch": 0.4357333333333333, + "grad_norm": 0.43342089139276424, + "learning_rate": 0.00012546437255314222, + "loss": 0.6645, + "step": 817 + }, + { + "epoch": 0.4362666666666667, + "grad_norm": 0.4207451429602256, + "learning_rate": 0.0001252972262629454, + "loss": 0.6833, + "step": 818 + }, + { + "epoch": 0.4368, + "grad_norm": 0.38360984855426133, + "learning_rate": 0.00012513000443139112, + "loss": 0.6894, + "step": 819 + }, + { + "epoch": 0.43733333333333335, + "grad_norm": 0.4240341291920665, + "learning_rate": 0.00012496270755782914, + "loss": 0.6722, + "step": 820 + }, + { + "epoch": 0.4378666666666667, + "grad_norm": 0.4305886105350733, + "learning_rate": 0.00012479533614183334, + "loss": 0.7681, + "step": 821 + }, + { + "epoch": 0.4384, + "grad_norm": 0.3823607516679746, + "learning_rate": 0.00012462789068320017, + "loss": 0.6938, + "step": 822 + }, + { + "epoch": 0.43893333333333334, + "grad_norm": 0.3729241120465026, + "learning_rate": 0.00012446037168194714, + "loss": 0.6707, + "step": 823 + }, + { + "epoch": 0.43946666666666667, + "grad_norm": 0.4818185548709059, + "learning_rate": 0.00012429277963831148, + "loss": 0.7729, + "step": 824 + }, + { + "epoch": 0.44, + "grad_norm": 0.3839065434027884, + "learning_rate": 0.00012412511505274844, + "loss": 0.6874, + "step": 825 + }, + { + "epoch": 0.44053333333333333, + "grad_norm": 0.6072014627863225, + "learning_rate": 0.00012395737842592995, + "loss": 0.6463, + "step": 826 + }, + { + "epoch": 0.44106666666666666, + "grad_norm": 0.45298407699820475, + "learning_rate": 0.000123789570258743, + "loss": 0.795, + "step": 827 + }, + { + "epoch": 0.4416, + "grad_norm": 0.3768745247623755, + "learning_rate": 0.00012362169105228826, + "loss": 0.6621, + "step": 828 + }, + { + "epoch": 0.4421333333333333, + "grad_norm": 0.46702341552804794, + "learning_rate": 0.00012345374130787854, + "loss": 0.759, + "step": 829 + }, + { + "epoch": 0.44266666666666665, + "grad_norm": 0.37489083732690975, + "learning_rate": 0.00012328572152703725, + "loss": 0.6834, + "step": 830 + }, + { + "epoch": 0.4432, + "grad_norm": 0.395287838499955, + "learning_rate": 0.000123117632211497, + "loss": 0.6993, + "step": 831 + }, + { + "epoch": 0.4437333333333333, + "grad_norm": 0.4268120690808298, + "learning_rate": 0.00012294947386319794, + "loss": 0.7052, + "step": 832 + }, + { + "epoch": 0.44426666666666664, + "grad_norm": 0.42529780825975927, + "learning_rate": 0.0001227812469842864, + "loss": 0.7153, + "step": 833 + }, + { + "epoch": 0.4448, + "grad_norm": 0.4770704866468159, + "learning_rate": 0.00012261295207711346, + "loss": 0.8014, + "step": 834 + }, + { + "epoch": 0.44533333333333336, + "grad_norm": 0.41073002696832, + "learning_rate": 0.00012244458964423327, + "loss": 0.6851, + "step": 835 + }, + { + "epoch": 0.4458666666666667, + "grad_norm": 0.42479891566442424, + "learning_rate": 0.00012227616018840154, + "loss": 0.7233, + "step": 836 + }, + { + "epoch": 0.4464, + "grad_norm": 0.43012358618229257, + "learning_rate": 0.0001221076642125742, + "loss": 0.7238, + "step": 837 + }, + { + "epoch": 0.44693333333333335, + "grad_norm": 0.427970768184565, + "learning_rate": 0.00012193910221990581, + "loss": 0.7545, + "step": 838 + }, + { + "epoch": 0.4474666666666667, + "grad_norm": 0.44220901200305707, + "learning_rate": 0.00012177047471374807, + "loss": 0.7454, + "step": 839 + }, + { + "epoch": 0.448, + "grad_norm": 0.3708598571062073, + "learning_rate": 0.00012160178219764837, + "loss": 0.6875, + "step": 840 + }, + { + "epoch": 0.44853333333333334, + "grad_norm": 0.38154112367893656, + "learning_rate": 0.0001214330251753481, + "loss": 0.655, + "step": 841 + }, + { + "epoch": 0.44906666666666667, + "grad_norm": 0.4180924914745405, + "learning_rate": 0.00012126420415078132, + "loss": 0.7026, + "step": 842 + }, + { + "epoch": 0.4496, + "grad_norm": 0.37706281735537867, + "learning_rate": 0.00012109531962807332, + "loss": 0.6642, + "step": 843 + }, + { + "epoch": 0.45013333333333333, + "grad_norm": 0.45162402306065524, + "learning_rate": 0.00012092637211153885, + "loss": 0.702, + "step": 844 + }, + { + "epoch": 0.45066666666666666, + "grad_norm": 0.38926688752106436, + "learning_rate": 0.0001207573621056809, + "loss": 0.6694, + "step": 845 + }, + { + "epoch": 0.4512, + "grad_norm": 0.4112584316278816, + "learning_rate": 0.00012058829011518896, + "loss": 0.7053, + "step": 846 + }, + { + "epoch": 0.4517333333333333, + "grad_norm": 0.40729998026649766, + "learning_rate": 0.00012041915664493761, + "loss": 0.7366, + "step": 847 + }, + { + "epoch": 0.45226666666666665, + "grad_norm": 0.47521057991822163, + "learning_rate": 0.00012024996219998517, + "loss": 0.7819, + "step": 848 + }, + { + "epoch": 0.4528, + "grad_norm": 0.3968728998280063, + "learning_rate": 0.00012008070728557186, + "loss": 0.7633, + "step": 849 + }, + { + "epoch": 0.4533333333333333, + "grad_norm": 0.39435414186145457, + "learning_rate": 0.00011991139240711857, + "loss": 0.6521, + "step": 850 + }, + { + "epoch": 0.45386666666666664, + "grad_norm": 0.38015894341698714, + "learning_rate": 0.00011974201807022525, + "loss": 0.6816, + "step": 851 + }, + { + "epoch": 0.4544, + "grad_norm": 0.40211203961776676, + "learning_rate": 0.00011957258478066931, + "loss": 0.732, + "step": 852 + }, + { + "epoch": 0.45493333333333336, + "grad_norm": 0.3687423287361794, + "learning_rate": 0.00011940309304440433, + "loss": 0.6605, + "step": 853 + }, + { + "epoch": 0.4554666666666667, + "grad_norm": 0.3608434524524818, + "learning_rate": 0.00011923354336755835, + "loss": 0.6756, + "step": 854 + }, + { + "epoch": 0.456, + "grad_norm": 0.3922623609380738, + "learning_rate": 0.00011906393625643244, + "loss": 0.6725, + "step": 855 + }, + { + "epoch": 0.45653333333333335, + "grad_norm": 0.39597347062270916, + "learning_rate": 0.00011889427221749916, + "loss": 0.7483, + "step": 856 + }, + { + "epoch": 0.4570666666666667, + "grad_norm": 0.34586685440181364, + "learning_rate": 0.00011872455175740112, + "loss": 0.6412, + "step": 857 + }, + { + "epoch": 0.4576, + "grad_norm": 0.44501496790223405, + "learning_rate": 0.00011855477538294935, + "loss": 0.673, + "step": 858 + }, + { + "epoch": 0.45813333333333334, + "grad_norm": 0.4600204345998243, + "learning_rate": 0.00011838494360112185, + "loss": 0.7201, + "step": 859 + }, + { + "epoch": 0.45866666666666667, + "grad_norm": 0.4014196509721014, + "learning_rate": 0.00011821505691906216, + "loss": 0.699, + "step": 860 + }, + { + "epoch": 0.4592, + "grad_norm": 0.3865882147784291, + "learning_rate": 0.00011804511584407763, + "loss": 0.6704, + "step": 861 + }, + { + "epoch": 0.4597333333333333, + "grad_norm": 0.42434731199748743, + "learning_rate": 0.00011787512088363817, + "loss": 0.7314, + "step": 862 + }, + { + "epoch": 0.46026666666666666, + "grad_norm": 0.3669948305054552, + "learning_rate": 0.00011770507254537453, + "loss": 0.6729, + "step": 863 + }, + { + "epoch": 0.4608, + "grad_norm": 0.4080468927162067, + "learning_rate": 0.00011753497133707679, + "loss": 0.7209, + "step": 864 + }, + { + "epoch": 0.4613333333333333, + "grad_norm": 0.40052575061760193, + "learning_rate": 0.00011736481776669306, + "loss": 0.7068, + "step": 865 + }, + { + "epoch": 0.46186666666666665, + "grad_norm": 0.38516728199904116, + "learning_rate": 0.00011719461234232764, + "loss": 0.6365, + "step": 866 + }, + { + "epoch": 0.4624, + "grad_norm": 0.4642683269504646, + "learning_rate": 0.00011702435557223987, + "loss": 0.7884, + "step": 867 + }, + { + "epoch": 0.4629333333333333, + "grad_norm": 0.42914379555245596, + "learning_rate": 0.00011685404796484225, + "loss": 0.7144, + "step": 868 + }, + { + "epoch": 0.4634666666666667, + "grad_norm": 0.39355197006503395, + "learning_rate": 0.00011668369002869912, + "loss": 0.6593, + "step": 869 + }, + { + "epoch": 0.464, + "grad_norm": 0.41403830664421076, + "learning_rate": 0.00011651328227252517, + "loss": 0.7261, + "step": 870 + }, + { + "epoch": 0.46453333333333335, + "grad_norm": 0.39302743911168797, + "learning_rate": 0.00011634282520518383, + "loss": 0.7442, + "step": 871 + }, + { + "epoch": 0.4650666666666667, + "grad_norm": 0.3799207034605691, + "learning_rate": 0.00011617231933568578, + "loss": 0.6613, + "step": 872 + }, + { + "epoch": 0.4656, + "grad_norm": 0.4190638514179366, + "learning_rate": 0.00011600176517318741, + "loss": 0.7603, + "step": 873 + }, + { + "epoch": 0.46613333333333334, + "grad_norm": 0.37643098596186125, + "learning_rate": 0.00011583116322698935, + "loss": 0.6847, + "step": 874 + }, + { + "epoch": 0.4666666666666667, + "grad_norm": 0.39834273988122226, + "learning_rate": 0.00011566051400653486, + "loss": 0.7247, + "step": 875 + }, + { + "epoch": 0.4672, + "grad_norm": 0.3853714718645344, + "learning_rate": 0.00011548981802140848, + "loss": 0.6914, + "step": 876 + }, + { + "epoch": 0.46773333333333333, + "grad_norm": 0.3853852562389037, + "learning_rate": 0.00011531907578133429, + "loss": 0.685, + "step": 877 + }, + { + "epoch": 0.46826666666666666, + "grad_norm": 0.40483027574830416, + "learning_rate": 0.00011514828779617459, + "loss": 0.6772, + "step": 878 + }, + { + "epoch": 0.4688, + "grad_norm": 0.43773822287020947, + "learning_rate": 0.00011497745457592816, + "loss": 0.7151, + "step": 879 + }, + { + "epoch": 0.4693333333333333, + "grad_norm": 0.401585884405608, + "learning_rate": 0.00011480657663072896, + "loss": 0.713, + "step": 880 + }, + { + "epoch": 0.46986666666666665, + "grad_norm": 0.373259268876888, + "learning_rate": 0.00011463565447084445, + "loss": 0.6618, + "step": 881 + }, + { + "epoch": 0.4704, + "grad_norm": 0.39481180820237505, + "learning_rate": 0.00011446468860667421, + "loss": 0.6679, + "step": 882 + }, + { + "epoch": 0.4709333333333333, + "grad_norm": 0.3648008060062637, + "learning_rate": 0.00011429367954874819, + "loss": 0.6838, + "step": 883 + }, + { + "epoch": 0.47146666666666665, + "grad_norm": 0.35357619534888196, + "learning_rate": 0.0001141226278077254, + "loss": 0.6673, + "step": 884 + }, + { + "epoch": 0.472, + "grad_norm": 0.43376610744122884, + "learning_rate": 0.00011395153389439233, + "loss": 0.7405, + "step": 885 + }, + { + "epoch": 0.47253333333333336, + "grad_norm": 0.34837970865201134, + "learning_rate": 0.00011378039831966134, + "loss": 0.6533, + "step": 886 + }, + { + "epoch": 0.4730666666666667, + "grad_norm": 0.4208128648663218, + "learning_rate": 0.00011360922159456928, + "loss": 0.7388, + "step": 887 + }, + { + "epoch": 0.4736, + "grad_norm": 0.388716968023213, + "learning_rate": 0.00011343800423027582, + "loss": 0.709, + "step": 888 + }, + { + "epoch": 0.47413333333333335, + "grad_norm": 0.5021201733267082, + "learning_rate": 0.00011326674673806195, + "loss": 0.8396, + "step": 889 + }, + { + "epoch": 0.4746666666666667, + "grad_norm": 0.36603509229198133, + "learning_rate": 0.00011309544962932862, + "loss": 0.6432, + "step": 890 + }, + { + "epoch": 0.4752, + "grad_norm": 0.3653170131859906, + "learning_rate": 0.0001129241134155949, + "loss": 0.656, + "step": 891 + }, + { + "epoch": 0.47573333333333334, + "grad_norm": 0.47370322209378996, + "learning_rate": 0.00011275273860849684, + "loss": 0.8309, + "step": 892 + }, + { + "epoch": 0.47626666666666667, + "grad_norm": 0.445726497230847, + "learning_rate": 0.00011258132571978555, + "loss": 0.7042, + "step": 893 + }, + { + "epoch": 0.4768, + "grad_norm": 0.38058288768635273, + "learning_rate": 0.00011240987526132594, + "loss": 0.6078, + "step": 894 + }, + { + "epoch": 0.47733333333333333, + "grad_norm": 0.4219977727956937, + "learning_rate": 0.00011223838774509514, + "loss": 0.7854, + "step": 895 + }, + { + "epoch": 0.47786666666666666, + "grad_norm": 0.4003805066609488, + "learning_rate": 0.00011206686368318086, + "loss": 0.6933, + "step": 896 + }, + { + "epoch": 0.4784, + "grad_norm": 0.3869224368581654, + "learning_rate": 0.00011189530358778005, + "loss": 0.7227, + "step": 897 + }, + { + "epoch": 0.4789333333333333, + "grad_norm": 0.3969519981251819, + "learning_rate": 0.00011172370797119712, + "loss": 0.6594, + "step": 898 + }, + { + "epoch": 0.47946666666666665, + "grad_norm": 0.43243940086344307, + "learning_rate": 0.00011155207734584263, + "loss": 0.707, + "step": 899 + }, + { + "epoch": 0.48, + "grad_norm": 0.40973836972076577, + "learning_rate": 0.00011138041222423177, + "loss": 0.7206, + "step": 900 + }, + { + "epoch": 0.4805333333333333, + "grad_norm": 0.4333513928815028, + "learning_rate": 0.00011120871311898254, + "loss": 0.7369, + "step": 901 + }, + { + "epoch": 0.48106666666666664, + "grad_norm": 0.3778470478153257, + "learning_rate": 0.0001110369805428146, + "loss": 0.6896, + "step": 902 + }, + { + "epoch": 0.4816, + "grad_norm": 0.3787175393176003, + "learning_rate": 0.00011086521500854745, + "loss": 0.706, + "step": 903 + }, + { + "epoch": 0.48213333333333336, + "grad_norm": 0.419055853729895, + "learning_rate": 0.0001106934170290991, + "loss": 0.7251, + "step": 904 + }, + { + "epoch": 0.4826666666666667, + "grad_norm": 0.3939707033337408, + "learning_rate": 0.00011052158711748434, + "loss": 0.7261, + "step": 905 + }, + { + "epoch": 0.4832, + "grad_norm": 0.34817981691445615, + "learning_rate": 0.00011034972578681338, + "loss": 0.6236, + "step": 906 + }, + { + "epoch": 0.48373333333333335, + "grad_norm": 0.39871326971289006, + "learning_rate": 0.00011017783355029026, + "loss": 0.7646, + "step": 907 + }, + { + "epoch": 0.4842666666666667, + "grad_norm": 0.3994819769347582, + "learning_rate": 0.00011000591092121127, + "loss": 0.6639, + "step": 908 + }, + { + "epoch": 0.4848, + "grad_norm": 0.40472772943920193, + "learning_rate": 0.00010983395841296348, + "loss": 0.6971, + "step": 909 + }, + { + "epoch": 0.48533333333333334, + "grad_norm": 0.4535036946152579, + "learning_rate": 0.0001096619765390232, + "loss": 0.7437, + "step": 910 + }, + { + "epoch": 0.48586666666666667, + "grad_norm": 0.45382853786824645, + "learning_rate": 0.00010948996581295436, + "loss": 0.731, + "step": 911 + }, + { + "epoch": 0.4864, + "grad_norm": 0.4099178543711851, + "learning_rate": 0.00010931792674840718, + "loss": 0.6624, + "step": 912 + }, + { + "epoch": 0.48693333333333333, + "grad_norm": 0.4351417287032228, + "learning_rate": 0.00010914585985911632, + "loss": 0.7108, + "step": 913 + }, + { + "epoch": 0.48746666666666666, + "grad_norm": 0.3865999238995926, + "learning_rate": 0.00010897376565889971, + "loss": 0.686, + "step": 914 + }, + { + "epoch": 0.488, + "grad_norm": 0.4131282123736442, + "learning_rate": 0.00010880164466165674, + "loss": 0.7737, + "step": 915 + }, + { + "epoch": 0.4885333333333333, + "grad_norm": 0.6147139842631306, + "learning_rate": 0.00010862949738136681, + "loss": 0.6286, + "step": 916 + }, + { + "epoch": 0.48906666666666665, + "grad_norm": 0.43222057493158733, + "learning_rate": 0.00010845732433208779, + "loss": 0.7238, + "step": 917 + }, + { + "epoch": 0.4896, + "grad_norm": 0.3656758056743063, + "learning_rate": 0.00010828512602795462, + "loss": 0.6355, + "step": 918 + }, + { + "epoch": 0.4901333333333333, + "grad_norm": 0.3979783969248524, + "learning_rate": 0.00010811290298317755, + "loss": 0.7204, + "step": 919 + }, + { + "epoch": 0.49066666666666664, + "grad_norm": 0.38982035334648496, + "learning_rate": 0.00010794065571204072, + "loss": 0.6974, + "step": 920 + }, + { + "epoch": 0.4912, + "grad_norm": 0.43538029830173397, + "learning_rate": 0.00010776838472890065, + "loss": 0.6914, + "step": 921 + }, + { + "epoch": 0.49173333333333336, + "grad_norm": 0.4253887801936794, + "learning_rate": 0.00010759609054818458, + "loss": 0.7152, + "step": 922 + }, + { + "epoch": 0.4922666666666667, + "grad_norm": 0.39675412191254944, + "learning_rate": 0.00010742377368438914, + "loss": 0.6935, + "step": 923 + }, + { + "epoch": 0.4928, + "grad_norm": 0.42791248982595853, + "learning_rate": 0.00010725143465207867, + "loss": 0.6907, + "step": 924 + }, + { + "epoch": 0.49333333333333335, + "grad_norm": 0.4154154123016647, + "learning_rate": 0.00010707907396588361, + "loss": 0.7278, + "step": 925 + }, + { + "epoch": 0.4938666666666667, + "grad_norm": 0.40598956985927526, + "learning_rate": 0.0001069066921404992, + "loss": 0.6992, + "step": 926 + }, + { + "epoch": 0.4944, + "grad_norm": 0.3994948473070552, + "learning_rate": 0.00010673428969068364, + "loss": 0.7263, + "step": 927 + }, + { + "epoch": 0.49493333333333334, + "grad_norm": 0.40504268232009794, + "learning_rate": 0.00010656186713125689, + "loss": 0.7408, + "step": 928 + }, + { + "epoch": 0.49546666666666667, + "grad_norm": 0.4029635829319738, + "learning_rate": 0.0001063894249770989, + "loss": 0.7085, + "step": 929 + }, + { + "epoch": 0.496, + "grad_norm": 0.4091891981199755, + "learning_rate": 0.00010621696374314807, + "loss": 0.6426, + "step": 930 + }, + { + "epoch": 0.4965333333333333, + "grad_norm": 0.364839486764378, + "learning_rate": 0.00010604448394439983, + "loss": 0.5997, + "step": 931 + }, + { + "epoch": 0.49706666666666666, + "grad_norm": 0.4297159365033185, + "learning_rate": 0.00010587198609590505, + "loss": 0.703, + "step": 932 + }, + { + "epoch": 0.4976, + "grad_norm": 0.3637943546123168, + "learning_rate": 0.00010569947071276847, + "loss": 0.6128, + "step": 933 + }, + { + "epoch": 0.4981333333333333, + "grad_norm": 0.3988480910800117, + "learning_rate": 0.00010552693831014726, + "loss": 0.6687, + "step": 934 + }, + { + "epoch": 0.49866666666666665, + "grad_norm": 0.4161199702634679, + "learning_rate": 0.0001053543894032493, + "loss": 0.7151, + "step": 935 + }, + { + "epoch": 0.4992, + "grad_norm": 0.36805818507906174, + "learning_rate": 0.00010518182450733186, + "loss": 0.68, + "step": 936 + }, + { + "epoch": 0.4997333333333333, + "grad_norm": 0.4009215069937478, + "learning_rate": 0.00010500924413769988, + "loss": 0.6919, + "step": 937 + }, + { + "epoch": 0.5002666666666666, + "grad_norm": 0.42282417996551025, + "learning_rate": 0.00010483664880970457, + "loss": 0.7095, + "step": 938 + }, + { + "epoch": 0.5008, + "grad_norm": 0.43058054375470933, + "learning_rate": 0.00010466403903874176, + "loss": 0.7035, + "step": 939 + }, + { + "epoch": 0.5013333333333333, + "grad_norm": 0.4217570417848946, + "learning_rate": 0.00010449141534025045, + "loss": 0.7146, + "step": 940 + }, + { + "epoch": 0.5018666666666667, + "grad_norm": 0.42027817301221454, + "learning_rate": 0.00010431877822971117, + "loss": 0.7376, + "step": 941 + }, + { + "epoch": 0.5024, + "grad_norm": 0.33834147707907003, + "learning_rate": 0.00010414612822264455, + "loss": 0.6487, + "step": 942 + }, + { + "epoch": 0.5029333333333333, + "grad_norm": 0.4134821879558162, + "learning_rate": 0.00010397346583460971, + "loss": 0.7886, + "step": 943 + }, + { + "epoch": 0.5034666666666666, + "grad_norm": 0.3949731948776204, + "learning_rate": 0.0001038007915812028, + "loss": 0.7012, + "step": 944 + }, + { + "epoch": 0.504, + "grad_norm": 0.34576332442971586, + "learning_rate": 0.00010362810597805526, + "loss": 0.6382, + "step": 945 + }, + { + "epoch": 0.5045333333333333, + "grad_norm": 0.35388171583496386, + "learning_rate": 0.0001034554095408326, + "loss": 0.6219, + "step": 946 + }, + { + "epoch": 0.5050666666666667, + "grad_norm": 0.38278455453057586, + "learning_rate": 0.00010328270278523256, + "loss": 0.7315, + "step": 947 + }, + { + "epoch": 0.5056, + "grad_norm": 0.4253906902405411, + "learning_rate": 0.0001031099862269837, + "loss": 0.7071, + "step": 948 + }, + { + "epoch": 0.5061333333333333, + "grad_norm": 0.39983972561935216, + "learning_rate": 0.00010293726038184393, + "loss": 0.6921, + "step": 949 + }, + { + "epoch": 0.5066666666666667, + "grad_norm": 0.4908593442118835, + "learning_rate": 0.00010276452576559879, + "loss": 0.7144, + "step": 950 + }, + { + "epoch": 0.5072, + "grad_norm": 0.41244314476852084, + "learning_rate": 0.00010259178289406011, + "loss": 0.7213, + "step": 951 + }, + { + "epoch": 0.5077333333333334, + "grad_norm": 0.3768081492396289, + "learning_rate": 0.00010241903228306431, + "loss": 0.6784, + "step": 952 + }, + { + "epoch": 0.5082666666666666, + "grad_norm": 0.4091084323886437, + "learning_rate": 0.0001022462744484709, + "loss": 0.7362, + "step": 953 + }, + { + "epoch": 0.5088, + "grad_norm": 0.39542854824294177, + "learning_rate": 0.00010207350990616107, + "loss": 0.6316, + "step": 954 + }, + { + "epoch": 0.5093333333333333, + "grad_norm": 0.42658853432054694, + "learning_rate": 0.00010190073917203589, + "loss": 0.7116, + "step": 955 + }, + { + "epoch": 0.5098666666666667, + "grad_norm": 0.4415072977745614, + "learning_rate": 0.00010172796276201503, + "loss": 0.7296, + "step": 956 + }, + { + "epoch": 0.5104, + "grad_norm": 0.3940372246149541, + "learning_rate": 0.0001015551811920351, + "loss": 0.6972, + "step": 957 + }, + { + "epoch": 0.5109333333333334, + "grad_norm": 0.40714611050590943, + "learning_rate": 0.00010138239497804804, + "loss": 0.7074, + "step": 958 + }, + { + "epoch": 0.5114666666666666, + "grad_norm": 0.3947412761805426, + "learning_rate": 0.00010120960463601976, + "loss": 0.6611, + "step": 959 + }, + { + "epoch": 0.512, + "grad_norm": 0.40630643296196406, + "learning_rate": 0.00010103681068192845, + "loss": 0.6494, + "step": 960 + }, + { + "epoch": 0.5125333333333333, + "grad_norm": 0.37606127365186426, + "learning_rate": 0.00010086401363176305, + "loss": 0.6793, + "step": 961 + }, + { + "epoch": 0.5130666666666667, + "grad_norm": 0.4225471917863874, + "learning_rate": 0.00010069121400152181, + "loss": 0.6946, + "step": 962 + }, + { + "epoch": 0.5136, + "grad_norm": 0.3900813851774753, + "learning_rate": 0.00010051841230721065, + "loss": 0.6834, + "step": 963 + }, + { + "epoch": 0.5141333333333333, + "grad_norm": 0.3723727654743894, + "learning_rate": 0.0001003456090648416, + "loss": 0.6645, + "step": 964 + }, + { + "epoch": 0.5146666666666667, + "grad_norm": 0.3761631915368695, + "learning_rate": 0.00010017280479043147, + "loss": 0.693, + "step": 965 + }, + { + "epoch": 0.5152, + "grad_norm": 0.3586208818733987, + "learning_rate": 0.0001, + "loss": 0.7007, + "step": 966 + }, + { + "epoch": 0.5157333333333334, + "grad_norm": 0.407637951605177, + "learning_rate": 9.982719520956855e-05, + "loss": 0.702, + "step": 967 + }, + { + "epoch": 0.5162666666666667, + "grad_norm": 0.4059428046057281, + "learning_rate": 9.965439093515841e-05, + "loss": 0.732, + "step": 968 + }, + { + "epoch": 0.5168, + "grad_norm": 0.3850818151480131, + "learning_rate": 9.948158769278939e-05, + "loss": 0.6823, + "step": 969 + }, + { + "epoch": 0.5173333333333333, + "grad_norm": 0.4354398472055768, + "learning_rate": 9.930878599847821e-05, + "loss": 0.7214, + "step": 970 + }, + { + "epoch": 0.5178666666666667, + "grad_norm": 0.395903093538153, + "learning_rate": 9.913598636823693e-05, + "loss": 0.6995, + "step": 971 + }, + { + "epoch": 0.5184, + "grad_norm": 0.41586008261746277, + "learning_rate": 9.896318931807155e-05, + "loss": 0.7089, + "step": 972 + }, + { + "epoch": 0.5189333333333334, + "grad_norm": 0.385195194061558, + "learning_rate": 9.879039536398024e-05, + "loss": 0.6803, + "step": 973 + }, + { + "epoch": 0.5194666666666666, + "grad_norm": 0.4331270399274859, + "learning_rate": 9.861760502195197e-05, + "loss": 0.6991, + "step": 974 + }, + { + "epoch": 0.52, + "grad_norm": 0.42700022596637477, + "learning_rate": 9.844481880796491e-05, + "loss": 0.7252, + "step": 975 + }, + { + "epoch": 0.5205333333333333, + "grad_norm": 0.36384908021036255, + "learning_rate": 9.827203723798498e-05, + "loss": 0.6223, + "step": 976 + }, + { + "epoch": 0.5210666666666667, + "grad_norm": 0.39288151581608477, + "learning_rate": 9.809926082796415e-05, + "loss": 0.6317, + "step": 977 + }, + { + "epoch": 0.5216, + "grad_norm": 0.39847243013391914, + "learning_rate": 9.792649009383899e-05, + "loss": 0.7035, + "step": 978 + }, + { + "epoch": 0.5221333333333333, + "grad_norm": 0.4039212053709652, + "learning_rate": 9.775372555152912e-05, + "loss": 0.6943, + "step": 979 + }, + { + "epoch": 0.5226666666666666, + "grad_norm": 0.42290336828658043, + "learning_rate": 9.758096771693573e-05, + "loss": 0.7361, + "step": 980 + }, + { + "epoch": 0.5232, + "grad_norm": 0.40170444050876447, + "learning_rate": 9.740821710593989e-05, + "loss": 0.6502, + "step": 981 + }, + { + "epoch": 0.5237333333333334, + "grad_norm": 0.37503291600085475, + "learning_rate": 9.723547423440122e-05, + "loss": 0.6663, + "step": 982 + }, + { + "epoch": 0.5242666666666667, + "grad_norm": 0.4491079520111571, + "learning_rate": 9.70627396181561e-05, + "loss": 0.7287, + "step": 983 + }, + { + "epoch": 0.5248, + "grad_norm": 0.37615029576785664, + "learning_rate": 9.689001377301633e-05, + "loss": 0.7293, + "step": 984 + }, + { + "epoch": 0.5253333333333333, + "grad_norm": 0.4062311879287013, + "learning_rate": 9.671729721476746e-05, + "loss": 0.7597, + "step": 985 + }, + { + "epoch": 0.5258666666666667, + "grad_norm": 0.4106555572244141, + "learning_rate": 9.654459045916743e-05, + "loss": 0.6898, + "step": 986 + }, + { + "epoch": 0.5264, + "grad_norm": 0.3810138927623543, + "learning_rate": 9.637189402194476e-05, + "loss": 0.6712, + "step": 987 + }, + { + "epoch": 0.5269333333333334, + "grad_norm": 0.43511644718936276, + "learning_rate": 9.619920841879725e-05, + "loss": 0.6573, + "step": 988 + }, + { + "epoch": 0.5274666666666666, + "grad_norm": 0.3596408153487559, + "learning_rate": 9.602653416539031e-05, + "loss": 0.6547, + "step": 989 + }, + { + "epoch": 0.528, + "grad_norm": 0.4227460775317761, + "learning_rate": 9.585387177735547e-05, + "loss": 0.7407, + "step": 990 + }, + { + "epoch": 0.5285333333333333, + "grad_norm": 0.41660743948710655, + "learning_rate": 9.568122177028884e-05, + "loss": 0.7037, + "step": 991 + }, + { + "epoch": 0.5290666666666667, + "grad_norm": 0.42203230232212147, + "learning_rate": 9.550858465974958e-05, + "loss": 0.7697, + "step": 992 + }, + { + "epoch": 0.5296, + "grad_norm": 0.40976606484533545, + "learning_rate": 9.533596096125825e-05, + "loss": 0.705, + "step": 993 + }, + { + "epoch": 0.5301333333333333, + "grad_norm": 0.35721645630057663, + "learning_rate": 9.516335119029546e-05, + "loss": 0.6238, + "step": 994 + }, + { + "epoch": 0.5306666666666666, + "grad_norm": 0.414618087560364, + "learning_rate": 9.499075586230013e-05, + "loss": 0.7066, + "step": 995 + }, + { + "epoch": 0.5312, + "grad_norm": 0.3824425093661902, + "learning_rate": 9.481817549266817e-05, + "loss": 0.6245, + "step": 996 + }, + { + "epoch": 0.5317333333333333, + "grad_norm": 0.3901463888480024, + "learning_rate": 9.464561059675073e-05, + "loss": 0.7366, + "step": 997 + }, + { + "epoch": 0.5322666666666667, + "grad_norm": 0.3858907265428058, + "learning_rate": 9.44730616898528e-05, + "loss": 0.7096, + "step": 998 + }, + { + "epoch": 0.5328, + "grad_norm": 0.37916375453449047, + "learning_rate": 9.430052928723153e-05, + "loss": 0.6696, + "step": 999 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 0.3903566812994692, + "learning_rate": 9.412801390409497e-05, + "loss": 0.6568, + "step": 1000 + }, + { + "epoch": 0.5338666666666667, + "grad_norm": 0.3958293696206784, + "learning_rate": 9.395551605560018e-05, + "loss": 0.7225, + "step": 1001 + }, + { + "epoch": 0.5344, + "grad_norm": 0.3932641529729933, + "learning_rate": 9.378303625685195e-05, + "loss": 0.6274, + "step": 1002 + }, + { + "epoch": 0.5349333333333334, + "grad_norm": 0.38899566090338883, + "learning_rate": 9.361057502290113e-05, + "loss": 0.6884, + "step": 1003 + }, + { + "epoch": 0.5354666666666666, + "grad_norm": 0.4112888261967654, + "learning_rate": 9.343813286874312e-05, + "loss": 0.7223, + "step": 1004 + }, + { + "epoch": 0.536, + "grad_norm": 0.43383671370061827, + "learning_rate": 9.326571030931637e-05, + "loss": 0.7063, + "step": 1005 + }, + { + "epoch": 0.5365333333333333, + "grad_norm": 0.40397753060597436, + "learning_rate": 9.309330785950086e-05, + "loss": 0.7188, + "step": 1006 + }, + { + "epoch": 0.5370666666666667, + "grad_norm": 0.42631618450905356, + "learning_rate": 9.292092603411641e-05, + "loss": 0.7059, + "step": 1007 + }, + { + "epoch": 0.5376, + "grad_norm": 0.43939400703942777, + "learning_rate": 9.274856534792138e-05, + "loss": 0.7835, + "step": 1008 + }, + { + "epoch": 0.5381333333333334, + "grad_norm": 0.4764160775924318, + "learning_rate": 9.257622631561085e-05, + "loss": 0.7543, + "step": 1009 + }, + { + "epoch": 0.5386666666666666, + "grad_norm": 0.41294112008589423, + "learning_rate": 9.240390945181543e-05, + "loss": 0.7348, + "step": 1010 + }, + { + "epoch": 0.5392, + "grad_norm": 0.38480351527250445, + "learning_rate": 9.223161527109937e-05, + "loss": 0.6566, + "step": 1011 + }, + { + "epoch": 0.5397333333333333, + "grad_norm": 0.4208846692026214, + "learning_rate": 9.205934428795929e-05, + "loss": 0.7269, + "step": 1012 + }, + { + "epoch": 0.5402666666666667, + "grad_norm": 0.39984952875800545, + "learning_rate": 9.188709701682247e-05, + "loss": 0.6576, + "step": 1013 + }, + { + "epoch": 0.5408, + "grad_norm": 0.450115400163912, + "learning_rate": 9.171487397204539e-05, + "loss": 0.7546, + "step": 1014 + }, + { + "epoch": 0.5413333333333333, + "grad_norm": 0.4384828064806114, + "learning_rate": 9.154267566791223e-05, + "loss": 0.6777, + "step": 1015 + }, + { + "epoch": 0.5418666666666667, + "grad_norm": 0.3537485819664656, + "learning_rate": 9.137050261863324e-05, + "loss": 0.6394, + "step": 1016 + }, + { + "epoch": 0.5424, + "grad_norm": 0.3925664477015748, + "learning_rate": 9.119835533834331e-05, + "loss": 0.6461, + "step": 1017 + }, + { + "epoch": 0.5429333333333334, + "grad_norm": 0.4042824405410308, + "learning_rate": 9.102623434110028e-05, + "loss": 0.7461, + "step": 1018 + }, + { + "epoch": 0.5434666666666667, + "grad_norm": 0.4150795731005183, + "learning_rate": 9.085414014088369e-05, + "loss": 0.7075, + "step": 1019 + }, + { + "epoch": 0.544, + "grad_norm": 0.4263003552600655, + "learning_rate": 9.068207325159284e-05, + "loss": 0.7252, + "step": 1020 + }, + { + "epoch": 0.5445333333333333, + "grad_norm": 0.37707965868426524, + "learning_rate": 9.051003418704565e-05, + "loss": 0.6708, + "step": 1021 + }, + { + "epoch": 0.5450666666666667, + "grad_norm": 0.3698700436596066, + "learning_rate": 9.033802346097682e-05, + "loss": 0.6426, + "step": 1022 + }, + { + "epoch": 0.5456, + "grad_norm": 0.4129200764803728, + "learning_rate": 9.016604158703654e-05, + "loss": 0.709, + "step": 1023 + }, + { + "epoch": 0.5461333333333334, + "grad_norm": 0.4083963175462401, + "learning_rate": 8.999408907878877e-05, + "loss": 0.6855, + "step": 1024 + }, + { + "epoch": 0.5466666666666666, + "grad_norm": 0.4663104499736953, + "learning_rate": 8.982216644970979e-05, + "loss": 0.811, + "step": 1025 + }, + { + "epoch": 0.5472, + "grad_norm": 0.4297412267341237, + "learning_rate": 8.965027421318665e-05, + "loss": 0.7313, + "step": 1026 + }, + { + "epoch": 0.5477333333333333, + "grad_norm": 0.3358616111146125, + "learning_rate": 8.947841288251568e-05, + "loss": 0.6229, + "step": 1027 + }, + { + "epoch": 0.5482666666666667, + "grad_norm": 0.3515527834892537, + "learning_rate": 8.930658297090091e-05, + "loss": 0.6222, + "step": 1028 + }, + { + "epoch": 0.5488, + "grad_norm": 0.41901629544799746, + "learning_rate": 8.913478499145254e-05, + "loss": 0.7907, + "step": 1029 + }, + { + "epoch": 0.5493333333333333, + "grad_norm": 0.3990309806229665, + "learning_rate": 8.896301945718541e-05, + "loss": 0.6926, + "step": 1030 + }, + { + "epoch": 0.5498666666666666, + "grad_norm": 0.40509861786141166, + "learning_rate": 8.879128688101749e-05, + "loss": 0.6226, + "step": 1031 + }, + { + "epoch": 0.5504, + "grad_norm": 0.38529132852394044, + "learning_rate": 8.861958777576827e-05, + "loss": 0.6713, + "step": 1032 + }, + { + "epoch": 0.5509333333333334, + "grad_norm": 0.41600132868777884, + "learning_rate": 8.844792265415738e-05, + "loss": 0.7254, + "step": 1033 + }, + { + "epoch": 0.5514666666666667, + "grad_norm": 0.3787373054795975, + "learning_rate": 8.827629202880293e-05, + "loss": 0.6199, + "step": 1034 + }, + { + "epoch": 0.552, + "grad_norm": 0.42480829465729075, + "learning_rate": 8.810469641222001e-05, + "loss": 0.6591, + "step": 1035 + }, + { + "epoch": 0.5525333333333333, + "grad_norm": 0.4017399826137374, + "learning_rate": 8.793313631681915e-05, + "loss": 0.687, + "step": 1036 + }, + { + "epoch": 0.5530666666666667, + "grad_norm": 0.3735955162878494, + "learning_rate": 8.776161225490489e-05, + "loss": 0.6351, + "step": 1037 + }, + { + "epoch": 0.5536, + "grad_norm": 0.37236289910720743, + "learning_rate": 8.759012473867407e-05, + "loss": 0.6406, + "step": 1038 + }, + { + "epoch": 0.5541333333333334, + "grad_norm": 0.3700742805962829, + "learning_rate": 8.741867428021446e-05, + "loss": 0.6314, + "step": 1039 + }, + { + "epoch": 0.5546666666666666, + "grad_norm": 0.38741742446437183, + "learning_rate": 8.724726139150318e-05, + "loss": 0.6481, + "step": 1040 + }, + { + "epoch": 0.5552, + "grad_norm": 0.42061139016224, + "learning_rate": 8.707588658440511e-05, + "loss": 0.6808, + "step": 1041 + }, + { + "epoch": 0.5557333333333333, + "grad_norm": 0.40189289414047447, + "learning_rate": 8.690455037067141e-05, + "loss": 0.731, + "step": 1042 + }, + { + "epoch": 0.5562666666666667, + "grad_norm": 0.43194099617650467, + "learning_rate": 8.673325326193806e-05, + "loss": 0.7576, + "step": 1043 + }, + { + "epoch": 0.5568, + "grad_norm": 0.40434161731585816, + "learning_rate": 8.656199576972423e-05, + "loss": 0.6961, + "step": 1044 + }, + { + "epoch": 0.5573333333333333, + "grad_norm": 0.33576949025640107, + "learning_rate": 8.639077840543077e-05, + "loss": 0.6091, + "step": 1045 + }, + { + "epoch": 0.5578666666666666, + "grad_norm": 0.4209921060098242, + "learning_rate": 8.621960168033867e-05, + "loss": 0.686, + "step": 1046 + }, + { + "epoch": 0.5584, + "grad_norm": 0.40776023676536, + "learning_rate": 8.604846610560771e-05, + "loss": 0.6471, + "step": 1047 + }, + { + "epoch": 0.5589333333333333, + "grad_norm": 0.44231962281175785, + "learning_rate": 8.587737219227462e-05, + "loss": 0.7029, + "step": 1048 + }, + { + "epoch": 0.5594666666666667, + "grad_norm": 0.4627369363006571, + "learning_rate": 8.570632045125185e-05, + "loss": 0.7135, + "step": 1049 + }, + { + "epoch": 0.56, + "grad_norm": 0.4126007280129486, + "learning_rate": 8.553531139332582e-05, + "loss": 0.6872, + "step": 1050 + }, + { + "epoch": 0.5605333333333333, + "grad_norm": 0.40118740732652464, + "learning_rate": 8.536434552915556e-05, + "loss": 0.7161, + "step": 1051 + }, + { + "epoch": 0.5610666666666667, + "grad_norm": 0.40074343327936646, + "learning_rate": 8.519342336927105e-05, + "loss": 0.6029, + "step": 1052 + }, + { + "epoch": 0.5616, + "grad_norm": 0.4900151191949164, + "learning_rate": 8.502254542407186e-05, + "loss": 0.7302, + "step": 1053 + }, + { + "epoch": 0.5621333333333334, + "grad_norm": 0.3978294212795194, + "learning_rate": 8.485171220382545e-05, + "loss": 0.673, + "step": 1054 + }, + { + "epoch": 0.5626666666666666, + "grad_norm": 0.42979827939326554, + "learning_rate": 8.468092421866573e-05, + "loss": 0.674, + "step": 1055 + }, + { + "epoch": 0.5632, + "grad_norm": 0.37596990999365637, + "learning_rate": 8.451018197859153e-05, + "loss": 0.6423, + "step": 1056 + }, + { + "epoch": 0.5637333333333333, + "grad_norm": 0.330748612387618, + "learning_rate": 8.433948599346516e-05, + "loss": 0.6101, + "step": 1057 + }, + { + "epoch": 0.5642666666666667, + "grad_norm": 0.38004071496037267, + "learning_rate": 8.416883677301069e-05, + "loss": 0.6522, + "step": 1058 + }, + { + "epoch": 0.5648, + "grad_norm": 0.3958252004411559, + "learning_rate": 8.399823482681262e-05, + "loss": 0.6519, + "step": 1059 + }, + { + "epoch": 0.5653333333333334, + "grad_norm": 0.4610886404839161, + "learning_rate": 8.382768066431425e-05, + "loss": 0.7679, + "step": 1060 + }, + { + "epoch": 0.5658666666666666, + "grad_norm": 0.43999354538623453, + "learning_rate": 8.36571747948162e-05, + "loss": 0.7649, + "step": 1061 + }, + { + "epoch": 0.5664, + "grad_norm": 0.431954569120567, + "learning_rate": 8.348671772747487e-05, + "loss": 0.7746, + "step": 1062 + }, + { + "epoch": 0.5669333333333333, + "grad_norm": 0.41751123735713236, + "learning_rate": 8.33163099713009e-05, + "loss": 0.6857, + "step": 1063 + }, + { + "epoch": 0.5674666666666667, + "grad_norm": 0.3639205433366738, + "learning_rate": 8.31459520351578e-05, + "loss": 0.5999, + "step": 1064 + }, + { + "epoch": 0.568, + "grad_norm": 0.41782076174871, + "learning_rate": 8.297564442776014e-05, + "loss": 0.7019, + "step": 1065 + }, + { + "epoch": 0.5685333333333333, + "grad_norm": 0.40137239964322896, + "learning_rate": 8.280538765767235e-05, + "loss": 0.6499, + "step": 1066 + }, + { + "epoch": 0.5690666666666667, + "grad_norm": 0.3916657116648281, + "learning_rate": 8.263518223330697e-05, + "loss": 0.6731, + "step": 1067 + }, + { + "epoch": 0.5696, + "grad_norm": 0.43648214272950975, + "learning_rate": 8.246502866292324e-05, + "loss": 0.6936, + "step": 1068 + }, + { + "epoch": 0.5701333333333334, + "grad_norm": 0.3614702550180529, + "learning_rate": 8.22949274546255e-05, + "loss": 0.6604, + "step": 1069 + }, + { + "epoch": 0.5706666666666667, + "grad_norm": 0.39171255560881335, + "learning_rate": 8.212487911636184e-05, + "loss": 0.6435, + "step": 1070 + }, + { + "epoch": 0.5712, + "grad_norm": 0.41475724678973946, + "learning_rate": 8.195488415592238e-05, + "loss": 0.6484, + "step": 1071 + }, + { + "epoch": 0.5717333333333333, + "grad_norm": 0.37044620468466394, + "learning_rate": 8.178494308093789e-05, + "loss": 0.6953, + "step": 1072 + }, + { + "epoch": 0.5722666666666667, + "grad_norm": 0.4104652708106284, + "learning_rate": 8.161505639887817e-05, + "loss": 0.6964, + "step": 1073 + }, + { + "epoch": 0.5728, + "grad_norm": 0.40310359248969063, + "learning_rate": 8.144522461705067e-05, + "loss": 0.6186, + "step": 1074 + }, + { + "epoch": 0.5733333333333334, + "grad_norm": 0.4167507617064418, + "learning_rate": 8.127544824259889e-05, + "loss": 0.6916, + "step": 1075 + }, + { + "epoch": 0.5738666666666666, + "grad_norm": 0.4036111480437301, + "learning_rate": 8.110572778250085e-05, + "loss": 0.6987, + "step": 1076 + }, + { + "epoch": 0.5744, + "grad_norm": 0.42691669543925936, + "learning_rate": 8.093606374356759e-05, + "loss": 0.7399, + "step": 1077 + }, + { + "epoch": 0.5749333333333333, + "grad_norm": 0.4896969373431105, + "learning_rate": 8.076645663244168e-05, + "loss": 0.7086, + "step": 1078 + }, + { + "epoch": 0.5754666666666667, + "grad_norm": 0.4256350569942081, + "learning_rate": 8.059690695559568e-05, + "loss": 0.6358, + "step": 1079 + }, + { + "epoch": 0.576, + "grad_norm": 0.44296581475694147, + "learning_rate": 8.042741521933071e-05, + "loss": 0.7005, + "step": 1080 + }, + { + "epoch": 0.5765333333333333, + "grad_norm": 0.41529459953318565, + "learning_rate": 8.025798192977481e-05, + "loss": 0.6933, + "step": 1081 + }, + { + "epoch": 0.5770666666666666, + "grad_norm": 0.3958102091557811, + "learning_rate": 8.008860759288147e-05, + "loss": 0.6498, + "step": 1082 + }, + { + "epoch": 0.5776, + "grad_norm": 0.39296075905514277, + "learning_rate": 7.991929271442817e-05, + "loss": 0.6627, + "step": 1083 + }, + { + "epoch": 0.5781333333333334, + "grad_norm": 0.36343079853159505, + "learning_rate": 7.975003780001485e-05, + "loss": 0.634, + "step": 1084 + }, + { + "epoch": 0.5786666666666667, + "grad_norm": 0.4121630028521987, + "learning_rate": 7.958084335506239e-05, + "loss": 0.6705, + "step": 1085 + }, + { + "epoch": 0.5792, + "grad_norm": 0.4707733225954869, + "learning_rate": 7.941170988481108e-05, + "loss": 0.7134, + "step": 1086 + }, + { + "epoch": 0.5797333333333333, + "grad_norm": 0.5458950698848051, + "learning_rate": 7.924263789431912e-05, + "loss": 0.7358, + "step": 1087 + }, + { + "epoch": 0.5802666666666667, + "grad_norm": 0.4056371907864066, + "learning_rate": 7.907362788846116e-05, + "loss": 0.6493, + "step": 1088 + }, + { + "epoch": 0.5808, + "grad_norm": 0.3391220300166163, + "learning_rate": 7.89046803719267e-05, + "loss": 0.6239, + "step": 1089 + }, + { + "epoch": 0.5813333333333334, + "grad_norm": 0.37303526946535215, + "learning_rate": 7.873579584921869e-05, + "loss": 0.6727, + "step": 1090 + }, + { + "epoch": 0.5818666666666666, + "grad_norm": 0.3975563747867086, + "learning_rate": 7.856697482465196e-05, + "loss": 0.716, + "step": 1091 + }, + { + "epoch": 0.5824, + "grad_norm": 0.40754193597631116, + "learning_rate": 7.839821780235168e-05, + "loss": 0.6645, + "step": 1092 + }, + { + "epoch": 0.5829333333333333, + "grad_norm": 0.338121630211166, + "learning_rate": 7.822952528625191e-05, + "loss": 0.6345, + "step": 1093 + }, + { + "epoch": 0.5834666666666667, + "grad_norm": 0.4025820479162939, + "learning_rate": 7.806089778009421e-05, + "loss": 0.6989, + "step": 1094 + }, + { + "epoch": 0.584, + "grad_norm": 0.3580610072994493, + "learning_rate": 7.789233578742582e-05, + "loss": 0.6521, + "step": 1095 + }, + { + "epoch": 0.5845333333333333, + "grad_norm": 0.4119649831750368, + "learning_rate": 7.772383981159849e-05, + "loss": 0.6839, + "step": 1096 + }, + { + "epoch": 0.5850666666666666, + "grad_norm": 0.4024807176089574, + "learning_rate": 7.755541035576677e-05, + "loss": 0.6858, + "step": 1097 + }, + { + "epoch": 0.5856, + "grad_norm": 0.4029083821244428, + "learning_rate": 7.738704792288655e-05, + "loss": 0.6943, + "step": 1098 + }, + { + "epoch": 0.5861333333333333, + "grad_norm": 0.37969668653973554, + "learning_rate": 7.721875301571359e-05, + "loss": 0.6509, + "step": 1099 + }, + { + "epoch": 0.5866666666666667, + "grad_norm": 0.42182782146596565, + "learning_rate": 7.705052613680211e-05, + "loss": 0.6985, + "step": 1100 + }, + { + "epoch": 0.5872, + "grad_norm": 0.5037731368237888, + "learning_rate": 7.688236778850306e-05, + "loss": 0.7754, + "step": 1101 + }, + { + "epoch": 0.5877333333333333, + "grad_norm": 0.37337809091949264, + "learning_rate": 7.671427847296275e-05, + "loss": 0.6729, + "step": 1102 + }, + { + "epoch": 0.5882666666666667, + "grad_norm": 0.37837334845546255, + "learning_rate": 7.654625869212146e-05, + "loss": 0.6453, + "step": 1103 + }, + { + "epoch": 0.5888, + "grad_norm": 0.39421939332444644, + "learning_rate": 7.637830894771175e-05, + "loss": 0.6808, + "step": 1104 + }, + { + "epoch": 0.5893333333333334, + "grad_norm": 0.3784931025230661, + "learning_rate": 7.6210429741257e-05, + "loss": 0.6368, + "step": 1105 + }, + { + "epoch": 0.5898666666666667, + "grad_norm": 0.3586567232964291, + "learning_rate": 7.604262157407007e-05, + "loss": 0.6439, + "step": 1106 + }, + { + "epoch": 0.5904, + "grad_norm": 0.4154311155365181, + "learning_rate": 7.587488494725157e-05, + "loss": 0.6845, + "step": 1107 + }, + { + "epoch": 0.5909333333333333, + "grad_norm": 0.47099031407025466, + "learning_rate": 7.570722036168854e-05, + "loss": 0.6661, + "step": 1108 + }, + { + "epoch": 0.5914666666666667, + "grad_norm": 0.374356321105018, + "learning_rate": 7.55396283180529e-05, + "loss": 0.6592, + "step": 1109 + }, + { + "epoch": 0.592, + "grad_norm": 0.36033775907891485, + "learning_rate": 7.537210931679987e-05, + "loss": 0.7013, + "step": 1110 + }, + { + "epoch": 0.5925333333333334, + "grad_norm": 0.4115680582233992, + "learning_rate": 7.520466385816671e-05, + "loss": 0.7397, + "step": 1111 + }, + { + "epoch": 0.5930666666666666, + "grad_norm": 0.5948169725433807, + "learning_rate": 7.503729244217086e-05, + "loss": 0.8126, + "step": 1112 + }, + { + "epoch": 0.5936, + "grad_norm": 0.39424201853387325, + "learning_rate": 7.48699955686089e-05, + "loss": 0.6381, + "step": 1113 + }, + { + "epoch": 0.5941333333333333, + "grad_norm": 0.38596472037517565, + "learning_rate": 7.470277373705461e-05, + "loss": 0.6297, + "step": 1114 + }, + { + "epoch": 0.5946666666666667, + "grad_norm": 0.46693697863083733, + "learning_rate": 7.453562744685778e-05, + "loss": 0.7187, + "step": 1115 + }, + { + "epoch": 0.5952, + "grad_norm": 0.38993864802943057, + "learning_rate": 7.43685571971426e-05, + "loss": 0.6778, + "step": 1116 + }, + { + "epoch": 0.5957333333333333, + "grad_norm": 0.5095305964683324, + "learning_rate": 7.42015634868062e-05, + "loss": 0.7064, + "step": 1117 + }, + { + "epoch": 0.5962666666666666, + "grad_norm": 0.4702317248701071, + "learning_rate": 7.403464681451715e-05, + "loss": 0.687, + "step": 1118 + }, + { + "epoch": 0.5968, + "grad_norm": 0.39116626359279505, + "learning_rate": 7.386780767871397e-05, + "loss": 0.6652, + "step": 1119 + }, + { + "epoch": 0.5973333333333334, + "grad_norm": 0.41385588550831426, + "learning_rate": 7.370104657760361e-05, + "loss": 0.7124, + "step": 1120 + }, + { + "epoch": 0.5978666666666667, + "grad_norm": 0.44730105821365734, + "learning_rate": 7.353436400916004e-05, + "loss": 0.6595, + "step": 1121 + }, + { + "epoch": 0.5984, + "grad_norm": 0.4260691895277321, + "learning_rate": 7.336776047112276e-05, + "loss": 0.6834, + "step": 1122 + }, + { + "epoch": 0.5989333333333333, + "grad_norm": 0.40502517650835107, + "learning_rate": 7.320123646099519e-05, + "loss": 0.6344, + "step": 1123 + }, + { + "epoch": 0.5994666666666667, + "grad_norm": 0.3648254148708204, + "learning_rate": 7.303479247604332e-05, + "loss": 0.6619, + "step": 1124 + }, + { + "epoch": 0.6, + "grad_norm": 0.33968598750408824, + "learning_rate": 7.286842901329412e-05, + "loss": 0.6153, + "step": 1125 + }, + { + "epoch": 0.6005333333333334, + "grad_norm": 0.4376866940534957, + "learning_rate": 7.270214656953415e-05, + "loss": 0.6884, + "step": 1126 + }, + { + "epoch": 0.6010666666666666, + "grad_norm": 0.386730695532095, + "learning_rate": 7.253594564130804e-05, + "loss": 0.6565, + "step": 1127 + }, + { + "epoch": 0.6016, + "grad_norm": 0.45854972772706987, + "learning_rate": 7.236982672491698e-05, + "loss": 0.7372, + "step": 1128 + }, + { + "epoch": 0.6021333333333333, + "grad_norm": 0.44580155415378203, + "learning_rate": 7.22037903164173e-05, + "loss": 0.6983, + "step": 1129 + }, + { + "epoch": 0.6026666666666667, + "grad_norm": 0.3471943361728626, + "learning_rate": 7.203783691161883e-05, + "loss": 0.6468, + "step": 1130 + }, + { + "epoch": 0.6032, + "grad_norm": 0.371897283706324, + "learning_rate": 7.187196700608373e-05, + "loss": 0.6417, + "step": 1131 + }, + { + "epoch": 0.6037333333333333, + "grad_norm": 0.37634830715800677, + "learning_rate": 7.170618109512465e-05, + "loss": 0.6456, + "step": 1132 + }, + { + "epoch": 0.6042666666666666, + "grad_norm": 0.3643292769279553, + "learning_rate": 7.154047967380354e-05, + "loss": 0.577, + "step": 1133 + }, + { + "epoch": 0.6048, + "grad_norm": 0.37373161239745434, + "learning_rate": 7.137486323692995e-05, + "loss": 0.6718, + "step": 1134 + }, + { + "epoch": 0.6053333333333333, + "grad_norm": 0.44290810246790635, + "learning_rate": 7.12093322790597e-05, + "loss": 0.8329, + "step": 1135 + }, + { + "epoch": 0.6058666666666667, + "grad_norm": 0.3768964282090524, + "learning_rate": 7.104388729449338e-05, + "loss": 0.6526, + "step": 1136 + }, + { + "epoch": 0.6064, + "grad_norm": 0.436605121674996, + "learning_rate": 7.087852877727481e-05, + "loss": 0.7088, + "step": 1137 + }, + { + "epoch": 0.6069333333333333, + "grad_norm": 0.4067130810225034, + "learning_rate": 7.071325722118963e-05, + "loss": 0.6762, + "step": 1138 + }, + { + "epoch": 0.6074666666666667, + "grad_norm": 0.3883653454852225, + "learning_rate": 7.054807311976379e-05, + "loss": 0.724, + "step": 1139 + }, + { + "epoch": 0.608, + "grad_norm": 0.46747281420734993, + "learning_rate": 7.038297696626206e-05, + "loss": 0.6873, + "step": 1140 + }, + { + "epoch": 0.6085333333333334, + "grad_norm": 0.41205043009038744, + "learning_rate": 7.021796925368667e-05, + "loss": 0.6204, + "step": 1141 + }, + { + "epoch": 0.6090666666666666, + "grad_norm": 0.3242277894532913, + "learning_rate": 7.005305047477566e-05, + "loss": 0.5768, + "step": 1142 + }, + { + "epoch": 0.6096, + "grad_norm": 0.47407899824723065, + "learning_rate": 6.988822112200156e-05, + "loss": 0.7422, + "step": 1143 + }, + { + "epoch": 0.6101333333333333, + "grad_norm": 0.39820363944011855, + "learning_rate": 6.972348168756983e-05, + "loss": 0.6312, + "step": 1144 + }, + { + "epoch": 0.6106666666666667, + "grad_norm": 0.40064410648902177, + "learning_rate": 6.955883266341741e-05, + "loss": 0.6727, + "step": 1145 + }, + { + "epoch": 0.6112, + "grad_norm": 0.456453597260157, + "learning_rate": 6.939427454121128e-05, + "loss": 0.7063, + "step": 1146 + }, + { + "epoch": 0.6117333333333334, + "grad_norm": 0.41770655843835774, + "learning_rate": 6.922980781234699e-05, + "loss": 0.7075, + "step": 1147 + }, + { + "epoch": 0.6122666666666666, + "grad_norm": 0.3239776965801339, + "learning_rate": 6.906543296794714e-05, + "loss": 0.591, + "step": 1148 + }, + { + "epoch": 0.6128, + "grad_norm": 0.39484658036286896, + "learning_rate": 6.890115049885994e-05, + "loss": 0.6829, + "step": 1149 + }, + { + "epoch": 0.6133333333333333, + "grad_norm": 0.41105779736446446, + "learning_rate": 6.873696089565786e-05, + "loss": 0.6334, + "step": 1150 + }, + { + "epoch": 0.6138666666666667, + "grad_norm": 0.38249004175965406, + "learning_rate": 6.85728646486359e-05, + "loss": 0.6742, + "step": 1151 + }, + { + "epoch": 0.6144, + "grad_norm": 0.3799212702114044, + "learning_rate": 6.84088622478104e-05, + "loss": 0.6081, + "step": 1152 + }, + { + "epoch": 0.6149333333333333, + "grad_norm": 0.3835122723400976, + "learning_rate": 6.82449541829174e-05, + "loss": 0.6256, + "step": 1153 + }, + { + "epoch": 0.6154666666666667, + "grad_norm": 0.4135617148717006, + "learning_rate": 6.80811409434113e-05, + "loss": 0.6985, + "step": 1154 + }, + { + "epoch": 0.616, + "grad_norm": 0.40148384921597696, + "learning_rate": 6.791742301846326e-05, + "loss": 0.6893, + "step": 1155 + }, + { + "epoch": 0.6165333333333334, + "grad_norm": 0.39123862561955897, + "learning_rate": 6.775380089695986e-05, + "loss": 0.6104, + "step": 1156 + }, + { + "epoch": 0.6170666666666667, + "grad_norm": 0.41887583289624125, + "learning_rate": 6.759027506750158e-05, + "loss": 0.6715, + "step": 1157 + }, + { + "epoch": 0.6176, + "grad_norm": 0.40638348485919545, + "learning_rate": 6.742684601840141e-05, + "loss": 0.7107, + "step": 1158 + }, + { + "epoch": 0.6181333333333333, + "grad_norm": 0.4083949119435472, + "learning_rate": 6.726351423768322e-05, + "loss": 0.7447, + "step": 1159 + }, + { + "epoch": 0.6186666666666667, + "grad_norm": 0.37205781132951976, + "learning_rate": 6.710028021308061e-05, + "loss": 0.6352, + "step": 1160 + }, + { + "epoch": 0.6192, + "grad_norm": 0.38742841472003764, + "learning_rate": 6.693714443203507e-05, + "loss": 0.674, + "step": 1161 + }, + { + "epoch": 0.6197333333333334, + "grad_norm": 0.4033401083647566, + "learning_rate": 6.677410738169485e-05, + "loss": 0.6006, + "step": 1162 + }, + { + "epoch": 0.6202666666666666, + "grad_norm": 0.37598660488854835, + "learning_rate": 6.661116954891328e-05, + "loss": 0.6619, + "step": 1163 + }, + { + "epoch": 0.6208, + "grad_norm": 0.38121772986234537, + "learning_rate": 6.644833142024751e-05, + "loss": 0.6168, + "step": 1164 + }, + { + "epoch": 0.6213333333333333, + "grad_norm": 0.38586387962012575, + "learning_rate": 6.62855934819569e-05, + "loss": 0.6581, + "step": 1165 + }, + { + "epoch": 0.6218666666666667, + "grad_norm": 0.38770106756014167, + "learning_rate": 6.612295622000162e-05, + "loss": 0.652, + "step": 1166 + }, + { + "epoch": 0.6224, + "grad_norm": 0.4030686783354466, + "learning_rate": 6.59604201200412e-05, + "loss": 0.7031, + "step": 1167 + }, + { + "epoch": 0.6229333333333333, + "grad_norm": 0.39957943934094087, + "learning_rate": 6.579798566743314e-05, + "loss": 0.6753, + "step": 1168 + }, + { + "epoch": 0.6234666666666666, + "grad_norm": 0.4026404387336808, + "learning_rate": 6.563565334723134e-05, + "loss": 0.6309, + "step": 1169 + }, + { + "epoch": 0.624, + "grad_norm": 0.41969507174402737, + "learning_rate": 6.547342364418481e-05, + "loss": 0.7228, + "step": 1170 + }, + { + "epoch": 0.6245333333333334, + "grad_norm": 0.39514615047582613, + "learning_rate": 6.531129704273604e-05, + "loss": 0.6668, + "step": 1171 + }, + { + "epoch": 0.6250666666666667, + "grad_norm": 0.4140566547772474, + "learning_rate": 6.514927402701964e-05, + "loss": 0.7516, + "step": 1172 + }, + { + "epoch": 0.6256, + "grad_norm": 0.42045406988313533, + "learning_rate": 6.498735508086093e-05, + "loss": 0.7, + "step": 1173 + }, + { + "epoch": 0.6261333333333333, + "grad_norm": 0.38178455221971197, + "learning_rate": 6.48255406877745e-05, + "loss": 0.6577, + "step": 1174 + }, + { + "epoch": 0.6266666666666667, + "grad_norm": 0.4019039081897145, + "learning_rate": 6.466383133096267e-05, + "loss": 0.6716, + "step": 1175 + }, + { + "epoch": 0.6272, + "grad_norm": 0.3588840369768513, + "learning_rate": 6.450222749331414e-05, + "loss": 0.6251, + "step": 1176 + }, + { + "epoch": 0.6277333333333334, + "grad_norm": 0.4302669315413597, + "learning_rate": 6.434072965740242e-05, + "loss": 0.6699, + "step": 1177 + }, + { + "epoch": 0.6282666666666666, + "grad_norm": 0.3810964054868667, + "learning_rate": 6.417933830548467e-05, + "loss": 0.6837, + "step": 1178 + }, + { + "epoch": 0.6288, + "grad_norm": 0.34145638426913544, + "learning_rate": 6.40180539194999e-05, + "loss": 0.6012, + "step": 1179 + }, + { + "epoch": 0.6293333333333333, + "grad_norm": 0.4009564732786252, + "learning_rate": 6.385687698106781e-05, + "loss": 0.6984, + "step": 1180 + }, + { + "epoch": 0.6298666666666667, + "grad_norm": 0.44252840493030354, + "learning_rate": 6.369580797148718e-05, + "loss": 0.6296, + "step": 1181 + }, + { + "epoch": 0.6304, + "grad_norm": 0.48871443302150325, + "learning_rate": 6.35348473717345e-05, + "loss": 0.7097, + "step": 1182 + }, + { + "epoch": 0.6309333333333333, + "grad_norm": 0.389317956940689, + "learning_rate": 6.337399566246257e-05, + "loss": 0.6922, + "step": 1183 + }, + { + "epoch": 0.6314666666666666, + "grad_norm": 0.3855175133639868, + "learning_rate": 6.321325332399903e-05, + "loss": 0.6778, + "step": 1184 + }, + { + "epoch": 0.632, + "grad_norm": 0.44817808009371796, + "learning_rate": 6.305262083634488e-05, + "loss": 0.7177, + "step": 1185 + }, + { + "epoch": 0.6325333333333333, + "grad_norm": 0.45834516083898486, + "learning_rate": 6.289209867917312e-05, + "loss": 0.7301, + "step": 1186 + }, + { + "epoch": 0.6330666666666667, + "grad_norm": 0.41103896454877714, + "learning_rate": 6.273168733182722e-05, + "loss": 0.6816, + "step": 1187 + }, + { + "epoch": 0.6336, + "grad_norm": 0.4164157056477641, + "learning_rate": 6.25713872733199e-05, + "loss": 0.7342, + "step": 1188 + }, + { + "epoch": 0.6341333333333333, + "grad_norm": 0.38471632755102275, + "learning_rate": 6.241119898233144e-05, + "loss": 0.6227, + "step": 1189 + }, + { + "epoch": 0.6346666666666667, + "grad_norm": 0.4266485507533036, + "learning_rate": 6.225112293720836e-05, + "loss": 0.7651, + "step": 1190 + }, + { + "epoch": 0.6352, + "grad_norm": 0.4053459194807253, + "learning_rate": 6.209115961596208e-05, + "loss": 0.6023, + "step": 1191 + }, + { + "epoch": 0.6357333333333334, + "grad_norm": 0.36870562674552515, + "learning_rate": 6.19313094962673e-05, + "loss": 0.6791, + "step": 1192 + }, + { + "epoch": 0.6362666666666666, + "grad_norm": 0.4351844206500832, + "learning_rate": 6.177157305546078e-05, + "loss": 0.6601, + "step": 1193 + }, + { + "epoch": 0.6368, + "grad_norm": 0.40422879255325983, + "learning_rate": 6.161195077053976e-05, + "loss": 0.6963, + "step": 1194 + }, + { + "epoch": 0.6373333333333333, + "grad_norm": 0.4106921388255115, + "learning_rate": 6.145244311816063e-05, + "loss": 0.6799, + "step": 1195 + }, + { + "epoch": 0.6378666666666667, + "grad_norm": 0.38328431141759234, + "learning_rate": 6.129305057463741e-05, + "loss": 0.7202, + "step": 1196 + }, + { + "epoch": 0.6384, + "grad_norm": 0.3796663367342876, + "learning_rate": 6.113377361594049e-05, + "loss": 0.648, + "step": 1197 + }, + { + "epoch": 0.6389333333333334, + "grad_norm": 0.40038403068977274, + "learning_rate": 6.0974612717695004e-05, + "loss": 0.6987, + "step": 1198 + }, + { + "epoch": 0.6394666666666666, + "grad_norm": 0.42132201533530983, + "learning_rate": 6.0815568355179556e-05, + "loss": 0.6697, + "step": 1199 + }, + { + "epoch": 0.64, + "grad_norm": 0.4360505688893721, + "learning_rate": 6.065664100332478e-05, + "loss": 0.5691, + "step": 1200 + }, + { + "epoch": 0.6405333333333333, + "grad_norm": 0.38331061780435666, + "learning_rate": 6.0497831136711836e-05, + "loss": 0.6395, + "step": 1201 + }, + { + "epoch": 0.6410666666666667, + "grad_norm": 0.38488621036550796, + "learning_rate": 6.0339139229571116e-05, + "loss": 0.6883, + "step": 1202 + }, + { + "epoch": 0.6416, + "grad_norm": 0.39389684709596867, + "learning_rate": 6.018056575578075e-05, + "loss": 0.6414, + "step": 1203 + }, + { + "epoch": 0.6421333333333333, + "grad_norm": 0.42512107820513184, + "learning_rate": 6.002211118886514e-05, + "loss": 0.7008, + "step": 1204 + }, + { + "epoch": 0.6426666666666667, + "grad_norm": 0.3706379623136382, + "learning_rate": 5.986377600199371e-05, + "loss": 0.6591, + "step": 1205 + }, + { + "epoch": 0.6432, + "grad_norm": 0.3570458684110341, + "learning_rate": 5.970556066797941e-05, + "loss": 0.6095, + "step": 1206 + }, + { + "epoch": 0.6437333333333334, + "grad_norm": 0.8271381482491155, + "learning_rate": 5.9547465659277215e-05, + "loss": 0.6464, + "step": 1207 + }, + { + "epoch": 0.6442666666666667, + "grad_norm": 0.43554154795344535, + "learning_rate": 5.938949144798279e-05, + "loss": 0.6337, + "step": 1208 + }, + { + "epoch": 0.6448, + "grad_norm": 0.3926311897135617, + "learning_rate": 5.923163850583113e-05, + "loss": 0.6868, + "step": 1209 + }, + { + "epoch": 0.6453333333333333, + "grad_norm": 0.38478540025206204, + "learning_rate": 5.907390730419507e-05, + "loss": 0.6421, + "step": 1210 + }, + { + "epoch": 0.6458666666666667, + "grad_norm": 0.4253999798599033, + "learning_rate": 5.8916298314083915e-05, + "loss": 0.7009, + "step": 1211 + }, + { + "epoch": 0.6464, + "grad_norm": 0.4520283482029617, + "learning_rate": 5.875881200614207e-05, + "loss": 0.6557, + "step": 1212 + }, + { + "epoch": 0.6469333333333334, + "grad_norm": 0.4481957998629243, + "learning_rate": 5.860144885064751e-05, + "loss": 0.6792, + "step": 1213 + }, + { + "epoch": 0.6474666666666666, + "grad_norm": 0.36804510383786965, + "learning_rate": 5.8444209317510514e-05, + "loss": 0.5613, + "step": 1214 + }, + { + "epoch": 0.648, + "grad_norm": 0.43784577616548054, + "learning_rate": 5.828709387627218e-05, + "loss": 0.7515, + "step": 1215 + }, + { + "epoch": 0.6485333333333333, + "grad_norm": 0.3537276196215712, + "learning_rate": 5.813010299610313e-05, + "loss": 0.6131, + "step": 1216 + }, + { + "epoch": 0.6490666666666667, + "grad_norm": 0.350357203269661, + "learning_rate": 5.797323714580192e-05, + "loss": 0.6, + "step": 1217 + }, + { + "epoch": 0.6496, + "grad_norm": 0.4368660304973895, + "learning_rate": 5.781649679379378e-05, + "loss": 0.6682, + "step": 1218 + }, + { + "epoch": 0.6501333333333333, + "grad_norm": 0.4495026254782156, + "learning_rate": 5.765988240812921e-05, + "loss": 0.7361, + "step": 1219 + }, + { + "epoch": 0.6506666666666666, + "grad_norm": 0.38677664651139976, + "learning_rate": 5.750339445648252e-05, + "loss": 0.6633, + "step": 1220 + }, + { + "epoch": 0.6512, + "grad_norm": 0.3828365457034956, + "learning_rate": 5.73470334061505e-05, + "loss": 0.7032, + "step": 1221 + }, + { + "epoch": 0.6517333333333334, + "grad_norm": 0.38427762390030007, + "learning_rate": 5.7190799724050924e-05, + "loss": 0.6446, + "step": 1222 + }, + { + "epoch": 0.6522666666666667, + "grad_norm": 0.45968469339016016, + "learning_rate": 5.7034693876721376e-05, + "loss": 0.674, + "step": 1223 + }, + { + "epoch": 0.6528, + "grad_norm": 0.39687146622532793, + "learning_rate": 5.687871633031754e-05, + "loss": 0.6928, + "step": 1224 + }, + { + "epoch": 0.6533333333333333, + "grad_norm": 0.38722431254428086, + "learning_rate": 5.6722867550612116e-05, + "loss": 0.6573, + "step": 1225 + }, + { + "epoch": 0.6538666666666667, + "grad_norm": 0.3371248323421096, + "learning_rate": 5.6567148002993164e-05, + "loss": 0.5685, + "step": 1226 + }, + { + "epoch": 0.6544, + "grad_norm": 0.39236805124952945, + "learning_rate": 5.6411558152462894e-05, + "loss": 0.6556, + "step": 1227 + }, + { + "epoch": 0.6549333333333334, + "grad_norm": 0.3564760869939289, + "learning_rate": 5.625609846363622e-05, + "loss": 0.6324, + "step": 1228 + }, + { + "epoch": 0.6554666666666666, + "grad_norm": 0.4518864695946772, + "learning_rate": 5.6100769400739383e-05, + "loss": 0.7223, + "step": 1229 + }, + { + "epoch": 0.656, + "grad_norm": 0.40173947579151, + "learning_rate": 5.5945571427608526e-05, + "loss": 0.6502, + "step": 1230 + }, + { + "epoch": 0.6565333333333333, + "grad_norm": 0.41437685149036163, + "learning_rate": 5.579050500768836e-05, + "loss": 0.7034, + "step": 1231 + }, + { + "epoch": 0.6570666666666667, + "grad_norm": 0.3742027548508857, + "learning_rate": 5.5635570604030705e-05, + "loss": 0.6564, + "step": 1232 + }, + { + "epoch": 0.6576, + "grad_norm": 0.4294903032274521, + "learning_rate": 5.54807686792933e-05, + "loss": 0.6838, + "step": 1233 + }, + { + "epoch": 0.6581333333333333, + "grad_norm": 0.3575554784385886, + "learning_rate": 5.53260996957381e-05, + "loss": 0.6564, + "step": 1234 + }, + { + "epoch": 0.6586666666666666, + "grad_norm": 0.4113853938730298, + "learning_rate": 5.5171564115230254e-05, + "loss": 0.6645, + "step": 1235 + }, + { + "epoch": 0.6592, + "grad_norm": 0.4212484075042805, + "learning_rate": 5.501716239923642e-05, + "loss": 0.7032, + "step": 1236 + }, + { + "epoch": 0.6597333333333333, + "grad_norm": 0.3604563221522146, + "learning_rate": 5.486289500882355e-05, + "loss": 0.6126, + "step": 1237 + }, + { + "epoch": 0.6602666666666667, + "grad_norm": 0.3911411320130094, + "learning_rate": 5.47087624046575e-05, + "loss": 0.6888, + "step": 1238 + }, + { + "epoch": 0.6608, + "grad_norm": 0.46634873634767904, + "learning_rate": 5.4554765047001613e-05, + "loss": 0.7848, + "step": 1239 + }, + { + "epoch": 0.6613333333333333, + "grad_norm": 0.3746195868623726, + "learning_rate": 5.4400903395715366e-05, + "loss": 0.6348, + "step": 1240 + }, + { + "epoch": 0.6618666666666667, + "grad_norm": 0.45268404733657536, + "learning_rate": 5.424717791025302e-05, + "loss": 0.6367, + "step": 1241 + }, + { + "epoch": 0.6624, + "grad_norm": 0.40228550414974823, + "learning_rate": 5.4093589049662175e-05, + "loss": 0.6452, + "step": 1242 + }, + { + "epoch": 0.6629333333333334, + "grad_norm": 0.38298979887710355, + "learning_rate": 5.394013727258254e-05, + "loss": 0.654, + "step": 1243 + }, + { + "epoch": 0.6634666666666666, + "grad_norm": 0.40411342029411973, + "learning_rate": 5.378682303724435e-05, + "loss": 0.6474, + "step": 1244 + }, + { + "epoch": 0.664, + "grad_norm": 0.33999023185508753, + "learning_rate": 5.363364680146725e-05, + "loss": 0.6017, + "step": 1245 + }, + { + "epoch": 0.6645333333333333, + "grad_norm": 0.4456025315628354, + "learning_rate": 5.348060902265871e-05, + "loss": 0.7676, + "step": 1246 + }, + { + "epoch": 0.6650666666666667, + "grad_norm": 0.3865625631324515, + "learning_rate": 5.332771015781275e-05, + "loss": 0.6322, + "step": 1247 + }, + { + "epoch": 0.6656, + "grad_norm": 0.36785554514355084, + "learning_rate": 5.31749506635086e-05, + "loss": 0.6363, + "step": 1248 + }, + { + "epoch": 0.6661333333333334, + "grad_norm": 0.5623616443529484, + "learning_rate": 5.302233099590928e-05, + "loss": 0.831, + "step": 1249 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.36364798127137493, + "learning_rate": 5.286985161076029e-05, + "loss": 0.6062, + "step": 1250 + }, + { + "epoch": 0.6672, + "grad_norm": 0.3644481549404707, + "learning_rate": 5.271751296338823e-05, + "loss": 0.5689, + "step": 1251 + }, + { + "epoch": 0.6677333333333333, + "grad_norm": 0.42209112129325826, + "learning_rate": 5.2565315508699376e-05, + "loss": 0.6826, + "step": 1252 + }, + { + "epoch": 0.6682666666666667, + "grad_norm": 0.3934119923502396, + "learning_rate": 5.2413259701178505e-05, + "loss": 0.6653, + "step": 1253 + }, + { + "epoch": 0.6688, + "grad_norm": 0.39861550420020664, + "learning_rate": 5.226134599488728e-05, + "loss": 0.6519, + "step": 1254 + }, + { + "epoch": 0.6693333333333333, + "grad_norm": 0.5533346648181713, + "learning_rate": 5.210957484346314e-05, + "loss": 0.7071, + "step": 1255 + }, + { + "epoch": 0.6698666666666667, + "grad_norm": 0.72441566042067, + "learning_rate": 5.195794670011776e-05, + "loss": 0.6945, + "step": 1256 + }, + { + "epoch": 0.6704, + "grad_norm": 0.4496207841402144, + "learning_rate": 5.180646201763577e-05, + "loss": 0.6927, + "step": 1257 + }, + { + "epoch": 0.6709333333333334, + "grad_norm": 0.35741518746187967, + "learning_rate": 5.165512124837344e-05, + "loss": 0.6388, + "step": 1258 + }, + { + "epoch": 0.6714666666666667, + "grad_norm": 0.37513644438477145, + "learning_rate": 5.150392484425728e-05, + "loss": 0.6884, + "step": 1259 + }, + { + "epoch": 0.672, + "grad_norm": 0.6741583897905343, + "learning_rate": 5.135287325678271e-05, + "loss": 0.707, + "step": 1260 + }, + { + "epoch": 0.6725333333333333, + "grad_norm": 0.39833802909880506, + "learning_rate": 5.120196693701267e-05, + "loss": 0.6929, + "step": 1261 + }, + { + "epoch": 0.6730666666666667, + "grad_norm": 0.35463548537829404, + "learning_rate": 5.105120633557634e-05, + "loss": 0.5978, + "step": 1262 + }, + { + "epoch": 0.6736, + "grad_norm": 0.5320189322195104, + "learning_rate": 5.090059190266779e-05, + "loss": 0.8426, + "step": 1263 + }, + { + "epoch": 0.6741333333333334, + "grad_norm": 0.46625066246950425, + "learning_rate": 5.075012408804458e-05, + "loss": 0.7109, + "step": 1264 + }, + { + "epoch": 0.6746666666666666, + "grad_norm": 0.4070584110611591, + "learning_rate": 5.059980334102637e-05, + "loss": 0.6713, + "step": 1265 + }, + { + "epoch": 0.6752, + "grad_norm": 0.42093544577227315, + "learning_rate": 5.0449630110493836e-05, + "loss": 0.6742, + "step": 1266 + }, + { + "epoch": 0.6757333333333333, + "grad_norm": 0.4103878154272117, + "learning_rate": 5.0299604844886985e-05, + "loss": 0.7077, + "step": 1267 + }, + { + "epoch": 0.6762666666666667, + "grad_norm": 0.3828060499876506, + "learning_rate": 5.014972799220403e-05, + "loss": 0.6372, + "step": 1268 + }, + { + "epoch": 0.6768, + "grad_norm": 0.38990894323417574, + "learning_rate": 5.000000000000002e-05, + "loss": 0.6586, + "step": 1269 + }, + { + "epoch": 0.6773333333333333, + "grad_norm": 0.37954425450696466, + "learning_rate": 4.985042131538545e-05, + "loss": 0.6697, + "step": 1270 + }, + { + "epoch": 0.6778666666666666, + "grad_norm": 0.3532334908994542, + "learning_rate": 4.9700992385024934e-05, + "loss": 0.592, + "step": 1271 + }, + { + "epoch": 0.6784, + "grad_norm": 0.4195420438639813, + "learning_rate": 4.955171365513603e-05, + "loss": 0.6759, + "step": 1272 + }, + { + "epoch": 0.6789333333333334, + "grad_norm": 0.37941635847563, + "learning_rate": 4.940258557148765e-05, + "loss": 0.5741, + "step": 1273 + }, + { + "epoch": 0.6794666666666667, + "grad_norm": 0.4648313404904717, + "learning_rate": 4.9253608579398855e-05, + "loss": 0.6948, + "step": 1274 + }, + { + "epoch": 0.68, + "grad_norm": 0.3711456063807938, + "learning_rate": 4.9104783123737566e-05, + "loss": 0.5983, + "step": 1275 + }, + { + "epoch": 0.6805333333333333, + "grad_norm": 0.3654848182579989, + "learning_rate": 4.895610964891923e-05, + "loss": 0.6036, + "step": 1276 + }, + { + "epoch": 0.6810666666666667, + "grad_norm": 0.37029369470976586, + "learning_rate": 4.880758859890536e-05, + "loss": 0.5892, + "step": 1277 + }, + { + "epoch": 0.6816, + "grad_norm": 0.3922062442904201, + "learning_rate": 4.865922041720239e-05, + "loss": 0.692, + "step": 1278 + }, + { + "epoch": 0.6821333333333334, + "grad_norm": 0.4967936001419372, + "learning_rate": 4.851100554686021e-05, + "loss": 0.744, + "step": 1279 + }, + { + "epoch": 0.6826666666666666, + "grad_norm": 0.3931460680340042, + "learning_rate": 4.836294443047088e-05, + "loss": 0.6686, + "step": 1280 + }, + { + "epoch": 0.6832, + "grad_norm": 0.4280181545183494, + "learning_rate": 4.821503751016746e-05, + "loss": 0.6806, + "step": 1281 + }, + { + "epoch": 0.6837333333333333, + "grad_norm": 0.42223312575635014, + "learning_rate": 4.8067285227622404e-05, + "loss": 0.6923, + "step": 1282 + }, + { + "epoch": 0.6842666666666667, + "grad_norm": 0.3809484000051097, + "learning_rate": 4.791968802404648e-05, + "loss": 0.6413, + "step": 1283 + }, + { + "epoch": 0.6848, + "grad_norm": 0.4210284404515193, + "learning_rate": 4.777224634018732e-05, + "loss": 0.6599, + "step": 1284 + }, + { + "epoch": 0.6853333333333333, + "grad_norm": 0.3607822198484989, + "learning_rate": 4.762496061632814e-05, + "loss": 0.627, + "step": 1285 + }, + { + "epoch": 0.6858666666666666, + "grad_norm": 0.366669715798918, + "learning_rate": 4.747783129228656e-05, + "loss": 0.6031, + "step": 1286 + }, + { + "epoch": 0.6864, + "grad_norm": 0.40441128854705194, + "learning_rate": 4.733085880741301e-05, + "loss": 0.63, + "step": 1287 + }, + { + "epoch": 0.6869333333333333, + "grad_norm": 0.3612029088391989, + "learning_rate": 4.718404360058966e-05, + "loss": 0.6508, + "step": 1288 + }, + { + "epoch": 0.6874666666666667, + "grad_norm": 0.3901357281304347, + "learning_rate": 4.7037386110228985e-05, + "loss": 0.6449, + "step": 1289 + }, + { + "epoch": 0.688, + "grad_norm": 0.36044613359339933, + "learning_rate": 4.689088677427249e-05, + "loss": 0.6401, + "step": 1290 + }, + { + "epoch": 0.6885333333333333, + "grad_norm": 0.36899814505997225, + "learning_rate": 4.6744546030189486e-05, + "loss": 0.6334, + "step": 1291 + }, + { + "epoch": 0.6890666666666667, + "grad_norm": 0.391995130463381, + "learning_rate": 4.659836431497563e-05, + "loss": 0.6472, + "step": 1292 + }, + { + "epoch": 0.6896, + "grad_norm": 0.4340878228899344, + "learning_rate": 4.645234206515171e-05, + "loss": 0.7035, + "step": 1293 + }, + { + "epoch": 0.6901333333333334, + "grad_norm": 0.4423501611222201, + "learning_rate": 4.630647971676232e-05, + "loss": 0.6572, + "step": 1294 + }, + { + "epoch": 0.6906666666666667, + "grad_norm": 0.38532884886931085, + "learning_rate": 4.6160777705374524e-05, + "loss": 0.6644, + "step": 1295 + }, + { + "epoch": 0.6912, + "grad_norm": 0.44444047693909494, + "learning_rate": 4.6015236466076747e-05, + "loss": 0.7538, + "step": 1296 + }, + { + "epoch": 0.6917333333333333, + "grad_norm": 0.4262671758385985, + "learning_rate": 4.586985643347717e-05, + "loss": 0.7397, + "step": 1297 + }, + { + "epoch": 0.6922666666666667, + "grad_norm": 0.4626250448381168, + "learning_rate": 4.572463804170263e-05, + "loss": 0.6834, + "step": 1298 + }, + { + "epoch": 0.6928, + "grad_norm": 0.375971749447931, + "learning_rate": 4.5579581724397255e-05, + "loss": 0.6049, + "step": 1299 + }, + { + "epoch": 0.6933333333333334, + "grad_norm": 0.38727589689119296, + "learning_rate": 4.543468791472131e-05, + "loss": 0.6471, + "step": 1300 + }, + { + "epoch": 0.6938666666666666, + "grad_norm": 0.4009341292741956, + "learning_rate": 4.5289957045349653e-05, + "loss": 0.7063, + "step": 1301 + }, + { + "epoch": 0.6944, + "grad_norm": 0.3975197832083184, + "learning_rate": 4.514538954847064e-05, + "loss": 0.6391, + "step": 1302 + }, + { + "epoch": 0.6949333333333333, + "grad_norm": 0.37280124273004256, + "learning_rate": 4.5000985855784746e-05, + "loss": 0.6313, + "step": 1303 + }, + { + "epoch": 0.6954666666666667, + "grad_norm": 0.4373590508588539, + "learning_rate": 4.485674639850333e-05, + "loss": 0.7093, + "step": 1304 + }, + { + "epoch": 0.696, + "grad_norm": 0.38578450678888154, + "learning_rate": 4.471267160734731e-05, + "loss": 0.6489, + "step": 1305 + }, + { + "epoch": 0.6965333333333333, + "grad_norm": 0.4853093037820455, + "learning_rate": 4.456876191254582e-05, + "loss": 0.7684, + "step": 1306 + }, + { + "epoch": 0.6970666666666666, + "grad_norm": 0.4251998933688053, + "learning_rate": 4.442501774383515e-05, + "loss": 0.6282, + "step": 1307 + }, + { + "epoch": 0.6976, + "grad_norm": 0.42300865697478757, + "learning_rate": 4.428143953045717e-05, + "loss": 0.663, + "step": 1308 + }, + { + "epoch": 0.6981333333333334, + "grad_norm": 0.43778930703677116, + "learning_rate": 4.413802770115816e-05, + "loss": 0.701, + "step": 1309 + }, + { + "epoch": 0.6986666666666667, + "grad_norm": 0.39122495916107725, + "learning_rate": 4.399478268418771e-05, + "loss": 0.6524, + "step": 1310 + }, + { + "epoch": 0.6992, + "grad_norm": 0.45307591571359396, + "learning_rate": 4.385170490729712e-05, + "loss": 0.6816, + "step": 1311 + }, + { + "epoch": 0.6997333333333333, + "grad_norm": 0.38140562797791966, + "learning_rate": 4.3708794797738375e-05, + "loss": 0.6353, + "step": 1312 + }, + { + "epoch": 0.7002666666666667, + "grad_norm": 0.4068793642660578, + "learning_rate": 4.3566052782262735e-05, + "loss": 0.6598, + "step": 1313 + }, + { + "epoch": 0.7008, + "grad_norm": 0.39167806307515934, + "learning_rate": 4.342347928711953e-05, + "loss": 0.668, + "step": 1314 + }, + { + "epoch": 0.7013333333333334, + "grad_norm": 0.43109647072709595, + "learning_rate": 4.328107473805487e-05, + "loss": 0.7076, + "step": 1315 + }, + { + "epoch": 0.7018666666666666, + "grad_norm": 0.36208296146563684, + "learning_rate": 4.3138839560310303e-05, + "loss": 0.6429, + "step": 1316 + }, + { + "epoch": 0.7024, + "grad_norm": 0.47041028455487194, + "learning_rate": 4.2996774178621736e-05, + "loss": 0.6851, + "step": 1317 + }, + { + "epoch": 0.7029333333333333, + "grad_norm": 0.3664645579737971, + "learning_rate": 4.2854879017217894e-05, + "loss": 0.5857, + "step": 1318 + }, + { + "epoch": 0.7034666666666667, + "grad_norm": 0.4384290774847837, + "learning_rate": 4.271315449981934e-05, + "loss": 0.7168, + "step": 1319 + }, + { + "epoch": 0.704, + "grad_norm": 0.4242032275841449, + "learning_rate": 4.257160104963696e-05, + "loss": 0.673, + "step": 1320 + }, + { + "epoch": 0.7045333333333333, + "grad_norm": 0.3634144295847843, + "learning_rate": 4.2430219089370823e-05, + "loss": 0.6246, + "step": 1321 + }, + { + "epoch": 0.7050666666666666, + "grad_norm": 0.40844376488531203, + "learning_rate": 4.228900904120895e-05, + "loss": 0.6668, + "step": 1322 + }, + { + "epoch": 0.7056, + "grad_norm": 0.5714199677512499, + "learning_rate": 4.2147971326825966e-05, + "loss": 0.6787, + "step": 1323 + }, + { + "epoch": 0.7061333333333333, + "grad_norm": 0.39084600958483035, + "learning_rate": 4.200710636738189e-05, + "loss": 0.662, + "step": 1324 + }, + { + "epoch": 0.7066666666666667, + "grad_norm": 0.404196719321763, + "learning_rate": 4.1866414583520877e-05, + "loss": 0.6633, + "step": 1325 + }, + { + "epoch": 0.7072, + "grad_norm": 0.4421065143917544, + "learning_rate": 4.172589639536991e-05, + "loss": 0.6901, + "step": 1326 + }, + { + "epoch": 0.7077333333333333, + "grad_norm": 0.35196517976013403, + "learning_rate": 4.158555222253771e-05, + "loss": 0.6108, + "step": 1327 + }, + { + "epoch": 0.7082666666666667, + "grad_norm": 0.37411979187532646, + "learning_rate": 4.14453824841132e-05, + "loss": 0.6329, + "step": 1328 + }, + { + "epoch": 0.7088, + "grad_norm": 0.42420550637332644, + "learning_rate": 4.130538759866457e-05, + "loss": 0.6884, + "step": 1329 + }, + { + "epoch": 0.7093333333333334, + "grad_norm": 0.42182543477882817, + "learning_rate": 4.1165567984237764e-05, + "loss": 0.6573, + "step": 1330 + }, + { + "epoch": 0.7098666666666666, + "grad_norm": 0.39889234567331394, + "learning_rate": 4.102592405835536e-05, + "loss": 0.6751, + "step": 1331 + }, + { + "epoch": 0.7104, + "grad_norm": 0.39676110200169845, + "learning_rate": 4.088645623801534e-05, + "loss": 0.6921, + "step": 1332 + }, + { + "epoch": 0.7109333333333333, + "grad_norm": 0.3938882372029621, + "learning_rate": 4.074716493968975e-05, + "loss": 0.6185, + "step": 1333 + }, + { + "epoch": 0.7114666666666667, + "grad_norm": 0.3869942510770041, + "learning_rate": 4.060805057932359e-05, + "loss": 0.5917, + "step": 1334 + }, + { + "epoch": 0.712, + "grad_norm": 0.3922253159288967, + "learning_rate": 4.046911357233343e-05, + "loss": 0.6634, + "step": 1335 + }, + { + "epoch": 0.7125333333333334, + "grad_norm": 0.41825733938280657, + "learning_rate": 4.0330354333606234e-05, + "loss": 0.7077, + "step": 1336 + }, + { + "epoch": 0.7130666666666666, + "grad_norm": 0.43317183056056374, + "learning_rate": 4.019177327749822e-05, + "loss": 0.7164, + "step": 1337 + }, + { + "epoch": 0.7136, + "grad_norm": 0.36150784828671517, + "learning_rate": 4.00533708178334e-05, + "loss": 0.6391, + "step": 1338 + }, + { + "epoch": 0.7141333333333333, + "grad_norm": 0.36723145086122616, + "learning_rate": 3.991514736790258e-05, + "loss": 0.6315, + "step": 1339 + }, + { + "epoch": 0.7146666666666667, + "grad_norm": 0.4103678298964808, + "learning_rate": 3.977710334046193e-05, + "loss": 0.6759, + "step": 1340 + }, + { + "epoch": 0.7152, + "grad_norm": 0.4196417562429959, + "learning_rate": 3.963923914773187e-05, + "loss": 0.6958, + "step": 1341 + }, + { + "epoch": 0.7157333333333333, + "grad_norm": 0.39654831771035837, + "learning_rate": 3.950155520139581e-05, + "loss": 0.6697, + "step": 1342 + }, + { + "epoch": 0.7162666666666667, + "grad_norm": 0.3943983098689371, + "learning_rate": 3.936405191259891e-05, + "loss": 0.6214, + "step": 1343 + }, + { + "epoch": 0.7168, + "grad_norm": 0.39397632964641327, + "learning_rate": 3.922672969194686e-05, + "loss": 0.6901, + "step": 1344 + }, + { + "epoch": 0.7173333333333334, + "grad_norm": 0.4496753311469288, + "learning_rate": 3.9089588949504655e-05, + "loss": 0.7014, + "step": 1345 + }, + { + "epoch": 0.7178666666666667, + "grad_norm": 0.37580320728220684, + "learning_rate": 3.895263009479534e-05, + "loss": 0.6636, + "step": 1346 + }, + { + "epoch": 0.7184, + "grad_norm": 0.42177084717029684, + "learning_rate": 3.8815853536798904e-05, + "loss": 0.6555, + "step": 1347 + }, + { + "epoch": 0.7189333333333333, + "grad_norm": 0.40646810663190575, + "learning_rate": 3.867925968395085e-05, + "loss": 0.6469, + "step": 1348 + }, + { + "epoch": 0.7194666666666667, + "grad_norm": 0.41365508067329304, + "learning_rate": 3.854284894414122e-05, + "loss": 0.6685, + "step": 1349 + }, + { + "epoch": 0.72, + "grad_norm": 0.34714694057200485, + "learning_rate": 3.840662172471315e-05, + "loss": 0.6352, + "step": 1350 + }, + { + "epoch": 0.7205333333333334, + "grad_norm": 0.4151687622915957, + "learning_rate": 3.82705784324618e-05, + "loss": 0.6491, + "step": 1351 + }, + { + "epoch": 0.7210666666666666, + "grad_norm": 0.3632469527368112, + "learning_rate": 3.8134719473633094e-05, + "loss": 0.6121, + "step": 1352 + }, + { + "epoch": 0.7216, + "grad_norm": 0.3920323388470978, + "learning_rate": 3.79990452539225e-05, + "loss": 0.6097, + "step": 1353 + }, + { + "epoch": 0.7221333333333333, + "grad_norm": 0.39744170045940946, + "learning_rate": 3.786355617847385e-05, + "loss": 0.6475, + "step": 1354 + }, + { + "epoch": 0.7226666666666667, + "grad_norm": 0.446471427295853, + "learning_rate": 3.772825265187802e-05, + "loss": 0.6765, + "step": 1355 + }, + { + "epoch": 0.7232, + "grad_norm": 0.4063003989199862, + "learning_rate": 3.759313507817196e-05, + "loss": 0.6193, + "step": 1356 + }, + { + "epoch": 0.7237333333333333, + "grad_norm": 0.37094394307109213, + "learning_rate": 3.7458203860837234e-05, + "loss": 0.6314, + "step": 1357 + }, + { + "epoch": 0.7242666666666666, + "grad_norm": 0.3863751730451606, + "learning_rate": 3.732345940279893e-05, + "loss": 0.6754, + "step": 1358 + }, + { + "epoch": 0.7248, + "grad_norm": 0.3946416135622672, + "learning_rate": 3.7188902106424416e-05, + "loss": 0.6659, + "step": 1359 + }, + { + "epoch": 0.7253333333333334, + "grad_norm": 0.3668017613796513, + "learning_rate": 3.705453237352227e-05, + "loss": 0.6359, + "step": 1360 + }, + { + "epoch": 0.7258666666666667, + "grad_norm": 0.3532747947569028, + "learning_rate": 3.692035060534088e-05, + "loss": 0.6381, + "step": 1361 + }, + { + "epoch": 0.7264, + "grad_norm": 0.49264762378255045, + "learning_rate": 3.678635720256737e-05, + "loss": 0.7229, + "step": 1362 + }, + { + "epoch": 0.7269333333333333, + "grad_norm": 0.3979815736399481, + "learning_rate": 3.665255256532638e-05, + "loss": 0.6503, + "step": 1363 + }, + { + "epoch": 0.7274666666666667, + "grad_norm": 0.35860055835757615, + "learning_rate": 3.651893709317887e-05, + "loss": 0.5768, + "step": 1364 + }, + { + "epoch": 0.728, + "grad_norm": 0.42806578552403296, + "learning_rate": 3.638551118512089e-05, + "loss": 0.7102, + "step": 1365 + }, + { + "epoch": 0.7285333333333334, + "grad_norm": 0.3711673262227997, + "learning_rate": 3.625227523958252e-05, + "loss": 0.6037, + "step": 1366 + }, + { + "epoch": 0.7290666666666666, + "grad_norm": 0.4260368302830914, + "learning_rate": 3.611922965442648e-05, + "loss": 0.6336, + "step": 1367 + }, + { + "epoch": 0.7296, + "grad_norm": 0.41828555619842467, + "learning_rate": 3.5986374826947066e-05, + "loss": 0.7052, + "step": 1368 + }, + { + "epoch": 0.7301333333333333, + "grad_norm": 0.3678885155752684, + "learning_rate": 3.5853711153868965e-05, + "loss": 0.6019, + "step": 1369 + }, + { + "epoch": 0.7306666666666667, + "grad_norm": 0.40274233978981605, + "learning_rate": 3.5721239031346066e-05, + "loss": 0.7069, + "step": 1370 + }, + { + "epoch": 0.7312, + "grad_norm": 0.4600956157514961, + "learning_rate": 3.558895885496023e-05, + "loss": 0.7972, + "step": 1371 + }, + { + "epoch": 0.7317333333333333, + "grad_norm": 0.40978340103739513, + "learning_rate": 3.545687101972013e-05, + "loss": 0.6009, + "step": 1372 + }, + { + "epoch": 0.7322666666666666, + "grad_norm": 0.4443155799954472, + "learning_rate": 3.53249759200601e-05, + "loss": 0.7042, + "step": 1373 + }, + { + "epoch": 0.7328, + "grad_norm": 0.3615102462432067, + "learning_rate": 3.519327394983888e-05, + "loss": 0.5927, + "step": 1374 + }, + { + "epoch": 0.7333333333333333, + "grad_norm": 0.7930685099141982, + "learning_rate": 3.506176550233863e-05, + "loss": 0.681, + "step": 1375 + }, + { + "epoch": 0.7338666666666667, + "grad_norm": 0.4077551476687731, + "learning_rate": 3.4930450970263485e-05, + "loss": 0.6321, + "step": 1376 + }, + { + "epoch": 0.7344, + "grad_norm": 0.41118679347968756, + "learning_rate": 3.479933074573858e-05, + "loss": 0.6219, + "step": 1377 + }, + { + "epoch": 0.7349333333333333, + "grad_norm": 0.39624353753623753, + "learning_rate": 3.46684052203088e-05, + "loss": 0.6079, + "step": 1378 + }, + { + "epoch": 0.7354666666666667, + "grad_norm": 0.4068515623858212, + "learning_rate": 3.4537674784937614e-05, + "loss": 0.6467, + "step": 1379 + }, + { + "epoch": 0.736, + "grad_norm": 0.36293900621263614, + "learning_rate": 3.440713983000601e-05, + "loss": 0.6336, + "step": 1380 + }, + { + "epoch": 0.7365333333333334, + "grad_norm": 0.4069638502104282, + "learning_rate": 3.427680074531113e-05, + "loss": 0.6369, + "step": 1381 + }, + { + "epoch": 0.7370666666666666, + "grad_norm": 0.3858394209281249, + "learning_rate": 3.4146657920065285e-05, + "loss": 0.647, + "step": 1382 + }, + { + "epoch": 0.7376, + "grad_norm": 0.3643715909820167, + "learning_rate": 3.401671174289469e-05, + "loss": 0.6538, + "step": 1383 + }, + { + "epoch": 0.7381333333333333, + "grad_norm": 0.3771449057141627, + "learning_rate": 3.388696260183832e-05, + "loss": 0.6428, + "step": 1384 + }, + { + "epoch": 0.7386666666666667, + "grad_norm": 0.42924005537930676, + "learning_rate": 3.3757410884346894e-05, + "loss": 0.6639, + "step": 1385 + }, + { + "epoch": 0.7392, + "grad_norm": 0.38883926038556477, + "learning_rate": 3.362805697728145e-05, + "loss": 0.6504, + "step": 1386 + }, + { + "epoch": 0.7397333333333334, + "grad_norm": 0.4096661812753556, + "learning_rate": 3.3498901266912396e-05, + "loss": 0.6578, + "step": 1387 + }, + { + "epoch": 0.7402666666666666, + "grad_norm": 0.36693699193296664, + "learning_rate": 3.336994413891828e-05, + "loss": 0.6088, + "step": 1388 + }, + { + "epoch": 0.7408, + "grad_norm": 0.3796701162868268, + "learning_rate": 3.324118597838464e-05, + "loss": 0.6002, + "step": 1389 + }, + { + "epoch": 0.7413333333333333, + "grad_norm": 0.4828987134519485, + "learning_rate": 3.3112627169802946e-05, + "loss": 0.6435, + "step": 1390 + }, + { + "epoch": 0.7418666666666667, + "grad_norm": 0.37167365428892346, + "learning_rate": 3.298426809706928e-05, + "loss": 0.6004, + "step": 1391 + }, + { + "epoch": 0.7424, + "grad_norm": 0.42296109076969385, + "learning_rate": 3.285610914348332e-05, + "loss": 0.728, + "step": 1392 + }, + { + "epoch": 0.7429333333333333, + "grad_norm": 0.41048258922277014, + "learning_rate": 3.2728150691747115e-05, + "loss": 0.6761, + "step": 1393 + }, + { + "epoch": 0.7434666666666667, + "grad_norm": 0.42100029204947237, + "learning_rate": 3.2600393123964113e-05, + "loss": 0.677, + "step": 1394 + }, + { + "epoch": 0.744, + "grad_norm": 0.4329414151995111, + "learning_rate": 3.2472836821637744e-05, + "loss": 0.6369, + "step": 1395 + }, + { + "epoch": 0.7445333333333334, + "grad_norm": 0.3863961244631767, + "learning_rate": 3.234548216567049e-05, + "loss": 0.659, + "step": 1396 + }, + { + "epoch": 0.7450666666666667, + "grad_norm": 0.38246873643359003, + "learning_rate": 3.2218329536362704e-05, + "loss": 0.6679, + "step": 1397 + }, + { + "epoch": 0.7456, + "grad_norm": 0.40424047037882044, + "learning_rate": 3.209137931341143e-05, + "loss": 0.6732, + "step": 1398 + }, + { + "epoch": 0.7461333333333333, + "grad_norm": 0.38861598750942106, + "learning_rate": 3.196463187590929e-05, + "loss": 0.6913, + "step": 1399 + }, + { + "epoch": 0.7466666666666667, + "grad_norm": 0.4260659280740861, + "learning_rate": 3.1838087602343344e-05, + "loss": 0.6512, + "step": 1400 + }, + { + "epoch": 0.7472, + "grad_norm": 0.39337819894073167, + "learning_rate": 3.1711746870594086e-05, + "loss": 0.6274, + "step": 1401 + }, + { + "epoch": 0.7477333333333334, + "grad_norm": 0.45108459949637103, + "learning_rate": 3.158561005793402e-05, + "loss": 0.6799, + "step": 1402 + }, + { + "epoch": 0.7482666666666666, + "grad_norm": 0.427549109761383, + "learning_rate": 3.145967754102691e-05, + "loss": 0.6847, + "step": 1403 + }, + { + "epoch": 0.7488, + "grad_norm": 0.40913046484681276, + "learning_rate": 3.1333949695926324e-05, + "loss": 0.6988, + "step": 1404 + }, + { + "epoch": 0.7493333333333333, + "grad_norm": 0.3766080702153803, + "learning_rate": 3.120842689807468e-05, + "loss": 0.6039, + "step": 1405 + }, + { + "epoch": 0.7498666666666667, + "grad_norm": 0.40369322638801514, + "learning_rate": 3.108310952230212e-05, + "loss": 0.6407, + "step": 1406 + }, + { + "epoch": 0.7504, + "grad_norm": 0.38091158526163255, + "learning_rate": 3.0957997942825336e-05, + "loss": 0.6456, + "step": 1407 + }, + { + "epoch": 0.7509333333333333, + "grad_norm": 0.3883533040585799, + "learning_rate": 3.083309253324651e-05, + "loss": 0.6068, + "step": 1408 + }, + { + "epoch": 0.7514666666666666, + "grad_norm": 0.36799108174224, + "learning_rate": 3.070839366655215e-05, + "loss": 0.6364, + "step": 1409 + }, + { + "epoch": 0.752, + "grad_norm": 0.42384231178399634, + "learning_rate": 3.058390171511196e-05, + "loss": 0.6556, + "step": 1410 + }, + { + "epoch": 0.7525333333333334, + "grad_norm": 0.50356351760391, + "learning_rate": 3.0459617050677868e-05, + "loss": 0.6359, + "step": 1411 + }, + { + "epoch": 0.7530666666666667, + "grad_norm": 0.5604868933950006, + "learning_rate": 3.0335540044382694e-05, + "loss": 0.6685, + "step": 1412 + }, + { + "epoch": 0.7536, + "grad_norm": 0.4299467016549516, + "learning_rate": 3.021167106673928e-05, + "loss": 0.6582, + "step": 1413 + }, + { + "epoch": 0.7541333333333333, + "grad_norm": 0.5161869734884429, + "learning_rate": 3.008801048763914e-05, + "loss": 0.737, + "step": 1414 + }, + { + "epoch": 0.7546666666666667, + "grad_norm": 0.4559428429138101, + "learning_rate": 2.996455867635155e-05, + "loss": 0.7054, + "step": 1415 + }, + { + "epoch": 0.7552, + "grad_norm": 0.4092761318759988, + "learning_rate": 2.9841316001522347e-05, + "loss": 0.6518, + "step": 1416 + }, + { + "epoch": 0.7557333333333334, + "grad_norm": 0.39530471957129254, + "learning_rate": 2.9718282831172883e-05, + "loss": 0.6611, + "step": 1417 + }, + { + "epoch": 0.7562666666666666, + "grad_norm": 0.42085708140120626, + "learning_rate": 2.9595459532698854e-05, + "loss": 0.6925, + "step": 1418 + }, + { + "epoch": 0.7568, + "grad_norm": 0.44992897855631964, + "learning_rate": 2.9472846472869298e-05, + "loss": 0.6549, + "step": 1419 + }, + { + "epoch": 0.7573333333333333, + "grad_norm": 0.4092292675638527, + "learning_rate": 2.9350444017825385e-05, + "loss": 0.6093, + "step": 1420 + }, + { + "epoch": 0.7578666666666667, + "grad_norm": 0.39009926160521424, + "learning_rate": 2.922825253307947e-05, + "loss": 0.6005, + "step": 1421 + }, + { + "epoch": 0.7584, + "grad_norm": 0.36085329648766035, + "learning_rate": 2.9106272383513835e-05, + "loss": 0.6585, + "step": 1422 + }, + { + "epoch": 0.7589333333333333, + "grad_norm": 0.4369052587142333, + "learning_rate": 2.898450393337977e-05, + "loss": 0.6494, + "step": 1423 + }, + { + "epoch": 0.7594666666666666, + "grad_norm": 0.3744016775256687, + "learning_rate": 2.8862947546296315e-05, + "loss": 0.5725, + "step": 1424 + }, + { + "epoch": 0.76, + "grad_norm": 0.4121362965207311, + "learning_rate": 2.874160358524931e-05, + "loss": 0.6017, + "step": 1425 + }, + { + "epoch": 0.7605333333333333, + "grad_norm": 0.37726568920550346, + "learning_rate": 2.8620472412590228e-05, + "loss": 0.6836, + "step": 1426 + }, + { + "epoch": 0.7610666666666667, + "grad_norm": 0.4418938398867874, + "learning_rate": 2.8499554390035143e-05, + "loss": 0.703, + "step": 1427 + }, + { + "epoch": 0.7616, + "grad_norm": 0.47506338936522274, + "learning_rate": 2.8378849878663628e-05, + "loss": 0.6722, + "step": 1428 + }, + { + "epoch": 0.7621333333333333, + "grad_norm": 0.44442248833841674, + "learning_rate": 2.8258359238917665e-05, + "loss": 0.6797, + "step": 1429 + }, + { + "epoch": 0.7626666666666667, + "grad_norm": 0.4621149821698225, + "learning_rate": 2.8138082830600554e-05, + "loss": 0.7252, + "step": 1430 + }, + { + "epoch": 0.7632, + "grad_norm": 0.3907540229713785, + "learning_rate": 2.8018021012875994e-05, + "loss": 0.6067, + "step": 1431 + }, + { + "epoch": 0.7637333333333334, + "grad_norm": 0.38864671047408117, + "learning_rate": 2.7898174144266732e-05, + "loss": 0.6273, + "step": 1432 + }, + { + "epoch": 0.7642666666666666, + "grad_norm": 0.4237723464624118, + "learning_rate": 2.7778542582653744e-05, + "loss": 0.6868, + "step": 1433 + }, + { + "epoch": 0.7648, + "grad_norm": 0.4190239875572504, + "learning_rate": 2.7659126685275027e-05, + "loss": 0.657, + "step": 1434 + }, + { + "epoch": 0.7653333333333333, + "grad_norm": 0.37781107305407086, + "learning_rate": 2.753992680872457e-05, + "loss": 0.5972, + "step": 1435 + }, + { + "epoch": 0.7658666666666667, + "grad_norm": 0.38874215364153275, + "learning_rate": 2.7420943308951284e-05, + "loss": 0.6169, + "step": 1436 + }, + { + "epoch": 0.7664, + "grad_norm": 0.36940896101229764, + "learning_rate": 2.7302176541257986e-05, + "loss": 0.6558, + "step": 1437 + }, + { + "epoch": 0.7669333333333334, + "grad_norm": 0.4291282188176774, + "learning_rate": 2.7183626860300247e-05, + "loss": 0.6454, + "step": 1438 + }, + { + "epoch": 0.7674666666666666, + "grad_norm": 0.3839284709830824, + "learning_rate": 2.7065294620085424e-05, + "loss": 0.6598, + "step": 1439 + }, + { + "epoch": 0.768, + "grad_norm": 0.4108258871751767, + "learning_rate": 2.6947180173971508e-05, + "loss": 0.6245, + "step": 1440 + }, + { + "epoch": 0.7685333333333333, + "grad_norm": 0.41932992628992855, + "learning_rate": 2.6829283874666233e-05, + "loss": 0.7086, + "step": 1441 + }, + { + "epoch": 0.7690666666666667, + "grad_norm": 0.3731122474497691, + "learning_rate": 2.6711606074225782e-05, + "loss": 0.5769, + "step": 1442 + }, + { + "epoch": 0.7696, + "grad_norm": 0.40271118418721064, + "learning_rate": 2.659414712405398e-05, + "loss": 0.6305, + "step": 1443 + }, + { + "epoch": 0.7701333333333333, + "grad_norm": 0.3759988991491332, + "learning_rate": 2.647690737490106e-05, + "loss": 0.6101, + "step": 1444 + }, + { + "epoch": 0.7706666666666667, + "grad_norm": 0.357343527288196, + "learning_rate": 2.6359887176862718e-05, + "loss": 0.5465, + "step": 1445 + }, + { + "epoch": 0.7712, + "grad_norm": 0.38925040840668906, + "learning_rate": 2.6243086879379e-05, + "loss": 0.5655, + "step": 1446 + }, + { + "epoch": 0.7717333333333334, + "grad_norm": 0.48160768866848636, + "learning_rate": 2.6126506831233344e-05, + "loss": 0.641, + "step": 1447 + }, + { + "epoch": 0.7722666666666667, + "grad_norm": 0.4244775309893954, + "learning_rate": 2.6010147380551475e-05, + "loss": 0.6801, + "step": 1448 + }, + { + "epoch": 0.7728, + "grad_norm": 0.4052513125568629, + "learning_rate": 2.5894008874800325e-05, + "loss": 0.6314, + "step": 1449 + }, + { + "epoch": 0.7733333333333333, + "grad_norm": 0.4530633727705138, + "learning_rate": 2.577809166078716e-05, + "loss": 0.6483, + "step": 1450 + }, + { + "epoch": 0.7738666666666667, + "grad_norm": 0.37044257317667545, + "learning_rate": 2.566239608465838e-05, + "loss": 0.5936, + "step": 1451 + }, + { + "epoch": 0.7744, + "grad_norm": 0.41393619426439865, + "learning_rate": 2.5546922491898495e-05, + "loss": 0.6154, + "step": 1452 + }, + { + "epoch": 0.7749333333333334, + "grad_norm": 0.4321874501000778, + "learning_rate": 2.543167122732918e-05, + "loss": 0.6521, + "step": 1453 + }, + { + "epoch": 0.7754666666666666, + "grad_norm": 0.4096361984440969, + "learning_rate": 2.5316642635108244e-05, + "loss": 0.6606, + "step": 1454 + }, + { + "epoch": 0.776, + "grad_norm": 0.41634198041514003, + "learning_rate": 2.5201837058728505e-05, + "loss": 0.7042, + "step": 1455 + }, + { + "epoch": 0.7765333333333333, + "grad_norm": 0.3915463794585155, + "learning_rate": 2.508725484101684e-05, + "loss": 0.6521, + "step": 1456 + }, + { + "epoch": 0.7770666666666667, + "grad_norm": 0.4071448592159562, + "learning_rate": 2.4972896324133144e-05, + "loss": 0.6251, + "step": 1457 + }, + { + "epoch": 0.7776, + "grad_norm": 0.42708533840760865, + "learning_rate": 2.485876184956928e-05, + "loss": 0.6636, + "step": 1458 + }, + { + "epoch": 0.7781333333333333, + "grad_norm": 0.41917061115204585, + "learning_rate": 2.4744851758148156e-05, + "loss": 0.6674, + "step": 1459 + }, + { + "epoch": 0.7786666666666666, + "grad_norm": 0.4201973493458763, + "learning_rate": 2.4631166390022574e-05, + "loss": 0.6569, + "step": 1460 + }, + { + "epoch": 0.7792, + "grad_norm": 0.43340039241864153, + "learning_rate": 2.451770608467432e-05, + "loss": 0.7164, + "step": 1461 + }, + { + "epoch": 0.7797333333333333, + "grad_norm": 0.3991819504268546, + "learning_rate": 2.4404471180913058e-05, + "loss": 0.6478, + "step": 1462 + }, + { + "epoch": 0.7802666666666667, + "grad_norm": 0.3983915458949484, + "learning_rate": 2.429146201687538e-05, + "loss": 0.678, + "step": 1463 + }, + { + "epoch": 0.7808, + "grad_norm": 0.42485343295316874, + "learning_rate": 2.417867893002387e-05, + "loss": 0.6346, + "step": 1464 + }, + { + "epoch": 0.7813333333333333, + "grad_norm": 0.3772056934359643, + "learning_rate": 2.4066122257145894e-05, + "loss": 0.6295, + "step": 1465 + }, + { + "epoch": 0.7818666666666667, + "grad_norm": 0.45201756095626, + "learning_rate": 2.3953792334352787e-05, + "loss": 0.6335, + "step": 1466 + }, + { + "epoch": 0.7824, + "grad_norm": 0.37976490138332747, + "learning_rate": 2.3841689497078746e-05, + "loss": 0.6227, + "step": 1467 + }, + { + "epoch": 0.7829333333333334, + "grad_norm": 0.3902287384849779, + "learning_rate": 2.3729814080079816e-05, + "loss": 0.6666, + "step": 1468 + }, + { + "epoch": 0.7834666666666666, + "grad_norm": 0.3442031570950758, + "learning_rate": 2.361816641743303e-05, + "loss": 0.5574, + "step": 1469 + }, + { + "epoch": 0.784, + "grad_norm": 0.4228465689249216, + "learning_rate": 2.3506746842535242e-05, + "loss": 0.6154, + "step": 1470 + }, + { + "epoch": 0.7845333333333333, + "grad_norm": 0.44808713358890007, + "learning_rate": 2.339555568810221e-05, + "loss": 0.6696, + "step": 1471 + }, + { + "epoch": 0.7850666666666667, + "grad_norm": 0.4116323837110524, + "learning_rate": 2.328459328616759e-05, + "loss": 0.6117, + "step": 1472 + }, + { + "epoch": 0.7856, + "grad_norm": 0.4118234638399567, + "learning_rate": 2.3173859968081944e-05, + "loss": 0.6487, + "step": 1473 + }, + { + "epoch": 0.7861333333333334, + "grad_norm": 0.3996291046457449, + "learning_rate": 2.306335606451181e-05, + "loss": 0.6403, + "step": 1474 + }, + { + "epoch": 0.7866666666666666, + "grad_norm": 0.4054129911232635, + "learning_rate": 2.295308190543859e-05, + "loss": 0.6725, + "step": 1475 + }, + { + "epoch": 0.7872, + "grad_norm": 0.37006654804032474, + "learning_rate": 2.2843037820157675e-05, + "loss": 0.5553, + "step": 1476 + }, + { + "epoch": 0.7877333333333333, + "grad_norm": 0.4637948919550711, + "learning_rate": 2.2733224137277366e-05, + "loss": 0.7438, + "step": 1477 + }, + { + "epoch": 0.7882666666666667, + "grad_norm": 0.40112128444826517, + "learning_rate": 2.262364118471805e-05, + "loss": 0.6543, + "step": 1478 + }, + { + "epoch": 0.7888, + "grad_norm": 0.41915403147154934, + "learning_rate": 2.251428928971102e-05, + "loss": 0.6369, + "step": 1479 + }, + { + "epoch": 0.7893333333333333, + "grad_norm": 0.3920247905350215, + "learning_rate": 2.2405168778797646e-05, + "loss": 0.6751, + "step": 1480 + }, + { + "epoch": 0.7898666666666667, + "grad_norm": 0.3726364042887898, + "learning_rate": 2.2296279977828337e-05, + "loss": 0.5884, + "step": 1481 + }, + { + "epoch": 0.7904, + "grad_norm": 0.44360332125933893, + "learning_rate": 2.2187623211961562e-05, + "loss": 0.6872, + "step": 1482 + }, + { + "epoch": 0.7909333333333334, + "grad_norm": 0.44282322726263496, + "learning_rate": 2.2079198805662914e-05, + "loss": 0.6916, + "step": 1483 + }, + { + "epoch": 0.7914666666666667, + "grad_norm": 0.3663176808737401, + "learning_rate": 2.1971007082704164e-05, + "loss": 0.6582, + "step": 1484 + }, + { + "epoch": 0.792, + "grad_norm": 0.4173314589810813, + "learning_rate": 2.1863048366162208e-05, + "loss": 0.7487, + "step": 1485 + }, + { + "epoch": 0.7925333333333333, + "grad_norm": 0.4177413822287383, + "learning_rate": 2.1755322978418137e-05, + "loss": 0.6562, + "step": 1486 + }, + { + "epoch": 0.7930666666666667, + "grad_norm": 0.4018296608992928, + "learning_rate": 2.1647831241156302e-05, + "loss": 0.6825, + "step": 1487 + }, + { + "epoch": 0.7936, + "grad_norm": 0.577563320408092, + "learning_rate": 2.1540573475363402e-05, + "loss": 0.6675, + "step": 1488 + }, + { + "epoch": 0.7941333333333334, + "grad_norm": 0.4187052591946463, + "learning_rate": 2.1433550001327373e-05, + "loss": 0.6964, + "step": 1489 + }, + { + "epoch": 0.7946666666666666, + "grad_norm": 0.45419004967260235, + "learning_rate": 2.1326761138636553e-05, + "loss": 0.7142, + "step": 1490 + }, + { + "epoch": 0.7952, + "grad_norm": 0.4749285727661252, + "learning_rate": 2.1220207206178688e-05, + "loss": 0.7697, + "step": 1491 + }, + { + "epoch": 0.7957333333333333, + "grad_norm": 0.403606872806351, + "learning_rate": 2.111388852214001e-05, + "loss": 0.599, + "step": 1492 + }, + { + "epoch": 0.7962666666666667, + "grad_norm": 0.3800456307931478, + "learning_rate": 2.1007805404004242e-05, + "loss": 0.6197, + "step": 1493 + }, + { + "epoch": 0.7968, + "grad_norm": 0.39692274903271046, + "learning_rate": 2.0901958168551638e-05, + "loss": 0.6435, + "step": 1494 + }, + { + "epoch": 0.7973333333333333, + "grad_norm": 0.3500391944449609, + "learning_rate": 2.0796347131858186e-05, + "loss": 0.5917, + "step": 1495 + }, + { + "epoch": 0.7978666666666666, + "grad_norm": 0.370806006504793, + "learning_rate": 2.069097260929439e-05, + "loss": 0.6059, + "step": 1496 + }, + { + "epoch": 0.7984, + "grad_norm": 0.4532424951174448, + "learning_rate": 2.058583491552465e-05, + "loss": 0.6522, + "step": 1497 + }, + { + "epoch": 0.7989333333333334, + "grad_norm": 0.3812118982948848, + "learning_rate": 2.048093436450603e-05, + "loss": 0.6228, + "step": 1498 + }, + { + "epoch": 0.7994666666666667, + "grad_norm": 0.4042590705033425, + "learning_rate": 2.0376271269487514e-05, + "loss": 0.5912, + "step": 1499 + }, + { + "epoch": 0.8, + "grad_norm": 0.37605121562082366, + "learning_rate": 2.027184594300898e-05, + "loss": 0.5777, + "step": 1500 + }, + { + "epoch": 0.8005333333333333, + "grad_norm": 0.41893809793390974, + "learning_rate": 2.0167658696900317e-05, + "loss": 0.6606, + "step": 1501 + }, + { + "epoch": 0.8010666666666667, + "grad_norm": 0.4308812653975276, + "learning_rate": 2.0063709842280432e-05, + "loss": 0.6721, + "step": 1502 + }, + { + "epoch": 0.8016, + "grad_norm": 0.4086330696324356, + "learning_rate": 1.995999968955641e-05, + "loss": 0.7012, + "step": 1503 + }, + { + "epoch": 0.8021333333333334, + "grad_norm": 0.4317896937328017, + "learning_rate": 1.985652854842247e-05, + "loss": 0.7462, + "step": 1504 + }, + { + "epoch": 0.8026666666666666, + "grad_norm": 0.38177599549512226, + "learning_rate": 1.9753296727859195e-05, + "loss": 0.6161, + "step": 1505 + }, + { + "epoch": 0.8032, + "grad_norm": 0.4087168870395883, + "learning_rate": 1.9650304536132426e-05, + "loss": 0.6338, + "step": 1506 + }, + { + "epoch": 0.8037333333333333, + "grad_norm": 0.4577575610849982, + "learning_rate": 1.9547552280792524e-05, + "loss": 0.6694, + "step": 1507 + }, + { + "epoch": 0.8042666666666667, + "grad_norm": 0.5402252955049723, + "learning_rate": 1.9445040268673298e-05, + "loss": 0.7686, + "step": 1508 + }, + { + "epoch": 0.8048, + "grad_norm": 0.4564488714688111, + "learning_rate": 1.9342768805891178e-05, + "loss": 0.7128, + "step": 1509 + }, + { + "epoch": 0.8053333333333333, + "grad_norm": 0.4256599732779054, + "learning_rate": 1.9240738197844278e-05, + "loss": 0.6829, + "step": 1510 + }, + { + "epoch": 0.8058666666666666, + "grad_norm": 0.43274994041108594, + "learning_rate": 1.9138948749211472e-05, + "loss": 0.6206, + "step": 1511 + }, + { + "epoch": 0.8064, + "grad_norm": 0.36086439574030127, + "learning_rate": 1.903740076395151e-05, + "loss": 0.6216, + "step": 1512 + }, + { + "epoch": 0.8069333333333333, + "grad_norm": 0.42244229931003574, + "learning_rate": 1.8936094545302095e-05, + "loss": 0.6722, + "step": 1513 + }, + { + "epoch": 0.8074666666666667, + "grad_norm": 0.42155871559748825, + "learning_rate": 1.883503039577894e-05, + "loss": 0.682, + "step": 1514 + }, + { + "epoch": 0.808, + "grad_norm": 0.371315898744865, + "learning_rate": 1.8734208617174988e-05, + "loss": 0.6007, + "step": 1515 + }, + { + "epoch": 0.8085333333333333, + "grad_norm": 0.43003284582729545, + "learning_rate": 1.8633629510559314e-05, + "loss": 0.6504, + "step": 1516 + }, + { + "epoch": 0.8090666666666667, + "grad_norm": 0.36370255523634354, + "learning_rate": 1.8533293376276472e-05, + "loss": 0.639, + "step": 1517 + }, + { + "epoch": 0.8096, + "grad_norm": 0.4704806704812013, + "learning_rate": 1.8433200513945337e-05, + "loss": 0.6655, + "step": 1518 + }, + { + "epoch": 0.8101333333333334, + "grad_norm": 0.3815879147078225, + "learning_rate": 1.8333351222458407e-05, + "loss": 0.6149, + "step": 1519 + }, + { + "epoch": 0.8106666666666666, + "grad_norm": 0.4527025181288094, + "learning_rate": 1.8233745799980817e-05, + "loss": 0.748, + "step": 1520 + }, + { + "epoch": 0.8112, + "grad_norm": 0.4897101550964116, + "learning_rate": 1.8134384543949478e-05, + "loss": 0.7656, + "step": 1521 + }, + { + "epoch": 0.8117333333333333, + "grad_norm": 0.37319089791901433, + "learning_rate": 1.803526775107217e-05, + "loss": 0.5999, + "step": 1522 + }, + { + "epoch": 0.8122666666666667, + "grad_norm": 0.3758775079088501, + "learning_rate": 1.7936395717326704e-05, + "loss": 0.6063, + "step": 1523 + }, + { + "epoch": 0.8128, + "grad_norm": 0.40838567930237774, + "learning_rate": 1.783776873795994e-05, + "loss": 0.7114, + "step": 1524 + }, + { + "epoch": 0.8133333333333334, + "grad_norm": 0.3973963251318473, + "learning_rate": 1.773938710748706e-05, + "loss": 0.6397, + "step": 1525 + }, + { + "epoch": 0.8138666666666666, + "grad_norm": 0.3579607168349245, + "learning_rate": 1.7641251119690505e-05, + "loss": 0.6355, + "step": 1526 + }, + { + "epoch": 0.8144, + "grad_norm": 0.3824067101446628, + "learning_rate": 1.754336106761927e-05, + "loss": 0.5695, + "step": 1527 + }, + { + "epoch": 0.8149333333333333, + "grad_norm": 0.4925954361146254, + "learning_rate": 1.744571724358789e-05, + "loss": 0.6677, + "step": 1528 + }, + { + "epoch": 0.8154666666666667, + "grad_norm": 0.4276981555335783, + "learning_rate": 1.7348319939175637e-05, + "loss": 0.6416, + "step": 1529 + }, + { + "epoch": 0.816, + "grad_norm": 0.43558168937581576, + "learning_rate": 1.7251169445225657e-05, + "loss": 0.6592, + "step": 1530 + }, + { + "epoch": 0.8165333333333333, + "grad_norm": 0.4238141183231073, + "learning_rate": 1.715426605184407e-05, + "loss": 0.6972, + "step": 1531 + }, + { + "epoch": 0.8170666666666667, + "grad_norm": 0.4244831238104071, + "learning_rate": 1.705761004839911e-05, + "loss": 0.6191, + "step": 1532 + }, + { + "epoch": 0.8176, + "grad_norm": 0.6300763843676416, + "learning_rate": 1.696120172352025e-05, + "loss": 0.625, + "step": 1533 + }, + { + "epoch": 0.8181333333333334, + "grad_norm": 0.3967432502181859, + "learning_rate": 1.6865041365097435e-05, + "loss": 0.6343, + "step": 1534 + }, + { + "epoch": 0.8186666666666667, + "grad_norm": 0.3996275527322477, + "learning_rate": 1.676912926028007e-05, + "loss": 0.6115, + "step": 1535 + }, + { + "epoch": 0.8192, + "grad_norm": 0.43478016649919016, + "learning_rate": 1.6673465695476232e-05, + "loss": 0.683, + "step": 1536 + }, + { + "epoch": 0.8197333333333333, + "grad_norm": 0.3935437242031588, + "learning_rate": 1.6578050956351886e-05, + "loss": 0.6435, + "step": 1537 + }, + { + "epoch": 0.8202666666666667, + "grad_norm": 0.3912390364000557, + "learning_rate": 1.6482885327829913e-05, + "loss": 0.6355, + "step": 1538 + }, + { + "epoch": 0.8208, + "grad_norm": 0.4210375910070289, + "learning_rate": 1.6387969094089316e-05, + "loss": 0.6788, + "step": 1539 + }, + { + "epoch": 0.8213333333333334, + "grad_norm": 0.38713562077626823, + "learning_rate": 1.6293302538564382e-05, + "loss": 0.6462, + "step": 1540 + }, + { + "epoch": 0.8218666666666666, + "grad_norm": 0.37313864212152587, + "learning_rate": 1.619888594394382e-05, + "loss": 0.6321, + "step": 1541 + }, + { + "epoch": 0.8224, + "grad_norm": 0.4133781840555599, + "learning_rate": 1.6104719592169902e-05, + "loss": 0.7128, + "step": 1542 + }, + { + "epoch": 0.8229333333333333, + "grad_norm": 0.41490502511206495, + "learning_rate": 1.601080376443763e-05, + "loss": 0.5811, + "step": 1543 + }, + { + "epoch": 0.8234666666666667, + "grad_norm": 0.4230171985342458, + "learning_rate": 1.5917138741193973e-05, + "loss": 0.701, + "step": 1544 + }, + { + "epoch": 0.824, + "grad_norm": 0.3758267013887235, + "learning_rate": 1.5823724802136865e-05, + "loss": 0.5755, + "step": 1545 + }, + { + "epoch": 0.8245333333333333, + "grad_norm": 0.4194968203142697, + "learning_rate": 1.573056222621453e-05, + "loss": 0.6747, + "step": 1546 + }, + { + "epoch": 0.8250666666666666, + "grad_norm": 0.3682826299485392, + "learning_rate": 1.5637651291624523e-05, + "loss": 0.5932, + "step": 1547 + }, + { + "epoch": 0.8256, + "grad_norm": 0.38930004217634334, + "learning_rate": 1.5544992275813053e-05, + "loss": 0.6158, + "step": 1548 + }, + { + "epoch": 0.8261333333333334, + "grad_norm": 0.4158573333782977, + "learning_rate": 1.5452585455473977e-05, + "loss": 0.692, + "step": 1549 + }, + { + "epoch": 0.8266666666666667, + "grad_norm": 0.3742543536831887, + "learning_rate": 1.536043110654809e-05, + "loss": 0.6646, + "step": 1550 + }, + { + "epoch": 0.8272, + "grad_norm": 0.4565482831683287, + "learning_rate": 1.526852950422226e-05, + "loss": 0.6546, + "step": 1551 + }, + { + "epoch": 0.8277333333333333, + "grad_norm": 0.4648477002260928, + "learning_rate": 1.5176880922928616e-05, + "loss": 0.6678, + "step": 1552 + }, + { + "epoch": 0.8282666666666667, + "grad_norm": 0.39928863792951164, + "learning_rate": 1.5085485636343755e-05, + "loss": 0.6211, + "step": 1553 + }, + { + "epoch": 0.8288, + "grad_norm": 0.4462595214667838, + "learning_rate": 1.4994343917387854e-05, + "loss": 0.6837, + "step": 1554 + }, + { + "epoch": 0.8293333333333334, + "grad_norm": 0.3651232552582547, + "learning_rate": 1.4903456038223939e-05, + "loss": 0.5964, + "step": 1555 + }, + { + "epoch": 0.8298666666666666, + "grad_norm": 0.39227730756186174, + "learning_rate": 1.4812822270257009e-05, + "loss": 0.6489, + "step": 1556 + }, + { + "epoch": 0.8304, + "grad_norm": 0.3870925483846158, + "learning_rate": 1.4722442884133214e-05, + "loss": 0.6095, + "step": 1557 + }, + { + "epoch": 0.8309333333333333, + "grad_norm": 0.42167014093711325, + "learning_rate": 1.4632318149739177e-05, + "loss": 0.6827, + "step": 1558 + }, + { + "epoch": 0.8314666666666667, + "grad_norm": 0.40072319805451717, + "learning_rate": 1.454244833620102e-05, + "loss": 0.657, + "step": 1559 + }, + { + "epoch": 0.832, + "grad_norm": 0.39007928494064764, + "learning_rate": 1.4452833711883628e-05, + "loss": 0.5955, + "step": 1560 + }, + { + "epoch": 0.8325333333333333, + "grad_norm": 0.37856632658691275, + "learning_rate": 1.4363474544389877e-05, + "loss": 0.6409, + "step": 1561 + }, + { + "epoch": 0.8330666666666666, + "grad_norm": 0.4041436839925096, + "learning_rate": 1.4274371100559791e-05, + "loss": 0.6301, + "step": 1562 + }, + { + "epoch": 0.8336, + "grad_norm": 0.3959519258693735, + "learning_rate": 1.4185523646469822e-05, + "loss": 0.664, + "step": 1563 + }, + { + "epoch": 0.8341333333333333, + "grad_norm": 0.4337337537945873, + "learning_rate": 1.409693244743192e-05, + "loss": 0.6291, + "step": 1564 + }, + { + "epoch": 0.8346666666666667, + "grad_norm": 0.3433934396705534, + "learning_rate": 1.4008597767992871e-05, + "loss": 0.5915, + "step": 1565 + }, + { + "epoch": 0.8352, + "grad_norm": 0.4323739867974528, + "learning_rate": 1.3920519871933424e-05, + "loss": 0.6997, + "step": 1566 + }, + { + "epoch": 0.8357333333333333, + "grad_norm": 0.36787797117790366, + "learning_rate": 1.3832699022267515e-05, + "loss": 0.5925, + "step": 1567 + }, + { + "epoch": 0.8362666666666667, + "grad_norm": 0.3485124873011298, + "learning_rate": 1.37451354812416e-05, + "loss": 0.5565, + "step": 1568 + }, + { + "epoch": 0.8368, + "grad_norm": 0.424398798101433, + "learning_rate": 1.3657829510333654e-05, + "loss": 0.632, + "step": 1569 + }, + { + "epoch": 0.8373333333333334, + "grad_norm": 0.3828073131985413, + "learning_rate": 1.3570781370252582e-05, + "loss": 0.6554, + "step": 1570 + }, + { + "epoch": 0.8378666666666666, + "grad_norm": 0.33559104771661646, + "learning_rate": 1.3483991320937306e-05, + "loss": 0.5688, + "step": 1571 + }, + { + "epoch": 0.8384, + "grad_norm": 0.4400788944876675, + "learning_rate": 1.339745962155613e-05, + "loss": 0.6421, + "step": 1572 + }, + { + "epoch": 0.8389333333333333, + "grad_norm": 0.4182109230731653, + "learning_rate": 1.3311186530505838e-05, + "loss": 0.6735, + "step": 1573 + }, + { + "epoch": 0.8394666666666667, + "grad_norm": 0.39126204447831453, + "learning_rate": 1.322517230541096e-05, + "loss": 0.6732, + "step": 1574 + }, + { + "epoch": 0.84, + "grad_norm": 0.35938102444571973, + "learning_rate": 1.3139417203123027e-05, + "loss": 0.6573, + "step": 1575 + }, + { + "epoch": 0.8405333333333334, + "grad_norm": 0.4034326249578852, + "learning_rate": 1.30539214797198e-05, + "loss": 0.602, + "step": 1576 + }, + { + "epoch": 0.8410666666666666, + "grad_norm": 0.3653789457849746, + "learning_rate": 1.2968685390504465e-05, + "loss": 0.5865, + "step": 1577 + }, + { + "epoch": 0.8416, + "grad_norm": 0.45162387923875585, + "learning_rate": 1.2883709190004955e-05, + "loss": 0.6558, + "step": 1578 + }, + { + "epoch": 0.8421333333333333, + "grad_norm": 0.42801086323324233, + "learning_rate": 1.2798993131973091e-05, + "loss": 0.6498, + "step": 1579 + }, + { + "epoch": 0.8426666666666667, + "grad_norm": 0.42856373941025483, + "learning_rate": 1.2714537469383858e-05, + "loss": 0.6719, + "step": 1580 + }, + { + "epoch": 0.8432, + "grad_norm": 0.4210817050468532, + "learning_rate": 1.263034245443473e-05, + "loss": 0.6711, + "step": 1581 + }, + { + "epoch": 0.8437333333333333, + "grad_norm": 0.36419654040815, + "learning_rate": 1.2546408338544769e-05, + "loss": 0.5764, + "step": 1582 + }, + { + "epoch": 0.8442666666666667, + "grad_norm": 0.38983772601087396, + "learning_rate": 1.2462735372353996e-05, + "loss": 0.6686, + "step": 1583 + }, + { + "epoch": 0.8448, + "grad_norm": 0.40031493797764406, + "learning_rate": 1.2379323805722576e-05, + "loss": 0.6085, + "step": 1584 + }, + { + "epoch": 0.8453333333333334, + "grad_norm": 0.42784491520692725, + "learning_rate": 1.2296173887730123e-05, + "loss": 0.5968, + "step": 1585 + }, + { + "epoch": 0.8458666666666667, + "grad_norm": 0.39096323767038194, + "learning_rate": 1.2213285866674905e-05, + "loss": 0.6616, + "step": 1586 + }, + { + "epoch": 0.8464, + "grad_norm": 0.4565965115949712, + "learning_rate": 1.2130659990073146e-05, + "loss": 0.6841, + "step": 1587 + }, + { + "epoch": 0.8469333333333333, + "grad_norm": 0.4562508675055286, + "learning_rate": 1.2048296504658207e-05, + "loss": 0.7019, + "step": 1588 + }, + { + "epoch": 0.8474666666666667, + "grad_norm": 0.4109227775066737, + "learning_rate": 1.1966195656380031e-05, + "loss": 0.7138, + "step": 1589 + }, + { + "epoch": 0.848, + "grad_norm": 0.3804421012815039, + "learning_rate": 1.1884357690404158e-05, + "loss": 0.6869, + "step": 1590 + }, + { + "epoch": 0.8485333333333334, + "grad_norm": 0.37864548372426227, + "learning_rate": 1.1802782851111205e-05, + "loss": 0.6293, + "step": 1591 + }, + { + "epoch": 0.8490666666666666, + "grad_norm": 0.3899734187282974, + "learning_rate": 1.1721471382096027e-05, + "loss": 0.6219, + "step": 1592 + }, + { + "epoch": 0.8496, + "grad_norm": 0.46146026784741373, + "learning_rate": 1.1640423526166988e-05, + "loss": 0.7349, + "step": 1593 + }, + { + "epoch": 0.8501333333333333, + "grad_norm": 0.39563096569301015, + "learning_rate": 1.1559639525345311e-05, + "loss": 0.6012, + "step": 1594 + }, + { + "epoch": 0.8506666666666667, + "grad_norm": 0.36349028623938273, + "learning_rate": 1.1479119620864276e-05, + "loss": 0.6323, + "step": 1595 + }, + { + "epoch": 0.8512, + "grad_norm": 0.5530220253152613, + "learning_rate": 1.1398864053168534e-05, + "loss": 0.7134, + "step": 1596 + }, + { + "epoch": 0.8517333333333333, + "grad_norm": 0.3633381559014149, + "learning_rate": 1.1318873061913405e-05, + "loss": 0.5883, + "step": 1597 + }, + { + "epoch": 0.8522666666666666, + "grad_norm": 0.524058103797069, + "learning_rate": 1.123914688596409e-05, + "loss": 0.6618, + "step": 1598 + }, + { + "epoch": 0.8528, + "grad_norm": 0.409430667623594, + "learning_rate": 1.1159685763395111e-05, + "loss": 0.6635, + "step": 1599 + }, + { + "epoch": 0.8533333333333334, + "grad_norm": 0.3302131801718762, + "learning_rate": 1.1080489931489391e-05, + "loss": 0.5556, + "step": 1600 + }, + { + "epoch": 0.8538666666666667, + "grad_norm": 0.39402409614544565, + "learning_rate": 1.1001559626737756e-05, + "loss": 0.6399, + "step": 1601 + }, + { + "epoch": 0.8544, + "grad_norm": 0.3859745387849683, + "learning_rate": 1.0922895084838037e-05, + "loss": 0.6593, + "step": 1602 + }, + { + "epoch": 0.8549333333333333, + "grad_norm": 0.318991288535117, + "learning_rate": 1.0844496540694515e-05, + "loss": 0.5662, + "step": 1603 + }, + { + "epoch": 0.8554666666666667, + "grad_norm": 0.39710258533822207, + "learning_rate": 1.0766364228417148e-05, + "loss": 0.6379, + "step": 1604 + }, + { + "epoch": 0.856, + "grad_norm": 0.37860102198584894, + "learning_rate": 1.0688498381320855e-05, + "loss": 0.5985, + "step": 1605 + }, + { + "epoch": 0.8565333333333334, + "grad_norm": 0.36232533598447775, + "learning_rate": 1.0610899231924886e-05, + "loss": 0.5908, + "step": 1606 + }, + { + "epoch": 0.8570666666666666, + "grad_norm": 0.43460142571501387, + "learning_rate": 1.0533567011952094e-05, + "loss": 0.6522, + "step": 1607 + }, + { + "epoch": 0.8576, + "grad_norm": 0.3876925245069668, + "learning_rate": 1.045650195232819e-05, + "loss": 0.6188, + "step": 1608 + }, + { + "epoch": 0.8581333333333333, + "grad_norm": 0.41730345165972554, + "learning_rate": 1.0379704283181179e-05, + "loss": 0.6449, + "step": 1609 + }, + { + "epoch": 0.8586666666666667, + "grad_norm": 0.3698623883860055, + "learning_rate": 1.0303174233840528e-05, + "loss": 0.6304, + "step": 1610 + }, + { + "epoch": 0.8592, + "grad_norm": 0.3696486364986156, + "learning_rate": 1.0226912032836611e-05, + "loss": 0.6236, + "step": 1611 + }, + { + "epoch": 0.8597333333333333, + "grad_norm": 0.37919659148999296, + "learning_rate": 1.0150917907899926e-05, + "loss": 0.6652, + "step": 1612 + }, + { + "epoch": 0.8602666666666666, + "grad_norm": 0.3969738606412844, + "learning_rate": 1.007519208596045e-05, + "loss": 0.63, + "step": 1613 + }, + { + "epoch": 0.8608, + "grad_norm": 0.3890817919498963, + "learning_rate": 9.999734793146998e-06, + "loss": 0.6088, + "step": 1614 + }, + { + "epoch": 0.8613333333333333, + "grad_norm": 0.375946683342908, + "learning_rate": 9.924546254786493e-06, + "loss": 0.5831, + "step": 1615 + }, + { + "epoch": 0.8618666666666667, + "grad_norm": 0.42534835273590144, + "learning_rate": 9.849626695403324e-06, + "loss": 0.6552, + "step": 1616 + }, + { + "epoch": 0.8624, + "grad_norm": 0.3813542531242084, + "learning_rate": 9.774976338718677e-06, + "loss": 0.5825, + "step": 1617 + }, + { + "epoch": 0.8629333333333333, + "grad_norm": 0.48512387811630203, + "learning_rate": 9.700595407649805e-06, + "loss": 0.7155, + "step": 1618 + }, + { + "epoch": 0.8634666666666667, + "grad_norm": 0.44487062533728505, + "learning_rate": 9.62648412430951e-06, + "loss": 0.6486, + "step": 1619 + }, + { + "epoch": 0.864, + "grad_norm": 0.4144096224359702, + "learning_rate": 9.552642710005299e-06, + "loss": 0.6182, + "step": 1620 + }, + { + "epoch": 0.8645333333333334, + "grad_norm": 0.4275581658139899, + "learning_rate": 9.479071385238892e-06, + "loss": 0.6368, + "step": 1621 + }, + { + "epoch": 0.8650666666666667, + "grad_norm": 0.41217208846370146, + "learning_rate": 9.40577036970538e-06, + "loss": 0.5757, + "step": 1622 + }, + { + "epoch": 0.8656, + "grad_norm": 0.4311514560125804, + "learning_rate": 9.332739882292752e-06, + "loss": 0.7516, + "step": 1623 + }, + { + "epoch": 0.8661333333333333, + "grad_norm": 0.4441582542928189, + "learning_rate": 9.259980141081115e-06, + "loss": 0.6473, + "step": 1624 + }, + { + "epoch": 0.8666666666666667, + "grad_norm": 0.42763724124452335, + "learning_rate": 9.187491363342093e-06, + "loss": 0.6022, + "step": 1625 + }, + { + "epoch": 0.8672, + "grad_norm": 0.4370781023309805, + "learning_rate": 9.115273765538202e-06, + "loss": 0.6075, + "step": 1626 + }, + { + "epoch": 0.8677333333333334, + "grad_norm": 0.42278656845369844, + "learning_rate": 9.043327563322112e-06, + "loss": 0.6556, + "step": 1627 + }, + { + "epoch": 0.8682666666666666, + "grad_norm": 0.41799589574914214, + "learning_rate": 8.971652971536148e-06, + "loss": 0.6304, + "step": 1628 + }, + { + "epoch": 0.8688, + "grad_norm": 0.4095696789811881, + "learning_rate": 8.900250204211514e-06, + "loss": 0.6205, + "step": 1629 + }, + { + "epoch": 0.8693333333333333, + "grad_norm": 0.3842838477008584, + "learning_rate": 8.829119474567671e-06, + "loss": 0.6719, + "step": 1630 + }, + { + "epoch": 0.8698666666666667, + "grad_norm": 0.40098624249490195, + "learning_rate": 8.758260995011825e-06, + "loss": 0.6649, + "step": 1631 + }, + { + "epoch": 0.8704, + "grad_norm": 0.40500561553935305, + "learning_rate": 8.687674977138116e-06, + "loss": 0.6583, + "step": 1632 + }, + { + "epoch": 0.8709333333333333, + "grad_norm": 0.36756166218603953, + "learning_rate": 8.617361631727138e-06, + "loss": 0.6197, + "step": 1633 + }, + { + "epoch": 0.8714666666666666, + "grad_norm": 0.5799918647625846, + "learning_rate": 8.547321168745193e-06, + "loss": 0.6944, + "step": 1634 + }, + { + "epoch": 0.872, + "grad_norm": 0.39258357313133585, + "learning_rate": 8.47755379734373e-06, + "loss": 0.6348, + "step": 1635 + }, + { + "epoch": 0.8725333333333334, + "grad_norm": 0.42613114420474996, + "learning_rate": 8.408059725858719e-06, + "loss": 0.6787, + "step": 1636 + }, + { + "epoch": 0.8730666666666667, + "grad_norm": 0.36014420147296194, + "learning_rate": 8.338839161809997e-06, + "loss": 0.6289, + "step": 1637 + }, + { + "epoch": 0.8736, + "grad_norm": 0.38731797911735133, + "learning_rate": 8.269892311900696e-06, + "loss": 0.6178, + "step": 1638 + }, + { + "epoch": 0.8741333333333333, + "grad_norm": 0.4359243445420513, + "learning_rate": 8.201219382016556e-06, + "loss": 0.6695, + "step": 1639 + }, + { + "epoch": 0.8746666666666667, + "grad_norm": 0.36746766889294014, + "learning_rate": 8.132820577225387e-06, + "loss": 0.62, + "step": 1640 + }, + { + "epoch": 0.8752, + "grad_norm": 0.4188819623793009, + "learning_rate": 8.064696101776358e-06, + "loss": 0.6775, + "step": 1641 + }, + { + "epoch": 0.8757333333333334, + "grad_norm": 0.3837328531647139, + "learning_rate": 7.996846159099557e-06, + "loss": 0.5817, + "step": 1642 + }, + { + "epoch": 0.8762666666666666, + "grad_norm": 0.4037215580067267, + "learning_rate": 7.929270951805178e-06, + "loss": 0.6739, + "step": 1643 + }, + { + "epoch": 0.8768, + "grad_norm": 0.3890101154213983, + "learning_rate": 7.861970681683051e-06, + "loss": 0.6209, + "step": 1644 + }, + { + "epoch": 0.8773333333333333, + "grad_norm": 0.3818105226730483, + "learning_rate": 7.794945549701993e-06, + "loss": 0.665, + "step": 1645 + }, + { + "epoch": 0.8778666666666667, + "grad_norm": 0.4446305625611964, + "learning_rate": 7.728195756009204e-06, + "loss": 0.6634, + "step": 1646 + }, + { + "epoch": 0.8784, + "grad_norm": 0.45066700064603565, + "learning_rate": 7.661721499929753e-06, + "loss": 0.732, + "step": 1647 + }, + { + "epoch": 0.8789333333333333, + "grad_norm": 0.5506659865294842, + "learning_rate": 7.595522979965819e-06, + "loss": 0.8025, + "step": 1648 + }, + { + "epoch": 0.8794666666666666, + "grad_norm": 0.3738317938225511, + "learning_rate": 7.529600393796232e-06, + "loss": 0.6668, + "step": 1649 + }, + { + "epoch": 0.88, + "grad_norm": 0.4733935783945023, + "learning_rate": 7.463953938275858e-06, + "loss": 0.7105, + "step": 1650 + }, + { + "epoch": 0.8805333333333333, + "grad_norm": 0.44189523647167056, + "learning_rate": 7.3985838094349444e-06, + "loss": 0.648, + "step": 1651 + }, + { + "epoch": 0.8810666666666667, + "grad_norm": 0.4445711433398203, + "learning_rate": 7.333490202478666e-06, + "loss": 0.7302, + "step": 1652 + }, + { + "epoch": 0.8816, + "grad_norm": 0.3728209155473441, + "learning_rate": 7.2686733117863784e-06, + "loss": 0.605, + "step": 1653 + }, + { + "epoch": 0.8821333333333333, + "grad_norm": 0.4135229458348511, + "learning_rate": 7.204133330911178e-06, + "loss": 0.6648, + "step": 1654 + }, + { + "epoch": 0.8826666666666667, + "grad_norm": 0.534394080276321, + "learning_rate": 7.1398704525792e-06, + "loss": 0.7504, + "step": 1655 + }, + { + "epoch": 0.8832, + "grad_norm": 0.44158154206796363, + "learning_rate": 7.07588486868922e-06, + "loss": 0.6495, + "step": 1656 + }, + { + "epoch": 0.8837333333333334, + "grad_norm": 0.34078346470207904, + "learning_rate": 7.012176770311862e-06, + "loss": 0.5454, + "step": 1657 + }, + { + "epoch": 0.8842666666666666, + "grad_norm": 0.3555839691863366, + "learning_rate": 6.948746347689183e-06, + "loss": 0.5889, + "step": 1658 + }, + { + "epoch": 0.8848, + "grad_norm": 0.371060102479731, + "learning_rate": 6.8855937902340576e-06, + "loss": 0.631, + "step": 1659 + }, + { + "epoch": 0.8853333333333333, + "grad_norm": 0.39526111909680517, + "learning_rate": 6.8227192865295995e-06, + "loss": 0.6089, + "step": 1660 + }, + { + "epoch": 0.8858666666666667, + "grad_norm": 0.4283808255240205, + "learning_rate": 6.760123024328624e-06, + "loss": 0.6206, + "step": 1661 + }, + { + "epoch": 0.8864, + "grad_norm": 0.3779760877382535, + "learning_rate": 6.6978051905530855e-06, + "loss": 0.5989, + "step": 1662 + }, + { + "epoch": 0.8869333333333334, + "grad_norm": 0.3440769727719932, + "learning_rate": 6.635765971293484e-06, + "loss": 0.5457, + "step": 1663 + }, + { + "epoch": 0.8874666666666666, + "grad_norm": 0.44655105361023856, + "learning_rate": 6.5740055518083375e-06, + "loss": 0.6729, + "step": 1664 + }, + { + "epoch": 0.888, + "grad_norm": 0.36877895299568914, + "learning_rate": 6.512524116523633e-06, + "loss": 0.5614, + "step": 1665 + }, + { + "epoch": 0.8885333333333333, + "grad_norm": 0.37325699927081074, + "learning_rate": 6.451321849032288e-06, + "loss": 0.5595, + "step": 1666 + }, + { + "epoch": 0.8890666666666667, + "grad_norm": 0.3786447935064768, + "learning_rate": 6.390398932093555e-06, + "loss": 0.6037, + "step": 1667 + }, + { + "epoch": 0.8896, + "grad_norm": 0.38025555154341495, + "learning_rate": 6.329755547632499e-06, + "loss": 0.5795, + "step": 1668 + }, + { + "epoch": 0.8901333333333333, + "grad_norm": 0.4152252835746442, + "learning_rate": 6.269391876739495e-06, + "loss": 0.6059, + "step": 1669 + }, + { + "epoch": 0.8906666666666667, + "grad_norm": 0.414720058216093, + "learning_rate": 6.209308099669597e-06, + "loss": 0.6171, + "step": 1670 + }, + { + "epoch": 0.8912, + "grad_norm": 0.3805423945862187, + "learning_rate": 6.149504395842087e-06, + "loss": 0.6218, + "step": 1671 + }, + { + "epoch": 0.8917333333333334, + "grad_norm": 0.4532193207806232, + "learning_rate": 6.089980943839924e-06, + "loss": 0.7583, + "step": 1672 + }, + { + "epoch": 0.8922666666666667, + "grad_norm": 0.41179295805269606, + "learning_rate": 6.030737921409169e-06, + "loss": 0.6657, + "step": 1673 + }, + { + "epoch": 0.8928, + "grad_norm": 0.4280068243529339, + "learning_rate": 5.971775505458444e-06, + "loss": 0.6707, + "step": 1674 + }, + { + "epoch": 0.8933333333333333, + "grad_norm": 0.3545547059777766, + "learning_rate": 5.913093872058528e-06, + "loss": 0.5767, + "step": 1675 + }, + { + "epoch": 0.8938666666666667, + "grad_norm": 0.395744975669382, + "learning_rate": 5.854693196441641e-06, + "loss": 0.6055, + "step": 1676 + }, + { + "epoch": 0.8944, + "grad_norm": 0.40435885790982506, + "learning_rate": 5.7965736530010916e-06, + "loss": 0.6379, + "step": 1677 + }, + { + "epoch": 0.8949333333333334, + "grad_norm": 0.3814437771569836, + "learning_rate": 5.738735415290642e-06, + "loss": 0.618, + "step": 1678 + }, + { + "epoch": 0.8954666666666666, + "grad_norm": 0.811008855038527, + "learning_rate": 5.681178656024055e-06, + "loss": 0.6972, + "step": 1679 + }, + { + "epoch": 0.896, + "grad_norm": 0.42911928712400943, + "learning_rate": 5.623903547074549e-06, + "loss": 0.6791, + "step": 1680 + }, + { + "epoch": 0.8965333333333333, + "grad_norm": 0.5597486842534753, + "learning_rate": 5.566910259474289e-06, + "loss": 0.6987, + "step": 1681 + }, + { + "epoch": 0.8970666666666667, + "grad_norm": 0.41208830819453646, + "learning_rate": 5.510198963413881e-06, + "loss": 0.5736, + "step": 1682 + }, + { + "epoch": 0.8976, + "grad_norm": 0.4034401621145211, + "learning_rate": 5.453769828241872e-06, + "loss": 0.6322, + "step": 1683 + }, + { + "epoch": 0.8981333333333333, + "grad_norm": 0.4093471387081946, + "learning_rate": 5.397623022464226e-06, + "loss": 0.6105, + "step": 1684 + }, + { + "epoch": 0.8986666666666666, + "grad_norm": 0.4238200528582358, + "learning_rate": 5.341758713743828e-06, + "loss": 0.6439, + "step": 1685 + }, + { + "epoch": 0.8992, + "grad_norm": 0.42386097679209056, + "learning_rate": 5.286177068899989e-06, + "loss": 0.6438, + "step": 1686 + }, + { + "epoch": 0.8997333333333334, + "grad_norm": 0.4599432589024242, + "learning_rate": 5.230878253907912e-06, + "loss": 0.6614, + "step": 1687 + }, + { + "epoch": 0.9002666666666667, + "grad_norm": 0.4833338053488303, + "learning_rate": 5.175862433898282e-06, + "loss": 0.7221, + "step": 1688 + }, + { + "epoch": 0.9008, + "grad_norm": 0.4240188291338941, + "learning_rate": 5.121129773156663e-06, + "loss": 0.6379, + "step": 1689 + }, + { + "epoch": 0.9013333333333333, + "grad_norm": 0.41703674870985313, + "learning_rate": 5.066680435123106e-06, + "loss": 0.6714, + "step": 1690 + }, + { + "epoch": 0.9018666666666667, + "grad_norm": 0.3967394191445386, + "learning_rate": 5.012514582391592e-06, + "loss": 0.6442, + "step": 1691 + }, + { + "epoch": 0.9024, + "grad_norm": 0.4089089452183383, + "learning_rate": 4.95863237670956e-06, + "loss": 0.5991, + "step": 1692 + }, + { + "epoch": 0.9029333333333334, + "grad_norm": 0.3767937158524674, + "learning_rate": 4.905033978977491e-06, + "loss": 0.6341, + "step": 1693 + }, + { + "epoch": 0.9034666666666666, + "grad_norm": 0.38585512834586744, + "learning_rate": 4.851719549248301e-06, + "loss": 0.6273, + "step": 1694 + }, + { + "epoch": 0.904, + "grad_norm": 0.4586049348519703, + "learning_rate": 4.798689246727006e-06, + "loss": 0.6217, + "step": 1695 + }, + { + "epoch": 0.9045333333333333, + "grad_norm": 0.40138686888256814, + "learning_rate": 4.745943229770122e-06, + "loss": 0.6547, + "step": 1696 + }, + { + "epoch": 0.9050666666666667, + "grad_norm": 0.43601738666780193, + "learning_rate": 4.693481655885257e-06, + "loss": 0.6823, + "step": 1697 + }, + { + "epoch": 0.9056, + "grad_norm": 0.420956972882786, + "learning_rate": 4.641304681730641e-06, + "loss": 0.6285, + "step": 1698 + }, + { + "epoch": 0.9061333333333333, + "grad_norm": 0.38614646380632833, + "learning_rate": 4.58941246311464e-06, + "loss": 0.6359, + "step": 1699 + }, + { + "epoch": 0.9066666666666666, + "grad_norm": 0.4449228643853392, + "learning_rate": 4.537805154995278e-06, + "loss": 0.7102, + "step": 1700 + }, + { + "epoch": 0.9072, + "grad_norm": 0.3816940892253165, + "learning_rate": 4.486482911479839e-06, + "loss": 0.6253, + "step": 1701 + }, + { + "epoch": 0.9077333333333333, + "grad_norm": 0.4106320807724667, + "learning_rate": 4.435445885824285e-06, + "loss": 0.5993, + "step": 1702 + }, + { + "epoch": 0.9082666666666667, + "grad_norm": 0.40646652543853595, + "learning_rate": 4.384694230432984e-06, + "loss": 0.6506, + "step": 1703 + }, + { + "epoch": 0.9088, + "grad_norm": 0.3710293297476454, + "learning_rate": 4.3342280968580285e-06, + "loss": 0.6178, + "step": 1704 + }, + { + "epoch": 0.9093333333333333, + "grad_norm": 0.44519568420444267, + "learning_rate": 4.2840476357989825e-06, + "loss": 0.6283, + "step": 1705 + }, + { + "epoch": 0.9098666666666667, + "grad_norm": 0.37961540701591967, + "learning_rate": 4.2341529971023255e-06, + "loss": 0.636, + "step": 1706 + }, + { + "epoch": 0.9104, + "grad_norm": 0.43152261029008687, + "learning_rate": 4.184544329761009e-06, + "loss": 0.6442, + "step": 1707 + }, + { + "epoch": 0.9109333333333334, + "grad_norm": 0.3839027909818481, + "learning_rate": 4.135221781914034e-06, + "loss": 0.6797, + "step": 1708 + }, + { + "epoch": 0.9114666666666666, + "grad_norm": 0.43003305226010224, + "learning_rate": 4.0861855008460405e-06, + "loss": 0.5905, + "step": 1709 + }, + { + "epoch": 0.912, + "grad_norm": 0.40204696343519214, + "learning_rate": 4.037435632986786e-06, + "loss": 0.6574, + "step": 1710 + }, + { + "epoch": 0.9125333333333333, + "grad_norm": 0.3901838374632061, + "learning_rate": 3.988972323910778e-06, + "loss": 0.634, + "step": 1711 + }, + { + "epoch": 0.9130666666666667, + "grad_norm": 0.441275045415316, + "learning_rate": 3.9407957183368095e-06, + "loss": 0.7004, + "step": 1712 + }, + { + "epoch": 0.9136, + "grad_norm": 0.374899335238422, + "learning_rate": 3.892905960127546e-06, + "loss": 0.5719, + "step": 1713 + }, + { + "epoch": 0.9141333333333334, + "grad_norm": 0.41052867678399346, + "learning_rate": 3.845303192289074e-06, + "loss": 0.6396, + "step": 1714 + }, + { + "epoch": 0.9146666666666666, + "grad_norm": 0.39889174261094934, + "learning_rate": 3.797987556970495e-06, + "loss": 0.6237, + "step": 1715 + }, + { + "epoch": 0.9152, + "grad_norm": 0.4486853289180069, + "learning_rate": 3.750959195463466e-06, + "loss": 0.6717, + "step": 1716 + }, + { + "epoch": 0.9157333333333333, + "grad_norm": 0.41471453140962095, + "learning_rate": 3.7042182482018075e-06, + "loss": 0.5692, + "step": 1717 + }, + { + "epoch": 0.9162666666666667, + "grad_norm": 0.393165175176718, + "learning_rate": 3.6577648547611033e-06, + "loss": 0.6424, + "step": 1718 + }, + { + "epoch": 0.9168, + "grad_norm": 0.40706528759649374, + "learning_rate": 3.611599153858214e-06, + "loss": 0.6497, + "step": 1719 + }, + { + "epoch": 0.9173333333333333, + "grad_norm": 0.4979494630382395, + "learning_rate": 3.565721283350931e-06, + "loss": 0.7129, + "step": 1720 + }, + { + "epoch": 0.9178666666666667, + "grad_norm": 0.3753930789086421, + "learning_rate": 3.5201313802375456e-06, + "loss": 0.6348, + "step": 1721 + }, + { + "epoch": 0.9184, + "grad_norm": 0.3559295297272381, + "learning_rate": 3.4748295806564356e-06, + "loss": 0.6077, + "step": 1722 + }, + { + "epoch": 0.9189333333333334, + "grad_norm": 0.46054781187370936, + "learning_rate": 3.4298160198856568e-06, + "loss": 0.6117, + "step": 1723 + }, + { + "epoch": 0.9194666666666667, + "grad_norm": 0.38305639769805905, + "learning_rate": 3.3850908323424967e-06, + "loss": 0.6267, + "step": 1724 + }, + { + "epoch": 0.92, + "grad_norm": 0.39004070336122815, + "learning_rate": 3.3406541515832003e-06, + "loss": 0.6533, + "step": 1725 + }, + { + "epoch": 0.9205333333333333, + "grad_norm": 0.41520161480351203, + "learning_rate": 3.296506110302422e-06, + "loss": 0.5985, + "step": 1726 + }, + { + "epoch": 0.9210666666666667, + "grad_norm": 0.41006185649112353, + "learning_rate": 3.252646840332918e-06, + "loss": 0.5896, + "step": 1727 + }, + { + "epoch": 0.9216, + "grad_norm": 0.4624709342649789, + "learning_rate": 3.209076472645112e-06, + "loss": 0.696, + "step": 1728 + }, + { + "epoch": 0.9221333333333334, + "grad_norm": 0.4777229066513596, + "learning_rate": 3.1657951373467497e-06, + "loss": 0.704, + "step": 1729 + }, + { + "epoch": 0.9226666666666666, + "grad_norm": 0.409799860675126, + "learning_rate": 3.1228029636824475e-06, + "loss": 0.6518, + "step": 1730 + }, + { + "epoch": 0.9232, + "grad_norm": 0.378772725904361, + "learning_rate": 3.0801000800333877e-06, + "loss": 0.6191, + "step": 1731 + }, + { + "epoch": 0.9237333333333333, + "grad_norm": 0.384915561919326, + "learning_rate": 3.037686613916857e-06, + "loss": 0.6043, + "step": 1732 + }, + { + "epoch": 0.9242666666666667, + "grad_norm": 0.37544872803810414, + "learning_rate": 2.995562691985898e-06, + "loss": 0.5987, + "step": 1733 + }, + { + "epoch": 0.9248, + "grad_norm": 0.38668980561915756, + "learning_rate": 2.9537284400289355e-06, + "loss": 0.5679, + "step": 1734 + }, + { + "epoch": 0.9253333333333333, + "grad_norm": 0.3710676543224045, + "learning_rate": 2.912183982969385e-06, + "loss": 0.5819, + "step": 1735 + }, + { + "epoch": 0.9258666666666666, + "grad_norm": 0.42198902606213534, + "learning_rate": 2.8709294448653225e-06, + "loss": 0.6122, + "step": 1736 + }, + { + "epoch": 0.9264, + "grad_norm": 0.4007596676485821, + "learning_rate": 2.8299649489090475e-06, + "loss": 0.6589, + "step": 1737 + }, + { + "epoch": 0.9269333333333334, + "grad_norm": 0.37845868804855115, + "learning_rate": 2.789290617426765e-06, + "loss": 0.622, + "step": 1738 + }, + { + "epoch": 0.9274666666666667, + "grad_norm": 0.4458854404524312, + "learning_rate": 2.748906571878207e-06, + "loss": 0.6845, + "step": 1739 + }, + { + "epoch": 0.928, + "grad_norm": 0.49399594676955133, + "learning_rate": 2.708812932856253e-06, + "loss": 0.7664, + "step": 1740 + }, + { + "epoch": 0.9285333333333333, + "grad_norm": 0.41750762623253024, + "learning_rate": 2.6690098200866098e-06, + "loss": 0.6847, + "step": 1741 + }, + { + "epoch": 0.9290666666666667, + "grad_norm": 0.38704341257194147, + "learning_rate": 2.6294973524274125e-06, + "loss": 0.6563, + "step": 1742 + }, + { + "epoch": 0.9296, + "grad_norm": 0.4564098292116241, + "learning_rate": 2.590275647868867e-06, + "loss": 0.6974, + "step": 1743 + }, + { + "epoch": 0.9301333333333334, + "grad_norm": 0.3884655746811534, + "learning_rate": 2.551344823532964e-06, + "loss": 0.688, + "step": 1744 + }, + { + "epoch": 0.9306666666666666, + "grad_norm": 0.43801374114327774, + "learning_rate": 2.5127049956730207e-06, + "loss": 0.6638, + "step": 1745 + }, + { + "epoch": 0.9312, + "grad_norm": 0.4109793755063495, + "learning_rate": 2.4743562796734622e-06, + "loss": 0.6284, + "step": 1746 + }, + { + "epoch": 0.9317333333333333, + "grad_norm": 0.3840239022455946, + "learning_rate": 2.436298790049363e-06, + "loss": 0.6098, + "step": 1747 + }, + { + "epoch": 0.9322666666666667, + "grad_norm": 0.47318182719166385, + "learning_rate": 2.3985326404461604e-06, + "loss": 0.67, + "step": 1748 + }, + { + "epoch": 0.9328, + "grad_norm": 0.4155046383093813, + "learning_rate": 2.3610579436393e-06, + "loss": 0.5981, + "step": 1749 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 0.3750203784525229, + "learning_rate": 2.3238748115339324e-06, + "loss": 0.6395, + "step": 1750 + }, + { + "epoch": 0.9338666666666666, + "grad_norm": 0.36006756331906975, + "learning_rate": 2.286983355164529e-06, + "loss": 0.5712, + "step": 1751 + }, + { + "epoch": 0.9344, + "grad_norm": 0.4320273428647143, + "learning_rate": 2.250383684694579e-06, + "loss": 0.6927, + "step": 1752 + }, + { + "epoch": 0.9349333333333333, + "grad_norm": 0.4210155540691878, + "learning_rate": 2.2140759094162467e-06, + "loss": 0.5523, + "step": 1753 + }, + { + "epoch": 0.9354666666666667, + "grad_norm": 0.4310062764474239, + "learning_rate": 2.178060137750071e-06, + "loss": 0.6648, + "step": 1754 + }, + { + "epoch": 0.936, + "grad_norm": 0.37958233122421753, + "learning_rate": 2.1423364772445887e-06, + "loss": 0.6632, + "step": 1755 + }, + { + "epoch": 0.9365333333333333, + "grad_norm": 0.45581052332137084, + "learning_rate": 2.106905034576112e-06, + "loss": 0.6538, + "step": 1756 + }, + { + "epoch": 0.9370666666666667, + "grad_norm": 0.41234748540786764, + "learning_rate": 2.0717659155482738e-06, + "loss": 0.6398, + "step": 1757 + }, + { + "epoch": 0.9376, + "grad_norm": 0.36560884716950104, + "learning_rate": 2.036919225091827e-06, + "loss": 0.6237, + "step": 1758 + }, + { + "epoch": 0.9381333333333334, + "grad_norm": 0.4116871346092955, + "learning_rate": 2.002365067264289e-06, + "loss": 0.6125, + "step": 1759 + }, + { + "epoch": 0.9386666666666666, + "grad_norm": 0.9304496331226776, + "learning_rate": 1.968103545249611e-06, + "loss": 0.7179, + "step": 1760 + }, + { + "epoch": 0.9392, + "grad_norm": 0.3864823364743263, + "learning_rate": 1.9341347613579087e-06, + "loss": 0.6261, + "step": 1761 + }, + { + "epoch": 0.9397333333333333, + "grad_norm": 0.43235009161967447, + "learning_rate": 1.900458817025097e-06, + "loss": 0.6229, + "step": 1762 + }, + { + "epoch": 0.9402666666666667, + "grad_norm": 0.3849197493302353, + "learning_rate": 1.8670758128126909e-06, + "loss": 0.5848, + "step": 1763 + }, + { + "epoch": 0.9408, + "grad_norm": 0.40134861331251387, + "learning_rate": 1.8339858484073935e-06, + "loss": 0.6163, + "step": 1764 + }, + { + "epoch": 0.9413333333333334, + "grad_norm": 0.35905598866420474, + "learning_rate": 1.8011890226208527e-06, + "loss": 0.5709, + "step": 1765 + }, + { + "epoch": 0.9418666666666666, + "grad_norm": 0.43085998890856214, + "learning_rate": 1.7686854333893833e-06, + "loss": 0.6428, + "step": 1766 + }, + { + "epoch": 0.9424, + "grad_norm": 0.3598609268107039, + "learning_rate": 1.7364751777736332e-06, + "loss": 0.6099, + "step": 1767 + }, + { + "epoch": 0.9429333333333333, + "grad_norm": 0.44991690422363384, + "learning_rate": 1.7045583519583074e-06, + "loss": 0.6848, + "step": 1768 + }, + { + "epoch": 0.9434666666666667, + "grad_norm": 0.39099610363033765, + "learning_rate": 1.6729350512519005e-06, + "loss": 0.6379, + "step": 1769 + }, + { + "epoch": 0.944, + "grad_norm": 0.40579872459583405, + "learning_rate": 1.6416053700863964e-06, + "loss": 0.645, + "step": 1770 + }, + { + "epoch": 0.9445333333333333, + "grad_norm": 0.39868018210085365, + "learning_rate": 1.6105694020169593e-06, + "loss": 0.5881, + "step": 1771 + }, + { + "epoch": 0.9450666666666667, + "grad_norm": 0.44413940180092004, + "learning_rate": 1.5798272397217095e-06, + "loss": 0.7001, + "step": 1772 + }, + { + "epoch": 0.9456, + "grad_norm": 0.41125705692688785, + "learning_rate": 1.5493789750014031e-06, + "loss": 0.6065, + "step": 1773 + }, + { + "epoch": 0.9461333333333334, + "grad_norm": 0.37943719234746764, + "learning_rate": 1.5192246987791981e-06, + "loss": 0.5538, + "step": 1774 + }, + { + "epoch": 0.9466666666666667, + "grad_norm": 0.37926866834963974, + "learning_rate": 1.489364501100332e-06, + "loss": 0.6136, + "step": 1775 + }, + { + "epoch": 0.9472, + "grad_norm": 0.3805039314657714, + "learning_rate": 1.459798471131868e-06, + "loss": 0.5837, + "step": 1776 + }, + { + "epoch": 0.9477333333333333, + "grad_norm": 0.42719801385702594, + "learning_rate": 1.430526697162482e-06, + "loss": 0.6021, + "step": 1777 + }, + { + "epoch": 0.9482666666666667, + "grad_norm": 0.40858697057497695, + "learning_rate": 1.4015492666021312e-06, + "loss": 0.6243, + "step": 1778 + }, + { + "epoch": 0.9488, + "grad_norm": 0.4487670605981078, + "learning_rate": 1.3728662659818204e-06, + "loss": 0.7142, + "step": 1779 + }, + { + "epoch": 0.9493333333333334, + "grad_norm": 0.4224541816095461, + "learning_rate": 1.344477780953346e-06, + "loss": 0.6509, + "step": 1780 + }, + { + "epoch": 0.9498666666666666, + "grad_norm": 0.42497622693465, + "learning_rate": 1.3163838962890195e-06, + "loss": 0.6408, + "step": 1781 + }, + { + "epoch": 0.9504, + "grad_norm": 0.4364536219536688, + "learning_rate": 1.2885846958814673e-06, + "loss": 0.6738, + "step": 1782 + }, + { + "epoch": 0.9509333333333333, + "grad_norm": 0.3725518080494089, + "learning_rate": 1.261080262743297e-06, + "loss": 0.6298, + "step": 1783 + }, + { + "epoch": 0.9514666666666667, + "grad_norm": 0.38823486298009524, + "learning_rate": 1.2338706790069431e-06, + "loss": 0.6046, + "step": 1784 + }, + { + "epoch": 0.952, + "grad_norm": 0.43667157632158776, + "learning_rate": 1.2069560259243328e-06, + "loss": 0.645, + "step": 1785 + }, + { + "epoch": 0.9525333333333333, + "grad_norm": 0.40600697636904476, + "learning_rate": 1.1803363838667092e-06, + "loss": 0.5678, + "step": 1786 + }, + { + "epoch": 0.9530666666666666, + "grad_norm": 0.4336830850484576, + "learning_rate": 1.1540118323243865e-06, + "loss": 0.6722, + "step": 1787 + }, + { + "epoch": 0.9536, + "grad_norm": 0.38487069921133976, + "learning_rate": 1.1279824499064396e-06, + "loss": 0.6327, + "step": 1788 + }, + { + "epoch": 0.9541333333333334, + "grad_norm": 0.36479045347086214, + "learning_rate": 1.1022483143405705e-06, + "loss": 0.591, + "step": 1789 + }, + { + "epoch": 0.9546666666666667, + "grad_norm": 0.37428770582908677, + "learning_rate": 1.076809502472831e-06, + "loss": 0.5923, + "step": 1790 + }, + { + "epoch": 0.9552, + "grad_norm": 0.4314768621928627, + "learning_rate": 1.0516660902673448e-06, + "loss": 0.6457, + "step": 1791 + }, + { + "epoch": 0.9557333333333333, + "grad_norm": 0.4147033583145242, + "learning_rate": 1.0268181528061749e-06, + "loss": 0.6127, + "step": 1792 + }, + { + "epoch": 0.9562666666666667, + "grad_norm": 0.42763395930014286, + "learning_rate": 1.0022657642890231e-06, + "loss": 0.6814, + "step": 1793 + }, + { + "epoch": 0.9568, + "grad_norm": 0.3870328118798832, + "learning_rate": 9.780089980330642e-07, + "loss": 0.5969, + "step": 1794 + }, + { + "epoch": 0.9573333333333334, + "grad_norm": 0.34963742138630666, + "learning_rate": 9.540479264726676e-07, + "loss": 0.5615, + "step": 1795 + }, + { + "epoch": 0.9578666666666666, + "grad_norm": 0.3763924966431932, + "learning_rate": 9.303826211592315e-07, + "loss": 0.6232, + "step": 1796 + }, + { + "epoch": 0.9584, + "grad_norm": 0.4838892027395187, + "learning_rate": 9.070131527609604e-07, + "loss": 0.73, + "step": 1797 + }, + { + "epoch": 0.9589333333333333, + "grad_norm": 0.40597714008074315, + "learning_rate": 8.839395910626213e-07, + "loss": 0.6208, + "step": 1798 + }, + { + "epoch": 0.9594666666666667, + "grad_norm": 0.3817389249841561, + "learning_rate": 8.611620049653879e-07, + "loss": 0.6221, + "step": 1799 + }, + { + "epoch": 0.96, + "grad_norm": 0.4032564775019703, + "learning_rate": 8.386804624865851e-07, + "loss": 0.6758, + "step": 1800 + }, + { + "epoch": 0.9605333333333334, + "grad_norm": 0.368876604368028, + "learning_rate": 8.16495030759501e-07, + "loss": 0.603, + "step": 1801 + }, + { + "epoch": 0.9610666666666666, + "grad_norm": 0.360160879563739, + "learning_rate": 7.946057760332193e-07, + "loss": 0.5842, + "step": 1802 + }, + { + "epoch": 0.9616, + "grad_norm": 0.4175421047933967, + "learning_rate": 7.730127636723539e-07, + "loss": 0.5573, + "step": 1803 + }, + { + "epoch": 0.9621333333333333, + "grad_norm": 0.46124172093356386, + "learning_rate": 7.517160581569372e-07, + "loss": 0.7338, + "step": 1804 + }, + { + "epoch": 0.9626666666666667, + "grad_norm": 0.41660671080560063, + "learning_rate": 7.307157230821426e-07, + "loss": 0.599, + "step": 1805 + }, + { + "epoch": 0.9632, + "grad_norm": 0.39971810732775126, + "learning_rate": 7.100118211581852e-07, + "loss": 0.6217, + "step": 1806 + }, + { + "epoch": 0.9637333333333333, + "grad_norm": 0.4240233802100902, + "learning_rate": 6.896044142100433e-07, + "loss": 0.6787, + "step": 1807 + }, + { + "epoch": 0.9642666666666667, + "grad_norm": 0.4106099086169289, + "learning_rate": 6.694935631773258e-07, + "loss": 0.6475, + "step": 1808 + }, + { + "epoch": 0.9648, + "grad_norm": 0.4484919955657655, + "learning_rate": 6.496793281141056e-07, + "loss": 0.7082, + "step": 1809 + }, + { + "epoch": 0.9653333333333334, + "grad_norm": 0.5168506298073032, + "learning_rate": 6.301617681886863e-07, + "loss": 0.7011, + "step": 1810 + }, + { + "epoch": 0.9658666666666667, + "grad_norm": 0.37421652043750936, + "learning_rate": 6.109409416834688e-07, + "loss": 0.6033, + "step": 1811 + }, + { + "epoch": 0.9664, + "grad_norm": 0.3825406307257205, + "learning_rate": 5.920169059947411e-07, + "loss": 0.6274, + "step": 1812 + }, + { + "epoch": 0.9669333333333333, + "grad_norm": 0.36370105383226464, + "learning_rate": 5.733897176325665e-07, + "loss": 0.6131, + "step": 1813 + }, + { + "epoch": 0.9674666666666667, + "grad_norm": 0.4362826352937529, + "learning_rate": 5.550594322205504e-07, + "loss": 0.6963, + "step": 1814 + }, + { + "epoch": 0.968, + "grad_norm": 0.38562300339047656, + "learning_rate": 5.370261044956971e-07, + "loss": 0.6073, + "step": 1815 + }, + { + "epoch": 0.9685333333333334, + "grad_norm": 0.4101889385189077, + "learning_rate": 5.192897883082747e-07, + "loss": 0.6342, + "step": 1816 + }, + { + "epoch": 0.9690666666666666, + "grad_norm": 0.4084494005339445, + "learning_rate": 5.018505366216175e-07, + "loss": 0.6384, + "step": 1817 + }, + { + "epoch": 0.9696, + "grad_norm": 0.393190863886088, + "learning_rate": 4.847084015119574e-07, + "loss": 0.7246, + "step": 1818 + }, + { + "epoch": 0.9701333333333333, + "grad_norm": 0.4130874391359839, + "learning_rate": 4.678634341683252e-07, + "loss": 0.6573, + "step": 1819 + }, + { + "epoch": 0.9706666666666667, + "grad_norm": 0.43866952692769, + "learning_rate": 4.5131568489236166e-07, + "loss": 0.6903, + "step": 1820 + }, + { + "epoch": 0.9712, + "grad_norm": 0.4756296648677467, + "learning_rate": 4.3506520309813947e-07, + "loss": 0.6519, + "step": 1821 + }, + { + "epoch": 0.9717333333333333, + "grad_norm": 0.3920615394297473, + "learning_rate": 4.191120373120749e-07, + "loss": 0.5875, + "step": 1822 + }, + { + "epoch": 0.9722666666666666, + "grad_norm": 0.3901570236370613, + "learning_rate": 4.034562351727389e-07, + "loss": 0.613, + "step": 1823 + }, + { + "epoch": 0.9728, + "grad_norm": 0.39027291245367424, + "learning_rate": 3.8809784343072366e-07, + "loss": 0.618, + "step": 1824 + }, + { + "epoch": 0.9733333333333334, + "grad_norm": 0.4044380141653884, + "learning_rate": 3.73036907948543e-07, + "loss": 0.6489, + "step": 1825 + }, + { + "epoch": 0.9738666666666667, + "grad_norm": 0.5909282315563071, + "learning_rate": 3.582734737004101e-07, + "loss": 0.6663, + "step": 1826 + }, + { + "epoch": 0.9744, + "grad_norm": 0.4864962177965382, + "learning_rate": 3.4380758477219333e-07, + "loss": 0.7379, + "step": 1827 + }, + { + "epoch": 0.9749333333333333, + "grad_norm": 0.37261687755147305, + "learning_rate": 3.296392843612273e-07, + "loss": 0.614, + "step": 1828 + }, + { + "epoch": 0.9754666666666667, + "grad_norm": 0.5168912163553879, + "learning_rate": 3.1576861477621287e-07, + "loss": 0.6986, + "step": 1829 + }, + { + "epoch": 0.976, + "grad_norm": 0.4050537228349556, + "learning_rate": 3.0219561743707326e-07, + "loss": 0.6398, + "step": 1830 + }, + { + "epoch": 0.9765333333333334, + "grad_norm": 0.46744115057105257, + "learning_rate": 2.889203328748424e-07, + "loss": 0.6044, + "step": 1831 + }, + { + "epoch": 0.9770666666666666, + "grad_norm": 0.4117992538107304, + "learning_rate": 2.759428007315212e-07, + "loss": 0.6387, + "step": 1832 + }, + { + "epoch": 0.9776, + "grad_norm": 0.37111934186497864, + "learning_rate": 2.6326305976001055e-07, + "loss": 0.6629, + "step": 1833 + }, + { + "epoch": 0.9781333333333333, + "grad_norm": 0.4541830541597025, + "learning_rate": 2.5088114782392257e-07, + "loss": 0.6629, + "step": 1834 + }, + { + "epoch": 0.9786666666666667, + "grad_norm": 0.43280828716229025, + "learning_rate": 2.3879710189753656e-07, + "loss": 0.6523, + "step": 1835 + }, + { + "epoch": 0.9792, + "grad_norm": 0.6536403315965769, + "learning_rate": 2.2701095806565432e-07, + "loss": 0.657, + "step": 1836 + }, + { + "epoch": 0.9797333333333333, + "grad_norm": 0.4258932375978754, + "learning_rate": 2.15522751523467e-07, + "loss": 0.6978, + "step": 1837 + }, + { + "epoch": 0.9802666666666666, + "grad_norm": 0.39518358503813816, + "learning_rate": 2.0433251657653308e-07, + "loss": 0.6285, + "step": 1838 + }, + { + "epoch": 0.9808, + "grad_norm": 0.3928729122003721, + "learning_rate": 1.9344028664056713e-07, + "loss": 0.6128, + "step": 1839 + }, + { + "epoch": 0.9813333333333333, + "grad_norm": 0.38598983650286456, + "learning_rate": 1.8284609424142895e-07, + "loss": 0.5799, + "step": 1840 + }, + { + "epoch": 0.9818666666666667, + "grad_norm": 0.3948251574075436, + "learning_rate": 1.7254997101500137e-07, + "loss": 0.5937, + "step": 1841 + }, + { + "epoch": 0.9824, + "grad_norm": 0.426109880344021, + "learning_rate": 1.6255194770704586e-07, + "loss": 0.6211, + "step": 1842 + }, + { + "epoch": 0.9829333333333333, + "grad_norm": 0.4236366004620384, + "learning_rate": 1.5285205417319149e-07, + "loss": 0.6471, + "step": 1843 + }, + { + "epoch": 0.9834666666666667, + "grad_norm": 0.3762315325341303, + "learning_rate": 1.4345031937879062e-07, + "loss": 0.5637, + "step": 1844 + }, + { + "epoch": 0.984, + "grad_norm": 0.5431328948750804, + "learning_rate": 1.3434677139885222e-07, + "loss": 0.699, + "step": 1845 + }, + { + "epoch": 0.9845333333333334, + "grad_norm": 0.4327306904052105, + "learning_rate": 1.255414374179531e-07, + "loss": 0.6687, + "step": 1846 + }, + { + "epoch": 0.9850666666666666, + "grad_norm": 0.3750053372630263, + "learning_rate": 1.170343437301491e-07, + "loss": 0.6314, + "step": 1847 + }, + { + "epoch": 0.9856, + "grad_norm": 0.4392820289743807, + "learning_rate": 1.0882551573891953e-07, + "loss": 0.6429, + "step": 1848 + }, + { + "epoch": 0.9861333333333333, + "grad_norm": 0.38998112576797356, + "learning_rate": 1.0091497795706728e-07, + "loss": 0.5897, + "step": 1849 + }, + { + "epoch": 0.9866666666666667, + "grad_norm": 0.40427089101594027, + "learning_rate": 9.330275400666332e-08, + "loss": 0.6488, + "step": 1850 + }, + { + "epoch": 0.9872, + "grad_norm": 0.4000013580423989, + "learning_rate": 8.598886661895788e-08, + "loss": 0.625, + "step": 1851 + }, + { + "epoch": 0.9877333333333334, + "grad_norm": 0.4229711291114719, + "learning_rate": 7.8973337634336e-08, + "loss": 0.6235, + "step": 1852 + }, + { + "epoch": 0.9882666666666666, + "grad_norm": 0.39532708500394637, + "learning_rate": 7.225618800222877e-08, + "loss": 0.6519, + "step": 1853 + }, + { + "epoch": 0.9888, + "grad_norm": 0.430113144669036, + "learning_rate": 6.583743778106887e-08, + "loss": 0.6273, + "step": 1854 + }, + { + "epoch": 0.9893333333333333, + "grad_norm": 0.4070632065693182, + "learning_rate": 5.971710613821291e-08, + "loss": 0.5986, + "step": 1855 + }, + { + "epoch": 0.9898666666666667, + "grad_norm": 0.4303476610897834, + "learning_rate": 5.389521134989695e-08, + "loss": 0.6181, + "step": 1856 + }, + { + "epoch": 0.9904, + "grad_norm": 0.49460620856969917, + "learning_rate": 4.837177080119215e-08, + "loss": 0.6842, + "step": 1857 + }, + { + "epoch": 0.9909333333333333, + "grad_norm": 0.3731127474613181, + "learning_rate": 4.314680098592705e-08, + "loss": 0.5763, + "step": 1858 + }, + { + "epoch": 0.9914666666666667, + "grad_norm": 0.3603271539682308, + "learning_rate": 3.8220317506654226e-08, + "loss": 0.5601, + "step": 1859 + }, + { + "epoch": 0.992, + "grad_norm": 0.4061619820938722, + "learning_rate": 3.359233507459481e-08, + "loss": 0.5951, + "step": 1860 + }, + { + "epoch": 0.9925333333333334, + "grad_norm": 0.3852902775921212, + "learning_rate": 2.9262867509605163e-08, + "loss": 0.5901, + "step": 1861 + }, + { + "epoch": 0.9930666666666667, + "grad_norm": 0.44172896935187084, + "learning_rate": 2.5231927740154704e-08, + "loss": 0.6538, + "step": 1862 + }, + { + "epoch": 0.9936, + "grad_norm": 0.4048509328319604, + "learning_rate": 2.1499527803214846e-08, + "loss": 0.6207, + "step": 1863 + }, + { + "epoch": 0.9941333333333333, + "grad_norm": 0.40031133738130914, + "learning_rate": 1.8065678844314538e-08, + "loss": 0.6316, + "step": 1864 + }, + { + "epoch": 0.9946666666666667, + "grad_norm": 0.35283342528313516, + "learning_rate": 1.4930391117451426e-08, + "loss": 0.5664, + "step": 1865 + }, + { + "epoch": 0.9952, + "grad_norm": 0.4048802276679688, + "learning_rate": 1.209367398504746e-08, + "loss": 0.6218, + "step": 1866 + }, + { + "epoch": 0.9957333333333334, + "grad_norm": 0.4112765662482519, + "learning_rate": 9.555535917993297e-09, + "loss": 0.6512, + "step": 1867 + }, + { + "epoch": 0.9962666666666666, + "grad_norm": 0.40104607022600364, + "learning_rate": 7.315984495548378e-09, + "loss": 0.6564, + "step": 1868 + }, + { + "epoch": 0.9968, + "grad_norm": 0.41389154069855416, + "learning_rate": 5.375026405352035e-09, + "loss": 0.5588, + "step": 1869 + }, + { + "epoch": 0.9973333333333333, + "grad_norm": 0.36243414198821367, + "learning_rate": 3.732667443390181e-09, + "loss": 0.5944, + "step": 1870 + }, + { + "epoch": 0.9978666666666667, + "grad_norm": 0.4525820312381907, + "learning_rate": 2.388912514017516e-09, + "loss": 0.6749, + "step": 1871 + }, + { + "epoch": 0.9984, + "grad_norm": 0.45642704463778055, + "learning_rate": 1.3437656298687097e-09, + "loss": 0.6813, + "step": 1872 + }, + { + "epoch": 0.9989333333333333, + "grad_norm": 0.39115268818088905, + "learning_rate": 5.972299119250125e-10, + "loss": 0.6345, + "step": 1873 + }, + { + "epoch": 0.9994666666666666, + "grad_norm": 0.388520173659741, + "learning_rate": 1.4930758944764479e-10, + "loss": 0.6512, + "step": 1874 + }, + { + "epoch": 1.0, + "grad_norm": 0.40510235035171605, + "learning_rate": 0.0, + "loss": 0.6228, + "step": 1875 + }, + { + "epoch": 1.0, + "step": 1875, + "total_flos": 1687683967877120.0, + "train_loss": 0.7127048384984335, + "train_runtime": 29360.6535, + "train_samples_per_second": 1.022, + "train_steps_per_second": 0.064 + } + ], + "logging_steps": 1.0, + "max_steps": 1875, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1687683967877120.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_3_epochs_1_GA_2_lora/README.md b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_3_epochs_1_GA_2_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_3_epochs_1_GA_2_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_3_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_3_epochs_1_GA_2_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..3a02d8b3c14d5889ddb631db01ebdbac41ec62e3 --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_3_epochs_1_GA_2_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "up_proj", + "q_proj", + "o_proj", + "down_proj", + "k_proj", + "v_proj", + "gate_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e141a974a7f8557f54a8b6076af31572577416d3 --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4292f08b8ec20f19c73067870e2f74efe60f1dd89419bb8d5d405c2c00557294 +size 671150064 diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_3_epochs_1_GA_2_lora/config.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_3_epochs_1_GA_2_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_3_epochs_1_GA_2_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..0feecc49996d6d6865779480a5b949e8948bd878 --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b8462bc1aee5e57a6201e3313a32350c879a0c4a5883072868e11f4a84398e7 +size 918507402 diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_3_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_3_epochs_1_GA_2_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..65639ca9d4c0ee3cce8cdf60c9525248b28c2fe4 --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_30000_repeat_3_epochs_1_GA_2_lora/trainer_state.json @@ -0,0 +1,13167 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 1875, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005333333333333334, + "grad_norm": 0.6944415840873915, + "learning_rate": 3.5087719298245615e-06, + "loss": 1.2371, + "step": 1 + }, + { + "epoch": 0.0010666666666666667, + "grad_norm": 0.9237516073149323, + "learning_rate": 7.017543859649123e-06, + "loss": 1.346, + "step": 2 + }, + { + "epoch": 0.0016, + "grad_norm": 0.8898403652257272, + "learning_rate": 1.0526315789473684e-05, + "loss": 1.4487, + "step": 3 + }, + { + "epoch": 0.0021333333333333334, + "grad_norm": 0.8272542005004054, + "learning_rate": 1.4035087719298246e-05, + "loss": 1.3012, + "step": 4 + }, + { + "epoch": 0.0026666666666666666, + "grad_norm": 0.672855335636144, + "learning_rate": 1.7543859649122806e-05, + "loss": 1.2082, + "step": 5 + }, + { + "epoch": 0.0032, + "grad_norm": 0.7541778585932635, + "learning_rate": 2.105263157894737e-05, + "loss": 1.3162, + "step": 6 + }, + { + "epoch": 0.0037333333333333333, + "grad_norm": 0.7218365791439992, + "learning_rate": 2.456140350877193e-05, + "loss": 1.2852, + "step": 7 + }, + { + "epoch": 0.004266666666666667, + "grad_norm": 0.6028645378944271, + "learning_rate": 2.8070175438596492e-05, + "loss": 1.1151, + "step": 8 + }, + { + "epoch": 0.0048, + "grad_norm": 0.6733007950798171, + "learning_rate": 3.157894736842105e-05, + "loss": 1.1556, + "step": 9 + }, + { + "epoch": 0.005333333333333333, + "grad_norm": 0.5602874051811487, + "learning_rate": 3.508771929824561e-05, + "loss": 0.9803, + "step": 10 + }, + { + "epoch": 0.005866666666666667, + "grad_norm": 0.9179182307731018, + "learning_rate": 3.859649122807018e-05, + "loss": 1.1182, + "step": 11 + }, + { + "epoch": 0.0064, + "grad_norm": 0.751234536943168, + "learning_rate": 4.210526315789474e-05, + "loss": 0.9436, + "step": 12 + }, + { + "epoch": 0.006933333333333333, + "grad_norm": 0.6826276028025567, + "learning_rate": 4.56140350877193e-05, + "loss": 0.9327, + "step": 13 + }, + { + "epoch": 0.007466666666666667, + "grad_norm": 0.7852409823004107, + "learning_rate": 4.912280701754386e-05, + "loss": 1.0438, + "step": 14 + }, + { + "epoch": 0.008, + "grad_norm": 0.5992997187296086, + "learning_rate": 5.2631578947368424e-05, + "loss": 0.8858, + "step": 15 + }, + { + "epoch": 0.008533333333333334, + "grad_norm": 0.5874456800487622, + "learning_rate": 5.6140350877192984e-05, + "loss": 0.9388, + "step": 16 + }, + { + "epoch": 0.009066666666666667, + "grad_norm": 0.5822876374605616, + "learning_rate": 5.9649122807017544e-05, + "loss": 0.974, + "step": 17 + }, + { + "epoch": 0.0096, + "grad_norm": 0.5604867750525585, + "learning_rate": 6.31578947368421e-05, + "loss": 0.9109, + "step": 18 + }, + { + "epoch": 0.010133333333333333, + "grad_norm": 0.5783177884247204, + "learning_rate": 6.666666666666667e-05, + "loss": 1.0157, + "step": 19 + }, + { + "epoch": 0.010666666666666666, + "grad_norm": 0.5432343133587975, + "learning_rate": 7.017543859649122e-05, + "loss": 0.9145, + "step": 20 + }, + { + "epoch": 0.0112, + "grad_norm": 0.4983104199001195, + "learning_rate": 7.368421052631579e-05, + "loss": 0.9487, + "step": 21 + }, + { + "epoch": 0.011733333333333333, + "grad_norm": 0.5526259301736156, + "learning_rate": 7.719298245614036e-05, + "loss": 0.9286, + "step": 22 + }, + { + "epoch": 0.012266666666666667, + "grad_norm": 0.4465931752876837, + "learning_rate": 8.070175438596491e-05, + "loss": 0.9112, + "step": 23 + }, + { + "epoch": 0.0128, + "grad_norm": 0.41106759052665426, + "learning_rate": 8.421052631578948e-05, + "loss": 0.8381, + "step": 24 + }, + { + "epoch": 0.013333333333333334, + "grad_norm": 0.43819578112888324, + "learning_rate": 8.771929824561403e-05, + "loss": 0.8083, + "step": 25 + }, + { + "epoch": 0.013866666666666666, + "grad_norm": 0.5415670158779432, + "learning_rate": 9.12280701754386e-05, + "loss": 0.9781, + "step": 26 + }, + { + "epoch": 0.0144, + "grad_norm": 0.4239940951437761, + "learning_rate": 9.473684210526316e-05, + "loss": 0.8188, + "step": 27 + }, + { + "epoch": 0.014933333333333333, + "grad_norm": 0.5164350404936713, + "learning_rate": 9.824561403508771e-05, + "loss": 0.9685, + "step": 28 + }, + { + "epoch": 0.015466666666666667, + "grad_norm": 0.5372237610117722, + "learning_rate": 0.0001017543859649123, + "loss": 0.976, + "step": 29 + }, + { + "epoch": 0.016, + "grad_norm": 0.5209825847266434, + "learning_rate": 0.00010526315789473685, + "loss": 0.8238, + "step": 30 + }, + { + "epoch": 0.016533333333333334, + "grad_norm": 0.5055305931315417, + "learning_rate": 0.00010877192982456141, + "loss": 0.8523, + "step": 31 + }, + { + "epoch": 0.017066666666666667, + "grad_norm": 0.5877233369375162, + "learning_rate": 0.00011228070175438597, + "loss": 1.002, + "step": 32 + }, + { + "epoch": 0.0176, + "grad_norm": 0.5175698032973742, + "learning_rate": 0.00011578947368421053, + "loss": 0.8617, + "step": 33 + }, + { + "epoch": 0.018133333333333335, + "grad_norm": 0.43747111829307783, + "learning_rate": 0.00011929824561403509, + "loss": 0.8128, + "step": 34 + }, + { + "epoch": 0.018666666666666668, + "grad_norm": 0.49290333964536837, + "learning_rate": 0.00012280701754385965, + "loss": 0.9131, + "step": 35 + }, + { + "epoch": 0.0192, + "grad_norm": 0.46999372987087223, + "learning_rate": 0.0001263157894736842, + "loss": 0.8599, + "step": 36 + }, + { + "epoch": 0.019733333333333332, + "grad_norm": 0.43591327572579525, + "learning_rate": 0.0001298245614035088, + "loss": 0.7806, + "step": 37 + }, + { + "epoch": 0.020266666666666665, + "grad_norm": 0.4548235931595958, + "learning_rate": 0.00013333333333333334, + "loss": 0.8169, + "step": 38 + }, + { + "epoch": 0.0208, + "grad_norm": 0.4883958779391726, + "learning_rate": 0.0001368421052631579, + "loss": 0.887, + "step": 39 + }, + { + "epoch": 0.021333333333333333, + "grad_norm": 0.5010048183676208, + "learning_rate": 0.00014035087719298245, + "loss": 0.9388, + "step": 40 + }, + { + "epoch": 0.021866666666666666, + "grad_norm": 0.42906779053114247, + "learning_rate": 0.00014385964912280703, + "loss": 0.8438, + "step": 41 + }, + { + "epoch": 0.0224, + "grad_norm": 0.4664966442277581, + "learning_rate": 0.00014736842105263158, + "loss": 0.8671, + "step": 42 + }, + { + "epoch": 0.022933333333333333, + "grad_norm": 0.4853123832044512, + "learning_rate": 0.00015087719298245616, + "loss": 0.8684, + "step": 43 + }, + { + "epoch": 0.023466666666666667, + "grad_norm": 0.4303822583017822, + "learning_rate": 0.0001543859649122807, + "loss": 0.7864, + "step": 44 + }, + { + "epoch": 0.024, + "grad_norm": 0.46357800086937717, + "learning_rate": 0.00015789473684210527, + "loss": 0.8621, + "step": 45 + }, + { + "epoch": 0.024533333333333334, + "grad_norm": 0.4546670490950593, + "learning_rate": 0.00016140350877192982, + "loss": 0.8015, + "step": 46 + }, + { + "epoch": 0.025066666666666668, + "grad_norm": 0.44919262163237095, + "learning_rate": 0.0001649122807017544, + "loss": 0.8074, + "step": 47 + }, + { + "epoch": 0.0256, + "grad_norm": 0.4951024350936205, + "learning_rate": 0.00016842105263157895, + "loss": 0.8764, + "step": 48 + }, + { + "epoch": 0.026133333333333335, + "grad_norm": 0.4585135436466692, + "learning_rate": 0.00017192982456140353, + "loss": 0.8491, + "step": 49 + }, + { + "epoch": 0.02666666666666667, + "grad_norm": 0.5448653801108341, + "learning_rate": 0.00017543859649122806, + "loss": 0.9423, + "step": 50 + }, + { + "epoch": 0.0272, + "grad_norm": 0.46467634169454985, + "learning_rate": 0.00017894736842105264, + "loss": 0.8535, + "step": 51 + }, + { + "epoch": 0.027733333333333332, + "grad_norm": 0.4601959184314359, + "learning_rate": 0.0001824561403508772, + "loss": 0.8206, + "step": 52 + }, + { + "epoch": 0.028266666666666666, + "grad_norm": 0.4116027116956907, + "learning_rate": 0.00018596491228070177, + "loss": 0.7936, + "step": 53 + }, + { + "epoch": 0.0288, + "grad_norm": 0.45507183958473724, + "learning_rate": 0.00018947368421052632, + "loss": 0.7862, + "step": 54 + }, + { + "epoch": 0.029333333333333333, + "grad_norm": 0.450173247372, + "learning_rate": 0.00019298245614035088, + "loss": 0.8451, + "step": 55 + }, + { + "epoch": 0.029866666666666666, + "grad_norm": 0.4762619236369063, + "learning_rate": 0.00019649122807017543, + "loss": 0.8945, + "step": 56 + }, + { + "epoch": 0.0304, + "grad_norm": 0.5336252193775527, + "learning_rate": 0.0002, + "loss": 0.9459, + "step": 57 + }, + { + "epoch": 0.030933333333333334, + "grad_norm": 0.4798122373225587, + "learning_rate": 0.00019999985069241055, + "loss": 0.8528, + "step": 58 + }, + { + "epoch": 0.031466666666666664, + "grad_norm": 0.4432560945756295, + "learning_rate": 0.00019999940277008808, + "loss": 0.8344, + "step": 59 + }, + { + "epoch": 0.032, + "grad_norm": 0.447266664393156, + "learning_rate": 0.00019999865623437013, + "loss": 0.8578, + "step": 60 + }, + { + "epoch": 0.03253333333333333, + "grad_norm": 0.429591571022727, + "learning_rate": 0.00019999761108748597, + "loss": 0.7733, + "step": 61 + }, + { + "epoch": 0.03306666666666667, + "grad_norm": 0.45565926368809806, + "learning_rate": 0.00019999626733255662, + "loss": 0.8608, + "step": 62 + }, + { + "epoch": 0.0336, + "grad_norm": 0.4403742976437441, + "learning_rate": 0.00019999462497359466, + "loss": 0.8869, + "step": 63 + }, + { + "epoch": 0.034133333333333335, + "grad_norm": 0.4886789451410554, + "learning_rate": 0.00019999268401550447, + "loss": 0.8765, + "step": 64 + }, + { + "epoch": 0.034666666666666665, + "grad_norm": 0.49902160576321314, + "learning_rate": 0.000199990444464082, + "loss": 0.9175, + "step": 65 + }, + { + "epoch": 0.0352, + "grad_norm": 0.42451260180558853, + "learning_rate": 0.00019998790632601496, + "loss": 0.8445, + "step": 66 + }, + { + "epoch": 0.03573333333333333, + "grad_norm": 0.5036458418688877, + "learning_rate": 0.00019998506960888256, + "loss": 0.8685, + "step": 67 + }, + { + "epoch": 0.03626666666666667, + "grad_norm": 0.5215355346442807, + "learning_rate": 0.00019998193432115572, + "loss": 0.886, + "step": 68 + }, + { + "epoch": 0.0368, + "grad_norm": 0.4810706495362761, + "learning_rate": 0.0001999785004721968, + "loss": 0.83, + "step": 69 + }, + { + "epoch": 0.037333333333333336, + "grad_norm": 0.4752802062241432, + "learning_rate": 0.00019997476807225985, + "loss": 0.8572, + "step": 70 + }, + { + "epoch": 0.037866666666666667, + "grad_norm": 0.48887680784833526, + "learning_rate": 0.0001999707371324904, + "loss": 0.8122, + "step": 71 + }, + { + "epoch": 0.0384, + "grad_norm": 0.4464455656623525, + "learning_rate": 0.00019996640766492543, + "loss": 0.7583, + "step": 72 + }, + { + "epoch": 0.038933333333333334, + "grad_norm": 0.4502376355976084, + "learning_rate": 0.00019996177968249334, + "loss": 0.8145, + "step": 73 + }, + { + "epoch": 0.039466666666666664, + "grad_norm": 0.47371965043639064, + "learning_rate": 0.0001999568531990141, + "loss": 0.8902, + "step": 74 + }, + { + "epoch": 0.04, + "grad_norm": 0.43692843711423124, + "learning_rate": 0.00019995162822919883, + "loss": 0.8395, + "step": 75 + }, + { + "epoch": 0.04053333333333333, + "grad_norm": 0.5391365712735926, + "learning_rate": 0.00019994610478865011, + "loss": 0.9224, + "step": 76 + }, + { + "epoch": 0.04106666666666667, + "grad_norm": 0.4679510818463831, + "learning_rate": 0.0001999402828938618, + "loss": 0.8445, + "step": 77 + }, + { + "epoch": 0.0416, + "grad_norm": 0.4410149228044899, + "learning_rate": 0.00019993416256221895, + "loss": 0.7968, + "step": 78 + }, + { + "epoch": 0.042133333333333335, + "grad_norm": 0.41427675369684946, + "learning_rate": 0.00019992774381199778, + "loss": 0.735, + "step": 79 + }, + { + "epoch": 0.042666666666666665, + "grad_norm": 0.4984554811446533, + "learning_rate": 0.00019992102666236566, + "loss": 0.8434, + "step": 80 + }, + { + "epoch": 0.0432, + "grad_norm": 0.43592960041981665, + "learning_rate": 0.00019991401113338104, + "loss": 0.7589, + "step": 81 + }, + { + "epoch": 0.04373333333333333, + "grad_norm": 0.45766415866525306, + "learning_rate": 0.00019990669724599336, + "loss": 0.827, + "step": 82 + }, + { + "epoch": 0.04426666666666667, + "grad_norm": 0.5162995961208674, + "learning_rate": 0.00019989908502204292, + "loss": 0.8072, + "step": 83 + }, + { + "epoch": 0.0448, + "grad_norm": 0.4737438201236418, + "learning_rate": 0.00019989117448426108, + "loss": 0.8065, + "step": 84 + }, + { + "epoch": 0.04533333333333334, + "grad_norm": 0.5186994360802084, + "learning_rate": 0.00019988296565626987, + "loss": 0.8736, + "step": 85 + }, + { + "epoch": 0.04586666666666667, + "grad_norm": 0.4618359832966522, + "learning_rate": 0.00019987445856258206, + "loss": 0.8005, + "step": 86 + }, + { + "epoch": 0.0464, + "grad_norm": 0.4574079671827261, + "learning_rate": 0.00019986565322860115, + "loss": 0.7259, + "step": 87 + }, + { + "epoch": 0.046933333333333334, + "grad_norm": 0.46175694382654586, + "learning_rate": 0.00019985654968062122, + "loss": 0.85, + "step": 88 + }, + { + "epoch": 0.047466666666666664, + "grad_norm": 0.4291930547499251, + "learning_rate": 0.00019984714794582683, + "loss": 0.7666, + "step": 89 + }, + { + "epoch": 0.048, + "grad_norm": 0.41983085824612487, + "learning_rate": 0.00019983744805229296, + "loss": 0.8257, + "step": 90 + }, + { + "epoch": 0.04853333333333333, + "grad_norm": 0.44957430818236727, + "learning_rate": 0.000199827450028985, + "loss": 0.853, + "step": 91 + }, + { + "epoch": 0.04906666666666667, + "grad_norm": 0.4600564721570993, + "learning_rate": 0.00019981715390575858, + "loss": 0.8087, + "step": 92 + }, + { + "epoch": 0.0496, + "grad_norm": 0.6253489587923583, + "learning_rate": 0.00019980655971335945, + "loss": 0.7901, + "step": 93 + }, + { + "epoch": 0.050133333333333335, + "grad_norm": 0.5281010561079615, + "learning_rate": 0.00019979566748342347, + "loss": 0.7997, + "step": 94 + }, + { + "epoch": 0.050666666666666665, + "grad_norm": 0.45303075638233853, + "learning_rate": 0.00019978447724847652, + "loss": 0.7914, + "step": 95 + }, + { + "epoch": 0.0512, + "grad_norm": 0.41303206939142767, + "learning_rate": 0.00019977298904193437, + "loss": 0.7501, + "step": 96 + }, + { + "epoch": 0.05173333333333333, + "grad_norm": 0.40984017273528417, + "learning_rate": 0.00019976120289810247, + "loss": 0.7245, + "step": 97 + }, + { + "epoch": 0.05226666666666667, + "grad_norm": 0.4650303499074836, + "learning_rate": 0.00019974911885217608, + "loss": 0.858, + "step": 98 + }, + { + "epoch": 0.0528, + "grad_norm": 0.5035130651520404, + "learning_rate": 0.00019973673694024, + "loss": 0.8126, + "step": 99 + }, + { + "epoch": 0.05333333333333334, + "grad_norm": 0.4054722904374148, + "learning_rate": 0.0001997240571992685, + "loss": 0.7733, + "step": 100 + }, + { + "epoch": 0.05386666666666667, + "grad_norm": 0.46034678879795327, + "learning_rate": 0.00019971107966712518, + "loss": 0.7837, + "step": 101 + }, + { + "epoch": 0.0544, + "grad_norm": 0.46416172423665386, + "learning_rate": 0.00019969780438256293, + "loss": 0.8471, + "step": 102 + }, + { + "epoch": 0.054933333333333334, + "grad_norm": 0.39898803977700154, + "learning_rate": 0.0001996842313852238, + "loss": 0.7586, + "step": 103 + }, + { + "epoch": 0.055466666666666664, + "grad_norm": 0.41641998385733875, + "learning_rate": 0.00019967036071563877, + "loss": 0.7471, + "step": 104 + }, + { + "epoch": 0.056, + "grad_norm": 0.4794689638062486, + "learning_rate": 0.0001996561924152278, + "loss": 0.8725, + "step": 105 + }, + { + "epoch": 0.05653333333333333, + "grad_norm": 0.39489918088446685, + "learning_rate": 0.0001996417265262996, + "loss": 0.7614, + "step": 106 + }, + { + "epoch": 0.05706666666666667, + "grad_norm": 0.44357484154729226, + "learning_rate": 0.00019962696309205148, + "loss": 0.8444, + "step": 107 + }, + { + "epoch": 0.0576, + "grad_norm": 0.4255774958424041, + "learning_rate": 0.0001996119021565693, + "loss": 0.7657, + "step": 108 + }, + { + "epoch": 0.058133333333333335, + "grad_norm": 0.4262064191118899, + "learning_rate": 0.0001995965437648273, + "loss": 0.8007, + "step": 109 + }, + { + "epoch": 0.058666666666666666, + "grad_norm": 0.3821179705066038, + "learning_rate": 0.00019958088796268793, + "loss": 0.7376, + "step": 110 + }, + { + "epoch": 0.0592, + "grad_norm": 0.42732811385386976, + "learning_rate": 0.0001995649347969019, + "loss": 0.717, + "step": 111 + }, + { + "epoch": 0.05973333333333333, + "grad_norm": 0.5120400356904402, + "learning_rate": 0.00019954868431510764, + "loss": 0.7882, + "step": 112 + }, + { + "epoch": 0.06026666666666667, + "grad_norm": 0.4868550700656959, + "learning_rate": 0.00019953213656583168, + "loss": 0.8392, + "step": 113 + }, + { + "epoch": 0.0608, + "grad_norm": 0.4184263053969323, + "learning_rate": 0.00019951529159848805, + "loss": 0.7542, + "step": 114 + }, + { + "epoch": 0.06133333333333333, + "grad_norm": 0.6929701997915731, + "learning_rate": 0.00019949814946337838, + "loss": 0.8437, + "step": 115 + }, + { + "epoch": 0.06186666666666667, + "grad_norm": 0.4566990560424279, + "learning_rate": 0.00019948071021169174, + "loss": 0.7528, + "step": 116 + }, + { + "epoch": 0.0624, + "grad_norm": 0.4121649246666184, + "learning_rate": 0.00019946297389550433, + "loss": 0.7822, + "step": 117 + }, + { + "epoch": 0.06293333333333333, + "grad_norm": 0.4253471446520878, + "learning_rate": 0.00019944494056777946, + "loss": 0.8152, + "step": 118 + }, + { + "epoch": 0.06346666666666667, + "grad_norm": 0.45888022790076605, + "learning_rate": 0.00019942661028236745, + "loss": 0.8862, + "step": 119 + }, + { + "epoch": 0.064, + "grad_norm": 0.4636889247958515, + "learning_rate": 0.00019940798309400526, + "loss": 0.802, + "step": 120 + }, + { + "epoch": 0.06453333333333333, + "grad_norm": 0.45048156433075803, + "learning_rate": 0.00019938905905831654, + "loss": 0.8117, + "step": 121 + }, + { + "epoch": 0.06506666666666666, + "grad_norm": 0.4406797785869058, + "learning_rate": 0.00019936983823181132, + "loss": 0.8154, + "step": 122 + }, + { + "epoch": 0.0656, + "grad_norm": 0.4589102632667276, + "learning_rate": 0.0001993503206718859, + "loss": 0.8003, + "step": 123 + }, + { + "epoch": 0.06613333333333334, + "grad_norm": 0.4838485949407181, + "learning_rate": 0.00019933050643682269, + "loss": 0.7771, + "step": 124 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 0.44507222590884743, + "learning_rate": 0.00019931039558578997, + "loss": 0.8383, + "step": 125 + }, + { + "epoch": 0.0672, + "grad_norm": 0.37569952847305876, + "learning_rate": 0.00019928998817884182, + "loss": 0.785, + "step": 126 + }, + { + "epoch": 0.06773333333333334, + "grad_norm": 0.4117299081017675, + "learning_rate": 0.00019926928427691786, + "loss": 0.7823, + "step": 127 + }, + { + "epoch": 0.06826666666666667, + "grad_norm": 0.44170602136108966, + "learning_rate": 0.00019924828394184306, + "loss": 0.8539, + "step": 128 + }, + { + "epoch": 0.0688, + "grad_norm": 0.4218611277342427, + "learning_rate": 0.00019922698723632767, + "loss": 0.8121, + "step": 129 + }, + { + "epoch": 0.06933333333333333, + "grad_norm": 0.5418620836987449, + "learning_rate": 0.0001992053942239668, + "loss": 0.9242, + "step": 130 + }, + { + "epoch": 0.06986666666666666, + "grad_norm": 0.42686140928716687, + "learning_rate": 0.0001991835049692405, + "loss": 0.779, + "step": 131 + }, + { + "epoch": 0.0704, + "grad_norm": 0.4741185695499851, + "learning_rate": 0.00019916131953751342, + "loss": 0.8416, + "step": 132 + }, + { + "epoch": 0.07093333333333333, + "grad_norm": 0.5113869593445932, + "learning_rate": 0.0001991388379950346, + "loss": 0.8909, + "step": 133 + }, + { + "epoch": 0.07146666666666666, + "grad_norm": 0.44021669828848753, + "learning_rate": 0.0001991160604089374, + "loss": 0.8145, + "step": 134 + }, + { + "epoch": 0.072, + "grad_norm": 0.4331612856730797, + "learning_rate": 0.00019909298684723904, + "loss": 0.8013, + "step": 135 + }, + { + "epoch": 0.07253333333333334, + "grad_norm": 0.42869556780040946, + "learning_rate": 0.00019906961737884077, + "loss": 0.7729, + "step": 136 + }, + { + "epoch": 0.07306666666666667, + "grad_norm": 0.4798784272020571, + "learning_rate": 0.00019904595207352737, + "loss": 0.7671, + "step": 137 + }, + { + "epoch": 0.0736, + "grad_norm": 0.41352724710087835, + "learning_rate": 0.00019902199100196697, + "loss": 0.7767, + "step": 138 + }, + { + "epoch": 0.07413333333333333, + "grad_norm": 0.47713623847853925, + "learning_rate": 0.000198997734235711, + "loss": 0.8163, + "step": 139 + }, + { + "epoch": 0.07466666666666667, + "grad_norm": 0.4256501161580274, + "learning_rate": 0.00019897318184719385, + "loss": 0.7838, + "step": 140 + }, + { + "epoch": 0.0752, + "grad_norm": 0.40691106752334844, + "learning_rate": 0.00019894833390973266, + "loss": 0.7611, + "step": 141 + }, + { + "epoch": 0.07573333333333333, + "grad_norm": 0.43022441599079425, + "learning_rate": 0.0001989231904975272, + "loss": 0.789, + "step": 142 + }, + { + "epoch": 0.07626666666666666, + "grad_norm": 0.45737675791473775, + "learning_rate": 0.00019889775168565943, + "loss": 0.7935, + "step": 143 + }, + { + "epoch": 0.0768, + "grad_norm": 0.40825035185860664, + "learning_rate": 0.00019887201755009357, + "loss": 0.8085, + "step": 144 + }, + { + "epoch": 0.07733333333333334, + "grad_norm": 0.4137346124558555, + "learning_rate": 0.00019884598816767563, + "loss": 0.7492, + "step": 145 + }, + { + "epoch": 0.07786666666666667, + "grad_norm": 0.5480064578296908, + "learning_rate": 0.0001988196636161333, + "loss": 0.8338, + "step": 146 + }, + { + "epoch": 0.0784, + "grad_norm": 0.4811157456470868, + "learning_rate": 0.0001987930439740757, + "loss": 0.8479, + "step": 147 + }, + { + "epoch": 0.07893333333333333, + "grad_norm": 0.42586764755902906, + "learning_rate": 0.00019876612932099308, + "loss": 0.7396, + "step": 148 + }, + { + "epoch": 0.07946666666666667, + "grad_norm": 0.479091998968582, + "learning_rate": 0.0001987389197372567, + "loss": 0.8647, + "step": 149 + }, + { + "epoch": 0.08, + "grad_norm": 0.4386709001907441, + "learning_rate": 0.00019871141530411853, + "loss": 0.7694, + "step": 150 + }, + { + "epoch": 0.08053333333333333, + "grad_norm": 0.4790956554257266, + "learning_rate": 0.00019868361610371097, + "loss": 0.8169, + "step": 151 + }, + { + "epoch": 0.08106666666666666, + "grad_norm": 0.4745228037719035, + "learning_rate": 0.00019865552221904665, + "loss": 0.869, + "step": 152 + }, + { + "epoch": 0.0816, + "grad_norm": 0.5205449581005507, + "learning_rate": 0.0001986271337340182, + "loss": 0.7765, + "step": 153 + }, + { + "epoch": 0.08213333333333334, + "grad_norm": 0.6000799801790433, + "learning_rate": 0.00019859845073339787, + "loss": 0.9171, + "step": 154 + }, + { + "epoch": 0.08266666666666667, + "grad_norm": 0.44788944508580925, + "learning_rate": 0.00019856947330283752, + "loss": 0.8149, + "step": 155 + }, + { + "epoch": 0.0832, + "grad_norm": 0.4157662736567421, + "learning_rate": 0.00019854020152886814, + "loss": 0.7555, + "step": 156 + }, + { + "epoch": 0.08373333333333334, + "grad_norm": 0.3832806174309398, + "learning_rate": 0.0001985106354988997, + "loss": 0.7223, + "step": 157 + }, + { + "epoch": 0.08426666666666667, + "grad_norm": 0.4699115996004792, + "learning_rate": 0.00019848077530122083, + "loss": 0.8032, + "step": 158 + }, + { + "epoch": 0.0848, + "grad_norm": 0.4169777330014803, + "learning_rate": 0.0001984506210249986, + "loss": 0.7244, + "step": 159 + }, + { + "epoch": 0.08533333333333333, + "grad_norm": 0.46018201985666296, + "learning_rate": 0.00019842017276027832, + "loss": 0.8126, + "step": 160 + }, + { + "epoch": 0.08586666666666666, + "grad_norm": 0.4847459842380028, + "learning_rate": 0.00019838943059798304, + "loss": 0.7947, + "step": 161 + }, + { + "epoch": 0.0864, + "grad_norm": 0.3933096013250647, + "learning_rate": 0.00019835839462991361, + "loss": 0.7736, + "step": 162 + }, + { + "epoch": 0.08693333333333333, + "grad_norm": 0.45243818185571316, + "learning_rate": 0.0001983270649487481, + "loss": 0.8184, + "step": 163 + }, + { + "epoch": 0.08746666666666666, + "grad_norm": 0.4503595945797378, + "learning_rate": 0.0001982954416480417, + "loss": 0.8227, + "step": 164 + }, + { + "epoch": 0.088, + "grad_norm": 0.44893678123815417, + "learning_rate": 0.00019826352482222638, + "loss": 0.8695, + "step": 165 + }, + { + "epoch": 0.08853333333333334, + "grad_norm": 0.4213210460142999, + "learning_rate": 0.00019823131456661063, + "loss": 0.825, + "step": 166 + }, + { + "epoch": 0.08906666666666667, + "grad_norm": 0.4426924506344481, + "learning_rate": 0.00019819881097737915, + "loss": 0.7672, + "step": 167 + }, + { + "epoch": 0.0896, + "grad_norm": 0.4869073165660452, + "learning_rate": 0.00019816601415159263, + "loss": 0.8762, + "step": 168 + }, + { + "epoch": 0.09013333333333333, + "grad_norm": 0.40015259673376496, + "learning_rate": 0.00019813292418718732, + "loss": 0.785, + "step": 169 + }, + { + "epoch": 0.09066666666666667, + "grad_norm": 0.46373832017025285, + "learning_rate": 0.0001980995411829749, + "loss": 0.8625, + "step": 170 + }, + { + "epoch": 0.0912, + "grad_norm": 0.4598836311824102, + "learning_rate": 0.0001980658652386421, + "loss": 0.8046, + "step": 171 + }, + { + "epoch": 0.09173333333333333, + "grad_norm": 0.45784081632691526, + "learning_rate": 0.0001980318964547504, + "loss": 0.8193, + "step": 172 + }, + { + "epoch": 0.09226666666666666, + "grad_norm": 0.4392932987043056, + "learning_rate": 0.0001979976349327357, + "loss": 0.8307, + "step": 173 + }, + { + "epoch": 0.0928, + "grad_norm": 0.4463844552503445, + "learning_rate": 0.00019796308077490817, + "loss": 0.7874, + "step": 174 + }, + { + "epoch": 0.09333333333333334, + "grad_norm": 0.4418559505901228, + "learning_rate": 0.00019792823408445174, + "loss": 0.8132, + "step": 175 + }, + { + "epoch": 0.09386666666666667, + "grad_norm": 0.4521536727961204, + "learning_rate": 0.0001978930949654239, + "loss": 0.7608, + "step": 176 + }, + { + "epoch": 0.0944, + "grad_norm": 0.4758650892647243, + "learning_rate": 0.00019785766352275542, + "loss": 0.8387, + "step": 177 + }, + { + "epoch": 0.09493333333333333, + "grad_norm": 0.4768329724525299, + "learning_rate": 0.00019782193986224995, + "loss": 0.8534, + "step": 178 + }, + { + "epoch": 0.09546666666666667, + "grad_norm": 0.42032771227871046, + "learning_rate": 0.00019778592409058378, + "loss": 0.8143, + "step": 179 + }, + { + "epoch": 0.096, + "grad_norm": 0.4144302224539125, + "learning_rate": 0.00019774961631530545, + "loss": 0.807, + "step": 180 + }, + { + "epoch": 0.09653333333333333, + "grad_norm": 0.39784095989327356, + "learning_rate": 0.0001977130166448355, + "loss": 0.7541, + "step": 181 + }, + { + "epoch": 0.09706666666666666, + "grad_norm": 0.42574876432860503, + "learning_rate": 0.00019767612518846608, + "loss": 0.7526, + "step": 182 + }, + { + "epoch": 0.0976, + "grad_norm": 0.5914800123257209, + "learning_rate": 0.00019763894205636072, + "loss": 0.8341, + "step": 183 + }, + { + "epoch": 0.09813333333333334, + "grad_norm": 0.42203359317691, + "learning_rate": 0.00019760146735955388, + "loss": 0.7692, + "step": 184 + }, + { + "epoch": 0.09866666666666667, + "grad_norm": 0.4410858516109663, + "learning_rate": 0.00019756370120995066, + "loss": 0.8106, + "step": 185 + }, + { + "epoch": 0.0992, + "grad_norm": 0.4567139293491327, + "learning_rate": 0.00019752564372032657, + "loss": 0.7652, + "step": 186 + }, + { + "epoch": 0.09973333333333333, + "grad_norm": 0.37682007465653333, + "learning_rate": 0.000197487295004327, + "loss": 0.7596, + "step": 187 + }, + { + "epoch": 0.10026666666666667, + "grad_norm": 0.46226419690327275, + "learning_rate": 0.00019744865517646706, + "loss": 0.7881, + "step": 188 + }, + { + "epoch": 0.1008, + "grad_norm": 0.45747812537419225, + "learning_rate": 0.00019740972435213115, + "loss": 0.8334, + "step": 189 + }, + { + "epoch": 0.10133333333333333, + "grad_norm": 0.40377810796691194, + "learning_rate": 0.0001973705026475726, + "loss": 0.7971, + "step": 190 + }, + { + "epoch": 0.10186666666666666, + "grad_norm": 0.4053289604581338, + "learning_rate": 0.00019733099017991341, + "loss": 0.7793, + "step": 191 + }, + { + "epoch": 0.1024, + "grad_norm": 0.4541349423682215, + "learning_rate": 0.00019729118706714375, + "loss": 0.7613, + "step": 192 + }, + { + "epoch": 0.10293333333333334, + "grad_norm": 0.43739594831129835, + "learning_rate": 0.0001972510934281218, + "loss": 0.8187, + "step": 193 + }, + { + "epoch": 0.10346666666666667, + "grad_norm": 0.41117651532362703, + "learning_rate": 0.00019721070938257324, + "loss": 0.8145, + "step": 194 + }, + { + "epoch": 0.104, + "grad_norm": 0.4432280712699174, + "learning_rate": 0.00019717003505109095, + "loss": 0.7855, + "step": 195 + }, + { + "epoch": 0.10453333333333334, + "grad_norm": 0.40787314100649424, + "learning_rate": 0.0001971290705551347, + "loss": 0.8185, + "step": 196 + }, + { + "epoch": 0.10506666666666667, + "grad_norm": 0.35008609049841144, + "learning_rate": 0.00019708781601703065, + "loss": 0.748, + "step": 197 + }, + { + "epoch": 0.1056, + "grad_norm": 0.45400474655538375, + "learning_rate": 0.00019704627155997108, + "loss": 0.8242, + "step": 198 + }, + { + "epoch": 0.10613333333333333, + "grad_norm": 0.4831309851229042, + "learning_rate": 0.00019700443730801413, + "loss": 0.827, + "step": 199 + }, + { + "epoch": 0.10666666666666667, + "grad_norm": 0.44584475765875253, + "learning_rate": 0.00019696231338608316, + "loss": 0.8331, + "step": 200 + }, + { + "epoch": 0.1072, + "grad_norm": 0.45461271032216954, + "learning_rate": 0.00019691989991996663, + "loss": 0.7473, + "step": 201 + }, + { + "epoch": 0.10773333333333333, + "grad_norm": 0.467815209535859, + "learning_rate": 0.00019687719703631755, + "loss": 0.8023, + "step": 202 + }, + { + "epoch": 0.10826666666666666, + "grad_norm": 0.4936230902501373, + "learning_rate": 0.00019683420486265327, + "loss": 0.876, + "step": 203 + }, + { + "epoch": 0.1088, + "grad_norm": 0.4465659727029938, + "learning_rate": 0.0001967909235273549, + "loss": 0.7856, + "step": 204 + }, + { + "epoch": 0.10933333333333334, + "grad_norm": 0.42387291428457186, + "learning_rate": 0.0001967473531596671, + "loss": 0.8244, + "step": 205 + }, + { + "epoch": 0.10986666666666667, + "grad_norm": 0.4481368721925528, + "learning_rate": 0.0001967034938896976, + "loss": 0.7337, + "step": 206 + }, + { + "epoch": 0.1104, + "grad_norm": 0.4274311370798581, + "learning_rate": 0.00019665934584841682, + "loss": 0.7881, + "step": 207 + }, + { + "epoch": 0.11093333333333333, + "grad_norm": 0.4391148343206368, + "learning_rate": 0.0001966149091676575, + "loss": 0.8075, + "step": 208 + }, + { + "epoch": 0.11146666666666667, + "grad_norm": 0.49054062967453904, + "learning_rate": 0.00019657018398011434, + "loss": 0.8624, + "step": 209 + }, + { + "epoch": 0.112, + "grad_norm": 0.5772965435325145, + "learning_rate": 0.00019652517041934356, + "loss": 0.7413, + "step": 210 + }, + { + "epoch": 0.11253333333333333, + "grad_norm": 0.4477575306069233, + "learning_rate": 0.00019647986861976246, + "loss": 0.7927, + "step": 211 + }, + { + "epoch": 0.11306666666666666, + "grad_norm": 0.4780165280100849, + "learning_rate": 0.0001964342787166491, + "loss": 0.829, + "step": 212 + }, + { + "epoch": 0.1136, + "grad_norm": 0.4072145848307551, + "learning_rate": 0.00019638840084614182, + "loss": 0.7687, + "step": 213 + }, + { + "epoch": 0.11413333333333334, + "grad_norm": 0.4287471275372935, + "learning_rate": 0.0001963422351452389, + "loss": 0.7509, + "step": 214 + }, + { + "epoch": 0.11466666666666667, + "grad_norm": 0.495437619970543, + "learning_rate": 0.0001962957817517982, + "loss": 0.9653, + "step": 215 + }, + { + "epoch": 0.1152, + "grad_norm": 0.40277028765977974, + "learning_rate": 0.00019624904080453655, + "loss": 0.786, + "step": 216 + }, + { + "epoch": 0.11573333333333333, + "grad_norm": 0.4181818299463913, + "learning_rate": 0.00019620201244302952, + "loss": 0.8017, + "step": 217 + }, + { + "epoch": 0.11626666666666667, + "grad_norm": 0.5017230228205042, + "learning_rate": 0.00019615469680771096, + "loss": 0.8789, + "step": 218 + }, + { + "epoch": 0.1168, + "grad_norm": 0.4944205151128549, + "learning_rate": 0.00019610709403987246, + "loss": 0.8088, + "step": 219 + }, + { + "epoch": 0.11733333333333333, + "grad_norm": 0.43279911669567134, + "learning_rate": 0.00019605920428166323, + "loss": 0.8379, + "step": 220 + }, + { + "epoch": 0.11786666666666666, + "grad_norm": 0.4185114711196277, + "learning_rate": 0.00019601102767608923, + "loss": 0.738, + "step": 221 + }, + { + "epoch": 0.1184, + "grad_norm": 0.48305042625641237, + "learning_rate": 0.00019596256436701324, + "loss": 0.8525, + "step": 222 + }, + { + "epoch": 0.11893333333333334, + "grad_norm": 0.40438229405832726, + "learning_rate": 0.00019591381449915397, + "loss": 0.7681, + "step": 223 + }, + { + "epoch": 0.11946666666666667, + "grad_norm": 0.46405607397443416, + "learning_rate": 0.00019586477821808597, + "loss": 0.8542, + "step": 224 + }, + { + "epoch": 0.12, + "grad_norm": 0.4112789795870178, + "learning_rate": 0.000195815455670239, + "loss": 0.7951, + "step": 225 + }, + { + "epoch": 0.12053333333333334, + "grad_norm": 0.45655927001958874, + "learning_rate": 0.00019576584700289768, + "loss": 0.7571, + "step": 226 + }, + { + "epoch": 0.12106666666666667, + "grad_norm": 0.4656737851703908, + "learning_rate": 0.00019571595236420102, + "loss": 0.8518, + "step": 227 + }, + { + "epoch": 0.1216, + "grad_norm": 0.4387941097031272, + "learning_rate": 0.00019566577190314197, + "loss": 0.7522, + "step": 228 + }, + { + "epoch": 0.12213333333333333, + "grad_norm": 0.40736854091545094, + "learning_rate": 0.00019561530576956703, + "loss": 0.7703, + "step": 229 + }, + { + "epoch": 0.12266666666666666, + "grad_norm": 0.4359474487913427, + "learning_rate": 0.00019556455411417573, + "loss": 0.8037, + "step": 230 + }, + { + "epoch": 0.1232, + "grad_norm": 0.3881636760799097, + "learning_rate": 0.0001955135170885202, + "loss": 0.6854, + "step": 231 + }, + { + "epoch": 0.12373333333333333, + "grad_norm": 0.43904303453450677, + "learning_rate": 0.00019546219484500475, + "loss": 0.7743, + "step": 232 + }, + { + "epoch": 0.12426666666666666, + "grad_norm": 0.41029410599851096, + "learning_rate": 0.00019541058753688538, + "loss": 0.7017, + "step": 233 + }, + { + "epoch": 0.1248, + "grad_norm": 0.43878985252506475, + "learning_rate": 0.00019535869531826937, + "loss": 0.7386, + "step": 234 + }, + { + "epoch": 0.12533333333333332, + "grad_norm": 0.4732376116029275, + "learning_rate": 0.00019530651834411474, + "loss": 0.7848, + "step": 235 + }, + { + "epoch": 0.12586666666666665, + "grad_norm": 0.42511634202293586, + "learning_rate": 0.00019525405677022989, + "loss": 0.7902, + "step": 236 + }, + { + "epoch": 0.1264, + "grad_norm": 0.4196252128724456, + "learning_rate": 0.00019520131075327298, + "loss": 0.7847, + "step": 237 + }, + { + "epoch": 0.12693333333333334, + "grad_norm": 0.48381972568587917, + "learning_rate": 0.0001951482804507517, + "loss": 0.7752, + "step": 238 + }, + { + "epoch": 0.12746666666666667, + "grad_norm": 0.42465414849497046, + "learning_rate": 0.00019509496602102252, + "loss": 0.7862, + "step": 239 + }, + { + "epoch": 0.128, + "grad_norm": 0.45180889888062, + "learning_rate": 0.00019504136762329047, + "loss": 0.8044, + "step": 240 + }, + { + "epoch": 0.12853333333333333, + "grad_norm": 0.44489280485939253, + "learning_rate": 0.00019498748541760846, + "loss": 0.8037, + "step": 241 + }, + { + "epoch": 0.12906666666666666, + "grad_norm": 0.40437378265677226, + "learning_rate": 0.0001949333195648769, + "loss": 0.718, + "step": 242 + }, + { + "epoch": 0.1296, + "grad_norm": 0.4109141127236639, + "learning_rate": 0.00019487887022684336, + "loss": 0.777, + "step": 243 + }, + { + "epoch": 0.13013333333333332, + "grad_norm": 0.4707454791021639, + "learning_rate": 0.00019482413756610173, + "loss": 0.8686, + "step": 244 + }, + { + "epoch": 0.13066666666666665, + "grad_norm": 0.47712190941329363, + "learning_rate": 0.0001947691217460921, + "loss": 0.8097, + "step": 245 + }, + { + "epoch": 0.1312, + "grad_norm": 0.3992104315095112, + "learning_rate": 0.00019471382293110003, + "loss": 0.7352, + "step": 246 + }, + { + "epoch": 0.13173333333333334, + "grad_norm": 0.4137650229412225, + "learning_rate": 0.00019465824128625617, + "loss": 0.7848, + "step": 247 + }, + { + "epoch": 0.13226666666666667, + "grad_norm": 0.3861068942107445, + "learning_rate": 0.00019460237697753577, + "loss": 0.754, + "step": 248 + }, + { + "epoch": 0.1328, + "grad_norm": 0.42687614828833265, + "learning_rate": 0.00019454623017175812, + "loss": 0.8449, + "step": 249 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 0.4772395079126402, + "learning_rate": 0.00019448980103658613, + "loss": 0.8213, + "step": 250 + }, + { + "epoch": 0.13386666666666666, + "grad_norm": 0.4561418527462904, + "learning_rate": 0.0001944330897405257, + "loss": 0.764, + "step": 251 + }, + { + "epoch": 0.1344, + "grad_norm": 0.45324284614374627, + "learning_rate": 0.00019437609645292546, + "loss": 0.8145, + "step": 252 + }, + { + "epoch": 0.13493333333333332, + "grad_norm": 0.42990871413569814, + "learning_rate": 0.00019431882134397598, + "loss": 0.757, + "step": 253 + }, + { + "epoch": 0.13546666666666668, + "grad_norm": 0.4242442269517252, + "learning_rate": 0.00019426126458470936, + "loss": 0.758, + "step": 254 + }, + { + "epoch": 0.136, + "grad_norm": 0.4961090507914223, + "learning_rate": 0.0001942034263469989, + "loss": 0.9201, + "step": 255 + }, + { + "epoch": 0.13653333333333334, + "grad_norm": 0.4388177110934772, + "learning_rate": 0.00019414530680355837, + "loss": 0.7998, + "step": 256 + }, + { + "epoch": 0.13706666666666667, + "grad_norm": 0.4334726427754465, + "learning_rate": 0.00019408690612794148, + "loss": 0.7472, + "step": 257 + }, + { + "epoch": 0.1376, + "grad_norm": 0.5245679474958836, + "learning_rate": 0.00019402822449454153, + "loss": 0.7815, + "step": 258 + }, + { + "epoch": 0.13813333333333333, + "grad_norm": 0.3984525106489841, + "learning_rate": 0.00019396926207859084, + "loss": 0.7792, + "step": 259 + }, + { + "epoch": 0.13866666666666666, + "grad_norm": 0.5470105815848786, + "learning_rate": 0.0001939100190561601, + "loss": 0.7804, + "step": 260 + }, + { + "epoch": 0.1392, + "grad_norm": 0.4190062148806821, + "learning_rate": 0.00019385049560415794, + "loss": 0.7437, + "step": 261 + }, + { + "epoch": 0.13973333333333332, + "grad_norm": 0.4280626833445742, + "learning_rate": 0.0001937906919003304, + "loss": 0.7799, + "step": 262 + }, + { + "epoch": 0.14026666666666668, + "grad_norm": 0.40012436181706396, + "learning_rate": 0.00019373060812326052, + "loss": 0.7688, + "step": 263 + }, + { + "epoch": 0.1408, + "grad_norm": 0.45684070383709646, + "learning_rate": 0.00019367024445236754, + "loss": 0.8578, + "step": 264 + }, + { + "epoch": 0.14133333333333334, + "grad_norm": 0.4242078593000147, + "learning_rate": 0.00019360960106790643, + "loss": 0.8441, + "step": 265 + }, + { + "epoch": 0.14186666666666667, + "grad_norm": 0.44327384760435773, + "learning_rate": 0.0001935486781509677, + "loss": 0.791, + "step": 266 + }, + { + "epoch": 0.1424, + "grad_norm": 0.4129859009310087, + "learning_rate": 0.00019348747588347637, + "loss": 0.7783, + "step": 267 + }, + { + "epoch": 0.14293333333333333, + "grad_norm": 0.46098278051698005, + "learning_rate": 0.00019342599444819168, + "loss": 0.8192, + "step": 268 + }, + { + "epoch": 0.14346666666666666, + "grad_norm": 0.43637705533574567, + "learning_rate": 0.00019336423402870653, + "loss": 0.8436, + "step": 269 + }, + { + "epoch": 0.144, + "grad_norm": 0.38549987157479443, + "learning_rate": 0.00019330219480944694, + "loss": 0.6746, + "step": 270 + }, + { + "epoch": 0.14453333333333335, + "grad_norm": 0.3686476192402348, + "learning_rate": 0.0001932398769756714, + "loss": 0.7152, + "step": 271 + }, + { + "epoch": 0.14506666666666668, + "grad_norm": 0.4476828610032896, + "learning_rate": 0.0001931772807134704, + "loss": 0.7038, + "step": 272 + }, + { + "epoch": 0.1456, + "grad_norm": 0.41171180003013674, + "learning_rate": 0.00019311440620976597, + "loss": 0.7968, + "step": 273 + }, + { + "epoch": 0.14613333333333334, + "grad_norm": 0.402059085652968, + "learning_rate": 0.00019305125365231084, + "loss": 0.7333, + "step": 274 + }, + { + "epoch": 0.14666666666666667, + "grad_norm": 0.4056707425304524, + "learning_rate": 0.00019298782322968815, + "loss": 0.7333, + "step": 275 + }, + { + "epoch": 0.1472, + "grad_norm": 0.43689537157450414, + "learning_rate": 0.0001929241151313108, + "loss": 0.8249, + "step": 276 + }, + { + "epoch": 0.14773333333333333, + "grad_norm": 0.4538939992672516, + "learning_rate": 0.0001928601295474208, + "loss": 0.7963, + "step": 277 + }, + { + "epoch": 0.14826666666666666, + "grad_norm": 0.40284862318826553, + "learning_rate": 0.00019279586666908884, + "loss": 0.7815, + "step": 278 + }, + { + "epoch": 0.1488, + "grad_norm": 0.4216230725957816, + "learning_rate": 0.00019273132668821364, + "loss": 0.7518, + "step": 279 + }, + { + "epoch": 0.14933333333333335, + "grad_norm": 0.3900139810053542, + "learning_rate": 0.00019266650979752136, + "loss": 0.7589, + "step": 280 + }, + { + "epoch": 0.14986666666666668, + "grad_norm": 0.4149093035848671, + "learning_rate": 0.00019260141619056507, + "loss": 0.7943, + "step": 281 + }, + { + "epoch": 0.1504, + "grad_norm": 0.48297996377698577, + "learning_rate": 0.00019253604606172417, + "loss": 0.9175, + "step": 282 + }, + { + "epoch": 0.15093333333333334, + "grad_norm": 0.4351076487672295, + "learning_rate": 0.0001924703996062038, + "loss": 0.8047, + "step": 283 + }, + { + "epoch": 0.15146666666666667, + "grad_norm": 0.39083187688691845, + "learning_rate": 0.0001924044770200342, + "loss": 0.742, + "step": 284 + }, + { + "epoch": 0.152, + "grad_norm": 0.48315077907731574, + "learning_rate": 0.00019233827850007027, + "loss": 0.8193, + "step": 285 + }, + { + "epoch": 0.15253333333333333, + "grad_norm": 0.46103679574880274, + "learning_rate": 0.0001922718042439908, + "loss": 0.8755, + "step": 286 + }, + { + "epoch": 0.15306666666666666, + "grad_norm": 0.43190474218702174, + "learning_rate": 0.000192205054450298, + "loss": 0.7402, + "step": 287 + }, + { + "epoch": 0.1536, + "grad_norm": 0.4417586707445263, + "learning_rate": 0.00019213802931831696, + "loss": 0.825, + "step": 288 + }, + { + "epoch": 0.15413333333333334, + "grad_norm": 0.5454072533865187, + "learning_rate": 0.00019207072904819486, + "loss": 0.8484, + "step": 289 + }, + { + "epoch": 0.15466666666666667, + "grad_norm": 0.45900909358959874, + "learning_rate": 0.00019200315384090044, + "loss": 0.8793, + "step": 290 + }, + { + "epoch": 0.1552, + "grad_norm": 0.4418764722298391, + "learning_rate": 0.00019193530389822363, + "loss": 0.7728, + "step": 291 + }, + { + "epoch": 0.15573333333333333, + "grad_norm": 0.41682311824111307, + "learning_rate": 0.00019186717942277462, + "loss": 0.8164, + "step": 292 + }, + { + "epoch": 0.15626666666666666, + "grad_norm": 0.4920244659928006, + "learning_rate": 0.00019179878061798347, + "loss": 0.8679, + "step": 293 + }, + { + "epoch": 0.1568, + "grad_norm": 0.36330437546048105, + "learning_rate": 0.00019173010768809933, + "loss": 0.692, + "step": 294 + }, + { + "epoch": 0.15733333333333333, + "grad_norm": 0.4142764492365718, + "learning_rate": 0.00019166116083819002, + "loss": 0.8216, + "step": 295 + }, + { + "epoch": 0.15786666666666666, + "grad_norm": 0.498936164950068, + "learning_rate": 0.00019159194027414128, + "loss": 0.78, + "step": 296 + }, + { + "epoch": 0.1584, + "grad_norm": 0.42441210039175953, + "learning_rate": 0.0001915224462026563, + "loss": 0.8191, + "step": 297 + }, + { + "epoch": 0.15893333333333334, + "grad_norm": 0.42509304195789493, + "learning_rate": 0.00019145267883125482, + "loss": 0.7581, + "step": 298 + }, + { + "epoch": 0.15946666666666667, + "grad_norm": 0.4267126964232961, + "learning_rate": 0.00019138263836827288, + "loss": 0.8104, + "step": 299 + }, + { + "epoch": 0.16, + "grad_norm": 0.4226704907969013, + "learning_rate": 0.00019131232502286188, + "loss": 0.8098, + "step": 300 + }, + { + "epoch": 0.16053333333333333, + "grad_norm": 0.4798403273069166, + "learning_rate": 0.00019124173900498818, + "loss": 0.8074, + "step": 301 + }, + { + "epoch": 0.16106666666666666, + "grad_norm": 0.44649240559336156, + "learning_rate": 0.00019117088052543233, + "loss": 0.7971, + "step": 302 + }, + { + "epoch": 0.1616, + "grad_norm": 0.4961604744806426, + "learning_rate": 0.0001910997497957885, + "loss": 0.8425, + "step": 303 + }, + { + "epoch": 0.16213333333333332, + "grad_norm": 0.35220160746231455, + "learning_rate": 0.00019102834702846387, + "loss": 0.7055, + "step": 304 + }, + { + "epoch": 0.16266666666666665, + "grad_norm": 0.43901112282862115, + "learning_rate": 0.0001909566724366779, + "loss": 0.7687, + "step": 305 + }, + { + "epoch": 0.1632, + "grad_norm": 0.4341322213872506, + "learning_rate": 0.00019088472623446183, + "loss": 0.7817, + "step": 306 + }, + { + "epoch": 0.16373333333333334, + "grad_norm": 0.39210795988100916, + "learning_rate": 0.00019081250863665794, + "loss": 0.7374, + "step": 307 + }, + { + "epoch": 0.16426666666666667, + "grad_norm": 0.4249436264642188, + "learning_rate": 0.0001907400198589189, + "loss": 0.8085, + "step": 308 + }, + { + "epoch": 0.1648, + "grad_norm": 0.40231474289508434, + "learning_rate": 0.00019066726011770726, + "loss": 0.7013, + "step": 309 + }, + { + "epoch": 0.16533333333333333, + "grad_norm": 0.46152241459367194, + "learning_rate": 0.00019059422963029464, + "loss": 0.7498, + "step": 310 + }, + { + "epoch": 0.16586666666666666, + "grad_norm": 0.44690053275739583, + "learning_rate": 0.0001905209286147611, + "loss": 0.7858, + "step": 311 + }, + { + "epoch": 0.1664, + "grad_norm": 0.4569472159606826, + "learning_rate": 0.0001904473572899947, + "loss": 0.8488, + "step": 312 + }, + { + "epoch": 0.16693333333333332, + "grad_norm": 0.3862654675484178, + "learning_rate": 0.0001903735158756905, + "loss": 0.7148, + "step": 313 + }, + { + "epoch": 0.16746666666666668, + "grad_norm": 0.4469532424952468, + "learning_rate": 0.0001902994045923502, + "loss": 0.9058, + "step": 314 + }, + { + "epoch": 0.168, + "grad_norm": 0.41512658359168486, + "learning_rate": 0.00019022502366128135, + "loss": 0.7564, + "step": 315 + }, + { + "epoch": 0.16853333333333334, + "grad_norm": 0.43849788868737033, + "learning_rate": 0.0001901503733045967, + "loss": 0.8146, + "step": 316 + }, + { + "epoch": 0.16906666666666667, + "grad_norm": 0.42578035574036255, + "learning_rate": 0.00019007545374521355, + "loss": 0.7647, + "step": 317 + }, + { + "epoch": 0.1696, + "grad_norm": 0.4287376466249776, + "learning_rate": 0.00019000026520685302, + "loss": 0.7968, + "step": 318 + }, + { + "epoch": 0.17013333333333333, + "grad_norm": 0.3998036758901615, + "learning_rate": 0.00018992480791403958, + "loss": 0.7385, + "step": 319 + }, + { + "epoch": 0.17066666666666666, + "grad_norm": 0.43530756222877093, + "learning_rate": 0.0001898490820921001, + "loss": 0.7785, + "step": 320 + }, + { + "epoch": 0.1712, + "grad_norm": 0.44817237110630764, + "learning_rate": 0.0001897730879671634, + "loss": 0.7894, + "step": 321 + }, + { + "epoch": 0.17173333333333332, + "grad_norm": 0.4045125640797333, + "learning_rate": 0.0001896968257661595, + "loss": 0.7022, + "step": 322 + }, + { + "epoch": 0.17226666666666668, + "grad_norm": 0.38106638033280676, + "learning_rate": 0.00018962029571681886, + "loss": 0.7275, + "step": 323 + }, + { + "epoch": 0.1728, + "grad_norm": 0.4322287827747288, + "learning_rate": 0.00018954349804767184, + "loss": 0.7832, + "step": 324 + }, + { + "epoch": 0.17333333333333334, + "grad_norm": 0.3840790514719407, + "learning_rate": 0.00018946643298804793, + "loss": 0.7287, + "step": 325 + }, + { + "epoch": 0.17386666666666667, + "grad_norm": 0.4923056202155615, + "learning_rate": 0.00018938910076807513, + "loss": 0.8511, + "step": 326 + }, + { + "epoch": 0.1744, + "grad_norm": 0.38688686764024316, + "learning_rate": 0.00018931150161867916, + "loss": 0.6976, + "step": 327 + }, + { + "epoch": 0.17493333333333333, + "grad_norm": 0.4228516097817739, + "learning_rate": 0.0001892336357715829, + "loss": 0.7898, + "step": 328 + }, + { + "epoch": 0.17546666666666666, + "grad_norm": 0.3591065525009827, + "learning_rate": 0.0001891555034593055, + "loss": 0.6503, + "step": 329 + }, + { + "epoch": 0.176, + "grad_norm": 0.37535288131691275, + "learning_rate": 0.00018907710491516199, + "loss": 0.7257, + "step": 330 + }, + { + "epoch": 0.17653333333333332, + "grad_norm": 0.4189502201973536, + "learning_rate": 0.00018899844037326225, + "loss": 0.8156, + "step": 331 + }, + { + "epoch": 0.17706666666666668, + "grad_norm": 0.41085459937546664, + "learning_rate": 0.0001889195100685106, + "loss": 0.7641, + "step": 332 + }, + { + "epoch": 0.1776, + "grad_norm": 0.41399903331638765, + "learning_rate": 0.0001888403142366049, + "loss": 0.7879, + "step": 333 + }, + { + "epoch": 0.17813333333333334, + "grad_norm": 0.42894816323716317, + "learning_rate": 0.00018876085311403593, + "loss": 0.8017, + "step": 334 + }, + { + "epoch": 0.17866666666666667, + "grad_norm": 0.49675744246390297, + "learning_rate": 0.00018868112693808665, + "loss": 0.8291, + "step": 335 + }, + { + "epoch": 0.1792, + "grad_norm": 0.460752202699587, + "learning_rate": 0.00018860113594683148, + "loss": 0.8788, + "step": 336 + }, + { + "epoch": 0.17973333333333333, + "grad_norm": 0.4637953576214901, + "learning_rate": 0.00018852088037913577, + "loss": 0.8106, + "step": 337 + }, + { + "epoch": 0.18026666666666666, + "grad_norm": 0.4201668333522856, + "learning_rate": 0.0001884403604746547, + "loss": 0.773, + "step": 338 + }, + { + "epoch": 0.1808, + "grad_norm": 0.43774102524247854, + "learning_rate": 0.00018835957647383303, + "loss": 0.8039, + "step": 339 + }, + { + "epoch": 0.18133333333333335, + "grad_norm": 0.3587546918505062, + "learning_rate": 0.00018827852861790398, + "loss": 0.705, + "step": 340 + }, + { + "epoch": 0.18186666666666668, + "grad_norm": 0.4126796284740049, + "learning_rate": 0.00018819721714888877, + "loss": 0.7536, + "step": 341 + }, + { + "epoch": 0.1824, + "grad_norm": 0.39217298838322884, + "learning_rate": 0.00018811564230959588, + "loss": 0.7286, + "step": 342 + }, + { + "epoch": 0.18293333333333334, + "grad_norm": 0.4216981617215113, + "learning_rate": 0.00018803380434362, + "loss": 0.7836, + "step": 343 + }, + { + "epoch": 0.18346666666666667, + "grad_norm": 0.427309007762194, + "learning_rate": 0.0001879517034953418, + "loss": 0.8332, + "step": 344 + }, + { + "epoch": 0.184, + "grad_norm": 0.4393362068035181, + "learning_rate": 0.00018786934000992688, + "loss": 0.7638, + "step": 345 + }, + { + "epoch": 0.18453333333333333, + "grad_norm": 0.4563013922481514, + "learning_rate": 0.00018778671413332513, + "loss": 0.8204, + "step": 346 + }, + { + "epoch": 0.18506666666666666, + "grad_norm": 0.4069115922933758, + "learning_rate": 0.00018770382611226987, + "loss": 0.8024, + "step": 347 + }, + { + "epoch": 0.1856, + "grad_norm": 0.4019532370457461, + "learning_rate": 0.00018762067619427746, + "loss": 0.7084, + "step": 348 + }, + { + "epoch": 0.18613333333333335, + "grad_norm": 0.399054521750121, + "learning_rate": 0.000187537264627646, + "loss": 0.6556, + "step": 349 + }, + { + "epoch": 0.18666666666666668, + "grad_norm": 0.41008227332881475, + "learning_rate": 0.00018745359166145523, + "loss": 0.7185, + "step": 350 + }, + { + "epoch": 0.1872, + "grad_norm": 0.41658475803580297, + "learning_rate": 0.00018736965754556528, + "loss": 0.7656, + "step": 351 + }, + { + "epoch": 0.18773333333333334, + "grad_norm": 0.4615586902622352, + "learning_rate": 0.00018728546253061614, + "loss": 0.7824, + "step": 352 + }, + { + "epoch": 0.18826666666666667, + "grad_norm": 0.5238526606339945, + "learning_rate": 0.00018720100686802694, + "loss": 0.8099, + "step": 353 + }, + { + "epoch": 0.1888, + "grad_norm": 0.42702857946584744, + "learning_rate": 0.00018711629080999504, + "loss": 0.7898, + "step": 354 + }, + { + "epoch": 0.18933333333333333, + "grad_norm": 0.4866882638590615, + "learning_rate": 0.00018703131460949554, + "loss": 0.8483, + "step": 355 + }, + { + "epoch": 0.18986666666666666, + "grad_norm": 0.4276742751000265, + "learning_rate": 0.0001869460785202802, + "loss": 0.7809, + "step": 356 + }, + { + "epoch": 0.1904, + "grad_norm": 0.4100965474634774, + "learning_rate": 0.00018686058279687698, + "loss": 0.7363, + "step": 357 + }, + { + "epoch": 0.19093333333333334, + "grad_norm": 0.4664754062600203, + "learning_rate": 0.00018677482769458904, + "loss": 0.7483, + "step": 358 + }, + { + "epoch": 0.19146666666666667, + "grad_norm": 0.37361963672851983, + "learning_rate": 0.00018668881346949417, + "loss": 0.717, + "step": 359 + }, + { + "epoch": 0.192, + "grad_norm": 0.43233657296858197, + "learning_rate": 0.00018660254037844388, + "loss": 0.7615, + "step": 360 + }, + { + "epoch": 0.19253333333333333, + "grad_norm": 0.44458200899568584, + "learning_rate": 0.00018651600867906272, + "loss": 0.7609, + "step": 361 + }, + { + "epoch": 0.19306666666666666, + "grad_norm": 0.4048535668894861, + "learning_rate": 0.00018642921862974742, + "loss": 0.7531, + "step": 362 + }, + { + "epoch": 0.1936, + "grad_norm": 0.4045867025921511, + "learning_rate": 0.00018634217048966637, + "loss": 0.757, + "step": 363 + }, + { + "epoch": 0.19413333333333332, + "grad_norm": 0.4605392568978197, + "learning_rate": 0.00018625486451875843, + "loss": 0.8182, + "step": 364 + }, + { + "epoch": 0.19466666666666665, + "grad_norm": 0.4769558757285831, + "learning_rate": 0.0001861673009777325, + "loss": 0.8203, + "step": 365 + }, + { + "epoch": 0.1952, + "grad_norm": 0.40387606807518167, + "learning_rate": 0.0001860794801280666, + "loss": 0.7414, + "step": 366 + }, + { + "epoch": 0.19573333333333334, + "grad_norm": 0.4492733554879574, + "learning_rate": 0.00018599140223200716, + "loss": 0.7475, + "step": 367 + }, + { + "epoch": 0.19626666666666667, + "grad_norm": 0.4760445921931973, + "learning_rate": 0.0001859030675525681, + "loss": 0.8217, + "step": 368 + }, + { + "epoch": 0.1968, + "grad_norm": 0.4466095157119117, + "learning_rate": 0.0001858144763535302, + "loss": 0.821, + "step": 369 + }, + { + "epoch": 0.19733333333333333, + "grad_norm": 0.36784926334970397, + "learning_rate": 0.0001857256288994402, + "loss": 0.708, + "step": 370 + }, + { + "epoch": 0.19786666666666666, + "grad_norm": 0.4664739978613938, + "learning_rate": 0.00018563652545561013, + "loss": 0.7819, + "step": 371 + }, + { + "epoch": 0.1984, + "grad_norm": 0.48964202895281556, + "learning_rate": 0.0001855471662881164, + "loss": 0.8729, + "step": 372 + }, + { + "epoch": 0.19893333333333332, + "grad_norm": 0.40185566940796785, + "learning_rate": 0.000185457551663799, + "loss": 0.767, + "step": 373 + }, + { + "epoch": 0.19946666666666665, + "grad_norm": 0.43300712005598424, + "learning_rate": 0.00018536768185026083, + "loss": 0.8244, + "step": 374 + }, + { + "epoch": 0.2, + "grad_norm": 0.4275700118931833, + "learning_rate": 0.00018527755711586678, + "loss": 0.8337, + "step": 375 + }, + { + "epoch": 0.20053333333333334, + "grad_norm": 0.4206666916148894, + "learning_rate": 0.00018518717772974302, + "loss": 0.7714, + "step": 376 + }, + { + "epoch": 0.20106666666666667, + "grad_norm": 0.4836231192137557, + "learning_rate": 0.00018509654396177609, + "loss": 0.7585, + "step": 377 + }, + { + "epoch": 0.2016, + "grad_norm": 0.42887079426279345, + "learning_rate": 0.00018500565608261214, + "loss": 0.7611, + "step": 378 + }, + { + "epoch": 0.20213333333333333, + "grad_norm": 0.3923492742652943, + "learning_rate": 0.00018491451436365627, + "loss": 0.7655, + "step": 379 + }, + { + "epoch": 0.20266666666666666, + "grad_norm": 0.37381754593752975, + "learning_rate": 0.0001848231190770714, + "loss": 0.7016, + "step": 380 + }, + { + "epoch": 0.2032, + "grad_norm": 0.45699872442911477, + "learning_rate": 0.00018473147049577774, + "loss": 0.7581, + "step": 381 + }, + { + "epoch": 0.20373333333333332, + "grad_norm": 0.4103872069758534, + "learning_rate": 0.00018463956889345194, + "loss": 0.7452, + "step": 382 + }, + { + "epoch": 0.20426666666666668, + "grad_norm": 0.4019382931211576, + "learning_rate": 0.00018454741454452603, + "loss": 0.7456, + "step": 383 + }, + { + "epoch": 0.2048, + "grad_norm": 0.4308887476747637, + "learning_rate": 0.00018445500772418697, + "loss": 0.7786, + "step": 384 + }, + { + "epoch": 0.20533333333333334, + "grad_norm": 0.4349446345312603, + "learning_rate": 0.00018436234870837547, + "loss": 0.8067, + "step": 385 + }, + { + "epoch": 0.20586666666666667, + "grad_norm": 0.4169251670596901, + "learning_rate": 0.00018426943777378552, + "loss": 0.7929, + "step": 386 + }, + { + "epoch": 0.2064, + "grad_norm": 0.4264377711670068, + "learning_rate": 0.00018417627519786315, + "loss": 0.7701, + "step": 387 + }, + { + "epoch": 0.20693333333333333, + "grad_norm": 0.43985200886507086, + "learning_rate": 0.00018408286125880604, + "loss": 0.7834, + "step": 388 + }, + { + "epoch": 0.20746666666666666, + "grad_norm": 0.4209543131074095, + "learning_rate": 0.00018398919623556238, + "loss": 0.8136, + "step": 389 + }, + { + "epoch": 0.208, + "grad_norm": 0.36892798632147605, + "learning_rate": 0.00018389528040783012, + "loss": 0.7237, + "step": 390 + }, + { + "epoch": 0.20853333333333332, + "grad_norm": 0.4567328302763399, + "learning_rate": 0.0001838011140560562, + "loss": 0.7535, + "step": 391 + }, + { + "epoch": 0.20906666666666668, + "grad_norm": 0.42234184406921543, + "learning_rate": 0.00018370669746143564, + "loss": 0.7248, + "step": 392 + }, + { + "epoch": 0.2096, + "grad_norm": 0.41738156945110905, + "learning_rate": 0.00018361203090591071, + "loss": 0.7513, + "step": 393 + }, + { + "epoch": 0.21013333333333334, + "grad_norm": 0.41387492453563235, + "learning_rate": 0.0001835171146721701, + "loss": 0.762, + "step": 394 + }, + { + "epoch": 0.21066666666666667, + "grad_norm": 0.40940820687224067, + "learning_rate": 0.00018342194904364813, + "loss": 0.7426, + "step": 395 + }, + { + "epoch": 0.2112, + "grad_norm": 0.41896736068507107, + "learning_rate": 0.00018332653430452376, + "loss": 0.7974, + "step": 396 + }, + { + "epoch": 0.21173333333333333, + "grad_norm": 0.42726819872348015, + "learning_rate": 0.00018323087073971993, + "loss": 0.7633, + "step": 397 + }, + { + "epoch": 0.21226666666666666, + "grad_norm": 0.483129624128463, + "learning_rate": 0.00018313495863490258, + "loss": 0.7791, + "step": 398 + }, + { + "epoch": 0.2128, + "grad_norm": 0.42936893725682196, + "learning_rate": 0.00018303879827647975, + "loss": 0.6965, + "step": 399 + }, + { + "epoch": 0.21333333333333335, + "grad_norm": 0.49994335105567145, + "learning_rate": 0.00018294238995160094, + "loss": 0.7847, + "step": 400 + }, + { + "epoch": 0.21386666666666668, + "grad_norm": 0.42322594921195633, + "learning_rate": 0.00018284573394815597, + "loss": 0.7118, + "step": 401 + }, + { + "epoch": 0.2144, + "grad_norm": 0.48254626226918723, + "learning_rate": 0.00018274883055477436, + "loss": 0.829, + "step": 402 + }, + { + "epoch": 0.21493333333333334, + "grad_norm": 0.39378361115420063, + "learning_rate": 0.00018265168006082437, + "loss": 0.6769, + "step": 403 + }, + { + "epoch": 0.21546666666666667, + "grad_norm": 0.47535999469541607, + "learning_rate": 0.00018255428275641214, + "loss": 0.768, + "step": 404 + }, + { + "epoch": 0.216, + "grad_norm": 0.41693174477042727, + "learning_rate": 0.00018245663893238075, + "loss": 0.7807, + "step": 405 + }, + { + "epoch": 0.21653333333333333, + "grad_norm": 0.48129339335753435, + "learning_rate": 0.0001823587488803095, + "loss": 0.8272, + "step": 406 + }, + { + "epoch": 0.21706666666666666, + "grad_norm": 0.41287072690689025, + "learning_rate": 0.00018226061289251298, + "loss": 0.7748, + "step": 407 + }, + { + "epoch": 0.2176, + "grad_norm": 0.4398542226304193, + "learning_rate": 0.00018216223126204007, + "loss": 0.7408, + "step": 408 + }, + { + "epoch": 0.21813333333333335, + "grad_norm": 0.41730831330764306, + "learning_rate": 0.00018206360428267332, + "loss": 0.7336, + "step": 409 + }, + { + "epoch": 0.21866666666666668, + "grad_norm": 0.421603882985244, + "learning_rate": 0.00018196473224892784, + "loss": 0.7724, + "step": 410 + }, + { + "epoch": 0.2192, + "grad_norm": 0.4283142935953124, + "learning_rate": 0.00018186561545605054, + "loss": 0.7544, + "step": 411 + }, + { + "epoch": 0.21973333333333334, + "grad_norm": 0.44427983108831265, + "learning_rate": 0.0001817662542000192, + "loss": 0.7871, + "step": 412 + }, + { + "epoch": 0.22026666666666667, + "grad_norm": 0.4638134985671546, + "learning_rate": 0.0001816666487775416, + "loss": 0.7739, + "step": 413 + }, + { + "epoch": 0.2208, + "grad_norm": 0.41864721891489537, + "learning_rate": 0.00018156679948605467, + "loss": 0.763, + "step": 414 + }, + { + "epoch": 0.22133333333333333, + "grad_norm": 0.3889516114988014, + "learning_rate": 0.00018146670662372354, + "loss": 0.7374, + "step": 415 + }, + { + "epoch": 0.22186666666666666, + "grad_norm": 0.42782093903969304, + "learning_rate": 0.0001813663704894407, + "loss": 0.7829, + "step": 416 + }, + { + "epoch": 0.2224, + "grad_norm": 0.4301643227706815, + "learning_rate": 0.00018126579138282503, + "loss": 0.743, + "step": 417 + }, + { + "epoch": 0.22293333333333334, + "grad_norm": 0.3965266913429562, + "learning_rate": 0.00018116496960422107, + "loss": 0.7429, + "step": 418 + }, + { + "epoch": 0.22346666666666667, + "grad_norm": 0.43728064114296455, + "learning_rate": 0.00018106390545469795, + "loss": 0.7701, + "step": 419 + }, + { + "epoch": 0.224, + "grad_norm": 0.3620570638238654, + "learning_rate": 0.0001809625992360485, + "loss": 0.6786, + "step": 420 + }, + { + "epoch": 0.22453333333333333, + "grad_norm": 0.45283954918424174, + "learning_rate": 0.00018086105125078857, + "loss": 0.8196, + "step": 421 + }, + { + "epoch": 0.22506666666666666, + "grad_norm": 0.4343397005833213, + "learning_rate": 0.00018075926180215576, + "loss": 0.7969, + "step": 422 + }, + { + "epoch": 0.2256, + "grad_norm": 0.4079287424387631, + "learning_rate": 0.00018065723119410884, + "loss": 0.7261, + "step": 423 + }, + { + "epoch": 0.22613333333333333, + "grad_norm": 0.4222242988822073, + "learning_rate": 0.0001805549597313267, + "loss": 0.8046, + "step": 424 + }, + { + "epoch": 0.22666666666666666, + "grad_norm": 0.4293398231847534, + "learning_rate": 0.0001804524477192075, + "loss": 0.7713, + "step": 425 + }, + { + "epoch": 0.2272, + "grad_norm": 0.4798233731304898, + "learning_rate": 0.00018034969546386757, + "loss": 0.8631, + "step": 426 + }, + { + "epoch": 0.22773333333333334, + "grad_norm": 0.37203957607612326, + "learning_rate": 0.00018024670327214084, + "loss": 0.6513, + "step": 427 + }, + { + "epoch": 0.22826666666666667, + "grad_norm": 0.38700273162228754, + "learning_rate": 0.00018014347145157755, + "loss": 0.7383, + "step": 428 + }, + { + "epoch": 0.2288, + "grad_norm": 0.39274314684924067, + "learning_rate": 0.0001800400003104436, + "loss": 0.7127, + "step": 429 + }, + { + "epoch": 0.22933333333333333, + "grad_norm": 0.4220019961457572, + "learning_rate": 0.0001799362901577196, + "loss": 0.7666, + "step": 430 + }, + { + "epoch": 0.22986666666666666, + "grad_norm": 0.40725013616980993, + "learning_rate": 0.00017983234130309968, + "loss": 0.7364, + "step": 431 + }, + { + "epoch": 0.2304, + "grad_norm": 0.4210188085980111, + "learning_rate": 0.00017972815405699103, + "loss": 0.7196, + "step": 432 + }, + { + "epoch": 0.23093333333333332, + "grad_norm": 0.38157409826429195, + "learning_rate": 0.00017962372873051252, + "loss": 0.7486, + "step": 433 + }, + { + "epoch": 0.23146666666666665, + "grad_norm": 0.4619994851529084, + "learning_rate": 0.00017951906563549397, + "loss": 0.8306, + "step": 434 + }, + { + "epoch": 0.232, + "grad_norm": 0.3870858054183792, + "learning_rate": 0.00017941416508447536, + "loss": 0.7092, + "step": 435 + }, + { + "epoch": 0.23253333333333334, + "grad_norm": 0.44663797567474073, + "learning_rate": 0.00017930902739070562, + "loss": 0.8126, + "step": 436 + }, + { + "epoch": 0.23306666666666667, + "grad_norm": 0.43388290831328646, + "learning_rate": 0.00017920365286814183, + "loss": 0.7088, + "step": 437 + }, + { + "epoch": 0.2336, + "grad_norm": 0.4699874469397872, + "learning_rate": 0.0001790980418314484, + "loss": 0.7455, + "step": 438 + }, + { + "epoch": 0.23413333333333333, + "grad_norm": 0.4404749844802004, + "learning_rate": 0.0001789921945959958, + "loss": 0.7763, + "step": 439 + }, + { + "epoch": 0.23466666666666666, + "grad_norm": 0.42788428607178386, + "learning_rate": 0.00017888611147786002, + "loss": 0.7964, + "step": 440 + }, + { + "epoch": 0.2352, + "grad_norm": 0.3989626138037562, + "learning_rate": 0.00017877979279382135, + "loss": 0.7574, + "step": 441 + }, + { + "epoch": 0.23573333333333332, + "grad_norm": 0.39554981991833266, + "learning_rate": 0.00017867323886136348, + "loss": 0.7237, + "step": 442 + }, + { + "epoch": 0.23626666666666668, + "grad_norm": 0.38639141646450237, + "learning_rate": 0.00017856644999867264, + "loss": 0.6986, + "step": 443 + }, + { + "epoch": 0.2368, + "grad_norm": 0.44363977875016986, + "learning_rate": 0.0001784594265246366, + "loss": 0.8224, + "step": 444 + }, + { + "epoch": 0.23733333333333334, + "grad_norm": 0.39983749874740654, + "learning_rate": 0.00017835216875884368, + "loss": 0.7154, + "step": 445 + }, + { + "epoch": 0.23786666666666667, + "grad_norm": 0.39074455401397934, + "learning_rate": 0.0001782446770215819, + "loss": 0.6708, + "step": 446 + }, + { + "epoch": 0.2384, + "grad_norm": 0.4382862374773125, + "learning_rate": 0.0001781369516338378, + "loss": 0.7888, + "step": 447 + }, + { + "epoch": 0.23893333333333333, + "grad_norm": 0.45623092899339585, + "learning_rate": 0.00017802899291729585, + "loss": 0.7406, + "step": 448 + }, + { + "epoch": 0.23946666666666666, + "grad_norm": 0.35555959303217444, + "learning_rate": 0.0001779208011943371, + "loss": 0.6977, + "step": 449 + }, + { + "epoch": 0.24, + "grad_norm": 0.3764169462428527, + "learning_rate": 0.00017781237678803847, + "loss": 0.7668, + "step": 450 + }, + { + "epoch": 0.24053333333333332, + "grad_norm": 0.44188810666729844, + "learning_rate": 0.00017770372002217172, + "loss": 0.7776, + "step": 451 + }, + { + "epoch": 0.24106666666666668, + "grad_norm": 0.4061500921489558, + "learning_rate": 0.00017759483122120238, + "loss": 0.7578, + "step": 452 + }, + { + "epoch": 0.2416, + "grad_norm": 0.38125870005018797, + "learning_rate": 0.000177485710710289, + "loss": 0.6708, + "step": 453 + }, + { + "epoch": 0.24213333333333334, + "grad_norm": 0.3786000392085864, + "learning_rate": 0.00017737635881528196, + "loss": 0.743, + "step": 454 + }, + { + "epoch": 0.24266666666666667, + "grad_norm": 0.44304176475420287, + "learning_rate": 0.00017726677586272263, + "loss": 0.7115, + "step": 455 + }, + { + "epoch": 0.2432, + "grad_norm": 0.44540392013839997, + "learning_rate": 0.00017715696217984235, + "loss": 0.855, + "step": 456 + }, + { + "epoch": 0.24373333333333333, + "grad_norm": 0.35413155866160545, + "learning_rate": 0.00017704691809456143, + "loss": 0.6517, + "step": 457 + }, + { + "epoch": 0.24426666666666666, + "grad_norm": 0.4198272028109067, + "learning_rate": 0.0001769366439354882, + "loss": 0.7056, + "step": 458 + }, + { + "epoch": 0.2448, + "grad_norm": 0.45676985184117036, + "learning_rate": 0.00017682614003191807, + "loss": 0.7924, + "step": 459 + }, + { + "epoch": 0.24533333333333332, + "grad_norm": 0.38917203783210363, + "learning_rate": 0.00017671540671383243, + "loss": 0.6827, + "step": 460 + }, + { + "epoch": 0.24586666666666668, + "grad_norm": 0.4621210509235434, + "learning_rate": 0.0001766044443118978, + "loss": 0.8037, + "step": 461 + }, + { + "epoch": 0.2464, + "grad_norm": 0.39189614997837624, + "learning_rate": 0.00017649325315746478, + "loss": 0.6711, + "step": 462 + }, + { + "epoch": 0.24693333333333334, + "grad_norm": 0.42029604735547044, + "learning_rate": 0.00017638183358256696, + "loss": 0.7949, + "step": 463 + }, + { + "epoch": 0.24746666666666667, + "grad_norm": 0.38936531570922495, + "learning_rate": 0.00017627018591992018, + "loss": 0.7519, + "step": 464 + }, + { + "epoch": 0.248, + "grad_norm": 0.378696399369877, + "learning_rate": 0.0001761583105029213, + "loss": 0.7308, + "step": 465 + }, + { + "epoch": 0.24853333333333333, + "grad_norm": 0.39504712205717357, + "learning_rate": 0.00017604620766564723, + "loss": 0.7411, + "step": 466 + }, + { + "epoch": 0.24906666666666666, + "grad_norm": 0.3718348859138365, + "learning_rate": 0.00017593387774285412, + "loss": 0.7069, + "step": 467 + }, + { + "epoch": 0.2496, + "grad_norm": 0.3962793562795794, + "learning_rate": 0.00017582132106997616, + "loss": 0.7673, + "step": 468 + }, + { + "epoch": 0.2501333333333333, + "grad_norm": 0.37748344421953184, + "learning_rate": 0.0001757085379831246, + "loss": 0.7166, + "step": 469 + }, + { + "epoch": 0.25066666666666665, + "grad_norm": 0.44195969497521115, + "learning_rate": 0.00017559552881908695, + "loss": 0.7465, + "step": 470 + }, + { + "epoch": 0.2512, + "grad_norm": 0.46254545669392455, + "learning_rate": 0.00017548229391532572, + "loss": 0.7834, + "step": 471 + }, + { + "epoch": 0.2517333333333333, + "grad_norm": 0.40132248932853, + "learning_rate": 0.00017536883360997743, + "loss": 0.6922, + "step": 472 + }, + { + "epoch": 0.25226666666666664, + "grad_norm": 0.45874268019124786, + "learning_rate": 0.00017525514824185185, + "loss": 0.7523, + "step": 473 + }, + { + "epoch": 0.2528, + "grad_norm": 0.469745709659557, + "learning_rate": 0.00017514123815043074, + "loss": 0.756, + "step": 474 + }, + { + "epoch": 0.25333333333333335, + "grad_norm": 0.3987742896758139, + "learning_rate": 0.00017502710367586687, + "loss": 0.7892, + "step": 475 + }, + { + "epoch": 0.2538666666666667, + "grad_norm": 0.41833025224807635, + "learning_rate": 0.0001749127451589832, + "loss": 0.7323, + "step": 476 + }, + { + "epoch": 0.2544, + "grad_norm": 0.45771691916950347, + "learning_rate": 0.00017479816294127152, + "loss": 0.8732, + "step": 477 + }, + { + "epoch": 0.25493333333333335, + "grad_norm": 0.4326273075120264, + "learning_rate": 0.00017468335736489177, + "loss": 0.7967, + "step": 478 + }, + { + "epoch": 0.2554666666666667, + "grad_norm": 0.42879572658964055, + "learning_rate": 0.00017456832877267084, + "loss": 0.7728, + "step": 479 + }, + { + "epoch": 0.256, + "grad_norm": 0.42443080229742036, + "learning_rate": 0.0001744530775081015, + "loss": 0.6936, + "step": 480 + }, + { + "epoch": 0.25653333333333334, + "grad_norm": 0.3775595399172083, + "learning_rate": 0.00017433760391534167, + "loss": 0.7445, + "step": 481 + }, + { + "epoch": 0.25706666666666667, + "grad_norm": 0.37562992179622606, + "learning_rate": 0.00017422190833921283, + "loss": 0.6713, + "step": 482 + }, + { + "epoch": 0.2576, + "grad_norm": 0.4183413380348096, + "learning_rate": 0.0001741059911251997, + "loss": 0.7204, + "step": 483 + }, + { + "epoch": 0.2581333333333333, + "grad_norm": 0.4047559155366429, + "learning_rate": 0.00017398985261944856, + "loss": 0.6899, + "step": 484 + }, + { + "epoch": 0.25866666666666666, + "grad_norm": 0.40330462953906643, + "learning_rate": 0.00017387349316876666, + "loss": 0.7682, + "step": 485 + }, + { + "epoch": 0.2592, + "grad_norm": 0.40961753152529895, + "learning_rate": 0.000173756913120621, + "loss": 0.731, + "step": 486 + }, + { + "epoch": 0.2597333333333333, + "grad_norm": 0.46143460044277884, + "learning_rate": 0.0001736401128231373, + "loss": 0.8323, + "step": 487 + }, + { + "epoch": 0.26026666666666665, + "grad_norm": 0.3902872125584795, + "learning_rate": 0.00017352309262509894, + "loss": 0.7389, + "step": 488 + }, + { + "epoch": 0.2608, + "grad_norm": 0.41083855940817166, + "learning_rate": 0.00017340585287594604, + "loss": 0.7407, + "step": 489 + }, + { + "epoch": 0.2613333333333333, + "grad_norm": 0.4375489330025397, + "learning_rate": 0.0001732883939257742, + "loss": 0.7795, + "step": 490 + }, + { + "epoch": 0.2618666666666667, + "grad_norm": 0.38274951421925013, + "learning_rate": 0.0001731707161253338, + "loss": 0.7433, + "step": 491 + }, + { + "epoch": 0.2624, + "grad_norm": 0.4422124748989928, + "learning_rate": 0.0001730528198260285, + "loss": 0.7672, + "step": 492 + }, + { + "epoch": 0.26293333333333335, + "grad_norm": 0.3959184333586809, + "learning_rate": 0.00017293470537991463, + "loss": 0.6795, + "step": 493 + }, + { + "epoch": 0.2634666666666667, + "grad_norm": 0.6970076830225627, + "learning_rate": 0.00017281637313969978, + "loss": 0.8282, + "step": 494 + }, + { + "epoch": 0.264, + "grad_norm": 0.43077764137115643, + "learning_rate": 0.00017269782345874203, + "loss": 0.7355, + "step": 495 + }, + { + "epoch": 0.26453333333333334, + "grad_norm": 0.37717081863037677, + "learning_rate": 0.00017257905669104874, + "loss": 0.703, + "step": 496 + }, + { + "epoch": 0.2650666666666667, + "grad_norm": 0.47406791963685385, + "learning_rate": 0.00017246007319127545, + "loss": 0.7975, + "step": 497 + }, + { + "epoch": 0.2656, + "grad_norm": 0.4078671203332442, + "learning_rate": 0.00017234087331472497, + "loss": 0.7517, + "step": 498 + }, + { + "epoch": 0.26613333333333333, + "grad_norm": 0.467097270780955, + "learning_rate": 0.00017222145741734626, + "loss": 0.7894, + "step": 499 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 0.3787523725179631, + "learning_rate": 0.00017210182585573327, + "loss": 0.6397, + "step": 500 + }, + { + "epoch": 0.2672, + "grad_norm": 0.43220939878711917, + "learning_rate": 0.00017198197898712404, + "loss": 0.7411, + "step": 501 + }, + { + "epoch": 0.2677333333333333, + "grad_norm": 0.41565474656753976, + "learning_rate": 0.00017186191716939944, + "loss": 0.7615, + "step": 502 + }, + { + "epoch": 0.26826666666666665, + "grad_norm": 0.49033042942470395, + "learning_rate": 0.0001717416407610824, + "loss": 0.8386, + "step": 503 + }, + { + "epoch": 0.2688, + "grad_norm": 0.4011579642645517, + "learning_rate": 0.00017162115012133643, + "loss": 0.722, + "step": 504 + }, + { + "epoch": 0.2693333333333333, + "grad_norm": 0.45892566198535245, + "learning_rate": 0.00017150044560996488, + "loss": 0.8441, + "step": 505 + }, + { + "epoch": 0.26986666666666664, + "grad_norm": 0.41270635318007476, + "learning_rate": 0.00017137952758740978, + "loss": 0.7665, + "step": 506 + }, + { + "epoch": 0.2704, + "grad_norm": 0.4670966351858991, + "learning_rate": 0.00017125839641475072, + "loss": 0.7195, + "step": 507 + }, + { + "epoch": 0.27093333333333336, + "grad_norm": 0.4369010227224225, + "learning_rate": 0.00017113705245370368, + "loss": 0.7719, + "step": 508 + }, + { + "epoch": 0.2714666666666667, + "grad_norm": 0.36176267074976143, + "learning_rate": 0.00017101549606662024, + "loss": 0.6905, + "step": 509 + }, + { + "epoch": 0.272, + "grad_norm": 0.451818585521191, + "learning_rate": 0.00017089372761648616, + "loss": 0.7777, + "step": 510 + }, + { + "epoch": 0.27253333333333335, + "grad_norm": 0.40613248875745106, + "learning_rate": 0.00017077174746692056, + "loss": 0.7077, + "step": 511 + }, + { + "epoch": 0.2730666666666667, + "grad_norm": 0.3843397450463577, + "learning_rate": 0.00017064955598217462, + "loss": 0.6993, + "step": 512 + }, + { + "epoch": 0.2736, + "grad_norm": 0.4272003459053182, + "learning_rate": 0.00017052715352713075, + "loss": 0.7265, + "step": 513 + }, + { + "epoch": 0.27413333333333334, + "grad_norm": 0.42543241976806057, + "learning_rate": 0.00017040454046730115, + "loss": 0.7444, + "step": 514 + }, + { + "epoch": 0.27466666666666667, + "grad_norm": 0.4720208024279849, + "learning_rate": 0.00017028171716882714, + "loss": 0.8301, + "step": 515 + }, + { + "epoch": 0.2752, + "grad_norm": 0.42181444045209004, + "learning_rate": 0.00017015868399847768, + "loss": 0.7391, + "step": 516 + }, + { + "epoch": 0.27573333333333333, + "grad_norm": 0.3736111879553547, + "learning_rate": 0.00017003544132364846, + "loss": 0.7053, + "step": 517 + }, + { + "epoch": 0.27626666666666666, + "grad_norm": 0.42019625065875815, + "learning_rate": 0.00016991198951236088, + "loss": 0.7105, + "step": 518 + }, + { + "epoch": 0.2768, + "grad_norm": 0.3999899407680853, + "learning_rate": 0.00016978832893326074, + "loss": 0.7934, + "step": 519 + }, + { + "epoch": 0.2773333333333333, + "grad_norm": 0.4183766494907831, + "learning_rate": 0.00016966445995561727, + "loss": 0.78, + "step": 520 + }, + { + "epoch": 0.27786666666666665, + "grad_norm": 0.4011487543794493, + "learning_rate": 0.00016954038294932216, + "loss": 0.7253, + "step": 521 + }, + { + "epoch": 0.2784, + "grad_norm": 0.41258677704545643, + "learning_rate": 0.00016941609828488807, + "loss": 0.764, + "step": 522 + }, + { + "epoch": 0.2789333333333333, + "grad_norm": 0.3844441162274048, + "learning_rate": 0.0001692916063334479, + "loss": 0.7392, + "step": 523 + }, + { + "epoch": 0.27946666666666664, + "grad_norm": 0.3906556434475999, + "learning_rate": 0.0001691669074667535, + "loss": 0.7455, + "step": 524 + }, + { + "epoch": 0.28, + "grad_norm": 0.3898053859460415, + "learning_rate": 0.0001690420020571747, + "loss": 0.7486, + "step": 525 + }, + { + "epoch": 0.28053333333333336, + "grad_norm": 0.40164869224700533, + "learning_rate": 0.0001689168904776979, + "loss": 0.7393, + "step": 526 + }, + { + "epoch": 0.2810666666666667, + "grad_norm": 0.3711709136714381, + "learning_rate": 0.00016879157310192535, + "loss": 0.6806, + "step": 527 + }, + { + "epoch": 0.2816, + "grad_norm": 0.37875601700616585, + "learning_rate": 0.0001686660503040737, + "loss": 0.6296, + "step": 528 + }, + { + "epoch": 0.28213333333333335, + "grad_norm": 0.4475727202564068, + "learning_rate": 0.00016854032245897308, + "loss": 0.7294, + "step": 529 + }, + { + "epoch": 0.2826666666666667, + "grad_norm": 0.41107973700270495, + "learning_rate": 0.00016841438994206595, + "loss": 0.7163, + "step": 530 + }, + { + "epoch": 0.2832, + "grad_norm": 0.4278871213826476, + "learning_rate": 0.00016828825312940592, + "loss": 0.7175, + "step": 531 + }, + { + "epoch": 0.28373333333333334, + "grad_norm": 0.47003295264775447, + "learning_rate": 0.00016816191239765667, + "loss": 0.7789, + "step": 532 + }, + { + "epoch": 0.28426666666666667, + "grad_norm": 0.4569330956737104, + "learning_rate": 0.00016803536812409075, + "loss": 0.7781, + "step": 533 + }, + { + "epoch": 0.2848, + "grad_norm": 0.4096933006186417, + "learning_rate": 0.0001679086206865886, + "loss": 0.7598, + "step": 534 + }, + { + "epoch": 0.2853333333333333, + "grad_norm": 0.4191183195577611, + "learning_rate": 0.00016778167046363734, + "loss": 0.7314, + "step": 535 + }, + { + "epoch": 0.28586666666666666, + "grad_norm": 0.3838781414701778, + "learning_rate": 0.00016765451783432953, + "loss": 0.7359, + "step": 536 + }, + { + "epoch": 0.2864, + "grad_norm": 0.43993470287325687, + "learning_rate": 0.00016752716317836229, + "loss": 0.8266, + "step": 537 + }, + { + "epoch": 0.2869333333333333, + "grad_norm": 0.4998185831352091, + "learning_rate": 0.0001673996068760359, + "loss": 0.7512, + "step": 538 + }, + { + "epoch": 0.28746666666666665, + "grad_norm": 0.4474662471696046, + "learning_rate": 0.00016727184930825288, + "loss": 0.7365, + "step": 539 + }, + { + "epoch": 0.288, + "grad_norm": 0.41684893319551, + "learning_rate": 0.0001671438908565167, + "loss": 0.7478, + "step": 540 + }, + { + "epoch": 0.2885333333333333, + "grad_norm": 0.4081069161723271, + "learning_rate": 0.00016701573190293077, + "loss": 0.7478, + "step": 541 + }, + { + "epoch": 0.2890666666666667, + "grad_norm": 0.5424920296466795, + "learning_rate": 0.00016688737283019706, + "loss": 0.7227, + "step": 542 + }, + { + "epoch": 0.2896, + "grad_norm": 0.4148912934325243, + "learning_rate": 0.00016675881402161536, + "loss": 0.7182, + "step": 543 + }, + { + "epoch": 0.29013333333333335, + "grad_norm": 0.4231329818416088, + "learning_rate": 0.00016663005586108176, + "loss": 0.7365, + "step": 544 + }, + { + "epoch": 0.2906666666666667, + "grad_norm": 0.4149981816868053, + "learning_rate": 0.00016650109873308765, + "loss": 0.7979, + "step": 545 + }, + { + "epoch": 0.2912, + "grad_norm": 0.49680839154614576, + "learning_rate": 0.0001663719430227186, + "loss": 0.8492, + "step": 546 + }, + { + "epoch": 0.29173333333333334, + "grad_norm": 0.444439051760402, + "learning_rate": 0.0001662425891156531, + "loss": 0.8233, + "step": 547 + }, + { + "epoch": 0.2922666666666667, + "grad_norm": 0.4121540003234031, + "learning_rate": 0.00016611303739816168, + "loss": 0.7628, + "step": 548 + }, + { + "epoch": 0.2928, + "grad_norm": 0.4291469833584802, + "learning_rate": 0.00016598328825710533, + "loss": 0.7534, + "step": 549 + }, + { + "epoch": 0.29333333333333333, + "grad_norm": 0.3798516076774825, + "learning_rate": 0.00016585334207993476, + "loss": 0.7333, + "step": 550 + }, + { + "epoch": 0.29386666666666666, + "grad_norm": 0.4065165333597523, + "learning_rate": 0.00016572319925468892, + "loss": 0.7981, + "step": 551 + }, + { + "epoch": 0.2944, + "grad_norm": 0.4692693747077233, + "learning_rate": 0.000165592860169994, + "loss": 0.8722, + "step": 552 + }, + { + "epoch": 0.2949333333333333, + "grad_norm": 0.3895284877001692, + "learning_rate": 0.0001654623252150624, + "loss": 0.6476, + "step": 553 + }, + { + "epoch": 0.29546666666666666, + "grad_norm": 0.355722365184838, + "learning_rate": 0.00016533159477969122, + "loss": 0.6832, + "step": 554 + }, + { + "epoch": 0.296, + "grad_norm": 0.4430359057897852, + "learning_rate": 0.00016520066925426144, + "loss": 0.8036, + "step": 555 + }, + { + "epoch": 0.2965333333333333, + "grad_norm": 0.4217197639087501, + "learning_rate": 0.00016506954902973655, + "loss": 0.757, + "step": 556 + }, + { + "epoch": 0.29706666666666665, + "grad_norm": 0.3724598561245106, + "learning_rate": 0.00016493823449766136, + "loss": 0.6265, + "step": 557 + }, + { + "epoch": 0.2976, + "grad_norm": 0.39196229611640815, + "learning_rate": 0.0001648067260501611, + "loss": 0.6382, + "step": 558 + }, + { + "epoch": 0.2981333333333333, + "grad_norm": 0.351739632446928, + "learning_rate": 0.00016467502407993992, + "loss": 0.7245, + "step": 559 + }, + { + "epoch": 0.2986666666666667, + "grad_norm": 0.3646881424861877, + "learning_rate": 0.0001645431289802799, + "loss": 0.6625, + "step": 560 + }, + { + "epoch": 0.2992, + "grad_norm": 0.4580483452429409, + "learning_rate": 0.0001644110411450398, + "loss": 0.8263, + "step": 561 + }, + { + "epoch": 0.29973333333333335, + "grad_norm": 0.4223892248868995, + "learning_rate": 0.00016427876096865394, + "loss": 0.7199, + "step": 562 + }, + { + "epoch": 0.3002666666666667, + "grad_norm": 0.4459595943654794, + "learning_rate": 0.00016414628884613107, + "loss": 0.7978, + "step": 563 + }, + { + "epoch": 0.3008, + "grad_norm": 0.4389200618647841, + "learning_rate": 0.00016401362517305296, + "loss": 0.7366, + "step": 564 + }, + { + "epoch": 0.30133333333333334, + "grad_norm": 0.44107537261661384, + "learning_rate": 0.00016388077034557355, + "loss": 0.7633, + "step": 565 + }, + { + "epoch": 0.30186666666666667, + "grad_norm": 0.37709812051391756, + "learning_rate": 0.00016374772476041748, + "loss": 0.7322, + "step": 566 + }, + { + "epoch": 0.3024, + "grad_norm": 0.40402394223372934, + "learning_rate": 0.00016361448881487914, + "loss": 0.766, + "step": 567 + }, + { + "epoch": 0.30293333333333333, + "grad_norm": 0.48578289266442454, + "learning_rate": 0.00016348106290682118, + "loss": 0.8789, + "step": 568 + }, + { + "epoch": 0.30346666666666666, + "grad_norm": 0.39376794347238314, + "learning_rate": 0.00016334744743467364, + "loss": 0.7705, + "step": 569 + }, + { + "epoch": 0.304, + "grad_norm": 0.3721003314014055, + "learning_rate": 0.00016321364279743266, + "loss": 0.635, + "step": 570 + }, + { + "epoch": 0.3045333333333333, + "grad_norm": 0.49231908734668456, + "learning_rate": 0.00016307964939465914, + "loss": 0.8152, + "step": 571 + }, + { + "epoch": 0.30506666666666665, + "grad_norm": 0.4112118850245869, + "learning_rate": 0.00016294546762647775, + "loss": 0.7467, + "step": 572 + }, + { + "epoch": 0.3056, + "grad_norm": 0.4852694643675637, + "learning_rate": 0.0001628110978935756, + "loss": 0.85, + "step": 573 + }, + { + "epoch": 0.3061333333333333, + "grad_norm": 0.4276293897464289, + "learning_rate": 0.0001626765405972011, + "loss": 0.7662, + "step": 574 + }, + { + "epoch": 0.30666666666666664, + "grad_norm": 0.3908743732434518, + "learning_rate": 0.00016254179613916278, + "loss": 0.7291, + "step": 575 + }, + { + "epoch": 0.3072, + "grad_norm": 0.42742623034741456, + "learning_rate": 0.00016240686492182804, + "loss": 0.7115, + "step": 576 + }, + { + "epoch": 0.30773333333333336, + "grad_norm": 0.3823576940969926, + "learning_rate": 0.000162271747348122, + "loss": 0.7108, + "step": 577 + }, + { + "epoch": 0.3082666666666667, + "grad_norm": 0.4139648570198011, + "learning_rate": 0.0001621364438215262, + "loss": 0.788, + "step": 578 + }, + { + "epoch": 0.3088, + "grad_norm": 0.4430555218440438, + "learning_rate": 0.00016200095474607753, + "loss": 0.7621, + "step": 579 + }, + { + "epoch": 0.30933333333333335, + "grad_norm": 0.4083726535659133, + "learning_rate": 0.00016186528052636692, + "loss": 0.6616, + "step": 580 + }, + { + "epoch": 0.3098666666666667, + "grad_norm": 0.39823684344449684, + "learning_rate": 0.0001617294215675382, + "loss": 0.7481, + "step": 581 + }, + { + "epoch": 0.3104, + "grad_norm": 0.43717230620111364, + "learning_rate": 0.00016159337827528685, + "loss": 0.6599, + "step": 582 + }, + { + "epoch": 0.31093333333333334, + "grad_norm": 0.4055751859684295, + "learning_rate": 0.0001614571510558588, + "loss": 0.7394, + "step": 583 + }, + { + "epoch": 0.31146666666666667, + "grad_norm": 0.37267406752774485, + "learning_rate": 0.00016132074031604917, + "loss": 0.6659, + "step": 584 + }, + { + "epoch": 0.312, + "grad_norm": 0.47133028091157125, + "learning_rate": 0.0001611841464632011, + "loss": 0.7821, + "step": 585 + }, + { + "epoch": 0.31253333333333333, + "grad_norm": 0.42685033370576386, + "learning_rate": 0.00016104736990520468, + "loss": 0.7143, + "step": 586 + }, + { + "epoch": 0.31306666666666666, + "grad_norm": 0.35100355558466984, + "learning_rate": 0.0001609104110504954, + "loss": 0.6531, + "step": 587 + }, + { + "epoch": 0.3136, + "grad_norm": 0.4134879399525361, + "learning_rate": 0.0001607732703080532, + "loss": 0.781, + "step": 588 + }, + { + "epoch": 0.3141333333333333, + "grad_norm": 0.4253772794714469, + "learning_rate": 0.00016063594808740113, + "loss": 0.7855, + "step": 589 + }, + { + "epoch": 0.31466666666666665, + "grad_norm": 0.45883456420996127, + "learning_rate": 0.00016049844479860422, + "loss": 0.7806, + "step": 590 + }, + { + "epoch": 0.3152, + "grad_norm": 0.41643029403734655, + "learning_rate": 0.00016036076085226814, + "loss": 0.738, + "step": 591 + }, + { + "epoch": 0.3157333333333333, + "grad_norm": 0.42932450527570937, + "learning_rate": 0.00016022289665953808, + "loss": 0.7621, + "step": 592 + }, + { + "epoch": 0.31626666666666664, + "grad_norm": 0.40734119656325934, + "learning_rate": 0.00016008485263209742, + "loss": 0.7203, + "step": 593 + }, + { + "epoch": 0.3168, + "grad_norm": 0.3705935827101678, + "learning_rate": 0.0001599466291821666, + "loss": 0.7055, + "step": 594 + }, + { + "epoch": 0.31733333333333336, + "grad_norm": 0.3991816141148357, + "learning_rate": 0.0001598082267225018, + "loss": 0.7175, + "step": 595 + }, + { + "epoch": 0.3178666666666667, + "grad_norm": 0.4104785870045507, + "learning_rate": 0.0001596696456663938, + "loss": 0.6824, + "step": 596 + }, + { + "epoch": 0.3184, + "grad_norm": 0.3705219630510094, + "learning_rate": 0.0001595308864276666, + "loss": 0.6752, + "step": 597 + }, + { + "epoch": 0.31893333333333335, + "grad_norm": 0.41335389806428185, + "learning_rate": 0.00015939194942067646, + "loss": 0.7045, + "step": 598 + }, + { + "epoch": 0.3194666666666667, + "grad_norm": 0.41064141240153385, + "learning_rate": 0.0001592528350603103, + "loss": 0.6749, + "step": 599 + }, + { + "epoch": 0.32, + "grad_norm": 0.38639297771168474, + "learning_rate": 0.0001591135437619847, + "loss": 0.7281, + "step": 600 + }, + { + "epoch": 0.32053333333333334, + "grad_norm": 0.4192855338902871, + "learning_rate": 0.00015897407594164467, + "loss": 0.7633, + "step": 601 + }, + { + "epoch": 0.32106666666666667, + "grad_norm": 0.4073470734439687, + "learning_rate": 0.00015883443201576225, + "loss": 0.7452, + "step": 602 + }, + { + "epoch": 0.3216, + "grad_norm": 0.4381908278901834, + "learning_rate": 0.0001586946124013354, + "loss": 0.7854, + "step": 603 + }, + { + "epoch": 0.3221333333333333, + "grad_norm": 0.4733147031707669, + "learning_rate": 0.00015855461751588677, + "loss": 0.792, + "step": 604 + }, + { + "epoch": 0.32266666666666666, + "grad_norm": 0.4262777998983014, + "learning_rate": 0.0001584144477774623, + "loss": 0.7347, + "step": 605 + }, + { + "epoch": 0.3232, + "grad_norm": 0.4183765799440559, + "learning_rate": 0.0001582741036046301, + "loss": 0.7265, + "step": 606 + }, + { + "epoch": 0.3237333333333333, + "grad_norm": 0.4435370888035211, + "learning_rate": 0.00015813358541647915, + "loss": 0.7408, + "step": 607 + }, + { + "epoch": 0.32426666666666665, + "grad_norm": 0.40791137350226997, + "learning_rate": 0.00015799289363261813, + "loss": 0.7421, + "step": 608 + }, + { + "epoch": 0.3248, + "grad_norm": 0.40533755962194024, + "learning_rate": 0.00015785202867317407, + "loss": 0.6517, + "step": 609 + }, + { + "epoch": 0.3253333333333333, + "grad_norm": 0.41043330111795495, + "learning_rate": 0.00015771099095879108, + "loss": 0.6577, + "step": 610 + }, + { + "epoch": 0.3258666666666667, + "grad_norm": 0.40590348596194553, + "learning_rate": 0.0001575697809106292, + "loss": 0.7266, + "step": 611 + }, + { + "epoch": 0.3264, + "grad_norm": 0.4566038347829871, + "learning_rate": 0.00015742839895036305, + "loss": 0.725, + "step": 612 + }, + { + "epoch": 0.32693333333333335, + "grad_norm": 0.46166239710286383, + "learning_rate": 0.00015728684550018064, + "loss": 0.8159, + "step": 613 + }, + { + "epoch": 0.3274666666666667, + "grad_norm": 0.3927204087715734, + "learning_rate": 0.0001571451209827821, + "loss": 0.6838, + "step": 614 + }, + { + "epoch": 0.328, + "grad_norm": 0.38145862694035654, + "learning_rate": 0.00015700322582137827, + "loss": 0.7287, + "step": 615 + }, + { + "epoch": 0.32853333333333334, + "grad_norm": 0.41014410375622384, + "learning_rate": 0.00015686116043968972, + "loss": 0.7326, + "step": 616 + }, + { + "epoch": 0.3290666666666667, + "grad_norm": 0.4128926143802422, + "learning_rate": 0.00015671892526194516, + "loss": 0.7686, + "step": 617 + }, + { + "epoch": 0.3296, + "grad_norm": 0.4496946594750262, + "learning_rate": 0.0001565765207128805, + "loss": 0.8154, + "step": 618 + }, + { + "epoch": 0.33013333333333333, + "grad_norm": 0.4081721073142836, + "learning_rate": 0.0001564339472177373, + "loss": 0.6923, + "step": 619 + }, + { + "epoch": 0.33066666666666666, + "grad_norm": 0.39323053922880735, + "learning_rate": 0.00015629120520226165, + "loss": 0.7472, + "step": 620 + }, + { + "epoch": 0.3312, + "grad_norm": 0.3889464467389498, + "learning_rate": 0.0001561482950927029, + "loss": 0.6888, + "step": 621 + }, + { + "epoch": 0.3317333333333333, + "grad_norm": 0.3782721459841835, + "learning_rate": 0.0001560052173158123, + "loss": 0.7008, + "step": 622 + }, + { + "epoch": 0.33226666666666665, + "grad_norm": 0.45711474908188415, + "learning_rate": 0.00015586197229884184, + "loss": 0.7355, + "step": 623 + }, + { + "epoch": 0.3328, + "grad_norm": 0.39437508744974215, + "learning_rate": 0.00015571856046954285, + "loss": 0.6854, + "step": 624 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.44044997727986784, + "learning_rate": 0.00015557498225616487, + "loss": 0.7947, + "step": 625 + }, + { + "epoch": 0.33386666666666664, + "grad_norm": 0.48326948955041055, + "learning_rate": 0.0001554312380874542, + "loss": 0.83, + "step": 626 + }, + { + "epoch": 0.3344, + "grad_norm": 0.4688313184034696, + "learning_rate": 0.00015528732839265272, + "loss": 0.791, + "step": 627 + }, + { + "epoch": 0.33493333333333336, + "grad_norm": 0.42255262210533706, + "learning_rate": 0.00015514325360149668, + "loss": 0.7324, + "step": 628 + }, + { + "epoch": 0.3354666666666667, + "grad_norm": 0.4347060818044885, + "learning_rate": 0.0001549990141442153, + "loss": 0.715, + "step": 629 + }, + { + "epoch": 0.336, + "grad_norm": 0.42756407620451586, + "learning_rate": 0.0001548546104515294, + "loss": 0.7515, + "step": 630 + }, + { + "epoch": 0.33653333333333335, + "grad_norm": 0.4513617573308796, + "learning_rate": 0.00015471004295465035, + "loss": 0.8076, + "step": 631 + }, + { + "epoch": 0.3370666666666667, + "grad_norm": 0.43649044118937214, + "learning_rate": 0.0001545653120852787, + "loss": 0.7357, + "step": 632 + }, + { + "epoch": 0.3376, + "grad_norm": 0.46975188650514227, + "learning_rate": 0.00015442041827560274, + "loss": 0.8128, + "step": 633 + }, + { + "epoch": 0.33813333333333334, + "grad_norm": 0.4206316318669492, + "learning_rate": 0.00015427536195829742, + "loss": 0.7124, + "step": 634 + }, + { + "epoch": 0.33866666666666667, + "grad_norm": 0.43103204875720075, + "learning_rate": 0.00015413014356652286, + "loss": 0.741, + "step": 635 + }, + { + "epoch": 0.3392, + "grad_norm": 0.4320759572006151, + "learning_rate": 0.00015398476353392323, + "loss": 0.7615, + "step": 636 + }, + { + "epoch": 0.33973333333333333, + "grad_norm": 0.44756733999939363, + "learning_rate": 0.00015383922229462549, + "loss": 0.7429, + "step": 637 + }, + { + "epoch": 0.34026666666666666, + "grad_norm": 0.46270821466141965, + "learning_rate": 0.00015369352028323774, + "loss": 0.7912, + "step": 638 + }, + { + "epoch": 0.3408, + "grad_norm": 0.40440151158694015, + "learning_rate": 0.00015354765793484834, + "loss": 0.7205, + "step": 639 + }, + { + "epoch": 0.3413333333333333, + "grad_norm": 0.4066183589067994, + "learning_rate": 0.0001534016356850244, + "loss": 0.7136, + "step": 640 + }, + { + "epoch": 0.34186666666666665, + "grad_norm": 0.4428967511926324, + "learning_rate": 0.0001532554539698105, + "loss": 0.8338, + "step": 641 + }, + { + "epoch": 0.3424, + "grad_norm": 0.37756978595071494, + "learning_rate": 0.00015310911322572753, + "loss": 0.7153, + "step": 642 + }, + { + "epoch": 0.3429333333333333, + "grad_norm": 0.39618604131799795, + "learning_rate": 0.00015296261388977108, + "loss": 0.7169, + "step": 643 + }, + { + "epoch": 0.34346666666666664, + "grad_norm": 0.390452592211156, + "learning_rate": 0.0001528159563994104, + "loss": 0.6974, + "step": 644 + }, + { + "epoch": 0.344, + "grad_norm": 0.41959439719106323, + "learning_rate": 0.000152669141192587, + "loss": 0.7304, + "step": 645 + }, + { + "epoch": 0.34453333333333336, + "grad_norm": 0.38357579817872856, + "learning_rate": 0.00015252216870771345, + "loss": 0.7177, + "step": 646 + }, + { + "epoch": 0.3450666666666667, + "grad_norm": 0.37161116856644516, + "learning_rate": 0.00015237503938367186, + "loss": 0.6922, + "step": 647 + }, + { + "epoch": 0.3456, + "grad_norm": 0.4051170055648018, + "learning_rate": 0.00015222775365981273, + "loss": 0.7687, + "step": 648 + }, + { + "epoch": 0.34613333333333335, + "grad_norm": 0.42783911951428605, + "learning_rate": 0.00015208031197595356, + "loss": 0.7296, + "step": 649 + }, + { + "epoch": 0.3466666666666667, + "grad_norm": 0.4580257926351465, + "learning_rate": 0.0001519327147723776, + "loss": 0.786, + "step": 650 + }, + { + "epoch": 0.3472, + "grad_norm": 0.39123493896464817, + "learning_rate": 0.00015178496248983254, + "loss": 0.7313, + "step": 651 + }, + { + "epoch": 0.34773333333333334, + "grad_norm": 0.41507210334543587, + "learning_rate": 0.0001516370555695291, + "loss": 0.7066, + "step": 652 + }, + { + "epoch": 0.34826666666666667, + "grad_norm": 0.481514406389178, + "learning_rate": 0.00015148899445313981, + "loss": 0.8217, + "step": 653 + }, + { + "epoch": 0.3488, + "grad_norm": 0.3943667256577028, + "learning_rate": 0.00015134077958279765, + "loss": 0.6981, + "step": 654 + }, + { + "epoch": 0.34933333333333333, + "grad_norm": 0.4004702594600421, + "learning_rate": 0.00015119241140109467, + "loss": 0.7401, + "step": 655 + }, + { + "epoch": 0.34986666666666666, + "grad_norm": 0.41454716058728625, + "learning_rate": 0.00015104389035108077, + "loss": 0.6944, + "step": 656 + }, + { + "epoch": 0.3504, + "grad_norm": 0.4008158675211356, + "learning_rate": 0.00015089521687626243, + "loss": 0.7121, + "step": 657 + }, + { + "epoch": 0.3509333333333333, + "grad_norm": 0.4086056163398199, + "learning_rate": 0.0001507463914206012, + "loss": 0.7153, + "step": 658 + }, + { + "epoch": 0.35146666666666665, + "grad_norm": 0.42739344037098376, + "learning_rate": 0.0001505974144285124, + "loss": 0.7885, + "step": 659 + }, + { + "epoch": 0.352, + "grad_norm": 0.4046439577379952, + "learning_rate": 0.000150448286344864, + "loss": 0.7344, + "step": 660 + }, + { + "epoch": 0.3525333333333333, + "grad_norm": 0.46612561267966846, + "learning_rate": 0.00015029900761497506, + "loss": 0.7751, + "step": 661 + }, + { + "epoch": 0.35306666666666664, + "grad_norm": 0.3931406497772889, + "learning_rate": 0.00015014957868461458, + "loss": 0.7541, + "step": 662 + }, + { + "epoch": 0.3536, + "grad_norm": 0.37640543832804774, + "learning_rate": 0.00015000000000000001, + "loss": 0.6685, + "step": 663 + }, + { + "epoch": 0.35413333333333336, + "grad_norm": 0.4475185581718164, + "learning_rate": 0.000149850272007796, + "loss": 0.8463, + "step": 664 + }, + { + "epoch": 0.3546666666666667, + "grad_norm": 0.43555120723890317, + "learning_rate": 0.00014970039515511304, + "loss": 0.7967, + "step": 665 + }, + { + "epoch": 0.3552, + "grad_norm": 0.3912352955790987, + "learning_rate": 0.00014955036988950618, + "loss": 0.6932, + "step": 666 + }, + { + "epoch": 0.35573333333333335, + "grad_norm": 0.4099421378533521, + "learning_rate": 0.0001494001966589736, + "loss": 0.7297, + "step": 667 + }, + { + "epoch": 0.3562666666666667, + "grad_norm": 0.416618684089891, + "learning_rate": 0.00014924987591195547, + "loss": 0.7484, + "step": 668 + }, + { + "epoch": 0.3568, + "grad_norm": 0.37639952072149435, + "learning_rate": 0.00014909940809733222, + "loss": 0.7055, + "step": 669 + }, + { + "epoch": 0.35733333333333334, + "grad_norm": 0.4193840133619344, + "learning_rate": 0.0001489487936644237, + "loss": 0.791, + "step": 670 + }, + { + "epoch": 0.35786666666666667, + "grad_norm": 0.39618466742229524, + "learning_rate": 0.00014879803306298736, + "loss": 0.7351, + "step": 671 + }, + { + "epoch": 0.3584, + "grad_norm": 0.380114446442462, + "learning_rate": 0.00014864712674321734, + "loss": 0.7015, + "step": 672 + }, + { + "epoch": 0.3589333333333333, + "grad_norm": 0.39050338871450424, + "learning_rate": 0.00014849607515574276, + "loss": 0.7091, + "step": 673 + }, + { + "epoch": 0.35946666666666666, + "grad_norm": 0.4502586694971886, + "learning_rate": 0.00014834487875162657, + "loss": 0.7017, + "step": 674 + }, + { + "epoch": 0.36, + "grad_norm": 0.418052943596847, + "learning_rate": 0.00014819353798236427, + "loss": 0.7427, + "step": 675 + }, + { + "epoch": 0.3605333333333333, + "grad_norm": 0.42893056517377914, + "learning_rate": 0.00014804205329988225, + "loss": 0.7838, + "step": 676 + }, + { + "epoch": 0.36106666666666665, + "grad_norm": 0.4241874305859737, + "learning_rate": 0.00014789042515653687, + "loss": 0.6904, + "step": 677 + }, + { + "epoch": 0.3616, + "grad_norm": 0.4459911700099912, + "learning_rate": 0.00014773865400511272, + "loss": 0.7458, + "step": 678 + }, + { + "epoch": 0.3621333333333333, + "grad_norm": 0.41032809921787594, + "learning_rate": 0.00014758674029882152, + "loss": 0.6959, + "step": 679 + }, + { + "epoch": 0.3626666666666667, + "grad_norm": 0.3855667042541708, + "learning_rate": 0.00014743468449130063, + "loss": 0.7141, + "step": 680 + }, + { + "epoch": 0.3632, + "grad_norm": 0.4629190402722016, + "learning_rate": 0.00014728248703661182, + "loss": 0.6724, + "step": 681 + }, + { + "epoch": 0.36373333333333335, + "grad_norm": 0.46279704677568356, + "learning_rate": 0.00014713014838923976, + "loss": 0.7906, + "step": 682 + }, + { + "epoch": 0.3642666666666667, + "grad_norm": 0.35871998831217716, + "learning_rate": 0.00014697766900409074, + "loss": 0.6781, + "step": 683 + }, + { + "epoch": 0.3648, + "grad_norm": 0.4930608546681951, + "learning_rate": 0.00014682504933649144, + "loss": 0.8636, + "step": 684 + }, + { + "epoch": 0.36533333333333334, + "grad_norm": 0.44256013503274955, + "learning_rate": 0.0001466722898421873, + "loss": 0.704, + "step": 685 + }, + { + "epoch": 0.3658666666666667, + "grad_norm": 0.38062984848447684, + "learning_rate": 0.0001465193909773413, + "loss": 0.6929, + "step": 686 + }, + { + "epoch": 0.3664, + "grad_norm": 0.37907512503141494, + "learning_rate": 0.00014636635319853275, + "loss": 0.7322, + "step": 687 + }, + { + "epoch": 0.36693333333333333, + "grad_norm": 0.4438719005667484, + "learning_rate": 0.00014621317696275564, + "loss": 0.7368, + "step": 688 + }, + { + "epoch": 0.36746666666666666, + "grad_norm": 0.3757027588614008, + "learning_rate": 0.00014605986272741748, + "loss": 0.7161, + "step": 689 + }, + { + "epoch": 0.368, + "grad_norm": 0.42762741905841856, + "learning_rate": 0.00014590641095033787, + "loss": 0.7858, + "step": 690 + }, + { + "epoch": 0.3685333333333333, + "grad_norm": 0.4603683509064343, + "learning_rate": 0.00014575282208974702, + "loss": 0.8062, + "step": 691 + }, + { + "epoch": 0.36906666666666665, + "grad_norm": 0.4249345365450475, + "learning_rate": 0.00014559909660428468, + "loss": 0.6899, + "step": 692 + }, + { + "epoch": 0.3696, + "grad_norm": 0.4307264048850522, + "learning_rate": 0.00014544523495299842, + "loss": 0.7888, + "step": 693 + }, + { + "epoch": 0.3701333333333333, + "grad_norm": 0.36253477136303247, + "learning_rate": 0.00014529123759534255, + "loss": 0.7107, + "step": 694 + }, + { + "epoch": 0.37066666666666664, + "grad_norm": 0.4346616376243191, + "learning_rate": 0.00014513710499117647, + "loss": 0.7099, + "step": 695 + }, + { + "epoch": 0.3712, + "grad_norm": 0.5116793752500154, + "learning_rate": 0.0001449828376007636, + "loss": 0.8613, + "step": 696 + }, + { + "epoch": 0.37173333333333336, + "grad_norm": 0.46464733367215383, + "learning_rate": 0.00014482843588476974, + "loss": 0.7512, + "step": 697 + }, + { + "epoch": 0.3722666666666667, + "grad_norm": 0.3939198296872801, + "learning_rate": 0.00014467390030426186, + "loss": 0.6331, + "step": 698 + }, + { + "epoch": 0.3728, + "grad_norm": 0.3903839441336436, + "learning_rate": 0.0001445192313207067, + "loss": 0.6779, + "step": 699 + }, + { + "epoch": 0.37333333333333335, + "grad_norm": 0.38809736264352834, + "learning_rate": 0.0001443644293959693, + "loss": 0.7305, + "step": 700 + }, + { + "epoch": 0.3738666666666667, + "grad_norm": 0.44384900300011626, + "learning_rate": 0.00014420949499231172, + "loss": 0.7188, + "step": 701 + }, + { + "epoch": 0.3744, + "grad_norm": 0.4988692150897368, + "learning_rate": 0.0001440544285723915, + "loss": 0.8339, + "step": 702 + }, + { + "epoch": 0.37493333333333334, + "grad_norm": 0.3832189037895236, + "learning_rate": 0.00014389923059926062, + "loss": 0.7105, + "step": 703 + }, + { + "epoch": 0.37546666666666667, + "grad_norm": 0.369498218784444, + "learning_rate": 0.0001437439015363638, + "loss": 0.6724, + "step": 704 + }, + { + "epoch": 0.376, + "grad_norm": 0.5293337597367642, + "learning_rate": 0.00014358844184753712, + "loss": 0.7926, + "step": 705 + }, + { + "epoch": 0.37653333333333333, + "grad_norm": 0.3947743647114129, + "learning_rate": 0.00014343285199700683, + "loss": 0.6456, + "step": 706 + }, + { + "epoch": 0.37706666666666666, + "grad_norm": 0.3697702493035694, + "learning_rate": 0.0001432771324493879, + "loss": 0.751, + "step": 707 + }, + { + "epoch": 0.3776, + "grad_norm": 0.5008865279430565, + "learning_rate": 0.00014312128366968243, + "loss": 0.7691, + "step": 708 + }, + { + "epoch": 0.3781333333333333, + "grad_norm": 0.43144752424878857, + "learning_rate": 0.00014296530612327863, + "loss": 0.7458, + "step": 709 + }, + { + "epoch": 0.37866666666666665, + "grad_norm": 0.42867835474034455, + "learning_rate": 0.00014280920027594907, + "loss": 0.7619, + "step": 710 + }, + { + "epoch": 0.3792, + "grad_norm": 0.47344859479913404, + "learning_rate": 0.00014265296659384956, + "loss": 0.785, + "step": 711 + }, + { + "epoch": 0.3797333333333333, + "grad_norm": 0.4814850246438516, + "learning_rate": 0.00014249660554351752, + "loss": 0.898, + "step": 712 + }, + { + "epoch": 0.38026666666666664, + "grad_norm": 0.4378968855984162, + "learning_rate": 0.00014234011759187083, + "loss": 0.6866, + "step": 713 + }, + { + "epoch": 0.3808, + "grad_norm": 0.4263778469465353, + "learning_rate": 0.00014218350320620624, + "loss": 0.7583, + "step": 714 + }, + { + "epoch": 0.38133333333333336, + "grad_norm": 0.37241853028261956, + "learning_rate": 0.00014202676285419812, + "loss": 0.6812, + "step": 715 + }, + { + "epoch": 0.3818666666666667, + "grad_norm": 0.39373843615558624, + "learning_rate": 0.00014186989700389687, + "loss": 0.7345, + "step": 716 + }, + { + "epoch": 0.3824, + "grad_norm": 0.3873685666617381, + "learning_rate": 0.0001417129061237278, + "loss": 0.7189, + "step": 717 + }, + { + "epoch": 0.38293333333333335, + "grad_norm": 0.4066160705799862, + "learning_rate": 0.0001415557906824895, + "loss": 0.7029, + "step": 718 + }, + { + "epoch": 0.3834666666666667, + "grad_norm": 0.3808234049232231, + "learning_rate": 0.00014139855114935252, + "loss": 0.7282, + "step": 719 + }, + { + "epoch": 0.384, + "grad_norm": 0.42754032553756716, + "learning_rate": 0.00014124118799385796, + "loss": 0.7403, + "step": 720 + }, + { + "epoch": 0.38453333333333334, + "grad_norm": 0.35078303619981066, + "learning_rate": 0.0001410837016859161, + "loss": 0.6562, + "step": 721 + }, + { + "epoch": 0.38506666666666667, + "grad_norm": 0.3615541922056434, + "learning_rate": 0.00014092609269580496, + "loss": 0.6912, + "step": 722 + }, + { + "epoch": 0.3856, + "grad_norm": 0.4006073125444095, + "learning_rate": 0.00014076836149416887, + "loss": 0.7312, + "step": 723 + }, + { + "epoch": 0.38613333333333333, + "grad_norm": 0.4170299356848603, + "learning_rate": 0.00014061050855201723, + "loss": 0.7935, + "step": 724 + }, + { + "epoch": 0.38666666666666666, + "grad_norm": 0.40175583363688694, + "learning_rate": 0.0001404525343407228, + "loss": 0.6934, + "step": 725 + }, + { + "epoch": 0.3872, + "grad_norm": 0.44773593934774614, + "learning_rate": 0.0001402944393320206, + "loss": 0.791, + "step": 726 + }, + { + "epoch": 0.3877333333333333, + "grad_norm": 0.4367014585195122, + "learning_rate": 0.00014013622399800627, + "loss": 0.6847, + "step": 727 + }, + { + "epoch": 0.38826666666666665, + "grad_norm": 0.3894095198340693, + "learning_rate": 0.00013997788881113489, + "loss": 0.6947, + "step": 728 + }, + { + "epoch": 0.3888, + "grad_norm": 0.4343669476666237, + "learning_rate": 0.00013981943424421932, + "loss": 0.7432, + "step": 729 + }, + { + "epoch": 0.3893333333333333, + "grad_norm": 0.3929218878374168, + "learning_rate": 0.0001396608607704289, + "loss": 0.6872, + "step": 730 + }, + { + "epoch": 0.38986666666666664, + "grad_norm": 0.489807724797637, + "learning_rate": 0.0001395021688632882, + "loss": 0.7863, + "step": 731 + }, + { + "epoch": 0.3904, + "grad_norm": 0.3757636600192832, + "learning_rate": 0.00013934335899667527, + "loss": 0.659, + "step": 732 + }, + { + "epoch": 0.39093333333333335, + "grad_norm": 0.3868383973886252, + "learning_rate": 0.00013918443164482046, + "loss": 0.7136, + "step": 733 + }, + { + "epoch": 0.3914666666666667, + "grad_norm": 0.36586286706983623, + "learning_rate": 0.000139025387282305, + "loss": 0.6726, + "step": 734 + }, + { + "epoch": 0.392, + "grad_norm": 0.41106954005266616, + "learning_rate": 0.00013886622638405952, + "loss": 0.7518, + "step": 735 + }, + { + "epoch": 0.39253333333333335, + "grad_norm": 0.3591853030231446, + "learning_rate": 0.0001387069494253626, + "loss": 0.6534, + "step": 736 + }, + { + "epoch": 0.3930666666666667, + "grad_norm": 0.37879426950938166, + "learning_rate": 0.0001385475568818394, + "loss": 0.6909, + "step": 737 + }, + { + "epoch": 0.3936, + "grad_norm": 0.45591841792152743, + "learning_rate": 0.00013838804922946027, + "loss": 0.7621, + "step": 738 + }, + { + "epoch": 0.39413333333333334, + "grad_norm": 0.3412298431432226, + "learning_rate": 0.00013822842694453924, + "loss": 0.645, + "step": 739 + }, + { + "epoch": 0.39466666666666667, + "grad_norm": 0.40544822040894074, + "learning_rate": 0.0001380686905037327, + "loss": 0.6977, + "step": 740 + }, + { + "epoch": 0.3952, + "grad_norm": 0.4086369613667534, + "learning_rate": 0.00013790884038403795, + "loss": 0.7361, + "step": 741 + }, + { + "epoch": 0.3957333333333333, + "grad_norm": 0.38809643021441603, + "learning_rate": 0.00013774887706279165, + "loss": 0.7184, + "step": 742 + }, + { + "epoch": 0.39626666666666666, + "grad_norm": 0.4112905583546399, + "learning_rate": 0.0001375888010176686, + "loss": 0.7576, + "step": 743 + }, + { + "epoch": 0.3968, + "grad_norm": 0.4459939402439977, + "learning_rate": 0.00013742861272668012, + "loss": 0.7349, + "step": 744 + }, + { + "epoch": 0.3973333333333333, + "grad_norm": 0.4050813263806746, + "learning_rate": 0.00013726831266817278, + "loss": 0.7656, + "step": 745 + }, + { + "epoch": 0.39786666666666665, + "grad_norm": 0.3667890962820171, + "learning_rate": 0.00013710790132082692, + "loss": 0.7151, + "step": 746 + }, + { + "epoch": 0.3984, + "grad_norm": 0.40011166345292043, + "learning_rate": 0.00013694737916365517, + "loss": 0.7605, + "step": 747 + }, + { + "epoch": 0.3989333333333333, + "grad_norm": 0.4024398695217591, + "learning_rate": 0.00013678674667600102, + "loss": 0.763, + "step": 748 + }, + { + "epoch": 0.3994666666666667, + "grad_norm": 0.3761447111479893, + "learning_rate": 0.00013662600433753745, + "loss": 0.6506, + "step": 749 + }, + { + "epoch": 0.4, + "grad_norm": 0.45940242277447846, + "learning_rate": 0.00013646515262826552, + "loss": 0.6966, + "step": 750 + }, + { + "epoch": 0.40053333333333335, + "grad_norm": 0.4110400362440283, + "learning_rate": 0.00013630419202851284, + "loss": 0.7007, + "step": 751 + }, + { + "epoch": 0.4010666666666667, + "grad_norm": 0.36473509121148145, + "learning_rate": 0.00013614312301893223, + "loss": 0.6411, + "step": 752 + }, + { + "epoch": 0.4016, + "grad_norm": 0.47718072134643, + "learning_rate": 0.0001359819460805001, + "loss": 0.8045, + "step": 753 + }, + { + "epoch": 0.40213333333333334, + "grad_norm": 0.40243532664369813, + "learning_rate": 0.00013582066169451535, + "loss": 0.7402, + "step": 754 + }, + { + "epoch": 0.4026666666666667, + "grad_norm": 0.37488828093687565, + "learning_rate": 0.0001356592703425976, + "loss": 0.6367, + "step": 755 + }, + { + "epoch": 0.4032, + "grad_norm": 0.39537794135335386, + "learning_rate": 0.0001354977725066859, + "loss": 0.7008, + "step": 756 + }, + { + "epoch": 0.40373333333333333, + "grad_norm": 0.43967264906377374, + "learning_rate": 0.00013533616866903735, + "loss": 0.7082, + "step": 757 + }, + { + "epoch": 0.40426666666666666, + "grad_norm": 0.4079234354206369, + "learning_rate": 0.0001351744593122255, + "loss": 0.7092, + "step": 758 + }, + { + "epoch": 0.4048, + "grad_norm": 0.37155253793525617, + "learning_rate": 0.00013501264491913906, + "loss": 0.681, + "step": 759 + }, + { + "epoch": 0.4053333333333333, + "grad_norm": 0.4031350584934996, + "learning_rate": 0.00013485072597298038, + "loss": 0.7669, + "step": 760 + }, + { + "epoch": 0.40586666666666665, + "grad_norm": 0.4273056447329353, + "learning_rate": 0.00013468870295726398, + "loss": 0.7677, + "step": 761 + }, + { + "epoch": 0.4064, + "grad_norm": 0.40110795325781556, + "learning_rate": 0.0001345265763558152, + "loss": 0.6928, + "step": 762 + }, + { + "epoch": 0.4069333333333333, + "grad_norm": 0.3921495657087361, + "learning_rate": 0.00013436434665276865, + "loss": 0.6846, + "step": 763 + }, + { + "epoch": 0.40746666666666664, + "grad_norm": 0.4043918351556086, + "learning_rate": 0.00013420201433256689, + "loss": 0.6543, + "step": 764 + }, + { + "epoch": 0.408, + "grad_norm": 0.47659688565165054, + "learning_rate": 0.00013403957987995882, + "loss": 0.7496, + "step": 765 + }, + { + "epoch": 0.40853333333333336, + "grad_norm": 0.3984403784576225, + "learning_rate": 0.00013387704377999842, + "loss": 0.7314, + "step": 766 + }, + { + "epoch": 0.4090666666666667, + "grad_norm": 0.4196372282621598, + "learning_rate": 0.00013371440651804313, + "loss": 0.7521, + "step": 767 + }, + { + "epoch": 0.4096, + "grad_norm": 0.3962362919621647, + "learning_rate": 0.0001335516685797525, + "loss": 0.7193, + "step": 768 + }, + { + "epoch": 0.41013333333333335, + "grad_norm": 0.3944029448445006, + "learning_rate": 0.00013338883045108674, + "loss": 0.7618, + "step": 769 + }, + { + "epoch": 0.4106666666666667, + "grad_norm": 0.36807500602234233, + "learning_rate": 0.00013322589261830517, + "loss": 0.6694, + "step": 770 + }, + { + "epoch": 0.4112, + "grad_norm": 0.44695753722008286, + "learning_rate": 0.00013306285556796495, + "loss": 0.7664, + "step": 771 + }, + { + "epoch": 0.41173333333333334, + "grad_norm": 0.42317405188437496, + "learning_rate": 0.0001328997197869194, + "loss": 0.7793, + "step": 772 + }, + { + "epoch": 0.41226666666666667, + "grad_norm": 0.3748121804380357, + "learning_rate": 0.0001327364857623168, + "loss": 0.6677, + "step": 773 + }, + { + "epoch": 0.4128, + "grad_norm": 0.4038810920792045, + "learning_rate": 0.00013257315398159864, + "loss": 0.7502, + "step": 774 + }, + { + "epoch": 0.41333333333333333, + "grad_norm": 0.3791411563791544, + "learning_rate": 0.00013240972493249847, + "loss": 0.6638, + "step": 775 + }, + { + "epoch": 0.41386666666666666, + "grad_norm": 0.36400656378469093, + "learning_rate": 0.0001322461991030402, + "loss": 0.6415, + "step": 776 + }, + { + "epoch": 0.4144, + "grad_norm": 0.4139989075328661, + "learning_rate": 0.00013208257698153677, + "loss": 0.7085, + "step": 777 + }, + { + "epoch": 0.4149333333333333, + "grad_norm": 0.41415205429654484, + "learning_rate": 0.00013191885905658872, + "loss": 0.7184, + "step": 778 + }, + { + "epoch": 0.41546666666666665, + "grad_norm": 0.47590277067949127, + "learning_rate": 0.0001317550458170826, + "loss": 0.7941, + "step": 779 + }, + { + "epoch": 0.416, + "grad_norm": 0.3797426296738849, + "learning_rate": 0.00013159113775218964, + "loss": 0.7064, + "step": 780 + }, + { + "epoch": 0.4165333333333333, + "grad_norm": 0.4200393699368836, + "learning_rate": 0.00013142713535136414, + "loss": 0.7765, + "step": 781 + }, + { + "epoch": 0.41706666666666664, + "grad_norm": 0.434728398766787, + "learning_rate": 0.00013126303910434214, + "loss": 0.7253, + "step": 782 + }, + { + "epoch": 0.4176, + "grad_norm": 0.372873424526013, + "learning_rate": 0.00013109884950114007, + "loss": 0.6113, + "step": 783 + }, + { + "epoch": 0.41813333333333336, + "grad_norm": 0.3831484277663798, + "learning_rate": 0.00013093456703205288, + "loss": 0.6585, + "step": 784 + }, + { + "epoch": 0.4186666666666667, + "grad_norm": 0.42422980631296836, + "learning_rate": 0.00013077019218765305, + "loss": 0.7388, + "step": 785 + }, + { + "epoch": 0.4192, + "grad_norm": 0.47442795680104105, + "learning_rate": 0.00013060572545878875, + "loss": 0.7876, + "step": 786 + }, + { + "epoch": 0.41973333333333335, + "grad_norm": 0.38281098481650605, + "learning_rate": 0.0001304411673365826, + "loss": 0.6944, + "step": 787 + }, + { + "epoch": 0.4202666666666667, + "grad_norm": 0.44915982456514136, + "learning_rate": 0.0001302765183124302, + "loss": 0.7297, + "step": 788 + }, + { + "epoch": 0.4208, + "grad_norm": 0.6527029321721289, + "learning_rate": 0.00013011177887799845, + "loss": 0.8431, + "step": 789 + }, + { + "epoch": 0.42133333333333334, + "grad_norm": 0.45960250321243257, + "learning_rate": 0.00012994694952522435, + "loss": 0.8288, + "step": 790 + }, + { + "epoch": 0.42186666666666667, + "grad_norm": 0.4774546620048671, + "learning_rate": 0.00012978203074631334, + "loss": 0.7217, + "step": 791 + }, + { + "epoch": 0.4224, + "grad_norm": 0.3828219672976716, + "learning_rate": 0.00012961702303373795, + "loss": 0.693, + "step": 792 + }, + { + "epoch": 0.42293333333333333, + "grad_norm": 0.42307392782787945, + "learning_rate": 0.00012945192688023624, + "loss": 0.7434, + "step": 793 + }, + { + "epoch": 0.42346666666666666, + "grad_norm": 0.38857163031113473, + "learning_rate": 0.0001292867427788104, + "loss": 0.6445, + "step": 794 + }, + { + "epoch": 0.424, + "grad_norm": 0.3805358298818372, + "learning_rate": 0.00012912147122272523, + "loss": 0.6871, + "step": 795 + }, + { + "epoch": 0.4245333333333333, + "grad_norm": 0.3328935629550907, + "learning_rate": 0.00012895611270550666, + "loss": 0.6533, + "step": 796 + }, + { + "epoch": 0.42506666666666665, + "grad_norm": 0.3304526641680735, + "learning_rate": 0.0001287906677209403, + "loss": 0.598, + "step": 797 + }, + { + "epoch": 0.4256, + "grad_norm": 0.502932741405047, + "learning_rate": 0.00012862513676307008, + "loss": 0.7889, + "step": 798 + }, + { + "epoch": 0.4261333333333333, + "grad_norm": 0.37714530248134004, + "learning_rate": 0.0001284595203261965, + "loss": 0.7182, + "step": 799 + }, + { + "epoch": 0.4266666666666667, + "grad_norm": 0.4282676032795805, + "learning_rate": 0.00012829381890487536, + "loss": 0.8043, + "step": 800 + }, + { + "epoch": 0.4272, + "grad_norm": 0.40579295892150946, + "learning_rate": 0.00012812803299391628, + "loss": 0.7614, + "step": 801 + }, + { + "epoch": 0.42773333333333335, + "grad_norm": 0.43793201883830046, + "learning_rate": 0.00012796216308838117, + "loss": 0.7839, + "step": 802 + }, + { + "epoch": 0.4282666666666667, + "grad_norm": 0.3925544721926815, + "learning_rate": 0.00012779620968358273, + "loss": 0.7125, + "step": 803 + }, + { + "epoch": 0.4288, + "grad_norm": 0.36328082834695424, + "learning_rate": 0.00012763017327508305, + "loss": 0.658, + "step": 804 + }, + { + "epoch": 0.42933333333333334, + "grad_norm": 0.3981520390409742, + "learning_rate": 0.00012746405435869198, + "loss": 0.7413, + "step": 805 + }, + { + "epoch": 0.4298666666666667, + "grad_norm": 0.4186646716736007, + "learning_rate": 0.00012729785343046588, + "loss": 0.7108, + "step": 806 + }, + { + "epoch": 0.4304, + "grad_norm": 0.4167507640819403, + "learning_rate": 0.0001271315709867059, + "loss": 0.7129, + "step": 807 + }, + { + "epoch": 0.43093333333333333, + "grad_norm": 0.42672646152710686, + "learning_rate": 0.00012696520752395672, + "loss": 0.6867, + "step": 808 + }, + { + "epoch": 0.43146666666666667, + "grad_norm": 0.40609769487387265, + "learning_rate": 0.00012679876353900482, + "loss": 0.7522, + "step": 809 + }, + { + "epoch": 0.432, + "grad_norm": 0.40760676107973753, + "learning_rate": 0.00012663223952887723, + "loss": 0.6635, + "step": 810 + }, + { + "epoch": 0.4325333333333333, + "grad_norm": 0.4528448358687043, + "learning_rate": 0.00012646563599083996, + "loss": 0.7825, + "step": 811 + }, + { + "epoch": 0.43306666666666666, + "grad_norm": 0.40701529720953555, + "learning_rate": 0.00012629895342239643, + "loss": 0.7173, + "step": 812 + }, + { + "epoch": 0.4336, + "grad_norm": 0.4020643001179437, + "learning_rate": 0.00012613219232128608, + "loss": 0.6688, + "step": 813 + }, + { + "epoch": 0.4341333333333333, + "grad_norm": 0.40345527344678944, + "learning_rate": 0.00012596535318548289, + "loss": 0.7329, + "step": 814 + }, + { + "epoch": 0.43466666666666665, + "grad_norm": 0.4242645114052418, + "learning_rate": 0.0001257984365131938, + "loss": 0.7506, + "step": 815 + }, + { + "epoch": 0.4352, + "grad_norm": 0.38049829750202047, + "learning_rate": 0.00012563144280285741, + "loss": 0.7156, + "step": 816 + }, + { + "epoch": 0.4357333333333333, + "grad_norm": 0.38242188825243595, + "learning_rate": 0.00012546437255314222, + "loss": 0.6209, + "step": 817 + }, + { + "epoch": 0.4362666666666667, + "grad_norm": 0.4326541988919919, + "learning_rate": 0.0001252972262629454, + "loss": 0.724, + "step": 818 + }, + { + "epoch": 0.4368, + "grad_norm": 0.40387155552869813, + "learning_rate": 0.00012513000443139112, + "loss": 0.7514, + "step": 819 + }, + { + "epoch": 0.43733333333333335, + "grad_norm": 0.4213488843736418, + "learning_rate": 0.00012496270755782914, + "loss": 0.7085, + "step": 820 + }, + { + "epoch": 0.4378666666666667, + "grad_norm": 0.4219765834921627, + "learning_rate": 0.00012479533614183334, + "loss": 0.8062, + "step": 821 + }, + { + "epoch": 0.4384, + "grad_norm": 0.44626299335647784, + "learning_rate": 0.00012462789068320017, + "loss": 0.7364, + "step": 822 + }, + { + "epoch": 0.43893333333333334, + "grad_norm": 0.35917611007956723, + "learning_rate": 0.00012446037168194714, + "loss": 0.6436, + "step": 823 + }, + { + "epoch": 0.43946666666666667, + "grad_norm": 0.4284702509214677, + "learning_rate": 0.00012429277963831148, + "loss": 0.7113, + "step": 824 + }, + { + "epoch": 0.44, + "grad_norm": 0.3718477597101707, + "learning_rate": 0.00012412511505274844, + "loss": 0.6734, + "step": 825 + }, + { + "epoch": 0.44053333333333333, + "grad_norm": 0.37388387620180413, + "learning_rate": 0.00012395737842592995, + "loss": 0.6879, + "step": 826 + }, + { + "epoch": 0.44106666666666666, + "grad_norm": 0.437811789789759, + "learning_rate": 0.000123789570258743, + "loss": 0.7298, + "step": 827 + }, + { + "epoch": 0.4416, + "grad_norm": 0.3667673569730156, + "learning_rate": 0.00012362169105228826, + "loss": 0.6608, + "step": 828 + }, + { + "epoch": 0.4421333333333333, + "grad_norm": 0.38890167788437835, + "learning_rate": 0.00012345374130787854, + "loss": 0.6963, + "step": 829 + }, + { + "epoch": 0.44266666666666665, + "grad_norm": 0.3776600784418698, + "learning_rate": 0.00012328572152703725, + "loss": 0.642, + "step": 830 + }, + { + "epoch": 0.4432, + "grad_norm": 0.443445459098556, + "learning_rate": 0.000123117632211497, + "loss": 0.7899, + "step": 831 + }, + { + "epoch": 0.4437333333333333, + "grad_norm": 0.41217818516299154, + "learning_rate": 0.00012294947386319794, + "loss": 0.7527, + "step": 832 + }, + { + "epoch": 0.44426666666666664, + "grad_norm": 0.40045702787906, + "learning_rate": 0.0001227812469842864, + "loss": 0.6717, + "step": 833 + }, + { + "epoch": 0.4448, + "grad_norm": 0.4880291023513915, + "learning_rate": 0.00012261295207711346, + "loss": 0.7917, + "step": 834 + }, + { + "epoch": 0.44533333333333336, + "grad_norm": 0.3863030279047588, + "learning_rate": 0.00012244458964423327, + "loss": 0.7036, + "step": 835 + }, + { + "epoch": 0.4458666666666667, + "grad_norm": 0.38075490277545865, + "learning_rate": 0.00012227616018840154, + "loss": 0.7006, + "step": 836 + }, + { + "epoch": 0.4464, + "grad_norm": 0.4279247073163393, + "learning_rate": 0.0001221076642125742, + "loss": 0.6853, + "step": 837 + }, + { + "epoch": 0.44693333333333335, + "grad_norm": 0.4227832738664886, + "learning_rate": 0.00012193910221990581, + "loss": 0.6787, + "step": 838 + }, + { + "epoch": 0.4474666666666667, + "grad_norm": 0.42573542527244784, + "learning_rate": 0.00012177047471374807, + "loss": 0.7552, + "step": 839 + }, + { + "epoch": 0.448, + "grad_norm": 0.399384715886149, + "learning_rate": 0.00012160178219764837, + "loss": 0.6697, + "step": 840 + }, + { + "epoch": 0.44853333333333334, + "grad_norm": 0.4463188348707578, + "learning_rate": 0.0001214330251753481, + "loss": 0.7402, + "step": 841 + }, + { + "epoch": 0.44906666666666667, + "grad_norm": 0.3759968799713907, + "learning_rate": 0.00012126420415078132, + "loss": 0.6409, + "step": 842 + }, + { + "epoch": 0.4496, + "grad_norm": 0.37556405363020284, + "learning_rate": 0.00012109531962807332, + "loss": 0.7072, + "step": 843 + }, + { + "epoch": 0.45013333333333333, + "grad_norm": 0.4421198755249828, + "learning_rate": 0.00012092637211153885, + "loss": 0.7361, + "step": 844 + }, + { + "epoch": 0.45066666666666666, + "grad_norm": 0.39512134390623027, + "learning_rate": 0.0001207573621056809, + "loss": 0.6605, + "step": 845 + }, + { + "epoch": 0.4512, + "grad_norm": 0.3855507896392054, + "learning_rate": 0.00012058829011518896, + "loss": 0.6841, + "step": 846 + }, + { + "epoch": 0.4517333333333333, + "grad_norm": 0.4220550953706188, + "learning_rate": 0.00012041915664493761, + "loss": 0.6901, + "step": 847 + }, + { + "epoch": 0.45226666666666665, + "grad_norm": 0.5037067808110443, + "learning_rate": 0.00012024996219998517, + "loss": 0.7213, + "step": 848 + }, + { + "epoch": 0.4528, + "grad_norm": 0.3699300074793161, + "learning_rate": 0.00012008070728557186, + "loss": 0.6832, + "step": 849 + }, + { + "epoch": 0.4533333333333333, + "grad_norm": 0.3990881923428507, + "learning_rate": 0.00011991139240711857, + "loss": 0.6839, + "step": 850 + }, + { + "epoch": 0.45386666666666664, + "grad_norm": 0.4612287868870594, + "learning_rate": 0.00011974201807022525, + "loss": 0.7065, + "step": 851 + }, + { + "epoch": 0.4544, + "grad_norm": 0.43290972291491714, + "learning_rate": 0.00011957258478066931, + "loss": 0.7652, + "step": 852 + }, + { + "epoch": 0.45493333333333336, + "grad_norm": 0.3696005701833874, + "learning_rate": 0.00011940309304440433, + "loss": 0.6498, + "step": 853 + }, + { + "epoch": 0.4554666666666667, + "grad_norm": 0.41640436222988, + "learning_rate": 0.00011923354336755835, + "loss": 0.6839, + "step": 854 + }, + { + "epoch": 0.456, + "grad_norm": 0.41028691133947254, + "learning_rate": 0.00011906393625643244, + "loss": 0.7063, + "step": 855 + }, + { + "epoch": 0.45653333333333335, + "grad_norm": 0.3877045556479844, + "learning_rate": 0.00011889427221749916, + "loss": 0.6798, + "step": 856 + }, + { + "epoch": 0.4570666666666667, + "grad_norm": 0.3590655618247176, + "learning_rate": 0.00011872455175740112, + "loss": 0.6322, + "step": 857 + }, + { + "epoch": 0.4576, + "grad_norm": 0.4520343453078009, + "learning_rate": 0.00011855477538294935, + "loss": 0.7226, + "step": 858 + }, + { + "epoch": 0.45813333333333334, + "grad_norm": 0.41971932607213375, + "learning_rate": 0.00011838494360112185, + "loss": 0.7178, + "step": 859 + }, + { + "epoch": 0.45866666666666667, + "grad_norm": 0.3879343467070938, + "learning_rate": 0.00011821505691906216, + "loss": 0.6642, + "step": 860 + }, + { + "epoch": 0.4592, + "grad_norm": 0.4005189721475758, + "learning_rate": 0.00011804511584407763, + "loss": 0.6933, + "step": 861 + }, + { + "epoch": 0.4597333333333333, + "grad_norm": 0.42617766373244964, + "learning_rate": 0.00011787512088363817, + "loss": 0.7703, + "step": 862 + }, + { + "epoch": 0.46026666666666666, + "grad_norm": 0.41182650350301025, + "learning_rate": 0.00011770507254537453, + "loss": 0.6767, + "step": 863 + }, + { + "epoch": 0.4608, + "grad_norm": 0.40678105665865283, + "learning_rate": 0.00011753497133707679, + "loss": 0.7811, + "step": 864 + }, + { + "epoch": 0.4613333333333333, + "grad_norm": 0.37243949404933646, + "learning_rate": 0.00011736481776669306, + "loss": 0.6604, + "step": 865 + }, + { + "epoch": 0.46186666666666665, + "grad_norm": 0.3769552998407579, + "learning_rate": 0.00011719461234232764, + "loss": 0.6583, + "step": 866 + }, + { + "epoch": 0.4624, + "grad_norm": 0.4373732641135685, + "learning_rate": 0.00011702435557223987, + "loss": 0.7811, + "step": 867 + }, + { + "epoch": 0.4629333333333333, + "grad_norm": 0.425244461080885, + "learning_rate": 0.00011685404796484225, + "loss": 0.7476, + "step": 868 + }, + { + "epoch": 0.4634666666666667, + "grad_norm": 0.3876880020936677, + "learning_rate": 0.00011668369002869912, + "loss": 0.7322, + "step": 869 + }, + { + "epoch": 0.464, + "grad_norm": 0.4570638397527968, + "learning_rate": 0.00011651328227252517, + "loss": 0.7778, + "step": 870 + }, + { + "epoch": 0.46453333333333335, + "grad_norm": 0.4224961564350872, + "learning_rate": 0.00011634282520518383, + "loss": 0.7539, + "step": 871 + }, + { + "epoch": 0.4650666666666667, + "grad_norm": 0.39304313298896326, + "learning_rate": 0.00011617231933568578, + "loss": 0.7135, + "step": 872 + }, + { + "epoch": 0.4656, + "grad_norm": 0.4828825334181347, + "learning_rate": 0.00011600176517318741, + "loss": 0.7783, + "step": 873 + }, + { + "epoch": 0.46613333333333334, + "grad_norm": 0.39283858698300056, + "learning_rate": 0.00011583116322698935, + "loss": 0.7025, + "step": 874 + }, + { + "epoch": 0.4666666666666667, + "grad_norm": 0.3573495234761727, + "learning_rate": 0.00011566051400653486, + "loss": 0.6791, + "step": 875 + }, + { + "epoch": 0.4672, + "grad_norm": 0.37294525593683886, + "learning_rate": 0.00011548981802140848, + "loss": 0.6384, + "step": 876 + }, + { + "epoch": 0.46773333333333333, + "grad_norm": 0.3963777636690801, + "learning_rate": 0.00011531907578133429, + "loss": 0.6534, + "step": 877 + }, + { + "epoch": 0.46826666666666666, + "grad_norm": 0.4180680996400134, + "learning_rate": 0.00011514828779617459, + "loss": 0.7239, + "step": 878 + }, + { + "epoch": 0.4688, + "grad_norm": 0.4042374723954916, + "learning_rate": 0.00011497745457592816, + "loss": 0.6725, + "step": 879 + }, + { + "epoch": 0.4693333333333333, + "grad_norm": 0.3534423154198521, + "learning_rate": 0.00011480657663072896, + "loss": 0.601, + "step": 880 + }, + { + "epoch": 0.46986666666666665, + "grad_norm": 0.388492454037105, + "learning_rate": 0.00011463565447084445, + "loss": 0.6695, + "step": 881 + }, + { + "epoch": 0.4704, + "grad_norm": 0.44272789321840517, + "learning_rate": 0.00011446468860667421, + "loss": 0.7276, + "step": 882 + }, + { + "epoch": 0.4709333333333333, + "grad_norm": 0.3845541353315032, + "learning_rate": 0.00011429367954874819, + "loss": 0.6676, + "step": 883 + }, + { + "epoch": 0.47146666666666665, + "grad_norm": 0.36052534637825623, + "learning_rate": 0.0001141226278077254, + "loss": 0.6312, + "step": 884 + }, + { + "epoch": 0.472, + "grad_norm": 0.4274973330176012, + "learning_rate": 0.00011395153389439233, + "loss": 0.7132, + "step": 885 + }, + { + "epoch": 0.47253333333333336, + "grad_norm": 0.40812052906393176, + "learning_rate": 0.00011378039831966134, + "loss": 0.7252, + "step": 886 + }, + { + "epoch": 0.4730666666666667, + "grad_norm": 0.47381163596812353, + "learning_rate": 0.00011360922159456928, + "loss": 0.6752, + "step": 887 + }, + { + "epoch": 0.4736, + "grad_norm": 0.3911207899628515, + "learning_rate": 0.00011343800423027582, + "loss": 0.724, + "step": 888 + }, + { + "epoch": 0.47413333333333335, + "grad_norm": 0.4758575998832261, + "learning_rate": 0.00011326674673806195, + "loss": 0.7727, + "step": 889 + }, + { + "epoch": 0.4746666666666667, + "grad_norm": 0.4391072997272904, + "learning_rate": 0.00011309544962932862, + "loss": 0.7659, + "step": 890 + }, + { + "epoch": 0.4752, + "grad_norm": 0.3710229549017468, + "learning_rate": 0.0001129241134155949, + "loss": 0.6811, + "step": 891 + }, + { + "epoch": 0.47573333333333334, + "grad_norm": 0.3801864537301717, + "learning_rate": 0.00011275273860849684, + "loss": 0.6955, + "step": 892 + }, + { + "epoch": 0.47626666666666667, + "grad_norm": 0.41131982423697444, + "learning_rate": 0.00011258132571978555, + "loss": 0.7136, + "step": 893 + }, + { + "epoch": 0.4768, + "grad_norm": 0.370602754197648, + "learning_rate": 0.00011240987526132594, + "loss": 0.6703, + "step": 894 + }, + { + "epoch": 0.47733333333333333, + "grad_norm": 0.41842152101094837, + "learning_rate": 0.00011223838774509514, + "loss": 0.7199, + "step": 895 + }, + { + "epoch": 0.47786666666666666, + "grad_norm": 0.46319395319681844, + "learning_rate": 0.00011206686368318086, + "loss": 0.7653, + "step": 896 + }, + { + "epoch": 0.4784, + "grad_norm": 0.351304136769783, + "learning_rate": 0.00011189530358778005, + "loss": 0.6267, + "step": 897 + }, + { + "epoch": 0.4789333333333333, + "grad_norm": 0.3788732446079422, + "learning_rate": 0.00011172370797119712, + "loss": 0.6522, + "step": 898 + }, + { + "epoch": 0.47946666666666665, + "grad_norm": 0.4775016588998876, + "learning_rate": 0.00011155207734584263, + "loss": 0.6575, + "step": 899 + }, + { + "epoch": 0.48, + "grad_norm": 0.4205234225489854, + "learning_rate": 0.00011138041222423177, + "loss": 0.7306, + "step": 900 + }, + { + "epoch": 0.4805333333333333, + "grad_norm": 0.404234104372383, + "learning_rate": 0.00011120871311898254, + "loss": 0.7095, + "step": 901 + }, + { + "epoch": 0.48106666666666664, + "grad_norm": 0.39376381823555934, + "learning_rate": 0.0001110369805428146, + "loss": 0.6596, + "step": 902 + }, + { + "epoch": 0.4816, + "grad_norm": 0.3808123774052564, + "learning_rate": 0.00011086521500854745, + "loss": 0.6682, + "step": 903 + }, + { + "epoch": 0.48213333333333336, + "grad_norm": 0.4229128793483658, + "learning_rate": 0.0001106934170290991, + "loss": 0.7182, + "step": 904 + }, + { + "epoch": 0.4826666666666667, + "grad_norm": 0.40319828089237675, + "learning_rate": 0.00011052158711748434, + "loss": 0.7313, + "step": 905 + }, + { + "epoch": 0.4832, + "grad_norm": 0.37310361858205293, + "learning_rate": 0.00011034972578681338, + "loss": 0.6417, + "step": 906 + }, + { + "epoch": 0.48373333333333335, + "grad_norm": 0.38928550027066544, + "learning_rate": 0.00011017783355029026, + "loss": 0.6768, + "step": 907 + }, + { + "epoch": 0.4842666666666667, + "grad_norm": 0.3970654743605492, + "learning_rate": 0.00011000591092121127, + "loss": 0.6732, + "step": 908 + }, + { + "epoch": 0.4848, + "grad_norm": 0.4251433741320817, + "learning_rate": 0.00010983395841296348, + "loss": 0.7463, + "step": 909 + }, + { + "epoch": 0.48533333333333334, + "grad_norm": 0.44272744274351933, + "learning_rate": 0.0001096619765390232, + "loss": 0.7035, + "step": 910 + }, + { + "epoch": 0.48586666666666667, + "grad_norm": 0.3849067465010661, + "learning_rate": 0.00010948996581295436, + "loss": 0.6871, + "step": 911 + }, + { + "epoch": 0.4864, + "grad_norm": 0.4225859295226142, + "learning_rate": 0.00010931792674840718, + "loss": 0.7724, + "step": 912 + }, + { + "epoch": 0.48693333333333333, + "grad_norm": 0.43786770910781314, + "learning_rate": 0.00010914585985911632, + "loss": 0.7049, + "step": 913 + }, + { + "epoch": 0.48746666666666666, + "grad_norm": 0.37997218728789206, + "learning_rate": 0.00010897376565889971, + "loss": 0.6473, + "step": 914 + }, + { + "epoch": 0.488, + "grad_norm": 0.4191191877744639, + "learning_rate": 0.00010880164466165674, + "loss": 0.7154, + "step": 915 + }, + { + "epoch": 0.4885333333333333, + "grad_norm": 0.37749020920129717, + "learning_rate": 0.00010862949738136681, + "loss": 0.6718, + "step": 916 + }, + { + "epoch": 0.48906666666666665, + "grad_norm": 0.5116400364670469, + "learning_rate": 0.00010845732433208779, + "loss": 0.7458, + "step": 917 + }, + { + "epoch": 0.4896, + "grad_norm": 0.3797726500887027, + "learning_rate": 0.00010828512602795462, + "loss": 0.6239, + "step": 918 + }, + { + "epoch": 0.4901333333333333, + "grad_norm": 0.40612311974486387, + "learning_rate": 0.00010811290298317755, + "loss": 0.6561, + "step": 919 + }, + { + "epoch": 0.49066666666666664, + "grad_norm": 0.4550093082713764, + "learning_rate": 0.00010794065571204072, + "loss": 0.7613, + "step": 920 + }, + { + "epoch": 0.4912, + "grad_norm": 0.45656721520114374, + "learning_rate": 0.00010776838472890065, + "loss": 0.7069, + "step": 921 + }, + { + "epoch": 0.49173333333333336, + "grad_norm": 0.43808840491427853, + "learning_rate": 0.00010759609054818458, + "loss": 0.7258, + "step": 922 + }, + { + "epoch": 0.4922666666666667, + "grad_norm": 0.40561370160676213, + "learning_rate": 0.00010742377368438914, + "loss": 0.6875, + "step": 923 + }, + { + "epoch": 0.4928, + "grad_norm": 0.4590103093416851, + "learning_rate": 0.00010725143465207867, + "loss": 0.7326, + "step": 924 + }, + { + "epoch": 0.49333333333333335, + "grad_norm": 0.40124825398399744, + "learning_rate": 0.00010707907396588361, + "loss": 0.695, + "step": 925 + }, + { + "epoch": 0.4938666666666667, + "grad_norm": 0.3936243245443685, + "learning_rate": 0.0001069066921404992, + "loss": 0.6688, + "step": 926 + }, + { + "epoch": 0.4944, + "grad_norm": 0.3870139727112456, + "learning_rate": 0.00010673428969068364, + "loss": 0.673, + "step": 927 + }, + { + "epoch": 0.49493333333333334, + "grad_norm": 0.44462372212712714, + "learning_rate": 0.00010656186713125689, + "loss": 0.7599, + "step": 928 + }, + { + "epoch": 0.49546666666666667, + "grad_norm": 0.3956370931937604, + "learning_rate": 0.0001063894249770989, + "loss": 0.6413, + "step": 929 + }, + { + "epoch": 0.496, + "grad_norm": 0.3996632685907131, + "learning_rate": 0.00010621696374314807, + "loss": 0.6835, + "step": 930 + }, + { + "epoch": 0.4965333333333333, + "grad_norm": 0.4068104029537071, + "learning_rate": 0.00010604448394439983, + "loss": 0.7157, + "step": 931 + }, + { + "epoch": 0.49706666666666666, + "grad_norm": 0.42322336691701257, + "learning_rate": 0.00010587198609590505, + "loss": 0.7218, + "step": 932 + }, + { + "epoch": 0.4976, + "grad_norm": 0.4395433933653365, + "learning_rate": 0.00010569947071276847, + "loss": 0.6782, + "step": 933 + }, + { + "epoch": 0.4981333333333333, + "grad_norm": 0.38059636946312014, + "learning_rate": 0.00010552693831014726, + "loss": 0.6609, + "step": 934 + }, + { + "epoch": 0.49866666666666665, + "grad_norm": 0.39326972293461676, + "learning_rate": 0.0001053543894032493, + "loss": 0.6654, + "step": 935 + }, + { + "epoch": 0.4992, + "grad_norm": 0.3589359665208239, + "learning_rate": 0.00010518182450733186, + "loss": 0.6738, + "step": 936 + }, + { + "epoch": 0.4997333333333333, + "grad_norm": 0.7439200515520827, + "learning_rate": 0.00010500924413769988, + "loss": 0.747, + "step": 937 + }, + { + "epoch": 0.5002666666666666, + "grad_norm": 0.40245284090630545, + "learning_rate": 0.00010483664880970457, + "loss": 0.6778, + "step": 938 + }, + { + "epoch": 0.5008, + "grad_norm": 0.38695190004885227, + "learning_rate": 0.00010466403903874176, + "loss": 0.6907, + "step": 939 + }, + { + "epoch": 0.5013333333333333, + "grad_norm": 0.41700901306124494, + "learning_rate": 0.00010449141534025045, + "loss": 0.6924, + "step": 940 + }, + { + "epoch": 0.5018666666666667, + "grad_norm": 0.4322749113351211, + "learning_rate": 0.00010431877822971117, + "loss": 0.6861, + "step": 941 + }, + { + "epoch": 0.5024, + "grad_norm": 0.40759648053081476, + "learning_rate": 0.00010414612822264455, + "loss": 0.6973, + "step": 942 + }, + { + "epoch": 0.5029333333333333, + "grad_norm": 0.4064060370392942, + "learning_rate": 0.00010397346583460971, + "loss": 0.6783, + "step": 943 + }, + { + "epoch": 0.5034666666666666, + "grad_norm": 0.38600386952418253, + "learning_rate": 0.0001038007915812028, + "loss": 0.6561, + "step": 944 + }, + { + "epoch": 0.504, + "grad_norm": 0.390818005507248, + "learning_rate": 0.00010362810597805526, + "loss": 0.6918, + "step": 945 + }, + { + "epoch": 0.5045333333333333, + "grad_norm": 0.41219389096047593, + "learning_rate": 0.0001034554095408326, + "loss": 0.6661, + "step": 946 + }, + { + "epoch": 0.5050666666666667, + "grad_norm": 0.3733957810465592, + "learning_rate": 0.00010328270278523256, + "loss": 0.6712, + "step": 947 + }, + { + "epoch": 0.5056, + "grad_norm": 0.4108558643674988, + "learning_rate": 0.0001031099862269837, + "loss": 0.715, + "step": 948 + }, + { + "epoch": 0.5061333333333333, + "grad_norm": 0.3971854201780952, + "learning_rate": 0.00010293726038184393, + "loss": 0.7098, + "step": 949 + }, + { + "epoch": 0.5066666666666667, + "grad_norm": 0.43214348210498155, + "learning_rate": 0.00010276452576559879, + "loss": 0.697, + "step": 950 + }, + { + "epoch": 0.5072, + "grad_norm": 0.43256766403814767, + "learning_rate": 0.00010259178289406011, + "loss": 0.7776, + "step": 951 + }, + { + "epoch": 0.5077333333333334, + "grad_norm": 0.4031860596371652, + "learning_rate": 0.00010241903228306431, + "loss": 0.6837, + "step": 952 + }, + { + "epoch": 0.5082666666666666, + "grad_norm": 0.373606372821283, + "learning_rate": 0.0001022462744484709, + "loss": 0.6526, + "step": 953 + }, + { + "epoch": 0.5088, + "grad_norm": 0.3609247548304206, + "learning_rate": 0.00010207350990616107, + "loss": 0.6742, + "step": 954 + }, + { + "epoch": 0.5093333333333333, + "grad_norm": 0.36064261171581113, + "learning_rate": 0.00010190073917203589, + "loss": 0.6409, + "step": 955 + }, + { + "epoch": 0.5098666666666667, + "grad_norm": 0.4271338586204693, + "learning_rate": 0.00010172796276201503, + "loss": 0.7081, + "step": 956 + }, + { + "epoch": 0.5104, + "grad_norm": 0.4337110973339837, + "learning_rate": 0.0001015551811920351, + "loss": 0.6522, + "step": 957 + }, + { + "epoch": 0.5109333333333334, + "grad_norm": 0.4067268385500411, + "learning_rate": 0.00010138239497804804, + "loss": 0.6979, + "step": 958 + }, + { + "epoch": 0.5114666666666666, + "grad_norm": 0.422313982241205, + "learning_rate": 0.00010120960463601976, + "loss": 0.7357, + "step": 959 + }, + { + "epoch": 0.512, + "grad_norm": 0.4283993469438326, + "learning_rate": 0.00010103681068192845, + "loss": 0.7276, + "step": 960 + }, + { + "epoch": 0.5125333333333333, + "grad_norm": 0.35585101619802095, + "learning_rate": 0.00010086401363176305, + "loss": 0.6589, + "step": 961 + }, + { + "epoch": 0.5130666666666667, + "grad_norm": 0.3885030459699722, + "learning_rate": 0.00010069121400152181, + "loss": 0.6157, + "step": 962 + }, + { + "epoch": 0.5136, + "grad_norm": 0.4611681571492855, + "learning_rate": 0.00010051841230721065, + "loss": 0.7334, + "step": 963 + }, + { + "epoch": 0.5141333333333333, + "grad_norm": 0.3789298367290173, + "learning_rate": 0.0001003456090648416, + "loss": 0.6339, + "step": 964 + }, + { + "epoch": 0.5146666666666667, + "grad_norm": 0.3955283433599766, + "learning_rate": 0.00010017280479043147, + "loss": 0.6829, + "step": 965 + }, + { + "epoch": 0.5152, + "grad_norm": 0.3828712806641332, + "learning_rate": 0.0001, + "loss": 0.6907, + "step": 966 + }, + { + "epoch": 0.5157333333333334, + "grad_norm": 0.44360063787426984, + "learning_rate": 9.982719520956855e-05, + "loss": 0.7593, + "step": 967 + }, + { + "epoch": 0.5162666666666667, + "grad_norm": 0.41959626066678757, + "learning_rate": 9.965439093515841e-05, + "loss": 0.6632, + "step": 968 + }, + { + "epoch": 0.5168, + "grad_norm": 0.4178887763089197, + "learning_rate": 9.948158769278939e-05, + "loss": 0.7088, + "step": 969 + }, + { + "epoch": 0.5173333333333333, + "grad_norm": 0.4273794934405812, + "learning_rate": 9.930878599847821e-05, + "loss": 0.6826, + "step": 970 + }, + { + "epoch": 0.5178666666666667, + "grad_norm": 0.44613303752223565, + "learning_rate": 9.913598636823693e-05, + "loss": 0.6792, + "step": 971 + }, + { + "epoch": 0.5184, + "grad_norm": 0.41113502584267975, + "learning_rate": 9.896318931807155e-05, + "loss": 0.6822, + "step": 972 + }, + { + "epoch": 0.5189333333333334, + "grad_norm": 0.40633883955027034, + "learning_rate": 9.879039536398024e-05, + "loss": 0.7026, + "step": 973 + }, + { + "epoch": 0.5194666666666666, + "grad_norm": 0.4383419652470069, + "learning_rate": 9.861760502195197e-05, + "loss": 0.7594, + "step": 974 + }, + { + "epoch": 0.52, + "grad_norm": 0.4428434561270538, + "learning_rate": 9.844481880796491e-05, + "loss": 0.7308, + "step": 975 + }, + { + "epoch": 0.5205333333333333, + "grad_norm": 0.3884338091203555, + "learning_rate": 9.827203723798498e-05, + "loss": 0.6943, + "step": 976 + }, + { + "epoch": 0.5210666666666667, + "grad_norm": 0.37470666307947165, + "learning_rate": 9.809926082796415e-05, + "loss": 0.6707, + "step": 977 + }, + { + "epoch": 0.5216, + "grad_norm": 0.41801931443942375, + "learning_rate": 9.792649009383899e-05, + "loss": 0.6353, + "step": 978 + }, + { + "epoch": 0.5221333333333333, + "grad_norm": 0.40988747215255444, + "learning_rate": 9.775372555152912e-05, + "loss": 0.7005, + "step": 979 + }, + { + "epoch": 0.5226666666666666, + "grad_norm": 0.39827425654353676, + "learning_rate": 9.758096771693573e-05, + "loss": 0.6945, + "step": 980 + }, + { + "epoch": 0.5232, + "grad_norm": 0.4558415624953074, + "learning_rate": 9.740821710593989e-05, + "loss": 0.7509, + "step": 981 + }, + { + "epoch": 0.5237333333333334, + "grad_norm": 0.3526514170147418, + "learning_rate": 9.723547423440122e-05, + "loss": 0.6492, + "step": 982 + }, + { + "epoch": 0.5242666666666667, + "grad_norm": 0.4579002707791232, + "learning_rate": 9.70627396181561e-05, + "loss": 0.7733, + "step": 983 + }, + { + "epoch": 0.5248, + "grad_norm": 0.40582604777311926, + "learning_rate": 9.689001377301633e-05, + "loss": 0.7124, + "step": 984 + }, + { + "epoch": 0.5253333333333333, + "grad_norm": 0.4418290278290973, + "learning_rate": 9.671729721476746e-05, + "loss": 0.7811, + "step": 985 + }, + { + "epoch": 0.5258666666666667, + "grad_norm": 0.38216328313521813, + "learning_rate": 9.654459045916743e-05, + "loss": 0.6807, + "step": 986 + }, + { + "epoch": 0.5264, + "grad_norm": 0.3801398406592987, + "learning_rate": 9.637189402194476e-05, + "loss": 0.7571, + "step": 987 + }, + { + "epoch": 0.5269333333333334, + "grad_norm": 0.36203756917696256, + "learning_rate": 9.619920841879725e-05, + "loss": 0.6301, + "step": 988 + }, + { + "epoch": 0.5274666666666666, + "grad_norm": 0.4068819481440644, + "learning_rate": 9.602653416539031e-05, + "loss": 0.6912, + "step": 989 + }, + { + "epoch": 0.528, + "grad_norm": 0.4427916418611001, + "learning_rate": 9.585387177735547e-05, + "loss": 0.7901, + "step": 990 + }, + { + "epoch": 0.5285333333333333, + "grad_norm": 0.36702428866772624, + "learning_rate": 9.568122177028884e-05, + "loss": 0.646, + "step": 991 + }, + { + "epoch": 0.5290666666666667, + "grad_norm": 0.39937231114065663, + "learning_rate": 9.550858465974958e-05, + "loss": 0.7289, + "step": 992 + }, + { + "epoch": 0.5296, + "grad_norm": 0.42816775455491346, + "learning_rate": 9.533596096125825e-05, + "loss": 0.7307, + "step": 993 + }, + { + "epoch": 0.5301333333333333, + "grad_norm": 0.38077046269227127, + "learning_rate": 9.516335119029546e-05, + "loss": 0.6121, + "step": 994 + }, + { + "epoch": 0.5306666666666666, + "grad_norm": 0.3923238270995547, + "learning_rate": 9.499075586230013e-05, + "loss": 0.6923, + "step": 995 + }, + { + "epoch": 0.5312, + "grad_norm": 0.36767549701070884, + "learning_rate": 9.481817549266817e-05, + "loss": 0.6722, + "step": 996 + }, + { + "epoch": 0.5317333333333333, + "grad_norm": 0.36190059027736843, + "learning_rate": 9.464561059675073e-05, + "loss": 0.6385, + "step": 997 + }, + { + "epoch": 0.5322666666666667, + "grad_norm": 0.3856851764676648, + "learning_rate": 9.44730616898528e-05, + "loss": 0.7118, + "step": 998 + }, + { + "epoch": 0.5328, + "grad_norm": 0.37045340896227735, + "learning_rate": 9.430052928723153e-05, + "loss": 0.6541, + "step": 999 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 0.4018945451754231, + "learning_rate": 9.412801390409497e-05, + "loss": 0.707, + "step": 1000 + }, + { + "epoch": 0.5338666666666667, + "grad_norm": 0.37834413149463314, + "learning_rate": 9.395551605560018e-05, + "loss": 0.7148, + "step": 1001 + }, + { + "epoch": 0.5344, + "grad_norm": 0.40264628353900195, + "learning_rate": 9.378303625685195e-05, + "loss": 0.7668, + "step": 1002 + }, + { + "epoch": 0.5349333333333334, + "grad_norm": 0.3813446944853318, + "learning_rate": 9.361057502290113e-05, + "loss": 0.6816, + "step": 1003 + }, + { + "epoch": 0.5354666666666666, + "grad_norm": 0.45300385140327293, + "learning_rate": 9.343813286874312e-05, + "loss": 0.695, + "step": 1004 + }, + { + "epoch": 0.536, + "grad_norm": 0.4318609221754859, + "learning_rate": 9.326571030931637e-05, + "loss": 0.7173, + "step": 1005 + }, + { + "epoch": 0.5365333333333333, + "grad_norm": 0.3818750659761753, + "learning_rate": 9.309330785950086e-05, + "loss": 0.707, + "step": 1006 + }, + { + "epoch": 0.5370666666666667, + "grad_norm": 0.41832051824530075, + "learning_rate": 9.292092603411641e-05, + "loss": 0.67, + "step": 1007 + }, + { + "epoch": 0.5376, + "grad_norm": 0.47118792133565957, + "learning_rate": 9.274856534792138e-05, + "loss": 0.815, + "step": 1008 + }, + { + "epoch": 0.5381333333333334, + "grad_norm": 0.45440639362174423, + "learning_rate": 9.257622631561085e-05, + "loss": 0.8078, + "step": 1009 + }, + { + "epoch": 0.5386666666666666, + "grad_norm": 0.41139339937931796, + "learning_rate": 9.240390945181543e-05, + "loss": 0.6589, + "step": 1010 + }, + { + "epoch": 0.5392, + "grad_norm": 0.4242877542971082, + "learning_rate": 9.223161527109937e-05, + "loss": 0.6894, + "step": 1011 + }, + { + "epoch": 0.5397333333333333, + "grad_norm": 0.4122314626824244, + "learning_rate": 9.205934428795929e-05, + "loss": 0.6659, + "step": 1012 + }, + { + "epoch": 0.5402666666666667, + "grad_norm": 0.3942405704639682, + "learning_rate": 9.188709701682247e-05, + "loss": 0.6372, + "step": 1013 + }, + { + "epoch": 0.5408, + "grad_norm": 0.48786014425422236, + "learning_rate": 9.171487397204539e-05, + "loss": 0.758, + "step": 1014 + }, + { + "epoch": 0.5413333333333333, + "grad_norm": 0.4586009322004272, + "learning_rate": 9.154267566791223e-05, + "loss": 0.7957, + "step": 1015 + }, + { + "epoch": 0.5418666666666667, + "grad_norm": 0.3481292378201403, + "learning_rate": 9.137050261863324e-05, + "loss": 0.6301, + "step": 1016 + }, + { + "epoch": 0.5424, + "grad_norm": 0.4261357442235494, + "learning_rate": 9.119835533834331e-05, + "loss": 0.6973, + "step": 1017 + }, + { + "epoch": 0.5429333333333334, + "grad_norm": 0.43015655288662064, + "learning_rate": 9.102623434110028e-05, + "loss": 0.6912, + "step": 1018 + }, + { + "epoch": 0.5434666666666667, + "grad_norm": 0.44261951636802793, + "learning_rate": 9.085414014088369e-05, + "loss": 0.7114, + "step": 1019 + }, + { + "epoch": 0.544, + "grad_norm": 0.39275232774805957, + "learning_rate": 9.068207325159284e-05, + "loss": 0.7018, + "step": 1020 + }, + { + "epoch": 0.5445333333333333, + "grad_norm": 0.36966361537506054, + "learning_rate": 9.051003418704565e-05, + "loss": 0.7008, + "step": 1021 + }, + { + "epoch": 0.5450666666666667, + "grad_norm": 0.39882434045775006, + "learning_rate": 9.033802346097682e-05, + "loss": 0.6969, + "step": 1022 + }, + { + "epoch": 0.5456, + "grad_norm": 0.4307931222905577, + "learning_rate": 9.016604158703654e-05, + "loss": 0.7471, + "step": 1023 + }, + { + "epoch": 0.5461333333333334, + "grad_norm": 0.412152370318337, + "learning_rate": 8.999408907878877e-05, + "loss": 0.7388, + "step": 1024 + }, + { + "epoch": 0.5466666666666666, + "grad_norm": 0.44841521091369047, + "learning_rate": 8.982216644970979e-05, + "loss": 0.7133, + "step": 1025 + }, + { + "epoch": 0.5472, + "grad_norm": 0.38746110189562677, + "learning_rate": 8.965027421318665e-05, + "loss": 0.6278, + "step": 1026 + }, + { + "epoch": 0.5477333333333333, + "grad_norm": 0.3642761748368395, + "learning_rate": 8.947841288251568e-05, + "loss": 0.6477, + "step": 1027 + }, + { + "epoch": 0.5482666666666667, + "grad_norm": 0.38558497241363154, + "learning_rate": 8.930658297090091e-05, + "loss": 0.6808, + "step": 1028 + }, + { + "epoch": 0.5488, + "grad_norm": 0.42269928725720274, + "learning_rate": 8.913478499145254e-05, + "loss": 0.6766, + "step": 1029 + }, + { + "epoch": 0.5493333333333333, + "grad_norm": 0.4684943238768462, + "learning_rate": 8.896301945718541e-05, + "loss": 0.714, + "step": 1030 + }, + { + "epoch": 0.5498666666666666, + "grad_norm": 0.45473212582450806, + "learning_rate": 8.879128688101749e-05, + "loss": 0.7519, + "step": 1031 + }, + { + "epoch": 0.5504, + "grad_norm": 0.38370641760493457, + "learning_rate": 8.861958777576827e-05, + "loss": 0.6784, + "step": 1032 + }, + { + "epoch": 0.5509333333333334, + "grad_norm": 0.4446554193607567, + "learning_rate": 8.844792265415738e-05, + "loss": 0.7339, + "step": 1033 + }, + { + "epoch": 0.5514666666666667, + "grad_norm": 0.5021184919345888, + "learning_rate": 8.827629202880293e-05, + "loss": 0.701, + "step": 1034 + }, + { + "epoch": 0.552, + "grad_norm": 0.37286008172473245, + "learning_rate": 8.810469641222001e-05, + "loss": 0.6382, + "step": 1035 + }, + { + "epoch": 0.5525333333333333, + "grad_norm": 0.3898261852650365, + "learning_rate": 8.793313631681915e-05, + "loss": 0.6524, + "step": 1036 + }, + { + "epoch": 0.5530666666666667, + "grad_norm": 0.39075418110480387, + "learning_rate": 8.776161225490489e-05, + "loss": 0.6586, + "step": 1037 + }, + { + "epoch": 0.5536, + "grad_norm": 0.34568309289503596, + "learning_rate": 8.759012473867407e-05, + "loss": 0.5977, + "step": 1038 + }, + { + "epoch": 0.5541333333333334, + "grad_norm": 0.37554068268681406, + "learning_rate": 8.741867428021446e-05, + "loss": 0.7089, + "step": 1039 + }, + { + "epoch": 0.5546666666666666, + "grad_norm": 0.4318235268052963, + "learning_rate": 8.724726139150318e-05, + "loss": 0.7457, + "step": 1040 + }, + { + "epoch": 0.5552, + "grad_norm": 0.41952136163111886, + "learning_rate": 8.707588658440511e-05, + "loss": 0.6684, + "step": 1041 + }, + { + "epoch": 0.5557333333333333, + "grad_norm": 0.3535850500532471, + "learning_rate": 8.690455037067141e-05, + "loss": 0.6302, + "step": 1042 + }, + { + "epoch": 0.5562666666666667, + "grad_norm": 0.40140318453092033, + "learning_rate": 8.673325326193806e-05, + "loss": 0.667, + "step": 1043 + }, + { + "epoch": 0.5568, + "grad_norm": 0.4126501746046163, + "learning_rate": 8.656199576972423e-05, + "loss": 0.7622, + "step": 1044 + }, + { + "epoch": 0.5573333333333333, + "grad_norm": 0.36902047840591456, + "learning_rate": 8.639077840543077e-05, + "loss": 0.6748, + "step": 1045 + }, + { + "epoch": 0.5578666666666666, + "grad_norm": 0.4218383182542466, + "learning_rate": 8.621960168033867e-05, + "loss": 0.6723, + "step": 1046 + }, + { + "epoch": 0.5584, + "grad_norm": 0.4803868703274385, + "learning_rate": 8.604846610560771e-05, + "loss": 0.6888, + "step": 1047 + }, + { + "epoch": 0.5589333333333333, + "grad_norm": 0.41499017197056104, + "learning_rate": 8.587737219227462e-05, + "loss": 0.6625, + "step": 1048 + }, + { + "epoch": 0.5594666666666667, + "grad_norm": 0.4758356386087854, + "learning_rate": 8.570632045125185e-05, + "loss": 0.691, + "step": 1049 + }, + { + "epoch": 0.56, + "grad_norm": 0.3575237560348443, + "learning_rate": 8.553531139332582e-05, + "loss": 0.683, + "step": 1050 + }, + { + "epoch": 0.5605333333333333, + "grad_norm": 0.4191047803710979, + "learning_rate": 8.536434552915556e-05, + "loss": 0.7114, + "step": 1051 + }, + { + "epoch": 0.5610666666666667, + "grad_norm": 0.4416907587765159, + "learning_rate": 8.519342336927105e-05, + "loss": 0.7463, + "step": 1052 + }, + { + "epoch": 0.5616, + "grad_norm": 0.45971802278763857, + "learning_rate": 8.502254542407186e-05, + "loss": 0.7283, + "step": 1053 + }, + { + "epoch": 0.5621333333333334, + "grad_norm": 0.39686142791469187, + "learning_rate": 8.485171220382545e-05, + "loss": 0.693, + "step": 1054 + }, + { + "epoch": 0.5626666666666666, + "grad_norm": 0.40324297018751654, + "learning_rate": 8.468092421866573e-05, + "loss": 0.6466, + "step": 1055 + }, + { + "epoch": 0.5632, + "grad_norm": 0.40873675796188375, + "learning_rate": 8.451018197859153e-05, + "loss": 0.679, + "step": 1056 + }, + { + "epoch": 0.5637333333333333, + "grad_norm": 0.3648416970361638, + "learning_rate": 8.433948599346516e-05, + "loss": 0.6645, + "step": 1057 + }, + { + "epoch": 0.5642666666666667, + "grad_norm": 0.4037285077267891, + "learning_rate": 8.416883677301069e-05, + "loss": 0.6495, + "step": 1058 + }, + { + "epoch": 0.5648, + "grad_norm": 0.36521119192743473, + "learning_rate": 8.399823482681262e-05, + "loss": 0.6472, + "step": 1059 + }, + { + "epoch": 0.5653333333333334, + "grad_norm": 0.4243778904952702, + "learning_rate": 8.382768066431425e-05, + "loss": 0.6567, + "step": 1060 + }, + { + "epoch": 0.5658666666666666, + "grad_norm": 0.42523133489332654, + "learning_rate": 8.36571747948162e-05, + "loss": 0.733, + "step": 1061 + }, + { + "epoch": 0.5664, + "grad_norm": 0.415875245008936, + "learning_rate": 8.348671772747487e-05, + "loss": 0.6837, + "step": 1062 + }, + { + "epoch": 0.5669333333333333, + "grad_norm": 0.40115003336266325, + "learning_rate": 8.33163099713009e-05, + "loss": 0.6847, + "step": 1063 + }, + { + "epoch": 0.5674666666666667, + "grad_norm": 0.32881616896424515, + "learning_rate": 8.31459520351578e-05, + "loss": 0.6213, + "step": 1064 + }, + { + "epoch": 0.568, + "grad_norm": 0.4275860836489642, + "learning_rate": 8.297564442776014e-05, + "loss": 0.6188, + "step": 1065 + }, + { + "epoch": 0.5685333333333333, + "grad_norm": 0.3859053081623988, + "learning_rate": 8.280538765767235e-05, + "loss": 0.6332, + "step": 1066 + }, + { + "epoch": 0.5690666666666667, + "grad_norm": 0.412578914524153, + "learning_rate": 8.263518223330697e-05, + "loss": 0.6908, + "step": 1067 + }, + { + "epoch": 0.5696, + "grad_norm": 0.375431620235667, + "learning_rate": 8.246502866292324e-05, + "loss": 0.6565, + "step": 1068 + }, + { + "epoch": 0.5701333333333334, + "grad_norm": 0.3890627334157619, + "learning_rate": 8.22949274546255e-05, + "loss": 0.6683, + "step": 1069 + }, + { + "epoch": 0.5706666666666667, + "grad_norm": 0.4362939883861216, + "learning_rate": 8.212487911636184e-05, + "loss": 0.6673, + "step": 1070 + }, + { + "epoch": 0.5712, + "grad_norm": 0.3848179229246365, + "learning_rate": 8.195488415592238e-05, + "loss": 0.6791, + "step": 1071 + }, + { + "epoch": 0.5717333333333333, + "grad_norm": 0.4161599348707075, + "learning_rate": 8.178494308093789e-05, + "loss": 0.6738, + "step": 1072 + }, + { + "epoch": 0.5722666666666667, + "grad_norm": 0.39481859626708815, + "learning_rate": 8.161505639887817e-05, + "loss": 0.6776, + "step": 1073 + }, + { + "epoch": 0.5728, + "grad_norm": 0.4790895542819692, + "learning_rate": 8.144522461705067e-05, + "loss": 0.7418, + "step": 1074 + }, + { + "epoch": 0.5733333333333334, + "grad_norm": 0.38934276938196133, + "learning_rate": 8.127544824259889e-05, + "loss": 0.6913, + "step": 1075 + }, + { + "epoch": 0.5738666666666666, + "grad_norm": 0.3996181990520993, + "learning_rate": 8.110572778250085e-05, + "loss": 0.69, + "step": 1076 + }, + { + "epoch": 0.5744, + "grad_norm": 0.4016846160311755, + "learning_rate": 8.093606374356759e-05, + "loss": 0.7222, + "step": 1077 + }, + { + "epoch": 0.5749333333333333, + "grad_norm": 0.40645679452658495, + "learning_rate": 8.076645663244168e-05, + "loss": 0.6975, + "step": 1078 + }, + { + "epoch": 0.5754666666666667, + "grad_norm": 0.3821026612501412, + "learning_rate": 8.059690695559568e-05, + "loss": 0.6751, + "step": 1079 + }, + { + "epoch": 0.576, + "grad_norm": 0.43526932412856223, + "learning_rate": 8.042741521933071e-05, + "loss": 0.6802, + "step": 1080 + }, + { + "epoch": 0.5765333333333333, + "grad_norm": 0.47893887272111735, + "learning_rate": 8.025798192977481e-05, + "loss": 0.7243, + "step": 1081 + }, + { + "epoch": 0.5770666666666666, + "grad_norm": 0.4016805709957415, + "learning_rate": 8.008860759288147e-05, + "loss": 0.6123, + "step": 1082 + }, + { + "epoch": 0.5776, + "grad_norm": 0.3790105526008244, + "learning_rate": 7.991929271442817e-05, + "loss": 0.6645, + "step": 1083 + }, + { + "epoch": 0.5781333333333334, + "grad_norm": 0.4235556005744978, + "learning_rate": 7.975003780001485e-05, + "loss": 0.6937, + "step": 1084 + }, + { + "epoch": 0.5786666666666667, + "grad_norm": 0.3929143639351382, + "learning_rate": 7.958084335506239e-05, + "loss": 0.6306, + "step": 1085 + }, + { + "epoch": 0.5792, + "grad_norm": 0.4817146393427374, + "learning_rate": 7.941170988481108e-05, + "loss": 0.8412, + "step": 1086 + }, + { + "epoch": 0.5797333333333333, + "grad_norm": 0.41991508462058535, + "learning_rate": 7.924263789431912e-05, + "loss": 0.6699, + "step": 1087 + }, + { + "epoch": 0.5802666666666667, + "grad_norm": 0.3680084098297583, + "learning_rate": 7.907362788846116e-05, + "loss": 0.6181, + "step": 1088 + }, + { + "epoch": 0.5808, + "grad_norm": 0.36386174429490653, + "learning_rate": 7.89046803719267e-05, + "loss": 0.619, + "step": 1089 + }, + { + "epoch": 0.5813333333333334, + "grad_norm": 0.4016558656284791, + "learning_rate": 7.873579584921869e-05, + "loss": 0.6795, + "step": 1090 + }, + { + "epoch": 0.5818666666666666, + "grad_norm": 0.37460452727953286, + "learning_rate": 7.856697482465196e-05, + "loss": 0.6657, + "step": 1091 + }, + { + "epoch": 0.5824, + "grad_norm": 0.38628545562440836, + "learning_rate": 7.839821780235168e-05, + "loss": 0.6926, + "step": 1092 + }, + { + "epoch": 0.5829333333333333, + "grad_norm": 0.35647816512228697, + "learning_rate": 7.822952528625191e-05, + "loss": 0.6465, + "step": 1093 + }, + { + "epoch": 0.5834666666666667, + "grad_norm": 0.3980489861012932, + "learning_rate": 7.806089778009421e-05, + "loss": 0.6853, + "step": 1094 + }, + { + "epoch": 0.584, + "grad_norm": 0.38071801407450945, + "learning_rate": 7.789233578742582e-05, + "loss": 0.6694, + "step": 1095 + }, + { + "epoch": 0.5845333333333333, + "grad_norm": 0.4008725472437602, + "learning_rate": 7.772383981159849e-05, + "loss": 0.7112, + "step": 1096 + }, + { + "epoch": 0.5850666666666666, + "grad_norm": 0.37430962526654193, + "learning_rate": 7.755541035576677e-05, + "loss": 0.6208, + "step": 1097 + }, + { + "epoch": 0.5856, + "grad_norm": 0.39821374623112504, + "learning_rate": 7.738704792288655e-05, + "loss": 0.6364, + "step": 1098 + }, + { + "epoch": 0.5861333333333333, + "grad_norm": 0.4234024743538159, + "learning_rate": 7.721875301571359e-05, + "loss": 0.7342, + "step": 1099 + }, + { + "epoch": 0.5866666666666667, + "grad_norm": 0.38635466588685263, + "learning_rate": 7.705052613680211e-05, + "loss": 0.6606, + "step": 1100 + }, + { + "epoch": 0.5872, + "grad_norm": 0.4640882179165805, + "learning_rate": 7.688236778850306e-05, + "loss": 0.732, + "step": 1101 + }, + { + "epoch": 0.5877333333333333, + "grad_norm": 0.3587389095046982, + "learning_rate": 7.671427847296275e-05, + "loss": 0.6048, + "step": 1102 + }, + { + "epoch": 0.5882666666666667, + "grad_norm": 0.38280570307884154, + "learning_rate": 7.654625869212146e-05, + "loss": 0.6857, + "step": 1103 + }, + { + "epoch": 0.5888, + "grad_norm": 0.385190815200957, + "learning_rate": 7.637830894771175e-05, + "loss": 0.6507, + "step": 1104 + }, + { + "epoch": 0.5893333333333334, + "grad_norm": 0.39172766294738226, + "learning_rate": 7.6210429741257e-05, + "loss": 0.6592, + "step": 1105 + }, + { + "epoch": 0.5898666666666667, + "grad_norm": 0.4343325420045918, + "learning_rate": 7.604262157407007e-05, + "loss": 0.6245, + "step": 1106 + }, + { + "epoch": 0.5904, + "grad_norm": 0.3795685669434547, + "learning_rate": 7.587488494725157e-05, + "loss": 0.6433, + "step": 1107 + }, + { + "epoch": 0.5909333333333333, + "grad_norm": 0.46458247504413547, + "learning_rate": 7.570722036168854e-05, + "loss": 0.7326, + "step": 1108 + }, + { + "epoch": 0.5914666666666667, + "grad_norm": 0.35247844174059123, + "learning_rate": 7.55396283180529e-05, + "loss": 0.5951, + "step": 1109 + }, + { + "epoch": 0.592, + "grad_norm": 0.35314968631174554, + "learning_rate": 7.537210931679987e-05, + "loss": 0.604, + "step": 1110 + }, + { + "epoch": 0.5925333333333334, + "grad_norm": 0.397353578219303, + "learning_rate": 7.520466385816671e-05, + "loss": 0.6967, + "step": 1111 + }, + { + "epoch": 0.5930666666666666, + "grad_norm": 0.5232148536867158, + "learning_rate": 7.503729244217086e-05, + "loss": 0.8047, + "step": 1112 + }, + { + "epoch": 0.5936, + "grad_norm": 0.35913464570859693, + "learning_rate": 7.48699955686089e-05, + "loss": 0.6145, + "step": 1113 + }, + { + "epoch": 0.5941333333333333, + "grad_norm": 0.38235743410072365, + "learning_rate": 7.470277373705461e-05, + "loss": 0.6406, + "step": 1114 + }, + { + "epoch": 0.5946666666666667, + "grad_norm": 0.41386567999451707, + "learning_rate": 7.453562744685778e-05, + "loss": 0.7543, + "step": 1115 + }, + { + "epoch": 0.5952, + "grad_norm": 0.3751622819161369, + "learning_rate": 7.43685571971426e-05, + "loss": 0.705, + "step": 1116 + }, + { + "epoch": 0.5957333333333333, + "grad_norm": 0.43269323278603933, + "learning_rate": 7.42015634868062e-05, + "loss": 0.7224, + "step": 1117 + }, + { + "epoch": 0.5962666666666666, + "grad_norm": 0.38350270002476794, + "learning_rate": 7.403464681451715e-05, + "loss": 0.6654, + "step": 1118 + }, + { + "epoch": 0.5968, + "grad_norm": 0.39103904708593545, + "learning_rate": 7.386780767871397e-05, + "loss": 0.6514, + "step": 1119 + }, + { + "epoch": 0.5973333333333334, + "grad_norm": 0.3750082436334888, + "learning_rate": 7.370104657760361e-05, + "loss": 0.6194, + "step": 1120 + }, + { + "epoch": 0.5978666666666667, + "grad_norm": 0.44970376864398026, + "learning_rate": 7.353436400916004e-05, + "loss": 0.7271, + "step": 1121 + }, + { + "epoch": 0.5984, + "grad_norm": 0.4957667572030496, + "learning_rate": 7.336776047112276e-05, + "loss": 0.7052, + "step": 1122 + }, + { + "epoch": 0.5989333333333333, + "grad_norm": 0.35200520111723355, + "learning_rate": 7.320123646099519e-05, + "loss": 0.581, + "step": 1123 + }, + { + "epoch": 0.5994666666666667, + "grad_norm": 0.3804936623120466, + "learning_rate": 7.303479247604332e-05, + "loss": 0.5876, + "step": 1124 + }, + { + "epoch": 0.6, + "grad_norm": 0.41104529446282756, + "learning_rate": 7.286842901329412e-05, + "loss": 0.6924, + "step": 1125 + }, + { + "epoch": 0.6005333333333334, + "grad_norm": 0.45170797450398764, + "learning_rate": 7.270214656953415e-05, + "loss": 0.653, + "step": 1126 + }, + { + "epoch": 0.6010666666666666, + "grad_norm": 0.38503923675228885, + "learning_rate": 7.253594564130804e-05, + "loss": 0.6022, + "step": 1127 + }, + { + "epoch": 0.6016, + "grad_norm": 0.4633595168656192, + "learning_rate": 7.236982672491698e-05, + "loss": 0.7056, + "step": 1128 + }, + { + "epoch": 0.6021333333333333, + "grad_norm": 0.4661926119636266, + "learning_rate": 7.22037903164173e-05, + "loss": 0.7071, + "step": 1129 + }, + { + "epoch": 0.6026666666666667, + "grad_norm": 0.39058116202458776, + "learning_rate": 7.203783691161883e-05, + "loss": 0.6357, + "step": 1130 + }, + { + "epoch": 0.6032, + "grad_norm": 0.4068710100692547, + "learning_rate": 7.187196700608373e-05, + "loss": 0.6284, + "step": 1131 + }, + { + "epoch": 0.6037333333333333, + "grad_norm": 0.36093967832027174, + "learning_rate": 7.170618109512465e-05, + "loss": 0.6628, + "step": 1132 + }, + { + "epoch": 0.6042666666666666, + "grad_norm": 0.3754030586808087, + "learning_rate": 7.154047967380354e-05, + "loss": 0.6962, + "step": 1133 + }, + { + "epoch": 0.6048, + "grad_norm": 0.3601575344266386, + "learning_rate": 7.137486323692995e-05, + "loss": 0.6535, + "step": 1134 + }, + { + "epoch": 0.6053333333333333, + "grad_norm": 0.42508960170227306, + "learning_rate": 7.12093322790597e-05, + "loss": 0.6896, + "step": 1135 + }, + { + "epoch": 0.6058666666666667, + "grad_norm": 0.378594427641715, + "learning_rate": 7.104388729449338e-05, + "loss": 0.6055, + "step": 1136 + }, + { + "epoch": 0.6064, + "grad_norm": 0.3924478605875361, + "learning_rate": 7.087852877727481e-05, + "loss": 0.687, + "step": 1137 + }, + { + "epoch": 0.6069333333333333, + "grad_norm": 0.386931567989563, + "learning_rate": 7.071325722118963e-05, + "loss": 0.6785, + "step": 1138 + }, + { + "epoch": 0.6074666666666667, + "grad_norm": 0.40036626592390673, + "learning_rate": 7.054807311976379e-05, + "loss": 0.6768, + "step": 1139 + }, + { + "epoch": 0.608, + "grad_norm": 0.45732859872622206, + "learning_rate": 7.038297696626206e-05, + "loss": 0.7172, + "step": 1140 + }, + { + "epoch": 0.6085333333333334, + "grad_norm": 0.35181903750732224, + "learning_rate": 7.021796925368667e-05, + "loss": 0.6021, + "step": 1141 + }, + { + "epoch": 0.6090666666666666, + "grad_norm": 0.34220642484795927, + "learning_rate": 7.005305047477566e-05, + "loss": 0.6, + "step": 1142 + }, + { + "epoch": 0.6096, + "grad_norm": 0.37763765261431453, + "learning_rate": 6.988822112200156e-05, + "loss": 0.6134, + "step": 1143 + }, + { + "epoch": 0.6101333333333333, + "grad_norm": 0.42213695261788436, + "learning_rate": 6.972348168756983e-05, + "loss": 0.7144, + "step": 1144 + }, + { + "epoch": 0.6106666666666667, + "grad_norm": 0.3559131773085692, + "learning_rate": 6.955883266341741e-05, + "loss": 0.6105, + "step": 1145 + }, + { + "epoch": 0.6112, + "grad_norm": 0.5344280420368293, + "learning_rate": 6.939427454121128e-05, + "loss": 0.6698, + "step": 1146 + }, + { + "epoch": 0.6117333333333334, + "grad_norm": 0.4200921056379506, + "learning_rate": 6.922980781234699e-05, + "loss": 0.71, + "step": 1147 + }, + { + "epoch": 0.6122666666666666, + "grad_norm": 0.3696536574973899, + "learning_rate": 6.906543296794714e-05, + "loss": 0.6535, + "step": 1148 + }, + { + "epoch": 0.6128, + "grad_norm": 0.39741208691876595, + "learning_rate": 6.890115049885994e-05, + "loss": 0.706, + "step": 1149 + }, + { + "epoch": 0.6133333333333333, + "grad_norm": 0.38779623938426, + "learning_rate": 6.873696089565786e-05, + "loss": 0.6639, + "step": 1150 + }, + { + "epoch": 0.6138666666666667, + "grad_norm": 0.3977903235022171, + "learning_rate": 6.85728646486359e-05, + "loss": 0.6886, + "step": 1151 + }, + { + "epoch": 0.6144, + "grad_norm": 0.39730445921698443, + "learning_rate": 6.84088622478104e-05, + "loss": 0.6592, + "step": 1152 + }, + { + "epoch": 0.6149333333333333, + "grad_norm": 0.403564736270205, + "learning_rate": 6.82449541829174e-05, + "loss": 0.7242, + "step": 1153 + }, + { + "epoch": 0.6154666666666667, + "grad_norm": 0.4110931452082343, + "learning_rate": 6.80811409434113e-05, + "loss": 0.6931, + "step": 1154 + }, + { + "epoch": 0.616, + "grad_norm": 0.3996311263746934, + "learning_rate": 6.791742301846326e-05, + "loss": 0.6377, + "step": 1155 + }, + { + "epoch": 0.6165333333333334, + "grad_norm": 0.39032809008553926, + "learning_rate": 6.775380089695986e-05, + "loss": 0.7247, + "step": 1156 + }, + { + "epoch": 0.6170666666666667, + "grad_norm": 0.41106679109628125, + "learning_rate": 6.759027506750158e-05, + "loss": 0.7073, + "step": 1157 + }, + { + "epoch": 0.6176, + "grad_norm": 0.4236171414493968, + "learning_rate": 6.742684601840141e-05, + "loss": 0.7755, + "step": 1158 + }, + { + "epoch": 0.6181333333333333, + "grad_norm": 0.41627136156520544, + "learning_rate": 6.726351423768322e-05, + "loss": 0.6851, + "step": 1159 + }, + { + "epoch": 0.6186666666666667, + "grad_norm": 0.37047575479258393, + "learning_rate": 6.710028021308061e-05, + "loss": 0.7144, + "step": 1160 + }, + { + "epoch": 0.6192, + "grad_norm": 0.40065107124430077, + "learning_rate": 6.693714443203507e-05, + "loss": 0.6531, + "step": 1161 + }, + { + "epoch": 0.6197333333333334, + "grad_norm": 0.37739932655170266, + "learning_rate": 6.677410738169485e-05, + "loss": 0.6755, + "step": 1162 + }, + { + "epoch": 0.6202666666666666, + "grad_norm": 0.36182734161073676, + "learning_rate": 6.661116954891328e-05, + "loss": 0.6477, + "step": 1163 + }, + { + "epoch": 0.6208, + "grad_norm": 0.41099623638967697, + "learning_rate": 6.644833142024751e-05, + "loss": 0.634, + "step": 1164 + }, + { + "epoch": 0.6213333333333333, + "grad_norm": 0.401626466947384, + "learning_rate": 6.62855934819569e-05, + "loss": 0.691, + "step": 1165 + }, + { + "epoch": 0.6218666666666667, + "grad_norm": 0.36959968335005, + "learning_rate": 6.612295622000162e-05, + "loss": 0.6895, + "step": 1166 + }, + { + "epoch": 0.6224, + "grad_norm": 0.4031099866728893, + "learning_rate": 6.59604201200412e-05, + "loss": 0.6504, + "step": 1167 + }, + { + "epoch": 0.6229333333333333, + "grad_norm": 0.3712238547650622, + "learning_rate": 6.579798566743314e-05, + "loss": 0.667, + "step": 1168 + }, + { + "epoch": 0.6234666666666666, + "grad_norm": 0.38921717995057287, + "learning_rate": 6.563565334723134e-05, + "loss": 0.6143, + "step": 1169 + }, + { + "epoch": 0.624, + "grad_norm": 0.40796849123557527, + "learning_rate": 6.547342364418481e-05, + "loss": 0.6796, + "step": 1170 + }, + { + "epoch": 0.6245333333333334, + "grad_norm": 0.3386381334707545, + "learning_rate": 6.531129704273604e-05, + "loss": 0.603, + "step": 1171 + }, + { + "epoch": 0.6250666666666667, + "grad_norm": 0.41218625274857573, + "learning_rate": 6.514927402701964e-05, + "loss": 0.6996, + "step": 1172 + }, + { + "epoch": 0.6256, + "grad_norm": 0.41575269185267494, + "learning_rate": 6.498735508086093e-05, + "loss": 0.6846, + "step": 1173 + }, + { + "epoch": 0.6261333333333333, + "grad_norm": 0.370757705379407, + "learning_rate": 6.48255406877745e-05, + "loss": 0.6253, + "step": 1174 + }, + { + "epoch": 0.6266666666666667, + "grad_norm": 0.3498429362561111, + "learning_rate": 6.466383133096267e-05, + "loss": 0.582, + "step": 1175 + }, + { + "epoch": 0.6272, + "grad_norm": 0.3967937924943134, + "learning_rate": 6.450222749331414e-05, + "loss": 0.6348, + "step": 1176 + }, + { + "epoch": 0.6277333333333334, + "grad_norm": 0.46531867946137895, + "learning_rate": 6.434072965740242e-05, + "loss": 0.7073, + "step": 1177 + }, + { + "epoch": 0.6282666666666666, + "grad_norm": 0.37955040353422276, + "learning_rate": 6.417933830548467e-05, + "loss": 0.675, + "step": 1178 + }, + { + "epoch": 0.6288, + "grad_norm": 0.3543836601917388, + "learning_rate": 6.40180539194999e-05, + "loss": 0.6189, + "step": 1179 + }, + { + "epoch": 0.6293333333333333, + "grad_norm": 0.40689125569370455, + "learning_rate": 6.385687698106781e-05, + "loss": 0.6555, + "step": 1180 + }, + { + "epoch": 0.6298666666666667, + "grad_norm": 0.4163295050376555, + "learning_rate": 6.369580797148718e-05, + "loss": 0.6658, + "step": 1181 + }, + { + "epoch": 0.6304, + "grad_norm": 0.4969283756813846, + "learning_rate": 6.35348473717345e-05, + "loss": 0.675, + "step": 1182 + }, + { + "epoch": 0.6309333333333333, + "grad_norm": 0.44524254611823577, + "learning_rate": 6.337399566246257e-05, + "loss": 0.7528, + "step": 1183 + }, + { + "epoch": 0.6314666666666666, + "grad_norm": 0.37316751405405507, + "learning_rate": 6.321325332399903e-05, + "loss": 0.6495, + "step": 1184 + }, + { + "epoch": 0.632, + "grad_norm": 0.36577387173292003, + "learning_rate": 6.305262083634488e-05, + "loss": 0.6217, + "step": 1185 + }, + { + "epoch": 0.6325333333333333, + "grad_norm": 0.441330122528069, + "learning_rate": 6.289209867917312e-05, + "loss": 0.6822, + "step": 1186 + }, + { + "epoch": 0.6330666666666667, + "grad_norm": 0.4303781965223265, + "learning_rate": 6.273168733182722e-05, + "loss": 0.6586, + "step": 1187 + }, + { + "epoch": 0.6336, + "grad_norm": 0.4104655155426858, + "learning_rate": 6.25713872733199e-05, + "loss": 0.6503, + "step": 1188 + }, + { + "epoch": 0.6341333333333333, + "grad_norm": 0.3990485931055677, + "learning_rate": 6.241119898233144e-05, + "loss": 0.6871, + "step": 1189 + }, + { + "epoch": 0.6346666666666667, + "grad_norm": 0.3887220063485232, + "learning_rate": 6.225112293720836e-05, + "loss": 0.6743, + "step": 1190 + }, + { + "epoch": 0.6352, + "grad_norm": 0.3828225414927008, + "learning_rate": 6.209115961596208e-05, + "loss": 0.6497, + "step": 1191 + }, + { + "epoch": 0.6357333333333334, + "grad_norm": 0.39661169755538084, + "learning_rate": 6.19313094962673e-05, + "loss": 0.6359, + "step": 1192 + }, + { + "epoch": 0.6362666666666666, + "grad_norm": 0.43297412058206247, + "learning_rate": 6.177157305546078e-05, + "loss": 0.6677, + "step": 1193 + }, + { + "epoch": 0.6368, + "grad_norm": 0.4346435679710135, + "learning_rate": 6.161195077053976e-05, + "loss": 0.7041, + "step": 1194 + }, + { + "epoch": 0.6373333333333333, + "grad_norm": 0.3787375488050026, + "learning_rate": 6.145244311816063e-05, + "loss": 0.6975, + "step": 1195 + }, + { + "epoch": 0.6378666666666667, + "grad_norm": 0.39924033430802885, + "learning_rate": 6.129305057463741e-05, + "loss": 0.6915, + "step": 1196 + }, + { + "epoch": 0.6384, + "grad_norm": 0.35882244776870287, + "learning_rate": 6.113377361594049e-05, + "loss": 0.6333, + "step": 1197 + }, + { + "epoch": 0.6389333333333334, + "grad_norm": 0.3892922219688357, + "learning_rate": 6.0974612717695004e-05, + "loss": 0.6575, + "step": 1198 + }, + { + "epoch": 0.6394666666666666, + "grad_norm": 0.4025711918252862, + "learning_rate": 6.0815568355179556e-05, + "loss": 0.6607, + "step": 1199 + }, + { + "epoch": 0.64, + "grad_norm": 0.39686322723674156, + "learning_rate": 6.065664100332478e-05, + "loss": 0.6663, + "step": 1200 + }, + { + "epoch": 0.6405333333333333, + "grad_norm": 0.38696545733015086, + "learning_rate": 6.0497831136711836e-05, + "loss": 0.6851, + "step": 1201 + }, + { + "epoch": 0.6410666666666667, + "grad_norm": 0.4101242906783707, + "learning_rate": 6.0339139229571116e-05, + "loss": 0.7008, + "step": 1202 + }, + { + "epoch": 0.6416, + "grad_norm": 0.4016275859970521, + "learning_rate": 6.018056575578075e-05, + "loss": 0.626, + "step": 1203 + }, + { + "epoch": 0.6421333333333333, + "grad_norm": 0.4545316434527366, + "learning_rate": 6.002211118886514e-05, + "loss": 0.6483, + "step": 1204 + }, + { + "epoch": 0.6426666666666667, + "grad_norm": 0.38345074027203224, + "learning_rate": 5.986377600199371e-05, + "loss": 0.6432, + "step": 1205 + }, + { + "epoch": 0.6432, + "grad_norm": 0.4479961188016446, + "learning_rate": 5.970556066797941e-05, + "loss": 0.7402, + "step": 1206 + }, + { + "epoch": 0.6437333333333334, + "grad_norm": 0.4459207817163863, + "learning_rate": 5.9547465659277215e-05, + "loss": 0.7431, + "step": 1207 + }, + { + "epoch": 0.6442666666666667, + "grad_norm": 0.4319954107698843, + "learning_rate": 5.938949144798279e-05, + "loss": 0.722, + "step": 1208 + }, + { + "epoch": 0.6448, + "grad_norm": 0.38149712675860425, + "learning_rate": 5.923163850583113e-05, + "loss": 0.6703, + "step": 1209 + }, + { + "epoch": 0.6453333333333333, + "grad_norm": 0.37791425530410894, + "learning_rate": 5.907390730419507e-05, + "loss": 0.6656, + "step": 1210 + }, + { + "epoch": 0.6458666666666667, + "grad_norm": 0.4121481150338273, + "learning_rate": 5.8916298314083915e-05, + "loss": 0.651, + "step": 1211 + }, + { + "epoch": 0.6464, + "grad_norm": 0.4169291524422402, + "learning_rate": 5.875881200614207e-05, + "loss": 0.7033, + "step": 1212 + }, + { + "epoch": 0.6469333333333334, + "grad_norm": 0.42033859419007247, + "learning_rate": 5.860144885064751e-05, + "loss": 0.6443, + "step": 1213 + }, + { + "epoch": 0.6474666666666666, + "grad_norm": 0.3700272313497562, + "learning_rate": 5.8444209317510514e-05, + "loss": 0.6601, + "step": 1214 + }, + { + "epoch": 0.648, + "grad_norm": 0.4778802988426458, + "learning_rate": 5.828709387627218e-05, + "loss": 0.697, + "step": 1215 + }, + { + "epoch": 0.6485333333333333, + "grad_norm": 0.35768936042185684, + "learning_rate": 5.813010299610313e-05, + "loss": 0.6168, + "step": 1216 + }, + { + "epoch": 0.6490666666666667, + "grad_norm": 0.36800043178590347, + "learning_rate": 5.797323714580192e-05, + "loss": 0.6457, + "step": 1217 + }, + { + "epoch": 0.6496, + "grad_norm": 0.40303810625934766, + "learning_rate": 5.781649679379378e-05, + "loss": 0.7043, + "step": 1218 + }, + { + "epoch": 0.6501333333333333, + "grad_norm": 0.43424053966722104, + "learning_rate": 5.765988240812921e-05, + "loss": 0.7209, + "step": 1219 + }, + { + "epoch": 0.6506666666666666, + "grad_norm": 0.3793422373656162, + "learning_rate": 5.750339445648252e-05, + "loss": 0.67, + "step": 1220 + }, + { + "epoch": 0.6512, + "grad_norm": 0.39844209797716984, + "learning_rate": 5.73470334061505e-05, + "loss": 0.7021, + "step": 1221 + }, + { + "epoch": 0.6517333333333334, + "grad_norm": 0.3722643921892602, + "learning_rate": 5.7190799724050924e-05, + "loss": 0.6377, + "step": 1222 + }, + { + "epoch": 0.6522666666666667, + "grad_norm": 0.4133785282213768, + "learning_rate": 5.7034693876721376e-05, + "loss": 0.6525, + "step": 1223 + }, + { + "epoch": 0.6528, + "grad_norm": 0.5129186669636846, + "learning_rate": 5.687871633031754e-05, + "loss": 0.726, + "step": 1224 + }, + { + "epoch": 0.6533333333333333, + "grad_norm": 0.456742676325474, + "learning_rate": 5.6722867550612116e-05, + "loss": 0.7273, + "step": 1225 + }, + { + "epoch": 0.6538666666666667, + "grad_norm": 0.3600078285545767, + "learning_rate": 5.6567148002993164e-05, + "loss": 0.6206, + "step": 1226 + }, + { + "epoch": 0.6544, + "grad_norm": 0.43111287342009397, + "learning_rate": 5.6411558152462894e-05, + "loss": 0.7103, + "step": 1227 + }, + { + "epoch": 0.6549333333333334, + "grad_norm": 0.3626978793288534, + "learning_rate": 5.625609846363622e-05, + "loss": 0.6129, + "step": 1228 + }, + { + "epoch": 0.6554666666666666, + "grad_norm": 0.4343160084912167, + "learning_rate": 5.6100769400739383e-05, + "loss": 0.7009, + "step": 1229 + }, + { + "epoch": 0.656, + "grad_norm": 0.4555172930959856, + "learning_rate": 5.5945571427608526e-05, + "loss": 0.7079, + "step": 1230 + }, + { + "epoch": 0.6565333333333333, + "grad_norm": 0.40227511524808995, + "learning_rate": 5.579050500768836e-05, + "loss": 0.6994, + "step": 1231 + }, + { + "epoch": 0.6570666666666667, + "grad_norm": 0.48632666387981066, + "learning_rate": 5.5635570604030705e-05, + "loss": 0.696, + "step": 1232 + }, + { + "epoch": 0.6576, + "grad_norm": 0.42581739779144323, + "learning_rate": 5.54807686792933e-05, + "loss": 0.6743, + "step": 1233 + }, + { + "epoch": 0.6581333333333333, + "grad_norm": 0.3825639992276115, + "learning_rate": 5.53260996957381e-05, + "loss": 0.6205, + "step": 1234 + }, + { + "epoch": 0.6586666666666666, + "grad_norm": 0.3793282181515819, + "learning_rate": 5.5171564115230254e-05, + "loss": 0.6942, + "step": 1235 + }, + { + "epoch": 0.6592, + "grad_norm": 0.571196272629039, + "learning_rate": 5.501716239923642e-05, + "loss": 0.7054, + "step": 1236 + }, + { + "epoch": 0.6597333333333333, + "grad_norm": 0.3593515575564961, + "learning_rate": 5.486289500882355e-05, + "loss": 0.6245, + "step": 1237 + }, + { + "epoch": 0.6602666666666667, + "grad_norm": 0.3768362851791814, + "learning_rate": 5.47087624046575e-05, + "loss": 0.6837, + "step": 1238 + }, + { + "epoch": 0.6608, + "grad_norm": 0.4416232089824355, + "learning_rate": 5.4554765047001613e-05, + "loss": 0.6785, + "step": 1239 + }, + { + "epoch": 0.6613333333333333, + "grad_norm": 0.37772676319392723, + "learning_rate": 5.4400903395715366e-05, + "loss": 0.6278, + "step": 1240 + }, + { + "epoch": 0.6618666666666667, + "grad_norm": 0.43467731327877024, + "learning_rate": 5.424717791025302e-05, + "loss": 0.7057, + "step": 1241 + }, + { + "epoch": 0.6624, + "grad_norm": 0.3904238371216619, + "learning_rate": 5.4093589049662175e-05, + "loss": 0.6817, + "step": 1242 + }, + { + "epoch": 0.6629333333333334, + "grad_norm": 0.3882791634332464, + "learning_rate": 5.394013727258254e-05, + "loss": 0.6684, + "step": 1243 + }, + { + "epoch": 0.6634666666666666, + "grad_norm": 0.43935029585606616, + "learning_rate": 5.378682303724435e-05, + "loss": 0.6546, + "step": 1244 + }, + { + "epoch": 0.664, + "grad_norm": 0.40037159051848714, + "learning_rate": 5.363364680146725e-05, + "loss": 0.6343, + "step": 1245 + }, + { + "epoch": 0.6645333333333333, + "grad_norm": 0.4557479411637336, + "learning_rate": 5.348060902265871e-05, + "loss": 0.7471, + "step": 1246 + }, + { + "epoch": 0.6650666666666667, + "grad_norm": 0.38531054077996774, + "learning_rate": 5.332771015781275e-05, + "loss": 0.6582, + "step": 1247 + }, + { + "epoch": 0.6656, + "grad_norm": 0.38429972181306354, + "learning_rate": 5.31749506635086e-05, + "loss": 0.698, + "step": 1248 + }, + { + "epoch": 0.6661333333333334, + "grad_norm": 0.43581105469606396, + "learning_rate": 5.302233099590928e-05, + "loss": 0.7503, + "step": 1249 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.3786961249187762, + "learning_rate": 5.286985161076029e-05, + "loss": 0.6616, + "step": 1250 + }, + { + "epoch": 0.6672, + "grad_norm": 0.36814280572850133, + "learning_rate": 5.271751296338823e-05, + "loss": 0.6111, + "step": 1251 + }, + { + "epoch": 0.6677333333333333, + "grad_norm": 0.35289894291087465, + "learning_rate": 5.2565315508699376e-05, + "loss": 0.6721, + "step": 1252 + }, + { + "epoch": 0.6682666666666667, + "grad_norm": 0.3788897774951766, + "learning_rate": 5.2413259701178505e-05, + "loss": 0.6638, + "step": 1253 + }, + { + "epoch": 0.6688, + "grad_norm": 0.3843078947868329, + "learning_rate": 5.226134599488728e-05, + "loss": 0.7047, + "step": 1254 + }, + { + "epoch": 0.6693333333333333, + "grad_norm": 0.3956161994374667, + "learning_rate": 5.210957484346314e-05, + "loss": 0.6583, + "step": 1255 + }, + { + "epoch": 0.6698666666666667, + "grad_norm": 0.4324262603703905, + "learning_rate": 5.195794670011776e-05, + "loss": 0.6834, + "step": 1256 + }, + { + "epoch": 0.6704, + "grad_norm": 0.4286966243205002, + "learning_rate": 5.180646201763577e-05, + "loss": 0.696, + "step": 1257 + }, + { + "epoch": 0.6709333333333334, + "grad_norm": 0.3630002637808542, + "learning_rate": 5.165512124837344e-05, + "loss": 0.6502, + "step": 1258 + }, + { + "epoch": 0.6714666666666667, + "grad_norm": 0.38035823271916014, + "learning_rate": 5.150392484425728e-05, + "loss": 0.5918, + "step": 1259 + }, + { + "epoch": 0.672, + "grad_norm": 0.4576968077355702, + "learning_rate": 5.135287325678271e-05, + "loss": 0.686, + "step": 1260 + }, + { + "epoch": 0.6725333333333333, + "grad_norm": 0.36880792848008637, + "learning_rate": 5.120196693701267e-05, + "loss": 0.6668, + "step": 1261 + }, + { + "epoch": 0.6730666666666667, + "grad_norm": 0.3679974050882043, + "learning_rate": 5.105120633557634e-05, + "loss": 0.6546, + "step": 1262 + }, + { + "epoch": 0.6736, + "grad_norm": 0.4131694246907738, + "learning_rate": 5.090059190266779e-05, + "loss": 0.6652, + "step": 1263 + }, + { + "epoch": 0.6741333333333334, + "grad_norm": 0.4493186583002729, + "learning_rate": 5.075012408804458e-05, + "loss": 0.7106, + "step": 1264 + }, + { + "epoch": 0.6746666666666666, + "grad_norm": 0.4208962131856972, + "learning_rate": 5.059980334102637e-05, + "loss": 0.6626, + "step": 1265 + }, + { + "epoch": 0.6752, + "grad_norm": 0.3585481968909659, + "learning_rate": 5.0449630110493836e-05, + "loss": 0.6645, + "step": 1266 + }, + { + "epoch": 0.6757333333333333, + "grad_norm": 0.35082005531937965, + "learning_rate": 5.0299604844886985e-05, + "loss": 0.6022, + "step": 1267 + }, + { + "epoch": 0.6762666666666667, + "grad_norm": 0.38486451673856137, + "learning_rate": 5.014972799220403e-05, + "loss": 0.6073, + "step": 1268 + }, + { + "epoch": 0.6768, + "grad_norm": 0.41165383955462015, + "learning_rate": 5.000000000000002e-05, + "loss": 0.6918, + "step": 1269 + }, + { + "epoch": 0.6773333333333333, + "grad_norm": 0.3712772782434978, + "learning_rate": 4.985042131538545e-05, + "loss": 0.5546, + "step": 1270 + }, + { + "epoch": 0.6778666666666666, + "grad_norm": 0.39827766286989835, + "learning_rate": 4.9700992385024934e-05, + "loss": 0.6296, + "step": 1271 + }, + { + "epoch": 0.6784, + "grad_norm": 0.44595990018200404, + "learning_rate": 4.955171365513603e-05, + "loss": 0.6926, + "step": 1272 + }, + { + "epoch": 0.6789333333333334, + "grad_norm": 0.4230801733220779, + "learning_rate": 4.940258557148765e-05, + "loss": 0.642, + "step": 1273 + }, + { + "epoch": 0.6794666666666667, + "grad_norm": 0.536142732112756, + "learning_rate": 4.9253608579398855e-05, + "loss": 0.7798, + "step": 1274 + }, + { + "epoch": 0.68, + "grad_norm": 0.36332509710940986, + "learning_rate": 4.9104783123737566e-05, + "loss": 0.6313, + "step": 1275 + }, + { + "epoch": 0.6805333333333333, + "grad_norm": 0.40031407367425914, + "learning_rate": 4.895610964891923e-05, + "loss": 0.6427, + "step": 1276 + }, + { + "epoch": 0.6810666666666667, + "grad_norm": 0.40293063394952483, + "learning_rate": 4.880758859890536e-05, + "loss": 0.6547, + "step": 1277 + }, + { + "epoch": 0.6816, + "grad_norm": 0.39869455791686215, + "learning_rate": 4.865922041720239e-05, + "loss": 0.6398, + "step": 1278 + }, + { + "epoch": 0.6821333333333334, + "grad_norm": 0.48883387350145335, + "learning_rate": 4.851100554686021e-05, + "loss": 0.7496, + "step": 1279 + }, + { + "epoch": 0.6826666666666666, + "grad_norm": 0.3926469433108249, + "learning_rate": 4.836294443047088e-05, + "loss": 0.6801, + "step": 1280 + }, + { + "epoch": 0.6832, + "grad_norm": 0.4677723780348946, + "learning_rate": 4.821503751016746e-05, + "loss": 0.7409, + "step": 1281 + }, + { + "epoch": 0.6837333333333333, + "grad_norm": 0.43363420970950584, + "learning_rate": 4.8067285227622404e-05, + "loss": 0.649, + "step": 1282 + }, + { + "epoch": 0.6842666666666667, + "grad_norm": 0.4183026247555478, + "learning_rate": 4.791968802404648e-05, + "loss": 0.6925, + "step": 1283 + }, + { + "epoch": 0.6848, + "grad_norm": 0.4405274776531298, + "learning_rate": 4.777224634018732e-05, + "loss": 0.6791, + "step": 1284 + }, + { + "epoch": 0.6853333333333333, + "grad_norm": 0.3934240974563691, + "learning_rate": 4.762496061632814e-05, + "loss": 0.6386, + "step": 1285 + }, + { + "epoch": 0.6858666666666666, + "grad_norm": 0.4054706785769142, + "learning_rate": 4.747783129228656e-05, + "loss": 0.6828, + "step": 1286 + }, + { + "epoch": 0.6864, + "grad_norm": 0.37853453149151145, + "learning_rate": 4.733085880741301e-05, + "loss": 0.6625, + "step": 1287 + }, + { + "epoch": 0.6869333333333333, + "grad_norm": 0.37513832966085037, + "learning_rate": 4.718404360058966e-05, + "loss": 0.6342, + "step": 1288 + }, + { + "epoch": 0.6874666666666667, + "grad_norm": 0.4023934982278683, + "learning_rate": 4.7037386110228985e-05, + "loss": 0.7325, + "step": 1289 + }, + { + "epoch": 0.688, + "grad_norm": 0.36899840695902886, + "learning_rate": 4.689088677427249e-05, + "loss": 0.6281, + "step": 1290 + }, + { + "epoch": 0.6885333333333333, + "grad_norm": 0.4033113443179532, + "learning_rate": 4.6744546030189486e-05, + "loss": 0.7099, + "step": 1291 + }, + { + "epoch": 0.6890666666666667, + "grad_norm": 0.3171488557975274, + "learning_rate": 4.659836431497563e-05, + "loss": 0.5754, + "step": 1292 + }, + { + "epoch": 0.6896, + "grad_norm": 0.41542117458614175, + "learning_rate": 4.645234206515171e-05, + "loss": 0.6905, + "step": 1293 + }, + { + "epoch": 0.6901333333333334, + "grad_norm": 0.39437793112096853, + "learning_rate": 4.630647971676232e-05, + "loss": 0.6204, + "step": 1294 + }, + { + "epoch": 0.6906666666666667, + "grad_norm": 0.3553818449896893, + "learning_rate": 4.6160777705374524e-05, + "loss": 0.6217, + "step": 1295 + }, + { + "epoch": 0.6912, + "grad_norm": 0.40995002407888353, + "learning_rate": 4.6015236466076747e-05, + "loss": 0.665, + "step": 1296 + }, + { + "epoch": 0.6917333333333333, + "grad_norm": 0.36100741046680696, + "learning_rate": 4.586985643347717e-05, + "loss": 0.616, + "step": 1297 + }, + { + "epoch": 0.6922666666666667, + "grad_norm": 0.37818239833090767, + "learning_rate": 4.572463804170263e-05, + "loss": 0.6271, + "step": 1298 + }, + { + "epoch": 0.6928, + "grad_norm": 0.3776610659012665, + "learning_rate": 4.5579581724397255e-05, + "loss": 0.6637, + "step": 1299 + }, + { + "epoch": 0.6933333333333334, + "grad_norm": 0.4231154654286901, + "learning_rate": 4.543468791472131e-05, + "loss": 0.6329, + "step": 1300 + }, + { + "epoch": 0.6938666666666666, + "grad_norm": 0.42936741925917193, + "learning_rate": 4.5289957045349653e-05, + "loss": 0.6581, + "step": 1301 + }, + { + "epoch": 0.6944, + "grad_norm": 0.3923600873659972, + "learning_rate": 4.514538954847064e-05, + "loss": 0.65, + "step": 1302 + }, + { + "epoch": 0.6949333333333333, + "grad_norm": 0.3935245359603157, + "learning_rate": 4.5000985855784746e-05, + "loss": 0.6698, + "step": 1303 + }, + { + "epoch": 0.6954666666666667, + "grad_norm": 0.3672090508252383, + "learning_rate": 4.485674639850333e-05, + "loss": 0.6356, + "step": 1304 + }, + { + "epoch": 0.696, + "grad_norm": 0.38870391355750356, + "learning_rate": 4.471267160734731e-05, + "loss": 0.659, + "step": 1305 + }, + { + "epoch": 0.6965333333333333, + "grad_norm": 0.4307327798360442, + "learning_rate": 4.456876191254582e-05, + "loss": 0.7335, + "step": 1306 + }, + { + "epoch": 0.6970666666666666, + "grad_norm": 0.44166475115987996, + "learning_rate": 4.442501774383515e-05, + "loss": 0.6762, + "step": 1307 + }, + { + "epoch": 0.6976, + "grad_norm": 0.36626683468551186, + "learning_rate": 4.428143953045717e-05, + "loss": 0.6232, + "step": 1308 + }, + { + "epoch": 0.6981333333333334, + "grad_norm": 0.40103874478481943, + "learning_rate": 4.413802770115816e-05, + "loss": 0.6844, + "step": 1309 + }, + { + "epoch": 0.6986666666666667, + "grad_norm": 0.3949961307614567, + "learning_rate": 4.399478268418771e-05, + "loss": 0.6512, + "step": 1310 + }, + { + "epoch": 0.6992, + "grad_norm": 0.4351564575723422, + "learning_rate": 4.385170490729712e-05, + "loss": 0.7587, + "step": 1311 + }, + { + "epoch": 0.6997333333333333, + "grad_norm": 0.38572628275672344, + "learning_rate": 4.3708794797738375e-05, + "loss": 0.6679, + "step": 1312 + }, + { + "epoch": 0.7002666666666667, + "grad_norm": 0.36538503676275325, + "learning_rate": 4.3566052782262735e-05, + "loss": 0.6095, + "step": 1313 + }, + { + "epoch": 0.7008, + "grad_norm": 0.38274039879275157, + "learning_rate": 4.342347928711953e-05, + "loss": 0.6448, + "step": 1314 + }, + { + "epoch": 0.7013333333333334, + "grad_norm": 0.44275866152143434, + "learning_rate": 4.328107473805487e-05, + "loss": 0.6761, + "step": 1315 + }, + { + "epoch": 0.7018666666666666, + "grad_norm": 0.37872154665111507, + "learning_rate": 4.3138839560310303e-05, + "loss": 0.6458, + "step": 1316 + }, + { + "epoch": 0.7024, + "grad_norm": 0.41637145712821355, + "learning_rate": 4.2996774178621736e-05, + "loss": 0.6847, + "step": 1317 + }, + { + "epoch": 0.7029333333333333, + "grad_norm": 0.3731937878897712, + "learning_rate": 4.2854879017217894e-05, + "loss": 0.6247, + "step": 1318 + }, + { + "epoch": 0.7034666666666667, + "grad_norm": 0.4131083805194796, + "learning_rate": 4.271315449981934e-05, + "loss": 0.6178, + "step": 1319 + }, + { + "epoch": 0.704, + "grad_norm": 0.41315094556072635, + "learning_rate": 4.257160104963696e-05, + "loss": 0.7103, + "step": 1320 + }, + { + "epoch": 0.7045333333333333, + "grad_norm": 0.3946508996007053, + "learning_rate": 4.2430219089370823e-05, + "loss": 0.6068, + "step": 1321 + }, + { + "epoch": 0.7050666666666666, + "grad_norm": 0.4133597282234054, + "learning_rate": 4.228900904120895e-05, + "loss": 0.6382, + "step": 1322 + }, + { + "epoch": 0.7056, + "grad_norm": 0.36438538202343185, + "learning_rate": 4.2147971326825966e-05, + "loss": 0.6008, + "step": 1323 + }, + { + "epoch": 0.7061333333333333, + "grad_norm": 0.35554634006170616, + "learning_rate": 4.200710636738189e-05, + "loss": 0.6025, + "step": 1324 + }, + { + "epoch": 0.7066666666666667, + "grad_norm": 0.39559354992693413, + "learning_rate": 4.1866414583520877e-05, + "loss": 0.6592, + "step": 1325 + }, + { + "epoch": 0.7072, + "grad_norm": 0.40637796318098757, + "learning_rate": 4.172589639536991e-05, + "loss": 0.6341, + "step": 1326 + }, + { + "epoch": 0.7077333333333333, + "grad_norm": 0.3844034545296773, + "learning_rate": 4.158555222253771e-05, + "loss": 0.6626, + "step": 1327 + }, + { + "epoch": 0.7082666666666667, + "grad_norm": 0.3649619850541277, + "learning_rate": 4.14453824841132e-05, + "loss": 0.5766, + "step": 1328 + }, + { + "epoch": 0.7088, + "grad_norm": 0.499463231063169, + "learning_rate": 4.130538759866457e-05, + "loss": 0.7875, + "step": 1329 + }, + { + "epoch": 0.7093333333333334, + "grad_norm": 0.3661031272438177, + "learning_rate": 4.1165567984237764e-05, + "loss": 0.6066, + "step": 1330 + }, + { + "epoch": 0.7098666666666666, + "grad_norm": 0.36741709928338706, + "learning_rate": 4.102592405835536e-05, + "loss": 0.6208, + "step": 1331 + }, + { + "epoch": 0.7104, + "grad_norm": 0.4284659517016139, + "learning_rate": 4.088645623801534e-05, + "loss": 0.7017, + "step": 1332 + }, + { + "epoch": 0.7109333333333333, + "grad_norm": 0.4161922080822117, + "learning_rate": 4.074716493968975e-05, + "loss": 0.7325, + "step": 1333 + }, + { + "epoch": 0.7114666666666667, + "grad_norm": 0.38075809983273384, + "learning_rate": 4.060805057932359e-05, + "loss": 0.6238, + "step": 1334 + }, + { + "epoch": 0.712, + "grad_norm": 0.39114450256771044, + "learning_rate": 4.046911357233343e-05, + "loss": 0.6198, + "step": 1335 + }, + { + "epoch": 0.7125333333333334, + "grad_norm": 0.45547674998004817, + "learning_rate": 4.0330354333606234e-05, + "loss": 0.6436, + "step": 1336 + }, + { + "epoch": 0.7130666666666666, + "grad_norm": 0.4032049645290277, + "learning_rate": 4.019177327749822e-05, + "loss": 0.6714, + "step": 1337 + }, + { + "epoch": 0.7136, + "grad_norm": 0.37367310708375145, + "learning_rate": 4.00533708178334e-05, + "loss": 0.6155, + "step": 1338 + }, + { + "epoch": 0.7141333333333333, + "grad_norm": 0.3742593122537589, + "learning_rate": 3.991514736790258e-05, + "loss": 0.6209, + "step": 1339 + }, + { + "epoch": 0.7146666666666667, + "grad_norm": 0.3949703021201749, + "learning_rate": 3.977710334046193e-05, + "loss": 0.6261, + "step": 1340 + }, + { + "epoch": 0.7152, + "grad_norm": 0.4670628457168731, + "learning_rate": 3.963923914773187e-05, + "loss": 0.6323, + "step": 1341 + }, + { + "epoch": 0.7157333333333333, + "grad_norm": 0.472725466215836, + "learning_rate": 3.950155520139581e-05, + "loss": 0.7601, + "step": 1342 + }, + { + "epoch": 0.7162666666666667, + "grad_norm": 0.4123296577584803, + "learning_rate": 3.936405191259891e-05, + "loss": 0.6882, + "step": 1343 + }, + { + "epoch": 0.7168, + "grad_norm": 0.3902995937508573, + "learning_rate": 3.922672969194686e-05, + "loss": 0.6124, + "step": 1344 + }, + { + "epoch": 0.7173333333333334, + "grad_norm": 0.4432532967320152, + "learning_rate": 3.9089588949504655e-05, + "loss": 0.7045, + "step": 1345 + }, + { + "epoch": 0.7178666666666667, + "grad_norm": 0.32557319256599393, + "learning_rate": 3.895263009479534e-05, + "loss": 0.6022, + "step": 1346 + }, + { + "epoch": 0.7184, + "grad_norm": 0.44612344737641824, + "learning_rate": 3.8815853536798904e-05, + "loss": 0.7007, + "step": 1347 + }, + { + "epoch": 0.7189333333333333, + "grad_norm": 0.400274471628821, + "learning_rate": 3.867925968395085e-05, + "loss": 0.624, + "step": 1348 + }, + { + "epoch": 0.7194666666666667, + "grad_norm": 0.4347762917189781, + "learning_rate": 3.854284894414122e-05, + "loss": 0.659, + "step": 1349 + }, + { + "epoch": 0.72, + "grad_norm": 0.3528563922408139, + "learning_rate": 3.840662172471315e-05, + "loss": 0.5884, + "step": 1350 + }, + { + "epoch": 0.7205333333333334, + "grad_norm": 0.41839505115370795, + "learning_rate": 3.82705784324618e-05, + "loss": 0.6532, + "step": 1351 + }, + { + "epoch": 0.7210666666666666, + "grad_norm": 0.38938423552045953, + "learning_rate": 3.8134719473633094e-05, + "loss": 0.682, + "step": 1352 + }, + { + "epoch": 0.7216, + "grad_norm": 0.3649377142914851, + "learning_rate": 3.79990452539225e-05, + "loss": 0.6228, + "step": 1353 + }, + { + "epoch": 0.7221333333333333, + "grad_norm": 0.44484447899883567, + "learning_rate": 3.786355617847385e-05, + "loss": 0.6934, + "step": 1354 + }, + { + "epoch": 0.7226666666666667, + "grad_norm": 0.4348795155737933, + "learning_rate": 3.772825265187802e-05, + "loss": 0.6532, + "step": 1355 + }, + { + "epoch": 0.7232, + "grad_norm": 0.4035943809385256, + "learning_rate": 3.759313507817196e-05, + "loss": 0.6405, + "step": 1356 + }, + { + "epoch": 0.7237333333333333, + "grad_norm": 0.41145186927193345, + "learning_rate": 3.7458203860837234e-05, + "loss": 0.6327, + "step": 1357 + }, + { + "epoch": 0.7242666666666666, + "grad_norm": 0.35660941883562153, + "learning_rate": 3.732345940279893e-05, + "loss": 0.5593, + "step": 1358 + }, + { + "epoch": 0.7248, + "grad_norm": 0.348782187361037, + "learning_rate": 3.7188902106424416e-05, + "loss": 0.6143, + "step": 1359 + }, + { + "epoch": 0.7253333333333334, + "grad_norm": 0.35229758190089866, + "learning_rate": 3.705453237352227e-05, + "loss": 0.5757, + "step": 1360 + }, + { + "epoch": 0.7258666666666667, + "grad_norm": 0.3766708379067481, + "learning_rate": 3.692035060534088e-05, + "loss": 0.6348, + "step": 1361 + }, + { + "epoch": 0.7264, + "grad_norm": 0.41844607847834725, + "learning_rate": 3.678635720256737e-05, + "loss": 0.6439, + "step": 1362 + }, + { + "epoch": 0.7269333333333333, + "grad_norm": 0.4239525619191792, + "learning_rate": 3.665255256532638e-05, + "loss": 0.7034, + "step": 1363 + }, + { + "epoch": 0.7274666666666667, + "grad_norm": 0.36214835153068375, + "learning_rate": 3.651893709317887e-05, + "loss": 0.6366, + "step": 1364 + }, + { + "epoch": 0.728, + "grad_norm": 0.3698420668311701, + "learning_rate": 3.638551118512089e-05, + "loss": 0.5886, + "step": 1365 + }, + { + "epoch": 0.7285333333333334, + "grad_norm": 0.37665993344657184, + "learning_rate": 3.625227523958252e-05, + "loss": 0.5874, + "step": 1366 + }, + { + "epoch": 0.7290666666666666, + "grad_norm": 0.3992543240402163, + "learning_rate": 3.611922965442648e-05, + "loss": 0.5887, + "step": 1367 + }, + { + "epoch": 0.7296, + "grad_norm": 0.4249390586361677, + "learning_rate": 3.5986374826947066e-05, + "loss": 0.7423, + "step": 1368 + }, + { + "epoch": 0.7301333333333333, + "grad_norm": 0.3890548320054069, + "learning_rate": 3.5853711153868965e-05, + "loss": 0.6296, + "step": 1369 + }, + { + "epoch": 0.7306666666666667, + "grad_norm": 0.4369503478017524, + "learning_rate": 3.5721239031346066e-05, + "loss": 0.778, + "step": 1370 + }, + { + "epoch": 0.7312, + "grad_norm": 0.44242209088136397, + "learning_rate": 3.558895885496023e-05, + "loss": 0.6808, + "step": 1371 + }, + { + "epoch": 0.7317333333333333, + "grad_norm": 0.3796958428239579, + "learning_rate": 3.545687101972013e-05, + "loss": 0.6024, + "step": 1372 + }, + { + "epoch": 0.7322666666666666, + "grad_norm": 0.46142542855877045, + "learning_rate": 3.53249759200601e-05, + "loss": 0.6607, + "step": 1373 + }, + { + "epoch": 0.7328, + "grad_norm": 0.38214993944754494, + "learning_rate": 3.519327394983888e-05, + "loss": 0.6265, + "step": 1374 + }, + { + "epoch": 0.7333333333333333, + "grad_norm": 0.4108942588635796, + "learning_rate": 3.506176550233863e-05, + "loss": 0.6569, + "step": 1375 + }, + { + "epoch": 0.7338666666666667, + "grad_norm": 0.43362799075751884, + "learning_rate": 3.4930450970263485e-05, + "loss": 0.6686, + "step": 1376 + }, + { + "epoch": 0.7344, + "grad_norm": 0.44126989338498557, + "learning_rate": 3.479933074573858e-05, + "loss": 0.6512, + "step": 1377 + }, + { + "epoch": 0.7349333333333333, + "grad_norm": 0.4185289731349431, + "learning_rate": 3.46684052203088e-05, + "loss": 0.6419, + "step": 1378 + }, + { + "epoch": 0.7354666666666667, + "grad_norm": 0.4734893958468771, + "learning_rate": 3.4537674784937614e-05, + "loss": 0.7045, + "step": 1379 + }, + { + "epoch": 0.736, + "grad_norm": 0.39623039295312157, + "learning_rate": 3.440713983000601e-05, + "loss": 0.6639, + "step": 1380 + }, + { + "epoch": 0.7365333333333334, + "grad_norm": 0.4535746586117549, + "learning_rate": 3.427680074531113e-05, + "loss": 0.7225, + "step": 1381 + }, + { + "epoch": 0.7370666666666666, + "grad_norm": 0.3591509021400412, + "learning_rate": 3.4146657920065285e-05, + "loss": 0.6195, + "step": 1382 + }, + { + "epoch": 0.7376, + "grad_norm": 0.371640842368074, + "learning_rate": 3.401671174289469e-05, + "loss": 0.6137, + "step": 1383 + }, + { + "epoch": 0.7381333333333333, + "grad_norm": 0.4124128542580495, + "learning_rate": 3.388696260183832e-05, + "loss": 0.6272, + "step": 1384 + }, + { + "epoch": 0.7386666666666667, + "grad_norm": 0.40380791242840447, + "learning_rate": 3.3757410884346894e-05, + "loss": 0.7468, + "step": 1385 + }, + { + "epoch": 0.7392, + "grad_norm": 0.357970181024765, + "learning_rate": 3.362805697728145e-05, + "loss": 0.6093, + "step": 1386 + }, + { + "epoch": 0.7397333333333334, + "grad_norm": 0.41222039821290496, + "learning_rate": 3.3498901266912396e-05, + "loss": 0.6473, + "step": 1387 + }, + { + "epoch": 0.7402666666666666, + "grad_norm": 0.33234222419483217, + "learning_rate": 3.336994413891828e-05, + "loss": 0.5979, + "step": 1388 + }, + { + "epoch": 0.7408, + "grad_norm": 0.43062727956941377, + "learning_rate": 3.324118597838464e-05, + "loss": 0.6807, + "step": 1389 + }, + { + "epoch": 0.7413333333333333, + "grad_norm": 0.43833492864942547, + "learning_rate": 3.3112627169802946e-05, + "loss": 0.6377, + "step": 1390 + }, + { + "epoch": 0.7418666666666667, + "grad_norm": 0.4220942461833886, + "learning_rate": 3.298426809706928e-05, + "loss": 0.6912, + "step": 1391 + }, + { + "epoch": 0.7424, + "grad_norm": 0.43358978810760207, + "learning_rate": 3.285610914348332e-05, + "loss": 0.6776, + "step": 1392 + }, + { + "epoch": 0.7429333333333333, + "grad_norm": 0.4329608656009232, + "learning_rate": 3.2728150691747115e-05, + "loss": 0.7258, + "step": 1393 + }, + { + "epoch": 0.7434666666666667, + "grad_norm": 0.48033728320925306, + "learning_rate": 3.2600393123964113e-05, + "loss": 0.6717, + "step": 1394 + }, + { + "epoch": 0.744, + "grad_norm": 0.3943080237724141, + "learning_rate": 3.2472836821637744e-05, + "loss": 0.623, + "step": 1395 + }, + { + "epoch": 0.7445333333333334, + "grad_norm": 0.37952263078699766, + "learning_rate": 3.234548216567049e-05, + "loss": 0.6349, + "step": 1396 + }, + { + "epoch": 0.7450666666666667, + "grad_norm": 0.4329984014767168, + "learning_rate": 3.2218329536362704e-05, + "loss": 0.7061, + "step": 1397 + }, + { + "epoch": 0.7456, + "grad_norm": 0.4097847626974238, + "learning_rate": 3.209137931341143e-05, + "loss": 0.6684, + "step": 1398 + }, + { + "epoch": 0.7461333333333333, + "grad_norm": 0.4261613454980889, + "learning_rate": 3.196463187590929e-05, + "loss": 0.7056, + "step": 1399 + }, + { + "epoch": 0.7466666666666667, + "grad_norm": 0.4096693428186534, + "learning_rate": 3.1838087602343344e-05, + "loss": 0.7031, + "step": 1400 + }, + { + "epoch": 0.7472, + "grad_norm": 0.34480011480369654, + "learning_rate": 3.1711746870594086e-05, + "loss": 0.5557, + "step": 1401 + }, + { + "epoch": 0.7477333333333334, + "grad_norm": 0.41176038512615737, + "learning_rate": 3.158561005793402e-05, + "loss": 0.682, + "step": 1402 + }, + { + "epoch": 0.7482666666666666, + "grad_norm": 0.42128821486930673, + "learning_rate": 3.145967754102691e-05, + "loss": 0.6459, + "step": 1403 + }, + { + "epoch": 0.7488, + "grad_norm": 0.4186312289389503, + "learning_rate": 3.1333949695926324e-05, + "loss": 0.6703, + "step": 1404 + }, + { + "epoch": 0.7493333333333333, + "grad_norm": 0.35582139540762747, + "learning_rate": 3.120842689807468e-05, + "loss": 0.5538, + "step": 1405 + }, + { + "epoch": 0.7498666666666667, + "grad_norm": 0.4594733836842397, + "learning_rate": 3.108310952230212e-05, + "loss": 0.732, + "step": 1406 + }, + { + "epoch": 0.7504, + "grad_norm": 0.3853264621047334, + "learning_rate": 3.0957997942825336e-05, + "loss": 0.6299, + "step": 1407 + }, + { + "epoch": 0.7509333333333333, + "grad_norm": 0.43192092754097916, + "learning_rate": 3.083309253324651e-05, + "loss": 0.6596, + "step": 1408 + }, + { + "epoch": 0.7514666666666666, + "grad_norm": 0.36266708068494263, + "learning_rate": 3.070839366655215e-05, + "loss": 0.6557, + "step": 1409 + }, + { + "epoch": 0.752, + "grad_norm": 0.3921456918480584, + "learning_rate": 3.058390171511196e-05, + "loss": 0.632, + "step": 1410 + }, + { + "epoch": 0.7525333333333334, + "grad_norm": 0.3682804707384396, + "learning_rate": 3.0459617050677868e-05, + "loss": 0.6162, + "step": 1411 + }, + { + "epoch": 0.7530666666666667, + "grad_norm": 0.40690924327241107, + "learning_rate": 3.0335540044382694e-05, + "loss": 0.6006, + "step": 1412 + }, + { + "epoch": 0.7536, + "grad_norm": 0.3757118918656615, + "learning_rate": 3.021167106673928e-05, + "loss": 0.6087, + "step": 1413 + }, + { + "epoch": 0.7541333333333333, + "grad_norm": 0.4333706567952028, + "learning_rate": 3.008801048763914e-05, + "loss": 0.7058, + "step": 1414 + }, + { + "epoch": 0.7546666666666667, + "grad_norm": 0.441840561374695, + "learning_rate": 2.996455867635155e-05, + "loss": 0.6384, + "step": 1415 + }, + { + "epoch": 0.7552, + "grad_norm": 0.3738993216053616, + "learning_rate": 2.9841316001522347e-05, + "loss": 0.6228, + "step": 1416 + }, + { + "epoch": 0.7557333333333334, + "grad_norm": 0.4066619644813712, + "learning_rate": 2.9718282831172883e-05, + "loss": 0.6187, + "step": 1417 + }, + { + "epoch": 0.7562666666666666, + "grad_norm": 0.4249750244231828, + "learning_rate": 2.9595459532698854e-05, + "loss": 0.6931, + "step": 1418 + }, + { + "epoch": 0.7568, + "grad_norm": 0.42388755795260064, + "learning_rate": 2.9472846472869298e-05, + "loss": 0.6541, + "step": 1419 + }, + { + "epoch": 0.7573333333333333, + "grad_norm": 0.4196924046795126, + "learning_rate": 2.9350444017825385e-05, + "loss": 0.6737, + "step": 1420 + }, + { + "epoch": 0.7578666666666667, + "grad_norm": 0.3731126638907941, + "learning_rate": 2.922825253307947e-05, + "loss": 0.6032, + "step": 1421 + }, + { + "epoch": 0.7584, + "grad_norm": 0.380848844950075, + "learning_rate": 2.9106272383513835e-05, + "loss": 0.6107, + "step": 1422 + }, + { + "epoch": 0.7589333333333333, + "grad_norm": 0.41645384638764094, + "learning_rate": 2.898450393337977e-05, + "loss": 0.676, + "step": 1423 + }, + { + "epoch": 0.7594666666666666, + "grad_norm": 0.39056118434634635, + "learning_rate": 2.8862947546296315e-05, + "loss": 0.5893, + "step": 1424 + }, + { + "epoch": 0.76, + "grad_norm": 0.45382049037984407, + "learning_rate": 2.874160358524931e-05, + "loss": 0.735, + "step": 1425 + }, + { + "epoch": 0.7605333333333333, + "grad_norm": 0.3251308768590154, + "learning_rate": 2.8620472412590228e-05, + "loss": 0.574, + "step": 1426 + }, + { + "epoch": 0.7610666666666667, + "grad_norm": 0.5116432309743618, + "learning_rate": 2.8499554390035143e-05, + "loss": 0.6927, + "step": 1427 + }, + { + "epoch": 0.7616, + "grad_norm": 0.44674476604795116, + "learning_rate": 2.8378849878663628e-05, + "loss": 0.7238, + "step": 1428 + }, + { + "epoch": 0.7621333333333333, + "grad_norm": 0.3882554952135879, + "learning_rate": 2.8258359238917665e-05, + "loss": 0.6211, + "step": 1429 + }, + { + "epoch": 0.7626666666666667, + "grad_norm": 0.5221203037694648, + "learning_rate": 2.8138082830600554e-05, + "loss": 0.6923, + "step": 1430 + }, + { + "epoch": 0.7632, + "grad_norm": 0.36186865407806434, + "learning_rate": 2.8018021012875994e-05, + "loss": 0.6819, + "step": 1431 + }, + { + "epoch": 0.7637333333333334, + "grad_norm": 0.4094104968792091, + "learning_rate": 2.7898174144266732e-05, + "loss": 0.6534, + "step": 1432 + }, + { + "epoch": 0.7642666666666666, + "grad_norm": 0.46324553773117294, + "learning_rate": 2.7778542582653744e-05, + "loss": 0.6448, + "step": 1433 + }, + { + "epoch": 0.7648, + "grad_norm": 0.40996398425618485, + "learning_rate": 2.7659126685275027e-05, + "loss": 0.6775, + "step": 1434 + }, + { + "epoch": 0.7653333333333333, + "grad_norm": 0.39539878474081086, + "learning_rate": 2.753992680872457e-05, + "loss": 0.6818, + "step": 1435 + }, + { + "epoch": 0.7658666666666667, + "grad_norm": 0.3854112496279747, + "learning_rate": 2.7420943308951284e-05, + "loss": 0.6211, + "step": 1436 + }, + { + "epoch": 0.7664, + "grad_norm": 0.3775024747959915, + "learning_rate": 2.7302176541257986e-05, + "loss": 0.5985, + "step": 1437 + }, + { + "epoch": 0.7669333333333334, + "grad_norm": 0.37930716053438596, + "learning_rate": 2.7183626860300247e-05, + "loss": 0.582, + "step": 1438 + }, + { + "epoch": 0.7674666666666666, + "grad_norm": 0.3676188176044859, + "learning_rate": 2.7065294620085424e-05, + "loss": 0.5915, + "step": 1439 + }, + { + "epoch": 0.768, + "grad_norm": 0.4024937254354364, + "learning_rate": 2.6947180173971508e-05, + "loss": 0.6574, + "step": 1440 + }, + { + "epoch": 0.7685333333333333, + "grad_norm": 0.44011655859671217, + "learning_rate": 2.6829283874666233e-05, + "loss": 0.6696, + "step": 1441 + }, + { + "epoch": 0.7690666666666667, + "grad_norm": 0.36464416554288664, + "learning_rate": 2.6711606074225782e-05, + "loss": 0.6397, + "step": 1442 + }, + { + "epoch": 0.7696, + "grad_norm": 0.4187645594144827, + "learning_rate": 2.659414712405398e-05, + "loss": 0.7247, + "step": 1443 + }, + { + "epoch": 0.7701333333333333, + "grad_norm": 0.3600886891680787, + "learning_rate": 2.647690737490106e-05, + "loss": 0.611, + "step": 1444 + }, + { + "epoch": 0.7706666666666667, + "grad_norm": 0.38212962826262514, + "learning_rate": 2.6359887176862718e-05, + "loss": 0.6104, + "step": 1445 + }, + { + "epoch": 0.7712, + "grad_norm": 0.3919783855846775, + "learning_rate": 2.6243086879379e-05, + "loss": 0.5956, + "step": 1446 + }, + { + "epoch": 0.7717333333333334, + "grad_norm": 0.3710565120206455, + "learning_rate": 2.6126506831233344e-05, + "loss": 0.6252, + "step": 1447 + }, + { + "epoch": 0.7722666666666667, + "grad_norm": 0.4447727487052963, + "learning_rate": 2.6010147380551475e-05, + "loss": 0.6349, + "step": 1448 + }, + { + "epoch": 0.7728, + "grad_norm": 0.40844341307921317, + "learning_rate": 2.5894008874800325e-05, + "loss": 0.6447, + "step": 1449 + }, + { + "epoch": 0.7733333333333333, + "grad_norm": 0.42002539746137924, + "learning_rate": 2.577809166078716e-05, + "loss": 0.6697, + "step": 1450 + }, + { + "epoch": 0.7738666666666667, + "grad_norm": 0.35688537235558, + "learning_rate": 2.566239608465838e-05, + "loss": 0.5776, + "step": 1451 + }, + { + "epoch": 0.7744, + "grad_norm": 0.3849764676618137, + "learning_rate": 2.5546922491898495e-05, + "loss": 0.6557, + "step": 1452 + }, + { + "epoch": 0.7749333333333334, + "grad_norm": 0.40461391785601625, + "learning_rate": 2.543167122732918e-05, + "loss": 0.6619, + "step": 1453 + }, + { + "epoch": 0.7754666666666666, + "grad_norm": 0.407309234941252, + "learning_rate": 2.5316642635108244e-05, + "loss": 0.6472, + "step": 1454 + }, + { + "epoch": 0.776, + "grad_norm": 0.39665205227387035, + "learning_rate": 2.5201837058728505e-05, + "loss": 0.6505, + "step": 1455 + }, + { + "epoch": 0.7765333333333333, + "grad_norm": 0.421233040167664, + "learning_rate": 2.508725484101684e-05, + "loss": 0.6698, + "step": 1456 + }, + { + "epoch": 0.7770666666666667, + "grad_norm": 0.41035775157864474, + "learning_rate": 2.4972896324133144e-05, + "loss": 0.622, + "step": 1457 + }, + { + "epoch": 0.7776, + "grad_norm": 0.4418146090773731, + "learning_rate": 2.485876184956928e-05, + "loss": 0.6442, + "step": 1458 + }, + { + "epoch": 0.7781333333333333, + "grad_norm": 0.5145318498415639, + "learning_rate": 2.4744851758148156e-05, + "loss": 0.6309, + "step": 1459 + }, + { + "epoch": 0.7786666666666666, + "grad_norm": 0.37263262406049175, + "learning_rate": 2.4631166390022574e-05, + "loss": 0.6281, + "step": 1460 + }, + { + "epoch": 0.7792, + "grad_norm": 0.4264907048999342, + "learning_rate": 2.451770608467432e-05, + "loss": 0.6345, + "step": 1461 + }, + { + "epoch": 0.7797333333333333, + "grad_norm": 0.37532832160156965, + "learning_rate": 2.4404471180913058e-05, + "loss": 0.6501, + "step": 1462 + }, + { + "epoch": 0.7802666666666667, + "grad_norm": 0.41906661417341445, + "learning_rate": 2.429146201687538e-05, + "loss": 0.6608, + "step": 1463 + }, + { + "epoch": 0.7808, + "grad_norm": 0.46520029369584237, + "learning_rate": 2.417867893002387e-05, + "loss": 0.7747, + "step": 1464 + }, + { + "epoch": 0.7813333333333333, + "grad_norm": 0.3878808565874205, + "learning_rate": 2.4066122257145894e-05, + "loss": 0.6732, + "step": 1465 + }, + { + "epoch": 0.7818666666666667, + "grad_norm": 0.45584845508500854, + "learning_rate": 2.3953792334352787e-05, + "loss": 0.726, + "step": 1466 + }, + { + "epoch": 0.7824, + "grad_norm": 0.4060230858072229, + "learning_rate": 2.3841689497078746e-05, + "loss": 0.6515, + "step": 1467 + }, + { + "epoch": 0.7829333333333334, + "grad_norm": 0.3780041121096824, + "learning_rate": 2.3729814080079816e-05, + "loss": 0.6279, + "step": 1468 + }, + { + "epoch": 0.7834666666666666, + "grad_norm": 0.38533091717016743, + "learning_rate": 2.361816641743303e-05, + "loss": 0.6267, + "step": 1469 + }, + { + "epoch": 0.784, + "grad_norm": 0.41249480165369035, + "learning_rate": 2.3506746842535242e-05, + "loss": 0.6326, + "step": 1470 + }, + { + "epoch": 0.7845333333333333, + "grad_norm": 0.47986898298335223, + "learning_rate": 2.339555568810221e-05, + "loss": 0.6894, + "step": 1471 + }, + { + "epoch": 0.7850666666666667, + "grad_norm": 0.3562972128885994, + "learning_rate": 2.328459328616759e-05, + "loss": 0.6002, + "step": 1472 + }, + { + "epoch": 0.7856, + "grad_norm": 0.3926240308679547, + "learning_rate": 2.3173859968081944e-05, + "loss": 0.6376, + "step": 1473 + }, + { + "epoch": 0.7861333333333334, + "grad_norm": 0.399779778834616, + "learning_rate": 2.306335606451181e-05, + "loss": 0.6776, + "step": 1474 + }, + { + "epoch": 0.7866666666666666, + "grad_norm": 0.369862176791047, + "learning_rate": 2.295308190543859e-05, + "loss": 0.6489, + "step": 1475 + }, + { + "epoch": 0.7872, + "grad_norm": 0.40706635158485904, + "learning_rate": 2.2843037820157675e-05, + "loss": 0.6555, + "step": 1476 + }, + { + "epoch": 0.7877333333333333, + "grad_norm": 0.4360060077790018, + "learning_rate": 2.2733224137277366e-05, + "loss": 0.6844, + "step": 1477 + }, + { + "epoch": 0.7882666666666667, + "grad_norm": 0.38720371081753013, + "learning_rate": 2.262364118471805e-05, + "loss": 0.644, + "step": 1478 + }, + { + "epoch": 0.7888, + "grad_norm": 0.42319056314765996, + "learning_rate": 2.251428928971102e-05, + "loss": 0.6267, + "step": 1479 + }, + { + "epoch": 0.7893333333333333, + "grad_norm": 0.3604095302189909, + "learning_rate": 2.2405168778797646e-05, + "loss": 0.6298, + "step": 1480 + }, + { + "epoch": 0.7898666666666667, + "grad_norm": 0.3925068211717366, + "learning_rate": 2.2296279977828337e-05, + "loss": 0.6511, + "step": 1481 + }, + { + "epoch": 0.7904, + "grad_norm": 0.41655114650776953, + "learning_rate": 2.2187623211961562e-05, + "loss": 0.6515, + "step": 1482 + }, + { + "epoch": 0.7909333333333334, + "grad_norm": 0.4770536152552729, + "learning_rate": 2.2079198805662914e-05, + "loss": 0.6991, + "step": 1483 + }, + { + "epoch": 0.7914666666666667, + "grad_norm": 0.4246596112012112, + "learning_rate": 2.1971007082704164e-05, + "loss": 0.7063, + "step": 1484 + }, + { + "epoch": 0.792, + "grad_norm": 0.4302795286271402, + "learning_rate": 2.1863048366162208e-05, + "loss": 0.655, + "step": 1485 + }, + { + "epoch": 0.7925333333333333, + "grad_norm": 0.3676952748071041, + "learning_rate": 2.1755322978418137e-05, + "loss": 0.5687, + "step": 1486 + }, + { + "epoch": 0.7930666666666667, + "grad_norm": 0.38500645975771264, + "learning_rate": 2.1647831241156302e-05, + "loss": 0.6592, + "step": 1487 + }, + { + "epoch": 0.7936, + "grad_norm": 0.4291137226921338, + "learning_rate": 2.1540573475363402e-05, + "loss": 0.6636, + "step": 1488 + }, + { + "epoch": 0.7941333333333334, + "grad_norm": 0.4486703074814581, + "learning_rate": 2.1433550001327373e-05, + "loss": 0.6584, + "step": 1489 + }, + { + "epoch": 0.7946666666666666, + "grad_norm": 0.44856230316275914, + "learning_rate": 2.1326761138636553e-05, + "loss": 0.7457, + "step": 1490 + }, + { + "epoch": 0.7952, + "grad_norm": 0.4155731229227311, + "learning_rate": 2.1220207206178688e-05, + "loss": 0.6563, + "step": 1491 + }, + { + "epoch": 0.7957333333333333, + "grad_norm": 0.44306480487160593, + "learning_rate": 2.111388852214001e-05, + "loss": 0.5888, + "step": 1492 + }, + { + "epoch": 0.7962666666666667, + "grad_norm": 0.3686384335155605, + "learning_rate": 2.1007805404004242e-05, + "loss": 0.5991, + "step": 1493 + }, + { + "epoch": 0.7968, + "grad_norm": 0.3664143734077232, + "learning_rate": 2.0901958168551638e-05, + "loss": 0.6081, + "step": 1494 + }, + { + "epoch": 0.7973333333333333, + "grad_norm": 0.42374095741997003, + "learning_rate": 2.0796347131858186e-05, + "loss": 0.6569, + "step": 1495 + }, + { + "epoch": 0.7978666666666666, + "grad_norm": 0.404103314861036, + "learning_rate": 2.069097260929439e-05, + "loss": 0.6889, + "step": 1496 + }, + { + "epoch": 0.7984, + "grad_norm": 0.37552177045965424, + "learning_rate": 2.058583491552465e-05, + "loss": 0.6132, + "step": 1497 + }, + { + "epoch": 0.7989333333333334, + "grad_norm": 0.38204794186683344, + "learning_rate": 2.048093436450603e-05, + "loss": 0.6706, + "step": 1498 + }, + { + "epoch": 0.7994666666666667, + "grad_norm": 0.3755646096604971, + "learning_rate": 2.0376271269487514e-05, + "loss": 0.6085, + "step": 1499 + }, + { + "epoch": 0.8, + "grad_norm": 0.3904162423032261, + "learning_rate": 2.027184594300898e-05, + "loss": 0.5988, + "step": 1500 + }, + { + "epoch": 0.8005333333333333, + "grad_norm": 0.40648751526987936, + "learning_rate": 2.0167658696900317e-05, + "loss": 0.619, + "step": 1501 + }, + { + "epoch": 0.8010666666666667, + "grad_norm": 0.40139250248289865, + "learning_rate": 2.0063709842280432e-05, + "loss": 0.6293, + "step": 1502 + }, + { + "epoch": 0.8016, + "grad_norm": 0.41177215865758354, + "learning_rate": 1.995999968955641e-05, + "loss": 0.6179, + "step": 1503 + }, + { + "epoch": 0.8021333333333334, + "grad_norm": 0.3966353468417924, + "learning_rate": 1.985652854842247e-05, + "loss": 0.6338, + "step": 1504 + }, + { + "epoch": 0.8026666666666666, + "grad_norm": 0.35642516093977317, + "learning_rate": 1.9753296727859195e-05, + "loss": 0.6419, + "step": 1505 + }, + { + "epoch": 0.8032, + "grad_norm": 0.43270269076477214, + "learning_rate": 1.9650304536132426e-05, + "loss": 0.6659, + "step": 1506 + }, + { + "epoch": 0.8037333333333333, + "grad_norm": 0.37599262291588553, + "learning_rate": 1.9547552280792524e-05, + "loss": 0.6378, + "step": 1507 + }, + { + "epoch": 0.8042666666666667, + "grad_norm": 0.5116979587249396, + "learning_rate": 1.9445040268673298e-05, + "loss": 0.7785, + "step": 1508 + }, + { + "epoch": 0.8048, + "grad_norm": 0.42212680329806335, + "learning_rate": 1.9342768805891178e-05, + "loss": 0.696, + "step": 1509 + }, + { + "epoch": 0.8053333333333333, + "grad_norm": 0.3799812964323031, + "learning_rate": 1.9240738197844278e-05, + "loss": 0.6058, + "step": 1510 + }, + { + "epoch": 0.8058666666666666, + "grad_norm": 0.3591460903004341, + "learning_rate": 1.9138948749211472e-05, + "loss": 0.5966, + "step": 1511 + }, + { + "epoch": 0.8064, + "grad_norm": 0.4008722219724583, + "learning_rate": 1.903740076395151e-05, + "loss": 0.644, + "step": 1512 + }, + { + "epoch": 0.8069333333333333, + "grad_norm": 0.4147177426055831, + "learning_rate": 1.8936094545302095e-05, + "loss": 0.5915, + "step": 1513 + }, + { + "epoch": 0.8074666666666667, + "grad_norm": 0.37318212586856425, + "learning_rate": 1.883503039577894e-05, + "loss": 0.5931, + "step": 1514 + }, + { + "epoch": 0.808, + "grad_norm": 0.3621194206193851, + "learning_rate": 1.8734208617174988e-05, + "loss": 0.5921, + "step": 1515 + }, + { + "epoch": 0.8085333333333333, + "grad_norm": 0.43241355380550084, + "learning_rate": 1.8633629510559314e-05, + "loss": 0.6547, + "step": 1516 + }, + { + "epoch": 0.8090666666666667, + "grad_norm": 0.3863238141027008, + "learning_rate": 1.8533293376276472e-05, + "loss": 0.5853, + "step": 1517 + }, + { + "epoch": 0.8096, + "grad_norm": 0.44936168495273077, + "learning_rate": 1.8433200513945337e-05, + "loss": 0.67, + "step": 1518 + }, + { + "epoch": 0.8101333333333334, + "grad_norm": 0.43717066334186366, + "learning_rate": 1.8333351222458407e-05, + "loss": 0.6639, + "step": 1519 + }, + { + "epoch": 0.8106666666666666, + "grad_norm": 0.5033016938744572, + "learning_rate": 1.8233745799980817e-05, + "loss": 0.7191, + "step": 1520 + }, + { + "epoch": 0.8112, + "grad_norm": 0.45779706903350137, + "learning_rate": 1.8134384543949478e-05, + "loss": 0.6841, + "step": 1521 + }, + { + "epoch": 0.8117333333333333, + "grad_norm": 0.3507320796895244, + "learning_rate": 1.803526775107217e-05, + "loss": 0.5675, + "step": 1522 + }, + { + "epoch": 0.8122666666666667, + "grad_norm": 0.4098731143735821, + "learning_rate": 1.7936395717326704e-05, + "loss": 0.6578, + "step": 1523 + }, + { + "epoch": 0.8128, + "grad_norm": 0.4083470933188338, + "learning_rate": 1.783776873795994e-05, + "loss": 0.6906, + "step": 1524 + }, + { + "epoch": 0.8133333333333334, + "grad_norm": 0.3760035011253386, + "learning_rate": 1.773938710748706e-05, + "loss": 0.6131, + "step": 1525 + }, + { + "epoch": 0.8138666666666666, + "grad_norm": 0.3435090246365492, + "learning_rate": 1.7641251119690505e-05, + "loss": 0.6245, + "step": 1526 + }, + { + "epoch": 0.8144, + "grad_norm": 0.3611412759226195, + "learning_rate": 1.754336106761927e-05, + "loss": 0.5484, + "step": 1527 + }, + { + "epoch": 0.8149333333333333, + "grad_norm": 0.37981053393979597, + "learning_rate": 1.744571724358789e-05, + "loss": 0.6176, + "step": 1528 + }, + { + "epoch": 0.8154666666666667, + "grad_norm": 0.39732487013425777, + "learning_rate": 1.7348319939175637e-05, + "loss": 0.6303, + "step": 1529 + }, + { + "epoch": 0.816, + "grad_norm": 0.4170444261864867, + "learning_rate": 1.7251169445225657e-05, + "loss": 0.6259, + "step": 1530 + }, + { + "epoch": 0.8165333333333333, + "grad_norm": 0.3963048899451401, + "learning_rate": 1.715426605184407e-05, + "loss": 0.5872, + "step": 1531 + }, + { + "epoch": 0.8170666666666667, + "grad_norm": 0.38433101401537006, + "learning_rate": 1.705761004839911e-05, + "loss": 0.6413, + "step": 1532 + }, + { + "epoch": 0.8176, + "grad_norm": 0.40031332813975723, + "learning_rate": 1.696120172352025e-05, + "loss": 0.6614, + "step": 1533 + }, + { + "epoch": 0.8181333333333334, + "grad_norm": 0.39456795129406186, + "learning_rate": 1.6865041365097435e-05, + "loss": 0.6257, + "step": 1534 + }, + { + "epoch": 0.8186666666666667, + "grad_norm": 0.37751709007069784, + "learning_rate": 1.676912926028007e-05, + "loss": 0.6624, + "step": 1535 + }, + { + "epoch": 0.8192, + "grad_norm": 0.41648436193135147, + "learning_rate": 1.6673465695476232e-05, + "loss": 0.6717, + "step": 1536 + }, + { + "epoch": 0.8197333333333333, + "grad_norm": 0.40360852282661724, + "learning_rate": 1.6578050956351886e-05, + "loss": 0.6025, + "step": 1537 + }, + { + "epoch": 0.8202666666666667, + "grad_norm": 0.4130380968592082, + "learning_rate": 1.6482885327829913e-05, + "loss": 0.692, + "step": 1538 + }, + { + "epoch": 0.8208, + "grad_norm": 0.41199731700306036, + "learning_rate": 1.6387969094089316e-05, + "loss": 0.6288, + "step": 1539 + }, + { + "epoch": 0.8213333333333334, + "grad_norm": 0.36383292387156796, + "learning_rate": 1.6293302538564382e-05, + "loss": 0.5999, + "step": 1540 + }, + { + "epoch": 0.8218666666666666, + "grad_norm": 0.4100830994828841, + "learning_rate": 1.619888594394382e-05, + "loss": 0.6698, + "step": 1541 + }, + { + "epoch": 0.8224, + "grad_norm": 0.4218887808159194, + "learning_rate": 1.6104719592169902e-05, + "loss": 0.6669, + "step": 1542 + }, + { + "epoch": 0.8229333333333333, + "grad_norm": 0.47142422539345946, + "learning_rate": 1.601080376443763e-05, + "loss": 0.7068, + "step": 1543 + }, + { + "epoch": 0.8234666666666667, + "grad_norm": 0.45856892382442926, + "learning_rate": 1.5917138741193973e-05, + "loss": 0.7278, + "step": 1544 + }, + { + "epoch": 0.824, + "grad_norm": 0.3992066272331399, + "learning_rate": 1.5823724802136865e-05, + "loss": 0.6356, + "step": 1545 + }, + { + "epoch": 0.8245333333333333, + "grad_norm": 0.4118056801784563, + "learning_rate": 1.573056222621453e-05, + "loss": 0.6769, + "step": 1546 + }, + { + "epoch": 0.8250666666666666, + "grad_norm": 0.4037266190552037, + "learning_rate": 1.5637651291624523e-05, + "loss": 0.6252, + "step": 1547 + }, + { + "epoch": 0.8256, + "grad_norm": 0.4093902498313882, + "learning_rate": 1.5544992275813053e-05, + "loss": 0.5753, + "step": 1548 + }, + { + "epoch": 0.8261333333333334, + "grad_norm": 0.4014978890448161, + "learning_rate": 1.5452585455473977e-05, + "loss": 0.6096, + "step": 1549 + }, + { + "epoch": 0.8266666666666667, + "grad_norm": 0.41271772645387594, + "learning_rate": 1.536043110654809e-05, + "loss": 0.6634, + "step": 1550 + }, + { + "epoch": 0.8272, + "grad_norm": 0.4599963465208382, + "learning_rate": 1.526852950422226e-05, + "loss": 0.6722, + "step": 1551 + }, + { + "epoch": 0.8277333333333333, + "grad_norm": 0.37602488809181833, + "learning_rate": 1.5176880922928616e-05, + "loss": 0.6434, + "step": 1552 + }, + { + "epoch": 0.8282666666666667, + "grad_norm": 0.3732185738125047, + "learning_rate": 1.5085485636343755e-05, + "loss": 0.5867, + "step": 1553 + }, + { + "epoch": 0.8288, + "grad_norm": 0.46329867727693624, + "learning_rate": 1.4994343917387854e-05, + "loss": 0.7339, + "step": 1554 + }, + { + "epoch": 0.8293333333333334, + "grad_norm": 0.321860242670046, + "learning_rate": 1.4903456038223939e-05, + "loss": 0.5824, + "step": 1555 + }, + { + "epoch": 0.8298666666666666, + "grad_norm": 0.37045770017891644, + "learning_rate": 1.4812822270257009e-05, + "loss": 0.5911, + "step": 1556 + }, + { + "epoch": 0.8304, + "grad_norm": 0.38642688964530936, + "learning_rate": 1.4722442884133214e-05, + "loss": 0.6728, + "step": 1557 + }, + { + "epoch": 0.8309333333333333, + "grad_norm": 0.3544747208762738, + "learning_rate": 1.4632318149739177e-05, + "loss": 0.6381, + "step": 1558 + }, + { + "epoch": 0.8314666666666667, + "grad_norm": 0.3939531237308629, + "learning_rate": 1.454244833620102e-05, + "loss": 0.661, + "step": 1559 + }, + { + "epoch": 0.832, + "grad_norm": 0.40161912955238394, + "learning_rate": 1.4452833711883628e-05, + "loss": 0.599, + "step": 1560 + }, + { + "epoch": 0.8325333333333333, + "grad_norm": 0.42788960629838485, + "learning_rate": 1.4363474544389877e-05, + "loss": 0.7166, + "step": 1561 + }, + { + "epoch": 0.8330666666666666, + "grad_norm": 0.41324912480142906, + "learning_rate": 1.4274371100559791e-05, + "loss": 0.6558, + "step": 1562 + }, + { + "epoch": 0.8336, + "grad_norm": 0.3593332940563285, + "learning_rate": 1.4185523646469822e-05, + "loss": 0.549, + "step": 1563 + }, + { + "epoch": 0.8341333333333333, + "grad_norm": 0.4179808942660934, + "learning_rate": 1.409693244743192e-05, + "loss": 0.6443, + "step": 1564 + }, + { + "epoch": 0.8346666666666667, + "grad_norm": 0.3445566195625704, + "learning_rate": 1.4008597767992871e-05, + "loss": 0.5818, + "step": 1565 + }, + { + "epoch": 0.8352, + "grad_norm": 0.4001823627247086, + "learning_rate": 1.3920519871933424e-05, + "loss": 0.6468, + "step": 1566 + }, + { + "epoch": 0.8357333333333333, + "grad_norm": 0.365228111068425, + "learning_rate": 1.3832699022267515e-05, + "loss": 0.6243, + "step": 1567 + }, + { + "epoch": 0.8362666666666667, + "grad_norm": 0.3578894931308083, + "learning_rate": 1.37451354812416e-05, + "loss": 0.5683, + "step": 1568 + }, + { + "epoch": 0.8368, + "grad_norm": 0.4631016919345303, + "learning_rate": 1.3657829510333654e-05, + "loss": 0.6711, + "step": 1569 + }, + { + "epoch": 0.8373333333333334, + "grad_norm": 0.41538216659285476, + "learning_rate": 1.3570781370252582e-05, + "loss": 0.6859, + "step": 1570 + }, + { + "epoch": 0.8378666666666666, + "grad_norm": 0.38557045548720886, + "learning_rate": 1.3483991320937306e-05, + "loss": 0.6264, + "step": 1571 + }, + { + "epoch": 0.8384, + "grad_norm": 0.3642820157209577, + "learning_rate": 1.339745962155613e-05, + "loss": 0.5968, + "step": 1572 + }, + { + "epoch": 0.8389333333333333, + "grad_norm": 0.39087622995060745, + "learning_rate": 1.3311186530505838e-05, + "loss": 0.5998, + "step": 1573 + }, + { + "epoch": 0.8394666666666667, + "grad_norm": 0.3790450593972957, + "learning_rate": 1.322517230541096e-05, + "loss": 0.6498, + "step": 1574 + }, + { + "epoch": 0.84, + "grad_norm": 0.38428182843060554, + "learning_rate": 1.3139417203123027e-05, + "loss": 0.5748, + "step": 1575 + }, + { + "epoch": 0.8405333333333334, + "grad_norm": 0.36044515105718045, + "learning_rate": 1.30539214797198e-05, + "loss": 0.6033, + "step": 1576 + }, + { + "epoch": 0.8410666666666666, + "grad_norm": 0.3602870134258713, + "learning_rate": 1.2968685390504465e-05, + "loss": 0.5785, + "step": 1577 + }, + { + "epoch": 0.8416, + "grad_norm": 0.3909645239819257, + "learning_rate": 1.2883709190004955e-05, + "loss": 0.6268, + "step": 1578 + }, + { + "epoch": 0.8421333333333333, + "grad_norm": 0.3679001478449064, + "learning_rate": 1.2798993131973091e-05, + "loss": 0.6176, + "step": 1579 + }, + { + "epoch": 0.8426666666666667, + "grad_norm": 0.4578232413768848, + "learning_rate": 1.2714537469383858e-05, + "loss": 0.7315, + "step": 1580 + }, + { + "epoch": 0.8432, + "grad_norm": 0.38246633575623384, + "learning_rate": 1.263034245443473e-05, + "loss": 0.6616, + "step": 1581 + }, + { + "epoch": 0.8437333333333333, + "grad_norm": 0.3278031029085765, + "learning_rate": 1.2546408338544769e-05, + "loss": 0.5653, + "step": 1582 + }, + { + "epoch": 0.8442666666666667, + "grad_norm": 0.3952825621324949, + "learning_rate": 1.2462735372353996e-05, + "loss": 0.6384, + "step": 1583 + }, + { + "epoch": 0.8448, + "grad_norm": 0.40482241550609704, + "learning_rate": 1.2379323805722576e-05, + "loss": 0.6229, + "step": 1584 + }, + { + "epoch": 0.8453333333333334, + "grad_norm": 0.4225222734127927, + "learning_rate": 1.2296173887730123e-05, + "loss": 0.6503, + "step": 1585 + }, + { + "epoch": 0.8458666666666667, + "grad_norm": 0.347801809382486, + "learning_rate": 1.2213285866674905e-05, + "loss": 0.5663, + "step": 1586 + }, + { + "epoch": 0.8464, + "grad_norm": 0.4362943523325906, + "learning_rate": 1.2130659990073146e-05, + "loss": 0.6135, + "step": 1587 + }, + { + "epoch": 0.8469333333333333, + "grad_norm": 0.4326524921153363, + "learning_rate": 1.2048296504658207e-05, + "loss": 0.715, + "step": 1588 + }, + { + "epoch": 0.8474666666666667, + "grad_norm": 0.4314227468057858, + "learning_rate": 1.1966195656380031e-05, + "loss": 0.7296, + "step": 1589 + }, + { + "epoch": 0.848, + "grad_norm": 0.39005495379037797, + "learning_rate": 1.1884357690404158e-05, + "loss": 0.6103, + "step": 1590 + }, + { + "epoch": 0.8485333333333334, + "grad_norm": 0.37697134200698884, + "learning_rate": 1.1802782851111205e-05, + "loss": 0.6663, + "step": 1591 + }, + { + "epoch": 0.8490666666666666, + "grad_norm": 0.3734450064142702, + "learning_rate": 1.1721471382096027e-05, + "loss": 0.586, + "step": 1592 + }, + { + "epoch": 0.8496, + "grad_norm": 0.46314791891224083, + "learning_rate": 1.1640423526166988e-05, + "loss": 0.6743, + "step": 1593 + }, + { + "epoch": 0.8501333333333333, + "grad_norm": 0.43504959813464567, + "learning_rate": 1.1559639525345311e-05, + "loss": 0.6121, + "step": 1594 + }, + { + "epoch": 0.8506666666666667, + "grad_norm": 0.36929908422880675, + "learning_rate": 1.1479119620864276e-05, + "loss": 0.6279, + "step": 1595 + }, + { + "epoch": 0.8512, + "grad_norm": 0.49701659054680103, + "learning_rate": 1.1398864053168534e-05, + "loss": 0.7483, + "step": 1596 + }, + { + "epoch": 0.8517333333333333, + "grad_norm": 0.4222092092548254, + "learning_rate": 1.1318873061913405e-05, + "loss": 0.676, + "step": 1597 + }, + { + "epoch": 0.8522666666666666, + "grad_norm": 0.45831563430137234, + "learning_rate": 1.123914688596409e-05, + "loss": 0.7232, + "step": 1598 + }, + { + "epoch": 0.8528, + "grad_norm": 0.38874723637823067, + "learning_rate": 1.1159685763395111e-05, + "loss": 0.6417, + "step": 1599 + }, + { + "epoch": 0.8533333333333334, + "grad_norm": 0.37989141337877586, + "learning_rate": 1.1080489931489391e-05, + "loss": 0.5809, + "step": 1600 + }, + { + "epoch": 0.8538666666666667, + "grad_norm": 0.4166422428912336, + "learning_rate": 1.1001559626737756e-05, + "loss": 0.6342, + "step": 1601 + }, + { + "epoch": 0.8544, + "grad_norm": 0.36713917128878437, + "learning_rate": 1.0922895084838037e-05, + "loss": 0.6315, + "step": 1602 + }, + { + "epoch": 0.8549333333333333, + "grad_norm": 0.37609252189776415, + "learning_rate": 1.0844496540694515e-05, + "loss": 0.6151, + "step": 1603 + }, + { + "epoch": 0.8554666666666667, + "grad_norm": 0.376209779205758, + "learning_rate": 1.0766364228417148e-05, + "loss": 0.636, + "step": 1604 + }, + { + "epoch": 0.856, + "grad_norm": 0.40409534948346404, + "learning_rate": 1.0688498381320855e-05, + "loss": 0.6576, + "step": 1605 + }, + { + "epoch": 0.8565333333333334, + "grad_norm": 0.3580445368354473, + "learning_rate": 1.0610899231924886e-05, + "loss": 0.5748, + "step": 1606 + }, + { + "epoch": 0.8570666666666666, + "grad_norm": 0.34008507004316907, + "learning_rate": 1.0533567011952094e-05, + "loss": 0.5227, + "step": 1607 + }, + { + "epoch": 0.8576, + "grad_norm": 0.3685045006334314, + "learning_rate": 1.045650195232819e-05, + "loss": 0.5796, + "step": 1608 + }, + { + "epoch": 0.8581333333333333, + "grad_norm": 0.389301347448525, + "learning_rate": 1.0379704283181179e-05, + "loss": 0.5669, + "step": 1609 + }, + { + "epoch": 0.8586666666666667, + "grad_norm": 0.4083232449511819, + "learning_rate": 1.0303174233840528e-05, + "loss": 0.6492, + "step": 1610 + }, + { + "epoch": 0.8592, + "grad_norm": 0.41614186078286347, + "learning_rate": 1.0226912032836611e-05, + "loss": 0.7204, + "step": 1611 + }, + { + "epoch": 0.8597333333333333, + "grad_norm": 0.42405989365191976, + "learning_rate": 1.0150917907899926e-05, + "loss": 0.6691, + "step": 1612 + }, + { + "epoch": 0.8602666666666666, + "grad_norm": 0.3905776917851294, + "learning_rate": 1.007519208596045e-05, + "loss": 0.6346, + "step": 1613 + }, + { + "epoch": 0.8608, + "grad_norm": 0.4240628512154941, + "learning_rate": 9.999734793146998e-06, + "loss": 0.6504, + "step": 1614 + }, + { + "epoch": 0.8613333333333333, + "grad_norm": 0.42225047989138437, + "learning_rate": 9.924546254786493e-06, + "loss": 0.6832, + "step": 1615 + }, + { + "epoch": 0.8618666666666667, + "grad_norm": 0.4036307026291307, + "learning_rate": 9.849626695403324e-06, + "loss": 0.5814, + "step": 1616 + }, + { + "epoch": 0.8624, + "grad_norm": 0.4032539463748285, + "learning_rate": 9.774976338718677e-06, + "loss": 0.6393, + "step": 1617 + }, + { + "epoch": 0.8629333333333333, + "grad_norm": 0.44355391910769243, + "learning_rate": 9.700595407649805e-06, + "loss": 0.664, + "step": 1618 + }, + { + "epoch": 0.8634666666666667, + "grad_norm": 0.42149209704525703, + "learning_rate": 9.62648412430951e-06, + "loss": 0.6334, + "step": 1619 + }, + { + "epoch": 0.864, + "grad_norm": 0.44447353675726664, + "learning_rate": 9.552642710005299e-06, + "loss": 0.586, + "step": 1620 + }, + { + "epoch": 0.8645333333333334, + "grad_norm": 0.4888778248873591, + "learning_rate": 9.479071385238892e-06, + "loss": 0.6906, + "step": 1621 + }, + { + "epoch": 0.8650666666666667, + "grad_norm": 0.5618455210067266, + "learning_rate": 9.40577036970538e-06, + "loss": 0.6409, + "step": 1622 + }, + { + "epoch": 0.8656, + "grad_norm": 0.48120743644240427, + "learning_rate": 9.332739882292752e-06, + "loss": 0.735, + "step": 1623 + }, + { + "epoch": 0.8661333333333333, + "grad_norm": 0.43725656309249994, + "learning_rate": 9.259980141081115e-06, + "loss": 0.6324, + "step": 1624 + }, + { + "epoch": 0.8666666666666667, + "grad_norm": 0.4078778982634269, + "learning_rate": 9.187491363342093e-06, + "loss": 0.6272, + "step": 1625 + }, + { + "epoch": 0.8672, + "grad_norm": 0.39262040137363846, + "learning_rate": 9.115273765538202e-06, + "loss": 0.6683, + "step": 1626 + }, + { + "epoch": 0.8677333333333334, + "grad_norm": 0.4056787918545523, + "learning_rate": 9.043327563322112e-06, + "loss": 0.6422, + "step": 1627 + }, + { + "epoch": 0.8682666666666666, + "grad_norm": 0.3710993037706992, + "learning_rate": 8.971652971536148e-06, + "loss": 0.6207, + "step": 1628 + }, + { + "epoch": 0.8688, + "grad_norm": 0.4181428824263666, + "learning_rate": 8.900250204211514e-06, + "loss": 0.6546, + "step": 1629 + }, + { + "epoch": 0.8693333333333333, + "grad_norm": 0.36225059464645426, + "learning_rate": 8.829119474567671e-06, + "loss": 0.6119, + "step": 1630 + }, + { + "epoch": 0.8698666666666667, + "grad_norm": 0.4251014294424469, + "learning_rate": 8.758260995011825e-06, + "loss": 0.663, + "step": 1631 + }, + { + "epoch": 0.8704, + "grad_norm": 0.4756696780031577, + "learning_rate": 8.687674977138116e-06, + "loss": 0.6912, + "step": 1632 + }, + { + "epoch": 0.8709333333333333, + "grad_norm": 0.3554683301896617, + "learning_rate": 8.617361631727138e-06, + "loss": 0.548, + "step": 1633 + }, + { + "epoch": 0.8714666666666666, + "grad_norm": 0.603016536409824, + "learning_rate": 8.547321168745193e-06, + "loss": 0.7974, + "step": 1634 + }, + { + "epoch": 0.872, + "grad_norm": 0.3923180270827677, + "learning_rate": 8.47755379734373e-06, + "loss": 0.6231, + "step": 1635 + }, + { + "epoch": 0.8725333333333334, + "grad_norm": 0.3929339474375356, + "learning_rate": 8.408059725858719e-06, + "loss": 0.5855, + "step": 1636 + }, + { + "epoch": 0.8730666666666667, + "grad_norm": 0.3852648359159864, + "learning_rate": 8.338839161809997e-06, + "loss": 0.6461, + "step": 1637 + }, + { + "epoch": 0.8736, + "grad_norm": 0.38968951516630496, + "learning_rate": 8.269892311900696e-06, + "loss": 0.6216, + "step": 1638 + }, + { + "epoch": 0.8741333333333333, + "grad_norm": 0.3797986191453474, + "learning_rate": 8.201219382016556e-06, + "loss": 0.6372, + "step": 1639 + }, + { + "epoch": 0.8746666666666667, + "grad_norm": 0.38303609001935035, + "learning_rate": 8.132820577225387e-06, + "loss": 0.5823, + "step": 1640 + }, + { + "epoch": 0.8752, + "grad_norm": 0.4366958103642028, + "learning_rate": 8.064696101776358e-06, + "loss": 0.6372, + "step": 1641 + }, + { + "epoch": 0.8757333333333334, + "grad_norm": 0.42322691632058274, + "learning_rate": 7.996846159099557e-06, + "loss": 0.6362, + "step": 1642 + }, + { + "epoch": 0.8762666666666666, + "grad_norm": 0.3891173311203792, + "learning_rate": 7.929270951805178e-06, + "loss": 0.5992, + "step": 1643 + }, + { + "epoch": 0.8768, + "grad_norm": 0.37586740269463964, + "learning_rate": 7.861970681683051e-06, + "loss": 0.6811, + "step": 1644 + }, + { + "epoch": 0.8773333333333333, + "grad_norm": 0.3702660037262906, + "learning_rate": 7.794945549701993e-06, + "loss": 0.6075, + "step": 1645 + }, + { + "epoch": 0.8778666666666667, + "grad_norm": 0.40831551285631984, + "learning_rate": 7.728195756009204e-06, + "loss": 0.639, + "step": 1646 + }, + { + "epoch": 0.8784, + "grad_norm": 0.42505489999592005, + "learning_rate": 7.661721499929753e-06, + "loss": 0.6602, + "step": 1647 + }, + { + "epoch": 0.8789333333333333, + "grad_norm": 0.4634256620993115, + "learning_rate": 7.595522979965819e-06, + "loss": 0.6652, + "step": 1648 + }, + { + "epoch": 0.8794666666666666, + "grad_norm": 0.3975136154435997, + "learning_rate": 7.529600393796232e-06, + "loss": 0.6182, + "step": 1649 + }, + { + "epoch": 0.88, + "grad_norm": 0.35900290259848433, + "learning_rate": 7.463953938275858e-06, + "loss": 0.5408, + "step": 1650 + }, + { + "epoch": 0.8805333333333333, + "grad_norm": 0.5238800168777887, + "learning_rate": 7.3985838094349444e-06, + "loss": 0.7043, + "step": 1651 + }, + { + "epoch": 0.8810666666666667, + "grad_norm": 0.41110094511302564, + "learning_rate": 7.333490202478666e-06, + "loss": 0.6375, + "step": 1652 + }, + { + "epoch": 0.8816, + "grad_norm": 0.41615166707814594, + "learning_rate": 7.2686733117863784e-06, + "loss": 0.6578, + "step": 1653 + }, + { + "epoch": 0.8821333333333333, + "grad_norm": 0.42650553868054925, + "learning_rate": 7.204133330911178e-06, + "loss": 0.6431, + "step": 1654 + }, + { + "epoch": 0.8826666666666667, + "grad_norm": 0.5233343583632561, + "learning_rate": 7.1398704525792e-06, + "loss": 0.7134, + "step": 1655 + }, + { + "epoch": 0.8832, + "grad_norm": 0.4165457177974901, + "learning_rate": 7.07588486868922e-06, + "loss": 0.6196, + "step": 1656 + }, + { + "epoch": 0.8837333333333334, + "grad_norm": 0.6116535667977607, + "learning_rate": 7.012176770311862e-06, + "loss": 0.6383, + "step": 1657 + }, + { + "epoch": 0.8842666666666666, + "grad_norm": 0.396977276556628, + "learning_rate": 6.948746347689183e-06, + "loss": 0.6391, + "step": 1658 + }, + { + "epoch": 0.8848, + "grad_norm": 0.37237274450227187, + "learning_rate": 6.8855937902340576e-06, + "loss": 0.5685, + "step": 1659 + }, + { + "epoch": 0.8853333333333333, + "grad_norm": 0.3730492247703517, + "learning_rate": 6.8227192865295995e-06, + "loss": 0.6381, + "step": 1660 + }, + { + "epoch": 0.8858666666666667, + "grad_norm": 0.43171087207029185, + "learning_rate": 6.760123024328624e-06, + "loss": 0.6581, + "step": 1661 + }, + { + "epoch": 0.8864, + "grad_norm": 0.3594653058558447, + "learning_rate": 6.6978051905530855e-06, + "loss": 0.5859, + "step": 1662 + }, + { + "epoch": 0.8869333333333334, + "grad_norm": 0.40031606713438245, + "learning_rate": 6.635765971293484e-06, + "loss": 0.6098, + "step": 1663 + }, + { + "epoch": 0.8874666666666666, + "grad_norm": 0.3577652012444116, + "learning_rate": 6.5740055518083375e-06, + "loss": 0.595, + "step": 1664 + }, + { + "epoch": 0.888, + "grad_norm": 0.33568210599329296, + "learning_rate": 6.512524116523633e-06, + "loss": 0.5656, + "step": 1665 + }, + { + "epoch": 0.8885333333333333, + "grad_norm": 0.3757493259664873, + "learning_rate": 6.451321849032288e-06, + "loss": 0.6518, + "step": 1666 + }, + { + "epoch": 0.8890666666666667, + "grad_norm": 0.37183669635879696, + "learning_rate": 6.390398932093555e-06, + "loss": 0.5944, + "step": 1667 + }, + { + "epoch": 0.8896, + "grad_norm": 0.3837060792491067, + "learning_rate": 6.329755547632499e-06, + "loss": 0.579, + "step": 1668 + }, + { + "epoch": 0.8901333333333333, + "grad_norm": 0.3869512633232343, + "learning_rate": 6.269391876739495e-06, + "loss": 0.6089, + "step": 1669 + }, + { + "epoch": 0.8906666666666667, + "grad_norm": 0.38558581631332584, + "learning_rate": 6.209308099669597e-06, + "loss": 0.5621, + "step": 1670 + }, + { + "epoch": 0.8912, + "grad_norm": 0.3620135333664394, + "learning_rate": 6.149504395842087e-06, + "loss": 0.6053, + "step": 1671 + }, + { + "epoch": 0.8917333333333334, + "grad_norm": 0.399758881219958, + "learning_rate": 6.089980943839924e-06, + "loss": 0.5723, + "step": 1672 + }, + { + "epoch": 0.8922666666666667, + "grad_norm": 0.4116930783863641, + "learning_rate": 6.030737921409169e-06, + "loss": 0.7012, + "step": 1673 + }, + { + "epoch": 0.8928, + "grad_norm": 0.3676953632029055, + "learning_rate": 5.971775505458444e-06, + "loss": 0.6175, + "step": 1674 + }, + { + "epoch": 0.8933333333333333, + "grad_norm": 0.38209507850251573, + "learning_rate": 5.913093872058528e-06, + "loss": 0.551, + "step": 1675 + }, + { + "epoch": 0.8938666666666667, + "grad_norm": 0.39917237518155163, + "learning_rate": 5.854693196441641e-06, + "loss": 0.5764, + "step": 1676 + }, + { + "epoch": 0.8944, + "grad_norm": 0.3774143488798994, + "learning_rate": 5.7965736530010916e-06, + "loss": 0.6169, + "step": 1677 + }, + { + "epoch": 0.8949333333333334, + "grad_norm": 0.4326734293746316, + "learning_rate": 5.738735415290642e-06, + "loss": 0.5727, + "step": 1678 + }, + { + "epoch": 0.8954666666666666, + "grad_norm": 0.46639365334044075, + "learning_rate": 5.681178656024055e-06, + "loss": 0.7644, + "step": 1679 + }, + { + "epoch": 0.896, + "grad_norm": 0.41650925093634616, + "learning_rate": 5.623903547074549e-06, + "loss": 0.7055, + "step": 1680 + }, + { + "epoch": 0.8965333333333333, + "grad_norm": 0.425413643257288, + "learning_rate": 5.566910259474289e-06, + "loss": 0.6392, + "step": 1681 + }, + { + "epoch": 0.8970666666666667, + "grad_norm": 0.39278957561917244, + "learning_rate": 5.510198963413881e-06, + "loss": 0.6099, + "step": 1682 + }, + { + "epoch": 0.8976, + "grad_norm": 0.390744500709825, + "learning_rate": 5.453769828241872e-06, + "loss": 0.6568, + "step": 1683 + }, + { + "epoch": 0.8981333333333333, + "grad_norm": 0.4073043144813503, + "learning_rate": 5.397623022464226e-06, + "loss": 0.594, + "step": 1684 + }, + { + "epoch": 0.8986666666666666, + "grad_norm": 0.4154341662053565, + "learning_rate": 5.341758713743828e-06, + "loss": 0.6985, + "step": 1685 + }, + { + "epoch": 0.8992, + "grad_norm": 0.43612098721955833, + "learning_rate": 5.286177068899989e-06, + "loss": 0.6879, + "step": 1686 + }, + { + "epoch": 0.8997333333333334, + "grad_norm": 0.4580436108523151, + "learning_rate": 5.230878253907912e-06, + "loss": 0.6894, + "step": 1687 + }, + { + "epoch": 0.9002666666666667, + "grad_norm": 0.45484764609805073, + "learning_rate": 5.175862433898282e-06, + "loss": 0.7056, + "step": 1688 + }, + { + "epoch": 0.9008, + "grad_norm": 0.4056923813691809, + "learning_rate": 5.121129773156663e-06, + "loss": 0.5872, + "step": 1689 + }, + { + "epoch": 0.9013333333333333, + "grad_norm": 0.3959728264489669, + "learning_rate": 5.066680435123106e-06, + "loss": 0.6153, + "step": 1690 + }, + { + "epoch": 0.9018666666666667, + "grad_norm": 0.38389124048407347, + "learning_rate": 5.012514582391592e-06, + "loss": 0.6166, + "step": 1691 + }, + { + "epoch": 0.9024, + "grad_norm": 0.37726113265552336, + "learning_rate": 4.95863237670956e-06, + "loss": 0.676, + "step": 1692 + }, + { + "epoch": 0.9029333333333334, + "grad_norm": 0.39381983874735715, + "learning_rate": 4.905033978977491e-06, + "loss": 0.6252, + "step": 1693 + }, + { + "epoch": 0.9034666666666666, + "grad_norm": 0.38630816177028143, + "learning_rate": 4.851719549248301e-06, + "loss": 0.6824, + "step": 1694 + }, + { + "epoch": 0.904, + "grad_norm": 0.3938486563850244, + "learning_rate": 4.798689246727006e-06, + "loss": 0.5795, + "step": 1695 + }, + { + "epoch": 0.9045333333333333, + "grad_norm": 0.3935846533547814, + "learning_rate": 4.745943229770122e-06, + "loss": 0.6273, + "step": 1696 + }, + { + "epoch": 0.9050666666666667, + "grad_norm": 0.4016983236901665, + "learning_rate": 4.693481655885257e-06, + "loss": 0.6784, + "step": 1697 + }, + { + "epoch": 0.9056, + "grad_norm": 0.42610143158704605, + "learning_rate": 4.641304681730641e-06, + "loss": 0.6469, + "step": 1698 + }, + { + "epoch": 0.9061333333333333, + "grad_norm": 0.37841507863339735, + "learning_rate": 4.58941246311464e-06, + "loss": 0.6772, + "step": 1699 + }, + { + "epoch": 0.9066666666666666, + "grad_norm": 0.43091005059953413, + "learning_rate": 4.537805154995278e-06, + "loss": 0.6265, + "step": 1700 + }, + { + "epoch": 0.9072, + "grad_norm": 0.3862461493501147, + "learning_rate": 4.486482911479839e-06, + "loss": 0.5994, + "step": 1701 + }, + { + "epoch": 0.9077333333333333, + "grad_norm": 0.45623319790774136, + "learning_rate": 4.435445885824285e-06, + "loss": 0.7243, + "step": 1702 + }, + { + "epoch": 0.9082666666666667, + "grad_norm": 0.3914923161785018, + "learning_rate": 4.384694230432984e-06, + "loss": 0.6274, + "step": 1703 + }, + { + "epoch": 0.9088, + "grad_norm": 0.37402162465050454, + "learning_rate": 4.3342280968580285e-06, + "loss": 0.6032, + "step": 1704 + }, + { + "epoch": 0.9093333333333333, + "grad_norm": 0.4507907929467838, + "learning_rate": 4.2840476357989825e-06, + "loss": 0.6729, + "step": 1705 + }, + { + "epoch": 0.9098666666666667, + "grad_norm": 0.3661986883957363, + "learning_rate": 4.2341529971023255e-06, + "loss": 0.5669, + "step": 1706 + }, + { + "epoch": 0.9104, + "grad_norm": 0.41388626248702587, + "learning_rate": 4.184544329761009e-06, + "loss": 0.6383, + "step": 1707 + }, + { + "epoch": 0.9109333333333334, + "grad_norm": 0.3663264719910165, + "learning_rate": 4.135221781914034e-06, + "loss": 0.6319, + "step": 1708 + }, + { + "epoch": 0.9114666666666666, + "grad_norm": 0.38342215848984346, + "learning_rate": 4.0861855008460405e-06, + "loss": 0.565, + "step": 1709 + }, + { + "epoch": 0.912, + "grad_norm": 0.4429325625906281, + "learning_rate": 4.037435632986786e-06, + "loss": 0.6849, + "step": 1710 + }, + { + "epoch": 0.9125333333333333, + "grad_norm": 0.40666064686143744, + "learning_rate": 3.988972323910778e-06, + "loss": 0.6665, + "step": 1711 + }, + { + "epoch": 0.9130666666666667, + "grad_norm": 0.39052164502879655, + "learning_rate": 3.9407957183368095e-06, + "loss": 0.6324, + "step": 1712 + }, + { + "epoch": 0.9136, + "grad_norm": 0.41727403909031907, + "learning_rate": 3.892905960127546e-06, + "loss": 0.6476, + "step": 1713 + }, + { + "epoch": 0.9141333333333334, + "grad_norm": 0.425226205526703, + "learning_rate": 3.845303192289074e-06, + "loss": 0.5984, + "step": 1714 + }, + { + "epoch": 0.9146666666666666, + "grad_norm": 0.42989062615720675, + "learning_rate": 3.797987556970495e-06, + "loss": 0.6723, + "step": 1715 + }, + { + "epoch": 0.9152, + "grad_norm": 0.4636995124081845, + "learning_rate": 3.750959195463466e-06, + "loss": 0.7219, + "step": 1716 + }, + { + "epoch": 0.9157333333333333, + "grad_norm": 0.3605121368134305, + "learning_rate": 3.7042182482018075e-06, + "loss": 0.6043, + "step": 1717 + }, + { + "epoch": 0.9162666666666667, + "grad_norm": 0.38272748533172646, + "learning_rate": 3.6577648547611033e-06, + "loss": 0.6545, + "step": 1718 + }, + { + "epoch": 0.9168, + "grad_norm": 0.3522011131041464, + "learning_rate": 3.611599153858214e-06, + "loss": 0.5692, + "step": 1719 + }, + { + "epoch": 0.9173333333333333, + "grad_norm": 0.49209849132812505, + "learning_rate": 3.565721283350931e-06, + "loss": 0.6871, + "step": 1720 + }, + { + "epoch": 0.9178666666666667, + "grad_norm": 0.47373614480469567, + "learning_rate": 3.5201313802375456e-06, + "loss": 0.6194, + "step": 1721 + }, + { + "epoch": 0.9184, + "grad_norm": 0.37572241258048084, + "learning_rate": 3.4748295806564356e-06, + "loss": 0.6331, + "step": 1722 + }, + { + "epoch": 0.9189333333333334, + "grad_norm": 0.41192517155161484, + "learning_rate": 3.4298160198856568e-06, + "loss": 0.6621, + "step": 1723 + }, + { + "epoch": 0.9194666666666667, + "grad_norm": 0.34315006199865633, + "learning_rate": 3.3850908323424967e-06, + "loss": 0.5141, + "step": 1724 + }, + { + "epoch": 0.92, + "grad_norm": 0.38284360446064186, + "learning_rate": 3.3406541515832003e-06, + "loss": 0.693, + "step": 1725 + }, + { + "epoch": 0.9205333333333333, + "grad_norm": 0.42746887049605614, + "learning_rate": 3.296506110302422e-06, + "loss": 0.6035, + "step": 1726 + }, + { + "epoch": 0.9210666666666667, + "grad_norm": 0.3717543292629806, + "learning_rate": 3.252646840332918e-06, + "loss": 0.5495, + "step": 1727 + }, + { + "epoch": 0.9216, + "grad_norm": 0.4416172856217034, + "learning_rate": 3.209076472645112e-06, + "loss": 0.6688, + "step": 1728 + }, + { + "epoch": 0.9221333333333334, + "grad_norm": 0.40412629092513996, + "learning_rate": 3.1657951373467497e-06, + "loss": 0.5929, + "step": 1729 + }, + { + "epoch": 0.9226666666666666, + "grad_norm": 0.548109727386344, + "learning_rate": 3.1228029636824475e-06, + "loss": 0.6094, + "step": 1730 + }, + { + "epoch": 0.9232, + "grad_norm": 0.37109535691540146, + "learning_rate": 3.0801000800333877e-06, + "loss": 0.6617, + "step": 1731 + }, + { + "epoch": 0.9237333333333333, + "grad_norm": 0.38958918145454857, + "learning_rate": 3.037686613916857e-06, + "loss": 0.6604, + "step": 1732 + }, + { + "epoch": 0.9242666666666667, + "grad_norm": 0.324289583165879, + "learning_rate": 2.995562691985898e-06, + "loss": 0.5615, + "step": 1733 + }, + { + "epoch": 0.9248, + "grad_norm": 0.33730563151936555, + "learning_rate": 2.9537284400289355e-06, + "loss": 0.543, + "step": 1734 + }, + { + "epoch": 0.9253333333333333, + "grad_norm": 0.3420527854088097, + "learning_rate": 2.912183982969385e-06, + "loss": 0.5682, + "step": 1735 + }, + { + "epoch": 0.9258666666666666, + "grad_norm": 0.6141191055395692, + "learning_rate": 2.8709294448653225e-06, + "loss": 0.7235, + "step": 1736 + }, + { + "epoch": 0.9264, + "grad_norm": 0.4081644698765578, + "learning_rate": 2.8299649489090475e-06, + "loss": 0.5907, + "step": 1737 + }, + { + "epoch": 0.9269333333333334, + "grad_norm": 0.4304989232114822, + "learning_rate": 2.789290617426765e-06, + "loss": 0.6644, + "step": 1738 + }, + { + "epoch": 0.9274666666666667, + "grad_norm": 0.44572336968832404, + "learning_rate": 2.748906571878207e-06, + "loss": 0.7001, + "step": 1739 + }, + { + "epoch": 0.928, + "grad_norm": 0.4304277308592198, + "learning_rate": 2.708812932856253e-06, + "loss": 0.5982, + "step": 1740 + }, + { + "epoch": 0.9285333333333333, + "grad_norm": 0.42531278167401115, + "learning_rate": 2.6690098200866098e-06, + "loss": 0.664, + "step": 1741 + }, + { + "epoch": 0.9290666666666667, + "grad_norm": 0.4149536644687968, + "learning_rate": 2.6294973524274125e-06, + "loss": 0.6007, + "step": 1742 + }, + { + "epoch": 0.9296, + "grad_norm": 0.43253782934120905, + "learning_rate": 2.590275647868867e-06, + "loss": 0.6781, + "step": 1743 + }, + { + "epoch": 0.9301333333333334, + "grad_norm": 0.35974443341526036, + "learning_rate": 2.551344823532964e-06, + "loss": 0.594, + "step": 1744 + }, + { + "epoch": 0.9306666666666666, + "grad_norm": 0.41653066509061626, + "learning_rate": 2.5127049956730207e-06, + "loss": 0.6348, + "step": 1745 + }, + { + "epoch": 0.9312, + "grad_norm": 0.4413389196480324, + "learning_rate": 2.4743562796734622e-06, + "loss": 0.6193, + "step": 1746 + }, + { + "epoch": 0.9317333333333333, + "grad_norm": 0.3688000579881498, + "learning_rate": 2.436298790049363e-06, + "loss": 0.5977, + "step": 1747 + }, + { + "epoch": 0.9322666666666667, + "grad_norm": 0.4199307911083171, + "learning_rate": 2.3985326404461604e-06, + "loss": 0.6777, + "step": 1748 + }, + { + "epoch": 0.9328, + "grad_norm": 0.3970606163052576, + "learning_rate": 2.3610579436393e-06, + "loss": 0.6222, + "step": 1749 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 0.4174698629182832, + "learning_rate": 2.3238748115339324e-06, + "loss": 0.6815, + "step": 1750 + }, + { + "epoch": 0.9338666666666666, + "grad_norm": 0.4036921769098127, + "learning_rate": 2.286983355164529e-06, + "loss": 0.6032, + "step": 1751 + }, + { + "epoch": 0.9344, + "grad_norm": 0.4285008185444954, + "learning_rate": 2.250383684694579e-06, + "loss": 0.6643, + "step": 1752 + }, + { + "epoch": 0.9349333333333333, + "grad_norm": 0.36953939602361985, + "learning_rate": 2.2140759094162467e-06, + "loss": 0.6107, + "step": 1753 + }, + { + "epoch": 0.9354666666666667, + "grad_norm": 0.4132918798454661, + "learning_rate": 2.178060137750071e-06, + "loss": 0.6374, + "step": 1754 + }, + { + "epoch": 0.936, + "grad_norm": 0.39093970713003195, + "learning_rate": 2.1423364772445887e-06, + "loss": 0.5885, + "step": 1755 + }, + { + "epoch": 0.9365333333333333, + "grad_norm": 0.45277043620307783, + "learning_rate": 2.106905034576112e-06, + "loss": 0.6468, + "step": 1756 + }, + { + "epoch": 0.9370666666666667, + "grad_norm": 0.38838921651357705, + "learning_rate": 2.0717659155482738e-06, + "loss": 0.6715, + "step": 1757 + }, + { + "epoch": 0.9376, + "grad_norm": 0.35170101651760033, + "learning_rate": 2.036919225091827e-06, + "loss": 0.6085, + "step": 1758 + }, + { + "epoch": 0.9381333333333334, + "grad_norm": 0.40977188438436474, + "learning_rate": 2.002365067264289e-06, + "loss": 0.6122, + "step": 1759 + }, + { + "epoch": 0.9386666666666666, + "grad_norm": 0.4999692476876301, + "learning_rate": 1.968103545249611e-06, + "loss": 0.7785, + "step": 1760 + }, + { + "epoch": 0.9392, + "grad_norm": 0.44579608014517047, + "learning_rate": 1.9341347613579087e-06, + "loss": 0.6574, + "step": 1761 + }, + { + "epoch": 0.9397333333333333, + "grad_norm": 0.38393065737591503, + "learning_rate": 1.900458817025097e-06, + "loss": 0.6132, + "step": 1762 + }, + { + "epoch": 0.9402666666666667, + "grad_norm": 0.39650180104626154, + "learning_rate": 1.8670758128126909e-06, + "loss": 0.6378, + "step": 1763 + }, + { + "epoch": 0.9408, + "grad_norm": 0.37566035110459617, + "learning_rate": 1.8339858484073935e-06, + "loss": 0.5903, + "step": 1764 + }, + { + "epoch": 0.9413333333333334, + "grad_norm": 0.3779202134176613, + "learning_rate": 1.8011890226208527e-06, + "loss": 0.6376, + "step": 1765 + }, + { + "epoch": 0.9418666666666666, + "grad_norm": 0.45430837631109716, + "learning_rate": 1.7686854333893833e-06, + "loss": 0.6216, + "step": 1766 + }, + { + "epoch": 0.9424, + "grad_norm": 0.380466844962673, + "learning_rate": 1.7364751777736332e-06, + "loss": 0.6337, + "step": 1767 + }, + { + "epoch": 0.9429333333333333, + "grad_norm": 0.45972781704796434, + "learning_rate": 1.7045583519583074e-06, + "loss": 0.6983, + "step": 1768 + }, + { + "epoch": 0.9434666666666667, + "grad_norm": 0.36117014737348896, + "learning_rate": 1.6729350512519005e-06, + "loss": 0.6296, + "step": 1769 + }, + { + "epoch": 0.944, + "grad_norm": 0.3938199548858364, + "learning_rate": 1.6416053700863964e-06, + "loss": 0.6015, + "step": 1770 + }, + { + "epoch": 0.9445333333333333, + "grad_norm": 0.37137212354045274, + "learning_rate": 1.6105694020169593e-06, + "loss": 0.5782, + "step": 1771 + }, + { + "epoch": 0.9450666666666667, + "grad_norm": 0.3882336811470042, + "learning_rate": 1.5798272397217095e-06, + "loss": 0.5943, + "step": 1772 + }, + { + "epoch": 0.9456, + "grad_norm": 0.4376675474062556, + "learning_rate": 1.5493789750014031e-06, + "loss": 0.6402, + "step": 1773 + }, + { + "epoch": 0.9461333333333334, + "grad_norm": 0.3965225862235942, + "learning_rate": 1.5192246987791981e-06, + "loss": 0.6234, + "step": 1774 + }, + { + "epoch": 0.9466666666666667, + "grad_norm": 0.38088896296122987, + "learning_rate": 1.489364501100332e-06, + "loss": 0.6113, + "step": 1775 + }, + { + "epoch": 0.9472, + "grad_norm": 0.40567672292834395, + "learning_rate": 1.459798471131868e-06, + "loss": 0.579, + "step": 1776 + }, + { + "epoch": 0.9477333333333333, + "grad_norm": 0.39066870639184614, + "learning_rate": 1.430526697162482e-06, + "loss": 0.6062, + "step": 1777 + }, + { + "epoch": 0.9482666666666667, + "grad_norm": 0.43513431531760194, + "learning_rate": 1.4015492666021312e-06, + "loss": 0.6324, + "step": 1778 + }, + { + "epoch": 0.9488, + "grad_norm": 0.47798973632227304, + "learning_rate": 1.3728662659818204e-06, + "loss": 0.7004, + "step": 1779 + }, + { + "epoch": 0.9493333333333334, + "grad_norm": 0.3437629351738168, + "learning_rate": 1.344477780953346e-06, + "loss": 0.5798, + "step": 1780 + }, + { + "epoch": 0.9498666666666666, + "grad_norm": 0.5310871525203585, + "learning_rate": 1.3163838962890195e-06, + "loss": 0.7382, + "step": 1781 + }, + { + "epoch": 0.9504, + "grad_norm": 0.3380724272652971, + "learning_rate": 1.2885846958814673e-06, + "loss": 0.5703, + "step": 1782 + }, + { + "epoch": 0.9509333333333333, + "grad_norm": 0.3692754455038017, + "learning_rate": 1.261080262743297e-06, + "loss": 0.6134, + "step": 1783 + }, + { + "epoch": 0.9514666666666667, + "grad_norm": 0.46281748086123625, + "learning_rate": 1.2338706790069431e-06, + "loss": 0.6626, + "step": 1784 + }, + { + "epoch": 0.952, + "grad_norm": 0.4344435512092572, + "learning_rate": 1.2069560259243328e-06, + "loss": 0.6054, + "step": 1785 + }, + { + "epoch": 0.9525333333333333, + "grad_norm": 0.4376945848340338, + "learning_rate": 1.1803363838667092e-06, + "loss": 0.6312, + "step": 1786 + }, + { + "epoch": 0.9530666666666666, + "grad_norm": 0.38851437739746575, + "learning_rate": 1.1540118323243865e-06, + "loss": 0.6102, + "step": 1787 + }, + { + "epoch": 0.9536, + "grad_norm": 0.3791549299691001, + "learning_rate": 1.1279824499064396e-06, + "loss": 0.5704, + "step": 1788 + }, + { + "epoch": 0.9541333333333334, + "grad_norm": 0.34637671330326325, + "learning_rate": 1.1022483143405705e-06, + "loss": 0.569, + "step": 1789 + }, + { + "epoch": 0.9546666666666667, + "grad_norm": 0.397350750997209, + "learning_rate": 1.076809502472831e-06, + "loss": 0.6319, + "step": 1790 + }, + { + "epoch": 0.9552, + "grad_norm": 0.45244614874524447, + "learning_rate": 1.0516660902673448e-06, + "loss": 0.6244, + "step": 1791 + }, + { + "epoch": 0.9557333333333333, + "grad_norm": 0.4220643235107355, + "learning_rate": 1.0268181528061749e-06, + "loss": 0.6361, + "step": 1792 + }, + { + "epoch": 0.9562666666666667, + "grad_norm": 0.398898706807609, + "learning_rate": 1.0022657642890231e-06, + "loss": 0.6188, + "step": 1793 + }, + { + "epoch": 0.9568, + "grad_norm": 0.3431482155645094, + "learning_rate": 9.780089980330642e-07, + "loss": 0.5479, + "step": 1794 + }, + { + "epoch": 0.9573333333333334, + "grad_norm": 0.4041433878134855, + "learning_rate": 9.540479264726676e-07, + "loss": 0.6493, + "step": 1795 + }, + { + "epoch": 0.9578666666666666, + "grad_norm": 0.37743795314295875, + "learning_rate": 9.303826211592315e-07, + "loss": 0.6491, + "step": 1796 + }, + { + "epoch": 0.9584, + "grad_norm": 0.4370842130641423, + "learning_rate": 9.070131527609604e-07, + "loss": 0.584, + "step": 1797 + }, + { + "epoch": 0.9589333333333333, + "grad_norm": 0.40030636657203433, + "learning_rate": 8.839395910626213e-07, + "loss": 0.6349, + "step": 1798 + }, + { + "epoch": 0.9594666666666667, + "grad_norm": 0.34671994474872103, + "learning_rate": 8.611620049653879e-07, + "loss": 0.5521, + "step": 1799 + }, + { + "epoch": 0.96, + "grad_norm": 0.4450820840061595, + "learning_rate": 8.386804624865851e-07, + "loss": 0.6631, + "step": 1800 + }, + { + "epoch": 0.9605333333333334, + "grad_norm": 0.39007069670154804, + "learning_rate": 8.16495030759501e-07, + "loss": 0.6345, + "step": 1801 + }, + { + "epoch": 0.9610666666666666, + "grad_norm": 0.4845383265828704, + "learning_rate": 7.946057760332193e-07, + "loss": 0.6056, + "step": 1802 + }, + { + "epoch": 0.9616, + "grad_norm": 0.3805334869970108, + "learning_rate": 7.730127636723539e-07, + "loss": 0.5943, + "step": 1803 + }, + { + "epoch": 0.9621333333333333, + "grad_norm": 0.4392854524912693, + "learning_rate": 7.517160581569372e-07, + "loss": 0.6876, + "step": 1804 + }, + { + "epoch": 0.9626666666666667, + "grad_norm": 0.46948718919118265, + "learning_rate": 7.307157230821426e-07, + "loss": 0.6842, + "step": 1805 + }, + { + "epoch": 0.9632, + "grad_norm": 0.42666707394312775, + "learning_rate": 7.100118211581852e-07, + "loss": 0.7057, + "step": 1806 + }, + { + "epoch": 0.9637333333333333, + "grad_norm": 0.4691367931022991, + "learning_rate": 6.896044142100433e-07, + "loss": 0.6714, + "step": 1807 + }, + { + "epoch": 0.9642666666666667, + "grad_norm": 0.37921786631867405, + "learning_rate": 6.694935631773258e-07, + "loss": 0.6112, + "step": 1808 + }, + { + "epoch": 0.9648, + "grad_norm": 0.46148125725126665, + "learning_rate": 6.496793281141056e-07, + "loss": 0.7639, + "step": 1809 + }, + { + "epoch": 0.9653333333333334, + "grad_norm": 0.4844058606638755, + "learning_rate": 6.301617681886863e-07, + "loss": 0.7221, + "step": 1810 + }, + { + "epoch": 0.9658666666666667, + "grad_norm": 0.406922952308384, + "learning_rate": 6.109409416834688e-07, + "loss": 0.6234, + "step": 1811 + }, + { + "epoch": 0.9664, + "grad_norm": 0.38163981720304063, + "learning_rate": 5.920169059947411e-07, + "loss": 0.6152, + "step": 1812 + }, + { + "epoch": 0.9669333333333333, + "grad_norm": 0.37315891106541976, + "learning_rate": 5.733897176325665e-07, + "loss": 0.6172, + "step": 1813 + }, + { + "epoch": 0.9674666666666667, + "grad_norm": 0.3889085407281807, + "learning_rate": 5.550594322205504e-07, + "loss": 0.6271, + "step": 1814 + }, + { + "epoch": 0.968, + "grad_norm": 0.40090834399965586, + "learning_rate": 5.370261044956971e-07, + "loss": 0.6137, + "step": 1815 + }, + { + "epoch": 0.9685333333333334, + "grad_norm": 0.40461055305320026, + "learning_rate": 5.192897883082747e-07, + "loss": 0.6256, + "step": 1816 + }, + { + "epoch": 0.9690666666666666, + "grad_norm": 0.38194267251555764, + "learning_rate": 5.018505366216175e-07, + "loss": 0.6062, + "step": 1817 + }, + { + "epoch": 0.9696, + "grad_norm": 0.43002286383319605, + "learning_rate": 4.847084015119574e-07, + "loss": 0.6104, + "step": 1818 + }, + { + "epoch": 0.9701333333333333, + "grad_norm": 0.5208208467308275, + "learning_rate": 4.678634341683252e-07, + "loss": 0.672, + "step": 1819 + }, + { + "epoch": 0.9706666666666667, + "grad_norm": 0.42246477229007084, + "learning_rate": 4.5131568489236166e-07, + "loss": 0.6144, + "step": 1820 + }, + { + "epoch": 0.9712, + "grad_norm": 0.42363089684681204, + "learning_rate": 4.3506520309813947e-07, + "loss": 0.6333, + "step": 1821 + }, + { + "epoch": 0.9717333333333333, + "grad_norm": 0.4419638503758844, + "learning_rate": 4.191120373120749e-07, + "loss": 0.6045, + "step": 1822 + }, + { + "epoch": 0.9722666666666666, + "grad_norm": 0.3869966116104389, + "learning_rate": 4.034562351727389e-07, + "loss": 0.6064, + "step": 1823 + }, + { + "epoch": 0.9728, + "grad_norm": 0.39819979538946243, + "learning_rate": 3.8809784343072366e-07, + "loss": 0.6683, + "step": 1824 + }, + { + "epoch": 0.9733333333333334, + "grad_norm": 0.38011596539787706, + "learning_rate": 3.73036907948543e-07, + "loss": 0.5736, + "step": 1825 + }, + { + "epoch": 0.9738666666666667, + "grad_norm": 0.3746452885852023, + "learning_rate": 3.582734737004101e-07, + "loss": 0.61, + "step": 1826 + }, + { + "epoch": 0.9744, + "grad_norm": 0.47900250653324616, + "learning_rate": 3.4380758477219333e-07, + "loss": 0.6865, + "step": 1827 + }, + { + "epoch": 0.9749333333333333, + "grad_norm": 0.3525264619604992, + "learning_rate": 3.296392843612273e-07, + "loss": 0.5629, + "step": 1828 + }, + { + "epoch": 0.9754666666666667, + "grad_norm": 0.4955924167435773, + "learning_rate": 3.1576861477621287e-07, + "loss": 0.7149, + "step": 1829 + }, + { + "epoch": 0.976, + "grad_norm": 0.4184355019656372, + "learning_rate": 3.0219561743707326e-07, + "loss": 0.6345, + "step": 1830 + }, + { + "epoch": 0.9765333333333334, + "grad_norm": 0.3938800613710465, + "learning_rate": 2.889203328748424e-07, + "loss": 0.6386, + "step": 1831 + }, + { + "epoch": 0.9770666666666666, + "grad_norm": 0.3982472188768771, + "learning_rate": 2.759428007315212e-07, + "loss": 0.6034, + "step": 1832 + }, + { + "epoch": 0.9776, + "grad_norm": 0.37240093380425443, + "learning_rate": 2.6326305976001055e-07, + "loss": 0.6225, + "step": 1833 + }, + { + "epoch": 0.9781333333333333, + "grad_norm": 0.4214353885560392, + "learning_rate": 2.5088114782392257e-07, + "loss": 0.6532, + "step": 1834 + }, + { + "epoch": 0.9786666666666667, + "grad_norm": 0.41423386651278216, + "learning_rate": 2.3879710189753656e-07, + "loss": 0.6109, + "step": 1835 + }, + { + "epoch": 0.9792, + "grad_norm": 0.6273923580496839, + "learning_rate": 2.2701095806565432e-07, + "loss": 0.6919, + "step": 1836 + }, + { + "epoch": 0.9797333333333333, + "grad_norm": 0.41093764346339123, + "learning_rate": 2.15522751523467e-07, + "loss": 0.6575, + "step": 1837 + }, + { + "epoch": 0.9802666666666666, + "grad_norm": 0.4177778256201117, + "learning_rate": 2.0433251657653308e-07, + "loss": 0.6339, + "step": 1838 + }, + { + "epoch": 0.9808, + "grad_norm": 0.4095187632595675, + "learning_rate": 1.9344028664056713e-07, + "loss": 0.589, + "step": 1839 + }, + { + "epoch": 0.9813333333333333, + "grad_norm": 0.40917192662630975, + "learning_rate": 1.8284609424142895e-07, + "loss": 0.5995, + "step": 1840 + }, + { + "epoch": 0.9818666666666667, + "grad_norm": 0.36597627394304755, + "learning_rate": 1.7254997101500137e-07, + "loss": 0.6336, + "step": 1841 + }, + { + "epoch": 0.9824, + "grad_norm": 0.3943987511881223, + "learning_rate": 1.6255194770704586e-07, + "loss": 0.6231, + "step": 1842 + }, + { + "epoch": 0.9829333333333333, + "grad_norm": 0.41345739895333505, + "learning_rate": 1.5285205417319149e-07, + "loss": 0.6544, + "step": 1843 + }, + { + "epoch": 0.9834666666666667, + "grad_norm": 0.4024778477907338, + "learning_rate": 1.4345031937879062e-07, + "loss": 0.6262, + "step": 1844 + }, + { + "epoch": 0.984, + "grad_norm": 0.47877298730134144, + "learning_rate": 1.3434677139885222e-07, + "loss": 0.6805, + "step": 1845 + }, + { + "epoch": 0.9845333333333334, + "grad_norm": 0.4232211837735616, + "learning_rate": 1.255414374179531e-07, + "loss": 0.6415, + "step": 1846 + }, + { + "epoch": 0.9850666666666666, + "grad_norm": 0.3626033796777617, + "learning_rate": 1.170343437301491e-07, + "loss": 0.5324, + "step": 1847 + }, + { + "epoch": 0.9856, + "grad_norm": 0.4424864683447427, + "learning_rate": 1.0882551573891953e-07, + "loss": 0.6041, + "step": 1848 + }, + { + "epoch": 0.9861333333333333, + "grad_norm": 0.3991040889335519, + "learning_rate": 1.0091497795706728e-07, + "loss": 0.6313, + "step": 1849 + }, + { + "epoch": 0.9866666666666667, + "grad_norm": 0.3712157586467656, + "learning_rate": 9.330275400666332e-08, + "loss": 0.5788, + "step": 1850 + }, + { + "epoch": 0.9872, + "grad_norm": 0.39765646907224256, + "learning_rate": 8.598886661895788e-08, + "loss": 0.6132, + "step": 1851 + }, + { + "epoch": 0.9877333333333334, + "grad_norm": 0.45976109755188366, + "learning_rate": 7.8973337634336e-08, + "loss": 0.6972, + "step": 1852 + }, + { + "epoch": 0.9882666666666666, + "grad_norm": 0.3958638220607664, + "learning_rate": 7.225618800222877e-08, + "loss": 0.6145, + "step": 1853 + }, + { + "epoch": 0.9888, + "grad_norm": 0.46242520338791676, + "learning_rate": 6.583743778106887e-08, + "loss": 0.6866, + "step": 1854 + }, + { + "epoch": 0.9893333333333333, + "grad_norm": 0.41695337344509853, + "learning_rate": 5.971710613821291e-08, + "loss": 0.6614, + "step": 1855 + }, + { + "epoch": 0.9898666666666667, + "grad_norm": 0.4227237033985834, + "learning_rate": 5.389521134989695e-08, + "loss": 0.6806, + "step": 1856 + }, + { + "epoch": 0.9904, + "grad_norm": 0.4829258985045428, + "learning_rate": 4.837177080119215e-08, + "loss": 0.6839, + "step": 1857 + }, + { + "epoch": 0.9909333333333333, + "grad_norm": 0.4294697874822055, + "learning_rate": 4.314680098592705e-08, + "loss": 0.661, + "step": 1858 + }, + { + "epoch": 0.9914666666666667, + "grad_norm": 0.3901223575904081, + "learning_rate": 3.8220317506654226e-08, + "loss": 0.6105, + "step": 1859 + }, + { + "epoch": 0.992, + "grad_norm": 0.40648596688328853, + "learning_rate": 3.359233507459481e-08, + "loss": 0.6426, + "step": 1860 + }, + { + "epoch": 0.9925333333333334, + "grad_norm": 0.3686515189571681, + "learning_rate": 2.9262867509605163e-08, + "loss": 0.5758, + "step": 1861 + }, + { + "epoch": 0.9930666666666667, + "grad_norm": 0.6166304982855115, + "learning_rate": 2.5231927740154704e-08, + "loss": 0.6289, + "step": 1862 + }, + { + "epoch": 0.9936, + "grad_norm": 0.4229002483853549, + "learning_rate": 2.1499527803214846e-08, + "loss": 0.6452, + "step": 1863 + }, + { + "epoch": 0.9941333333333333, + "grad_norm": 0.3963059097706496, + "learning_rate": 1.8065678844314538e-08, + "loss": 0.6044, + "step": 1864 + }, + { + "epoch": 0.9946666666666667, + "grad_norm": 0.3920565762890434, + "learning_rate": 1.4930391117451426e-08, + "loss": 0.6373, + "step": 1865 + }, + { + "epoch": 0.9952, + "grad_norm": 0.4401043721301902, + "learning_rate": 1.209367398504746e-08, + "loss": 0.6717, + "step": 1866 + }, + { + "epoch": 0.9957333333333334, + "grad_norm": 0.3856518849580394, + "learning_rate": 9.555535917993297e-09, + "loss": 0.5939, + "step": 1867 + }, + { + "epoch": 0.9962666666666666, + "grad_norm": 0.4074397128158654, + "learning_rate": 7.315984495548378e-09, + "loss": 0.6534, + "step": 1868 + }, + { + "epoch": 0.9968, + "grad_norm": 0.3903679497354316, + "learning_rate": 5.375026405352035e-09, + "loss": 0.6001, + "step": 1869 + }, + { + "epoch": 0.9973333333333333, + "grad_norm": 0.39556118144329355, + "learning_rate": 3.732667443390181e-09, + "loss": 0.6371, + "step": 1870 + }, + { + "epoch": 0.9978666666666667, + "grad_norm": 0.3570559166896929, + "learning_rate": 2.388912514017516e-09, + "loss": 0.5798, + "step": 1871 + }, + { + "epoch": 0.9984, + "grad_norm": 0.43848927183099673, + "learning_rate": 1.3437656298687097e-09, + "loss": 0.6691, + "step": 1872 + }, + { + "epoch": 0.9989333333333333, + "grad_norm": 0.3887278108125185, + "learning_rate": 5.972299119250125e-10, + "loss": 0.6331, + "step": 1873 + }, + { + "epoch": 0.9994666666666666, + "grad_norm": 0.3720451458593415, + "learning_rate": 1.4930758944764479e-10, + "loss": 0.6607, + "step": 1874 + }, + { + "epoch": 1.0, + "grad_norm": 0.42451716518267596, + "learning_rate": 0.0, + "loss": 0.6693, + "step": 1875 + }, + { + "epoch": 1.0, + "step": 1875, + "total_flos": 1690936583258112.0, + "train_loss": 0.7105037103970846, + "train_runtime": 29377.3841, + "train_samples_per_second": 1.021, + "train_steps_per_second": 0.064 + } + ], + "logging_steps": 1.0, + "max_steps": 1875, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1690936583258112.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_1_epochs_1_GA_2_lora/README.md b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_1_epochs_1_GA_2_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_1_epochs_1_GA_2_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_1_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_1_epochs_1_GA_2_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..30b813290ec4aa61eeeba82f3f4c5009897d19f4 --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_1_epochs_1_GA_2_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "down_proj", + "gate_proj", + "q_proj", + "o_proj", + "up_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0cae4f2aff9c5d14ef610908eed96b046e9c7aee --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_1_epochs_1_GA_2_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d612e19a7470cc3e3a55d6951195e5cd1768350e9cfdaac960417de402b7a29a +size 671150064 diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_1_epochs_1_GA_2_lora/config.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_1_epochs_1_GA_2_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_1_epochs_1_GA_2_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..be8efead8647943c5b4f26116482153b15c7eae5 --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_1_epochs_1_GA_2_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c8d63fc4cbc4f145985f596f2c6af645c7ee34970ed978ab9d4cedadeae8872 +size 918507402 diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_1_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_1_epochs_1_GA_2_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f66f6f388913ee61c26ce37fb0c8b72c41ba9ae5 --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_1_epochs_1_GA_2_lora/trainer_state.json @@ -0,0 +1,2226 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9984, + "eval_steps": 500, + "global_step": 312, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0032, + "grad_norm": 0.706372384637355, + "learning_rate": 2e-05, + "loss": 1.2116, + "step": 1 + }, + { + "epoch": 0.0064, + "grad_norm": 0.7132577302985311, + "learning_rate": 4e-05, + "loss": 1.2159, + "step": 2 + }, + { + "epoch": 0.0096, + "grad_norm": 0.7576851087474644, + "learning_rate": 6e-05, + "loss": 1.3365, + "step": 3 + }, + { + "epoch": 0.0128, + "grad_norm": 0.7251469370332176, + "learning_rate": 8e-05, + "loss": 1.2046, + "step": 4 + }, + { + "epoch": 0.016, + "grad_norm": 0.7543909315754976, + "learning_rate": 0.0001, + "loss": 1.1315, + "step": 5 + }, + { + "epoch": 0.0192, + "grad_norm": 0.7710513379038294, + "learning_rate": 0.00012, + "loss": 1.0763, + "step": 6 + }, + { + "epoch": 0.0224, + "grad_norm": 0.8700130881117148, + "learning_rate": 0.00014, + "loss": 1.0161, + "step": 7 + }, + { + "epoch": 0.0256, + "grad_norm": 0.7062873685696989, + "learning_rate": 0.00016, + "loss": 0.9766, + "step": 8 + }, + { + "epoch": 0.0288, + "grad_norm": 0.6276719913686867, + "learning_rate": 0.00018, + "loss": 0.8885, + "step": 9 + }, + { + "epoch": 0.032, + "grad_norm": 0.48268253814003653, + "learning_rate": 0.0002, + "loss": 0.9105, + "step": 10 + }, + { + "epoch": 0.0352, + "grad_norm": 0.6046383023845995, + "learning_rate": 0.00019999458931878073, + "loss": 0.9888, + "step": 11 + }, + { + "epoch": 0.0384, + "grad_norm": 0.5207115213993277, + "learning_rate": 0.0001999783578606323, + "loss": 0.9443, + "step": 12 + }, + { + "epoch": 0.0416, + "grad_norm": 0.4804285710737122, + "learning_rate": 0.00019995130738201966, + "loss": 0.8564, + "step": 13 + }, + { + "epoch": 0.0448, + "grad_norm": 0.5545095568507734, + "learning_rate": 0.0001999134408101731, + "loss": 0.9725, + "step": 14 + }, + { + "epoch": 0.048, + "grad_norm": 0.5511366684926723, + "learning_rate": 0.00019986476224277165, + "loss": 0.9165, + "step": 15 + }, + { + "epoch": 0.0512, + "grad_norm": 0.526546815165093, + "learning_rate": 0.00019980527694749952, + "loss": 0.9242, + "step": 16 + }, + { + "epoch": 0.0544, + "grad_norm": 0.5320719142330005, + "learning_rate": 0.00019973499136147606, + "loss": 0.8972, + "step": 17 + }, + { + "epoch": 0.0576, + "grad_norm": 0.49262475460436955, + "learning_rate": 0.0001996539130905593, + "loss": 0.8967, + "step": 18 + }, + { + "epoch": 0.0608, + "grad_norm": 0.46026944803598585, + "learning_rate": 0.0001995620509085228, + "loss": 0.8632, + "step": 19 + }, + { + "epoch": 0.064, + "grad_norm": 0.47274515264742456, + "learning_rate": 0.00019945941475610623, + "loss": 0.8848, + "step": 20 + }, + { + "epoch": 0.0672, + "grad_norm": 0.46436181337012583, + "learning_rate": 0.0001993460157399396, + "loss": 0.8669, + "step": 21 + }, + { + "epoch": 0.0704, + "grad_norm": 0.41213506760026203, + "learning_rate": 0.0001992218661313415, + "loss": 0.8417, + "step": 22 + }, + { + "epoch": 0.0736, + "grad_norm": 0.40866649316812587, + "learning_rate": 0.00019908697936499103, + "loss": 0.8383, + "step": 23 + }, + { + "epoch": 0.0768, + "grad_norm": 0.5895646382759859, + "learning_rate": 0.00019894137003747403, + "loss": 0.8898, + "step": 24 + }, + { + "epoch": 0.08, + "grad_norm": 0.4754218774744775, + "learning_rate": 0.00019878505390570362, + "loss": 0.8739, + "step": 25 + }, + { + "epoch": 0.0832, + "grad_norm": 0.42615491071092587, + "learning_rate": 0.00019861804788521493, + "loss": 0.8955, + "step": 26 + }, + { + "epoch": 0.0864, + "grad_norm": 0.4973425567847533, + "learning_rate": 0.00019844037004833473, + "loss": 0.9069, + "step": 27 + }, + { + "epoch": 0.0896, + "grad_norm": 0.42284857437958, + "learning_rate": 0.00019825203962222572, + "loss": 0.8475, + "step": 28 + }, + { + "epoch": 0.0928, + "grad_norm": 0.42719840251241153, + "learning_rate": 0.0001980530769868059, + "loss": 0.8369, + "step": 29 + }, + { + "epoch": 0.096, + "grad_norm": 0.4361087817832279, + "learning_rate": 0.00019784350367254322, + "loss": 0.8563, + "step": 30 + }, + { + "epoch": 0.0992, + "grad_norm": 0.3870466447188877, + "learning_rate": 0.0001976233423581255, + "loss": 0.8284, + "step": 31 + }, + { + "epoch": 0.1024, + "grad_norm": 0.4333186810342646, + "learning_rate": 0.0001973926168680066, + "loss": 0.803, + "step": 32 + }, + { + "epoch": 0.1056, + "grad_norm": 0.5057877523198733, + "learning_rate": 0.00019715135216982798, + "loss": 0.9339, + "step": 33 + }, + { + "epoch": 0.1088, + "grad_norm": 0.39771554288914684, + "learning_rate": 0.0001968995743717171, + "loss": 0.8405, + "step": 34 + }, + { + "epoch": 0.112, + "grad_norm": 0.39423571382411393, + "learning_rate": 0.00019663731071946206, + "loss": 0.7895, + "step": 35 + }, + { + "epoch": 0.1152, + "grad_norm": 0.4145519181964843, + "learning_rate": 0.00019636458959356316, + "loss": 0.8203, + "step": 36 + }, + { + "epoch": 0.1184, + "grad_norm": 0.41487025580304077, + "learning_rate": 0.0001960814405061619, + "loss": 0.849, + "step": 37 + }, + { + "epoch": 0.1216, + "grad_norm": 0.41940164168771354, + "learning_rate": 0.00019578789409784727, + "loss": 0.7895, + "step": 38 + }, + { + "epoch": 0.1248, + "grad_norm": 0.4820667535546615, + "learning_rate": 0.00019548398213434007, + "loss": 0.8965, + "step": 39 + }, + { + "epoch": 0.128, + "grad_norm": 0.4665067823750921, + "learning_rate": 0.00019516973750305532, + "loss": 0.9094, + "step": 40 + }, + { + "epoch": 0.1312, + "grad_norm": 0.5265719547596305, + "learning_rate": 0.00019484519420954354, + "loss": 0.8688, + "step": 41 + }, + { + "epoch": 0.1344, + "grad_norm": 0.42260029299982316, + "learning_rate": 0.00019451038737381077, + "loss": 0.781, + "step": 42 + }, + { + "epoch": 0.1376, + "grad_norm": 0.46029663275021715, + "learning_rate": 0.00019416535322651818, + "loss": 0.8734, + "step": 43 + }, + { + "epoch": 0.1408, + "grad_norm": 0.4032305072429021, + "learning_rate": 0.00019381012910506146, + "loss": 0.7946, + "step": 44 + }, + { + "epoch": 0.144, + "grad_norm": 0.4271057343845864, + "learning_rate": 0.00019344475344953012, + "loss": 0.862, + "step": 45 + }, + { + "epoch": 0.1472, + "grad_norm": 0.43514680222912955, + "learning_rate": 0.00019306926579854821, + "loss": 0.9001, + "step": 46 + }, + { + "epoch": 0.1504, + "grad_norm": 0.3994355342843695, + "learning_rate": 0.00019268370678499533, + "loss": 0.8053, + "step": 47 + }, + { + "epoch": 0.1536, + "grad_norm": 0.3852189335285682, + "learning_rate": 0.0001922881181316097, + "loss": 0.8227, + "step": 48 + }, + { + "epoch": 0.1568, + "grad_norm": 0.42218700595202424, + "learning_rate": 0.00019188254264647337, + "loss": 0.9141, + "step": 49 + }, + { + "epoch": 0.16, + "grad_norm": 0.4134418764962068, + "learning_rate": 0.0001914670242183795, + "loss": 0.8425, + "step": 50 + }, + { + "epoch": 0.1632, + "grad_norm": 0.4365271441054017, + "learning_rate": 0.0001910416078120832, + "loss": 0.8687, + "step": 51 + }, + { + "epoch": 0.1664, + "grad_norm": 0.4198113121039888, + "learning_rate": 0.0001906063394634356, + "loss": 0.7688, + "step": 52 + }, + { + "epoch": 0.1696, + "grad_norm": 0.405081556296362, + "learning_rate": 0.00019016126627440237, + "loss": 0.8238, + "step": 53 + }, + { + "epoch": 0.1728, + "grad_norm": 0.3966738782637263, + "learning_rate": 0.00018970643640796642, + "loss": 0.8059, + "step": 54 + }, + { + "epoch": 0.176, + "grad_norm": 0.4216526542500135, + "learning_rate": 0.000189241899082916, + "loss": 0.8174, + "step": 55 + }, + { + "epoch": 0.1792, + "grad_norm": 0.40486760918324616, + "learning_rate": 0.00018876770456851877, + "loss": 0.7881, + "step": 56 + }, + { + "epoch": 0.1824, + "grad_norm": 0.372170659374174, + "learning_rate": 0.0001882839041790818, + "loss": 0.813, + "step": 57 + }, + { + "epoch": 0.1856, + "grad_norm": 0.3623051107925519, + "learning_rate": 0.00018779055026839868, + "loss": 0.7411, + "step": 58 + }, + { + "epoch": 0.1888, + "grad_norm": 0.4399934756341017, + "learning_rate": 0.00018728769622408423, + "loss": 0.7899, + "step": 59 + }, + { + "epoch": 0.192, + "grad_norm": 0.37612989675354286, + "learning_rate": 0.00018677539646179707, + "loss": 0.8054, + "step": 60 + }, + { + "epoch": 0.1952, + "grad_norm": 0.4009296252617688, + "learning_rate": 0.00018625370641935129, + "loss": 0.7704, + "step": 61 + }, + { + "epoch": 0.1984, + "grad_norm": 0.4375959043493014, + "learning_rate": 0.00018572268255071718, + "loss": 0.85, + "step": 62 + }, + { + "epoch": 0.2016, + "grad_norm": 0.4058068414533547, + "learning_rate": 0.00018518238231991218, + "loss": 0.8767, + "step": 63 + }, + { + "epoch": 0.2048, + "grad_norm": 0.45318507931954866, + "learning_rate": 0.00018463286419478255, + "loss": 0.9078, + "step": 64 + }, + { + "epoch": 0.208, + "grad_norm": 0.47499487962131176, + "learning_rate": 0.00018407418764067627, + "loss": 0.8995, + "step": 65 + }, + { + "epoch": 0.2112, + "grad_norm": 0.4112047746691982, + "learning_rate": 0.00018350641311400812, + "loss": 0.7646, + "step": 66 + }, + { + "epoch": 0.2144, + "grad_norm": 0.4599444309834608, + "learning_rate": 0.0001829296020557174, + "loss": 0.8395, + "step": 67 + }, + { + "epoch": 0.2176, + "grad_norm": 0.4153268210964602, + "learning_rate": 0.00018234381688461942, + "loss": 0.8149, + "step": 68 + }, + { + "epoch": 0.2208, + "grad_norm": 0.4095997162714167, + "learning_rate": 0.0001817491209906506, + "loss": 0.7965, + "step": 69 + }, + { + "epoch": 0.224, + "grad_norm": 0.37632430395084054, + "learning_rate": 0.00018114557872800905, + "loss": 0.7468, + "step": 70 + }, + { + "epoch": 0.2272, + "grad_norm": 1.754470990635494, + "learning_rate": 0.00018053325540819045, + "loss": 0.7746, + "step": 71 + }, + { + "epoch": 0.2304, + "grad_norm": 0.43862893440675715, + "learning_rate": 0.0001799122172929206, + "loss": 0.8312, + "step": 72 + }, + { + "epoch": 0.2336, + "grad_norm": 0.3730853254794896, + "learning_rate": 0.00017928253158698473, + "loss": 0.7838, + "step": 73 + }, + { + "epoch": 0.2368, + "grad_norm": 0.43501234736399524, + "learning_rate": 0.0001786442664309554, + "loss": 0.8121, + "step": 74 + }, + { + "epoch": 0.24, + "grad_norm": 0.4315395622099375, + "learning_rate": 0.0001779974908938184, + "loss": 0.8045, + "step": 75 + }, + { + "epoch": 0.2432, + "grad_norm": 0.45243219214473296, + "learning_rate": 0.0001773422749654988, + "loss": 0.7998, + "step": 76 + }, + { + "epoch": 0.2464, + "grad_norm": 0.38158104127374903, + "learning_rate": 0.00017667868954928694, + "loss": 0.7273, + "step": 77 + }, + { + "epoch": 0.2496, + "grad_norm": 0.39029421047072665, + "learning_rate": 0.00017600680645416583, + "loss": 0.8051, + "step": 78 + }, + { + "epoch": 0.2528, + "grad_norm": 0.4448685187536034, + "learning_rate": 0.00017532669838704035, + "loss": 0.821, + "step": 79 + }, + { + "epoch": 0.256, + "grad_norm": 0.40437396682659577, + "learning_rate": 0.00017463843894486937, + "loss": 0.8099, + "step": 80 + }, + { + "epoch": 0.2592, + "grad_norm": 0.39897213964130696, + "learning_rate": 0.0001739421026067017, + "loss": 0.7215, + "step": 81 + }, + { + "epoch": 0.2624, + "grad_norm": 0.3830906985552, + "learning_rate": 0.00017323776472561627, + "loss": 0.7608, + "step": 82 + }, + { + "epoch": 0.2656, + "grad_norm": 0.4293136276970088, + "learning_rate": 0.00017252550152056795, + "loss": 0.8422, + "step": 83 + }, + { + "epoch": 0.2688, + "grad_norm": 0.3990501135156007, + "learning_rate": 0.0001718053900681397, + "loss": 0.7954, + "step": 84 + }, + { + "epoch": 0.272, + "grad_norm": 0.3969139416853821, + "learning_rate": 0.00017107750829420176, + "loss": 0.7744, + "step": 85 + }, + { + "epoch": 0.2752, + "grad_norm": 0.46193230448634365, + "learning_rate": 0.00017034193496547902, + "loss": 0.8728, + "step": 86 + }, + { + "epoch": 0.2784, + "grad_norm": 0.44007549015603314, + "learning_rate": 0.00016959874968102735, + "loss": 0.8259, + "step": 87 + }, + { + "epoch": 0.2816, + "grad_norm": 0.3869222724070434, + "learning_rate": 0.00016884803286362, + "loss": 0.7533, + "step": 88 + }, + { + "epoch": 0.2848, + "grad_norm": 0.41884480439098987, + "learning_rate": 0.00016808986575104465, + "loss": 0.8667, + "step": 89 + }, + { + "epoch": 0.288, + "grad_norm": 0.43009413537086566, + "learning_rate": 0.00016732433038731242, + "loss": 0.8322, + "step": 90 + }, + { + "epoch": 0.2912, + "grad_norm": 0.44501336696217714, + "learning_rate": 0.0001665515096137797, + "loss": 0.8458, + "step": 91 + }, + { + "epoch": 0.2944, + "grad_norm": 0.4117672217268619, + "learning_rate": 0.00016577148706018328, + "loss": 0.8866, + "step": 92 + }, + { + "epoch": 0.2976, + "grad_norm": 0.3635525623502143, + "learning_rate": 0.00016498434713559088, + "loss": 0.6836, + "step": 93 + }, + { + "epoch": 0.3008, + "grad_norm": 0.44119747695330935, + "learning_rate": 0.00016419017501926656, + "loss": 0.9032, + "step": 94 + }, + { + "epoch": 0.304, + "grad_norm": 0.3634053428885579, + "learning_rate": 0.0001633890566514535, + "loss": 0.7741, + "step": 95 + }, + { + "epoch": 0.3072, + "grad_norm": 0.40074048786281563, + "learning_rate": 0.00016258107872407375, + "loss": 0.7567, + "step": 96 + }, + { + "epoch": 0.3104, + "grad_norm": 0.3710119961265731, + "learning_rate": 0.0001617663286713474, + "loss": 0.7438, + "step": 97 + }, + { + "epoch": 0.3136, + "grad_norm": 0.43659390815943544, + "learning_rate": 0.00016094489466033043, + "loss": 0.8229, + "step": 98 + }, + { + "epoch": 0.3168, + "grad_norm": 0.44089643664019057, + "learning_rate": 0.00016011686558137448, + "loss": 0.8717, + "step": 99 + }, + { + "epoch": 0.32, + "grad_norm": 0.4882390118058521, + "learning_rate": 0.0001592823310385073, + "loss": 0.8413, + "step": 100 + }, + { + "epoch": 0.3232, + "grad_norm": 0.3892151483766946, + "learning_rate": 0.0001584413813397364, + "loss": 0.7386, + "step": 101 + }, + { + "epoch": 0.3264, + "grad_norm": 0.45962800562606665, + "learning_rate": 0.00015759410748727662, + "loss": 0.8214, + "step": 102 + }, + { + "epoch": 0.3296, + "grad_norm": 0.36844123484513785, + "learning_rate": 0.00015674060116770236, + "loss": 0.7762, + "step": 103 + }, + { + "epoch": 0.3328, + "grad_norm": 0.393973862273754, + "learning_rate": 0.00015588095474202595, + "loss": 0.7748, + "step": 104 + }, + { + "epoch": 0.336, + "grad_norm": 0.4068913984751016, + "learning_rate": 0.00015501526123570277, + "loss": 0.7453, + "step": 105 + }, + { + "epoch": 0.3392, + "grad_norm": 0.4198609640121107, + "learning_rate": 0.00015414361432856475, + "loss": 0.8147, + "step": 106 + }, + { + "epoch": 0.3424, + "grad_norm": 0.44435427505636294, + "learning_rate": 0.0001532661083446829, + "loss": 0.7834, + "step": 107 + }, + { + "epoch": 0.3456, + "grad_norm": 0.4240365058162843, + "learning_rate": 0.00015238283824216015, + "loss": 0.8124, + "step": 108 + }, + { + "epoch": 0.3488, + "grad_norm": 0.43581310205188567, + "learning_rate": 0.00015149389960285558, + "loss": 0.7962, + "step": 109 + }, + { + "epoch": 0.352, + "grad_norm": 0.4067616506759109, + "learning_rate": 0.00015059938862204127, + "loss": 0.7642, + "step": 110 + }, + { + "epoch": 0.3552, + "grad_norm": 0.4389221538921379, + "learning_rate": 0.00014969940209799248, + "loss": 0.8329, + "step": 111 + }, + { + "epoch": 0.3584, + "grad_norm": 0.4292143785202815, + "learning_rate": 0.00014879403742151283, + "loss": 0.8139, + "step": 112 + }, + { + "epoch": 0.3616, + "grad_norm": 0.41683271568461383, + "learning_rate": 0.00014788339256539544, + "loss": 0.7463, + "step": 113 + }, + { + "epoch": 0.3648, + "grad_norm": 0.41560250231469886, + "learning_rate": 0.0001469675660738206, + "loss": 0.8365, + "step": 114 + }, + { + "epoch": 0.368, + "grad_norm": 0.4265526045880755, + "learning_rate": 0.00014604665705169237, + "loss": 0.7318, + "step": 115 + }, + { + "epoch": 0.3712, + "grad_norm": 0.4125340931465266, + "learning_rate": 0.00014512076515391375, + "loss": 0.7438, + "step": 116 + }, + { + "epoch": 0.3744, + "grad_norm": 0.41380660062293606, + "learning_rate": 0.00014418999057460276, + "loss": 0.7589, + "step": 117 + }, + { + "epoch": 0.3776, + "grad_norm": 0.4027084665611716, + "learning_rate": 0.0001432544340362501, + "loss": 0.7795, + "step": 118 + }, + { + "epoch": 0.3808, + "grad_norm": 0.4353731889281649, + "learning_rate": 0.00014231419677881966, + "loss": 0.8598, + "step": 119 + }, + { + "epoch": 0.384, + "grad_norm": 0.4740349413274777, + "learning_rate": 0.00014136938054879283, + "loss": 0.9333, + "step": 120 + }, + { + "epoch": 0.3872, + "grad_norm": 0.40992972281306467, + "learning_rate": 0.00014042008758815818, + "loss": 0.8826, + "step": 121 + }, + { + "epoch": 0.3904, + "grad_norm": 0.36347394284436363, + "learning_rate": 0.00013946642062334766, + "loss": 0.7407, + "step": 122 + }, + { + "epoch": 0.3936, + "grad_norm": 0.404150662357857, + "learning_rate": 0.00013850848285411994, + "loss": 0.7737, + "step": 123 + }, + { + "epoch": 0.3968, + "grad_norm": 0.4278255533166568, + "learning_rate": 0.000137546377942393, + "loss": 0.8332, + "step": 124 + }, + { + "epoch": 0.4, + "grad_norm": 0.39436523824192266, + "learning_rate": 0.00013658021000102636, + "loss": 0.7932, + "step": 125 + }, + { + "epoch": 0.4032, + "grad_norm": 0.40713229995212113, + "learning_rate": 0.00013561008358255468, + "loss": 0.82, + "step": 126 + }, + { + "epoch": 0.4064, + "grad_norm": 0.4310281065090191, + "learning_rate": 0.00013463610366787392, + "loss": 0.7957, + "step": 127 + }, + { + "epoch": 0.4096, + "grad_norm": 0.3613968258945572, + "learning_rate": 0.00013365837565488064, + "loss": 0.7372, + "step": 128 + }, + { + "epoch": 0.4128, + "grad_norm": 0.3800431375399143, + "learning_rate": 0.0001326770053470668, + "loss": 0.7416, + "step": 129 + }, + { + "epoch": 0.416, + "grad_norm": 0.3849926632349601, + "learning_rate": 0.0001316920989420703, + "loss": 0.712, + "step": 130 + }, + { + "epoch": 0.4192, + "grad_norm": 0.3827741004319946, + "learning_rate": 0.00013070376302018287, + "loss": 0.7685, + "step": 131 + }, + { + "epoch": 0.4224, + "grad_norm": 0.4184348035492567, + "learning_rate": 0.00012971210453281674, + "loss": 0.8409, + "step": 132 + }, + { + "epoch": 0.4256, + "grad_norm": 0.42952809627796723, + "learning_rate": 0.000128717230790931, + "loss": 0.782, + "step": 133 + }, + { + "epoch": 0.4288, + "grad_norm": 0.3885255772037007, + "learning_rate": 0.00012771924945341906, + "loss": 0.7796, + "step": 134 + }, + { + "epoch": 0.432, + "grad_norm": 0.4177816693102137, + "learning_rate": 0.00012671826851545851, + "loss": 0.8373, + "step": 135 + }, + { + "epoch": 0.4352, + "grad_norm": 0.4252103124047141, + "learning_rate": 0.0001257143962968246, + "loss": 0.7054, + "step": 136 + }, + { + "epoch": 0.4384, + "grad_norm": 0.39960643857322153, + "learning_rate": 0.00012470774143016853, + "loss": 0.7574, + "step": 137 + }, + { + "epoch": 0.4416, + "grad_norm": 0.4020654231178936, + "learning_rate": 0.00012369841284926188, + "loss": 0.8206, + "step": 138 + }, + { + "epoch": 0.4448, + "grad_norm": 0.42372134548147655, + "learning_rate": 0.00012268651977720866, + "loss": 0.8005, + "step": 139 + }, + { + "epoch": 0.448, + "grad_norm": 0.4045781402837269, + "learning_rate": 0.00012167217171462566, + "loss": 0.822, + "step": 140 + }, + { + "epoch": 0.4512, + "grad_norm": 0.4435646118102826, + "learning_rate": 0.0001206554784277931, + "loss": 0.8393, + "step": 141 + }, + { + "epoch": 0.4544, + "grad_norm": 0.3828792544162484, + "learning_rate": 0.00011963654993677645, + "loss": 0.7898, + "step": 142 + }, + { + "epoch": 0.4576, + "grad_norm": 0.38080024429440795, + "learning_rate": 0.00011861549650352069, + "loss": 0.7732, + "step": 143 + }, + { + "epoch": 0.4608, + "grad_norm": 0.4056355024308322, + "learning_rate": 0.00011759242861991855, + "loss": 0.7626, + "step": 144 + }, + { + "epoch": 0.464, + "grad_norm": 0.3882052147935702, + "learning_rate": 0.00011656745699585371, + "loss": 0.7952, + "step": 145 + }, + { + "epoch": 0.4672, + "grad_norm": 0.45730059206674206, + "learning_rate": 0.00011554069254722051, + "loss": 0.8217, + "step": 146 + }, + { + "epoch": 0.4704, + "grad_norm": 0.3974489084438826, + "learning_rate": 0.00011451224638392129, + "loss": 0.7611, + "step": 147 + }, + { + "epoch": 0.4736, + "grad_norm": 0.3797558562347122, + "learning_rate": 0.00011348222979784289, + "loss": 0.8435, + "step": 148 + }, + { + "epoch": 0.4768, + "grad_norm": 0.358948721145034, + "learning_rate": 0.00011245075425081328, + "loss": 0.7286, + "step": 149 + }, + { + "epoch": 0.48, + "grad_norm": 0.3782538505103814, + "learning_rate": 0.00011141793136253986, + "loss": 0.7428, + "step": 150 + }, + { + "epoch": 0.4832, + "grad_norm": 0.39267784239042214, + "learning_rate": 0.0001103838728985307, + "loss": 0.787, + "step": 151 + }, + { + "epoch": 0.4864, + "grad_norm": 0.42146096754745493, + "learning_rate": 0.000109348690758, + "loss": 0.789, + "step": 152 + }, + { + "epoch": 0.4896, + "grad_norm": 0.36020480638676666, + "learning_rate": 0.00010831249696175918, + "loss": 0.7271, + "step": 153 + }, + { + "epoch": 0.4928, + "grad_norm": 0.37430245737142476, + "learning_rate": 0.0001072754036400944, + "loss": 0.8252, + "step": 154 + }, + { + "epoch": 0.496, + "grad_norm": 0.38913045174749195, + "learning_rate": 0.00010623752302063283, + "loss": 0.773, + "step": 155 + }, + { + "epoch": 0.4992, + "grad_norm": 0.39960922244897257, + "learning_rate": 0.00010519896741619803, + "loss": 0.7155, + "step": 156 + }, + { + "epoch": 0.5024, + "grad_norm": 0.39641619234048997, + "learning_rate": 0.00010415984921265609, + "loss": 0.7896, + "step": 157 + }, + { + "epoch": 0.5056, + "grad_norm": 0.33652474109408925, + "learning_rate": 0.00010312028085675391, + "loss": 0.7171, + "step": 158 + }, + { + "epoch": 0.5088, + "grad_norm": 0.3941859869494838, + "learning_rate": 0.00010208037484395114, + "loss": 0.7855, + "step": 159 + }, + { + "epoch": 0.512, + "grad_norm": 0.35618008718785404, + "learning_rate": 0.00010104024370624644, + "loss": 0.7337, + "step": 160 + }, + { + "epoch": 0.5152, + "grad_norm": 0.4131684190723905, + "learning_rate": 0.0001, + "loss": 0.7677, + "step": 161 + }, + { + "epoch": 0.5184, + "grad_norm": 0.4082194849724162, + "learning_rate": 9.895975629375359e-05, + "loss": 0.7892, + "step": 162 + }, + { + "epoch": 0.5216, + "grad_norm": 0.3989600006708164, + "learning_rate": 9.791962515604887e-05, + "loss": 0.8089, + "step": 163 + }, + { + "epoch": 0.5248, + "grad_norm": 0.41631670995914843, + "learning_rate": 9.687971914324607e-05, + "loss": 0.7854, + "step": 164 + }, + { + "epoch": 0.528, + "grad_norm": 0.402657543866696, + "learning_rate": 9.584015078734395e-05, + "loss": 0.7787, + "step": 165 + }, + { + "epoch": 0.5312, + "grad_norm": 0.4107566999788711, + "learning_rate": 9.480103258380198e-05, + "loss": 0.7596, + "step": 166 + }, + { + "epoch": 0.5344, + "grad_norm": 0.41566952699593074, + "learning_rate": 9.376247697936719e-05, + "loss": 0.7629, + "step": 167 + }, + { + "epoch": 0.5376, + "grad_norm": 0.5342359883614851, + "learning_rate": 9.272459635990562e-05, + "loss": 0.9196, + "step": 168 + }, + { + "epoch": 0.5408, + "grad_norm": 0.3609183839419, + "learning_rate": 9.168750303824084e-05, + "loss": 0.7353, + "step": 169 + }, + { + "epoch": 0.544, + "grad_norm": 0.3565874591187865, + "learning_rate": 9.065130924199998e-05, + "loss": 0.7193, + "step": 170 + }, + { + "epoch": 0.5472, + "grad_norm": 0.3444424796083395, + "learning_rate": 8.961612710146934e-05, + "loss": 0.697, + "step": 171 + }, + { + "epoch": 0.5504, + "grad_norm": 0.41961617081613556, + "learning_rate": 8.858206863746018e-05, + "loss": 0.8216, + "step": 172 + }, + { + "epoch": 0.5536, + "grad_norm": 0.3927724673868508, + "learning_rate": 8.754924574918675e-05, + "loss": 0.8328, + "step": 173 + }, + { + "epoch": 0.5568, + "grad_norm": 0.3711041396342585, + "learning_rate": 8.651777020215712e-05, + "loss": 0.7837, + "step": 174 + }, + { + "epoch": 0.56, + "grad_norm": 0.42945797533025126, + "learning_rate": 8.548775361607872e-05, + "loss": 0.8349, + "step": 175 + }, + { + "epoch": 0.5632, + "grad_norm": 0.44056918352960456, + "learning_rate": 8.445930745277953e-05, + "loss": 0.7856, + "step": 176 + }, + { + "epoch": 0.5664, + "grad_norm": 0.34843951779969207, + "learning_rate": 8.343254300414628e-05, + "loss": 0.6821, + "step": 177 + }, + { + "epoch": 0.5696, + "grad_norm": 0.3888506113430166, + "learning_rate": 8.240757138008149e-05, + "loss": 0.8158, + "step": 178 + }, + { + "epoch": 0.5728, + "grad_norm": 0.37766985513771145, + "learning_rate": 8.138450349647936e-05, + "loss": 0.7402, + "step": 179 + }, + { + "epoch": 0.576, + "grad_norm": 0.4020630450815043, + "learning_rate": 8.036345006322359e-05, + "loss": 0.7494, + "step": 180 + }, + { + "epoch": 0.5792, + "grad_norm": 0.377081441672366, + "learning_rate": 7.934452157220694e-05, + "loss": 0.7844, + "step": 181 + }, + { + "epoch": 0.5824, + "grad_norm": 0.40450245428415776, + "learning_rate": 7.832782828537437e-05, + "loss": 0.8174, + "step": 182 + }, + { + "epoch": 0.5856, + "grad_norm": 0.39493090616168064, + "learning_rate": 7.731348022279134e-05, + "loss": 0.7985, + "step": 183 + }, + { + "epoch": 0.5888, + "grad_norm": 0.41675679202293997, + "learning_rate": 7.630158715073813e-05, + "loss": 0.8159, + "step": 184 + }, + { + "epoch": 0.592, + "grad_norm": 0.3931281130361976, + "learning_rate": 7.52922585698315e-05, + "loss": 0.7521, + "step": 185 + }, + { + "epoch": 0.5952, + "grad_norm": 0.6042502120663296, + "learning_rate": 7.428560370317542e-05, + "loss": 0.7507, + "step": 186 + }, + { + "epoch": 0.5984, + "grad_norm": 0.47468697932857046, + "learning_rate": 7.328173148454151e-05, + "loss": 0.9119, + "step": 187 + }, + { + "epoch": 0.6016, + "grad_norm": 0.38550459532969644, + "learning_rate": 7.228075054658096e-05, + "loss": 0.7628, + "step": 188 + }, + { + "epoch": 0.6048, + "grad_norm": 0.36495206218397463, + "learning_rate": 7.1282769209069e-05, + "loss": 0.7743, + "step": 189 + }, + { + "epoch": 0.608, + "grad_norm": 0.4188243954824607, + "learning_rate": 7.028789546718326e-05, + "loss": 0.8073, + "step": 190 + }, + { + "epoch": 0.6112, + "grad_norm": 0.3699670401294648, + "learning_rate": 6.929623697981718e-05, + "loss": 0.727, + "step": 191 + }, + { + "epoch": 0.6144, + "grad_norm": 0.3801784175823091, + "learning_rate": 6.830790105792973e-05, + "loss": 0.7446, + "step": 192 + }, + { + "epoch": 0.6176, + "grad_norm": 0.3973466051960255, + "learning_rate": 6.732299465293322e-05, + "loss": 0.8147, + "step": 193 + }, + { + "epoch": 0.6208, + "grad_norm": 0.3550433416895208, + "learning_rate": 6.63416243451194e-05, + "loss": 0.7358, + "step": 194 + }, + { + "epoch": 0.624, + "grad_norm": 0.3593347476207881, + "learning_rate": 6.536389633212609e-05, + "loss": 0.6282, + "step": 195 + }, + { + "epoch": 0.6272, + "grad_norm": 0.6312465616911078, + "learning_rate": 6.43899164174453e-05, + "loss": 0.8337, + "step": 196 + }, + { + "epoch": 0.6304, + "grad_norm": 0.4012296319663537, + "learning_rate": 6.341978999897365e-05, + "loss": 0.7803, + "step": 197 + }, + { + "epoch": 0.6336, + "grad_norm": 0.3549863053806089, + "learning_rate": 6.245362205760704e-05, + "loss": 0.7183, + "step": 198 + }, + { + "epoch": 0.6368, + "grad_norm": 0.4047234928323414, + "learning_rate": 6.149151714588009e-05, + "loss": 0.8044, + "step": 199 + }, + { + "epoch": 0.64, + "grad_norm": 0.3626160894555547, + "learning_rate": 6.053357937665237e-05, + "loss": 0.756, + "step": 200 + }, + { + "epoch": 0.6432, + "grad_norm": 0.41467079082946373, + "learning_rate": 5.957991241184184e-05, + "loss": 0.8027, + "step": 201 + }, + { + "epoch": 0.6464, + "grad_norm": 0.4075587028092342, + "learning_rate": 5.863061945120719e-05, + "loss": 0.7418, + "step": 202 + }, + { + "epoch": 0.6496, + "grad_norm": 0.3830976241692011, + "learning_rate": 5.768580322118034e-05, + "loss": 0.7419, + "step": 203 + }, + { + "epoch": 0.6528, + "grad_norm": 0.42143030433733797, + "learning_rate": 5.6745565963749925e-05, + "loss": 0.7973, + "step": 204 + }, + { + "epoch": 0.656, + "grad_norm": 0.35952121482881066, + "learning_rate": 5.5810009425397294e-05, + "loss": 0.685, + "step": 205 + }, + { + "epoch": 0.6592, + "grad_norm": 0.3719557223049432, + "learning_rate": 5.487923484608629e-05, + "loss": 0.7693, + "step": 206 + }, + { + "epoch": 0.6624, + "grad_norm": 0.4530552292723754, + "learning_rate": 5.395334294830765e-05, + "loss": 0.8558, + "step": 207 + }, + { + "epoch": 0.6656, + "grad_norm": 0.404333362478301, + "learning_rate": 5.3032433926179395e-05, + "loss": 0.7737, + "step": 208 + }, + { + "epoch": 0.6688, + "grad_norm": 0.3602593506741611, + "learning_rate": 5.211660743460458e-05, + "loss": 0.7277, + "step": 209 + }, + { + "epoch": 0.672, + "grad_norm": 0.43170347457251695, + "learning_rate": 5.1205962578487155e-05, + "loss": 0.8079, + "step": 210 + }, + { + "epoch": 0.6752, + "grad_norm": 0.38157460056469383, + "learning_rate": 5.030059790200756e-05, + "loss": 0.7265, + "step": 211 + }, + { + "epoch": 0.6784, + "grad_norm": 0.3787607400303287, + "learning_rate": 4.940061137795876e-05, + "loss": 0.7648, + "step": 212 + }, + { + "epoch": 0.6816, + "grad_norm": 0.37812142777908264, + "learning_rate": 4.850610039714444e-05, + "loss": 0.7608, + "step": 213 + }, + { + "epoch": 0.6848, + "grad_norm": 0.4105230234147243, + "learning_rate": 4.761716175783989e-05, + "loss": 0.7745, + "step": 214 + }, + { + "epoch": 0.688, + "grad_norm": 0.3758126653197388, + "learning_rate": 4.673389165531714e-05, + "loss": 0.7584, + "step": 215 + }, + { + "epoch": 0.6912, + "grad_norm": 0.36754219735668736, + "learning_rate": 4.585638567143529e-05, + "loss": 0.7662, + "step": 216 + }, + { + "epoch": 0.6944, + "grad_norm": 0.3894194421895725, + "learning_rate": 4.498473876429726e-05, + "loss": 0.7093, + "step": 217 + }, + { + "epoch": 0.6976, + "grad_norm": 0.357425964555511, + "learning_rate": 4.411904525797408e-05, + "loss": 0.683, + "step": 218 + }, + { + "epoch": 0.7008, + "grad_norm": 0.3534371862254715, + "learning_rate": 4.325939883229766e-05, + "loss": 0.7069, + "step": 219 + }, + { + "epoch": 0.704, + "grad_norm": 0.40921866361886605, + "learning_rate": 4.240589251272342e-05, + "loss": 0.7862, + "step": 220 + }, + { + "epoch": 0.7072, + "grad_norm": 0.3634675831446619, + "learning_rate": 4.155861866026364e-05, + "loss": 0.7012, + "step": 221 + }, + { + "epoch": 0.7104, + "grad_norm": 0.38769159694516386, + "learning_rate": 4.071766896149273e-05, + "loss": 0.8184, + "step": 222 + }, + { + "epoch": 0.7136, + "grad_norm": 0.3883328519290395, + "learning_rate": 3.988313441862553e-05, + "loss": 0.6963, + "step": 223 + }, + { + "epoch": 0.7168, + "grad_norm": 0.4039785395114772, + "learning_rate": 3.9055105339669595e-05, + "loss": 0.8165, + "step": 224 + }, + { + "epoch": 0.72, + "grad_norm": 0.36626014714342764, + "learning_rate": 3.823367132865265e-05, + "loss": 0.6868, + "step": 225 + }, + { + "epoch": 0.7232, + "grad_norm": 0.36486252134168545, + "learning_rate": 3.741892127592625e-05, + "loss": 0.7548, + "step": 226 + }, + { + "epoch": 0.7264, + "grad_norm": 0.37920248619014446, + "learning_rate": 3.6610943348546526e-05, + "loss": 0.75, + "step": 227 + }, + { + "epoch": 0.7296, + "grad_norm": 0.3927655551690742, + "learning_rate": 3.580982498073344e-05, + "loss": 0.7159, + "step": 228 + }, + { + "epoch": 0.7328, + "grad_norm": 0.4171823450293806, + "learning_rate": 3.501565286440914e-05, + "loss": 0.7247, + "step": 229 + }, + { + "epoch": 0.736, + "grad_norm": 0.43052610498046595, + "learning_rate": 3.422851293981676e-05, + "loss": 0.763, + "step": 230 + }, + { + "epoch": 0.7392, + "grad_norm": 0.37889611789663313, + "learning_rate": 3.3448490386220355e-05, + "loss": 0.7687, + "step": 231 + }, + { + "epoch": 0.7424, + "grad_norm": 0.382653293227921, + "learning_rate": 3.2675669612687565e-05, + "loss": 0.7021, + "step": 232 + }, + { + "epoch": 0.7456, + "grad_norm": 0.36566488831348215, + "learning_rate": 3.191013424895536e-05, + "loss": 0.7679, + "step": 233 + }, + { + "epoch": 0.7488, + "grad_norm": 0.3694354834019595, + "learning_rate": 3.115196713638e-05, + "loss": 0.7395, + "step": 234 + }, + { + "epoch": 0.752, + "grad_norm": 0.35908269164475143, + "learning_rate": 3.040125031897264e-05, + "loss": 0.7337, + "step": 235 + }, + { + "epoch": 0.7552, + "grad_norm": 0.39016941968285335, + "learning_rate": 2.9658065034520978e-05, + "loss": 0.772, + "step": 236 + }, + { + "epoch": 0.7584, + "grad_norm": 0.39819859786786, + "learning_rate": 2.892249170579826e-05, + "loss": 0.7805, + "step": 237 + }, + { + "epoch": 0.7616, + "grad_norm": 0.39185642378960983, + "learning_rate": 2.8194609931860316e-05, + "loss": 0.6778, + "step": 238 + }, + { + "epoch": 0.7648, + "grad_norm": 0.401622643081188, + "learning_rate": 2.7474498479432087e-05, + "loss": 0.8092, + "step": 239 + }, + { + "epoch": 0.768, + "grad_norm": 0.3920103368441694, + "learning_rate": 2.6762235274383772e-05, + "loss": 0.7962, + "step": 240 + }, + { + "epoch": 0.7712, + "grad_norm": 0.48709130599059774, + "learning_rate": 2.6057897393298324e-05, + "loss": 0.7285, + "step": 241 + }, + { + "epoch": 0.7744, + "grad_norm": 0.40376554364298234, + "learning_rate": 2.536156105513062e-05, + "loss": 0.8016, + "step": 242 + }, + { + "epoch": 0.7776, + "grad_norm": 0.36837916085597394, + "learning_rate": 2.4673301612959654e-05, + "loss": 0.6855, + "step": 243 + }, + { + "epoch": 0.7808, + "grad_norm": 0.4483714742264585, + "learning_rate": 2.399319354583418e-05, + "loss": 0.7492, + "step": 244 + }, + { + "epoch": 0.784, + "grad_norm": 0.3751255820392184, + "learning_rate": 2.3321310450713062e-05, + "loss": 0.724, + "step": 245 + }, + { + "epoch": 0.7872, + "grad_norm": 0.46753824670719196, + "learning_rate": 2.265772503450122e-05, + "loss": 0.7511, + "step": 246 + }, + { + "epoch": 0.7904, + "grad_norm": 0.36924248073327026, + "learning_rate": 2.2002509106181624e-05, + "loss": 0.6697, + "step": 247 + }, + { + "epoch": 0.7936, + "grad_norm": 0.4311324864533044, + "learning_rate": 2.1355733569044635e-05, + "loss": 0.8389, + "step": 248 + }, + { + "epoch": 0.7968, + "grad_norm": 0.3468847141233154, + "learning_rate": 2.0717468413015283e-05, + "loss": 0.6796, + "step": 249 + }, + { + "epoch": 0.8, + "grad_norm": 0.40305637231032954, + "learning_rate": 2.008778270707944e-05, + "loss": 0.7589, + "step": 250 + }, + { + "epoch": 0.8032, + "grad_norm": 0.37006132966778144, + "learning_rate": 1.946674459180955e-05, + "loss": 0.7222, + "step": 251 + }, + { + "epoch": 0.8064, + "grad_norm": 0.3828599194296075, + "learning_rate": 1.8854421271990964e-05, + "loss": 0.7471, + "step": 252 + }, + { + "epoch": 0.8096, + "grad_norm": 0.42119470659162916, + "learning_rate": 1.8250879009349398e-05, + "loss": 0.8019, + "step": 253 + }, + { + "epoch": 0.8128, + "grad_norm": 0.4240891071356015, + "learning_rate": 1.7656183115380577e-05, + "loss": 0.7722, + "step": 254 + }, + { + "epoch": 0.816, + "grad_norm": 0.343317961799583, + "learning_rate": 1.707039794428259e-05, + "loss": 0.681, + "step": 255 + }, + { + "epoch": 0.8192, + "grad_norm": 0.3858696373590418, + "learning_rate": 1.649358688599191e-05, + "loss": 0.7242, + "step": 256 + }, + { + "epoch": 0.8224, + "grad_norm": 0.3852297204123048, + "learning_rate": 1.5925812359323745e-05, + "loss": 0.82, + "step": 257 + }, + { + "epoch": 0.8256, + "grad_norm": 0.3660298327029085, + "learning_rate": 1.5367135805217458e-05, + "loss": 0.7614, + "step": 258 + }, + { + "epoch": 0.8288, + "grad_norm": 0.3501196050916281, + "learning_rate": 1.4817617680087825e-05, + "loss": 0.7489, + "step": 259 + }, + { + "epoch": 0.832, + "grad_norm": 0.4312933762780563, + "learning_rate": 1.4277317449282834e-05, + "loss": 0.7936, + "step": 260 + }, + { + "epoch": 0.8352, + "grad_norm": 0.38094591751083784, + "learning_rate": 1.3746293580648717e-05, + "loss": 0.8143, + "step": 261 + }, + { + "epoch": 0.8384, + "grad_norm": 0.4304063885736067, + "learning_rate": 1.3224603538202929e-05, + "loss": 0.8567, + "step": 262 + }, + { + "epoch": 0.8416, + "grad_norm": 0.3530318575675933, + "learning_rate": 1.2712303775915802e-05, + "loss": 0.7033, + "step": 263 + }, + { + "epoch": 0.8448, + "grad_norm": 0.34922822590438607, + "learning_rate": 1.220944973160133e-05, + "loss": 0.6877, + "step": 264 + }, + { + "epoch": 0.848, + "grad_norm": 0.3526678283487226, + "learning_rate": 1.1716095820918216e-05, + "loss": 0.7317, + "step": 265 + }, + { + "epoch": 0.8512, + "grad_norm": 0.7094102415989282, + "learning_rate": 1.1232295431481222e-05, + "loss": 0.8096, + "step": 266 + }, + { + "epoch": 0.8544, + "grad_norm": 0.41180672382392763, + "learning_rate": 1.0758100917083991e-05, + "loss": 0.7674, + "step": 267 + }, + { + "epoch": 0.8576, + "grad_norm": 0.3797380672648192, + "learning_rate": 1.0293563592033595e-05, + "loss": 0.7135, + "step": 268 + }, + { + "epoch": 0.8608, + "grad_norm": 0.3666105727289888, + "learning_rate": 9.838733725597615e-06, + "loss": 0.7116, + "step": 269 + }, + { + "epoch": 0.864, + "grad_norm": 0.3939067802745871, + "learning_rate": 9.393660536564408e-06, + "loss": 0.8032, + "step": 270 + }, + { + "epoch": 0.8672, + "grad_norm": 0.3715620422004267, + "learning_rate": 8.958392187916841e-06, + "loss": 0.7724, + "step": 271 + }, + { + "epoch": 0.8704, + "grad_norm": 0.3931537314827726, + "learning_rate": 8.532975781620512e-06, + "loss": 0.7537, + "step": 272 + }, + { + "epoch": 0.8736, + "grad_norm": 0.35281398953949933, + "learning_rate": 8.117457353526625e-06, + "loss": 0.7231, + "step": 273 + }, + { + "epoch": 0.8768, + "grad_norm": 0.38182168546498296, + "learning_rate": 7.711881868390291e-06, + "loss": 0.7483, + "step": 274 + }, + { + "epoch": 0.88, + "grad_norm": 0.3657635352324864, + "learning_rate": 7.3162932150046885e-06, + "loss": 0.6833, + "step": 275 + }, + { + "epoch": 0.8832, + "grad_norm": 0.3824285828193031, + "learning_rate": 6.930734201451816e-06, + "loss": 0.7575, + "step": 276 + }, + { + "epoch": 0.8864, + "grad_norm": 0.37882674471517297, + "learning_rate": 6.555246550469907e-06, + "loss": 0.736, + "step": 277 + }, + { + "epoch": 0.8896, + "grad_norm": 0.37706586346203985, + "learning_rate": 6.189870894938587e-06, + "loss": 0.7194, + "step": 278 + }, + { + "epoch": 0.8928, + "grad_norm": 0.37702088330630684, + "learning_rate": 5.834646773481811e-06, + "loss": 0.7112, + "step": 279 + }, + { + "epoch": 0.896, + "grad_norm": 0.345197042865683, + "learning_rate": 5.489612626189245e-06, + "loss": 0.7507, + "step": 280 + }, + { + "epoch": 0.8992, + "grad_norm": 0.3952886290968386, + "learning_rate": 5.154805790456485e-06, + "loss": 0.806, + "step": 281 + }, + { + "epoch": 0.9024, + "grad_norm": 0.4156637546454637, + "learning_rate": 4.830262496944693e-06, + "loss": 0.7324, + "step": 282 + }, + { + "epoch": 0.9056, + "grad_norm": 0.4181202383313785, + "learning_rate": 4.516017865659949e-06, + "loss": 0.7611, + "step": 283 + }, + { + "epoch": 0.9088, + "grad_norm": 0.39953806832811245, + "learning_rate": 4.21210590215273e-06, + "loss": 0.7984, + "step": 284 + }, + { + "epoch": 0.912, + "grad_norm": 0.39038420791176903, + "learning_rate": 3.918559493838114e-06, + "loss": 0.7821, + "step": 285 + }, + { + "epoch": 0.9152, + "grad_norm": 0.3171140666423134, + "learning_rate": 3.6354104064368566e-06, + "loss": 0.598, + "step": 286 + }, + { + "epoch": 0.9184, + "grad_norm": 0.3671672595983185, + "learning_rate": 3.3626892805379562e-06, + "loss": 0.7397, + "step": 287 + }, + { + "epoch": 0.9216, + "grad_norm": 0.3494979584858635, + "learning_rate": 3.100425628282899e-06, + "loss": 0.6832, + "step": 288 + }, + { + "epoch": 0.9248, + "grad_norm": 0.3739816532239034, + "learning_rate": 2.848647830172024e-06, + "loss": 0.7122, + "step": 289 + }, + { + "epoch": 0.928, + "grad_norm": 0.3434731826805483, + "learning_rate": 2.607383131993424e-06, + "loss": 0.71, + "step": 290 + }, + { + "epoch": 0.9312, + "grad_norm": 0.4175439167844619, + "learning_rate": 2.3766576418745022e-06, + "loss": 0.7685, + "step": 291 + }, + { + "epoch": 0.9344, + "grad_norm": 0.38016444223431994, + "learning_rate": 2.1564963274568027e-06, + "loss": 0.7305, + "step": 292 + }, + { + "epoch": 0.9376, + "grad_norm": 0.3816533198845126, + "learning_rate": 1.9469230131940907e-06, + "loss": 0.7436, + "step": 293 + }, + { + "epoch": 0.9408, + "grad_norm": 0.36357433955019786, + "learning_rate": 1.7479603777742938e-06, + "loss": 0.7354, + "step": 294 + }, + { + "epoch": 0.944, + "grad_norm": 0.38398526161124624, + "learning_rate": 1.559629951665298e-06, + "loss": 0.8063, + "step": 295 + }, + { + "epoch": 0.9472, + "grad_norm": 0.39414628772972166, + "learning_rate": 1.3819521147851123e-06, + "loss": 0.7807, + "step": 296 + }, + { + "epoch": 0.9504, + "grad_norm": 0.35210866914929495, + "learning_rate": 1.2149460942964098e-06, + "loss": 0.7157, + "step": 297 + }, + { + "epoch": 0.9536, + "grad_norm": 0.45679210106521884, + "learning_rate": 1.05862996252597e-06, + "loss": 0.7635, + "step": 298 + }, + { + "epoch": 0.9568, + "grad_norm": 0.35302500589097974, + "learning_rate": 9.130206350089765e-07, + "loss": 0.7143, + "step": 299 + }, + { + "epoch": 0.96, + "grad_norm": 0.5043013214314466, + "learning_rate": 7.781338686584927e-07, + "loss": 0.6692, + "step": 300 + }, + { + "epoch": 0.9632, + "grad_norm": 0.3901330300448095, + "learning_rate": 6.539842600603918e-07, + "loss": 0.7647, + "step": 301 + }, + { + "epoch": 0.9664, + "grad_norm": 0.41218713945684576, + "learning_rate": 5.405852438937764e-07, + "loss": 0.8084, + "step": 302 + }, + { + "epoch": 0.9696, + "grad_norm": 0.38756210150749887, + "learning_rate": 4.3794909147720773e-07, + "loss": 0.7732, + "step": 303 + }, + { + "epoch": 0.9728, + "grad_norm": 0.3535259267039688, + "learning_rate": 3.4608690944071263e-07, + "loss": 0.6677, + "step": 304 + }, + { + "epoch": 0.976, + "grad_norm": 0.41043791682590347, + "learning_rate": 2.6500863852395584e-07, + "loss": 0.8145, + "step": 305 + }, + { + "epoch": 0.9792, + "grad_norm": 0.3955724908252904, + "learning_rate": 1.947230525005006e-07, + "loss": 0.7789, + "step": 306 + }, + { + "epoch": 0.9824, + "grad_norm": 0.3897792547207055, + "learning_rate": 1.3523775722834587e-07, + "loss": 0.7572, + "step": 307 + }, + { + "epoch": 0.9856, + "grad_norm": 0.3849603185219187, + "learning_rate": 8.655918982689581e-08, + "loss": 0.7607, + "step": 308 + }, + { + "epoch": 0.9888, + "grad_norm": 0.4033374013090449, + "learning_rate": 4.8692617980350406e-08, + "loss": 0.7728, + "step": 309 + }, + { + "epoch": 0.992, + "grad_norm": 0.4085978341355722, + "learning_rate": 2.164213936770576e-08, + "loss": 0.7689, + "step": 310 + }, + { + "epoch": 0.9952, + "grad_norm": 0.4317191681124255, + "learning_rate": 5.410681219286673e-09, + "loss": 0.8225, + "step": 311 + }, + { + "epoch": 0.9984, + "grad_norm": 0.36886491969634516, + "learning_rate": 0.0, + "loss": 0.6977, + "step": 312 + }, + { + "epoch": 0.9984, + "step": 312, + "total_flos": 273275681636352.0, + "train_loss": 0.7957936550180117, + "train_runtime": 4889.7546, + "train_samples_per_second": 1.023, + "train_steps_per_second": 0.064 + } + ], + "logging_steps": 1.0, + "max_steps": 312, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 273275681636352.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_2_epochs_1_GA_2_lora/README.md b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_2_epochs_1_GA_2_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_2_epochs_1_GA_2_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_2_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_2_epochs_1_GA_2_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..13d9060160fbad1bafb8b6da3eeb56ca6bbea573 --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_2_epochs_1_GA_2_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "k_proj", + "v_proj", + "o_proj", + "q_proj", + "down_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..08c0d23e21dea8488e5be9c6158f457e9b83b4e0 --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_2_epochs_1_GA_2_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8c25b431e0d66526adddf63874623da42b2b8345a74cd589a4adbbdf0ed2c8d +size 671150064 diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_2_epochs_1_GA_2_lora/config.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_2_epochs_1_GA_2_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_2_epochs_1_GA_2_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..f06302ebff3dfafdaacc4077ed7c2312ef7b3825 --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_2_epochs_1_GA_2_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11a5ee48a4cdd5e71d24c8b129ac499b66de672d387a7626163c2ede608c6bd1 +size 918507402 diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_2_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_2_epochs_1_GA_2_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c8f1cafbfac42eeea479536c7ac86ccf432f8465 --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_2_epochs_1_GA_2_lora/trainer_state.json @@ -0,0 +1,2226 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9984, + "eval_steps": 500, + "global_step": 312, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0032, + "grad_norm": 0.7353335647051229, + "learning_rate": 2e-05, + "loss": 1.2076, + "step": 1 + }, + { + "epoch": 0.0064, + "grad_norm": 0.7279787019415223, + "learning_rate": 4e-05, + "loss": 1.2524, + "step": 2 + }, + { + "epoch": 0.0096, + "grad_norm": 0.7010361557438148, + "learning_rate": 6e-05, + "loss": 1.2511, + "step": 3 + }, + { + "epoch": 0.0128, + "grad_norm": 0.6687401924085121, + "learning_rate": 8e-05, + "loss": 1.2113, + "step": 4 + }, + { + "epoch": 0.016, + "grad_norm": 0.7346425995604949, + "learning_rate": 0.0001, + "loss": 1.1487, + "step": 5 + }, + { + "epoch": 0.0192, + "grad_norm": 0.756184601542448, + "learning_rate": 0.00012, + "loss": 1.1107, + "step": 6 + }, + { + "epoch": 0.0224, + "grad_norm": 0.8095698213691533, + "learning_rate": 0.00014, + "loss": 0.9564, + "step": 7 + }, + { + "epoch": 0.0256, + "grad_norm": 0.7222059057957128, + "learning_rate": 0.00016, + "loss": 0.9622, + "step": 8 + }, + { + "epoch": 0.0288, + "grad_norm": 0.5791907109692352, + "learning_rate": 0.00018, + "loss": 0.9456, + "step": 9 + }, + { + "epoch": 0.032, + "grad_norm": 0.4755161585748817, + "learning_rate": 0.0002, + "loss": 0.8847, + "step": 10 + }, + { + "epoch": 0.0352, + "grad_norm": 0.534405283597521, + "learning_rate": 0.00019999458931878073, + "loss": 1.023, + "step": 11 + }, + { + "epoch": 0.0384, + "grad_norm": 0.48687859353694035, + "learning_rate": 0.0001999783578606323, + "loss": 0.9897, + "step": 12 + }, + { + "epoch": 0.0416, + "grad_norm": 0.5569660910880551, + "learning_rate": 0.00019995130738201966, + "loss": 0.9866, + "step": 13 + }, + { + "epoch": 0.0448, + "grad_norm": 0.5616330007320514, + "learning_rate": 0.0001999134408101731, + "loss": 0.998, + "step": 14 + }, + { + "epoch": 0.048, + "grad_norm": 0.5611217307031949, + "learning_rate": 0.00019986476224277165, + "loss": 0.8927, + "step": 15 + }, + { + "epoch": 0.0512, + "grad_norm": 0.5370487900027019, + "learning_rate": 0.00019980527694749952, + "loss": 0.9287, + "step": 16 + }, + { + "epoch": 0.0544, + "grad_norm": 0.46385921148664616, + "learning_rate": 0.00019973499136147606, + "loss": 0.8463, + "step": 17 + }, + { + "epoch": 0.0576, + "grad_norm": 0.5051655153816611, + "learning_rate": 0.0001996539130905593, + "loss": 0.9266, + "step": 18 + }, + { + "epoch": 0.0608, + "grad_norm": 0.4733629776054302, + "learning_rate": 0.0001995620509085228, + "loss": 0.87, + "step": 19 + }, + { + "epoch": 0.064, + "grad_norm": 0.45292377837228687, + "learning_rate": 0.00019945941475610623, + "loss": 0.887, + "step": 20 + }, + { + "epoch": 0.0672, + "grad_norm": 0.46552242008280886, + "learning_rate": 0.0001993460157399396, + "loss": 0.8795, + "step": 21 + }, + { + "epoch": 0.0704, + "grad_norm": 0.46418388572124936, + "learning_rate": 0.0001992218661313415, + "loss": 0.887, + "step": 22 + }, + { + "epoch": 0.0736, + "grad_norm": 0.42205968841775676, + "learning_rate": 0.00019908697936499103, + "loss": 0.7814, + "step": 23 + }, + { + "epoch": 0.0768, + "grad_norm": 0.47135505086732565, + "learning_rate": 0.00019894137003747403, + "loss": 0.8869, + "step": 24 + }, + { + "epoch": 0.08, + "grad_norm": 0.4773649704916738, + "learning_rate": 0.00019878505390570362, + "loss": 0.8908, + "step": 25 + }, + { + "epoch": 0.0832, + "grad_norm": 0.45081311668010043, + "learning_rate": 0.00019861804788521493, + "loss": 0.9113, + "step": 26 + }, + { + "epoch": 0.0864, + "grad_norm": 0.4480881064139389, + "learning_rate": 0.00019844037004833473, + "loss": 0.932, + "step": 27 + }, + { + "epoch": 0.0896, + "grad_norm": 0.4367083903922008, + "learning_rate": 0.00019825203962222572, + "loss": 0.8035, + "step": 28 + }, + { + "epoch": 0.0928, + "grad_norm": 0.44252534295962925, + "learning_rate": 0.0001980530769868059, + "loss": 0.876, + "step": 29 + }, + { + "epoch": 0.096, + "grad_norm": 0.4187006380577945, + "learning_rate": 0.00019784350367254322, + "loss": 0.884, + "step": 30 + }, + { + "epoch": 0.0992, + "grad_norm": 0.3852515402302905, + "learning_rate": 0.0001976233423581255, + "loss": 0.8389, + "step": 31 + }, + { + "epoch": 0.1024, + "grad_norm": 0.39441356614834566, + "learning_rate": 0.0001973926168680066, + "loss": 0.8309, + "step": 32 + }, + { + "epoch": 0.1056, + "grad_norm": 0.45323199266429537, + "learning_rate": 0.00019715135216982798, + "loss": 0.9031, + "step": 33 + }, + { + "epoch": 0.1088, + "grad_norm": 0.4203601208851421, + "learning_rate": 0.0001968995743717171, + "loss": 0.8488, + "step": 34 + }, + { + "epoch": 0.112, + "grad_norm": 0.39400991555307874, + "learning_rate": 0.00019663731071946206, + "loss": 0.8236, + "step": 35 + }, + { + "epoch": 0.1152, + "grad_norm": 0.4078190992380361, + "learning_rate": 0.00019636458959356316, + "loss": 0.8059, + "step": 36 + }, + { + "epoch": 0.1184, + "grad_norm": 0.5571567140998109, + "learning_rate": 0.0001960814405061619, + "loss": 0.8765, + "step": 37 + }, + { + "epoch": 0.1216, + "grad_norm": 0.4339029798097146, + "learning_rate": 0.00019578789409784727, + "loss": 0.8513, + "step": 38 + }, + { + "epoch": 0.1248, + "grad_norm": 0.49432651381587406, + "learning_rate": 0.00019548398213434007, + "loss": 0.9171, + "step": 39 + }, + { + "epoch": 0.128, + "grad_norm": 0.44088058900747906, + "learning_rate": 0.00019516973750305532, + "loss": 0.9415, + "step": 40 + }, + { + "epoch": 0.1312, + "grad_norm": 0.4769524240996268, + "learning_rate": 0.00019484519420954354, + "loss": 0.9368, + "step": 41 + }, + { + "epoch": 0.1344, + "grad_norm": 0.49387217949165196, + "learning_rate": 0.00019451038737381077, + "loss": 0.9142, + "step": 42 + }, + { + "epoch": 0.1376, + "grad_norm": 0.40446228014333374, + "learning_rate": 0.00019416535322651818, + "loss": 0.8133, + "step": 43 + }, + { + "epoch": 0.1408, + "grad_norm": 0.44322829688845194, + "learning_rate": 0.00019381012910506146, + "loss": 0.9206, + "step": 44 + }, + { + "epoch": 0.144, + "grad_norm": 0.38951565881037564, + "learning_rate": 0.00019344475344953012, + "loss": 0.7707, + "step": 45 + }, + { + "epoch": 0.1472, + "grad_norm": 0.43181804382854094, + "learning_rate": 0.00019306926579854821, + "loss": 0.8291, + "step": 46 + }, + { + "epoch": 0.1504, + "grad_norm": 0.4073163231966112, + "learning_rate": 0.00019268370678499533, + "loss": 0.8037, + "step": 47 + }, + { + "epoch": 0.1536, + "grad_norm": 0.5006944874836005, + "learning_rate": 0.0001922881181316097, + "loss": 0.8234, + "step": 48 + }, + { + "epoch": 0.1568, + "grad_norm": 0.41216712583716064, + "learning_rate": 0.00019188254264647337, + "loss": 0.823, + "step": 49 + }, + { + "epoch": 0.16, + "grad_norm": 0.40904550226721814, + "learning_rate": 0.0001914670242183795, + "loss": 0.7697, + "step": 50 + }, + { + "epoch": 0.1632, + "grad_norm": 0.45468333701316305, + "learning_rate": 0.0001910416078120832, + "loss": 0.8864, + "step": 51 + }, + { + "epoch": 0.1664, + "grad_norm": 0.4302613251664346, + "learning_rate": 0.0001906063394634356, + "loss": 0.751, + "step": 52 + }, + { + "epoch": 0.1696, + "grad_norm": 0.33829052241502605, + "learning_rate": 0.00019016126627440237, + "loss": 0.6905, + "step": 53 + }, + { + "epoch": 0.1728, + "grad_norm": 0.4320963001746247, + "learning_rate": 0.00018970643640796642, + "loss": 0.7661, + "step": 54 + }, + { + "epoch": 0.176, + "grad_norm": 0.3898845794735546, + "learning_rate": 0.000189241899082916, + "loss": 0.7615, + "step": 55 + }, + { + "epoch": 0.1792, + "grad_norm": 0.43418724158248473, + "learning_rate": 0.00018876770456851877, + "loss": 0.8216, + "step": 56 + }, + { + "epoch": 0.1824, + "grad_norm": 0.4026011963114364, + "learning_rate": 0.0001882839041790818, + "loss": 0.8503, + "step": 57 + }, + { + "epoch": 0.1856, + "grad_norm": 0.4739645096526827, + "learning_rate": 0.00018779055026839868, + "loss": 0.7476, + "step": 58 + }, + { + "epoch": 0.1888, + "grad_norm": 0.41850488712881967, + "learning_rate": 0.00018728769622408423, + "loss": 0.8024, + "step": 59 + }, + { + "epoch": 0.192, + "grad_norm": 0.4709244414897302, + "learning_rate": 0.00018677539646179707, + "loss": 0.8109, + "step": 60 + }, + { + "epoch": 0.1952, + "grad_norm": 0.3693587269306096, + "learning_rate": 0.00018625370641935129, + "loss": 0.769, + "step": 61 + }, + { + "epoch": 0.1984, + "grad_norm": 0.47301661699875125, + "learning_rate": 0.00018572268255071718, + "loss": 0.889, + "step": 62 + }, + { + "epoch": 0.2016, + "grad_norm": 0.4386889037604545, + "learning_rate": 0.00018518238231991218, + "loss": 0.8279, + "step": 63 + }, + { + "epoch": 0.2048, + "grad_norm": 0.43011697191812426, + "learning_rate": 0.00018463286419478255, + "loss": 0.7882, + "step": 64 + }, + { + "epoch": 0.208, + "grad_norm": 0.48661858269251446, + "learning_rate": 0.00018407418764067627, + "loss": 0.8426, + "step": 65 + }, + { + "epoch": 0.2112, + "grad_norm": 0.40247048922777995, + "learning_rate": 0.00018350641311400812, + "loss": 0.8584, + "step": 66 + }, + { + "epoch": 0.2144, + "grad_norm": 0.4508726830470603, + "learning_rate": 0.0001829296020557174, + "loss": 0.8058, + "step": 67 + }, + { + "epoch": 0.2176, + "grad_norm": 0.4358043323296017, + "learning_rate": 0.00018234381688461942, + "loss": 0.782, + "step": 68 + }, + { + "epoch": 0.2208, + "grad_norm": 0.44946172263886064, + "learning_rate": 0.0001817491209906506, + "loss": 0.8252, + "step": 69 + }, + { + "epoch": 0.224, + "grad_norm": 0.4101863243055401, + "learning_rate": 0.00018114557872800905, + "loss": 0.765, + "step": 70 + }, + { + "epoch": 0.2272, + "grad_norm": 0.3847259325031979, + "learning_rate": 0.00018053325540819045, + "loss": 0.8052, + "step": 71 + }, + { + "epoch": 0.2304, + "grad_norm": 0.4468868011811026, + "learning_rate": 0.0001799122172929206, + "loss": 0.8765, + "step": 72 + }, + { + "epoch": 0.2336, + "grad_norm": 0.3951583426923667, + "learning_rate": 0.00017928253158698473, + "loss": 0.788, + "step": 73 + }, + { + "epoch": 0.2368, + "grad_norm": 0.3985087499811743, + "learning_rate": 0.0001786442664309554, + "loss": 0.7706, + "step": 74 + }, + { + "epoch": 0.24, + "grad_norm": 0.423798419258559, + "learning_rate": 0.0001779974908938184, + "loss": 0.8708, + "step": 75 + }, + { + "epoch": 0.2432, + "grad_norm": 0.374327382153691, + "learning_rate": 0.0001773422749654988, + "loss": 0.7646, + "step": 76 + }, + { + "epoch": 0.2464, + "grad_norm": 0.3989217108444149, + "learning_rate": 0.00017667868954928694, + "loss": 0.7928, + "step": 77 + }, + { + "epoch": 0.2496, + "grad_norm": 0.4203983455730358, + "learning_rate": 0.00017600680645416583, + "loss": 0.7771, + "step": 78 + }, + { + "epoch": 0.2528, + "grad_norm": 0.41484946201589906, + "learning_rate": 0.00017532669838704035, + "loss": 0.8396, + "step": 79 + }, + { + "epoch": 0.256, + "grad_norm": 0.38867408389341773, + "learning_rate": 0.00017463843894486937, + "loss": 0.7985, + "step": 80 + }, + { + "epoch": 0.2592, + "grad_norm": 0.4190613279860536, + "learning_rate": 0.0001739421026067017, + "loss": 0.7909, + "step": 81 + }, + { + "epoch": 0.2624, + "grad_norm": 0.39446934721916893, + "learning_rate": 0.00017323776472561627, + "loss": 0.8379, + "step": 82 + }, + { + "epoch": 0.2656, + "grad_norm": 0.4530288411101365, + "learning_rate": 0.00017252550152056795, + "loss": 0.8267, + "step": 83 + }, + { + "epoch": 0.2688, + "grad_norm": 0.38791637036738574, + "learning_rate": 0.0001718053900681397, + "loss": 0.7791, + "step": 84 + }, + { + "epoch": 0.272, + "grad_norm": 0.4163205325536259, + "learning_rate": 0.00017107750829420176, + "loss": 0.8395, + "step": 85 + }, + { + "epoch": 0.2752, + "grad_norm": 0.4336427847797014, + "learning_rate": 0.00017034193496547902, + "loss": 0.86, + "step": 86 + }, + { + "epoch": 0.2784, + "grad_norm": 0.43859603028051736, + "learning_rate": 0.00016959874968102735, + "loss": 0.837, + "step": 87 + }, + { + "epoch": 0.2816, + "grad_norm": 0.4796050483752347, + "learning_rate": 0.00016884803286362, + "loss": 0.8337, + "step": 88 + }, + { + "epoch": 0.2848, + "grad_norm": 0.4148636880540723, + "learning_rate": 0.00016808986575104465, + "loss": 0.8407, + "step": 89 + }, + { + "epoch": 0.288, + "grad_norm": 0.4278239391229428, + "learning_rate": 0.00016732433038731242, + "loss": 0.8607, + "step": 90 + }, + { + "epoch": 0.2912, + "grad_norm": 0.4171106509125376, + "learning_rate": 0.0001665515096137797, + "loss": 0.7848, + "step": 91 + }, + { + "epoch": 0.2944, + "grad_norm": 0.43952389800771496, + "learning_rate": 0.00016577148706018328, + "loss": 0.8784, + "step": 92 + }, + { + "epoch": 0.2976, + "grad_norm": 0.4110890120327196, + "learning_rate": 0.00016498434713559088, + "loss": 0.7378, + "step": 93 + }, + { + "epoch": 0.3008, + "grad_norm": 0.4016951197596061, + "learning_rate": 0.00016419017501926656, + "loss": 0.7629, + "step": 94 + }, + { + "epoch": 0.304, + "grad_norm": 0.4107735649609541, + "learning_rate": 0.0001633890566514535, + "loss": 0.7876, + "step": 95 + }, + { + "epoch": 0.3072, + "grad_norm": 0.3975769330362091, + "learning_rate": 0.00016258107872407375, + "loss": 0.7735, + "step": 96 + }, + { + "epoch": 0.3104, + "grad_norm": 0.41267371328534225, + "learning_rate": 0.0001617663286713474, + "loss": 0.7443, + "step": 97 + }, + { + "epoch": 0.3136, + "grad_norm": 0.3953945159539454, + "learning_rate": 0.00016094489466033043, + "loss": 0.8453, + "step": 98 + }, + { + "epoch": 0.3168, + "grad_norm": 0.43071752234101507, + "learning_rate": 0.00016011686558137448, + "loss": 0.8352, + "step": 99 + }, + { + "epoch": 0.32, + "grad_norm": 0.4493749509773131, + "learning_rate": 0.0001592823310385073, + "loss": 0.8775, + "step": 100 + }, + { + "epoch": 0.3232, + "grad_norm": 0.40149997325603126, + "learning_rate": 0.0001584413813397364, + "loss": 0.7865, + "step": 101 + }, + { + "epoch": 0.3264, + "grad_norm": 0.5476453056261936, + "learning_rate": 0.00015759410748727662, + "loss": 0.8126, + "step": 102 + }, + { + "epoch": 0.3296, + "grad_norm": 0.39586801827264484, + "learning_rate": 0.00015674060116770236, + "loss": 0.7457, + "step": 103 + }, + { + "epoch": 0.3328, + "grad_norm": 0.5646707566961765, + "learning_rate": 0.00015588095474202595, + "loss": 0.7555, + "step": 104 + }, + { + "epoch": 0.336, + "grad_norm": 0.4475995669275955, + "learning_rate": 0.00015501526123570277, + "loss": 0.797, + "step": 105 + }, + { + "epoch": 0.3392, + "grad_norm": 0.4313869191817502, + "learning_rate": 0.00015414361432856475, + "loss": 0.7591, + "step": 106 + }, + { + "epoch": 0.3424, + "grad_norm": 0.3882927614545473, + "learning_rate": 0.0001532661083446829, + "loss": 0.7339, + "step": 107 + }, + { + "epoch": 0.3456, + "grad_norm": 0.3844877948455331, + "learning_rate": 0.00015238283824216015, + "loss": 0.7714, + "step": 108 + }, + { + "epoch": 0.3488, + "grad_norm": 0.4447936789010439, + "learning_rate": 0.00015149389960285558, + "loss": 0.8044, + "step": 109 + }, + { + "epoch": 0.352, + "grad_norm": 0.3659818159150816, + "learning_rate": 0.00015059938862204127, + "loss": 0.8186, + "step": 110 + }, + { + "epoch": 0.3552, + "grad_norm": 0.48371725315043806, + "learning_rate": 0.00014969940209799248, + "loss": 0.8727, + "step": 111 + }, + { + "epoch": 0.3584, + "grad_norm": 0.4236883958628823, + "learning_rate": 0.00014879403742151283, + "loss": 0.8057, + "step": 112 + }, + { + "epoch": 0.3616, + "grad_norm": 0.4089980647537749, + "learning_rate": 0.00014788339256539544, + "loss": 0.7718, + "step": 113 + }, + { + "epoch": 0.3648, + "grad_norm": 0.505941391374786, + "learning_rate": 0.0001469675660738206, + "loss": 0.8456, + "step": 114 + }, + { + "epoch": 0.368, + "grad_norm": 0.4823477816929391, + "learning_rate": 0.00014604665705169237, + "loss": 0.8652, + "step": 115 + }, + { + "epoch": 0.3712, + "grad_norm": 0.46891506565573476, + "learning_rate": 0.00014512076515391375, + "loss": 0.785, + "step": 116 + }, + { + "epoch": 0.3744, + "grad_norm": 0.6453908453138354, + "learning_rate": 0.00014418999057460276, + "loss": 0.8004, + "step": 117 + }, + { + "epoch": 0.3776, + "grad_norm": 0.39156015317884774, + "learning_rate": 0.0001432544340362501, + "loss": 0.7909, + "step": 118 + }, + { + "epoch": 0.3808, + "grad_norm": 0.4022394970785278, + "learning_rate": 0.00014231419677881966, + "loss": 0.7843, + "step": 119 + }, + { + "epoch": 0.384, + "grad_norm": 0.4511108310970962, + "learning_rate": 0.00014136938054879283, + "loss": 0.8327, + "step": 120 + }, + { + "epoch": 0.3872, + "grad_norm": 0.453833054764165, + "learning_rate": 0.00014042008758815818, + "loss": 0.8107, + "step": 121 + }, + { + "epoch": 0.3904, + "grad_norm": 0.3892486907170623, + "learning_rate": 0.00013946642062334766, + "loss": 0.7089, + "step": 122 + }, + { + "epoch": 0.3936, + "grad_norm": 0.36810555047180393, + "learning_rate": 0.00013850848285411994, + "loss": 0.7515, + "step": 123 + }, + { + "epoch": 0.3968, + "grad_norm": 0.3945608101150715, + "learning_rate": 0.000137546377942393, + "loss": 0.802, + "step": 124 + }, + { + "epoch": 0.4, + "grad_norm": 0.4154686776912597, + "learning_rate": 0.00013658021000102636, + "loss": 0.7985, + "step": 125 + }, + { + "epoch": 0.4032, + "grad_norm": 0.42640254320918586, + "learning_rate": 0.00013561008358255468, + "loss": 0.7384, + "step": 126 + }, + { + "epoch": 0.4064, + "grad_norm": 0.42327914519915, + "learning_rate": 0.00013463610366787392, + "loss": 0.7377, + "step": 127 + }, + { + "epoch": 0.4096, + "grad_norm": 0.3929897707814567, + "learning_rate": 0.00013365837565488064, + "loss": 0.7007, + "step": 128 + }, + { + "epoch": 0.4128, + "grad_norm": 0.44007030909080774, + "learning_rate": 0.0001326770053470668, + "loss": 0.8315, + "step": 129 + }, + { + "epoch": 0.416, + "grad_norm": 0.42831386146949785, + "learning_rate": 0.0001316920989420703, + "loss": 0.7876, + "step": 130 + }, + { + "epoch": 0.4192, + "grad_norm": 0.45712385909551956, + "learning_rate": 0.00013070376302018287, + "loss": 0.7998, + "step": 131 + }, + { + "epoch": 0.4224, + "grad_norm": 0.40301730558878535, + "learning_rate": 0.00012971210453281674, + "loss": 0.8143, + "step": 132 + }, + { + "epoch": 0.4256, + "grad_norm": 0.44697077015053915, + "learning_rate": 0.000128717230790931, + "loss": 0.7987, + "step": 133 + }, + { + "epoch": 0.4288, + "grad_norm": 0.366128484726172, + "learning_rate": 0.00012771924945341906, + "loss": 0.7081, + "step": 134 + }, + { + "epoch": 0.432, + "grad_norm": 0.5013227951492534, + "learning_rate": 0.00012671826851545851, + "loss": 0.7745, + "step": 135 + }, + { + "epoch": 0.4352, + "grad_norm": 0.3583956153339671, + "learning_rate": 0.0001257143962968246, + "loss": 0.7543, + "step": 136 + }, + { + "epoch": 0.4384, + "grad_norm": 0.405702524328237, + "learning_rate": 0.00012470774143016853, + "loss": 0.7804, + "step": 137 + }, + { + "epoch": 0.4416, + "grad_norm": 0.41039096959060223, + "learning_rate": 0.00012369841284926188, + "loss": 0.8108, + "step": 138 + }, + { + "epoch": 0.4448, + "grad_norm": 0.4238385217492043, + "learning_rate": 0.00012268651977720866, + "loss": 0.7638, + "step": 139 + }, + { + "epoch": 0.448, + "grad_norm": 0.4337671096969143, + "learning_rate": 0.00012167217171462566, + "loss": 0.7978, + "step": 140 + }, + { + "epoch": 0.4512, + "grad_norm": 0.42543365056666493, + "learning_rate": 0.0001206554784277931, + "loss": 0.7997, + "step": 141 + }, + { + "epoch": 0.4544, + "grad_norm": 0.42365486991291884, + "learning_rate": 0.00011963654993677645, + "loss": 0.81, + "step": 142 + }, + { + "epoch": 0.4576, + "grad_norm": 0.4250077934117738, + "learning_rate": 0.00011861549650352069, + "loss": 0.749, + "step": 143 + }, + { + "epoch": 0.4608, + "grad_norm": 0.39997182540477255, + "learning_rate": 0.00011759242861991855, + "loss": 0.7409, + "step": 144 + }, + { + "epoch": 0.464, + "grad_norm": 0.4246823162495634, + "learning_rate": 0.00011656745699585371, + "loss": 0.7807, + "step": 145 + }, + { + "epoch": 0.4672, + "grad_norm": 0.42405231705778645, + "learning_rate": 0.00011554069254722051, + "loss": 0.8486, + "step": 146 + }, + { + "epoch": 0.4704, + "grad_norm": 0.3746023786640388, + "learning_rate": 0.00011451224638392129, + "loss": 0.728, + "step": 147 + }, + { + "epoch": 0.4736, + "grad_norm": 0.4029687500995392, + "learning_rate": 0.00011348222979784289, + "loss": 0.7975, + "step": 148 + }, + { + "epoch": 0.4768, + "grad_norm": 0.39955296906769533, + "learning_rate": 0.00011245075425081328, + "loss": 0.7628, + "step": 149 + }, + { + "epoch": 0.48, + "grad_norm": 0.5451790165621929, + "learning_rate": 0.00011141793136253986, + "loss": 0.7423, + "step": 150 + }, + { + "epoch": 0.4832, + "grad_norm": 0.44503423354162813, + "learning_rate": 0.0001103838728985307, + "loss": 0.8514, + "step": 151 + }, + { + "epoch": 0.4864, + "grad_norm": 0.4143063174690902, + "learning_rate": 0.000109348690758, + "loss": 0.8138, + "step": 152 + }, + { + "epoch": 0.4896, + "grad_norm": 0.39587098997532205, + "learning_rate": 0.00010831249696175918, + "loss": 0.7566, + "step": 153 + }, + { + "epoch": 0.4928, + "grad_norm": 0.40056336455504077, + "learning_rate": 0.0001072754036400944, + "loss": 0.7906, + "step": 154 + }, + { + "epoch": 0.496, + "grad_norm": 0.40444250822644906, + "learning_rate": 0.00010623752302063283, + "loss": 0.7907, + "step": 155 + }, + { + "epoch": 0.4992, + "grad_norm": 0.41711976704966636, + "learning_rate": 0.00010519896741619803, + "loss": 0.7519, + "step": 156 + }, + { + "epoch": 0.5024, + "grad_norm": 0.4025532438690134, + "learning_rate": 0.00010415984921265609, + "loss": 0.7688, + "step": 157 + }, + { + "epoch": 0.5056, + "grad_norm": 0.3938260305761322, + "learning_rate": 0.00010312028085675391, + "loss": 0.7984, + "step": 158 + }, + { + "epoch": 0.5088, + "grad_norm": 0.38295899277573886, + "learning_rate": 0.00010208037484395114, + "loss": 0.7649, + "step": 159 + }, + { + "epoch": 0.512, + "grad_norm": 0.40748226134928633, + "learning_rate": 0.00010104024370624644, + "loss": 0.7888, + "step": 160 + }, + { + "epoch": 0.5152, + "grad_norm": 0.4147850854909231, + "learning_rate": 0.0001, + "loss": 0.8724, + "step": 161 + }, + { + "epoch": 0.5184, + "grad_norm": 0.42902160057483835, + "learning_rate": 9.895975629375359e-05, + "loss": 0.7562, + "step": 162 + }, + { + "epoch": 0.5216, + "grad_norm": 0.5612209341617765, + "learning_rate": 9.791962515604887e-05, + "loss": 0.7756, + "step": 163 + }, + { + "epoch": 0.5248, + "grad_norm": 0.41655921601946894, + "learning_rate": 9.687971914324607e-05, + "loss": 0.7856, + "step": 164 + }, + { + "epoch": 0.528, + "grad_norm": 0.42825407866222126, + "learning_rate": 9.584015078734395e-05, + "loss": 0.8294, + "step": 165 + }, + { + "epoch": 0.5312, + "grad_norm": 0.38306567130243957, + "learning_rate": 9.480103258380198e-05, + "loss": 0.7642, + "step": 166 + }, + { + "epoch": 0.5344, + "grad_norm": 0.49876992207725157, + "learning_rate": 9.376247697936719e-05, + "loss": 0.8447, + "step": 167 + }, + { + "epoch": 0.5376, + "grad_norm": 0.4736142406610525, + "learning_rate": 9.272459635990562e-05, + "loss": 0.9073, + "step": 168 + }, + { + "epoch": 0.5408, + "grad_norm": 0.3918865694655684, + "learning_rate": 9.168750303824084e-05, + "loss": 0.7816, + "step": 169 + }, + { + "epoch": 0.544, + "grad_norm": 0.3664553849638891, + "learning_rate": 9.065130924199998e-05, + "loss": 0.7516, + "step": 170 + }, + { + "epoch": 0.5472, + "grad_norm": 0.39712600325232983, + "learning_rate": 8.961612710146934e-05, + "loss": 0.7371, + "step": 171 + }, + { + "epoch": 0.5504, + "grad_norm": 0.4600571784465661, + "learning_rate": 8.858206863746018e-05, + "loss": 0.8381, + "step": 172 + }, + { + "epoch": 0.5536, + "grad_norm": 0.3628261375645332, + "learning_rate": 8.754924574918675e-05, + "loss": 0.747, + "step": 173 + }, + { + "epoch": 0.5568, + "grad_norm": 0.393616467287087, + "learning_rate": 8.651777020215712e-05, + "loss": 0.7235, + "step": 174 + }, + { + "epoch": 0.56, + "grad_norm": 0.47617434563057714, + "learning_rate": 8.548775361607872e-05, + "loss": 0.8087, + "step": 175 + }, + { + "epoch": 0.5632, + "grad_norm": 0.40997243707815617, + "learning_rate": 8.445930745277953e-05, + "loss": 0.7951, + "step": 176 + }, + { + "epoch": 0.5664, + "grad_norm": 0.3801073410542062, + "learning_rate": 8.343254300414628e-05, + "loss": 0.7078, + "step": 177 + }, + { + "epoch": 0.5696, + "grad_norm": 0.8011285925365285, + "learning_rate": 8.240757138008149e-05, + "loss": 0.7284, + "step": 178 + }, + { + "epoch": 0.5728, + "grad_norm": 0.3863622880101341, + "learning_rate": 8.138450349647936e-05, + "loss": 0.7404, + "step": 179 + }, + { + "epoch": 0.576, + "grad_norm": 0.4295315822819754, + "learning_rate": 8.036345006322359e-05, + "loss": 0.7839, + "step": 180 + }, + { + "epoch": 0.5792, + "grad_norm": 0.3966157719283058, + "learning_rate": 7.934452157220694e-05, + "loss": 0.754, + "step": 181 + }, + { + "epoch": 0.5824, + "grad_norm": 0.44437922960807463, + "learning_rate": 7.832782828537437e-05, + "loss": 0.8257, + "step": 182 + }, + { + "epoch": 0.5856, + "grad_norm": 0.4287544680995954, + "learning_rate": 7.731348022279134e-05, + "loss": 0.8008, + "step": 183 + }, + { + "epoch": 0.5888, + "grad_norm": 0.4347600038177852, + "learning_rate": 7.630158715073813e-05, + "loss": 0.8624, + "step": 184 + }, + { + "epoch": 0.592, + "grad_norm": 0.3863039786771903, + "learning_rate": 7.52922585698315e-05, + "loss": 0.8026, + "step": 185 + }, + { + "epoch": 0.5952, + "grad_norm": 0.3632768447767422, + "learning_rate": 7.428560370317542e-05, + "loss": 0.6955, + "step": 186 + }, + { + "epoch": 0.5984, + "grad_norm": 0.4579704147896861, + "learning_rate": 7.328173148454151e-05, + "loss": 0.928, + "step": 187 + }, + { + "epoch": 0.6016, + "grad_norm": 0.35567333575144844, + "learning_rate": 7.228075054658096e-05, + "loss": 0.7131, + "step": 188 + }, + { + "epoch": 0.6048, + "grad_norm": 0.42351725520340977, + "learning_rate": 7.1282769209069e-05, + "loss": 0.8225, + "step": 189 + }, + { + "epoch": 0.608, + "grad_norm": 0.3695258935779004, + "learning_rate": 7.028789546718326e-05, + "loss": 0.7611, + "step": 190 + }, + { + "epoch": 0.6112, + "grad_norm": 0.3616084000908627, + "learning_rate": 6.929623697981718e-05, + "loss": 0.7444, + "step": 191 + }, + { + "epoch": 0.6144, + "grad_norm": 0.3640094661089709, + "learning_rate": 6.830790105792973e-05, + "loss": 0.7047, + "step": 192 + }, + { + "epoch": 0.6176, + "grad_norm": 0.4105001115396971, + "learning_rate": 6.732299465293322e-05, + "loss": 0.7566, + "step": 193 + }, + { + "epoch": 0.6208, + "grad_norm": 0.3801573705419409, + "learning_rate": 6.63416243451194e-05, + "loss": 0.7274, + "step": 194 + }, + { + "epoch": 0.624, + "grad_norm": 0.3735136572378132, + "learning_rate": 6.536389633212609e-05, + "loss": 0.7516, + "step": 195 + }, + { + "epoch": 0.6272, + "grad_norm": 0.3859130222934957, + "learning_rate": 6.43899164174453e-05, + "loss": 0.7686, + "step": 196 + }, + { + "epoch": 0.6304, + "grad_norm": 0.42009825247089627, + "learning_rate": 6.341978999897365e-05, + "loss": 0.7361, + "step": 197 + }, + { + "epoch": 0.6336, + "grad_norm": 0.40298557972576315, + "learning_rate": 6.245362205760704e-05, + "loss": 0.733, + "step": 198 + }, + { + "epoch": 0.6368, + "grad_norm": 0.40950437300610265, + "learning_rate": 6.149151714588009e-05, + "loss": 0.7993, + "step": 199 + }, + { + "epoch": 0.64, + "grad_norm": 0.3608475513637045, + "learning_rate": 6.053357937665237e-05, + "loss": 0.7089, + "step": 200 + }, + { + "epoch": 0.6432, + "grad_norm": 0.44061580725734156, + "learning_rate": 5.957991241184184e-05, + "loss": 0.8313, + "step": 201 + }, + { + "epoch": 0.6464, + "grad_norm": 0.4447650079537509, + "learning_rate": 5.863061945120719e-05, + "loss": 0.782, + "step": 202 + }, + { + "epoch": 0.6496, + "grad_norm": 0.3622557561934682, + "learning_rate": 5.768580322118034e-05, + "loss": 0.7714, + "step": 203 + }, + { + "epoch": 0.6528, + "grad_norm": 0.404647783044088, + "learning_rate": 5.6745565963749925e-05, + "loss": 0.7748, + "step": 204 + }, + { + "epoch": 0.656, + "grad_norm": 0.38394711987452385, + "learning_rate": 5.5810009425397294e-05, + "loss": 0.7597, + "step": 205 + }, + { + "epoch": 0.6592, + "grad_norm": 0.3727936161926316, + "learning_rate": 5.487923484608629e-05, + "loss": 0.7853, + "step": 206 + }, + { + "epoch": 0.6624, + "grad_norm": 0.49024845843748677, + "learning_rate": 5.395334294830765e-05, + "loss": 0.8499, + "step": 207 + }, + { + "epoch": 0.6656, + "grad_norm": 0.4037226593320179, + "learning_rate": 5.3032433926179395e-05, + "loss": 0.7434, + "step": 208 + }, + { + "epoch": 0.6688, + "grad_norm": 0.37262038078797616, + "learning_rate": 5.211660743460458e-05, + "loss": 0.7668, + "step": 209 + }, + { + "epoch": 0.672, + "grad_norm": 0.3764820116836066, + "learning_rate": 5.1205962578487155e-05, + "loss": 0.7508, + "step": 210 + }, + { + "epoch": 0.6752, + "grad_norm": 0.3740403474660709, + "learning_rate": 5.030059790200756e-05, + "loss": 0.7133, + "step": 211 + }, + { + "epoch": 0.6784, + "grad_norm": 0.3911675870000252, + "learning_rate": 4.940061137795876e-05, + "loss": 0.7331, + "step": 212 + }, + { + "epoch": 0.6816, + "grad_norm": 0.44092476422884314, + "learning_rate": 4.850610039714444e-05, + "loss": 0.803, + "step": 213 + }, + { + "epoch": 0.6848, + "grad_norm": 0.5965084946433509, + "learning_rate": 4.761716175783989e-05, + "loss": 0.7782, + "step": 214 + }, + { + "epoch": 0.688, + "grad_norm": 0.3694223849820989, + "learning_rate": 4.673389165531714e-05, + "loss": 0.696, + "step": 215 + }, + { + "epoch": 0.6912, + "grad_norm": 0.39835566672325723, + "learning_rate": 4.585638567143529e-05, + "loss": 0.7607, + "step": 216 + }, + { + "epoch": 0.6944, + "grad_norm": 0.40421830431345207, + "learning_rate": 4.498473876429726e-05, + "loss": 0.77, + "step": 217 + }, + { + "epoch": 0.6976, + "grad_norm": 0.4327460763114331, + "learning_rate": 4.411904525797408e-05, + "loss": 0.7976, + "step": 218 + }, + { + "epoch": 0.7008, + "grad_norm": 0.36325155518235447, + "learning_rate": 4.325939883229766e-05, + "loss": 0.7052, + "step": 219 + }, + { + "epoch": 0.704, + "grad_norm": 1.0751296068478726, + "learning_rate": 4.240589251272342e-05, + "loss": 0.7527, + "step": 220 + }, + { + "epoch": 0.7072, + "grad_norm": 0.3897182885312082, + "learning_rate": 4.155861866026364e-05, + "loss": 0.7804, + "step": 221 + }, + { + "epoch": 0.7104, + "grad_norm": 0.4169239593245994, + "learning_rate": 4.071766896149273e-05, + "loss": 0.7822, + "step": 222 + }, + { + "epoch": 0.7136, + "grad_norm": 0.3691444779397978, + "learning_rate": 3.988313441862553e-05, + "loss": 0.7445, + "step": 223 + }, + { + "epoch": 0.7168, + "grad_norm": 0.39625995775209094, + "learning_rate": 3.9055105339669595e-05, + "loss": 0.6983, + "step": 224 + }, + { + "epoch": 0.72, + "grad_norm": 0.3881640929163733, + "learning_rate": 3.823367132865265e-05, + "loss": 0.7913, + "step": 225 + }, + { + "epoch": 0.7232, + "grad_norm": 0.39334899909767057, + "learning_rate": 3.741892127592625e-05, + "loss": 0.7223, + "step": 226 + }, + { + "epoch": 0.7264, + "grad_norm": 0.36526742592830835, + "learning_rate": 3.6610943348546526e-05, + "loss": 0.7562, + "step": 227 + }, + { + "epoch": 0.7296, + "grad_norm": 0.4284379055555717, + "learning_rate": 3.580982498073344e-05, + "loss": 0.8153, + "step": 228 + }, + { + "epoch": 0.7328, + "grad_norm": 0.3445180285265342, + "learning_rate": 3.501565286440914e-05, + "loss": 0.7111, + "step": 229 + }, + { + "epoch": 0.736, + "grad_norm": 0.39156607346352107, + "learning_rate": 3.422851293981676e-05, + "loss": 0.7803, + "step": 230 + }, + { + "epoch": 0.7392, + "grad_norm": 0.32983355633155836, + "learning_rate": 3.3448490386220355e-05, + "loss": 0.6598, + "step": 231 + }, + { + "epoch": 0.7424, + "grad_norm": 0.3973990647192029, + "learning_rate": 3.2675669612687565e-05, + "loss": 0.8145, + "step": 232 + }, + { + "epoch": 0.7456, + "grad_norm": 0.39917217243119213, + "learning_rate": 3.191013424895536e-05, + "loss": 0.7809, + "step": 233 + }, + { + "epoch": 0.7488, + "grad_norm": 0.35699613230012617, + "learning_rate": 3.115196713638e-05, + "loss": 0.7515, + "step": 234 + }, + { + "epoch": 0.752, + "grad_norm": 0.3757030510807036, + "learning_rate": 3.040125031897264e-05, + "loss": 0.7572, + "step": 235 + }, + { + "epoch": 0.7552, + "grad_norm": 0.39574927462744697, + "learning_rate": 2.9658065034520978e-05, + "loss": 0.7541, + "step": 236 + }, + { + "epoch": 0.7584, + "grad_norm": 0.3638118135822814, + "learning_rate": 2.892249170579826e-05, + "loss": 0.6844, + "step": 237 + }, + { + "epoch": 0.7616, + "grad_norm": 0.3532576466352604, + "learning_rate": 2.8194609931860316e-05, + "loss": 0.755, + "step": 238 + }, + { + "epoch": 0.7648, + "grad_norm": 0.37812827082129896, + "learning_rate": 2.7474498479432087e-05, + "loss": 0.7661, + "step": 239 + }, + { + "epoch": 0.768, + "grad_norm": 0.4000186907926742, + "learning_rate": 2.6762235274383772e-05, + "loss": 0.7648, + "step": 240 + }, + { + "epoch": 0.7712, + "grad_norm": 0.38478863206079855, + "learning_rate": 2.6057897393298324e-05, + "loss": 0.7915, + "step": 241 + }, + { + "epoch": 0.7744, + "grad_norm": 0.40839553211815444, + "learning_rate": 2.536156105513062e-05, + "loss": 0.7842, + "step": 242 + }, + { + "epoch": 0.7776, + "grad_norm": 0.36134207947659874, + "learning_rate": 2.4673301612959654e-05, + "loss": 0.6276, + "step": 243 + }, + { + "epoch": 0.7808, + "grad_norm": 0.41146843594962335, + "learning_rate": 2.399319354583418e-05, + "loss": 0.8122, + "step": 244 + }, + { + "epoch": 0.784, + "grad_norm": 0.390594784718953, + "learning_rate": 2.3321310450713062e-05, + "loss": 0.6959, + "step": 245 + }, + { + "epoch": 0.7872, + "grad_norm": 0.417708087683773, + "learning_rate": 2.265772503450122e-05, + "loss": 0.7417, + "step": 246 + }, + { + "epoch": 0.7904, + "grad_norm": 0.41770966842517177, + "learning_rate": 2.2002509106181624e-05, + "loss": 0.7427, + "step": 247 + }, + { + "epoch": 0.7936, + "grad_norm": 0.45819003898106186, + "learning_rate": 2.1355733569044635e-05, + "loss": 0.783, + "step": 248 + }, + { + "epoch": 0.7968, + "grad_norm": 0.3608229753200471, + "learning_rate": 2.0717468413015283e-05, + "loss": 0.7077, + "step": 249 + }, + { + "epoch": 0.8, + "grad_norm": 0.42750323157502884, + "learning_rate": 2.008778270707944e-05, + "loss": 0.7661, + "step": 250 + }, + { + "epoch": 0.8032, + "grad_norm": 0.3737648119297154, + "learning_rate": 1.946674459180955e-05, + "loss": 0.7622, + "step": 251 + }, + { + "epoch": 0.8064, + "grad_norm": 0.42424402467674727, + "learning_rate": 1.8854421271990964e-05, + "loss": 0.7997, + "step": 252 + }, + { + "epoch": 0.8096, + "grad_norm": 0.4251037432545757, + "learning_rate": 1.8250879009349398e-05, + "loss": 0.8169, + "step": 253 + }, + { + "epoch": 0.8128, + "grad_norm": 0.4289372594381212, + "learning_rate": 1.7656183115380577e-05, + "loss": 0.8064, + "step": 254 + }, + { + "epoch": 0.816, + "grad_norm": 0.3926113472204545, + "learning_rate": 1.707039794428259e-05, + "loss": 0.774, + "step": 255 + }, + { + "epoch": 0.8192, + "grad_norm": 0.40609687366384595, + "learning_rate": 1.649358688599191e-05, + "loss": 0.7674, + "step": 256 + }, + { + "epoch": 0.8224, + "grad_norm": 0.3816905938196553, + "learning_rate": 1.5925812359323745e-05, + "loss": 0.7223, + "step": 257 + }, + { + "epoch": 0.8256, + "grad_norm": 0.39092919615885185, + "learning_rate": 1.5367135805217458e-05, + "loss": 0.7598, + "step": 258 + }, + { + "epoch": 0.8288, + "grad_norm": 0.38177892948495634, + "learning_rate": 1.4817617680087825e-05, + "loss": 0.7474, + "step": 259 + }, + { + "epoch": 0.832, + "grad_norm": 0.3904483662558189, + "learning_rate": 1.4277317449282834e-05, + "loss": 0.7693, + "step": 260 + }, + { + "epoch": 0.8352, + "grad_norm": 0.39329544302846187, + "learning_rate": 1.3746293580648717e-05, + "loss": 0.8033, + "step": 261 + }, + { + "epoch": 0.8384, + "grad_norm": 0.41124017095318816, + "learning_rate": 1.3224603538202929e-05, + "loss": 0.6987, + "step": 262 + }, + { + "epoch": 0.8416, + "grad_norm": 0.3843691621015497, + "learning_rate": 1.2712303775915802e-05, + "loss": 0.7485, + "step": 263 + }, + { + "epoch": 0.8448, + "grad_norm": 0.38855507856424987, + "learning_rate": 1.220944973160133e-05, + "loss": 0.7168, + "step": 264 + }, + { + "epoch": 0.848, + "grad_norm": 0.3507519796770734, + "learning_rate": 1.1716095820918216e-05, + "loss": 0.6648, + "step": 265 + }, + { + "epoch": 0.8512, + "grad_norm": 0.37421219471051437, + "learning_rate": 1.1232295431481222e-05, + "loss": 0.7689, + "step": 266 + }, + { + "epoch": 0.8544, + "grad_norm": 0.35786148880826973, + "learning_rate": 1.0758100917083991e-05, + "loss": 0.6801, + "step": 267 + }, + { + "epoch": 0.8576, + "grad_norm": 0.40146892050862254, + "learning_rate": 1.0293563592033595e-05, + "loss": 0.772, + "step": 268 + }, + { + "epoch": 0.8608, + "grad_norm": 0.4372265680239677, + "learning_rate": 9.838733725597615e-06, + "loss": 0.8464, + "step": 269 + }, + { + "epoch": 0.864, + "grad_norm": 0.44612067779519543, + "learning_rate": 9.393660536564408e-06, + "loss": 0.7776, + "step": 270 + }, + { + "epoch": 0.8672, + "grad_norm": 0.42670276262977314, + "learning_rate": 8.958392187916841e-06, + "loss": 0.8086, + "step": 271 + }, + { + "epoch": 0.8704, + "grad_norm": 0.8227660388329809, + "learning_rate": 8.532975781620512e-06, + "loss": 0.6584, + "step": 272 + }, + { + "epoch": 0.8736, + "grad_norm": 0.3842207858258516, + "learning_rate": 8.117457353526625e-06, + "loss": 0.7495, + "step": 273 + }, + { + "epoch": 0.8768, + "grad_norm": 0.39540138571188127, + "learning_rate": 7.711881868390291e-06, + "loss": 0.7686, + "step": 274 + }, + { + "epoch": 0.88, + "grad_norm": 0.36498305543668375, + "learning_rate": 7.3162932150046885e-06, + "loss": 0.7179, + "step": 275 + }, + { + "epoch": 0.8832, + "grad_norm": 0.3686666206373811, + "learning_rate": 6.930734201451816e-06, + "loss": 0.6837, + "step": 276 + }, + { + "epoch": 0.8864, + "grad_norm": 0.405537960570134, + "learning_rate": 6.555246550469907e-06, + "loss": 0.7392, + "step": 277 + }, + { + "epoch": 0.8896, + "grad_norm": 0.3732960016607687, + "learning_rate": 6.189870894938587e-06, + "loss": 0.7265, + "step": 278 + }, + { + "epoch": 0.8928, + "grad_norm": 0.4030197091110242, + "learning_rate": 5.834646773481811e-06, + "loss": 0.7875, + "step": 279 + }, + { + "epoch": 0.896, + "grad_norm": 0.4213002026168049, + "learning_rate": 5.489612626189245e-06, + "loss": 0.7773, + "step": 280 + }, + { + "epoch": 0.8992, + "grad_norm": 0.3761131954115017, + "learning_rate": 5.154805790456485e-06, + "loss": 0.7399, + "step": 281 + }, + { + "epoch": 0.9024, + "grad_norm": 0.4132882262141919, + "learning_rate": 4.830262496944693e-06, + "loss": 0.7894, + "step": 282 + }, + { + "epoch": 0.9056, + "grad_norm": 0.41423363619491843, + "learning_rate": 4.516017865659949e-06, + "loss": 0.8073, + "step": 283 + }, + { + "epoch": 0.9088, + "grad_norm": 0.38761750502367714, + "learning_rate": 4.21210590215273e-06, + "loss": 0.7416, + "step": 284 + }, + { + "epoch": 0.912, + "grad_norm": 0.3839770008034293, + "learning_rate": 3.918559493838114e-06, + "loss": 0.724, + "step": 285 + }, + { + "epoch": 0.9152, + "grad_norm": 0.3599229423297392, + "learning_rate": 3.6354104064368566e-06, + "loss": 0.7067, + "step": 286 + }, + { + "epoch": 0.9184, + "grad_norm": 0.3888595749417921, + "learning_rate": 3.3626892805379562e-06, + "loss": 0.7476, + "step": 287 + }, + { + "epoch": 0.9216, + "grad_norm": 0.3443579491802449, + "learning_rate": 3.100425628282899e-06, + "loss": 0.7092, + "step": 288 + }, + { + "epoch": 0.9248, + "grad_norm": 0.44403754362186026, + "learning_rate": 2.848647830172024e-06, + "loss": 0.844, + "step": 289 + }, + { + "epoch": 0.928, + "grad_norm": 0.37608113786992864, + "learning_rate": 2.607383131993424e-06, + "loss": 0.7038, + "step": 290 + }, + { + "epoch": 0.9312, + "grad_norm": 0.3983224257988678, + "learning_rate": 2.3766576418745022e-06, + "loss": 0.7299, + "step": 291 + }, + { + "epoch": 0.9344, + "grad_norm": 0.3914566399179377, + "learning_rate": 2.1564963274568027e-06, + "loss": 0.7044, + "step": 292 + }, + { + "epoch": 0.9376, + "grad_norm": 0.3916343065284849, + "learning_rate": 1.9469230131940907e-06, + "loss": 0.7835, + "step": 293 + }, + { + "epoch": 0.9408, + "grad_norm": 0.7908788036382982, + "learning_rate": 1.7479603777742938e-06, + "loss": 0.6602, + "step": 294 + }, + { + "epoch": 0.944, + "grad_norm": 0.41562052712483977, + "learning_rate": 1.559629951665298e-06, + "loss": 0.7497, + "step": 295 + }, + { + "epoch": 0.9472, + "grad_norm": 0.38743588474530594, + "learning_rate": 1.3819521147851123e-06, + "loss": 0.7559, + "step": 296 + }, + { + "epoch": 0.9504, + "grad_norm": 0.40247540341968846, + "learning_rate": 1.2149460942964098e-06, + "loss": 0.7366, + "step": 297 + }, + { + "epoch": 0.9536, + "grad_norm": 0.3723632343743109, + "learning_rate": 1.05862996252597e-06, + "loss": 0.7587, + "step": 298 + }, + { + "epoch": 0.9568, + "grad_norm": 0.4800849762941837, + "learning_rate": 9.130206350089765e-07, + "loss": 0.7571, + "step": 299 + }, + { + "epoch": 0.96, + "grad_norm": 0.34878360642444717, + "learning_rate": 7.781338686584927e-07, + "loss": 0.7249, + "step": 300 + }, + { + "epoch": 0.9632, + "grad_norm": 0.3726060412123766, + "learning_rate": 6.539842600603918e-07, + "loss": 0.7418, + "step": 301 + }, + { + "epoch": 0.9664, + "grad_norm": 0.40157548278742655, + "learning_rate": 5.405852438937764e-07, + "loss": 0.7364, + "step": 302 + }, + { + "epoch": 0.9696, + "grad_norm": 0.4187836441125278, + "learning_rate": 4.3794909147720773e-07, + "loss": 0.7549, + "step": 303 + }, + { + "epoch": 0.9728, + "grad_norm": 0.360392159342072, + "learning_rate": 3.4608690944071263e-07, + "loss": 0.7027, + "step": 304 + }, + { + "epoch": 0.976, + "grad_norm": 0.3901673132932371, + "learning_rate": 2.6500863852395584e-07, + "loss": 0.7788, + "step": 305 + }, + { + "epoch": 0.9792, + "grad_norm": 0.3780802382891487, + "learning_rate": 1.947230525005006e-07, + "loss": 0.689, + "step": 306 + }, + { + "epoch": 0.9824, + "grad_norm": 0.4548628926236984, + "learning_rate": 1.3523775722834587e-07, + "loss": 0.825, + "step": 307 + }, + { + "epoch": 0.9856, + "grad_norm": 0.38820521164508304, + "learning_rate": 8.655918982689581e-08, + "loss": 0.8133, + "step": 308 + }, + { + "epoch": 0.9888, + "grad_norm": 0.34923336157859275, + "learning_rate": 4.8692617980350406e-08, + "loss": 0.6934, + "step": 309 + }, + { + "epoch": 0.992, + "grad_norm": 0.45316721827824846, + "learning_rate": 2.164213936770576e-08, + "loss": 0.8081, + "step": 310 + }, + { + "epoch": 0.9952, + "grad_norm": 0.4166271000686725, + "learning_rate": 5.410681219286673e-09, + "loss": 0.8381, + "step": 311 + }, + { + "epoch": 0.9984, + "grad_norm": 0.362295912386709, + "learning_rate": 0.0, + "loss": 0.7171, + "step": 312 + }, + { + "epoch": 0.9984, + "step": 312, + "total_flos": 275545903136768.0, + "train_loss": 0.798310113640932, + "train_runtime": 4869.1433, + "train_samples_per_second": 1.027, + "train_steps_per_second": 0.064 + } + ], + "logging_steps": 1.0, + "max_steps": 312, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 275545903136768.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_3_epochs_1_GA_2_lora/README.md b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_3_epochs_1_GA_2_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_3_epochs_1_GA_2_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_3_epochs_1_GA_2_lora/adapter_config.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_3_epochs_1_GA_2_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6ca4ae2a30b01a6a37ff0b01e8938dc1aec098c4 --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_3_epochs_1_GA_2_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "gate_proj", + "up_proj", + "q_proj", + "v_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7789e5c33b4fd426afd5fb7f6fb426e9139c42c6 --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_3_epochs_1_GA_2_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ecf96555c51bf18186d1f114312478b63cc3766fb05ccc9a6132c1acae57b76 +size 671150064 diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_3_epochs_1_GA_2_lora/config.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_3_epochs_1_GA_2_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_3_epochs_1_GA_2_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..b590d3a13a711a16bb44fb174d3f1fe2543bbf96 --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_3_epochs_1_GA_2_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d07deab2c1319d9c360d9e538320d34762c5ad339d8ac2930e59cfd62acd246 +size 918507402 diff --git a/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_3_epochs_1_GA_2_lora/trainer_state.json b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_3_epochs_1_GA_2_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ca704958f6ba747d300fc60013919cfb68708d64 --- /dev/null +++ b/mixing_strategies/Weighted_4_3_3/bugsBunny-v1_1-Llama-3-8B-V-Weighted_4_3_3_dataset_5000_repeat_3_epochs_1_GA_2_lora/trainer_state.json @@ -0,0 +1,2226 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9984, + "eval_steps": 500, + "global_step": 312, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0032, + "grad_norm": 0.7353840109320694, + "learning_rate": 2e-05, + "loss": 1.1834, + "step": 1 + }, + { + "epoch": 0.0064, + "grad_norm": 0.7209487970131268, + "learning_rate": 4e-05, + "loss": 1.2349, + "step": 2 + }, + { + "epoch": 0.0096, + "grad_norm": 0.7553956094521731, + "learning_rate": 6e-05, + "loss": 1.2734, + "step": 3 + }, + { + "epoch": 0.0128, + "grad_norm": 0.7039345337655484, + "learning_rate": 8e-05, + "loss": 1.2004, + "step": 4 + }, + { + "epoch": 0.016, + "grad_norm": 0.728834613513379, + "learning_rate": 0.0001, + "loss": 1.0934, + "step": 5 + }, + { + "epoch": 0.0192, + "grad_norm": 0.8685229227681107, + "learning_rate": 0.00012, + "loss": 1.0323, + "step": 6 + }, + { + "epoch": 0.0224, + "grad_norm": 0.802914133393135, + "learning_rate": 0.00014, + "loss": 0.994, + "step": 7 + }, + { + "epoch": 0.0256, + "grad_norm": 0.7739004688791559, + "learning_rate": 0.00016, + "loss": 0.9976, + "step": 8 + }, + { + "epoch": 0.0288, + "grad_norm": 0.7902313870210783, + "learning_rate": 0.00018, + "loss": 0.931, + "step": 9 + }, + { + "epoch": 0.032, + "grad_norm": 0.593104883060895, + "learning_rate": 0.0002, + "loss": 0.9584, + "step": 10 + }, + { + "epoch": 0.0352, + "grad_norm": 0.5113190956320616, + "learning_rate": 0.00019999458931878073, + "loss": 0.9149, + "step": 11 + }, + { + "epoch": 0.0384, + "grad_norm": 0.5060503485009137, + "learning_rate": 0.0001999783578606323, + "loss": 0.911, + "step": 12 + }, + { + "epoch": 0.0416, + "grad_norm": 0.4382313868336973, + "learning_rate": 0.00019995130738201966, + "loss": 0.8071, + "step": 13 + }, + { + "epoch": 0.0448, + "grad_norm": 0.5922615784284019, + "learning_rate": 0.0001999134408101731, + "loss": 0.9103, + "step": 14 + }, + { + "epoch": 0.048, + "grad_norm": 0.7167160374888876, + "learning_rate": 0.00019986476224277165, + "loss": 0.9744, + "step": 15 + }, + { + "epoch": 0.0512, + "grad_norm": 0.601282492598755, + "learning_rate": 0.00019980527694749952, + "loss": 0.9426, + "step": 16 + }, + { + "epoch": 0.0544, + "grad_norm": 0.4719083490978608, + "learning_rate": 0.00019973499136147606, + "loss": 0.8633, + "step": 17 + }, + { + "epoch": 0.0576, + "grad_norm": 0.519145531179642, + "learning_rate": 0.0001996539130905593, + "loss": 0.8862, + "step": 18 + }, + { + "epoch": 0.0608, + "grad_norm": 0.45537967551722425, + "learning_rate": 0.0001995620509085228, + "loss": 0.8969, + "step": 19 + }, + { + "epoch": 0.064, + "grad_norm": 0.4195977357742255, + "learning_rate": 0.00019945941475610623, + "loss": 0.8835, + "step": 20 + }, + { + "epoch": 0.0672, + "grad_norm": 0.44714643731390347, + "learning_rate": 0.0001993460157399396, + "loss": 0.8649, + "step": 21 + }, + { + "epoch": 0.0704, + "grad_norm": 0.44320940901514166, + "learning_rate": 0.0001992218661313415, + "loss": 0.8439, + "step": 22 + }, + { + "epoch": 0.0736, + "grad_norm": 0.45330036496349835, + "learning_rate": 0.00019908697936499103, + "loss": 0.8766, + "step": 23 + }, + { + "epoch": 0.0768, + "grad_norm": 0.42451975536011216, + "learning_rate": 0.00019894137003747403, + "loss": 0.8387, + "step": 24 + }, + { + "epoch": 0.08, + "grad_norm": 0.43976281966016145, + "learning_rate": 0.00019878505390570362, + "loss": 0.8362, + "step": 25 + }, + { + "epoch": 0.0832, + "grad_norm": 0.45816566151513477, + "learning_rate": 0.00019861804788521493, + "loss": 0.8274, + "step": 26 + }, + { + "epoch": 0.0864, + "grad_norm": 0.45606272179204993, + "learning_rate": 0.00019844037004833473, + "loss": 0.9505, + "step": 27 + }, + { + "epoch": 0.0896, + "grad_norm": 0.4381119339418824, + "learning_rate": 0.00019825203962222572, + "loss": 0.8236, + "step": 28 + }, + { + "epoch": 0.0928, + "grad_norm": 0.4799753876158708, + "learning_rate": 0.0001980530769868059, + "loss": 0.8139, + "step": 29 + }, + { + "epoch": 0.096, + "grad_norm": 0.45868912405964, + "learning_rate": 0.00019784350367254322, + "loss": 0.9248, + "step": 30 + }, + { + "epoch": 0.0992, + "grad_norm": 0.4152018270320052, + "learning_rate": 0.0001976233423581255, + "loss": 0.8015, + "step": 31 + }, + { + "epoch": 0.1024, + "grad_norm": 0.4825359832052692, + "learning_rate": 0.0001973926168680066, + "loss": 0.875, + "step": 32 + }, + { + "epoch": 0.1056, + "grad_norm": 0.5168068629984979, + "learning_rate": 0.00019715135216982798, + "loss": 0.9829, + "step": 33 + }, + { + "epoch": 0.1088, + "grad_norm": 0.43712766079753224, + "learning_rate": 0.0001968995743717171, + "loss": 0.8483, + "step": 34 + }, + { + "epoch": 0.112, + "grad_norm": 0.46563267033392625, + "learning_rate": 0.00019663731071946206, + "loss": 0.828, + "step": 35 + }, + { + "epoch": 0.1152, + "grad_norm": 0.4663166112787982, + "learning_rate": 0.00019636458959356316, + "loss": 0.8483, + "step": 36 + }, + { + "epoch": 0.1184, + "grad_norm": 0.4484015420161677, + "learning_rate": 0.0001960814405061619, + "loss": 0.8452, + "step": 37 + }, + { + "epoch": 0.1216, + "grad_norm": 0.42753376993128667, + "learning_rate": 0.00019578789409784727, + "loss": 0.766, + "step": 38 + }, + { + "epoch": 0.1248, + "grad_norm": 0.4862274405553227, + "learning_rate": 0.00019548398213434007, + "loss": 0.8625, + "step": 39 + }, + { + "epoch": 0.128, + "grad_norm": 0.4546399694854664, + "learning_rate": 0.00019516973750305532, + "loss": 0.9057, + "step": 40 + }, + { + "epoch": 0.1312, + "grad_norm": 0.5223692563511014, + "learning_rate": 0.00019484519420954354, + "loss": 0.9527, + "step": 41 + }, + { + "epoch": 0.1344, + "grad_norm": 0.47122827916858195, + "learning_rate": 0.00019451038737381077, + "loss": 0.8754, + "step": 42 + }, + { + "epoch": 0.1376, + "grad_norm": 0.42665585601481504, + "learning_rate": 0.00019416535322651818, + "loss": 0.8565, + "step": 43 + }, + { + "epoch": 0.1408, + "grad_norm": 0.43940448209185695, + "learning_rate": 0.00019381012910506146, + "loss": 0.8715, + "step": 44 + }, + { + "epoch": 0.144, + "grad_norm": 0.3973470431891018, + "learning_rate": 0.00019344475344953012, + "loss": 0.81, + "step": 45 + }, + { + "epoch": 0.1472, + "grad_norm": 0.4372037210936052, + "learning_rate": 0.00019306926579854821, + "loss": 0.8431, + "step": 46 + }, + { + "epoch": 0.1504, + "grad_norm": 0.41326954936737587, + "learning_rate": 0.00019268370678499533, + "loss": 0.8158, + "step": 47 + }, + { + "epoch": 0.1536, + "grad_norm": 0.4110024162222225, + "learning_rate": 0.0001922881181316097, + "loss": 0.8194, + "step": 48 + }, + { + "epoch": 0.1568, + "grad_norm": 0.40563186797054557, + "learning_rate": 0.00019188254264647337, + "loss": 0.7973, + "step": 49 + }, + { + "epoch": 0.16, + "grad_norm": 0.4265169122791336, + "learning_rate": 0.0001914670242183795, + "loss": 0.8859, + "step": 50 + }, + { + "epoch": 0.1632, + "grad_norm": 0.44751438315175346, + "learning_rate": 0.0001910416078120832, + "loss": 0.8217, + "step": 51 + }, + { + "epoch": 0.1664, + "grad_norm": 0.44849247915843415, + "learning_rate": 0.0001906063394634356, + "loss": 0.7821, + "step": 52 + }, + { + "epoch": 0.1696, + "grad_norm": 0.4239607699371837, + "learning_rate": 0.00019016126627440237, + "loss": 0.743, + "step": 53 + }, + { + "epoch": 0.1728, + "grad_norm": 0.4472801716671858, + "learning_rate": 0.00018970643640796642, + "loss": 0.8384, + "step": 54 + }, + { + "epoch": 0.176, + "grad_norm": 0.4005685576060965, + "learning_rate": 0.000189241899082916, + "loss": 0.7432, + "step": 55 + }, + { + "epoch": 0.1792, + "grad_norm": 0.4242075729779606, + "learning_rate": 0.00018876770456851877, + "loss": 0.8966, + "step": 56 + }, + { + "epoch": 0.1824, + "grad_norm": 0.40286050662034556, + "learning_rate": 0.0001882839041790818, + "loss": 0.7687, + "step": 57 + }, + { + "epoch": 0.1856, + "grad_norm": 0.40660980251274575, + "learning_rate": 0.00018779055026839868, + "loss": 0.7573, + "step": 58 + }, + { + "epoch": 0.1888, + "grad_norm": 0.39606993921713546, + "learning_rate": 0.00018728769622408423, + "loss": 0.7909, + "step": 59 + }, + { + "epoch": 0.192, + "grad_norm": 0.379037677841038, + "learning_rate": 0.00018677539646179707, + "loss": 0.7575, + "step": 60 + }, + { + "epoch": 0.1952, + "grad_norm": 0.4224876839394266, + "learning_rate": 0.00018625370641935129, + "loss": 0.7678, + "step": 61 + }, + { + "epoch": 0.1984, + "grad_norm": 0.40588337794375384, + "learning_rate": 0.00018572268255071718, + "loss": 0.8066, + "step": 62 + }, + { + "epoch": 0.2016, + "grad_norm": 0.3845362324651911, + "learning_rate": 0.00018518238231991218, + "loss": 0.7523, + "step": 63 + }, + { + "epoch": 0.2048, + "grad_norm": 0.4381708599126164, + "learning_rate": 0.00018463286419478255, + "loss": 0.8432, + "step": 64 + }, + { + "epoch": 0.208, + "grad_norm": 0.45276926242831855, + "learning_rate": 0.00018407418764067627, + "loss": 0.8639, + "step": 65 + }, + { + "epoch": 0.2112, + "grad_norm": 0.37051457899931317, + "learning_rate": 0.00018350641311400812, + "loss": 0.7904, + "step": 66 + }, + { + "epoch": 0.2144, + "grad_norm": 0.40990165361568964, + "learning_rate": 0.0001829296020557174, + "loss": 0.8222, + "step": 67 + }, + { + "epoch": 0.2176, + "grad_norm": 0.4183200084135454, + "learning_rate": 0.00018234381688461942, + "loss": 0.7671, + "step": 68 + }, + { + "epoch": 0.2208, + "grad_norm": 0.432988942689945, + "learning_rate": 0.0001817491209906506, + "loss": 0.8457, + "step": 69 + }, + { + "epoch": 0.224, + "grad_norm": 0.4453566315058246, + "learning_rate": 0.00018114557872800905, + "loss": 0.7248, + "step": 70 + }, + { + "epoch": 0.2272, + "grad_norm": 0.42925663557577415, + "learning_rate": 0.00018053325540819045, + "loss": 0.7903, + "step": 71 + }, + { + "epoch": 0.2304, + "grad_norm": 0.5220117016806057, + "learning_rate": 0.0001799122172929206, + "loss": 0.8751, + "step": 72 + }, + { + "epoch": 0.2336, + "grad_norm": 0.40668797698470954, + "learning_rate": 0.00017928253158698473, + "loss": 0.7212, + "step": 73 + }, + { + "epoch": 0.2368, + "grad_norm": 0.40055891363920837, + "learning_rate": 0.0001786442664309554, + "loss": 0.8155, + "step": 74 + }, + { + "epoch": 0.24, + "grad_norm": 0.39726475648010184, + "learning_rate": 0.0001779974908938184, + "loss": 0.7836, + "step": 75 + }, + { + "epoch": 0.2432, + "grad_norm": 0.45508866275878007, + "learning_rate": 0.0001773422749654988, + "loss": 0.824, + "step": 76 + }, + { + "epoch": 0.2464, + "grad_norm": 0.41489798086407326, + "learning_rate": 0.00017667868954928694, + "loss": 0.7773, + "step": 77 + }, + { + "epoch": 0.2496, + "grad_norm": 0.42777300712080363, + "learning_rate": 0.00017600680645416583, + "loss": 0.8128, + "step": 78 + }, + { + "epoch": 0.2528, + "grad_norm": 0.42703268648937903, + "learning_rate": 0.00017532669838704035, + "loss": 0.8463, + "step": 79 + }, + { + "epoch": 0.256, + "grad_norm": 0.39515846824029793, + "learning_rate": 0.00017463843894486937, + "loss": 0.7655, + "step": 80 + }, + { + "epoch": 0.2592, + "grad_norm": 0.3688518956481181, + "learning_rate": 0.0001739421026067017, + "loss": 0.756, + "step": 81 + }, + { + "epoch": 0.2624, + "grad_norm": 0.42366848797339224, + "learning_rate": 0.00017323776472561627, + "loss": 0.8382, + "step": 82 + }, + { + "epoch": 0.2656, + "grad_norm": 0.5243105173935234, + "learning_rate": 0.00017252550152056795, + "loss": 0.871, + "step": 83 + }, + { + "epoch": 0.2688, + "grad_norm": 0.39167002909509857, + "learning_rate": 0.0001718053900681397, + "loss": 0.7633, + "step": 84 + }, + { + "epoch": 0.272, + "grad_norm": 0.3714396408422754, + "learning_rate": 0.00017107750829420176, + "loss": 0.7154, + "step": 85 + }, + { + "epoch": 0.2752, + "grad_norm": 0.5093555559695603, + "learning_rate": 0.00017034193496547902, + "loss": 0.9635, + "step": 86 + }, + { + "epoch": 0.2784, + "grad_norm": 0.4824892609154267, + "learning_rate": 0.00016959874968102735, + "loss": 0.8266, + "step": 87 + }, + { + "epoch": 0.2816, + "grad_norm": 0.403166248117673, + "learning_rate": 0.00016884803286362, + "loss": 0.7489, + "step": 88 + }, + { + "epoch": 0.2848, + "grad_norm": 0.45690137982899215, + "learning_rate": 0.00016808986575104465, + "loss": 0.8993, + "step": 89 + }, + { + "epoch": 0.288, + "grad_norm": 0.42481386891603745, + "learning_rate": 0.00016732433038731242, + "loss": 0.8961, + "step": 90 + }, + { + "epoch": 0.2912, + "grad_norm": 0.4132022855225946, + "learning_rate": 0.0001665515096137797, + "loss": 0.8235, + "step": 91 + }, + { + "epoch": 0.2944, + "grad_norm": 0.4604603897430795, + "learning_rate": 0.00016577148706018328, + "loss": 0.918, + "step": 92 + }, + { + "epoch": 0.2976, + "grad_norm": 0.3716709937924135, + "learning_rate": 0.00016498434713559088, + "loss": 0.7695, + "step": 93 + }, + { + "epoch": 0.3008, + "grad_norm": 0.4322074676635217, + "learning_rate": 0.00016419017501926656, + "loss": 0.7946, + "step": 94 + }, + { + "epoch": 0.304, + "grad_norm": 0.347234456908987, + "learning_rate": 0.0001633890566514535, + "loss": 0.7293, + "step": 95 + }, + { + "epoch": 0.3072, + "grad_norm": 0.4656021839883055, + "learning_rate": 0.00016258107872407375, + "loss": 0.7828, + "step": 96 + }, + { + "epoch": 0.3104, + "grad_norm": 0.3913013286778377, + "learning_rate": 0.0001617663286713474, + "loss": 0.7493, + "step": 97 + }, + { + "epoch": 0.3136, + "grad_norm": 0.44779808523000536, + "learning_rate": 0.00016094489466033043, + "loss": 0.8271, + "step": 98 + }, + { + "epoch": 0.3168, + "grad_norm": 0.4013665784290249, + "learning_rate": 0.00016011686558137448, + "loss": 0.852, + "step": 99 + }, + { + "epoch": 0.32, + "grad_norm": 0.4519606431932703, + "learning_rate": 0.0001592823310385073, + "loss": 0.7996, + "step": 100 + }, + { + "epoch": 0.3232, + "grad_norm": 0.3751892895358891, + "learning_rate": 0.0001584413813397364, + "loss": 0.7275, + "step": 101 + }, + { + "epoch": 0.3264, + "grad_norm": 0.41081716737905427, + "learning_rate": 0.00015759410748727662, + "loss": 0.7725, + "step": 102 + }, + { + "epoch": 0.3296, + "grad_norm": 0.3711590528636452, + "learning_rate": 0.00015674060116770236, + "loss": 0.7327, + "step": 103 + }, + { + "epoch": 0.3328, + "grad_norm": 0.381588644256682, + "learning_rate": 0.00015588095474202595, + "loss": 0.7688, + "step": 104 + }, + { + "epoch": 0.336, + "grad_norm": 0.3931817700311297, + "learning_rate": 0.00015501526123570277, + "loss": 0.7852, + "step": 105 + }, + { + "epoch": 0.3392, + "grad_norm": 0.4089801949825822, + "learning_rate": 0.00015414361432856475, + "loss": 0.7861, + "step": 106 + }, + { + "epoch": 0.3424, + "grad_norm": 0.4232827654924218, + "learning_rate": 0.0001532661083446829, + "loss": 0.8403, + "step": 107 + }, + { + "epoch": 0.3456, + "grad_norm": 0.46829156018564105, + "learning_rate": 0.00015238283824216015, + "loss": 0.842, + "step": 108 + }, + { + "epoch": 0.3488, + "grad_norm": 0.44314902877740564, + "learning_rate": 0.00015149389960285558, + "loss": 0.8634, + "step": 109 + }, + { + "epoch": 0.352, + "grad_norm": 0.393032587937408, + "learning_rate": 0.00015059938862204127, + "loss": 0.7417, + "step": 110 + }, + { + "epoch": 0.3552, + "grad_norm": 0.42764416554166595, + "learning_rate": 0.00014969940209799248, + "loss": 0.8094, + "step": 111 + }, + { + "epoch": 0.3584, + "grad_norm": 0.4312513961612947, + "learning_rate": 0.00014879403742151283, + "loss": 0.8231, + "step": 112 + }, + { + "epoch": 0.3616, + "grad_norm": 0.4070194488889572, + "learning_rate": 0.00014788339256539544, + "loss": 0.7681, + "step": 113 + }, + { + "epoch": 0.3648, + "grad_norm": 0.4091466776508617, + "learning_rate": 0.0001469675660738206, + "loss": 0.7416, + "step": 114 + }, + { + "epoch": 0.368, + "grad_norm": 0.44296615018649654, + "learning_rate": 0.00014604665705169237, + "loss": 0.8017, + "step": 115 + }, + { + "epoch": 0.3712, + "grad_norm": 0.4477265021064643, + "learning_rate": 0.00014512076515391375, + "loss": 0.7836, + "step": 116 + }, + { + "epoch": 0.3744, + "grad_norm": 0.41319881703951783, + "learning_rate": 0.00014418999057460276, + "loss": 0.7921, + "step": 117 + }, + { + "epoch": 0.3776, + "grad_norm": 0.4005940267984183, + "learning_rate": 0.0001432544340362501, + "loss": 0.7526, + "step": 118 + }, + { + "epoch": 0.3808, + "grad_norm": 0.45608955737367257, + "learning_rate": 0.00014231419677881966, + "loss": 0.8106, + "step": 119 + }, + { + "epoch": 0.384, + "grad_norm": 0.5196496738602516, + "learning_rate": 0.00014136938054879283, + "loss": 0.9165, + "step": 120 + }, + { + "epoch": 0.3872, + "grad_norm": 0.46502595544402453, + "learning_rate": 0.00014042008758815818, + "loss": 0.8339, + "step": 121 + }, + { + "epoch": 0.3904, + "grad_norm": 0.3866161941995701, + "learning_rate": 0.00013946642062334766, + "loss": 0.6711, + "step": 122 + }, + { + "epoch": 0.3936, + "grad_norm": 0.4300397374744516, + "learning_rate": 0.00013850848285411994, + "loss": 0.7829, + "step": 123 + }, + { + "epoch": 0.3968, + "grad_norm": 0.4057684510680121, + "learning_rate": 0.000137546377942393, + "loss": 0.8033, + "step": 124 + }, + { + "epoch": 0.4, + "grad_norm": 0.4934930216202543, + "learning_rate": 0.00013658021000102636, + "loss": 0.8336, + "step": 125 + }, + { + "epoch": 0.4032, + "grad_norm": 0.43660332196391444, + "learning_rate": 0.00013561008358255468, + "loss": 0.7737, + "step": 126 + }, + { + "epoch": 0.4064, + "grad_norm": 0.36982868459745566, + "learning_rate": 0.00013463610366787392, + "loss": 0.7694, + "step": 127 + }, + { + "epoch": 0.4096, + "grad_norm": 0.37518828351585176, + "learning_rate": 0.00013365837565488064, + "loss": 0.6903, + "step": 128 + }, + { + "epoch": 0.4128, + "grad_norm": 0.3892476050497531, + "learning_rate": 0.0001326770053470668, + "loss": 0.7398, + "step": 129 + }, + { + "epoch": 0.416, + "grad_norm": 0.3880862155492406, + "learning_rate": 0.0001316920989420703, + "loss": 0.7614, + "step": 130 + }, + { + "epoch": 0.4192, + "grad_norm": 0.4342991834762453, + "learning_rate": 0.00013070376302018287, + "loss": 0.835, + "step": 131 + }, + { + "epoch": 0.4224, + "grad_norm": 0.4487972214877266, + "learning_rate": 0.00012971210453281674, + "loss": 0.7671, + "step": 132 + }, + { + "epoch": 0.4256, + "grad_norm": 0.47768467731735426, + "learning_rate": 0.000128717230790931, + "loss": 0.8454, + "step": 133 + }, + { + "epoch": 0.4288, + "grad_norm": 0.3785506493204796, + "learning_rate": 0.00012771924945341906, + "loss": 0.782, + "step": 134 + }, + { + "epoch": 0.432, + "grad_norm": 0.42543371400212765, + "learning_rate": 0.00012671826851545851, + "loss": 0.8441, + "step": 135 + }, + { + "epoch": 0.4352, + "grad_norm": 0.39001496502671096, + "learning_rate": 0.0001257143962968246, + "loss": 0.7936, + "step": 136 + }, + { + "epoch": 0.4384, + "grad_norm": 0.37317606108854123, + "learning_rate": 0.00012470774143016853, + "loss": 0.6767, + "step": 137 + }, + { + "epoch": 0.4416, + "grad_norm": 0.38597381193367974, + "learning_rate": 0.00012369841284926188, + "loss": 0.8163, + "step": 138 + }, + { + "epoch": 0.4448, + "grad_norm": 0.40269623422546613, + "learning_rate": 0.00012268651977720866, + "loss": 0.764, + "step": 139 + }, + { + "epoch": 0.448, + "grad_norm": 0.47597420155265463, + "learning_rate": 0.00012167217171462566, + "loss": 0.8383, + "step": 140 + }, + { + "epoch": 0.4512, + "grad_norm": 0.3928737482785004, + "learning_rate": 0.0001206554784277931, + "loss": 0.7357, + "step": 141 + }, + { + "epoch": 0.4544, + "grad_norm": 0.4319293741755596, + "learning_rate": 0.00011963654993677645, + "loss": 0.8176, + "step": 142 + }, + { + "epoch": 0.4576, + "grad_norm": 0.4294627464439455, + "learning_rate": 0.00011861549650352069, + "loss": 0.7461, + "step": 143 + }, + { + "epoch": 0.4608, + "grad_norm": 0.4162533969938797, + "learning_rate": 0.00011759242861991855, + "loss": 0.7383, + "step": 144 + }, + { + "epoch": 0.464, + "grad_norm": 0.4198711385889494, + "learning_rate": 0.00011656745699585371, + "loss": 0.8186, + "step": 145 + }, + { + "epoch": 0.4672, + "grad_norm": 0.3922488342536766, + "learning_rate": 0.00011554069254722051, + "loss": 0.7723, + "step": 146 + }, + { + "epoch": 0.4704, + "grad_norm": 0.5409625548162935, + "learning_rate": 0.00011451224638392129, + "loss": 0.7735, + "step": 147 + }, + { + "epoch": 0.4736, + "grad_norm": 0.3910548432980919, + "learning_rate": 0.00011348222979784289, + "loss": 0.7857, + "step": 148 + }, + { + "epoch": 0.4768, + "grad_norm": 0.39252399199003585, + "learning_rate": 0.00011245075425081328, + "loss": 0.7596, + "step": 149 + }, + { + "epoch": 0.48, + "grad_norm": 0.40428584910180754, + "learning_rate": 0.00011141793136253986, + "loss": 0.834, + "step": 150 + }, + { + "epoch": 0.4832, + "grad_norm": 0.47403461784272943, + "learning_rate": 0.0001103838728985307, + "loss": 0.8278, + "step": 151 + }, + { + "epoch": 0.4864, + "grad_norm": 0.4095529823613936, + "learning_rate": 0.000109348690758, + "loss": 0.8558, + "step": 152 + }, + { + "epoch": 0.4896, + "grad_norm": 0.3740305342667031, + "learning_rate": 0.00010831249696175918, + "loss": 0.7269, + "step": 153 + }, + { + "epoch": 0.4928, + "grad_norm": 0.43701721221728995, + "learning_rate": 0.0001072754036400944, + "loss": 0.8677, + "step": 154 + }, + { + "epoch": 0.496, + "grad_norm": 0.4133393393713412, + "learning_rate": 0.00010623752302063283, + "loss": 0.8107, + "step": 155 + }, + { + "epoch": 0.4992, + "grad_norm": 0.4050403831582587, + "learning_rate": 0.00010519896741619803, + "loss": 0.809, + "step": 156 + }, + { + "epoch": 0.5024, + "grad_norm": 0.38270730744357806, + "learning_rate": 0.00010415984921265609, + "loss": 0.8066, + "step": 157 + }, + { + "epoch": 0.5056, + "grad_norm": 0.4818734887838035, + "learning_rate": 0.00010312028085675391, + "loss": 0.6843, + "step": 158 + }, + { + "epoch": 0.5088, + "grad_norm": 0.4346256774851159, + "learning_rate": 0.00010208037484395114, + "loss": 0.7645, + "step": 159 + }, + { + "epoch": 0.512, + "grad_norm": 0.40263681194196066, + "learning_rate": 0.00010104024370624644, + "loss": 0.7637, + "step": 160 + }, + { + "epoch": 0.5152, + "grad_norm": 0.4409310065438654, + "learning_rate": 0.0001, + "loss": 0.8307, + "step": 161 + }, + { + "epoch": 0.5184, + "grad_norm": 0.4065733914815933, + "learning_rate": 9.895975629375359e-05, + "loss": 0.8029, + "step": 162 + }, + { + "epoch": 0.5216, + "grad_norm": 0.4683498195746497, + "learning_rate": 9.791962515604887e-05, + "loss": 0.7993, + "step": 163 + }, + { + "epoch": 0.5248, + "grad_norm": 0.42273868753437777, + "learning_rate": 9.687971914324607e-05, + "loss": 0.8318, + "step": 164 + }, + { + "epoch": 0.528, + "grad_norm": 0.40803389605864315, + "learning_rate": 9.584015078734395e-05, + "loss": 0.7709, + "step": 165 + }, + { + "epoch": 0.5312, + "grad_norm": 0.3764654046149156, + "learning_rate": 9.480103258380198e-05, + "loss": 0.6865, + "step": 166 + }, + { + "epoch": 0.5344, + "grad_norm": 0.3932258557712938, + "learning_rate": 9.376247697936719e-05, + "loss": 0.7805, + "step": 167 + }, + { + "epoch": 0.5376, + "grad_norm": 0.5124157391438808, + "learning_rate": 9.272459635990562e-05, + "loss": 0.8974, + "step": 168 + }, + { + "epoch": 0.5408, + "grad_norm": 0.3674900641039262, + "learning_rate": 9.168750303824084e-05, + "loss": 0.8064, + "step": 169 + }, + { + "epoch": 0.544, + "grad_norm": 0.3568883570150442, + "learning_rate": 9.065130924199998e-05, + "loss": 0.7544, + "step": 170 + }, + { + "epoch": 0.5472, + "grad_norm": 0.4161693538916811, + "learning_rate": 8.961612710146934e-05, + "loss": 0.8103, + "step": 171 + }, + { + "epoch": 0.5504, + "grad_norm": 0.39933399376075757, + "learning_rate": 8.858206863746018e-05, + "loss": 0.804, + "step": 172 + }, + { + "epoch": 0.5536, + "grad_norm": 0.3771684556048613, + "learning_rate": 8.754924574918675e-05, + "loss": 0.7555, + "step": 173 + }, + { + "epoch": 0.5568, + "grad_norm": 0.3666071873765809, + "learning_rate": 8.651777020215712e-05, + "loss": 0.7289, + "step": 174 + }, + { + "epoch": 0.56, + "grad_norm": 0.46275769563391067, + "learning_rate": 8.548775361607872e-05, + "loss": 0.7583, + "step": 175 + }, + { + "epoch": 0.5632, + "grad_norm": 0.4162483714676823, + "learning_rate": 8.445930745277953e-05, + "loss": 0.787, + "step": 176 + }, + { + "epoch": 0.5664, + "grad_norm": 0.3496408616474532, + "learning_rate": 8.343254300414628e-05, + "loss": 0.682, + "step": 177 + }, + { + "epoch": 0.5696, + "grad_norm": 0.36043905445513763, + "learning_rate": 8.240757138008149e-05, + "loss": 0.7286, + "step": 178 + }, + { + "epoch": 0.5728, + "grad_norm": 0.36715876356985133, + "learning_rate": 8.138450349647936e-05, + "loss": 0.6988, + "step": 179 + }, + { + "epoch": 0.576, + "grad_norm": 0.4076632877821367, + "learning_rate": 8.036345006322359e-05, + "loss": 0.8145, + "step": 180 + }, + { + "epoch": 0.5792, + "grad_norm": 0.40920620759987913, + "learning_rate": 7.934452157220694e-05, + "loss": 0.7643, + "step": 181 + }, + { + "epoch": 0.5824, + "grad_norm": 0.3995786333038116, + "learning_rate": 7.832782828537437e-05, + "loss": 0.8538, + "step": 182 + }, + { + "epoch": 0.5856, + "grad_norm": 0.4032318823087174, + "learning_rate": 7.731348022279134e-05, + "loss": 0.7301, + "step": 183 + }, + { + "epoch": 0.5888, + "grad_norm": 0.4488703475183601, + "learning_rate": 7.630158715073813e-05, + "loss": 0.7888, + "step": 184 + }, + { + "epoch": 0.592, + "grad_norm": 0.41137365079283433, + "learning_rate": 7.52922585698315e-05, + "loss": 0.7858, + "step": 185 + }, + { + "epoch": 0.5952, + "grad_norm": 0.3663844906011778, + "learning_rate": 7.428560370317542e-05, + "loss": 0.703, + "step": 186 + }, + { + "epoch": 0.5984, + "grad_norm": 0.4163433401795639, + "learning_rate": 7.328173148454151e-05, + "loss": 0.8451, + "step": 187 + }, + { + "epoch": 0.6016, + "grad_norm": 0.4206809795809301, + "learning_rate": 7.228075054658096e-05, + "loss": 0.6838, + "step": 188 + }, + { + "epoch": 0.6048, + "grad_norm": 0.3808816377292912, + "learning_rate": 7.1282769209069e-05, + "loss": 0.715, + "step": 189 + }, + { + "epoch": 0.608, + "grad_norm": 0.3883832414891323, + "learning_rate": 7.028789546718326e-05, + "loss": 0.7572, + "step": 190 + }, + { + "epoch": 0.6112, + "grad_norm": 0.4401038202378752, + "learning_rate": 6.929623697981718e-05, + "loss": 0.7141, + "step": 191 + }, + { + "epoch": 0.6144, + "grad_norm": 0.3544499562967409, + "learning_rate": 6.830790105792973e-05, + "loss": 0.7231, + "step": 192 + }, + { + "epoch": 0.6176, + "grad_norm": 0.3822226329255116, + "learning_rate": 6.732299465293322e-05, + "loss": 0.727, + "step": 193 + }, + { + "epoch": 0.6208, + "grad_norm": 0.4116384972210298, + "learning_rate": 6.63416243451194e-05, + "loss": 0.7449, + "step": 194 + }, + { + "epoch": 0.624, + "grad_norm": 0.3711307535071894, + "learning_rate": 6.536389633212609e-05, + "loss": 0.7257, + "step": 195 + }, + { + "epoch": 0.6272, + "grad_norm": 0.43555632358230745, + "learning_rate": 6.43899164174453e-05, + "loss": 0.8119, + "step": 196 + }, + { + "epoch": 0.6304, + "grad_norm": 0.3806265680783455, + "learning_rate": 6.341978999897365e-05, + "loss": 0.7372, + "step": 197 + }, + { + "epoch": 0.6336, + "grad_norm": 0.41831115759962684, + "learning_rate": 6.245362205760704e-05, + "loss": 0.7801, + "step": 198 + }, + { + "epoch": 0.6368, + "grad_norm": 0.4192516818649323, + "learning_rate": 6.149151714588009e-05, + "loss": 0.7815, + "step": 199 + }, + { + "epoch": 0.64, + "grad_norm": 0.38057749267071994, + "learning_rate": 6.053357937665237e-05, + "loss": 0.7326, + "step": 200 + }, + { + "epoch": 0.6432, + "grad_norm": 0.3995482231473873, + "learning_rate": 5.957991241184184e-05, + "loss": 0.8029, + "step": 201 + }, + { + "epoch": 0.6464, + "grad_norm": 0.40222742470520273, + "learning_rate": 5.863061945120719e-05, + "loss": 0.7153, + "step": 202 + }, + { + "epoch": 0.6496, + "grad_norm": 0.3911062634474729, + "learning_rate": 5.768580322118034e-05, + "loss": 0.7822, + "step": 203 + }, + { + "epoch": 0.6528, + "grad_norm": 0.4408457326765675, + "learning_rate": 5.6745565963749925e-05, + "loss": 0.8023, + "step": 204 + }, + { + "epoch": 0.656, + "grad_norm": 0.4427495378097438, + "learning_rate": 5.5810009425397294e-05, + "loss": 0.7527, + "step": 205 + }, + { + "epoch": 0.6592, + "grad_norm": 0.38979992497886135, + "learning_rate": 5.487923484608629e-05, + "loss": 0.7564, + "step": 206 + }, + { + "epoch": 0.6624, + "grad_norm": 0.474115233171429, + "learning_rate": 5.395334294830765e-05, + "loss": 0.828, + "step": 207 + }, + { + "epoch": 0.6656, + "grad_norm": 0.42652280091447237, + "learning_rate": 5.3032433926179395e-05, + "loss": 0.8272, + "step": 208 + }, + { + "epoch": 0.6688, + "grad_norm": 0.3831482219398793, + "learning_rate": 5.211660743460458e-05, + "loss": 0.7267, + "step": 209 + }, + { + "epoch": 0.672, + "grad_norm": 0.40887318216030866, + "learning_rate": 5.1205962578487155e-05, + "loss": 0.7975, + "step": 210 + }, + { + "epoch": 0.6752, + "grad_norm": 0.4244305440090109, + "learning_rate": 5.030059790200756e-05, + "loss": 0.7442, + "step": 211 + }, + { + "epoch": 0.6784, + "grad_norm": 0.32782090126215163, + "learning_rate": 4.940061137795876e-05, + "loss": 0.6522, + "step": 212 + }, + { + "epoch": 0.6816, + "grad_norm": 0.40689323278394435, + "learning_rate": 4.850610039714444e-05, + "loss": 0.7416, + "step": 213 + }, + { + "epoch": 0.6848, + "grad_norm": 0.3877856029005349, + "learning_rate": 4.761716175783989e-05, + "loss": 0.7233, + "step": 214 + }, + { + "epoch": 0.688, + "grad_norm": 0.37916951385372083, + "learning_rate": 4.673389165531714e-05, + "loss": 0.722, + "step": 215 + }, + { + "epoch": 0.6912, + "grad_norm": 0.4115443896644859, + "learning_rate": 4.585638567143529e-05, + "loss": 0.7576, + "step": 216 + }, + { + "epoch": 0.6944, + "grad_norm": 0.35570562940457406, + "learning_rate": 4.498473876429726e-05, + "loss": 0.6883, + "step": 217 + }, + { + "epoch": 0.6976, + "grad_norm": 0.38880682946216655, + "learning_rate": 4.411904525797408e-05, + "loss": 0.7366, + "step": 218 + }, + { + "epoch": 0.7008, + "grad_norm": 0.3242374380482661, + "learning_rate": 4.325939883229766e-05, + "loss": 0.6939, + "step": 219 + }, + { + "epoch": 0.704, + "grad_norm": 0.4187597997404342, + "learning_rate": 4.240589251272342e-05, + "loss": 0.8191, + "step": 220 + }, + { + "epoch": 0.7072, + "grad_norm": 0.38128869610042276, + "learning_rate": 4.155861866026364e-05, + "loss": 0.7826, + "step": 221 + }, + { + "epoch": 0.7104, + "grad_norm": 0.3872980257786108, + "learning_rate": 4.071766896149273e-05, + "loss": 0.7959, + "step": 222 + }, + { + "epoch": 0.7136, + "grad_norm": 0.40387147344548646, + "learning_rate": 3.988313441862553e-05, + "loss": 0.7875, + "step": 223 + }, + { + "epoch": 0.7168, + "grad_norm": 0.5076024361108772, + "learning_rate": 3.9055105339669595e-05, + "loss": 0.7878, + "step": 224 + }, + { + "epoch": 0.72, + "grad_norm": 0.38212416356619255, + "learning_rate": 3.823367132865265e-05, + "loss": 0.7192, + "step": 225 + }, + { + "epoch": 0.7232, + "grad_norm": 0.3898006815497741, + "learning_rate": 3.741892127592625e-05, + "loss": 0.7523, + "step": 226 + }, + { + "epoch": 0.7264, + "grad_norm": 0.3820374478482116, + "learning_rate": 3.6610943348546526e-05, + "loss": 0.7478, + "step": 227 + }, + { + "epoch": 0.7296, + "grad_norm": 0.3717263102767262, + "learning_rate": 3.580982498073344e-05, + "loss": 0.754, + "step": 228 + }, + { + "epoch": 0.7328, + "grad_norm": 0.38533746629000326, + "learning_rate": 3.501565286440914e-05, + "loss": 0.6413, + "step": 229 + }, + { + "epoch": 0.736, + "grad_norm": 0.40032373490306794, + "learning_rate": 3.422851293981676e-05, + "loss": 0.7674, + "step": 230 + }, + { + "epoch": 0.7392, + "grad_norm": 0.38711121341634513, + "learning_rate": 3.3448490386220355e-05, + "loss": 0.742, + "step": 231 + }, + { + "epoch": 0.7424, + "grad_norm": 0.36975214377693705, + "learning_rate": 3.2675669612687565e-05, + "loss": 0.759, + "step": 232 + }, + { + "epoch": 0.7456, + "grad_norm": 0.36556697393084603, + "learning_rate": 3.191013424895536e-05, + "loss": 0.8053, + "step": 233 + }, + { + "epoch": 0.7488, + "grad_norm": 0.41472056125524964, + "learning_rate": 3.115196713638e-05, + "loss": 0.7582, + "step": 234 + }, + { + "epoch": 0.752, + "grad_norm": 0.38558088058314577, + "learning_rate": 3.040125031897264e-05, + "loss": 0.7081, + "step": 235 + }, + { + "epoch": 0.7552, + "grad_norm": 0.38181110254932604, + "learning_rate": 2.9658065034520978e-05, + "loss": 0.7387, + "step": 236 + }, + { + "epoch": 0.7584, + "grad_norm": 0.3871373476587633, + "learning_rate": 2.892249170579826e-05, + "loss": 0.7117, + "step": 237 + }, + { + "epoch": 0.7616, + "grad_norm": 0.4000363169658809, + "learning_rate": 2.8194609931860316e-05, + "loss": 0.7517, + "step": 238 + }, + { + "epoch": 0.7648, + "grad_norm": 0.41901585525242724, + "learning_rate": 2.7474498479432087e-05, + "loss": 0.8171, + "step": 239 + }, + { + "epoch": 0.768, + "grad_norm": 0.3647218821966735, + "learning_rate": 2.6762235274383772e-05, + "loss": 0.7237, + "step": 240 + }, + { + "epoch": 0.7712, + "grad_norm": 0.3807125615509075, + "learning_rate": 2.6057897393298324e-05, + "loss": 0.7122, + "step": 241 + }, + { + "epoch": 0.7744, + "grad_norm": 0.4052240024383003, + "learning_rate": 2.536156105513062e-05, + "loss": 0.7436, + "step": 242 + }, + { + "epoch": 0.7776, + "grad_norm": 0.3531418166157859, + "learning_rate": 2.4673301612959654e-05, + "loss": 0.6858, + "step": 243 + }, + { + "epoch": 0.7808, + "grad_norm": 0.389237017070924, + "learning_rate": 2.399319354583418e-05, + "loss": 0.7426, + "step": 244 + }, + { + "epoch": 0.784, + "grad_norm": 0.4150948200506419, + "learning_rate": 2.3321310450713062e-05, + "loss": 0.7381, + "step": 245 + }, + { + "epoch": 0.7872, + "grad_norm": 0.4118373950760467, + "learning_rate": 2.265772503450122e-05, + "loss": 0.733, + "step": 246 + }, + { + "epoch": 0.7904, + "grad_norm": 0.3870652323340908, + "learning_rate": 2.2002509106181624e-05, + "loss": 0.7386, + "step": 247 + }, + { + "epoch": 0.7936, + "grad_norm": 0.4071335392815865, + "learning_rate": 2.1355733569044635e-05, + "loss": 0.8066, + "step": 248 + }, + { + "epoch": 0.7968, + "grad_norm": 0.3549101773169867, + "learning_rate": 2.0717468413015283e-05, + "loss": 0.7472, + "step": 249 + }, + { + "epoch": 0.8, + "grad_norm": 0.40988638302403924, + "learning_rate": 2.008778270707944e-05, + "loss": 0.7379, + "step": 250 + }, + { + "epoch": 0.8032, + "grad_norm": 0.3732135073629364, + "learning_rate": 1.946674459180955e-05, + "loss": 0.7551, + "step": 251 + }, + { + "epoch": 0.8064, + "grad_norm": 0.3892117922435638, + "learning_rate": 1.8854421271990964e-05, + "loss": 0.7199, + "step": 252 + }, + { + "epoch": 0.8096, + "grad_norm": 0.3948588558544994, + "learning_rate": 1.8250879009349398e-05, + "loss": 0.7683, + "step": 253 + }, + { + "epoch": 0.8128, + "grad_norm": 0.4167929485162073, + "learning_rate": 1.7656183115380577e-05, + "loss": 0.7646, + "step": 254 + }, + { + "epoch": 0.816, + "grad_norm": 0.40307942016131704, + "learning_rate": 1.707039794428259e-05, + "loss": 0.7825, + "step": 255 + }, + { + "epoch": 0.8192, + "grad_norm": 0.3861750378862546, + "learning_rate": 1.649358688599191e-05, + "loss": 0.7938, + "step": 256 + }, + { + "epoch": 0.8224, + "grad_norm": 0.3674476689858612, + "learning_rate": 1.5925812359323745e-05, + "loss": 0.715, + "step": 257 + }, + { + "epoch": 0.8256, + "grad_norm": 0.353536207299889, + "learning_rate": 1.5367135805217458e-05, + "loss": 0.7537, + "step": 258 + }, + { + "epoch": 0.8288, + "grad_norm": 0.3566695168378345, + "learning_rate": 1.4817617680087825e-05, + "loss": 0.7089, + "step": 259 + }, + { + "epoch": 0.832, + "grad_norm": 0.3866955306554242, + "learning_rate": 1.4277317449282834e-05, + "loss": 0.7642, + "step": 260 + }, + { + "epoch": 0.8352, + "grad_norm": 0.3451834650287931, + "learning_rate": 1.3746293580648717e-05, + "loss": 0.718, + "step": 261 + }, + { + "epoch": 0.8384, + "grad_norm": 0.3579318858260466, + "learning_rate": 1.3224603538202929e-05, + "loss": 0.7603, + "step": 262 + }, + { + "epoch": 0.8416, + "grad_norm": 0.37994279223210864, + "learning_rate": 1.2712303775915802e-05, + "loss": 0.7226, + "step": 263 + }, + { + "epoch": 0.8448, + "grad_norm": 0.353484025350768, + "learning_rate": 1.220944973160133e-05, + "loss": 0.6795, + "step": 264 + }, + { + "epoch": 0.848, + "grad_norm": 0.31725525771653623, + "learning_rate": 1.1716095820918216e-05, + "loss": 0.6401, + "step": 265 + }, + { + "epoch": 0.8512, + "grad_norm": 0.3760258159071039, + "learning_rate": 1.1232295431481222e-05, + "loss": 0.722, + "step": 266 + }, + { + "epoch": 0.8544, + "grad_norm": 0.4008131668307014, + "learning_rate": 1.0758100917083991e-05, + "loss": 0.7293, + "step": 267 + }, + { + "epoch": 0.8576, + "grad_norm": 0.37877425308055057, + "learning_rate": 1.0293563592033595e-05, + "loss": 0.7695, + "step": 268 + }, + { + "epoch": 0.8608, + "grad_norm": 0.398401979959032, + "learning_rate": 9.838733725597615e-06, + "loss": 0.7923, + "step": 269 + }, + { + "epoch": 0.864, + "grad_norm": 0.41517320547877595, + "learning_rate": 9.393660536564408e-06, + "loss": 0.8367, + "step": 270 + }, + { + "epoch": 0.8672, + "grad_norm": 0.35876230173175805, + "learning_rate": 8.958392187916841e-06, + "loss": 0.6976, + "step": 271 + }, + { + "epoch": 0.8704, + "grad_norm": 0.4224162188707925, + "learning_rate": 8.532975781620512e-06, + "loss": 0.7366, + "step": 272 + }, + { + "epoch": 0.8736, + "grad_norm": 0.39780170702610357, + "learning_rate": 8.117457353526625e-06, + "loss": 0.7286, + "step": 273 + }, + { + "epoch": 0.8768, + "grad_norm": 0.3897116373056276, + "learning_rate": 7.711881868390291e-06, + "loss": 0.7202, + "step": 274 + }, + { + "epoch": 0.88, + "grad_norm": 0.3677076713046321, + "learning_rate": 7.3162932150046885e-06, + "loss": 0.7237, + "step": 275 + }, + { + "epoch": 0.8832, + "grad_norm": 0.38206776024573436, + "learning_rate": 6.930734201451816e-06, + "loss": 0.7572, + "step": 276 + }, + { + "epoch": 0.8864, + "grad_norm": 0.43436169752208104, + "learning_rate": 6.555246550469907e-06, + "loss": 0.728, + "step": 277 + }, + { + "epoch": 0.8896, + "grad_norm": 0.35030409330791934, + "learning_rate": 6.189870894938587e-06, + "loss": 0.7104, + "step": 278 + }, + { + "epoch": 0.8928, + "grad_norm": 0.40894521285668217, + "learning_rate": 5.834646773481811e-06, + "loss": 0.7701, + "step": 279 + }, + { + "epoch": 0.896, + "grad_norm": 0.3864439992817407, + "learning_rate": 5.489612626189245e-06, + "loss": 0.6639, + "step": 280 + }, + { + "epoch": 0.8992, + "grad_norm": 0.3730212277592263, + "learning_rate": 5.154805790456485e-06, + "loss": 0.7183, + "step": 281 + }, + { + "epoch": 0.9024, + "grad_norm": 0.4085850406480258, + "learning_rate": 4.830262496944693e-06, + "loss": 0.7971, + "step": 282 + }, + { + "epoch": 0.9056, + "grad_norm": 0.3871206930534331, + "learning_rate": 4.516017865659949e-06, + "loss": 0.7049, + "step": 283 + }, + { + "epoch": 0.9088, + "grad_norm": 0.3415173703088719, + "learning_rate": 4.21210590215273e-06, + "loss": 0.6897, + "step": 284 + }, + { + "epoch": 0.912, + "grad_norm": 0.39925143692468934, + "learning_rate": 3.918559493838114e-06, + "loss": 0.7767, + "step": 285 + }, + { + "epoch": 0.9152, + "grad_norm": 0.3494117404036923, + "learning_rate": 3.6354104064368566e-06, + "loss": 0.679, + "step": 286 + }, + { + "epoch": 0.9184, + "grad_norm": 0.40968356363514646, + "learning_rate": 3.3626892805379562e-06, + "loss": 0.7508, + "step": 287 + }, + { + "epoch": 0.9216, + "grad_norm": 0.3778334267143803, + "learning_rate": 3.100425628282899e-06, + "loss": 0.7164, + "step": 288 + }, + { + "epoch": 0.9248, + "grad_norm": 0.3913056347351327, + "learning_rate": 2.848647830172024e-06, + "loss": 0.7484, + "step": 289 + }, + { + "epoch": 0.928, + "grad_norm": 0.35751729016448386, + "learning_rate": 2.607383131993424e-06, + "loss": 0.7302, + "step": 290 + }, + { + "epoch": 0.9312, + "grad_norm": 0.36765973716443234, + "learning_rate": 2.3766576418745022e-06, + "loss": 0.7465, + "step": 291 + }, + { + "epoch": 0.9344, + "grad_norm": 0.3593855102348625, + "learning_rate": 2.1564963274568027e-06, + "loss": 0.6448, + "step": 292 + }, + { + "epoch": 0.9376, + "grad_norm": 0.380756537904924, + "learning_rate": 1.9469230131940907e-06, + "loss": 0.7396, + "step": 293 + }, + { + "epoch": 0.9408, + "grad_norm": 0.4712559845008022, + "learning_rate": 1.7479603777742938e-06, + "loss": 0.7756, + "step": 294 + }, + { + "epoch": 0.944, + "grad_norm": 0.39606227316523035, + "learning_rate": 1.559629951665298e-06, + "loss": 0.724, + "step": 295 + }, + { + "epoch": 0.9472, + "grad_norm": 0.39405843824990977, + "learning_rate": 1.3819521147851123e-06, + "loss": 0.74, + "step": 296 + }, + { + "epoch": 0.9504, + "grad_norm": 0.3549748724309065, + "learning_rate": 1.2149460942964098e-06, + "loss": 0.6828, + "step": 297 + }, + { + "epoch": 0.9536, + "grad_norm": 0.7013754933226755, + "learning_rate": 1.05862996252597e-06, + "loss": 0.6775, + "step": 298 + }, + { + "epoch": 0.9568, + "grad_norm": 0.3634982384892156, + "learning_rate": 9.130206350089765e-07, + "loss": 0.7441, + "step": 299 + }, + { + "epoch": 0.96, + "grad_norm": 0.32584430290795596, + "learning_rate": 7.781338686584927e-07, + "loss": 0.7017, + "step": 300 + }, + { + "epoch": 0.9632, + "grad_norm": 0.3868531691290023, + "learning_rate": 6.539842600603918e-07, + "loss": 0.6881, + "step": 301 + }, + { + "epoch": 0.9664, + "grad_norm": 0.442008696929735, + "learning_rate": 5.405852438937764e-07, + "loss": 0.729, + "step": 302 + }, + { + "epoch": 0.9696, + "grad_norm": 0.4836393444557512, + "learning_rate": 4.3794909147720773e-07, + "loss": 0.7841, + "step": 303 + }, + { + "epoch": 0.9728, + "grad_norm": 0.3718865495552428, + "learning_rate": 3.4608690944071263e-07, + "loss": 0.7214, + "step": 304 + }, + { + "epoch": 0.976, + "grad_norm": 0.4176425641377951, + "learning_rate": 2.6500863852395584e-07, + "loss": 0.7607, + "step": 305 + }, + { + "epoch": 0.9792, + "grad_norm": 0.42030547898748094, + "learning_rate": 1.947230525005006e-07, + "loss": 0.7717, + "step": 306 + }, + { + "epoch": 0.9824, + "grad_norm": 0.4268942071220283, + "learning_rate": 1.3523775722834587e-07, + "loss": 0.7511, + "step": 307 + }, + { + "epoch": 0.9856, + "grad_norm": 0.402494449397538, + "learning_rate": 8.655918982689581e-08, + "loss": 0.7278, + "step": 308 + }, + { + "epoch": 0.9888, + "grad_norm": 0.3797258712905008, + "learning_rate": 4.8692617980350406e-08, + "loss": 0.7164, + "step": 309 + }, + { + "epoch": 0.992, + "grad_norm": 0.38113282174469687, + "learning_rate": 2.164213936770576e-08, + "loss": 0.7225, + "step": 310 + }, + { + "epoch": 0.9952, + "grad_norm": 0.3733461808316852, + "learning_rate": 5.410681219286673e-09, + "loss": 0.7236, + "step": 311 + }, + { + "epoch": 0.9984, + "grad_norm": 0.39801701198883677, + "learning_rate": 0.0, + "loss": 0.699, + "step": 312 + }, + { + "epoch": 0.9984, + "step": 312, + "total_flos": 282923046273024.0, + "train_loss": 0.7901752627430818, + "train_runtime": 4907.5347, + "train_samples_per_second": 1.019, + "train_steps_per_second": 0.064 + } + ], + "logging_steps": 1.0, + "max_steps": 312, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 282923046273024.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}